Skip to content

Utils

This module contains various utility functions used throughout the FACSIMILE package.

utils

Functions:

  • check_directories

    Checks if the script is being run in the root directory and if the required data is present.

  • download_googlefont

    Download a font from Google Fonts and save it in the fonts folder.

  • load_model

    Load a model from disk.

  • set_style

    Set the Matplotlib style and download the specified font from Google Fonts.

  • simple_predict

    Predict target scores using a simple linear model.

  • tqdm_joblib

    Context manager to patch joblib to report into a tqdm progress bar given as argument.

  • train_validation_test_split

    Splits X and y data into train/validation/test sets according to given split proportions.

check_directories

check_directories()

Checks if the script is being run in the root directory and if the required data is present.

Raises:

  • RuntimeError

    If the script is not run from the root directory or if the 'data' directory is empty.

Source code in facsimile/utils.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
def check_directories():
    """
    Checks if the script is being run in the root directory and if the
    required data is present.

    Raises:
        RuntimeError: If the script is not run from the root directory
            or if the `'data'` directory is empty.
    """
    # Use the presence of the 'docs' directory as the marker for the
    # repository root.
    if not os.path.isdir("docs"):
        # If the 'examples' directory is one level up, we are likely two
        # levels deep inside the repository, so move two directories up.
        if os.path.isdir("../examples"):
            print("Changing directory to root directory of repository...")
            os.chdir("../..")
        else:
            raise RuntimeError(
                "You must run this notebook from the root directory of the "
                "repository, otherwise paths will break. You are currently "
                "in {}".format(os.getcwd())
            )

    # The data files are distributed separately (via OSF); fail early if
    # the 'data' directory is missing or empty.
    if not os.path.isdir("data") or len(os.listdir("data")) == 0:
        raise RuntimeError(
            "You must download the data files from OSF and place them in the "
            "/data directory before running this notebook."
        )

    # Ensure the output directory for figures exists (exist_ok avoids a
    # check-then-create race).
    os.makedirs("figures", exist_ok=True)

download_googlefont

download_googlefont(font: str = 'Heebo') -> None

Download a font from Google Fonts and save it in the fonts folder.

This code is modified from Opinionated (https://github.com/MNoichl/opinionated), which itself is borrowed from https://github.com/TutteInstitute/datamapplot.

Parameters:

  • font

    (str, default: 'Heebo' ) –

    The name of the font to download from Google Fonts. Defaults to "Heebo".

Source code in facsimile/utils.py
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
def download_googlefont(font: str = "Heebo") -> None:
    """
    Download a font from Google Fonts and save it in the `fonts` folder.

    This code is modified from `Opinionated`
    (https://github.com/MNoichl/opinionated), which itself is borrowed from
    https://github.com/TutteInstitute/datamapplot.

    Args:
        font (str, optional): The name of the font to download from Google
            Fonts. Defaults to `"Heebo"`.
    """

    # Replace spaces with '+' to format the font name for the API URL
    api_fontname = font.replace(" ", "+")
    # Retrieve the CSS from Google Fonts API that contains the URLs for the
    # font files
    api_response = requests.get(
        f"https://fonts.googleapis.com/css?family={api_fontname}:black,"
        "bold,regular,light"
    )
    # Extract font file URLs from the decoded response body. Using .text
    # (rather than str() of the raw bytes, which yields a "b'...'" repr)
    # ensures the regex runs over the actual CSS.
    font_urls = re.findall(r"https?://[^\)]+", api_response.text)

    # Download and process each font file found
    for font_url in font_urls:
        # Download the font file
        font_data = requests.get(font_url)
        # Persist the font to a named file. matplotlib's font manager keeps
        # the path and reads from it at render time, so the file must
        # outlive this function (hence delete=False).
        with NamedTemporaryFile(delete=False, suffix=".ttf") as f:
            f.write(font_data.content)

        # Load the font with fontTools to read its metadata; a separate
        # name is used so the `font` parameter is not shadowed.
        loaded_font = ttLib.TTFont(f.name)
        # Retrieve the font family name from the font's metadata
        font_family_name = loaded_font["name"].getDebugName(1)
        # Add the font to matplotlib's font manager for future use
        matplotlib.font_manager.fontManager.addfont(f.name)
        print(f"Added new font as {font_family_name}")

load_model

load_model(model_path: str) -> object

Load a model from disk.

Parameters:

  • model_path

    (str) –

    Path to the model file.

Returns:

  • object ( object ) –

    The loaded model.

Source code in facsimile/utils.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def load_model(model_path: str) -> object:
    """
    Load a model from disk.

    Args:
        model_path (str): Path to the model file.

    Returns:
        object: The loaded model.
    """

    # Fail fast with a clear error if the path does not point to a file
    if not os.path.isfile(model_path):
        raise FileNotFoundError(f"Model file not found at {model_path}")

    # Deserialise the model with joblib
    model = joblib.load(model_path)

    # If the model records the facsimile version it was saved under, warn
    # when that differs from the currently installed version. A sentinel
    # distinguishes "attribute absent" from any stored value.
    _missing = object()
    saved_version = getattr(model, "__facsimile_version", _missing)
    if saved_version is not _missing:
        current_version = version("facsimile")
        if saved_version != current_version:
            print(
                f"Warning: Model was saved with facsimile version "
                f"{saved_version}, but the current version is "
                f"{current_version}."
            )

    return model

set_style

set_style(style_path: str = '../style.mplstyle', font: str = 'Heebo') -> None

Set the Matplotlib style and download the specified font from Google Fonts.

Parameters:

  • style_path

    (str, default: '../style.mplstyle' ) –

    The path to the Matplotlib style file. Defaults to ../style.mplstyle.

  • font

    (str, default: 'Heebo' ) –

    The name of the font to download from Google Fonts. Defaults to "Heebo".

Source code in facsimile/utils.py
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
def set_style(
    style_path: str = "../style.mplstyle", font: str = "Heebo"
) -> None:
    """
    Set the Matplotlib style and download the specified font from Google
    Fonts.

    Args:
        style_path (str, optional): The path to the Matplotlib style file.
            Defaults to `../style.mplstyle`.
        font (str, optional): The name of the font to download from Google
            Fonts. Defaults to `"Heebo"`.
    """

    # Only download the font if matplotlib does not already know about it
    font_names = [
        f.name for f in matplotlib.font_manager.fontManager.ttflist
    ]
    if font in font_names:
        print(f"Font {font} already available in Matplotlib.")
    else:
        download_googlefont(font)

    # Read the original style file and rewrite any font.family line to use
    # the new font. Rewriting the whole line (rather than substituting the
    # fixed text "font.family: sans-serif") applies the font regardless of
    # which family the style file originally specified.
    with open(style_path, "r") as f:
        style_lines = f.readlines()

    new_style_lines = [
        f"font.family: {font}\n" if line.startswith("font.family") else line
        for line in style_lines
    ]

    # Use a temporary style file with updated font family
    with open("temp_style.mplstyle", "w") as f:
        f.writelines(new_style_lines)

    plt.style.use("temp_style.mplstyle")
    print(f"Matplotlib style set to: {style_path} with font {font}")

simple_predict

simple_predict(weights: DataFrame, X: ndarray) -> ndarray

Predict target scores using a simple linear model.

Parameters:

  • weights

    (DataFrame) –

    A dataframe containing the weights for each item.

  • X

    (ndarray) –

    The input data.

Returns:

  • ndarray

    np.ndarray: The predicted target variable scores.

Source code in facsimile/utils.py
121
122
123
124
125
126
127
128
129
130
131
132
133
def simple_predict(weights: pd.DataFrame, X: np.ndarray) -> np.ndarray:
    """
    Predict target scores using a simple linear model.

    The final row of `weights` is added after the matrix product, i.e. it
    acts as the intercept; all preceding rows are per-item coefficients.

    Args:
        weights (pd.DataFrame): A dataframe containing the weights for each
            item.
        X (np.ndarray): The input data.

    Returns:
        np.ndarray: The predicted target variable scores.
    """
    w = weights.values
    coefficients, intercept = w[:-1], w[-1]
    return X.dot(coefficients) + intercept

tqdm_joblib

tqdm_joblib(tqdm_object: tqdm)

Context manager to patch joblib to report into tqdm progress bar given as argument

From https://stackoverflow.com/questions/24983493/tracking-progress-of-joblib-parallel-execution/58936697#58936697

Source code in facsimile/utils.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
@contextlib.contextmanager
def tqdm_joblib(tqdm_object: tqdm):
    """
    Context manager to patch joblib to report into tqdm progress bar given as
    argument

    From
    https://stackoverflow.com/questions/24983493/tracking-progress-of-joblib-parallel-execution/58936697#58936697
    """

    # Callback that forwards each completed batch to the progress bar
    # before delegating to joblib's regular bookkeeping.
    class _TqdmReportingCallback(joblib.parallel.BatchCompletionCallBack):
        def __call__(self, *args, **kwargs):
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    # Swap joblib's callback for the reporting one while the context is
    # active; restore the original and close the bar on exit, even if the
    # body raises.
    original_callback = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = _TqdmReportingCallback
    try:
        yield tqdm_object
    finally:
        joblib.parallel.BatchCompletionCallBack = original_callback
        tqdm_object.close()

train_validation_test_split

train_validation_test_split(X: ndarray, y: ndarray, train_size: float, val_size: float, test_size: float, random_seed: int = 42) -> Tuple[ndarray, ndarray, ndarray, ndarray, ndarray, ndarray]

Splits X and y data into train/validation/test sets according to given split proportions, using a random seed.

Parameters:

  • X

    (ndarray) –

    The input data.

  • y

    (ndarray) –

    The target data.

  • train_size

    (float) –

    The proportion of the data to use for training.

  • val_size

    (float) –

    The proportion of the data to use for validation.

  • test_size

    (float) –

    The proportion of the data to use for testing.

  • random_seed

    (int, default: 42 ) –

    The random seed to use. Defaults to 42.

Returns:

  • ndarray

    Tuple[numpy.ndarray]: A tuple containing the train, validation, and

  • ndarray

    test sets.

Source code in facsimile/utils.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def train_validation_test_split(
    X: np.ndarray,
    y: np.ndarray,
    train_size: float,
    val_size: float,
    test_size: float,
    random_seed: int = 42,
) -> Tuple[
    np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray
]:
    """
    Splits X and y data into train/validation/test sets according to given
    split proportions, using a random seed.

    Args:
        X (numpy.ndarray): The input data.
        y (numpy.ndarray): The target data.
        train_size (float): The proportion of the data to use for training.
        val_size (float): The proportion of the data to use for validation.
        test_size (float): The proportion of the data to use for testing.
        random_seed (int, optional): The random seed to use. Defaults to `42`.

    Returns:
        Tuple[numpy.ndarray]: A tuple containing the train, validation, and
        test sets.
    """
    # Compare the proportion total against 1.0 with a tolerance rather than
    # exact float equality, so valid splits such as 0.1/0.2/0.7 (whose float
    # sum is 0.9999999999999999) are not rejected by rounding error.
    assert (
        abs((train_size + val_size + test_size) - 1.0) < 1e-9
    ), "Train, validation, and test sizes must add up to 1.0"
    assert (
        X.shape[0] == y.shape[0]
    ), "X and y must have the same number of samples"

    # First carve off the test set from the full data...
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_seed
    )
    # ...then split the remainder into train and validation. The ratio is
    # rescaled because val_size is a fraction of the full dataset, not of
    # the remaining train+validation portion.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val,
        y_train_val,
        test_size=val_size / (train_size + val_size),
        random_state=random_seed,
    )

    return X_train, X_val, X_test, y_train, y_val, y_test