Preprocess

This is the reference for the functions contained in preprocess. For now, they are all accessible directly through machine-learning-datasets, so you don't need to use the preprocess namespace.

Common Utility Functions

apply_cmap(img, cmap, cmap_norm=None, alwaysscale=False, overlay_bg=None, **kwargs)

Apply a colormap to an image.

Parameters:

  • img (ndarray): The input image. Required.
  • cmap (Union[str, Colormap]): The colormap to apply. Can be a string representing the name of the colormap or a Colormap object. Required.
  • cmap_norm (Optional[Union[str, Colormap]]): The normalization to apply to the image before applying the colormap. Can be a string representing the name of the normalization or a Colormap object. Defaults to None.
  • alwaysscale (Optional[bool]): Whether to always scale the image before applying the colormap. Defaults to False.
  • overlay_bg (Optional[ndarray]): The background image to overlay on the colormap. Defaults to None.
  • **kwargs (Any): Additional keyword arguments to pass to the normalize_heatmap function.

Returns:

  • np.ndarray: The image with the applied colormap.

Note
  • If the input image has 3 channels, it will be converted to grayscale before applying the colormap.
  • If cmap_norm is provided, the image will be normalized using the normalize_heatmap function before applying the colormap.
  • If alwaysscale is True or the image values are outside the range [0, 1], the image will be scaled using MinMaxScaler before applying the colormap.
  • The alpha channel of the colormap image will be removed.
  • If overlay_bg is provided, it will be overlaid on the colormap image using the heatmap_overlay function.
Source code in machine_learning_datasets/preprocess.py
def apply_cmap(
        img:np.ndarray,
        cmap:Union[str,Colormap],
        cmap_norm:Optional[Union[str,Colormap]] = None,
        alwaysscale:Optional[bool] = False,
        overlay_bg:Optional[np.ndarray] = None,
        **kwargs:Any
    ) -> np.ndarray:
    """Apply a colormap to an image.

    Args:
        img (np.ndarray): The input image.
        cmap (Union[str,Colormap]): The colormap to apply. Can be a string representing the name
                                    of the colormap or a Colormap object.
        cmap_norm (Optional[Union[str,Colormap]]): The normalization to apply to the image before
                                                   applying the colormap. Can be a string
                                                   representing the name of the normalization or
                                                   a Colormap object. Defaults to None.
        alwaysscale (Optional[bool]): Whether to always scale the image before applying the
                                      colormap. Defaults to False.
        overlay_bg (Optional[np.ndarray]): The background image to overlay on the colormap.
                                           Defaults to None.
        **kwargs (Any): Additional keyword arguments to pass to the normalize_heatmap function.

    Returns:
        np.ndarray: The image with the applied colormap.

    Note:
        - If the input image has 3 channels, it will be converted to grayscale before
          applying the colormap.
        - If cmap_norm is provided, the image will be normalized using the normalize_heatmap
          function before applying the colormap.
        - If alwaysscale is True or the image values are outside the range [0, 1], the
          image will be scaled using MinMaxScaler before applying the colormap.
        - The alpha channel of the colormap image will be removed.
        - If overlay_bg is provided, it will be overlaid on the colormap image using the
          heatmap_overlay function.
    """
    if len(img.shape) == 3:
        img = np.mean(img, axis=2)  # alternative: cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    inf_dec = 1e-5
    if cmap_norm is not None:
        img, _, _, _ = normalize_heatmap(img, cmap_norm,\
                                         **kwargs)
    elif alwaysscale or (img.min() < 0 - inf_dec) or (img.max() > 1 + inf_dec):
        img = preprocessing.MinMaxScaler().fit_transform(img)
    colormap = plt.get_cmap(cmap)
    img = np.delete(colormap(img), 3, 2)
    if overlay_bg is not None:
        if len(overlay_bg.shape) == 3:
            if overlay_bg.shape[0] == 3:
                overlay_bg = np.transpose(overlay_bg, (1,2,0))
            overlay_bg = cv2.cvtColor(overlay_bg, cv2.COLOR_RGB2GRAY)
        if len(overlay_bg.shape) == 2:
            overlay_bg = np.stack((overlay_bg,)*3, axis=-1)
        img = heatmap_overlay(img, overlay_bg) / 255
    return img
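
For instance, a minimal usage sketch (the array is a hypothetical stand-in for any 2-D heatmap, and the package is assumed to be imported as mldatasets):

import numpy as np
import machine_learning_datasets as mldatasets

# A synthetic 2-D heatmap with values outside [0, 1], so apply_cmap
# will min-max scale it before colorizing.
heat = np.random.randn(32, 32)

# Returns a (32, 32, 3) float array with the 'jet' colormap applied.
colored = mldatasets.apply_cmap(heat, 'jet')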

cumulative_sum_threshold(values, percentile)

Calculate the cumulative sum threshold.

This function calculates the cumulative sum threshold of a given array of values based on a specified percentile.

Parameters:

  • values (ndarray): The array of values. Required.
  • percentile (int): The percentile for thresholding. Must be between 0 and 100 inclusive. Required.

Returns:

  • float: The threshold value.

Raises:

  • AssertionError: If the percentile is not between 0 and 100 inclusive.

Source code in machine_learning_datasets/preprocess.py
def cumulative_sum_threshold(
        values:np.ndarray,
        percentile:int
    ) -> float:
    """Calculate the cumulative sum threshold.

    This function calculates the cumulative sum threshold of a given array
    of values based on a specified percentile.

    Args:
        values (np.ndarray): The array of values.
        percentile (int): The percentile for thresholding. Must be between 0 and 100 inclusive.

    Returns:
        float: The threshold value.

    Raises:
        AssertionError: If the percentile is not between 0 and 100 inclusive.
    """
    # given values should be non-negative
    assert percentile >= 0 and percentile <= 100, (
        "Percentile for thresholding must be " "between 0 and 100 inclusive."
    )
    sorted_vals = np.sort(values.flatten())
    cum_sums = np.cumsum(sorted_vals)
    threshold_id = np.where(cum_sums >= cum_sums[-1] * 0.01 * percentile)[0][0]
    return sorted_vals[threshold_id]
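
A quick sketch with made-up values (inputs are assumed non-negative, per the comment in the source):

import numpy as np
import machine_learning_datasets as mldatasets

vals = np.array([0.1, 0.2, 0.3, 0.4])
# Smallest sorted value at which the running sum reaches 98% of the
# total, i.e. 0.4 for this array.
thresh = mldatasets.cumulative_sum_threshold(vals, 98)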

discretize(v, v_intervals, use_quantiles=False, use_continuous_bins=False)

Discretize a variable into intervals.

Parameters:

  • v (Union[str, Series, ndarray]): The variable to be discretized. Required.
  • v_intervals (Union[int, list, ndarray]): The intervals to discretize the variable into. Required.
  • use_quantiles (Optional[bool]): Whether to use quantiles for discretization. Defaults to False.
  • use_continuous_bins (Optional[bool]): Whether to use continuous bins for discretization. Defaults to False.

Returns:

  • Tuple[Union[str, pd.Series, np.ndarray], np.ndarray]: The discretized variable and the bins.

Raises:

  • ValueError: If the length of the interval does not match the number of unique items in the array.

Note
  • If v is a string and v_intervals is a list or array, the function returns v and v_intervals as is.
  • If v is numeric and v_intervals is an integer, the function discretizes v into v_intervals bins.
  • If v is an object or a category, the function converts v into a string and assigns a numerical value to each unique item.

Examples:

>>> v = [1, 2, 3, 4, 5]
>>> v_intervals = 2
>>> discretize(v, v_intervals)
([0, 0, 1, 1, 1], array([1, 3, 5]))
>>> v = pd.Series(['A', 'B', 'C', 'A', 'B'])
>>> v_intervals = ['A', 'B', 'C']
>>> discretize(v, v_intervals)
(0    0
1    1
2    2
3    0
4    1
dtype: object, array(['A', 'B', 'C'], dtype=object))
Source code in machine_learning_datasets/preprocess.py
def discretize(
        v:Union[str, pd.Series, np.ndarray],
        v_intervals:Union[int, list, np.ndarray],
        use_quantiles:Optional[bool] = False,
        use_continuous_bins:Optional[bool] = False
    ) -> Tuple[Union[str, pd.Series, np.ndarray], np.ndarray]:
    """Discretize a variable into intervals.

    Args:
        v (Union[str, pd.Series, np.ndarray]): The variable to be discretized.
        v_intervals (Union[int, list, np.ndarray]): The intervals to discretize the variable into.
        use_quantiles (Optional[bool], default=False): Whether to use quantiles for discretization.
        use_continuous_bins (Optional[bool], default=False): Whether to use continuous bins for
                                                             discretization.

    Returns:
        Tuple[Union[str, pd.Series, np.ndarray], np.ndarray]: The discretized variable and the bins.

    Raises:
        ValueError: If the length of the interval does not match the number of unique items in
                    the array.

    Note:
        - If `v` is a string and `v_intervals` is a list or array, the function returns `v` and
          `v_intervals` as is.
        - If `v` is numeric and `v_intervals` is an integer, the function discretizes `v` into
         `v_intervals` bins.
        - If `v` is an object or a category, the function converts `v` into a string and assigns
          a numerical value to each unique item.

    Examples:
        >>> v = [1, 2, 3, 4, 5]
        >>> v_intervals = 2
        >>> discretize(v, v_intervals)
        ([0, 0, 1, 1, 1], array([1, 3, 5]))

        >>> v = pd.Series(['A', 'B', 'C', 'A', 'B'])
        >>> v_intervals = ['A', 'B', 'C']
        >>> discretize(v, v_intervals)
        (0    0
        1    1
        2    2
        3    0
        4    1
        dtype: object, array(['A', 'B', 'C'], dtype=object))
    """
    if isinstance(v, (pd.core.series.Series, np.ndarray)) and\
        isinstance(v_intervals, (list, np.ndarray)) and len(np.unique(v)) != len(v_intervals):
        raise ValueError("length of interval must match unique items in array")

    if isinstance(v, (str)) and isinstance(v_intervals, (list, np.ndarray)):
        #name of variable instead of array and list of intervals used
        if isinstance(v_intervals, list) is True:
            v_intervals = np.array(v_intervals)
        return v, v_intervals

    if (np.isin(v.dtype, [int, float, 'int8', 'int16', 'int32', 'float16', 'float32'])) and\
        (isinstance(v_intervals, (int))) and (len(np.unique(v)) >= v_intervals) and\
            (max(v) > min(v)):
        #v is discretizable, otherwise assumed to be already discretized
        if use_continuous_bins:
            if use_quantiles:
                v, bins = pd.qcut(v, v_intervals, duplicates='drop', retbins=True,\
                                  labels=True, precision=2)
            else:
                v, bins = pd.cut(v, v_intervals, duplicates='drop', retbins=True,\
                                 labels=True, precision=2)
        else:
            if use_quantiles:
                v = pd.qcut(v, v_intervals, duplicates='drop', precision=2)
            else:
                v = pd.cut(v, v_intervals, duplicates='drop', precision=2)

    if np.isin(v.dtype, [object, 'category']):
        if not isinstance(v, (pd.core.series.Series)):
            v = pd.Series(v)
        bins = np.sort(np.unique(v)).astype(str)
        v = v.astype(str)
        bin_dict = {bins[i]:i for i in range(len(bins))}
        v = v.replace(bin_dict)
    else:
        bins = np.unique(v)

    if isinstance(v_intervals, (list, np.ndarray)) and len(bins) == len(v_intervals):
        bins = v_intervals

    return v, bins
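
As a complement to the docstring examples, a sketch of quantile-based binning (the values are illustrative):

import numpy as np
import machine_learning_datasets as mldatasets

v = np.array([1.0, 2.0, 3.0, 4.0, 100.0])
# Two quantile bins: roughly the lower and upper halves of the
# distribution, returned as integer codes along with the bin labels.
v_disc, bins = mldatasets.discretize(v, 2, use_quantiles=True)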

find_closest_datapoint_idx(point, points, metric_or_fn='euclidean', find_exact_first=0, distargs=None, scaler=None)

Find the index of the closest datapoint to a given point.

Parameters:

  • point (ArrayLike): The point for which to find the closest datapoint index. Required.
  • points (ArrayLike): The array of datapoints to search for the closest index. Required.
  • metric_or_fn (Optional[Union[str, Callable]]): The distance metric or function to use for calculating distances between points. Defaults to 'euclidean'.
  • find_exact_first (Optional[int]): Determines the behavior when multiple closest datapoints are found. Defaults to 0.
    - 0: Return the index of the last closest datapoint found.
    - 1: Return the index of the last closest datapoint found where the sum of the features of the datapoint matches the sum of the features of the point.
    - 2: Return the index of the last closest datapoint found where all the features of the datapoint match all the features of the point.
  • distargs (Optional[Dict]): Additional arguments to pass to the distance metric or function. Defaults to None.
  • scaler (Optional[BaseTransformerProtocol]): A scaler object to transform the point and points before calculating distances. Defaults to None.

Returns:

  • int: The index of the closest datapoint to the given point.

Raises:

  • ValueError: If the point is not 1-dimensional, the points are not 2-dimensional, or the number of features in the point and points do not match.
  • ValueError: If metric_or_fn is not a string or a callable object.

Note
  • If find_exact_first is set to 1, the function will first check for datapoints where the sum of the features matches the sum of the features of the point.
  • If find_exact_first is set to 2, the function will check for datapoints where all the features match all the features of the point.
  • If scaler is provided, the point and points will be transformed before calculating distances.
  • If metric_or_fn is a string, the function will use the specified distance metric from the scipy.spatial.distance module.
  • If metric_or_fn is a callable object, the function will use the provided distance function to calculate distances.
Source code in machine_learning_datasets/preprocess.py
def find_closest_datapoint_idx(
        point:ArrayLike,
        points:ArrayLike,
        metric_or_fn:Optional[Union[str, Callable]] = 'euclidean',
        find_exact_first:Optional[int] = 0,
        distargs:Optional[Dict] = None,
        scaler:Optional[BaseTransformerProtocol] = None
    ) -> int:
    """Find the index of the closest datapoint to a given point.

    Args:
        point (ArrayLike): The point for which to find the closest datapoint index.
        points (ArrayLike): The array of datapoints to search for the closest index.
        metric_or_fn (Optional[Union[str, Callable]], default='euclidean'): The distance metric or
                                         function to use for calculating distances between points.
        find_exact_first (Optional[int], default=0): Determines the behavior when multiple closest
                                                     datapoints are found.
            - 0: Return the index of the last closest datapoint found.
            - 1: Return the index of the last closest datapoint found where the sum of the features
                 of the datapoint matches the sum of the features of the point.
            - 2: Return the index of the last closest datapoint found where all the features of the
                 datapoint match all the features of the point.
        distargs (Optional[Dict], default=None): Additional arguments to pass to the distance metric
                                                 or function.
        scaler (Optional[BaseTransformerProtocol], default=None): A scaler object to transform
                                                                  the point and points before
                                                                  calculating distances.

    Returns:
        int: The index of the closest datapoint to the given point.

    Raises:
        ValueError: If the point is not 1-dimensional, the points are not 2-dimensional,
                    or the number of features in the point and points do not match.
        ValueError: If `metric_or_fn` is not a string or a callable object.

    Note:
        - If `find_exact_first` is set to 1, the function will first check for datapoints where
          the sum of the features matches the sum of the features of the point.
        - If `find_exact_first` is set to 2, the function will check for datapoints where all
          the features match all the features of the point.
        - If `scaler` is provided, the point and points will be transformed before calculating
          distances.
        - If `metric_or_fn` is a string, the function will use the specified distance metric
             from the `scipy.spatial.distance` module.
        - If `metric_or_fn` is a callable object, the function will use the provided distance
          function to calculate distances.
    """
    if distargs is None:
        distargs = {}
    if len(point.shape)!=1 or len(points.shape)!=2 or point.shape[0]!=points.shape[1]:
        raise ValueError("point must be a 1d and points 2d where their number of features match")
    closest_idx = None
    if find_exact_first==1:
        sums_pts = np.sum(points, axis=1)
        sum_pt = np.sum(point, axis=0)
        s = sums_pts==sum_pt
        if isinstance(s, pd.core.series.Series):
            closest_idxs = s[s==True].index.to_list() #TODO: check how to solve C0121
        else:
            closest_idxs = s.nonzero()[0]
        if len(closest_idxs) > 0:
            closest_idx = closest_idxs[-1]
    elif find_exact_first==2:
        if isinstance(points, pd.core.frame.DataFrame):
            for i in reversed(range(points.shape[0])):
                if np.allclose(point, points.iloc[i]):
                    closest_idx = points.iloc[i].name
                    break
        else:
            for i in reversed(range(points.shape[0])):
                if np.allclose(point, points[i]):
                    closest_idx = i
                    break
    if closest_idx is None:
        if scaler is not None:
            point_ = scaler.transform([point])
            #points_ = scaler.transform(points)
        else:
            point_ = [point]
            #points_ = points
        if isinstance(metric_or_fn, str):
            closest_idx = distance.cdist(point_, points, metric=metric_or_fn, **distargs).argmin()
        elif callable(metric_or_fn):
            dists = []
            if isinstance(points, pd.core.frame.DataFrame):
                for i in range(points.shape[0]):
                    dists.append(metric_or_fn(point_[0], points.iloc[i], **distargs))
            else:
                for i in range(points.shape[0]):
                    dists.append(metric_or_fn(point_[0], points[i], **distargs))
            closest_idx = np.array(dists).argmin()
        else:
            raise ValueError("`metric_or_fn` must be a string of a distance metric or valid "
                             "distance function")
        if isinstance(points, pd.core.frame.DataFrame):
            closest_idx = points.iloc[closest_idx].name

    return closest_idx
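
A minimal sketch with hypothetical points (the package is assumed to be imported as mldatasets):

import numpy as np
import machine_learning_datasets as mldatasets

points = np.array([[0.0, 0.0],
                   [1.0, 1.0],
                   [5.0, 5.0]])
point = np.array([0.9, 1.2])

# Index of the euclidean-nearest row of `points` (1 in this case).
idx = mldatasets.find_closest_datapoint_idx(point, points)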

heatmap_overlay(bg_img, overlay_img, cmap='jet')

Overlay a heatmap on top of an image.

Parameters:

  • bg_img (ndarray): The background image. Required.
  • overlay_img (ndarray): The heatmap image to overlay. Required.
  • cmap (Optional[Union[str, Colormap]]): The colormap to use for the heatmap. Defaults to 'jet'.

Returns:

  • np.ndarray: The resulting image with the heatmap overlay.

Source code in machine_learning_datasets/preprocess.py
def heatmap_overlay(
        bg_img:np.ndarray,
        overlay_img:np.ndarray,
        cmap:Optional[Union[str,Colormap]] = 'jet'
    ) -> np.ndarray:
    """Overlay a heatmap on top of an image.

    Args:
        bg_img (np.ndarray): The background image.
        overlay_img (np.ndarray): The heatmap image to overlay.
        cmap (Optional[Union[str,Colormap]], optional): The colormap to use for the heatmap.
                                                        Defaults to 'jet'.

    Returns:
        np.ndarray: The resulting image with the heatmap overlay.
    """
    img = np.uint8(bg_img[..., :3] * 255)
    if len(overlay_img.shape) == 2:
        overlay_img = cm.get_cmap(cmap)(overlay_img)
    heatmap = np.uint8(overlay_img[..., :3] * 255)
    return cv2.addWeighted(img, 0.5, heatmap, 0.5, 0)
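
A usage sketch with synthetic inputs (both arrays are assumed to hold floats in [0, 1]):

import numpy as np
import machine_learning_datasets as mldatasets

bg = np.random.rand(64, 64, 3)   # background image
heat = np.random.rand(64, 64)    # 2-D heatmap, colorized with 'jet' internally

# 50/50 blend of the background and the colorized heatmap, as a uint8 array.
blended = mldatasets.heatmap_overlay(bg, heat)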

img_np_from_fig(fig, dpi=14)

Converts a matplotlib figure to a NumPy array representing an image.

Parameters:

  • fig (Figure): The matplotlib figure to convert. Required.
  • dpi (Optional[int]): The resolution of the image in dots per inch. Defaults to 14.

Returns:

  • np.ndarray: The NumPy array representing the image.

Example:

    fig = plt.figure()
    # ... create and modify the figure ...
    img = img_np_from_fig(fig)

Source code in machine_learning_datasets/preprocess.py
def img_np_from_fig(
        fig:Figure,
        dpi:Optional[int] = 14
    ) -> np.ndarray:
    """Converts a matplotlib figure to a NumPy array representing an image.

    Args:
        fig (Figure): The matplotlib figure to convert.
        dpi (Optional[int]): The resolution of the image in dots per inch. Default is 14.

    Returns:
        np.ndarray: The NumPy array representing the image.

    Example:
        fig = plt.figure()
        # ... create and modify the figure ...
        img = img_np_from_fig(fig)
    """
    buffer = io.BytesIO()
    fig.savefig(buffer, format="png", dpi=dpi)
    buffer.seek(0)
    img_np = np.frombuffer(buffer.getvalue(), dtype=np.uint8)
    buffer.close()
    img_np = cv2.imdecode(img_np, 1)
    img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2RGB)
    return img_np
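
A slightly fuller version of the docstring example (the plot content is arbitrary):

import matplotlib.pyplot as plt
import machine_learning_datasets as mldatasets

fig, ax = plt.subplots()
ax.plot([0, 1, 2], [0, 1, 4])

# RGB uint8 array rendered at the default 14 dpi.
img = mldatasets.img_np_from_fig(fig)
plt.close(fig)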

make_dummies_from_dict(df, colname, match_dict, drop_orig=True, nospacechr='_')

Creates dummy variables based on a dictionary or list of values.

Parameters:

  • df (DataFrame): The input DataFrame. Required.
  • colname (str): The name of the column to create dummies from. Required.
  • match_dict (Union[Dict, List]): A dictionary or list of values to match against in the column. Required.
  • drop_orig (Optional[bool]): Whether to drop the original column after creating dummies. Defaults to True.
  • nospacechr (Optional[str]): The character to replace spaces with in the dummy variable names. Defaults to '_'.

Returns:

  • pd.DataFrame: The DataFrame with dummy variables created.

Example:

    >>> df = pd.DataFrame({'col1': ['apple', 'banana', 'orange']})
    >>> match_dict = {'apple': 'fruit', 'banana': 'fruit'}
    >>> make_dummies_from_dict(df, 'col1', match_dict)
       col1_fruit  col1_orange
    0           1            0
    1           1            0
    2           0            1

Source code in machine_learning_datasets/preprocess.py
def make_dummies_from_dict(
        df:pd.DataFrame,
        colname:str,
        match_dict:Union[Dict, List],
        drop_orig:Optional[bool] = True,
        nospacechr:Optional[str] = '_'
    ) -> pd.DataFrame:
    """Creates dummy variables based on a dictionary or list of values.

    Args:
        df (pd.DataFrame): The input DataFrame.
        colname (str): The name of the column to create dummies from.
        match_dict (Union[Dict, List]): A dictionary or list of values to match against
                                        in the column.
        drop_orig (Optional[bool]): Whether to drop the original column after creating
                                    dummies. Defaults to True.
        nospacechr (Optional[str]): The character to replace spaces with in the dummy
                                    variable names. Defaults to '_'.

    Returns:
        pd.DataFrame: The DataFrame with dummy variables created.

    Example:
        >>> df = pd.DataFrame({'col1': ['apple', 'banana', 'orange']})
        >>> match_dict = {'apple': 'fruit', 'banana': 'fruit'}
        >>> make_dummies_from_dict(df, 'col1', match_dict)
           col1_fruit  col1_orange
        0           1            0
        1           1            0
        2           0            1
    """
    if isinstance(match_dict, list) is True:
        if len(nospacechr) > 0:
            match_dict = {match_key:match_key.\
                              replace(' ', nospacechr)\
                              for match_key in match_dict }
        else:
            match_dict = {match_key:match_key\
                              for match_key in match_dict}
    for match_key in match_dict.keys():
        df[colname+'_'+match_dict[match_key]] =\
                    np.where(df[colname].str.contains(match_key), 1, 0)
    if drop_orig:
        return df.drop([colname], axis=1)
    else:
        return df

make_dummies_with_limits(df_, colname, min_recs=0.005, max_dummies=20, defcatname='Other', nospacechr='_')

Make dummies with limits.

Parameters:

  • df_ (DataFrame): The input DataFrame. Required.
  • colname (str): The name of the column to create dummies for. Required.
  • min_recs (Optional[Union[int, float]]): The minimum number of records required per category; interpreted as a fraction of all records when less than 1. Defaults to 0.005.
  • max_dummies (Optional[int]): The maximum number of dummies to create. Defaults to 20.
  • defcatname (Optional[str]): The name for the 'Other' category. Defaults to 'Other'.
  • nospacechr (Optional[str]): The character to replace spaces in the column name. Defaults to '_'.

Returns:

  • pd.DataFrame: The DataFrame with dummies created.

Note
  • If min_recs is less than 1, it is interpreted as a fraction of the total number of records.
  • Dummies are created for the top values in the specified column, up to the maximum number of dummies.
  • Values that do not meet the minimum number of records or are beyond the maximum number of dummies are grouped into the 'Other' category.
  • Spaces in the column name are replaced with the specified character.
Source code in machine_learning_datasets/preprocess.py
def make_dummies_with_limits(
        df_:pd.DataFrame,
        colname:str,
        min_recs:Optional[Union[int, float]] = 0.005,
        max_dummies:Optional[int] = 20,
        defcatname:Optional[str] = 'Other',
        nospacechr:Optional[str] = '_'
    ) -> pd.DataFrame:
    """Make dummies with limits.

    Args:
        df_ (pd.DataFrame): The input DataFrame.
        colname (str): The name of the column to create dummies for.
        min_recs (Optional[Union[int, float]], default=0.005): The minimum number of repeated
                                                               records.
        max_dummies (Optional[int], default=20): The maximum number of dummies to create.
        defcatname (Optional[str], default='Other'): The name for the 'Other' category.
        nospacechr (Optional[str], default='_'): The character to replace spaces in the column name.

    Returns:
        pd.DataFrame: The DataFrame with dummies created.

    Note:
        - If min_recs is less than 1, it is interpreted as a fraction of the total number of
          records.
        - Dummies are created for the top values in the specified column, up to the maximum
          number of dummies.
        - Values that do not meet the minimum number of records or are beyond the maximum
          number of dummies are grouped into the 'Other' category.
        - Spaces in the column name are replaced with the specified character.
    """
    df = df_.copy()
    # min_recs is the minimum number of records per category
    if min_recs < 1:
        min_recs = df.shape[0]*min_recs
    topvals_df = df.groupby(colname).size().reset_index(name="counts").\
                    sort_values(by="counts", ascending=False).reset_index()
    other_l = topvals_df[(topvals_df.index > max_dummies) |\
                         (topvals_df.counts < min_recs)][colname].to_list()
    # Fold any value listed in other_l into the default category
    if len(other_l):
        df.loc[df[colname].isin(other_l), colname] = defcatname
    # Replace spaces in the category values with nospacechr
    if len(nospacechr) > 0:
        df[colname] = df[colname].str.replace(' ',\
                                                  nospacechr, regex=False)
    return pd.get_dummies(df, prefix=[colname], columns=[colname])
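
A small sketch with hypothetical data, using an absolute min_recs instead of the default fraction:

import pandas as pd
import machine_learning_datasets as mldatasets

df = pd.DataFrame({'city': ['NYC', 'NYC', 'LA', 'LA', 'SF', 'Boston']})

# NYC and LA each get a dummy column; SF and Boston fall below
# min_recs=2 and are folded into 'Other'.
dummies_df = mldatasets.make_dummies_with_limits(df, 'city', min_recs=2)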

minmax_scale_img(img)

Scales the input image to the range [0, 1].

Parameters:

  • img (ndarray): The input image. Required.

Returns:

  • np.ndarray: The scaled image.

Source code in machine_learning_datasets/preprocess.py
def minmax_scale_img(
        img:np.ndarray
    ) -> np.ndarray:
    """Scales the input image to the range [0, 1].

    Args:
        img (np.ndarray): The input image.

    Returns:
        np.ndarray: The scaled image.
    """
    if img.max() != img.min():
        img = (img - img.min()) / (img.max() - img.min())
    return img

minmax_scale_img_posneg(img)

Scales the input image to the range [0, 1] by performing min-max scaling separately for positive and negative values.

Parameters:

  • img (ndarray): The input image. Required.

Returns:

  • np.ndarray: The scaled image.

Source code in machine_learning_datasets/preprocess.py
def minmax_scale_img_posneg(
        img:np.ndarray
    ) -> np.ndarray:
    """Scales the input image to the range [0, 1] by performing min-max scaling
       separately for positive and negative values.

    Args:
        img (np.ndarray): The input image.

    Returns:
        np.ndarray: The scaled image.
    """
    img_pos = np.where(img > 0, img, 0)
    img_pos = np.where(img > 0, (minmax_scale_img(img_pos) / 2) + 0.5, 0.5)
    img_neg = np.where(img < 0, img, 0)
    img_neg = np.where(img < 0, (minmax_scale_img(img_neg) / 2), 0.5)
    img = np.where(img==0, 0.5, np.where(img > 0, img_pos, img_neg))
    return img
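
A sketch covering both scaling helpers on an illustrative signed array:

import numpy as np
import machine_learning_datasets as mldatasets

attr = np.array([[-2.0, 0.0],
                 [ 1.0, 4.0]])

# Plain rescaling of the full range to [0, 1].
scaled = mldatasets.minmax_scale_img(attr)

# Signed variant: negatives land in [0, 0.5), zeros at 0.5,
# positives in (0.5, 1].
scaled_pn = mldatasets.minmax_scale_img_posneg(attr)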

normalize_heatmap(heatmap, sign, outlier_perc=2, reduction_axis=None)

Normalize the heatmap based on the given sign type and outlier percentage.

Parameters:

  • heatmap (ndarray): The input heatmap. Required.
  • sign (str): The sign type for normalization. Possible values are "all", "positive", "negative", and "absolute_value". Required.
  • outlier_perc (Optional[int]): The percentage of outliers to remove. Defaults to 2.
  • reduction_axis (Optional[int]): The axis along which to reduce the heatmap. Defaults to None.

Returns:

  • Tuple[np.ndarray, Union[str, Colormap], int, int]: A tuple containing the normalized heatmap, the colormap, vmin, and vmax.

Raises:

  • AssertionError: If the sign type is not valid.

Source code in machine_learning_datasets/preprocess.py
def normalize_heatmap(
        heatmap:np.ndarray,
        sign:str,
        outlier_perc:Optional[int] = 2,
        reduction_axis:Optional[int] = None
    ) -> Tuple[np.ndarray, Union[str,Colormap], int, int]:
    """Normalize the heatmap based on the given sign type and outlier percentage.

    Args:
        heatmap (np.ndarray): The input heatmap.
        sign (str): The sign type for normalization. Possible values are "all", "positive",
                    "negative", and "absolute_value".
        outlier_perc (Optional[int]): The percentage of outliers to remove. Default is 2.
        reduction_axis (Optional[int]): The axis along which to reduce the heatmap. Default is None.

    Returns:
        Tuple[np.ndarray, Union[str,Colormap], int, int]: A tuple containing the normalized heatmap,
                                                          the colormap, vmin, and vmax.

    Raises:
        AssertionError: If the sign type is not valid.
    """
    heatmap_combined = heatmap
    if reduction_axis is not None:
        heatmap_combined = np.sum(heatmap, axis=reduction_axis)

    # Choose appropriate signed values and rescale, removing given outlier percentage.
    default_cmap = "jet"
    if sign == "all":
        threshold = cumulative_sum_threshold(np.abs(heatmap_combined), 100 - outlier_perc)
        default_cmap = LinearSegmentedColormap.from_list(
            "RdWhGn", ["red", "white", "green"]
        )
        vmin, vmax = -1, 1
    elif sign == "positive":
        heatmap_combined = (heatmap_combined > 0) * heatmap_combined
        threshold = cumulative_sum_threshold(heatmap_combined, 100 - outlier_perc)
        #default_cmap = "Greens"
        vmin, vmax = 0, 1
    elif sign == "negative":
        heatmap_combined = (heatmap_combined < 0) * heatmap_combined
        threshold = -1 * cumulative_sum_threshold(
            np.abs(heatmap_combined), 100 - outlier_perc
        )
        #default_cmap = "Reds"
        vmin, vmax = 0, 1
    elif sign == "absolute_value":
        heatmap_combined = np.abs(heatmap_combined)
        threshold = cumulative_sum_threshold(heatmap_combined, 100 - outlier_perc)
        #default_cmap = "Blues"
        vmin, vmax = 0, 1
    else:
        raise AssertionError("Heatmap normalization sign type is not valid.")

    heatmap_scaled = normalize_scale(heatmap_combined, threshold)
    if (vmin == -1) and (vmax == 1):
        heatmap_scaled = minmax_scale_img_posneg(heatmap_scaled)
    return heatmap_scaled, default_cmap, vmin, vmax
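
A sketch on random data (any 2-D signed array works the same way):

import numpy as np
import machine_learning_datasets as mldatasets

heatmap = np.random.randn(7, 7)

# 'all' keeps both signs: returns the rescaled map, a red-white-green
# diverging colormap, and vmin/vmax of -1 and 1.
scaled, cmap, vmin, vmax = mldatasets.normalize_heatmap(heatmap, 'all')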

normalize_scale(heatmap, scale_factor)

Normalize the given heatmap by dividing it by the specified scale factor.

Parameters:

  • heatmap (np.ndarray): The input heatmap to be normalized.
  • scale_factor (float): The scale factor to divide the heatmap by.

Returns:

  • np.ndarray: The normalized heatmap.

Raises:

  • UserWarning: If the scale_factor is equal to 0, a warning is raised indicating that normalization is not possible.
  • UserWarning: If the absolute value of the scale_factor is less than 1e-5, a warning is raised indicating that the heatmap values are close to 0 and the visualized results may be misleading.

Note:

  • The normalized heatmap is obtained by dividing the input heatmap by the scale_factor.
  • The resulting normalized heatmap is clipped between -1 and 1 using the np.clip() function.

Source code in machine_learning_datasets/preprocess.py
def normalize_scale(
        heatmap:np.ndarray,
        scale_factor:float
    ) -> np.ndarray:
    """Normalize the given heatmap by dividing it by the specified scale factor.

    Parameters:
    - heatmap (np.ndarray): The input heatmap to be normalized.
    - scale_factor (float): The scale factor to divide the heatmap by.

    Returns:
    - np.ndarray: The normalized heatmap.

    Raises:
    - UserWarning: If the scale_factor is equal to 0, a warning is raised indicating that
                   normalization is not possible.
    - UserWarning: If the absolute value of the scale_factor is less than 1e-5, a warning
                   is raised indicating that the heatmap values are close to 0 and the
                   visualized results may be misleading.

    Note:
    - The normalized heatmap is obtained by dividing the input heatmap by the scale_factor.
    - The resulting normalized heatmap is clipped between -1 and 1 using np.clip() function.
    """
    if scale_factor == 0:
        warnings.warn("Cannot normalize by scale factor = 0")
        heatmap_norm = heatmap
    else:
        if abs(scale_factor) < 1e-5:
            warnings.warn(
                "Attempting to normalize by value approximately 0, visualized results"
                "may be misleading. This likely means that heatmap values are all"
                "close to 0."
            )
        heatmap_norm = heatmap / scale_factor
    return np.clip(heatmap_norm, -1, 1)

tensor_to_img(tensor, norm_std=None, norm_mean=None, to_numpy=False, cmap_norm=None, cmap=None, cmap_alwaysscale=False, overlay_bg=None, **kwargs)

Converts a tensor to an image.

Parameters:

  • tensor (Tensor): The input tensor. Required.
  • norm_std (Optional[Tuple]): The standard deviation for normalization. Defaults to None.
  • norm_mean (Optional[Tuple]): The mean for normalization. Defaults to None.
  • to_numpy (Optional[bool]): Whether to convert the tensor to a numpy array. Defaults to False.
  • cmap_norm (Optional[Union[str, Colormap]]): The normalization method for the colormap. Defaults to None.
  • cmap (Optional[Union[str, Colormap]]): The colormap to apply to the image. Defaults to None.
  • cmap_alwaysscale (Optional[bool]): Whether to always scale the colormap. Defaults to False.
  • overlay_bg (Optional[ndarray]): The background image to overlay. Defaults to None.
  • **kwargs (Any): Additional keyword arguments.

Returns:

  • Optional[np.ndarray]: The converted image as a numpy array, or None if the conversion fails.

Source code in machine_learning_datasets/preprocess.py
def tensor_to_img(
        tensor:Tensor,
        norm_std:Optional[Tuple] = None,
        norm_mean:Optional[Tuple] = None,
        to_numpy:Optional[bool] = False,
        cmap_norm:Optional[Union[str,Colormap]] = None,
        cmap:Optional[Union[str,Colormap]] = None,
        cmap_alwaysscale:Optional[bool] = False,
        overlay_bg:Optional[np.ndarray] = None,
        **kwargs:Any
    ) -> Optional[np.ndarray]:
    """Converts a tensor to an image.

    Args:
        tensor (Tensor): The input tensor.
        norm_std (Optional[Tuple]): The standard deviation for normalization. Default is None.
        norm_mean (Optional[Tuple]): The mean for normalization. Default is None.
        to_numpy (Optional[bool]): Whether to convert the tensor to a numpy array. Default is False.
        cmap_norm (Optional[Union[str,Colormap]]): The normalization method for the colormap.
                                                   Default is None.
        cmap (Optional[Union[str,Colormap]]): The colormap to apply to the image. Default is None.
        cmap_alwaysscale (Optional[bool]): Whether to always scale the colormap. Default is False.
        overlay_bg (Optional[np.ndarray]): The background image to overlay. Default is None.
        **kwargs (Any): Additional keyword arguments.

    Returns:
        Optional[np.ndarray]: The converted image as a numpy array, or None if the conversion fails.
    """
    if norm_std is not None and norm_mean is not None:
        tensor_ = copy.deepcopy(tensor)
        for t, s, m in zip(tensor_, norm_std, norm_mean):
            t.mul_(s).add_(m)
    else:
        tensor_ = tensor

    if to_numpy:
        img_ = tensor_.cpu().detach().numpy()
        if (len(img_.shape) == 3) and (img_.shape[0] == 3):
            img_ = np.transpose(img_, (1,2,0))
            #img_ = np.where(img_ > 0, img_, 0)
        if cmap_norm is not None:
            img_, default_cmap, _, _ = normalize_heatmap(img_, cmap_norm,\
                                                            **kwargs)
            if cmap is None:
                cmap = default_cmap
        if cmap is not None:
            img_ = apply_cmap(img_, cmap, alwaysscale=cmap_alwaysscale, overlay_bg=overlay_bg)
    else:
        img_ = torchvision.transforms.ToPILImage()(tensor_)

    return img_
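
Finally, a usage sketch (the tensor and the normalization constants are placeholders):

import torch
import machine_learning_datasets as mldatasets

# A 3xHxW tensor, e.g. an image after ToTensor() and Normalize().
tensor = torch.rand(3, 32, 32)

# Undo the per-channel normalization and return an HxWxC numpy array.
img = mldatasets.tensor_to_img(tensor,
                               norm_std=(0.5, 0.5, 0.5),
                               norm_mean=(0.5, 0.5, 0.5),
                               to_numpy=True)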