Skip to content

utils.py

This module contains general pipeline utility functions.

StopWatch

A simple stopwatch to simplify timing code.

Source code in vast_pipeline/utils/utils.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
class StopWatch:
    """
    Minimal wall-clock timer for measuring elapsed time between code sections.

    Two reference points are tracked: the moment the watch was created (or
    last fully reset) and the moment of the most recent lap reset.
    """

    def __init__(self) -> None:
        """
        Start the stopwatch, using the current time for both reference points.

        Returns:
            None.
        """
        started = datetime.now()
        self._init = started
        self._last = started

    def reset(self) -> float:
        """
        Begin a new lap and report how long the previous one lasted.

        Returns:
            The time in seconds since the last reset (or since
            initialisation if the stopwatch has not been reset yet).
        """
        # swap in the new lap start while keeping hold of the old one
        previous, self._last = self._last, datetime.now()

        return (self._last - previous).total_seconds()

    def reset_init(self) -> float:
        """
        Restart the stopwatch entirely and report the total running time.

        Returns:
            The time in seconds since the initialisation.
        """
        origin = self._init
        # both reference points restart from the same instant
        self._init = self._last = datetime.now()

        return (self._init - origin).total_seconds()

__init__()

Initialise the StopWatch

Returns:

Type Description
None

None.

Source code in vast_pipeline/utils/utils.py
27
28
29
30
31
32
33
34
35
def __init__(self) -> None:
    """
    Start the stopwatch, recording the current time as both the
    initialisation time and the last-reset time.

    Returns:
        None.
    """
    self._last = self._init = datetime.now()

reset()

Reset the stopwatch and return the time since last reset (seconds).

Returns:

Type Description
float

The time in seconds since the last reset.

Source code in vast_pipeline/utils/utils.py
37
38
39
40
41
42
43
44
45
46
47
48
def reset(self) -> float:
    """
    Begin a new lap and report how long the previous one lasted.

    Returns:
        The time in seconds since the last reset.
    """
    # swap in the new lap start while keeping hold of the old one
    previous, self._last = self._last, datetime.now()

    return (self._last - previous).total_seconds()

reset_init()

Reset the stopwatch and return the total time since initialisation.

Returns:

Type Description
float

The time in seconds since the initialisation.

Source code in vast_pipeline/utils/utils.py
50
51
52
53
54
55
56
57
58
59
60
61
def reset_init(self) -> float:
    """
    Restart the stopwatch entirely and report the total running time.

    Returns:
        The time in seconds since the initialisation.
    """
    origin = self._init
    # both reference points restart from the same instant
    self._init = self._last = datetime.now()

    return (self._init - origin).total_seconds()

calculate_n_partitions(df, n_cpu, partition_size_mb=15)

This function will calculate how many partitions a dataframe should be split into.

Parameters:

Name Type Description Default
df

The pandas dataframe to be partitioned.

required
n_cpu

The number of available CPUs.

required
partition_size_mb

The optimal partition size in MB. NOTE: The default partition size of 15MB is chosen because many of the parallelised operations on partitioned DataFrames can consume a much larger amount of memory than the size of the partition. 15MB avoids consuming too much memory for significant amounts of parallelism (e.g. n_cpu > 10) without significant cost to processing speed.

15

Returns:

Type Description

The optimal number of partitions.

Source code in vast_pipeline/utils/utils.py
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
def calculate_n_partitions(
    df: pd.DataFrame, n_cpu: int, partition_size_mb: float = 15
) -> int:
    """
    This function will calculate how many partitions a dataframe should be
    split into.

    Args:
        df: The pandas dataframe to be partitioned.
        n_cpu: The number of available CPUs.
        partition_size_mb: The optimal partition size in MB.
            NOTE: The default partition size of 15MB is chosen because
                many of the parallelised operations on partitioned
                DataFrames can consume a much larger amount of memory
                than the size of the partition. 15MB avoids consuming
                too much memory for significant amounts of parallelism
                (e.g. n_cpu > 10) without significant cost to processing
                speed.

    Returns:
        The optimal number of partitions. This is never smaller than
        `n_cpu` and never smaller than 1.
    """
    # deep=True so that object (e.g. string) columns are measured properly
    mem_usage_mb = df.memory_usage(deep=True).sum() / 1e6

    # n_partitions should be >= n_cpu for optimal parallel processing, and
    # at least 1 so the size calculation below can never divide by zero
    n_partitions = max(
        int(np.ceil(mem_usage_mb / partition_size_mb)), n_cpu, 1
    )

    # the size actually obtained with that many partitions; kept in its own
    # name rather than overwriting the partition_size_mb argument
    actual_partition_mb = int(np.ceil(mem_usage_mb / n_partitions))

    logger.debug(
        "Using %d partions of %dMB", n_partitions, actual_partition_mb
    )

    return n_partitions

calculate_workers_and_partitions(df, n_cpu=None, max_partition_mb=15)

Return number of workers and the number of partitions for Dask

Parameters:

Name Type Description Default
df

The pandas dataframe to be partitioned. Partitions are not calculated if df is None.

required
n_cpu

The maximum number of workers to allocate. The default of None means use one less than all available cores

None
max_partition_mb

The maximum partition size in MB.

15

Returns:

Type Description
(num_workers, n_partitions)

Calculated workers and partitions.

Source code in vast_pipeline/utils/utils.py
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
def calculate_workers_and_partitions(df, n_cpu=None, max_partition_mb=15):
    """
    Return number of workers and the number of partitions for Dask

    Args:
        df: The pandas dataframe to be partitioned.
            Don't calculate partitions if df is None
        n_cpu: The desired number of workers. The default of None means
            use one less than all available cores. Requests above that
            limit are reduced to it, and at least one worker is always
            returned.
        max_partition_mb: The maximum partition size in MB.

    Returns:
        (num_workers, n_partitions): Calculated workers and partitions.
            n_partitions is 0 when df is None.
    """
    # Leave one core free for the parent process, but never drop below one
    # usable core: cpu_count() - 1 is 0 on a single-core machine. The
    # `or 1` covers cpu_count implementations that return None when the
    # core count cannot be determined.
    num_cpu = max((cpu_count() or 1) - 1, 1)
    num_workers = num_cpu if n_cpu is None else n_cpu
    if num_workers > num_cpu:
        logger.debug("%d desired workers is greater than available cores. Limiting to %s.",
                     num_workers, num_cpu)
        num_workers = num_cpu
    # a zero or negative request would otherwise produce no workers at all
    num_workers = max(num_workers, 1)

    n_partitions = 0
    if df is not None:
        n_partitions = calculate_n_partitions(df, num_workers,
                                              partition_size_mb=max_partition_mb)

    return num_workers, n_partitions

check_read_write_perm(path, perm='W')

Assess the file permission on a path.

Parameters:

Name Type Description Default
path str

The system path to assess.

required
perm str

The permission to check for.

'W'

Returns:

Type Description
None

None

Raises:

Type Description
IOError

The permission is not valid on the checked directory.

Source code in vast_pipeline/utils/utils.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
def check_read_write_perm(path: str, perm: str = "W") -> None:
    """
    Assess the file permission on a path.

    Args:
        path: The system path to assess.
        perm: The permission to check for. One of "R" (read), "W" (write)
            or "X" (execute).

    Returns:
        None

    Raises:
        AssertionError: `perm` is not one of the supported permissions.
        IOError: The permission is not valid on the checked directory.
    """
    perm_map = {"R": os.R_OK, "W": os.W_OK, "X": os.X_OK}

    # Raise explicitly instead of using an `assert` statement, which is
    # stripped when Python runs with -O and would let an unsupported value
    # fall through to a KeyError. AssertionError is kept so callers see the
    # same exception type as before.
    if perm not in tuple(perm_map):
        raise AssertionError("permission not supported")

    if not os.access(path, perm_map[perm]):
        msg = f"permission not valid on folder: {path}"
        logger.error(msg)
        raise IOError(msg)

deg2dms(deg, dms_format=False, precision=2, truncate=False, latitude=True)

Convert angle in degrees into a DMS formatted string.

Parameters:

Name Type Description Default
deg float

The angle to convert in degrees.

required
dms_format optional

If True, use "d", "m", and "s" as the coordinate separator, otherwise use ":". Defaults to False.

False
precision optional

Floating point precision of the arcseconds component. Can be 0 or a positive integer. Negative values will be interpreted as 0. Defaults to 2.

2
truncate optional

Truncate values after the decimal point instead of rounding. Defaults to False (rounding).

False
latitude optional

The input deg value should be interpreted as a latitude. Otherwise, it will be interpreted as a longitude. Defaults to True (latitude).

True

Returns:

Type Description
str

deg formatted as a DMS string.

Example

>>> deg2dms(12.582438888888889)
'+12:34:56.78'
>>> deg2dms(2.582438888888889, dms_format=True)
'+02d34m56.78s'
>>> deg2dms(-12.582438888888889, precision=1)
'-12:34:56.8'
>>> deg2dms(-12.582438888888889, precision=1, truncate=True)
'-12:34:56.7'

Source code in vast_pipeline/utils/utils.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
def _truncate_seconds(rendered: str, precision: int) -> str:
    """Cut or zero-pad the decimal places of a rendered sexagesimal string.

    Args:
        rendered: The sexagesimal string to adjust. It may end with a unit
            marker such as "s".
        precision: The number of decimal places to keep. Must be >= 0.

    Returns:
        `rendered` with exactly `precision` decimal places in its last
        component and any trailing unit marker preserved.
    """
    # split off trailing non-digit characters (e.g. the "s" unit marker) so they
    # are neither counted as decimal places nor sliced away with them
    end = len(rendered)
    while end > 0 and not rendered[end - 1].isdigit():
        end -= 1
    number, suffix = rendered[:end], rendered[end:]

    # find the decimal point char position and the number of decimal places in the
    # rendered coordinate (in DMS format, not decimal deg)
    dp_pos = number.find(".")
    n_dp = len(number) - dp_pos - 1 if dp_pos >= 0 else 0

    if n_dp < precision:
        # fewer decimals than requested: pad the end with zeroes, adding the
        # decimal point first if the rendered value had none
        if dp_pos < 0:
            number += "."
        number += "0" * (precision - n_dp)
    elif n_dp > precision:
        # more decimals than requested: cut off the excess. For precision 0 the
        # decimal point itself is dropped as well
        cut = dp_pos + precision + 1 if precision > 0 else dp_pos
        number = number[:cut]
    # in the n_dp == precision case, do nothing

    return number + suffix


def deg2dms(
    deg: float,
    dms_format: bool = False,
    precision: int = 2,
    truncate: bool = False,
    latitude: bool = True,
) -> str:
    """Convert angle in degrees into a DMS formatted string.

    Args:
        deg: The angle to convert in degrees.
        dms_format (optional): If `True`, use "d", "m", and "s" as the coordinate
            separator, otherwise use ":". Defaults to False.
        precision (optional): Floating point precision of the arcseconds component.
            Can be 0 or a positive integer. Negative values will be interpreted as 0.
            Defaults to 2.
        truncate (optional): Truncate values after the decimal point instead of rounding.
            Defaults to False (rounding).
        latitude (optional): The input `deg` value should be interpreted as a latitude.
            Otherwise, it will be interpreted as a longitude.
            Defaults to True (latitude).

    Returns:
        `deg` formatted as a DMS string.

    Example:
        >>> deg2dms(12.582438888888889)
        '+12:34:56.78'
        >>> deg2dms(2.582438888888889, dms_format=True)
        '+02d34m56.78s'
        >>> deg2dms(-12.582438888888889, precision=1)
        '-12:34:56.8'
        >>> deg2dms(-12.582438888888889, precision=1, truncate=True)
        '-12:34:56.7'
        >>> deg2dms(-12.582438888888889, dms_format=True, precision=1, truncate=True)
        '-12d34m56.7s'
    """
    AngleClass = Latitude if latitude else Longitude
    angle = AngleClass(deg, unit="deg")
    precision = max(precision, 0)

    output_str: str = angle.to_string(
        unit="deg",
        sep="fromunit" if dms_format else ":",
        # when truncating, do not ask for a fixed precision here: the rendered
        # string is cut to length afterwards instead of being rounded
        precision=None if truncate else precision,
        alwayssign=True,
        pad=True,
    )
    if truncate:
        # handled on the string so that the trailing "s" produced by
        # sep="fromunit" survives the truncation / padding
        output_str = _truncate_seconds(output_str, precision)
    return output_str

deg2hms(deg, hms_format=False, precision=2, truncate=False, longitude=True)

Convert angle in degrees into a HMS formatted string.

Parameters:

Name Type Description Default
deg float

The angle to convert in degrees.

required
hms_format optional

If True, use "h", "m", and "s" as the coordinate separator, otherwise use ":". Defaults to False.

False
precision optional

Floating point precision of the seconds component. Can be 0 or a positive integer. Negative values will be interpreted as 0. Defaults to 2.

2
truncate optional

Truncate values after the decimal point instead of rounding. Defaults to False (rounding).

False
longitude optional

The input deg value should be interpreted as a longitude. Otherwise, it will be interpreted as a latitude. Defaults to True (longitude).

True

Returns:

Type Description
str

deg formatted as an HMS string.

Example

>>> deg2hms(188.73658333333333)
'12:34:56.78'
>>> deg2hms(188.73658333333333, hms_format=True)
'12h34m56.78s'
>>> deg2hms(188.73658333333333, precision=1)
'12:34:56.8'
>>> deg2hms(188.73658333333333, precision=1, truncate=True)
'12:34:56.7'

Source code in vast_pipeline/utils/utils.py
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
def deg2hms(
    deg: float,
    hms_format: bool = False,
    precision: int = 2,
    truncate: bool = False,
    longitude: bool = True,
) -> str:
    """Convert angle in degrees into a HMS formatted string.

    Args:
        deg: The angle to convert in degrees.
        hms_format (optional): If `True`, use "h", "m", and "s" as the coordinate
            separator, otherwise use ":". Defaults to False.
        precision (optional): Floating point precision of the seconds component.
            Can be 0 or a positive integer. Negative values will be interpreted as 0.
            Defaults to 2.
        truncate (optional): Truncate values after the decimal point instead of rounding.
            Defaults to False (rounding).
        longitude (optional): The input `deg` value should be interpreted as a longitude.
            Otherwise, it will be interpreted as a latitude.
            Defaults to True (longitude). Longitudes are wrapped into the range
            0 <= deg < 360 before conversion.

    Returns:
        `deg` formatted as an HMS string.

    Example:
        >>> deg2hms(188.73658333333333)
        '12:34:56.78'
        >>> deg2hms(188.73658333333333, hms_format=True)
        '12h34m56.78s'
        >>> deg2hms(188.73658333333333, precision=1)
        '12:34:56.8'
        >>> deg2hms(188.73658333333333, precision=1, truncate=True)
        '12:34:56.7'
    """
    if longitude:
        # Wrap into [0, 360) *before* converting to hours. deg2dms treats its
        # input as degrees, so an unwrapped negative value (e.g. -15 deg = -1 h)
        # would be wrapped at 360 instead of 24 and rendered as "359:..."
        deg = deg % 360.0

    # use the deg2dms formatter, replace d with h, and cut off the leading ±
    # sign
    # NOTE(review): with longitude=False the sign of a negative value is also
    # dropped by the [1:] slice -- confirm whether callers rely on that.
    return deg2dms(
        deg / 15.0,
        dms_format=hms_format,
        precision=precision,
        truncate=truncate,
        latitude=not longitude,
    ).replace("d", "h")[1:]

dict_merge(dct, merge_dct, add_keys=True)

Recursive dict merge. Inspired by dict.update(), instead of updating only top-level keys, dict_merge recurses down into dicts nested to an arbitrary depth, updating keys. The merge_dct is merged into dct.

This version will return a copy of the dictionary and leave the original arguments untouched.

The optional argument add_keys determines whether keys which are present in merge_dct but not dct should be included in the new dict.

Parameters:

Name Type Description Default
dct dict

onto which the merge is executed

required
merge_dct dict

the dict that is merged into dct

required
add_keys bool

whether to add new keys

True

Returns:

Type Description
Dict[Any, Any]

Updated dict.

Source code in vast_pipeline/utils/utils.py
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
def dict_merge(
    dct: Dict[Any, Any], merge_dct: Dict[Any, Any], add_keys=True
) -> Dict[Any, Any]:
    """Recursively merge `merge_dct` into a copy of `dct`.

    Unlike dict.update(), which only replaces top-level keys, nested dicts
    are merged key by key down to an arbitrary depth. Values from
    `merge_dct` take precedence over those in `dct`.

    A new dictionary is returned; neither argument is modified.

    The optional argument `add_keys` determines whether keys which are
    present in `merge_dct` but not `dct` should be included in the
    new dict.

    Args:
        dct (dict): onto which the merge is executed
        merge_dct (dict): the dict that is merged into `dct`
        add_keys (bool): whether to add new keys

    Returns:
        Updated dict.
    """
    merged = dct.copy()

    for key, incoming in merge_dct.items():
        if key not in merged:
            # a key unknown to dct is only taken over when requested
            if add_keys:
                merged[key] = incoming
            continue

        current = merged[key]
        both_nested = isinstance(current, dict) and isinstance(
            incoming, collections.abc.Mapping
        )
        if both_nested:
            # descend so that sibling keys of the nested dict are kept
            merged[key] = dict_merge(current, incoming, add_keys=add_keys)
        else:
            merged[key] = incoming

    return merged

eq_to_cart(ra, dec)

Find the cartesian co-ordinates on the unit sphere given the eq. co-ords. ra, dec should be in degrees.

Parameters:

Name Type Description Default
ra float

The right ascension coordinate, in degrees, to convert.

required
dec float

The declination coordinate, in degrees, to convert.

required

Returns:

Type Description
float

The cartesian x coordinate.

float

The cartesian y coordinate.

float

The cartesian z coordinate.

Source code in vast_pipeline/utils/utils.py
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
def eq_to_cart(ra: float, dec: float) -> Tuple[float, float, float]:
    """
    Convert equatorial coordinates to cartesian coordinates on the unit
    sphere. Both ra and dec should be in degrees.

    Args:
        ra: The right ascension coordinate, in degrees, to convert.
        dec: The declination coordinate, in degrees, to convert.

    Returns:
        The cartesian x coordinate.
        The cartesian y coordinate.
        The cartesian z coordinate.
    """
    # TODO: This part of the code can probably be removed along with the
    # storage of these coodinates on the image.
    ra_rad = m.radians(ra)
    dec_rad = m.radians(dec)
    # projection of the unit vector onto the equatorial (x-y) plane
    cos_dec = m.cos(dec_rad)

    x = cos_dec * m.cos(ra_rad)
    y = cos_dec * m.sin(ra_rad)
    z = m.sin(dec_rad)

    return (x, y, z)

equ2gal(ra, dec)

Convert equatorial coordinates to galactic

Parameters:

Name Type Description Default
ra float

Right ascension in units of degrees.

required
dec float

Declination in units of degrees.

required

Returns:

Type Description
float

Galactic longitude in degrees.

float

Galactic latitude in degrees.

Source code in vast_pipeline/utils/utils.py
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
def equ2gal(ra: float, dec: float) -> Tuple[float, float]:
    """
    Convert equatorial coordinates to galactic

    Args:
        ra (float): Right ascension in units of degrees.
        dec (float): Declination in units of degrees.

    Returns:
        Galactic longitude in degrees.
        Galactic latitude in degrees.
    """
    # Use the builtin float: `np.float` was only an alias for it and has been
    # removed from NumPy (deprecated in 1.20, removed in 1.24).
    coord = SkyCoord(
        float(ra),
        float(dec),
        unit=(u.deg, u.deg),
        frame="icrs",
    )
    # transform once and read both components from the same galactic coordinate
    galactic = coord.galactic

    return galactic.l.deg, galactic.b.deg

gal2equ(l, b)

Convert galactic coordinates to equatorial.

Parameters:

Name Type Description Default
l float

Galactic longitude in degrees.

required
b float

Galactic latitude in degrees.

required

Returns:

Type Description
float

Right ascension in degrees.

float

Declination in degrees.

Source code in vast_pipeline/utils/utils.py
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
def gal2equ(l: float, b: float) -> Tuple[float, float]:
    """
    Convert galactic coordinates to equatorial.

    Args:
        l (float): Galactic longitude in degrees.
        b (float): Galactic latitude in degrees.

    Returns:
        Right ascension in degrees.
        Declination in degrees.
    """
    # Use the builtin float: `np.float` was only an alias for it and has been
    # removed from NumPy (deprecated in 1.20, removed in 1.24).
    coord = SkyCoord(
        l=float(l) * u.deg,
        b=float(b) * u.deg,
        frame="galactic",
    )
    # transform once and read both components from the same ICRS coordinate
    icrs = coord.icrs

    return icrs.ra.deg, icrs.dec.deg

optimize_floats(df)

Downcast float columns in a pd.DataFrame to the smallest data type without losing any information.

Credit to Robbert van der Gugten.

Parameters:

Name Type Description Default
df DataFrame

input dataframe, no specific columns.

required

Returns:

Type Description
DataFrame

The input dataframe with the float64 type columns downcasted.

Source code in vast_pipeline/utils/utils.py
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
def optimize_floats(df: pd.DataFrame) -> pd.DataFrame:
    """
    Downcast the `float64` columns of a pd.DataFrame using
    `pd.to_numeric(..., downcast="float")`. The dataframe is modified in
    place and also returned.

    Credit to Robbert van der Gugten.

    Args:
        df:
            input dataframe, no specific columns.

    Returns:
        The input dataframe with the `float64` type columns downcasted.
    """
    float_cols = df.select_dtypes(include=["float64"]).columns.tolist()
    # each selected column is downcast independently by pd.to_numeric
    downcasted = df[float_cols].apply(pd.to_numeric, downcast="float")
    df[float_cols] = downcasted

    return df

optimize_ints(df)

Downcast integer columns in a pd.DataFrame to the smallest data type without losing any information.

Credit to Robbert van der Gugten.

Parameters:

Name Type Description Default
df DataFrame

Input dataframe, no specific columns.

required

Returns:

Type Description
DataFrame

The input dataframe with the int64 type columns downcasted.

Source code in vast_pipeline/utils/utils.py
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
def optimize_ints(df: pd.DataFrame) -> pd.DataFrame:
    """
    Downcast the `int64` columns of a pd.DataFrame using
    `pd.to_numeric(..., downcast="integer")`. The dataframe is modified in
    place and also returned.

    Credit to Robbert van der Gugten.

    Args:
        df:
            Input dataframe, no specific columns.

    Returns:
        The input dataframe with the `int64` type columns downcasted.
    """
    int_cols = df.select_dtypes(include=["int64"]).columns.tolist()
    # each selected column is downcast independently by pd.to_numeric
    downcasted = df[int_cols].apply(pd.to_numeric, downcast="integer")
    df[int_cols] = downcasted

    return df

parse_coord(coord_string, coord_frame='icrs')

Parse a coordinate string and return a SkyCoord. The units may be expressed within coord_string, e.g. "21h52m03.1s -62d08m19.7s" or "18.4d +43.1d". If no units are given, the following assumptions are made:

- If both coordinate components are decimals, they are assumed to be in degrees.
- If a sexagesimal coordinate is given and the frame is galactic, both components are assumed to be in degrees. For any other frame, the first component is assumed to be in hourangles and the second in degrees.

Will raise a ValueError if SkyCoord is unable to parse coord_string.

Parameters:

Name Type Description Default
coord_string str

The coordinate string to parse.

required
coord_frame str

The frame of coord_string. Defaults to "icrs".

'icrs'

Returns:

Type Description
SkyCoord

The SkyCoord object.

Source code in vast_pipeline/utils/utils.py
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
def parse_coord(coord_string: str, coord_frame: str = "icrs") -> SkyCoord:
    """Parse a coordinate string and return a SkyCoord. The units may be expressed within
    `coord_string` e.g. "21h52m03.1s -62d08m19.7s", "18.4d +43.1d". If no units are given,
    the following assumptions are made:
        - if both coordinate components are decimals, they are assumed to be in degrees.
        - if a sexagesimal coordinate is given and the frame is galactic, both components
            are assumed to be in degrees. For any other frame, the first component is
            assumed to be in hourangles and the second in degrees.
    Will raise a ValueError if SkyCoord is unable to parse `coord_string`.

    Args:
        coord_string (str): The coordinate string to parse.
        coord_frame (str, optional): The frame of `coord_string`. Defaults to "icrs".

    Returns:
        The SkyCoord object.
    """
    # Work out the fallback unit to hand to SkyCoord. Note that the unit parameter is
    # ignored if the units are not ambiguous i.e. if coord_string contains the units
    # (e.g. 18.4d, 5h35m, etc)
    try:
        for component in coord_string.split():
            float(component)
    except ValueError:
        # at least one component is not a plain decimal: only the galactic frame
        # uses degrees for both components, otherwise assume hourangles and degrees
        unit = "deg" if coord_frame == "galactic" else "hourangle,deg"
    else:
        # every component is a plain decimal number, so assume degrees
        unit = "deg"

    return SkyCoord(coord_string, unit=unit, frame=coord_frame)