
# ======================================================================================
# Copyright (©) 2015-2025 LCS - Laboratoire Catalyse et Spectrochimie, Caen, France.
# CeCILL-B FREE SOFTWARE LICENSE AGREEMENT
# See full LICENSE agreement in the root directory.
# ======================================================================================
"""Module implementing the `NDDataset` class."""

__all__ = ["NDDataset"]
# import signal
import sys
import textwrap
from contextlib import suppress
from datetime import datetime
from datetime import tzinfo
from zoneinfo import ZoneInfo
from zoneinfo import ZoneInfoNotFoundError

import numpy as np
import traitlets as tr
from tzlocal import get_localzone

from spectrochempy.application import error_
from spectrochempy.application import warning_
from spectrochempy.core.dataset.arraymixins.ndio import NDIO
from spectrochempy.core.dataset.arraymixins.ndmath import NDMath  # _set_ufuncs,
from spectrochempy.core.dataset.arraymixins.ndmath import _set_operators
from spectrochempy.core.dataset.arraymixins.ndplot import NDPlot
from spectrochempy.core.dataset.baseobjects.ndarray import DEFAULT_DIM_NAME
from spectrochempy.core.dataset.baseobjects.ndarray import NDArray
from spectrochempy.core.dataset.baseobjects.ndcomplex import NDComplexArray
from spectrochempy.core.dataset.coord import Coord
from spectrochempy.core.dataset.coordset import CoordSet
from spectrochempy.extern.traittypes import Array
from spectrochempy.utils.datetimeutils import utcnow
from spectrochempy.utils.exceptions import SpectroChemPyError
from spectrochempy.utils.optional import import_optional_dependency
from spectrochempy.utils.print import colored_output
from spectrochempy.utils.system import get_user_and_node


# ======================================================================================
# NDDataset class definition
# ======================================================================================
@tr.signature_has_traits
class NDDataset(NDMath, NDIO, NDPlot, NDComplexArray):
    r"""
    The main N-dimensional dataset class used by `SpectroChemPy`.

    The `NDDataset` is the main object used by SpectroChemPy. Like numpy
    `~numpy.ndarray`\ 's, `NDDataset` objects can be sliced, sorted and subjected
    to mathematical operations. In addition, a `NDDataset` may have units, can be
    masked, and each dimension can have coordinates, themselves with units. This
    makes `NDDataset` aware of unit compatibility, *e.g.,* for binary operations
    such as addition or subtraction, or during the application of mathematical
    operations. In addition to, or in replacement of, numerical data for
    coordinates, a `NDDataset` can also have labeled coordinates, where the labels
    can be objects of different kinds (`str`, `datetime`, `~numpy.ndarray` or
    other `NDDataset`'s, etc...).

    Parameters
    ----------
    data : :term:`array-like`
        Data array contained in the object. The data can be a list, a tuple, a
        `~numpy.ndarray`, a subclass of `~numpy.ndarray`, another `NDDataset` or a
        `Coord` object. Any size or shape of data is accepted. If not given, an
        empty `NDDataset` will be initialized. At initialisation the provided data
        will eventually be cast to a `~numpy.ndarray`. If the provided object
        already contains some mask or units, these elements will be used if
        possible to set those of the created object accordingly. If possible, the
        provided data will not be copied for the `data` input, but will be passed
        by reference, so you should make a copy of the `data` before passing it if
        that is the desired behavior, or set the `copy` argument to `True`.
    coordset : `CoordSet` instance, optional
        It contains the coordinates for the different dimensions of the `data`.
        If a `CoordSet` is provided, it must specify the `coord` and `labels` for
        all dimensions of the `data`. Multiple `coord`'s can be specified in a
        `CoordSet` instance for each dimension.
    coordunits : `list`, optional, default: `None`
        A list of units corresponding to the dimensions in the order of the
        coordset.
    coordtitles : `list`, optional, default: `None`
        A list of titles corresponding to the dimensions in the order of the
        coordset.
    **kwargs
        Optional keyword parameters (see Other Parameters).

    Other Parameters
    ----------------
    dtype : `str` or `~numpy.dtype`, optional, default: `np.float64`
        If specified, the data will be cast to this dtype, else the data will be
        cast to float64 or complex128.
    dims : `list` of `str`, optional
        If specified, the list must have a length equal to the number of data
        dimensions (`ndim`) and the elements must be taken among
        ``x, y, z, u, v, w or t``. If not specified, the dimension names are
        automatically attributed in this order.
    name : `str`, optional
        A user-friendly name for this object. If not given, the automatic `id`
        given at the object creation will be used as a name.
    labels : :term:`array-like` of objects, optional
        Labels for the `data`. Labels can be used only for 1D-datasets. The labels
        array may have an additional dimension, meaning several series of labels
        for the same data. The given array can be a list, a tuple, a
        `~numpy.ndarray`, a ndarray-like, a `NDArray` or any subclass of
        `NDArray`.
    mask : :term:`array-like` of `bool` or `NOMASK`, optional
        Mask for the data. The mask array must have the same shape as the data.
        The given array can be a list, a tuple, or a `~numpy.ndarray`. Each value
        in the array must be `False` where the data is *valid* and `True` where it
        is not (as in numpy masked arrays).
        If `data` is already a
        :class:`~numpy.ma.MaskedArray`, or any array object (such as a `NDArray`
        or subclass of it), providing a `mask` here will cause the mask from the
        masked array to be ignored.
    units : `Unit` instance or `str`, optional
        Units of the data. If data is a `Quantity` then `units` is set to the
        unit of the `data`; if a unit is also explicitly provided an error is
        raised. Handling of units uses the
        `pint <https://pint.readthedocs.org/>`__ package.
    timezone : `datetime.tzinfo`, optional
        The timezone where the data were created. If not specified, the local
        timezone is assumed.
    title : `str`, optional
        The title of the data dimension. The `title` attribute should not be
        confused with the `name`. The `title` attribute is used for instance for
        labelling plots of the data. It is optional but recommended to give a
        title to each ndarray data.
    dlabel : `str`, optional
        Alias of `title`.
    meta : `dict`-like object, optional
        Additional metadata for this object. Must be dict-like but no further
        restriction is placed on meta.
    author : `str`, optional
        Name(s) of the author(s) of this dataset. By default, the name of the
        computer node where this dataset is created.
    description : `str`, optional
        An optional description of the nd-dataset. A shorter alias is `desc`.
    origin : `str`, optional
        Origin of the data: name of organization, address, telephone number, name
        of individual contributor, etc., as appropriate.
    roi : `list`
        Region of interest (ROI) limits.
    history : `str`, optional
        A string to add to the object history.
    copy : `bool`, optional
        Perform a copy of the passed object. Default is False.

    See Also
    --------
    Coord : Explicit coordinates object.
    CoordSet : Set of coordinates.

    Notes
    -----
    The underlying array in a `NDDataset` object can be accessed through the
    `data` attribute, which will return a conventional `~numpy.ndarray`.
    """

    # Examples
    # --------
    # Usage by an end-user
    #
    # >>> x = scp.NDDataset([1, 2, 3])
    # >>> print(x.data)  # doctest: +NORMALIZE_WHITESPACE
    # [ 1 2 3.]
    # """

    # coordinates
    _coordset = tr.Instance(CoordSet, allow_none=True)

    # model data (e.g., for fit)
    _modeldata = Array(tr.Float(), allow_none=True)

    # some settings for NDDataset
    _copy = tr.Bool(False)
    _labels_allowed = tr.Bool(False)  # no labels for NDDataset

    # datasets can be members of a project
    _parent = tr.Instance(
        "spectrochempy.core.project.abstractproject.AbstractProject",
        allow_none=True,
    )

    # For the GUI interface

    # parameters state
    # _state = Dict()

    # processed data (for GUI)
    # _processeddata = Array(Float(), allow_none=True)

    # processed mask (for GUI)
    # _processedmask = Union((Bool(), Array(Bool()), Instance(MaskedConstant)))

    # baseline data (for GUI)
    # _baselinedata = Array(Float(), allow_none=True)

    # reference data (for GUI)
    # _referencedata = Array(Float(), allow_none=True)

    # ranges
    # _ranges = Instance(Meta)

    # history
    _history = tr.List(tr.Tuple(), allow_none=True)

    # Dates
    _acquisition_date = tr.Instance(datetime, allow_none=True)
    _created = tr.Instance(datetime)
    _modified = tr.Instance(datetime)
    _timezone = tr.Instance(tzinfo, allow_none=True)

    # Metadata
    _author = tr.Unicode()
    _description = tr.Unicode()
    _origin = tr.Unicode()

    # ----------------------------------------------------------------------------------
    # Initialisation
    # ----------------------------------------------------------------------------------
    def __init__(
        self,
        data=None,
        coordset=None,
        coordunits=None,
        coordtitles=None,
        **kwargs,
    ):
        super().__init__(data, **kwargs)

        self._created = utcnow()
        self.description = kwargs.pop("description", "")
        self.author = kwargs.pop("author", get_user_and_node())

        history = kwargs.pop("history", None)
        if history is not None:
            self.history = history

        self._parent = None

        # eventually set the coordinates with optional units and titles
        if isinstance(coordset, CoordSet):
            self.set_coordset(**coordset)
        else:
            if coordset is None:
                coordset = [None] * self.ndim
            if coordunits is None:
                coordunits = [None] * self.ndim
            if coordtitles is None:
                coordtitles = [None] * self.ndim

            _coordset = []
            for c, u, t in zip(coordset, coordunits, coordtitles, strict=False):
                if not isinstance(c, CoordSet):
                    coord = Coord(c)
                    if u is not None:
                        coord.units = u
                    if t is not None:
                        coord.title = t
                else:
                    if u:  # pragma: no cover
                        warning_(
                            "units have been set for a CoordSet, "
                            "but this will be ignored "
                            "(units are only defined at the coordinate level)",
                        )
                    if t:  # pragma: no cover
                        warning_(
                            "titles will be ignored as they are only defined at "
                            "the coordinate level",
                        )
                    coord = c
                _coordset.append(coord)

            if _coordset and set(_coordset) != {Coord()}:
                # if there are no coordinates, do nothing
                self.set_coordset(*_coordset)

        self._modified = self._created

    # ----------------------------------------------------------------------------------
    # Special methods
    # ----------------------------------------------------------------------------------
    def __dir__(self):
        # Only these attributes are used for saving datasets.
        # WARNING: be careful to keep the present order of the three first elements!
        # Needed for save/load operations
        return [
            # Keep the following order
            "dims",
            "coordset",
            "data",
            # From here it is free
            "name",
            "title",
            "mask",
            "units",
            "meta",
            "preferences",
            "author",
            "description",
            "history",
            "created",
            "modified",
            # "acquisition_date",
            "origin",
            "roi",
            "transposed",
            "modeldata",
            # "processeddata",
            # "referencedata",
            # "baselinedata",
            # "state",
            # "ranges",
        ] + NDIO().__dir__()

    def __getitem__(self, items, **kwargs):
        saveditems = items

        # coordinate selection to test first
        if isinstance(items, str):
            with suppress(Exception):
                return self._coordset[items]

        # slicing
        new, items = super().__getitem__(items, return_index=True)

        if new is None:
            return None

        if self._coordset is not None:
            names = self._coordset.names  # all names of the current coordinates
            new_coords = self._coordset.copy()
            if isinstance(items, np.ndarray):
                # probably a fancy indexing
                items = (items,)
            for i, item in enumerate(items):
                # get the corresponding dimension name in the dims list
                name = self.dims[i]
                # get the corresponding index in the coordinate's names list
                idx = names.index(name)
                if self._coordset[idx].is_empty:
                    new_coords[idx] = Coord(None, name=name)
                elif not isinstance(self._coordset[idx], CoordSet):
                    new_coords[idx] = self._coordset[idx][item]
                else:
                    # we must slice all internal coordinates
                    newc = []
                    for c in self._coordset[idx]._coords:
                        newc.append(c[item])
                    # we reverse to be sure the order will be kept for
                    # internal coordinates
                    new_coords[idx] = CoordSet(*newc[::-1], name=name)
                    # set the same default coord
                    new_coords[idx]._default = self._coordset[idx]._default
                    new_coords[idx]._is_same_dim = self._coordset[idx]._is_same_dim

            new.set_coordset(*new_coords, keepnames=True)

        new.history = f"Slice extracted: ({saveditems})"
        return new

    def __getattr__(self, item):
        # when the attribute was not found
        if (
            item.startswith("_")
            or item in ["interface", "clevels", "coords"]
            or "_validate" in item
            or "_changed" in item
        ):
            # raise an error so that traits, ipython operations and more ...
            # will be handled correctly
            raise AttributeError

        # syntax such as ds.x, ds.y, etc...
        if item[0] in self.dims or self._coordset:
            # look also at properties
            attribute = None
            index = 0
            if len(item) > 2 and item[1] == "_":
                attribute = item[1:]
                item = item[0]
                index = self.dims.index(item)

            if self._coordset:
                try:
                    c = self._coordset[item]
                    if isinstance(c, str) and c in self.dims:
                        # probably a reference to another coordinate name
                        c = self._coordset[c]
                    if c.name in self.dims or c._parent_dim in self.dims:
                        if attribute is not None:
                            # get the attribute
                            return getattr(c, attribute)
                        return c
                    raise AttributeError
                except Exception as err:
                    if item in self.dims:
                        return None
                    raise err
            elif attribute is not None:
                if attribute == "size":
                    # we want the size, but there are no coords:
                    # get it from the data shape
                    return self.shape[index]
                raise AttributeError(
                    f"Can not find `{attribute}` when no coordinate is defined",
                )

            return None

        raise AttributeError

    def __setattr__(self, key, value):
        # TODO: entering this function in debug stepping mode kills the program
        # need to investigate further why!
        if key in DEFAULT_DIM_NAME:  # syntax such as ds.x, ds.y, etc...
            # Note: the above test is important to avoid errors with traitlets,
            # even if it looks redundant with the following
            if key in self.dims:
                if self._coordset is None:
                    # we need to create a coordset first
                    self.set_coordset(
                        {self.dims[i]: None for i in range(self.ndim)},
                    )
                idx = self._coordset.names.index(key)
                _coordset = self._coordset
                listcoord = False
                if isinstance(value, list):
                    listcoord = all(isinstance(item, Coord) for item in value)
                if listcoord:
                    _coordset[idx] = list(CoordSet(value).to_dict().values())[0]
                    _coordset[idx].name = key
                    _coordset[idx]._is_same_dim = True
                elif isinstance(value, CoordSet):
                    if len(value) > 1:
                        value = CoordSet(value)
                    _coordset[idx] = list(value.to_dict().values())[0]
                    _coordset[idx].name = key
                    _coordset[idx]._is_same_dim = True
                elif isinstance(value, Coord):
                    value.name = key
                    _coordset[idx] = value
                else:
                    _coordset[idx] = Coord(value, name=key)
                _coordset = self._valid_coordset(_coordset)
                self._coordset.set(_coordset)
            else:
                raise AttributeError(f"Coordinate `{key}` is not used.")
        else:
            super().__setattr__(key, value)

    def __eq__(self, other, attrs=None):
        attrs = self.__dir__()
        for attr in (
            "filename",
            "preferences",
            "name",
            "author",
            "description",
            "history",
            "created",
            "modified",
            "origin",
            "roi",
            "modeldata",
            "processeddata",
            "baselinedata",
            "referencedata",
            "state",
            "ranges",
        ):
            # these attributes are not used for comparison
            # (comparison is based on data and units!)
            with suppress(ValueError):
                attrs.remove(attr)
        return super().__eq__(other, attrs)

    def __hash__(self):
        # all instances of this class share the same base hash,
        # so they can be compared
        return super().__hash__() + hash(self._coordset)

    # ----------------------------------------------------------------------------------
    # Private methods and properties
    # ----------------------------------------------------------------------------------
    @tr.default("_coordset")
    def _coordset_default(self):
        return None

    @tr.default("_modeldata")
    def _modeldata_default(self):
        return None

    # @tr.default("_processeddata")
    # def _processeddata_default(self):
    #     return None

    # @tr.default("_baselinedata")
    # def _baselinedata_default(self):
    #     return None

    # @tr.default("_referencedata")
    # def _referencedata_default(self):
    #     return None

    # @tr.default("_ranges")
    # def _ranges_default(self):
    #     ranges = Meta()
    #     for dim in self.dims:
    #         ranges[dim] = dict(masks={}, baselines={}, integrals={}, others={})
    #     return ranges

    @tr.default("_timezone")
    def _timezone_default(self):
        # Return the default timezone (local timezone)
        return get_localzone()

    # @tr.validate("_created")
    # def _created_validate(self, proposal):
    #     date = proposal["value"]
    #     if date.tzinfo is not None:
    #         # make the date utc naive
    #         date = date.replace(tzinfo=None)
    #     return date

    @tr.validate("_history")
    def _history_validate(self, proposal):
        history = proposal["value"]
        if isinstance(history, list) or history is None:
            # reset
            self._history = None
        return history

    # @tr.validate("_modified")
    # def _modified_validate(self, proposal):
    #     date = proposal["value"]
    #     if date.tzinfo is not None:
    #         # make the date utc naive
    #         date = date.replace(tzinfo=None)
    #     return date

    @tr.observe(tr.All)
    def _anytrait_changed(self, change):
        # ex: change = {
        #     'owner': object,  # The HasTraits instance
        #     'new': 6,         # The new value
        #     'old': 5,         # The old value
        #     'name': "foo",    # The name of the changed trait
        #     'type': 'change', # The event type, usually 'change'
        # }
        if change["name"] in ["_created", "_modified", "trait_added"]:
            return

        # any other change -> update the modified date
        self._modified = utcnow()
    def _cstr(self):
        # Display the metadata of the object and partially the data
        out = ""
        out += f"         name: {self.name}\n"
        out += f"       author: {self.author}\n"
        out += f"      created: {self.created}\n"
        out += (
            f"     modified: {self.modified}\n"
            if (self._modified - self._created).seconds > 30
            else ""
        )

        wrapper1 = textwrap.TextWrapper(
            initial_indent="",
            subsequent_indent=" " * 15,
            replace_whitespace=True,
            width=self._text_width,
        )

        pars = self.description.strip().splitlines()
        if pars:
            out += "  description: "
            desc = ""
            if pars:
                desc += f"{wrapper1.fill(pars[0])}\n"
            for par in pars[1:]:
                desc += "{}\n".format(textwrap.indent(par, " " * 15))
            # the three escaped null characters are here to facilitate
            # the generation of html outputs
            desc = f"\0\0\0{desc.rstrip()}\0\0\0\n"
            out += desc

        if self._history:
            pars = self.history
            out += "      history: "
            hist = ""
            if pars:
                hist += f"{wrapper1.fill(pars[0])}\n"
            for par in pars[1:]:
                hist += "{}\n".format(textwrap.indent(par, " " * 15))
            # the three escaped null characters are here to facilitate
            # the generation of html outputs
            hist = f"\0\0\0{hist.rstrip()}\0\0\0\n"
            out += hist

        out += f"{self._str_value().rstrip()}\n"
        out += f"{self._str_shape().rstrip()}\n" if self._str_shape() else ""
        out += f"{self._str_dims().rstrip()}\n"

        if not out.endswith("\n"):
            out += "\n"
        out += "\n"

        if not self._html_output:
            return colored_output(out.rstrip())
        return out.rstrip()

    def _loc2index(self, loc, dim=-1, *, units=None):
        # Return the index of a location (label or coordinate) along the dim.
        # This can work only if `coords` exists.
        if self._coordset is None:
            raise SpectroChemPyError(
                "No coords have been defined. Slicing or selection"
                f" by location ({loc}) needs coords definition.",
            )
        coord = self.coord(dim)
        return coord._loc2index(loc, units=units)

    def _str_dims(self):
        if self.is_empty:
            return ""
        if len(self.dims) < 1 or not hasattr(self, "_coordset"):
            return ""
        if not self._coordset or len(self._coordset) < 1:
            return ""

        # transfer the html flag if necessary (false by default)
        self._coordset._html_output = self._html_output

        txt = self._coordset._cstr()
        return txt.rstrip()  # remove the trailing '\n'

    _repr_dims = _str_dims

    def _dims_update(self, change=None):
        # when notified that the coord names have been updated
        _ = self.dims  # fire an update

    @tr.validate("_coordset")
    def _coordset_validate(self, proposal):
        coords = proposal["value"]
        return self._valid_coordset(coords)

    def _valid_coordset(self, coords):
        # used in _coordset_validate and __setattr__
        if coords is None:
            return None

        for k, coord in enumerate(coords):
            if (
                coord is not None
                and not isinstance(coord, CoordSet)
                and coord.data is None
            ):
                continue

            # for coord to be acceptable, we require at least a NDArray,
            # a NDArray subclass or a CoordSet
            if not isinstance(coord, Coord | CoordSet):
                if isinstance(coord, NDArray):
                    coord = coords[k] = Coord(coord)
                else:
                    raise TypeError(
                        "Coordinates must be an instance or a subclass of the"
                        " Coord class or NDArray, or of the CoordSet class, but"
                        f" an instance of {type(coord)} has been passed",
                    )

            if self.dims and coord.name in self.dims:
                # check the validity of the given coordinates in terms of size
                # (if it corresponds to one of the dims)
                size = coord.size

                if self._implements("NDDataset"):
                    idx = self._get_dims_index(coord.name)[0]  # idx in self.dims
                    if size != self._data.shape[idx]:
                        raise ValueError(
                            f"the size of a coordinates array must be None or"
                            f" equal to that of the respective `{coord.name}`"
                            f" data dimension, but coordinate size={size} !="
                            f" data shape[{idx}]={self._data.shape[idx]}",
                        )
                else:
                    # bypass this check for any other derived type
                    # (it should be done in the subclass)
                    pass

        coords._parent = self
        return coords

    @property
    def _dict_dims(self):
        _dict = {}
        for index, dim in enumerate(self.dims):
            if dim not in _dict:
                _dict[dim] = {
                    "size": self.shape[index],
                    "coord": getattr(self, dim),
                }
        return _dict

    # ----------------------------------------------------------------------------------
    # Public methods and properties
    # ----------------------------------------------------------------------------------
    @property
    def acquisition_date(self):
        """Acquisition date."""
        if self._acquisition_date is not None:
            # take the one which has been previously set for this dataset
            acq = self._acquisition_date.astimezone(self._timezone)
            return acq.isoformat(sep=" ", timespec="seconds")
        return None

    @acquisition_date.setter
    def acquisition_date(self, value):
        self._acquisition_date = value
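    # The following doctest-style sketch (added for illustration; not part of the
    # original source) shows the typical construction described in the class
    # docstring. The numeric values, units and titles are made up.
    #
    # >>> import numpy as np
    # >>> import spectrochempy as scp
    # >>> nd = scp.NDDataset(np.random.rand(2, 3), title="intensity")
    # >>> nd.set_coordset(
    # ...     y=scp.Coord([0.0, 60.0], units="s", title="time"),
    # ...     x=scp.Coord([4000.0, 3000.0, 2000.0], units="cm^-1", title="wavenumbers"),
    # ... )
    # >>> nd.x.units  # coordinates carry their own units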
    def add_coordset(self, *coords, dims=None, **kwargs):
        """
        Add one or a set of coordinates to a dataset.

        Parameters
        ----------
        *coords : iterable
            Coordinates object(s).
        dims : list
            Names of the coordinates.
        **kwargs
            Optional keyword parameters passed to the coordset.
        """
        if not coords and not kwargs:
            # reset coordinates
            self._coordset = None
            return

        if self._coordset is None:
            # make the whole coordset at once
            self._coordset = CoordSet(*coords, dims=dims, **kwargs)
        else:
            # add one coordinate
            self._coordset._append(*coords, **kwargs)

        if self._coordset:
            # set a notifier on the updated traits of the CoordSet instance
            tr.HasTraits.observe(self._coordset, self._dims_update, "_updated")
            # force it one time after this initialization
            self._coordset._updated = True
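    # Hedged sketch (added; not in the original source), based on the body above:
    # calling the method with no arguments resets the coordinates, while passing
    # coordinates when none exist builds the whole coordset at once.
    #
    # >>> nd.add_coordset()  # resets: nd.coordset is now None
    # >>> nd.add_coordset(scp.Coord(np.arange(2.0)), scp.Coord(np.arange(3.0)),
    # ...                 dims=["y", "x"])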
    @property
    def author(self):
        """Creator of the dataset (str)."""
        return self._author

    @author.setter
    def author(self, value):
        self._author = value

    @property
    def history(self):
        """Describes the history of actions made on this array (list of strings)."""
        history = []
        for date, value in self._history:
            date = date.astimezone(self._timezone).isoformat(
                sep=" ",
                timespec="seconds",
            )
            value = value[0].capitalize() + value[1:]
            history.append(f"{date}> {value}")
        return history

    @history.setter
    def history(self, value):
        if value is None:
            return
        if isinstance(value, list):
            # history will be replaced
            self._history = []
            if len(value) == 0:
                return
            value = value[0]
        date = utcnow()
        self._history.append((date, value))
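    # Illustrative sketch (added; an assumption based on the setter above):
    # assigning a string appends a timestamped entry, while assigning a list
    # replaces the history (an empty list clears it).
    #
    # >>> nd.history = "baseline corrected"  # appended with a UTC timestamp
    # >>> nd.history = []                    # history is reset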
    def coord(self, dim="x"):
        """
        Return the coordinates along the given dimension.

        Parameters
        ----------
        dim : int or str
            A dimension index or name, default index = `x`. If an integer is
            provided, it is equivalent to the `axis` parameter for numpy arrays.

        Returns
        -------
        `Coord`
            Coordinates along the given axis.
        """
        idx = self._get_dims_index(dim)[0]  # should generate an error if the
        # dimension name is not recognized
        if idx is None:
            return None

        if self._coordset is None:
            return None

        # idx is not necessarily the position of the coordinates in the CoordSet:
        # transposition may have taken place. So we need to retrieve the
        # coordinates by name.
        name = self.dims[idx]
        if name in self._coordset.names:
            idx = self._coordset.names.index(name)
            return self._coordset[idx]

        error_(f"could not find this dimension name: `{name}`")
        return None
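    # Illustration (added; not in the original source): `coord` accepts a
    # dimension name or index, and is what the attribute syntax `nd.x` resolves
    # to through `__getattr__` above.
    #
    # >>> nd.coord("x") is nd.x  # both resolve to the same Coord object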
    @property
    def coordset(self):
        """
        `CoordSet` instance.

        Contains the coordinates of the various dimensions of the dataset.
        It's a read-only property. Use `set_coordset` to change one or more
        coordinates at once.
        """
        if self._coordset and all(c.is_empty for c in self._coordset):
            # all coordinates are empty: this is equivalent to None for the coordset
            return None
        return self._coordset

    @coordset.setter
    def coordset(self, coords):
        if isinstance(coords, CoordSet):
            self.set_coordset(**coords)
        else:
            self.set_coordset(coords)

    @property
    def coordnames(self):
        """
        List of the `Coord` names.

        Read-only property.
        """
        if self._coordset is not None:
            return self._coordset.names
        return None

    @property
    def coordtitles(self):
        """
        List of the `Coord` titles.

        Read-only property. Use `set_coordtitles` to eventually set titles.
        """
        if self._coordset is not None:
            return self._coordset.titles
        return None

    @property
    def coordunits(self):
        """
        List of the `Coord` units.

        Read-only property. Use `set_coordunits` to eventually set units.
        """
        if self._coordset is not None:
            return self._coordset.units
        return None

    @property
    def created(self):
        """Creation date object (Datetime)."""
        created = self._created.astimezone(self._timezone)
        return created.isoformat(sep=" ", timespec="seconds")

    @property
    def data(self):
        """
        The `data` array.

        If there is no data but labels, then the labels are returned instead of data.
        """
        return super().data

    @data.setter
    def data(self, data):
        # as we can't write super().data = data, we call _set_data
        # (see comment in the data.setter of NDArray)
        super()._set_data(data)
    def delete_coordset(self):
        """Delete all coordinate settings."""
        self._coordset = None
    @property
    def description(self):
        """Provides a description of the underlying data (str)."""
        return self._description

    comment = description
    comment.__doc__ = """Provides a comment (Alias to the description attribute)."""

    @description.setter
    def description(self, value):
        self._description = value

    @property
    def local_timezone(self):
        """Return the local timezone."""
        return str(get_localzone())

    @property
    def modeldata(self):
        """
        `~numpy.ndarray` - model data.

        Data eventually generated by modelling of the data.
        """
        return self._modeldata

    @modeldata.setter
    def modeldata(self, data):
        self._modeldata = data

    @property
    def modified(self):
        """
        Date of modification (readonly property).

        Returns
        -------
        str
            Date of modification in isoformat.
        """
        modified = self._modified.astimezone(self._timezone)
        return modified.isoformat(sep=" ", timespec="seconds")

    @property
    def origin(self):
        """
        Origin of the data.

        e.g. spectrometer or software
        """
        return self._origin

    @origin.setter
    def origin(self, value):
        self._origin = value

    @property
    def parent(self):
        """
        `Project` instance.

        The parent project of the dataset.
        """
        return self._parent

    @parent.setter
    def parent(self, value):
        if self._parent is not None:
            # A parent project already exists for this dataset, but the entered
            # value gives a different parent. This is not allowed, as it can
            # produce unpredictable results. We will first remove the dataset
            # from the current project.
            self._parent.remove_dataset(self.name)
        self._parent = value
    def set_coordset(self, *args, **kwargs):
        """
        Set one or more coordinates at once.

        Parameters
        ----------
        *args : `Coord` or `CoordSet`
            One or more coordinates.
        **kwargs
            Optional keyword parameters passed to the coordset.

        Warnings
        --------
        This method replaces all existing coordinates.

        See Also
        --------
        add_coordset : Add one or a set of coordinates to a dataset.
        set_coordtitles : Set titles of one or more coordinates.
        set_coordunits : Set units of one or more coordinates.
        """
        self._coordset = None
        self.add_coordset(*args, dims=self.dims, **kwargs)
    def set_coordtitles(self, *args, **kwargs):
        """Set titles of one or more coordinates."""
        self._coordset.set_titles(*args, **kwargs)
    def set_coordunits(self, *args, **kwargs):
        """Set units of one or more coordinates."""
        self._coordset.set_units(*args, **kwargs)
    def sort(self, **kwargs):
        """
        Return the dataset sorted along a given dimension.

        By default, the last dimension [axis=-1] is used, sorting on the numeric
        or label values.

        Parameters
        ----------
        dim : str or int, optional, default=-1
            Dimension index or name along which to sort.
        pos : int, optional
            If labels are multidimensional, allows sorting on a given row of
            labels: labels[pos]. Experimental: not yet checked.
        by : str among ['value', 'label'], optional, default=`value`
            Indicates whether the sorting follows the order of labels or of
            numeric coord values.
        descend : `bool`, optional, default=`False`
            If True the dataset is sorted in descending order. Default is False,
            except if the coordinates are reversed.
        inplace : bool, optional, default=`False`
            Flag to say that the method returns a new object (default) or not
            (inplace=True).

        Returns
        -------
        `NDDataset`
            Sorted dataset.
        """
        inplace = kwargs.get("inplace", False)
        new = self.copy() if not inplace else self

        # parameter for selecting the level of labels (default None or 0)
        pos = kwargs.pop("pos", None)

        # parameter to say if selection is done by values or by labels
        by = kwargs.pop("by", "value")

        # determine which axis is sorted (dims or axis can be passed in kwargs);
        # this returns a tuple with axis and dim
        axis, dim = self.get_axis(**kwargs)
        if axis is None:
            axis, dim = self.get_axis(axis=0)

        # get the corresponding coordinates (remember that their order can differ
        # from the order of dimensions in dims, so we cannot just take the coord
        # from the index)
        multi = getattr(self, dim)  # get the coordinate using syntax such as self.x

        # sort on the default coordinate (in case we have multicoordinates)
        coord = multi.default

        descend = kwargs.pop("descend", None)
        if descend is None:
            # when not specified, default is False (except for reversed coordinates)
            descend = coord.reversed

        indexes = []
        for i in range(self.ndim):
            if i == axis:
                if not coord.has_data:
                    # sometimes we have only labels for Coord objects.
                    # in this case, we sort the labels if they exist!
                    if coord.is_labeled:
                        by = "label"
                    else:
                        # nothing to do for sorting:
                        # return the unchanged dataset
                        return new

                args = coord._argsort(by=by, pos=pos, descend=descend)
                setattr(new, dim, multi[args])
                # sort all coordinates in case of multicoordinates
                indexes.append(args)
            else:
                indexes.append(slice(None))

        new._data = new._data[tuple(indexes)]
        if new.is_masked:
            new._mask = new._mask[tuple(indexes)]

        return new
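    # Hedged example (added; not in the original source): sorting along `x` in
    # ascending order reorders both the coordinate and the data columns.
    #
    # >>> nd = scp.NDDataset([[1.0, 2.0], [3.0, 4.0]])
    # >>> nd.x = scp.Coord([20.0, 10.0])  # deliberately unsorted
    # >>> sorted_nd = nd.sort(dim="x")    # ascending by default
    # >>> sorted_nd.x.data                # now array([10., 20.])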
    def squeeze(self, *dims, inplace=False):
        """
        Remove single-dimensional entries from the shape of a NDDataset.

        Parameters
        ----------
        *dims : None or int or tuple of ints, optional
            Selects a subset of the single-dimensional entries in the shape.
            If a dimension (dim) is selected with a shape entry greater than one,
            an error is raised.
        inplace : bool, optional, default=`False`
            Flag to say that the method returns a new object (default) or not
            (inplace=True).

        Returns
        -------
        `NDDataset`
            The input array, but with all or a subset of the dimensions of
            length 1 removed.

        Raises
        ------
        ValueError
            If `dim` is not `None`, and the dimension being squeezed is not of
            length 1.
        """
        # make a copy of the original dims
        old = self.dims[:]

        # squeeze the data and determine which axis must be squeezed
        new, axis = super().squeeze(*dims, inplace=inplace, return_axis=True)

        if axis is not None and new._coordset is not None:
            # if there are coordinates, they have to be squeezed as well (remove
            # the coordinate for the squeezed axis)
            for i in axis:
                dim = old[i]
                del new._coordset[dim]

        return new
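    # Illustration (added; an assumption based on the body above): squeezing a
    # (1, 4) dataset removes the singleton dimension and its coordinate.
    #
    # >>> nd = scp.NDDataset(np.ones((1, 4)))
    # >>> nd.squeeze().shape
    # (4,)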
    def atleast_2d(self, inplace=False):
        """
        Expand the shape of an array to make it at least 2D.

        Parameters
        ----------
        inplace : bool, optional, default=`False`
            Flag to say that the method returns a new object (default) or not
            (inplace=True).

        Returns
        -------
        `NDDataset`
            The input array, but with dimensions increased.

        See Also
        --------
        squeeze : The inverse operation, removing singleton dimensions.
        """
        new = self.copy() if not inplace else self
        coordset = self.coordset

        if self.ndim == 0:
            new._data = self._data[np.newaxis, np.newaxis]
            new._mask = self._mask[np.newaxis, np.newaxis]
            new.dims = ["v", "u"]
            new.set_coordset(u=None, v=None)
        elif self.ndim == 1:
            xdim = new.dims[0]
            xcoord = coordset[0] if coordset is not None else None
            new._data = self._data[np.newaxis]
            new._mask = self._mask[np.newaxis]
            new.dims = ["u", xdim]
            new.set_coordset({xdim: xcoord, "u": None})

        return new
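    # Illustration (added; not in the original source):
    #
    # >>> scp.NDDataset([1.0, 2.0, 3.0]).atleast_2d().shape
    # (1, 3)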
    def swapdims(self, dim1, dim2, inplace=False):
        """
        Interchange two dimensions of a NDDataset.

        Parameters
        ----------
        dim1 : int
            First axis.
        dim2 : int
            Second axis.
        inplace : bool, optional, default=`False`
            Flag to say that the method returns a new object (default) or not
            (inplace=True).

        Returns
        -------
        `NDDataset`
            Swapped dataset.

        See Also
        --------
        transpose : Transpose a dataset.
        """
        new = super().swapdims(dim1, dim2, inplace=inplace)
        new.history = f"Data swapped between dims {dim1} and {dim2}"
        return new
    @property
    def T(self):
        """
        Transposed `NDDataset`.

        The same object is returned if `ndim` is less than 2.
        """
        return self.transpose()
    def take(self, indices, **kwargs):
        """
        Take elements from an array.

        Returns
        -------
        `NDDataset`
            A sub-dataset defined by the input indices.
        """
        # handle the various syntaxes to pass the axis
        dims = self._get_dims_from_args(**kwargs)
        axis = self._get_dims_index(dims)
        axis = axis[0] if axis else None

        # indices = indices.tolist()
        if axis is None:
            # just do a fancy indexing
            return self[indices]

        if axis < 0:
            axis = self.ndim + axis

        index = tuple(
            [...] + [indices] + [slice(None) for i in range(self.ndim - 1 - axis)],
        )
        return self[index]
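    # Hedged sketch (added; the `dim` keyword is an assumption based on
    # `_get_dims_from_args` above):
    #
    # >>> nd = scp.NDDataset(np.arange(12.0).reshape(3, 4))
    # >>> nd.take([0, 2], dim="y").shape
    # (2, 4)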
    @property
    def timezone(self):
        """
        Return the timezone information.

        A timezone's offset refers to how many hours the timezone is from
        Coordinated Universal Time (UTC).

        In spectrochempy, all datetimes are stored in UTC, so the conversion must
        be done during the display of these datetimes using tzinfo.
        """
        return str(self._timezone)

    @timezone.setter
    def timezone(self, val):
        try:
            self._timezone = ZoneInfo(val)
        except ZoneInfoNotFoundError as e:
            raise ZoneInfoNotFoundError(
                "You can get a list of valid timezones at "
                "https://en.wikipedia.org/wiki/List_of_tz_database_time_zones ",
            ) from e
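    # Illustration (added; not in the original source): the setter accepts an
    # IANA timezone name, and the getter returns it as a string.
    #
    # >>> nd.timezone = "Europe/Paris"
    # >>> nd.timezone
    # 'Europe/Paris'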
    def to_array(self):
        """
        Return a numpy masked array.

        Other NDDataset attributes are lost.

        Returns
        -------
        `~numpy.ndarray`
            The numpy masked array from the NDDataset data.

        Examples
        --------
        >>> dataset = scp.read('wodger.spg')
        >>> a = scp.to_array(dataset)

        equivalent to:

        >>> a = np.ma.array(dataset)

        or

        >>> a = dataset.masked_data
        """
        return np.ma.array(self)
    def to_xarray(self):
        """
        Convert a NDDataset instance to an `~xarray.DataArray` object.

        Warning: the xarray library must be available.

        Returns
        -------
        object
            An xarray.DataArray object.
        """
        # See the xarray documentation of `DataArray` for the meaning of its
        # attributes (dims, values, coords, name, attrs) and of its constructor
        # parameters (data, coords, dims, name, attrs, encoding).
        xr = import_optional_dependency("xarray")
        if xr is None:
            return None

        x, y = self.x, self.y
        tx = x.title
        if y:
            ty = y.title
            da = xr.DataArray(
                np.array(self.data, dtype=np.float64),
                coords=[(ty, y.data), (tx, x.data)],
            )
            da.attrs["units"] = self.units
        else:
            da = xr.DataArray(
                np.array(self.data, dtype=np.float64),
                coords=[(tx, x.data)],
            )
            da.attrs["units"] = self.units

        da.attrs["title"] = self.title
        return da
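    # Hedged sketch (added; assumes the optional xarray dependency is installed
    # and that the dataset is 1D or 2D with populated `x` (and `y`) coordinates,
    # as required by the body above):
    #
    # >>> da = nd.to_xarray()
    # >>> da.attrs["units"], da.attrs["title"]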
    def transpose(self, *dims, inplace=False):
        """
        Permute the dimensions of a NDDataset.

        Parameters
        ----------
        *dims : sequence of dimension indexes or names, optional
            By default, reverse the dimensions, otherwise permute the dimensions
            according to the values given.
        inplace : bool, optional, default=`False`
            Flag to say that the method returns a new object (default) or not
            (inplace=True).

        Returns
        -------
        `NDDataset`
            Transposed NDDataset.

        See Also
        --------
        swapdims : Interchange two dimensions of a NDDataset.
        """
        new = super().transpose(*dims, inplace=inplace)
        new.history = (
            f"Data transposed between dims: {dims}" if dims else "Data transposed"
        )
        return new
    # # ----------------------------------------------------------------------------------
    # # DASH GUI options (Work in Progress - not used for now)
    # # ----------------------------------------------------------------------------------
    # # TODO: refactor the spectrochempy preference system to have a common basis
    #
    # @property
    # def ranges(self):
    #     return self._ranges
    #
    # @ranges.setter
    # def ranges(self, value):
    #     self._ranges = value
    #
    # @property
    # def state(self):
    #     """State of the controller window for this dataset."""
    #     return self._state
    #
    # @state.setter
    # def state(self, val):
    #     self._state = val
    #
    # @property
    # def processeddata(self):
    #     """Data after processing (optionally used)."""
    #     return self._processeddata
    #
    # @processeddata.setter
    # def processeddata(self, val):
    #     self._processeddata = val
    #
    # @property
    # def processedmask(self):
    #     """Mask for the optional processed data."""
    #     return self._processedmask
    #
    # @processedmask.setter
    # def processedmask(self, val):
    #     self._processedmask = val
    #
    # @property
    # def baselinedata(self):
    #     """Data for an optional baseline."""
    #     return self._baselinedata
    #
    # @baselinedata.setter
    # def baselinedata(self, val):
    #     self._baselinedata = val
    #
    # @property
    # def referencedata(self):
    #     """Data for an optional reference spectrum."""
    #     return self._referencedata
    #
    # @referencedata.setter
    # def referencedata(self, val):
    #     self._referencedata = val


# ======================================================================================
# Module functions
# ======================================================================================
# make some NDDataset operations accessible from the spectrochempy API
thismodule = sys.modules[__name__]

api_funcs = [
    "sort",
    "copy",
    "squeeze",
    "swapdims",
    "transpose",
    "to_array",
    "to_xarray",
    "take",
    "set_complex",
    "set_quaternion",
    "set_hypercomplex",
    "component",
    "to",
    "to_base_units",
    "to_reduced_units",
    "ito",
    "ito_base_units",
    "ito_reduced_units",
    "is_units_compatible",
    "remove_masks",
]

for funcname in api_funcs:
    setattr(thismodule, funcname, getattr(NDDataset, funcname))
    __all__.append(funcname)

# import also npy functions
# TODO: this will be changed with __array_functions__
from spectrochempy.processing.transformation.npy import dot

NDDataset.dot = dot

# ======================================================================================
# Set the operators
# ======================================================================================
_set_operators(NDDataset, priority=100000)
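# Illustration of the API exposure above (added sketch, not part of the original
# source): after import, each method listed in `api_funcs` is also reachable as a
# module-level function of the spectrochempy API, so the two calls below perform
# the same operation.
#
# >>> import spectrochempy as scp
# >>> ds = scp.NDDataset([[1.0, 2.0], [3.0, 4.0]])
# >>> tds1 = ds.transpose()
# >>> tds2 = scp.transpose(ds)  # same operation via the API function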