Source code for spectrochempy.analysis.peakfinding.peakfinding

# -*- coding: utf-8 -*-
# ======================================================================================
# Copyright (©) 2015-2023 LCS - Laboratoire Catalyse et Spectrochimie, Caen, France.
# CeCILL-B FREE SOFTWARE LICENSE AGREEMENT
# See full LICENSE agreement in the root directory.
# ======================================================================================
"""
Peak finding module.

Contains wrappers of `scipy.signal` peak finding functions.
"""

__all__ = ["find_peaks"]

__dataset_methods__ = ["find_peaks"]

import numpy as np
import scipy

from spectrochempy.application import warning_
from spectrochempy.core.units import Quantity

# Todo:
# find_peaks_cwt(vector, widths[, wavelet, ...]) Attempt to find the peaks in a 1-D array.
# argrelmin(data[, axis, order, mode]) 	Calculate the relative minima of data.
# argrelmax(data[, axis, order, mode]) 	Calculate the relative maxima of data.
# argrelextrema(data, comparator[, axis, ...]) 	Calculate the relative extrema of data.


[docs]def find_peaks( dataset, height=None, window_length=3, threshold=None, distance=None, prominence=None, width=None, wlen=None, rel_height=0.5, plateau_size=None, use_coord=True, ): """ Wrapper and extension of `scpy.signal.find_peaks`\ . Find peaks inside a 1D `NDDataset` based on peak properties. This function finds all local maxima by simple comparison of neighbouring values. Optionally, a subset of these peaks can be selected by specifying conditions for a peak's properties. .. warning:: This function may return unexpected results for data containing NaNs. To avoid this, NaNs should either be removed or replaced. Parameters ---------- dataset : `NDDataset` A 1D NDDataset or a 2D NDdataset with `len(X.y) == 1` . height : `float` or :term:`array-like`\ , optional, default: `None` Required height of peaks. Either a number, `None` , an array matching `x` or a 2-element sequence of the former. The first element is always interpreted as the minimal and the second, if supplied, as the maximal required height. window_length : `int`, default: 5 The length of the filter window used to interpolate the maximum. window_length must be a positive odd integer. If set to one, the actual maximum is returned. threshold : `float` or :term:`array-like`\ , optional Required threshold of peaks, the vertical distance to its neighbouring samples. Either a number, `None` , an array matching `x` or a 2-element sequence of the former. The first element is always interpreted as the minimal and the second, if supplied, as the maximal required threshold. distance : `float`\ , optional Required minimal horizontal distance in samples between neighbouring peaks. Smaller peaks are removed first until the condition is fulfilled for all remaining peaks. prominence : `float` or :term:`array-like`\ , optional Required prominence of peaks. Either a number, `None` , an array matching `x` or a 2-element sequence of the former. The first element is always interpreted as the minimal and the second, if supplied, as the maximal required prominence. width : `float` or :term:`array-like`\ , optional Required width of peaks in samples. Either a number, `None` , an array matching `x` or a 2-element sequence of the former. The first element is always interpreted as the minimal and the second, if supplied, as the maximal required width. Floats are interpreted as width measured along the 'x' Coord; ints are interpreted as a number of points. wlen : `int` or `float`, optional Used for calculation of the peaks prominences, thus it is only used if one of the arguments `prominence` or `width` is given. Floats are interpreted as measured along the 'x' Coord; ints are interpreted as a number of points. See argument len` in `peak_prominences` of the scipy documentation for a full description of its effects. rel_height : `float`, optional, Used for calculation of the peaks width, thus it is only used if `width` is given. See argument `rel_height` in `peak_widths` of the scipy documentation for a full description of its effects. plateau_size : `float` or :term:`array-like`\ , optional Required size of the flat top of peaks in samples. Either a number, `None` , an array matching `x` or a 2-element sequence of the former. The first element is always interpreted as the minimal and the second, if supplied as the maximal required plateau size. Floats are interpreted as measured along the 'x' Coord; ints are interpreted as a number of points. use_coord : `bool`\ , optional Set whether the x Coord (when it exists) should be used instead of indices for the positions and width. If True, the units of the other parameters are interpreted according to the coordinates. Returns ------- peaks : `~numpy.ndarray` Indices of peaks in `dataset` that satisfy all given conditions. properties : `dict` A dictionary containing properties of the returned peaks which were calculated as intermediate results during evaluation of the specified conditions: * ``peak_heights`` If `height` is given, the height of each peak in `dataset`\ . * ``left_thresholds``\ , ``right_thresholds`` If `threshold` is given, these keys contain a peaks vertical distance to its neighbouring samples. * ``prominences``\ , ``right_bases``\ , ``left_bases`` If `prominence` is given, these keys are accessible. See `scipy.signal.peak_prominences` for a full description of their content. * ``width_heights``\ , ``left_ips``\ , ``right_ips`` If `width` is given, these keys are accessible. See `scipy.signal.peak_widths` for a full description of their content. * plateau_sizes, left_edges', 'right_edges' If `plateau_size` is given, these keys are accessible and contain the indices of a peak's edges (edges are still part of the plateau) and the calculated plateau sizes. To calculate and return properties without excluding peaks, provide the open interval `(None, None)` as a value to the appropriate argument (excluding `distance`\ ). Warns ----- PeakPropertyWarning Raised if a peak's properties have unexpected values (see `peak_prominences` and `peak_widths` ). See Also -------- find_peaks_cwt: In `scipy.signal`: Find peaks using the wavelet transformation. peak_prominences: In `scipy.signal`: Directly calculate the prominence of peaks. peak_widths: In `scipy.signal`: Directly calculate the width of peaks. Notes ----- In the context of this function, a peak or local maximum is defined as any sample whose two direct neighbours have a smaller amplitude. For flat peaks (more than one sample of equal amplitude wide) the index of the middle sample is returned (rounded down in case the number of samples is even). For noisy signals the peak locations can be off because the noise might change the position of local maxima. In those cases consider smoothing the signal before searching for peaks or use other peak finding and fitting methods (like `scipy.signal.find_peaks_cwt` ). Some additional comments on specifying conditions: * Almost all conditions (excluding `distance`\ ) can be given as half-open or closed intervals, e.g `1` or `(1, None)` defines the half-open interval :math:`[1, \\infty]` while `(None, 1)` defines the interval :math:`[-\\infty, 1]`\ . The open interval `(None, None)` can be specified as well, which returns the matching properties without exclusion of peaks. * The border is always included in the interval used to select valid peaks. * For several conditions the interval borders can be specified with arrays matching `dataset` in shape which enables dynamic constrains based on the sample position. * The conditions are evaluated in the following order: `plateau_size` , `height` , `threshold` , `distance` , `prominence` , `width` . In most cases this order is the fastest one because faster operations are applied first to reduce the number of peaks that need to be evaluated later. * While indices in `peaks` are guaranteed to be at least `distance` samples apart, edges of flat peaks may be closer than the allowed `distance` . * Use `wlen` to reduce the time it takes to evaluate the conditions for `prominence` or `width` if `dataset` is large or has many local maxima (see `scipy.signal.peak_prominences`\ ). Examples -------- >>> dataset = scp.read("irdata/nh4y-activation.spg") >>> X = dataset[0, 1800.0:1300.0] >>> peaks, properties = X.find_peaks(height=1.5, distance=50.0, width=0.0) >>> len(peaks.x) 2 >>> peaks.x.values <Quantity([ 1644 1455], '1 / centimeter')> >>> properties["peak_heights"][0] <Quantity(2.26663446, 'absorbance')> >>> properties["widths"][0] <Quantity(38.729003, '1 / centimeter')> """ # get the dataset X = dataset.squeeze() if X.ndim > 1: raise ValueError( "Works only for 1D NDDataset or a 2D NDdataset with `len(X.y) <= 1`" ) # TODO: implement for 2D datasets (would be useful e.g., for NMR) # be sure that data are real (NMR case for instance) if X.is_complex or X.is_quaternion: X = X.real # Check if we can work with the coordinates use_coord = use_coord and X.coordset is not None # init variable in case we do not use coordinates lastcoord = None xunits = 1 dunits = 1 step = 1 if use_coord: # We will use the last coordinates (but if the data were transposed or sliced, # the name can be something else than 'x') lastcoord = X.coordset[X.dims[-1]] # units xunits = lastcoord.units if lastcoord.units is not None else 1 dunits = X.units if X.units is not None else 1 # assume linear x coordinates # TODO: what if the coordinates are not linear? if not lastcoord.linear: warning_( "The x coordinates are not linear. " "The peak finding might be wrong." ) spacing = np.mean(lastcoord.spacing) else: spacing = lastcoord.spacing if isinstance(spacing, Quantity): spacing = spacing.magnitude step = np.abs(spacing) # transform coord (if exists) to index # TODO: allow units for distance, width, wlen, plateau_size distance = int(round(distance / step)) if distance is not None else None width = int(round(width / step)) if width is not None else None wlen = int(round(wlen / step)) if wlen is not None else None plateau_size = int(round(plateau_size / step)) if plateau_size is not None else None # now the distance, width ... parameters are given in data points peaks, properties = scipy.signal.find_peaks( X.data, height=height, threshold=threshold, distance=distance, prominence=prominence, width=width, wlen=wlen, rel_height=rel_height, plateau_size=plateau_size, ) out = X[peaks] if not use_coord: out.coordset = None # remove the coordinates # quadratic interpolation to find the maximum window_length = window_length if window_length % 2 == 0 else window_length - 1 x_pos = [] if window_length > 1: for i, peak in enumerate(peaks): start = peak - window_length // 2 end = peak + window_length // 2 + 1 sle = slice(start, end) y = X.data[sle] x = lastcoord.data[sle] if use_coord else range(start, end) coef = np.polyfit(x, y, 2) x_at_max = -coef[1] / (2 * coef[0]) y_at_max = np.poly1d(coef)(x_at_max) out[i] = y_at_max if not use_coord: x_pos.append(x_at_max) else: out.coordset(out.dims[-1])[i] = x_at_max if x_pos and not use_coord: from spectrochempy.core.dataset.coord import Coord out.coordset = Coord(x_pos) # transform back index to coord if use_coord: for key in ["peak_heights", "width_heights", "prominences"]: if key in properties: properties[key] = [height * dunits for height in properties[key]] for key in ( "left_bases", "right_bases", "left_edges", "right_edges", ): # values are initially of int type if key in properties: properties[key] = [ lastcoord.values[int(index)] for index in properties[key].astype("float64") ] def _prop(ips): # interpolate coord floor = int(np.floor(ips)) return lastcoord.values[floor] + (ips - floor) * ( lastcoord.values[floor + 1] - lastcoord.values[floor] ) for key in ("left_ips", "right_ips"): # values are float type if key in properties: properties[key] = [_prop(ips) for ips in properties[key]] if "widths" in properties: properties["widths"] = [ np.abs(width * step) * xunits for width in properties["widths"] ] if "plateau_sizes" in properties: properties["plateau_sizes"] = [ np.abs(sizes * step) * xunits for sizes in properties["plateau_sizes"] ] out.name = "peaks of " + X.name out.history = f"find_peaks(): {len(peaks)} peak(s) found" return out, properties