# ======================================================================================
# Copyright (©) 2015-2025 LCS - Laboratoire Catalyse et Spectrochimie, Caen, France.
# CeCILL-B FREE SOFTWARE LICENSE AGREEMENT
# See full LICENSE agreement in the root directory.
# ======================================================================================
"""Module to extend NDDataset with import methods."""
__all__ = ["read_jcamp"]
__dataset_methods__ = __all__
import io
import re
from datetime import datetime
import numpy as np
from spectrochempy.core.dataset.coord import Coord
from spectrochempy.core.readers.importer import Importer
from spectrochempy.core.readers.importer import _importer_method
from spectrochempy.utils.datetimeutils import UTC
from spectrochempy.utils.decorators import deprecated
from spectrochempy.utils.docreps import _docstring
# ======================================================================================
# Public functions
# ======================================================================================
# Remove "read_jcamp" from the shared Importer "See Also" template so the
# docstring below can list every reader except itself
# (used via %(Importer.see_also.no_read_jcamp)s).
_docstring.delete_params("Importer.see_also", "read_jcamp")
@_docstring.dedent
def read_jcamp(*paths, **kwargs):
    r"""
    Open Infrared ``JCAMP-DX`` files with extension :file:`.jdx` or :file:`.dx`.

    Limited to AFFN encoding (see :cite:t:`mcdonald:1988`).

    Parameters
    ----------
    %(Importer.parameters)s

    Returns
    -------
    %(Importer.returns)s

    Other Parameters
    ----------------
    %(Importer.other_parameters)s

    See Also
    --------
    %(Importer.see_also.no_read_jcamp)s
    """
    # Register the file-dialog filter and the protocol key, then delegate
    # the actual work to the generic Importer machinery.
    kwargs["filetypes"] = ["JCAMP-DX files (*.jdx *.dx)"]
    kwargs["protocol"] = ["jcamp"]
    importer = Importer()
    return importer(*paths, **kwargs)
@deprecated(replace="read_jcamp")
def read_jdx(*args, **kwargs):
    """Deprecated alias of `read_jcamp` (old ``.jdx``-specific entry point)."""
    # NOTE: the replace target was previously misspelled "read__jcamp",
    # which made the deprecation message point to a nonexistent function.
    return read_jcamp(*args, **kwargs)
@deprecated(replace="read_jcamp")
def read_dx(*args, **kwargs):  # pragma: no cover
    """Deprecated alias of `read_jcamp` kept for the old ``.dx`` entry point."""
    # Pure delegation: same positional and keyword arguments, same return.
    return read_jcamp(*args, **kwargs)
# ======================================================================================
# private functions
# ======================================================================================
@_importer_method
def _read_jdx(*args, **kwargs):
    """Parse a JCAMP-DX (AFFN) file into *dataset* and return it.

    ``args`` is ``(dataset, filename)``.  ``kwargs`` may contain ``content``
    (raw bytes of the file, read instead of opening *filename*) and
    ``sortbydate`` (bool, default True).  Raises ValueError on malformed
    headers or inconsistent multi-spectrum blocks.
    """
    # read jdx file
    dataset, filename = args
    content = kwargs.get("content")
    sortbydate = kwargs.pop("sortbydate", True)
    # Use the in-memory bytes when provided, otherwise open the file on disk.
    if content is not None:
        fid = io.StringIO(content.decode("utf-8"))
    else:
        fid = open(filename)  # noqa: SIM115
    # Read header of outer Block: scan forward to the ##TITLE labelled record.
    # NOTE(review): if the file has no ##TITLE, _readl() returns ("EOF", "")
    # forever and this loop never terminates, so the EOF guard below looks
    # unreachable — confirm and consider breaking on "EOF" inside the loop.
    keyword = ""
    while keyword != "##TITLE":
        keyword, text = _readl(fid)
    if keyword != "EOF":
        jdx_title = text
    else:  # pragma: no cover
        raise ValueError("No ##TITLE LR in outer block header")
    # Same scanning pattern for the data-type record (two spellings accepted).
    # NOTE(review): same non-terminating-on-EOF concern as above.
    while (keyword != "##DATA TYPE") and (keyword != "##DATATYPE"):
        keyword, text = _readl(fid)
    if keyword != "EOF":
        jdx_data_type = text
    else:  # pragma: no cover
        raise ValueError("No ##DATA TYPE LR in outer block header")
    # "LINK" marks a compound file holding several spectra; ##BLOCKS gives
    # their count.  Otherwise a single infrared spectrum is expected.
    if jdx_data_type == "LINK":
        while keyword != "##BLOCKS":
            keyword, text = _readl(fid)
        nspec = int(text)
    elif jdx_data_type.replace(" ", "") == "INFRAREDSPECTRUM":
        nspec = 1
    else:
        raise ValueError("DATA TYPE must be LINK or INFRARED SPECTRUM")
    # Create variables
    xaxis = np.array([])
    data = np.array([])
    alltitles, alltimestamps, alldates, xunits, yunits, allorigins = (
        [],
        [],
        [],
        [],
        [],
        [],
    )
    # Per-spectrum bookkeeping: number of points, first and last x values.
    nx, firstx, lastx = (
        np.zeros(nspec, "int"),
        np.zeros(nspec, "float"),
        np.zeros(nspec, "float"),
    )
    # Read the spectra
    for i in range(nspec):
        # Reset variables
        keyword = ""
        # (year, month,...) must be reset at each spectrum because labels "time"
        # and "longdate" are not required in JDX file
        [year, month, day, hour, minute, second] = "", "", "", "", "", ""
        # Read JDX file for spectrum n° i
        while keyword != "##END":
            keyword, text = _readl(fid)
            if keyword in ["##OWNER", "##JCAMP-DX"]:
                continue
            if keyword == "##ORIGIN":
                allorigins.append(text)
            elif keyword == "##TITLE":
                # Add the title of the spectrum in the list alltitles
                alltitles.append(text)
            elif keyword == "##LONGDATE":
                # Expected form "YYYY/MM/DD"
                [year, month, day] = text.split("/")
            elif keyword == "##TIME":
                # Expected form "hh:mm:ss" (or with '.' separators)
                [hour, minute, second] = re.split(r"[:.]", text)
            elif keyword == "##XUNITS":
                xunits.append(text)
            elif keyword == "##YUNITS":
                yunits.append(text)
            elif keyword == "##FIRSTX":
                firstx[i] = float(text)
            elif keyword == "##LASTX":
                lastx[i] = float(text)
            elif keyword == "##XFACTOR":
                # Scaling factors applied to stored x / y values (AFFN).
                xfactor = float(text)
            elif keyword == "##YFACTOR":
                yfactor = float(text)
            elif keyword == "##NPOINTS":
                nx[i] = float(text)
            elif keyword == "##XYDATA":
                # Read the intensities
                allintensities = []
                while keyword != "##END":
                    keyword, text = _readl(fid)
                    # for each line, get all the values except the first one
                    # (first value = wavenumber)
                    intensities = list(filter(None, text.split(" ")[1:]))
                    if len(intensities) > 0:
                        allintensities += intensities
                spectra = np.array(
                    [allintensities],
                )  # convert allintensities into an array (still strings here)
                spectra[
                    spectra == "?"
                ] = "nan"  # deals with missing or out of range intensity values
                spectra = spectra.astype(np.float32)
                spectra *= yfactor
                # add spectra in "data" matrix
                data = spectra if not data.size else np.concatenate((data, spectra), 0)
                # Check "firstx", "lastx" and "nx"
                if firstx[i] != 0 and lastx[i] != 0 and nx[i] != 0:
                    if not xaxis.size:
                        # Creation of xaxis if it doesn't exist yet
                        xaxis = np.linspace(firstx[0], lastx[0], nx[0])
                        xaxis = np.around((xaxis * xfactor), 3)
                    else:
                        # Check the consistency of xaxis
                        if nx[i] - nx[i - 1] != 0:
                            raise ValueError(
                                "Inconsistent data set: number of wavenumber per spectrum should be identical",
                            )
                        if firstx[i] - firstx[i - 1] != 0:
                            raise ValueError(
                                "Inconsistent data set: the x axis should start at same value",
                            )
                        if lastx[i] - lastx[i - 1] != 0:
                            raise ValueError(
                                "Inconsistent data set: the x axis should end at same value",
                            )
                else:
                    raise ValueError(
                        "##FIRST, ##LASTX or ##NPOINTS are unusable in the spectrum n°",
                        i + 1,
                    )
        # Creation of the acquisition date (only when both ##LONGDATE and
        # ##TIME were fully parsed for this spectrum).
        if (
            year != ""
            and month != ""
            and day != ""
            and hour != ""
            and minute != ""
            and second != ""
        ):
            date = datetime(
                int(year),
                int(month),
                int(day),
                int(hour),
                int(minute),
                int(second),
                tzinfo=UTC,
            )
            timestamp = date.timestamp()
            # Transform back to timestamp for storage in the Coord object
            # use datetime.fromtimestamp(d, timezone.utc))
            # to transform back to datetime object
        else:
            timestamp = date = None
            # Todo: cases where incomplete date and/or time info
        alltimestamps.append(timestamp)
        alldates.append(date)
        # Check the consistency of xunits and yunits
        if i > 0:
            if yunits[i] != yunits[i - 1]:
                raise ValueError(
                    f"##YUNITS should be the same for all spectra (check spectrum n°{i + 1}",
                )
            if xunits[i] != xunits[i - 1]:
                raise ValueError(
                    f"##XUNITS should be the same for all spectra (check spectrum n°{i + 1}",
                )
    # Determine xaxis name ****************************************************
    if xunits[0].strip() == "1/CM":
        axisname = "wavenumbers"
        axisunit = "cm^-1"
    elif xunits[0].strip() == "MICROMETERS":
        axisname = "wavelength"
        axisunit = "um"
    elif xunits[0].strip() == "NANOMETERS":
        axisname = "wavelength"
        axisunit = "nm"
    elif xunits[0].strip() == "SECONDS":
        axisname = "time"
        axisunit = "s"
    elif xunits[0].strip() == "ARBITRARY UNITS":
        axisname = "arbitrary unit"
        axisunit = None
    else:
        axisname = ""
        axisunit = ""
    fid.close()
    dataset.data = data
    dataset.name = jdx_title
    dataset.filename = filename
    if yunits[0].strip() == "ABSORBANCE":
        dataset.units = "absorbance"
        dataset.title = "absorbance"
    elif yunits[0].strip() == "TRANSMITTANCE":
        # TODO: This units not in pint. Add this
        dataset.title = "transmittance"
    # now add coordinates
    _x = Coord(xaxis, title=axisname, units=axisunit)
    if jdx_data_type == "LINK":
        # Multi-spectrum file: y axis carries the acquisition timestamps,
        # with dates and titles attached as labels.
        _y = Coord(
            alltimestamps,
            title="acquisition timestamp (GMT)",
            units="s",
            labels=(alldates, alltitles),
        )
        dataset.set_coordset(y=_y, x=_x)
    else:
        _y = Coord()
        dataset.set_coordset(y=_y, x=_x)
    # Set origin, description and history
    if nspec > 1:
        origins = set(allorigins)
        if len(origins) == 0:
            pass
        elif len(origins) == 1:
            dataset.origin = allorigins[0]
        else:
            # NOTE(review): the [0] keeps only ONE "origin; " entry before
            # stripping the trailing "; " — presumably the intent was to
            # join all distinct origins; confirm.
            dataset.origin = [(origin + "; ") for origin in set(allorigins)][0][:-2]
    dataset.description = f"Dataset from jdx file: '{jdx_title}'"
    dataset.history = "Imported from jdx file"
    if sortbydate and nspec > 1:
        # NOTE(review): sorts along dim "x" although the history says
        # "Sorted by date" and the timestamps live on dim "y" — confirm
        # which dim is intended.
        dataset.sort(dim="x", inplace=True)
        dataset.history = "Sorted by date"
    # Todo: make sure that the lowest index correspond to the largest wavenumber
    # for compatibility with dataset created by read_omnic:
    # reset modification date to creation date
    dataset._modified = dataset._created
    return dataset
@_importer_method
def _read_dx(*args, **kwargs):  # pragma: no cover
    # ``.dx`` files use the same JCAMP-DX format: delegate to _read_jdx.
    return _read_jdx(*args, **kwargs)
def _readl(fid):
line = fid.readline()
if not line:
return "EOF", ""
line = line.strip(" \n") # remove newline character
if line[0:2] == "##": # if line starts with "##"
if line[0:5] == "##END": # END KEYWORD, no text
keyword = "##END"
text = ""
else: # keyword + text
keyword, text = line.split("=")
else:
keyword = ""
text = line.strip()
return keyword, text