Source code for spectrochempy.core.readers.importer

# -*- coding: utf-8 -*-
# ======================================================================================
# Copyright (©) 2015-2023 LCS - Laboratoire Catalyse et Spectrochimie, Caen, France.
# CeCILL-B FREE SOFTWARE LICENSE AGREEMENT
# See full LICENSE agreement in the root directory.
# ======================================================================================
"""
This module defines a generic class to import directories, files and contents.
"""
__all__ = ["read", "read_dir"]  # , "read_remote"]
__dataset_methods__ = __all__

import io
import re
from warnings import warn
from zipfile import ZipFile

import requests
import yaml
from traitlets import Dict, HasTraits, List, Type, Unicode

from spectrochempy.application import info_, warning_
from spectrochempy.utils.docstrings import _docstring
from spectrochempy.utils.exceptions import DimensionsCompatibilityError, ProtocolError
from spectrochempy.utils.file import (
    check_filename_to_open,
    get_directory_name,
    get_filenames,
    pathclean,
)

FILETYPES = [
    ("scp", "SpectroChemPy files (*.scp)"),
    ("omnic", "Nicolet OMNIC files and series (*.spa *.spg *.srs)"),
    ("soc", "Surface Optics Corp. (*.ddr *.hdr *.sdr)"),
    ("labspec", "LABSPEC exported files (*.txt)"),
    ("opus", "Bruker OPUS files (*.[0-9]*)"),
    (
        "topspin",
        "Bruker TOPSPIN fid or series or processed data files "
        "(fid ser 1[r|i] 2[r|i]* 3[r|i]*)",
    ),
    ("matlab", "MATLAB files (*.mat)"),
    ("dso", "Data Set Object files (*.dso)"),
    ("jcamp", "JCAMP-DX files (*.jdx *.dx)"),
    ("csv", "CSV files (*.csv)"),
    ("excel", "Microsoft Excel files (*.xls)"),
    ("zip", "Compressed folder of data files (*.zip)"),
    ("quadera", "Quadera ascii files (*.asc)"),
    ("carroucell", "Carroucell files (*spa)"),
    ("galactic", "GRAMS/Thermo Galactic files (*.spc)"),
    ("wire", "Renishaw WiRE files (*.wdf)"),
    #  ('all', 'All files (*.*)')
]
ALIAS = [
    ("spg", "omnic"),
    ("spa", "omnic"),
    ("ddr", "soc"),
    ("hdr", "soc"),
    ("sdr", "soc"),
    ("spc", "galactic"),
    ("srs", "omnic"),
    ("mat", "matlab"),
    ("txt", "labspec"),
    ("jdx", "jcamp"),
    ("dx", "jcamp"),
    ("xls", "excel"),
    ("asc", "quadera"),
    ("wdf", "wire"),
]

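
# Illustrative note (not executed): a filename extension is tied to a reading protocol
# either directly (when it matches a FILETYPES key) or through ALIAS, e.g.:
#
#   dict(ALIAS)["spg"]        # -> "omnic"
#   dict(FILETYPES)["omnic"]  # -> "Nicolet OMNIC files and series (*.spa *.spg *.srs)"
#
# The Importer below uses these mappings to validate extensions and to honour the
# `protocol` keyword before dispatching to the matching registered reader.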

# --------------------------------------------------------------------------------------
class Importer(HasTraits):
    # Private Importer class

    objtype = Type()
    datasets = List()
    files = Dict()
    default_key = Unicode()
    protocol = Unicode()

    protocols = Dict()
    filetypes = Dict()

    def __init__(self):
        super().__init__()

        self.filetypes = dict(FILETYPES)
        temp = list(zip(*FILETYPES))
        temp.reverse()
        self.protocols = dict(zip(*temp))
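        # NOTE: `filetypes` maps a protocol key to its file-type description, while
        # `protocols` is the reverse mapping (description -> key) obtained by
        # swapping the two columns of FILETYPES before rebuilding the dict.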

        #  add alias

        self.alias = dict(ALIAS)

    def __call__(self, *args, **kwargs):
        self.datasets = []
        self.default_key = kwargs.pop("default_key", ".scp")

        if "merge" not in kwargs.keys():
            # if merge is not specified but the args are provided as a single list,
            # then we are supposed to merge the datasets. If merge is specified,
            # it has priority.
            # This is not useful for 1D datasets, as they are merged automatically
            # when they are compatible.
            if args and len(args) == 1 and isinstance(args[0], (list, tuple)):
                kwargs["merge"] = True

        args, kwargs = self._setup_objtype(*args, **kwargs)
        res = check_filename_to_open(*args, **kwargs)
        if res:
            # Normal return
            self.files = res
        else:
            # Cancel in dialog!
            return None

        for key in self.files.keys():
            # particular case of carroucell files
            if key == "" and kwargs.get("protocol") == ["carroucell"]:
                key = ".carroucell"
                self.files = {".carroucell": self.files[""]}

            # particular case of topspin files
            elif key == "" and kwargs.get("protocol") == ["topspin"]:
                key = ".topspin"
                self.files = {".topspin": self.files[""]}

            if key == "frombytes":
                # here we need to read contents
                for filename, content in self.files[key].items():
                    files_ = check_filename_to_open(filename)
                    kwargs["content"] = content
                    key_ = list(files_.keys())[0]
                    self._switch_protocol(key_, files_, **kwargs)
                if len(self.datasets) > 1:
                    self.datasets = self._do_merge(self.datasets, **kwargs)

            elif key and key[1:] not in list(zip(*FILETYPES))[0] + list(zip(*ALIAS))[0]:
                raise TypeError(f"Filetype `{key}` is unknown in spectrochempy")
            else:
                # here files are read locally from the disk, or remotely, using filenames
                self._switch_protocol(key, self.files, **kwargs)

        # now we reset the preferences for these newly loaded datasets
        if len(self.datasets) > 0:
            if all(nd is None for nd in self.datasets):
                return None

            try:
                prefs = self.datasets[0].preferences
                prefs.reset()
            except (FileNotFoundError, AttributeError):
                pass
        else:
            return None

        if len(self.datasets) == 1:
            nd = self.datasets[0]  # a single dataset is returned
            name = kwargs.pop("name", None)
            if name:
                nd.name = name
            return nd

        else:
            nds = self.datasets
            names = kwargs.pop("names", None)
            if names and len(names) == len(nds):
                for nd, name in zip(nds, names):
                    nd.name = name
            elif names and len(names) != len(nds):
                warn(
                    "the `names` list and the list of datasets have different lengths - names not applied"
                )
            return sorted(
                nds, key=str
            )  # return a sorted list (sorted according to their string representation)

    def _setup_objtype(self, *args, **kwargs):
        # check if the first argument is an instance of NDDataset or Project

        args = list(args)
        if (
            args
            and hasattr(args[0], "_implements")
            and args[0]._implements() in ["NDDataset"]
        ):
            # the first arg is an instance of NDDataset
            object = args.pop(0)
            self.objtype = type(object)

        else:
            # by default returned objtype is NDDataset (import here to avoid circular import)
            from spectrochempy.core.dataset.nddataset import NDDataset

            self.objtype = kwargs.pop("objtype", NDDataset)

        return args, kwargs
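
    # NOTE: accepting an optional leading NDDataset argument above is what lets the
    # readers be used both as API functions (e.g. scp.read(...)) and as dataset
    # methods, where the bound instance is passed first and sets `objtype`.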

    def _switch_protocol(self, key, files, **kwargs):
        protocol = kwargs.get("protocol", None)
        if protocol is not None and protocol != "ALL":
            if not isinstance(protocol, list):
                protocol = [protocol]
            if key and key[1:] not in protocol and self.alias[key[1:]] not in protocol:
                return

        datasets = []
        files[key] = sorted(files[key])  # sort the files according to their names
        for filename in files[key]:
            read_ = getattr(self, f"_read_{key[1:]}")

            dataset = None
            try:
                # read locally or using url if filename is an url
                dataset = read_(self.objtype(), filename, **kwargs)

            except (FileNotFoundError, OSError) as exc:
                # the file was not found locally.
                # if it is an url, or if only local files are allowed, we raise an error
                local_only = kwargs.get("local_only", False)
                if _is_url(filename) or local_only:
                    raise FileNotFoundError from exc

                # else, we try on github
                try:
                    # Try to get the file from github
                    kwargs["read_method"] = read_
                    info_(
                        "File/directory not found locally: Attempt to download it from "
                        "the GitHub repository `spectrochempy_data`..."
                    )
                    dataset = _read_remote(self.objtype(), filename, **kwargs)

                except FileNotFoundError as exc:
                    raise FileNotFoundError from exc

                except Exception as e:
                    warning_(str(e))

            except Exception as e:
                warning_(str(e))

            if dataset is not None:
                if not isinstance(dataset, list):
                    datasets.append(dataset)
                else:
                    datasets.extend(dataset)

        if len(datasets) > 1:
            datasets = self._do_merge(datasets, **kwargs)
            if kwargs.get("merge", False):
                datasets[0].name = pathclean(filename).stem
                datasets[0].filename = pathclean(filename)

        self.datasets.extend(datasets)

    def _do_merge(self, datasets, **kwargs):
        # merge several datasets into a single one when requested, or automatically
        # when they form a homogeneous set of 1D spectra
        merged = kwargs.get("merge", False)
        shapes = list({nd.shape if hasattr(nd, "shape") else None for nd in datasets})
        if len(shapes) == 1 and None not in shapes:
            # homogeneous set of files
            # we can merge them if they are 1D spectra
            if len(shapes[0]) == 1 or shapes[0][0] == 1:
                merged = kwargs.get("merge", True)  # priority to the keyword setting
        else:
            # not homogeneous
            merged = kwargs.get("merge", False)

        if merged:
            # Try to stack the dataset into a single one
            try:
                if datasets[0].ndim == 1:
                    dataset = self.objtype.stack(datasets)
                    dataset.history = "Stacked from several files"
                else:
                    dataset = self.objtype.concatenate(datasets, axis=0)
                    dataset.history = "Merged from several files"

                if dataset.coordset is not None and kwargs.pop("sortbydate", True):
                    dataset.sort(dim=0, inplace=True)
                    #  dataset.history = "Sorted"  (this not always by date:
                    #  actually for now it is by value which can be a date or not)
                datasets = [dataset]

            except DimensionsCompatibilityError as e:
                warn(str(e))  # return only the list

        return datasets

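# Note on the merging rule implemented by `_do_merge` above (illustrative figures):
# a homogeneous set of 1D spectra, e.g. three files each yielding a (1, 2567) dataset,
# is stacked into a single (3, 2567) dataset unless `merge=False` is passed explicitly,
# whereas files yielding different shapes are returned as a plain list of datasets.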

def _importer_method(func):
    # Decorator to define a given read function as belonging to Importer
    setattr(Importer, func.__name__, staticmethod(func))
    return func

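
# A minimal sketch (hypothetical name, for illustration only) of how a format-specific
# reader registers itself on Importer through the decorator above. `_switch_protocol`
# looks readers up as `_read_<extension>`, so the suffix must match an extension
# declared in FILETYPES or ALIAS. Actual readers follow the same
# (dataset, filename, **kwargs) signature:
#
#   @_importer_method
#   def _read_myext(*args, **kwargs):
#       dataset, filename = args
#       fid, kwargs = _openfid(filename, **kwargs)  # file-like object (local or remote)
#       ...                                         # parse `fid` and fill `dataset`
#       fid.close()
#       return dataset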

# --------------------------------------------------------------------------------------
# Public Generic Read function
# --------------------------------------------------------------------------------------

_docstring.get_sections(
    """
See Also
--------
read : Generic reader inferring protocol from the filename extension.
read_zip : Read Zip archives (containing spectrochempy readable files)
read_dir : Read an entire directory.
read_opus : Read OPUS spectra.
read_labspec : Read Raman LABSPEC spectra (:file:`.txt`\ ).
read_omnic : Read Omnic spectra (:file:`.spa`\ , :file:`.spg`\ , :file:`.srs`\ ).
read_soc : Read Surface Optics Corp. files (:file:`.ddr`\ , :file:`.hdr` or :file:`.sdr`\ ).
read_galactic : Read Galactic files (:file:`.spc`\ ).
read_quadera : Read a Pfeiffer Vacuum QUADERA mass spectrometer software file.
read_topspin : Read TopSpin Bruker NMR spectra.
read_csv : Read CSV files (:file:`.csv`\ ).
read_jcamp : Read Infrared JCAMP-DX files (:file:`.jdx`\ , :file:`.dx`\ ).
read_matlab : Read Matlab files (:file:`.mat`\ , :file:`.dso`\ ).
read_carroucell : Read files in a directory after a carroucell experiment.
read_wire : Read Renishaw WiRE files (:file:`.wdf`\ ).
""",
    sections=["See Also"],
    base="Importer",
)

_docstring.delete_params("Importer.see_also", "read")


@_docstring.dedent
def read(*paths, **kwargs):
    """
    Generic read method.

    This method is generally able to load experimental files based on extensions.

    Parameters
    ----------
    *paths : `str`, `~pathlib.Path` objects or valid urls, optional
        The data source(s) can be specified by the name or a list of names
        of the file(s) to be loaded:

        * *e.g.,* ( filename1, filename2, ..., \*\*kwargs )*

        If the list of filenames is enclosed into brackets:

        * *e.g.,* ( **[** *filename1, filename2, ...* **]**, \*\*kwargs *)*

        the returned datasets are merged to form a single dataset,
        except if ``merge`` is set to `False`.

        If a source is not provided (*i.e.,* no ``paths``\ , nor ``content``\ ),
        a dialog box will be opened to select files.
    %(kwargs)s

    Returns
    -------
    object : `NDDataset` or list of `NDDataset`
        The returned dataset(s).

    Other Parameters
    ----------------
    protocol : `str`\ , optional
        ``Protocol`` used for reading. It can be one of {``'scp'``\ , ``'omnic'``\ ,
        ``'opus'``\ , ``'topspin'``\ , ``'matlab'``\ , ``'jcamp'``\ , ``'csv'``\ ,
        ``'excel'``\ }. If not provided, the correct protocol is inferred (whenever
        it is possible) from the filename extension.
    directory : `~pathlib.Path` object or valid url, optional
        From where to read the files.
    merge : `bool`\ , optional, default: `False`
        If `True` and several filenames or a ``directory`` have been provided as
        arguments, then a single `NDDataset` with merged data (stacked along the
        first dimension) is returned.
    sortbydate : `bool`\ , optional, default: `True`
        Sort multiple files by acquisition date.
    description : `str`\ , optional
        A custom description.
    origin : one of {``'omnic'``\ , ``'tga'``\ }, optional
        Used when reading with the CSV protocol. In order to properly interpret a
        CSV file it can be necessary to set the origin of the spectra.
        Up to now only ``'omnic'`` and ``'tga'`` have been implemented.
    csv_delimiter : `str`\ , optional, default: `~spectrochempy.preferences.csv_delimiter`
        Set the column delimiter in CSV file.
    content : `bytes` object, optional
        Instead of passing a filename for further reading, a bytes content can be
        directly provided as bytes objects. The most convenient way is to use a
        dictionary. This feature is particularly useful for a GUI Dash application
        to handle drag and drop of files into a Browser.
    iterdir : `bool`\ , optional, default: `True`
        If `True` and no filename was provided, all files present in the provided
        ``directory`` are returned (and merged if ``merge`` is `True`\ ).
        It is assumed that all the files correspond to the current reading protocol.

        .. versionchanged:: 0.6.2
            ``iterdir`` replaces the deprecated ``listdir`` argument.
    recursive : `bool`\ , optional, default: `False`
        Read also in subfolders.
    replace_existing : `bool`\ , optional, default: `False`
        Used only when urls are specified. By default, existing files are not
        replaced and thus not downloaded again.
    download_only : `bool`\ , optional, default: `False`
        Used only when urls are specified. If `True`, only downloading and saving of
        the files is performed, with no attempt to read their content.
    read_only : `bool`\ , optional, default: `True`
        Used only when urls are specified. If `True`, saving of the files is
        performed in the current directory, or in the directory specified by the
        ``directory`` parameter.

    See Also
    --------
    %(Importer.see_also.no_read)s

    Examples
    --------
    Reading a single OPUS file (providing a windows type filename relative to
    the default `~spectrochempy.preferences.datadir`\ ):

    >>> scp.read('irdata\\\\OPUS\\\\test.0000')
    NDDataset: [float64] a.u. (shape: (y:1, x:2567))

    Reading a single OPUS file (providing a unix/python type filename relative to
    the default ``datadir``\ ):

    >>> scp.read('irdata/OPUS/test.0000')
    NDDataset: [float64] a.u. (shape: (y:1, x:2567))

    Single file specified with a pathlib.Path object:

    >>> from pathlib import Path
    >>> folder = Path('irdata/OPUS')
    >>> p = folder / 'test.0000'
    >>> scp.read(p)
    NDDataset: [float64] a.u. (shape: (y:1, x:2567))

    Multiple files not merged (return a list of datasets). Note that a directory
    is specified:

    >>> le = scp.read('test.0000', 'test.0001', 'test.0002', directory='irdata/OPUS')
    >>> len(le)
    3
    >>> le[0]
    NDDataset: [float64] a.u. (shape: (y:1, x:2567))

    Multiple files merged as the `merge` keyword is set to true:

    >>> scp.read('test.0000', 'test.0001', 'test.0002', directory='irdata/OPUS', merge=True)
    NDDataset: [float64] a.u. (shape: (y:3, x:2567))

    Multiple files to merge: they are passed as a list instead of using the
    keyword `merge`:

    >>> scp.read(['test.0000', 'test.0001', 'test.0002'], directory='irdata/OPUS')
    NDDataset: [float64] a.u. (shape: (y:3, x:2567))

    Multiple files not merged: they are passed as a list but `merge` is set to false:

    >>> le = scp.read(['test.0000', 'test.0001', 'test.0002'], directory='irdata/OPUS', merge=False)
    >>> len(le)
    3

    Read without a filename. This has the effect of opening a dialog for file(s)
    selection:

    >>> nd = scp.read()

    Read in a directory (assuming that only OPUS files are present in this directory):

    >>> le = scp.read(directory='irdata/OPUS')
    >>> len(le)
    2

    Again we can use merge to combine the spectra that have compatible dimensions:

    >>> scp.read(directory='irdata/OPUS', merge=True)
    [NDDataset: [float64] a.u. (shape: (y:1, x:5549)), NDDataset: [float64] a.u. (shape: (y:4, x:2567))]
    """
    importer = Importer()

    protocol = kwargs.get("protocol", None)
    available_protocols = list(importer.protocols.values())
    available_protocols.extend(
        list(importer.alias.keys())
    )  # to handle variants of protocols

    if protocol is None:
        kwargs["filetypes"] = list(importer.filetypes.values())
        kwargs["protocol"] = "ALL"
        default_filter = kwargs.get("default_filter", None)
        if default_filter is not None:
            kwargs["default_filter"] = importer.filetypes[default_filter]
    else:
        try:
            kwargs["filetypes"] = [importer.filetypes[protocol]]
        except KeyError:
            raise ProtocolError(protocol, list(importer.protocols.values()))
        except TypeError as e:
            print(e)

    # deprecated kwargs
    if "listdir" in kwargs and "iterdir" not in kwargs:
        kwargs["iterdir"] = kwargs.pop("listdir")
        warning_(
            "argument `listdir` is deprecated, use `iterdir` instead",
            category=DeprecationWarning,
        )

    return importer(*paths, **kwargs)
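
# Illustrative usage note (not executed): when the extension alone is ambiguous, the
# protocol can be forced explicitly, e.g. for a LABSPEC export that carries a generic
# ".txt" extension (the path below is only a placeholder):
#
#   nd = scp.read("mydir/my_labspec_export.txt", protocol="labspec")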

# For some reason _docstring.get_sections modifies the signature of the function
# when used as a decorator, so we use it as a function
_docstring.get_sections(
    read.__doc__,
    sections=["Parameters", "Other Parameters", "Returns"],
    base="Importer",
)

_docstring.delete_params("Importer.see_also", "read_dir")

@_docstring.dedent
def read_dir(directory=None, **kwargs):
    """
    Read an entire directory.

    Open a list of readable files in a directory and store data/metadata in a
    dataset or a list of datasets according to the following rules:

    * 2D spectroscopic data (e.g. valid :file:`.spg` files or matlab arrays, etc...)
      from distinct files are stored in distinct `NDDataset`\ s.
    * 1D spectroscopic data (e.g., :file:`.spa` files) in a given directory are merged
      into a single `NDDataset`\ , provided their unique dimension is compatible.
      If not, an error is generated.
    * non-readable files are ignored.

    Parameters
    ----------
    directory : `str` or `~pathlib.Path`
        Folder where the files to read are located.

    Returns
    -------
    %(Importer.returns)s
        Depending on the python version, the order of the datasets in the list
        may change.

    See Also
    --------
    %(Importer.see_also.no_read_dir)s

    Examples
    --------

    >>> scp.preferences.csv_delimiter = ','
    >>> A = scp.read_dir('irdata')
    >>> len(A)
    4

    >>> B = scp.read_dir()
    """
    kwargs["iterdir"] = True
    importer = Importer()
    return importer(directory, **kwargs)

# _docstring.delete_params("Importer.see_also", "read_remote")
# @_docstring.dedent
# def read_remote(file_or_dir, **kwargs):
#     """
#     Download and read files or an entire directory from any url.
#
#     The first usage in spectrochempy is the loading of test files in the
#     `spectrochempy_data repository <https://github.com/spectrochempy/spectrochempy_data>`__\ .
#     This is done only if the data are not yet
#     downloaded and present in the `~spectrochempy.preferences.datadir` directory.
#
#     It can also be used to download and read a file or directory from any url.
#
#     Parameters
#     ----------
#     path : `str`, `~pathlib.Path` object or an url.
#         When a file or folder is specified, it must be written as if it were present
#         locally exactly as for the `read` function. The corresponding file or directory
#         is downloaded from the github ``spectrochempy_data`` repository.
#         Otherwise it should be a full and valid url.
#     %(kwargs)s
#
#     Returns
#     -------
#     %(Importer.returns)s
#
#     Other Parameters
#     ----------------
#     %(Importer.other_parameters)s
#
#     See Also
#     --------
#     %(Importer.see_also.no_read_remote)s
#
#     Examples
#     --------
#
#     >>> A = scp.read_remote('irdata/subdir')
#     """
#     kwargs["remote"] = True
#     importer = Importer()
#     return importer(file_or_dir, **kwargs)


# ======================================================================================
# Private read functions
# ======================================================================================
@_importer_method
def _read_dir(*args, **kwargs):
    _, directory = args
    directory = get_directory_name(directory)
    files = get_filenames(directory, **kwargs)
    datasets = []
    valid_extensions = list(zip(*FILETYPES))[0] + list(zip(*ALIAS))[0]
    for key in [key for key in files.keys() if key[1:] in valid_extensions]:
        if key:
            importer = Importer()
            nd = importer(files[key], **kwargs)
            if nd is not None:
                if not isinstance(nd, list):
                    nd = [nd]
                datasets.extend(nd)
    return datasets


@_importer_method
def _read_scp(*args, **kwargs):
    dataset, filename = args
    return dataset.load(filename, **kwargs)


@_importer_method
def _read_(*args, **kwargs):
    dataset, filename = args

    if kwargs.pop("remote", False):
        return Importer._read_remote(*args, **kwargs)
    elif not filename or filename.is_dir():
        return Importer._read_dir(*args, **kwargs)
    else:
        raise FileNotFoundError

    # protocol = kwargs.get("protocol", None)
    # if protocol and ".scp" in protocol:
    #     return dataset.load(filename, **kwargs)
    #
    # elif filename and filename.name in ("fid", "ser", "1r", "2rr", "3rrr"):
    #     # probably a Topspin NMR file
    #     return dataset.read_topspin(filename, **kwargs)
    # elif filename:
    #     # try scp format
    #     try:
    #         return dataset.load(filename, **kwargs)
    #     except Exception:
    #         # lets try some common formats
    #         for key in ["omnic", "opus", "topspin", "labspec", "matlab", "jdx"]:
    #             try:
    #                 _read = getattr(dataset, f"read_{key}")
    #                 f = f"{filename}.{key}"
    #                 return _read(f, **kwargs)
    #             except Exception:
    #                 pass
    #         raise NotImplementedError


# ======================================================================================
# Private functions
# ======================================================================================
def _is_url(filename):
    return (
        isinstance(filename, str)
        and re.match(r"http[s]?:[\/]{2}", filename) is not None
    )


def _openfid(filename, mode="rb", **kwargs):
    # Return a file ID
    # Check if content has been passed?
    content = kwargs.get("content", False)

    # default encoding
    encoding = "utf-8"

    if _is_url(filename):
        # by default, we set the read_only flag to True when reading a remote url
        kwargs["read_only"] = kwargs.get("read_only", True)

        # use requests to read the remote content
        r = requests.get(filename, allow_redirects=True)
        r.raise_for_status()
        content = r.content
        encoding = r.encoding
    else:
        # Transform filename to a Path object if it is not yet the case
        filename = pathclean(filename)

    # Create the file ID
    if content:
        # if a content has been passed, then it has priority
        fid = (
            io.BytesIO(content)
            if mode == "rb"
            else io.StringIO(content.decode(encoding))
        )
    else:
        fid = open(filename, mode=mode)

    return fid, kwargs


def _write_downloaded_file(content, dst):
    if not dst.parent.exists():
        # create the eventually missing subdirectory
        dst.parent.mkdir(parents=True, exist_ok=True)
    dst.write_bytes(content)
    info_(f"{dst.name} has been downloaded and written in {dst.parent}")


def _get_url_content_and_save(url, dst, replace, read_only=False):
    if not replace and dst.exists():
        return

    try:
        r = requests.get(url, allow_redirects=True)
        r.raise_for_status()

        # write downloaded file
        if not read_only:
            _write_downloaded_file(r.content, dst)

        # in all cases return the content
        return r.content

    except OSError:
        raise FileNotFoundError(f"Not found locally or at url:{url}")


def _download_full_testdata_directory():
    from spectrochempy.core import preferences as prefs

    datadir = prefs.datadir

    url = "https://github.com/spectrochempy/spectrochempy_data/archive/refs/heads/master.zip"

    resp = requests.get(url)
    zipfile = ZipFile(io.BytesIO(resp.content))
    files = [zipfile.open(file_name) for file_name in zipfile.namelist()]

    for file in files:
        name = file.name
        if name.endswith("/") or "testdata/" not in name:  # dir
            continue
        uncompressed = zipfile.read(name)
        p = list(pathclean(name).parts)[2:]
        dst = datadir.joinpath("/".join(p))
        _write_downloaded_file(uncompressed, dst)


def _download_from_github(path, dst, replace=False):
    # download from github (always save the downloaded files)
    relative_path = str(pathclean(path).as_posix())
    path = (
        f"https://github.com/spectrochempy/spectrochempy_data/raw/master/"
        f"testdata/{relative_path}"
    )

    # first determine if it is a directory
    r = requests.get(path + "/__index__", allow_redirects=True)
    index = None
    if r.status_code == 200:
        index = yaml.load(r.content, Loader=yaml.CLoader)

    if index is None:
        return _get_url_content_and_save(path, dst, replace)
    else:
        # download folder
        for filename in index["files"]:
            _get_url_content_and_save(f"{path}/{filename}", dst / filename, replace)

        for folder in index["folders"]:
            _download_from_github(f"{relative_path}/{folder}", dst / folder)


def _is_relative_to(path, base):
    # try to emulate the pathlib is_relative_to method which does not work on python
    # 3.7 (needed for Colab!)
    # TODO: replace as Colab is updated to 3.9
    pparts = path.parts
    bparts = base.parts
    if bparts[-1] in pparts:
        idx = pparts.index(bparts[-1])
        pparts_base = pparts[: idx + 1]
        return pparts_base == bparts
    return False


def _relative_to(path, base):
    pparts = path.parts
    bparts = base.parts
    if bparts[-1] in pparts:
        idx = pparts.index(bparts[-1])
        return pathclean("/".join(pparts[idx + 1 :]))
    raise ValueError(
        f"'{path}' is not in the subpath of '{base}' OR one path is "
        f"relative and the other absolute."
    )


@_importer_method
def _read_remote(*args, **kwargs):
    from spectrochempy.core import preferences as prefs

    datadir = prefs.datadir

    dataset, path = args
    kwargs["merge"] = kwargs.get("merge", False)  # by default, no attempt to merge
    read_method = kwargs.pop("read_method", read)
    download_only = kwargs.pop("download_only", False)
    replace = kwargs.pop(
        "replace_existing", False
    )  # by default, existing downloaded files are not replaced

    # we try to download the github testdata
    path = pathclean(path)

    # we need to download additional files for topspin
    topspin = True if "topspin" in read_method.__name__ else False

    # we have to treat a special case: topspin, where the parent directory needs
    # to be downloaded with the required file
    if topspin:
        savedpath = path
        m = re.match(r"(.*)(\/pdata\/\d+\/\d+[r|i]{1,2}|ser|fid)", str(path))
        if m is not None:
            path = pathclean(m[1])

    if _is_relative_to(path, datadir):
        # try to make it relative for remote downloading on github
        relative_path = _relative_to(path, datadir)
    else:
        # assume it is already relative
        relative_path = path

    # Try to download it
    dst = datadir / relative_path
    if dst.name == "testdata":
        # we are going to download the whole testdata directory
        # -> use a faster method
        _download_full_testdata_directory()
        return
    else:
        content = _download_from_github(relative_path, dst, replace)

    if not download_only:
        if content is None:
            if topspin:
                return read_method(
                    dataset, dst / _relative_to(savedpath, dst), **kwargs
                )
            else:
                return read_method(dataset, dst, **kwargs)
        else:
            return read_method(dataset, dst, content=content, **kwargs)
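
# Illustrative sketch (not executed) of the `content` mechanism described in the `read`
# docstring: raw bytes can be passed through a dictionary whose key acts as a virtual
# filename (its extension selects the protocol). The path used here is only a placeholder:
#
#   with open("irdata/OPUS/test.0000", "rb") as f:
#       content = f.read()
#   nd = scp.read({"test.0000": content})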