# Source code for spectrochempy.core.readers.download

# -*- coding: utf-8 -*-
# ======================================================================================
# Copyright (©) 2015-2023 LCS - Laboratoire Catalyse et Spectrochimie, Caen, France.
# CeCILL-B FREE SOFTWARE LICENSE AGREEMENT
# See full LICENSE agreement in the root directory.
# ======================================================================================
"""
This module provides functions to download external datasets
from public databases.
"""
__all__ = ["load_iris", "download_nist_ir"]
__dataset_methods__ = __all__

from pathlib import Path

import requests

from spectrochempy.application import error_, info_
from spectrochempy.core.dataset.coord import Coord
from spectrochempy.core.dataset.nddataset import NDDataset
from spectrochempy.core.readers.read_jcamp import read_jcamp
from spectrochempy.utils.misc import is_iterable


def load_iris():
    """
    Load the classical "iris" dataset.

    The "IRIS" dataset is a classical example for machine learning.
    It is obtained from the `scikit-learn` package and wrapped in a
    `NDDataset` whose coordinates carry the sample and feature labels.

    Returns
    -------
    dataset
        The `IRIS` dataset.

    See Also
    --------
    read : Read data from experimental data.
    """
    # Imported lazily so that scikit-learn is only needed when this
    # function is actually called.
    from sklearn.datasets import load_iris as sklearn_iris

    iris = sklearn_iris()

    # One label per sample: the target name matching each target entry.
    sample_labels = [iris.target_names[target] for target in iris.target]

    # NOTE(review): "sepal width" uses a space whereas the other feature
    # labels use an underscore; kept unchanged for backward compatibility.
    feature_labels = ["sepal_length", "sepal width", "petal_length", "petal_width"]

    samples = Coord(labels=sample_labels, title="samples")
    features = Coord(labels=feature_labels, title="features")

    dataset = NDDataset(
        iris.data,
        coordset=[samples, features],
        title="size",
        name="`IRIS` Dataset",
        units="cm",
    )
    dataset.history = "Loaded from scikit-learn datasets"
    return dataset
def _nist_ir_url(CAS, i):
    """Return the NIST WebBook URL of the JCAMP IR spectrum `i` for `CAS`."""
    # sample address (water, spectrum 1)
    # https://webbook.nist.gov/cgi/cbook.cgi?JCAMP=C7732185&Index=1&Type=IR
    return f"https://webbook.nist.gov/cgi/cbook.cgi?JCAMP=C{CAS}&Index={i}&Type=IR"


def download_nist_ir(CAS, index="all"):
    """
    Download IR spectra from the NIST webbook.

    Parameters
    ----------
    CAS : int or str
        The CAS number, can be given as "XXXX-XX-X" (str), "XXXXXXX" (str),
        XXXXXXX (int).
    index : str or int or iterable of int
        If set to 'all' (default), import all available spectra for the
        compound corresponding to the CAS number. Otherwise, the index of a
        single spectrum or the indexes of the selected spectra.

    Returns
    -------
    list of NDDataset or NDDataset or None
        The dataset(s). A single `NDDataset` is returned when exactly one
        spectrum has been read, a list otherwise. `None` is returned when
        no spectrum is found with ``index='all'``, when the server cannot be
        reached while downloading a spectrum, or when the last requested
        index does not exist and nothing has been read before it.

    Raises
    ------
    ValueError
        If `index` is neither 'all', an int nor an iterable.
    OSError
        If the NIST server cannot be reached while looking for the available
        spectra, or if a downloaded JCAMP file cannot be read.

    See Also
    --------
    read : Read data from experimental data.
    """
    # Local import: only this function needs it.
    import tempfile

    if isinstance(CAS, str) and "-" in CAS:
        CAS = CAS.replace("-", "")

    if isinstance(index, str) and index == "all":
        # Probe successive indexes until the server reports a missing spectrum.
        index = []
        i = 0
        while True:
            try:
                response = requests.get(_nist_ir_url(CAS, i), timeout=10)
                not_found = b"Spectrum not found" in response.content[:30]
            except OSError as exc:
                raise OSError("Cannot connect to the NIST server... ") from exc
            if not_found:
                break
            index.append(i)
            i += 1

        if not index:
            error_(IOError, "NIST IR: no spectrum found")
            return None
        if len(index) == 1:
            info_("NIST IR: 1 spectrum found")
        else:
            info_(f"NIST IR: {len(index)} spectra found")

    elif isinstance(index, int):
        index = [index]

    elif not is_iterable(index):
        raise ValueError("index must be 'all', int or iterable of int")

    else:
        # Materialize so that `index[-1]` below also works for iterables
        # that do not support indexing (sets, generators, ...).
        index = list(index)

    out = []
    for i in index:
        url = _nist_ir_url(CAS, i)
        try:
            response = requests.get(url, stream=True, timeout=10)
            # Reading `.content` may still hit the network with stream=True,
            # so it stays inside the `try`.
            content = response.content
        except OSError:
            error_("OSError: Cannot connect... ")
            return None

        if b"Spectrum not found" in content[:30]:
            error_(IOError, f"NIST IR: Spectrum {i} does not exist... please check !")
            if i == index[-1] and not out:
                return None
            # Stop at the first missing spectrum, keep what was already read.
            break

        # Decode the whole payload at once: decoding chunk by chunk breaks on
        # multi-byte UTF-8 characters split across chunks.
        txtdata = content.decode("utf8")

        # `read_jcamp` needs a file. Use a private temporary directory so that
        # no `temp.jdx` in the working directory is overwritten, and so that
        # the file is removed even when reading fails.
        with tempfile.TemporaryDirectory() as tmpdir:
            jdx_path = Path(tmpdir) / "temp.jdx"
            with open(jdx_path, "w") as f:
                f.write(txtdata)
            try:
                ds = read_jcamp(str(jdx_path))
                # replace the default entry ":imported from jdx file":
                ds.history[0] = f"Downloaded from NIST: {url}"
            except Exception as exc:
                raise IOError(
                    "Can't read this JCAMP file: please report the issue to Spectrochempy developpers"
                ) from exc
        out.append(ds)

    if len(out) == 1:
        return out[0]
    return out