Source code for nnbma.dataset.regression_dataset

import os
import pickle
from typing import (
    Callable,
    Dict,
    List,
    Literal,
    Optional,
    Sequence,
    Tuple,
    Union,
    overload,
)

import numpy as np
import pandas as pd
from torch import Tensor, from_numpy
from torch.utils.data import Dataset

# Names exported by ``from <this module> import *``.
__all__ = ["RegressionDataset", "RegressionSubset"]


class RegressionDataset(Dataset):
    """Dataset dedicated to regression."""

    def __init__(
        self,
        x: np.ndarray,
        y: np.ndarray,
        inputs_names: Optional[List[str]] = None,
        outputs_names: Optional[List[str]] = None,
    ):
        r"""
        Parameters
        ----------
        x : numpy.ndarray
            Array containing the input features of the regression model.
            Its shape is considered to be :math:`N \times I` where :math:`N`
            is the number of entries and :math:`I` the number of input
            features.
        y : numpy.ndarray
            Array containing the output features of the regression model.
            Its shape is considered to be :math:`N \times O` where :math:`N`
            is the number of entries and :math:`O` the number of output
            features.
        inputs_names : Optional[List[str]], optional
            List of the names of the input features, by default None.
        outputs_names : Optional[List[str]], optional
            List of the names of the output features, by default None.

        Raises
        ------
        ValueError
            ``x`` and ``y`` must have the same number of rows :math:`N`.
        ValueError
            ``x`` and ``inputs_names`` must have the same number of features
            :math:`I`.
        ValueError
            ``y`` and ``outputs_names`` must have the same number of features
            :math:`O`.
        """
        super().__init__()

        if x.shape[0] != y.shape[0]:
            raise ValueError("x and y must have the same number of rows.")

        # Values are always stored as float32 tensors.
        self._x: Tensor = from_numpy(x).float()
        self._y: Tensor = from_numpy(y).float()

        if inputs_names is not None and len(inputs_names) != x.shape[1]:
            raise ValueError("x and inputs_names must have the same number of features")
        if outputs_names is not None and len(outputs_names) != y.shape[1]:
            raise ValueError(
                "y and outputs_names must have the same number of features"
            )

        self._inputs_names: Optional[List[str]] = inputs_names
        self._outputs_names: Optional[List[str]] = outputs_names

    def __len__(self) -> int:
        """
        Returns the number of entries :math:`N` in the dataset.

        Returns
        -------
        int
            Number of entries :math:`N`.
        """
        return self._x.size(0)

    def __getitem__(self, idx) -> Tuple[Tensor, Tensor]:
        """
        Returns the entries corresponding to the index set ``idx``.

        Parameters
        ----------
        idx : Any
            Indices of entries to return.

        Returns
        -------
        tuple of torch.Tensor
            Input and output entries.
        """
        return self._x[idx], self._y[idx]

    @property
    def x(self) -> Tensor:
        """
        Input tensor.
        """
        return self._x

    @property
    def y(self) -> Tensor:
        """
        Output tensor.
        """
        return self._y

    @property
    def n_inputs(self) -> int:
        """
        Number of input features :math:`I`.
        """
        return self.x.size(1)

    @property
    def n_outputs(self) -> int:
        """
        Number of output features :math:`O`.
        """
        return self.y.size(1)

    @property
    def inputs_names(self) -> Optional[List[str]]:
        """
        List of the names of the input features (``None`` if unnamed).
        """
        return self._inputs_names

    @property
    def outputs_names(self) -> Optional[List[str]]:
        """
        List of the names of the output features (``None`` if unnamed).
        """
        return self._outputs_names

    def inputs_size(self) -> int:
        """
        Returns the number of floating point values in ``x``.
        """
        return self.x.numel()

    def outputs_size(self) -> int:
        """
        Returns the number of floating point values in ``y``.
        """
        return self.y.numel()

    def has_nan(self) -> Tuple[bool, bool]:
        """
        Returns a tuple of two boolean. The first one is ``True`` if the
        input features contain at least one NaN, else ``False``. The second
        one is ``True`` if the output features contain at least one NaN,
        else ``False``.

        Returns
        -------
        tuple of bool
            Evaluate the presence of NaN in the input and output sets.
        """
        return self.x.isnan().any().item(), self.y.isnan().any().item()

    def has_nonfinite(self) -> Tuple[bool, bool]:
        """
        Returns a tuple of two boolean. The first one is ``True`` if the
        input features contain at least one non finite value (including
        NaNs), else ``False``. The second one is ``True`` if the output
        features contain at least one non finite value (including NaNs),
        else ``False``.

        Returns
        -------
        tuple of bool
            Evaluate the presence of non finite values in the input and
            output sets.
        """
        x_all_finite = self.x.isfinite().all().item()
        y_all_finite = self.y.isfinite().all().item()
        return not x_all_finite, not y_all_finite

    def apply_transf(
        self,
        x_op: Optional[Callable[[np.ndarray], np.ndarray]],
        y_op: Optional[Callable[[np.ndarray], np.ndarray]],
    ) -> "RegressionDataset":
        """
        Apply an operator to ``x`` and ``y``. A new dataset is returned so
        the operators should not use in-place operations.

        Parameters
        ----------
        x_op : Optional[Callable[[numpy.ndarray], numpy.ndarray]]
            Operator to apply on the input features. ``None`` leaves the
            input features unchanged.
        y_op : Optional[Callable[[numpy.ndarray], numpy.ndarray]]
            Operator to apply on the output features. ``None`` leaves the
            output features unchanged.

        Returns
        -------
        RegressionDataset
            New dataset with transformed values.
        """
        x = self.x.numpy()
        y = self.y.numpy()
        return type(self)(
            x if x_op is None else x_op(x),
            y if y_op is None else y_op(y),
            self._inputs_names,
            self._outputs_names,
        )

    @overload
    def getall(self, numpy: Literal[True]) -> Tuple[np.ndarray, np.ndarray]: ...

    @overload
    def getall(self, numpy: Literal[False] = ...) -> Tuple[Tensor, Tensor]: ...

    def getall(
        self, numpy: bool = False
    ) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[Tensor, Tensor]]:
        """
        Returns all the dataset in numpy.ndarray or torch.Tensor depending
        on the value of the ``numpy`` parameter.

        Parameters
        ----------
        numpy : bool, optional
            If ``numpy==True``, the returned object will be numpy arrays.
            Else, they will be torch tensors.

        Returns
        -------
        tuple of torch.Tensor or numpy.ndarray
            Inputs and outputs sets.
        """
        # Go through the ``x``/``y`` properties rather than indexing ``self``
        # with a list of integers, so that subclasses overriding
        # ``__getitem__`` are supported. ``clone`` keeps the returned data
        # independent from the tensors stored in the dataset.
        x, y = self.x.clone(), self.y.clone()
        if numpy:
            return x.numpy(), y.numpy()
        return x, y

    @staticmethod
    def from_pandas(df_x: pd.DataFrame, df_y: pd.DataFrame) -> "RegressionDataset":
        r"""Converts two pandas DataFrames to a RegressionDataset object.

        Parameters
        ----------
        df_x : pd.DataFrame
            DataFrame of the inputs. This DataFrame should contain :math:`N`
            rows, i.e., number of entries, and :math:`I` columns, i.e.,
            features.
        df_y : pd.DataFrame
            DataFrame of the outputs. This DataFrame should contain :math:`N`
            rows, i.e., number of entries, and :math:`O` columns, i.e.,
            features.

        Returns
        -------
        RegressionDataset
            Associated RegressionDataset object. The ``x`` attribute is set
            to values in the ``df_x`` DataFrame, and the ``inputs_names``
            attribute to its column names. The ``y`` attribute is set to
            values in the ``df_y`` DataFrame, and the ``outputs_names``
            attribute to its column names.
        """
        return RegressionDataset(
            df_x.values,
            df_y.values,
            df_x.columns.to_list(),
            df_y.columns.to_list(),
        )

    def to_pandas(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Converts the dataset to two pandas DataFrames.

        Returns
        -------
        Tuple[pd.DataFrame, pd.DataFrame]
            DataFrames of the input ``x`` and output ``y``, respectively.
            The columns are named with ``inputs_names`` and
            ``outputs_names``, respectively, if they are not ``None``.
        """
        # Hand pandas plain numpy arrays rather than torch tensors.
        df_x = pd.DataFrame(self.x.numpy(), columns=self._inputs_names)
        df_y = pd.DataFrame(self.y.numpy(), columns=self._outputs_names)
        return df_x, df_y

    @staticmethod
    def _shared_names(
        first: Optional[Sequence[str]], second: Optional[Sequence[str]]
    ) -> Optional[List[str]]:
        """Returns a copy of the names if both lists are identical, else ``None``."""
        if first is None or second is None:
            return None
        first, second = list(first), list(second)
        return first if first == second else None

    def join(
        self: "RegressionDataset", other: "RegressionDataset"
    ) -> "RegressionDataset":
        """
        Returns the union of two datasets. Data are copied.

        Parameters
        ----------
        other : RegressionDataset
            Other dataset to join with.

        Returns
        -------
        RegressionDataset
            New dataset constructed as the union of the two datasets. The
            feature names are kept when both datasets have the same ones,
            otherwise the new dataset is unnamed.
        """
        x1, y1 = self.getall(numpy=True)
        x2, y2 = other.getall(numpy=True)

        x = np.concatenate((x1, x2), axis=0)
        y = np.concatenate((y1, y2), axis=0)

        return RegressionDataset(
            x,
            y,
            self._shared_names(self.inputs_names, other.inputs_names),
            self._shared_names(self.outputs_names, other.outputs_names),
        )

    def substract(
        self: "RegressionDataset", other: "RegressionSubset"
    ) -> "RegressionSubset":
        """
        Returns the subtraction of two datasets.

        Parameters
        ----------
        other : RegressionSubset
            Subset of ``self``.

        Returns
        -------
        RegressionSubset
            New subset of ``self`` containing all values that were not in
            ``other``.

        Raises
        ------
        ValueError
            ``other`` is not a subset of ``self``.
        """
        if not other.issubsetof(self):
            raise ValueError(
                "set2 is not a subset of set1 so it cannot be substracted."
            )

        # A set gives constant-time membership tests; ``int`` normalises
        # numpy/torch integer scalars so they compare equal to ``range`` items.
        excluded = {int(i) for i in other._indices}
        new_indices = [i for i in range(len(self)) if i not in excluded]
        return RegressionSubset(self, new_indices)

    def stats(self) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
        """Provides a few statistics on the dataset (the mean, the standard
        deviation, the min and the max for each column).

        Returns
        -------
        Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]
            Tuple of dictionaries, each containing the mean, the standard
            deviation, the min and the max for each column. The first
            dictionary corresponds to the input x and the second to the
            output y.
        """

        def _summary(values: Tensor) -> Dict[str, np.ndarray]:
            return {
                "mean": values.mean(dim=0).numpy(),
                "std": values.std(dim=0).numpy(),
                # With ``dim`` given, ``min``/``max`` return a
                # ``(values, indices)`` pair: only the values are wanted.
                "min": values.min(dim=0).values.numpy(),
                "max": values.max(dim=0).values.numpy(),
            }

        return _summary(self.x), _summary(self.y)

    def save(self, filename: str, path: Optional[str] = None) -> None:
        """Saves the dataset to a pickle file.

        Any extension in ``filename`` is replaced by ``.pkl``.

        Parameters
        ----------
        filename : str
            Name of the file to be created.
        path : Optional[str], optional
            Path to the file to be created, by default None.
        """
        if path is not None:
            filename = os.path.join(path, filename)
        filename = os.path.splitext(filename)[0]

        with open(f"{filename}.pkl", "wb") as file:
            pickle.dump(self, file)

    @staticmethod
    def load(filename: str, path: Optional[str] = None) -> "RegressionDataset":
        """Loads a regression dataset from a pickle file.

        Any extension in ``filename`` is replaced by ``.pkl``.

        Parameters
        ----------
        filename : str
            Name of the file to be read.
        path : Optional[str], optional
            Path to the file to be read, by default None.

        Returns
        -------
        RegressionDataset
            Loaded regression dataset.
        """
        if path is not None:
            filename = os.path.join(path, filename)
        filename = os.path.splitext(filename)[0]

        # NOTE(security): unpickling can execute arbitrary code. Only load
        # files coming from a trusted source.
        with open(f"{filename}.pkl", "rb") as file:
            dataset = pickle.load(file)
        return dataset
class RegressionSubset(RegressionDataset):
    """Subset of RegressionDataset."""

    def __init__(self, dataset: RegressionDataset, indices: Sequence[int]):
        """
        Parameters
        ----------
        dataset : RegressionDataset
            Dataset from which entries are extracted.
        indices : Sequence of int
            Indices of entries to extract.
        """
        # ``RegressionDataset.__init__`` is deliberately not called: the
        # subset stores no tensors of its own and reads them from ``dataset``.
        self._dataset: RegressionDataset = dataset
        self._indices: Sequence[int] = indices

        self._inputs_names = dataset.inputs_names
        self._outputs_names = dataset.outputs_names

    def __len__(self) -> int:
        """
        Returns the number of entries in the dataset.

        Returns
        -------
        int
            Number of entries.
        """
        return len(self._indices)

    def __getitem__(self, idx) -> Tuple[Tensor, Tensor]:
        """
        Returns the entries of indice(s) idx.

        Parameters
        ----------
        idx : Any
            Indices of entries to return.

        Returns
        -------
        tuple of torch.Tensor
            Input and output entries.
        """
        try:
            selected = self._indices[idx]
        except TypeError:
            # Plain Python sequences only accept integers and slices:
            # resolve list/array-like ``idx`` element by element.
            selected = [self._indices[i] for i in idx]
        return self._dataset[selected]

    @property
    def x(self) -> Tensor:
        """
        Input tensor.
        """
        # Use the public property so that the parent dataset may itself be
        # a subset (which has no ``_x`` attribute).
        return self._dataset.x[self._indices]

    @property
    def y(self) -> Tensor:
        """
        Output tensor.
        """
        return self._dataset.y[self._indices]

    def apply_transf(
        self,
        x_op: Optional[Callable[[np.ndarray], np.ndarray]],
        y_op: Optional[Callable[[np.ndarray], np.ndarray]],
    ) -> RegressionDataset:
        """
        Apply an operator to ``x`` and ``y``. A new dataset is returned so
        the operators should not use in-place operations.

        The result is a plain ``RegressionDataset`` holding only the
        transformed entries of the subset: a ``RegressionSubset`` cannot be
        built directly from arrays.

        Parameters
        ----------
        x_op : Optional[Callable[[numpy.ndarray], numpy.ndarray]]
            Operator to apply on the input features. ``None`` leaves the
            input features unchanged.
        y_op : Optional[Callable[[numpy.ndarray], numpy.ndarray]]
            Operator to apply on the output features. ``None`` leaves the
            output features unchanged.

        Returns
        -------
        RegressionDataset
            New dataset with transformed values.
        """
        x = self.x.numpy()
        y = self.y.numpy()
        return RegressionDataset(
            x if x_op is None else x_op(x),
            y if y_op is None else y_op(y),
            self._inputs_names,
            self._outputs_names,
        )

    def issubsetof(self, dataset: RegressionDataset) -> bool:
        """
        Returns ``True`` if ``self`` is a subset of ``dataset``.

        Only the dataset this subset was directly built from is recognised.

        Parameters
        ----------
        dataset : RegressionDataset
            Dataset of which we want to know if ``self`` is a subset.

        Returns
        -------
        bool
            ``True`` if ``self`` is a subset of ``dataset`` else ``False``.
        """
        # Maybe a better solution is suitable
        return dataset is self._dataset