import os
import pickle
from typing import (
Callable,
Dict,
List,
Literal,
Optional,
Sequence,
Tuple,
Union,
overload,
)
import numpy as np
import pandas as pd
from torch import Tensor, from_numpy
from torch.utils.data import Dataset
__all__ = ["RegressionDataset", "RegressionSubset"]
class RegressionDataset(Dataset):
    """Dataset dedicated to regression.

    The input and output features are stored as two ``float32`` tensors
    (``x`` and ``y``) sharing the same number of rows, together with
    optional lists of feature names.
    """

    def __init__(
        self,
        x: np.ndarray,
        y: np.ndarray,
        inputs_names: Optional[List[str]] = None,
        outputs_names: Optional[List[str]] = None,
    ):
        r"""
        Parameters
        ----------
        x : numpy.ndarray
            Array containing the input features of the regression model.
            Its shape is considered to be :math:`N \times I` where :math:`N`
            is the number of entries and :math:`I` the number of input
            features.
        y : numpy.ndarray
            Array containing the output features of the regression model.
            Its shape is considered to be :math:`N \times O` where :math:`N`
            is the number of entries and :math:`O` the number of output
            features.
        inputs_names : Optional[List[str]], optional
            List of the names of the input features, by default None.
        outputs_names : Optional[List[str]], optional
            List of the names of the output features, by default None.

        Raises
        ------
        ValueError
            ``x`` and ``y`` must have the same number of rows :math:`N`.
        ValueError
            ``x`` and ``inputs_names`` must have the same number of features
            :math:`I`.
        ValueError
            ``y`` and ``outputs_names`` must have the same number of features
            :math:`O`.
        """
        super().__init__()

        if x.shape[0] != y.shape[0]:
            raise ValueError("x and y must have the same number of rows.")

        # ``.float()`` casts to float32; when the array is already float32 the
        # tensor keeps sharing memory with the caller's array.
        self._x: Tensor = from_numpy(x).float()
        self._y: Tensor = from_numpy(y).float()

        if inputs_names is not None and len(inputs_names) != x.shape[1]:
            raise ValueError("x and inputs_names must have the same number of features")
        if outputs_names is not None and len(outputs_names) != y.shape[1]:
            raise ValueError(
                "y and outputs_names must have the same number of features"
            )

        self._inputs_names: Optional[List[str]] = inputs_names
        self._outputs_names: Optional[List[str]] = outputs_names

    def __len__(self) -> int:
        """
        Returns the number of entries :math:`N` in the dataset.

        Returns
        -------
        int
            Number of entries :math:`N`.
        """
        return self._x.size(0)

    def __getitem__(self, idx) -> Tuple[Tensor, Tensor]:
        """
        Returns the entries corresponding to the index set ``idx``.

        Parameters
        ----------
        idx : Any
            Indices of entries to return. It is forwarded unchanged to the
            indexing operator of the underlying tensors.

        Returns
        -------
        tuple of torch.Tensor
            Input and output entries.
        """
        return self._x[idx], self._y[idx]

    @property
    def x(self) -> Tensor:
        """
        Input tensor.
        """
        return self._x

    @property
    def y(self) -> Tensor:
        """
        Output tensor.
        """
        return self._y

    @property
    def n_inputs(self) -> int:
        """
        Number of input features :math:`I`.
        """
        return self.x.size(1)

    @property
    def n_outputs(self) -> int:
        """
        Number of output features :math:`O`.
        """
        return self.y.size(1)

    @property
    def inputs_names(self) -> Optional[List[str]]:
        """
        List of the names of the input features, or ``None`` if not set.
        """
        return self._inputs_names

    @property
    def outputs_names(self) -> Optional[List[str]]:
        """
        List of the names of the output features, or ``None`` if not set.
        """
        return self._outputs_names

    def outputs_size(self) -> int:
        """
        Returns the number of floating point values in ``y``.
        """
        return self.y.numel()

    def has_nan(self) -> Tuple[bool, bool]:
        """
        Returns a tuple of two boolean.

        The first one is ``True`` if the input features contain at least one
        NaN, else ``False``.
        The second one is ``True`` if the output features contain at least
        one NaN, else ``False``.

        Returns
        -------
        tuple of bool
            Evaluate the presence of NaN in the input and output sets.
        """
        return self.x.isnan().any().item(), self.y.isnan().any().item()

    def has_nonfinite(self) -> Tuple[bool, bool]:
        """
        Returns a tuple of two boolean.

        The first one is ``True`` if the input features contain at least one
        non finite value (including NaNs), else ``False``.
        The second one is ``True`` if the output features contain at least
        one non finite value (including NaNs), else ``False``.

        Returns
        -------
        tuple of bool
            Evaluate the presence of non finite values in the input and
            output sets.
        """
        return not self.x.isfinite().all().item(), not self.y.isfinite().all().item()

    def apply_transf(
        self,
        x_op: Optional[Callable[[np.ndarray], np.ndarray]],
        y_op: Optional[Callable[[np.ndarray], np.ndarray]],
    ) -> "RegressionDataset":
        """
        Apply an operator to ``x`` and ``y``.

        A new dataset is returned so the operators should not use in-place
        operations: the arrays they receive share memory with the tensors of
        this dataset.

        Parameters
        ----------
        x_op : Optional[Callable[[numpy.ndarray], numpy.ndarray]]
            Operator to apply on the input features. ``None`` means identity.
        y_op : Optional[Callable[[numpy.ndarray], numpy.ndarray]]
            Operator to apply on the output features. ``None`` means identity.

        Returns
        -------
        RegressionDataset
            New dataset with transformed values and the same feature names.
        """
        x_arr = self.x.numpy()
        y_arr = self.y.numpy()
        new_x = x_arr if x_op is None else x_op(x_arr)
        new_y = y_arr if y_op is None else y_op(y_arr)
        return type(self)(
            new_x,
            new_y,
            self._inputs_names,
            self._outputs_names,
        )

    @overload
    def getall(self, numpy: Literal[True]) -> Tuple[np.ndarray, np.ndarray]:
        ...

    @overload
    def getall(self, numpy: Literal[False] = ...) -> Tuple[Tensor, Tensor]:
        ...

    def getall(self, numpy: bool = False) -> Tuple[Union[np.ndarray, Tensor], ...]:
        """
        Returns all the dataset in numpy.ndarray or torch.Tensor depending on
        the value of the ``numpy`` parameter. The returned data are copies.

        Parameters
        ----------
        numpy : bool, optional
            If ``numpy==True``, the returned object will be numpy arrays.
            Else, they will be torch tensors.

        Returns
        -------
        tuple of torch.Tensor or numpy.ndarray
            Inputs and outputs sets.
        """
        # Cloning keeps the "returns a copy" behaviour without building a
        # Python list of every row index.
        x, y = self.x.clone(), self.y.clone()
        if numpy:
            return x.numpy(), y.numpy()
        return x, y

    @staticmethod
    def from_pandas(df_x: pd.DataFrame, df_y: pd.DataFrame) -> "RegressionDataset":
        r"""Converts two pandas DataFrames to a RegressionDataset object.

        Parameters
        ----------
        df_x : pd.DataFrame
            DataFrame of the inputs. This DataFrame should contain :math:`N`
            rows, i.e., number of entries, and :math:`I` columns, i.e.,
            features.
        df_y : pd.DataFrame
            DataFrame of the outputs. This DataFrame should contain :math:`N`
            rows, i.e., number of entries, and :math:`O` columns, i.e.,
            features.

        Returns
        -------
        RegressionDataset
            Associated RegressionDataset object.
            The ``x`` attribute is set to values in the ``df_x`` DataFrame,
            and the ``inputs_names`` attribute to its column names.
            The ``y`` attribute is set to values in the ``df_y`` DataFrame,
            and the ``outputs_names`` attribute to its column names.
        """
        return RegressionDataset(
            df_x.values,
            df_y.values,
            df_x.columns.to_list(),
            df_y.columns.to_list(),
        )

    def to_pandas(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Converts the dataset to two pandas DataFrames.

        Returns
        -------
        Tuple[pd.DataFrame, pd.DataFrame]
            DataFrames of the input ``x`` and output ``y``, respectively.
            The columns are named with ``inputs_names`` and
            ``outputs_names``, respectively, if they are not ``None``.
        """
        # Hand pandas plain (copied) NumPy arrays instead of torch tensors so
        # the result does not depend on how pandas treats tensor objects.
        x_arr, y_arr = self.getall(numpy=True)
        return (
            pd.DataFrame(x_arr, columns=self._inputs_names),
            pd.DataFrame(y_arr, columns=self._outputs_names),
        )

    def join(
        self: "RegressionDataset", other: "RegressionDataset"
    ) -> "RegressionDataset":
        """
        Returns the union of two datasets. Data are copied.

        Parameters
        ----------
        other : RegressionDataset
            Other dataset to join with.

        Returns
        -------
        RegressionDataset
            New dataset constructed as the union of the two datasets. Feature
            names are kept when both datasets carry identical names,
            otherwise they are set to ``None``.
        """
        x1, y1 = self.getall(numpy=True)
        x2, y2 = other.getall(numpy=True)
        x = np.concatenate((x1, x2), axis=0)
        y = np.concatenate((y1, y2), axis=0)

        # Only propagate names that are unambiguous for the joined data.
        inputs_names = (
            self._inputs_names if self._inputs_names == other.inputs_names else None
        )
        outputs_names = (
            self._outputs_names
            if self._outputs_names == other.outputs_names
            else None
        )
        return RegressionDataset(x, y, inputs_names, outputs_names)

    def substract(
        self: "RegressionDataset", other: "RegressionSubset"
    ) -> "RegressionSubset":
        """
        Returns the subtraction of two datasets.

        Parameters
        ----------
        other : RegressionSubset
            Subset of ``self``.

        Returns
        -------
        RegressionSubset
            New subset of ``self`` containing all values that were not in
            ``other``.

        Raises
        ------
        ValueError
            If ``other`` is not a subset of ``self``.
        """
        if not other.issubsetof(self):
            raise ValueError(
                "set2 is not a subset of set1 so it cannot be substracted."
            )

        n_entries = len(self)
        # Build the exclusion set once for O(1) membership tests; negative
        # indices are mapped to the row they actually designate.
        excluded = set()
        for raw in other._indices:
            pos = int(raw)
            excluded.add(pos + n_entries if pos < 0 else pos)

        new_indices = [i for i in range(n_entries) if i not in excluded]
        return RegressionSubset(self, new_indices)

    @staticmethod
    def _column_stats(values: Tensor) -> Dict[str, np.ndarray]:
        """Returns the per-column mean, std, min and max of a 2D tensor."""
        return {
            "mean": values.mean(dim=0).numpy(),
            "std": values.std(dim=0).numpy(),
            # With ``dim`` given, ``min``/``max`` return (values, indices).
            "min": values.min(dim=0).values.numpy(),
            "max": values.max(dim=0).values.numpy(),
        }

    def stats(self) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
        """Provides a few statistics on the dataset (the mean, the standard
        deviation, the min and the max for each column).

        Returns
        -------
        Tuple[ Dict[str, np.ndarray], Dict[str, np.ndarray] ]
            Tuple of dictionaries, each containing the mean, the standard
            deviation, the min and the max for each column. The first
            dictionary corresponds to the input x and the second to the
            output y.
        """
        return self._column_stats(self.x), self._column_stats(self.y)

    def save(self, filename: str, path: Optional[str] = None) -> None:
        """Saves the dataset to a pickle file.

        Any extension present in ``filename`` is replaced by ``.pkl``.

        Parameters
        ----------
        filename : str
            Name of the file to be created.
        path : Optional[str], optional
            Path to the file to be created, by default None.
        """
        if path is not None:
            filename = os.path.join(path, filename)

        filename = os.path.splitext(filename)[0]

        with open(f"{filename}.pkl", "wb") as file:
            pickle.dump(self, file)

    @staticmethod
    def load(filename: str, path: Optional[str] = None) -> "RegressionDataset":
        """Loads a regression dataset from a pickle file.

        Any extension present in ``filename`` is replaced by ``.pkl``.

        .. warning::
            Unpickling can execute arbitrary code. Only load files that come
            from a trusted source.

        Parameters
        ----------
        filename : str
            Name of the file to be read.
        path : Optional[str], optional
            Path to the file to be read, by default None.

        Returns
        -------
        RegressionDataset
            Loaded regression dataset.
        """
        if path is not None:
            filename = os.path.join(path, filename)

        filename = os.path.splitext(filename)[0]

        # NOTE(security): pickle.load must not be used on untrusted files.
        with open(f"{filename}.pkl", "rb") as file:
            dataset = pickle.load(file)

        return dataset
class RegressionSubset(RegressionDataset):
    """Subset of RegressionDataset.

    The subset does not copy data: it keeps a reference to the parent dataset
    and the positions of the selected entries.
    """

    def __init__(self, dataset: RegressionDataset, indices: Sequence[int]):
        """
        Parameters
        ----------
        dataset : RegressionDataset
            Dataset from which entries are extracted.
        indices : Sequence of int
            Indices of entries to extract.
        """
        # The parent initializer is intentionally not called: it expects the
        # raw ``x``/``y`` arrays, whereas a subset only references a dataset.
        self._dataset: RegressionDataset = dataset
        # Normalize to an integer array so that any sequence type (list,
        # tuple, range, array) is indexed and used as an index the same way.
        self._indices: np.ndarray = np.asarray(indices, dtype=np.int64)
        self._inputs_names = dataset.inputs_names
        self._outputs_names = dataset.outputs_names

    def __len__(self) -> int:
        """
        Returns the number of entries in the subset.

        Returns
        -------
        int
            Number of entries.
        """
        return len(self._indices)

    def __getitem__(self, idx) -> Tuple[Tensor, Tensor]:
        """
        Returns the entries of indice(s) ``idx``.

        Parameters
        ----------
        idx : Any
            Indices, relative to the subset, of entries to return.

        Returns
        -------
        tuple of torch.Tensor
            Input and output entries.
        """
        # Translate subset-relative positions into parent positions.
        return self._dataset[self._indices[idx]]

    @property
    def x(self) -> Tensor:
        """
        Input tensor.
        """
        # Use the public accessor so the parent may itself be a subset.
        return self._dataset.x[self._indices]

    @property
    def y(self) -> Tensor:
        """
        Output tensor.
        """
        # Use the public accessor so the parent may itself be a subset.
        return self._dataset.y[self._indices]

    def apply_transf(
        self,
        x_op: Optional[Callable[[np.ndarray], np.ndarray]],
        y_op: Optional[Callable[[np.ndarray], np.ndarray]],
    ) -> RegressionDataset:
        """
        Apply an operator to the ``x`` and ``y`` values of the subset.

        Parameters
        ----------
        x_op : Optional[Callable[[numpy.ndarray], numpy.ndarray]]
            Operator to apply on the input features. ``None`` means identity.
        y_op : Optional[Callable[[numpy.ndarray], numpy.ndarray]]
            Operator to apply on the output features. ``None`` means identity.

        Returns
        -------
        RegressionDataset
            New standalone dataset holding the transformed values of the
            selected entries, with the same feature names.
        """
        # A subset cannot be rebuilt from raw arrays (its constructor takes a
        # dataset and indices), so the result is a plain RegressionDataset.
        x_arr = self.x.numpy()
        y_arr = self.y.numpy()
        return RegressionDataset(
            x_arr if x_op is None else x_op(x_arr),
            y_arr if y_op is None else y_op(y_arr),
            self._inputs_names,
            self._outputs_names,
        )

    def issubsetof(self, dataset: RegressionDataset) -> bool:
        """
        Returns ``True`` if ``self`` is a subset of ``dataset``.

        The check is an identity check: ``self`` must have been built
        directly from this very ``dataset`` object.

        Parameters
        ----------
        dataset : RegressionDataset
            Dataset of which we want to know if ``self`` is a subset.

        Returns
        -------
        bool
            ``True`` if ``self`` is a subset of ``dataset`` else ``False``.
        """
        return dataset is self._dataset