import os
import pickle
from typing import Dict, List, Literal, Optional, Sequence, Tuple, Union, overload
import numpy as np
import pandas as pd
from torch import Tensor, from_numpy
from torch.utils.data import Dataset
__all__ = ["MaskDataset", "MaskSubset"]
class MaskDataset(Dataset):
    r"""Dataset dedicated to ignore some specified values during learning.

    The mask is held as a single float tensor of shape :math:`N \times F`
    where :math:`N` is the number of entries and :math:`F` the number of
    features.
    """

    def __init__(
        self,
        m: np.ndarray,
        features_names: Optional[List[str]] = None,
    ):
        r"""
        Parameters
        ----------
        m : np.ndarray
            Array containing the mask on the output features of the regression model.
            ``m`` must be of shape :math:`N \times F` where :math:`N` is the number of entries and :math:`F` the number of features.
        features_names : Optional[List[str]], optional
            list of feature names, by default None.

        Raises
        ------
        ValueError
            ``m`` and ``features_names`` must have the same number of features :math:`F`.
        """
        super().__init__()

        # The mask is always stored as a float32 tensor, whatever the dtype of ``m``.
        self._m: Tensor = from_numpy(m).float()

        if features_names is not None and len(features_names) != m.shape[1]:
            raise ValueError(
                "m and features_names must have the same number of features"
            )
        self._features_names: Optional[List[str]] = features_names

    def __len__(self) -> int:
        r"""
        Returns the number of entries :math:`N` in the dataset.

        Returns
        -------
        int
            Number of entries :math:`N`.
        """
        return self._m.size(0)

    def __getitem__(self, idx) -> Tensor:
        r"""
        Returns the mask entries selected by ``idx``.

        Parameters
        ----------
        idx : Any
            Indices of entries to return. Anything accepted by torch tensor indexing.

        Returns
        -------
        torch.Tensor
            Selected rows of the mask.
        """
        return self._m[idx]

    @property
    def m(self) -> Tensor:
        r"""
        Mask Tensor.
        """
        return self._m

    @property
    def n_features(self) -> int:
        r"""
        Number of output features.
        """
        return self.m.size(1)

    @property
    def features_names(self) -> Optional[List[str]]:
        r"""
        Features names.
        """
        return self._features_names

    def features_size(self) -> int:
        """
        Returns the number of floating point values in ``m``.
        """
        return self.m.numel()

    @overload
    def getall(self, numpy: Literal[True]) -> np.ndarray:
        ...

    @overload
    def getall(self, numpy: Literal[False] = ...) -> Tensor:
        ...

    def getall(self, numpy: bool = False) -> Union[np.ndarray, Tensor]:
        r"""
        Returns all the dataset in numpy.ndarray or torch.Tensor depending on the value of the ``numpy`` parameter.

        Parameters
        ----------
        numpy : bool, optional
            If ``numpy==True``, the returned object will be a numpy array.
            Else, it will be a torch tensor.

        Returns
        -------
        torch.Tensor or numpy.ndarray
            Copy of the mask.
        """
        # Go through the ``m`` property rather than ``self[...]`` so subclasses
        # that override ``m`` are handled too. ``clone`` keeps the returned
        # data independent from the stored mask.
        m = self.m.clone()
        if numpy:
            return m.numpy()
        return m

    @staticmethod
    def from_pandas(df_m: pd.DataFrame) -> "MaskDataset":
        r"""Converts a pandas DataFrame to a MaskDataset object.

        Parameters
        ----------
        df_m : pd.DataFrame
            DataFrame of the masked outputs. This DataFrame should contain :math:`N` rows, i.e., number of entries, and :math:`F` columns, i.e., features.

        Returns
        -------
        MaskDataset
            associated MaskDataset object. The ``m`` attribute is set to values in the DataFrame, and the ``features_names`` attribute to the column names.
        """
        return MaskDataset(
            df_m.to_numpy(),
            df_m.columns.to_list(),
        )

    def to_pandas(self) -> pd.DataFrame:
        r"""Converts the mask dataset to a pandas DataFrame.

        Returns
        -------
        pd.DataFrame
            DataFrame of the mask on the output y.
        """
        # Hand pandas a plain numpy array instead of a torch tensor.
        return pd.DataFrame(self.m.numpy(), columns=self.features_names)

    def join(self, other: "MaskDataset") -> "MaskDataset":
        r"""
        Returns the union of two datasets. Data are copied.

        Parameters
        ----------
        other : MaskDataset
            Other dataset to join with.

        Returns
        -------
        MaskDataset
            New dataset whose entries are those of ``self`` followed by those of ``other``.
            Feature names are taken from ``self``, or from ``other`` when ``self`` has none.

        Raises
        ------
        ValueError
            If the two datasets do not have the same number of features.
        """
        if self.n_features != other.n_features:
            raise ValueError(
                "Both datasets must have the same number of features to be joined."
            )

        stacked = np.concatenate(
            (self.getall(numpy=True), other.getall(numpy=True)),
            axis=0,
        )

        names = self.features_names
        if names is None:
            names = other.features_names

        # Copy the list so the new dataset does not share it with its sources.
        return MaskDataset(stacked, None if names is None else list(names))

    def substract(self, other: "MaskSubset") -> "MaskSubset":
        r"""
        Returns the subtraction of two datasets.

        Parameters
        ----------
        other : MaskSubset
            Subset of ``self``.

        Returns
        -------
        MaskSubset
            New subset of ``self`` containing all values that were not in ``other``.

        Raises
        ------
        ValueError
            If ``other`` is not a subset of ``self``.
        """
        if not other.issubsetof(self):
            raise ValueError("set2 is not a subset of set1 so it cannot be subtracted.")

        # MaskSubset keeps its indices in ``_indices``; a set makes each
        # membership test constant-time.
        excluded = {int(i) for i in other._indices}
        new_indices = [i for i in range(len(self)) if i not in excluded]
        return MaskSubset(self, new_indices)

    def stats(self) -> Dict[str, np.ndarray]:
        r"""Computes the proportion of masked entries for each output column.

        Returns
        -------
        Dict[str, np.ndarray]
            dictionary whose ``"frac"`` entry is the column-wise mean of the mask.
        """
        return {
            "frac": self.m.mean(dim=0).numpy(),
        }

    def save(self, filename: str, path: Optional[str] = None) -> None:
        r"""saves the dataset to a pickle file.

        Parameters
        ----------
        filename : str
            name of the file to be created. Any extension is replaced by ``.pkl``.
        path : Optional[str], optional
            path to the file to be created, by default None.
        """
        if path is not None:
            filename = os.path.join(path, filename)
        # Drop whatever extension was given; ``.pkl`` is always used.
        filename = os.path.splitext(filename)[0]

        with open(f"{filename}.pkl", "wb") as file:
            pickle.dump(self, file)

    @staticmethod
    def load(filename: str, path: Optional[str] = None) -> "MaskDataset":
        r"""loads a mask dataset from a pickle file.

        Parameters
        ----------
        filename : str
            name of the file to be read. Any extension is replaced by ``.pkl``.
        path : Optional[str], optional
            path to the file to be read, by default None.

        Returns
        -------
        MaskDataset
            loaded mask dataset.
        """
        if path is not None:
            filename = os.path.join(path, filename)
        filename = os.path.splitext(filename)[0]

        # NOTE: unpickling can execute arbitrary code. Only load files that
        # come from a trusted source.
        with open(f"{filename}.pkl", "rb") as file:
            dataset = pickle.load(file)
        return dataset
class MaskSubset(MaskDataset):
    r"""Subset of a MaskDataset, defined by a parent dataset and a sequence of indices into it."""

    def __init__(self, dataset: MaskDataset, indices: Sequence[int]):
        r"""
        Parameters
        ----------
        dataset : MaskDataset
            Dataset from which entries are extracted.
        indices : Sequence of int
            Indices of entries to extract.
        """
        # ``MaskDataset.__init__`` is not called: it expects a numpy array,
        # whereas a subset only keeps a reference to its parent dataset.
        self._dataset: MaskDataset = dataset
        self._indices: Sequence[int] = indices
        # Mirror the parent's feature names so that the inherited
        # ``features_names`` property and ``to_pandas`` work on subsets.
        self._features_names: Optional[List[str]] = dataset.features_names

    def __len__(self) -> int:
        r"""
        Returns the number of entries in the subset.

        Returns
        -------
        int
            Number of entries.
        """
        return len(self._indices)

    def __getitem__(self, idx) -> Tensor:
        r"""
        Returns the mask entries selected by ``idx``.

        Parameters
        ----------
        idx : Any
            Indices of entries to return, relative to the subset.

        Returns
        -------
        torch.Tensor
            Selected rows of the mask.
        """
        if isinstance(idx, (int, np.integer)):
            # Single entry: translate to the parent's index and fetch only that row.
            return self._dataset[int(self._indices[idx])]
        # Lists, slices, tensors...: a plain sequence of indices cannot be
        # indexed by them, so apply ``idx`` to the subset's mask tensor instead.
        return self.m[idx]

    @property
    def indices(self) -> Sequence[int]:
        r"""
        Indices of the subset entries in the parent dataset.
        """
        return self._indices

    @property
    def m(self) -> Tensor:
        r"""
        Mask Tensor.
        """
        # Index with an int64 tensor so that every kind of index sequence
        # (list, tuple, range, array) selects rows; a raw tuple would be read
        # as a multi-dimensional index. Going through the parent's ``m``
        # property also supports a parent that is itself a subset.
        rows = from_numpy(np.asarray(self._indices, dtype=np.int64))
        return self._dataset.m[rows]

    def issubsetof(self, dataset: MaskDataset) -> bool:
        r"""
        Returns ``True`` if ``self`` is a subset of ``dataset``.

        Parameters
        ----------
        dataset : MaskDataset
            Dataset of which we want to know if ``self`` is a subset.

        Returns
        -------
        bool
            ``True`` if ``self`` was built directly from ``dataset`` else ``False``.
        """
        # Identity check: only the exact parent object qualifies.
        # Maybe a better solution is suitable.
        return dataset is self._dataset