# -*- coding: utf-8 -*-
"""This module contains the DataSample class and the MultiModalArray class.

The DataSample class encapsulates a multi-view sample: it behaves like a
dictionary and exposes its data as a MultiModalArray.

The MultiModalArray class inherits from numpy ndarray and holds a 2d data
ndarray of shape (n_samples, n_features_view_0 + ... + n_features_view_k),
i.e. the views are concatenated along the feature axis:

======== ==== ==== ====
   0      1    2    3
======== ==== ==== ====
xxxxxxxx xxxx xxxx xxxx
xxxxxxxx xxxx xxxx xxxx
xxxxxxxx xxxx xxxx xxxx
xxxxxxxx xxxx xxxx xxxx
xxxxxxxx xxxx xxxx xxxx
xxxxxxxx xxxx xxxx xxxx
xxxxxxxx xxxx xxxx xxxx
xxxxxxxx xxxx xxxx xxxx
xxxxxxxx xxxx xxxx xxxx
======== ==== ==== ====

The view boundaries are described by the ``views_ind`` attribute, and the
number of features of each view is kept in ``shapes_int``.
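
A minimal construction sketch (the synthetic data and the two-view split
below are illustrative assumptions, not a dataset shipped with the package):

>>> import numpy as np
>>> from multimodal.datasets.data_sample import MultiModalArray
>>> X = np.hstack((np.zeros((3, 2)), np.ones((3, 3))))
>>> multi = MultiModalArray(X, [0, 2, 5])
>>> multi.shape
(3, 5)
>>> multi.n_views
2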
"""
from abc import ABCMeta
import numpy as np
import numpy.ma as ma
import scipy.sparse as sp
class MultiModalData(metaclass=ABCMeta):
@staticmethod
def _first_validate_views_ind(views_ind, n_features):
"""Ensure proper format for views_ind and return number of views."""
views_ind = np.array(views_ind)
if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1:
if len(views_ind) > 2 and np.any(views_ind[:-1] >= views_ind[1:]):
raise ValueError("Values in views_ind must be sorted.")
if views_ind[0] < 0 or views_ind[-1] > n_features:
raise ValueError("Values in views_ind are not in a correct "
+ "range for the provided data.")
view_mode_ = "slices"
n_views = views_ind.shape[0]-1
else:
if views_ind.ndim == 1:
                if not views_ind.dtype == object:
raise ValueError("The format of views_ind is not "
+ "supported.")
for ind, val in enumerate(views_ind):
views_ind[ind] = np.array(val)
if not np.issubdtype(views_ind[ind].dtype, np.integer):
raise ValueError("Values in views_ind must be "
+ "integers.")
if views_ind[ind].min() < 0 \
or views_ind[ind].max() >= n_features:
raise ValueError("Values in views_ind are not in a "
+ "correct range for the provided "
+ "data.")
elif views_ind.ndim == 2:
if not np.issubdtype(views_ind.dtype, np.integer):
raise ValueError("Values in views_ind must be integers.")
if views_ind.min() < 0 or views_ind.max() >= n_features:
raise ValueError("Values in views_ind are not in a "
+ "correct range for the provided data.")
else:
raise ValueError("The format of views_ind is not supported.")
view_mode_ = "indices"
n_views = views_ind.shape[0]
return (views_ind, n_views, view_mode_)
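    # Illustrative note on the two accepted ``views_ind`` formats (the values
    # below are assumed examples, not taken from a dataset):
    #   - "slices" mode:  views_ind = [0, 2, 5] -> view 0 is X[:, 0:2] and
    #     view 1 is X[:, 2:5]
    #   - "indices" mode: views_ind = [[0, 2], [1, 3]] -> each entry lists the
    #     column indices of one view (a 1-D object array of integer index
    #     arrays is also accepted)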
def _extract_view(self, ind_view):
"""Extract the view for the given index ind_view from the dataset X."""
if self.view_mode_ == "indices":
return self[:, self.views_ind[ind_view]]
else:
return self[:, self.views_ind[ind_view]:self.views_ind[ind_view+1]]
def _validate_views_ind(self, views_ind, n_features):
"""Ensure proper format for views_ind and return number of views."""
views_ind = np.array(views_ind)
if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1:
if len(views_ind) > 2 and np.any(views_ind[:-1] >= views_ind[1:]):
raise ValueError("Values in views_ind must be sorted.")
if views_ind[0] < 0 or views_ind[-1] > n_features:
raise ValueError("Values in views_ind are not in a correct "
+ "range for the provided data.")
self.view_mode_ = "slices"
n_views = views_ind.shape[0]-1
else:
if views_ind.ndim == 1:
                if not views_ind.dtype == object:
raise ValueError("The format of views_ind is not "
+ "supported.")
for ind, val in enumerate(views_ind):
views_ind[ind] = np.array(val)
if not np.issubdtype(views_ind[ind].dtype, np.integer):
raise ValueError("Values in views_ind must be "
+ "integers.")
if views_ind[ind].min() < 0 \
or views_ind[ind].max() >= n_features:
raise ValueError("Values in views_ind are not in a "
+ "correct range for the provided "
+ "data.")
elif views_ind.ndim == 2:
if not np.issubdtype(views_ind.dtype, np.integer):
raise ValueError("Values in views_ind must be integers.")
if views_ind.min() < 0 or views_ind.max() >= n_features:
raise ValueError("Values in views_ind are not in a "
+ "correct range for the provided data.")
else:
raise ValueError("The format of views_ind is not supported.")
self.view_mode_ = "indices"
n_views = views_ind.shape[0]
self.views_ind = views_ind
self.n_views = n_views
return (views_ind, n_views)
class MultiModalSparseInfo():
def __init__(self, data, view_ind=None):
"""Constructor of Metriclearn_array"""
shapes_int = []
index = 0
new_data = np.ndarray([])
n_views = data.size
thekeys = None
# view_ind_self = None
view_mode = 'slices'
if (sp.issparse(data)) and data.ndim > 1:
if view_ind is not None:
try:
view_ind = np.asarray(view_ind)
                except Exception:
                    raise TypeError("view_ind should be a list or a numpy array")
elif view_ind is None:
if data.shape[1] > 1:
view_ind = np.array([0, data.shape[1]//2, data.shape[1]])
else:
view_ind = np.array([0, data.shape[1]])
new_data = data
# view_ind_self = view_ind
view_ind, n_views, view_mode = self._first_validate_views_ind(view_ind,
data.shape[1])
        if view_ind.ndim == 1 and view_mode.startswith("slices"):
shapes_int = [in2 - in1 for in1, in2 in zip(view_ind, view_ind[1:])]
if data.shape[0] < 1 or data.shape[1] < 1:
raise ValueError("input data shouldbe not empty")
self.view_mode_ = view_mode
self.views_ind = view_ind
self.shapes_int = shapes_int
self.n_views = n_views
class MultiModalSparseArray(sp.csr_matrix, sp.csc_matrix, MultiModalSparseInfo, MultiModalData):
"""
    MultiModalSparseArray inherits from scipy sparse matrices (csr_matrix and
    csc_matrix).
    Parameters
    ----------
    data : scipy sparse matrix with shape (n_samples, n_views * n_features)
           for multi-view input samples; the views are concatenated along the
           feature axis and described by 'views_ind'.
    views_ind : array-like (default=None). If None,
                [0, n_features//2, n_features] is constructed (2 views).
                Parameter specifying how to extract the data views from X:
        - If views_ind is a 1-D array of sorted integers, the entries
          indicate the limits of the slices used to extract the views,
          where view ``n`` is given by
          ``X[:, views_ind[n]:views_ind[n+1]]``.
    Attributes
    ----------
    views_ind : list of view indices (may be None)
    n_views : int, number of views
    shapes_int : list of int, number of features of each view
:Example:
>>> from multimodal.datasets.base import load_dict
>>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path
>>> from multimodal.datasets.data_sample import DataSample
>>> file = 'input_x_dic.pkl'
>>> data = load_dict(get_dataset_path(file))
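
    A minimal sparse construction sketch (illustrative data; with no view
    indices given, the columns are split into two views by default):

    >>> import numpy as np
    >>> import scipy.sparse as sp
    >>> from multimodal.datasets.data_sample import MultiModalSparseArray
    >>> X_sparse = sp.csr_matrix(np.eye(4))
    >>> s_views = MultiModalSparseArray(X_sparse)
    >>> s_views.shape
    (4, 4)
    >>> s_views.n_views
    2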
"""
    def __init__(self, *arg, **kwargs):
        """Constructor of MultiModalSparseArray"""
if sp.issparse(arg[0]):
MultiModalSparseInfo.__init__(self, *arg)
            if isinstance(arg[0], sp.csr_matrix):
sp.csr_matrix.__init__(self, arg[0])
elif isinstance(arg[0], sp.csc_matrix):
sp.csc_matrix.__init__(self, arg[0])
else:
raise TypeError("This sparse format is not supported")
else:
            if isinstance(self, sp.csr_matrix):
sp.csr_matrix.__init__(self, *arg, **kwargs)
elif isinstance(self, sp.csc_matrix):
sp.csc_matrix.__init__(self, *arg, **kwargs)
class MultiModalArray(np.ndarray, MultiModalData):
"""
    MultiModalArray inherits from numpy ndarray.
    Parameters
    ----------
    data : can be
         - a dictionary of multi-view arrays with shape = (n_samples, n_features)
           for each view.
           {0: array([[]]),
            1: array([[]]),
            ...}
         - a numpy array-like of arrays with shape = (n_samples, n_features)
           for each view.
           [[[...]],
            [[...]],
            ...]
         - an {array like} with shape (n_samples, n_views * n_features), with
           'views_ind' different from 'None', for multi-view input samples.
    views_ind : array-like (default=None). If None,
                [0, n_features//2, n_features] is constructed (2 views).
                Parameter specifying how to extract the data views from X:
        - If views_ind is a 1-D array of sorted integers, the entries
          indicate the limits of the slices used to extract the views,
          where view ``n`` is given by
          ``X[:, views_ind[n]:views_ind[n+1]]``.
    Attributes
    ----------
    views_ind : list of view indices (may be None)
    n_views : int, number of views
    shapes_int : list of int, number of features of each view
    keys : dict keys, when the data come from a dictionary
:Example:
>>> from multimodal.datasets.base import load_dict
>>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path
>>> from multimodal.datasets.data_sample import DataSample
>>> file = 'input_x_dic.pkl'
>>> data = load_dict(get_dataset_path(file))
>>> print(data.__class__)
<class 'dict'>
>>> multiviews = MultiModalArray(data)
>>> multiviews.shape
(120, 240)
>>> multiviews.keys
dict_keys([0, 1])
>>> multiviews.shapes_int
[120, 120]
>>> multiviews.n_views
2
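
    A complementary sketch with synthetic data and an explicit view split
    (the values below are illustrative assumptions):

    >>> import numpy as np
    >>> X = np.arange(12).reshape(3, 4)
    >>> multi_x = MultiModalArray(X, [0, 1, 4])
    >>> multi_x.n_views
    2
    >>> multi_x.get_view(1).shape
    (3, 3)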
"""
def __new__(cls, data, view_ind=None):
"""Constructor of MultiModalArray_array"""
shapes_int = []
index = 0
new_data = np.ndarray([])
n_views = 1
thekeys = None
# view_ind_self = None
view_mode = 'slices'
if isinstance(data, dict):
n_views = len(data)
view_ind = [0]
for key, dat_values in data.items():
new_data = cls._populate_new_data(index, dat_values, new_data)
shapes_int.append(dat_values.shape[1])
view_ind.append(dat_values.shape[1] + view_ind[index])
index += 1
thekeys = data.keys()
elif isinstance(data, np.ndarray) and view_ind is None and data.ndim == 1:
try:
dat0 = np.array(data[0])
except Exception:
raise TypeError("input format is not supported")
if dat0.ndim < 2:
data = data[np.newaxis, ...]
if data.shape[1] > 1:
view_ind = np.array([0, data.shape[1]//2, data.shape[1]])
else:
view_ind = np.array([0, data.shape[1]])
new_data = data
else:
new_data, shapes_int, view_ind = cls._for_data(cls, data)
n_views = data.shape[0]
        elif isinstance(data, np.ndarray) and data.ndim > 1:
            try:
                data = np.asarray(data)
            except Exception:
                raise TypeError("input format is not supported")
if view_ind is not None:
try:
view_ind = np.asarray(view_ind)
                except Exception:
                    raise TypeError("view_ind should be a list or a numpy array")
elif view_ind is None:
if data.shape[1] > 1:
view_ind = np.array([0, data.shape[1]//2, data.shape[1]])
else:
view_ind = np.array([0, data.shape[1]])
new_data = data
else:
try:
new_data = np.asarray(data)
# if new_data.ndim == 1:
# new_data = new_data.reshape(1, new_data.shape[0])
if view_ind is None:
view_ind = np.array([0, new_data.shape[1]])
except Exception as e:
raise ValueError('Reshape your data')
if new_data.ndim < 2 or new_data.shape == (1, 1) or view_ind[-1] > new_data.shape[1]:
raise ValueError('Reshape your data')
# view_ind_self = view_ind
# if new_data.shape[1] < 1:
# msg = ("%d feature\(s\) \\(shape=\%s\) while a minimum of \\d* "
# "is required.") % (new_data.shape[1], str(new_data.shape))
# # "%d feature\(s\) \(shape=\(%d, %d\)\) while a minimum of \d* is required." % (new_data.shape[1], new_data.shape[0], new_data.shape[1])
# raise ValueError(msg)
view_ind, n_views, view_mode = cls._first_validate_views_ind(view_ind,
new_data.shape[1])
if view_ind.ndim == 1 and view_mode.startswith("slices"):
shapes_int = [in2 - in1 for in1, in2 in zip(view_ind, view_ind[1:])]
# obj = ma.MaskedArray.__new(new_data) # new_data.view() a.MaskedArray(new_data, mask=new_data.mask).view(cls)
# bj = super(Metriclearn_array, cls).__new__(cls, new_data.data, new_data.mask)
if hasattr(new_data, "mask"):
obj = ma.masked_array(new_data.data, new_data.mask).view(cls)
elif hasattr(new_data, "data") and \
hasattr(new_data, "shape") and len(new_data.shape) > 0:
obj = np.asarray(new_data.data).view(cls)
else:
            obj = np.recarray.__new__(cls, shape=(0, 0), dtype=float)
obj.view_mode_ = view_mode
obj.views_ind = view_ind
obj.shapes_int = shapes_int
obj.n_views = n_views
obj.keys = thekeys
return obj
@staticmethod
def _for_data(cls, data):
n_views = data.shape[0]
index = 0
        view_ind = np.empty(n_views + 1, dtype=int)
view_ind[0] = 0
shapes_int = []
new_data = np.ndarray([])
for dat_values in data:
try:
dat_values = np.array(dat_values)
except Exception:
raise TypeError("input format is not supported")
new_data = cls._populate_new_data(index, dat_values, new_data)
view_ind[index + 1] = dat_values.shape[1] + view_ind[index]
shapes_int.append(dat_values.shape[1])
index += 1
return new_data, shapes_int, view_ind
@staticmethod
def _populate_new_data(index, dat_values, new_data):
if index == 0:
if isinstance(dat_values, ma.MaskedArray) or \
isinstance(dat_values, np.ndarray) or sp.issparse(dat_values):
new_data = dat_values
else:
                new_data = dat_values.view(np.ndarray)
# new_data.mask = ma.nomask
else:
if isinstance(dat_values, np.ndarray):
new_data = np.hstack((new_data, dat_values))
elif isinstance(dat_values, ma.MaskedArray):
new_data = ma.hstack((new_data, dat_values))
elif sp.issparse(dat_values):
new_data = sp.hstack((new_data, dat_values))
else:
                new_data = np.hstack((new_data, dat_values.view(np.ndarray)))
return new_data
def __array_finalize__(self, obj):
if obj is None: return
# super(MultiModalArray, self).__array_finalize__(obj)
self.shapes_int = getattr(obj, 'shapes_int', None)
self.n_views = getattr(obj, 'n_views', None)
self.keys = getattr(obj, 'keys', None)
self.views_ind = getattr(obj, 'views_ind', None)
self.view_mode_ = getattr(obj, 'view_mode_', None)
def __reduce__(self):
# Get the parent's __reduce__ tuple
pickled_state = super(MultiModalArray, self).__reduce__()
# Create our own tuple to pass to __setstate__
new_state = pickled_state[2] + (self.__dict__,)
# Return a tuple that replaces the parent's __setstate__ tuple with our own
return (pickled_state[0], pickled_state[1], new_state)
def __setstate__(self, state):
self.__dict__.update(state[-1])
super(MultiModalArray, self).__setstate__(state[0:-1])
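    # Pickle round-trip sketch (illustrative, assuming ``arr`` is a
    # MultiModalArray): __reduce__ appends __dict__ to the ndarray state and
    # __setstate__ restores it, so views_ind, shapes_int and n_views survive:
    #   import pickle
    #   restored = pickle.loads(pickle.dumps(arr))
    #   assert restored.n_views == arr.n_views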
    def get_col(self, view, col):
        start = int(np.sum(np.asarray(self.shapes_int[0: view])))
        return self[:, start + col]
def get_view(self, view):
start = int(np.sum(np.asarray(self.shapes_int[0: view])))
stop = int(start + self.shapes_int[view])
return self[:, start:stop]
    def set_view(self, view, data):
        start = int(np.sum(np.asarray(self.shapes_int[0: view])))
        stop = int(start + self.shapes_int[view])
        if stop - start == data.shape[1] and data.shape[0] == self.shape[0]:
            self[:, start:stop] = data
        else:
            raise ValueError(
                "shape of data does not match (%d, %d)" % (self.shape[0], stop - start))
    def get_raw(self, view, raw):
        start = int(np.sum(np.asarray(self.shapes_int[0: view])))
        stop = int(np.sum(np.asarray(self.shapes_int[0: view+1])))
        return self[raw, start:stop]
    def add_view(self, v, data):
        if len(self.shape) > 0:
            if data.shape[0] == self.shape[0]:
                indice = self.shapes_int[v]
                np.insert(self, [indice + 1], data, axis=1)
                self.shapes_int.append(data.shape[1])
                self.n_views += 1
            else:
                raise ValueError("New view can't be initialized")
# self.shapes_int= [data.shape[1]]
# self.data.reshape(data.shape[0],)
# np.insert(self.data, data, 0)
# self.n_views = 1
def _todict(self):
dico = {}
for view in range(self.n_views):
dico[view] = self.get_view(view)
return dico
class DataSample(dict):
"""
A DataSample instance
:Example:
>>> from multimodal.datasets.base import load_dict
>>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path
>>> from multimodal.datasets.data_sample import DataSample
>>> file = 'input_x_dic.pkl'
>>> data = load_dict(get_dataset_path(file))
>>> print(data.__class__)
<class 'dict'>
>>> s = DataSample(data)
>>> type(s.data)
<class 'multimodal.datasets.data_sample.MultiModalArray'>
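
    A short follow-on sketch, reusing the sample loaded above (the expected
    shapes follow from the ``input_x_dic.pkl`` example shown for
    MultiModalArray):

    >>> s.data.n_views
    2
    >>> s.data.get_view(0).shape
    (120, 120)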
    Parameters
    ----------
    data : dict
    kwargs : other arguments
    Attributes
    ----------
    data : {array like} MultiModalArray
"""
def __init__(self, data=None, **kwargs):
# The dictionary that contains the sample
super(DataSample, self).__init__(kwargs)
        self._data = None  # MultiModalArray(np.zeros((0, 0)))
if data is not None:
self._data = MultiModalArray(data)
@property
def data(self):
"""MultiModalArray"""
return self._data
@data.setter
def data(self, data):
if isinstance(data, (MultiModalArray, np.ndarray, ma.MaskedArray, np.generic)) or sp.issparse(data):
self._data = data
else:
raise TypeError("sample should be a MultiModalArray or numpy array.")