Skip to content
Snippets Groups Projects
Select Git revision
  • 9907ffa6a0c1c02cc41b8c913b6ae0c75f4c8ffe
  • master default protected
  • develop
  • 0.1.0
  • 0.0.3
  • 0.0.2
  • 0.0.1
  • 0.0.0
8 results

data_sample.py

Blame
  • Dominique Benielli's avatar
    Dominique Benielli authored
    9907ffa6
    History
    data_sample.py 20.62 KiB
    # -*- coding: utf-8 -*-
    
    """This module contains the DataSample class and Metriclearn_array class
    The DataSample class encapsulates a sample's components
    nbL and nbEx numbers,
    The MultiModalArray class (called Metriclearn_array above) inherits from numpy ndarray
    and contains a 2d data ndarray
    with the shape (n_samples, n_view_i * n_features_i)
    
    0        1    2    3
    ======== ==== ==== ====
    xxxxxxxx xxxx xxxx xxxx
    xxxxxxxx xxxx xxxx xxxx
    xxxxxxxx xxxx xxxx xxxx
    xxxxxxxx xxxx xxxx xxxx
    xxxxxxxx xxxx xxxx xxxx
    xxxxxxxx xxxx xxxx xxxx
    xxxxxxxx xxxx xxxx xxxx
    xxxxxxxx xxxx xxxx xxxx
    xxxxxxxx xxxx xxxx xxxx
    ======== ==== ==== ====
    
    the numbers nbL and nbEx, and the four dictionaries for sample,
    prefix, suffix and factor where they are computed
    """
    from abc import ABCMeta
    import numpy as np
    import numpy.ma as ma
    import scipy.sparse as sp
    
    class MultiModalData(metaclass=ABCMeta):
        """Mixin gathering the view-index validation and extraction helpers.

        ``_extract_view`` relies on the ``view_mode_`` and ``views_ind``
        attributes and on 2-D indexing (``self[:, ...]``) of the class it is
        mixed into.
        """

        @staticmethod
        def _first_validate_views_ind(views_ind, n_features):
            """Ensure proper format for views_ind and return number of views.

            Parameters
            ----------
            views_ind : array-like
                Either a 1-D sequence of sorted integers giving the limits of
                the slices ("slices" mode), or a 2-D integer array / a
                sequence of integer sequences giving the column indices of
                each view ("indices" mode).
            n_features : int
                Number of columns of the data the views are taken from.

            Returns
            -------
            tuple
                ``(views_ind, n_views, view_mode_)`` where ``views_ind`` is the
                validated numpy array and ``view_mode_`` is ``"slices"`` or
                ``"indices"``.

            Raises
            ------
            ValueError
                If the format of ``views_ind`` is not supported or its values
                are out of range for ``n_features``.
            """
            unsupported = "The format of views_ind is not supported."
            not_integers = "Values in views_ind must be integers."
            out_of_range = ("Values in views_ind are not in a correct range "
                            "for the provided data.")

            try:
                views_ind = np.array(views_ind)
            except ValueError:
                # Recent NumPy versions refuse to build ragged arrays
                # implicitly; views of different sizes need an object array.
                views_ind = np.array(views_ind, dtype=object)

            if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1:
                if views_ind.size == 0:
                    # indexing [0] below would leak an IndexError
                    raise ValueError(unsupported)
                if len(views_ind) > 2 and np.any(views_ind[:-1] >= views_ind[1:]):
                    raise ValueError("Values in views_ind must be sorted.")
                if views_ind[0] < 0 or views_ind[-1] > n_features:
                    raise ValueError(out_of_range)
                return (views_ind, views_ind.shape[0] - 1, "slices")

            if views_ind.ndim == 1:
                # ``object`` replaces the ``np.object`` alias, which no longer
                # exists in recent NumPy releases.
                if views_ind.dtype != object:
                    raise ValueError(unsupported)
                for ind, val in enumerate(views_ind):
                    view = np.array(val)
                    if not np.issubdtype(view.dtype, np.integer):
                        raise ValueError(not_integers)
                    if view.min() < 0 or view.max() >= n_features:
                        raise ValueError(out_of_range)
                    views_ind[ind] = view
            elif views_ind.ndim == 2:
                if not np.issubdtype(views_ind.dtype, np.integer):
                    raise ValueError(not_integers)
                if views_ind.min() < 0 or views_ind.max() >= n_features:
                    raise ValueError(out_of_range)
            else:
                raise ValueError(unsupported)
            return (views_ind, views_ind.shape[0], "indices")

        def _extract_view(self, ind_view):
            """Extract the view for the given index ind_view from the dataset X."""
            if self.view_mode_ == "indices":
                return self[:, self.views_ind[ind_view]]
            # "slices" mode: consecutive entries of views_ind bound the view
            return self[:, self.views_ind[ind_view]:self.views_ind[ind_view+1]]

        def _validate_views_ind(self, views_ind, n_features):
            """Validate views_ind and store the result on the instance.

            Same checks as ``_first_validate_views_ind``; on success the
            attributes ``view_mode_``, ``views_ind`` and ``n_views`` are set.

            Returns
            -------
            tuple
                ``(views_ind, n_views)``.

            Raises
            ------
            ValueError
                If ``views_ind`` is not valid for ``n_features``.
            """
            views_ind, n_views, view_mode = self._first_validate_views_ind(
                views_ind, n_features)
            self.view_mode_ = view_mode
            self.views_ind = views_ind
            self.n_views = n_views
            return (views_ind, n_views)
    
    class MultiModalSparseInfo():
        """View metadata attached to a sparse multi-view matrix.

        The constructor calls ``self._first_validate_views_ind``, which this
        class does not define itself: it has to be provided by the class it is
        combined with.
        """

        def __init__(self, data, view_ind=None):
            """Compute and store the view metadata of ``data``.

            Parameters
            ----------
            data : 2-D matrix (scipy sparse expected)
                Matrix holding all the views side by side.
            view_ind : array-like, default=None
                Description of the views. When ``None`` and ``data`` is
                sparse, two views split at ``n_features // 2`` are used (a
                single view when there is at most one column).

            Raises
            ------
            TypeError
                If ``view_ind`` cannot be converted to a numpy array.
            ValueError
                If ``view_ind`` is not valid for ``data`` or ``data`` is
                empty.
            """
            if sp.issparse(data) and data.ndim > 1:
                if view_ind is not None:
                    try:
                        try:
                            view_ind = np.asarray(view_ind)
                        except ValueError:
                            # ragged lists of indices need an object array
                            view_ind = np.asarray(view_ind, dtype=object)
                    except Exception as err:
                        raise TypeError(
                            "n_views should be list or nparray") from err
                elif data.shape[1] > 1:
                    view_ind = np.array([0, data.shape[1]//2, data.shape[1]])
                else:
                    view_ind = np.array([0, data.shape[1]])

            view_ind, n_views, view_mode = self._first_validate_views_ind(
                view_ind, data.shape[1])

            shapes_int = []
            # The validated mode is "slices" (not "slicing"): the width of
            # each view is the difference between consecutive limits.
            if view_ind.ndim == 1 and view_mode.startswith("slices"):
                shapes_int = [in2 - in1 for in1, in2 in zip(view_ind, view_ind[1:])]

            if data.shape[0] < 1 or data.shape[1] < 1:
                raise ValueError("input data shouldbe not empty")
            self.view_mode_ = view_mode
            self.views_ind = view_ind
            self.shapes_int = shapes_int
            self.n_views = n_views
    
    class MultiModalSparseArray(sp.csr_matrix, sp.csc_matrix, MultiModalSparseInfo, MultiModalData):
        """
        Sparse multi-view matrix.

        MultiModalSparseArray inherits from the scipy sparse ``csr_matrix`` and
        ``csc_matrix`` classes, and from MultiModalSparseInfo and
        MultiModalData.


        Parameters
        ----------

        arg : positional arguments
              - when the first one is a scipy sparse matrix, all positional
                arguments are passed to ``MultiModalSparseInfo.__init__`` (an
                optional second positional argument is therefore used as
                ``view_ind``) and the matrix alone is passed to the scipy
                constructor matching its type (csr or csc); any other sparse
                format raises a TypeError.
              - otherwise the positional and keyword arguments are passed
                unchanged to the scipy constructor.

        kwargs : keyword arguments, only forwarded to the scipy constructor
                 when the first positional argument is not a sparse matrix.


        Attributes
        ----------

        Set by ``MultiModalSparseInfo.__init__`` when the first argument is a
        sparse matrix:

        views_ind : validated description of the views

        view_mode_ : mode of ``views_ind``

        n_views : int number of views

        shapes_int: list of int numbers of feature for each views


        :Example:

        >>> from multimodal.datasets.base import load_dict
        >>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path
        >>> from multimodal.datasets.data_sample import DataSample
        >>> file = 'input_x_dic.pkl'
        >>> data = load_dict(get_dataset_path(file))

        """
    
        def __init__(self, *arg, **kwargs ):
            """Constructor of MultiModalSparseArray (see the class docstring)."""
            if sp.issparse(arg[0]):
                # View metadata first, then the scipy storage itself.
                # NOTE(review): kwargs are not forwarded in this branch, so a
                # ``view_ind=...`` keyword looks like it is ignored - confirm.
                MultiModalSparseInfo.__init__(self, *arg)
                if isinstance(arg[0], sp.csr_matrix) :
                    sp.csr_matrix.__init__(self, arg[0])
                elif isinstance(arg[0], sp.csc_matrix):
                    sp.csc_matrix.__init__(self, arg[0])
                else:
                    raise TypeError("This sparse format is not supported")
            else:
                # Non sparse first argument: plain scipy construction, no view
                # metadata is computed here.
                if isinstance(self,sp.csr_matrix):
                   sp.csr_matrix.__init__(self, *arg, **kwargs)
                elif isinstance(self, sp.csc_matrix):
                   sp.csc_matrix.__init__(self, *arg, **kwargs)
    
    
    
    class MultiModalArray(np.ndarray, MultiModalData):
        """
        MultiModalArray inherits from numpy ndarray and stores all the views
        side by side in one 2-D array of shape (n_samples, total number of
        features).


        Parameters
        ----------

        data : can be
             - dictionary of 2-D arrays, one entry per view, each with
               shape = (n_samples, n_features_i)
               {0: array([[]],
                1: array([[]],
                ...}
             - 1-D numpy array whose elements are the 2-D arrays of the views
                [[[...]],
                 [[...]],
                 ...]
             - {array like} with (n_samples, nviews *  n_features) with 'view_ind' different from 'None'
                for Multi-view input samples.

        view_ind : array-like (default= None ) if None
                    [0, n_features//2, n_features]) is constructed (2 views)
                    Parameter specifying how to extract the data views from X:

            - view_ind is a 1-D array of sorted integers, the entries
              indicate the limits of the slices used to extract the views,
              where view ``n`` is given by
              ``X[:, view_ind[n]:view_ind[n+1]]``.

            It is ignored when data is a dictionary: the limits are then
            computed from the width of each entry.

        Attributes
        ----------

        views_ind : validated description of the views

        view_mode_ : "slices" or "indices", the mode of views_ind

        n_views : int number of views

        shapes_int: list of int numbers of feature for each views

        keys : name of key, where data come from a dictionary


        :Example:

        >>> from multimodal.datasets.base import load_dict
        >>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path
        >>> from multimodal.datasets.data_sample import MultiModalArray
        >>> file = 'input_x_dic.pkl'
        >>> data = load_dict(get_dataset_path(file))
        >>> print(data.__class__)
        <class 'dict'>
        >>> multiviews = MultiModalArray(data)
        >>> multiviews.shape
        (120, 240)
        >>> multiviews.keys
        dict_keys([0, 1])
        >>> multiviews.shapes_int
        [120, 120]
        >>> multiviews.n_views
        2


        """
        def __new__(cls, data, view_ind=None):
            """Build the array from ``data`` and attach the view metadata.

            Raises
            ------
            TypeError
                If the input format or ``view_ind`` cannot be converted.
            ValueError
                If the data is not 2-D ('Reshape your data') or ``view_ind``
                is not valid for it.
            """
            shapes_int = []
            new_data = np.ndarray([])
            thekeys = None
            if isinstance(data, dict):
                # Limits of the views are the cumulated widths of the entries.
                view_ind = [0]
                for index, dat_values in enumerate(data.values()):
                    new_data = cls._populate_new_data(index, dat_values, new_data)
                    shapes_int.append(dat_values.shape[1])
                    view_ind.append(dat_values.shape[1] + view_ind[index])
                thekeys = data.keys()

            elif isinstance(data, np.ndarray) and view_ind is None \
                    and data.ndim == 1:
                try:
                    dat0 = np.array(data[0])
                except Exception as err:
                    raise TypeError("input format is not supported") from err

                if dat0.ndim < 2:
                    # A single sample: promote it to one row.
                    data = data[np.newaxis, ...]
                    view_ind = cls._default_view_ind(data.shape[1])
                    new_data = data
                else:
                    # A 1-D array of 2-D views.
                    new_data, shapes_int, view_ind = cls._for_data(cls, data)

            elif isinstance(data, np.ndarray) and data.ndim > 1:
                if view_ind is None:
                    view_ind = cls._default_view_ind(data.shape[1])
                else:
                    view_ind = cls._as_view_ind(view_ind)
                new_data = data

            else:
                try:
                    new_data = np.asarray(data)
                    if view_ind is None:
                        view_ind = np.array([0, new_data.shape[1]])
                except Exception as err:
                    raise ValueError('Reshape your data') from err

                if new_data.ndim < 2 or new_data.shape == (1, 1) \
                        or view_ind[-1] > new_data.shape[1]:
                    raise ValueError('Reshape your data')

            view_ind, n_views, view_mode = cls._first_validate_views_ind(
                view_ind, new_data.shape[1])
            if view_ind.ndim == 1 and view_mode.startswith("slices"):
                # Plain Python ints keep the list readable whatever the
                # NumPy version used.
                shapes_int = [int(in2 - in1)
                              for in1, in2 in zip(view_ind, view_ind[1:])]

            if hasattr(new_data, "mask"):
                obj = ma.masked_array(new_data.data, new_data.mask).view(cls)
            elif hasattr(new_data, "data") and \
                    hasattr(new_data, "shape") and len(new_data.shape) > 0:
                obj = np.asarray(new_data.data).view(cls)
            else:
                # Fallback: empty array. ``float`` replaces the ``np.float``
                # alias, which no longer exists in recent NumPy releases.
                obj = np.ndarray.__new__(cls, shape=(0, 0), dtype=float)
            obj.view_mode_ = view_mode
            obj.views_ind = view_ind
            obj.shapes_int = shapes_int
            obj.n_views = n_views
            obj.keys = thekeys
            return obj

        @staticmethod
        def _default_view_ind(n_features):
            """Return the default limits: two halves, or one view if n_features <= 1."""
            if n_features > 1:
                return np.array([0, n_features//2, n_features])
            return np.array([0, n_features])

        @staticmethod
        def _as_view_ind(view_ind):
            """Convert a user supplied view_ind to a numpy array (TypeError on failure)."""
            try:
                try:
                    return np.asarray(view_ind)
                except ValueError:
                    # ragged lists of indices need an object array
                    return np.asarray(view_ind, dtype=object)
            except Exception as err:
                raise TypeError("n_views should be list or nparray") from err

        @staticmethod
        def _for_data(cls, data):
            """Stack the 2-D views contained in the 1-D array ``data``.

            Returns the stacked data, the list of view widths and the array
            of slice limits.
            """
            n_views = data.shape[0]
            # ``int`` replaces the ``np.int`` alias, which no longer exists in
            # recent NumPy releases.
            view_ind = np.empty(n_views + 1, dtype=int)
            view_ind[0] = 0
            shapes_int = []
            new_data = np.ndarray([])
            for index, dat_values in enumerate(data):
                try:
                    dat_values = np.array(dat_values)
                except Exception as err:
                    raise TypeError("input format is not supported") from err
                new_data = cls._populate_new_data(index, dat_values, new_data)
                view_ind[index + 1] = dat_values.shape[1] + view_ind[index]
                shapes_int.append(dat_values.shape[1])
            return new_data, shapes_int, view_ind

        @staticmethod
        def _populate_new_data(index, dat_values, new_data):
            """Return ``new_data`` with the view ``dat_values`` appended on its right.

            For ``index == 0`` the view itself is returned.
            """
            if not (isinstance(dat_values, np.ndarray) or sp.issparse(dat_values)):
                dat_values = dat_values.view(np.ndarray)
            if index == 0:
                return dat_values
            if sp.issparse(dat_values):
                return sp.hstack((new_data, dat_values))
            # MaskedArray is an ndarray subclass: it must be tested before the
            # plain ndarray case, otherwise ma.hstack is never used.
            if isinstance(dat_values, ma.MaskedArray) \
                    or isinstance(new_data, ma.MaskedArray):
                return ma.hstack((new_data, dat_values))
            return np.hstack((new_data, dat_values))

        def __array_finalize__(self, obj):
            """Propagate the view metadata to arrays derived from ``obj``."""
            if obj is None:
                return
            self.shapes_int = getattr(obj, 'shapes_int', None)
            self.n_views = getattr(obj, 'n_views', None)
            self.keys = getattr(obj, 'keys', None)
            self.views_ind = getattr(obj, 'views_ind', None)
            self.view_mode_ = getattr(obj, 'view_mode_', None)

        def __reduce__(self):
            """Extend the ndarray pickling state with the instance attributes."""
            # Get the parent's __reduce__ tuple
            pickled_state = super(MultiModalArray, self).__reduce__()
            extra = dict(self.__dict__)
            if extra.get('keys') is not None:
                # a dict_keys view cannot be pickled: store a plain list
                extra['keys'] = list(extra['keys'])
            # Create our own tuple to pass to __setstate__
            new_state = pickled_state[2] + (extra,)
            # Return a tuple that replaces the parent's __setstate__ tuple with our own
            return (pickled_state[0], pickled_state[1], new_state)

        def __setstate__(self, state):
            """Restore the instance attributes, then the ndarray state."""
            self.__dict__.update(state[-1])
            super(MultiModalArray, self).__setstate__(state[0:-1])

        def _view_bounds(self, view):
            """Return the (start, stop) column offsets of ``view`` from shapes_int."""
            start = int(np.sum(np.asarray(self.shapes_int[0: view])))
            stop = int(start + self.shapes_int[view])
            return start, stop

        def get_col(self, view, col):
            """Return column ``col`` of view ``view`` (all the samples).

            The data is laid out as (n_samples, features), like in
            ``get_view``, so the column is taken on the second axis.
            """
            start, _ = self._view_bounds(view)
            return self[:, start + col]

        def get_view(self, view):
            """Return the columns of view ``view`` for all the samples."""
            start, stop = self._view_bounds(view)
            return self[:, start:stop]

        def set_view(self, view, data):
            """Overwrite view ``view`` with ``data``.

            Raises
            ------
            ValueError
                If ``data`` does not have the shape
                (n_samples, number of features of the view).
            """
            start, stop = self._view_bounds(view)
            expected = (self.shape[0], stop - start)
            if tuple(np.shape(data)) != expected:
                raise ValueError(
                    "shape of data does not match (%d, %d)" % expected)
            self[:, start:stop] = data

        def get_raw(self, view, raw):
            """Return the features of view ``view`` for the sample (row) ``raw``."""
            start, stop = self._view_bounds(view)
            return self[raw, start:stop]

        def add_view(self, v, data):
            """Return a new MultiModalArray with ``data`` inserted after view ``v``.

            An ndarray cannot grow in place, so ``self`` is left untouched and
            the enlarged array is returned.
            NOTE(review): the previous implementation could not succeed
            (swapped ``np.insert`` arguments, result discarded); inserting
            right after view ``v`` is the chosen reading of its intent -
            confirm.

            Raises
            ------
            ValueError
                If the array is not 2-D, ``v`` is not an existing view or
                ``data`` does not have one row per sample.
            """
            if self.ndim != 2:
                raise ValueError("New view can't initialazed")
            if not 0 <= v < len(self.shapes_int):
                raise ValueError("view %d does not exist" % v)
            data = np.asarray(data)
            if data.ndim != 2 or data.shape[0] != self.shape[0]:
                raise ValueError(
                    "the new view must have %d rows" % self.shape[0])
            _, stop = self._view_bounds(v)
            base = np.asarray(self)
            new_data = np.hstack((base[:, :stop], data, base[:, stop:]))
            widths = [int(width) for width in self.shapes_int]
            widths.insert(v + 1, data.shape[1])
            new_view_ind = np.concatenate(([0], np.cumsum(widths)))
            return type(self)(new_data, view_ind=new_view_ind)

        def _todict(self):
            """Return the views as a dictionary {view index: view}."""
            return {view: self.get_view(view) for view in range(self.n_views)}
    
    
    
    
    class DataSample(dict):
        """
        Dictionary of sample information with an attached multi-view array.

        The keyword arguments given at construction become the entries of the
        dictionary; the optional ``data`` is wrapped in a MultiModalArray and
        exposed through the ``data`` property.


        :Example:

        >>> from multimodal.datasets.base import load_dict
        >>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path
        >>> from multimodal.datasets.data_sample import DataSample
        >>> file = 'input_x_dic.pkl'
        >>> data = load_dict(get_dataset_path(file))
        >>> print(data.__class__)
        <class 'dict'>
        >>> s = DataSample(data)
        >>> type(s.data)
        <class 'multimodal.datasets.data_sample.MultiModalArray'>


        Parameters
        ----------
        data : input accepted by MultiModalArray, or None (default)
        kwargs : entries stored in the dictionary itself

        Attributes
        ----------

        data   : { array like}  MultiModalArray, or None when no data was given
        """

        def __init__(self, data=None, **kwargs):
            # The keyword arguments are the content of the dictionary.
            super().__init__(kwargs)
            self._data = None if data is None else MultiModalArray(data)

        @property
        def data(self):
            """MultiModalArray holding the views (None if not set)."""
            return self._data

        @data.setter
        def data(self, data):
            accepted = (MultiModalArray, np.ndarray, ma.MaskedArray, np.generic)
            if not isinstance(data, accepted) and not sp.issparse(data):
                raise TypeError("sample should be a MultiModalArray or numpy array.")
            self._data = data