# data_sample.py

# -*- coding: utf-8 -*-

"""This module contains the DataSample class and Metriclearn_array class
The DataSample class encapsulates a sample's components
(the nbL and nbEx numbers).
The Metriclearn_array class inherits from numpy ndarray and contains a 2d data
ndarray with the shape (n_samples, n_view_i * n_features_i)

0        1    2    3
======== ==== ==== ====
xxxxxxxx xxxx xxxx xxxx
xxxxxxxx xxxx xxxx xxxx
xxxxxxxx xxxx xxxx xxxx
xxxxxxxx xxxx xxxx xxxx
xxxxxxxx xxxx xxxx xxxx
xxxxxxxx xxxx xxxx xxxx
xxxxxxxx xxxx xxxx xxxx
xxxxxxxx xxxx xxxx xxxx
xxxxxxxx xxxx xxxx xxxx
======== ==== ==== ====

together with the numbers nbL and nbEx and the four dictionaries for sample,
prefix, suffix and factor where they are computed
"""
from abc import ABCMeta
import numpy as np
import numpy.ma as ma
import scipy.sparse as sp

class MultiModalData(metaclass=ABCMeta):
    """Mixin that validates view indices and extracts views from a dataset.

    The methods below read and write the attributes ``views_ind``,
    ``view_mode_`` and ``n_views`` on ``self`` and index ``self`` with
    two-dimensional subscripts (``self[:, cols]``).
    """

    @staticmethod
    def _first_validate_views_ind(views_ind, n_features):
        """Ensure proper format for views_ind and return number of views.

        Parameters
        ----------
        views_ind : array-like
            One of:

            - a 1-D sequence of integers giving slice limits, view ``n``
              being the columns ``views_ind[n]:views_ind[n+1]``;
            - a 1-D object array whose entries are sequences of integer
              column indices (one entry per view);
            - a 2-D integer array with one row of column indices per view.
        n_features : int
            Number of columns of the data the views refer to.

        Returns
        -------
        tuple
            ``(views_ind, n_views, view_mode_)`` where ``views_ind`` is the
            validated numpy array and ``view_mode_`` is ``"slices"`` or
            ``"indices"``.

        Raises
        ------
        ValueError
            If the format is not supported, the values are not integers,
            slice limits are not sorted, or indices are out of range.
        """
        views_ind = np.array(views_ind)
        if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1:
            if views_ind.size == 0:
                # An empty limit list would otherwise fail with IndexError.
                raise ValueError("The format of views_ind is not supported.")
            if len(views_ind) > 2 and np.any(views_ind[:-1] >= views_ind[1:]):
                raise ValueError("Values in views_ind must be sorted.")
            if views_ind[0] < 0 or views_ind[-1] > n_features:
                raise ValueError("Values in views_ind are not in a correct "
                                 + "range for the provided data.")
            return (views_ind, views_ind.shape[0] - 1, "slices")

        if views_ind.ndim == 1:
            # ``np.object`` was removed from NumPy; the builtin ``object``
            # is what that alias stood for.
            if not views_ind.dtype == object:
                raise ValueError("The format of views_ind is not "
                                 + "supported.")
            for ind, val in enumerate(views_ind):
                views_ind[ind] = np.array(val)
                if not np.issubdtype(views_ind[ind].dtype, np.integer):
                    raise ValueError("Values in views_ind must be "
                                     + "integers.")
                if views_ind[ind].min() < 0 \
                        or views_ind[ind].max() >= n_features:
                    raise ValueError("Values in views_ind are not in a "
                                     + "correct range for the provided "
                                     + "data.")
        elif views_ind.ndim == 2:
            if not np.issubdtype(views_ind.dtype, np.integer):
                raise ValueError("Values in views_ind must be integers.")
            if views_ind.min() < 0 or views_ind.max() >= n_features:
                raise ValueError("Values in views_ind are not in a "
                                 + "correct range for the provided data.")
        else:
            raise ValueError("The format of views_ind is not supported.")
        return (views_ind, views_ind.shape[0], "indices")

    def _extract_view(self, ind_view):
        """Extract the view for the given index ind_view from the dataset X.

        In ``"indices"`` mode the columns listed in ``views_ind[ind_view]``
        are selected; otherwise the column slice
        ``views_ind[ind_view]:views_ind[ind_view+1]`` is returned.
        """
        if self.view_mode_ == "indices":
            return self[:, self.views_ind[ind_view]]
        else:
            return self[:, self.views_ind[ind_view]:self.views_ind[ind_view+1]]

    def _validate_views_ind(self, views_ind, n_features):
        """Validate views_ind, store the result on ``self`` and return it.

        Performs the same checks as :meth:`_first_validate_views_ind` and
        then sets ``self.view_mode_``, ``self.views_ind`` and
        ``self.n_views``. Nothing is stored when validation fails.

        Returns
        -------
        tuple
            ``(views_ind, n_views)``.

        Raises
        ------
        ValueError
            See :meth:`_first_validate_views_ind`.
        """
        views_ind, n_views, view_mode = self._first_validate_views_ind(
            views_ind, n_features)
        self.view_mode_ = view_mode
        self.views_ind = views_ind
        self.n_views = n_views
        return (views_ind, n_views)

class MultiModalSparseInfo():
    """Compute and hold the multi-view metadata of a sparse data matrix.

    The constructor calls ``self._first_validate_views_ind``, which is not
    defined in this class; it has to be supplied by another base class of
    the concrete type.

    Attributes
    ----------
    view_mode_ : str
        View mode returned by ``_first_validate_views_ind``.
    views_ind : numpy.ndarray
        Validated view indices.
    shapes_int : list
        Width of each view when the views are given as slice limits,
        otherwise an empty list.
    n_views : int
        Number of views.
    """

    def __init__(self, data, view_ind=None):
        """Derive the view metadata for ``data``.

        Parameters
        ----------
        data : sparse matrix
            Two-dimensional sparse data.
        view_ind : array-like, optional
            View description. When omitted and ``data`` is a sparse matrix
            with more than one column, the columns are split into two views
            at ``n_features // 2``; with a single column one view is used.

        Raises
        ------
        TypeError
            If ``view_ind`` cannot be converted to a numpy array.
        ValueError
            If ``view_ind`` is invalid or ``data`` has an empty dimension.
        """
        shapes_int = []

        if sp.issparse(data) and data.ndim > 1:
            if view_ind is not None:
                try:
                    view_ind = np.asarray(view_ind)
                except Exception as err:
                    raise TypeError(
                        "n_views should be list or nparray") from err
            elif data.shape[1] > 1:
                view_ind = np.array([0, data.shape[1]//2, data.shape[1]])
            else:
                view_ind = np.array([0, data.shape[1]])

        view_ind, n_views, view_mode = self._first_validate_views_ind(
            view_ind, data.shape[1])
        # The mode string for slice limits is "slices"; testing for
        # "slicing" never matched and left shapes_int empty.
        if view_ind.ndim == 1 and view_mode.startswith("slices"):
            shapes_int = [in2 - in1 for in1, in2 in zip(view_ind, view_ind[1:])]

        if data.shape[0] < 1 or data.shape[1] < 1:
            raise ValueError("input data shouldbe not empty")
        self.view_mode_ = view_mode
        self.views_ind = view_ind
        self.shapes_int = shapes_int
        self.n_views = n_views

class MultiModalSparseArray(sp.csr_matrix, sp.csc_matrix, MultiModalSparseInfo, MultiModalData):
    """
    MultiModalSparseArray inherits from scipy sparse matrices (CSR / CSC)
    and carries the multi-view metadata computed by MultiModalSparseInfo.


    Parameters
    ----------

    arg : positional arguments
        When the first argument is a sparse matrix it must be a
        ``csr_matrix`` or a ``csc_matrix`` of shape
        (n_samples, n_views * n_features); an optional second positional
        argument is taken as ``view_ind``. Otherwise all arguments are
        forwarded unchanged to the scipy sparse constructor.

    view_ind : array-like (default= None ) if None
                [0, n_features//2, n_features]) is constructed (2 views)
                Parameter specifying how to extract the data views from X:

        - views_ind is a 1-D array of sorted integers, the entries
          indicate the limits of the slices used to extract the views,
          where view ``n`` is given by
          ``X[:, views_ind[n]:views_ind[n+1]]``.

    Attributes
    ----------

    views_ind : validated view indices

    view_mode_ : view mode string

    n_views : int number of views

    shapes_int: list of int numbers of feature for each views

    (these attributes are only set when the first argument is sparse)


    :Example:

    >>> from multimodal.datasets.base import load_dict
    >>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path
    >>> from multimodal.datasets.data_sample import DataSample
    >>> file = 'input_x_dic.pkl'
    >>> data = load_dict(get_dataset_path(file))

    """

    def __init__(self, *arg, **kwargs):
        """Build the sparse matrix and, for sparse input, its view metadata.

        Raises
        ------
        TypeError
            If the first argument is sparse but neither CSR nor CSC.
        """
        if sp.issparse(arg[0]):
            # Forward only the ``view_ind`` keyword: it used to be dropped
            # silently, and other keywords are not accepted by
            # MultiModalSparseInfo.__init__.
            if "view_ind" in kwargs:
                MultiModalSparseInfo.__init__(
                    self, *arg, view_ind=kwargs["view_ind"])
            else:
                MultiModalSparseInfo.__init__(self, *arg)
            if isinstance(arg[0], sp.csr_matrix):
                sp.csr_matrix.__init__(self, arg[0])
            elif isinstance(arg[0], sp.csc_matrix):
                sp.csc_matrix.__init__(self, arg[0])
            else:
                raise TypeError("This sparse format is not supported")
        else:
            # Non-sparse first argument: plain scipy construction, no view
            # metadata is computed in this branch.
            if isinstance(self, sp.csr_matrix):
                sp.csr_matrix.__init__(self, *arg, **kwargs)
            elif isinstance(self, sp.csc_matrix):
                sp.csc_matrix.__init__(self, *arg, **kwargs)

class MultiModalArray(np.ndarray, MultiModalData):
    """
    MultiModalArray inherits from numpy ndarray and stores all views side by
    side in one 2-D array together with the description of the views.


    Parameters
    ----------

    data : can be
         - dictionary of multiview array with shape = (n_samples, n_features)  for multi-view
              for each view.
           {0: array([[]],
            1: array([[]],
            ...}
         - numpy array like with shape = (n_samples, n_features)  for multi-view
              for each view.
            [[[...]],
             [[...]],
             ...]
         - {array like} with (n_samples, nviews *  n_features) with 'views_ind' different from 'None'
            for Multi-view input samples.


    view_ind : array-like (default= None ) if None
                [0, n_features//2, n_features]) is constructed (2 views)
                Parameter specifying how to extract the data views from X:

        - views_ind is a 1-D array of sorted integers, the entries
          indicate the limits of the slices used to extract the views,
          where view ``n`` is given by
          ``X[:, views_ind[n]:views_ind[n+1]]``.

    Attributes
    ----------

    views_ind : validated views' indices

    view_mode_ : "slices" or "indices"

    n_views : int number of views

    shapes_int: list of int numbers of feature for each views
                (empty when the views are not given as slice limits)

    keys : name of key, where data come from a dictionary (else None)


    :Example:

    >>> from multimodal.datasets.base import load_dict
    >>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path
    >>> from multimodal.datasets.data_sample import MultiModalArray
    >>> file = 'input_x_dic.pkl'
    >>> data = load_dict(get_dataset_path(file))
    >>> print(data.__class__)
    <class 'dict'>
    >>> multiviews = MultiModalArray(data)
    >>> multiviews.shape
    (120, 240)
    >>> multiviews.keys
    dict_keys([0, 1])
    >>> multiviews.shapes_int
    [120, 120]
    >>> multiviews.n_views
    2


    """
    def __new__(cls, data, view_ind=None):
        """Create the array from a dict, an array of views or a 2-D array.

        Raises
        ------
        TypeError
            If the views or ``view_ind`` cannot be converted to arrays.
        ValueError
            If the data is not two-dimensional or ``view_ind`` is invalid.
        """
        shapes_int = []
        new_data = np.ndarray([])
        thekeys = None
        if isinstance(data, dict):
            # One view per dictionary entry, stacked column-wise.
            view_ind = [0]
            for index, dat_values in enumerate(data.values()):
                new_data = cls._populate_new_data(index, dat_values, new_data)
                shapes_int.append(dat_values.shape[1])
                view_ind.append(dat_values.shape[1] + view_ind[index])
            thekeys = data.keys()

        elif isinstance(data, np.ndarray) and view_ind is None and data.ndim == 1:
            try:
                dat0 = np.array(data[0])
            except Exception as err:
                raise TypeError("input format is not supported") from err

            if dat0.ndim < 2:
                # A flat vector is treated as a single sample (one row).
                data = data[np.newaxis, ...]
                if data.shape[1] > 1:
                    view_ind = np.array([0, data.shape[1]//2, data.shape[1]])
                else:
                    view_ind = np.array([0, data.shape[1]])
                new_data = data
            else:
                # 1-D array whose entries are the 2-D views.
                new_data, shapes_int, view_ind = cls._for_data(cls, data)
        elif isinstance(data, np.ndarray) and data.ndim > 1:
            # Converts ndarray subclasses to a base ndarray.
            data = np.asarray(data)

            if view_ind is not None:
                try:
                    view_ind = np.asarray(view_ind)
                except Exception as err:
                    raise TypeError(
                        "n_views should be list or nparray") from err
            elif data.shape[1] > 1:
                view_ind = np.array([0, data.shape[1]//2, data.shape[1]])
            else:
                view_ind = np.array([0, data.shape[1]])
            new_data = data
        else:
            try:
                new_data = np.asarray(data)
                if view_ind is None:
                    view_ind = np.array([0, new_data.shape[1]])
            except Exception as err:
                raise ValueError('Reshape your data') from err

            if new_data.ndim < 2 or new_data.shape == (1, 1) or view_ind[-1] > new_data.shape[1]:
                raise ValueError('Reshape your data')

        view_ind, n_views, view_mode = cls._first_validate_views_ind(view_ind,
                                                                      new_data.shape[1])
        if view_ind.ndim == 1 and view_mode.startswith("slices"):
            shapes_int = [in2 - in1 for in1, in2 in zip(view_ind, view_ind[1:])]

        if hasattr(new_data, "mask"):
            obj = ma.masked_array(new_data.data, new_data.mask).view(cls)
        elif hasattr(new_data, "data") and \
                hasattr(new_data, "shape") and len(new_data.shape) > 0:
            obj = np.asarray(new_data.data).view(cls)
        else:
            # ``np.float`` was removed from NumPy; the builtin ``float`` is
            # what that alias stood for.
            obj = np.recarray.__new__(cls, shape=(0, 0), dtype=float)
        obj.view_mode_ = view_mode
        obj.views_ind = view_ind
        obj.shapes_int = shapes_int
        obj.n_views = n_views
        obj.keys = thekeys
        return obj

    @staticmethod
    def _for_data(cls, data):
        """Stack the 2-D views contained in ``data`` column-wise.

        Returns
        -------
        tuple
            ``(new_data, shapes_int, view_ind)``: the stacked data, the
            width of each view and the cumulative slice limits.

        Raises
        ------
        TypeError
            If a view cannot be converted to a numpy array.
        """
        n_views = data.shape[0]
        # ``np.int`` was removed from NumPy; the builtin ``int`` is what
        # that alias stood for.
        view_ind = np.empty(n_views + 1, dtype=int)
        view_ind[0] = 0
        shapes_int = []
        new_data = np.ndarray([])
        for index, dat_values in enumerate(data):
            try:
                dat_values = np.array(dat_values)
            except Exception as err:
                raise TypeError("input format is not supported") from err
            new_data = cls._populate_new_data(index, dat_values, new_data)
            view_ind[index + 1] = dat_values.shape[1] + view_ind[index]
            shapes_int.append(dat_values.shape[1])
        return new_data, shapes_int, view_ind

    @staticmethod
    def _populate_new_data(index, dat_values, new_data):
        """Return ``new_data`` extended on the right with ``dat_values``.

        For ``index == 0`` the view itself becomes the data; later views are
        horizontally stacked with the stacking function matching their type.
        """
        if index == 0:
            if isinstance(dat_values, (ma.MaskedArray, np.ndarray)) \
                    or sp.issparse(dat_values):
                return dat_values
            return dat_values.view(np.ndarray)

        # MaskedArray is a subclass of ndarray, so it has to be tested first
        # for ma.hstack to be reachable.
        if isinstance(dat_values, ma.MaskedArray):
            return ma.hstack((new_data, dat_values))
        if isinstance(dat_values, np.ndarray):
            return np.hstack((new_data, dat_values))
        if sp.issparse(dat_values):
            return sp.hstack((new_data, dat_values))
        return np.hstack((new_data, dat_values.view(np.ndarray)))

    def __array_finalize__(self, obj):
        """Copy the view metadata from ``obj`` onto views and slices."""
        if obj is None:
            return
        self.shapes_int = getattr(obj, 'shapes_int', None)
        self.n_views = getattr(obj, 'n_views', None)
        self.keys = getattr(obj, 'keys', None)
        self.views_ind = getattr(obj, 'views_ind', None)
        self.view_mode_ = getattr(obj, 'view_mode_', None)

    def __reduce__(self):
        """Extend the ndarray pickle state with the instance ``__dict__``."""
        # Get the parent's __reduce__ tuple
        pickled_state = super(MultiModalArray, self).__reduce__()
        # Create our own tuple to pass to __setstate__
        new_state = pickled_state[2] + (self.__dict__,)
        # Return a tuple that replaces the parent's __setstate__ tuple with our own
        return (pickled_state[0], pickled_state[1], new_state)

    def __setstate__(self, state):
        """Restore the ``__dict__`` appended by ``__reduce__``, then the array."""
        self.__dict__.update(state[-1])
        super(MultiModalArray, self).__setstate__(state[0:-1])

    def _view_offset(self, view):
        """Return the integer sum of the widths of the views before ``view``."""
        # int() matters: np.sum of an empty list is the float 0.0, which
        # cannot be used as an index.
        return int(sum(self.shapes_int[:view]))

    def get_col(self, view, col):
        """Return ``self[offset + col, :]`` with ``offset`` the total width
        of the views preceding ``view``.

        NOTE(review): the offset is applied to the first axis, as in the
        original code -- confirm this is the intended axis.
        """
        return self[self._view_offset(view) + col, :]

    def get_view(self, view):
        """Return the columns belonging to view number ``view``."""
        start = self._view_offset(view)
        stop = int(start + self.shapes_int[view])
        return self[:, start:stop]

    def set_view(self, view, data):
        """Overwrite the columns of view number ``view`` with ``data``.

        Raises
        ------
        ValueError
            If ``data`` does not have the shape (n_rows, width of the view).
        """
        start = self._view_offset(view)
        stop = int(start + self.shapes_int[view])
        expected = (self.shape[0], stop - start)
        if tuple(data.shape) == expected:
            self[:, start:stop] = data
        else:
            raise ValueError(
                "shape of data does not match (%d, %d)" % expected)

    def get_raw(self, view, raw):
        """Return, as a plain ndarray, column ``raw`` restricted to the rows
        ``offset(view):offset(view + 1)``.

        NOTE(review): the view offsets are applied to the first axis, as in
        the original code -- confirm this is the intended axis.
        """
        start = self._view_offset(view)
        stop = self._view_offset(view + 1)
        # ``self.data`` is a raw buffer and does not support 2-D slicing.
        return np.asarray(self)[start:stop, raw]

    def add_view(self, v, data):
        """Attempt to register an additional view.

        NOTE(review): code left unchanged. The result of ``np.insert`` is
        discarded, and its arguments do not follow the documented
        ``np.insert(arr, obj, values, axis)`` order, so the array content
        is not modified here. Presumably this needs a redesign, since an
        ndarray cannot grow in place -- confirm with the callers.
        """
        if len(self.shape) > 0:
            if data.shape[0] == self.data.shape[0]:
                indice = self.shapes_int[v]
                np.insert(self.data, data, indice+1, axis=0)
                self.shapes_int.append(data.shape[1])
                self.n_views +=1
        else:
            raise ValueError("New view can't initialazed")

    def _todict(self):
        """Return ``{view index: view data}`` for all ``n_views`` views."""
        return {view: self.get_view(view) for view in range(self.n_views)}


class DataSample(dict):
    """
    Dictionary of sample attributes that also holds the multi-view data.

    Keyword arguments given to the constructor become the dictionary
    entries; the optional ``data`` argument is converted to a
    MultiModalArray and exposed through the ``data`` property.


    :Example:

    >>> from multimodal.datasets.base import load_dict
    >>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path
    >>> from multimodal.datasets.data_sample import DataSample
    >>> file = 'input_x_dic.pkl'
    >>> data = load_dict(get_dataset_path(file))
    >>> print(data.__class__)
    <class 'dict'>
    >>> s = DataSample(data)
    >>> type(s.data)
    <class 'multimodal.datasets.data_sample.MultiModalArray'>


    Parameters
    ----------
    data : input accepted by MultiModalArray, optional
    kwargs : entries stored in the dictionary

    Attributes
    ----------

    data   : MultiModalArray built from ``data``, or None
    """

    def __init__(self, data=None, **kwargs):
        # The keyword arguments form the dictionary content of the sample.
        super().__init__(kwargs)
        self._data = None
        if data is None:
            return
        self._data = MultiModalArray(data)

    @property
    def data(self):
        """Stored data (None when nothing has been set)."""
        return self._data

    @data.setter
    def data(self, data):
        # Array-like numpy objects and scipy sparse matrices are stored
        # as they are; anything else is rejected.
        array_types = (MultiModalArray, np.ndarray, ma.MaskedArray, np.generic)
        if not (isinstance(data, array_types) or sp.issparse(data)):
            raise TypeError("sample should be a MultiModalArray or numpy array.")
        self._data = data