diff --git a/multimodal/boosting/boost.py b/multimodal/boosting/boost.py index 7de84eeadd8dffbadc1436ce16ba1d1025cf6541..7b039e6f94f37d16cafe9e4999ed51b7c4bda9bd 100644 --- a/multimodal/boosting/boost.py +++ b/multimodal/boosting/boost.py @@ -1,11 +1,14 @@ import numpy as np +import scipy.sparse as sp from abc import ABCMeta from sklearn.utils import check_array, check_X_y, check_random_state from sklearn.tree import DecisionTreeClassifier from sklearn.tree.tree import BaseDecisionTree from sklearn.tree._tree import DTYPE from sklearn.ensemble.forest import BaseForest -from multimodal.datasets.data_sample import DataSample, MultiModalArray +from multimodal.datasets.data_sample import DataSample +from multimodal.datasets.data_sample import MultiModalData, MultiModalArray, MultiModalSparseArray + class UBoosting(metaclass=ABCMeta): """ @@ -22,60 +25,32 @@ class UBoosting(metaclass=ABCMeta): else: check_array(X, accept_sparse=['csr', 'csc']) - if X.shape[1] != self.n_features_: - raise ValueError("X doesn't contain the right number of features.") - return X + if X.ndim < 2: + mes = "Reshape your data" + raise ValueError(mes) + if X.ndim > 1: + if X.shape[1] != self.n_features_: + mes = "Reshape your data" + raise ValueError("Number of features of the model must " + "match the input. Model n_features is %s and " + "input n_features is %s " % (self.n_features_, X.shape[1])) - def _validate_views_ind(self, views_ind, n_features): - """Ensure proper format for views_ind and return number of views.""" - views_ind = np.array(views_ind) - if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: - if np.any(views_ind[:-1] >= views_ind[1:]): - raise ValueError("Values in views_ind must be sorted.") - if views_ind[0] < 0 or views_ind[-1] > n_features: - raise ValueError("Values in views_ind are not in a correct " - + "range for the provided data.") - self.view_mode_ = "slices" - n_views = views_ind.shape[0]-1 - else: - if views_ind.ndim == 1: - if not views_ind.dtype == np.object: - raise ValueError("The format of views_ind is not " - + "supported.") - for ind, val in enumerate(views_ind): - views_ind[ind] = np.array(val) - if not np.issubdtype(views_ind[ind].dtype, np.integer): - raise ValueError("Values in views_ind must be " - + "integers.") - if views_ind[ind].min() < 0 \ - or views_ind[ind].max() >= n_features: - raise ValueError("Values in views_ind are not in a " - + "correct range for the provided " - + "data.") - elif views_ind.ndim == 2: - if not np.issubdtype(views_ind.dtype, np.integer): - raise ValueError("Values in views_ind must be integers.") - if views_ind.min() < 0 or views_ind.max() >= n_features: - raise ValueError("Values in views_ind are not in a " - + "correct range for the provided data.") - else: - raise ValueError("The format of views_ind is not supported.") - self.view_mode_ = "indices" - n_views = views_ind.shape[0] - return (views_ind, n_views) + # + # raise ValueError(mes) + return X def _global_X_transform(self, X, views_ind=None): X_ = None - if isinstance(X, np.ndarray) and X.ndim == 1: - X_= MultiModalArray(X, views_ind) - elif isinstance(X, dict): - X_= MultiModalArray(X) - elif isinstance(X, np.ndarray) and X.ndim > 1: + if isinstance(X, sp.spmatrix): + X_ = MultiModalSparseArray(X, views_ind) + else: X_ = MultiModalArray(X, views_ind) - if not isinstance(X_, MultiModalArray): - raise TypeError("Input format is not reconized") - if hasattr(self, "X_"): - if not self.X_.viexs_ind == views_ind: - raise ValueError("Input format (viewd, features) for fit and predict must be the same") - return X_ \ No newline at end of file + if isinstance(X, MultiModalData): + X_ = X + if not isinstance(X_, MultiModalData): + try: + X_ = np.asarray(X) + except Exception as e: + raise TypeError('Reshape your data') + return X_ diff --git a/multimodal/boosting/cumbo.py b/multimodal/boosting/cumbo.py index 3b224962ec22302561e5f4c170bdb90cdf0e5f73..0f928df746ec55d817ca192ce7ea5ef79cff11c7 100644 --- a/multimodal/boosting/cumbo.py +++ b/multimodal/boosting/cumbo.py @@ -78,13 +78,6 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): If None, the random number generator is the RandomState instance used by `np.random`. - best_view_mode : {"edge", "error"}, optional (default="edge") - Mode used to select the best view at each iteration: - - - if ``best_view_mode == "edge"``, the best view is the view maximizing - the edge value (variable δ (*delta*) in [1]_), - - if ``best_view_mode == "error"``, the best view is the view - minimizing the classification error. Attributes ---------- @@ -120,15 +113,13 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): >>> views_ind = [0, 2, 4] # view 0: sepal data, view 1: petal data >>> clf = MuCumboClassifier(random_state=0) >>> clf.fit(X, y, views_ind) # doctest: +NORMALIZE_WHITESPACE - MumboClassifier(base_estimator=None, best_view_mode='edge', - n_estimators=50, random_state=0) + >>> print(clf.predict([[ 5., 3., 1., 1.]])) [1] >>> views_ind = [[0, 2], [1, 3]] # view 0: length data, view 1: width data >>> clf = MuCumboClassifier(random_state=0) >>> clf.fit(X, y, views_ind) # doctest: +NORMALIZE_WHITESPACE - MumboClassifier(base_estimator=None, best_view_mode='edge', - n_estimators=50, random_state=0) + >>> print(clf.predict([[ 5., 3., 1., 1.]])) [1] @@ -136,13 +127,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): >>> base_estimator = DecisionTreeClassifier(max_depth=2) >>> clf = MuCumboClassifier(base_estimator=base_estimator, random_state=0) >>> clf.fit(X, y, views_ind) # doctest: +NORMALIZE_WHITESPACE - MumboClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, - criterion='gini', max_depth=2, max_features=None, - max_leaf_nodes=None, min_impurity_decrease=0.0, - min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, - min_weight_fraction_leaf=0.0, presort=False, random_state=None, - splitter='best'), - best_view_mode='edge', n_estimators=50, random_state=0) + >>> print(clf.predict([[ 5., 3., 1., 1.]])) [1] @@ -176,7 +161,6 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): base_estimator=base_estimator, n_estimators=n_estimators) self.random_state = random_state - # self.best_view_mode = self._validate_best_view_mode(best_view_mode) def _validate_estimator(self): """Check the estimator and set the base_estimator_ attribute.""" @@ -527,7 +511,8 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): ``classes_``. """ check_is_fitted(self, ("estimators_", "estimator_weights_alpha_","n_views_", - "estimator_weights_beta_", "n_classes_")) + "estimator_weights_beta_", "n_classes_", "X_")) + X = self._global_X_transform(X, views_ind=self.X_.views_ind) X = self._validate_X_predict(X) n_samples = X.shape[0] @@ -581,6 +566,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): """ check_is_fitted(self, ("estimators_", "estimator_weights_alpha_","n_views_", "estimator_weights_beta_", "n_classes_")) + X = self._global_X_transform(X, views_ind=self.X_.views_ind) X = self._validate_X_predict(X) n_samples = X.shape[0] @@ -605,7 +591,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): else: yield np.array(dec_func) - def predict(self, X, views_ind=None): + def predict(self, X): """Predict classes for X. The predicted class of an input sample is computed as the weighted mean @@ -628,7 +614,6 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): ValueError 'X' input matrix must be have the same total number of features of 'X' fit data """ - X = self._global_X_transform(X, views_ind=views_ind) pred = self.decision_function(X) if self.n_classes_ == 2: diff --git a/multimodal/boosting/mumbo.py b/multimodal/boosting/mumbo.py index c2fb588f92208258786c13ebeda75b31138c21d2..f2c522d2467cbbe66486025e32279430835f08bd 100644 --- a/multimodal/boosting/mumbo.py +++ b/multimodal/boosting/mumbo.py @@ -32,6 +32,7 @@ estimator for classification implemented in the ``MumboClassifier`` class. # structure, notations and behavior where possible. import numpy as np + from sklearn.base import ClassifierMixin from sklearn.ensemble import BaseEnsemble from sklearn.ensemble.forest import BaseForest @@ -343,22 +344,23 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): else: dtype = None accept_sparse = ['csr', 'csc'] - if views_ind is None: - if X.shape[1] > 1: - views_ind = np.array([0, X.shape[1]//2, X.shape[1]]) - else: - views_ind = np.array([0, X.shape[1]]) - self.views_ind_, n_views = self._validate_views_ind(views_ind, - X.shape[1]) - self.X_ = self._global_X_transform(X, views_ind=self.views_ind_) + # if views_ind is None: + # if X.shape[1] > 1: + # views_ind = np.array([0, X.shape[1]//2, X.shape[1]]) + # elif X.shape[1]==1: + # views_ind = np.array([0, X.shape[1]]) + # else: + # views_ind = np.array([0]) + self.X_ = self._global_X_transform(X, views_ind=views_ind) + views_ind_, n_views = self.X_._validate_views_ind(self.X_.views_ind, + self.X_.shape[1]) check_X_y(self.X_, y, accept_sparse=accept_sparse, dtype=dtype) check_classification_targets(y) self._validate_estimator() self.classes_, y = np.unique(y, return_inverse=True) self.n_classes_ = len(self.classes_) - self.n_features_ = X.shape[1] - + self.n_features_ = self.X_.shape[1] if self.n_classes_ == 1: # This case would lead to division by 0 when computing the cost # matrix so it needs special handling (but it is an obvious case as @@ -458,7 +460,8 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): ``classes_``. """ check_is_fitted(self, ("estimators_", "estimator_weights_", - "best_views_", "n_classes_", "views_ind_")) + "best_views_", "n_classes_", "X_")) + X = self._global_X_transform(X, views_ind=self.X_.views_ind) X = self._validate_X_predict(X) n_samples = X.shape[0] @@ -504,7 +507,8 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): ``classes_``. """ check_is_fitted(self, ("estimators_", "estimator_weights_", - "n_classes_", "views_ind_")) + "n_classes_", "X_")) + X = self._global_X_transform(X, views_ind=self.X_.views_ind) X = self._validate_X_predict(X) n_samples = X.shape[0] @@ -542,6 +546,7 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): y : numpy.ndarray, shape = (n_samples,) Predicted classes. """ + pred = self.decision_function(X) if self.n_classes_ == 2: @@ -588,8 +593,7 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): ---------- X : {array-like, sparse matrix} of shape = (n_samples, n_features) Multi-view test samples. - Sparse matrix can be CSC, CSR, COO, DOK, or LIL. - COO, DOK and LIL are converted to CSR. + Sparse matrix can be CSC, CSR y : array-like, shape = (n_samples,) True labels for X. diff --git a/multimodal/datasets/__init__.py b/multimodal/datasets/__init__.py index 3bffe2098bb9f74b932253f87be329a69369cef6..8b45df1db9bf6e25008f5a8d1dc918d96a3f480b 100644 --- a/multimodal/datasets/__init__.py +++ b/multimodal/datasets/__init__.py @@ -1,2 +1,2 @@ -from metriclearning.datasets.base import * -from metriclearning.datasets.data_sample import DataSample, Metriclearn_array \ No newline at end of file +from multimodal.datasets.base import * +from multimodal.datasets.data_sample import DataSample, MultiModalArray \ No newline at end of file diff --git a/multimodal/datasets/__pycache__/__init__.cpython-36.pyc b/multimodal/datasets/__pycache__/__init__.cpython-36.pyc index 71e809afc1d58e7dd739fc5d6790139d3a7a4230..78203c1c83371c7711d386af2cea6da83faec7cb 100644 Binary files a/multimodal/datasets/__pycache__/__init__.cpython-36.pyc and b/multimodal/datasets/__pycache__/__init__.cpython-36.pyc differ diff --git a/multimodal/datasets/__pycache__/base.cpython-36.pyc b/multimodal/datasets/__pycache__/base.cpython-36.pyc index c952741223ea079326c463e7176cb64f88b719a4..5d08b5d131a62ac27ac9450f9ba3386c212792bd 100644 Binary files a/multimodal/datasets/__pycache__/base.cpython-36.pyc and b/multimodal/datasets/__pycache__/base.cpython-36.pyc differ diff --git a/multimodal/datasets/__pycache__/data_sample.cpython-36.pyc b/multimodal/datasets/__pycache__/data_sample.cpython-36.pyc index 0a201db1c4aaece161b8408837f5bd47c3c60df1..aad5511d634098208d0e9aa1cb41329d923a21d6 100644 Binary files a/multimodal/datasets/__pycache__/data_sample.cpython-36.pyc and b/multimodal/datasets/__pycache__/data_sample.cpython-36.pyc differ diff --git a/multimodal/datasets/base.py b/multimodal/datasets/base.py index 0e033e7b2b1b2bf75b372b657a1839fc21776ecf..3d58c39c79cb76c331159cf7cfb39513386d239b 100644 --- a/multimodal/datasets/base.py +++ b/multimodal/datasets/base.py @@ -1,7 +1,7 @@ from __future__ import print_function import numpy as np import numpy.ma as ma -from metriclearning.datasets.data_sample import DataSample +from multimodal.datasets.data_sample import DataSample from six.moves import cPickle as pickle #for performance diff --git a/multimodal/datasets/data_sample.py b/multimodal/datasets/data_sample.py index fbcf5aa3b245cfb82ded36cc5fb5bcf33aee160e..ed3ab6d2e95a17b95a758e84db1d71396a090963 100644 --- a/multimodal/datasets/data_sample.py +++ b/multimodal/datasets/data_sample.py @@ -22,11 +22,337 @@ xxxxxxxx xxxx xxxx xxxx the number nbL and nbEx and , the fourth dictionaries for sample, prefix, suffix and factor where they are computed """ +from abc import ABCMeta import numpy as np import numpy.ma as ma +import scipy.sparse as sp +class MultiModalData(metaclass=ABCMeta): -class MultiModalArray(ma.MaskedArray, np.ndarray): + @staticmethod + def _first_validate_views_ind(views_ind, n_features): + """Ensure proper format for views_ind and return number of views.""" + views_ind = np.array(views_ind) + + if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: + if len(views_ind) > 2 and np.any(views_ind[:-1] >= views_ind[1:]): + raise ValueError("Values in views_ind must be sorted.") + if views_ind[0] < 0 or views_ind[-1] > n_features: + raise ValueError("Values in views_ind are not in a correct " + + "range for the provided data.") + view_mode_ = "slices" + n_views = views_ind.shape[0]-1 + else: + if views_ind.ndim == 1: + if not views_ind.dtype == np.object: + raise ValueError("The format of views_ind is not " + + "supported.") + for ind, val in enumerate(views_ind): + views_ind[ind] = np.array(val) + if not np.issubdtype(views_ind[ind].dtype, np.integer): + raise ValueError("Values in views_ind must be " + + "integers.") + if views_ind[ind].min() < 0 \ + or views_ind[ind].max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided " + + "data.") + elif views_ind.ndim == 2: + if not np.issubdtype(views_ind.dtype, np.integer): + raise ValueError("Values in views_ind must be integers.") + if views_ind.min() < 0 or views_ind.max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided data.") + else: + raise ValueError("The format of views_ind is not supported.") + view_mode_ = "indices" + n_views = views_ind.shape[0] + return (views_ind, n_views, view_mode_) + + def _extract_view(self, ind_view): + """Extract the view for the given index ind_view from the dataset X.""" + if self.view_mode_ == "indices": + return self[:, self.views_ind[ind_view]] + else: + return self[:, self.views_ind[ind_view]:self.views_ind[ind_view+1]] + + def _validate_views_ind(self, views_ind, n_features): + """Ensure proper format for views_ind and return number of views.""" + views_ind = np.array(views_ind) + + if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: + if len(views_ind) > 2 and np.any(views_ind[:-1] >= views_ind[1:]): + raise ValueError("Values in views_ind must be sorted.") + if views_ind[0] < 0 or views_ind[-1] > n_features: + raise ValueError("Values in views_ind are not in a correct " + + "range for the provided data.") + self.view_mode_ = "slices" + n_views = views_ind.shape[0]-1 + else: + if views_ind.ndim == 1: + if not views_ind.dtype == np.object: + raise ValueError("The format of views_ind is not " + + "supported.") + for ind, val in enumerate(views_ind): + views_ind[ind] = np.array(val) + if not np.issubdtype(views_ind[ind].dtype, np.integer): + raise ValueError("Values in views_ind must be " + + "integers.") + if views_ind[ind].min() < 0 \ + or views_ind[ind].max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided " + + "data.") + elif views_ind.ndim == 2: + if not np.issubdtype(views_ind.dtype, np.integer): + raise ValueError("Values in views_ind must be integers.") + if views_ind.min() < 0 or views_ind.max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided data.") + else: + raise ValueError("The format of views_ind is not supported.") + self.view_mode_ = "indices" + n_views = views_ind.shape[0] + self.views_ind = views_ind + self.n_views = n_views + return (views_ind, n_views) + +class MultiModalSparseInfo(): + + def __init__(self, data, view_ind=None): + """Constructor of Metriclearn_array""" + shapes_int = [] + index = 0 + new_data = np.ndarray([]) + n_views = data.size + thekeys = None + # view_ind_self = None + view_mode = 'slices' + + if (sp.issparse(data)) and data.ndim > 1: + if view_ind is not None: + try: + view_ind = np.asarray(view_ind) + except : + raise TypeError("n_views should be list or nparray") + elif view_ind is None: + if data.shape[1] > 1: + view_ind = np.array([0, data.shape[1]//2, data.shape[1]]) + else: + view_ind = np.array([0, data.shape[1]]) + + new_data = data + # view_ind_self = view_ind + view_ind, n_views, view_mode = self._first_validate_views_ind(view_ind, + data.shape[1]) + if view_ind.ndim == 1 and view_mode.startswith("slicing"): + shapes_int = [in2 - in1 for in1, in2 in zip(view_ind, view_ind[1:])] + + if data.shape[0] < 1 or data.shape[1] < 1: + raise ValueError("input data shouldbe not empty") + self.view_mode_ = view_mode + self.views_ind = view_ind + self.shapes_int = shapes_int + self.n_views = n_views + +class MultiModalSparseArray(sp.csr_matrix, sp.csc_matrix, MultiModalSparseInfo, MultiModalData): + """ + MultiModalArray inherit from numpy ndarray + + + Parameters + ---------- + + data : can be + - dictionary of multiview array with shape = (n_samples, n_features) for multi-view + for each view. + {0: array([[]], + 1: array([[]], + ...} + - numpy array like with shape = (n_samples, n_features) for multi-view + for each view. + [[[...]], + [[...]], + ...] + - {array like} with (n_samples, nviews * n_features) with 'views_ind' diferent to 'None' + for Multi-view input samples. + + + + + views_ind : array-like (default= None ) if None + [0, n_features//2, n_features]) is constructed (2 views) + Paramater specifying how to extract the data views from X: + + - views_ind is a 1-D array of sorted integers, the entries + indicate the limits of the slices used to extract the views, + where view ``n`` is given by + ``X[:, views_ind[n]:views_ind[n+1]]``. + + Attributes + ---------- + + view_ind : list of views' indice (may be None) + + n_views : int number of views + + shapes_int: list of int numbers of feature for each views + + keys : name of key, where data come from a dictionary + + + :Example: + + >>> from multimodal.datasets.base import load_dict + >>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path + >>> from multimodal.datasets.data_sample import DataSample + >>> file = 'input_x_dic.pkl' + >>> data = load_dict(get_dataset_path(file)) + + """ + + def __init__(self, *arg, **kwargs ): + """Constructor of Metriclearn_array""" + if sp.issparse(arg[0]): + MultiModalSparseInfo.__init__(self, *arg) + if isinstance(arg[0], sp.csr_matrix) : + sp.csr_matrix.__init__(self, arg[0]) + elif isinstance(arg[0], sp.csc_matrix): + sp.csc_matrix.__init__(self, arg[0]) + else: + raise TypeError("This sparse format is not supported") + else: + if isinstance(self,sp.csr_matrix): + sp.csr_matrix.__init__(self, *arg, **kwargs) + elif isinstance(self, sp.csc_matrix): + sp.csc_matrix.__init__(self, *arg, **kwargs) + + +# class MultiModalSparseArray(sp.csr_matrix, sp.csc_matrix, MultiModalData): +# """ +# MultiModalArray inherit from numpy ndarray +# +# +# Parameters +# ---------- +# +# data : can be +# - dictionary of multiview array with shape = (n_samples, n_features) for multi-view +# for each view. +# {0: array([[]], +# 1: array([[]], +# ...} +# - numpy array like with shape = (n_samples, n_features) for multi-view +# for each view. +# [[[...]], +# [[...]], +# ...] +# - {array like} with (n_samples, nviews * n_features) with 'views_ind' diferent to 'None' +# for Multi-view input samples. +# +# +# +# +# views_ind : array-like (default= None ) if None +# [0, n_features//2, n_features]) is constructed (2 views) +# Paramater specifying how to extract the data views from X: +# +# - views_ind is a 1-D array of sorted integers, the entries +# indicate the limits of the slices used to extract the views, +# where view ``n`` is given by +# ``X[:, views_ind[n]:views_ind[n+1]]``. +# +# Attributes +# ---------- +# +# view_ind : list of views' indice (may be None) +# +# n_views : int number of views +# +# shapes_int: list of int numbers of feature for each views +# +# keys : name of key, where data come from a dictionary +# +# +# :Example: +# +# >>> from multimodal.datasets.base import load_dict +# >>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path +# >>> from multimodal.datasets.data_sample import DataSample +# >>> file = 'input_x_dic.pkl' +# >>> data = load_dict(get_dataset_path(file)) +# >>> print(data.__class__) +# <class 'dict'> +# >>> multiviews = MultiModalArray(data) +# >>> multiviews.shape +# (120, 240) +# >>> multiviews.keys +# dict_keys([0, 1]) +# >>> multiviews.shapes_int +# [120, 120] +# >>> multiviews.n_views +# 2 +# +# +# """ +# +# def __init__(self, data, view_ind=None, shape=None, dtype=None, copy=False): +# """Constructor of Metriclearn_array""" +# shapes_int = [] +# index = 0 +# new_data = np.ndarray([]) +# n_views = 1 +# thekeys = None +# # view_ind_self = None +# view_mode = 'slices' +# if isinstance(data, tuple) and len(data) == 3: +# data_data = data[0] +# indices = data[1] +# indptr = data[2] +# data_shape = shape +# else: +# if shape is None: +# data_shape = data.shape +# if dtype is None: +# dtype = data.dtype +# data_data = data.data +# data_indices = data.indices +# data_indptr = data.indptr +# if (sp.issparse(data)) and data.ndim > 1: +# if view_ind is not None: +# try: +# view_ind = np.asarray(view_ind) +# except : +# raise TypeError("n_views should be list or nparray") +# elif view_ind is None: +# if data.shape[1] > 1: +# view_ind = np.array([0, data.shape[1]//2, data.shape[1]]) +# else: +# view_ind = np.array([0, data.shape[1]]) +# +# new_data = data +# # view_ind_self = view_ind +# view_ind, n_views, view_mode = self._first_validate_views_ind(view_ind, +# data_shape[1]) +# if view_ind.ndim == 1 and view_mode.startswith("slicing"): +# shapes_int = [in2 - in1 for in1, in2 in zip(view_ind, view_ind[1:])] +# if isinstance(data, sp.csr_matrix) : +# sp.csr_matrix.__init__(self, (data_data, data_indices, data_indptr), shape=data_shape) +# #sp.csr_matrix.__init__(self, data) +# elif isinstance(data, sp.csc_matrix): +# sp.csc_matrix.__init__(self, (data_data, data_indices, data_indptr), shape=data_shape) +# #sp.csc_matrix.__init__(self, data) +# else: +# raise TypeError("This sparse format is not supported") +# if self.shape[0] < 1 or self.shape[1] < 1: +# raise ValueError("input data shouldbe not empty") +# self.view_mode_ = view_mode +# self.views_ind = view_ind +# self.shapes_int = shapes_int +# self.n_views = n_views + + +class MultiModalArray(np.ndarray, MultiModalData): """ MultiModalArray inherit from numpy ndarray @@ -98,9 +424,10 @@ class MultiModalArray(ma.MaskedArray, np.ndarray): shapes_int = [] index = 0 new_data = np.ndarray([]) - n_views = len(data) + n_views = 1 thekeys = None - view_ind_self = None + # view_ind_self = None + view_mode = 'slices' if isinstance(data, dict): n_views = len(data) for key, dat_values in data.items(): @@ -110,38 +437,65 @@ class MultiModalArray(ma.MaskedArray, np.ndarray): thekeys = data.keys() if isinstance(data, np.ndarray) and view_ind is None and data.ndim == 1: n_views = data.shape[0] + view_ind = np.empty(n_views+1) + view_ind[0] = 0 for dat_values in data: + try: + dat_values = np.array(dat_values) + except: + raise TypeError("input format is not supported") shapes_int.append(dat_values.shape[1]) + view_ind[index+1] = dat_values.shape[1] + view_ind[index] new_data = cls._populate_new_data(index, dat_values, new_data) index += 1 - elif isinstance(data, np.ndarray) and data.ndim > 1: + elif (isinstance(data, np.ndarray) ) and data.ndim > 1: + try: + data = np.asarray(data) + except: + raise TypeError("input format is not supported") + if view_ind is not None: try: view_ind = np.asarray(view_ind) except : raise TypeError("n_views should be list or nparray") - n_views = view_ind.shape[0] - 1 elif view_ind is None: if data.shape[1] > 1: view_ind = np.array([0, data.shape[1]//2, data.shape[1]]) else: view_ind = np.array([0, data.shape[1]]) - view_ind, n_views = cls._first_validate_views_ind(view_ind, - data.shape[1]) - shapes_int = [ in2-in1 for in1, in2 in zip(view_ind, view_ind[1: ])] new_data = data - view_ind_self = view_ind - + else: + try: + new_data = np.asarray(data) + if new_data.ndim == 1: + new_data = new_data.reshape(1, new_data.shape[0]) + view_ind = np.array([0, new_data.shape[1]]) + except Exception as e: + raise ValueError('Reshape your data') + + # view_ind_self = view_ind + # if new_data.shape[1] < 1: + # msg = ("%d feature\(s\) \\(shape=\%s\) while a minimum of \\d* " + # "is required.") % (new_data.shape[1], str(new_data.shape)) + # # "%d feature\(s\) \(shape=\(%d, %d\)\) while a minimum of \d* is required." % (new_data.shape[1], new_data.shape[0], new_data.shape[1]) + # raise ValueError(msg) + view_ind, n_views, view_mode = cls._first_validate_views_ind(view_ind, + new_data.shape[1]) + if view_ind.ndim == 1 and view_mode.startswith("slicing"): + shapes_int = [in2 - in1 for in1, in2 in zip(view_ind, view_ind[1:])] # obj = ma.MaskedArray.__new(new_data) # new_data.view() a.MaskedArray(new_data, mask=new_data.mask).view(cls) # bj = super(Metriclearn_array, cls).__new__(cls, new_data.data, new_data.mask) + if hasattr(new_data, "mask"): obj = ma.masked_array(new_data.data, new_data.mask).view(cls) elif hasattr(new_data, "data") and \ hasattr(new_data, "shape") and len(new_data.shape) > 0: - obj = np.asarray(new_data.data).view(cls) + obj = np.asarray(new_data.data).view(cls) else: - obj = np.recarray.__new__(cls, shape=(), dtype=np.float) - obj.views_ind = view_ind_self + obj = np.recarray.__new__(cls, shape=(0, 0), dtype=np.float) + obj.view_mode_ = view_mode + obj.views_ind = view_ind obj.shapes_int = shapes_int obj.n_views = n_views obj.keys = thekeys @@ -150,47 +504,60 @@ class MultiModalArray(ma.MaskedArray, np.ndarray): @staticmethod def _populate_new_data(index, dat_values, new_data): if index == 0: - if isinstance(dat_values, ma.MaskedArray) or isinstance(dat_values, np.ndarray): + if isinstance(dat_values, ma.MaskedArray) or \ + isinstance(dat_values, np.ndarray) or sp.issparse(dat_values): new_data = dat_values else: - new_data = dat_values.view(ma.MaskedArray) # ma.masked_array(dat_values, mask=ma.nomask) dat_values.view(ma.MaskedArray) #( - new_data.mask = ma.nomask + new_data = dat_values.view(np.ndarray) # ma.masked_array(dat_values, mask=ma.nomask) dat_values.view(ma.MaskedArray) #( + # new_data.mask = ma.nomask else: - if isinstance(dat_values, ma.MaskedArray) or isinstance(dat_values, np.ndarray): + if isinstance(dat_values, np.ndarray): + new_data = np.hstack((new_data, dat_values)) + elif isinstance(dat_values, ma.MaskedArray): new_data = ma.hstack((new_data, dat_values)) + elif sp.issparse(dat_values): + new_data = sp.hstack((new_data, dat_values)) else: - new_data = ma.hstack((new_data, dat_values.view(ma.MaskedArray) ) ) # ma.masked_array(dat_values, mask=ma.nomask + new_data = np.hstack((new_data, dat_values.view(np.ndarray) ) ) # ma.masked_array(dat_values, mask=ma.nomask return new_data def __array_finalize__(self, obj): if obj is None: return - super(MultiModalArray, self).__array_finalize__(obj) + # super(MultiModalArray, self).__array_finalize__(obj) self.shapes_int = getattr(obj, 'shapes_int', None) self.n_views = getattr(obj, 'n_views', None) self.keys = getattr(obj, 'keys', None) self.views_ind = getattr(obj, 'views_ind', None) + self.view_mode_ = getattr(obj, 'view_mode_', None) + + def __reduce__(self): + # Get the parent's __reduce__ tuple + pickled_state = super(MultiModalArray, self).__reduce__() + # Create our own tuple to pass to __setstate__ + new_state = pickled_state[2] + (self.__dict__,) + # Return a tuple that replaces the parent's __setstate__ tuple with our own + return (pickled_state[0], pickled_state[1], new_state) + + def __setstate__(self, state): + self.__dict__.update(state[-1]) + super(MultiModalArray, self).__setstate__(state[0:-1]) def get_col(self, view, col): start = np.sum(np.asarray(self.shapes_int[0: view])) - return self.data[start+col, :] + return self[start+col, :] def get_view(self, view): start = int(np.sum(np.asarray(self.shapes_int[0: view]))) stop = int(start + self.shapes_int[view]) - return self.data[:, start:stop] + return self[:, start:stop] + - def _extract_view(self, ind_view): - """Extract the view for the given index ind_view from the dataset X.""" - if self.view_mode_ == "indices": - return self.data[:, self.views_ind[ind_view]] - else: - return self.data[:, self.views_ind[ind_view]:self.views_ind[ind_view+1]] def set_view(self, view, data): start = int(np.sum(np.asarray(self.shapes_int[0: view]))) stop = int(start + self.shapes_int[view]) if stop-start == data.shape[0] and data.shape[1]== self.data.shape[1]: - self.data[:, start:stop] = data + self[:, start:stop] = data else: raise ValueError( "shape of data does not match (%d, %d)" %stop-start %self.data.shape[1]) @@ -220,63 +587,8 @@ class MultiModalArray(ma.MaskedArray, np.ndarray): dico[view] = self.get_view(view) return dico - @staticmethod - def _first_validate_views_ind(views_ind, n_features): - """Ensure proper format for views_ind and return number of views.""" - views_ind = np.array(views_ind) - if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: - if np.any(views_ind[:-1] >= views_ind[1:]): - raise ValueError("Values in views_ind must be sorted.") - if views_ind[0] < 0 or views_ind[-1] > n_features: - raise ValueError("Values in views_ind are not in a correct " - + "range for the provided data.") - n_views = views_ind.shape[0]-1 - else: - raise ValueError("The format of views_ind is not " - + "supported.") - - return (views_ind, n_views) - def _validate_views_ind(self, views_ind, n_features): - """Ensure proper format for views_ind and return number of views.""" - views_ind = np.array(views_ind) - if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: - if np.any(views_ind[:-1] >= views_ind[1:]): - raise ValueError("Values in views_ind must be sorted.") - if views_ind[0] < 0 or views_ind[-1] > n_features: - raise ValueError("Values in views_ind are not in a correct " - + "range for the provided data.") - self.view_mode_ = "slices" - n_views = views_ind.shape[0]-1 - else: - if views_ind.ndim == 1: - if not views_ind.dtype == np.object: - raise ValueError("The format of views_ind is not " - + "supported.") - for ind, val in enumerate(views_ind): - views_ind[ind] = np.array(val) - if not np.issubdtype(views_ind[ind].dtype, np.integer): - raise ValueError("Values in views_ind must be " - + "integers.") - if views_ind[ind].min() < 0 \ - or views_ind[ind].max() >= n_features: - raise ValueError("Values in views_ind are not in a " - + "correct range for the provided " - + "data.") - elif views_ind.ndim == 2: - if not np.issubdtype(views_ind.dtype, np.integer): - raise ValueError("Values in views_ind must be integers.") - if views_ind.min() < 0 or views_ind.max() >= n_features: - raise ValueError("Values in views_ind are not in a " - + "correct range for the provided data.") - else: - raise ValueError("The format of views_ind is not supported.") - self.view_mode_ = "indices" - n_views = views_ind.shape[0] - self.views_ind = views_ind - self.n_views = n_views - return (views_ind, n_views) class DataSample(dict): """ @@ -285,9 +597,9 @@ class DataSample(dict): :Example: - >>> from metriclearning.datasets.base import load_dict - >>> from metriclearning.tests.datasets.get_dataset_path import get_dataset_path - >>> from metriclearning.datasets.data_sample import DataSample + >>> from multimodal.datasets.base import load_dict + >>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path + >>> from multimodal.datasets.data_sample import DataSample >>> file = 'input_x_dic.pkl' >>> data = load_dict(get_dataset_path(file)) >>> print(data.__class__) @@ -315,20 +627,20 @@ class DataSample(dict): # The dictionary that contains the sample super(DataSample, self).__init__(kwargs) - self._data = None # Metriclearn_array(np.zeros((0,0))) + self._data = None # Metriclearn_arrayMultiModalArray(np.zeros((0,0))) if data is not None: self._data = MultiModalArray(data) @property def data(self): - """Metriclearn_array""" + """MultiModalArray""" return self._data @data.setter def data(self, data): - if isinstance(data, (MultiModalArray, np.ndarray, ma.MaskedArray, np.generic)): + if isinstance(data, (MultiModalArray, np.ndarray, ma.MaskedArray, np.generic)) or sp.issparse(data): self._data = data else: raise TypeError("sample should be a MultiModalArray or numpy array.") diff --git a/multimodal/kernels/__init__.py b/multimodal/kernels/__init__.py index 864783655979849bd72c67e1f456694c071e9c72..7d480451ff5477e6e13978711c334e3d3be30e26 100644 --- a/multimodal/kernels/__init__.py +++ b/multimodal/kernels/__init__.py @@ -1 +1 @@ -__all__ = ['MVML'] +__all__ = ['MVML', 'MKernel', 'MVML'] diff --git a/multimodal/kernels/lpMKL.py b/multimodal/kernels/lpMKL.py index bdcfe13becea9c0bcfcde3009bbd24b488698ffa..1eca2721007b00b8709e532292f0e5c0f58f8e12 100644 --- a/multimodal/kernels/lpMKL.py +++ b/multimodal/kernels/lpMKL.py @@ -5,7 +5,7 @@ from sklearn.utils.multiclass import unique_labels from sklearn.utils.validation import check_X_y from sklearn.utils.validation import check_array from sklearn.utils.validation import check_is_fitted -from metriclearning.mkernel import MKernel +from multimodal.kernels.mkernel import MKernel class MKL(BaseEstimator, ClassifierMixin, MKernel): diff --git a/multimodal/kernels/mvml.py b/multimodal/kernels/mvml.py index d42e9a462863ad20bdfedf2e48b75eed3db91d65..535d97480e872798d5c783c893e49fc408173560 100644 --- a/multimodal/kernels/mvml.py +++ b/multimodal/kernels/mvml.py @@ -10,8 +10,8 @@ from sklearn.utils.validation import check_X_y from sklearn.utils.validation import check_array from sklearn.metrics.pairwise import check_pairwise_arrays from sklearn.utils.validation import check_is_fitted -from metriclearning.datasets.data_sample import DataSample, Metriclearn_array -from metriclearning.mkernel import MKernel +from multimodal.datasets.data_sample import DataSample, MultiModalArray +from multimodal.kernels.mkernel import MKernel """ Copyright (C) 2018 Riikka Huusari @@ -196,7 +196,9 @@ class MVML(MKernel, BaseEstimator, ClassifierMixin): # Return the classifier self.learn_mvml(learn_A=self.learn_A, learn_w=self.learn_w, n_loops=self.n_loops) if self.warning_message: - print("warning appears during fit process", self.warning_message) + import logging + logging.warning("warning appears during fit process" + str(self.warning_message)) + # print("warning appears during fit process", self.warning_message) return self def learn_mvml(self, learn_A=1, learn_w=0, n_loops=6): diff --git a/multimodal/tests/test.py b/multimodal/tests/test.py new file mode 100644 index 0000000000000000000000000000000000000000..9a68d84f26be86e9fa5ac7f7791070d22b63c10f --- /dev/null +++ b/multimodal/tests/test.py @@ -0,0 +1,224 @@ + +from abc import ABCMeta +import numpy as np +import numpy.ma as ma +import scipy.sparse as sp + +from multimodal.boosting.mumbo import MumboClassifier + +class MultiModalData(metaclass=ABCMeta): + + @staticmethod + def _first_validate_views_ind(views_ind, n_features): + """Ensure proper format for views_ind and return number of views.""" + views_ind = np.array(views_ind) + if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: + if np.any(views_ind[:-1] >= views_ind[1:]): + raise ValueError("Values in views_ind must be sorted.") + if views_ind[0] < 0 or views_ind[-1] > n_features: + raise ValueError("Values in views_ind are not in a correct " + + "range for the provided data.") + view_mode_ = "slices" + n_views = views_ind.shape[0]-1 + else: + if views_ind.ndim == 1: + if not views_ind.dtype == np.object: + raise ValueError("The format of views_ind is not " + + "supported.") + for ind, val in enumerate(views_ind): + views_ind[ind] = np.array(val) + if not np.issubdtype(views_ind[ind].dtype, np.integer): + raise ValueError("Values in views_ind must be " + + "integers.") + if views_ind[ind].min() < 0 \ + or views_ind[ind].max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided " + + "data.") + elif views_ind.ndim == 2: + if not np.issubdtype(views_ind.dtype, np.integer): + raise ValueError("Values in views_ind must be integers.") + if views_ind.min() < 0 or views_ind.max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided data.") + else: + raise ValueError("The format of views_ind is not supported.") + view_mode_ = "indices" + n_views = views_ind.shape[0] + return (views_ind, n_views, view_mode_) + + def _extract_view(self, ind_view): + """Extract the view for the given index ind_view from the dataset X.""" + if self.view_mode_ == "indices": + return self[:, self.views_ind[ind_view]] + else: + return self[:, self.views_ind[ind_view]:self.views_ind[ind_view+1]] + + def _validate_views_ind(self, views_ind, n_features): + """Ensure proper format for views_ind and return number of views.""" + views_ind = np.array(views_ind) + if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: + if np.any(views_ind[:-1] >= views_ind[1:]): + raise ValueError("Values in views_ind must be sorted.") + if views_ind[0] < 0 or views_ind[-1] > n_features: + raise ValueError("Values in views_ind are not in a correct " + + "range for the provided data.") + self.view_mode_ = "slices" + n_views = views_ind.shape[0]-1 + else: + if views_ind.ndim == 1: + if not views_ind.dtype == np.object: + raise ValueError("The format of views_ind is not " + + "supported.") + for ind, val in enumerate(views_ind): + views_ind[ind] = np.array(val) + if not np.issubdtype(views_ind[ind].dtype, np.integer): + raise ValueError("Values in views_ind must be " + + "integers.") + if views_ind[ind].min() < 0 \ + or views_ind[ind].max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided " + + "data.") + elif views_ind.ndim == 2: + if not np.issubdtype(views_ind.dtype, np.integer): + raise ValueError("Values in views_ind must be integers.") + if views_ind.min() < 0 or views_ind.max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided data.") + else: + raise ValueError("The format of views_ind is not supported.") + self.view_mode_ = "indices" + n_views = views_ind.shape[0] + self.views_ind = views_ind + self.n_views = n_views + return (views_ind, n_views) + +class MultiModalSparseInfo(): + + def __init__(self, data, view_ind=None): + """Constructor of Metriclearn_array""" + shapes_int = [] + index = 0 + new_data = np.ndarray([]) + n_views = data.size + thekeys = None + # view_ind_self = None + view_mode = 'slices' + + if (sp.issparse(data)) and data.ndim > 1: + if view_ind is not None: + try: + view_ind = np.asarray(view_ind) + except : + raise TypeError("n_views should be list or nparray") + elif view_ind is None: + if data.shape[1] > 1: + view_ind = np.array([0, data.shape[1]//2, data.shape[1]]) + else: + view_ind = np.array([0, data.shape[1]]) + + new_data = data + # view_ind_self = view_ind + view_ind, n_views, view_mode = self._first_validate_views_ind(view_ind, + data.shape[1]) + if view_ind.ndim == 1 and view_mode.startswith("slicing"): + shapes_int = [in2 - in1 for in1, in2 in zip(view_ind, view_ind[1:])] + + if data.shape[0] < 1 or data.shape[1] < 1: + raise ValueError("input data shouldbe not empty") + self.view_mode_ = view_mode + self.views_ind = view_ind + self.shapes_int = shapes_int + self.n_views = n_views + + +class MultiModalSparseArray(sp.csr_matrix, sp.csc_matrix, MultiModalSparseInfo, MultiModalData): + """ + MultiModalArray inherit from numpy ndarray + + + Parameters + ---------- + + data : can be + - dictionary of multiview array with shape = (n_samples, n_features) for multi-view + for each view. + {0: array([[]], + 1: array([[]], + ...} + - numpy array like with shape = (n_samples, n_features) for multi-view + for each view. + [[[...]], + [[...]], + ...] + - {array like} with (n_samples, nviews * n_features) with 'views_ind' diferent to 'None' + for Multi-view input samples. + + + + + views_ind : array-like (default= None ) if None + [0, n_features//2, n_features]) is constructed (2 views) + Paramater specifying how to extract the data views from X: + + - views_ind is a 1-D array of sorted integers, the entries + indicate the limits of the slices used to extract the views, + where view ``n`` is given by + ``X[:, views_ind[n]:views_ind[n+1]]``. + + Attributes + ---------- + + view_ind : list of views' indice (may be None) + + n_views : int number of views + + shapes_int: list of int numbers of feature for each views + + keys : name of key, where data come from a dictionary + + + :Example: + + >>> from multimodal.datasets.base import load_dict + >>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path + >>> from multimodal.datasets.data_sample import DataSample + >>> file = 'input_x_dic.pkl' + >>> data = load_dict(get_dataset_path(file)) + + """ + + def __init__(self, *arg, **kwargs ): + """Constructor of Metriclearn_array""" + if sp.issparse(arg[0]): + MultiModalSparseInfo.__init__(self, *arg) + if isinstance(arg[0], sp.csr_matrix) : + sp.csr_matrix.__init__(self, arg[0]) + elif isinstance(arg[0], sp.csc_matrix): + sp.csc_matrix.__init__(self, arg[0]) + else: + raise TypeError("This sparse format is not supported") + else: + if isinstance(self,sp.csr_matrix): + sp.csr_matrix.__init__(self, *arg, **kwargs) + elif isinstance(self, sp.csc_matrix): + sp.csc_matrix.__init__(self, *arg, **kwargs) + + + + +if __name__ == '__main__': + rng = np.random.RandomState(0) + X = rng.rand(40, 10) + X[X < .8] = 0 + X_csr = sp.csr_matrix(X) + y = (4 * rng.rand(40)).astype(np.int) + X_ = MultiModalSparseArray(X_csr) + print(X_.shape) + print(X_[:,0:1]) + + X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1]]) + y = [1, 1, 1, 2, 2, 2] + clf = MumboClassifier() + clf.fit(X, y) \ No newline at end of file diff --git a/multimodal/tests/test_cumbo.py b/multimodal/tests/test_cumbo.py index d8a9e008cd7b5fc14cd0fc243b7222ed897e56d4..153bc94f3ffbb2fde17a142eeff3534cedfb6d2d 100644 --- a/multimodal/tests/test_cumbo.py +++ b/multimodal/tests/test_cumbo.py @@ -17,7 +17,7 @@ from sklearn import datasets from multimodal.boosting.cumbo import MuCumboClassifier from multimodal.tests.data.get_dataset_path import get_dataset_path - +from multimodal.datasets.data_sample import MultiModalArray class TestMuCumboClassifier(unittest.TestCase): @@ -909,11 +909,11 @@ class TestMuCumboClassifier(unittest.TestCase): (self.iris.data, target_two_classes, self.iris.views_ind), (self.iris.data, target_two_classes, np.array([[0, 2], [1, 3]])), ) + # for X, y, views_ind in data: clf = MuCumboClassifier(n_estimators=n_estimators, random_state=seed) clf.fit(X, y, views_ind) - staged_dec_func = [dec_f for dec_f in clf.staged_decision_function(X)] staged_predict = [predict for predict in clf.staged_predict(X)] staged_score = [score for score in clf.staged_score(X, y)] diff --git a/multimodal/tests/test_data_sample.py b/multimodal/tests/test_data_sample.py index 43e35400ba6c96affc34159b6d14109ff892ef11..04c6b49e39fc637eddd5e238c323a3802ccf586b 100644 --- a/multimodal/tests/test_data_sample.py +++ b/multimodal/tests/test_data_sample.py @@ -1,9 +1,9 @@ import unittest import numpy as np -from metriclearning.datasets.base import load_dict -from metriclearning.tests.datasets.get_dataset_path import get_dataset_path -from metriclearning.datasets.data_sample import Metriclearn_array +from multimodal.datasets.base import load_dict +from multimodal.tests.datasets.get_dataset_path import get_dataset_path +from multimodal.datasets.data_sample import MultiModalArray import pickle class UnitaryTest(unittest.TestCase): @@ -29,12 +29,12 @@ class UnitaryTest(unittest.TestCase): def testGet_view(self): - a = Metriclearn_array(self.kernel_dict) + a = MultiModalArray(self.kernel_dict) np.testing.assert_almost_equal(a.get_view(0), self.kernel_dict[0], 8) np.testing.assert_almost_equal(a.get_view(1), self.kernel_dict[1], 8) def test_init_Metriclearn_array(self): - a = Metriclearn_array(self.kernel_dict) + a = MultiModalArray(self.kernel_dict) self.assertEqual(a.shape, (120, 240)) self.assertEqual(a.shapes_int, [120, 120]) self.assertEqual(a.n_views, 2) @@ -42,9 +42,9 @@ class UnitaryTest(unittest.TestCase): self.assertEqual(a.keys, dict_key.keys()) def test_init_Array(self): - a = Metriclearn_array(self.kernel_dict) + a = MultiModalArray(self.kernel_dict) array_x = a.data - b = Metriclearn_array(a) + b = MultiModalArray(a) np.testing.assert_equal(b.views_ind, np.array([0, 120, 240])) diff --git a/multimodal/tests/test_mkl.py b/multimodal/tests/test_mkl.py index fffbac516cc2bf6a85f1589ab69cecf4ef95459a..16008f9aab80a24aca683f4dd27f2838352da818 100644 --- a/multimodal/tests/test_mkl.py +++ b/multimodal/tests/test_mkl.py @@ -3,9 +3,9 @@ import unittest import numpy as np from sklearn.metrics.pairwise import rbf_kernel -from metriclearning.tests.datasets.get_dataset_path import get_dataset_path -from metriclearning.lpMKL import MKL -from metriclearning.datasets.data_sample import Metriclearn_array +from multimodal.tests.datasets.get_dataset_path import get_dataset_path +from multimodal.kernels.lpMKL import MKL +from multimodal.datasets.data_sample import MultiModalArray import pickle from sklearn.exceptions import NotFittedError @@ -70,7 +70,7 @@ class MKLTest(unittest.TestCase): ####################################################### # mvml = MVML.fit(self.kernel_dict, self.y) w_expected = np.array([[0.5], [0.5]]) - x_metricl = Metriclearn_array(self.kernel_dict) + x_metricl = MultiModalArray(self.kernel_dict) mkl2 = MKL(lmbda=3, m_param = 0.3, kernel=['rbf'], kernel_params=[{'gamma':50}], use_approx = True, precision = 1E0, n_loops = 50) @@ -83,7 +83,7 @@ class MKLTest(unittest.TestCase): ####################################################### # mvml = MVML.fit(self.kernel_dict, self.y) w_expected = np.array([[0.5], [0.5]]) - x_metricl = Metriclearn_array(self.kernel_dict) + x_metricl = MultiModalArray(self.kernel_dict) mkl2 = MKL(lmbda=3, m_param = 0.3, kernel="precomputed", use_approx = True, precision = 1E-9, n_loops = 600) @@ -97,7 +97,7 @@ class MKLTest(unittest.TestCase): mkl.predict(self.test_kernel_dict) def testPredictMVML_witoutFit(self): - x_metric = Metriclearn_array(self.kernel_dict) + x_metric = MultiModalArray(self.kernel_dict) mkl = MKL(lmbda=3, m_param = 0.3, kernel=['rbf'], kernel_params=[{'gamma':50}], use_approx = True, precision = 1E-9, n_loops = 50) diff --git a/multimodal/tests/test_mumbo.py b/multimodal/tests/test_mumbo.py index f5b3924b62b6ae6541b4db32467338cacaeb4c92..978244fa63a653500330d0430903ceef21cf61dd 100644 --- a/multimodal/tests/test_mumbo.py +++ b/multimodal/tests/test_mumbo.py @@ -35,7 +35,7 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.cluster import KMeans from sklearn.tree import DecisionTreeClassifier from sklearn import datasets -from multimodalboost.mumbo import MumboClassifier +from multimodal.boosting.mumbo import MumboClassifier class TestMuCumboClassifier(unittest.TestCase): @@ -47,6 +47,15 @@ class TestMuCumboClassifier(unittest.TestCase): iris.views_ind = np.array([0, 2, 4]) clf.iris = iris + def test_sparse(self): + rng = np.random.RandomState(0) + X = rng.rand(40, 10) + X[X < .8] = 0 + X_csr = csr_matrix(X) + clf = MumboClassifier() + y = (4 * rng.rand(40)).astype(np.int) + clf.fit(X_csr, y) + def test_init_var(self): n_classes = 3 @@ -318,7 +327,7 @@ class TestMuCumboClassifier(unittest.TestCase): np.random.seed(seed) n_estimators = 10 - + #print("iris views ind", self.iris.views_ind) clf = MumboClassifier(n_estimators=n_estimators, best_view_mode='edge') clf.fit(self.iris.data, self.iris.target, self.iris.views_ind) score = clf.score(self.iris.data, self.iris.target) @@ -347,7 +356,7 @@ class TestMuCumboClassifier(unittest.TestCase): expected_views_ind = np.array([0, 1, 3]) clf = MumboClassifier() clf.fit(X, y) - np.testing.assert_equal(clf.views_ind_, expected_views_ind) + np.testing.assert_equal(clf.X_.views_ind, expected_views_ind) # Check that classes labels can be integers or strings and can be stored # into any kind of sequence @@ -515,6 +524,7 @@ class TestMuCumboClassifier(unittest.TestCase): np.testing.assert_equal(clf.predict(X), y) np.testing.assert_equal(clf.predict(np.array([[1., 1.], [-1., -1.]])), np.array([0, 1])) + X = clf._global_X_transform(X, clf.X_.views_ind) self.assertEqual(clf.decision_function(X).shape, y.shape) views_ind = np.array([[1, 0]]) @@ -695,6 +705,11 @@ class TestMuCumboClassifier(unittest.TestCase): def test_classifier(self): + X_zero_features = np.empty(0).reshape(3, 0) + y = np.array([1, 0, 1]) + # e = MumboClassifier() + # e.fit(X_zero_features, y) + # print(e.predict(X_zero_features)) return check_estimator(MumboClassifier) def test_iris(self): @@ -742,7 +757,6 @@ class TestMuCumboClassifier(unittest.TestCase): for X, y, views_ind in data: clf = MumboClassifier(n_estimators=n_estimators, random_state=seed) clf.fit(X, y, views_ind) - staged_dec_func = [dec_f for dec_f in clf.staged_decision_function(X)] staged_predict = [predict for predict in clf.staged_predict(X)] staged_score = [score for score in clf.staged_score(X, y)] @@ -782,7 +796,6 @@ class TestMuCumboClassifier(unittest.TestCase): clf.fit(self.iris.data, self.iris.target, self.iris.views_ind) score = clf.score(self.iris.data, self.iris.target) dump = pickle.dumps(clf) - clf_loaded = pickle.loads(dump) self.assertEqual(type(clf_loaded), clf.__class__) score_loaded = clf_loaded.score(self.iris.data, self.iris.target) @@ -828,11 +841,9 @@ class TestMuCumboClassifier(unittest.TestCase): X_dense = self.iris.data y = self.iris.target - for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix, - dok_matrix]: + for sparse_format in [csc_matrix, csr_matrix]: #, lil_matrix, coo_matrix,dok_matrix]: for views_ind in (self.iris.views_ind, np.array([[0, 2], [1, 3]])): X_sparse = sparse_format(X_dense) - clf_sparse = MumboClassifier( base_estimator=CustomSVC(), random_state=seed, @@ -872,9 +883,9 @@ class TestMuCumboClassifier(unittest.TestCase): # Check that sparsity of data is maintained during training types = [clf.data_type_ for clf in clf_sparse.estimators_] if sparse_format == csc_matrix: - self.assertTrue(all([type_ == csc_matrix for type_ in types])) + self.assertTrue(all([issubclass(type_, csc_matrix) for type_ in types])) else: - self.assertTrue(all([type_ == csr_matrix for type_ in types])) + self.assertTrue(all([issubclass(type_, csr_matrix) for type_ in types])) if __name__ == '__main__': diff --git a/multimodal/tests/test_mvml.py b/multimodal/tests/test_mvml.py index 3de8a3588fa33010e556b4ac3cb206bf5c38b2b7..4627c5df8916fa36ea52b51fc88fa23c0d7714aa 100644 --- a/multimodal/tests/test_mvml.py +++ b/multimodal/tests/test_mvml.py @@ -6,9 +6,9 @@ import unittest import numpy as np from sklearn.exceptions import NotFittedError -from metriclearning.datasets.data_sample import Metriclearn_array -from metriclearning.mvml import MVML -from metriclearning.tests.datasets.get_dataset_path import get_dataset_path +from multimodal.datasets.data_sample import MultiModalArray +from multimodal.kernels.mvml import MVML +from multimodal.tests.datasets.get_dataset_path import get_dataset_path class MVMLTest(unittest.TestCase): @@ -90,7 +90,7 @@ class MVMLTest(unittest.TestCase): ####################################################### # mvml = MVML.fit(self.kernel_dict, self.y) w_expected = np.array([[0.5], [0.5]]) - x_metricl = Metriclearn_array(self.kernel_dict) + x_metricl = MultiModalArray(self.kernel_dict) mvml2 = MVML(lmbda=0.1, eta=1, nystrom_param=1.0) mvml2.fit(x_metricl, y=self.y, views_ind=None) self.assertEqual(mvml2.A.shape, (240, 240)) @@ -105,7 +105,7 @@ class MVMLTest(unittest.TestCase): ####################################################### # mvml = MVML.fit(self.kernel_dict, self.y) w_expected = np.array([[0.5], [0.5]]) - x_metricl = Metriclearn_array(self.kernel_dict) + x_metricl = MultiModalArray(self.kernel_dict) mvml2 = MVML(lmbda=0.1, eta=1, nystrom_param=1.0, learn_A=4) mvml2.fit(x_metricl, y=self.y, views_ind=None) self.assertEqual(mvml2.A.shape, (240, 240)) @@ -120,7 +120,7 @@ class MVMLTest(unittest.TestCase): ####################################################### # mvml = MVML.fit(self.kernel_dict, self.y) w_expected = np.array([[0.5], [0.5]]) - x_metricl = Metriclearn_array(self.kernel_dict) + x_metricl = MultiModalArray(self.kernel_dict) mvml2 = MVML(lmbda=0.1, eta=1, nystrom_param=1.0, learn_A=3) mvml2.fit(x_metricl, y=self.y, views_ind=None) self.assertEqual(mvml2.A.shape, (240, 240)) @@ -134,7 +134,7 @@ class MVMLTest(unittest.TestCase): # task with Metric array ####################################################### w_expected = np.array([0.2, 0.1]) # [0.94836083 , 0.94175933] [ 0.7182, 0.7388] - x_metricl = Metriclearn_array(self.kernel_dict) + x_metricl = MultiModalArray(self.kernel_dict) mvml2 = MVML(lmbda=0.1, eta=1, nystrom_param=0.6, learn_A=2, learn_w=1) mvml2.fit(x_metricl, y=self.y, views_ind=None) @@ -149,7 +149,7 @@ class MVMLTest(unittest.TestCase): # task with Metric array ####################################################### w_expected = np.array([1.3, 1.4]) # [0.94836083 , 0.94175933] [ 0.7182, 0.7388] - x_metricl = Metriclearn_array(self.kernel_dict) + x_metricl = MultiModalArray(self.kernel_dict) mvml2 = MVML(lmbda=0.1, eta=1, nystrom_param=0.6, learn_A=1, learn_w=1) mvml2.fit(x_metricl, y=self.y, views_ind=None) @@ -164,7 +164,7 @@ class MVMLTest(unittest.TestCase): # task with nparray 2d ####################################################### w_expected = np.array([[0.5], [0.5]]) - x_metricl = Metriclearn_array(self.kernel_dict) + x_metricl = MultiModalArray(self.kernel_dict) x_array = np.asarray(x_metricl) mvml3 = MVML(lmbda=0.1, eta=1, nystrom_param=1.0) mvml3.fit(x_array, y=self.y, views_ind=[0, 120, 240])