From cdf9409ebfd6c930c520dcdbd68bdcfcc7b1dced Mon Sep 17 00:00:00 2001 From: Dominique Benielli <dominique.benielli@lis-lab.fr> Date: Mon, 6 Jan 2020 18:04:31 +0100 Subject: [PATCH] common dataset --- multimodal/boosting/boost.py | 81 ++++++++++ multimodal/boosting/cumbo.py | 147 ++++-------------- multimodal/boosting/mumbo.py | 96 +++--------- .../__pycache__/data_sample.cpython-36.pyc | Bin 8827 -> 10701 bytes multimodal/datasets/data_sample.py | 85 +++++++--- multimodal/kernels/lpMKL.py | 1 - multimodal/kernels/mkernel.py | 14 +- 7 files changed, 208 insertions(+), 216 deletions(-) diff --git a/multimodal/boosting/boost.py b/multimodal/boosting/boost.py index e69de29..7de84ee 100644 --- a/multimodal/boosting/boost.py +++ b/multimodal/boosting/boost.py @@ -0,0 +1,81 @@ +import numpy as np +from abc import ABCMeta +from sklearn.utils import check_array, check_X_y, check_random_state +from sklearn.tree import DecisionTreeClassifier +from sklearn.tree.tree import BaseDecisionTree +from sklearn.tree._tree import DTYPE +from sklearn.ensemble.forest import BaseForest +from multimodal.datasets.data_sample import DataSample, MultiModalArray + +class UBoosting(metaclass=ABCMeta): + """ + Abstract class MuCumboClassifier and MumboClassifier should inherit from + UBoosting for methods + """ + + def _validate_X_predict(self, X): + """Ensure that X is in the proper format.""" + if (self.base_estimator is None or + isinstance(self.base_estimator, + (BaseDecisionTree, BaseForest))): + check_array(X, accept_sparse='csr', dtype=DTYPE) + + else: + check_array(X, accept_sparse=['csr', 'csc']) + if X.shape[1] != self.n_features_: + raise ValueError("X doesn't contain the right number of features.") + return X + + + def _validate_views_ind(self, views_ind, n_features): + """Ensure proper format for views_ind and return number of views.""" + views_ind = np.array(views_ind) + if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: + if np.any(views_ind[:-1] >= views_ind[1:]): + raise ValueError("Values in views_ind must be sorted.") + if views_ind[0] < 0 or views_ind[-1] > n_features: + raise ValueError("Values in views_ind are not in a correct " + + "range for the provided data.") + self.view_mode_ = "slices" + n_views = views_ind.shape[0]-1 + else: + if views_ind.ndim == 1: + if not views_ind.dtype == np.object: + raise ValueError("The format of views_ind is not " + + "supported.") + for ind, val in enumerate(views_ind): + views_ind[ind] = np.array(val) + if not np.issubdtype(views_ind[ind].dtype, np.integer): + raise ValueError("Values in views_ind must be " + + "integers.") + if views_ind[ind].min() < 0 \ + or views_ind[ind].max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided " + + "data.") + elif views_ind.ndim == 2: + if not np.issubdtype(views_ind.dtype, np.integer): + raise ValueError("Values in views_ind must be integers.") + if views_ind.min() < 0 or views_ind.max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided data.") + else: + raise ValueError("The format of views_ind is not supported.") + self.view_mode_ = "indices" + n_views = views_ind.shape[0] + return (views_ind, n_views) + + def _global_X_transform(self, X, views_ind=None): + X_ = None + if isinstance(X, np.ndarray) and X.ndim == 1: + X_= MultiModalArray(X, views_ind) + elif isinstance(X, dict): + X_= MultiModalArray(X) + elif isinstance(X, np.ndarray) and X.ndim > 1: + X_ = MultiModalArray(X, views_ind) + if not isinstance(X_, MultiModalArray): + raise TypeError("Input format is not reconized") + if hasattr(self, "X_"): + if not self.X_.viexs_ind == views_ind: + raise ValueError("Input format (viewd, features) for fit and predict must be the same") + return X_ \ No newline at end of file diff --git a/multimodal/boosting/cumbo.py b/multimodal/boosting/cumbo.py index 169925a..3b22496 100644 --- a/multimodal/boosting/cumbo.py +++ b/multimodal/boosting/cumbo.py @@ -43,11 +43,11 @@ from sklearn.utils import check_array, check_X_y, check_random_state from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import check_is_fitted, has_fit_parameter from cvxopt import solvers, matrix, spdiag, exp, spmatrix, mul, div -from multimodal.datasets.data_sample import Metriclearn_array +from .boost import UBoosting import warnings -class MuCumboClassifier(BaseEnsemble, ClassifierMixin): +class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): r"""It then iterates the process on the same dataset but where the weights of incorrectly classified instances are adjusted such that subsequent classifiers focus more on difficult cases. @@ -114,7 +114,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): Examples -------- - >>> from multiconfusion.cumbo import MuCumboClassifier + >>> from multimodal.boosting.cumbo import MuCumboClassifier >>> from sklearn.datasets import load_iris >>> X, y = load_iris(return_X_y=True) >>> views_ind = [0, 2, 4] # view 0: sepal data, view 1: petal data @@ -178,7 +178,6 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): self.random_state = random_state # self.best_view_mode = self._validate_best_view_mode(best_view_mode) - def _validate_estimator(self): """Check the estimator and set the base_estimator_ attribute.""" super(MuCumboClassifier, self)._validate_estimator( @@ -188,83 +187,6 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): raise ValueError("%s doesn't support sample_weight." % self.base_estimator_.__class__.__name__) - def _validate_X_predict(self, X): - """Ensure that X is in the proper format.""" - if (self.base_estimator is None or - isinstance(self.base_estimator, - (BaseDecisionTree, BaseForest))): - X = check_array(X, accept_sparse='csr', dtype=DTYPE) - else: - X = check_array(X, accept_sparse=['csr', 'csc']) - if X.shape[1] != self.n_features_: - raise ValueError("X doesn't contain the right number of features.") - return X - - def _extract_view(self, X, ind_view): - """Extract the view for the given index ind_view from the dataset X.""" - if self.view_mode_ == "indices": - return X[:, self.views_ind_[ind_view]] - else: - return X[:, self.views_ind_[ind_view]:self.views_ind_[ind_view+1]] - - def _compute_predictions(self, X): - """Compute predictions for all the stored estimators on the data X.""" - n_samples = X.shape[0] - n_estimators = len(self.estimators_) - predictions = np.zeros((n_samples, n_estimators), dtype=np.int64) - for ind_estimator, estimator in enumerate(self.estimators_): - # no best view in mucumbo but all view - # ind_view = self.best_views_[ind_estimator] - ind_view = ind_estimator % self.n_views_ - predictions[:, ind_estimator] \ - = estimator.predict(self._extract_view(X, ind_view)) - return predictions - - def _validate_views_ind(self, views_ind, n_features): - """Ensure proper format for views_ind and return number of views.""" - views_ind = np.array(views_ind) - if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: - if np.any(views_ind[:-1] >= views_ind[1:]): - raise ValueError("Values in views_ind must be sorted.") - if views_ind[0] < 0 or views_ind[-1] > n_features: - raise ValueError("Values in views_ind are not in a correct " - + "range for the provided data.") - self.view_mode_ = "slices" - n_views = views_ind.shape[0]-1 - else: - if views_ind.ndim == 1: - if not views_ind.dtype == np.object: - raise ValueError("The format of views_ind is not " - + "supported.") - for ind, val in enumerate(views_ind): - views_ind[ind] = np.array(val) - if not np.issubdtype(views_ind[ind].dtype, np.integer): - raise ValueError("Values in views_ind must be " - + "integers.") - if views_ind[ind].min() < 0 \ - or views_ind[ind].max() >= n_features: - raise ValueError("Values in views_ind are not in a " - + "correct range for the provided " - + "data.") - elif views_ind.ndim == 2: - if not np.issubdtype(views_ind.dtype, np.integer): - raise ValueError("Values in views_ind must be integers.") - if views_ind.min() < 0 or views_ind.max() >= n_features: - raise ValueError("Values in views_ind are not in a " - + "correct range for the provided data.") - else: - raise ValueError("The format of views_ind is not supported.") - self.view_mode_ = "indices" - n_views = views_ind.shape[0] - return (views_ind, n_views) - - # def _validate_best_view_mode(self, best_view_mode): - # """Ensure that best_view_mode has a proper value.""" - # if best_view_mode not in ("edge", "error"): - # raise ValueError('best_view_mode value must be either "edge" ' - # + 'or "error"') - # return best_view_mode - def _init_var(self, n_views, y): "Create and initialize the variables used by the MuMBo algorithm." n_classes = self.n_classes_ @@ -279,9 +201,6 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): n_yi_s[indice_class] = int(n_yi) cost[:, :, indice_class] /= n_yi cost[:, np.arange(n_samples), y] *= -(n_classes-1) - # not necessary in mucombo - # cost_global = np.ones((n_samples, n_classes)) - # cost_global[np.arange(n_samples), y] = -(n_classes-1) label_score = np.zeros((n_views, n_samples, n_classes)) label_score_global = np.zeros((n_samples, n_classes)) predicted_classes = np.empty((n_views, n_samples), dtype=np.int64) @@ -289,15 +208,6 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): return (cost, label_score, label_score_global, predicted_classes, score_function, beta_class, n_yi_s) - # def _compute_edge_global(self, cost_global, predicted_classes, y): - # """Compute edge values for the global cost matrix.""" - # n_samples = y.shape[0] - # edge_global = - np.sum( - # cost_global[np.arange(n_samples), predicted_classes], axis=1) \ - # / (np.sum(cost_global) - # - np.sum(cost_global[np.arange(n_samples), y])) - # return edge_global - def _compute_dist(self, cost, y): """Compute the sample distribution (i.e. the weights to use).""" n_samples = y.shape[0] @@ -312,13 +222,6 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): / np.sum(cost[:, np.arange(n_samples), y], axis=1)[:, np.newaxis] return dist - # def _compute_coop_coef(self, predicted_classes, y): - # """Compute the cooperation coefficients.""" - # coop_coef = np.zeros(predicted_classes.shape) - # coop_coef[predicted_classes == y] = 1. - # coop_coef[:, np.logical_not(coop_coef.any(axis=0))] = 1. - # return coop_coef - def _indicatrice(self, predicted_classes, y_i): n_samples = y_i.shape[0] indicate_ones = np.zeros((self.n_views_, n_samples, self.n_classes_), dtype=np.int) @@ -464,6 +367,19 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): print("Value Error on the evaluation on beta coefficient %s "% e) return solver + def _compute_predictions(self, X): + """Compute predictions for all the stored estimators on the data X.""" + n_samples = X.shape[0] + n_estimators = len(self.estimators_) + predictions = np.zeros((n_samples, n_estimators), dtype=np.int64) + for ind_estimator, estimator in enumerate(self.estimators_): + # no best view in mucumbo but all view + # ind_view = self.best_views_[ind_estimator] + ind_view = ind_estimator % self.n_views_ + predictions[:, ind_estimator] \ + = estimator.predict(X._extract_view(ind_view)) + return predictions + def fit(self, X, y, views_ind=None): """Build a multimodal boosted classifier from the training set (X, y). @@ -516,17 +432,19 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): else: dtype = None accept_sparse = ['csr', 'csc'] - X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype) - check_classification_targets(y) - self._validate_estimator() if views_ind is None: if X.shape[1] > 1: views_ind = np.array([0, X.shape[1]//2, X.shape[1]]) else: views_ind = np.array([0, X.shape[1]]) - self.X_ = Metriclearn_array(X, view_ind=views_ind) - self.views_ind_, n_views = self._validate_views_ind(views_ind, - X.shape[1]) + + self.X_ = self._global_X_transform(X, views_ind=views_ind) + views_ind_, n_views = self.X_._validate_views_ind(views_ind, + X.shape[1]) + check_X_y(self.X_, y, accept_sparse=accept_sparse, dtype=dtype) + check_classification_targets(y) + self._validate_estimator() + self.n_iterations_ = self.n_estimators // n_views self.classes_, y = np.unique(y, return_inverse=True) self.n_classes_ = len(self.classes_) @@ -560,13 +478,13 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): for ind_view in range(n_views): estimator = self._make_estimator(append=False, random_state=random_state) - estimator.fit(self._extract_view(X, ind_view), y, + estimator.fit(self.X_._extract_view(ind_view), y, sample_weight=dist[ind_view, :]) predicted_classes[ind_view, :] = estimator.predict( - self._extract_view(X, ind_view)) + self.X_._extract_view(ind_view)) self.estimators_.append(estimator) - # fin de choose cost matrix + # end of choose cost matrix # TO DO estimator_errors_ estimate ########################################### @@ -609,7 +527,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): ``classes_``. """ check_is_fitted(self, ("estimators_", "estimator_weights_alpha_","n_views_", - "estimator_weights_beta_", "n_classes_", "views_ind_")) + "estimator_weights_beta_", "n_classes_")) X = self._validate_X_predict(X) n_samples = X.shape[0] @@ -662,7 +580,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): ``classes_``. """ check_is_fitted(self, ("estimators_", "estimator_weights_alpha_","n_views_", - "estimator_weights_beta_", "n_classes_", "views_ind_")) + "estimator_weights_beta_", "n_classes_")) X = self._validate_X_predict(X) n_samples = X.shape[0] @@ -687,7 +605,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): else: yield np.array(dec_func) - def predict(self, X): + def predict(self, X, views_ind=None): """Predict classes for X. The predicted class of an input sample is computed as the weighted mean @@ -710,6 +628,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): ValueError 'X' input matrix must be have the same total number of features of 'X' fit data """ + X = self._global_X_transform(X, views_ind=views_ind) pred = self.decision_function(X) if self.n_classes_ == 2: @@ -739,9 +658,10 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): y : generator of numpy.ndarrays, shape = (n_samples,) Predicted classes. """ + n_classes = self.n_classes_ classes = self.classes_ - + X = self._validate_X_predict(X) if n_classes == 2: for pred in self.staged_decision_function(X): yield np.array(classes.take(pred > 0, axis=0)) @@ -766,6 +686,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): score : float Mean accuracy of self.predict(X) wrt. y. """ + X = self._validate_X_predict(X) return super(MuCumboClassifier, self).score(X, y) def staged_score(self, X, y): diff --git a/multimodal/boosting/mumbo.py b/multimodal/boosting/mumbo.py index fd7bd42..c2fb588 100644 --- a/multimodal/boosting/mumbo.py +++ b/multimodal/boosting/mumbo.py @@ -42,9 +42,10 @@ from sklearn.tree._tree import DTYPE from sklearn.utils import check_array, check_X_y, check_random_state from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import check_is_fitted, has_fit_parameter +from .boost import UBoosting -class MumboClassifier(BaseEnsemble, ClassifierMixin): +class MumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): r"""It then iterates the process on the same dataset but where the weights of incorrectly classified instances are adjusted such that subsequent classifiers focus more on difficult cases. @@ -106,7 +107,7 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin): Examples -------- - >>> from multimodalboost.mumbo import MumboClassifier + >>> from multimodal.boosting.mumbo import MumboClassifier >>> from sklearn.datasets import load_iris >>> X, y = load_iris(return_X_y=True) >>> views_ind = [0, 2, 4] # view 0: sepal data, view 1: petal data @@ -175,75 +176,6 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin): raise ValueError("%s doesn't support sample_weight." % self.base_estimator_.__class__.__name__) - def _validate_X_predict(self, X): - """Ensure that X is in the proper format.""" - if (self.base_estimator is None or - isinstance(self.base_estimator, - (BaseDecisionTree, BaseForest))): - X = check_array(X, accept_sparse='csr', dtype=DTYPE) - - else: - X = check_array(X, accept_sparse=['csr', 'csc']) - if X.shape[1] != self.n_features_: - raise ValueError("X doesn't contain the right number of features.") - return X - - def _extract_view(self, X, ind_view): - """Extract the view for the given index ind_view from the dataset X.""" - if self.view_mode_ == "indices": - return X[:, self.views_ind_[ind_view]] - else: - return X[:, self.views_ind_[ind_view]:self.views_ind_[ind_view+1]] - - def _compute_predictions(self, X): - """Compute predictions for all the stored estimators on the data X.""" - n_samples = X.shape[0] - n_estimators = len(self.estimators_) - predictions = np.zeros((n_samples, n_estimators), dtype=np.int64) - for ind_estimator, estimator in enumerate(self.estimators_): - ind_view = self.best_views_[ind_estimator] - predictions[:, ind_estimator] \ - = estimator.predict(self._extract_view(X, ind_view)) - return predictions - - def _validate_views_ind(self, views_ind, n_features): - """Ensure proper format for views_ind and return number of views.""" - views_ind = np.array(views_ind) - if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: - if np.any(views_ind[:-1] >= views_ind[1:]): - raise ValueError("Values in views_ind must be sorted.") - if views_ind[0] < 0 or views_ind[-1] > n_features: - raise ValueError("Values in views_ind are not in a correct " - + "range for the provided data.") - self.view_mode_ = "slices" - n_views = views_ind.shape[0]-1 - else: - if views_ind.ndim == 1: - if not views_ind.dtype == np.object: - raise ValueError("The format of views_ind is not " - + "supported.") - for ind, val in enumerate(views_ind): - views_ind[ind] = np.array(val) - if not np.issubdtype(views_ind[ind].dtype, np.integer): - raise ValueError("Values in views_ind must be " - + "integers.") - if views_ind[ind].min() < 0 \ - or views_ind[ind].max() >= n_features: - raise ValueError("Values in views_ind are not in a " - + "correct range for the provided " - + "data.") - elif views_ind.ndim == 2: - if not np.issubdtype(views_ind.dtype, np.integer): - raise ValueError("Values in views_ind must be integers.") - if views_ind.min() < 0 or views_ind.max() >= n_features: - raise ValueError("Values in views_ind are not in a " - + "correct range for the provided data.") - else: - raise ValueError("The format of views_ind is not supported.") - self.view_mode_ = "indices" - n_views = views_ind.shape[0] - return (views_ind, n_views) - def _validate_best_view_mode(self, best_view_mode): """Ensure that best_view_mode has a proper value.""" if best_view_mode not in ("edge", "error"): @@ -353,6 +285,17 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin): cost[:, np.arange(n_samples), y] -= np.sum(cost, axis=2) return (cost, label_score) + def _compute_predictions(self, X): + """Compute predictions for all the stored estimators on the data X.""" + n_samples = X.shape[0] + n_estimators = len(self.estimators_) + predictions = np.zeros((n_samples, n_estimators), dtype=np.int64) + for ind_estimator, estimator in enumerate(self.estimators_): + ind_view = self.best_views_[ind_estimator] + predictions[:, ind_estimator] \ + = estimator.predict(X._extract_view(ind_view)) + return predictions + def fit(self, X, y, views_ind=None): """Build a multimodal boosted classifier from the training set (X, y). @@ -400,9 +343,6 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin): else: dtype = None accept_sparse = ['csr', 'csc'] - X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype) - check_classification_targets(y) - self._validate_estimator() if views_ind is None: if X.shape[1] > 1: views_ind = np.array([0, X.shape[1]//2, X.shape[1]]) @@ -410,6 +350,10 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin): views_ind = np.array([0, X.shape[1]]) self.views_ind_, n_views = self._validate_views_ind(views_ind, X.shape[1]) + self.X_ = self._global_X_transform(X, views_ind=self.views_ind_) + check_X_y(self.X_, y, accept_sparse=accept_sparse, dtype=dtype) + check_classification_targets(y) + self._validate_estimator() self.classes_, y = np.unique(y, return_inverse=True) self.n_classes_ = len(self.classes_) @@ -441,11 +385,11 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin): for ind_view in range(n_views): estimator = self._make_estimator(append=False, random_state=random_state) - estimator.fit(self._extract_view(X, ind_view), y, + estimator.fit(self.X_._extract_view(ind_view), y, sample_weight=dist[ind_view, :]) estimators.append(estimator) predicted_classes[ind_view, :] = estimator.predict( - self._extract_view(X, ind_view)) + self.X_._extract_view(ind_view)) edges = self._compute_edge_global( cost_global, predicted_classes, y) diff --git a/multimodal/datasets/__pycache__/data_sample.cpython-36.pyc b/multimodal/datasets/__pycache__/data_sample.cpython-36.pyc index 0dd01d941e19a920b48b248b2107c9ec6586b3b1..0a201db1c4aaece161b8408837f5bd47c3c60df1 100644 GIT binary patch delta 3304 zcmezEayFRDn3tDprMGaLh~Y*q9cG~>1_p*yhA74qhA5^K#wg|#>B%0<0X`|BDPk=Q zQGBVaDdH&-Eey?!QT!>ADN-#AQ3CA@EDTYC!3>%*xA=Wab4oIO^HUOY9E*w)D-(6u z85p<}6ciMYBsSk*=4IssDa-{aoE$8xhs5iXbw}bga&CzMsVvJ(EiYEEMK%UxzD7!7 zNunm$0y{f9g<QB&z2c0-f>f}ChM|#xj)IYi0XB7$+qqQ1)`2aUypc-<!P|U`i-%e4 z7IS8vA&4+4Vr5`p029BWC;#D9QY=o+%+4&)1=|YtN=|BGQJy}6KiP%PfJYn2>yw-K z%ouqm@8k<%<ekjNugxwQpO#rvTr$~;--L|^WIXre5`J$15c3uX$W8H?c_~F|lh5;u z@~Ja0Fx+A<$w<vktt`GZ`4hiB8z)GM@?>>^OdfG$Yvu{8j}ZdN$%FK8`z97=r>1~o zq=+9REC3=DL2_()`MHV3*+tx7PDXJ_Vsf@7bCDKEfj)@P1`%+d7lCaQoxD;|o<|AU z#%qGT!fGI;AXgTNgIKBz3=ESSg>(|Ih-)(4Vku6|Nh>m7U|`T>yu}$GpPZ9eTpS+{ z4q%8irVI=WOBom#iuo8A7#J8i7+4qu82P}k3dM4M;T%Pfziu%Xm*y5pf-DEy2?|C{ zmLhYIfa2si!cyuW=ig#aPc4a0&d*s3GMfc#wiII(N&wy$u2BP-SY!#(i^D{al=b9B z5s`YZu^b>{L2<SbWG)-nTscM|#wrP<fC8mjJ($8|CXgZq1_lrf(%=lz>BGRlP{WYL zkiwYFRHRhHkj0q7l)}`?RKt+PRKqCAkj5m*5DZcQ7q5p%Koq1fi!(?vv@$`Ir7#6E zXtGo}x>l4FB_@|Blw_nTfZV8%mR|(sq-U0;<|$<6rKDDX$asikQGPC17L>n=Q%e*g z^lq_d=A~pNrxyDaF@xfa`4$&QX?$*eN@`Mk5h$pOKryb#1P<?89GQ73V8b903Qj@b zVB(EWg}4PQyA9-Hkk=WQxELjvs$|i_3aWatjhM{jCnBfoZ9$=qEoyi{5+G;X;tEU5 zDNS`PD#|a?WCgpX2$a?#ZnOp|vjY)eCxHpDyEuwdOTbRr3vv?3AO=QI^l30kFfqX+ zkrh;AIZieeYfQ)H2#_{pJ8v-;C6*WAv=f{XixSI^fvg0F0;3e899mkG7eALE2y!so zhehDj4YL0hTV`HyYEem%7)Sv)@40|t2a@P-F_wWn4N?V)JM^gFNKApm(do&)65>3_ z>2z|DL|h=)JbRFN%-~?W#gdYloDT_1aKM2HaOknem*l5pCYRg<`Gf<MtQkSMRS4BD zo0TQ!F){{BJ|tC<0&;$lG^p6&$}BD}O-d=LEJ!T^Wpz+eD@jdHEh>@#Ni!$rRThDK zRRjtEO=fU1yTz3kpO%_fQd*Q+Tm*I}*ezfJ<e(xM1_p-vljljx*K1-VDR7w^UzV7Y znUYwN3Xb$*P(hmvO4%T{gQ5at6gWS&F)}ceFk~^-Fg7#RGM0dIVHR@<Ll#RjQ&Ce5 zV-_nor#6)^WU<wN@}x*BlO#h8V+tcE*Gf)qkrb$BOkt5^NMXrlC{C(jOc9l2sA0-x zDatBgPZ4WoY+_7duVIMisA0_F1gm3|V5nuDz>xO=X7~igBEA|%u%%qtOvQ^z*i*zo z`YQI7aD$xX!Vnu1!&J*s%L;ZVYcpdFV-1T9$k|LFR=p$xSTjpAQ}MkL_7n+_W{`v= zLl#dn*kKbGiy5FIyv-ozu@<c<VNa0)sVrO*&X5-Zb_{<u(*(w%DAZ7xz*uZj!k!`k zQd`4_WMV)KV;0CXd!}sG35-QaCG08EAVql`CG06eAXbVnvd?RoYZz-7;`wTrYZ&7B zYZz;o(?I2y%;c+*J2`H#73Y8opviqwx=d9vlVzo3c&j)w^T0`^Sg%TPvb&TbQ<ePW z7gF+kRji<*ORor2yjQ7$)N&(Ks!V<%<<C>(0Ln7~U}EwisZu5kbq<q{OF2wFD&;Zx zn3P++rU0b!1f|)cK#(?2Vl4^+u|Ns6C>X>7Csaoe3zS-KvE?Uar6!l$;!Mpe%}p&z zEJ?k^oST_<i#a#30-TV+iC2>ioWCFfAt245Bn@hHq}*aIOUwaf+<Hjc16(!U62_8x zKZ3Fz$a+vw3Mxi87$q2)7}*%v7=@U47<s^KAx1t%5hzQ5kqIm=#K^%Y!^p(Q!NkGH z#>fTMQKgKYYbIZi7H1Ti{6bo<UKx}vd5TLCOEQylQ%f@PQ;I}Ejs{iOMK&N7sL&_^ z<(?u}5Z4_<fa<j(e-I0lH;ZCGhVVk`{p8e~99Z2C%6OWLMHV2H;H(U)9c4kDlVV_C z;BaCQXXG;D;pgGtVBuipVCUfEVCCTBVB-;*JWnQzaqebL*~5(BRvfrZHo1{glt&NA zGn4<wE#Lw*6d}#j$t&c;yi7oLC4(Hq5g(tKmst`Ye~UF9l!A&tmTN+aKWJH+U7lE! zUJP>~$hF`&DN12rV6dF*q2O+IOVBs9q$o2P+#ZZi1l9SPj76!Sf|L;w(O_L*f)ivl zsM>Izyj>x+-l#Y+w;(4~p*SPIG$%zNDODj6z0s?XU!;&%np;q*05(}qld}li)&sc| zl0w1FJ5bZF$PnZw_Vm=e)S}E}unUSnfe5Xn!5#voEqKzZXJ7yedNMLF6oX1Z1||tc z4kk@TKTXb}SddfWKtw!<NB|KaPv7DwD9SHLEh?!5>%GNRoLW+nT2zz-QULZOh=2tc z*u$W(mIT=Wswz0-SeW>@c-V@P8T=R+G`WgECf{NO`w{G+TO44vUT$I$+*d_lPlEj{ W46+AgB9d1?HWqPCj#iRqtOWpoz7tRY delta 2119 zcmX>b{M&`gn3tF9;Esc_n(7<5beM&T7#J8*8KM|d7^0X`7^9d|#3y?&2RNj%riiA9 zwJ<a@M)9>XurNgN2Qz3&+!FLnEh)-O&Ph!y%8O4dDoU(O)MRI1;8IXfP(YE|{Dhg8 zb@BlfzRi<W92xa<!P*pTQ5AvoYosKWBx-^Uu(PvMfGC`t$tBMU)-rh_mnxLCpGyV8 z+Wdx#hgmF&IWx}?L>LvZGB7ZJiC+nm`S_I7a!YecGRrbk%OR$LU7MMg4s`{LJ2{Zg zfKM02)suVq%ow>RAK?pO<en_augxetnNeAIavZ-R8yCn9&dII(-U1+I6bH!s_{_YN zB9+Mx_(l0t85kI%*h?}}vr{XJqb4&8=(Dkdv?xxt5Xj_{M73tE!1@>gkgO~N1H&zD z-^AkV)D%Zha1`-^g!w>(JV=f$FF!Z2IJ<}w%*iM&NlebxWG+$%DbNBD8Xy8507YOs zg(vS2l;=}HwezuHubc`<G03Gwq9F75Kz=9&c_O|zH79Lyvye^_s*)mo1_lOArduo^ z`66u)hw&C?e0*|FVsUYNJUGxHb{aA;Ff3(YU?>K;oq>^qfrU|kkq->3&|JbVoTCWx z<}K#p(%d3(kek4^g2GjkrN|g0AU}DIuoNh1G2CKLPc4a0&d*s3GM$Bifq{Whim?hU zMDGgM)Pu|{G6m_zVJ1k*9OQA9;*$IVh~vRQ2MV=Y93Xo@A+ix<5gXVdIYuGIDk+qJ z1jUjbOldMRNErhI11kf=WLB}O^_CzzusMVqBmwf_Ev~S{oYGX+qN4mFO;)IHVUk54 z=M`CjxL}8Y35e^8Q%mAOuG<E39S_J_Mv&_?7$umP;BI6GC+_&f$#LS1xWfa<)?3U) ziRHL$1&4i6V)=fMrH~+zVw6J<eRz-*C6>cOgk^G!#EyD?kXPV-F9OFiD0ptMW#$#9 z7L^o<fE0j}qzx!_L8AW_V_6X>tfJU5^HMUCQ!%0{F(m~Y7)L<vf}}V;MlnV%MhQkC zMgb;xl(9hys>#zO<ATAKSc5EK28VhSOG;*Pei0~}p_|VhUy`4anOt%SWIP8rtU<|H z2;JkG-K6F*vbixZFjTotc90gCyg?x*U6T`%i6ucPjVrUbxHKuHq_QBj2o!0z*faA= zQqxn5io`(D%!zrGMIetCfx<|Wxkwo#%9R(NmYP^nT9jH`1a>tz8Gs2;S}2lYU|_g0 z`JjwEpDIeyocu>ghEZU$q_UokBFG$`;*!LY%;enEl8pS6B4JQSgR*In1&9R-i6UDN z%N|5Hg9uj;AvL*ISzgx!#01;M#K6D+DyND;F6MAzVdOI7;pgGtVBuip;N)QC;NxJM zd{;S(ap`6cmBWm@;9?t`3^Vi6CpU76^68;E(o}5$4?<CV;^g~kVQdB<>jEcRtIKGB z^FJu|Z?R>UCl;j_YcdvrU4saKAO;2ouF3W4?ixiP*J&~q1%u+15ge~*kqJ&nB9s59 zr`D?$C*~I9q$(6=<d^28C?usSBw|)bdYYU?;IbKHG9)HJ39JYd1VuWaU}aBF%}XuH zOa@z8BmuIS31TzI)!3|;Vq{<_W~&D!Bqj+)4kk@TKTXaeZ;+pTK!h)d@B<MbpWWgp zD9SHLEh?!5yZshhacW6PYEe-DNC7wyKm;sQz;P7Fz`*bdmRLFDSeW>@c-V?SqMBSq zAcJqQ=9N0;7F2>gbc-XeG`FBqFE<h4t0J%`!G0D5*#pvx<Q0&OMeLK;YRWVE0RSWo B;?@8F diff --git a/multimodal/datasets/data_sample.py b/multimodal/datasets/data_sample.py index e6bcf81..fbcf5aa 100644 --- a/multimodal/datasets/data_sample.py +++ b/multimodal/datasets/data_sample.py @@ -26,9 +26,9 @@ import numpy as np import numpy.ma as ma -class Metriclearn_array(ma.MaskedArray, np.ndarray): +class MultiModalArray(ma.MaskedArray, np.ndarray): """ - Metriclearn_array inherit from numpy ndarray + MultiModalArray inherit from numpy ndarray Parameters @@ -74,21 +74,21 @@ class Metriclearn_array(ma.MaskedArray, np.ndarray): :Example: - >>> from metriclearning.datasets.base import load_dict - >>> from metriclearning.tests.datasets.get_dataset_path import get_dataset_path - >>> from metriclearning.datasets.data_sample import DataSample + >>> from multimodal.datasets.base import load_dict + >>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path + >>> from multimodal.datasets.data_sample import DataSample >>> file = 'input_x_dic.pkl' >>> data = load_dict(get_dataset_path(file)) >>> print(data.__class__) <class 'dict'> - >>> metric = Metriclearn_array(data) - >>> metric.shape + >>> multiviews = MultiModalArray(data) + >>> multiviews.shape (120, 240) - >>> metric.keys + >>> multiviews.keys dict_keys([0, 1]) - >>> metric.shapes_int + >>> multiviews.shapes_int [120, 120] - >>> metric.n_views + >>> multiviews.n_views 2 @@ -126,7 +126,7 @@ class Metriclearn_array(ma.MaskedArray, np.ndarray): view_ind = np.array([0, data.shape[1]//2, data.shape[1]]) else: view_ind = np.array([0, data.shape[1]]) - view_ind, n_views = cls._validate_views_ind(view_ind, + view_ind, n_views = cls._first_validate_views_ind(view_ind, data.shape[1]) shapes_int = [ in2-in1 for in1, in2 in zip(view_ind, view_ind[1: ])] new_data = data @@ -164,11 +164,11 @@ class Metriclearn_array(ma.MaskedArray, np.ndarray): def __array_finalize__(self, obj): if obj is None: return - super(Metriclearn_array, self).__array_finalize__(obj) + super(MultiModalArray, self).__array_finalize__(obj) self.shapes_int = getattr(obj, 'shapes_int', None) self.n_views = getattr(obj, 'n_views', None) self.keys = getattr(obj, 'keys', None) - self.views_ind_self = getattr(obj, 'views_ind_self', None) + self.views_ind = getattr(obj, 'views_ind', None) def get_col(self, view, col): start = np.sum(np.asarray(self.shapes_int[0: view])) @@ -179,6 +179,13 @@ class Metriclearn_array(ma.MaskedArray, np.ndarray): stop = int(start + self.shapes_int[view]) return self.data[:, start:stop] + def _extract_view(self, ind_view): + """Extract the view for the given index ind_view from the dataset X.""" + if self.view_mode_ == "indices": + return self.data[:, self.views_ind[ind_view]] + else: + return self.data[:, self.views_ind[ind_view]:self.views_ind[ind_view+1]] + def set_view(self, view, data): start = int(np.sum(np.asarray(self.shapes_int[0: view]))) stop = int(start + self.shapes_int[view]) @@ -214,7 +221,7 @@ class Metriclearn_array(ma.MaskedArray, np.ndarray): return dico @staticmethod - def _validate_views_ind(views_ind, n_features): + def _first_validate_views_ind(views_ind, n_features): """Ensure proper format for views_ind and return number of views.""" views_ind = np.array(views_ind) if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: @@ -231,6 +238,46 @@ class Metriclearn_array(ma.MaskedArray, np.ndarray): return (views_ind, n_views) + def _validate_views_ind(self, views_ind, n_features): + """Ensure proper format for views_ind and return number of views.""" + views_ind = np.array(views_ind) + if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: + if np.any(views_ind[:-1] >= views_ind[1:]): + raise ValueError("Values in views_ind must be sorted.") + if views_ind[0] < 0 or views_ind[-1] > n_features: + raise ValueError("Values in views_ind are not in a correct " + + "range for the provided data.") + self.view_mode_ = "slices" + n_views = views_ind.shape[0]-1 + else: + if views_ind.ndim == 1: + if not views_ind.dtype == np.object: + raise ValueError("The format of views_ind is not " + + "supported.") + for ind, val in enumerate(views_ind): + views_ind[ind] = np.array(val) + if not np.issubdtype(views_ind[ind].dtype, np.integer): + raise ValueError("Values in views_ind must be " + + "integers.") + if views_ind[ind].min() < 0 \ + or views_ind[ind].max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided " + + "data.") + elif views_ind.ndim == 2: + if not np.issubdtype(views_ind.dtype, np.integer): + raise ValueError("Values in views_ind must be integers.") + if views_ind.min() < 0 or views_ind.max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided data.") + else: + raise ValueError("The format of views_ind is not supported.") + self.view_mode_ = "indices" + n_views = views_ind.shape[0] + self.views_ind = views_ind + self.n_views = n_views + return (views_ind, n_views) + class DataSample(dict): """ A DataSample instance @@ -247,7 +294,7 @@ class DataSample(dict): <class 'dict'> >>> s = DataSample(data) >>> type(s.data) - <class 'metriclearning.datasets.data_sample.Metriclearn_array'> + <class 'multimodal.datasets.data_sample.MultiModalArray'> - Input: @@ -260,7 +307,7 @@ class DataSample(dict): Attributes ---------- - data : { array like} Metriclearn_array + data : { array like} MultiModalArray """ def __init__(self, data=None, **kwargs): @@ -270,7 +317,7 @@ class DataSample(dict): super(DataSample, self).__init__(kwargs) self._data = None # Metriclearn_array(np.zeros((0,0))) if data is not None: - self._data = Metriclearn_array(data) + self._data = MultiModalArray(data) @property @@ -281,10 +328,10 @@ class DataSample(dict): @data.setter def data(self, data): - if isinstance(data, (Metriclearn_array, np.ndarray, ma.MaskedArray, np.generic)): + if isinstance(data, (MultiModalArray, np.ndarray, ma.MaskedArray, np.generic)): self._data = data else: - raise TypeError("sample should be a Metriclearn_array.") + raise TypeError("sample should be a MultiModalArray or numpy array.") diff --git a/multimodal/kernels/lpMKL.py b/multimodal/kernels/lpMKL.py index 20ccb24..bdcfe13 100644 --- a/multimodal/kernels/lpMKL.py +++ b/multimodal/kernels/lpMKL.py @@ -5,7 +5,6 @@ from sklearn.utils.multiclass import unique_labels from sklearn.utils.validation import check_X_y from sklearn.utils.validation import check_array from sklearn.utils.validation import check_is_fitted -from metriclearning.datasets.data_sample import DataSample, Metriclearn_array from metriclearning.mkernel import MKernel diff --git a/multimodal/kernels/mkernel.py b/multimodal/kernels/mkernel.py index 9c4644c..ac1ef5c 100644 --- a/multimodal/kernels/mkernel.py +++ b/multimodal/kernels/mkernel.py @@ -2,7 +2,7 @@ import numpy as np import scipy as sp from sklearn.metrics.pairwise import pairwise_kernels from abc import ABCMeta -from metriclearning.datasets.data_sample import DataSample, Metriclearn_array +from multimodal.datasets.data_sample import DataSample, MultiModalArray class MKernel(metaclass=ABCMeta): @@ -36,26 +36,26 @@ class MKernel(metaclass=ABCMeta): if Y is None: y = Y if isinstance(X, np.ndarray) and X.ndim == 1: - X_= Metriclearn_array(X, views_ind) + X_= MultiModalArray(X, views_ind) for v in range(X.shape[0]): if Y is not None: y = Y.get_view(v) # y = self._global_check_pairwise(X_, Y, v) kernel_dict[v] = self._get_kernel(X[v], y) elif isinstance(X, dict): - X_= Metriclearn_array(X) + X_= MultiModalArray(X) for v in X.keys(): if Y is not None: y = Y.get_view(v) # y = self._global_check_pairwise(X_, Y, v) kernel_dict[v] = self._get_kernel(X[v], y) elif isinstance(X, np.ndarray) and X.ndim > 1: - X_ = Metriclearn_array(X, views_ind) + X_ = MultiModalArray(X, views_ind) X = X_ - if isinstance(X, Metriclearn_array): + if isinstance(X, MultiModalArray): for v in range(X.n_views): if Y is not None: y = Y.get_view(v) # y = self._global_check_pairwise(X, Y, v) kernel_dict[v] = self._get_kernel(X.get_view(v), y) X_= X - if not isinstance(X_, Metriclearn_array): + if not isinstance(X_, MultiModalArray): raise TypeError("Input format is not reconized") - K_ = Metriclearn_array(kernel_dict) + K_ = MultiModalArray(kernel_dict) return X_, K_ def _calc_nystrom(self, kernels, n_approx): -- GitLab