diff --git a/multimodal/boosting/boost.py b/multimodal/boosting/boost.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..7de84eeadd8dffbadc1436ce16ba1d1025cf6541 100644 --- a/multimodal/boosting/boost.py +++ b/multimodal/boosting/boost.py @@ -0,0 +1,81 @@ +import numpy as np +from abc import ABCMeta +from sklearn.utils import check_array, check_X_y, check_random_state +from sklearn.tree import DecisionTreeClassifier +from sklearn.tree.tree import BaseDecisionTree +from sklearn.tree._tree import DTYPE +from sklearn.ensemble.forest import BaseForest +from multimodal.datasets.data_sample import DataSample, MultiModalArray + +class UBoosting(metaclass=ABCMeta): + """ + Abstract class MuCumboClassifier and MumboClassifier should inherit from + UBoosting for methods + """ + + def _validate_X_predict(self, X): + """Ensure that X is in the proper format.""" + if (self.base_estimator is None or + isinstance(self.base_estimator, + (BaseDecisionTree, BaseForest))): + check_array(X, accept_sparse='csr', dtype=DTYPE) + + else: + check_array(X, accept_sparse=['csr', 'csc']) + if X.shape[1] != self.n_features_: + raise ValueError("X doesn't contain the right number of features.") + return X + + + def _validate_views_ind(self, views_ind, n_features): + """Ensure proper format for views_ind and return number of views.""" + views_ind = np.array(views_ind) + if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: + if np.any(views_ind[:-1] >= views_ind[1:]): + raise ValueError("Values in views_ind must be sorted.") + if views_ind[0] < 0 or views_ind[-1] > n_features: + raise ValueError("Values in views_ind are not in a correct " + + "range for the provided data.") + self.view_mode_ = "slices" + n_views = views_ind.shape[0]-1 + else: + if views_ind.ndim == 1: + if not views_ind.dtype == np.object: + raise ValueError("The format of views_ind is not " + + "supported.") + for ind, val in enumerate(views_ind): + views_ind[ind] = np.array(val) + if not np.issubdtype(views_ind[ind].dtype, np.integer): + raise ValueError("Values in views_ind must be " + + "integers.") + if views_ind[ind].min() < 0 \ + or views_ind[ind].max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided " + + "data.") + elif views_ind.ndim == 2: + if not np.issubdtype(views_ind.dtype, np.integer): + raise ValueError("Values in views_ind must be integers.") + if views_ind.min() < 0 or views_ind.max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided data.") + else: + raise ValueError("The format of views_ind is not supported.") + self.view_mode_ = "indices" + n_views = views_ind.shape[0] + return (views_ind, n_views) + + def _global_X_transform(self, X, views_ind=None): + X_ = None + if isinstance(X, np.ndarray) and X.ndim == 1: + X_= MultiModalArray(X, views_ind) + elif isinstance(X, dict): + X_= MultiModalArray(X) + elif isinstance(X, np.ndarray) and X.ndim > 1: + X_ = MultiModalArray(X, views_ind) + if not isinstance(X_, MultiModalArray): + raise TypeError("Input format is not reconized") + if hasattr(self, "X_"): + if not self.X_.viexs_ind == views_ind: + raise ValueError("Input format (viewd, features) for fit and predict must be the same") + return X_ \ No newline at end of file diff --git a/multimodal/boosting/cumbo.py b/multimodal/boosting/cumbo.py index 169925af11a3a91da6d4845e6604ad0ef2e923b1..3b224962ec22302561e5f4c170bdb90cdf0e5f73 100644 --- a/multimodal/boosting/cumbo.py +++ b/multimodal/boosting/cumbo.py @@ -43,11 +43,11 @@ from sklearn.utils import check_array, check_X_y, check_random_state from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import check_is_fitted, has_fit_parameter from cvxopt import solvers, matrix, spdiag, exp, spmatrix, mul, div -from multimodal.datasets.data_sample import Metriclearn_array +from .boost import UBoosting import warnings -class MuCumboClassifier(BaseEnsemble, ClassifierMixin): +class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): r"""It then iterates the process on the same dataset but where the weights of incorrectly classified instances are adjusted such that subsequent classifiers focus more on difficult cases. @@ -114,7 +114,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): Examples -------- - >>> from multiconfusion.cumbo import MuCumboClassifier + >>> from multimodal.boosting.cumbo import MuCumboClassifier >>> from sklearn.datasets import load_iris >>> X, y = load_iris(return_X_y=True) >>> views_ind = [0, 2, 4] # view 0: sepal data, view 1: petal data @@ -178,7 +178,6 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): self.random_state = random_state # self.best_view_mode = self._validate_best_view_mode(best_view_mode) - def _validate_estimator(self): """Check the estimator and set the base_estimator_ attribute.""" super(MuCumboClassifier, self)._validate_estimator( @@ -188,83 +187,6 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): raise ValueError("%s doesn't support sample_weight." % self.base_estimator_.__class__.__name__) - def _validate_X_predict(self, X): - """Ensure that X is in the proper format.""" - if (self.base_estimator is None or - isinstance(self.base_estimator, - (BaseDecisionTree, BaseForest))): - X = check_array(X, accept_sparse='csr', dtype=DTYPE) - else: - X = check_array(X, accept_sparse=['csr', 'csc']) - if X.shape[1] != self.n_features_: - raise ValueError("X doesn't contain the right number of features.") - return X - - def _extract_view(self, X, ind_view): - """Extract the view for the given index ind_view from the dataset X.""" - if self.view_mode_ == "indices": - return X[:, self.views_ind_[ind_view]] - else: - return X[:, self.views_ind_[ind_view]:self.views_ind_[ind_view+1]] - - def _compute_predictions(self, X): - """Compute predictions for all the stored estimators on the data X.""" - n_samples = X.shape[0] - n_estimators = len(self.estimators_) - predictions = np.zeros((n_samples, n_estimators), dtype=np.int64) - for ind_estimator, estimator in enumerate(self.estimators_): - # no best view in mucumbo but all view - # ind_view = self.best_views_[ind_estimator] - ind_view = ind_estimator % self.n_views_ - predictions[:, ind_estimator] \ - = estimator.predict(self._extract_view(X, ind_view)) - return predictions - - def _validate_views_ind(self, views_ind, n_features): - """Ensure proper format for views_ind and return number of views.""" - views_ind = np.array(views_ind) - if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: - if np.any(views_ind[:-1] >= views_ind[1:]): - raise ValueError("Values in views_ind must be sorted.") - if views_ind[0] < 0 or views_ind[-1] > n_features: - raise ValueError("Values in views_ind are not in a correct " - + "range for the provided data.") - self.view_mode_ = "slices" - n_views = views_ind.shape[0]-1 - else: - if views_ind.ndim == 1: - if not views_ind.dtype == np.object: - raise ValueError("The format of views_ind is not " - + "supported.") - for ind, val in enumerate(views_ind): - views_ind[ind] = np.array(val) - if not np.issubdtype(views_ind[ind].dtype, np.integer): - raise ValueError("Values in views_ind must be " - + "integers.") - if views_ind[ind].min() < 0 \ - or views_ind[ind].max() >= n_features: - raise ValueError("Values in views_ind are not in a " - + "correct range for the provided " - + "data.") - elif views_ind.ndim == 2: - if not np.issubdtype(views_ind.dtype, np.integer): - raise ValueError("Values in views_ind must be integers.") - if views_ind.min() < 0 or views_ind.max() >= n_features: - raise ValueError("Values in views_ind are not in a " - + "correct range for the provided data.") - else: - raise ValueError("The format of views_ind is not supported.") - self.view_mode_ = "indices" - n_views = views_ind.shape[0] - return (views_ind, n_views) - - # def _validate_best_view_mode(self, best_view_mode): - # """Ensure that best_view_mode has a proper value.""" - # if best_view_mode not in ("edge", "error"): - # raise ValueError('best_view_mode value must be either "edge" ' - # + 'or "error"') - # return best_view_mode - def _init_var(self, n_views, y): "Create and initialize the variables used by the MuMBo algorithm." n_classes = self.n_classes_ @@ -279,9 +201,6 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): n_yi_s[indice_class] = int(n_yi) cost[:, :, indice_class] /= n_yi cost[:, np.arange(n_samples), y] *= -(n_classes-1) - # not necessary in mucombo - # cost_global = np.ones((n_samples, n_classes)) - # cost_global[np.arange(n_samples), y] = -(n_classes-1) label_score = np.zeros((n_views, n_samples, n_classes)) label_score_global = np.zeros((n_samples, n_classes)) predicted_classes = np.empty((n_views, n_samples), dtype=np.int64) @@ -289,15 +208,6 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): return (cost, label_score, label_score_global, predicted_classes, score_function, beta_class, n_yi_s) - # def _compute_edge_global(self, cost_global, predicted_classes, y): - # """Compute edge values for the global cost matrix.""" - # n_samples = y.shape[0] - # edge_global = - np.sum( - # cost_global[np.arange(n_samples), predicted_classes], axis=1) \ - # / (np.sum(cost_global) - # - np.sum(cost_global[np.arange(n_samples), y])) - # return edge_global - def _compute_dist(self, cost, y): """Compute the sample distribution (i.e. the weights to use).""" n_samples = y.shape[0] @@ -312,13 +222,6 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): / np.sum(cost[:, np.arange(n_samples), y], axis=1)[:, np.newaxis] return dist - # def _compute_coop_coef(self, predicted_classes, y): - # """Compute the cooperation coefficients.""" - # coop_coef = np.zeros(predicted_classes.shape) - # coop_coef[predicted_classes == y] = 1. - # coop_coef[:, np.logical_not(coop_coef.any(axis=0))] = 1. - # return coop_coef - def _indicatrice(self, predicted_classes, y_i): n_samples = y_i.shape[0] indicate_ones = np.zeros((self.n_views_, n_samples, self.n_classes_), dtype=np.int) @@ -464,6 +367,19 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): print("Value Error on the evaluation on beta coefficient %s "% e) return solver + def _compute_predictions(self, X): + """Compute predictions for all the stored estimators on the data X.""" + n_samples = X.shape[0] + n_estimators = len(self.estimators_) + predictions = np.zeros((n_samples, n_estimators), dtype=np.int64) + for ind_estimator, estimator in enumerate(self.estimators_): + # no best view in mucumbo but all view + # ind_view = self.best_views_[ind_estimator] + ind_view = ind_estimator % self.n_views_ + predictions[:, ind_estimator] \ + = estimator.predict(X._extract_view(ind_view)) + return predictions + def fit(self, X, y, views_ind=None): """Build a multimodal boosted classifier from the training set (X, y). @@ -516,17 +432,19 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): else: dtype = None accept_sparse = ['csr', 'csc'] - X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype) - check_classification_targets(y) - self._validate_estimator() if views_ind is None: if X.shape[1] > 1: views_ind = np.array([0, X.shape[1]//2, X.shape[1]]) else: views_ind = np.array([0, X.shape[1]]) - self.X_ = Metriclearn_array(X, view_ind=views_ind) - self.views_ind_, n_views = self._validate_views_ind(views_ind, - X.shape[1]) + + self.X_ = self._global_X_transform(X, views_ind=views_ind) + views_ind_, n_views = self.X_._validate_views_ind(views_ind, + X.shape[1]) + check_X_y(self.X_, y, accept_sparse=accept_sparse, dtype=dtype) + check_classification_targets(y) + self._validate_estimator() + self.n_iterations_ = self.n_estimators // n_views self.classes_, y = np.unique(y, return_inverse=True) self.n_classes_ = len(self.classes_) @@ -560,13 +478,13 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): for ind_view in range(n_views): estimator = self._make_estimator(append=False, random_state=random_state) - estimator.fit(self._extract_view(X, ind_view), y, + estimator.fit(self.X_._extract_view(ind_view), y, sample_weight=dist[ind_view, :]) predicted_classes[ind_view, :] = estimator.predict( - self._extract_view(X, ind_view)) + self.X_._extract_view(ind_view)) self.estimators_.append(estimator) - # fin de choose cost matrix + # end of choose cost matrix # TO DO estimator_errors_ estimate ########################################### @@ -609,7 +527,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): ``classes_``. """ check_is_fitted(self, ("estimators_", "estimator_weights_alpha_","n_views_", - "estimator_weights_beta_", "n_classes_", "views_ind_")) + "estimator_weights_beta_", "n_classes_")) X = self._validate_X_predict(X) n_samples = X.shape[0] @@ -662,7 +580,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): ``classes_``. """ check_is_fitted(self, ("estimators_", "estimator_weights_alpha_","n_views_", - "estimator_weights_beta_", "n_classes_", "views_ind_")) + "estimator_weights_beta_", "n_classes_")) X = self._validate_X_predict(X) n_samples = X.shape[0] @@ -687,7 +605,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): else: yield np.array(dec_func) - def predict(self, X): + def predict(self, X, views_ind=None): """Predict classes for X. The predicted class of an input sample is computed as the weighted mean @@ -710,6 +628,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): ValueError 'X' input matrix must be have the same total number of features of 'X' fit data """ + X = self._global_X_transform(X, views_ind=views_ind) pred = self.decision_function(X) if self.n_classes_ == 2: @@ -739,9 +658,10 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): y : generator of numpy.ndarrays, shape = (n_samples,) Predicted classes. """ + n_classes = self.n_classes_ classes = self.classes_ - + X = self._validate_X_predict(X) if n_classes == 2: for pred in self.staged_decision_function(X): yield np.array(classes.take(pred > 0, axis=0)) @@ -766,6 +686,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin): score : float Mean accuracy of self.predict(X) wrt. y. """ + X = self._validate_X_predict(X) return super(MuCumboClassifier, self).score(X, y) def staged_score(self, X, y): diff --git a/multimodal/boosting/mumbo.py b/multimodal/boosting/mumbo.py index fd7bd42e2e3bac4c1706faac4feae1ad92498dff..c2fb588f92208258786c13ebeda75b31138c21d2 100644 --- a/multimodal/boosting/mumbo.py +++ b/multimodal/boosting/mumbo.py @@ -42,9 +42,10 @@ from sklearn.tree._tree import DTYPE from sklearn.utils import check_array, check_X_y, check_random_state from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import check_is_fitted, has_fit_parameter +from .boost import UBoosting -class MumboClassifier(BaseEnsemble, ClassifierMixin): +class MumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): r"""It then iterates the process on the same dataset but where the weights of incorrectly classified instances are adjusted such that subsequent classifiers focus more on difficult cases. @@ -106,7 +107,7 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin): Examples -------- - >>> from multimodalboost.mumbo import MumboClassifier + >>> from multimodal.boosting.mumbo import MumboClassifier >>> from sklearn.datasets import load_iris >>> X, y = load_iris(return_X_y=True) >>> views_ind = [0, 2, 4] # view 0: sepal data, view 1: petal data @@ -175,75 +176,6 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin): raise ValueError("%s doesn't support sample_weight." % self.base_estimator_.__class__.__name__) - def _validate_X_predict(self, X): - """Ensure that X is in the proper format.""" - if (self.base_estimator is None or - isinstance(self.base_estimator, - (BaseDecisionTree, BaseForest))): - X = check_array(X, accept_sparse='csr', dtype=DTYPE) - - else: - X = check_array(X, accept_sparse=['csr', 'csc']) - if X.shape[1] != self.n_features_: - raise ValueError("X doesn't contain the right number of features.") - return X - - def _extract_view(self, X, ind_view): - """Extract the view for the given index ind_view from the dataset X.""" - if self.view_mode_ == "indices": - return X[:, self.views_ind_[ind_view]] - else: - return X[:, self.views_ind_[ind_view]:self.views_ind_[ind_view+1]] - - def _compute_predictions(self, X): - """Compute predictions for all the stored estimators on the data X.""" - n_samples = X.shape[0] - n_estimators = len(self.estimators_) - predictions = np.zeros((n_samples, n_estimators), dtype=np.int64) - for ind_estimator, estimator in enumerate(self.estimators_): - ind_view = self.best_views_[ind_estimator] - predictions[:, ind_estimator] \ - = estimator.predict(self._extract_view(X, ind_view)) - return predictions - - def _validate_views_ind(self, views_ind, n_features): - """Ensure proper format for views_ind and return number of views.""" - views_ind = np.array(views_ind) - if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: - if np.any(views_ind[:-1] >= views_ind[1:]): - raise ValueError("Values in views_ind must be sorted.") - if views_ind[0] < 0 or views_ind[-1] > n_features: - raise ValueError("Values in views_ind are not in a correct " - + "range for the provided data.") - self.view_mode_ = "slices" - n_views = views_ind.shape[0]-1 - else: - if views_ind.ndim == 1: - if not views_ind.dtype == np.object: - raise ValueError("The format of views_ind is not " - + "supported.") - for ind, val in enumerate(views_ind): - views_ind[ind] = np.array(val) - if not np.issubdtype(views_ind[ind].dtype, np.integer): - raise ValueError("Values in views_ind must be " - + "integers.") - if views_ind[ind].min() < 0 \ - or views_ind[ind].max() >= n_features: - raise ValueError("Values in views_ind are not in a " - + "correct range for the provided " - + "data.") - elif views_ind.ndim == 2: - if not np.issubdtype(views_ind.dtype, np.integer): - raise ValueError("Values in views_ind must be integers.") - if views_ind.min() < 0 or views_ind.max() >= n_features: - raise ValueError("Values in views_ind are not in a " - + "correct range for the provided data.") - else: - raise ValueError("The format of views_ind is not supported.") - self.view_mode_ = "indices" - n_views = views_ind.shape[0] - return (views_ind, n_views) - def _validate_best_view_mode(self, best_view_mode): """Ensure that best_view_mode has a proper value.""" if best_view_mode not in ("edge", "error"): @@ -353,6 +285,17 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin): cost[:, np.arange(n_samples), y] -= np.sum(cost, axis=2) return (cost, label_score) + def _compute_predictions(self, X): + """Compute predictions for all the stored estimators on the data X.""" + n_samples = X.shape[0] + n_estimators = len(self.estimators_) + predictions = np.zeros((n_samples, n_estimators), dtype=np.int64) + for ind_estimator, estimator in enumerate(self.estimators_): + ind_view = self.best_views_[ind_estimator] + predictions[:, ind_estimator] \ + = estimator.predict(X._extract_view(ind_view)) + return predictions + def fit(self, X, y, views_ind=None): """Build a multimodal boosted classifier from the training set (X, y). @@ -400,9 +343,6 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin): else: dtype = None accept_sparse = ['csr', 'csc'] - X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype) - check_classification_targets(y) - self._validate_estimator() if views_ind is None: if X.shape[1] > 1: views_ind = np.array([0, X.shape[1]//2, X.shape[1]]) @@ -410,6 +350,10 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin): views_ind = np.array([0, X.shape[1]]) self.views_ind_, n_views = self._validate_views_ind(views_ind, X.shape[1]) + self.X_ = self._global_X_transform(X, views_ind=self.views_ind_) + check_X_y(self.X_, y, accept_sparse=accept_sparse, dtype=dtype) + check_classification_targets(y) + self._validate_estimator() self.classes_, y = np.unique(y, return_inverse=True) self.n_classes_ = len(self.classes_) @@ -441,11 +385,11 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin): for ind_view in range(n_views): estimator = self._make_estimator(append=False, random_state=random_state) - estimator.fit(self._extract_view(X, ind_view), y, + estimator.fit(self.X_._extract_view(ind_view), y, sample_weight=dist[ind_view, :]) estimators.append(estimator) predicted_classes[ind_view, :] = estimator.predict( - self._extract_view(X, ind_view)) + self.X_._extract_view(ind_view)) edges = self._compute_edge_global( cost_global, predicted_classes, y) diff --git a/multimodal/datasets/__pycache__/data_sample.cpython-36.pyc b/multimodal/datasets/__pycache__/data_sample.cpython-36.pyc index 0dd01d941e19a920b48b248b2107c9ec6586b3b1..0a201db1c4aaece161b8408837f5bd47c3c60df1 100644 Binary files a/multimodal/datasets/__pycache__/data_sample.cpython-36.pyc and b/multimodal/datasets/__pycache__/data_sample.cpython-36.pyc differ diff --git a/multimodal/datasets/data_sample.py b/multimodal/datasets/data_sample.py index e6bcf8189646fe50fdef6de07bcce86b83635096..fbcf5aa3b245cfb82ded36cc5fb5bcf33aee160e 100644 --- a/multimodal/datasets/data_sample.py +++ b/multimodal/datasets/data_sample.py @@ -26,9 +26,9 @@ import numpy as np import numpy.ma as ma -class Metriclearn_array(ma.MaskedArray, np.ndarray): +class MultiModalArray(ma.MaskedArray, np.ndarray): """ - Metriclearn_array inherit from numpy ndarray + MultiModalArray inherit from numpy ndarray Parameters @@ -74,21 +74,21 @@ class Metriclearn_array(ma.MaskedArray, np.ndarray): :Example: - >>> from metriclearning.datasets.base import load_dict - >>> from metriclearning.tests.datasets.get_dataset_path import get_dataset_path - >>> from metriclearning.datasets.data_sample import DataSample + >>> from multimodal.datasets.base import load_dict + >>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path + >>> from multimodal.datasets.data_sample import DataSample >>> file = 'input_x_dic.pkl' >>> data = load_dict(get_dataset_path(file)) >>> print(data.__class__) <class 'dict'> - >>> metric = Metriclearn_array(data) - >>> metric.shape + >>> multiviews = MultiModalArray(data) + >>> multiviews.shape (120, 240) - >>> metric.keys + >>> multiviews.keys dict_keys([0, 1]) - >>> metric.shapes_int + >>> multiviews.shapes_int [120, 120] - >>> metric.n_views + >>> multiviews.n_views 2 @@ -126,7 +126,7 @@ class Metriclearn_array(ma.MaskedArray, np.ndarray): view_ind = np.array([0, data.shape[1]//2, data.shape[1]]) else: view_ind = np.array([0, data.shape[1]]) - view_ind, n_views = cls._validate_views_ind(view_ind, + view_ind, n_views = cls._first_validate_views_ind(view_ind, data.shape[1]) shapes_int = [ in2-in1 for in1, in2 in zip(view_ind, view_ind[1: ])] new_data = data @@ -164,11 +164,11 @@ class Metriclearn_array(ma.MaskedArray, np.ndarray): def __array_finalize__(self, obj): if obj is None: return - super(Metriclearn_array, self).__array_finalize__(obj) + super(MultiModalArray, self).__array_finalize__(obj) self.shapes_int = getattr(obj, 'shapes_int', None) self.n_views = getattr(obj, 'n_views', None) self.keys = getattr(obj, 'keys', None) - self.views_ind_self = getattr(obj, 'views_ind_self', None) + self.views_ind = getattr(obj, 'views_ind', None) def get_col(self, view, col): start = np.sum(np.asarray(self.shapes_int[0: view])) @@ -179,6 +179,13 @@ class Metriclearn_array(ma.MaskedArray, np.ndarray): stop = int(start + self.shapes_int[view]) return self.data[:, start:stop] + def _extract_view(self, ind_view): + """Extract the view for the given index ind_view from the dataset X.""" + if self.view_mode_ == "indices": + return self.data[:, self.views_ind[ind_view]] + else: + return self.data[:, self.views_ind[ind_view]:self.views_ind[ind_view+1]] + def set_view(self, view, data): start = int(np.sum(np.asarray(self.shapes_int[0: view]))) stop = int(start + self.shapes_int[view]) @@ -214,7 +221,7 @@ class Metriclearn_array(ma.MaskedArray, np.ndarray): return dico @staticmethod - def _validate_views_ind(views_ind, n_features): + def _first_validate_views_ind(views_ind, n_features): """Ensure proper format for views_ind and return number of views.""" views_ind = np.array(views_ind) if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: @@ -231,6 +238,46 @@ class Metriclearn_array(ma.MaskedArray, np.ndarray): return (views_ind, n_views) + def _validate_views_ind(self, views_ind, n_features): + """Ensure proper format for views_ind and return number of views.""" + views_ind = np.array(views_ind) + if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: + if np.any(views_ind[:-1] >= views_ind[1:]): + raise ValueError("Values in views_ind must be sorted.") + if views_ind[0] < 0 or views_ind[-1] > n_features: + raise ValueError("Values in views_ind are not in a correct " + + "range for the provided data.") + self.view_mode_ = "slices" + n_views = views_ind.shape[0]-1 + else: + if views_ind.ndim == 1: + if not views_ind.dtype == np.object: + raise ValueError("The format of views_ind is not " + + "supported.") + for ind, val in enumerate(views_ind): + views_ind[ind] = np.array(val) + if not np.issubdtype(views_ind[ind].dtype, np.integer): + raise ValueError("Values in views_ind must be " + + "integers.") + if views_ind[ind].min() < 0 \ + or views_ind[ind].max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided " + + "data.") + elif views_ind.ndim == 2: + if not np.issubdtype(views_ind.dtype, np.integer): + raise ValueError("Values in views_ind must be integers.") + if views_ind.min() < 0 or views_ind.max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided data.") + else: + raise ValueError("The format of views_ind is not supported.") + self.view_mode_ = "indices" + n_views = views_ind.shape[0] + self.views_ind = views_ind + self.n_views = n_views + return (views_ind, n_views) + class DataSample(dict): """ A DataSample instance @@ -247,7 +294,7 @@ class DataSample(dict): <class 'dict'> >>> s = DataSample(data) >>> type(s.data) - <class 'metriclearning.datasets.data_sample.Metriclearn_array'> + <class 'multimodal.datasets.data_sample.MultiModalArray'> - Input: @@ -260,7 +307,7 @@ class DataSample(dict): Attributes ---------- - data : { array like} Metriclearn_array + data : { array like} MultiModalArray """ def __init__(self, data=None, **kwargs): @@ -270,7 +317,7 @@ class DataSample(dict): super(DataSample, self).__init__(kwargs) self._data = None # Metriclearn_array(np.zeros((0,0))) if data is not None: - self._data = Metriclearn_array(data) + self._data = MultiModalArray(data) @property @@ -281,10 +328,10 @@ class DataSample(dict): @data.setter def data(self, data): - if isinstance(data, (Metriclearn_array, np.ndarray, ma.MaskedArray, np.generic)): + if isinstance(data, (MultiModalArray, np.ndarray, ma.MaskedArray, np.generic)): self._data = data else: - raise TypeError("sample should be a Metriclearn_array.") + raise TypeError("sample should be a MultiModalArray or numpy array.") diff --git a/multimodal/kernels/lpMKL.py b/multimodal/kernels/lpMKL.py index 20ccb24fb2e85b5ab2ef1d4a1cf2d73fd567743f..bdcfe13becea9c0bcfcde3009bbd24b488698ffa 100644 --- a/multimodal/kernels/lpMKL.py +++ b/multimodal/kernels/lpMKL.py @@ -5,7 +5,6 @@ from sklearn.utils.multiclass import unique_labels from sklearn.utils.validation import check_X_y from sklearn.utils.validation import check_array from sklearn.utils.validation import check_is_fitted -from metriclearning.datasets.data_sample import DataSample, Metriclearn_array from metriclearning.mkernel import MKernel diff --git a/multimodal/kernels/mkernel.py b/multimodal/kernels/mkernel.py index 9c4644ce22afc0611d5c846e1cdd42f94cda0e16..ac1ef5ce8fc40e9e03bdf92896978b1afa51779d 100644 --- a/multimodal/kernels/mkernel.py +++ b/multimodal/kernels/mkernel.py @@ -2,7 +2,7 @@ import numpy as np import scipy as sp from sklearn.metrics.pairwise import pairwise_kernels from abc import ABCMeta -from metriclearning.datasets.data_sample import DataSample, Metriclearn_array +from multimodal.datasets.data_sample import DataSample, MultiModalArray class MKernel(metaclass=ABCMeta): @@ -36,26 +36,26 @@ class MKernel(metaclass=ABCMeta): if Y is None: y = Y if isinstance(X, np.ndarray) and X.ndim == 1: - X_= Metriclearn_array(X, views_ind) + X_= MultiModalArray(X, views_ind) for v in range(X.shape[0]): if Y is not None: y = Y.get_view(v) # y = self._global_check_pairwise(X_, Y, v) kernel_dict[v] = self._get_kernel(X[v], y) elif isinstance(X, dict): - X_= Metriclearn_array(X) + X_= MultiModalArray(X) for v in X.keys(): if Y is not None: y = Y.get_view(v) # y = self._global_check_pairwise(X_, Y, v) kernel_dict[v] = self._get_kernel(X[v], y) elif isinstance(X, np.ndarray) and X.ndim > 1: - X_ = Metriclearn_array(X, views_ind) + X_ = MultiModalArray(X, views_ind) X = X_ - if isinstance(X, Metriclearn_array): + if isinstance(X, MultiModalArray): for v in range(X.n_views): if Y is not None: y = Y.get_view(v) # y = self._global_check_pairwise(X, Y, v) kernel_dict[v] = self._get_kernel(X.get_view(v), y) X_= X - if not isinstance(X_, Metriclearn_array): + if not isinstance(X_, MultiModalArray): raise TypeError("Input format is not reconized") - K_ = Metriclearn_array(kernel_dict) + K_ = MultiModalArray(kernel_dict) return X_, K_ def _calc_nystrom(self, kernels, n_approx):