From 100b147b8b58ee7a954c12232fa500331ad4e065 Mon Sep 17 00:00:00 2001 From: Dominique Benielli <dominique.benielli@lis-lab.fr> Date: Fri, 10 Jan 2020 20:21:48 +0100 Subject: [PATCH] test still not ok but improve --- multimodal/boosting/boost.py | 79 +-- multimodal/boosting/cumbo.py | 29 +- multimodal/boosting/mumbo.py | 32 +- multimodal/datasets/__init__.py | 4 +- .../__pycache__/__init__.cpython-36.pyc | Bin 319 -> 304 bytes .../datasets/__pycache__/base.cpython-36.pyc | Bin 1258 -> 1249 bytes .../__pycache__/data_sample.cpython-36.pyc | Bin 10701 -> 16022 bytes multimodal/datasets/base.py | 2 +- multimodal/datasets/data_sample.py | 490 ++++++++++++++---- multimodal/kernels/__init__.py | 2 +- multimodal/kernels/lpMKL.py | 2 +- multimodal/kernels/mvml.py | 8 +- multimodal/tests/test.py | 224 ++++++++ multimodal/tests/test_cumbo.py | 4 +- multimodal/tests/test_data_sample.py | 14 +- multimodal/tests/test_mkl.py | 12 +- multimodal/tests/test_mumbo.py | 31 +- multimodal/tests/test_mvml.py | 18 +- 18 files changed, 732 insertions(+), 219 deletions(-) create mode 100644 multimodal/tests/test.py diff --git a/multimodal/boosting/boost.py b/multimodal/boosting/boost.py index 7de84ee..7b039e6 100644 --- a/multimodal/boosting/boost.py +++ b/multimodal/boosting/boost.py @@ -1,11 +1,14 @@ import numpy as np +import scipy.sparse as sp from abc import ABCMeta from sklearn.utils import check_array, check_X_y, check_random_state from sklearn.tree import DecisionTreeClassifier from sklearn.tree.tree import BaseDecisionTree from sklearn.tree._tree import DTYPE from sklearn.ensemble.forest import BaseForest -from multimodal.datasets.data_sample import DataSample, MultiModalArray +from multimodal.datasets.data_sample import DataSample +from multimodal.datasets.data_sample import MultiModalData, MultiModalArray, MultiModalSparseArray + class UBoosting(metaclass=ABCMeta): """ @@ -22,60 +25,32 @@ class UBoosting(metaclass=ABCMeta): else: check_array(X, accept_sparse=['csr', 'csc']) - if X.shape[1] != self.n_features_: - raise ValueError("X doesn't contain the right number of features.") - return X + if X.ndim < 2: + mes = "Reshape your data" + raise ValueError(mes) + if X.ndim > 1: + if X.shape[1] != self.n_features_: + mes = "Reshape your data" + raise ValueError("Number of features of the model must " + "match the input. 
Model n_features is %s and " + "input n_features is %s " % (self.n_features_, X.shape[1])) - def _validate_views_ind(self, views_ind, n_features): - """Ensure proper format for views_ind and return number of views.""" - views_ind = np.array(views_ind) - if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: - if np.any(views_ind[:-1] >= views_ind[1:]): - raise ValueError("Values in views_ind must be sorted.") - if views_ind[0] < 0 or views_ind[-1] > n_features: - raise ValueError("Values in views_ind are not in a correct " - + "range for the provided data.") - self.view_mode_ = "slices" - n_views = views_ind.shape[0]-1 - else: - if views_ind.ndim == 1: - if not views_ind.dtype == np.object: - raise ValueError("The format of views_ind is not " - + "supported.") - for ind, val in enumerate(views_ind): - views_ind[ind] = np.array(val) - if not np.issubdtype(views_ind[ind].dtype, np.integer): - raise ValueError("Values in views_ind must be " - + "integers.") - if views_ind[ind].min() < 0 \ - or views_ind[ind].max() >= n_features: - raise ValueError("Values in views_ind are not in a " - + "correct range for the provided " - + "data.") - elif views_ind.ndim == 2: - if not np.issubdtype(views_ind.dtype, np.integer): - raise ValueError("Values in views_ind must be integers.") - if views_ind.min() < 0 or views_ind.max() >= n_features: - raise ValueError("Values in views_ind are not in a " - + "correct range for the provided data.") - else: - raise ValueError("The format of views_ind is not supported.") - self.view_mode_ = "indices" - n_views = views_ind.shape[0] - return (views_ind, n_views) + # + # raise ValueError(mes) + return X def _global_X_transform(self, X, views_ind=None): X_ = None - if isinstance(X, np.ndarray) and X.ndim == 1: - X_= MultiModalArray(X, views_ind) - elif isinstance(X, dict): - X_= MultiModalArray(X) - elif isinstance(X, np.ndarray) and X.ndim > 1: + if isinstance(X, sp.spmatrix): + X_ = MultiModalSparseArray(X, views_ind) + else: X_ = MultiModalArray(X, views_ind) - if not isinstance(X_, MultiModalArray): - raise TypeError("Input format is not reconized") - if hasattr(self, "X_"): - if not self.X_.viexs_ind == views_ind: - raise ValueError("Input format (viewd, features) for fit and predict must be the same") - return X_ \ No newline at end of file + if isinstance(X, MultiModalData): + X_ = X + if not isinstance(X_, MultiModalData): + try: + X_ = np.asarray(X) + except Exception as e: + raise TypeError('Reshape your data') + return X_ diff --git a/multimodal/boosting/cumbo.py b/multimodal/boosting/cumbo.py index 3b22496..0f928df 100644 --- a/multimodal/boosting/cumbo.py +++ b/multimodal/boosting/cumbo.py @@ -78,13 +78,6 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): If None, the random number generator is the RandomState instance used by `np.random`. - best_view_mode : {"edge", "error"}, optional (default="edge") - Mode used to select the best view at each iteration: - - - if ``best_view_mode == "edge"``, the best view is the view maximizing - the edge value (variable δ (*delta*) in [1]_), - - if ``best_view_mode == "error"``, the best view is the view - minimizing the classification error. 
Attributes ---------- @@ -120,15 +113,13 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): >>> views_ind = [0, 2, 4] # view 0: sepal data, view 1: petal data >>> clf = MuCumboClassifier(random_state=0) >>> clf.fit(X, y, views_ind) # doctest: +NORMALIZE_WHITESPACE - MumboClassifier(base_estimator=None, best_view_mode='edge', - n_estimators=50, random_state=0) + >>> print(clf.predict([[ 5., 3., 1., 1.]])) [1] >>> views_ind = [[0, 2], [1, 3]] # view 0: length data, view 1: width data >>> clf = MuCumboClassifier(random_state=0) >>> clf.fit(X, y, views_ind) # doctest: +NORMALIZE_WHITESPACE - MumboClassifier(base_estimator=None, best_view_mode='edge', - n_estimators=50, random_state=0) + >>> print(clf.predict([[ 5., 3., 1., 1.]])) [1] @@ -136,13 +127,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): >>> base_estimator = DecisionTreeClassifier(max_depth=2) >>> clf = MuCumboClassifier(base_estimator=base_estimator, random_state=0) >>> clf.fit(X, y, views_ind) # doctest: +NORMALIZE_WHITESPACE - MumboClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, - criterion='gini', max_depth=2, max_features=None, - max_leaf_nodes=None, min_impurity_decrease=0.0, - min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, - min_weight_fraction_leaf=0.0, presort=False, random_state=None, - splitter='best'), - best_view_mode='edge', n_estimators=50, random_state=0) + >>> print(clf.predict([[ 5., 3., 1., 1.]])) [1] @@ -176,7 +161,6 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): base_estimator=base_estimator, n_estimators=n_estimators) self.random_state = random_state - # self.best_view_mode = self._validate_best_view_mode(best_view_mode) def _validate_estimator(self): """Check the estimator and set the base_estimator_ attribute.""" @@ -527,7 +511,8 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): ``classes_``. """ check_is_fitted(self, ("estimators_", "estimator_weights_alpha_","n_views_", - "estimator_weights_beta_", "n_classes_")) + "estimator_weights_beta_", "n_classes_", "X_")) + X = self._global_X_transform(X, views_ind=self.X_.views_ind) X = self._validate_X_predict(X) n_samples = X.shape[0] @@ -581,6 +566,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): """ check_is_fitted(self, ("estimators_", "estimator_weights_alpha_","n_views_", "estimator_weights_beta_", "n_classes_")) + X = self._global_X_transform(X, views_ind=self.X_.views_ind) X = self._validate_X_predict(X) n_samples = X.shape[0] @@ -605,7 +591,7 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): else: yield np.array(dec_func) - def predict(self, X, views_ind=None): + def predict(self, X): """Predict classes for X. The predicted class of an input sample is computed as the weighted mean @@ -628,7 +614,6 @@ class MuCumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): ValueError 'X' input matrix must be have the same total number of features of 'X' fit data """ - X = self._global_X_transform(X, views_ind=views_ind) pred = self.decision_function(X) if self.n_classes_ == 2: diff --git a/multimodal/boosting/mumbo.py b/multimodal/boosting/mumbo.py index c2fb588..f2c522d 100644 --- a/multimodal/boosting/mumbo.py +++ b/multimodal/boosting/mumbo.py @@ -32,6 +32,7 @@ estimator for classification implemented in the ``MumboClassifier`` class. # structure, notations and behavior where possible. 
import numpy as np + from sklearn.base import ClassifierMixin from sklearn.ensemble import BaseEnsemble from sklearn.ensemble.forest import BaseForest @@ -343,22 +344,23 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): else: dtype = None accept_sparse = ['csr', 'csc'] - if views_ind is None: - if X.shape[1] > 1: - views_ind = np.array([0, X.shape[1]//2, X.shape[1]]) - else: - views_ind = np.array([0, X.shape[1]]) - self.views_ind_, n_views = self._validate_views_ind(views_ind, - X.shape[1]) - self.X_ = self._global_X_transform(X, views_ind=self.views_ind_) + # if views_ind is None: + # if X.shape[1] > 1: + # views_ind = np.array([0, X.shape[1]//2, X.shape[1]]) + # elif X.shape[1]==1: + # views_ind = np.array([0, X.shape[1]]) + # else: + # views_ind = np.array([0]) + self.X_ = self._global_X_transform(X, views_ind=views_ind) + views_ind_, n_views = self.X_._validate_views_ind(self.X_.views_ind, + self.X_.shape[1]) check_X_y(self.X_, y, accept_sparse=accept_sparse, dtype=dtype) check_classification_targets(y) self._validate_estimator() self.classes_, y = np.unique(y, return_inverse=True) self.n_classes_ = len(self.classes_) - self.n_features_ = X.shape[1] - + self.n_features_ = self.X_.shape[1] if self.n_classes_ == 1: # This case would lead to division by 0 when computing the cost # matrix so it needs special handling (but it is an obvious case as @@ -458,7 +460,8 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): ``classes_``. """ check_is_fitted(self, ("estimators_", "estimator_weights_", - "best_views_", "n_classes_", "views_ind_")) + "best_views_", "n_classes_", "X_")) + X = self._global_X_transform(X, views_ind=self.X_.views_ind) X = self._validate_X_predict(X) n_samples = X.shape[0] @@ -504,7 +507,8 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): ``classes_``. """ check_is_fitted(self, ("estimators_", "estimator_weights_", - "n_classes_", "views_ind_")) + "n_classes_", "X_")) + X = self._global_X_transform(X, views_ind=self.X_.views_ind) X = self._validate_X_predict(X) n_samples = X.shape[0] @@ -542,6 +546,7 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): y : numpy.ndarray, shape = (n_samples,) Predicted classes. """ + pred = self.decision_function(X) if self.n_classes_ == 2: @@ -588,8 +593,7 @@ class MumboClassifier(BaseEnsemble, ClassifierMixin, UBoosting): ---------- X : {array-like, sparse matrix} of shape = (n_samples, n_features) Multi-view test samples. - Sparse matrix can be CSC, CSR, COO, DOK, or LIL. - COO, DOK and LIL are converted to CSR. + Sparse matrix can be CSC, CSR y : array-like, shape = (n_samples,) True labels for X. 
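With these changes views_ind is consumed only at fit time: _global_X_transform wraps X into a MultiModalArray (or a MultiModalSparseArray for scipy.sparse input) and stores it as self.X_, and predict/decision_function then reuse self.X_.views_ind instead of taking their own views_ind argument. A minimal usage sketch under that assumption (the toy data and parameter values below are illustrative only, not taken from the patch):

import numpy as np
from multimodal.boosting.mumbo import MumboClassifier

# Two views laid out side by side: columns 0-1 form view 0, columns 2-3 form view 1.
X = np.array([[3., 0., 1., 2.],
              [0., 1., 2., 0.],
              [1., 1., 0., 1.],
              [2., 1., 1., 2.]])
y = np.array([1, 1, 2, 2])
views_ind = [0, 2, 4]

clf = MumboClassifier(n_estimators=5, random_state=0)
clf.fit(X, y, views_ind)   # X is wrapped into a MultiModalArray and kept as clf.X_
print(clf.predict(X))      # no views_ind here: the indices stored on clf.X_ are reused
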
diff --git a/multimodal/datasets/__init__.py b/multimodal/datasets/__init__.py
index 3bffe20..8b45df1 100644
--- a/multimodal/datasets/__init__.py
+++ b/multimodal/datasets/__init__.py
@@ -1,2 +1,2 @@
-from metriclearning.datasets.base import *
-from metriclearning.datasets.data_sample import DataSample, Metriclearn_array
\ No newline at end of file
+from multimodal.datasets.base import *
+from multimodal.datasets.data_sample import DataSample, MultiModalArray
\ No newline at end of file
diff --git a/multimodal/datasets/__pycache__/__init__.cpython-36.pyc b/multimodal/datasets/__pycache__/__init__.cpython-36.pyc
index 71e809afc1d58e7dd739fc5d6790139d3a7a4230..78203c1c83371c7711d386af2cea6da83faec7cb 100644
Binary files a/multimodal/datasets/__pycache__/__init__.cpython-36.pyc and b/multimodal/datasets/__pycache__/__init__.cpython-36.pyc differ
diff --git a/multimodal/datasets/__pycache__/base.cpython-36.pyc b/multimodal/datasets/__pycache__/base.cpython-36.pyc
index c952741223ea079326c463e7176cb64f88b719a4..5d08b5d131a62ac27ac9450f9ba3386c212792bd 100644
Binary files a/multimodal/datasets/__pycache__/base.cpython-36.pyc and b/multimodal/datasets/__pycache__/base.cpython-36.pyc differ
diff --git a/multimodal/datasets/__pycache__/data_sample.cpython-36.pyc b/multimodal/datasets/__pycache__/data_sample.cpython-36.pyc
index 0a201db1c4aaece161b8408837f5bd47c3c60df1..aad5511d634098208d0e9aa1cb41329d923a21d6 100644
Binary files a/multimodal/datasets/__pycache__/data_sample.cpython-36.pyc and b/multimodal/datasets/__pycache__/data_sample.cpython-36.pyc differ
diff --git a/multimodal/datasets/base.py b/multimodal/datasets/base.py
index 0e033e7..3d58c39 100644
--- a/multimodal/datasets/base.py
+++ b/multimodal/datasets/base.py
@@ -1,7 +1,7 @@
 from __future__ import print_function
 import numpy as np
 import numpy.ma as ma
-from metriclearning.datasets.data_sample import DataSample
+from multimodal.datasets.data_sample import DataSample
 from six.moves import cPickle as pickle #for performance
diff --git a/multimodal/datasets/data_sample.py b/multimodal/datasets/data_sample.py
index fbcf5aa..ed3ab6d 100644
--- a/multimodal/datasets/data_sample.py
+++ b/multimodal/datasets/data_sample.py
@@ -22,11 +22,337 @@ xxxxxxxx xxxx xxxx xxxx
 the number nbL and nbEx and , the fourth dictionaries for sample,
 prefix, suffix and factor where they are computed
 """
+from abc import ABCMeta
 import numpy as np
 import numpy.ma as ma
+import scipy.sparse as sp
 
+class MultiModalData(metaclass=ABCMeta):
 
-class MultiModalArray(ma.MaskedArray, np.ndarray):
+    @staticmethod
+    def _first_validate_views_ind(views_ind, n_features):
+        """Ensure proper format for views_ind and return number of views."""
+        views_ind = np.array(views_ind)
+
+        if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1:
+            if len(views_ind) > 2 and np.any(views_ind[:-1] >= views_ind[1:]):
+                raise ValueError("Values in views_ind must be sorted.")
+            if views_ind[0] < 0 or views_ind[-1] > n_features:
+                raise ValueError("Values in views_ind are not in a correct "
+                                 + "range for the provided data.")
+            view_mode_ = "slices"
+            n_views = views_ind.shape[0]-1
+        else:
+            if views_ind.ndim == 1:
+                if not views_ind.dtype == np.object:
+                    raise ValueError("The format of views_ind is not "
+                                     + "supported.")
+                for ind, val in enumerate(views_ind):
+                    views_ind[ind] = np.array(val)
+                    if not np.issubdtype(views_ind[ind].dtype, np.integer):
+                        raise ValueError("Values in views_ind must be "
+                                         + "integers.")
+                    if views_ind[ind].min() < 0 \
+                            or views_ind[ind].max() >= n_features:
+                        raise ValueError("Values in views_ind are not in a "
+                                         + "correct range for the provided "
+                                         + "data.")
+            elif views_ind.ndim == 2:
+                if not np.issubdtype(views_ind.dtype, np.integer):
+                    raise ValueError("Values in views_ind must be integers.")
+                if views_ind.min() < 0 or views_ind.max() >= n_features:
+                    raise ValueError("Values in views_ind are not in a "
+                                     + "correct range for the provided data.")
+            else:
+                raise ValueError("The format of views_ind is not supported.")
+            view_mode_ = "indices"
+            n_views = views_ind.shape[0]
+        return (views_ind, n_views, view_mode_)
+
+    def _extract_view(self, ind_view):
+        """Extract the view for the given index ind_view from the dataset X."""
+        if self.view_mode_ == "indices":
+            return self[:, self.views_ind[ind_view]]
+        else:
+            return self[:, self.views_ind[ind_view]:self.views_ind[ind_view+1]]
+
+    def _validate_views_ind(self, views_ind, n_features):
+        """Ensure proper format for views_ind and return number of views."""
+        views_ind = np.array(views_ind)
+
+        if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1:
+            if len(views_ind) > 2 and np.any(views_ind[:-1] >= views_ind[1:]):
+                raise ValueError("Values in views_ind must be sorted.")
+            if views_ind[0] < 0 or views_ind[-1] > n_features:
+                raise ValueError("Values in views_ind are not in a correct "
+                                 + "range for the provided data.")
+            self.view_mode_ = "slices"
+            n_views = views_ind.shape[0]-1
+        else:
+            if views_ind.ndim == 1:
+                if not views_ind.dtype == np.object:
+                    raise ValueError("The format of views_ind is not "
+                                     + "supported.")
+                for ind, val in enumerate(views_ind):
+                    views_ind[ind] = np.array(val)
+                    if not np.issubdtype(views_ind[ind].dtype, np.integer):
+                        raise ValueError("Values in views_ind must be "
+                                         + "integers.")
+                    if views_ind[ind].min() < 0 \
+                            or views_ind[ind].max() >= n_features:
+                        raise ValueError("Values in views_ind are not in a "
+                                         + "correct range for the provided "
+                                         + "data.")
+            elif views_ind.ndim == 2:
+                if not np.issubdtype(views_ind.dtype, np.integer):
+                    raise ValueError("Values in views_ind must be integers.")
+                if views_ind.min() < 0 or views_ind.max() >= n_features:
+                    raise ValueError("Values in views_ind are not in a "
+                                     + "correct range for the provided data.")
+            else:
+                raise ValueError("The format of views_ind is not supported.")
+            self.view_mode_ = "indices"
+            n_views = views_ind.shape[0]
+        self.views_ind = views_ind
+        self.n_views = n_views
+        return (views_ind, n_views)
+
+class MultiModalSparseInfo():
+
+    def __init__(self, data, view_ind=None):
+        """Constructor of MultiModalSparseInfo"""
+        shapes_int = []
+        index = 0
+        new_data = np.ndarray([])
+        n_views = data.size
+        thekeys = None
+        # view_ind_self =  None
+        view_mode = 'slices'
+
+        if (sp.issparse(data)) and data.ndim > 1:
+            if view_ind is not None:
+                try:
+                    view_ind = np.asarray(view_ind)
+                except :
+                    raise TypeError("view_ind should be a list or a numpy array")
+            elif view_ind is None:
+                if data.shape[1] > 1:
+                    view_ind = np.array([0, data.shape[1]//2, data.shape[1]])
+                else:
+                    view_ind = np.array([0, data.shape[1]])
+
+            new_data = data
+            # view_ind_self = view_ind
+        view_ind, n_views, view_mode = self._first_validate_views_ind(view_ind,
+                                                                      data.shape[1])
+        if view_ind.ndim == 1 and view_mode.startswith("slices"):
+            shapes_int = [in2 - in1 for in1, in2 in zip(view_ind, view_ind[1:])]
+
+        if data.shape[0] < 1 or data.shape[1] < 1:
+            raise ValueError("input data should not be empty")
+        self.view_mode_ = view_mode
+        self.views_ind = view_ind
+        self.shapes_int = shapes_int
+        self.n_views = n_views
+
+class MultiModalSparseArray(sp.csr_matrix, sp.csc_matrix, MultiModalSparseInfo, MultiModalData):
+    """
+    MultiModalSparseArray inherits from scipy sparse matrices (csr_matrix / csc_matrix)
+
+
+    Parameters
+    ----------
+
+    data : can be
+         - dictionary of multiview array with shape = (n_samples, n_features)  for multi-view
+              for each view.
+           {0: array([[]],
+            1: array([[]],
+            ...}
+         - numpy array like with shape = (n_samples, n_features)  for multi-view
+              for each view.
+            [[[...]],
+             [[...]],
+             ...]
+         - {array like} with (n_samples, nviews * n_features) with 'views_ind' different from 'None'
+            for multi-view input samples.
+
+    views_ind : array-like (default=None)
+                If None, [0, n_features//2, n_features] is constructed (2 views).
+                Parameter specifying how to extract the data views from X:
+
+        - views_ind is a 1-D array of sorted integers, the entries
+          indicate the limits of the slices used to extract the views,
+          where view ``n`` is given by
+          ``X[:, views_ind[n]:views_ind[n+1]]``.
+
+    Attributes
+    ----------
+
+    views_ind : list of view indices (may be None)
+
+    n_views : int, number of views
+
+    shapes_int : list of int, number of features for each view
+
+    keys : names of the keys, when data comes from a dictionary
+
+
+    :Example:
+
+    >>> from multimodal.datasets.base import load_dict
+    >>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path
+    >>> from multimodal.datasets.data_sample import DataSample
+    >>> file = 'input_x_dic.pkl'
+    >>> data = load_dict(get_dataset_path(file))
+
+    """
+
+    def __init__(self, *arg, **kwargs ):
+        """Constructor of MultiModalSparseArray"""
+        if sp.issparse(arg[0]):
+            MultiModalSparseInfo.__init__(self, *arg)
+            if isinstance(arg[0], sp.csr_matrix) :
+                sp.csr_matrix.__init__(self, arg[0])
+            elif isinstance(arg[0], sp.csc_matrix):
+                sp.csc_matrix.__init__(self, arg[0])
+            else:
+                raise TypeError("This sparse format is not supported")
+        else:
+            if isinstance(self, sp.csr_matrix):
+                sp.csr_matrix.__init__(self, *arg, **kwargs)
+            elif isinstance(self, sp.csc_matrix):
+                sp.csc_matrix.__init__(self, *arg, **kwargs)
+
+
+# class MultiModalSparseArray(sp.csr_matrix, sp.csc_matrix, MultiModalData):
+#     """
+#     MultiModalArray inherit from numpy ndarray
+#
+#
+#     Parameters
+#     ----------
+#
+#     data : can be
+#          - dictionary of multiview array with shape = (n_samples, n_features)  for multi-view
+#               for each view.
+#            {0: array([[]],
+#             1: array([[]],
+#             ...}
+#          - numpy array like with shape = (n_samples, n_features)  for multi-view
+#               for each view.
+#            [[[...]],
+#             [[...]],
+#             ...]
+#          - {array like} with (n_samples, nviews * n_features) with 'views_ind' diferent to 'None'
+#             for Multi-view input samples.
+#
+#
+#     views_ind : array-like (default= None ) if None
+#                 [0, n_features//2, n_features]) is constructed (2 views)
+#                 Paramater specifying how to extract the data views from X:
+#
+#         - views_ind is a 1-D array of sorted integers, the entries
+#           indicate the limits of the slices used to extract the views,
+#           where view ``n`` is given by
+#           ``X[:, views_ind[n]:views_ind[n+1]]``.
+# +# Attributes +# ---------- +# +# view_ind : list of views' indice (may be None) +# +# n_views : int number of views +# +# shapes_int: list of int numbers of feature for each views +# +# keys : name of key, where data come from a dictionary +# +# +# :Example: +# +# >>> from multimodal.datasets.base import load_dict +# >>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path +# >>> from multimodal.datasets.data_sample import DataSample +# >>> file = 'input_x_dic.pkl' +# >>> data = load_dict(get_dataset_path(file)) +# >>> print(data.__class__) +# <class 'dict'> +# >>> multiviews = MultiModalArray(data) +# >>> multiviews.shape +# (120, 240) +# >>> multiviews.keys +# dict_keys([0, 1]) +# >>> multiviews.shapes_int +# [120, 120] +# >>> multiviews.n_views +# 2 +# +# +# """ +# +# def __init__(self, data, view_ind=None, shape=None, dtype=None, copy=False): +# """Constructor of Metriclearn_array""" +# shapes_int = [] +# index = 0 +# new_data = np.ndarray([]) +# n_views = 1 +# thekeys = None +# # view_ind_self = None +# view_mode = 'slices' +# if isinstance(data, tuple) and len(data) == 3: +# data_data = data[0] +# indices = data[1] +# indptr = data[2] +# data_shape = shape +# else: +# if shape is None: +# data_shape = data.shape +# if dtype is None: +# dtype = data.dtype +# data_data = data.data +# data_indices = data.indices +# data_indptr = data.indptr +# if (sp.issparse(data)) and data.ndim > 1: +# if view_ind is not None: +# try: +# view_ind = np.asarray(view_ind) +# except : +# raise TypeError("n_views should be list or nparray") +# elif view_ind is None: +# if data.shape[1] > 1: +# view_ind = np.array([0, data.shape[1]//2, data.shape[1]]) +# else: +# view_ind = np.array([0, data.shape[1]]) +# +# new_data = data +# # view_ind_self = view_ind +# view_ind, n_views, view_mode = self._first_validate_views_ind(view_ind, +# data_shape[1]) +# if view_ind.ndim == 1 and view_mode.startswith("slicing"): +# shapes_int = [in2 - in1 for in1, in2 in zip(view_ind, view_ind[1:])] +# if isinstance(data, sp.csr_matrix) : +# sp.csr_matrix.__init__(self, (data_data, data_indices, data_indptr), shape=data_shape) +# #sp.csr_matrix.__init__(self, data) +# elif isinstance(data, sp.csc_matrix): +# sp.csc_matrix.__init__(self, (data_data, data_indices, data_indptr), shape=data_shape) +# #sp.csc_matrix.__init__(self, data) +# else: +# raise TypeError("This sparse format is not supported") +# if self.shape[0] < 1 or self.shape[1] < 1: +# raise ValueError("input data shouldbe not empty") +# self.view_mode_ = view_mode +# self.views_ind = view_ind +# self.shapes_int = shapes_int +# self.n_views = n_views + + +class MultiModalArray(np.ndarray, MultiModalData): """ MultiModalArray inherit from numpy ndarray @@ -98,9 +424,10 @@ class MultiModalArray(ma.MaskedArray, np.ndarray): shapes_int = [] index = 0 new_data = np.ndarray([]) - n_views = len(data) + n_views = 1 thekeys = None - view_ind_self = None + # view_ind_self = None + view_mode = 'slices' if isinstance(data, dict): n_views = len(data) for key, dat_values in data.items(): @@ -110,38 +437,65 @@ class MultiModalArray(ma.MaskedArray, np.ndarray): thekeys = data.keys() if isinstance(data, np.ndarray) and view_ind is None and data.ndim == 1: n_views = data.shape[0] + view_ind = np.empty(n_views+1) + view_ind[0] = 0 for dat_values in data: + try: + dat_values = np.array(dat_values) + except: + raise TypeError("input format is not supported") shapes_int.append(dat_values.shape[1]) + view_ind[index+1] = dat_values.shape[1] + view_ind[index] new_data = 
cls._populate_new_data(index, dat_values, new_data) index += 1 - elif isinstance(data, np.ndarray) and data.ndim > 1: + elif (isinstance(data, np.ndarray) ) and data.ndim > 1: + try: + data = np.asarray(data) + except: + raise TypeError("input format is not supported") + if view_ind is not None: try: view_ind = np.asarray(view_ind) except : raise TypeError("n_views should be list or nparray") - n_views = view_ind.shape[0] - 1 elif view_ind is None: if data.shape[1] > 1: view_ind = np.array([0, data.shape[1]//2, data.shape[1]]) else: view_ind = np.array([0, data.shape[1]]) - view_ind, n_views = cls._first_validate_views_ind(view_ind, - data.shape[1]) - shapes_int = [ in2-in1 for in1, in2 in zip(view_ind, view_ind[1: ])] new_data = data - view_ind_self = view_ind - + else: + try: + new_data = np.asarray(data) + if new_data.ndim == 1: + new_data = new_data.reshape(1, new_data.shape[0]) + view_ind = np.array([0, new_data.shape[1]]) + except Exception as e: + raise ValueError('Reshape your data') + + # view_ind_self = view_ind + # if new_data.shape[1] < 1: + # msg = ("%d feature\(s\) \\(shape=\%s\) while a minimum of \\d* " + # "is required.") % (new_data.shape[1], str(new_data.shape)) + # # "%d feature\(s\) \(shape=\(%d, %d\)\) while a minimum of \d* is required." % (new_data.shape[1], new_data.shape[0], new_data.shape[1]) + # raise ValueError(msg) + view_ind, n_views, view_mode = cls._first_validate_views_ind(view_ind, + new_data.shape[1]) + if view_ind.ndim == 1 and view_mode.startswith("slicing"): + shapes_int = [in2 - in1 for in1, in2 in zip(view_ind, view_ind[1:])] # obj = ma.MaskedArray.__new(new_data) # new_data.view() a.MaskedArray(new_data, mask=new_data.mask).view(cls) # bj = super(Metriclearn_array, cls).__new__(cls, new_data.data, new_data.mask) + if hasattr(new_data, "mask"): obj = ma.masked_array(new_data.data, new_data.mask).view(cls) elif hasattr(new_data, "data") and \ hasattr(new_data, "shape") and len(new_data.shape) > 0: - obj = np.asarray(new_data.data).view(cls) + obj = np.asarray(new_data.data).view(cls) else: - obj = np.recarray.__new__(cls, shape=(), dtype=np.float) - obj.views_ind = view_ind_self + obj = np.recarray.__new__(cls, shape=(0, 0), dtype=np.float) + obj.view_mode_ = view_mode + obj.views_ind = view_ind obj.shapes_int = shapes_int obj.n_views = n_views obj.keys = thekeys @@ -150,47 +504,60 @@ class MultiModalArray(ma.MaskedArray, np.ndarray): @staticmethod def _populate_new_data(index, dat_values, new_data): if index == 0: - if isinstance(dat_values, ma.MaskedArray) or isinstance(dat_values, np.ndarray): + if isinstance(dat_values, ma.MaskedArray) or \ + isinstance(dat_values, np.ndarray) or sp.issparse(dat_values): new_data = dat_values else: - new_data = dat_values.view(ma.MaskedArray) # ma.masked_array(dat_values, mask=ma.nomask) dat_values.view(ma.MaskedArray) #( - new_data.mask = ma.nomask + new_data = dat_values.view(np.ndarray) # ma.masked_array(dat_values, mask=ma.nomask) dat_values.view(ma.MaskedArray) #( + # new_data.mask = ma.nomask else: - if isinstance(dat_values, ma.MaskedArray) or isinstance(dat_values, np.ndarray): + if isinstance(dat_values, np.ndarray): + new_data = np.hstack((new_data, dat_values)) + elif isinstance(dat_values, ma.MaskedArray): new_data = ma.hstack((new_data, dat_values)) + elif sp.issparse(dat_values): + new_data = sp.hstack((new_data, dat_values)) else: - new_data = ma.hstack((new_data, dat_values.view(ma.MaskedArray) ) ) # ma.masked_array(dat_values, mask=ma.nomask + new_data = np.hstack((new_data, 
dat_values.view(np.ndarray) ) ) # ma.masked_array(dat_values, mask=ma.nomask return new_data def __array_finalize__(self, obj): if obj is None: return - super(MultiModalArray, self).__array_finalize__(obj) + # super(MultiModalArray, self).__array_finalize__(obj) self.shapes_int = getattr(obj, 'shapes_int', None) self.n_views = getattr(obj, 'n_views', None) self.keys = getattr(obj, 'keys', None) self.views_ind = getattr(obj, 'views_ind', None) + self.view_mode_ = getattr(obj, 'view_mode_', None) + + def __reduce__(self): + # Get the parent's __reduce__ tuple + pickled_state = super(MultiModalArray, self).__reduce__() + # Create our own tuple to pass to __setstate__ + new_state = pickled_state[2] + (self.__dict__,) + # Return a tuple that replaces the parent's __setstate__ tuple with our own + return (pickled_state[0], pickled_state[1], new_state) + + def __setstate__(self, state): + self.__dict__.update(state[-1]) + super(MultiModalArray, self).__setstate__(state[0:-1]) def get_col(self, view, col): start = np.sum(np.asarray(self.shapes_int[0: view])) - return self.data[start+col, :] + return self[start+col, :] def get_view(self, view): start = int(np.sum(np.asarray(self.shapes_int[0: view]))) stop = int(start + self.shapes_int[view]) - return self.data[:, start:stop] + return self[:, start:stop] + - def _extract_view(self, ind_view): - """Extract the view for the given index ind_view from the dataset X.""" - if self.view_mode_ == "indices": - return self.data[:, self.views_ind[ind_view]] - else: - return self.data[:, self.views_ind[ind_view]:self.views_ind[ind_view+1]] def set_view(self, view, data): start = int(np.sum(np.asarray(self.shapes_int[0: view]))) stop = int(start + self.shapes_int[view]) if stop-start == data.shape[0] and data.shape[1]== self.data.shape[1]: - self.data[:, start:stop] = data + self[:, start:stop] = data else: raise ValueError( "shape of data does not match (%d, %d)" %stop-start %self.data.shape[1]) @@ -220,63 +587,8 @@ class MultiModalArray(ma.MaskedArray, np.ndarray): dico[view] = self.get_view(view) return dico - @staticmethod - def _first_validate_views_ind(views_ind, n_features): - """Ensure proper format for views_ind and return number of views.""" - views_ind = np.array(views_ind) - if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: - if np.any(views_ind[:-1] >= views_ind[1:]): - raise ValueError("Values in views_ind must be sorted.") - if views_ind[0] < 0 or views_ind[-1] > n_features: - raise ValueError("Values in views_ind are not in a correct " - + "range for the provided data.") - n_views = views_ind.shape[0]-1 - else: - raise ValueError("The format of views_ind is not " - + "supported.") - - return (views_ind, n_views) - def _validate_views_ind(self, views_ind, n_features): - """Ensure proper format for views_ind and return number of views.""" - views_ind = np.array(views_ind) - if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: - if np.any(views_ind[:-1] >= views_ind[1:]): - raise ValueError("Values in views_ind must be sorted.") - if views_ind[0] < 0 or views_ind[-1] > n_features: - raise ValueError("Values in views_ind are not in a correct " - + "range for the provided data.") - self.view_mode_ = "slices" - n_views = views_ind.shape[0]-1 - else: - if views_ind.ndim == 1: - if not views_ind.dtype == np.object: - raise ValueError("The format of views_ind is not " - + "supported.") - for ind, val in enumerate(views_ind): - views_ind[ind] = np.array(val) - if not np.issubdtype(views_ind[ind].dtype, np.integer): - 
raise ValueError("Values in views_ind must be " - + "integers.") - if views_ind[ind].min() < 0 \ - or views_ind[ind].max() >= n_features: - raise ValueError("Values in views_ind are not in a " - + "correct range for the provided " - + "data.") - elif views_ind.ndim == 2: - if not np.issubdtype(views_ind.dtype, np.integer): - raise ValueError("Values in views_ind must be integers.") - if views_ind.min() < 0 or views_ind.max() >= n_features: - raise ValueError("Values in views_ind are not in a " - + "correct range for the provided data.") - else: - raise ValueError("The format of views_ind is not supported.") - self.view_mode_ = "indices" - n_views = views_ind.shape[0] - self.views_ind = views_ind - self.n_views = n_views - return (views_ind, n_views) class DataSample(dict): """ @@ -285,9 +597,9 @@ class DataSample(dict): :Example: - >>> from metriclearning.datasets.base import load_dict - >>> from metriclearning.tests.datasets.get_dataset_path import get_dataset_path - >>> from metriclearning.datasets.data_sample import DataSample + >>> from multimodal.datasets.base import load_dict + >>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path + >>> from multimodal.datasets.data_sample import DataSample >>> file = 'input_x_dic.pkl' >>> data = load_dict(get_dataset_path(file)) >>> print(data.__class__) @@ -315,20 +627,20 @@ class DataSample(dict): # The dictionary that contains the sample super(DataSample, self).__init__(kwargs) - self._data = None # Metriclearn_array(np.zeros((0,0))) + self._data = None # Metriclearn_arrayMultiModalArray(np.zeros((0,0))) if data is not None: self._data = MultiModalArray(data) @property def data(self): - """Metriclearn_array""" + """MultiModalArray""" return self._data @data.setter def data(self, data): - if isinstance(data, (MultiModalArray, np.ndarray, ma.MaskedArray, np.generic)): + if isinstance(data, (MultiModalArray, np.ndarray, ma.MaskedArray, np.generic)) or sp.issparse(data): self._data = data else: raise TypeError("sample should be a MultiModalArray or numpy array.") diff --git a/multimodal/kernels/__init__.py b/multimodal/kernels/__init__.py index 8647836..7d48045 100644 --- a/multimodal/kernels/__init__.py +++ b/multimodal/kernels/__init__.py @@ -1 +1 @@ -__all__ = ['MVML'] +__all__ = ['MVML', 'MKernel', 'MVML'] diff --git a/multimodal/kernels/lpMKL.py b/multimodal/kernels/lpMKL.py index bdcfe13..1eca272 100644 --- a/multimodal/kernels/lpMKL.py +++ b/multimodal/kernels/lpMKL.py @@ -5,7 +5,7 @@ from sklearn.utils.multiclass import unique_labels from sklearn.utils.validation import check_X_y from sklearn.utils.validation import check_array from sklearn.utils.validation import check_is_fitted -from metriclearning.mkernel import MKernel +from multimodal.kernels.mkernel import MKernel class MKL(BaseEstimator, ClassifierMixin, MKernel): diff --git a/multimodal/kernels/mvml.py b/multimodal/kernels/mvml.py index d42e9a4..535d974 100644 --- a/multimodal/kernels/mvml.py +++ b/multimodal/kernels/mvml.py @@ -10,8 +10,8 @@ from sklearn.utils.validation import check_X_y from sklearn.utils.validation import check_array from sklearn.metrics.pairwise import check_pairwise_arrays from sklearn.utils.validation import check_is_fitted -from metriclearning.datasets.data_sample import DataSample, Metriclearn_array -from metriclearning.mkernel import MKernel +from multimodal.datasets.data_sample import DataSample, MultiModalArray +from multimodal.kernels.mkernel import MKernel """ Copyright (C) 2018 Riikka Huusari @@ -196,7 +196,9 @@ class MVML(MKernel, 
BaseEstimator, ClassifierMixin): # Return the classifier self.learn_mvml(learn_A=self.learn_A, learn_w=self.learn_w, n_loops=self.n_loops) if self.warning_message: - print("warning appears during fit process", self.warning_message) + import logging + logging.warning("warning appears during fit process" + str(self.warning_message)) + # print("warning appears during fit process", self.warning_message) return self def learn_mvml(self, learn_A=1, learn_w=0, n_loops=6): diff --git a/multimodal/tests/test.py b/multimodal/tests/test.py new file mode 100644 index 0000000..9a68d84 --- /dev/null +++ b/multimodal/tests/test.py @@ -0,0 +1,224 @@ + +from abc import ABCMeta +import numpy as np +import numpy.ma as ma +import scipy.sparse as sp + +from multimodal.boosting.mumbo import MumboClassifier + +class MultiModalData(metaclass=ABCMeta): + + @staticmethod + def _first_validate_views_ind(views_ind, n_features): + """Ensure proper format for views_ind and return number of views.""" + views_ind = np.array(views_ind) + if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: + if np.any(views_ind[:-1] >= views_ind[1:]): + raise ValueError("Values in views_ind must be sorted.") + if views_ind[0] < 0 or views_ind[-1] > n_features: + raise ValueError("Values in views_ind are not in a correct " + + "range for the provided data.") + view_mode_ = "slices" + n_views = views_ind.shape[0]-1 + else: + if views_ind.ndim == 1: + if not views_ind.dtype == np.object: + raise ValueError("The format of views_ind is not " + + "supported.") + for ind, val in enumerate(views_ind): + views_ind[ind] = np.array(val) + if not np.issubdtype(views_ind[ind].dtype, np.integer): + raise ValueError("Values in views_ind must be " + + "integers.") + if views_ind[ind].min() < 0 \ + or views_ind[ind].max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided " + + "data.") + elif views_ind.ndim == 2: + if not np.issubdtype(views_ind.dtype, np.integer): + raise ValueError("Values in views_ind must be integers.") + if views_ind.min() < 0 or views_ind.max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided data.") + else: + raise ValueError("The format of views_ind is not supported.") + view_mode_ = "indices" + n_views = views_ind.shape[0] + return (views_ind, n_views, view_mode_) + + def _extract_view(self, ind_view): + """Extract the view for the given index ind_view from the dataset X.""" + if self.view_mode_ == "indices": + return self[:, self.views_ind[ind_view]] + else: + return self[:, self.views_ind[ind_view]:self.views_ind[ind_view+1]] + + def _validate_views_ind(self, views_ind, n_features): + """Ensure proper format for views_ind and return number of views.""" + views_ind = np.array(views_ind) + if np.issubdtype(views_ind.dtype, np.integer) and views_ind.ndim == 1: + if np.any(views_ind[:-1] >= views_ind[1:]): + raise ValueError("Values in views_ind must be sorted.") + if views_ind[0] < 0 or views_ind[-1] > n_features: + raise ValueError("Values in views_ind are not in a correct " + + "range for the provided data.") + self.view_mode_ = "slices" + n_views = views_ind.shape[0]-1 + else: + if views_ind.ndim == 1: + if not views_ind.dtype == np.object: + raise ValueError("The format of views_ind is not " + + "supported.") + for ind, val in enumerate(views_ind): + views_ind[ind] = np.array(val) + if not np.issubdtype(views_ind[ind].dtype, np.integer): + raise ValueError("Values in views_ind must be " + + 
"integers.") + if views_ind[ind].min() < 0 \ + or views_ind[ind].max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided " + + "data.") + elif views_ind.ndim == 2: + if not np.issubdtype(views_ind.dtype, np.integer): + raise ValueError("Values in views_ind must be integers.") + if views_ind.min() < 0 or views_ind.max() >= n_features: + raise ValueError("Values in views_ind are not in a " + + "correct range for the provided data.") + else: + raise ValueError("The format of views_ind is not supported.") + self.view_mode_ = "indices" + n_views = views_ind.shape[0] + self.views_ind = views_ind + self.n_views = n_views + return (views_ind, n_views) + +class MultiModalSparseInfo(): + + def __init__(self, data, view_ind=None): + """Constructor of Metriclearn_array""" + shapes_int = [] + index = 0 + new_data = np.ndarray([]) + n_views = data.size + thekeys = None + # view_ind_self = None + view_mode = 'slices' + + if (sp.issparse(data)) and data.ndim > 1: + if view_ind is not None: + try: + view_ind = np.asarray(view_ind) + except : + raise TypeError("n_views should be list or nparray") + elif view_ind is None: + if data.shape[1] > 1: + view_ind = np.array([0, data.shape[1]//2, data.shape[1]]) + else: + view_ind = np.array([0, data.shape[1]]) + + new_data = data + # view_ind_self = view_ind + view_ind, n_views, view_mode = self._first_validate_views_ind(view_ind, + data.shape[1]) + if view_ind.ndim == 1 and view_mode.startswith("slicing"): + shapes_int = [in2 - in1 for in1, in2 in zip(view_ind, view_ind[1:])] + + if data.shape[0] < 1 or data.shape[1] < 1: + raise ValueError("input data shouldbe not empty") + self.view_mode_ = view_mode + self.views_ind = view_ind + self.shapes_int = shapes_int + self.n_views = n_views + + +class MultiModalSparseArray(sp.csr_matrix, sp.csc_matrix, MultiModalSparseInfo, MultiModalData): + """ + MultiModalArray inherit from numpy ndarray + + + Parameters + ---------- + + data : can be + - dictionary of multiview array with shape = (n_samples, n_features) for multi-view + for each view. + {0: array([[]], + 1: array([[]], + ...} + - numpy array like with shape = (n_samples, n_features) for multi-view + for each view. + [[[...]], + [[...]], + ...] + - {array like} with (n_samples, nviews * n_features) with 'views_ind' diferent to 'None' + for Multi-view input samples. + + + + + views_ind : array-like (default= None ) if None + [0, n_features//2, n_features]) is constructed (2 views) + Paramater specifying how to extract the data views from X: + + - views_ind is a 1-D array of sorted integers, the entries + indicate the limits of the slices used to extract the views, + where view ``n`` is given by + ``X[:, views_ind[n]:views_ind[n+1]]``. 
+ + Attributes + ---------- + + view_ind : list of views' indice (may be None) + + n_views : int number of views + + shapes_int: list of int numbers of feature for each views + + keys : name of key, where data come from a dictionary + + + :Example: + + >>> from multimodal.datasets.base import load_dict + >>> from multimodal.tests.datasets.get_dataset_path import get_dataset_path + >>> from multimodal.datasets.data_sample import DataSample + >>> file = 'input_x_dic.pkl' + >>> data = load_dict(get_dataset_path(file)) + + """ + + def __init__(self, *arg, **kwargs ): + """Constructor of Metriclearn_array""" + if sp.issparse(arg[0]): + MultiModalSparseInfo.__init__(self, *arg) + if isinstance(arg[0], sp.csr_matrix) : + sp.csr_matrix.__init__(self, arg[0]) + elif isinstance(arg[0], sp.csc_matrix): + sp.csc_matrix.__init__(self, arg[0]) + else: + raise TypeError("This sparse format is not supported") + else: + if isinstance(self,sp.csr_matrix): + sp.csr_matrix.__init__(self, *arg, **kwargs) + elif isinstance(self, sp.csc_matrix): + sp.csc_matrix.__init__(self, *arg, **kwargs) + + + + +if __name__ == '__main__': + rng = np.random.RandomState(0) + X = rng.rand(40, 10) + X[X < .8] = 0 + X_csr = sp.csr_matrix(X) + y = (4 * rng.rand(40)).astype(np.int) + X_ = MultiModalSparseArray(X_csr) + print(X_.shape) + print(X_[:,0:1]) + + X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1]]) + y = [1, 1, 1, 2, 2, 2] + clf = MumboClassifier() + clf.fit(X, y) \ No newline at end of file diff --git a/multimodal/tests/test_cumbo.py b/multimodal/tests/test_cumbo.py index d8a9e00..153bc94 100644 --- a/multimodal/tests/test_cumbo.py +++ b/multimodal/tests/test_cumbo.py @@ -17,7 +17,7 @@ from sklearn import datasets from multimodal.boosting.cumbo import MuCumboClassifier from multimodal.tests.data.get_dataset_path import get_dataset_path - +from multimodal.datasets.data_sample import MultiModalArray class TestMuCumboClassifier(unittest.TestCase): @@ -909,11 +909,11 @@ class TestMuCumboClassifier(unittest.TestCase): (self.iris.data, target_two_classes, self.iris.views_ind), (self.iris.data, target_two_classes, np.array([[0, 2], [1, 3]])), ) + # for X, y, views_ind in data: clf = MuCumboClassifier(n_estimators=n_estimators, random_state=seed) clf.fit(X, y, views_ind) - staged_dec_func = [dec_f for dec_f in clf.staged_decision_function(X)] staged_predict = [predict for predict in clf.staged_predict(X)] staged_score = [score for score in clf.staged_score(X, y)] diff --git a/multimodal/tests/test_data_sample.py b/multimodal/tests/test_data_sample.py index 43e3540..04c6b49 100644 --- a/multimodal/tests/test_data_sample.py +++ b/multimodal/tests/test_data_sample.py @@ -1,9 +1,9 @@ import unittest import numpy as np -from metriclearning.datasets.base import load_dict -from metriclearning.tests.datasets.get_dataset_path import get_dataset_path -from metriclearning.datasets.data_sample import Metriclearn_array +from multimodal.datasets.base import load_dict +from multimodal.tests.datasets.get_dataset_path import get_dataset_path +from multimodal.datasets.data_sample import MultiModalArray import pickle class UnitaryTest(unittest.TestCase): @@ -29,12 +29,12 @@ class UnitaryTest(unittest.TestCase): def testGet_view(self): - a = Metriclearn_array(self.kernel_dict) + a = MultiModalArray(self.kernel_dict) np.testing.assert_almost_equal(a.get_view(0), self.kernel_dict[0], 8) np.testing.assert_almost_equal(a.get_view(1), self.kernel_dict[1], 8) def test_init_Metriclearn_array(self): - a = Metriclearn_array(self.kernel_dict) + a 
diff --git a/multimodal/tests/test_cumbo.py b/multimodal/tests/test_cumbo.py
index d8a9e00..153bc94 100644
--- a/multimodal/tests/test_cumbo.py
+++ b/multimodal/tests/test_cumbo.py
@@ -17,7 +17,7 @@ from sklearn import datasets
 
 from multimodal.boosting.cumbo import MuCumboClassifier
 from multimodal.tests.data.get_dataset_path import get_dataset_path
-
+from multimodal.datasets.data_sample import MultiModalArray
 
 
 class TestMuCumboClassifier(unittest.TestCase):
@@ -909,11 +909,11 @@ class TestMuCumboClassifier(unittest.TestCase):
             (self.iris.data, target_two_classes, self.iris.views_ind),
             (self.iris.data, target_two_classes, np.array([[0, 2], [1, 3]])),
             )
+        #
         for X, y, views_ind in data:
             clf = MuCumboClassifier(n_estimators=n_estimators,
                                     random_state=seed)
             clf.fit(X, y, views_ind)
-
             staged_dec_func = [dec_f for dec_f in clf.staged_decision_function(X)]
             staged_predict = [predict for predict in clf.staged_predict(X)]
             staged_score = [score for score in clf.staged_score(X, y)]
diff --git a/multimodal/tests/test_data_sample.py b/multimodal/tests/test_data_sample.py
index 43e3540..04c6b49 100644
--- a/multimodal/tests/test_data_sample.py
+++ b/multimodal/tests/test_data_sample.py
@@ -1,9 +1,9 @@
 import unittest
 import numpy as np
 
-from metriclearning.datasets.base import load_dict
-from metriclearning.tests.datasets.get_dataset_path import get_dataset_path
-from metriclearning.datasets.data_sample import Metriclearn_array
+from multimodal.datasets.base import load_dict
+from multimodal.tests.datasets.get_dataset_path import get_dataset_path
+from multimodal.datasets.data_sample import MultiModalArray
 import pickle
 
 class UnitaryTest(unittest.TestCase):
@@ -29,12 +29,12 @@ class UnitaryTest(unittest.TestCase):
 
     def testGet_view(self):
-        a = Metriclearn_array(self.kernel_dict)
+        a = MultiModalArray(self.kernel_dict)
         np.testing.assert_almost_equal(a.get_view(0), self.kernel_dict[0], 8)
         np.testing.assert_almost_equal(a.get_view(1), self.kernel_dict[1], 8)
 
     def test_init_Metriclearn_array(self):
-        a = Metriclearn_array(self.kernel_dict)
+        a = MultiModalArray(self.kernel_dict)
         self.assertEqual(a.shape, (120, 240))
         self.assertEqual(a.shapes_int, [120, 120])
         self.assertEqual(a.n_views, 2)
 
@@ -42,9 +42,9 @@ class UnitaryTest(unittest.TestCase):
         self.assertEqual(a.keys, dict_key.keys())
 
     def test_init_Array(self):
-        a = Metriclearn_array(self.kernel_dict)
+        a = MultiModalArray(self.kernel_dict)
         array_x = a.data
-        b = Metriclearn_array(a)
+        b = MultiModalArray(a)
         np.testing.assert_equal(b.views_ind, np.array([0, 120, 240]))
 
 
diff --git a/multimodal/tests/test_mkl.py b/multimodal/tests/test_mkl.py
index fffbac5..16008f9 100644
--- a/multimodal/tests/test_mkl.py
+++ b/multimodal/tests/test_mkl.py
@@ -3,9 +3,9 @@ import unittest
 import numpy as np
 from sklearn.metrics.pairwise import rbf_kernel
 
-from metriclearning.tests.datasets.get_dataset_path import get_dataset_path
-from metriclearning.lpMKL import MKL
-from metriclearning.datasets.data_sample import Metriclearn_array
+from multimodal.tests.datasets.get_dataset_path import get_dataset_path
+from multimodal.kernels.lpMKL import MKL
+from multimodal.datasets.data_sample import MultiModalArray
 import pickle
 from sklearn.exceptions import NotFittedError
 
@@ -70,7 +70,7 @@ class MKLTest(unittest.TestCase):
         #######################################################
         # mvml = MVML.fit(self.kernel_dict, self.y)
         w_expected = np.array([[0.5], [0.5]])
-        x_metricl = Metriclearn_array(self.kernel_dict)
+        x_metricl = MultiModalArray(self.kernel_dict)
         mkl2 = MKL(lmbda=3, m_param = 0.3, kernel=['rbf'], kernel_params=[{'gamma':50}],
                    use_approx = True,
                    precision = 1E0, n_loops = 50)
@@ -83,7 +83,7 @@ class MKLTest(unittest.TestCase):
         #######################################################
         # mvml = MVML.fit(self.kernel_dict, self.y)
         w_expected = np.array([[0.5], [0.5]])
-        x_metricl = Metriclearn_array(self.kernel_dict)
+        x_metricl = MultiModalArray(self.kernel_dict)
         mkl2 = MKL(lmbda=3, m_param = 0.3, kernel="precomputed",
                    use_approx = True,
                    precision = 1E-9, n_loops = 600)
@@ -97,7 +97,7 @@ class MKLTest(unittest.TestCase):
         mkl.predict(self.test_kernel_dict)
 
     def testPredictMVML_witoutFit(self):
-        x_metric = Metriclearn_array(self.kernel_dict)
+        x_metric = MultiModalArray(self.kernel_dict)
         mkl = MKL(lmbda=3, m_param = 0.3, kernel=['rbf'], kernel_params=[{'gamma':50}],
                   use_approx = True,
                   precision = 1E-9, n_loops = 50)
diff --git a/multimodal/tests/test_mumbo.py b/multimodal/tests/test_mumbo.py
index f5b3924..978244f 100644
--- a/multimodal/tests/test_mumbo.py
+++ b/multimodal/tests/test_mumbo.py
@@ -35,7 +35,7 @@ from sklearn.ensemble import RandomForestClassifier
 from sklearn.cluster import KMeans
 from sklearn.tree import DecisionTreeClassifier
 from sklearn import datasets
-from multimodalboost.mumbo import MumboClassifier
+from multimodal.boosting.mumbo import MumboClassifier
 
 
 class TestMuCumboClassifier(unittest.TestCase):
@@ -47,6 +47,15 @@ class TestMuCumboClassifier(unittest.TestCase):
         iris.views_ind = np.array([0, 2, 4])
         clf.iris = iris
 
+    def test_sparse(self):
+        rng = np.random.RandomState(0)
+        X = rng.rand(40, 10)
+        X[X < .8] = 0
+        X_csr = csr_matrix(X)
+        clf = MumboClassifier()
+        y = (4 * rng.rand(40)).astype(np.int)
+        clf.fit(X_csr, y)
+
     def test_init_var(self):
         n_classes = 3
 
@@ -318,7 +327,7 @@ class TestMuCumboClassifier(unittest.TestCase):
         np.random.seed(seed)
 
         n_estimators = 10
-
+        #print("iris views ind", self.iris.views_ind)
         clf = MumboClassifier(n_estimators=n_estimators, best_view_mode='edge')
         clf.fit(self.iris.data, self.iris.target, self.iris.views_ind)
         score = clf.score(self.iris.data, self.iris.target)
@@ -347,7 +356,7 @@ class TestMuCumboClassifier(unittest.TestCase):
         expected_views_ind = np.array([0, 1, 3])
         clf = MumboClassifier()
         clf.fit(X, y)
-        np.testing.assert_equal(clf.views_ind_, expected_views_ind)
+        np.testing.assert_equal(clf.X_.views_ind, expected_views_ind)
 
         # Check that classes labels can be integers or strings and can be stored
         # into any kind of sequence
@@ -515,6 +524,7 @@ class TestMuCumboClassifier(unittest.TestCase):
         np.testing.assert_equal(clf.predict(X), y)
         np.testing.assert_equal(clf.predict(np.array([[1., 1.], [-1., -1.]])),
                                 np.array([0, 1]))
+        X = clf._global_X_transform(X, clf.X_.views_ind)
         self.assertEqual(clf.decision_function(X).shape, y.shape)
 
         views_ind = np.array([[1, 0]])
@@ -695,6 +705,11 @@ class TestMuCumboClassifier(unittest.TestCase):
 
     def test_classifier(self):
+        X_zero_features = np.empty(0).reshape(3, 0)
+        y = np.array([1, 0, 1])
+        # e = MumboClassifier()
+        # e.fit(X_zero_features, y)
+        # print(e.predict(X_zero_features))
         return check_estimator(MumboClassifier)
 
 
     def test_iris(self):
@@ -742,7 +757,6 @@ class TestMuCumboClassifier(unittest.TestCase):
         for X, y, views_ind in data:
             clf = MumboClassifier(n_estimators=n_estimators, random_state=seed)
             clf.fit(X, y, views_ind)
-
             staged_dec_func = [dec_f for dec_f in clf.staged_decision_function(X)]
             staged_predict = [predict for predict in clf.staged_predict(X)]
             staged_score = [score for score in clf.staged_score(X, y)]
@@ -782,7 +796,6 @@ class TestMuCumboClassifier(unittest.TestCase):
         clf.fit(self.iris.data, self.iris.target, self.iris.views_ind)
         score = clf.score(self.iris.data, self.iris.target)
         dump = pickle.dumps(clf)
-
         clf_loaded = pickle.loads(dump)
         self.assertEqual(type(clf_loaded), clf.__class__)
         score_loaded = clf_loaded.score(self.iris.data, self.iris.target)
@@ -828,11 +841,9 @@ class TestMuCumboClassifier(unittest.TestCase):
         X_dense = self.iris.data
         y = self.iris.target
 
-        for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
-                              dok_matrix]:
+        for sparse_format in [csc_matrix, csr_matrix]:  #, lil_matrix, coo_matrix, dok_matrix]:
             for views_ind in (self.iris.views_ind, np.array([[0, 2], [1, 3]])):
                 X_sparse = sparse_format(X_dense)
-
                 clf_sparse = MumboClassifier(
                     base_estimator=CustomSVC(),
                     random_state=seed,
@@ -872,9 +883,9 @@ class TestMuCumboClassifier(unittest.TestCase):
                 # Check that sparsity of data is maintained during training
                 types = [clf.data_type_ for clf in clf_sparse.estimators_]
                 if sparse_format == csc_matrix:
-                    self.assertTrue(all([type_ == csc_matrix for type_ in types]))
+                    self.assertTrue(all([issubclass(type_, csc_matrix) for type_ in types]))
                 else:
-                    self.assertTrue(all([type_ == csr_matrix for type_ in types]))
+                    self.assertTrue(all([issubclass(type_, csr_matrix) for type_ in types]))
 
 
 if __name__ == '__main__':
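The issubclass assertions above follow from the new container design: MultiModalSparseArray derives from the scipy sparse classes (its constructor calls sp.csr_matrix.__init__ / sp.csc_matrix.__init__ on self), so the type recorded in data_type_ is expected to be a subclass of csr_matrix or csc_matrix rather than the scipy class itself. A small sketch with a hypothetical stand-in class shows why the old equality check no longer holds:

import scipy.sparse as sp

class CsrBackedView(sp.csr_matrix):
    """Hypothetical stand-in for a csr-backed multi-view container."""
    pass

# Wrap an existing (empty) csr matrix in the subclass, as the real container does.
data_type = type(CsrBackedView(sp.csr_matrix((3, 3))))
print(data_type == sp.csr_matrix)            # False: it is the subclass, not csr_matrix
print(issubclass(data_type, sp.csr_matrix))  # True: the check the updated test uses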
diff --git a/multimodal/tests/test_mvml.py b/multimodal/tests/test_mvml.py
index 3de8a35..4627c5d 100644
--- a/multimodal/tests/test_mvml.py
+++ b/multimodal/tests/test_mvml.py
@@ -6,9 +6,9 @@ import unittest
 import numpy as np
 from sklearn.exceptions import NotFittedError
 
-from metriclearning.datasets.data_sample import Metriclearn_array
-from metriclearning.mvml import MVML
-from metriclearning.tests.datasets.get_dataset_path import get_dataset_path
+from multimodal.datasets.data_sample import MultiModalArray
+from multimodal.kernels.mvml import MVML
+from multimodal.tests.datasets.get_dataset_path import get_dataset_path
 
 
 class MVMLTest(unittest.TestCase):
@@ -90,7 +90,7 @@ class MVMLTest(unittest.TestCase):
         #######################################################
         # mvml = MVML.fit(self.kernel_dict, self.y)
         w_expected = np.array([[0.5], [0.5]])
-        x_metricl = Metriclearn_array(self.kernel_dict)
+        x_metricl = MultiModalArray(self.kernel_dict)
         mvml2 = MVML(lmbda=0.1, eta=1, nystrom_param=1.0)
         mvml2.fit(x_metricl, y=self.y, views_ind=None)
         self.assertEqual(mvml2.A.shape, (240, 240))
@@ -105,7 +105,7 @@ class MVMLTest(unittest.TestCase):
         #######################################################
         # mvml = MVML.fit(self.kernel_dict, self.y)
         w_expected = np.array([[0.5], [0.5]])
-        x_metricl = Metriclearn_array(self.kernel_dict)
+        x_metricl = MultiModalArray(self.kernel_dict)
         mvml2 = MVML(lmbda=0.1, eta=1, nystrom_param=1.0, learn_A=4)
         mvml2.fit(x_metricl, y=self.y, views_ind=None)
         self.assertEqual(mvml2.A.shape, (240, 240))
@@ -120,7 +120,7 @@ class MVMLTest(unittest.TestCase):
         #######################################################
         # mvml = MVML.fit(self.kernel_dict, self.y)
         w_expected = np.array([[0.5], [0.5]])
-        x_metricl = Metriclearn_array(self.kernel_dict)
+        x_metricl = MultiModalArray(self.kernel_dict)
         mvml2 = MVML(lmbda=0.1, eta=1, nystrom_param=1.0, learn_A=3)
         mvml2.fit(x_metricl, y=self.y, views_ind=None)
         self.assertEqual(mvml2.A.shape, (240, 240))
@@ -134,7 +134,7 @@ class MVMLTest(unittest.TestCase):
         # task with Metric array
         #######################################################
         w_expected = np.array([0.2, 0.1])  # [0.94836083 , 0.94175933] [ 0.7182, 0.7388]
-        x_metricl = Metriclearn_array(self.kernel_dict)
+        x_metricl = MultiModalArray(self.kernel_dict)
         mvml2 = MVML(lmbda=0.1, eta=1, nystrom_param=0.6,
                      learn_A=2, learn_w=1)
         mvml2.fit(x_metricl, y=self.y, views_ind=None)
@@ -149,7 +149,7 @@ class MVMLTest(unittest.TestCase):
         # task with Metric array
         #######################################################
         w_expected = np.array([1.3, 1.4])  # [0.94836083 , 0.94175933] [ 0.7182, 0.7388]
-        x_metricl = Metriclearn_array(self.kernel_dict)
+        x_metricl = MultiModalArray(self.kernel_dict)
         mvml2 = MVML(lmbda=0.1, eta=1, nystrom_param=0.6,
                      learn_A=1, learn_w=1)
         mvml2.fit(x_metricl, y=self.y, views_ind=None)
@@ -164,7 +164,7 @@ class MVMLTest(unittest.TestCase):
         # task with nparray 2d
         #######################################################
         w_expected = np.array([[0.5], [0.5]])
-        x_metricl = Metriclearn_array(self.kernel_dict)
+        x_metricl = MultiModalArray(self.kernel_dict)
         x_array = np.asarray(x_metricl)
         mvml3 = MVML(lmbda=0.1, eta=1, nystrom_param=1.0)
         mvml3.fit(x_array, y=self.y, views_ind=[0, 120, 240])
-- 
GitLab
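As a recap of the fit patterns the MVML tests above exercise, a short sketch, with hypothetical rbf kernels standing in for the pickled kernel fixtures, reads:

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
from multimodal.datasets.data_sample import MultiModalArray
from multimodal.kernels.mvml import MVML

# Two precomputed kernels over the same 120 samples (stand-in for the test fixture).
rng = np.random.RandomState(0)
samples = rng.rand(120, 10)
kernel_dict = {0: rbf_kernel(samples, gamma=50),
               1: rbf_kernel(samples, gamma=0.5)}
y = np.where(rng.rand(120) > 0.5, 1, -1)

# Fit on a MultiModalArray: the view boundaries travel with the container.
x_multi = MultiModalArray(kernel_dict)
mvml = MVML(lmbda=0.1, eta=1, nystrom_param=1.0)
mvml.fit(x_multi, y=y, views_ind=None)

# Or fit on a plain 2-D array and pass the boundaries explicitly.
x_array = np.asarray(x_multi)
mvml2 = MVML(lmbda=0.1, eta=1, nystrom_param=1.0)
mvml2.fit(x_array, y=y, views_ind=[0, 120, 240])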