Combined random search for mono and multiview

35327b99 · Baptiste Bauvin · d3f1d454 · 35327b99 · 35327b99 · 35327b99
Commit 35327b99 authored Sep 25, 2019 by Baptiste Bauvin
--- a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py
+++ b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py
@@ -17,6 +17,7 @@ from .analyze_result import execute
 # Import own modules
 from .. import monoview_classifiers
 from ..utils.dataset import getValue, extractSubset
+from ..utils import hyper_parameter_search
 # Author-Info
 __author__ = "Nikolas Huelsmann, Baptiste BAUVIN"
@@ -182,16 +183,16 @@ def getHPs(classifierModule, hyperParamSearch, nIter, CL_type, X_train, y_train,
        logging.debug(
            "Start:\t " + hyperParamSearch + " best settings with " + str(
                nIter) + " iterations for " + CL_type)
-        classifierHPSearch = getattr(monoview_utils, hyperParamSearch)
+        classifierHPSearch = getattr(hyper_parameter_search, hyperParamSearch)
-        clKWARGS, testFoldsPreds = classifierHPSearch(X_train, y_train,
+        clKWARGS, testFoldsPreds = classifierHPSearch(X_train, y_train, "monoview",
                                                      randomState,
                                                      outputFileName,
                                                      classifierModule, CL_type,
-                                                      KFolds=KFolds,
+                                                      folds=KFolds,
-                                                      nbCores=nbCores,
+                                                      nb_cores=nbCores,
                                                      metric=metrics[0],
-                                                      nIter=nIter,
+                                                      n_iter=nIter,
-                                                      classifier_KWARGS=kwargs[
+                                                      classifier_kwargs=kwargs[
                                                          CL_type + "KWARGS"])
        logging.debug("Done:\t " + hyperParamSearch + " best settings")
    else:

--- a/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py
+++ b/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py
@@ -17,50 +17,6 @@ __status__ = "Prototype"  # Production, Development, Prototype
 # __date__ = 2016 - 03 - 25
-def randomizedSearch(X_train, y_train, randomState, outputFileName,
-                     classifierModule, CL_type, KFolds=4, nbCores=1,
-                     metric=["accuracy_score", None], nIter=30,
-                     classifier_KWARGS=None):
-    estimator = getattr(classifierModule, CL_type)(randomState,
-                                                   **classifier_KWARGS)
-    params_dict = estimator.genDistribs()
-    if params_dict:
-        metricModule = getattr(metrics, metric[0])
-        if metric[1] is not None:
-            metricKWARGS = dict((index, metricConfig) for index, metricConfig in
-                                enumerate(metric[1]))
-        else:
-            metricKWARGS = {}
-        scorer = metricModule.get_scorer(**metricKWARGS)
-        nb_possible_combinations = compute_possible_combinations(params_dict)
-        min_list = np.array(
-            [min(nb_possible_combination, nIter) for nb_possible_combination in
-             nb_possible_combinations])
-        randomSearch = RandomizedSearchCV(estimator,
-                                          n_iter=int(np.sum(min_list)),
-                                          param_distributions=params_dict,
-                                          refit=True,
-                                          n_jobs=nbCores, scoring=scorer,
-                                          cv=KFolds, random_state=randomState)
-        detector = randomSearch.fit(X_train, y_train)
-        bestParams = dict((key, value) for key, value in
-                          estimator.genBestParams(detector).items() if
-                          key is not "random_state")
-        scoresArray = detector.cv_results_['mean_test_score']
-        params = estimator.genParamsFromDetector(detector)
-        hyper_parameter_search.genHeatMaps(params, scoresArray, outputFileName)
-        best_estimator = detector.best_estimator_
-    else:
-        best_estimator = estimator
-        bestParams = {}
-    testFoldsPreds = genTestFoldsPreds(X_train, y_train, KFolds, best_estimator)
-    return bestParams, testFoldsPreds
 def change_label_to_minus(y):
    minus_y = np.copy(y)
    minus_y[np.where(y == 0)] = -1

--- a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py
+++ b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py
@@ -113,11 +113,10 @@ def ExecMultiview(directory, DATASET, name, classificationIndices, KFolds,
    logging.debug("Done:\t Getting train/test split")
    logging.debug("Start:\t Getting classifiers modules")
-    classifierPackage = getattr(multiview_classifiers,
+    # classifierPackage = getattr(multiview_classifiers,
-                                CL_type)  # Permet d'appeler un module avec une string
+    #                             CL_type)  # Permet d'appeler un module avec une string
-    classifierModule = getattr(classifierPackage, CL_type + "Module")
+    classifierModule = getattr(multiview_classifiers, CL_type)
    classifierClass = getattr(classifierModule, CL_type + "Class")
-    analysisModule = getattr(classifierPackage, "analyzeResults")
    logging.debug("Done:\t Getting classifiers modules")
    logging.debug("Start:\t Optimizing hyperparameters")

--- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/utils.py
+++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/utils.py
@@ -12,6 +12,10 @@ from sklearn.utils.validation import check_is_fitted
 from ... import metrics
+def get_names(classed_list):
+    """Return a numpy array holding the class name of each object of
+    ``classed_list``, in iteration order."""
+    class_names = []
+    for instance in classed_list:
+        class_names.append(instance.__class__.__name__)
+    return np.array(class_names)
 class BaseMultiviewClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, random_state):

--- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py
+++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py
 import numpy as np
+import pkgutil
 from ..utils.dataset import getV
 from .additions.utils import BaseMultiviewClassifier, get_train_views_indices
@@ -7,22 +8,32 @@ from .. import monoview_classifiers
 class WeightedLinearEarlyFusion(BaseMultiviewClassifier):
-    def __init__(self, view_weights=None, monoview_classifier="decision_tree", monoview_classifier_config=None, random_state=42):
+    def __init__(self, random_state=None, view_weights=None, monoview_classifier="decision_tree", monoview_classifier_config={}):
        super(WeightedLinearEarlyFusion, self).__init__(random_state=random_state)
-        self.view_weights = np.array(view_weights)
+        self.view_weights = view_weights
-        if type(monoview_classifier) == str:
+        if isinstance(monoview_classifier, str):
            monoview_classifier_module = getattr(monoview_classifiers,
                                               monoview_classifier)
-            monoview_classifier_class = getattr(monoview_classifier_module, monoview_classifier_module.classifier_class_name)
+            monoview_classifier_class = getattr(monoview_classifier_module,
+                                                monoview_classifier_module.classifier_class_name)
            self.monoview_classifier = monoview_classifier_class(**monoview_classifier_config)
        else:
            self.monoview_classifier = monoview_classifier
+        self.param_names = ["monoview_classifier",]
+        classifier_classes = []
+        for name in dir(monoview_classifiers):
+            module = getattr(monoview_classifiers, name)
+            if name == "decision_tree":
+                classifier_class = getattr(module, module.classifier_class_name)()
+                classifier_classes.append(classifier_class)
+        self.distribs = [classifier_classes]
+        self.classed_params = ["monoview_classifier"]
    def fit(self, X, y, train_indices=None, view_indices=None):
        train_indices, X = self.transform_data_to_monoview(X, train_indices, view_indices)
        self.monoview_classifier.fit(X, y[train_indices])
-    def predict(self, X, predict_indices, view_indices):
+    def predict(self, X, predict_indices=None, view_indices=None):
        _, X = self.transform_data_to_monoview(X, predict_indices, view_indices)
        predicted_labels = self.monoview_classifier.predict(X)
        return predicted_labels
@@ -35,6 +46,8 @@ class WeightedLinearEarlyFusion(BaseMultiviewClassifier):
                                                                        view_indices)
        if self.view_weights is None:
            self.view_weights = np.ones(len(self.view_indices), dtype=float)
+        else:
+            self.view_weights = np.array(self.view_weights)
        self.view_weights /= float(np.sum(self.view_weights))
        X = self.hdf5_to_monoview(dataset, example_indices, self.view_indices)

--- a/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py
+++ b/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py
@@ -3,6 +3,9 @@ import sys
 import matplotlib.pyplot as plt
 import numpy as np
+from scipy.stats import  randint
+from sklearn.model_selection import RandomizedSearchCV
 from .. import metrics
@@ -30,6 +33,185 @@ def gridSearch(dataset, classifierName, viewsIndices=None, kFolds=None, nIter=1,
    """Used to perfom gridsearch on the classifiers"""
    pass
+class CustomRandint:
+    """Distribution returning an integer between ``low`` and ``high - 1``.
+    It can be used with a ``multiplier`` argument to perform more complex
+    generation: with ``multiplier="e-"``, ``rvs`` returns ``10 ** -k`` where
+    ``k`` is the drawn integer; any other value returns the integer itself."""
+    def __init__(self, low=0, high=0, multiplier=""):
+        # Frozen scipy discrete uniform distribution over [low, high - 1].
+        self.randint = randint(low, high)
+        self.multiplier = multiplier
+    def rvs(self, random_state=None):
+        """Draw one value, forwarding ``random_state`` to scipy."""
+        randinteger = self.randint.rvs(random_state=random_state)
+        if self.multiplier == "e-":
+            # Coerce to a Python int: numpy integers refuse negative powers.
+            return 10 ** -int(randinteger)
+        return randinteger
+    def get_nb_possibilities(self):
+        """Return the number of distinct integers that can be drawn."""
+        # The frozen distribution's support is the closed interval [a, b] with
+        # b == high - 1, so the number of values is b - a + 1 (not b - a).
+        return self.randint.b - self.randint.a + 1
+def compute_possible_combinations(params_dict):
+    """Count the candidate values offered by each hyper-parameter distribution.
+    Returns a float array with one entry per value of ``params_dict`` (in the
+    dict's iteration order): the length of a list, the number of integers of a
+    ``CustomRandint``, and ``inf`` for any other kind of distribution."""
+    n_possibs = np.full(len(params_dict), np.inf)
+    for value_index, value in enumerate(params_dict.values()):
+        # isinstance (rather than an exact type comparison) also counts the
+        # elements of list subclasses.
+        if isinstance(value, list):
+            n_possibs[value_index] = len(value)
+        elif isinstance(value, CustomRandint):
+            n_possibs[value_index] = value.get_nb_possibilities()
+    return n_possibs
+def get_test_folds_preds(X, y, cv, estimator, framework, available_indices=None):
+    """Refit ``estimator`` on every training fold produced by ``cv`` and
+    collect its predictions on the matching test fold.
+    In the "monoview" framework ``X`` and ``y`` are indexed directly with the
+    fold indices. In the "multiview" framework the folds are computed on
+    ``available_indices`` and the estimator receives the whole ``X`` and ``y``
+    together with the indices of the examples to use.
+    Returns an array with one row per fold, each row truncated to the length
+    of the shortest test fold. Raises ValueError on an unknown ``framework``."""
+    test_folds_prediction = []
+    if framework == "monoview":
+        folds = cv.split(np.arange(len(y)), y)
+    elif framework == "multiview":
+        # ``[()]`` loads a whole h5py dataset (the ``.value`` attribute was
+        # removed in h5py 3) and leaves a numpy array unchanged.
+        y = y[()]
+        folds = cv.split(available_indices, y[available_indices])
+    else:
+        raise ValueError(
+            "framework must be 'monoview' or 'multiview', got %r" % (framework,))
+    fold_lengths = np.zeros(cv.n_splits, dtype=int)
+    for fold_idx, (train_indices, test_indices) in enumerate(folds):
+        fold_lengths[fold_idx] = len(test_indices)
+        if framework == "monoview":
+            estimator.fit(X[train_indices], y[train_indices])
+            # Predict on the held-out fold, not on the training examples.
+            test_folds_prediction.append(estimator.predict(X[test_indices]))
+        else:
+            estimator.fit(X, y, available_indices[train_indices])
+            test_folds_prediction.append(
+                estimator.predict(X, available_indices[test_indices]))
+    # Folds may differ in size by one example: truncate to get a 2D array.
+    minFoldLength = fold_lengths.min()
+    test_folds_prediction = np.array(
+        [test_fold_prediction[:minFoldLength] for test_fold_prediction in
+         test_folds_prediction])
+    return test_folds_prediction
+def randomized_search(X, y, framework, random_state, output_file_name, classifier_module,
+                         classifier_name, folds=4, nb_cores=1, metric=["accuracy_score", None], n_iter=30,
+                         classifier_kwargs=None, learning_indices=None):
+    """Run a randomized hyper-parameter search for a mono- or multiview
+    classifier and return ``(bestParams, testFoldsPreds)``.
+    The estimator is built from ``getattr(classifier_module, classifier_name)``
+    with ``random_state`` and ``classifier_kwargs``. If its ``genDistribs()``
+    is empty no search is run and ``bestParams`` is ``{}``. ``metric`` is a
+    ``[metric_name, metric_config]`` pair looked up in the ``metrics``
+    package; the default list is never mutated here.
+    NOTE(review): ``folds`` is handed to ``get_test_folds_preds``, which calls
+    ``cv.split`` on it, so it presumably has to be a splitter object rather
+    than the integer default - confirm against callers."""
+    # ``**None`` raises a TypeError: treat a missing config as an empty one.
+    if classifier_kwargs is None:
+        classifier_kwargs = {}
+    estimator = getattr(classifier_module, classifier_name)(random_state,
+                                                           **classifier_kwargs)
+    params_dict = estimator.genDistribs()
+    if params_dict:
+        metricModule = getattr(metrics, metric[0])
+        if metric[1] is not None:
+            # Keyword names must be strings, integer keys cannot be expanded
+            # with ``**``. NOTE(review): assumes the metric modules read their
+            # configuration under "0", "1", ... - confirm.
+            metricKWARGS = dict((str(index), metricConfig) for index, metricConfig in
+                                enumerate(metric[1]))
+        else:
+            metricKWARGS = {}
+        scorer = metricModule.get_scorer(**metricKWARGS)
+        # Cap the budget of each parameter by its number of possible values.
+        nb_possible_combinations = compute_possible_combinations(params_dict)
+        min_list = np.array(
+            [min(nb_possible_combination, n_iter) for nb_possible_combination in
+             nb_possible_combinations])
+        randomSearch = MultiviewCompatibleRandomizedSearchCV(estimator,
+                                                             n_iter=int(np.sum(min_list)),
+                                                             param_distributions=params_dict,
+                                                             refit=True,
+                                                             n_jobs=nb_cores, scoring=scorer,
+                                                             cv=folds, random_state=random_state,
+                                                             learning_indices=learning_indices,
+                                                             framework=framework)
+        detector = randomSearch.fit(X, y)
+        # Compare strings by value: ``is not`` tests identity and only works
+        # by accident of string interning.
+        bestParams = dict((key, value) for key, value in
+                          estimator.genBestParams(detector).items() if
+                          key != "random_state")
+        scoresArray = detector.cv_results_['mean_test_score']
+        params = estimator.genParamsFromDetector(detector)
+        genHeatMaps(params, scoresArray, output_file_name)
+        best_estimator = detector.best_estimator_
+    else:
+        best_estimator = estimator
+        bestParams = {}
+    testFoldsPreds = get_test_folds_preds(X, y, folds, best_estimator,
+                                          framework, learning_indices)
+    return bestParams, testFoldsPreds
+from sklearn.base import clone
+class MultiviewCompatibleRandomizedSearchCV(RandomizedSearchCV):
+    """Randomized hyper-parameter search usable on monoview and multiview data.
+    In the "monoview" framework the scikit-learn implementation is used as is.
+    In the "multiview" framework the cross-validation loop is run by hand so
+    that estimators receive the whole dataset plus the indices of the examples
+    to use (``learning_indices``)."""
+    def __init__(self, estimator, param_distributions, n_iter=10,
+                 refit=True, n_jobs=1, scoring=None, cv=None,
+                 random_state=None, learning_indices=None, framework="monoview"):
+        super(MultiviewCompatibleRandomizedSearchCV, self).__init__(estimator,
+                                                                    n_iter=n_iter,
+                                                                    param_distributions=param_distributions,
+                                                                    refit=refit,
+                                                                    n_jobs=n_jobs, scoring=scoring,
+                                                                    cv=cv, random_state=random_state)
+        self.framework = framework
+        # Also stored under the constructor argument's name so that
+        # BaseEstimator.get_params() (used by repr and clone) can find it.
+        self.learning_indices = learning_indices
+        self.available_indices = learning_indices
+    def fit(self, X, y=None, groups=None, **fit_params):
+        """Dispatch to scikit-learn's search ("monoview") or to
+        ``fit_multiview`` ("multiview"). Raises ValueError otherwise."""
+        if self.framework == "monoview":
+            return super(MultiviewCompatibleRandomizedSearchCV, self).fit(X, y=y, groups=groups, **fit_params)
+        elif self.framework == "multiview":
+            # ``[()]`` loads a whole h5py dataset (``.value`` was removed in
+            # h5py 3) and leaves a numpy array unchanged.
+            return self.fit_multiview(X, y=y[()], groups=groups, **fit_params)
+        raise ValueError(
+            "framework must be 'monoview' or 'multiview', got %r" % (self.framework,))
+    def fit_multiview(self, X, y=None, groups=None, **fit_params):
+        """Cross-validate every sampled candidate on ``available_indices`` and
+        keep the best one in ``best_params_`` / ``best_score_``.
+        ``cv_results_`` gets one ``param_<name>`` list per parameter and a
+        ``mean_test_score`` list holding the raw metric values."""
+        # Public equivalent of the private ``_get_param_iterator`` helper,
+        # which is not available in every scikit-learn release.
+        from sklearn.model_selection import ParameterSampler
+        available_labels = y[self.available_indices]
+        n_splits = self.cv.get_n_splits(self.available_indices, available_labels)
+        candidate_params = list(ParameterSampler(self.param_distributions,
+                                                 self.n_iter,
+                                                 random_state=self.random_state))
+        if not candidate_params:
+            raise ValueError("No hyper-parameter candidate could be sampled.")
+        # ``cv.split`` returns a one-shot generator: materialise it so that
+        # every candidate (not only the first one) is evaluated on the folds.
+        folds = list(self.cv.split(self.available_indices, available_labels))
+        base_estimator = clone(self.estimator)
+        # Scorers built with greater_is_better=False carry a sign of -1.
+        sign = getattr(self.scoring, "_sign", 1)
+        self.cv_results_ = dict(("param_"+param_name, []) for param_name in candidate_params[0].keys())
+        self.cv_results_["mean_test_score"] = []
+        best_oriented_score = None
+        for candidate_param in candidate_params:
+            test_scores = np.zeros(n_splits)
+            for fold_idx, (train_indices, test_indices) in enumerate(folds):
+                current_estimator = clone(base_estimator)
+                current_estimator.set_params(**candidate_param)
+                current_estimator.fit(X, y,
+                                      train_indices=self.available_indices[train_indices])
+                test_prediction = current_estimator.predict(
+                    X,
+                    self.available_indices[test_indices])
+                test_scores[fold_idx] = self.scoring._score_func(
+                    y[self.available_indices[test_indices]], test_prediction)
+            for param_name, param in candidate_param.items():
+                self.cv_results_["param_"+param_name].append(param)
+            cross_validation_score = np.mean(test_scores)
+            self.cv_results_["mean_test_score"].append(cross_validation_score)
+            # Keep the candidate with the best score, i.e. the highest one once
+            # oriented by the scorer's sign (the first one wins on ties).
+            oriented_score = sign * cross_validation_score
+            if best_oriented_score is None or oriented_score > best_oriented_score:
+                best_oriented_score = oriented_score
+                self.best_params_ = candidate_param
+                self.best_score_ = cross_validation_score
+        if self.refit:
+            # NOTE(review): the estimator is only configured with the best
+            # parameters here, it is not fitted on the learning examples.
+            self.best_estimator_ = clone(base_estimator).set_params(**self.best_params_)
+        self.n_splits_ = n_splits
+        return self
+    def get_test_folds_preds(self, X, y, estimator):
+        """Refit ``estimator`` on each training fold and return its test-fold
+        predictions, one row per fold, truncated to the shortest fold."""
+        test_folds_prediction = []
+        if self.framework == "monoview":
+            folds = self.cv.split(np.arange(len(y)), y)
+        elif self.framework == "multiview":
+            y = y[()]
+            # Labels must be restricted to the examples being split.
+            folds = self.cv.split(self.available_indices,
+                                  y[self.available_indices])
+        else:
+            raise ValueError(
+                "framework must be 'monoview' or 'multiview', got %r" % (self.framework,))
+        fold_lengths = np.zeros(self.cv.n_splits, dtype=int)
+        for fold_idx, (train_indices, test_indices) in enumerate(folds):
+            fold_lengths[fold_idx] = len(test_indices)
+            if self.framework == "monoview":
+                estimator.fit(X[train_indices], y[train_indices])
+                # Predict on the held-out fold, not on the training examples.
+                test_folds_prediction.append(estimator.predict(X[test_indices]))
+            else:
+                estimator.fit(X, y, self.available_indices[train_indices])
+                test_folds_prediction.append(estimator.predict(X, self.available_indices[test_indices]))
+        minFoldLength = fold_lengths.min()
+        test_folds_prediction = np.array(
+            [test_fold_prediction[:minFoldLength] for test_fold_prediction in test_folds_prediction])
+        return test_folds_prediction
 def randomizedSearch(dataset, labels, classifierPackage, classifierName,
                     metrics, learningIndices, KFolds, randomState,

--- a/multiview_platform/tests/test_mono_view/test_ExecClassifMonoView.py
+++ b/multiview_platform/tests/test_mono_view/test_ExecClassifMonoView.py
@@ -3,8 +3,10 @@ import unittest
 import h5py
 import numpy as np
+from sklearn.model_selection import StratifiedKFold
 from ...mono_multi_view_classifiers.monoview import exec_classif_mono_view
+from ...mono_multi_view_classifiers.monoview_classifiers import decision_tree
 class Test_initConstants(unittest.TestCase):
@@ -88,6 +90,48 @@ class Test_initTrainTest(unittest.TestCase):
        np.testing.assert_array_equal(y_train, np.array([0, 0, 1, 0, 0]))
        np.testing.assert_array_equal(y_test, np.array([1, 1, 0, 0, 0]))
+class Test_getHPs(unittest.TestCase):
+    """Smoke test: ``getHPs`` runs a randomized search on a decision tree."""
+    @classmethod
+    def setUpClass(cls):
+        # Temporary directory receiving the files written by the search.
+        os.mkdir("multiview_platform/tests/tmp_tests")
+        rng = np.random.RandomState(42)
+        cls.classifierModule = decision_tree
+        cls.hyperParamSearch = "randomized_search"
+        cls.n_iter = 2
+        cls.classifier_name = "DecisionTree"
+        cls.random_state = rng
+        # Drawn in this order (features first) from the shared generator.
+        cls.X = rng.randint(0, 10, size=(10, 5))
+        cls.y = rng.randint(0, 2, size=10)
+        cls.output_file_name = "multiview_platform/tests/tmp_tests/"
+        cls.cv = StratifiedKFold(n_splits=2, random_state=rng)
+        cls.nb_cores = 1
+        cls.metrics = [["accuracy_score", None]]
+        tree_config = {"max_depth": 1,
+                       "criterion": "gini",
+                       "splitter": "best"}
+        cls.kwargs = {"DecisionTreeKWARGS": tree_config}
+    @classmethod
+    def tearDownClass(cls):
+        tmp_dir = "multiview_platform/tests/tmp_tests"
+        for file_name in os.listdir(tmp_dir):
+            os.remove(os.path.join(tmp_dir, file_name))
+        os.rmdir(tmp_dir)
+    def test_simple(self):
+        arguments = (self.classifierModule,
+                     self.hyperParamSearch,
+                     self.n_iter,
+                     self.classifier_name,
+                     self.X,
+                     self.y,
+                     self.random_state,
+                     self.output_file_name,
+                     self.cv,
+                     self.nb_cores,
+                     self.metrics,
+                     self.kwargs)
+        kwargs, test_folds_predictions = exec_classif_mono_view.getHPs(*arguments)
 # class Test_getKWARGS(unittest.TestCase):
 #
 #     @classmethod

--- a/multiview_platform/tests/test_mono_view/test_MonoviewUtils.py
+++ b/multiview_platform/tests/test_mono_view/test_MonoviewUtils.py
@@ -30,3 +30,4 @@ class Test_genTestFoldsPreds(unittest.TestCase):
        cls.assertEqual(testFoldsPreds.shape, (3, 10))
        np.testing.assert_array_equal(testFoldsPreds[0], np.array(
            [1, 1, -1, -1, 1, 1, -1, 1, -1, 1]))
--- a/multiview_platform/tests/test_utils/test_hyper_parameter_search.py
+++ b/multiview_platform/tests/test_utils/test_hyper_parameter_search.py
+import os
+import unittest
+import h5py
+import numpy as np
+from sklearn.model_selection import StratifiedKFold
+from ...mono_multi_view_classifiers.utils import hyper_parameter_search
+from ...mono_multi_view_classifiers.multiview_classifiers import weighted_linear_early_fusion
+class Test_randomized_search(unittest.TestCase):
+    """Smoke test: multiview ``randomized_search`` on a small HDF5 dataset."""
+    @classmethod
+    def setUpClass(cls):
+        rng = np.random.RandomState(42)
+        cls.random_state = rng
+        cls.view_weights = [0.5, 0.5]
+        os.mkdir("multiview_platform/tests/tmp_tests")
+        dataset_file = h5py.File(
+            "multiview_platform/tests/tmp_tests/test_file.hdf5", "w")
+        cls.dataset_file = dataset_file
+        label_values = np.array([0, 1, 0, 0, 1, 0, 1, 0, 0, 1, ])
+        cls.labels = dataset_file.create_dataset("Labels", data=label_values)
+        # Two random, dense views of ten examples with four features each.
+        cls.view0_data = rng.randint(1, 10, size=(10, 4))
+        view0 = dataset_file.create_dataset("View0", data=cls.view0_data)
+        view0.attrs["sparse"] = False
+        cls.view1_data = rng.randint(1, 10, size=(10, 4))
+        view1 = dataset_file.create_dataset("View1", data=cls.view1_data)
+        view1.attrs["sparse"] = False
+        metaDataGrp = dataset_file.create_group("Metadata")
+        metaDataGrp.attrs["nbView"] = 2
+        metaDataGrp.attrs["nbClass"] = 2
+        metaDataGrp.attrs["datasetLength"] = 10
+        cls.monoview_classifier_name = "decision_tree"
+        cls.monoview_classifier_config = {"max_depth": 1,
+                                          "criterion": "gini",
+                                          "splitter": "best"}
+        cls.k_folds = StratifiedKFold(n_splits=3, random_state=rng)
+        cls.learning_indices = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
+    @classmethod
+    def tearDownClass(cls):
+        cls.dataset_file.close()
+        tmp_dir = "multiview_platform/tests/tmp_tests"
+        for file_name in os.listdir(tmp_dir):
+            os.remove(os.path.join(tmp_dir, file_name))
+        os.rmdir(tmp_dir)
+    def test_simple(self):
+        search_results = hyper_parameter_search.randomized_search(
+            self.dataset_file, self.labels, "multiview", self.random_state,
+            "multiview_platform/tests/tmp_tests/",
+            weighted_linear_early_fusion, "WeightedLinearEarlyFusion",
+            self.k_folds, 1, ["accuracy_score", None], 2, {},
+            learning_indices=self.learning_indices)
+        best_params, test_folds_preds = search_results