diff --git a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py
index d228a56b766402017ebe9316ed9b1df45808d259..49b2fa2f848dde1182eb47784b4a3cca7b7ba38b 100644
--- a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py
+++ b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py
@@ -17,6 +17,7 @@ from .analyze_result import execute
 # Import own modules
 from .. import monoview_classifiers
 from ..utils.dataset import getValue, extractSubset
+from ..utils import hyper_parameter_search
 
 # Author-Info
 __author__ = "Nikolas Huelsmann, Baptiste BAUVIN"
@@ -182,16 +183,16 @@ def getHPs(classifierModule, hyperParamSearch, nIter, CL_type, X_train, y_train,
         logging.debug(
             "Start:\t " + hyperParamSearch + " best settings with " + str(
                 nIter) + " iterations for " + CL_type)
-        classifierHPSearch = getattr(monoview_utils, hyperParamSearch)
-        clKWARGS, testFoldsPreds = classifierHPSearch(X_train, y_train,
+        classifierHPSearch = getattr(hyper_parameter_search, hyperParamSearch)
+        clKWARGS, testFoldsPreds = classifierHPSearch(X_train, y_train, "monoview",
                                                       randomState,
                                                       outputFileName,
                                                       classifierModule, CL_type,
-                                                      KFolds=KFolds,
-                                                      nbCores=nbCores,
+                                                      folds=KFolds,
+                                                      nb_cores=nbCores,
                                                       metric=metrics[0],
-                                                      nIter=nIter,
-                                                      classifier_KWARGS=kwargs[
+                                                      n_iter=nIter,
+                                                      classifier_kwargs=kwargs[
                                                           CL_type + "KWARGS"])
         logging.debug("Done:\t " + hyperParamSearch + " best settings")
     else:
diff --git a/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py b/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py
index 9f75e36ff13f02c2a4611ba33d344b18879acb07..34da170636c08d158470f972378c4c8993801310 100644
--- a/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py
+++ b/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py
@@ -17,50 +17,6 @@ __status__ = "Prototype"  # Production, Development, Prototype
 # __date__ = 2016 - 03 - 25
 
 
-
-def randomizedSearch(X_train, y_train, randomState, outputFileName,
-                     classifierModule, CL_type, KFolds=4, nbCores=1,
-                     metric=["accuracy_score", None], nIter=30,
-                     classifier_KWARGS=None):
-    estimator = getattr(classifierModule, CL_type)(randomState,
-                                                   **classifier_KWARGS)
-    params_dict = estimator.genDistribs()
-    if params_dict:
-        metricModule = getattr(metrics, metric[0])
-        if metric[1] is not None:
-            metricKWARGS = dict((index, metricConfig) for index, metricConfig in
-                                enumerate(metric[1]))
-        else:
-            metricKWARGS = {}
-        scorer = metricModule.get_scorer(**metricKWARGS)
-        nb_possible_combinations = compute_possible_combinations(params_dict)
-        min_list = np.array(
-            [min(nb_possible_combination, nIter) for nb_possible_combination in
-             nb_possible_combinations])
-        randomSearch = RandomizedSearchCV(estimator,
-                                          n_iter=int(np.sum(min_list)),
-                                          param_distributions=params_dict,
-                                          refit=True,
-                                          n_jobs=nbCores, scoring=scorer,
-                                          cv=KFolds, random_state=randomState)
-        detector = randomSearch.fit(X_train, y_train)
-
-        bestParams = dict((key, value) for key, value in
-                          estimator.genBestParams(detector).items() if
-                          key is not "random_state")
-
-        scoresArray = detector.cv_results_['mean_test_score']
-        params = estimator.genParamsFromDetector(detector)
-
-        hyper_parameter_search.genHeatMaps(params, scoresArray, outputFileName)
-        best_estimator = detector.best_estimator_
-    else:
-        best_estimator = estimator
-        bestParams = {}
-    testFoldsPreds = genTestFoldsPreds(X_train, y_train, KFolds, best_estimator)
-    return bestParams, testFoldsPreds
-
-
 def change_label_to_minus(y):
     minus_y = np.copy(y)
     minus_y[np.where(y == 0)] = -1
diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py
index c6eaf6e83d437c1a699e28b278c11a0aa8bdc59d..6f9569bd37ff76af745bd628f30f20781815df8e 100644
--- a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py
+++ b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py
@@ -113,11 +113,10 @@ def ExecMultiview(directory, DATASET, name, classificationIndices, KFolds,
     logging.debug("Done:\t Getting train/test split")
 
     logging.debug("Start:\t Getting classifiers modules")
-    classifierPackage = getattr(multiview_classifiers,
-                                CL_type)  # Permet d'appeler un module avec une string
-    classifierModule = getattr(classifierPackage, CL_type + "Module")
+    # classifierPackage = getattr(multiview_classifiers,
+    #                             CL_type)  # used to fetch a module from a string
+    classifierModule = getattr(multiview_classifiers, CL_type)
     classifierClass = getattr(classifierModule, CL_type + "Class")
-    analysisModule = getattr(classifierPackage, "analyzeResults")
     logging.debug("Done:\t Getting classifiers modules")
 
     logging.debug("Start:\t Optimizing hyperparameters")
diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/utils.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/utils.py
index dd95914da55a88a6586f4272cf6de2fa049491f5..a1905ac68e0444f82ba14040c7aee980deb2cba7 100644
--- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/utils.py
+++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/utils.py
@@ -12,6 +12,10 @@ from sklearn.utils.validation import check_is_fitted
 from ... import metrics
 
 
+def get_names(classed_list):
+    return np.array([object_.__class__.__name__ for object_ in classed_list])
+
+
 class BaseMultiviewClassifier(BaseEstimator, ClassifierMixin):
 
     def __init__(self, random_state):
diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py
index fd340f9347f9ad0c76dd045920c5b51497f82558..7e0fd7e925eaf974571b1f18a83cf3d9ffff6905 100644
--- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py
+++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pkgutil
 
 from ..utils.dataset import getV
 from .additions.utils import BaseMultiviewClassifier, get_train_views_indices
@@ -7,22 +8,32 @@ from .. import monoview_classifiers
 
 class WeightedLinearEarlyFusion(BaseMultiviewClassifier):
 
-    def __init__(self, view_weights=None, monoview_classifier="decision_tree", monoview_classifier_config=None, random_state=42):
+    def __init__(self, random_state=None, view_weights=None, monoview_classifier="decision_tree", monoview_classifier_config={}):
         super(WeightedLinearEarlyFusion, self).__init__(random_state=random_state)
-        self.view_weights = np.array(view_weights)
-        if type(monoview_classifier) == str:
+        self.view_weights = view_weights
+        if isinstance(monoview_classifier, str):
             monoview_classifier_module = getattr(monoview_classifiers, monoview_classifier)
-            monoview_classifier_class = getattr(monoview_classifier_module, monoview_classifier_module.classifier_class_name)
+            monoview_classifier_class = getattr(monoview_classifier_module,
+                                                monoview_classifier_module.classifier_class_name)
             self.monoview_classifier = monoview_classifier_class(**monoview_classifier_config)
         else:
             self.monoview_classifier = monoview_classifier
+        self.param_names = ["monoview_classifier", ]
+        classifier_classes = []
+        for name in dir(monoview_classifiers):
+            module = getattr(monoview_classifiers, name)
+            if name == "decision_tree":
+                classifier_class = getattr(module, module.classifier_class_name)()
+                classifier_classes.append(classifier_class)
+        self.distribs = [classifier_classes]
+        self.classed_params = ["monoview_classifier"]
 
     def fit(self, X, y, train_indices=None, view_indices=None):
         train_indices, X = self.transform_data_to_monoview(X, train_indices, view_indices)
         self.monoview_classifier.fit(X, y[train_indices])
 
-    def predict(self, X, predict_indices, view_indices):
+    def predict(self, X, predict_indices=None, view_indices=None):
         _, X = self.transform_data_to_monoview(X, predict_indices, view_indices)
         predicted_labels = self.monoview_classifier.predict(X)
         return predicted_labels
@@ -35,6 +46,8 @@ class WeightedLinearEarlyFusion(BaseMultiviewClassifier):
                                                                      view_indices)
         if self.view_weights is None:
             self.view_weights = np.ones(len(self.view_indices), dtype=float)
+        else:
+            self.view_weights = np.array(self.view_weights)
         self.view_weights /= float(np.sum(self.view_weights))
 
         X = self.hdf5_to_monoview(dataset, example_indices, self.view_indices)
diff --git a/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py b/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py
index 7124c2654125c1b454f5a0ecec05a0277533f833..250bdac228cf94b5db5d6832f412816f5b2ff0d9 100644
--- a/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py
+++ b/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py
@@ -3,6 +3,9 @@ import sys
 import matplotlib.pyplot as plt
 import numpy as np
 
+from scipy.stats import randint
+from sklearn.model_selection import RandomizedSearchCV
+
 from .. import metrics
 
 
@@ -30,6 +33,185 @@ def gridSearch(dataset, classifierName, viewsIndices=None, kFolds=None, nIter=1,
     """Used to perfom gridsearch on the classifiers"""
     pass
 
 
+class CustomRandint:
+    """Used as a distribution returning an integer between low and high-1.
+    It can be used with a multiplier argument to be able to perform more complex generation,
+    for example 10**-randint."""
+
+    def __init__(self, low=0, high=0, multiplier=""):
+        self.randint = randint(low, high)
+        self.multiplier = multiplier
+
+    def rvs(self, random_state=None):
+        randinteger = self.randint.rvs(random_state=random_state)
+        if self.multiplier == "e-":
+            return 10 ** -randinteger
+        else:
+            return randinteger
+
+    def get_nb_possibilities(self):
+        return self.randint.b - self.randint.a
+
+def compute_possible_combinations(params_dict):
+    n_possibs = np.ones(len(params_dict)) * np.inf
+    for value_index, value in enumerate(params_dict.values()):
+        if type(value) == list:
+            n_possibs[value_index] = len(value)
+        elif isinstance(value, CustomRandint):
+            n_possibs[value_index] = value.get_nb_possibilities()
+    return n_possibs
+
+
+def get_test_folds_preds(X, y, cv, estimator, framework, available_indices=None):
+    test_folds_prediction = []
+    if framework == "monoview":
+        folds = cv.split(np.arange(len(y)), y)
+    if framework == "multiview":
+        y = y.value
+        folds = cv.split(available_indices, y[available_indices])
+    fold_lengths = np.zeros(cv.n_splits, dtype=int)
+    for fold_idx, (train_indices, test_indices) in enumerate(folds):
+        fold_lengths[fold_idx] = len(test_indices)
+        if framework == "monoview":
+            estimator.fit(X[train_indices], y[train_indices])
+            test_folds_prediction.append(estimator.predict(X[test_indices]))
+        if framework == "multiview":
+            estimator.fit(X, y, available_indices[train_indices])
+            test_folds_prediction.append(
+                estimator.predict(X, available_indices[test_indices]))
+    minFoldLength = fold_lengths.min()
+    test_folds_prediction = np.array(
+        [test_fold_prediction[:minFoldLength] for test_fold_prediction in
+         test_folds_prediction])
+    return test_folds_prediction
+
+
+def randomized_search(X, y, framework, random_state, output_file_name, classifier_module,
+                      classifier_name, folds=4, nb_cores=1, metric=["accuracy_score", None], n_iter=30,
+                      classifier_kwargs=None, learning_indices=None):
+    estimator = getattr(classifier_module, classifier_name)(random_state,
+                                                            **classifier_kwargs)
+    params_dict = estimator.genDistribs()
+    if params_dict:
+        metricModule = getattr(metrics, metric[0])
+        if metric[1] is not None:
+            metricKWARGS = dict((index, metricConfig) for index, metricConfig in
+                                enumerate(metric[1]))
+        else:
+            metricKWARGS = {}
+        scorer = metricModule.get_scorer(**metricKWARGS)
+        nb_possible_combinations = compute_possible_combinations(params_dict)
+        min_list = np.array(
+            [min(nb_possible_combination, n_iter) for nb_possible_combination in
+             nb_possible_combinations])
+        randomSearch = MultiviewCompatibleRandomizedSearchCV(estimator,
+                                                             n_iter=int(np.sum(min_list)),
+                                                             param_distributions=params_dict,
+                                                             refit=True,
+                                                             n_jobs=nb_cores, scoring=scorer,
+                                                             cv=folds, random_state=random_state,
+                                                             learning_indices=learning_indices,
+                                                             framework=framework)
+        detector = randomSearch.fit(X, y)
+
+        bestParams = dict((key, value) for key, value in
+                          estimator.genBestParams(detector).items() if
+                          key != "random_state")
+
+        scoresArray = detector.cv_results_['mean_test_score']
+        params = estimator.genParamsFromDetector(detector)
+
+        genHeatMaps(params, scoresArray, output_file_name)
+        best_estimator = detector.best_estimator_
+    else:
+        best_estimator = estimator
+        bestParams = {}
+    testFoldsPreds = get_test_folds_preds(X, y, folds, best_estimator,
+                                          framework, learning_indices)
+    return bestParams, testFoldsPreds
+
+
+from sklearn.base import clone
+
+
+class MultiviewCompatibleRandomizedSearchCV(RandomizedSearchCV):
+
+    def __init__(self, estimator, param_distributions, n_iter=10,
+                 refit=True, n_jobs=1, scoring=None, cv=None,
+                 random_state=None, learning_indices=None, framework="monoview"):
+        super(MultiviewCompatibleRandomizedSearchCV, self).__init__(estimator,
+                                                                    n_iter=n_iter,
+                                                                    param_distributions=param_distributions,
+                                                                    refit=refit,
+                                                                    n_jobs=n_jobs, scoring=scoring,
+                                                                    cv=cv, random_state=random_state)
+        self.framework = framework
+        self.available_indices = learning_indices
+
+    def fit(self, X, y=None, groups=None, **fit_params):
+        if self.framework == "monoview":
+            return super(MultiviewCompatibleRandomizedSearchCV, self).fit(X, y=y, groups=groups, **fit_params)
+        elif self.framework == "multiview":
+            return self.fit_multiview(X, y=y.value, groups=groups, **fit_params)
+
+    def fit_multiview(self, X, y=None, groups=None, **fit_params):
+        n_splits = self.cv.get_n_splits(self.available_indices, y[self.available_indices])
+        # Materialize the folds once so every candidate is evaluated on the same splits.
+        folds = list(self.cv.split(self.available_indices, y[self.available_indices]))
+        candidate_params = list(self._get_param_iterator())
+        base_estimator = clone(self.estimator)
+        results = {}
+        self.cv_results_ = dict(("param_"+param_name, []) for param_name in candidate_params[0].keys())
+        self.cv_results_["mean_test_score"] = []
+        for candidate_param_idx, candidate_param in enumerate(candidate_params):
+            test_scores = np.zeros(n_splits)
+            for fold_idx, (train_indices, test_indices) in enumerate(folds):
+                current_estimator = clone(base_estimator)
+                current_estimator.set_params(**candidate_param)
+                current_estimator.fit(X, y,
+                                      train_indices=self.available_indices[train_indices])
+                test_prediction = current_estimator.predict(
+                    X,
+                    self.available_indices[test_indices])
+                test_score = self.scoring._score_func(y[self.available_indices[test_indices]],
+                                                      test_prediction)
+                test_scores[fold_idx] = test_score
+            for param_name, param in candidate_param.items():
+                self.cv_results_["param_"+param_name].append(param)
+            cross_validation_score = np.mean(test_scores)
+            self.cv_results_["mean_test_score"].append(cross_validation_score)
+            results[candidate_param_idx] = cross_validation_score
+            # Keep the candidate with the highest mean test score.
+            if cross_validation_score >= max(results.values()):
+                self.best_params_ = candidate_params[candidate_param_idx]
+                self.best_score_ = cross_validation_score
+        if self.refit:
+            self.best_estimator_ = clone(base_estimator).set_params(**self.best_params_)
+        self.n_splits_ = n_splits
+        return self
+
+
+    def get_test_folds_preds(self, X, y, estimator):
+        test_folds_prediction = []
+        if self.framework == "monoview":
+            folds = self.cv.split(np.arange(len(y)), y)
+        if self.framework == "multiview":
+            folds = self.cv.split(self.available_indices, y)
+        fold_lengths = np.zeros(self.cv.n_splits, dtype=int)
+        for fold_idx, (train_indices, test_indices) in enumerate(folds):
+            fold_lengths[fold_idx] = len(test_indices)
+            if self.framework == "monoview":
+                estimator.fit(X[train_indices], y[train_indices])
+                test_folds_prediction.append(estimator.predict(X[test_indices]))
+            if self.framework == "multiview":
+                estimator.fit(X, y, self.available_indices[train_indices])
+                test_folds_prediction.append(estimator.predict(X, self.available_indices[test_indices]))
+        minFoldLength = fold_lengths.min()
+        test_folds_prediction = np.array(
+            [test_fold_prediction[:minFoldLength] for test_fold_prediction in test_folds_prediction])
+        return test_folds_prediction
+
+
+
 
 def randomizedSearch(dataset, labels, classifierPackage, classifierName, metrics, learningIndices, KFolds, randomState,
diff --git a/multiview_platform/tests/test_mono_view/test_ExecClassifMonoView.py b/multiview_platform/tests/test_mono_view/test_ExecClassifMonoView.py
index 9f380ca9b73695a4bdb414e7103f112ac6211573..04ad40d4ce5eeb9d1e3d0def002151ce7b66c547 100644
--- a/multiview_platform/tests/test_mono_view/test_ExecClassifMonoView.py
+++ b/multiview_platform/tests/test_mono_view/test_ExecClassifMonoView.py
@@ -3,8 +3,10 @@ import unittest
 
 import h5py
 import numpy as np
+from sklearn.model_selection import StratifiedKFold
 
 from ...mono_multi_view_classifiers.monoview import exec_classif_mono_view
+from ...mono_multi_view_classifiers.monoview_classifiers import decision_tree
 
 
 class Test_initConstants(unittest.TestCase):
@@ -88,6 +90,48 @@ class Test_initTrainTest(unittest.TestCase):
         np.testing.assert_array_equal(y_train, np.array([0, 0, 1, 0, 0]))
         np.testing.assert_array_equal(y_test, np.array([1, 1, 0, 0, 0]))
 
+
+class Test_getHPs(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        os.mkdir("multiview_platform/tests/tmp_tests")
+        cls.classifierModule = decision_tree
+        cls.hyperParamSearch = "randomized_search"
+        cls.n_iter = 2
+        cls.classifier_name = "DecisionTree"
+        cls.random_state = np.random.RandomState(42)
+        cls.X = cls.random_state.randint(0, 10, size=(10, 5))
+        cls.y = cls.random_state.randint(0, 2, size=10)
+        cls.output_file_name = "multiview_platform/tests/tmp_tests/"
+        cls.cv = StratifiedKFold(n_splits=2, random_state=cls.random_state)
+        cls.nb_cores = 1
+        cls.metrics = [["accuracy_score", None]]
+        cls.kwargs = {"DecisionTreeKWARGS": {"max_depth": 1,
+                                             "criterion": "gini",
+                                             "splitter": "best"}}
+
+    @classmethod
+    def tearDownClass(cls):
+        for file_name in os.listdir("multiview_platform/tests/tmp_tests"):
+            os.remove(
+                os.path.join("multiview_platform/tests/tmp_tests", file_name))
+        os.rmdir("multiview_platform/tests/tmp_tests")
+
+    def test_simple(self):
+        kwargs, test_folds_predictions = exec_classif_mono_view.getHPs(self.classifierModule,
+                                                                       self.hyperParamSearch,
+                                                                       self.n_iter,
+                                                                       self.classifier_name,
+                                                                       self.X,
+                                                                       self.y,
+                                                                       self.random_state,
+                                                                       self.output_file_name,
+                                                                       self.cv,
+                                                                       self.nb_cores,
+                                                                       self.metrics,
+                                                                       self.kwargs)
+
 
 # class Test_getKWARGS(unittest.TestCase):
 #
 #     @classmethod
diff --git a/multiview_platform/tests/test_mono_view/test_MonoviewUtils.py b/multiview_platform/tests/test_mono_view/test_MonoviewUtils.py
index da1136c66cd8c639a9c4155a261b7426384a06e2..a68c710a446bf09dbee7acc542564a775423b3b7 100644
--- a/multiview_platform/tests/test_mono_view/test_MonoviewUtils.py
+++ b/multiview_platform/tests/test_mono_view/test_MonoviewUtils.py
@@ -30,3 +30,4 @@ class Test_genTestFoldsPreds(unittest.TestCase):
         cls.assertEqual(testFoldsPreds.shape, (3, 10))
         np.testing.assert_array_equal(testFoldsPreds[0], np.array(
             [1, 1, -1, -1, 1, 1, -1, 1, -1, 1]))
+
diff --git a/multiview_platform/tests/test_utils/test_hyper_parameter_search.py b/multiview_platform/tests/test_utils/test_hyper_parameter_search.py
new file mode 100644
index 0000000000000000000000000000000000000000..aaf11dd65db022f1717ab6c7461dca705e99506d
--- /dev/null
+++ b/multiview_platform/tests/test_utils/test_hyper_parameter_search.py
@@ -0,0 +1,55 @@
+import os
+import unittest
+
+import h5py
+import numpy as np
+from sklearn.model_selection import StratifiedKFold
+
+from ...mono_multi_view_classifiers.utils import hyper_parameter_search
+from ...mono_multi_view_classifiers.multiview_classifiers import weighted_linear_early_fusion
+
+class Test_randomized_search(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.random_state = np.random.RandomState(42)
+        cls.view_weights = [0.5, 0.5]
+        os.mkdir("multiview_platform/tests/tmp_tests")
+        cls.dataset_file = h5py.File(
+            "multiview_platform/tests/tmp_tests/test_file.hdf5", "w")
+        cls.labels = cls.dataset_file.create_dataset("Labels",
+                                                     data=np.array(
+                                                         [0, 1, 0, 0, 1, 0, 1, 0, 0, 1]))
+        cls.view0_data = cls.random_state.randint(1, 10, size=(10, 4))
+        view0 = cls.dataset_file.create_dataset("View0",
+                                                data=cls.view0_data)
+        view0.attrs["sparse"] = False
+        cls.view1_data = cls.random_state.randint(1, 10, size=(10, 4))
+        view1 = cls.dataset_file.create_dataset("View1",
+                                                data=cls.view1_data)
+        view1.attrs["sparse"] = False
+        metaDataGrp = cls.dataset_file.create_group("Metadata")
+        metaDataGrp.attrs["nbView"] = 2
+        metaDataGrp.attrs["nbClass"] = 2
+        metaDataGrp.attrs["datasetLength"] = 10
+        cls.monoview_classifier_name = "decision_tree"
+        cls.monoview_classifier_config = {"max_depth": 1,
+                                          "criterion": "gini",
+                                          "splitter": "best"}
+        cls.k_folds = StratifiedKFold(n_splits=3, random_state=cls.random_state)
+        cls.learning_indices = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.dataset_file.close()
+        for file_name in os.listdir("multiview_platform/tests/tmp_tests"):
+            os.remove(
+                os.path.join("multiview_platform/tests/tmp_tests", file_name))
+        os.rmdir("multiview_platform/tests/tmp_tests")
+
+
+    def test_simple(self):
+        best_params, test_folds_preds = hyper_parameter_search.randomized_search(
+            self.dataset_file, self.labels, "multiview", self.random_state, "multiview_platform/tests/tmp_tests/",
+            weighted_linear_early_fusion, "WeightedLinearEarlyFusion", self.k_folds,
+            1, ["accuracy_score", None], 2, {}, learning_indices=self.learning_indices)
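
For context, the snippet below is a minimal, self-contained sketch of how the CustomRandint distribution added to hyper_parameter_search.py is meant to be consumed: any object exposing rvs(random_state=...) can be passed to scikit-learn's RandomizedSearchCV as a parameter distribution, which is also what MultiviewCompatibleRandomizedSearchCV relies on through the parameter sampler it inherits. The DecisionTreeClassifier/max_depth pairing and the make_classification data are illustrative assumptions, not taken from this patch; the CustomRandint body is copied from it.

    # Illustrative sketch only: estimator and data are placeholder choices.
    from scipy.stats import randint
    from sklearn.datasets import make_classification
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.tree import DecisionTreeClassifier


    class CustomRandint:
        """Distribution returning an integer in [low, high-1]; with
        multiplier="e-", rvs() returns 10 ** -integer instead."""

        def __init__(self, low=0, high=0, multiplier=""):
            self.randint = randint(low, high)
            self.multiplier = multiplier

        def rvs(self, random_state=None):
            randinteger = self.randint.rvs(random_state=random_state)
            if self.multiplier == "e-":
                return 10 ** -randinteger
            return randinteger

        def get_nb_possibilities(self):
            return self.randint.b - self.randint.a


    if __name__ == "__main__":
        # Toy problem used only to exercise the distribution.
        X, y = make_classification(n_samples=100, n_features=5, random_state=42)
        param_distributions = {"max_depth": CustomRandint(low=1, high=10)}
        search = RandomizedSearchCV(DecisionTreeClassifier(random_state=42),
                                    param_distributions=param_distributions,
                                    n_iter=5, cv=3, random_state=42)
        search.fit(X, y)
        print(search.best_params_)  # best sampled max_depth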