Skip to content
Snippets Groups Projects
Commit 35327b99 authored by Baptiste Bauvin's avatar Baptiste Bauvin
Browse files

Combined random search for mono and multiview

parent d3f1d454
No related branches found
No related tags found
No related merge requests found
Showing
with 314 additions and 59 deletions
......@@ -17,6 +17,7 @@ from .analyze_result import execute
# Import own modules
from .. import monoview_classifiers
from ..utils.dataset import getValue, extractSubset
from ..utils import hyper_parameter_search
# Author-Info
__author__ = "Nikolas Huelsmann, Baptiste BAUVIN"
......@@ -182,16 +183,16 @@ def getHPs(classifierModule, hyperParamSearch, nIter, CL_type, X_train, y_train,
logging.debug(
"Start:\t " + hyperParamSearch + " best settings with " + str(
nIter) + " iterations for " + CL_type)
classifierHPSearch = getattr(monoview_utils, hyperParamSearch)
clKWARGS, testFoldsPreds = classifierHPSearch(X_train, y_train,
classifierHPSearch = getattr(hyper_parameter_search, hyperParamSearch)
clKWARGS, testFoldsPreds = classifierHPSearch(X_train, y_train, "monoview",
randomState,
outputFileName,
classifierModule, CL_type,
KFolds=KFolds,
nbCores=nbCores,
folds=KFolds,
nb_cores=nbCores,
metric=metrics[0],
nIter=nIter,
classifier_KWARGS=kwargs[
n_iter=nIter,
classifier_kwargs=kwargs[
CL_type + "KWARGS"])
logging.debug("Done:\t " + hyperParamSearch + " best settings")
else:
......
......@@ -17,50 +17,6 @@ __status__ = "Prototype" # Production, Development, Prototype
# __date__ = 2016 - 03 - 25
def randomizedSearch(X_train, y_train, randomState, outputFileName,
                     classifierModule, CL_type, KFolds=4, nbCores=1,
                     metric=["accuracy_score", None], nIter=30,
                     classifier_KWARGS=None):
    """Run a randomized hyper-parameter search for a monoview classifier.

    Parameters
    ----------
    X_train, y_train : training samples and labels, forwarded to
        ``RandomizedSearchCV.fit`` and to ``genTestFoldsPreds``.
    randomState : passed positionally to the classifier constructor and as
        ``random_state`` to the search.
    outputFileName : forwarded to ``hyper_parameter_search.genHeatMaps``.
    classifierModule, CL_type : the classifier class is looked up as
        ``getattr(classifierModule, CL_type)``.
    KFolds : cross-validation argument handed to the search (``cv=``) and to
        ``genTestFoldsPreds``.
    nbCores : number of jobs for the search.
    metric : ``[metric_module_name, config_or_None]``; the default list is
        only read, never mutated.
    nIter : per-parameter cap on the number of sampled candidates.
    classifier_KWARGS : keyword arguments for the classifier constructor;
        ``None`` means "no extra arguments".

    Returns
    -------
    (bestParams, testFoldsPreds) : the best parameters found (without
        ``random_state``; empty when the classifier exposes no distribution)
        and the output of ``genTestFoldsPreds`` for the retained estimator.
    """
    # ``**None`` raises TypeError, so make the declared default usable.
    if classifier_KWARGS is None:
        classifier_KWARGS = {}
    estimator = getattr(classifierModule, CL_type)(randomState,
                                                   **classifier_KWARGS)
    params_dict = estimator.genDistribs()
    if params_dict:
        metricModule = getattr(metrics, metric[0])
        if metric[1] is not None:
            # NOTE(review): the keys built here are integers, and ``**``
            # unpacking only accepts string keys, so a non-empty metric
            # configuration raises TypeError below -- confirm the key format
            # expected by ``get_scorer`` before relying on this path.
            metricKWARGS = dict(enumerate(metric[1]))
        else:
            metricKWARGS = {}
        scorer = metricModule.get_scorer(**metricKWARGS)
        nb_possible_combinations = compute_possible_combinations(params_dict)
        # Never ask for more draws of a parameter than it has distinct values.
        min_list = np.array(
            [min(nb_possible_combination, nIter) for nb_possible_combination in
             nb_possible_combinations])
        randomSearch = RandomizedSearchCV(estimator,
                                          n_iter=int(np.sum(min_list)),
                                          param_distributions=params_dict,
                                          refit=True,
                                          n_jobs=nbCores, scoring=scorer,
                                          cv=KFolds, random_state=randomState)
        detector = randomSearch.fit(X_train, y_train)
        # Compare by value: ``is not`` on a string literal tests identity and
        # only worked by accident of string interning.
        bestParams = {key: value for key, value in
                      estimator.genBestParams(detector).items()
                      if key != "random_state"}
        scoresArray = detector.cv_results_['mean_test_score']
        params = estimator.genParamsFromDetector(detector)
        hyper_parameter_search.genHeatMaps(params, scoresArray, outputFileName)
        best_estimator = detector.best_estimator_
    else:
        # Nothing to tune: keep the estimator built from the given kwargs.
        best_estimator = estimator
        bestParams = {}
    testFoldsPreds = genTestFoldsPreds(X_train, y_train, KFolds, best_estimator)
    return bestParams, testFoldsPreds
def change_label_to_minus(y):
minus_y = np.copy(y)
minus_y[np.where(y == 0)] = -1
......
......@@ -113,11 +113,10 @@ def ExecMultiview(directory, DATASET, name, classificationIndices, KFolds,
logging.debug("Done:\t Getting train/test split")
logging.debug("Start:\t Getting classifiers modules")
classifierPackage = getattr(multiview_classifiers,
CL_type) # Permet d'appeler un module avec une string
classifierModule = getattr(classifierPackage, CL_type + "Module")
# classifierPackage = getattr(multiview_classifiers,
# CL_type) # Permet d'appeler un module avec une string
classifierModule = getattr(multiview_classifiers, CL_type)
classifierClass = getattr(classifierModule, CL_type + "Class")
analysisModule = getattr(classifierPackage, "analyzeResults")
logging.debug("Done:\t Getting classifiers modules")
logging.debug("Start:\t Optimizing hyperparameters")
......
......@@ -12,6 +12,10 @@ from sklearn.utils.validation import check_is_fitted
from ... import metrics
def get_names(classed_list):
    """Return the class name of every object in ``classed_list`` as an array."""
    class_names = []
    for item in classed_list:
        class_names.append(item.__class__.__name__)
    return np.array(class_names)
class BaseMultiviewClassifier(BaseEstimator, ClassifierMixin):
def __init__(self, random_state):
......
import numpy as np
import pkgutil
from ..utils.dataset import getV
from .additions.utils import BaseMultiviewClassifier, get_train_views_indices
......@@ -7,22 +8,32 @@ from .. import monoview_classifiers
class WeightedLinearEarlyFusion(BaseMultiviewClassifier):
def __init__(self, view_weights=None, monoview_classifier="decision_tree", monoview_classifier_config=None, random_state=42):
def __init__(self, random_state=None, view_weights=None, monoview_classifier="decision_tree", monoview_classifier_config={}):
super(WeightedLinearEarlyFusion, self).__init__(random_state=random_state)
self.view_weights = np.array(view_weights)
if type(monoview_classifier) == str:
self.view_weights = view_weights
if isinstance(monoview_classifier, str):
monoview_classifier_module = getattr(monoview_classifiers,
monoview_classifier)
monoview_classifier_class = getattr(monoview_classifier_module, monoview_classifier_module.classifier_class_name)
monoview_classifier_class = getattr(monoview_classifier_module,
monoview_classifier_module.classifier_class_name)
self.monoview_classifier = monoview_classifier_class(**monoview_classifier_config)
else:
self.monoview_classifier = monoview_classifier
self.param_names = ["monoview_classifier",]
classifier_classes = []
for name in dir(monoview_classifiers):
module = getattr(monoview_classifiers, name)
if name == "decision_tree":
classifier_class = getattr(module, module.classifier_class_name)()
classifier_classes.append(classifier_class)
self.distribs = [classifier_classes]
self.classed_params = ["monoview_classifier"]
def fit(self, X, y, train_indices=None, view_indices=None):
train_indices, X = self.transform_data_to_monoview(X, train_indices, view_indices)
self.monoview_classifier.fit(X, y[train_indices])
def predict(self, X, predict_indices, view_indices):
def predict(self, X, predict_indices=None, view_indices=None):
_, X = self.transform_data_to_monoview(X, predict_indices, view_indices)
predicted_labels = self.monoview_classifier.predict(X)
return predicted_labels
......@@ -35,6 +46,8 @@ class WeightedLinearEarlyFusion(BaseMultiviewClassifier):
view_indices)
if self.view_weights is None:
self.view_weights = np.ones(len(self.view_indices), dtype=float)
else:
self.view_weights = np.array(self.view_weights)
self.view_weights /= float(np.sum(self.view_weights))
X = self.hdf5_to_monoview(dataset, example_indices, self.view_indices)
......
......@@ -3,6 +3,9 @@ import sys
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from .. import metrics
......@@ -30,6 +33,185 @@ def gridSearch(dataset, classifierName, viewsIndices=None, kFolds=None, nIter=1,
"""Used to perfom gridsearch on the classifiers"""
pass
class CustomRandint:
    """Distribution returning an integer between ``low`` and ``high - 1``.

    It can be used with a ``multiplier`` argument to perform more complex
    generation: with ``multiplier="e-"`` a draw ``k`` is returned as
    ``10 ** -k``. Any other multiplier value returns the raw integer.
    """

    def __init__(self, low=0, high=0, multiplier=""):
        # Keep the bounds explicitly: the frozen distribution only exposes
        # its inclusive support, not the ``high`` it was built with.
        self.low = low
        self.high = high
        self.randint = randint(low, high)
        self.multiplier = multiplier

    def rvs(self, random_state=None):
        """Draw one value, applying the multiplier if one is configured."""
        randinteger = self.randint.rvs(random_state=random_state)
        if self.multiplier == "e-":
            # Float base: NumPy refuses negative integer powers of integers.
            return 10.0 ** -randinteger
        else:
            return randinteger

    def get_nb_possibilities(self):
        """Return how many distinct integers can be drawn (``high - low``)."""
        # The frozen support is [low, high - 1], so ``b - a`` undercounts
        # by one; use the constructor bounds instead.
        return self.high - self.low
def compute_possible_combinations(params_dict):
    """Count the distinct values each hyper-parameter can take.

    Parameters
    ----------
    params_dict : dict mapping parameter names to either a list of
        candidate values, a ``CustomRandint`` or any other distribution.

    Returns
    -------
    numpy.ndarray of floats, one entry per parameter in dict order: the list
    length, the ``CustomRandint`` possibility count, or ``inf`` for anything
    else.
    """
    n_possibs = np.full(len(params_dict), np.inf)
    for value_index, value in enumerate(params_dict.values()):
        # isinstance also accepts list subclasses, unlike ``type(...) ==``.
        if isinstance(value, list):
            n_possibs[value_index] = len(value)
        elif isinstance(value, CustomRandint):
            n_possibs[value_index] = value.get_nb_possibilities()
    return n_possibs
def get_test_folds_preds(X, y, cv, estimator, framework, available_indices=None):
    """Fit ``estimator`` on each training fold and predict the held-out fold.

    Parameters
    ----------
    X : samples; indexed directly in monoview mode, handed untouched to the
        estimator in multiview mode.
    y : labels. In multiview mode the whole label container is read into
        memory with ``y[()]``.
    cv : splitter exposing ``split(X, y)`` and ``n_splits``.
    estimator : object with ``fit`` and ``predict``.
    framework : ``"monoview"`` or ``"multiview"``.
    available_indices : indices usable for learning (multiview only); fold
        indices are positions inside this array.

    Returns
    -------
    numpy.ndarray with one row per fold, each row truncated to the length of
    the smallest test fold so that the rows stack.

    Raises
    ------
    ValueError : if ``framework`` is not one of the two supported values.
    """
    if framework == "monoview":
        folds = cv.split(np.arange(len(y)), y)
    elif framework == "multiview":
        # ``[()]`` reads a full h5py dataset and is a no-op on NumPy arrays;
        # the former ``.value`` attribute no longer exists in h5py 3.
        y = y[()]
        folds = cv.split(available_indices, y[available_indices])
    else:
        raise ValueError(
            "Unknown framework {!r}, expected 'monoview' or "
            "'multiview'".format(framework))
    test_folds_prediction = []
    fold_lengths = np.zeros(cv.n_splits, dtype=int)
    for fold_idx, (train_indices, test_indices) in enumerate(folds):
        fold_lengths[fold_idx] = len(test_indices)
        if framework == "monoview":
            estimator.fit(X[train_indices], y[train_indices])
            # Predict the held-out samples, not the ones just trained on.
            test_folds_prediction.append(estimator.predict(X[test_indices]))
        else:
            estimator.fit(X, y, available_indices[train_indices])
            test_folds_prediction.append(
                estimator.predict(X, available_indices[test_indices]))
    # Folds may differ in size by one sample; crop to get a rectangular array.
    minFoldLength = fold_lengths.min()
    test_folds_prediction = np.array(
        [test_fold_prediction[:minFoldLength] for test_fold_prediction in
         test_folds_prediction])
    return test_folds_prediction
def randomized_search(X, y, framework, random_state, output_file_name, classifier_module,
                      classifier_name, folds=4, nb_cores=1, metric=["accuracy_score", None], n_iter=30,
                      classifier_kwargs =None, learning_indices=None):
    """Randomized hyper-parameter search shared by mono- and multiview.

    Parameters
    ----------
    X, y : data and labels, forwarded to the search and to
        ``get_test_folds_preds``.
    framework : ``"monoview"`` or ``"multiview"``; selects the fitting
        strategy of ``MultiviewCompatibleRandomizedSearchCV``.
    random_state : passed positionally to the classifier constructor and as
        ``random_state`` to the search.
    output_file_name : forwarded to ``genHeatMaps``.
    classifier_module, classifier_name : the classifier class is looked up
        as ``getattr(classifier_module, classifier_name)``.
    folds : cross-validation splitter. It is also handed to
        ``get_test_folds_preds``, which calls ``folds.split`` and reads
        ``folds.n_splits``, so the integer default only suits the search
        step itself.
    nb_cores : number of jobs for the search.
    metric : ``[metric_module_name, config_or_None]``; the default list is
        only read, never mutated.
    n_iter : per-parameter cap on the number of sampled candidates.
    classifier_kwargs : keyword arguments for the classifier constructor;
        ``None`` means "no extra arguments".
    learning_indices : indices usable for learning (multiview).

    Returns
    -------
    (bestParams, testFoldsPreds) : the best parameters found (without
        ``random_state``; empty when the classifier exposes no distribution)
        and the per-fold predictions of the retained estimator.
    """
    # ``**None`` raises TypeError, so make the declared default usable.
    if classifier_kwargs is None:
        classifier_kwargs = {}
    estimator = getattr(classifier_module, classifier_name)(random_state,
                                                            **classifier_kwargs)
    params_dict = estimator.genDistribs()
    if params_dict:
        metricModule = getattr(metrics, metric[0])
        if metric[1] is not None:
            # NOTE(review): the keys built here are integers, and ``**``
            # unpacking only accepts string keys, so a non-empty metric
            # configuration raises TypeError below -- confirm the key format
            # expected by ``get_scorer`` before relying on this path.
            metricKWARGS = dict(enumerate(metric[1]))
        else:
            metricKWARGS = {}
        scorer = metricModule.get_scorer(**metricKWARGS)
        nb_possible_combinations = compute_possible_combinations(params_dict)
        # Never ask for more draws of a parameter than it has distinct values.
        min_list = np.array(
            [min(nb_possible_combination, n_iter) for nb_possible_combination in
             nb_possible_combinations])
        randomSearch = MultiviewCompatibleRandomizedSearchCV(estimator,
                                                             n_iter=int(np.sum(min_list)),
                                                             param_distributions=params_dict,
                                                             refit=True,
                                                             n_jobs=nb_cores, scoring=scorer,
                                                             cv=folds, random_state=random_state,
                                                             learning_indices=learning_indices,
                                                             framework=framework)
        detector = randomSearch.fit(X, y)
        # Compare by value: ``is not`` on a string literal tests identity and
        # only worked by accident of string interning.
        bestParams = {key: value for key, value in
                      estimator.genBestParams(detector).items()
                      if key != "random_state"}
        scoresArray = detector.cv_results_['mean_test_score']
        params = estimator.genParamsFromDetector(detector)
        genHeatMaps(params, scoresArray, output_file_name)
        best_estimator = detector.best_estimator_
    else:
        # Nothing to tune: keep the estimator built from the given kwargs.
        best_estimator = estimator
        bestParams = {}
    testFoldsPreds = get_test_folds_preds(X, y, folds, best_estimator,
                                          framework, learning_indices)
    return bestParams, testFoldsPreds
from sklearn.base import clone
class MultiviewCompatibleRandomizedSearchCV(RandomizedSearchCV):
    """``RandomizedSearchCV`` that can also tune multiview estimators.

    In ``"monoview"`` mode the scikit-learn implementation is used as is.
    In ``"multiview"`` mode the estimator receives the complete dataset and
    the fold membership is communicated through index arrays, so the
    cross-validation loop is run by hand in ``fit_multiview``.
    """

    def __init__(self, estimator, param_distributions, n_iter=10,
                 refit=True, n_jobs=1, scoring=None, cv=None,
                 random_state=None, learning_indices=None, framework="monoview"):
        super(MultiviewCompatibleRandomizedSearchCV, self).__init__(
            estimator,
            n_iter=n_iter,
            param_distributions=param_distributions,
            refit=refit,
            n_jobs=n_jobs, scoring=scoring,
            cv=cv, random_state=random_state)
        self.framework = framework
        # get_params()/clone() look every constructor argument up by its own
        # name, so keep ``learning_indices`` next to the historical alias.
        self.learning_indices = learning_indices
        self.available_indices = learning_indices

    def fit(self, X, y=None, groups=None, **fit_params):
        """Run the search with the strategy matching ``self.framework``.

        Raises ValueError for an unknown framework instead of silently
        returning ``None``.
        """
        if self.framework == "monoview":
            return super(MultiviewCompatibleRandomizedSearchCV, self).fit(
                X, y=y, groups=groups, **fit_params)
        if self.framework == "multiview":
            # ``[()]`` reads a full h5py dataset and is a no-op on NumPy
            # arrays; the former ``.value`` attribute is gone in h5py 3.
            return self.fit_multiview(X, y=y[()], groups=groups, **fit_params)
        raise ValueError(
            "Unknown framework {!r}, expected 'monoview' or "
            "'multiview'".format(self.framework))

    def fit_multiview(self, X, y=None, groups=None, **fit_params):
        """Cross-validate every sampled candidate on the available indices.

        Fills ``cv_results_`` (one ``param_<name>`` list per parameter plus
        ``mean_test_score``), ``best_params_``, ``best_score_``,
        ``best_index_``, ``n_splits_`` and, when ``refit`` is true,
        ``best_estimator_`` fitted on all available indices.
        """
        available_labels = y[self.available_indices]
        n_splits = self.cv.get_n_splits(self.available_indices,
                                        available_labels)
        # ``split`` returns a one-shot generator; materialise it so every
        # candidate is evaluated on the same folds rather than only the first.
        folds = list(self.cv.split(self.available_indices, available_labels))
        candidate_params = list(self._get_param_iterator())
        base_estimator = clone(self.estimator)
        # Scorers negate losses so that greater is always better; mirror that
        # so scores are comparable with the monoview (scikit-learn) path.
        sign = getattr(self.scoring, "_sign", 1)
        self.cv_results_ = dict(("param_" + param_name, [])
                                for param_name in candidate_params[0].keys())
        self.cv_results_["mean_test_score"] = []
        best_score = None
        for candidate_param_idx, candidate_param in enumerate(candidate_params):
            test_scores = np.zeros(n_splits)
            for fold_idx, (train_indices, test_indices) in enumerate(folds):
                current_estimator = clone(base_estimator)
                current_estimator.set_params(**candidate_param)
                current_estimator.fit(
                    X, y,
                    train_indices=self.available_indices[train_indices])
                test_prediction = current_estimator.predict(
                    X,
                    self.available_indices[test_indices])
                test_score = sign * self.scoring._score_func(
                    y[self.available_indices[test_indices]],
                    test_prediction)
                test_scores[fold_idx] = test_score
            for param_name, param in candidate_param.items():
                self.cv_results_["param_" + param_name].append(param)
            cross_validation_score = np.mean(test_scores)
            self.cv_results_["mean_test_score"].append(cross_validation_score)
            # Keep the highest score (first one wins on ties).
            if best_score is None or cross_validation_score > best_score:
                best_score = cross_validation_score
                self.best_index_ = candidate_param_idx
                self.best_params_ = candidate_param
                self.best_score_ = cross_validation_score
        if self.refit:
            self.best_estimator_ = clone(base_estimator).set_params(
                **self.best_params_)
            # "refit" means the best estimator is returned trained.
            self.best_estimator_.fit(X, y,
                                     train_indices=self.available_indices)
        self.n_splits_ = n_splits
        return self

    def get_test_folds_preds(self, X, y, estimator):
        """Fit ``estimator`` on each training fold, predict the test fold.

        Returns an array with one row per fold, each cropped to the size of
        the smallest test fold. Raises ValueError for an unknown framework.
        """
        if self.framework == "monoview":
            folds = self.cv.split(np.arange(len(y)), y)
        elif self.framework == "multiview":
            # Labels must be restricted to the same samples as the indices.
            folds = self.cv.split(self.available_indices,
                                  y[self.available_indices])
        else:
            raise ValueError(
                "Unknown framework {!r}, expected 'monoview' or "
                "'multiview'".format(self.framework))
        test_folds_prediction = []
        fold_lengths = np.zeros(self.cv.n_splits, dtype=int)
        for fold_idx, (train_indices, test_indices) in enumerate(folds):
            fold_lengths[fold_idx] = len(test_indices)
            if self.framework == "monoview":
                estimator.fit(X[train_indices], y[train_indices])
                # Predict the held-out samples, not the training ones.
                test_folds_prediction.append(
                    estimator.predict(X[test_indices]))
            else:
                estimator.fit(X, y, self.available_indices[train_indices])
                test_folds_prediction.append(
                    estimator.predict(X, self.available_indices[test_indices]))
        minFoldLength = fold_lengths.min()
        test_folds_prediction = np.array(
            [test_fold_prediction[:minFoldLength]
             for test_fold_prediction in test_folds_prediction])
        return test_folds_prediction
def randomizedSearch(dataset, labels, classifierPackage, classifierName,
metrics, learningIndices, KFolds, randomState,
......
......@@ -3,8 +3,10 @@ import unittest
import h5py
import numpy as np
from sklearn.model_selection import StratifiedKFold
from ...mono_multi_view_classifiers.monoview import exec_classif_mono_view
from ...mono_multi_view_classifiers.monoview_classifiers import decision_tree
class Test_initConstants(unittest.TestCase):
......@@ -88,6 +90,48 @@ class Test_initTrainTest(unittest.TestCase):
np.testing.assert_array_equal(y_train, np.array([0, 0, 1, 0, 0]))
np.testing.assert_array_equal(y_test, np.array([1, 1, 0, 0, 0]))
class Test_getHPs(unittest.TestCase):
    """Smoke test: ``getHPs`` runs a randomized search on a decision tree."""

    @classmethod
    def setUpClass(cls):
        os.mkdir("multiview_platform/tests/tmp_tests")
        cls.classifierModule = decision_tree
        cls.hyperParamSearch = "randomized_search"
        cls.n_iter = 2
        cls.classifier_name = "DecisionTree"
        cls.random_state = np.random.RandomState(42)
        cls.X = cls.random_state.randint(0, 10, size=(10, 5))
        cls.y = cls.random_state.randint(0, 2, size=10)
        cls.output_file_name = "multiview_platform/tests/tmp_tests/"
        # No random_state here: without shuffle=True it never had any effect
        # and recent scikit-learn releases reject the combination.
        cls.cv = StratifiedKFold(n_splits=2)
        cls.nb_cores = 1
        cls.metrics = [["accuracy_score", None]]
        cls.kwargs = {"DecisionTreeKWARGS": {"max_depth": 1,
                                             "criterion": "gini",
                                             "splitter": "best"}}

    @classmethod
    def tearDownClass(cls):
        # Remove every file written during the search, then the directory.
        for file_name in os.listdir("multiview_platform/tests/tmp_tests"):
            os.remove(
                os.path.join("multiview_platform/tests/tmp_tests", file_name))
        os.rmdir("multiview_platform/tests/tmp_tests")

    def test_simple(self):
        # Only checks that the call completes without raising.
        kwargs, test_folds_predictions = exec_classif_mono_view.getHPs(
            self.classifierModule,
            self.hyperParamSearch,
            self.n_iter,
            self.classifier_name,
            self.X,
            self.y,
            self.random_state,
            self.output_file_name,
            self.cv,
            self.nb_cores,
            self.metrics,
            self.kwargs)
# class Test_getKWARGS(unittest.TestCase):
#
# @classmethod
......
......@@ -30,3 +30,4 @@ class Test_genTestFoldsPreds(unittest.TestCase):
cls.assertEqual(testFoldsPreds.shape, (3, 10))
np.testing.assert_array_equal(testFoldsPreds[0], np.array(
[1, 1, -1, -1, 1, 1, -1, 1, -1, 1]))
import os
import unittest
import h5py
import numpy as np
from sklearn.model_selection import StratifiedKFold
from ...mono_multi_view_classifiers.utils import hyper_parameter_search
from ...mono_multi_view_classifiers.multiview_classifiers import weighted_linear_early_fusion
class Test_randomized_search(unittest.TestCase):
    """Smoke test: multiview randomized search on a two-view HDF5 dataset."""

    @classmethod
    def setUpClass(cls):
        cls.random_state = np.random.RandomState(42)
        cls.view_weights = [0.5, 0.5]
        os.mkdir("multiview_platform/tests/tmp_tests")
        cls.dataset_file = h5py.File(
            "multiview_platform/tests/tmp_tests/test_file.hdf5", "w")
        cls.labels = cls.dataset_file.create_dataset("Labels",
                                                     data=np.array(
                                                         [0, 1, 0, 0, 1, 0, 1, 0, 0, 1, ]))
        cls.view0_data = cls.random_state.randint(1, 10, size=(10, 4))
        view0 = cls.dataset_file.create_dataset("View0",
                                                data=cls.view0_data)
        view0.attrs["sparse"] = False
        cls.view1_data = cls.random_state.randint(1, 10, size=(10, 4))
        view1 = cls.dataset_file.create_dataset("View1",
                                                data=cls.view1_data)
        view1.attrs["sparse"] = False
        metaDataGrp = cls.dataset_file.create_group("Metadata")
        metaDataGrp.attrs["nbView"] = 2
        metaDataGrp.attrs["nbClass"] = 2
        metaDataGrp.attrs["datasetLength"] = 10
        cls.monoview_classifier_name = "decision_tree"
        cls.monoview_classifier_config = {"max_depth": 1,
                                          "criterion": "gini",
                                          "splitter": "best"}
        # No random_state here: without shuffle=True it never had any effect
        # and recent scikit-learn releases reject the combination.
        cls.k_folds = StratifiedKFold(n_splits=3)
        cls.learning_indices = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])

    @classmethod
    def tearDownClass(cls):
        # Close the HDF5 handle before deleting the file that backs it.
        cls.dataset_file.close()
        for file_name in os.listdir("multiview_platform/tests/tmp_tests"):
            os.remove(
                os.path.join("multiview_platform/tests/tmp_tests", file_name))
        os.rmdir("multiview_platform/tests/tmp_tests")

    def test_simple(self):
        # Only checks that the call completes without raising.
        best_params, test_folds_preds = hyper_parameter_search.randomized_search(
            self.dataset_file, self.labels, "multiview", self.random_state, "multiview_platform/tests/tmp_tests/",
            weighted_linear_early_fusion, "WeightedLinearEarlyFusion", self.k_folds,
            1, ["accuracy_score", None], 2, {}, learning_indices=self.learning_indices)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment