diff --git a/.env.example b/.env.example
deleted file mode 100644
index 9ca543b382b4889be1d93ed2065ef234b789153c..0000000000000000000000000000000000000000
--- a/.env.example
+++ /dev/null
@@ -1,12 +0,0 @@
-# Environment variables go here, can be read by `python-dotenv` package:
-#
-# `src/script.py`
-# ----------------------------------------------------------------
-# import dotenv
-#
-# project_dir = os.path.join(os.path.dirname(__file__), os.pardir)
-# dotenv_path = os.path.join(project_dir, '.env')
-# dotenv.load_dotenv(dotenv_path)
-# ----------------------------------------------------------------
-
-project_dir = "."
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 5d8638d3bfb7c3a28756fe87a9cc84aed2cdb5e7..ed07278aa03dbf293f143b22d927fa9f08876edb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -371,3 +371,6 @@
 TSWLatexianTemp*
 *.lpz
 reports/*.pdf
+
+# Images
+*.png
diff --git a/TODO.md b/TODO.md
index bfb32e8a131b5147b36c9ccba729a6e13e04e5b7..5ea6cc5cf2c933eed2e7ffbf2567d4fe812412cf 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,8 +1,7 @@
-* Find relevant datasets
-* Train and test forests of different sizes
-* Train and test on regression and classification
-* Train and test on different modalities (not only image datasets)
-* Train with different hyperparameters (d, depth, epsilon)
-* Apply OMP with different values of k (especially a small k)
-* Produce figures
-* Implement and compare competing methods
\ No newline at end of file
+* Fix pickle loading of ModelRawResults, because saving the model_object leads to import issues.
+* Fix the ModelFactory.load function.
+* Fix model results loading in compute_results.py.
+* Check that the OMP multiclass classifier is working as expected.
+* In the bayesian search computation, output a different file name depending on the task of the trained model.
+* Check the best params scores of the regressors (neg_mean_squared_error leads to huge negative values).
+* Prepare the json experiment files to run.
\ No newline at end of file
diff --git a/code/bolsonaro/data/__init__.py b/code/bolsonaro/data/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..ce8e424d0cc66e3f349fcafd744216d818bacaa5 100644
--- a/code/bolsonaro/data/__init__.py
+++ b/code/bolsonaro/data/__init__.py
@@ -0,0 +1,3 @@
+import os
+
+LOG_PATH = os.path.abspath(os.path.dirname(__file__) + os.sep + '..' + os.sep + '..' + os.sep + 'log')
diff --git a/code/bolsonaro/data/dataset.py b/code/bolsonaro/data/dataset.py
index b2f67c489be94791397649ba292a49f65dd604d8..7108eb5781e1ef2b69926e9fd2239deaa81f44e2 100644
--- a/code/bolsonaro/data/dataset.py
+++ b/code/bolsonaro/data/dataset.py
@@ -1,9 +1,8 @@
 class Dataset(object):
 
-    def __init__(self, task, dataset_parameters, X_train, X_dev, X_test, y_train,
+    def __init__(self, task, X_train, X_dev, X_test, y_train,
                  y_dev, y_test):
         self._task = task
-        self._dataset_parameters = dataset_parameters
         self._X_train = X_train
         self._X_dev = X_dev
         self._X_test = X_test
diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py
index d706b7a07751f715b6398b1f451ec9f337d00f60..b22ecaa83f6e69610c82796e068e91db23e83646 100644
--- a/code/bolsonaro/data/dataset_loader.py
+++ b/code/bolsonaro/data/dataset_loader.py
@@ -1,7 +1,9 @@
 from bolsonaro.data.dataset import Dataset
 from bolsonaro.data.task import Task
+from bolsonaro.utils import change_binary_func_load
 
-from sklearn.datasets import load_boston, load_iris, load_diabetes, load_digits, load_linnerud, load_wine, load_breast_cancer
+from sklearn.datasets import load_boston, load_iris, load_diabetes, \
+    load_digits, load_linnerud, load_wine, load_breast_cancer
 from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
     fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \
     fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
@@ -19,45 +21,46 @@ class DatasetLoader(object):
             task = Task.REGRESSION
         elif name == 'iris':
             dataset_loading_func = load_iris
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == 'diabetes':
             dataset_loading_func = load_diabetes
             task = Task.REGRESSION
         elif name == 'digits':
             dataset_loading_func = load_digits
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == 'linnerud':
             dataset_loading_func = load_linnerud
             task = Task.REGRESSION
         elif name == 'wine':
             dataset_loading_func = load_wine
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == 'breast_cancer':
-            dataset_loading_func = load_breast_cancer
-            task = Task.CLASSIFICATION
-        elif name == 'olivetti_faces':
+            dataset_loading_func = change_binary_func_load(load_breast_cancer)
+            task = Task.BINARYCLASSIFICATION
+        elif name == 'olivetti_faces':  # bug (no return_X_y parameter)
             dataset_loading_func = fetch_olivetti_faces
-            task = Task.CLASSIFICATION
-        elif name == '20newsgroups':
+            task = Task.MULTICLASSIFICATION
+        elif name == '20newsgroups':  # bug (no return_X_y parameter)
             dataset_loading_func = fetch_20newsgroups
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == '20newsgroups_vectorized':
             dataset_loading_func = fetch_20newsgroups_vectorized
-            task = Task.CLASSIFICATION
-        elif name == 'lfw_people':
+            task = Task.MULTICLASSIFICATION
+        elif name == 'lfw_people':  # needs PIL (image dataset)
             dataset_loading_func = fetch_lfw_people
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == 'lfw_pairs':
             dataset_loading_func = fetch_lfw_pairs
+            task = Task.MULTICLASSIFICATION
         elif name == 'covtype':
             dataset_loading_func = fetch_covtype
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == 'rcv1':
             dataset_loading_func = fetch_rcv1
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == 'kddcup99':
             dataset_loading_func = fetch_kddcup99
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == 'california_housing':
             dataset_loading_func = fetch_california_housing
             task = Task.REGRESSION
@@ -87,5 +90,5 @@ class DatasetLoader(object):
             X_dev = scaler.transform(X_dev)
             X_test = scaler.transform(X_test)
 
-        return Dataset(task, dataset_parameters, X_train,
+        return Dataset(task, X_train,
                        X_dev, X_test, y_train, y_dev, y_test)
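Note: a minimal usage sketch of the new loading path, assuming the DatasetParameters signature used in compute_hyperparameters.py further below (the split sizes and seed are illustrative):

    from bolsonaro.data.dataset_loader import DatasetLoader
    from bolsonaro.data.dataset_parameters import DatasetParameters
    from bolsonaro.data.task import Task

    # Illustrative split values and seed.
    dataset_parameters = DatasetParameters('breast_cancer', test_size=0.2, dev_size=0.2,
                                           random_state=42, dataset_normalizer=None)
    dataset = DatasetLoader.load(dataset_parameters)
    # breast_cancer now goes through change_binary_func_load, so labels are in {-1, +1}.
    assert dataset.task == Task.BINARYCLASSIFICATION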
diff --git a/code/bolsonaro/data/dataset_parameters.py b/code/bolsonaro/data/dataset_parameters.py
index 9854a75eb27ec83990b8ae032c58e7dec52a5e8c..88054257d4241ae0426c16ebaca1cb4985c3b65f 100644
--- a/code/bolsonaro/data/dataset_parameters.py
+++ b/code/bolsonaro/data/dataset_parameters.py
@@ -15,11 +15,11 @@ class DatasetParameters(object):
     @property
     def name(self):
         return self._name
-    
+
     @property
     def test_size(self):
         return self._test_size
-    
+
     @property
     def dev_size(self):
         return self._dev_size
diff --git a/code/bolsonaro/data/task.py b/code/bolsonaro/data/task.py
index 2f47fa22f472f769c075f40e1c25a7bf3de45f0d..f1214a64a27873e49f5dbbcb853e4f65f9b07f68 100644
--- a/code/bolsonaro/data/task.py
+++ b/code/bolsonaro/data/task.py
@@ -2,5 +2,6 @@ from enum import Enum
 
 
 class Task(Enum):
-    CLASSIFICATION = 1
+    BINARYCLASSIFICATION = 1
     REGRESSION = 2
+    MULTICLASSIFICATION = 3
diff --git a/code/bolsonaro/hyperparameter_searcher.py b/code/bolsonaro/hyperparameter_searcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..7884d2d4271203e9ebee1e804baa7c1e94a76770
--- /dev/null
+++ b/code/bolsonaro/hyperparameter_searcher.py
@@ -0,0 +1,47 @@
+'''
+This module is used to find the best hyperparameters for a given dataset.
+'''
+
+from bolsonaro.data.dataset_parameters import DatasetParameters
+from bolsonaro.data.dataset_loader import DatasetLoader
+from bolsonaro.data.task import Task
+from bolsonaro.error_handling.logger_factory import LoggerFactory
+from . import LOG_PATH
+
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
+from skopt import BayesSearchCV
+
+
+class HyperparameterSearcher(object):
+
+    def __init__(self):
+        self._logger = LoggerFactory.create(LOG_PATH, __name__)
+
+    def search(self, dataset, hyperparameter_space, n_iter, cv,
+               random_seed, scorer, verbose=False):
+        '''
+        For a given dataset and space of hyperparameters, performs a
+        bayesian hyperparameter search.
+        :input dataset: a Dataset object
+        :input hyperparameter_space: a dictionary; keys are hyperparameters,
+        values their spaces defined with skopt
+        :input n_iter: the number of iterations of the bayesian search
+        :input cv: the number of cross-validation folds
+        :input random_seed: int, the seed for the bayesian search
+        :input scorer: str, the name of the scorer
+        :input verbose: bool, print the state of the search
+        :return: a skopt.searchcv.BayesSearchCV object
+        '''
+
+        if dataset.task == Task.REGRESSION:
+            estimator = RandomForestRegressor(n_jobs=-1, random_state=random_seed)
+        else:
+            estimator = RandomForestClassifier(n_jobs=-1, random_state=random_seed)
+
+        opt = BayesSearchCV(estimator, hyperparameter_space, n_iter=n_iter,
+                            cv=cv, n_jobs=-1, random_state=random_seed,
+                            scoring=scorer, verbose=verbose)
+
+        opt.fit(dataset.X_train, dataset.y_train)
+
+        return opt
diff --git a/code/bolsonaro/models/__init__.py b/code/bolsonaro/models/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..ce8e424d0cc66e3f349fcafd744216d818bacaa5 100644
--- a/code/bolsonaro/models/__init__.py
+++ b/code/bolsonaro/models/__init__.py
@@ -0,0 +1,3 @@
+import os
+
+LOG_PATH = os.path.abspath(os.path.dirname(__file__) + os.sep + '..' + os.sep + '..' + os.sep + 'log')
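Note: a usage sketch of the searcher, mirroring the search space defined in compute_hyperparameters.py further below (`dataset` is a Dataset object as returned by DatasetLoader.load; the n_iter, cv and seed values are illustrative):

    from bolsonaro.hyperparameter_searcher import HyperparameterSearcher
    from skopt.space import Categorical, Integer

    # Subset of DICT_PARAM_SPACE from compute_hyperparameters.py.
    space = {'n_estimators': Integer(10, 1000),
             'max_depth': Integer(1, 20),
             'max_features': Categorical(['auto', 'sqrt', 'log2'], [0.5, 0.25, 0.25])}

    searcher = HyperparameterSearcher()
    opt = searcher.search(dataset, space, n_iter=50, cv=3,
                          random_seed=42, scorer='neg_mean_squared_error')
    print(opt.best_params_, opt.best_score_)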
diff --git a/code/bolsonaro/models/model_factory.py b/code/bolsonaro/models/model_factory.py
index fb6b32cb26727d2221367f208598f04e1a19dfb1..a93e6090e253dc9bdb3aacfc53e1c99a1f9ef120 100644
--- a/code/bolsonaro/models/model_factory.py
+++ b/code/bolsonaro/models/model_factory.py
@@ -1,4 +1,4 @@
-from bolsonaro.models.omp_forest_classifier import OmpForestClassifier
+from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.data.task import Task
 from bolsonaro.models.model_parameters import ModelParameters
@@ -11,18 +11,22 @@ class ModelFactory(object):
 
     @staticmethod
     def build(task, model_parameters):
-        if task == Task.CLASSIFICATION:
-            model_func = OmpForestClassifier
+        if task == Task.BINARYCLASSIFICATION:
+            model_func = OmpForestBinaryClassifier
         elif task == Task.REGRESSION:
             model_func = OmpForestRegressor
+        elif task == Task.MULTICLASSIFICATION:
+            model_func = OmpForestMulticlassClassifier
         else:
             raise ValueError("Unsupported task '{}'".format(task))
         return model_func(model_parameters)
 
     @staticmethod
     def load(task, directory_path, experiment_id, model_raw_results):
+        raise NotImplementedError
         model_parameters = ModelParameters.load(directory_path, experiment_id)
         model = ModelFactory.build(task, model_parameters)
-        model.set_forest(model_raw_results.forest)
-        model.set_weights(model_raw_results.weights)
+        # TODO: do what is needed here to restore the model state properly
+        model.set_forest(model_raw_results.model_object.forest)
+        model.set_weights(model_raw_results.model_object.weights)
         return model
diff --git a/code/bolsonaro/models/model_parameters.py b/code/bolsonaro/models/model_parameters.py
index 450d97bc01fe8f02ccd399d2c47a0c7397b4cb13..31a451b70578835fe4663508de9e15f99bf6cc19 100644
--- a/code/bolsonaro/models/model_parameters.py
+++ b/code/bolsonaro/models/model_parameters.py
@@ -5,16 +5,13 @@ import os
 
 class ModelParameters(object):
 
-    def __init__(self, forest_size, extracted_forest_size, normalize_D, use_dev_subset, seed=None):
-        self._forest_size = forest_size
+    def __init__(self, extracted_forest_size, normalize_D, subsets_used, normalize_weights, seed, hyperparameters):
         self._extracted_forest_size = extracted_forest_size
         self._normalize_D = normalize_D
-        self._use_dev_subset = use_dev_subset
+        self._subsets_used = subsets_used
+        self._normalize_weights = normalize_weights
         self._seed = seed
-
-    @property
-    def forest_size(self):
-        return self._forest_size
+        self._hyperparameters = hyperparameters
 
     @property
     def extracted_forest_size(self):
@@ -25,13 +22,21 @@ class ModelParameters(object):
         return self._normalize_D
 
     @property
-    def use_dev_subset(self):
-        return self._use_dev_subset
+    def subsets_used(self):
+        return self._subsets_used
+
+    @property
+    def normalize_weights(self):
+        return self._normalize_weights
 
     @property
     def seed(self):
         return self._seed
 
+    @property
+    def hyperparameters(self):
+        return self._hyperparameters
+
     def save(self, directory_path, experiment_id):
         save_obj_to_json(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id),
                          self.__dict__)
diff --git a/code/bolsonaro/models/model_raw_results.py b/code/bolsonaro/models/model_raw_results.py
index 673cb0fc65b7378e95c03b186d246cb70b384a07..df8b2ec0b10704a8a8c397b9012298e8b901e14b 100644
--- a/code/bolsonaro/models/model_raw_results.py
+++ b/code/bolsonaro/models/model_raw_results.py
@@ -6,13 +6,12 @@ import datetime
 
 class ModelRawResults(object):
 
-    def __init__(self, forest, weights, training_time,
+    def __init__(self, model_object, training_time,
                  datetime, train_score, dev_score, test_score,
                  score_metric, train_score_regressor, dev_score_regressor,
                  test_score_regressor):
-        self._forest = forest
-        self._weights = weights
+        self._model_object = model_object
        self._training_time = training_time
        self._datetime = datetime
        self._train_score = train_score
@@ -24,12 +23,8 @@ class ModelRawResults(object):
         self._test_score_regressor = test_score_regressor
 
     @property
-    def forest(self):
-        return self._forest
-
-    @property
-    def weights(self):
-        return self._weights
+    def model_object(self):
+        # returning self.model_object here would recurse infinitely
+        return self._model_object
 
     @property
     def training_time(self):
diff --git a/code/bolsonaro/models/omp_forest.py b/code/bolsonaro/models/omp_forest.py
new file mode 100644
index 0000000000000000000000000000000000000000..2da0beab64ef5361efbde6d6197f957fe627886c
--- /dev/null
+++ b/code/bolsonaro/models/omp_forest.py
@@ -0,0 +1,121 @@
+from bolsonaro import LOG_PATH
+from bolsonaro.error_handling.logger_factory import LoggerFactory
+
+from abc import abstractmethod, ABCMeta
+import numpy as np
+from sklearn.linear_model import OrthogonalMatchingPursuit
+from sklearn.base import BaseEstimator
+
+
+class OmpForest(BaseEstimator, metaclass=ABCMeta):
+
+    def __init__(self, models_parameters, base_forest_estimator):
+        self._base_forest_estimator = base_forest_estimator
+        self._models_parameters = models_parameters
+        self._logger = LoggerFactory.create(LOG_PATH, __name__)
+
+    @property
+    def models_parameters(self):
+        return self._models_parameters
+
+    def score_base_estimator(self, X, y):
+        return self._base_forest_estimator.score(X, y)
+
+    def _base_estimator_predictions(self, X):
+        return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T
+
+    @property
+    def forest(self):
+        return self._base_forest_estimator.estimators_
+
+    # sklearn BaseEstimator API methods
+    def fit(self, X_forest, y_forest, X_omp, y_omp):
+        self._base_forest_estimator.fit(X_forest, y_forest)
+        self._extract_subforest(X_omp, y_omp)  # type: OrthogonalMatchingPursuit
+        return self
+
+    def _extract_subforest(self, X, y):
+        """
+        Given an already estimated regressor: apply OMP to get the weight of each tree.
+
+        The X data is used for interrogation of every tree in the forest. The y data
+        is used for finding the weights in OMP.
+
+        :param X: (n_sample, n_features) array
+        :param y: (n_sample,) array
+        :return:
+        """
+        self._logger.debug("Forest make prediction on X")
+        D = self._base_estimator_predictions(X)
+
+        if self._models_parameters.normalize_D:
+            # question: maybe consider other kinds of normalization.. centering?
+            self._logger.debug("Compute norm of predicted vectors on X")
+            self._forest_norms = np.linalg.norm(D, axis=0)
+            D /= self._forest_norms
+
+        self._logger.debug("Apply orthogonal matching pursuit on forest for {} extracted trees."
+                           .format(self._models_parameters.extracted_forest_size))
+
+        self.fit_omp(D, y)
+
+    @staticmethod
+    def _make_omp_weighted_prediction(base_predictions, omp_obj, normalize_weights=False):
+        if normalize_weights:
+            # we can normalize the weights (by their sum) so that they sum to 1
+            # and can be interpreted as impact percentages for interpretability.
+            # this requires removing the (-) sign from the weights, e.g. moving
+            # it to the predictions (use unsigned_coef)
+
+            # question: I don't understand the trick with nonzero?
+            # predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_))))
+            coef_signs = np.sign(omp_obj.coef_)[np.newaxis, :]  # add axis to make sure it will be broadcasted line-wise (there might be a confusion when forest_prediction is square)
+            unsigned_coef = (coef_signs * omp_obj.coef_).squeeze()
+            intercept = omp_obj.intercept_
+
+            adjusted_forest_predictions = base_predictions * coef_signs
+            predictions = adjusted_forest_predictions.dot(unsigned_coef) + intercept
+
+        else:
+            predictions = omp_obj.predict(base_predictions)
+
+        return predictions
+
+    @abstractmethod
+    def fit_omp(self, atoms, objective):
+        pass
+
+    @abstractmethod
+    def predict(self, X):
+        pass
+
+    @abstractmethod
+    def score(self, X, y):
+        pass
+
+
+class SingleOmpForest(OmpForest):
+
+    def __init__(self, models_parameters, base_forest_estimator):
+        # fit_intercept shouldn't be set to False as the data isn't necessarily centered here
+        # normalization is handled outside OMP
+        self._omp = OrthogonalMatchingPursuit(
+            n_nonzero_coefs=models_parameters.extracted_forest_size,
+            fit_intercept=True, normalize=False)
+
+        super().__init__(models_parameters, base_forest_estimator)
+
+    def fit_omp(self, atoms, objective):
+        self._omp.fit(atoms, objective)
+
+    def predict(self, X):
+        """
+        Apply the SingleOmpForest to X.
+
+        Make all the base tree predictions then apply the OMP weights for pruning.
+
+        :param X:
+        :return:
+        """
+        forest_predictions = self._base_estimator_predictions(X)
+
+        if self._models_parameters.normalize_D:
+            forest_predictions /= self._forest_norms
+
+        return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights)
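Note: the core mechanism — regressing the target on the matrix of per-tree predictions with OMP, so that only `extracted_forest_size` trees receive a non-zero weight — can be checked standalone with plain scikit-learn (a minimal sketch, independent of the classes above; all sizes illustrative):

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import OrthogonalMatchingPursuit

    X, y = make_regression(n_samples=200, n_features=10, random_state=0)
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

    # D has one column of predictions per tree: these are the OMP atoms.
    D = np.array([tree.predict(X) for tree in forest.estimators_]).T

    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=10, fit_intercept=True, normalize=False)
    omp.fit(D, y)

    # Only 10 trees end up with a non-zero weight: the extracted subforest.
    print(np.flatnonzero(omp.coef_))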
diff --git a/code/bolsonaro/models/omp_forest_classifier.py b/code/bolsonaro/models/omp_forest_classifier.py
index 12cc23fab69fc0b79ff40b1d6957db5532a8c452..128347aa61caf79dc908397ef0588d646d8b0dee 100644
--- a/code/bolsonaro/models/omp_forest_classifier.py
+++ b/code/bolsonaro/models/omp_forest_classifier.py
@@ -1,11 +1,106 @@
-from sklearn.base import BaseEstimator
+from bolsonaro.models.omp_forest import OmpForest, SingleOmpForest
+from bolsonaro.utils import binarize_class_data
+
+import numpy as np
 from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import OrthogonalMatchingPursuit
+
+
+class OmpForestBinaryClassifier(SingleOmpForest):
+
+    DEFAULT_SCORE_METRIC = 'indicator'
+
+    def __init__(self, models_parameters):
+        estimator = RandomForestClassifier(**models_parameters.hyperparameters,
+                                           random_state=models_parameters.seed, n_jobs=-1)
+        super().__init__(models_parameters, estimator)
+
+    def _check_classes(self, y):
+        assert len(set(y).difference({-1, 1})) == 0, "Classes for binary classifier must be {-1, +1}"
+
+    def fit(self, X_forest, y_forest, X_omp, y_omp):
+        self._check_classes(y_forest)
+        self._check_classes(y_omp)
+
+        return super().fit(X_forest, y_forest, X_omp, y_omp)
+
+    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
+        """
+        Evaluate the OmpForestBinaryClassifier on (`X`, `y`) using `metric`.
+
+        :param X:
+        :param y:
+        :param metric: might be "indicator"
+        :return:
+        """
+        predictions = self.predict(X)
+
+        if metric == 'indicator':
+            evaluation = np.abs(np.mean(np.abs(np.sign(predictions) - y) - 1))
+        else:
+            raise ValueError("Unsupported metric '{}'.".format(metric))
+
+        return evaluation
+
+
+class OmpForestMulticlassClassifier(OmpForest):
+
+    DEFAULT_SCORE_METRIC = 'indicator'
+
+    def __init__(self, models_parameters):
+        estimator = RandomForestClassifier(**models_parameters.hyperparameters,
+                                           random_state=models_parameters.seed, n_jobs=-1)
+        super().__init__(models_parameters, estimator)
+        # question: maybe initialize the OMPs in __init__? like for SingleOmpForest
+        self._dct_class_omp = {}
+
+    def fit_omp(self, atoms, objective):
+        assert len(self._dct_class_omp) == 0, "fit_omp can be called only once on {}".format(self.__class__.__name__)
+        possible_classes = sorted(set(objective))
+        for class_label in possible_classes:
+            atoms_binary = binarize_class_data(atoms, class_label, inplace=False)
+            objective_binary = binarize_class_data(objective, class_label, inplace=False)
+            # TODO: maybe treat the forest size as global, so that only a fraction of the trees is available to each OMP...
+            omp_class = OrthogonalMatchingPursuit(
+                n_nonzero_coefs=self.models_parameters.extracted_forest_size,
+                fit_intercept=True, normalize=False)
+            omp_class.fit(atoms_binary, objective_binary)
+            self._dct_class_omp[class_label] = omp_class
+        return self._dct_class_omp
+
+    def predict(self, X):
+        forest_predictions = self._base_estimator_predictions(X)
+
+        if self._models_parameters.normalize_D:
+            forest_predictions /= self._forest_norms
+
+        label_names = []
+        preds = []
+        for class_label, omp_class in self._dct_class_omp.items():
+            label_names.append(class_label)
+            atoms_binary = binarize_class_data(forest_predictions, class_label, inplace=False)
+            preds.append(self._make_omp_weighted_prediction(atoms_binary, omp_class, self._models_parameters.normalize_weights))
+
+        # TODO: check that this isn't buggy
+        preds = np.array(preds).T
+        max_preds = np.argmax(preds, axis=1)
+        return np.array(label_names)[max_preds]
+
+    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
+        predictions = self.predict(X)
 
+        if metric == 'indicator':
+            evaluation = np.sum(np.ones_like(predictions)[predictions == y]) / X.shape[0]
+        else:
+            raise ValueError("Unsupported metric '{}'.".format(metric))
 
-class OmpForestClassifier(BaseEstimator):
+        return evaluation
 
-    def __init__(self):
-        raise ValueError('Classification tasks are not supported for now')
 
-    def fit(self, X, y):
-        pass
+if __name__ == "__main__":
+    forest = RandomForestClassifier(n_estimators=10)
+    X = np.random.rand(10, 5)
+    y = np.random.choice([-1, +1], 10)
+    forest.fit(X, y)
+    print(forest.predict(np.random.rand(10, 5)))
\ No newline at end of file
diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py
index 50754246abc13e0282e4cbd6aa1e917a0e3544ed..a0c8b4708d52336bf39544ffd0b66c527466620a 100644
--- a/code/bolsonaro/models/omp_forest_regressor.py
+++ b/code/bolsonaro/models/omp_forest_regressor.py
@@ -1,64 +1,18 @@
-from bolsonaro import LOG_PATH
-from bolsonaro.error_handling.logger_factory import LoggerFactory
+from bolsonaro.models.omp_forest import SingleOmpForest
 
 from sklearn.ensemble import RandomForestRegressor
-from sklearn.linear_model import OrthogonalMatchingPursuit
-from sklearn.base import BaseEstimator
 
 import numpy as np
 
 
-class OmpForestRegressor(BaseEstimator):
+class OmpForestRegressor(SingleOmpForest):
 
     DEFAULT_SCORE_METRIC = 'mse'
 
     def __init__(self, models_parameters):
-        self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size,
-                                                random_state=models_parameters.seed)
-        self._models_parameters = models_parameters
-        self._logger = LoggerFactory.create(LOG_PATH, __name__)
+        estimator = RandomForestRegressor(**models_parameters.hyperparameters,
+                                          random_state=models_parameters.seed, n_jobs=-1)
 
-    @property
-    def forest(self):
-        return self._forest
-
-    def set_forest(self, forest):
-        self._forest = forest
-        self._regressor.estimators_ = forest
-
-    @property
-    def weights(self):
-        return self._weights
-
-    def set_weights(self, weights):
-        self._weights = weights
-
-    @property
-    def models_parameters(self):
-        return self._models_parameters
-
-    def fit(self, X_forest, y_forest, X_omp, y_omp):
-        self._forest = self._train_forest(X_forest, y_forest)
-        self._weights = self._extract_subforest(X_omp, y_omp)
-        return self
-
-    def score_regressor(self, X, y):
-        return self._regressor.score(X, y)
-
-    def predict(self, X):
-        """
-        Apply the OMPForestRegressor to X.
-
-        :param X:
-        :return:
-        """
-        D = self._forest_prediction(X)
-
-        if self._models_parameters.normalize_D:
-            D /= self._forest_norms
-
-        predictions = D @ self.weights
-
-        return predictions
+        super().__init__(models_parameters, estimator)
 
     def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
         """
@@ -77,41 +31,3 @@ class OmpForestRegressor(BaseEstimator):
             raise ValueError("Unsupported metric '{}'.".format(metric))
 
         return evaluation
-
-    def _train_forest(self, X, y):
-        self._regressor.fit(X, y)
-        forest = self._regressor.estimators_
-        return forest
-
-    def _extract_subforest(self, X, y):
-        """
-        Given an already estimated regressor: apply OMP to get the weight of each tree.
-
-        The X data is used for interrogation of every tree in the forest. The y data
-        is used for finding the weights in OMP.
-
-        :param X: (n_sample, n_features) array
-        :param y: (n_sample,) array
-        :return:
-        """
-        self._logger.debug("Forest make prediction on X")
-        D = self._forest_prediction(X)
-
-        if self._models_parameters.normalize_D:
-            # question: maybe consider other kinds of normalization
-            self._logger.debug("Compute norm of predicted vectors on X")
-            self._forest_norms = np.linalg.norm(D, axis=0)
-            D /= self._forest_norms
-
-        omp = OrthogonalMatchingPursuit(
-            n_nonzero_coefs=self._models_parameters.extracted_forest_size,
-            fit_intercept=False, normalize=False)
-        self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees."
-                           .format(self._models_parameters.extracted_forest_size))
-        omp.fit(D, y)
-        weights = omp.coef_
-        # question: why not to use directly the omp estimator instead of bypassing it using the coefs?
-        return weights
-
-    def _forest_prediction(self, X):
-        return np.array([tree.predict(X) for tree in self._forest]).T
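Note: end to end, a model is now built from a ModelParameters object and fitted through the shared SingleOmpForest path; a minimal sketch on random data (all parameter values illustrative):

    import numpy as np
    from bolsonaro.data.task import Task
    from bolsonaro.models.model_factory import ModelFactory
    from bolsonaro.models.model_parameters import ModelParameters

    params = ModelParameters(
        extracted_forest_size=5, normalize_D=False, subsets_used='train,dev',
        normalize_weights=False, seed=42, hyperparameters={'n_estimators': 100})
    model = ModelFactory.build(Task.REGRESSION, params)

    X, y = np.random.rand(100, 4), np.random.rand(100)
    # The forest and the OMP can be fitted on different subsets.
    model.fit(X_forest=X[:80], y_forest=y[:80], X_omp=X[80:], y_omp=y[80:])
    print(model.score(X, y))  # 'mse' by default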
+ """ def __init__(self, dataset): + """ + + :param dataset: Object with X_train, y_train, X_dev, y_dev, X_test and Y_test attributes + """ self._dataset = dataset self._logger = LoggerFactory.create(LOG_PATH, __name__) - def train(self, model, models_dir): - self._logger.debug('Training model using train set...') - begin_time = time.time() - - if model.models_parameters.use_dev_subset: - X_forest = self._dataset.X_train - y_forest = self._dataset.y_train - X_omp = self._dataset.X_dev - y_omp = self._dataset.y_dev + def init(self, model): + if model.models_parameters.subsets_used == 'train,dev': + self._X_forest = self._dataset.X_train + self._y_forest = self._dataset.y_train + self._X_omp = self._dataset.X_dev + self._y_omp = self._dataset.y_dev self._logger.debug('Fitting the forest on train subset and OMP on dev subset.') - else: - X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) - X_omp = X_forest - y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) - y_omp = y_forest + elif model.models_parameters.subsets_used == 'train+dev,train+dev': + self._X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) + self._X_omp = self._X_forest + self._y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) + self._y_omp = self._y_forest self._logger.debug('Fitting both the forest and OMP on train+dev subsets.') + elif model.models_parameters.subsets_used == 'train,train+dev': + self._X_forest = self._dataset.X_train + self._y_forest = self._dataset.y_train + self._X_omp = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) + self._y_omp = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) + else: + raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used)) + def train(self, model): + """ + :param model: Object with + :return: + """ + + self._logger.debug('Training model using train set...') + self._begin_time = time.time() model.fit( - X_forest=X_forest, - y_forest=y_forest, - X_omp=X_omp, - y_omp=y_omp + X_forest=self._X_forest, + y_forest=self._y_forest, + X_omp=self._X_omp, + y_omp=self._y_omp ) - end_time = time.time() + self._end_time = time.time() - ModelRawResults( - forest=model.forest, - weights=model.weights, - training_time=end_time - begin_time, + def compute_results(self, model, models_dir): + """ + :param model: Object with + :param models_dir: Where the results will be saved + """ + results = ModelRawResults( + model_object=model, + training_time=self._end_time - self._begin_time, datetime=datetime.datetime.now(), train_score=model.score(self._dataset.X_train, self._dataset.y_train), dev_score=model.score(self._dataset.X_dev, self._dataset.y_dev), test_score=model.score(self._dataset.X_test, self._dataset.y_test), score_metric=model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way - train_score_regressor=model.score_regressor(self._dataset.X_train, self._dataset.y_train), - dev_score_regressor=model.score_regressor(self._dataset.X_dev, self._dataset.y_dev), - test_score_regressor=model.score_regressor(self._dataset.X_test, self._dataset.y_test) - ).save(models_dir) + train_score_regressor=model.score_base_estimator(self._dataset.X_train, self._dataset.y_train), + dev_score_regressor=model.score_base_estimator(self._dataset.X_dev, self._dataset.y_dev), + test_score_regressor=model.score_base_estimator(self._dataset.X_test, self._dataset.y_test) + ) + results.save(models_dir) + self._logger.info("Base performance on test: 
{}".format(results.test_score_regressor)) + self._logger.info("Performance on test: {}".format(results.test_score)) + + self._logger.info("Base performance on train: {}".format(results.train_score_regressor)) + self._logger.info("Performance on train: {}".format(results.train_score)) + + self._logger.info("Base performance on dev: {}".format(results.dev_score_regressor)) + self._logger.info("Performance on dev: {}".format(results.dev_score)) diff --git a/code/bolsonaro/utils.py b/code/bolsonaro/utils.py index 82e501878ba06320914230096213d2d28548e4dc..d7509ad9e85cde3cc0c649f85cfb5b60ead9a854 100644 --- a/code/bolsonaro/utils.py +++ b/code/bolsonaro/utils.py @@ -1,6 +1,7 @@ import os import json import pickle +from copy import deepcopy def resolve_experiment_id(models_dir): @@ -45,3 +46,30 @@ def load_obj_from_pickle(file_path, constructor): with open(file_path, 'rb') as input_file: parameters = pickle.load(input_file) return constructor(**parameters) + +def binarize_class_data(data, class_pos, inplace=True): + """ + Replace class_pos by +1 and ~class_pos by -1. + + :param data: an array of classes + :param class_pos: the positive class to be replaced by +1 + :param inplace: If True, modify data in place (still return it, also) + :return: + """ + if not inplace: + data = deepcopy(data) + + position_class_labels = (data == class_pos) + data[~(position_class_labels)] = -1 + data[(position_class_labels)] = +1 + + return data + +def change_binary_func_load(base_load_function): + def func_load(return_X_y): + X, y = base_load_function(return_X_y=return_X_y) + possible_classes = sorted(set(y)) + assert len(possible_classes) == 2, "Function change binary_func_load only work for binary classfication" + y = binarize_class_data(y, possible_classes[-1]) + return X, y + return func_load diff --git a/code/bolsonaro/visualization/plotter.py b/code/bolsonaro/visualization/plotter.py index 9d7058732970fb5981ef04ce7a56e022ee68d5a9..0d5706bc27cb0745fe065456231b7e3023707ac9 100644 --- a/code/bolsonaro/visualization/plotter.py +++ b/code/bolsonaro/visualization/plotter.py @@ -1,26 +1,51 @@ import matplotlib.pyplot as plt import numpy as np -from sklearn.neighbors.kde import KernelDensity +import pandas as pd class Plotter(object): @staticmethod - def weight_density(weights, X, file_path): - X_plot = [np.exp(elem) for elem in weights] - fig, ax = plt.subplots() + def weight_density(all_experiment_weights, file_path): + """ + Function that creates the figure with the density of the weights + :param all_experiment_weights: The weights for the different experiments + :param file path: str, path where the figure will be saved + TODO: colored by seed number or not? + TODO: represents both the seed AND the extracted tree information in the legend + """ - for kernel in ['gaussian', 'tophat', 'epanechnikov']: - kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X_plot) - log_dens = kde.score_samples(X_plot) - ax.plot(X_plot, np.exp(log_dens), '-', - label="kernel = '{0}'".format(kernel)) + """ + Convert dictionnary of structure + {seed_1:[M x W]], seed_k:[M x W]} + to numpy.ndarray with dim [K x M x W] + where K is the seed number, M is the + number of extracted trees and W the + weight number. + """ + all_experiment_weights = np.array(list(all_experiment_weights.values())) + + n = len(all_experiment_weights) - ax.legend(loc='upper left') - ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k') + """ + Get as many different colors from the specified cmap (here nipy_spectral) + as there are seeds used. 
+ """ + colors = Plotter.get_colors_from_cmap(n) + + fig, ax = plt.subplots() + # For each seed + for i in range(n): + # For each weight set of a given extracted tree number + for weights in all_experiment_weights[i]: + """ + Plot the series of weights that aren't zero, + colored by seed number. + """ + pd.Series(weights[np.nonzero(weights)]).plot.kde( + figsize=(15, 10), ax=ax, color=colors[i]) - ax.set_xlim(-4, 9) - ax.set_ylim(-0.02, 0.4) + ax.set_title('Density weights of the OMP') fig.savefig(file_path, dpi=fig.dpi) plt.close(fig) diff --git a/code/compute_hyperparameters.py b/code/compute_hyperparameters.py new file mode 100644 index 0000000000000000000000000000000000000000..0f7aa3a666d61618a3a5d50b1de8e996c235034c --- /dev/null +++ b/code/compute_hyperparameters.py @@ -0,0 +1,97 @@ +from bolsonaro import LOG_PATH +from bolsonaro.data.dataset_loader import DatasetLoader +from bolsonaro.data.dataset_parameters import DatasetParameters +from bolsonaro.data.task import Task +from bolsonaro.error_handling.logger_factory import LoggerFactory +from bolsonaro.hyperparameter_searcher import HyperparameterSearcher +from bolsonaro.utils import save_obj_to_json + +import argparse +import os +import pathlib +import pickle +import random +from dotenv import find_dotenv, load_dotenv + +""" +I had to install skopt from this repository +https://github.com/darenr/scikit-optimize that handles +the issue described here https://github.com/scikit-optimize/scikit-optimize/issues/762. +""" +from skopt.space import Categorical, Integer, Real + + +def clean_numpy_int_dict(dictionary): + return dict([a, int(x)] if type(x) == Integer else + [a, clean_numpy_int_dict(x)] if type(x) == dict else + [a, clean_numpy_int_list(x)] if type(x) == list else [a, (x)] + for a, x in dictionary.items()) + + +def clean_numpy_int_list(list_n): + return [int(elem) if type(elem) == Integer else + clean_numpy_int_dict(elem) if type(elem) == dict else + clean_numpy_int_list(elem) if type(elem) == list else elem + for elem in list_n] + + +if __name__ == "__main__": + # get environment variables in .env + load_dotenv(find_dotenv('.env')) + + DEFAULT_CV = 3 + DEFAULT_N_ITER = 50 + DICT_PARAM_SPACE = {'n_estimators': Integer(10, 1000), + 'min_samples_leaf': Integer(1, 1000), + 'max_depth': Integer(1, 20), + 'max_features': Categorical(['auto', 'sqrt', 'log2'], [0.5, 0.25, 0.25])} + DATASET_LIST = ['boston', 'iris', 'diabetes'] + # , 'digits', 'linnerud', 'wine'] + + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--cv', nargs='?', type=int, default=DEFAULT_CV, help='Specify the size of the cross-validation.') + parser.add_argument('--n_iter', nargs='?', type=int, default=DEFAULT_N_ITER, help='Specify the number of iterations for the bayesian search.') + parser.add_argument('--seed', nargs='?', type=int, default=None, help='Specify a seed instead of generate it randomly.') + parser.add_argument('--datasets', nargs='+', type=str, default=DATASET_LIST, help='Specify the dataset used by the estimator.') + parser.add_argument('--verbose', action='store_true', default=False, help='Print information during the bayesian search.') + + args = parser.parse_args() + + logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__)) + + begin_random_seed_range = 1 + end_random_seed_range = 2000 + + if args.seed is None: + random_seed = random.randint(begin_random_seed_range, end_random_seed_range) + else: + random_seed = args.seed + + for dataset_name in args.datasets: + + 
diff --git a/code/compute_hyperparameters.py b/code/compute_hyperparameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f7aa3a666d61618a3a5d50b1de8e996c235034c
--- /dev/null
+++ b/code/compute_hyperparameters.py
@@ -0,0 +1,98 @@
+from bolsonaro import LOG_PATH
+from bolsonaro.data.dataset_loader import DatasetLoader
+from bolsonaro.data.dataset_parameters import DatasetParameters
+from bolsonaro.data.task import Task
+from bolsonaro.error_handling.logger_factory import LoggerFactory
+from bolsonaro.hyperparameter_searcher import HyperparameterSearcher
+from bolsonaro.utils import save_obj_to_json
+
+import argparse
+import os
+import pathlib
+import pickle
+import random
+import numpy as np
+from dotenv import find_dotenv, load_dotenv
+
+"""
+I had to install skopt from this repository
+https://github.com/darenr/scikit-optimize that handles
+the issue described here https://github.com/scikit-optimize/scikit-optimize/issues/762.
+"""
+from skopt.space import Categorical, Integer, Real
+
+
+def clean_numpy_int_dict(dictionary):
+    # numpy integers aren't JSON-serializable, so cast them to plain int
+    return dict([a, int(x)] if isinstance(x, np.integer) else
+                [a, clean_numpy_int_dict(x)] if type(x) == dict else
+                [a, clean_numpy_int_list(x)] if type(x) == list else [a, (x)]
+                for a, x in dictionary.items())
+
+
+def clean_numpy_int_list(list_n):
+    return [int(elem) if isinstance(elem, np.integer) else
+            clean_numpy_int_dict(elem) if type(elem) == dict else
+            clean_numpy_int_list(elem) if type(elem) == list else elem
+            for elem in list_n]
+
+
+if __name__ == "__main__":
+    # get environment variables in .env
+    load_dotenv(find_dotenv('.env'))
+
+    DEFAULT_CV = 3
+    DEFAULT_N_ITER = 50
+    DICT_PARAM_SPACE = {'n_estimators': Integer(10, 1000),
+                        'min_samples_leaf': Integer(1, 1000),
+                        'max_depth': Integer(1, 20),
+                        'max_features': Categorical(['auto', 'sqrt', 'log2'], [0.5, 0.25, 0.25])}
+    DATASET_LIST = ['boston', 'iris', 'diabetes']
+    # , 'digits', 'linnerud', 'wine']
+
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--cv', nargs='?', type=int, default=DEFAULT_CV, help='Specify the number of cross-validation folds.')
+    parser.add_argument('--n_iter', nargs='?', type=int, default=DEFAULT_N_ITER, help='Specify the number of iterations for the bayesian search.')
+    parser.add_argument('--seed', nargs='?', type=int, default=None, help='Specify a seed instead of generating it randomly.')
+    parser.add_argument('--datasets', nargs='+', type=str, default=DATASET_LIST, help='Specify the datasets used by the estimator.')
+    parser.add_argument('--verbose', action='store_true', default=False, help='Print information during the bayesian search.')
+
+    args = parser.parse_args()
+
+    logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
+
+    begin_random_seed_range = 1
+    end_random_seed_range = 2000
+
+    if args.seed is None:
+        random_seed = random.randint(begin_random_seed_range, end_random_seed_range)
+    else:
+        random_seed = args.seed
+
+    for dataset_name in args.datasets:
+
+        dataset_dir = os.path.join('experiments', dataset_name, 'stage1')
+
+        pathlib.Path(dataset_dir).mkdir(parents=True, exist_ok=True)
+
+        logger.info('Bayesian search on dataset {}'.format(dataset_name))
+
+        dataset_parameters = DatasetParameters(dataset_name, test_size=0.2, dev_size=0.01, random_state=random_seed, dataset_normalizer=None)
+        dataset = DatasetLoader.load(dataset_parameters)
+
+        if dataset.task == Task.REGRESSION:
+            scorer = 'neg_mean_squared_error'
+        else:
+            scorer = 'accuracy'
+
+        bayesian_searcher = HyperparameterSearcher()
+        opt = bayesian_searcher.search(dataset, DICT_PARAM_SPACE, args.n_iter,
+                                       args.cv, random_seed, scorer, args.verbose)
+
+        dict_results = {'_scorer': scorer,
+                        '_best_score_train': opt.best_score_,
+                        '_best_score_test': opt.score(dataset.X_test, dataset.y_test),
+                        '_best_parameters': clean_numpy_int_dict(opt.best_params_),
+                        '_random_seed': random_seed
+                        }
+
+        save_obj_to_json(os.path.join(dataset_dir, 'params.json'), dict_results)
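Note: the cleaning helpers exist because numpy integers are not JSON serializable; a quick check of the failure mode and of the fix (the values are illustrative):

    import json
    import numpy as np

    best_params = {'max_depth': np.int64(18), 'max_features': 'sqrt'}
    try:
        json.dumps(best_params)
    except TypeError as e:
        print(e)  # e.g. "Object of type int64 is not JSON serializable"

    cleaned = {k: int(v) if isinstance(v, np.integer) else v for k, v in best_params.items()}
    print(json.dumps(cleaned))  # {"max_depth": 18, "max_features": "sqrt"}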
diff --git a/code/compute_results.py b/code/compute_results.py
index fb09e42d3fc6829358b622ad4291bd064a9c65a3..64124af70954cc6af6a923f03f5a122a75f453fb 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -12,7 +12,7 @@ import os
 
 if __name__ == "__main__":
     # get environment variables in .env
-    load_dotenv(find_dotenv('.env.example'))
+    load_dotenv(find_dotenv('.env'))
 
     DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results'
     DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
@@ -59,6 +59,8 @@ if __name__ == "__main__":
         experiment_dev_scores = dict()
         experiment_test_scores = dict()
 
+        experiment_weights = dict()
+
         # Used to check if all losses were computed using the same metric (it should be the case)
         experiment_score_metrics = list()
 
@@ -74,6 +76,8 @@ if __name__ == "__main__":
             experiment_dev_scores[seed] = list()
             experiment_test_scores[seed] = list()
 
+            experiment_weights[seed] = list()
+
             # List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_size
            extracted_forest_sizes = os.listdir(extracted_forest_size_root_path)
            for extracted_forest_size in extracted_forest_sizes:
@@ -84,9 +88,13 @@ if __name__ == "__main__":
                 # Load [...]/model_parameters.json file and build the model using these parameters and the weights and forest from model_raw_results.pickle
                 model = ModelFactory.load(dataset.task, extracted_forest_size_path, experiment_id, model_raw_results)
                 # Save temporarly some raw results (TODO: to complete to retreive more results)
+                # Save the scores
                 experiment_train_scores[seed].append(model_raw_results.train_score)
                 experiment_dev_scores[seed].append(model_raw_results.dev_score)
                 experiment_test_scores[seed].append(model_raw_results.test_score)
+                # Save the weights
+                experiment_weights[seed].append(model_raw_results.weights)
+                # Save the metric
                 experiment_score_metrics.append(model_raw_results.score_metric)
 
         if len(set(experiment_score_metrics)) > 1:
@@ -107,3 +115,48 @@ if __name__ == "__main__":
             all_labels=['train', 'dev', 'test'],
             title='Loss values of the trained model'
         )
+
+        """
+        TODO:
+        For each dataset:
+        Stage 1) A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams)
+        Stage 2) A figure for the selection of the best dataset normalization method
+        Stage 3) A figure for the selection of the best combination of dataset normalization vs D normalization vs weights normalization
+        Stage 4) A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev
+        Stage 5) A figure for the selection of the best extracted forest size?
+        Stage 6) A figure to finally compare the performance of our approach using the previously selected parameters vs the baseline vs other papers
+
+        Stage 3)
+        In all axes:
+        - untrained forest
+        - trained base forest (straight line because it doesn't depend on the number of extracted trees)
+
+        Axis 1:
+        - test with forest on train+dev and OMP on train+dev
+        - test with forest on train+dev and OMP on train+dev with dataset normalization
+        - test with forest on train+dev and OMP on train+dev with dataset normalization + D normalization
+        - test with forest on train+dev and OMP on train+dev with dataset normalization + weights normalization
+        - test with forest on train+dev and OMP on train+dev with dataset normalization + D normalization + weights normalization
+
+        Axis 2:
+        - test with forest on train and OMP on dev
+        - test with forest on train and OMP on dev with dataset normalization
+        - test with forest on train and OMP on dev with dataset normalization + D normalization
+        - test with forest on train and OMP on dev with dataset normalization + weights normalization
+        - test with forest on train and OMP on dev with dataset normalization + D normalization + weights normalization
+
+        Axis 3:
+        - test with forest on train and OMP on train+dev
+        - test with forest on train and OMP on train+dev with dataset normalization
+        - test with forest on train and OMP on train+dev with dataset normalization + D normalization
+        - test with forest on train and OMP on train+dev with dataset normalization + weights normalization
+        - test with forest on train and OMP on train+dev with dataset normalization + D normalization + weights normalization
+
+        IMPORTANT: Same seeds used in all axes.
+        """
+
+        # Plot the density of the weights
+        Plotter.weight_density(
+            file_path=args.results_dir + os.sep + experiment_id + os.sep + 'density_weight.png',
+            all_experiment_weights=experiment_weights
+        )
diff --git a/code/train.py b/code/train.py
index 5783fef4bd32d75f62f5dcf05c9812e82f9c0338..0d9713252b0e5e2345331952edaca6adfa5424c0 100644
--- a/code/train.py
+++ b/code/train.py
@@ -9,16 +9,74 @@ from bolsonaro.error_handling.logger_factory import LoggerFactory
 from dotenv import find_dotenv, load_dotenv
 
 import argparse
+import json
 import pathlib
 import random
 import os
-from tqdm import tqdm
+from concurrent import futures
+import threading
+
+
+def process_job(seed, parameters, experiment_id, hyperparameters):
+    """
+    Experiment function.
+
+    Used as the base function for the workers of the multiprocess application.
+
+    :param seed:
+    :param parameters:
+    :param experiment_id:
+    :param hyperparameters:
+    :return:
+    """
+    logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
+        seed, threading.get_ident()))
+    logger.info('seed={}'.format(seed))
+
+    seed_str = str(seed)
+    experiment_id_str = str(experiment_id)
+    models_dir = parameters['models_dir'] + os.sep + experiment_id_str + os.sep + 'seeds' + \
+        os.sep + seed_str
+    pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)
+
+    dataset_parameters = DatasetParameters(
+        name=parameters['dataset_name'],
+        test_size=parameters['test_size'],
+        dev_size=parameters['dev_size'],
+        random_state=seed,
+        dataset_normalizer=parameters['dataset_normalizer']
+    )
+    dataset_parameters.save(models_dir, experiment_id_str)
+    dataset = DatasetLoader.load(dataset_parameters)
+
+    trainer = Trainer(dataset)
+
+    for extracted_forest_size in parameters['extracted_forest_size']:
+        # question: if training is too long, one may also split the experiments for different forest sizes into different workers
+        logger.info('extracted_forest_size={}'.format(extracted_forest_size))
+        sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
+        pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
+
+        model_parameters = ModelParameters(
+            extracted_forest_size=extracted_forest_size,
+            normalize_D=parameters['normalize_D'],
+            subsets_used=parameters['subsets_used'],
+            normalize_weights=parameters['normalize_weights'],
+            seed=seed,
+            hyperparameters=hyperparameters
+        )
+        model_parameters.save(sub_models_dir, experiment_id)
+
+        model = ModelFactory.build(dataset.task, model_parameters)
+
+        trainer.init(model)
+        trainer.train(model)
+        trainer.compute_results(model, sub_models_dir)
+    logger.info('Training done')
+
 
 if __name__ == "__main__":
-    # get environment variables in .env
-    load_dotenv(find_dotenv('.env.example'))
+    load_dotenv(find_dotenv('.env'))
 
+    DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
     DEFAULT_DATASET_NAME = 'boston'
     DEFAULT_NORMALIZE_D = False
     DEFAULT_DATASET_NORMALIZER = None
@@ -29,13 +88,15 @@ if __name__ == "__main__":
     DEFAULT_DEV_SIZE = 0.2
     DEFAULT_TEST_SIZE = 0.2
     DEFAULT_RANDOM_SEED_NUMBER = 1
-    DEFAULT_USE_DEV_SUBSET = False
-    DEFAULT_DISABLE_PROGRESS = False
+    DEFAULT_SUBSETS_USED = 'train,dev'
+    DEFAULT_NORMALIZE_WEIGHTS = False
 
     begin_random_seed_range = 1
     end_random_seed_range = 2000
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overrides all other parameters.')
+    parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.')
     parser.add_argument('--dataset_name', nargs='?', type=str, default=DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
     parser.add_argument('--normalize_D', action='store_true', default=DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.')
     parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).')
@@ -46,66 +107,64 @@ if __name__ == "__main__":
     parser.add_argument('--test_size', nargs='?', type=float, default=DEFAULT_TEST_SIZE, help='Test subset ratio.')
     parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
     parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
-    parser.add_argument('--use_dev_subset', action='store_true', default=DEFAULT_USE_DEV_SUBSET, help='If specify the forest will be trained on train subset and OMP on dev subset. Otherwise both the forest and OMP will be trained on train+dev subsets.')
-    parser.add_argument('--disable_progress', action='store_true', default=DEFAULT_DISABLE_PROGRESS, help='Disable the progress bars.')
+    parser.add_argument('--subsets_used', nargs='+', type=str, default=DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train and OMP on train+dev.')
+    parser.add_argument('--normalize_weights', action='store_true', default=DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.')
 
     args = parser.parse_args()
 
-    pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True)
+    if args.experiment_configuration:
+        with open(args.experiment_configuration_path + os.sep + \
+            args.experiment_configuration + '.json', 'r') as input_file:
+            parameters = json.load(input_file)
+    else:
+        parameters = args.__dict__
+
+    pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
 
     logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
 
-    args.extracted_forest_size = args.extracted_forest_size \
-        if type(args.extracted_forest_size) == list \
-        else [args.extracted_forest_size]
+    # The number of trees to extract from the forest (K)
+    parameters['extracted_forest_size'] = parameters['extracted_forest_size'] \
+        if type(parameters['extracted_forest_size']) == list \
+        else [parameters['extracted_forest_size']]
 
-    if args.seeds != None and args.random_seed_number > 1:
-        logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
+    hyperparameters_path = os.path.join('experiments', args.dataset_name, 'stage1', 'params.json')
+    if os.path.exists(hyperparameters_path):
+        logger.info("Hyperparameters found for this dataset at '{}'".format(hyperparameters_path))
+        with open(hyperparameters_path, 'r+') as file_hyperparameter:
+            hyperparameters = json.load(file_hyperparameter)['best_parameters']
+    else:
+        hyperparameters = {}
 
-    seeds = args.seeds if args.seeds is not None \
-        else [random.randint(begin_random_seed_range, end_random_seed_range) \
-        for i in range(args.random_seed_number)]
+    if parameters['forest_size'] is not None:
+        hyperparameters['n_estimators'] = parameters['forest_size']
 
-    experiment_id = resolve_experiment_id(args.models_dir)
-    experiment_id_str = str(experiment_id)
+    if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
+        logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
 
-    logger.info('Experiment id: {}'.format(experiment_id_str))
-
-    with tqdm(seeds, disable=args.disable_progress) as seed_bar:
-        for seed in seed_bar:
-            seed_bar.set_description('seed={}'.format(seed))
-            seed_str = str(seed)
-            models_dir = args.models_dir + os.sep + experiment_id_str + os.sep + 'seeds' + \
-                os.sep + seed_str
-            pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)
-
-            dataset_parameters = DatasetParameters(
-                name=args.dataset_name,
-                test_size=args.test_size,
-                dev_size=args.dev_size,
-                random_state=seed,
-                dataset_normalizer=args.dataset_normalizer
-            )
-            dataset_parameters.save(models_dir, experiment_id_str)
-
-            dataset = DatasetLoader.load(dataset_parameters)
-
-            trainer = Trainer(dataset)
-
-            with tqdm(args.extracted_forest_size, disable=args.disable_progress) as extracted_forest_size_bar:
-                for extracted_forest_size in extracted_forest_size_bar:
-                    extracted_forest_size_bar.set_description('extracted_forest_size={}'.format(extracted_forest_size))
-                    sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
-                    pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
-
-                    model_parameters = ModelParameters(
-                        forest_size=args.forest_size,
-                        extracted_forest_size=extracted_forest_size,
-                        normalize_D=args.normalize_D,
-                        use_dev_subset=args.use_dev_subset,
-                        seed=seed
-                    )
-                    model_parameters.save(sub_models_dir, experiment_id)
-
-                    model = ModelFactory.build(dataset.task, model_parameters)
-                    trainer.train(model, sub_models_dir)
+    # Seeds are either provided as parameters or generated at random
+    seeds = parameters['seeds'] if parameters['seeds'] is not None \
+        else [random.randint(begin_random_seed_range, end_random_seed_range) \
+        for i in range(parameters['random_seed_number'])]
+
+    # Resolve the next experiment id number (last id + 1)
+    experiment_id = resolve_experiment_id(parameters['models_dir'])
+    logger.info('Experiment id: {}'.format(experiment_id))
+
+    """
+    If the experiment configuration isn't coming from
+    an already existing file, save it to a json file to
+    keep a trace of it.
+    """
+    if args.experiment_configuration is None:
+        with open(args.experiment_configuration_path + os.sep + 'unnamed_{}.json'.format(
+            experiment_id), 'w') as output_file:
+            json.dump(
+                parameters,
+                output_file,
+                indent=4
+            )
+
+    # Launch one worker per seed
+    with futures.ProcessPoolExecutor(len(seeds)) as executor:
+        list(f.result() for f in futures.as_completed(executor.submit(process_job, seed,
+            parameters, experiment_id, hyperparameters) for seed in seeds))
diff --git a/experiments/.gitkeep b/experiments/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/experiments/boston/stage1/params.json b/experiments/boston/stage1/params.json
index 6a5a1e9a05d8f081af6abe38fa0aadfff1e736b8..ad9a695df71e3bf329b4d7b30094329daf6c6b4f 100644
--- a/experiments/boston/stage1/params.json
+++ b/experiments/boston/stage1/params.json
@@ -1,5 +1,6 @@
 {
     "scorer": "neg_mean_squared_error",
+<<<<<<< HEAD
     "best_score_train": -11.238253315624897,
     "best_score_test": -7.312532120669678,
     "best_parameters": {
@@ -9,4 +10,15 @@
         "n_estimators": 1000
     },
     "random_seed": 289
+=======
+    "best_score_train": -12.900217003727361,
+    "best_score_test": -12.682938620298733,
+    "best_parameters": {
+        "max_depth": 18,
+        "max_features": "sqrt",
+        "min_samples_leaf": 1,
+        "n_estimators": 1000
+    },
+    "random_seed": 883
+>>>>>>> a0f7c96f51b3e1575f6db1b704579b0cf1042c42
 }
\ No newline at end of file
diff --git a/experiments/iris/stage1/params.json b/experiments/iris/stage1/params.json
new file mode 100644
index 0000000000000000000000000000000000000000..fd852cace9852ee492649374e915b639fe785b28
--- /dev/null
+++ b/experiments/iris/stage1/params.json
@@ -0,0 +1,12 @@
+{
+    "scorer": "accuracy",
+    "best_score_train": 0.9576271186440678,
+    "best_score_test": 1.0,
+    "best_parameters": {
+        "max_depth": 20,
+        "max_features": "log2",
+        "min_samples_leaf": 1,
+        "n_estimators": 1000
+    },
+    "random_seed": 883
+}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 437c0178801cca15b39561318f6e244c15b34533..d585159632442f7293f858e1fad391031658d60a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,6 +9,7 @@ awscli
 flake8
 python-dotenv>=0.5.1
 scikit-learn
+git+git://github.com/darenr/scikit-optimize@master
 python-dotenv
-tqdm
-matplotlib
\ No newline at end of file
+matplotlib
+pandas
\ No newline at end of file