diff --git a/.env.example b/.env.example
deleted file mode 100644
index 9ca543b382b4889be1d93ed2065ef234b789153c..0000000000000000000000000000000000000000
--- a/.env.example
+++ /dev/null
@@ -1,12 +0,0 @@
-# Environment variables go here, can be read by `python-dotenv` package:
-#
-# `src/script.py`
-# ----------------------------------------------------------------
-# import dotenv
-#
-# project_dir = os.path.join(os.path.dirname(__file__), os.pardir)
-# dotenv_path = os.path.join(project_dir, '.env')
-# dotenv.load_dotenv(dotenv_path)
-# ----------------------------------------------------------------
-
-project_dir = "."
\ No newline at end of file
diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py
index ec1f321f70115542a2164c474193a246faa5639d..f4a6d085f45cfa6580949ad6acfabbe4abe71d8a 100644
--- a/code/bolsonaro/data/dataset_loader.py
+++ b/code/bolsonaro/data/dataset_loader.py
@@ -9,6 +9,17 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
 from sklearn.model_selection import train_test_split
 from sklearn import preprocessing
 
+from bolsonaro.utils import binarize_class_data
+
+
+def change_binary_func_load(base_load_function):
+    def func_load(return_X_y):
+        X, y = base_load_function(return_X_y=return_X_y)
+        possible_classes = sorted(set(y))
+        assert len(possible_classes) == 2, "change_binary_func_load only works for binary classification"
+        y = binarize_class_data(y, possible_classes[-1])
+        return X, y
+    return func_load
 
 
 class DatasetLoader(object):
@@ -20,45 +31,46 @@ class DatasetLoader(object):
             task = Task.REGRESSION
         elif name == 'iris':
             dataset_loading_func = load_iris
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == 'diabetes':
             dataset_loading_func = load_diabetes
             task = Task.REGRESSION
         elif name == 'digits':
             dataset_loading_func = load_digits
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == 'linnerud':
             dataset_loading_func = load_linnerud
             task = Task.REGRESSION
         elif name == 'wine':
             dataset_loading_func = load_wine
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == 'breast_cancer':
-            dataset_loading_func = load_breast_cancer
-            task = Task.CLASSIFICATION
+            dataset_loading_func = change_binary_func_load(load_breast_cancer)
+            task = Task.BINARYCLASSIFICATION
         elif name == 'olivetti_faces':  # bug (no return X_y)
             dataset_loading_func = fetch_olivetti_faces
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == '20newsgroups':  # bug (no return X_y)
             dataset_loading_func = fetch_20newsgroups
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == '20newsgroups_vectorized':
             dataset_loading_func = fetch_20newsgroups_vectorized
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == 'lfw_people':  # needs PIL (image dataset)
             dataset_loading_func = fetch_lfw_people
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == 'lfw_pairs':
             dataset_loading_func = fetch_lfw_pairs
+            task = Task.MULTICLASSIFICATION
         elif name == 'covtype':
             dataset_loading_func = fetch_covtype
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == 'rcv1':
             dataset_loading_func = fetch_rcv1
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == 'kddcup99':
             dataset_loading_func = fetch_kddcup99
-            task = Task.CLASSIFICATION
+            task = Task.MULTICLASSIFICATION
         elif name == 'california_housing':
             dataset_loading_func = fetch_california_housing
             task = Task.REGRESSION
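For illustration, this is what the new `change_binary_func_load` wrapper does to scikit-learn's `load_breast_cancer` (a minimal sketch; the wrapper itself is defined in `dataset_loader.py` above):

```python
from sklearn.datasets import load_breast_cancer

from bolsonaro.data.dataset_loader import change_binary_func_load

# Wrap the loader, then call it the way DatasetLoader would.
load_binary_breast_cancer = change_binary_func_load(load_breast_cancer)
X, y = load_binary_breast_cancer(return_X_y=True)
print(sorted(set(y)))  # [-1, 1]: the original {0, 1} labels, remapped for the binary OMP forest
```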
diff --git a/code/bolsonaro/data/task.py b/code/bolsonaro/data/task.py
index 2f47fa22f472f769c075f40e1c25a7bf3de45f0d..f1214a64a27873e49f5dbbcb853e4f65f9b07f68 100644
--- a/code/bolsonaro/data/task.py
+++ b/code/bolsonaro/data/task.py
@@ -2,5 +2,6 @@ from enum import Enum
 
 class Task(Enum):
-    CLASSIFICATION = 1
+    BINARYCLASSIFICATION = 1
     REGRESSION = 2
+    MULTICLASSIFICATION = 3
diff --git a/code/bolsonaro/hyperparameter_searcher.py b/code/bolsonaro/hyperparameter_searcher.py
index 1f54c84e02f02ab8d62ba1441475cbfe2d572858..7884d2d4271203e9ebee1e804baa7c1e94a76770 100644
--- a/code/bolsonaro/hyperparameter_searcher.py
+++ b/code/bolsonaro/hyperparameter_searcher.py
@@ -33,11 +33,10 @@ class HyperparameterSearcher(object):
         :return: a skopt.searchcv.BayesSearchCV object
         '''
-        if dataset.task == Task.CLASSIFICATION:
-            estimator = RandomForestClassifier(n_jobs=-1, random_state=random_seed)
-
         if dataset.task == Task.REGRESSION:
             estimator = RandomForestRegressor(n_jobs=-1, random_state=random_seed)
+        else:
+            estimator = RandomForestClassifier(n_jobs=-1, random_state=random_seed)
 
         opt = BayesSearchCV(estimator, hyperparameter_space, n_iter=n_iter,
                             cv=cv, n_jobs=-1, random_state=random_seed,
diff --git a/code/bolsonaro/models/model_factory.py b/code/bolsonaro/models/model_factory.py
index fb6b32cb26727d2221367f208598f04e1a19dfb1..2dc578cfaacc99f9fea17b9ae8e64cc08e3038dc 100644
--- a/code/bolsonaro/models/model_factory.py
+++ b/code/bolsonaro/models/model_factory.py
@@ -1,4 +1,4 @@
-from bolsonaro.models.omp_forest_classifier import OmpForestClassifier
+from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.data.task import Task
 from bolsonaro.models.model_parameters import ModelParameters
@@ -11,18 +11,22 @@ class ModelFactory(object):
 
     @staticmethod
     def build(task, model_parameters):
-        if task == Task.CLASSIFICATION:
-            model_func = OmpForestClassifier
+        if task == Task.BINARYCLASSIFICATION:
+            model_func = OmpForestBinaryClassifier
         elif task == Task.REGRESSION:
             model_func = OmpForestRegressor
+        elif task == Task.MULTICLASSIFICATION:
+            model_func = OmpForestMulticlassClassifier
         else:
             raise ValueError("Unsupported task '{}'".format(task))
         return model_func(model_parameters)
 
     @staticmethod
     def load(task, directory_path, experiment_id, model_raw_results):
+        raise NotImplementedError
         model_parameters = ModelParameters.load(directory_path, experiment_id)
         model = ModelFactory.build(task, model_parameters)
-        model.set_forest(model_raw_results.forest)
-        model.set_weights(model_raw_results.weights)
+        # todo: do what is needed here to restore the model correctly
+        # model.set_forest(model_raw_results.forest)
+        # model.set_weights(model_raw_results.weights)
         return model
diff --git a/code/bolsonaro/models/model_raw_results.py b/code/bolsonaro/models/model_raw_results.py
index 673cb0fc65b7378e95c03b186d246cb70b384a07..df8b2ec0b10704a8a8c397b9012298e8b901e14b 100644
--- a/code/bolsonaro/models/model_raw_results.py
+++ b/code/bolsonaro/models/model_raw_results.py
@@ -6,13 +6,12 @@ import datetime
 
 class ModelRawResults(object):
 
-    def __init__(self, forest, weights, training_time,
+    def __init__(self, model_object, training_time,
                  datetime, train_score, dev_score, test_score,
                  score_metric, train_score_regressor, dev_score_regressor,
                  test_score_regressor):
-        self._forest = forest
-        self._weights = weights
+        self._model_object = model_object
         self._training_time = training_time
         self._datetime = datetime
         self._train_score = train_score
@@ -24,12 +23,8 @@ class ModelRawResults(object):
         self._test_score_regressor = test_score_regressor
 
     @property
-    def forest(self):
-        return self._forest
-
-    @property
-    def weights(self):
-        return self._weights
+    def model_object(self):
+        return self._model_object
 
     @property
     def training_time(self):
diff --git a/code/bolsonaro/models/omp_forest.py b/code/bolsonaro/models/omp_forest.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c33f09dd07142cfc9f94cee500be3ed8c795fba
--- /dev/null
+++ b/code/bolsonaro/models/omp_forest.py
@@ -0,0 +1,123 @@
+from abc import abstractmethod, ABCMeta
+
+import numpy as np
+from sklearn.linear_model import OrthogonalMatchingPursuit
+
+from bolsonaro import LOG_PATH
+from bolsonaro.error_handling.logger_factory import LoggerFactory
+from sklearn.base import BaseEstimator
+
+
+class OmpForest(BaseEstimator, metaclass=ABCMeta):
+    def __init__(self, models_parameters, base_forest_estimator):
+        self._base_forest_estimator = base_forest_estimator
+        self._models_parameters = models_parameters
+        self._logger = LoggerFactory.create(LOG_PATH, __name__)
+
+    @property
+    def models_parameters(self):
+        return self._models_parameters
+
+    def score_base_estimator(self, X, y):
+        return self._base_forest_estimator.score(X, y)
+
+    def _base_estimator_predictions(self, X):
+        return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T
+
+    @property
+    def forest(self):
+        return self._base_forest_estimator.estimators_
+
+    # sklearn BaseEstimator API methods
+    def fit(self, X_forest, y_forest, X_omp, y_omp):
+        self._base_forest_estimator.fit(X_forest, y_forest)
+        self._extract_subforest(X_omp, y_omp)  # type: OrthogonalMatchingPursuit
+        return self
+
+    def _extract_subforest(self, X, y):
+        """
+        Given an already estimated regressor: apply OMP to get the weight of each tree.
+
+        The X data is used for interrogation of every tree in the forest. The y data
+        is used for finding the weights in OMP.
+
+        :param X: (n_sample, n_features) array
+        :param y: (n_sample,) array
+        :return:
+        """
+        self._logger.debug("Forest make prediction on X")
+        D = self._base_estimator_predictions(X)
+
+        if self._models_parameters.normalize_D:
+            # question: maybe consider other kinds of normalization.. centering?
+            self._logger.debug("Compute norm of predicted vectors on X")
+            self._forest_norms = np.linalg.norm(D, axis=0)
+            D /= self._forest_norms
+
+        self._logger.debug("Apply orthogonal matching pursuit on forest for {} extracted trees."
+                           .format(self._models_parameters.extracted_forest_size))
+
+        self.fit_omp(D, y)
+
+    @staticmethod
+    def _make_omp_weighted_prediction(base_predictions, omp_obj, normalize_weights=False):
+        if normalize_weights:
+            # we can normalize weights (by their sum) so that they sum to 1
+            # and they can be interpreted as impact percentages for interpretability.
+            # this requires removing the (-) from the weights, e.g. moving it to the predictions (use unsigned_coef)
+
+            # question: I don't understand the trick with nonzero here?
+            # predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_))))
+            coef_signs = np.sign(omp_obj.coef_)[np.newaxis, :]  # add axis to make sure it will be broadcasted line-wise (there might be a confusion when forest_prediction is square)
+            unsigned_coef = (coef_signs * omp_obj.coef_).squeeze()
+            intercept = omp_obj.intercept_
+
+            adjusted_forest_predictions = base_predictions * coef_signs
+            predictions = adjusted_forest_predictions.dot(unsigned_coef) + intercept
+        else:
+            predictions = omp_obj.predict(base_predictions)
+
+        return predictions
+
+    @abstractmethod
+    def fit_omp(self, atoms, objective):
+        pass
+
+    @abstractmethod
+    def predict(self, X):
+        pass
+
+    @abstractmethod
+    def score(self, X, y):
+        pass
+
+
+class SingleOmpForest(OmpForest):
+    def __init__(self, models_parameters, base_forest_estimator):
+        # fit_intercept shouldn't be set to False as the data isn't necessarily centered here
+        # normalization is handled outside OMP
+        self._omp = OrthogonalMatchingPursuit(
+            n_nonzero_coefs=models_parameters.extracted_forest_size,
+            fit_intercept=True, normalize=False)
+
+        super().__init__(models_parameters, base_forest_estimator)
+
+    def fit_omp(self, atoms, objective):
+        self._omp.fit(atoms, objective)
+
+    def predict(self, X):
+        """
+        Apply the SingleOmpForest to X.
+
+        Make all the base tree predictions then apply the OMP weights for pruning.
+
+        :param X:
+        :return:
+        """
+        forest_predictions = self._base_estimator_predictions(X)
+
+        if self._models_parameters.normalize_D:
+            forest_predictions /= self._forest_norms
+
+        return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights)
\ No newline at end of file
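For context, the pruning step that `SingleOmpForest` wraps can be reproduced standalone with scikit-learn. A minimal sketch (dataset, forest size, and `k` are arbitrary here; `normalize=False` assumes the pre-1.2 scikit-learn API that the diff itself relies on):

```python
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_forest, X_omp, y_forest, y_omp = train_test_split(X, y, test_size=0.5, random_state=0)

# 1) Fit a large forest, then build the dictionary D: one column of predictions per tree.
forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_forest, y_forest)
D = np.array([tree.predict(X_omp) for tree in forest.estimators_]).T  # (n_samples, n_trees)

# 2) OMP approximates y_omp with at most k columns of D, i.e. it selects and weights k trees.
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=10, fit_intercept=True, normalize=False)
omp.fit(D, y_omp)
print("selected trees:", np.flatnonzero(omp.coef_))  # at most 10 non-zero weights
```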
'{}'.".format(metric)) + + return evaluation + + +class OmpForestMulticlassClassifier(OmpForest): + + DEFAULT_SCORE_METRIC = 'indicator' + + def __init__(self, models_parameters): + estimator = RandomForestClassifier(n_estimators=models_parameters.forest_size, + random_state=models_parameters.seed, n_jobs=-1) + super().__init__(models_parameters, estimator) + # question: peut-être initialiser les omps dans le __init__? comme pour le SingleOmpForest + self._dct_class_omp = {} + + def fit_omp(self, atoms, objective): + assert len(self._dct_class_omp) == 0, "fit_omp can be called only once on {}".format(self.__class__.__name__) + possible_classes = sorted(set(objective)) + for class_label in possible_classes: + atoms_binary = binarize_class_data(atoms, class_label, inplace=False) + objective_binary = binarize_class_data(objective, class_label, inplace=False) + # todo peut etre considérer que la taille de forêt est globale et donc seulement une fraction est disponible pour chaque OMP... + omp_class = OrthogonalMatchingPursuit( + n_nonzero_coefs=self.models_parameters.extracted_forest_size, + fit_intercept=True, normalize=False) + omp_class.fit(atoms_binary, objective_binary) + self._dct_class_omp[class_label] = omp_class + return self._dct_class_omp + + def predict(self, X): + forest_predictions = self._base_estimator_predictions(X) + + if self._models_parameters.normalize_D: + forest_predictions /= self._forest_norms + + label_names = [] + preds = [] + for class_label, omp_class in self._dct_class_omp.items(): + label_names.append(class_label) + atoms_binary = binarize_class_data(forest_predictions, class_label, inplace=False) + preds.append(self._make_omp_weighted_prediction(atoms_binary, omp_class, self._models_parameters.normalize_weights)) + + # todo verifier que ce n'est pas bugué ici + + preds = np.array(preds).T + max_preds = np.argmax(preds, axis=1) + return np.array(label_names)[max_preds] + + + def score(self, X, y, metric=DEFAULT_SCORE_METRIC): + predictions = self.predict(X) + + if metric == 'indicator': + evaluation = np.sum(np.ones_like(predictions)[predictions == y]) / X.shape[0] + else: + raise ValueError("Unsupported metric '{}'.".format(metric)) + + return evaluation -class OmpForestClassifier(BaseEstimator): - def __init__(self): - raise ValueError('Classification tasks are not supported for now') - def fit(self, X, y): - pass +if __name__ == "__main__": + forest = RandomForestClassifier(n_estimators=10) + X = np.random.rand(10, 5) + y = np.random.choice([-1, +1], 10) + forest.fit(X, y) + print(forest.predict(np.random.rand(10, 5))) \ No newline at end of file diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py index 013a86a2e889d3ebdc1b809b6d0d50ac5a697f26..9e95453df26e9dc5a688b2dd5217276361b5e96d 100644 --- a/code/bolsonaro/models/omp_forest_regressor.py +++ b/code/bolsonaro/models/omp_forest_regressor.py @@ -1,67 +1,20 @@ -from bolsonaro import LOG_PATH -from bolsonaro.error_handling.logger_factory import LoggerFactory + from sklearn.ensemble import RandomForestRegressor -from sklearn.linear_model import OrthogonalMatchingPursuit -from sklearn.base import BaseEstimator import numpy as np +from bolsonaro.models.omp_forest import SingleOmpForest + -class OmpForestRegressor(BaseEstimator): +class OmpForestRegressor(SingleOmpForest): DEFAULT_SCORE_METRIC = 'mse' def __init__(self, models_parameters): - self._regressor = RandomForestRegressor(**models_parameters.hyperparameters, - random_state=models_parameters.seed, 
n_jobs=-1) - self._models_parameters = models_parameters - self._logger = LoggerFactory.create(LOG_PATH, __name__) - - @property - def forest(self): - return self._forest - - def set_forest(self, forest): - self._forest = forest - self._regressor.estimators_ = forest - - @property - def weights(self): - return self._weights - - def set_weights(self, weights): - self._weights = weights - - @property - def models_parameters(self): - return self._models_parameters + estimator = RandomForestRegressor(**models_parameters.hyperparameters, + random_state=models_parameters.seed, n_jobs=-1) - def fit(self, X_forest, y_forest, X_omp, y_omp): - self._forest = self._train_forest(X_forest, y_forest) - self._omp = self._extract_subforest(X_omp, y_omp) - self._weights = self._omp.coef_ - return self - - def score_regressor(self, X, y): - return self._regressor.score(X, y) - - def predict(self, X): - """ - Apply the OMPForestRegressor to X. - - :param X: - :return: - """ - forest_predictions = self._forest_prediction(X) - - if self._models_parameters.normalize_D: - forest_predictions /= self._forest_norms - - predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_)))) \ - if self._models_parameters.normalize_weights \ - else self._omp.predict(forest_predictions) - - return predictions + super().__init__(models_parameters, estimator) def score(self, X, y, metric=DEFAULT_SCORE_METRIC): """ @@ -80,38 +33,3 @@ class OmpForestRegressor(BaseEstimator): raise ValueError("Unsupported metric '{}'.".format(metric)) return evaluation - - def _train_forest(self, X, y): - self._regressor.fit(X, y) - forest = self._regressor.estimators_ - return forest - - def _extract_subforest(self, X, y): - """ - Given an already estimated regressor: apply OMP to get the weight of each tree. - - The X data is used for interrogation of every tree in the forest. The y data - is used for finding the weights in OMP. - - :param X: (n_sample, n_features) array - :param y: (n_sample,) array - :return: - """ - self._logger.debug("Forest make prediction on X") - D = self._forest_prediction(X) - - if self._models_parameters.normalize_D: - # question: maybe consider other kinds of normalization - self._logger.debug("Compute norm of predicted vectors on X") - self._forest_norms = np.linalg.norm(D, axis=0) - D /= self._forest_norms - - omp = OrthogonalMatchingPursuit( - n_nonzero_coefs=self._models_parameters.extracted_forest_size, - fit_intercept=False, normalize=False) - self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees." - .format(self._models_parameters.extracted_forest_size)) - return omp.fit(D, y) - - def _forest_prediction(self, X): - return np.array([tree.predict(X) for tree in self._forest]).T diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py index b586914166cf80f274a502d8d44b83f6b6f97484..a9bebe044b68475f5cc0cf6c6a2097ffe986e47c 100644 --- a/code/bolsonaro/trainer.py +++ b/code/bolsonaro/trainer.py @@ -8,12 +8,26 @@ import numpy as np class Trainer(object): + """ + Class capable of fitting any model object to some prepared data then evaluate and save results through the `train` method. 
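One remark on the binary `'indicator'` metric above: for labels in {-1, +1} it evaluates to |2·accuracy - 1| rather than accuracy itself (a classifier that is always wrong also scores 1). A quick numeric check with made-up values:

```python
import numpy as np

y = np.array([1, -1, 1, 1])                     # true labels in {-1, +1}
predictions = np.array([0.7, -0.2, -0.3, 0.9])  # raw OMP outputs; the sign is the predicted class

# |sign(p) - y| is 0 when correct and 2 when wrong; subtracting 1 maps this to -1/+1,
# so the mean is (n_wrong - n_correct) / n = 1 - 2 * accuracy, and abs() folds the sign.
indicator = np.abs(np.mean(np.abs(np.sign(predictions) - y) - 1))
accuracy = np.mean(np.sign(predictions) == y)
print(indicator, abs(2 * accuracy - 1))  # 0.5 0.5  (3 out of 4 correct)
```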
+ """ def __init__(self, dataset): + """ + + :param dataset: Object with X_train, y_train, X_dev, y_dev, X_test and Y_test attributes + """ self._dataset = dataset self._logger = LoggerFactory.create(LOG_PATH, __name__) def train(self, model, models_dir): + """ + + :param model: Object with + :param models_dir: Where the results will be saved + :return: + """ + # todo cette fonction ne fait pas que "train", elle choisit le jeu de données, train et evalue le modèle -> nom à changer self._logger.debug('Training model using train set...') begin_time = time.time() @@ -45,16 +59,24 @@ class Trainer(object): ) end_time = time.time() - ModelRawResults( - forest=model.forest, - weights=model.weights, + results = ModelRawResults( + model_object=model, training_time=end_time - begin_time, datetime=datetime.datetime.now(), train_score=model.score(self._dataset.X_train, self._dataset.y_train), dev_score=model.score(self._dataset.X_dev, self._dataset.y_dev), test_score=model.score(self._dataset.X_test, self._dataset.y_test), score_metric=model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way - train_score_regressor=model.score_regressor(self._dataset.X_train, self._dataset.y_train), - dev_score_regressor=model.score_regressor(self._dataset.X_dev, self._dataset.y_dev), - test_score_regressor=model.score_regressor(self._dataset.X_test, self._dataset.y_test) - ).save(models_dir) + train_score_regressor=model.score_base_estimator(self._dataset.X_train, self._dataset.y_train), + dev_score_regressor=model.score_base_estimator(self._dataset.X_dev, self._dataset.y_dev), + test_score_regressor=model.score_base_estimator(self._dataset.X_test, self._dataset.y_test) + ) + results.save(models_dir) + self._logger.info("Base performance on test: {}".format(results.test_score_regressor)) + self._logger.info("Performance on test: {}".format(results.test_score)) + + self._logger.info("Base performance on train: {}".format(results.train_score_regressor)) + self._logger.info("Performance on train: {}".format(results.train_score)) + + self._logger.info("Base performance on dev: {}".format(results.dev_score_regressor)) + self._logger.info("Performance on dev: {}".format(results.dev_score)) diff --git a/code/bolsonaro/utils.py b/code/bolsonaro/utils.py index 82e501878ba06320914230096213d2d28548e4dc..21c7f72ac9173caf2cf1b5ccbbe6dde61193d1aa 100644 --- a/code/bolsonaro/utils.py +++ b/code/bolsonaro/utils.py @@ -1,6 +1,7 @@ import os import json import pickle +from copy import deepcopy def resolve_experiment_id(models_dir): @@ -45,3 +46,21 @@ def load_obj_from_pickle(file_path, constructor): with open(file_path, 'rb') as input_file: parameters = pickle.load(input_file) return constructor(**parameters) + +def binarize_class_data(data, class_pos, inplace=True): + """ + Replace class_pos by +1 and ~class_pos by -1. 
diff --git a/code/bolsonaro/utils.py b/code/bolsonaro/utils.py
index 82e501878ba06320914230096213d2d28548e4dc..21c7f72ac9173caf2cf1b5ccbbe6dde61193d1aa 100644
--- a/code/bolsonaro/utils.py
+++ b/code/bolsonaro/utils.py
@@ -1,6 +1,7 @@
 import os
 import json
 import pickle
+from copy import deepcopy
 
 
 def resolve_experiment_id(models_dir):
@@ -45,3 +46,21 @@ def load_obj_from_pickle(file_path, constructor):
     with open(file_path, 'rb') as input_file:
         parameters = pickle.load(input_file)
     return constructor(**parameters)
+
+def binarize_class_data(data, class_pos, inplace=True):
+    """
+    Replace class_pos by +1 and ~class_pos by -1.
+
+    :param data: an array of classes
+    :param class_pos: the positive class to be replaced by +1
+    :param inplace: if True, modify data in place (the array is returned in either case)
+    :return:
+    """
+    if not inplace:
+        data = deepcopy(data)
+
+    position_class_labels = (data == class_pos)
+    data[~position_class_labels] = -1
+    data[position_class_labels] = +1
+
+    return data
\ No newline at end of file
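A quick illustration of the helper's contract (made-up labels; note that with the default `inplace=True` the caller's array is mutated):

```python
import numpy as np

from bolsonaro.utils import binarize_class_data

labels = np.array([0, 1, 1, 0])
binarized = binarize_class_data(labels, class_pos=1, inplace=False)
print(binarized)  # [-1  1  1 -1]
print(labels)     # [0 1 1 0], untouched because inplace=False
```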
diff --git a/code/compute_hyperparameters.py b/code/compute_hyperparameters.py
index 199e060f3ee2e3a125a7af05e9205453ae079b83..414a7df007d1c0b020705f29d7481d9453391ab6 100644
--- a/code/compute_hyperparameters.py
+++ b/code/compute_hyperparameters.py
@@ -38,7 +38,7 @@ def clean_numpy_int_list(list_n):
 
 if __name__ == "__main__":
     # get environment variables in .env
-    load_dotenv(find_dotenv('.env.example'))
+    load_dotenv(find_dotenv('.env'))
 
     DEFAULT_CV = 3
     DEFAULT_N_ITER = 50
@@ -79,11 +79,10 @@
         dataset_parameters = DatasetParameters(dataset_name, test_size=0.2, dev_size=0.01, random_state=random_seed, dataset_normalizer=None)
         dataset = DatasetLoader.load(dataset_parameters)
 
-        if dataset.task == Task.CLASSIFICATION:
-            scorer = 'accuracy'
-
         if dataset.task == Task.REGRESSION:
             scorer = 'neg_mean_squared_error'
+        else:
+            scorer = 'accuracy'
 
         bayesian_searcher = HyperparameterSearcher()
         opt = bayesian_searcher.search(dataset, DICT_PARAM_SPACE, args.n_iter,
diff --git a/code/compute_results.py b/code/compute_results.py
index 0f26eb101a1910577593223a166e19b495f73d85..64124af70954cc6af6a923f03f5a122a75f453fb 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -12,7 +12,7 @@
 
 if __name__ == "__main__":
     # get environment variables in .env
-    load_dotenv(find_dotenv('.env.example'))
+    load_dotenv(find_dotenv('.env'))
 
     DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results'
     DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
diff --git a/code/train.py b/code/train.py
index d58871db980369efa254313b9997f5f9e99c0bbe..34c2003db8aef25d105831989b5c38b4e966f640 100644
--- a/code/train.py
+++ b/code/train.py
@@ -19,9 +19,21 @@ import json
 
 def process_job(seed, parameters, experiment_id, hyperparameters):
+    """
+    Experiment function.
+
+    Will be used as the base function for each worker in the multithreaded application.
+
+    :param seed:
+    :param parameters:
+    :param experiment_id:
+    :param hyperparameters:
+    :return:
+    """
     logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
         seed, threading.get_ident()))
     logger.info('seed={}'.format(seed))
+
     seed_str = str(seed)
     experiment_id_str = str(experiment_id)
     models_dir = parameters['models_dir'] + os.sep + experiment_id_str + os.sep + 'seeds' + \
@@ -36,12 +47,12 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
         dataset_normalizer=parameters['dataset_normalizer']
     )
     dataset_parameters.save(models_dir, experiment_id_str)
-
     dataset = DatasetLoader.load(dataset_parameters)
 
     trainer = Trainer(dataset)
 
     for extracted_forest_size in parameters['extracted_forest_size']:
+        # question: if training is too long, one may also split experiments for different forest sizes into different workers
        logger.info('extracted_forest_size={}'.format(extracted_forest_size))
         sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
         pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
@@ -62,8 +73,7 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
     logger.info('Training done')
 
 if __name__ == "__main__":
-    # get environment variables in .env
-    load_dotenv(find_dotenv('.env.example'))
+    load_dotenv(find_dotenv('.env'))
 
     DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
     DEFAULT_DATASET_NAME = 'boston'
@@ -110,6 +120,7 @@
 
     logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
 
+    # The number of trees to extract from the forest (K)
     parameters['extracted_forest_size'] = parameters['extracted_forest_size'] \
         if type(parameters['extracted_forest_size']) == list \
         else [parameters['extracted_forest_size']]
@@ -128,6 +139,7 @@
     if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
         logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
 
+    # Seeds are either provided as parameters or generated at random
     seeds = parameters['seeds'] if parameters['seeds'] is not None \
         else [random.randint(begin_random_seed_range, end_random_seed_range) \
             for i in range(parameters['random_seed_number'])]
diff --git a/experiments/.gitkeep b/experiments/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/experiments/boston/stage3/train+dev,train+dev/boston_train+dev,train+dev.json b/experiments/boston/stage3/train+dev,train+dev/boston_train+dev,train+dev.json
deleted file mode 100644
index b6dd49c4a7f9ef9b8ae97c1ac578d35f0a47c171..0000000000000000000000000000000000000000
--- a/experiments/boston/stage3/train+dev,train+dev/boston_train+dev,train+dev.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-    "dataset_name": "boston",
-    "normalize_D": false,
-    "dataset_normalizer": "standard",
-    "forest_size": 100,
-    "extracted_forest_size": [
-        10,
-        20,
-        30
-    ],
-    "models_dir": ".\\models",
-    "dev_size": 0.2,
-    "test_size": 0.2,
-    "random_seed_number": 3,
-    "seeds": null,
-    "subsets_used": "train+dev,train+dev",
-    "normalize_weights": false
-}
\ No newline at end of file
diff --git a/experiments/boston/stage3/train+dev,train+dev/boston_train+dev,train+dev_normalize-D.json b/experiments/boston/stage3/train+dev,train+dev/boston_train+dev,train+dev_normalize-D.json
deleted file mode 100644
index 8d50e1964663c6f4cd88efc2e7c85e4e19b2ced3..0000000000000000000000000000000000000000
--- a/experiments/boston/stage3/train+dev,train+dev/boston_train+dev,train+dev_normalize-D.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-    "dataset_name": "boston",
-    "normalize_D": true,
-    "dataset_normalizer": "standard",
-    "forest_size": 100,
-    "extracted_forest_size": [
-        10,
-        20,
-        30
-    ],
-    "models_dir": ".\\models",
-    "dev_size": 0.2,
-    "test_size": 0.2,
-    "random_seed_number": 3,
-    "seeds": null,
-    "subsets_used": "train+dev,train+dev",
-    "normalize_weights": false
-}
\ No newline at end of file
diff --git a/experiments/boston/stage3/train+dev,train+dev/boston_train+dev,train+dev_normalize-D_weights-normalization.json b/experiments/boston/stage3/train+dev,train+dev/boston_train+dev,train+dev_normalize-D_weights-normalization.json
deleted file mode 100644
index 2e7b19ec64d0d36048022df069377e3cb3b0d88e..0000000000000000000000000000000000000000
--- a/experiments/boston/stage3/train+dev,train+dev/boston_train+dev,train+dev_normalize-D_weights-normalization.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-    "dataset_name": "boston",
-    "normalize_D": true,
-    "dataset_normalizer": "standard",
-    "forest_size": 100,
-    "extracted_forest_size": [
-        10,
-        20,
-        30
-    ],
-    "models_dir": ".\\models",
-    "dev_size": 0.2,
-    "test_size": 0.2,
-    "random_seed_number": 3,
-    "seeds": null,
-    "subsets_used": "train+dev,train+dev",
-    "normalize_weights": true
-}
\ No newline at end of file
diff --git a/experiments/boston/stage3/train+dev,train+dev/boston_train+dev,train+dev_weights-normalization.json b/experiments/boston/stage3/train+dev,train+dev/boston_train+dev,train+dev_weights-normalization.json
deleted file mode 100644
index c0fa623dadbb7a142c9f3916428e225dea94ddba..0000000000000000000000000000000000000000
--- a/experiments/boston/stage3/train+dev,train+dev/boston_train+dev,train+dev_weights-normalization.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-    "dataset_name": "boston",
-    "normalize_D": false,
-    "dataset_normalizer": "standard",
-    "forest_size": 100,
-    "extracted_forest_size": [
-        10,
-        20,
-        30
-    ],
-    "models_dir": ".\\models",
-    "dev_size": 0.2,
-    "test_size": 0.2,
-    "random_seed_number": 3,
-    "seeds": null,
-    "subsets_used": "train+dev,train+dev",
-    "normalize_weights": true
-}
\ No newline at end of file
diff --git a/experiments/boston/stage3/train,dev/boston_train,dev.json b/experiments/boston/stage3/train,dev/boston_train,dev.json
deleted file mode 100644
index 0ffac35eb43a7568bb14a85010e538b094490b72..0000000000000000000000000000000000000000
--- a/experiments/boston/stage3/train,dev/boston_train,dev.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-    "dataset_name": "boston",
-    "normalize_D": false,
-    "dataset_normalizer": "standard",
-    "forest_size": 100,
-    "extracted_forest_size": [
-        10,
-        20,
-        30
-    ],
-    "models_dir": ".\\models",
-    "dev_size": 0.2,
-    "test_size": 0.2,
-    "random_seed_number": 3,
-    "seeds": null,
-    "subsets_used": "train,dev",
-    "normalize_weights": false
-}
\ No newline at end of file
diff --git a/experiments/boston/stage3/train,dev/boston_train,dev_normalize-D.json b/experiments/boston/stage3/train,dev/boston_train,dev_normalize-D.json
deleted file mode 100644
index d7f1c2e8427278615e76b7dc734c8936bef6fe57..0000000000000000000000000000000000000000
--- a/experiments/boston/stage3/train,dev/boston_train,dev_normalize-D.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-    "dataset_name": "boston",
-    "normalize_D": true,
-    "dataset_normalizer": "standard",
-    "forest_size": 100,
-    "extracted_forest_size": [
-        10,
-        20,
-        30
-    ],
-    "models_dir": ".\\models",
-    "dev_size": 0.2,
-    "test_size": 0.2,
-    "random_seed_number": 3,
-    "seeds": null,
-    "subsets_used": "train,dev",
-    "normalize_weights": false
-}
\ No newline at end of file
diff --git a/experiments/boston/stage3/train,dev/boston_train,dev_normalize-D_weights-normalization.json b/experiments/boston/stage3/train,dev/boston_train,dev_normalize-D_weights-normalization.json
deleted file mode 100644
index 824133af36f3c226799c3d5d025f3cfab9fbd421..0000000000000000000000000000000000000000
--- a/experiments/boston/stage3/train,dev/boston_train,dev_normalize-D_weights-normalization.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-    "dataset_name": "boston",
-    "normalize_D": true,
-    "dataset_normalizer": "standard",
-    "forest_size": 100,
-    "extracted_forest_size": [
-        10,
-        20,
-        30
-    ],
-    "models_dir": ".\\models",
-    "dev_size": 0.2,
-    "test_size": 0.2,
-    "random_seed_number": 3,
-    "seeds": null,
-    "subsets_used": "train,dev",
-    "normalize_weights": true
-}
\ No newline at end of file
diff --git a/experiments/boston/stage3/train,dev/boston_train,dev_weights-normalization.json b/experiments/boston/stage3/train,dev/boston_train,dev_weights-normalization.json
deleted file mode 100644
index 45e91739f838f6c1dbcc94e6dd5da136eca08f1d..0000000000000000000000000000000000000000
--- a/experiments/boston/stage3/train,dev/boston_train,dev_weights-normalization.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-    "dataset_name": "boston",
-    "normalize_D": false,
-    "dataset_normalizer": "standard",
-    "forest_size": 100,
-    "extracted_forest_size": [
-        10,
-        20,
-        30
-    ],
-    "models_dir": ".\\models",
-    "dev_size": 0.2,
-    "test_size": 0.2,
-    "random_seed_number": 3,
-    "seeds": null,
-    "subsets_used": "train,dev",
-    "normalize_weights": true
-}
\ No newline at end of file
diff --git a/experiments/boston/stage3/train,train+dev/boston_train,train+dev.json b/experiments/boston/stage3/train,train+dev/boston_train,train+dev.json
deleted file mode 100644
index 4da1e6d4b9b10d620b23adee3a6b1719078da01d..0000000000000000000000000000000000000000
--- a/experiments/boston/stage3/train,train+dev/boston_train,train+dev.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-    "dataset_name": "boston",
-    "normalize_D": false,
-    "dataset_normalizer": "standard",
-    "forest_size": 100,
-    "extracted_forest_size": [
-        10,
-        20,
-        30
-    ],
-    "models_dir": ".\\models",
-    "dev_size": 0.2,
-    "test_size": 0.2,
-    "random_seed_number": 3,
-    "seeds": null,
-    "subsets_used": "train,train+dev",
-    "normalize_weights": false
-}
\ No newline at end of file
diff --git a/experiments/boston/stage3/train,train+dev/boston_train,train+dev_normalize-D.json b/experiments/boston/stage3/train,train+dev/boston_train,train+dev_normalize-D.json
deleted file mode 100644
index ccc9befa778ccac3eb5d9efeebaa3fb8f1624c61..0000000000000000000000000000000000000000
--- a/experiments/boston/stage3/train,train+dev/boston_train,train+dev_normalize-D.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-    "dataset_name": "boston",
-    "normalize_D": true,
-    "dataset_normalizer": "standard",
-    "forest_size": 100,
-    "extracted_forest_size": [
-        10,
-        20,
-        30
-    ],
-    "models_dir": ".\\models",
-    "dev_size": 0.2,
-    "test_size": 0.2,
-    "random_seed_number": 3,
-    "seeds": null,
-    "subsets_used": "train,train+dev",
-    "normalize_weights": false
-}
\ No newline at end of file
diff --git a/experiments/boston/stage3/train,train+dev/boston_train,train+dev_normalize-D_weights-normalization.json b/experiments/boston/stage3/train,train+dev/boston_train,train+dev_normalize-D_weights-normalization.json
deleted file mode 100644
index 93c0082c477841a765b3feb4bda6d4529ee14dcc..0000000000000000000000000000000000000000
--- a/experiments/boston/stage3/train,train+dev/boston_train,train+dev_normalize-D_weights-normalization.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-    "dataset_name": "boston",
-    "normalize_D": true,
-    "dataset_normalizer": "standard",
-    "forest_size": 100,
-    "extracted_forest_size": [
-        10,
-        20,
-        30
-    ],
-    "models_dir": ".\\models",
-    "dev_size": 0.2,
-    "test_size": 0.2,
-    "random_seed_number": 3,
-    "seeds": null,
-    "subsets_used": "train,train+dev",
-    "normalize_weights": true
-}
\ No newline at end of file
diff --git a/experiments/boston/stage3/train,train+dev/boston_train,train+dev_weights-normalization.json b/experiments/boston/stage3/train,train+dev/boston_train,train+dev_weights-normalization.json
deleted file mode 100644
index ed3bf0823c1d2c7b6da82f9554b492820ca9c638..0000000000000000000000000000000000000000
--- a/experiments/boston/stage3/train,train+dev/boston_train,train+dev_weights-normalization.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-    "dataset_name": "boston",
-    "normalize_D": false,
-    "dataset_normalizer": "standard",
-    "forest_size": 100,
-    "extracted_forest_size": [
-        10,
-        20,
-        30
-    ],
-    "models_dir": ".\\models",
-    "dev_size": 0.2,
-    "test_size": 0.2,
-    "random_seed_number": 3,
-    "seeds": null,
-    "subsets_used": "train,train+dev",
-    "normalize_weights": true
-}
\ No newline at end of file
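`process_job` is written to be handed to workers, one seed each; the dispatch code itself falls outside the hunks shown above. A hypothetical sketch of that pattern, assuming `concurrent.futures` and the names defined in `train.py`'s main block:

```python
from concurrent.futures import ThreadPoolExecutor

# Hypothetical dispatch: one worker per seed, as process_job's docstring suggests.
with ThreadPoolExecutor() as executor:
    for seed in seeds:
        executor.submit(process_job, seed, parameters, experiment_id, hyperparameters)
```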