diff --git a/.gitignore b/.gitignore index d7c9832ff3c60eeb0b764a3b8d40fa4dad2cfee5..ed07278aa03dbf293f143b22d927fa9f08876edb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +models/* +results/* + +*/.kile/* +*.kilepr # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -87,3 +92,285 @@ target/ # Mypy cache .mypy_cache/ + + +# latex + +## Core latex/pdflatex auxiliary files: +*.aux +*.lof +*.log +*.lot +*.fls +*.out +*.toc +*.fmt +*.fot +*.cb +*.cb2 +.*.lb + +## Intermediate documents: +*.dvi +*.xdv +*-converted-to.* +# these rules might exclude image files for figures etc. +# *.ps +# *.eps +# *.pdf + +## Generated if empty string is given at "Please type another file name for output:" +.pdf + +## Bibliography auxiliary files (bibtex/biblatex/biber): +*.bbl +*.bcf +*.blg +*-blx.aux +*-blx.bib +*.run.xml + +## Build tool auxiliary files: +*.fdb_latexmk +*.synctex +*.synctex(busy) +*.synctex.gz +*.synctex.gz(busy) +*.pdfsync + +## Build tool directories for auxiliary files +# latexrun +latex.out/ + +## Auxiliary and intermediate files from other packages: +# algorithms +*.alg +*.loa + +# achemso +acs-*.bib + +# amsthm +*.thm + +# beamer +*.nav +*.pre +*.snm +*.vrb + +# changes +*.soc + +# comment +*.cut + +# cprotect +*.cpt + +# elsarticle (documentclass of Elsevier journals) +*.spl + +# endnotes +*.ent + +# fixme +*.lox + +# feynmf/feynmp +*.mf +*.mp +*.t[1-9] +*.t[1-9][0-9] +*.tfm + +#(r)(e)ledmac/(r)(e)ledpar +*.end +*.?end +*.[1-9] +*.[1-9][0-9] +*.[1-9][0-9][0-9] +*.[1-9]R +*.[1-9][0-9]R +*.[1-9][0-9][0-9]R +*.eledsec[1-9] +*.eledsec[1-9]R +*.eledsec[1-9][0-9] +*.eledsec[1-9][0-9]R +*.eledsec[1-9][0-9][0-9] +*.eledsec[1-9][0-9][0-9]R + +# glossaries +*.acn +*.acr +*.glg +*.glo +*.gls +*.glsdefs +*.lzo +*.lzs + +# uncomment this for glossaries-extra (will ignore makeindex's style files!) 
+# *.ist + +# gnuplottex +*-gnuplottex-* + +# gregoriotex +*.gaux +*.gtex + +# htlatex +*.4ct +*.4tc +*.idv +*.lg +*.trc +*.xref + +# hyperref +*.brf + +# knitr +*-concordance.tex +# TODO Comment the next line if you want to keep your tikz graphics files +*.tikz +*-tikzDictionary + +# listings +*.lol + +# luatexja-ruby +*.ltjruby + +# makeidx +*.idx +*.ilg +*.ind + +# minitoc +*.maf +*.mlf +*.mlt +*.mtc[0-9]* +*.slf[0-9]* +*.slt[0-9]* +*.stc[0-9]* + +# minted +_minted* +*.pyg + +# morewrites +*.mw + +# nomencl +*.nlg +*.nlo +*.nls + +# pax +*.pax + +# pdfpcnotes +*.pdfpc + +# sagetex +*.sagetex.sage +*.sagetex.py +*.sagetex.scmd + +# scrwfile +*.wrt + +# sympy +*.sout +*.sympy +sympy-plots-for-*.tex/ + +# pdfcomment +*.upa +*.upb + +# pythontex +*.pytxcode +pythontex-files-*/ + +# tcolorbox +*.listing + +# thmtools +*.loe + +# TikZ & PGF +*.dpth +*.md5 +*.auxlock + +# todonotes +*.tdo + +# vhistory +*.hst +*.ver + +# easy-todo +*.lod + +# xcolor +*.xcp + +# xmpincl +*.xmpi + +# xindy +*.xdy + +# xypic precompiled matrices and outlines +*.xyc +*.xyd + +# endfloat +*.ttt +*.fff + +# Latexian +TSWLatexianTemp* + +## Editors: +# WinEdt +*.bak +*.sav + +# Texpad +.texpadtmp + +# LyX +*.lyx~ + +# Kile +*.backup + +# gummi +.*.swp + +# KBibTeX +*~[0-9]* + +# auto folder when using emacs and auctex +./auto/* +*.el + +# expex forward references with \gathertags +*-tags.tex + +# standalone packages +*.sta + +# Makeindex log files +*.lpz + +reports/*.pdf + +# Image +*.png diff --git a/README.md b/README.md index 8ffb3de812f21466cd7bfe0aaf8c92518529963c..8d46ebd2a36736d601c1dc94ee3f19c5118555fd 100644 --- a/README.md +++ b/README.md @@ -49,5 +49,16 @@ Project Organization Install project -------------- +First install the project package: + pip install -r requirements.txt + +Then create a file `.env` by copying the file `.env.example`: + cp .env.example .env + +Then you must set the project directory in the `.env` file: + + project_dir = "path/to/your/project/directory" + +This directory will be used for storing the model parameters. \ No newline at end of file diff --git a/TODO.md b/TODO.md index bfb32e8a131b5147b36c9ccba729a6e13e04e5b7..5ea6cc5cf2c933eed2e7ffbf2567d4fe812412cf 100644 --- a/TODO.md +++ b/TODO.md @@ -1,8 +1,7 @@ -* Find relevant datasets -* Train and test forests of different sizes -* Train and test in regression and classification -* Train and test on different modalities (not only image datasets) -* Train with different hyperparameters (d, depth, epsilon) -* Apply OMP with different values of k (especially a small k) -* Make figures -* Implement and compare competing systems \ No newline at end of file +* Fix pickle loading of ModelRawResults, because saving the model_object leads to import issues. +* Fix the ModelFactory.load function. +* Fix model results loading in compute_results.py. +* Check that the OMP multiclass classifier is working as expected. +* In the Bayesian search computation, output a different file name depending on the task of the trained model. +* Check the best params scores of the regressors (neg_mean_squared_error leads to huge negative values). +* Prepare the json experiment files to run.
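For reference, a filled-in `.env` as described in the README hunk above might look like the following; the path is machine-specific and purely illustrative:

    project_dir = "/home/user/projects/bolsonaro"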
\ No newline at end of file diff --git a/bolsonaro/data/__init__.py b/bolsonaro/data/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/bolsonaro/data/make_dataset.py b/bolsonaro/data/make_dataset.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/bolsonaro/example.py b/bolsonaro/example.py deleted file mode 100644 index 6e7ed0920d742ea6474b5aa62d42d7f2572b47e4..0000000000000000000000000000000000000000 --- a/bolsonaro/example.py +++ /dev/null @@ -1,3 +0,0 @@ -from bolsonaro.utils import root_directory - -print(str(root_directory)) \ No newline at end of file diff --git a/bolsonaro/models/__init__.py b/bolsonaro/models/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/bolsonaro/models/create_model.py b/bolsonaro/models/create_model.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/bolsonaro/utils.py b/bolsonaro/utils.py deleted file mode 100644 index 6761ac86095a3420cc371a6f2b7aac0a629acc14..0000000000000000000000000000000000000000 --- a/bolsonaro/utils.py +++ /dev/null @@ -1,3 +0,0 @@ -from pathlib import Path - -root_directory = Path(__file__).parent.parent.absolute() diff --git a/bolsonaro/visualization/__init__.py b/bolsonaro/visualization/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/bolsonaro/visualization/visualize.py b/bolsonaro/visualization/visualize.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/code/bolsonaro/__init__.py b/code/bolsonaro/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ce8e424d0cc66e3f349fcafd744216d818bacaa5 --- /dev/null +++ b/code/bolsonaro/__init__.py @@ -0,0 +1,3 @@ +import os + +LOG_PATH = os.path.abspath(os.path.dirname(__file__) + os.sep + '..' + os.sep + '..' + os.sep + 'log') diff --git a/bolsonaro/data/.gitkeep b/code/bolsonaro/data/.gitkeep similarity index 100% rename from bolsonaro/data/.gitkeep rename to code/bolsonaro/data/.gitkeep diff --git a/code/bolsonaro/data/__init__.py b/code/bolsonaro/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ce8e424d0cc66e3f349fcafd744216d818bacaa5 --- /dev/null +++ b/code/bolsonaro/data/__init__.py @@ -0,0 +1,3 @@ +import os + +LOG_PATH = os.path.abspath(os.path.dirname(__file__) + os.sep + '..' + os.sep + '..' 
+ os.sep + 'log') diff --git a/code/bolsonaro/data/dataset.py b/code/bolsonaro/data/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..7108eb5781e1ef2b69926e9fd2239deaa81f44e2 --- /dev/null +++ b/code/bolsonaro/data/dataset.py @@ -0,0 +1,39 @@ +class Dataset(object): + + def __init__(self, task, X_train, X_dev, X_test, y_train, + y_dev, y_test): + self._task = task + self._X_train = X_train + self._X_dev = X_dev + self._X_test = X_test + self._y_train = y_train + self._y_dev = y_dev + self._y_test = y_test + + @property + def task(self): + return self._task + + @property + def X_train(self): + return self._X_train + + @property + def X_dev(self): + return self._X_dev + + @property + def X_test(self): + return self._X_test + + @property + def y_train(self): + return self._y_train + + @property + def y_dev(self): + return self._y_dev + + @property + def y_test(self): + return self._y_test diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..b22ecaa83f6e69610c82796e068e91db23e83646 --- /dev/null +++ b/code/bolsonaro/data/dataset_loader.py @@ -0,0 +1,94 @@ +from bolsonaro.data.dataset import Dataset +from bolsonaro.data.task import Task +from bolsonaro.utils import change_binary_func_load + +from sklearn.datasets import load_boston, load_iris, load_diabetes, \ + load_digits, load_linnerud, load_wine, load_breast_cancer +from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \ + fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \ + fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing +from sklearn.model_selection import train_test_split +from sklearn import preprocessing + + +class DatasetLoader(object): + + @staticmethod + def load(dataset_parameters): + name = dataset_parameters.name + if name == 'boston': + dataset_loading_func = load_boston + task = Task.REGRESSION + elif name == 'iris': + dataset_loading_func = load_iris + task = Task.MULTICLASSIFICATION + elif name == 'diabetes': + dataset_loading_func = load_diabetes + task = Task.REGRESSION + elif name == 'digits': + dataset_loading_func = load_digits + task = Task.MULTICLASSIFICATION + elif name == 'linnerud': + dataset_loading_func = load_linnerud + task = Task.REGRESSION + elif name == 'wine': + dataset_loading_func = load_wine + task = Task.MULTICLASSIFICATION + elif name == 'breast_cancer': + dataset_loading_func = change_binary_func_load(load_breast_cancer) + task = Task.BINARYCLASSIFICATION + elif name == 'olivetti_faces': # bug (no return X_y) + dataset_loading_func = fetch_olivetti_faces + task = Task.MULTICLASSIFICATION + elif name == '20newsgroups': # bug (no return X_y) + dataset_loading_func = fetch_20newsgroups + task = Task.MULTICLASSIFICATION + elif name == '20newsgroups_vectorized': + dataset_loading_func = fetch_20newsgroups_vectorized + task = Task.MULTICLASSIFICATION + elif name == 'lfw_people': # needs PIL (image dataset) + dataset_loading_func = fetch_lfw_people + task = Task.MULTICLASSIFICATION + elif name == 'lfw_pairs': + dataset_loading_func = fetch_lfw_pairs + task = Task.MULTICLASSIFICATION + elif name == 'covtype': + dataset_loading_func = fetch_covtype + task = Task.MULTICLASSIFICATION + elif name == 'rcv1': + dataset_loading_func = fetch_rcv1 + task = Task.MULTICLASSIFICATION + elif name == 'kddcup99': + dataset_loading_func = fetch_kddcup99 + task
= Task.MULTICLASSIFICATION + elif name == 'california_housing': + dataset_loading_func = fetch_california_housing + task = Task.REGRESSION + else: + raise ValueError("Unsupported dataset '{}'".format(name)) + + X, y = dataset_loading_func(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, + test_size=dataset_parameters.test_size, + random_state=dataset_parameters.random_state) + X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, + test_size=dataset_parameters.dev_size, + random_state=dataset_parameters.random_state) + + if dataset_parameters.dataset_normalizer is not None: + if dataset_parameters.dataset_normalizer == 'standard': + scaler = preprocessing.StandardScaler() + elif dataset_parameters.dataset_normalizer == 'minmax': + scaler = preprocessing.MinMaxScaler() + elif dataset_parameters.dataset_normalizer == 'robust': + scaler = preprocessing.RobustScaler() + elif dataset_parameters.dataset_normalizer == 'normalizer': + scaler = preprocessing.Normalizer() + else: + raise ValueError("Unsupported normalizer '{}'".format(dataset_parameters.dataset_normalizer)) + X_train = scaler.fit_transform(X_train) + X_dev = scaler.transform(X_dev) + X_test = scaler.transform(X_test) + + return Dataset(task, X_train, + X_dev, X_test, y_train, y_dev, y_test) diff --git a/code/bolsonaro/data/dataset_parameters.py b/code/bolsonaro/data/dataset_parameters.py new file mode 100644 index 0000000000000000000000000000000000000000..88054257d4241ae0426c16ebaca1cb4985c3b65f --- /dev/null +++ b/code/bolsonaro/data/dataset_parameters.py @@ -0,0 +1,42 @@ +from bolsonaro.utils import save_obj_to_json, load_obj_from_json + +import os + + +class DatasetParameters(object): + + def __init__(self, name, test_size, dev_size, random_state, dataset_normalizer): + self._name = name + self._test_size = test_size + self._dev_size = dev_size + self._random_state = random_state + self._dataset_normalizer = dataset_normalizer + + @property + def name(self): + return self._name + + @property + def test_size(self): + return self._test_size + + @property + def dev_size(self): + return self._dev_size + + @property + def random_state(self): + return self._random_state + + @property + def dataset_normalizer(self): + return self._dataset_normalizer + + def save(self, directory_path, experiment_id): + save_obj_to_json(directory_path + os.sep + 'dataset_parameters_{}.json'.format(experiment_id), + self.__dict__) + + @staticmethod + def load(directory_path, experiment_id): + return load_obj_from_json(directory_path + os.sep + 'dataset_parameters_{}.json'.format(experiment_id), + DatasetParameters) diff --git a/code/bolsonaro/data/task.py b/code/bolsonaro/data/task.py new file mode 100644 index 0000000000000000000000000000000000000000..f1214a64a27873e49f5dbbcb853e4f65f9b07f68 --- /dev/null +++ b/code/bolsonaro/data/task.py @@ -0,0 +1,7 @@ +from enum import Enum + + +class Task(Enum): + BINARYCLASSIFICATION = 1 + REGRESSION = 2 + MULTICLASSIFICATION = 3 diff --git a/code/bolsonaro/error_handling/__init__.py b/code/bolsonaro/error_handling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a8ca18d974258231377a6e37b90e5278d81b42f7 --- /dev/null +++ b/code/bolsonaro/error_handling/__init__.py @@ -0,0 +1,29 @@ + ##################################################################################### + # MIT License # + # # + # Copyright (C) 2019 Charly Lamothe # + # # + # This file is part of VQ-VAE-Speech. 
# + # # + # Permission is hereby granted, free of charge, to any person obtaining a copy # + # of this software and associated documentation files (the "Software"), to deal # + # in the Software without restriction, including without limitation the rights # + # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # + # copies of the Software, and to permit persons to whom the Software is # + # furnished to do so, subject to the following conditions: # + # # + # The above copyright notice and this permission notice shall be included in all # + # copies or substantial portions of the Software. # + # # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # + # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # + # SOFTWARE. # + ##################################################################################### + +import os + +LOG_PATH = os.path.abspath(os.path.dirname(__file__) + os.sep + '..' + os.sep + '..' + os.sep + 'log') diff --git a/code/bolsonaro/error_handling/logger_factory.py b/code/bolsonaro/error_handling/logger_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..09a7ca68033b9f1960a1d9d98a9c4553653313e1 --- /dev/null +++ b/code/bolsonaro/error_handling/logger_factory.py @@ -0,0 +1,66 @@ + ##################################################################################### + # MIT License # + # # + # Copyright (C) 2019 Charly Lamothe # + # # + # This file is part of VQ-VAE-Speech. # + # # + # Permission is hereby granted, free of charge, to any person obtaining a copy # + # of this software and associated documentation files (the "Software"), to deal # + # in the Software without restriction, including without limitation the rights # + # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # + # copies of the Software, and to permit persons to whom the Software is # + # furnished to do so, subject to the following conditions: # + # # + # The above copyright notice and this permission notice shall be included in all # + # copies or substantial portions of the Software. # + # # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # + # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # + # SOFTWARE. 
# + ##################################################################################### + +import logging +from logging.handlers import RotatingFileHandler +import os +import errno + + +class LoggerFactory(object): + + @staticmethod + def create(path, module_name): + # Create logger + logger = logging.getLogger(module_name) + logger.setLevel(logging.DEBUG) + + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise + + # Create file handler + fh = RotatingFileHandler(path + os.sep + module_name + '.log', maxBytes=1000000, backupCount=5) + fh.setLevel(logging.DEBUG) + + # Create console handler + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + + # Create formatter + formatter = logging.Formatter('%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(levelname)s - %(message)s') + + # Add formatter to handlers + fh.setFormatter(formatter) + ch.setFormatter(formatter) # TODO: add another formatter to the console logger? + + # Add fh and ch to logger + logger.addHandler(fh) + logger.addHandler(ch) + + return logger diff --git a/code/bolsonaro/hyperparameter_searcher.py b/code/bolsonaro/hyperparameter_searcher.py new file mode 100644 index 0000000000000000000000000000000000000000..7884d2d4271203e9ebee1e804baa7c1e94a76770 --- /dev/null +++ b/code/bolsonaro/hyperparameter_searcher.py @@ -0,0 +1,47 @@ +''' +This module is used to find the best hyperparameters for a given dataset. +''' + +from bolsonaro.data.dataset_parameters import DatasetParameters +from bolsonaro.data.dataset_loader import DatasetLoader +from bolsonaro.data.task import Task +from bolsonaro.error_handling.logger_factory import LoggerFactory +from . import LOG_PATH + +from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier +from skopt import BayesSearchCV + + +class HyperparameterSearcher(object): + + def __init__(self): + self._logger = LoggerFactory.create(LOG_PATH, __name__) + + def search(self, dataset, hyperparameter_space, n_iter, cv, + random_seed, scorer, verbose=False): + ''' + For a given dataset and a space of hyperparameters, performs a + Bayesian hyperparameter search. + :input dataset: a Dataset object + :input hyperparameter_space: a dictionary whose keys are hyperparameters and + whose values are their search spaces, defined with skopt + :input n_iter: the number of iterations of the Bayesian search + :input cv: the number of cross-validation folds + :input random_seed: int, the seed for the Bayesian search + :input scorer: str, the name of the scorer + :input verbose: bool, print the progress of the search + :return: a skopt.searchcv.BayesSearchCV object + ''' + + if dataset.task == Task.REGRESSION: + estimator = RandomForestRegressor(n_jobs=-1, random_state=random_seed) + else: + estimator = RandomForestClassifier(n_jobs=-1, random_state=random_seed) + + opt = BayesSearchCV(estimator, hyperparameter_space, n_iter=n_iter, + cv=cv, n_jobs=-1, random_state=random_seed, + scoring=scorer, verbose=verbose) + + opt.fit(dataset.X_train, dataset.y_train) + + return opt diff --git a/bolsonaro/models/.gitkeep b/code/bolsonaro/models/.gitkeep similarity index 100% rename from bolsonaro/models/.gitkeep rename to code/bolsonaro/models/.gitkeep diff --git a/code/bolsonaro/models/__init__.py b/code/bolsonaro/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ce8e424d0cc66e3f349fcafd744216d818bacaa5 --- /dev/null +++ b/code/bolsonaro/models/__init__.py @@ -0,0 +1,3 @@ +import os + +LOG_PATH = os.path.abspath(os.path.dirname(__file__) + os.sep + '..' + os.sep + '..'
+ os.sep + 'log') diff --git a/code/bolsonaro/models/model_factory.py b/code/bolsonaro/models/model_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..a93e6090e253dc9bdb3aacfc53e1c99a1f9ef120 --- /dev/null +++ b/code/bolsonaro/models/model_factory.py @@ -0,0 +1,32 @@ +from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier +from bolsonaro.models.omp_forest_regressor import OmpForestRegressor +from bolsonaro.data.task import Task +from bolsonaro.models.model_parameters import ModelParameters + +import os +import pickle + + +class ModelFactory(object): + + @staticmethod + def build(task, model_parameters): + if task == Task.BINARYCLASSIFICATION: + model_func = OmpForestBinaryClassifier + elif task == Task.REGRESSION: + model_func = OmpForestRegressor + elif task == Task.MULTICLASSIFICATION: + model_func = OmpForestMulticlassClassifier + else: + raise ValueError("Unsupported task '{}'".format(task)) + return model_func(model_parameters) + + @staticmethod + def load(task, directory_path, experiment_id, model_raw_results): + raise NotImplementedError + model_parameters = ModelParameters.load(directory_path, experiment_id) + model = ModelFactory.build(task, model_parameters) + # TODO: do what is needed here to restore the model properly (loading is still broken, see TODO.md) + model.set_forest(model_raw_results.model_object.forest) + model.set_weights(model_raw_results.model_object.weights) + return model diff --git a/code/bolsonaro/models/model_parameters.py b/code/bolsonaro/models/model_parameters.py new file mode 100644 index 0000000000000000000000000000000000000000..31a451b70578835fe4663508de9e15f99bf6cc19 --- /dev/null +++ b/code/bolsonaro/models/model_parameters.py @@ -0,0 +1,47 @@ +from bolsonaro.utils import save_obj_to_json, load_obj_from_json + +import os + + +class ModelParameters(object): + + def __init__(self, extracted_forest_size, normalize_D, subsets_used, normalize_weights, seed, hyperparameters): + self._extracted_forest_size = extracted_forest_size + self._normalize_D = normalize_D + self._subsets_used = subsets_used + self._normalize_weights = normalize_weights + self._seed = seed + self._hyperparameters = hyperparameters + + @property + def extracted_forest_size(self): + return self._extracted_forest_size + + @property + def normalize_D(self): + return self._normalize_D + + @property + def subsets_used(self): + return self._subsets_used + + @property + def normalize_weights(self): + return self._normalize_weights + + @property + def seed(self): + return self._seed + + @property + def hyperparameters(self): + return self._hyperparameters + + def save(self, directory_path, experiment_id): + save_obj_to_json(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), + self.__dict__) + + @staticmethod + def load(directory_path, experiment_id): + return load_obj_from_json(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), + ModelParameters) diff --git a/code/bolsonaro/models/model_raw_results.py b/code/bolsonaro/models/model_raw_results.py new file mode 100644 index 0000000000000000000000000000000000000000..df8b2ec0b10704a8a8c397b9012298e8b901e14b --- /dev/null +++ b/code/bolsonaro/models/model_raw_results.py @@ -0,0 +1,72 @@ +from bolsonaro.utils import save_obj_to_pickle, load_obj_from_pickle + +import os +import datetime + + +class ModelRawResults(object): + + def __init__(self, model_object, training_time, + datetime, train_score, dev_score, test_score, + score_metric,
train_score_regressor, dev_score_regressor, + test_score_regressor): + + self._model_object = model_object + self._training_time = training_time + self._datetime = datetime + self._train_score = train_score + self._dev_score = dev_score + self._test_score = test_score + self._score_metric = score_metric + self._train_score_regressor = train_score_regressor + self._dev_score_regressor = dev_score_regressor + self._test_score_regressor = test_score_regressor + + @property + def model_object(self): + return self._model_object + + @property + def training_time(self): + return self._training_time + + @property + def datetime(self): + return self._datetime + + @property + def train_score(self): + return self._train_score + + @property + def dev_score(self): + return self._dev_score + + @property + def test_score(self): + return self._test_score + + @property + def score_metric(self): + return self._score_metric + + @property + def train_score_regressor(self): + return self._train_score_regressor + + @property + def dev_score_regressor(self): + return self._dev_score_regressor + + @property + def test_score_regressor(self): + return self._test_score_regressor + + def save(self, models_dir): + save_obj_to_pickle(models_dir + os.sep + 'model_raw_results.pickle', + self.__dict__) + + @staticmethod + def load(models_dir): + return load_obj_from_pickle(models_dir + os.sep + 'model_raw_results.pickle', + ModelRawResults) diff --git a/code/bolsonaro/models/omp_forest.py b/code/bolsonaro/models/omp_forest.py new file mode 100644 index 0000000000000000000000000000000000000000..2da0beab64ef5361efbde6d6197f957fe627886c --- /dev/null +++ b/code/bolsonaro/models/omp_forest.py @@ -0,0 +1,121 @@ +from bolsonaro import LOG_PATH +from bolsonaro.error_handling.logger_factory import LoggerFactory + +from abc import abstractmethod, ABCMeta +import numpy as np +from sklearn.linear_model import OrthogonalMatchingPursuit +from sklearn.base import BaseEstimator + + +class OmpForest(BaseEstimator, metaclass=ABCMeta): + def __init__(self, models_parameters, base_forest_estimator): + self._base_forest_estimator = base_forest_estimator + self._models_parameters = models_parameters + self._logger = LoggerFactory.create(LOG_PATH, __name__) + + @property + def models_parameters(self): + return self._models_parameters + + def score_base_estimator(self, X, y): + return self._base_forest_estimator.score(X, y) + + def _base_estimator_predictions(self, X): + return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T + + @property + def forest(self): + return self._base_forest_estimator.estimators_ + + # sklearn BaseEstimator API methods + def fit(self, X_forest, y_forest, X_omp, y_omp): + self._base_forest_estimator.fit(X_forest, y_forest) + self._extract_subforest(X_omp, y_omp) # type: OrthogonalMatchingPursuit + return self + + def _extract_subforest(self, X, y): + """ + Given an already estimated regressor: apply OMP to get the weight of each tree. + + The X data is used to query every tree in the forest. The y data + is used to find the weights in OMP. + + :param X: (n_sample, n_features) array + :param y: (n_sample,) array + :return: + """ + self._logger.debug("Forest make prediction on X") + D = self._base_estimator_predictions(X) + + if self._models_parameters.normalize_D: + # question: maybe consider other kinds of normalization.. centering?
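+ # Each column of D holds one tree's predictions; dividing by the column norms makes every atom unit-norm, so OMP selects trees by correlation with the target rather than by prediction magnitude.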
+ self._logger.debug("Compute norm of predicted vectors on X") + self._forest_norms = np.linalg.norm(D, axis=0) + D /= self._forest_norms + + self._logger.debug("Apply orthogonal matching pursuit on forest for {} extracted trees." + .format(self._models_parameters.extracted_forest_size)) + + self.fit_omp(D, y) + + @staticmethod + def _make_omp_weighted_prediction(base_predictions, omp_obj, normalize_weights=False): + if normalize_weights: + # we can normalize weights (by their sum) so that they sum to 1 + # and they can be interpreted as impact percentages for interpretability. + # this requires removing the (-) sign from the weights, e.g. moving it to the predictions (use unsigned_coef) + + # question: I don't understand the nonzero trick? + # predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_)))) + coef_signs = np.sign(omp_obj.coef_)[np.newaxis, :] # add axis to make sure it will be broadcasted line-wise (there might be a confusion when forest_prediction is square) + unsigned_coef = (coef_signs * omp_obj.coef_).squeeze() + intercept = omp_obj.intercept_ + + adjusted_forest_predictions = base_predictions * coef_signs + predictions = adjusted_forest_predictions.dot(unsigned_coef) + intercept + + else: + predictions = omp_obj.predict(base_predictions) + + return predictions + + @abstractmethod + def fit_omp(self, atoms, objective): + pass + + @abstractmethod + def predict(self, X): + pass + + @abstractmethod + def score(self, X, y): + pass + +class SingleOmpForest(OmpForest): + def __init__(self, models_parameters, base_forest_estimator): + # fit_intercept shouldn't be set to False as the data isn't necessarily centered here + # normalization is handled outside OMP + self._omp = OrthogonalMatchingPursuit( + n_nonzero_coefs=models_parameters.extracted_forest_size, + fit_intercept=True, normalize=False) + + super().__init__(models_parameters, base_forest_estimator) + + def fit_omp(self, atoms, objective): + self._omp.fit(atoms, objective) + + def predict(self, X): + """ + Apply the SingleOmpForest to X. + + Make all the base tree predictions, then apply the OMP weights for pruning.
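+ The prediction is the dot product of the tree predictions with the sparse OMP coefficient vector (plus intercept), so trees with a zero coefficient are effectively pruned.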
+ + :param X: + :return: + """ + forest_predictions = self._base_estimator_predictions(X) + + if self._models_parameters.normalize_D: + forest_predictions /= self._forest_norms + + return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights) diff --git a/code/bolsonaro/models/omp_forest_classifier.py b/code/bolsonaro/models/omp_forest_classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..128347aa61caf79dc908397ef0588d646d8b0dee --- /dev/null +++ b/code/bolsonaro/models/omp_forest_classifier.py @@ -0,0 +1,106 @@ +from bolsonaro.models.omp_forest import OmpForest, SingleOmpForest +from bolsonaro.utils import binarize_class_data + +import numpy as np +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import OrthogonalMatchingPursuit + + +class OmpForestBinaryClassifier(SingleOmpForest): + + DEFAULT_SCORE_METRIC = 'indicator' + + def __init__(self, models_parameters): + estimator = RandomForestClassifier(**models_parameters.hyperparameters, + random_state=models_parameters.seed, n_jobs=-1) + super().__init__(models_parameters, estimator) + + def _check_classes(self, y): + assert len(set(y).difference({-1, 1})) == 0, "Classes for binary classifier must be {-1, +1}" + + def fit(self, X_forest, y_forest, X_omp, y_omp): + self._check_classes(y_forest) + self._check_classes(y_omp) + + return super().fit(X_forest, y_forest, X_omp, y_omp) + + def score(self, X, y, metric=DEFAULT_SCORE_METRIC): + """ + Evaluate OmpForestBinaryClassifier on (`X`, `y`) using `metric` + + :param X: + :param y: + :param metric: might be "indicator" + :return: + """ + predictions = self.predict(X) + + if metric == 'indicator': + evaluation = np.abs(np.mean(np.abs(np.sign(predictions) - y) - 1)) + else: + raise ValueError("Unsupported metric '{}'.".format(metric)) + + return evaluation + + +class OmpForestMulticlassClassifier(OmpForest): + + DEFAULT_SCORE_METRIC = 'indicator' + + def __init__(self, models_parameters): + estimator = RandomForestClassifier(**models_parameters.hyperparameters, + random_state=models_parameters.seed, n_jobs=-1) + super().__init__(models_parameters, estimator) + # question: maybe initialize the OMPs in __init__, as in SingleOmpForest? + self._dct_class_omp = {} + + def fit_omp(self, atoms, objective): + assert len(self._dct_class_omp) == 0, "fit_omp can be called only once on {}".format(self.__class__.__name__) + possible_classes = sorted(set(objective)) + for class_label in possible_classes: + atoms_binary = binarize_class_data(atoms, class_label, inplace=False) + objective_binary = binarize_class_data(objective, class_label, inplace=False) + # TODO: maybe consider the forest size as global, so that only a fraction of it is available to each OMP...
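+ # One OMP problem per class (one-vs-rest): each class label gets its own sparse weight vector over the binarized tree predictions.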
+ omp_class = OrthogonalMatchingPursuit( + n_nonzero_coefs=self.models_parameters.extracted_forest_size, + fit_intercept=True, normalize=False) + omp_class.fit(atoms_binary, objective_binary) + self._dct_class_omp[class_label] = omp_class + return self._dct_class_omp + + def predict(self, X): + forest_predictions = self._base_estimator_predictions(X) + + if self._models_parameters.normalize_D: + forest_predictions /= self._forest_norms + + label_names = [] + preds = [] + for class_label, omp_class in self._dct_class_omp.items(): + label_names.append(class_label) + atoms_binary = binarize_class_data(forest_predictions, class_label, inplace=False) + preds.append(self._make_omp_weighted_prediction(atoms_binary, omp_class, self._models_parameters.normalize_weights)) + + # TODO: check that this isn't buggy here + + preds = np.array(preds).T + max_preds = np.argmax(preds, axis=1) + return np.array(label_names)[max_preds] + + def score(self, X, y, metric=DEFAULT_SCORE_METRIC): + predictions = self.predict(X) + + if metric == 'indicator': + evaluation = np.sum(np.ones_like(predictions)[predictions == y]) / X.shape[0] + else: + raise ValueError("Unsupported metric '{}'.".format(metric)) + + return evaluation + + +if __name__ == "__main__": + forest = RandomForestClassifier(n_estimators=10) + X = np.random.rand(10, 5) + y = np.random.choice([-1, +1], 10) + forest.fit(X, y) + print(forest.predict(np.random.rand(10, 5))) \ No newline at end of file diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py new file mode 100644 index 0000000000000000000000000000000000000000..a0c8b4708d52336bf39544ffd0b66c527466620a --- /dev/null +++ b/code/bolsonaro/models/omp_forest_regressor.py @@ -0,0 +1,33 @@ +from bolsonaro.models.omp_forest import SingleOmpForest + +from sklearn.ensemble import RandomForestRegressor +import numpy as np + + +class OmpForestRegressor(SingleOmpForest): + + DEFAULT_SCORE_METRIC = 'mse' + + def __init__(self, models_parameters): + estimator = RandomForestRegressor(**models_parameters.hyperparameters, + random_state=models_parameters.seed, n_jobs=-1) + + super().__init__(models_parameters, estimator) + + def score(self, X, y, metric=DEFAULT_SCORE_METRIC): + """ + Evaluate OmpForestRegressor on (`X`, `y`) using `metric` + + :param X: + :param y: + :param metric: + :return: + """ + predictions = self.predict(X) + + if metric == 'mse': + evaluation = np.mean(np.square(predictions - y)) + else: + raise ValueError("Unsupported metric '{}'.".format(metric)) + + return evaluation diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..4a32bffb8c7a2e129bf6f010c8a2c3a339a53a4b --- /dev/null +++ b/code/bolsonaro/trainer.py @@ -0,0 +1,85 @@ +from bolsonaro.models.model_raw_results import ModelRawResults +from bolsonaro.error_handling.logger_factory import LoggerFactory +from . import LOG_PATH + +import time +import datetime +import numpy as np + + +class Trainer(object): + """ + Class capable of fitting any model object to some prepared data, then evaluating and saving the results through the `train` and `compute_results` methods.
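+ Typical usage: call init(model) to select the data subsets, train(model) to fit it, then compute_results(model, models_dir) to score and persist the results.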
+ """ + + def __init__(self, dataset): + """ + + :param dataset: Object with X_train, y_train, X_dev, y_dev, X_test and Y_test attributes + """ + self._dataset = dataset + self._logger = LoggerFactory.create(LOG_PATH, __name__) + + def init(self, model): + if model.models_parameters.subsets_used == 'train,dev': + self._X_forest = self._dataset.X_train + self._y_forest = self._dataset.y_train + self._X_omp = self._dataset.X_dev + self._y_omp = self._dataset.y_dev + self._logger.debug('Fitting the forest on train subset and OMP on dev subset.') + elif model.models_parameters.subsets_used == 'train+dev,train+dev': + self._X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) + self._X_omp = self._X_forest + self._y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) + self._y_omp = self._y_forest + self._logger.debug('Fitting both the forest and OMP on train+dev subsets.') + elif model.models_parameters.subsets_used == 'train,train+dev': + self._X_forest = self._dataset.X_train + self._y_forest = self._dataset.y_train + self._X_omp = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) + self._y_omp = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) + else: + raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used)) + + def train(self, model): + """ + :param model: Object with + :return: + """ + + self._logger.debug('Training model using train set...') + self._begin_time = time.time() + model.fit( + X_forest=self._X_forest, + y_forest=self._y_forest, + X_omp=self._X_omp, + y_omp=self._y_omp + ) + self._end_time = time.time() + + def compute_results(self, model, models_dir): + """ + :param model: Object with + :param models_dir: Where the results will be saved + """ + results = ModelRawResults( + model_object=model, + training_time=self._end_time - self._begin_time, + datetime=datetime.datetime.now(), + train_score=model.score(self._dataset.X_train, self._dataset.y_train), + dev_score=model.score(self._dataset.X_dev, self._dataset.y_dev), + test_score=model.score(self._dataset.X_test, self._dataset.y_test), + score_metric=model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way + train_score_regressor=model.score_base_estimator(self._dataset.X_train, self._dataset.y_train), + dev_score_regressor=model.score_base_estimator(self._dataset.X_dev, self._dataset.y_dev), + test_score_regressor=model.score_base_estimator(self._dataset.X_test, self._dataset.y_test) + ) + results.save(models_dir) + self._logger.info("Base performance on test: {}".format(results.test_score_regressor)) + self._logger.info("Performance on test: {}".format(results.test_score)) + + self._logger.info("Base performance on train: {}".format(results.train_score_regressor)) + self._logger.info("Performance on train: {}".format(results.train_score)) + + self._logger.info("Base performance on dev: {}".format(results.dev_score_regressor)) + self._logger.info("Performance on dev: {}".format(results.dev_score)) diff --git a/code/bolsonaro/utils.py b/code/bolsonaro/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d7509ad9e85cde3cc0c649f85cfb5b60ead9a854 --- /dev/null +++ b/code/bolsonaro/utils.py @@ -0,0 +1,75 @@ +import os +import json +import pickle +from copy import deepcopy + + +def resolve_experiment_id(models_dir): + """ + Return the ID of the next experiment. 
+ + The ID is an int equal to n+1, where n is the current number of directories in `models_dir`. + :param models_dir: + :return: + """ + ids = [x for x in os.listdir(models_dir) + if os.path.isdir(models_dir + os.sep + x)] + if len(ids) > 0: + return int(max([int(i) for i in ids])) + 1 + return 1 + +def save_obj_to_json(file_path, attributes_dict): + attributes = dict() + for key, value in attributes_dict.items(): + attributes[key[1:]] = value + with open(file_path, 'w') as output_file: + json.dump( + attributes, + output_file, + indent=4 + ) + +def load_obj_from_json(file_path, constructor): + with open(file_path, 'r') as input_file: + parameters = json.load(input_file) + return constructor(**parameters) + +def save_obj_to_pickle(file_path, attributes_dict): + attributes = dict() + for key, value in attributes_dict.items(): + attributes[key[1:]] = value + with open(file_path, 'wb') as output_file: + pickle.dump(attributes, output_file) + +def load_obj_from_pickle(file_path, constructor): + with open(file_path, 'rb') as input_file: + parameters = pickle.load(input_file) + return constructor(**parameters) + +def binarize_class_data(data, class_pos, inplace=True): + """ + Replace class_pos by +1 and ~class_pos by -1. + + :param data: an array of classes + :param class_pos: the positive class to be replaced by +1 + :param inplace: If True, modify data in place (still return it, also) + :return: + """ + if not inplace: + data = deepcopy(data) + + position_class_labels = (data == class_pos) + data[~(position_class_labels)] = -1 + data[(position_class_labels)] = +1 + + return data + +def change_binary_func_load(base_load_function): + def func_load(return_X_y): + X, y = base_load_function(return_X_y=return_X_y) + possible_classes = sorted(set(y)) + assert len(possible_classes) == 2, "change_binary_func_load only works for binary classification" + y = binarize_class_data(y, possible_classes[-1]) + return X, y + return func_load diff --git a/bolsonaro/notebooks/.gitkeep b/code/bolsonaro/visualization/.gitkeep similarity index 100% rename from bolsonaro/notebooks/.gitkeep rename to code/bolsonaro/visualization/.gitkeep diff --git a/bolsonaro/__init__.py b/code/bolsonaro/visualization/__init__.py similarity index 100% rename from bolsonaro/__init__.py rename to code/bolsonaro/visualization/__init__.py diff --git a/code/bolsonaro/visualization/plotter.py b/code/bolsonaro/visualization/plotter.py new file mode 100644 index 0000000000000000000000000000000000000000..0d5706bc27cb0745fe065456231b7e3023707ac9 --- /dev/null +++ b/code/bolsonaro/visualization/plotter.py @@ -0,0 +1,99 @@ +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + + +class Plotter(object): + + @staticmethod + def weight_density(all_experiment_weights, file_path): + """ + Function that creates the figure with the density of the weights + :param all_experiment_weights: The weights for the different experiments + :param file_path: str, path where the figure will be saved + TODO: colored by seed number or not? + TODO: represent both the seed AND the extracted tree information in the legend + """ + + """ + Convert a dictionary of structure + {seed_1: [M x W], ..., seed_k: [M x W]} + to a numpy.ndarray with dim [K x M x W] + where K is the number of seeds, M the + number of extracted forest sizes and W + the number of weights.
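+ For example, 5 seeds, 10 extracted forest sizes and 100-dimensional weight vectors give an array of shape [5 x 10 x 100].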
+ """ + all_experiment_weights = np.array(list(all_experiment_weights.values())) + + n = len(all_experiment_weights) + + """ + Get as many different colors from the specified cmap (here nipy_spectral) + as there are seeds used. + """ + colors = Plotter.get_colors_from_cmap(n) + + fig, ax = plt.subplots() + # For each seed + for i in range(n): + # For each weight set of a given extracted tree number + for weights in all_experiment_weights[i]: + """ + Plot the series of weights that aren't zero, + colored by seed number. + """ + pd.Series(weights[np.nonzero(weights)]).plot.kde( + figsize=(15, 10), ax=ax, color=colors[i]) + + ax.set_title('Density weights of the OMP') + fig.savefig(file_path, dpi=fig.dpi) + plt.close(fig) + + @staticmethod + def plot_mean_and_CI(ax, mean, lb, ub, x_value, color_mean=None, facecolor=None, label=None): + # plot the shaded range of the confidence intervals + ax.fill_between(x_value, ub, lb, facecolor=facecolor, alpha=.5) + # plot the mean on top + ax.plot(x_value, mean, c=color_mean, label=label) + + @staticmethod + def plot_losses(file_path, all_experiment_scores, x_value, xlabel, ylabel, all_labels, title): + fig, ax = plt.subplots() + + n = len(all_experiment_scores) + + """ + Get as many different colors from the specified cmap (here nipy_spectral) + as there are curve to plot. + """ + colors = Plotter.get_colors_from_cmap(n) + + # For each curve to plot + for i in range(n): + # Retreive the scores in a list for each seed + experiment_scores = list(all_experiment_scores[i].values()) + # Compute the mean and the std for the CI + mean_experiment_scores = np.average(experiment_scores, axis=0) + std_experiment_scores = np.std(experiment_scores, axis=0) + # Plot the score curve with the CI + Plotter.plot_mean_and_CI( + ax=ax, + mean=mean_experiment_scores, + lb=mean_experiment_scores + std_experiment_scores, + ub=mean_experiment_scores - std_experiment_scores, + x_value=x_value, + color_mean=colors[i], + facecolor=colors[i], + label=all_labels[i] + ) + + plt.xlabel(xlabel) + plt.ylabel(ylabel) + plt.title(title) + plt.legend(loc='upper right') + fig.savefig(file_path, dpi=fig.dpi) + plt.close(fig) + + @staticmethod + def get_colors_from_cmap(n_colors, colormap_name='nipy_spectral'): + return [plt.get_cmap(colormap_name)(1. * i/n_colors) for i in range(n_colors)] diff --git a/code/compute_hyperparameters.py b/code/compute_hyperparameters.py new file mode 100644 index 0000000000000000000000000000000000000000..0f7aa3a666d61618a3a5d50b1de8e996c235034c --- /dev/null +++ b/code/compute_hyperparameters.py @@ -0,0 +1,97 @@ +from bolsonaro import LOG_PATH +from bolsonaro.data.dataset_loader import DatasetLoader +from bolsonaro.data.dataset_parameters import DatasetParameters +from bolsonaro.data.task import Task +from bolsonaro.error_handling.logger_factory import LoggerFactory +from bolsonaro.hyperparameter_searcher import HyperparameterSearcher +from bolsonaro.utils import save_obj_to_json + +import argparse +import os +import pathlib +import pickle +import random +from dotenv import find_dotenv, load_dotenv + +""" +I had to install skopt from this repository +https://github.com/darenr/scikit-optimize that handles +the issue described here https://github.com/scikit-optimize/scikit-optimize/issues/762. 
+""" +from skopt.space import Categorical, Integer, Real + + +def clean_numpy_int_dict(dictionary): + return dict([a, int(x)] if type(x) == Integer else + [a, clean_numpy_int_dict(x)] if type(x) == dict else + [a, clean_numpy_int_list(x)] if type(x) == list else [a, (x)] + for a, x in dictionary.items()) + + +def clean_numpy_int_list(list_n): + return [int(elem) if type(elem) == Integer else + clean_numpy_int_dict(elem) if type(elem) == dict else + clean_numpy_int_list(elem) if type(elem) == list else elem + for elem in list_n] + + +if __name__ == "__main__": + # get environment variables in .env + load_dotenv(find_dotenv('.env')) + + DEFAULT_CV = 3 + DEFAULT_N_ITER = 50 + DICT_PARAM_SPACE = {'n_estimators': Integer(10, 1000), + 'min_samples_leaf': Integer(1, 1000), + 'max_depth': Integer(1, 20), + 'max_features': Categorical(['auto', 'sqrt', 'log2'], [0.5, 0.25, 0.25])} + DATASET_LIST = ['boston', 'iris', 'diabetes'] + # , 'digits', 'linnerud', 'wine'] + + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--cv', nargs='?', type=int, default=DEFAULT_CV, help='Specify the size of the cross-validation.') + parser.add_argument('--n_iter', nargs='?', type=int, default=DEFAULT_N_ITER, help='Specify the number of iterations for the bayesian search.') + parser.add_argument('--seed', nargs='?', type=int, default=None, help='Specify a seed instead of generate it randomly.') + parser.add_argument('--datasets', nargs='+', type=str, default=DATASET_LIST, help='Specify the dataset used by the estimator.') + parser.add_argument('--verbose', action='store_true', default=False, help='Print information during the bayesian search.') + + args = parser.parse_args() + + logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__)) + + begin_random_seed_range = 1 + end_random_seed_range = 2000 + + if args.seed is None: + random_seed = random.randint(begin_random_seed_range, end_random_seed_range) + else: + random_seed = args.seed + + for dataset_name in args.datasets: + + dataset_dir = os.path.join('experiments', dataset_name, 'stage1') + + pathlib.Path(dataset_dir).mkdir(parents=True, exist_ok=True) + + logger.info('Bayesian search on dataset {}'.format(dataset_name)) + + dataset_parameters = DatasetParameters(dataset_name, test_size=0.2, dev_size=0.01, random_state=random_seed, dataset_normalizer=None) + dataset = DatasetLoader.load(dataset_parameters) + + if dataset.task == Task.REGRESSION: + scorer = 'neg_mean_squared_error' + else: + scorer = 'accuracy' + + bayesian_searcher = HyperparameterSearcher() + opt = bayesian_searcher.search(dataset, DICT_PARAM_SPACE, args.n_iter, + args.cv, random_seed, scorer, args.verbose) + + dict_results = {'_scorer': scorer, + '_best_score_train': opt.best_score_, + '_best_score_test': opt.score(dataset.X_test, dataset.y_test), + '_best_parameters': clean_numpy_int_dict(opt.best_params_), + '_random_seed': random_seed + } + + save_obj_to_json(os.path.join(dataset_dir, 'params.json'), dict_results) diff --git a/code/compute_results.py b/code/compute_results.py new file mode 100644 index 0000000000000000000000000000000000000000..64124af70954cc6af6a923f03f5a122a75f453fb --- /dev/null +++ b/code/compute_results.py @@ -0,0 +1,162 @@ +from bolsonaro.data.dataset_parameters import DatasetParameters +from bolsonaro.data.dataset_loader import DatasetLoader +from bolsonaro.models.model_raw_results import ModelRawResults +from bolsonaro.models.model_factory import ModelFactory +from bolsonaro.visualization.plotter 
import Plotter + +import argparse +import pathlib +from dotenv import find_dotenv, load_dotenv +import os + + +if __name__ == "__main__": + # get environment variables in .env + load_dotenv(find_dotenv('.env')) + + DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results' + DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models' + DEFAULT_EXPERIMENT_IDS = None + + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.') + parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.') + parser.add_argument('--experiment_ids', nargs='+', type=int, default=DEFAULT_EXPERIMENT_IDS, help='Compute the results of the specified experiment id(s)') + args = parser.parse_args() + + # Create recursively the results dir tree + pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True) + + """ + Use the specified list of experiment ids if available. + Otherwise, list all existing experiment ids from + the specified models directory. + """ + experiments_ids = [str(experiment_id) for experiment_id in args.experiment_ids] \ + if args.experiment_ids is not None \ + else os.listdir(args.models_dir) + + """ + Raise an error if no experiment ids were found, either + as a parameter or in the models directory. + """ + if experiments_ids is None or len(experiments_ids) == 0: + raise ValueError("No experiment id was found or specified.") + + # Compute the plots for each experiment id + for experiment_id in experiments_ids: + experiment_id_path = args.models_dir + os.sep + experiment_id # models/{experiment_id} + # Create recursively the tree results/{experiment_id} + pathlib.Path(args.results_dir + os.sep + experiment_id).mkdir(parents=True, exist_ok=True) + experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds + + """ + Dictionaries to temporarily store the scalar results with the following structure: + {seed_1: [score_1, ..., score_m], ...
seed_n: [score_1, ..., score_k]} + TODO: complete this to retrieve more results + """ + experiment_train_scores = dict() + experiment_dev_scores = dict() + experiment_test_scores = dict() + + experiment_weights = dict() + + # Used to check if all losses were computed using the same metric (it should be the case) + experiment_score_metrics = list() + + # For each seed results stored in models/{experiment_id}/seeds + for seed in os.listdir(experiment_seed_root_path): + experiment_seed_path = experiment_seed_root_path + os.sep + seed # models/{experiment_id}/seeds/{seed} + dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id) # Load the dataset parameters of this experiment, with this specific seed + dataset = DatasetLoader.load(dataset_parameters) # Load the dataset using the previously loaded dataset parameters + extracted_forest_size_root_path = experiment_seed_path + os.sep + 'extracted_forest_size' # models/{experiment_id}/seeds/{seed}/extracted_forest_size + + # {{seed}:[]} + experiment_train_scores[seed] = list() + experiment_dev_scores[seed] = list() + experiment_test_scores[seed] = list() + + experiment_weights[seed] = list() + + # List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_size + extracted_forest_sizes = os.listdir(extracted_forest_size_root_path) + for extracted_forest_size in extracted_forest_sizes: + # models/{experiment_id}/seeds/{seed}/extracted_forest_size/{extracted_forest_size} + extracted_forest_size_path = extracted_forest_size_root_path + os.sep + extracted_forest_size + # Load models/{experiment_id}/seeds/{seed}/extracted_forest_size/{extracted_forest_size}/model_raw_results.pickle file + model_raw_results = ModelRawResults.load(extracted_forest_size_path) + # Load [...]/model_parameters.json file and build the model using these parameters and the weights and forest from model_raw_results.pickle + model = ModelFactory.load(dataset.task, extracted_forest_size_path, experiment_id, model_raw_results) + # Temporarily save some raw results (TODO: complete this to retrieve more results) + # Save the scores + experiment_train_scores[seed].append(model_raw_results.train_score) + experiment_dev_scores[seed].append(model_raw_results.dev_score) + experiment_test_scores[seed].append(model_raw_results.test_score) + # Save the weights + experiment_weights[seed].append(model_raw_results.weights) + # Save the metric + experiment_score_metrics.append(model_raw_results.score_metric) + + if len(set(experiment_score_metrics)) > 1: + raise ValueError("The metrics used to compute the dev score aren't the same every time") + + """ + Example of plot that just plots the losses computed + on the train, dev and test subsets using a trained + model, with the CI, and depending on the extracted + forest size.
+ """ + Plotter.plot_losses( + file_path=args.results_dir + os.sep + experiment_id + os.sep + 'losses.png', + all_experiment_scores=[experiment_train_scores, experiment_dev_scores, experiment_test_scores], + x_value=extracted_forest_sizes, + xlabel='Number of trees extracted', + ylabel=experiment_score_metrics[0], + all_labels=['train', 'dev', 'test'], + title='Loss values of the trained model' + ) + + """ + TODO: + For each dataset: + Stage 1) A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams) + Stage 2) A figure for the selection of the best dataset normalization method + Stage 3) A figure for the selection of the best combination of dataset: normalization vs D normalization vs weights normalization + Stage 4) A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev + Stage 5) A figure for the selection of the best extracted forest size? + Stage 6) A figure to finally compare the perf of our approach using the previous selected parameters vs the baseline vs other papers + + Stage 3) + In all axis: + - untrained forest + - trained base forest (straight line cause it doesn't depend on the number of extracted trees) + + Axis 1: + - test with forest on train+dev and OMP on train+dev + - test with forest on train+dev and OMP on train+dev with dataset normalization + - test with forest on train+dev and OMP on train+dev with dataset normalization + D normalization + - test with forest on train+dev and OMP on train+dev with dataset normalization + weights normalization + - test with forest on train+dev and OMP on train+dev with dataset normalization + D normalization + weights normalization + + Axis 2: + - test with forest on train and OMP on dev + - test with forest on train and OMP on dev with dataset normalization + - test with forest on train and OMP on dev with dataset normalization + D normalization + - test with forest on train and OMP on dev with dataset normalization + weights normalization + - test with forest on train and OMP on dev with dataset normalization + D normalization + weights normalization + + Axis 3: + - test with forest on train and OMP train+dev + - test with forest on train and OMP train+dev with dataset normalization + - test with forest on train and OMP train+dev with dataset normalization + D normalization + - test with forest on train and OMP train+dev with dataset normalization + weights normalization + - test with forest on train and OMP train+dev with dataset normalization + D normalization + weights normalization + + IMPORTANT: Same seeds used in all axis. 
+
+    # Plot the density of the weights
+    Plotter.weight_density(
+        file_path=args.results_dir + os.sep + experiment_id + os.sep + 'density_weight.png',
+        all_experiment_weights=experiment_weights
+    )
diff --git a/code/train.py b/code/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d9713252b0e5e2345331952edaca6adfa5424c0
--- /dev/null
+++ b/code/train.py
@@ -0,0 +1,170 @@
+from bolsonaro.data.dataset_parameters import DatasetParameters
+from bolsonaro.data.dataset_loader import DatasetLoader
+from bolsonaro.models.model_factory import ModelFactory
+from bolsonaro.models.model_parameters import ModelParameters
+from bolsonaro.trainer import Trainer
+from bolsonaro.utils import resolve_experiment_id
+from bolsonaro import LOG_PATH
+from bolsonaro.error_handling.logger_factory import LoggerFactory
+
+from dotenv import find_dotenv, load_dotenv
+import argparse
+import json
+import pathlib
+import random
+import os
+from concurrent import futures
+import threading
+
+
+def process_job(seed, parameters, experiment_id, hyperparameters):
+    """
+    Experiment function.
+
+    Used as the base function of each worker in the multiprocessing pool.
+
+    :param seed: the random seed of this run
+    :param parameters: the experiment parameters dictionary
+    :param experiment_id: the id of the experiment
+    :param hyperparameters: the hyperparameters of the base forest model
+    :return:
+    """
+    logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
+        seed, threading.get_ident()))
+    logger.info('seed={}'.format(seed))
+
+    seed_str = str(seed)
+    experiment_id_str = str(experiment_id)
+    models_dir = parameters['models_dir'] + os.sep + experiment_id_str + os.sep + 'seeds' + \
+        os.sep + seed_str
+    pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)
+
+    dataset_parameters = DatasetParameters(
+        name=parameters['dataset_name'],
+        test_size=parameters['test_size'],
+        dev_size=parameters['dev_size'],
+        random_state=seed,
+        dataset_normalizer=parameters['dataset_normalizer']
+    )
+    dataset_parameters.save(models_dir, experiment_id_str)
+    dataset = DatasetLoader.load(dataset_parameters)
+
+    trainer = Trainer(dataset)
+
+    for extracted_forest_size in parameters['extracted_forest_size']:
+        # TODO: if training is too long, one may also split experiments for different forest sizes into different workers
+        logger.info('extracted_forest_size={}'.format(extracted_forest_size))
+        sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
+        pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
+
+        model_parameters = ModelParameters(
+            extracted_forest_size=extracted_forest_size,
+            normalize_D=parameters['normalize_D'],
+            subsets_used=parameters['subsets_used'],
+            normalize_weights=parameters['normalize_weights'],
+            seed=seed,
+            hyperparameters=hyperparameters
+        )
+        model_parameters.save(sub_models_dir, experiment_id)
+
+        model = ModelFactory.build(dataset.task, model_parameters)
+
+        trainer.init(model)
+        trainer.train(model)
+        trainer.compute_results(model, sub_models_dir)
+    logger.info('Training done')
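
For a quick sanity check, the worker could also be invoked directly for a single seed, bypassing the process pool used below; this is only a sketch, and all values here are illustrative, not recommended settings:

    example_parameters = {
        'models_dir': 'models',
        'dataset_name': 'boston',
        'test_size': 0.2,
        'dev_size': 0.2,
        'dataset_normalizer': None,
        'extracted_forest_size': [5, 10],
        'normalize_D': False,
        'subsets_used': 'train,dev',
        'normalize_weights': False,
    }
    process_job(seed=1, parameters=example_parameters, experiment_id=1,
                hyperparameters={'n_estimators': 100})
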
+
+
+if __name__ == "__main__":
+    load_dotenv(find_dotenv('.env'))
+
+    DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
+    DEFAULT_DATASET_NAME = 'boston'
+    DEFAULT_NORMALIZE_D = False
+    DEFAULT_DATASET_NORMALIZER = None
+    DEFAULT_FOREST_SIZE = 100
+    DEFAULT_EXTRACTED_FOREST_SIZE = 10
+    # The models will be stored in a directory structure like:
+    # models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
+    DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
+    DEFAULT_DEV_SIZE = 0.2
+    DEFAULT_TEST_SIZE = 0.2
+    DEFAULT_RANDOM_SEED_NUMBER = 1
+    DEFAULT_SUBSETS_USED = 'train,dev'
+    DEFAULT_NORMALIZE_WEIGHTS = False
+
+    begin_random_seed_range = 1
+    end_random_seed_range = 2000
+
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overrides all other parameters.')
+    parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.')
+    parser.add_argument('--dataset_name', nargs='?', type=str, default=DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
+    parser.add_argument('--normalize_D', action='store_true', default=DEFAULT_NORMALIZE_D, help='Normalize the predictions of the forest by dividing the prediction vectors by their L2 norm.')
+    parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer to use (either standard, minmax, robust or normalizer).')
+    parser.add_argument('--forest_size', nargs='?', type=int, default=DEFAULT_FOREST_SIZE, help='The number of trees of the random forest.')
+    parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=DEFAULT_EXTRACTED_FOREST_SIZE, help='The number of trees selected by OMP.')
+    parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
+    parser.add_argument('--dev_size', nargs='?', type=float, default=DEFAULT_DEV_SIZE, help='Dev subset ratio.')
+    parser.add_argument('--test_size', nargs='?', type=float, default=DEFAULT_TEST_SIZE, help='Test subset ratio.')
+    parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
+    parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specify a list of seeds instead of generating them randomly.')
+    parser.add_argument('--subsets_used', nargs='+', type=str, default=DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train, OMP on train+dev.')
+    parser.add_argument('--normalize_weights', action='store_true', default=DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the sum of the weights.')
+    args = parser.parse_args()
+
+    if args.experiment_configuration:
+        with open(args.experiment_configuration_path + os.sep +
+                args.experiment_configuration + '.json', 'r') as input_file:
+            parameters = json.load(input_file)
+    else:
+        parameters = args.__dict__
+
+    pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
+
+    logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
+
+    # The number of trees to extract from the forest (K)
+    parameters['extracted_forest_size'] = parameters['extracted_forest_size'] \
+        if isinstance(parameters['extracted_forest_size'], list) \
+        else [parameters['extracted_forest_size']]
+
+    hyperparameters_path = os.path.join('experiments', parameters['dataset_name'], 'stage1', 'params.json')
+    if os.path.exists(hyperparameters_path):
+        logger.info("Hyperparameters found for this dataset at '{}'".format(hyperparameters_path))
+        with open(hyperparameters_path, 'r') as file_hyperparameter:
+            hyperparameters = json.load(file_hyperparameter)['best_parameters']
+    else:
+        hyperparameters = {}
+
+    if parameters['forest_size'] is not None:
+        hyperparameters['n_estimators'] = parameters['forest_size']
+
+    if parameters['seeds'] is not None and parameters['random_seed_number'] > 1:
+        logger.warning('Both the seeds and random_seed_number parameters are specified. The seeds will be used.')
+
+    # Seeds are either provided as parameters or generated at random
+    seeds = parameters['seeds'] if parameters['seeds'] is not None \
+        else [random.randint(begin_random_seed_range, end_random_seed_range) \
+            for i in range(parameters['random_seed_number'])]
+
+    # Resolve the next experiment id number (last id + 1)
+    experiment_id = resolve_experiment_id(parameters['models_dir'])
+    logger.info('Experiment id: {}'.format(experiment_id))
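
Since `parameters` is simply `args.__dict__` when no configuration file is given, a configuration file just mirrors the command-line options. A hypothetical experiments/example.json (file name and values illustrative only) could be produced like this, and then selected with --experiment_configuration example:

    import json

    example_parameters = {
        'dataset_name': 'boston',
        'dataset_normalizer': 'standard',
        'forest_size': 100,
        'extracted_forest_size': [2, 4, 8, 16, 32],
        'models_dir': 'models',
        'dev_size': 0.2,
        'test_size': 0.2,
        'random_seed_number': 5,
        'seeds': None,
        'subsets_used': 'train,dev',
        'normalize_D': False,
        'normalize_weights': False,
    }

    with open('experiments/example.json', 'w') as output_file:
        json.dump(example_parameters, output_file, indent=4)
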
+
+    """
+    If the experiment configuration doesn't come from
+    an already existing file, save it to a json file to
+    keep a trace of it.
+    """
+    if args.experiment_configuration is None:
+        with open(args.experiment_configuration_path + os.sep + 'unnamed_{}.json'.format(
+            experiment_id), 'w') as output_file:
+            json.dump(
+                parameters,
+                output_file,
+                indent=4
+            )
+
+    # Run as many jobs as there are seeds
+    with futures.ProcessPoolExecutor(len(seeds)) as executor:
+        tasks = [executor.submit(process_job, seed, parameters, experiment_id, hyperparameters)
+            for seed in seeds]
+        for task in futures.as_completed(tasks):
+            task.result()  # Propagate any exception raised in a worker
diff --git a/bolsonaro/visualization/.gitkeep b/experiments/.gitkeep
similarity index 100%
rename from bolsonaro/visualization/.gitkeep
rename to experiments/.gitkeep
diff --git a/experiments/20newsgroups_vectorized/stage1/params.json b/experiments/20newsgroups_vectorized/stage1/params.json
new file mode 100644
index 0000000000000000000000000000000000000000..76662d7e5869395dc101dcb5fd509a5a3b9c561d
--- /dev/null
+++ b/experiments/20newsgroups_vectorized/stage1/params.json
@@ -0,0 +1,12 @@
+{
+    "scorer": "accuracy",
+    "best_score_train": 0.7953125,
+    "best_score_test": 0.7909854175872735,
+    "best_parameters": {
+        "max_depth": 20,
+        "max_features": "sqrt",
+        "min_samples_leaf": 1,
+        "n_estimators": 809
+    },
+    "random_seed": 1763
+}
\ No newline at end of file
diff --git a/experiments/boston/stage1/params.json b/experiments/boston/stage1/params.json
new file mode 100644
index 0000000000000000000000000000000000000000..6a5a1e9a05d8f081af6abe38fa0aadfff1e736b8
--- /dev/null
+++ b/experiments/boston/stage1/params.json
@@ -0,0 +1,12 @@
+{
+    "scorer": "neg_mean_squared_error",
+    "best_score_train": -11.238253315624897,
+    "best_score_test": -7.312532120669678,
+    "best_parameters": {
+        "max_depth": 20,
+        "max_features": "auto",
+        "min_samples_leaf": 1,
+        "n_estimators": 1000
+    },
+    "random_seed": 289
+}
\ No newline at end of file
diff --git a/experiments/breast_cancer/stage1/params.json b/experiments/breast_cancer/stage1/params.json
new file mode 100644
index 0000000000000000000000000000000000000000..6b1b22834e0d69a477fa7b4f06b1169b9c3e3016
--- /dev/null
+++ b/experiments/breast_cancer/stage1/params.json
@@ -0,0 +1,11 @@
+{
+    "scorer": "accuracy",
+    "best_score_train": 0.96,
+    "best_score_test": 0.956140350877193,
+    "best_parameters": {
+        "max_depth": 20,
+        "max_features": "sqrt",
+        "min_samples_leaf": 1,
+        "n_estimators": 1000
+    }
+}
\ No newline at end of file
diff --git a/experiments/diabetes/stage1/params.json b/experiments/diabetes/stage1/params.json
new file mode 100644
index 0000000000000000000000000000000000000000..6c7fbb12b41968d9b8367161a2bb607ad954a65b
--- /dev/null
+++ b/experiments/diabetes/stage1/params.json
@@ -0,0 +1,12 @@
+{
+    "scorer": "neg_mean_squared_error",
+    "best_score_train": -3380.975223665973,
+    "best_score_test": -2604.589761961369,
+    "best_parameters": {
+        "max_depth": 17,
+        "max_features": "auto",
+        "min_samples_leaf": 10,
+        "n_estimators": 804
+    },
+    "random_seed": 1679
+}
\ No newline at end of file
diff --git a/experiments/digits/stage1/params.json b/experiments/digits/stage1/params.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5662c5afcdfc4ede1a29ef778393d8f8cf95156
--- /dev/null
+++ b/experiments/digits/stage1/params.json
@@ -0,0 +1,12 @@
+{
+    "scorer": "accuracy",
+    "best_score_train": 0.9767932489451476,
+    "best_score_test": 0.9861111111111112,
+    "best_parameters": {
+        "max_depth": 16,
+        "max_features": "sqrt",
+        "min_samples_leaf": 1,
+        "n_estimators": 1000
+    },
+    "random_seed": 1679
+}
\ No newline at end of file
diff --git a/experiments/iris/stage1/params.json b/experiments/iris/stage1/params.json
new file mode 100644
index 0000000000000000000000000000000000000000..fd852cace9852ee492649374e915b639fe785b28
--- /dev/null
+++ b/experiments/iris/stage1/params.json
@@ -0,0 +1,12 @@
+{
+    "scorer": "accuracy",
+    "best_score_train": 0.9576271186440678,
+    "best_score_test": 1.0,
+    "best_parameters": {
+        "max_depth": 20,
+        "max_features": "log2",
+        "min_samples_leaf": 1,
+        "n_estimators": 1000
+    },
+    "random_seed": 883
+}
\ No newline at end of file
diff --git a/experiments/linnerud/stage1/params.json b/experiments/linnerud/stage1/params.json
new file mode 100644
index 0000000000000000000000000000000000000000..7db121c4ccb6c0add73e7e554349efbe17410dbe
--- /dev/null
+++ b/experiments/linnerud/stage1/params.json
@@ -0,0 +1,12 @@
+{
+    "scorer": "neg_mean_squared_error",
+    "best_score_train": -268.00052987557854,
+    "best_score_test": -206.18071759259263,
+    "best_parameters": {
+        "max_depth": 3,
+        "max_features": "sqrt",
+        "min_samples_leaf": 232,
+        "n_estimators": 16
+    },
+    "random_seed": 1679
+}
\ No newline at end of file
diff --git a/experiments/wine/stage1/params.json b/experiments/wine/stage1/params.json
new file mode 100644
index 0000000000000000000000000000000000000000..25950f99a1e9ea38247c4c4b76628aad87442511
--- /dev/null
+++ b/experiments/wine/stage1/params.json
@@ -0,0 +1,12 @@
+{
+    "scorer": "accuracy",
+    "best_score_train": 0.9857142857142858,
+    "best_score_test": 0.9722222222222222,
+    "best_parameters": {
+        "max_depth": 20,
+        "max_features": "log2",
+        "min_samples_leaf": 1,
+        "n_estimators": 1000
+    },
+    "random_seed": 1679
+}
\ No newline at end of file
diff --git a/models/.gitkeep b/models/.gitkeep
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/requirements.txt b/requirements.txt
index d4f7d11c284ddfa9486b93a090035aaa37fca265..d585159632442f7293f858e1fad391031658d60a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,7 @@ coverage
 awscli
 flake8
 python-dotenv>=0.5.1
+scikit-learn
+git+git://github.com/darenr/scikit-optimize@master
+matplotlib
+pandas
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 08242fab42758dba87119ee01babd47d701b1864..592074efa2a4c95a89b8a287921811835a2cc35e 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,8 @@ from setuptools import find_packages, setup
 
 setup(
     name='bolsonaro',
-    packages=find_packages(),
+    packages=find_packages(where="code", exclude=['doc', 'dev']),
+    package_dir={'': "code"},
     version='0.1.0',
     description='Bolsonaro project of QARMA non-permanents: deforesting random forest using OMP.',
     author='QARMA team',