diff --git a/TODO.md b/TODO.md index bfb32e8a131b5147b36c9ccba729a6e13e04e5b7..5ea6cc5cf2c933eed2e7ffbf2567d4fe812412cf 100644 --- a/TODO.md +++ b/TODO.md @@ -1,8 +1,7 @@ -* Trouver des jeux de données pertinents -* Entraîner et tester des forêts de différentes tailles -* Entraîner et tester en regression et classification -* Entraîner et tester sur différentes modalités (pas seulement des datasets d'images) -* Entraîner avec différents hyperparamètres (d, profondeur, epsilon) -* Appliquer OMP avec différentes valeurs de k (notamment un petit k) -* Faire des figures -* Implémenter et comparer les systèmes concurrents \ No newline at end of file +* Fix pickle loading of ModelRawResults, because saving the model_object leads to import issues. +* Fix the ModelFactory.load function. +* Fix model results loading in compute_results.py. +* Check that the OMP multiclass classifier is working as expected. +* In the Bayesian search computation, output a different file name depending on the task of the trained model. +* Check the best params scores of the regressors (neg_mean_squared_error leads to huge negative values). +* Prepare the JSON experiment files to run. 
\ No newline at end of file diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py index a9bebe044b68475f5cc0cf6c6a2097ffe986e47c..4a32bffb8c7a2e129bf6f010c8a2c3a339a53a4b 100644 --- a/code/bolsonaro/trainer.py +++ b/code/bolsonaro/trainer.py @@ -20,48 +20,51 @@ class Trainer(object): self._dataset = dataset self._logger = LoggerFactory.create(LOG_PATH, __name__) - def train(self, model, models_dir): - """ - - :param model: Object with - :param models_dir: Where the results will be saved - :return: - """ - # todo cette fonction ne fait pas que "train", elle choisit le jeu de données, train et evalue le modèle -> nom à changer - self._logger.debug('Training model using train set...') - begin_time = time.time() - + def init(self, model): if model.models_parameters.subsets_used == 'train,dev': - X_forest = self._dataset.X_train - y_forest = self._dataset.y_train - X_omp = self._dataset.X_dev - y_omp = self._dataset.y_dev + self._X_forest = self._dataset.X_train + self._y_forest = self._dataset.y_train + self._X_omp = self._dataset.X_dev + self._y_omp = self._dataset.y_dev self._logger.debug('Fitting the forest on train subset and OMP on dev subset.') elif model.models_parameters.subsets_used == 'train+dev,train+dev': - X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) - X_omp = X_forest - y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) - y_omp = y_forest + self._X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) + self._X_omp = self._X_forest + self._y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) + self._y_omp = self._y_forest self._logger.debug('Fitting both the forest and OMP on train+dev subsets.') elif model.models_parameters.subsets_used == 'train,train+dev': - X_forest = self._dataset.X_train - y_forest = self._dataset.y_train - X_omp = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) - y_omp = np.concatenate([self._dataset.y_train, 
self._dataset.y_dev]) + self._X_forest = self._dataset.X_train + self._y_forest = self._dataset.y_train + self._X_omp = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) + self._y_omp = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) else: raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used)) + def train(self, model): + """ + :param model: Object with + :return: + """ + + self._logger.debug('Training model using train set...') + self._begin_time = time.time() model.fit( - X_forest=X_forest, - y_forest=y_forest, - X_omp=X_omp, - y_omp=y_omp + X_forest=self._X_forest, + y_forest=self._y_forest, + X_omp=self._X_omp, + y_omp=self._y_omp ) - end_time = time.time() + self._end_time = time.time() + def compute_results(self, model, models_dir): + """ + :param model: Object with + :param models_dir: Where the results will be saved + """ results = ModelRawResults( model_object=model, - training_time=end_time - begin_time, + training_time=self._end_time - self._begin_time, datetime=datetime.datetime.now(), train_score=model.score(self._dataset.X_train, self._dataset.y_train), dev_score=model.score(self._dataset.X_dev, self._dataset.y_dev), diff --git a/code/train.py b/code/train.py index 34c2003db8aef25d105831989b5c38b4e966f640..0d9713252b0e5e2345331952edaca6adfa5424c0 100644 --- a/code/train.py +++ b/code/train.py @@ -69,7 +69,9 @@ def process_job(seed, parameters, experiment_id, hyperparameters): model = ModelFactory.build(dataset.task, model_parameters) - trainer.train(model, sub_models_dir) + trainer.init(model) + trainer.train(model) + trainer.compute_results(model, sub_models_dir) logger.info('Training done') if __name__ == "__main__":