diff --git a/code/bolsonaro/models/model_factory.py b/code/bolsonaro/models/model_factory.py
index 56e267fb8208bd733a428d43f9b6bfe6d19b16da..c7192c141c279370e3d896e7a9dcf71956e9b580 100644
--- a/code/bolsonaro/models/model_factory.py
+++ b/code/bolsonaro/models/model_factory.py
@@ -1,5 +1,7 @@
 from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
+from bolsonaro.models.nn_omp_forest_regressor import NonNegativeOmpForestRegressor
+from bolsonaro.models.nn_omp_forest_classifier import NonNegativeOmpForestBinaryClassifier
 from bolsonaro.models.model_parameters import ModelParameters
 from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor, SimilarityForestClassifier
 from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor, KMeansForestClassifier
@@ -19,8 +21,10 @@ class ModelFactory(object):
             raise ValueError("Unsupported task '{}'".format(task))

         if task == Task.BINARYCLASSIFICATION:
-            if model_parameters.extraction_strategy in ['omp', 'omp_nn', 'omp_distillation']:
+            if model_parameters.extraction_strategy in ['omp', 'omp_distillation']:
                 return OmpForestBinaryClassifier(model_parameters)
+            elif model_parameters.extraction_strategy == 'omp_nn':
+                return NonNegativeOmpForestBinaryClassifier(model_parameters)
             elif model_parameters.extraction_strategy == 'random':
                 return RandomForestClassifier(**model_parameters.hyperparameters,
                     random_state=model_parameters.seed)
@@ -36,8 +40,10 @@ class ModelFactory(object):
             else:
                 raise ValueError('Invalid extraction strategy')
         elif task == Task.REGRESSION:
-            if model_parameters.extraction_strategy in ['omp', 'omp_nn', 'omp_distillation']:
+            if model_parameters.extraction_strategy in ['omp', 'omp_distillation']:
                 return OmpForestRegressor(model_parameters)
+            elif model_parameters.extraction_strategy == 'omp_nn':
+                return NonNegativeOmpForestRegressor(model_parameters)
             elif model_parameters.extraction_strategy == 'random':
                 return RandomForestRegressor(**model_parameters.hyperparameters,
                     random_state=model_parameters.seed)
@@ -53,8 +59,10 @@ class ModelFactory(object):
             else:
                 raise ValueError('Invalid extraction strategy')
         elif task == Task.MULTICLASSIFICATION:
-            if model_parameters.extraction_strategy in ['omp', 'omp_nn', 'omp_distillation']:
+            if model_parameters.extraction_strategy in ['omp', 'omp_distillation']:
                 return OmpForestMulticlassClassifier(model_parameters)
+            elif model_parameters.extraction_strategy == 'omp_nn':
+                raise ValueError('omp_nn is unsupported for multiclass classification')
             elif model_parameters.extraction_strategy == 'random':
                 return RandomForestClassifier(**model_parameters.hyperparameters,
                     random_state=model_parameters.seed)
diff --git a/code/bolsonaro/models/model_parameters.py b/code/bolsonaro/models/model_parameters.py
index 28f0475ea7707679eac2e0396be782baaeffaa63..b9cc3fc105769b713c47d4323137f017dc4095cf 100644
--- a/code/bolsonaro/models/model_parameters.py
+++ b/code/bolsonaro/models/model_parameters.py
@@ -6,13 +6,12 @@ import os
 class ModelParameters(object):

     def __init__(self, extracted_forest_size, normalize_D, subsets_used,
-                 normalize_weights, seed, hyperparameters, extraction_strategy, non_negative=False, intermediate_solutions_sizes=None):
+                 normalize_weights, seed, hyperparameters, extraction_strategy, intermediate_solutions_sizes=None):
         """Init of ModelParameters.

         Args:
             extracted_forest_size (int): extracted forest size
             intermediate_solutions_sizes (list): list of all intermediate solutions sizes
-            non_negative (bool): tell to use the non negative version of OMP, must be used in combination with intermediate solutions size
             normalize_D (bool): true normalize the distribution, false no
             subsets_used (list): which dataset use for randomForest and for OMP 'train', 'dev' or 'train+dev' and combination of two of this.
@@ -30,15 +29,10 @@ class ModelParameters(object):
         self._hyperparameters = hyperparameters
         self._extraction_strategy = extraction_strategy

-        if non_negative and intermediate_solutions_sizes is None:
+        if self._extraction_strategy == 'omp_nn' and intermediate_solutions_sizes is None:
             raise ValueError("Intermediate solutions must be set if non negative option is on.")
-        self._non_negative = non_negative
         self._intermediate_solutions_sizes = intermediate_solutions_sizes

-    @property
-    def non_negative(self):
-        return self._non_negative
-
     @property
     def intermediate_solutions_sizes(self):
         return self._intermediate_solutions_sizes
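Usage note (editor's sketch, not part of the patch): the non-negative variant is now selected purely through extraction_strategy='omp_nn', which makes intermediate_solutions_sizes mandatory for that strategy. A minimal construction, where all concrete values are illustrative placeholders:

# Editor's sketch; the hyperparameter values below are placeholders.
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.models.model_factory import ModelFactory
from bolsonaro.data.task import Task

model_parameters = ModelParameters(
    extracted_forest_size=100,                       # largest support size OMP grows to
    normalize_D=True,
    subsets_used='train+dev,train+dev',
    normalize_weights=False,
    seed=42,
    hyperparameters={'n_estimators': 1000},
    extraction_strategy='omp_nn',
    intermediate_solutions_sizes=[10, 25, 50, 100]   # required when strategy is 'omp_nn'
)
model = ModelFactory.build(Task.REGRESSION, model_parameters)  # -> NonNegativeOmpForestRegressor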
diff --git a/code/bolsonaro/models/model_raw_results.py b/code/bolsonaro/models/model_raw_results.py
index 3f7af5fcd31c1eb105a3dd39695e1ddc69f38676..96072a62b99eb9c32822280f4db175ebfa3ccc2f 100644
--- a/code/bolsonaro/models/model_raw_results.py
+++ b/code/bolsonaro/models/model_raw_results.py
@@ -10,9 +10,9 @@ class ModelRawResults(object):
                  datetime, train_score, dev_score, test_score,
                  train_score_base, dev_score_base,
                  test_score_base, score_metric, base_score_metric,
-                 #coherence='', correlation=''):
                  train_coherence='', dev_coherence='', test_coherence='',
                  train_correlation='', dev_correlation='', test_correlation='',
+                 train_scores='', dev_scores='', test_scores='',
                  train_strength='', dev_strength='', test_strength=''):
         self._model_weights = model_weights
@@ -26,14 +26,15 @@ class ModelRawResults(object):
         self._test_score_base = test_score_base
         self._score_metric = score_metric
         self._base_score_metric = base_score_metric
-        """self._coherence = coherence
-        self._correlation = correlation"""
         self._train_coherence = train_coherence
         self._dev_coherence = dev_coherence
         self._test_coherence = test_coherence
         self._train_correlation = train_correlation
         self._dev_correlation = dev_correlation
         self._test_correlation = test_correlation
+        self._train_scores = train_scores
+        self._dev_scores = dev_scores
+        self._test_scores = test_scores
         self._train_strength = train_strength
         self._dev_strength = dev_strength
         self._test_strength = test_strength
@@ -82,14 +83,6 @@ class ModelRawResults(object):
     def base_score_metric(self):
         return self._base_score_metric

-    """@property
-    def coherence(self):
-        return self._coherence
-
-    @property
-    def correlation(self):
-        return self._correlation"""
-
     @property
     def train_coherence(self):
         return self._train_coherence
@@ -114,6 +107,18 @@ class ModelRawResults(object):
     def test_correlation(self):
         return self._test_correlation

+    @property
+    def train_scores(self):
+        return self._train_scores
+
+    @property
+    def dev_scores(self):
+        return self._dev_scores
+
+    @property
+    def test_scores(self):
+        return self._test_scores
+
     @property
     def train_strength(self):
         return self._train_strength
diff --git a/code/bolsonaro/models/omp_forest.py b/code/bolsonaro/models/omp_forest.py
index f2bfcf2a873611af98898d3185e7855f633558ea..12ac394d6b1e9e87faf89dd35755b5d9d8af5505 100644
--- a/code/bolsonaro/models/omp_forest.py
+++ b/code/bolsonaro/models/omp_forest.py
@@ -105,7 +105,7 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
 class SingleOmpForest(OmpForest):

     def __init__(self, models_parameters, base_forest_estimator):
-        if models_parameters.non_negative:
+        if models_parameters.extraction_strategy == 'omp_nn':
             self._omp = NonNegativeOrthogonalMatchingPursuit(
                 max_iter=models_parameters.extracted_forest_size,
                 intermediate_solutions_sizes=models_parameters.intermediate_solutions_sizes,
@@ -121,17 +121,19 @@ class SingleOmpForest(OmpForest):
         super().__init__(models_parameters, base_forest_estimator)

     def fit_omp(self, atoms, objective):
-        with warnings.catch_warnings(record=True) as caught_warnings:
+        self._omp.fit(atoms, objective)
+
+        """with warnings.catch_warnings(record=True) as caught_warnings:
             # Cause all warnings to always be triggered.
             warnings.simplefilter("always")
-            self._omp.fit(atoms, objective)
+
             # ignore any non-custom warnings that may be in the list
             caught_warnings = list(filter(lambda i: i.message != RuntimeWarning(omp_premature_warning), caught_warnings))
             if len(caught_warnings) > 0:
-                self._logger.error(f'number of linear dependences in the dictionary: {len(caught_warnings)}. model parameters: {str(self._models_parameters.__dict__)}')
+                self._logger.error(f'number of linear dependences in the dictionary: {len(caught_warnings)}. model parameters: {str(self._models_parameters.__dict__)}')"""

     def predict(self, X):
         """
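Background note (editor's sketch): NonNegativeOrthogonalMatchingPursuit itself is not shown in this diff. The sketch below is an illustration of the underlying idea built on scipy.optimize.nnls, not the project's implementation: at each greedy step the atom most correlated with the residual is added to the support, the weights are re-fit under a non-negativity constraint, and a snapshot of the coefficients is kept at each requested intermediate support size, which is what get_coef(extracted_forest_size) later retrieves.

# Editor's illustration of non-negative OMP with intermediate solutions.
# Not the project's implementation; names mirror the diff where possible.
import numpy as np
from scipy.optimize import nnls

def nn_omp(D, y, max_iter, intermediate_solutions_sizes):
    # D: (n_samples, n_trees) matrix of per-tree predictions; y: (n_samples,) target.
    n_atoms = D.shape[1]
    support = []
    intermediate_coefs = {}
    residual = y.copy()
    for _ in range(max_iter):
        correlations = D.T @ residual
        correlations[support] = -np.inf          # only consider atoms outside the support
        candidate = int(np.argmax(correlations))
        if correlations[candidate] <= 0:         # no remaining atom reduces the residual
            break
        support.append(candidate)
        weights, _ = nnls(D[:, support], y)      # least squares under weights >= 0
        residual = y - D[:, support] @ weights
        if len(support) in intermediate_solutions_sizes:
            coef = np.zeros(n_atoms)
            coef[support] = weights
            intermediate_coefs[len(support)] = coef   # snapshot, cf. get_coef(size)
    return intermediate_coefs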
diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py
index b19569cdeae52d5e1d36c869a49aa2c56a93b150..1cf9346c3e94f607cff3c61204254884d66a8766 100644
--- a/code/bolsonaro/trainer.py
+++ b/code/bolsonaro/trainer.py
@@ -1,6 +1,8 @@
 from bolsonaro.models.model_raw_results import ModelRawResults
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
+from bolsonaro.models.nn_omp_forest_regressor import NonNegativeOmpForestRegressor
+from bolsonaro.models.nn_omp_forest_classifier import NonNegativeOmpForestBinaryClassifier
 from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor, SimilarityForestClassifier
 from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor, KMeansForestClassifier
 from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor, EnsembleSelectionForestClassifier
@@ -98,7 +100,8 @@ class Trainer(object):
                 y=self._y_forest
             )
         else:
-            if type(model) in [OmpForestRegressor, OmpForestBinaryClassifier, OmpForestMulticlassClassifier] and \
+            if type(model) in [OmpForestRegressor, OmpForestBinaryClassifier, OmpForestMulticlassClassifier,
+                NonNegativeOmpForestRegressor, NonNegativeOmpForestBinaryClassifier] and \
                 use_distillation:
                 model.fit(
                     self._X_forest,  # X_train or X_train+X_dev
@@ -116,13 +119,25 @@ class Trainer(object):
                 )
         self._end_time = time.time()

-    def __score_func(self, model, X, y_true, weights=True):
+    def __score_func(self, model, X, y_true, weights=True, extracted_forest_size=None):
         if type(model) in [OmpForestRegressor, RandomForestRegressor]:
             if weights:
                 y_pred = model.predict(X)
             else:
                 y_pred = model.predict_no_weights(X)
             result = self._regression_score_metric(y_true, y_pred)
+        elif type(model) == NonNegativeOmpForestRegressor:
+            if weights:
+                y_pred = model.predict(X, extracted_forest_size)
+            else:
+                y_pred = model.predict_no_weights(X, extracted_forest_size)
+            result = self._regression_score_metric(y_true, y_pred)
+        elif type(model) == NonNegativeOmpForestBinaryClassifier:
+            if weights:
+                y_pred = model.predict(X, extracted_forest_size)
+            else:
+                y_pred = model.predict_no_weights(X, extracted_forest_size)
+            result = self._classification_score_metric(y_true, y_pred)
         elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]:
             if weights:
                 y_pred = model.predict(X)
@@ -138,10 +153,12 @@ class Trainer(object):
         return result

     def __score_func_base(self, model, X, y_true):
-        if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]:
+        if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor,
+            NonNegativeOmpForestRegressor]:
             y_pred = model.predict_base_estimator(X)
             result = self._base_regression_score_metric(y_true, y_pred)
-        elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, KMeansForestClassifier, SimilarityForestClassifier, EnsembleSelectionForestClassifier]:
+        elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, KMeansForestClassifier,
+            SimilarityForestClassifier, EnsembleSelectionForestClassifier, NonNegativeOmpForestBinaryClassifier]:
             y_pred = model.predict_base_estimator(X)
             result = self._base_classification_score_metric(y_true, y_pred)
         elif type(model) == RandomForestClassifier:
@@ -152,17 +169,16 @@ class Trainer(object):
             result = self._base_regression_score_metric(y_true, y_pred)
         return result

-    def _evaluate_predictions(self, X, aggregation_function, selected_trees):
-        predictions = np.array([tree.predict(X) for tree in selected_trees])
-
+    def _evaluate_predictions(self, predictions, aggregation_function):
         predictions = normalize(predictions)

         return aggregation_function(np.abs((predictions @ predictions.T - np.eye(len(predictions)))))

-    def _compute_forest_strength(self, X, y, metric_function, selected_trees):
-        return np.mean([metric_function(y, tree.predict(X)) for tree in selected_trees])
+    def _compute_forest_strength(self, predictions, y, metric_function):
+        scores = np.array([metric_function(y, prediction) for prediction in predictions])
+        return scores, np.mean(scores)

-    def compute_results(self, model, models_dir, subsets_used='train+dev,train+dev'):
+    def compute_results(self, model, models_dir, subsets_used='train+dev,train+dev', extracted_forest_size=None):
         """
         :param model: Object with
         :param models_dir: Where the results will be saved
@@ -202,52 +218,69 @@ class Trainer(object):
             model_weights = model._dct_class_omp
         elif type(model) == OmpForestBinaryClassifier:
             model_weights = model._omp
+        elif type(model) in [NonNegativeOmpForestRegressor, NonNegativeOmpForestBinaryClassifier]:
+            model_weights = model._omp.get_coef(extracted_forest_size)

         if type(model) in [SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor,
             SimilarityForestClassifier, KMeansForestClassifier, EnsembleSelectionForestClassifier]:
             selected_trees = model.selected_trees
-        elif type(model) in [OmpForestRegressor, OmpForestMulticlassClassifier, OmpForestBinaryClassifier]:
-            selected_trees = np.asarray(model.forest)[model._omp.coef_ != 0]
+        elif type(model) in [OmpForestRegressor, OmpForestMulticlassClassifier, OmpForestBinaryClassifier,
+            NonNegativeOmpForestRegressor, NonNegativeOmpForestBinaryClassifier]:
+            selected_trees = np.asarray(model.forest)[model_weights != 0]
        elif type(model) in [RandomForestRegressor, RandomForestClassifier]:
            selected_trees = model.estimators_

         if len(selected_trees) > 0:
             target_selected_tree = int(os.path.split(models_dir)[-1])
             if target_selected_tree != len(selected_trees):
-                predictions_X_omp = model.predict(X_omp)
+                predictions_X_omp = model.predict(X_omp, extracted_forest_size) \
+                    if type(model) in [NonNegativeOmpForestBinaryClassifier, NonNegativeOmpForestRegressor] \
+                    else model.predict(X_omp)
                 error_prediction = np.linalg.norm(predictions_X_omp - y_omp)
                 if not np.isclose(error_prediction, 0):
-                    raise ValueError(f'Invalid selected tree number target_selected_tree:{target_selected_tree} - len(selected_trees):{len(selected_trees)}')
+                    #raise ValueError(f'Invalid selected tree number target_selected_tree:{target_selected_tree} - len(selected_trees):{len(selected_trees)}')
+                    self._logger.error(f'Invalid selected tree number target_selected_tree:{target_selected_tree} - len(selected_trees):{len(selected_trees)}')
                 else:
                     self._logger.warning(f"Invalid selected tree number target_selected_tree:{target_selected_tree} - len(selected_trees):{len(selected_trees)}"
                         " But the prediction is perfect on X_omp. Keep less trees.")

         with open(os.path.join(models_dir, 'selected_trees.pickle'), 'wb') as output_file:
             pickle.dump(selected_trees, output_file)

-        strength_metric = self._regression_score_metric if self._dataset.task == Task.REGRESSION else self._classification_score_metric
+        strength_metric = self._regression_score_metric if self._dataset.task == Task.REGRESSION \
+            else lambda y_true, y_pred: self._classification_score_metric(y_true, (y_pred - 0.5) * 2)
+
+        train_predictions = np.array([tree.predict(X_forest) for tree in selected_trees])
+        dev_predictions = np.array([tree.predict(X_omp) for tree in selected_trees])
+        test_predictions = np.array([tree.predict(self._dataset.X_test) for tree in selected_trees])
+        train_scores, train_strength = self._compute_forest_strength(train_predictions, y_forest, strength_metric)
+        dev_scores, dev_strength = self._compute_forest_strength(dev_predictions, y_omp, strength_metric)
+        test_scores, test_strength = self._compute_forest_strength(test_predictions, self._dataset.y_test, strength_metric)

         results = ModelRawResults(
             model_weights=model_weights,
             training_time=self._end_time - self._begin_time,
             datetime=datetime.datetime.now(),
-            train_score=self.__score_func(model, X_forest, y_forest),
-            dev_score=self.__score_func(model, X_omp, y_omp),
-            test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test),
+            train_score=self.__score_func(model, X_forest, y_forest, extracted_forest_size=extracted_forest_size),
+            dev_score=self.__score_func(model, X_omp, y_omp, extracted_forest_size=extracted_forest_size),
+            test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test, extracted_forest_size=extracted_forest_size),
             train_score_base=self.__score_func_base(model, X_forest, y_forest),
             dev_score_base=self.__score_func_base(model, X_omp, y_omp),
             test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
             score_metric=self._score_metric_name,
             base_score_metric=self._base_score_metric_name,
-            train_coherence=self._evaluate_predictions(X_forest, aggregation_function=np.max, selected_trees=selected_trees),
-            dev_coherence=self._evaluate_predictions(X_omp, aggregation_function=np.max, selected_trees=selected_trees),
-            test_coherence=self._evaluate_predictions(self._dataset.X_test, aggregation_function=np.max, selected_trees=selected_trees),
-            train_correlation=self._evaluate_predictions(X_forest, aggregation_function=np.mean, selected_trees=selected_trees),
-            dev_correlation=self._evaluate_predictions(X_omp, aggregation_function=np.mean, selected_trees=selected_trees),
-            test_correlation=self._evaluate_predictions(self._dataset.X_test, aggregation_function=np.mean, selected_trees=selected_trees),
-            train_strength=self._compute_forest_strength(X_forest, y_forest, strength_metric, selected_trees),
-            dev_strength=self._compute_forest_strength(X_omp, y_omp, strength_metric, selected_trees),
-            test_strength=self._compute_forest_strength(self._dataset.X_test, self._dataset.y_test, strength_metric, selected_trees)
+            train_coherence=self._evaluate_predictions(train_predictions, aggregation_function=np.max),
+            dev_coherence=self._evaluate_predictions(dev_predictions, aggregation_function=np.max),
+            test_coherence=self._evaluate_predictions(test_predictions, aggregation_function=np.max),
+            train_correlation=self._evaluate_predictions(train_predictions, aggregation_function=np.mean),
+            dev_correlation=self._evaluate_predictions(dev_predictions, aggregation_function=np.mean),
+            test_correlation=self._evaluate_predictions(test_predictions, aggregation_function=np.mean),
+            train_scores=train_scores,
+            dev_scores=dev_scores,
+            test_scores=test_scores,
+            train_strength=train_strength,
+            dev_strength=dev_strength,
+            test_strength=test_strength
         )
         results.save(models_dir)
         self._logger.info("Base performance on test: {}".format(results.test_score_base))
@@ -263,19 +296,23 @@ class Trainer(object):
         self._logger.info(f'test_correlation: {results.test_correlation}')
         self._logger.info(f'test_strength: {results.test_strength}')

-        if type(model) not in [RandomForestRegressor, RandomForestClassifier]:
+        if type(model) in [OmpForestBinaryClassifier, OmpForestRegressor, OmpForestMulticlassClassifier,
+            NonNegativeOmpForestBinaryClassifier, NonNegativeOmpForestRegressor]:
             results = ModelRawResults(
                 model_weights='',
                 training_time=self._end_time - self._begin_time,
                 datetime=datetime.datetime.now(),
-                train_score=self.__score_func(model, X_forest, y_forest, False),
-                dev_score=self.__score_func(model, X_omp, y_omp, False),
-                test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test, False),
+                train_score=self.__score_func(model, X_forest, y_forest, False, extracted_forest_size=extracted_forest_size),
+                dev_score=self.__score_func(model, X_omp, y_omp, False, extracted_forest_size=extracted_forest_size),
+                test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test, False, extracted_forest_size=extracted_forest_size),
                 train_score_base=self.__score_func_base(model, X_forest, y_forest),
                 dev_score_base=self.__score_func_base(model, X_omp, y_omp),
                 test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
                 score_metric=self._score_metric_name,
-                base_score_metric=self._base_score_metric_name
+                base_score_metric=self._base_score_metric_name,
+                train_scores=train_scores,
+                dev_scores=dev_scores,
+                test_scores=test_scores
             )
             results.save(models_dir+'_no_weights')
             self._logger.info("Base performance on test without weights: {}".format(results.test_score_base))
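Editor's note on the refactored metrics above: _evaluate_predictions now takes the per-tree prediction matrix directly (computed once per subset and reused for coherence, correlation, and strength), and _compute_forest_strength returns both the per-tree scores and their mean. A standalone sketch of the same quantities, assuming predictions has shape (n_trees, n_samples):

# Editor's sketch; mirrors trainer.py after this patch.
import numpy as np
from sklearn.preprocessing import normalize

def evaluate_predictions(predictions, aggregation_function):
    # Row-normalize so the Gram matrix holds cosine similarities between trees;
    # subtracting the identity leaves only off-diagonal terms.
    # aggregation_function=np.max -> coherence, np.mean -> correlation.
    predictions = normalize(predictions)
    gram = predictions @ predictions.T - np.eye(len(predictions))
    return aggregation_function(np.abs(gram))

def compute_forest_strength(predictions, y, metric_function):
    # One score per tree, plus their mean: the "strength" of the selected forest.
    scores = np.array([metric_function(y, p) for p in predictions])
    return scores, np.mean(scores)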
diff --git a/code/compute_results.py b/code/compute_results.py
index 23e3db3ad7c95e5f5732b4d09e945ce53dfd4467..111cac2c71d6ac86a1557f0bfe02f4c615b038f1 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -4,6 +4,7 @@ from bolsonaro import LOG_PATH
 from bolsonaro.error_handling.logger_factory import LoggerFactory
 from bolsonaro.data.dataset_parameters import DatasetParameters
 from bolsonaro.data.dataset_loader import DatasetLoader
+from bolsonaro.data.task import Task

 import argparse
 import pathlib
@@ -19,6 +20,7 @@ from pyrsa.data.dataset import Dataset
 import matplotlib.pyplot as plt
 from sklearn.manifold import MDS
 from sklearn.preprocessing import normalize
+from sklearn.metrics import mean_squared_error, accuracy_score


 def vect2triu(dsm_vect, dim=None):
@@ -312,6 +314,12 @@ def extract_selected_trees_across_seeds(models_dir, results_dir, experiment_id):
         dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id)
         dataset = DatasetLoader.load(dataset_parameters)

+        strength_metric = mean_squared_error if dataset.task == Task.REGRESSION \
+            else lambda y_true, y_pred: accuracy_score(y_true, (y_pred - 0.5) * 2)
+
+        X_train = np.concatenate([dataset.X_train, dataset.X_dev])
+        y_train = np.concatenate([dataset.y_train, dataset.y_dev])
+
         # {{seed}:[]}
         experiment_selected_trees[seed] = list()

@@ -327,21 +335,39 @@ def extract_selected_trees_across_seeds(models_dir, results_dir, experiment_id):
             selected_trees = None
             with open(os.path.join(extracted_forest_size_path, 'selected_trees.pickle'), 'rb') as file:
                 selected_trees = pickle.load(file)
-            #test_score = np.mean([tree.score(dataset.X_test, dataset.y_test) for tree in selected_trees])
+            selected_trees_train_scores = np.array([strength_metric(y_train, tree.predict(X_train)) for tree in selected_trees])
+            selected_trees_test_scores = np.array([strength_metric(dataset.y_test, tree.predict(dataset.X_test)) for tree in selected_trees])
+            train_strength = np.mean(selected_trees_train_scores)
+            test_strength = np.mean(selected_trees_test_scores)
+
+            model_raw_results_path = os.path.join(results_dir, str(experiment_id), 'seeds', str(seed), 'extracted_forest_sizes',
+                str(extracted_forest_size), 'model_raw_results.pickle')
+            with open(model_raw_results_path, 'rb') as file:
+                model_raw_results = pickle.load(file)
+            model_raw_results['train_scores'] = selected_trees_train_scores
+            model_raw_results['dev_scores'] = selected_trees_train_scores
+            model_raw_results['test_scores'] = selected_trees_test_scores
+            model_raw_results['train_strength'] = train_strength
+            model_raw_results['dev_strength'] = train_strength
+            model_raw_results['test_strength'] = test_strength
+            with open(model_raw_results_path, 'wb') as file:
+                pickle.dump(model_raw_results, file)
+
+            """#test_score = np.mean([tree.score(dataset.X_test, dataset.y_test) for tree in selected_trees])
             #selected_trees_predictions = np.array([tree.score(dataset.X_test, dataset.y_test) for tree in selected_trees])
             selected_trees_predictions = [tree.predict(dataset.X_test) for tree in selected_trees]
             extracted_forest_size_bar.set_description(f'extracted_forest_size: {extracted_forest_size}')
             #experiment_selected_trees[seed].append(test_score)
             extracted_forest_size_bar.update(1)

             selected_trees_predictions = np.array(selected_trees_predictions)
-            selected_trees_predictions = normalize(selected_trees_predictions)
+            selected_trees_predictions = normalize(selected_trees_predictions)"""

             """mds = MDS(len(selected_trees_predictions))
             Y = mds.fit_transform(selected_trees_predictions)
             plt.scatter(Y[:, 0], Y[:, 1])
             plt.savefig(f'test_mds_{experiment_id}.png')"""

-            if int(extracted_forest_size) <= 267:
+            """if int(extracted_forest_size) <= 267:
                 forest_RDM = calc_rdm(Dataset(selected_trees_predictions), method='euclidean').get_vectors()
                 ranked_forest_RDM = np.apply_along_axis(rankdata, 1, forest_RDM.reshape(1, -1))
@@ -357,8 +383,8 @@ def extract_selected_trees_across_seeds(models_dir, results_dir, experiment_id):
                     rdm=ranked_forest_RDM,
                     file_path=f'test_scores_ranked_forest_RDM_id:{experiment_id}_seed:{seed}_size:{extracted_forest_size}.png',
                     condition_number=len(selected_trees_predictions)
-                )
-            break
+                )"""
+
             seed_bar.update(1)

     return experiment_selected_trees
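One subtlety worth flagging in the classification branch of strength_metric, both here and in trainer.py: each tree's raw output is mapped through (y_pred - 0.5) * 2, which sends predictions in {0, 1} to {-1, +1}. This only yields a meaningful accuracy_score if the binary labels are encoded as -1/+1, which appears to be the convention in this pipeline. A quick check of the mapping (editor's example, values are illustrative):

# Editor's note: maps {0, 1} tree outputs onto {-1, +1} before scoring,
# assuming binary labels are encoded as -1/+1 in this pipeline.
import numpy as np
from sklearn.metrics import accuracy_score

y_true = np.array([-1, 1, 1, -1])        # assumed label encoding
tree_pred = np.array([0., 1., 1., 1.])   # raw {0, 1} outputs of one tree
print(accuracy_score(y_true, (tree_pred - 0.5) * 2))  # -> 0.75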
@@ -875,13 +901,13 @@ if __name__ == "__main__":
         title='Forest strength of {}'.format(args.dataset_name))

     if args.compute_selected_trees_rdms:
-        root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage5_strength')
-        pathlib.Path(root_output_path).mkdir(parents=True, exist_ok=True)
+        root_output_path = os.path.join(args.results_dir, args.dataset_name, f'bolsonaro_models_29-03-20')
+        #pathlib.Path(root_output_path).mkdir(parents=True, exist_ok=True)

         _, _, _, with_params_extracted_forest_sizes, _ = \
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, 2)
         all_selected_trees_scores = list()
-        with tqdm([2, 3, 8]) as experiment_id_bar:
+        with tqdm(args.experiment_ids) as experiment_id_bar:
             for experiment_id in experiment_id_bar:
                 experiment_id_bar.set_description(f'experiment_id: {experiment_id}')
                 all_selected_trees_scores.append(extract_selected_trees_across_seeds(
diff --git a/code/prepare_models.py b/code/prepare_models.py
index 3cd9ea37033063652e15e0e1c84432b831b6562e..519105ff98535dfc25cb8a85a23ea9fda40fcd32 100644
--- a/code/prepare_models.py
+++ b/code/prepare_models.py
@@ -7,23 +7,22 @@ from tqdm import tqdm

 if __name__ == "__main__":
     models_source_path = 'models'
-    models_destination_path = 'bolsonaro_models_25-03-20'
-    #datasets = ['boston', 'diabetes', 'linnerud', 'breast_cancer', 'california_housing', 'diamonds',
-    #    'steel-plates', 'kr-vs-kp', 'kin8nm', 'spambase', 'gamma', 'lfw_pairs']
-    datasets = ['kin8nm']
+    models_destination_path = 'bolsonaro_models_27-03-20_v2'
+    datasets = ['boston', 'diabetes', 'linnerud', 'breast_cancer', 'california_housing', 'diamonds',
+        'steel-plates', 'kr-vs-kp', 'kin8nm', 'spambase', 'gamma', 'lfw_pairs']

     pathlib.Path(models_destination_path).mkdir(parents=True, exist_ok=True)

     with tqdm(datasets) as dataset_bar:
         for dataset in dataset_bar:
             dataset_bar.set_description(dataset)
-            found_paths = glob2.glob(os.path.join(models_source_path, dataset, 'stage5_new',
+            found_paths = glob2.glob(os.path.join(models_source_path, dataset, 'stage5_27-03-20',
                 '**', 'model_raw_results.pickle'), recursive=True)
             pathlib.Path(os.path.join(models_destination_path, dataset)).mkdir(parents=True, exist_ok=True)
             with tqdm(found_paths) as found_paths_bar:
                 for path in found_paths_bar:
                     found_paths_bar.set_description(path)
-                    new_path = path.replace(f'models/{dataset}/stage5_new/', '')
+                    new_path = path.replace(f'models/{dataset}/stage5_27-03-20/', '')
                     (new_path, filename) = os.path.split(new_path)
                     new_path = os.path.join(models_destination_path, dataset, new_path)
                     pathlib.Path(new_path).mkdir(parents=True, exist_ok=True)
diff --git a/code/train.py b/code/train.py
index 457e1c405d203e79c4b58f366ef6adfd596d8948..f5c69820d8e7533d3f7bc428ce7f33b147e42b7e 100644
--- a/code/train.py
+++ b/code/train.py
@@ -120,15 +120,19 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
             normalize_weights=parameters['normalize_weights'],
             seed=seed,
             hyperparameters=hyperparameters,
-            extraction_strategy=parameters['extraction_strategy']
+            extraction_strategy=parameters['extraction_strategy'],
+            intermediate_solutions_sizes=parameters['extracted_forest_size']
         )
-        model_parameters.save(sub_models_dir, experiment_id)

         model = ModelFactory.build(dataset.task, model_parameters)

         trainer.init(model, subsets_used=parameters['subsets_used'])
         trainer.train(model)
-        trainer.compute_results(model, sub_models_dir)
+        for extracted_forest_size in parameters['extracted_forest_size']:
+            sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + os.sep + str(extracted_forest_size)
+            pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
+            trainer.compute_results(model, sub_models_dir, extracted_forest_size=extracted_forest_size)
+            model_parameters.save(sub_models_dir, experiment_id)
     else:
         with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb:
             Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i],
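Editor's note on the restructured loop above: the non-negative model is trained once, with the full extracted_forest_size list fed in as intermediate_solutions_sizes, and compute_results is then invoked per size against the recorded intermediate solutions; model_parameters.save also moves inside the loop so each size directory is self-describing. A sketch of the on-disk layout this produces ('models_dir' and the sizes are placeholders):

# Editor's sketch: directory layout produced by the new per-size loop.
import os
import pathlib

models_dir = 'models_dir'                     # placeholder root
extracted_forest_sizes = [10, 25, 50, 100]    # parameters['extracted_forest_size']

for extracted_forest_size in extracted_forest_sizes:
    sub_models_dir = os.path.join(models_dir, 'extracted_forest_sizes', str(extracted_forest_size))
    pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
    # per size: trainer.compute_results(model, sub_models_dir,
    #     extracted_forest_size=extracted_forest_size)
    # then model_parameters.save(sub_models_dir, experiment_id)
# -> models_dir/extracted_forest_sizes/{10,25,50,100}/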