Commit 87fb436c authored by Charly Lamothe

Fix lfw_pairs loading. Prepare all new experiments: omp_distillation, predictions coherence, predictions correlation, normalize_D when using OMP, and n_jobs=-1 in the SOTA methods. In the experiments script, test both the train+dev,train+dev and train,dev subset splits.
parent fcbc03e2
Related merge request: !23 Resolve "integration-sota"
from bolsonaro.data.dataset import Dataset
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.task import Task
-from bolsonaro.utils import change_binary_func_load, change_binary_func_openml
+from bolsonaro.utils import change_binary_func_load, change_binary_func_openml, binarize_class_data
from sklearn.datasets import load_boston, load_iris, load_diabetes, \
    load_digits, load_linnerud, load_wine, load_breast_cancer

@@ -81,6 +81,8 @@ class DatasetLoader(object):
        elif name == 'lfw_pairs':
            dataset = fetch_lfw_pairs()
            X, y = dataset.data, dataset.target
+            possible_classes = sorted(set(y))
+            y = binarize_class_data(y, possible_classes[-1])
            task = Task.BINARYCLASSIFICATION
        elif name == 'covtype':
            X, y = fetch_covtype(random_state=dataset_parameters.random_state, shuffle=True, return_X_y=True)
...
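The lfw_pairs fix relies on binarize_class_data from bolsonaro.utils, whose body is not part of this diff. A minimal sketch consistent with the call site, assuming it maps the chosen positive class to one label and every other class to the other (the exact output encoding, e.g. 0/1 versus -1/+1, is an assumption):

    import numpy as np

    def binarize_class_data(data, positive_class, negative_label=0, positive_label=1):
        # Hypothetical re-implementation: collapse a label vector to two classes.
        data = np.asarray(data)
        return np.where(data == positive_class, positive_label, negative_label)

    # Usage mirroring the diff: the highest class label becomes the positive class.
    y = np.array([0, 1, 1, 0, 1])
    possible_classes = sorted(set(y))
    y_bin = binarize_class_data(y, possible_classes[-1])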
@@ -55,7 +55,7 @@ class EnsembleSelectionForestClassifier(EnsembleSelectionForest, metaclass=ABCMeta):
    @staticmethod
    def init_estimator(model_parameters):
        return RandomForestClassifier(**model_parameters.hyperparameters,
-            random_state=model_parameters.seed, n_jobs=2)
+            random_state=model_parameters.seed, n_jobs=-1)

    def _aggregate(self, predictions):
        return aggregation_classification(predictions)

@@ -90,7 +90,7 @@ class EnsembleSelectionForestRegressor(EnsembleSelectionForest, metaclass=ABCMeta):
    @staticmethod
    def init_estimator(model_parameters):
        return RandomForestRegressor(**model_parameters.hyperparameters,
-            random_state=model_parameters.seed, n_jobs=2)
+            random_state=model_parameters.seed, n_jobs=-1)

    def _aggregate(self, predictions):
        return aggregation_regression(predictions)

@@ -108,32 +108,3 @@ class EnsembleSelectionForestRegressor(EnsembleSelectionForest, metaclass=ABCMeta):
    @staticmethod
    def _worse_score_idx(array):
        return np.argmax(array)
-
-    # @staticmethod
-    # def generate_library(X_train, y_train, random_state=None):
-    #     criterion_arr = ["mse"]#, "friedman_mse", "mae"]
-    #     splitter_arr = ["best"]#, "random"]
-    #     depth_arr = [i for i in range(5, 20, 1)]
-    #     min_samples_split_arr = [i for i in range(2, 20, 1)]
-    #     min_samples_leaf_arr = [i for i in range(2, 20, 1)]
-    #     max_features_arr = ["sqrt"]#["auto", "sqrt", "log2"]
-    #
-    #     library = list()
-    #     with tqdm(total=len(criterion_arr) * len(splitter_arr) * \
-    #         len(depth_arr) * len(min_samples_split_arr) * len(min_samples_leaf_arr) * \
-    #         len(max_features_arr)) as bar:
-    #         bar.set_description('Generating library')
-    #         for criterion in criterion_arr:
-    #             for splitter in splitter_arr:
-    #                 for depth in depth_arr:
-    #                     for min_samples_split in min_samples_split_arr:
-    #                         for min_samples_leaf in min_samples_leaf_arr:
-    #                             for max_features in max_features_arr:
-    #                                 t = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=depth, min_samples_split=min_samples_split,
-    #                                     min_samples_leaf=min_samples_leaf, max_features=max_features, random_state=random_state)
-    #                                 t.fit(X_train, y_train)
-    #                                 library.append(t)
-    #                                 bar.update(1)
-    #     return library
+import time
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
...
@@ -49,7 +49,7 @@ class KMeansForestRegressor(KmeansForest, metaclass=ABCMeta):
    @staticmethod
    def init_estimator(model_parameters):
        return RandomForestRegressor(**model_parameters.hyperparameters,
-            random_state=model_parameters.seed, n_jobs=2)
+            random_state=model_parameters.seed, n_jobs=-1)

    def _aggregate(self, predictions):
        return aggregation_regression(predictions)

@@ -70,7 +70,7 @@ class KMeansForestClassifier(KmeansForest, metaclass=ABCMeta):
    @staticmethod
    def init_estimator(model_parameters):
        return RandomForestClassifier(**model_parameters.hyperparameters,
-            random_state=model_parameters.seed, n_jobs=2)
+            random_state=model_parameters.seed, n_jobs=-1)

    def _aggregate(self, predictions):
        return aggregation_classification(predictions)
...
@@ -14,12 +14,12 @@ import pickle

class ModelFactory(object):
    @staticmethod
-    def build(task, model_parameters, library=None):
+    def build(task, model_parameters):
        if task not in [Task.BINARYCLASSIFICATION, Task.REGRESSION, Task.MULTICLASSIFICATION]:
            raise ValueError("Unsupported task '{}'".format(task))
        if task == Task.BINARYCLASSIFICATION:
-            if model_parameters.extraction_strategy == 'omp':
+            if model_parameters.extraction_strategy in ['omp', 'omp_distillation']:
                return OmpForestBinaryClassifier(model_parameters)
            elif model_parameters.extraction_strategy == 'random':
                return RandomForestClassifier(**model_parameters.hyperparameters,

@@ -36,7 +36,7 @@ class ModelFactory(object):
            else:
                raise ValueError('Invalid extraction strategy')
        elif task == Task.REGRESSION:
-            if model_parameters.extraction_strategy == 'omp':
+            if model_parameters.extraction_strategy in ['omp', 'omp_distillation']:
                return OmpForestRegressor(model_parameters)
            elif model_parameters.extraction_strategy == 'random':
                return RandomForestRegressor(**model_parameters.hyperparameters,

@@ -53,7 +53,7 @@ class ModelFactory(object):
            else:
                raise ValueError('Invalid extraction strategy')
        elif task == Task.MULTICLASSIFICATION:
-            if model_parameters.extraction_strategy == 'omp':
+            if model_parameters.extraction_strategy in ['omp', 'omp_distillation']:
                return OmpForestMulticlassClassifier(model_parameters)
            elif model_parameters.extraction_strategy == 'random':
                return RandomForestClassifier(**model_parameters.hyperparameters,
...
@@ -9,7 +9,8 @@ class ModelRawResults(object):

    def __init__(self, model_weights, training_time,
        datetime, train_score, dev_score, test_score,
        train_score_base, dev_score_base,
-        test_score_base, score_metric, base_score_metric):
+        test_score_base, score_metric, base_score_metric,
+        coherence='', correlation=''):
        self._model_weights = model_weights
        self._training_time = training_time

@@ -22,6 +23,8 @@ class ModelRawResults(object):
        self._test_score_base = test_score_base
        self._score_metric = score_metric
        self._base_score_metric = base_score_metric
+        self._coherence = coherence
+        self._correlation = correlation

    @property
    def model_weights(self):

@@ -67,6 +70,14 @@ class ModelRawResults(object):
    def base_score_metric(self):
        return self._base_score_metric

+    @property
+    def coherence(self):
+        return self._coherence
+
+    @property
+    def correlation(self):
+        return self._correlation
+
    def save(self, models_dir):
        if not os.path.exists(models_dir):
            os.mkdir(models_dir)
...
@@ -36,11 +36,12 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
        return self._base_forest_estimator.estimators_

    # sklearn baseestimator api methods
-    def fit(self, X_forest, y_forest, X_omp, y_omp):
+    def fit(self, X_forest, y_forest, X_omp, y_omp, use_distillation=False):
        # print(y_forest.shape)
        # print(set([type(y) for y in y_forest]))
        self._base_forest_estimator.fit(X_forest, y_forest)
-        self._extract_subforest(X_omp, y_omp) # type: OrthogonalMatchingPursuit
+        self._extract_subforest(X_omp,
+            self.predict_base_estimator(X_forest) if use_distillation else y_omp) # type: OrthogonalMatchingPursuit
        return self

    def _extract_subforest(self, X, y):
...
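The new use_distillation flag changes what OMP regresses onto: instead of the ground-truth targets y_omp, the sub-forest is fitted to reproduce the full forest's own predictions, i.e. a distillation target. A self-contained sketch of that idea in plain scikit-learn, independent of the project's classes (all names below are illustrative):

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import OrthogonalMatchingPursuit

    X, y = make_regression(n_samples=300, n_features=8, random_state=0)
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

    # Dictionary D: one column per tree, holding that tree's predictions.
    D = np.column_stack([tree.predict(X) for tree in forest.estimators_])

    # Distillation: regress onto the forest's aggregated prediction, not y.
    target = forest.predict(X)
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=10).fit(D, target)
    kept_trees = np.flatnonzero(omp.coef_)  # indices of the retained sub-forest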
@@ -87,7 +87,7 @@ class SimilarityForestRegressor(SimilarityForest, metaclass=ABCMeta):
    @staticmethod
    def init_estimator(model_parameters):
        return RandomForestRegressor(**model_parameters.hyperparameters,
-            random_state=model_parameters.seed, n_jobs=2)
+            random_state=model_parameters.seed, n_jobs=-1)

    def _aggregate(self, predictions):
        return aggregation_regression(predictions)

@@ -111,7 +111,7 @@ class SimilarityForestClassifier(SimilarityForest, metaclass=ABCMeta):
    @staticmethod
    def init_estimator(model_parameters):
        return RandomForestClassifier(**model_parameters.hyperparameters,
-            random_state=model_parameters.seed, n_jobs=2)
+            random_state=model_parameters.seed, n_jobs=-1)

    def _aggregate(self, predictions):
        return aggregation_classification(predictions)
...
@@ -10,6 +10,7 @@ from . import LOG_PATH

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
+from sklearn.preprocessing import normalize
import time
import datetime
import numpy as np

@@ -77,7 +78,7 @@ class Trainer(object):
        else:
            raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used))

-    def train(self, model, extracted_forest_size=None):
+    def train(self, model, extracted_forest_size=None, seed=None, use_distillation=False):
        """
        :param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor,
            OmpForestBinaryClassifier, OmpForestMulticlassClassifier.

@@ -88,6 +89,7 @@ class Trainer(object):
        if type(model) in [RandomForestRegressor, RandomForestClassifier]:
            if extracted_forest_size is not None:
                estimators_index = np.arange(len(model.estimators_))
+                np.random.seed(seed)
                np.random.shuffle(estimators_index)
                choosen_estimators = estimators_index[:extracted_forest_size]
                model.estimators_ = np.array(model.estimators_)[choosen_estimators]
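Seeding NumPy's global RNG right before the shuffle makes the 'random' extraction baseline reproducible: a given seed now always selects the same subset of trees. A standalone illustration of the pattern (the function name is ours, not the project's):

    import numpy as np

    def pick_random_subforest(n_estimators, extracted_forest_size, seed):
        indices = np.arange(n_estimators)
        np.random.seed(seed)       # same seed -> same permutation
        np.random.shuffle(indices)
        return indices[:extracted_forest_size]

    assert np.array_equal(pick_random_subforest(100, 10, seed=1),
                          pick_random_subforest(100, 10, seed=1))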
@@ -98,12 +100,22 @@ class Trainer(object):
            )
            self._selected_trees = model.estimators_
        else:
-            model.fit(
-                self._X_forest,
-                self._y_forest,
-                self._X_omp,
-                self._y_omp
-            )
+            if type(model) in [OmpForestRegressor, OmpForestBinaryClassifier, OmpForestMulticlassClassifier] and \
+                use_distillation:
+                model.fit(
+                    self._X_forest, # X_train or X_train+X_dev
+                    self._y_forest,
+                    self._X_omp, # X_train+X_dev or X_dev
+                    self._y_omp,
+                    use_distillation=use_distillation
+                )
+            else:
+                model.fit(
+                    self._X_forest, # X_train or X_train+X_dev
+                    self._y_forest,
+                    self._X_omp, # X_train+X_dev or X_dev
+                    self._y_omp
+                )
        self._end_time = time.time()

    def __score_func(self, model, X, y_true, weights=True):

@@ -141,6 +153,20 @@ class Trainer(object):
            result = self._base_regression_score_metric(y_true, y_pred)
        return result

+    def _evaluate_predictions(self, model, X, aggregation_function):
+        if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor,
+            OmpForestBinaryClassifier, OmpForestMulticlassClassifier]:
+            estimators = model.forest
+            estimators = np.asarray(estimators)[model._omp.coef_ != 0]
+        elif type(model) in [RandomForestRegressor, RandomForestClassifier]:
+            estimators = model.estimators_
+        predictions = np.array([tree.predict(X) for tree in estimators])
+        predictions = normalize(predictions)
+        return aggregation_function(np.abs((predictions @ predictions.T - np.eye(len(predictions)))))
+
    def compute_results(self, model, models_dir):
        """
        :param model: Object with

@@ -173,7 +199,9 @@ class Trainer(object):
            dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
            test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
            score_metric=self._score_metric_name,
-            base_score_metric=self._base_score_metric_name
+            base_score_metric=self._base_score_metric_name,
+            coherence=self._evaluate_predictions(model, self._dataset.X_train, aggregation_function=np.max),
+            correlation=self._evaluate_predictions(model, self._dataset.X_train, aggregation_function=np.mean)
        )
        results.save(models_dir)
        self._logger.info("Base performance on test: {}".format(results.test_score_base))

@@ -201,10 +229,10 @@ class Trainer(object):
        )
        results.save(models_dir+'_no_weights')
        self._logger.info("Base performance on test without weights: {}".format(results.test_score_base))
-        self._logger.info("Performance on test: {}".format(results.test_score))
+        self._logger.info("Performance on test without weights: {}".format(results.test_score))
        self._logger.info("Base performance on train without weights: {}".format(results.train_score_base))
-        self._logger.info("Performance on train: {}".format(results.train_score))
+        self._logger.info("Performance on train without weights: {}".format(results.train_score))
        self._logger.info("Base performance on dev without weights: {}".format(results.dev_score_base))
-        self._logger.info("Performance on dev: {}".format(results.dev_score))
+        self._logger.info("Performance on dev without weights: {}".format(results.dev_score))
@@ -150,6 +150,35 @@ def extract_weights_across_seeds(models_dir, results_dir, experiment_id):
    return experiment_weights

+def extract_coherences_across_seeds(models_dir, results_dir, experiment_id):
+    experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
+    experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
+    experiment_coherences = dict()
+
+    # For each seed results stored in models/{experiment_id}/seeds
+    seeds = os.listdir(experiment_seed_root_path)
+    seeds.sort(key=int)
+    for seed in seeds:
+        experiment_seed_path = experiment_seed_root_path + os.sep + seed # models/{experiment_id}/seeds/{seed}
+        extracted_forest_sizes_root_path = experiment_seed_path + os.sep + 'extracted_forest_sizes' # models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
+        # {{seed}:[]}
+        experiment_coherences[seed] = list()
+
+        # List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
+        extracted_forest_sizes = os.listdir(extracted_forest_sizes_root_path)
+        extracted_forest_sizes = [nb_tree for nb_tree in extracted_forest_sizes if not 'no_weights' in nb_tree]
+        extracted_forest_sizes.sort(key=int)
+        for extracted_forest_size in extracted_forest_sizes:
+            # models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}
+            extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size
+            # Load models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}/model_raw_results.pickle file
+            model_raw_results = ModelRawResults.load(extracted_forest_size_path)
+            # Save the coherence
+            experiment_coherences[seed].append(model_raw_results.coherence)
+
+    return experiment_coherences
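For reference, the on-disk layout this helper walks, reconstructed from its inline path comments, is models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}/model_raw_results.pickle, with the *_no_weights result directories filtered out before the numeric sort. The returned dict maps each seed (as a string) to the list of coherence values ordered by extracted forest size.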
if __name__ == "__main__":
    # get environment variables in .env

@@ -507,7 +536,7 @@ if __name__ == "__main__":
            ylabel=base_with_params_experiment_score_metric,
            title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))

-    if args.plot_weight_density:
+    """if args.plot_weight_density:
        root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage{args.stage}')

        if args.stage == 1:

@@ -542,6 +571,27 @@ if __name__ == "__main__":
        for (experiment_label, experiment_id) in omp_experiment_ids:
            logger.info(f'Computing weight density plot for experiment {experiment_label}...')
            experiment_weights = extract_weights_across_seeds(args.models_dir, args.results_dir, experiment_id)
-            Plotter.weight_density(experiment_weights, os.path.join(root_output_path, f'weight_density_{experiment_label}.png'))
+            Plotter.weight_density(experiment_weights, os.path.join(root_output_path, f'weight_density_{experiment_label}.png'))"""
+
+    if args.plot_weight_density:
+        logger.info(f'Computing weight density plot for experiment {experiment_label}...')
+        experiment_weights = extract_weights_across_seeds(args.models_dir, args.results_dir, experiment_id)
+        Plotter.weight_density(experiment_weights, os.path.join(root_output_path, f'weight_density_{experiment_label}.png'))
+
+    if args.plot_preds_coherence:
+        root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage5')
+        all_labels = ['random', 'omp', 'omp_normalize_D']
+        random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
+            with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
+            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, 2)
+        coherence_values = [extract_coherences_across_seeds(args.models_dir, args.results_dir, i) for i in [2, 3, 4]]
+        Plotter.plot_stage2_losses(
+            file_path=root_output_path + os.sep + f"coherences_{'-'.join(all_labels)}_30_all.png",
+            all_experiment_scores=coherence_values,
+            all_labels=all_labels,
+            x_value=with_params_extracted_forest_sizes,
+            xlabel='Number of trees extracted',
+            ylabel='Coherence',
+            title='Coherence values of {}'.format(args.dataset_name))
+        logger.info(f'Computing preds coherence plot...')

    logger.info('Done.')
@@ -55,12 +55,6 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verbose):

    trainer = Trainer(dataset)

-    # if parameters['extraction_strategy'] == 'ensemble':
-    if False:
-        library = EnsembleSelectionForestRegressor.generate_library(dataset.X_train, dataset.y_train, random_state=seed)
-    else:
-        library = None
-
    if parameters['extraction_strategy'] == 'random':
        pretrained_model_parameters = ModelParameters(
            extracted_forest_size=parameters['forest_size'],

@@ -71,7 +65,7 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verbose):
            hyperparameters=hyperparameters,
            extraction_strategy=parameters['extraction_strategy']
        )
-        pretrained_estimator = ModelFactory.build(dataset.task, pretrained_model_parameters, library=library)
+        pretrained_estimator = ModelFactory.build(dataset.task, pretrained_model_parameters)

        pretraned_trainer = Trainer(dataset)
        pretraned_trainer.init(pretrained_estimator, subsets_used=parameters['subsets_used'])
        pretrained_estimator.fit(

@@ -85,8 +79,9 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verbose):
    if parameters['extraction_strategy'] != 'none':
        with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb:
            Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i],
-                models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer, library,
-                pretrained_estimator=pretrained_estimator, pretrained_model_parameters=pretrained_model_parameters)
+                models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer,
+                pretrained_estimator=pretrained_estimator, pretrained_model_parameters=pretrained_model_parameters,
+                use_distillation=parameters['extraction_strategy'] == 'omp_distillation')
                for i in range(len(parameters['extracted_forest_size'])))
    else:
        forest_size = hyperparameters['n_estimators']

@@ -118,7 +113,7 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verbose):
        )
        model_parameters.save(sub_models_dir, experiment_id)

-        model = ModelFactory.build(dataset.task, model_parameters, library=library)
+        model = ModelFactory.build(dataset.task, model_parameters)

        trainer.init(model, subsets_used=parameters['subsets_used'])
        trainer.train(model)

@@ -127,8 +122,8 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verbose):
    seed_job_pb.update(1)

def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir,
-    seed, parameters, dataset, hyperparameters, experiment_id, trainer, library,
-    pretrained_estimator=None, pretrained_model_parameters=None):
+    seed, parameters, dataset, hyperparameters, experiment_id, trainer,
+    pretrained_estimator=None, pretrained_model_parameters=None, use_distillation=False):

    logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_extracted_forest_size{}_ti{}'.format(
        seed, extracted_forest_size, threading.get_ident()))

@@ -163,13 +158,14 @@ def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir,
            extraction_strategy=parameters['extraction_strategy']
        )
        model_parameters.save(sub_models_dir, experiment_id)
-        model = ModelFactory.build(dataset.task, model_parameters, library=library)
+        model = ModelFactory.build(dataset.task, model_parameters)
    else:
        model = copy.deepcopy(pretrained_estimator)
        pretrained_model_parameters.save(sub_models_dir, experiment_id)

    trainer.init(model, subsets_used=parameters['subsets_used'])
-    trainer.train(model, extracted_forest_size=extracted_forest_size)
+    trainer.train(model, extracted_forest_size=extracted_forest_size, seed=seed,
+        use_distillation=use_distillation)
    trainer.compute_results(model, sub_models_dir)
    """

@@ -247,8 +243,8 @@ if __name__ == "__main__":
    else:
        parameters = args.__dict__

-    if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity_similarities', 'similarity_predictions', 'kmeans', 'ensemble']:
-        raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy))
+    if parameters['extraction_strategy'] not in ['omp', 'omp_distillation', 'random', 'none', 'similarity_similarities', 'similarity_predictions', 'kmeans', 'ensemble']:
+        raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters['extraction_strategy']))

    pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
...
#!/bin/bash
core_number=5
-core_number_sota=50
+core_number_sota=5
-walltime=1:00
+walltime=5:00
walltime_sota=5:00
seeds='1 2 3 4 5'

-for dataset in kin8nm kr-vs-kp spambase steel-plates diabetes diamonds boston california_housing
+for dataset in boston diabetes linnerud breast_cancer california_housing diamonds steel-plates kr-vs-kp kin8nm spambase musk gamma lfw_pairs
do
-    oarsub -p "(gpu is null)" -l /core=$core_number,walltime=$walltime "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=none --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=1 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=$core_number,walltime=$walltime "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=none --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=1 --models_dir=models/$dataset/stage5_new --subsets_used train+dev,train+dev"
-    oarsub -p "(gpu is null)" -l /core=$core_number,walltime=$walltime "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=random --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=2 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=$core_number,walltime=$walltime "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=random --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=2 --models_dir=models/$dataset/stage5_new --subsets_used train+dev,train+dev"
-    oarsub -p "(gpu is null)" -l /core=$core_number,walltime=$walltime "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=omp --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=3 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=$core_number,walltime=$walltime "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=omp --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=3 --models_dir=models/$dataset/stage5_new --subsets_used train+dev,train+dev --normalize_D"
-    oarsub -p "(gpu is null)" -l /core=$core_number_sota,walltime=$walltime_sota "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=similarity --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=4 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=$core_number,walltime=$walltime "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=omp_distillation --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=4 --models_dir=models/$dataset/stage5_new --subsets_used train+dev,train+dev --normalize_D"
-    oarsub -p "(gpu is null)" -l /core=$core_number_sota,walltime=$walltime_sota "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=kmeans --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=5 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=$core_number_sota,walltime=$walltime_sota "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=kmeans --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=5 --models_dir=models/$dataset/stage5_new --subsets_used train+dev,train+dev"
-    oarsub -p "(gpu is null)" -l /core=$core_number_sota,walltime=$walltime_sota "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=ensemble --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=6 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=$core_number_sota,walltime=$walltime_sota "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=similarity_similarities --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=6 --models_dir=models/$dataset/stage5_new --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=$core_number_sota,walltime=$walltime_sota "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=similarity_predictions --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=7 --models_dir=models/$dataset/stage5_new --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=$core_number_sota,walltime=$walltime_sota "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=ensemble --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=8 --models_dir=models/$dataset/stage5_new --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=$core_number,walltime=$walltime "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=none --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=9 --models_dir=models/$dataset/stage5_new --subsets_used train,dev"
+    oarsub -p "(gpu is null)" -l /core=$core_number,walltime=$walltime "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=random --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=10 --models_dir=models/$dataset/stage5_new --subsets_used train,dev"
+    oarsub -p "(gpu is null)" -l /core=$core_number,walltime=$walltime "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=omp --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=11 --models_dir=models/$dataset/stage5_new --subsets_used train,dev --normalize_D"
+    oarsub -p "(gpu is null)" -l /core=$core_number,walltime=$walltime "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=omp_distillation --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=12 --models_dir=models/$dataset/stage5_new --subsets_used train,dev --normalize_D"
+    oarsub -p "(gpu is null)" -l /core=$core_number_sota,walltime=$walltime_sota "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=kmeans --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=13 --models_dir=models/$dataset/stage5_new --subsets_used train,dev"
+    oarsub -p "(gpu is null)" -l /core=$core_number_sota,walltime=$walltime_sota "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=similarity_similarities --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=14 --models_dir=models/$dataset/stage5_new --subsets_used train,dev"
+    oarsub -p "(gpu is null)" -l /core=$core_number_sota,walltime=$walltime_sota "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=similarity_predictions --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=15 --models_dir=models/$dataset/stage5_new --subsets_used train,dev"
+    oarsub -p "(gpu is null)" -l /core=$core_number_sota,walltime=$walltime_sota "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=ensemble --extracted_forest_size_stop=1.0 --extracted_forest_size_samples=30 --experiment_id=16 --models_dir=models/$dataset/stage5_new --subsets_used train,dev"
done