Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: luc.giffon/bolsonaro
Commits on Source (61). Showing with 607 additions and 503 deletions.
from bolsonaro.data.dataset import Dataset
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.task import Task
-from bolsonaro.utils import change_binary_func_load
+from bolsonaro.utils import change_binary_func_load, change_binary_func_openml
from sklearn.datasets import load_boston, load_iris, load_diabetes, \
    load_digits, load_linnerud, load_wine, load_breast_cancer
from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
    fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \
-    fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
+    fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing, \
+    fetch_openml
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import random
@@ -30,13 +31,15 @@ class DatasetLoader(object):
    dataset_names = ['boston', 'iris', 'diabetes', 'digits', 'linnerud', 'wine',
        'breast_cancer', 'olivetti_faces', '20newsgroups_vectorized', 'lfw_people',
-        'lfw_pairs', 'covtype', 'rcv1', 'california_housing', 'diamonds']
+        'lfw_pairs', 'covtype', 'rcv1', 'california_housing', 'diamonds', 'steel-plates',
+        'kr-vs-kp', 'kin8nm', 'spambase', 'musk', 'gamma']
    dataset_seed_numbers = {'boston':15, 'iris':15, 'diabetes':15, 'digits':5,
        'linnerud':15, 'wine':15, 'breast_cancer':15, 'olivetti_faces':15,
        '20newsgroups_vectorized':3, 'lfw_people':3,
        'lfw_pairs':3, 'covtype':3, 'rcv1':3, 'california_housing':3,
-        'diamonds': 15}
+        'diamonds': 15, 'steel-plates': 15, 'kr-vs-kp': 15, 'kin8nm': 15,
+        'spambase': 15, 'musk': 15, 'gamma': 15}
    @staticmethod
    def load(dataset_parameters):
@@ -103,6 +106,24 @@ class DatasetLoader(object):
            df['clarity'] = label_clarity.fit_transform(df['clarity'])
            X, y = df.drop(['price'], axis=1), df['price']
            task = Task.REGRESSION
+        elif name == 'steel-plates':
+            dataset_loading_func = change_binary_func_openml('steel-plates-fault')
+            task = Task.BINARYCLASSIFICATION
+        elif name == 'kr-vs-kp':
+            dataset_loading_func = change_binary_func_openml('kr-vs-kp')
+            task = Task.BINARYCLASSIFICATION
+        elif name == 'kin8nm':
+            X, y = fetch_openml('kin8nm', return_X_y=True)
+            task = Task.REGRESSION
+        elif name == 'spambase':
+            dataset_loading_func = change_binary_func_openml('spambase')
+            task = Task.BINARYCLASSIFICATION
+        elif name == 'musk':
+            dataset_loading_func = change_binary_func_openml('musk')
+            task = Task.BINARYCLASSIFICATION
+        elif name == 'gamma':
+            dataset_loading_func = change_binary_func_openml('MagicTelescope')
+            task = Task.BINARYCLASSIFICATION
        else:
            raise ValueError("Unsupported dataset '{}'".format(name))
...
from bolsonaro.utils import tqdm_joblib

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from sklearn.cluster import KMeans
from abc import abstractmethod, ABCMeta
import numpy as np
from scipy.stats import mode
from joblib import Parallel, delayed
from tqdm import tqdm

class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
    """
    'On extreme pruning of random forest ensembles for real-time predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and Eyad Elyan.
    """

    def __init__(self, models_parameters, score_metric=mean_squared_error):
        self._models_parameters = models_parameters
        self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
            random_state=self._models_parameters.seed, n_jobs=-1)
        self._extracted_forest_size = self._models_parameters.extracted_forest_size
        self._score_metric = score_metric

    @property
    def models_parameters(self):
        return self._models_parameters

    def fit(self, X_train, y_train, X_val, y_val):
        self._estimator.fit(X_train, y_train)
        predictions = list()
        for tree in self._estimator.estimators_:
            predictions.append(tree.predict(X_train))
        predictions = np.array(predictions)

        kmeans = KMeans(n_clusters=self._extracted_forest_size, random_state=self._models_parameters.seed).fit(predictions)
        labels = np.array(kmeans.labels_)

        # For each cluster select the best tree on the validation set
        extracted_forest_sizes = list(range(self._extracted_forest_size))
        with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=True)) as prune_forest_job_pb:
            pruned_forest = Parallel(n_jobs=-1)(delayed(self._prune_forest_job)(prune_forest_job_pb,
                extracted_forest_sizes[i], labels, X_val, y_val, self._score_metric)
                for i in range(self._extracted_forest_size))

        self._estimator.estimators_ = pruned_forest

    def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
        index = np.where(labels == c)[0]
        with tqdm_joblib(tqdm(total=len(index), disable=True)) as cluster_job_pb:
            cluster = Parallel(n_jobs=-1)(delayed(self._cluster_job)(cluster_job_pb, index[i], X_val,
                y_val, score_metric) for i in range(len(index)))
        best_tree_index = np.argmax(cluster)
        prune_forest_job_pb.update()
        return self._estimator.estimators_[index[best_tree_index]]

    def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric):
        y_val_pred = self._estimator.estimators_[i].predict(X_val)
        tree_pred = score_metric(y_val, y_val_pred)
        cluster_job_pb.update()
        return tree_pred

    def predict(self, X):
        return self._estimator.predict(X)

    def score(self, X, y):
        predictions = list()
        for tree in self._estimator.estimators_:
            predictions.append(tree.predict(X))
        predictions = np.array(predictions)
        mean_predictions = np.mean(predictions, axis=0)
        score = self._score_metric(mean_predictions, y)
        return score

    def predict_base_estimator(self, X):
        return self._estimator.predict(X)
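A minimal, self-contained sketch of the selection idea implemented by KMeansForestRegressor above (cluster the trees by the similarity of their training-set predictions, then keep one representative per cluster chosen on a validation set). The helper name kmeans_prune and the choice of argmin of the MSE are illustrative only, not part of the project's API:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error

def kmeans_prune(forest, X_train, X_val, y_val, n_kept, seed=0):
    # One row per tree: its predictions on the training set.
    all_preds = np.array([tree.predict(X_train) for tree in forest.estimators_])
    labels = KMeans(n_clusters=n_kept, random_state=seed).fit(all_preds).labels_
    kept = []
    for c in range(n_kept):
        members = np.where(labels == c)[0]
        # Keep the tree of this cluster with the lowest validation error.
        errors = [mean_squared_error(y_val, forest.estimators_[i].predict(X_val)) for i in members]
        kept.append(forest.estimators_[members[int(np.argmin(errors))]])
    return kept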
@@ -2,6 +2,7 @@ from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
+from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
from bolsonaro.data.task import Task
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
@@ -22,9 +23,11 @@ class ModelFactory(object):
            elif model_parameters.extraction_strategy == 'random':
                return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
                    random_state=model_parameters.seed)
-            else:
+            elif model_parameters.extraction_strategy == 'none':
                return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
                    random_state=model_parameters.seed)
+            else:
+                raise ValueError('Invalid extraction strategy')
        elif task == Task.REGRESSION:
            if model_parameters.extraction_strategy == 'omp':
                return OmpForestRegressor(model_parameters)
@@ -33,15 +36,21 @@ class ModelFactory(object):
                    random_state=model_parameters.seed)
            elif model_parameters.extraction_strategy == 'similarity':
                return SimilarityForestRegressor(model_parameters)
-            else:
+            elif model_parameters.extraction_strategy == 'kmeans':
+                return KMeansForestRegressor(model_parameters)
+            elif model_parameters.extraction_strategy == 'none':
                return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'],
                    random_state=model_parameters.seed)
+            else:
+                raise ValueError('Invalid extraction strategy')
        elif task == Task.MULTICLASSIFICATION:
            if model_parameters.extraction_strategy == 'omp':
                return OmpForestMulticlassClassifier(model_parameters)
            elif model_parameters.extraction_strategy == 'random':
                return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
                    random_state=model_parameters.seed)
-            else:
+            elif model_parameters.extraction_strategy == 'none':
                return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
                    random_state=model_parameters.seed)
+            else:
+                raise ValueError('Invalid extraction strategy')
@@ -6,12 +6,12 @@ import datetime
class ModelRawResults(object):

-    def __init__(self, model_object, training_time,
+    def __init__(self, model_weights, training_time,
        datetime, train_score, dev_score, test_score,
        train_score_base, dev_score_base,
        test_score_base, score_metric, base_score_metric):
-        self._model_object = model_object
+        self._model_weights = model_weights
        self._training_time = training_time
        self._datetime = datetime
        self._train_score = train_score
@@ -24,8 +24,8 @@ class ModelRawResults(object):
        self._base_score_metric = base_score_metric

    @property
-    def model_object(self):
-        return self.model_object
+    def model_weights(self):
+        return self.model_weights

    @property
    def training_time(self):
@@ -68,6 +68,8 @@ class ModelRawResults(object):
        return self._base_score_metric

    def save(self, models_dir):
+        if not os.path.exists(models_dir):
+            os.mkdir(models_dir)
        save_obj_to_pickle(models_dir + os.sep + 'model_raw_results.pickle',
            self.__dict__)
...
@@ -8,6 +8,7 @@ from sklearn.base import BaseEstimator
class OmpForest(BaseEstimator, metaclass=ABCMeta):

    def __init__(self, models_parameters, base_forest_estimator):
        self._base_forest_estimator = base_forest_estimator
        self._models_parameters = models_parameters
@@ -24,7 +25,6 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
        return self._base_forest_estimator.score(X, y)

    def _base_estimator_predictions(self, X):
-        # We need to use predict_proba to get the probabilities of each class
        return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T

    @property
@@ -33,6 +33,8 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
    # sklearn baseestimator api methods
    def fit(self, X_forest, y_forest, X_omp, y_omp):
+        # print(y_forest.shape)
+        # print(set([type(y) for y in y_forest]))
        self._base_forest_estimator.fit(X_forest, y_forest)
        self._extract_subforest(X_omp, y_omp) # type: OrthogonalMatchingPursuit
        return self
@@ -96,6 +98,7 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
        pass

class SingleOmpForest(OmpForest):

    def __init__(self, models_parameters, base_forest_estimator):
        # fit_intercept shouldn't be set to False as the data isn't necessarily centered here
        # normalization is handled outside OMP
@@ -123,3 +126,24 @@ class SingleOmpForest(OmpForest):
            forest_predictions /= self._forest_norms

        return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights)
+
+    def predict_no_weights(self, X):
+        """
+        Apply the SingleOmpForest to X without using the weights.
+        Make all the base tree predictions.
+
+        :param X: the input samples
+        :return: a np.array of the predictions of the entire forest
+        """
+        forest_predictions = self._base_estimator_predictions(X).T
+
+        if self._models_parameters.normalize_D:
+            forest_predictions /= self._forest_norms
+
+        weights = self._omp.coef_
+        omp_trees_indices = np.nonzero(weights)[0]
+
+        select_trees = np.mean(forest_predictions[omp_trees_indices], axis=0)
+        print(len(omp_trees_indices))
+        return select_trees
@@ -24,6 +24,34 @@ class OmpForestBinaryClassifier(SingleOmpForest):

        return super().fit(X_forest, y_forest, X_omp, y_omp)

+    def predict_no_weights(self, X):
+        """
+        Apply the SingleOmpForest to X without using the weights.
+        Make all the base tree predictions.
+
+        :param X: the input samples
+        :return: a np.array of the predictions of the entire forest
+        """
+        forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_])
+
+        if self._models_parameters.normalize_D:
+            forest_predictions /= self._forest_norms
+
+        weights = self._omp.coef_
+        omp_trees_indices = np.nonzero(weights)
+
+        omp_trees_predictions = forest_predictions[omp_trees_indices].T[1]
+
+        # Here forest_pred is the probability of being class 1.
+        result_omp = np.mean(omp_trees_predictions, axis=1)
+        result_omp = (result_omp - 0.5) * 2
+
+        return result_omp
+
    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
        """
        Evaluate OMPForestClassifier on (`X`, `y`) using `metric`
@@ -106,6 +134,36 @@ class OmpForestMulticlassClassifier(OmpForest):
        max_preds = np.argmax(preds, axis=1)
        return np.array(label_names)[max_preds]

+    def predict_no_weights(self, X):
+        """
+        Apply the OmpForest to X without using the weights.
+        Make all the base tree predictions.
+
+        :param X: the input samples
+        :return: a np.array of the predictions of the entire forest
+        """
+        forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T
+
+        if self._models_parameters.normalize_D:
+            forest_predictions /= self._forest_norms
+
+        label_names = []
+        preds = []
+        num_class = 0
+        for class_label, omp_class in self._dct_class_omp.items():
+            weights = omp_class.coef_
+            omp_trees_indices = np.nonzero(weights)
+            label_names.append(class_label)
+            atoms_binary = (forest_predictions[num_class].T - 0.5) * 2  # rescale the 0/1 probabilities to -1/1
+            preds.append(np.sum(atoms_binary[omp_trees_indices], axis=0)/len(omp_trees_indices))
+            num_class += 1
+
+        preds = np.array(preds).T
+        max_preds = np.argmax(preds, axis=1)
+        return np.array(label_names)[max_preds]
+
    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
        predictions = self.predict(X)
...
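As a quick worked example of the (result - 0.5) * 2 rescaling used in the predictions above: a selected tree whose class-1 probability is 0.8 contributes 0.6 and one whose probability is 0.3 contributes -0.4, so the averaged output lies in [-1, 1] and its sign gives the predicted class.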
@@ -3,6 +3,7 @@ from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from abc import abstractmethod, ABCMeta
import numpy as np
+from tqdm import tqdm

class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
@@ -10,56 +11,69 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/
    """

-    def __init__(self, models_parameters):
+    def __init__(self, models_parameters, score_metric=mean_squared_error):
        self._models_parameters = models_parameters
-        self._regressor = RandomForestRegressor(n_estimators=self._models_parameters.hyperparameters['n_estimators'],
-            random_state=models_parameters.seed)
+        self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
+            random_state=self._models_parameters.seed, n_jobs=-1)
        self._extracted_forest_size = self._models_parameters.extracted_forest_size
+        self._score_metric = score_metric

    @property
    def models_parameters(self):
        return self._models_parameters

-    def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error):
-        self._regressor.fit(X_train, y_train)
-        y_val_pred = self._regressor.predict(X_val)
-        forest_pred = score_metric(y_val, y_val_pred)
-        forest = self._regressor.estimators_
+    def fit(self, X_train, y_train, X_val, y_val):
+        self._estimator.fit(X_train, y_train)
+        y_val_pred = self._estimator.predict(X_val)
+        forest_pred = self._score_metric(y_val, y_val_pred)
+        forest = self._estimator.estimators_
        selected_trees = list()
-        tree_list = list(self._regressor.estimators_)
+        tree_list = list(self._estimator.estimators_)
+        val_scores = list()
+        with tqdm(tree_list) as tree_pred_bar:
+            tree_pred_bar.set_description('[Initial tree predictions]')
+            for tree in tree_pred_bar:
+                val_scores.append(tree.predict(X_val))
+                tree_pred_bar.update(1)

-        for _ in range(self._extracted_forest_size):
-            best_similarity = 100000
-            found_index = 0
-            for i in range(len(tree_list)):
-                lonely_tree = tree_list[i]
-                del tree_list[i]
-                val_list = list()
-                for tree in tree_list:
-                    val_pred = tree.predict(X_val)
-                    val_list.append(val_pred)
-                val_list = np.array(val_list)
-                val_mean = np.mean(val_list, axis=0)
-                val_score = score_metric(val_mean, y_val)
-                temp_similarity = abs(forest_pred - val_score)
-                if (temp_similarity < best_similarity):
-                    found_index = i
-                    best_similarity = temp_similarity
-                tree_list.insert(i, lonely_tree)
-            selected_trees.append(tree_list[found_index])
-            del tree_list[found_index]
+        with tqdm(range(self._extracted_forest_size), disable=True) as pruning_forest_bar:
+            pruning_forest_bar.set_description(f'[Pruning forest s={self._extracted_forest_size}]')
+            for i in pruning_forest_bar:
+                best_similarity = 100000
+                found_index = 0
+                with tqdm(range(len(tree_list)), disable=True) as tree_list_bar:
+                    tree_list_bar.set_description(f'[Tree selection s={self._extracted_forest_size} #{i}]')
+                    for j in tree_list_bar:
+                        lonely_tree = tree_list[j]
+                        del tree_list[j]
+                        val_mean = np.mean(np.asarray(val_scores), axis=0)
+                        val_score = self._score_metric(val_mean, y_val)
+                        temp_similarity = abs(forest_pred - val_score)
+                        if (temp_similarity < best_similarity):
+                            found_index = j
+                            best_similarity = temp_similarity
+                        tree_list.insert(j, lonely_tree)
+                        val_scores.insert(j, lonely_tree.predict(X_val))
+                        tree_list_bar.update(1)
+                selected_trees.append(tree_list[found_index])
+                del tree_list[found_index]
+                del val_scores[found_index]
+                pruning_forest_bar.update(1)

        pruned_forest = list(set(forest) - set(selected_trees))
-        self._regressor.estimators_ = pruned_forest
+        self._estimator.estimators_ = pruned_forest

    def score(self, X, y):
        test_list = list()
-        for mod in self._regressor.estimators_:
+        for mod in self._estimator.estimators_:
            test_pred = mod.predict(X)
            test_list.append(test_pred)
        test_list = np.array(test_list)
        test_mean = np.mean(test_list, axis=0)
-        score = mean_squared_error(test_mean, y)
+        score = self._score_metric(test_mean, y)
        return score
+
+    def predict_base_estimator(self, X):
+        return self._estimator.predict(X)
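For reference, the greedy criterion used in the selection loop above can be isolated as follows: at each step the tree whose temporary removal leaves the sub-forest's validation error closest to the full forest's error is the one picked. A rough sketch of that single step, with hypothetical names that are not part of the project's API:

import numpy as np
from sklearn.metrics import mean_squared_error

def most_redundant_tree(trees, X_val, y_val, full_forest_error):
    # Validation predictions of every remaining tree, one row per tree.
    preds = np.array([tree.predict(X_val) for tree in trees])
    gaps = []
    for i in range(len(trees)):
        # Error of the ensemble mean when tree i is left out.
        mean_without_i = np.mean(np.delete(preds, i, axis=0), axis=0)
        gaps.append(abs(full_forest_error - mean_squared_error(y_val, mean_without_i)))
    # The tree whose removal changes the validation error the least.
    return int(np.argmin(gaps))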
@@ -95,14 +95,21 @@ class Trainer(object):
        )
        self._end_time = time.time()

-    def __score_func(self, model, X, y_true):
+    def __score_func(self, model, X, y_true, weights=True):
        if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]:
+            if weights:
                y_pred = model.predict(X)
+            else:
+                y_pred = model.predict_no_weights(X)
            result = self._regression_score_metric(y_true, y_pred)
        elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]:
+            if weights:
                y_pred = model.predict(X)
+            else:
+                y_pred = model.predict_no_weights(X)
            if type(model) is OmpForestBinaryClassifier:
-                y_pred = y_pred.round()
+                y_pred = np.sign(y_pred)
+                y_pred = np.where(y_pred==0, 1, y_pred)
            result = self._classification_score_metric(y_true, y_pred)
        return result
@@ -126,8 +133,17 @@
        :param model: Object with
        :param models_dir: Where the results will be saved
        """
+        model_weights = ''
+        if type(model) in [OmpForestRegressor, OmpForestBinaryClassifier]:
+            model_weights = model._omp.coef_
+        elif type(model) == OmpForestMulticlassClassifier:
+            model_weights = model._dct_class_omp
+        elif type(model) == OmpForestBinaryClassifier:
+            model_weights = model._omp
+
        results = ModelRawResults(
-            model_object='',
+            model_weights=model_weights,
            training_time=self._end_time - self._begin_time,
            datetime=datetime.datetime.now(),
            train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train),
@@ -148,3 +164,27 @@
        self._logger.info("Base performance on dev: {}".format(results.dev_score_base))
        self._logger.info("Performance on dev: {}".format(results.dev_score))
+
+        if type(model) not in [RandomForestRegressor, RandomForestClassifier]:
+            results = ModelRawResults(
+                model_weights='',
+                training_time=self._end_time - self._begin_time,
+                datetime=datetime.datetime.now(),
+                train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train, False),
+                dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev, False),
+                test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test, False),
+                train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train),
+                dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
+                test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
+                score_metric=self._score_metric_name,
+                base_score_metric=self._base_score_metric_name
+            )
+            results.save(models_dir+'_no_weights')
+            self._logger.info("Base performance on test without weights: {}".format(results.test_score_base))
+            self._logger.info("Performance on test: {}".format(results.test_score))
+            self._logger.info("Base performance on train without weights: {}".format(results.train_score_base))
+            self._logger.info("Performance on train: {}".format(results.train_score))
+            self._logger.info("Base performance on dev without weights: {}".format(results.dev_score_base))
+            self._logger.info("Performance on dev: {}".format(results.dev_score))
@@ -5,6 +5,8 @@ from copy import deepcopy
import contextlib
import joblib

+from sklearn.datasets import fetch_openml
+
def resolve_experiment_id(models_dir):
    """
@@ -78,6 +80,16 @@ def change_binary_func_load(base_load_function):
        return X, y
    return func_load

+def change_binary_func_openml(dataset_name):
+    def func_load(return_X_y=True, random_state=None):
+        X, y = fetch_openml(dataset_name, return_X_y=return_X_y)
+        possible_classes = sorted(set(y))
+        assert len(possible_classes) == 2, "Function change_binary_func_load only works for binary classification"
+        y = binarize_class_data(y, possible_classes[-1])
+        y = y.astype('int')
+        return X, y
+    return func_load
+
@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Context manager to patch joblib to report into tqdm progress bar given as argument"""
...
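As a usage sketch (assuming the helper is imported from bolsonaro.utils and the OpenML dataset name exists), a loader built this way can then be called like the other dataset loading functions:

from bolsonaro.utils import change_binary_func_openml

load_spambase = change_binary_func_openml('spambase')
X, y = load_spambase(return_X_y=True)  # y is reduced to two integer classes via binarize_class_data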
@@ -109,16 +109,16 @@ class Plotter(object):
        fig, ax = plt.subplots()

-        n = len(all_experiment_scores)
+        nb_experiments = len(all_experiment_scores)

        """
        Get as many different colors from the specified cmap (here nipy_spectral)
        as there are curves to plot.
        """
-        colors = Plotter.get_colors_from_cmap(n)
+        colors = Plotter.get_colors_from_cmap(nb_experiments)

        # For each curve to plot
-        for i in range(n):
+        for i in range(nb_experiments):
            # Retrieve the scores in a list for each seed
            experiment_scores = list(all_experiment_scores[i].values())
            # Compute the mean and the std for the CI
...
@@ -17,7 +17,7 @@ def retreive_extracted_forest_sizes_number(models_dir, experiment_id):
    extracted_forest_sizes_root_path = experiment_seed_path + os.sep + 'extracted_forest_sizes'
    return len(os.listdir(extracted_forest_sizes_root_path))

-def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id):
+def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id, weights=True):
    experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
    experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
@@ -28,6 +28,7 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id):
    experiment_train_scores = dict()
    experiment_dev_scores = dict()
    experiment_test_scores = dict()
+    experiment_weights = dict()
    all_extracted_forest_sizes = list()

    # Used to check if all losses were computed using the same metric (it should be the case)
@@ -44,14 +45,19 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id):
        experiment_train_scores[seed] = list()
        experiment_dev_scores[seed] = list()
        experiment_test_scores[seed] = list()
+        experiment_weights[seed] = list()

        # List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
        extracted_forest_sizes = os.listdir(extracted_forest_sizes_root_path)
+        extracted_forest_sizes = [nb_tree for nb_tree in extracted_forest_sizes if not 'no_weights' in nb_tree ]
        extracted_forest_sizes.sort(key=int)
        all_extracted_forest_sizes.append(list(map(int, extracted_forest_sizes)))
        for extracted_forest_size in extracted_forest_sizes:
            # models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}
+            if weights:
                extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size
+            else:
+                extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size + '_no_weights'
            # Load models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}/model_raw_results.pickle file
            model_raw_results = ModelRawResults.load(extracted_forest_size_path)
            # Save the scores
@@ -60,6 +66,8 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id):
            experiment_test_scores[seed].append(model_raw_results.test_score)
            # Save the metric
            experiment_score_metrics.append(model_raw_results.score_metric)
+            # Save the weights
+            #experiment_weights[seed].append(model_raw_results.model_weights)

    # Sanity checks
    if len(set(experiment_score_metrics)) > 1:
@@ -67,7 +75,8 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id):
    if len(set([sum(extracted_forest_sizes) for extracted_forest_sizes in all_extracted_forest_sizes])) != 1:
        raise ValueError("The extracted forest sizes aren't the same across seeds.")
-    return experiment_train_scores, experiment_dev_scores, experiment_test_scores, all_extracted_forest_sizes[0], experiment_score_metrics[0]
+    return experiment_train_scores, experiment_dev_scores, experiment_test_scores, \
+        all_extracted_forest_sizes[0], experiment_score_metrics[0]#, experiment_weights
def extract_scores_across_seeds_and_forest_size(models_dir, results_dir, experiment_id, extracted_forest_sizes_number):
    experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
@@ -120,6 +129,7 @@ if __name__ == "__main__":
    DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results'
    DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
+    DEFAULT_PLOT_WEIGHT_DENSITY = False

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number among [1, 5].')
@@ -130,6 +140,7 @@ if __name__ == "__main__":
    parser.add_argument('--dataset_name', nargs='?', type=str, required=True, help='Specify the dataset name. TODO: read it from models dir directly.')
    parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
    parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
+    parser.add_argument('--plot_weight_density', action='store_true', default=DEFAULT_PLOT_WEIGHT_DENSITY, help='Plot the weight density. Only working for regressor models for now.')
    args = parser.parse_args()

    if args.stage not in list(range(1, 6)):
@@ -347,9 +358,17 @@ if __name__ == "__main__":
            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
        # omp_with_params
        logger.info('Loading omp_with_params experiment scores...')
+        """omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
+            omp_with_params_experiment_score_metric, experiment_weights = extract_scores_across_seeds_and_extracted_forest_sizes(
+                args.models_dir, args.results_dir, args.experiment_ids[2])"""
        omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
            omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
                args.models_dir, args.results_dir, args.experiment_ids[2])
+        # omp_with_params_without_weights
+        logger.info('Loading omp_with_params experiment scores...')
+        omp_with_params_without_weights_train_scores, omp_with_params_without_weights_dev_scores, omp_with_params_without_weights_test_scores, _, \
+            omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
+                args.models_dir, args.results_dir, args.experiment_ids[2], weights=False)
        """# base_with_params
        logger.info('Loading base_with_params experiment scores 2...')
@@ -369,13 +388,58 @@ if __name__ == "__main__":
            raise ValueError('Score metrics of all experiments must be the same.')
        experiments_score_metric = base_with_params_experiment_score_metric

-        output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4')
+        output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4_fix')
        pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

        Plotter.plot_stage2_losses(
            file_path=output_path + os.sep + 'losses.png',
-            all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores],
-            all_labels=['base', 'random', 'omp'],
+            all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
+                omp_with_params_without_weights_test_scores],
+            all_labels=['base', 'random', 'omp', 'omp_without_weights'],
+            x_value=with_params_extracted_forest_sizes,
+            xlabel='Number of trees extracted',
+            ylabel=experiments_score_metric,
+            title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
+    elif args.stage == 5:
+        # Retrieve the extracted forest sizes number used in order to have a base forest axis as long as necessary
+        extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1])
+        # base_with_params
+        logger.info('Loading base_with_params experiment scores...')
+        base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
+            base_with_params_experiment_score_metric = \
+            extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
+            extracted_forest_sizes_number)
+        # random_with_params
+        logger.info('Loading random_with_params experiment scores...')
+        random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
+            with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
+            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
+        # omp_with_params
+        logger.info('Loading omp_with_params experiment scores...')
+        omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
+            omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
+                args.models_dir, args.results_dir, args.experiment_ids[2])
+        # kmeans_with_params
+        logger.info('Loading kmeans_with_params experiment scores...')
+        kmeans_with_params_train_scores, kmeans_with_params_dev_scores, kmeans_with_params_test_scores, _, \
+            kmeans_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
+                args.models_dir, args.results_dir, args.experiment_ids[3])
+        # Sanity check on the metrics retrieved
+        if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric
+            == omp_with_params_experiment_score_metric == kmeans_with_params_experiment_score_metric):
+            raise ValueError('Score metrics of all experiments must be the same.')
+        experiments_score_metric = base_with_params_experiment_score_metric
+
+        output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5_kmeans')
+        pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
+
+        Plotter.plot_stage2_losses(
+            file_path=output_path + os.sep + 'losses.png',
+            all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
+                kmeans_with_params_test_scores],
+            all_labels=['base', 'random', 'omp', 'kmeans'],
            x_value=with_params_extracted_forest_sizes,
            xlabel='Number of trees extracted',
            ylabel=experiments_score_metric,
@@ -384,16 +448,3 @@ if __name__ == "__main__":
        raise ValueError('This stage number is not supported yet, but it will be!')

    logger.info('Done.')
"""
TODO:
For each dataset:
Stage 1) [DONE for california_housing] A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams)
Stage 2) [DONE for california_housing] A figure for the selection of the best combination of normalization: D normalization vs weights normalization (4 combinations)
Stage 3) [DONE for california_housing] A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev
Stage 4) A figure to finally compare the performance of our approach using the previously selected
parameters vs the baseline vs other papers using different extracted forest size
(percentage of the tree size found previously in best hyperparams search) on the abscissa.
IMPORTANT: Compare experiments that used the same seeds among them (except for stage 1).
"""
@@ -21,7 +21,7 @@ import numpy as np
import shutil

-def process_job(seed, parameters, experiment_id, hyperparameters):
+def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verbose):
    """
    Experiment function.
@@ -34,7 +34,6 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
    """
    logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
        seed, threading.get_ident()))
-    logger.info('seed={}'.format(seed))

    seed_str = str(seed)
    experiment_id_str = str(experiment_id)
@@ -55,13 +54,31 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
    trainer = Trainer(dataset)

    if parameters['extraction_strategy'] != 'none':
-        for extracted_forest_size in parameters['extracted_forest_size']:
-            logger.info('extracted_forest_size={}'.format(extracted_forest_size))
-            sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + os.sep + str(extracted_forest_size)
-            pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
+        with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb:
+            Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i],
+                models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer)
+                for i in range(len(parameters['extracted_forest_size'])))
+    else:
+        forest_size = hyperparameters['n_estimators']
+        logger.info('Base forest training with fixed forest size of {}'.format(forest_size))
+        sub_models_dir = models_dir + os.sep + 'forest_size' + os.sep + str(forest_size)
+        # Check if the result file already exists
+        already_exists = False
+        if os.path.isdir(sub_models_dir):
+            sub_models_dir_files = os.listdir(sub_models_dir)
+            for file_name in sub_models_dir_files:
+                if '.pickle' != os.path.splitext(file_name)[1]:
+                    continue
+                else:
+                    already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
+                    break
+        if already_exists:
+            logger.info('Base forest result already exists. Skipping...')
+        else:
+            pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
            model_parameters = ModelParameters(
-                extracted_forest_size=extracted_forest_size,
+                extracted_forest_size=forest_size,
                normalize_D=parameters['normalize_D'],
                subsets_used=parameters['subsets_used'],
                normalize_weights=parameters['normalize_weights'],
@@ -76,14 +93,36 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
            trainer.init(model, subsets_used=parameters['subsets_used'])
            trainer.train(model)
            trainer.compute_results(model, sub_models_dir)
-    else:
-        forest_size = hyperparameters['n_estimators']
-        logger.info('Base forest training with fixed forest size of {}'.format(forest_size))
-        sub_models_dir = models_dir + os.sep + 'forest_size' + os.sep + str(forest_size)
+    logger.info(f'Training done for seed {seed_str}')
+    seed_job_pb.update(1)
+
+def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir,
+    seed, parameters, dataset, hyperparameters, experiment_id, trainer):
+
+    logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_extracted_forest_size{}_ti{}'.format(
+        seed, extracted_forest_size, threading.get_ident()))
+    logger.info('extracted_forest_size={}'.format(extracted_forest_size))
+
+    sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + os.sep + str(extracted_forest_size)
+
+    # Check if the result file already exists
+    already_exists = False
+    if os.path.isdir(sub_models_dir):
+        sub_models_dir_files = os.listdir(sub_models_dir)
+        for file_name in sub_models_dir_files:
+            if '.pickle' != os.path.splitext(file_name)[1]:
+                return
+            else:
+                already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
+                break
+    if already_exists:
+        logger.info(f'Extracted forest {extracted_forest_size} result already exists. Skipping...')
+        return
+
    pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
    model_parameters = ModelParameters(
-        extracted_forest_size=forest_size,
+        extracted_forest_size=extracted_forest_size,
        normalize_D=parameters['normalize_D'],
        subsets_used=parameters['subsets_used'],
        normalize_weights=parameters['normalize_weights'],
@@ -98,7 +137,6 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
    trainer.init(model, subsets_used=parameters['subsets_used'])
    trainer.train(model)
    trainer.compute_results(model, sub_models_dir)
-    logger.info('Training done')

"""
Command lines example for stage 1:
@@ -138,6 +176,7 @@ if __name__ == "__main__":
    DEFAULT_SKIP_BEST_HYPERPARAMS = False
    DEFAULT_JOB_NUMBER = -1
    DEFAULT_EXTRACTION_STRATEGY = 'omp'
+    DEFAULT_OVERWRITE = False

    begin_random_seed_range = 1
    end_random_seed_range = 2000
@@ -163,7 +202,8 @@ if __name__ == "__main__":
    parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
    parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
    parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
-    parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none or similarity.')
+    parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity, kmeans.')
+    parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the experiment id')
    args = parser.parse_args()

    if args.experiment_configuration:
@@ -173,7 +213,7 @@ if __name__ == "__main__":
    else:
        parameters = args.__dict__

-    if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity']:
+    if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity', 'kmeans']:
        raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy))

    pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
@@ -208,7 +248,7 @@ if __name__ == "__main__":
        parameters['extracted_forest_size'] = np.unique(np.around(hyperparameters['n_estimators'] *
            np.linspace(0, args.extracted_forest_size_stop,
            parameters['extracted_forest_size_samples'] + 1,
-            endpoint=False)[1:]).astype(np.int)).tolist()
+            endpoint=True)[1:]).astype(np.int)).tolist()

    if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
        logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
@@ -220,6 +260,7 @@ if __name__ == "__main__":
    if args.experiment_id:
        experiment_id = args.experiment_id
+        if args.overwrite:
            shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True)
    else:
        # Resolve the next experiment id number (last id + 1)
@@ -255,6 +296,6 @@ if __name__ == "__main__":
    )

    # Run as many jobs as there are seeds
-    with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as progress_bar:
-        Parallel(n_jobs=args.job_number)(delayed(process_job)(seeds[i],
-            parameters, experiment_id, hyperparameters) for i in range(len(seeds)))
+    with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as seed_job_pb:
+        Parallel(n_jobs=args.job_number)(delayed(seed_job)(seed_job_pb, seeds[i],
+            parameters, experiment_id, hyperparameters, args.verbose) for i in range(len(seeds)))
{
"experiment_id": 1,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": false,
"save_experiment_configuration": [
"1",
"none_with_params"
],
"job_number": -1,
"extraction_strategy": "none",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file
{
"experiment_id": 4,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": true,
"save_experiment_configuration": [
"1",
"none_wo_params"
],
"job_number": -1,
"extraction_strategy": "none",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file
{
"experiment_id": 6,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": true,
"save_experiment_configuration": [
"1",
"omp_wo_params"
],
"job_number": -1,
"extraction_strategy": "omp",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file
{
"scorer": "accuracy",
"best_score_train": 0.7953125,
"best_score_test": 0.7909854175872735,
"best_parameters": {
"max_depth": 20,
"max_features": "sqrt",
"min_samples_leaf": 1,
"n_estimators": 809
},
"random_seed": 1763
}
\ No newline at end of file
{
"experiment_id": 2,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": false,
"save_experiment_configuration": [
"1",
"random_with_params"
],
"job_number": -1,
"extraction_strategy": "random",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file
{
"experiment_id": 5,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": true,
"save_experiment_configuration": [
"1",
"random_wo_params"
],
"job_number": -1,
"extraction_strategy": "random",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file
{
"experiment_id": 1,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage2",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": false,
"save_experiment_configuration": [
"2",
"no_normalization"
],
"job_number": -1,
"extraction_strategy": "omp",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file