Commit 1db36b5d authored by Charly Lamothe

Merge branch '17-adding-new-datasets' into 'master'

Resolve "Adding new datasets"

Closes #17

See merge request !15
parents 3f1e869d b0e1c83e
from bolsonaro.data.dataset import Dataset
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.task import Task
from bolsonaro.utils import change_binary_func_load
from bolsonaro.utils import change_binary_func_load, change_binary_func_openml
from sklearn.datasets import load_boston, load_iris, load_diabetes, \
load_digits, load_linnerud, load_wine, load_breast_cancer
from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \
fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing, \
fetch_openml
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import random
@@ -30,13 +31,15 @@ class DatasetLoader(object):
dataset_names = ['boston', 'iris', 'diabetes', 'digits', 'linnerud', 'wine',
'breast_cancer', 'olivetti_faces', '20newsgroups_vectorized', 'lfw_people',
'lfw_pairs', 'covtype', 'rcv1', 'california_housing', 'diamonds']
'lfw_pairs', 'covtype', 'rcv1', 'california_housing', 'diamonds', 'steel-plates',
'kr-vs-kp', 'kin8nm', 'spambase', 'musk', 'gamma']
dataset_seed_numbers = {'boston':15, 'iris':15, 'diabetes':15, 'digits':5,
'linnerud':15, 'wine':15, 'breast_cancer':15, 'olivetti_faces':15,
'20newsgroups_vectorized':3, 'lfw_people':3,
'lfw_pairs':3, 'covtype':3, 'rcv1':3, 'california_housing':3,
'diamonds': 15}
'diamonds': 15, 'steel-plates': 15, 'kr-vs-kp': 15, 'kin8nm': 15,
'spambase': 15, 'musk': 15, 'gamma': 15}
@staticmethod
def load(dataset_parameters):
@@ -103,6 +106,24 @@ class DatasetLoader(object):
df['clarity'] = label_clarity.fit_transform(df['clarity'])
X, y = df.drop(['price'], axis=1), df['price']
task = Task.REGRESSION
elif name == 'steel-plates':
dataset_loading_func = change_binary_func_openml('steel-plates-fault')
task = Task.BINARYCLASSIFICATION
elif name == 'kr-vs-kp':
dataset_loading_func = change_binary_func_openml('kr-vs-kp')
task = Task.BINARYCLASSIFICATION
elif name == 'kin8nm':
X, y = fetch_openml('kin8nm', return_X_y=True)
task = Task.REGRESSION
elif name == 'spambase':
dataset_loading_func = change_binary_func_openml('spambase')
task = Task.BINARYCLASSIFICATION
elif name == 'musk':
dataset_loading_func = change_binary_func_openml('musk')
task = Task.BINARYCLASSIFICATION
elif name == 'gamma':
dataset_loading_func = change_binary_func_openml('MagicTelescope')
task = Task.BINARYCLASSIFICATION
else:
raise ValueError("Unsupported dataset '{}'".format(name))
@@ -33,6 +33,8 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
# sklearn baseestimator api methods
def fit(self, X_forest, y_forest, X_omp, y_omp):
# print(y_forest.shape)
# print(set([type(y) for y in y_forest]))
self._base_forest_estimator.fit(X_forest, y_forest)
self._extract_subforest(X_omp, y_omp) # type: OrthogonalMatchingPursuit
return self
@@ -140,8 +142,8 @@ class SingleOmpForest(OmpForest):
forest_predictions /= self._forest_norms
weights = self._omp.coef_
omp_trees_indices = np.nonzero(weights)
omp_trees_indices = np.nonzero(weights)[0]
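# np.nonzero returns a tuple of index arrays; taking [0] yields the indices of the trees selected by OMP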
select_trees = np.mean(forest_predictions[omp_trees_indices], axis=0)
print(len(omp_trees_indices))
return select_trees
@@ -24,6 +24,34 @@ class OmpForestBinaryClassifier(SingleOmpForest):
return super().fit(X_forest, y_forest, X_omp, y_omp)
def predict_no_weights(self, X):
"""
Apply the forest to X without using the OMP weights.
Make all the base tree predictions, then average the class-1 probabilities
of the trees selected by OMP (non-zero coefficients) and rescale them from [0, 1] to [-1, 1].
:param X: array of input samples
:return: a np.array with one score per sample
"""
forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_])
if self._models_parameters.normalize_D:
forest_predictions /= self._forest_norms
weights = self._omp.coef_
omp_trees_indices = np.nonzero(weights)
omp_trees_predictions = forest_predictions[omp_trees_indices].T[1]
# Here forest_pred is the probability of being class 1.
result_omp = np.mean(omp_trees_predictions, axis=1)
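# Rescale the mean class-1 probability from [0, 1] to [-1, 1]; the sign of the result carries the predicted class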
result_omp = (result_omp - 0.5) * 2
return result_omp
def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
"""
Evaluate the OMPForestClassifier on (`X`, `y`) using `metric`
@@ -129,7 +157,7 @@ class OmpForestMulticlassClassifier(OmpForest):
omp_trees_indices = np.nonzero(weights)
label_names.append(class_label)
atoms_binary = (forest_predictions[num_class].T - 0.5) * 2 # centered and rescaled from 0/1 to -1/1
preds.append(np.sum(atoms_binary[omp_trees_indices], axis=0))
preds.append(np.sum(atoms_binary[omp_trees_indices], axis=0)/len(omp_trees_indices[0]))
num_class += 1
preds = np.array(preds).T
@@ -108,7 +108,8 @@ class Trainer(object):
else:
y_pred = model.predict_no_weights(X)
if type(model) is OmpForestBinaryClassifier:
y_pred = y_pred.round()
y_pred = np.sign(y_pred)
y_pred = np.where(y_pred==0, 1, y_pred)
result = self._classification_score_metric(y_true, y_pred)
return result
@@ -187,5 +188,3 @@ class Trainer(object):
self._logger.info("Base performance on dev without weights: {}".format(results.dev_score_base))
self._logger.info("Performance on dev: {}".format(results.dev_score))
@@ -5,6 +5,8 @@ from copy import deepcopy
import contextlib
import joblib
from sklearn.datasets import fetch_openml
def resolve_experiment_id(models_dir):
"""
@@ -78,6 +80,16 @@ def change_binary_func_load(base_load_function):
return X, y
return func_load
def change_binary_func_openml(dataset_name):
def func_load(return_X_y=True, random_state=None):
X, y = fetch_openml(dataset_name, return_X_y=return_X_y)
possible_classes = sorted(set(y))
assert len(possible_classes) == 2, "change_binary_func_openml only works for binary classification"
y = binarize_class_data(y, possible_classes[-1])
y = y.astype('int')
return X, y
return func_load
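# Illustrative usage sketch (not part of this commit): the returned callable mimics
# the sklearn-style load_*/fetch_* loaders consumed by DatasetLoader.load, e.g.
#     load_spambase = change_binary_func_openml('spambase')
#     X, y = load_spambase(return_X_y=True)  # y binarized to integer labels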
@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
"""Context manager to patch joblib to report into tqdm progress bar given as argument"""
# Implementation of the paper 'Ensemble selection from libraries of models' by Rich Caruana et al.
# A library of trees is trained; the ensemble is then built greedily, at each step adding the tree that most improves the R2 score on the validation set.
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.externals import joblib
import numpy as np
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
(data, target) = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=10000, random_state=2019)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=3000, random_state=2019)
criterion_arr = ["mse"]#, "friedman_mse", "mae"]
splitter_arr = ["best"]#, "random"]
depth_arr = [i for i in range(5, 20, 1)]
min_samples_split_arr = [i for i in range(2, 20, 1)]
min_samples_leaf_arr = [i for i in range(2, 20, 1)]
max_features_arr = ["sqrt"]#["auto", "sqrt", "log2"]
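# Build a library of regression trees over a small hyperparameter grid, all fitted on the training set.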
library = list()
for criterion in criterion_arr:
for splitter in splitter_arr:
for depth in depth_arr:
for min_samples_split in min_samples_split_arr:
for min_samples_leaf in min_samples_leaf_arr:
for max_features in max_features_arr:
t = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=depth, min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf, max_features=max_features, random_state=2017)
t.fit(X_train, y_train)
#filename= "t_{}_{}_{}_{}_{}_{}.sav".format(criterion, splitter, depth, min_sample_split, min_sample_leaf, max_features)
library.append(t)
print("classifiers", len(library))
scores_list = list()
for classif in library:
r2 = classif.score(X_val, y_val)
scores_list.append(r2)
print("scores", len(scores_list))
#print(scores_list)
##########################
np_scores_list = np.array(scores_list)
#sort_ind = np.argsort(np_scores_list)[::-1]
#sorted_scores = [scores_list[i] for i in sort_ind]
#sorted_class = [class_list[i] for i in sort_ind]
#print(sorted_class)
#print(sorted_scores)
#res = list()
#for s in [10, 20, 30]:
# best_class = sorted_class[:s]
# temp_res = list()
# for r in best_class:
# r2 = r.score(X_test, y_test)
# temp_res.append(r2)
# res.append(np.mean(temp_res))
#print("scores on test set", res)
###########################
#for k in range(num_sel_tree-1):
# cand_index = 0
# best_mean = 0
# #del scores_sel[-1]
# for j in range(len(scores_list)):
# scores_sel.append(scores_list[j])
# temp_scores_sel = np.array(scores_sel)
# temp_mean = np.mean(temp_scores_sel)
# if (temp_mean > best_mean):
# best_mean = temp_mean
# cand_index = j
# del scores_sel[-1]
# ens_sel.append(class_list[cand_index])
# scores_sel.append(scores_list[cand_index])
# del scores_list[cand_index]
# del class_list[cand_index]
#print("selected models",ens_sel)
#print("selected_scores", scores_sel)
trees_in_forest = list()
perf_prun_forest = list()
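# Greedy forward selection (Caruana et al.): for each target size, start from the tree with the best
# validation R2, then repeatedly add the tree that most improves the R2 of the averaged prediction
# on the validation set; the resulting ensemble is evaluated on the test set.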
for num_sel_tree in [2, 4, 6, 8, 10, 15, 20, 30, 40, 50]:
class_list = list(library)
print("class list", len(class_list))
m = np.argmax(np_scores_list)
ens_sel = [class_list[m]]
#scores_sel = [scores_list[m]]
#del scores_list[m]
temp_pred = class_list[m].predict(X_val)
del class_list[m]
#print("prima di entrare nel for", len(class_list))
for k in range(num_sel_tree-1):
cand_index = 0
r2_best = -10000
#print("ad ogni loop", len(class_list))
for j in range(len(class_list)):
temp_pred = np.vstack((temp_pred, class_list[j].predict(X_val)))
temp_mean = np.mean(temp_pred, axis=0)
#print("temp pred and temp mean shapes", temp_pred.shape, temp_mean.shape)
r2_temp = r2_score(y_val, temp_mean)
if (r2_temp > r2_best):
r2_best = r2_temp
cand_index = j
temp_pred = np.delete(temp_pred, -1, 0)
#print(temp_pred.shape)
ens_sel.append(class_list[cand_index])
#scores_sel.append(scores_list[cand_index])
temp_pred = np.vstack((temp_pred, class_list[cand_index].predict(X_val)))
#del scores_list[cand_index]
del class_list[cand_index]
#print("ens_sel", len(ens_sel))
test_list = list()
for mod in ens_sel:
test_pred = mod.predict(X_test)
test_list.append(test_pred)
#print("scores sep", mod.score(X_test, y_test))
test_list = np.array(test_list)
#print("test list shape", test_list.shape)
test_mean = np.mean(test_list, axis=0)
#print("test list shape", test_mean.shape)
r2_test = r2_score(y_test, test_mean)
#print(r2_test)
#print(ens_sel[0].score(X_test, y_test), ens_sel[1].score(X_test, y_test))
print(num_sel_tree, r2_test)
trees_in_forest.append(num_sel_tree)
perf_prun_forest.append(r2_test)
print(trees_in_forest)
print(perf_prun_forest)
ax = plt.gca()
ax.plot(trees_in_forest, perf_prun_forest, label='ensemble selection')
ax.legend()
#plt.title('fashion mnist')
plt.xlabel('num trees')
plt.ylabel('r2 score')
plt.savefig("ensemble_selection.pdf")
plt.show()
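# Variant: prune a trained random forest by removing trees one at a time (selection criterion described before the pruning loop below).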
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.externals import joblib
import numpy as np
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
(data, target) = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=10000, random_state=2019)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=3000, random_state=2019)
num_trees = 100
prun_for_size=[2, 4, 6, 8, 10, 12, 15, 20]
randfor = RandomForestRegressor(num_trees, max_depth=7, random_state=2019)
randfor.fit(X_train, y_train)
randfor_pred = randfor.score(X_val, y_val)
trees_forest = randfor.estimators_
trees_in_forest = list()
perf_prun_forest = list()
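# For each target size: repeatedly move into ens_sel the tree whose removal changes the remaining
# trees' validation R2 the least (closest to the full forest's validation score); the trees left
# over (prun_for) form the pruned forest evaluated on the test set.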
for k in range(len(prun_for_size)):
ens_sel = list()
trees_list = list(randfor.estimators_)
#print("dovrebbe essere la taglia iniziale", len(trees_list))
for j in range(num_trees - prun_for_size[k]):
best_simil = 100000
cand_ind = 0
for i in range(len(trees_list)):
lonely_tree = trees_list[i]
del trees_list[i]
val_list = list()
#print("quando poto", len(trees_list))
for tree in trees_list:
val_pred = tree.predict(X_val)
val_list.append(val_pred)
val_list = np.array(val_list)
val_mean = np.mean(val_list, axis=0)
r2_val = r2_score(y_val, val_mean)
temp_simil = abs(randfor_pred-r2_val)
if (temp_simil < best_simil):
cand_ind = i
best_simil = temp_simil
trees_list.insert(i, lonely_tree)
#print("quando innesto", len(trees_list))
ens_sel.append(trees_list[cand_ind])
del trees_list[cand_ind]
prun_for = list(set(trees_forest) - set(ens_sel))
print("prun_for", len(prun_for))
print("trees forest", len(trees_forest))
print("ens_sel", len(ens_sel))
test_list = list()
for mod in prun_for:
test_pred = mod.predict(X_test)
test_list.append(test_pred)
#print("scores sep", mod.score(X_test, y_test))
test_list = np.array(test_list)
#print("test list shape", test_list.shape)
test_mean = np.mean(test_list, axis=0)
#print("test list shape", test_mean.shape)
r2_test = r2_score(y_test, test_mean)
#print(r2_test)
#print(ens_sel[0].score(X_test, y_test), ens_sel[1].score(X_test, y_test))
print(len(prun_for), r2_test)
trees_in_forest.append(len(prun_for))
perf_prun_forest.append(r2_test)
print(trees_in_forest)
print(r2_test)
ax = plt.gca()
ax.plot(trees_in_forest, perf_prun_forest, label='pruned forest')
ax.legend()
#plt.title('fashion mnist')
plt.xlabel('num trees')
plt.ylabel('r2 score')
plt.savefig("pruned_forest.pdf")
plt.show()
@@ -248,7 +248,7 @@ if __name__ == "__main__":
parameters['extracted_forest_size'] = np.unique(np.around(hyperparameters['n_estimators'] *
np.linspace(0, args.extracted_forest_size_stop,
parameters['extracted_forest_size_samples'] + 1,
endpoint=False)[1:]).astype(np.int)).tolist()
endpoint=True)[1:]).astype(np.int)).tolist()
if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
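# Illustrative sketch (not in the diff): effect of the endpoint change on the extracted sizes,
# assuming n_estimators=809 (the value found in the stage-1 configurations below),
# extracted_forest_size_stop=0.05 and extracted_forest_size_samples=5.
import numpy as np
old_sizes = np.unique(np.around(809 * np.linspace(0, 0.05, 6, endpoint=False)[1:]).astype(int)).tolist()
new_sizes = np.unique(np.around(809 * np.linspace(0, 0.05, 6, endpoint=True)[1:]).astype(int)).tolist()
print(old_sizes)  # [7, 13, 20, 27, 34]
print(new_sizes)  # [8, 16, 24, 32, 40]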
{
"experiment_id": 1,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": false,
"save_experiment_configuration": [
"1",
"none_with_params"
],
"job_number": -1,
"extraction_strategy": "none",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file
{
"experiment_id": 4,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": true,
"save_experiment_configuration": [
"1",
"none_wo_params"
],
"job_number": -1,
"extraction_strategy": "none",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file
{
"experiment_id": 6,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": true,
"save_experiment_configuration": [
"1",
"omp_wo_params"
],
"job_number": -1,
"extraction_strategy": "omp",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file
{
"scorer": "accuracy",
"best_score_train": 0.7953125,
"best_score_test": 0.7909854175872735,
"best_parameters": {
"max_depth": 20,
"max_features": "sqrt",
"min_samples_leaf": 1,
"n_estimators": 809
},
"random_seed": 1763
}
\ No newline at end of file
{
"experiment_id": 2,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": false,
"save_experiment_configuration": [
"1",
"random_with_params"
],
"job_number": -1,