Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Luc Giffon
bolsonaro
Commits
1db36b5d
Commit
1db36b5d
authored
Mar 06, 2020
by
Charly Lamothe
Browse files
Merge branch '17-adding-new-datasets' into 'master'
Resolve "Adding new datasets" Closes
#17
See merge request
!15
parents
3f1e869d
b0e1c83e
Changes
206
Hide whitespace changes
Inline
Side-by-side
code/bolsonaro/data/dataset_loader.py
View file @
1db36b5d
from
bolsonaro.data.dataset
import
Dataset
from
bolsonaro.data.dataset_parameters
import
DatasetParameters
from
bolsonaro.data.task
import
Task
from
bolsonaro.utils
import
change_binary_func_load
from
bolsonaro.utils
import
change_binary_func_load
,
change_binary_func_openml
from
sklearn.datasets
import
load_boston
,
load_iris
,
load_diabetes
,
\
load_digits
,
load_linnerud
,
load_wine
,
load_breast_cancer
from
sklearn.datasets
import
fetch_olivetti_faces
,
fetch_20newsgroups
,
\
fetch_20newsgroups_vectorized
,
fetch_lfw_people
,
fetch_lfw_pairs
,
\
fetch_covtype
,
fetch_rcv1
,
fetch_kddcup99
,
fetch_california_housing
fetch_covtype
,
fetch_rcv1
,
fetch_kddcup99
,
fetch_california_housing
,
\
fetch_openml
from
sklearn.model_selection
import
train_test_split
from
sklearn
import
preprocessing
import
random
...
...
@@ -30,13 +31,15 @@ class DatasetLoader(object):
dataset_names
=
[
'boston'
,
'iris'
,
'diabetes'
,
'digits'
,
'linnerud'
,
'wine'
,
'breast_cancer'
,
'olivetti_faces'
,
'20newsgroups_vectorized'
,
'lfw_people'
,
'lfw_pairs'
,
'covtype'
,
'rcv1'
,
'california_housing'
,
'diamonds'
]
'lfw_pairs'
,
'covtype'
,
'rcv1'
,
'california_housing'
,
'diamonds'
,
'steel-plates'
,
'kr-vs-kp'
,
'kin8nm'
,
'spambase'
,
'musk'
,
'gamma'
]
dataset_seed_numbers
=
{
'boston'
:
15
,
'iris'
:
15
,
'diabetes'
:
15
,
'digits'
:
5
,
'linnerud'
:
15
,
'wine'
:
15
,
'breast_cancer'
:
15
,
'olivetti_faces'
:
15
,
'20newsgroups_vectorized'
:
3
,
'lfw_people'
:
3
,
'lfw_pairs'
:
3
,
'covtype'
:
3
,
'rcv1'
:
3
,
'california_housing'
:
3
,
'diamonds'
:
15
}
'diamonds'
:
15
,
'steel-plates'
:
15
,
'kr-vs-kp'
:
15
,
'kin8nm'
:
15
,
'spambase'
:
15
,
'musk'
:
15
,
'gamma'
:
15
}
@
staticmethod
def
load
(
dataset_parameters
):
...
...
@@ -103,6 +106,24 @@ class DatasetLoader(object):
df
[
'clarity'
]
=
label_clarity
.
fit_transform
(
df
[
'clarity'
])
X
,
y
=
df
.
drop
([
'price'
],
axis
=
1
),
df
[
'price'
]
task
=
Task
.
REGRESSION
elif
name
==
'steel-plates'
:
dataset_loading_func
=
change_binary_func_openml
(
'steel-plates-fault'
)
task
=
Task
.
BINARYCLASSIFICATION
elif
name
==
'kr-vs-kp'
:
dataset_loading_func
=
change_binary_func_openml
(
'kr-vs-kp'
)
task
=
Task
.
BINARYCLASSIFICATION
elif
name
==
'kin8nm'
:
X
,
y
=
fetch_openml
(
'kin8nm'
,
return_X_y
=
True
)
task
=
Task
.
REGRESSION
elif
name
==
'spambase'
:
dataset_loading_func
=
change_binary_func_openml
(
'spambase'
)
task
=
Task
.
BINARYCLASSIFICATION
elif
name
==
'musk'
:
dataset_loading_func
=
change_binary_func_openml
(
'musk'
)
task
=
Task
.
BINARYCLASSIFICATION
elif
name
==
'gamma'
:
dataset_loading_func
=
change_binary_func_openml
(
'MagicTelescope'
)
task
=
Task
.
BINARYCLASSIFICATION
else
:
raise
ValueError
(
"Unsupported dataset '{}'"
.
format
(
name
))
...
...
code/bolsonaro/models/model_raw_results.py
View file @
1db36b5d
...
...
@@ -22,7 +22,7 @@ class ModelRawResults(object):
self
.
_test_score_base
=
test_score_base
self
.
_score_metric
=
score_metric
self
.
_base_score_metric
=
base_score_metric
@property
def model_weights(self):
    """Return the model weights stored on this results object.

    :return: the value held in the private ``_model_weights`` attribute
    """
    # The getter previously returned ``self.model_weights`` -- the property
    # itself -- which re-entered this getter until RecursionError was raised.
    # NOTE(review): assumes the constructor stores the value as
    # ``_model_weights``, following the ``_score_metric`` /
    # ``_base_score_metric`` pattern used for the other fields -- confirm.
    return self._model_weights
...
...
code/bolsonaro/models/omp_forest.py
View file @
1db36b5d
...
...
@@ -33,6 +33,8 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
# sklearn baseestimator api methods
def fit(self, X_forest, y_forest, X_omp, y_omp):
    """
    Fit the base forest, then extract the sub-forest from it.

    :param X_forest: features passed to ``_base_forest_estimator.fit``
    :param y_forest: targets passed to ``_base_forest_estimator.fit``
    :param X_omp: features passed to ``_extract_subforest``
    :param y_omp: targets passed to ``_extract_subforest``
    :return: self, so that calls can be chained
    """
    self._base_forest_estimator.fit(X_forest, y_forest)
    # NOTE(review): the original line carried a trailing
    # "type: OrthogonalMatchingPursuit" comment; presumably the extraction
    # step relies on an OMP solver -- confirm against _extract_subforest.
    self._extract_subforest(X_omp, y_omp)
    return self
...
...
@@ -140,8 +142,8 @@ class SingleOmpForest(OmpForest):
forest_predictions
/=
self
.
_forest_norms
weights
=
self
.
_omp
.
coef_
omp_trees_indices
=
np
.
nonzero
(
weights
)
omp_trees_indices
=
np
.
nonzero
(
weights
)
[
0
]
select_trees
=
np
.
mean
(
forest_predictions
[
omp_trees_indices
],
axis
=
0
)
print
(
len
(
omp_trees_indices
))
return
select_trees
code/bolsonaro/models/omp_forest_classifier.py
View file @
1db36b5d
...
...
@@ -24,6 +24,34 @@ class OmpForestBinaryClassifier(SingleOmpForest):
return
super
().
fit
(
X_forest
,
y_forest
,
X_omp
,
y_omp
)
def predict_no_weights(self, X):
    """
    Predict with the OMP-selected trees while ignoring the weight values.

    Every tree of the base forest produces class probabilities for X. Only
    the trees whose OMP coefficient is non-zero are kept; their class-1
    probabilities are averaged per sample and rescaled with (p - 0.5) * 2.

    :param X: samples handed to each tree's ``predict_proba``
    :return: a np.array holding one rescaled score per sample
    """
    per_tree_proba = np.array([
        estimator.predict_proba(X)
        for estimator in self._base_forest_estimator.estimators_
    ])

    if self._models_parameters.normalize_D:
        per_tree_proba /= self._forest_norms

    # Only the position of the non-zero coefficients matters here, not
    # their magnitude.
    selected = np.nonzero(self._omp.coef_)

    # Transposing brings the class axis to the front, so index 1 picks the
    # class-1 probabilities (assumes predict_proba yields two columns).
    class_one_proba = per_tree_proba[selected].T[1]

    return (np.mean(class_one_proba, axis=1) - 0.5) * 2
def
score
(
self
,
X
,
y
,
metric
=
DEFAULT_SCORE_METRIC
):
"""
Evaluate OMPForestClassifer on (`X`, `y`) using `metric`
...
...
@@ -129,7 +157,7 @@ class OmpForestMulticlassClassifier(OmpForest):
omp_trees_indices
=
np
.
nonzero
(
weights
)
label_names
.
append
(
class_label
)
atoms_binary
=
(
forest_predictions
[
num_class
].
T
-
0.5
)
*
2
# centré réduit de 0/1 à -1/1
preds
.
append
(
np
.
sum
(
atoms_binary
[
omp_trees_indices
],
axis
=
0
))
preds
.
append
(
np
.
sum
(
atoms_binary
[
omp_trees_indices
],
axis
=
0
)
/
len
(
omp_trees_indices
)
)
num_class
+=
1
preds
=
np
.
array
(
preds
).
T
...
...
code/bolsonaro/trainer.py
View file @
1db36b5d
...
...
@@ -108,7 +108,8 @@ class Trainer(object):
else
:
y_pred
=
model
.
predict_no_weights
(
X
)
if
type
(
model
)
is
OmpForestBinaryClassifier
:
y_pred
=
y_pred
.
round
()
y_pred
=
np
.
sign
(
y_pred
)
y_pred
=
np
.
where
(
y_pred
==
0
,
1
,
y_pred
)
result
=
self
.
_classification_score_metric
(
y_true
,
y_pred
)
return
result
...
...
@@ -187,5 +188,3 @@ class Trainer(object):
self
.
_logger
.
info
(
"Base performance on dev without weights: {}"
.
format
(
results
.
dev_score_base
))
self
.
_logger
.
info
(
"Performance on dev: {}"
.
format
(
results
.
dev_score
))
code/bolsonaro/utils.py
View file @
1db36b5d
...
...
@@ -5,6 +5,8 @@ from copy import deepcopy
import
contextlib
import
joblib
from
sklearn.datasets
import
fetch_openml
def
resolve_experiment_id
(
models_dir
):
"""
...
...
@@ -78,6 +80,16 @@ def change_binary_func_load(base_load_function):
return
X
,
y
return
func_load
def change_binary_func_openml(dataset_name):
    """Build a loader for an OpenML dataset with a two-valued target.

    The returned ``func_load`` fetches ``dataset_name`` through
    ``fetch_openml``, checks that the target holds exactly two distinct
    values, hands the target and the last of those values (in sorted order)
    to ``binarize_class_data`` and casts the result to ``int``.

    :param dataset_name: dataset name forwarded to ``fetch_openml``
    :return: a function ``func_load(return_X_y=True, random_state=None)``
        returning the tuple ``(X, y)``
    :raises AssertionError: (from ``func_load``) when the target does not
        hold exactly two distinct values
    """
    def func_load(return_X_y=True, random_state=None):
        # ``random_state`` is accepted but not used in this body.
        X, y = fetch_openml(dataset_name, return_X_y=return_X_y)
        possible_classes = sorted(set(y))
        # The message previously named change_binary_func_load and was
        # misspelled; it now points at the function that actually failed.
        assert len(possible_classes) == 2, \
            "Function change_binary_func_openml only works for binary classification"
        y = binarize_class_data(y, possible_classes[-1])
        y = y.astype('int')
        return X, y
    return func_load
@
contextlib
.
contextmanager
def
tqdm_joblib
(
tqdm_object
):
"""Context manager to patch joblib to report into tqdm progress bar given as argument"""
...
...
code/ensemble_selection.py
deleted
100644 → 0
View file @
3f1e869d
# Implementation of the paper 'Ensemble selection from libraries of models' by Rich Caruana et al.
# A set of trees is trained, then those performing the best on the dev set are added to the forest.
from
sklearn.datasets
import
fetch_california_housing
from
sklearn.model_selection
import
train_test_split
from
sklearn.tree
import
DecisionTreeRegressor
from
sklearn.externals
import
joblib
import
numpy
as
np
from
sklearn.metrics
import
r2_score
import
matplotlib.pyplot
as
plt
# Load the California housing data and carve out train / validation / test
# subsets (10000 test samples, then 3000 validation samples).
(data, target) = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=10000, random_state=2019)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=3000, random_state=2019)

# Hyper-parameter grid used to build the library of candidate trees.
criterion_arr = ["mse"]  # other candidates: "friedman_mse", "mae"
splitter_arr = ["best"]  # other candidate: "random"
depth_arr = list(range(5, 20))
min_samples_split_arr = list(range(2, 20))
min_samples_leaf_arr = list(range(2, 20))
max_features_arr = ["sqrt"]  # other candidates: "auto", "log2"

# Train one tree per grid point.
library = list()
for criterion in criterion_arr:
    for splitter in splitter_arr:
        for depth in depth_arr:
            for min_samples_split in min_samples_split_arr:
                for min_samples_leaf in min_samples_leaf_arr:
                    for max_features in max_features_arr:
                        t = DecisionTreeRegressor(
                            criterion=criterion,
                            splitter=splitter,
                            max_depth=depth,
                            min_samples_split=min_samples_split,
                            min_samples_leaf=min_samples_leaf,
                            max_features=max_features,
                            random_state=2017)
                        t.fit(X_train, y_train)
                        library.append(t)

print("classifiers", len(library))

# Validation R2 of every individual tree; the best one seeds the ensemble.
scores_list = [classif.score(X_val, y_val) for classif in library]
print("scores", len(scores_list))
np_scores_list = np.array(scores_list)

# Validation predictions of every tree, computed once. The trees are fitted
# and never modified afterwards, so caching gives the same values as calling
# predict() again for each candidate at each greedy step.
val_predictions = [tree.predict(X_val) for tree in library]

trees_in_forest = list()
perf_prun_forest = list()

for num_sel_tree in [2, 4, 6, 8, 10, 15, 20, 30, 40, 50]:
    # class_list / pred_list are kept aligned: entry j of one belongs to
    # entry j of the other, and both shrink as trees get selected.
    class_list = list(library)
    pred_list = list(val_predictions)
    print("class list", len(class_list))

    # Start from the single best tree on the validation set.
    m = np.argmax(np_scores_list)
    ens_sel = [class_list[m]]
    temp_pred = pred_list[m]
    del class_list[m]
    del pred_list[m]

    # Greedy forward selection: at each step add the tree that maximises the
    # validation R2 of the averaged ensemble prediction.
    for k in range(num_sel_tree - 1):
        cand_index = 0
        r2_best = -np.inf
        for j, cand_pred in enumerate(pred_list):
            temp_mean = np.mean(np.vstack((temp_pred, cand_pred)), axis=0)
            r2_temp = r2_score(y_val, temp_mean)
            if r2_temp > r2_best:
                r2_best = r2_temp
                cand_index = j
        ens_sel.append(class_list[cand_index])
        temp_pred = np.vstack((temp_pred, pred_list[cand_index]))
        del class_list[cand_index]
        del pred_list[cand_index]

    # Evaluate the averaged prediction of the selected trees on the test set.
    test_list = np.array([mod.predict(X_test) for mod in ens_sel])
    test_mean = np.mean(test_list, axis=0)
    # r2_score expects (y_true, y_pred); the arguments were swapped before,
    # which yields a different value because R2 is not symmetric.
    r2_test = r2_score(y_test, test_mean)
    print(num_sel_tree, r2_test)
    trees_in_forest.append(num_sel_tree)
    perf_prun_forest.append(r2_test)

print(trees_in_forest)
print(perf_prun_forest)

# Plot test R2 against the number of selected trees.
ax = plt.gca()
ax.plot(trees_in_forest, perf_prun_forest, label='ensemble selection')
ax.legend()
plt.xlabel('num trees')
plt.ylabel('r2 score')
plt.savefig("ensemble_selection.pdf")
plt.show()
code/forest_similarity.py
deleted
100644 → 0
View file @
3f1e869d
from
sklearn.datasets
import
fetch_california_housing
from
sklearn.model_selection
import
train_test_split
from
sklearn.tree
import
DecisionTreeRegressor
from
sklearn.externals
import
joblib
import
numpy
as
np
from
sklearn.metrics
import
r2_score
from
sklearn.ensemble
import
RandomForestRegressor
import
matplotlib.pyplot
as
plt
# Load the California housing data and carve out train / validation / test
# subsets (10000 test samples, then 3000 validation samples).
(data, target) = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=10000, random_state=2019)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=3000, random_state=2019)

num_trees = 100
prun_for_size = [2, 4, 6, 8, 10, 12, 15, 20]

# Reference forest; its validation R2 is the value the pruned forests try to
# stay close to.
randfor = RandomForestRegressor(num_trees, max_depth=7, random_state=2019)
randfor.fit(X_train, y_train)
randfor_pred = randfor.score(X_val, y_val)

trees_forest = randfor.estimators_

# Validation predictions of every tree, computed once. The trees are not
# modified after fitting, so this gives the same values as re-predicting
# inside the leave-one-out loop below.
val_predictions = [tree.predict(X_val) for tree in trees_forest]

trees_in_forest = list()
perf_prun_forest = list()

for target_size in prun_for_size:
    # ens_sel collects the trees that are REMOVED from the forest.
    ens_sel = list()
    # trees_list / preds_list are kept aligned and shrink together.
    trees_list = list(randfor.estimators_)
    preds_list = list(val_predictions)

    # Remove one tree per step until only target_size trees remain.
    for j in range(num_trees - target_size):
        best_simil = np.inf
        cand_ind = 0
        for i in range(len(trees_list)):
            # Validation R2 of the forest with tree i left out.
            remaining = preds_list[:i] + preds_list[i + 1:]
            val_mean = np.mean(np.array(remaining), axis=0)
            # r2_score expects (y_true, y_pred); the arguments were swapped
            # before, making r2_val inconsistent with randfor.score above.
            r2_val = r2_score(y_val, val_mean)
            temp_simil = abs(randfor_pred - r2_val)
            if temp_simil < best_simil:
                cand_ind = i
                best_simil = temp_simil
        ens_sel.append(trees_list[cand_ind])
        del trees_list[cand_ind]
        del preds_list[cand_ind]

    # The pruned forest is whatever was not removed.
    prun_for = list(set(trees_forest) - set(ens_sel))
    print("prun_for", len(prun_for))
    print("trees forest", len(trees_forest))
    print("ens_sel", len(ens_sel))

    # Evaluate the averaged prediction of the pruned forest on the test set.
    test_list = np.array([mod.predict(X_test) for mod in prun_for])
    test_mean = np.mean(test_list, axis=0)
    # Same (y_true, y_pred) ordering fix as above.
    r2_test = r2_score(y_test, test_mean)
    print(len(prun_for), r2_test)
    trees_in_forest.append(len(prun_for))
    perf_prun_forest.append(r2_test)

print(trees_in_forest)
# Previously printed only r2_test (the last score); the full list matches the
# sizes printed just above and the data that gets plotted.
print(perf_prun_forest)

# Plot test R2 against the number of trees kept.
ax = plt.gca()
ax.plot(trees_in_forest, perf_prun_forest, label='pruned forest')
ax.legend()
plt.xlabel('num trees')
plt.ylabel('r2 score')
plt.savefig("pruned_forest.pdf")
plt.show()
code/train.py
View file @
1db36b5d
...
...
@@ -248,7 +248,7 @@ if __name__ == "__main__":
parameters
[
'extracted_forest_size'
]
=
np
.
unique
(
np
.
around
(
hyperparameters
[
'n_estimators'
]
*
np
.
linspace
(
0
,
args
.
extracted_forest_size_stop
,
parameters
[
'extracted_forest_size_samples'
]
+
1
,
endpoint
=
Fals
e
)[
1
:]).
astype
(
np
.
int
)).
tolist
()
endpoint
=
Tru
e
)[
1
:]).
astype
(
np
.
int
)).
tolist
()
if
parameters
[
'seeds'
]
!=
None
and
parameters
[
'random_seed_number'
]
>
1
:
logger
.
warning
(
'seeds and random_seed_number parameters are both specified. Seeds will be used.'
)
...
...
experiments/.gitkeep
deleted
100644 → 0
View file @
3f1e869d
experiments/20newsgroups_vectorized/stage1/none_with_params.json
deleted
100644 → 0
View file @
3f1e869d
{
"experiment_id"
:
1
,
"experiment_configuration"
:
null
,
"experiment_configuration_path"
:
"experiments"
,
"dataset_name"
:
"20newsgroups_vectorized"
,
"normalize_D"
:
false
,
"dataset_normalizer"
:
"standard"
,
"forest_size"
:
null
,
"extracted_forest_size_samples"
:
5
,
"extracted_forest_size_stop"
:
0.05
,
"models_dir"
:
"models/20newsgroups_vectorized/stage1"
,
"dev_size"
:
0.2
,
"test_size"
:
0.2
,
"random_seed_number"
:
1
,
"seeds"
:
[
1
,
2
,
3
,
4
,
5
],
"subsets_used"
:
"train,dev"
,
"normalize_weights"
:
false
,
"verbose"
:
false
,
"skip_best_hyperparams"
:
false
,
"save_experiment_configuration"
:
[
"1"
,
"none_with_params"
],
"job_number"
:
-1
,
"extraction_strategy"
:
"none"
,
"extracted_forest_size"
:
[
7
,
13
,
20
,
27
,
34
]
}
\ No newline at end of file
experiments/20newsgroups_vectorized/stage1/none_wo_params.json
deleted
100644 → 0
View file @
3f1e869d
{
"experiment_id"
:
4
,
"experiment_configuration"
:
null
,
"experiment_configuration_path"
:
"experiments"
,
"dataset_name"
:
"20newsgroups_vectorized"
,
"normalize_D"
:
false
,
"dataset_normalizer"
:
"standard"
,
"forest_size"
:
null
,
"extracted_forest_size_samples"
:
5
,
"extracted_forest_size_stop"
:
0.05
,
"models_dir"
:
"models/20newsgroups_vectorized/stage1"
,
"dev_size"
:
0.2
,
"test_size"
:
0.2
,
"random_seed_number"
:
1
,
"seeds"
:
[
1
,
2
,
3
,
4
,
5
],
"subsets_used"
:
"train,dev"
,
"normalize_weights"
:
false
,
"verbose"
:
false
,
"skip_best_hyperparams"
:
true
,
"save_experiment_configuration"
:
[
"1"
,
"none_wo_params"
],
"job_number"
:
-1
,
"extraction_strategy"
:
"none"
,
"extracted_forest_size"
:
[
7
,
13
,
20
,
27
,
34
]
}
\ No newline at end of file
experiments/20newsgroups_vectorized/stage1/omp_wo_params.json
deleted
100644 → 0
View file @
3f1e869d
{
"experiment_id"
:
6
,
"experiment_configuration"
:
null
,
"experiment_configuration_path"
:
"experiments"
,
"dataset_name"
:
"20newsgroups_vectorized"
,
"normalize_D"
:
false
,
"dataset_normalizer"
:
"standard"
,
"forest_size"
:
null
,
"extracted_forest_size_samples"
:
5
,
"extracted_forest_size_stop"
:
0.05
,
"models_dir"
:
"models/20newsgroups_vectorized/stage1"
,
"dev_size"
:
0.2
,
"test_size"
:
0.2
,
"random_seed_number"
:
1
,
"seeds"
:
[
1
,
2
,
3
,
4
,
5
],
"subsets_used"
:
"train,dev"
,
"normalize_weights"
:
false
,
"verbose"
:
false
,
"skip_best_hyperparams"
:
true
,
"save_experiment_configuration"
:
[
"1"
,
"omp_wo_params"
],
"job_number"
:
-1
,
"extraction_strategy"
:
"omp"
,
"extracted_forest_size"
:
[
7
,
13
,
20
,
27
,
34
]
}
\ No newline at end of file
experiments/20newsgroups_vectorized/stage1/params.json
deleted
100644 → 0
View file @
3f1e869d
{
"scorer"
:
"accuracy"
,
"best_score_train"
:
0.7953125
,
"best_score_test"
:
0.7909854175872735
,
"best_parameters"
:
{
"max_depth"
:
20
,
"max_features"
:
"sqrt"
,
"min_samples_leaf"
:
1
,
"n_estimators"
:
809
},
"random_seed"
:
1763
}
\ No newline at end of file
experiments/20newsgroups_vectorized/stage1/random_with_params.json
deleted
100644 → 0
View file @
3f1e869d
{
"experiment_id"
:
2
,
"experiment_configuration"
:
null
,
"experiment_configuration_path"
:
"experiments"
,
"dataset_name"
:
"20newsgroups_vectorized"
,
"normalize_D"
:
false
,
"dataset_normalizer"
:
"standard"
,
"forest_size"
:
null
,
"extracted_forest_size_samples"
:
5
,
"extracted_forest_size_stop"
:
0.05
,
"models_dir"
:
"models/20newsgroups_vectorized/stage1"
,
"dev_size"
:
0.2
,
"test_size"
:
0.2
,
"random_seed_number"
:
1
,
"seeds"
:
[
1
,
2
,
3
,
4
,
5
],
"subsets_used"
:
"train,dev"
,