Commit 4ad6e725 (Luc Giffon / bolsonaro), authored Mar 10, 2020 by Charly Lamothe

Merge branch '15-integration-sota' into 'master'

Resolve "integration-sota"

See merge request !20

Parents: 1db36b5d, 462e76fa
Changes: 297
TODO.md

```diff
-* Fix model results loading in compute_results.py.
 * Check that omp multiclasses classifier is working as expected.
-* Fix the dataset error of fetcher when job_number > 1.
\ No newline at end of file
```
code/bolsonaro/models/ensemble_selection_forest_regressor.py (new file, mode 100644)

```python
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from sklearn.tree import DecisionTreeRegressor
from abc import abstractmethod, ABCMeta
import numpy as np
from tqdm import tqdm


class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta):
    """
    'Ensemble selection from libraries of models' by Rich Caruana et al.
    """

    def __init__(self, models_parameters, library, score_metric=mean_squared_error):
        self._models_parameters = models_parameters
        self._library = library
        self._extracted_forest_size = self._models_parameters.extracted_forest_size
        self._score_metric = score_metric

    @property
    def models_parameters(self):
        return self._models_parameters

    @property
    def library(self):
        return self._library

    def fit(self, X_train, y_train, X_val, y_val):
        # Score every candidate model of the library on the validation set.
        scores_list = list()
        for estimator in self._library:
            val_score = self._score_metric(estimator.predict(X_val), y_val)
            scores_list.append(val_score)

        # Seed the ensemble with the single best model, then greedily add the
        # candidate whose inclusion minimizes the score of the averaged
        # predictions, without replacement.
        class_list = list(self._library)
        m = np.argmin(np.asarray(scores_list))  # lowest validation error is best for the default MSE metric
        self._ensemble_selected = [class_list[m]]
        temp_pred = class_list[m].predict(X_val)
        del class_list[m]
        for k in range(self._extracted_forest_size - 1):
            candidate_index = 0
            best_score = 100000
            for j in range(len(class_list)):
                # Tentatively stack candidate j's predictions and score the mean...
                temp_pred = np.vstack((temp_pred, class_list[j].predict(X_val)))
                temp_mean = np.mean(temp_pred, axis=0)
                temp_score = self._score_metric(temp_mean, y_val)
                if temp_score < best_score:
                    candidate_index = j
                    best_score = temp_score
                # ...then drop the tentative row before trying the next candidate.
                temp_pred = np.delete(temp_pred, -1, 0)
            self._ensemble_selected.append(class_list[candidate_index])
            temp_pred = np.vstack((temp_pred, class_list[candidate_index].predict(X_val)))
            del class_list[candidate_index]

    def score(self, X, y):
        predictions = self.predict_base_estimator(X)
        return self._score_metric(predictions, y)

    def predict_base_estimator(self, X):
        # Average the predictions of the selected trees.
        predictions = list()
        for tree in self._ensemble_selected:
            predictions.append(tree.predict(X))
        mean_predictions = np.mean(np.array(predictions), axis=0)
        return mean_predictions

    @staticmethod
    def generate_library(X_train, y_train, random_state=None):
        # Grid of decision-tree hyperparameters; the commented alternatives are
        # kept from the original for reference.
        criterion_arr = ["mse"]  #, "friedman_mse", "mae"]
        splitter_arr = ["best"]  #, "random"]
        depth_arr = [i for i in range(5, 20, 1)]
        min_samples_split_arr = [i for i in range(2, 20, 1)]
        min_samples_leaf_arr = [i for i in range(2, 20, 1)]
        max_features_arr = ["sqrt"]  #["auto", "sqrt", "log2"]

        library = list()
        with tqdm(total=len(criterion_arr) * len(splitter_arr) * \
            len(depth_arr) * len(min_samples_split_arr) * len(min_samples_leaf_arr) * \
            len(max_features_arr)) as bar:
            bar.set_description('Generating library')
            for criterion in criterion_arr:
                for splitter in splitter_arr:
                    for depth in depth_arr:
                        for min_samples_split in min_samples_split_arr:
                            for min_samples_leaf in min_samples_leaf_arr:
                                for max_features in max_features_arr:
                                    t = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=depth,
                                        min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                        max_features=max_features, random_state=random_state)
                                    t.fit(X_train, y_train)
                                    library.append(t)
                                    bar.update(1)
        return library
```
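A minimal usage sketch of the new class (not repository code): the `ModelsParameters` namedtuple and the synthetic data below are illustrative stand-ins for the repository's `model_parameters` object, and the library is sliced to keep the greedy selection fast.

```python
from collections import namedtuple
import numpy as np

# Hypothetical stand-in: any object exposing `extracted_forest_size` works.
ModelsParameters = namedtuple('ModelsParameters', ['extracted_forest_size'])

rng = np.random.RandomState(0)
X = rng.rand(200, 5)
y = X @ rng.rand(5) + 0.1 * rng.randn(200)
X_train, X_val, y_train, y_val = X[:100], X[100:], y[:100], y[100:]

# generate_library sweeps the full hyperparameter grid (~4.9k trees); a slice
# of it is enough to demonstrate fit/score.
library = EnsembleSelectionForestRegressor.generate_library(X_train, y_train, random_state=0)
model = EnsembleSelectionForestRegressor(ModelsParameters(extracted_forest_size=10), library[:200])
model.fit(X_train, y_train, X_val, y_val)
print(model.score(X_val, y_val))  # validation MSE of the averaged selected trees
```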
code/bolsonaro/models/model_factory.py

```diff
@@ -3,6 +3,7 @@ from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.models.model_parameters import ModelParameters
 from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
 from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
+from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
 from bolsonaro.data.task import Task
 from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
@@ -13,7 +14,7 @@ import pickle
 class ModelFactory(object):
 
     @staticmethod
-    def build(task, model_parameters):
+    def build(task, model_parameters, library=None):
         if task not in [Task.BINARYCLASSIFICATION, Task.REGRESSION, Task.MULTICLASSIFICATION]:
             raise ValueError("Unsupported task '{}'".format(task))
@@ -21,10 +22,10 @@ class ModelFactory(object):
         if model_parameters.extraction_strategy == 'omp':
             return OmpForestBinaryClassifier(model_parameters)
         elif model_parameters.extraction_strategy == 'random':
-            return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
+            return RandomForestClassifier(**model_parameters.hyperparameters,
                 random_state=model_parameters.seed)
         elif model_parameters.extraction_strategy == 'none':
-            return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
+            return RandomForestClassifier(**model_parameters.hyperparameters,
                 random_state=model_parameters.seed)
         else:
             raise ValueError('Invalid extraction strategy')
@@ -32,14 +33,16 @@ class ModelFactory(object):
         if model_parameters.extraction_strategy == 'omp':
             return OmpForestRegressor(model_parameters)
         elif model_parameters.extraction_strategy == 'random':
-            return RandomForestRegressor(n_estimators=model_parameters.extracted_forest_size,
+            return RandomForestRegressor(**model_parameters.hyperparameters,
                 random_state=model_parameters.seed)
         elif model_parameters.extraction_strategy == 'similarity':
             return SimilarityForestRegressor(model_parameters)
         elif model_parameters.extraction_strategy == 'kmeans':
             return KMeansForestRegressor(model_parameters)
+        elif model_parameters.extraction_strategy == 'ensemble':
+            return EnsembleSelectionForestRegressor(model_parameters, library=library)
         elif model_parameters.extraction_strategy == 'none':
-            return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'],
+            return RandomForestRegressor(**model_parameters.hyperparameters,
                 random_state=model_parameters.seed)
         else:
             raise ValueError('Invalid extraction strategy')
@@ -47,10 +50,10 @@ class ModelFactory(object):
         if model_parameters.extraction_strategy == 'omp':
             return OmpForestMulticlassClassifier(model_parameters)
         elif model_parameters.extraction_strategy == 'random':
-            return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
+            return RandomForestClassifier(**model_parameters.hyperparameters,
                 random_state=model_parameters.seed)
         elif model_parameters.extraction_strategy == 'none':
-            return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
+            return RandomForestClassifier(**model_parameters.hyperparameters,
                 random_state=model_parameters.seed)
         else:
             raise ValueError('Invalid extraction strategy')
```
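The substantive change is that `build` now takes an optional `library` and threads it to the new 'ensemble' strategy, while the 'random' and 'none' strategies switch to unpacking the full hyperparameter dict. A hedged sketch of the resulting call-site contract (names taken from the `seed_job` hunk in `code/train.py` further down; only the 'ensemble' strategy consumes the library):

```python
# Sketch, assuming a `dataset`, `seed` and `model_parameters` as in seed_job below.
if model_parameters.extraction_strategy == 'ensemble':
    library = EnsembleSelectionForestRegressor.generate_library(
        dataset.X_train, dataset.y_train, random_state=seed)
else:
    library = None  # ignored by every other strategy

model = ModelFactory.build(dataset.task, model_parameters, library=library)
```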
code/bolsonaro/models/omp_forest.py

```diff
@@ -123,7 +123,9 @@ class SingleOmpForest(OmpForest):
         forest_predictions = self._base_estimator_predictions(X)
 
         if self._models_parameters.normalize_D:
+            forest_predictions = forest_predictions.T
             forest_predictions /= self._forest_norms
+            forest_predictions = forest_predictions.T
 
         return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights)
@@ -134,16 +136,15 @@ class SingleOmpForest(OmpForest):
         Make all the base tree predictions
 
         :param X: a Forest
-        :return: a np.array of the predictions of the entire forest
+        :return: a np.array of the predictions of the trees selected by OMP, without applying the weights
         """
-        forest_predictions = self._base_estimator_predictions(X).T
+        forest_predictions = np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_])
 
         if self._models_parameters.normalize_D:
+            forest_predictions = forest_predictions.T
             forest_predictions /= self._forest_norms
+            forest_predictions = forest_predictions.T
 
         weights = self._omp.coef_
-        omp_trees_indices = np.nonzero(weights)[0]
-        select_trees = np.mean(forest_predictions[omp_trees_indices], axis=0)
-        print(len(omp_trees_indices))
+        select_trees = np.mean(forest_predictions[weights != 0], axis=0)
         return select_trees
```
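The rewritten `predict_no_weights` drops the index-based selection (`np.nonzero` plus a debug `print`) in favour of boolean-mask indexing; both pick exactly the rows of the trees with non-zero OMP coefficients. A small self-contained check of that equivalence on toy data (not repository code):

```python
import numpy as np

# One row of predictions per tree, one OMP coefficient per tree.
weights = np.array([0.0, 0.7, 0.0, -0.2])
forest_predictions = np.arange(8, dtype=float).reshape(4, 2)

# Old style: materialise the indices of the trees OMP kept.
idx = np.nonzero(weights)[0]                      # array([1, 3])
old = np.mean(forest_predictions[idx], axis=0)

# New style: boolean mask, same rows selected.
new = np.mean(forest_predictions[weights != 0], axis=0)

assert np.allclose(old, new)
```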
code/bolsonaro/models/omp_forest_classifier.py

```diff
@@ -37,12 +37,12 @@ class OmpForestBinaryClassifier(SingleOmpForest):
         forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_])
 
         if self._models_parameters.normalize_D:
             forest_predictions = forest_predictions.T
             forest_predictions /= self._forest_norms
             forest_predictions = forest_predictions.T
 
         weights = self._omp.coef_
-        omp_trees_indices = np.nonzero(weights)
-        omp_trees_predictions = forest_predictions[omp_trees_indices].T[1]
+        omp_trees_predictions = forest_predictions[weights != 0].T[1]
 
         # Here forest_pred is the probability of being class 1.
@@ -119,7 +119,9 @@ class OmpForestMulticlassClassifier(OmpForest):
         forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T
 
         if self._models_parameters.normalize_D:
+            forest_predictions = forest_predictions.T
             forest_predictions /= self._forest_norms
+            forest_predictions = forest_predictions.T
 
         label_names = []
         preds = []
```
code/bolsonaro/trainer.py

```diff
@@ -2,6 +2,8 @@ from bolsonaro.models.model_raw_results import ModelRawResults
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
 from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
+from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
+from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
 from bolsonaro.error_handling.logger_factory import LoggerFactory
 from bolsonaro.data.task import Task
 from . import LOG_PATH
@@ -72,20 +74,25 @@ class Trainer(object):
         else:
             raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used))
 
-    def train(self, model):
+    def train(self, model, extracted_forest_size=None):
         """
         :param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor,
             OmpForestBinaryClassifier, OmpForestMulticlassClassifier.
         :return:
         """
         self._logger.debug('Training model using train set...')
         self._begin_time = time.time()
         if type(model) in [RandomForestRegressor, RandomForestClassifier]:
-            model.fit(X=self._X_forest, y=self._y_forest)
+            if extracted_forest_size is not None:
+                estimators_index = np.arange(len(model.estimators_))
+                np.random.shuffle(estimators_index)
+                choosen_estimators = estimators_index[:extracted_forest_size]
+                model.estimators_ = np.array(model.estimators_)[choosen_estimators]
+            else:
+                model.fit(X=self._X_forest, y=self._y_forest)
         else:
             model.fit(self._X_forest,
@@ -96,7 +103,7 @@ class Trainer(object):
         self._end_time = time.time()
 
     def __score_func(self, model, X, y_true, weights=True):
-        if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]:
+        if type(model) in [OmpForestRegressor, RandomForestRegressor]:
             if weights:
                 y_pred = model.predict(X)
             else:
@@ -109,12 +116,14 @@ class Trainer(object):
             y_pred = model.predict_no_weights(X)
             if type(model) is OmpForestBinaryClassifier:
+                y_pred = np.sign(y_pred)
                 y_pred = np.where(y_pred == 0, 1, y_pred)
             result = self._classification_score_metric(y_true, y_pred)
+        elif type(model) in [SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]:
+            result = model.score(X, y_true)
         return result
 
     def __score_func_base(self, model, X, y_true):
-        if type(model) == OmpForestRegressor:
+        if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]:
             y_pred = model.predict_base_estimator(X)
             result = self._base_regression_score_metric(y_true, y_pred)
         elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]:
@@ -123,7 +132,7 @@ class Trainer(object):
         elif type(model) == RandomForestClassifier:
             y_pred = model.predict(X)
             result = self._base_classification_score_metric(y_true, y_pred)
-        elif type(model) in [RandomForestRegressor, SimilarityForestRegressor]:
+        elif type(model) is RandomForestRegressor:
             y_pred = model.predict(X)
             result = self._base_regression_score_metric(y_true, y_pred)
         return result
```
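The new `extracted_forest_size` branch of `Trainer.train` never refits: it randomly subsamples the `estimators_` of an already-fitted forest (the pretrained model built in `code/train.py` below). A standalone sketch of the same trick on a plain scikit-learn forest; the dataset and sizes are illustrative:

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=200, n_features=5, random_state=0)
model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

# Shuffle the estimator indices and keep the first `extracted_forest_size`
# trees, exactly as the new branch in Trainer.train does.
extracted_forest_size = 10
estimators_index = np.arange(len(model.estimators_))
np.random.shuffle(estimators_index)
model.estimators_ = np.array(model.estimators_)[estimators_index[:extracted_forest_size]]

print(len(model.estimators_))  # 10
```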
code/compute_results.py

```diff
@@ -133,10 +133,11 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number among [1, 5].')
-    parser.add_argument('--experiment_ids', nargs='+', type=int, required=True, help='Compute the results of the specified experiment id(s).' + \
+    parser.add_argument('--experiment_ids', nargs='+', type=str, required=True, help='Compute the results of the specified experiment id(s).' + \
         'stage=1: {{base_with_params}} {{random_with_params}} {{omp_with_params}} {{base_wo_params}} {{random_wo_params}} {{omp_wo_params}}' + \
         'stage=2: {{no_normalization}} {{normalize_D}} {{normalize_weights}} {{normalize_D_and_weights}}' + \
-        'stage=3: {{train-dev_subset}} {{train-dev_train-dev_subset}} {{train-train-dev_subset}}')
+        'stage=3: {{train-dev_subset}} {{train-dev_train-dev_subset}} {{train-train-dev_subset}}' + \
+        'stage=5: {{base_with_params}} {{random_with_params}} {{omp_with_params}} [ensemble={{id}}] [similarity={{id}}] [kmean={{id}}]')
     parser.add_argument('--dataset_name', nargs='?', type=str, required=True, help='Specify the dataset name. TODO: read it from models dir directly.')
     parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
     parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
@@ -159,7 +160,7 @@ if __name__ == "__main__":
             raise ValueError('In the case of stage 1, the number of specified experiment ids must be 6.')
 
         # Retreive the extracted forest sizes number used in order to have a base forest axis as long as necessary
-        extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1])
+        extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, int(args.experiment_ids[1]))
 
         # Experiments that used the best hyperparameters found for this dataset
@@ -167,18 +168,18 @@ if __name__ == "__main__":
         logger.info('Loading base_with_params experiment scores...')
         base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
             base_with_params_experiment_score_metric = \
-            extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
+            extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, int(args.experiment_ids[0]),
             extracted_forest_sizes_number)
         # random_with_params
         logger.info('Loading random_with_params experiment scores...')
         random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
             with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
-            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
+            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1]))
         # omp_with_params
         logger.info('Loading omp_with_params experiment scores...')
         omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
             omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-                args.models_dir, args.results_dir, args.experiment_ids[2])
+                args.models_dir, args.results_dir, int(args.experiment_ids[2]))
 
         # Experiments that didn't use the best hyperparameters found for this dataset
@@ -186,19 +187,19 @@ if __name__ == "__main__":
         logger.info('Loading base_wo_params experiment scores...')
         base_wo_params_train_scores, base_wo_params_dev_scores, base_wo_params_test_scores, \
             base_wo_params_experiment_score_metric = extract_scores_across_seeds_and_forest_size(
-                args.models_dir, args.results_dir, args.experiment_ids[3],
+                args.models_dir, args.results_dir, int(args.experiment_ids[3]),
                 extracted_forest_sizes_number)
         # random_wo_params
         logger.info('Loading random_wo_params experiment scores...')
         random_wo_params_train_scores, random_wo_params_dev_scores, random_wo_params_test_scores, \
             wo_params_extracted_forest_sizes, random_wo_params_experiment_score_metric = \
-            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[4])
+            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[4]))
         # base_wo_params
         logger.info('Loading base_wo_params experiment scores...')
         omp_wo_params_train_scores, omp_wo_params_dev_scores, omp_wo_params_test_scores, _, \
             omp_wo_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-                args.models_dir, args.results_dir, args.experiment_ids[5])
+                args.models_dir, args.results_dir, int(args.experiment_ids[5]))
 
         # Sanity check on the metrics retreived
         if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric ==
@@ -243,25 +244,25 @@ if __name__ == "__main__":
         logger.info('Loading no_normalization experiment scores...')
         _, _, no_normalization_test_scores, extracted_forest_sizes, no_normalization_experiment_score_metric = \
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-                args.experiment_ids[0])
+                int(args.experiment_ids[0]))
         # normalize_D
         logger.info('Loading normalize_D experiment scores...')
         _, _, normalize_D_test_scores, _, normalize_D_experiment_score_metric = \
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-                args.experiment_ids[1])
+                int(args.experiment_ids[1]))
         # normalize_weights
         logger.info('Loading normalize_weights experiment scores...')
         _, _, normalize_weights_test_scores, _, normalize_weights_experiment_score_metric = \
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-                args.experiment_ids[2])
+                int(args.experiment_ids[2]))
         # normalize_D_and_weights
         logger.info('Loading normalize_D_and_weights experiment scores...')
         _, _, normalize_D_and_weights_test_scores, _, normalize_D_and_weights_experiment_score_metric = \
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-                args.experiment_ids[3])
+                int(args.experiment_ids[3]))
 
         # Sanity check on the metrics retreived
         if not (no_normalization_experiment_score_metric == normalize_D_experiment_score_metric
@@ -290,21 +291,21 @@ if __name__ == "__main__":
         train_dev_subset_train_scores, train_dev_subset_dev_scores, train_dev_subset_test_scores, \
             extracted_forest_sizes, train_dev_subset_experiment_score_metric = \
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-                args.experiment_ids[0])
+                int(args.experiment_ids[0]))
         # train-dev_train-dev_subset
         logger.info('Loading train-dev_train-dev_subset experiment scores...')
         train_dev_train_dev_subset_train_scores, train_dev_train_dev_subset_dev_scores, train_dev_train_dev_subset_test_scores, \
             _, train_dev_train_dev_subset_experiment_score_metric = \
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-                args.experiment_ids[1])
+                int(args.experiment_ids[1]))
         # train-train-dev_subset
         logger.info('Loading train-train-dev_subset experiment scores...')
         train_train_dev_subset_train_scores, train_train_dev_subset_dev_scores, train_train_dev_subset_test_scores, \
             _, train_train_dev_subset_experiment_score_metric = \
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-                args.experiment_ids[2])
+                int(args.experiment_ids[2]))
 
         # Sanity check on the metrics retreived
         if not (train_dev_subset_experiment_score_metric == train_dev_train_dev_subset_experiment_score_metric
@@ -349,13 +350,13 @@ if __name__ == "__main__":
         logger.info('Loading base_with_params experiment scores...')
         base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
             base_with_params_experiment_score_metric = \
-            extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
+            extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, int(args.experiment_ids[0]),
             extracted_forest_sizes_number)
         # random_with_params
         logger.info('Loading random_with_params experiment scores...')
         random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
             with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
-            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
+            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1]))
         # omp_with_params
         logger.info('Loading omp_with_params experiment scores...')
         """omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
@@ -363,12 +364,12 @@ if __name__ == "__main__":
             args.models_dir, args.results_dir, args.experiment_ids[2])"""
         omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
             omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-                args.models_dir, args.results_dir, args.experiment_ids[2])
+                args.models_dir, args.results_dir, int(args.experiment_ids[2]))
         #omp_with_params_without_weights
-        logger.info('Loading omp_with_params experiment scores...')
+        logger.info('Loading omp_no_weights experiment scores...')
         omp_with_params_without_weights_train_scores, omp_with_params_without_weights_dev_scores, \
             omp_with_params_without_weights_test_scores, _, \
             omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-                args.models_dir, args.results_dir, args.experiment_ids[2], weights=False)
+                args.models_dir, args.results_dir, int(args.experiment_ids[2]), weights=False)
 
         """# base_with_params
         logger.info('Loading base_with_params experiment scores 2...')
@@ -388,7 +389,7 @@ if __name__ == "__main__":
             raise ValueError('Score metrics of all experiments must be the same.')
         experiments_score_metric = base_with_params_experiment_score_metric
 
-        output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4_fix')
+        output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4')
         pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
 
         Plotter.plot_stage2_losses(
@@ -402,47 +403,63 @@ if __name__ == "__main__":
             title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
     elif args.stage == 5:
         # Retreive the extracted forest sizes number used in order to have a base forest axis as long as necessary
-        extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1])
+        extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, int(args.experiment_ids[1]))
+
+        all_labels = list()
+        all_scores = list()
 
         # base_with_params
         logger.info('Loading base_with_params experiment scores...')
         base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
             base_with_params_experiment_score_metric = \
-            extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
+            extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, int(args.experiment_ids[0]),
             extracted_forest_sizes_number)
         # random_with_params
         logger.info('Loading random_with_params experiment scores...')
         random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
             with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
-            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
+            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1]))
         # omp_with_params
         logger.info('Loading omp_with_params experiment scores...')
         omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
             omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-                args.models_dir, args.results_dir, args.experiment_ids[2])
-        # omp_with_params
-        logger.info('Loading kmeans_with_params experiment scores...')
-        kmeans_with_params_train_scores, kmeans_with_params_dev_scores, kmeans_with_params_test_scores, _, \
-            kmeans_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-                args.models_dir, args.results_dir, args.experiment_ids[3])
-        # Sanity check on the metrics retreived
-        if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric ==
-            omp_with_params_experiment_score_metric == kmeans_with_params_experiment_score_metric):
-            raise ValueError('Score metrics of all experiments must be the same.')
-        experiments_score_metric = base_with_params_experiment_score_metric
+                args.models_dir, args.results_dir, int(args.experiment_ids[2]))
+        #omp_with_params_without_weights
+        logger.info('Loading omp_with_params experiment scores...')
+        omp_with_params_without_weights_train_scores, omp_with_params_without_weights_dev_scores, \
+            omp_with_params_without_weights_test_scores, _, \
+            omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
+                args.models_dir, args.results_dir, int(args.experiment_ids[2]), weights=False)
+
+        all_labels = ['base', 'random', 'omp', 'omp_without_weights']
+        all_scores = [base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
+            omp_with_params_without_weights_test_scores]
+
+        for i in range(3, len(args.experiment_ids)):
+            if 'kmeans' in args.experiment_ids[i]:
+                label = 'kmeans'
+            elif 'similarity' in args.experiment_ids[i]:
+                label = 'similarity'
+            elif 'ensemble' in args.experiment_ids[i]:
+                label = 'ensemble'
+            else:
+                logger.error('Invalid value encountered')
+                continue
+
+            logger.info(f'Loading {label} experiment scores...')
+            _, _, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes(
+                args.models_dir, args.results_dir, int(args.experiment_ids[i].split('=')[1]))
+            all_labels.append(label)
+            all_scores.append(current_test_scores)
 
-        output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5_kmeans')
+        output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5')
         pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
 
         Plotter.plot_stage2_losses(
-            file_path=output_path + os.sep + 'losses.png',
-            all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
-                kmeans_with_params_test_scores],
-            all_labels=['base', 'random', 'omp', 'kmeans'],
+            file_path=output_path + os.sep + f"losses_{'-'.join(all_labels)}.png",
+            all_experiment_scores=all_scores,
+            all_labels=all_labels,
             x_value=with_params_extracted_forest_sizes,
             xlabel='Number of trees extracted',
-            ylabel=experiments_score_metric,
+            ylabel=base_with_params_experiment_score_metric,
             title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
     else:
         raise ValueError('This stage number is not supported yet, but it will be!')
```
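Stage 5 is the reason `--experiment_ids` becomes `type=str`: the first three ids stay plain integers, while the optional SOTA baselines arrive as `label=id` pairs that the loop above parses with `split('=')`. A toy illustration of the convention (the id values are made up):

```python
# First three: plain integer ids; the rest: 'label=id' pairs.
experiment_ids = ['1', '2', '3', 'ensemble=12', 'similarity=7']

for raw_id in experiment_ids[3:]:
    label = raw_id.split('=')[0]                # e.g. 'ensemble'
    experiment_id = int(raw_id.split('=')[1])   # e.g. 12
    print(label, experiment_id)
```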
code/train.py

```diff
@@ -2,6 +2,7 @@ from bolsonaro.data.dataset_parameters import DatasetParameters
 from bolsonaro.data.dataset_loader import DatasetLoader
 from bolsonaro.models.model_factory import ModelFactory
 from bolsonaro.models.model_parameters import ModelParameters
+from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
 from bolsonaro.trainer import Trainer
 from bolsonaro.utils import resolve_experiment_id, tqdm_joblib
 from bolsonaro import LOG_PATH
@@ -9,6 +10,7 @@ from bolsonaro.error_handling.logger_factory import LoggerFactory
 from dotenv import find_dotenv, load_dotenv
 import argparse
+import copy
 import json
 import pathlib
 import random
@@ -53,10 +55,37 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
     trainer = Trainer(dataset)
 
+    if parameters['extraction_strategy'] == 'ensemble':
+        library = EnsembleSelectionForestRegressor.generate_library(dataset.X_train, dataset.y_train,
+            random_state=seed)
+    else:
+        library = None
+
+    if parameters['extraction_strategy'] == 'random':
+        pretrained_model_parameters = ModelParameters(
+            extracted_forest_size=parameters['forest_size'],
+            normalize_D=parameters['normalize_D'],
+            subsets_used=parameters['subsets_used'],
+            normalize_weights=parameters['normalize_weights'],
+            seed=seed,
+            hyperparameters=hyperparameters,
+            extraction_strategy=parameters['extraction_strategy']
+        )
+        pretrained_estimator = ModelFactory.build(dataset.task, pretrained_model_parameters, library=library)
+        pretraned_trainer = Trainer(dataset)
+        pretraned_trainer.init(pretrained_estimator, subsets_used=parameters['subsets_used'])
+        pretrained_estimator.fit(
+            X=pretraned_trainer._X_forest,
+            y=pretraned_trainer._y_forest
+        )
+    else:
+        pretrained_estimator = None
+        pretrained_model_parameters = None
+
     if parameters['extraction_strategy'] != 'none':
         with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb:
             Parallel(
```