Commit 149576d3 authored by Baptiste Bauvin

Reworking

parent 3c2a2df3
Showing with 175 additions and 164 deletions
......@@ -33,7 +33,7 @@ track_tracebacks: True
# If the dataset is multiclass, will use this multiclass-to-biclass method
multiclass_method: "oneVersusOne"
# The ratio number of test examples/number of train examples
# The ratio number of test examples/number of train samples
split: 0.8
# The number of folds in the cross-validation process when hyper-parameter optimization is performed
nb_folds: 5
......
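For reference, a minimal sketch of how a configuration file with the options above could be read (assuming PyYAML; the platform's own loader in utils.configuration may differ, and the file name and values are hypothetical):

import yaml

# Hypothetical example config mirroring the options shown in the hunk above
with open("config_example_0.yml", "r") as config_file:
    config = yaml.safe_load(config_file)

# e.g. config["multiclass_method"] == "oneVersusOne",
#      config["split"] == 0.8 and config["nb_folds"] == 5
print(config["split"], config["nb_folds"])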
......@@ -11,19 +11,54 @@ def execute(config_path=None): # pragma: no cover
exec_classif.exec_classif(sys.argv[1:])
else:
if config_path == "example 0":
config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "examples", "config_files", "config_example_0.yml")
config_path = os.path.join(
os.path.dirname(
os.path.realpath(__file__)),
"examples",
"config_files",
"config_example_0.yml")
elif config_path == "example 1":
config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "examples", "config_files", "config_example_1.yml")
config_path = os.path.join(
os.path.dirname(
os.path.realpath(__file__)),
"examples",
"config_files",
"config_example_1.yml")
elif config_path == "example 2.1.1":
config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "examples", "config_files", "config_example_2_1_1.yml")
config_path = os.path.join(
os.path.dirname(
os.path.realpath(__file__)),
"examples",
"config_files",
"config_example_2_1_1.yml")
elif config_path == "example 2.1.2":
config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "examples", "config_files", "config_example_2_1_2.yml")
config_path = os.path.join(
os.path.dirname(
os.path.realpath(__file__)),
"examples",
"config_files",
"config_example_2_1_2.yml")
elif config_path == "example 2.2":
config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "examples", "config_files", "config_example_2_2.yml")
config_path = os.path.join(
os.path.dirname(
os.path.realpath(__file__)),
"examples",
"config_files",
"config_example_2_2.yml")
elif config_path == "example 2.3":
config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "examples", "config_files", "config_example_2_3.yml")
config_path = os.path.join(
os.path.dirname(
os.path.realpath(__file__)),
"examples",
"config_files",
"config_example_2_3.yml")
elif config_path == "example 3":
config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "examples", "config_files", "config_example_3.yml")
config_path = os.path.join(
os.path.dirname(
os.path.realpath(__file__)),
"examples",
"config_files",
"config_example_3.yml")
exec_classif.exec_classif(["--config_path", config_path])
......
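The elif chain above repeats the same os.path.join call for every bundled example. As a hedged sketch only (resolve_config_path and EXAMPLE_CONFIGS are hypothetical names, not part of the commit), the same dispatch could be written as a lookup table:

import os

# Hypothetical mapping of example names to the bundled config files listed above
EXAMPLE_CONFIGS = {
    "example 0": "config_example_0.yml",
    "example 1": "config_example_1.yml",
    "example 2.1.1": "config_example_2_1_1.yml",
    "example 2.1.2": "config_example_2_1_2.yml",
    "example 2.2": "config_example_2_2.yml",
    "example 2.3": "config_example_2_3.yml",
    "example 3": "config_example_3.yml",
}

def resolve_config_path(config_path):
    """Return the bundled example config path, or the given path unchanged."""
    if config_path in EXAMPLE_CONFIGS:
        return os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "examples", "config_files",
                            EXAMPLE_CONFIGS[config_path])
    return config_path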
import itertools
import logging
import os
import pkgutil
......@@ -7,7 +6,6 @@ import traceback
import matplotlib
import numpy as np
from sklearn.tree import DecisionTreeClassifier
# Import own modules
from . import monoview_classifiers
......@@ -16,8 +14,8 @@ from .monoview.exec_classif_mono_view import exec_monoview
from .multiview.exec_multiview import exec_multiview
from .result_analysis.execution import analyze_iterations, analyze
from .utils import execution, dataset, configuration
from .utils.organization import secure_file_path
from .utils.dataset import delete_HDF5
from .utils.organization import secure_file_path
matplotlib.use(
'Agg') # Anti-Grain Geometry C++ library to make a raster (pixel) image of the figure
......@@ -95,7 +93,8 @@ def init_argument_dictionaries(benchmark, views_dictionary,
def init_multiview_exps(classifier_names, views_dictionary, nb_class,
kwargs_init, hps_method, hps_kwargs): # pragma: no cover
kwargs_init, hps_method,
hps_kwargs): # pragma: no cover
multiview_arguments = []
for classifier_name in classifier_names:
arguments = get_path_dict(kwargs_init[classifier_name])
......@@ -104,7 +103,8 @@ def init_multiview_exps(classifier_names, views_dictionary, nb_class,
gen_single_multiview_arg_dictionary(classifier_name,
arguments,
nb_class,
{"param_grid":hps_kwargs[classifier_name]},
{"param_grid": hps_kwargs[
classifier_name]},
views_dictionary=views_dictionary)]
elif hps_method == "Random":
hps_kwargs = dict((key, value)
......@@ -168,7 +168,8 @@ def init_monoview_exps(classifier_names,
view_index,
view_name,
{"param_grid":
hps_kwargs[classifier_name]})
hps_kwargs[
classifier_name]})
elif hps_method == "Random":
hps_kwargs = dict((key, value)
for key, value in hps_kwargs.items()
......@@ -188,7 +189,8 @@ def init_monoview_exps(classifier_names,
hps_kwargs)
else:
raise ValueError('At the moment only "None", "Random" or "Grid" '
raise ValueError(
'At the moment only "None", "Random" or "Grid" '
'are available as hyper-parameter search '
'methods, sadly "{}" is not'.format(hps_method)
)
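For orientation, the accepted hps_method values and the shape hps_kwargs takes in the "Grid" branch (the classifier name and parameter values below are hypothetical; only the keying by classifier name is taken from the code above):

# Hypothetical hyper-parameter search settings matching the branches above
hps_method = "Grid"        # accepted values: "None", "Random" or "Grid"
hps_kwargs = {
    "decision_tree": {     # one grid per classifier name (assumed layout)
        "max_depth": [1, 3, 5],
        "criterion": ["gini", "entropy"],
    },
}
# In the "Grid" branch each classifier then receives
# {"param_grid": hps_kwargs["decision_tree"]}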
......@@ -280,6 +282,7 @@ def is_dict_in(dictionary):
paths.append(key)
return paths
def init_kwargs(args, classifiers_names, framework="monoview"):
r"""Used to init kwargs thanks to a function in each monoview classifier package.
......@@ -363,7 +366,9 @@ def arange_metrics(metrics, metric_princ):
metrics : list of lists
The metrics list, but arranged so the first one is the principal one."""
if metric_princ in metrics:
metrics = dict((key, value) if not key == metric_princ else (key+"*", value) for key, value in metrics.items())
metrics = dict(
(key, value) if not key == metric_princ else (key + "*", value) for
key, value in metrics.items())
else:
raise ValueError("{} not in metric pool ({})".format(metric_princ,
metrics))
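Note that with this dict-based version the principal metric is marked with a trailing "*" rather than moved to the front. A small worked example of what the comprehension produces, using a hypothetical metrics dictionary:

metric_princ = "accuracy_score"
metrics = {"accuracy_score": {}, "f1_score": {"average": "micro"}}
metrics = dict((key, value) if key != metric_princ else (key + "*", value)
               for key, value in metrics.items())
# -> {"accuracy_score*": {}, "f1_score": {"average": "micro"}}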
......@@ -374,7 +379,7 @@ def benchmark_init(directory, classification_indices, labels, labels_dictionary,
k_folds, dataset_var):
"""
Initializes the benchmark, by saving the indices of the train
examples and the cross validation folds.
samples and the cross validation folds.
Parameters
----------
......@@ -382,7 +387,7 @@ def benchmark_init(directory, classification_indices, labels, labels_dictionary,
The benchmark's result directory
classification_indices : numpy array
The indices of the examples, split for the train/test split
The indices of the samples, split for the train/test split
labels : numpy array
The labels of the dataset
......@@ -400,7 +405,7 @@ def benchmark_init(directory, classification_indices, labels, labels_dictionary,
logging.debug("Start:\t Benchmark initialization")
secure_file_path(os.path.join(directory, "train_labels.csv"))
train_indices = classification_indices[0]
train_labels = dataset_var.get_labels(example_indices=train_indices)
train_labels = dataset_var.get_labels(sample_indices=train_indices)
np.savetxt(os.path.join(directory, "train_labels.csv"), train_labels,
delimiter=",")
np.savetxt(os.path.join(directory, "train_indices.csv"),
......@@ -558,7 +563,7 @@ def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None,
hyper_param_search=hyper_param_search,
metrics=metrics,
**arguments)]
except:
except BaseException:
if track_tracebacks:
traceback_outputs[
arguments["classifier_name"] + "-" + arguments[
......@@ -591,7 +596,7 @@ def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None,
hps_method=hyper_param_search,
metrics=metrics, n_iter=args["hps_iter"],
**arguments)]
except:
except BaseException:
if track_tracebacks:
traceback_outputs[
arguments["classifier_name"]] = traceback.format_exc()
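Both hunks above tighten the same pattern: run one experiment and, if track_tracebacks is set, record the formatted traceback instead of aborting the whole benchmark. A stripped-down sketch (do_one_experiment stands in for exec_monoview/exec_multiview, and the re-raise in the non-tracking branch is an assumption):

import traceback

def do_one_experiment(**arguments):
    # Stand-in for exec_monoview / exec_multiview in the real code
    raise RuntimeError("simulated failure")

def run_experiment(arguments, track_tracebacks, traceback_outputs, results):
    try:
        results.append(do_one_experiment(**arguments))
    except BaseException:
        if track_tracebacks:
            traceback_outputs[arguments["classifier_name"]] = traceback.format_exc()
        else:
            raise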
......@@ -673,7 +678,7 @@ def exec_benchmark(nb_cores, stats_iter,
**arguments)
analyze_iterations([benchmark_results],
benchmark_arguments_dictionaries, stats_iter,
metrics, example_ids=dataset_var.example_ids,
metrics, sample_ids=dataset_var.sample_ids,
labels=dataset_var.get_labels())
results += [benchmark_results]
logging.debug("Done:\t Executing all the needed benchmarks")
......@@ -684,7 +689,7 @@ def exec_benchmark(nb_cores, stats_iter,
benchmark_arguments_dictionaries,
metrics,
directory,
dataset_var.example_ids,
dataset_var.sample_ids,
dataset_var.get_labels())
logging.debug("Done:\t Analyzing predictions")
delete(benchmark_arguments_dictionaries, nb_cores, dataset_var)
......
......@@ -3,14 +3,9 @@
get_scorer: returns a sklearn scorer for grid search
"""
import warnings
from sklearn.metrics import accuracy_score as metric
from sklearn.metrics import make_scorer
warnings.warn("the accuracy_score module is deprecated", DeprecationWarning,
stacklevel=2)
# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
......@@ -41,5 +36,6 @@ def get_scorer(**kwargs):
def get_config(**kwargs):
config_string = "Accuracy score using {}, (higher is better)".format(kwargs)
config_string = "Accuracy score using {}, (higher is better)".format(
kwargs)
return config_string
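For context, the per-metric modules touched below all share the same small surface: a scorer factory built on sklearn's make_scorer plus a get_config description. A hedged sketch of that shape, using the accuracy module's visible imports (the score and get_scorer bodies are assumptions, not the exact implementation):

from sklearn.metrics import accuracy_score as metric
from sklearn.metrics import make_scorer

def score(y_true, y_pred, **kwargs):
    # Assumed wrapper: forward keyword arguments to the sklearn metric
    return metric(y_true, y_pred, **kwargs)

def get_scorer(**kwargs):
    # make_scorer turns the metric into an object usable by grid search
    return make_scorer(metric, greater_is_better=True, **kwargs)

def get_config(**kwargs):
    config_string = "Accuracy score using {}, (higher is better)".format(
        kwargs)
    return config_string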
......@@ -3,13 +3,9 @@
get_scorer: returns a sklearn scorer for grid search
"""
import warnings
from sklearn.metrics import f1_score as metric
from sklearn.metrics import make_scorer
warnings.warn("the f1_score module is deprecated", DeprecationWarning,
stacklevel=2)
# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
......
import warnings
from sklearn.metrics import fbeta_score as metric
from sklearn.metrics import make_scorer
warnings.warn("the fbeta_score module is deprecated", DeprecationWarning,
stacklevel=2)
# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
......
import warnings
from sklearn.metrics import hamming_loss as metric
from sklearn.metrics import make_scorer
warnings.warn("the hamming_loss module is deprecated", DeprecationWarning,
stacklevel=2)
# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
......
import warnings
from sklearn.metrics import jaccard_score as metric
from sklearn.metrics import make_scorer
warnings.warn("the jaccard_similarity_score module is deprecated",
DeprecationWarning,
stacklevel=2)
# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
......
import warnings
from sklearn.metrics import log_loss as metric
from sklearn.metrics import make_scorer
warnings.warn("the log_loss module is deprecated", DeprecationWarning,
stacklevel=2)
# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
......
import warnings
from sklearn.metrics import make_scorer
from sklearn.metrics import matthews_corrcoef as metric
warnings.warn("the matthews_corrcoef module is deprecated", DeprecationWarning,
stacklevel=2)
# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
......
import warnings
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score as metric
warnings.warn("the precision_score module is deprecated", DeprecationWarning,
stacklevel=2)
# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
......
import warnings
from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score as metric
warnings.warn("the recall_score module is deprecated", DeprecationWarning,
stacklevel=2)
# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
......
import warnings
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score as metric
from sklearn.preprocessing import MultiLabelBinarizer
warnings.warn("the roc_auc_score module is deprecated", DeprecationWarning,
stacklevel=2)
# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
......
import warnings
from sklearn.metrics import make_scorer
from sklearn.metrics import zero_one_loss as metric
warnings.warn("the zero_one_loss module is deprecated", DeprecationWarning,
stacklevel=2)
# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
......
......@@ -20,7 +20,7 @@ from ..utils.multiclass import get_mc_estim
from ..utils.organization import secure_file_path
# Author-Info
__author__ = "Nikolas Huelsmann, Baptiste BAUVIN"
__author__ = "Baptiste BAUVIN"
__status__ = "Prototype" # Production, Development, Prototype
......@@ -35,8 +35,7 @@ def exec_monoview_multicore(directory, name, labels_names,
metrics=[["accuracy_score", None]], n_iter=30,
**args): # pragma: no cover
dataset_var = HDF5Dataset(
hdf5_file=h5py.File(path + name + str(dataset_file_index) + ".hdf5",
"r"))
hdf5_file=h5py.File(path + name + str(dataset_file_index) + ".hdf5", "r"))
neededViewIndex = args["view_index"]
X = dataset_var.get_v(neededViewIndex)
Y = labels
......@@ -50,7 +49,8 @@ def exec_monoview_multicore(directory, name, labels_names,
**args)
def exec_monoview(directory, X, Y, database_name, labels_names, classification_indices,
def exec_monoview(directory, X, Y, database_name, labels_names,
classification_indices,
k_folds, nb_cores, databaseType, path,
random_state, hyper_param_search="Random",
metrics={"accuracy_score*": {}}, n_iter=30, view_name="",
......@@ -71,7 +71,8 @@ def exec_monoview(directory, X, Y, database_name, labels_names, classification_i
logging.debug("Done:\t Loading data")
logging.debug(
"Info:\t Classification - Database:" + str(database_name) + " View:" + str(
"Info:\t Classification - Database:" + str(
database_name) + " View:" + str(
view_name) + " train ratio:"
+ str(learningRate) + ", CrossValidation k-folds: " + str(
k_folds.n_splits) + ", cores:"
......@@ -130,7 +131,8 @@ def exec_monoview(directory, X, Y, database_name, labels_names, classification_i
whole_duration = time.monotonic() - t_start
logging.debug(
"Info:\t Duration for training and predicting: " + str(whole_duration) + "[s]")
"Info:\t Duration for training and predicting: " + str(
whole_duration) + "[s]")
logging.debug("Start:\t Getting results")
result_analyzer = MonoviewResultAnalyzer(view_name=view_name,
......@@ -163,7 +165,8 @@ def exec_monoview(directory, X, Y, database_name, labels_names, classification_i
return MonoviewResult(view_index, classifier_name, view_name,
metrics_scores, full_pred, cl_kwargs,
classifier, X_train.shape[1],
hyper_param_duration, fit_duration, pred_duration, class_metrics_scores)
hyper_param_duration, fit_duration, pred_duration,
class_metrics_scores)
def init_constants(args, X, classification_indices, labels_names,
......@@ -223,9 +226,11 @@ def get_hyper_params(classifier_module, search_method, classifier_module_name,
def save_results(string_analysis, output_file_name, full_labels_pred,
y_train_pred,
y_train, images_analysis, y_test, confusion_matrix): # pragma: no cover
y_train, images_analysis, y_test,
confusion_matrix): # pragma: no cover
logging.info(string_analysis)
output_text_file = open(output_file_name + 'summary.txt', 'w', encoding="utf-8")
output_text_file = open(output_file_name + 'summary.txt', 'w',
encoding="utf-8")
output_text_file.write(string_analysis)
output_text_file.close()
np.savetxt(output_file_name + "confusion_matrix.csv", confusion_matrix,
......
import pickle
import os
import pickle
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import FuncFormatter
from scipy.stats import uniform, randint
from ..utils.base import BaseClassifier, ResultAnalyser
from ..utils.hyper_parameter_search import CustomRandint, CustomUniform
from ..utils.hyper_parameter_search import CustomRandint
# Author-Info
__author__ = "Baptiste Bauvin"
......@@ -53,7 +53,7 @@ def change_label_to_zero(y):
def compute_possible_combinations(params_dict):
n_possibs = np.ones(len(params_dict)) * np.inf
for value_index, value in enumerate(params_dict.values()):
if type(value) == list:
if isinstance(value, list):
n_possibs[value_index] = len(value)
elif isinstance(value, CustomRandint):
n_possibs[value_index] = value.get_nb_possibilities()
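A quick illustration of what compute_possible_combinations counts, assuming CustomRandint takes low/high bounds and that the function returns n_possibs (the return statement sits outside the visible hunk):

from summit.multiview_platform.utils.hyper_parameter_search import CustomRandint

params_dict = {
    "criterion": ["gini", "entropy"],            # a list -> its length (2)
    "max_depth": CustomRandint(low=1, high=10),  # -> get_nb_possibilities()
    "splitter": "best",                          # anything else stays at np.inf
}
n_possibs = compute_possible_combinations(params_dict)
# one entry per parameter: 2, the randint's possibility count, and inf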
......@@ -115,7 +115,8 @@ def gen_test_folds_preds(X_train, y_train, KFolds, estimator):
class BaseMonoviewClassifier(BaseClassifier):
def get_feature_importance(self, directory, base_file_name, nb_considered_feats=50):
def get_feature_importance(self, directory, base_file_name,
nb_considered_feats=50):
"""Used to generate a graph and a pickle dictionary representing
feature importances"""
feature_importances = self.feature_importances_
......@@ -129,8 +130,8 @@ class BaseMonoviewClassifier(BaseClassifier):
ax.yaxis.set_major_formatter(formatter)
plt.bar(x, feature_importances_sorted)
plt.title("Importance depending on feature")
fig.savefig(os.path.join(directory, base_file_name + "feature_importances.png")
, transparent=True)
fig.savefig(
os.path.join(directory, base_file_name + "feature_importances.png"), transparent=True)
plt.close()
features_importances_dict = dict((featureIndex, featureImportance)
for featureIndex, featureImportance in
......@@ -180,8 +181,9 @@ class MonoviewResult(object):
def get_accuracy_graph(plotted_data, classifier_name, file_name,
name="Accuracies", bounds=None, bound_name=None,
boosting_bound=None, set="train", zero_to_one=True): # pragma: no cover
if type(name) is not str:
boosting_bound=None, set="train",
zero_to_one=True): # pragma: no cover
if not isinstance(name, str):
name = " ".join(name.getConfig().strip().split(" ")[:2])
f, ax = plt.subplots(nrows=1, ncols=1)
if zero_to_one:
......@@ -211,7 +213,8 @@ class MonoviewResultAnalyzer(ResultAnalyser):
def __init__(self, view_name, classifier_name, shape, classifier,
classification_indices, k_folds, hps_method, metrics_dict,
n_iter, class_label_names, pred,
directory, base_file_name, labels, database_name, nb_cores, duration):
directory, base_file_name, labels, database_name, nb_cores,
duration):
ResultAnalyser.__init__(self, classifier, classification_indices,
k_folds, hps_method, metrics_dict, n_iter,
class_label_names, pred,
......
import time
import os
import time
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from .. import metrics
from ..monoview.monoview_utils import CustomRandint, BaseMonoviewClassifier, \
get_accuracy_graph
from ..monoview.monoview_utils import BaseMonoviewClassifier, get_accuracy_graph
from summit.multiview_platform.utils.hyper_parameter_search import CustomRandint
from ..utils.base import base_boosting_estimators
# Author-Info
......@@ -56,7 +55,6 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier):
def __init__(self, random_state=None, n_estimators=50,
base_estimator=None, base_estimator_config=None, **kwargs):
base_estimator = BaseMonoviewClassifier.get_base_estimator(self,
base_estimator,
base_estimator_config)
......@@ -128,9 +126,11 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier):
[step_pred for step_pred in self.staged_predict(X)])
return pred
def get_interpretation(self, directory, base_file_name, y_test, multi_class=False): # pragma: no cover
def get_interpretation(self, directory, base_file_name, y_test,
multi_class=False): # pragma: no cover
interpretString = ""
interpretString += self.get_feature_importance(directory, base_file_name)
interpretString += self.get_feature_importance(directory,
base_file_name)
interpretString += "\n\n Estimator error | Estimator weight\n"
interpretString += "\n".join(
[str(error) + " | " + str(weight / sum(self.estimator_weights_)) for
......@@ -140,12 +140,14 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier):
[self.plotted_metric.score(y_test, step_pred) for step_pred in
self.step_predictions])
get_accuracy_graph(step_test_metrics, "Adaboost",
os.path.join(directory, base_file_name +"test_metrics.png"),
os.path.join(directory,
base_file_name + "test_metrics.png"),
self.plotted_metric_name, set="test")
np.savetxt(os.path.join(directory, base_file_name + "test_metrics.csv"),
step_test_metrics,
delimiter=',')
np.savetxt(os.path.join(directory, base_file_name + "train_metrics.csv"),
np.savetxt(
os.path.join(directory, base_file_name + "train_metrics.csv"),
self.metrics, delimiter=',')
np.savetxt(os.path.join(directory, base_file_name + "times.csv"),
np.array([self.train_time, self.pred_time]), delimiter=',')
......
from sklearn.tree import DecisionTreeClassifier
from ..monoview.monoview_utils import CustomRandint, BaseMonoviewClassifier
from ..monoview.monoview_utils import BaseMonoviewClassifier
from summit.multiview_platform.utils.hyper_parameter_search import CustomRandint
# Author-Info
__author__ = "Baptiste Bauvin"
......@@ -32,5 +33,6 @@ class DecisionTree(DecisionTreeClassifier, BaseMonoviewClassifier):
interpretString = "First featrue : \n\t{} <= {}\n".format(
self.tree_.feature[0],
self.tree_.threshold[0])
interpretString += self.get_feature_importance(directory, base_file_name)
interpretString += self.get_feature_importance(directory,
base_file_name)
return interpretString
import time
import os
import time
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from .. import metrics
from ..monoview.monoview_utils import CustomRandint, BaseMonoviewClassifier, \
get_accuracy_graph
from ..monoview.monoview_utils import BaseMonoviewClassifier, get_accuracy_graph
from summit.multiview_platform.utils.hyper_parameter_search import CustomRandint
# Author-Info
__author__ = "Baptiste Bauvin"
......@@ -71,12 +71,14 @@ class GradientBoosting(GradientBoostingClassifier, BaseMonoviewClassifier):
[step_pred for step_pred in self.staged_predict(X)])
return pred
def get_interpretation(self, directory, base_file_name, y_test, multi_class=False):
def get_interpretation(self, directory, base_file_name, y_test,
multi_class=False):
interpretString = ""
if multi_class:
return interpretString
else:
interpretString += self.get_feature_importance(directory, base_file_name)
interpretString += self.get_feature_importance(directory,
base_file_name)
step_test_metrics = np.array(
[self.plotted_metric.score(y_test, step_pred) for step_pred in
self.step_predictions])
......@@ -86,9 +88,13 @@ class GradientBoosting(GradientBoostingClassifier, BaseMonoviewClassifier):
get_accuracy_graph(self.metrics, "AdaboostClassic",
directory + "metrics.png",
self.plotted_metric_name)
np.savetxt(os.path.join(directory, base_file_name + "test_metrics.csv"), step_test_metrics,
np.savetxt(
os.path.join(directory, base_file_name + "test_metrics.csv"),
step_test_metrics,
delimiter=',')
np.savetxt(os.path.join(directory, base_file_name + "train_metrics.csv"), self.metrics,
np.savetxt(
os.path.join(directory, base_file_name + "train_metrics.csv"),
self.metrics,
delimiter=',')
np.savetxt(os.path.join(directory, base_file_name + "times.csv"),
np.array([self.train_time, self.pred_time]),
......
from sklearn.neighbors import KNeighborsClassifier
from ..monoview.monoview_utils import CustomRandint, BaseMonoviewClassifier
from ..monoview.monoview_utils import BaseMonoviewClassifier
from summit.multiview_platform.utils.hyper_parameter_search import CustomRandint
# Author-Info
__author__ = "Baptiste Bauvin"
......