Commit 88d015a9 authored by Baptiste Bauvin

Implemented interpretation for concerned algorithms

parent 03d5ef8e
@@ -4,7 +4,7 @@ Base :
   name: ["plausible"]
   label: "_"
   type: ".hdf5"
-  views: ["300nm", "350nm"]
+  views:
   pathf: "../data/"
   nice: 0
   random_state: 42
@@ -22,8 +22,8 @@ Classification:
   nb_folds: 2
   nb_class: 2
   classes:
-  type: ["multiview",]
-  algos_monoview: ["decision_tree"]
+  type: ["multiview","monoview"]
+  algos_monoview: ["decision_tree", "adaboost"]
   algos_multiview: ["weighted_linear_early_fusion"]
   stats_iter: 2
   metrics: ["accuracy_score", "f1_score"]
...
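For readers unfamiliar with the config format: the edited blocks are plain YAML. A minimal sketch of how the changed keys parse in Python, assuming the framework loads its config with PyYAML (the inline snippet below only mirrors the edited lines and is not part of the commit); note that an empty "views:" parses to None, presumably meaning "use every available view".

# Sketch only: how the edited YAML keys parse; not part of the commit.
import yaml

snippet = """
Classification:
  type: ["multiview", "monoview"]
  algos_monoview: ["decision_tree", "adaboost"]
  algos_multiview: ["weighted_linear_early_fusion"]
"""
config = yaml.safe_load(snippet)
print(config["Classification"]["algos_monoview"])  # ['decision_tree', 'adaboost']
print(yaml.safe_load("views:"))                    # {'views': None}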
@@ -137,7 +137,7 @@ def exec_monoview(directory, X, Y, name, labels_names, classificationIndices,
     testFoldsPreds = y_train_pred
     return monoview_utils.MonoviewResult(viewIndex, classifier_name, feat, metricsScores,
                                          full_pred, clKWARGS,
-                                         y_test_multiclass_pred, testFoldsPreds)
+                                         y_test_multiclass_pred, testFoldsPreds, classifier, X_train.shape[1])
     # return viewIndex, [CL_type, feat, metricsScores, full_labels_pred, clKWARGS, y_test_multiclass_pred, testFoldsPreds]
...
@@ -160,7 +160,7 @@ class BaseMonoviewClassifier(BaseEstimator, ClassifierMixin):
         else:
             return str(self.get_params()[param_name])

-    def getFeatureImportance(self, directory, nb_considered_feats=50):
+    def get_feature_importance(self, directory, nb_considered_feats=50):
         """Used to generate a graph and a pickle dictionary representing feature importances"""
         featureImportances = self.feature_importances_
         sortedArgs = np.argsort(-featureImportances)
@@ -206,7 +206,7 @@ def percent(x, pos):
 class MonoviewResult(object):
     def __init__(self, view_index, classifier_name, view_name, metrics_scores,
                  full_labels_pred,
-                 classifier_config, y_test_multiclass_pred, test_folds_preds):
+                 classifier_config, y_test_multiclass_pred, test_folds_preds, classifier, n_features):
         self.view_index = view_index
         self.classifier_name = classifier_name
         self.view_name = view_name
@@ -215,6 +215,8 @@ class MonoviewResult(object):
         self.classifier_config = classifier_config
         self.y_test_multiclass_pred = y_test_multiclass_pred
         self.test_folds_preds = test_folds_preds
+        self.clf = classifier
+        self.n_features = n_features

     def get_classifier_name(self):
         return self.classifier_name + "-" + self.view_name
...
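Because each MonoviewResult now carries the fitted classifier and the number of features of its view, the analysis stage can read feature importances back after the benchmark has run. A minimal sketch of that idea, using a stand-in estimator (everything below is illustrative, not the project's code):

# Sketch only: a MonoviewResult-like object keeps the fitted estimator,
# so importances can be recovered downstream.
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=100, n_features=6, random_state=42)
clf = DecisionTreeClassifier(random_state=42).fit(X, y)

# Stand-ins for the new MonoviewResult arguments (classifier, X_train.shape[1])
result_clf, result_n_features = clf, X.shape[1]

if hasattr(result_clf, "feature_importances_"):
    print(result_clf.feature_importances_)   # one importance score per feature
else:
    print([0.0] * result_n_features)         # fallback: classifier exposes no importances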
@@ -135,7 +135,7 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier):
     def getInterpret(self, directory, y_test):
         interpretString = ""
-        interpretString += self.getFeatureImportance(directory)
+        interpretString += self.get_feature_importance(directory)
         interpretString += "\n\n Estimator error | Estimator weight\n"
         interpretString += "\n".join(
             [str(error) + " | " + str(weight / sum(self.estimator_weights_)) for
...
@@ -33,7 +33,7 @@ class DecisionTree(DecisionTreeClassifier, BaseMonoviewClassifier):
     def getInterpret(self, directory, y_test):
         interpretString = ""
-        interpretString += self.getFeatureImportance(directory)
+        interpretString += self.get_feature_importance(directory)
         return interpretString
...
@@ -74,7 +74,7 @@ class GradientBoosting(GradientBoostingClassifier, BaseMonoviewClassifier):
     def getInterpret(self, directory, y_test):
         interpretString = ""
-        interpretString += self.getFeatureImportance(directory)
+        interpretString += self.get_feature_importance(directory)
         step_test_metrics = np.array(
             [self.plotted_metric.score(y_test, step_pred) for step_pred in
              self.step_predictions])
...
@@ -86,7 +86,7 @@ class RandomForest(RandomForestClassifier, BaseMonoviewClassifier):
         string for interpretation interpret_string
         """
         interpret_string = ""
-        interpret_string += self.getFeatureImportance(directory)
+        interpret_string += self.get_feature_importance(directory)
         return interpret_string
...
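The four hunks above only switch the getInterpret methods to the renamed helper. For readers unfamiliar with the pattern, here is a hedged, self-contained sketch of the kind of interpretation string an Adaboost-style classifier can assemble from a fitted scikit-learn estimator; the attribute names feature_importances_, estimator_errors_ and estimator_weights_ are scikit-learn's, everything else is illustrative and not the project's exact implementation:

# Sketch only: an Adaboost-style interpretation string.
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier

X, y = make_classification(n_samples=200, n_features=5, random_state=42)
clf = AdaBoostClassifier(n_estimators=5, random_state=42).fit(X, y)

interpret_string = "Feature importances : " + str(clf.feature_importances_)
interpret_string += "\n\n Estimator error | Estimator weight\n"
interpret_string += "\n".join(
    str(error) + " | " + str(weight / sum(clf.estimator_weights_))
    for error, weight in zip(clf.estimator_errors_, clf.estimator_weights_))
print(interpret_string)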
@@ -10,9 +10,11 @@ from matplotlib.patches import Patch
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import plotly

 # Import own Modules
-from . import metrics
+from .monoview.monoview_utils import MonoviewResult
+from .multiview.multiview_utils import MultiviewResult

 # Author-Info
 __author__ = "Baptiste Bauvin"
@@ -181,7 +183,6 @@ def plot_2d(data, classifiers_names, nbClassifiers, nbExamples,
     plt.close()
     ### The following part is used to generate an interactive graph.
     if use_plotly:
-        import plotly
         hover_text = [["Failed " + str(stats_iter - data[i, j]) + " time(s)"
                        for j in range(data.shape[1])]
                       for i in range(data.shape[0])]
@@ -559,6 +560,41 @@ def publishExampleErrors(example_errors, directory, databaseName, labels_names,
     logging.debug("Done:\t Biclass Label analysis figures generation")


+def publish_feature_importances(feature_importances, directory, database_name, labels_names, feature_stds=None):
+    for view_name, feature_importance in feature_importances.items():
+        file_name = directory + time.strftime(
+            "%Y_%m_%d-%H_%M_%S") + "-" + database_name + "-" + "_vs_".join(
+            labels_names) + "-" + view_name + "-feature_importances"
+        if feature_stds is not None:
+            feature_std = feature_stds[view_name]
+            feature_std.to_csv(file_name + "_dataframe_stds.csv")
+        else:
+            feature_std = pd.DataFrame(data=np.zeros(feature_importance.shape),
+                                       index=feature_importance.index,
+                                       columns=feature_importance.columns)
+        feature_importance.to_csv(file_name + "_dataframe.csv")
+        hover_text = [["-Feature :" + str(feature_name) +
+                       "<br>-Classifier : " + classifier_name +
+                       "<br>-Importance : " + str(feature_importance.loc[feature_name][classifier_name]) +
+                       "<br>-STD : " + str(feature_std.loc[feature_name][classifier_name])
+                       for classifier_name in list(feature_importance.columns)]
+                      for feature_name in list(feature_importance.index)]
+        fig = plotly.graph_objs.Figure(data=plotly.graph_objs.Heatmap(
+            x=list(feature_importance.columns),
+            y=list(feature_importance.index),
+            z=feature_importance.values,
+            text=hover_text,
+            hoverinfo=["text"],
+            colorscale="Greys",
+            reversescale=False))
+        fig.update_layout(
+            xaxis={"showgrid": False, "showticklabels": False, "ticks": ''},
+            yaxis={"showgrid": False, "showticklabels": False, "ticks": ''})
+        plotly.offline.plot(fig, filename=file_name + ".html", auto_open=False)
+        del fig
+
+
 def get_arguments(benchmark_argument_dictionaries, flag):
     r"""Used to get the arguments passed to the benchmark executing function corresponding to the flag of a
     biclass experimentation.
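For reference, a minimal standalone sketch of the same plotly heatmap idea used by the publish_feature_importances function added above, on a toy importance DataFrame; the data, feature names, column names and output file name below are invented for the example:

# Toy reproduction of the feature-importance heatmap; not the project's code.
import numpy as np
import pandas as pd
import plotly

importances = pd.DataFrame(np.random.rand(4, 2),
                           index=["feat_0", "feat_1", "feat_2", "feat_3"],
                           columns=["decision_tree", "adaboost"])
fig = plotly.graph_objs.Figure(data=plotly.graph_objs.Heatmap(
    x=list(importances.columns),   # one column per classifier
    y=list(importances.index),     # one row per feature
    z=importances.values,
    colorscale="Greys"))
plotly.offline.plot(fig, filename="toy-feature_importances.html", auto_open=False)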
@@ -580,6 +616,32 @@ def get_arguments(benchmark_argument_dictionaries, flag):
     return benchmarkArgumentDictionary


+def get_feature_importances(result, feature_names=None):
+    r"""Extracts the feature importance from the monoview results and stores them in a dictionnary :
+    feature_importance[view_name] is a pandas.DataFrame of size n_feature*n_clf
+    containing a score of importance for each feature.
+
+    Parameters
+    ----------
+    result : list of results
+
+    Returns
+    -------
+    feature_importances : dict of pd.DataFrame
+        The dictionary containing all the feature importance for each view as pandas DataFrames
+    """
+    feature_importances = {}
+    for classifier_result in result:
+        if isinstance(classifier_result, MonoviewResult):
+            if classifier_result.view_name not in feature_importances:
+                feature_importances[classifier_result.view_name] = pd.DataFrame(index=feature_names)
+            if hasattr(classifier_result.clf, 'feature_importances_'):
+                feature_importances[classifier_result.view_name][classifier_result.classifier_name] = classifier_result.clf.feature_importances_
+            else:
+                feature_importances[classifier_result.view_name][classifier_result.classifier_name] = np.zeros(classifier_result.n_features)
+    return feature_importances
+
+
 def analyze_biclass(results, benchmark_argument_dictionaries, stats_iter, metrics, example_ids):
     r"""Used to extract and format the results of the different biclass experimentations performed.
@@ -616,6 +678,7 @@ def analyze_biclass(results, benchmark_argument_dictionaries, stats_iter, metric
         metrics_scores = get_metrics_scores_biclass(metrics, result)
         example_errors = get_example_errors_biclass(arguments["labels"], result)
+        feature_importances = get_feature_importances(result)

         directory = arguments["directory"]
@@ -627,14 +690,18 @@ def analyze_biclass(results, benchmark_argument_dictionaries, stats_iter, metric
                               labels_names)
         publishExampleErrors(example_errors, directory, database_name,
                              labels_names, example_ids)
+        publish_feature_importances(feature_importances, directory, database_name, labels_names)

         if not str(classifierPositive) + str(classifierNegative) in biclass_results:
             biclass_results[str(classifierPositive) + str(classifierNegative)] = {}
             biclass_results[str(classifierPositive) + str(classifierNegative)][
                 "metrics_scores"] = [i for i in range(stats_iter)]
             biclass_results[str(classifierPositive) + str(classifierNegative)][
                 "example_errors"] = [i for i in range(stats_iter)]
+            biclass_results[str(classifierPositive) + str(classifierNegative)][
+                "feature_importances"] = [i for i in range(stats_iter)]
         biclass_results[str(classifierPositive) + str(classifierNegative)]["metrics_scores"][iteridex] = metrics_scores
         biclass_results[str(classifierPositive) + str(classifierNegative)]["example_errors"][iteridex] = example_errors
+        biclass_results[str(classifierPositive) + str(classifierNegative)]["feature_importances"][iteridex] = feature_importances

     logging.debug("Done:\t Analzing all biclass resuls")
     return results, biclass_results
@@ -981,25 +1048,41 @@ def format_previous_results(biclass_results):
     """
     metrics_analysis = dict((key, {}) for key in biclass_results.keys())
     error_analysis = dict((key, {}) for key in biclass_results.keys())
+    feature_importances_analysis = dict((key, {}) for key in biclass_results.keys())
+    feature_importances_stds = dict((key, {}) for key in biclass_results.keys())
     for label_combination, biclass_result in biclass_results.items():
-        concat_dict = {}
+        metric_concat_dict = {}
         for iter_index, metrics_score in enumerate(
                 biclass_result["metrics_scores"]):
             for metric_name, dataframe in metrics_score.items():
-                if metric_name not in concat_dict:
-                    concat_dict[metric_name] = dataframe
+                if metric_name not in metric_concat_dict:
+                    metric_concat_dict[metric_name] = dataframe
                 else:
-                    concat_dict[metric_name] = pd.concat(
-                        [concat_dict[metric_name], dataframe])
-        for metric_name, dataframe in concat_dict.items():
+                    metric_concat_dict[metric_name] = pd.concat(
+                        [metric_concat_dict[metric_name], dataframe])
+        for metric_name, dataframe in metric_concat_dict.items():
             metrics_analysis[label_combination][metric_name] = {}
             metrics_analysis[label_combination][metric_name][
                 "mean"] = dataframe.groupby(dataframe.index).mean()
             metrics_analysis[label_combination][metric_name][
                 "std"] = dataframe.groupby(dataframe.index).std(ddof=0)
+
+        importance_concat_dict = {}
+        for iter_index, view_feature_importances in enumerate(biclass_result["feature_importances"]):
+            for view_name, feature_importances in view_feature_importances.items():
+                if view_name not in importance_concat_dict:
+                    importance_concat_dict[view_name] = feature_importances
+                else:
+                    importance_concat_dict[view_name] = pd.concat(
+                        [importance_concat_dict[view_name], feature_importances])
+
+        for view_name, dataframe in importance_concat_dict.items():
+            feature_importances_analysis[label_combination][view_name] = dataframe.groupby(dataframe.index).mean()
+            feature_importances_stds[label_combination][view_name] = dataframe.groupby(dataframe.index).std(ddof=0)
+
         added_example_errors = {}
         for example_errors in biclass_result["example_errors"]:
             for classifier_name, errors in example_errors.items():
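The new per-view aggregation follows the same concat-then-groupby pattern already used for metric scores: stack the per-iteration DataFrames, then take the mean and population standard deviation per feature. A small sketch of that pattern on toy data (the values and names are invented):

# Sketch of the concat + groupby(index) aggregation across stats_iter iterations.
import pandas as pd

iter_1 = pd.DataFrame({"decision_tree": [0.6, 0.4]}, index=["f0", "f1"])
iter_2 = pd.DataFrame({"decision_tree": [0.8, 0.2]}, index=["f0", "f1"])

concatenated = pd.concat([iter_1, iter_2])
print(concatenated.groupby(concatenated.index).mean())       # per-feature mean over iterations
print(concatenated.groupby(concatenated.index).std(ddof=0))  # per-feature population std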
@@ -1008,13 +1091,13 @@ def format_previous_results(biclass_results):
             else:
                 added_example_errors[classifier_name] += errors
         error_analysis[label_combination] = added_example_errors
-    return metrics_analysis, error_analysis
+    return metrics_analysis, error_analysis, feature_importances_analysis, feature_importances_stds


 def analyzebiclass_iter(biclass_results, stats_iter, directory,
                         labels_dictionary, data_base_name, example_ids):
     """Used to format the results in order to plot the mean results on the iterations"""
-    metrics_analysis, error_analysis = format_previous_results(biclass_results)
+    metrics_analysis, error_analysis, feature_improtances, feature_improtances_stds = format_previous_results(biclass_results)

     results = publish_iter_biclass_metrics_scores(metrics_analysis,
                                                   directory, labels_dictionary,
@@ -1022,6 +1105,12 @@ def analyzebiclass_iter(biclass_results, stats_iter, directory,
     publish_iter_biclass_example_errors(error_analysis, directory,
                                         labels_dictionary,
                                         stats_iter, example_ids)
+    for label_combination, feature_improtances_view in feature_improtances.items():
+        labels = [labels_dictionary[
+                      int(label_combination[0])], labels_dictionary[
+                      int(label_combination[1])]]
+        publish_feature_importances(feature_improtances_view, os.path.join(directory, "-vs-".join(labels) + "/"),
+                                    data_base_name, labels, feature_improtances_stds[label_combination])
     return results


 def analyze_iter_multiclass(multiclass_results, directory, stats_iter, metrics,
...