diff --git a/config_files/config.yml b/config_files/config.yml
index 3643419bd75fe149d106a5749b5e37fd190f2616..ff7c3b52f7c5d398c499f6d115f95332d9ab77ed 100644
--- a/config_files/config.yml
+++ b/config_files/config.yml
@@ -52,7 +52,9 @@ algos_multiview: ["all"]
 # split, to have more statistically significant results
 stats_iter: 1
 # The metrics that will be use din the result analysis
-metrics: ["accuracy_score", "f1_score"]
+metrics:
+  "accuracy_score":
+  "f1_score":
 # The metric that will be used in the hyper-parameter optimization process
 metric_princ: "f1_score"
 # The type of hyper-parameter optimization method
diff --git a/config_files/config_test.yml b/config_files/config_test.yml
index af27496b79071a59abdbe0dc7df450d8a35b07e5..2b4c2a2695566f1221546c5ed6ad5cc415c5a166 100644
--- a/config_files/config_test.yml
+++ b/config_files/config_test.yml
@@ -7,8 +7,8 @@ views:
 pathf: "examples/data/"
 nice: 0
 random_state: 42
-nb_cores: 4
-full: False
+nb_cores: 1
+full: True
 debug: True
 add_noise: False
 noise_std: 0.0
@@ -21,7 +21,7 @@ split: 0.8
 nb_folds: 2
 nb_class: 3
 classes:
-type: ["multiview"]
+type: ["monoview", "multiview"]
 algos_monoview: ["decision_tree", ]
 algos_multiview: ["early_fusion_adaboost"]
 stats_iter: 3
@@ -29,7 +29,7 @@ metrics:
   accuracy_score: {}
   f1_score: {}
 metric_princ: "accuracy_score"
-hps_type: "Random"
+hps_type: "None"
 hps_args:
   n_iter: 10
   equivalent_draws: False
@@ -39,6 +39,8 @@ hps_args:
       low: 1
       high: 10
 
+decision_tree:
+  max_depth: 1
 
 weighted_linear_early_fusion:
   view_weights: null
diff --git a/summit/multiview_platform/exec_classif.py b/summit/multiview_platform/exec_classif.py
index 03ef02a44f3a1a40cf60f2ef835e12b467eb7a97..00aff20e7f1d6828fe3187a3be7e7c646b291965 100644
--- a/summit/multiview_platform/exec_classif.py
+++ b/summit/multiview_platform/exec_classif.py
@@ -465,7 +465,7 @@ def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None,
                              nb_cores, args["file_type"], args["pathf"], random_state,
 >>>>>>> develop
                              hyper_param_search=hyper_param_search,
-                             metrics=metrics,
+                             metrics=metrics, feature_ids=dataset_var.feature_ids[arguments["view_index"]],
                              **arguments)]
     except BaseException:
         if track_tracebacks:
@@ -560,7 +560,9 @@ def exec_benchmark(nb_cores, stats_iter,
         analyze_iterations([benchmark_results],
                            benchmark_arguments_dictionaries, stats_iter,
                            metrics, sample_ids=dataset_var.sample_ids,
-                           labels=dataset_var.get_labels())
+                           labels=dataset_var.get_labels(),
+                           feature_ids=dataset_var.feature_ids,
+                           view_names=dataset_var.view_names)
         results += [benchmark_results]
 
     logging.info("Done:\t Executing all the needed benchmarks")
@@ -571,7 +573,8 @@ def exec_benchmark(nb_cores, stats_iter,
                                       metrics,
                                       directory,
                                       dataset_var.sample_ids,
-                                      dataset_var.get_labels())
+                                      dataset_var.get_labels(),dataset_var.feature_ids,
+                                      dataset_var.view_names)
     logging.info("Done:\t Analyzing predictions")
     return results_mean_stds
 
diff --git a/summit/multiview_platform/monoview/exec_classif_mono_view.py b/summit/multiview_platform/monoview/exec_classif_mono_view.py
index 98b65e3dbfde3c8cbbbb51e6096065eb2ef0214e..874a2bfce540665c60fa3623c9e903f24f14e1c7 100644
--- a/summit/multiview_platform/monoview/exec_classif_mono_view.py
+++ b/summit/multiview_platform/monoview/exec_classif_mono_view.py
@@ -54,7 +54,7 @@ def exec_monoview(directory, X, Y, database_name, labels_names,
                   k_folds, nb_cores, databaseType, path,
                   random_state, hyper_param_search="Random",
                   metrics={"accuracy_score*": {}}, n_iter=30, view_name="",
-                  hps_kwargs={}, **args):
+                  hps_kwargs={}, feature_ids=[], **args):
     logging.info("Start:\t Loading data")
     kwargs, \
     t_start, \
@@ -151,7 +151,8 @@ def exec_monoview(directory, X, Y, database_name, labels_names,
                                          labels=Y,
                                          database_name=database_name,
                                          nb_cores=nb_cores,
-                                         duration=whole_duration)
+                                         duration=whole_duration,
+                                         feature_ids=feature_ids)
     string_analysis, images_analysis, metrics_scores, class_metrics_scores, \
         confusion_matrix = result_analyzer.analyze()
     logging.info("Done:\t Getting results")
diff --git a/summit/multiview_platform/monoview/monoview_utils.py b/summit/multiview_platform/monoview/monoview_utils.py
index 2fbf53d3e077e013382663ceaa2af5ce57024531..2d227fb93eb1635ef75f1488e286b54e2066989d 100644
--- a/summit/multiview_platform/monoview/monoview_utils.py
+++ b/summit/multiview_platform/monoview/monoview_utils.py
@@ -76,7 +76,20 @@ def gen_test_folds_preds(X_train, y_train, KFolds, estimator):
 
 class BaseMonoviewClassifier(BaseClassifier):
 
-    def get_feature_importance(self, directory, base_file_name,
+    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
+                           multi_class=False):
+        """
+        Base method that returns an empty string if there is not interpretation
+        method in the classifier's module
+        """
+        if hasattr(self, "feature_importances_"):
+            return self.get_feature_importance(directory, base_file_name,
+                                               feature_ids,)
+        else:
+            return ""
+
+
+    def get_feature_importance(self, directory, base_file_name, feature_ids,
                                nb_considered_feats=50):
         """Used to generate a graph and a pickle dictionary representing
         feature importances"""
@@ -85,28 +98,19 @@ class BaseMonoviewClassifier(BaseClassifier):
         feature_importances_sorted = feature_importances[sorted_args][
                                      :nb_considered_feats]
         feature_indices_sorted = sorted_args[:nb_considered_feats]
-        fig, ax = plt.subplots()
-        x = np.arange(len(feature_indices_sorted))
-        formatter = FuncFormatter(percent)
-        ax.yaxis.set_major_formatter(formatter)
-        plt.bar(x, feature_importances_sorted)
-        plt.title("Importance depending on feature")
-        fig.savefig(
-            os.path.join(directory, base_file_name + "feature_importances.png"), transparent=True)
-        plt.close()
-        features_importances_dict = dict((featureIndex, featureImportance)
-                                         for featureIndex, featureImportance in
+        features_importances_dict = dict((feature_ids[feature_index], feature_importance)
+                                         for feature_index, feature_importance in
                                          enumerate(feature_importances)
-                                         if featureImportance != 0)
+                                         if feature_importance != 0)
         with open(directory + 'feature_importances.pickle', 'wb') as handle:
             pickle.dump(features_importances_dict, handle)
         interpret_string = "Feature importances : \n"
-        for featureIndex, featureImportance in zip(feature_indices_sorted,
+        for feature_index, feature_importance in zip(feature_indices_sorted,
                                                     feature_importances_sorted):
-            if featureImportance > 0:
-                interpret_string += "- Feature index : " + str(featureIndex) + \
+            if feature_importance > 0:
+                interpret_string += "- Feature : " + feature_ids[feature_index] + \
                                     ", feature importance : " + str(
-                    featureImportance) + "\n"
+                    feature_importance) + "\n"
         return interpret_string
 
     def get_name_for_fusion(self):
@@ -175,12 +179,12 @@ class MonoviewResultAnalyzer(ResultAnalyser):
                  classification_indices, k_folds, hps_method, metrics_dict,
                  n_iter, class_label_names, pred, directory, base_file_name,
                  labels, database_name, nb_cores,
-                 duration):
+                 duration, feature_ids):
         ResultAnalyser.__init__(self, classifier, classification_indices,
                                 k_folds, hps_method, metrics_dict, n_iter,
                                 class_label_names, pred, directory,
                                 base_file_name, labels,
-                                database_name, nb_cores, duration)
+                                database_name, nb_cores, duration, feature_ids)
         self.view_name = view_name
         self.classifier_name = classifier_name
         self.shape = shape
diff --git a/summit/multiview_platform/monoview_classifiers/decision_tree.py b/summit/multiview_platform/monoview_classifiers/decision_tree.py
index 6b309dc3e11f09e165bab1d5c30d9082681de300..33b99090c3149942412296c0bc880d56e30202a6 100644
--- a/summit/multiview_platform/monoview_classifiers/decision_tree.py
+++ b/summit/multiview_platform/monoview_classifiers/decision_tree.py
@@ -34,11 +34,12 @@ class DecisionTree(DecisionTreeClassifier, BaseMonoviewClassifier):
                          ["best", "random"], [random_state]]
         self.weird_strings = {}
 
-    def get_interpretation(self, directory, base_file_name, y_test,
+    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
                            multiclass=False):
-        interpretString = "First featrue : \n\t{} <= {}\n".format(
-            self.tree_.feature[0],
+        interpretString = "First feature : \n\t{} <= {}\n".format(
+            feature_ids[self.tree_.feature[0]],
             self.tree_.threshold[0])
         interpretString += self.get_feature_importance(directory,
-                                                       base_file_name)
+                                                       base_file_name,
+                                                       feature_ids)
         return interpretString
diff --git a/summit/multiview_platform/multiview/exec_multiview.py b/summit/multiview_platform/multiview/exec_multiview.py
index a594624c8f5aab3e332b6ef7d20776d91308500c..254b11b3a676e0a82e3c7ef6abb6a4dc0a181753 100644
--- a/summit/multiview_platform/multiview/exec_multiview.py
+++ b/summit/multiview_platform/multiview/exec_multiview.py
@@ -61,7 +61,7 @@ def init_constants(kwargs, classification_indices, metrics,
 
     for view_index, view_name in zip(views_indices, views):
         logging.info("Info:\t Shape of " + str(view_name) + " :" + str(
-            dataset_var.get_shape()))
+            dataset_var.get_shape(view_index)))
     labels = dataset_var.get_labels()
     directory = os.path.join(directory, classifier_name)
     base_file_name = classifier_name + "-" + dataset_var.get_name() + "-"
@@ -266,6 +266,8 @@ def exec_multiview(directory, dataset_var, name, classification_indices,
 
     logging.info("Start:\t Optimizing hyperparameters")
     hps_beg = time.monotonic()
+
+    print(dataset_var.view_dict)
     if hps_method != "None":
         hps_method_class = getattr(hyper_parameter_search, hps_method)
         estimator = getattr(classifier_module, classifier_name)(
@@ -294,6 +296,7 @@ def exec_multiview(directory, dataset_var, name, classification_indices,
     classifier.fit(dataset_var, dataset_var.get_labels(),
                    train_indices=learning_indices,
                    view_indices=views_indices)
+    print("pou")
     fit_duration = time.monotonic() - fit_beg
     logging.info("Done:\t Fitting classifier")
 
@@ -332,7 +335,8 @@ def exec_multiview(directory, dataset_var, name, classification_indices,
                                           labels=labels,
                                           database_name=dataset_var.get_name(),
                                           nb_cores=nb_cores,
-                                          duration=whole_duration)
+                                          duration=whole_duration,
+                                          feature_ids=dataset_var.feature_ids)
     string_analysis, images_analysis, metrics_scores, class_metrics_scores, \
         confusion_matrix = result_analyzer.analyze()
     logging.info("Done:\t Result Analysis for " + cl_type)
@@ -344,4 +348,4 @@ def exec_multiview(directory, dataset_var, name, classification_indices,
 
     return MultiviewResult(cl_type, classifier_config, metrics_scores,
                            full_pred, hps_duration, fit_duration,
-                           pred_duration, class_metrics_scores)
+                           pred_duration, class_metrics_scores, classifier)
diff --git a/summit/multiview_platform/multiview/multiview_utils.py b/summit/multiview_platform/multiview/multiview_utils.py
index 6a26aaa37623b1b1f265e10af39ecdc3bef513fa..6f0518a11304a619bf4036278d7136a95ccff7bb 100644
--- a/summit/multiview_platform/multiview/multiview_utils.py
+++ b/summit/multiview_platform/multiview/multiview_utils.py
@@ -133,7 +133,7 @@ def get_monoview_classifier(classifier_name, multiclass=False):
 class MultiviewResult(object):
     def __init__(self, classifier_name, classifier_config,
                  metrics_scores, full_labels, hps_duration, fit_duration,
-                 pred_duration, class_metric_scores):
+                 pred_duration, class_metric_scores, clf):
         self.classifier_name = classifier_name
         self.classifier_config = classifier_config
         self.metrics_scores = metrics_scores
@@ -142,6 +142,7 @@ class MultiviewResult(object):
         self.fit_duration = fit_duration
         self.pred_duration = pred_duration
         self.class_metric_scores = class_metric_scores
+        self.clf=clf
 
     def get_classifier_name(self):
         try:
@@ -160,7 +161,7 @@ class MultiviewResultAnalyzer(ResultAnalyser):
     def __init__(self, view_names, classifier, classification_indices, k_folds,
                  hps_method, metrics_dict, n_iter, class_label_names, pred,
                  directory, base_file_name, labels,
-                 database_name, nb_cores, duration):
+                 database_name, nb_cores, duration, feature_ids):
         if hps_method.endswith("equiv"):
             n_iter = n_iter * len(view_names)
         ResultAnalyser.__init__(self, classifier, classification_indices,
@@ -169,7 +170,7 @@ class MultiviewResultAnalyzer(ResultAnalyser):
                                 class_label_names, pred,
                                 directory, base_file_name, labels,
                                 database_name,
-                                nb_cores, duration)
+                                nb_cores, duration, feature_ids)
         self.classifier_name = classifier.short_name
         self.view_names = view_names
 
diff --git a/summit/multiview_platform/multiview_classifiers/additions/early_fusion_from_monoview.py b/summit/multiview_platform/multiview_classifiers/additions/early_fusion_from_monoview.py
index 582466b018d351cd3ed88dce8884972bff216e85..8b4f0b01ccf36a48bcf5a91d5beb4191f6a3df9c 100644
--- a/summit/multiview_platform/multiview_classifiers/additions/early_fusion_from_monoview.py
+++ b/summit/multiview_platform/multiview_classifiers/additions/early_fusion_from_monoview.py
@@ -30,8 +30,11 @@ class BaseEarlyFusion(BaseMultiviewClassifier):
         return monoview_params
 
     def fit(self, X, y, train_indices=None, view_indices=None):
-        train_indices, X = self.transform_data_to_monoview(X, train_indices,
-                                                           view_indices)
+        self.view_dict = X.view_dict
+        train_indices, self.view_indices = get_samples_views_indices(X,
+                                                                     train_indices,
+                                                                     view_indices)
+        train_indices, X = self.transform_data_to_monoview(X, train_indices,)
         self.used_views = view_indices
         if np.unique(y[train_indices]).shape[0] > 2 and \
                 not (isinstance(self.monoview_classifier, MultiClassWrapper)):
@@ -40,22 +43,22 @@ class BaseEarlyFusion(BaseMultiviewClassifier):
                 multiview=False,
                 y=y[train_indices])
         self.monoview_classifier.fit(X, y[train_indices])
+        if hasattr(self.monoview_classifier, "feature_importances_"):
+            self.get_feature_importances()
         return self
 
     def predict(self, X, sample_indices=None, view_indices=None):
-        _, X = self.transform_data_to_monoview(X, sample_indices, view_indices)
+        _, X = self.transform_data_to_monoview(X, sample_indices)
         self._check_views(self.view_indices)
        predicted_labels = self.monoview_classifier.predict(X)
         return predicted_labels
 
-    def transform_data_to_monoview(self, dataset, sample_indices,
-                                   view_indices):
+    def get_feature_importances(self):
+        self.feature_importances_ = self.monoview_classifier.feature_importances_
+
+    def transform_data_to_monoview(self, dataset, sample_indices):
         """Here, we extract the data from the HDF5 dataset file and store all
         the concatenated views in one variable"""
-        sample_indices, self.view_indices = get_samples_views_indices(dataset,
-                                                                      sample_indices,
-                                                                      view_indices)
-
         X = self.hdf5_to_monoview(dataset, sample_indices)
         return sample_indices, X
 
@@ -65,4 +68,8 @@ class BaseEarlyFusion(BaseMultiviewClassifier):
             [dataset.get_v(view_idx, samples)
              for index, view_idx in enumerate(self.view_indices)],
             axis=1)
+        self.feature_ids = []
+        for view_idx in self.view_indices:
+            view_name = dataset.view_names[view_idx]
+            self.feature_ids += [view_name+"-"+feat_id for feat_id in dataset.feature_ids[view_idx]]
         return monoview_data
\ No newline at end of file
diff --git a/summit/multiview_platform/result_analysis/error_analysis.py b/summit/multiview_platform/result_analysis/error_analysis.py
index 90d6b69756560a450b304db55abba3cea496f888..aeec9e1e237d5a3e898930c7ff0025b1898c5f5c 100644
--- a/summit/multiview_platform/result_analysis/error_analysis.py
+++ b/summit/multiview_platform/result_analysis/error_analysis.py
@@ -46,7 +46,7 @@ def get_sample_errors(groud_truth, results):
 
 
 def publish_sample_errors(sample_errors, directory, database_name,
-                          labels_names, sample_ids, labels):  # pragma: no cover
+                          label_names, sample_ids, labels):  # pragma: no cover
     logging.info("Start:\t Label analysis figure generation")
     base_file_name = os.path.join(directory, database_name + "-")
 
@@ -59,7 +59,7 @@ def publish_sample_errors(sample_errors, directory, database_name,
                delimiter=",")
     plot_2d(data_2d, classifiers_names, nb_classifiers,
             base_file_name, database_name,
-            sample_ids=sample_ids, labels=labels)
+            sample_ids=sample_ids, labels=labels, label_names=label_names)
     plot_errors_bar(error_on_samples, nb_samples,
                     base_file_name, database_name,
                     sample_ids=sample_ids)
@@ -69,7 +69,7 @@ def publish_sample_errors(sample_errors, directory, database_name,
 
 
 def publish_all_sample_errors(iter_results, directory,
                               stats_iter,
-                              sample_ids, labels, data_base_name):  # pragma: no cover
+                              sample_ids, labels, data_base_name, label_names):  # pragma: no cover
     logging.info(
         "Start:\t Global label analysis figure generation")
@@ -83,7 +83,7 @@ def publish_all_sample_errors(iter_results, directory,
     plot_2d(data, classifier_names, nb_classifiers,
             os.path.join(directory, ""), data_base_name, stats_iter=stats_iter,
-            sample_ids=sample_ids, labels=labels)
+            sample_ids=sample_ids, labels=labels, label_names=label_names)
     plot_errors_bar(error_on_samples, nb_samples,
                     os.path.join(directory, ""), data_base_name,
                     sample_ids=sample_ids)
 
@@ -152,7 +152,7 @@ def gen_error_data_glob(iter_results, stats_iter):
 
 
 def plot_2d(data, classifiers_names, nb_classifiers, file_name, dataset_name, labels=None,
-            stats_iter=1, use_plotly=True, sample_ids=None):  # pragma: no cover
+            stats_iter=1, use_plotly=True, sample_ids=None, label_names=None):  # pragma: no cover
     r"""Used to generate a 2D plot of the errors.
 
     Parameters
@@ -178,6 +178,8 @@ def plot_2d(data, classifiers_names, nb_classifiers, file_name, dataset_name, la
     Returns
     -------
     """
+    if label_names is None:
+        label_names = [str(lab) for lab in np.sort(np.unique(labels))]
     fig, ax = plt.subplots(nrows=1, ncols=1, )
     label_index_list = np.concatenate([np.where(labels == i)[0] for i in np.unique(
         labels)])
@@ -202,7 +204,7 @@ def plot_2d(data, classifiers_names, nb_classifiers, file_name, dataset_name, la
         hover_text = [[sample_ids[sample_index] + " failed " + str(
             stats_iter - data[
                 sample_index, classifier_index]) + " time(s), labelled " + str(
-            labels[sample_index])
+            label_names[int(labels[sample_index])])
                        for classifier_index in range(data.shape[1])]
                       for sample_index in range(data.shape[0])]
         fig = plotly.graph_objs.Figure()
@@ -258,9 +260,11 @@ def plot_errors_bar(error_on_samples, nb_samples, file_name, dataset_name,
 
     fig = plotly.graph_objs.Figure(
         [plotly.graph_objs.Bar(x=sample_ids, y=1 - error_on_samples)])
     fig.update_layout(paper_bgcolor='rgba(0,0,0,0)',
-                      plot_bgcolor='rgba(0,0,0,0)')
-    title = "Dataset : {} <br> Error % for each sample <br> Generated on <a href='https://baptiste.bauvin.pages.lis-lab.fr/summit'>SuMMIT</a>.".format(
-        dataset_name)
+                      plot_bgcolor='rgba(0,0,0,0)',
+                      title="Dataset : {} <br> Error % for each sample <br> Generated on <a href='https://baptiste.bauvin.pages.lis-lab.fr/summit'>SuMMIT</a>.".format(
+                          dataset_name)
+                      )
+
     plotly.offline.plot(fig, filename=file_name + "error_analysis_bar.html",
                         auto_open=False)
diff --git a/summit/multiview_platform/result_analysis/execution.py b/summit/multiview_platform/result_analysis/execution.py
index 20ff793b2279430d5b4918c38320d1c6b4861654..279891e7d30ea3fa2262e25eea013a883d61db52 100644
--- a/summit/multiview_platform/result_analysis/execution.py
+++ b/summit/multiview_platform/result_analysis/execution.py
@@ -13,13 +13,14 @@ from .tracebacks_analysis import save_failed, publish_tracebacks
 
 
 def analyze(results, stats_iter, benchmark_argument_dictionaries,
-            metrics, directory, sample_ids, labels):  # pragma: no cover
+            metrics, directory, sample_ids, labels, feature_ids,
+            view_names):  # pragma: no cover
     """Used to analyze the results of the previous benchmarks"""
     data_base_name = benchmark_argument_dictionaries[0]["args"]["name"]
 
     results_means_std, iter_results, flagged_failed, label_names = analyze_iterations(
         results, benchmark_argument_dictionaries,
-        stats_iter, metrics, sample_ids, labels)
+        stats_iter, metrics, sample_ids, labels, feature_ids, view_names)
     if flagged_failed:
         save_failed(flagged_failed, directory)
 
@@ -31,7 +32,7 @@ def analyze(results, stats_iter, benchmark_argument_dictionaries,
 
 
 def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter,
-                       metrics, sample_ids, labels):
+                       metrics, sample_ids, labels, feature_ids, view_names):
     r"""Used to extract and format the results of the different
     experimentations performed.
 
@@ -81,7 +82,9 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter,
                                                     result,
                                                     labels_names)
         sample_errors = get_sample_errors(labels, result)
-        feature_importances = get_feature_importances(result)
+        feature_importances = get_feature_importances(result,
+                                                      feature_ids=feature_ids,
+                                                      view_names=view_names)
         durations = get_duration(result)
 
         directory = arguments["directory"]
@@ -124,7 +127,7 @@ def analyze_all(iter_results, stats_iter, directory, data_base_name,
                            data_base_name, stats_iter,
                            label_names)
     publish_all_sample_errors(error_analysis, directory, stats_iter,
-                              sample_ids, labels, data_base_name)
+                              sample_ids, labels, data_base_name, label_names)
     publish_feature_importances(feature_importances, directory,
                                 data_base_name, feature_importances_stds)
     plot_durations(duration_means, directory, data_base_name, duration_stds)
diff --git a/summit/multiview_platform/result_analysis/feature_importances.py b/summit/multiview_platform/result_analysis/feature_importances.py
index 86c31366f6b59a8ade8ec547cdd47b6b6ebb1055..e7fec1438bd97aafb1fcabeac4b207cc6cc4fca4 100644
--- a/summit/multiview_platform/result_analysis/feature_importances.py
+++ b/summit/multiview_platform/result_analysis/feature_importances.py
@@ -7,7 +7,7 @@ import plotly
 from ..monoview.monoview_utils import MonoviewResult
 
 
-def get_feature_importances(result, feature_names=None):
+def get_feature_importances(result, feature_ids=None, view_names=None):
     r"""Extracts the feature importance from the monoview results and stores
     them in a dictionnary :
     feature_importance[view_name] is a pandas.DataFrame of size n_feature*n_clf
@@ -28,7 +28,7 @@ def get_feature_importances(result, feature_names=None):
         if isinstance(classifier_result, MonoviewResult):
             if classifier_result.view_name not in feature_importances:
                 feature_importances[classifier_result.view_name] = pd.DataFrame(
-                    index=feature_names)
+                    index=feature_ids[classifier_result.view_index])
             if hasattr(classifier_result.clf, 'feature_importances_'):
                 feature_importances[classifier_result.view_name][
                     classifier_result.classifier_name] = classifier_result.clf.feature_importances_
@@ -36,25 +36,62 @@ def get_feature_importances(result, feature_names=None):
                 feature_importances[classifier_result.view_name][
                     classifier_result.classifier_name] = np.zeros(
                     classifier_result.n_features)
+        else:
+            if "mv" not in feature_importances:
+                feat_ids = []
+                for view_ind, v_feature_id in enumerate(feature_ids):
+                    feat_ids += [view_names[view_ind] + "-" + ind for ind in
+                                 v_feature_id]
+                feature_importances["mv"] = pd.DataFrame(index=feat_ids)
+            if hasattr(classifier_result.clf, 'feature_importances_'):
+                feature_importances["mv"][classifier_result.classifier_name] = classifier_result.clf.feature_importances_
     return feature_importances
 
 
 def publish_feature_importances(feature_importances, directory, database_name,
                                 feature_stds=None):  # pragma: no cover
+    importance_dfs = []
+    std_dfs = []
+    if not os.path.exists(os.path.join(directory, "feature_importances")):
+        os.mkdir(os.path.join(directory, "feature_importances"))
     for view_name, feature_importance in feature_importances.items():
-        if not os.path.exists(os.path.join(directory, "feature_importances")):
-            os.mkdir(os.path.join(directory, "feature_importances"))
-        file_name = os.path.join(directory, "feature_importances",
-                                 database_name + "-" + view_name
-                                 + "-feature_importances")
+        if view_name!="mv":
+
+            if feature_stds is not None:
+                feature_std = feature_stds[view_name]
+            else:
+                feature_std = pd.DataFrame(data=np.zeros(feature_importance.shape),
+                                           index=feature_importance.index,
+                                           columns=feature_importance.columns)
+            feature_std = feature_std.loc[feature_importance.index]
+
+
+            importance_dfs.append(feature_importance.set_index(pd.Index([view_name+"-"+ind for ind in list(feature_importance.index)])))
+            importance_dfs.append(pd.DataFrame(index=[view_name+"-br"],
+                                               columns=feature_importance.columns,
+                                               data=np.zeros((1, len(
+                                                   feature_importance.columns)))))
+            std_dfs.append(feature_std.set_index(pd.Index([view_name+"-"+ind
+                                                           for ind
+                                                           in list(feature_std.index)])))
+            std_dfs.append(pd.DataFrame(index=[view_name + "-br"],
+                                        columns=feature_std.columns,
+                                        data=np.zeros((1, len(
+                                            feature_std.columns)))))
+    feature_importances_df = pd.concat(importance_dfs[:-1])
+    feature_importances_df = feature_importances_df/feature_importances_df.sum(axis=0)
+    feature_std_df = pd.concat(std_dfs[:-1])
+    if "mv" in feature_importances:
+        feature_importances_df = pd.concat([feature_importances_df,feature_importances["mv"].loc[(feature_importances["mv"] != 0).any(axis=1), :]], axis=1).fillna(0)
         if feature_stds is not None:
-            feature_std = feature_stds[view_name]
-            feature_std.to_csv(file_name + "_dataframe_stds.csv")
+            feature_std_df = pd.concat([feature_std_df, feature_stds["mv"]], axis=1,).fillna(0)
         else:
-            feature_std = pd.DataFrame(data=np.zeros(feature_importance.shape),
-                                       index=feature_importance.index,
-                                       columns=feature_importance.columns)
-        plot_feature_importances(file_name, feature_importance, feature_std)
+            fake = pd.DataFrame(data=np.zeros((feature_importances_df.shape[0], feature_importances["mv"].shape[1])),
+                                index=feature_importances_df.index,
+                                columns=feature_importances["mv"].columns).fillna(0)
+            feature_std_df = pd.concat([feature_std_df, fake], axis=1,).fillna(0)
+    plot_feature_importances(os.path.join(directory, "feature_importances",
+                                          database_name), feature_importances_df, feature_std_df)
 
 
 def plot_feature_importances(file_name, feature_importance,
diff --git a/summit/multiview_platform/utils/base.py b/summit/multiview_platform/utils/base.py
index c88294c59977b3160ddada9b1a343f8fa8f12909..67df47a6eb850da32faa5ea132b1e73a0d746ace 100644
--- a/summit/multiview_platform/utils/base.py
+++ b/summit/multiview_platform/utils/base.py
@@ -105,7 +105,7 @@ class BaseClassifier(BaseEstimator, ):
         else:
             return str(self.get_params()[param_name])
 
-    def get_interpretation(self, directory, base_file_name, y_test,
+    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
                            multi_class=False):
         """
         Base method that returns an empty string if there is not interpretation
@@ -169,7 +169,7 @@ class ResultAnalyser():
     def __init__(self, classifier, classification_indices, k_folds,
                  hps_method, metrics_dict, n_iter, class_label_names, pred,
                  directory, base_file_name, labels,
-                 database_name, nb_cores, duration):
+                 database_name, nb_cores, duration, feature_ids):
         """
 
         Parameters
@@ -219,6 +219,7 @@ class ResultAnalyser():
         self.duration = duration
         self.metric_scores = {}
         self.class_metric_scores = {}
+        self.feature_ids=feature_ids
 
     def get_all_metrics_scores(self, ):
         """
@@ -387,7 +388,7 @@ class ResultAnalyser():
         string_analysis += "\n\n Classifier Interpretation : \n"
         string_analysis += self.classifier.get_interpretation(
             self.directory, self.base_file_name,
-            self.labels[self.test_indices])
+            self.labels[self.test_indices], self.feature_ids)
         image_analysis = {}
         return string_analysis, image_analysis, self.metric_scores, \
             self.class_metric_scores, self.confusion_matrix
diff --git a/summit/multiview_platform/utils/dataset.py b/summit/multiview_platform/utils/dataset.py
index 07023b756fea909cc01e821de05f1f28febf61b8..2a33b34bac58766b39049efa68614ddde26b9522 100644
--- a/summit/multiview_platform/utils/dataset.py
+++ b/summit/multiview_platform/utils/dataset.py
@@ -165,12 +165,18 @@ class Dataset():
 
         return selected_label_names
 
+    def gen_feat_id(self):
+        self.feature_ids = [["ID_" + str(i) for i in
+                             range(self.get_v(view_ind).shape[1])]
+                            for view_ind in self.view_dict.values()]
+
+
 
 class RAMDataset(Dataset):
 
     def __init__(self, views=None, labels=None, are_sparse=False,
                  view_names=None, labels_names=None, sample_ids=None,
-                 name=None):
+                 name=None, feature_ids=None):
         self.saved_on_disk = False
         self.views = views
         self.labels = np.asarray(labels)
@@ -187,6 +193,13 @@ class RAMDataset(Dataset):
         self.name = name
         self.nb_view = len(self.views)
         self.is_temp = False
+        if feature_ids is not None:
+            feature_ids = [[feature_id if not is_just_number(feature_id)
+                            else "ID_" + feature_id for feature_id in
+                            feat_ids] for feat_ids in feature_ids]
+            self.feature_ids = feature_ids
+        else:
+            self.gen_feat_id()
 
     def get_view_name(self, view_idx):
         return self.view_names[view_idx]
@@ -319,7 +332,7 @@ class HDF5Dataset(Dataset):
     def __init__(self, views=None, labels=None, are_sparse=False,
                  file_name="dataset.hdf5", view_names=None, path="",
                  hdf5_file=None, labels_names=None, is_temp=False,
-                 sample_ids=None, ):
+                 sample_ids=None, feature_ids=None):
         self.is_temp = False
         if hdf5_file is not None:
             self.dataset = hdf5_file
@@ -364,6 +377,14 @@ class HDF5Dataset(Dataset):
             else:
                 self.sample_ids = ["ID_" + str(i) for i in
                                    range(labels.shape[0])]
+            if feature_ids is not None:
+                feature_ids = [[feature_id if not is_just_number(feature_id)
+                                else "ID_" + feature_id for feature_id in
+                                feat_ids] for feat_ids in feature_ids]
+                self.feature_ids = feature_ids
+            else:
+                self.gen_feat_id()
+
     def get_v(self, view_index, sample_indices=None):
         """
         Extract the view and returns a numpy.ndarray containing the description
@@ -423,6 +444,7 @@ class HDF5Dataset(Dataset):
         """
         self.nb_view = self.dataset["Metadata"].attrs["nbView"]
         self.view_dict = self.get_view_dict()
+        self.view_names = [self.dataset["View{}".format(ind)].attrs['name'] for ind in range(self.nb_view)]
         if "sample_ids" in self.dataset["Metadata"].keys():
             self.sample_ids = [sample_id.decode()
                                if not is_just_number(sample_id.decode())
@@ -432,6 +454,14 @@ class HDF5Dataset(Dataset):
         else:
             self.sample_ids = ["ID_" + str(i) for i in
                                range(self.dataset["Labels"].shape[0])]
+        if "feature_ids" in self.dataset["Metadata"].keys():
+            self.feature_ids = [[feature_id.decode()
+                                 if not is_just_number(feature_id.decode())
+                                 else "ID_" + feature_id.decode()
+                                 for feature_id in feature_ids] for feature_ids in
+                                self.dataset["Metadata"]["feature_ids"]]
+        else:
+            self.gen_feat_id()
 
     def get_nb_samples(self):
         """