diff --git a/requirements.txt b/requirements.txt
index 5fbfaac93c6434d6879ed9f740eeaa886ce2701b..a9e891d8feb344d2919320d705d946056f27d536 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,4 +12,4 @@ plotly>=4.2.1
 matplotlib>=3.1.1
 tabulate>=0.8.6
 pyscm-ml>=1.0.0
-imbalanced-learn
\ No newline at end of file
+imbalanced-learn>=0.10.1
\ No newline at end of file
diff --git a/summit/multiview_platform/monoview_classifiers/random_scm.py b/summit/multiview_platform/monoview_classifiers/random_scm.py
index c648c01f40c968cbff0d4bc2a570990cc15b63cd..287cfcabf1ae29a1ea88bbd49b3f2b22be2e4ee1 100644
--- a/summit/multiview_platform/monoview_classifiers/random_scm.py
+++ b/summit/multiview_platform/monoview_classifiers/random_scm.py
@@ -82,7 +82,7 @@ class ScmBagging(RandomScmClassifier, BaseMonoviewClassifier):
                                      max_samples=max_samples,
                                      max_features=max_features,
                                      max_rules=max_rules,
-                                     p_options=p_options,
+                                     p=p_options,
                                      model_type=model_type,
                                      random_state=random_state)
         self.param_names = ["n_estimators", "max_rules", "max_samples", "max_features", "model_type", "p_options", "random_state"]
@@ -94,7 +94,7 @@ class ScmBagging(RandomScmClassifier, BaseMonoviewClassifier):
     def set_params(self, p_options=[0.316], **kwargs):
         if not isinstance(p_options, list):
             p_options = [p_options]
-        kwargs["p_options"] = p_options
+        kwargs["p"] = p_options
         for parameter, value in iteritems(kwargs):
             setattr(self, parameter, value)
         return self
diff --git a/summit/multiview_platform/multiview_classifiers/mucombo.py b/summit/multiview_platform/multiview_classifiers/mucombo.py
index 9fbb698c54621f8456e5ae4a74d348134c854a70..2776055ebd88b74cf3ae6a976cc3d7928597233c 100644
--- a/summit/multiview_platform/multiview_classifiers/mucombo.py
+++ b/summit/multiview_platform/multiview_classifiers/mucombo.py
@@ -7,20 +7,20 @@ from ..utils.hyper_parameter_search import CustomRandint
 from ..utils.dataset import get_samples_views_indices
 from ..utils.base import base_boosting_estimators

-classifier_class_name = "MuCumbo"
+classifier_class_name = "MuCombo"


 class MuCombo(BaseMultiviewClassifier, MuComboClassifier):

-    def __init__(self, estimator=None,
+    def __init__(self, base_estimator=None,
                  n_estimators=50,
                  random_state=None,**kwargs):
         BaseMultiviewClassifier.__init__(self, random_state)
-        estimator = self.set_base_estim_from_dict(estimator, **kwargs)
-        MuComboClassifier.__init__(self, estimator=estimator,
+        base_estimator = self.set_base_estim_from_dict(base_estimator, **kwargs)
+        MuComboClassifier.__init__(self, base_estimator=base_estimator,
                                    n_estimators=n_estimators,
                                    random_state=random_state,)
-        self.param_names = ["estimator", "n_estimators", "random_state",]
+        self.param_names = ["base_estimator", "n_estimators", "random_state",]
         self.distribs = [base_boosting_estimators,
                          CustomRandint(5,200), [random_state],]
@@ -43,6 +43,12 @@ class MuCombo(BaseMultiviewClassifier, MuComboClassifier):
                                           view_indices=view_indices)
         return MuComboClassifier.predict(self, numpy_X)

-    def get_interpretation(self, directory, base_file_name, labels,
-                           multiclass=False):
+    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
+                           multi_class=False):
         return ""
+
+    def set_base_estim_from_dict(self, dict):
+        key, args = list(dict.items())[0]
+
+        if key == "decision_tree":
+            return DecisionTreeClassifier(**args)
\ No newline at end of file
diff --git a/summit/multiview_platform/multiview_classifiers/mumbo.py b/summit/multiview_platform/multiview_classifiers/mumbo.py
index fcaf64ab5ccdbb29867b19740c986ce4118ae8f6..04d241ad29a87d1b24bdff319f1e7145d74c87cf 100644
--- a/summit/multiview_platform/multiview_classifiers/mumbo.py
+++ b/summit/multiview_platform/multiview_classifiers/mumbo.py
@@ -13,21 +13,22 @@ from .. import monoview_classifiers

 classifier_class_name = "Mumbo"

+
 class Mumbo(BaseMultiviewClassifier, MumboClassifier):

-    def __init__(self, estimator=None,
+    def __init__(self, base_estimator=None,
                  n_estimators=50,
                  random_state=None,
                  best_view_mode="edge", **kwargs):
         BaseMultiviewClassifier.__init__(self, random_state)
-        base_estimator = self.set_base_estim_from_dict(estimator, **kwargs)
-        MumboClassifier.__init__(self, base_estimator=estimator,
-                                 n_estimators=n_estimators,
-                                 random_state=random_state,
-                                 best_view_mode=best_view_mode)
-        self.param_names = ["estimator", "n_estimators", "random_state", "best_view_mode"]
+        base_estimator = self.set_base_estim_from_dict(base_estimator)
+        MumboClassifier.__init__(self, base_estimator=base_estimator,
+                                 n_estimators=n_estimators,
+                                 random_state=random_state,
+                                 best_view_mode=best_view_mode)
+        self.param_names = ["base_estimator", "n_estimators", "random_state", "best_view_mode"]
         self.distribs = [base_boosting_estimators,
-                         CustomRandint(5,200), [random_state], ["edge", "error"]]
+                         CustomRandint(5, 200), [random_state], ["edge", "error"]]

     def set_params(self, estimator=None, **params):
         """
@@ -42,23 +43,22 @@ class Mumbo(BaseMultiviewClassifier, MumboClassifier):
             self.base_estimator = self.set_base_estim_from_dict(estimator)
             MumboClassifier.set_params(self, **params)
         else:
-            MumboClassifier.set_params(self, estimator=estimator, **params)
-
+            MumboClassifier.set_params(self, base_estimator=estimator, **params)

     def fit(self, X, y, train_indices=None, view_indices=None):
         train_indices, view_indices = get_samples_views_indices(X,
-                                                                 train_indices,
-                                                                 view_indices)
+                                                                train_indices,
+                                                                view_indices)
         self.used_views = view_indices
         self.view_names = [X.get_view_name(view_index)
                            for view_index in view_indices]
         numpy_X, view_limits = X.to_numpy_array(sample_indices=train_indices,
                                                 view_indices=view_indices)
-        self.view_shapes = [view_limits[ind+1]-view_limits[ind]
-                            for ind in range(len(self.used_views)) ]
+        self.view_shapes = [view_limits[ind + 1] - view_limits[ind]
+                            for ind in range(len(self.used_views))]
         return MumboClassifier.fit(self, numpy_X, y[train_indices],
-                                    view_limits)
+                                   view_limits)

     def predict(self, X, sample_indices=None, view_indices=None):
         sample_indices, view_indices = get_samples_views_indices(X,
@@ -69,10 +69,11 @@ class Mumbo(BaseMultiviewClassifier, MumboClassifier):
                                             view_indices=view_indices)
         return MumboClassifier.predict(self, numpy_X)

-    def get_interpretation(self, directory, base_file_name, labels, multiclass=False):
+    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
+                           multi_class=False):
         self.view_importances = np.zeros(len(self.used_views))
         self.feature_importances_ = [np.zeros(view_shape)
-                                     for view_shape in self.view_shapes]
+                                     for view_shape in self.view_shapes]
         for best_view, estimator_weight, estimator in zip(self.best_views_, self.estimator_weights_, self.estimators_):
             self.view_importances[best_view] += estimator_weight
             if hasattr(estimator, "feature_importances_"):
@@ -80,26 +81,35 @@ class Mumbo(BaseMultiviewClassifier, MumboClassifier):
         importances_sum = sum([np.sum(feature_importances)
                                for feature_importances in self.feature_importances_])
-        self.feature_importances_ = [feature_importances/importances_sum
+        self.feature_importances_ = [feature_importances / importances_sum
                                      for feature_importances in self.feature_importances_]
         for feature_importances, view_name in zip(self.feature_importances_, self.view_names):
             secure_file_path(os.path.join(directory, "feature_importances",
-                                          base_file_name+view_name+"-feature_importances.csv"))
+                                          base_file_name + view_name + "-feature_importances.csv"))
             np.savetxt(os.path.join(directory, "feature_importances",
-                                    base_file_name+view_name+"-feature_importances.csv"),
+                                    base_file_name + view_name + "-feature_importances.csv"),
                        feature_importances, delimiter=',')
+        # CHANGE: concatenate self.feature_importances_ into a single array so it is easy to use in
+        # summit.multiview_platform.result_analysis.feature_importances.get_feature_importances
+        self.feature_importances_ = np.concatenate(self.feature_importances_)
         self.view_importances /= np.sum(self.view_importances)
-        np.savetxt(os.path.join(directory, base_file_name+"view_importances.csv"), self.view_importances,
+        np.savetxt(os.path.join(directory, base_file_name + "view_importances.csv"), self.view_importances,
                    delimiter=',')
         sorted_view_indices = np.argsort(-self.view_importances)
         interpret_string = "Mumbo used {} iterations to converge.".format(self.best_views_.shape[0])
-        interpret_string+= "\n\nViews importance : \n"
+        interpret_string += "\n\nViews importance : \n"
         for view_index in sorted_view_indices:
-            interpret_string+="- View {} ({}), importance {}\n".format(view_index,
-                                                                       self.view_names[view_index],
-                                                                       self.view_importances[view_index])
-        interpret_string +="\n The boosting process selected views : \n" + ", ".join(map(str, self.best_views_))
-        interpret_string+="\n\n With estimator weights : \n"+ "\n".join(map(str,self.estimator_weights_/np.sum(self.estimator_weights_)))
+            interpret_string += "- View {} ({}), importance {}\n".format(view_index,
+                                                                         self.view_names[view_index],
+                                                                         self.view_importances[view_index])
+        interpret_string += "\n The boosting process selected views : \n" + ", ".join(map(str, self.best_views_))
+        interpret_string += "\n\n With estimator weights : \n" + "\n".join(
+            map(str, self.estimator_weights_ / np.sum(self.estimator_weights_)))
         return interpret_string
+
+    def set_base_estim_from_dict(self, dict):
+        key, args = list(dict.items())[0]
+        if key == "decision_tree":
+            return DecisionTreeClassifier(**args)
diff --git a/summit/multiview_platform/multiview_classifiers/weighted_linear_early_fusion.py b/summit/multiview_platform/multiview_classifiers/weighted_linear_early_fusion.py
index 9af0183658e2ebbba32f4c894d1d6fffb4bcf762..c131e9a9a94ab1f768a9fb2958365c9cc3d7076c 100644
--- a/summit/multiview_platform/multiview_classifiers/weighted_linear_early_fusion.py
+++ b/summit/multiview_platform/multiview_classifiers/weighted_linear_early_fusion.py
@@ -65,6 +65,8 @@ class WeightedLinearEarlyFusion(BaseMultiviewClassifier, BaseFusionClassifier):
                                               y=y[train_indices])
         self.monoview_classifier.fit(X, y[train_indices])
         self.monoview_classifier_config = self.monoview_classifier.get_params()
+        if hasattr(self.monoview_classifier, 'feature_importances_'):
+            self.feature_importances_ = self.monoview_classifier.feature_importances_
         return self

     def predict(self, X, sample_indices=None, view_indices=None):
diff --git a/summit/multiview_platform/result_analysis/execution.py b/summit/multiview_platform/result_analysis/execution.py
index 931d61863f3ae12128ce2c2c8c4933312045ff00..b503e007c800a735c5eaed0b8fc03f7dbc8b358a 100644
--- a/summit/multiview_platform/result_analysis/execution.py
+++ b/summit/multiview_platform/result_analysis/execution.py
@@ -84,7 +84,7 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter,
         sample_errors = get_sample_errors(labels, result)
         feature_importances = get_feature_importances(result,
                                                       feature_ids=feature_ids,
-                                                      view_names=view_names,)
+                                                      view_names=view_names, )
         durations = get_duration(result)
         directory = arguments["directory"]
diff --git a/summit/multiview_platform/result_analysis/feature_importances.py b/summit/multiview_platform/result_analysis/feature_importances.py
index 36c0eb3514b0fa3db388af10803b60f2f245f011..1c6b4188035f3a16c0d7e246091458db2816c88b 100644
--- a/summit/multiview_platform/result_analysis/feature_importances.py
+++ b/summit/multiview_platform/result_analysis/feature_importances.py
@@ -7,7 +7,7 @@ import plotly

 from ..monoview.monoview_utils import MonoviewResult

-def get_feature_importances(result, feature_ids=None, view_names=None,):
+def get_feature_importances(result, feature_ids=None, view_names=None, ):
     r"""Extracts the feature importance from the monoview results and stores
     them in a dictionnary :
     feature_importance[view_name] is a pandas.DataFrame of size n_feature*n_clf
@@ -44,75 +44,67 @@ def get_feature_importances(result, feature_ids=None, view_names=None, ):
                                 v_feature_id]
             feature_importances["mv"] = pd.DataFrame(index=feat_ids)
             if hasattr(classifier_result.clf, 'feature_importances_'):
-                feature_importances["mv"][classifier_result.classifier_name] = classifier_result.clf.feature_importances_
+                feature_importances["mv"][classifier_result.get_classifier_name()] = classifier_result.clf.feature_importances_
+            else:
+                # HACK: assign a default feature importance value to classifiers that have no
+                # feature_importances_ attribute (e.g. Linear Late Fusion)
+                feature_importances["mv"][classifier_result.get_classifier_name()] = np.zeros(len(feature_importances["mv"].index))
     return feature_importances


 def publish_feature_importances(feature_importances, directory, database_name,
                                 feature_stds=None, metric_scores=None):  # pragma: no cover
+    # TODO: Manage the case with NAN values
     importance_dfs = []
     std_dfs = []
     if not os.path.exists(os.path.join(directory, "feature_importances")):
         os.mkdir(os.path.join(directory, "feature_importances"))
     for view_name, feature_importance in feature_importances.items():
-        if view_name!="mv":
+        if feature_stds is not None:
+            feature_std = feature_stds[view_name]
+        else:
+            feature_std = pd.DataFrame(data=np.zeros(feature_importance.shape),
+                                       index=feature_importance.index,
+                                       columns=feature_importance.columns)
+        feature_std = feature_std.loc[feature_importance.index]
+
+        if view_name == "mv":
+            importance_dfs.append(feature_importance)
+            std_dfs.append(feature_std)
+        else:
+            importance_dfs.append(feature_importance.set_index(
+                pd.Index([view_name + "-" + ind for ind in list(feature_importance.index)])))
-            if feature_stds is not None:
-                feature_std = feature_stds[view_name]
-            else:
-                feature_std = pd.DataFrame(data=np.zeros(feature_importance.shape),
-                                           index=feature_importance.index,
-                                           columns=feature_importance.columns)
-                feature_std = feature_std.loc[feature_importance.index]
-
-
-            importance_dfs.append(feature_importance.set_index(pd.Index([view_name+"-"+ind for ind in list(feature_importance.index)])))
-            # importance_dfs.append(pd.DataFrame(index=[view_name+"-br"],
-            #                                    columns=feature_importance.columns,
-            #                                    data=np.zeros((1, len(
-            #                                        feature_importance.columns)))))
-            std_dfs.append(feature_std.set_index(pd.Index([view_name+"-"+ind
+            std_dfs.append(feature_std.set_index(pd.Index([view_name + "-" + ind
                                                            for ind in
                                                            list(feature_std.index)])))
-            # std_dfs.append(pd.DataFrame(index=[view_name + "-br"],
-            #                             columns=feature_std.columns,
-            #                             data=np.zeros((1, len(
-            #                                 feature_std.columns)))))
-    if len(importance_dfs)>0:
+
+    if len(importance_dfs) > 0:
         feature_importances_df = pd.concat(importance_dfs)
-        feature_importances_df = feature_importances_df/feature_importances_df.sum(axis=0)
+        feature_importances_df = feature_importances_df / feature_importances_df.sum(axis=0)
         feature_std_df = pd.concat(std_dfs)
-        if "mv" in feature_importances:
-            feature_importances_df = pd.concat([feature_importances_df,feature_importances["mv"].loc[(feature_importances["mv"] != 0).any(axis=1), :]], axis=1).fillna(0)
-            if feature_stds is not None:
-                feature_std_df = pd.concat([feature_std_df, feature_stds["mv"]], axis=1,).fillna(0)
-            else:
-                fake = pd.DataFrame(data=np.zeros((feature_importances_df.shape[0], feature_importances["mv"].shape[1])),
-                                    index=feature_importances_df.index,
-                                    columns=feature_importances["mv"].columns).fillna(0)
-                feature_std_df = pd.concat([feature_std_df, fake], axis=1,).fillna(0)
         plot_feature_importances(os.path.join(directory, "feature_importances",
-                                 database_name), feature_importances_df, feature_std_df)
+                                              database_name), feature_importances_df, feature_std_df)
         if metric_scores is not None:
             plot_feature_relevance(os.path.join(directory, "feature_importances",
-                                   database_name), feature_importances_df, feature_std_df, metric_scores)
+                                                database_name), feature_importances_df, feature_std_df, metric_scores)


 def plot_feature_importances(file_name, feature_importance,
                              feature_std):  # pragma: no cover
     s = feature_importance.sum(axis=1)
-    s = s[s!=0]
+    s = s[s != 0]
     feature_importance = feature_importance.loc[s.sort_values(ascending=False).index]
     feature_importance.to_csv(file_name + "_dataframe.csv")
     hover_text = [["-Feature :" + str(feature_name) +
                    "<br>-Classifier : " + classifier_name +
                    "<br>-Importance : " + str(
         feature_importance.loc[feature_name][classifier_name]) +
-        "<br>-STD : " + str(
+                   "<br>-STD : " + str(
         feature_std.loc[feature_name][classifier_name])
-        for classifier_name in list(feature_importance.columns)]
-        for feature_name in list(feature_importance.index)]
+                   for classifier_name in list(feature_importance.columns)]
+                  for feature_name in list(feature_importance.index)]
     fig = plotly.graph_objs.Figure(data=plotly.graph_objs.Heatmap(
         x=list(feature_importance.columns),
         y=list(feature_importance.index),
@@ -129,20 +121,20 @@ def plot_feature_importances(file_name, feature_importance,
     plotly.offline.plot(fig, filename=file_name + ".html", auto_open=False)

     del fig
-
+

 def plot_feature_relevance(file_name, feature_importance,
-                           feature_std, metric_scores): # pragma: no cover
+                           feature_std, metric_scores):  # pragma: no cover
     for metric, score_df in metric_scores.items():
         if metric.endswith("*"):
             if isinstance(score_df, dict):
                 score_df = score_df["mean"]
             for score in score_df.columns:
-                if len(score.split("-"))>1:
+                if len(score.split("-")) > 1:
                     algo, view = score.split("-")
                     feature_importance[algo].loc[[ind for ind in feature_importance.index if ind.startswith(view)]]*=score_df[score]['test']
                 else:
                     feature_importance[score] *= score_df[score]['test']
-    file_name+="_relevance"
+    file_name += "_relevance"
    plot_feature_importances(file_name, feature_importance, feature_std)
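
Note on the dict-based base estimator: the set_base_estim_from_dict helpers added to mucombo.py and mumbo.py expect a one-entry dict mapping an estimator name to its constructor arguments, and this patch only recognises the "decision_tree" key. A minimal standalone sketch of that resolution logic follows; resolve_base_estimator and the explicit error for unknown keys are illustrative additions, not part of the patch, which simply falls through for other keys.

    from sklearn.tree import DecisionTreeClassifier


    def resolve_base_estimator(config):
        # Mirrors the set_base_estim_from_dict helper added in this patch:
        # take the single (name, kwargs) pair and build the estimator.
        key, args = list(config.items())[0]
        if key == "decision_tree":
            return DecisionTreeClassifier(**args)
        raise ValueError("Unsupported base estimator: {}".format(key))


    # Example: a config-style dict resolves to the sklearn estimator that
    # Mumbo / MuCombo forward to the multimodal boosting classifiers.
    estimator = resolve_base_estimator({"decision_tree": {"max_depth": 3}})
    print(estimator)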
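Note on the "mv" feature importances: with the change in get_feature_importances, a multiview classifier that does not expose feature_importances_ (for instance a late fusion classifier) now gets a default all-zero column in feature_importances["mv"] instead of being silently skipped, so every classifier contributes a column of the same length. A small self-contained sketch of that behaviour, with made-up classifier names and feature ids:

    import numpy as np
    import pandas as pd

    feat_ids = ["view0-feat0", "view0-feat1", "view1-feat0"]
    mv_importances = pd.DataFrame(index=feat_ids)

    # Classifier exposing feature_importances_ (e.g. early fusion over trees).
    mv_importances["early_fusion"] = np.array([0.7, 0.2, 0.1])

    # Classifier without feature_importances_ (e.g. a weighted linear late
    # fusion) gets a default column of zeros, as in the patched HACK branch.
    mv_importances["late_fusion"] = np.zeros(len(mv_importances.index))

    print(mv_importances)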
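Note on Mumbo's interpretation output: view importances are accumulated from the boosting weights of the iterations that selected each view and then normalised, and the per-view feature importance arrays are now concatenated into one flat array so the result analysis code can treat them like a monoview feature_importances_. A numpy sketch of that accumulation, using invented weights and view shapes rather than values from the patch:

    import numpy as np

    view_shapes = [4, 3, 2]                  # number of features per view
    best_views = np.array([0, 2, 0, 1, 0])   # view selected at each iteration
    estimator_weights = np.array([0.5, 0.3, 0.4, 0.2, 0.6])

    view_importances = np.zeros(len(view_shapes))
    for best_view, weight in zip(best_views, estimator_weights):
        view_importances[best_view] += weight
    view_importances /= view_importances.sum()
    print(view_importances)  # [0.75 0.1  0.15]

    # Per-view importance arrays are concatenated into a single flat array,
    # matching what get_interpretation now stores in self.feature_importances_.
    feature_importances = [np.ones(shape) / shape for shape in view_shapes]
    print(np.concatenate(feature_importances).shape)  # (9,)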