diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b7aacdd3be4737f0d9b47fe6460a27c4c6d768cd..cdfcf30ba53ea9841566f0b198075f4baf8baa15 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -4,7 +4,9 @@ tests:
   tags:
     - docker
   script:
-    - pip3 install -e . --no-deps
+    - export LC_ALL=$(locale -a | grep en_US)
+    - export LANG=$(locale -a | grep en_US)
+    - pip3 install -e .
     - pytest-3
   coverage: '/^TOTAL.+?(\d+\%)$/'
   artifacts:
diff --git a/README.rst b/README.rst
index fd9da8c86354eca883a8a3d5121d8e31a623c4de..b238fbc8f29fddfd975b4a79617851b54e1acc49 100644
--- a/README.rst
+++ b/README.rst
@@ -57,6 +57,7 @@ And the following python modules will be automatically installed :
 * `pyyaml <https://pypi.org/project/PyYAML/>`_ - Used to read the config files,
 * `plotly <https://plot.ly/>`_ - Used to generate interactive HTML visuals,
 * `tabulate <https://pypi.org/project/tabulate/>`_ - Used to generate the confusion matrix.
+* `pyscm-ml <https://pypi.org/project/pyscm-ml/>`_ - Used to run the SCM classifier.
diff --git a/docker/Dockerfile_ubuntu_18.04 b/docker/Dockerfile_ubuntu_18.04
index 4e3bfa76f57ec17ee49037656c60792cbd59fd24..026c49a3542d5ad55a87625665f377fbca3c33f0 100644
--- a/docker/Dockerfile_ubuntu_18.04
+++ b/docker/Dockerfile_ubuntu_18.04
@@ -29,6 +29,7 @@ RUN apt-get install -y --no-install-recommends locales && \
     update-locale en_US.UTF-8 && \
     echo "export LC_ALL=$(locale -a | grep en_US)" >> /root/.bashrc && \
     echo "export LANG=$(locale -a | grep en_US)" >> /root/.bashrc
+ENV LANGUAGE='en_US:en'
 
 COPY requirements.txt .
 RUN pip3 install -r requirements.txt
diff --git a/requirements.txt b/requirements.txt
index 5f4af0b8ad2e9bed3d8f79983591e7534071772d..273d8968b3cf03013d0653fb844de52e4f2502c1 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,5 @@
 pyyaml>=3.12
 plotly>=4.2.1
 matplotlib>=3.1.1
 tabulate>=0.8.6
+pyscm-ml>=1.0.0
diff --git a/summit/multiview_platform/monoview_classifiers/adaboost.py b/summit/multiview_platform/monoview_classifiers/adaboost.py
index cd8ce3db0b769e7ad99032487d94da010988138b..412e9a19fe2c5b523bb358018dc8381acf488dd8 100644
--- a/summit/multiview_platform/monoview_classifiers/adaboost.py
+++ b/summit/multiview_platform/monoview_classifiers/adaboost.py
@@ -64,11 +64,12 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier):
             [step_pred for step_pred in self.staged_predict(X)])
         return pred
 
-    def get_interpretation(self, directory, base_file_name, y_test,
+    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
                            multi_class=False):  # pragma: no cover
         interpretString = ""
         interpretString += self.get_feature_importance(directory,
-                                                       base_file_name)
+                                                       base_file_name,
+                                                       feature_ids)
         interpretString += "\n\n Estimator error | Estimator weight\n"
         interpretString += "\n".join(
             [str(error) + " | " + str(weight / sum(self.estimator_weights_)) for
diff --git a/summit/multiview_platform/monoview_classifiers/gradient_boosting.py b/summit/multiview_platform/monoview_classifiers/gradient_boosting.py
index e242dee80c6c1ef76daacd2c43d4c178b8f4c495..77242502ca370fabba2f51df167774a2c3ac24e2 100644
--- a/summit/multiview_platform/monoview_classifiers/gradient_boosting.py
+++ b/summit/multiview_platform/monoview_classifiers/gradient_boosting.py
@@ -76,14 +76,15 @@ class GradientBoosting(GradientBoostingClassifier, BaseMonoviewClassifier):
             [step_pred for step_pred in self.staged_predict(X)])
         return pred
 
-    def get_interpretation(self, directory, base_file_name, y_test,
+    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
                            multi_class=False):
         interpretString = ""
         if multi_class:
             return interpretString
         else:
             interpretString += self.get_feature_importance(directory,
-                                                           base_file_name)
+                                                           base_file_name,
+                                                           feature_ids)
             step_test_metrics = np.array(
                 [self.plotted_metric.score(y_test, step_pred) for step_pred
                  in self.step_predictions])
diff --git a/summit/multiview_platform/monoview_classifiers/random_forest.py b/summit/multiview_platform/monoview_classifiers/random_forest.py
index c0ebaaa570e33e6d0fa2a92944a16b7f7ccecb99..f0d3578c3c81ac5f11c6baefd6b845d984986314 100644
--- a/summit/multiview_platform/monoview_classifiers/random_forest.py
+++ b/summit/multiview_platform/monoview_classifiers/random_forest.py
@@ -34,10 +34,11 @@ class RandomForest(RandomForestClassifier, BaseMonoviewClassifier):
                          ["gini", "entropy"], [random_state]]
         self.weird_strings = {}
 
-    def get_interpretation(self, directory, base_file_name, y_test,
+    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
                            multiclass=False):
         interpret_string = ""
         interpret_string += self.get_feature_importance(directory,
-                                                        base_file_name)
+                                                        base_file_name,
+                                                        feature_ids)
         return interpret_string
diff --git a/summit/multiview_platform/monoview_classifiers/scm.py b/summit/multiview_platform/monoview_classifiers/scm.py
new file mode 100644
index 0000000000000000000000000000000000000000..d094ceec2c8bc1f66896e2e83873c7f7fbcac03b
--- /dev/null
+++ b/summit/multiview_platform/monoview_classifiers/scm.py
@@ -0,0 +1,106 @@
+from pyscm.scm import SetCoveringMachineClassifier as scm
+
+import numpy as np
+
+from ..monoview.monoview_utils import BaseMonoviewClassifier
+from ..utils.hyper_parameter_search import CustomRandint, CustomUniform
+
+
+# Author-Info
+__author__ = "Baptiste Bauvin"
+__status__ = "Prototype"  # Production, Development, Prototype
+
+
+classifier_class_name = "SCM"
+
+
+class SCM(scm, BaseMonoviewClassifier):
+    """
+    SCM classifier.
+
+    Parameters
+    ----------
+    random_state : int, RandomState instance or None (default : None)
+
+    model_type : string, either "conjunction" or "disjunction"
+        (default : "conjunction")
+
+    max_rules : int, maximum number of rules (default : 10)
+
+    p : float, trade-off hyper-parameter (default : 0.1)
+
+    kwargs : other arguments
+
+    Attributes
+    ----------
+    param_names
+
+    distribs
+
+    classed_params
+
+    weird_strings
+
+    """
+
+    def __init__(self, random_state=None, model_type="conjunction",
+                 max_rules=10, p=0.1, **kwargs):
+        super(SCM, self).__init__(
+            random_state=random_state,
+            model_type=model_type,
+            max_rules=max_rules,
+            p=p
+        )
+        self.param_names = ["model_type", "max_rules", "p", "random_state"]
+        self.distribs = [["conjunction", "disjunction"],
+                         CustomRandint(low=1, high=15),
+                         CustomUniform(loc=0, state=1), [random_state]]
+        self.classed_params = []
+        self.weird_strings = {}
+
+    def fit(self, X, y, tiebreaker=None, iteration_callback=None,
+            **fit_params):
+        self.n_features = X.shape[1]
+        scm.fit(self, X, y, tiebreaker=tiebreaker,
+                iteration_callback=iteration_callback, **fit_params)
+        self.feature_importances_ = np.zeros(self.n_features)
+        # Sum the rules' importances per feature. pyscm does not implement
+        # rule importances yet, so each rule counts for 1; replace the ones
+        # below with self.get_rules_importances() once it is available.
+        rules_importances = np.ones(len(self.model_.rules))
+        for rule, importance in zip(self.model_.rules, rules_importances):
+            self.feature_importances_[rule.feature_idx] += importance
+        self.feature_importances_ /= np.sum(self.feature_importances_)
+        return self
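+
+    # Worked example of the aggregation above (illustrative values): with
+    # n_features = 3 and a fitted model whose two rules test features 0 and 2,
+    # each rule contributes 1, giving [1., 0., 1.], and normalisation yields
+    # feature_importances_ == [0.5, 0., 0.5].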
+    # def canProbas(self):
+    #     """
+    #     Used to know if the classifier can return label probabilities
+    #
+    #     Returns
+    #     -------
+    #     return False in any case
+    #     """
+    #     return False
+
+    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
+                           multi_class=False):
+        interpret_string = self.get_feature_importance(directory,
+                                                       base_file_name,
+                                                       feature_ids)
+        interpret_string += "Model used : " + str(self.model_)
+        return interpret_string
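+
+
+# Minimal usage sketch for the class above (illustrative only; SCM exposes
+# the scikit-learn estimator API inherited from pyscm):
+#     clf = SCM(random_state=42, model_type="conjunction", max_rules=10, p=0.1)
+#     clf.fit(X_train, y_train)
+#     predictions = clf.predict(X_test)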
+def paramsToSet(nIter, random_state):
+    paramsSet = []
+    for _ in range(nIter):
+        paramsSet.append(
+            {"model_type": random_state.choice(["conjunction", "disjunction"]),
+             "max_rules": random_state.randint(1, 15),
+             "p": random_state.random_sample()})
+    return paramsSet
diff --git a/summit/multiview_platform/result_analysis/execution.py b/summit/multiview_platform/result_analysis/execution.py
index 279891e7d30ea3fa2262e25eea013a883d61db52..931d61863f3ae12128ce2c2c8c4933312045ff00 100644
--- a/summit/multiview_platform/result_analysis/execution.py
+++ b/summit/multiview_platform/result_analysis/execution.py
@@ -84,7 +84,7 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter,
         sample_errors = get_sample_errors(labels, result)
         feature_importances = get_feature_importances(result,
                                                       feature_ids=feature_ids,
-                                                      view_names=view_names)
+                                                      view_names=view_names,)
         durations = get_duration(result)
         directory = arguments["directory"]
 
@@ -98,7 +98,7 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter,
         publish_sample_errors(sample_errors, directory, database_name,
                               labels_names, sample_ids, labels)
         publish_feature_importances(feature_importances, directory,
-                                    database_name)
+                                    database_name, metric_scores=metrics_scores)
         plot_durations(durations, directory, database_name)
 
     iter_results["metrics_scores"][iter_index] = metrics_scores
@@ -129,7 +129,8 @@ def analyze_all(iter_results, stats_iter, directory, data_base_name,
     publish_all_sample_errors(error_analysis, directory, stats_iter,
                               sample_ids, labels, data_base_name, label_names)
     publish_feature_importances(feature_importances, directory,
-                                data_base_name, feature_importances_stds)
+                                data_base_name, feature_importances_stds,
+                                metric_scores=metrics_analysis)
     plot_durations(duration_means, directory, data_base_name, duration_stds)
 
     return results
diff --git a/summit/multiview_platform/result_analysis/feature_importances.py b/summit/multiview_platform/result_analysis/feature_importances.py
index 088cd8309f36539b370b6891313432f93246094d..36c0eb3514b0fa3db388af10803b60f2f245f011 100644
--- a/summit/multiview_platform/result_analysis/feature_importances.py
+++ b/summit/multiview_platform/result_analysis/feature_importances.py
@@ -7,7 +7,7 @@ import plotly
 
 from ..monoview.monoview_utils import MonoviewResult
 
-def get_feature_importances(result, feature_ids=None, view_names=None):
+def get_feature_importances(result, feature_ids=None, view_names=None,):
     r"""Extracts the feature importance from the monoview results and stores
     them in a dictionnary :
     feature_importance[view_name] is a pandas.DataFrame of size n_feature*n_clf
@@ -49,7 +49,7 @@ def get_feature_importances(result, feature_ids=None, view_names=None):
 
 
 def publish_feature_importances(feature_importances, directory, database_name,
-                                feature_stds=None):  # pragma: no cover
+                                feature_stds=None, metric_scores=None):  # pragma: no cover
     importance_dfs = []
     std_dfs = []
     if not os.path.exists(os.path.join(directory, "feature_importances")):
@@ -67,22 +67,22 @@ def publish_feature_importances(feature_importances, directory, database_name,
             importance_dfs.append(feature_importance.set_index(
                 pd.Index([view_name + "-" + ind
                           for ind in list(feature_importance.index)])))
-            importance_dfs.append(pd.DataFrame(index=[view_name+"-br"],
-                                               columns=feature_importance.columns,
-                                               data=np.zeros((1, len(
-                                                   feature_importance.columns)))))
+            # importance_dfs.append(pd.DataFrame(index=[view_name+"-br"],
+            #                                    columns=feature_importance.columns,
+            #                                    data=np.zeros((1, len(
+            #                                        feature_importance.columns)))))
 
             std_dfs.append(feature_std.set_index(
                 pd.Index([view_name + "-" + ind
                           for ind in list(feature_std.index)])))
-            std_dfs.append(pd.DataFrame(index=[view_name + "-br"],
-                                        columns=feature_std.columns,
-                                        data=np.zeros((1, len(
-                                            feature_std.columns)))))
+            # std_dfs.append(pd.DataFrame(index=[view_name + "-br"],
+            #                             columns=feature_std.columns,
+            #                             data=np.zeros((1, len(
+            #                                 feature_std.columns)))))
 
     if len(importance_dfs) > 0:
-        feature_importances_df = pd.concat(importance_dfs[:-1])
+        feature_importances_df = pd.concat(importance_dfs)
         feature_importances_df = feature_importances_df/feature_importances_df.sum(axis=0)
-        feature_std_df = pd.concat(std_dfs[:-1])
+        feature_std_df = pd.concat(std_dfs)
         if "mv" in feature_importances:
             feature_importances_df = pd.concat(
                 [feature_importances_df,
                  feature_importances["mv"].loc[
                      (feature_importances["mv"] != 0).any(axis=1), :]],
                 axis=1).fillna(0)
             if feature_stds is not None:
@@ -94,10 +94,16 @@ def publish_feature_importances(feature_importances, directory, database_name,
             feature_std_df = pd.concat([feature_std_df, fake], axis=1,).fillna(0)
     plot_feature_importances(os.path.join(directory, "feature_importances",
                                           database_name),
                              feature_importances_df, feature_std_df)
+    if metric_scores is not None:
+        plot_feature_relevance(os.path.join(directory, "feature_importances",
+                                            database_name),
+                               feature_importances_df, feature_std_df,
+                               metric_scores)
 
 
 def plot_feature_importances(file_name, feature_importance,
                              feature_std):  # pragma: no cover
+    # Drop all-zero features and sort the rest by decreasing total importance.
+    s = feature_importance.sum(axis=1)
+    s = s[s != 0]
+    feature_importance = feature_importance.loc[
+        s.sort_values(ascending=False).index]
     feature_importance.to_csv(file_name + "_dataframe.csv")
     hover_text = [["-Feature :" + str(feature_name) +
                    "<br>-Classifier : " + classifier_name +
@@ -113,8 +119,8 @@ def plot_feature_importances(file_name, feature_importance,
             z=feature_importance.values,
             text=hover_text,
             hoverinfo=["text"],
-            colorscale="Greys",
-            reversescale=False))
+            colorscale="Hot",
+            reversescale=True))
     fig.update_layout(
         xaxis={"showgrid": False, "showticklabels": False, "ticks": ''},
         yaxis={"showgrid": False, "showticklabels": False, "ticks": ''})
@@ -123,3 +129,20 @@ def plot_feature_importances(file_name, feature_importance,
     plotly.offline.plot(fig, filename=file_name + ".html", auto_open=False)
 
     del fig
+
+
+def plot_feature_relevance(file_name, feature_importance,
+                           feature_std, metric_scores):  # pragma: no cover
+    for metric, score_df in metric_scores.items():
+        if metric.endswith("*"):
+            if isinstance(score_df, dict):
+                score_df = score_df["mean"]
+            for score in score_df.columns:
+                if len(score.split("-")) > 1:
+                    algo, view = score.split("-")
+                    feature_importance[algo].loc[
+                        [ind for ind in feature_importance.index
+                         if ind.startswith(view)]] *= score_df[score]['test']
+                else:
+                    feature_importance[score] *= score_df[score]['test']
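+    # Reading of the loop above: keys suffixed with "*" mark the principal
+    # metric in summit's results; "algo-view" column names rescale one view's
+    # block of a classifier's importances, while plain column names rescale
+    # the whole classifier by its test score, so better-scoring classifiers
+    # stand out in the relevance heatmap.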
+    file_name += "_relevance"
+    plot_feature_importances(file_name, feature_importance,
+                             feature_std)
diff --git a/summit/tests/test_multi_view/test_multiview_utils.py b/summit/tests/test_multi_view/test_multiview_utils.py
index 491d0134b2943be11436bf3bacef2eff6cb08614..99d725253c7a0341719913856259c00d65fba3a9 100644
--- a/summit/tests/test_multi_view/test_multiview_utils.py
+++ b/summit/tests/test_multi_view/test_multiview_utils.py
@@ -79,6 +79,7 @@ class TestFunctions(unittest.TestCase):
                           'knn',
                           'lasso',
                           'random_forest',
+                          'scm',
                           'sgd',
                           'svm_linear',
                           'svm_poly',
@@ -90,6 +91,7 @@ class TestFunctions(unittest.TestCase):
                           'gradient_boosting',
                           'knn',
                           'random_forest',
+                          'scm',
                           'svm_linear',
                           'svm_poly',
                           'svm_rbf'])