Commit 71f781d4 authored by Baptiste Bauvin

Merge branch 'develop'

parents c1e977c7 4a2b2ceb
Pipeline #9531 passed
@@ -4,7 +4,9 @@ tests:
   tags:
     - docker
   script:
-    - pip3 install -e . --no-deps
+    - export LC_ALL=$(locale -a | grep en_US)
+    - export LANG=$(locale -a | grep en_US)
+    - pip3 install -e .
     - pytest-3
   coverage: '/^TOTAL.+?(\d+\%)$/'
   artifacts:
...
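The new script lines export LC_ALL and LANG from whatever en_US locale the docker image provides, so the editable install and the test run execute under a UTF-8 locale. A minimal sanity check, only a sketch and not part of the commit, that could be run inside the job to confirm the locale took effect:

import locale

# Expected to print "UTF-8" once LC_ALL/LANG point at the image's en_US locale.
print(locale.getpreferredencoding())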
@@ -57,6 +57,7 @@ And the following python modules will be automatically installed :
 * `pyyaml <https://pypi.org/project/PyYAML/>`_ - Used to read the config files,
 * `plotly <https://plot.ly/>`_ - Used to generate interactive HTML visuals,
 * `tabulate <https://pypi.org/project/tabulate/>`_ - Used to generate the confusion matrix.
+* `pyscm-ml <https://pypi.org/project/pyscm-ml/>`_ - Used for the SCM classifier.
 
 Installing
...
@@ -29,6 +29,7 @@ RUN apt-get install -y --no-install-recommends locales && \
     update-locale en_US.UTF-8 && \
     echo "export LC_ALL=$(locale -a | grep en_US)" >> /root/.bashrc && \
     echo "export LANG=$(locale -a | grep en_US)" >> /root/.bashrc
+ENV LANGUAGE='en_US:en'
 
 COPY requirements.txt .
 RUN pip3 install -r requirements.txt
...
@@ -11,4 +11,5 @@ pyyaml>=3.12
 plotly>=4.2.1
 matplotlib>=3.1.1
 tabulate>=0.8.6
+pyscm-ml>=1.0.0
@@ -64,11 +64,12 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier):
             [step_pred for step_pred in self.staged_predict(X)])
         return pred
 
-    def get_interpretation(self, directory, base_file_name, y_test,
+    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
                            multi_class=False):  # pragma: no cover
         interpretString = ""
         interpretString += self.get_feature_importance(directory,
-                                                       base_file_name)
+                                                       base_file_name,
+                                                       feature_ids)
         interpretString += "\n\n Estimator error | Estimator weight\n"
         interpretString += "\n".join(
             [str(error) + " | " + str(weight / sum(self.estimator_weights_)) for
...
@@ -76,14 +76,15 @@ class GradientBoosting(GradientBoostingClassifier, BaseMonoviewClassifier):
             [step_pred for step_pred in self.staged_predict(X)])
         return pred
 
-    def get_interpretation(self, directory, base_file_name, y_test,
+    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
                            multi_class=False):
         interpretString = ""
         if multi_class:
             return interpretString
         else:
             interpretString += self.get_feature_importance(directory,
-                                                           base_file_name)
+                                                           base_file_name,
+                                                           feature_ids)
             step_test_metrics = np.array(
                 [self.plotted_metric.score(y_test, step_pred) for step_pred in
                  self.step_predictions])
...
@@ -34,10 +34,11 @@ class RandomForest(RandomForestClassifier, BaseMonoviewClassifier):
                          ["gini", "entropy"], [random_state]]
         self.weird_strings = {}
 
-    def get_interpretation(self, directory, base_file_name, y_test,
+    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
                            multiclass=False):
         interpret_string = ""
         interpret_string += self.get_feature_importance(directory,
-                                                        base_file_name)
+                                                        base_file_name,
+                                                        feature_ids)
         return interpret_string
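All three wrappers above now take a feature_ids argument and forward it to get_feature_importance, so importances can be reported per feature name rather than per column index. A minimal sketch of the idea using plain scikit-learn with made-up feature names, not the platform's own code:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
X, y = rng.rand(60, 4), rng.randint(0, 2, 60)
feature_ids = ["age", "weight", "dose", "marker"]  # hypothetical names

clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)
# Pairing importances with feature_ids is what the new argument enables.
for name, importance in zip(feature_ids, clf.feature_importances_):
    print(name, round(importance, 3))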
from pyscm.scm import SetCoveringMachineClassifier as scm
import numpy as np

from ..monoview.monoview_utils import BaseMonoviewClassifier
from ..utils.hyper_parameter_search import CustomRandint, CustomUniform

# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype"  # Production, Development, Prototype


classifier_class_name = "SCM"


class SCM(scm, BaseMonoviewClassifier):
    """
    SCM Classifier

    Parameters
    ----------
    random_state : int or RandomState (default : None)
    model_type : string (default : "conjunction")
    max_rules : int, maximum number of rules (default : 10)
    p : float (default : 0.1)
    kwargs : other arguments

    Attributes
    ----------
    param_names
    distribs
    classed_params
    weird_strings
    """

    def __init__(self, random_state=None, model_type="conjunction",
                 max_rules=10, p=0.1, **kwargs):
        """
        Parameters
        ----------
        random_state
        model_type
        max_rules
        p
        kwargs
        """
        super(SCM, self).__init__(
            random_state=random_state,
            model_type=model_type,
            max_rules=max_rules,
            p=p
        )
        self.param_names = ["model_type", "max_rules", "p", "random_state"]
        self.distribs = [["conjunction", "disjunction"],
                         CustomRandint(low=1, high=15),
                         CustomUniform(loc=0, state=1), [random_state]]
        self.classed_params = []
        self.weird_strings = {}

    def fit(self, X, y, tiebreaker=None, iteration_callback=None, **fit_params):
        self.n_features = X.shape[1]
        scm.fit(self, X, y, tiebreaker=tiebreaker,
                iteration_callback=iteration_callback, **fit_params)
        self.feature_importances_ = np.zeros(self.n_features)
        # Sum the rules' importances:
        # rules_importances = estim.get_rules_importances()  # activate it when pyscm will implement importance
        rules_importances = np.ones(len(
            self.model_.rules))  # delete it when pyscm will implement importance
        for rule, importance in zip(self.model_.rules, rules_importances):
            self.feature_importances_[rule.feature_idx] += importance
        self.feature_importances_ /= np.sum(self.feature_importances_)
        return self

    # def canProbas(self):
    #     """
    #     Used to know if the classifier can return label probabilities
    #
    #     Returns
    #     -------
    #     return False in any case
    #     """
    #     return False

    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
                           multi_class=False):
        interpret_string = self.get_feature_importance(directory,
                                                       base_file_name,
                                                       feature_ids)
        interpret_string += "Model used : " + str(self.model_)
        return interpret_string


def paramsToSet(nIter, random_state):
    paramsSet = []
    for _ in range(nIter):
        paramsSet.append(
            {"model_type": random_state.choice(["conjunction", "disjunction"]),
             "max_rules": random_state.randint(1, 15),
             "p": random_state.random_sample()})
    return paramsSet
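A short usage sketch for the new wrapper, assuming pyscm-ml is installed and that the module is importable from the package as shown; the import path, data and parameter values are illustrative, not taken from the commit:

import numpy as np
# Assumed import path; adjust to the actual package layout.
from summit.multiview_platform.monoview_classifiers.scm import SCM

rng = np.random.RandomState(42)
X = rng.rand(100, 6)
y = (X[:, 0] > 0.5).astype(int)  # a single threshold rule separates the classes

clf = SCM(random_state=42, model_type="conjunction", max_rules=5, p=0.1)
clf.fit(X, y)
print(clf.predict(X[:5]))
# Uniform importance spread over the features used by the learned rules,
# as computed in the fit() override above.
print(clf.feature_importances_)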
@@ -84,7 +84,7 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter,
         sample_errors = get_sample_errors(labels, result)
         feature_importances = get_feature_importances(result,
                                                       feature_ids=feature_ids,
-                                                      view_names=view_names)
+                                                      view_names=view_names,)
         durations = get_duration(result)
 
         directory = arguments["directory"]
@@ -98,7 +98,7 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter,
         publish_sample_errors(sample_errors, directory, database_name,
                               labels_names, sample_ids, labels)
         publish_feature_importances(feature_importances, directory,
-                                    database_name)
+                                    database_name, metric_scores=metrics_scores)
         plot_durations(durations, directory, database_name)
 
     iter_results["metrics_scores"][iter_index] = metrics_scores
@@ -129,7 +129,8 @@ def analyze_all(iter_results, stats_iter, directory, data_base_name,
     publish_all_sample_errors(error_analysis, directory, stats_iter,
                               sample_ids, labels, data_base_name, label_names)
     publish_feature_importances(feature_importances, directory,
-                                data_base_name, feature_importances_stds)
+                                data_base_name, feature_importances_stds,
+                                metric_scores=metrics_analysis)
     plot_durations(duration_means, directory, data_base_name, duration_stds)
     return results
...
@@ -7,7 +7,7 @@ import plotly
 
 from ..monoview.monoview_utils import MonoviewResult
 
-def get_feature_importances(result, feature_ids=None, view_names=None):
+def get_feature_importances(result, feature_ids=None, view_names=None,):
     r"""Extracts the feature importance from the monoview results and stores
     them in a dictionnary :
     feature_importance[view_name] is a pandas.DataFrame of size n_feature*n_clf
@@ -49,7 +49,7 @@ def get_feature_importances(result, feature_ids=None, view_names=None):
 
 def publish_feature_importances(feature_importances, directory, database_name,
-                                feature_stds=None):  # pragma: no cover
+                                feature_stds=None, metric_scores=None):  # pragma: no cover
     importance_dfs = []
     std_dfs = []
     if not os.path.exists(os.path.join(directory, "feature_importances")):
@@ -67,22 +67,22 @@ def publish_feature_importances(feature_importances, directory, database_name,
         importance_dfs.append(feature_importance.set_index(pd.Index([view_name+"-"+ind for ind in list(feature_importance.index)])))
-        importance_dfs.append(pd.DataFrame(index=[view_name+"-br"],
-                                           columns=feature_importance.columns,
-                                           data=np.zeros((1, len(
-                                               feature_importance.columns)))))
+        # importance_dfs.append(pd.DataFrame(index=[view_name+"-br"],
+        #                                    columns=feature_importance.columns,
+        #                                    data=np.zeros((1, len(
+        #                                        feature_importance.columns)))))
         std_dfs.append(feature_std.set_index(pd.Index([view_name+"-"+ind
                                                        for ind
                                                        in list(feature_std.index)])))
-        std_dfs.append(pd.DataFrame(index=[view_name + "-br"],
-                                    columns=feature_std.columns,
-                                    data=np.zeros((1, len(
-                                        feature_std.columns)))))
+        # std_dfs.append(pd.DataFrame(index=[view_name + "-br"],
+        #                             columns=feature_std.columns,
+        #                             data=np.zeros((1, len(
+        #                                 feature_std.columns)))))
     if len(importance_dfs)>0:
-        feature_importances_df = pd.concat(importance_dfs[:-1])
+        feature_importances_df = pd.concat(importance_dfs)
         feature_importances_df = feature_importances_df/feature_importances_df.sum(axis=0)
-        feature_std_df = pd.concat(std_dfs[:-1])
+        feature_std_df = pd.concat(std_dfs)
         if "mv" in feature_importances:
             feature_importances_df = pd.concat([feature_importances_df,feature_importances["mv"].loc[(feature_importances["mv"] != 0).any(axis=1), :]], axis=1).fillna(0)
             if feature_stds is not None:
@@ -94,10 +94,16 @@ def publish_feature_importances(feature_importances, directory, database_name,
                 feature_std_df = pd.concat([feature_std_df, fake], axis=1,).fillna(0)
 
     plot_feature_importances(os.path.join(directory, "feature_importances",
                                           database_name), feature_importances_df, feature_std_df)
+    if metric_scores is not None:
+        plot_feature_relevance(os.path.join(directory, "feature_importances",
+                                            database_name), feature_importances_df, feature_std_df, metric_scores)
 
 
 def plot_feature_importances(file_name, feature_importance,
                              feature_std):  # pragma: no cover
+    s = feature_importance.sum(axis=1)
+    s = s[s!=0]
+    feature_importance = feature_importance.loc[s.sort_values(ascending=False).index]
     feature_importance.to_csv(file_name + "_dataframe.csv")
     hover_text = [["-Feature :" + str(feature_name) +
                    "<br>-Classifier : " + classifier_name +
@@ -113,8 +119,8 @@ def plot_feature_importances(file_name, feature_importance,
         z=feature_importance.values,
         text=hover_text,
         hoverinfo=["text"],
-        colorscale="Greys",
-        reversescale=False))
+        colorscale="Hot",
+        reversescale=True))
     fig.update_layout(
         xaxis={"showgrid": False, "showticklabels": False, "ticks": ''},
         yaxis={"showgrid": False, "showticklabels": False, "ticks": ''})
@@ -123,3 +129,20 @@ def plot_feature_importances(file_name, feature_importance,
     plotly.offline.plot(fig, filename=file_name + ".html", auto_open=False)
     del fig
+
+
+def plot_feature_relevance(file_name, feature_importance,
+                           feature_std, metric_scores):  # pragma: no cover
+    for metric, score_df in metric_scores.items():
+        if metric.endswith("*"):
+            if isinstance(score_df, dict):
+                score_df = score_df["mean"]
+            for score in score_df.columns:
+                if len(score.split("-"))>1:
+                    algo, view = score.split("-")
+                    feature_importance[algo].loc[[ind for ind in feature_importance.index if ind.startswith(view)]] *= score_df[score]['test']
+                else:
+                    feature_importance[score] *= score_df[score]['test']
+    file_name += "_relevance"
+    plot_feature_importances(file_name, feature_importance,
+                             feature_std)
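The new plot_feature_relevance reuses plot_feature_importances after weighting each classifier's importances by its test score on the principal metric (the one whose name ends with "*"). A toy pandas illustration of that weighting, with made-up numbers rather than the platform's data structures:

import pandas as pd

feature_importance = pd.DataFrame({"adaboost": [0.7, 0.3], "scm": [0.2, 0.8]},
                                  index=["feat_0", "feat_1"])
# Stand-in for score_df[score]['test'] in the real code.
test_scores = pd.Series({"adaboost": 0.9, "scm": 0.6})
relevance = feature_importance * test_scores  # the Series aligns with the columns
print(relevance)
#         adaboost   scm
# feat_0      0.63  0.12
# feat_1      0.27  0.48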
@@ -79,6 +79,7 @@ class TestFunctions(unittest.TestCase):
                              'knn',
                              'lasso',
                              'random_forest',
+                             'scm',
                              'sgd',
                              'svm_linear',
                              'svm_poly',
@@ -90,6 +91,7 @@ class TestFunctions(unittest.TestCase):
                              'gradient_boosting',
                              'knn',
                              'random_forest',
+                             'scm',
                              'svm_linear',
                              'svm_poly',
                              'svm_rbf'])