Commit 71f781d4 authored by Baptiste Bauvin

Merge branch 'develop'

parents c1e977c7 4a2b2ceb
Pipeline #9531 passed with stages in 2 minutes and 11 seconds
......@@ -4,7 +4,9 @@ tests:
tags:
- docker
script:
- pip3 install -e . --no-deps
- export LC_ALL=$(locale -a | grep en_US)
- export LANG=$(locale -a | grep en_US)
- pip3 install -e .
- pytest-3
coverage: '/^TOTAL.+?(\d+\%)$/'
artifacts:
......
......@@ -57,6 +57,7 @@ And the following python modules will be automatically installed :
* `pyyaml <https://pypi.org/project/PyYAML/>`_ - Used to read the config files,
* `plotly <https://plot.ly/>`_ - Used to generate interactive HTML visuals,
* `tabulate <https://pypi.org/project/tabulate/>`_ - Used to generate the confusion matrix,
* `pyscm-ml <https://pypi.org/project/pyscm-ml/>`_ - Used for the Set Covering Machine (SCM) classifier.
Installing
......
......@@ -29,6 +29,7 @@ RUN apt-get install -y --no-install-recommends locales && \
update-locale en_US.UTF-8 && \
echo "export LC_ALL=$(locale -a | grep en_US)" >> /root/.bashrc && \
echo "export LANG=$(locale -a | grep en_US)" >> /root/.bashrc
ENV LANGUAGE='en_US:en'
COPY requirements.txt .
RUN pip3 install -r requirements.txt
......
......@@ -11,4 +11,5 @@ pyyaml>=3.12
plotly>=4.2.1
matplotlib>=3.1.1
tabulate>=0.8.6
pyscm-ml>=1.0.0
......@@ -64,11 +64,12 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier):
[step_pred for step_pred in self.staged_predict(X)])
return pred
def get_interpretation(self, directory, base_file_name, y_test,
def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
multi_class=False): # pragma: no cover
interpretString = ""
interpretString += self.get_feature_importance(directory,
base_file_name)
base_file_name,
feature_ids)
interpretString += "\n\n Estimator error | Estimator weight\n"
interpretString += "\n".join(
[str(error) + " | " + str(weight / sum(self.estimator_weights_)) for
......
......@@ -76,14 +76,15 @@ class GradientBoosting(GradientBoostingClassifier, BaseMonoviewClassifier):
[step_pred for step_pred in self.staged_predict(X)])
return pred
def get_interpretation(self, directory, base_file_name, y_test,
def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
multi_class=False):
interpretString = ""
if multi_class:
return interpretString
else:
interpretString += self.get_feature_importance(directory,
base_file_name)
base_file_name,
feature_ids)
step_test_metrics = np.array(
[self.plotted_metric.score(y_test, step_pred) for step_pred in
self.step_predictions])
......
......@@ -34,10 +34,11 @@ class RandomForest(RandomForestClassifier, BaseMonoviewClassifier):
["gini", "entropy"], [random_state]]
self.weird_strings = {}
def get_interpretation(self, directory, base_file_name, y_test,
def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
multiclass=False):
interpret_string = ""
interpret_string += self.get_feature_importance(directory,
base_file_name)
base_file_name,
feature_ids)
return interpret_string
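The three monoview wrappers above (Adaboost, GradientBoosting, RandomForest) all gain a feature_ids argument that they forward to get_feature_importance. That helper is defined outside this diff, so the snippet below is only a minimal sketch of the assumed contract: it maps the fitted estimator's feature_importances_ onto the readable feature ids and writes them next to the other interpretation files. The function name and CSV layout are illustrative, not the project's actual implementation.
import os
import pandas as pd

def get_feature_importance_sketch(estimator, directory, base_file_name, feature_ids):
    # Hypothetical stand-in for BaseMonoviewClassifier.get_feature_importance,
    # which is not shown in this diff: rank importances by feature id and dump them.
    importances = pd.Series(estimator.feature_importances_, index=feature_ids)
    importances = importances.sort_values(ascending=False)
    importances.to_csv(os.path.join(directory,
                                    base_file_name + "-feature_importances.csv"))
    return "Feature importances :\n" + "\n".join(
        "{} : {:.4f}".format(name, value) for name, value in importances.items())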
from pyscm.scm import SetCoveringMachineClassifier as scm
import numpy as np
from ..monoview.monoview_utils import BaseMonoviewClassifier
from ..utils.hyper_parameter_search import CustomRandint, CustomUniform
# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
# class Decis
classifier_class_name = "SCM"
class SCM(scm, BaseMonoviewClassifier):
"""
SCM Classifier
Parameters
----------
random_state : int, RandomState instance or None (default: None)
model_type : string (default: "conjunction")
max_rules : int, maximum number of rules (default: 10)
p : float (default: 0.1)
kwargs : other arguments
Attributes
----------
param_names
distribs
classed_params
weird_strings
"""
def __init__(self, random_state=None, model_type="conjunction",
max_rules=10, p=0.1, **kwargs):
"""
Parameters
----------
random_state
model_type
max_rules
p
kwargs
"""
super(SCM, self).__init__(
random_state=random_state,
model_type=model_type,
max_rules=max_rules,
p=p
)
self.param_names = ["model_type", "max_rules", "p", "random_state"]
self.distribs = [["conjunction", "disjunction"],
CustomRandint(low=1, high=15),
CustomUniform(loc=0, state=1), [random_state]]
self.classed_params = []
self.weird_strings = {}
def fit(self, X, y, tiebreaker=None, iteration_callback=None, **fit_params):
self.n_features = X.shape[1]
scm.fit(self, X, y, tiebreaker=tiebreaker, iteration_callback=iteration_callback, **fit_params)
self.feature_importances_ = np.zeros(self.n_features)
# Sum the rule importances:
# rules_importances = estim.get_rules_importances()  # activate when pyscm implements rule importances
rules_importances = np.ones(len(self.model_.rules))  # placeholder until pyscm implements rule importances
for rule, importance in zip(self.model_.rules, rules_importances):
self.feature_importances_[rule.feature_idx] += importance
self.feature_importances_ /= np.sum(self.feature_importances_)
return self
# def canProbas(self):
# """
# Used to know if the classifier can return label probabilities
#
# Returns
# -------
# return False in any case
# """
# return False
def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
multi_class=False):
interpret_string = self.get_feature_importance(directory,
base_file_name,
feature_ids)
interpret_string += "Model used : " + str(self.model_)
return interpret_string
def paramsToSet(nIter, random_state):
paramsSet = []
for _ in range(nIter):
paramsSet.append(
{"model_type": random_state.choice(["conjunction", "disjunction"]),
"max_rules": random_state.randint(1, 15),
"p": random_state.random_sample()})
return paramsSet
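A quick usage sketch for the new wrapper, on synthetic binary data (assumes pyscm-ml >= 1.0.0 from requirements.txt is installed; the data and parameter values are illustrative only):
import numpy as np

rng = np.random.RandomState(42)
X = (rng.rand(50, 20) > 0.5).astype(np.uint8)  # toy binary features
y = X[:, 0]                                    # label driven by one feature
clf = SCM(random_state=42, model_type="conjunction", max_rules=5, p=0.1)
clf.fit(X, y)
print(clf.predict(X[:5]))
print(clf.feature_importances_)  # uniform weight spread over the selected rules' features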
......@@ -84,7 +84,7 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter,
sample_errors = get_sample_errors(labels, result)
feature_importances = get_feature_importances(result,
feature_ids=feature_ids,
view_names=view_names)
view_names=view_names,)
durations = get_duration(result)
directory = arguments["directory"]
......@@ -98,7 +98,7 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter,
publish_sample_errors(sample_errors, directory, database_name,
labels_names, sample_ids, labels)
publish_feature_importances(feature_importances, directory,
database_name)
database_name, metric_scores=metrics_scores)
plot_durations(durations, directory, database_name)
iter_results["metrics_scores"][iter_index] = metrics_scores
......@@ -129,7 +129,8 @@ def analyze_all(iter_results, stats_iter, directory, data_base_name,
publish_all_sample_errors(error_analysis, directory, stats_iter,
sample_ids, labels, data_base_name, label_names)
publish_feature_importances(feature_importances, directory,
data_base_name, feature_importances_stds)
data_base_name, feature_importances_stds,
metric_scores=metrics_analysis)
plot_durations(duration_means, directory, data_base_name, duration_stds)
return results
......
......@@ -7,7 +7,7 @@ import plotly
from ..monoview.monoview_utils import MonoviewResult
def get_feature_importances(result, feature_ids=None, view_names=None):
def get_feature_importances(result, feature_ids=None, view_names=None,):
r"""Extracts the feature importance from the monoview results and stores
them in a dictionary:
feature_importance[view_name] is a pandas.DataFrame of size n_feature*n_clf
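For reference, the dictionary described here simply maps view names to DataFrames whose rows are features and whose columns are classifiers; a toy instance (all names illustrative) could look like:
import pandas as pd

feature_importances = {
    "ViewNumber0": pd.DataFrame(
        {"adaboost": [0.7, 0.3], "random_forest": [0.5, 0.5]},
        index=["feat_0", "feat_1"]),
}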
......@@ -49,7 +49,7 @@ def get_feature_importances(result, feature_ids=None, view_names=None):
def publish_feature_importances(feature_importances, directory, database_name,
feature_stds=None): # pragma: no cover
feature_stds=None, metric_scores=None): # pragma: no cover
importance_dfs = []
std_dfs = []
if not os.path.exists(os.path.join(directory, "feature_importances")):
......@@ -67,22 +67,22 @@ def publish_feature_importances(feature_importances, directory, database_name,
importance_dfs.append(feature_importance.set_index(pd.Index([view_name+"-"+ind for ind in list(feature_importance.index)])))
importance_dfs.append(pd.DataFrame(index=[view_name+"-br"],
columns=feature_importance.columns,
data=np.zeros((1, len(
feature_importance.columns)))))
# importance_dfs.append(pd.DataFrame(index=[view_name+"-br"],
# columns=feature_importance.columns,
# data=np.zeros((1, len(
# feature_importance.columns)))))
std_dfs.append(feature_std.set_index(pd.Index([view_name+"-"+ind
for ind
in list(feature_std.index)])))
std_dfs.append(pd.DataFrame(index=[view_name + "-br"],
columns=feature_std.columns,
data=np.zeros((1, len(
feature_std.columns)))))
# std_dfs.append(pd.DataFrame(index=[view_name + "-br"],
# columns=feature_std.columns,
# data=np.zeros((1, len(
# feature_std.columns)))))
if len(importance_dfs)>0:
feature_importances_df = pd.concat(importance_dfs[:-1])
feature_importances_df = pd.concat(importance_dfs)
feature_importances_df = feature_importances_df/feature_importances_df.sum(axis=0)
feature_std_df = pd.concat(std_dfs[:-1])
feature_std_df = pd.concat(std_dfs)
if "mv" in feature_importances:
feature_importances_df = pd.concat([feature_importances_df,feature_importances["mv"].loc[(feature_importances["mv"] != 0).any(axis=1), :]], axis=1).fillna(0)
if feature_stds is not None:
......@@ -94,10 +94,16 @@ def publish_feature_importances(feature_importances, directory, database_name,
feature_std_df = pd.concat([feature_std_df, fake], axis=1,).fillna(0)
plot_feature_importances(os.path.join(directory, "feature_importances",
database_name), feature_importances_df, feature_std_df)
if metric_scores is not None:
plot_feature_relevance(os.path.join(directory, "feature_importances",
database_name), feature_importances_df, feature_std_df, metric_scores)
def plot_feature_importances(file_name, feature_importance,
feature_std): # pragma: no cover
s = feature_importance.sum(axis=1)
s = s[s!=0]
feature_importance = feature_importance.loc[s.sort_values(ascending=False).index]
feature_importance.to_csv(file_name + "_dataframe.csv")
hover_text = [["-Feature :" + str(feature_name) +
"<br>-Classifier : " + classifier_name +
......@@ -113,8 +119,8 @@ def plot_feature_importances(file_name, feature_importance,
z=feature_importance.values,
text=hover_text,
hoverinfo=["text"],
colorscale="Greys",
reversescale=False))
colorscale="Hot",
reversescale=True))
fig.update_layout(
xaxis={"showgrid": False, "showticklabels": False, "ticks": ''},
yaxis={"showgrid": False, "showticklabels": False, "ticks": ''})
......@@ -123,3 +129,20 @@ def plot_feature_importances(file_name, feature_importance,
plotly.offline.plot(fig, filename=file_name + ".html", auto_open=False)
del fig
def plot_feature_relevance(file_name, feature_importance,
feature_std, metric_scores): # pragma: no cover
for metric, score_df in metric_scores.items():
if metric.endswith("*"):
if isinstance(score_df, dict):
score_df = score_df["mean"]
for score in score_df.columns:
if len(score.split("-"))>1:
algo, view = score.split("-")
feature_importance[algo].loc[[ind for ind in feature_importance.index if ind.startswith(view)]]*=score_df[score]['test']
else:
feature_importance[score] *= score_df[score]['test']
file_name+="_relevance"
plot_feature_importances(file_name, feature_importance,
feature_std)
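The weighting applied by plot_feature_relevance can be reproduced on toy data as below. The classifier, view and metric names are illustrative, and .loc[rows, col] is used here instead of the chained indexing above to avoid pandas chained-assignment warnings; the rescaling rule itself is the one from the diff.
import pandas as pd

# Toy importances: rows are "view-feature" ids, columns are classifier names.
feature_importance = pd.DataFrame(
    {"adaboost": [0.6, 0.4, 0.0],
     "weighted_linear_late_fusion": [0.2, 0.3, 0.5]},
    index=["ViewNumber0-feat_0", "ViewNumber0-feat_1", "ViewNumber1-feat_0"])

# Toy metric scores: only the metric flagged with a trailing "*" is used,
# and its 'test' row rescales the importances.
metric_scores = {"accuracy_score*": pd.DataFrame(
    {"adaboost-ViewNumber0": [0.90, 0.80],
     "weighted_linear_late_fusion": [0.95, 0.85]},
    index=["train", "test"])}

for metric, score_df in metric_scores.items():
    if not metric.endswith("*"):
        continue
    for score in score_df.columns:
        if len(score.split("-")) > 1:
            # Monoview score: scale the classifier's column on that view's rows.
            algo, view = score.split("-")
            rows = [ind for ind in feature_importance.index
                    if ind.startswith(view)]
            feature_importance.loc[rows, algo] *= score_df[score]["test"]
        else:
            # Multiview score: scale the whole classifier column.
            feature_importance[score] *= score_df[score]["test"]

print(feature_importance)  # adaboost scaled by 0.80 on ViewNumber0 rows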
......@@ -79,6 +79,7 @@ class TestFunctions(unittest.TestCase):
'knn',
'lasso',
'random_forest',
'scm',
'sgd',
'svm_linear',
'svm_poly',
......@@ -90,6 +91,7 @@ class TestFunctions(unittest.TestCase):
'gradient_boosting',
'knn',
'random_forest',
'scm',
'svm_linear',
'svm_poly',
'svm_rbf'])