Commit 71f781d4 authored by Baptiste Bauvin

Merge branch 'develop'

parents c1e977c7 4a2b2ceb
Pipeline #9531 passed
@@ -4,7 +4,9 @@ tests:
  tags:
    - docker
  script:
    - pip3 install -e . --no-deps
    - export LC_ALL=$(locale -a | grep en_US)
    - export LANG=$(locale -a | grep en_US)
    - pip3 install -e .
    - pytest-3
  coverage: '/^TOTAL.+?(\d+\%)$/'
  artifacts:
@@ -57,6 +57,7 @@ And the following python modules will be automatically installed:
* `pyyaml <https://pypi.org/project/PyYAML/>`_ - Used to read the config files,
* `plotly <https://plot.ly/>`_ - Used to generate interactive HTML visuals,
* `tabulate <https://pypi.org/project/tabulate/>`_ - Used to generate the confusion matrix.
* `pyscm-ml <https://pypi.org/project/pyscm-ml/>`_ - Used for the SCM (Set Covering Machine) classifier.
Installing
@@ -29,6 +29,7 @@ RUN apt-get install -y --no-install-recommends locales && \
    update-locale en_US.UTF-8 && \
    echo "export LC_ALL=$(locale -a | grep en_US)" >> /root/.bashrc && \
    echo "export LANG=$(locale -a | grep en_US)" >> /root/.bashrc
ENV LANGUAGE='en_US:en'
COPY requirements.txt .
RUN pip3 install -r requirements.txt
@@ -11,4 +11,5 @@ pyyaml>=3.12
plotly>=4.2.1
matplotlib>=3.1.1
tabulate>=0.8.6
pyscm-ml>=1.0.0
@@ -64,11 +64,12 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier):
            [step_pred for step_pred in self.staged_predict(X)])
        return pred

    def get_interpretation(self, directory, base_file_name, y_test,
    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
                           multi_class=False):  # pragma: no cover
        interpretString = ""
        interpretString += self.get_feature_importance(directory,
                                                       base_file_name)
                                                       base_file_name,
                                                       feature_ids)
        interpretString += "\n\n Estimator error | Estimator weight\n"
        interpretString += "\n".join(
            [str(error) + " | " + str(weight / sum(self.estimator_weights_)) for
@@ -76,14 +76,15 @@ class GradientBoosting(GradientBoostingClassifier, BaseMonoviewClassifier):
            [step_pred for step_pred in self.staged_predict(X)])
        return pred

    def get_interpretation(self, directory, base_file_name, y_test,
    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
                           multi_class=False):
        interpretString = ""
        if multi_class:
            return interpretString
        else:
            interpretString += self.get_feature_importance(directory,
                                                           base_file_name)
                                                           base_file_name,
                                                           feature_ids)
            step_test_metrics = np.array(
                [self.plotted_metric.score(y_test, step_pred) for step_pred in
                 self.step_predictions])
@@ -34,10 +34,11 @@ class RandomForest(RandomForestClassifier, BaseMonoviewClassifier):
                         ["gini", "entropy"], [random_state]]
        self.weird_strings = {}

    def get_interpretation(self, directory, base_file_name, y_test,
    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
                           multiclass=False):
        interpret_string = ""
        interpret_string += self.get_feature_importance(directory,
                                                        base_file_name)
                                                        base_file_name,
                                                        feature_ids)
        return interpret_string
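The same feature_ids argument is added to the Adaboost, GradientBoosting and RandomForest wrappers above, and is simply forwarded to get_feature_importance so that reports can name features instead of indexing them. A minimal illustrative call, assuming the RandomForest wrapper is importable; the data, output path, base file name and feature names below are made up:

import os
import numpy as np

# Toy data and a scratch output directory (illustrative only).
rng = np.random.RandomState(0)
X_train, y_train = rng.rand(60, 4), rng.randint(0, 2, 60)
X_test, y_test = rng.rand(20, 4), rng.randint(0, 2, 20)
out_dir = "results/rf/"
os.makedirs(out_dir, exist_ok=True)

clf = RandomForest(random_state=42)  # wrapper defined above, default hyper-parameters assumed
clf.fit(X_train, y_train)
report = clf.get_interpretation(
    directory=out_dir,
    base_file_name="rf-",
    y_test=y_test,
    feature_ids=["feat_{}".format(i) for i in range(X_train.shape[1])])
print(report)

The commit also adds a new monoview wrapper around pyscm's SetCoveringMachineClassifier, shown below.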
from pyscm.scm import SetCoveringMachineClassifier as scm
import numpy as np

from ..monoview.monoview_utils import BaseMonoviewClassifier
from ..utils.hyper_parameter_search import CustomRandint, CustomUniform

# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype"  # Production, Development, Prototype

classifier_class_name = "SCM"


class SCM(scm, BaseMonoviewClassifier):
    """
    SCM Classifier

    Parameters
    ----------
    random_state : int, RandomState instance or None (default: None)
    model_type : string, "conjunction" or "disjunction" (default: "conjunction")
    max_rules : int, maximum number of rules (default: 10)
    p : float (default: 0.1)
    kwargs : other arguments

    Attributes
    ----------
    param_names
    distribs
    classed_params
    weird_strings
    """

    def __init__(self, random_state=None, model_type="conjunction",
                 max_rules=10, p=0.1, **kwargs):
        """
        Parameters
        ----------
        random_state
        model_type
        max_rules
        p
        kwargs
        """
        super(SCM, self).__init__(
            random_state=random_state,
            model_type=model_type,
            max_rules=max_rules,
            p=p
        )
        self.param_names = ["model_type", "max_rules", "p", "random_state"]
        self.distribs = [["conjunction", "disjunction"],
                         CustomRandint(low=1, high=15),
                         CustomUniform(loc=0, state=1), [random_state]]
        self.classed_params = []
        self.weird_strings = {}

    def fit(self, X, y, tiebreaker=None, iteration_callback=None, **fit_params):
        self.n_features = X.shape[1]
        scm.fit(self, X, y, tiebreaker=None, iteration_callback=None, **fit_params)
        self.feature_importances_ = np.zeros(self.n_features)
        # Sum the rules' importances:
        # rules_importances = estim.get_rules_importances()  # activate it when pyscm implements importances
        rules_importances = np.ones(len(
            self.model_.rules))  # delete it when pyscm implements importances
        for rule, importance in zip(self.model_.rules, rules_importances):
            self.feature_importances_[rule.feature_idx] += importance
        self.feature_importances_ /= np.sum(self.feature_importances_)
        return self

    # def canProbas(self):
    #     """
    #     Used to know if the classifier can return label probabilities
    #
    #     Returns
    #     -------
    #     return False in any case
    #     """
    #     return False

    def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
                           multi_class=False):
        interpret_string = self.get_feature_importance(directory,
                                                       base_file_name,
                                                       feature_ids)
        interpret_string += "Model used : " + str(self.model_)
        return interpret_string


def paramsToSet(nIter, random_state):
    paramsSet = []
    for _ in range(nIter):
        paramsSet.append(
            {"model_type": random_state.choice(["conjunction", "disjunction"]),
             "max_rules": random_state.randint(1, 15),
             "p": random_state.random_sample()})
    return paramsSet
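A minimal usage sketch of this SCM wrapper, assuming pyscm-ml is installed and the class above is importable; the data is synthetic and purely illustrative:

import numpy as np

# Synthetic binary classification data (illustrative only).
rng = np.random.RandomState(42)
X = rng.rand(100, 10)
y = rng.randint(0, 2, 100)

clf = SCM(random_state=42, model_type="conjunction", max_rules=5, p=0.1)
clf.fit(X, y)
print(clf.predict(X[:5]))
# Until pyscm exposes per-rule importances, fit() above counts each learned rule equally,
# so this is a uniform distribution over the features used by the model:
print(clf.feature_importances_)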
@@ -84,7 +84,7 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter,
sample_errors = get_sample_errors(labels, result)
feature_importances = get_feature_importances(result,
feature_ids=feature_ids,
view_names=view_names)
view_names=view_names,)
durations = get_duration(result)
directory = arguments["directory"]
@@ -98,7 +98,7 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter,
publish_sample_errors(sample_errors, directory, database_name,
labels_names, sample_ids, labels)
publish_feature_importances(feature_importances, directory,
database_name)
database_name, metric_scores=metrics_scores)
plot_durations(durations, directory, database_name)
iter_results["metrics_scores"][iter_index] = metrics_scores
@@ -129,7 +129,8 @@ def analyze_all(iter_results, stats_iter, directory, data_base_name,
publish_all_sample_errors(error_analysis, directory, stats_iter,
sample_ids, labels, data_base_name, label_names)
publish_feature_importances(feature_importances, directory,
data_base_name, feature_importances_stds)
data_base_name, feature_importances_stds,
metric_scores=metrics_analysis)
plot_durations(duration_means, directory, data_base_name, duration_stds)
return results
@@ -7,7 +7,7 @@ import plotly
from ..monoview.monoview_utils import MonoviewResult
def get_feature_importances(result, feature_ids=None, view_names=None):
def get_feature_importances(result, feature_ids=None, view_names=None,):
r"""Extracts the feature importance from the monoview results and stores
them in a dictionary:
feature_importance[view_name] is a pandas.DataFrame of size n_feature*n_clf
@@ -49,7 +49,7 @@ def get_feature_importances(result, feature_ids=None, view_names=None):
def publish_feature_importances(feature_importances, directory, database_name,
feature_stds=None): # pragma: no cover
feature_stds=None, metric_scores=None): # pragma: no cover
importance_dfs = []
std_dfs = []
if not os.path.exists(os.path.join(directory, "feature_importances")):
@@ -67,22 +67,22 @@ def publish_feature_importances(feature_importances, directory, database_name,
importance_dfs.append(feature_importance.set_index(pd.Index([view_name+"-"+ind for ind in list(feature_importance.index)])))
importance_dfs.append(pd.DataFrame(index=[view_name+"-br"],
columns=feature_importance.columns,
data=np.zeros((1, len(
feature_importance.columns)))))
# importance_dfs.append(pd.DataFrame(index=[view_name+"-br"],
# columns=feature_importance.columns,
# data=np.zeros((1, len(
# feature_importance.columns)))))
std_dfs.append(feature_std.set_index(pd.Index([view_name+"-"+ind
for ind
in list(feature_std.index)])))
std_dfs.append(pd.DataFrame(index=[view_name + "-br"],
columns=feature_std.columns,
data=np.zeros((1, len(
feature_std.columns)))))
# std_dfs.append(pd.DataFrame(index=[view_name + "-br"],
# columns=feature_std.columns,
# data=np.zeros((1, len(
# feature_std.columns)))))
if len(importance_dfs)>0:
feature_importances_df = pd.concat(importance_dfs[:-1])
feature_importances_df = pd.concat(importance_dfs)
feature_importances_df = feature_importances_df/feature_importances_df.sum(axis=0)
feature_std_df = pd.concat(std_dfs[:-1])
feature_std_df = pd.concat(std_dfs)
if "mv" in feature_importances:
feature_importances_df = pd.concat([feature_importances_df,feature_importances["mv"].loc[(feature_importances["mv"] != 0).any(axis=1), :]], axis=1).fillna(0)
if feature_stds is not None:
@@ -94,10 +94,16 @@ def publish_feature_importances(feature_importances, directory, database_name,
feature_std_df = pd.concat([feature_std_df, fake], axis=1,).fillna(0)
plot_feature_importances(os.path.join(directory, "feature_importances",
database_name), feature_importances_df, feature_std_df)
if metric_scores is not None:
plot_feature_relevance(os.path.join(directory, "feature_importances",
database_name), feature_importances_df, feature_std_df, metric_scores)
def plot_feature_importances(file_name, feature_importance,
feature_std): # pragma: no cover
s = feature_importance.sum(axis=1)
s = s[s!=0]
feature_importance = feature_importance.loc[s.sort_values(ascending=False).index]
feature_importance.to_csv(file_name + "_dataframe.csv")
hover_text = [["-Feature :" + str(feature_name) +
"<br>-Classifier : " + classifier_name +
@@ -113,8 +119,8 @@ def plot_feature_importances(file_name, feature_importance,
z=feature_importance.values,
text=hover_text,
hoverinfo=["text"],
colorscale="Greys",
reversescale=False))
colorscale="Hot",
reversescale=True))
fig.update_layout(
xaxis={"showgrid": False, "showticklabels": False, "ticks": ''},
yaxis={"showgrid": False, "showticklabels": False, "ticks": ''})
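For reference, a standalone preview of the new colour choice on toy data; this is not the project's plotting code, and the file name is arbitrary:

import numpy as np
import plotly
import plotly.graph_objects as go

# 5 features x 3 classifiers of fake importances, just to preview the "Hot" palette.
fig = go.Figure(data=go.Heatmap(z=np.random.rand(5, 3),
                                colorscale="Hot",
                                reversescale=True))  # reversed so high values render dark
plotly.offline.plot(fig, filename="colorscale_preview.html", auto_open=False)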
@@ -123,3 +129,20 @@ def plot_feature_importances(file_name, feature_importance,
    plotly.offline.plot(fig, filename=file_name + ".html", auto_open=False)
    del fig


def plot_feature_relevance(file_name, feature_importance,
                           feature_std, metric_scores):  # pragma: no cover
    for metric, score_df in metric_scores.items():
        if metric.endswith("*"):
            if isinstance(score_df, dict):
                score_df = score_df["mean"]
            for score in score_df.columns:
                if len(score.split("-")) > 1:
                    algo, view = score.split("-")
                    feature_importance[algo].loc[[ind for ind in feature_importance.index if ind.startswith(view)]] *= score_df[score]['test']
                else:
                    feature_importance[score] *= score_df[score]['test']
    file_name += "_relevance"
    plot_feature_importances(file_name, feature_importance,
                             feature_std)
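To make the new weighting step concrete: judging from the code above, metric_scores maps a metric name to a DataFrame (or a {"mean": DataFrame} dict), only the metric whose name ends with "*" is used, its columns are either plain classifier names or "classifier-view" pairs, its index contains "test" (and presumably "train"), and each importance column is scaled by the corresponding test score. A small sketch with made-up names and values:

import numpy as np
import pandas as pd

# Fake importances: rows are "view-feature" ids, columns are classifier names.
feature_importance = pd.DataFrame(
    np.random.rand(4, 2),
    index=["view0-f0", "view0-f1", "view1-f0", "view1-f1"],
    columns=["adaboost", "scm"])
feature_std = pd.DataFrame(0.0, index=feature_importance.index,
                           columns=feature_importance.columns)

# Fake scores for the metric of interest (flagged with a trailing "*").
metric_scores = {"accuracy_score*": pd.DataFrame(
    [[0.9, 0.8], [0.7, 0.6]], index=["train", "test"],
    columns=["adaboost", "scm"])}

plot_feature_relevance("demo", feature_importance, feature_std, metric_scores)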
@@ -79,6 +79,7 @@ class TestFunctions(unittest.TestCase):
'knn',
'lasso',
'random_forest',
'scm',
'sgd',
'svm_linear',
'svm_poly',
@@ -90,6 +91,7 @@ class TestFunctions(unittest.TestCase):
'gradient_boosting',
'knn',
'random_forest',
'scm',
'svm_linear',
'svm_poly',
'svm_rbf'])