Skip to content
Snippets Groups Projects
Commit 6b1a836a authored by Baptiste Bauvin's avatar Baptiste Bauvin
Browse files

Added scm_bagging from package and with mincq

parent f99e1ce5
No related branches found
No related tags found
No related merge requests found
# The base configuration of the benchmark
log: True
name: ["ionosphere"]
label: "_"
name: ["demo"]
label: "_1_3"
file_type: ".hdf5"
views:
pathf: "/home/baptiste/Documents/Datasets/UCI/both"
pathf: "/home/baptiste/Documents/Datasets/Generated/"
nice: 0
random_state: 42
nb_cores: 1
......@@ -20,18 +20,20 @@ multiclass_method: "oneVersusOne"
split: 0.75
nb_folds: 5
nb_class: 2
classes:
type: ["monoview"]
algos_monoview: ["scm_bagging", "scm", "cb_boost", "adaboost"]
algos_multiview: ["group_scm"]
stats_iter: 2
classes: ['label_1', 'label_3']
type: ["multiview", "monoview"]
algos_monoview: ["cb_boost",]
algos_multiview: ["multiview_cbound_boosting"]
stats_iter: 5
metrics:
accuracy_score: {}
f1_score:
average: 'binary'
metric_princ: "f1_score"
metric_princ: "accuracy_score"
hps_type: "None"
hps_args: {}
cb_boost:
n_stumps: 10
multiview_cbound_boosting:
n_stumps: 10
\ No newline at end of file
import numpy as np
from pyscm.scm import SetCoveringMachineClassifier as scm
from scm_bagging.scm_bagging_classifier import ScmBaggingClassifier
from ..monoview.monoview_utils import BaseMonoviewClassifier
......@@ -9,25 +8,15 @@ from summit.multiview_platform.utils.hyper_parameter_search import CustomUniform
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
classifier_class_name = "ScmBaggingClassifier"
from sklearn.base import ClassifierMixin
from sklearn.ensemble import BaseEnsemble
from pyscm import SetCoveringMachineClassifier
classifier_class_name = "ScmBagging"
from sklearn.utils import check_random_state
from sklearn.utils.random import sample_without_replacement
from sklearn.metrics import accuracy_score
import numbers
import numpy as np
from six import iteritems
from warnings import warn
import logging
MAX_INT = np.iinfo(np.int32).max
class ScmBaggingClassifier(BaseEnsemble, ClassifierMixin, BaseMonoviewClassifier):
class ScmBagging(ScmBaggingClassifier, BaseMonoviewClassifier):
"""A Bagging classifier. for SetCoveringMachineClassifier()
The base estimators are built on subsets of both samples
and features.
......@@ -86,18 +75,20 @@ class ScmBaggingClassifier(BaseEnsemble, ClassifierMixin, BaseMonoviewClassifier
max_rules=10,
p_options=[0.316],
model_type="conjunction",
min_cq_combination=False,
min_cq_mu=10e-3,
random_state=None):
if isinstance(p_options, float):
p_options = [p_options]
self.n_estimators = n_estimators
self.max_samples = max_samples
self.max_features = max_features
self.max_rules = max_rules
self.p_options = p_options
self.model_type = model_type
self.random_state = random_state
self.labels_to_binary = {}
self.binary_to_labels = {}
ScmBaggingClassifier.__init__(self, n_estimators=n_estimators,
max_samples=max_samples,
max_features=max_features,
max_rules=max_rules,
p_options=p_options,
model_type=model_type,
min_cq_combination=min_cq_combination,
min_cq_mu=min_cq_mu,
random_state=random_state)
self.param_names = ["n_estimators", "max_rules", "max_samples", "max_features", "model_type", "p_options", "random_state"]
self.classed_params = []
self.distribs = [CustomRandint(low=1, high=300), CustomRandint(low=1, high=20),
......@@ -112,213 +103,6 @@ class ScmBaggingClassifier(BaseEnsemble, ClassifierMixin, BaseMonoviewClassifier
setattr(self, parameter, value)
return self
def p_for_estimators(self):
    """Return the value of ``p`` assigned to each estimator to fit.

    Every option of ``self.p_options`` except the last one is given
    ``self.n_estimators // len(self.p_options)`` estimators; the last
    option receives all the remaining estimators, so the returned list
    always has ``self.n_estimators`` entries.

    Returns
    -------
    list
        One ``p`` value per estimator, grouped by option in the order of
        ``self.p_options``.
    """
    n_options = len(self.p_options)
    # Number of estimators sharing each of the leading options.
    share = self.n_estimators // n_options
    assigned = []
    for option in self.p_options[:-1]:
        assigned.extend([option] * share)
    # The last option absorbs whatever is left (including the remainder).
    leftover = self.n_estimators - len(assigned)
    assigned.extend([self.p_options[-1]] * leftover)
    return assigned
def get_estimators(self):
    """Return the list of fitted estimators of the classifier.

    When the ``estimators`` attribute does not exist yet, a
    human-readable placeholder string is returned instead of a list.
    """
    return getattr(self, 'estimators', "not defined (model not fitted)")
def get_hyperparams(self):
    """Return the hyper-parameters currently set on the classifier.

    Returns
    -------
    dict
        Maps each hyper-parameter name to the value of the attribute of
        the same name on this instance.
    """
    names = (
        'n_estimators',
        'max_samples',
        'max_features',
        'max_rules',
        'p_options',
        'model_type',
        'random_state',
    )
    return {name: getattr(self, name) for name in names}
# def set_params(self, **parameters):
# for parameter, value in iteritems(parameters):
# setattr(self, parameter, value)
# return self
def labels_conversion(self, labels_list):
    """Build the mapping from the two original labels to binary labels.

    The distinct labels are sorted before being numbered, so the mapping
    is deterministic: iterating a ``set`` has no guaranteed order, which
    could otherwise flip the 0/1 assignment between runs (e.g. for
    string labels under hash randomisation).

    Parameters
    ----------
    labels_list : iterable of hashable
        The labels of the training examples.

    Returns
    -------
    dict
        Maps each original label to ``0`` or ``1``; the smaller label
        (in sort order) gets ``0``.

    Raises
    ------
    ValueError
        If fewer or more than two distinct labels are given.
    """
    distinct = set(labels_list)
    if len(distinct) < 2:
        raise ValueError("Only 1 classe given to the model, needs 2")
    if len(distinct) > 2:
        raise ValueError(
            "{} classes were given, multiclass prediction is not implemented".format(
                len(distinct)))
    try:
        ordered = sorted(distinct)
    except TypeError:
        # Labels of mutually unorderable types: fall back to a stable
        # textual ordering rather than failing.
        ordered = sorted(distinct, key=repr)
    return {label: idx for idx, label in enumerate(ordered)}
def fit(self, X, y):
    """Fit one SetCoveringMachineClassifier per random sub-sample of ``X``.

    Each estimator is trained on ``max_samples`` rows and
    ``max_features`` columns drawn without replacement, using its own
    seed derived from ``self.random_state``.

    Parameters
    ----------
    X : 2-D array supporting ``X[rows][:, cols]`` indexing
        Training examples.
    y : iterable
        Training labels; exactly two distinct values are required.

    Returns
    -------
    self
        The fitted classifier (the original returned ``None``; returning
        ``self`` allows call chaining and does not affect callers that
        ignore the result).

    Raises
    ------
    ValueError
        If ``max_samples`` / ``max_features`` are out of range or of an
        unsupported type, if ``y`` does not hold exactly two classes, or
        if a sub-sample contains a single class.
    """
    # Check that 2 classes are given and convert labels to binary labels.
    self.labels_to_binary = self.labels_conversion(y)
    self.binary_to_labels = {bin_label: label for label, bin_label
                             in self.labels_to_binary.items()}
    y = np.array([self.labels_to_binary[label] for label in y])

    self.n_features = X.shape[1]
    p_of_estims = self.p_for_estimators()

    # One seed per estimator, for reproducibility.
    rng = check_random_state(self.random_state)
    seeds = rng.randint(MAX_INT, size=self.n_estimators)
    self._seeds = seeds

    pop_samples, pop_features = X.shape

    # Validate max_samples: a non-integer value is a fraction of the rows.
    max_samples = self.max_samples
    if not isinstance(max_samples, numbers.Integral):
        max_samples = int(max_samples * pop_samples)
    if not (0 < max_samples <= pop_samples):
        raise ValueError("max_samples must be in (0, n_samples)")
    # Store validated integer row sampling values.
    self._max_samples = max_samples
    self._pop_samples = pop_samples

    # Validate max_features. ``np.float`` was removed from NumPy, so the
    # float check relies on the builtin ``float`` and ``np.floating``.
    if isinstance(self.max_features, numbers.Integral):
        max_features = self.max_features
    elif isinstance(self.max_features, (float, np.floating)):
        max_features = self.max_features * pop_features
    else:
        raise ValueError("max_features must be int or float")
    if not (0 < max_features <= pop_features):
        raise ValueError("max_features must be in (0, n_features)")
    max_features = max(1, int(max_features))
    # Store validated integer feature sampling values.
    self._max_features = max_features
    self._pop_features = pop_features

    estimators = []
    estim_features = []
    for p_param, seed in zip(p_of_estims, seeds):
        estim = SetCoveringMachineClassifier(p=p_param,
                                             max_rules=self.max_rules,
                                             model_type=self.model_type,
                                             random_state=seed)
        # Features are drawn before samples, both from the same seed;
        # get_estimators_indices replays the draws in this order.
        feature_indices = sample_without_replacement(pop_features,
                                                     max_features,
                                                     random_state=seed)
        samples_indices = sample_without_replacement(pop_samples,
                                                     max_samples,
                                                     random_state=seed)
        Xk = (X[samples_indices])[:, feature_indices]
        yk = y[samples_indices]
        if np.unique(yk).size < 2:
            raise ValueError(
                "One of the subsamples contains elements from only 1 class, try increase max_samples value")
        estim.fit(Xk, yk)
        estimators.append(estim)
        estim_features.append(feature_indices)

    self.estim_features = estim_features
    self.estimators = estimators
    return self
def predict(self, X):
    """Predict the label of each example by majority vote.

    Every estimator predicts on its own feature subset of ``X``; the
    binary votes are averaged, rounded with ``np.round`` and mapped back
    to the original labels through ``self.binary_to_labels``.

    Parameters
    ----------
    X : 2-D array supporting ``X[:, cols]`` indexing
        Examples to classify.

    Returns
    -------
    numpy.ndarray
        The predicted original labels, one per example.
    """
    per_estimator = np.array([
        estimator.predict(X[:, columns])
        for estimator, columns in zip(self.estimators, self.estim_features)
    ])
    mean_vote = np.mean(per_estimator, axis=0)
    binary_votes = np.array(np.round(mean_vote, 0), dtype=int)
    return np.array([self.binary_to_labels[vote] for vote in binary_votes])
def predict_proba(self, X):
    """
    Predict class "probabilities" (hard 0/1 values).

    ``predict`` returns the original labels, so they are first mapped
    back to their binary encoding through ``self.labels_to_binary``.
    Computing ``1.0 - labels`` directly only worked when the original
    labels were already 0 and 1.

    Parameters:
    -----------
    X: array-like, shape=(n_examples, n_features)
        The feature of the input examples.

    Returns:
    --------
    p : array of shape = [n_examples, 2]
        Column 0 is 1.0 when the example is predicted as the class
        encoded as binary 0, column 1 is 1.0 when it is predicted as the
        class encoded as binary 1.
    """
    warn(
        "ScmBaggingClassifier do not support probabilistic predictions. The returned values will be zero or one.",
        RuntimeWarning)
    # X = check_array(X)  # TODO: check this
    predicted_labels = self.predict(X)
    pos_proba = np.array(
        [self.labels_to_binary[label] for label in predicted_labels],
        dtype=float)
    neg_proba = 1.0 - pos_proba
    return np.hstack((neg_proba.reshape(-1, 1), pos_proba.reshape(-1, 1)))
def decision_rules(self):
    """Placeholder: not implemented yet, always returns ``None``."""
    # @TODO : overview of the most important decision rules over estimators
    return None
def features_importance(self):
    """
    Compute features importances in estimators rules.

    Every rule currently counts for an importance of 1 (placeholder
    until pyscm exposes rule importances). For each feature, the summed
    importance is divided by the number of estimators whose feature
    subset contained it, then rounded to 3 decimals.

    Also sets ``self.feature_importances_``: the per-feature importances
    (0 for features used by no rule) normalised to sum to 1. When no
    rule exists at all the vector is left as all zeros instead of
    dividing by zero.

    Returns:
    --------
    importances : dict (feature id as key, importance as value)
        The mean importance of each feature over the estimators.
    """
    importances = {}  # sum of the feature/rule importances
    feature_id_occurences = {}  # number of occurences of a feature in subsamples
    for estim, features_idx in zip(self.estimators, self.estim_features):
        # Count in how many sub-samples each feature was available.
        for id_feat in features_idx:
            feature_id_occurences[id_feat] = feature_id_occurences.get(id_feat, 0) + 1
        rules = estim.model_.rules
        # rules_importances = estim.get_rules_importances()  # activate it when pyscm will implement importance
        rules_importances = np.ones(len(rules))  # delete it when pyscm will implement importance
        for rule, importance in zip(rules, rules_importances):
            # rule.feature_idx is local to the estimator's feature subset.
            global_feat_id = features_idx[rule.feature_idx]
            importances[global_feat_id] = importances.get(global_feat_id, 0) + importance
    importances = {k: round(v / feature_id_occurences[k], 3)
                   for k, v in importances.items()}
    # Force a float array so the in-place normalisation below is valid
    # even when every entry is the integer 0.
    self.feature_importances_ = np.array(
        [importances.get(k, 0) for k in range(self.n_features)],
        dtype=float)
    total = np.sum(self.feature_importances_)
    if total > 0:
        self.feature_importances_ /= total
    return importances
def get_estimators_indices(self):
    """Yield, for each stored seed, the sample indices drawn with it.

    The draws are replayed in the same order as in ``fit`` (features
    first, then samples, both seeded with the estimator's seed); only
    the sample indices are yielded.
    """
    for estimator_seed in self._seeds:
        # Feature draw kept so the sequence of calls mirrors 'fit'.
        _unused_features = sample_without_replacement(self._pop_features,
                                                      self._max_features,
                                                      random_state=estimator_seed)
        drawn_samples = sample_without_replacement(self._pop_samples,
                                                   self._max_samples,
                                                   random_state=estimator_seed)
        yield drawn_samples
def score(self, X, y):
    """Return the accuracy of ``self.predict(X)`` against the labels ``y``."""
    predicted = self.predict(X)
    return accuracy_score(y, predicted)
def get_interpretation(self, directory, base_file_name, y_test,
multi_class=False):
self.features_importance()
......
from .scm_bagging import ScmBagging
from ..utils.hyper_parameter_search import CustomUniform, CustomRandint
classifier_class_name = "ScmBaggingMinCq"
class ScmBaggingMinCq(ScmBagging):
    """ScmBagging variant with ``min_cq_combination`` enabled by default.

    All constructor arguments are forwarded unchanged to ``ScmBagging``;
    in addition ``min_cq_mu`` is registered in ``param_names`` with a
    matching entry in ``distribs``.
    """

    def __init__(self,
                 n_estimators=10,
                 max_samples=1.0,
                 max_features=1.0,
                 max_rules=10,
                 p_options=[0.316],
                 model_type="conjunction",
                 min_cq_combination=True,
                 min_cq_mu=10e-3,
                 random_state=None):
        forwarded = dict(
            n_estimators=n_estimators,
            max_samples=max_samples,
            max_features=max_features,
            max_rules=max_rules,
            p_options=p_options,
            model_type=model_type,
            min_cq_combination=min_cq_combination,
            min_cq_mu=min_cq_mu,
            random_state=random_state,
        )
        ScmBagging.__init__(self, **forwarded)
        # Expose min_cq_mu alongside the parent's hyper-parameters.
        # NOTE(review): presumably CustomRandint(1, 7, multiplier='e-')
        # samples values of the form 1e-k -- confirm against its definition.
        mu_distribution = CustomRandint(1, 7, multiplier='e-')
        self.param_names.append("min_cq_mu")
        self.distribs.append(mu_distribution)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment