Commit 88347a54 authored 4 years ago by Baptiste Bauvin

Added scm bagging

parent 78c2a85d

1 changed file: summit/multiview_platform/monoview_classifiers/scm_bagging.py (+303, -0)

summit/multiview_platform/monoview_classifiers/scm_bagging.py  0 → 100644
import numbers
from warnings import warn

import numpy as np
from pyscm import SetCoveringMachineClassifier
from sklearn.base import ClassifierMixin
from sklearn.ensemble import BaseEnsemble
from sklearn.metrics import accuracy_score
from sklearn.utils import check_random_state
from sklearn.utils.random import sample_without_replacement

from ..monoview.monoview_utils import BaseMonoviewClassifier
from summit.multiview_platform.utils.hyper_parameter_search import \
    CustomRandint, CustomUniform

# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype"  # Production, Development, Prototype

classifier_class_name = "ScmBaggingClassifier"

MAX_INT = np.iinfo(np.int32).max
class ScmBaggingClassifier(BaseEnsemble, ClassifierMixin,
                           BaseMonoviewClassifier):
    """A bagging classifier for SetCoveringMachineClassifier().

    The base estimators are built on subsets of both samples and features.

    Parameters
    ----------
    n_estimators : int, default=10
        The number of base estimators in the ensemble.
    max_samples : int or float, default=1.0
        The number of samples to draw from X to train each base estimator,
        without replacement.
        - If int, then draw `max_samples` samples.
        - If float, then draw `max_samples * X.shape[0]` samples.
    max_features : int or float, default=1.0
        The number of features to draw from X to train each base estimator,
        without replacement.
        - If int, then draw `max_features` features.
        - If float, then draw `max_features * X.shape[1]` features.
    max_rules : int, default=10
        The maximum number of rules in each base SetCoveringMachineClassifier.
    p_options : list of float with len <= n_estimators, default=[1.0]
        The estimators will be fitted with the values of p found in
        p_options. Let k = n_estimators // len(p_options): the first k
        estimators will have p=p_options[0], the next k estimators will have
        p=p_options[1], and so on.
    model_type : str, default="conjunction"
        The model type of the base estimators, either "conjunction" or
        "disjunction".
    random_state : int or RandomState, default=None
        Controls the random resampling of the original dataset
        (sample wise and feature wise).
        If the base estimator accepts a `random_state` attribute, a different
        seed is generated for each instance in the ensemble.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    n_features_ : int
        The number of features when :meth:`fit` is performed.
    estimators_ : list of estimators
        The collection of fitted base estimators.
    estim_features : list of arrays
        The subset of drawn features for each base estimator.

    Examples
    --------
    >>>
    @TODO

    References
    ----------
    .. [1] L. Breiman, "Pasting small votes for classification in large
           databases and on-line", Machine Learning, 36(1), 85-103, 1999.
    .. [2] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine
           Learning and Knowledge Discovery in Databases, 346-361, 2012.
    """
    def __init__(self,
                 n_estimators=10,
                 max_samples=1.0,
                 max_features=1.0,
                 max_rules=10,
                 p_options=[1.0],
                 model_type="conjunction",
                 random_state=None):
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.max_rules = max_rules
        self.p_options = p_options
        self.model_type = model_type
        self.random_state = random_state
        self.labels_to_binary = {}
        self.binary_to_labels = {}
        self.param_names = ["n_estimators", "max_rules", "max_samples",
                            "max_features", "model_type", "random_state"]
        self.classed_params = []
        self.distribs = [CustomRandint(low=1, high=300),
                         CustomRandint(low=1, high=20),
                         CustomUniform(),
                         CustomUniform(),
                         ["conjunction", "disjunction"],
                         [random_state]]
        self.weird_strings = {}
    def p_for_estimators(self):
        """Return the value of p for each estimator to fit."""
        options_len = len(self.p_options)  # number of options
        # number of estimators to fit with the same p
        estims_with_same_p = self.n_estimators // options_len
        p_of_estims = []
        if options_len > 1:
            for k in range(options_len - 1):
                opt = self.p_options[k]  # an option
                # estims_with_same_p estimators with p=opt
                p_of_estims = p_of_estims + ([opt] * estims_with_same_p)
        # the remaining estimators all get the last option
        p_of_estims = p_of_estims + ([self.p_options[-1]] *
                                     (self.n_estimators - len(p_of_estims)))
        return p_of_estims
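    # Worked example of the scheme above: with n_estimators=10 and
    # p_options=[0.5, 1.0, 2.0], estims_with_same_p = 10 // 3 = 3, so
    # p_for_estimators() returns [0.5]*3 + [1.0]*3 + [2.0]*4; the last
    # option absorbs the remainder of the integer division.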
    def get_estimators(self):
        """Return the list of estimators of the classifier."""
        if hasattr(self, 'estimators'):
            return self.estimators
        else:
            return "not defined (model not fitted)"
    def get_hyperparams(self):
        """Return the hyperparameters that were set."""
        hyperparams = {'n_estimators': self.n_estimators,
                       'max_samples': self.max_samples,
                       'max_features': self.max_features,
                       'max_rules': self.max_rules,
                       'p_options': self.p_options,
                       'model_type': self.model_type,
                       'random_state': self.random_state}
        return hyperparams
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
    def labels_conversion(self, labels_list):
        # sort so the label-to-binary mapping is deterministic across runs
        labels = sorted(set(labels_list))
        labels_dict = {c: idx for idx, c in enumerate(labels)}
        if len(labels) < 2:
            raise ValueError("Only 1 class given to the model, needs 2")
        elif len(labels) > 2:
            raise ValueError("{} classes were given, multiclass prediction "
                             "is not implemented".format(len(labels)))
        return labels_dict
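    # Example: labels_conversion(["yes", "no", "yes"]) returns
    # {"no": 0, "yes": 1} (sorted order), so fit() can binarize arbitrary
    # labels and predict() can map the binary votes back to the originals.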
    def fit(self, X, y):
        # Check that 2 classes are given and convert labels to binary labels
        self.labels_to_binary = self.labels_conversion(y)
        self.binary_to_labels = {bin_label: str_label
                                 for str_label, bin_label
                                 in self.labels_to_binary.items()}
        y = np.array([self.labels_to_binary[label] for label in y])
        estimators = []
        self.estim_features = []
        max_rules = self.max_rules
        p_of_estims = self.p_for_estimators()
        model_type = self.model_type

        # seeds for reproducibility
        random_state = self.random_state
        random_state = check_random_state(random_state)
        seeds = random_state.randint(MAX_INT, size=self.n_estimators)
        self._seeds = seeds

        pop_samples, pop_features = X.shape
        max_samples, max_features = self.max_samples, self.max_features

        # validate max_samples
        if not isinstance(max_samples, numbers.Integral):
            max_samples = int(max_samples * pop_samples)
        if not (0 < max_samples <= pop_samples):
            raise ValueError("max_samples must be in (0, n_samples)")
        # store validated integer row sampling values
        self._max_samples = max_samples
        self._pop_samples = pop_samples

        # validate max_features
        if isinstance(self.max_features, numbers.Integral):
            max_features = self.max_features
        elif isinstance(self.max_features, float):
            max_features = self.max_features * pop_features
        else:
            raise ValueError("max_features must be int or float")
        if not (0 < max_features <= pop_features):
            raise ValueError("max_features must be in (0, n_features)")
        max_features = max(1, int(max_features))
        # store validated integer feature sampling values
        self._max_features = max_features
        self._pop_features = pop_features

        for k in range(self.n_estimators):
            p_param = p_of_estims[k]  # p param for the classifier to fit
            random_state = seeds[k]
            estim = SetCoveringMachineClassifier(p=p_param,
                                                 max_rules=max_rules,
                                                 model_type=model_type,
                                                 random_state=random_state)
            feature_indices = sample_without_replacement(
                pop_features, max_features, random_state=random_state)
            samples_indices = sample_without_replacement(
                pop_samples, max_samples, random_state=random_state)
            Xk = (X[samples_indices])[:, feature_indices]
            yk = y[samples_indices]
            if len(list(set(yk))) < 2:
                raise ValueError("One of the subsamples contains elements "
                                 "from only 1 class, try increasing "
                                 "max_samples")
            estim.fit(Xk, yk)
            estimators.append(estim)
            self.estim_features.append(feature_indices)
        self.estimators = estimators
        # return self to follow the scikit-learn fit() convention
        return self
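    # Example of the subsampling above: with X of shape (200, 30),
    # max_samples=0.8 and max_features=0.5, each estimator is fitted on
    # 160 rows and 15 columns, both drawn without replacement using that
    # estimator's own seed from self._seeds.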
    def predict(self, X):
        results = []
        for (est, features_idx) in zip(self.estimators, self.estim_features):
            # each estimator only sees the features it was trained on
            res = est.predict(X[:, features_idx])
            results.append(res)
        results = np.array(results)
        # majority vote: average the binary predictions over estimators,
        # then round (ties at 0.5 round to the nearest even value, i.e. 0)
        votes = np.mean(results, axis=0)
        predictions = np.array(np.round(votes, 0), dtype=int)
        predictions = np.array([self.binary_to_labels[label]
                                for label in predictions])
        return predictions
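    # Example: with 3 estimators voting [1, 0, 1] for a given sample,
    # np.mean gives 0.667 and np.round maps it to 1, so the ensemble
    # predicts the original label mapped to binary label 1.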
    def predict_proba(self, X):
        """Predict class probabilities.

        Parameters
        ----------
        X : array-like, shape=(n_examples, n_features)
            The features of the input examples.

        Returns
        -------
        p : array of shape = [n_examples, 2]
            The class probabilities for each example. Classes are ordered by
            lexicographic order.
        """
        warn("ScmBaggingClassifier does not support probabilistic "
             "predictions. The returned values will be zero or one.",
             RuntimeWarning)
        # X = check_array(X) # TODO: check this
        # note: this assumes the original labels are numeric (0/1);
        # string labels would make the subtraction below fail
        pos_proba = self.predict(X)
        neg_proba = 1.0 - pos_proba
        return np.hstack((neg_proba.reshape(-1, 1),
                          pos_proba.reshape(-1, 1)))
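    # Example: if predict(X) returns [1, 0], then pos_proba=[1, 0],
    # neg_proba=[0, 1], and predict_proba returns [[0., 1.], [1., 0.]]
    # (column 0: negative class, column 1: positive class).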
    def decision_rules(self):
        # @TODO : overview of the most important decision rules over estimators
        pass
    def features_importance(self):
        """Compute feature importances from the estimators' rules.

        Returns
        -------
        importances : dict (feature id as key, importance as value)
            The mean importance of each feature over the estimators.
        """
        importances = {}  # sum of the feature/rule importances
        # number of occurrences of a feature in the subsamples
        feature_id_occurrences = {}
        for (estim, features_idx) in zip(self.estimators, self.estim_features):
            # increment the total occurrences of the feature:
            for id_feat in features_idx:
                if id_feat in feature_id_occurrences:
                    feature_id_occurrences[id_feat] += 1
                else:
                    feature_id_occurrences[id_feat] = 1
            # sum the rules importances:
            # rules_importances = estim.get_rules_importances()  # activate when pyscm implements importances
            rules_importances = np.ones(len(estim.model_.rules))  # delete when pyscm implements importances
            for rule, importance in zip(estim.model_.rules, rules_importances):
                global_feat_id = features_idx[rule.feature_idx]
                if global_feat_id in importances:
                    importances[global_feat_id] += importance
                else:
                    importances[global_feat_id] = importance
        # normalize by how often each feature was drawn
        importances = {k: round(v / feature_id_occurrences[k], 3)
                       for k, v in importances.items()}
        return importances
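    # Example: with the placeholder importance of 1 per rule, a feature that
    # appears in 2 rules and was drawn into 4 subsamples gets an importance
    # of round(2 / 4, 3) = 0.5 in the returned dict.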
    def get_estimators_indices(self):
        # get the drawn indices along both the sample and feature axes
        for seed in self._seeds:
            # operations accessing random_state must be performed identically
            # to those in `fit`
            feature_indices = sample_without_replacement(
                self._pop_features, self._max_features, random_state=seed)
            samples_indices = sample_without_replacement(
                self._pop_samples, self._max_samples, random_state=seed)
            yield samples_indices, feature_indices
    def score(self, X, y):
        return accuracy_score(y, self.predict(X))
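Below is a minimal usage sketch of the new classifier (an illustration, not part of the commit). It assumes pyscm and the summit package are installed and importable; the synthetic data and all parameter values are made up for the example.

import numpy as np
from summit.multiview_platform.monoview_classifiers.scm_bagging import \
    ScmBaggingClassifier

# Synthetic binary classification data (illustrative only).
rng = np.random.RandomState(42)
X = rng.randint(0, 2, size=(200, 30))
y = rng.randint(0, 2, size=200)

clf = ScmBaggingClassifier(n_estimators=10,
                           max_samples=0.8,       # 160 of 200 rows per estimator
                           max_features=0.5,      # 15 of 30 columns per estimator
                           max_rules=5,
                           p_options=[0.5, 1.0],  # first 5 estimators use p=0.5
                           model_type="conjunction",
                           random_state=42)
clf.fit(X, y)
print(clf.score(X, y))            # training accuracy
print(clf.features_importance())  # mean rule importance per drawn feature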