Commit 85f372eb authored by Baptiste Bauvin

Merged

parent 71f781d4

Showing 2559 additions and 884 deletions
.gitlab-ci.yml:

@@ -6,6 +6,8 @@ tests:
   script:
     - export LC_ALL=$(locale -a | grep en_US)
     - export LANG=$(locale -a | grep en_US)
+    - pip3 install --upgrade pip
+    - pip3 -V
     - pip3 install -e .
     - pytest-3
   coverage: '/^TOTAL.+?(\d+\%)$/'
@@ -24,6 +26,7 @@ doc:
   script:
     - export LC_ALL=$(locale -a | grep en_US)
     - export LANG=$(locale -a | grep en_US)
+    - pip3 install --upgrade pip
    - pip3 install -e .[doc]
     - sphinx-apidoc -o docs/source summit
     - cd docs/source
@@ -45,6 +48,7 @@ pages:
   script:
     - export LC_ALL=$(locale -a | grep en_US)
     - export LANG=$(locale -a | grep en_US)
+    - pip3 install --upgrade pip
     - pip3 install -e .[doc]
     - pytest-3
     - sphinx-apidoc -o docs/source summit
@@ -57,7 +57,9 @@ And the following python modules will be automatically installed:
 * `pyyaml <https://pypi.org/project/PyYAML/>`_ - Used to read the config files,
 * `plotly <https://plot.ly/>`_ - Used to generate interactive HTML visuals,
 * `tabulate <https://pypi.org/project/tabulate/>`_ - Used to generate the confusion matrix.
-* `pyscm-ml <https://pypi.org/project/pyscm-ml/>`_ -
+* `pyscm-ml <https://pypi.org/project/pyscm-ml/>`_ - SCM python implementation
+* `randomscm <https://github.com/thibgo/randomscm>`_ - Random SCM python implementation
+* `imbalance-bagging <https://imbalanced-learn.org/stable>`_ - Imbalanced learning library

 Installing
# The base configuration of the benchmark
log: True
name: ['tnbc_mazid']
label: ""
file_type: ".hdf5"
views:
pathf: "/home/baptiste/Documents/Datasets/Mazid/"
nice: 0
random_state: 42
nb_cores: 1
full: True
debug: True
add_noise: False
noise_std: 0.0
res_dir: "../results/"
track_tracebacks: True

# All the classification-related configuration options
multiclass_method: "oneVersusOne"
split: 0.30
nb_folds: 5
nb_class: 2
classes:
type: ["monoview","multiview"]
algos_monoview: ["samba", "scm_bagging", "random_forest", "adaboost", 'scm']
algos_multiview: ["early_fusion_adaboost", "early_fusion_decision_tree", "early_fusion_random_forest", "early_fusion_samba"]
stats_iter: 5
metrics:
  balanced_accuracy: {}
  f1_score:
    average: 'micro'
  accuracy_score: {}
metric_princ: "balanced_accuracy"
hps_type: "Random"
hps_args:
  n_iter: 20
  equivalent_draws: False
svm_rbf:
  C: 0.7
scm_bagging:
  {max_features: 0.908115713423863, max_rules: 9, max_samples: 0.9277949143533335,
   model_type: conjunction, n_estimators: 109, p_options: 0.7823433255515356}
samba:
  n_estimators: 22
adaboost:
  {base_estimator: DecisionTreeClassifier, base_estimator__ccp_alpha: 0.0, base_estimator__class_weight: null,
   base_estimator__criterion: gini, base_estimator__max_depth: 5, base_estimator__max_features: null,
   base_estimator__max_leaf_nodes: null, base_estimator__min_impurity_decrease: 0.0,
   base_estimator__min_impurity_split: null, base_estimator__min_samples_leaf: 1, base_estimator__min_samples_split: 2,
   base_estimator__min_weight_fraction_leaf: 0.0, base_estimator__random_state: null,
   base_estimator__splitter: best, n_estimators: 354}
svm_linear:
  C: 0.3867
cb_boost:
  n_stumps: 1
  n_max_iterations: 20
  estimators_generator: "Stumps"
cq_boost:
  n_max_iterations: 10
  n_stumps: 1
min_cq:
  n_stumps_per_attribute: 1
decision_tree:
  {criterion: entropy, max_depth: 271, splitter: random}
early_fusion_adaboost:
  {base_estimator: DecisionTreeClassifier, base_estimator__ccp_alpha: 0.0, base_estimator__class_weight: null,
   base_estimator__criterion: gini, base_estimator__max_depth: 5, base_estimator__max_features: null,
   base_estimator__max_leaf_nodes: null, base_estimator__min_impurity_decrease: 0.0,
   base_estimator__min_impurity_split: null, base_estimator__min_samples_leaf: 1, base_estimator__min_samples_split: 2,
   base_estimator__min_weight_fraction_leaf: 0.0, base_estimator__random_state: null,
   base_estimator__splitter: best, base_estimator_config: null, n_estimators: 273}
early_fusion_decision_tree:
  {criterion: entropy, max_depth: 293, splitter: random}
early_fusion_random_forest:
  {criterion: gini, max_depth: 8, n_estimators: 46}
random_forest:
  {criterion: gini, max_depth: 8, n_estimators: 32}
weighted_linear_late_fusion:
  classifier_configs:
    - decision_tree: {criterion: entropy, max_depth: 112, splitter: random}
    - adaboost: {base_estimator: DecisionTreeClassifier, base_estimator__ccp_alpha: 0.0,
        base_estimator__class_weight: null, base_estimator__criterion: gini, base_estimator__max_depth: 2,
        base_estimator__max_features: null, base_estimator__max_leaf_nodes: null, base_estimator__min_impurity_decrease: 0.0,
        base_estimator__min_impurity_split: null, base_estimator__min_samples_leaf: 1,
        base_estimator__min_samples_split: 2, base_estimator__min_weight_fraction_leaf: 0.0,
        base_estimator__random_state: null, base_estimator__splitter: best, n_estimators: 400}
  classifiers_names: [decision_tree, adaboost]
  nb_cores: 1
  rs: 724
  weights: [0.9636627605010293, 0.3834415188257777]
scm:
  {max_rules: 10, model_type: conjunction, p: 0.8310271995093625}
mumbo:
  base_estimator:
    - svm_rbf:
        C: 0.001
    - svm_rbf:
        C: 0.001
    - decision_tree:
        max_depth: 1
    - decision_tree:
        max_depth: 1
  n_estimators: 100
mv_cb_boost:
  n_estimators: 100
  base_estimator: ["Stumps", "Stumps", "Stumps", "Stumps"]
  base_estimator__n_stumps: [50, 50, 50, 50]
  base_estimator__check_diff: False
  base_estimator__C: 0.001
  base_estimator__kernel: "rbf"
  base_estimator__max_depth: 2
  base_estimator__distribution_type: "uniform"
  base_estimator__low: 0
  base_estimator__high: 10
  base_estimator__attributes_ratio: 0.5
  base_estimator__examples_ratio: 0.55
early_fusion_cb:
  monoview_classifier_config:
    cb_boost:
      n_estimators: 100
      base_estimator__max_depth: 1
early_fusion_dt:
  monoview_classifier_config:
    decision_tree:
      max_depth: 2
early_fusion_rf:
  monoview_classifier_config:
    random_forest:
      n_estimators: 100
      max_depth: 1
early_fusion_svm:
  monoview_classifier_config:
    svm_rbf:
      C: 0.7

#pb_mv_boost:
#  num_iterations: 20
#  decision_tree_depth: 1
#weighted_linear_early_fusion:
#  monoview_classifier_name: "cb_boost"
#  monoview_classifier_config:
#    cb_boost:
#      n_stumps: 30
#      n_max_iterations: 20
#      estimators_generator: "Trees"
#      max_depth: 1
#weighted_linear_late_fusion:
#  classifiers_names: ["cb_boost", "cb_boost", "cb_boost", "cb_boost"]
#  classifier_configs:
#    - cb_boost:
#        n_stumps: 30
#        n_max_iterations: 20
#        estimators_generator: "Trees"
#        max_depth: 1
#    - cb_boost:
#        n_stumps: 30
#        n_max_iterations: 20
#        estimators_generator: "Trees"
#        max_depth: 1
#    - cb_boost:
#        n_stumps: 30
#        n_max_iterations: 20
#        estimators_generator: "Trees"
#        max_depth: 1
#    - cb_boost:
#        n_stumps: 30
#        n_max_iterations: 20
#        estimators_generator: "Trees"
#        max_depth: 1
#
# The base configuration of the benchmark
log: True
name: ["multiview_mnist"]
label: "_"
file_type: ".hdf5"
views:
pathf: "examples/data/"
nice: 0
random_state: 43
nb_cores: 1
full: True
debug: True
add_noise: False
noise_std: 0.0
res_dir: "../results/"
track_tracebacks: False

# All the classification-related configuration options
multiclass_method: "oneVersusOne"
split: 0.96
nb_folds: 5
nb_class:
classes:
type: ["monoview","multiview"]
algos_monoview: ["decision_tree","adaboost"]
algos_multiview: ["mumbo","mvml", 'lp_norm_mkl', 'mucombo', 'early_fusion_decision_tree', 'early_fusion_adaboost']
stats_iter: 1
metrics:
  accuracy_score: {}
  f1_score: {}
metric_princ: "accuracy_score"
hps_type: "None"
hps_args:
  n_iter: 2
mumbo:
  base_estimator:
    decision_tree:
      max_depth: 3
\ No newline at end of file
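For reference, configs like the two above are plain YAML; a minimal sketch of reading one with pyyaml (listed among the dependencies as the config-file reader). The file name "config.yml" is only a placeholder:

# Minimal sketch: load a benchmark config like the ones above with pyyaml.
import yaml

with open("config.yml") as config_file:
    config = yaml.safe_load(config_file)

# Top-level keys become plain Python values and dicts:
print(config["name"])      # e.g. ["multiview_mnist"]
print(config["hps_type"])  # e.g. "None"
print(config["mumbo"])     # nested classifier configuration as a dict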
@@ -106,9 +106,27 @@ adaboost:
   n_estimators: 50
 ######################################
 ## The Monoview Classifier arguments #
 ######################################
+mumbo:
+  base_estimator__criterion: 'gini'
+  base_estimator__max_depth: 3
+  base_estimator__random_state: None
+  base_estimator__splitter: 'best'
+  best_view_mode: 'edge'
+  base_estimator: 'decision_tree'
+  n_estimators: 10
+mucombo:
+  base_estimator__criterion: 'gini'
+  base_estimator__max_depth: 3
+  base_estimator__random_state: None
+  base_estimator__splitter: 'best'
+  best_view_mode: 'edge'
+  base_estimator: 'decision_tree'
+  n_estimators: 10
 #
 #random_forest:
 #    n_estimators: [25]
requirements.txt:

@@ -12,4 +12,5 @@ plotly>=4.2.1
 matplotlib>=3.1.1
 tabulate>=0.8.6
 pyscm-ml>=1.0.0
+git+https://github.com/thibgo/randomscm/archive/refs/tags/v0.0.0-alpha.zip
setup.py:

 # -*- coding: utf-8 -*-
-# Extracting requirements from requirements.txt
-with open('requirements.txt') as f:
-    requirements = f.read().splitlines()
+# with open('requirements.txt') as f:
+#     requirements = f.read().splitlines()
 # from Cython.Build import cythonize
 from setuptools import setup, find_packages

-# This is just a function call. But it is a veeeery long one
-# and it takes a lot of parameters
 def setup_package():
     setup(
-        # the name of your library, as it will appear on pypi
         name='summit',
-        # the code version
         version=0.0,
         python_requires = '>=3.5',
-        # Lists the packages to include in the distribution.
-        # Rather than doing it by hand, we use setuptools'
-        # find_packages() function, which recursively finds all the
-        # python packages in the current directory.
-        # This is why everything was put in a single directory:
-        # it makes using this function easy.
         packages=find_packages(),
-        # your name
         author="Baptiste Bauvin",
-        # Your email; note that it will be publicly visible, with all
-        # the risks that implies.
         author_email="baptiste.bauvin@lis-lab.fr",
-        # A short description
         description="Supervised MultiModal Integration Tool",
-        # A long description, displayed to present the lib.
-        # Usually the README is dumped here.
         long_description=open('README.rst').read(),
-        # You can add a list of dependencies for your lib,
-        # and even pin versions. At install time, Python will try to
-        # download and install them.
-        #
-        # Ex: ["gunicorn", "docutils >= 0.3", "lxml==0.5a7"]
-        #
-        # In our case we don't need it, so I comment it out, but I
-        # leave it so you know it exists, as it is very useful.
-        # install_requires= ,
-        # Enables taking the MANIFEST.in file into account
         include_package_data=True,
-        # dependency_links=['https://github.com/aldro61/pyscm.git#egg=pyscm'],
-        # A url pointing to the official page of your lib
         url='http://gitlab.lis-lab.fr/baptiste.bauvin/summit/',
-        install_requires=requirements,
+        install_requires=['h5py>=2.9.0', 'joblib>=0.13.2', 'numpy>=1.16.4',
+                          'pyparsing>=2.4.0', 'python-dateutil>=2.8.0',
+                          'scikit-learn>=0.19.0', 'scipy>=1.3.0', 'six>=1.12.0',
+                          'pandas>=0.23.3', 'pyyaml>=3.12', 'plotly>=4.2.1',
+                          'matplotlib>=3.1.1', 'tabulate>=0.8.6', 'pyscm-ml>=1.0.0',
+                          "randomscm @ git+https://github.com/thibgo/randomscm.git#egg=randomscm",
+                          "imbalanced-learn"],
         extras_require={
             'dev': ['pytest', 'pytest-cov'],
             'doc': ['sphinx >= 3.0.2', 'numpydoc', 'docutils', 'sphinx-autoapi',
                     'sphinx_rtd_theme']},
-        # It is customary to add some metadata about your lib
-        # so that robots can classify it easily.
-        # The list of allowed classifiers is long:
-        # https://pypi.python.org/pypi?%3Aaction=list_classifiers.
-        #
-        # There is no real rule for the content. Everyone does it a bit
-        # as they see fit. Some put nothing.
         classifiers=[
             "Programming Language :: Python",
             "Development Status :: 1 - Planning",

@@ -77,27 +40,8 @@ def setup_package():
             "Programming Language :: Python :: 2/3",
             "Topic :: Machine Learning",
         ],
-        # It is a plugin system, but it is used almost exclusively
-        # to create commands, like "django-admin".
-        # For example, to create the fabulous "proclame-sm" command, we
-        # would point that name to the proclamer() function. The command
-        # would be created automatically.
-        # The syntax is "command-name-to-create = package.module:function".
-        # entry_points={
-        #     'console_scripts': [
-        #         'exec_multiview = summit.execute:exec',
-        #     ],
-        # },
-        # Only to be provided if your license is not listed in
-        # "classifiers", which is our case
         license="GNUGPL",
-        # There are plenty of other possible parameters, but with these
-        # you cover 90% of the needs
-        # ext_modules=cythonize(
-        #     "summit/multiview_platform/monoview/additions/_custom_criterion.pyx"),
     )


 if __name__ == "__main__":
summit/__init__.py:

 __version__ = "0.0.0.0"
+__url__ = "https://gitlab.lis-lab.fr/baptiste.bauvin/summit"

 from . import multiview_platform, execute
@@ -27,7 +27,7 @@ res_dir: "examples/results/example_0/"
 # If an error occurs in a classifier, if track_tracebacks is set to True, the
 # benchmark saves the traceback and continues, if it is set to False, it will
 # stop the benchmark and raise the error
-track_tracebacks: True
+track_tracebacks: False

 # All the classification-related configuration options

@@ -40,14 +40,14 @@ nb_class:
 # The name of the classes to select in the dataset
 classes:
 # The type of algorithms to run during the benchmark (monoview and/or multiview)
-type: ["monoview","multiview"]
+cl_type: ["monoview","multiview"]
 # The name of the monoview algorithms to run, ["all"] to run all the available classifiers
 algos_monoview: ["decision_tree", "adaboost"]
 # The names of the multiview algorithms to run, ["all"] to run all the available classifiers
 algos_multiview: ["early_fusion_decision_tree", "early_fusion_adaboost", "weighted_linear_late_fusion",]
 # The number of times the benchmark is repeated with different train/test
 # split, to have more statistically significant results
-stats_iter: 1
+stats_iter: 2
 # The metrics that will be used in the result analysis
 metrics:
   accuracy_score: {}
# The base configuration of the benchmark
# Enable logging
log: True
# The name of each dataset in the directory on which the benchmark should be run
name: "multiview_mnist"
# A label for the result directory
label: "mnist"
# The type of dataset, currently supported ".hdf5", and ".csv"
file_type: ".hdf5"
# The views to use in the benchmark, an empty value will result in using all the views
views:
# The path to the directory where the datasets are stored, an absolute path is advised
pathf: "examples/data/"
# The niceness of the processes, useful to lower their priority
nice: 0
# The random state of the benchmark, useful for reproducibility
random_state: 42
# The number of parallel computing threads
nb_cores: 4
# Used to run the benchmark on the full dataset
full: True
# Used to be able to run more than one benchmark per minute
debug: False
# The directory in which the results will be stored, an absolute path is advised
res_dir: "examples/results/example_3/"
# If an error occurs in a classifier, if track_tracebacks is set to True, the
# benchmark saves the traceback and continues, if it is set to False, it will
# stop the benchmark and raise the error
track_tracebacks: True

# All the classification-related configuration options
# If the dataset is multiclass, will use this multiclass-to-biclass method
multiclass_method: "oneVersusOne"
# The ratio of the number of test examples to the number of train samples
split: 0.8
# The number of folds in the cross validation process when hyper-parameter optimization is performed
nb_folds: 5
# The number of classes to select in the dataset
nb_class: 2
# The name of the classes to select in the dataset
classes:
# The type of algorithms to run during the benchmark (monoview and/or multiview)
type: ["monoview","multiview"]
# The name of the monoview algorithms to run, ["all"] to run all the available classifiers
algos_monoview: ["decision_tree", "adaboost", ]
# The names of the multiview algorithms to run, ["all"] to run all the available classifiers
algos_multiview: ["early_fusion_decision_tree", "early_fusion_adaboost"]
# The number of times the benchmark is repeated with different train/test
# split, to have more statistically significant results
stats_iter: 5
# The metrics that will be used in the result analysis
metrics:
  accuracy_score: {}
  f1_score:
    average: "micro"
# The metric that will be used in the hyper-parameter optimization process
metric_princ: "accuracy_score"
# The type of hyper-parameter optimization method
hps_type: 'Random'
# The number of iterations in the hyper-parameter optimization process
hps_args:
  n_iter: 10
decision_tree:
  max_depth: 3
adaboost:
  base_estimator: "DecisionTreeClassifier"
  n_estimators: 10
weighted_linear_late_fusion:
  classifiers_names: "decision_tree"
  classifier_configs:
    decision_tree:
      max_depth: 2
# The following arguments are classifier-specific, and are documented in each
# of the corresponding modules.
# In order to run multiple sets of parameters, use multiple values in the
# following lists, and set hps_type to None.
summit/execute.py:

@@ -8,7 +8,7 @@ def execute(config_path=None):  # pragma: no cover
     from summit.multiview_platform import exec_classif
     if config_path is None:
-        exec_classif.exec_classif(sys.argv[1:])
+        sum = exec_classif.Summit(config_path=sys.argv[1:])
     else:
         if config_path == "example 0":
             config_path = os.path.join(

@@ -59,7 +59,8 @@ def execute(config_path=None):  # pragma: no cover
                 "examples",
                 "config_files",
                 "config_example_3.yml")
-        exec_classif.exec_classif(["--config_path", config_path])
+        sum = exec_classif.Summit(["--config_path", config_path])
+        sum.exec_classif()


 if __name__ == "__main__":
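The two changed call sites above replace the exec_classif.exec_classif(...) function call with a Summit object. A minimal sketch of the new calling convention, inferred solely from this diff (the actual Summit signature lives in summit.multiview_platform.exec_classif):

# Sketch of the object-based entry point introduced by this diff; the
# config path is an example, any benchmark config file would do.
from summit.multiview_platform import exec_classif

summit_run = exec_classif.Summit(
    ["--config_path", "examples/config_files/config_example_0.yml"])
summit_run.exec_classif()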
"""Functions :
score: to get the accuracy score
get_scorer: returns a sklearn scorer for grid search
"""
from sklearn.metrics import balanced_accuracy_score as metric
from sklearn.metrics import make_scorer
# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
def score(y_true, y_pred, multiclass=False, **kwargs):
"""Arguments:
y_true: real labels
y_pred: predicted labels
Keyword Arguments:
"0": weights to compute accuracy
Returns:
Weighted accuracy score for y_true, y_pred"""
score = metric(y_true, y_pred, **kwargs)
return score
def get_scorer(**kwargs):
"""Keyword Arguments:
"0": weights to compute accuracy
Returns:
A weighted sklearn scorer for accuracy"""
return make_scorer(metric, greater_is_better=True,
**kwargs)
def get_config(**kwargs):
config_string = "Balanced accuracy score using {}, (higher is better)".format(
kwargs)
return config_string
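A quick standalone check of what the module above wraps: balanced accuracy is the mean of per-class recalls, so sklearn alone illustrates the value score() would return. Toy labels only:

from sklearn.metrics import balanced_accuracy_score

y_true = [0, 0, 0, 1, 1]
y_pred = [0, 0, 1, 1, 1]
# Recall of class 0 is 2/3, recall of class 1 is 1,
# so balanced accuracy = (2/3 + 1) / 2 = 0.8333...
print(balanced_accuracy_score(y_true, y_pred))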
@@ -7,7 +7,10 @@ __status__ = "Prototype"  # Production, Development, Prototype


 def score(y_true, y_pred, multiclass=False, **kwargs):
-    score = metric(y_true, y_pred, **kwargs)
+    try:
+        score = metric(y_true, y_pred, **kwargs)
+    except:
+        score = 0.0
     return score
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix as metric

# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype"  # Production, Development, Prototype


def score(y_true, y_pred, **kwargs):
    # Specificity: true negatives / (true negatives + false positives),
    # read off the first row of the confusion matrix.
    score = metric(y_true, y_pred, **kwargs)
    if score[0, 0] + score[0, 1] != 0:
        return score[0, 0] / (score[0, 0] + score[0, 1])
    else:
        return 0


def get_scorer(**kwargs):
    return make_scorer(score, greater_is_better=True, **kwargs)


def get_config(**kwargs):
    configString = "Specificity score using {}, (higher is better)".format(
        kwargs)
    return configString
\ No newline at end of file
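A standalone worked example of the formula in score() above: specificity is TN / (TN + FP), taken from the first row of the confusion matrix. Toy labels only:

from sklearn.metrics import confusion_matrix

y_true = [0, 0, 0, 1, 1]
y_pred = [0, 1, 0, 1, 1]
cm = confusion_matrix(y_true, y_pred)
# cm[0, 0] = 2 true negatives, cm[0, 1] = 1 false positive,
# so specificity = 2 / (2 + 1) = 0.6667, matching score() above.
print(cm[0, 0] / (cm[0, 0] + cm[0, 1]))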
import numpy as np
from six import iteritems
from randomscm.randomscm import RandomScmClassifier

from ..monoview.monoview_utils import BaseMonoviewClassifier
from summit.multiview_platform.utils.hyper_parameter_search import CustomUniform, CustomRandint

# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype"  # Production, Development, Prototype

classifier_class_name = "ScmBaggingMinCq"

MAX_INT = np.iinfo(np.int32).max


class ScmBaggingMinCq(RandomScmClassifier, BaseMonoviewClassifier):
    """A Bagging classifier for SetCoveringMachineClassifier().
    The base estimators are built on subsets of both samples
    and features.

    Parameters
    ----------
    n_estimators : int, default=10
        The number of base estimators in the ensemble.

    max_samples : int or float, default=1.0
        The number of samples to draw from X to train each base estimator,
        with replacement.
        - If int, then draw `max_samples` samples.
        - If float, then draw `max_samples * X.shape[0]` samples.

    max_features : int or float, default=1.0
        The number of features to draw from X to train each base estimator,
        without replacement.
        - If int, then draw `max_features` features.
        - If float, then draw `max_features * X.shape[1]` features.

    p_options : list of float with len <= n_estimators, default=[1.0]
        The estimators will be fitted with the values of p found in p_options.
        Let k = n_estimators / len(p_options);
        the first k estimators will have p=p_options[0],
        the next k estimators will have p=p_options[1], and so on...

    random_state : int or RandomState, default=None
        Controls the random resampling of the original dataset
        (sample wise and feature wise).
        If the base estimator accepts a `random_state` attribute, a different
        seed is generated for each instance in the ensemble.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    n_features_ : int
        The number of features when :meth:`fit` is performed.

    estimators_ : list of estimators
        The collection of fitted base estimators.

    estim_features : list of arrays
        The subset of drawn features for each base estimator.

    Examples
    --------
    >>> @TODO

    References
    ----------
    .. [1] L. Breiman, "Pasting small votes for classification in large
           databases and on-line", Machine Learning, 36(1), 85-103, 1999.
    .. [2] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine
           Learning and Knowledge Discovery in Databases, 346-361, 2012.
    """

    def __init__(self,
                 n_estimators=50,
                 max_samples=1.0,
                 max_features=1.0,
                 max_rules=10,
                 p_options=[0.316],
                 model_type="conjunction",
                 min_cq_combination=True,
                 min_cq_mu=10e-3,
                 random_state=None):
        if isinstance(p_options, float):
            p_options = [p_options]
        RandomScmClassifier.__init__(self, n_estimators=n_estimators,
                                     max_samples=max_samples,
                                     max_features=max_features,
                                     max_rules=max_rules,
                                     p_options=p_options,
                                     model_type=model_type,
                                     min_cq_combination=min_cq_combination,
                                     min_cq_mu=min_cq_mu,
                                     random_state=random_state)
        self.param_names = ["n_estimators", "max_rules", "max_samples",
                            "max_features", "model_type", "p_options",
                            "random_state"]
        self.classed_params = []
        self.distribs = [CustomRandint(low=1, high=300),
                         CustomRandint(low=1, high=20),
                         CustomUniform(), CustomUniform(),
                         ["conjunction", "disjunction"],
                         CustomUniform(), [random_state]]
        self.weird_strings = {}

    def set_params(self, p_options=[0.316], **kwargs):
        if not isinstance(p_options, list):
            p_options = [p_options]
        kwargs["p_options"] = p_options
        for parameter, value in iteritems(kwargs):
            setattr(self, parameter, value)
        return self

    def get_interpretation(self, directory, base_file_name, y_test,
                           multi_class=False):
        self.features_importance()
        interpret_string = self.get_feature_importance(directory,
                                                       base_file_name)
        return interpret_string
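A hypothetical usage sketch for the class above, assuming the randomscm dependency added in this commit is installed; the data and hyper-parameter values below are made up:

import numpy as np

# Synthetic binary data with a trivially learnable target.
rng = np.random.RandomState(42)
X = rng.randint(0, 2, size=(100, 20))
y = X[:, 0]

clf = ScmBaggingMinCq(n_estimators=10, max_rules=5,
                      p_options=[0.316, 1.0],
                      model_type="conjunction",
                      random_state=42)
clf.fit(X, y)
print(clf.predict(X[:5]))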
@@ -35,7 +35,7 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier):
         )
         self.param_names = ["n_estimators", "base_estimator"]
         self.classed_params = ["base_estimator"]
-        self.distribs = [CustomRandint(low=1, high=500),
+        self.distribs = [CustomRandint(low=1, high=100),
                          base_boosting_estimators]
         self.weird_strings = {"base_estimator": "class_name"}
         self.plotted_metric = metrics.zero_one_loss

@@ -67,27 +67,27 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier):
     def get_interpretation(self, directory, base_file_name, y_test, feature_ids,
                            multi_class=False):  # pragma: no cover
         interpretString = ""
-        interpretString += self.get_feature_importance(directory,
-                                                       base_file_name,
-                                                       feature_ids)
-        interpretString += "\n\n Estimator error | Estimator weight\n"
-        interpretString += "\n".join(
-            [str(error) + " | " + str(weight / sum(self.estimator_weights_)) for
-             error, weight in
-             zip(self.estimator_errors_, self.estimator_weights_)])
-        step_test_metrics = np.array(
-            [self.plotted_metric.score(y_test, step_pred) for step_pred in
-             self.step_predictions])
-        get_accuracy_graph(step_test_metrics, "Adaboost",
-                           os.path.join(directory,
-                                        base_file_name + "test_metrics.png"),
-                           self.plotted_metric_name, set="test")
-        np.savetxt(os.path.join(directory, base_file_name + "test_metrics.csv"),
-                   step_test_metrics,
-                   delimiter=',')
-        np.savetxt(
-            os.path.join(directory, base_file_name + "train_metrics.csv"),
-            self.metrics, delimiter=',')
-        np.savetxt(os.path.join(directory, base_file_name + "times.csv"),
-                   np.array([self.train_time, self.pred_time]), delimiter=',')
+        # interpretString += self.get_feature_importance(directory,
+        #                                                base_file_name,
+        #                                                feature_ids)
+        # interpretString += "\n\n Estimator error | Estimator weight\n"
+        # interpretString += "\n".join(
+        #     [str(error) + " | " + str(weight / sum(self.estimator_weights_)) for
+        #      error, weight in
+        #      zip(self.estimator_errors_, self.estimator_weights_)])
+        # step_test_metrics = np.array(
+        #     [self.plotted_metric.score(y_test, step_pred) for step_pred in
+        #      self.step_predictions])
+        # get_accuracy_graph(step_test_metrics, "Adaboost",
+        #                    os.path.join(directory,
+        #                                 base_file_name + "test_metrics.png"),
+        #                    self.plotted_metric_name, set="test")
+        # np.savetxt(os.path.join(directory, base_file_name + "test_metrics.csv"),
+        #            step_test_metrics,
+        #            delimiter=',')
+        # np.savetxt(
+        #     os.path.join(directory, base_file_name + "train_metrics.csv"),
+        #     self.metrics, delimiter=',')
+        # np.savetxt(os.path.join(directory, base_file_name + "times.csv"),
+        #            np.array([self.train_time, self.pred_time]), delimiter=',')
         return interpretString
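The only functional change in the first hunk above narrows the random-search range for n_estimators from [1, 500) to [1, 100). A standalone sketch of the same idea using scipy's randint rather than the platform's CustomRandint:

from scipy.stats import randint

rng_old = randint(low=1, high=500)
rng_new = randint(low=1, high=100)
# Candidate n_estimators values drawn before and after the change.
print(rng_old.rvs(size=3, random_state=42))
print(rng_new.rvs(size=3, random_state=42))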
import logging

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_is_fitted

from ..metrics import zero_one_loss
from .additions.BoostUtils import StumpsClassifiersGenerator, \
    BaseBoost
from ..monoview.monoview_utils import CustomRandint, \
    BaseMonoviewClassifier, change_label_to_minus, change_label_to_zero

classifier_class_name = "AdaboostGraalpy"


class AdaBoostGP(BaseEstimator, ClassifierMixin, BaseBoost):
    """Scikit-Learn compatible AdaBoost classifier. Original code by Pascal
    Germain, adapted by Jean-Francis Roy.

    Parameters
    ----------
    n_iterations : int, optional
        The number of iterations of the algorithm. Defaults to 200.
    iterations_to_collect_as_hyperparameters : list
        Iteration numbers to collect while learning, that will be converted
        as hyperparameter values at evaluation time. Defaults to None.
    classifiers_generator : Transformer, optional
        A transformer to convert input samples in voters' outputs. Default:
        Decision stumps transformer, with 10 stumps per attribute.
    callback_function : function, optional
        A function to call at each iteration that is supplied learning
        information. Defaults to None.
    n_stumps : int (default : 10)
    self_complemented : boolean (default : True)

    Attributes
    ----------
    n_iterations : int, optional
        The number of iterations of the algorithm. Defaults to 200.
    iterations_to_collect_as_hyperparameters : list
        Iteration numbers to collect while learning, that will be converted
        as hyperparameter values at evaluation time. Defaults to None.
    classifiers_generator : Transformer, optional
        A transformer to convert input samples in voters' outputs. Default:
        Decision stumps transformer, with 10 stumps per attribute.
    callback_function : function, optional
        A function to call at each iteration that is supplied learning
        information. Defaults to None.
    """

    def __init__(self, n_iterations=200,
                 iterations_to_collect_as_hyperparameters=True,
                 classifiers_generator=None, callback_function=None,
                 n_stumps=10, self_complemented=True):
        self.n_iterations = n_iterations
        self.n_stumps = n_stumps
        self.iterations_to_collect_as_hyperparameters = iterations_to_collect_as_hyperparameters
        self.estimators_generator = classifiers_generator
        self.callback_function = callback_function
        self.self_complemented = self_complemented

    def fit(self, X, y):
        """Fits the algorithm on training data.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            The input data.
        y : ndarray of shape (n_samples, )
            The input labels.

        Returns
        -------
        self
        """
        y_neg = change_label_to_minus(y)

        if self.estimators_generator is None:
            self.estimators_generator = StumpsClassifiersGenerator(
                n_stumps_per_attribute=self.n_stumps,
                self_complemented=self.self_complemented)

        # Step 1: We fit the classifiers generator and get its classification matrix.
        self.estimators_generator.fit(X, y_neg)
        # hint: This is equivalent to constructing a new X
        classification_matrix = self._binary_classification_matrix(X)

        n_samples, n_voters = classification_matrix.shape
        # logging.debug("n_voters = {}".format(n_voters))

        # Step 2: We initialize the weights on the samples and the weak classifiers.
        sample_weights = np.ones(n_samples) / n_samples
        alpha_weights = np.zeros(n_voters)
        self.losses = []

        # Step 3: We loop for each iteration.
        self.collected_weight_vectors_ = []
        for t in range(self.n_iterations):

            # Step 4: We find the classifier that maximizes the success,
            # weighted by the sample weights.
            classifier_successes = np.dot(classification_matrix.T,
                                          sample_weights * y_neg)

            best_voter_index = np.argmax(classifier_successes)
            success = classifier_successes[best_voter_index]

            if success >= 1.0:
                logging.info("AdaBoost stopped : perfect classifier found!")
                self.weights_ = np.zeros(n_voters)
                self.weights_[best_voter_index] = 1.0
                return self

            # Step 5: We calculate the alpha_t parameter and update the alpha weights.
            alpha = 0.5 * np.log((1.0 + success) / (1.0 - success))
            alpha_weights[best_voter_index] += alpha

            # logging.debug("{} : {}".format(t, str(alpha)))

            # Step 6: We update the sample weights.
            sample_weights *= np.exp(
                -1 * alpha * y_neg * classification_matrix[:, best_voter_index])

            normalization_constant = sample_weights.sum()
            sample_weights = sample_weights / normalization_constant

            # We collect iteration information for later evaluation.
            if self.iterations_to_collect_as_hyperparameters:
                weights = alpha_weights / np.sum(alpha_weights)
                self.collected_weight_vectors_.append(weights.copy())

            loss = zero_one_loss.score(y_neg, np.sign(np.sum(
                np.multiply(classification_matrix,
                            alpha_weights / np.sum(alpha_weights)), axis=1)))
            self.losses.append(loss)

            if self.callback_function is not None:
                self.callback_function(t, alpha_weights, normalization_constant,
                                       self.estimators_generator, self.weights_)

        self.weights_ = alpha_weights / np.sum(alpha_weights)
        self.losses = np.array(self.losses)
        self.learner_info_ = {
            'n_nonzero_weights': np.sum(self.weights_ > 1e-12)}

        return self

    def predict(self, X):
        """Predict inputs using the fit classifier.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            The data to classify.

        Returns
        -------
        predictions : ndarray of shape (n_samples, )
            The estimated labels.
        """
        check_is_fitted(self, 'weights_')
        classification_matrix = self._binary_classification_matrix(X)

        if self.iterations_to_collect_as_hyperparameters:
            self.test_preds = []
            for weight_vector in self.collected_weight_vectors_:
                preds = np.sum(np.multiply(classification_matrix,
                                           weight_vector), axis=1)
                self.test_preds.append(change_label_to_zero(np.sign(preds)))
            self.test_preds = np.array(self.test_preds)
        margins = np.squeeze(
            np.asarray(np.dot(classification_matrix, self.weights_)))
        return change_label_to_zero(
            np.array([int(x) for x in np.sign(margins)]))


class AdaboostGraalpy(AdaBoostGP, BaseMonoviewClassifier):
    """AdaboostGraalpy

    Parameters
    ----------
    random_state : int seed, RandomState instance, or None (default=None)
        The seed of the pseudo random number generator to use when
        shuffling the data.
    n_iterations : int, number of iterations (default : 200)
    n_stumps : int (default 1)
    kwargs : other arguments

    Attributes
    ----------
    param_names :
    distribs :
    weird_strings :
    n_stumps :
    nbCores :
    """

    def __init__(self, random_state=None, n_iterations=200, n_stumps=1,
                 **kwargs):
        super(AdaboostGraalpy, self).__init__(
            n_iterations=n_iterations,
            n_stumps=n_stumps
        )
        self.param_names = ["n_iterations", "n_stumps", "random_state"]
        self.distribs = [CustomRandint(low=1, high=500), [n_stumps],
                         [random_state]]
        self.classed_params = []
        self.weird_strings = {}
        self.n_stumps = n_stumps
        if "nbCores" not in kwargs:
            self.nbCores = 1
        else:
            self.nbCores = kwargs["nbCores"]

    # def canProbas(self):
    #     """
    #     Used to know if the classifier can return label probabilities
    #
    #     Returns
    #     -------
    #     True in any case
    #     """
    #     return True

    def getInterpret(self, directory, y_test):
        """
        Parameters
        ----------
        directory :
        y_test :

        Returns
        -------
        The interpretation string
        """
        np.savetxt(directory + "train_metrics.csv", self.losses, delimiter=',')
        np.savetxt(directory + "y_test_step.csv", self.test_preds,
                   delimiter=',')
        step_metrics = []
        for step_index in range(self.test_preds.shape[0] - 1):
            step_metrics.append(zero_one_loss.score(y_test,
                                                    self.test_preds[step_index,
                                                    :]))
        step_metrics = np.array(step_metrics)
        np.savetxt(directory + "step_test_metrics.csv", step_metrics,
                   delimiter=',')
        return ""


# def formatCmdArgs(args):
#     """Used to format kwargs for the parsed args"""
#     kwargsDict = {"n_iterations": args.AdG_n_iter,
#                   "n_stumps": args.AdG_stumps, }
#     return kwargsDict


def paramsToSet(nIter, random_state):
    """Used for weighted linear early fusion to generate random search sets"""
    paramsSet = []
    for _ in range(nIter):
        paramsSet.append({"n_iterations": random_state.randint(1, 500), })
    return paramsSet
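Steps 4 to 6 of AdaBoostGP.fit() above are the classic AdaBoost update on +/-1 labels. A self-contained numpy sketch of a single round, on a made-up 4-sample, 3-voter classification matrix:

import numpy as np

# Rows are samples, columns are voters; entries are +/-1 predictions.
classification_matrix = np.array([[ 1, -1,  1],
                                  [-1, -1,  1],
                                  [-1,  1, -1],
                                  [-1,  1,  1]])
y_neg = np.array([1, -1, 1, -1])
sample_weights = np.ones(4) / 4

# Step 4: weighted success of each voter, pick the best one (voter 0, 0.5).
successes = np.dot(classification_matrix.T, sample_weights * y_neg)
best = np.argmax(successes)
success = successes[best]

# Step 5: voter weight alpha = 0.5 * ln((1 + success) / (1 - success)) ~ 0.549.
alpha = 0.5 * np.log((1.0 + success) / (1.0 - success))

# Step 6: re-weight and renormalize; the misclassified sample (index 2)
# ends up carrying half the mass: [1/6, 1/6, 1/2, 1/6].
sample_weights *= np.exp(-alpha * y_neg * classification_matrix[:, best])
sample_weights /= sample_weights.sum()
print(alpha, sample_weights)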
import time

import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from .. import metrics
from .additions.BoostUtils import get_accuracy_graph
from .additions.PregenUtils import PregenClassifier
from ..monoview.monoview_utils import CustomRandint, BaseMonoviewClassifier, \
    change_label_to_zero

# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype"  # Production, Development, Prototype

classifier_class_name = "AdaboostPregen"


class AdaboostPregen(AdaBoostClassifier, BaseMonoviewClassifier,
                     PregenClassifier):
    """
    Parameters
    ----------
    random_state : int seed, RandomState instance, or None (default=None)
        The seed of the pseudo random number generator to use when
        shuffling the data.
    n_estimators : int, number of estimators (default : 50)
    base_estimator :
    n_stumps : int (default : 1)
    estimators_generator : str, (default : "Stumps")
    max_depth : int (default : 1)
    self_complemented : bool, (default : True)
    kwargs : other arguments

    Attributes
    ----------
    param_names : list of parameter names
    classed_params : list of parameter names
    distribs :
    weird_strings :
    plotted_metric
    plotted_metric_name : str, name of the plotted metric
    step_predictions :
    estimators_generator :
    max_depth :
    n_stumps :
    self_complemented :
    """

    def __init__(self, random_state=None, n_estimators=50,
                 base_estimator=None, n_stumps=1, estimators_generator="Stumps",
                 max_depth_pregen=1, self_complemented=True,
                 **kwargs):
        super(AdaboostPregen, self).__init__(
            random_state=random_state,
            n_estimators=n_estimators,
            base_estimator=base_estimator,
            algorithm="SAMME"
        )
        self.param_names = ["n_estimators", "base_estimator", "n_stumps",
                            "estimators_generator", "max_depth_pregen",
                            "random_state"]
        self.classed_params = ["base_estimator"]
        self.distribs = [CustomRandint(low=1, high=500),
                         [DecisionTreeClassifier(max_depth=1)], [n_stumps],
                         ["Stumps", "Tree"], CustomRandint(low=1, high=5),
                         [random_state]]
        self.weird_strings = {"base_estimator": "class_name"}
        self.plotted_metric = metrics.zero_one_loss
        self.plotted_metric_name = "zero_one_loss"
        self.step_predictions = None
        self.estimators_generator = estimators_generator
        self.max_depth_pregen = max_depth_pregen
        self.n_stumps = n_stumps
        self.self_complemented = self_complemented

    def fit(self, X, y, sample_weight=None):
        """
        Fit the AdaboostPregen

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            For kernel="precomputed", the expected shape of X is
            (n_samples_test, n_samples_train).
        y : array-like, shape (n_samples,)
            Target values, class labels in classification.
        sample_weight :
        """
        begin = time.time()
        pregen_X, pregen_y = self.pregen_voters(X, y)
        super(AdaboostPregen, self).fit(pregen_X, pregen_y,
                                        sample_weight=sample_weight)
        end = time.time()
        self.train_time = end - begin
        self.train_shape = pregen_X.shape
        self.base_predictions = np.array(
            [change_label_to_zero(estim.predict(pregen_X)) for estim in
             self.estimators_])
        self.metrics = np.array(
            [self.plotted_metric.score(change_label_to_zero(pred), y) for pred
             in self.staged_predict(pregen_X)])
        self.bounds = np.array([np.prod(
            np.sqrt(1 - 4 * np.square(0.5 - self.estimator_errors_[:i + 1])))
            for i in
            range(self.estimator_errors_.shape[0])])
        self.feature_importances_ = np.ones(X.shape[1])
        return self

    # def canProbas(self):
    #     """
    #     Used to know if the classifier can return label probabilities
    #
    #     Returns
    #     -------
    #     True
    #     """
    #     return True

    def predict(self, X):
        """
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            For kernel="precomputed", the expected shape of X is
            (n_samples_test, n_samples_train).

        Returns
        -------
        """
        begin = time.time()
        pregen_X, _ = self.pregen_voters(X)
        pred = super(AdaboostPregen, self).predict(pregen_X)
        end = time.time()
        self.pred_time = end - begin
        if pregen_X.shape != self.train_shape:
            self.step_predictions = np.array(
                [change_label_to_zero(step_pred) for step_pred in
                 self.staged_predict(pregen_X)])
        return change_label_to_zero(pred)

    # def set_params(self, **params):
    #     super().set_params(params)
    #     self.random_state = params["random_state"]
    #     self.n_stumps_per_attribute = params["n_tumps"]
    #     return self

    # def getInterpret(self, directory, y_test):
    #     interpretString = ""
    #     interpretString += self.getFeatureImportance(directory)
    #     interpretString += "\n\n Estimator error | Estimator weight\n"
    #     interpretString += "\n".join(
    #         [str(error) + " | " + str(weight / sum(self.estimator_weights_)) for
    #          error, weight in
    #          zip(self.estimator_errors_, self.estimator_weights_)])
    #     step_test_metrics = np.array(
    #         [self.plotted_metric.score(y_test, step_pred) for step_pred in
    #          self.step_predictions])
    #     get_accuracy_graph(step_test_metrics, "AdaboostPregen",
    #                        directory + "test_metrics.png",
    #                        self.plotted_metric_name, set="test")
    #     # get_accuracy_graph(self.metrics, "AdaboostPregen",
    #     #                    directory + "metrics.png", self.plotted_metric_name,
    #     #                    bounds=list(self.bounds),
    #     #                    bound_name="boosting bound")
    #     np.savetxt(directory + "test_metrics.csv", step_test_metrics,
    #                delimiter=',')
    #     np.savetxt(directory + "train_metrics.csv", self.metrics, delimiter=',')
    #     np.savetxt(directory + "times.csv",
    #                np.array([self.train_time, self.pred_time]), delimiter=',')
    #     np.savetxt(directory + "times_iter.csv",
    #                np.array([self.train_time, len(self.estimator_weights_)]), delimiter=',')
    #     return interpretString

    def feature_importances_(self, value):
        self._feature_importances_ = value


# def formatCmdArgs(args):
#     """Used to format kwargs for the parsed args"""
#     kwargsDict = {'n_estimators': args.AdP_n_est,
#                   'base_estimator': [DecisionTreeClassifier(max_depth=1)],
#                   'n_stumps': args.AdP_stumps}
#     return kwargsDict


# def paramsToSet(nIter, random_state):
#     """Used for weighted linear early fusion to generate random search sets"""
#     paramsSet = []
#     for _ in range(nIter):
#         paramsSet.append({"n_estimators": random_state.randint(1, 500),
#                           "base_estimator": None})
#     return paramsSet
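The self.bounds line in fit() above tracks AdaBoost's training-error bound: the product over rounds t of sqrt(1 - 4 * (0.5 - e_t)^2), which equals the product of 2 * sqrt(e_t * (1 - e_t)). A standalone numeric check with made-up error values:

import numpy as np

# Hypothetical weighted errors for three boosting rounds.
estimator_errors = np.array([0.3, 0.35, 0.4])
bounds = np.array([np.prod(
    np.sqrt(1 - 4 * np.square(0.5 - estimator_errors[:i + 1])))
    for i in range(estimator_errors.shape[0])])
# Monotonically decreasing: approximately [0.9165, 0.8743, 0.8567].
print(bounds)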