diff --git a/config_files/config_cuisine.yml b/config_files/config_cuisine.yml
index d4f2da6e6b03ceeb8287076f76860cba41bdd6ed..a60012a801e17433020480cc1b47170c0dba9c54 100644
--- a/config_files/config_cuisine.yml
+++ b/config_files/config_cuisine.yml
@@ -1,10 +1,10 @@
 # The base configuration of the benchmark
 log: True
-name: ["demo"]
+name: ["test_boules"]
 label: "_1_3"
 file_type: ".hdf5"
 views:
-pathf: "/home/baptiste/Documents/Datasets/Generated/"
+pathf: "/home/baptiste/Documents/Clouded/short_projects/latent_space_study/"
 nice: 0
 random_state: 42
 nb_cores: 1
@@ -17,23 +17,74 @@ track_tracebacks: False
 # All the classification-related configuration options
 multiclass_method: "oneVersusOne"
-split: 0.75
+split: 0.10
 nb_folds: 5
-nb_class: 2
-classes: ['label_1', 'label_3']
-type: ["multiview", "monoview"]
-algos_monoview: ["cb_boost",]
-algos_multiview: ["multiview_cbound_boosting"]
+nb_class: 4
+classes:
+type: ["multiview","monoview"]
+algos_monoview: ["cb_boost", "decision_tree", 'random_forest']
+algos_multiview: ["mv_cb_boost", "weighted_linear_late_fusion","weighted_linear_early_fusion","mumbo" ]
 stats_iter: 5
 metrics:
   accuracy_score: {}
   f1_score:
-    average: 'binary'
+    average: 'micro'
 metric_princ: "accuracy_score"
-hps_type: "None"
-hps_args: {}
+hps_type: "Random"
+hps_args:
+  n_iter: 10
+  equivalent_draws: True
 cb_boost:
-  n_stumps: 10
-multiview_cbound_boosting:
-  n_stumps: 10
\ No newline at end of file
+  n_stumps: 30
+  n_max_iterations: 20
+  estimators_generator: "Trees"
+  max_depth: 1
+decision_tree:
+  max_depth: 2
+mumbo:
+  base_estimator: decision_tree
+  base_estimator__max_depth: 1
+  n_estimators: 80
+
+mv_cb_boost:
+  n_max_iterations: 80
+  n_stumps: 30
+  estimators_generator: "Trees"
+  max_depth: 1
+
+pb_mv_boost:
+  num_iterations: 20
+  decision_tree_depth: 1
+weighted_linear_early_fusion:
+  monoview_classifier_name: "cb_boost"
+  monoview_classifier_config:
+    cb_boost:
+      n_stumps: 30
+      n_max_iterations: 20
+      estimators_generator: "Trees"
+      max_depth: 1
+weighted_linear_late_fusion:
+  classifiers_names: ["cb_boost", "cb_boost", "cb_boost", "cb_boost"]
+  classifier_configs:
+    - cb_boost:
+        n_stumps: 30
+        n_max_iterations: 20
+        estimators_generator: "Trees"
+        max_depth: 1
+    - cb_boost:
+        n_stumps: 30
+        n_max_iterations: 20
+        estimators_generator: "Trees"
+        max_depth: 1
+    - cb_boost:
+        n_stumps: 30
+        n_max_iterations: 20
+        estimators_generator: "Trees"
+        max_depth: 1
+    - cb_boost:
+        n_stumps: 30
+        n_max_iterations: 20
+        estimators_generator: "Trees"
+        max_depth: 1
+
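For reference, the new late-fusion section pairs each entry of classifiers_names with its own block in classifier_configs (four cb_boost configs here, presumably one per view). A minimal sketch of a sanity check on that pairing, assuming PyYAML and the repository-relative path shown in the diff:

    import yaml  # PyYAML

    # Path as it appears in this diff; adjust if the config lives elsewhere.
    with open("config_files/config_cuisine.yml") as f:
        config = yaml.safe_load(f)

    late_fusion = config["weighted_linear_late_fusion"]
    names = late_fusion["classifiers_names"]
    configs = late_fusion["classifier_configs"]
    # Each classifier name should come with exactly one config block.
    assert len(names) == len(configs), (len(names), len(configs))
    print(f"{len(names)} monoview classifiers configured for late fusion")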
diff --git a/summit/__init__.py b/summit/__init__.py
index b11a66d7464c5cee9e7ebc2b53732938bb4cec1d..fbe62a5795ba99a83170a4a3751ae90c9b1e3d58 100644
--- a/summit/__init__.py
+++ b/summit/__init__.py
@@ -1,5 +1,6 @@
 __version__ = "0.0.0.0"
+__url__ = "https://gitlab.lis-lab.fr/baptiste.bauvin/summit"
 from . import multiview_platform, execute
diff --git a/summit/multiview_platform/monoview_classifiers/adaboost_pregen.py b/summit/multiview_platform/monoview_classifiers/adaboost_pregen.py
index a5f0c73c45f2f6d1bb38beb508bb6d65f82a9684..604da04328de6191300cb1efffd0fe3f1fd368a2 100644
--- a/summit/multiview_platform/monoview_classifiers/adaboost_pregen.py
+++ b/summit/multiview_platform/monoview_classifiers/adaboost_pregen.py
@@ -129,6 +129,7 @@ class AdaboostPregen(AdaBoostClassifier, BaseMonoviewClassifier,
                 np.sqrt(1 - 4 * np.square(0.5 - self.estimator_errors_[:i + 1])))
             for i in range(self.estimator_errors_.shape[0])])
+        return self
 
     # def canProbas(self):
    #     """
diff --git a/summit/multiview_platform/monoview_classifiers/additions/CBBoostUtils.py b/summit/multiview_platform/monoview_classifiers/additions/CBBoostUtils.py
index 704a58951fb617738b42160019c2b311410ce268..44e34d86d8f825da308bf30a193015425db7d2c7 100644
--- a/summit/multiview_platform/monoview_classifiers/additions/CBBoostUtils.py
+++ b/summit/multiview_platform/monoview_classifiers/additions/CBBoostUtils.py
@@ -140,6 +140,18 @@ class CBBoostClassifier(BaseEstimator, ClassifierMixin, BaseBoost):
         self.feature_importances_ /= np.sum(self.feature_importances_)
         return self
 
+    def predict_proba(self, X):
+        start = time.time()
+        check_is_fitted(self, 'weights_')
+        if scipy.sparse.issparse(X):
+            logging.warning('Converting sparse matrix to dense matrix.')
+            X = np.array(X.todense())
+
+        classification_matrix = self._binary_classification_matrix(X)
+        margins = np.sum(classification_matrix * self.weights_, axis=1)
+        proba = np.array([np.array([(1 - vote)/2, (1 + vote)/2]) for vote in margins])
+        return proba
+
     def predict(self, X):
         start = time.time()
         check_is_fitted(self, 'weights_')
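The new predict_proba maps the ensemble margin (the weighted vote, expected in [-1, 1]) linearly onto a two-class probability row. A minimal standalone sketch of that mapping, with illustrative names rather than summit's API:

    import numpy as np

    def margins_to_proba(margins: np.ndarray) -> np.ndarray:
        """Linearly map margins in [-1, 1] to rows [P(y=0), P(y=1)]."""
        margins = np.clip(margins, -1.0, 1.0)  # guard against un-normalized votes
        return np.column_stack(((1 - margins) / 2, (1 + margins) / 2))

    # A margin of 0 is maximal uncertainty; -1 or +1 is a unanimous vote.
    print(margins_to_proba(np.array([-1.0, 0.0, 0.5])))
    # [[1.   0.  ]
    #  [0.5  0.5 ]
    #  [0.25 0.75]]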
diff --git a/summit/multiview_platform/monoview_classifiers/bagging.py b/summit/multiview_platform/monoview_classifiers/bagging.py
index 36b2f04183ef75259d5729498d42cb73ea8c0b9d..56c51b77701c16f1fce442c69e0635e2a30f6199 100644
--- a/summit/multiview_platform/monoview_classifiers/bagging.py
+++ b/summit/multiview_platform/monoview_classifiers/bagging.py
@@ -52,6 +52,7 @@ class Bagging(BaggingClassifier, BaseMonoviewClassifier,):
         end = time.time()
         self.train_time = end - begin
         self.train_shape = X.shape
+        return self
 
     def predict(self, X):
diff --git a/summit/multiview_platform/monoview_classifiers/bagging_pregen.py b/summit/multiview_platform/monoview_classifiers/bagging_pregen.py
index e87587f5990af60ce6c25175faefc95bab433450..f4c7c1a631fc346540bfc215e111582abc78034e 100644
--- a/summit/multiview_platform/monoview_classifiers/bagging_pregen.py
+++ b/summit/multiview_platform/monoview_classifiers/bagging_pregen.py
@@ -64,6 +64,7 @@ class BaggingPregen(BaggingClassifier, BaseMonoviewClassifier,
         end = time.time()
         self.train_time = end - begin
         self.train_shape = pregen_X.shape
+        return self
diff --git a/summit/multiview_platform/monoview_classifiers/gradient_boosting_pregen.py b/summit/multiview_platform/monoview_classifiers/gradient_boosting_pregen.py
index 8ab04091de96d0333c6e151ef7c5546bae6e8fbf..1b302cc155fc24b3e116ae7f336787d20b729cda 100644
--- a/summit/multiview_platform/monoview_classifiers/gradient_boosting_pregen.py
+++ b/summit/multiview_platform/monoview_classifiers/gradient_boosting_pregen.py
@@ -63,6 +63,7 @@ class GradientBoostingPregen(GradientBoostingClassifier, BaseMonoviewClassifier,
         # self.base_predictions = np.array(
         #     [change_label_to_zero(estim.predict(pregen_X)) for estim in
         #      self.estimators_])
+        return self
diff --git a/summit/multiview_platform/monoview_classifiers/random_forest_pregen.py b/summit/multiview_platform/monoview_classifiers/random_forest_pregen.py
index 36f828b5bd6c6911216a39b02718d467d51a8eab..fe54b17a000c125ced979563f308d47e5b85a7b5 100644
--- a/summit/multiview_platform/monoview_classifiers/random_forest_pregen.py
+++ b/summit/multiview_platform/monoview_classifiers/random_forest_pregen.py
@@ -62,7 +62,7 @@ class RandomForestPregen(RandomForestClassifier, BaseMonoviewClassifier,
         self.base_predictions = np.array(
             [change_label_to_zero(estim.predict(pregen_X)) for estim in
              self.estimators_])
-
+        return self
 
     def predict(self, X):
diff --git a/summit/multiview_platform/monoview_classifiers/scm.py b/summit/multiview_platform/monoview_classifiers/scm.py
index 56fc0dd031929881a8a9d6bfe9ec9bf45e9aba88..753e4b280a811f4146b88e9dc1168c98c17a49db 100644
--- a/summit/multiview_platform/monoview_classifiers/scm.py
+++ b/summit/multiview_platform/monoview_classifiers/scm.py
@@ -74,6 +74,7 @@ class SCM(scm, BaseMonoviewClassifier):
         for rule, importance in zip(self.model_.rules, rules_importances):
             self.feature_importances_[rule.feature_idx] += importance
         self.feature_importances_ /= np.sum(self.feature_importances_)
+        return self
 
     # def canProbas(self):
     #     """
diff --git a/summit/multiview_platform/monoview_classifiers/scm_bagging.py b/summit/multiview_platform/monoview_classifiers/scm_bagging.py
index 7d4aaebc1fb358ca88304710b82e3607d5d97ce7..173a1dc3ce9c6183867fbd9d6234706648efc610 100644
--- a/summit/multiview_platform/monoview_classifiers/scm_bagging.py
+++ b/summit/multiview_platform/monoview_classifiers/scm_bagging.py
@@ -69,7 +69,7 @@ class ScmBagging(ScmBaggingClassifier, BaseMonoviewClassifier):
     """
 
     def __init__(self,
-                 n_estimators=10,
+                 n_estimators=50,
                  max_samples=1.0,
                  max_features=1.0,
                  max_rules=10,
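The recurring `+        return self` additions above bring these fit overrides in line with the scikit-learn estimator contract, which allows fit and predict calls to be chained. A minimal sketch of the pattern (the class and data here are illustrative, not summit code):

    import numpy as np
    from sklearn.base import BaseEstimator, ClassifierMixin

    class ChainableClassifier(BaseEstimator, ClassifierMixin):
        def fit(self, X, y):
            self.majority_ = np.bincount(y).argmax()  # trivial "model"
            return self  # scikit-learn contract: fit returns the estimator

        def predict(self, X):
            return np.full(len(X), self.majority_)

    X, y = np.zeros((4, 2)), np.array([0, 1, 1, 1])
    # Without `return self`, this chained call would fail with
    # AttributeError: 'NoneType' object has no attribute 'predict'.
    print(ChainableClassifier().fit(X, y).predict(X))  # [1 1 1 1]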
diff --git a/summit/multiview_platform/monoview_classifiers/scm_bagging_mincq.py b/summit/multiview_platform/monoview_classifiers/scm_bagging_mincq.py
index 1079f5633817b77fd06ec3d89ca933dffe972c6d..b4b38d60728317bc257deae2d6ee862371798fdf 100644
--- a/summit/multiview_platform/monoview_classifiers/scm_bagging_mincq.py
+++ b/summit/multiview_platform/monoview_classifiers/scm_bagging_mincq.py
@@ -1,20 +1,86 @@
-from .scm_bagging import ScmBagging
-from ..utils.hyper_parameter_search import CustomUniform, CustomRandint
+from scm_bagging.scm_bagging_classifier import ScmBaggingClassifier
+
+
+from ..monoview.monoview_utils import BaseMonoviewClassifier
+from summit.multiview_platform.utils.hyper_parameter_search import CustomUniform, CustomRandint
+
+# Author-Info
+__author__ = "Baptiste Bauvin"
+__status__ = "Prototype"  # Production, Development, Prototype
 
 classifier_class_name = "ScmBaggingMinCq"
 
-class ScmBaggingMinCq(ScmBagging):
+import numpy as np
+from six import iteritems
+
+MAX_INT = np.iinfo(np.int32).max
+
+
+class ScmBaggingMinCq(ScmBaggingClassifier, BaseMonoviewClassifier):
+    """A Bagging classifier for SetCoveringMachineClassifier().
+    The base estimators are built on subsets of both samples
+    and features.
+    Parameters
+    ----------
+    n_estimators : int, default=10
+        The number of base estimators in the ensemble.
+    max_samples : int or float, default=1.0
+        The number of samples to draw from X to train each base estimator with
+        replacement.
+        - If int, then draw `max_samples` samples.
+        - If float, then draw `max_samples * X.shape[0]` samples.
+    max_features : int or float, default=1.0
+        The number of features to draw from X to train each base estimator
+        (without replacement).
+        - If int, then draw `max_features` features.
+        - If float, then draw `max_features * X.shape[1]` features.
+    p_options : list of float with len <= n_estimators, default=[1.0]
+        The estimators will be fitted with the values of p found in p_options.
+        Let k = n_estimators / len(p_options):
+        the first k estimators will have p=p_options[0],
+        the next k estimators will have p=p_options[1], and so on.
+    random_state : int or RandomState, default=None
+        Controls the random resampling of the original dataset
+        (sample wise and feature wise).
+        If the base estimator accepts a `random_state` attribute, a different
+        seed is generated for each instance in the ensemble.
+        Pass an int for reproducible output across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    Attributes
+    ----------
+    n_features_ : int
+        The number of features when :meth:`fit` is performed.
+    estimators_ : list of estimators
+        The collection of fitted base estimators.
+    estim_features : list of arrays
+        The subset of drawn features for each base estimator.
+
+    Examples
+    --------
+    >>> @TODO
+
+    References
+    ----------
+    .. [1] L. Breiman, "Pasting small votes for classification in large
+       databases and on-line", Machine Learning, 36(1), 85-103, 1999.
+    .. [2] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine
+       Learning and Knowledge Discovery in Databases, 346-361, 2012.
+    """
+
     def __init__(self,
-                 n_estimators=10,
+                 n_estimators=50,
                  max_samples=1.0,
                  max_features=1.0,
                  max_rules=10,
                  p_options=[0.316],
                  model_type="conjunction",
                  min_cq_combination=True,
-                 min_cq_mu = 10e-3,
+                 min_cq_mu=10e-3,
                  random_state=None):
-        ScmBagging.__init__(self, n_estimators=n_estimators,
+        if isinstance(p_options, float):
+            p_options = [p_options]
+        ScmBaggingClassifier.__init__(self, n_estimators=n_estimators,
                             max_samples=max_samples,
                             max_features=max_features,
                             max_rules=max_rules,
@@ -23,5 +89,22 @@ class ScmBaggingMinCq(ScmBagging):
                             min_cq_combination=min_cq_combination,
                             min_cq_mu=min_cq_mu,
                             random_state=random_state)
-        self.param_names.append("min_cq_mu")
-        self.distribs.append(CustomRandint(1,7, multiplier='e-'))
\ No newline at end of file
+        self.param_names = ["n_estimators", "max_rules", "max_samples", "max_features", "model_type", "p_options", "random_state"]
+        self.classed_params = []
+        self.distribs = [CustomRandint(low=1, high=300), CustomRandint(low=1, high=20),
+                         CustomUniform(), CustomUniform(), ["conjunction", "disjunction"], CustomUniform(), [random_state]]
+        self.weird_strings = {}
+
+    def set_params(self, p_options=[0.316], **kwargs):
+        if not isinstance(p_options, list):
+            p_options = [p_options]
+        kwargs["p_options"] = p_options
+        for parameter, value in iteritems(kwargs):
+            setattr(self, parameter, value)
+        return self
+
+    def get_interpretation(self, directory, base_file_name, y_test,
+                           multi_class=False):
+        self.features_importance()
+        interpret_string = self.get_feature_importance(directory, base_file_name)
+        return interpret_string
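The set_params override above coerces a scalar p_options into a one-element list before storing it, so a single float drawn by the random hyper-parameter search (a CustomUniform draw) still satisfies the "list of float" contract of the underlying ScmBaggingClassifier. A minimal sketch of the same coercion idea outside summit, with an illustrative estimator class:

    class ListCoercingEstimator:
        """Illustrative only: mirrors the ScmBaggingMinCq.set_params coercion."""

        def __init__(self, p_options=None):
            self.p_options = p_options if p_options is not None else [0.316]

        def set_params(self, **kwargs):
            p_options = kwargs.get("p_options", self.p_options)
            if not isinstance(p_options, list):  # a bare float from a sampler
                kwargs["p_options"] = [p_options]
            for parameter, value in kwargs.items():
                setattr(self, parameter, value)
            return self

    est = ListCoercingEstimator().set_params(p_options=0.5)
    print(est.p_options)  # [0.5]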
diff --git a/summit/multiview_platform/monoview_classifiers/scm_mazid.py b/summit/multiview_platform/monoview_classifiers/scm_mazid.py
index 0445ea88031e8ddfe70786765fb0573e93ad7d84..023e92a34cf3ab93adce3432e8088a4a0d481228 100644
--- a/summit/multiview_platform/monoview_classifiers/scm_mazid.py
+++ b/summit/multiview_platform/monoview_classifiers/scm_mazid.py
@@ -34,6 +34,7 @@ class DecisionStumpSCMNew(BaseMonoviewClassifier):
         print(self.model_type)
         self.clf = scm(model_type=self.model_type, max_rules=self.max_rules, p=self.p, random_state=self.random_state)
         self.clf.fit(X=X, y=y)
+        return self
 
     def predict(self, X):
         return self.clf.predict(X)
diff --git a/summit/multiview_platform/multiview_classifiers/additions/mv_cb_boost_adapt.py b/summit/multiview_platform/multiview_classifiers/additions/mv_cb_boost_adapt.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2b405924f4fce47ff230c5419e65d758f688d4e
--- /dev/null
+++ b/summit/multiview_platform/multiview_classifiers/additions/mv_cb_boost_adapt.py
@@ -0,0 +1,514 @@
+import numpy as np
+import numpy.ma as ma
+import math
+import os
+import pandas as pd
+
+from ...multiview.multiview_utils import BaseMultiviewClassifier
+from ...monoview_classifiers.additions import CBBoostUtils, BoostUtils
+from ...utils.hyper_parameter_search import CustomRandint
+from ...utils.dataset import get_samples_views_indices
+from ... import metrics
+
+from mv_cb_boost.base import MultiviewCBoundBoosting
+
+classifier_class_name = "MultiviewCBoundBoosting"
+
+class MultiviewCBoundBoostingAdapt(BaseMultiviewClassifier, MultiviewCBoundBoosting):
+
+    def __init__(self, n_estimators=10, random_state=None,
+                 self_complemented=True, twice_the_same=False,
+                 random_start=False, n_stumps=1, c_bound_sol=True,
+                 base_estimator="Stumps", max_depth=1, mincq_tracking=False,
+                 weight_add=3, weight_strategy="c_bound_based_broken",
+                 weight_update="multiplicative", full_combination=False,
+                 min_cq_pred=False, min_cq_mu=10e-3, sig_mult=15, sig_offset=5,
+                 use_previous_voters=False, **kwargs):
+        MultiviewCBoundBoosting.__init__(self, n_estimators=n_estimators, random_state=random_state,
+                                         self_complemented=self_complemented, twice_the_same=twice_the_same,
+                                         random_start=random_start, n_stumps=n_stumps, c_bound_sol=c_bound_sol, max_depth=max_depth,
+                                         base_estimator=base_estimator, mincq_tracking=mincq_tracking,
+                                         weight_add=weight_add, weight_strategy=weight_strategy,
+                                         weight_update=weight_update, use_previous_voters=use_previous_voters,
+                                         full_combination=full_combination,
+                                         min_cq_pred=min_cq_pred, min_cq_mu=min_cq_mu,
+                                         sig_mult=sig_mult, sig_offset=sig_offset,
+                                         **kwargs)
+        BaseMultiviewClassifier.__init__(self, random_state)
+        self.param_names = ["n_estimators", "random_state"]
+        self.distribs = [CustomRandint(5, 200), [random_state]]
+
+    def fit(self, X, y, train_indices=None, view_indices=None):
+
+        train_indices, view_indices = get_samples_views_indices(X,
+                                                                train_indices,
+                                                                view_indices)
+        input_X = {view_index: X.get_v(view_index=view_index, sample_indices=train_indices)
+                   for view_index in view_indices}
+        # from mv_cb_boost.vizualization import each_voter_error
+        # fitted = MultiviewCBoundBoosting.fit(self, input_X, y[train_indices])
+        # each_voter_error(y[train_indices], fitted.predict(X, sample_indices=train_indices), fitted, sample_ids=[str(i) for i in range(len(train_indices))])
+        return MultiviewCBoundBoosting.fit(self, input_X, y[train_indices])
+
+    def predict(self, X, sample_indices=None, view_indices=None):
+        sample_indices, view_indices = get_samples_views_indices(X,
+                                                                 sample_indices,
+                                                                 view_indices)
+        input_X = {view_index: X.get_v(view_index=view_index, sample_indices=sample_indices)
+                   for view_index in view_indices}
+        return MultiviewCBoundBoosting.predict(self, input_X)
+
+    # def transform_sample_weights(self):
+    #     df = pd.DataFrame(columns=["weight", "view", "sample", 'iteration', "right", "margin", "mixed_sample_view", "considered_mis_class"])
+    #     i = 0
+    #     self.min_weight = 100
+    #     self.max_weight = -100
+    #
+    #     for iter_index, view_sample_weights in enumerate(self.sample_weightings):
+    #         for view_index, sample_weights in enumerate(view_sample_weights):
+    #             for sample_index, weight in enumerate(sample_weights):
+    #                 weight = weight[0]*10
+    #                 df.loc[i] = [weight, view_index, sample_index, iter_index, self.decisions[iter_index][view_index][sample_index][0], abs(self.margins[iter_index][view_index][sample_index][0]), view_index+sample_index*self.n_view_total, self.considered_misclass[iter_index][view_index][sample_index]]
+    #                 i += 1
+    #                 if weight < self.min_weight:
+    #                     self.min_weight = weight
+    #                 elif weight > self.max_weight:
+    #                     self.max_weight = weight
+    #     return df
+    #
+    # def get_interpretation(self, directory, base_file_name, labels, multiclass=False):
+    #     self.view_importances = np.zeros(self.n_view_total)
+    #     for (view, index, _), weight in zip(self.general_voters, self.general_weights):
+    #         self.view_importances[view] += weight
+    #     self.view_importances /= np.sum(self.view_importances)
+    #     interpret_string = "View importances : \n\n{}".format(self.view_importances)
+    #     interpret_string += "\n\n Used {} iterations, on the {} available.".format(self.it, self.n_max_iterations)
+    #     interpret_string += "\n\n Broke : {} \n\t".format(self.break_cause)+'\n\t'.join(self.view_break_cause)
+    #     # df = self.transform_sample_weights()
+    #     import plotly.express as px
+    #     import plotly
+    #     # fig = px.scatter(df, x="mixed_sample_view", y="weight", animation_frame="iteration", animation_group='mixed_sample_view', size="margin",
+    #     #                  color="right", text="view", hover_name="sample", hover_data=["weight", "view", "sample", 'iteration', "right", "margin", "mixed_sample_view", "considered_mis_class"], range_x=[0, self.n_total_examples*self.n_view_total], range_y=[self.min_weight, self.max_weight]
+    #     #                  )
+    #     # plotly.offline.plot(fig, filename=os.path.join(directory, base_file_name+"_weights.html"), auto_open=False)
+    #
+    #     return interpret_string
+    #
+    # def update_sample_weighting(self, view_index, formatted_y,):
+    #     weight_strategies = ['c_bound_based', 'c_bound_based_broken']
+    #     import math
+    #     print("\t 1st voter\t", self.view_previous_vote[0][0])
+    #     print("\t Sol\t\t", self.view_q[view_index])
+    #     new_vote = np.multiply(self.view_previous_vote[view_index]+self.view_q[view_index]*self.view_new_voter[view_index], self.sample_weighting[view_index])
+    #     well_class = np.zeros((self.n_total_examples, self.n_view_total))
+    #     for view_ind in range(self.n_view_total):
+    #         if view_ind in self.available_view_indices:
+    #             class_vote = np.multiply(
+    #                 self.view_previous_vote[view_ind] + self.view_q[view_ind] *
+    #                 self.view_new_voter[view_ind],
+    #                 self.sample_weighting[view_ind])
+    #             margins = formatted_y * class_vote
+    #             self.margin[view_index] = margins
+    #             well_class[:, view_ind] = np.array([mg[0] > 0 for mg in margins])
+    #     considered_well_class = well_class[:, view_index] + np.logical_not(well_class.any(axis=1))
+    #     self.view_considered_misclass[view_index] = considered_well_class
+    #     self.view_decisions[view_index] = np.array([vote[0]*y > 0 for vote, y in zip(new_vote, formatted_y)])
+    #     if self.weight_strategy == 'c_bound_based':
+    #         c_bound_based_weighting = formatted_y*new_vote/(new_vote**2+self.weight_add)
+    #         normalized_cbound_weights = c_bound_based_weighting+(math.sqrt(self.weight_add)/2*self.weight_add)
+    #         normalized_cbound_weights /= np.sum(normalized_cbound_weights)
+    #         sample_weights = normalized_cbound_weights
+    #     elif self.weight_strategy == 'c_bound_based_broken':
+    #         c_bound_based_weighting = np.array([y * vote - math.sqrt(self.weight_add) / (
+    #             (vote - math.sqrt(self.weight_add)) ** 2 + self.weight_add)
+    #             if not considered_well_class[sample_index] else y * vote + math.sqrt(self.weight_add) / (
+    #             (vote + math.sqrt(self.weight_add)) ** 2 + self.weight_add)
+    #             for sample_index, (y, vote) in enumerate(zip(formatted_y, new_vote))]).reshape((self.n_total_examples, 1))
+    #         # normalized_cbound_weights = c_bound_based_weighting + (
+    #         #     math.sqrt(self.weight_add) / 2 * self.weight_add)
+    #         sample_weights = c_bound_based_weighting/np.sum(c_bound_based_weighting)
+    #
+    #     elif self.weight_strategy == 'c_bound_based_dec':
+    #         c_bound_based_weighting = np.array([-vote**2 + math.sqrt(self.weight_add)/(2*self.weight_add)
+    #             if not considered_well_class[sample_index] else y * vote + math.sqrt(self.weight_add) / (
+    #             (vote + math.sqrt(self.weight_add)) ** 2 + self.weight_add)
+    #             for sample_index, (y, vote) in enumerate(zip(formatted_y, new_vote))]).reshape((self.n_total_examples, 1))
+    #         # normalized_cbound_weights = c_bound_based_weighting + (
+    #         #     math.sqrt(self.weight_add) / 2 * self.weight_add)
+    #         sample_weights = c_bound_based_weighting/np.sum(c_bound_based_weighting)
+    #     elif self.weight_strategy == 'sigmoid':
+    #         sigmoid_weighting = np.array([1/(1+math.exp(self.sig_mult * vote * y + self.sig_offset)) if not considered_well_class[sample_index] else 1
+    #             for sample_index, (y, vote) in enumerate(zip(formatted_y, new_vote))]).reshape((self.n_total_examples, 1))
+    #         sample_weights = sigmoid_weighting/np.sum(sigmoid_weighting)
+    #
+    #     else:
+    #         raise ValueError("weight_strategy must be in {}, here it is {}".format(weight_strategies, self.weight_strategy))
+    #
+    #     well_class = np.zeros((self.n_total_examples, self.n_view_total))
+    #     for view_ind in range(self.n_view_total):
+    #         if view_ind in self.available_view_indices:
+    #             new_vote = self.view_previous_vote[view_ind] + self.view_q[
+    #                 view_ind] * self.view_new_voter[view_ind]
+    #             margins = formatted_y * new_vote
+    #             self.margin[view_index] = margins
+    #             well_class[:, view_ind] = np.array([mg[0] > 0 for mg in margins])
+    #     min_sample_weights = np.min(sample_weights)
+    #     max_sample_weights = np.max(sample_weights)
+    #     sample_weights = self.normalize(sample_weights)
+    #
+    #     if self.weight_update == "additive":
+    #         sample_weights = self.normalize(sample_weights, range=1, min_interval=-0.5)
+    #         self.sample_weighting[view_index] += sample_weights
+    #     elif self.weight_update == "multiplicative":
+    #         sample_weights = self.normalize(sample_weights, range=2,
+    #                                         min_interval=-1)
+    #
+    #         self.sample_weighting[view_index] *= sample_weights
+    #     elif self.weight_update == "replacement":
+    #         sample_weights = self.normalize(sample_weights, range=1,
+    #                                         min_interval=0)
+    #         self.sample_weighting[view_index] = sample_weights.reshape((self.n_total_examples, 1))
+    #
+    #     self.sample_weighting[view_index] /= np.max(self.sample_weighting[view_index])-np.min(self.sample_weighting[view_index])
+    #     self.sample_weighting[view_index] -= np.min(self.sample_weighting[view_index])
+    #     self.sample_weighting[view_index] /= np.sum(self.sample_weighting[view_index])
+    #     self.sample_weighting[view_index].reshape((self.n_total_examples, 1))
+    #     print("\tMin\t\t", np.min(self.sample_weighting[view_index]))
+    #     print("\tMax\t\t", np.max(self.sample_weighting[view_index]))
+    #
+    # def normalize(self, sample_weights, range=2, min_interval=-1.0):
+    #     min_sample_weights = np.min(sample_weights)
+    #     max_sample_weights = np.max(sample_weights)
+    #     if range is None:
+    #         pass
+    #     else:
+    #         sample_weights = sample_weights*(range/(max_sample_weights-min_sample_weights))-(-min_interval+(range*min_sample_weights)/(max_sample_weights-min_sample_weights))
+    #     return sample_weights
+    #
+    #
+    # def get_best_view_voter(self, ):
+    #     best_margin = 0
+    #     for view_index, (margin, voter_index) in enumerate(self.view_first_voters):
+    #         if margin > best_margin:
+    #             best_margin = margin
+    #             best_view = view_index
+    #             best_voter = voter_index
+    #     self.general_voters.append([best_view, best_voter, 0])
+    #     self.general_weights.append(1.0)
+    #     self.general_previous_vote = np.array(
+    #         self.view_classification_matrix[best_view][:,
+    #         best_voter].reshape(
+    #             (self.n_total_examples, 1)),
+    #         copy=True)
+    #
+    #
+    #
+    # def choose_new_general_voter(self, formatted_y):
+    #     previous_sum = np.multiply(formatted_y, self.general_previous_vote)
+    #     margin_old = np.sum(previous_sum)
+    #     worst_example = 0
+    #     if self.use_previous_voters:
+    #         hypotheses = [self.view_classification_matrix[view_index][:, self.view_chosen_columns_[view_index][it_index]].reshape((self.n_total_examples, 1))*formatted_y
+    #                       if not self.broken_views_iteration[view_index][it_index]
+    #                       else np.zeros((self.n_total_examples, 1))-1
+    #                       for view_index in range(self.n_view_total)
+    #                       for it_index in range(self.it)]
+    #         pv_view_indices = [view_index for view_index in range(self.n_view_total)
+    #                            for it_index in range(self.it)]
+    #         pv_it_indices = [it_index for view_index in range(self.n_view_total)
+    #                          for it_index in range(self.it)]
+    #         n_hyp = len(hypotheses)
+    #         y_kernel_matrix = np.array(hypotheses).reshape((self.n_total_examples, n_hyp))
+    #     else:
+    #         y_kernel_matrix = np.array([self.view_new_voter[view_index]*formatted_y
+    #                                     if not self.broken_views[view_index]
+    #                                     and view_index in self.used_views
+    #                                     else np.zeros((self.n_total_examples, 1))-1
+    #                                     for view_index in range(self.n_view_total)]).reshape((self.n_total_examples,
+    #                                                                                           self.n_view_total))
+    #     bad_margins = \
+    #         np.where(np.sum(y_kernel_matrix, axis=0) <= 0.0)[
+    #             0]
+    #     self.B2 = self.n_total_examples
+    #     self.B1s = np.sum(
+    #         2 * np.multiply(previous_sum, y_kernel_matrix), axis=0)
+    #     self.B0 = np.sum(previous_sum ** 2)
+    #
+    #     self.A2s = np.sum(
+    #         y_kernel_matrix, axis=0) ** 2
+    #     self.A1s = np.sum(
+    #         y_kernel_matrix,
+    #         axis=0) * margin_old * 2
+    #     self.A0 = margin_old ** 2
+    #
+    #     C2s = (self.A1s * self.B2 - self.A2s * self.B1s)
+    #     C1s = 2 * (self.A0 * self.B2 - self.A2s * self.B0)
+    #     C0s = self.A0 * self.B1s - self.A1s * self.B0
+    #
+    #     sols = np.zeros(C0s.shape) - 3
+    #     sols[np.where(C2s != 0)[0]] = (-C1s[
+    #         np.where(C2s != 0)[0]] + np.sqrt(
+    #         C1s[np.where(C2s != 0)[0]] * C1s[
+    #             np.where(C2s != 0)[0]] - 4 * C2s[
+    #             np.where(C2s != 0)[0]] * C0s[
+    #             np.where(C2s != 0)[0]])) / (
+    #         2 * C2s[
+    #             np.where(C2s != 0)[0]])
+    #     c_bounds = self.compute_c_bounds_gen(sols)
+    #     print('\tCbounds\t\t', c_bounds)
+    #     print("\tSols \t\t", sols)
+    #     trans_c_bounds = self.compute_c_bounds_gen(sols + 1)
+    #     masked_c_bounds = ma.array(c_bounds, fill_value=np.inf)
+    #     # Masking maximums
+    #     masked_c_bounds[c_bounds >= trans_c_bounds] = ma.masked
+    #     # Masking margins <= 0
+    #     masked_c_bounds[bad_margins] = ma.masked
+    #     print("\tbad_margins\t", bad_margins)
+    #     # Masking weights < 0 (because self-complemented)
+    #     masked_c_bounds[sols < 0] = ma.masked
+    #     # Masking nan c_bounds
+    #     masked_c_bounds[np.isnan(c_bounds)] = ma.masked
+    #     for view_index, broken in enumerate(self.broken_views):
+    #         if broken:
+    #             masked_c_bounds[view_index] = ma.masked
+    #     print('\tCbounds\t\t', masked_c_bounds)
+    #
+    #     if masked_c_bounds.mask.all():
+    #         return "No more pertinent voters", 0, -1
+    #     else:
+    #
+    #         best_hyp_index = np.argmin(masked_c_bounds)
+    #         self.general_c_bounds.append(
+    #             masked_c_bounds[best_hyp_index])
+    #         self.general_margins.append(
+    #             math.sqrt(self.A2s[best_hyp_index] / self.n_total_examples))
+    #         self.general_disagreements.append(
+    #             0.5 * self.B1s[best_hyp_index] / self.n_total_examples)
+    #         if self.use_previous_voters:
+    #             return sols[best_hyp_index], pv_view_indices[best_hyp_index], pv_it_indices[best_hyp_index]
+    #         else:
+    #             return sols[best_hyp_index], best_hyp_index, -1
+    #
+    # def update_infos(self, view_index, formatted_y):
+    #     self.broken_views_iteration[view_index].append(False)
+    #     self.view_weights_[view_index].append(self.view_q[view_index])
+    #     ones_matrix = np.zeros(formatted_y.shape)
+    #     ones_matrix[
+    #         np.multiply(formatted_y, self.view_new_voter[view_index].reshape(
+    #             formatted_y.shape)) < 0] = 1  # can np.divide if needed
+    #     epsilon = np.average(
+    #         np.multiply(formatted_y, self.view_new_voter[view_index].reshape(
+    #             formatted_y.shape)), axis=0)
+    #     self.view_voter_perfs[view_index].append(epsilon)
+    #
+    #     self.view_tau[view_index].append(
+    #         np.sum(np.multiply(self.view_previous_vote[view_index],
+    #                            self.view_new_voter[view_index])) / float(
+    #             self.n_total_examples))
+    #     self.view_previous_vote[view_index] += self.view_q[view_index] * \
+    #         self.view_new_voter[view_index]
+    #     self.view_norm[view_index].append(
+    #         np.linalg.norm(self.view_previous_vote[view_index]) ** 2)
+    #     self.view_previous_votes[view_index].append(
+    #         self.view_previous_vote[view_index])
+    #     self.view_previous_margins[view_index].append(
+    #         np.sum(np.multiply(formatted_y,
+    #                            self.view_previous_vote[view_index])) / float(
+    #             self.n_total_examples))
+    #     self.view_selected_margins[view_index].append(
+    #         np.sum(np.multiply(formatted_y,
+    #                            self.view_new_voter[view_index])) / float(
+    #             self.n_total_examples))
+    #     train_metric = self.plotted_metric.score(formatted_y, np.sign(
+    #         self.view_previous_vote[view_index]))
+    #     self.view_train_metrics[view_index].append(train_metric)
+    #
+    # def append_new_voter(self, new_voter_index, view_index):
+    #     self.view_chosen_columns_[view_index].append(new_voter_index)
+    #     if self.estimators_generator_name == "Stumps":
+    #         self.view_chosen_features[view_index].append(
+    #             [(int(new_voter_index % (
+    #                 self.view_n_stumps[view_index] * self.view_n_features[
+    #                     view_index]) / self.view_n_stumps[view_index]),
+    #               1)])
+    #     elif self.estimators_generator_name == "Trees":
+    #         self.view_chosen_features[view_index].append([(
+    #             self.view_estimators_generator[view_index].attribute_indices[
+    #                 new_voter_index][fake_ind],
+    #             importance)
+    #             for fake_ind, importance
+    #             in enumerate(
+    #                 self.view_estimators_generator[view_index].estimators_[
+    #                     new_voter_index].feature_importances_)
+    #             if importance > 0])
+    #     self.view_new_voter[view_index] = self.view_classification_matrix[
+    #         view_index][:,
+    #         new_voter_index].reshape(
+    #         (self.n_total_examples, 1))
+    #
+    # def get_new_voter(self, view_index, view_y_kernel_matrices, formatted_y):
+    #     m = view_y_kernel_matrices[view_index].shape[0]
+    #     previous_sum = np.multiply(formatted_y,
+    #                                (self.view_previous_vote[view_index] * self.sample_weighting[view_index]).reshape(
+    #                                    m, 1))
+    #     margin_old = np.sum(previous_sum)
+    #     worst_example = 0
+    #     # worst_example = np.argmin(previous_sum)
+    #
+    #     bad_margins = \
+    #         np.where(np.sum(view_y_kernel_matrices[view_index], axis=0) <= 0.0)[
+    #             0]
+    #
+    #     self.B2 = 1
+    #     self.B1s = np.sum(
+    #         2 * np.multiply(previous_sum, view_y_kernel_matrices[view_index] * self.sample_weighting[view_index]),
+    #         axis=0)
+    #     self.B0 = np.sum(previous_sum ** 2)
+    #
+    #     self.A2s = np.sum(view_y_kernel_matrices[view_index] * self.sample_weighting[view_index], axis=0) ** 2
+    #     self.A1s = np.sum(view_y_kernel_matrices[view_index] * self.sample_weighting[view_index],
+    #                       axis=0) * margin_old * 2
+    #     self.A0 = margin_old ** 2
+    #
+    #     C2s = (self.A1s * self.B2 - self.A2s * self.B1s)
+    #     C1s = 2 * (self.A0 * self.B2 - self.A2s * self.B0)
+    #     C0s = self.A0 * self.B1s - self.A1s * self.B0
+    #
+    #     sols = np.zeros(C0s.shape) - 3
+    #     sols[np.where(C2s != 0)[0]] = m*(-C1s[
+    #         np.where(C2s != 0)[0]] + np.sqrt(
+    #         C1s[np.where(C2s != 0)[0]] * C1s[
+    #             np.where(C2s != 0)[0]] - 4 * C2s[
+    #             np.where(C2s != 0)[0]] * C0s[
+    #             np.where(C2s != 0)[0]])) / (
+    #         2 * C2s[
+    #             np.where(C2s != 0)[0]])
+    #
+    #     c_bounds = self.compute_c_bounds(sols)
+    #     trans_c_bounds = self.compute_c_bounds(sols + 1)
+    #     masked_c_bounds = ma.array(c_bounds, fill_value=np.inf)
+    #     # Masking maximums
+    #     masked_c_bounds[c_bounds >= trans_c_bounds] = ma.masked
+    #     # Masking margins <= 0
+    #     masked_c_bounds[bad_margins] = ma.masked
+    #     # Masking weights < 0 (because self-complemented)
+    #     masked_c_bounds[sols < 0] = ma.masked
+    #     # Masking nan c_bounds
+    #     masked_c_bounds[np.isnan(c_bounds)] = ma.masked
+    #     if not self.twice_the_same:
+    #         masked_c_bounds[self.view_chosen_columns_[view_index]] = ma.masked
+    #
+    #     if masked_c_bounds.mask.all():
+    #         return "No more pertinent voters", 0
+    #     else:
+    #         best_hyp_index = np.argmin(masked_c_bounds)
+    #         # self.try_.append(np.ravel(previous_sum))
+    #         #
+    #         # self.try_2.append(np.reshape(previous_sum ** 2, (87,)) + (2 * sols[best_hyp_index]*y_kernel_matrix[:, best_hyp_index]*np.reshape(previous_sum, (87, ))))
+    #         self.view_c_bounds[view_index].append(
+    #             masked_c_bounds[best_hyp_index])
+    #         self.view_margins[view_index].append(
+    #             math.sqrt(self.A2s[best_hyp_index]))
+    #         self.view_disagreements[view_index].append(
+    #             0.5 * self.B1s[best_hyp_index])
+    #         new_weight = sols[best_hyp_index]/(sum(self.view_weights_[view_index])+sols[best_hyp_index])
+    #         return new_weight, best_hyp_index
+    #
+    # def compute_c_bounds(self, sols):
+    #     return 1 - (self.A2s * sols ** 2 + self.A1s * sols + self.A0) / (
+    #         self.B2 * sols ** 2 + self.B1s * sols + self.B0)
+    #
+    # def compute_c_bounds_gen(self, sols):
+    #     return 1 - (self.A2s * sols ** 2 + self.A1s * sols + self.A0) / ((
+    #         self.B2 * sols ** 2 + self.B1s * sols + self.B0)*self.n_total_examples)
+    #
+    # def init_boosting(self, view_index, view_first_voter_index, formatted_y):
+    #     self.view_chosen_columns_[view_index].append(
+    #         view_first_voter_index[view_index])
+    #     self.view_new_voter[view_index] = np.array(
+    #         self.view_classification_matrix[view_index][:,
+    #         view_first_voter_index[view_index]].reshape(
+    #             (self.n_total_examples, 1)),
+    #         copy=True)
+    #
+    #     self.view_previous_vote[view_index] = self.view_new_voter[view_index]
+    #     self.view_norm[view_index].append(
+    #         np.linalg.norm(self.view_previous_vote[view_index]) ** 2)
+    #     self.view_q[view_index] = 1
+    #     self.view_weights_[view_index].append(self.view_q[view_index])
+    #
+    #     self.view_previous_margins.append(
+    #         np.sum(np.multiply(formatted_y,
+    #                            self.view_previous_vote[view_index])) / float(
+    #             self.n_total_examples))
+    #     self.view_selected_margins[view_index].append(
+    #         np.sum(
+    #             np.multiply(formatted_y, self.view_previous_vote[view_index])))
+    #     self.view_tau[view_index].append(
+    #         np.sum(np.multiply(self.view_previous_vote[view_index],
+    #                            self.view_new_voter[view_index])) / float(
+    #             self.n_total_examples))
+    #
+    #     train_metric = self.plotted_metric.score(formatted_y, np.sign(
+    #         self.view_previous_vote[view_index]))
+    #     self.view_train_metrics[view_index].append(train_metric)
+    #
+    # def get_first_voter(self, view_index, view_first_voter_index, view_y_kernel_matrices):
+    #     if self.random_start:
+    #         view_first_voter_index[view_index] = self.random_state.choice(
+    #             np.where(
+    #                 np.sum(view_y_kernel_matrices[view_index], axis=0) > 0)[0])
+    #         margin = np.sum(view_y_kernel_matrices[view_index][:, view_first_voter_index[view_index]] * self.sample_weighting[view_index])
+    #     else:
+    #         pseudo_h_values = ma.array(
+    #             np.sum(view_y_kernel_matrices[view_index] * self.sample_weighting[view_index], axis=0),
+    #             fill_value=-np.inf)
+    #         view_first_voter_index[view_index] = np.argmax(pseudo_h_values)
+    #
+    #         margin = pseudo_h_values[view_first_voter_index[view_index]]
+    #     self.view_decisions[view_index] = (view_y_kernel_matrices[view_index][:, view_first_voter_index[view_index]] > 0).reshape((self.n_total_examples, 1))
+    #     return view_first_voter_index, margin
+    #
+    # def init_estimator_generator(self, view_index):
+    #     if self.estimators_generator == "Stumps":
+    #         self.view_estimators_generator[
+    #             view_index] = BoostUtils.StumpsClassifiersGenerator(
+    #             n_stumps_per_attribute=self.n_stumps,
+    #             self_complemented=self.self_complemented)
+    #     if self.estimators_generator == "Trees":
+    #         self.view_estimators_generator[
+    #             view_index] = BoostUtils.TreeClassifiersGenerator(
+    #             n_trees=self.n_stumps, max_depth=self.max_depth,
+    #             self_complemented=self.self_complemented)
+    #
+    # def get_view_vote(self, X, sample_indices, view_index,):
+    #     classification_matrix = self.get_classification_matrix(X,
+    #                                                            sample_indices,
+    #                                                            view_index, )
+    #
+    #     margins = np.sum(classification_matrix * self.view_weights_[view_index],
+    #                      axis=1)
+    #     signs_array = np.array([int(x) for x in BoostUtils.sign(margins)])
+    #     signs_array[signs_array == -1] = 0
+    #     return signs_array
+    #
+    # def get_classification_matrix(self, X, sample_indices, view_index, ):
+    #     if self.view_estimators_generator[view_index].__class__.__name__ == "TreeClassifiersGenerator":
+    #         probas = np.asarray(
+    #             [clf.predict_proba(
+    #                 X.get_v(view_index, sample_indices)[:, attribute_indices])
+    #              for
+    #              clf, attribute_indices in
+    #              zip(self.view_estimators_generator[view_index].estimators_,
+    #                  self.view_estimators_generator[
+    #                      view_index].attribute_indices)])
+    #     else:
+    #         probas = np.asarray(
+    #             [clf.predict_proba(X.get_v(view_index, sample_indices)) for clf
+    #              in
+    #              self.view_estimators_generator[view_index].estimators_])
+    #     predicted_labels = np.argmax(probas, axis=2)
+    #     predicted_labels[predicted_labels == 0] = -1
+    #     values = np.max(probas, axis=2)
+    #     return (predicted_labels * values).T
+    #
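The adapter above converts summit's dataset wrapper into the plain {view_index: ndarray} dict that the external mv_cb_boost package expects. A minimal sketch of that calling convention, with a stand-in dataset object (the toy class and data here are illustrative, not summit's implementation):

    import numpy as np

    class ToyMultiviewDataset:
        """Stand-in for summit's dataset wrapper: one ndarray per view."""
        def __init__(self, views):
            self.views = views

        def get_v(self, view_index, sample_indices):
            return self.views[view_index][sample_indices]

    views = {0: np.random.rand(10, 3), 1: np.random.rand(10, 5)}
    dataset = ToyMultiviewDataset(views)
    train_indices = np.arange(8)

    # Same conversion as MultiviewCBoundBoostingAdapt.fit: one entry per view,
    # restricted to the training samples, keyed by view index.
    input_X = {view_index: dataset.get_v(view_index, train_indices)
               for view_index in views}
    print({k: v.shape for k, v in input_X.items()})  # {0: (8, 3), 1: (8, 5)}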
diff --git a/summit/multiview_platform/multiview_classifiers/multiview_cbound_boosting.py b/summit/multiview_platform/multiview_classifiers/multiview_cbound_boosting.py
deleted file mode 100644
index aa92f3d6c445204d4b2246a6abb540b6cbea58db..0000000000000000000000000000000000000000
--- a/summit/multiview_platform/multiview_classifiers/multiview_cbound_boosting.py
+++ /dev/null
@@ -1,616 +0,0 @@
-from sklearn.tree import DecisionTreeClassifier
-import numpy as np
-import numpy.ma as ma
-import math
-import os
-import pandas as pd
-
-from multimodal.boosting.mumbo import MumboClassifier
-
-from ..multiview.multiview_utils import BaseMultiviewClassifier
-from ..monoview_classifiers.additions import CBBoostUtils, BoostUtils
-from ..utils.hyper_parameter_search import CustomRandint
-from ..utils.dataset import get_samples_views_indices
-from .. import metrics
-from ..utils.base import base_boosting_estimators
-from ..utils.organization import secure_file_path
-from .. import monoview_classifiers
-
-classifier_class_name = "MultiviewCBoundBoosting"
-
-class MultiviewCBoundBoosting(BaseMultiviewClassifier, CBBoostUtils.CBBoostClassifier):
-
-    def __init__(self, n_max_iterations=10, random_state=None,
-                 self_complemented=True, twice_the_same=False,
-                 random_start=False, n_stumps=1, c_bound_sol=True,
-                 estimators_generator="Stumps", mincq_tracking=False,
-                 weight_add=3, weight_strategy="c_bound_based_broken",
-                 weight_update = "multiplicative", **kwargs):
-        BaseMultiviewClassifier.__init__(self, random_state)
-        self.param_names = ["n_max_iterations","random_state"]
-        self.distribs = [CustomRandint(5,200), [random_state]]
-        self.n_max_iterations = n_max_iterations
-        self.random_state = random_state
-        self.self_complemented = self_complemented
-        self.twice_the_same = twice_the_same
-        self.random_start = random_start
-        self.n_stumps = n_stumps
-        self.c_bound_sol = c_bound_sol
-        self.estimators_generator = estimators_generator
-        self.estimators_generator_name = estimators_generator
-        self.mincq_tracking = mincq_tracking
-        self.plotted_metric = metrics.zero_one_loss
-        self.weight_add = weight_add
-        self.weight_strategy = weight_strategy
-        self.weight_update = weight_update
-
-    def init_lists(self, X, view_indices,):
-        self.used_views = view_indices
-        self.view_names = [X.get_view_name(view_index)
-                           for view_index in view_indices]
-
-        # Todo HDF5 compatible
-        self.view_n_stumps = [self.n_stumps for _ in range(X.nb_view)]
-        self.view_n_features = [X.get_v(view_index).shape[1] for view_index in
-                                range(X.nb_view)]
-
-        self.view_estimators_generator = [_ for _ in range(X.nb_view)]
-        self.view_classification_matrix = [_ for _ in range(X.nb_view)]
-        self.view_train_shapes = [_ for _ in range(X.nb_view)]
-
-        self.view_chosen_columns_ = [[] for _ in range(X.nb_view)]
-        self.view_new_voter = [_ for _ in range(X.nb_view)]
-        self.view_previous_vote = [_ for _ in range(X.nb_view)]
-        self.view_q = [_ for _ in range(X.nb_view)]
-        self.view_train_metrics = [[] for _ in range(X.nb_view)]
-        self.view_norm = [[] for _ in range(X.nb_view)]
-        self.view_weights_ = [[] for _ in range(X.nb_view)]
-        self.view_previous_margins = [[] for _ in range(X.nb_view)]
-        self.view_selected_margins = [[] for _ in range(X.nb_view)]
-        self.view_tau = [[] for _ in range(X.nb_view)]
-        self.view_voter_perfs = [[] for _ in range(X.nb_view)]
-        self.view_chosen_features = [[] for _ in range(X.nb_view)]
-        self.view_previous_votes = [[] for _ in range(X.nb_view)]
-        self.view_c_bounds = [[] for _ in range(X.nb_view)]
-        self.view_margins = [[] for _ in range(X.nb_view)]
-        self.view_disagreements = [[] for _ in range(X.nb_view)]
-        self.view_decisions = [[] for _ in range(X.nb_view)]
-        self.margin = [_ for _ in range(self.n_view_total)]
-        self.view_considered_misclass = [_ for _ in range(self.n_view_total)]
-
-    def fit(self, X, y, train_indices=None, view_indices=None):
-
-        train_indices, view_indices = get_samples_views_indices(X,
-                                                                train_indices,
-                                                                view_indices)
-        self.used_labels = y[train_indices]
-        self.n_view_total = X.nb_view
-        view_y_kernel_matrices = [_ for _ in range(X.nb_view)]
-        view_first_voter_index = [_ for _ in range(X.nb_view)]
-        self.general_voters = []
-        self.general_weights = []
-        self.general_c_bounds = []
-        self.margins = []
-        self.general_margins = []
-        self.sample_weightings = []
-        self.broken_views = [False for _ in range(self.n_view_total)]
-        self.general_disagreements = []
-        self.decisions = []
-        self.considered_misclass = [np.zeros((self.n_view_total, len(train_indices)))]
-        self.init_lists(X, view_indices, )
-        self.sample_weighting = [np.ones((train_indices.shape[0], 1)) / train_indices.shape[0] if _ in view_indices else "" for _ in range(X.nb_view)]
-        self.sample_weightings.append([s.copy() for s in self.sample_weighting])
-
-
-        self.view_first_voters = [[] for _ in range(X.nb_view)]
-        for view_index in view_indices:
-
-            formatted_X, formatted_y = self.format_X_y(X.get_v(view_index, sample_indices=train_indices), y[train_indices])
-            self.init_estimator_generator(view_index)
-            self.view_estimators_generator[view_index].fit(formatted_X, formatted_y)
-
-            self.view_classification_matrix[view_index] = self.get_classification_matrix(X, train_indices, view_index)
-
-            self.view_train_shapes[view_index] = self.view_classification_matrix[view_index].shape
-            self.n_total_examples, n = self.view_classification_matrix[view_index].shape
-
-            view_y_kernel_matrices[view_index] = np.multiply(formatted_y, self.view_classification_matrix[view_index])
-
-            view_first_voter_index, margin = self.get_first_voter(view_index, view_first_voter_index, view_y_kernel_matrices)
-            self.margin[view_index] = view_y_kernel_matrices[view_index][:, view_first_voter_index]
-            self.view_first_voters[view_index] = [margin, view_first_voter_index[view_index]]
-            self.init_boosting(view_index, view_first_voter_index, formatted_y)
-        self.decisions.append([d.copy() for d in self.view_decisions])
-        self.margins.append([m.copy() for m in self.margin])
-        self.view_break_cause = [" the maximum number of iterations was attained."
-                                 for _ in range(X.nb_view)]
-        self.available_view_indices = view_indices.copy()
-        self.get_best_view_voter()
-
-        for boosting_iter_index in range(self.n_max_iterations):
-            self.it = boosting_iter_index+1
-            print("iteration ", self.it)
-            for view_index in self.available_view_indices:
-
-                self.view_q[view_index], new_voter_index = self.get_new_voter(view_index, view_y_kernel_matrices, formatted_y)
-
-                if type(self.view_q[view_index]) == str:
-                    self.view_break_cause[view_index] = new_voter_index  #
-                    self.available_view_indices.remove(view_index)
-                    self.broken_views[view_index] = True
-                    break
-
-                self.append_new_voter(new_voter_index, view_index)
-                self.update_sample_weighting(view_index, formatted_y)
-                self.update_infos(view_index, formatted_y)
-            self.sample_weightings.append([s.copy() for s in self.sample_weighting])
-            self.decisions.append([d.copy() for d in self.view_decisions])
-            self.margins.append([m.copy() for m in self.margin])
-            self.considered_misclass.append([c.copy() for c in self.view_considered_misclass])
-
-            print("\tn_cols_chosen\t", [len(self.view_chosen_columns_[i]) for i in self.used_views])
-            print("\tbroken\t\t", self.broken_views)
-            self.general_q, new_voter_view_index = self.choose_new_general_voter(formatted_y)
-            print("\tChosen_view\t", new_voter_view_index)
-            if type(self.general_q) == str:
-                self.break_cause = new_voter_view_index
-                self.it -= 1
-                break
-            self.general_voters.append([new_voter_view_index, self.view_chosen_columns_[new_voter_view_index][-1]])
-            self.general_weights.append(self.general_q)
-
-
-
-        for view_index in view_indices:
-            self.view_estimators_generator[view_index].choose(self.view_chosen_columns_[view_index])
-            self.view_weights_[view_index] = np.array(self.view_weights_[view_index]) / np.sum(
-                np.array(self.view_weights_[view_index]))
-        self.general_weights = np.array(self.general_weights)/np.sum(np.array(self.general_weights))
-        # quit()
-        return self
-
-    def predict(self, X, sample_indices=None, view_indices=None):
-        sample_indices, view_indices = get_samples_views_indices(X,
-                                                                 sample_indices,
-                                                                 view_indices)
-        self._check_views(view_indices)
-        view_classification_matrix = [_ for _ in range(self.n_view_total)]
-        vote = []
-        for view_index in range(self.n_view_total):
-            if view_index in view_indices:
-                view_classification_matrix[view_index] = self.get_classification_matrix(X,
-                                                                                        sample_indices,
-                                                                                        view_index, )
-            else:
-                pass
-        for iter_index, (view_index, voter_indice) in enumerate(self.general_voters):
-            vote.append(view_classification_matrix[view_index][:, iter_index])
-        vote = np.array(vote)
-        print((vote * self.general_weights.reshape((self.it+1, 1))).shape)
-        margins = np.sum(vote * self.general_weights.reshape((self.it+1, 1)), axis=0)
-        print(margins.shape)
-        signs_array = np.array([int(x) for x in BoostUtils.sign(margins)])
-        signs_array[signs_array == -1] = 0
-        return signs_array
-
-    def transform_sample_weights(self):
-        df = pd.DataFrame(columns=["weight", "view", "sample", 'iteration', "right", "margin", "mixed_sample_view", "considered_mis_class"])
-        i = 0
-        self.min_weight = 100
-        self.max_weight = -100
-
-        for iter_index, view_sample_weights in enumerate(self.sample_weightings):
-            print(iter_index)
-            for view_index, sample_weights in enumerate(view_sample_weights):
-                for sample_index, weight in enumerate(sample_weights):
-                    weight = weight[0]*10
-                    df.loc[i] = [weight, view_index, sample_index, iter_index, self.decisions[iter_index][view_index][sample_index][0], abs(self.margins[iter_index][view_index][sample_index][0]), view_index+sample_index*self.n_view_total, self.considered_misclass[iter_index][view_index][sample_index]]
-                    i += 1
-                    if weight < self.min_weight:
-                        self.min_weight = weight
-                    elif weight > self.max_weight:
-                        self.max_weight = weight
-        return df
-
-    def get_interpretation(self, directory, base_file_name, labels, multiclass=False):
-        self.view_importances = np.zeros(self.n_view_total)
-        for (view, index), weight in zip(self.general_voters, self.general_weights):
-            self.view_importances[view] += weight
-        self.view_importances /= np.sum(self.view_importances)
-        interpret_string = str(self.view_importances)
-        df = self.transform_sample_weights()
-        import plotly.express as px
-        fig = px.scatter(df, x="mixed_sample_view", y="weight", animation_frame="iteration", animation_group='mixed_sample_view', size="margin",
-                         color="right", text="view", hover_name="sample", hover_data=["weight", "view", "sample", 'iteration', "right", "margin", "mixed_sample_view", "considered_mis_class"], range_x=[0, self.n_total_examples*self.n_view_total], range_y=[self.min_weight, self.max_weight]
-                         )
-        fig.show()
-        quit()
-
-        return interpret_string
-
-    def update_sample_weighting(self, view_index, formatted_y,):
-        weight_strategies = ['c_bound_based', ]
-        import math
-        print("\t 1st voter\t", self.view_previous_vote[0][0])
-        print("\t Sol\t\t", self.view_q[view_index])
-        new_vote = np.multiply(self.view_previous_vote[view_index]+self.view_q[view_index]*self.view_new_voter[view_index], self.sample_weighting[view_index])
-        well_class = np.zeros((self.n_total_examples, self.n_view_total))
-        for view_ind in range(self.n_view_total):
-            class_vote = np.multiply(
-                self.view_previous_vote[view_ind] + self.view_q[view_ind] *
-                self.view_new_voter[view_ind],
-                self.sample_weighting[view_ind])
-            margins = formatted_y * class_vote
-            self.margin[view_index] = margins
-            well_class[:, view_ind] = np.array([mg[0] > 0 for mg in margins])
-        print(well_class)
-        considered_well_class = well_class[:, view_index] + np.logical_not(well_class.any(axis=1))
-        self.view_considered_misclass[view_index] = considered_well_class
-        self.view_decisions[view_index] = np.array([vote[0]*y > 0 for vote, y in zip(new_vote, formatted_y)])
-        if self.weight_strategy == 'c_bound_based':
-            c_bound_based_weighting = formatted_y*new_vote/(new_vote**2+self.weight_add)
-            normalized_cbound_weights = c_bound_based_weighting+(math.sqrt(self.weight_add)/2*self.weight_add)
-            normalized_cbound_weights /= np.sum(normalized_cbound_weights)
-            sample_weights = normalized_cbound_weights
-        elif self.weight_strategy == 'c_bound_based_broken':
-            c_bound_based_weighting = np.array([y * vote - math.sqrt(self.weight_add) / (
-                (vote - math.sqrt(self.weight_add)) ** 2 + self.weight_add)
-                if not considered_well_class[sample_index] else y * vote + math.sqrt(self.weight_add) / (
-                (vote + math.sqrt(self.weight_add)) ** 2 + self.weight_add)
-                for sample_index, (y, vote) in enumerate(zip(formatted_y, new_vote))]).reshape((self.n_total_examples, 1))
-            # normalized_cbound_weights = c_bound_based_weighting + (
-            #     math.sqrt(self.weight_add) / 2 * self.weight_add)
-            sample_weights = c_bound_based_weighting/np.sum(c_bound_based_weighting)
-
-        elif self.weight_strategy == 'c_bound_based_dec':
-            c_bound_based_weighting = np.array([-vote**2 + math.sqrt(self.weight_add)/(2*self.weight_add)
-                if not considered_well_class[sample_index] else y * vote + math.sqrt(self.weight_add) / (
-                (vote + math.sqrt(self.weight_add)) ** 2 + self.weight_add)
-                for sample_index, (y, vote) in enumerate(zip(formatted_y, new_vote))]).reshape((self.n_total_examples, 1))
-            # normalized_cbound_weights = c_bound_based_weighting + (
-            #     math.sqrt(self.weight_add) / 2 * self.weight_add)
-            sample_weights = c_bound_based_weighting/np.sum(c_bound_based_weighting)
-
-        else:
-            raise ValueError("weight_strategy must be in {}, here it is {}".format(weight_strategies, self.weight_strategy))
-
-        well_class = np.zeros((self.n_total_examples, self.n_view_total))
-        for view_ind in range(self.n_view_total):
-            new_vote = self.view_previous_vote[view_ind] + self.view_q[
-                view_ind] * self.view_new_voter[view_ind]
-            margins = formatted_y * new_vote
-            self.margin[view_index] = margins
-            well_class[:, view_ind] = np.array([mg[0] > 0 for mg in margins])
-        min_sample_weights = np.min(sample_weights)
-        max_sample_weights = np.max(sample_weights)
-        sample_weights = self.normalize(sample_weights)
-
-        if self.weight_update == "additive":
-            sample_weights = self.normalize(sample_weights, range=1, min_interval=-0.5)
-            self.sample_weighting[view_index] += sample_weights
-        elif self.weight_update == "multiplicative":
-            sample_weights = self.normalize(sample_weights, range=2,
-                                            min_interval=-1)
-
-            self.sample_weighting[view_index] *= sample_weights
-        elif self.weight_update == "replacement":
-            sample_weights = self.normalize(sample_weights, range=1,
-                                            min_interval=0)
-            self.sample_weighting[view_index] = sample_weights.reshape((self.n_total_examples, 1))
-
-        self.sample_weighting[view_index] /= np.max(self.sample_weighting[view_index])-np.min(self.sample_weighting[view_index])
-        self.sample_weighting[view_index] -= np.min(self.sample_weighting[view_index])
-        self.sample_weighting[view_index] /= np.sum(self.sample_weighting[view_index])
-        print("\tMin\t\t", np.min(self.sample_weighting[view_index]))
-        print("\tMax\t\t", np.max(self.sample_weighting[view_index]))
-
-    def normalize(self, sample_weights, range=2, min_interval=-1.0):
-        min_sample_weights = np.min(sample_weights)
-        max_sample_weights = np.max(sample_weights)
-        if range is None:
-            pass
-        else:
-            sample_weights = sample_weights*(range/(max_sample_weights-min_sample_weights))-(-min_interval+(range*min_sample_weights)/(max_sample_weights-min_sample_weights))
-        return sample_weights
-
-
-    def get_best_view_voter(self, ):
-        best_margin = 0
-        for view_index, (margin, voter_index) in enumerate(self.view_first_voters):
-            if margin > best_margin:
-                best_margin = margin
-                best_view = view_index
-                best_voter = voter_index
-        self.general_voters.append([best_view, best_voter])
-        self.general_weights.append(1.0)
-        self.general_previous_vote = np.array(
-            self.view_classification_matrix[best_view][:,
-            best_voter].reshape(
-                (self.n_total_examples, 1)),
-            copy=True)
-
-
-
-    def choose_new_general_voter(self, formatted_y):
-        previous_sum = np.multiply(formatted_y, self.general_previous_vote)
-        margin_old = np.sum(previous_sum)
-        worst_example = 0
-        # worst_example = np.argmin(previous_sum)
-        y_kernel_matrix = np.array([self.view_new_voter[view_index]*formatted_y if not self.broken_views[view_index] and view_index in self.used_views else np.zeros((self.n_total_examples, 1))-1 for view_index in range(self.n_view_total)]).reshape((self.n_total_examples, self.n_view_total))
-        bad_margins = \
-            np.where(np.sum(y_kernel_matrix, axis=0) <= 0.0)[
-                0]
-        self.B2 = self.n_total_examples
-        self.B1s = np.sum(
-            2 * np.multiply(previous_sum, y_kernel_matrix), axis=0)
-        self.B0 = np.sum(previous_sum ** 2)
-
-        self.A2s = np.sum(
-            y_kernel_matrix, axis=0) ** 2
-        self.A1s = np.sum(
-            y_kernel_matrix,
-            axis=0) * margin_old * 2
-        self.A0 = margin_old ** 2
-
-        C2s = (self.A1s * self.B2 - self.A2s * self.B1s)
-        C1s = 2 * (self.A0 * self.B2 - self.A2s * self.B0)
-        C0s = self.A0 * self.B1s - self.A1s * self.B0
-
-        sols = np.zeros(C0s.shape) - 3
-        sols[np.where(C2s != 0)[0]] = (-C1s[
-            np.where(C2s != 0)[0]] + np.sqrt(
-            C1s[np.where(C2s != 0)[0]] * C1s[
-                np.where(C2s != 0)[0]] - 4 * C2s[
-                np.where(C2s != 0)[0]] * C0s[
-                np.where(C2s != 0)[0]])) / (
-            2 * C2s[
-                np.where(C2s != 0)[0]])
-
-        c_bounds = self.compute_c_bounds(sols)
-        print('\tCbounds\t\t', c_bounds)
-        print("\tSols \t\t", sols)
-        trans_c_bounds = self.compute_c_bounds(sols + 1)
-        masked_c_bounds = ma.array(c_bounds, fill_value=np.inf)
-        # Masing Maximums
-        masked_c_bounds[c_bounds >= trans_c_bounds] = ma.masked
-        # Masking magrins <= 0
-        masked_c_bounds[bad_margins] = ma.masked
-        print("\tbad_margins\t", bad_margins)
-        # Masking weights < 0 (because self-complemented)
-        # masked_c_bounds[sols < 0] = ma.masked
-        # Masking nan c_bounds
-        masked_c_bounds[np.isnan(c_bounds)] = ma.masked
-        for view_index, broken in enumerate(self.broken_views):
-            if broken:
-                masked_c_bounds[view_index] = ma.masked
-        print('\tCbounds\t\t', masked_c_bounds)
-
-        if masked_c_bounds.mask.all():
-            return "No more pertinent voters", 0
-        else:
-            best_hyp_index = np.argmin(masked_c_bounds)
-            self.general_c_bounds.append(
-                masked_c_bounds[best_hyp_index])
-            self.general_margins.append(
-                math.sqrt(self.A2s[best_hyp_index] / self.n_total_examples))
-            self.general_disagreements.append(
-                0.5 * self.B1s[best_hyp_index] / self.n_total_examples)
-            return sols[best_hyp_index], best_hyp_index
-
-    def update_infos(self, view_index, formatted_y):
-        self.view_weights_[view_index].append(self.view_q[view_index])
-
-        ones_matrix = np.zeros(formatted_y.shape)
-        ones_matrix[
-            np.multiply(formatted_y, self.view_new_voter[view_index].reshape(
-                formatted_y.shape)) < 0] = 1  # can np.divide if needed
-        epsilon = np.average(
-            np.multiply(formatted_y, self.view_new_voter[view_index].reshape(
-                formatted_y.shape)), axis=0)
-        self.view_voter_perfs[view_index].append(epsilon)
-
-        self.view_tau[view_index].append(
-            np.sum(np.multiply(self.view_previous_vote[view_index],
-                               self.view_new_voter[view_index])) / float(
-                self.n_total_examples))
-        self.view_previous_vote[view_index] += self.view_q[view_index] * \
-            self.view_new_voter[view_index]
-        self.view_norm[view_index].append(
-            np.linalg.norm(self.view_previous_vote[view_index]) ** 2)
-        self.view_previous_votes[view_index].append(
-            self.view_previous_vote[view_index])
-        self.view_previous_margins[view_index].append(
-            np.sum(np.multiply(formatted_y,
-                               self.view_previous_vote[view_index])) / float(
-                self.n_total_examples))
-        self.view_selected_margins[view_index].append(
-            np.sum(np.multiply(formatted_y,
-                               self.view_new_voter[view_index])) / float(
-                self.n_total_examples))
-        train_metric = self.plotted_metric.score(formatted_y, np.sign(
-            self.view_previous_vote[view_index]))
-        self.view_train_metrics[view_index].append(train_metric)
-
-    def append_new_voter(self, new_voter_index, view_index):
-        self.view_chosen_columns_[view_index].append(new_voter_index)
-        if self.estimators_generator_name == "Stumps":
-            self.view_chosen_features[view_index].append(
-                [(int(new_voter_index % (
-                    self.view_n_stumps[view_index] * self.view_n_features[
-                        view_index]) / self.view_n_stumps[view_index]),
-                  1)])
-        elif self.estimators_generator_name == "Trees":
-            self.view_chosen_features[view_index].append([(
-                self.view_estimators_generator[view_index].attribute_indices[
-                    new_voter_index][fake_ind],
-                importance)
-                for fake_ind, importance
-                in enumerate(
-                    self.view_estimators_generator[view_index].estimators_[
-                        new_voter_index].feature_importances_)
-                if importance > 0])
-        self.view_new_voter[view_index] = self.view_classification_matrix[
-            view_index][:,
-            new_voter_index].reshape(
-            (self.n_total_examples, 1))
-
-    def get_new_voter(self, view_index, view_y_kernel_matrices, formatted_y):
-        m = view_y_kernel_matrices[view_index].shape[0]
-        previous_sum = np.multiply(formatted_y,
-                                   (self.view_previous_vote[view_index] * self.sample_weighting[view_index]).reshape(
-                                       m, 1))
-        margin_old = np.sum(previous_sum)
-        worst_example = 0
-        # worst_example = np.argmin(previous_sum)
-
-        bad_margins = \
-            np.where(np.sum(view_y_kernel_matrices[view_index], axis=0) <= 0.0)[
-                0]
-
-        self.B2 = 1
-        self.B1s = np.sum(
-            2 * np.multiply(previous_sum, view_y_kernel_matrices[view_index] * self.sample_weighting[view_index]),
-            axis=0)
-        self.B0 = np.sum(previous_sum ** 2)
-
-        self.A2s = np.sum(view_y_kernel_matrices[view_index] * self.sample_weighting[view_index], axis=0) ** 2
-        self.A1s = np.sum(view_y_kernel_matrices[view_index] * self.sample_weighting[view_index],
-                          axis=0) * margin_old * 2
-        self.A0 = margin_old ** 2
-
-        C2s = (self.A1s * self.B2 - self.A2s * self.B1s)
-        C1s = 2 * (self.A0 * self.B2 - self.A2s * self.B0)
-        C0s = self.A0 * self.B1s - self.A1s * self.B0
-
-        sols = np.zeros(C0s.shape) - 3
-        sols[np.where(C2s != 0)[0]] = m*(-C1s[
-            np.where(C2s != 0)[0]] + np.sqrt(
-            C1s[np.where(C2s != 0)[0]] * C1s[
-                np.where(C2s != 0)[0]] - 4 * C2s[
-                np.where(C2s != 0)[0]] * C0s[
-                np.where(C2s != 0)[0]])) / (
-            2 * C2s[
-                np.where(C2s != 0)[0]])
-
-        c_bounds = self.compute_c_bounds(sols)
-        trans_c_bounds = self.compute_c_bounds(sols + 1)
-        masked_c_bounds = ma.array(c_bounds, fill_value=np.inf)
-        # Masing Maximums
-        masked_c_bounds[c_bounds >= trans_c_bounds] = ma.masked
-        # Masking magrins <= 0
-        masked_c_bounds[bad_margins] = ma.masked
-        # Masking weights < 0 (because self-complemented)
-        masked_c_bounds[sols < 0] = ma.masked
-        # Masking nan c_bounds
-        masked_c_bounds[np.isnan(c_bounds)] = ma.masked
-        if not self.twice_the_same:
-            masked_c_bounds[self.view_chosen_columns_[view_index]] = ma.masked
-
-        if masked_c_bounds.mask.all():
-            return "No more pertinent voters", 0
-        else:
-            best_hyp_index = np.argmin(masked_c_bounds)
-            # self.try_.append(np.ravel(previous_sum))
-            #
-            # self.try_2.append(np.reshape(previous_sum ** 2, (87,)) + (2 * sols[best_hyp_index]*y_kernel_matrix[:, best_hyp_index]*np.reshape(previous_sum, (87, ))))
-            self.view_c_bounds[view_index].append(
-                masked_c_bounds[best_hyp_index])
-            self.view_margins[view_index].append(
-                math.sqrt(self.A2s[best_hyp_index]))
-            self.view_disagreements[view_index].append(
-                0.5 * self.B1s[best_hyp_index])
-            return sols[best_hyp_index], best_hyp_index
-
-    def compute_c_bounds(self, sols):
-        return 1 - (self.A2s * sols ** 2 + self.A1s * sols + self.A0) / (
-            self.B2 * sols ** 2 + self.B1s * sols + self.B0)
-
-    def init_boosting(self, view_index, view_first_voter_index, formatted_y):
-        self.view_chosen_columns_[view_index].append(
-            view_first_voter_index[view_index])
-        self.view_new_voter[view_index] = np.array(
-            self.view_classification_matrix[view_index][:,
-            view_first_voter_index[view_index]].reshape(
-                (self.n_total_examples, 1)),
-            copy=True)
-
-        self.view_previous_vote[view_index] = self.view_new_voter[view_index]
-        self.view_norm[view_index].append(
- np.linalg.norm(self.view_previous_vote[view_index]) ** 2) - self.view_q[view_index] = 1 - self.view_weights_[view_index].append(self.view_q[view_index]) - - self.view_previous_margins.append( - np.sum(np.multiply(formatted_y, - self.view_previous_vote[view_index])) / float( - self.n_total_examples)) - self.view_selected_margins[view_index].append( - np.sum( - np.multiply(formatted_y, self.view_previous_vote[view_index]))) - self.view_tau[view_index].append( - np.sum(np.multiply(self.view_previous_vote[view_index], - self.view_new_voter[view_index])) / float( - self.n_total_examples)) - - train_metric = self.plotted_metric.score(formatted_y, np.sign( - self.view_previous_vote[view_index])) - self.view_train_metrics[view_index].append(train_metric) - - def get_first_voter(self, view_index, view_first_voter_index, view_y_kernel_matrices): - if self.random_start: - view_first_voter_index[view_index] = self.random_state.choice( - np.where( - np.sum(view_y_kernel_matrices[view_index], axis=0) > 0)[0]) - margin = np.sum(view_y_kernel_matrices[view_index][:, view_first_voter_index[view_index]] * self.sample_weighting[view_index]) - else: - pseudo_h_values = ma.array( - np.sum(view_y_kernel_matrices[view_index] * self.sample_weighting[view_index], axis=0), - fill_value=-np.inf) - view_first_voter_index[view_index] = np.argmax(pseudo_h_values) - - margin = pseudo_h_values[view_first_voter_index[view_index]] - self.view_decisions[view_index] = (view_y_kernel_matrices[view_index][:,view_first_voter_index[view_index]] > 0).reshape((self.n_total_examples, 1)) - return view_first_voter_index, margin - - def init_estimator_generator(self, view_index): - if self.estimators_generator is "Stumps": - self.view_estimators_generator[ - view_index] = BoostUtils.StumpsClassifiersGenerator( - n_stumps_per_attribute=self.n_stumps, - self_complemented=self.self_complemented) - if self.estimators_generator is "Trees": - self.view_estimators_generator[ - view_index] = BoostUtils.TreeClassifiersGenerator( - n_trees=self.n_stumps, max_depth=self.max_depth, - self_complemented=self.self_complemented) - - def get_view_vote(self ,X, sample_indices, view_index,): - classification_matrix = self.get_classification_matrix(X, - sample_indices, - view_index, ) - - margins = np.sum(classification_matrix * self.view_weights_[view_index], - axis=1) - signs_array = np.array([int(x) for x in BoostUtils.sign(margins)]) - signs_array[signs_array == -1] = 0 - return signs_array - - def get_classification_matrix(self, X, sample_indices, view_index, ): - if self.view_estimators_generator[view_index].__class__.__name__ == "TreeClassifiersGenerator": - probas = np.asarray( - [clf.predict_proba( - X.get_v(view_index, sample_indices)[:, attribute_indices]) - for - clf, attribute_indices in - zip(self.view_estimators_generator[view_index].estimators_, - self.view_estimators_generator[ - view_index].attribute_indices)]) - else: - probas = np.asarray( - [clf.predict_proba(X.get_v(view_index, sample_indices)) for clf - in - self.view_estimators_generator[view_index].estimators_]) - predicted_labels = np.argmax(probas, axis=2) - predicted_labels[predicted_labels == 0] = -1 - values = np.max(probas, axis=2) - return (predicted_labels * values).T - diff --git a/summit/multiview_platform/multiview_classifiers/mv_cb_boost.py b/summit/multiview_platform/multiview_classifiers/mv_cb_boost.py new file mode 100644 index 0000000000000000000000000000000000000000..a4062ba0e13b4111fe9396c065caca7a7a77504f --- /dev/null +++ 
@@ -0,0 +1,35 @@
+from .additions.mv_cb_boost_adapt import MultiviewCBoundBoostingAdapt
+
+classifier_class_name = "MVCBBoost"
+
+class MVCBBoost(MultiviewCBoundBoostingAdapt):
+    def __init__(self, n_estimators=100,
+                 random_state=42,
+                 self_complemented=True,
+                 twice_the_same=False,
+                 random_start=False,
+                 n_stumps=10,
+                 c_bound_sol=True,
+                 base_estimator="Trees",
+                 max_depth=1,
+                 mincq_tracking=False,
+                 weight_add=3,
+                 weight_strategy="c_bound_based_dec",
+                 weight_update="multiplicative",
+                 full_combination=False,
+                 min_cq_pred=False,
+                 min_cq_mu=10e-3,
+                 sig_mult=15,
+                 sig_offset=5,
+                 use_previous_voters=False, **kwargs):
+        MultiviewCBoundBoostingAdapt.__init__(self, n_estimators=n_estimators, random_state=random_state,
+                                              self_complemented=self_complemented, twice_the_same=twice_the_same,
+                                              random_start=random_start, n_stumps=n_stumps, c_bound_sol=c_bound_sol,
+                                              max_depth=max_depth,
+                                              base_estimator=base_estimator, mincq_tracking=mincq_tracking,
+                                              weight_add=weight_add, weight_strategy=weight_strategy,
+                                              weight_update=weight_update, use_previous_voters=use_previous_voters,
+                                              full_combination=full_combination,
+                                              min_cq_pred=min_cq_pred, min_cq_mu=min_cq_mu,
+                                              sig_mult=sig_mult, sig_offset=sig_offset, **kwargs)
+        # self.param_names += ["weight_update", "weight_strategy"]
+        # self.distribs += [["multiplicative", "additive", "replacement"], ["c_bound_based_broken", "c_bound_based", "c_bound_based_dec", "sigmoid"]]
\ No newline at end of file
diff --git a/summit/multiview_platform/multiview_classifiers/mv_cb_boost_base.py b/summit/multiview_platform/multiview_classifiers/mv_cb_boost_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..db1cc7bc7d844c99ef04c0adf786d78be6675a1b
--- /dev/null
+++ b/summit/multiview_platform/multiview_classifiers/mv_cb_boost_base.py
@@ -0,0 +1,17 @@
+from .additions.mv_cb_boost_adapt import MultiviewCBoundBoostingAdapt
+
+classifier_class_name = "MVCBBoostBase"
+
+class MVCBBoostBase(MultiviewCBoundBoostingAdapt):
+    def __init__(self, n_max_iterations=100, random_state=None,
+                 self_complemented=True, twice_the_same=False,
+                 random_start=False, n_stumps=1, c_bound_sol=True,
+                 estimators_generator="Stumps", mincq_tracking=False,
+                 weight_add=3, weight_strategy="c_bound_based",
+                 weight_update="multiplicative", **kwargs):
+        MultiviewCBoundBoostingAdapt.__init__(self, n_max_iterations=n_max_iterations, random_state=random_state,
+                                              self_complemented=self_complemented, twice_the_same=twice_the_same,
+                                              random_start=random_start, n_stumps=n_stumps, c_bound_sol=c_bound_sol,
+                                              estimators_generator=estimators_generator, mincq_tracking=mincq_tracking,
+                                              weight_add=weight_add, weight_strategy=weight_strategy,
+                                              weight_update=weight_update, **kwargs)
\ No newline at end of file
diff --git a/summit/multiview_platform/multiview_classifiers/mv_cb_boost_broken.py b/summit/multiview_platform/multiview_classifiers/mv_cb_boost_broken.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c1a240769cbcc7ff75eb0552e91876622eb10e7
--- /dev/null
+++ b/summit/multiview_platform/multiview_classifiers/mv_cb_boost_broken.py
@@ -0,0 +1,17 @@
+from .additions.mv_cb_boost_adapt import MultiviewCBoundBoostingAdapt
+
+classifier_class_name = "MVCBBoostBroken"
+
+class MVCBBoostBroken(MultiviewCBoundBoostingAdapt):
+    def __init__(self, n_max_iterations=10, random_state=None,
+                 self_complemented=True, twice_the_same=False,
+                 random_start=False, n_stumps=1, c_bound_sol=True,
estimators_generator="Stumps", mincq_tracking=False, + weight_add=3, weight_strategy="c_bound_based_broken", + weight_update="multiplicative", **kwargs): + MultiviewCBoundBoostingAdapt.__init__(self, n_max_iterations=n_max_iterations, random_state=random_state, + self_complemented=self_complemented, twice_the_same=twice_the_same, + random_start=random_start, n_stumps=n_stumps, c_bound_sol=c_bound_sol, + estimators_generator=estimators_generator, mincq_tracking=mincq_tracking, + weight_add=weight_add, weight_strategy=weight_strategy, + weight_update=weight_update, **kwargs) \ No newline at end of file diff --git a/summit/multiview_platform/multiview_classifiers/mv_cb_boost_full.py b/summit/multiview_platform/multiview_classifiers/mv_cb_boost_full.py new file mode 100644 index 0000000000000000000000000000000000000000..be82fb29bc522b69ecba993d2dfd6986a4e6f259 --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/mv_cb_boost_full.py @@ -0,0 +1,17 @@ +from .additions.mv_cb_boost_adapt import MultiviewCBoundBoostingAdapt + +classifier_class_name = "MVCBBoostFull" + +class MVCBBoostFull(MultiviewCBoundBoostingAdapt): + def __init__(self, n_max_iterations=10, random_state=None, + self_complemented=True, twice_the_same=False, + random_start=False, n_stumps=1, c_bound_sol=True, + estimators_generator="Stumps", mincq_tracking=False, + weight_add=3, weight_strategy="c_bound_based_dec", + weight_update="multiplicative", full_combination=True, **kwargs): + MultiviewCBoundBoostingAdapt.__init__(self, n_max_iterations=n_max_iterations, random_state=random_state, + self_complemented=self_complemented, twice_the_same=twice_the_same, + random_start=random_start, n_stumps=n_stumps, c_bound_sol=c_bound_sol, + estimators_generator=estimators_generator, mincq_tracking=mincq_tracking, + weight_add=weight_add, weight_strategy=weight_strategy, + weight_update=weight_update, full_combination=full_combination, **kwargs) diff --git a/summit/multiview_platform/multiview_classifiers/mv_cb_boost_mincq.py b/summit/multiview_platform/multiview_classifiers/mv_cb_boost_mincq.py new file mode 100644 index 0000000000000000000000000000000000000000..7c55d6f4216048e026bfac28ea461e6226dfac5a --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/mv_cb_boost_mincq.py @@ -0,0 +1,17 @@ +from .additions.mv_cb_boost_adapt import MultiviewCBoundBoostingAdapt + +classifier_class_name = "MVCBBoostMinCQ" + +class MVCBBoostMinCQ(MultiviewCBoundBoostingAdapt): + def __init__(self, n_max_iterations=10, random_state=None, + self_complemented=True, twice_the_same=False, + random_start=False, n_stumps=1, c_bound_sol=True, + estimators_generator="Stumps", mincq_tracking=False, + weight_add=3, weight_strategy="c_bound_based_dec", + weight_update="multiplicative", full_combination=False, min_cq_pred=True, **kwargs): + MultiviewCBoundBoostingAdapt.__init__(self, n_max_iterations=n_max_iterations, random_state=random_state, + self_complemented=self_complemented, twice_the_same=twice_the_same, + random_start=random_start, n_stumps=n_stumps, c_bound_sol=c_bound_sol, + estimators_generator=estimators_generator, mincq_tracking=mincq_tracking, + weight_add=weight_add, weight_strategy=weight_strategy, + weight_update=weight_update, full_combination=full_combination, min_cq_pred=min_cq_pred, **kwargs) diff --git a/summit/multiview_platform/multiview_classifiers/mv_cb_boost_sig.py b/summit/multiview_platform/multiview_classifiers/mv_cb_boost_sig.py new file mode 100644 index 
0000000000000000000000000000000000000000..bc2ded45e4bee0db0c6673243427edbb70ed810f --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/mv_cb_boost_sig.py @@ -0,0 +1,19 @@ +from .additions.mv_cb_boost_adapt import MultiviewCBoundBoostingAdapt + +classifier_class_name = "MVCBBoostSig" + +class MVCBBoostSig(MultiviewCBoundBoostingAdapt): + def __init__(self, n_max_iterations=100, random_state=None, + self_complemented=True, twice_the_same=False, + random_start=False, n_stumps=1, c_bound_sol=True, + estimators_generator="Stumps", mincq_tracking=False, + weight_add=3, weight_strategy="c_bound_based_broken", + weight_update="multiplicative", use_previous_voters=True, + **kwargs): + MultiviewCBoundBoostingAdapt.__init__(self, n_max_iterations=n_max_iterations, random_state=random_state, + self_complemented=self_complemented, twice_the_same=twice_the_same, + random_start=random_start, n_stumps=n_stumps, c_bound_sol=c_bound_sol, + estimators_generator=estimators_generator, mincq_tracking=mincq_tracking, + weight_add=weight_add, weight_strategy=weight_strategy, + weight_update=weight_update, use_previous_voters=use_previous_voters, + **kwargs) \ No newline at end of file diff --git a/summit/multiview_platform/multiview_classifiers/pb_mv_boost.py b/summit/multiview_platform/multiview_classifiers/pb_mv_boost.py new file mode 100644 index 0000000000000000000000000000000000000000..40335ea8e119067690fe796af644b78c3ddc7918 --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/pb_mv_boost.py @@ -0,0 +1,418 @@ +import numpy as np +from sklearn.tree import DecisionTreeClassifier +from scipy.optimize import minimize +from sklearn.metrics import f1_score + +from ..multiview.multiview_utils import get_available_monoview_classifiers, \ + BaseMultiviewClassifier, ConfigGenerator +from ..utils.dataset import get_samples_views_indices +from ..utils.multiclass import get_mc_estim, MultiClassWrapper +from ..utils.hyper_parameter_search import CustomRandint +from ..utils.transformations import sign_labels, unsign_labels + +""" +This code is implements the C-bound optimization problem for the Multiview learning algorithm PB-MVBoost. + +Related Paper: +Multiview Boosting by Controlling the Diversity and the Accuracy of View-specific Voters +by Anil Goyal, Emilie Morvant, Pascal Germain and Massih-Reza Amini + +Based on the code of Anil Goyal, +Link to the paper: +https://arxiv.org/abs/1808.05784 +""" + +classifier_class_name = "PBMVBoost" + + +class MV_Cbount_opt(object): + """ + This class solves the C-bound optimization problem for the Multiview learning algorithm PB-MVBoost. + It learns the weights for over the views for our algorithm. 
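+
+    Illustrative usage (a minimal sketch; the risk and disagreement values
+    below are made up for the example):
+
+    >>> import numpy as np
+    >>> opt = MV_Cbount_opt(initial_guess=np.ones(3) / 3,
+    ...                     risk_vector=np.array([0.2, 0.3, 0.4]),
+    ...                     disagreement_vector=np.array([0.1, 0.2, 0.3]))
+    >>> rho = opt.learn_weights()  # non-negative view weights summing to 1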
+ """ + def __init__(self,initial_guess,risk_vector,disagreement_vector): + """ + + :param initial_guess: vector for the initial guess of weights + :param risk_vector: Risk vector + :param disagreement_vector: Vector for disagreement values + """ + + self.initial_guess=initial_guess + self.risk_vector=risk_vector + self.disagreement_vector=disagreement_vector + + + + def func(self, x, r,d,sign=1): + """ Objective function """ + num=1-2 * (sum(x*r)) + den=1-2 * (sum(x*d)) + + return sign * ((num)**2 / den) + + def func_deriv(self, x, r,d, sign=1): + """ Derivative of objective function """ + num = 1 - 2 * (sum(x*r)) + den = 1 - 2 * (sum(x*d)) + + dfdx= sign * ((-1 * 4 * r * num * den + 2 * d * (num)**2) / (den ** 2)) + + return np.array(dfdx) + + + def learn_weights(self): + """ + Learns weights + :param self: + :return: + """ + x = self.initial_guess + r = self.risk_vector + d = self.disagreement_vector + arguments = (r, d, -1) + + + + + cons = ({'type': 'eq', + 'fun': lambda x: np.array([sum(x) - 1]), + 'jac': lambda x: np.array(x)} + ) + + res = minimize(self.func, x, args=arguments, bounds=tuple((0,None) for i in range(len(x))), jac=self.func_deriv, + constraints=cons, method='SLSQP', options={'disp': False}) + + + if np.isnan(res.x[0]): + return self.initial_guess + else: + return res.x + +class PBMVBoost(BaseMultiviewClassifier): + + def __init__(self, random_state=None, num_iterations=100, + decision_tree_depth=1): + BaseMultiviewClassifier.__init__(self, random_state=random_state) + self.num_iterations = num_iterations + self.decision_tree_depth = decision_tree_depth + self.param_names = ["num_iterations", "decision_tree_depth"] + self.distribs = [CustomRandint(1,100), CustomRandint(1,3)] + + # Variables to store the train and test predictions and weights after each iteration + + + def fit(self, X, y, train_indices=None, view_indices=None): + train_indices, views_indices = get_samples_views_indices(X, + train_indices, + view_indices) + view_names = [X.get_view_name(view_ind) + for view_ind in views_indices] + X_train = dict((X.get_view_name(view_ind), X.get_v(view_ind, + train_indices)) + for view_ind in views_indices) + y_train = dict((X.get_view_name(view_ind),sign_labels(y[train_indices])) + for view_ind in views_indices) + self.train_predictions_classifiers = dict((name, []) for name in view_names) + self.weak_classifiers = dict((name, []) for name in view_names) + self.weights_classfiers = dict((name, []) for name in view_names) + return self.learn(X_train, y_train, all_views=view_names) + + def predict(self, X, sample_indices=None, + view_indices=None): + sample_indices, views_indices = get_samples_views_indices(X, + sample_indices, + view_indices) + X = dict((X.get_view_name(view_ind), X.get_v(view_ind, + sample_indices)) + for view_ind in views_indices) + pred = self._compute_majority_vote(X) + pred[pred==-1]=0 + return pred + + + def _compute_weight(self,error,view_index): + """ + This function is helping function to compute weight of hypothesis based + on error pased to it. + It Computes 0.5 * ln (1-error/ error) + :param error: Error value + :return: Weight value + """ + view_weight = self.rho[view_index] + if view_weight == 0: + return 0 + else: + + return 0.5 * np.log((1 - error) / (float(error))) + #return 0.5 * (1 / float(view_weight)) * (np.log((1 - error) / (float(error))) + 2) + + def _learn_classifier(self,X_train, y_train, name_of_view, view_index, + example_weights): + """ + This function learns the weak classifier and returns weight for this learned classifier. 
Fitting is + done on weighted samples which is passed as an input parameter. + + Input + ====== + :param name_of_view : View name for which we need to learn classfier + :param example_weights : Weight of input training examples + + :return: Weight of Classifier, training data labels, test data labels. + + """ + + #learning classifier + clf = DecisionTreeClassifier(max_depth=self.decision_tree_depth, random_state=1, splitter='random') + clf.fit(X_train[name_of_view], y_train[name_of_view], sample_weight=example_weights) # fitting model according to weighted samples. + self.weak_classifiers[name_of_view].append(clf) + #predicting lables for training and test data + predicted_labels_train = clf.predict(X_train[name_of_view]) + # predicted_labels_test = clf.predict(X_test) + #computing error + error_t = [int(x) for x in (predicted_labels_train != y_train[name_of_view])] + error_t_weighted = np.dot(example_weights, error_t) / sum(example_weights) + + # Reweighing the Weights of hypothesis if weighted error is zero to avoid warning at step 7 of algorithm. + if error_t_weighted == 0: + error_t_weighted = 0.1 * min(example_weights) + + # Compute hypothesis weight (Line 7 of Algorithm) + + Q_t = self._compute_weight(error_t_weighted,view_index) + + + return Q_t, predicted_labels_train + + def _learn_view_weights(self, y_train, initial_guess, example_weights): + """ + This function learns the weights over views. + + :param initial_guess: initial weights over views. + + :return: rho : weight over views. + """ + errors_t = [] + disaggrement_t = [] + + # Computing View-Specific Error and disagreement on weighted training data. + for name_of_view in self.all_views: + + classifier_errors = [] + paired_disagreements = [] + + # compute view-specific error + for classifier_output in self.train_predictions_classifiers[name_of_view]: + error = [int(x) for x in (classifier_output != y_train[name_of_view])] + weighted_error = np.dot(example_weights, error) / sum(example_weights) + classifier_errors.append(weighted_error) + + classifier_errors = np.array(classifier_errors) + classifier_weights = np.array(self.weights_classfiers[name_of_view]) + errors_t.append(sum(classifier_errors * classifier_weights)) + + # compute view-specific disagreement + for index_1, classifier_output_1 in enumerate(self.train_predictions_classifiers[name_of_view]): + for index_2, classifier_output_2 in enumerate(self.train_predictions_classifiers[name_of_view]): + disagreement = [int(x) for x in (classifier_output_1 != classifier_output_2)] + weighted_disagreement = np.dot(example_weights, disagreement) / sum(example_weights) + + classifier_weights = np.array(self.weights_classfiers[name_of_view]) + + weight_1 = classifier_weights[index_1] + weight_2 = classifier_weights[index_2] + + paired_disagreements.append(weighted_disagreement * weight_1 * weight_2) + + disaggrement_t.append(sum(paired_disagreements)) + + optimize = MV_Cbount_opt(initial_guess, np.array(errors_t), np.array(disaggrement_t)) + rho = optimize.learn_weights() + + return rho + + def _compute_Cbound(self,risk, disagreement): + """ + This function computes the C-Bound on the value of gibbs risk and gibbs disagreement. 
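+        Computes C_bound = 1 - (1 - 2 * risk)**2 / (1 - 2 * disagreement).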
+        :return: C-bound value
+        """
+        C_bound = 1 - ((1 - 2 * risk) ** 2 / (1 - 2 * disagreement))
+        return C_bound
+
+    def _compute_majority_vote(self, X):
+        """More sklearn-ish version of _calculate_majority_vote."""
+        predictions = np.zeros(X[list(X.keys())[0]].shape[0])
+        weak_outputs = dict((name,
+                             np.array([clf.predict(X[name])
+                                       for clf in self.weak_classifiers[name]]))
+                            for name in self.all_views)
+        for view_index, name_of_view in enumerate(self.all_views):
+            for t, output in enumerate(weak_outputs[name_of_view]):
+                classifier_weights = np.array(self.weights_classfiers[name_of_view])
+                predictions = predictions + self.rho[view_index] * classifier_weights[t] * output
+
+        predictions = np.sign(predictions)
+
+        return predictions
+
+    def _calculate_majority_vote(self, data='train'):
+        """
+        This function calculates the majority vote.
+
+        :param data: tells on which data the majority vote is computed
+
+        :return: predictions of the majority vote
+        """
+        if data == 'train':
+            predictions = np.zeros(self.num_train_examples)
+            classifiers_outputs = self.train_predictions_classifiers
+        elif data == 'test':
+            predictions = np.zeros(self.num_test_examples)
+            classifiers_outputs = self.test_predictions_classfiers
+
+        for view_index, name_of_view in enumerate(self.all_views):
+            for t, output in enumerate(classifiers_outputs[name_of_view]):
+                classifier_weights = np.array(self.weights_classfiers[name_of_view])
+                predictions = predictions + self.rho[view_index] * classifier_weights[t] * output
+
+        predictions = np.sign(predictions)
+
+        return predictions
+
+    def _mv_cbound(self, data='train', y_train=None):
+        """
+        This function computes the second form of the multiview C-bound for PB-MVBoost.
+
+        :param data: tells on which data the C-bound has to be computed.
+
+        :return: the value of the C-bound on the input data.
+        """
+        if data == 'train':
+            predictions = self.train_predictions_classifiers
+            labels = y_train
+
+        errors_t = []
+        disaggrement_t = []
+        # example_weights = np.ones(self.num_train_examples) / self.num_train_examples  # uniform, to ignore example weights
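+        # The quantities below feed _compute_Cbound: errors_t and disaggrement_t
+        # hold, per view, the weight-averaged Gibbs risk and pairwise
+        # disagreement of that view's weak classifiers; they are then combined
+        # with the view weights rho.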
+        # Computing the view-specific error and disagreement on the training data (Lines 11-12)
+        for name_of_view in self.all_views:
+
+            classifier_errors = []
+            paired_disagreements = []
+
+            # compute the view-specific error (Line 11)
+            for classifier_output in predictions[name_of_view]:
+                error = [int(x) for x in (classifier_output != labels[name_of_view])]
+                weighted_error = np.mean(error)
+                classifier_errors.append(weighted_error)
+
+            classifier_errors = np.array(classifier_errors)
+            classifier_weights = np.array(self.weights_classfiers[name_of_view]) / sum(np.array(self.weights_classfiers[name_of_view]))
+            errors_t.append(sum(classifier_errors * classifier_weights))
+
+            # compute the view-specific disagreement (Line 12)
+            for index_1, classifier_output_1 in enumerate(predictions[name_of_view]):
+                for index_2, classifier_output_2 in enumerate(predictions[name_of_view]):
+                    disagreement = [int(x) for x in (classifier_output_1 != classifier_output_2)]
+                    weighted_disagreement = np.mean(disagreement)
+                    classifier_weights = np.array(self.weights_classfiers[name_of_view]) / sum(np.array(self.weights_classfiers[name_of_view]))
+
+                    weight_1 = classifier_weights[index_1]
+                    weight_2 = classifier_weights[index_2]
+
+                    paired_disagreements.append(weighted_disagreement * weight_1 * weight_2)
+
+            disaggrement_t.append(sum(paired_disagreements))
+
+        rho = np.array(self.rho)
+        risk_total = sum(np.array(errors_t) * rho)
+        disagreement_total = sum(np.array(disaggrement_t) * rho)
+        c_bound = self._compute_Cbound(risk_total, disagreement_total)
+
+        return c_bound
+
+    def _compute_stats(self, predicted_values, true_values):
+        """
+        This function returns the error rate (the F1 computation is currently
+        commented out).
+        :param predicted_values: predicted labels of any estimator
+        :param true_values: true labels
+        :return: the error rate
+        """
+        # removing the elements with output zero
+        zero_indices = np.where(predicted_values == 0)[0]
+        predicted_values = np.delete(predicted_values, zero_indices)
+        true_values = np.delete(true_values, zero_indices)
+        error = np.mean(predicted_values * true_values <= 0.0)
+        # f1 = f1_score(y_true=true_values, y_pred=predicted_values)
+
+        return error
+
+    def learn(self, X_train, y_train, all_views):
+        """
+        This function learns the PB-MVBoost model on the input multiview
+        training data and returns the fitted estimator; training accuracy and
+        the multiview C-bound are printed at each iteration.
+        """
+        self.num_train_examples = X_train[list(X_train.keys())[0]].shape[0]
+        self.all_views = all_views
+
+        # Initializing the weights of the training data (Lines 1 and 2 of Algorithm)
+        w = np.ones(self.num_train_examples) / self.num_train_examples
+
+        # T iterations (beginning of the loop at line 4 of Algorithm)
+        for t in range(self.num_iterations):
+            if t == 0:
+                self.rho = np.ones(len(self.all_views)) / len(self.all_views)  # Line 3 of Algorithm
+
+            print("Iteration: " + str(t + 1) + "\n")
+
+            # Learning the view-specific classifiers and their weights (Lines 5-7)
+            for view_index, name_of_view in enumerate(self.all_views):
+                Q_t, predicted_labels_train = self._learn_classifier(X_train, y_train, name_of_view, view_index, example_weights=w)
+
+                # Storing the view-specific train outputs along with the hypothesis weights
+                self.train_predictions_classifiers[name_of_view].append(predicted_labels_train)
+                self.weights_classfiers[name_of_view].append(Q_t)
+
+            # Computing the weights over the views (Line 8)
+            if t == 0:
+                self.rho = np.ones(len(self.all_views)) / len(self.all_views)  # Line 9 of Algorithm
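+                # At t == 0 the view weights stay uniform; from t >= 1 they are
+                # re-learned with the C-bound optimizer in _learn_view_weights.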
+                self.rho_vectors = []
+                self.rho_vectors.append(self.rho)
+            else:
+                initial_guess = np.ones(len(self.all_views)) / len(self.all_views)
+                self.rho = self._learn_view_weights(y_train, initial_guess, w)
+                self.rho_vectors.append(self.rho)
+
+            # Update the weights over the training sample (Lines 9-10)
+            train_predictions = np.zeros(self.num_train_examples)
+            for index, name_of_view in enumerate(self.all_views):
+                classifier_weights = np.array(self.weights_classfiers[name_of_view])
+                predictions = self.rho[index] * classifier_weights[-1] * self.train_predictions_classifiers[name_of_view][-1]
+                train_predictions = train_predictions + predictions
+            # All views share the same labels, so the last view's y_train is
+            # equivalent to any other view's here.
+            w = w * np.exp(-train_predictions * y_train[name_of_view])
+            w = w / sum(w)
+
+            # Computing the majority-vote error at each iteration
+            # test_predictions = self._calculate_majority_vote(data='test')
+            train_predictions = self._calculate_majority_vote(data='train')
+
+            # error_test, f1_test = self._compute_stats(predicted_values=test_predictions, true_values=self.y_test[name_of_view])
+            error_train = self._compute_stats(predicted_values=train_predictions, true_values=y_train[name_of_view])
+
+            c_bound_train = self._mv_cbound(data='train', y_train=y_train)
+
+            print("Accuracy on Training Data: " + str(1 - np.array(error_train)) + "\n")
+            # print("F1 Score on Training Data: " + str(np.array(f1_train)) + "\n")
+            print("Multiview C-Bound Training Data: " + str(np.array(c_bound_train)) + "\n")
+            # print("Accuracy on Test Data: " + str(1 - np.array(error_test)) + "\n")
+            # print("F1 Score on Test Data: " + str(np.array(f1_test)) + "\n")
+            print("=========================================== \n")
+
+        return self
\ No newline at end of file
diff --git a/summit/multiview_platform/utils/hyper_parameter_search.py b/summit/multiview_platform/utils/hyper_parameter_search.py
index 0fd65b9309939b23a8530d5ba116fda38ae7a340..d0886fbabda97f200142b0cf9dfcf1c0eca8863e 100644
--- a/summit/multiview_platform/utils/hyper_parameter_search.py
+++ b/summit/multiview_platform/utils/hyper_parameter_search.py
@@ -79,6 +79,7 @@ class HPSearch:
                 if cross_validation_score >= max(results.values()):
                     self.best_params_ = self.candidate_params[
                         candidate_param_idx]
+                    self.best_params_ = current_estimator.get_params()
                     self.best_score_ = cross_validation_score
             except BaseException:
                 if self.track_tracebacks:
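
For reference, a minimal standalone sketch of the closed-form C-bound line search that `get_new_voter` and `choose_new_general_voter` implement in the code above. The function and variable names are illustrative, not part of the platform's API; it assumes +/-1 voters, C2 != 0, and a real root of the quadratic.

import numpy as np

def cbound_line_search(previous_vote, new_voter, y):
    """Weight q minimizing the empirical C-bound of previous_vote + q * new_voter,
    using the A*/B*/C* coefficients from the boosting code above."""
    n = y.shape[0]
    previous_sum = y * previous_vote              # per-sample margins of the current vote
    margin_old = np.sum(previous_sum)
    yh = y * new_voter                            # per-sample margins of the candidate

    # Numerator (squared first moment) and denominator (second moment) coefficients
    A2, A1, A0 = np.sum(yh) ** 2, 2 * margin_old * np.sum(yh), margin_old ** 2
    B2, B1, B0 = n, 2 * np.sum(previous_sum * yh), np.sum(previous_sum ** 2)

    # Setting the derivative of 1 - (A2 q^2 + A1 q + A0) / (B2 q^2 + B1 q + B0)
    # to zero reduces to C2 q^2 + C1 q + C0 = 0:
    C2 = A1 * B2 - A2 * B1
    C1 = 2 * (A0 * B2 - A2 * B0)
    C0 = A0 * B1 - A1 * B0
    q = (-C1 + np.sqrt(C1 ** 2 - 4 * C2 * C0)) / (2 * C2)
    c_bound = 1 - (A2 * q ** 2 + A1 * q + A0) / (B2 * q ** 2 + B1 * q + B0)
    return q, c_bound

# Toy usage with five samples and +/-1 votes:
y = np.array([1., -1., 1., 1., -1.])
F = np.array([1., -1., -1., 1., -1.])   # current weighted vote
h = np.array([1., 1., 1., 1., -1.])     # candidate voter
q, cb = cbound_line_search(F, h, y)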