From 6e4d208ab7862d77ba386516950ec8aa94c7d4aa Mon Sep 17 00:00:00 2001 From: Baptiste Bauvin <baptiste.bauvin@lis-lab.fr> Date: Tue, 3 Sep 2019 16:10:09 -0400 Subject: [PATCH] ECMLJ_expes --- .../MonoMultiViewClassifiers/ExecClassif.py | 17 +- .../Monoview/Additions/BoostUtils.py | 2 +- .../Monoview/Additions/CBBoostUtils.py | 20 +- .../Monoview/Additions/CGDescUtils.py | 40 +- .../Monoview/Additions/_custom_criterion.pyx | 621 +++++++++++++++++- .../Monoview/ExecClassifMonoView.py | 4 +- .../Monoview/ExportResults.py | 12 +- .../Monoview/MonoviewUtils.py | 2 +- .../MonoviewClassifiers/AdaboostPregen.py | 1 + .../Multiview/ExecMultiview.py | 4 +- .../ResultAnalysis.py | 6 +- .../utils/GetMultiviewDb.py | 47 +- .../utils/HyperParameterSearch.py | 2 +- .../utils/execution.py | 2 +- 14 files changed, 711 insertions(+), 69 deletions(-) diff --git a/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py b/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py index e75934e9..bcc03aed 100644 --- a/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py +++ b/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py @@ -9,6 +9,7 @@ import matplotlib import itertools import numpy as np from joblib import Parallel, delayed +from sklearn.tree import DecisionTreeClassifier matplotlib.use( 'Agg') # Anti-Grain Geometry C++ library to make a raster (pixel) image of the figure @@ -183,15 +184,23 @@ def gen_multiple_kwargs_combinations(clKWARGS): keys = clKWARGS.keys() kwargs_combination = [dict((key, value) for key, value in zip(keys, values)) for values in values_cartesian_prod] - return kwargs_combination + + reduce_dict = {DecisionTreeClassifier: "DT", } + reduced_listed_values = [ + [_ if type(_) not in reduce_dict else reduce_dict[type(_)] for _ in + list_] for list_ in listed_values] + reduced_values_cartesian_prod = [_ for _ in itertools.product(*reduced_listed_values)] + reduced_kwargs_combination = [dict((key, value) for key, value in zip(keys, values)) + for values in reduced_values_cartesian_prod] + return kwargs_combination, reduced_kwargs_combination def gen_multiple_args_dictionnaries(nbClass, kwargsInit, classifier, viewName, viewIndex): - multiple_kwargs_list = gen_multiple_kwargs_combinations(kwargsInit[classifier + "KWARGSInit"]) + multiple_kwargs_list, reduced_multiple_kwargs_list = gen_multiple_kwargs_combinations(kwargsInit[classifier + "KWARGSInit"]) multiple_kwargs_dict = dict( - (classifier+"_"+"_".join(map(str,list(dictionary.values()))), dictionary) - for dictionary in multiple_kwargs_list) + (classifier+"_"+"_".join(map(str,list(reduced_dictionary.values()))), dictionary) + for reduced_dictionary, dictionary in zip(reduced_multiple_kwargs_list, multiple_kwargs_list )) args_dictionnaries = [{ "args": {classifier_name + "KWARGS": arguments, "feat": viewName, diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py index 5573f626..10f034b7 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py @@ -851,7 +851,7 @@ def get_accuracy_graph(plotted_data, classifier_name, file_name, # plt.tight_layout() else: ax.legend((scat,), (name,)) - f.savefig(file_name) + f.savefig(file_name, transparent=True) plt.close() diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CBBoostUtils.py 
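The ExecClassif.py hunk above builds every hyper-parameter combination and, in parallel, a "reduced" twin in which estimator objects such as DecisionTreeClassifier are replaced by short tags ("DT"), so that the result-dictionary keys built from the values stay readable. A minimal, self-contained sketch of that idea (the real function derives listed_values from the platform's configuration, which is outside this hunk; the example values and the "AdaboostPregen_" prefix below are only illustrative):

```python
import itertools
from sklearn.tree import DecisionTreeClassifier

REDUCE_DICT = {DecisionTreeClassifier: "DT"}  # long estimator reprs -> short tags

def gen_kwargs_combinations(cl_kwargs):
    # Cartesian product of all candidate values, plus a "reduced" twin where
    # estimator instances are replaced by the short tag used for naming.
    keys = list(cl_kwargs.keys())
    listed_values = [v if isinstance(v, list) else [v] for v in cl_kwargs.values()]
    combinations = [dict(zip(keys, values))
                    for values in itertools.product(*listed_values)]
    reduced_listed = [[REDUCE_DICT.get(type(v), v) for v in values]
                      for values in listed_values]
    reduced = [dict(zip(keys, values))
               for values in itertools.product(*reduced_listed)]
    return combinations, reduced

# The reduced dicts are only used to build keys; the full dicts keep the objects.
combos, reduced = gen_kwargs_combinations(
    {"n_estimators": [50, 100],
     "base_estimator": [DecisionTreeClassifier(max_depth=1)]})
names = ["AdaboostPregen_" + "_".join(map(str, r.values())) for r in reduced]
# -> ['AdaboostPregen_50_DT', 'AdaboostPregen_100_DT']
```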
b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CBBoostUtils.py index 3187fc8f..38b3ab87 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CBBoostUtils.py +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CBBoostUtils.py @@ -17,11 +17,11 @@ from ... import Metrics # Used for CBBoost class CBBoostClassifier(BaseEstimator, ClassifierMixin, BaseBoost): - def __init__(self, n_max_iterations=None, estimators_generator=None, - random_state=42, self_complemented=True, twice_the_same=False, - random_start=True, n_stumps=1, c_bound_sol=True, - plotted_metric=Metrics.zero_one_loss, save_train_data=True, - test_graph=True, mincq_tracking=True): + def __init__(self, n_max_iterations=100, estimators_generator="Stumps", + random_state=42, self_complemented=True, twice_the_same=True, + random_start=False, n_stumps=1, c_bound_sol=True, + plotted_metric=Metrics.zero_one_loss, save_train_data=False, + test_graph=True, mincq_tracking=False): super(CBBoostClassifier, self).__init__() r""" @@ -240,16 +240,6 @@ class CBBoostClassifier(BaseEstimator, ClassifierMixin, BaseBoost): self.new_voter = self.classification_matrix[:, new_voter_index].reshape( (self.n_total_examples, 1)) - # def choose_new_voter(self, y_kernel_matrix, formatted_y): - # """Used to choose the voter according to the specified criterion (margin or C-Bound""" - # if self.c_bound_choice: - # sol, new_voter_index = self._find_new_voter(y_kernel_matrix, - # formatted_y) - # else: - # new_voter_index, sol = self._find_best_weighted_margin( - # y_kernel_matrix) - # return sol, new_voter_index - def init_boosting(self, m, y, y_kernel_matrix): """THis initialization corressponds to the first round of boosting with equal weights for each examples and the voter chosen by it's margin.""" diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CGDescUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CGDescUtils.py index cfc5765a..0fbc8b08 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CGDescUtils.py +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CGDescUtils.py @@ -22,7 +22,7 @@ class ColumnGenerationClassifierQar(BaseEstimator, ClassifierMixin, BaseBoost): c_bound_choice=True, random_start=True, n_stumps=1, use_r=True, c_bound_sol=True, plotted_metric=Metrics.zero_one_loss, save_train_data=True, - test_graph=True, mincq_tracking=True): + test_graph=True, mincq_tracking=False): super(ColumnGenerationClassifierQar, self).__init__() r""" @@ -104,15 +104,15 @@ class ColumnGenerationClassifierQar(BaseEstimator, ClassifierMixin, BaseBoost): # Print dynamically the step and the error of the current classifier self.it = k - print( - "Resp. bound : {}, {}; {}/{}, eps :{}, ".format( - self.respected_bound, - self.bounds[-1] > self.train_metrics[-1], - k + 2, - self.n_max_iterations, - self.voter_perfs[-1], - ), - end="\r") + # print( + # "Resp. 
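The init_boosting context above describes the first boosting round: uniform example weights and a first voter chosen by its margin. A minimal NumPy sketch of that round, assuming a pre-computed (n_examples, n_voters) classification matrix with entries in {-1, +1} (the names here are illustrative, not the class's actual attributes):

```python
import numpy as np

def first_voter_by_margin(classification_matrix, y):
    # With uniform weights, the margin of voter j is sum_i y_i * h_j(x_i);
    # the voter with the largest margin is chosen for the first round.
    margins = (classification_matrix * y.reshape(-1, 1)).sum(axis=0)
    return int(np.argmax(margins))

rng = np.random.RandomState(42)
H = np.sign(rng.randn(100, 10))   # toy voter outputs in {-1, +1}
y = np.sign(rng.randn(100))       # toy labels
first = first_voter_by_margin(H, y)
```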
bound : {}, {}; {}/{}, eps :{}, ".format( + # self.respected_bound, + # self.bounds[-1] > self.train_metrics[-1], + # k + 2, + # self.n_max_iterations, + # self.voter_perfs[-1], + # ), + # end="\r") sol, new_voter_index = self.choose_new_voter(y_kernel_matrix, formatted_y) if type(sol) == str: @@ -132,8 +132,8 @@ class ColumnGenerationClassifierQar(BaseEstimator, ClassifierMixin, BaseBoost): self.raw_weights = self.weights_ self.y_train = formatted_y - print(self.classification_matrix) - print(self.weights_, self.break_cause) + # print(self.classification_matrix) + # print(self.weights_, self.break_cause) self.weights_ = np.array(self.weights_) self.weights_ /= np.sum(self.weights_) @@ -451,11 +451,26 @@ class ColumnGenerationClassifierQar(BaseEstimator, ClassifierMixin, BaseBoost): self.A2s = np.sum(weighted_hypothesis, axis=0) ** 2 self.A1s = np.sum(weighted_hypothesis, axis=0) * margin_old * 2 self.A0 = margin_old ** 2 + import matplotlib.pyplot as plt + # plt.plot(self.A2s * 0.5 * self.B1s / m**3) + # plt.plot(np.array([margin_old/m for _ in range(len(self.A2s))])) + # plt.savefig("try.png") + + # print("C2 < 0 :", np.where(np.array([margin_old/m for _ in range(len(self.A2s))]) < np.sqrt(self.A2s) * 0.5 * self.B1s / m**2)[0]) + # print("C1 < 0 :", np.where(np.array([margin_old ** 2 / m for _ in range( + # len(self.A2s))]) < self.A2s * self.B0 / m ** 2)[0]) + # print("Double root:", np.where((0.5 * self.B1s / m)**2 * m > self.B0)[0]) + C2s = (self.A1s * self.B2 - self.A2s * self.B1s) + # print("Wrong C2 :" , np.where(C2s < 0)[0].shape, bad_margins.shape) C1s = 2 * (self.A0 * self.B2 - self.A2s * self.B0) + # print("Wrong C2 :", np.where(C1s < 0)[0].shape, bad_margins.shape) C0s = self.A0 * self.B1s - self.A1s * self.B0 + # print(np.where(C2s==0)) + # print(self.chosen_columns_) + sols = np.zeros(C0s.shape) - 3 # sols[np.where(C2s == 0)[0]] = C0s[np.where(C2s == 0)[0]] / C1s[np.where(C2s == 0)[0]] sols[np.where(C2s != 0)[0]] = (-C1s[np.where(C2s != 0)[0]] + np.sqrt( @@ -469,7 +484,6 @@ class ColumnGenerationClassifierQar(BaseEstimator, ClassifierMixin, BaseBoost): return "No more pertinent voters", 0 else: best_hyp_index = np.argmin(masked_c_bounds) - self.c_bounds.append(masked_c_bounds[best_hyp_index]) self.margins.append(math.sqrt(self.A2s[best_hyp_index] / m)) self.disagreements.append(0.5 * self.B1s[best_hyp_index] / m) diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/_custom_criterion.pyx b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/_custom_criterion.pyx index 8e50ea22..f6deb43e 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/_custom_criterion.pyx +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/_custom_criterion.pyx @@ -1,3 +1,11 @@ +from sklearn.tree._criterion import ClassificationCriterion + +class Cbound(ClassificationCriterion): + def node_impurity(self): + pass + + + # # cython: cdivision=True # # cython: boundscheck=False # # cython: wraparound=False @@ -14,6 +22,22 @@ # # Jacob Schreiber <jmschreiber91@gmail.com> # # Nelson Liu <nelson@nelsonliu.me> # # +# # License: BSD 3 clause# cython: cdivision=True +# # cython: boundscheck=False +# # cython: wraparound=False +# +# # Authors: Gilles Louppe <g.louppe@gmail.com> +# # Peter Prettenhofer <peter.prettenhofer@gmail.com> +# # Brian Holt <bdholt1@gmail.com> +# # Noel Dawe <noel@dawe.me> +# # Satrajit Gosh <satrajit.ghosh@gmail.com> +# # Lars Buitinck +# # Arnaud Joly <arnaud.v.joly@gmail.com> +# # Joel Nothman 
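The _find_new_voter hunk above chooses each candidate voter's weight as a root of a per-candidate quadratic C2*w**2 + C1*w + C0 = 0 built from the margin statistics (A0, A1s, A2s) and disagreement statistics (B0, B1s, B2), then keeps the candidate minimizing the resulting C-bound. A standalone sketch of the root computation (the guard on the discriminant is an added numerical-safety assumption, not part of the original code):

```python
import numpy as np

def candidate_weights(A0, A1s, A2s, B0, B1s, B2):
    # Quadratic coefficients, one per candidate voter, as in _find_new_voter.
    C2s = A1s * B2 - A2s * B1s
    C1s = 2 * (A0 * B2 - A2s * B0)
    C0s = A0 * B1s - A1s * B0

    sols = np.full(C0s.shape, -3.0)  # sentinel value used by the original code
    nz = np.where(C2s != 0)[0]
    disc = C1s[nz] ** 2 - 4 * C2s[nz] * C0s[nz]
    sols[nz] = (-C1s[nz] + np.sqrt(np.maximum(disc, 0.0))) / (2 * C2s[nz])
    return sols
```

Candidates left at the sentinel value, or whose C-bound is undefined, are masked out before the argmin over masked_c_bounds, as in the code above.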
<joel.nothman@gmail.com> +# # Fares Hedayati <fares.hedayati@gmail.com> +# # Jacob Schreiber <jmschreiber91@gmail.com> +# # Nelson Liu <nelson@nelsonliu.me> +# # # # License: BSD 3 clause # # calloc @@ -76,7 +100,602 @@ # cdef # # -# class CustomCriterion: +# class CustomCriterion(Criterion): +# """Interface for impurity criteria. +# This object stores methods on how to calculate how good a split is using +# different metrics. +# """ +# +# def __dealloc__(self): +# """Destructor.""" +# +# free(self.sum_total) +# free(self.sum_left) +# free(self.sum_right) +# +# def __getstate__(self): +# return {} +# +# def __setstate__(self, d): +# pass +# +# cdef +# int +# init(self, DOUBLE_t * y, SIZE_t +# y_stride, DOUBLE_t * sample_weight, +# double +# weighted_n_samples, SIZE_t * samples, SIZE_t +# start, +# SIZE_t +# end) nogil except -1: +# """Placeholder for a method which will initialize the criterion. +# Returns -1 in case of failure to allocate memory (and raise MemoryError) +# or 0 otherwise. +# Parameters +# ---------- +# y : array-like, dtype=DOUBLE_t +# y is a buffer that can store values for n_outputs target variables +# y_stride : SIZE_t +# y_stride is used to index the kth output value as follows: +# y[i, k] = y[i * y_stride + k] +# sample_weight : array-like, dtype=DOUBLE_t +# The weight of each sample +# weighted_n_samples : DOUBLE_t +# The total weight of the samples being considered +# samples : array-like, dtype=DOUBLE_t +# Indices of the samples in X and y, where samples[start:end] +# correspond to the samples in this node +# start : SIZE_t +# The first sample to be used on this node +# end : SIZE_t +# The last sample used on this node +# """ +# +# pass +# # +# # cdef int reset(self) nogil except -1: +# # """Reset the criterion at pos=start. +# # This method must be implemented by the subclass. +# # """ +# # +# # pass +# # +# # cdef int reverse_reset(self) nogil except -1: +# # """Reset the criterion at pos=end. +# # This method must be implemented by the subclass. +# # """ +# # pass +# # +# # cdef int update(self, SIZE_t new_pos) nogil except -1: +# # """Updated statistics by moving samples[pos:new_pos] to the left child. +# # This updates the collected statistics by moving samples[pos:new_pos] +# # from the right child to the left child. It must be implemented by +# # the subclass. +# # Parameters +# # ---------- +# # new_pos : SIZE_t +# # New starting index position of the samples in the right child +# # """ +# # +# # pass +# # +# # cdef double node_impurity(self) nogil: +# # """Placeholder for calculating the impurity of the node. +# # Placeholder for a method which will evaluate the impurity of +# # the current node, i.e. the impurity of samples[start:end]. This is the +# # primary function of the criterion class. +# # """ +# # +# # pass +# # +# # cdef void children_impurity(self, double* impurity_left, +# # double* impurity_right) nogil: +# # """Placeholder for calculating the impurity of children. +# # Placeholder for a method which evaluates the impurity in +# # children nodes, i.e. the impurity of samples[start:pos] + the impurity +# # of samples[pos:end]. +# # Parameters +# # ---------- +# # impurity_left : double pointer +# # The memory address where the impurity of the left child should be +# # stored. +# # impurity_right : double pointer +# # The memory address where the impurity of the right child should be +# # stored +# # """ +# # +# # pass +# # +# # cdef void node_value(self, double* dest) nogil: +# # """Placeholder for storing the node value. 
+# # Placeholder for a method which will compute the node value +# # of samples[start:end] and save the value into dest. +# # Parameters +# # ---------- +# # dest : double pointer +# # The memory address where the node value should be stored. +# # """ +# # +# # pass +# # +# # cdef double proxy_impurity_improvement(self) nogil: +# # """Compute a proxy of the impurity reduction +# # This method is used to speed up the search for the best split. +# # It is a proxy quantity such that the split that maximizes this value +# # also maximizes the impurity improvement. It neglects all constant terms +# # of the impurity decrease for a given split. +# # The absolute impurity improvement is only computed by the +# # impurity_improvement method once the best split has been found. +# # """ +# # cdef double impurity_left +# # cdef double impurity_right +# # self.children_impurity(&impurity_left, &impurity_right) +# # +# # return (- self.weighted_n_right * impurity_right +# # - self.weighted_n_left * impurity_left) +# # +# # cdef double impurity_improvement(self, double impurity) nogil: +# # """Compute the improvement in impurity +# # This method computes the improvement in impurity when a split occurs. +# # The weighted impurity improvement equation is the following: +# # N_t / N * (impurity - N_t_R / N_t * right_impurity +# # - N_t_L / N_t * left_impurity) +# # where N is the total number of samples, N_t is the number of samples +# # at the current node, N_t_L is the number of samples in the left child, +# # and N_t_R is the number of samples in the right child, +# # Parameters +# # ---------- +# # impurity : double +# # The initial impurity of the node before the split +# # Return +# # ------ +# # double : improvement in impurity after the split occurs +# # """ +# # +# # cdef double impurity_left +# # cdef double impurity_right +# # +# # self.children_impurity(&impurity_left, &impurity_right) +# # +# # return ((self.weighted_n_node_samples / self.weighted_n_samples) * +# # (impurity - (self.weighted_n_right / +# # self.weighted_n_node_samples * impurity_right) +# # - (self.weighted_n_left / +# # self.weighted_n_node_samples * impurity_left))) +# # +# # +# # cdef class CustomClassificationCriterion(Criterion): +# # """Abstract criterion for classification.""" +# # +# # def __cinit__(self, SIZE_t n_outputs, +# # np.ndarray[SIZE_t, ndim=1] n_classes): +# # """Initialize attributes for this criterion. 
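The commented-out impurity_improvement above documents the standard weighted improvement formula. Since the Cython implementation is disabled by this patch, here is a plain-Python rendering of that formula for reference (descriptive names, not the Cython attribute names):

```python
def impurity_improvement(impurity, weighted_n_node, weighted_n_left,
                         weighted_n_right, weighted_n_total,
                         impurity_left, impurity_right):
    # N_t / N * (impurity - N_t_R / N_t * right_impurity
    #                     - N_t_L / N_t * left_impurity)
    return (weighted_n_node / weighted_n_total) * (
        impurity
        - (weighted_n_right / weighted_n_node) * impurity_right
        - (weighted_n_left / weighted_n_node) * impurity_left)
```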
+# # Parameters +# # ---------- +# # n_outputs : SIZE_t +# # The number of targets, the dimensionality of the prediction +# # n_classes : numpy.ndarray, dtype=SIZE_t +# # The number of unique classes in each target +# # """ +# # +# # self.y = NULL +# # self.y_stride = 0 +# # self.sample_weight = NULL +# # +# # self.samples = NULL +# # self.start = 0 +# # self.pos = 0 +# # self.end = 0 +# # +# # self.n_outputs = n_outputs +# # self.n_samples = 0 +# # self.n_node_samples = 0 +# # self.weighted_n_node_samples = 0.0 +# # self.weighted_n_left = 0.0 +# # self.weighted_n_right = 0.0 +# # +# # # Count labels for each output +# # self.sum_total = NULL +# # self.sum_left = NULL +# # self.sum_right = NULL +# # self.n_classes = NULL +# # +# # safe_realloc(&self.n_classes, n_outputs) +# # +# # cdef SIZE_t k = 0 +# # cdef SIZE_t sum_stride = 0 +# # +# # # For each target, set the number of unique classes in that target, +# # # and also compute the maximal stride of all targets +# # for k in range(n_outputs): +# # self.n_classes[k] = n_classes[k] +# # +# # if n_classes[k] > sum_stride: +# # sum_stride = n_classes[k] +# # +# # self.sum_stride = sum_stride +# # +# # cdef SIZE_t n_elements = n_outputs * sum_stride +# # self.sum_total = <double*> calloc(n_elements, sizeof(double)) +# # self.sum_left = <double*> calloc(n_elements, sizeof(double)) +# # self.sum_right = <double*> calloc(n_elements, sizeof(double)) +# # +# # if (self.sum_total == NULL or +# # self.sum_left == NULL or +# # self.sum_right == NULL): +# # raise MemoryError() +# # +# # def __dealloc__(self): +# # """Destructor.""" +# # free(self.n_classes) +# # +# # def __reduce__(self): +# # return (type(self), +# # (self.n_outputs, +# # sizet_ptr_to_ndarray(self.n_classes, self.n_outputs)), +# # self.__getstate__()) +# # +# # cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, +# # DOUBLE_t* sample_weight, double weighted_n_samples, +# # SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1: +# # """Initialize the criterion at node samples[start:end] and +# # children samples[start:start] and samples[start:end]. +# # Returns -1 in case of failure to allocate memory (and raise MemoryError) +# # or 0 otherwise. 
+# # Parameters +# # ---------- +# # y : array-like, dtype=DOUBLE_t +# # The target stored as a buffer for memory efficiency +# # y_stride : SIZE_t +# # The stride between elements in the buffer, important if there +# # are multiple targets (multi-output) +# # sample_weight : array-like, dtype=DTYPE_t +# # The weight of each sample +# # weighted_n_samples : SIZE_t +# # The total weight of all samples +# # samples : array-like, dtype=SIZE_t +# # A mask on the samples, showing which ones we want to use +# # start : SIZE_t +# # The first sample to use in the mask +# # end : SIZE_t +# # The last sample to use in the mask +# # """ +# # +# # self.y = y +# # self.y_stride = y_stride +# # self.sample_weight = sample_weight +# # self.samples = samples +# # self.start = start +# # self.end = end +# # self.n_node_samples = end - start +# # self.weighted_n_samples = weighted_n_samples +# # self.weighted_n_node_samples = 0.0 +# # +# # cdef SIZE_t* n_classes = self.n_classes +# # cdef double* sum_total = self.sum_total +# # +# # cdef SIZE_t i +# # cdef SIZE_t p +# # cdef SIZE_t k +# # cdef SIZE_t c +# # cdef DOUBLE_t w = 1.0 +# # cdef SIZE_t offset = 0 +# # +# # for k in range(self.n_outputs): +# # memset(sum_total + offset, 0, n_classes[k] * sizeof(double)) +# # offset += self.sum_stride +# # +# # for p in range(start, end): +# # i = samples[p] +# # +# # # w is originally set to be 1.0, meaning that if no sample weights +# # # are given, the default weight of each sample is 1.0 +# # if sample_weight != NULL: +# # w = sample_weight[i] +# # +# # # Count weighted class frequency for each target +# # for k in range(self.n_outputs): +# # c = <SIZE_t> y[i * y_stride + k] +# # sum_total[k * self.sum_stride + c] += w +# # +# # self.weighted_n_node_samples += w +# # +# # # Reset to pos=start +# # self.reset() +# # return 0 +# # +# # cdef int reset(self) nogil except -1: +# # """Reset the criterion at pos=start +# # Returns -1 in case of failure to allocate memory (and raise MemoryError) +# # or 0 otherwise. +# # """ +# # self.pos = self.start +# # +# # self.weighted_n_left = 0.0 +# # self.weighted_n_right = self.weighted_n_node_samples +# # +# # cdef double* sum_total = self.sum_total +# # cdef double* sum_left = self.sum_left +# # cdef double* sum_right = self.sum_right +# # +# # cdef SIZE_t* n_classes = self.n_classes +# # cdef SIZE_t k +# # +# # for k in range(self.n_outputs): +# # memset(sum_left, 0, n_classes[k] * sizeof(double)) +# # memcpy(sum_right, sum_total, n_classes[k] * sizeof(double)) +# # +# # sum_total += self.sum_stride +# # sum_left += self.sum_stride +# # sum_right += self.sum_stride +# # return 0 +# # +# # cdef int reverse_reset(self) nogil except -1: +# # """Reset the criterion at pos=end +# # Returns -1 in case of failure to allocate memory (and raise MemoryError) +# # or 0 otherwise. 
+# # """ +# # self.pos = self.end +# # +# # self.weighted_n_left = self.weighted_n_node_samples +# # self.weighted_n_right = 0.0 +# # +# # cdef double* sum_total = self.sum_total +# # cdef double* sum_left = self.sum_left +# # cdef double* sum_right = self.sum_right +# # +# # cdef SIZE_t* n_classes = self.n_classes +# # cdef SIZE_t k +# # +# # for k in range(self.n_outputs): +# # memset(sum_right, 0, n_classes[k] * sizeof(double)) +# # memcpy(sum_left, sum_total, n_classes[k] * sizeof(double)) +# # +# # sum_total += self.sum_stride +# # sum_left += self.sum_stride +# # sum_right += self.sum_stride +# # return 0 +# # +# # cdef int update(self, SIZE_t new_pos) nogil except -1: +# # """Updated statistics by moving samples[pos:new_pos] to the left child. +# # Returns -1 in case of failure to allocate memory (and raise MemoryError) +# # or 0 otherwise. +# # Parameters +# # ---------- +# # new_pos : SIZE_t +# # The new ending position for which to move samples from the right +# # child to the left child. +# # """ +# # cdef DOUBLE_t* y = self.y +# # cdef SIZE_t pos = self.pos +# # cdef SIZE_t end = self.end +# # +# # cdef double* sum_left = self.sum_left +# # cdef double* sum_right = self.sum_right +# # cdef double* sum_total = self.sum_total +# # +# # cdef SIZE_t* n_classes = self.n_classes +# # cdef SIZE_t* samples = self.samples +# # cdef DOUBLE_t* sample_weight = self.sample_weight +# # +# # cdef SIZE_t i +# # cdef SIZE_t p +# # cdef SIZE_t k +# # cdef SIZE_t c +# # cdef SIZE_t label_index +# # cdef DOUBLE_t w = 1.0 +# # +# # # Update statistics up to new_pos +# # # +# # # Given that +# # # sum_left[x] + sum_right[x] = sum_total[x] +# # # and that sum_total is known, we are going to update +# # # sum_left from the direction that require the least amount +# # # of computations, i.e. from pos to new_pos or from end to new_po. +# # +# # if (new_pos - pos) <= (end - new_pos): +# # for p in range(pos, new_pos): +# # i = samples[p] +# # +# # if sample_weight != NULL: +# # w = sample_weight[i] +# # +# # for k in range(self.n_outputs): +# # label_index = (k * self.sum_stride + +# # <SIZE_t> y[i * self.y_stride + k]) +# # sum_left[label_index] += w +# # +# # self.weighted_n_left += w +# # +# # else: +# # self.reverse_reset() +# # +# # for p in range(end - 1, new_pos - 1, -1): +# # i = samples[p] +# # +# # if sample_weight != NULL: +# # w = sample_weight[i] +# # +# # for k in range(self.n_outputs): +# # label_index = (k * self.sum_stride + +# # <SIZE_t> y[i * self.y_stride + k]) +# # sum_left[label_index] -= w +# # +# # self.weighted_n_left -= w +# # +# # # Update right part statistics +# # self.weighted_n_right = self.weighted_n_node_samples - self.weighted_n_left +# # for k in range(self.n_outputs): +# # for c in range(n_classes[k]): +# # sum_right[c] = sum_total[c] - sum_left[c] +# # +# # sum_right += self.sum_stride +# # sum_left += self.sum_stride +# # sum_total += self.sum_stride +# # +# # self.pos = new_pos +# # return 0 +# # +# # cdef double node_impurity(self) nogil: +# # pass +# # +# # cdef void children_impurity(self, double* impurity_left, +# # double* impurity_right) nogil: +# # pass +# # +# # cdef void node_value(self, double* dest) nogil: +# # """Compute the node value of samples[start:end] and save it into dest. +# # Parameters +# # ---------- +# # dest : double pointer +# # The memory address which we will save the node value into. 
+# # """ +# # +# # cdef double* sum_total = self.sum_total +# # cdef SIZE_t* n_classes = self.n_classes +# # cdef SIZE_t k +# # +# # for k in range(self.n_outputs): +# # memcpy(dest, sum_total, n_classes[k] * sizeof(double)) +# # dest += self.sum_stride +# # sum_total += self.sum_stride +# # +# # cdef class CCriterion(CustomClassificationCriterion): +# # r"""Cross Entropy impurity criterion. +# # This handles cases where the target is a classification taking values +# # 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, +# # then let +# # count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k) +# # be the proportion of class k observations in node m. +# # The cross-entropy is then defined as +# # cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k) +# # """ +# # +# # cdef double node_impurity(self) nogil: +# # """Evaluate the impurity of the current node, i.e. the impurity of +# # samples[start:end], using the cross-entropy criterion.""" +# # +# # # cdef SIZE_t* n_classes = self.n_classes +# # # cdef double* sum_total = self.sum_total +# # # cdef double entropy = 0.0 +# # # cdef double count_k +# # # cdef SIZE_t k +# # # cdef SIZE_t c +# # # +# # # for k in range(self.n_outputs): +# # # for c in range(n_classes[k]): +# # # count_k = sum_total[c] +# # # if count_k > 0.0: +# # # count_k /= self.weighted_n_node_samples +# # # entropy -= count_k * log(count_k) +# # # +# # # sum_total += self.sum_stride +# # +# # return 1.0 +# # +# # cdef void children_impurity(self, double* impurity_left, +# # double* impurity_right) nogil: +# # """Evaluate the impurity in children nodes +# # i.e. the impurity of the left child (samples[start:pos]) and the +# # impurity the right child (samples[pos:end]). +# # Parameters +# # ---------- +# # impurity_left : double pointer +# # The memory address to save the impurity of the left node +# # impurity_right : double pointer +# # The memory address to save the impurity of the right node +# # """ +# # +# # # cdef SIZE_t* n_classes = self.n_classes +# # # cdef double* sum_left = self.sum_left +# # # cdef double* sum_right = self.sum_right +# # # cdef double entropy_left = 0.0 +# # # cdef double entropy_right = 0.0 +# # # cdef double count_k +# # # cdef SIZE_t k +# # # cdef SIZE_t c +# # # +# # # for k in range(self.n_outputs): +# # # for c in range(n_classes[k]): +# # # count_k = sum_left[c] +# # # if count_k > 0.0: +# # # count_k /= self.weighted_n_left +# # # entropy_left -= count_k * log(count_k) +# # # +# # # count_k = sum_right[c] +# # # if count_k > 0.0: +# # # count_k /= self.weighted_n_right +# # # entropy_right -= count_k * log(count_k) +# # # +# # # sum_left += self.sum_stride +# # # sum_right += self.sum_stride +# # # +# # # impurity_left[0] = entropy_left / self.n_outputs +# # # impurity_right[0] = entropy_right / self.n_outputs +# +# +# calloc +# +# free +# +# memcpy +# +# memset +# +# fabs +# +# malloc +# +# realloc +# # from libc.math cimport log as ln +# +# import numpy as np +# from sklearn.tree import Crit +# +# cimport +# numpy as np +# np.import_array() +# # from sklearn.tree._criterion cimport Criterion, ClassificationCriterion +# +# cdef +# realloc_ptr +# safe_realloc(realloc_ptr * p, size_t +# nelems) nogil except *: +# # sizeof(realloc_ptr[0]) would be more like idiomatic C, but causes Cython +# # 0.20.1 to crash. 
+# cdef +# size_t +# nbytes = nelems * sizeof(p[0][0]) +# if nbytes / sizeof(p[0][0]) != nelems: +# # Overflow in the multiplication +# with gil: +# raise MemoryError("could not allocate (%d * %d) bytes" +# % (nelems, sizeof(p[0][0]))) +# cdef +# realloc_ptr +# tmp = < realloc_ptr > realloc(p[0], nbytes) +# if tmp == NULL: +# with gil: +# raise MemoryError("could not allocate %d bytes" % nbytes) +# p[0] = tmp +# return tmp # for +# +# cdef +# inline +# np.ndarray +# sizet_ptr_to_ndarray(SIZE_t * data, SIZE_t +# size): +# """Return copied data as 1D numpy array of intp's.""" +# cdef +# np.npy_intp +# shape[1] +# shape[0] = < np.npy_intp > size +# return np.PyArray_SimpleNewFromData(1, shape, np.NPY_INTP, data).copy() +# +# cdef +# +# +# class CustomCriterion(Criterion): # """Interface for impurity criteria. # This object stores methods on how to calculate how good a split is using # different metrics. diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py index a1fcab81..0e689085 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py @@ -222,11 +222,11 @@ def saveResults(stringAnalysis, outputFileName, full_labels_pred, y_train_pred, testFileName = outputFileName + imageName + "-" + str( i) + ".png" if not os.path.isfile(testFileName): - imagesAnalysis[imageName].savefig(testFileName) + imagesAnalysis[imageName].savefig(testFileName, transparent=True) break imagesAnalysis[imageName].savefig( - outputFileName + imageName + '.png') + outputFileName + imageName + '.png', transparent=True) if __name__ == '__main__': diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/ExportResults.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/ExportResults.py index ba1a9088..086080ee 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/ExportResults.py +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/ExportResults.py @@ -135,11 +135,11 @@ def showScoreTime(directory, filename, store, resScore, resTime, rangeX, for i in range(1, 20): testFileName = filename + "-" + str(i) + ".png" if not os.path.isfile(directory + testFileName): - plt.savefig(directory + testFileName) + plt.savefig(directory + testFileName, transparent=True) break else: - plt.savefig(file) + plt.savefig(file, transparent=True) else: plt.show() @@ -180,11 +180,11 @@ def showResults(directory, filename, db, feat, score): for i in range(1, 20): testFileName = filename + "-" + str(i) + ".png" if not os.path.isfile(directory + testFileName): - plt.savefig(directory + testFileName) + plt.savefig(directory + testFileName, transparent=True) break else: - plt.savefig(file) + plt.savefig(file, transparent=True) plt.close() @@ -262,11 +262,11 @@ def plot_confusion_matrix(directory, filename, df_confusion, for i in range(1, 20): testFileName = filename + "-" + str(i) + ".png" if not os.path.isfile(directory + testFileName): - plt.savefig(directory + testFileName) + plt.savefig(directory + testFileName, transparent=True) break else: - plt.savefig(file) + plt.savefig(file, transparent=True) plt.close() diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/MonoviewUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/MonoviewUtils.py index daa2fff7..1d6c4129 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/MonoviewUtils.py +++ 
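saveResults in ExecClassifMonoView.py and the ExportResults helpers above share one saving pattern, now with transparent backgrounds: probe numbered file names so an existing figure is never overwritten, then call savefig with transparent=True. A compact sketch of that pattern (helper name and signature are illustrative, not taken from the code):

```python
import os
import matplotlib
matplotlib.use('Agg')  # raster backend, as in ExecClassif.py
import matplotlib.pyplot as plt

def save_figure(fig, directory, base_name, max_tries=20):
    # Find a free "<base_name>.png" / "<base_name>-i.png" slot, then save with
    # a transparent background (the change applied to every savefig call here).
    file_name = os.path.join(directory, base_name + ".png")
    if os.path.isfile(file_name):
        for i in range(1, max_tries):
            candidate = os.path.join(directory, "%s-%d.png" % (base_name, i))
            if not os.path.isfile(candidate):
                file_name = candidate
                break
    fig.savefig(file_name, transparent=True)
    plt.close(fig)
```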
b/multiview_platform/MonoMultiViewClassifiers/Monoview/MonoviewUtils.py @@ -190,7 +190,7 @@ class BaseMonoviewClassifier(object): ax.yaxis.set_major_formatter(formatter) plt.bar(x, featureImportancesSorted) plt.title("Importance depending on feature") - fig.savefig(directory + "feature_importances.png") + fig.savefig(directory + "feature_importances.png", transparent=True) plt.close() featuresImportancesDict = dict((featureIndex, featureImportance) for featureIndex, featureImportance in diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py index 6e70dc9d..9df79130 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py @@ -55,6 +55,7 @@ class AdaboostPregen(AdaBoostClassifier, BaseMonoviewClassifier, self.metrics = np.array( [self.plotted_metric.score(change_label_to_zero(pred), y) for pred in self.staged_predict(pregen_X)]) + self.bounds = np.array([np.prod( np.sqrt(1 - 4 * np.square(0.5 - self.estimator_errors_[:i + 1]))) for i in diff --git a/multiview_platform/MonoMultiViewClassifiers/Multiview/ExecMultiview.py b/multiview_platform/MonoMultiViewClassifiers/Multiview/ExecMultiview.py index a8b97339..54cd4e85 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Multiview/ExecMultiview.py +++ b/multiview_platform/MonoMultiViewClassifiers/Multiview/ExecMultiview.py @@ -69,11 +69,11 @@ def saveResults(LABELS_DICTIONARY, stringAnalysis, views, classifierModule, testFileName = outputFileName + imageName + "-" + str( i) + ".png" if not os.path.isfile(testFileName): - imagesAnalysis[imageName].savefig(testFileName) + imagesAnalysis[imageName].savefig(testFileName, transparent=True) break imagesAnalysis[imageName].savefig( - outputFileName + imageName + '.png') + outputFileName + imageName + '.png', transparent=True) def ExecMultiview_multicore(directory, coreIndex, name, learningRate, nbFolds, diff --git a/multiview_platform/MonoMultiViewClassifiers/ResultAnalysis.py b/multiview_platform/MonoMultiViewClassifiers/ResultAnalysis.py index 34fb2ad5..22ba5ec0 100644 --- a/multiview_platform/MonoMultiViewClassifiers/ResultAnalysis.py +++ b/multiview_platform/MonoMultiViewClassifiers/ResultAnalysis.py @@ -249,7 +249,7 @@ def plotMetricScores(trainScores, testScores, names, nbResults, metricName, plt.tight_layout() except: pass - f.savefig(fileName + '.png') + f.savefig(fileName + '.png', transparent=True) plt.close() import pandas as pd if train_STDs is None: @@ -377,7 +377,7 @@ def publish2Dplot(data, classifiersNames, nbClassifiers, nbExamples, nbCopies, cbar = fig.colorbar(cax, ticks=[-100 * statsIter / 2, 0, statsIter]) cbar.ax.set_yticklabels(['Unseen', 'Always Wrong', 'Always Right']) fig.tight_layout() - fig.savefig(fileName + "error_analysis_2D.png", bbox_inches="tight") + fig.savefig(fileName + "error_analysis_2D.png", bbox_inches="tight", transparent=True) plt.close() @@ -405,7 +405,7 @@ def publishErrorsBarPlot(errorOnExamples, nbClassifiers, nbExamples, fileName): plt.bar(x, errorOnExamples) plt.ylim([0, nbClassifiers]) plt.title("Number of classifiers that failed to classify each example") - fig.savefig(fileName + "error_analysis_bar.png") + fig.savefig(fileName + "error_analysis_bar.png", transparent=True) plt.close() diff --git a/multiview_platform/MonoMultiViewClassifiers/utils/GetMultiviewDb.py 
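publish2Dplot above renders the per-example, per-classifier error matrix with a three-level colorbar and now saves it with a transparent background. A self-contained sketch of that figure; the matrix encoding and the colormap are assumptions inferred from the visible colorbar ticks and labels:

```python
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

def plot_error_2d(data, classifier_names, stats_iter, file_name):
    # data[i, j]: how often classifier j was right on example i over stats_iter
    # runs; unseen examples are encoded as -100 * stats_iter / 2.
    fig, ax = plt.subplots()
    cax = ax.pcolor(data, cmap=plt.cm.coolwarm)
    ax.set_xticks(np.arange(len(classifier_names)) + 0.5)
    ax.set_xticklabels(classifier_names, rotation=45, ha="right")
    cbar = fig.colorbar(cax, ticks=[-100 * stats_iter / 2, 0, stats_iter])
    cbar.ax.set_yticklabels(['Unseen', 'Always Wrong', 'Always Right'])
    fig.tight_layout()
    fig.savefig(file_name + "error_analysis_2D.png",
                bbox_inches="tight", transparent=True)
    plt.close(fig)

# plot_error_2d(scores, ["SVM", "DT", "Adaboost"], stats_iter=5, file_name="out/")
```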
b/multiview_platform/MonoMultiViewClassifiers/utils/GetMultiviewDb.py index 1bfa4f92..c60796db 100644 --- a/multiview_platform/MonoMultiViewClassifiers/utils/GetMultiviewDb.py +++ b/multiview_platform/MonoMultiViewClassifiers/utils/GetMultiviewDb.py @@ -333,25 +333,26 @@ def filterViews(datasetFile, temp_dataset, views, usedIndices): for viewIndex in range(datasetFile.get("Metadata").attrs["nbView"]): copyhdf5Dataset(datasetFile, temp_dataset, "View" + str(viewIndex), "View" + str(viewIndex), usedIndices) - for askedViewName in views: - for viewIndex in range(datasetFile.get("Metadata").attrs["nbView"]): - viewName = datasetFile.get("View" + str(viewIndex)).attrs["name"] - if type(viewName) == bytes: - viewName = viewName.decode("utf-8") - if viewName == askedViewName: - copyhdf5Dataset(datasetFile, temp_dataset, - "View" + str(viewIndex), - "View" + str(newViewIndex), usedIndices) - newViewName = \ - temp_dataset.get("View" + str(newViewIndex)).attrs["name"] - if type(newViewName) == bytes: - temp_dataset.get("View" + str(newViewIndex)).attrs[ - "name"] = newViewName.decode("utf-8") - - newViewIndex += 1 - else: - pass - temp_dataset.get("Metadata").attrs["nbView"] = len(views) + else: + for askedViewName in views: + for viewIndex in range(datasetFile.get("Metadata").attrs["nbView"]): + viewName = datasetFile.get("View" + str(viewIndex)).attrs["name"] + if type(viewName) == bytes: + viewName = viewName.decode("utf-8") + if viewName == askedViewName: + copyhdf5Dataset(datasetFile, temp_dataset, + "View" + str(viewIndex), + "View" + str(newViewIndex), usedIndices) + newViewName = \ + temp_dataset.get("View" + str(newViewIndex)).attrs["name"] + if type(newViewName) == bytes: + temp_dataset.get("View" + str(newViewIndex)).attrs[ + "name"] = newViewName.decode("utf-8") + + newViewIndex += 1 + else: + pass + temp_dataset.get("Metadata").attrs["nbView"] = len(views) def copyhdf5Dataset(sourceDataFile, destinationDataFile, sourceDatasetName, @@ -447,6 +448,14 @@ def add_gaussian_noise(dataset_file, random_state, path_f, dataset_name, view_limits[:, 0], noised_data) noised_data = np.where(noised_data > view_limits[:, 1], view_limits[:, 1], noised_data) + # import matplotlib.pyplot as plt + # plt.imshow(noised_data[1,:].reshape((28,28))) + # plt.savefig("plif.png") + # lower_contrast = view_dset.value[1,:].reshape((28,28))/10 + # print(np.max(lower_contrast)) + # plt.imshow(lower_contrast.astype(int)) + # plt.savefig("plif2.png") + # quit() noisy_dataset[view_name][...] = noised_data # final_shape = noised_data.shape return noisy_dataset, dataset_name + "_noised" diff --git a/multiview_platform/MonoMultiViewClassifiers/utils/HyperParameterSearch.py b/multiview_platform/MonoMultiViewClassifiers/utils/HyperParameterSearch.py index 84e03d89..08b23063 100644 --- a/multiview_platform/MonoMultiViewClassifiers/utils/HyperParameterSearch.py +++ b/multiview_platform/MonoMultiViewClassifiers/utils/HyperParameterSearch.py @@ -146,7 +146,7 @@ def genHeatMaps(params, scoresArray, outputFileName): plt.yticks(np.arange(len(paramArray2Set)), paramArray2Set, rotation=45) plt.title('Validation metric') plt.savefig( - outputFileName + "heat_map-" + paramName1 + "-" + paramName2 + ".png") + outputFileName + "heat_map-" + paramName1 + "-" + paramName2 + ".png", transparent=True) plt.close() # nohup python ~/dev/git/spearmint/spearmint/main.py . 
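The GetMultiviewDb hunk above fixes filterViews so that copying every view (views == [""]) and copying only the requested views are genuine alternatives (the new else branch) instead of running one after the other. A sketch of the corrected control flow, assuming h5py-style datasets and the module's existing copyhdf5Dataset helper:

```python
from multiview_platform.MonoMultiViewClassifiers.utils.GetMultiviewDb import \
    copyhdf5Dataset  # the project's HDF5 copy helper

def filter_views(dataset_file, temp_dataset, views, used_indices):
    new_view_index = 0
    nb_view = dataset_file.get("Metadata").attrs["nbView"]
    if views == [""]:
        # No specific view requested: copy every view unchanged.
        for view_index in range(nb_view):
            copyhdf5Dataset(dataset_file, temp_dataset,
                            "View" + str(view_index),
                            "View" + str(view_index), used_indices)
    else:
        # Copy only the requested views, re-numbering them from 0.
        for asked_view_name in views:
            for view_index in range(nb_view):
                view_name = dataset_file.get("View" + str(view_index)).attrs["name"]
                if isinstance(view_name, bytes):
                    view_name = view_name.decode("utf-8")
                if view_name == asked_view_name:
                    copyhdf5Dataset(dataset_file, temp_dataset,
                                    "View" + str(view_index),
                                    "View" + str(new_view_index), used_indices)
                    new_view_index += 1
        temp_dataset.get("Metadata").attrs["nbView"] = len(views)
```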
&
diff --git a/multiview_platform/MonoMultiViewClassifiers/utils/execution.py b/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
index ad2b75e7..92b96cbf 100644
--- a/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
+++ b/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
@@ -848,7 +848,6 @@ def genSplits(labels, splitRatio, statsIterRandomStates):
                                       random_state=randomState,
                                       test_size=splitRatio)
         folds = foldsObj.split(indices, labels)
-        print(indices)
         for fold in folds:
             train_fold, test_fold = fold
             trainIndices = indices[train_fold]
@@ -907,6 +906,7 @@ def initViews(DATASET, argViews):
         Names of all the available views in the dataset.
     """
     NB_VIEW = DATASET.get("Metadata").attrs["nbView"]
+    print(NB_VIEW)
     if argViews != [""]:
         allowedViews = argViews
         allViews = [str(DATASET.get("View" + str(viewIndex)).attrs["name"])
--
GitLab