From 40692642dc593a3c7cc716102c8825bbeb1aee73 Mon Sep 17 00:00:00 2001
From: Dominique Benielli <dominique.benielli@lis-lab.fr>
Date: Tue, 17 Dec 2019 17:14:22 +0100
Subject: [PATCH] first add

---
 demo/result.py                     |  89 ++++
 generator/__init__.py              |   1 +
 generator/multiviews_datasets.py   | 194 +++++++
 generator/parameters.py            |  83 +++
 generator/tests/test_classifier.py | 823 +++++++++++++++++++++++++++++
 5 files changed, 1190 insertions(+)
 create mode 100644 demo/result.py
 create mode 100644 generator/__init__.py
 create mode 100644 generator/multiviews_datasets.py
 create mode 100644 generator/parameters.py
 create mode 100644 generator/tests/test_classifier.py

diff --git a/demo/result.py b/demo/result.py
new file mode 100644
index 0000000..9d2aac6
--- /dev/null
+++ b/demo/result.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov 27 16:14:14 2019
+
+@author: bernardet
+"""
+
+import parameters
+from multiviews_datasets import generator_multiviews_dataset, results_to_csv
+from test_classifier import score_multiviews_n_samples, graph_comparaison_classifier_scores_n_samples, score_multiviews_R, score_multiviews_Z_factor, score_multiviews_n_views_R, score_multiviews_class_sep, score_one_multiview_dataset, score_multiviews_n_informative_divided
+
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+n_samples = parameters.n_samples
+n_views = parameters.n_views
+n_classes = 3#parameters.n_classes
+Z_factor = parameters.Z_factor
+R = parameters.R
+n_clusters_per_class = 1#parameters.n_clusters_per_class
+class_sep_factor = 2#5#2#parameters.class_sep_factor
+n_informative_divid = 2#parameters.n_informative_divid
+cv = parameters.cv
+classifier = parameters.classifier
+classifier_dictionary = parameters.classifier_dictionary
+d = parameters.d
+D = parameters.D
+standard_deviation = parameters.standard_deviation
+path_data = parameters.path_data
+path_graph = parameters.path_graph
+n_samples_list = parameters.n_samples_list
+R_list = parameters.R_list
+Z_factor_list = parameters.Z_factor_list
+n_views_list = parameters.n_views_list
+class_sep_factor_list = parameters.class_sep_factor_list
+n_informative_divid_list = parameters.n_informative_divid_list
+
+
+# Generate one dataset
+#Z, y, multiviews_list, unsued_columns_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
+#print(Z, y, multiviews_list)
+
+# Save one multiview dataset to CSV files
+#results_to_csv(path_data, Z, y, multiviews_list)
+
+# Score of one multiview dataset
+#df_dimensions, df_scores_means, df_scores_std = score_one_multiview_dataset(cv, classifier, classifier_dictionary, n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
+#print(df_dimensions, df_scores_means, df_scores_std)
+
+# Scores of n_samples_list datasets
+#mean_samples, std_samples = score_multiviews_n_samples(n_samples_list, path_graph, cv, classifier, classifier_dictionary, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
+#print(mean_samples, std_samples)
+
+# Plot scores of classifier2 vs scores of classifier1
+classifier1 = "SVM"
+classifier2 = "NB"
+#graph_comparaison_classifier_scores_n_samples(classifier1, classifier2, n_samples_list, path_graph, cv, classifier_dictionary, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, 
n_informative_divid, d, D, standard_deviation) + +# Scores of R_list datasets +#mean_R, std_R = score_multiviews_R(R_list, path_graph, cv, classifier, classifier_dictionary, n_samples, n_views, n_classes, Z_factor, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) +#print(mean_R, std_R) + +# Scores of Z_factor_list datasets +#mean_Z, std_Z, error_Z = score_multiviews_Z_factor(Z_factor_list, path_graph, cv, classifier, classifier_dictionary, n_samples, n_views, n_classes, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) +#print(mean_Z, std_Z, error_Z) + +# Scores divided by scores for R=1 (redundancy null) of n_views_list and R_list datasets +#dict_n_views_R_ratio = score_multiviews_n_views_R(n_views_list, R_list, path_graph, cv, classifier, classifier_dictionary, n_samples, n_classes, Z_factor, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) +#print(dict_n_views_R_ratio) + +# Scores of class_sep_factor_list datasets +#df_mean, df_std = score_multiviews_class_sep(class_sep_factor_list, path_data, path_graph, cv, classifier, classifier_dictionary, n_views, n_samples, n_classes, Z_factor, R, n_clusters_per_class, n_informative_divid, d, D, standard_deviation) +#print(df_mean, df_std) + +# Scores of n_informative_divid_list datasets +#mean_n_info, std_n_info = score_multiviews_n_informative_divided(n_informative_divid_list, path_graph, cv, classifier, classifier_dictionary, n_views, n_samples, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, d, D, standard_deviation) +#print(mean_n_info, std_n_info) + + +Z_factor_list = [1, 3, 10, 25, 100, 250, 1000] +path_graph = "/home/bernardet/Documents/StageL3/Graph/n_views_3_10_1_clus_2_n_info_div/" +n_classes = 2 +n_clusters_per_class = 1 +class_sep_factor = 2 +n_informative_divid = 2 +for n_views in range(3, 11): + n_samples = 500*n_views + mean_Z, std_Z, error_Z = score_multiviews_Z_factor(Z_factor_list, path_graph, cv, classifier, classifier_dictionary, n_samples, n_views, n_classes, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) diff --git a/generator/__init__.py b/generator/__init__.py new file mode 100644 index 0000000..a1404fc --- /dev/null +++ b/generator/__init__.py @@ -0,0 +1 @@ +__version__ = '1.0.dev0' diff --git a/generator/multiviews_datasets.py b/generator/multiviews_datasets.py new file mode 100644 index 0000000..44babd9 --- /dev/null +++ b/generator/multiviews_datasets.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Nov 26 15:38:38 2019 + +@author: bernardet +""" + +from sklearn.datasets import make_classification +from random import gauss +from math import ceil, floor +import numpy as np +import pandas as pd + + +def latent_space_dimension(views_dimensions_list, R): + """ + Returns the minimal dimension of latent space (enough to build the dataset) for generator_multiviews_dataset compared to views_dimensions_list + + Parameters: + ----------- + views_dimensions_list : list + R : float + + Returns: + -------- + an int + """ + max_view_dimension = max(views_dimensions_list) + dimension = ceil(R*sum(views_dimensions_list)) + + if dimension < max_view_dimension: + dimension = max_view_dimension + + reduced_dimension = dimension + remove_sum = 0 + + for num_view in range(1, len(views_dimensions_list)): + view_prec = views_dimensions_list[num_view - 1] + view_current = views_dimensions_list[num_view] + remove = 
floor(R*view_prec)
+        remove_sum += remove
+        if reduced_dimension - remove < view_current:
+            dimension += view_current - (reduced_dimension - remove)
+        reduced_dimension = dimension - remove_sum
+
+    return dimension
+
+
+def projection(latent_space, chosen_columns_list):
+    """
+    Returns the projection of latent_space on the columns of chosen_columns_list (in the order of chosen_columns_list)
+
+    Parameters:
+    -----------
+    latent_space : array
+    chosen_columns_list : list
+
+    Returns:
+    --------
+    an array of dimension (number of rows of latent_space, length of chosen_columns_list)
+    """
+    return latent_space[:, chosen_columns_list]
+
+
+def generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation):
+    """
+    Generates a multiview dataset
+
+    Parameters:
+    -----------
+    n_samples : int
+        dataset number of samples (number of rows of the dataset)
+    n_views : int >= 2
+        dataset number of views
+        one view is a set of some features (columns) of the latent space
+    n_classes : int >= 2
+        dataset number of classes
+    Z_factor : float >= 1
+        the minimal dimension of the latent space (enough to build the dataset) is computed, then multiplied by Z_factor
+    R : 0 <= float <= 1
+        R = 1 <> no possibility of redundancy between views
+        R = 0 <> maximal possibility of redundancy between views
+    n_clusters_per_class : int
+    class_sep_factor : float
+        class_sep = n_clusters_per_class*class_sep_factor
+    n_informative_divid : float >= 1
+        the higher n_informative_divid, the higher the number of non-informative features
+        n_informative_divid = 1 <> no non-informative features, number of informative features = dimension of latent space
+        number of informative features = round(dimension of latent space / n_informative_divid)
+    d : float >= 1
+        minimal dimension of views
+        dimension of each view (int) drawn from N((d+D)/2, standard_deviation^2) with d <= dimension of view <= D
+    D : float >= d
+        maximal dimension of views
+        dimension of each view (int) drawn from N((d+D)/2, standard_deviation^2) with d <= dimension of view <= D
+    standard_deviation : float
+        standard deviation of the gaussian distribution N((d+D)/2, standard_deviation^2)
+        dimension of each view (int) drawn from N((d+D)/2, standard_deviation^2) with d <= dimension of view <= D
+
+    Returns:
+    --------
+    Z : an array of dimension (n_samples, dimension of latent space) = the generated samples
+    y : an array of dimension (n_samples) = the integer labels for class membership of each sample
+    a list of n_views tuples (X_v, I_v) with :
+        X_v = Z projected on the d_v columns (d_v = dimension of the v-th view) listed in I_v
+        I_v = the indices (as column numbers of Z) of the columns kept in X_v
+    unsued_dimensions_percent : percentage of latent space columns unused in the views
+    n_informative : number of informative features (dimension of latent space - n_informative = number of non-informative features)
+    """
+
+    if n_views < 2:
+        raise ValueError("n_views must be >= 2")
+    if n_classes < 2:
+        raise ValueError("n_classes must be >= 2")
+    if Z_factor < 1:
+        raise ValueError("Z_factor must be >= 1 for the algorithm to work properly")
+    if d < 1:
+        raise ValueError("d must be >= 1")
+    if (d+D)/2 - 3*standard_deviation < 0:
+        raise ValueError("(d+D)/2 - 3*standard_deviation must be >= 0 so that the normal distribution only yields positive view dimensions")
+
+    # draw the n_views view dimensions from N((d+D)/2, standard_deviation^2)
+    d_v = np.random.normal(loc=(d+D)/2, scale=standard_deviation, size=n_views)
+    d_v = list(d_v)
+    remove_list, add_list = [], []
+    for dim_view in d_v:
+        if dim_view < d or dim_view > D: # 1 <= d <= dim_view <= D
+            remove_list.append(dim_view)
+            add = -1
+            while add < d or add > D:
+                add = gauss((d+D)/2, standard_deviation)
+            add_list.append(add)
+    d_v = [view for view in d_v if view not in remove_list] + add_list
+    d_v = [int(view) for view in d_v] # dimension of views = integer
+    # d_v = list of view dimensions from the highest to the lowest
+    d_v.sort(reverse=True)
+    # Dimension of latent space Z (multiplied by Z_factor)
+    dim_Z = Z_factor*latent_space_dimension(d_v, R)
+    # Number of informative features
+    n_informative = round(dim_Z/n_informative_divid)
+    # Generation of latent space Z
+    Z, y = make_classification(n_samples=n_samples, n_features=dim_Z, n_informative=n_informative, n_redundant=0,
+                               n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters_per_class, weights=None,
+                               flip_y=0.01, class_sep=n_clusters_per_class*class_sep_factor, random_state=None)
+
+    I_q = np.array([i for i in range(Z.shape[1])]) # 1D-array of the column indices of Z
+    meta_I_v = []
+    results = []
+    for view in range(n_views):
+        # choose d_v[view] column indices of Z uniformly from I_q
+        I_v = np.random.choice(I_q, size=d_v[view], replace=False) # drawn from I_q without replacement, size d_v[view]
+        meta_I_v += list(I_v)
+        # projection of Z along the columns in I_v
+        X_v = projection(Z, I_v)
+        results.append((X_v, I_v))
+        # remove R*d_v[view] column indices of I_v from I_q
+        elements_to_remove = np.random.choice(I_v, size=floor(R*d_v[view]), replace=False) # drawn from I_v without replacement, size floor(R*d_v[view])
+        I_q = np.setdiff1d(I_q, elements_to_remove) # I_q without the elements of elements_to_remove
+    unsued_dimensions_list = [column for column in I_q if column not in meta_I_v]
+    unsued_dimensions_percent = round((len(unsued_dimensions_list) / dim_Z)*100, 2)
+    return Z, y, results, unsued_dimensions_percent, n_informative
+
+
+def results_to_csv(path, latent_space, integer_labels, multiviews_list):
+    """
+    Creates len(multiviews_list) + 2 csv files at the indicated path
+    File names :
+        latent_space.csv for latent_space
+        integer_labels.csv for integer_labels
+        view0.csv for multiviews_list[0]
+
+    Parameters:
+    -----------
+    path : str
+    latent_space : array
+    integer_labels : 1D array
+    multiviews_list : list of tuples
+
+    Returns:
+    --------
+    None
+    """
+    df_latent_space = pd.DataFrame(latent_space)
+    df_latent_space.to_csv(path+'latent_space.csv', index=False)
+
+    df_labels = pd.DataFrame(integer_labels)
+    df_labels.to_csv(path+'integer_labels.csv', index=False)
+
+    cpt = 0
+    for view_tuple in multiviews_list:
+        df_view = pd.DataFrame(view_tuple[0], columns=view_tuple[1])
+        df_view.to_csv(path+'view'+str(cpt)+'.csv', index=False)
+        cpt += 1
diff --git a/generator/parameters.py b/generator/parameters.py
new file mode 100644
index 0000000..1847edf
--- /dev/null
+++ b/generator/parameters.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Nov 26 13:53:05 2019
+
+@author: bernardet
+"""
+from sklearn.svm import SVC
+from sklearn.naive_bayes import GaussianNB
+import numpy as np
+
+# General parameters
+n_samples = 1000
+# number of samples (int)
+n_views = 3
+# number of views >= 2 (int)
+n_classes = 2
+# number of classes >= 2 (int)
+Z_factor = 250
+# multiplication factor of Z dimension (default value = 1)
+R = 2/3
+# redundancy parameter (0 <= float <= 1)
+cv = 10
+# number of cross-validation splits (int)
+n_clusters_per_class = 2
+# number of clusters per class >= 1 (int)
+class_sep_factor = 2
+# factor >= 1 as class_sep = n_clusters_per_class*class_sep_factor
+n_informative_divid = 1
+# factor >= 1 as number of informative features = round(dimension of latent space / n_informative_divid)
+classifier = "SVM"
+# name of classifier (str)
+classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}
+# dictionary of classifiers
+n_samples_list = [100, 500, 1000, 1500, 2000]#, 2500, 3000]#, 3500, 4000, 5000, 7000, 10000]
+# list of number of samples to test the generator
+R_list = list(np.arange(0, 1.05, 0.05))
+# list of diverse R
+Z_factor_list = [1, 3, 10, 25, 100, 250, 1000]#[25, 50, 75, 100, 150, 200, 250, 500, 600, 750, 800, 900, 1000]
+# list of diverse Z_factor
+n_views_list = [n_view for n_view in range(2, 10)]
+# list of diverse n_views
+class_sep_factor_list = [2, 5, 10]
+# list of diverse class_sep_factor
+n_informative_divid_list = [1, 2, 3]
+# list of diverse n_informative_divid
+path_data = "/home/bernardet/Documents/StageL3/Data/"
+# path where the multiview dataset is saved
+path_graph = "/home/bernardet/Documents/StageL3/Graph/"
+# path where the score graphs are saved
+
+
+
+# Parameters of gaussian distribution N((d+D)/2, standard_deviation^2) :
+# d <= dim[v] <= D for all v
+# (d+D)/2 - 3*standard_deviation >= 0
+d = 4
+# < D, > 0
+D = 10
+# > d
+standard_deviation = 2
+# standard deviation of the gaussian distribution
+
+
+# make_classification parameters :
+
+# TODO: work out how to use these parameters
+part_informative = 0
+# proportion of informative features (float between 0 and 1)
+part_redundant = 1
+# proportion of redundant features (float between 0 and 1)
+# n_redundant >= 1 for redundant features
+part_repeated = 1
+# proportion of repeated features (float between 0 and 1)
+# n_repeated >= 1 for useless features and correlation
+
+
+weights = [0.7, 0.3]
+# proportion of samples assigned to each class (list), len(weights) = n_classes
+# != [0.5, 0.5] / = [0.8, 0.2] for imbalance
+flip_y = 0.1
+# fraction of samples whose class is randomly exchanged (float)
+# > 0 for noise
diff --git a/generator/tests/test_classifier.py b/generator/tests/test_classifier.py
new file mode 100644
index 0000000..8571ace
--- /dev/null
+++ b/generator/tests/test_classifier.py
@@ -0,0 +1,823 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Nov 28 14:14:46 2019
+
+@author: bernardet
+"""
+
+from multiviews_datasets import generator_multiviews_dataset, results_to_csv
+from sklearn.svm import SVC
+from sklearn.naive_bayes import GaussianNB
+from sklearn.model_selection import cross_val_score, StratifiedKFold
+from sklearn.metrics import accuracy_score
+from collections import Counter
+from mpl_toolkits.mplot3d import Axes3D
+from math import sqrt
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+
+def majority_list(predictions_list):
+    """
+    Returns an array whose i-th entry is the majority class among the i-th entries of the arrays in predictions_list
+
+    Parameters:
+    -----------
+    predictions_list : list of 1D array
+
+    Returns:
+    --------
+    a 1D array
+    """
+    n_samples = len(predictions_list[0])
+    # majority_prediction[i] = the prediction that occurs most often among the views for the i-th sample
+    majority_prediction = np.array([-1]*n_samples)
+    # concatenate_predictions_list[i] = the predictions of each view for the i-th sample
+    reshape_predictions_list = [predictions_list[i].reshape(len(predictions_list[i]), 1) for i in range(len(predictions_list))]
+    concatenate_predictions_list = 
np.hstack(reshape_predictions_list) + for sample in range(n_samples): + # dictionary contains predictions (key) and its occurences in concatenate_predictions_list[sample] + count = Counter(concatenate_predictions_list[sample]) + maj_value = max(count.values()) # maximal number of a prediction + for key in count.keys(): # searchs the prediction with the maximal occurence number + if count[key] == maj_value: + majority_prediction[sample] = key + break + + return majority_prediction + + +def majority_score(views_dictionary, integer_labels, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}): + """ + Returns the mean and the standard deviation of accuracy score when predictions are selected by majority of predictions of different views + + Parameters: + ----------- + views_dictionary : dict + integer_labels = array + cv : int + classifier : str + classifier_dictionary : dict + + Returns: + -------- + Two floats + """ + skf = StratifiedKFold(n_splits=cv, random_state=1, shuffle=True) # provides cv train/test indices to split data in cv train/test sets. + prediction_list = [[] for i in range(cv)] # for majority_list function + test_list = [[] for i in range(cv)] # for score + + for key in views_dictionary.keys(): + i = 0 + for train_index, test_index in skf.split(views_dictionary[key], integer_labels): + # splits data and integer label of one view in test and train sets + X = views_dictionary[key] + train, test = X[train_index], X[test_index] + y_train, y_test = integer_labels[train_index], integer_labels[test_index] + # trains the classifier and tests it with test set + clf = classifier_dictionary[classifier] + clf.fit(train, y_train.ravel()) + y_pred = clf.predict(test) + + prediction_list[i].append(y_pred) + if len(test_list[i]) == 0: # same y_test for all views + test_list[i] = y_test + i += 1 + + score = [] + for i in range(len(prediction_list)): + y_pred_majority = majority_list(prediction_list[i]) # majority of views predictions + score.append(accuracy_score(test_list[i].ravel(), y_pred_majority)) # score of majority of views predictions vs expected predictions + score = np.array(score) + return score.mean(), score.std() + + +def score_one_multiview_dataset(cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_samples=1000, n_views=3, n_classes=2, Z_factor=1, R=2/3, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2): + """ + Returns 3 Series (first with dimensions of latent space, views and percentage of dimensions of latent space unsued in views, the second with accuracy score and the third with the standard deivation of accuracy score) of latent space, views, + early fusion predictions (concatenate views predictions) and late fusion predictions (majority views predictions) + + Parameters: + ----------- + cv : int + classifier : str + classifier_dictionary : dict + n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset + + Returns: + -------- + 3 Series + """ + # dictionary contains percentage of unsued dimension of latent space and dimension of latent space and views + dimensions = {'unsued dimension of latent space':0, "number of informative features":0, 'latent space':0} + dimensions.update({'view'+str(i):0 for i in range(n_views)}) + # dictionary contains and mean of accuracy scores + dict_scores_means = {'latent space':0} + 
dict_scores_means.update({'view'+str(i):0 for i in range(n_views)}) + dict_scores_means.update({'early fusion':0, 'late fusion':0}) + # dictionary contains standard deviation of accuracy scores + dict_scores_std = {'latent space':[]} + dict_scores_std.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_std.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains data of each view + dict_views = {'view'+str(i):0 for i in range(n_views)} + + Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) + dimensions["unsued dimension of latent space"] = unsued_dimensions_percent + dimensions["number of informative features"] = n_informative + dimensions["latent space"] = Z.shape + + + for i in range(n_views): + # multiviews_list[i] = (columns / data of view i, numeros of columns of view i) + dict_views['view'+str(i)] = multiviews_list[i][0] + dimensions['view'+str(i)] = multiviews_list[i][0].shape + + early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1) # = concatenation of all views + # dictionary of data + dict_data_df = {'latent space':Z} + dict_data_df.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)}) + dict_data_df.update({'early fusion':early_fusion}) + + for key in dict_data_df.keys(): + clf = classifier_dictionary[classifier] + score = cross_val_score(clf, dict_data_df[key], y, scoring='accuracy', cv=cv) + dict_scores_means[key] = score.mean() + dict_scores_std[key] = score.std() + + mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary) + dict_scores_means['late fusion'] = mean_majority + dict_scores_std['late fusion'] = std_majority + + df_dimensions = pd.Series(dimensions) + df_scores_means = pd.Series(dict_scores_means) + df_scores_std = pd.Series(dict_scores_std) + + return df_dimensions, df_scores_means, df_scores_std + + +def score_multiviews_n_samples(n_samples_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_views=3, n_classes=2, Z_factor=1, R=2/3, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2): + """ + Returns 2 DataFrames (first with accuracy score and the second with the standard deivation of accuracy score) of latent space, views, + early fusion predictions (concatenate views predictions) and late fusion predictions (majority views predictions) with n_samples_list as index for the indicated classifier + Creates and saves (at the indicated path path_graph) a graph represented accuracy score (with confidence interval) vs n_samples_list + + Parameters: + ----------- + n_samples_list : list + each element from n_samples_list defines a new dataset with element samples + path_graph : str + path to save graphics + cv : int + classifier : str + classifier_dictionary : dict + n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset + + Returns: + -------- + 2 DataFrames with n_samples_list as index + """ + # n_samples_list = list of samples dimension from the lowest to the highest + n_samples_list.sort(reverse=False) + # list of percentage of unsued columns of latent space in views + unsued_dimensions_percent_list = [] + # list of number of informative features of latent 
space + n_informative_list = [] + # dictionary contains mean of accuracy scores per n_samples + dict_scores_means = {'latent space':[]} + dict_scores_means.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_means.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains standard deviation of accuracy scores per n_samples + dict_scores_std = {'latent space':[]} + dict_scores_std.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_std.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains data of each view + dict_views = {'view'+str(i):0 for i in range(n_views)} + + for n_samples in n_samples_list: + Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) + unsued_dimensions_percent_list.append(unsued_dimensions_percent) + n_informative_list.append(n_informative) + + + for i in range(n_views): + # multiviews_list[i] = (columns / data of view i, numeros of columns of view i) + dict_views['view'+str(i)] = multiviews_list[i][0] + + early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1) # = concatenation of all views + # dictionary of data + dict_data = {'latent space':Z} + dict_data.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)}) + dict_data.update({'early fusion':early_fusion}) + + for key in dict_data.keys(): + clf = classifier_dictionary[classifier] + score = cross_val_score(clf, dict_data[key], y, scoring='accuracy', cv=cv) + dict_scores_means[key].append(score.mean()) + dict_scores_std[key].append(score.std()) + + mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary) + dict_scores_means['late fusion'].append(mean_majority) + dict_scores_std['late fusion'].append(std_majority) + + df_scores_means = pd.DataFrame(dict_scores_means, index=n_samples_list) + df_scores_std = pd.DataFrame(dict_scores_std, index=n_samples_list) + + plt.figure() + for key in dict_scores_means.keys(): + plt.errorbar(n_samples_list, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key) + # index and label for graphic + label_index = [] + for n_samples, percent, n_informative in zip(n_samples_list, unsued_dimensions_percent_list, n_informative_list): + label_index.append(str(n_samples)+'\n'+str(percent)+'\n'+str(n_informative)) + + plt.xticks(n_samples_list, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.xlabel("Number of samples\nPercentage of dimensions of latent space unsued in views\nNumber of informative features") + plt.ylabel("Accuracy score for "+classifier) + plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nAccuracy score vs number of samples for classifier "+classifier) + plt.savefig(path_graph+"score_samples_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + + return df_scores_means, df_scores_std + + +def graph_comparaison_classifier_scores_n_samples(classifier1, classifier2, n_samples_list, path_graph, cv=10, classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_views=3, n_classes=2, Z_factor=1, R=2/3, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, 
standard_deviation=2): + """ + Creates and saves (at the indicated path path_graph) multiple graphs represented scores of classifier2 vs scores of classifier1 (one graph per column of result of score_multiviews_n_samples) + + Parameters: + ----------- + classifier1 : str + classifier2 : str + n_samples_list : list + each element from n_samples_list defines a new dataset with element samples + path_graph : str + path to save graphics + cv : int + classifier : str + classifier_dictionary : dict + n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset + + Returns: + -------- + None + """ + df_scores_clf1_means, df_scores_clf1_std = score_multiviews_n_samples(n_samples_list, path_graph, cv, classifier1, classifier_dictionary, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) + df_scores_clf2_means, df_scores_clf2_std = score_multiviews_n_samples(n_samples_list, path_graph, cv, classifier2, classifier_dictionary, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) + + n_samples_list = df_scores_clf1_means.index + keys = df_scores_clf1_means.keys() + + for key in keys: + plt.figure() + plt.scatter(df_scores_clf1_means[key].values, df_scores_clf2_means[key].values, c=df_scores_clf1_means[key].values) + plt.plot([0.0, 1.1], [0.0, 1.1], "--", c=".7") # diagonal + plt.xlabel("Accuracy score for "+classifier1) + plt.ylabel("Accuracy score for "+classifier2) + plt.xlim(0, 1) + plt.ylim(0, 1) + plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+" - number of classes = "+str(n_classes)+"\nAccuracy score of "+key+" for "+classifier2+" vs "+classifier1) + plt.savefig(path_graph+classifier1+"_"+classifier2+"_"+str(n_views)+"_"+key+".png") + plt.show() + plt.close() + + +def score_multiviews_R(R_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_samples=1000, n_views=3, n_classes=2, Z_factor=1, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2): + """ + Returns 2 DataFrames (first with accuracy score and the second with the standard deivation of accuracy score) of latent space, views, + early fusion predictions (concatenate views predictions) and late fusion predictions (majority views predictions) with R_list as index for the indicated classifier + Creates and saves (at the indicated path path_graph) a graph represented accuracy score (with confidence interval) vs R_list + + Parameters: + ----------- + R_list : list + each element from R_list defines a new dataset with element as R + path_graph : str + path to save graphics + cv : int + classifier : str + classifier_dictionary : dict + n_samples, n_views, n_classes, Z_factor, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset + + Returns: + -------- + 2 DataFrames with R_list as index + """ + # R_list = list of diverse values of R from the lowest to the highest + R_list.sort(reverse=False) + # list of percentage of unsued columns of latent space in views + unsued_dimensions_percent_list = [] + # list of number of informative features of latent space + n_informative_list = [] + # dictionary contains mean of accuracy scores per R + dict_scores_means = {'latent space':[]} + 
dict_scores_means.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_means.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains standard deviation of accuracy scores per R + dict_scores_std = {'latent space':[]} + dict_scores_std.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_std.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains data of each view + dict_views = {'view'+str(i):0 for i in range(n_views)} + + for R in R_list: + Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) + unsued_dimensions_percent_list.append(unsued_dimensions_percent) + n_informative_list.append(n_informative) + + for i in range(n_views): + # multiviews_list[i] = (columns / data of view i, numeros of columns of view i) + dict_views['view'+str(i)] = multiviews_list[i][0] + + early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1) # = concatenation of all views + # dictionary of data + dict_data_df = {'latent space':Z} + dict_data_df.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)}) + dict_data_df.update({'early fusion':early_fusion}) + + for key in dict_data_df.keys(): + clf = classifier_dictionary[classifier] + score = cross_val_score(clf, dict_data_df[key], y, scoring='accuracy', cv=cv) + dict_scores_means[key].append(score.mean()) + dict_scores_std[key].append(score.std()) + + mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary) + dict_scores_means['late fusion'].append(mean_majority) + dict_scores_std['late fusion'].append(std_majority) + + df_scores_means = pd.DataFrame(dict_scores_means, index=R_list) + df_scores_std = pd.DataFrame(dict_scores_std, index=R_list) + + plt.figure() + for key in dict_scores_means.keys(): + plt.errorbar(R_list, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key) + # index and label for graphic + label_index = [] + R_label = [] + for i in range(0, len(R_list), 4): + R_label.append(R_list[i]) + label_index.append(str(round(R_list[i], 2))+'\n'+str(unsued_dimensions_percent_list[i])+'\n'+str(n_informative_list[i])) + + plt.xticks(R_label, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.xlabel("R\nPercentage of dimensions of latent space unsued in views\nNumber of informative features") + plt.ylabel("Accuracy score for "+classifier) + plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nAccuracy score vs R for classifier "+classifier) + plt.savefig(path_graph+"score_R_"+str(n_views)+"_"+str(n_samples)+"_"+str(Z_factor)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + + return df_scores_means, df_scores_std + +def score_multiviews_Z_factor(Z_factor_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_samples=1000, n_views=3, n_classes=2, R=2/3, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2): + """ + Returns 3 DataFrames (first with accuracy score, the second with the standard deivation of accuracy score and the third with the error rate) of latent space, 
views, + early fusion predictions (concatenate views predictions) and late fusion predictions (majority views predictions) with sum of views dimension divided by Z_factor_list as index for the indicated classifier + Creates and saves (at the indicated path path_graph) a graph represented accuracy score vs sum of views dimension divided by Z_factor_list and a graph represented error rate (1 - accuracy score) vs sum of views dimension divided by Z_factor_list + + Parameters: + ----------- + Z_factor_list : list + each element from Z_factor_list defines a new dataset with element as Z_factor + path_graph : str + path to save graphics + cv : int + classifier : str + classifier_dictionary : dict + n_samples, n_views, n_classes, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset + + Returns: + -------- + 3 DataFrames with Z_factor_list as index + """ + # Z_factor_list = list of diverse values of Z_factor from the highest to the lowest + Z_factor_list.sort(reverse=True) + # list of sum of views dimension for each Z_factor_list item + d_v = [] + # list of Z dimension for each Z_factor_list item + Z_dim_list = [] + # list of percentage of unsued columns of latent space in views + unsued_dimensions_percent_list = [] + # list of number of informative features of latent space + n_informative_list = [] + # dictionary contains mean of accuracy scores per Z_factor + dict_scores_means = {'latent space':[]} + dict_scores_means.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_means.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains error rate per Z_factor + dict_scores_error = {'latent space':[]} + dict_scores_error.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_error.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains standard deviation of accuracy scores per Z_factor + dict_scores_std = {'latent space':[]} + dict_scores_std.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_std.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains data of each view + dict_views = {'view'+str(i):0 for i in range(n_views)} + + for Z_factor in Z_factor_list: + Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) + unsued_dimensions_percent_list.append(unsued_dimensions_percent) + n_informative_list.append(n_informative) + + for i in range(n_views): + # multiviews_list[i] = (columns / data of view i, numeros of columns of view i) + dict_views['view'+str(i)] = multiviews_list[i][0] + + early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1) # = concatenation of all views + # dimension = number of columns + d_v.append(early_fusion.shape[1]) + Z_dim_list.append(Z.shape[1]) + # dictionary of data + dict_data_df = {'latent space':Z} + dict_data_df.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)}) + dict_data_df.update({'early fusion':early_fusion}) + + for key in dict_data_df.keys(): + clf = classifier_dictionary[classifier] + score = cross_val_score(clf, dict_data_df[key], y, scoring='accuracy', cv=cv) + dict_scores_means[key].append(score.mean()) + dict_scores_error[key].append(1 - score.mean()) + dict_scores_std[key].append(score.std()) + + mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, 
classifier_dictionary) + dict_scores_means['late fusion'].append(mean_majority) + dict_scores_error['late fusion'].append(1 - mean_majority) + dict_scores_std['late fusion'].append(std_majority) + + d_v_divid_Z = np.divide(np.array(d_v), np.array(Z_dim_list)) + + df_scores_means = pd.DataFrame(dict_scores_means, index=d_v_divid_Z) + df_scores_error = pd.DataFrame(dict_scores_error, index=d_v_divid_Z) + df_scores_std = pd.DataFrame(dict_scores_std, index=d_v_divid_Z) + + # index and label for graphics + label_index = [chr(i) for i in range(ord('a'),ord('z')+1)] + label_index = label_index[0:len(d_v)] + label_value = "" + for label, v_Z, dim_v, dim_Z, Z_factor, percent, n_informative in zip(label_index, d_v_divid_Z, d_v, Z_dim_list, Z_factor_list, unsued_dimensions_percent_list, n_informative_list): + label_value = label_value + label+" : V/Z = "+str(round(v_Z, 4))+", V = "+str(dim_v)+", Z = "+str(dim_Z)+", Z_factor = "+str(Z_factor)+", % ="+str(percent)+", n_informative = "+str(n_informative)+'\n' + + x_label = "V/Z = sum of views dimension divided by latent space dimension with :\nV = sum of views dimension\nZ = latent space dimension multiplied by Z_factor\n% = percentage of dimensions of latent space unsued in views\nn_informative = number of informative features" + + plt.figure(figsize=(10, 10)) # accuracy score vs d_v_divid_Z + for key in dict_scores_means.keys(): + plt.semilogx(d_v_divid_Z, dict_scores_means[key], '.-', label=key) + plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value) + plt.xlabel(x_label) + plt.ylabel("Accuracy score for "+classifier) + plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nR = "+str(round(R, 4))+" - number of classes = "+str(n_classes)+"\nAccuracy score vs ratio sum of views dimension / latent space dimension for classifier "+classifier) + plt.savefig(path_graph+"score_Z_factor_"+str(n_views)+"_"+str(n_samples)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + + plt.figure(figsize=(10, 10)) # error rate vs d_v_divid_Z + for key in dict_scores_means.keys(): + plt.semilogx(d_v_divid_Z, dict_scores_error[key], '.-', label=key) + plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value) + plt.xlabel(x_label) + plt.ylabel("Error rate for "+classifier) + plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nR = "+str(round(R, 4))+" - number of classes = "+str(n_classes)+"\nError rate vs ratio sum of views dimension / latent space dimension for classifier "+classifier) + plt.savefig(path_graph+"error_Z_factor_"+str(n_views)+"_"+str(n_samples)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + """ + plt.figure(figsize=(10, 10)) + + for key in dict_scores_means.keys(): + plt.errorbar(d_v_divid_Z, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key) + plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center') + plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value) + plt.xlabel(x_label) + plt.ylabel("Accuracy score for "+classifier) + 
plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nAccuracy score vs ratio sum of views dimension / latent space dimension for classifier "+classifier) + plt.savefig(path_graph+"score_Z_factor_errorbar_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + """ + plt.figure(figsize=(10, 10)) # accuracy score of early fusion divided by accuracy score of each view vs d_v_divid_Z + for view in dict_views.keys(): + plt.semilogx(d_v_divid_Z, dict_scores_means['early fusion']/df_scores_means[view], '.-', label='early fusion score divided by '+view+' score') + plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value) + plt.xlabel(x_label) + plt.ylabel("Ratio accuracy score for early fusion / accuracy score for each view for "+classifier) + plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nRatio accuracy score for early fusion / accuracy score for each view \nvs ratio sum of views dimension / latent space dimension for classifier "+classifier) + plt.savefig(path_graph+"score_Z_factor_majority_view_divid_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + + plt.figure(figsize=(10, 10)) # accuracy score of late fusion divided by accuracy score of each view vs d_v_divid_Z + for view in dict_views.keys(): + plt.semilogx(d_v_divid_Z, dict_scores_means['late fusion']/df_scores_means[view], '.-', label='late fusion score divided by '+view+' score') + plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value) + plt.xlabel(x_label) + plt.ylabel("Ratio accuracy score for late fusion / accuracy score for each view for "+classifier) + plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nRation accuracy score for late fusion / accuracy score for each view \nvs ratio sum of views dimension / latent space dimension for classifier "+classifier) + plt.savefig(path_graph+"score_Z_factor_all_view_divid_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + + return df_scores_means, df_scores_std, df_scores_error + + +def score_multiviews_n_views_R(n_views_list, R_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_samples=1000, n_classes=2, Z_factor=1, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2): + """ + Returns a dictionary with n_views_list as key containing a list of DataFrames (represented accuracy score divided by accuracy score for R=1 <> redundancy null) of views, + early fusion predictions (concatenate views predictions and late fusion predictions (majority views predictions) with R_list as index for the indicated classifier per key + Creates and saves (at the indicated path path_graph) a graph per value of n_views_list represented accuracy score divided by accuracy score for R=1 vs R_list + + Parameters: + ----------- + n_views_list : list + each element from n_views_list defines a new dataset with element as n_views + R_list : list + each element 
from R_list defines a new dataset with element as R + path_graph : str + path to save graphics + cv : int + classifier : str + classifier_dictionary : dict + n_samples, n_classes, Z_factor, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset + + Returns: + -------- + a dictionary with n_views_list as key containing a list of DataFrames (represented accuracy score divided by accuracy score for R=1 <> redundancy null) with R_list as index per value of n_views_list + """ + dict_n_views_R_ratio = {key:0 for key in n_views_list} + # n_views_list = list of diverse values of n_views from the lowest to the highest + n_views_list.sort(reverse=False) + # same views have same colors on each graphs + dict_colors = {'view'+str(i):0 for i in range(n_views_list[-1])} + prop_cycle = plt.rcParams['axes.prop_cycle'] + colors = prop_cycle.by_key()['color'] + for key, c in zip(dict_colors.keys(), colors): + dict_colors[key] = c + dict_colors.update({'early fusion':'purple', 'late fusion':'maroon'}) + + for n_views in n_views_list: + # R_list = list of diverse values of R from the lowest to the highest + R_list.sort(reverse=False) + # list of percentage of unsued columns of latent space in views + unsued_dimensions_percent_list = [] + # list of number of informative features of latent space + n_informative_list = [] + # dictionary contains mean of accuracy scores per R + dict_scores_means = {'view'+str(i):[] for i in range(n_views)} + dict_scores_means.update({'early fusion':[], 'late fusion':[]}) + # dictionary of list of scores' mean of view for diverse R divided by score's mean of view for R = 1 (<> redundancy null) + dict_scores_ratio_R_1 = {'view'+str(i):0 for i in range(n_views)} + dict_scores_ratio_R_1.update({'early fusion':0, 'late fusion':0}) + # dictionary contains data of each view + dict_views = {'view'+str(i):0 for i in range(n_views)} + + for R in R_list: + Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) + unsued_dimensions_percent_list.append(unsued_dimensions_percent) + n_informative_list.append(n_informative) + + for i in range(n_views): + # multiviews_list[i] = (columns / data of view i, numeros of columns of view i) + dict_views['view'+str(i)] = multiviews_list[i][0] + + early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1) # = concatenation of all views + # dictionary of data + dict_data_df = {'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)} + dict_data_df.update({'early fusion':early_fusion}) + + for key in dict_data_df.keys(): + clf = classifier_dictionary[classifier] + score = cross_val_score(clf, dict_data_df[key], y, scoring='accuracy', cv=cv) + dict_scores_means[key].append(score.mean()) + + mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary) + dict_scores_means['late fusion'].append(mean_majority) + + for key in dict_scores_means.keys(): + score_R_1 = dict_scores_means[key][-1] # R = 1 = last value of R_list => last score value in dict_scores_means[key] + dict_scores_ratio_R_1[key] = np.divide(np.array(dict_scores_means[key]), score_R_1) + + df_scores_ratio_R_1 = pd.DataFrame(dict_scores_ratio_R_1, index=R_list) + + plt.figure() + for key in dict_scores_means.keys(): + plt.plot(R_list, dict_scores_ratio_R_1[key], '.-', 
color=dict_colors[key], label=key) + # index and label for graphic + label_index = [] + R_label = [] + for i in range(0, len(R_list), 4): + R_label.append(R_list[i]) + label_index.append(str(round(R_list[i], 2))+'\n'+str(unsued_dimensions_percent_list[i])+'\n'+str(n_informative_list[i])) + + plt.xticks(R_label, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.xlabel("R\nPercentage of dimensions of latent space unsued in views\nNumber of informative features") + plt.ylabel("Ratio accuracy score / accuracy score for R = 1 for "+classifier) + plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nRatio accuracy score / accuracy score for R = 1\n(redundancy null) vs R for classifier "+classifier) + plt.savefig(path_graph+"score_R_divid_R_1_"+str(n_views)+"_"+str(n_samples)+"_"+str(Z_factor)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + + dict_n_views_R_ratio[n_views] = df_scores_ratio_R_1 + + plt.figure() + ax = plt.axes(projection="3d") + + for n_views in n_views_list: + for key in dict_n_views_R_ratio[n_views].keys(): + if n_views == n_views_list[-1]: # print legends only once + ax.plot(R_list, dict_n_views_R_ratio[n_views][key], n_views, color=dict_colors[key], label=key) + else: + ax.plot(R_list, dict_n_views_R_ratio[n_views][key], n_views, color=dict_colors[key]) + + ax.set_xlabel("R") + ax.set_ylabel("Ratio accuracy score / accuracy score for R = 1 for "+classifier) + ax.set_zlabel("Number of views") + plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0) + plt.title("number of samples = "+str(n_samples)+" - factor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nRatio accuracy score / accuracy score for R = 1 (redundancy null) vs R, number of views for classifier "+classifier) + plt.savefig(path_graph+"score_R_divid_R_1_all_n_views"+"_"+str(n_samples)+"_"+str(Z_factor)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + + return dict_n_views_R_ratio + + +def score_multiviews_class_sep(class_sep_factor_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_views=3, n_samples=1000, n_classes=2, Z_factor=1, R=2/3, n_clusters_per_class=2, n_informative_divid=1, d=4, D=10, standard_deviation=2): + """ + Returns 2 DataFrames (first with accuracy score and the second with the standard deivation of accuracy score) of latent space, views, + early fusion predictions (concatenate views predictions) and late fusion predictions (majority views predictions) with class_sep_factor_list as index for the indicated classifier + Creates and saves (at the indicated path path_graph) a graph represented accuracy score (with confidence interval) vs class_sep_factor_list + + Parameters: + ----------- + class_sep_factor_list : list + each element from n_samples_list defines a new dataset + path_graph : str + cv : int + classifier : str + classifier_dictionary : dict + n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset + + Returns: + -------- + 2 DataFrames with n_samples_list as index + """ + # list of percentage of unsued columns of latent space in views + unsued_dimensions_percent_list = [] + # list of number of 
informative features of latent space + n_informative_list = [] + # dictionary contains mean of accuracy scores per class_sep_factor + dict_scores_means = {'latent space':[]} + dict_scores_means.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_means.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains standard deviation of accuracy scores per class_sep_factor + dict_scores_std = {'latent space':[]} + dict_scores_std.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_std.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains data of each view + dict_views = {'view'+str(i):0 for i in range(n_views)} + + for class_sep_factor in class_sep_factor_list: + Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) + unsued_dimensions_percent_list.append(unsued_dimensions_percent) + n_informative_list.append(n_informative) + + for i in range(n_views): + # multiviews_list[i] = (columns / data of view i, numeros of columns of view i) + dict_views['view'+str(i)] = multiviews_list[i][0] + + early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1) # = concatenation of all views + # dictionary of data + dict_data = {'latent space':Z} + dict_data.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)}) + dict_data.update({'early fusion':early_fusion}) + + for key in dict_data.keys(): + print('key', key) + clf = classifier_dictionary[classifier] + score = cross_val_score(clf, dict_data[key], y, scoring='accuracy', cv=cv) + dict_scores_means[key].append(score.mean()) + dict_scores_std[key].append(score.std()) + + mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary) + dict_scores_means['late fusion'].append(mean_majority) + dict_scores_std['late fusion'].append(std_majority) + + print(dict_scores_means) + + df_scores_means = pd.DataFrame(dict_scores_means, index=class_sep_factor_list) + df_scores_std = pd.DataFrame(dict_scores_std, index=class_sep_factor_list) + + plt.figure() + for key in dict_scores_means.keys(): + plt.errorbar(class_sep_factor_list, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key) + # index and label for graphic + label_index = [] + for n_samples, percent, n_informative in zip(class_sep_factor_list, unsued_dimensions_percent_list, n_informative_list): + label_index.append(str(n_samples)+'\n'+str(percent)+'\n'+str(n_informative)) + + plt.xticks(class_sep_factor_list, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.xlabel("Factor (class_sep = factor*n_clusters_per_class)\nPercentage of dimensions of latent space unsued in views\nNumber of informative features") + plt.ylabel("Accuracy score for "+classifier) + plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nAccuracy score vs factor of class_sep for classifier "+classifier) + plt.savefig(path_graph+"score_class_sep_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + + return df_scores_means, df_scores_std + + +def score_multiviews_n_informative_divided(n_informative_divid_list, path_graph, cv=10, classifier="SVM", 
classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_views=3, n_samples=1000, n_classes=2, Z_factor=1, R=2/3, n_clusters_per_class=2, class_sep_factor=2, d=4, D=10, standard_deviation=2): + """ + Returns 2 DataFrames (first with accuracy score and the second with the standard deivation of accuracy score) of latent space, views, + early fusion predictions (concatenate views predictions) and late fusion predictions (majority views predictions) with n_informative_divid_list as index for the indicated classifier + Creates and saves (at the indicated path path_graph) a graph represented accuracy score (with confidence interval) vs n_informative_divid_list + + Parameters: + ----------- + n_informative_divid_list : list + each element from n_informative_divid_list defines a new dataset with element as n_informative_divid + path_graph : str + cv : int + classifier : str + classifier_dictionary : dict + n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, d, D, standard_deviation : parameters of generator_multiviews_dataset + + Returns: + -------- + 2 DataFrames with n_samples_list as index + """ + # list of percentage of unsued columns of latent space in views + unsued_dimensions_percent_list = [] + # list of number of informative features of latent space + n_informative_list = [] + # dictionary contains mean of accuracy scores per n_informative_divid + dict_scores_means = {'latent space':[]} + dict_scores_means.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_means.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains standard deviation of accuracy scores per n_informative_divid + dict_scores_std = {'latent space':[]} + dict_scores_std.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_std.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains data of each view + dict_views = {'view'+str(i):0 for i in range(n_views)} + + for n_informative_divid in n_informative_divid_list: + Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) + unsued_dimensions_percent_list.append(unsued_dimensions_percent) + n_informative_list.append(n_informative) + + for i in range(n_views): + # multiviews_list[i] = (columns / data of view i, numeros of columns of view i) + dict_views['view'+str(i)] = multiviews_list[i][0] + + early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1) # = concatenation of all views + # dictionary of data + dict_data = {'latent space':Z} + dict_data.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)}) + dict_data.update({'early fusion':early_fusion}) + + for key in dict_data.keys(): + clf = classifier_dictionary[classifier] + score = cross_val_score(clf, dict_data[key], y, scoring='accuracy', cv=cv) + dict_scores_means[key].append(score.mean()) + dict_scores_std[key].append(score.std()) + + mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary) + dict_scores_means['late fusion'].append(mean_majority) + dict_scores_std['late fusion'].append(std_majority) + + df_scores_means = pd.DataFrame(dict_scores_means, index=n_informative_divid_list) + df_scores_std = pd.DataFrame(dict_scores_std, index=n_informative_divid_list) + + plt.figure() + for key in dict_scores_means.keys(): + plt.errorbar(n_informative_divid_list, 
dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key) + # index and label for graphic + label_index = [] + for n_informative_divid, percent, n_informative in zip(n_informative_divid_list, unsued_dimensions_percent_list, n_informative_list): + label_index.append(str(n_informative_divid)+'\n'+str(percent)+'\n'+str(n_informative)) + + plt.xticks(n_informative_divid_list, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.xlabel("Factor (n_informative = dimension of latent space / factor)\nPercentage of dimensions of latent space unsued in views\nNumber of informative features") + plt.ylabel("Accuracy score for "+classifier) + plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nAccuracy score vs n_informative_divid for classifier "+classifier) + plt.savefig(path_graph+"score_n_informative_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + + return df_scores_means, df_scores_std -- GitLab
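A minimal usage sketch of the generator added by this patch (not part of the patch itself): the parameter values and the output directory below are illustrative assumptions, and the import assumes the generator modules are on the Python path, as in demo/result.py.

    from multiviews_datasets import generator_multiviews_dataset, results_to_csv

    # Build one multiview dataset: 3 views drawn from a latent space whose minimal
    # dimension is kept as-is (Z_factor=1), with moderate redundancy (R=2/3).
    Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(
        n_samples=1000, n_views=3, n_classes=2, Z_factor=1, R=2/3,
        n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1,
        d=4, D=10, standard_deviation=2)
    print(Z.shape, len(multiviews_list), unsued_dimensions_percent, n_informative)

    # Optionally dump the latent space, the labels and each view to CSV files;
    # the directory is a placeholder and must already exist.
    # results_to_csv("/tmp/multiview_demo/", Z, y, multiviews_list)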