diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0e40fe8f57160b43f9ea8e200b1a5d9f91f4aed9 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ + +# Default ignored files +/workspace.xml \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000000000000000000000000000000000000..3c2c013aca5383a4194378b8fe233d5f04daa7b8 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="JavaScriptSettings"> + <option name="languageLevel" value="ES6" /> + </component> + <component name="ProjectRootManager" version="2" languageLevel="JDK_12" default="false" project-jdk-name="Python 3.6 (develop)" project-jdk-type="Python SDK" /> +</project> \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000000000000000000000000000000000000..6164328c76fbdf70a112333535f10474e6703fd0 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="ProjectModuleManager"> + <modules> + <module fileurl="file://$PROJECT_DIR$/.idea/multiview_generator.iml" filepath="$PROJECT_DIR$/.idea/multiview_generator.iml" /> + </modules> + </component> +</project> \ No newline at end of file diff --git a/.idea/multiview_generator.iml b/.idea/multiview_generator.iml new file mode 100644 index 0000000000000000000000000000000000000000..d6ebd4805981b8400db3e3291c74a743fef9a824 --- /dev/null +++ b/.idea/multiview_generator.iml @@ -0,0 +1,9 @@ +<?xml version="1.0" encoding="UTF-8"?> +<module type="JAVA_MODULE" version="4"> + <component name="NewModuleRootManager" inherit-compiler-output="true"> + <exclude-output /> + <content url="file://$MODULE_DIR$" /> + <orderEntry type="inheritedJdk" /> + <orderEntry type="sourceFolder" forTests="false" /> + </component> +</module> \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000000000000000000000000000000000000..35eb1ddfbbc029bcab630581847471d7f238ec53 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="VcsDirectoryMappings"> + <mapping directory="" vcs="Git" /> + </component> +</project> \ No newline at end of file diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bfd71d12cd08fb82086e08581d327eb4ec51aae6 --- /dev/null +++ b/__init__.py @@ -0,0 +1,3 @@ +from . import generator +from . 
import demo
+
diff --git a/demo/__init__.py b/demo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/demo/__init__.py
@@ -0,0 +1 @@
+
diff --git a/generator/__pycache__/multiviews_datasets.cpython-36.pyc b/generator/__pycache__/multiviews_datasets.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9932a1bb71068d596d0df24e283e78348088be9
Binary files /dev/null and b/generator/__pycache__/multiviews_datasets.cpython-36.pyc differ
diff --git a/generator/__pycache__/parameters.cpython-36.pyc b/generator/__pycache__/parameters.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb1c18a3ebf4324a606544fb3ae873e4fb6bda16
Binary files /dev/null and b/generator/__pycache__/parameters.cpython-36.pyc differ
diff --git a/generator/multiviews_datasets.py b/generator/multiviews_datasets.py
index 44babd9d3ee14d9e23df2643496599e2408d3e24..00f177708e67861387956f54b5c0d6e8e011baed 100644
--- a/generator/multiviews_datasets.py
+++ b/generator/multiviews_datasets.py
@@ -63,7 +63,7 @@ def projection(latent_space, chosen_columns_list):
     return latent_space[:, chosen_columns_list]
 
 
-def generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation):
+def generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation, random_state=42):
     """
     Returns a generator multiviews dataset
 
@@ -140,9 +140,9 @@ def generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_c
     # Number of informative features
     n_informative = round(dim_Z/n_informative_divid)
     # Generation of latent space Z
-    Z, y = make_classification(n_samples=n_samples, n_features=dim_Z, n_informative=n_informative, n_redundant=0,
-                               n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters_per_class, weights=None,
-                               flip_y=0.01, class_sep=n_clusters_per_class*class_sep_factor, random_state=None)
+    Z, y = make_classification(n_samples=n_samples, n_features=dim_Z, n_informative=n_informative, n_redundant=0,
+                               n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters_per_class, weights=None,
+                               flip_y=0.01, class_sep=n_clusters_per_class*class_sep_factor, random_state=random_state)
 
     I_q = np.array([i for i in range(Z.shape[1])]) # 1D-array of Z columns numero
     meta_I_v = []
diff --git a/demo/result.py b/generator/result.py
similarity index 94%
rename from demo/result.py
rename to generator/result.py
index 9d2aac6a8a519871ba4891119a49beb9afe8217b..dfd27f5ed879ca093afa8b18d74caeeaeb81ef79 100644
--- a/demo/result.py
+++ b/generator/result.py
@@ -5,10 +5,9 @@ Created on Wed Nov 27 16:14:14 2019
 
 @author: bernardet
 """
-
 import parameters
 from multiviews_datasets import generator_multiviews_dataset, results_to_csv
-from test_classifier import score_multiviews_n_samples, graph_comparaison_classifier_scores_n_samples, score_multiviews_R, score_multiviews_Z_factor, score_multiviews_n_views_R, score_multiviews_class_sep, score_one_multiview_dataset, score_multiviews_n_informative_divided
+from tests.test_classifier import score_multiviews_n_samples, graph_comparaison_classifier_scores_n_samples, score_multiviews_R, score_multiviews_Z_factor, score_multiviews_n_views_R, score_multiviews_class_sep, score_one_multiview_dataset, score_multiviews_n_informative_divided
 
 import warnings
 warnings.simplefilter(action='ignore', category=FutureWarning)
diff --git
a/generator/tests/__pycache__/test_classifier.cpython-36.pyc b/generator/tests/__pycache__/test_classifier.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d7f2cb640b14537fb387f740ce617f808f9ead6
Binary files /dev/null and b/generator/tests/__pycache__/test_classifier.cpython-36.pyc differ
diff --git a/generator/use_generator_baptiste.py b/generator/use_generator_baptiste.py
new file mode 100644
index 0000000000000000000000000000000000000000..437d14e5a09b900b51260bd56a586f37729edd42
--- /dev/null
+++ b/generator/use_generator_baptiste.py
@@ -0,0 +1,41 @@
+import os
+import numpy as np
+
+import parameters
+from multiviews_datasets import generator_multiviews_dataset, results_to_csv
+from tests.test_classifier import score_multiviews_n_samples, graph_comparaison_classifier_scores_n_samples, score_multiviews_R, score_multiviews_Z_factor, score_multiviews_n_views_R, score_multiviews_class_sep, score_one_multiview_dataset, score_multiviews_n_informative_divided
+
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+
+n_samples = 100
+n_views = 3
+n_classes = 2
+Z_factor = 1
+R = 0
+n_clusters_per_class = 1
+class_sep_factor = 100
+n_informative_divid = 1
+standard_deviation = 2
+d = 4
+D = 10
+
+path = "/home/baptiste/Documents/Datasets/Generated/try_outlier/"
+if not os.path.exists(path):
+    os.mkdir(path)
+
+Z, y, results, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes,
+                                                                                       Z_factor, R,
+                                                                                       n_clusters_per_class,
+                                                                                       class_sep_factor,
+                                                                                       n_informative_divid, d, D,
+                                                                                       standard_deviation)
+print(y[:10])
+print(unsued_dimensions_percent)
+print(n_informative)
+print(Z.shape)
+y[:10] = np.invert(y[:10].astype(bool)).astype(int)
+print(y[:10])
+results_to_csv(path, Z, y, results)
+
diff --git a/late/__pycache__/multiviews_datasets_generator.cpython-36.pyc b/late/__pycache__/multiviews_datasets_generator.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4cc36e52166be6df44fd7ed479f9ac3fbc60f93c
Binary files /dev/null and b/late/__pycache__/multiviews_datasets_generator.cpython-36.pyc differ
diff --git a/late/execute.py b/late/execute.py
new file mode 100644
index 0000000000000000000000000000000000000000..9538308c1ad26f9102a664b5f29a7c6340253cd6
--- /dev/null
+++ b/late/execute.py
@@ -0,0 +1,35 @@
+import os
+import numpy as np
+
+from multiviews_datasets_generator import generator_multiviews_dataset, results_to_csv
+
+n_samples = 200  # Number of samples in the dataset
+n_views = 4  # Number of views in the dataset
+n_classes = 2  # Number of classes in the dataset
+Z_factor = 1  # Z dim = latent_space_dim * Z_factor
+R = 0  # Percentage of non-redundant features in the views
+n_clusters_per_class = 1  # Number of clusters for each class
+class_sep_factor = 100  # Separation between the different classes
+n_informative_divid = 1  # Divides the number of informative features in the latent space
+standard_deviation = 2
+d = 4
+D = 10
+random_state = 42
+n_outliers = 10
+
+path = "/home/baptiste/Documents/Datasets/Generated/outliers_dset/"
+if not os.path.exists(path):
+    os.mkdir(path)
+
+Z, y, results, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes,
+                                                                                       Z_factor, R,
+                                                                                       n_clusters_per_class,
+                                                                                       class_sep_factor,
+                                                                                       n_informative_divid, d, D,
+                                                                                       standard_deviation)
+print(unsued_dimensions_percent)
+print(n_informative)
+print(Z.shape)
+changing_labels_indices =
np.random.RandomState(random_state).choice(np.arange(y.shape[0]), n_outliers) +y[changing_labels_indices] = np.invert(y[changing_labels_indices].astype(bool)).astype(int) +results_to_csv(path, Z, y, results) \ No newline at end of file diff --git a/late/multiviews_datasets_generator.py b/late/multiviews_datasets_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..1cce9a032e6ba6d2f43e43b3cf5d82b03e2a414d --- /dev/null +++ b/late/multiviews_datasets_generator.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Nov 26 15:38:38 2019 + +@author: bernardet +""" + +from sklearn.datasets import make_classification +from random import gauss +from math import ceil, floor +import numpy as np +import pandas as pd + + +def latent_space_dimension(views_dimensions_list, R): + """ + Returns the minimal dimension of latent space (enough to build the dataset) for generator_multiviews_dataset compared to views_dimensions_list + + Parameters: + ----------- + views_dimensions_list : list + R : float + + Returns: + -------- + an int + """ + max_view_dimension = max(views_dimensions_list) + dimension = ceil(R*sum(views_dimensions_list)) + + if dimension < max_view_dimension: + dimension = max_view_dimension + + reduced_dimension = dimension + remove_sum = 0 + + for num_view in range(1, len(views_dimensions_list)): + view_prec = views_dimensions_list[num_view - 1] + view_current = views_dimensions_list[num_view] + remove = floor(R*view_prec) + remove_sum += remove + if reduced_dimension - remove < view_current: + dimension += view_current - (reduced_dimension - remove) + reduced_dimension = dimension - remove_sum + + return dimension + + +def projection(latent_space, chosen_columns_list): + """ + Returns the projection of latent_space on the columns of chosen_columns_list (in chosen_columns_list order) + + Parameters: + ----------- + latent_space : array + chosen_columns_list : list + + Returns: + -------- + an array of dimension (number of rows of latent_space, length of chosen_columns_list) + """ + return latent_space[:, chosen_columns_list] + + +def generator_multiviews_dataset(n_samples=1000, n_views=3, n_classes=2, Z_factor=250, R=2/3, n_clusters_per_class=1, class_sep_factor=2, n_informative_divid=2, d=2, D=12, standard_deviation=2): + """ + Returns a generator multiviews dataset + + Parameters: + ----------- + n_samples : int + dataset number of samples (number of rows of dataset) + n_views : int >= 2 + dataset number of views + one view is a set of some features (columns) of the latent space + n_classes : int >= 2 + dataset number of classes + Z_factor : float >= 1 + minimal dimension of the latent space (enough to build the dataset) is calculed then multiplied by Z_factor + R : 0 <= float <= 1 + R = 1 <> no possibility of redundancy between views + R = 0 <> maximal possibility of redundancy between views + n_clusters_per_class : int >= 1 + class_sep_factor : float >= 0 + class_sep = n_clusters_per_class*class_sep_factor + n_informative_divid : float >= 1 + n_informative_divid raises <> number of non-informative features raises + n_informative_divid = 1 <> no non-informative features, number of informative features = dimension of latent space + number of informative features = round(dimension of latent space / n_informative_divid) + d : float >= 1 + minimal dimension of views + dimension of views (int) chosen randomly from N((d+D)/2, standard_deviation^2) with d <= dimension of views <= D + D : float >= d + maximal dimension of views + 
dimension of views (int) chosen randomly from N((d+D)/2, standard_deviation^2) with d <= dimension of views <= D
+    standard_deviation : float
+                         standard deviation of the Gaussian distribution N((d+D)/2, standard_deviation^2)
+                         dimension of views (int) chosen randomly from N((d+D)/2, standard_deviation^2) with d <= dimension of views <= D
+
+    Returns:
+    --------
+    Z : an array of shape (n_samples, dimension of latent space) = the generated latent space
+    y : an array of shape (n_samples,) = the integer labels for class membership of each sample
+    a list of n_views tuples (X_v, I_v) with :
+        X_v = Z projected along the d_v columns listed in I_v (d_v = dimension of the v-th view)
+        I_v = indices of the Z columns used to build X_v (numbered as in Z)
+    unsued_dimensions_percent : percentage of latent space columns unused in the views
+    n_informative : number of informative features (dimension of latent space - n_informative = number of non-informative features)
+    """
+
+    if n_views < 2:
+        raise ValueError("n_views >= 2")
+    if n_classes < 2:
+        raise ValueError("n_classes >= 2")
+    if Z_factor < 1:
+        raise ValueError("Z_factor >= 1 is required for the algorithm to work properly")
+    if (R < 0) or (R > 1):
+        raise ValueError("0 <= R <= 1")
+    if n_clusters_per_class < 1:
+        raise ValueError("n_clusters_per_class >= 1")
+    if class_sep_factor < 0:
+        raise ValueError("class_sep_factor >= 0")
+    if n_informative_divid < 1:
+        raise ValueError("n_informative_divid >= 1")
+    if d < 1:
+        raise ValueError("d >= 1")
+    if (d+D)/2 - 3*standard_deviation < 1:
+        raise ValueError("(d+D)/2 - 3*standard_deviation >= 1 is required so that the normal distribution only yields strictly positive view dimensions")
+
+    # draw the n_views view dimensions at random from N((d+D)/2, standard_deviation^2)
+    d_v = np.random.normal(loc=(d+D)/2, scale=standard_deviation, size=n_views)
+    d_v = list(d_v)
+    remove_list, add_list = [], []
+    for dim_view in d_v:
+        if dim_view < d or dim_view > D:  # 1 <= d <= dim_view <= D
+            remove_list.append(dim_view)
+            add = -1
+            while add < d or add > D:
+                add = gauss((d+D)/2, standard_deviation)
+            add_list.append(add)
+    d_v = [view for view in d_v if view not in remove_list] + add_list
+    d_v = [int(view) for view in d_v]  # dimension of views = integer
+    # d_v = list of view dimensions sorted from the highest to the lowest
+    d_v.sort(reverse=True)
+    # Dimension of latent space Z (multiplied by Z_factor)
+    dim_Z = Z_factor*latent_space_dimension(d_v, R)
+    print(dim_Z)
+    # Number of informative features
+    n_informative = round(dim_Z/n_informative_divid)
+    # Generation of latent space Z
+    Z, y = make_classification(n_samples=n_samples, n_features=dim_Z, n_informative=n_informative, n_redundant=0,
+                               n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters_per_class, weights=None,
+                               flip_y=0.00, class_sep=n_clusters_per_class*class_sep_factor, random_state=None)
+
+    I_q = np.array([i for i in range(Z.shape[1])])  # 1D array of Z column indices
+    meta_I_v = []
+    results = []
+    for view in range(n_views):
+        # choose d_v[view] Z column indices uniformly from I_q
+        I_v = np.random.choice(I_q, size=d_v[view], replace=False)  # draw d_v[view] columns from I_q without replacement
+        meta_I_v += list(I_v)
+        # projection of Z along the columns in I_v
+        X_v = projection(Z, I_v)
+        results.append((X_v, I_v))
+        # remove floor(R*d_v[view]) of the column indices in I_v from I_q
+        elements_to_remove = np.random.choice(I_v, size=floor(R*d_v[view]), replace=False)  # draw floor(R*d_v[view]) columns from I_v without replacement
+        I_q = np.setdiff1d(I_q, elements_to_remove)  # I_q
minus the columns in elements_to_remove
+    unsued_dimensions_list = [column for column in I_q if column not in meta_I_v]
+    unsued_dimensions_percent = round((len(unsued_dimensions_list) / dim_Z)*100, 2)
+    return Z, y, results, unsued_dimensions_percent, n_informative
+
+
+def results_to_csv(path, latent_space, integer_labels, multiviews_list):
+    """
+    Creates len(multiviews_list) + 2 csv files at the indicated path
+    File names :
+        latent_space.csv for latent_space
+        integer_labels.csv for integer_labels
+        view0.csv for multiviews_list[0]
+
+    Parameters:
+    -----------
+    path : str
+    latent_space : array
+    integer_labels : 1D array
+    multiviews_list : list of tuples
+
+    Returns:
+    --------
+    None
+    """
+    df_latent_space = pd.DataFrame(latent_space)
+    df_latent_space.to_csv(path+'latent_space.csv', index=False)
+
+    df_labels = pd.DataFrame(integer_labels)
+    df_labels.to_csv(path+'integer_labels.csv', index=False)
+
+    cpt = 0
+    for view_tuple in multiviews_list:
+        df_view = pd.DataFrame(view_tuple[0], columns=view_tuple[1])
+        df_view.to_csv(path+'view'+str(cpt)+'.csv', index=False)
+        cpt += 1
diff --git a/late/parameters.py b/late/parameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..384af4672e11ab0b6a22358a66be3e8e533fe5f7
--- /dev/null
+++ b/late/parameters.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Nov 26 13:53:05 2019
+
+@author: bernardet
+"""
+from sklearn.svm import SVC
+from sklearn.naive_bayes import GaussianNB
+import numpy as np
+
+# General parameters
+n_samples = 1000
+# number of samples (int)
+n_views = 3
+# number of views >= 2 (int)
+n_classes = 2
+# number of classes >= 2 (int)
+Z_factor = 250
+# multiplication factor of Z dimension (default value = 1)
+R = 2/3
+# redundancy parameter, 0 <= R <= 1 (float)
+cv = 10
+# number of cross-validation splits (int)
+n_clusters_per_class = 1
+# number of clusters per class >= 1 (int)
+class_sep_factor = 2
+# factor >= 1 as class_sep = n_clusters_per_class*class_sep_factor
+n_informative_divid = 2
+# factor >= 1 as number of informative features = round(dimension of latent space / n_informative_divid)
+classifier = "SVM"
+# name of classifier (str)
+classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}
+# dictionary of classifiers
+n_samples_list = [100, 500, 1000, 1500, 2000]
+# list of number of samples to test the generator
+R_list = list(np.arange(0, 1.05, 0.05))
+# list of R values to test
+Z_factor_list = [1, 3, 10, 25, 100, 250, 1000]
+# list of Z_factor values to test
+n_views_list = [n_view for n_view in range(2, 10)]
+# list of n_views values to test
+class_sep_factor_list = [2, 5, 10]
+# list of class_sep_factor values to test
+n_informative_divid_list = [1, 2, 3]
+# list of n_informative_divid values to test
+path_data = "/home/bernardet/Documents/StageL3/Data/"
+# path where the multiview dataset is saved
+path_graph = "/home/bernardet/Documents/StageL3/Graph/"
+# path where the score graphs are saved
+
+# Parameters of the Gaussian distribution N((d+D)/2, standard_deviation^2) :
+# d <= dim[v] <= D for all v
+# (d+D)/2 - 3*standard_deviation >= 1 (required by generator_multiviews_dataset)
+d = 4
+# < D, > 0
+D = 12
+# > d
+standard_deviation = 2
+# standard deviation of the Gaussian distribution
diff --git a/late/test_generator.py b/late/test_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..dff19b2aa9d7f606782e2c8f29dd3e4058cf4505
--- /dev/null
+++ b/late/test_generator.py
@@ -0,0 +1,1140 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Nov 28 14:14:46 2019
+
+@author: bernardet
+""" + +from multiviews_datasets_generator import generator_multiviews_dataset +from sklearn.svm import SVC +from sklearn.naive_bayes import GaussianNB +from sklearn.model_selection import cross_val_score, StratifiedKFold +from sklearn.metrics import accuracy_score +from collections import Counter +from mpl_toolkits.mplot3d import Axes3D +from math import sqrt +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd +from multimodalboost.mumbo import MumboClassifier + + +def majority_list(predictions_list): + """ + Returns an array which on each row the majority class of the same row in + predictions_list + + Parameters: + ----------- + predictions_list : list of 1D array + + Returns: + -------- + an 1D array + """ + n_samples = len(predictions_list[0]) + # majority_prediction[i] = prediction of predictions_list[i] which appears + #the most on predictions_list[i] + majority_prediction = np.array([-1]*n_samples) + # concatenate_predictions_list[i] = list contains prediction of the i-th + #data per view + reshape_predictions_list = [predictions_list[i].reshape(len(predictions_list[i]), 1) for i in range(len(predictions_list))] + concatenate_predictions_list = np.hstack(reshape_predictions_list) + for sample in range(n_samples): + # dictionary contains predictions (key) and its occurences in + #concatenate_predictions_list[sample] + count = Counter(concatenate_predictions_list[sample]) + maj_value = max(count.values()) # maximal number of a prediction + for key in count.keys(): # searchs the prediction with the maximal + #occurence number + if count[key] == maj_value: + majority_prediction[sample] = key + break + + return majority_prediction + + +def majority_score(views_dictionary, integer_labels, cv=10, classifier="SVM", + classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}): + """ + Returns the mean and the standard deviation of accuracy score when + predictions are selected by majority of predictions of different views + + Parameters: + ----------- + views_dictionary : dict + integer_labels = array + cv : int + classifier : str + classifier_dictionary : dict + + Returns: + -------- + Two floats + """ + skf = StratifiedKFold(n_splits=cv, random_state=1, shuffle=True) + # provides cv train/test indices to split data in cv train/test sets. 
+ prediction_list = [[] for i in range(cv)] # for majority_list function + test_list = [[] for i in range(cv)] # for score + + for key in views_dictionary.keys(): + i = 0 + for train_index, test_index in skf.split(views_dictionary[key], integer_labels): + # splits data and integer label of one view in test and train sets + X = views_dictionary[key] + train, test = X[train_index], X[test_index] + y_train = integer_labels[train_index] + y_test = integer_labels[test_index] + # trains the classifier and tests it with test set + clf = classifier_dictionary[classifier] + clf.fit(train, y_train.ravel()) + y_pred = clf.predict(test) + + prediction_list[i].append(y_pred) + if len(test_list[i]) == 0: # same y_test for all views + test_list[i] = y_test + i += 1 + + score = [] + for i in range(len(prediction_list)): + y_pred_majority = majority_list(prediction_list[i]) + # majority of views predictions + score.append(accuracy_score(test_list[i].ravel(), y_pred_majority)) + # score of majority of views predictions vs expected predictions + score = np.array(score) + return score.mean(), score.std() + + +def score_one_multiview_dataset(cv=10, classifier="SVM", + classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, + n_samples=1000, n_views=3, n_classes=2, + Z_factor=1, R=2/3, n_clusters_per_class=2, + class_sep_factor=2, n_informative_divid=1, + d=4, D=10, standard_deviation=2): + """ + Returns 3 Series (first with dimensions of latent space, views and + percentage of dimensions of latent space unsued in views, the second with + accuracy score and the third with the standard deivation of accuracy score) + of latent space, views, early fusion predictions (concatenate views + predictions) and late fusion predictions (majority views predictions) + + Parameters: + ----------- + cv : int + classifier : str + classifier_dictionary : dict + n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, + class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters + of generator_multiviews_dataset + + Returns: + -------- + 3 Series + """ + # dictionary contains percentage of unsued dimension of latent space and + #dimension of latent space and views + dimensions = {'unsued dimension of latent space':0, "number of informative features":0, 'latent space':0} + dimensions.update({'view'+str(i):0 for i in range(n_views)}) + # dictionary contains and mean of accuracy scores + dict_scores_means = {'latent space':0} + dict_scores_means.update({'view'+str(i):0 for i in range(n_views)}) + dict_scores_means.update({'early fusion':0, 'late fusion':0}) + # dictionary contains standard deviation of accuracy scores + dict_scores_std = {'latent space':[]} + dict_scores_std.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_std.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains data of each view + dict_views = {'view'+str(i):0 for i in range(n_views)} + + Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) + dimensions["unsued dimension of latent space"] = unsued_dimensions_percent + dimensions["number of informative features"] = n_informative + dimensions["latent space"] = Z.shape + + + for i in range(n_views): + # multiviews_list[i] = (columns / data of view i, numeros of columns of view i) + dict_views['view'+str(i)] = multiviews_list[i][0] + dimensions['view'+str(i)] = 
multiviews_list[i][0].shape + + early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1) # = concatenation of all views + # dictionary of data + dict_data_df = {'latent space':Z} + dict_data_df.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)}) + dict_data_df.update({'early fusion':early_fusion}) + + for key in dict_data_df.keys(): + clf = classifier_dictionary[classifier] + score = cross_val_score(clf, dict_data_df[key], y, scoring='accuracy', cv=cv) + dict_scores_means[key] = score.mean() + dict_scores_std[key] = score.std() + + mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary) + dict_scores_means['late fusion'] = mean_majority + dict_scores_std['late fusion'] = std_majority + + df_dimensions = pd.Series(dimensions) + df_scores_means = pd.Series(dict_scores_means) + df_scores_std = pd.Series(dict_scores_std) + + return df_dimensions, df_scores_means, df_scores_std + + +def score_multiviews_n_samples(n_samples_list, path_graph, cv=10, classifier="SVM", + classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, + n_views=3, n_classes=2, Z_factor=1, R=2/3, + n_clusters_per_class=2, class_sep_factor=2, + n_informative_divid=1, d=4, D=10, standard_deviation=2): + """ + Returns 2 DataFrames (first with accuracy score and the second with the + standard deivation of accuracy score) of latent space, views, early fusion + predictions (concatenate views predictions) and late fusion predictions + (majority views predictions) with n_samples_list as index for the indicated + classifier + Creates and saves (at the indicated path path_graph) a graph represented + accuracy score (with confidence interval) vs n_samples_list + + Parameters: + ----------- + n_samples_list : list + each element from n_samples_list defines a new dataset + with element samples + path_graph : str + path to save graphics + cv : int + classifier : str + classifier_dictionary : dict + n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, + n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset + + Returns: + -------- + 2 DataFrames with n_samples_list as index + """ + # n_samples_list = list of samples dimension from the lowest to the highest + n_samples_list.sort(reverse=False) + # list of percentage of unsued columns of latent space in views + unsued_dimensions_percent_list = [] + # list of number of informative features of latent space + n_informative_list = [] + # dictionary contains mean of accuracy scores per n_samples + dict_scores_means = {'latent space':[]} + dict_scores_means.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_means.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains standard deviation of accuracy scores per n_samples + dict_scores_std = {'latent space':[]} + dict_scores_std.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_std.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains data of each view + dict_views = {'view'+str(i):0 for i in range(n_views)} + + for n_samples in n_samples_list: + Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) + unsued_dimensions_percent_list.append(unsued_dimensions_percent) + n_informative_list.append(n_informative) + + + for i in range(n_views): + # multiviews_list[i] = 
(columns / data of view i, numeros of columns of view i) + dict_views['view'+str(i)] = multiviews_list[i][0] + + early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1) + # = concatenation of all views + # dictionary of data + dict_data = {'latent space':Z} + dict_data.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)}) + dict_data.update({'early fusion':early_fusion}) + + for key in dict_data.keys(): + clf = classifier_dictionary[classifier] + score = cross_val_score(clf, dict_data[key], y, scoring='accuracy', cv=cv) + dict_scores_means[key].append(score.mean()) + dict_scores_std[key].append(score.std()) + + mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary) + dict_scores_means['late fusion'].append(mean_majority) + dict_scores_std['late fusion'].append(std_majority) + + df_scores_means = pd.DataFrame(dict_scores_means, index=n_samples_list) + df_scores_std = pd.DataFrame(dict_scores_std, index=n_samples_list) + + plt.figure() + for key in dict_scores_means.keys(): + plt.errorbar(n_samples_list, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key) + # index and label for graphic + label_index = [] + for n_samples, percent, n_informative in zip(n_samples_list, unsued_dimensions_percent_list, n_informative_list): + label_index.append(str(n_samples)+'\n'+str(percent)+'\n'+str(n_informative)) + + plt.xticks(n_samples_list, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.xlabel("Number of samples\nPercentage of dimensions of latent space unsued in views\nNumber of informative features") + plt.ylabel("Accuracy score for "+classifier) + plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nAccuracy score vs number of samples for classifier "+classifier) + plt.savefig(path_graph+"score_samples_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + + return df_scores_means, df_scores_std + + +def graph_comparaison_classifier_scores_n_samples(classifier1, classifier2, + n_samples_list, path_graph, + cv=10, classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, + n_views=3, n_classes=2, + Z_factor=1, R=2/3, + n_clusters_per_class=2, + class_sep_factor=2, + n_informative_divid=1, + d=4, D=10, standard_deviation=2): + """ + Creates and saves (at the indicated path path_graph) multiple graphs + represented scores of classifier2 vs scores of classifier1 (one graph per + column of result of score_multiviews_n_samples) + + Parameters: + ----------- + classifier1 : str + classifier2 : str + n_samples_list : list + each element from n_samples_list defines a new dataset + with element samples + path_graph : str + path to save graphics + cv : int + classifier : str + classifier_dictionary : dict + n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, + n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset + + Returns: + -------- + None + """ + df_scores_clf1_means, df_scores_clf1_std = score_multiviews_n_samples(n_samples_list, path_graph, cv, classifier1, classifier_dictionary, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) + df_scores_clf2_means, df_scores_clf2_std = 
score_multiviews_n_samples(n_samples_list, path_graph, cv, classifier2, classifier_dictionary, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) + + n_samples_list = df_scores_clf1_means.index + keys = df_scores_clf1_means.keys() + + for key in keys: + plt.figure() + plt.scatter(df_scores_clf1_means[key].values, df_scores_clf2_means[key].values, c=df_scores_clf1_means[key].values) + plt.plot([0.0, 1.1], [0.0, 1.1], "--", c=".7") # diagonal + plt.xlabel("Accuracy score for "+classifier1) + plt.ylabel("Accuracy score for "+classifier2) + plt.xlim(0, 1) + plt.ylim(0, 1) + plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+" - number of classes = "+str(n_classes)+"\nAccuracy score of "+key+" for "+classifier2+" vs "+classifier1) + plt.savefig(path_graph+classifier1+"_"+classifier2+"_"+str(n_views)+"_"+key+".png") + plt.show() + plt.close() + + +def score_multiviews_R(R_list, path_graph, cv=10, classifier="SVM", + classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, + n_samples=1000, n_views=3, n_classes=2, Z_factor=1, + n_clusters_per_class=2, class_sep_factor=2, + n_informative_divid=1, d=4, D=10, standard_deviation=2): + """ + Returns 2 DataFrames (first with accuracy score and the second with the + standard deivation of accuracy score) of latent space, views, early fusion + predictions (concatenate views predictions) and late fusion predictions + (majority views predictions) with R_list as index for the indicated + classifier + Creates and saves (at the indicated path path_graph) a graph represented + accuracy score (with confidence interval) vs R_list + + Parameters: + ----------- + R_list : list + each element from R_list defines a new dataset with element as R + path_graph : str + path to save graphics + cv : int + classifier : str + classifier_dictionary : dict + n_samples, n_views, n_classes, Z_factor, n_clusters_per_class, + class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters + of generator_multiviews_dataset + + Returns: + -------- + 2 DataFrames with R_list as index + """ + # R_list = list of diverse values of R from the lowest to the highest + R_list.sort(reverse=False) + # list of percentage of unsued columns of latent space in views + unsued_dimensions_percent_list = [] + # list of number of informative features of latent space + n_informative_list = [] + # dictionary contains mean of accuracy scores per R + dict_scores_means = {'latent space':[]} + dict_scores_means.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_means.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains standard deviation of accuracy scores per R + dict_scores_std = {'latent space':[]} + dict_scores_std.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_std.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains data of each view + dict_views = {'view'+str(i):0 for i in range(n_views)} + + for R in R_list: + Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) + unsued_dimensions_percent_list.append(unsued_dimensions_percent) + n_informative_list.append(n_informative) + + for i in range(n_views): + # multiviews_list[i] = (columns / data of view i, numeros of columns of view i) + dict_views['view'+str(i)] = multiviews_list[i][0] + + early_fusion = 
np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1) + # = concatenation of all views + # dictionary of data + dict_data_df = {'latent space':Z} + dict_data_df.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)}) + dict_data_df.update({'early fusion':early_fusion}) + + for key in dict_data_df.keys(): + clf = classifier_dictionary[classifier] + score = cross_val_score(clf, dict_data_df[key], y, scoring='accuracy', cv=cv) + dict_scores_means[key].append(score.mean()) + dict_scores_std[key].append(score.std()) + + mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary) + dict_scores_means['late fusion'].append(mean_majority) + dict_scores_std['late fusion'].append(std_majority) + + df_scores_means = pd.DataFrame(dict_scores_means, index=R_list) + df_scores_std = pd.DataFrame(dict_scores_std, index=R_list) + + plt.figure() + for key in dict_scores_means.keys(): + plt.errorbar(R_list, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key) + # index and label for graphic + label_index = [] + R_label = [] + for i in range(0, len(R_list), 4): + R_label.append(R_list[i]) + label_index.append(str(round(R_list[i], 2))+'\n'+str(unsued_dimensions_percent_list[i])+'\n'+str(n_informative_list[i])) + + plt.xticks(R_label, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.xlabel("R\nPercentage of dimensions of latent space unsued in views\nNumber of informative features") + plt.ylabel("Accuracy score for "+classifier) + plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nAccuracy score vs R for classifier "+classifier) + plt.savefig(path_graph+"score_R_"+str(n_views)+"_"+str(n_samples)+"_"+str(Z_factor)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + + return df_scores_means, df_scores_std + +def score_multiviews_Z_factor(Z_factor_list, path_graph, cv=10, classifier="SVM", + classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, + n_samples=1000, n_views=3, n_classes=2, R=2/3, + n_clusters_per_class=2, class_sep_factor=2, + n_informative_divid=1, d=4, D=10, standard_deviation=2): + """ + Returns 3 DataFrames (first with accuracy score, the second with the + standard deivation of accuracy score and the third with the error rate) of + latent space, views, early fusion predictions (concatenate views + predictions) and late fusion predictions (majority views predictions) with + sum of views dimension divided by Z_factor_list as index for the indicated + classifier + Creates and saves (at the indicated path path_graph) a graph represented + accuracy score vs sum of views dimension divided by Z_factor_list and a + graph represented error rate (1 - accuracy score) vs sum of views dimension + divided by Z_factor_list + + Parameters: + ----------- + Z_factor_list : list + each element from Z_factor_list defines a new dataset with + element as Z_factor + path_graph : str + path to save graphics + cv : int + classifier : str + classifier_dictionary : dict + n_samples, n_views, n_classes, R, n_clusters_per_class, class_sep_factor, + n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset + + Returns: + -------- + 3 DataFrames with Z_factor_list as index + """ + # Z_factor_list = list of diverse values of 
Z_factor from the highest to the lowest + Z_factor_list.sort(reverse=True) + # list of sum of views dimension for each Z_factor_list item + d_v = [] + # list of Z dimension for each Z_factor_list item + Z_dim_list = [] + # list of percentage of unsued columns of latent space in views + unsued_dimensions_percent_list = [] + # list of number of informative features of latent space + n_informative_list = [] + # same views have same colors on each graph + dict_colors = {'latent space':0} + dict_colors.update({'view'+str(i):0 for i in range(n_views)}) + prop_cycle = plt.rcParams['axes.prop_cycle'] + colors = prop_cycle.by_key()['color'] + for key, c in zip(dict_colors.keys(), colors): + dict_colors[key] = c + dict_colors.update({'early fusion':'purple', 'late fusion':'maroon'}) + # dictionary contains mean of accuracy scores per Z_factor + dict_scores_means = {'latent space':[]} + dict_scores_means.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_means.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains error rate per Z_factor + dict_scores_error = {'latent space':[]} + dict_scores_error.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_error.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains standard deviation of accuracy scores per Z_factor + dict_scores_std = {'latent space':[]} + dict_scores_std.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_std.update({'early fusion':[], 'late fusion':[]}) + # dictionary contains data of each view + dict_views = {'view'+str(i):0 for i in range(n_views)} + + for Z_factor in Z_factor_list: + Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) + unsued_dimensions_percent_list.append(unsued_dimensions_percent) + n_informative_list.append(n_informative) + + for i in range(n_views): + # multiviews_list[i] = (columns / data of view i, numeros of columns of view i) + dict_views['view'+str(i)] = multiviews_list[i][0] + + early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1) + # = concatenation of all views + # dimension = number of columns + d_v.append(early_fusion.shape[1]) + Z_dim_list.append(Z.shape[1]) + # dictionary of data + dict_data_df = {'latent space':Z} + dict_data_df.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)}) + dict_data_df.update({'early fusion':early_fusion}) + + for key in dict_data_df.keys(): + clf = classifier_dictionary[classifier] + score = cross_val_score(clf, dict_data_df[key], y, scoring='accuracy', cv=cv) + dict_scores_means[key].append(score.mean()) + dict_scores_error[key].append(1 - score.mean()) + dict_scores_std[key].append(score.std()) + + mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary) + dict_scores_means['late fusion'].append(mean_majority) + dict_scores_error['late fusion'].append(1 - mean_majority) + dict_scores_std['late fusion'].append(std_majority) + + d_v_divid_Z = np.divide(np.array(d_v), np.array(Z_dim_list)) + + df_scores_means = pd.DataFrame(dict_scores_means, index=d_v_divid_Z) + df_scores_error = pd.DataFrame(dict_scores_error, index=d_v_divid_Z) + df_scores_std = pd.DataFrame(dict_scores_std, index=d_v_divid_Z) + + # index and label for graphics + label_index = [chr(i) for i in range(ord('a'),ord('z')+1)] + label_index = label_index[0:len(d_v)] + 
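+    # Plot labelling: the x-axis ticks are replaced by the letters in
+    # label_index, and label_value (built below) maps each letter to its V/Z
+    # ratio, view and latent space dimensions, Z_factor, unused-dimension
+    # percentage and n_informative; it is written next to the plot with
+    # plt.text.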
label_value = "" + for label, v_Z, dim_v, dim_Z, Z_factor, percent, n_informative in zip(label_index, d_v_divid_Z, d_v, Z_dim_list, Z_factor_list, unsued_dimensions_percent_list, n_informative_list): + label_value = label_value + label+" : V/Z = "+str(round(v_Z, 4))+", V = "+str(dim_v)+", Z = "+str(dim_Z)+", Z_factor = "+str(Z_factor)+", % ="+str(percent)+", n_informative = "+str(n_informative)+'\n' + + x_label = "V/Z = sum of views dimension divided by latent space dimension with :\nV = sum of views dimension\nZ = latent space dimension multiplied by Z_factor\n% = percentage of dimensions of latent space unsued in views\nn_informative = number of informative features" + + plt.figure(figsize=(10, 10)) # accuracy score vs d_v_divid_Z + for key in dict_scores_means.keys(): + plt.semilogx(d_v_divid_Z, dict_scores_means[key], '.-', color=dict_colors[key], label=key) + + plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value) + plt.xlabel(x_label) + plt.ylabel("Accuracy score for "+classifier) + plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nR = "+str(round(R, 4))+" - number of classes = "+str(n_classes)+"\nAccuracy score vs ratio sum of views dimension / latent space dimension for classifier "+classifier) + plt.savefig(path_graph+"score_Z_factor_"+str(n_views)+"_"+str(n_samples)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + + plt.figure(figsize=(10, 10)) # error rate vs d_v_divid_Z + for key in dict_scores_means.keys(): + plt.semilogx(d_v_divid_Z, dict_scores_error[key], '.-', color=dict_colors[key], label=key) + plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value) + plt.xlabel(x_label) + plt.ylabel("Error rate for "+classifier) + plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nR = "+str(round(R, 4))+" - number of classes = "+str(n_classes)+"\nError rate vs ratio sum of views dimension / latent space dimension for classifier "+classifier) + plt.savefig(path_graph+"error_Z_factor_"+str(n_views)+"_"+str(n_samples)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + """ + plt.figure(figsize=(10, 10)) + + for key in dict_scores_means.keys(): + plt.errorbar(d_v_divid_Z, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key) + plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center') + plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value) + plt.xlabel(x_label) + plt.ylabel("Accuracy score for "+classifier) + plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nAccuracy score vs ratio sum of views dimension / latent space dimension for classifier "+classifier) + plt.savefig(path_graph+"score_Z_factor_errorbar_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + + plt.figure(figsize=(10, 10)) # accuracy score of early fusion divided by + # accuracy score of each view vs d_v_divid_Z + for view in dict_views.keys(): + plt.semilogx(d_v_divid_Z, 
dict_scores_means['early fusion']/df_scores_means[view], '.-', label='early fusion score divided by '+view+' score') + plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value) + plt.xlabel(x_label) + plt.ylabel("Ratio accuracy score for early fusion / accuracy score for each view for "+classifier) + plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nRatio accuracy score for early fusion / accuracy score for each view \nvs ratio sum of views dimension / latent space dimension for classifier "+classifier) + plt.savefig(path_graph+"score_Z_factor_majority_view_divid_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + + plt.figure(figsize=(10, 10)) # accuracy score of late fusion divided by + # accuracy score of each view vs d_v_divid_Z + for view in dict_views.keys(): + plt.semilogx(d_v_divid_Z, dict_scores_means['late fusion']/df_scores_means[view], '.-', label='late fusion score divided by '+view+' score') + plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value) + plt.xlabel(x_label) + plt.ylabel("Ratio accuracy score for late fusion / accuracy score for each view for "+classifier) + plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nRation accuracy score for late fusion / accuracy score for each view \nvs ratio sum of views dimension / latent space dimension for classifier "+classifier) + plt.savefig(path_graph+"score_Z_factor_all_view_divid_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + """ + return df_scores_means, df_scores_std, df_scores_error + + +def score_multiviews_Z_factor_Mumbo(Z_factor_list, path_graph, cv=10, classifier="SVM", + classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, + n_samples=1000, n_views=3, n_classes=2, + R=2/3, n_clusters_per_class=2, + class_sep_factor=2, n_informative_divid=1, + d=4, D=10, standard_deviation=2): + """ + Returns 3 DataFrames (first with accuracy score, the second with the + standard deivation of accuracy score and the third with the error rate) of + latent space, views, early fusion predictions (concatenate views + predictions) and late fusion predictions (majority views predictions) with + sum of views dimension divided by Z_factor_list as index for the indicated + classifier and for Mumbo classifier + Creates and saves (at the indicated path path_graph) a graph represented + accuracy score vs sum of views dimension divided by Z_factor_list and a + graph represented error rate (1 - accuracy score) vs sum of views dimension + divided by Z_factor_list + + Parameters: + ----------- + Z_factor_list : list + each element from Z_factor_list defines a new dataset with + element as Z_factor + path_graph : str + path to save graphics + cv : int + classifier : str + classifier_dictionary : dict + n_samples, n_views, n_classes, R, n_clusters_per_class, class_sep_factor, + n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset + + Returns: + -------- + 3 DataFrames with Z_factor_list as index + """ + # Z_factor_list = list of diverse values of Z_factor from the highest to 
the lowest + Z_factor_list.sort(reverse=True) + # list of sum of views dimension for each Z_factor_list item + d_v = [] + # list of Z dimension for each Z_factor_list item + Z_dim_list = [] + # list of percentage of unsued columns of latent space in views + unsued_dimensions_percent_list = [] + # list of number of informative features of latent space + n_informative_list = [] + # same views have same colors on each graph + dict_colors = {'latent space':0} + dict_colors.update({'view'+str(i):0 for i in range(n_views)}) + prop_cycle = plt.rcParams['axes.prop_cycle'] + colors = prop_cycle.by_key()['color'] + for key, c in zip(dict_colors.keys(), colors): + dict_colors[key] = c + dict_colors.update({'early fusion':'purple', 'late fusion':'maroon', 'Mumbo':'midnightblue'}) + # dictionary contains mean of accuracy scores per Z_factor + dict_scores_means = {'latent space':[]} + dict_scores_means.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_means.update({'early fusion':[], 'late fusion':[], 'Mumbo':[]}) + # dictionary contains error rate per Z_factor + dict_scores_error = {'latent space':[]} + dict_scores_error.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_error.update({'early fusion':[], 'late fusion':[], 'Mumbo':[]}) + # dictionary contains standard deviation of accuracy scores per Z_factor + dict_scores_std = {'latent space':[]} + dict_scores_std.update({'view'+str(i):[] for i in range(n_views)}) + dict_scores_std.update({'early fusion':[], 'late fusion':[], 'Mumbo':[]}) + # dictionary contains data of each view + dict_views = {'view'+str(i):0 for i in range(n_views)} + + for Z_factor in Z_factor_list: + Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation) + unsued_dimensions_percent_list.append(unsued_dimensions_percent) + n_informative_list.append(n_informative) + + view_index = [0] # for Mumbo + for i in range(n_views): + # multiviews_list[i] = (columns / data of view i, numeros of columns of view i) + dict_views['view'+str(i)] = multiviews_list[i][0] + view_index.append(len(multiviews_list[i][1])+view_index[i]) + + concat = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1) + # = concatenation of all views + # dimension = number of columns + d_v.append(concat.shape[1]) + Z_dim_list.append(Z.shape[1]) + # dictionary of data + dict_data_df = {'latent space':Z} + dict_data_df.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)}) + dict_data_df.update({'early fusion':concat}) + + for key in dict_data_df.keys(): + clf = classifier_dictionary[classifier] + score = cross_val_score(clf, dict_data_df[key], y, scoring='accuracy', cv=cv) + dict_scores_means[key].append(score.mean()) + dict_scores_error[key].append(1 - score.mean()) + dict_scores_std[key].append(score.std()) + + mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary) + dict_scores_means['late fusion'].append(mean_majority) + dict_scores_error['late fusion'].append(1 - mean_majority) + dict_scores_std['late fusion'].append(std_majority) + # Mumbo + skf = StratifiedKFold(n_splits=cv, random_state=1, shuffle=True) + # provides cv train/test indices to split data in cv train/test sets + score = [] + for train_index, test_index in skf.split(concat, y): + # splits data and integer label of one view in test and train sets + train, test = 
concat[train_index], concat[test_index] + y_train, y_test = y[train_index], y[test_index] + # trains the classifier and tests it with test set + clf = MumboClassifier() + clf.fit(train, y_train, view_index) + y_pred = clf.predict(test) + score.append(accuracy_score(y_test, y_pred)) + + score = np.array(score) + dict_scores_means['Mumbo'].append(score.mean()) + dict_scores_error['Mumbo'].append(1 - score.mean()) + dict_scores_std['Mumbo'].append(score.std()) + + d_v_divid_Z = np.divide(np.array(d_v), np.array(Z_dim_list)) + + df_scores_means = pd.DataFrame(dict_scores_means, index=d_v_divid_Z) + df_scores_error = pd.DataFrame(dict_scores_error, index=d_v_divid_Z) + df_scores_std = pd.DataFrame(dict_scores_std, index=d_v_divid_Z) + + # index and label for graphics + label_index = [chr(i) for i in range(ord('a'),ord('z')+1)] + label_index = label_index[0:len(d_v)] + label_value = "" + for label, v_Z, dim_v, dim_Z, Z_factor, percent, n_informative in zip(label_index, d_v_divid_Z, d_v, Z_dim_list, Z_factor_list, unsued_dimensions_percent_list, n_informative_list): + label_value = label_value + label+" : V/Z = "+str(round(v_Z, 4))+", V = "+str(dim_v)+", Z = "+str(dim_Z)+", Z_factor = "+str(Z_factor)+", % ="+str(percent)+", n_informative = "+str(n_informative)+'\n' + + x_label = "V/Z = sum of views dimension divided by latent space dimension with :\nV = sum of views dimension\nZ = latent space dimension multiplied by Z_factor\n% = percentage of dimensions of latent space unsued in views\nn_informative = number of informative features" + + plt.figure(figsize=(10, 10)) # accuracy score vs d_v_divid_Z + for key in dict_scores_means.keys(): + plt.semilogx(d_v_divid_Z, dict_scores_means[key], '.-', color=dict_colors[key], label=key) + + plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value) + plt.xlabel(x_label) + plt.ylabel("Accuracy score for "+classifier+" and Mumbo") + plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nR = "+str(round(R, 4))+" - number of classes = "+str(n_classes)+"\nAccuracy score vs ratio sum of views dimension / latent space dimension for classifiers "+classifier+" and Mumbo") + plt.savefig(path_graph+"score_Z_factor_"+str(n_views)+"_"+str(n_samples)+"_Mumbo_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + + plt.figure(figsize=(10, 10)) # error rate vs d_v_divid_Z + for key in dict_scores_means.keys(): + plt.semilogx(d_v_divid_Z, dict_scores_error[key], '.-', color=dict_colors[key], label=key) + plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center') # new x indexes + plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value) + plt.xlabel(x_label) + plt.ylabel("Error rate for "+classifier+" and Mumbo") + plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0) + plt.title("number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nR = "+str(round(R, 4))+" - number of classes = "+str(n_classes)+"\nError rate vs ratio sum of views dimension / latent space dimension for classifiers "+classifier+" and Mumbo") + plt.savefig(path_graph+"error_Z_factor_"+str(n_views)+"_"+str(n_samples)+"_Mumbo_"+classifier+".png", bbox_inches='tight') + plt.show() + plt.close() + """ + plt.figure(figsize=(10, 10)) # accuracy score of early fusion 
+    # accuracy score of each view vs d_v_divid_Z
+    for view in dict_views.keys():
+        plt.semilogx(d_v_divid_Z, dict_scores_means['early fusion']/df_scores_means[view], '.-', label='early fusion score divided by '+view+' score')
+    plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center')  # new x indexes
+    plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value)
+    plt.xlabel(x_label)
+    plt.ylabel("Ratio accuracy score for early fusion / accuracy score for each view for "+classifier+" and Mumbo")
+    plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0)
+    plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nRatio accuracy score for early fusion / accuracy score for each view \nvs ratio sum of view dimensions / latent space dimension for classifiers "+classifier+" and Mumbo")
+    plt.savefig(path_graph+"score_Z_factor_majority_view_divid_"+str(n_views)+"_Mumbo_"+classifier+".png", bbox_inches='tight')
+    plt.show()
+    plt.close()
+
+    plt.figure(figsize=(10, 10))  # accuracy score of late fusion divided by
+    # accuracy score of each view vs d_v_divid_Z
+    for view in dict_views.keys():
+        plt.semilogx(d_v_divid_Z, dict_scores_means['late fusion']/df_scores_means[view], '.-', label='late fusion score divided by '+view+' score')
+    plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center')  # new x indexes
+    plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value)
+    plt.xlabel(x_label)
+    plt.ylabel("Ratio accuracy score for late fusion / accuracy score for each view for "+classifier+" and Mumbo")
+    plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0)
+    plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nRatio accuracy score for late fusion / accuracy score for each view \nvs ratio sum of view dimensions / latent space dimension for classifiers "+classifier+" and Mumbo")
+    plt.savefig(path_graph+"score_Z_factor_all_view_divid_"+str(n_views)+"_"+str(round(R, 4))+"_Mumbo_"+classifier+".png", bbox_inches='tight')
+    plt.show()
+    plt.close()
+    """
+    return df_scores_means, df_scores_std, df_scores_error
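+
+
+# Added explanatory sketch (not part of the original experiment script): the
+# three DataFrames returned above are indexed by V/Z (sum of view dimensions
+# divided by the latent space dimension) and contain one column per curve,
+# e.g. 'latent space', 'view0', ..., 'early fusion', 'late fusion', 'Mumbo'.
+# A hypothetical way to pull out a single curve (function name assumed from
+# the imports used in result.py):
+#     df_means, df_std, df_error = score_multiviews_Z_factor(...)
+#     mumbo_accuracy = df_means['Mumbo']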
+
+
+def score_multiviews_n_views_R(n_views_list, R_list, path_graph, cv=10,
+                               classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()},
+                               n_samples=1000, n_classes=2, Z_factor=1,
+                               n_clusters_per_class=2, class_sep_factor=2,
+                               n_informative_divid=1, d=4, D=10, standard_deviation=2):
+    """
+    Returns a dictionary with the values of n_views_list as keys, each
+    containing a DataFrame (accuracy score divided by the accuracy score for
+    R=1, i.e. no redundancy) of views, early fusion predictions (concatenated
+    views) and late fusion predictions (majority vote over view predictions),
+    with R_list as index, for the indicated classifier
+    Creates and saves (at the indicated path path_graph) one graph per value
+    of n_views_list representing the accuracy score divided by the accuracy
+    score for R=1 vs R_list
+
+    Parameters:
+    -----------
+    n_views_list : list
+                   each element from n_views_list defines a new dataset with
+                   element as n_views
+    R_list : list
+             each element from R_list defines a new dataset with element as R
+    path_graph : str
+                 path to save graphics
+    cv : int
+    classifier : str
+    classifier_dictionary : dict
+    n_samples, n_classes, Z_factor, n_clusters_per_class, class_sep_factor,
+    n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset
+
+    Returns:
+    --------
+    a dictionary with the values of n_views_list as keys, each containing a
+    DataFrame (accuracy score divided by the accuracy score for R=1, i.e. no
+    redundancy) with R_list as index
+    """
+    dict_n_views_R_ratio = {key:0 for key in n_views_list}
+    # n_views_list = list of diverse values of n_views from the lowest to the highest
+    n_views_list.sort(reverse=False)
+    # same views have the same colors on each graph
+    dict_colors = {'view'+str(i):0 for i in range(n_views_list[-1])}
+    prop_cycle = plt.rcParams['axes.prop_cycle']
+    colors = prop_cycle.by_key()['color']
+    for key, c in zip(dict_colors.keys(), colors):
+        dict_colors[key] = c
+    dict_colors.update({'early fusion':'purple', 'late fusion':'maroon'})
+
+    for n_views in n_views_list:
+        # R_list = list of diverse values of R from the lowest to the highest
+        R_list.sort(reverse=False)
+        # list of the percentage of unused columns of latent space in views
+        unsued_dimensions_percent_list = []
+        # list of the number of informative features of latent space
+        n_informative_list = []
+        # dictionary containing the mean of accuracy scores per R
+        dict_scores_means = {'view'+str(i):[] for i in range(n_views)}
+        dict_scores_means.update({'early fusion':[], 'late fusion':[]})
+        # dictionary of the mean score of each view for diverse R divided by
+        # the mean score of that view for R = 1 (i.e. no redundancy)
+        dict_scores_ratio_R_1 = {'view'+str(i):0 for i in range(n_views)}
+        dict_scores_ratio_R_1.update({'early fusion':0, 'late fusion':0})
+        # dictionary containing the data of each view
+        dict_views = {'view'+str(i):0 for i in range(n_views)}
+
+        for R in R_list:
+            Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
+            unsued_dimensions_percent_list.append(unsued_dimensions_percent)
+            n_informative_list.append(n_informative)
+
+            for i in range(n_views):
+                # multiviews_list[i] = (data / columns of view i, column indices of view i)
+                dict_views['view'+str(i)] = multiviews_list[i][0]
+
+            early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1)
+            # = concatenation of all views
+            # dictionary of data
+            dict_data_df = {'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)}
+            dict_data_df.update({'early fusion':early_fusion})
+
+            for key in dict_data_df.keys():
+                clf = classifier_dictionary[classifier]
+                score = cross_val_score(clf, dict_data_df[key], y, scoring='accuracy', cv=cv)
+                dict_scores_means[key].append(score.mean())
+
+            mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
+            dict_scores_means['late fusion'].append(mean_majority)
+
+        for key in dict_scores_means.keys():
+            score_R_1 = dict_scores_means[key][-1]  # R = 1 = last value of
+            # R_list => last score value in dict_scores_means[key]
+            dict_scores_ratio_R_1[key] = np.divide(np.array(dict_scores_means[key]), score_R_1)
+
+        df_scores_ratio_R_1 = pd.DataFrame(dict_scores_ratio_R_1, index=R_list)
+
+        plt.figure()
+        for key in dict_scores_means.keys():
+            plt.plot(R_list, dict_scores_ratio_R_1[key], '.-', color=dict_colors[key], label=key)
+        # index and labels for the graphic
+        label_index = []
+        R_label = []
+        for i in range(0, len(R_list), 4):
+            R_label.append(R_list[i])
+            label_index.append(str(round(R_list[i], 2))+'\n'+str(unsued_dimensions_percent_list[i])+'\n'+str(n_informative_list[i]))
+
+        plt.xticks(R_label, label_index, fontsize='medium', multialignment='center')  # new x indexes
+        plt.xlabel("R\nPercentage of dimensions of latent space unused in views\nNumber of informative features")
+        plt.ylabel("Ratio accuracy score / accuracy score for R = 1 for "+classifier)
+        plt.legend(bbox_to_anchor=(1.04, 0.5), loc="center left", borderaxespad=0)
+        plt.title("number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nRatio accuracy score / accuracy score for R = 1\n(no redundancy) vs R for classifier "+classifier)
+        plt.savefig(path_graph+"score_R_divid_R_1_"+str(n_views)+"_"+str(n_samples)+"_"+str(Z_factor)+"_"+classifier+".png", bbox_inches='tight')
+        plt.show()
+        plt.close()
+
+        dict_n_views_R_ratio[n_views] = df_scores_ratio_R_1
+
+    plt.figure()
+    ax = plt.axes(projection="3d")
+
+    for n_views in n_views_list:
+        for key in dict_n_views_R_ratio[n_views].keys():
+            if n_views == n_views_list[-1]:  # print legends only once
+                ax.plot(R_list, dict_n_views_R_ratio[n_views][key], n_views, color=dict_colors[key], label=key)
+            else:
+                ax.plot(R_list, dict_n_views_R_ratio[n_views][key], n_views, color=dict_colors[key])
+
+    ax.set_xlabel("R")
+    ax.set_ylabel("Ratio accuracy score / accuracy score for R = 1 for "+classifier)
+    ax.set_zlabel("Number of views")
+    plt.legend(bbox_to_anchor=(1.04, 0.5), loc="center left", borderaxespad=0)
+    plt.title("number of samples = "+str(n_samples)+" - factor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nRatio accuracy score / accuracy score for R = 1 (no redundancy) vs R, number of views for classifier "+classifier)
+    plt.savefig(path_graph+"score_R_divid_R_1_all_n_views"+"_"+str(n_samples)+"_"+str(Z_factor)+"_"+classifier+".png", bbox_inches='tight')
+    plt.show()
+    plt.close()
+
+    return dict_n_views_R_ratio
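+
+
+# Illustrative usage sketch (added; the argument values and the "graphs/"
+# output directory are hypothetical, not taken from the original experiments).
+# R_list should end at 1 so the ratio to the R = 1 score is well defined:
+#     dict_ratio = score_multiviews_n_views_R([3, 4, 5], list(np.linspace(1/3, 1, 9)),
+#                                             "graphs/", cv=5, classifier="SVM")
+#     dict_ratio[3]  # DataFrame indexed by R for the 3-view datasets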
+
+
+def score_multiviews_class_sep(class_sep_factor_list, path_graph, cv=10,
+                               classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()},
+                               n_views=3, n_samples=1000, n_classes=2,
+                               Z_factor=1, R=2/3, n_clusters_per_class=2,
+                               n_informative_divid=1, d=4, D=10, standard_deviation=2):
+    """
+    Returns 2 DataFrames (the first with the accuracy score and the second
+    with the standard deviation of the accuracy score) of latent space, views,
+    early fusion predictions (concatenated views) and late fusion predictions
+    (majority vote over view predictions) with class_sep_factor_list as index
+    for the indicated classifier
+    Creates and saves (at the indicated path path_graph) a graph representing
+    the accuracy score (with confidence interval) vs class_sep_factor_list
+
+    Parameters:
+    -----------
+    class_sep_factor_list : list
+                            each element from class_sep_factor_list defines a
+                            new dataset with element as class_sep_factor
+    path_graph : str
+    cv : int
+    classifier : str
+    classifier_dictionary : dict
+    n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class,
+    n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset
+
+    Returns:
+    --------
+    2 DataFrames with class_sep_factor_list as index
+    """
+    # list of the percentage of unused columns of latent space in views
+    unsued_dimensions_percent_list = []
+    # list of the number of informative features of latent space
+    n_informative_list = []
+    # dictionary containing the mean of accuracy scores per class_sep_factor
+    dict_scores_means = {'latent space':[]}
+    dict_scores_means.update({'view'+str(i):[] for i in range(n_views)})
+    dict_scores_means.update({'early fusion':[], 'late fusion':[]})
+    # dictionary containing the standard deviation of accuracy scores per class_sep_factor
+    dict_scores_std = {'latent space':[]}
+    dict_scores_std.update({'view'+str(i):[] for i in range(n_views)})
+    dict_scores_std.update({'early fusion':[], 'late fusion':[]})
+    # dictionary containing the data of each view
+    dict_views = {'view'+str(i):0 for i in range(n_views)}
+
+    for class_sep_factor in class_sep_factor_list:
+        Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
+        unsued_dimensions_percent_list.append(unsued_dimensions_percent)
+        n_informative_list.append(n_informative)
+
+        for i in range(n_views):
+            # multiviews_list[i] = (data / columns of view i, column indices of view i)
+            dict_views['view'+str(i)] = multiviews_list[i][0]
+
+        early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1)
+        # = concatenation of all views
+        # dictionary of data
+        dict_data = {'latent space':Z}
+        dict_data.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)})
+        dict_data.update({'early fusion':early_fusion})
+
+        for key in dict_data.keys():
+            print('key', key)
+            clf = classifier_dictionary[classifier]
+            score = cross_val_score(clf, dict_data[key], y, scoring='accuracy', cv=cv)
+            dict_scores_means[key].append(score.mean())
+            dict_scores_std[key].append(score.std())
+
+        mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
+        dict_scores_means['late fusion'].append(mean_majority)
+        dict_scores_std['late fusion'].append(std_majority)
+
+    print(dict_scores_means)
+
+    df_scores_means = pd.DataFrame(dict_scores_means, index=class_sep_factor_list)
+    df_scores_std = pd.DataFrame(dict_scores_std, index=class_sep_factor_list)
+
+    plt.figure()
+    for key in dict_scores_means.keys():
+        plt.errorbar(class_sep_factor_list, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key)
+    # index and labels for the graphic
+    label_index = []
+    for class_sep_factor, percent, n_informative in zip(class_sep_factor_list, unsued_dimensions_percent_list, n_informative_list):
+        label_index.append(str(class_sep_factor)+'\n'+str(percent)+'\n'+str(n_informative))
+
+    plt.xticks(class_sep_factor_list, label_index, fontsize='medium', multialignment='center')  # new x indexes
+    plt.xlabel("Factor (class_sep = factor*n_clusters_per_class)\nPercentage of dimensions of latent space unused in views\nNumber of informative features")
+    plt.ylabel("Accuracy score for "+classifier)
+    plt.legend(bbox_to_anchor=(1.04, 0.5), loc="center left", borderaxespad=0)
+    plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nAccuracy score vs factor of class_sep for classifier "+classifier)
+    plt.savefig(path_graph+"score_class_sep_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight')
+    plt.show()
+    plt.close()
+
+    return df_scores_means, df_scores_std
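+
+
+# Illustrative usage sketch (added; the factor values and the "graphs/" path
+# are hypothetical). Each factor is multiplied by n_clusters_per_class to
+# obtain the class_sep used when generating the latent space:
+#     df_means, df_std = score_multiviews_class_sep([1, 2, 5, 10], "graphs/",
+#                                                   cv=5, classifier="NB")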
+
+
+def score_multiviews_n_informative_divided(n_informative_divid_list, path_graph,
+                                           cv=10, classifier="SVM",
+                                           classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()},
+                                           n_views=3, n_samples=1000,
+                                           n_classes=2, Z_factor=1, R=2/3,
+                                           n_clusters_per_class=2,
+                                           class_sep_factor=2, d=4, D=10,
+                                           standard_deviation=2):
+    """
+    Returns 2 DataFrames (the first with the accuracy score and the second
+    with the standard deviation of the accuracy score) of latent space, views,
+    early fusion predictions (concatenated views) and late fusion predictions
+    (majority vote over view predictions) with n_informative_divid_list as
+    index for the indicated classifier
+    Creates and saves (at the indicated path path_graph) a graph representing
+    the accuracy score (with confidence interval) vs n_informative_divid_list
+
+    Parameters:
+    -----------
+    n_informative_divid_list : list
+                               each element from n_informative_divid_list
+                               defines a new dataset with element as
+                               n_informative_divid
+    path_graph : str
+    cv : int
+    classifier : str
+    classifier_dictionary : dict
+    n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class,
+    class_sep_factor, d, D, standard_deviation : parameters of generator_multiviews_dataset
+
+    Returns:
+    --------
+    2 DataFrames with n_informative_divid_list as index
+    """
+    # list of the percentage of unused columns of latent space in views
+    unsued_dimensions_percent_list = []
+    # list of the number of informative features of latent space
+    n_informative_list = []
+    # dictionary containing the mean of accuracy scores per n_informative_divid
+    dict_scores_means = {'latent space':[]}
+    dict_scores_means.update({'view'+str(i):[] for i in range(n_views)})
+    dict_scores_means.update({'early fusion':[], 'late fusion':[]})
+    # dictionary containing the standard deviation of accuracy scores per
+    # n_informative_divid
+    dict_scores_std = {'latent space':[]}
+    dict_scores_std.update({'view'+str(i):[] for i in range(n_views)})
+    dict_scores_std.update({'early fusion':[], 'late fusion':[]})
+    # dictionary containing the data of each view
+    dict_views = {'view'+str(i):0 for i in range(n_views)}
+
+    for n_informative_divid in n_informative_divid_list:
+        Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
+        unsued_dimensions_percent_list.append(unsued_dimensions_percent)
+        n_informative_list.append(n_informative)
+
+        for i in range(n_views):
+            # multiviews_list[i] = (data / columns of view i, column indices of view i)
+            dict_views['view'+str(i)] = multiviews_list[i][0]
+
+        early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1)
+        # = concatenation of all views
+        # dictionary of data
+        dict_data = {'latent space':Z}
+        dict_data.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)})
+        dict_data.update({'early fusion':early_fusion})
+
+        for key in dict_data.keys():
+            clf = classifier_dictionary[classifier]
+            score = cross_val_score(clf, dict_data[key], y, scoring='accuracy', cv=cv)
+            dict_scores_means[key].append(score.mean())
+            dict_scores_std[key].append(score.std())
+
+        mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
+        dict_scores_means['late fusion'].append(mean_majority)
+        dict_scores_std['late fusion'].append(std_majority)
+
+    df_scores_means = pd.DataFrame(dict_scores_means, index=n_informative_divid_list)
+    df_scores_std = pd.DataFrame(dict_scores_std, index=n_informative_divid_list)
+
+    plt.figure()
+    for key in dict_scores_means.keys():
+        plt.errorbar(n_informative_divid_list, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key)
+    # index and labels for the graphic
+    label_index = []
+    for n_informative_divid, percent, n_informative in zip(n_informative_divid_list, unsued_dimensions_percent_list, n_informative_list):
+        label_index.append(str(n_informative_divid)+'\n'+str(percent)+'\n'+str(n_informative))
+
+    plt.xticks(n_informative_divid_list, label_index, fontsize='medium', multialignment='center')  # new x indexes
+    plt.xlabel("Factor (n_informative = dimension of latent space / factor)\nPercentage of dimensions of latent space unused in views\nNumber of informative features")
+    plt.ylabel("Accuracy score for "+classifier)
+    plt.legend(bbox_to_anchor=(1.04, 0.5), loc="center left", borderaxespad=0)
+    plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nAccuracy score vs n_informative_divid for classifier "+classifier)
+    plt.savefig(path_graph+"score_n_informative_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight')
+    plt.show()
+    plt.close()
+
+    return df_scores_means, df_scores_std
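+
+
+# Illustrative usage sketch (added; argument values and the "graphs/" path are
+# hypothetical). n_informative_divid = 1 keeps every latent dimension
+# informative; larger factors divide the latent space dimension to obtain
+# n_informative, as described in the x-axis label above:
+#     df_means, df_std = score_multiviews_n_informative_divided([1, 2, 4], "graphs/",
+#                                                               cv=5, classifier="SVM")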