Skip to content
Snippets Groups Projects
Commit 40692642 authored by Dominique Benielli's avatar Dominique Benielli
Browse files

firts add

parent 3726ce43
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 27 16:14:14 2019
@author: bernardet
"""
import parameters
from multiviews_datasets import generator_multiviews_dataset, results_to_csv
from test_classifier import score_multiviews_n_samples, graph_comparaison_classifier_scores_n_samples, score_multiviews_R, score_multiviews_Z_factor, score_multiviews_n_views_R, score_multiviews_class_sep, score_one_multiview_dataset, score_multiviews_n_informative_divided
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# ---------------------------------------------------------------------------
# Experiment settings.
# Values are read from the `parameters` module unless a literal is assigned;
# in that case the literal replaces the parameters.* attribute named in the
# trailing comment. n_classes, n_clusters_per_class, class_sep_factor,
# n_informative_divid, Z_factor_list and path_graph are assigned again near
# the end of this script, just before the Z_factor experiment loop.
# ---------------------------------------------------------------------------
n_samples = parameters.n_samples
n_views = parameters.n_views
n_classes = 3  # replaces parameters.n_classes
Z_factor = parameters.Z_factor
R = parameters.R
n_clusters_per_class = 1  # replaces parameters.n_clusters_per_class
class_sep_factor = 2  # replaces parameters.class_sep_factor (original comment also listed 5 and 2)
n_informative_divid = 2  # replaces parameters.n_informative_divid
cv = parameters.cv
classifier = parameters.classifier
classifier_dictionary = parameters.classifier_dictionary
d = parameters.d
D = parameters.D
standard_deviation = parameters.standard_deviation
path_data = parameters.path_data
path_graph = parameters.path_graph
n_samples_list = parameters.n_samples_list
R_list = parameters.R_list
Z_factor_list = parameters.Z_factor_list
n_views_list = parameters.n_views_list
class_sep_factor_list = parameters.class_sep_factor_list
n_informative_divid_list = parameters.n_informative_divid_list

# ---------------------------------------------------------------------------
# Usage examples, kept commented out. Uncomment the one to run.
# ---------------------------------------------------------------------------
# Generate one dataset (generator_multiviews_dataset returns 5 values)
#Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
#print(Z, y, multiviews_list)

# Save one multiview dataset as csv files under path_data
#results_to_csv(path_data, Z, y, multiviews_list)

# Score of one multiview dataset
#df_dimensions, df_scores_means, df_scores_std = score_one_multiview_dataset(cv, classifier, classifier_dictionary, n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
#print(df_dimensions, df_scores_means, df_scores_std)

# Scores of the datasets defined by n_samples_list
#mean_samples, std_samples = score_multiviews_n_samples(n_samples_list, path_graph, cv, classifier, classifier_dictionary, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
#print(mean_samples, std_samples)

# Plot scores of classifier2 vs scores of classifier1
# (classifier1 / classifier2 are keys of classifier_dictionary)
classifier1 = "SVM"
classifier2 = "NB"
#graph_comparaison_classifier_scores_n_samples(classifier1, classifier2, n_samples_list, path_graph, cv, classifier_dictionary, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)

# Scores of the datasets defined by R_list
#mean_R, std_R = score_multiviews_R(R_list, path_graph, cv, classifier, classifier_dictionary, n_samples, n_views, n_classes, Z_factor, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
#print(mean_R, std_R)

# Scores of the datasets defined by Z_factor_list
#mean_Z, std_Z, error_Z = score_multiviews_Z_factor(Z_factor_list, path_graph, cv, classifier, classifier_dictionary, n_samples, n_views, n_classes, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
#print(mean_Z, std_Z, error_Z)

# Scores divided by the scores obtained for R=1 (no redundancy), for the datasets defined by n_views_list and R_list
#dict_n_views_R_ratio = score_multiviews_n_views_R(n_views_list, R_list, path_graph, cv, classifier, classifier_dictionary, n_samples, n_classes, Z_factor, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
#print(dict_n_views_R_ratio)

# Scores of the datasets defined by class_sep_factor_list
#df_mean, df_std = score_multiviews_class_sep(class_sep_factor_list, path_data, path_graph, cv, classifier, classifier_dictionary, n_views, n_samples, n_classes, Z_factor, R, n_clusters_per_class, n_informative_divid, d, D, standard_deviation)
#print(df_mean, df_std)

# Scores of the datasets defined by n_informative_divid_list
#mean_n_info, std_n_info = score_multiviews_n_informative_divided(n_informative_divid_list, path_graph, cv, classifier, classifier_dictionary, n_views, n_samples, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, d, D, standard_deviation)
#print(mean_n_info, std_n_info)
# Z_factor experiment: one score_multiviews_Z_factor run per number of views
# (3 to 10), with 500 samples per view, 2 classes, 1 cluster per class,
# class_sep_factor = 2 and n_informative_divid = 2.
# Graphs are written under path_graph.
path_graph = "/home/bernardet/Documents/StageL3/Graph/n_views_3_10_1_clus_2_n_info_div/"
Z_factor_list = [1, 3, 10, 25, 100, 250, 1000]
n_classes, n_clusters_per_class = 2, 1
class_sep_factor, n_informative_divid = 2, 2
for n_views in range(3, 11):
    # the number of samples grows with the number of views
    n_samples = 500*n_views
    mean_Z, std_Z, error_Z = score_multiviews_Z_factor(
        Z_factor_list,
        path_graph,
        cv=cv,
        classifier=classifier,
        classifier_dictionary=classifier_dictionary,
        n_samples=n_samples,
        n_views=n_views,
        n_classes=n_classes,
        R=R,
        n_clusters_per_class=n_clusters_per_class,
        class_sep_factor=class_sep_factor,
        n_informative_divid=n_informative_divid,
        d=d,
        D=D,
        standard_deviation=standard_deviation,
    )
# Version string exposed by this module.
__version__ = '1.0.dev0'
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 26 15:38:38 2019
@author: bernardet
"""
from sklearn.datasets import make_classification
from random import gauss
from math import ceil, floor
import numpy as np
import pandas as pd
def latent_space_dimension(views_dimensions_list, R):
    """
    Computes the minimal dimension of the latent space that is enough for generator_multiviews_dataset to build views of the given dimensions

    The starting value is ceil(R * sum of views dimensions), raised to the largest view dimension if needed.
    Then the views are walked in order : after each view, floor(R * dimension of that view) columns are counted as withdrawn,
    and the latent dimension is enlarged whenever the columns still available are fewer than the next view needs.

    Parameters:
    -----------
    views_dimensions_list : list
                            dimension of each view, in drawing order
    R : float

    Returns:
    --------
    an int
    """
    largest_view = max(views_dimensions_list)
    dimension = ceil(R*sum(views_dimensions_list))
    if dimension < largest_view:
        dimension = largest_view

    available = dimension   # columns still available for the next view
    withdrawn = 0           # total number of columns withdrawn so far
    consecutive_views = zip(views_dimensions_list, views_dimensions_list[1:])
    for previous_view, current_view in consecutive_views:
        dropped = floor(R*previous_view)
        withdrawn += dropped
        shortfall = current_view - (available - dropped)
        if shortfall > 0:
            # not enough columns left for the current view : enlarge the latent space
            dimension += shortfall
        available = dimension - withdrawn

    return dimension
def projection(latent_space, chosen_columns_list):
    """
    Keeps only the columns of latent_space listed in chosen_columns_list (in chosen_columns_list order), all rows included

    Parameters:
    -----------
    latent_space : array
    chosen_columns_list : list

    Returns:
    --------
    an array of dimension (number of rows of latent_space, length of chosen_columns_list)
    """
    every_row = slice(None)  # same as ':' in an index expression
    return latent_space[every_row, chosen_columns_list]
def generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation):
    """
    Generates a latent space Z with make_classification and draws n_views views from it, each view being a random subset of the columns of Z

    Parameters:
    -----------
    n_samples : int
                dataset number of samples (number of rows of dataset)
    n_views : int >= 2
              dataset number of views
              one view is a set of some features (columns) of the latent space
    n_classes : int >= 2
                dataset number of classes
    Z_factor : float >= 1
               minimal dimension of the latent space (enough to build the dataset) is calculated then multiplied by Z_factor
               (the product is truncated to an integer)
    R : 0 <= float <= 1
        after each view is drawn, floor(R * dimension of the view) of its columns are withdrawn from the columns available to the next views
        R = 1 <> no possibility of redundancy between views
        R = 0 <> maximal possibility of redundancy between views
    n_clusters_per_class : int
    class_sep_factor : float
                       class_sep = n_clusters_per_class*class_sep_factor
    n_informative_divid : float >= 1
                          n_informative_divid raises <> number of non-informative features raises
                          n_informative_divid = 1 <> no non-informative features, number of informative features = dimension of latent space
                          number of informative features = round(dimension of latent space / n_informative_divid)
    d : float >= 1
        minimal dimension of views
    D : float >= d
        maximal dimension of views
    standard_deviation : float
                         standard deviation of the gaussian distribution N((d+D)/2, standard_deviation^2)
                         dimension of views (int) chosen randomly from N((d+D)/2, standard_deviation^2) with d <= dimension of views <= D

    Returns:
    --------
    Z : an array of dimension (n_samples, dimension of latent space) = the generated samples
    y : an array of dimension (n_samples) = the integer labels for class membership of each sample
    a list of n_views tuples (X_v, I_v) with :
        X_v = Z projected along d_v (= dimension of the v-th view) columns in I_v
        I_v = column numbers of X_v, using the numbering of the columns of Z
    unsued_dimensions_percent : percentage of columns of the latent space used in no view
    n_informative : number of informative features (dimension of latent space - n_informative = number of non informative features)

    Raises:
    -------
    ValueError if n_views < 2, n_classes < 2, Z_factor < 1, d < 1, (d+D)/2 - 3*standard_deviation < 0, R outside [0, 1] or D < d
    """
    if n_views < 2:
        raise ValueError("n_views >= 2")
    if n_classes < 2:
        raise ValueError("n_classes >= 2")
    if Z_factor < 1:
        raise ValueError("Z_factor >= 1 pour le bon fonctionnement de l'algorithme")
    if d < 1:
        raise ValueError("d >= 1")
    if (d+D)/2 - 3*standard_deviation < 0:
        raise ValueError("Il faut que (d+D)/2 - 3*standard_deviation >= 0 pour avoir des valeurs positives lors de l'emploi de la loi normale")
    if not 0 <= R <= 1:
        # fail early instead of failing later inside np.random.choice
        raise ValueError("0 <= R <= 1")
    if D < d:
        # with D < d no drawn dimension can satisfy d <= dimension <= D : the redraw loop below would never end
        raise ValueError("D >= d")

    # n_views view dimensions drawn from N((d+D)/2, standard_deviation^2)
    mean_dimension = (d+D)/2
    drawn = list(np.random.normal(loc=mean_dimension, scale=standard_deviation, size=n_views))
    # draws inside [d, D] are kept, each draw outside is replaced by a new draw inside [d, D]
    d_v = [dim_view for dim_view in drawn if d <= dim_view <= D]
    for _ in range(n_views - len(d_v)):
        redraw = gauss(mean_dimension, standard_deviation)
        while redraw < d or redraw > D:
            redraw = gauss(mean_dimension, standard_deviation)
        d_v.append(redraw)
    # dimension of views = integer, from the highest to the lowest
    d_v = sorted((int(dim_view) for dim_view in d_v), reverse=True)

    # Dimension of latent space Z (multiplied by Z_factor)
    # int() keeps the dimension usable as n_features and as a column count when Z_factor is a float
    dim_Z = int(Z_factor*latent_space_dimension(d_v, R))
    # Number of informative features
    n_informative = round(dim_Z/n_informative_divid)
    # Generation of latent space Z
    Z, y = make_classification(n_samples=n_samples, n_features=dim_Z, n_informative=n_informative, n_redundant=0,
                               n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters_per_class, weights=None,
                               flip_y=0.01, class_sep=n_clusters_per_class*class_sep_factor, random_state=None)

    I_q = np.arange(Z.shape[1])  # 1D-array of the column numbers of Z still available
    used_columns = set()         # column numbers used in at least one view
    results = []
    for dim_view in d_v:
        # draw dim_view column numbers from I_q, uniformly and without replacement
        I_v = np.random.choice(I_q, size=dim_view, replace=False)
        used_columns.update(I_v.tolist())
        # projection of Z along the columns in I_v
        X_v = projection(Z, I_v)
        results.append((X_v, I_v))
        # draw floor(R*dim_view) column numbers from I_v without replacement and withdraw them from I_q
        elements_to_remove = np.random.choice(I_v, size=floor(R*dim_view), replace=False)
        I_q = np.setdiff1d(I_q, elements_to_remove)

    # withdrawn columns all belong to a view, so unused columns are the remaining ones that appear in no view
    unsued_dimensions_list = [column for column in I_q.tolist() if column not in used_columns]
    unsued_dimensions_percent = round((len(unsued_dimensions_list) / dim_Z)*100, 2)
    return Z, y, results, unsued_dimensions_percent, n_informative
def results_to_csv(path, latent_space, integer_labels, multiviews_list):
    """
    Writes the dataset as csv files (length of multiviews_list + 2 files), each file name being appended directly to path

    Files name :
        latent_space.csv for latent_space
        integer_labels.csv for integer_labels
        view0.csv for multiviews_list[0], view1.csv for multiviews_list[1], ...

    Parameters:
    -----------
    path : str
           prefix of every file name (no separator is added between path and the file name)
    latent_space : array
    integer_labels : 1D array
    multiviews_list : list of tuples
                      element 0 of a tuple is written as data, element 1 is used as column names

    Returns:
    --------
    None
    """
    pd.DataFrame(latent_space).to_csv(path+'latent_space.csv', index=False)
    pd.DataFrame(integer_labels).to_csv(path+'integer_labels.csv', index=False)
    for view_number, view_tuple in enumerate(multiviews_list):
        file_name = path+'view'+str(view_number)+'.csv'
        pd.DataFrame(view_tuple[0], columns=view_tuple[1]).to_csv(file_name, index=False)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 26 13:53:05 2019
@author: bernardet
"""
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import numpy as np
# ---------------------------------------------------------------------------
# General parameters
# (each comment describes the assignment that follows it)
# ---------------------------------------------------------------------------
# number of samples (int)
n_samples = 1000
# number of views (int); generator_multiviews_dataset rejects values < 2
n_views = 3
# number of classes (int); generator_multiviews_dataset rejects values < 2
n_classes = 2
# multiplication factor of the latent space dimension, >= 1
# (the scoring functions use 1 as default value)
Z_factor = 250
# redundancy (float, 0 <= R <= 1)
R = 2/3
# number of cross-validation splits (int)
cv = 10
# number of clusters per class >= 1 (int)
n_clusters_per_class = 2
# factor >= 1 as class_sep = n_clusters_per_class*class_sep_factor
class_sep_factor = 2
# factor >= 1 as number of informative features = round(dimension of latent space / n_informative_divid)
n_informative_divid = 1
# name of classifier (str), must be a key of classifier_dictionary
classifier = "SVM"
# dictionary of classifiers : name -> estimator instance
classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}

# ---------------------------------------------------------------------------
# Lists of values used to test the generator, one dataset per element
# ---------------------------------------------------------------------------
# list of numbers of samples (larger values were tried and are left in the trailing comment)
n_samples_list = [100, 500, 1000, 1500, 2000]#, 2500, 3000]#, 3500, 4000, 5000, 7000, 10000]
# list of values of R, from 0 to 1 in steps of 0.05
R_list = list(np.arange(0, 1.05, 0.05))
# list of values of Z_factor (an alternative list is left in the trailing comment)
Z_factor_list = [1, 3, 10, 25, 100, 250, 1000]#[25, 50, 75, 100, 150, 200, 250, 500, 600, 750, 800, 900, 1000]
# list of values of n_views, from 2 to 9
n_views_list = [n_view for n_view in range(2, 10)]
# list of values of class_sep_factor
class_sep_factor_list = [2, 5, 10]
# list of values of n_informative_divid
n_informative_divid_list = [1, 2, 3]

# ---------------------------------------------------------------------------
# Output paths (used as prefixes : they must end with a path separator)
# ---------------------------------------------------------------------------
# path where the multiview dataset is saved
path_data = "/home/bernardet/Documents/StageL3/Data/"
# path where the score graphs are saved
path_graph = "/home/bernardet/Documents/StageL3/Graph/"

# ---------------------------------------------------------------------------
# Parameters of the gaussian distribution N((d+D)/2, standard_deviation^2)
# used to draw the dimension of each view :
#   d <= dim[v] <= D for all v
#   (d+D)/2 - 3*standard_deviation >= 0
# ---------------------------------------------------------------------------
# minimal dimension of views, < D and > 0 (generator_multiviews_dataset rejects d < 1)
d = 4
# maximal dimension of views, > d
D = 10
# standard deviation of the gaussian distribution
standard_deviation = 2

# ---------------------------------------------------------------------------
# make_classification parameters
# TODO: work out how to use them. No code visible alongside this file reads
# them; generator_multiviews_dataset passes n_redundant=0, n_repeated=0,
# weights=None and flip_y=0.01 to make_classification.
# ---------------------------------------------------------------------------
# proportion of informative features (float between 0 and 1)
part_informative = 0
# proportion of redundant features (float between 0 and 1)
# n_redundant >= 1 for redundant
part_redundant = 1
# proportion of repeated features (float between 0 and 1)
# n_repeated >= 1 for useless features and correlation
part_repeated = 1
# proportion of samples assigned to each class (list)
# != [0.5, 0.5] / = [0.8, 0.2] for imbalance
# NOTE(review): the original comment stated len(weights) = nbr_features; a list of
# per-class proportions would be expected to have one entry per class - confirm
weights = [0.7, 0.3]
# fraction of samples whose class are randomly exchanged (float)
# > 0 for noise
flip_y = 0.1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 28 14:14:46 2019
@author: bernardet
"""
from multiviews_datasets import generator_multiviews_dataset, results_to_csv
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
from collections import Counter
from mpl_toolkits.mplot3d import Axes3D
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
def majority_list(predictions_list):
    """
    Majority vote : for each sample, keeps the class predicted most often among the arrays of predictions_list

    When several classes share the highest number of votes, the one met first (reading predictions_list in order) is kept

    Parameters:
    -----------
    predictions_list : list of 1D array
                       one array of predictions per view, all of the same length

    Returns:
    --------
    an 1D array
    """
    n_samples = len(predictions_list[0])
    # -1 = no prediction yet
    majority_prediction = np.array([-1]*n_samples)
    # votes[i] = predictions of the i-th sample, one per view
    columns = [view_predictions.reshape(len(view_predictions), 1) for view_predictions in predictions_list]
    votes = np.hstack(columns)
    for sample in range(n_samples):
        # most_common(1) gives [(prediction, occurrences)] of the most frequent prediction, first met wins on ties
        winner, _ = Counter(votes[sample]).most_common(1)[0]
        majority_prediction[sample] = winner
    return majority_prediction
def majority_score(views_dictionary, integer_labels, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}):
    """
    Late fusion score : mean and standard deviation (over the cv folds) of the accuracy obtained when the prediction of a sample is the majority of the predictions made on each view

    Parameters:
    -----------
    views_dictionary : dict
                       data of each view
    integer_labels = array
    cv : int
         number of folds
    classifier : str
                 key of classifier_dictionary
    classifier_dictionary : dict

    Returns:
    --------
    Two floats
    """
    # shuffled stratified folds with a fixed random_state : the split call is repeated for each view with the same labels
    skf = StratifiedKFold(n_splits=cv, random_state=1, shuffle=True)
    fold_predictions = [[] for _ in range(cv)]  # fold_predictions[fold] = one array of predictions per view
    fold_labels = [[] for _ in range(cv)]       # fold_labels[fold] = expected labels of the test set of the fold

    for X in views_dictionary.values():
        for fold, (train_index, test_index) in enumerate(skf.split(X, integer_labels)):
            # train / test split of one view and of the labels
            train, test = X[train_index], X[test_index]
            y_train, y_test = integer_labels[train_index], integer_labels[test_index]
            # trains the classifier on the train set, predicts the test set
            clf = classifier_dictionary[classifier]
            clf.fit(train, y_train.ravel())
            fold_predictions[fold].append(clf.predict(test))
            # stored once per fold (filled while processing the first view)
            if len(fold_labels[fold]) == 0:
                fold_labels[fold] = y_test

    score = []
    for expected, predictions in zip(fold_labels, fold_predictions):
        y_pred_majority = majority_list(predictions)  # majority of views predictions
        score.append(accuracy_score(expected.ravel(), y_pred_majority))
    score = np.array(score)
    return score.mean(), score.std()
def score_one_multiview_dataset(cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_samples=1000, n_views=3, n_classes=2, Z_factor=1, R=2/3, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2):
    """
    Generates one multiviews dataset and scores the indicated classifier on the latent space, on each view,
    on the early fusion (concatenation of all views) and on the late fusion (majority of views predictions)

    Parameters:
    -----------
    cv : int
    classifier : str
    classifier_dictionary : dict
    n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset

    Returns:
    --------
    3 Series :
        the first with the percentage of dimensions of latent space unsued in views, the number of informative features and the shape of latent space and of each view
        the second with the mean of accuracy score
        the third with the standard deviation of accuracy score
    """
    Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)

    view_names = ['view'+str(i) for i in range(n_views)]
    # multiviews_list[i] = (data of view i, column numbers of view i)
    dict_views = {name: multiviews_list[i][0] for i, name in enumerate(view_names)}

    # percentage of unsued dimensions, number of informative features, then shapes of latent space and views
    dimensions = {'unsued dimension of latent space': unsued_dimensions_percent,
                  "number of informative features": n_informative,
                  'latent space': Z.shape}
    for name in view_names:
        dimensions[name] = dict_views[name].shape

    # data to score with cross-validation : latent space, each view, concatenation of all views
    dict_data = {'latent space': Z}
    dict_data.update(dict_views)
    dict_data['early fusion'] = np.concatenate(list(dict_views.values()), axis=1)

    dict_scores_means = {}
    dict_scores_std = {}
    for key, data in dict_data.items():
        clf = classifier_dictionary[classifier]
        score = cross_val_score(clf, data, y, scoring='accuracy', cv=cv)
        dict_scores_means[key] = score.mean()
        dict_scores_std[key] = score.std()

    # late fusion is scored last, so it is the last entry of both Series
    mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
    dict_scores_means['late fusion'] = mean_majority
    dict_scores_std['late fusion'] = std_majority

    return pd.Series(dimensions), pd.Series(dict_scores_means), pd.Series(dict_scores_std)
def score_multiviews_n_samples(n_samples_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_views=3, n_classes=2, Z_factor=1, R=2/3, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2):
    """
    Generates one multiviews dataset per element of n_samples_list and scores the indicated classifier on the latent space, on each view,
    on the early fusion (concatenation of all views) and on the late fusion (majority of views predictions)
    Creates, saves (at the indicated path path_graph) and shows a graph of accuracy score (error bars = 1.96*standard deviation/sqrt(cv)) vs number of samples

    Parameters:
    -----------
    n_samples_list : list
                     each element from n_samples_list defines a new dataset with element samples
                     it is read in increasing order, the list given by the caller is not modified
    path_graph : str
                 path to save graphics
    cv : int
    classifier : str
    classifier_dictionary : dict
    n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset

    Returns:
    --------
    2 DataFrames with the sorted elements of n_samples_list as index (first with the mean of accuracy score, second with the standard deviation of accuracy score)
    """
    # sorted copy, from the lowest to the highest : the caller's list is not reordered in place
    n_samples_list = sorted(n_samples_list)
    view_names = ['view'+str(i) for i in range(n_views)]
    score_names = ['latent space'] + view_names + ['early fusion', 'late fusion']
    # mean and standard deviation of accuracy scores, one value appended per n_samples
    dict_scores_means = {name: [] for name in score_names}
    dict_scores_std = {name: [] for name in score_names}
    # percentage of unsued columns of latent space in views, one value per n_samples
    unsued_dimensions_percent_list = []
    # number of informative features of latent space, one value per n_samples
    n_informative_list = []

    for n_samples in n_samples_list:
        Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
        unsued_dimensions_percent_list.append(unsued_dimensions_percent)
        n_informative_list.append(n_informative)

        # multiviews_list[i] = (data of view i, column numbers of view i)
        dict_views = {name: multiviews_list[i][0] for i, name in enumerate(view_names)}
        # data to score with cross-validation : latent space, each view, concatenation of all views
        dict_data = {'latent space': Z}
        dict_data.update(dict_views)
        dict_data['early fusion'] = np.concatenate(list(dict_views.values()), axis=1)

        for key, data in dict_data.items():
            clf = classifier_dictionary[classifier]
            score = cross_val_score(clf, data, y, scoring='accuracy', cv=cv)
            dict_scores_means[key].append(score.mean())
            dict_scores_std[key].append(score.std())

        mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
        dict_scores_means['late fusion'].append(mean_majority)
        dict_scores_std['late fusion'].append(std_majority)

    df_scores_means = pd.DataFrame(dict_scores_means, index=n_samples_list)
    df_scores_std = pd.DataFrame(dict_scores_std, index=n_samples_list)

    plt.figure()
    for key in dict_scores_means:
        plt.errorbar(n_samples_list, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key)
    # x tick labels : number of samples / percentage of unsued dimensions / number of informative features
    label_index = [str(n_samples)+'\n'+str(percent)+'\n'+str(n_informative)
                   for n_samples, percent, n_informative in zip(n_samples_list, unsued_dimensions_percent_list, n_informative_list)]
    plt.xticks(n_samples_list, label_index, fontsize='medium', multialignment='center')  # new x indexes
    plt.xlabel("Number of samples\nPercentage of dimensions of latent space unsued in views\nNumber of informative features")
    plt.ylabel("Accuracy score for "+classifier)
    plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0)
    plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nAccuracy score vs number of samples for classifier "+classifier)
    plt.savefig(path_graph+"score_samples_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight')
    plt.show()
    plt.close()

    return df_scores_means, df_scores_std
def graph_comparaison_classifier_scores_n_samples(classifier1, classifier2, n_samples_list, path_graph, cv=10, classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_views=3, n_classes=2, Z_factor=1, R=2/3, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2):
    """
    Runs score_multiviews_n_samples once per classifier, then creates, saves (at the indicated path path_graph) and shows
    one scatter graph per column of its result : scores of classifier2 vs scores of classifier1

    NOTE(review): each score_multiviews_n_samples call generates its own datasets, so the two classifiers
    do not appear to be scored on the same data - confirm this is intended

    Parameters:
    -----------
    classifier1 : str
    classifier2 : str
    n_samples_list : list
                     each element from n_samples_list defines a new dataset with element samples
    path_graph : str
                 path to save graphics
    cv : int
    classifier_dictionary : dict
    n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset

    Returns:
    --------
    None
    """
    generator_parameters = (n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
    df_scores_clf1_means, df_scores_clf1_std = score_multiviews_n_samples(n_samples_list, path_graph, cv, classifier1, classifier_dictionary, *generator_parameters)
    df_scores_clf2_means, df_scores_clf2_std = score_multiviews_n_samples(n_samples_list, path_graph, cv, classifier2, classifier_dictionary, *generator_parameters)

    # one graph per column : latent space, each view, early fusion, late fusion
    for key in df_scores_clf1_means.columns:
        plt.figure()
        scores_clf1 = df_scores_clf1_means[key].values
        scores_clf2 = df_scores_clf2_means[key].values
        # points are coloured by the score of classifier1
        plt.scatter(scores_clf1, scores_clf2, c=scores_clf1)
        plt.plot([0.0, 1.1], [0.0, 1.1], "--", c=".7")  # diagonal
        plt.xlabel("Accuracy score for "+classifier1)
        plt.ylabel("Accuracy score for "+classifier2)
        plt.xlim(0, 1)
        plt.ylim(0, 1)
        plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+" - number of classes = "+str(n_classes)+"\nAccuracy score of "+key+" for "+classifier2+" vs "+classifier1)
        plt.savefig(path_graph+classifier1+"_"+classifier2+"_"+str(n_views)+"_"+key+".png")
        plt.show()
        plt.close()
def score_multiviews_R(R_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_samples=1000, n_views=3, n_classes=2, Z_factor=1, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2):
    """
    Generates one multiviews dataset per element of R_list and scores the indicated classifier on the latent space, on each view,
    on the early fusion (concatenation of all views) and on the late fusion (majority of views predictions)
    Creates, saves (at the indicated path path_graph) and shows a graph of accuracy score (error bars = 1.96*standard deviation/sqrt(cv)) vs R

    Parameters:
    -----------
    R_list : list
             each element from R_list defines a new dataset with element as R
             it is read in increasing order, the list given by the caller is not modified
    path_graph : str
                 path to save graphics
    cv : int
    classifier : str
    classifier_dictionary : dict
    n_samples, n_views, n_classes, Z_factor, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset

    Returns:
    --------
    2 DataFrames with the sorted elements of R_list as index (first with the mean of accuracy score, second with the standard deviation of accuracy score)
    """
    # sorted copy, from the lowest to the highest : the caller's list is not reordered in place
    R_list = sorted(R_list)
    view_names = ['view'+str(i) for i in range(n_views)]
    score_names = ['latent space'] + view_names + ['early fusion', 'late fusion']
    # mean and standard deviation of accuracy scores, one value appended per R
    dict_scores_means = {name: [] for name in score_names}
    dict_scores_std = {name: [] for name in score_names}
    # percentage of unsued columns of latent space in views, one value per R
    unsued_dimensions_percent_list = []
    # number of informative features of latent space, one value per R
    n_informative_list = []

    for R in R_list:
        Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
        unsued_dimensions_percent_list.append(unsued_dimensions_percent)
        n_informative_list.append(n_informative)

        # multiviews_list[i] = (data of view i, column numbers of view i)
        dict_views = {name: multiviews_list[i][0] for i, name in enumerate(view_names)}
        # data to score with cross-validation : latent space, each view, concatenation of all views
        dict_data_df = {'latent space': Z}
        dict_data_df.update(dict_views)
        dict_data_df['early fusion'] = np.concatenate(list(dict_views.values()), axis=1)

        for key, data in dict_data_df.items():
            clf = classifier_dictionary[classifier]
            score = cross_val_score(clf, data, y, scoring='accuracy', cv=cv)
            dict_scores_means[key].append(score.mean())
            dict_scores_std[key].append(score.std())

        mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
        dict_scores_means['late fusion'].append(mean_majority)
        dict_scores_std['late fusion'].append(std_majority)

    df_scores_means = pd.DataFrame(dict_scores_means, index=R_list)
    df_scores_std = pd.DataFrame(dict_scores_std, index=R_list)

    plt.figure()
    for key in dict_scores_means:
        plt.errorbar(R_list, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key)
    # x tick labels (one value of R out of 4) : R / percentage of unsued dimensions / number of informative features
    tick_indexes = range(0, len(R_list), 4)
    R_label = [R_list[i] for i in tick_indexes]
    label_index = [str(round(R_list[i], 2))+'\n'+str(unsued_dimensions_percent_list[i])+'\n'+str(n_informative_list[i])
                   for i in tick_indexes]
    plt.xticks(R_label, label_index, fontsize='medium', multialignment='center')  # new x indexes
    plt.xlabel("R\nPercentage of dimensions of latent space unsued in views\nNumber of informative features")
    plt.ylabel("Accuracy score for "+classifier)
    plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0)
    plt.title("number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nAccuracy score vs R for classifier "+classifier)
    plt.savefig(path_graph+"score_R_"+str(n_views)+"_"+str(n_samples)+"_"+str(Z_factor)+"_"+classifier+".png", bbox_inches='tight')
    plt.show()
    plt.close()

    return df_scores_means, df_scores_std
def _letter_labels(count):
    """
    Returns a list of count short labels: 'a', 'b', ..., 'z', then 'aa', 'ab', ... when more than 26 labels are needed
    """
    labels = []
    for position in range(1, count + 1):
        remaining = position
        label = ""
        while remaining:
            remaining, offset = divmod(remaining - 1, 26)
            label = chr(ord('a') + offset) + label
        labels.append(label)
    return labels


def _plot_Z_factor_curves(x_values, curves, tick_labels, side_text, x_label, y_label, title, file_name):
    """
    Draws one figure with a logarithmic x axis, saves it to file_name, shows it and closes it

    Parameters:
    -----------
    x_values : array-like, x coordinates shared by all curves (also used as x tick positions)
    curves : list of (label, y values) pairs, one per curve
    tick_labels : list of str, labels replacing the x tick values
    side_text : str, text written on the right of the axes (description of each tick label)
    x_label, y_label, title : str
    file_name : str, path of the saved picture
    """
    plt.figure(figsize=(10, 10))
    for curve_label, y_values in curves:
        plt.semilogx(x_values, y_values, '.-', label=curve_label)
    plt.xticks(x_values, tick_labels, fontsize='medium', multialignment='center')  # new x indexes
    # description of the tick labels, written just right of the axes at mid height
    y_low, y_high = plt.ylim()
    plt.text(plt.xlim()[1]+0.05, y_high-(y_high-y_low)/2, side_text)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0)
    plt.title(title)
    plt.savefig(file_name, bbox_inches='tight')
    plt.show()
    plt.close()


def score_multiviews_Z_factor(Z_factor_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_samples=1000, n_views=3, n_classes=2, R=2/3, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2):
    """
    Generates one multiviews dataset per value of Z_factor_list (from the highest to the lowest value) and scores, by cross-validation
    with the indicated classifier, the latent space, each view, the early fusion (concatenation of all views) and the late fusion
    (score returned by majority_score)
    Creates, saves (at the indicated path path_graph) and shows 4 graphs with V/Z = sum of views dimension divided by latent space
    dimension as x axis : accuracy score, error rate (1 - accuracy score), ratio early fusion score / score of each view and
    ratio late fusion score / score of each view
    Z_factor_list is not modified

    Parameters:
    -----------
    Z_factor_list : list
                    each element from Z_factor_list defines a new dataset with element as Z_factor
    path_graph : str
                 path to save graphics
    cv : int
    classifier : str
                 key of classifier_dictionary
    classifier_dictionary : dict
    n_samples, n_views, n_classes, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset

    Returns:
    --------
    3 DataFrames (mean of accuracy score, standard deviation of accuracy score, error rate) with V/Z as index and
    'latent space', 'view0', ..., 'early fusion', 'late fusion' as columns
    """
    # values of Z_factor from the highest to the lowest (sorted copy : the list of the caller stays as it is)
    sorted_Z_factors = sorted(Z_factor_list, reverse=True)
    clf = classifier_dictionary[classifier]
    view_names = ['view'+str(i) for i in range(n_views)]
    score_names = ['latent space'] + view_names + ['early fusion', 'late fusion']
    # list of sum of views dimension for each Z_factor
    d_v = []
    # list of Z dimension for each Z_factor
    Z_dim_list = []
    # list of percentage of unsued columns of latent space in views
    unsued_dimensions_percent_list = []
    # list of number of informative features of latent space
    n_informative_list = []
    # mean of accuracy scores, error rate and standard deviation of accuracy scores per Z_factor
    dict_scores_means = {name: [] for name in score_names}
    dict_scores_error = {name: [] for name in score_names}
    dict_scores_std = {name: [] for name in score_names}
    # dictionary contains data of each view
    dict_views = {name: 0 for name in view_names}

    for Z_factor in sorted_Z_factors:
        Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
        unsued_dimensions_percent_list.append(unsued_dimensions_percent)
        n_informative_list.append(n_informative)
        for i, name in enumerate(view_names):
            # multiviews_list[i] = (columns / data of view i, numeros of columns of view i)
            dict_views[name] = multiviews_list[i][0]
        early_fusion = np.concatenate([dict_views[name] for name in view_names], axis=1)  # = concatenation of all views
        # dimension = number of columns
        d_v.append(early_fusion.shape[1])
        Z_dim_list.append(Z.shape[1])
        # dictionary of data
        dict_data_df = {'latent space': Z}
        dict_data_df.update({name: dict_views[name] for name in view_names})
        dict_data_df['early fusion'] = early_fusion
        for key, data in dict_data_df.items():
            score = cross_val_score(clf, data, y, scoring='accuracy', cv=cv)
            dict_scores_means[key].append(score.mean())
            dict_scores_error[key].append(1 - score.mean())
            dict_scores_std[key].append(score.std())
        mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
        dict_scores_means['late fusion'].append(mean_majority)
        dict_scores_error['late fusion'].append(1 - mean_majority)
        dict_scores_std['late fusion'].append(std_majority)

    d_v_divid_Z = np.divide(np.array(d_v), np.array(Z_dim_list))
    df_scores_means = pd.DataFrame(dict_scores_means, index=d_v_divid_Z)
    df_scores_error = pd.DataFrame(dict_scores_error, index=d_v_divid_Z)
    df_scores_std = pd.DataFrame(dict_scores_std, index=d_v_divid_Z)

    # index and label for graphics : one short label per dataset, described in the text on the right of the graph
    label_index = _letter_labels(len(d_v))
    label_value = ""
    for label, v_Z, dim_v, dim_Z, Z_factor, percent, n_informative in zip(label_index, d_v_divid_Z, d_v, Z_dim_list, sorted_Z_factors, unsued_dimensions_percent_list, n_informative_list):
        label_value = label_value + label+" : V/Z = "+str(round(v_Z, 4))+", V = "+str(dim_v)+", Z = "+str(dim_Z)+", Z_factor = "+str(Z_factor)+", % ="+str(percent)+", n_informative = "+str(n_informative)+'\n'
    x_label = "V/Z = sum of views dimension divided by latent space dimension with :\nV = sum of views dimension\nZ = latent space dimension multiplied by Z_factor\n% = percentage of dimensions of latent space unsued in views\nn_informative = number of informative features"

    # accuracy score vs d_v_divid_Z
    _plot_Z_factor_curves(
        d_v_divid_Z,
        [(key, dict_scores_means[key]) for key in dict_scores_means],
        label_index, label_value, x_label,
        "Accuracy score for "+classifier,
        "number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nR = "+str(round(R, 4))+" - number of classes = "+str(n_classes)+"\nAccuracy score vs ratio sum of views dimension / latent space dimension for classifier "+classifier,
        path_graph+"score_Z_factor_"+str(n_views)+"_"+str(n_samples)+"_"+classifier+".png",
    )

    # error rate vs d_v_divid_Z
    _plot_Z_factor_curves(
        d_v_divid_Z,
        [(key, dict_scores_error[key]) for key in dict_scores_means],
        label_index, label_value, x_label,
        "Error rate for "+classifier,
        "number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nR = "+str(round(R, 4))+" - number of classes = "+str(n_classes)+"\nError rate vs ratio sum of views dimension / latent space dimension for classifier "+classifier,
        path_graph+"error_Z_factor_"+str(n_views)+"_"+str(n_samples)+"_"+classifier+".png",
    )

    # accuracy score of early / late fusion divided by accuracy score of each view vs d_v_divid_Z
    # NOTE(review): the file name tags look swapped (early fusion concatenates all views but is saved as
    # "majority_view", late fusion is saved as "all_view") - kept as they are so existing file names do not change
    for fusion, file_tag in (('early fusion', "majority_view"), ('late fusion', "all_view")):
        _plot_Z_factor_curves(
            d_v_divid_Z,
            [(fusion+' score divided by '+view+' score', dict_scores_means[fusion]/df_scores_means[view]) for view in view_names],
            label_index, label_value, x_label,
            "Ratio accuracy score for "+fusion+" / accuracy score for each view for "+classifier,
            "number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nRatio accuracy score for "+fusion+" / accuracy score for each view \nvs ratio sum of views dimension / latent space dimension for classifier "+classifier,
            path_graph+"score_Z_factor_"+file_tag+"_divid_"+str(n_views)+"_"+classifier+".png",
        )

    return df_scores_means, df_scores_std, df_scores_error
def score_multiviews_n_views_R(n_views_list, R_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_samples=1000, n_classes=2, Z_factor=1, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2):
    """
    For each value of n_views_list (from the lowest to the highest) and each value of R_list (from the lowest to the highest),
    generates one multiviews dataset and scores, by cross-validation with the indicated classifier, each view, the early fusion
    (concatenation of all views) and the late fusion (score returned by majority_score)
    Every mean of accuracy score is divided by the mean of accuracy score obtained for the highest value of R_list
    (expected to be R = 1 <> redundancy null)
    Creates, saves (at the indicated path path_graph) and shows one graph per value of n_views_list (ratio vs R) and
    one 3D graph (ratio vs R and number of views)
    n_views_list and R_list are not modified

    Parameters:
    -----------
    n_views_list : list
                   each element from n_views_list defines a new dataset with element as n_views
    R_list : list
             each element from R_list defines a new dataset with element as R
    path_graph : str
                 path to save graphics
    cv : int
    classifier : str
                 key of classifier_dictionary
    classifier_dictionary : dict
    n_samples, n_classes, Z_factor, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset

    Returns:
    --------
    a dictionary with the values of n_views_list as keys and, per key, a DataFrame of ratios (accuracy score divided by
    accuracy score for the highest R) with the sorted values of R_list as index and 'view0', ..., 'early fusion', 'late fusion' as columns
    """
    # keys are kept in the order given by the caller
    dict_n_views_R_ratio = {key: 0 for key in n_views_list}
    # sorted copies, computed once : the lists of the caller stay as they are
    sorted_n_views = sorted(n_views_list)
    sorted_R = sorted(R_list)
    max_n_views = sorted_n_views[-1]
    clf = classifier_dictionary[classifier]
    # same views have same colors on each graphs ; colors are reused cyclically when there are more views than colors
    colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
    dict_colors = {'view'+str(i): colors[i % len(colors)] for i in range(max_n_views)}
    dict_colors.update({'early fusion': 'purple', 'late fusion': 'maroon'})

    for n_views in sorted_n_views:
        view_names = ['view'+str(i) for i in range(n_views)]
        # list of percentage of unsued columns of latent space in views
        unsued_dimensions_percent_list = []
        # list of number of informative features of latent space
        n_informative_list = []
        # dictionary contains mean of accuracy scores per R
        dict_scores_means = {name: [] for name in view_names + ['early fusion', 'late fusion']}
        # dictionary contains data of each view
        dict_views = {name: 0 for name in view_names}

        for R in sorted_R:
            Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
            unsued_dimensions_percent_list.append(unsued_dimensions_percent)
            n_informative_list.append(n_informative)
            for i, name in enumerate(view_names):
                # multiviews_list[i] = (columns / data of view i, numeros of columns of view i)
                dict_views[name] = multiviews_list[i][0]
            early_fusion = np.concatenate([dict_views[name] for name in view_names], axis=1)  # = concatenation of all views
            # dictionary of data
            dict_data_df = {name: dict_views[name] for name in view_names}
            dict_data_df['early fusion'] = early_fusion
            for key, data in dict_data_df.items():
                score = cross_val_score(clf, data, y, scoring='accuracy', cv=cv)
                dict_scores_means[key].append(score.mean())
            # only the mean of the late fusion score is used here
            mean_majority, _ = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
            dict_scores_means['late fusion'].append(mean_majority)

        # scores divided by the score of the last (= highest) R, expected to be R = 1 (<> redundancy null)
        dict_scores_ratio_R_1 = {key: np.divide(np.array(scores), scores[-1]) for key, scores in dict_scores_means.items()}
        df_scores_ratio_R_1 = pd.DataFrame(dict_scores_ratio_R_1, index=sorted_R)

        plt.figure()
        for key in dict_scores_means:
            plt.plot(sorted_R, dict_scores_ratio_R_1[key], '.-', color=dict_colors[key], label=key)
        # index and label for graphic : one x tick every 4 values of R
        R_label = sorted_R[::4]
        label_index = [str(round(r_value, 2))+'\n'+str(percent)+'\n'+str(informative) for r_value, percent, informative in zip(R_label, unsued_dimensions_percent_list[::4], n_informative_list[::4])]
        plt.xticks(R_label, label_index, fontsize='medium', multialignment='center')  # new x indexes
        plt.xlabel("R\nPercentage of dimensions of latent space unsued in views\nNumber of informative features")
        plt.ylabel("Ratio accuracy score / accuracy score for R = 1 for "+classifier)
        plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0)
        plt.title("number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nRatio accuracy score / accuracy score for R = 1\n(redundancy null) vs R for classifier "+classifier)
        plt.savefig(path_graph+"score_R_divid_R_1_"+str(n_views)+"_"+str(n_samples)+"_"+str(Z_factor)+"_"+classifier+".png", bbox_inches='tight')
        plt.show()
        plt.close()

        dict_n_views_R_ratio[n_views] = df_scores_ratio_R_1

    # 3D graph : ratio vs R, one "slice" per number of views
    plt.figure()
    ax = plt.axes(projection="3d")
    for n_views in sorted_n_views:
        for key in dict_n_views_R_ratio[n_views].keys():
            plot_options = {'color': dict_colors[key]}
            if n_views == max_n_views:  # print legends only once (the highest number of views has every curve)
                plot_options['label'] = key
            ax.plot(sorted_R, dict_n_views_R_ratio[n_views][key], n_views, **plot_options)
    ax.set_xlabel("R")
    ax.set_ylabel("Ratio accuracy score / accuracy score for R = 1 for "+classifier)
    ax.set_zlabel("Number of views")
    plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0)
    plt.title("number of samples = "+str(n_samples)+" - factor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nRatio accuracy score / accuracy score for R = 1 (redundancy null) vs R, number of views for classifier "+classifier)
    plt.savefig(path_graph+"score_R_divid_R_1_all_n_views"+"_"+str(n_samples)+"_"+str(Z_factor)+"_"+classifier+".png", bbox_inches='tight')
    plt.show()
    plt.close()

    return dict_n_views_R_ratio
def score_multiviews_class_sep(class_sep_factor_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_views=3, n_samples=1000, n_classes=2, Z_factor=1, R=2/3, n_clusters_per_class=2, n_informative_divid=1, d=4, D=10, standard_deviation=2):
    """
    Generates one multiviews dataset per value of class_sep_factor_list and scores, by cross-validation with the indicated
    classifier, the latent space, each view, the early fusion (concatenation of all views) and the late fusion
    (score returned by majority_score)
    Creates, saves (at the indicated path path_graph) and shows a graph represented accuracy score
    (with error bars of 1.96 * standard deviation / sqrt(cv)) vs class_sep_factor_list

    Parameters:
    -----------
    class_sep_factor_list : list
                            each element from class_sep_factor_list defines a new dataset with element as class_sep_factor
    path_graph : str
                 path to save graphics
    cv : int
    classifier : str
                 key of classifier_dictionary
    classifier_dictionary : dict
    n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset

    Returns:
    --------
    2 DataFrames (mean of accuracy score, standard deviation of accuracy score) with class_sep_factor_list as index and
    'latent space', 'view0', ..., 'early fusion', 'late fusion' as columns
    """
    clf = classifier_dictionary[classifier]
    view_names = ['view'+str(i) for i in range(n_views)]
    score_names = ['latent space'] + view_names + ['early fusion', 'late fusion']
    # list of percentage of unsued columns of latent space in views
    unsued_dimensions_percent_list = []
    # list of number of informative features of latent space
    n_informative_list = []
    # mean and standard deviation of accuracy scores per class_sep_factor
    dict_scores_means = {name: [] for name in score_names}
    dict_scores_std = {name: [] for name in score_names}
    # dictionary contains data of each view
    dict_views = {name: 0 for name in view_names}

    for class_sep_factor in class_sep_factor_list:
        Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
        unsued_dimensions_percent_list.append(unsued_dimensions_percent)
        n_informative_list.append(n_informative)
        for i, name in enumerate(view_names):
            # multiviews_list[i] = (columns / data of view i, numeros of columns of view i)
            dict_views[name] = multiviews_list[i][0]
        early_fusion = np.concatenate([dict_views[name] for name in view_names], axis=1)  # = concatenation of all views
        # dictionary of data
        dict_data = {'latent space': Z}
        dict_data.update({name: dict_views[name] for name in view_names})
        dict_data['early fusion'] = early_fusion
        for key, data in dict_data.items():
            score = cross_val_score(clf, data, y, scoring='accuracy', cv=cv)
            dict_scores_means[key].append(score.mean())
            dict_scores_std[key].append(score.std())
        mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
        dict_scores_means['late fusion'].append(mean_majority)
        dict_scores_std['late fusion'].append(std_majority)

    df_scores_means = pd.DataFrame(dict_scores_means, index=class_sep_factor_list)
    df_scores_std = pd.DataFrame(dict_scores_std, index=class_sep_factor_list)

    plt.figure()
    for key in dict_scores_means:
        plt.errorbar(class_sep_factor_list, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key)
    # index and label for graphic
    label_index = [str(factor)+'\n'+str(percent)+'\n'+str(informative) for factor, percent, informative in zip(class_sep_factor_list, unsued_dimensions_percent_list, n_informative_list)]
    plt.xticks(class_sep_factor_list, label_index, fontsize='medium', multialignment='center')  # new x indexes
    plt.xlabel("Factor (class_sep = factor*n_clusters_per_class)\nPercentage of dimensions of latent space unsued in views\nNumber of informative features")
    plt.ylabel("Accuracy score for "+classifier)
    plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0)
    plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nAccuracy score vs factor of class_sep for classifier "+classifier)
    plt.savefig(path_graph+"score_class_sep_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight')
    plt.show()
    plt.close()

    return df_scores_means, df_scores_std
def score_multiviews_n_informative_divided(n_informative_divid_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_views=3, n_samples=1000, n_classes=2, Z_factor=1, R=2/3, n_clusters_per_class=2, class_sep_factor=2, d=4, D=10, standard_deviation=2):
    """
    Generates one multiviews dataset per value of n_informative_divid_list and scores, by cross-validation with the indicated
    classifier, the latent space, each view, the early fusion (concatenation of all views) and the late fusion
    (score returned by majority_score)
    Creates, saves (at the indicated path path_graph) and shows a graph represented accuracy score
    (with error bars of 1.96 * standard deviation / sqrt(cv)) vs n_informative_divid_list

    Parameters:
    -----------
    n_informative_divid_list : list
                               each element from n_informative_divid_list defines a new dataset with element as n_informative_divid
    path_graph : str
                 path to save graphics
    cv : int
    classifier : str
                 key of classifier_dictionary
    classifier_dictionary : dict
    n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, d, D, standard_deviation : parameters of generator_multiviews_dataset

    Returns:
    --------
    2 DataFrames (mean of accuracy score, standard deviation of accuracy score) with n_informative_divid_list as index and
    'latent space', 'view0', ..., 'early fusion', 'late fusion' as columns
    """
    view_names = ['view'+str(index) for index in range(n_views)]
    column_names = ['latent space'] + view_names + ['early fusion', 'late fusion']
    # percentage of unsued columns of latent space in views, one value per dataset
    percent_per_dataset = []
    # number of informative features of latent space, one value per dataset
    informative_per_dataset = []
    # mean and standard deviation of accuracy scores, one list per column
    means_by_column = {name: [] for name in column_names}
    stds_by_column = {name: [] for name in column_names}
    # data of each view (filled again for every dataset)
    dict_views = {name: 0 for name in view_names}

    for n_informative_divid in n_informative_divid_list:
        Z, y, multiviews_list, percent, informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
        percent_per_dataset.append(percent)
        informative_per_dataset.append(informative)
        # multiviews_list[i] = (columns / data of view i, numeros of columns of view i)
        for index, name in enumerate(view_names):
            dict_views[name] = multiviews_list[index][0]
        # early fusion = concatenation of all views
        early_fusion = np.concatenate([dict_views[name] for name in dict_views], axis=1)
        # data to score : latent space first, then each view, then early fusion
        dict_data = {'latent space': Z}
        for name in view_names:
            dict_data[name] = dict_views[name]
        dict_data['early fusion'] = early_fusion
        for name, data in dict_data.items():
            fold_scores = cross_val_score(classifier_dictionary[classifier], data, y, scoring='accuracy', cv=cv)
            means_by_column[name].append(fold_scores.mean())
            stds_by_column[name].append(fold_scores.std())
        late_mean, late_std = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
        means_by_column['late fusion'].append(late_mean)
        stds_by_column['late fusion'].append(late_std)

    df_scores_means = pd.DataFrame(means_by_column, index=n_informative_divid_list)
    df_scores_std = pd.DataFrame(stds_by_column, index=n_informative_divid_list)

    plt.figure()
    for name, means in means_by_column.items():
        half_width = 1.96*np.array(stds_by_column[name])/sqrt(cv)
        plt.errorbar(n_informative_divid_list, means, half_width, label=name)
    # index and label for graphic : value of the list, percentage and number of informative features under each tick
    tick_labels = []
    for divid, percent, informative in zip(n_informative_divid_list, percent_per_dataset, informative_per_dataset):
        tick_labels.append('\n'.join([str(divid), str(percent), str(informative)]))
    plt.xticks(n_informative_divid_list, tick_labels, fontsize='medium', multialignment='center')  # new x indexes
    plt.xlabel("Factor (n_informative = dimension of latent space / factor)\nPercentage of dimensions of latent space unsued in views\nNumber of informative features")
    plt.ylabel("Accuracy score for "+classifier)
    plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0)
    plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nAccuracy score vs n_informative_divid for classifier "+classifier)
    plt.savefig(path_graph+"score_n_informative_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight')
    plt.show()
    plt.close()

    return df_scores_means, df_scores_std
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment