#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 28 14:14:46 2019

@author: bernardet
"""

from multiviews_datasets import generator_multiviews_dataset, results_to_csv
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
from collections import Counter
from mpl_toolkits.mplot3d import Axes3D
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


def majority_list(predictions_list):
    """
    Returns an array containing, for each sample, the majority class among the corresponding predictions of predictions_list

    Parameters:
    -----------
    predictions_list : list of 1D arrays

    Returns:
    --------
    a 1D array
    """
    n_samples = len(predictions_list[0])
    # majority_prediction[i] = prediction appearing most often among the views' predictions for sample i
    majority_prediction = np.array([-1]*n_samples)
    # concatenate_predictions_list[i] = predictions for the i-th sample, one per view
    reshape_predictions_list = [predictions_list[i].reshape(len(predictions_list[i]), 1) for i in range(len(predictions_list))]
    concatenate_predictions_list = np.hstack(reshape_predictions_list)

    for sample in range(n_samples):
        # dictionary mapping each prediction (key) to its number of occurrences in concatenate_predictions_list[sample]
        count = Counter(concatenate_predictions_list[sample])
        maj_value = max(count.values())  # maximal number of occurrences of a prediction
        for key in count.keys():  # searches for the prediction with the maximal number of occurrences
            if count[key] == maj_value:
                majority_prediction[sample] = key
                break

    return majority_prediction


def majority_score(views_dictionary, integer_labels, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}):
    """
    Returns the mean and the standard deviation of the accuracy score when predictions are obtained by a majority vote over the predictions of the different views

    Parameters:
    -----------
    views_dictionary : dict
    integer_labels : array
    cv : int
    classifier : str
    classifier_dictionary : dict

    Returns:
    --------
    Two floats
    """
    skf = StratifiedKFold(n_splits=cv, random_state=1, shuffle=True)  # provides cv train/test indices to split data in cv train/test sets
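    # prediction_list[i] collects one prediction array per view for fold i, so that
    # majority_list can then vote across views within each fold; test_list[i] keeps
    # the matching ground-truth labels (identical for every view of a given fold)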
    prediction_list = [[] for i in range(cv)]  # for majority_list function
    test_list = [[] for i in range(cv)]  # for score

    for key in views_dictionary.keys():
        i = 0
        for train_index, test_index in skf.split(views_dictionary[key], integer_labels):
            # splits data and integer labels of one view into train and test sets
            X = views_dictionary[key]
            train, test = X[train_index], X[test_index]
            y_train, y_test = integer_labels[train_index], integer_labels[test_index]
            # trains the classifier and tests it on the test set
            clf = classifier_dictionary[classifier]
            clf.fit(train, y_train.ravel())
            y_pred = clf.predict(test)
            prediction_list[i].append(y_pred)
            if len(test_list[i]) == 0:  # same y_test for all views
                test_list[i] = y_test
            i += 1

    score = []
    for i in range(len(prediction_list)):
        y_pred_majority = majority_list(prediction_list[i])  # majority vote over the views' predictions
        score.append(accuracy_score(test_list[i].ravel(), y_pred_majority))  # score of the majority vote vs expected labels

    score = np.array(score)
    return score.mean(), score.std()


def score_one_multiview_dataset(cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_samples=1000, n_views=3, n_classes=2, Z_factor=1, R=2/3, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2):
    """
    Returns 3 Series: the first with the dimensions of the latent space and of the views and the percentage of dimensions of the latent space unused in the views, the second with the mean accuracy scores and the third with the standard deviations of the accuracy scores, for the latent space, each view, early fusion (classification of the concatenated views) and late fusion (majority vote over the views' predictions)

    Parameters:
    -----------
    cv : int
    classifier : str
    classifier_dictionary : dict
    n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset

    Returns:
    --------
    3 Series
    """
    # dictionary containing the percentage of unused dimensions of the latent space and the dimensions of the latent space and of the views
    dimensions = {'unused dimension of latent space':0, "number of informative features":0, 'latent space':0}
    dimensions.update({'view'+str(i):0 for i in range(n_views)})
    # dictionary containing the mean of the accuracy scores
    dict_scores_means = {'latent space':0}
    dict_scores_means.update({'view'+str(i):0 for i in range(n_views)})
    dict_scores_means.update({'early fusion':0, 'late fusion':0})
    # dictionary containing the standard deviation of the accuracy scores
    dict_scores_std = {'latent space':[]}
    dict_scores_std.update({'view'+str(i):[] for i in range(n_views)})
    dict_scores_std.update({'early fusion':[], 'late fusion':[]})
    # dictionary containing the data of each view
    dict_views = {'view'+str(i):0 for i in range(n_views)}

    Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
    dimensions["unused dimension of latent space"] = unsued_dimensions_percent
    dimensions["number of informative features"] = n_informative
    dimensions["latent space"] = Z.shape

    for i in range(n_views):
        # multiviews_list[i] = (data/columns of view i, indices of the columns of view i)
        dict_views['view'+str(i)] = multiviews_list[i][0]
        dimensions['view'+str(i)] = multiviews_list[i][0].shape

    early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1)  # concatenation of all views
    # dictionary of data
    dict_data_df = {'latent space':Z}
    dict_data_df.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)})
    dict_data_df.update({'early fusion':early_fusion})

    for key in dict_data_df.keys():
        clf = classifier_dictionary[classifier]
        score = cross_val_score(clf, dict_data_df[key], y, scoring='accuracy', cv=cv)
        dict_scores_means[key] = score.mean()
        dict_scores_std[key] = score.std()

    mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
    dict_scores_means['late fusion'] = mean_majority
    dict_scores_std['late fusion'] = std_majority

    df_dimensions = pd.Series(dimensions)
    df_scores_means = pd.Series(dict_scores_means)
    df_scores_std = pd.Series(dict_scores_std)

    return df_dimensions, df_scores_means, df_scores_std


def score_multiviews_n_samples(n_samples_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_views=3, n_classes=2, Z_factor=1, R=2/3, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2):
    """
    Returns 2 DataFrames (the first with the mean accuracy scores, the second with the standard deviations of the accuracy scores) for the latent space, each view, early fusion (classification of the concatenated views) and late fusion (majority vote over the views' predictions), with n_samples_list as index, for the indicated classifier
    Creates and saves (at the indicated path path_graph) a graph representing the accuracy score (with confidence interval) vs n_samples_list

    Parameters:
    -----------
    n_samples_list : list
        each element of n_samples_list defines a new dataset with that number of samples
    path_graph : str
        path where graphics are saved
    cv : int
    classifier : str
    classifier_dictionary : dict
    n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset

    Returns:
    --------
    2 DataFrames with n_samples_list as index
    """
    # n_samples_list = list of numbers of samples from the lowest to the highest
    n_samples_list.sort(reverse=False)
    # list of percentages of unused columns of the latent space in the views
    unsued_dimensions_percent_list = []
    # list of numbers of informative features of the latent space
    n_informative_list = []
    # dictionary containing the mean of the accuracy scores per n_samples
    dict_scores_means = {'latent space':[]}
    dict_scores_means.update({'view'+str(i):[] for i in range(n_views)})
    dict_scores_means.update({'early fusion':[], 'late fusion':[]})
    # dictionary containing the standard deviation of the accuracy scores per n_samples
    dict_scores_std = {'latent space':[]}
    dict_scores_std.update({'view'+str(i):[] for i in range(n_views)})
    dict_scores_std.update({'early fusion':[], 'late fusion':[]})
    # dictionary containing the data of each view
    dict_views = {'view'+str(i):0 for i in range(n_views)}

    for n_samples in n_samples_list:
        Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
        unsued_dimensions_percent_list.append(unsued_dimensions_percent)
        n_informative_list.append(n_informative)

        for i in range(n_views):
            # multiviews_list[i] = (data/columns of view i, indices of the columns of view i)
            dict_views['view'+str(i)] = multiviews_list[i][0]

        early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1)  # concatenation of all views
        # dictionary of data
        dict_data = {'latent space':Z}
        dict_data.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)})
        dict_data.update({'early fusion':early_fusion})

        for key in dict_data.keys():
            clf = classifier_dictionary[classifier]
            score = cross_val_score(clf, dict_data[key], y, scoring='accuracy', cv=cv)
            dict_scores_means[key].append(score.mean())
            dict_scores_std[key].append(score.std())

        mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
        dict_scores_means['late fusion'].append(mean_majority)
        dict_scores_std['late fusion'].append(std_majority)

    df_scores_means = pd.DataFrame(dict_scores_means, index=n_samples_list)
    df_scores_std = pd.DataFrame(dict_scores_std, index=n_samples_list)

    plt.figure()
    for key in dict_scores_means.keys():
        plt.errorbar(n_samples_list, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key)
    # index and labels for the graphic
    label_index = []
    for n_samples, percent, n_informative in zip(n_samples_list, unsued_dimensions_percent_list, n_informative_list):
        label_index.append(str(n_samples)+'\n'+str(percent)+'\n'+str(n_informative))
    plt.xticks(n_samples_list, label_index, fontsize='medium', multialignment='center')  # new x indexes
    plt.xlabel("Number of samples\nPercentage of dimensions of latent space unused in views\nNumber of informative features")
    plt.ylabel("Accuracy score for "+classifier)
    plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0)
    plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nAccuracy score vs number of samples for classifier "+classifier)
    plt.savefig(path_graph+"score_samples_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight')
    plt.show()
    plt.close()

    return df_scores_means, df_scores_std


def graph_comparaison_classifier_scores_n_samples(classifier1, classifier2, n_samples_list, path_graph, cv=10, classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_views=3, n_classes=2, Z_factor=1, R=2/3, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2):
    """
    Creates and saves (at the indicated path path_graph) multiple graphs representing the scores of classifier2 vs the scores of classifier1 (one graph per column of the result of score_multiviews_n_samples)

    Parameters:
    -----------
    classifier1 : str
    classifier2 : str
    n_samples_list : list
        each element of n_samples_list defines a new dataset with that number of samples
    path_graph : str
        path where graphics are saved
    cv : int
    classifier_dictionary : dict
    n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset

    Returns:
    --------
    None
    """
    df_scores_clf1_means, df_scores_clf1_std = score_multiviews_n_samples(n_samples_list, path_graph, cv, classifier1, classifier_dictionary, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
    df_scores_clf2_means, df_scores_clf2_std = score_multiviews_n_samples(n_samples_list, path_graph, cv, classifier2, classifier_dictionary, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)

    n_samples_list = df_scores_clf1_means.index
    keys = df_scores_clf1_means.keys()

    for key in keys:
        plt.figure()
        plt.scatter(df_scores_clf1_means[key].values, df_scores_clf2_means[key].values, c=df_scores_clf1_means[key].values)
        plt.plot([0.0, 1.1], [0.0, 1.1], "--", c=".7")  # diagonal
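        # points above the dashed diagonal are settings where classifier2 scores higher than classifier1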
        plt.xlabel("Accuracy score for "+classifier1)
        plt.ylabel("Accuracy score for "+classifier2)
        plt.xlim(0, 1)
        plt.ylim(0, 1)
        plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+" - number of classes = "+str(n_classes)+"\nAccuracy score of "+key+" for "+classifier2+" vs "+classifier1)
        plt.savefig(path_graph+classifier1+"_"+classifier2+"_"+str(n_views)+"_"+key+".png")
        plt.show()
        plt.close()


def score_multiviews_R(R_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_samples=1000, n_views=3, n_classes=2, Z_factor=1, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2):
    """
    Returns 2 DataFrames (the first with the mean accuracy scores, the second with the standard deviations of the accuracy scores) for the latent space, each view, early fusion (classification of the concatenated views) and late fusion (majority vote over the views' predictions), with R_list as index, for the indicated classifier
    Creates and saves (at the indicated path path_graph) a graph representing the accuracy score (with confidence interval) vs R_list

    Parameters:
    -----------
    R_list : list
        each element of R_list defines a new dataset with that element as R
    path_graph : str
        path where graphics are saved
    cv : int
    classifier : str
    classifier_dictionary : dict
    n_samples, n_views, n_classes, Z_factor, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset

    Returns:
    --------
    2 DataFrames with R_list as index
    """
    # R_list = list of values of R from the lowest to the highest
    R_list.sort(reverse=False)
    # list of percentages of unused columns of the latent space in the views
    unsued_dimensions_percent_list = []
    # list of numbers of informative features of the latent space
    n_informative_list = []
    # dictionary containing the mean of the accuracy scores per R
    dict_scores_means = {'latent space':[]}
    dict_scores_means.update({'view'+str(i):[] for i in range(n_views)})
    dict_scores_means.update({'early fusion':[], 'late fusion':[]})
    # dictionary containing the standard deviation of the accuracy scores per R
    dict_scores_std = {'latent space':[]}
    dict_scores_std.update({'view'+str(i):[] for i in range(n_views)})
    dict_scores_std.update({'early fusion':[], 'late fusion':[]})
    # dictionary containing the data of each view
    dict_views = {'view'+str(i):0 for i in range(n_views)}

    for R in R_list:
        Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
        unsued_dimensions_percent_list.append(unsued_dimensions_percent)
        n_informative_list.append(n_informative)

        for i in range(n_views):
            # multiviews_list[i] = (data/columns of view i, indices of the columns of view i)
            dict_views['view'+str(i)] = multiviews_list[i][0]

        early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1)  # concatenation of all views
        # dictionary of data
        dict_data_df = {'latent space':Z}
        dict_data_df.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)})
        dict_data_df.update({'early fusion':early_fusion})

        for key in dict_data_df.keys():
            clf = classifier_dictionary[classifier]
            score = cross_val_score(clf, dict_data_df[key], y, scoring='accuracy', cv=cv)
            dict_scores_means[key].append(score.mean())
            dict_scores_std[key].append(score.std())

        mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
        dict_scores_means['late fusion'].append(mean_majority)
        dict_scores_std['late fusion'].append(std_majority)

    df_scores_means = pd.DataFrame(dict_scores_means, index=R_list)
    df_scores_std = pd.DataFrame(dict_scores_std, index=R_list)

    plt.figure()
    for key in dict_scores_means.keys():
        plt.errorbar(R_list, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key)
    # index and labels for the graphic
    label_index = []
    R_label = []
    for i in range(0, len(R_list), 4):
        R_label.append(R_list[i])
        label_index.append(str(round(R_list[i], 2))+'\n'+str(unsued_dimensions_percent_list[i])+'\n'+str(n_informative_list[i]))
    plt.xticks(R_label, label_index, fontsize='medium', multialignment='center')  # new x indexes
    plt.xlabel("R\nPercentage of dimensions of latent space unused in views\nNumber of informative features")
    plt.ylabel("Accuracy score for "+classifier)
    plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0)
    plt.title("number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nAccuracy score vs R for classifier "+classifier)
    plt.savefig(path_graph+"score_R_"+str(n_views)+"_"+str(n_samples)+"_"+str(Z_factor)+"_"+classifier+".png", bbox_inches='tight')
    plt.show()
    plt.close()

    return df_scores_means, df_scores_std


def score_multiviews_Z_factor(Z_factor_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_samples=1000, n_views=3, n_classes=2, R=2/3, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2):
    """
    Returns 3 DataFrames (the first with the mean accuracy scores, the second with the standard deviations of the accuracy scores and the third with the error rates) for the latent space, each view, early fusion (classification of the concatenated views) and late fusion (majority vote over the views' predictions), indexed by the ratio of the sum of the views' dimensions to the latent space dimension (one value per element of Z_factor_list), for the indicated classifier
    Creates and saves (at the indicated path path_graph) a graph representing the accuracy score vs this ratio and a graph representing the error rate (1 - accuracy score) vs this ratio

    Parameters:
    -----------
    Z_factor_list : list
        each element of Z_factor_list defines a new dataset with that element as Z_factor
    path_graph : str
        path where graphics are saved
    cv : int
    classifier : str
    classifier_dictionary : dict
    n_samples, n_views, n_classes, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset

    Returns:
    --------
    3 DataFrames indexed by the ratio sum of views dimensions / latent space dimension (one row per element of Z_factor_list)
    """
    # Z_factor_list = list of values of Z_factor from the highest to the lowest
    Z_factor_list.sort(reverse=True)
    # list of the sum of the views' dimensions for each Z_factor_list item
    d_v = []
    # list of the latent space dimension for each Z_factor_list item
    Z_dim_list = []
    # list of percentages of unused columns of the latent space in the views
    unsued_dimensions_percent_list = []
    # list of numbers of informative features of the latent space
    n_informative_list = []
    # dictionary containing the mean of the accuracy scores per Z_factor
    dict_scores_means = {'latent space':[]}
    dict_scores_means.update({'view'+str(i):[] for i in range(n_views)})
    dict_scores_means.update({'early fusion':[], 'late fusion':[]})
    # dictionary containing the error rate per Z_factor
    dict_scores_error = {'latent space':[]}
    dict_scores_error.update({'view'+str(i):[] for i in range(n_views)})
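    # dict_scores_error stores 1 - mean accuracy so that the error-rate curves can be plotted separately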
    dict_scores_error.update({'early fusion':[], 'late fusion':[]})
    # dictionary containing the standard deviation of the accuracy scores per Z_factor
    dict_scores_std = {'latent space':[]}
    dict_scores_std.update({'view'+str(i):[] for i in range(n_views)})
    dict_scores_std.update({'early fusion':[], 'late fusion':[]})
    # dictionary containing the data of each view
    dict_views = {'view'+str(i):0 for i in range(n_views)}

    for Z_factor in Z_factor_list:
        Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
        unsued_dimensions_percent_list.append(unsued_dimensions_percent)
        n_informative_list.append(n_informative)

        for i in range(n_views):
            # multiviews_list[i] = (data/columns of view i, indices of the columns of view i)
            dict_views['view'+str(i)] = multiviews_list[i][0]

        early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1)  # concatenation of all views
        # dimension = number of columns
        d_v.append(early_fusion.shape[1])
        Z_dim_list.append(Z.shape[1])
        # dictionary of data
        dict_data_df = {'latent space':Z}
        dict_data_df.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)})
        dict_data_df.update({'early fusion':early_fusion})

        for key in dict_data_df.keys():
            clf = classifier_dictionary[classifier]
            score = cross_val_score(clf, dict_data_df[key], y, scoring='accuracy', cv=cv)
            dict_scores_means[key].append(score.mean())
            dict_scores_error[key].append(1 - score.mean())
            dict_scores_std[key].append(score.std())

        mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
        dict_scores_means['late fusion'].append(mean_majority)
        dict_scores_error['late fusion'].append(1 - mean_majority)
        dict_scores_std['late fusion'].append(std_majority)

    d_v_divid_Z = np.divide(np.array(d_v), np.array(Z_dim_list))
    df_scores_means = pd.DataFrame(dict_scores_means, index=d_v_divid_Z)
    df_scores_error = pd.DataFrame(dict_scores_error, index=d_v_divid_Z)
    df_scores_std = pd.DataFrame(dict_scores_std, index=d_v_divid_Z)

    # index and labels for the graphics
    label_index = [chr(i) for i in range(ord('a'), ord('z')+1)]
    label_index = label_index[0:len(d_v)]
    label_value = ""
    for label, v_Z, dim_v, dim_Z, Z_factor, percent, n_informative in zip(label_index, d_v_divid_Z, d_v, Z_dim_list, Z_factor_list, unsued_dimensions_percent_list, n_informative_list):
        label_value = label_value + label+" : V/Z = "+str(round(v_Z, 4))+", V = "+str(dim_v)+", Z = "+str(dim_Z)+", Z_factor = "+str(Z_factor)+", % = "+str(percent)+", n_informative = "+str(n_informative)+'\n'
    x_label = "V/Z = sum of views dimensions divided by latent space dimension with:\nV = sum of views dimensions\nZ = latent space dimension multiplied by Z_factor\n% = percentage of dimensions of latent space unused in views\nn_informative = number of informative features"

    plt.figure(figsize=(10, 10))
    # accuracy score vs d_v_divid_Z
    for key in dict_scores_means.keys():
        plt.semilogx(d_v_divid_Z, dict_scores_means[key], '.-', label=key)
    plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center')  # new x indexes
    plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value)
    plt.xlabel(x_label)
    plt.ylabel("Accuracy score for "+classifier)
    plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0)
    plt.title("number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nR = "+str(round(R, 4))+" - number of classes = "+str(n_classes)+"\nAccuracy score vs ratio sum of views dimensions / latent space dimension for classifier "+classifier)
    plt.savefig(path_graph+"score_Z_factor_"+str(n_views)+"_"+str(n_samples)+"_"+classifier+".png", bbox_inches='tight')
    plt.show()
    plt.close()

    plt.figure(figsize=(10, 10))
    # error rate vs d_v_divid_Z
    for key in dict_scores_means.keys():
        plt.semilogx(d_v_divid_Z, dict_scores_error[key], '.-', label=key)
    plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center')  # new x indexes
    plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value)
    plt.xlabel(x_label)
    plt.ylabel("Error rate for "+classifier)
    plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0)
    plt.title("number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nR = "+str(round(R, 4))+" - number of classes = "+str(n_classes)+"\nError rate vs ratio sum of views dimensions / latent space dimension for classifier "+classifier)
    plt.savefig(path_graph+"error_Z_factor_"+str(n_views)+"_"+str(n_samples)+"_"+classifier+".png", bbox_inches='tight')
    plt.show()
    plt.close()

    """
    plt.figure(figsize=(10, 10))
    for key in dict_scores_means.keys():
        plt.errorbar(d_v_divid_Z, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key)
    plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center')
    plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value)
    plt.xlabel(x_label)
    plt.ylabel("Accuracy score for "+classifier)
    plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0)
    plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nAccuracy score vs ratio sum of views dimensions / latent space dimension for classifier "+classifier)
    plt.savefig(path_graph+"score_Z_factor_errorbar_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight')
    plt.show()
    plt.close()
    """

    plt.figure(figsize=(10, 10))
    # accuracy score of early fusion divided by accuracy score of each view vs d_v_divid_Z
    for view in dict_views.keys():
        plt.semilogx(d_v_divid_Z, dict_scores_means['early fusion']/df_scores_means[view], '.-', label='early fusion score divided by '+view+' score')
    plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center')  # new x indexes
    plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value)
    plt.xlabel(x_label)
    plt.ylabel("Ratio accuracy score for early fusion / accuracy score for each view for "+classifier)
    plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0)
    plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nRatio accuracy score for early fusion / accuracy score for each view \nvs ratio sum of views dimensions / latent space dimension for classifier "+classifier)
    plt.savefig(path_graph+"score_Z_factor_majority_view_divid_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight')
    plt.show()
    plt.close()

    plt.figure(figsize=(10, 10))
    # accuracy score of late fusion divided by accuracy score of each view vs d_v_divid_Z
    for view in dict_views.keys():
        plt.semilogx(d_v_divid_Z, dict_scores_means['late fusion']/df_scores_means[view], '.-', label='late fusion score divided by '+view+' score')
    plt.xticks(d_v_divid_Z, label_index, fontsize='medium', multialignment='center')  # new x indexes
    plt.text(plt.xlim()[1]+0.05, plt.ylim()[1]-(plt.ylim()[1]-plt.ylim()[0])/2, label_value)
    plt.xlabel(x_label)
    plt.ylabel("Ratio accuracy score for late fusion / accuracy score for each view for "+classifier)
    plt.legend(bbox_to_anchor=(1.04, 1), loc="center left", borderaxespad=0)
    plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nRatio accuracy score for late fusion / accuracy score for each view \nvs ratio sum of views dimensions / latent space dimension for classifier "+classifier)
    plt.savefig(path_graph+"score_Z_factor_all_view_divid_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight')
    plt.show()
    plt.close()

    return df_scores_means, df_scores_std, df_scores_error


def score_multiviews_n_views_R(n_views_list, R_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_samples=1000, n_classes=2, Z_factor=1, n_clusters_per_class=2, class_sep_factor=2, n_informative_divid=1, d=4, D=10, standard_deviation=2):
    """
    Returns a dictionary with the elements of n_views_list as keys, each containing a DataFrame (accuracy score divided by the accuracy score for R = 1, i.e. no redundancy) for the views, early fusion predictions (classification of the concatenated views) and late fusion predictions (majority vote over the views' predictions), with R_list as index, for the indicated classifier
    Creates and saves (at the indicated path path_graph) one graph per value of n_views_list representing the accuracy score divided by the accuracy score for R = 1 vs R_list

    Parameters:
    -----------
    n_views_list : list
        each element of n_views_list defines a new dataset with that element as n_views
    R_list : list
        each element of R_list defines a new dataset with that element as R
    path_graph : str
        path where graphics are saved
    cv : int
    classifier : str
    classifier_dictionary : dict
    n_samples, n_classes, Z_factor, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset

    Returns:
    --------
    a dictionary with the elements of n_views_list as keys, each containing a DataFrame (accuracy score divided by the accuracy score for R = 1, i.e. no redundancy) with R_list as index
    """
    dict_n_views_R_ratio = {key:0 for key in n_views_list}
    # n_views_list = list of values of n_views from the lowest to the highest
    n_views_list.sort(reverse=False)
    # same views keep the same colors on every graph
    dict_colors = {'view'+str(i):0 for i in range(n_views_list[-1])}
    prop_cycle = plt.rcParams['axes.prop_cycle']
    colors = prop_cycle.by_key()['color']
    for key, c in zip(dict_colors.keys(), colors):
        dict_colors[key] = c
    dict_colors.update({'early fusion':'purple', 'late fusion':'maroon'})

    for n_views in n_views_list:
        # R_list = list of values of R from the lowest to the highest
        R_list.sort(reverse=False)
        # list of percentages of unused columns of the latent space in the views
        unsued_dimensions_percent_list = []
        # list of numbers of informative features of the latent space
        n_informative_list = []
        # dictionary containing the mean of the accuracy scores per R
        dict_scores_means = {'view'+str(i):[] for i in range(n_views)}
        dict_scores_means.update({'early fusion':[], 'late fusion':[]})
        # dictionary of the scores' means for the different R divided by the score's mean for R = 1 (i.e. no redundancy)
        dict_scores_ratio_R_1 = {'view'+str(i):0 for i in range(n_views)}
        dict_scores_ratio_R_1.update({'early fusion':0, 'late fusion':0})
        # dictionary containing the data of each view
        dict_views = {'view'+str(i):0 for i in range(n_views)}

        for R in R_list:
            Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
            unsued_dimensions_percent_list.append(unsued_dimensions_percent)
            n_informative_list.append(n_informative)

            for i in range(n_views):
                # multiviews_list[i] = (data/columns of view i, indices of the columns of view i)
                dict_views['view'+str(i)] = multiviews_list[i][0]

            early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1)  # concatenation of all views
            # dictionary of data
            dict_data_df = {'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)}
            dict_data_df.update({'early fusion':early_fusion})

            for key in dict_data_df.keys():
                clf = classifier_dictionary[classifier]
                score = cross_val_score(clf, dict_data_df[key], y, scoring='accuracy', cv=cv)
                dict_scores_means[key].append(score.mean())

            mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
            dict_scores_means['late fusion'].append(mean_majority)

        for key in dict_scores_means.keys():
            score_R_1 = dict_scores_means[key][-1]  # R = 1 is the last value of R_list => last score value in dict_scores_means[key]
            dict_scores_ratio_R_1[key] = np.divide(np.array(dict_scores_means[key]), score_R_1)

        df_scores_ratio_R_1 = pd.DataFrame(dict_scores_ratio_R_1, index=R_list)

        plt.figure()
        for key in dict_scores_means.keys():
            plt.plot(R_list, dict_scores_ratio_R_1[key], '.-', color=dict_colors[key], label=key)
        # index and labels for the graphic
        label_index = []
        R_label = []
        for i in range(0, len(R_list), 4):
            R_label.append(R_list[i])
            label_index.append(str(round(R_list[i], 2))+'\n'+str(unsued_dimensions_percent_list[i])+'\n'+str(n_informative_list[i]))
        plt.xticks(R_label, label_index, fontsize='medium', multialignment='center')  # new x indexes
        plt.xlabel("R\nPercentage of dimensions of latent space unused in views\nNumber of informative features")
        plt.ylabel("Ratio accuracy score / accuracy score for R = 1 for "+classifier)
        plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0)
        plt.title("number of views = "+str(n_views)+" - number of samples = "+str(n_samples)+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nRatio accuracy score / accuracy score for R = 1\n(redundancy null) vs R for classifier "+classifier)
        plt.savefig(path_graph+"score_R_divid_R_1_"+str(n_views)+"_"+str(n_samples)+"_"+str(Z_factor)+"_"+classifier+".png", bbox_inches='tight')
        plt.show()
        plt.close()

        dict_n_views_R_ratio[n_views] = df_scores_ratio_R_1

    plt.figure()
    ax = plt.axes(projection="3d")
    for n_views in n_views_list:
        for key in dict_n_views_R_ratio[n_views].keys():
            if n_views == n_views_list[-1]:  # print legends only once
                ax.plot(R_list, dict_n_views_R_ratio[n_views][key], n_views, color=dict_colors[key], label=key)
            else:
                ax.plot(R_list, dict_n_views_R_ratio[n_views][key], n_views, color=dict_colors[key])
    ax.set_xlabel("R")
    ax.set_ylabel("Ratio accuracy score / accuracy score for R = 1 for "+classifier)
    ax.set_zlabel("Number of views")
    plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0)
    plt.title("number of samples = "+str(n_samples)+" - factor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nRatio accuracy score / accuracy score for R = 1 (redundancy null) vs R, number of views for classifier "+classifier)
    plt.savefig(path_graph+"score_R_divid_R_1_all_n_views"+"_"+str(n_samples)+"_"+str(Z_factor)+"_"+classifier+".png", bbox_inches='tight')
    plt.show()
    plt.close()

    return dict_n_views_R_ratio


def score_multiviews_class_sep(class_sep_factor_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_views=3, n_samples=1000, n_classes=2, Z_factor=1, R=2/3, n_clusters_per_class=2, n_informative_divid=1, d=4, D=10, standard_deviation=2):
    """
    Returns 2 DataFrames (the first with the mean accuracy scores, the second with the standard deviations of the accuracy scores) for the latent space, each view, early fusion (classification of the concatenated views) and late fusion (majority vote over the views' predictions), with class_sep_factor_list as index, for the indicated classifier
    Creates and saves (at the indicated path path_graph) a graph representing the accuracy score (with confidence interval) vs class_sep_factor_list

    Parameters:
    -----------
    class_sep_factor_list : list
        each element of class_sep_factor_list defines a new dataset with that element as class_sep_factor
    path_graph : str
        path where graphics are saved
    cv : int
    classifier : str
    classifier_dictionary : dict
    n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, n_informative_divid, d, D, standard_deviation : parameters of generator_multiviews_dataset

    Returns:
    --------
    2 DataFrames with class_sep_factor_list as index
    """
    # list of percentages of unused columns of the latent space in the views
    unsued_dimensions_percent_list = []
    # list of numbers of informative features of the latent space
    n_informative_list = []
    # dictionary containing the mean of the accuracy scores per class_sep_factor
    dict_scores_means = {'latent space':[]}
    dict_scores_means.update({'view'+str(i):[] for i in range(n_views)})
    dict_scores_means.update({'early fusion':[], 'late fusion':[]})
    # dictionary containing the standard deviation of the accuracy scores per class_sep_factor
    dict_scores_std = {'latent space':[]}
    dict_scores_std.update({'view'+str(i):[] for i in range(n_views)})
    dict_scores_std.update({'early fusion':[], 'late fusion':[]})
    # dictionary containing the data of each view
    dict_views = {'view'+str(i):0 for i in range(n_views)}

    for class_sep_factor in class_sep_factor_list:
        Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
        unsued_dimensions_percent_list.append(unsued_dimensions_percent)
        n_informative_list.append(n_informative)

        for i in range(n_views):
            # multiviews_list[i] = (data/columns of view i, indices of the columns of view i)
            dict_views['view'+str(i)] = multiviews_list[i][0]

        early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1)  # concatenation of all views
        # dictionary of data
        dict_data = {'latent space':Z}
        dict_data.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)})
        dict_data.update({'early fusion':early_fusion})

        for key in dict_data.keys():
            print('key', key)
            clf = classifier_dictionary[classifier]
            score = cross_val_score(clf, dict_data[key], y, scoring='accuracy', cv=cv)
            dict_scores_means[key].append(score.mean())
            dict_scores_std[key].append(score.std())

        mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
        dict_scores_means['late fusion'].append(mean_majority)
        dict_scores_std['late fusion'].append(std_majority)
        print(dict_scores_means)

    df_scores_means = pd.DataFrame(dict_scores_means, index=class_sep_factor_list)
    df_scores_std = pd.DataFrame(dict_scores_std, index=class_sep_factor_list)

    plt.figure()
    for key in dict_scores_means.keys():
        plt.errorbar(class_sep_factor_list, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key)
    # index and labels for the graphic
    label_index = []
    for class_sep_factor, percent, n_informative in zip(class_sep_factor_list, unsued_dimensions_percent_list, n_informative_list):
        label_index.append(str(class_sep_factor)+'\n'+str(percent)+'\n'+str(n_informative))
    plt.xticks(class_sep_factor_list, label_index, fontsize='medium', multialignment='center')  # new x indexes
    plt.xlabel("Factor (class_sep = factor*n_clusters_per_class)\nPercentage of dimensions of latent space unused in views\nNumber of informative features")
    plt.ylabel("Accuracy score for "+classifier)
    plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0)
    plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nAccuracy score vs factor of class_sep for classifier "+classifier)
    plt.savefig(path_graph+"score_class_sep_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight')
    plt.show()
    plt.close()

    return df_scores_means, df_scores_std


def score_multiviews_n_informative_divided(n_informative_divid_list, path_graph, cv=10, classifier="SVM", classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}, n_views=3, n_samples=1000, n_classes=2, Z_factor=1, R=2/3, n_clusters_per_class=2, class_sep_factor=2, d=4, D=10, standard_deviation=2):
    """
    Returns 2 DataFrames (the first with the mean accuracy scores, the second with the standard deviations of the accuracy scores) for the latent space, each view, early fusion (classification of the concatenated views) and late fusion (majority vote over the views' predictions), with n_informative_divid_list as index, for the indicated classifier
    Creates and saves (at the indicated path path_graph) a graph representing the accuracy score (with confidence interval) vs n_informative_divid_list

    Parameters:
    -----------
    n_informative_divid_list : list
        each element of n_informative_divid_list defines a new dataset with that element as n_informative_divid
    path_graph : str
        path where graphics are saved
    cv : int
    classifier : str
    classifier_dictionary : dict
    n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, d, D, standard_deviation : parameters of generator_multiviews_dataset

    Returns:
    --------
    2 DataFrames with n_informative_divid_list as index
    """
    # list of percentages of unused columns of the latent space in the views
    unsued_dimensions_percent_list = []
    # list of numbers of informative features of the latent space
    n_informative_list = []
    # dictionary containing the mean of the accuracy scores per n_informative_divid
    dict_scores_means = {'latent space':[]}
    dict_scores_means.update({'view'+str(i):[] for i in range(n_views)})
    dict_scores_means.update({'early fusion':[], 'late fusion':[]})
    # dictionary containing the standard deviation of the accuracy scores per n_informative_divid
    dict_scores_std = {'latent space':[]}
    dict_scores_std.update({'view'+str(i):[] for i in range(n_views)})
    dict_scores_std.update({'early fusion':[], 'late fusion':[]})
    # dictionary containing the data of each view
    dict_views = {'view'+str(i):0 for i in range(n_views)}

    for n_informative_divid in n_informative_divid_list:
        Z, y, multiviews_list, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
        unsued_dimensions_percent_list.append(unsued_dimensions_percent)
        n_informative_list.append(n_informative)

        for i in range(n_views):
            # multiviews_list[i] = (data/columns of view i, indices of the columns of view i)
            dict_views['view'+str(i)] = multiviews_list[i][0]

        early_fusion = np.concatenate([dict_views[key] for key in dict_views.keys()], axis=1)  # concatenation of all views
        # dictionary of data
        dict_data = {'latent space':Z}
        dict_data.update({'view'+str(i):dict_views['view'+str(i)] for i in range(n_views)})
        dict_data.update({'early fusion':early_fusion})

        for key in dict_data.keys():
            clf = classifier_dictionary[classifier]
            score = cross_val_score(clf, dict_data[key], y, scoring='accuracy', cv=cv)
            dict_scores_means[key].append(score.mean())
            dict_scores_std[key].append(score.std())

        mean_majority, std_majority = majority_score(dict_views, y, cv, classifier, classifier_dictionary)
        dict_scores_means['late fusion'].append(mean_majority)
        dict_scores_std['late fusion'].append(std_majority)

    df_scores_means = pd.DataFrame(dict_scores_means, index=n_informative_divid_list)
    df_scores_std = pd.DataFrame(dict_scores_std, index=n_informative_divid_list)

    plt.figure()
    for key in dict_scores_means.keys():
        plt.errorbar(n_informative_divid_list, dict_scores_means[key], 1.96*np.array(dict_scores_std[key])/sqrt(cv), label=key)
    # index and labels for the graphic
    label_index = []
    for n_informative_divid, percent, n_informative in zip(n_informative_divid_list, unsued_dimensions_percent_list, n_informative_list):
        label_index.append(str(n_informative_divid)+'\n'+str(percent)+'\n'+str(n_informative))
    plt.xticks(n_informative_divid_list, label_index, fontsize='medium', multialignment='center')  # new x indexes
    plt.xlabel("Factor (n_informative = dimension of latent space / factor)\nPercentage of dimensions of latent space unused in views\nNumber of informative features")
    plt.ylabel("Accuracy score for "+classifier)
    plt.legend(bbox_to_anchor=(1.04,0.5), loc="center left", borderaxespad=0)
    plt.title("number of views = "+str(n_views)+" - R = "+str(round(R, 4))+"\nfactor of latent space dimension = "+str(Z_factor)+" - number of classes = "+str(n_classes)+"\nAccuracy score vs n_informative_divid for classifier "+classifier)
    plt.savefig(path_graph+"score_n_informative_"+str(n_views)+"_"+classifier+".png", bbox_inches='tight')
    plt.show()
    plt.close()

    return df_scores_means, df_scores_std
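

# Minimal usage sketch (illustrative only, not part of the original experiments):
# it assumes that the companion module multiviews_datasets, providing
# generator_multiviews_dataset, is importable, and it simply prints the three
# Series returned for a single generated dataset.
if __name__ == "__main__":
    df_dimensions, df_scores_means, df_scores_std = score_one_multiview_dataset(
        cv=5, classifier="NB", n_samples=500, n_views=3, n_classes=2)
    print(df_dimensions)
    print(df_scores_means)
    print(df_scores_std)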