Skip to content
Snippets Groups Projects
Commit 40692642 authored by Dominique Benielli's avatar Dominique Benielli
Browse files

first add

parent 3726ce43
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Experiment driver for the multiview dataset generator.

Loads the shared configuration from ``parameters`` (a few values are
overridden locally), then scores generated datasets while sweeping
Z_factor for an increasing number of views.

Created on Wed Nov 27 16:14:14 2019
@author: bernardet
"""
import parameters
from multiviews_datasets import generator_multiviews_dataset, results_to_csv
from test_classifier import (
    score_multiviews_n_samples,
    graph_comparaison_classifier_scores_n_samples,
    score_multiviews_R,
    score_multiviews_Z_factor,
    score_multiviews_n_views_R,
    score_multiviews_class_sep,
    score_one_multiview_dataset,
    score_multiviews_n_informative_divided,
)
import warnings

# Silence FutureWarnings emitted by the scoring/plotting helpers.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Configuration pulled from the shared parameters module.  Assignments
# with a literal right-hand side are deliberate local overrides of the
# corresponding parameters.* value.
n_samples = parameters.n_samples
n_views = parameters.n_views
n_classes = 3  # overrides parameters.n_classes
Z_factor = parameters.Z_factor
R = parameters.R
n_clusters_per_class = 1  # overrides parameters.n_clusters_per_class
class_sep_factor = 2  # overrides parameters.class_sep_factor (5 was also tried)
n_informative_divid = 2  # overrides parameters.n_informative_divid
cv = parameters.cv
classifier = parameters.classifier
classifier_dictionary = parameters.classifier_dictionary
d = parameters.d
D = parameters.D
standard_deviation = parameters.standard_deviation
path_data = parameters.path_data
path_graph = parameters.path_graph
n_samples_list = parameters.n_samples_list
R_list = parameters.R_list
Z_factor_list = parameters.Z_factor_list
n_views_list = parameters.n_views_list
class_sep_factor_list = parameters.class_sep_factor_list
n_informative_divid_list = parameters.n_informative_divid_list

# --- Usage examples (disabled) --------------------------------------
# Generate one dataset:
#   Z, y, multiviews_list, unsued_columns_percent = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
# Save one multiview dataset to CSV:
#   results_to_csv(path, Z, y, multiviews_list)
# Score one multiview dataset:
#   df_dimensions, df_scores_means, df_scores_std = score_one_multiview_dataset(cv, classifier, classifier_dictionary, n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
# Scores over n_samples_list datasets:
#   mean_samples, std_samples = score_multiviews_n_samples(n_samples_list, path_graph, cv, classifier, classifier_dictionary, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
# Plot classifier2 scores vs classifier1 scores:
#   graph_comparaison_classifier_scores_n_samples(classifier1, classifier2, n_samples_list, path_graph, cv, classifier_dictionary, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
# Scores over R_list datasets:
#   mean_R, std_R = score_multiviews_R(R_list, path_graph, cv, classifier, classifier_dictionary, n_samples, n_views, n_classes, Z_factor, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
# Scores over Z_factor_list datasets:
#   mean_Z, std_Z, error_Z = score_multiviews_Z_factor(Z_factor_list, path_graph, cv, classifier, classifier_dictionary, n_samples, n_views, n_classes, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
# Score ratios (vs R=1, i.e. no redundancy) over n_views_list x R_list:
#   dict_n_views_R_ratio = score_multiviews_n_views_R(n_views_list, R_list, path_graph, cv, classifier, classifier_dictionary, n_samples, n_classes, Z_factor, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
# Scores over class_sep_factor_list datasets:
#   df_mean, df_std = score_multiviews_class_sep(class_sep_factor_list, path_data, path_graph, cv, classifier, classifier_dictionary, n_views, n_samples, n_classes, Z_factor, R, n_clusters_per_class, n_informative_divid, d, D, standard_deviation)
# Scores over n_informative_divid_list datasets:
#   mean_n_info, std_n_info = score_multiviews_n_informative_divided(n_informative_divid_list, path_graph, cv, classifier, classifier_dictionary, n_views, n_samples, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, d, D, standard_deviation)

# Classifier pair used by the comparison plot above.
classifier1 = "SVM"
classifier2 = "NB"

# --- Active experiment: score vs Z_factor for 3 to 10 views ---------
Z_factor_list = [1, 3, 10, 25, 100, 250, 1000]
path_graph = "/home/bernardet/Documents/StageL3/Graph/n_views_3_10_1_clus_2_n_info_div/"
n_classes = 2
n_clusters_per_class = 1
class_sep_factor = 2
n_informative_divid = 2

for n_views in range(3, 11):
    # Keep the samples-per-view ratio constant as views are added.
    n_samples = 500 * n_views
    mean_Z, std_Z, error_Z = score_multiviews_Z_factor(
        Z_factor_list, path_graph, cv, classifier, classifier_dictionary,
        n_samples, n_views, n_classes, R, n_clusters_per_class,
        class_sep_factor, n_informative_divid, d, D, standard_deviation)
__version__ = '1.0.dev0'  # package version string (PEP 440 development release)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 26 15:38:38 2019
@author: bernardet
"""
from sklearn.datasets import make_classification
from random import gauss
from math import ceil, floor
import numpy as np
import pandas as pd
def latent_space_dimension(views_dimensions_list, R):
    """
    Compute the minimal latent-space dimension required by
    generator_multiviews_dataset for the given view dimensions.

    Parameters:
    -----------
    views_dimensions_list : list
        Dimension of each view, expected sorted from highest to lowest.
    R : float
        Redundancy parameter.

    Returns:
    --------
    an int, the minimal number of latent-space columns
    """
    # Start from ceil(R * total view dimensions), but never below the
    # largest single view (every view must fit in the latent space).
    dimension = ceil(R * sum(views_dimensions_list))
    largest_view = max(views_dimensions_list)
    if dimension < largest_view:
        dimension = largest_view

    # Walk consecutive view pairs; after serving each view, floor(R * dim)
    # of its columns become unavailable.  Grow the latent space whenever
    # the columns still available cannot cover the next view.
    available = dimension
    removed_total = 0
    for previous_dim, current_dim in zip(views_dimensions_list, views_dimensions_list[1:]):
        removed = floor(R * previous_dim)
        removed_total += removed
        shortfall = current_dim - (available - removed)
        if shortfall > 0:
            dimension += shortfall
        available = dimension - removed_total
    return dimension
def projection(latent_space, chosen_columns_list):
    """
    Project latent_space onto the columns listed in chosen_columns_list,
    keeping the order of chosen_columns_list.

    Parameters:
    -----------
    latent_space : array
    chosen_columns_list : list

    Returns:
    --------
    an array of shape (number of rows of latent_space, len(chosen_columns_list))
    """
    # np.take along axis 1 is equivalent to latent_space[:, chosen_columns_list].
    return np.take(latent_space, chosen_columns_list, axis=1)
def generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation):
    """
    Generate a random multiview classification dataset.

    Parameters:
    -----------
    n_samples : int
        dataset number of samples (number of rows of dataset)
    n_views : int >= 2
        dataset number of views
        one view is a set of some features (columns) of the latent space
    n_classes : int >= 2
        dataset number of classes
    Z_factor : float >= 1
        minimal dimension of the latent space (enough to build the dataset) is calculated then multiplied by Z_factor
    R : 0 <= float <= 1
        R = 1 <> no possibility of redundancy between views
        R = 0 <> maximal possibility of redundancy between views
    n_clusters_per_class : int
    class_sep_factor : float
        class_sep = n_clusters_per_class*class_sep_factor
    n_informative_divid : float >= 1
        raising n_informative_divid raises the number of non-informative features
        n_informative_divid = 1 <> no non-informative features, number of informative features = dimension of latent space
        number of informative features = round(dimension of latent space / n_informative_divid)
    d : float >= 1
        minimal dimension of views
        dimension of views (int) chosen randomly from N((d+D)/2, standard_deviation^2) with d <= dimension of views <= D
    D : float >= d
        maximal dimension of views
    standard_deviation : float
        standard deviation of the gaussian distribution N((d+D)/2, standard_deviation^2)

    Returns:
    --------
    Z : array of shape (n_samples, dim_Z) = the generated latent-space samples
    y : array of shape (n_samples,) = the integer class label of each sample
    results : list of n_views tuples (X_v, I_v) with:
        X_v = Z projected along the d_v (= dimension of the v-th view) columns in I_v
        I_v = the Z column numbers kept in X_v
    unsued_dimensions_percent : percentage of latent-space columns used in no view
    n_informative : number of informative features (dim_Z - n_informative = number of non-informative features)
    """
    # --- Parameter validation (error messages kept in original French) ---
    if n_views < 2:
        raise ValueError("n_views >= 2")
    if n_classes < 2:
        raise ValueError("n_classes >= 2")
    if Z_factor < 1:
        raise ValueError("Z_factor >= 1 pour le bon fonctionnement de l'algorithme")
    if d < 1:
        raise ValueError("d >= 1")
    if (d+D)/2 - 3*standard_deviation < 0:
        raise ValueError("Il faut que (d+D)/2 - 3*standard_deviation >= 0 pour avoir des valeurs positives lors de l'emploi de la loi normale")
    # Draw the n_views view dimensions from N((d+D)/2, standard_deviation^2)
    d_v = np.random.normal(loc=(d+D)/2, scale=standard_deviation, size=n_views)
    d_v = list(d_v)
    remove_list, add_list = [], []
    for dim_view in d_v:
        if dim_view < d or dim_view > D:  # enforce 1 <= d <= dim_view <= D
            remove_list.append(dim_view)
            # Resample until the replacement value falls inside [d, D]
            add = -1
            while add < d or add > D:
                add = gauss((d+D)/2, standard_deviation)
            add_list.append(add)
    d_v = [view for view in d_v if view not in remove_list] + add_list
    d_v = [int(view) for view in d_v]  # dimension of views = integer
    # d_v = list of view dimensions from the highest to the lowest
    d_v.sort(reverse=True)
    # Dimension of latent space Z (multiplied by Z_factor)
    dim_Z = Z_factor*latent_space_dimension(d_v, R)
    # Number of informative features
    n_informative = round(dim_Z/n_informative_divid)
    # Generation of latent space Z
    Z, y = make_classification(n_samples=n_samples, n_features=dim_Z, n_informative=n_informative, n_redundant=0,
                               n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters_per_class, weights=None,
                               flip_y=0.01, class_sep=n_clusters_per_class*class_sep_factor, random_state=None)
    I_q = np.array([i for i in range(Z.shape[1])])  # 1D-array of Z column numbers still available
    meta_I_v = []
    results = []
    for view in range(n_views):
        # Choose d_v[view] column numbers of Z uniformly from I_q
        I_v = np.random.choice(I_q, size=d_v[view], replace=False)  # drawn from I_q without replacement, size d_v[view]
        meta_I_v += list(I_v)
        # Projection of Z along the columns in I_v
        X_v = projection(Z, I_v)
        results.append((X_v, I_v))
        # Remove floor(R*d_v[view]) of the column numbers of I_v from I_q,
        # making them unavailable for later views (controls redundancy)
        elements_to_remove = np.random.choice(I_v, size=floor(R*d_v[view]), replace=False)  # drawn from I_v without replacement, size floor(R*d_v[view])
        I_q = np.setdiff1d(I_q, elements_to_remove)  # I_q less elements from elements_to_remove
    # Columns never selected by any view
    unsued_dimensions_list = [column for column in I_q if column not in meta_I_v]
    unsued_dimensions_percent = round((len(unsued_dimensions_list) / dim_Z)*100, 2)
    return Z, y, results, unsued_dimensions_percent, n_informative
def results_to_csv(path, latent_space, integer_labels, multiviews_list):
    """
    Write len(multiviews_list) + 2 CSV files to the indicated path.

    File names:
        latent_space.csv for latent_space
        integer_labels.csv for integer_labels
        view0.csv, view1.csv, ... for the entries of multiviews_list

    Parameters:
    -----------
    path : str
    latent_space : array
    integer_labels : 1D array
    multiviews_list : list of tuples

    Returns:
    --------
    None
    """
    pd.DataFrame(latent_space).to_csv(path+'latent_space.csv', index=False)
    pd.DataFrame(integer_labels).to_csv(path+'integer_labels.csv', index=False)
    # One file per view; the view's column labels are its Z column numbers.
    for view_number, (view_data, view_columns) in enumerate(multiviews_list):
        df_view = pd.DataFrame(view_data, columns=view_columns)
        df_view.to_csv(path+'view'+str(view_number)+'.csv', index=False)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Shared configuration for the multiview generator experiments.

Created on Tue Nov 26 13:53:05 2019
@author: bernardet
"""
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import numpy as np

# --- General parameters ---------------------------------------------
n_samples = 1000  # number of samples per dataset (int)
n_views = 3  # number of views, >= 2 (int)
n_classes = 2  # number of classes, >= 2 (int)
Z_factor = 250  # multiplier applied to the latent-space dimension (default 1)
R = 2/3  # redundancy level (float)
cv = 10  # number of cross-validation splits (int)
n_clusters_per_class = 2  # clusters per class, >= 1 (int)
class_sep_factor = 2  # factor >= 1; class_sep = n_clusters_per_class * class_sep_factor
n_informative_divid = 1  # factor >= 1; n_informative = round(latent dim / n_informative_divid)
classifier = "SVM"  # name of the default classifier (str)
# Mapping from classifier name to estimator instance.
classifier_dictionary = {'SVM': SVC(kernel='linear'), 'NB': GaussianNB()}

# --- Parameter sweeps -----------------------------------------------
n_samples_list = [100, 500, 1000, 1500, 2000]  # sample counts to test (larger values were also tried)
R_list = list(np.arange(0, 1.05, 0.05))  # redundancy values to test
Z_factor_list = [1, 3, 10, 25, 100, 250, 1000]  # Z_factor values to test
n_views_list = list(range(2, 10))  # view counts to test
class_sep_factor_list = [2, 5, 10]  # class separation factors to test
n_informative_divid_list = [1, 2, 3]  # n_informative_divid values to test

# --- Output locations -----------------------------------------------
path_data = "/home/bernardet/Documents/StageL3/Data/"  # where multiview datasets are saved
path_graph = "/home/bernardet/Documents/StageL3/Graph/"  # where score graphs are saved

# --- View-dimension distribution N((d+D)/2, standard_deviation^2) ---
# Constraints: d <= dim[v] <= D for every view v, and
# (d+D)/2 - 3*standard_deviation >= 0.
d = 4  # minimal view dimension; > 0 and < D
D = 10  # maximal view dimension; > d
standard_deviation = 2  # standard deviation of the gaussian distribution

# --- Extra make_classification knobs (not yet wired in) -------------
part_informative = 0  # proportion of informative features, in [0, 1]
part_redundant = 1  # proportion of redundant features, in [0, 1]; n_redundant >= 1 for redundancy
part_repeated = 1  # proportion of repeated features, in [0, 1]; n_repeated >= 1 for useless/correlated features
weights = [0.7, 0.3]  # per-class sample proportions; uneven values (e.g. [0.8, 0.2]) give imbalance
flip_y = 0.1  # fraction of samples whose class is randomly flipped; > 0 adds label noise
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment