Commit 4e7e6c41 authored by Baptiste Bauvin's avatar Baptiste Bauvin
Browse files

Before removing old files

parent c91043c4
......@@ -2,7 +2,7 @@
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="jdk" jdkName="Python 3.6 (develop)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="R User Library" level="project" />
<orderEntry type="library" name="R Skeletons" level="application" />
......
from . import generator
from . import multiview_generator
from . import demo
......@@ -57,6 +57,8 @@ def make_fig(conf, confusion_output, n_views, n_classes, generator):
x=generator.view_data[view_index][concerned_examples, 0],
y=generator.view_data[view_index][concerned_examples, 1],
z=generator.view_data[view_index][concerned_examples, 2],
text=[generator.example_ids[ind] for ind in concerned_examples],
hoverinfo='text',
mode='markers', marker=dict(
size=1, # set color to an array/list of desired values
color=DEFAULT_PLOTLY_COLORS[lab_index],
......@@ -78,5 +80,5 @@ def make_fig(conf, confusion_output, n_views, n_classes, generator):
# fig.update_zaxes(
# range=[-class_sep - 0.1 * class_sep, +class_sep + margin_ratio * class_sep],
# row=row, col=col)
plotly.offline.plot(fig, filename="center_blob.html")
plotly.offline.plot(fig, filename="description_fig.html")
This diff is collapsed.
n_samples: 100 # Number of samples in the dataset
n_views: 4 # Number of views in the dataset
n_classes: 3 # Number of classes in the dataset
n_clusters_per_class: 1 # Number of clusters for each class
class_sep: 1.55 # Separation between the different classes
n_informative: 100 # Divides the number of informative features in the latent space
flip_y: 0.00 # Ratio of label noise
random_state: 42
class_weights: None # The proportions of examples in each class
confusion_matrix: [[0.9, 0.5, 0.3, 0.1],
[0.5, 0.3, 0.3, 0.1],
[0.1, 0.1, 0.3, 0.1]]
precision: 0.05
example_subsampling_method: "block"
example_subsampling_config: {}
feature_subampling_method: "block"
feature_subsampling_config: {}
redundancy: None
methods: "uniform"
view_dims: None
estimator_name: "LOneOneScore"
estimator_config: {}
build_method: "iterative"
priority: "random"
n_views: 4
n_classes: 3
confusion_matrix:
- [0.4, 0.4, 0.4, 0.4]
- [0.55, 0.4, 0.4, 0.4]
- [0.4, 0.5, 0.52, 0.55]
# - [0.4, 0.5, 0.5, 0.4]
# - [0.4, 0.4, 0.4, 0.4]
# - [0.4, 0.4, 0.4, 0.4]
# - [0.4, 0.4, 0.4, 0.4]
# - [0.4, 0.4, 0.4, 0.4]
n_samples: 2000
n_features: 3
n_informative: 3
class_seps: 10
class_weights: [0.125, 0.125, 0.125,]# 0.125, 0.125, 0.125, 0.125, 0.125,]
mutual_error: 0.2
redundancy: 0.1
complementarity: 0.35
name: "doc_summit"
sub_problem_type: ["base", "base", "base", "gaussian"]
import numpy as np
from generator.multiple_sub_problems import MultiViewSubProblemsGenerator
from multiview_generator.multiple_sub_problems import MultiViewSubProblemsGenerator
from classify_generated import gen_folds, make_fig, test_dataset
n_views = 4
n_classes = 8
conf = np.ones((n_classes, n_views))*0.40
conf[0,3] = 0.70
conf[4,1] = 0.5
conf[5, 1] = 0.6
conf[6, 1] = 0.7
conf[7, 1] = 0.75
conf[6, 2] = 0.5
conf[7, 2] = 0.5
# conf = np.array([
# np.array([0.40, 0.31, 0.31, 0.80]),
# np.array([0.31, 0.31, 0.31, 0.31]),
# np.array([0.31, 0.31, 0.31, 0.31]),
# np.array([0.31, 0.4, 0.31, 0.31]),
# np.array([0.31, 0.5, 0.31, 0.31]),
# np.array([0.31, 0.6, 0.31, 0.31]),
# np.array([0.31, 0.7, 0.41, 0.31]),
# np.array([0.31, 0.8, 0.41, 0.31]),
# ])
n_folds = 10
n_samples = 2000
n_features = 3
class_sep = 10
class_weights = [0.125, 0.1, 0.15, 0.125, 0.01, 0.2, 0.125, 0.125,]
mutual_error = 0.1
redundancy = 0.05
complementarity = 0.5
gene = MultiViewSubProblemsGenerator(confusion_matrix=conf,
n_samples=n_samples,
n_views=n_views,
n_classes=n_classes,
class_seps=class_sep,
n_features=n_features,
n_informative=n_features,
class_weights=class_weights,
mutual_error=mutual_error,
redundancy=redundancy,
complementarity=complementarity)
n_classes = 3
gene = MultiViewSubProblemsGenerator(config_file="config_generator.yml")
conf = np.ones((n_classes, n_views))*0.4
gene.generate_multi_view_dataset()
folds = gen_folds(random_state=42, generator=gene, n_folds=n_folds)
folds = gen_folds(random_state=42, generator=gene)
output_confusion = test_dataset(folds, n_views, n_classes, gene)
make_fig(conf, output_confusion, n_views, n_classes, gene)
\ No newline at end of file
No preview for this file type
__version__ = '1.0.dev0'
This diff is collapsed.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 26 15:38:38 2019
@author: bernardet
"""
from sklearn.datasets import make_classification
from random import gauss
from math import ceil, floor
import numpy as np
import pandas as pd
def latent_space_dimension(views_dimensions_list, R):
    """
    Return the minimal dimension of the latent space (enough to build the
    dataset) for generator_multiviews_dataset, given the view dimensions.

    Parameters:
    -----------
    views_dimensions_list : list
        dimension of each view
    R : float
        redundancy factor

    Returns:
    --------
    an int
    """
    # Start from R times the total view dimension, never below the widest view.
    dimension = max(max(views_dimensions_list),
                    ceil(R * sum(views_dimensions_list)))
    reduced_dimension = dimension
    removed_so_far = 0
    # Walk consecutive (previous, current) view pairs and grow the latent
    # dimension whenever the remaining columns cannot host the current view.
    for previous_dim, current_dim in zip(views_dimensions_list,
                                         views_dimensions_list[1:]):
        removed = floor(R * previous_dim)
        removed_so_far += removed
        shortfall = current_dim - (reduced_dimension - removed)
        if shortfall > 0:
            dimension += shortfall
        reduced_dimension = dimension - removed_so_far
    return dimension
def projection(latent_space, chosen_columns_list):
    """
    Return the projection of latent_space onto the columns listed in
    chosen_columns_list, preserving the order of chosen_columns_list.

    Parameters:
    -----------
    latent_space : array
    chosen_columns_list : list

    Returns:
    --------
    an array of shape (rows of latent_space, len(chosen_columns_list))
    """
    # Equivalent to latent_space[:, chosen_columns_list].
    return latent_space.take(chosen_columns_list, axis=1)
def generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation, random_state=42):
    """
    Return a generated multiview dataset.

    Parameters:
    -----------
    n_samples : int
        dataset number of samples (number of rows of dataset)
    n_views : int >= 2
        dataset number of views; one view is a set of some features
        (columns) of the latent space
    n_classes : int >= 2
        dataset number of classes
    Z_factor : float >= 1
        the minimal dimension of the latent space (enough to build the
        dataset) is calculated then multiplied by Z_factor
    R : 0 <= float <= 1
        R = 1 <> no possibility of redundancy between views
        R = 0 <> maximal possibility of redundancy between views
    n_clusters_per_class : int
    class_sep_factor : float
        class_sep = n_clusters_per_class*class_sep_factor
    n_informative_divid : float >= 1
        number of informative features = round(dimension of latent space /
        n_informative_divid); n_informative_divid = 1 <> no non-informative
        features
    d : float >= 1
        minimal dimension of views
    D : float >= d
        maximal dimension of views
    standard_deviation : float
        standard deviation of the gaussian distribution
        N((d+D)/2, standard_deviation^2) used to draw view dimensions,
        with d <= dimension of views <= D
    random_state : int
        seed forwarded to make_classification

    Returns:
    --------
    Z : array of shape (n_samples, latent dimension) = the generated samples
    y : array of shape (n_samples,) = integer class labels of each sample
    results : list of n_views tuples (X_v, I_v) with
        X_v = Z projected along the d_v columns whose indices are in I_v
        I_v = indices of X_v columns in Z's column numbering
    unsued_dimensions_percent : percentage of unused latent-space columns
    n_informative : number of informative features

    Raises:
    -------
    ValueError when an argument is outside its documented range.
    """
    if n_views < 2:
        raise ValueError("n_views >= 2")
    if n_classes < 2:
        raise ValueError("n_classes >= 2")
    if Z_factor < 1:
        raise ValueError("Z_factor >= 1 pour le bon fonctionnement de l'algorithme")
    if d < 1:
        raise ValueError("d >= 1")
    if (d+D)/2 - 3*standard_deviation < 0:
        raise ValueError("Il faut que (d+D)/2 - 3*standard_deviation >= 0 pour avoir des valeurs positives lors de l'emploi de la loi normale")
    # Draw n_views view dimensions from N((d+D)/2, standard_deviation^2).
    # NOTE(review): these draws use the global np.random / random module
    # state, not `random_state`, so view dimensions are not reproducible
    # across runs — confirm whether this is intended.
    d_v = np.random.normal(loc=(d+D)/2, scale=standard_deviation, size=n_views)
    d_v = list(d_v)
    remove_list, add_list = [], []
    for dim_view in d_v:
        if dim_view < d or dim_view > D:  # enforce 1 <= d <= dim_view <= D
            remove_list.append(dim_view)
            # Redraw until the value lands inside [d, D].
            add = -1
            while add < d or add > D:
                add = gauss((d+D)/2, standard_deviation)
            add_list.append(add)
    d_v = [view for view in d_v if view not in remove_list] + add_list
    d_v = [int(view) for view in d_v]  # view dimensions must be integers
    # List of view dimensions from the highest to the lowest.
    d_v.sort(reverse=True)
    # Dimension of latent space Z (multiplied by Z_factor).
    # int(): make_classification requires an integer n_features.
    dim_Z = int(Z_factor*latent_space_dimension(d_v, R))
    # Number of informative features.
    n_informative = round(dim_Z/n_informative_divid)
    # Generation of latent space Z.
    # BUGFIX: this call previously hard-coded n_samples=200, n_features=10,
    # n_informative=2, n_classes=2, class_sep=100, silently ignoring the
    # function arguments and the computed dim_Z/n_informative; it now
    # follows the documented contract (class_sep =
    # n_clusters_per_class*class_sep_factor, per the docstring).
    Z, y = make_classification(n_samples=n_samples, n_features=dim_Z,
                               n_informative=n_informative, n_redundant=0,
                               n_repeated=0, n_classes=n_classes,
                               n_clusters_per_class=n_clusters_per_class,
                               weights=None, flip_y=0,
                               class_sep=n_clusters_per_class*class_sep_factor,
                               random_state=random_state, shuffle=False)
    I_q = np.array([i for i in range(Z.shape[1])])  # 1D-array of Z column indices
    meta_I_v = []
    results = []
    for view in range(n_views):
        # Choose d_v[view] column indices of Z uniformly from I_q,
        # without replacement.
        I_v = np.random.choice(I_q, size=d_v[view], replace=False)
        meta_I_v += list(I_v)
        # Projection of Z along the columns in I_v.
        X_v = projection(Z, I_v)
        results.append((X_v, I_v))
        # Remove floor(R*d_v[view]) of I_v's indices from I_q, so later
        # views can only partially reuse this view's columns.
        elements_to_remove = np.random.choice(I_v, size=floor(R*d_v[view]), replace=False)
        I_q = np.setdiff1d(I_q, elements_to_remove)
    unsued_dimensions_list = [column for column in I_q if column not in meta_I_v]
    unsued_dimensions_percent = round((len(unsued_dimensions_list) / dim_Z)*100, 2)
    return Z, y, results, unsued_dimensions_percent, n_informative
def results_to_csv(path, latent_space, integer_labels, multiviews_list):
    """
    Write len(multiviews_list) + 2 csv files under the indicated path.

    File names:
        latent_space.csv for latent_space
        integer_labels.csv for integer_labels
        view0.csv, view1.csv, ... for the entries of multiviews_list

    Parameters:
    -----------
    path : str
        destination prefix (expected to end with a path separator)
    latent_space : array
    integer_labels : 1D array
    multiviews_list : list of (X_v, I_v) tuples

    Returns:
    --------
    None
    """
    pd.DataFrame(latent_space).to_csv(path+'latent_space.csv', index=False)
    pd.DataFrame(integer_labels).to_csv(path+'integer_labels.csv', index=False)
    # One file per view, columns named after the latent-space column indices.
    for num_view, (view_data, view_columns) in enumerate(multiviews_list):
        view_frame = pd.DataFrame(view_data, columns=view_columns)
        view_frame.to_csv(path+'view'+str(num_view)+'.csv', index=False)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 26 13:53:05 2019
@author: bernardet
"""
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import numpy as np
# General parameters
n_samples = 1000
# number of samples (int)
n_views = 3
# number of views >= 2 (int)
n_classes = 2
# number of classes >= 2 (int)
Z_factor = 250
# multiplication factor of Z dimension (default value = 1)
R = 2/3
# redundancy (float)
cv = 10
# number of cross-validation splits (int)
n_clusters_per_class = 2
# number of clusters per class >= 1 (int)
class_sep_factor = 2
# factor >= 1 as class_sep = n_clusters_per_class*class_sep_factor
n_informative_divid = 1
# factor >= 1 as number of informative features = round(dimension of latent space / n_informative_divid)
classifier = "SVM"
# name of classifier (str), key into classifier_dictionary below
classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}
# dictionary of classifiers
n_samples_list = [100, 500, 1000, 1500, 2000]#, 2500, 3000]#, 3500, 4000, 5000, 7000, 10000]
# list of number of samples to test generator
R_list = list(np.arange(0, 1.05, 0.05))
# list of diverse R
Z_factor_list = [1, 3, 10, 25, 100, 250, 1000]#[25, 50, 75, 100, 150, 200, 250, 500, 600, 750, 800, 900, 1000]
# list of diverse Z_factor
n_views_list = [n_view for n_view in range(2, 10)]
# list of diverse n_views
class_sep_factor_list = [2, 5, 10]
# list of diverse class_sep_factor
n_informative_divid_list = [1, 2, 3]
# list of diverse n_informative_divid
path_data = "/home/bernardet/Documents/StageL3/Data/"
# path to register the multiview dataset
path_graph = "/home/bernardet/Documents/StageL3/Graph/"
# path to register scores graph
# Parameters of gaussian distribution N((d+D)/2, standard_deviation_2) :
# d <= dim[v] <= D for all v
# (d+D)/2 - 3*sqrt(standard_deviation_2) >= 0
d = 4
# < D, > 0
D = 10
# > d
standard_deviation = 2
# standard deviation of the gaussian distribution
# make_classification parameters :
# TODO: figure out how to use these
part_informative = 0
# proportion of informative features (float between 0 and 1)
part_redundant = 1
# proportion of redundant features (float between 0 and 1)
# n_redundant >= 1 for redundant
part_repeated = 1
# proportion of repeated features (float between 0 and 1)
# n_repeated >= 1 for useless features and correlation
weights = [0.7, 0.3]
# proportion of samples assigned to each class (list);
# len(weights) should equal n_classes (TODO confirm against make_classification)
# != [0.5, 0.5] / = [0.8, 0.2] for imbalance
flip_y = 0.1
# fraction of samples whose class are randomly exchanged (float)
# > 0 for noise
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 27 16:14:14 2019
@author: bernardet
"""
import parameters
from multiviews_datasets import generator_multiviews_dataset, results_to_csv
from tests.test_classifier import score_multiviews_n_samples, graph_comparaison_classifier_scores_n_samples, score_multiviews_R, score_multiviews_Z_factor, score_multiviews_n_views_R, score_multiviews_class_sep, score_one_multiview_dataset, score_multiviews_n_informative_divided
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# ---------------------------------------------------------------------------
# Experiment driver: loads defaults from the parameters module, then runs a
# grid of scoring experiments on generated multiview datasets.  Most of the
# experiment calls below are kept commented out; only the final Z_factor
# sweep over n_views is active.
# ---------------------------------------------------------------------------
n_samples = parameters.n_samples
n_views = parameters.n_views
# Some defaults are overridden inline; the trailing "#parameters.x" comment
# keeps track of the original source of each value.
n_classes = 3#parameters.n_classes
Z_factor = parameters.Z_factor
R = parameters.R
n_clusters_per_class = 1#parameters.n_clusters_per_class
class_sep_factor = 2#5#2#parameters.class_sep_factor
n_informative_divid = 2#parameters.n_informative_divid
cv = parameters.cv
classifier = parameters.classifier
classifier_dictionary = parameters.classifier_dictionary
d = parameters.d
D = parameters.D
standard_deviation = parameters.standard_deviation
path_data = parameters.path_data
path_graph = parameters.path_graph
n_samples_list = parameters.n_samples_list
R_list = parameters.R_list
Z_factor_list = parameters.Z_factor_list
n_views_list = parameters.n_views_list
class_sep_factor_list = parameters.class_sep_factor_list
n_informative_divid_list = parameters.n_informative_divid_list
# Generate one dataset
#Z, y, multiviews_list, unsued_columns_percent = generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
#print(Z, y, multiviews_list)
# Register one multiview dataset
#results_to_csv(path, Z, y, multiviews_list)
# Score of one multiview dataset
#df_dimensions, df_scores_means, df_scores_std = score_one_multiview_dataset(cv, classifier, classifier_dictionary, n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
#print(df_dimensions, df_scores_means, df_scores_std)
# Scores of n_samples_list datasets
#mean_samples, std_samples = score_multiviews_n_samples(n_samples_list, path_graph, cv, classifier, classifier_dictionary, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
#print(mean_samples, std_samples)
# Plot scores classifier2 vs score classifier1
classifier1 = "SVM"
classifier2 = "NB"
#graph_comparaison_classifier_scores_n_samples(classifier1, classifier2, n_samples_list, path_graph, cv, classifier_dictionary, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
# Scores of R_list datasets
#mean_R, std_R = score_multiviews_R(R_list, path_graph, cv, classifier, classifier_dictionary, n_samples, n_views, n_classes, Z_factor, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
#print(mean_R, std_R)
# Scores of Z_factor_list datasets
#mean_Z, std_Z, error_Z = score_multiviews_Z_factor(Z_factor_list, path_graph, cv, classifier, classifier_dictionary, n_samples, n_views, n_classes, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
#print(mean_Z, std_Z, error_Z)
# Scores divided by scores for R=1 (redundancy null) of n_views_list and R_list datasets
#dict_n_views_R_ratio = score_multiviews_n_views_R(n_views_list, R_list, path_graph, cv, classifier, classifier_dictionary, n_samples, n_classes, Z_factor, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
#print(dict_n_views_R_ratio)
# Scores of class_sep_factor_list datasets
#df_mean, df_std = score_multiviews_class_sep(class_sep_factor_list, path_data, path_graph, cv, classifier, classifier_dictionary, n_views, n_samples, n_classes, Z_factor, R, n_clusters_per_class, n_informative_divid, d, D, standard_deviation)
#print(df_mean, df_std)
# Scores of n_informative_divid_list datasets
#mean_n_info, std_n_info = score_multiviews_n_informative_divided(n_informative_divid_list, path_graph, cv, classifier, classifier_dictionary, n_views, n_samples, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, d, D, standard_deviation)
#print(mean_n_info, std_n_info)
# NOTE(review): the assignments below override the values loaded from
# `parameters` above, just for the active sweep that follows.
Z_factor_list = [1, 3, 10, 25, 100, 250, 1000]
path_graph = "/home/bernardet/Documents/StageL3/Graph/n_views_3_10_1_clus_2_n_info_div/"
n_classes = 2
n_clusters_per_class = 1
class_sep_factor = 2
n_informative_divid = 2
# Active experiment: sweep the number of views, scaling the sample count
# linearly with n_views (500 samples per view).
for n_views in range(3, 11):
    n_samples = 500*n_views
    mean_Z, std_Z, error_Z = score_multiviews_Z_factor(Z_factor_list, path_graph, cv, classifier, classifier_dictionary, n_samples, n_views, n_classes, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation)
This diff is collapsed.
import unittest
import numpy as np
from ..multiple_sub_problems import MultiViewSubProblemsGenerator
class Test_MultiVieSubProblemsGenerator():
    """
    Fixture values intended for testing MultiViewSubProblemsGenerator.

    NOTE(review): this class neither subclasses unittest.TestCase nor
    follows pytest conventions (it defines __init__ and contains no
    test_* methods), so no test runner will collect or execute it —
    presumably work in progress; confirm before relying on it.
    The class name also looks like a typo for "MultiView...".
    """

    def __init__(self):
        # Per-(class, view) confusion matrix handed to the generator:
        # one row per class, one column per view.
        self.conf = np.array([
            np.array([0.0, 0.1, 0.1, 0.9]),
            np.array([0.0, 0.2, 0.1, 0.0]),
            np.array([0.0, 0.3, 0.1, 0.0]),
            np.array([0.0, 0.4, 0.2, 0.0]),
            np.array([0.0, 0.5, 0.2, 0.0]),
            np.array([0.0, 0.6, 0.2, 0.0]),
            np.array([0.0, 0.7, 0.2, 0.0]),
            np.array([0.0, 0.8, 0.1, 0.]),
        ])
        self.n_views = 4
        self.n_folds = 10
        self.n_classes = 8
        self.n_samples = 2000
        self.class_sep = 1.5
        # Class proportions; len equals n_classes.
        self.class_weights = [0.125, 0.1, 0.15, 0.125, 0.01, 0.2, 0.125, 0.125, ]
import unittest
import numpy as np
from ..update_baptiste import MultiviewDatasetGenetator
class TestSubSmaple(unittest.TestCase):
    """
    Unit tests for MultiviewDatasetGenetator.sub_sample.

    NOTE(review): the expected index arrays below are tied to the exact
    consumption order of np.random.RandomState(42); any change to
    sub_sample's internal RNG usage will break them.  The names
    "SubSmaple" and "beggining" look like typos, but renaming them is
    out of scope for a comment-only pass.
    """

    @classmethod
    def setUpClass(cls):
        # Shared seeded fixtures: deterministic RNG and 100 consecutive indices.
        cls.random_state = np.random.RandomState(42)
        cls.indices = np.arange(100)
        cls.quantity = 10
        cls.method = "block"
        cls.beggining = 0
        cls.generator = MultiviewDatasetGenetator(random_state=cls.random_state)

    def test_block_simple(self):
        # "block" sampling from position 0 returns the first `quantity` indices.
        chosen_indices = self.generator.sub_sample(self.indices, self.quantity, self.method, self.beggining)
        np.testing.assert_array_equal(np.array([0,1,2,3,4,5,6,7,8,9]), chosen_indices)

    def test_block_too_big(self):
        # Requesting more than available falls back to all indices.
        chosen_indices = self.generator.sub_sample(self.indices, 121,
                                                   self.method, self.beggining)
        np.testing.assert_array_equal(np.arange(100),
                                      chosen_indices)

    def test_block_no_beg(self):
        # With no starting position, the block start is drawn from the
        # seeded RNG (hence the exact expected offsets).
        chosen_indices = self.generator.sub_sample(self.indices, 10,
                                                   self.method, None)
        np.testing.assert_array_equal(np.array([82, 83, 84, 85, 86, 87, 88, 89, 90, 91,]),
                                      chosen_indices)

    def test_block_no_beg_too_long(self):
        # Oversized request with no starting position still yields everything.
        chosen_indices = self.generator.sub_sample(self.indices, 120,
                                                   self.method, None)
        np.testing.assert_array_equal(np.arange(100),
                                      chosen_indices)

    def test_choice_simple(self):
        # "choice" sampling draws indices via the seeded RNG.
        chosen_indices = self.generator.sub_sample(self.indices, 10,
                                                   "choice")
        np.testing.assert_array_equal(np.array([77, 10, 4, 83, 62, 67, 30, 45, 95, 11]),
                                      chosen_indices)

    def test_choice_too_big(self):
        # An oversized "choice" request returns each index exactly once.
        chosen_indices = self.generator.sub_sample(self.indices, 105,
                                                   "choice")
        self.assertEqual(100, chosen_indices.shape[0])
        self.assertEqual(100, np.unique(chosen_indices).shape[0])


if __name__ == '__main__':
    unittest.main()
This diff is collapsed.
import os
import numpy as np
import parameters
from multiviews_datasets import generator_multiviews_dataset, results_to_csv
from tests.test_classifier import score_multiviews_n_samples,