Commit 4513c343 authored by Baptiste Bauvin's avatar Baptiste Bauvin
Browse files

debugging

parent 40692642
# Default ignored files
/workspace.xml
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_12" default="false" project-jdk-name="Python 3.6 (develop)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/multiview_generator.iml" filepath="$PROJECT_DIR$/.idea/multiview_generator.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>
\ No newline at end of file
from . import generator
from . import demo
......@@ -63,7 +63,7 @@ def projection(latent_space, chosen_columns_list):
return latent_space[:, chosen_columns_list]
def generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation):
def generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class, class_sep_factor, n_informative_divid, d, D, standard_deviation, random_state=42):
"""
Returns a generator multiviews dataset
......@@ -140,9 +140,9 @@ def generator_multiviews_dataset(n_samples, n_views, n_classes, Z_factor, R, n_c
# Number of informative features
n_informative = round(dim_Z/n_informative_divid)
# Generation of latent space Z
Z, y = make_classification(n_samples=n_samples, n_features=dim_Z, n_informative=n_informative, n_redundant=0,
n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters_per_class, weights=None,
flip_y=0.01, class_sep=n_clusters_per_class*class_sep_factor, random_state=None)
Z, y = make_classification(n_samples=200, n_features=10, n_informative=2, n_redundant=0,
n_repeated=0, n_classes=2, n_clusters_per_class=1, weights=None,
flip_y=0, class_sep=100, random_state=random_state, shuffle=False)
I_q = np.array([i for i in range(Z.shape[1])]) # 1D-array of Z columns numero
meta_I_v = []
......
......@@ -5,10 +5,9 @@ Created on Wed Nov 27 16:14:14 2019
@author: bernardet
"""
import parameters
from multiviews_datasets import generator_multiviews_dataset, results_to_csv
from test_classifier import score_multiviews_n_samples, graph_comparaison_classifier_scores_n_samples, score_multiviews_R, score_multiviews_Z_factor, score_multiviews_n_views_R, score_multiviews_class_sep, score_one_multiview_dataset, score_multiviews_n_informative_divided
from tests.test_classifier import score_multiviews_n_samples, graph_comparaison_classifier_scores_n_samples, score_multiviews_R, score_multiviews_Z_factor, score_multiviews_n_views_R, score_multiviews_class_sep, score_one_multiview_dataset, score_multiviews_n_informative_divided
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
......
import os
import numpy as np
import parameters
from multiviews_datasets import generator_multiviews_dataset, results_to_csv
from tests.test_classifier import score_multiviews_n_samples, graph_comparaison_classifier_scores_n_samples, score_multiviews_R, score_multiviews_Z_factor, score_multiviews_n_views_R, score_multiviews_class_sep, score_one_multiview_dataset, score_multiviews_n_informative_divided
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Debug script: generate a small multiview dataset, flip the labels of the
# first ten samples, and export the latent space, labels and views as CSV.

# Generator settings (passed positionally to generator_multiviews_dataset below).
n_samples = 100  # number of samples in the dataset
n_views = 3  # number of views in the dataset
n_classes = 2  # number of classes in the dataset
Z_factor = 1  # multiplier applied to the minimal latent space dimension
R = 0  # redundancy parameter, 0 <= R <= 1
n_clusters_per_class = 1  # number of clusters for each class
class_sep_factor = 100  # class_sep = n_clusters_per_class * class_sep_factor
n_informative_divid = 1  # divides the number of informative latent features
standard_deviation = 2  # spread of the gaussian used to draw view dimensions
d = 4  # minimal dimension of a view
D = 10  # maximal dimension of a view

path = "/home/baptiste/Documents/Datasets/Generated/try_outlier/"
# makedirs also creates missing parent directories, and exist_ok avoids the
# check-then-create race of `if not exists: mkdir`.
os.makedirs(path, exist_ok=True)

Z, y, results, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(
    n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class,
    class_sep_factor, n_informative_divid, d, D, standard_deviation)
print(y[:10])
print(unsued_dimensions_percent)
print(n_informative)
print(Z.shape)

# Flip the labels of the first ten samples (0 <-> 1) to create label outliers.
y[:10] = np.invert(y[:10].astype(bool)).astype(int)
print(y[:10])

results_to_csv(path, Z, y, results)
import os
import numpy as np
from multiviews_datasets_generator import generator_multiviews_dataset, results_to_csv
# Script: generate a multiview dataset, flip the labels of `n_outliers`
# randomly chosen samples, and export everything as CSV files.

n_samples = 200  # Number of samples in the dataset
n_views = 4  # Number of views in the dataset
n_classes = 2  # Number of classes in the dataset
Z_factor = 1  # Z dim = latent_space_dim * z_factor
R = 0  # Percentage of non-redundant features in the view
n_clusters_per_class = 1  # Number of clusters for each class
class_sep_factor = 100  # Separation between the different classes
n_informative_divid = 1  # Divides the number of informative features in the latent space
standard_deviation = 2  # Spread of the gaussian used to draw view dimensions
d = 4  # Minimal dimension of a view
D = 10  # Maximal dimension of a view
random_state = 42  # Seed used below to pick the samples whose label is flipped
n_outliers = 10  # Number of samples whose label is flipped

path = "/home/baptiste/Documents/Datasets/Generated/outliers_dset/"
# makedirs also creates missing parent directories, and exist_ok avoids the
# check-then-create race of `if not exists: mkdir`.
os.makedirs(path, exist_ok=True)

# NOTE(review): the seed is not forwarded to the generator here, so the
# generated data itself presumably still varies between runs - confirm.
Z, y, results, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(
    n_samples, n_views, n_classes, Z_factor, R, n_clusters_per_class,
    class_sep_factor, n_informative_divid, d, D, standard_deviation)
print(unsued_dimensions_percent)
print(n_informative)
print(Z.shape)

# Draw WITHOUT replacement: with the default replace=True the same index can
# be drawn twice, which would flip fewer than n_outliers distinct labels.
changing_labels_indices = np.random.RandomState(random_state).choice(
    np.arange(y.shape[0]), n_outliers, replace=False)
# Flip the selected labels (0 <-> 1).
y[changing_labels_indices] = np.invert(y[changing_labels_indices].astype(bool)).astype(int)

results_to_csv(path, Z, y, results)
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 26 15:38:38 2019
@author: bernardet
"""
from sklearn.datasets import make_classification
from random import gauss
from math import ceil, floor
import numpy as np
import pandas as pd
def latent_space_dimension(views_dimensions_list, R):
    """
    Returns the minimal latent space dimension needed by
    generator_multiviews_dataset to build views of the given dimensions

    Parameters:
    -----------
    views_dimensions_list : list
                            dimension of each view, in the order the views are built
    R : float
        fraction of each view's columns withdrawn from the pool of available
        columns once that view has been built

    Returns:
    --------
    an int
    """
    # Starting point: at least the largest view, and at least R * (sum of all views).
    dimension = max(ceil(R*sum(views_dimensions_list)), max(views_dimensions_list))

    available = dimension  # columns still in the pool before the current view
    removed_total = 0      # columns withdrawn from the pool so far

    # Walk over consecutive (previous view, current view) pairs.
    for previous_dim, current_dim in zip(views_dimensions_list, views_dimensions_list[1:]):
        removed = floor(R*previous_dim)
        removed_total += removed
        # Number of columns missing to build the current view from what is left.
        shortfall = current_dim - (available - removed)
        if shortfall > 0:
            dimension += shortfall
        available = dimension - removed_total

    return dimension
def projection(latent_space, chosen_columns_list):
    """
    Selects the columns listed in chosen_columns_list from latent_space,
    keeping every row and following the order of chosen_columns_list

    Parameters:
    -----------
    latent_space : array
    chosen_columns_list : list

    Returns:
    --------
    an array of dimension (number of rows of latent_space, length of chosen_columns_list)
    """
    every_row = slice(None)
    return latent_space[every_row, chosen_columns_list]
def generator_multiviews_dataset(n_samples=1000, n_views=3, n_classes=2, Z_factor=250, R=2/3, n_clusters_per_class=1, class_sep_factor=2, n_informative_divid=2, d=2, D=12, standard_deviation=2, random_state=None):
    """
    Returns a generator multiviews dataset

    A latent space Z is generated with sklearn's make_classification, then each
    view is built by projecting Z on a random subset of its columns.
    The latent space dimension is printed on stdout.

    Parameters:
    -----------
    n_samples : int
                dataset number of samples (number of rows of dataset)
    n_views : int >= 2
              dataset number of views
              one view is a set of some features (columns) of the latent space
    n_classes : int >= 2
                dataset number of classes
    Z_factor : float >= 1
               minimal dimension of the latent space (enough to build the dataset) is computed then multiplied by Z_factor
               (the product is rounded to the nearest integer)
    R : 0 <= float <= 1
        R = 1 <> no possibility of redundancy between views
        R = 0 <> maximal possibility of redundancy between views
    n_clusters_per_class : int >= 1
    class_sep_factor : float >= 0
                       class_sep = n_clusters_per_class*class_sep_factor
    n_informative_divid : float >= 1
                          n_informative_divid raises <> number of non-informative features raises
                          n_informative_divid = 1 <> no non-informative features, number of informative features = dimension of latent space
                          number of informative features = round(dimension of latent space / n_informative_divid)
    d : float >= 1
        minimal dimension of views
        dimension of views (int) chosen randomly from N((d+D)/2, standard_deviation^2) with d <= dimension of views <= D
    D : float >= d
        maximal dimension of views
        dimension of views (int) chosen randomly from N((d+D)/2, standard_deviation^2) with d <= dimension of views <= D
    standard_deviation : float
                         standard deviation of the gaussian distribution N((d+D)/2, standard_deviation^2)
                         dimension of views (int) chosen randomly from N((d+D)/2, standard_deviation^2) with d <= dimension of views <= D
    random_state : None, int or numpy RandomState, optional (default None)
                   source of randomness for every random draw of the function
                   None <> the global numpy random state is used
                   int <> seed, the same seed gives the same dataset

    Returns:
    --------
    Z : an array of dimension (n_samples, dimension of latent space) = the generated samples
    y : an array of dimension (n_samples) = the integer labels for class membership of each sample
    a list of n_views tuples (X_v, I_v) with :
        X_v = Z projected along d_v (= dimension of the v-ith views) columns in I_v
        I_v = X_v columns numeros with numberring of Z columns numeros
    unsued_dimensions_percent : percentage of unsued columns of latent space in views
    n_informative : number of informative features (dimension of latent space - n_informative = number of non informative features)

    Raises:
    -------
    ValueError if one of the constraints listed above is not satisfied
    """
    # Imported here so the block does not rely on an extra file-level import;
    # scikit-learn is already a dependency of this module (make_classification).
    from sklearn.utils import check_random_state

    if n_views < 2:
        raise ValueError("n_views >= 2")
    if n_classes < 2:
        raise ValueError("n_classes >= 2")
    if Z_factor < 1:
        raise ValueError("Z_factor >= 1 pour le bon fonctionnement de l'algorithme")
    if (R < 0) or (R > 1):
        raise ValueError("0 <= R <= 1")
    if n_clusters_per_class < 1:
        raise ValueError("n_clusters_per_class >= 1")
    if class_sep_factor < 0:
        raise ValueError("class_sep_factor >= 0")
    if n_informative_divid < 1:
        raise ValueError("n_informative_divid >= 1")
    if d < 1:
        raise ValueError("d >= 1")
    if D < d:
        # Without this check no value can satisfy d <= value <= D and the
        # redraw loop below would never terminate.
        raise ValueError("D >= d")
    if (d+D)/2 - 3*standard_deviation < 1:
        raise ValueError("Il faut que (d+D)/2 - 3*standard_deviation >= 1 pour avoir des valeurs positives non nulles lors de l'emploi de la loi normale")

    # One random generator for the whole function: None -> global numpy state,
    # int -> fresh seeded generator, RandomState -> used as is.
    rng = check_random_state(random_state)

    # n_views dimensions of views drawn from N((d+D)/2, standard_deviation^2);
    # any value outside [d, D] is redrawn until it fits.
    mean_dimension = (d+D)/2
    d_v = []
    for dim_view in rng.normal(loc=mean_dimension, scale=standard_deviation, size=n_views):
        while dim_view < d or dim_view > D:  # 1 <= d <= dim_view <= D
            dim_view = rng.normal(loc=mean_dimension, scale=standard_deviation)
        d_v.append(int(dim_view))  # dimension of views = integer
    # d_v = list of views dimension from the highest to the lowest
    d_v.sort(reverse=True)

    # Dimension of latent space Z (multiplied by Z_factor); rounded to an int
    # because make_classification needs an integer number of features.
    dim_Z = int(round(Z_factor*latent_space_dimension(d_v, R)))
    print(dim_Z)
    # Number of informative features
    n_informative = round(dim_Z/n_informative_divid)
    # Generation of latent space Z
    Z, y = make_classification(n_samples=n_samples, n_features=dim_Z, n_informative=n_informative, n_redundant=0,
                               n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters_per_class, weights=None,
                               flip_y=0.00, class_sep=n_clusters_per_class*class_sep_factor, random_state=rng)

    I_q = np.arange(Z.shape[1])  # 1D-array of Z columns numero still available
    used_columns = set()  # every Z column numero used by at least one view
    results = []
    for view_dimension in d_v:
        # choice of view_dimension numeros of Z columns uniformly from I_q, without replacement
        I_v = rng.choice(I_q, size=view_dimension, replace=False)
        used_columns.update(I_v.tolist())
        # projection of Z along the columns in I_v
        X_v = projection(Z, I_v)
        results.append((X_v, I_v))
        # remove floor(R*view_dimension) columns numeros of I_v from I_q, chosen without replacement
        elements_to_remove = rng.choice(I_v, size=floor(R*view_dimension), replace=False)
        I_q = np.setdiff1d(I_q, elements_to_remove)  # I_q less elements from elements_to_remove

    # Columns still available at the end that no view ever used.
    unsued_dimensions_count = sum(1 for column in I_q.tolist() if column not in used_columns)
    unsued_dimensions_percent = round((unsued_dimensions_count / dim_Z)*100, 2)
    return Z, y, results, unsued_dimensions_percent, n_informative
def results_to_csv(path, latent_space, integer_labels, multiviews_list):
    """
    Writes the dataset as csv files: one for the latent space, one for the
    labels and one per view (length of multiviews_list + 2 files in total)

    File names are appended directly to path :
        latent_space.csv for latent_space
        integer_labels.csv for integer_labels
        view0.csv for multiviews_list[0], view1.csv for multiviews_list[1], ...

    Parameters:
    -----------
    path : str
           prefix of every file name (no separator is inserted after it)
    latent_space : array
    integer_labels : 1D array
    multiviews_list : list of tuples
                      first element = data of the view, second element = its column labels

    Returns:
    --------
    None
    """
    pd.DataFrame(latent_space).to_csv(path+'latent_space.csv', index=False)
    pd.DataFrame(integer_labels).to_csv(path+'integer_labels.csv', index=False)

    for view_number, view_entry in enumerate(multiviews_list):
        view_data, view_columns = view_entry[0], view_entry[1]
        view_frame = pd.DataFrame(view_data, columns=view_columns)
        view_frame.to_csv(path+'view'+str(view_number)+'.csv', index=False)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 26 13:53:05 2019
@author: bernardet
"""
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import numpy as np
# General parameters: default settings for the dataset generator and the
# classifier experiments.

# number of samples (int)
n_samples = 1000
# number of views (int); generator_multiviews_dataset requires >= 2
n_views = 3
# number of classes (int); generator_multiviews_dataset requires >= 2
n_classes = 2
# multiplication factor of the latent space Z dimension (1 = minimal dimension)
Z_factor = 250
# redundancy parameter (float); generator_multiviews_dataset requires 0 <= R <= 1
R = 2/3
# number of cross-validation splits (int)
cv = 10
# number of clusters per class (int); generator_multiviews_dataset requires >= 1
n_clusters_per_class = 1
# class separation factor, class_sep = n_clusters_per_class*class_sep_factor;
# generator_multiviews_dataset requires >= 0
class_sep_factor = 2
# factor >= 1, number of informative features = round(dimension of latent space / n_informative_divid)
n_informative_divid = 2
# name of classifier (str) - presumably a key of classifier_dictionary; confirm against callers
classifier = "SVM"
# dictionary of classifiers: name -> scikit-learn estimator instance
classifier_dictionary={'SVM':SVC(kernel='linear'), 'NB':GaussianNB()}
# Lists of values swept by the experiments, one parameter at a time.

# list of number of samples to test generator
n_samples_list = [100, 500, 1000, 1500, 2000]
# list of diverse R: 0 to 1 in steps of 0.05
R_list = list(np.arange(0, 1.05, 0.05))
# list of diverse Z_factor
Z_factor_list = [1, 3, 10, 25, 100, 250, 1000]
# list of diverse n_views: 2 to 9
n_views_list = list(range(2, 10))
# list of diverse class_sep_factor
class_sep_factor_list = [2, 5, 10]
# list of diverse n_informative_divid
n_informative_divid_list = [1, 2, 3]

# path to register the multiview dataset
path_data = "/home/bernardet/Documents/StageL3/Data/"
# path to register scores graph
path_graph = "/home/bernardet/Documents/StageL3/Graph/"

# Parameters of gaussian distribution N((d+D)/2, standard_deviation^2) used to
# draw the dimension of each view:
# d <= dim[v] <= D for all v
# (d+D)/2 - 3*standard_deviation >= 1 (checked by generator_multiviews_dataset)

# minimal dimension of views: < D, > 0
d = 4
# maximal dimension of views: > d
D = 12
# standard deviation of the gaussian distribution
standard_deviation = 2
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment