From 701366a4662b112f7577ad334af1ceec2458d5be Mon Sep 17 00:00:00 2001
From: Baptiste Bauvin <baptiste.bauvin@lis-lab.fr>
Date: Tue, 28 Jan 2020 09:45:09 +0100
Subject: [PATCH] Worked on adaptation

---
 .idea/multiview_generator.iml                 |   5 +-
 generator/update_baptiste.py                  | 286 ++++++++++++++++
 ...ltiviews_datasets_generator.cpython-36.pyc | Bin 7364 -> 7621 bytes
 late/execute.py                               | 315 ++++++++++++++++--
 late/multiviews_datasets_generator.py         |  25 +-
 5 files changed, 593 insertions(+), 38 deletions(-)
 create mode 100644 generator/update_baptiste.py

diff --git a/.idea/multiview_generator.iml b/.idea/multiview_generator.iml
index d6ebd48..8427e77 100644
--- a/.idea/multiview_generator.iml
+++ b/.idea/multiview_generator.iml
@@ -1,9 +1,10 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="JAVA_MODULE" version="4">
-  <component name="NewModuleRootManager" inherit-compiler-output="true">
-    <exclude-output />
+  <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
     <orderEntry type="inheritedJdk" />
     <orderEntry type="sourceFolder" forTests="false" />
+    <orderEntry type="library" name="R User Library" level="project" />
+    <orderEntry type="library" name="R Skeletons" level="application" />
   </component>
 </module>
\ No newline at end of file
diff --git a/generator/update_baptiste.py b/generator/update_baptiste.py
new file mode 100644
index 0000000..a3810c0
--- /dev/null
+++ b/generator/update_baptiste.py
@@ -0,0 +1,286 @@
+import os
+import yaml
+import numpy as np
+from sklearn.datasets import make_classification
+from random import gauss
+from math import ceil, floor
+import pandas as pd
+import shutil
+import h5py
+
+class MultiviewDatasetGenerator:
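+    """Generates a multiview dataset by projecting a latent space Z, drawn
+    with sklearn's make_classification, onto one column subset per view."""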
+
+    def __init__(self, n_samples=100, n_views=2, n_classes=2,
+                                Z_factor=2,
+                                R=0,
+                                n_clusters_per_class=1,
+                                class_sep_factor=10,
+                                n_informative_divid=2,
+                                d=4,
+                                D=10,
+                                standard_deviation=2,
+                                weights=None,
+                                flip_y=0.0,
+                                random_state=42, config_path=None):
+        if config_path is not None:
+            with open(config_path) as config_file:
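+                # The YAML file is expected to map constructor argument names
+                # to values; re-calling __init__ with them overrides the
+                # defaults. Hypothetical config:
+                #   n_samples: 100
+                #   n_views: 4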
+                args = yaml.safe_load(config_file)
+                self.__init__(**args)
+        else:
+            self.n_samples = n_samples
+            self.n_views = n_views
+            self.n_classes = n_classes
+            self.Z_factor = Z_factor
+            self.R = R
+            self.n_clusters_per_class = n_clusters_per_class
+            self.class_sep_factor = class_sep_factor
+            self.n_informative_divid = n_informative_divid
+            self.d = d
+            self.D = D
+            self.standard_deviation = standard_deviation
+            self.weights = weights
+            self.flip_y = flip_y
+            self.random_state = random_state
+
+    def generate(self):
+        if self.n_views < 2:
+            raise ValueError("n_views must be >= 2")
+        if self.n_classes < 2:
+            raise ValueError("n_classes must be >= 2")
+        if self.Z_factor < 1:
+            raise ValueError(
+                "Z_factor must be >= 1 for the algorithm to work properly")
+        if (self.R < 0) or (self.R > 1):
+            raise ValueError("R must satisfy 0 <= R <= 1")
+        if self.n_clusters_per_class < 1:
+            raise ValueError("n_clusters_per_class must be >= 1")
+        if self.class_sep_factor < 0:
+            raise ValueError("class_sep_factor must be >= 0")
+        if self.n_informative_divid < 1:
+            raise ValueError("n_informative_divid must be >= 1")
+        if self.d < 1:
+            raise ValueError("d must be >= 1")
+        if (self.d + self.D) / 2 - 3 * self.standard_deviation < 1:
+            raise ValueError(
+                "(d+D)/2 - 3*standard_deviation must be >= 1 so that the "
+                "normal law yields strictly positive view dimensions")
+
+        # Draw the n_views view dimensions from N((d+D)/2, standard_deviation^2)
+        d_v = np.random.normal(loc=(self.d + self.D) / 2,
+                               scale=self.standard_deviation,
+                               size=self.n_views)
+        d_v = list(d_v)
+        remove_list, add_list = [], []
+        for dim_view in d_v:
+            if dim_view < self.d or dim_view > self.D:  # 1 <= d <= dim_view <= D
+                remove_list.append(dim_view)
+                add = -1
+                while add < self.d or add > self.D:
+                    add = gauss((self.d + self.D) / 2, self.standard_deviation)
+                add_list.append(add)
+        d_v = [view for view in d_v if view not in remove_list] + add_list
+        d_v = [int(view) for view in d_v]  # view dimensions must be integers
+        # sort view dimensions from highest to lowest
+        d_v.sort(reverse=True)
+        # Dimension of latent space Z (multiplied by Z_factor)
+        self.dim_Z = self.Z_factor * self.latent_space_dimension(d_v)
+        # Number of informative features
+        self.n_informative = round(self.dim_Z / self.n_informative_divid)
+        # Generation of latent space Z
+        self.Z, self.y = make_classification(n_samples=self.n_samples, n_features=self.dim_Z,
+                                   n_informative=self.n_informative, n_redundant=0,
+                                   n_repeated=0, n_classes=self.n_classes,
+                                   n_clusters_per_class=self.n_clusters_per_class,
+                                   weights=self.weights,
+                                   flip_y=self.flip_y,
+                                   class_sep=self.n_clusters_per_class * self.class_sep_factor,
+                                   random_state=self.random_state, shuffle=False)
+        I_q = np.arange(self.Z.shape[1])
+        meta_I_v = []
+        self.results = []
+        for view in range(self.n_views):
+            # choose d_v[view] column indices of Z uniformly from I_q
+            I_v = np.random.choice(I_q, size=d_v[view],
+                                   replace=False)  # drawn from I_q without replacement
+            meta_I_v += list(I_v)
+            # projection of Z along the columns in I_v
+            X_v = self.projection(I_v)
+            self.results.append((X_v, I_v))
+            # remove floor(R*d_v[view]) of I_v's column indices from I_q
+            elements_to_remove = np.random.choice(I_v,
+                                                  size=floor(self.R * d_v[view]),
+                                                  replace=False)  # drawn from I_v without replacement
+            I_q = np.setdiff1d(I_q,
+                               elements_to_remove)  # I_q without the removed indices
+        self.unused_dimensions_list = [column for column in I_q if
+                                       column not in meta_I_v]
+        self.unused_dimensions_percent = round(
+            (len(self.unused_dimensions_list) / self.dim_Z) * 100, 2)
+
+    def projection(self, chosen_columns_list):
+        """
+        Returns the projection of latent_space on the columns of chosen_columns_list (in chosen_columns_list order)
+
+        Parameters:
+        -----------
+        chosen_columns_list : list
+
+        Returns:
+        --------
+        an array of dimension (number of rows of latent_space, length of chosen_columns_list)
+        """
+        return self.Z[:, chosen_columns_list]
+
+    def latent_space_dimension(self, views_dimensions_list):
+        """
+        Returns the minimal dimension of latent space (enough to build the dataset) for generator_multiviews_dataset compared to views_dimensions_list
+
+        Parameters:
+        -----------
+        views_dimensions_list : list
+        R : float
+
+        Returns:
+        --------
+        an int
+        """
+        max_view_dimension = max(views_dimensions_list)
+        dimension = ceil(self.R * sum(views_dimensions_list))
+
+        if dimension < max_view_dimension:
+            dimension = max_view_dimension
+
+        reduced_dimension = dimension
+        remove_sum = 0
+
+        for num_view in range(1, len(views_dimensions_list)):
+            view_prec = views_dimensions_list[num_view - 1]
+            view_current = views_dimensions_list[num_view]
+            remove = floor(self.R * view_prec)
+            remove_sum += remove
+            if reduced_dimension - remove < view_current:
+                dimension += view_current - (reduced_dimension - remove)
+            reduced_dimension = dimension - remove_sum
+
+        return dimension
+
+    def to_csv(self, saving_path="."):
+        """
+        Create length of multiviews_list + 2 csv files to the indicated path
+        Files name :
+            latent_space.csv for latent_space
+            integer_labels.csv for integer_labels
+            view0.csv for multiviews_list[0]
+
+        Parameters:
+        -----------
+        path : str
+        latent_space : array
+        integer_labels : 1D array
+        multiviews_list : list of tuples
+
+        Returns:
+        --------
+        None
+        """
+        df_latent_space = pd.DataFrame(self.Z)
+        df_latent_space.to_csv(os.path.join(saving_path, 'latent_space.csv')
+                               , index=False)
+
+        df_labels = pd.DataFrame(self.y)
+        df_labels.to_csv(os.path.join(saving_path, 'integer_labels.csv'),
+                         index=False)
+
+        for view_index, view_tuple in enumerate(self.results):
+            df_view = pd.DataFrame(view_tuple[0], columns=view_tuple[1])
+            df_view.to_csv(os.path.join(saving_path,
+                                        'view'+str(view_index)+'.csv'),
+                           index=False)
+
+    def to_hdf5(self, saving_path=".", name="generated_dset"):
+
+        dataset_file = h5py.File(os.path.join(saving_path, name+".hdf5"), 'w')
+
+        labels_dataset = dataset_file.create_dataset("Labels",
+                                                     shape=self.y.shape,
+                                                     data=self.y)
+
+        labels_names = ["Label_1", "Label_0"]
+
+        labels_dataset.attrs["names"] = [
+            label_name.encode() if not isinstance(label_name, bytes)
+            else label_name for label_name in labels_names]
+
+        for view_index, (data, feature_indices) in enumerate(self.results):
+            view_dataset = dataset_file.create_dataset("View" + str(view_index),
+                                                       shape=data.shape,
+                                                       data=data)
+
+            view_dataset.attrs["sparse"] = False
+            view_dataset.attrs["name"] = "GeneratedView" + str(view_index)
+
+        meta_data_grp = dataset_file.create_group("Metadata")
+
+        meta_data_grp.attrs["nbView"] = len(self.results)
+        meta_data_grp.attrs["nbClass"] = np.unique(self.y)
+        meta_data_grp.attrs["datasetLength"] = \
+        self.results[0][0].shape[0]
+
+        meta_data_grp.create_dataset("example_ids", data=np.array(
+            ["gen_example_" + str(ex_indx) for ex_indx in
+             range(self.results[0][0].shape[0])]).astype(
+            np.dtype("S100")), dtype=np.dtype("S100"))
+
+        dataset_file.close()
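+        # Resulting HDF5 layout (sketch of what was written above):
+        #   /Labels                  integer labels, attr "names"
+        #   /View0 ... View{n-1}     one array per view, attrs "sparse", "name"
+        #   /Metadata                attrs "nbView", "nbClass", "datasetLength"
+        #   /Metadata/example_ids    fixed-length (S100) example identifiers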
+
+if __name__=="__main__":
+    n_samples = 100  # Number of samples in the dataset
+    n_views = 4  # Number of views in the dataset
+    n_classes = 2  # Number of classes in the dataset
+    Z_factor = 2  # Z dim = latent_space_dim * z_factor
+    R = 0  # Percentage of non-redundant features in the views
+    n_clusters_per_class = 1  # Number of clusters for each class
+    class_sep_factor = 10000  # Separation between the different classes
+    n_informative_divid = 2  # Divides the number of informative features in the latent space
+    standard_deviation = 2  # Std of the normal law used to draw view dimensions
+    d = 4  # Minimal dimension of a view
+    D = 10  # Maximal dimension of a view
+    flip_y = 0.00  # Proportion of randomly flipped labels
+    random_state = 42
+    weights = None  # The proportions of examples in each class
+
+    path = "/home/baptiste/Documents/Datasets/Generated/metrics_dset/"
+    name = "metrics"
+    if not os.path.exists(path):
+        os.mkdir(path)
+
+    multiview_generator = MultiviewDatasetGenerator(n_samples=n_samples,
+                                                    n_views=n_views,
+                                                    n_classes=n_classes,
+                                                    Z_factor=Z_factor,
+                                                    R=R,
+                                                    n_clusters_per_class=n_clusters_per_class,
+                                                    class_sep_factor=class_sep_factor,
+                                                    n_informative_divid=n_informative_divid,
+                                                    d=d,
+                                                    D=D,
+                                                    standard_deviation=standard_deviation,
+                                                    flip_y=flip_y,
+                                                    weights=weights,
+                                                    random_state=random_state)
+
+    multiview_generator.generate()
+    multiview_generator.to_hdf5(saving_path=path, name=name)
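+    # Hypothetical sanity check: re-open the generated file with h5py.
+    # with h5py.File(os.path.join(path, name + ".hdf5"), "r") as dset:
+    #     print(dset["Metadata"].attrs["nbView"], dset["View0"].shape)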
+
+    # for filename in os.listdir(path):
+    #     file_path = os.path.join(path, filename)
+    #     try:
+    #         if os.path.isfile(file_path) or os.path.islink(file_path):
+    #             os.unlink(file_path)
+    #         elif os.path.isdir(file_path):
+    #             shutil.rmtree(file_path)
+    #     except Exception as e:
+    #         print('Failed to delete %s. Reason: %s' % (file_path, e))
+    # changing_labels_indices = np.random.RandomState(random_state).choice(np.arange(y.shape[0]), n_outliers)
+    # print(changing_labels_indices)
+    # y[changing_labels_indices] = np.invert(y[changing_labels_indices].astype(bool)).astype(int)
+    # results_to_csv(path, Z, y, results)
\ No newline at end of file
diff --git a/late/__pycache__/multiviews_datasets_generator.cpython-36.pyc b/late/__pycache__/multiviews_datasets_generator.cpython-36.pyc
index 4cc36e52166be6df44fd7ed479f9ac3fbc60f93c..828277d0901732be8b003cb85e93cd846ecc6115 100644
GIT binary patch
delta 1938
zcmX?NdDL3Un3tC;;iZ0DrwRkZV+JI^J5ef9Tr5Q}MW}@_iZ4YtMWlrxia&)lm_bu~
zV^=Oy{Vmb5%+&JY_>|1t)V$)%{Ji4$oXp~qTZ}=sgmM!r;z6<q$y=Oob+-hIQd3Hk
zQ&W&+xr$PA^UG4>i%WBFapaZef_2~G1e;M%l$v~t2h2<^Eh<XQE4js1l*$28zmlOy
zbaEZ@d`9ujaxATkfualy44O>0cyba;Qu9jUiwhEyQ*Q|;XXF>B=EW!H=al9`U0DQD
z2_eK77#MDG6%^%Xr6!kT=I0qPFfbG|fkJ|}NNVyIRz;bYS_}*f$-E#T1_lOM1_lN`
z1_lOZkh)uwb=Z#7^OmsG@HR8n@}@JSGcYl9GS>1{B-QY`FvLdH^4IX!aAY$SzpLS|
z;ml?!y4As$#ahGP%*e=)qFgIbp;IFOlApj>EL0-^QZa$CNTY<UhNFg4oFRo#nxTfP
znW<J_0z=`yaE2PbEY=zUklw;u;SBAJX^biSDFXE^93ab@8Eg2&8ESb`1XBcB7$73G
zyfwU8>?uMi!o7@)3^mLp95uW(EX_<J47GwKoHc?Vqe{4H1Z&u(8JZbug-SS6L~0nC
znHU*LI8#Jx1feXk8X+i4JcSL)l7O@7C2N?Vf>JfCP?mHJYYm$S#3=3>h8n>dp%k_h
zwi>1y)*9{<8Hj8RPl{}cTrW=yQ>}2VNC{&WPl`Oq@g=-9!dZMNERqb(j4lk#jLnR-
zqUj8s4CxHDVijd2yfus<@z|_d@e;-?rur=Y8qpfw8gWU66xkGoUglbf4u%qz8i{5`
zbB0ca5`h}w8i{76TFDx*EY=#y8i_O}u)i6zn6mh5BuWHpSRlGZ7@8TOCP<YqW(n1Z
z)<`uo)rwAFDl`jcXlF=cOi@e`XyK?41v#o#x`eAlxJJ5Jp0P$ynju9=gdv5onW<K$
zMz}_#Myy7rMko!GT~s!2<ygbXC_6bx=s2UyWG&%SY>?D9`IoS~7?=r4gTJ_JK=~&*
zKexbc(qw&+?0S%tCgUyU%)Am!rXm#v28JS45CIZLq}?KQ1_p-7pu`ADv}_C^nvA#D
zi&D!{i;7dLxbxzR6LSl4Qi~O=s@PL9bK|2Ftg86);xqHo@{4j4OESw+6|Aaw^Ws5y
zGA|`DuSCJBiU-6ks83BSNlj6(0vVE=lUQ5~Hbg8hJ~^kfxFofxIKCjYC>|^WQUqqi
z7pE2|SfxX=3WJ*_&n-^46K-+k#iykvmXsEy7T@AUal<Wckd-N=U>AVY6{QwTRua|M
zyv3H5lUWd7d5aTn{w<!O#JrUJ-1y>>#FEro?8O<SX=yo=3q^Gt;UNPJuUm|nMR2hq
zP$+{#Yb_$QG?|K&K?MMNQEEX>V)Enz!X~_M9UxOCD~L((gGJ!lcTIK|3t$wVJVVTg
zQEKvOF-cX~B2W==i#-oqZWR~lgN)>eicd>SF3B${;sM2)!DLQxmz*L)5ZeSK&zurp
zc8eo1B?VG$a)4q7RLb3APE1L;#R}p^73qLfGDh8Eth~kS8D9vB+2Ycil45X5011`d
zVvdL}yCsyGlbV~FS5h2bk{=H#Om9tIFK#Rja=Io{ktN7lH4p*v#x0r2)*>?8PM~lo
z0Y#S4WCaN~JuXHB<YVSy<X{wIm1AUKWMSlE5@Y0H;$q}ul4BHN6k-%(EQ;IQA~AtU
z7L+2ZSoD&M%ltH1ZZQ?4+~RaeEJ<`LO3Y2YHQ7*Flu=@`yR=H!E&i0Wc$9LP6D*LF
zno}IboLo?HiworW_>$6soYY(FDQV!KM2o#!d_}2X?}L1qTwJz)@<D0oh%5#MhUcMR
z@YG(Dw@4b~O&JgYDyxg+K&)&K!3HA0VhBPWCgxY90IEF%Zn4M5r{pKc$KRT)CnL^i
zH`z@_A<Y&P4QvI8c`1p-MTsCS;vhl_M8tszkQ<O33kr2`db`D8lbfGXnv-hB4hn3L
b1{NkBMjl21Mh->}W)39|0R~10<f;S!Rg(Z6

delta 1719
zcmX?VeZ*4An3tD}W3Fmkk|G1cV+JI^Jy9xBTr@=>MX-f2iZ4YdMYx3_ia&)lm_bu)
zV^=OyeUxZfW@>qHd`f0+YF=?>eqM2WPG)h*EykcIq1?oZc#teYGKv$fE=sT{HKjB;
zH3dnQt0*-$zbrMrxHLD4Bd;_UtUHPmY(_y*YH}10n3-H!RFs-ma*M4fl>?-HB|{PK
z<T~d0jQpGBSXvnac^McOG?{Mk<Rq4)=9R=37bGU9MhPcp<QJ#r#V6<Il;%QRSp-rE
zA@~>=7;bSD6y;~7CYNO9=NU0DFcdR^LV~wQaPk*cMe$^A5RZX@L7IVq0i=-`q@-oC
zA=?p8?h=+7?q<eX?sSH91}271##)|=q#AA)hS;cD-WuK-j%<eFcQw2<oY^czw>lWJ
zSZjEj85tQ;WNP^;bZYoO@)H<~g=+XF+p$a5^G#qV{1?tp!;{5Y!v|7Vcq^QtoiU9u
zg+E20g#%=6Gh+>pI72OWieQRB3j;)?mb->Kiyah4y^M?uHOwU(HQY5U%}gQ;wfrTV
zHT)oxO1NtHYuKe3ni*>aO1NtnYWQmeQrJ@1YM5$RQ$$ikYglX8>O~k*L{h|hxnr1W
z1#5*$7_)d%#6i}S@YV=s@ujdxGBh)~Ff=nZGu8^HGjuYfGt`Pyl$G$-FoMKmvuZ_4
z7_*qN_-lk~xNAfu8B!!tBzu`_#X1;DSZc(Y8O<3w8A=3d1Z%{anQFyrM6y_G#B0RT
zn7|HX%t{4m6)O>}VS#8DVQ6NA8X-}_m?cyrTqDuUR4Y7zvG74SLpwtnV~P|wT!cZ6
zs+BC^DiN-cY-X(Cmu5(j7GX$XY-XyJsu8Rasu8J?su4&7rC!<10-S3&8AT@V6*|r+
zJh@f)6dNc`i+Cr8ipYzBnV=-`i^~R-oRjl&3+(zPcZp={fTT1TZ!u@)m1r^*Nii@m
z6oC>1Bo%=qi$MB{WEmJ3`X?)jiaGeDSBtE3e^}LGugQIjGcUe4F}ENmwK$3^FFq|b
zv81#pwfGiqUVLU=T7FS(Vo7FMY7}=~JScPIr6lH+L`{AxYN`;$mX?!Q5MLR^nVgeY
zTpVAVT5yY}C^0W3KR3R(B(Y?&x0sGO*me*Bb=WP&Ol;1Y14_4`q|L_QrpZ(!IeD#^
zy$MJgSTRg58ziG>GT!3IO)W``_lz$C%OkwEa<YPW03-k8LUAK|K~Olbg7Oe&acW6Q
zW?GtIN)Z<*A4(S~fP~rez?r$YND0K_h>A~3OfJbUDgtGyBIU`C#9iv6m{a1*qBs&$
zQouz>6h}%Xqy)IdoS2dl#R}p^6)A%BGe+HFtc+szj4!;!UX)r~np08?js=iVSrl_b
zd|8xGYEEiyYF<fkd`W&hq+E#-E6pn|MJ{omg_R^oeDZn;F(FVeXfhRPg1jRGBKRg>
zmr!F%0cCx?$;^^&EF6p?Op{|JB^g~emrG7yG6qF{6^mYSahabc%PppYlqgP@#F9j}
zqQu<PD7KRP_~hcUTg=5JMVg$qSPBwLGKvHyTgj-@NAahm#iNu^oM3^Z)STid=H!Bs
zC@yeeS5jJ#lN!aIk_HYxNECov366(bd{EzmJegcvwhk1vLLkQqb1@1rfiMdr7o!-X
z9Al9;0|UeJP%wCEuNlK#1X5Qd4DzH1h!6!4J|Kb(M1aK*gcwXrQ{WbRe0)lNa(sN0
zKyh|XYGP5IUP@v~VsUCoaS<q473od3msN<<Wnf^4Vk=0@OGzv)@&IY)2NB?u;tFDc
p9E9W_gr_)ca`RJ4b5iZtVcub3;$h@r6kz0F<Y4Ad;t=4<0RUM(!La}U

diff --git a/late/execute.py b/late/execute.py
index 9538308..a3810c0 100644
--- a/late/execute.py
+++ b/late/execute.py
@@ -1,35 +1,286 @@
 import os
+import yaml
 import numpy as np
+from sklearn.datasets import make_classification
+from random import gauss
+from math import ceil, floor
+import pandas as pd
+import shutil
+import h5py
 
-from multiviews_datasets_generator import generator_multiviews_dataset, results_to_csv
-
-n_samples = 200 #Number of samples in tha dataset
-n_views = 4 # Number of views in the dataset
-n_classes = 2 # Number of classes in the dataset
-Z_factor = 1 # Z dim = latent_space_dim * z_factor
-R = 0 # Precentage of non-redundant features in the view
-n_clusters_per_class = 1 # Number of clusters for each class
-class_sep_factor = 100 # Separation between the different classes
-n_informative_divid = 1 # Divides the number of informative features in the latent space
-standard_deviation = 2
-d = 4
-D = 10
-random_state = 42
-n_outliers = 10
-
-path = "/home/baptiste/Documents/Datasets/Generated/outliers_dset/"
-if not os.path.exists(path):
-    os.mkdir(path)
-
-Z, y, results, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes,
-                                                                                       Z_factor, R,
-                                                                                       n_clusters_per_class,
-                                                                                       class_sep_factor,
-                                                                                       n_informative_divid, d, D,
-                                                                                       standard_deviation)
-print(unsued_dimensions_percent)
-print(n_informative)
-print(Z.shape)
-changing_labels_indices = np.random.RandomState(random_state).choice(np.arange(y.shape[0]), n_outliers)
-y[changing_labels_indices] = np.invert(y[changing_labels_indices].astype(bool)).astype(int)
-results_to_csv(path, Z, y, results)
\ No newline at end of file
+class MultiviewDatasetGenerator:
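+    """Generates a multiview dataset by projecting a latent space Z, drawn
+    with sklearn's make_classification, onto one column subset per view."""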
+
+    def __init__(self, n_samples=100, n_views=2, n_classes=2,
+                                Z_factor=2,
+                                R=0,
+                                n_clusters_per_class=1,
+                                class_sep_factor=10,
+                                n_informative_divid=2,
+                                d=4,
+                                D=10,
+                                standard_deviation=2,
+                                weights=None,
+                                flip_y=0.0,
+                                random_state=42, config_path=None):
+        if config_path is not None:
+            with open(config_path) as config_file:
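+                # The YAML file is expected to map constructor argument names
+                # to values; re-calling __init__ with them overrides the
+                # defaults. Hypothetical config:
+                #   n_samples: 100
+                #   n_views: 4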
+                args = yaml.safe_load(config_file)
+                self.__init__(**args)
+        else:
+            self.n_samples = n_samples
+            self.n_views = n_views
+            self.n_classes = n_classes
+            self.Z_factor = Z_factor
+            self.R = R
+            self.n_clusters_per_class = n_clusters_per_class
+            self.class_sep_factor = class_sep_factor
+            self.n_informative_divid = n_informative_divid
+            self.d = d
+            self.D = D
+            self.standard_deviation = standard_deviation
+            self.weights = weights
+            self.flip_y = flip_y
+            self.random_state = random_state
+
+    def generate(self):
+        if self.n_views < 2:
+            raise ValueError("n_views must be >= 2")
+        if self.n_classes < 2:
+            raise ValueError("n_classes must be >= 2")
+        if self.Z_factor < 1:
+            raise ValueError(
+                "Z_factor must be >= 1 for the algorithm to work properly")
+        if (self.R < 0) or (self.R > 1):
+            raise ValueError("R must satisfy 0 <= R <= 1")
+        if self.n_clusters_per_class < 1:
+            raise ValueError("n_clusters_per_class must be >= 1")
+        if self.class_sep_factor < 0:
+            raise ValueError("class_sep_factor must be >= 0")
+        if self.n_informative_divid < 1:
+            raise ValueError("n_informative_divid must be >= 1")
+        if self.d < 1:
+            raise ValueError("d must be >= 1")
+        if (self.d + self.D) / 2 - 3 * self.standard_deviation < 1:
+            raise ValueError(
+                "(d+D)/2 - 3*standard_deviation must be >= 1 so that the "
+                "normal law yields strictly positive view dimensions")
+
+        # Draw the n_views view dimensions from N((d+D)/2, standard_deviation^2)
+        d_v = np.random.normal(loc=(self.d + self.D) / 2,
+                               scale=self.standard_deviation,
+                               size=self.n_views)
+        d_v = list(d_v)
+        remove_list, add_list = [], []
+        for dim_view in d_v:
+            if dim_view < self.d or dim_view > self.D:  # 1 <= d <= dim_view <= D
+                remove_list.append(dim_view)
+                add = -1
+                while add < self.d or add > self.D:
+                    add = gauss((self.d + self.D) / 2, self.standard_deviation)
+                add_list.append(add)
+        d_v = [view for view in d_v if view not in remove_list] + add_list
+        d_v = [int(view) for view in d_v]  # view dimensions must be integers
+        # sort view dimensions from highest to lowest
+        d_v.sort(reverse=True)
+        # Dimension of latent space Z (multiplied by Z_factor)
+        self.dim_Z = self.Z_factor * self.latent_space_dimension(d_v)
+        # Number of informative features
+        self.n_informative = round(self.dim_Z / self.n_informative_divid)
+        # Generation of latent space Z
+        self.Z, self.y = make_classification(n_samples=self.n_samples, n_features=self.dim_Z,
+                                   n_informative=self.n_informative, n_redundant=0,
+                                   n_repeated=0, n_classes=self.n_classes,
+                                   n_clusters_per_class=self.n_clusters_per_class,
+                                   weights=self.weights,
+                                   flip_y=self.flip_y,
+                                   class_sep=self.n_clusters_per_class * self.class_sep_factor,
+                                   random_state=self.random_state, shuffle=False)
+        I_q = np.arange(self.Z.shape[1])
+        meta_I_v = []
+        self.results = []
+        for view in range(self.n_views):
+            # choose d_v[view] column indices of Z uniformly from I_q
+            I_v = np.random.choice(I_q, size=d_v[view],
+                                   replace=False)  # drawn from I_q without replacement
+            meta_I_v += list(I_v)
+            # projection of Z along the columns in I_v
+            X_v = self.projection(I_v)
+            self.results.append((X_v, I_v))
+            # remove floor(R*d_v[view]) of I_v's column indices from I_q
+            elements_to_remove = np.random.choice(I_v,
+                                                  size=floor(self.R * d_v[view]),
+                                                  replace=False)  # drawn from I_v without replacement
+            I_q = np.setdiff1d(I_q,
+                               elements_to_remove)  # I_q without the removed indices
+        self.unused_dimensions_list = [column for column in I_q if
+                                       column not in meta_I_v]
+        self.unused_dimensions_percent = round(
+            (len(self.unused_dimensions_list) / self.dim_Z) * 100, 2)
+
+    def projection(self, chosen_columns_list):
+        """
+        Returns the projection of latent_space on the columns of chosen_columns_list (in chosen_columns_list order)
+
+        Parameters:
+        -----------
+        chosen_columns_list : list
+
+        Returns:
+        --------
+        an array of dimension (number of rows of latent_space, length of chosen_columns_list)
+        """
+        return self.Z[:, chosen_columns_list]
+
+    def latent_space_dimension(self, views_dimensions_list):
+        """
+        Returns the minimal dimension of latent space (enough to build the dataset) for generator_multiviews_dataset compared to views_dimensions_list
+
+        Parameters:
+        -----------
+        views_dimensions_list : list
+        R : float
+
+        Returns:
+        --------
+        an int
+        """
+        max_view_dimension = max(views_dimensions_list)
+        dimension = ceil(self.R * sum(views_dimensions_list))
+
+        if dimension < max_view_dimension:
+            dimension = max_view_dimension
+
+        reduced_dimension = dimension
+        remove_sum = 0
+
+        for num_view in range(1, len(views_dimensions_list)):
+            view_prec = views_dimensions_list[num_view - 1]
+            view_current = views_dimensions_list[num_view]
+            remove = floor(self.R * view_prec)
+            remove_sum += remove
+            if reduced_dimension - remove < view_current:
+                dimension += view_current - (reduced_dimension - remove)
+            reduced_dimension = dimension - remove_sum
+
+        return dimension
+
+    def to_csv(self, saving_path="."):
+        """
+        Create length of multiviews_list + 2 csv files to the indicated path
+        Files name :
+            latent_space.csv for latent_space
+            integer_labels.csv for integer_labels
+            view0.csv for multiviews_list[0]
+
+        Parameters:
+        -----------
+        path : str
+        latent_space : array
+        integer_labels : 1D array
+        multiviews_list : list of tuples
+
+        Returns:
+        --------
+        None
+        """
+        df_latent_space = pd.DataFrame(self.Z)
+        df_latent_space.to_csv(os.path.join(saving_path, 'latent_space.csv')
+                               , index=False)
+
+        df_labels = pd.DataFrame(self.y)
+        df_labels.to_csv(os.path.join(saving_path, 'integer_labels.csv'),
+                         index=False)
+
+        for view_index, view_tuple in enumerate(self.results):
+            df_view = pd.DataFrame(view_tuple[0], columns=view_tuple[1])
+            df_view.to_csv(os.path.join(saving_path,
+                                        'view'+str(view_index)+'.csv'),
+                           index=False)
+
+    def to_hdf5(self, saving_path=".", name="generated_dset"):
+
+        dataset_file = h5py.File(os.path.join(saving_path, name+".hdf5"), 'w')
+
+        labels_dataset = dataset_file.create_dataset("Labels",
+                                                     shape=self.y.shape,
+                                                     data=self.y)
+
+        labels_names = ["Label_1", "Label_0"]
+
+        labels_dataset.attrs["names"] = [
+            label_name.encode() if not isinstance(label_name, bytes)
+            else label_name for label_name in labels_names]
+
+        for view_index, (data, feature_indices) in enumerate(self.results):
+            view_dataset = dataset_file.create_dataset("View" + str(view_index),
+                                                       shape=data.shape,
+                                                       data=data)
+
+            view_dataset.attrs["sparse"] = False
+            view_dataset.attrs["name"] = "GeneratedView" + str(view_index)
+
+        meta_data_grp = dataset_file.create_group("Metadata")
+
+        meta_data_grp.attrs["nbView"] = len(self.results)
+        meta_data_grp.attrs["nbClass"] = np.unique(self.y)
+        meta_data_grp.attrs["datasetLength"] = \
+        self.results[0][0].shape[0]
+
+        meta_data_grp.create_dataset("example_ids", data=np.array(
+            ["gen_example_" + str(ex_indx) for ex_indx in
+             range(self.results[0][0].shape[0])]).astype(
+            np.dtype("S100")), dtype=np.dtype("S100"))
+
+        dataset_file.close()
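+        # Resulting HDF5 layout (sketch of what was written above):
+        #   /Labels                  integer labels, attr "names"
+        #   /View0 ... View{n-1}     one array per view, attrs "sparse", "name"
+        #   /Metadata                attrs "nbView", "nbClass", "datasetLength"
+        #   /Metadata/example_ids    fixed-length (S100) example identifiers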
+
+if __name__=="__main__":
+    n_samples = 100  # Number of samples in the dataset
+    n_views = 4  # Number of views in the dataset
+    n_classes = 2  # Number of classes in the dataset
+    Z_factor = 2  # Z dim = latent_space_dim * z_factor
+    R = 0  # Percentage of non-redundant features in the views
+    n_clusters_per_class = 1  # Number of clusters for each class
+    class_sep_factor = 10000  # Separation between the different classes
+    n_informative_divid = 2  # Divides the number of informative features in the latent space
+    standard_deviation = 2  # Std of the normal law used to draw view dimensions
+    d = 4  # Minimal dimension of a view
+    D = 10  # Maximal dimension of a view
+    flip_y = 0.00  # Proportion of randomly flipped labels
+    random_state = 42
+    weights = None  # The proportions of examples in each class
+
+    path = "/home/baptiste/Documents/Datasets/Generated/metrics_dset/"
+    name = "metrics"
+    if not os.path.exists(path):
+        os.mkdir(path)
+
+    multiview_generator = MultiviewDatasetGenerator(n_samples=n_samples,
+                                                    n_views=n_views,
+                                                    n_classes=n_classes,
+                                                    Z_factor=Z_factor,
+                                                    R=R,
+                                                    n_clusters_per_class=n_clusters_per_class,
+                                                    class_sep_factor=class_sep_factor,
+                                                    n_informative_divid=n_informative_divid,
+                                                    d=d,
+                                                    D=D,
+                                                    standard_deviation=standard_deviation,
+                                                    flip_y=flip_y,
+                                                    weights=weights,
+                                                    random_state=random_state)
+
+    multiview_generator.generate()
+    multiview_generator.to_hdf5(saving_path=path, name=name)
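+    # Hypothetical sanity check: re-open the generated file with h5py.
+    # with h5py.File(os.path.join(path, name + ".hdf5"), "r") as dset:
+    #     print(dset["Metadata"].attrs["nbView"], dset["View0"].shape)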
+
+    # for filename in os.listdir(path):
+    #     file_path = os.path.join(path, filename)
+    #     try:
+    #         if os.path.isfile(file_path) or os.path.islink(file_path):
+    #             os.unlink(file_path)
+    #         elif os.path.isdir(file_path):
+    #             shutil.rmtree(file_path)
+    #     except Exception as e:
+    #         print('Failed to delete %s. Reason: %s' % (file_path, e))
+    # changing_labels_indices = np.random.RandomState(random_state).choice(np.arange(y.shape[0]), n_outliers)
+    # print(changing_labels_indices)
+    # y[changing_labels_indices] = np.invert(y[changing_labels_indices].astype(bool)).astype(int)
+    # results_to_csv(path, Z, y, results)
\ No newline at end of file
diff --git a/late/multiviews_datasets_generator.py b/late/multiviews_datasets_generator.py
index 1cce9a0..d3b9bc6 100644
--- a/late/multiviews_datasets_generator.py
+++ b/late/multiviews_datasets_generator.py
@@ -63,7 +63,11 @@ def projection(latent_space, chosen_columns_list):
     return latent_space[:, chosen_columns_list]
 
 
-def generator_multiviews_dataset(n_samples=1000, n_views=3, n_classes=2, Z_factor=250, R=2/3, n_clusters_per_class=1, class_sep_factor=2, n_informative_divid=2, d=2, D=12, standard_deviation=2):
+def generator_multiviews_dataset(n_samples=1000, n_views=3, n_classes=2,
+                                 Z_factor=250, R=2/3, n_clusters_per_class=1,
+                                 class_sep_factor=2, n_informative_divid=2,
+                                 d=2, D=12, standard_deviation=2, weights=None,
+                                 random_state=42):
     """
     Returns a generator multiviews dataset
     
@@ -149,9 +153,22 @@ def generator_multiviews_dataset(n_samples=1000, n_views=3, n_classes=2, Z_facto
     # Number of informative features
     n_informative = round(dim_Z/n_informative_divid)
     # Generation of latent space Z
-    Z, y = make_classification(n_samples=n_samples, n_features=dim_Z, n_informative=n_informative, n_redundant=0, 
-                               n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters_per_class, weights=None, 
-                               flip_y=0.00, class_sep=n_clusters_per_class*class_sep_factor, random_state=None)
+    print("n_samples :", n_samples)
+    print("dim_Z :", dim_Z)
+    print("n_informative :", n_informative)
+    print("n_redundant :", 0)
+    print("n_repeated :", 0)
+    print("n_classes :", n_classes)
+    print("n_clusters_per_class :", n_clusters_per_class)
+    print("class_sep :", n_clusters_per_class*class_sep_factor)
+
+
+    Z, y = make_classification(n_samples=n_samples, n_features=dim_Z, n_informative=n_informative, n_redundant=0,
+                               n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters_per_class, weights=weights,
+                               flip_y=0.00, class_sep=n_clusters_per_class*class_sep_factor, random_state=random_state, shuffle=False)
+    # Z, y = make_classification(n_samples=200, n_features=10, n_informative=2, n_redundant=0,
+    #                            n_repeated=0, n_classes=2, n_clusters_per_class=1, weights=None,
+    #                            flip_y=0, class_sep=100, random_state=random_state, shuffle=False)
         
     I_q = np.array([i for i in range(Z.shape[1])])  # 1D-array of Z columns numero
     meta_I_v = []
-- 
GitLab