From 701366a4662b112f7577ad334af1ceec2458d5be Mon Sep 17 00:00:00 2001 From: Baptiste Bauvin <baptiste.bauvin@lis-lab.fr> Date: Tue, 28 Jan 2020 09:45:09 +0100 Subject: [PATCH] Worked on adaptation" --- .idea/multiview_generator.iml | 5 +- generator/update_baptiste.py | 286 ++++++++++++++++ ...ltiviews_datasets_generator.cpython-36.pyc | Bin 7364 -> 7621 bytes late/execute.py | 315 ++++++++++++++++-- late/multiviews_datasets_generator.py | 25 +- 5 files changed, 593 insertions(+), 38 deletions(-) create mode 100644 generator/update_baptiste.py diff --git a/.idea/multiview_generator.iml b/.idea/multiview_generator.iml index d6ebd48..8427e77 100644 --- a/.idea/multiview_generator.iml +++ b/.idea/multiview_generator.iml @@ -1,9 +1,10 @@ <?xml version="1.0" encoding="UTF-8"?> <module type="JAVA_MODULE" version="4"> - <component name="NewModuleRootManager" inherit-compiler-output="true"> - <exclude-output /> + <component name="NewModuleRootManager"> <content url="file://$MODULE_DIR$" /> <orderEntry type="inheritedJdk" /> <orderEntry type="sourceFolder" forTests="false" /> + <orderEntry type="library" name="R User Library" level="project" /> + <orderEntry type="library" name="R Skeletons" level="application" /> </component> </module> \ No newline at end of file diff --git a/generator/update_baptiste.py b/generator/update_baptiste.py new file mode 100644 index 0000000..a3810c0 --- /dev/null +++ b/generator/update_baptiste.py @@ -0,0 +1,286 @@ +import os +import yaml +import numpy as np +from sklearn.datasets import make_classification +from random import gauss +from math import ceil, floor +import pandas as pd +import shutil +import h5py + +class MultiviewDatasetGenetator(): + + def __init__(self, n_samples=100, n_views=2, n_classes=2, + Z_factor=2, + R=0, + n_clusters_per_class=1, + class_sep_factor=10, + n_informative_divid=2, + d=4, + D=10, + standard_deviation=2, + weights=None, + flip_y=0.0, + random_state=42, config_path=None): + if config_path is not None: + with open(config_path) as config_file: + args = yaml.safe_load(config_file) + self.__init__(**args) + else: + self.n_samples = n_samples + self.n_views = n_views + self.n_classes = n_classes + self.Z_factor = Z_factor + self.R = R + self.n_clusters_per_class = n_clusters_per_class + self.class_sep_factor = class_sep_factor + self.n_informative_divid = n_informative_divid + self.d = d + self.D = D + self.standard_deviation = standard_deviation + self.weights = weights + self.flip_y = flip_y + self.random_state = random_state + + def generate(self): + if self.n_views < 2: + raise ValueError("n_views >= 2") + if self.n_classes < 2: + raise ValueError("n_classes >= 2") + if self.Z_factor < 1: + raise ValueError( + "Z_factor >= 1 pour le bon fonctionnement de l'algorithme") + if (self.R < 0) or (self.R > 1): + raise ValueError("0 <= R <= 1") + if self.n_clusters_per_class < 1: + raise ValueError("n_clusters_per_class >= 1") + if self.class_sep_factor < 0: + raise ValueError("class_sep_factor >= 0") + if self.n_informative_divid < 1: + raise ValueError("n_informative_divid >= 1") + if self.d < 1: + raise ValueError("d >= 1") + if (self.d + self.D) / 2 - 3 * self.standard_deviation < 1: + raise ValueError( + "Il faut que (d+D)/2 - 3*standard_deviation >= 1 pour avoir des valeurs positives non nulles lors de l'emploi de la loi normale") + + # n_views dimension of view v values randomly from N((d+D)/2, standard_deviation^2) + d_v = np.random.normal(loc=(self.d + self.D) / 2, + scale=self.standard_deviation, + size=self.n_views) + d_v = list(d_v) + remove_list, add_list = [], [] + for dim_view in d_v: + if dim_view < self.d or dim_view > self.D: # 1 <= d <= dim_view <= D + remove_list.append(dim_view) + add = -1 + while add < self.d or add > self.D: + add = gauss((self.d + self.D) / 2, self.standard_deviation) + add_list.append(add) + d_v = [view for view in d_v if view not in remove_list] + add_list + d_v = [int(view) for view in d_v] # dimension of views = integer + # d_v = list of views dimension from the highest to the lowest + d_v.sort(reverse=True) + # Dimension of latent space Z (multiplied by Z_factor) + self.dim_Z = self.Z_factor * self.latent_space_dimension(d_v) + # Number of informative features + self.n_informative = round(self.dim_Z / self.n_informative_divid) + # Generation of latent space Z + self.Z, self.y = make_classification(n_samples=self.n_samples, n_features=self.dim_Z, + n_informative=self.n_informative, n_redundant=0, + n_repeated=0, n_classes=self.n_classes, + n_clusters_per_class=self.n_clusters_per_class, + weights=self.weights, + flip_y=self.flip_y, + class_sep=self.n_clusters_per_class * self.class_sep_factor, + random_state=self.random_state, shuffle=False) + I_q = np.arange(self.Z.shape[1]) + meta_I_v = [] + self.results = [] + for view in range(n_views): + # choice d_v[view] numeros of Z columns uniformly from I_q + I_v = np.random.choice(I_q, size=d_v[view], + replace=False) # tirage dans I_q sans remise de taille d_v[view] + meta_I_v += list(I_v) + # projection of Z along the columns in I_v + X_v = self.projection( I_v) + self.results.append((X_v, I_v)) + # remove R*d_v[view] columns numeros of I_v form I_q + elements_to_remove = np.random.choice(I_v, + size=floor(self.R * d_v[view]), + replace=False) # tirage dans I_v sans remise de taille floor(R*d_v[view]) + I_q = np.setdiff1d(I_q, + elements_to_remove) # I_q less elements from elements_to_remove + self.unsued_dimensions_list = [column for column in I_q if + column not in meta_I_v] + self.unsued_dimensions_percent = round( + (len(self.unsued_dimensions_list) / self.dim_Z) * 100, 2) + + def projection(self, chosen_columns_list): + """ + Returns the projection of latent_space on the columns of chosen_columns_list (in chosen_columns_list order) + + Parameters: + ----------- + chosen_columns_list : list + + Returns: + -------- + an array of dimension (number of rows of latent_space, length of chosen_columns_list) + """ + return self.Z[:, chosen_columns_list] + + def latent_space_dimension(self, views_dimensions_list): + """ + Returns the minimal dimension of latent space (enough to build the dataset) for generator_multiviews_dataset compared to views_dimensions_list + + Parameters: + ----------- + views_dimensions_list : list + R : float + + Returns: + -------- + an int + """ + max_view_dimension = max(views_dimensions_list) + dimension = ceil(self.R * sum(views_dimensions_list)) + + if dimension < max_view_dimension: + dimension = max_view_dimension + + reduced_dimension = dimension + remove_sum = 0 + + for num_view in range(1, len(views_dimensions_list)): + view_prec = views_dimensions_list[num_view - 1] + view_current = views_dimensions_list[num_view] + remove = floor(self.R * view_prec) + remove_sum += remove + if reduced_dimension - remove < view_current: + dimension += view_current - (reduced_dimension - remove) + reduced_dimension = dimension - remove_sum + + return dimension + + def to_csv(self, saving_path="."): + """ + Create length of multiviews_list + 2 csv files to the indicated path + Files name : + latent_space.csv for latent_space + integer_labels.csv for integer_labels + view0.csv for multiviews_list[0] + + Parameters: + ----------- + path : str + latent_space : array + integer_labels : 1D array + multiviews_list : list of tuples + + Returns: + -------- + None + """ + df_latent_space = pd.DataFrame(self.Z) + df_latent_space.to_csv(os.path.join(saving_path, 'latent_space.csv') + , index=False) + + df_labels = pd.DataFrame(self.y) + df_labels.to_csv(os.path.join(saving_path, 'integer_labels.csv'), + index=False) + + for view_index, view_tuple in enumerate(self.results): + df_view = pd.DataFrame(view_tuple[0], columns=view_tuple[1]) + df_view.to_csv(os.path.join(saving_path, + 'view'+str(view_index)+'.csv'), + index=False) + + def to_hdf5(self, saving_path=".", name="generated_dset"): + + dataset_file = h5py.File(os.path.join(saving_path, name+".hdf5"), 'w') + + labels_dataset = dataset_file.create_dataset("Labels", + shape=self.y.shape, + data=self.y) + + labels_names = ["Label_1", "Label_0"] + + labels_dataset.attrs["names"] = [ + label_name.encode() if not isinstance(label_name, bytes) + else label_name for label_name in labels_names] + + for view_index, (data, feature_indices) in enumerate(self.results): + df_dataset = dataset_file.create_dataset("View" + str(view_index), + shape=data.shape, + data=data) + + df_dataset.attrs["sparse"] = False + df_dataset.attrs["name"] = "GeneratedView"+str(view_index) + + meta_data_grp = dataset_file.create_group("Metadata") + + meta_data_grp.attrs["nbView"] = len(self.results) + meta_data_grp.attrs["nbClass"] = np.unique(self.y) + meta_data_grp.attrs["datasetLength"] = \ + self.results[0][0].shape[0] + + meta_data_grp.create_dataset("example_ids", data=np.array( + ["gen_example_" + str(ex_indx) for ex_indx in + range(self.results[0][0].shape[0])]).astype( + np.dtype("S100")), dtype=np.dtype("S100")) + + dataset_file.close() + +if __name__=="__main__": + n_samples = 100 # Number of samples in tha dataset + n_views = 4 # Number of views in the dataset + n_classes = 2 # Number of classes in the dataset + Z_factor = 2 # Z dim = latent_space_dim * z_factor + R = 0 # Precentage of non-redundant features in the view + n_clusters_per_class = 1 # Number of clusters for each class + class_sep_factor = 10000 # Separation between the different classes + n_informative_divid = 2 # Divides the number of informative features in the latent space + standard_deviation = 2 + d = 4 + D = 10 + flip_y = 0.00 + random_state = 42 + weights = None # The proportions of examples in each class + + path = "/home/baptiste/Documents/Datasets/Generated/metrics_dset/" + name = "metrics" + if not os.path.exists(path): + os.mkdir(path) + + multiview_generator = MultiviewDatasetGenetator(n_samples=n_samples, + n_views=n_views, + n_classes=n_classes, + Z_factor=Z_factor, + R=R, + n_clusters_per_class=n_clusters_per_class, + class_sep_factor=class_sep_factor, + n_informative_divid=n_informative_divid, + d=d, + D=D, + standard_deviation=standard_deviation, + flip_y=flip_y, + weights=weights, + random_state=random_state) + + multiview_generator.generate() + multiview_generator.to_hdf5(saving_path=path, name=name) + + # for filename in os.listdir(path): + # file_path = os.path.join(path, filename) + # try: + # if os.path.isfile(file_path) or os.path.islink(file_path): + # os.unlink(file_path) + # elif os.path.isdir(file_path): + # shutil.rmtree(file_path) + # except Exception as e: + # print('Failed to delete %s. Reason: %s' % (file_path, e)) + # changing_labels_indices = np.random.RandomState(random_state).choice(np.arange(y.shape[0]), n_outliers) + # print(changing_labels_indices) + # y[changing_labels_indices] = np.invert(y[changing_labels_indices].astype(bool)).astype(int) + # results_to_csv(path, Z, y, results) \ No newline at end of file diff --git a/late/__pycache__/multiviews_datasets_generator.cpython-36.pyc b/late/__pycache__/multiviews_datasets_generator.cpython-36.pyc index 4cc36e52166be6df44fd7ed479f9ac3fbc60f93c..828277d0901732be8b003cb85e93cd846ecc6115 100644 GIT binary patch delta 1938 zcmX?NdDL3Un3tC;;iZ0DrwRkZV+JI^J5ef9Tr5Q}MW}@_iZ4YtMWlrxia&)lm_bu~ zV^=Oy{Vmb5%+&JY_>|1t)V$)%{Ji4$oXp~qTZ}=sgmM!r;z6<q$y=Oob+-hIQd3Hk zQ&W&+xr$PA^UG4>i%WBFapaZef_2~G1e;M%l$v~t2h2<^Eh<XQE4js1l*$28zmlOy zbaEZ@d`9ujaxATkfualy44O>0cyba;Qu9jUiwhEyQ*Q|;XXF>B=EW!H=al9`U0DQD z2_eK77#MDG6%^%Xr6!kT=I0qPFfbG|fkJ|}NNVyIRz;bYS_}*f$-E#T1_lOM1_lN` z1_lOZkh)uwb=Z#7^OmsG@HR8n@}@JSGcYl9GS>1{B-QY`FvLdH^4IX!aAY$SzpLS| z;ml?!y4As$#ahGP%*e=)qFgIbp;IFOlApj>EL0-^QZa$CNTY<UhNFg4oFRo#nxTfP znW<J_0z=`yaE2PbEY=zUklw;u;SBAJX^biSDFXE^93ab@8Eg2&8ESb`1XBcB7$73G zyfwU8>?uMi!o7@)3^mLp95uW(EX_<J47GwKoHc?Vqe{4H1Z&u(8JZbug-SS6L~0nC znHU*LI8#Jx1feXk8X+i4JcSL)l7O@7C2N?Vf>JfCP?mHJYYm$S#3=3>h8n>dp%k_h zwi>1y)*9{<8Hj8RPl{}cTrW=yQ>}2VNC{&WPl`Oq@g=-9!dZMNERqb(j4lk#jLnR- zqUj8s4CxHDVijd2yfus<@z|_d@e;-?rur=Y8qpfw8gWU66xkGoUglbf4u%qz8i{5` zbB0ca5`h}w8i{76TFDx*EY=#y8i_O}u)i6zn6mh5BuWHpSRlGZ7@8TOCP<YqW(n1Z z)<`uo)rwAFDl`jcXlF=cOi@e`XyK?41v#o#x`eAlxJJ5Jp0P$ynju9=gdv5onW<K$ zMz}_#Myy7rMko!GT~s!2<ygbXC_6bx=s2UyWG&%SY>?D9`IoS~7?=r4gTJ_JK=~&* zKexbc(qw&+?0S%tCgUyU%)Am!rXm#v28JS45CIZLq}?KQ1_p-7pu`ADv}_C^nvA#D zi&D!{i;7dLxbxzR6LSl4Qi~O=s@PL9bK|2Ftg86);xqHo@{4j4OESw+6|Aaw^Ws5y zGA|`DuSCJBiU-6ks83BSNlj6(0vVE=lUQ5~Hbg8hJ~^kfxFofxIKCjYC>|^WQUqqi z7pE2|SfxX=3WJ*_&n-^46K-+k#iykvmXsEy7T@AUal<Wckd-N=U>AVY6{QwTRua|M zyv3H5lUWd7d5aTn{w<!O#JrUJ-1y>>#FEro?8O<SX=yo=3q^Gt;UNPJuUm|nMR2hq zP$+{#Yb_$QG?|K&K?MMNQEEX>V)Enz!X~_M9UxOCD~L((gGJ!lcTIK|3t$wVJVVTg zQEKvOF-cX~B2W==i#-oqZWR~lgN)>eicd>SF3B${;sM2)!DLQxmz*L)5ZeSK&zurp zc8eo1B?VG$a)4q7RLb3APE1L;#R}p^73qLfGDh8Eth~kS8D9vB+2Ycil45X5011`d zVvdL}yCsyGlbV~FS5h2bk{=H#Om9tIFK#Rja=Io{ktN7lH4p*v#x0r2)*>?8PM~lo z0Y#S4WCaN~JuXHB<YVSy<X{wIm1AUKWMSlE5@Y0H;$q}ul4BHN6k-%(EQ;IQA~AtU z7L+2ZSoD&M%ltH1ZZQ?4+~RaeEJ<`LO3Y2YHQ7*Flu=@`yR=H!E&i0Wc$9LP6D*LF zno}IboLo?HiworW_>$6soYY(FDQV!KM2o#!d_}2X?}L1qTwJz)@<D0oh%5#MhUcMR z@YG(Dw@4b~O&JgYDyxg+K&)&K!3HA0VhBPWCgxY90IEF%Zn4M5r{pKc$KRT)CnL^i zH`z@_A<Y&P4QvI8c`1p-MTsCS;vhl_M8tszkQ<O33kr2`db`D8lbfGXnv-hB4hn3L b1{NkBMjl21Mh->}W)39|0R~10<f;S!Rg(Z6 delta 1719 zcmX?VeZ*4An3tD}W3Fmkk|G1cV+JI^Jy9xBTr@=>MX-f2iZ4YdMYx3_ia&)lm_bu) zV^=OyeUxZfW@>qHd`f0+YF=?>eqM2WPG)h*EykcIq1?oZc#teYGKv$fE=sT{HKjB; zH3dnQt0*-$zbrMrxHLD4Bd;_UtUHPmY(_y*YH}10n3-H!RFs-ma*M4fl>?-HB|{PK z<T~d0jQpGBSXvnac^McOG?{Mk<Rq4)=9R=37bGU9MhPcp<QJ#r#V6<Il;%QRSp-rE zA@~>=7;bSD6y;~7CYNO9=NU0DFcdR^LV~wQaPk*cMe$^A5RZX@L7IVq0i=-`q@-oC zA=?p8?h=+7?q<eX?sSH91}271##)|=q#AA)hS;cD-WuK-j%<eFcQw2<oY^czw>lWJ zSZjEj85tQ;WNP^;bZYoO@)H<~g=+XF+p$a5^G#qV{1?tp!;{5Y!v|7Vcq^QtoiU9u zg+E20g#%=6Gh+>pI72OWieQRB3j;)?mb->Kiyah4y^M?uHOwU(HQY5U%}gQ;wfrTV zHT)oxO1NtHYuKe3ni*>aO1NtnYWQmeQrJ@1YM5$RQ$$ikYglX8>O~k*L{h|hxnr1W z1#5*$7_)d%#6i}S@YV=s@ujdxGBh)~Ff=nZGu8^HGjuYfGt`Pyl$G$-FoMKmvuZ_4 z7_*qN_-lk~xNAfu8B!!tBzu`_#X1;DSZc(Y8O<3w8A=3d1Z%{anQFyrM6y_G#B0RT zn7|HX%t{4m6)O>}VS#8DVQ6NA8X-}_m?cyrTqDuUR4Y7zvG74SLpwtnV~P|wT!cZ6 zs+BC^DiN-cY-X(Cmu5(j7GX$XY-XyJsu8Rasu8J?su4&7rC!<10-S3&8AT@V6*|r+ zJh@f)6dNc`i+Cr8ipYzBnV=-`i^~R-oRjl&3+(zPcZp={fTT1TZ!u@)m1r^*Nii@m z6oC>1Bo%=qi$MB{WEmJ3`X?)jiaGeDSBtE3e^}LGugQIjGcUe4F}ENmwK$3^FFq|b zv81#pwfGiqUVLU=T7FS(Vo7FMY7}=~JScPIr6lH+L`{AxYN`;$mX?!Q5MLR^nVgeY zTpVAVT5yY}C^0W3KR3R(B(Y?&x0sGO*me*Bb=WP&Ol;1Y14_4`q|L_QrpZ(!IeD#^ zy$MJgSTRg58ziG>GT!3IO)W``_lz$C%OkwEa<YPW03-k8LUAK|K~Olbg7Oe&acW6Q zW?GtIN)Z<*A4(S~fP~rez?r$YND0K_h>A~3OfJbUDgtGyBIU`C#9iv6m{a1*qBs&$ zQouz>6h}%Xqy)IdoS2dl#R}p^6)A%BGe+HFtc+szj4!;!UX)r~np08?js=iVSrl_b zd|8xGYEEiyYF<fkd`W&hq+E#-E6pn|MJ{omg_R^oeDZn;F(FVeXfhRPg1jRGBKRg> zmr!F%0cCx?$;^^&EF6p?Op{|JB^g~emrG7yG6qF{6^mYSahabc%PppYlqgP@#F9j} zqQu<PD7KRP_~hcUTg=5JMVg$qSPBwLGKvHyTgj-@NAahm#iNu^oM3^Z)STid=H!Bs zC@yeeS5jJ#lN!aIk_HYxNECov366(bd{EzmJegcvwhk1vLLkQqb1@1rfiMdr7o!-X z9Al9;0|UeJP%wCEuNlK#1X5Qd4DzH1h!6!4J|Kb(M1aK*gcwXrQ{WbRe0)lNa(sN0 zKyh|XYGP5IUP@v~VsUCoaS<q473od3msN<<Wnf^4Vk=0@OGzv)@&IY)2NB?u;tFDc p9E9W_gr_)ca`RJ4b5iZtVcub3;$h@r6kz0F<Y4Ad;t=4<0RUM(!La}U diff --git a/late/execute.py b/late/execute.py index 9538308..a3810c0 100644 --- a/late/execute.py +++ b/late/execute.py @@ -1,35 +1,286 @@ import os +import yaml import numpy as np +from sklearn.datasets import make_classification +from random import gauss +from math import ceil, floor +import pandas as pd +import shutil +import h5py -from multiviews_datasets_generator import generator_multiviews_dataset, results_to_csv - -n_samples = 200 #Number of samples in tha dataset -n_views = 4 # Number of views in the dataset -n_classes = 2 # Number of classes in the dataset -Z_factor = 1 # Z dim = latent_space_dim * z_factor -R = 0 # Precentage of non-redundant features in the view -n_clusters_per_class = 1 # Number of clusters for each class -class_sep_factor = 100 # Separation between the different classes -n_informative_divid = 1 # Divides the number of informative features in the latent space -standard_deviation = 2 -d = 4 -D = 10 -random_state = 42 -n_outliers = 10 - -path = "/home/baptiste/Documents/Datasets/Generated/outliers_dset/" -if not os.path.exists(path): - os.mkdir(path) - -Z, y, results, unsued_dimensions_percent, n_informative = generator_multiviews_dataset(n_samples, n_views, n_classes, - Z_factor, R, - n_clusters_per_class, - class_sep_factor, - n_informative_divid, d, D, - standard_deviation) -print(unsued_dimensions_percent) -print(n_informative) -print(Z.shape) -changing_labels_indices = np.random.RandomState(random_state).choice(np.arange(y.shape[0]), n_outliers) -y[changing_labels_indices] = np.invert(y[changing_labels_indices].astype(bool)).astype(int) -results_to_csv(path, Z, y, results) \ No newline at end of file +class MultiviewDatasetGenetator(): + + def __init__(self, n_samples=100, n_views=2, n_classes=2, + Z_factor=2, + R=0, + n_clusters_per_class=1, + class_sep_factor=10, + n_informative_divid=2, + d=4, + D=10, + standard_deviation=2, + weights=None, + flip_y=0.0, + random_state=42, config_path=None): + if config_path is not None: + with open(config_path) as config_file: + args = yaml.safe_load(config_file) + self.__init__(**args) + else: + self.n_samples = n_samples + self.n_views = n_views + self.n_classes = n_classes + self.Z_factor = Z_factor + self.R = R + self.n_clusters_per_class = n_clusters_per_class + self.class_sep_factor = class_sep_factor + self.n_informative_divid = n_informative_divid + self.d = d + self.D = D + self.standard_deviation = standard_deviation + self.weights = weights + self.flip_y = flip_y + self.random_state = random_state + + def generate(self): + if self.n_views < 2: + raise ValueError("n_views >= 2") + if self.n_classes < 2: + raise ValueError("n_classes >= 2") + if self.Z_factor < 1: + raise ValueError( + "Z_factor >= 1 pour le bon fonctionnement de l'algorithme") + if (self.R < 0) or (self.R > 1): + raise ValueError("0 <= R <= 1") + if self.n_clusters_per_class < 1: + raise ValueError("n_clusters_per_class >= 1") + if self.class_sep_factor < 0: + raise ValueError("class_sep_factor >= 0") + if self.n_informative_divid < 1: + raise ValueError("n_informative_divid >= 1") + if self.d < 1: + raise ValueError("d >= 1") + if (self.d + self.D) / 2 - 3 * self.standard_deviation < 1: + raise ValueError( + "Il faut que (d+D)/2 - 3*standard_deviation >= 1 pour avoir des valeurs positives non nulles lors de l'emploi de la loi normale") + + # n_views dimension of view v values randomly from N((d+D)/2, standard_deviation^2) + d_v = np.random.normal(loc=(self.d + self.D) / 2, + scale=self.standard_deviation, + size=self.n_views) + d_v = list(d_v) + remove_list, add_list = [], [] + for dim_view in d_v: + if dim_view < self.d or dim_view > self.D: # 1 <= d <= dim_view <= D + remove_list.append(dim_view) + add = -1 + while add < self.d or add > self.D: + add = gauss((self.d + self.D) / 2, self.standard_deviation) + add_list.append(add) + d_v = [view for view in d_v if view not in remove_list] + add_list + d_v = [int(view) for view in d_v] # dimension of views = integer + # d_v = list of views dimension from the highest to the lowest + d_v.sort(reverse=True) + # Dimension of latent space Z (multiplied by Z_factor) + self.dim_Z = self.Z_factor * self.latent_space_dimension(d_v) + # Number of informative features + self.n_informative = round(self.dim_Z / self.n_informative_divid) + # Generation of latent space Z + self.Z, self.y = make_classification(n_samples=self.n_samples, n_features=self.dim_Z, + n_informative=self.n_informative, n_redundant=0, + n_repeated=0, n_classes=self.n_classes, + n_clusters_per_class=self.n_clusters_per_class, + weights=self.weights, + flip_y=self.flip_y, + class_sep=self.n_clusters_per_class * self.class_sep_factor, + random_state=self.random_state, shuffle=False) + I_q = np.arange(self.Z.shape[1]) + meta_I_v = [] + self.results = [] + for view in range(n_views): + # choice d_v[view] numeros of Z columns uniformly from I_q + I_v = np.random.choice(I_q, size=d_v[view], + replace=False) # tirage dans I_q sans remise de taille d_v[view] + meta_I_v += list(I_v) + # projection of Z along the columns in I_v + X_v = self.projection( I_v) + self.results.append((X_v, I_v)) + # remove R*d_v[view] columns numeros of I_v form I_q + elements_to_remove = np.random.choice(I_v, + size=floor(self.R * d_v[view]), + replace=False) # tirage dans I_v sans remise de taille floor(R*d_v[view]) + I_q = np.setdiff1d(I_q, + elements_to_remove) # I_q less elements from elements_to_remove + self.unsued_dimensions_list = [column for column in I_q if + column not in meta_I_v] + self.unsued_dimensions_percent = round( + (len(self.unsued_dimensions_list) / self.dim_Z) * 100, 2) + + def projection(self, chosen_columns_list): + """ + Returns the projection of latent_space on the columns of chosen_columns_list (in chosen_columns_list order) + + Parameters: + ----------- + chosen_columns_list : list + + Returns: + -------- + an array of dimension (number of rows of latent_space, length of chosen_columns_list) + """ + return self.Z[:, chosen_columns_list] + + def latent_space_dimension(self, views_dimensions_list): + """ + Returns the minimal dimension of latent space (enough to build the dataset) for generator_multiviews_dataset compared to views_dimensions_list + + Parameters: + ----------- + views_dimensions_list : list + R : float + + Returns: + -------- + an int + """ + max_view_dimension = max(views_dimensions_list) + dimension = ceil(self.R * sum(views_dimensions_list)) + + if dimension < max_view_dimension: + dimension = max_view_dimension + + reduced_dimension = dimension + remove_sum = 0 + + for num_view in range(1, len(views_dimensions_list)): + view_prec = views_dimensions_list[num_view - 1] + view_current = views_dimensions_list[num_view] + remove = floor(self.R * view_prec) + remove_sum += remove + if reduced_dimension - remove < view_current: + dimension += view_current - (reduced_dimension - remove) + reduced_dimension = dimension - remove_sum + + return dimension + + def to_csv(self, saving_path="."): + """ + Create length of multiviews_list + 2 csv files to the indicated path + Files name : + latent_space.csv for latent_space + integer_labels.csv for integer_labels + view0.csv for multiviews_list[0] + + Parameters: + ----------- + path : str + latent_space : array + integer_labels : 1D array + multiviews_list : list of tuples + + Returns: + -------- + None + """ + df_latent_space = pd.DataFrame(self.Z) + df_latent_space.to_csv(os.path.join(saving_path, 'latent_space.csv') + , index=False) + + df_labels = pd.DataFrame(self.y) + df_labels.to_csv(os.path.join(saving_path, 'integer_labels.csv'), + index=False) + + for view_index, view_tuple in enumerate(self.results): + df_view = pd.DataFrame(view_tuple[0], columns=view_tuple[1]) + df_view.to_csv(os.path.join(saving_path, + 'view'+str(view_index)+'.csv'), + index=False) + + def to_hdf5(self, saving_path=".", name="generated_dset"): + + dataset_file = h5py.File(os.path.join(saving_path, name+".hdf5"), 'w') + + labels_dataset = dataset_file.create_dataset("Labels", + shape=self.y.shape, + data=self.y) + + labels_names = ["Label_1", "Label_0"] + + labels_dataset.attrs["names"] = [ + label_name.encode() if not isinstance(label_name, bytes) + else label_name for label_name in labels_names] + + for view_index, (data, feature_indices) in enumerate(self.results): + df_dataset = dataset_file.create_dataset("View" + str(view_index), + shape=data.shape, + data=data) + + df_dataset.attrs["sparse"] = False + df_dataset.attrs["name"] = "GeneratedView"+str(view_index) + + meta_data_grp = dataset_file.create_group("Metadata") + + meta_data_grp.attrs["nbView"] = len(self.results) + meta_data_grp.attrs["nbClass"] = np.unique(self.y) + meta_data_grp.attrs["datasetLength"] = \ + self.results[0][0].shape[0] + + meta_data_grp.create_dataset("example_ids", data=np.array( + ["gen_example_" + str(ex_indx) for ex_indx in + range(self.results[0][0].shape[0])]).astype( + np.dtype("S100")), dtype=np.dtype("S100")) + + dataset_file.close() + +if __name__=="__main__": + n_samples = 100 # Number of samples in tha dataset + n_views = 4 # Number of views in the dataset + n_classes = 2 # Number of classes in the dataset + Z_factor = 2 # Z dim = latent_space_dim * z_factor + R = 0 # Precentage of non-redundant features in the view + n_clusters_per_class = 1 # Number of clusters for each class + class_sep_factor = 10000 # Separation between the different classes + n_informative_divid = 2 # Divides the number of informative features in the latent space + standard_deviation = 2 + d = 4 + D = 10 + flip_y = 0.00 + random_state = 42 + weights = None # The proportions of examples in each class + + path = "/home/baptiste/Documents/Datasets/Generated/metrics_dset/" + name = "metrics" + if not os.path.exists(path): + os.mkdir(path) + + multiview_generator = MultiviewDatasetGenetator(n_samples=n_samples, + n_views=n_views, + n_classes=n_classes, + Z_factor=Z_factor, + R=R, + n_clusters_per_class=n_clusters_per_class, + class_sep_factor=class_sep_factor, + n_informative_divid=n_informative_divid, + d=d, + D=D, + standard_deviation=standard_deviation, + flip_y=flip_y, + weights=weights, + random_state=random_state) + + multiview_generator.generate() + multiview_generator.to_hdf5(saving_path=path, name=name) + + # for filename in os.listdir(path): + # file_path = os.path.join(path, filename) + # try: + # if os.path.isfile(file_path) or os.path.islink(file_path): + # os.unlink(file_path) + # elif os.path.isdir(file_path): + # shutil.rmtree(file_path) + # except Exception as e: + # print('Failed to delete %s. Reason: %s' % (file_path, e)) + # changing_labels_indices = np.random.RandomState(random_state).choice(np.arange(y.shape[0]), n_outliers) + # print(changing_labels_indices) + # y[changing_labels_indices] = np.invert(y[changing_labels_indices].astype(bool)).astype(int) + # results_to_csv(path, Z, y, results) \ No newline at end of file diff --git a/late/multiviews_datasets_generator.py b/late/multiviews_datasets_generator.py index 1cce9a0..d3b9bc6 100644 --- a/late/multiviews_datasets_generator.py +++ b/late/multiviews_datasets_generator.py @@ -63,7 +63,11 @@ def projection(latent_space, chosen_columns_list): return latent_space[:, chosen_columns_list] -def generator_multiviews_dataset(n_samples=1000, n_views=3, n_classes=2, Z_factor=250, R=2/3, n_clusters_per_class=1, class_sep_factor=2, n_informative_divid=2, d=2, D=12, standard_deviation=2): +def generator_multiviews_dataset(n_samples=1000, n_views=3, n_classes=2, + Z_factor=250, R=2/3, n_clusters_per_class=1, + class_sep_factor=2, n_informative_divid=2, + d=2, D=12, standard_deviation=2, weights=None, + random_state=42): """ Returns a generator multiviews dataset @@ -149,9 +153,22 @@ def generator_multiviews_dataset(n_samples=1000, n_views=3, n_classes=2, Z_facto # Number of informative features n_informative = round(dim_Z/n_informative_divid) # Generation of latent space Z - Z, y = make_classification(n_samples=n_samples, n_features=dim_Z, n_informative=n_informative, n_redundant=0, - n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters_per_class, weights=None, - flip_y=0.00, class_sep=n_clusters_per_class*class_sep_factor, random_state=None) + print("n_samples :", n_samples) + print("dim_Z :", dim_Z) + print("n_informative :", n_informative) + print("n_redundant :", 0) + print("n_repeated :", 0) + print("n_classes :", n_classes) + print("n_clusters_per_class :", n_clusters_per_class) + print("class_sep :", n_clusters_per_class*class_sep_factor) + + + Z, y = make_classification(n_samples=n_samples, n_features=dim_Z, n_informative=n_informative, n_redundant=0, + n_repeated=0, n_classes=n_classes, n_clusters_per_class=n_clusters_per_class, weights=weights, + flip_y=0.00, class_sep=n_clusters_per_class*class_sep_factor, random_state=random_state, shuffle=False) + # Z, y = make_classification(n_samples=200, n_features=10, n_informative=2, n_redundant=0, + # n_repeated=0, n_classes=2, n_clusters_per_class=1, weights=None, + # flip_y=0, class_sep=100, random_state=random_state, shuffle=False) I_q = np.array([i for i in range(Z.shape[1])]) # 1D-array of Z columns numero meta_I_v = [] -- GitLab