"""Example of dataset formatting for SuMMIT.

This file is provided as an example of dataset formatting, using a csv-stored
multiview dataset to build a SuMMIT-compatible hdf5 file.
Please see
http://baptiste.bauvin.pages.lis-lab.fr/multiview-machine-learning-omis/tutorials/example4.html
for complementary information.
"""

import numpy as np
import h5py

# The following variables are defined as an example, you should modify them
# to fit your dataset files.
view_names = ["view_name_1", "view_name_2", "view_name_3"]
# One csv file per view, listed in the same order as ``view_names``.
data_file_paths = [
    "path/to/view_1.csv",
    "path/to/view_2.csv",
    "path/to/view_3.csv",
]
labels_file_path = "path/to/labels/file.csv"
example_ids_path = "path/to/example_ids/file.csv"
labels_names = ["Label_1", "Label_2", "Label_3"]

# HDF5 dataset initialization.
# The context manager closes the file even if one of the steps below raises.
with h5py.File("path/to/file.hdf5", "w") as hdf5_file:

    # Store each view in a hdf5 dataset :
    for view_index, (file_path, view_name) in enumerate(
            zip(data_file_paths, view_names)):
        # Get the view's data from the csv file
        view_data = np.genfromtxt(file_path, delimiter=",")

        # Store it in a dataset in the hdf5 file,
        # do not modify the name of the dataset
        view_dataset = hdf5_file.create_dataset(
            name="View{}".format(view_index),
            shape=view_data.shape,
            data=view_data)
        # Store the name of the view in an attribute,
        # do not modify the attribute's key
        view_dataset.attrs["name"] = view_name

        # This is an artifact of work in progress for sparse support,
        # not available ATM, do not modify the attribute's key
        view_dataset.attrs["sparse"] = False

    # Get the labels data from a csv file.
    # Here, we suppose that the labels file contains numerical labels
    # (0, 1, 2) that refer to the label names of labels_names.
    # The Labels HDF5 dataset must contain only integers that represent the
    # different classes, so the values parsed by genfromtxt (floats by
    # default) are cast to int. The names of each class are saved in an
    # attribute.
    labels_data = np.genfromtxt(labels_file_path, delimiter=",").astype(int)

    # Store the integer labels in the HDF5 dataset,
    # do not modify the name of the dataset
    labels_dset = hdf5_file.create_dataset(name="Labels",
                                           shape=labels_data.shape,
                                           data=labels_data)
    # Save the labels names in an attribute as encoded strings,
    # do not modify the attribute's key
    labels_dset.attrs["names"] = [label_name.encode()
                                  for label_name in labels_names]

    # Create a Metadata HDF5 group to store the metadata,
    # do not modify the name of the group
    metadata_group = hdf5_file.create_group(name="Metadata")

    # Store the number of views in the dataset,
    # do not modify the attribute's key
    metadata_group.attrs["nbView"] = len(view_names)

    # Store the number of classes in the dataset (the count of distinct
    # labels, not the array of distinct labels itself),
    # do not modify the attribute's key
    metadata_group.attrs["nbClass"] = len(np.unique(labels_data))

    # Store the number of examples in the dataset,
    # do not modify the attribute's key
    metadata_group.attrs["datasetLength"] = labels_data.shape[0]

    # Let us suppose that the examples have string ids, available in a csv
    # file, they can be stored in the HDF5 and will be used in the result
    # analysis. dtype=str is required: with the default float dtype,
    # genfromtxt would turn every non-numeric id into NaN.
    example_ids = np.genfromtxt(example_ids_path, delimiter=",", dtype=str)

    # To store the strings in an HDF5 dataset, be sure to use the
    # S<max_length> type, do not modify the name of the dataset.
    metadata_group.create_dataset("example_ids",
                                  data=example_ids.astype(np.dtype("S100")),
                                  dtype=np.dtype("S100"))