-
Baptiste Bauvin authoredBaptiste Bauvin authored
format_dataset.py 3.44 KiB
"""
This file is provided as an example of dataset formatting, using a csv-stored
mutliview dataset to build a SuMMIT-compatible hdf5 file.
Please see http://baptiste.bauvin.pages.lis-lab.fr/multiview-machine-learning-omis/tutorials/example4.html
for complementary information.
"""
import numpy as np
import h5py
# The following variables are defined as an example, you should modify them to fite your dataset files.
view_names = ["view_name_1", "view_name_2", "view_name_3", ]
data_file_paths = ["path/to/view_1.csv", "path/to/view_1.csv", "path/to/view_1.csv",]
labels_file_path = "path/to/labels/file.csv"
example_ids_path = "path/to/example_ids/file.csv"
labels_names = ["Label_1", "Label_2", "Label_3"]
# HDF5 dataset initialization :
hdf5_file = h5py.File("path/to/file.hdf5", "w")
# Store each view in a hdf5 dataset :
for view_index, (file_path, view_name) in enumerate(zip(data_file_paths, view_names)):
# Get the view's data from the csv file
view_data = np.genfromtxt(file_path, delimiter=",")
# Store it in a dataset in the hdf5 file,
# do not modify the name of the dataset
view_dataset = hdf5_file.create_dataset(name="View{}".format(view_index),
shape=view_data.shape,
data=view_data)
# Store the name of the view in an attribute,
# do not modify the attribute's key
view_dataset.attrs["name"] = view_name
# This is an artifact of work in progress for sparse support, not available ATM,
# do not modify the attribute's key
view_dataset.attrs["sparse"] = False
# Get le labels data from a csv file
labels_data = np.genfromtxt(labels_file_path, delimiter=',')
# Here, we supposed that the labels file contained numerical labels (0,1,2)
# that reffer to the label names of label_names.
# The Labels HDF5 dataset must contain only integers that represent the
# different classes, the names of each class are saved in an attribute
# Store the integer labels in the HDF5 dataset,
# do not modify the name of the dataset
labels_dset = hdf5_file.create_dataset(name="Labels",
shape=labels_data.shape,
data=labels_data)
# Save the labels names in an attribute as encoded strings,
# do not modify the attribute's key
labels_dset.attrs["names"] = [label_name.encode() for label_name in labels_names]
# Create a Metadata HDF5 group to store the metadata,
# do not modify the name of the group
metadata_group = hdf5_file.create_group(name="Metadata")
# Store the number of views in the dataset,
# do not modify the attribute's key
metadata_group.attrs["nbView"] = len(view_names)
# Store the number of classes in the dataset,
# do not modify the attribute's key
metadata_group.attrs["nbClass"] = np.unique(labels_data)
# Store the number of examples in the dataset,
# do not modify the attribute's key
metadata_group.attrs["datasetLength"] = labels_data.shape[0]
# Let us suppose that the examples have string ids, available in a csv file,
# they can be stored in the HDF5 and will be used in the result analysis.
example_ids = np.genfromtxt(example_ids_path, delimiter=',')
# To sore the strings in an HDF5 dataset, be sure to use the S<max_length> type,
# do not modify the name of the dataset.
metadata_group.create_dataset("example_ids",
data=np.array(example_ids).astype(np.dtype("S100")),
dtype=np.dtype("S100"))
hdf5_file.close()