diff --git a/skluc/main/data/mldatasets/Dataset.py b/skluc/main/data/mldatasets/Dataset.py
index 58a5d76528db1841175529e9f402059fc67a19b4..807771c2c4d572907eec798608b96bfe449b11a6 100644
--- a/skluc/main/data/mldatasets/Dataset.py
+++ b/skluc/main/data/mldatasets/Dataset.py
@@ -1,7 +1,6 @@
 import os
 
 import numpy as np
-from sklearn.model_selection import StratifiedShuffleSplit
 from sklearn.preprocessing import LabelBinarizer
 
 from skluc.main.utils import LabeledData
@@ -23,6 +22,7 @@ class Dataset(object):
         for url in self.l_url:
             splitted_url = url.split("/")
             self.l_filenames.append(splitted_url[-1])
+        self.s_name = s_name
         self.s_download_dir = os.path.join(s_download_dir, self.s_name)
         self.l_filepaths = [os.path.join(self.s_download_dir, fname) for fname in self.l_filenames]
 
@@ -39,25 +39,82 @@ class Dataset(object):
             kept_indices = self.get_uniform_class_rand_indices_train(new_size)
             self.permuted_index_train = self.permuted_index_train[kept_indices]
 
-    def get_uniform_class_rand_indices_train(self, size):
-        try:
-            sss = StratifiedShuffleSplit(n_splits=1, train_size=size, random_state=self.seed)
-            kept_indices, _ = sss.split(np.arange(len(self.train.data)), self.train.labels)
-        except ValueError as e:
-            logger.warning("In Dataset.get_uniform_class_rand_indices_train Handled exception: {}".format(str(e)))
-            logger.debug("Use random indexes instead")
-            kept_indices = np.random.permutation(len(self.train.data))[:size]
-        return kept_indices
-
-    def get_uniform_class_rand_indices_validation(self, size):
-        try:
-            sss = StratifiedShuffleSplit(n_splits=1, train_size=size, random_state=self.seed)
-            kept_indices, _ = sss.split(np.arange(len(self.train.data)), self.train.labels)
-        except ValueError as e:
-            logger.warning("In Dataset.get_uniform_class_rand_indices_validation Handled exception: {}".format(str(e)))
-            logger.debug("Use random indexes instead")
-            kept_indices = np.random.permutation(len(self.validation.data))[:size]
-        return kept_indices
+    def get_bool_idx_label(self, find_label, labels):
+        """
+        Return a boolean np.array which is True wherever an entry of labels equals find_label.
+
+        :param find_label: the label to look for
+        :param labels: the array of labels to search in
+        :return: a boolean np.array of the same length as labels
+        """
+        if len(labels.shape) == 1:
+            bool_idx_labs = labels == find_label
+        elif len(labels.shape) == 2:
+            bool_idx_labs = np.all(labels == find_label, axis=-1)
+        else:
+            raise ValueError(
+                "Function get_bool_idx_label is only intended for scalar and vector valued labels")
+        return bool_idx_labs
+
+    def get_uniform_class_rand_indices(self, labels, size, shuffle=True):
+        """size is the total size of the returned index set, not the size per class"""
+        logger.debug("Start finding subset indices of size {} with uniform distribution of labels".format(size))
+        unique_labels = np.unique(labels, axis=0)
+        nb_labels = len(unique_labels)
+        nbr_by_label = size // nb_labels
+        logger.debug("Need {} (+/- 1) examples per label".format(nbr_by_label))
+        # return_idx_labels = np.empty(size, dtype=np.int)
+        return_idx_labels = []
+        copy_idx = 0
+
+        for u_lab in unique_labels:
+            bool_idx_labs = self.get_bool_idx_label(u_lab, labels)
+            get_nbr = nbr_by_label
+            if len(np.where(bool_idx_labs)[0]) < get_nbr:
+                raise IndexError(
+                    "Found only {} examples with label {} when {} were requested".format(
+                        len(np.where(bool_idx_labs)[0]), u_lab, get_nbr))
+            idx_labs = np.where(bool_idx_labs)[0][:get_nbr]
+            # return_idx_labels[copy_idx:copy_idx+get_nbr] = idx_labs
+            logger.debug("Found indices for label {}: {}; length: {}".format(u_lab, idx_labs, len(idx_labs)))
+            return_idx_labels.extend(idx_labs)
+            copy_idx += get_nbr
+
+        remaining = size - copy_idx
+        if remaining > 0:
+            logger.debug("After taking an equal number ({}) of examples per label (total = {}), "
+                         "need to find more examples to reach size {}".format(nbr_by_label, nbr_by_label * nb_labels,
+                                                                              size))
+        get_nbr = 1
+        while remaining > 0:
+            u_lab_idx = np.random.choice(len(unique_labels), 1)[0]
+            u_lab = unique_labels[u_lab_idx]
+            bool_idx_labs = self.get_bool_idx_label(u_lab, labels)
+
+            all_idx_labs = np.where(bool_idx_labs)[0]
+            idx_labs_not_already_gotten = np.setdiff1d(all_idx_labs, return_idx_labels)
+            if len(idx_labs_not_already_gotten) < get_nbr:
+                raise IndexError(
+                    "Found only {} unused examples with label {} when {} more were requested".format(
+                        len(idx_labs_not_already_gotten), u_lab, get_nbr))
+            idx_labs = idx_labs_not_already_gotten[:get_nbr]
+            # todo set difference
+            # return_idx_labels[copy_idx:copy_idx + get_nbr] = idx_labs
+            return_idx_labels.extend(idx_labs)
+
+            remaining -= 1
+
+        if shuffle:
+            np.random.seed(self.seed)
+            np.random.shuffle(return_idx_labels)
+
+        return np.array(return_idx_labels)
+
+    def get_uniform_class_rand_indices_train(self, size, shuffle=True):
+        return self.get_uniform_class_rand_indices(labels=self.train.labels, size=size, shuffle=shuffle)
+
+    def get_uniform_class_rand_indices_validation(self, size, shuffle=True):
+        return self.get_uniform_class_rand_indices(labels=self.validation.labels, size=size, shuffle=shuffle)
 
     @property
     def train(self):
@@ -80,7 +137,7 @@ class Dataset(object):
 
         :return: None
         """
-        self.create_directory_tree()
+        create_directory(self.s_download_dir)
         if not check_files(self.l_filepaths):
             logger.debug("Files need to be downloaded")
             for s_fname in self.l_filepaths:
@@ -92,14 +149,6 @@ class Dataset(object):
         else:
             logger.debug("Files {} already exist".format(self.l_filepaths))
 
-    def create_directory_tree(self):
-        """
-        Create the target directory tree
-
-        :return: None
-        """
-        create_directory(self.s_download_dir)
-
     def _check_validation_size(self, data_length):
         if self.validation_size > data_length:
             raise ValueError("The validation set size ({}) is higher than the train set size ({}). " \
@@ -108,7 +157,10 @@ class Dataset(object):
 
     def to_one_hot(self):
         """
-        Convert categorical labels to one hot encoding
+        Convert categorical labels to one hot encoding.
+
+        Note: Beware, the information on how the labels are encoded is not stored
+        in the data file.
 
         :return:
         """
@@ -126,6 +178,14 @@ class Dataset(object):
             setattr(self, kw, LabeledData(data, labels))
 
     def revert_one_hot(self):
+        """
+        Convert one hot encoded labels back to categorical labels.
+
+        Note: Beware, the information on how the labels are encoded is not stored
+        in the data file.
+
+        :return:
+        """
         logger.info("Revert one hot encoding to dataset {}.".format(self.s_name))
         for kw in self.data_groups_private:
             datlab = getattr(self, kw)
@@ -143,6 +203,10 @@ class Dataset(object):
 
         Feature scaling normalization.
 
+        Note: Beware, the information on whether or not the data has been normalized
+        is not stored in the data file. If you call normalize on already normalized
+        data, it won't have any effect though.
+
         :return:
         """
         logger.info("Apply normalization to data from dataset {}.".format(self.s_name))
@@ -158,6 +222,14 @@ class Dataset(object):
             setattr(self, kw, LabeledData(data, datlab.labels))
 
     def data_astype(self, _type):
+        """
+        Change data type to _type.
+
+        Note: Beware, the information on the type of the data is not stored in the data file.
+
+        :param _type:
+        :return:
+        """
         logger.info("Change type of data to {} in the dataset {}.".format(str(_type), self.s_name))
         for kw in self.data_groups_private:
             datlab = getattr(self, kw)
@@ -171,6 +243,14 @@ class Dataset(object):
             setattr(self, kw, LabeledData(data, datlab.labels))
 
     def labels_astype(self, _type):
+        """
+        Change labels type to _type.
+
+        Note: Beware, the information on the type of the labels is not stored in the data file.
+
+        :param _type:
+        :return:
+        """
         logger.info("Change type of labels to {} in the dataset {}.".format(str(_type), self.s_name))
         for kw in self.data_groups_private:
             datlab = getattr(self, kw)
@@ -208,6 +288,13 @@ class Dataset(object):
             raise Exception("No data loaded at the end of load method.")
 
     def save_npz(self, npzdir_path=None):
+        """
+        Save data and labels, in their current state, to the npz directory path.
+
+        :param npzdir_path:
+        :return:
+        """
+        # todo: find a solution for storing the current state?
         if npzdir_path is None:
             npzdir_path = os.path.join(self.s_download_dir, "npzfiles")
         for kw in self.data_groups_private:
@@ -219,6 +306,12 @@ class Dataset(object):
             np.savez(filepath, **dict_attr)
 
     def load_npz(self, npzdir_path=None):
+        """
+        Load data and labels from the npz directory path.
+
+        :param npzdir_path:
+        :return:
+        """
         if npzdir_path is None:
             npzdir_path = os.path.join(self.s_download_dir, "npzfiles")
         for kw in self.data_groups_private:
diff --git a/skluc/main/data/transformation/ResizeTransformer.py b/skluc/main/data/transformation/ResizeTransformer.py
index 2f5161fdfdb904bd0e91b5e84088360253fcffe4..c8b03f566c74a32aa1207a92f4fbcefc044fbf39 100644
--- a/skluc/main/data/transformation/ResizeTransformer.py
+++ b/skluc/main/data/transformation/ResizeTransformer.py
@@ -1,11 +1,12 @@
-import tensorflow as tf
 import numpy as np
+import tensorflow as tf
 
 from skluc.main.data.transformation.Transformer import Transformer
 from skluc.main.utils import logger, Singleton
 
 
 class ResizeTransformer(Transformer, metaclass=Singleton):
+    # todo: add a fit method?
     def __init__(self, data_name, output_shape):
         if len(output_shape) != 2:
             raise AssertionError("Output shape should be 2D and it is {}D: {}".format(len(output_shape), output_shape))
diff --git a/skluc/test/test_data/test_mldatasets/TestCifar10Dataset.py b/skluc/test/test_data/test_mldatasets/TestCifar10Dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e4773d48f6f9b0bec4a9a1ff23c0108f32139a7
--- /dev/null
+++ b/skluc/test/test_data/test_mldatasets/TestCifar10Dataset.py
@@ -0,0 +1,38 @@
+import os
+import unittest
+
+from skluc.main.data.mldatasets import Cifar10Dataset
+from skluc.main.utils import silentremove
+
+
+class TestCifar10Dataset(unittest.TestCase):
+    def setUp(self):
+        self.cifar10_names = [
+            'batches.meta',
+            'data_batch_1',
+            'data_batch_2',
+            'data_batch_3',
+            'data_batch_4',
+            'data_batch_5',
+            'readme.html',
+            'test_batch'
+        ]
+        self.download_dir = "/tmp"
+        self.cifar10_dirname = os.path.join("cifar10", "cifar-10-batches-py")
+        self.datadir_name = os.path.join(self.download_dir, self.cifar10_dirname)
+        self.full_cifar10_names = [*map(lambda name: os.path.join(self.datadir_name, name), self.cifar10_names)]
+
+    def test_cifar10(self):
+        cifar10 = Cifar10Dataset(s_download_dir=self.download_dir)
+        cifar10_data = cifar10.load()
+        for name in self.full_cifar10_names:
+            self.assertTrue(os.path.exists(name))
+        self.assertTrue(all(kw in cifar10_data.keys() for kw in ["train", "test"]))
+
+    def tearDown(self):
+        for name in self.full_cifar10_names:
+            silentremove(name)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/skluc/test/test_data/test_mldatasets.py b/skluc/test/test_data/test_mldatasets/TestDataset.py
similarity index 61%
rename from skluc/test/test_data/test_mldatasets.py
rename to skluc/test/test_data/test_mldatasets/TestDataset.py
index 3cb6abca99f79c6f601033b1efacc1013d040e32..7873200df8ecee4dad6066c0362bb82437844560 100644
--- a/skluc/test/test_data/test_mldatasets.py
+++ b/skluc/test/test_data/test_mldatasets/TestDataset.py
@@ -1,75 +1,28 @@
-import os
 import unittest
 
-from skluc.main.data.mldatasets.Cifar10Dataset import Cifar10Dataset
-from skluc.main.data.mldatasets.MnistDataset import MnistDataset
-from skluc.main.data.mldatasets.MovieReviewDataset import MovieReviewV1Dataset
-from skluc.main.data.mldatasets.SVHNDataset import SVHNDataset
-from skluc.main.utils import silentremove
+import numpy as np
 
+from skluc.main.data.mldatasets.Dataset import Dataset
+from skluc.main.utils import LabeledData
 
-class TestMnistDataset(unittest.TestCase):
-    def setUp(self):
-        self.mnist_names = [
-            "train-images-idx3-ubyte.gz",
-            "train-labels-idx1-ubyte.gz",
-            "t10k-images-idx3-ubyte.gz",
-            "t10k-labels-idx1-ubyte.gz"
-        ]
-        self.download_dir = "/tmp"
-        self.mnist_dirname = "mnist"
-        self.datadir_name = os.path.join(self.download_dir, self.mnist_dirname)
-        self.full_mnist_names = [*map(lambda name: os.path.join(self.datadir_name, name), self.mnist_names)]
-
-    def test_mnist(self):
-        mnist = MnistDataset(s_download_dir=self.download_dir)
-        mnist_data = mnist.load()
-        for name in self.full_mnist_names:
-            self.assertTrue(os.path.exists(name))
-        self.assertTrue(kw in mnist_data.keys() for kw in ["train", "test"])
-        mnist_image = mnist.to_image()
-        self.assertTrue(mnist_image["train"][0][0].shape == (28, 28,1))
-
-    def tearDown(self):
-        for name in self.full_mnist_names:
-            silentremove(name)
-
-
-class TestCifar10Dataset(unittest.TestCase):
-    def setUp(self):
-        self.mnist_names = [
-            'batches.meta',
-            'data_batch_1',
-            'data_batch_2',
-            'data_batch_3',
-            'data_batch_4',
-            'data_batch_5',
-            'readme.html',
-            'test_batch'
-        ]
-        self.download_dir = "/tmp"
-        self.cifar10_dirname = os.path.join("cifar10", "cifar-10-batches-py")
-        self.datadir_name = os.path.join(self.download_dir, self.cifar10_dirname)
-        self.full_cifar10_names = [*map(lambda name: os.path.join(self.datadir_name, name), self.mnist_names)]
-
-    def test_cifar10(self):
-        cifar10 = Cifar10Dataset(s_download_dir=self.download_dir)
-        cifar10_data = cifar10.load()
-        for name in self.full_cifar10_names:
-            self.assertTrue(os.path.exists(name))
-        self.assertTrue(kw in cifar10_data.keys() for kw in ["train", "test"])
-
-    def tearDown(self):
-        for name in self.full_cifar10_names:
-            silentremove(name)
 
+
+class FooDataset(Dataset):
+    def __init__(self, validation_size=1000, seed=0):
+        super().__init__([], "foo", validation_size=validation_size, seed=seed)
+
+    def read(self):
+        train_size = 9000
+        test_size = 1000
+        np.random.seed(self.seed)
+        self._train = LabeledData(data=np.random.rand(train_size, 200),
+                                  labels=np.random.choice(10, size=train_size, replace=True))
+        self._test = LabeledData(data=np.random.rand(test_size, 200),
+                                 labels=np.random.choice(10, size=test_size, replace=True))
 
 
 class TestDataset(unittest.TestCase):
     def setUp(self):
-        self.dataset_classes = [MnistDataset,
-                                Cifar10Dataset,
-                                SVHNDataset,
-                                MovieReviewV1Dataset]
+        self.dataset_classes = [FooDataset]
 
     def test_seed_train_val(self):
         for d_class in self.dataset_classes:
@@ -79,14 +32,15 @@ class TestDataset(unittest.TestCase):
             d1.load()
             d2.load()
             d3.load()
-            self.assertTrue((d1.train.data[:10] == d2.train.data[:10]).all(),
+            self.assertTrue((d1.train.data == d2.train.data).all(),
                             msg="Same seed gives different train/val split in {} dataset".format(d_class.__name__))
-            self.assertTrue((d1.train.data[:10] != d3.train.data[:10]).any(),
+            self.assertTrue((d1.train.data != d3.train.data).any(),
                             msg="Different seeds give same train/val split in {} dataset".format(d_class.__name__))
 
     def test_seed_uniform_rand_indices_train(self):
-        size = 10
-        size2 = 12
+        size = 100
+        size_bis = 102
+        size2 = 120
         for d_class in self.dataset_classes:
             # test same subsample size
             # ------------------------
@@ -97,11 +51,41 @@ class TestDataset(unittest.TestCase):
             d2.load()
             d3.load()
             sub_indexes1 = d1.get_uniform_class_rand_indices_train(size)
+            sub_indexes1_not_modulo = d1.get_uniform_class_rand_indices_train(size_bis)
             sub_indexes2 = d2.get_uniform_class_rand_indices_train(size)
-            sub_indexes3 = d2.get_uniform_class_rand_indices_train(size)
+            sub_indexes3 = d3.get_uniform_class_rand_indices_train(size)
+            # the returned number of indices is actually equal to the requested size
+            self.assertEqual(len(sub_indexes1), size)
+            self.assertEqual(len(sub_indexes1_not_modulo), size_bis)
+            self.assertEqual(len(sub_indexes2), size)
+            self.assertEqual(len(sub_indexes3), size)
+
             subsample1 = d1.train.data[sub_indexes1]
             subsample2 = d2.train.data[sub_indexes2]
             subsample3 = d3.train.data[sub_indexes3]
+
+            unique_labels = np.unique(d1.train.labels, axis=0)
+            labels = d1.train.labels[sub_indexes1]
+            labels_bis = d1.train.labels[sub_indexes1_not_modulo]
+            for u_lab in unique_labels:
+                if len(labels.shape) == 1:
+                    bool_idx_labs = labels == u_lab
+                    bool_idx_labs_bis = labels_bis == u_lab
+
+                elif len(labels.shape) == 2:
+                    bool_idx_labs = np.all(labels == u_lab, axis=-1)
+                    bool_idx_labs_bis = np.all(labels_bis == u_lab, axis=-1)
+                else:
+                    raise ValueError(
+                        "This test only supports scalar and vector valued labels")
+
+                idx_labs = np.where(bool_idx_labs)[0]
+                idx_labs_bis = np.where(bool_idx_labs_bis)[0]
+                self.assertTrue(
+                    len(idx_labs) == size // len(unique_labels) or len(idx_labs) == size // len(unique_labels) + 1)
+                self.assertTrue(len(idx_labs_bis) == size // len(unique_labels) or len(idx_labs_bis) == size // len(
+                    unique_labels) + 1)
+
             self.assertTrue((subsample1 == subsample2).all(),
                             msg="Same seed gives different subsamples in {} dataset".format(d_class.__name__))
             self.assertTrue((subsample1 != subsample3).any(),
@@ -109,14 +93,17 @@ class TestDataset(unittest.TestCase):
 
             # test different subsample size
             # -----------------------------
-            d1 = d_class(validation_size=1000, seed=0)
-            d2 = d_class(validation_size=1000, seed=0)
+            d1 = d_class(validation_size=1000, seed=1)
+            d2 = d_class(validation_size=1000, seed=1)
             d1.load()
             d2.load()
             sub_indexes1 = d1.get_uniform_class_rand_indices_train(size)
             sub_indexes2 = d2.get_uniform_class_rand_indices_train(size2)
+            self.assertEqual(len(sub_indexes1), size)
+            self.assertEqual(len(sub_indexes2), size2)
             subsample1 = d1.train.data[sub_indexes1]
             subsample2 = d2.train.data[sub_indexes2]
+
             for subs1 in subsample1:
                 found = False
                 for subs2 in subsample2:
@@ -175,3 +162,7 @@ class TestDataset(unittest.TestCase):
                 self.assertTrue(found,
                                 msg="Little subsample is not a subset of big subsample using same seed in {} dataset".format(
                                     d_class.__name__))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/skluc/test/test_data/test_mldatasets/__init__.py b/skluc/test/test_data/test_mldatasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
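
Usage sketch of the new class-balanced subsampling API this patch introduces, using the FooDataset fixture from TestDataset.py above. The module import path and the assumption that load() populates the train split via read() are inferred from the test code, not confirmed here.

    import numpy as np

    from skluc.test.test_data.test_mldatasets.TestDataset import FooDataset

    # Build the synthetic 10-class dataset and load its train/validation split.
    dataset = FooDataset(validation_size=1000, seed=0)
    dataset.load()

    # Ask for 100 indices in total (not per class): with 10 labels this means
    # size // nb_labels = 10 indices per label, shuffled with the dataset seed.
    indices = dataset.get_uniform_class_rand_indices_train(100)
    sub_labels = dataset.train.labels[indices]

    # Every label appears exactly size // nb_labels times here (no remainder).
    _, counts = np.unique(sub_labels, return_counts=True)
    assert all(c == 10 for c in counts)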