Commit b5ddb30b authored by Luc Giffon

Improve data section:

- selecting a stratified subsample is tested and working
- more documentation in dataset.py
- split the dataset tests into a subdirectory
parent fb69843a
import os
import numpy as np
-from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelBinarizer
from skluc.main.utils import LabeledData
@@ -23,6 +22,7 @@ class Dataset(object):
        for url in self.l_url:
            splitted_url = url.split("/")
            self.l_filenames.append(splitted_url[-1])
        self.s_name = s_name
        self.s_download_dir = os.path.join(s_download_dir, self.s_name)
        self.l_filepaths = [os.path.join(self.s_download_dir, fname) for fname in self.l_filenames]
@@ -39,25 +39,82 @@ class Dataset(object):
        kept_indices = self.get_uniform_class_rand_indices_train(new_size)
        self.permuted_index_train = self.permuted_index_train[kept_indices]
-    def get_uniform_class_rand_indices_train(self, size):
-        try:
-            sss = StratifiedShuffleSplit(n_splits=1, train_size=size, random_state=self.seed)
-            kept_indices, _ = sss.split(np.arange(len(self.train.data)), self.train.labels)
-        except ValueError as e:
-            logger.warning("In Dataset.get_uniform_class_rand_indices_train Handled exception: {}".format(str(e)))
-            logger.debug("Use random indexes instead")
-            kept_indices = np.random.permutation(len(self.train.data))[:size]
-        return kept_indices
-
-    def get_uniform_class_rand_indices_validation(self, size):
-        try:
-            sss = StratifiedShuffleSplit(n_splits=1, train_size=size, random_state=self.seed)
-            kept_indices, _ = sss.split(np.arange(len(self.train.data)), self.train.labels)
-        except ValueError as e:
-            logger.warning("In Dataset.get_uniform_class_rand_indices_validation Handled exception: {}".format(str(e)))
-            logger.debug("Use random indexes instead")
-            kept_indices = np.random.permutation(len(self.validation.data))[:size]
-        return kept_indices
+    def get_bool_idx_label(self, find_label, labels):
+        """
+        Return the boolean np.array that is True where find_label matches the entry in labels.
+
+        :param find_label: the label value to look for
+        :param labels: the array of labels to search in
+        :return: boolean index array over labels
+        """
+        if len(labels.shape) == 1:
+            bool_idx_labs = labels == find_label
+        elif len(labels.shape) == 2:
+            bool_idx_labs = np.all(labels == find_label, axis=-1)
+        else:
+            raise ValueError("get_bool_idx_label is only intended for scalar- and vector-valued labels")
+        return bool_idx_labs
+
+    def get_uniform_class_rand_indices(self, labels, size, shuffle=True):
+        """size is the final size, not the size per class"""
+        logger.debug("Start finding subset indices of size {} with uniform distribution of labels".format(size))
+        unique_labels = np.unique(labels, axis=0)
+        nb_labels = len(unique_labels)
+        nbr_by_label = size // nb_labels
+        logger.debug("Need {} (+/- 1) examples by label".format(nbr_by_label))
+        return_idx_labels = []
+        copy_idx = 0
+        for u_lab in unique_labels:
+            bool_idx_labs = self.get_bool_idx_label(u_lab, labels)
+            get_nbr = nbr_by_label
+            if len(np.where(bool_idx_labs)[0]) < get_nbr:
+                raise IndexError("Found {} examples with label {} when {} were asked for".format(
+                    len(np.where(bool_idx_labs)[0]), u_lab, get_nbr))
+            idx_labs = np.where(bool_idx_labs)[0][:get_nbr]
+            logger.debug("Found indexes for label {}: {}; length: {}".format(u_lab, idx_labs, len(idx_labs)))
+            return_idx_labels.extend(idx_labs)
+            copy_idx += get_nbr
+        remaining = size - copy_idx
+        if remaining > 0:
+            logger.debug("After finding an equal number ({}) of examples by label (total = {}), "
+                         "need to find more examples to reach size {}".format(nbr_by_label,
+                                                                              nbr_by_label * nb_labels, size))
+            get_nbr = 1
+            while remaining > 0:
+                # pick one extra example from a random class, avoiding indices already taken
+                u_lab_idx = np.random.choice(len(unique_labels), 1)[0]
+                u_lab = unique_labels[u_lab_idx]
+                bool_idx_labs = self.get_bool_idx_label(u_lab, labels)
+                all_idx_labs = np.where(bool_idx_labs)[0]
+                idx_labs_not_already_gotten = np.setdiff1d(all_idx_labs, return_idx_labels)
+                if len(idx_labs_not_already_gotten) < get_nbr:
+                    raise IndexError("Found {} examples with label {} when {} more were asked for".format(
+                        len(np.where(bool_idx_labs)[0]), u_lab, get_nbr))
+                idx_labs = idx_labs_not_already_gotten[:get_nbr]
+                return_idx_labels.extend(idx_labs)
+                remaining -= 1
+        if shuffle:
+            np.random.seed(self.seed)
+            np.random.shuffle(return_idx_labels)
+        return np.array(return_idx_labels)
+
+    def get_uniform_class_rand_indices_train(self, size, shuffle=True):
+        return self.get_uniform_class_rand_indices(labels=self.train.labels, size=size, shuffle=shuffle)
+
+    def get_uniform_class_rand_indices_validation(self, size, shuffle=True):
+        return self.get_uniform_class_rand_indices(labels=self.validation.labels, size=size, shuffle=shuffle)
    @property
    def train(self):
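These new helpers replace the StratifiedShuffleSplit-based selection above with a deterministic, label-balanced choice of indices. A minimal usage sketch, assuming a loaded Dataset instance d with 10 integer classes (d and the class count are illustrative; only the method names come from the diff):

    import numpy as np

    idx = d.get_uniform_class_rand_indices_train(size=102)  # 102 indices, 10 or 11 per class
    sub_data = d.train.data[idx]
    sub_labels = d.train.labels[idx]
    counts = np.bincount(sub_labels)  # every class appears size // 10 or size // 10 + 1 times
    assert all(c in (10, 11) for c in counts)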
@@ -80,7 +137,7 @@ class Dataset(object):

        :return: None
        """
-        self.create_directory_tree()
+        create_directory(self.s_download_dir)
        if not check_files(self.l_filepaths):
            logger.debug("Files need to be downloaded")
            for s_fname in self.l_filepaths:
@@ -92,14 +149,6 @@ class Dataset(object):
        else:
            logger.debug("Files {} already exist".format(self.l_filepaths))
-    def create_directory_tree(self):
-        """
-        Create the target directory tree
-
-        :return: None
-        """
-        create_directory(self.s_download_dir)
    def _check_validation_size(self, data_length):
        if self.validation_size > data_length:
            raise ValueError("The validation set size ({}) is higher than the train set size ({}). " \
@@ -108,7 +157,10 @@ class Dataset(object):

    def to_one_hot(self):
        """
-        Convert categorical labels to one hot encoding
+        Convert categorical labels to one hot encoding.
+
+        Note: Beware, the information on how the labels are encoded is not stored
+        in the data file.

        :return:
        """
@@ -126,6 +178,14 @@ class Dataset(object):
            setattr(self, kw, LabeledData(data, labels))

    def revert_one_hot(self):
+        """
+        Convert one hot encoded labels back to categorical labels.
+
+        Note: Beware, the information on how the labels are encoded is not stored
+        in the data file.
+
+        :return:
+        """
        logger.info("Revert one hot encoding to dataset {}.".format(self.s_name))
        for kw in self.data_groups_private:
            datlab = getattr(self, kw)
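As a reminder of what this one-hot round trip looks like, here is a minimal sketch using the LabelBinarizer imported at the top of dataset.py; it is illustrative, not the actual method bodies, which are elided in this hunk:

    import numpy as np
    from sklearn.preprocessing import LabelBinarizer

    labels = np.array([0, 2, 1, 2])
    lb = LabelBinarizer()
    one_hot = lb.fit_transform(labels)         # shape (4, 3), one column per class
    recovered = lb.inverse_transform(one_hot)  # array([0, 2, 1, 2])
    assert (recovered == labels).all()         # the mapping lives in lb, not in the data file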
@@ -143,6 +203,10 @@ class Dataset(object):
        Feature scaling normalization.
+
+        Note: Beware, the information on whether or not the data has been normalized
+        is not stored in the data file. Calling normalize on already normalized data
+        has no effect, though.

        :return:
        """
        logger.info("Apply normalization to data from dataset {}.".format(self.s_name))
@@ -158,6 +222,14 @@ class Dataset(object):
            setattr(self, kw, LabeledData(data, datlab.labels))

    def data_astype(self, _type):
+        """
+        Change the data type to _type.
+
+        Note: Beware, the information on the type of the data is not stored in the data file.
+
+        :param _type:
+        :return:
+        """
        logger.info("Change type of data to {} in the dataset {}.".format(str(_type), self.s_name))
        for kw in self.data_groups_private:
            datlab = getattr(self, kw)
@@ -171,6 +243,14 @@ class Dataset(object):
            setattr(self, kw, LabeledData(data, datlab.labels))

    def labels_astype(self, _type):
+        """
+        Change the labels type to _type.
+
+        Note: Beware, the information on the type of the labels is not stored in the data file.
+
+        :param _type:
+        :return:
+        """
        logger.info("Change type of labels to {} in the dataset {}.".format(str(_type), self.s_name))
        for kw in self.data_groups_private:
            datlab = getattr(self, kw)
@@ -208,6 +288,13 @@ class Dataset(object):
            raise Exception("No data loaded at the end of load method.")

    def save_npz(self, npzdir_path=None):
+        """
+        Save data and labels in their current state to the npz directory path.
+
+        :param npzdir_path:
+        :return:
+        """
+        # todo: find a solution for storing the current state?
        if npzdir_path is None:
            npzdir_path = os.path.join(self.s_download_dir, "npzfiles")
        for kw in self.data_groups_private:
@@ -219,6 +306,12 @@ class Dataset(object):
            np.savez(filepath, **dict_attr)

    def load_npz(self, npzdir_path=None):
+        """
+        Load data and labels from the npz directory path.
+
+        :param npzdir_path:
+        :return:
+        """
        if npzdir_path is None:
            npzdir_path = os.path.join(self.s_download_dir, "npzfiles")
        for kw in self.data_groups_private:
...
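A minimal sketch of the save/load round trip behind save_npz and load_npz, assuming one .npz file per data group holding data and labels arrays (the key names and path are assumptions; the relevant code is elided above):

    import numpy as np

    data = np.random.rand(5, 3)
    labels = np.arange(5)
    np.savez("/tmp/train.npz", data=data, labels=labels)  # hypothetical path

    loaded = np.load("/tmp/train.npz")
    assert (loaded["data"] == data).all() and (loaded["labels"] == labels).all()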
-import tensorflow as tf
import numpy as np
+import tensorflow as tf

from skluc.main.data.transformation.Transformer import Transformer
from skluc.main.utils import logger, Singleton


class ResizeTransformer(Transformer, metaclass=Singleton):
+    # todo: write a fit method?
    def __init__(self, data_name, output_shape):
        if len(output_shape) != 2:
            raise AssertionError("Output shape should be 2D and it is {}D: {}".format(len(output_shape), output_shape))
...
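The transform itself is elided; as a rough sketch, the resize such a transformer wraps could be expressed with TensorFlow's image resize op (tf.image.resize, TF 2.x API; this is an assumption, not the class's actual body):

    import numpy as np
    import tensorflow as tf

    images = np.random.rand(4, 32, 32, 3).astype(np.float32)  # a batch of 32x32 RGB images
    output_shape = (28, 28)                                    # must be 2D, as the assertion above enforces
    resized = tf.image.resize(images, output_shape)            # -> shape (4, 28, 28, 3)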
import os
import unittest

from skluc.main.data.mldatasets import Cifar10Dataset
from skluc.main.utils import silentremove


class TestCifar10Dataset(unittest.TestCase):
    def setUp(self):
        self.cifar10_names = [
            'batches.meta',
            'data_batch_1',
            'data_batch_2',
            'data_batch_3',
            'data_batch_4',
            'data_batch_5',
            'readme.html',
            'test_batch'
        ]
        self.download_dir = "/tmp"
        self.cifar10_dirname = os.path.join("cifar10", "cifar-10-batches-py")
        self.datadir_name = os.path.join(self.download_dir, self.cifar10_dirname)
        self.full_cifar10_names = [*map(lambda name: os.path.join(self.datadir_name, name), self.cifar10_names)]

    def test_cifar10(self):
        cifar10 = Cifar10Dataset(s_download_dir=self.download_dir)
        cifar10_data = cifar10.load()
        for name in self.full_cifar10_names:
            self.assertTrue(os.path.exists(name))
        self.assertTrue(all(kw in cifar10_data.keys() for kw in ["train", "test"]))

    def tearDown(self):
        for name in self.full_cifar10_names:
            silentremove(name)


if __name__ == "__main__":
    unittest.main()
-import os
import unittest

+import numpy as np

-from skluc.main.data.mldatasets.Cifar10Dataset import Cifar10Dataset
-from skluc.main.data.mldatasets.MnistDataset import MnistDataset
-from skluc.main.data.mldatasets.MovieReviewDataset import MovieReviewV1Dataset
-from skluc.main.data.mldatasets.SVHNDataset import SVHNDataset
-from skluc.main.utils import silentremove
+from skluc.main.data.mldatasets.Dataset import Dataset
+from skluc.main.utils import LabeledData
-class TestMnistDataset(unittest.TestCase):
-    def setUp(self):
-        self.mnist_names = [
-            "train-images-idx3-ubyte.gz",
-            "train-labels-idx1-ubyte.gz",
-            "t10k-images-idx3-ubyte.gz",
-            "t10k-labels-idx1-ubyte.gz"
-        ]
-        self.download_dir = "/tmp"
-        self.mnist_dirname = "mnist"
-        self.datadir_name = os.path.join(self.download_dir, self.mnist_dirname)
-        self.full_mnist_names = [*map(lambda name: os.path.join(self.datadir_name, name), self.mnist_names)]
-
-    def test_mnist(self):
-        mnist = MnistDataset(s_download_dir=self.download_dir)
-        mnist_data = mnist.load()
-        for name in self.full_mnist_names:
-            self.assertTrue(os.path.exists(name))
-        self.assertTrue(kw in mnist_data.keys() for kw in ["train", "test"])
-        mnist_image = mnist.to_image()
-        self.assertTrue(mnist_image["train"][0][0].shape == (28, 28, 1))
-
-    def tearDown(self):
-        for name in self.full_mnist_names:
-            silentremove(name)
-
-class TestCifar10Dataset(unittest.TestCase):
-    def setUp(self):
-        self.mnist_names = [
-            'batches.meta',
-            'data_batch_1',
-            'data_batch_2',
-            'data_batch_3',
-            'data_batch_4',
-            'data_batch_5',
-            'readme.html',
-            'test_batch'
-        ]
-        self.download_dir = "/tmp"
-        self.cifar10_dirname = os.path.join("cifar10", "cifar-10-batches-py")
-        self.datadir_name = os.path.join(self.download_dir, self.cifar10_dirname)
-        self.full_cifar10_names = [*map(lambda name: os.path.join(self.datadir_name, name), self.mnist_names)]
-
-    def test_cifar10(self):
-        cifar10 = Cifar10Dataset(s_download_dir=self.download_dir)
-        cifar10_data = cifar10.load()
-        for name in self.full_cifar10_names:
-            self.assertTrue(os.path.exists(name))
-        self.assertTrue(kw in cifar10_data.keys() for kw in ["train", "test"])
-
-    def tearDown(self):
-        for name in self.full_cifar10_names:
-            silentremove(name)
+class FooDataset(Dataset):
+    """Synthetic dataset used to test the Dataset base class without downloading anything."""
+
+    def __init__(self, validation_size=1000, seed=0):
+        super().__init__([], "foo", validation_size=validation_size, seed=seed)
+
+    def read(self):
+        train_size = 9000
+        test_size = 1000
+        np.random.seed(self.seed)
+        self._train = LabeledData(data=np.random.rand(train_size, 200),
+                                  labels=np.random.choice(10, size=train_size, replace=True))
+        self._test = LabeledData(data=np.random.rand(test_size, 200),
+                                 labels=np.random.choice(10, size=test_size, replace=True))
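FooDataset makes the base-class tests below fast and deterministic: read() draws random data instead of downloading. A quick sketch of how it plugs into the new API (assuming Dataset.load() calls read() and then performs the train/validation split, which is what the tests rely on):

    d = FooDataset(validation_size=1000, seed=0)
    d.load()                                           # no download: read() generates the data
    idx = d.get_uniform_class_rand_indices_train(100)  # 10 indices for each of the 10 classes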
class TestDataset(unittest.TestCase):
    def setUp(self):
-        self.dataset_classes = [MnistDataset,
-                                Cifar10Dataset,
-                                SVHNDataset,
-                                MovieReviewV1Dataset]
+        self.dataset_classes = [FooDataset]

    def test_seed_train_val(self):
        for d_class in self.dataset_classes:
@@ -79,14 +32,15 @@ class TestDataset(unittest.TestCase):
            d1.load()
            d2.load()
            d3.load()
-            self.assertTrue((d1.train.data[:10] == d2.train.data[:10]).all(),
+            self.assertTrue((d1.train.data == d2.train.data).all(),
                            msg="Same seed gives different train/val split in {} dataset".format(d_class.__name__))
-            self.assertTrue((d1.train.data[:10] != d3.train.data[:10]).any(),
+            self.assertTrue((d1.train.data != d3.train.data).any(),
                            msg="Different seeds give same train/val split in {} dataset".format(d_class.__name__))
    def test_seed_uniform_rand_indices_train(self):
-        size = 10
-        size2 = 12
+        size = 100
+        size_bis = 102
+        size2 = 120
        for d_class in self.dataset_classes:
            # test same subsample size
            # ------------------------
@@ -97,11 +51,41 @@ class TestDataset(unittest.TestCase):
            d2.load()
            d3.load()
            sub_indexes1 = d1.get_uniform_class_rand_indices_train(size)
+            sub_indexes1_not_modulo = d1.get_uniform_class_rand_indices_train(size_bis)
            sub_indexes2 = d2.get_uniform_class_rand_indices_train(size)
-            sub_indexes3 = d2.get_uniform_class_rand_indices_train(size)
+            sub_indexes3 = d3.get_uniform_class_rand_indices_train(size)
+            # the returned number of indexes is equal to the number asked for
+            self.assertEqual(len(sub_indexes1), size)
+            self.assertEqual(len(sub_indexes1_not_modulo), size_bis)
+            self.assertEqual(len(sub_indexes2), size)
+            self.assertEqual(len(sub_indexes3), size)
            subsample1 = d1.train.data[sub_indexes1]
            subsample2 = d2.train.data[sub_indexes2]
            subsample3 = d3.train.data[sub_indexes3]
+            # check that the labels are uniformly distributed in the subsample
+            unique_labels = np.unique(d1.train.labels, axis=0)
+            labels = d1.train.labels[sub_indexes1]
+            labels_bis = d1.train.labels[sub_indexes1_not_modulo]
+            for u_lab in unique_labels:
+                if len(labels.shape) == 1:
+                    bool_idx_labs = labels == u_lab
+                    bool_idx_labs_bis = labels_bis == u_lab
+                elif len(labels.shape) == 2:
+                    bool_idx_labs = np.all(labels == u_lab, axis=-1)
+                    bool_idx_labs_bis = np.all(labels_bis == u_lab, axis=-1)
+                else:
+                    raise ValueError("This test only handles scalar- and vector-valued labels")
+                idx_labs = np.where(bool_idx_labs)[0]
+                idx_labs_bis = np.where(bool_idx_labs_bis)[0]
+                # each label must appear size // nb_labels or size // nb_labels + 1 times
+                self.assertTrue(len(idx_labs) == size // len(unique_labels)
+                                or len(idx_labs) == size // len(unique_labels) + 1)
+                self.assertTrue(len(idx_labs_bis) == size_bis // len(unique_labels)
+                                or len(idx_labs_bis) == size_bis // len(unique_labels) + 1)
            self.assertTrue((subsample1 == subsample2).all(),
                            msg="Same seed gives different subsamples in {} dataset".format(d_class.__name__))
            self.assertTrue((subsample1 != subsample3).any(),
@@ -109,14 +93,17 @@ class TestDataset(unittest.TestCase):

            # test different subsample size
            # -----------------------------
-            d1 = d_class(validation_size=1000, seed=0)
-            d2 = d_class(validation_size=1000, seed=0)
+            d1 = d_class(validation_size=1000, seed=1)
+            d2 = d_class(validation_size=1000, seed=1)
            d1.load()
            d2.load()
            sub_indexes1 = d1.get_uniform_class_rand_indices_train(size)
            sub_indexes2 = d2.get_uniform_class_rand_indices_train(size2)
+            self.assertEqual(len(sub_indexes1), size)
+            self.assertEqual(len(sub_indexes2), size2)
            subsample1 = d1.train.data[sub_indexes1]
            subsample2 = d2.train.data[sub_indexes2]
            for subs1 in subsample1:
                found = False
                for subs2 in subsample2:
@@ -175,3 +162,7 @@ class TestDataset(unittest.TestCase):
                self.assertTrue(found,
                                msg="Little subsample is not a subset of big subsample using same seed in {} dataset".format(
                                    d_class.__name__))
+
+
+if __name__ == "__main__":
+    unittest.main()