diff --git a/skluc/main/data/mldatasets/Dataset.py b/skluc/main/data/mldatasets/Dataset.py
index 58a5d76528db1841175529e9f402059fc67a19b4..807771c2c4d572907eec798608b96bfe449b11a6 100644
--- a/skluc/main/data/mldatasets/Dataset.py
+++ b/skluc/main/data/mldatasets/Dataset.py
@@ -1,7 +1,6 @@
 import os
 
 import numpy as np
-from sklearn.model_selection import StratifiedShuffleSplit
 from sklearn.preprocessing import LabelBinarizer
 
 from skluc.main.utils import LabeledData
@@ -23,6 +22,7 @@ class Dataset(object):
         for url in self.l_url:
             splitted_url = url.split("/")
             self.l_filenames.append(splitted_url[-1])
+
         self.s_name = s_name
         self.s_download_dir = os.path.join(s_download_dir, self.s_name)
         self.l_filepaths = [os.path.join(self.s_download_dir, fname) for fname in self.l_filenames]
@@ -39,25 +39,82 @@ class Dataset(object):
         kept_indices = self.get_uniform_class_rand_indices_train(new_size)
         self.permuted_index_train = self.permuted_index_train[kept_indices]
 
-    def get_uniform_class_rand_indices_train(self, size):
-        try:
-            sss = StratifiedShuffleSplit(n_splits=1, train_size=size, random_state=self.seed)
-            kept_indices, _ = sss.split(np.arange(len(self.train.data)), self.train.labels)
-        except ValueError as e:
-            logger.warning("In Dataset.get_uniform_class_rand_indices_train Handled exception: {}".format(str(e)))
-            logger.debug("Use random indexes instead")
-            kept_indices = np.random.permutation(len(self.train.data))[:size]
-        return kept_indices
-
-    def get_uniform_class_rand_indices_validation(self, size):
-        try:
-            sss = StratifiedShuffleSplit(n_splits=1, train_size=size, random_state=self.seed)
-            kept_indices, _ = sss.split(np.arange(len(self.train.data)), self.train.labels)
-        except ValueError as e:
-            logger.warning("In Dataset.get_uniform_class_rand_indices_validation Handled exception: {}".format(str(e)))
-            logger.debug("Use random indexes instead")
-            kept_indices = np.random.permutation(len(self.validation.data))[:size]
-        return kept_indices
+    def get_bool_idx_label(self, find_label, labels):
+        """
+        Return a boolean np.array that is True where ``find_label`` equals the
+        corresponding entry of ``labels``.
+
+        :param find_label: label to look for (scalar, or vector for one hot labels)
+        :param labels: array of labels to search (1D or 2D)
+        :return: boolean np.array with one entry per example
+        """
+
+        if len(labels.shape) == 1:
+            bool_idx_labs = labels == find_label
+        elif len(labels.shape) == 2:
+            bool_idx_labs = np.all(labels == find_label, axis=-1)
+        else:
+            raise ValueError(
+                "Function get_uniform_class_rand_indices_train has not been intended for others than scalar and vector valued labels")
+        return bool_idx_labs
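+
+    # A minimal illustration of get_bool_idx_label (values are assumptions,
+    # covering the scalar and one-hot cases handled above):
+    #   labels = np.array([0, 1, 0, 2])
+    #   self.get_bool_idx_label(0, labels)            # -> [True, False, True, False]
+    #   one_hot = np.eye(3)[[0, 1, 0]]
+    #   self.get_bool_idx_label(one_hot[0], one_hot)  # -> [True, False, True]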
+
+    def get_uniform_class_rand_indices(self, labels, size, shuffle=True):
+        """size is the final size not the size / class"""
+        logger.debug("Start finding subset indices of size {} with uniform distribution of labels".format(size))
+        unique_labels = np.unique(labels, axis=0)
+        nb_labels = len(unique_labels)
+        nbr_by_label = size // nb_labels
+        logger.debug("Need {} (+/- 1) example by label".format(nbr_by_label))
+        return_idx_labels = []
+        copy_idx = 0
+
+        for u_lab in unique_labels:
+            bool_idx_labs = self.get_bool_idx_label(u_lab, labels)
+            get_nbr = nbr_by_label
+            if len(np.where(bool_idx_labs)[0]) < get_nbr:
+                raise IndexError(
+                    "Found only {} examples with label {} when {} were requested".format(
+                        len(np.where(bool_idx_labs)[0]), u_lab, get_nbr))
+            idx_labs = np.where(bool_idx_labs)[0][:get_nbr]
+            logger.debug("Found indexes for label {}: {}; length: {}".format(u_lab, idx_labs, len(idx_labs)))
+            return_idx_labels.extend(idx_labs)
+            copy_idx += get_nbr
+
+        remaining = size - copy_idx
+        if remaining > 0:
+            logger.debug("After finding equal number ({}) for example by labels (total = {}), "
+                         "need to find more examples to reach size {}".format(nbr_by_label, nbr_by_label * nb_labels,
+                                                                              size))
+            # seed the global RNG so the extra picks below are reproducible for
+            # a given dataset seed (the final shuffle re-seeds in any case)
+            np.random.seed(self.seed)
+            get_nbr = 1
+            while remaining > 0:
+                u_lab_idx = np.random.choice(len(unique_labels), 1)[0]
+                u_lab = unique_labels[u_lab_idx]
+                bool_idx_labs = self.get_bool_idx_label(u_lab, labels)
+
+                all_idx_labs = np.where(bool_idx_labs)[0]
+                idx_labs_not_already_gotten = np.setdiff1d(all_idx_labs, return_idx_labels)
+                if len(idx_labs_not_already_gotten) < get_nbr:
+                    raise IndexError(
+                        "Found only {} not-yet-selected examples with label {} when {} were requested".format(
+                            len(idx_labs_not_already_gotten), u_lab, get_nbr))
+                idx_labs = idx_labs_not_already_gotten[:get_nbr]
+                return_idx_labels.extend(idx_labs)
+
+                remaining -= 1
+
+        if shuffle:
+            np.random.seed(self.seed)
+            np.random.shuffle(return_idx_labels)
+
+        return np.array(return_idx_labels)
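+
+    # Usage sketch (hypothetical call on a loaded 10-class dataset instance):
+    #   idx = self.get_uniform_class_rand_indices(self.train.labels, size=100)
+    #   -> 100 indices, 10 per class, shuffled deterministically via self.seed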
+
+    def get_uniform_class_rand_indices_train(self, size, shuffle=True):
+        return self.get_uniform_class_rand_indices(labels=self.train.labels, size=size, shuffle=shuffle)
+
+    def get_uniform_class_rand_indices_validation(self, size, shuffle=True):
+        return self.get_uniform_class_rand_indices(labels=self.validation.labels, size=size, shuffle=shuffle)
 
     @property
     def train(self):
@@ -80,7 +137,7 @@ class Dataset(object):
 
         :return: None
         """
-        self.create_directory_tree()
+        create_directory(self.s_download_dir)
         if not check_files(self.l_filepaths):
             logger.debug("Files need to be downloaded")
             for s_fname in self.l_filepaths:
@@ -92,14 +149,6 @@ class Dataset(object):
         else:
             logger.debug("Files {} already exist".format(self.l_filepaths))
 
-    def create_directory_tree(self):
-        """
-        Create the target directory tree
-
-        :return: None
-        """
-        create_directory(self.s_download_dir)
-
     def _check_validation_size(self, data_length):
         if self.validation_size > data_length:
             raise ValueError("The validation set size ({}) is higher than the train set size ({}). " \
@@ -108,7 +157,10 @@ class Dataset(object):
 
     def to_one_hot(self):
         """
-        Convert categorical labels to one hot encoding
+        Convert categorical labels to one hot encoding.
+
+        Note: Beware, the information on how the labels are encoded is not stored
+        in the data file.
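+
+        A minimal illustration with sklearn's LabelBinarizer (imported by this
+        module and assumed here to back the encoding):
+
+        >>> from sklearn.preprocessing import LabelBinarizer
+        >>> LabelBinarizer().fit_transform([0, 1, 2])
+        array([[1, 0, 0],
+               [0, 1, 0],
+               [0, 0, 1]])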
 
         :return:
         """
@@ -126,6 +178,14 @@ class Dataset(object):
             setattr(self, kw, LabeledData(data, labels))
 
     def revert_one_hot(self):
+        """
+        Convert one hot encoded labels to categorical labels.
+
+        Note: Beware, the information on how the labels are encoded is not stored
+        in the data file.
+
+        :return:
+        """
         logger.info("Revert one hot encoding to dataset {}.".format(self.s_name))
         for kw in self.data_groups_private:
             datlab = getattr(self, kw)
@@ -143,6 +203,10 @@ class Dataset(object):
 
         Feature scaling normalization.
 
+        Note: Beware, the information on whether or not the data has been normalized
+        is not stored in the data file. Calling normalize on already normalized
+        data has no further effect, though.
+
         :return:
         """
         logger.info("Apply normalization to data from dataset {}.".format(self.s_name))
@@ -158,6 +222,14 @@ class Dataset(object):
             setattr(self, kw, LabeledData(data, datlab.labels))
 
     def data_astype(self, _type):
+        """
+        Change data type to _type.
+
+        Note: Beware, the information on the type of the data is not stored in the data file.
+
+        :param _type: target numpy dtype for the data
+        :return:
+        """
         logger.info("Change type of data to {} in the dataset {}.".format(str(_type), self.s_name))
         for kw in self.data_groups_private:
             datlab = getattr(self, kw)
@@ -171,6 +243,14 @@ class Dataset(object):
             setattr(self, kw, LabeledData(data, datlab.labels))
 
     def labels_astype(self, _type):
+        """
+        Change labels type to _type.
+
+        Note: Beware, the information on the type of the labels is not stored in the data file.
+
+        :param _type: target numpy dtype for the labels
+        :return:
+        """
         logger.info("Change type of labels to {} in the dataset {}.".format(str(_type), self.s_name))
         for kw in self.data_groups_private:
             datlab = getattr(self, kw)
@@ -208,6 +288,13 @@ class Dataset(object):
             raise Exception("No data loaded at the end of load method.")
 
     def save_npz(self, npzdir_path=None):
+        """
+        Save data and labels, in their current state, to the npz directory path.
+
+        :param npzdir_path: directory where the npz files are written
+        :return:
+        """
+        # todo: find a solution to store the current state?
         if npzdir_path is None:
             npzdir_path = os.path.join(self.s_download_dir, "npzfiles")
         for kw in self.data_groups_private:
@@ -219,6 +306,12 @@ class Dataset(object):
             np.savez(filepath, **dict_attr)
 
     def load_npz(self, npzdir_path=None):
+        """
+        Load data and labels from the npz directory path.
+
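+        A usage sketch (``dataset`` stands for any loaded Dataset instance; the
+        default directory ``<s_download_dir>/npzfiles`` is shared with save_npz):
+
+        >>> dataset.save_npz()  # write current data/labels as npz files
+        >>> dataset.load_npz()  # restore them in the same state
+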
+        :param npzdir_path: directory where the npz files are read from
+        :return:
+        """
         if npzdir_path is None:
             npzdir_path = os.path.join(self.s_download_dir, "npzfiles")
         for kw in self.data_groups_private:
diff --git a/skluc/main/data/transformation/ResizeTransformer.py b/skluc/main/data/transformation/ResizeTransformer.py
index 2f5161fdfdb904bd0e91b5e84088360253fcffe4..c8b03f566c74a32aa1207a92f4fbcefc044fbf39 100644
--- a/skluc/main/data/transformation/ResizeTransformer.py
+++ b/skluc/main/data/transformation/ResizeTransformer.py
@@ -1,11 +1,12 @@
-import tensorflow as tf
 import numpy as np
+import tensorflow as tf
 
 from skluc.main.data.transformation.Transformer import Transformer
 from skluc.main.utils import logger, Singleton
 
 
 class ResizeTransformer(Transformer, metaclass=Singleton):
+    # todo: add a fit method?
     def __init__(self, data_name, output_shape):
         if len(output_shape) != 2:
             raise AssertionError("Output shape should be 2D and it is {}D: {}".format(len(output_shape), output_shape))
diff --git a/skluc/test/test_data/test_mldatasets/TestCifar10Dataset.py b/skluc/test/test_data/test_mldatasets/TestCifar10Dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e4773d48f6f9b0bec4a9a1ff23c0108f32139a7
--- /dev/null
+++ b/skluc/test/test_data/test_mldatasets/TestCifar10Dataset.py
@@ -0,0 +1,38 @@
+import os
+import unittest
+
+from skluc.main.data.mldatasets import Cifar10Dataset
+from skluc.main.utils import silentremove
+
+
+class TestCifar10Dataset(unittest.TestCase):
+    def setUp(self):
+        self.cifar10_names = [
+            'batches.meta',
+            'data_batch_1',
+            'data_batch_2',
+            'data_batch_3',
+            'data_batch_4',
+            'data_batch_5',
+            'readme.html',
+            'test_batch'
+        ]
+        self.download_dir = "/tmp"
+        self.cifar10_dirname = os.path.join("cifar10", "cifar-10-batches-py")
+        self.datadir_name = os.path.join(self.download_dir, self.cifar10_dirname)
+        self.full_cifar10_names = [*map(lambda name: os.path.join(self.datadir_name, name), self.cifar10_names)]
+
+    def test_cifar10(self):
+        cifar10 = Cifar10Dataset(s_download_dir=self.download_dir)
+        cifar10_data = cifar10.load()
+        for name in self.full_cifar10_names:
+            self.assertTrue(os.path.exists(name))
+        self.assertTrue(all(kw in cifar10_data.keys() for kw in ["train", "test"]))
+
+    def tearDown(self):
+        for name in self.full_cifar10_names:
+            silentremove(name)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/skluc/test/test_data/test_mldatasets.py b/skluc/test/test_data/test_mldatasets/TestDataset.py
similarity index 61%
rename from skluc/test/test_data/test_mldatasets.py
rename to skluc/test/test_data/test_mldatasets/TestDataset.py
index 3cb6abca99f79c6f601033b1efacc1013d040e32..7873200df8ecee4dad6066c0362bb82437844560 100644
--- a/skluc/test/test_data/test_mldatasets.py
+++ b/skluc/test/test_data/test_mldatasets/TestDataset.py
@@ -1,75 +1,28 @@
-import os
 import unittest
 
-from skluc.main.data.mldatasets.Cifar10Dataset import Cifar10Dataset
-from skluc.main.data.mldatasets.MnistDataset import MnistDataset
-from skluc.main.data.mldatasets.MovieReviewDataset import MovieReviewV1Dataset
-from skluc.main.data.mldatasets.SVHNDataset import SVHNDataset
-from skluc.main.utils import silentremove
+import numpy as np
 
+from skluc.main.data.mldatasets.Dataset import Dataset
+from skluc.main.utils import LabeledData
 
-class TestMnistDataset(unittest.TestCase):
-    def setUp(self):
-        self.mnist_names = [
-            "train-images-idx3-ubyte.gz",
-            "train-labels-idx1-ubyte.gz",
-            "t10k-images-idx3-ubyte.gz",
-            "t10k-labels-idx1-ubyte.gz"
-        ]
-        self.download_dir = "/tmp"
-        self.mnist_dirname = "mnist"
-        self.datadir_name = os.path.join(self.download_dir, self.mnist_dirname)
-        self.full_mnist_names = [*map(lambda name: os.path.join(self.datadir_name, name), self.mnist_names)]
-
-    def test_mnist(self):
-        mnist = MnistDataset(s_download_dir=self.download_dir)
-        mnist_data = mnist.load()
-        for name in self.full_mnist_names:
-            self.assertTrue(os.path.exists(name))
-        self.assertTrue(kw in mnist_data.keys() for kw in ["train", "test"])
-        mnist_image = mnist.to_image()
-        self.assertTrue(mnist_image["train"][0][0].shape == (28, 28,1))
-
-    def tearDown(self):
-        for name in self.full_mnist_names:
-            silentremove(name)
-
-
-class TestCifar10Dataset(unittest.TestCase):
-    def setUp(self):
-        self.mnist_names = [
-            'batches.meta',
-            'data_batch_1',
-            'data_batch_2',
-            'data_batch_3',
-            'data_batch_4',
-            'data_batch_5',
-            'readme.html',
-            'test_batch'
-        ]
-        self.download_dir = "/tmp"
-        self.cifar10_dirname = os.path.join("cifar10", "cifar-10-batches-py")
-        self.datadir_name = os.path.join(self.download_dir, self.cifar10_dirname)
-        self.full_cifar10_names = [*map(lambda name: os.path.join(self.datadir_name, name), self.mnist_names)]
-
-    def test_cifar10(self):
-        cifar10 = Cifar10Dataset(s_download_dir=self.download_dir)
-        cifar10_data = cifar10.load()
-        for name in self.full_cifar10_names:
-            self.assertTrue(os.path.exists(name))
-        self.assertTrue(kw in cifar10_data.keys() for kw in ["train", "test"])
-
-    def tearDown(self):
-        for name in self.full_cifar10_names:
-            silentremove(name)
+
+class FooDataset(Dataset):
+    """Synthetic, in-memory dataset used to test Dataset's generic logic without any download."""
+
+    def __init__(self, validation_size=1000, seed=0):
+        super().__init__([], "foo", validation_size=validation_size, seed=seed)
+
+    def read(self):
+        train_size = 9000
+        test_size = 1000
+        np.random.seed(self.seed)
+        self._train = LabeledData(data=np.random.rand(train_size, 200),
+                                  labels=np.random.choice(10, size=train_size, replace=True))
+        self._test = LabeledData(data=np.random.rand(test_size, 200),
+                                 labels=np.random.choice(10, size=test_size, replace=True))
 
 
 class TestDataset(unittest.TestCase):
     def setUp(self):
-        self.dataset_classes = [MnistDataset,
-                                Cifar10Dataset,
-                                SVHNDataset,
-                                MovieReviewV1Dataset]
+        self.dataset_classes = [FooDataset]
 
     def test_seed_train_val(self):
         for d_class in self.dataset_classes:
@@ -79,14 +32,15 @@ class TestDataset(unittest.TestCase):
             d1.load()
             d2.load()
             d3.load()
-            self.assertTrue((d1.train.data[:10] == d2.train.data[:10]).all(),
+            self.assertTrue((d1.train.data == d2.train.data).all(),
                             msg="Same seed gives different train/val split in {} dataset".format(d_class.__name__))
-            self.assertTrue((d1.train.data[:10] != d3.train.data[:10]).any(),
+            self.assertTrue((d1.train.data != d3.train.data).any(),
                             msg="Different seeds give same train/val split in {} dataset".format(d_class.__name__))
 
     def test_seed_uniform_rand_indices_train(self):
-        size = 10
-        size2 = 12
+        size = 100
+        size_bis = 102
+        size2 = 120
         for d_class in self.dataset_classes:
             # test same subsample size
             # ------------------------
@@ -97,11 +51,41 @@ class TestDataset(unittest.TestCase):
             d2.load()
             d3.load()
             sub_indexes1 = d1.get_uniform_class_rand_indices_train(size)
+            sub_indexes1_not_modulo = d1.get_uniform_class_rand_indices_train(size_bis)
             sub_indexes2 = d2.get_uniform_class_rand_indices_train(size)
-            sub_indexes3 = d2.get_uniform_class_rand_indices_train(size)
+            sub_indexes3 = d3.get_uniform_class_rand_indices_train(size)
+            # the number of returned indices is actually equal to the requested size
+            self.assertEqual(len(sub_indexes1), size)
+            self.assertEqual(len(sub_indexes1_not_modulo), size_bis)
+            self.assertEqual(len(sub_indexes2), size)
+            self.assertEqual(len(sub_indexes3), size)
+
             subsample1 = d1.train.data[sub_indexes1]
             subsample2 = d2.train.data[sub_indexes2]
             subsample3 = d3.train.data[sub_indexes3]
+
+            unique_labels = np.unique(d1.train.labels, axis=0)
+            labels = d1.train.labels[sub_indexes1]
+            labels_bis = d1.train.labels[sub_indexes1_not_modulo]
+            for u_lab in unique_labels:
+                if len(labels.shape) == 1:
+                    bool_idx_labs = labels == u_lab
+                    bool_idx_labs_bis = labels_bis == u_lab
+
+                elif len(labels.shape) == 2:
+                    bool_idx_labs = np.all(labels == u_lab, axis=-1)
+                    bool_idx_labs_bis = np.all(labels_bis == u_lab, axis=-1)
+                else:
+                    raise ValueError(
+                        "Function get_uniform_class_rand_indices_train has not been intended for others than scalar and vector valued labels")
+
+                idx_labs = np.where(bool_idx_labs)[0]
+                idx_labs_bis = np.where(bool_idx_labs_bis)[0]
+                nbr_by_label = size // len(unique_labels)
+                self.assertIn(len(idx_labs), (nbr_by_label, nbr_by_label + 1))
+                self.assertIn(len(idx_labs_bis), (nbr_by_label, nbr_by_label + 1))
+
             self.assertTrue((subsample1 == subsample2).all(),
                             msg="Same seed gives different subsamples in {} dataset".format(d_class.__name__))
             self.assertTrue((subsample1 != subsample3).any(),
@@ -109,14 +93,17 @@ class TestDataset(unittest.TestCase):
 
             # test different subsample size
             # -----------------------------
-            d1 = d_class(validation_size=1000, seed=0)
-            d2 = d_class(validation_size=1000, seed=0)
+            d1 = d_class(validation_size=1000, seed=1)
+            d2 = d_class(validation_size=1000, seed=1)
             d1.load()
             d2.load()
             sub_indexes1 = d1.get_uniform_class_rand_indices_train(size)
             sub_indexes2 = d2.get_uniform_class_rand_indices_train(size2)
+            self.assertEqual(len(sub_indexes1), size)
+            self.assertEqual(len(sub_indexes2), size2)
             subsample1 = d1.train.data[sub_indexes1]
             subsample2 = d2.train.data[sub_indexes2]
+
             for subs1 in subsample1:
                 found = False
                 for subs2 in subsample2:
@@ -175,3 +162,7 @@ class TestDataset(unittest.TestCase):
             self.assertTrue(found,
                             msg="Little subsample is not a subset of big subsample using same seed in {} dataset".format(
                                 d_class.__name__))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/skluc/test/test_data/test_mldatasets/__init__.py b/skluc/test/test_data/test_mldatasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391