From c88d0b49243134e429a1cc058b7424055059587a Mon Sep 17 00:00:00 2001
From: Luc Giffon <luc.giffon@lis-lab.fr>
Date: Tue, 9 Oct 2018 08:13:47 +0200
Subject: [PATCH] move unused scripts to sandbox + update datasets + create test files (still empty for some)

---
 skluc/examples/fc_nn_timed.py                 |  30 ---
 skluc/examples/nystroem_svm_classifier.py     | 140 -----------
 skluc/examples/so_conv_net.py                 |  33 ---
 .../tasks/classification/cifar10/__init__.py  |   0
 .../classification/cifar10}/fc_cnn_cifar.py   |   6 +-
 .../fc_dense_cifar_preprocessing_vgg19.py     |  19 +-
 .../tasks/classification/mnist/__init__.py    |   0
 .../mnist}/deepfriedConvnetMnist.py           |   6 +-
 .../classification/mnist}/fc_cnn_mnist.py     |   4 +-
 .../mnist}/keras_fc_cnn_mnist.py              |   2 +-
 .../scratch/omniglot_snell_28x28_vinyals.py   | 114 +++++++++
 .../tasks/classification/svhn/__init__.py     |   0
 .../fc_dense_svhn_preprocessing_vgg19.py      |  18 +-
 .../svhn}/keras_fc_preprocess_svhn.py         |   7 +-
 skluc/examples/tf_profiling.py                |  27 ---
 skluc/examples/tfrecord_nn.py                 | 217 ------------------
 skluc/examples/time_batch_subsample_ops.py    |  86 -------
 skluc/examples/write_read_tfrecords.py        |  77 -------
 skluc/main/data/mldatasets/Dataset.py         |   4 +-
 skluc/main/data/mldatasets/ImageDataset.py    |   3 +-
 .../data/mldatasets/MovieReviewDataset.py     |  24 +-
 skluc/main/data/mldatasets/OmniglotDataset.py |  39 +++-
 skluc/main/data/mldatasets/RPSDataset.py      | 125 ++++++++++
 .../data/transformation/VinyalsTransformer.py |   6 +-
 .../tCNNTransformer/__init__.py               |  29 +++
 .../test_mldatasets/TestCifar100Dataset.py    |   0
 .../test_mldatasets/TestImageDataset.py       |   0
 .../test_mldatasets/TestSVHNDataset.py        |   0
 28 files changed, 338 insertions(+), 678 deletions(-)
 delete mode 100644 skluc/examples/fc_nn_timed.py
 delete mode 100644 skluc/examples/nystroem_svm_classifier.py
 delete mode 100644 skluc/examples/so_conv_net.py
 create mode 100644 skluc/examples/tasks/classification/cifar10/__init__.py
 rename skluc/examples/{ => tasks/classification/cifar10}/fc_cnn_cifar.py (96%)
 rename skluc/examples/{ => tasks/classification/cifar10}/fc_dense_cifar_preprocessing_vgg19.py (88%)
 create mode 100644 skluc/examples/tasks/classification/mnist/__init__.py
 rename skluc/examples/{ => tasks/classification/mnist}/deepfriedConvnetMnist.py (95%)
 rename skluc/examples/{ => tasks/classification/mnist}/fc_cnn_mnist.py (97%)
 rename skluc/examples/{ => tasks/classification/mnist}/keras_fc_cnn_mnist.py (98%)
 create mode 100644 skluc/examples/tasks/classification/omniglot/scratch/omniglot_snell_28x28_vinyals.py
 create mode 100644 skluc/examples/tasks/classification/svhn/__init__.py
 rename skluc/examples/{ => tasks/classification/svhn}/fc_dense_svhn_preprocessing_vgg19.py (87%)
 rename skluc/examples/{ => tasks/classification/svhn}/keras_fc_preprocess_svhn.py (89%)
 delete mode 100644 skluc/examples/tf_profiling.py
 delete mode 100644 skluc/examples/tfrecord_nn.py
 delete mode 100644 skluc/examples/time_batch_subsample_ops.py
 delete mode 100644 skluc/examples/write_read_tfrecords.py
 rename skluc/{ => main}/data/mldatasets/MovieReviewDataset.py (90%)
 create mode 100644 skluc/main/data/mldatasets/RPSDataset.py
 create mode 100644 skluc/main/data/transformation/tCNNTransformer/__init__.py
 create mode 100644 skluc/test/test_data/test_mldatasets/TestCifar100Dataset.py
 create mode 100644 skluc/test/test_data/test_mldatasets/TestImageDataset.py
 create mode 100644 skluc/test/test_data/test_mldatasets/TestSVHNDataset.py

diff --git a/skluc/examples/fc_nn_timed.py b/skluc/examples/fc_nn_timed.py
deleted file mode 100644
index e4736ba..0000000
---
a/skluc/examples/fc_nn_timed.py +++ /dev/null @@ -1,30 +0,0 @@ -import tensorflow as tf -import numpy as np -import skluc.data.mldatasets as dataset -from skluc.tensorflow_.utils import fully_connected, get_next_batch, tf_op -from skluc.utils import time_fct - -tf.logging.set_verbosity(tf.logging.ERROR) - -# Preparing the dataset ######################### - -mnist = dataset.MnistDataset() -mnist.load() -mnist.normalize() -mnist.data_astype(np.float32) - -X_train, _ = mnist.train - - -################################################# - - -if __name__ == '__main__': - input_dim = X_train.shape[1] - batch_size = 10 - with tf.Graph().as_default(): - x = tf.placeholder(tf.float32, shape=[None, input_dim], name="x") - out_fc = fully_connected(x, 4096*2, act=tf.nn.relu) - X_batch = get_next_batch(X_train, 0, batch_size) - feed_dict = {x: X_batch} - print("%.4fs" % time_fct(lambda: tf_op(feed_dict, [out_fc]))) \ No newline at end of file diff --git a/skluc/examples/nystroem_svm_classifier.py b/skluc/examples/nystroem_svm_classifier.py deleted file mode 100644 index 5583600..0000000 --- a/skluc/examples/nystroem_svm_classifier.py +++ /dev/null @@ -1,140 +0,0 @@ -import numpy as np -from collections import defaultdict -import os -import skluc.data.mldatasets as dataset -from sklearn.kernel_approximation import Nystroem -from sklearn.svm import SVC - -from skluc.data.transformation import VGG19Cifar10Transformer -from skluc.utils import logger, compute_euristic_sigma -import matplotlib.pyplot as plt - -kernel_marker = { - "additive_chi2": "x", - "linear": "o", - "rbf": "d", - "chi2": "h" -} - -# colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] -colors = [ - "#009BE5", - "#004EE0", - "#00DC8F", - "#4400D8", - "#03D400", - "#CC00CF", - "#8BCB00", - "#C70046", - "#C37E00", - "#BF3C00" -] - - -def nystroem_classif(X_train, Y_train, X_test, Y_test, subsample, subsample_label, kernel_fct, kernel_params): - nys = Nystroem(kernel=kernel_fct, kernel_params=kernel_params, n_components=subsample.shape[0]) - logger.debug("Subsample shape (bis): {}".format(subsample.shape)) - nys.fit(subsample) - # X_train_transformed = nys.transform(X_train) - X_test_transformed = nys.transform(X_test) - subsample_transformed = nys.transform(subsample) - f, ax = plt.subplots() - if len(subsample) == 2: - for i in range(len(X_test)): - # if Y_test[i] > 6: - # continue - plt.scatter(X_test_transformed[i][0], X_test_transformed[i][1], c=colors[int(Y_test[i])], marker='.') - if len(kernel_params) == 0: - circle0 = plt.Circle((subsample_transformed[0][0], subsample_transformed[0][1]), 0.2, color='r', fill=False) - circle1 = plt.Circle((subsample_transformed[1][0], subsample_transformed[1][1]), 0.2, color='r', fill=False) - else: - circle0 = plt.Circle((subsample_transformed[0][0], subsample_transformed[0][1]), 0.01, color='r', fill=False) - circle1 = plt.Circle((subsample_transformed[1][0], subsample_transformed[1][1]), 0.01, color='r', fill=False) - ax.add_artist(circle0) - ax.add_artist(circle1) - - plt.legend() - plt.title("{}; {}".format(kernel_fct, str(kernel_params))) - out_dir_path = "/home/luc/PycharmProjects/deepFriedConvnets/main/experiments/graph_drawing/paper/cifar/plt_2D" - out_name = "plot_2D_{}".format(kernel_fct) - out_path = os.path.join(out_dir_path, out_name) - f.savefig(out_path) - plt.show() - # clf = SVC(kernel="linear") - # clf.fit(X_train_transformed, Y_train) - # score = clf.score(X_test_transformed, Y_test) - score = 0.1 - return score - - -if __name__ == "__main__": - # SEED = np.random.randint(0, 100) - SEED = 
10 - VALIDATION_SIZE = 10000 - SUBSAMPLE_SIZES = np.logspace(1, 6, dtype=int, base=2, num=5) - - data = dataset.Cifar10Dataset(validation_size=VALIDATION_SIZE, seed=0) - - data.load() - data.normalize() - data.to_image() - data.apply_transformer(VGG19Cifar10Transformer) - # todo faire convention pour stockage des data - # data.revert_one_hot() - data.data_astype(np.float32) - data.labels_astype(np.float32) - data.flatten() - - X_train, Y_train = data.train - X_test, Y_test = data.test - X_val, Y_val = data.validation - logger.debug("X_train.shape: {} ; Y_train.shape: {}".format(X_train.shape, Y_train.shape)) - logger.debug("X_test.shape: {} ; Y_test.shape: {}".format(X_test.shape, Y_test.shape)) - best_gamma = 0.005425247156552446 - # this value of best_gamma comes from the below command (but it is expensive to compute so I decided to hardcode it) - # best_gamma = 1. / compute_euristic_sigma(X_train) - logger.debug("Best gamma is: {}".format(best_gamma)) - - TESTED_KERNELS = { - "chi2": {"gamma": best_gamma}, - "additive_chi2": {}, - "rbf": {"gamma": best_gamma}, - "linear": {} - } - # results = [] - results_by_kernel = defaultdict(list) - for m_size in SUBSAMPLE_SIZES: - logger.debug("Starting with subsample size == {}".format(m_size)) - np.random.seed(SEED) - dataset_indexes = np.random.permutation(X_train.shape[0]) - X_subsample = X_train[dataset_indexes[:m_size]] - Y_subsample = Y_train[dataset_indexes[:m_size]] - logger.debug("Shape of subsample: {}".format(X_subsample.shape)) - for kernel_name, kernel_params in TESTED_KERNELS.items(): - logger.debug("Starting with kernel {}".format(kernel_name)) - # kernel_fct = kernel_tuple[0] - # kernel_params = kernel_tuple[1] - logger.debug("Kernel params are: {}".format(kernel_params)) - score = nystroem_classif(X_train=X_train, - Y_train=Y_train, - X_test=X_test[:1000], - Y_test=Y_test[:1000], - subsample=X_subsample, - subsample_label=Y_subsample, - kernel_fct=kernel_name, - kernel_params=kernel_params) - logger.debug("Obtained score: {}".format(score)) - results_by_kernel[kernel_name].append(score) - - for kernel_name, scores in results_by_kernel.items(): - plt.scatter(SUBSAMPLE_SIZES, scores, label=kernel_name, marker=kernel_marker[kernel_name]) - - plt.legend() - plt.ylabel("accuracy (%)") - plt.xlabel("log(subsample size)") - plt.xscale("log") - with open("results.txt", "w") as f: - f.write(str(results_by_kernel)) - f.write("\n") - f.write(str(SUBSAMPLE_SIZES)) - plt.savefig('resultat_nystroem_svm_classifer.png', bbox_inches='tight') diff --git a/skluc/examples/so_conv_net.py b/skluc/examples/so_conv_net.py deleted file mode 100644 index 32b905a..0000000 --- a/skluc/examples/so_conv_net.py +++ /dev/null @@ -1,33 +0,0 @@ -import tensorflow as tf -import numpy as np -import skluc.data.mldatasets as dataset -from skluc.data.transformation import VGG19Cifar10BadTransformer - - -def semi_flatten_out_conv(data): - return data.reshape((data.shape[0], -1, data.shape[-1])) - - -if __name__ == '__main__': - val_size = 5000 - cifar10 = dataset.Cifar10Dataset(validation_size=val_size) - cifar10.load() - cifar10.to_image() - cifar10.to_one_hot() - cifar10.normalize() - cifar10.apply_transformer(VGG19Cifar10BadTransformer) - cifar10.normalize() - cifar10.data_astype(np.float32) - cifar10.labels_astype(np.float32) - cifar10.to_feature_vectors() - - X_train, Y_train = cifar10.train # -1 x 2 x 2 x 512 - X_val, Y_val = cifar10.validation - X_test, Y_test = cifar10.test - - - - - - - diff --git a/skluc/examples/tasks/classification/cifar10/__init__.py 
b/skluc/examples/tasks/classification/cifar10/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/skluc/examples/fc_cnn_cifar.py b/skluc/examples/tasks/classification/cifar10/fc_cnn_cifar.py similarity index 96% rename from skluc/examples/fc_cnn_cifar.py rename to skluc/examples/tasks/classification/cifar10/fc_cnn_cifar.py index fc7c755..d9d6896 100644 --- a/skluc/examples/fc_cnn_cifar.py +++ b/skluc/examples/tasks/classification/cifar10/fc_cnn_cifar.py @@ -7,9 +7,9 @@ where the input comes from memory. import tensorflow as tf import numpy as np -import skluc.data.mldatasets as dataset -from skluc.tensorflow_.utils import inference_cifar10, batch_generator -import matplotlib.pyplot as plt +import skluc.main.data.mldatasets as dataset +from skluc.main.tensorflow_.utils import inference_cifar10, batch_generator + tf.logging.set_verbosity(tf.logging.ERROR) import time as t diff --git a/skluc/examples/fc_dense_cifar_preprocessing_vgg19.py b/skluc/examples/tasks/classification/cifar10/fc_dense_cifar_preprocessing_vgg19.py similarity index 88% rename from skluc/examples/fc_dense_cifar_preprocessing_vgg19.py rename to skluc/examples/tasks/classification/cifar10/fc_dense_cifar_preprocessing_vgg19.py index 7956d7e..19eb294 100644 --- a/skluc/examples/fc_dense_cifar_preprocessing_vgg19.py +++ b/skluc/examples/tasks/classification/cifar10/fc_dense_cifar_preprocessing_vgg19.py @@ -1,18 +1,10 @@ -import time as t -import os - import tensorflow as tf import numpy as np -import matplotlib.pyplot as plt -from keras.losses import mean_squared_error -from keras.models import load_model, Model -import skluc.data.mldatasets as dataset -from skluc.data.transformation import VGG19Cifar10Transformer -from skluc.tensorflow_.utils import fully_connected, classification_cifar, batch_generator -from skluc.utils import logger, download_data -import matplotlib.pyplot as plt -from keras.metrics import categorical_accuracy, binary_accuracy +import skluc.main.data.mldatasets as dataset +from skluc.main.data import VGG19Cifar10Transformer +from skluc.main.tensorflow_.utils import fully_connected, classification_cifar, batch_generator +from skluc.main.utils import logger if __name__ == "__main__": VALIDATION_SIZE = 10000 @@ -26,8 +18,9 @@ if __name__ == "__main__": data.load() data.normalize() data.to_image() + data.apply_transformer(VGG19Cifar10Transformer()) data.to_one_hot() - data.apply_transformer(VGG19Cifar10Transformer) + data.normalize() data.flatten() data.data_astype(np.float32) data.labels_astype(np.float32) diff --git a/skluc/examples/tasks/classification/mnist/__init__.py b/skluc/examples/tasks/classification/mnist/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/skluc/examples/deepfriedConvnetMnist.py b/skluc/examples/tasks/classification/mnist/deepfriedConvnetMnist.py similarity index 95% rename from skluc/examples/deepfriedConvnetMnist.py rename to skluc/examples/tasks/classification/mnist/deepfriedConvnetMnist.py index 672c27d..f41fe7b 100644 --- a/skluc/examples/deepfriedConvnetMnist.py +++ b/skluc/examples/tasks/classification/mnist/deepfriedConvnetMnist.py @@ -12,9 +12,9 @@ Zichao Yang, Marcin Moczulski, Misha Denil, Nando de Freitas, Alex Smola, Le Son import tensorflow as tf import numpy as np -import skluc.data.mldatasets as dataset -from skluc.tensorflow_.utils import convolution_mnist, classification_mnist, batch_generator -from skluc.tensorflow_.kernel_approximation import fastfood_layer +import skluc.main.data.mldatasets as dataset +from 
skluc.main.tensorflow_.utils import convolution_mnist, classification_mnist, batch_generator +from skluc.main.tensorflow_.kernel_approximation import fastfood_layer tf.logging.set_verbosity(tf.logging.ERROR) diff --git a/skluc/examples/fc_cnn_mnist.py b/skluc/examples/tasks/classification/mnist/fc_cnn_mnist.py similarity index 97% rename from skluc/examples/fc_cnn_mnist.py rename to skluc/examples/tasks/classification/mnist/fc_cnn_mnist.py index 235fb8a..f42cd96 100644 --- a/skluc/examples/fc_cnn_mnist.py +++ b/skluc/examples/tasks/classification/mnist/fc_cnn_mnist.py @@ -7,8 +7,8 @@ where the input comes from memory. import tensorflow as tf import numpy as np -import skluc.data.mldatasets as dataset -from skluc.tensorflow_.utils import inference_mnist, batch_generator +import skluc.main.data.mldatasets as dataset +from skluc.main.tensorflow_.utils import inference_mnist, batch_generator tf.logging.set_verbosity(tf.logging.ERROR) diff --git a/skluc/examples/keras_fc_cnn_mnist.py b/skluc/examples/tasks/classification/mnist/keras_fc_cnn_mnist.py similarity index 98% rename from skluc/examples/keras_fc_cnn_mnist.py rename to skluc/examples/tasks/classification/mnist/keras_fc_cnn_mnist.py index 1aaf123..9fe75bf 100644 --- a/skluc/examples/keras_fc_cnn_mnist.py +++ b/skluc/examples/tasks/classification/mnist/keras_fc_cnn_mnist.py @@ -5,7 +5,7 @@ from keras.layers import Conv2D, Dense, Flatten, MaxPooling2D, Dropout from keras.callbacks import LearningRateScheduler, TensorBoard from keras.preprocessing.image import ImageDataGenerator from keras.regularizers import l2 -import skluc.data.mldatasets as dataset +import skluc.main.data.mldatasets as dataset batch_size = 128 epochs = 200 diff --git a/skluc/examples/tasks/classification/omniglot/scratch/omniglot_snell_28x28_vinyals.py b/skluc/examples/tasks/classification/omniglot/scratch/omniglot_snell_28x28_vinyals.py new file mode 100644 index 0000000..2f25700 --- /dev/null +++ b/skluc/examples/tasks/classification/omniglot/scratch/omniglot_snell_28x28_vinyals.py @@ -0,0 +1,114 @@ +import time + +import numpy as np +from keras import optimizers +from keras.callbacks import LearningRateScheduler, TensorBoard +from keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Activation +from keras.layers import Dense, Flatten +from keras.models import Sequential +from keras.preprocessing.image import ImageDataGenerator + +import skluc.main.data.mldatasets as dataset +from skluc.main.data.transformation.ResizeTransformer import ResizeTransformer +from skluc.main.utils import logger + + +def scheduler(epoch): + """ + Function to pass to the "LearningrateScheduler" + + :param epoch: + :return: + """ + if epoch < 80: + return 0.1 + if epoch < 160: + return 0.01 + return 0.001 + + +def model_definition(): + model = Sequential() + + model.add( + Conv2D(64, (3, 3), padding='same', kernel_initializer='he_normal', input_shape=input_shape)) + model.add(BatchNormalization()) + model.add(MaxPooling2D((2, 2), strides=(2, 2))) + model.add(Activation("relu")) + + model.add( + Conv2D(64, (3, 3), padding='same', kernel_initializer='he_normal', input_shape=input_shape)) + model.add(BatchNormalization()) + model.add(MaxPooling2D((2, 2), strides=(2, 2))) + model.add(Activation("relu")) + + model.add( + Conv2D(64, (3, 3), padding='same', kernel_initializer='he_normal', input_shape=input_shape)) + model.add(BatchNormalization()) + model.add(MaxPooling2D((2, 2), strides=(2, 2))) + model.add(Activation("relu")) + + model.add( + Conv2D(64, (3, 3), padding='same', 
kernel_initializer='he_normal', input_shape=input_shape)) + model.add(BatchNormalization()) + model.add(MaxPooling2D((2, 2), strides=(2, 2))) + model.add(Activation("relu")) + + + model.add(Flatten()) + + model.add(Dense(num_classes, kernel_initializer='he_normal')) + model.add(Activation("softmax")) + + sgd = optimizers.SGD(lr=.1, momentum=0.9, nesterov=True) + model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy']) + return model + + +if __name__ == "__main__": + logger.debug("Executing file {}".format(__file__)) + + validation_size = 10000 + seed = 0 + num_classes = 1200 + batch_size = 128 + epochs = 200 + dropout = 0.5 + weight_decay = 0.0001 + input_shape = (28, 28, 1) + log_filepath = r'./vinyals_logs/' + + data = dataset.OmniglotDataset(validation_size=1000, snell_preprocessing=True, seed=seed) + data.load() + data.normalize() + data.data_astype(np.float32) + data.labels_astype(np.float32) + data.to_image() + resizetrans = ResizeTransformer(data.s_name, (28, 28)) + data.apply_transformer(resizetrans) + data.to_one_hot() + (x_train, y_train), (x_test, y_test) = data.train, data.test + x_val, y_val = data.validation + print(x_train.shape, y_train.shape) + + model = model_definition() + + tb_cb = TensorBoard(log_dir=log_filepath, histogram_freq=0) + change_lr = LearningRateScheduler(scheduler) + cbks = [change_lr, tb_cb] + + print('Using real-time data augmentation.') + datagen = ImageDataGenerator(horizontal_flip=True, + width_shift_range=0.125, height_shift_range=0.125, fill_mode='constant', cval=0.) + + datagen.fit(x_train) + + iterations = int(data.train.data.shape[0] / batch_size) + model.fit_generator(datagen.flow(x_train, y_train, batch_size=batch_size), + steps_per_epoch=iterations, + epochs=epochs, + callbacks=cbks, + validation_data=(x_val, y_val)) + + model.save('{}_vinyals_omniglot_snell.h5'.format(time.time())) + print("Final evaluation on val set: {}".format(model.evaluate(x_val, y_val))) diff --git a/skluc/examples/tasks/classification/svhn/__init__.py b/skluc/examples/tasks/classification/svhn/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/skluc/examples/fc_dense_svhn_preprocessing_vgg19.py b/skluc/examples/tasks/classification/svhn/fc_dense_svhn_preprocessing_vgg19.py similarity index 87% rename from skluc/examples/fc_dense_svhn_preprocessing_vgg19.py rename to skluc/examples/tasks/classification/svhn/fc_dense_svhn_preprocessing_vgg19.py index 31e40df..16b9e60 100644 --- a/skluc/examples/fc_dense_svhn_preprocessing_vgg19.py +++ b/skluc/examples/tasks/classification/svhn/fc_dense_svhn_preprocessing_vgg19.py @@ -1,18 +1,10 @@ -import time as t -import os - import tensorflow as tf import numpy as np -import matplotlib.pyplot as plt -from keras.losses import mean_squared_error -from keras.models import load_model, Model -import skluc.data.mldatasets as dataset -from skluc.data.transformation import VGG19Cifar10Transformer, VGG19SvhnTransformer -from skluc.tensorflow_.utils import fully_connected, classification_cifar, batch_generator -from skluc.utils import logger, download_data -import matplotlib.pyplot as plt -from keras.metrics import categorical_accuracy, binary_accuracy +import skluc.main.data.mldatasets as dataset +from skluc.main.data import VGG19SvhnTransformer +from skluc.main.tensorflow_.utils import fully_connected, classification_cifar, batch_generator +from skluc.main.utils import logger if __name__ == "__main__": VALIDATION_SIZE = 10000 @@ -25,7 +17,7 @@ if __name__ == "__main__": data.load() data.normalize() 
# data.to_image() - data.apply_transformer(VGG19SvhnTransformer) + data.apply_transformer(VGG19SvhnTransformer(cut_layer_name="block5_conv4")) data.to_one_hot() data.flatten() data.data_astype(np.float32) diff --git a/skluc/examples/keras_fc_preprocess_svhn.py b/skluc/examples/tasks/classification/svhn/keras_fc_preprocess_svhn.py similarity index 89% rename from skluc/examples/keras_fc_preprocess_svhn.py rename to skluc/examples/tasks/classification/svhn/keras_fc_preprocess_svhn.py index f194bcb..3173674 100644 --- a/skluc/examples/keras_fc_preprocess_svhn.py +++ b/skluc/examples/tasks/classification/svhn/keras_fc_preprocess_svhn.py @@ -1,12 +1,11 @@ import numpy as np from keras import optimizers from keras.models import Sequential -from keras.layers import Conv2D, Dense, Flatten, MaxPooling2D, Dropout +from keras.layers import Dense, Dropout from keras.callbacks import LearningRateScheduler, TensorBoard -from keras.preprocessing.image import ImageDataGenerator from keras.regularizers import l2 -import skluc.data.mldatasets as dataset -from skluc.data.transformation import VGG19SvhnTransformer +import skluc.main.data.mldatasets as dataset +from skluc.main.data import VGG19SvhnTransformer batch_size = 128 epochs = 200 diff --git a/skluc/examples/tf_profiling.py b/skluc/examples/tf_profiling.py deleted file mode 100644 index 83f9893..0000000 --- a/skluc/examples/tf_profiling.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -Basic example of profiling with tensorflow dans chrome trace. - -Example in a tensorflow application -""" - -import tensorflow as tf -from tensorflow.python.client import timeline - - -if __name__ == '__main__': - - x = tf.random_normal([1000, 1000]) - y = tf.random_normal([1000, 1000]) - res = tf.matmul(x, y) - - # Run the graph with full trace option - with tf.Session() as sess: - run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - run_metadata = tf.RunMetadata() - sess.run(res, options=run_options, run_metadata=run_metadata) - - # Create the Timeline object, and write it to a json - tl = timeline.Timeline(run_metadata.step_stats) - ctf = tl.generate_chrome_trace_format() - with open('timeline.json', 'w') as f: - f.write(ctf) \ No newline at end of file diff --git a/skluc/examples/tfrecord_nn.py b/skluc/examples/tfrecord_nn.py deleted file mode 100644 index 90a5670..0000000 --- a/skluc/examples/tfrecord_nn.py +++ /dev/null @@ -1,217 +0,0 @@ -""" -Convolutional Neural Netwok implementation in tensorflow using tfrecords as inputs. - -From memory, the tfrecords ar written then read. - -The neural network is ran against the mnist dataset and we can see an example of distortion of input in -the case of tfrecords data source. 
-""" - -import tensorflow as tf -import numpy as np -import skluc.data.mldatasets as dataset -from skluc.convert_image_to_records import convert_to -from skluc.tensorflow_.utils import inference_mnist - -tf.logging.set_verbosity(tf.logging.ERROR) - -import time as t -from collections import namedtuple - -val_size = 5000 -mnist = dataset.MnistDataset(validation_size=val_size) -mnist.load() -mnist.to_one_hot() -mnist.to_image() -mnist.normalize() -mnist.data_astype(np.float32) -mnist.labels_astype(np.float32) - -X_train, Y_train = mnist.train -X_val, Y_val = mnist.validation -X_test, Y_test = mnist.test - -# build dataset objects - -train = namedtuple("Dataset", ["images", "labels", "num_examples"]) -train.images = X_train -train.labels = Y_train -train.num_examples = X_train.shape[0] - -test = namedtuple("Dataset", ["images", "labels", "num_examples"]) -test.images = X_test -test.labels = Y_test -test.num_examples = X_test.shape[0] - -val = namedtuple("Dataset", ["images", "labels", "num_examples"]) -val.images = X_val -val.labels = Y_val -val.num_examples = X_val.shape[0] - - -def decode(serialized_example): - features = tf.parse_single_example( - serialized_example, - features={ - 'image_raw': tf.FixedLenFeature([], tf.string), - 'label': tf.FixedLenFeature([], tf.int64), - 'height': tf.FixedLenFeature([], tf.int64), - 'width': tf.FixedLenFeature([], tf.int64), - 'depth': tf.FixedLenFeature([], tf.int64) - } - ) - image = tf.decode_raw(features['image_raw'], tf.float32) - image.set_shape((784)) - - label = tf.cast(features['label'], tf.int32) - label = tf.one_hot(label, 10) - - return image, label - - -def distortion(image, label): - """ - Apply som distortion to the input images - - :param image: - :param label: - :return: - """ - distorted_image = tf.image.random_brightness(image, - max_delta=15) - return distorted_image, label - - -def get_tf_record(record_filename, num_epochs, batch_size, distord=True): - """ - Gives an iteror to plug to the input of the neural network. - - The iterator gives a new bach on demand. - - :param record_filename: The filename where to find tfrecords. 
- :param num_epochs: The number of epoch we will need - :param batch_size: The size of each returned batch - :param distord: Parameter for applying or not distortion on input - :return: The iterator to plug to the input of the network - """ - dataset = tf.data.TFRecordDataset(record_filename) - dataset = dataset.repeat(num_epochs) - dataset = dataset.map(decode) - # if distord: - # dataset = dataset.map(distortion) - dataset = dataset.shuffle(1000 + 3 * batch_size) - dataset = dataset.batch(batch_size) # combine les éléments en batchs - iterator = dataset.make_one_shot_iterator() - return iterator - - -def trainning(): - SIGMA = 5.0 - num_epochs = 10000 - batch_size = 64 - print("Sigma = {}".format(SIGMA)) - - with tf.Graph().as_default(): - # retourne batchs après batchs - iterator_train = get_tf_record("/tmp/data/mnist/mnist_train.tfrecords", num_epochs, batch_size, distord=False) - iterator_val = get_tf_record("/tmp/data/mnist/mnist_val.tfrecords", num_epochs, batch_size) - iterator_test = get_tf_record("/tmp/data/mnist/mnist_test.tfrecords", num_epochs, batch_size) - - input_dim, output_dim = 784, 10 - - with tf.name_scope("train_set"): - x_image_train, y_train = iterator_train.get_next() - x_image_train = tf.reshape(x_image_train, [-1, 28, 28, 1]) - tf.summary.image("digit", x_image_train, max_outputs=3) - - with tf.name_scope("val_set"): - x_image_val, y_val = iterator_val.get_next() - x_image_val = tf.reshape(x_image_val, tf.stack([-1, 28, 28, 1])) - - with tf.name_scope("test_set"): - x_image_test, y_test = iterator_test.get_next() - x_image_test = tf.reshape(x_image_test, tf.stack([-1, 28, 28, 1])) - - with tf.variable_scope("inference") as scope_inference: - y_train_out, keep_prob_train = inference_mnist(x_image_train, output_dim) - scope_inference.reuse_variables() - y_val_out, keep_prob_val = inference_mnist(x_image_val, output_dim) - scope_inference.reuse_variables() - y_test_out, keep_prob_test = inference_mnist(x_image_test, output_dim) - - # calcul de la loss - with tf.name_scope("xent"): - cross_entropy = tf.reduce_mean( - tf.nn.softmax_cross_entropy_with_logits(labels=y_train, logits=y_train_out, name="xentropy"), - name="xentropy_mean") - tf.summary.scalar('loss-xent', cross_entropy) - - # calcul du gradient - with tf.name_scope("train"): - global_step = tf.Variable(0, name="global_step", trainable=False) - train_optimizer = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cross_entropy, global_step=global_step) - - # calcul de l'accuracy - with tf.name_scope("accuracy"): - predictions_val = tf.argmax(y_val_out, 1) - correct_prediction_val = tf.equal(predictions_val, tf.argmax(y_val, 1)) - accuracy_val = tf.reduce_mean(tf.cast(correct_prediction_val, tf.float32)) - tf.summary.scalar("accuracy", accuracy_val) - predictions_test = tf.argmax(y_test_out, 1) - correct_prediction_test = tf.equal(predictions_test, tf.argmax(y_test, 1)) - accuracy_test = tf.reduce_mean(tf.cast(correct_prediction_test, tf.float32)) - - merged_summary = tf.summary.merge_all() - - init = tf.global_variables_initializer() - # Create a session for running Ops on the Graph. - sess = tf.Session() - # Instantiate a SummaryWriter to output summaries and the Graph. 
- summary_writer = tf.summary.FileWriter("/tmp/results_tfrecord_nn") - summary_writer.add_graph(sess.graph) - # Initialize all Variable objects - sess.run(init) - # actual learning - started = t.time() - feed_dict = { - keep_prob_val: 1.0, - keep_prob_train: 0.5, - keep_prob_test: 1.0 - } - # todo maybe there is a misunderstanding of epoch definition here. - for i in range(num_epochs): - # run training and get the loss - _, loss = sess.run([train_optimizer, cross_entropy], feed_dict=feed_dict) - - if i % 100 == 0: - print('step {}, loss {} (with dropout)'.format(i, loss)) - r_accuracy = sess.run([accuracy_val], feed_dict=feed_dict) - print("accuracy: {} on validation set (without dropout).".format(r_accuracy)) - summary_str = sess.run(merged_summary, feed_dict=feed_dict) - summary_writer.add_summary(summary_str, i) - - stoped = t.time() - accuracy_eval, preds_eval, exp_eval = sess.run([accuracy_test, predictions_test, y_test], feed_dict=feed_dict) - print('test accuracy %g' % accuracy_eval) - np.set_printoptions(threshold=np.nan) - print("Prediction sample: " + str(preds_eval[:50])) - print("Actual values: " + str(np.argmax(exp_eval[:50], axis=1))) - print("Elapsed time: %.4f s" % (stoped - started)) - - -def create_records(): - import os - directory = "/tmp/data/mnist" - - if not os.path.exists(os.path.join(directory, "mnist_train.tfrecords")): - convert_to(train, "mnist_train", directory) - convert_to(test, "mnist_test", directory) - convert_to(val, "mnist_val", directory) - - -if __name__ == '__main__': - create_records() - trainning() - - - diff --git a/skluc/examples/time_batch_subsample_ops.py b/skluc/examples/time_batch_subsample_ops.py deleted file mode 100644 index 2541afe..0000000 --- a/skluc/examples/time_batch_subsample_ops.py +++ /dev/null @@ -1,86 +0,0 @@ -import tensorflow as tf -import numpy as np -from sklearn.metrics.pairwise import rbf_kernel - -import skluc.data.mldatasets as dataset -from skluc.tensorflow_.utils import fully_connected, get_next_batch, tf_op, conv_relu_pool -from skluc.utils import time_fct -from skluc.tensorflow_.kernel import tf_rbf_kernel - -tf.logging.set_verbosity(tf.logging.ERROR) - -# Preparing the dataset ######################### - -mnist = dataset.MnistDataset() -mnist.load() -X_train, _ = mnist.train -X_train = np.array(X_train / 255) -X_train = X_train.astype(np.float32) - -################################################ - -# todo timer les autres kernels pour verifier que c'est effectivement plus rapide - -if __name__ == '__main__': - input_dim = X_train.shape[1] - output_dim_fc = 4096*2 - batch_size = 500 - subsample_size = 500 - X_batch = get_next_batch(X_train, 0, batch_size) - X_subsample = get_next_batch(X_train, 0, subsample_size) - - with tf.Graph().as_default(): - # inputs - x = tf.placeholder(tf.float32, shape=[None, input_dim], name="x") - x_subsample = tf.placeholder(tf.float32, shape=[None, input_dim], name="x_subsample") - - # reshape vector inputs to images - side_size = int(np.sqrt(input_dim)) - x_image = tf.reshape(x, [-1, side_size, side_size, 1]) - x_subsample_image = tf.reshape(x_subsample, [subsample_size, side_size, side_size, 1]) - - # fully connected ops - out_fc_x = fully_connected(x, output_dim_fc, act=tf.nn.relu, variable_scope="fc_x") - out_fc_subsample = fully_connected(x_subsample, output_dim_fc, act=tf.nn.relu, variable_scope="fc_subsample") - - # convolution ops - out_conv_x = conv_relu_pool(x_image, [5, 5, 1, 20], [20], pool_size=3, variable_scope="conv_x") - out_conv_subsample = 
conv_relu_pool(x_subsample_image, [5, 5, 1, 20], [20], pool_size=3, - variable_scope="conv_subsample") - - init_dim = np.prod([s.value for s in out_conv_x.shape[1:] if s.value is not None]) - x_conv_flat = tf.reshape(out_conv_x, [-1, init_dim]) - subsample_conv_flat = tf.reshape(out_conv_subsample, [subsample_size, init_dim]) - - # kernel computing ops - with tf.device('/cpu:0'): - kernel_cpu = tf_rbf_kernel(x_conv_flat, subsample_conv_flat, gamma=0.001) - with tf.device('/device:GPU:0'): - kernel_gpu = tf_rbf_kernel(x_conv_flat, subsample_conv_flat, gamma=0.001) - - feed_dict = {x: X_batch, x_subsample: X_subsample} - - def kernel_sklearn(): - with tf.Session() as sess: - init = tf.global_variables_initializer() - sess.run([init]) - x, y = sess.run([x_conv_flat, subsample_conv_flat], feed_dict=feed_dict) - rbf_kernel(x, y, gamma=0.001) - - d_time_results = { - "fc_x": lambda: time_fct(lambda: tf_op(feed_dict, [out_fc_x]), n_iter=10), - "fc_subsample": lambda: time_fct(lambda: tf_op(feed_dict, [out_fc_subsample]), n_iter=10), - "reshape_x": lambda: time_fct(lambda: tf_op(feed_dict, [x_image]), n_iter=10), - "reshape_subsample": lambda: time_fct(lambda: tf_op(feed_dict, [x_subsample_image]), n_iter=10), - "reshape_x + conv_x": lambda: time_fct(lambda: tf_op(feed_dict, [out_conv_x]), n_iter=10), - "reshape_subsample + conv_subsample": lambda: time_fct(lambda: tf_op(feed_dict, [out_conv_subsample]), n_iter=10), - "reshape_x + conv_x + reshape_subsample + conv_subsample": lambda: time_fct(lambda: tf_op(feed_dict, [out_conv_x, out_conv_subsample]), n_iter=10), - "reshape_x + conv_x + reshape_subsample + conv_subsample + kernel_cpu": lambda: time_fct(lambda: tf_op(feed_dict, [kernel_cpu]), n_iter=10), - "reshape_x + conv_x + reshape_subsample + conv_subsample + kernel_gpu": lambda: time_fct(lambda: tf_op(feed_dict, [kernel_gpu]), n_iter=10), - "reshape_x + conv_x + reshape_subsample + conv_subsample + kernel_sklearn": lambda: time_fct(kernel_sklearn, n_iter=10) - } - - for key, value in d_time_results.items(): - print("{}:\t{:.4f}s".format(key, value())) - tf.reset_default_graph() - diff --git a/skluc/examples/write_read_tfrecords.py b/skluc/examples/write_read_tfrecords.py deleted file mode 100644 index e3c3703..0000000 --- a/skluc/examples/write_read_tfrecords.py +++ /dev/null @@ -1,77 +0,0 @@ -""" -Example (with mnist) on how to read and write tf records. 
-""" - -import tensorflow as tf -import os -import numpy as np -import skluc.data.mldatasets as dataset -from skluc.convert_image_to_records import convert_to -import matplotlib.pyplot as plt - -tf.logging.set_verbosity(tf.logging.ERROR) - -from collections import namedtuple - -val_size = 5000 -mnist = dataset.MnistDataset(validation_size=val_size) -mnist.load() -mnist.to_one_hot() -mnist.normalize() -mnist.to_image() -mnist.data_astype(np.float32) -mnist.labels_astype(np.float32) - -X_train, Y_train = mnist.train -X_val, Y_val = mnist.validation -X_test, Y_test = mnist.test - - -train = namedtuple("Dataset", ["images", "labels", "num_examples"]) -train.images = X_train -train.labels = Y_train -train.num_examples = X_train.shape[0] - -test = namedtuple("Dataset", ["images", "labels", "num_examples"]) -test.images = X_test -test.labels = Y_test -test.num_examples = X_test.shape[0] - -val = namedtuple("Dataset", ["images", "labels", "num_examples"]) -val.images = X_val -val.labels = Y_val -val.num_examples = X_val.shape[0] - -DIRECTORY = "/tmp/data/mnist" - - -def write_tf_record_mnist(): - convert_to(train, "mnist_train", DIRECTORY) - convert_to(test, "mnist_test", DIRECTORY) - convert_to(val, "mnist_val", DIRECTORY) - - -def read_tf_record_mnist(): - reconstructed_images = [] - record_iterator = tf.python_io.tf_record_iterator(path=os.path.join(DIRECTORY, "mnist_train.tfrecords")) - for string_record in record_iterator: - example = tf.train.Example() - example.ParseFromString(string_record) - height = int(example.features.feature["height"].int64_list.value[0]) - width = int(example.features.feature["width"].int64_list.value[0]) - depth = int(example.features.feature["depth"].int64_list.value[0]) - img_string = (example.features.feature["image_raw"].bytes_list.value[0]) - label = (example.features.feature["label"].int64_list.value[0]) - - img_1d = np.fromstring(img_string, dtype=np.uint32) - reconstructed_img = img_1d.reshape((height, width, depth)) - reconstructed_images.append((reconstructed_img, label)) - - plt.imshow(reconstructed_images[0][0][:,:,0]) - plt.show() - print(reconstructed_images[0][1]) - - -if __name__ == '__main__': - # write_tf_record_mnist() - read_tf_record_mnist() \ No newline at end of file diff --git a/skluc/main/data/mldatasets/Dataset.py b/skluc/main/data/mldatasets/Dataset.py index 1aa8459..d01ae87 100644 --- a/skluc/main/data/mldatasets/Dataset.py +++ b/skluc/main/data/mldatasets/Dataset.py @@ -77,7 +77,7 @@ class Dataset(object): get_nbr)) idx_labs = np.where(bool_idx_labs)[0][:get_nbr] # return_idx_labels[copy_idx:copy_idx+get_nbr] = idx_labs - logger.debug("Found indexes for label {}: {}; length: {}".format(u_lab, idx_labs, len(idx_labs))) + # logger.debug("Found indexes for label {}: {}; length: {}".format(u_lab, idx_labs, len(idx_labs))) return_idx_labels.extend(idx_labs) copy_idx += get_nbr @@ -129,6 +129,7 @@ class Dataset(object): @property def validation(self): + # todo doesn't work if val size = 0 return LabeledData(data=self._train.data[self.permuted_index_validation], labels=self._train.labels[self.permuted_index_validation]) @@ -273,6 +274,7 @@ class Dataset(object): if self._train is not None: logger.debug("Construction of random train indexes (seed: {})".format(self.seed)) np.random.seed(self.seed) + # todo -> faire argument shuffle or not permut = np.random.permutation(self._train[0].shape[0]) if self.validation_size > 0: self.permuted_index_train = permut[:-self.validation_size] diff --git a/skluc/main/data/mldatasets/ImageDataset.py 
b/skluc/main/data/mldatasets/ImageDataset.py index 4151cd3..92053d5 100644 --- a/skluc/main/data/mldatasets/ImageDataset.py +++ b/skluc/main/data/mldatasets/ImageDataset.py @@ -43,8 +43,7 @@ class ImageDataset(Dataset): data, labels = getattr(self, kw) transformed_data, transformed_labels = transformer.transform(data, labels) setattr(self, kw, LabeledData(data=transformed_data, labels=transformed_labels)) - - self.save_npz(transform_path) + self.save_npz(transform_path) def to_image(self): """ diff --git a/skluc/data/mldatasets/MovieReviewDataset.py b/skluc/main/data/mldatasets/MovieReviewDataset.py similarity index 90% rename from skluc/data/mldatasets/MovieReviewDataset.py rename to skluc/main/data/mldatasets/MovieReviewDataset.py index 9d4e28b..97080db 100644 --- a/skluc/data/mldatasets/MovieReviewDataset.py +++ b/skluc/main/data/mldatasets/MovieReviewDataset.py @@ -118,6 +118,8 @@ class MovieReviewV1Dataset(Dataset): @property def train(self): + # todo no guarantee on the stratification of classes + indexes = self.permuted_index_train[:self.TRAIN_SIZE - self.validation_size] return LabeledData(data=self._train.data[indexes], labels=self._train.labels[indexes]) @@ -182,25 +184,3 @@ class MovieReviewV1Dataset(Dataset): negative_labels = [[1, 0] for _ in negative_examples] y = np.concatenate([positive_labels, negative_labels], 0) return LabeledData(data=x_text, labels=y) - - # todo not yet sure the following is usefull - # @staticmethod - # def batch_iter(data, batch_size, num_epochs, shuffle=True): - # """ - # Generates a batch iterator for a dataset. - # """ - # data = np.array(data) - # data_size = len(data) - # num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1 - # for epoch in range(num_epochs): - # # Shuffle the data at each epoch - # if shuffle: - # shuffle_indices = np.random.permutation(np.arange(data_size)) - # shuffled_data = data[shuffle_indices] - # else: - # shuffled_data = data - # for batch_num in range(num_batches_per_epoch): - # start_index = batch_num * batch_size - # end_index = min((batch_num + 1) * batch_size, data_size) - # - # yield shuffled_data[start_index:end_index] diff --git a/skluc/main/data/mldatasets/OmniglotDataset.py b/skluc/main/data/mldatasets/OmniglotDataset.py index e7141fb..22bcdce 100644 --- a/skluc/main/data/mldatasets/OmniglotDataset.py +++ b/skluc/main/data/mldatasets/OmniglotDataset.py @@ -16,12 +16,23 @@ class OmniglotDataset(ImageDataset): WIDTH = 105 DEPTH = 1 - def __init__(self, validation_size=0, seed=None, s_download_dir=None): + def __init__(self, validation_size=0, seed=None, s_download_dir=None, snell_preprocessing=False): + """ + + :param validation_size: + :param seed: + :param s_download_dir: + :param snell_preprocessing: should the data preprocessing used in prototypical be used on omniglot + """ self.__s_url = ["https://github.com/brendenlake/omniglot/raw/master/python/images_background.zip", "https://github.com/brendenlake/omniglot/raw/master/python/images_evaluation.zip" ] self.meta = None + self.__snell_preprocessing = snell_preprocessing name = "omniglot" + if self.__snell_preprocessing: + name += "_snell" + if s_download_dir is not None: super().__init__(self.__s_url, name, s_download_dir, validation_size=validation_size, seed=seed) else: @@ -101,14 +112,36 @@ class OmniglotDataset(ImageDataset): logger.debug("Files {} have already been extracted".format(self.l_filepaths)) logger.debug("Get training data of dataset {}".format(self.s_name)) - self._train = LabeledData(*self.get_omniglot_data('background')) + 
background_data = LabeledData(*self.get_omniglot_data('background')) logger.debug("Get testing data of dataset {}".format(self.s_name)) - self._test = LabeledData(*self.get_omniglot_data('evaluation')) + evaluation_data = LabeledData(*self.get_omniglot_data('evaluation')) + + if self.__snell_preprocessing: + nb_class_bg_snell = 1200 + unique_labels_train = np.unique(background_data.labels, axis=0) + nb_labels_train = len(unique_labels_train) + nb_class_to_move = nb_class_bg_snell - nb_labels_train + unique_labels_test = np.unique(evaluation_data.labels, axis=0) + labels_to_move = unique_labels_test[:nb_class_to_move] + bool_idx_data_to_move = np.zeros(len(evaluation_data.labels), dtype=bool) + for label in labels_to_move: + bool_idx_label = OmniglotDataset.get_bool_idx_label(label, evaluation_data.labels) + bool_idx_data_to_move = np.logical_or(bool_idx_data_to_move, bool_idx_label) + + labels_to_add = evaluation_data.labels[bool_idx_data_to_move] + np.max(background_data.labels) + 1 + self._train = LabeledData(data=np.vstack([background_data.data, evaluation_data.data[bool_idx_data_to_move]]), + labels=np.hstack([background_data.labels, labels_to_add ])) + self._test = LabeledData(data=evaluation_data.data[np.logical_not(bool_idx_data_to_move)], + labels=evaluation_data.labels[np.logical_not(bool_idx_data_to_move)] - nb_class_to_move) + else: + self._train = background_data + self._test = evaluation_data self._check_validation_size(self._train[0].shape[0]) self.save_npz() + logger.debug("Number of labels in train set {}".format(len(np.unique(self._train.labels, axis=0)))) logger.debug("Number of labels in evaluation set {}".format(len(np.unique(self._test.labels, axis=0)))) diff --git a/skluc/main/data/mldatasets/RPSDataset.py b/skluc/main/data/mldatasets/RPSDataset.py new file mode 100644 index 0000000..78e5059 --- /dev/null +++ b/skluc/main/data/mldatasets/RPSDataset.py @@ -0,0 +1,125 @@ +import os +import zipfile + +import numpy as np +import imageio +import matplotlib.pyplot as plt + +from skluc.utils import LabeledData, create_directory +from skluc.data.mldatasets.ImageDataset import ImageDataset +from skluc.utils import logger, check_files + + +class RPSDataset(ImageDataset): + data_groups_private = ["_train"] + HEIGHT = 50 + WIDTH = 50 + DEPTH = 3 + TRAIN_SIZE = 600 + + def __init__(self, validation_size=0, seed=None, s_download_dir=None): + self.__s_url = ["https://pageperso.lif.univ-mrs.fr/~luc.giffon/datasets/rps_data_resize.zip"] + self.meta = None + name = "rps" + if s_download_dir is not None: + super().__init__(self.__s_url, name, s_download_dir, validation_size=validation_size, seed=seed) + else: + super().__init__(self.__s_url, name, validation_size=validation_size, seed=seed) + + self.__extracted_dirs = [ + os.path.join(self.s_download_dir, "images_background"), + os.path.join(self.s_download_dir, "images_evaluation") + ] + + def get_rps_data(self): + data_dirname = "rps_data_resize" + data_dirpath = os.path.join(self.s_download_dir, data_dirname) + class_index = 0 + list_of_images = [] + list_of_labels = [] + for symbol_name in os.listdir(data_dirpath): + data_symbol_path = os.path.join(data_dirpath, symbol_name) + for symbol_image_file in os.listdir(data_symbol_path): + symbol_image_path = os.path.join(data_symbol_path, symbol_image_file) + im = imageio.imread(symbol_image_path) + list_of_images.append(im) + list_of_labels.append(class_index) + class_index += 1 + data = np.array(list_of_images) + labels = np.array(list_of_labels) + data = data.reshape(data.shape[0], 
self.WIDTH*self.HEIGHT, self.DEPTH, order="C") + data = data.reshape(data.shape[0], self.WIDTH*self.HEIGHT*self.DEPTH, order="F") + return data, labels + + def read(self): + npzdir_path = os.path.join(self.s_download_dir, "npzfiles") + lst_npzfile_paths = [os.path.join(npzdir_path, kw + ".npz") + for kw in self.data_groups_private] + create_directory(npzdir_path) + if check_files(lst_npzfile_paths): + # case npz files already exist + logger.debug("Files {} already exists".format(lst_npzfile_paths)) + logger.info("Loading transformed data from files {}".format(lst_npzfile_paths)) + for kw in self.data_groups_private: + npzfile_path = os.path.join(npzdir_path, kw + ".npz") + logger.debug("Loading {}".format(npzfile_path)) + npzfile = np.load(npzfile_path) + data = npzfile[kw + "_data"] + logger.debug("Shape of {} set: {}".format(kw, data.shape)) + labels = npzfile[kw + "_labels"] + setattr(self, kw, LabeledData(data=data, labels=labels)) + else: + + if not check_files(self.__extracted_dirs): + # case zip files dont even exist + logger.debug("Extracting {} ...".format(self.l_filepaths)) + for zip_file in self.l_filepaths: + zip_ref = zipfile.ZipFile(zip_file, 'r') + zip_ref.extractall(self.s_download_dir) + zip_ref.close() + else: + logger.debug("Files {} have already been extracted".format(self.l_filepaths)) + + full_data, full_labels = self.get_rps_data() + logger.debug("Get training data of dataset {}".format(self.s_name)) + self._train = LabeledData(data=full_data, labels=full_labels) + # self._test = LabeledData(data=np.array([]), labels=np.array([])) + # + # logger.debug("Get testing data of dataset {}".format(self.s_name)) + # self._test = LabeledData(*self.get_omniglot_data('evaluation')) + # + self._check_validation_size(self._train[0].shape[0]) + + self.save_npz() + + + @property + def train(self): + indexes = self.permuted_index_train[:self.TRAIN_SIZE - self.validation_size] + return LabeledData(data=self._train.data[indexes], + labels=self._train.labels[indexes]) + + @property + def test(self): + indexes = self.permuted_index_train[self.TRAIN_SIZE:] + return LabeledData(data=self._train.data[indexes], + labels=self._train.labels[indexes]) + + @property + def validation(self): + indexes = self.permuted_index_train[(self.TRAIN_SIZE - self.validation_size):self.TRAIN_SIZE] + return LabeledData(data=self._train.data[indexes], + labels=self._train.labels[indexes]) + + +if __name__ == "__main__": + import time + d = RPSDataset(validation_size=100) + d.load() + d.to_image() + print(d.train.data.shape) + for i, im in enumerate(d.train.data): + plt.imshow(im) + plt.show() + print(d.train.labels[i]) + time.sleep(1) \ No newline at end of file diff --git a/skluc/main/data/transformation/VinyalsTransformer.py b/skluc/main/data/transformation/VinyalsTransformer.py index 9658af6..bc971a4 100644 --- a/skluc/main/data/transformation/VinyalsTransformer.py +++ b/skluc/main/data/transformation/VinyalsTransformer.py @@ -16,8 +16,12 @@ class VinyalsTransformer(KerasModelTransformer, metaclass=Singleton): checksum="a0b815ad2ab81092c75d129f511b2bdb" ), "omniglot_28x28": DownloadableModel( - url="https://pageperso.lis-lab.fr/~luc.giffon/models/1536742266.9412131_vinyals_omniglot.h5", + url="https://pageperso.lis-lab.fr/luc.giffon/models/1536742266.9412131_vinyals_omniglot_28x28.h5", checksum="6460eb1b7eaa478301a281b12ecd2461" + ), + "omniglot_snell": DownloadableModel( + url="https://pageperso.lis-lab.fr/~luc.giffon/models/1537524783.0678186_vinyals_omniglot_snell.h5", + 
            checksum="28a6e4e3748d9971e0450000895ce423"
        )
    }

diff --git a/skluc/main/data/transformation/tCNNTransformer/__init__.py b/skluc/main/data/transformation/tCNNTransformer/__init__.py
new file mode 100644
index 0000000..135afb5
--- /dev/null
+++ b/skluc/main/data/transformation/tCNNTransformer/__init__.py
@@ -0,0 +1,29 @@
+import os
+
+import numpy as np
+from keras import Model
+from skluc.utils import logger, deprecated
+
+
+@deprecated
+class tCNNTransformer:
+    """
+    Transform text data with a textCNN transformer.
+    """
+    def __init__(self, name, download_dir=os.path.join(os.path.expanduser("~"), "ml_models")):
+        self.tcnn_model = None
+        self.s_download_dir = os.path.join(download_dir, name)
+        super().__init__()
+
+    def transform(self, data, labels):
+        # todo make this more generic across transformers
+        model = Model(inputs=self.tcnn_model.input, outputs=self.tcnn_model.output)
+        logger.debug("Type of data to transform: {}".format(type(data)))
+        logger.debug("Length of data to transform: {}".format(len(data)))
+        logger.debug("Transforming data using pretrained model")
+        transformed_data = np.array(model.predict(data)).reshape(-1, *model.output_shape[1:])
+        logger.debug("Type of transformed data: {}".format(type(transformed_data)))
+        return transformed_data, labels
+
+    def check_model(self):
+        raise NotImplementedError
diff --git a/skluc/test/test_data/test_mldatasets/TestCifar100Dataset.py b/skluc/test/test_data/test_mldatasets/TestCifar100Dataset.py
new file mode 100644
index 0000000..e69de29
diff --git a/skluc/test/test_data/test_mldatasets/TestImageDataset.py b/skluc/test/test_data/test_mldatasets/TestImageDataset.py
new file mode 100644
index 0000000..e69de29
diff --git a/skluc/test/test_data/test_mldatasets/TestSVHNDataset.py b/skluc/test/test_data/test_mldatasets/TestSVHNDataset.py
new file mode 100644
index 0000000..e69de29
--
GitLab