From c88d0b49243134e429a1cc058b7424055059587a Mon Sep 17 00:00:00 2001
From: Luc Giffon <luc.giffon@lis-lab.fr>
Date: Tue, 9 Oct 2018 08:13:47 +0200
Subject: [PATCH] move unused scripts to sandbox + update datasets + create test files (still empty for some)

---
 skluc/examples/fc_nn_timed.py                 |  30 ---
 skluc/examples/nystroem_svm_classifier.py     | 140 -----------
 skluc/examples/so_conv_net.py                 |  33 ---
 .../tasks/classification/cifar10/__init__.py  |   0
 .../classification/cifar10}/fc_cnn_cifar.py   |   6 +-
 .../fc_dense_cifar_preprocessing_vgg19.py     |  19 +-
 .../tasks/classification/mnist/__init__.py    |   0
 .../mnist}/deepfriedConvnetMnist.py           |   6 +-
 .../classification/mnist}/fc_cnn_mnist.py     |   4 +-
 .../mnist}/keras_fc_cnn_mnist.py              |   2 +-
 .../scratch/omniglot_snell_28x28_vinyals.py   | 114 +++++++++
 .../tasks/classification/svhn/__init__.py     |   0
 .../fc_dense_svhn_preprocessing_vgg19.py      |  18 +-
 .../svhn}/keras_fc_preprocess_svhn.py         |   7 +-
 skluc/examples/tf_profiling.py                |  27 ---
 skluc/examples/tfrecord_nn.py                 | 217 ------------------
 skluc/examples/time_batch_subsample_ops.py    |  86 -------
 skluc/examples/write_read_tfrecords.py        |  77 -------
 skluc/main/data/mldatasets/Dataset.py         |   4 +-
 skluc/main/data/mldatasets/ImageDataset.py    |   3 +-
 .../data/mldatasets/MovieReviewDataset.py     |  24 +-
 skluc/main/data/mldatasets/OmniglotDataset.py |  39 +++-
 skluc/main/data/mldatasets/RPSDataset.py      | 125 ++++++++++
 .../data/transformation/VinyalsTransformer.py |   6 +-
 .../tCNNTransformer/__init__.py               |  29 +++
 .../test_mldatasets/TestCifar100Dataset.py    |   0
 .../test_mldatasets/TestImageDataset.py       |   0
 .../test_mldatasets/TestSVHNDataset.py        |   0
 28 files changed, 338 insertions(+), 678 deletions(-)
 delete mode 100644 skluc/examples/fc_nn_timed.py
 delete mode 100644 skluc/examples/nystroem_svm_classifier.py
 delete mode 100644 skluc/examples/so_conv_net.py
 create mode 100644 skluc/examples/tasks/classification/cifar10/__init__.py
 rename skluc/examples/{ => tasks/classification/cifar10}/fc_cnn_cifar.py (96%)
 rename skluc/examples/{ => tasks/classification/cifar10}/fc_dense_cifar_preprocessing_vgg19.py (88%)
 create mode 100644 skluc/examples/tasks/classification/mnist/__init__.py
 rename skluc/examples/{ => tasks/classification/mnist}/deepfriedConvnetMnist.py (95%)
 rename skluc/examples/{ => tasks/classification/mnist}/fc_cnn_mnist.py (97%)
 rename skluc/examples/{ => tasks/classification/mnist}/keras_fc_cnn_mnist.py (98%)
 create mode 100644 skluc/examples/tasks/classification/omniglot/scratch/omniglot_snell_28x28_vinyals.py
 create mode 100644 skluc/examples/tasks/classification/svhn/__init__.py
 rename skluc/examples/{ => tasks/classification/svhn}/fc_dense_svhn_preprocessing_vgg19.py (87%)
 rename skluc/examples/{ => tasks/classification/svhn}/keras_fc_preprocess_svhn.py (89%)
 delete mode 100644 skluc/examples/tf_profiling.py
 delete mode 100644 skluc/examples/tfrecord_nn.py
 delete mode 100644 skluc/examples/time_batch_subsample_ops.py
 delete mode 100644 skluc/examples/write_read_tfrecords.py
 rename skluc/{ => main}/data/mldatasets/MovieReviewDataset.py (90%)
 create mode 100644 skluc/main/data/mldatasets/RPSDataset.py
 create mode 100644 skluc/main/data/transformation/tCNNTransformer/__init__.py
 create mode 100644 skluc/test/test_data/test_mldatasets/TestCifar100Dataset.py
 create mode 100644 skluc/test/test_data/test_mldatasets/TestImageDataset.py
 create mode 100644 skluc/test/test_data/test_mldatasets/TestSVHNDataset.py

diff --git a/skluc/examples/fc_nn_timed.py b/skluc/examples/fc_nn_timed.py
deleted file mode 100644
index e4736ba..0000000
---
a/skluc/examples/fc_nn_timed.py +++ /dev/null @@ -1,30 +0,0 @@ -import tensorflow as tf -import numpy as np -import skluc.data.mldatasets as dataset -from skluc.tensorflow_.utils import fully_connected, get_next_batch, tf_op -from skluc.utils import time_fct - -tf.logging.set_verbosity(tf.logging.ERROR) - -# Preparing the dataset ######################### - -mnist = dataset.MnistDataset() -mnist.load() -mnist.normalize() -mnist.data_astype(np.float32) - -X_train, _ = mnist.train - - -################################################# - - -if __name__ == '__main__': - input_dim = X_train.shape[1] - batch_size = 10 - with tf.Graph().as_default(): - x = tf.placeholder(tf.float32, shape=[None, input_dim], name="x") - out_fc = fully_connected(x, 4096*2, act=tf.nn.relu) - X_batch = get_next_batch(X_train, 0, batch_size) - feed_dict = {x: X_batch} - print("%.4fs" % time_fct(lambda: tf_op(feed_dict, [out_fc]))) \ No newline at end of file diff --git a/skluc/examples/nystroem_svm_classifier.py b/skluc/examples/nystroem_svm_classifier.py deleted file mode 100644 index 5583600..0000000 --- a/skluc/examples/nystroem_svm_classifier.py +++ /dev/null @@ -1,140 +0,0 @@ -import numpy as np -from collections import defaultdict -import os -import skluc.data.mldatasets as dataset -from sklearn.kernel_approximation import Nystroem -from sklearn.svm import SVC - -from skluc.data.transformation import VGG19Cifar10Transformer -from skluc.utils import logger, compute_euristic_sigma -import matplotlib.pyplot as plt - -kernel_marker = { - "additive_chi2": "x", - "linear": "o", - "rbf": "d", - "chi2": "h" -} - -# colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] -colors = [ - "#009BE5", - "#004EE0", - "#00DC8F", - "#4400D8", - "#03D400", - "#CC00CF", - "#8BCB00", - "#C70046", - "#C37E00", - "#BF3C00" -] - - -def nystroem_classif(X_train, Y_train, X_test, Y_test, subsample, subsample_label, kernel_fct, kernel_params): - nys = Nystroem(kernel=kernel_fct, kernel_params=kernel_params, n_components=subsample.shape[0]) - logger.debug("Subsample shape (bis): {}".format(subsample.shape)) - nys.fit(subsample) - # X_train_transformed = nys.transform(X_train) - X_test_transformed = nys.transform(X_test) - subsample_transformed = nys.transform(subsample) - f, ax = plt.subplots() - if len(subsample) == 2: - for i in range(len(X_test)): - # if Y_test[i] > 6: - # continue - plt.scatter(X_test_transformed[i][0], X_test_transformed[i][1], c=colors[int(Y_test[i])], marker='.') - if len(kernel_params) == 0: - circle0 = plt.Circle((subsample_transformed[0][0], subsample_transformed[0][1]), 0.2, color='r', fill=False) - circle1 = plt.Circle((subsample_transformed[1][0], subsample_transformed[1][1]), 0.2, color='r', fill=False) - else: - circle0 = plt.Circle((subsample_transformed[0][0], subsample_transformed[0][1]), 0.01, color='r', fill=False) - circle1 = plt.Circle((subsample_transformed[1][0], subsample_transformed[1][1]), 0.01, color='r', fill=False) - ax.add_artist(circle0) - ax.add_artist(circle1) - - plt.legend() - plt.title("{}; {}".format(kernel_fct, str(kernel_params))) - out_dir_path = "/home/luc/PycharmProjects/deepFriedConvnets/main/experiments/graph_drawing/paper/cifar/plt_2D" - out_name = "plot_2D_{}".format(kernel_fct) - out_path = os.path.join(out_dir_path, out_name) - f.savefig(out_path) - plt.show() - # clf = SVC(kernel="linear") - # clf.fit(X_train_transformed, Y_train) - # score = clf.score(X_test_transformed, Y_test) - score = 0.1 - return score - - -if __name__ == "__main__": - # SEED = np.random.randint(0, 100) - SEED = 
10 - VALIDATION_SIZE = 10000 - SUBSAMPLE_SIZES = np.logspace(1, 6, dtype=int, base=2, num=5) - - data = dataset.Cifar10Dataset(validation_size=VALIDATION_SIZE, seed=0) - - data.load() - data.normalize() - data.to_image() - data.apply_transformer(VGG19Cifar10Transformer) - # todo faire convention pour stockage des data - # data.revert_one_hot() - data.data_astype(np.float32) - data.labels_astype(np.float32) - data.flatten() - - X_train, Y_train = data.train - X_test, Y_test = data.test - X_val, Y_val = data.validation - logger.debug("X_train.shape: {} ; Y_train.shape: {}".format(X_train.shape, Y_train.shape)) - logger.debug("X_test.shape: {} ; Y_test.shape: {}".format(X_test.shape, Y_test.shape)) - best_gamma = 0.005425247156552446 - # this value of best_gamma comes from the below command (but it is expensive to compute so I decided to hardcode it) - # best_gamma = 1. / compute_euristic_sigma(X_train) - logger.debug("Best gamma is: {}".format(best_gamma)) - - TESTED_KERNELS = { - "chi2": {"gamma": best_gamma}, - "additive_chi2": {}, - "rbf": {"gamma": best_gamma}, - "linear": {} - } - # results = [] - results_by_kernel = defaultdict(list) - for m_size in SUBSAMPLE_SIZES: - logger.debug("Starting with subsample size == {}".format(m_size)) - np.random.seed(SEED) - dataset_indexes = np.random.permutation(X_train.shape[0]) - X_subsample = X_train[dataset_indexes[:m_size]] - Y_subsample = Y_train[dataset_indexes[:m_size]] - logger.debug("Shape of subsample: {}".format(X_subsample.shape)) - for kernel_name, kernel_params in TESTED_KERNELS.items(): - logger.debug("Starting with kernel {}".format(kernel_name)) - # kernel_fct = kernel_tuple[0] - # kernel_params = kernel_tuple[1] - logger.debug("Kernel params are: {}".format(kernel_params)) - score = nystroem_classif(X_train=X_train, - Y_train=Y_train, - X_test=X_test[:1000], - Y_test=Y_test[:1000], - subsample=X_subsample, - subsample_label=Y_subsample, - kernel_fct=kernel_name, - kernel_params=kernel_params) - logger.debug("Obtained score: {}".format(score)) - results_by_kernel[kernel_name].append(score) - - for kernel_name, scores in results_by_kernel.items(): - plt.scatter(SUBSAMPLE_SIZES, scores, label=kernel_name, marker=kernel_marker[kernel_name]) - - plt.legend() - plt.ylabel("accuracy (%)") - plt.xlabel("log(subsample size)") - plt.xscale("log") - with open("results.txt", "w") as f: - f.write(str(results_by_kernel)) - f.write("\n") - f.write(str(SUBSAMPLE_SIZES)) - plt.savefig('resultat_nystroem_svm_classifer.png', bbox_inches='tight') diff --git a/skluc/examples/so_conv_net.py b/skluc/examples/so_conv_net.py deleted file mode 100644 index 32b905a..0000000 --- a/skluc/examples/so_conv_net.py +++ /dev/null @@ -1,33 +0,0 @@ -import tensorflow as tf -import numpy as np -import skluc.data.mldatasets as dataset -from skluc.data.transformation import VGG19Cifar10BadTransformer - - -def semi_flatten_out_conv(data): - return data.reshape((data.shape[0], -1, data.shape[-1])) - - -if __name__ == '__main__': - val_size = 5000 - cifar10 = dataset.Cifar10Dataset(validation_size=val_size) - cifar10.load() - cifar10.to_image() - cifar10.to_one_hot() - cifar10.normalize() - cifar10.apply_transformer(VGG19Cifar10BadTransformer) - cifar10.normalize() - cifar10.data_astype(np.float32) - cifar10.labels_astype(np.float32) - cifar10.to_feature_vectors() - - X_train, Y_train = cifar10.train # -1 x 2 x 2 x 512 - X_val, Y_val = cifar10.validation - X_test, Y_test = cifar10.test - - - - - - - diff --git a/skluc/examples/tasks/classification/cifar10/__init__.py 
b/skluc/examples/tasks/classification/cifar10/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/skluc/examples/fc_cnn_cifar.py b/skluc/examples/tasks/classification/cifar10/fc_cnn_cifar.py similarity index 96% rename from skluc/examples/fc_cnn_cifar.py rename to skluc/examples/tasks/classification/cifar10/fc_cnn_cifar.py index fc7c755..d9d6896 100644 --- a/skluc/examples/fc_cnn_cifar.py +++ b/skluc/examples/tasks/classification/cifar10/fc_cnn_cifar.py @@ -7,9 +7,9 @@ where the input comes from memory. import tensorflow as tf import numpy as np -import skluc.data.mldatasets as dataset -from skluc.tensorflow_.utils import inference_cifar10, batch_generator -import matplotlib.pyplot as plt +import skluc.main.data.mldatasets as dataset +from skluc.main.tensorflow_.utils import inference_cifar10, batch_generator + tf.logging.set_verbosity(tf.logging.ERROR) import time as t diff --git a/skluc/examples/fc_dense_cifar_preprocessing_vgg19.py b/skluc/examples/tasks/classification/cifar10/fc_dense_cifar_preprocessing_vgg19.py similarity index 88% rename from skluc/examples/fc_dense_cifar_preprocessing_vgg19.py rename to skluc/examples/tasks/classification/cifar10/fc_dense_cifar_preprocessing_vgg19.py index 7956d7e..19eb294 100644 --- a/skluc/examples/fc_dense_cifar_preprocessing_vgg19.py +++ b/skluc/examples/tasks/classification/cifar10/fc_dense_cifar_preprocessing_vgg19.py @@ -1,18 +1,10 @@ -import time as t -import os - import tensorflow as tf import numpy as np -import matplotlib.pyplot as plt -from keras.losses import mean_squared_error -from keras.models import load_model, Model -import skluc.data.mldatasets as dataset -from skluc.data.transformation import VGG19Cifar10Transformer -from skluc.tensorflow_.utils import fully_connected, classification_cifar, batch_generator -from skluc.utils import logger, download_data -import matplotlib.pyplot as plt -from keras.metrics import categorical_accuracy, binary_accuracy +import skluc.main.data.mldatasets as dataset +from skluc.main.data import VGG19Cifar10Transformer +from skluc.main.tensorflow_.utils import fully_connected, classification_cifar, batch_generator +from skluc.main.utils import logger if __name__ == "__main__": VALIDATION_SIZE = 10000 @@ -26,8 +18,9 @@ if __name__ == "__main__": data.load() data.normalize() data.to_image() + data.apply_transformer(VGG19Cifar10Transformer()) data.to_one_hot() - data.apply_transformer(VGG19Cifar10Transformer) + data.normalize() data.flatten() data.data_astype(np.float32) data.labels_astype(np.float32) diff --git a/skluc/examples/tasks/classification/mnist/__init__.py b/skluc/examples/tasks/classification/mnist/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/skluc/examples/deepfriedConvnetMnist.py b/skluc/examples/tasks/classification/mnist/deepfriedConvnetMnist.py similarity index 95% rename from skluc/examples/deepfriedConvnetMnist.py rename to skluc/examples/tasks/classification/mnist/deepfriedConvnetMnist.py index 672c27d..f41fe7b 100644 --- a/skluc/examples/deepfriedConvnetMnist.py +++ b/skluc/examples/tasks/classification/mnist/deepfriedConvnetMnist.py @@ -12,9 +12,9 @@ Zichao Yang, Marcin Moczulski, Misha Denil, Nando de Freitas, Alex Smola, Le Son import tensorflow as tf import numpy as np -import skluc.data.mldatasets as dataset -from skluc.tensorflow_.utils import convolution_mnist, classification_mnist, batch_generator -from skluc.tensorflow_.kernel_approximation import fastfood_layer +import skluc.main.data.mldatasets as dataset +from 
skluc.main.tensorflow_.utils import convolution_mnist, classification_mnist, batch_generator +from skluc.main.tensorflow_.kernel_approximation import fastfood_layer tf.logging.set_verbosity(tf.logging.ERROR) diff --git a/skluc/examples/fc_cnn_mnist.py b/skluc/examples/tasks/classification/mnist/fc_cnn_mnist.py similarity index 97% rename from skluc/examples/fc_cnn_mnist.py rename to skluc/examples/tasks/classification/mnist/fc_cnn_mnist.py index 235fb8a..f42cd96 100644 --- a/skluc/examples/fc_cnn_mnist.py +++ b/skluc/examples/tasks/classification/mnist/fc_cnn_mnist.py @@ -7,8 +7,8 @@ where the input comes from memory. import tensorflow as tf import numpy as np -import skluc.data.mldatasets as dataset -from skluc.tensorflow_.utils import inference_mnist, batch_generator +import skluc.main.data.mldatasets as dataset +from skluc.main.tensorflow_.utils import inference_mnist, batch_generator tf.logging.set_verbosity(tf.logging.ERROR) diff --git a/skluc/examples/keras_fc_cnn_mnist.py b/skluc/examples/tasks/classification/mnist/keras_fc_cnn_mnist.py similarity index 98% rename from skluc/examples/keras_fc_cnn_mnist.py rename to skluc/examples/tasks/classification/mnist/keras_fc_cnn_mnist.py index 1aaf123..9fe75bf 100644 --- a/skluc/examples/keras_fc_cnn_mnist.py +++ b/skluc/examples/tasks/classification/mnist/keras_fc_cnn_mnist.py @@ -5,7 +5,7 @@ from keras.layers import Conv2D, Dense, Flatten, MaxPooling2D, Dropout from keras.callbacks import LearningRateScheduler, TensorBoard from keras.preprocessing.image import ImageDataGenerator from keras.regularizers import l2 -import skluc.data.mldatasets as dataset +import skluc.main.data.mldatasets as dataset batch_size = 128 epochs = 200 diff --git a/skluc/examples/tasks/classification/omniglot/scratch/omniglot_snell_28x28_vinyals.py b/skluc/examples/tasks/classification/omniglot/scratch/omniglot_snell_28x28_vinyals.py new file mode 100644 index 0000000..2f25700 --- /dev/null +++ b/skluc/examples/tasks/classification/omniglot/scratch/omniglot_snell_28x28_vinyals.py @@ -0,0 +1,114 @@ +import time + +import numpy as np +from keras import optimizers +from keras.callbacks import LearningRateScheduler, TensorBoard +from keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Activation +from keras.layers import Dense, Flatten +from keras.models import Sequential +from keras.preprocessing.image import ImageDataGenerator + +import skluc.main.data.mldatasets as dataset +from skluc.main.data.transformation.ResizeTransformer import ResizeTransformer +from skluc.main.utils import logger + + +def scheduler(epoch): + """ + Function to pass to the "LearningrateScheduler" + + :param epoch: + :return: + """ + if epoch < 80: + return 0.1 + if epoch < 160: + return 0.01 + return 0.001 + + +def model_definition(): + model = Sequential() + + model.add( + Conv2D(64, (3, 3), padding='same', kernel_initializer='he_normal', input_shape=input_shape)) + model.add(BatchNormalization()) + model.add(MaxPooling2D((2, 2), strides=(2, 2))) + model.add(Activation("relu")) + + model.add( + Conv2D(64, (3, 3), padding='same', kernel_initializer='he_normal', input_shape=input_shape)) + model.add(BatchNormalization()) + model.add(MaxPooling2D((2, 2), strides=(2, 2))) + model.add(Activation("relu")) + + model.add( + Conv2D(64, (3, 3), padding='same', kernel_initializer='he_normal', input_shape=input_shape)) + model.add(BatchNormalization()) + model.add(MaxPooling2D((2, 2), strides=(2, 2))) + model.add(Activation("relu")) + + model.add( + Conv2D(64, (3, 3), padding='same', 
kernel_initializer='he_normal', input_shape=input_shape)) + model.add(BatchNormalization()) + model.add(MaxPooling2D((2, 2), strides=(2, 2))) + model.add(Activation("relu")) + + + model.add(Flatten()) + + model.add(Dense(num_classes, kernel_initializer='he_normal')) + model.add(Activation("softmax")) + + sgd = optimizers.SGD(lr=.1, momentum=0.9, nesterov=True) + model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy']) + return model + + +if __name__ == "__main__": + logger.debug("Executing file {}".format(__file__)) + + validation_size = 10000 + seed = 0 + num_classes = 1200 + batch_size = 128 + epochs = 200 + dropout = 0.5 + weight_decay = 0.0001 + input_shape = (28, 28, 1) + log_filepath = r'./vinyals_logs/' + + data = dataset.OmniglotDataset(validation_size=1000, snell_preprocessing=True, seed=seed) + data.load() + data.normalize() + data.data_astype(np.float32) + data.labels_astype(np.float32) + data.to_image() + resizetrans = ResizeTransformer(data.s_name, (28, 28)) + data.apply_transformer(resizetrans) + data.to_one_hot() + (x_train, y_train), (x_test, y_test) = data.train, data.test + x_val, y_val = data.validation + print(x_train.shape, y_train.shape) + + model = model_definition() + + tb_cb = TensorBoard(log_dir=log_filepath, histogram_freq=0) + change_lr = LearningRateScheduler(scheduler) + cbks = [change_lr, tb_cb] + + print('Using real-time data augmentation.') + datagen = ImageDataGenerator(horizontal_flip=True, + width_shift_range=0.125, height_shift_range=0.125, fill_mode='constant', cval=0.) + + datagen.fit(x_train) + + iterations = int(data.train.data.shape[0] / batch_size) + model.fit_generator(datagen.flow(x_train, y_train, batch_size=batch_size), + steps_per_epoch=iterations, + epochs=epochs, + callbacks=cbks, + validation_data=(x_val, y_val)) + + model.save('{}_vinyals_omniglot_snell.h5'.format(time.time())) + print("Final evaluation on val set: {}".format(model.evaluate(x_val, y_val))) diff --git a/skluc/examples/tasks/classification/svhn/__init__.py b/skluc/examples/tasks/classification/svhn/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/skluc/examples/fc_dense_svhn_preprocessing_vgg19.py b/skluc/examples/tasks/classification/svhn/fc_dense_svhn_preprocessing_vgg19.py similarity index 87% rename from skluc/examples/fc_dense_svhn_preprocessing_vgg19.py rename to skluc/examples/tasks/classification/svhn/fc_dense_svhn_preprocessing_vgg19.py index 31e40df..16b9e60 100644 --- a/skluc/examples/fc_dense_svhn_preprocessing_vgg19.py +++ b/skluc/examples/tasks/classification/svhn/fc_dense_svhn_preprocessing_vgg19.py @@ -1,18 +1,10 @@ -import time as t -import os - import tensorflow as tf import numpy as np -import matplotlib.pyplot as plt -from keras.losses import mean_squared_error -from keras.models import load_model, Model -import skluc.data.mldatasets as dataset -from skluc.data.transformation import VGG19Cifar10Transformer, VGG19SvhnTransformer -from skluc.tensorflow_.utils import fully_connected, classification_cifar, batch_generator -from skluc.utils import logger, download_data -import matplotlib.pyplot as plt -from keras.metrics import categorical_accuracy, binary_accuracy +import skluc.main.data.mldatasets as dataset +from skluc.main.data import VGG19SvhnTransformer +from skluc.main.tensorflow_.utils import fully_connected, classification_cifar, batch_generator +from skluc.main.utils import logger if __name__ == "__main__": VALIDATION_SIZE = 10000 @@ -25,7 +17,7 @@ if __name__ == "__main__": data.load() data.normalize() 
# data.to_image() - data.apply_transformer(VGG19SvhnTransformer) + data.apply_transformer(VGG19SvhnTransformer(cut_layer_name="block5_conv4")) data.to_one_hot() data.flatten() data.data_astype(np.float32) diff --git a/skluc/examples/keras_fc_preprocess_svhn.py b/skluc/examples/tasks/classification/svhn/keras_fc_preprocess_svhn.py similarity index 89% rename from skluc/examples/keras_fc_preprocess_svhn.py rename to skluc/examples/tasks/classification/svhn/keras_fc_preprocess_svhn.py index f194bcb..3173674 100644 --- a/skluc/examples/keras_fc_preprocess_svhn.py +++ b/skluc/examples/tasks/classification/svhn/keras_fc_preprocess_svhn.py @@ -1,12 +1,11 @@ import numpy as np from keras import optimizers from keras.models import Sequential -from keras.layers import Conv2D, Dense, Flatten, MaxPooling2D, Dropout +from keras.layers import Dense, Dropout from keras.callbacks import LearningRateScheduler, TensorBoard -from keras.preprocessing.image import ImageDataGenerator from keras.regularizers import l2 -import skluc.data.mldatasets as dataset -from skluc.data.transformation import VGG19SvhnTransformer +import skluc.main.data.mldatasets as dataset +from skluc.main.data import VGG19SvhnTransformer batch_size = 128 epochs = 200 diff --git a/skluc/examples/tf_profiling.py b/skluc/examples/tf_profiling.py deleted file mode 100644 index 83f9893..0000000 --- a/skluc/examples/tf_profiling.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -Basic example of profiling with tensorflow dans chrome trace. - -Example in a tensorflow application -""" - -import tensorflow as tf -from tensorflow.python.client import timeline - - -if __name__ == '__main__': - - x = tf.random_normal([1000, 1000]) - y = tf.random_normal([1000, 1000]) - res = tf.matmul(x, y) - - # Run the graph with full trace option - with tf.Session() as sess: - run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - run_metadata = tf.RunMetadata() - sess.run(res, options=run_options, run_metadata=run_metadata) - - # Create the Timeline object, and write it to a json - tl = timeline.Timeline(run_metadata.step_stats) - ctf = tl.generate_chrome_trace_format() - with open('timeline.json', 'w') as f: - f.write(ctf) \ No newline at end of file diff --git a/skluc/examples/tfrecord_nn.py b/skluc/examples/tfrecord_nn.py deleted file mode 100644 index 90a5670..0000000 --- a/skluc/examples/tfrecord_nn.py +++ /dev/null @@ -1,217 +0,0 @@ -""" -Convolutional Neural Netwok implementation in tensorflow using tfrecords as inputs. - -From memory, the tfrecords ar written then read. - -The neural network is ran against the mnist dataset and we can see an example of distortion of input in -the case of tfrecords data source. 
-""" - -import tensorflow as tf -import numpy as np -import skluc.data.mldatasets as dataset -from skluc.convert_image_to_records import convert_to -from skluc.tensorflow_.utils import inference_mnist - -tf.logging.set_verbosity(tf.logging.ERROR) - -import time as t -from collections import namedtuple - -val_size = 5000 -mnist = dataset.MnistDataset(validation_size=val_size) -mnist.load() -mnist.to_one_hot() -mnist.to_image() -mnist.normalize() -mnist.data_astype(np.float32) -mnist.labels_astype(np.float32) - -X_train, Y_train = mnist.train -X_val, Y_val = mnist.validation -X_test, Y_test = mnist.test - -# build dataset objects - -train = namedtuple("Dataset", ["images", "labels", "num_examples"]) -train.images = X_train -train.labels = Y_train -train.num_examples = X_train.shape[0] - -test = namedtuple("Dataset", ["images", "labels", "num_examples"]) -test.images = X_test -test.labels = Y_test -test.num_examples = X_test.shape[0] - -val = namedtuple("Dataset", ["images", "labels", "num_examples"]) -val.images = X_val -val.labels = Y_val -val.num_examples = X_val.shape[0] - - -def decode(serialized_example): - features = tf.parse_single_example( - serialized_example, - features={ - 'image_raw': tf.FixedLenFeature([], tf.string), - 'label': tf.FixedLenFeature([], tf.int64), - 'height': tf.FixedLenFeature([], tf.int64), - 'width': tf.FixedLenFeature([], tf.int64), - 'depth': tf.FixedLenFeature([], tf.int64) - } - ) - image = tf.decode_raw(features['image_raw'], tf.float32) - image.set_shape((784)) - - label = tf.cast(features['label'], tf.int32) - label = tf.one_hot(label, 10) - - return image, label - - -def distortion(image, label): - """ - Apply som distortion to the input images - - :param image: - :param label: - :return: - """ - distorted_image = tf.image.random_brightness(image, - max_delta=15) - return distorted_image, label - - -def get_tf_record(record_filename, num_epochs, batch_size, distord=True): - """ - Gives an iteror to plug to the input of the neural network. - - The iterator gives a new bach on demand. - - :param record_filename: The filename where to find tfrecords. 
- :param num_epochs: The number of epoch we will need - :param batch_size: The size of each returned batch - :param distord: Parameter for applying or not distortion on input - :return: The iterator to plug to the input of the network - """ - dataset = tf.data.TFRecordDataset(record_filename) - dataset = dataset.repeat(num_epochs) - dataset = dataset.map(decode) - # if distord: - # dataset = dataset.map(distortion) - dataset = dataset.shuffle(1000 + 3 * batch_size) - dataset = dataset.batch(batch_size) # combine les éléments en batchs - iterator = dataset.make_one_shot_iterator() - return iterator - - -def trainning(): - SIGMA = 5.0 - num_epochs = 10000 - batch_size = 64 - print("Sigma = {}".format(SIGMA)) - - with tf.Graph().as_default(): - # retourne batchs après batchs - iterator_train = get_tf_record("/tmp/data/mnist/mnist_train.tfrecords", num_epochs, batch_size, distord=False) - iterator_val = get_tf_record("/tmp/data/mnist/mnist_val.tfrecords", num_epochs, batch_size) - iterator_test = get_tf_record("/tmp/data/mnist/mnist_test.tfrecords", num_epochs, batch_size) - - input_dim, output_dim = 784, 10 - - with tf.name_scope("train_set"): - x_image_train, y_train = iterator_train.get_next() - x_image_train = tf.reshape(x_image_train, [-1, 28, 28, 1]) - tf.summary.image("digit", x_image_train, max_outputs=3) - - with tf.name_scope("val_set"): - x_image_val, y_val = iterator_val.get_next() - x_image_val = tf.reshape(x_image_val, tf.stack([-1, 28, 28, 1])) - - with tf.name_scope("test_set"): - x_image_test, y_test = iterator_test.get_next() - x_image_test = tf.reshape(x_image_test, tf.stack([-1, 28, 28, 1])) - - with tf.variable_scope("inference") as scope_inference: - y_train_out, keep_prob_train = inference_mnist(x_image_train, output_dim) - scope_inference.reuse_variables() - y_val_out, keep_prob_val = inference_mnist(x_image_val, output_dim) - scope_inference.reuse_variables() - y_test_out, keep_prob_test = inference_mnist(x_image_test, output_dim) - - # calcul de la loss - with tf.name_scope("xent"): - cross_entropy = tf.reduce_mean( - tf.nn.softmax_cross_entropy_with_logits(labels=y_train, logits=y_train_out, name="xentropy"), - name="xentropy_mean") - tf.summary.scalar('loss-xent', cross_entropy) - - # calcul du gradient - with tf.name_scope("train"): - global_step = tf.Variable(0, name="global_step", trainable=False) - train_optimizer = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cross_entropy, global_step=global_step) - - # calcul de l'accuracy - with tf.name_scope("accuracy"): - predictions_val = tf.argmax(y_val_out, 1) - correct_prediction_val = tf.equal(predictions_val, tf.argmax(y_val, 1)) - accuracy_val = tf.reduce_mean(tf.cast(correct_prediction_val, tf.float32)) - tf.summary.scalar("accuracy", accuracy_val) - predictions_test = tf.argmax(y_test_out, 1) - correct_prediction_test = tf.equal(predictions_test, tf.argmax(y_test, 1)) - accuracy_test = tf.reduce_mean(tf.cast(correct_prediction_test, tf.float32)) - - merged_summary = tf.summary.merge_all() - - init = tf.global_variables_initializer() - # Create a session for running Ops on the Graph. - sess = tf.Session() - # Instantiate a SummaryWriter to output summaries and the Graph. 
- summary_writer = tf.summary.FileWriter("/tmp/results_tfrecord_nn") - summary_writer.add_graph(sess.graph) - # Initialize all Variable objects - sess.run(init) - # actual learning - started = t.time() - feed_dict = { - keep_prob_val: 1.0, - keep_prob_train: 0.5, - keep_prob_test: 1.0 - } - # todo maybe there is a misunderstanding of epoch definition here. - for i in range(num_epochs): - # run training and get the loss - _, loss = sess.run([train_optimizer, cross_entropy], feed_dict=feed_dict) - - if i % 100 == 0: - print('step {}, loss {} (with dropout)'.format(i, loss)) - r_accuracy = sess.run([accuracy_val], feed_dict=feed_dict) - print("accuracy: {} on validation set (without dropout).".format(r_accuracy)) - summary_str = sess.run(merged_summary, feed_dict=feed_dict) - summary_writer.add_summary(summary_str, i) - - stoped = t.time() - accuracy_eval, preds_eval, exp_eval = sess.run([accuracy_test, predictions_test, y_test], feed_dict=feed_dict) - print('test accuracy %g' % accuracy_eval) - np.set_printoptions(threshold=np.nan) - print("Prediction sample: " + str(preds_eval[:50])) - print("Actual values: " + str(np.argmax(exp_eval[:50], axis=1))) - print("Elapsed time: %.4f s" % (stoped - started)) - - -def create_records(): - import os - directory = "/tmp/data/mnist" - - if not os.path.exists(os.path.join(directory, "mnist_train.tfrecords")): - convert_to(train, "mnist_train", directory) - convert_to(test, "mnist_test", directory) - convert_to(val, "mnist_val", directory) - - -if __name__ == '__main__': - create_records() - trainning() - - - diff --git a/skluc/examples/time_batch_subsample_ops.py b/skluc/examples/time_batch_subsample_ops.py deleted file mode 100644 index 2541afe..0000000 --- a/skluc/examples/time_batch_subsample_ops.py +++ /dev/null @@ -1,86 +0,0 @@ -import tensorflow as tf -import numpy as np -from sklearn.metrics.pairwise import rbf_kernel - -import skluc.data.mldatasets as dataset -from skluc.tensorflow_.utils import fully_connected, get_next_batch, tf_op, conv_relu_pool -from skluc.utils import time_fct -from skluc.tensorflow_.kernel import tf_rbf_kernel - -tf.logging.set_verbosity(tf.logging.ERROR) - -# Preparing the dataset ######################### - -mnist = dataset.MnistDataset() -mnist.load() -X_train, _ = mnist.train -X_train = np.array(X_train / 255) -X_train = X_train.astype(np.float32) - -################################################ - -# todo timer les autres kernels pour verifier que c'est effectivement plus rapide - -if __name__ == '__main__': - input_dim = X_train.shape[1] - output_dim_fc = 4096*2 - batch_size = 500 - subsample_size = 500 - X_batch = get_next_batch(X_train, 0, batch_size) - X_subsample = get_next_batch(X_train, 0, subsample_size) - - with tf.Graph().as_default(): - # inputs - x = tf.placeholder(tf.float32, shape=[None, input_dim], name="x") - x_subsample = tf.placeholder(tf.float32, shape=[None, input_dim], name="x_subsample") - - # reshape vector inputs to images - side_size = int(np.sqrt(input_dim)) - x_image = tf.reshape(x, [-1, side_size, side_size, 1]) - x_subsample_image = tf.reshape(x_subsample, [subsample_size, side_size, side_size, 1]) - - # fully connected ops - out_fc_x = fully_connected(x, output_dim_fc, act=tf.nn.relu, variable_scope="fc_x") - out_fc_subsample = fully_connected(x_subsample, output_dim_fc, act=tf.nn.relu, variable_scope="fc_subsample") - - # convolution ops - out_conv_x = conv_relu_pool(x_image, [5, 5, 1, 20], [20], pool_size=3, variable_scope="conv_x") - out_conv_subsample = 
conv_relu_pool(x_subsample_image, [5, 5, 1, 20], [20], pool_size=3, - variable_scope="conv_subsample") - - init_dim = np.prod([s.value for s in out_conv_x.shape[1:] if s.value is not None]) - x_conv_flat = tf.reshape(out_conv_x, [-1, init_dim]) - subsample_conv_flat = tf.reshape(out_conv_subsample, [subsample_size, init_dim]) - - # kernel computing ops - with tf.device('/cpu:0'): - kernel_cpu = tf_rbf_kernel(x_conv_flat, subsample_conv_flat, gamma=0.001) - with tf.device('/device:GPU:0'): - kernel_gpu = tf_rbf_kernel(x_conv_flat, subsample_conv_flat, gamma=0.001) - - feed_dict = {x: X_batch, x_subsample: X_subsample} - - def kernel_sklearn(): - with tf.Session() as sess: - init = tf.global_variables_initializer() - sess.run([init]) - x, y = sess.run([x_conv_flat, subsample_conv_flat], feed_dict=feed_dict) - rbf_kernel(x, y, gamma=0.001) - - d_time_results = { - "fc_x": lambda: time_fct(lambda: tf_op(feed_dict, [out_fc_x]), n_iter=10), - "fc_subsample": lambda: time_fct(lambda: tf_op(feed_dict, [out_fc_subsample]), n_iter=10), - "reshape_x": lambda: time_fct(lambda: tf_op(feed_dict, [x_image]), n_iter=10), - "reshape_subsample": lambda: time_fct(lambda: tf_op(feed_dict, [x_subsample_image]), n_iter=10), - "reshape_x + conv_x": lambda: time_fct(lambda: tf_op(feed_dict, [out_conv_x]), n_iter=10), - "reshape_subsample + conv_subsample": lambda: time_fct(lambda: tf_op(feed_dict, [out_conv_subsample]), n_iter=10), - "reshape_x + conv_x + reshape_subsample + conv_subsample": lambda: time_fct(lambda: tf_op(feed_dict, [out_conv_x, out_conv_subsample]), n_iter=10), - "reshape_x + conv_x + reshape_subsample + conv_subsample + kernel_cpu": lambda: time_fct(lambda: tf_op(feed_dict, [kernel_cpu]), n_iter=10), - "reshape_x + conv_x + reshape_subsample + conv_subsample + kernel_gpu": lambda: time_fct(lambda: tf_op(feed_dict, [kernel_gpu]), n_iter=10), - "reshape_x + conv_x + reshape_subsample + conv_subsample + kernel_sklearn": lambda: time_fct(kernel_sklearn, n_iter=10) - } - - for key, value in d_time_results.items(): - print("{}:\t{:.4f}s".format(key, value())) - tf.reset_default_graph() - diff --git a/skluc/examples/write_read_tfrecords.py b/skluc/examples/write_read_tfrecords.py deleted file mode 100644 index e3c3703..0000000 --- a/skluc/examples/write_read_tfrecords.py +++ /dev/null @@ -1,77 +0,0 @@ -""" -Example (with mnist) on how to read and write tf records. 
-""" - -import tensorflow as tf -import os -import numpy as np -import skluc.data.mldatasets as dataset -from skluc.convert_image_to_records import convert_to -import matplotlib.pyplot as plt - -tf.logging.set_verbosity(tf.logging.ERROR) - -from collections import namedtuple - -val_size = 5000 -mnist = dataset.MnistDataset(validation_size=val_size) -mnist.load() -mnist.to_one_hot() -mnist.normalize() -mnist.to_image() -mnist.data_astype(np.float32) -mnist.labels_astype(np.float32) - -X_train, Y_train = mnist.train -X_val, Y_val = mnist.validation -X_test, Y_test = mnist.test - - -train = namedtuple("Dataset", ["images", "labels", "num_examples"]) -train.images = X_train -train.labels = Y_train -train.num_examples = X_train.shape[0] - -test = namedtuple("Dataset", ["images", "labels", "num_examples"]) -test.images = X_test -test.labels = Y_test -test.num_examples = X_test.shape[0] - -val = namedtuple("Dataset", ["images", "labels", "num_examples"]) -val.images = X_val -val.labels = Y_val -val.num_examples = X_val.shape[0] - -DIRECTORY = "/tmp/data/mnist" - - -def write_tf_record_mnist(): - convert_to(train, "mnist_train", DIRECTORY) - convert_to(test, "mnist_test", DIRECTORY) - convert_to(val, "mnist_val", DIRECTORY) - - -def read_tf_record_mnist(): - reconstructed_images = [] - record_iterator = tf.python_io.tf_record_iterator(path=os.path.join(DIRECTORY, "mnist_train.tfrecords")) - for string_record in record_iterator: - example = tf.train.Example() - example.ParseFromString(string_record) - height = int(example.features.feature["height"].int64_list.value[0]) - width = int(example.features.feature["width"].int64_list.value[0]) - depth = int(example.features.feature["depth"].int64_list.value[0]) - img_string = (example.features.feature["image_raw"].bytes_list.value[0]) - label = (example.features.feature["label"].int64_list.value[0]) - - img_1d = np.fromstring(img_string, dtype=np.uint32) - reconstructed_img = img_1d.reshape((height, width, depth)) - reconstructed_images.append((reconstructed_img, label)) - - plt.imshow(reconstructed_images[0][0][:,:,0]) - plt.show() - print(reconstructed_images[0][1]) - - -if __name__ == '__main__': - # write_tf_record_mnist() - read_tf_record_mnist() \ No newline at end of file diff --git a/skluc/main/data/mldatasets/Dataset.py b/skluc/main/data/mldatasets/Dataset.py index 1aa8459..d01ae87 100644 --- a/skluc/main/data/mldatasets/Dataset.py +++ b/skluc/main/data/mldatasets/Dataset.py @@ -77,7 +77,7 @@ class Dataset(object): get_nbr)) idx_labs = np.where(bool_idx_labs)[0][:get_nbr] # return_idx_labels[copy_idx:copy_idx+get_nbr] = idx_labs - logger.debug("Found indexes for label {}: {}; length: {}".format(u_lab, idx_labs, len(idx_labs))) + # logger.debug("Found indexes for label {}: {}; length: {}".format(u_lab, idx_labs, len(idx_labs))) return_idx_labels.extend(idx_labs) copy_idx += get_nbr @@ -129,6 +129,7 @@ class Dataset(object): @property def validation(self): + # todo doesn't work if val size = 0 return LabeledData(data=self._train.data[self.permuted_index_validation], labels=self._train.labels[self.permuted_index_validation]) @@ -273,6 +274,7 @@ class Dataset(object): if self._train is not None: logger.debug("Construction of random train indexes (seed: {})".format(self.seed)) np.random.seed(self.seed) + # todo -> faire argument shuffle or not permut = np.random.permutation(self._train[0].shape[0]) if self.validation_size > 0: self.permuted_index_train = permut[:-self.validation_size] diff --git a/skluc/main/data/mldatasets/ImageDataset.py 
b/skluc/main/data/mldatasets/ImageDataset.py index 4151cd3..92053d5 100644 --- a/skluc/main/data/mldatasets/ImageDataset.py +++ b/skluc/main/data/mldatasets/ImageDataset.py @@ -43,8 +43,7 @@ class ImageDataset(Dataset): data, labels = getattr(self, kw) transformed_data, transformed_labels = transformer.transform(data, labels) setattr(self, kw, LabeledData(data=transformed_data, labels=transformed_labels)) - - self.save_npz(transform_path) + self.save_npz(transform_path) def to_image(self): """ diff --git a/skluc/data/mldatasets/MovieReviewDataset.py b/skluc/main/data/mldatasets/MovieReviewDataset.py similarity index 90% rename from skluc/data/mldatasets/MovieReviewDataset.py rename to skluc/main/data/mldatasets/MovieReviewDataset.py index 9d4e28b..97080db 100644 --- a/skluc/data/mldatasets/MovieReviewDataset.py +++ b/skluc/main/data/mldatasets/MovieReviewDataset.py @@ -118,6 +118,8 @@ class MovieReviewV1Dataset(Dataset): @property def train(self): + # todo no guarantee on the stratification of classes + indexes = self.permuted_index_train[:self.TRAIN_SIZE - self.validation_size] return LabeledData(data=self._train.data[indexes], labels=self._train.labels[indexes]) @@ -182,25 +184,3 @@ class MovieReviewV1Dataset(Dataset): negative_labels = [[1, 0] for _ in negative_examples] y = np.concatenate([positive_labels, negative_labels], 0) return LabeledData(data=x_text, labels=y) - - # todo not yet sure the following is usefull - # @staticmethod - # def batch_iter(data, batch_size, num_epochs, shuffle=True): - # """ - # Generates a batch iterator for a dataset. - # """ - # data = np.array(data) - # data_size = len(data) - # num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1 - # for epoch in range(num_epochs): - # # Shuffle the data at each epoch - # if shuffle: - # shuffle_indices = np.random.permutation(np.arange(data_size)) - # shuffled_data = data[shuffle_indices] - # else: - # shuffled_data = data - # for batch_num in range(num_batches_per_epoch): - # start_index = batch_num * batch_size - # end_index = min((batch_num + 1) * batch_size, data_size) - # - # yield shuffled_data[start_index:end_index] diff --git a/skluc/main/data/mldatasets/OmniglotDataset.py b/skluc/main/data/mldatasets/OmniglotDataset.py index e7141fb..22bcdce 100644 --- a/skluc/main/data/mldatasets/OmniglotDataset.py +++ b/skluc/main/data/mldatasets/OmniglotDataset.py @@ -16,12 +16,23 @@ class OmniglotDataset(ImageDataset): WIDTH = 105 DEPTH = 1 - def __init__(self, validation_size=0, seed=None, s_download_dir=None): + def __init__(self, validation_size=0, seed=None, s_download_dir=None, snell_preprocessing=False): + """ + + :param validation_size: + :param seed: + :param s_download_dir: + :param snell_preprocessing: should the data preprocessing used in prototypical be used on omniglot + """ self.__s_url = ["https://github.com/brendenlake/omniglot/raw/master/python/images_background.zip", "https://github.com/brendenlake/omniglot/raw/master/python/images_evaluation.zip" ] self.meta = None + self.__snell_preprocessing = snell_preprocessing name = "omniglot" + if self.__snell_preprocessing: + name += "_snell" + if s_download_dir is not None: super().__init__(self.__s_url, name, s_download_dir, validation_size=validation_size, seed=seed) else: @@ -101,14 +112,36 @@ class OmniglotDataset(ImageDataset): logger.debug("Files {} have already been extracted".format(self.l_filepaths)) logger.debug("Get training data of dataset {}".format(self.s_name)) - self._train = LabeledData(*self.get_omniglot_data('background')) + 
background_data = LabeledData(*self.get_omniglot_data('background')) logger.debug("Get testing data of dataset {}".format(self.s_name)) - self._test = LabeledData(*self.get_omniglot_data('evaluation')) + evaluation_data = LabeledData(*self.get_omniglot_data('evaluation')) + + if self.__snell_preprocessing: + nb_class_bg_snell = 1200 + unique_labels_train = np.unique(background_data.labels, axis=0) + nb_labels_train = len(unique_labels_train) + nb_class_to_move = nb_class_bg_snell - nb_labels_train + unique_labels_test = np.unique(evaluation_data.labels, axis=0) + labels_to_move = unique_labels_test[:nb_class_to_move] + bool_idx_data_to_move = np.zeros(len(evaluation_data.labels), dtype=bool) + for label in labels_to_move: + bool_idx_label = OmniglotDataset.get_bool_idx_label(label, evaluation_data.labels) + bool_idx_data_to_move = np.logical_or(bool_idx_data_to_move, bool_idx_label) + + labels_to_add = evaluation_data.labels[bool_idx_data_to_move] + np.max(background_data.labels) + 1 + self._train = LabeledData(data=np.vstack([background_data.data, evaluation_data.data[bool_idx_data_to_move]]), + labels=np.hstack([background_data.labels, labels_to_add ])) + self._test = LabeledData(data=evaluation_data.data[np.logical_not(bool_idx_data_to_move)], + labels=evaluation_data.labels[np.logical_not(bool_idx_data_to_move)] - nb_class_to_move) + else: + self._train = background_data + self._test = evaluation_data self._check_validation_size(self._train[0].shape[0]) self.save_npz() + logger.debug("Number of labels in train set {}".format(len(np.unique(self._train.labels, axis=0)))) logger.debug("Number of labels in evaluation set {}".format(len(np.unique(self._test.labels, axis=0)))) diff --git a/skluc/main/data/mldatasets/RPSDataset.py b/skluc/main/data/mldatasets/RPSDataset.py new file mode 100644 index 0000000..78e5059 --- /dev/null +++ b/skluc/main/data/mldatasets/RPSDataset.py @@ -0,0 +1,125 @@ +import os +import zipfile + +import numpy as np +import imageio +import matplotlib.pyplot as plt + +from skluc.utils import LabeledData, create_directory +from skluc.data.mldatasets.ImageDataset import ImageDataset +from skluc.utils import logger, check_files + + +class RPSDataset(ImageDataset): + data_groups_private = ["_train"] + HEIGHT = 50 + WIDTH = 50 + DEPTH = 3 + TRAIN_SIZE = 600 + + def __init__(self, validation_size=0, seed=None, s_download_dir=None): + self.__s_url = ["https://pageperso.lif.univ-mrs.fr/~luc.giffon/datasets/rps_data_resize.zip"] + self.meta = None + name = "rps" + if s_download_dir is not None: + super().__init__(self.__s_url, name, s_download_dir, validation_size=validation_size, seed=seed) + else: + super().__init__(self.__s_url, name, validation_size=validation_size, seed=seed) + + self.__extracted_dirs = [ + os.path.join(self.s_download_dir, "images_background"), + os.path.join(self.s_download_dir, "images_evaluation") + ] + + def get_rps_data(self): + data_dirname = "rps_data_resize" + data_dirpath = os.path.join(self.s_download_dir, data_dirname) + class_index = 0 + list_of_images = [] + list_of_labels = [] + for symbol_name in os.listdir(data_dirpath): + data_symbol_path = os.path.join(data_dirpath, symbol_name) + for symbol_image_file in os.listdir(data_symbol_path): + symbol_image_path = os.path.join(data_symbol_path, symbol_image_file) + im = imageio.imread(symbol_image_path) + list_of_images.append(im) + list_of_labels.append(class_index) + class_index += 1 + data = np.array(list_of_images) + labels = np.array(list_of_labels) + data = data.reshape(data.shape[0], 
self.WIDTH*self.HEIGHT, self.DEPTH, order="C") + data = data.reshape(data.shape[0], self.WIDTH*self.HEIGHT*self.DEPTH, order="F") + return data, labels + + def read(self): + npzdir_path = os.path.join(self.s_download_dir, "npzfiles") + lst_npzfile_paths = [os.path.join(npzdir_path, kw + ".npz") + for kw in self.data_groups_private] + create_directory(npzdir_path) + if check_files(lst_npzfile_paths): + # case npz files already exist + logger.debug("Files {} already exists".format(lst_npzfile_paths)) + logger.info("Loading transformed data from files {}".format(lst_npzfile_paths)) + for kw in self.data_groups_private: + npzfile_path = os.path.join(npzdir_path, kw + ".npz") + logger.debug("Loading {}".format(npzfile_path)) + npzfile = np.load(npzfile_path) + data = npzfile[kw + "_data"] + logger.debug("Shape of {} set: {}".format(kw, data.shape)) + labels = npzfile[kw + "_labels"] + setattr(self, kw, LabeledData(data=data, labels=labels)) + else: + + if not check_files(self.__extracted_dirs): + # case zip files dont even exist + logger.debug("Extracting {} ...".format(self.l_filepaths)) + for zip_file in self.l_filepaths: + zip_ref = zipfile.ZipFile(zip_file, 'r') + zip_ref.extractall(self.s_download_dir) + zip_ref.close() + else: + logger.debug("Files {} have already been extracted".format(self.l_filepaths)) + + full_data, full_labels = self.get_rps_data() + logger.debug("Get training data of dataset {}".format(self.s_name)) + self._train = LabeledData(data=full_data, labels=full_labels) + # self._test = LabeledData(data=np.array([]), labels=np.array([])) + # + # logger.debug("Get testing data of dataset {}".format(self.s_name)) + # self._test = LabeledData(*self.get_omniglot_data('evaluation')) + # + self._check_validation_size(self._train[0].shape[0]) + + self.save_npz() + + + @property + def train(self): + indexes = self.permuted_index_train[:self.TRAIN_SIZE - self.validation_size] + return LabeledData(data=self._train.data[indexes], + labels=self._train.labels[indexes]) + + @property + def test(self): + indexes = self.permuted_index_train[self.TRAIN_SIZE:] + return LabeledData(data=self._train.data[indexes], + labels=self._train.labels[indexes]) + + @property + def validation(self): + indexes = self.permuted_index_train[(self.TRAIN_SIZE - self.validation_size):self.TRAIN_SIZE] + return LabeledData(data=self._train.data[indexes], + labels=self._train.labels[indexes]) + + +if __name__ == "__main__": + import time + d = RPSDataset(validation_size=100) + d.load() + d.to_image() + print(d.train.data.shape) + for i, im in enumerate(d.train.data): + plt.imshow(im) + plt.show() + print(d.train.labels[i]) + time.sleep(1) \ No newline at end of file diff --git a/skluc/main/data/transformation/VinyalsTransformer.py b/skluc/main/data/transformation/VinyalsTransformer.py index 9658af6..bc971a4 100644 --- a/skluc/main/data/transformation/VinyalsTransformer.py +++ b/skluc/main/data/transformation/VinyalsTransformer.py @@ -16,8 +16,12 @@ class VinyalsTransformer(KerasModelTransformer, metaclass=Singleton): checksum="a0b815ad2ab81092c75d129f511b2bdb" ), "omniglot_28x28": DownloadableModel( - url="https://pageperso.lis-lab.fr/~luc.giffon/models/1536742266.9412131_vinyals_omniglot.h5", + url="https://pageperso.lis-lab.fr/luc.giffon/models/1536742266.9412131_vinyals_omniglot_28x28.h5", checksum="6460eb1b7eaa478301a281b12ecd2461" + ), + "omniglot_snell": DownloadableModel( + url="https://pageperso.lis-lab.fr/~luc.giffon/models/1537524783.0678186_vinyals_omniglot_snell.h5", + 
            checksum="28a6e4e3748d9971e0450000895ce423"
        )
    }

diff --git a/skluc/main/data/transformation/tCNNTransformer/__init__.py b/skluc/main/data/transformation/tCNNTransformer/__init__.py
new file mode 100644
index 0000000..135afb5
--- /dev/null
+++ b/skluc/main/data/transformation/tCNNTransformer/__init__.py
@@ -0,0 +1,29 @@
+import os
+
+import numpy as np
+from keras import Model
+from skluc.utils import logger, deprecated
+
+
+@deprecated
+class tCNNTransformer:
+    """
+    Transform text data with a textCNN transformer.
+    """
+    def __init__(self, name, download_dir=os.path.join(os.path.expanduser("~"), "ml_models")):
+        self.tcnn_model = None
+        self.s_download_dir = os.path.join(download_dir, name)
+        super().__init__()
+
+    def transform(self, data, labels):
+        # todo make this more generic across transformers
+        model = Model(inputs=self.tcnn_model.input, outputs=self.tcnn_model.output)
+        logger.debug("Type of data to transform: {}".format(type(data)))
+        logger.debug("Length of data to transform: {}".format(len(data)))
+        logger.debug("Transforming data using pretrained model")
+        transformed_data = np.array(model.predict(data)).reshape(-1, *model.output_shape[1:])
+        logger.debug("Type of transformed data: {}".format(type(transformed_data)))
+        return transformed_data, labels
+
+    def check_model(self):
+        raise NotImplementedError
diff --git a/skluc/test/test_data/test_mldatasets/TestCifar100Dataset.py b/skluc/test/test_data/test_mldatasets/TestCifar100Dataset.py
new file mode 100644
index 0000000..e69de29
diff --git a/skluc/test/test_data/test_mldatasets/TestImageDataset.py b/skluc/test/test_data/test_mldatasets/TestImageDataset.py
new file mode 100644
index 0000000..e69de29
diff --git a/skluc/test/test_data/test_mldatasets/TestSVHNDataset.py b/skluc/test/test_data/test_mldatasets/TestSVHNDataset.py
new file mode 100644
index 0000000..e69de29
--
GitLab