diff --git a/skluc/data/mldatasets/MnistDataset.py b/skluc/data/mldatasets/MnistDataset.py
index cef4adfb831013d9d9371f32fdd8c6786a194356..5388f3e57b6dc0ebba83973c631c7054874b10b0 100644
--- a/skluc/data/mldatasets/MnistDataset.py
+++ b/skluc/data/mldatasets/MnistDataset.py
@@ -1,5 +1,6 @@
 import gzip
 import os
+import struct
 
 import numpy as np
 
diff --git a/skluc/data/transformation.py b/skluc/data/transformation.py
deleted file mode 100644
index f2ff71a58e86d347f8fe190735b8058b08948ad0..0000000000000000000000000000000000000000
--- a/skluc/data/transformation.py
+++ /dev/null
@@ -1,386 +0,0 @@
-import os
-
-import numpy as np
-from keras import Model
-from keras.applications import VGG19
-from keras.models import load_model
-
-from skluc.data.mldatasets import MnistDataset, Cifar10Dataset
-from skluc.utils import logger, create_directory, download_data, check_file_md5, deprecated, Singleton
-
-
-class LecunTransformer:
-    """
-    Uses the lenet network to transform input data.
-    """
-    def __init__(self, name, download_dir=os.path.join(os.path.expanduser("~"), "ml_models")):
-        self.lecun_conv_model = None
-        self.s_download_dir = os.path.join(download_dir, name)
-        super().__init__()
-
-    def transform(self, data, labels):
-        # todo regrouper les transformers? enormement de similitude entre
-        # lecuntransformer et vgg19transformer
-        model = Model(inputs=self.lecun_conv_model.input, outputs=self.lecun_conv_model.output)
-        logger.debug("Type of data to transform: {}".format(type(data)))
-        logger.debug("Shape of data to transform: {}".format(data.shape))
-        logger.debug("Transforming data using pretrained model")
-        transformed_data = np.array(model.predict(data)).reshape(-1, *model.output_shape[1:])
-        logger.debug("Type of transformed data: {}".format(type(transformed_data)))
-        return transformed_data, labels
-
-    def check_model(self):
-        return
-
-
-class LecunMnistTransformer(LecunTransformer, metaclass=Singleton):
-    """
-    Extends lecuntransformer with convolutions learned on mnist.
-    """
-    NAME = "lecun_mnist"
-    # todo a faire
-    MODEL_URL = "https://pageperso.lis-lab.fr/~luc.giffon/models/1524640419.938414_lenet_mnist.h5"
-    MODEL_CHECKSUM = "527d7235c213278df1d15d3fe685eb5c"
-
-    # todo faire une fonction qui regarde directement le checksum sur le site ?
-
-    def __init__(self):
-        super().__init__(name=self.NAME)
-
-    def load(self):
-        create_directory(self.s_download_dir)
-        s_model_path = download_data(self.MODEL_URL, self.s_download_dir)
-        check_file_md5(s_model_path, self.__class__.MODEL_CHECKSUM)
-        if self.lecun_conv_model is None:
-            logger.debug("Loading VGG19 model with cifar10 weights")
-            self.lecun_conv_model = load_model(s_model_path)
-            self.lecun_conv_model = Model(inputs=self.lecun_conv_model.input,
-                                          outputs=self.lecun_conv_model.get_layer('conv_pool_2').output)
-        else:
-            logger.debug("Skip loading model Lecun model with mnist weights. Already there.")
-
-    def transform(self, data, labels):
-        if len(data.shape) != 4:
-            raise AssertionError("Data shape should be of size 4 (image batch with channel dimension). "
-                                 "It is {}: {}. Maybe have you forgotten to reshape it to an image format?"
- "".format(len(data.shape), data.shape)) - self.load() - return super().transform(data, labels) - - def check_model(self): - name = os.path.basename(os.path.normpath(self.__class__.MODEL_URL)) - s_file_path = os.path.join(self.s_download_dir, name) - if os.path.exists(s_file_path) and check_file_md5(s_file_path, - self.__class__.MODEL_CHECKSUM, - raise_=False): - return True - else: - return False - - -class VGG19Transformer: - """ - Uses the vgg19 convolution network to transform data. - """ - NAME = None - MODEL_URL = None - MODEL_CHECKSUM = None - - def __init__(self, cut_layer_name=None, cut_layer_index=None, download_dir=os.path.join(os.path.expanduser("~"), "ml_models")): - self.vgg_conv_model = None - self.s_download_dir = os.path.join(download_dir, self.NAME) - super().__init__() - - if cut_layer_name is None: - cut_layer_name = "block5_pool" - self.__cut_layer_name = cut_layer_name - self.__cut_layer_index = cut_layer_index - if cut_layer_name is not None: - self.NAME = self.NAME + "_" + str(cut_layer_name) - elif cut_layer_index is not None: - self.NAME = self.NAME + "_" + str(cut_layer_index) - else: - raise AttributeError("Cut layer name or cut_layer index must be given to init VGG19Cifar10Transformer.") - - def load(self): - create_directory(self.s_download_dir) - s_model_path = download_data(self.MODEL_URL, self.s_download_dir) - check_file_md5(s_model_path, self.__class__.MODEL_CHECKSUM) - if self.vgg_conv_model is None: - logger.debug("Loading VGG19 model for {} transformer".format(self.__class__.__name__)) - self.vgg_conv_model = load_model(s_model_path) - - if self.__cut_layer_name is not None: - self.vgg_conv_model = Model(inputs=self.vgg_conv_model.input, - outputs=self.vgg_conv_model.get_layer(name=self.__cut_layer_name).output) - elif self.__cut_layer_index is not None: - self.vgg_conv_model = Model(inputs=self.vgg_conv_model.input, - outputs=self.vgg_conv_model.get_layer(name=self.__cut_layer_index).output) - - else: - logger.debug("Skip loading model VGG19 for {} transformer. Already there.".format(self.__class__.__name__)) - - def transform(self, data, labels): - if len(data.shape) != 4: - raise AssertionError("Data shape should be of size 4 (image batch with channel dimension). " - "It is {}: {}. Maybe have you forgotten to reshape it to an image format?" - "".format(len(data.shape), data.shape)) - self.load() - model = Model(inputs=self.vgg_conv_model.input, outputs=self.vgg_conv_model.output) - logger.debug("Type fo data to transform: {}".format(type(data))) - logger.debug("Length of data to transform: {}".format(len(data))) - logger.debug("Transforming data using pretrained model") - transformed_data = np.array(model.predict(data)).reshape(-1, *model.output_shape[1:]) - logger.debug("Type of transformed data: {}".format(type(transformed_data))) - return transformed_data, labels - - def check_model(self): - name = os.path.basename(os.path.normpath(self.__class__.MODEL_URL)) - s_file_path = os.path.join(self.s_download_dir, name) - if os.path.exists(s_file_path) and check_file_md5(s_file_path, - self.__class__.MODEL_CHECKSUM, - raise_=False): - return True - else: - return False - - -class VGG19SvhnTransformer(VGG19Transformer, metaclass=Singleton): - """ - Extend the vgg19transformer class with weights learned on SVHN. 
- """ - NAME = "vgg19_svhn" - MODEL_URL = "https://pageperso.lis-lab.fr/~luc.giffon/models/1529968150.5454917_vgg19_svhn.h5" - MODEL_CHECKSUM = "563a9ec2aad37459bd1ed0e329441b05" - - -class VGG19Cifar100Transformer(VGG19Transformer, metaclass=Singleton): - """ - Extend the vgg19transformer class with weights learned on CIFAR10. - """ - NAME = "vgg19_cifar100" - MODEL_URL = "https://pageperso.lis-lab.fr/~luc.giffon/models/1530965727.781668_vgg19_cifar100fine.h5" - MODEL_CHECKSUM = "edf43e263fec05e2c013dd5a2128fc38" - - # todo faire une fonction qui regarde directement le checksum sur le site ? - - -class VGG19Cifar10Transformer(VGG19Transformer, metaclass=Singleton): - """ - Extend the vgg19transformer class with weights learned on CIFAR10. - """ - NAME = "vgg19_cifar10" - MODEL_URL = "https://pageperso.lis-lab.fr/~luc.giffon/models/1522967518.1916964_vgg19_cifar10.h5" - MODEL_CHECKSUM = "0dbb4f02ceb1f4acb6e24831758106e5" - - -# todo check those deprecated things -@deprecated -class VGG19Cifar10CovAbs(VGG19Transformer): - """ - Extend the vgg19transformer class with weights learned on CIFAR10. - The covariance matrix is then computed on the transformed data image. - """ - NAME = "vgg19_cifar10_cov" - MODEL_URL = "https://pageperso.lis-lab.fr/~luc.giffon/models/1522967518.1916964_vgg19_cifar10.h5" - MODEL_CHECKSUM = "0dbb4f02ceb1f4acb6e24831758106e5" - # todo faire une fonction qui regarde directement le checksum sur le site ? - - def __init__(self): - super().__init__(name=self.NAME) - - @staticmethod - def _compute_cov_matrix(data): - """ - - :param data: (b x W x H x D) - :type data: np.ndarray - :return: - """ - data = data.reshape((data.shape[0], data.shape[1] * data.shape[2], data.shape[3])) - mean = np.mean(data, axis=1) - mean = mean.reshape((mean.shape[0], 1, mean.shape[-1])) - data_centered = data - mean - - cov_mat = [] - for i, mat in enumerate(data_centered): - cov_mat.append(mat.T.dot(mat)) - if i % 1000 == 0: - logger.debug("Computing covariance matrix - step {}/{}".format(i, len(data_centered))) - cov_mat = 1. / data.shape[1] * np.array(cov_mat) - logger.debug("Final covariance matrix shape: {}".format(str(cov_mat.shape))) - return cov_mat - - @staticmethod - def _compute_log_matrix(data): - log_mat = [] - for i, mat in enumerate(data): - U, S, V = np.linalg.svd(mat, full_matrices=False) - log_mat.append(U.dot(np.diag(np.log(S))).dot(V)) - if i % 1000 == 0: - logger.debug("Computing log matrix - step {}/{}".format(i, len(data))) - log_mat = np.array(log_mat) - logger.debug("Final log matrix shape: {}".format(str(log_mat.shape))) - return log_mat - - def load(self): - create_directory(self.s_download_dir) - s_model_path = download_data(self.MODEL_URL, self.s_download_dir) - check_file_md5(s_model_path, self.__class__.MODEL_CHECKSUM) - if self.vgg_conv_model is None: - logger.debug("Loading VGG19 model with cifar10 weights") - self.vgg_conv_model = load_model(s_model_path) - bloc3pool_layer = self.vgg_conv_model.get_layer('block3_pool') - # this is weird but the index is actually the index of the layer just before the pooling layer - # so this is what we want here: we don't want the pooling - index_bloc3pool_layer = self.vgg_conv_model.layers.index(bloc3pool_layer) - self.vgg_conv_model = Model(inputs=self.vgg_conv_model.input, - outputs=self.vgg_conv_model.get_layer(index=index_bloc3pool_layer).output) - else: - logger.debug("Skip loading model VGG19 model with cifar10 weights. 
Already there.") - - def transform(self, data, labels): - if len(data.shape) != 4: - raise AssertionError("Data shape should be of size 4 (image batch with channel dimension). " - "It is {}: {}. Maybe have you forgotten to reshape it to an image format?" - "".format(len(data.shape), data.shape)) - self.load() - transformed_data, labels = super().transform(data, labels) - transformed_data = self._compute_cov_matrix(transformed_data) - return transformed_data, labels - - def check_model(self): - name = os.path.basename(os.path.normpath(self.__class__.MODEL_URL)) - s_file_path = os.path.join(self.s_download_dir, name) - if os.path.exists(s_file_path) and check_file_md5(s_file_path, - self.__class__.MODEL_CHECKSUM, - raise_=False): - return True - else: - return False - -# -# @deprecated -# @singleton -# class VGG19Cifar10Cov(VGG19Cifar10CovAbs): -# pass - -# -# @deprecated -# @singleton -# class VGG19Cifar10CovSVDLog(VGG19Cifar10CovAbs): -# """ -# Extend the VGG19Cifar10CovAbs class with computing of the log of the covariance matrix. -# """ -# -# def transform(self, data, labels): -# transformed_data, labels = super().transform(data, labels) -# transformed_data = self._compute_log_matrix(transformed_data) -# return transformed_data, labels -# - -@deprecated -class VGG19ImagenetTransformer(VGG19Transformer, metaclass=Singleton): - """ - Extend the vgg19transformer class with convolutional wieghts learned on the imagenet dataset. - """ - NAME = "vgg19_imagenet" - - def __init__(self): - super().__init__(name=self.NAME) - - def load(self, input_shape): - if self.vgg_conv_model is None: - logger.debug("Loading VGG19 model with imagenet weights from keras") - self.vgg_conv_model = VGG19(include_top=False, weights='imagenet', input_shape=input_shape) - else: - logger.debug("Skip loading model VGG19 model with imagenet weights. Already there.") - - def transform(self, data, labels): - # todo trouver une solution pour ne pas avoir un copier collé entre cette classe et celle avec cifar - if len(data.shape) != 4: - raise AssertionError("Data shape should be of size 4 (image batch with channel dimension). " - "It is {}: {}. Maybe have you forgotten to reshape it to an image format?" - "".format(len(data.shape), data.shape)) - self.load(input_shape=data[0].shape) - return super().transform(data, labels) - - -@deprecated -class tCNNTransformer: - """ - Transform text data with textCNN transformer. - """ - def __init__(self, name, download_dir=os.path.join(os.path.expanduser("~"), "ml_models")): - self.tcnn_model = None - self.s_download_dir = os.path.join(download_dir, name) - super().__init__() - - def transform(self, data, labels): - # todo rendre ce truc plus genral aux transformers - model = Model(inputs=self.tcnn_model.input, outputs=self.tcnn_model.output) - logger.debug("Type fo data to transform: {}".format(type(data))) - logger.debug("Length of data to transform: {}".format(len(data))) - logger.debug("Transforming data using pretrained model") - transformed_data = np.array(model.predict(data)).reshape(-1, *model.output_shape[1:]) - logger.debug("Type of transformed data: {}".format(type(transformed_data))) - return transformed_data, labels - - def check_model(self): - raise NotImplementedError -# -# @deprecated -# @singleton -# class tCNNMovieReviewTransformer(tCNNTransformer): -# """ -# Extend the tcnntransformer class with convolution weights learned on the movie review dataset. 
-# """ -# NAME = "tcnn_moviereview" -# MODEL_URL = "https://pageperso.lis-lab.fr/~luc.giffon/models/1525939286.4695354_tcnn_rotten_tomatoes.h5" -# MODEL_CHECKSUM = "3d263d874e43c4c9801f76603042633b" -# # todo faire une fonction qui regarde directement le checksum sur le site ? -# -# def __init__(self): -# super().__init__(name=self.NAME) -# -# def load(self): -# create_directory(self.s_download_dir) -# s_model_path = download_data(self.MODEL_URL, self.s_download_dir) -# check_file_md5(s_model_path, self.__class__.MODEL_CHECKSUM) -# if self.tcnn_model is None: -# logger.debug("Loading tCNN model with moviereview weights") -# self.tcnn_model = load_model(s_model_path) -# print([l.name for l in self.tcnn_model.layers]) -# self.tcnn_model = Model(inputs=self.tcnn_model.input, -# outputs=self.tcnn_model.get_layer('flatten_1').output) -# else: -# logger.debug("Skip loading model VGG19 model with cifar10 weights. Already there.") -# -# def transform(self, data, labels): -# if len(data.shape) != 2: -# raise AssertionError("Data shape should be of size 2 (image batch with sentence length). " -# "It is {}: {}" -# "".format(len(data.shape), data.shape)) -# self.load() -# return super().transform(data, labels) -# -# def check_model(self): -# name = os.path.basename(os.path.normpath(self.__class__.MODEL_URL)) -# s_file_path = os.path.join(self.s_download_dir, name) -# if os.path.exists(s_file_path) and check_file_md5(s_file_path, -# self.__class__.MODEL_CHECKSUM, -# raise_=False): -# return True -# else: -# return False -# - - -if __name__ == '__main__': - valsize = 10000 - d = Cifar10Dataset(validation_size=valsize) - - d.load() - d.to_image() - trans = VGG19Cifar10Transformer(cut_layer="block5_pool") - d.apply_transformer(transformer=trans) diff --git a/skluc/data/transformation/ImageTransformer/RescaleTransformer.py b/skluc/data/transformation/ImageTransformer/RescaleTransformer.py new file mode 100644 index 0000000000000000000000000000000000000000..f5f3ac64521e86610854caf562c97af853166ea0 --- /dev/null +++ b/skluc/data/transformation/ImageTransformer/RescaleTransformer.py @@ -0,0 +1,34 @@ +import os +import tensorflow as tf +import numpy as np + +from skluc.data.transformation.Transformer import Transformer +from skluc.utils import logger, Singleton + + +class RescaleTransformer(Transformer, metaclass=Singleton): + def __init__(self, scaling_factor): + self.rescale_factor = scaling_factor + self.__name = os.path.join("resize", "{}".format(str(scaling_factor).replace(".", "-"))) + + @property + def name(self): + return self.__name + + def transform(self, data, labels): + if len(data.shape) != 4: + raise AssertionError("Data shape should be of size 4 (image batch with channel dimension). " + "It is {}: {}. Maybe have you forgotten to reshape it to an image format?" 
+ "".format(len(data.shape), data.shape)) + logger.debug("Start resizing image") + logger.debug("Shape of data to transform: {}".format(data.shape)) + + sess = tf.InteractiveSession() + images_mat = data + output_shape = np.multiply(images_mat.shape[1:-1], (self.rescale_factor, self.rescale_factor)) + labels = labels + logger.debug("Expected output shape: {}".format((data.shape[0], *output_shape, data.shape[-1]))) + new_images = tf.image.resize_images(images_mat, output_shape).eval() + logger.debug("Shape of data after rescaling: {}".format(new_images.shape)) + sess.close() + return np.array(new_images), labels \ No newline at end of file diff --git a/skluc/data/transformation/ImageTransformer/ResizeTransformer.py b/skluc/data/transformation/ImageTransformer/ResizeTransformer.py new file mode 100644 index 0000000000000000000000000000000000000000..7be096fdbe3eefd26dedbde9fad1ba76fe19be77 --- /dev/null +++ b/skluc/data/transformation/ImageTransformer/ResizeTransformer.py @@ -0,0 +1,38 @@ +import os +import tensorflow as tf +import numpy as np + +from skluc.data.transformation.Transformer import Transformer +from skluc.utils import logger, Singleton + + +class ResizeTransformer(Transformer, metaclass=Singleton): + def __init__(self, output_shape): + if len(output_shape) != 2: + raise AssertionError("Output shape should be 2D and it is {}D: {}".format(len(output_shape), output_shape)) + self.output_shape = output_shape + self.__name = os.path.join("resize", "{}x{}".format(output_shape[0], output_shape[1])) + + @property + def name(self): + return self.__name + + def transform(self, data, labels): + if len(data.shape) != 4: + raise AssertionError("Data shape should be of size 4 (image batch with channel dimension). " + "It is {}: {}. Maybe have you forgotten to reshape it to an image format?" 
+ "".format(len(data.shape), data.shape)) + logger.debug("Start resizing image") + logger.debug("Shape of data to transform: {}".format(data.shape)) + logger.debug("Expected output shape: {}".format((data.shape[0], *self.output_shape, data.shape[-1]))) + + sess = tf.InteractiveSession() + images_mat = data + labels = labels + lst_new_image = [] + for image_mat in images_mat: + new_image = tf.image.resize_images(image_mat, self.output_shape).eval() + lst_new_image.append(new_image) + logger.debug("Shape data after resize: {}".format(np.array(lst_new_image).shape)) + sess.close() + return np.array(lst_new_image), labels diff --git a/skluc/data/transformation/ImageTransformer/__init__.py b/skluc/data/transformation/ImageTransformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/skluc/data/transformation/KerasModelTransformer.py b/skluc/data/transformation/KerasModelTransformer.py new file mode 100644 index 0000000000000000000000000000000000000000..d2c55b7806db8c031dc0da6da568bdda9595b596 --- /dev/null +++ b/skluc/data/transformation/KerasModelTransformer.py @@ -0,0 +1,45 @@ +import os +import numpy as np +from keras import Model + +from skluc.data.transformation.Transformer import Transformer +from skluc.utils import check_file_md5, logger + + +class KerasModelTransformer(Transformer): + + MAP_DATA_MODEL = {} + + def __init__(self, data_name, transformation_name): + super().__init__(data_name, transformation_name) + self.keras_model = None + self.load() + + def load(self): + raise NotImplementedError + + def check_model(self): + name = os.path.basename(os.path.normpath(self.MAP_DATA_MODEL[self.data_name].url)) + s_file_path = os.path.join(self.s_download_dir, name) + if os.path.exists(s_file_path) and check_file_md5(s_file_path, + self.MAP_DATA_MODEL[self.data_name].checksum, + raise_=False): + return True + else: + return False + + def transform(self, data, labels): + if len(data.shape) != 4: + raise AssertionError("Data shape should be of size 4 (image batch with channel dimension). " + "It is {}: {}. Maybe have you forgotten to reshape it to an image format?" + "".format(len(data.shape), data.shape)) + model = Model(inputs=self.keras_model.input, outputs=self.keras_model.output) + logger.debug("Type of data to transform: {}".format(type(data))) + logger.debug("Length of data to transform: {}".format(len(data))) + logger.debug("Transforming data using pretrained model") + transformed_data = np.array(model.predict(data)).reshape(-1, *model.output_shape[1:]) + logger.debug("Type of transformed data: {}".format(type(transformed_data))) + return transformed_data, labels + + def check(self): + return self.check_model() diff --git a/skluc/data/transformation/LeCunTransformer/__init__.py b/skluc/data/transformation/LeCunTransformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..04cb676178a3ea742fc3b99332d2f6e87911c712 --- /dev/null +++ b/skluc/data/transformation/LeCunTransformer/__init__.py @@ -0,0 +1,41 @@ +from keras.models import load_model + +from keras import Model + +from skluc.data.transformation.KerasModelTransformer import KerasModelTransformer +from skluc.utils import logger, create_directory, download_data, check_file_md5, DownloadableModel + + +class LecunTransformer(KerasModelTransformer): + """ + Uses the lenet network to transform input data. 
+ """ + + MAP_DATA_MODEL = { + "mnist": DownloadableModel( + url="https://pageperso.lis-lab.fr/~luc.giffon/models/1524640419.938414_lenet_mnist.h5", + checksum="527d7235c213278df1d15d3fe685eb5c"), + } + + def __init__(self, data_name): + if data_name not in self.MAP_DATA_MODEL.keys(): + raise ValueError("Unknown data name. Can't load weights") + transformation_name = self.__class__.__name__ + + super().__init__(data_name=data_name, + transformation_name=transformation_name) + + self.keras_model = None + + def load(self): + create_directory(self.s_download_dir) + s_model_path = download_data(self.MAP_DATA_MODEL[self.data_name].url, self.s_download_dir) + check_file_md5(s_model_path, self.MAP_DATA_MODEL[self.data_name].checksum) + if self.keras_model is None: + logger.debug("Loading VGG19 model with cifar10 weights") + self.keras_model = load_model(s_model_path) + + self.keras_model = Model(inputs=self.keras_model.input, + outputs=self.keras_model.get_layer('conv_pool_2').output) + else: + logger.debug("Skip loading model Lecun model with mnist weights. Already there.") diff --git a/skluc/data/transformation/Transformer.py b/skluc/data/transformation/Transformer.py index 36f8855a64d534041d8583d4d998b61f2b7766e3..c6a9e1f750391db2f2b9236769d8ae14fd2025d4 100644 --- a/skluc/data/transformation/Transformer.py +++ b/skluc/data/transformation/Transformer.py @@ -1,7 +1,17 @@ -from skluc.utils import Singleton +import os + +from skluc.utils import check_file_md5 class Transformer: + + MAP_DATA_MODEL = {} + + def __init__(self, data_name, transformation_name, root_download_dir=os.path.join(os.path.expanduser("~"), "ml_models")): + self.data_name = data_name + self.transformation_name = transformation_name + self.root_download_dir = root_download_dir + def transform(self, data, labels): """ Apply the transformer to the data and labels. @@ -19,7 +29,8 @@ class Transformer: @property def name(self): - raise NotImplementedError + return self.transformation_name - # @property - # def \ No newline at end of file + @property + def s_download_dir(self): + return os.path.join(os.path.join(self.root_download_dir, self.__class__.__name__), self.data_name) diff --git a/skluc/data/transformation/VGG19Transformer/__init__.py b/skluc/data/transformation/VGG19Transformer/__init__.py index 3ce287858d48c6496d0b9c8e111f261d6e3bd342..05dfdfa5a9229cad1573166dd841de5a142e6498 100644 --- a/skluc/data/transformation/VGG19Transformer/__init__.py +++ b/skluc/data/transformation/VGG19Transformer/__init__.py @@ -1,15 +1,12 @@ -import os -import collections -import numpy as np from keras import Model from keras.models import load_model from skluc.data.mldatasets import Cifar10Dataset -from skluc.data.transformation.Transformer import Transformer +from skluc.data.transformation.KerasModelTransformer import KerasModelTransformer from skluc.utils import logger, create_directory, download_data, check_file_md5, Singleton, DownloadableModel -class VGG19Transformer(Transformer, metaclass=Singleton): +class VGG19Transformer(KerasModelTransformer, metaclass=Singleton): """ Uses the vgg19 convolution network to transform data. 
""" @@ -26,77 +23,47 @@ class VGG19Transformer(Transformer, metaclass=Singleton): checksum="0dbb4f02ceb1f4acb6e24831758106e5") } - def __init__(self, data_name, cut_layer_name=None, cut_layer_index=None, - download_dir=os.path.join(os.path.expanduser("~"), "ml_models")): + def __init__(self, data_name, cut_layer_name=None, cut_layer_index=None): if data_name not in self.MAP_DATA_MODEL.keys(): raise ValueError("Unknown data name. Can't load weights") else: - self.data_name = data_name - self.vgg_conv_model = None - self.s_download_dir = os.path.join(download_dir, data_name) - super().__init__() + data_name = data_name - if cut_layer_name is None: + if cut_layer_name is None and cut_layer_index is None: cut_layer_name = "block5_pool" - self.__cut_layer_name = cut_layer_name - self.__cut_layer_index = cut_layer_index if cut_layer_name is not None: - self.transformation_name = self.__class__.__name__ + "_" + str(cut_layer_name) + transformation_name = self.__class__.__name__ + "_" + str(cut_layer_name) elif cut_layer_index is not None: - self.transformation_name = self.__class__.__name__\ + transformation_name = self.__class__.__name__ \ + "_" + str(cut_layer_index) + # todo sauvegarder index / nom dans le meme dossier si c'est les meme else: raise AttributeError("Cut layer name or cut_layer index must be given to init VGG19Cifar10Transformer.") + self.__cut_layer_name = cut_layer_name + self.__cut_layer_index = cut_layer_index + + super().__init__(data_name=data_name, + transformation_name=transformation_name) - @property - def name(self): - return self.transformation_name + self.keras_model = None def load(self): create_directory(self.s_download_dir) - logger.debug(self.data_name) s_model_path = download_data(self.MAP_DATA_MODEL[self.data_name].url, self.s_download_dir) check_file_md5(s_model_path, self.MAP_DATA_MODEL[self.data_name].checksum) - if self.vgg_conv_model is None: + if self.keras_model is None: logger.debug("Loading VGG19 model for {} transformation".format(self.transformation_name)) - self.vgg_conv_model = load_model(s_model_path) + self.keras_model = load_model(s_model_path) if self.__cut_layer_name is not None: - self.vgg_conv_model = Model(inputs=self.vgg_conv_model.input, - outputs=self.vgg_conv_model.get_layer(name=self.__cut_layer_name).output) + self.keras_model = Model(inputs=self.keras_model.input, + outputs=self.keras_model.get_layer(name=self.__cut_layer_name).output) elif self.__cut_layer_index is not None: - self.vgg_conv_model = Model(inputs=self.vgg_conv_model.input, - outputs=self.vgg_conv_model.get_layer(name=self.__cut_layer_index).output) - - else: - logger.debug("Skip loading model VGG19 for {} transformer. Already there.".format(self.__class__.__name__)) - - def transform(self, data, labels): - if len(data.shape) != 4: - raise AssertionError("Data shape should be of size 4 (image batch with channel dimension). " - "It is {}: {}. Maybe have you forgotten to reshape it to an image format?" 
- "".format(len(data.shape), data.shape)) - self.load() - model = Model(inputs=self.vgg_conv_model.input, outputs=self.vgg_conv_model.output) - logger.debug("Type of data to transform: {}".format(type(data))) - logger.debug("Length of data to transform: {}".format(len(data))) - logger.debug("Transforming data using pretrained model") - transformed_data = np.array(model.predict(data)).reshape(-1, *model.output_shape[1:]) - logger.debug("Type of transformed data: {}".format(type(transformed_data))) - return transformed_data, labels - - def check(self): - return self.check_model() + self.keras_model = Model(inputs=self.keras_model.input, + outputs=self.keras_model.get_layer(name=self.__cut_layer_index).output) - def check_model(self): - name = os.path.basename(os.path.normpath(self.MAP_DATA_MODEL[self.data_name].url)) - s_file_path = os.path.join(self.s_download_dir, name) - if os.path.exists(s_file_path) and check_file_md5(s_file_path, - self.MAP_DATA_MODEL[self.data_name].checksum, - raise_=False): - return True else: - return False + logger.debug("Skip loading model VGG19 for {} transformation. Already there.".format(self.transformation_name)) if __name__ == '__main__': diff --git a/skluc/test/test_transformation/TestLeCunTransformer.py b/skluc/test/test_transformation/TestLeCunTransformer.py new file mode 100644 index 0000000000000000000000000000000000000000..d11577ae67e72c2b4327b4e01ac0d8ecbff56ed2 --- /dev/null +++ b/skluc/test/test_transformation/TestLeCunTransformer.py @@ -0,0 +1,34 @@ +import unittest + +from skluc.data.mldatasets import MnistDataset +from skluc.data.transformation.LeCunTransformer import LecunTransformer +from skluc.utils import logger + + +class TestLeCunTransformer(unittest.TestCase): + def setUp(self): + self.dict_datasets = { + "mnist": MnistDataset + } + + def test_transform(self): + valsize = 10000 + for data_name in self.dict_datasets: + logger.info("Testing dataset {}".format(data_name)) + dataset = self.dict_datasets[data_name] + d = dataset(validation_size=valsize) + d.load() + d.flatten() + d.to_image() + trans = LecunTransformer(data_name=data_name) + d.apply_transformer(transformer=trans) + del trans + + def test_init(self): + for data_name in self.dict_datasets: + trans = LecunTransformer(data_name=data_name) + del trans + + +if __name__ == '__main__': + unittest.main() diff --git a/skluc/test/test_transformation/TestVGG19Transformer.py b/skluc/test/test_transformation/TestVGG19Transformer.py index 16ba0f108c66db87eed62f22dcfa1db67c354d27..4c656443779a291744b2aea44ae52d191a3f7303 100644 --- a/skluc/test/test_transformation/TestVGG19Transformer.py +++ b/skluc/test/test_transformation/TestVGG19Transformer.py @@ -22,10 +22,12 @@ class TestVGG19Transformer(unittest.TestCase): def test_transform(self): valsize = 10000 - for name_cut_layer in self.lst_name_cut_layers: - logger.info("Testing cut layer {}".format(name_cut_layer)) - for data_name in self.dict_datasets: - logger.info("Testing dataset {}".format(data_name)) + + for data_name in self.dict_datasets: + logger.info("Testing dataset {}".format(data_name)) + trans = None + for name_cut_layer in self.lst_name_cut_layers: + logger.info("Testing cut layer {}".format(name_cut_layer)) dataset = self.dict_datasets[data_name] d = dataset(validation_size=valsize) d.load() @@ -33,7 +35,16 @@ class TestVGG19Transformer(unittest.TestCase): d.to_image() trans = VGG19Transformer(data_name=data_name, cut_layer_name=name_cut_layer) d.apply_transformer(transformer=trans) - del trans + del trans + + def 
+        for data_name in self.dict_datasets:
+            logger.info("Testing dataset {}".format(data_name))
+            trans = None
+            for name_cut_layer in self.lst_name_cut_layers:
+                logger.info("Testing cut layer {}".format(name_cut_layer))
+                trans = VGG19Transformer(data_name=data_name, cut_layer_name=name_cut_layer)
+            del trans
 
 
 if __name__ == '__main__':
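
Usage note (a minimal sketch, not part of the diff): the tests above show the intended way to drive the refactored API. The snippet below relies only on names that appear in this diff (the `cifar10` entry of `VGG19Transformer.MAP_DATA_MODEL`, and the dataset methods `load`, `to_image`, `apply_transformer`); the rest is illustrative.

```python
# Minimal usage sketch of the refactored transformer API (mirrors the tests above).
from skluc.data.mldatasets import Cifar10Dataset
from skluc.data.transformation.VGG19Transformer import VGG19Transformer

if __name__ == '__main__':
    # KerasModelTransformer.transform() asserts a 4D image batch, so the
    # dataset must be reshaped to images before applying the transformer.
    d = Cifar10Dataset(validation_size=10000)
    d.load()
    d.to_image()

    # First use downloads the pretrained weights, md5-checks them, and caches
    # them under ~/ml_models/VGG19Transformer/cifar10 (see Transformer.s_download_dir).
    trans = VGG19Transformer(data_name="cifar10", cut_layer_name="block5_pool")
    d.apply_transformer(transformer=trans)
```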
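To add a new pretrained backbone, `LecunTransformer` above is the template. A hedged subclass sketch follows; the data name, URL, checksum and cut-layer name are placeholders, not real artifacts:

```python
# Hypothetical KerasModelTransformer subclass; URL/checksum/layer name are placeholders.
from keras import Model
from keras.models import load_model

from skluc.data.transformation.KerasModelTransformer import KerasModelTransformer
from skluc.utils import DownloadableModel, check_file_md5, create_directory, download_data


class MyBackboneTransformer(KerasModelTransformer):
    MAP_DATA_MODEL = {
        "mydata": DownloadableModel(url="https://example.com/my_backbone.h5",  # placeholder
                                    checksum="<md5-of-my_backbone.h5>"),       # placeholder
    }

    def __init__(self, data_name):
        if data_name not in self.MAP_DATA_MODEL:
            raise ValueError("Unknown data name. Can't load weights")
        # KerasModelTransformer.__init__ calls self.load(), so anything load()
        # relies on must be ready before delegating to super().
        super().__init__(data_name=data_name,
                         transformation_name=self.__class__.__name__)

    def load(self):
        create_directory(self.s_download_dir)
        s_model_path = download_data(self.MAP_DATA_MODEL[self.data_name].url, self.s_download_dir)
        check_file_md5(s_model_path, self.MAP_DATA_MODEL[self.data_name].checksum)
        if self.keras_model is None:
            self.keras_model = load_model(s_model_path)
            # Cut the network at an intermediate layer, as LecunTransformer does.
            self.keras_model = Model(inputs=self.keras_model.input,
                                     outputs=self.keras_model.get_layer("my_cut_layer").output)
```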