diff --git a/skluc/main/data/mldatasets/Caltech101Dataset.py b/skluc/main/data/mldatasets/Caltech101Dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bb7ff87b9bafb42098a94c4017b14fa11b10903
--- /dev/null
+++ b/skluc/main/data/mldatasets/Caltech101Dataset.py
@@ -0,0 +1,84 @@
+import os
+import pickle
+import tarfile
+
+import numpy as np
+
+from skluc.main.data.mldatasets.ImageDataset import ImageDataset
+from skluc.main.utils import LabeledData
+from skluc.main.utils import logger, check_files
+
+
+class Cifar100FineDataset(ImageDataset):
+    """CIFAR-100 dataset with the fine-grained (100-class) labels."""
+    HEIGHT = 32
+    WIDTH = 32
+    DEPTH = 3
+
+    def __init__(self, validation_size=0, seed=None, s_download_dir=None):
+        self.__s_url = "https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz"
+        self.meta = None
+        name = "cifar100fine"
+        if s_download_dir is not None:
+            super().__init__([self.__s_url], name, s_download_dir, validation_size=validation_size, seed=seed)
+        else:
+            super().__init__([self.__s_url], name, validation_size=validation_size, seed=seed)
+
+        self.__extracted_dirname = os.path.join(self.s_download_dir, "cifar-100-python")
+        self.__extracted_files = [
+            'train',
+            'test',
+            'meta'
+        ]
+
+        self.__extracted_file_paths = [os.path.join(self.__extracted_dirname, file) for file in self.__extracted_files]
+
+    def get_cifar100_data(self, keyword):
+        """
+        Get data from the files whose name contains the keyword.
+
+        :param keyword: substring identifying the pickle files to load (e.g. 'train' or 'test')
+        :return: a (data, labels) tuple of numpy arrays
+        """
+        full_data = []
+        full_labels = []
+        for fpath in self.__extracted_file_paths:
+            if keyword in os.path.basename(fpath):
+                with open(fpath, 'rb') as f:
+                    pckl_data = pickle.load(f, encoding='bytes')
+                    full_data.append(pckl_data[b'data'])
+                    full_labels.append(pckl_data[b'fine_labels'])
+        final_data = np.vstack(full_data)
+        final_label = np.hstack(full_labels)
+
+        return final_data, final_label
+
+    def get_meta(self):
+        """
+        Get the metadata (fine label names) of cifar100 from file.
+
+        :return: a numpy array of fine label names
+        """
+        for fpath in self.__extracted_file_paths:
+            if 'meta' in os.path.basename(fpath):
+                with open(fpath, 'rb') as f:
+                    pckl_data = pickle.load(f, encoding='bytes')
+                    meta = pckl_data[b'fine_label_names']
+                return np.array(meta)
+
+    def read(self):
+        targz_file_path = self.l_filepaths[-1]
+        if not check_files(self.__extracted_file_paths):
+            logger.debug("Extracting {} ...".format(targz_file_path))
+            # Context manager ensures the archive handle is closed after extraction.
+            with tarfile.open(targz_file_path, "r:gz") as tar:
+                tar.extractall(path=self.s_download_dir)
+        else:
+            logger.debug("File {} has already been extracted".format(targz_file_path))
+
+        logger.debug("Get training data of dataset {}".format(self.s_name))
+        self._train = LabeledData(*self.get_cifar100_data('train'))
+
+        logger.debug("Get testing data of dataset {}".format(self.s_name))
+        self._test = LabeledData(*self.get_cifar100_data('test'))
+        self.meta = self.get_meta()
+
+        self._check_validation_size(self._train[0].shape[0])
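+
+
+if __name__ == "__main__":
+    # Editorial usage sketch, not part of the original commit: it assumes the
+    # ImageDataset base class exposes a download() step that populates
+    # self.l_filepaths before read() can extract the archive.
+    dataset = Cifar100FineDataset(validation_size=1000, seed=0)
+    dataset.download()  # assumed base-class method
+    dataset.read()
+    print(dataset.meta[:5])  # first five fine label names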
diff --git a/skluc/main/tensorflow_/kernel_approximation/nystrom_layer.py b/skluc/main/tensorflow_/kernel_approximation/nystrom_layer.py
index 24c723777a5f241aacc0543016ac7055c6722631..2d25157e6c147279e2e768a69c67898790d42d1c 100644
--- a/skluc/main/tensorflow_/kernel_approximation/nystrom_layer.py
+++ b/skluc/main/tensorflow_/kernel_approximation/nystrom_layer.py
@@ -311,6 +311,120 @@ class DeepstromLayer(tf.keras.layers.Layer):
         return out
 
 
+class DeepstromLayerEndToEnd(tf.keras.layers.Layer):
+    def __init__(self,
+                 subsample_size,
+                 kernel_name,
+                 out_dim=None,
+                 activation=None,
+                 sum_of_kernels=False,
+                 stack_of_kernels=False,
+                 kernel_dict=None
+                 ):
+        # None instead of a mutable default argument: a shared dict would leak
+        # state between layer instances.
+        if kernel_dict is None:
+            kernel_dict = {}
+
+        def init_kernel():
+            if kernel_name == "rbf":
+                kernel_fct = rbf_kernel
+                tf_kernel_fct = tf_rbf_kernel
+            elif kernel_name == "linear":
+                kernel_fct = linear_kernel
+                tf_kernel_fct = tf_linear_kernel
+            elif kernel_name == "chi2_cpd":
+                kernel_fct = additive_chi2_kernel
+                tf_kernel_fct = tf_chi_square_CPD
+            elif kernel_name == "chi2_exp_cpd":
+                kernel_fct = chi2_kernel
+                tf_kernel_fct = tf_chi_square_CPD_exp
+            elif kernel_name == "chi2_pd":
+                raise NotImplementedError("Check carefully that this code does not misbehave before enabling it")
+            elif kernel_name == "laplacian":
+                tf_kernel_fct = tf_laplacian_kernel
+                kernel_fct = laplacian_kernel
+            else:
+                raise ValueError("Unknown kernel name: {}".format(kernel_name))
+            return kernel_name, kernel_fct, tf_kernel_fct, kernel_dict
+
+        def init_output_dim(subsample_size):
+            if out_dim is not None and out_dim > subsample_size:
+                raise ValueError("Output dim ({}) must not be greater than the deepstrom "
+                                 "subsample size ({})".format(out_dim, subsample_size))
+            elif out_dim is None:
+                return subsample_size
+            else:
+                return out_dim
+
+        def init_activation():
+            if activation in ("tan", "tanh"):
+                activation_fct = tf.nn.tanh
+            elif activation == "relu":
+                activation_fct = tf.nn.relu
+            else:
+                activation_fct = activation
+
+            return activation_fct
+
+        super().__init__()
+
+        self.__subsample_size = subsample_size
+
+        self.__sum_of_kernels = sum_of_kernels
+        self.__stack_of_kernels = stack_of_kernels
+
+        self.__kernel_name, self.__kernel_fct, self.__tf_kernel_fct, self.__kernel_dict = init_kernel()
+        self.__output_dim = init_output_dim(self.__subsample_size)
+        self.__activation = init_activation()
+        self.__W_matrix = None
+
+        logger.info("Selecting deepstrom layer function with "
+                    "subsample size = {}, "
+                    "output_dim = {}, "
+                    "{} activation function "
+                    "and kernel = {}"
+                    .format(self.__subsample_size,
+                            self.__output_dim,
+                            "with" if self.__activation else "without",
+                            self.__kernel_name))
+
+    def build(self, input_shape):
+        if self.__output_dim != 0:
+            # output_dim == 0 means there is no W matrix and the kernel vector
+            # is directly fed as input to the next layer
+            self.__W_matrix = self.add_weight(
+                name="W_nystrom",
+                shape=[self.__subsample_size, self.__output_dim],
+                initializer=tf.random_normal_initializer(stddev=0.1),
+                trainable=True
+            )
+
+    def call(self, inputs, **kwargs):
+        if not isinstance(inputs, list):
+            raise ValueError("Inputs of layer deepstrom should be a list")
+        if len(inputs[0].shape) != 2:
+            raise ValueError(f"Input x should be 2D but it is {len(inputs[0].shape)}D")
+        if len(inputs[1].shape) != 2:
+            raise ValueError(f"Input subsample should be 2D but it is {len(inputs[1].shape)}D")
+        if inputs[1].shape[0] != self.__subsample_size:
+            raise ValueError(f"Subsample should be of size {self.__subsample_size}")
+        if inputs[0].shape[-1] != inputs[1].shape[-1]:
+            raise ValueError("Input and subsample should have the same feature dimension")
+
+        input_x = inputs[0]
+        input_sub = inputs[1]
+        with tf.name_scope("NystromLayer"):
+            with tf.name_scope("kernel_vec"):
+                kernel_vector = self.__tf_kernel_fct(input_x, input_sub, **self.__kernel_dict)
+
+            if self.__output_dim != 0:
+                out = tf.matmul(kernel_vector, self.__W_matrix)
+            else:
+                out = kernel_vector
+        if self.__activation is not None:
+            out = self.__activation(out)
+        return out
+
+
 if __name__ == '__main__':
-    main()
+    DeepstromLayerEndToEnd(subsample_size=64,
+                           kernel_name='chi2_cpd',
+                           kernel_dict={})
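+
+    # Editorial usage sketch, not part of the original commit: exercises the
+    # layer end to end on toy TF1-style placeholders. The shapes and the
+    # linear kernel are illustrative assumptions (linear needs no extra
+    # kernel parameters in kernel_dict).
+    x = tf.placeholder(tf.float32, shape=[8, 16])    # batch of 8 inputs
+    sub = tf.placeholder(tf.float32, shape=[4, 16])  # 4 subsample points
+    layer = DeepstromLayerEndToEnd(subsample_size=4, kernel_name="linear", out_dim=2)
+    out = layer([x, sub])  # kernel vector [8, 4] times W [4, 2] -> shape [8, 2]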
Aborting.") + exit() + elif out_dim is None: + return subsample_size + else: + return out_dim + + def init_activation(): + if activation == "tan": + activation_fct = tf.nn.tanh + elif activation == "relu": + activation_fct = tf.nn.relu + else: + activation_fct = activation + + return activation_fct + + super().__init__() + + self.__subsample_size = subsample_size + + self.__sum_of_kernels = sum_of_kernels + self.__stack_of_kernels = stack_of_kernels + + self.__kernel_name, self.__kernel_fct, self.__tf_kernel_fct, self.__kernel_dict = init_kernel() + self.__output_dim = init_output_dim(self.__subsample_size) + self.__activation = init_activation() + self.__W_matrix = None + + logger.info("Selecting deepstrom layer function with " + "subsample size = {}, " + "output_dim = {}, " + "{} activation function " + "and kernel = {}" + .format(self.__subsample_size, + self.__output_dim, + "with" if self.__activation else "without", + self.__kernel_name)) + + def build(self, input_shape): + if self.__output_dim != 0: + # outputdim == 0 means there is no W matrix and the kernel vector is directly added as input to + # the next layer + self.__W_matrix = self.add_variable( + name="W_nystrom", + shape=[self.__subsample_size, self.__output_dim], + initializer=tf.random_normal_initializer(stddev=0.1), + trainable=True + ) + + def call(self, inputs, **kwargs): + if type(inputs) is not list: + raise ValueError("Inputs of layer deepstrom should be a list") + if len(inputs[0].shape) != 2: + raise ValueError(f"Input x should be 2D but it is {len(inputs[0].shape)}D") + if len(inputs[1].shape) != 2: + raise ValueError(f"Input subsample should be 2D but it is {len(inputs[1].shape)}D") + if inputs[1].shape[0] != self.__subsample_size: + raise ValueError(f"Subsample should be of size {self.__subsample_size}") + if inputs[0][0].shape[0] != inputs[1][0].shape[0]: + raise ValueError(f"Input and subsample should have the same dimension") + + input_x = inputs[0] + input_sub = inputs[1] + with tf.name_scope("NystromLayer"): + with tf.name_scope("kernel_vec"): + kernel_vector = self.__tf_kernel_fct(input_x, input_sub, **self.__kernel_dict) + + if self.__output_dim != 0: + out = tf.matmul(kernel_vector, self.__W_matrix) + else: + out = kernel_vector + if self.__activation is not None: + out = self.__activation(out) + return out + if __name__ == '__main__': - main() + DeepstromLayerEndToEnd(subsample_size=64, + kernel_name='chi2_cpd', + kernel_dict={}) + diff --git a/skluc/main/tensorflow_/models.py b/skluc/main/tensorflow_/models.py new file mode 100644 index 0000000000000000000000000000000000000000..ccb985789742a0deec4a6df3ff780299f67c1cb0 --- /dev/null +++ b/skluc/main/tensorflow_/models.py @@ -0,0 +1,102 @@ +from tensorflow.python.keras.layers import Conv2D, MaxPooling2D, Flatten, BatchNormalization, Activation +from tensorflow.python.keras.models import Sequential +from tensorflow.python.keras.regularizers import l2 +from tensorflow.python.keras.initializers import he_normal + + +def build_lenet_model(input_shape): + model = Sequential() + model.add( + Conv2D(6, (5, 5), padding='valid', activation='relu', kernel_initializer=he_normal(), input_shape=input_shape)) + model.add(MaxPooling2D((2, 2), strides=(2, 2))) + model.add(Conv2D(16, (5, 5), padding='valid', activation='relu', kernel_initializer=he_normal())) + model.add(MaxPooling2D((2, 2), strides=(2, 2))) + model.add(Flatten()) + return model + + +def build_vgg19_model(input_shape, weight_decay=0.0001): + model = Sequential() + + # Block 1 + model.add(Conv2D(64, (3, 
+    # Block 2
+    model.add(Conv2D(128, (3, 3), padding='same', kernel_regularizer=l2(weight_decay),
+                     kernel_initializer=he_normal(), name='block2_conv1'))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(Conv2D(128, (3, 3), padding='same', kernel_regularizer=l2(weight_decay),
+                     kernel_initializer=he_normal(), name='block2_conv2'))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool'))
+
+    # Block 3
+    model.add(Conv2D(256, (3, 3), padding='same', kernel_regularizer=l2(weight_decay),
+                     kernel_initializer=he_normal(), name='block3_conv1'))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(Conv2D(256, (3, 3), padding='same', kernel_regularizer=l2(weight_decay),
+                     kernel_initializer=he_normal(), name='block3_conv2'))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(Conv2D(256, (3, 3), padding='same', kernel_regularizer=l2(weight_decay),
+                     kernel_initializer=he_normal(), name='block3_conv3'))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(Conv2D(256, (3, 3), padding='same', kernel_regularizer=l2(weight_decay),
+                     kernel_initializer=he_normal(), name='block3_conv4'))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool'))
+
+    # Block 4
+    model.add(Conv2D(512, (3, 3), padding='same', kernel_regularizer=l2(weight_decay),
+                     kernel_initializer=he_normal(), name='block4_conv1'))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(Conv2D(512, (3, 3), padding='same', kernel_regularizer=l2(weight_decay),
+                     kernel_initializer=he_normal(), name='block4_conv2'))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(Conv2D(512, (3, 3), padding='same', kernel_regularizer=l2(weight_decay),
+                     kernel_initializer=he_normal(), name='block4_conv3'))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(Conv2D(512, (3, 3), padding='same', kernel_regularizer=l2(weight_decay),
+                     kernel_initializer=he_normal(), name='block4_conv4'))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool'))
+
+    # Block 5
+    model.add(Conv2D(512, (3, 3), padding='same', kernel_regularizer=l2(weight_decay),
+                     kernel_initializer=he_normal(), name='block5_conv1'))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(Conv2D(512, (3, 3), padding='same', kernel_regularizer=l2(weight_decay),
+                     kernel_initializer=he_normal(), name='block5_conv2'))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(Conv2D(512, (3, 3), padding='same', kernel_regularizer=l2(weight_decay),
+                     kernel_initializer=he_normal(), name='block5_conv3'))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(Conv2D(512, (3, 3), padding='same', kernel_regularizer=l2(weight_decay),
+                     kernel_initializer=he_normal(), name='block5_conv4'))
+    model.add(BatchNormalization())
+    model.add(Activation('relu'))
+    model.add(MaxPooling2D((2, 2), strides=(2, 2), name='block5_pool'))
+
+    model.add(Flatten(name='flatten'))
+
+    return model
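+
+
+if __name__ == '__main__':
+    # Editorial usage sketch, not part of the original commit: the CIFAR-shaped
+    # 32x32x3 input is an assumption; model.summary() verifies the sizes below.
+    lenet = build_lenet_model((32, 32, 3))  # Flatten -> 16 * 5 * 5 = 400 features
+    vgg19 = build_vgg19_model((32, 32, 3))  # Flatten -> 512 features (32 halved by 5 poolings -> 1)
+    vgg19.summary()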
diff --git a/skluc/main/tools/experiences/cluger.py b/skluc/main/tools/experiences/cluger.py
index 77a26c0d914ae1badaaf440db952837b1dc9d2e5..ae645d0dc9d61a2f3ece42734546b813b2135200 100644
--- a/skluc/main/tools/experiences/cluger.py
+++ b/skluc/main/tools/experiences/cluger.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 Cluger
 
diff --git a/skluc/main/tools/experiences/executioner.py b/skluc/main/tools/experiences/executioner.py
index 159ec37d050b17a92130ff847ccdfc019367ec16..495fd39606c02e428bd2ef6455feaa416e7e2c6e 100644
--- a/skluc/main/tools/experiences/executioner.py
+++ b/skluc/main/tools/experiences/executioner.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 The Executioner is a tool used for performing clean experiments by
 managing the output files generated by them.
diff --git a/skluc/main/tools/experiences/gather_results.py b/skluc/main/tools/experiences/gather_results.py
index 92d8e28d42f0e36f78c775f82f04b68602fd545f..23bfc56d7fff11780b86e4bc11cf2ada87c784af 100644
--- a/skluc/main/tools/experiences/gather_results.py
+++ b/skluc/main/tools/experiences/gather_results.py
@@ -9,7 +9,7 @@ Usage:
 Options:
     -h --help                               Show this screen.
     -i --input-dir=<IPATH>                  Input directory wher to find results
-    -p --patern=regex                       Specify the pattern of the files to be looked at [default: .+\.stdout].
+    -p --patern=regex                       Specify the pattern of the files to be looked at [default: .+_stdout\.txt].
     -r --header                             Says if there is a header in the result files.
     -v --verbose                            Print the lines of the final file
diff --git a/skluc/main/utils.py b/skluc/main/utils.py
index aaace63b61deb0b3172135dd33b04e72e7d3cb03..2a7a3af7515eaa2b75b3cc41d03a6405c5dd2140 100644
--- a/skluc/main/utils.py
+++ b/skluc/main/utils.py
@@ -14,6 +14,7 @@ import psutil
 import collections
 
 from sklearn.metrics.pairwise import additive_chi2_kernel
+import tensorflow as tf
 
 daiquiri.setup(level=logging.DEBUG)
 logger = daiquiri.getLogger()
@@ -189,6 +190,7 @@ def compute_euristic_sigma(dataset_full, slice_size=1000):
     :return:
     """
     results = []
+    dataset_full = np.reshape(dataset_full, (-1, 1))
     if slice_size > dataset_full.shape[0]:
         slice_size = dataset_full.shape[0]
     for i in range(dataset_full.shape[0] // slice_size):
@@ -240,6 +242,7 @@ def compute_euristic_sigma_chi2(dataset_full, slice_size=100):
     :param dataset: The dataset on which to look for the best sigma
     :return:
     """
+    dataset_full = np.reshape(dataset_full, (-1, 1))
     results = []
     if slice_size > dataset_full.shape[0]:
         slice_size = dataset_full.shape[0]
@@ -297,7 +300,124 @@ LabeledData = collections.namedtuple("LabeledData", ["data", "labels"])
 DownloadableModel = collections.namedtuple("DownloadableModel", ["url", "checksum"])
 
 
+class DictManager(dict):
+    pass
+    # Possible dict-as-attribute accessors, kept for reference:
+    # def __getattr__(self, item):
+    #     return self[item]
+    #
+    # def __setattr__(self, key, value):
+    #     self[key] = value
+    #
+    # def __delattr__(self, item):
+    #     del self[item]
+
+    # def __missing__(self, key):
+    #     logger.warning(f"Call to missing key {key} in {self.__class__.__name__}. None value returned.")
+    #     self[key] = None
+    #     return self[key]
+
+
+class ParameterManager(DictManager):
+    def get_gamma_value(self, dat, chi2=False):
+        if self["--gamma"] is None:
+            logger.debug("Gamma argument is None. Need to compute it.")
+            if chi2:
+                gamma_value = 1. / compute_euristic_sigma_chi2(dat)
+            else:
+                gamma_value = 1. / compute_euristic_sigma(dat)
+        else:
+            gamma_value = eval(self["--gamma"])
+
+        logger.debug("Gamma value is {}".format(gamma_value))
+        return gamma_value
+
+    def init_kernel(self):
+        if self["--rbf-kernel"]:
+            return "rbf"
+        elif self["--linear-kernel"]:
+            return "linear"
+        elif self["--chi-square-kernel"]:
+            return "chi2_cpd"
+        elif self["--exp-chi-square-kernel"]:
+            return "chi2_exp_cpd"
+        elif self["--chi-square-PD-kernel"]:
+            return "chi2_pd"
+        elif self["--laplacian-kernel"]:
+            return "laplacian"
+        else:
+            return None
+
+    def init_network(self):
+        if self["dense"]:
+            return "dense"
+        elif self["deepfriedconvnet"]:
+            return "deepfriedconvnet"
+        elif self["deepstrom"]:
+            return "deepstrom"
+        elif self["none"]:
+            return "none"
+
+    def init_non_linearity(self):
+        if self["--non-linearity"] == "tanh":
+            return tf.nn.tanh
+        elif self["--non-linearity"] == "relu":
+            return tf.nn.relu
+        elif self["--non-linearity"] == "None":
+            return None
+
+    def init_dataset(self):
+        if self["--cifar10"]:
+            return "cifar10"
+        if self["--cifar100"]:
+            return "cifar100"
+        if self["--mnist"]:
+            return "mnist"
+        if self["--svhn"]:
+            return "svhn"
+
+
+class ResultManager(DictManager):
+    pass
+
+
+class ResultPrinter:
+    def __init__(self, *args, header=True):
+        self.__dicts = []
+        self.__dicts.extend(args)
+        self.__header = header
+
+    def _get_ordered_items(self):
+        all_keys = []
+        all_values = []
+        for d in self.__dicts:
+            keys, values = zip(*d.items())
+            all_keys.extend(keys)
+            all_values.extend(values)
+        arr_keys, arr_values = np.array(all_keys), np.array(all_values)
+        indexes_sort = np.argsort(arr_keys)
+        return list(arr_keys[indexes_sort]), list(arr_values[indexes_sort])
+
+    def _get_values_ordered_by_keys(self):
+        _, values = self._get_ordered_items()
+        return values
+
+    def _get_ordered_keys(self):
+        keys, _ = self._get_ordered_items()
+        return keys
+
+    def add(self, d):
+        self.__dicts.append(d)
+
+    def print(self):
+        headers, values = self._get_ordered_items()
+        headers = [str(h) for h in headers]
+        values = [str(v) for v in values]
+        if self.__header:
+            print(",".join(headers))
+        print(",".join(values))
+
+
 if __name__ == "__main__":
-    a = np.identity(1000)
-    print(compute_euristic_sigma(a))
\ No newline at end of file
+    paraman = ParameterManager({"a": 4})
+    resulman = ResultManager({"b": 2})
+    resprinter = ResultPrinter(paraman)
+    resprinter.add(resulman)
+    resprinter.print()
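+    # Expected output (editorial note): keys from all registered dicts are
+    # sorted alphabetically, then printed as a CSV header line and value line:
+    #   a,b
+    #   4,2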
diff --git a/skluc/test/test_data/test_mldatasets/TestCaltech101Dataset.py b/skluc/test/test_data/test_mldatasets/TestCaltech101Dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391