diff --git a/skluc/data/mldatasets.py b/skluc/data/mldatasets.py deleted file mode 100644 index 14e424d9a75407d986c635764357e38dcfeb472c..0000000000000000000000000000000000000000 --- a/skluc/data/mldatasets.py +++ /dev/null @@ -1,854 +0,0 @@ -""" -This module defines the Dataset classes usefull for downloading and loading datasets as numpy.ndarrays. - -The currently implemented datasets are: - - mnist - - cifar10 -""" - -import collections -import gzip -import os -import pickle -import re -import struct -import tarfile - -import numpy as np -# --- installed packages -import scipy.io as sio -import tensorflow as tf -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelBinarizer - -from skluc.utils import silentremove, download_data, check_files, create_directory, logger - -__all__ = ['MnistDataset', 'Cifar10Dataset', 'MovieReviewV1Dataset', 'Cifar100FineDataset'] - -LabeledData = collections.namedtuple("LabeledData", ["data", "labels"]) - - -class Dataset(object): - """ - Abstract class implementing basic methods for Dataset retrieval. - """ - # data_groups_private will be used to refer to the attributes self._train and self._test via their names - # as stringes. It is usefull when the same operations must be performed on train and test set - data_groups_private = ["_train", "_test"] - # data_groups_public = ["train", "test", "validation"] - - def __init__(self, l_url, s_name, s_download_dir=os.path.join(os.path.expanduser("~"), "ml_datasets"), - validation_size=0, seed=None): - self.l_url = l_url - self.l_filenames = [] - for url in self.l_url: - splitted_url = url.split("/") - self.l_filenames.append(splitted_url[-1]) - self.s_name = s_name - self.s_download_dir = os.path.join(s_download_dir, self.s_name) - self.l_filepaths = [os.path.join(self.s_download_dir, fname) for fname in self.l_filenames] - self._train = None - self._test = None - self.seed = seed - self.permuted_index_train = None - self.permuted_index_test = None - self.permuted_index_validation = None - self.validation_size = validation_size - - def reduce_data_size(self, new_size): - logger.info("Reducing datasize of dataset {} to .".format(self.s_name, new_size)) - kept_indices = self.get_uniform_class_rand_indices_train(new_size) - self.permuted_index_train = self.permuted_index_train[kept_indices] - - def get_uniform_class_rand_indices_train(self, size): - try: - kept_indices, _ = train_test_split(np.arange(len(self.train.data)), - train_size=size, stratify=self.train.labels, random_state=self.seed) - except ValueError as e: - logger.warning("In Dataset.get_uniform_class_rand_indices_train Handled exception: {}".format(str(e))) - logger.debug("Use random indexes instead") - kept_indices = np.random.permutation(len(self.train.data))[:size] - return kept_indices - - def get_uniform_class_rand_indices_validation(self, size): - try: - kept_indices, _ = train_test_split(np.arange(len(self.validation.data)), - train_size=size, stratify=self.validation.labels, random_state=self.seed) - except ValueError as e: - logger.warning("In Dataset.get_uniform_class_rand_indices_validation Handled exception: {}".format(str(e))) - logger.debug("Use random indexes instead") - kept_indices = np.random.permutation(len(self.validation.data))[:size] - return kept_indices - - @property - def train(self): - return LabeledData(data=self._train.data[self.permuted_index_train], - labels=self._train.labels[self.permuted_index_train]) - - @property - def test(self): - return 
LabeledData(data=self._test.data[self.permuted_index_test], - labels=self._test.labels[self.permuted_index_test]) - - @property - def validation(self): - return LabeledData(data=self._train.data[self.permuted_index_validation], - labels=self._train.labels[self.permuted_index_validation]) - - def download(self): - """ - Download the dataset. - - :return: None - """ - self.create_directory_tree() - if not check_files(self.l_filepaths): - logger.debug("Files need to be downloaded") - for s_fname in self.l_filepaths: - silentremove(s_fname) - for s_url in self.l_url: - logger.debug("Downloading file at url: {}".format(s_url)) - s_file_name = s_url.split("/")[-1] - download_data(s_url, self.s_download_dir, s_file_name) - else: - logger.debug("Files {} already exist".format(self.l_filepaths)) - - def create_directory_tree(self): - """ - Create the target directory tree - - :return: None - """ - create_directory(self.s_download_dir) - - def _check_validation_size(self, data_length): - if self.validation_size > data_length: - raise ValueError("The validation set size ({}) is higher than the train set size ({}). " \ - "Please choose a little validation set size".format(self.validation_size, data_length)) - logger.debug("Validation size < data length ({} < {})".format(self.validation_size, data_length)) - - def to_one_hot(self): - """ - Convert categorical labels to one hot encoding - - :return: - """ - enc = LabelBinarizer() - enc.fit(self._train.labels) - logger.info("Apply one hot encoding to dataset {}.".format(self.s_name)) - for kw in self.data_groups_private: - datlab = getattr(self, kw) - if len(datlab.labels) == 0: - logger.debug("No labels found in {} data of {} dataset".format(kw, self.s_name)) - continue - logger.debug("Apply one hot encoding to {} data of {} dataset".format(kw, self.s_name)) - labels = np.array(enc.transform(datlab.labels)) - data = datlab.data - setattr(self, kw, LabeledData(data, labels)) - - def revert_one_hot(self): - logger.info("Revert one hot encoding to dataset {}.".format(self.s_name)) - for kw in self.data_groups_private: - datlab = getattr(self, kw) - if len(datlab.labels) == 0: - logger.debug("No labels found in {} data of {} dataset".format(kw, self.s_name)) - continue - logger.debug("Apply one hot encoding to {} data of {} dataset".format(kw, self.s_name)) - labels = np.argmax(datlab.labels, axis=1) - data = datlab.data - setattr(self, kw, LabeledData(data, labels)) - - def normalize(self): - """ - Normalize data. - - Feature scaling normalization. 
- - :return: - """ - logger.info("Apply normalization to data from dataset {}.".format(self.s_name)) - for kw in self.data_groups_private: - datlab = getattr(self, kw) - if len(datlab.labels) == 0: - continue - data = datlab.data - _min = data.min() - _max = data.max() - data = (data - _min) / (_max - _min) - logger.debug("Apply normalization to {} data of {} dataset.".format(kw, self.s_name)) - setattr(self, kw, LabeledData(data, datlab.labels)) - - def data_astype(self, _type): - logger.info("Change type of data to {} in the dataset {}.".format(str(_type), self.s_name)) - for kw in self.data_groups_private: - datlab = getattr(self, kw) - if len(datlab.labels) == 0: - continue - logger.debug("Change type of {} data to {} in the dataset {}.".format(kw, str(_type), self.s_name)) - data = datlab.data - logger.debug("{} data was of type {}".format(kw, data.dtype)) - data = data.astype(_type) - logger.debug("{} data is now of type {}".format(kw, data.dtype)) - setattr(self, kw, LabeledData(data, datlab.labels)) - - def labels_astype(self, _type): - logger.info("Change type of labels to {} in the dataset {}.".format(str(_type), self.s_name)) - for kw in self.data_groups_private: - datlab = getattr(self, kw) - if len(datlab.labels) == 0: - continue - labels = datlab.labels - logger.debug("Change type of {} labels to {} in the dataset {}.".format(kw, str(_type), self.s_name)) - logger.debug("{} labels were of type {}".format(kw, labels.dtype)) - labels = labels.astype(_type) - logger.debug("{} labels are now of type {}".format(kw, labels.dtype)) - setattr(self, kw, LabeledData(datlab.data, labels)) - - def load(self): - # todo faire une verification generique que le jeu de donné à été chargé lorsque des opérations - # sont appliquées aux données - logger.info("Loading dataset {}".format(self.s_name)) - self.download() - self.read() - if self._train is not None: - logger.debug("Construction of random train indexes (seed: {})".format(self.seed)) - np.random.seed(self.seed) - permut = np.random.permutation(self._train[0].shape[0]) - if self.validation_size > 0: - self.permuted_index_train = permut[:-self.validation_size] - self.permuted_index_validation = permut[-self.validation_size:] - else: - self.permuted_index_train = permut - self.permuted_index_validation = np.array([]) - if self._test is not None: - logger.debug("Construction of random test indexes (seed: {})".format(self.seed)) - logger.debug("Dataset size: {}".format(self._train[0].shape[0])) - np.random.seed(self.seed) - self.permuted_index_test = np.random.permutation(self._test[0].shape[0]) - if self._train is None and self._test is None: - raise Exception("No data loaded at the end of load method.") - - - # --- Abstract methods - - def read(self): - """ - This method should load dataset in _train and _test attributes. 
- :return: - """ - raise NotImplementedError - - -class MovieReviewV1Dataset(Dataset): - data_groups_private = ["_train"] - TRAIN_SIZE = 9000 - - def apply_transformer(self, transformer_class): - # todo, cette fonction devrait marcher pour tout dataset (donc il faudrait la mettre dans la classe Dataset) - transformer = transformer_class() - transformer_name = transformer.__class__.__name__ - transform_path = os.path.join(self.s_download_dir, transformer_name) - transform_filepaths = [os.path.join(transform_path, kw + ".npz") - for kw in self.data_groups_private] - create_directory(transform_path) - if check_files(transform_filepaths) and transformer.check_model(): - # in the case where the transformations already exist in npz files - # and the model is the actual good model - # but I have no guarantee the transformation has been obtained with the stored model though... - # todo make the npz files to store the md5 checksum of the model that has produced them - logger.debug("Files {} already exists".format(transform_filepaths)) - logger.debug("Now load data of files {}".format(transform_filepaths)) - for kw in self.data_groups_private: - npzfile_path = os.path.join(transform_path, kw + ".npz") - logger.debug("Loading {}".format(npzfile_path)) - npzfile = np.load(npzfile_path) - data = npzfile[kw + "_data"] - labels = npzfile[kw + "_labels"] - setattr(self, kw, LabeledData(data=data, labels=labels)) - # todo être plus intelligent avec le mode debug du logger. Pour l'instant je met tout en debug - else: - # in the case the transformations doesn't yet exist - # one nead to apply it to the data - # then to save the transformation - logger.debug("Files {} don't exist or model md5 checksum doesn't match. Need to produce them".format(transform_filepaths)) - logger.info("Apply convolution of {} to dataset {}".format(transformer_name, self.s_name)) - for kw in self.data_groups_private: - data, labels = getattr(self, kw) - transformed_data, transformed_labels = transformer.transform(data, labels) - setattr(self, kw, LabeledData(data=transformed_data, labels=transformed_labels)) - dict_attr = {kw + "_data": transformed_data, kw + "_labels": transformed_labels} - filepath = os.path.join(transform_path, kw + ".npz") - - logger.debug("Saving transformed {} data to {}".format(kw, filepath)) - np.savez(filepath, **dict_attr) - - def __init__(self, validation_size=0, seed=0, s_download_dir=None): - self.__s_url = "http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz" - - if s_download_dir is not None: - super().__init__([self.__s_url], "moviereview", s_download_dir, validation_size=validation_size, seed=seed) - else: - super().__init__([self.__s_url], "moviereview", validation_size=validation_size, seed=seed) - - self.__extracted_files = [ - 'rt-polarity.pos', - 'rt-polarity.neg' - ] - self.__extracted_dirname = os.path.join(self.s_download_dir, "rt-polaritydata") - - self.__extracted_file_paths = [os.path.join(self.__extracted_dirname, file) for file in self.__extracted_files] - - self.__counter = 1 - self.__vocab = {"<pad>": 0} - self.__reversed_vocab = {0: "<pad>"} - - @property - def vocab(self): - return self.__vocab - - @property - def vocab_inv(self): - return self.__reversed_vocab - - def read(self): - # todo faire une fonction d'extraction commune? 
- targz_file_path = self.l_filepaths[-1] - if not check_files(self.__extracted_file_paths): - logger.debug("Extracting {} ...".format(targz_file_path)) - tar = tarfile.open(targz_file_path, "r:gz") - tar.extractall(path=self.s_download_dir) - else: - logger.debug("File {} has already been extracted".format(targz_file_path)) - - data_labeled = MovieReviewV1Dataset.load_data_and_labels(self.__extracted_file_paths[0], - self.__extracted_file_paths[1], - encoding="ISO-8859-1") - - max_ = -1 - for l in data_labeled[0]: - max_ = max(max_, len(l.strip().split())) - - lst_arr_ex = [] - for ex in data_labeled[0]: - splitted_ex = ex.strip().split() - splitted_ex_nbr = [] - for wrd in splitted_ex: - if wrd not in self.__vocab: - self.__vocab[wrd] = self.__counter - self.__reversed_vocab[self.__counter] = wrd - self.__counter += 1 - splitted_ex_nbr.append(self.__vocab[wrd]) - arr_splitted_ex_nbr = np.pad(splitted_ex_nbr, (0, max_-len(splitted_ex_nbr)), 'constant', - constant_values=self.__vocab["<pad>"]) - lst_arr_ex.append(np.reshape(arr_splitted_ex_nbr, (1, -1))) - X = np.concatenate(lst_arr_ex, axis=0) - - self._train = LabeledData(data=X, - labels=data_labeled[1]) - - @property - def train(self): - indexes = self.permuted_index_train[:self.TRAIN_SIZE - self.validation_size] - return LabeledData(data=self._train.data[indexes], - labels=self._train.labels[indexes]) - - @property - def test(self): - indexes = self.permuted_index_train[self.TRAIN_SIZE:] - return LabeledData(data=self._train.data[indexes], - labels=self._train.labels[indexes]) - - @property - def validation(self): - indexes = self.permuted_index_train[(self.TRAIN_SIZE - self.validation_size):self.TRAIN_SIZE] - return LabeledData(data=self._train.data[indexes], - labels=self._train.labels[indexes]) - - @property - def vocabulary_length(self): - return len(self.__vocab) - - @staticmethod - def clean_str(string): - """ - Tokenization/string cleaning for all datasets except for SST. - Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py - - Credit to: https://github.com/dennybritz/cnn-text-classification-tf/blob/master/data_helpers.py - """ - string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) - string = re.sub(r"\'s", " \'s", string) - string = re.sub(r"\'ve", " \'ve", string) - string = re.sub(r"n\'t", " n\'t", string) - string = re.sub(r"\'re", " \'re", string) - string = re.sub(r"\'d", " \'d", string) - string = re.sub(r"\'ll", " \'ll", string) - string = re.sub(r",", " , ", string) - string = re.sub(r"!", " ! ", string) - string = re.sub(r"\(", " \( ", string) - string = re.sub(r"\)", " \) ", string) - string = re.sub(r"\?", " \? ", string) - string = re.sub(r"\s{2,}", " ", string) - return string.strip().lower() - - @staticmethod - def load_data_and_labels(positive_data_file, negative_data_file, encoding='utf-8'): - """ - Loads MR polarity data from files, splits the data into words and generates labels. - Returns split sentences and labels. 
- - Credit to: https://github.com/dennybritz/cnn-text-classification-tf/blob/master/data_helpers.py - """ - # Load data from files - positive_examples = list(open(positive_data_file, "r", encoding=encoding).readlines()) - positive_examples = [s.strip() for s in positive_examples] - negative_examples = list(open(negative_data_file, "r", encoding=encoding).readlines()) - negative_examples = [s.strip() for s in negative_examples] - # Split by words - x_text = positive_examples + negative_examples - x_text = [MovieReviewV1Dataset.clean_str(sent) for sent in x_text] - # Generate labels - positive_labels = [[0, 1] for _ in positive_examples] - negative_labels = [[1, 0] for _ in negative_examples] - y = np.concatenate([positive_labels, negative_labels], 0) - return LabeledData(data=x_text, labels=y) - - # todo not yet sure the following is usefull - # @staticmethod - # def batch_iter(data, batch_size, num_epochs, shuffle=True): - # """ - # Generates a batch iterator for a dataset. - # """ - # data = np.array(data) - # data_size = len(data) - # num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1 - # for epoch in range(num_epochs): - # # Shuffle the data at each epoch - # if shuffle: - # shuffle_indices = np.random.permutation(np.arange(data_size)) - # shuffled_data = data[shuffle_indices] - # else: - # shuffled_data = data - # for batch_num in range(num_batches_per_epoch): - # start_index = batch_num * batch_size - # end_index = min((batch_num + 1) * batch_size, data_size) - # - # yield shuffled_data[start_index:end_index] - - -class ImageDataset(Dataset): - HEIGHT = -1 - WIDTH = -1 - DEPTH = -1 - - def apply_transformer(self, transformer): - """ - - :param transformer: Transformer object (not a class) - :return: - """ - logger.info("Apply transformation {} to data from dataset {}.".format(transformer.__class__.__name__, self.s_name)) - # todo, cette fonction devrait marcher pour tout dataset (donc il faudrait la mettre dans la classe Dataset) - transformer_name = transformer.NAME - transform_path = os.path.join(self.s_download_dir, transformer_name) - transform_filepaths = [os.path.join(transform_path, kw + ".npz") - for kw in self.data_groups_private] - create_directory(transform_path) - if check_files(transform_filepaths) and transformer.check_model(): - # in the case where the transformations already exist in npz files - # and the model is the actual good model - # but I have no guarantee the transformation has been obtained with the stored model though... - # todo make the npz files to store the md5 checksum of the model that has produced them - logger.debug("Files {} already exists".format(transform_filepaths)) - logger.info("Loading transformed data from files {}".format(transform_filepaths)) - for kw in self.data_groups_private: - npzfile_path = os.path.join(transform_path, kw + ".npz") - logger.debug("Loading {}".format(npzfile_path)) - npzfile = np.load(npzfile_path) - data = npzfile[kw + "_data"] - logger.debug("Shape of {} set: {}".format(kw, data.shape)) - labels = npzfile[kw + "_labels"] - setattr(self, kw, LabeledData(data=data, labels=labels)) - else: - # in the case the transformations doesn't yet exist - # one need to apply it to the data - # then to save the transformation - logger.debug("Files {} don't exist or model md5 checksum doesn't match. 
Need to produce them".format(transform_filepaths)) - logger.info("Apply convolution of {} to dataset {}".format(transformer_name, self.s_name)) - for kw in self.data_groups_private: - data, labels = getattr(self, kw) - transformed_data, transformed_labels = transformer.transform(data, labels) - setattr(self, kw, LabeledData(data=transformed_data, labels=transformed_labels)) - dict_attr = {kw + "_data": transformed_data, kw + "_labels": transformed_labels} - filepath = os.path.join(transform_path, kw + ".npz") - logger.debug("Shape of {} set: {}".format(kw, transformed_data.shape)) - logger.debug("Saving transformed {} data to {}".format(kw, filepath)) - np.savez(filepath, **dict_attr) - - def to_image(self): - """ - Modify data to present it like images (matrices) instead of vectors. - - :return: The modified data. - """ - if self.HEIGHT == -1 or self.WIDTH == -1 or self.DEPTH == -1: - raise ValueError("Height, width and depth static attributes of class {} should be set.".format(self.__class__)) - for kw in self.data_groups_private: - datlab = getattr(self, kw) - if datlab is None: - continue - images_vec = datlab.data - labels = datlab.labels - length_by_chanel = images_vec.shape[1]/self.DEPTH - logger.debug("Images vec shape: {}".format(images_vec.shape)) - if int(length_by_chanel) != length_by_chanel: - raise Exception("Dimensionality problem") - else: - length_by_chanel = int(length_by_chanel) - images_mat = np.reshape(images_vec, (images_vec.shape[0], length_by_chanel, self.DEPTH), - order='F') - images = np.reshape(images_mat, (images_mat.shape[0], self.HEIGHT, self.WIDTH, - self.DEPTH), order='C') - setattr(self, kw, LabeledData(images, labels)) - - def flatten(self): - """ - Flatten all the datasets (matrices to vectors) - - :return: - """ - logger.info("Apply flattening to dataset {}.".format(self.s_name)) - for kw in self.data_groups_private: - logger.debug("Flattening data {} of dataset {}".format(kw, self.s_name)) - datlab = getattr(self, kw) - init_dim = np.prod([s for s in datlab.data.shape[1:]]) - logger.debug("Shape of {} data: {}".format(kw, datlab.data.shape)) - logger.debug("Number of features in {} data: {}".format(kw, init_dim)) - data = datlab.data.reshape(datlab.data.shape[0], init_dim) - setattr(self, kw, LabeledData(data=data, labels=datlab.labels)) - - def rescale(self, factor): - """ - Rescale images by factor. - - :param factor: - :return: - """ - sess = tf.InteractiveSession() - for kw in self.data_groups_private: - datlab = getattr(self, kw) - images_mat = datlab.data - output_shape = np.multiply(images_mat.shape[1:-1], (factor, factor)) - labels = datlab.labels - logger.debug("Shape of {} data before rescaling: {}".format(kw, images_mat.shape)) - logger.debug("Excpected output shape for images: {}".format(output_shape)) - new_image = tf.image.resize_images(images_mat, output_shape).eval() - logger.debug("Shape of {} data after rescaling: {}".format(kw, new_image.shape)) - setattr(self, kw, LabeledData(new_image, labels)) - sess.close() - - def to_feature_vectors(self): - """ - From a feature representation (W x H x D) of the images, gives the feature vector representation of - dimension (N x D) with N being W x H. 
- - :return: - """ - for kw in self.data_groups_private: - datlab = getattr(self, kw) - images_mat = datlab.data - labels = datlab.labels - - logger.debug("Shape of {} data before reshape: {}".format(kw, images_mat.shape)) - images_mat = images_mat.reshape(images_mat.shape[0], -1, images_mat.shape[-1]) - logger.debug("Shape of {} data after reshape: {}".format(kw, images_mat.shape)) - setattr(self, kw, LabeledData(images_mat, labels)) - - -class MnistDataset(ImageDataset): - - HEIGHT = 28 - WIDTH = 28 - DEPTH = 1 - - def __init__(self, validation_size=0, seed=0, s_download_dir=None): - self.__s_root_url = "http://yann.lecun.com/exdb/mnist/" - self.__d_leaf_url = { - "train_data": "train-images-idx3-ubyte.gz", - "train_label": "train-labels-idx1-ubyte.gz", - "test_data": "t10k-images-idx3-ubyte.gz", - "test_label": "t10k-labels-idx1-ubyte.gz" - } - - l_url = [self.__s_root_url + leaf_url for leaf_url in self.__d_leaf_url.values()] - if s_download_dir is not None: - super().__init__(l_url, "mnist", s_download_dir, validation_size=validation_size, seed=seed) - else: - super().__init__(l_url, "mnist", validation_size=validation_size, seed=seed) - - @staticmethod - def read_gziped_ubyte(fname_img=None, fname_lbl=None): - """ - loosely copied on https://gist.github.com/akesling/5358964 - - Python function for importing the MNIST data set. It returns an iterator - of 2-tuples with the first element being the label and the second element - being a numpy.uint8 2D array of pixel data for the given image. - """ - # Load everything in some numpy arrays - logger.info("Read gziped ubyte file {}".format(fname_img)) - with gzip.open(fname_lbl, 'rb') as flbl: - magic, num = struct.unpack(">II", flbl.read(8)) - lbl = np.fromstring(flbl.read(), dtype=np.int8) - - logger.info("Read gziped ubyte file {}".format(fname_lbl)) - with gzip.open(fname_img, 'rb') as fimg: - magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16)) - img = np.fromstring(fimg.read(), dtype=np.uint8) - img = img.reshape(len(lbl), -1) - - return img, lbl - - def read(self): - """ - Return a dict of data where, for each key is associated a (data, label) tuple. - - The values of the tuple are np.ndarray. 
- - :return: dict - """ - # todo add possibility to provide percentage for validation set instead of size - self._train = LabeledData( - *self.read_gziped_ubyte(os.path.join(self.s_download_dir, self.__d_leaf_url["train_data"]), - os.path.join(self.s_download_dir, self.__d_leaf_url["train_label"])) - ) - - self._test = LabeledData( - *self.read_gziped_ubyte(os.path.join(self.s_download_dir, self.__d_leaf_url["test_data"]), - os.path.join(self.s_download_dir, self.__d_leaf_url["test_label"])) - ) - - self._check_validation_size(self._train[0].shape[0]) - - -class Cifar10Dataset(ImageDataset): - - HEIGHT = 32 - WIDTH = 32 - DEPTH = 3 - - def __init__(self, validation_size=0, seed=None, s_download_dir=None): - self.__s_url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" - self.meta = None - name = "cifar10" - if s_download_dir is not None: - super().__init__([self.__s_url], name, s_download_dir, validation_size=validation_size, seed=seed) - else: - super().__init__([self.__s_url], name, validation_size=validation_size, seed=seed) - - self.__extracted_dirname = os.path.join(self.s_download_dir, "cifar-10-batches-py") - self.__extracted_files =[ - 'batches.meta', - 'data_batch_1', - 'data_batch_2', - 'data_batch_3', - 'data_batch_4', - 'data_batch_5', - 'readme.html', - 'test_batch' - ] - - self.__extracted_file_paths = [os.path.join(self.__extracted_dirname, file) for file in self.__extracted_files] - - def get_cifar10_data(self, keyword): - """ - Get data from the files containing the keyword in their name. - - :param keyword: - :return: - """ - full_data = [] - full_labels = [] - for fpath in self.__extracted_file_paths: - if keyword in fpath.split('/')[-1]: - with open(fpath, 'rb') as f: - pckl_data = pickle.load(f, encoding='bytes') - full_data.append(pckl_data[b'data']) - full_labels.append(pckl_data[b'labels']) - final_data = np.vstack(full_data) - final_label = np.hstack(full_labels) - - return final_data, final_label - - def get_meta(self): - """ - Get meta data about cifar10 from file. 
- - :return: - """ - for fpath in self.__extracted_file_paths: - if 'meta' in fpath.split('/')[-1]: - with open(fpath, 'rb') as f: - pckl_data = pickle.load(f, encoding='bytes') - meta = pckl_data[b'label_names'] - return np.array(meta) - - def read(self): - targz_file_path = self.l_filepaths[-1] - if not check_files(self.__extracted_file_paths): - logger.debug("Extracting {} ...".format(targz_file_path)) - tar = tarfile.open(targz_file_path, "r:gz") - tar.extractall(path=self.s_download_dir) - else: - logger.debug("File {} has already been extracted".format(targz_file_path)) - - logger.debug("Get training data of dataset {}".format(self.s_name)) - self._train = LabeledData(*self.get_cifar10_data('data')) - - logger.debug("Get testing data of dataset {}".format(self.s_name)) - self._test = LabeledData(*self.get_cifar10_data('test')) - self.meta = self.get_meta() - - self._check_validation_size(self._train[0].shape[0]) - - -class Cifar100FineDataset(ImageDataset): - HEIGHT = 32 - WIDTH = 32 - DEPTH = 3 - - def __init__(self, validation_size=0, seed=None, s_download_dir=None): - self.__s_url = "https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz" - self.meta = None - name = "cifar100fine" - if s_download_dir is not None: - super().__init__([self.__s_url], name, s_download_dir, validation_size=validation_size, seed=seed) - else: - super().__init__([self.__s_url], name, validation_size=validation_size, seed=seed) - - self.__extracted_dirname = os.path.join(self.s_download_dir, "cifar-100-python") - self.__extracted_files = [ - 'train', - 'test', - 'meta' - ] - - self.__extracted_file_paths = [os.path.join(self.__extracted_dirname, file) for file in self.__extracted_files] - - def get_cifar100_data(self, keyword): - """ - Get data from the files containing the keyword in their name. - - :param keyword: - :return: - """ - full_data = [] - full_labels = [] - for fpath in self.__extracted_file_paths: - if keyword in fpath.split('/')[-1]: - with open(fpath, 'rb') as f: - pckl_data = pickle.load(f, encoding='bytes') - full_data.append(pckl_data[b'data']) - full_labels.append(pckl_data[b'fine_labels']) - final_data = np.vstack(full_data) - final_label = np.hstack(full_labels) - - return final_data, final_label - - def get_meta(self): - """ - Get meta data about cifar10 from file. 
- - :return: - """ - for fpath in self.__extracted_file_paths: - if 'meta' in fpath.split('/')[-1]: - with open(fpath, 'rb') as f: - pckl_data = pickle.load(f, encoding='bytes') - meta = pckl_data[b'fine_label_names'] - return np.array(meta) - - def read(self): - targz_file_path = self.l_filepaths[-1] - if not check_files(self.__extracted_file_paths): - logger.debug("Extracting {} ...".format(targz_file_path)) - tar = tarfile.open(targz_file_path, "r:gz") - tar.extractall(path=self.s_download_dir) - else: - logger.debug("File {} has already been extracted".format(targz_file_path)) - - logger.debug("Get training data of dataset {}".format(self.s_name)) - self._train = LabeledData(*self.get_cifar100_data('train')) - - logger.debug("Get testing data of dataset {}".format(self.s_name)) - self._test = LabeledData(*self.get_cifar100_data('test')) - self.meta = self.get_meta() - - self._check_validation_size(self._train[0].shape[0]) - - -class SVHNDataset(ImageDataset): - - HEIGHT = 32 - WIDTH = 32 - DEPTH = 3 - - def __init__(self, validation_size=0, seed=0, s_download_dir=None): - self.__s_root_url = "http://ufldl.stanford.edu/housenumbers/" - self.__d_leaf_url = { - "train_data": "train_32x32.mat", - "test_data": "test_32x32.mat", - } - - l_url = [self.__s_root_url + leaf_url for leaf_url in self.__d_leaf_url.values()] - if s_download_dir is not None: - super().__init__(l_url, "svhn", s_download_dir, validation_size=validation_size, seed=seed) - else: - super().__init__(l_url, "svhn", validation_size=validation_size, seed=seed) - - @staticmethod - def read_mat(fname): - """ - loosely copied on https://stackoverflow.com/questions/29185493/read-svhn-dataset-in-python - - Python function for importing the SVHN data set. - """ - # Load everything in some numpy arrays - logger.info("Read mat file {}".format(fname)) - data = sio.loadmat(fname) - img = np.moveaxis(data['X'], -1, 0) - lbl = data['y'] - return img, lbl - - def read(self): - """ - Return a dict of data where, for each key is associated a (data, label) tuple. - - The values of the tuple are np.ndarray. 
- - :return: dict - """ - # todo add possibility to provide percentage for validation set instead of size - self._train = LabeledData( - *self.read_mat(os.path.join(self.s_download_dir, self.__d_leaf_url["train_data"])) - ) - - self._test = LabeledData( - *self.read_mat(os.path.join(self.s_download_dir, self.__d_leaf_url["test_data"])) - ) - - self._check_validation_size(self._train[0].shape[0]) - - -if __name__ == "__main__": - d = Cifar100FineDataset(validation_size=10000) - d.load() - print("Before preprocessing") - print(d.train.data.shape, d.train.labels.shape) - print(d.validation.data.shape, d.validation.labels.shape) - print(d.test.data.shape, d.test.labels.shape) - # d.apply_transformer(VGG19SvhnTransformer) - # print("After vgg19 preprocessing") - # print(d.train.data.shape, d.train.labels.shape) - # print(d.validation.data.shape, d.validation.labels.shape) - # print(d.test.data.shape, d.test.labels.shape) diff --git a/skluc/data/mldatasets/Cifar100FineDataset.py b/skluc/data/mldatasets/Cifar100FineDataset.py new file mode 100644 index 0000000000000000000000000000000000000000..83b6747e04d779e0ae1425256a4834caac037548 --- /dev/null +++ b/skluc/data/mldatasets/Cifar100FineDataset.py @@ -0,0 +1,84 @@ +import os +import pickle +import tarfile + +import numpy as np + +from skluc.data.mldatasets import LabeledData +from skluc.data.mldatasets.ImageDataset import ImageDataset +from skluc.utils import logger, check_files + + +class Cifar100FineDataset(ImageDataset): + HEIGHT = 32 + WIDTH = 32 + DEPTH = 3 + + def __init__(self, validation_size=0, seed=None, s_download_dir=None): + self.__s_url = "https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz" + self.meta = None + name = "cifar100fine" + if s_download_dir is not None: + super().__init__([self.__s_url], name, s_download_dir, validation_size=validation_size, seed=seed) + else: + super().__init__([self.__s_url], name, validation_size=validation_size, seed=seed) + + self.__extracted_dirname = os.path.join(self.s_download_dir, "cifar-100-python") + self.__extracted_files = [ + 'train', + 'test', + 'meta' + ] + + self.__extracted_file_paths = [os.path.join(self.__extracted_dirname, file) for file in self.__extracted_files] + + def get_cifar100_data(self, keyword): + """ + Get data from the files containing the keyword in their name. + + :param keyword: + :return: + """ + full_data = [] + full_labels = [] + for fpath in self.__extracted_file_paths: + if keyword in fpath.split('/')[-1]: + with open(fpath, 'rb') as f: + pckl_data = pickle.load(f, encoding='bytes') + full_data.append(pckl_data[b'data']) + full_labels.append(pckl_data[b'fine_labels']) + final_data = np.vstack(full_data) + final_label = np.hstack(full_labels) + + return final_data, final_label + + def get_meta(self): + """ + Get meta data about cifar10 from file. 
+ + :return: + """ + for fpath in self.__extracted_file_paths: + if 'meta' in fpath.split('/')[-1]: + with open(fpath, 'rb') as f: + pckl_data = pickle.load(f, encoding='bytes') + meta = pckl_data[b'fine_label_names'] + return np.array(meta) + + def read(self): + targz_file_path = self.l_filepaths[-1] + if not check_files(self.__extracted_file_paths): + logger.debug("Extracting {} ...".format(targz_file_path)) + tar = tarfile.open(targz_file_path, "r:gz") + tar.extractall(path=self.s_download_dir) + else: + logger.debug("File {} has already been extracted".format(targz_file_path)) + + logger.debug("Get training data of dataset {}".format(self.s_name)) + self._train = LabeledData(*self.get_cifar100_data('train')) + + logger.debug("Get testing data of dataset {}".format(self.s_name)) + self._test = LabeledData(*self.get_cifar100_data('test')) + self.meta = self.get_meta() + + self._check_validation_size(self._train[0].shape[0]) diff --git a/skluc/data/mldatasets/Cifar10Dataset.py b/skluc/data/mldatasets/Cifar10Dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..c48f8364efd6fa04fe79c1e5c823e460499a395f --- /dev/null +++ b/skluc/data/mldatasets/Cifar10Dataset.py @@ -0,0 +1,90 @@ +import os +import pickle +import tarfile + +import numpy as np + +from skluc.data.mldatasets import LabeledData +from skluc.data.mldatasets.ImageDataset import ImageDataset +from skluc.utils import logger, check_files + + +class Cifar10Dataset(ImageDataset): + + HEIGHT = 32 + WIDTH = 32 + DEPTH = 3 + + def __init__(self, validation_size=0, seed=None, s_download_dir=None): + self.__s_url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" + self.meta = None + name = "cifar10" + if s_download_dir is not None: + super().__init__([self.__s_url], name, s_download_dir, validation_size=validation_size, seed=seed) + else: + super().__init__([self.__s_url], name, validation_size=validation_size, seed=seed) + + self.__extracted_dirname = os.path.join(self.s_download_dir, "cifar-10-batches-py") + self.__extracted_files =[ + 'batches.meta', + 'data_batch_1', + 'data_batch_2', + 'data_batch_3', + 'data_batch_4', + 'data_batch_5', + 'readme.html', + 'test_batch' + ] + + self.__extracted_file_paths = [os.path.join(self.__extracted_dirname, file) for file in self.__extracted_files] + + def get_cifar10_data(self, keyword): + """ + Get data from the files containing the keyword in their name. + + :param keyword: + :return: + """ + full_data = [] + full_labels = [] + for fpath in self.__extracted_file_paths: + if keyword in fpath.split('/')[-1]: + with open(fpath, 'rb') as f: + pckl_data = pickle.load(f, encoding='bytes') + full_data.append(pckl_data[b'data']) + full_labels.append(pckl_data[b'labels']) + final_data = np.vstack(full_data) + final_label = np.hstack(full_labels) + + return final_data, final_label + + def get_meta(self): + """ + Get meta data about cifar10 from file. 
+
+        :return:
+        """
+        for fpath in self.__extracted_file_paths:
+            if 'meta' in fpath.split('/')[-1]:
+                with open(fpath, 'rb') as f:
+                    pckl_data = pickle.load(f, encoding='bytes')
+                    meta = pckl_data[b'label_names']
+        return np.array(meta)
+
+    def read(self):
+        targz_file_path = self.l_filepaths[-1]
+        if not check_files(self.__extracted_file_paths):
+            logger.debug("Extracting {} ...".format(targz_file_path))
+            tar = tarfile.open(targz_file_path, "r:gz")
+            tar.extractall(path=self.s_download_dir)
+        else:
+            logger.debug("File {} has already been extracted".format(targz_file_path))
+
+        logger.debug("Get training data of dataset {}".format(self.s_name))
+        self._train = LabeledData(*self.get_cifar10_data('data'))
+
+        logger.debug("Get testing data of dataset {}".format(self.s_name))
+        self._test = LabeledData(*self.get_cifar10_data('test'))
+        self.meta = self.get_meta()
+
+        self._check_validation_size(self._train[0].shape[0])
\ No newline at end of file
diff --git a/skluc/data/mldatasets/Dataset.py b/skluc/data/mldatasets/Dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..67a071a07d036883b8ef80b1162425130d082591
--- /dev/null
+++ b/skluc/data/mldatasets/Dataset.py
@@ -0,0 +1,219 @@
+import os
+
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelBinarizer
+
+from skluc.data.mldatasets import LabeledData
+from skluc.utils import logger, check_files, silentremove, download_data, create_directory
+
+
+class Dataset(object):
+    """
+    Abstract class implementing basic methods for Dataset retrieval.
+    """
+    # data_groups_private will be used to refer to the attributes self._train and self._test via their names
+    # as strings. It is useful when the same operations must be performed on train and test set
+    data_groups_private = ["_train", "_test"]
+    # data_groups_public = ["train", "test", "validation"]
+
+    def __init__(self, l_url, s_name, s_download_dir=os.path.join(os.path.expanduser("~"), "ml_datasets"),
+                 validation_size=0, seed=None):
+        self.l_url = l_url
+        self.l_filenames = []
+        for url in self.l_url:
+            splitted_url = url.split("/")
+            self.l_filenames.append(splitted_url[-1])
+        self.s_name = s_name
+        self.s_download_dir = os.path.join(s_download_dir, self.s_name)
+        self.l_filepaths = [os.path.join(self.s_download_dir, fname) for fname in self.l_filenames]
+        self._train = None
+        self._test = None
+        self.seed = seed
+        self.permuted_index_train = None
+        self.permuted_index_test = None
+        self.permuted_index_validation = None
+        self.validation_size = validation_size
+
+    def reduce_data_size(self, new_size):
+        logger.info("Reducing datasize of dataset {} to {}.".format(self.s_name, new_size))
+        kept_indices = self.get_uniform_class_rand_indices_train(new_size)
+        self.permuted_index_train = self.permuted_index_train[kept_indices]
+
+    def get_uniform_class_rand_indices_train(self, size):
+        try:
+            kept_indices, _ = train_test_split(np.arange(len(self.train.data)),
+                                               train_size=size, stratify=self.train.labels, random_state=self.seed)
+        except ValueError as e:
+            logger.warning("In Dataset.get_uniform_class_rand_indices_train Handled exception: {}".format(str(e)))
+            logger.debug("Use random indexes instead")
+            kept_indices = np.random.permutation(len(self.train.data))[:size]
+        return kept_indices
+
+    def get_uniform_class_rand_indices_validation(self, size):
+        try:
+            kept_indices, _ = train_test_split(np.arange(len(self.validation.data)),
+                                               train_size=size, stratify=self.validation.labels, random_state=self.seed)
+        except ValueError as e:
+            logger.warning("In Dataset.get_uniform_class_rand_indices_validation Handled exception: {}".format(str(e)))
+            logger.debug("Use random indexes instead")
+            kept_indices = np.random.permutation(len(self.validation.data))[:size]
+        return kept_indices
+
+    @property
+    def train(self):
+        return LabeledData(data=self._train.data[self.permuted_index_train],
+                           labels=self._train.labels[self.permuted_index_train])
+
+    @property
+    def test(self):
+        return LabeledData(data=self._test.data[self.permuted_index_test],
+                           labels=self._test.labels[self.permuted_index_test])
+
+    @property
+    def validation(self):
+        return LabeledData(data=self._train.data[self.permuted_index_validation],
+                           labels=self._train.labels[self.permuted_index_validation])
+
+    def download(self):
+        """
+        Download the dataset.
+
+        :return: None
+        """
+        self.create_directory_tree()
+        if not check_files(self.l_filepaths):
+            logger.debug("Files need to be downloaded")
+            for s_fname in self.l_filepaths:
+                silentremove(s_fname)
+            for s_url in self.l_url:
+                logger.debug("Downloading file at url: {}".format(s_url))
+                s_file_name = s_url.split("/")[-1]
+                download_data(s_url, self.s_download_dir, s_file_name)
+        else:
+            logger.debug("Files {} already exist".format(self.l_filepaths))
+
+    def create_directory_tree(self):
+        """
+        Create the target directory tree
+
+        :return: None
+        """
+        create_directory(self.s_download_dir)
+
+    def _check_validation_size(self, data_length):
+        if self.validation_size > data_length:
+            raise ValueError("The validation set size ({}) is higher than the train set size ({}). " \
+                             "Please choose a smaller validation set size".format(self.validation_size, data_length))
+        logger.debug("Validation size < data length ({} < {})".format(self.validation_size, data_length))
+
+    def to_one_hot(self):
+        """
+        Convert categorical labels to one hot encoding
+
+        :return:
+        """
+        enc = LabelBinarizer()
+        enc.fit(self._train.labels)
+        logger.info("Apply one hot encoding to dataset {}.".format(self.s_name))
+        for kw in self.data_groups_private:
+            datlab = getattr(self, kw)
+            if len(datlab.labels) == 0:
+                logger.debug("No labels found in {} data of {} dataset".format(kw, self.s_name))
+                continue
+            logger.debug("Apply one hot encoding to {} data of {} dataset".format(kw, self.s_name))
+            labels = np.array(enc.transform(datlab.labels))
+            data = datlab.data
+            setattr(self, kw, LabeledData(data, labels))
+
+    def revert_one_hot(self):
+        logger.info("Revert one hot encoding to dataset {}.".format(self.s_name))
+        for kw in self.data_groups_private:
+            datlab = getattr(self, kw)
+            if len(datlab.labels) == 0:
+                logger.debug("No labels found in {} data of {} dataset".format(kw, self.s_name))
+                continue
+            logger.debug("Revert one hot encoding in {} data of {} dataset".format(kw, self.s_name))
+            labels = np.argmax(datlab.labels, axis=1)
+            data = datlab.data
+            setattr(self, kw, LabeledData(data, labels))
+
+    def normalize(self):
+        """
+        Normalize data.
+
+        Feature scaling normalization.
+
+        :return:
+        """
+        logger.info("Apply normalization to data from dataset {}.".format(self.s_name))
+        for kw in self.data_groups_private:
+            datlab = getattr(self, kw)
+            if len(datlab.labels) == 0:
+                continue
+            data = datlab.data
+            _min = data.min()
+            _max = data.max()
+            data = (data - _min) / (_max - _min)
+            logger.debug("Apply normalization to {} data of {} dataset.".format(kw, self.s_name))
+            setattr(self, kw, LabeledData(data, datlab.labels))
+
+    def data_astype(self, _type):
+        logger.info("Change type of data to {} in the dataset {}.".format(str(_type), self.s_name))
+        for kw in self.data_groups_private:
+            datlab = getattr(self, kw)
+            if len(datlab.labels) == 0:
+                continue
+            logger.debug("Change type of {} data to {} in the dataset {}.".format(kw, str(_type), self.s_name))
+            data = datlab.data
+            logger.debug("{} data was of type {}".format(kw, data.dtype))
+            data = data.astype(_type)
+            logger.debug("{} data is now of type {}".format(kw, data.dtype))
+            setattr(self, kw, LabeledData(data, datlab.labels))
+
+    def labels_astype(self, _type):
+        logger.info("Change type of labels to {} in the dataset {}.".format(str(_type), self.s_name))
+        for kw in self.data_groups_private:
+            datlab = getattr(self, kw)
+            if len(datlab.labels) == 0:
+                continue
+            labels = datlab.labels
+            logger.debug("Change type of {} labels to {} in the dataset {}.".format(kw, str(_type), self.s_name))
+            logger.debug("{} labels were of type {}".format(kw, labels.dtype))
+            labels = labels.astype(_type)
+            logger.debug("{} labels are now of type {}".format(kw, labels.dtype))
+            setattr(self, kw, LabeledData(datlab.data, labels))
+
+    def load(self):
+        # todo add a generic check that the dataset has actually been loaded before
+        # operations are applied to the data
+        logger.info("Loading dataset {}".format(self.s_name))
+        self.download()
+        self.read()
+        if self._train is not None:
+            logger.debug("Construction of random train indexes (seed: {})".format(self.seed))
+            np.random.seed(self.seed)
+            permut = np.random.permutation(self._train[0].shape[0])
+            if self.validation_size > 0:
+                self.permuted_index_train = permut[:-self.validation_size]
+                self.permuted_index_validation = permut[-self.validation_size:]
+            else:
+                self.permuted_index_train = permut
+                self.permuted_index_validation = np.array([])
+        if self._test is not None:
+            logger.debug("Construction of random test indexes (seed: {})".format(self.seed))
+            logger.debug("Dataset size: {}".format(self._train[0].shape[0]))
+            np.random.seed(self.seed)
+            self.permuted_index_test = np.random.permutation(self._test[0].shape[0])
+        if self._train is None and self._test is None:
+            raise Exception("No data loaded at the end of load method.")
+
+
+    # --- Abstract methods
+
+    def read(self):
+        """
+        This method should load dataset in _train and _test attributes.
+        :return:
+        """
+        raise NotImplementedError
diff --git a/skluc/data/mldatasets/ImageDataset.py b/skluc/data/mldatasets/ImageDataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..383950e171d3d07dd4be038ebe807bf0188ba9ae
--- /dev/null
+++ b/skluc/data/mldatasets/ImageDataset.py
@@ -0,0 +1,138 @@
+import os
+
+import numpy as np
+import tensorflow as tf
+
+from skluc.data.mldatasets import LabeledData
+from skluc.data.mldatasets.Dataset import Dataset
+from skluc.utils import logger, create_directory, check_files
+
+
+class ImageDataset(Dataset):
+    HEIGHT = -1
+    WIDTH = -1
+    DEPTH = -1
+
+    def apply_transformer(self, transformer):
+        """
+
+        :param transformer: Transformer object (not a class)
+        :return:
+        """
+        logger.info("Apply transformation {} to data from dataset {}.".format(transformer.__class__.__name__, self.s_name))
+        # todo this function should work for any dataset, so it should be moved to the Dataset class
+        transformer_name = transformer.NAME
+        transform_path = os.path.join(self.s_download_dir, transformer_name)
+        transform_filepaths = [os.path.join(transform_path, kw + ".npz")
+                               for kw in self.data_groups_private]
+        create_directory(transform_path)
+        if check_files(transform_filepaths) and transformer.check_model():
+            # in the case where the transformations already exist in npz files
+            # and the model is the actual good model
+            # but I have no guarantee the transformation has been obtained with the stored model though...
+            # todo make the npz files store the md5 checksum of the model that has produced them
+            logger.debug("Files {} already exist".format(transform_filepaths))
+            logger.info("Loading transformed data from files {}".format(transform_filepaths))
+            for kw in self.data_groups_private:
+                npzfile_path = os.path.join(transform_path, kw + ".npz")
+                logger.debug("Loading {}".format(npzfile_path))
+                npzfile = np.load(npzfile_path)
+                data = npzfile[kw + "_data"]
+                logger.debug("Shape of {} set: {}".format(kw, data.shape))
+                labels = npzfile[kw + "_labels"]
+                setattr(self, kw, LabeledData(data=data, labels=labels))
+        else:
+            # in the case the transformations don't exist yet
+            # one needs to apply them to the data
+            # then to save the transformation
+            logger.debug("Files {} don't exist or model md5 checksum doesn't match. Need to produce them".format(transform_filepaths))
+            logger.info("Apply convolution of {} to dataset {}".format(transformer_name, self.s_name))
+            for kw in self.data_groups_private:
+                data, labels = getattr(self, kw)
+                transformed_data, transformed_labels = transformer.transform(data, labels)
+                setattr(self, kw, LabeledData(data=transformed_data, labels=transformed_labels))
+                dict_attr = {kw + "_data": transformed_data, kw + "_labels": transformed_labels}
+                filepath = os.path.join(transform_path, kw + ".npz")
+                logger.debug("Shape of {} set: {}".format(kw, transformed_data.shape))
+                logger.debug("Saving transformed {} data to {}".format(kw, filepath))
+                np.savez(filepath, **dict_attr)
+
+    def to_image(self):
+        """
+        Modify data to present it like images (matrices) instead of vectors.
+
+        :return: The modified data.
+        """
+        if self.HEIGHT == -1 or self.WIDTH == -1 or self.DEPTH == -1:
+            raise ValueError("Height, width and depth static attributes of class {} should be set.".format(self.__class__))
+        for kw in self.data_groups_private:
+            datlab = getattr(self, kw)
+            if datlab is None:
+                continue
+            images_vec = datlab.data
+            labels = datlab.labels
+            length_by_chanel = images_vec.shape[1]/self.DEPTH
+            logger.debug("Images vec shape: {}".format(images_vec.shape))
+            if int(length_by_chanel) != length_by_chanel:
+                raise Exception("Dimensionality problem")
+            else:
+                length_by_chanel = int(length_by_chanel)
+            images_mat = np.reshape(images_vec, (images_vec.shape[0], length_by_chanel, self.DEPTH),
+                                    order='F')
+            images = np.reshape(images_mat, (images_mat.shape[0], self.HEIGHT, self.WIDTH,
+                                             self.DEPTH), order='C')
+            setattr(self, kw, LabeledData(images, labels))
+
+    def flatten(self):
+        """
+        Flatten all the datasets (matrices to vectors)
+
+        :return:
+        """
+        logger.info("Apply flattening to dataset {}.".format(self.s_name))
+        for kw in self.data_groups_private:
+            logger.debug("Flattening data {} of dataset {}".format(kw, self.s_name))
+            datlab = getattr(self, kw)
+            init_dim = np.prod([s for s in datlab.data.shape[1:]])
+            logger.debug("Shape of {} data: {}".format(kw, datlab.data.shape))
+            logger.debug("Number of features in {} data: {}".format(kw, init_dim))
+            data = datlab.data.reshape(datlab.data.shape[0], init_dim)
+            setattr(self, kw, LabeledData(data=data, labels=datlab.labels))
+
+    def rescale(self, factor):
+        """
+        Rescale images by factor.
+
+        :param factor:
+        :return:
+        """
+        sess = tf.InteractiveSession()
+        for kw in self.data_groups_private:
+            datlab = getattr(self, kw)
+            images_mat = datlab.data
+            output_shape = np.multiply(images_mat.shape[1:-1], (factor, factor))
+            labels = datlab.labels
+            logger.debug("Shape of {} data before rescaling: {}".format(kw, images_mat.shape))
+            logger.debug("Expected output shape for images: {}".format(output_shape))
+            new_image = tf.image.resize_images(images_mat, output_shape).eval()
+            logger.debug("Shape of {} data after rescaling: {}".format(kw, new_image.shape))
+            setattr(self, kw, LabeledData(new_image, labels))
+        sess.close()
+
+    def to_feature_vectors(self):
+        """
+        From a feature representation (W x H x D) of the images, gives the feature vector representation of
+        dimension (N x D) with N being W x H.
+
+        :return:
+        """
+        for kw in self.data_groups_private:
+            datlab = getattr(self, kw)
+            images_mat = datlab.data
+            labels = datlab.labels
+
+            logger.debug("Shape of {} data before reshape: {}".format(kw, images_mat.shape))
+            images_mat = images_mat.reshape(images_mat.shape[0], -1, images_mat.shape[-1])
+            logger.debug("Shape of {} data after reshape: {}".format(kw, images_mat.shape))
+            setattr(self, kw, LabeledData(images_mat, labels))
+
diff --git a/skluc/data/mldatasets/MnistDataset.py b/skluc/data/mldatasets/MnistDataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..0882c48b5f6763167c630e14f5055f23cf66ee3e
--- /dev/null
+++ b/skluc/data/mldatasets/MnistDataset.py
@@ -0,0 +1,76 @@
+import gzip
+import os
+import struct
+
+import numpy as np
+
+from skluc.data.mldatasets import LabeledData
+from skluc.data.mldatasets.ImageDataset import ImageDataset
+from skluc.utils import logger
+
+
+class MnistDataset(ImageDataset):
+
+    HEIGHT = 28
+    WIDTH = 28
+    DEPTH = 1
+
+    def __init__(self, validation_size=0, seed=0, s_download_dir=None):
+        self.__s_root_url = "http://yann.lecun.com/exdb/mnist/"
+        self.__d_leaf_url = {
+            "train_data": "train-images-idx3-ubyte.gz",
+            "train_label": "train-labels-idx1-ubyte.gz",
+            "test_data": "t10k-images-idx3-ubyte.gz",
+            "test_label": "t10k-labels-idx1-ubyte.gz"
+        }
+
+        l_url = [self.__s_root_url + leaf_url for leaf_url in self.__d_leaf_url.values()]
+        if s_download_dir is not None:
+            super().__init__(l_url, "mnist", s_download_dir, validation_size=validation_size, seed=seed)
+        else:
+            super().__init__(l_url, "mnist", validation_size=validation_size, seed=seed)
+
+    @staticmethod
+    def read_gziped_ubyte(fname_img=None, fname_lbl=None):
+        """
+        loosely copied from https://gist.github.com/akesling/5358964
+
+        Python function for importing the MNIST data set. It returns the images as a
+        numpy.uint8 2D array of pixel data (one row per image) and the labels as a
+        numpy.int8 1D array.
+        """
+        # Load everything in some numpy arrays
+        logger.info("Read gziped ubyte file {}".format(fname_lbl))
+        with gzip.open(fname_lbl, 'rb') as flbl:
+            magic, num = struct.unpack(">II", flbl.read(8))
+            lbl = np.fromstring(flbl.read(), dtype=np.int8)
+
+        logger.info("Read gziped ubyte file {}".format(fname_img))
+        with gzip.open(fname_img, 'rb') as fimg:
+            magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
+            img = np.fromstring(fimg.read(), dtype=np.uint8)
+            img = img.reshape(len(lbl), -1)
+
+        return img, lbl
+
+    def read(self):
+        """
+        Return a dict of data where, for each key is associated a (data, label) tuple.
+
+        The values of the tuple are np.ndarray.
+ + :return: dict + """ + # todo add possibility to provide percentage for validation set instead of size + self._train = LabeledData( + *self.read_gziped_ubyte(os.path.join(self.s_download_dir, self.__d_leaf_url["train_data"]), + os.path.join(self.s_download_dir, self.__d_leaf_url["train_label"])) + ) + + self._test = LabeledData( + *self.read_gziped_ubyte(os.path.join(self.s_download_dir, self.__d_leaf_url["test_data"]), + os.path.join(self.s_download_dir, self.__d_leaf_url["test_label"])) + ) + + self._check_validation_size(self._train[0].shape[0]) + diff --git a/skluc/data/mldatasets/MovieReviewDataset.py b/skluc/data/mldatasets/MovieReviewDataset.py new file mode 100644 index 0000000000000000000000000000000000000000..10eb0618c763e5d68df0241410ac26b63d2fd23c --- /dev/null +++ b/skluc/data/mldatasets/MovieReviewDataset.py @@ -0,0 +1,206 @@ +import os +import re +import tarfile + +import numpy as np + +from skluc.data.mldatasets import LabeledData +from skluc.data.mldatasets.Dataset import Dataset +from skluc.utils import create_directory, check_files, logger + + +class MovieReviewV1Dataset(Dataset): + data_groups_private = ["_train"] + TRAIN_SIZE = 9000 + + def apply_transformer(self, transformer_class): + # todo, cette fonction devrait marcher pour tout dataset (donc il faudrait la mettre dans la classe Dataset) + transformer = transformer_class() + transformer_name = transformer.__class__.__name__ + transform_path = os.path.join(self.s_download_dir, transformer_name) + transform_filepaths = [os.path.join(transform_path, kw + ".npz") + for kw in self.data_groups_private] + create_directory(transform_path) + if check_files(transform_filepaths) and transformer.check_model(): + # in the case where the transformations already exist in npz files + # and the model is the actual good model + # but I have no guarantee the transformation has been obtained with the stored model though... + # todo make the npz files to store the md5 checksum of the model that has produced them + logger.debug("Files {} already exists".format(transform_filepaths)) + logger.debug("Now load data of files {}".format(transform_filepaths)) + for kw in self.data_groups_private: + npzfile_path = os.path.join(transform_path, kw + ".npz") + logger.debug("Loading {}".format(npzfile_path)) + npzfile = np.load(npzfile_path) + data = npzfile[kw + "_data"] + labels = npzfile[kw + "_labels"] + setattr(self, kw, LabeledData(data=data, labels=labels)) + # todo être plus intelligent avec le mode debug du logger. Pour l'instant je met tout en debug + else: + # in the case the transformations doesn't yet exist + # one nead to apply it to the data + # then to save the transformation + logger.debug("Files {} don't exist or model md5 checksum doesn't match. 
Need to produce them".format(transform_filepaths)) + logger.info("Apply convolution of {} to dataset {}".format(transformer_name, self.s_name)) + for kw in self.data_groups_private: + data, labels = getattr(self, kw) + transformed_data, transformed_labels = transformer.transform(data, labels) + setattr(self, kw, LabeledData(data=transformed_data, labels=transformed_labels)) + dict_attr = {kw + "_data": transformed_data, kw + "_labels": transformed_labels} + filepath = os.path.join(transform_path, kw + ".npz") + + logger.debug("Saving transformed {} data to {}".format(kw, filepath)) + np.savez(filepath, **dict_attr) + + def __init__(self, validation_size=0, seed=0, s_download_dir=None): + self.__s_url = "http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz" + + if s_download_dir is not None: + super().__init__([self.__s_url], "moviereview", s_download_dir, validation_size=validation_size, seed=seed) + else: + super().__init__([self.__s_url], "moviereview", validation_size=validation_size, seed=seed) + + self.__extracted_files = [ + 'rt-polarity.pos', + 'rt-polarity.neg' + ] + self.__extracted_dirname = os.path.join(self.s_download_dir, "rt-polaritydata") + + self.__extracted_file_paths = [os.path.join(self.__extracted_dirname, file) for file in self.__extracted_files] + + self.__counter = 1 + self.__vocab = {"<pad>": 0} + self.__reversed_vocab = {0: "<pad>"} + + @property + def vocab(self): + return self.__vocab + + @property + def vocab_inv(self): + return self.__reversed_vocab + + def read(self): + # todo faire une fonction d'extraction commune? + targz_file_path = self.l_filepaths[-1] + if not check_files(self.__extracted_file_paths): + logger.debug("Extracting {} ...".format(targz_file_path)) + tar = tarfile.open(targz_file_path, "r:gz") + tar.extractall(path=self.s_download_dir) + else: + logger.debug("File {} has already been extracted".format(targz_file_path)) + + data_labeled = MovieReviewV1Dataset.load_data_and_labels(self.__extracted_file_paths[0], + self.__extracted_file_paths[1], + encoding="ISO-8859-1") + + max_ = -1 + for l in data_labeled[0]: + max_ = max(max_, len(l.strip().split())) + + lst_arr_ex = [] + for ex in data_labeled[0]: + splitted_ex = ex.strip().split() + splitted_ex_nbr = [] + for wrd in splitted_ex: + if wrd not in self.__vocab: + self.__vocab[wrd] = self.__counter + self.__reversed_vocab[self.__counter] = wrd + self.__counter += 1 + splitted_ex_nbr.append(self.__vocab[wrd]) + arr_splitted_ex_nbr = np.pad(splitted_ex_nbr, (0, max_-len(splitted_ex_nbr)), 'constant', + constant_values=self.__vocab["<pad>"]) + lst_arr_ex.append(np.reshape(arr_splitted_ex_nbr, (1, -1))) + X = np.concatenate(lst_arr_ex, axis=0) + + self._train = LabeledData(data=X, + labels=data_labeled[1]) + + @property + def train(self): + indexes = self.permuted_index_train[:self.TRAIN_SIZE - self.validation_size] + return LabeledData(data=self._train.data[indexes], + labels=self._train.labels[indexes]) + + @property + def test(self): + indexes = self.permuted_index_train[self.TRAIN_SIZE:] + return LabeledData(data=self._train.data[indexes], + labels=self._train.labels[indexes]) + + @property + def validation(self): + indexes = self.permuted_index_train[(self.TRAIN_SIZE - self.validation_size):self.TRAIN_SIZE] + return LabeledData(data=self._train.data[indexes], + labels=self._train.labels[indexes]) + + @property + def vocabulary_length(self): + return len(self.__vocab) + + @staticmethod + def clean_str(string): + """ + Tokenization/string cleaning for all datasets 
+    @staticmethod
+    def clean_str(string):
+        """
+        Tokenization/string cleaning for all datasets except for SST.
+        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
+
+        Credit to: https://github.com/dennybritz/cnn-text-classification-tf/blob/master/data_helpers.py
+        """
+        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
+        string = re.sub(r"\'s", " \'s", string)
+        string = re.sub(r"\'ve", " \'ve", string)
+        string = re.sub(r"n\'t", " n\'t", string)
+        string = re.sub(r"\'re", " \'re", string)
+        string = re.sub(r"\'d", " \'d", string)
+        string = re.sub(r"\'ll", " \'ll", string)
+        string = re.sub(r",", " , ", string)
+        string = re.sub(r"!", " ! ", string)
+        string = re.sub(r"\(", " \( ", string)
+        string = re.sub(r"\)", " \) ", string)
+        string = re.sub(r"\?", " \? ", string)
+        string = re.sub(r"\s{2,}", " ", string)
+        return string.strip().lower()
+
+    @staticmethod
+    def load_data_and_labels(positive_data_file, negative_data_file, encoding='utf-8'):
+        """
+        Loads MR polarity data from files, splits the data into words and generates labels.
+        Returns split sentences and labels.
+
+        Credit to: https://github.com/dennybritz/cnn-text-classification-tf/blob/master/data_helpers.py
+        """
+        # Load data from files
+        positive_examples = list(open(positive_data_file, "r", encoding=encoding).readlines())
+        positive_examples = [s.strip() for s in positive_examples]
+        negative_examples = list(open(negative_data_file, "r", encoding=encoding).readlines())
+        negative_examples = [s.strip() for s in negative_examples]
+        # Split by words
+        x_text = positive_examples + negative_examples
+        x_text = [MovieReviewV1Dataset.clean_str(sent) for sent in x_text]
+        # Generate labels
+        positive_labels = [[0, 1] for _ in positive_examples]
+        negative_labels = [[1, 0] for _ in negative_examples]
+        y = np.concatenate([positive_labels, negative_labels], 0)
+        return LabeledData(data=x_text, labels=y)
+
+    # todo not yet sure the following is useful
+    # @staticmethod
+    # def batch_iter(data, batch_size, num_epochs, shuffle=True):
+    #     """
+    #     Generates a batch iterator for a dataset.
+    #     """
+    #     data = np.array(data)
+    #     data_size = len(data)
+    #     num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
+    #     for epoch in range(num_epochs):
+    #         # Shuffle the data at each epoch
+    #         if shuffle:
+    #             shuffle_indices = np.random.permutation(np.arange(data_size))
+    #             shuffled_data = data[shuffle_indices]
+    #         else:
+    #             shuffled_data = data
+    #         for batch_num in range(num_batches_per_epoch):
+    #             start_index = batch_num * batch_size
+    #             end_index = min((batch_num + 1) * batch_size, data_size)
+    #
+    #             yield shuffled_data[start_index:end_index]
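A minimal usage sketch for the class above, assuming the generic load() pipeline
(download, read, shuffle) that the other Dataset subclasses rely on; the sizes
printed here depend on the downloaded corpus and on the chosen validation_size:

    from skluc.data.mldatasets import MovieReviewV1Dataset

    mr = MovieReviewV1Dataset(validation_size=500, seed=0)
    mr.load()                      # fetches and extracts rt-polaritydata.tar.gz, then encodes it
    print(mr.train.data.shape)     # (TRAIN_SIZE - validation_size, max_sentence_length)
    print(mr.vocabulary_length)    # size of the word -> id mapping, "<pad>" included
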
diff --git a/skluc/data/mldatasets/SVHNDataset.py b/skluc/data/mldatasets/SVHNDataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb15a84ced44bc16b9edac0525944c0474e4f2f6
--- /dev/null
+++ b/skluc/data/mldatasets/SVHNDataset.py
@@ -0,0 +1,61 @@
+import os
+
+import numpy as np
+import scipy.io as sio
+
+from skluc.data.mldatasets import LabeledData
+from skluc.data.mldatasets.ImageDataset import ImageDataset
+from skluc.utils import logger
+
+
+class SVHNDataset(ImageDataset):
+
+    HEIGHT = 32
+    WIDTH = 32
+    DEPTH = 3
+
+    def __init__(self, validation_size=0, seed=0, s_download_dir=None):
+        self.__s_root_url = "http://ufldl.stanford.edu/housenumbers/"
+        self.__d_leaf_url = {
+            "train_data": "train_32x32.mat",
+            "test_data": "test_32x32.mat",
+        }
+
+        l_url = [self.__s_root_url + leaf_url for leaf_url in self.__d_leaf_url.values()]
+        if s_download_dir is not None:
+            super().__init__(l_url, "svhn", s_download_dir, validation_size=validation_size, seed=seed)
+        else:
+            super().__init__(l_url, "svhn", validation_size=validation_size, seed=seed)
+
+    @staticmethod
+    def read_mat(fname):
+        """
+        Loosely adapted from https://stackoverflow.com/questions/29185493/read-svhn-dataset-in-python
+
+        Python function for importing the SVHN data set.
+        """
+        # Load everything into numpy arrays; labels keep the SVHN convention (1 to 10, where 10 stands for the digit 0)
+        logger.info("Read mat file {}".format(fname))
+        data = sio.loadmat(fname)
+        img = np.moveaxis(data['X'], -1, 0)
+        lbl = data['y']
+        return img, lbl
+
+    def read(self):
+        """
+        Load the train and test sets as (data, labels) tuples of np.ndarray and store them on the instance.
+
+        :return: None
+        """
+        # todo add possibility to provide percentage for validation set instead of size
+        self._train = LabeledData(
+            *self.read_mat(os.path.join(self.s_download_dir, self.__d_leaf_url["train_data"]))
+        )
+
+        self._test = LabeledData(
+            *self.read_mat(os.path.join(self.s_download_dir, self.__d_leaf_url["test_data"]))
+        )
+
+        self._check_validation_size(self._train[0].shape[0])
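Note that the raw SVHN labels read above range from 1 to 10, with 10 denoting the
digit 0. A short sketch of the remapping a consumer of read_mat() would typically
apply (the .mat path is hypothetical and assumes the file has already been downloaded):

    import numpy as np
    from skluc.data.mldatasets import SVHNDataset

    img, lbl = SVHNDataset.read_mat("train_32x32.mat")
    lbl = lbl.flatten()
    lbl[lbl == 10] = 0                       # SVHN stores the digit 0 as label 10
    assert img.shape[1:] == (32, 32, 3)      # N x HEIGHT x WIDTH x DEPTH after moveaxis
    assert lbl.min() == 0 and lbl.max() == 9
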
diff --git a/skluc/data/mldatasets/__init__.py b/skluc/data/mldatasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed52441e6cf97325e41a7230f2ee02c83fa0a942
--- /dev/null
+++ b/skluc/data/mldatasets/__init__.py
@@ -0,0 +1,34 @@
+"""
+This module defines the Dataset classes useful for downloading and loading datasets as numpy.ndarrays.
+
+The currently implemented datasets are:
+    - mnist
+    - cifar10
+    - cifar100
+    - svhn
+    - moviereview
+"""
+
+import collections
+
+# LabeledData must be defined before the dataset modules are imported below, because
+# those modules import it back from this package; defining it afterwards would make
+# the package fail at import time with a circular-import error.
+LabeledData = collections.namedtuple("LabeledData", ["data", "labels"])
+
+from skluc.data.mldatasets.Cifar100FineDataset import Cifar100FineDataset
+from skluc.data.mldatasets.Cifar10Dataset import Cifar10Dataset
+from skluc.data.mldatasets.MnistDataset import MnistDataset
+from skluc.data.mldatasets.MovieReviewDataset import MovieReviewV1Dataset
+from skluc.data.mldatasets.SVHNDataset import SVHNDataset
+
+
+if __name__ == "__main__":
+    d = Cifar100FineDataset(validation_size=10000)
+    d.load()
+    print("Before preprocessing")
+    print(d.train.data.shape, d.train.labels.shape)
+    print(d.validation.data.shape, d.validation.labels.shape)
+    print(d.test.data.shape, d.test.labels.shape)
+    # d.apply_transformer(VGG19SvhnTransformer)
+    # print("After vgg19 preprocessing")
+    # print(d.train.data.shape, d.train.labels.shape)
+    # print(d.validation.data.shape, d.validation.labels.shape)
+    # print(d.test.data.shape, d.test.labels.shape)
diff --git a/skluc/test/test_mldatasets.py b/skluc/test/test_mldatasets.py
index 8be185edd94e83b21aec99cf875762896badadea..66b77b8486fd4a0b6f16ab91e16105ccd29d6e9c 100644
--- a/skluc/test/test_mldatasets.py
+++ b/skluc/test/test_mldatasets.py
@@ -2,6 +2,7 @@ import os
 import unittest
 
 import skluc.data.mldatasets as dataset
+from skluc.utils import silentremove
 
 
 class TestMnistDataset(unittest.TestCase):
@@ -28,7 +29,7 @@ class TestMnistDataset(unittest.TestCase):
 
     def tearDown(self):
         for name in self.full_mnist_names:
-            dataset.silentremove(name)
+            silentremove(name)
 
 
 class TestCifar10Dataset(unittest.TestCase):
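A possible companion test for the new SVHN loader, following the structure of the
existing test classes; exact set sizes are not asserted because they depend on the
downloaded .mat files, and load() is assumed to perform the usual download, read
and shuffle steps:

    import unittest

    import skluc.data.mldatasets as dataset


    class TestSVHNDataset(unittest.TestCase):
        def setUp(self):
            self.data = dataset.SVHNDataset(validation_size=1000)
            self.data.load()

        def test_shapes(self):
            # data and labels must stay aligned, and the class constants must describe 32x32 RGB images
            self.assertEqual(self.data.train.data.shape[0], self.data.train.labels.shape[0])
            self.assertEqual((dataset.SVHNDataset.HEIGHT, dataset.SVHNDataset.WIDTH, dataset.SVHNDataset.DEPTH),
                             (32, 32, 3))


    if __name__ == "__main__":
        unittest.main()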