From 529d0a4e8c45c6f1b19bb642712fcfec8ccc553a Mon Sep 17 00:00:00 2001 From: Luc Giffon <luc.giffon@lis-lab.fr> Date: Mon, 13 Aug 2018 11:32:38 +0200 Subject: [PATCH] Add Omniglot dataset Fix bug on labeled data (recursive imports): labeled data is now in skluc.utils --- setup.py | 4 +- skluc/data/mldatasets/Cifar100FineDataset.py | 2 +- skluc/data/mldatasets/Cifar10Dataset.py | 2 +- skluc/data/mldatasets/Dataset.py | 2 +- skluc/data/mldatasets/ImageDataset.py | 2 +- skluc/data/mldatasets/MnistDataset.py | 2 +- skluc/data/mldatasets/MovieReviewDataset.py | 2 +- skluc/data/mldatasets/OmniglotDataset.py | 77 ++++++++++++++++++++ skluc/data/mldatasets/SVHNDataset.py | 2 +- skluc/data/mldatasets/__init__.py | 16 ++-- skluc/utils.py | 5 ++ 11 files changed, 100 insertions(+), 16 deletions(-) create mode 100644 skluc/data/mldatasets/OmniglotDataset.py diff --git a/setup.py b/setup.py index 845e6cd..7bcd499 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,9 @@ setup( 'scikit-learn', 'numba', 'keras', - 'scipy'], + 'scipy', + 'psutil', + 'imageio'], # classifiers is needed for uploading package on pypi. # The list of classifiers elements can be found at : # https://pypi.python.org/pypi?%3Aaction=list_classifiers diff --git a/skluc/data/mldatasets/Cifar100FineDataset.py b/skluc/data/mldatasets/Cifar100FineDataset.py index 83b6747..2cbc540 100644 --- a/skluc/data/mldatasets/Cifar100FineDataset.py +++ b/skluc/data/mldatasets/Cifar100FineDataset.py @@ -4,7 +4,7 @@ import tarfile import numpy as np -from skluc.data.mldatasets import LabeledData +from skluc.utils import LabeledData from skluc.data.mldatasets.ImageDataset import ImageDataset from skluc.utils import logger, check_files diff --git a/skluc/data/mldatasets/Cifar10Dataset.py b/skluc/data/mldatasets/Cifar10Dataset.py index c48f836..5e2292b 100644 --- a/skluc/data/mldatasets/Cifar10Dataset.py +++ b/skluc/data/mldatasets/Cifar10Dataset.py @@ -4,7 +4,7 @@ import tarfile import numpy as np -from skluc.data.mldatasets import LabeledData +from skluc.utils import LabeledData from skluc.data.mldatasets.ImageDataset import ImageDataset from skluc.utils import logger, check_files diff --git a/skluc/data/mldatasets/Dataset.py b/skluc/data/mldatasets/Dataset.py index 67a071a..f30550c 100644 --- a/skluc/data/mldatasets/Dataset.py +++ b/skluc/data/mldatasets/Dataset.py @@ -4,7 +4,7 @@ import numpy as np from sklearn.cross_validation import train_test_split from sklearn.preprocessing import LabelBinarizer -from skluc.data.mldatasets import LabeledData +from skluc.utils import LabeledData from skluc.utils import logger, check_files, silentremove, download_data, create_directory diff --git a/skluc/data/mldatasets/ImageDataset.py b/skluc/data/mldatasets/ImageDataset.py index 383950e..9232eb2 100644 --- a/skluc/data/mldatasets/ImageDataset.py +++ b/skluc/data/mldatasets/ImageDataset.py @@ -3,7 +3,7 @@ import os import numpy as np import tensorflow as tf -from skluc.data.mldatasets import LabeledData +from skluc.utils import LabeledData from skluc.data.mldatasets.Dataset import Dataset from skluc.utils import logger, create_directory, check_files diff --git a/skluc/data/mldatasets/MnistDataset.py b/skluc/data/mldatasets/MnistDataset.py index 0882c48..cef4adf 100644 --- a/skluc/data/mldatasets/MnistDataset.py +++ b/skluc/data/mldatasets/MnistDataset.py @@ -3,7 +3,7 @@ import os import numpy as np -from skluc.data.mldatasets import LabeledData +from skluc.utils import LabeledData from skluc.data.mldatasets.ImageDataset import ImageDataset from skluc.utils import logger diff --git a/skluc/data/mldatasets/MovieReviewDataset.py b/skluc/data/mldatasets/MovieReviewDataset.py index 10eb061..9d4e28b 100644 --- a/skluc/data/mldatasets/MovieReviewDataset.py +++ b/skluc/data/mldatasets/MovieReviewDataset.py @@ -4,7 +4,7 @@ import tarfile import numpy as np -from skluc.data.mldatasets import LabeledData +from skluc.utils import LabeledData from skluc.data.mldatasets.Dataset import Dataset from skluc.utils import create_directory, check_files, logger diff --git a/skluc/data/mldatasets/OmniglotDataset.py b/skluc/data/mldatasets/OmniglotDataset.py new file mode 100644 index 0000000..013f632 --- /dev/null +++ b/skluc/data/mldatasets/OmniglotDataset.py @@ -0,0 +1,77 @@ +import os +import zipfile + +import numpy as np +import imageio +import matplotlib.pyplot as plt + +from skluc.utils import LabeledData +from skluc.data.mldatasets.ImageDataset import ImageDataset +from skluc.utils import logger, check_files + + +class OmniglotDataset(ImageDataset): + HEIGHT = 32 + WIDTH = 32 + DEPTH = 3 + + def __init__(self, validation_size=0, seed=None, s_download_dir=None): + self.__s_url = ["https://github.com/brendenlake/omniglot/raw/master/python/images_background.zip", + "https://github.com/brendenlake/omniglot/raw/master/python/images_evaluation.zip" + ] + self.meta = None + name = "omniglot" + if s_download_dir is not None: + super().__init__(self.__s_url, name, s_download_dir, validation_size=validation_size, seed=seed) + else: + super().__init__(self.__s_url, name, validation_size=validation_size, seed=seed) + + self.__extracted_dirs = [ + os.path.join(self.s_download_dir, "images_background"), + os.path.join(self.s_download_dir, "images_evaluation") + ] + + # self.__extracted_file_paths = [os.path.join(self.__extracted_dirname, file) for file in self.__extracted_files] + + def get_omniglot_data(self, tag): + data_dirname = "images_" + tag + data_dirpath = os.path.join(self.s_download_dir, data_dirname) + class_index = 0 + list_of_images = [] + list_of_labels = [] + for alphabet_name in os.listdir(data_dirpath): + data_alphabet_dirpath = os.path.join(data_dirpath, alphabet_name) + for char in os.listdir(data_alphabet_dirpath): + charname = alphabet_name + "_" + char[-2:] + data_char_dirpath = os.path.join(data_alphabet_dirpath, char) + for char_image_file in os.listdir(data_char_dirpath): + char_image_path = os.path.join(data_char_dirpath, char_image_file) + im = imageio.imread(char_image_path) + list_of_images.append(im.flatten()) + list_of_labels.append(class_index) + + class_index += 1 + return np.array(list_of_images), np.array(list_of_labels) + + def read(self): + if not check_files(self.__extracted_dirs): + logger.debug("Extracting {} ...".format(self.l_filepaths)) + for zip_file in self.l_filepaths: + zip_ref = zipfile.ZipFile(zip_file, 'r') + zip_ref.extractall(self.s_download_dir) + zip_ref.close() + else: + logger.debug("Files {} have already been extracted".format(self.l_filepaths)) + + logger.debug("Get training data of dataset {}".format(self.s_name)) + self._train = LabeledData(*self.get_omniglot_data('background')) + + logger.debug("Get testing data of dataset {}".format(self.s_name)) + self._test = LabeledData(*self.get_omniglot_data('evaluation')) + + self._check_validation_size(self._train[0].shape[0]) + + +if __name__ == "__main__": + d = OmniglotDataset(validation_size=10000) + d.load() \ No newline at end of file diff --git a/skluc/data/mldatasets/SVHNDataset.py b/skluc/data/mldatasets/SVHNDataset.py index bb15a84..dc54adb 100644 --- a/skluc/data/mldatasets/SVHNDataset.py +++ b/skluc/data/mldatasets/SVHNDataset.py @@ -3,7 +3,7 @@ import os import numpy as np import scipy.io as sio -from skluc.data.mldatasets import LabeledData +from skluc.utils import LabeledData from skluc.data.mldatasets.ImageDataset import ImageDataset from skluc.utils import logger diff --git a/skluc/data/mldatasets/__init__.py b/skluc/data/mldatasets/__init__.py index ed52441..e3ac016 100644 --- a/skluc/data/mldatasets/__init__.py +++ b/skluc/data/mldatasets/__init__.py @@ -9,25 +9,25 @@ The currently implemented datasets are: - moviereview """ -import collections from skluc.data.mldatasets.Cifar100FineDataset import Cifar100FineDataset from skluc.data.mldatasets.Cifar10Dataset import Cifar10Dataset from skluc.data.mldatasets.MnistDataset import MnistDataset from skluc.data.mldatasets.MovieReviewDataset import MovieReviewV1Dataset +from skluc.data.mldatasets.OmniglotDataset import OmniglotDataset from skluc.data.mldatasets.SVHNDataset import SVHNDataset -LabeledData = collections.namedtuple("LabeledData", ["data", "labels"]) +__all__ = ["Cifar10Dataset", "Cifar100FineDataset", "MnistDataset", "OmniglotDataset", "MovieReviewV1Dataset", "SVHNDataset"] if __name__ == "__main__": - d = Cifar100FineDataset(validation_size=10000) + d = OmniglotDataset(validation_size=10000) d.load() - print("Before preprocessing") - print(d.train.data.shape, d.train.labels.shape) - print(d.validation.data.shape, d.validation.labels.shape) - print(d.test.data.shape, d.test.labels.shape) - # d.apply_transformer(VGG19SvhnTransformer) + # print("Before preprocessing") + # print(d.train.data.shape, d.train.labels.shape) + # print(d.validation.data.shape, d.validation.labels.shape) + # print(d.test.data.shape, d.test.labels.shape) + # # d.apply_transformer(VGG19SvhnTransformer) # print("After vgg19 preprocessing") # print(d.train.data.shape, d.train.labels.shape) # print(d.validation.data.shape, d.validation.labels.shape) diff --git a/skluc/utils.py b/skluc/utils.py index e9e93de..42f9d43 100644 --- a/skluc/utils.py +++ b/skluc/utils.py @@ -10,6 +10,8 @@ from weakref import WeakValueDictionary import daiquiri import numpy as np import psutil +import collections + from sklearn.metrics.pairwise import additive_chi2_kernel daiquiri.setup(level=logging.DEBUG) @@ -295,6 +297,9 @@ def deprecated(msg=""): return new_func return inner +LabeledData = collections.namedtuple("LabeledData", ["data", "labels"]) + + if __name__ == "__main__": a = np.identity(1000) print(compute_euristic_sigma(a)) \ No newline at end of file -- GitLab