diff --git a/setup.py b/setup.py
index 845e6cd7fdc841d7b302c739c7c62a50ebdee84c..7bcd4999578f875f747837d43af6333465e967fc 100644
--- a/setup.py
+++ b/setup.py
@@ -38,7 +38,9 @@ setup(
         'scikit-learn',
         'numba',
         'keras',
-        'scipy'],
+        'scipy',
+        'psutil',
+        'imageio'],
     # classifiers is needed for uploading package on pypi.
     # The list of classifiers elements can be found at :
     # https://pypi.python.org/pypi?%3Aaction=list_classifiers
diff --git a/skluc/data/mldatasets/Cifar100FineDataset.py b/skluc/data/mldatasets/Cifar100FineDataset.py
index 83b6747e04d779e0ae1425256a4834caac037548..2cbc5406bb2ac44cdca831c2db9637f3c7e0dfcd 100644
--- a/skluc/data/mldatasets/Cifar100FineDataset.py
+++ b/skluc/data/mldatasets/Cifar100FineDataset.py
@@ -4,7 +4,7 @@ import tarfile
 
 import numpy as np
 
-from skluc.data.mldatasets import LabeledData
+from skluc.utils import LabeledData
 from skluc.data.mldatasets.ImageDataset import ImageDataset
 from skluc.utils import logger, check_files
 
diff --git a/skluc/data/mldatasets/Cifar10Dataset.py b/skluc/data/mldatasets/Cifar10Dataset.py
index c48f8364efd6fa04fe79c1e5c823e460499a395f..5e2292b58a52ec5b12b7bf82ae5a7c0f98f5b944 100644
--- a/skluc/data/mldatasets/Cifar10Dataset.py
+++ b/skluc/data/mldatasets/Cifar10Dataset.py
@@ -4,7 +4,7 @@ import tarfile
 
 import numpy as np
 
-from skluc.data.mldatasets import LabeledData
+from skluc.utils import LabeledData
 from skluc.data.mldatasets.ImageDataset import ImageDataset
 from skluc.utils import logger, check_files
 
diff --git a/skluc/data/mldatasets/Dataset.py b/skluc/data/mldatasets/Dataset.py
index 67a071a07d036883b8ef80b1162425130d082591..f30550cec2e76a3e78a12c8b09258ea5abb1ed56 100644
--- a/skluc/data/mldatasets/Dataset.py
+++ b/skluc/data/mldatasets/Dataset.py
@@ -4,7 +4,7 @@ import numpy as np
 from sklearn.cross_validation import train_test_split
 from sklearn.preprocessing import LabelBinarizer
 
-from skluc.data.mldatasets import LabeledData
+from skluc.utils import LabeledData
 from skluc.utils import logger, check_files, silentremove, download_data, create_directory
 
 
diff --git a/skluc/data/mldatasets/ImageDataset.py b/skluc/data/mldatasets/ImageDataset.py
index 383950e171d3d07dd4be038ebe807bf0188ba9ae..9232eb2465b4d91f9effbde7ad8d6d4c5f0b7b47 100644
--- a/skluc/data/mldatasets/ImageDataset.py
+++ b/skluc/data/mldatasets/ImageDataset.py
@@ -3,7 +3,7 @@ import os
 import numpy as np
 import tensorflow as tf
 
-from skluc.data.mldatasets import LabeledData
+from skluc.utils import LabeledData
 from skluc.data.mldatasets.Dataset import Dataset
 from skluc.utils import logger, create_directory, check_files
 
diff --git a/skluc/data/mldatasets/MnistDataset.py b/skluc/data/mldatasets/MnistDataset.py
index 0882c48b5f6763167c630e14f5055f23cf66ee3e..cef4adfb831013d9d9371f32fdd8c6786a194356 100644
--- a/skluc/data/mldatasets/MnistDataset.py
+++ b/skluc/data/mldatasets/MnistDataset.py
@@ -3,7 +3,7 @@ import os
 
 import numpy as np
 
-from skluc.data.mldatasets import LabeledData
+from skluc.utils import LabeledData
 from skluc.data.mldatasets.ImageDataset import ImageDataset
 from skluc.utils import logger
 
diff --git a/skluc/data/mldatasets/MovieReviewDataset.py b/skluc/data/mldatasets/MovieReviewDataset.py
index 10eb0618c763e5d68df0241410ac26b63d2fd23c..9d4e28b3d8c1eeeb288733c85c4e87dc4580fd2e 100644
--- a/skluc/data/mldatasets/MovieReviewDataset.py
+++ b/skluc/data/mldatasets/MovieReviewDataset.py
@@ -4,7 +4,7 @@ import tarfile
 
 import numpy as np
 
-from skluc.data.mldatasets import LabeledData
+from skluc.utils import LabeledData
 from skluc.data.mldatasets.Dataset import Dataset
 from skluc.utils import create_directory, check_files, logger
 
diff --git a/skluc/data/mldatasets/OmniglotDataset.py b/skluc/data/mldatasets/OmniglotDataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..013f632a3f5319d3217d57a04afa3e92fe025049
--- /dev/null
+++ b/skluc/data/mldatasets/OmniglotDataset.py
@@ -0,0 +1,77 @@
+import os
+import zipfile
+
+import numpy as np
+import imageio
+import matplotlib.pyplot as plt
+
+from skluc.utils import LabeledData
+from skluc.data.mldatasets.ImageDataset import ImageDataset
+from skluc.utils import logger, check_files
+
+
+class OmniglotDataset(ImageDataset):
+    HEIGHT = 32
+    WIDTH = 32
+    DEPTH = 3
+
+    def __init__(self, validation_size=0, seed=None, s_download_dir=None):
+        self.__s_url = ["https://github.com/brendenlake/omniglot/raw/master/python/images_background.zip",
+                        "https://github.com/brendenlake/omniglot/raw/master/python/images_evaluation.zip"
+                        ]
+        self.meta = None
+        name = "omniglot"
+        if s_download_dir is not None:
+            super().__init__(self.__s_url, name, s_download_dir, validation_size=validation_size, seed=seed)
+        else:
+            super().__init__(self.__s_url, name, validation_size=validation_size, seed=seed)
+
+        self.__extracted_dirs = [
+            os.path.join(self.s_download_dir, "images_background"),
+            os.path.join(self.s_download_dir, "images_evaluation")
+        ]
+
+        # self.__extracted_file_paths = [os.path.join(self.__extracted_dirname, file) for file in self.__extracted_files]
+
+    def get_omniglot_data(self, tag):
+        data_dirname = "images_" + tag
+        data_dirpath = os.path.join(self.s_download_dir, data_dirname)
+        class_index = 0
+        list_of_images = []
+        list_of_labels = []
+        for alphabet_name in os.listdir(data_dirpath):
+            data_alphabet_dirpath = os.path.join(data_dirpath, alphabet_name)
+            for char in os.listdir(data_alphabet_dirpath):
+                charname = alphabet_name + "_" + char[-2:]
+                data_char_dirpath = os.path.join(data_alphabet_dirpath, char)
+                for char_image_file in os.listdir(data_char_dirpath):
+                    char_image_path = os.path.join(data_char_dirpath, char_image_file)
+                    im = imageio.imread(char_image_path)
+                    list_of_images.append(im.flatten())
+                    list_of_labels.append(class_index)
+
+                class_index += 1
+        return np.array(list_of_images), np.array(list_of_labels)
+
+    def read(self):
+        if not check_files(self.__extracted_dirs):
+            logger.debug("Extracting {} ...".format(self.l_filepaths))
+            for zip_file in self.l_filepaths:
+                zip_ref = zipfile.ZipFile(zip_file, 'r')
+                zip_ref.extractall(self.s_download_dir)
+                zip_ref.close()
+        else:
+            logger.debug("Files {} have already been extracted".format(self.l_filepaths))
+
+        logger.debug("Get training data of dataset {}".format(self.s_name))
+        self._train = LabeledData(*self.get_omniglot_data('background'))
+
+        logger.debug("Get testing data of dataset {}".format(self.s_name))
+        self._test = LabeledData(*self.get_omniglot_data('evaluation'))
+
+        self._check_validation_size(self._train[0].shape[0])
+
+
+if __name__ == "__main__":
+    d = OmniglotDataset(validation_size=10000)
+    d.load()
\ No newline at end of file
diff --git a/skluc/data/mldatasets/SVHNDataset.py b/skluc/data/mldatasets/SVHNDataset.py
index bb15a84ced44bc16b9edac0525944c0474e4f2f6..dc54adb5ba78a993de95f0fe15517655e98adc8f 100644
--- a/skluc/data/mldatasets/SVHNDataset.py
+++ b/skluc/data/mldatasets/SVHNDataset.py
@@ -3,7 +3,7 @@ import os
 import numpy as np
 import scipy.io as sio
 
-from skluc.data.mldatasets import LabeledData
+from skluc.utils import LabeledData
 from skluc.data.mldatasets.ImageDataset import ImageDataset
 from skluc.utils import logger
 
diff --git a/skluc/data/mldatasets/__init__.py b/skluc/data/mldatasets/__init__.py
index ed52441e6cf97325e41a7230f2ee02c83fa0a942..e3ac016159c12b9a06cf2109c61a23c70edefbdf 100644
--- a/skluc/data/mldatasets/__init__.py
+++ b/skluc/data/mldatasets/__init__.py
@@ -9,25 +9,25 @@ The currently implemented datasets are:
 - moviereview
 """
 
-import collections
 
 from skluc.data.mldatasets.Cifar100FineDataset import Cifar100FineDataset
 from skluc.data.mldatasets.Cifar10Dataset import Cifar10Dataset
 from skluc.data.mldatasets.MnistDataset import MnistDataset
 from skluc.data.mldatasets.MovieReviewDataset import MovieReviewV1Dataset
+from skluc.data.mldatasets.OmniglotDataset import OmniglotDataset
 from skluc.data.mldatasets.SVHNDataset import SVHNDataset
 
-LabeledData = collections.namedtuple("LabeledData", ["data", "labels"])
+__all__ = ["Cifar10Dataset", "Cifar100FineDataset", "MnistDataset", "OmniglotDataset", "MovieReviewV1Dataset", "SVHNDataset"]
 
 
 if __name__ == "__main__":
-    d = Cifar100FineDataset(validation_size=10000)
+    d = OmniglotDataset(validation_size=10000)
     d.load()
-    print("Before preprocessing")
-    print(d.train.data.shape, d.train.labels.shape)
-    print(d.validation.data.shape, d.validation.labels.shape)
-    print(d.test.data.shape, d.test.labels.shape)
-    # d.apply_transformer(VGG19SvhnTransformer)
+    # print("Before preprocessing")
+    # print(d.train.data.shape, d.train.labels.shape)
+    # print(d.validation.data.shape, d.validation.labels.shape)
+    # print(d.test.data.shape, d.test.labels.shape)
+    # # d.apply_transformer(VGG19SvhnTransformer)
     # print("After vgg19 preprocessing")
     # print(d.train.data.shape, d.train.labels.shape)
     # print(d.validation.data.shape, d.validation.labels.shape)
diff --git a/skluc/utils.py b/skluc/utils.py
index e9e93de739aca5ca33f4dfd99b9faed271922c75..42f9d43b9cebc368790d7273a1eaf7f493025ea3 100644
--- a/skluc/utils.py
+++ b/skluc/utils.py
@@ -10,6 +10,8 @@ from weakref import WeakValueDictionary
 import daiquiri
 import numpy as np
 import psutil
+import collections
+
 from sklearn.metrics.pairwise import additive_chi2_kernel
 
 daiquiri.setup(level=logging.DEBUG)
@@ -295,6 +297,9 @@ def deprecated(msg=""):
         return new_func
     return inner
 
+LabeledData = collections.namedtuple("LabeledData", ["data", "labels"])
+
+
 if __name__ == "__main__":
     a = np.identity(1000)
     print(compute_euristic_sigma(a))
\ No newline at end of file
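
For reference, a minimal usage sketch of the new OmniglotDataset class added by this patch. This is a sketch, not part of the patch: it assumes the load() method and the train/test accessors inherited from the Dataset base class (as exercised in the __main__ blocks above), and the validation_size value is illustrative.

    from skluc.data.mldatasets import OmniglotDataset

    # Download the two zip archives, extract them, and read the
    # background/evaluation directories into the train/test splits.
    d = OmniglotDataset(validation_size=1000)
    d.load()

    # Each split is a LabeledData namedtuple, now defined in skluc.utils.
    print(d.train.data.shape, d.train.labels.shape)
    print(d.test.data.shape, d.test.labels.shape)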