Skip to content
Snippets Groups Projects
Commit 27892406 authored by Luc Giffon's avatar Luc Giffon
Browse files

first commit: readme, manifest, license, authors, mldatasets with mnist support only

parents
No related branches found
No related tags found
No related merge requests found
.idea
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
.static_storage/
.media/
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
Credits
=======
scikit-luc has been written by Luc Giffon.
Contributors
------------
Luc Giffon
/*
* ----------------------------------------------------------------------------
* "THE BEER-WARE LICENSE" (Revision 42):
* <luc.giffon[at]lif.univ-mrs.fr> wrote this file. As long as you retain this notice you
* can do whatever you want with this stuff. If we meet some day, and you think
* this stuff is worth it, you can buy me a beer in return Luc Giffon.
* ----------------------------------------------------------------------------
*/
\ No newline at end of file
include *.rst
\ No newline at end of file
Scikit-Luc
==========
This is a Python package containing the useful scripts developed or found by Luc Giffon during his PhD thesis.
Installation from sources
=========================
In the root directory of the package, just do:
python setup.py install
or
pip install -e .
setup.py 0 → 100644
import os
from setuptools import setup, find_packages
import skluc
def read(*paths):
    """Build a file path from *paths* and return the contents.

    :param paths: path components, joined with ``os.path.join``
    :return: the text content of the file
    """
    # Explicit encoding: the default locale encoding varies across
    # platforms and would make long_description decoding unreliable.
    with open(os.path.join(*paths), 'r', encoding='utf-8') as f:
        return f.read()
setup(
    # name of the package
    name='scikit-luc',
    # Without an explicit "packages" argument nothing is actually installed;
    # find_packages picks up the skluc package automatically.
    packages=find_packages(exclude=['doc', 'examples']),
    # See PEP440 for defining a proper version number
    version='1.0',
    # Small description of the package
    description='Science-Kit developed by Luc Giffon for Luc Giffon during his PhD thesis.',
    # Long description, assembled from the distributed .rst files
    long_description=(read('README.rst') + '\n\n' +
                      read('AUTHORS.rst') + '\n\n' +
                      read('LICENSE.rst') + '\n\n'),
    # Project home page:
    url='',
    # license, author and author email
    license='THE BEER-WARE LICENSE',
    author='Luc Giffon',
    author_email='luc.giffon@lif.univ-mrs.fr',
    # If any packages contains data which are not python files, include them
    # package_data={'myapp': 'data/*.gif'},
    install_requires=['daiquiri',
                      'numpy'],
    # classifiers is needed for uploading package on pypi.
    # The list of classifiers elements can be found at :
    # https://pypi.python.org/pypi?%3Aaction=list_classifiers
    classifiers=[
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        'Natural Language :: English',
        'Topic :: Software Development',
        'Topic :: Scientific/Engineering',
        'License :: THE BEER-WARE LICENSE',
        'Operating System :: Unix',
        'Programming Language :: Python :: 3 :: Only',
    ],
    # NOTE(review): these look like platform names, not topical keywords —
    # they probably belong in the commented "platforms" field below; confirm.
    keywords=['Linux', 'MacOSX', 'Windows'],
    # Platforms on which the package can be installed:
    # platforms=['Linux'],
    # To provide executable scripts, use entry points in preference to the
    # "scripts" keyword. Entry points provide cross-platform support and allow
    # pip to create the appropriate form of executable for the target platform.
    # entry_points={
    #     'console_scripts': [
    #         'myapp=myapp:main',
    #     ],
    # },
)
"""
This package contains some usefull functions for the PhD of Luc Giffon.
"""
__version__ = "1.0"
\ No newline at end of file
"""
This module defines the Dataset classes usefull for downloading and loading datasets as numpy.ndarrays.
The currently implemented datasets are:
- mnist
"""
import urllib.request
import struct
import os, errno
import gzip
import logging
# --- installed packages
import daiquiri
import numpy as np
__all__ = ['MnistDataset']
daiquiri.setup(level=logging.DEBUG)
logger = daiquiri.getLogger()
def silentremove(filename):
    """
    Remove *filename* without raising an error if the file doesn't exist.

    :param filename: The filename
    :type filename: str
    :return: None
    """
    try:
        os.remove(filename)
    except OSError as err:
        # ENOENT ("no such file or directory") is precisely the case we
        # want to silence; anything else is a genuine failure.
        if err.errno != errno.ENOENT:
            raise
        logger.debug("{} directory doesn't exist".format(filename))
    else:
        logger.info("{} has been removed".format(filename))
def create_directory(_dir):
    """
    Create the directory *_dir* if it does not exist yet.

    :param _dir: the path to the directory to be created
    :return: None
    """
    try:
        os.makedirs(_dir)
    except OSError as err:
        # EEXIST is the expected outcome when the directory is already
        # there; any other errno is a real problem and must propagate.
        if err.errno != errno.EEXIST:
            raise
        logger.debug("{} directory already exists".format(_dir))
    else:
        logger.info("{} directory has been created".format(_dir))
def retrieve_data(url, directory, name):
    """
    Download the file at the given url into ``directory/name``.

    :param url: the end-point url of the needed file
    :type url: str
    :param directory: the target directory where to download the file
    :type directory: str
    :param name: the name of the target file downloaded
    :type name: str
    :return: None
    """
    target_path = os.path.join(directory, name)
    urllib.request.urlretrieve(url, target_path)
    logger.info("{} has been downloaded.".format(url))
class Dataset:
    """
    Abstract class implementing basic methods for Dataset retrieval.

    Subclasses must override :func:`read` to parse the downloaded files.
    """
    def __init__(self, l_url, s_name, s_download_dir=os.path.join(os.path.expanduser("~"), "ml_datasets")):
        """
        :param l_url: urls of the files composing the dataset
        :type l_url: list
        :param s_name: name of the dataset, used as the sub-directory name
        :type s_name: str
        :param s_download_dir: root directory under which datasets are stored
        :type s_download_dir: str
        """
        self.l_url = l_url
        # The local name of each remote file is the last component of its url.
        self.l_filenames = [url.split("/")[-1] for url in self.l_url]
        self.s_name = s_name
        self.s_download_dir = os.path.join(s_download_dir, self.s_name)
        self.l_filepaths = [os.path.join(self.s_download_dir, fname) for fname in self.l_filenames]

    def download(self):
        """
        Download every file of the dataset into the target directory.

        :return: None
        """
        # the download method shouldn't be called if the target directory tree doesn't exist.
        assert self.check_directory_tree()
        # Reuse the module-level helper so every download is logged consistently,
        # and pair each url with its precomputed filename instead of re-splitting.
        for s_url, s_file_name in zip(self.l_url, self.l_filenames):
            retrieve_data(s_url, self.s_download_dir, s_file_name)

    def check_directory_tree(self):
        """
        Tell whether the target directory tree exists.

        :return: bool
        """
        return os.path.exists(self.s_download_dir)

    def create_directory_tree(self):
        """
        Create the target directory tree.

        :return: None
        """
        create_directory(self.s_download_dir)

    def check_files(self):
        """
        Tell whether every expected dataset file is present on disk.

        :return: bool
        """
        return all(os.path.exists(fpath) for fpath in self.l_filepaths)

    def load(self):
        """
        Ensure the dataset files are available locally (downloading them when
        missing) and parse them with :func:`read`.

        :return: whatever :func:`read` returns
        """
        self.create_directory_tree()
        if not self.check_files():
            logger.info("Files need to be downloaded")
            # Remove possibly-partial leftovers before re-downloading.
            for s_fpath in self.l_filepaths:
                silentremove(s_fpath)
            self.download()
        else:
            logger.debug("Files already exists")
        return self.read()

    # --- Abstract methods
    def read(self):
        """Parse the downloaded files; must be overridden by subclasses."""
        pass
class MnistDataset(Dataset):
    """
    Dataset implementation for the MNIST handwritten-digit database.
    """
    def __init__(self, s_download_dir=None):
        self.__s_root_url = "http://yann.lecun.com/exdb/mnist/"
        # Maps a logical part name to the gzipped idx file holding that part.
        self.__d_leaf_url = {
            "train_data": "train-images-idx3-ubyte.gz",
            "train_label": "train-labels-idx1-ubyte.gz",
            "test_data": "t10k-images-idx3-ubyte.gz",
            "test_label": "t10k-labels-idx1-ubyte.gz"
        }
        l_url = [self.__s_root_url + leaf_url for leaf_url in self.__d_leaf_url.values()]
        if s_download_dir is not None:
            super().__init__(l_url, "mnist", s_download_dir)
        else:
            # Fall back to the Dataset default download directory.
            super().__init__(l_url, "mnist")

    @staticmethod
    def read_gziped_ubyte(fname_img=None, fname_lbl=None):
        """
        loosely copied on https://gist.github.com/akesling/5358964

        Read a gzipped idx image/label file pair into numpy arrays.

        :param fname_img: path to the gzipped idx3 image file
        :param fname_lbl: path to the gzipped idx1 label file
        :return: (img, lbl) tuple where img is a uint8 array of shape
                 (n, rows, cols) and lbl is an int8 array of shape (n,)
        """
        # Label file: 8-byte header (magic number, item count) then one byte
        # per label. Log messages now name the file actually being read
        # (they were swapped in the original).
        logger.info("Read gziped ubyte file {}".format(fname_lbl))
        with gzip.open(fname_lbl, 'rb') as flbl:
            magic, num = struct.unpack(">II", flbl.read(8))
            # np.frombuffer replaces the deprecated np.fromstring.
            lbl = np.frombuffer(flbl.read(), dtype=np.int8)
        # Image file: 16-byte header (magic, count, rows, cols) then
        # rows*cols bytes per image.
        logger.info("Read gziped ubyte file {}".format(fname_img))
        with gzip.open(fname_img, 'rb') as fimg:
            magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
            img = np.frombuffer(fimg.read(), dtype=np.uint8)
            img = img.reshape(len(lbl), rows, cols)
        return img, lbl

    def read(self):
        """
        Return a dict of data where, for each key is associated a (data, label) tuple.
        The values of the tuple are np.ndarray.

        :return: dict with keys "train" and "test"
        """
        d_data = {
            "train": self.read_gziped_ubyte(
                os.path.join(self.s_download_dir, self.__d_leaf_url["train_data"]),
                os.path.join(self.s_download_dir, self.__d_leaf_url["train_label"])),
            "test": self.read_gziped_ubyte(
                os.path.join(self.s_download_dir, self.__d_leaf_url["test_data"]),
                os.path.join(self.s_download_dir, self.__d_leaf_url["test_label"]))
        }
        return d_data
if __name__ == "__main__":
d = MnistDataset()
data = d.load()
# print(data["train"])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment