Skip to content
Snippets Groups Projects
Commit 27892406 authored by Luc Giffon's avatar Luc Giffon
Browse files

first commit: readme, manifest, license, authors, mldatasets with mnist support only

parents
No related branches found
No related tags found
No related merge requests found
.idea
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
.static_storage/
.media/
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
Credits
=======
scikit-luc has been written by Luc Giffon.
Contributors
------------
Luc Giffon
/*
* ----------------------------------------------------------------------------
* "THE BEER-WARE LICENSE" (Revision 42):
* <luc.giffon[at]lif.univ-mrs.fr> wrote this file. As long as you retain this notice you
* can do whatever you want with this stuff. If we meet some day, and you think
* this stuff is worth it, you can buy me a beer in return Luc Giffon.
* ----------------------------------------------------------------------------
*/
\ No newline at end of file
include *.rst
\ No newline at end of file
Scikit-Luc
==========
This is a Python package containing the useful scripts developed or found by Luc Giffon during his PhD thesis.
Installation from sources
=========================
In the root directory of the package, just do:
python setup.py install
or
pip install -e .
setup.py 0 → 100644
import os
from setuptools import setup, find_packages
import skluc
def read(*paths):
    """Build a file path from *paths* and return the contents.

    :param paths: path components, joined with ``os.path.join``
    :return: the text content of the file
    """
    # Explicit encoding: the default locale encoding varies across
    # platforms and would make long_description decoding unreliable.
    with open(os.path.join(*paths), 'r', encoding='utf-8') as f:
        return f.read()
setup(
    # name of the package
    name='scikit-luc',
    # Without an explicit "packages" argument nothing is actually installed;
    # find_packages picks up the skluc package automatically.
    packages=find_packages(exclude=['doc', 'examples']),
    # See PEP440 for defining a proper version number
    version='1.0',
    # Small description of the package
    description='Science-Kit developed by Luc Giffon for Luc Giffon during his PhD thesis.',
    # Long description, assembled from the distributed .rst files
    long_description=(read('README.rst') + '\n\n' +
                      read('AUTHORS.rst') + '\n\n' +
                      read('LICENSE.rst') + '\n\n'),
    # Project home page:
    url='',
    # license, author and author email
    license='THE BEER-WARE LICENSE',
    author='Luc Giffon',
    author_email='luc.giffon@lif.univ-mrs.fr',
    # If any packages contains data which are not python files, include them
    # package_data={'myapp': 'data/*.gif'},
    install_requires=['daiquiri',
                      'numpy'],
    # classifiers is needed for uploading package on pypi.
    # The list of classifiers elements can be found at :
    # https://pypi.python.org/pypi?%3Aaction=list_classifiers
    classifiers=[
        'Intended Audience :: Science/Research',
        'Intended Audience :: Developers',
        'Natural Language :: English',
        'Topic :: Software Development',
        'Topic :: Scientific/Engineering',
        'License :: THE BEER-WARE LICENSE',
        'Operating System :: Unix',
        'Programming Language :: Python :: 3 :: Only',
    ],
    # NOTE(review): these look like platform names, not topical keywords —
    # they probably belong in the commented "platforms" field below; confirm.
    keywords=['Linux', 'MacOSX', 'Windows'],
    # Platforms on which the package can be installed:
    # platforms=['Linux'],
    # To provide executable scripts, use entry points in preference to the
    # "scripts" keyword. Entry points provide cross-platform support and allow
    # pip to create the appropriate form of executable for the target platform.
    # entry_points={
    #     'console_scripts': [
    #         'myapp=myapp:main',
    #     ],
    # },
)
"""
This package contains some usefull functions for the PhD of Luc Giffon.
"""
__version__ = "1.0"
\ No newline at end of file
"""
This module defines the Dataset classes usefull for downloading and loading datasets as numpy.ndarrays.
The currently implemented datasets are:
- mnist
"""
import urllib.request
import struct
import os, errno
import gzip
import logging
# --- installed packages
import daiquiri
import numpy as np
__all__ = ['MnistDataset']
daiquiri.setup(level=logging.DEBUG)
logger = daiquiri.getLogger()
def silentremove(filename):
    """
    Remove *filename* without raising an error if the file doesn't exist.

    :param filename: The filename
    :type filename: str
    :return: None
    """
    try:
        os.remove(filename)
    except OSError as err:
        # ENOENT ("no such file or directory") is precisely the case we
        # want to silence; anything else is a genuine failure.
        if err.errno != errno.ENOENT:
            raise
        logger.debug("{} directory doesn't exist".format(filename))
    else:
        logger.info("{} has been removed".format(filename))
def create_directory(_dir):
    """
    Create the directory *_dir* if it does not exist yet.

    :param _dir: the path to the directory to be created
    :return: None
    """
    try:
        os.makedirs(_dir)
    except OSError as err:
        # EEXIST is the expected outcome when the directory is already
        # there; any other errno is a real problem and must propagate.
        if err.errno != errno.EEXIST:
            raise
        logger.debug("{} directory already exists".format(_dir))
    else:
        logger.info("{} directory has been created".format(_dir))
def retrieve_data(url, directory, name):
    """
    Download the file at the given url into ``directory/name``.

    :param url: the end-point url of the needed file
    :type url: str
    :param directory: the target directory where to download the file
    :type directory: str
    :param name: the name of the target file downloaded
    :type name: str
    :return: None
    """
    target_path = os.path.join(directory, name)
    urllib.request.urlretrieve(url, target_path)
    logger.info("{} has been downloaded.".format(url))
class Dataset:
    """
    Abstract class implementing basic methods for Dataset retrieval.

    Subclasses must override :func:`read` to parse the downloaded files.
    """
    def __init__(self, l_url, s_name, s_download_dir=os.path.join(os.path.expanduser("~"), "ml_datasets")):
        """
        :param l_url: urls of the files composing the dataset
        :type l_url: list
        :param s_name: name of the dataset, used as the sub-directory name
        :type s_name: str
        :param s_download_dir: root directory under which datasets are stored
        :type s_download_dir: str
        """
        self.l_url = l_url
        # The local name of each remote file is the last component of its url.
        self.l_filenames = [url.split("/")[-1] for url in self.l_url]
        self.s_name = s_name
        self.s_download_dir = os.path.join(s_download_dir, self.s_name)
        self.l_filepaths = [os.path.join(self.s_download_dir, fname) for fname in self.l_filenames]

    def download(self):
        """
        Download every file of the dataset into the target directory.

        :return: None
        """
        # the download method shouldn't be called if the target directory tree doesn't exist.
        assert self.check_directory_tree()
        # Reuse the module-level helper so every download is logged consistently,
        # and pair each url with its precomputed filename instead of re-splitting.
        for s_url, s_file_name in zip(self.l_url, self.l_filenames):
            retrieve_data(s_url, self.s_download_dir, s_file_name)

    def check_directory_tree(self):
        """
        Tell whether the target directory tree exists.

        :return: bool
        """
        return os.path.exists(self.s_download_dir)

    def create_directory_tree(self):
        """
        Create the target directory tree.

        :return: None
        """
        create_directory(self.s_download_dir)

    def check_files(self):
        """
        Tell whether every expected dataset file is present on disk.

        :return: bool
        """
        return all(os.path.exists(fpath) for fpath in self.l_filepaths)

    def load(self):
        """
        Ensure the dataset files are available locally (downloading them when
        missing) and parse them with :func:`read`.

        :return: whatever :func:`read` returns
        """
        self.create_directory_tree()
        if not self.check_files():
            logger.info("Files need to be downloaded")
            # Remove possibly-partial leftovers before re-downloading.
            for s_fpath in self.l_filepaths:
                silentremove(s_fpath)
            self.download()
        else:
            logger.debug("Files already exists")
        return self.read()

    # --- Abstract methods
    def read(self):
        """Parse the downloaded files; must be overridden by subclasses."""
        pass
class MnistDataset(Dataset):
    """
    Dataset implementation for the MNIST handwritten-digit database.
    """
    def __init__(self, s_download_dir=None):
        self.__s_root_url = "http://yann.lecun.com/exdb/mnist/"
        # Maps a logical part name to the gzipped idx file holding that part.
        self.__d_leaf_url = {
            "train_data": "train-images-idx3-ubyte.gz",
            "train_label": "train-labels-idx1-ubyte.gz",
            "test_data": "t10k-images-idx3-ubyte.gz",
            "test_label": "t10k-labels-idx1-ubyte.gz"
        }
        l_url = [self.__s_root_url + leaf_url for leaf_url in self.__d_leaf_url.values()]
        if s_download_dir is not None:
            super().__init__(l_url, "mnist", s_download_dir)
        else:
            # Fall back to the Dataset default download directory.
            super().__init__(l_url, "mnist")

    @staticmethod
    def read_gziped_ubyte(fname_img=None, fname_lbl=None):
        """
        loosely copied on https://gist.github.com/akesling/5358964

        Read a gzipped idx image/label file pair into numpy arrays.

        :param fname_img: path to the gzipped idx3 image file
        :param fname_lbl: path to the gzipped idx1 label file
        :return: (img, lbl) tuple where img is a uint8 array of shape
                 (n, rows, cols) and lbl is an int8 array of shape (n,)
        """
        # Label file: 8-byte header (magic number, item count) then one byte
        # per label. Log messages now name the file actually being read
        # (they were swapped in the original).
        logger.info("Read gziped ubyte file {}".format(fname_lbl))
        with gzip.open(fname_lbl, 'rb') as flbl:
            magic, num = struct.unpack(">II", flbl.read(8))
            # np.frombuffer replaces the deprecated np.fromstring.
            lbl = np.frombuffer(flbl.read(), dtype=np.int8)
        # Image file: 16-byte header (magic, count, rows, cols) then
        # rows*cols bytes per image.
        logger.info("Read gziped ubyte file {}".format(fname_img))
        with gzip.open(fname_img, 'rb') as fimg:
            magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
            img = np.frombuffer(fimg.read(), dtype=np.uint8)
            img = img.reshape(len(lbl), rows, cols)
        return img, lbl

    def read(self):
        """
        Return a dict of data where, for each key is associated a (data, label) tuple.
        The values of the tuple are np.ndarray.

        :return: dict with keys "train" and "test"
        """
        d_data = {
            "train": self.read_gziped_ubyte(
                os.path.join(self.s_download_dir, self.__d_leaf_url["train_data"]),
                os.path.join(self.s_download_dir, self.__d_leaf_url["train_label"])),
            "test": self.read_gziped_ubyte(
                os.path.join(self.s_download_dir, self.__d_leaf_url["test_data"]),
                os.path.join(self.s_download_dir, self.__d_leaf_url["test_label"]))
        }
        return d_data
if __name__ == "__main__":
d = MnistDataset()
data = d.load()
# print(data["train"])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment