Commit b5ddb30b authored by Luc Giffon

Improve data section:

- selecting a stratified subsample is tested and working
- more documentation in dataset.py
- split the dataset tests into a subdirectory
parent fb69843a
import os
import numpy as np
-from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelBinarizer
from skluc.main.utils import LabeledData
@@ -23,6 +22,7 @@ class Dataset(object):
        for url in self.l_url:
            splitted_url = url.split("/")
            self.l_filenames.append(splitted_url[-1])
        self.s_name = s_name
        self.s_download_dir = os.path.join(s_download_dir, self.s_name)
        self.l_filepaths = [os.path.join(self.s_download_dir, fname) for fname in self.l_filenames]
@@ -39,25 +39,82 @@ class Dataset(object):
        kept_indices = self.get_uniform_class_rand_indices_train(new_size)
        self.permuted_index_train = self.permuted_index_train[kept_indices]
-    def get_uniform_class_rand_indices_train(self, size):
-        try:
-            sss = StratifiedShuffleSplit(n_splits=1, train_size=size, random_state=self.seed)
-            kept_indices, _ = sss.split(np.arange(len(self.train.data)), self.train.labels)
-        except ValueError as e:
-            logger.warning("In Dataset.get_uniform_class_rand_indices_train Handled exception: {}".format(str(e)))
-            logger.debug("Use random indexes instead")
-            kept_indices = np.random.permutation(len(self.train.data))[:size]
-        return kept_indices
-
-    def get_uniform_class_rand_indices_validation(self, size):
-        try:
-            sss = StratifiedShuffleSplit(n_splits=1, train_size=size, random_state=self.seed)
-            kept_indices, _ = sss.split(np.arange(len(self.train.data)), self.train.labels)
-        except ValueError as e:
-            logger.warning("In Dataset.get_uniform_class_rand_indices_validation Handled exception: {}".format(str(e)))
-            logger.debug("Use random indexes instead")
-            kept_indices = np.random.permutation(len(self.validation.data))[:size]
-        return kept_indices
+    def get_bool_idx_label(self, find_label, labels):
+        """
+        Return the boolean np.array that is True where find_label matches the entry in labels.
+
+        :param find_label: the label value to look for
+        :param labels: the array of labels to search in
+        :return: boolean index array over labels
+        """
+        if len(labels.shape) == 1:
+            bool_idx_labs = labels == find_label
+        elif len(labels.shape) == 2:
+            bool_idx_labs = np.all(labels == find_label, axis=-1)
+        else:
+            raise ValueError("get_bool_idx_label is only intended for scalar- and vector-valued labels")
+        return bool_idx_labs
+
+    def get_uniform_class_rand_indices(self, labels, size, shuffle=True):
+        """size is the final size, not the size per class"""
+        logger.debug("Start finding subset indices of size {} with uniform distribution of labels".format(size))
+        unique_labels = np.unique(labels, axis=0)
+        nb_labels = len(unique_labels)
+        nbr_by_label = size // nb_labels
+        logger.debug("Need {} (+/- 1) examples by label".format(nbr_by_label))
+        return_idx_labels = []
+        copy_idx = 0
+        for u_lab in unique_labels:
+            bool_idx_labs = self.get_bool_idx_label(u_lab, labels)
+            get_nbr = nbr_by_label
+            if len(np.where(bool_idx_labs)[0]) < get_nbr:
+                raise IndexError("Found {} examples with label {} when {} were asked for".format(
+                    len(np.where(bool_idx_labs)[0]), u_lab, get_nbr))
+            idx_labs = np.where(bool_idx_labs)[0][:get_nbr]
+            logger.debug("Found indexes for label {}: {}; length: {}".format(u_lab, idx_labs, len(idx_labs)))
+            return_idx_labels.extend(idx_labs)
+            copy_idx += get_nbr
+        remaining = size - copy_idx
+        if remaining > 0:
+            logger.debug("After finding an equal number ({}) of examples by label (total = {}), "
+                         "need to find more examples to reach size {}".format(nbr_by_label,
+                                                                              nbr_by_label * nb_labels, size))
+            get_nbr = 1
+            while remaining > 0:
+                # pick one extra example from a random class, avoiding indices already taken
+                u_lab_idx = np.random.choice(len(unique_labels), 1)[0]
+                u_lab = unique_labels[u_lab_idx]
+                bool_idx_labs = self.get_bool_idx_label(u_lab, labels)
+                all_idx_labs = np.where(bool_idx_labs)[0]
+                idx_labs_not_already_gotten = np.setdiff1d(all_idx_labs, return_idx_labels)
+                if len(idx_labs_not_already_gotten) < get_nbr:
+                    raise IndexError("Found {} examples with label {} when {} more were asked for".format(
+                        len(np.where(bool_idx_labs)[0]), u_lab, get_nbr))
+                idx_labs = idx_labs_not_already_gotten[:get_nbr]
+                return_idx_labels.extend(idx_labs)
+                remaining -= 1
+        if shuffle:
+            np.random.seed(self.seed)
+            np.random.shuffle(return_idx_labels)
+        return np.array(return_idx_labels)
+
+    def get_uniform_class_rand_indices_train(self, size, shuffle=True):
+        return self.get_uniform_class_rand_indices(labels=self.train.labels, size=size, shuffle=shuffle)
+
+    def get_uniform_class_rand_indices_validation(self, size, shuffle=True):
+        return self.get_uniform_class_rand_indices(labels=self.validation.labels, size=size, shuffle=shuffle)
    @property
    def train(self):
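These new helpers replace the StratifiedShuffleSplit-based selection above with a deterministic, label-balanced choice of indices. A minimal usage sketch, assuming a loaded Dataset instance d with 10 integer classes (d and the class count are illustrative; only the method names come from the diff):

    import numpy as np

    idx = d.get_uniform_class_rand_indices_train(size=102)  # 102 indices, 10 or 11 per class
    sub_data = d.train.data[idx]
    sub_labels = d.train.labels[idx]
    counts = np.bincount(sub_labels)  # every class appears size // 10 or size // 10 + 1 times
    assert all(c in (10, 11) for c in counts)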
@@ -80,7 +137,7 @@ class Dataset(object):

        :return: None
        """
-        self.create_directory_tree()
+        create_directory(self.s_download_dir)
        if not check_files(self.l_filepaths):
            logger.debug("Files need to be downloaded")
            for s_fname in self.l_filepaths:
@@ -92,14 +149,6 @@ class Dataset(object):
        else:
            logger.debug("Files {} already exist".format(self.l_filepaths))
-    def create_directory_tree(self):
-        """
-        Create the target directory tree
-
-        :return: None
-        """
-        create_directory(self.s_download_dir)
    def _check_validation_size(self, data_length):
        if self.validation_size > data_length:
            raise ValueError("The validation set size ({}) is higher than the train set size ({}). " \
@@ -108,7 +157,10 @@ class Dataset(object):

    def to_one_hot(self):
        """
-        Convert categorical labels to one hot encoding
+        Convert categorical labels to one hot encoding.
+
+        Note: Beware, the information on how the labels are encoded is not stored
+        in the data file.

        :return:
        """
@@ -126,6 +178,14 @@ class Dataset(object):
            setattr(self, kw, LabeledData(data, labels))

    def revert_one_hot(self):
+        """
+        Convert one hot encoded labels back to categorical labels.
+
+        Note: Beware, the information on how the labels are encoded is not stored
+        in the data file.
+
+        :return:
+        """
        logger.info("Revert one hot encoding to dataset {}.".format(self.s_name))
        for kw in self.data_groups_private:
            datlab = getattr(self, kw)
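As a reminder of what this one-hot round trip looks like, here is a minimal sketch using the LabelBinarizer imported at the top of dataset.py; it is illustrative, not the actual method bodies, which are elided in this hunk:

    import numpy as np
    from sklearn.preprocessing import LabelBinarizer

    labels = np.array([0, 2, 1, 2])
    lb = LabelBinarizer()
    one_hot = lb.fit_transform(labels)         # shape (4, 3), one column per class
    recovered = lb.inverse_transform(one_hot)  # array([0, 2, 1, 2])
    assert (recovered == labels).all()         # the mapping lives in lb, not in the data file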
@@ -143,6 +203,10 @@ class Dataset(object):
        Feature scaling normalization.
+
+        Note: Beware, the information on whether or not the data has been normalized
+        is not stored in the data file. Calling normalize on already normalized data
+        has no effect, though.

        :return:
        """
        logger.info("Apply normalization to data from dataset {}.".format(self.s_name))
@@ -158,6 +222,14 @@ class Dataset(object):
            setattr(self, kw, LabeledData(data, datlab.labels))

    def data_astype(self, _type):
+        """
+        Change the data type to _type.
+
+        Note: Beware, the information on the type of the data is not stored in the data file.
+
+        :param _type:
+        :return:
+        """
        logger.info("Change type of data to {} in the dataset {}.".format(str(_type), self.s_name))
        for kw in self.data_groups_private:
            datlab = getattr(self, kw)
@@ -171,6 +243,14 @@ class Dataset(object):
            setattr(self, kw, LabeledData(data, datlab.labels))

    def labels_astype(self, _type):
+        """
+        Change the labels type to _type.
+
+        Note: Beware, the information on the type of the labels is not stored in the data file.
+
+        :param _type:
+        :return:
+        """
        logger.info("Change type of labels to {} in the dataset {}.".format(str(_type), self.s_name))
        for kw in self.data_groups_private:
            datlab = getattr(self, kw)
@@ -208,6 +288,13 @@ class Dataset(object):
            raise Exception("No data loaded at the end of load method.")

    def save_npz(self, npzdir_path=None):
+        """
+        Save data and labels in their current state to the npz directory path.
+
+        :param npzdir_path:
+        :return:
+        """
+        # todo: find a solution for storing the current state?
        if npzdir_path is None:
            npzdir_path = os.path.join(self.s_download_dir, "npzfiles")
        for kw in self.data_groups_private:
@@ -219,6 +306,12 @@ class Dataset(object):
            np.savez(filepath, **dict_attr)

    def load_npz(self, npzdir_path=None):
+        """
+        Load data and labels from the npz directory path.
+
+        :param npzdir_path:
+        :return:
+        """
        if npzdir_path is None:
            npzdir_path = os.path.join(self.s_download_dir, "npzfiles")
        for kw in self.data_groups_private:
...
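A minimal sketch of the save/load round trip behind save_npz and load_npz, assuming one .npz file per data group holding data and labels arrays (the key names and path are assumptions; the relevant code is elided above):

    import numpy as np

    data = np.random.rand(5, 3)
    labels = np.arange(5)
    np.savez("/tmp/train.npz", data=data, labels=labels)  # hypothetical path

    loaded = np.load("/tmp/train.npz")
    assert (loaded["data"] == data).all() and (loaded["labels"] == labels).all()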
-import tensorflow as tf
import numpy as np
+import tensorflow as tf

from skluc.main.data.transformation.Transformer import Transformer
from skluc.main.utils import logger, Singleton


class ResizeTransformer(Transformer, metaclass=Singleton):
+    # todo: write a fit method?
    def __init__(self, data_name, output_shape):
        if len(output_shape) != 2:
            raise AssertionError("Output shape should be 2D and it is {}D: {}".format(len(output_shape), output_shape))
...
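The transform itself is elided; as a rough sketch, the resize such a transformer wraps could be expressed with TensorFlow's image resize op (tf.image.resize, TF 2.x API; this is an assumption, not the class's actual body):

    import numpy as np
    import tensorflow as tf

    images = np.random.rand(4, 32, 32, 3).astype(np.float32)  # a batch of 32x32 RGB images
    output_shape = (28, 28)                                    # must be 2D, as the assertion above enforces
    resized = tf.image.resize(images, output_shape)            # -> shape (4, 28, 28, 3)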
import os
import unittest

from skluc.main.data.mldatasets import Cifar10Dataset
from skluc.main.utils import silentremove


class TestCifar10Dataset(unittest.TestCase):
    def setUp(self):
        self.cifar10_names = [
            'batches.meta',
            'data_batch_1',
            'data_batch_2',
            'data_batch_3',
            'data_batch_4',
            'data_batch_5',
            'readme.html',
            'test_batch'
        ]
        self.download_dir = "/tmp"
        self.cifar10_dirname = os.path.join("cifar10", "cifar-10-batches-py")
        self.datadir_name = os.path.join(self.download_dir, self.cifar10_dirname)
        self.full_cifar10_names = [*map(lambda name: os.path.join(self.datadir_name, name), self.cifar10_names)]

    def test_cifar10(self):
        cifar10 = Cifar10Dataset(s_download_dir=self.download_dir)
        cifar10_data = cifar10.load()
        for name in self.full_cifar10_names:
            self.assertTrue(os.path.exists(name))
        self.assertTrue(all(kw in cifar10_data.keys() for kw in ["train", "test"]))

    def tearDown(self):
        for name in self.full_cifar10_names:
            silentremove(name)


if __name__ == "__main__":
    unittest.main()
-import os
import unittest

+import numpy as np

-from skluc.main.data.mldatasets.Cifar10Dataset import Cifar10Dataset
-from skluc.main.data.mldatasets.MnistDataset import MnistDataset
-from skluc.main.data.mldatasets.MovieReviewDataset import MovieReviewV1Dataset
-from skluc.main.data.mldatasets.SVHNDataset import SVHNDataset
-from skluc.main.utils import silentremove
+from skluc.main.data.mldatasets.Dataset import Dataset
+from skluc.main.utils import LabeledData
-class TestMnistDataset(unittest.TestCase):
-    def setUp(self):
-        self.mnist_names = [
-            "train-images-idx3-ubyte.gz",
-            "train-labels-idx1-ubyte.gz",
-            "t10k-images-idx3-ubyte.gz",
-            "t10k-labels-idx1-ubyte.gz"
-        ]
-        self.download_dir = "/tmp"
-        self.mnist_dirname = "mnist"
-        self.datadir_name = os.path.join(self.download_dir, self.mnist_dirname)
-        self.full_mnist_names = [*map(lambda name: os.path.join(self.datadir_name, name), self.mnist_names)]
-
-    def test_mnist(self):
-        mnist = MnistDataset(s_download_dir=self.download_dir)
-        mnist_data = mnist.load()
-        for name in self.full_mnist_names:
-            self.assertTrue(os.path.exists(name))
-        self.assertTrue(kw in mnist_data.keys() for kw in ["train", "test"])
-        mnist_image = mnist.to_image()
-        self.assertTrue(mnist_image["train"][0][0].shape == (28, 28, 1))
-
-    def tearDown(self):
-        for name in self.full_mnist_names:
-            silentremove(name)
-
-class TestCifar10Dataset(unittest.TestCase):
-    def setUp(self):
-        self.mnist_names = [
-            'batches.meta',
-            'data_batch_1',
-            'data_batch_2',
-            'data_batch_3',
-            'data_batch_4',
-            'data_batch_5',
-            'readme.html',
-            'test_batch'
-        ]
-        self.download_dir = "/tmp"
-        self.cifar10_dirname = os.path.join("cifar10", "cifar-10-batches-py")
-        self.datadir_name = os.path.join(self.download_dir, self.cifar10_dirname)
-        self.full_cifar10_names = [*map(lambda name: os.path.join(self.datadir_name, name), self.mnist_names)]
-
-    def test_cifar10(self):
-        cifar10 = Cifar10Dataset(s_download_dir=self.download_dir)
-        cifar10_data = cifar10.load()
-        for name in self.full_cifar10_names:
-            self.assertTrue(os.path.exists(name))
-        self.assertTrue(kw in cifar10_data.keys() for kw in ["train", "test"])
-
-    def tearDown(self):
-        for name in self.full_cifar10_names:
-            silentremove(name)
+class FooDataset(Dataset):
+    """Synthetic dataset used to test the Dataset base class without downloading anything."""
+
+    def __init__(self, validation_size=1000, seed=0):
+        super().__init__([], "foo", validation_size=validation_size, seed=seed)
+
+    def read(self):
+        train_size = 9000
+        test_size = 1000
+        np.random.seed(self.seed)
+        self._train = LabeledData(data=np.random.rand(train_size, 200),
+                                  labels=np.random.choice(10, size=train_size, replace=True))
+        self._test = LabeledData(data=np.random.rand(test_size, 200),
+                                 labels=np.random.choice(10, size=test_size, replace=True))
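FooDataset makes the base-class tests below fast and deterministic: read() draws random data instead of downloading. A quick sketch of how it plugs into the new API (assuming Dataset.load() calls read() and then performs the train/validation split, which is what the tests rely on):

    d = FooDataset(validation_size=1000, seed=0)
    d.load()                                           # no download: read() generates the data
    idx = d.get_uniform_class_rand_indices_train(100)  # 10 indices for each of the 10 classes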
class TestDataset(unittest.TestCase):
    def setUp(self):
-        self.dataset_classes = [MnistDataset,
-                                Cifar10Dataset,
-                                SVHNDataset,
-                                MovieReviewV1Dataset]
+        self.dataset_classes = [FooDataset]

    def test_seed_train_val(self):
        for d_class in self.dataset_classes:
@@ -79,14 +32,15 @@ class TestDataset(unittest.TestCase):
            d1.load()
            d2.load()
            d3.load()
-            self.assertTrue((d1.train.data[:10] == d2.train.data[:10]).all(),
+            self.assertTrue((d1.train.data == d2.train.data).all(),
                            msg="Same seed gives different train/val split in {} dataset".format(d_class.__name__))
-            self.assertTrue((d1.train.data[:10] != d3.train.data[:10]).any(),
+            self.assertTrue((d1.train.data != d3.train.data).any(),
                            msg="Different seeds give same train/val split in {} dataset".format(d_class.__name__))
    def test_seed_uniform_rand_indices_train(self):
-        size = 10
-        size2 = 12
+        size = 100
+        size_bis = 102
+        size2 = 120
        for d_class in self.dataset_classes:
            # test same subsample size
            # ------------------------
@@ -97,11 +51,41 @@ class TestDataset(unittest.TestCase):
            d2.load()
            d3.load()
            sub_indexes1 = d1.get_uniform_class_rand_indices_train(size)
+            sub_indexes1_not_modulo = d1.get_uniform_class_rand_indices_train(size_bis)
            sub_indexes2 = d2.get_uniform_class_rand_indices_train(size)
-            sub_indexes3 = d2.get_uniform_class_rand_indices_train(size)
+            sub_indexes3 = d3.get_uniform_class_rand_indices_train(size)
+            # the returned number of indexes is equal to the number asked for
+            self.assertEqual(len(sub_indexes1), size)
+            self.assertEqual(len(sub_indexes1_not_modulo), size_bis)
+            self.assertEqual(len(sub_indexes2), size)
+            self.assertEqual(len(sub_indexes3), size)
            subsample1 = d1.train.data[sub_indexes1]
            subsample2 = d2.train.data[sub_indexes2]
            subsample3 = d3.train.data[sub_indexes3]
+            # check that the labels are uniformly distributed in the subsample
+            unique_labels = np.unique(d1.train.labels, axis=0)
+            labels = d1.train.labels[sub_indexes1]
+            labels_bis = d1.train.labels[sub_indexes1_not_modulo]
+            for u_lab in unique_labels:
+                if len(labels.shape) == 1:
+                    bool_idx_labs = labels == u_lab
+                    bool_idx_labs_bis = labels_bis == u_lab
+                elif len(labels.shape) == 2:
+                    bool_idx_labs = np.all(labels == u_lab, axis=-1)
+                    bool_idx_labs_bis = np.all(labels_bis == u_lab, axis=-1)
+                else:
+                    raise ValueError("This test only handles scalar- and vector-valued labels")
+                idx_labs = np.where(bool_idx_labs)[0]
+                idx_labs_bis = np.where(bool_idx_labs_bis)[0]
+                # each label must appear size // nb_labels or size // nb_labels + 1 times
+                self.assertTrue(len(idx_labs) == size // len(unique_labels)
+                                or len(idx_labs) == size // len(unique_labels) + 1)
+                self.assertTrue(len(idx_labs_bis) == size_bis // len(unique_labels)
+                                or len(idx_labs_bis) == size_bis // len(unique_labels) + 1)
            self.assertTrue((subsample1 == subsample2).all(),
                            msg="Same seed gives different subsamples in {} dataset".format(d_class.__name__))
            self.assertTrue((subsample1 != subsample3).any(),
@@ -109,14 +93,17 @@ class TestDataset(unittest.TestCase):

            # test different subsample size
            # -----------------------------
-            d1 = d_class(validation_size=1000, seed=0)
-            d2 = d_class(validation_size=1000, seed=0)
+            d1 = d_class(validation_size=1000, seed=1)
+            d2 = d_class(validation_size=1000, seed=1)
            d1.load()
            d2.load()
            sub_indexes1 = d1.get_uniform_class_rand_indices_train(size)
            sub_indexes2 = d2.get_uniform_class_rand_indices_train(size2)
+            self.assertEqual(len(sub_indexes1), size)
+            self.assertEqual(len(sub_indexes2), size2)
            subsample1 = d1.train.data[sub_indexes1]
            subsample2 = d2.train.data[sub_indexes2]
            for subs1 in subsample1:
                found = False
                for subs2 in subsample2:
@@ -175,3 +162,7 @@ class TestDataset(unittest.TestCase):
                self.assertTrue(found,
                                msg="Little subsample is not a subset of big subsample using same seed in {} dataset".format(
                                    d_class.__name__))
+
+
+if __name__ == "__main__":
+    unittest.main()