From 6344c6f020871a63b4fd64928e4c59334bc74239 Mon Sep 17 00:00:00 2001
From: Luc Giffon <luc.giffon@lis-lab.fr>
Date: Mon, 29 Oct 2018 07:27:22 +0100
Subject: [PATCH] solve normalization problem, add min and max property to dataset

---
Review notes (this area is ignored when the patch is applied):
 * Dataset.min / Dataset.max return np.min / np.max of self.train.data and
   are recomputed on every access.
 * The normalization hunk now scales each split with the train-set range
   instead of the split's own range.
 * NOTE(review): that hunk reads self.min / self.max right before
   setattr(self, kw, ...). If this code runs in a loop that also rewrites
   self.train, later splits would see the already-normalized range. The loop
   header is not visible in this hunk - confirm, or hoist the two reads.
 * test_min_max uses assertNotEqual; the assertNotEquals alias is deprecated
   since Python 3.2 and removed in Python 3.12. Hunk sizes are unchanged.
 * The blob id on the TestDataset.py index line predates that rename.
 * Hunk whitespace was reconstructed assuming 4-space indentation; verify
   against the target files before applying.

 skluc/main/data/mldatasets/Dataset.py              | 13 +++++++++++--
 .../test/test_data/test_mldatasets/TestDataset.py  | 14 ++++++++++++++
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/skluc/main/data/mldatasets/Dataset.py b/skluc/main/data/mldatasets/Dataset.py
index d01ae87..f211bb7 100644
--- a/skluc/main/data/mldatasets/Dataset.py
+++ b/skluc/main/data/mldatasets/Dataset.py
@@ -34,6 +34,14 @@ class Dataset(object):
         self.permuted_index_validation = None
         self.validation_size = validation_size
 
+    @property
+    def min(self):
+        return np.min(self.train.data)
+
+    @property
+    def max(self):
+        return np.max(self.train.data)
+
     def reduce_data_size(self, new_size):
         logger.info("Reducing datasize of dataset {} to .".format(self.s_name, new_size))
         kept_indices = self.get_uniform_class_rand_indices_train(new_size)
@@ -217,8 +225,9 @@ class Dataset(object):
             if len(datlab.labels) == 0:
                 continue
             data = datlab.data
-            _min = data.min()
-            _max = data.max()
+            _min = self.min
+            _max = self.max
+            logger.debug(f"Minimum value of train set is {_min}; max is {_max}")
             data = (data - _min) / (_max - _min)
             logger.debug("Apply normalization to {} data of {} dataset.".format(kw, self.s_name))
             setattr(self, kw, LabeledData(data, datlab.labels))
diff --git a/skluc/test/test_data/test_mldatasets/TestDataset.py b/skluc/test/test_data/test_mldatasets/TestDataset.py
index 7873200..a9c9e2e 100644
--- a/skluc/test/test_data/test_mldatasets/TestDataset.py
+++ b/skluc/test/test_data/test_mldatasets/TestDataset.py
@@ -24,6 +24,20 @@ class TestDataset(unittest.TestCase):
     def setUp(self):
         self.dataset_classes = [FooDataset]
 
+    def test_min_max(self):
+        for d_class in self.dataset_classes:
+            d1 = d_class(validation_size=1000, seed=0)
+            d1.load()
+            mini_train = np.min(d1.train.data)
+            maxi_train = np.max(d1.train.data)
+            self.assertEqual(mini_train, d1.min)
+            self.assertEqual(maxi_train, d1.max)
+            mini_test = np.min(d1.test.data)
+            maxi_test = np.max(d1.test.data)
+            self.assertNotEqual(mini_test, d1.min)
+            self.assertNotEqual(maxi_test, d1.max)
+
+
     def test_seed_train_val(self):
         for d_class in self.dataset_classes:
             d1 = d_class(validation_size=1000, seed=0)
-- 
GitLab