From 6344c6f020871a63b4fd64928e4c59334bc74239 Mon Sep 17 00:00:00 2001
From: Luc Giffon <luc.giffon@lis-lab.fr>
Date: Mon, 29 Oct 2018 07:27:22 +0100
Subject: [PATCH] solve normalization problem, add min and max properties to
 dataset

---
 skluc/main/data/mldatasets/Dataset.py              | 13 +++++++++++--
 .../test/test_data/test_mldatasets/TestDataset.py  | 14 ++++++++++++++
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/skluc/main/data/mldatasets/Dataset.py b/skluc/main/data/mldatasets/Dataset.py
index d01ae87..f211bb7 100644
--- a/skluc/main/data/mldatasets/Dataset.py
+++ b/skluc/main/data/mldatasets/Dataset.py
@@ -34,6 +34,14 @@ class Dataset(object):
         self.permuted_index_validation = None
         self.validation_size = validation_size
 
+    @property
+    def min(self):  # smallest value in the train split's data
+        return np.min(self.train.data)  # no axis given: reduces over the whole array
+
+    @property
+    def max(self):  # largest value in the train split's data
+        return np.max(self.train.data)  # no axis given: reduces over the whole array
+
     def reduce_data_size(self, new_size):
         logger.info("Reducing datasize of dataset {} to .".format(self.s_name, new_size))
         kept_indices = self.get_uniform_class_rand_indices_train(new_size)
@@ -217,8 +225,9 @@ class Dataset(object):
             if len(datlab.labels) == 0:
                 continue
             data = datlab.data
-            _min = data.min()
-            _max = data.max()
+            _min = self.min
+            _max = self.max
+            logger.debug(f"Minimum value of train set is {_min}; max is {_max}")
             data = (data - _min) / (_max - _min)
             logger.debug("Apply normalization to {} data of {} dataset.".format(kw, self.s_name))
             setattr(self, kw, LabeledData(data, datlab.labels))
diff --git a/skluc/test/test_data/test_mldatasets/TestDataset.py b/skluc/test/test_data/test_mldatasets/TestDataset.py
index 7873200..a9c9e2e 100644
--- a/skluc/test/test_data/test_mldatasets/TestDataset.py
+++ b/skluc/test/test_data/test_mldatasets/TestDataset.py
@@ -24,6 +24,20 @@ class TestDataset(unittest.TestCase):
     def setUp(self):
         self.dataset_classes = [FooDataset]
 
+    def test_min_max(self):
+        """Dataset.min/max must equal the train split's extrema, not the test split's."""
+        for d_class in self.dataset_classes:
+            d1 = d_class(validation_size=1000, seed=0)
+            d1.load()
+            mini_train = np.min(d1.train.data)
+            maxi_train = np.max(d1.train.data)
+            self.assertEqual(mini_train, d1.min)
+            self.assertEqual(maxi_train, d1.max)
+            mini_test = np.min(d1.test.data)
+            maxi_test = np.max(d1.test.data)
+            self.assertNotEqual(mini_test, d1.min)
+            self.assertNotEqual(maxi_test, d1.max)
+
     def test_seed_train_val(self):
         for d_class in self.dataset_classes:
             d1 = d_class(validation_size=1000, seed=0)
-- 
GitLab