diff --git a/skluc/main/data/mldatasets/Dataset.py b/skluc/main/data/mldatasets/Dataset.py
index d01ae873b5471ee7a0b58b10c0f5f2b857f97c1b..f211bb74a4e01b2122e017d0470a92a0d3fc0dd5 100644
--- a/skluc/main/data/mldatasets/Dataset.py
+++ b/skluc/main/data/mldatasets/Dataset.py
@@ -34,6 +34,16 @@ class Dataset(object):
         self.permuted_index_validation = None
         self.validation_size = validation_size
 
+    @property
+    def min(self):
+        """Return the smallest value found in the train data."""
+        return np.min(self.train.data)
+
+    @property
+    def max(self):
+        """Return the largest value found in the train data."""
+        return np.max(self.train.data)
+
     def reduce_data_size(self, new_size):
         logger.info("Reducing datasize of dataset {} to .".format(self.s_name, new_size))
         kept_indices = self.get_uniform_class_rand_indices_train(new_size)
@@ -217,8 +227,9 @@ class Dataset(object):
             if len(datlab.labels) == 0:
                 continue
             data = datlab.data
-            _min = data.min()
-            _max = data.max()
+            _min = self.min
+            _max = self.max
+            logger.debug(f"Minimum value of train set is {_min}; max is {_max}")
             data = (data - _min) / (_max - _min)
             logger.debug("Apply normalization to {} data of {} dataset.".format(kw, self.s_name))
             setattr(self, kw, LabeledData(data, datlab.labels))
diff --git a/skluc/test/test_data/test_mldatasets/TestDataset.py b/skluc/test/test_data/test_mldatasets/TestDataset.py
index 7873200df8ecee4dad6066c0362bb82437844560..a9c9e2e4a8e0a74dc318da02d9587ea0714f345d 100644
--- a/skluc/test/test_data/test_mldatasets/TestDataset.py
+++ b/skluc/test/test_data/test_mldatasets/TestDataset.py
@@ -24,6 +24,20 @@ class TestDataset(unittest.TestCase):
     def setUp(self):
         self.dataset_classes = [FooDataset]
 
+    def test_min_max(self):
+        """Check that min and max are taken from the train data, not the test data."""
+        for d_class in self.dataset_classes:
+            d1 = d_class(validation_size=1000, seed=0)
+            d1.load()
+            mini_train = np.min(d1.train.data)
+            maxi_train = np.max(d1.train.data)
+            self.assertEqual(mini_train, d1.min)
+            self.assertEqual(maxi_train, d1.max)
+            mini_test = np.min(d1.test.data)
+            maxi_test = np.max(d1.test.data)
+            self.assertNotEqual(mini_test, d1.min)
+            self.assertNotEqual(maxi_test, d1.max)
+
     def test_seed_train_val(self):
         for d_class in self.dataset_classes:
             d1 = d_class(validation_size=1000, seed=0)