Commit 28b804c6 authored by Charly LAMOTHE

Train the forest on train and OMP on dev OR train both the forest and OMP on train+dev

parent 9199d9bb
1 merge request: !3 clean scripts
@@ -5,12 +5,11 @@ import os
 class DatasetParameters(object):

-    def __init__(self, name, test_size, dev_size, random_state, train_on_subset, dataset_normalizer):
+    def __init__(self, name, test_size, dev_size, random_state, dataset_normalizer):
         self._name = name
         self._test_size = test_size
         self._dev_size = dev_size
         self._random_state = random_state
-        self._train_on_subset = train_on_subset
         self._dataset_normalizer = dataset_normalizer

     @property
@@ -29,10 +28,6 @@ class DatasetParameters(object):
     def random_state(self):
         return self._random_state

-    @property
-    def train_on_subset(self):
-        return self._train_on_subset
-
     @property
     def dataset_normalizer(self):
         return self._dataset_normalizer
@@ -5,11 +5,12 @@ import os
 class ModelParameters(object):

-    def __init__(self, forest_size, extracted_forest_size, normalize_D, seed=None):
+    def __init__(self, forest_size, extracted_forest_size, normalize_D, use_dev_subset, seed=None):
         self._forest_size = forest_size
         self._extracted_forest_size = extracted_forest_size
-        self._seed = seed
         self._normalize_D = normalize_D
+        self._use_dev_subset = use_dev_subset
+        self._seed = seed

     @property
     def forest_size(self):
@@ -19,14 +20,18 @@ class ModelParameters(object):
     def extracted_forest_size(self):
         return self._extracted_forest_size

-    @property
-    def seed(self):
-        return self._seed
-
     @property
     def normalize_D(self):
         return self._normalize_D

+    @property
+    def use_dev_subset(self):
+        return self._use_dev_subset
+
+    @property
+    def seed(self):
+        return self._seed
+
     def save(self, directory_path, experiment_id):
         save_obj_to_json(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id),
             self.__dict__)
@@ -36,9 +36,9 @@ class OmpForestRegressor(BaseEstimator):
     def models_parameters(self):
         return self._models_parameters

-    def fit(self, X, y):
-        self._forest = self._train_forest(X, y)
-        self._weights = self._extract_subforest(X, y)
+    def fit(self, X_forest, y_forest, X_omp, y_omp):
+        self._forest = self._train_forest(X_forest, y_forest)
+        self._weights = self._extract_subforest(X_omp, y_omp)
         return self

     def score_regressor(self, X, y):
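
For context, a minimal sketch of what the new four-argument fit makes possible: the forest is fitted on one sample and the OMP weights are computed on another. Everything below (the helper name fit_omp_forest, the use of scikit-learn's RandomForestRegressor and OrthogonalMatchingPursuit, and the construction of the prediction dictionary D) is an assumption for illustration only, not the repository's actual _train_forest/_extract_subforest implementation.

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit

def fit_omp_forest(X_forest, y_forest, X_omp, y_omp, forest_size=100, extracted_forest_size=10):
    # Illustrative sketch only: train the full forest on the forest subset...
    forest = RandomForestRegressor(n_estimators=forest_size).fit(X_forest, y_forest)
    # ...build the dictionary D with one column of per-tree predictions on the OMP subset...
    D = np.column_stack([tree.predict(X_omp) for tree in forest.estimators_])
    # ...and run OMP to select a weighted subforest of the requested size.
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=extracted_forest_size, fit_intercept=False)
    omp.fit(D, y_omp)
    return forest, omp.coef_  # non-zero coefficients mark the extracted trees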
@@ -4,6 +4,7 @@ from . import LOG_PATH
 import time
 import datetime
+import numpy as np

 class Trainer(object):
@@ -15,15 +16,26 @@ class Trainer(object):
     def train(self, model, models_dir):
         self._logger.debug('Training model using train set...')
         begin_time = time.time()
-        train_on_subset = self._dataset.dataset_parameters.train_on_subset
-        if train_on_subset == 'train':
-            X, y = self._dataset.X_train, self._dataset.y_train
-        elif train_on_subset == 'dev':
-            X, y = self._dataset.X_dev, self._dataset.y_dev
-        else:
-            raise ValueError("Unsupported train_on_subset value '{}'".format(train_on_subset))
-        self._logger.debug('Fitting on {} subset'.format(train_on_subset))
-        model.fit(X, y)
+        if model.models_parameters.use_dev_subset:
+            X_forest = self._dataset.X_train
+            y_forest = self._dataset.y_train
+            X_omp = self._dataset.X_dev
+            y_omp = self._dataset.y_dev
+            self._logger.debug('Fitting the forest on train subset and OMP on dev subset.')
+        else:
+            X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
+            X_omp = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
+            y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
+            y_omp = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
+            self._logger.debug('Fitting both the forest and OMP on train+dev subsets.')
+        model.fit(
+            X_forest=X_forest,
+            y_forest=y_forest,
+            X_omp=X_omp,
+            y_omp=y_omp
+        )
         end_time = time.time()

         ModelRawResults(
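
The Trainer relies on the dataset exposing X_train/X_dev (and the matching y_*) subsets, whose sizes come from the dev_size and test_size parameters further down. As a rough, hypothetical illustration of how such a three-way split could be produced (the project's actual loader and normalizer are not shown in this commit and may differ, in particular in how dev_size is interpreted):

from sklearn.model_selection import train_test_split

def split_dataset(X, y, test_size=0.2, dev_size=0.2, random_state=1):
    # Carve out the test subset first...
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    # ...then split the remainder into train and dev; dev_size is taken here as a
    # fraction of the remaining data (an assumption, not necessarily the repo's rule).
    X_train, X_dev, y_train, y_dev = train_test_split(
        X_rest, y_rest, test_size=dev_size, random_state=random_state)
    return X_train, X_dev, X_test, y_train, y_dev, y_test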
@@ -29,7 +29,7 @@ if __name__ == "__main__":
     DEFAULT_DEV_SIZE = 0.2
     DEFAULT_TEST_SIZE = 0.2
     DEFAULT_RANDOM_SEED_NUMBER = 1
-    DEFAULT_TRAIN_ON_SUBSET = 'train'
+    DEFAULT_USE_DEV_SUBSET = False
     DEFAULT_DISABLE_PROGRESS = False

     begin_random_seed_range = 1
@@ -46,7 +46,7 @@ if __name__ == "__main__":
     parser.add_argument('--test_size', nargs='?', type=float, default=DEFAULT_TEST_SIZE, help='Test subset ratio.')
     parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
     parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
-    parser.add_argument('--train_on_subset', nargs='?', type=str, default=DEFAULT_TRAIN_ON_SUBSET, help='Specify on witch subset the model will be trained (either train or dev).')
+    parser.add_argument('--use_dev_subset', action='store_true', default=DEFAULT_USE_DEV_SUBSET, help='If specified, the forest is trained on the train subset and OMP on the dev subset; otherwise both the forest and OMP are trained on the train+dev subsets.')
     parser.add_argument('--disable_progress', action='store_true', default=DEFAULT_DISABLE_PROGRESS, help='Disable the progress bars.')
     args = parser.parse_args()
@@ -83,8 +83,7 @@ if __name__ == "__main__":
                 test_size=args.test_size,
                 dev_size=args.dev_size,
                 random_state=seed,
-                dataset_normalizer=args.dataset_normalizer,
-                train_on_subset=args.train_on_subset
+                dataset_normalizer=args.dataset_normalizer
             )
             dataset_parameters.save(models_dir, experiment_id_str)
@@ -101,8 +100,9 @@ if __name__ == "__main__":
                 model_parameters = ModelParameters(
                     forest_size=args.forest_size,
                     extracted_forest_size=extracted_forest_size,
-                    seed=seed,
-                    normalize_D=args.normalize_D
+                    normalize_D=args.normalize_D,
+                    use_dev_subset=args.use_dev_subset,
+                    seed=seed
                 )
                 model_parameters.save(sub_models_dir, experiment_id)
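
In terms of usage, the new boolean flag replaces the old train_on_subset string and selects between the two modes named in the commit title. The script name below is a placeholder (the file path is collapsed in this view), and only flags visible in the diff are used:

# Forest fitted on the train subset, OMP on the dev subset:
python train.py --use_dev_subset --seeds 1 2 3

# Default: both the forest and OMP fitted on the train+dev subsets:
python train.py --seeds 1 2 3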
  • Luc Giffon @luc.giffon · Owner

    So that's the idea, but in the "concatenate" case there is a full copy of the dataset in RAM. It shouldn't be a problem given the size of our data, but it is not very clean and it is inefficient. It would be better to use a single variable for both, i.e. X_omp = X_forest and y_omp = y_forest. I think.
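
    To make the suggestion concrete, a hypothetical helper along these lines would avoid the second concatenation by letting X_omp/y_omp alias the same arrays as X_forest/y_forest (the actual follow-up fix is commit d89d9d52, whose content is not shown on this page):

    import numpy as np

    def select_fit_data(dataset, use_dev_subset):
        # Hypothetical helper illustrating the reviewer's suggestion;
        # returns (X_forest, y_forest, X_omp, y_omp).
        if use_dev_subset:
            return dataset.X_train, dataset.y_train, dataset.X_dev, dataset.y_dev
        X_both = np.concatenate([dataset.X_train, dataset.X_dev])
        y_both = np.concatenate([dataset.y_train, dataset.y_dev])
        # Reuse the same arrays for both the forest and OMP arguments: no second copy.
        return X_both, y_both, X_both, y_both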

  • Charly Lamothe @charly.lamothe · Maintainer

    Yes, that's it, thanks. I just changed it in d89d9d52.
