Commit aae78b61 authored by charly.lamothe

Initial commit of clean scripts branch

parent 69dd8959
Merge request !3: clean scripts
bolsonaro/data/dataset.py 0 → 100644
class Dataset(object):

    def __init__(self, task, dataset_parameters, X_train, X_dev, X_test, y_train,
                 y_dev, y_test):
        self._task = task
        self._dataset_parameters = dataset_parameters
        self._X_train = X_train
        self._X_dev = X_dev
        self._X_test = X_test
        self._y_train = y_train
        self._y_dev = y_dev
        self._y_test = y_test

    @property
    def task(self):
        return self._task

    @property
    def dataset_parameters(self):
        return self._dataset_parameters

    @property
    def X_train(self):
        return self._X_train

    @property
    def X_dev(self):
        return self._X_dev

    @property
    def X_test(self):
        return self._X_test

    @property
    def y_train(self):
        return self._y_train

    @property
    def y_dev(self):
        return self._y_dev

    @property
    def y_test(self):
        return self._y_test
bolsonaro/data/dataset_loader.py 0 → 100644
from bolsonaro.data.dataset import Dataset
from bolsonaro.data.task import Task

from sklearn.datasets import load_boston, load_iris, load_diabetes, load_digits, \
    load_linnerud, load_wine, load_breast_cancer
from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
    fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \
    fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
from sklearn.model_selection import train_test_split


class DatasetLoader(object):

    @staticmethod
    def load_from_name(dataset_parameters):
        name = dataset_parameters.name
        if name == 'boston':
            dataset_loading_func = load_boston
            task = Task.REGRESSION
        elif name == 'iris':
            dataset_loading_func = load_iris
            task = Task.CLASSIFICATION
        elif name == 'diabetes':
            dataset_loading_func = load_diabetes
            task = Task.REGRESSION
        elif name == 'digits':
            dataset_loading_func = load_digits
            task = Task.CLASSIFICATION
        elif name == 'linnerud':
            dataset_loading_func = load_linnerud
            task = Task.REGRESSION
        elif name == 'wine':
            dataset_loading_func = load_wine
            task = Task.CLASSIFICATION
        elif name == 'breast_cancer':
            dataset_loading_func = load_breast_cancer
            task = Task.CLASSIFICATION
        elif name == 'olivetti_faces':
            dataset_loading_func = fetch_olivetti_faces
            task = Task.CLASSIFICATION
        elif name == '20newsgroups':
            dataset_loading_func = fetch_20newsgroups
            task = Task.CLASSIFICATION
        elif name == '20newsgroups_vectorized':
            dataset_loading_func = fetch_20newsgroups_vectorized
            task = Task.CLASSIFICATION
        elif name == 'lfw_people':
            dataset_loading_func = fetch_lfw_people
            task = Task.CLASSIFICATION
        elif name == 'lfw_pairs':
            dataset_loading_func = fetch_lfw_pairs
            task = Task.CLASSIFICATION
        elif name == 'covtype':
            dataset_loading_func = fetch_covtype
            task = Task.CLASSIFICATION
        elif name == 'rcv1':
            dataset_loading_func = fetch_rcv1
            task = Task.CLASSIFICATION
        elif name == 'kddcup99':
            dataset_loading_func = fetch_kddcup99
            task = Task.CLASSIFICATION
        elif name == 'california_housing':
            dataset_loading_func = fetch_california_housing
            task = Task.REGRESSION
        else:
            raise ValueError("Unsupported dataset '{}'".format(name))
        X, y = dataset_loading_func(return_X_y=True)
        # Split off the test set, then carve the dev set out of the remaining
        # train set, using the same random state for reproducibility.
        X_train, X_test, y_train, y_test = train_test_split(X, y,
            test_size=dataset_parameters.test_size,
            random_state=dataset_parameters.random_state)
        X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train,
            test_size=dataset_parameters.dev_size,
            random_state=dataset_parameters.random_state)
        if dataset_parameters.normalize:
            pass  # Normalization is not implemented yet.
        return Dataset(task, dataset_parameters, X_train,
                       X_dev, X_test, y_train, y_dev, y_test)
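
A quick usage sketch of the loader above; the dataset name and split values here are illustrative, not prescribed by the project:

from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.data.dataset_parameters import DatasetParameters

# Illustrative values; any supported dataset name works.
params = DatasetParameters(name='boston', test_size=0.2, dev_size=0.2,
                           random_state=42, normalize=False)
dataset = DatasetLoader.load_from_name(params)
print(dataset.task, dataset.X_train.shape, dataset.X_dev.shape, dataset.X_test.shape)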
bolsonaro/data/dataset_parameters.py 0 → 100644
class DatasetParameters(object):

    def __init__(self, name, test_size, dev_size, random_state, normalize):
        self._name = name
        self._test_size = test_size
        self._dev_size = dev_size
        self._random_state = random_state
        self._normalize = normalize

    @property
    def name(self):
        return self._name

    @property
    def test_size(self):
        return self._test_size

    @property
    def dev_size(self):
        return self._dev_size

    @property
    def random_state(self):
        return self._random_state

    @property
    def normalize(self):
        return self._normalize
bolsonaro/data/task.py 0 → 100644
from enum import Enum


class Task(Enum):
    CLASSIFICATION = 1
    REGRESSION = 2
bolsonaro/models/model_factory.py 0 → 100644
from bolsonaro.models.omp_forest_classifier import OmpForestClassifier
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.data.task import Task


class ModelFactory(object):

    @staticmethod
    def build(task, forest_size, extracted_forest_size, seed=None):
        if task == Task.CLASSIFICATION:
            model_func = OmpForestClassifier
        elif task == Task.REGRESSION:
            model_func = OmpForestRegressor
        else:
            raise ValueError("Unsupported task '{}'".format(task))
        return model_func(
            forest_size=forest_size,
            extracted_forest_size=extracted_forest_size,
            seed=seed
        )
bolsonaro/models/omp_forest_classifier.py 0 → 100644
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier


class OmpForestClassifier(BaseEstimator):

    def __init__(self, forest_size, extracted_forest_size, seed=None):
        # Signature matches the call in ModelFactory.build; classification is
        # rejected for now.
        raise ValueError('Classification tasks are not supported for now')

    def fit(self, X, y):
        pass
bolsonaro/models/omp_forest_regressor.py 0 → 100644
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit


class OmpForestRegressor(BaseEstimator):

    def __init__(self, forest_size, extracted_forest_size, seed=None):
        self._regressor = RandomForestRegressor(n_estimators=forest_size,
                                                random_state=seed)
        self._forest_size = forest_size
        self._extracted_forest_size = extracted_forest_size
        self._seed = seed

    def fit(self, X_train, y_train):
        self._forest = self._train_forest(X_train, y_train)
        self._weights = self._extract_subforest(X_train, y_train)
        return self

    @property
    def forest(self):
        return self._forest

    @property
    def weights(self):
        return self._weights

    @property
    def model_parameters(self):
        # Consumed by Trainer.process when resolving the output file name.
        return {'forest_size': self._forest_size,
                'extracted_forest_size': self._extracted_forest_size,
                'seed': self._seed}

    def _train_forest(self, X_train, y_train):
        self._regressor.fit(X_train, y_train)
        forest = self._regressor.estimators_
        return forest

    def _extract_subforest(self, X_train, y_train):
        # Dictionary D: one row per training sample, one column of per-tree
        # predictions per tree of the trained forest.
        D = [[tree.predict([elem])[0] for tree in self._forest] for elem in X_train]
        # OMP selects the trees whose weighted sum best approximates y_train.
        omp = OrthogonalMatchingPursuit(n_nonzero_coefs=self._extracted_forest_size,
                                        fit_intercept=False, normalize=False)
        omp.fit(D, y_train)
        weights = omp.coef_
        return weights
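
The extraction step above treats each tree as a dictionary atom: the per-tree predictions on the training set form the columns of D, and OMP picks a sparse weight vector over trees that approximates y_train. A self-contained sketch of the same idea on synthetic data (make_regression and all sizes are illustrative stand-ins):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit

# Synthetic data standing in for a real training split.
X, y = make_regression(n_samples=200, n_features=10, random_state=0)
forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

# Dictionary D: rows are training samples, columns are per-tree predictions.
D = np.array([tree.predict(X) for tree in forest.estimators_]).T

# OMP keeps 10 non-zero weights, i.e. a 10-tree subforest
# (fit_intercept=False as in the project's call).
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=10, fit_intercept=False)
omp.fit(D, y)
selected = np.flatnonzero(omp.coef_)  # indices of the extracted trees

# The pruned forest predicts through the weighted sum of the selected trees.
y_hat = D[:, selected] @ omp.coef_[selected]
print(len(selected), 'trees selected')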
bolsonaro/trainer.py 0 → 100644
from bolsonaro.utils import resolve_output_file_name

import pickle


class Trainer(object):

    def __init__(self, dataset, model, results_dir, models_dir):
        self._dataset = dataset
        self._model = model
        self._results_dir = results_dir
        self._models_dir = models_dir

    def process(self):
        self._model.fit(self._dataset.X_train, self._dataset.y_train)
        output_file_name = resolve_output_file_name(
            self._dataset.dataset_parameters,
            self._model.model_parameters,
            self._results_dir,
            self._models_dir
        )
        # Save the extracted forest and its OMP weights.
        with open(output_file_name, 'wb') as output_file:
            pickle.dump({
                'forest': self._model.forest,
                'weights': self._model.weights
            }, output_file)
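
resolve_output_file_name comes from bolsonaro.utils, which is not part of this commit. Purely as an assumption about its contract (a unique output path derived from the run's parameters), it could look like:

import os

def resolve_output_file_name(dataset_parameters, model_parameters, results_dir, models_dir):
    # Hypothetical sketch; the real bolsonaro.utils implementation is not shown
    # in this commit. Builds a unique pickle path from the run's parameters.
    file_name = '{}_{}trees_seed{}.pickle'.format(
        dataset_parameters.name,
        model_parameters['extracted_forest_size'],
        dataset_parameters.random_state)
    return os.path.join(models_dir, file_name)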
bolsonaro/plotter.py 0 → 100644
from pathlib import Path

root_directory = Path(__file__).parent.parent.absolute()


class Plotter(object):
    pass  # Plotting utilities to be implemented.
train.py 0 → 100644
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.models.model_factory import ModelFactory
from bolsonaro.trainer import Trainer

import argparse
import pathlib
import random


if __name__ == "__main__":
    default_dataset_name = 'boston'
    default_normalize = False
    default_forest_size = 100
    default_extracted_forest_size = 10
    default_results_dir = 'results'
    default_models_dir = 'models'
    default_dev_size = 0.2
    default_test_size = 0.2
    default_use_random_seed = True
    default_random_seed_number = 1
    begin_random_seed_range = 1
    end_random_seed_range = 2000

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
    parser.add_argument('--normalize', action='store_true', default=default_normalize, help='Normalize the data by dividing the prediction vectors by their L2 norm.')
    parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.')
    parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=[default_extracted_forest_size], help='The number of trees selected by OMP.')
    parser.add_argument('--results_dir', nargs='?', type=str, default=default_results_dir, help='The output directory of the results.')
    parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
    parser.add_argument('--dev_size', nargs='?', type=float, default=default_dev_size, help='Dev subset ratio.')
    parser.add_argument('--test_size', nargs='?', type=float, default=default_test_size, help='Test subset ratio.')
    parser.add_argument('--use_random_seed', action='store_true', default=default_use_random_seed, help='Use random seeds for the data split.')
    parser.add_argument('--random_seed_number', nargs='?', type=int, default=default_random_seed_number, help='Number of random seeds used.')
    args = parser.parse_args()

    pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)
    pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True)

    # Draw one seed per requested run; fall back to a single unseeded run otherwise.
    random_seeds = [random.randint(begin_random_seed_range, end_random_seed_range)
                    for i in range(args.random_seed_number)] \
        if args.use_random_seed else [None]

    for random_seed in random_seeds:
        dataset = DatasetLoader.load_from_name(
            DatasetParameters(
                name=args.dataset_name,
                test_size=args.test_size,
                dev_size=args.dev_size,
                random_state=random_seed,
                normalize=args.normalize
            )
        )
        for extracted_forest_size in args.extracted_forest_size:
            model = ModelFactory.build(
                task=dataset.task,
                forest_size=args.forest_size,
                extracted_forest_size=extracted_forest_size,
                seed=random_seed
            )
            trainer = Trainer(
                dataset=dataset,
                model=model,
                results_dir=args.results_dir,
                models_dir=args.models_dir
            )
            trainer.process()
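
For reference, one run equivalent to the script's defaults, driven programmatically instead of through argparse (all values mirror the defaults above):

from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.models.model_factory import ModelFactory
from bolsonaro.trainer import Trainer

dataset = DatasetLoader.load_from_name(DatasetParameters(
    name='boston', test_size=0.2, dev_size=0.2, random_state=1, normalize=False))
model = ModelFactory.build(task=dataset.task, forest_size=100,
                           extracted_forest_size=10, seed=1)
Trainer(dataset=dataset, model=model,
        results_dir='results', models_dir='models').process()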