Skip to content
Snippets Groups Projects
Commit aae78b61 authored by charly.lamothe's avatar charly.lamothe
Browse files

Initial commit of clean scripts branch

parent 69dd8959
No related branches found
No related tags found
1 merge request!3clean scripts
class Dataset(object):
    """Read-only bundle of a learning task, the parameters used to load the
    data, and its train/dev/test split."""

    # Names of the constructor arguments, in declaration order; each one is
    # stored privately and exposed through a matching read-only property.
    _FIELDS = ('task', 'dataset_parameters', 'X_train', 'X_dev', 'X_test',
               'y_train', 'y_dev', 'y_test')

    def __init__(self, task, dataset_parameters, X_train, X_dev, X_test, y_train,
                 y_dev, y_test):
        received = locals()
        for field in Dataset._FIELDS:
            setattr(self, '_' + field, received[field])

    @property
    def task(self):
        """Task enum value describing the learning problem."""
        return self._task

    @property
    def dataset_parameters(self):
        """DatasetParameters used when loading/splitting the data."""
        return self._dataset_parameters

    @property
    def X_train(self):
        return self._X_train

    @property
    def X_dev(self):
        return self._X_dev

    @property
    def X_test(self):
        return self._X_test

    @property
    def y_train(self):
        return self._y_train

    @property
    def y_dev(self):
        return self._y_dev

    @property
    def y_test(self):
        return self._y_test
from bolsonaro.data.dataset import Dataset
from bolsonaro.data.task import Task
from sklearn.datasets import load_boston, load_iris, load_diabetes, load_digits, load_linnerud, load_wine, load_breast_cancer
from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \
fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
from sklearn.model_selection import train_test_split
class DatasetLoader(object):
    """Factory that loads a scikit-learn dataset by name and wraps its
    train/dev/test split in a Dataset object."""

    # name -> (loading function, learning task). Every loader supports the
    # scikit-learn return_X_y=True calling convention.
    _DATASETS = {
        'boston': (load_boston, Task.REGRESSION),
        'iris': (load_iris, Task.CLASSIFICATION),
        'diabetes': (load_diabetes, Task.REGRESSION),
        'digits': (load_digits, Task.CLASSIFICATION),
        'linnerud': (load_linnerud, Task.REGRESSION),
        'wine': (load_wine, Task.CLASSIFICATION),
        'breast_cancer': (load_breast_cancer, Task.CLASSIFICATION),
        'olivetti_faces': (fetch_olivetti_faces, Task.CLASSIFICATION),
        '20newsgroups': (fetch_20newsgroups, Task.CLASSIFICATION),
        '20newsgroups_vectorized': (fetch_20newsgroups_vectorized, Task.CLASSIFICATION),
        'lfw_people': (fetch_lfw_people, Task.CLASSIFICATION),
        # BUGFIX: the original 'lfw_pairs' branch never assigned a task,
        # which left 'task' undefined when building the Dataset. LFW pairs
        # is a face-verification (classification) problem.
        'lfw_pairs': (fetch_lfw_pairs, Task.CLASSIFICATION),
        'covtype': (fetch_covtype, Task.CLASSIFICATION),
        'rcv1': (fetch_rcv1, Task.CLASSIFICATION),
        'kddcup99': (fetch_kddcup99, Task.CLASSIFICATION),
        'california_housing': (fetch_california_housing, Task.REGRESSION),
    }

    @staticmethod
    def load_from_name(dataset_parameters):
        """Load the dataset named by dataset_parameters and return a Dataset.

        :param dataset_parameters: DatasetParameters exposing name, test_size,
            dev_size, random_state and normalize.
        :return: Dataset holding the task and the train/dev/test split.
        :raises ValueError: if the dataset name is not supported.
        """
        name = dataset_parameters.name
        if name not in DatasetLoader._DATASETS:
            raise ValueError("Unsupported dataset '{}'".format(name))
        dataset_loading_func, task = DatasetLoader._DATASETS[name]
        X, y = dataset_loading_func(return_X_y=True)
        # BUGFIX: DatasetParameters defines 'random_state', not 'seed'; the
        # original '.seed' access raised AttributeError.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=dataset_parameters.test_size,
            random_state=dataset_parameters.random_state)
        X_train, X_dev, y_train, y_dev = train_test_split(
            X_train, y_train,
            test_size=dataset_parameters.dev_size,
            random_state=dataset_parameters.random_state)
        if dataset_parameters.normalize:
            # TODO: normalization is not implemented yet.
            pass
        return Dataset(task, dataset_parameters, X_train,
                       X_dev, X_test, y_train, y_dev, y_test)
class DatasetParameters(object):
    """Immutable description of how a dataset should be loaded and split."""

    def __init__(self, name, test_size, dev_size, random_state, normalize):
        """
        :param name: identifier of the scikit-learn dataset to load.
        :param test_size: ratio of the data reserved for the test subset.
        :param dev_size: ratio of the remaining data reserved for dev.
        :param random_state: seed used for the train/dev/test splits.
        :param normalize: whether the data should be normalized.
        """
        self._name = name
        self._test_size = test_size
        self._dev_size = dev_size
        self._random_state = random_state
        self._normalize = normalize

    # Read-only views over the constructor arguments.
    name = property(lambda self: self._name)
    test_size = property(lambda self: self._test_size)
    dev_size = property(lambda self: self._dev_size)
    random_state = property(lambda self: self._random_state)
    normalize = property(lambda self: self._normalize)
from enum import Enum
class Task(Enum):
    """Kind of supervised learning problem a dataset defines."""
    CLASSIFICATION = 1
    REGRESSION = 2
from bolsonaro.models.omp_forest_classifier import OmpForestClassifier
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.data.task import Task
class ModelFactory(object):
    """Build the OMP forest model that matches a given learning task."""

    @staticmethod
    def build(task, forest_size, extracted_forest_size, seed=None):
        """Return an OMP forest configured for the given task.

        :param task: Task enum value (classification or regression).
        :param forest_size: number of trees in the full random forest.
        :param extracted_forest_size: number of trees kept by OMP.
        :param seed: optional random seed forwarded to the model.
        :raises ValueError: for any task other than the two supported ones.
        """
        if task == Task.CLASSIFICATION:
            model_class = OmpForestClassifier
        elif task == Task.REGRESSION:
            model_class = OmpForestRegressor
        else:
            raise ValueError("Unsupported task '{}'".format(task))
        return model_class(forest_size=forest_size,
                           extracted_forest_size=extracted_forest_size,
                           seed=seed)
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
class OmpForestClassifier(BaseEstimator):
    """Placeholder OMP forest for classification tasks.

    Instantiating it always fails: classification is not implemented yet.
    """

    def __init__(self, forest_size=None, extracted_forest_size=None, seed=None):
        # BUGFIX: accept the keyword arguments ModelFactory.build passes
        # (forest_size, extracted_forest_size, seed) so callers get the
        # intended ValueError instead of a TypeError about unexpected kwargs.
        raise ValueError('Classification tasks are not supported for now')

    def fit(self, X, y):
        # Unreachable while __init__ raises; kept for interface symmetry
        # with OmpForestRegressor.
        pass
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit
class OmpForestRegressor(BaseEstimator):
    """Random forest regressor whose trees are re-weighted by OMP.

    A forest of ``forest_size`` trees is trained, then Orthogonal Matching
    Pursuit fits sparse weights over the individual tree predictions,
    effectively selecting ``extracted_forest_size`` trees.
    """

    def __init__(self, forest_size, extracted_forest_size, seed=None):
        """
        :param forest_size: number of trees in the underlying random forest.
        :param extracted_forest_size: number of non-zero OMP coefficients,
            i.e. number of trees kept.
        :param seed: random_state forwarded to the RandomForestRegressor.
        """
        self._regressor = RandomForestRegressor(n_estimators=forest_size,
                                                random_state=seed)
        self._extracted_forest_size = extracted_forest_size

    def fit(self, X_train, y_train):
        """Train the forest, then compute the OMP tree weights."""
        self._forest = self._train_forest(X_train, y_train)
        self._weights = self._extract_subforest(X_train, y_train)
        return self

    @property
    def forest(self):
        """List of fitted decision trees of the underlying forest."""
        return self._forest

    @property
    def weights(self):
        """OMP coefficient vector, one weight per tree."""
        return self._weights

    def _train_forest(self, X_train, y_train):
        # Fit the underlying random forest and expose its trees.
        self._regressor.fit(X_train, y_train)
        return self._regressor.estimators_

    def _extract_subforest(self, X_train, y_train):
        """Fit OMP on the per-tree predictions and return the sparse weights."""
        # BUGFIX: the dictionary matrix originally referenced an undefined
        # local name 'forest' (NameError); the trained trees live in
        # self._forest. Predicting whole batches per tree also avoids the
        # original per-sample Python loop.
        tree_predictions = [tree.predict(X_train) for tree in self._forest]
        # Row i of D holds every tree's prediction for sample i.
        D = list(zip(*tree_predictions))
        omp = OrthogonalMatchingPursuit(n_nonzero_coefs=self._extracted_forest_size,
                                        fit_intercept=False, normalize=False)
        omp.fit(D, y_train)
        return omp.coef_
from bolsonaro.utils import resolve_output_file_name
import pickle
class Trainer(object):
    """Fit a model on a dataset and persist the result to disk."""

    def __init__(self, dataset, model, results_dir, models_dir):
        """
        :param dataset: Dataset providing X_train/y_train and its parameters.
        :param model: estimator exposing fit(); also expected to expose
            model_parameters (see NOTE in process()).
        :param results_dir: output directory for result files.
        :param models_dir: output directory for serialized models.
        """
        self._dataset = dataset
        self._model = model
        self._results_dir = results_dir
        self._models_dir = models_dir

    def process(self):
        """Train the model and pickle the (currently empty) result payload."""
        self._model.fit(self._dataset.X_train, self._dataset.y_train)
        output_file_name = resolve_output_file_name(
            self._dataset.dataset_parameters,
            # NOTE(review): no model in this file defines 'model_parameters';
            # confirm the attribute exists on the models actually passed in.
            self._model.model_parameters,
            self._results_dir,
            self._models_dir
        )
        with open(output_file_name, 'wb') as output_file:
            # BUGFIX: pickle.dump takes (obj, file); the original call had
            # the arguments swapped, which raised a TypeError at runtime.
            pickle.dump({
                # TODO: save forest and weights here
            }, output_file)
from pathlib import Path
# Absolute path of the project root: two directory levels above this module.
root_directory = Path(__file__).parent.parent.absolute()
class Plotter(object):
    """Placeholder for plotting utilities; no functionality implemented yet."""
    # BUGFIX: the original class header had an empty body (the file ended
    # right after the colon), which is a SyntaxError; the docstring above
    # now serves as the class body.
\ No newline at end of file
train.py 0 → 100644
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.models.model_factory import ModelFactory
from bolsonaro.trainer import Trainer
import argparse
import pathlib
import random
if __name__ == "__main__":
    # Default hyper-parameters and I/O locations for the command line.
    default_dataset_name = 'boston'
    default_normalize = False
    default_forest_size = 100
    default_extracted_forest_size = 10
    default_results_dir = 'results'
    default_models_dir = 'models'
    default_dev_size = 0.2
    default_test_size = 0.2
    default_use_random_seed = True
    default_random_seed_number = 1
    begin_random_seed_range = 1
    end_random_seed_range = 2000

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
    parser.add_argument('--normalize', action='store_true', default=default_normalize, help='Normalize the data by doing the L2 division of the pred vectors.')
    parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.')
    parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=default_extracted_forest_size, help='The number of trees selected by OMP.')
    parser.add_argument('--results_dir', nargs='?', type=str, default=default_results_dir, help='The output directory of the results.')
    parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
    parser.add_argument('--dev_size', nargs='?', type=float, default=default_dev_size, help='Dev subset ratio')
    parser.add_argument('--test_size', nargs='?', type=float, default=default_test_size, help='Test subset ratio')
    parser.add_argument('--use_random_seed', action='store_true', default=default_use_random_seed, help='Random seed used for the data split')
    parser.add_argument('--random_seed_number', nargs='?', type=int, default=default_random_seed_number, help='Number of random seeds used')
    args = parser.parse_args()

    pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)
    pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True)

    # BUGFIX: when use_random_seed is false this used to be None, and the
    # loop below crashed iterating it; a single None seed now means
    # "unseeded" while preserving the seeded behavior.
    random_seeds = [random.randint(begin_random_seed_range, end_random_seed_range)
                    for _ in range(args.random_seed_number)] \
        if args.use_random_seed else [None]

    # BUGFIX: with nargs='+' the default is the bare int 10, which is not
    # iterable; normalize it to a list without changing the argparse help.
    extracted_forest_sizes = args.extracted_forest_size \
        if isinstance(args.extracted_forest_size, list) \
        else [args.extracted_forest_size]

    for random_seed in random_seeds:
        dataset = DatasetLoader.load_from_name(
            DatasetParameters(
                name=args.dataset_name,
                test_size=args.test_size,
                dev_size=args.dev_size,
                random_state=random_seed,
                normalize=args.normalize
            )
        )
        for extracted_forest_size in extracted_forest_sizes:
            # BUGFIX: ModelFactory is not meant to be instantiated; models
            # are created through its static build() method.
            model = ModelFactory.build(
                task=dataset.task,
                forest_size=args.forest_size,
                extracted_forest_size=extracted_forest_size,
                seed=random_seed
            )
            trainer = Trainer(
                dataset=dataset,
                model=model,
                results_dir=args.results_dir,
                models_dir=args.models_dir
            )
            trainer.process()
File moved
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment