diff --git a/bolsonaro/data/dataset.py b/bolsonaro/data/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2f67c489be94791397649ba292a49f65dd604d8
--- /dev/null
+++ b/bolsonaro/data/dataset.py
@@ -0,0 +1,44 @@
+class Dataset(object):
+
+    def __init__(self, task, dataset_parameters, X_train, X_dev, X_test, y_train,
+                 y_dev, y_test):
+        self._task = task
+        self._dataset_parameters = dataset_parameters
+        self._X_train = X_train
+        self._X_dev = X_dev
+        self._X_test = X_test
+        self._y_train = y_train
+        self._y_dev = y_dev
+        self._y_test = y_test
+
+    @property
+    def task(self):
+        return self._task
+
+    @property
+    def dataset_parameters(self):
+        return self._dataset_parameters
+
+    @property
+    def X_train(self):
+        return self._X_train
+
+    @property
+    def X_dev(self):
+        return self._X_dev
+
+    @property
+    def X_test(self):
+        return self._X_test
+
+    @property
+    def y_train(self):
+        return self._y_train
+
+    @property
+    def y_dev(self):
+        return self._y_dev
+
+    @property
+    def y_test(self):
+        return self._y_test
diff --git a/bolsonaro/data/dataset_loader.py b/bolsonaro/data/dataset_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e4264ef88d6dc595c7e5df14b678dcb95086a74
--- /dev/null
+++ b/bolsonaro/data/dataset_loader.py
@@ -0,0 +1,80 @@
+from bolsonaro.data.dataset import Dataset
+from bolsonaro.data.task import Task
+
+from sklearn.datasets import load_boston, load_iris, load_diabetes, load_digits, load_linnerud, load_wine, load_breast_cancer
+from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
+    fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \
+    fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
+from sklearn.model_selection import train_test_split
+
+
+class DatasetLoader(object):
+
+    @staticmethod
+    def load_from_name(dataset_parameters):
+        name = dataset_parameters.name
+        if name == 'boston':
+            dataset_loading_func = load_boston
+            task = Task.REGRESSION
+        elif name == 'iris':
+            dataset_loading_func = load_iris
+            task = Task.CLASSIFICATION
+        elif name == 'diabetes':
+            dataset_loading_func = load_diabetes
+            task = Task.REGRESSION
+        elif name == 'digits':
+            dataset_loading_func = load_digits
+            task = Task.CLASSIFICATION
+        elif name == 'linnerud':
+            dataset_loading_func = load_linnerud
+            task = Task.REGRESSION
+        elif name == 'wine':
+            dataset_loading_func = load_wine
+            task = Task.CLASSIFICATION
+        elif name == 'breast_cancer':
+            dataset_loading_func = load_breast_cancer
+            task = Task.CLASSIFICATION
+        elif name == 'olivetti_faces':
+            dataset_loading_func = fetch_olivetti_faces
+            task = Task.CLASSIFICATION
+        elif name == '20newsgroups':
+            dataset_loading_func = fetch_20newsgroups
+            task = Task.CLASSIFICATION
+        elif name == '20newsgroups_vectorized':
+            dataset_loading_func = fetch_20newsgroups_vectorized
+            task = Task.CLASSIFICATION
+        elif name == 'lfw_people':
+            dataset_loading_func = fetch_lfw_people
+            task = Task.CLASSIFICATION
+        elif name == 'lfw_pairs':
+            dataset_loading_func = fetch_lfw_pairs
+            task = Task.CLASSIFICATION
+        elif name == 'covtype':
+            dataset_loading_func = fetch_covtype
+            task = Task.CLASSIFICATION
+        elif name == 'rcv1':
+            dataset_loading_func = fetch_rcv1
+            task = Task.CLASSIFICATION
+        elif name == 'kddcup99':
+            dataset_loading_func = fetch_kddcup99
+            task = Task.CLASSIFICATION
+        elif name == 'california_housing':
+            dataset_loading_func = fetch_california_housing
+            task = Task.REGRESSION
+        else:
+            raise ValueError("Unsupported dataset '{}'".format(name))
+
+        X, y = dataset_loading_func(return_X_y=True)
+        X_train, X_test, y_train, y_test = train_test_split(X, y,
+                                                            test_size=dataset_parameters.test_size,
+                                                            random_state=dataset_parameters.random_state)
+        X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train,
+                                                          test_size=dataset_parameters.dev_size,
+                                                          random_state=dataset_parameters.random_state)
+
+        if dataset_parameters.normalize:
+            # TODO: data normalization is not implemented yet
+            pass
+
+        return Dataset(task, dataset_parameters, X_train,
+                       X_dev, X_test, y_train, y_dev, y_test)
diff --git a/bolsonaro/data/dataset_parameters.py b/bolsonaro/data/dataset_parameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..e820b8f13a8502d08c7c2f679b28cef1b7262fd7
--- /dev/null
+++ b/bolsonaro/data/dataset_parameters.py
@@ -0,0 +1,28 @@
+class DatasetParameters(object):
+
+    def __init__(self, name, test_size, dev_size, random_state, normalize):
+        self._name = name
+        self._test_size = test_size
+        self._dev_size = dev_size
+        self._random_state = random_state
+        self._normalize = normalize
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def test_size(self):
+        return self._test_size
+
+    @property
+    def dev_size(self):
+        return self._dev_size
+
+    @property
+    def random_state(self):
+        return self._random_state
+
+    @property
+    def normalize(self):
+        return self._normalize
diff --git a/bolsonaro/data/task.py b/bolsonaro/data/task.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f47fa22f472f769c075f40e1c25a7bf3de45f0d
--- /dev/null
+++ b/bolsonaro/data/task.py
@@ -0,0 +1,6 @@
+from enum import Enum
+
+
+class Task(Enum):
+    CLASSIFICATION = 1
+    REGRESSION = 2
diff --git a/bolsonaro/models/model_factory.py b/bolsonaro/models/model_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2f02a604d3d51c165459f3cd216033cc4ef256a
--- /dev/null
+++ b/bolsonaro/models/model_factory.py
@@ -0,0 +1,20 @@
+from bolsonaro.models.omp_forest_classifier import OmpForestClassifier
+from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
+from bolsonaro.data.task import Task
+
+
+class ModelFactory(object):
+
+    @staticmethod
+    def build(task, forest_size, extracted_forest_size, seed=None):
+        if task == Task.CLASSIFICATION:
+            model_func = OmpForestClassifier
+        elif task == Task.REGRESSION:
+            model_func = OmpForestRegressor
+        else:
+            raise ValueError("Unsupported task '{}'".format(task))
+        return model_func(
+            forest_size=forest_size,
+            extracted_forest_size=extracted_forest_size,
+            seed=seed
+        )
diff --git a/bolsonaro/models/omp_forest_classifier.py b/bolsonaro/models/omp_forest_classifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..12cc23fab69fc0b79ff40b1d6957db5532a8c452
--- /dev/null
+++ b/bolsonaro/models/omp_forest_classifier.py
@@ -0,0 +1,11 @@
+from sklearn.base import BaseEstimator
+from sklearn.ensemble import RandomForestClassifier
+
+
+class OmpForestClassifier(BaseEstimator):
+
+    def __init__(self, forest_size, extracted_forest_size, seed=None):
+        raise ValueError('Classification tasks are not supported for now')
+
+    def fit(self, X, y):
+        pass
diff --git a/bolsonaro/models/omp_forest_regressor.py b/bolsonaro/models/omp_forest_regressor.py
new file mode 100644
index 0000000000000000000000000000000000000000..17d99aa452652b96990135009d1199180c2099b8
--- /dev/null
+++ b/bolsonaro/models/omp_forest_regressor.py
@@ -0,0 +1,40 @@
+from sklearn.base import BaseEstimator
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import OrthogonalMatchingPursuit
+
+
+class OmpForestRegressor(BaseEstimator):
+
+    def __init__(self, forest_size, extracted_forest_size, seed=None):
+        self._regressor = RandomForestRegressor(n_estimators=forest_size,
+                                                random_state=seed)
+        self._extracted_forest_size = extracted_forest_size
+
+    def fit(self, X_train, y_train):
+        self._forest = self._train_forest(X_train, y_train)
+
+        self._weights = self._extract_subforest(X_train, y_train)
+
+        return self
+
+    @property
+    def forest(self):
+        return self._forest
+
+    @property
+    def weights(self):
+        return self._weights
+
+    def _train_forest(self, X_train, y_train):
+        self._regressor.fit(X_train, y_train)
+        forest = self._regressor.estimators_
+        return forest
+
+    def _extract_subforest(self, X_train, y_train):
+        # D: one row per training sample, one column per tree prediction
+        D = [[tree.predict([elem])[0] for tree in self._forest] for elem in X_train]
+        omp = OrthogonalMatchingPursuit(n_nonzero_coefs=self._extracted_forest_size,
+                                        fit_intercept=False, normalize=False)
+        omp.fit(D, y_train)
+        weights = omp.coef_
+        return weights
diff --git a/bolsonaro/trainer.py b/bolsonaro/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb9f9fedc6d71b01eb85be538280d9bd5e824c36
--- /dev/null
+++ b/bolsonaro/trainer.py
@@ -0,0 +1,27 @@
+from bolsonaro.utils import resolve_output_file_name
+
+import pickle
+
+
+class Trainer(object):
+
+    def __init__(self, dataset, model, results_dir, models_dir):
+        self._dataset = dataset
+        self._model = model
+        self._results_dir = results_dir
+        self._models_dir = models_dir
+
+    def process(self):
+        self._model.fit(self._dataset.X_train, self._dataset.y_train)
+        output_file_name = resolve_output_file_name(
+            self._dataset.dataset_parameters,
+            self._model.model_parameters,
+            self._results_dir,
+            self._models_dir
+        )
+        with open(output_file_name, 'wb') as output_file:
+            # save forest and weights here
+            pickle.dump({
+                'forest': self._model.forest,
+                'weights': self._model.weights
+            }, output_file)
diff --git a/bolsonaro/utils.py b/bolsonaro/utils.py
index 6761ac86095a3420cc371a6f2b7aac0a629acc14..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644
--- a/bolsonaro/utils.py
+++ b/bolsonaro/utils.py
@@ -1,3 +0,0 @@
-from pathlib import Path
-
-root_directory = Path(__file__).parent.parent.absolute()
diff --git a/bolsonaro/visualization/plotter.py b/bolsonaro/visualization/plotter.py
new file mode 100644
index 0000000000000000000000000000000000000000..01f0f0388885e37b8bdefb5d9e8877a491dcdce5
--- /dev/null
+++ b/bolsonaro/visualization/plotter.py
@@ -0,0 +1,3 @@
+class Plotter(object):
+
+    pass
\ No newline at end of file
diff --git a/train.py b/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd6f228ce4756cfd487200589701076ffdc6b667
--- /dev/null
+++ b/train.py
@@ -0,0 +1,69 @@
+from bolsonaro.data.dataset_parameters import DatasetParameters
+from bolsonaro.data.dataset_loader import DatasetLoader
+from bolsonaro.models.model_factory import ModelFactory
+from bolsonaro.trainer import Trainer
+
+import argparse
+import pathlib
+import random
+
+
+if __name__ == "__main__":
+    default_dataset_name = 'boston'
+    default_normalize = False
+    default_forest_size = 100
+    default_extracted_forest_size = [10]
+    default_results_dir = 'results'
+    default_models_dir = 'models'
+    default_dev_size = 0.2
+    default_test_size = 0.2
+    default_use_random_seed = True
+    default_random_seed_number = 1
+    begin_random_seed_range = 1
+    end_random_seed_range = 2000
+
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
+    parser.add_argument('--normalize', action='store_true', default=default_normalize, help='Normalize the data by dividing the prediction vectors by their L2 norm.')
+    parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.')
+    parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=default_extracted_forest_size, help='The number of trees selected by OMP.')
+    parser.add_argument('--results_dir', nargs='?', type=str, default=default_results_dir, help='The output directory of the results.')
+    parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
+    parser.add_argument('--dev_size', nargs='?', type=float, default=default_dev_size, help='Dev subset ratio.')
+    parser.add_argument('--test_size', nargs='?', type=float, default=default_test_size, help='Test subset ratio.')
+    parser.add_argument('--use_random_seed', action='store_true', default=default_use_random_seed, help='Use random seeds for the data splits.')
+    parser.add_argument('--random_seed_number', nargs='?', type=int, default=default_random_seed_number, help='Number of random seeds used.')
+    args = parser.parse_args()
+
+    pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True)
+
+    random_seeds = [random.randint(begin_random_seed_range, end_random_seed_range) for i in range(args.random_seed_number)] \
+        if args.use_random_seed else [None]
+
+    for random_seed in random_seeds:
+        dataset = DatasetLoader.load_from_name(
+            DatasetParameters(
+                name=args.dataset_name,
+                test_size=args.test_size,
+                dev_size=args.dev_size,
+                random_state=random_seed,
+                normalize=args.normalize
+            )
+        )
+
+        for extracted_forest_size in args.extracted_forest_size:
+            model = ModelFactory.build(
+                task=dataset.task,
+                forest_size=args.forest_size,
+                extracted_forest_size=extracted_forest_size,
+                seed=random_seed
+            )
+
+            trainer = Trainer(
+                dataset=dataset,
+                model=model,
+                results_dir=args.results_dir,
+                models_dir=args.models_dir
+            )
+            trainer.process()
diff --git a/bolsonaro/notebooks/.gitkeep b/visualize.py
similarity index 100%
rename from bolsonaro/notebooks/.gitkeep
rename to visualize.py
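
Note: bolsonaro/trainer.py imports resolve_output_file_name from bolsonaro.utils, but this diff leaves bolsonaro/utils.py empty, so that import will fail until the helper is added. Below is a minimal sketch of what such a helper could look like; the naming scheme and the unused model_parameters/results_dir arguments are assumptions made only to match the call site in trainer.py, not the project's actual implementation.

import os


def resolve_output_file_name(dataset_parameters, model_parameters, results_dir, models_dir):
    # Hypothetical naming scheme: <models_dir>/<dataset name>_seed<random_state>.pickle.
    # model_parameters and results_dir are accepted to match the call in trainer.py,
    # but this sketch does not use them.
    file_name = '{}_seed{}.pickle'.format(dataset_parameters.name, dataset_parameters.random_state)
    return os.path.join(models_dir, file_name)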