Skip to content
Snippets Groups Projects

Resolve "Experiment pipeline"

Merged Charly Lamothe requested to merge 12-experiment-pipeline into master
Compare and Show latest version
4 files
+ 59
54
Compare changes
  • Side-by-side
  • Inline
Files
4
from bolsonaro.data.dataset import Dataset
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.task import Task
from bolsonaro.utils import change_binary_func_load
@@ -9,10 +10,26 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import random
class DatasetLoader(object):
DEFAULT_DATASET_NAME = 'boston'
DEFAULT_NORMALIZE_D = False
DEFAULT_DATASET_NORMALIZER = 'standard'
DEFAULT_FOREST_SIZE = 100
DEFAULT_EXTRACTED_FOREST_SIZE = 10
DEFAULT_DEV_SIZE = 0.2
DEFAULT_TEST_SIZE = 0.2
DEFAULT_RANDOM_SEED_NUMBER = 1
DEFAULT_SUBSETS_USED = 'train,dev'
DEFAULT_NORMALIZE_WEIGHTS = False
dataset_names = ['boston', 'iris', 'diabetes', 'digits', 'linnerud', 'wine',
'breast_cancer', 'olivetti_faces', '20newsgroups_vectorized', 'lfw_people',
'lfw_pairs', 'covtype', 'rcv1', 'california_housing']
@staticmethod
def load(dataset_parameters):
name = dataset_parameters.name
@@ -39,23 +56,20 @@ class DatasetLoader(object):
dataset_loading_func = change_binary_func_load(load_breast_cancer)
task = Task.BINARYCLASSIFICATION
elif name == 'olivetti_faces':
data = fetch_olivetti_faces(random_state=dataset_parameters.random_state, shuffle=True)
task = Task.MULTICLASSIFICATION
X, y = data.data, data.target
elif name == '20newsgroups':
data = fetch_20newsgroups(random_state=dataset_parameters.random_state, shuffle=True)
#X, y =
dataset = fetch_olivetti_faces(random_state=dataset_parameters.random_state, shuffle=True)
task = Task.MULTICLASSIFICATION
X, y = dataset.data, dataset.target
elif name == '20newsgroups_vectorized':
dataset_loading_func = fetch_20newsgroups_vectorized
dataset = fetch_20newsgroups_vectorized()
X, y = dataset.data, dataset.target
task = Task.MULTICLASSIFICATION
elif name == 'lfw_people':
data = fetch_lfw_people()
X, y = data.data, data.target
dataset = fetch_lfw_people()
X, y = dataset.data, dataset.target
task = Task.MULTICLASSIFICATION
elif name == 'lfw_pairs':
data = fetch_lfw_pairs()
X, y = data.data, data.target
dataset = fetch_lfw_pairs()
X, y = dataset.data, dataset.target
task = Task.MULTICLASSIFICATION
elif name == 'covtype':
X, y = fetch_covtype(random_state=dataset_parameters.random_state, shuffle=True, return_X_y=True)
@@ -63,9 +77,6 @@ class DatasetLoader(object):
elif name == 'rcv1':
X, y = fetch_rcv1(random_state=dataset_parameters.random_state, shuffle=True, return_X_y=True)
task = Task.MULTICLASSIFICATION
elif name == 'kddcup99':
X, y = fetch_kddcup99(random_state=dataset_parameters.random_state, shuffle=True, return_X_y=True)
task = Task.MULTICLASSIFICATION
elif name == 'california_housing':
X, y = fetch_california_housing(return_X_y=True)
task = Task.REGRESSION
@@ -73,7 +84,7 @@ class DatasetLoader(object):
raise ValueError("Unsupported dataset '{}'".format(name))
if X is None:
X, y = dataset_loading_func()
X, y = dataset_loading_func(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=dataset_parameters.test_size,
@@ -99,3 +110,20 @@ class DatasetLoader(object):
return Dataset(task, X_train,
X_dev, X_test, y_train, y_dev, y_test)
@staticmethod
def load_default(dataset_name, seed):
    """Load ``dataset_name`` with the loader's default split sizes and normalizer.

    Args:
        dataset_name: Name of the dataset; forwarded as ``name`` to
            ``DatasetParameters``.
        seed: Value used as ``random_state``. When ``None``, a seed is
            drawn with ``random.randint`` from the inclusive range
            [1, 2000].

    Returns:
        Whatever ``DatasetLoader.load`` returns for the parameters built here.
    """
    begin_random_seed_range = 1
    end_random_seed_range = 2000
    # Test against None rather than truthiness: 0 is a legitimate seed and
    # must not be silently swapped for a random one, which would make the
    # run non-reproducible.
    if seed is None:
        seed = random.randint(begin_random_seed_range, end_random_seed_range)
    dataset_parameters = DatasetParameters(
        name=dataset_name,
        test_size=DatasetLoader.DEFAULT_TEST_SIZE,
        dev_size=DatasetLoader.DEFAULT_DEV_SIZE,
        random_state=seed,
        dataset_normalizer=DatasetLoader.DEFAULT_DATASET_NORMALIZER,
    )
    return DatasetLoader.load(dataset_parameters)
Loading