Skip to content
Snippets Groups Projects

Resolve "Experiment pipeline"

Merged Charly Lamothe requested to merge 12-experiment-pipeline into master
2 files
+ 121
21
Compare changes
  • Side-by-side
  • Inline
Files
2
from bolsonaro.data.dataset import Dataset
from bolsonaro.data.dataset import Dataset
 
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.task import Task
from bolsonaro.data.task import Task
from bolsonaro.utils import change_binary_func_load
from bolsonaro.utils import change_binary_func_load
@@ -9,13 +10,38 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
@@ -9,13 +10,38 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import preprocessing
 
import random
 
import pandas as pd
class DatasetLoader(object):
class DatasetLoader(object):
 
# Class-level default values.
# Only DEFAULT_TEST_SIZE, DEFAULT_DEV_SIZE and DEFAULT_DATASET_NORMALIZER are
# read in the visible part of this file (by load_default); the others are
# presumably consumed by experiment/training scripts -- TODO confirm callers.
DEFAULT_DATASET_NAME = 'boston'

DEFAULT_NORMALIZE_D = False

# Forwarded as `dataset_normalizer` to DatasetParameters by load_default.
DEFAULT_DATASET_NORMALIZER = 'standard'

DEFAULT_FOREST_SIZE = 100

DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES = 5

DEFAULT_EXTRACTED_FOREST_SIZE_STOP = 0.1

# Forwarded as `dev_size` to DatasetParameters by load_default.
DEFAULT_DEV_SIZE = 0.2

# Forwarded as `test_size` to DatasetParameters by load_default.
DEFAULT_TEST_SIZE = 0.2

DEFAULT_RANDOM_SEED_NUMBER = 1

# Comma-separated subset names -- presumably parsed by the consumer; verify.
DEFAULT_SUBSETS_USED = 'train,dev'

DEFAULT_NORMALIZE_WEIGHTS = False
 
 
# Dataset names advertised by this loader. Presumably each one has a matching
# branch in load(); not all branches are visible here -- TODO confirm.
dataset_names = ['boston', 'iris', 'diabetes', 'digits', 'linnerud', 'wine',

'breast_cancer', 'olivetti_faces', '20newsgroups_vectorized', 'lfw_people',

'lfw_pairs', 'covtype', 'rcv1', 'california_housing', 'diamonds']


# One integer per entry of dataset_names (same 15 keys).
# NOTE(review): looks like the number of random seeds/runs to use for each
# dataset -- confirm against the code that reads this mapping.
dataset_seed_numbers = {'boston':15, 'iris':15, 'diabetes':15, 'digits':5,

'linnerud':15, 'wine':15, 'breast_cancer':15, 'olivetti_faces':15,

'20newsgroups_vectorized':3, 'lfw_people':3,

'lfw_pairs':3, 'covtype':3, 'rcv1':3, 'california_housing':3,

'diamonds': 15}
 
@staticmethod
@staticmethod
def load(dataset_parameters):
def load(dataset_parameters):
name = dataset_parameters.name
name = dataset_parameters.name
 
X, y = None, None
if name == 'boston':
if name == 'boston':
dataset_loading_func = load_boston
dataset_loading_func = load_boston
task = Task.REGRESSION
task = Task.REGRESSION
@@ -37,37 +63,52 @@ class DatasetLoader(object):
@@ -37,37 +63,52 @@ class DatasetLoader(object):
elif name == 'breast_cancer':
elif name == 'breast_cancer':
dataset_loading_func = change_binary_func_load(load_breast_cancer)
dataset_loading_func = change_binary_func_load(load_breast_cancer)
task = Task.BINARYCLASSIFICATION
task = Task.BINARYCLASSIFICATION
elif name == 'olivetti_faces': # bug (no return X_y)
elif name == 'olivetti_faces':
dataset_loading_func = fetch_olivetti_faces
dataset = fetch_olivetti_faces(random_state=dataset_parameters.random_state, shuffle=True)
task = Task.MULTICLASSIFICATION
elif name == '20newsgroups': # bug (no return X_y)
dataset_loading_func = fetch_20newsgroups
task = Task.MULTICLASSIFICATION
task = Task.MULTICLASSIFICATION
 
X, y = dataset.data, dataset.target
elif name == '20newsgroups_vectorized':
elif name == '20newsgroups_vectorized':
dataset_loading_func = fetch_20newsgroups_vectorized
dataset = fetch_20newsgroups_vectorized()
 
X, y = dataset.data, dataset.target
task = Task.MULTICLASSIFICATION
task = Task.MULTICLASSIFICATION
elif name == 'lfw_people': # needs PIL (image dataset)
elif name == 'lfw_people':
dataset_loading_func = fetch_lfw_people
dataset = fetch_lfw_people()
 
X, y = dataset.data, dataset.target
task = Task.MULTICLASSIFICATION
task = Task.MULTICLASSIFICATION
elif name == 'lfw_pairs':
elif name == 'lfw_pairs':
dataset_loading_func = fetch_lfw_pairs
dataset = fetch_lfw_pairs()
 
X, y = dataset.data, dataset.target
task = Task.MULTICLASSIFICATION
task = Task.MULTICLASSIFICATION
elif name == 'covtype':
elif name == 'covtype':
dataset_loading_func = fetch_covtype
X, y = fetch_covtype(random_state=dataset_parameters.random_state, shuffle=True, return_X_y=True)
task = Task.MULTICLASSIFICATION
task = Task.MULTICLASSIFICATION
elif name == 'rcv1':
elif name == 'rcv1':
dataset_loading_func = fetch_rcv1
X, y = fetch_rcv1(random_state=dataset_parameters.random_state, shuffle=True, return_X_y=True)
task = Task.MULTICLASSIFICATION
elif name == 'kddcup99':
dataset_loading_func = fetch_kddcup99
task = Task.MULTICLASSIFICATION
task = Task.MULTICLASSIFICATION
elif name == 'california_housing':
elif name == 'california_housing':
dataset_loading_func = fetch_california_housing
X, y = fetch_california_housing(return_X_y=True)
 
task = Task.REGRESSION
 
elif name == 'diamonds':
 
# TODO: make a proper fetcher instead of the following code
 
from sklearn.preprocessing import LabelEncoder
 
df = pd.read_csv('data/diamonds.csv')
 
df.drop(['Unnamed: 0'], axis=1 , inplace=True)
 
df = df[(df[['x','y','z']] != 0).all(axis=1)]
 
df.drop(['x','y','z'], axis=1, inplace= True)
 
label_cut = LabelEncoder()
 
label_color = LabelEncoder()
 
label_clarity = LabelEncoder()
 
df['cut'] = label_cut.fit_transform(df['cut'])
 
df['color'] = label_color.fit_transform(df['color'])
 
df['clarity'] = label_clarity.fit_transform(df['clarity'])
 
X, y = df.drop(['price'], axis=1), df['price']
task = Task.REGRESSION
task = Task.REGRESSION
else:
else:
raise ValueError("Unsupported dataset '{}'".format(name))
raise ValueError("Unsupported dataset '{}'".format(name))
 
if X is None:
X, y = dataset_loading_func(return_X_y=True)
X, y = dataset_loading_func(return_X_y=True)
 
X_train, X_test, y_train, y_test = train_test_split(X, y,
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=dataset_parameters.test_size,
test_size=dataset_parameters.test_size,
random_state=dataset_parameters.random_state)
random_state=dataset_parameters.random_state)
@@ -92,3 +133,20 @@ class DatasetLoader(object):
@@ -92,3 +133,20 @@ class DatasetLoader(object):
return Dataset(task, X_train,
return Dataset(task, X_train,
X_dev, X_test, y_train, y_dev, y_test)
X_dev, X_test, y_train, y_dev, y_test)
 
 
@staticmethod
def load_default(dataset_name, seed):
    """Load a dataset using the class-level default split/normalizer settings.

    Builds a ``DatasetParameters`` from ``DEFAULT_TEST_SIZE``,
    ``DEFAULT_DEV_SIZE`` and ``DEFAULT_DATASET_NORMALIZER`` and delegates to
    ``DatasetLoader.load``.

    :param dataset_name: name of the dataset, forwarded as ``name``.
    :param seed: value forwarded as ``random_state``. When ``None``, a seed
        is drawn with ``random.randint`` from the inclusive range [1, 2000].
    :return: whatever ``DatasetLoader.load`` returns for these parameters.
    """
    begin_random_seed_range = 1
    end_random_seed_range = 2000

    # Compare against None explicitly: a plain truthiness test would treat a
    # caller-supplied seed of 0 as "missing" and silently replace it with a
    # random one, making the run non-reproducible.
    if seed is None:
        seed = random.randint(begin_random_seed_range, end_random_seed_range)

    dataset_parameters = DatasetParameters(
        name=dataset_name,
        test_size=DatasetLoader.DEFAULT_TEST_SIZE,
        dev_size=DatasetLoader.DEFAULT_DEV_SIZE,
        random_state=seed,
        dataset_normalizer=DatasetLoader.DEFAULT_DATASET_NORMALIZER
    )

    return DatasetLoader.load(dataset_parameters)
Loading