Skip to content
Snippets Groups Projects
Commit 347ee357 authored by bbauvin
Browse files

Resolved randomized search for multicore issue

parent a7c52184
No related branches found
No related tags found
No related merge requests found
Showing
with 1169 additions and 36 deletions
......@@ -79,9 +79,9 @@ groupClass.add_argument('--CL_algos_multiview', metavar='STRING', action='store'
help='Determine which multiview classifier to use, separate with : if multiple, if empty, considering all', default='')
groupClass.add_argument('--CL_cores', metavar='INT', action='store', help='Number of cores, -1 for all', type=int,
default=1)
groupClass.add_argument('--CL_metrics', metavar='STRING', action='store',
help='Determine which metric to use, separate with ":" if multiple, if empty, considering all, '
'first one will be used for gridsearch', default='')
groupClass.add_argument('--CL_metrics', metavar='STRING', action='store', nargs="+",
help='Determine which metrics to use, separate metric and configuration with ":". If multiple, separate with space. If no metric is specified, considering all with accuracy for classification '
'first one will be used for classification', default=[''])
groupClass.add_argument('--CL_GS_iter', metavar='INT', action='store',
help='Determine how many Randomized grid search tests to do', type=int, default=30)
groupClass.add_argument('--CL_NoGS', action='store_false',
......@@ -202,11 +202,12 @@ if nbCores>1:
NB_VIEW = DATASET.get("Metadata").attrs["nbView"]
views = [str(DATASET.get("View"+str(viewIndex)).attrs["name"]) for viewIndex in range(NB_VIEW)]
NB_CLASS = DATASET.get("Metadata").attrs["nbClass"]
metrics = args.CL_metrics.split(":")
if metrics == [""]:
metrics = [["accuracy_score", None]]
metric = metrics[0]
metrics = [metric.split(":") for metric in args.CL_metrics]
if metrics == [[""]]:
metricsNames = [name for _, name, isPackage
in pkgutil.iter_modules(['Metrics']) if not isPackage]
metrics = [[metricName, None] for metricName in metricsNames]
print metrics
logging.info("Start:\t Finding all available mono- & multiview algorithms")
benchmark = {"Monoview":{}, "Multiview":[]}
......@@ -256,25 +257,24 @@ fusionMethodConfig = [["0.25", "0.25", "0.25", "0.25"], "b"]
mumboClassifierConfig = "a"
mumboclassifierNames = "a"
RandomForestKWARGS = {"0":map(int, args.CL_RF_trees.split())[0], "1":map(int, args.CL_RF_max_depth.split(":"))[0]}
SVMLinearKWARGS = {"0":map(int, args.CL_SVML_C.split(":"))[0]}
SVMRBFKWARGS = {"0":map(int, args.CL_SVMR_C.split(":"))[0]}
SVMPolyKWARGS = {"0":map(int, args.CL_SVMP_C.split(":"))[0], '1':map(int, args.CL_SVMP_deg.split(":"))[0]}
DecisionTreeKWARGS = {"0":map(int, args.CL_DT_depth.split(":"))[0]}
SGDKWARGS = {"2": map(float, args.CL_SGD_alpha.split(":"))[0], "1": args.CL_SGD_penalty.split(":")[0],
RandomForestKWARGSInit = {"0":map(int, args.CL_RF_trees.split())[0], "1":map(int, args.CL_RF_max_depth.split(":"))[0]}
SVMLinearKWARGSInit = {"0":map(int, args.CL_SVML_C.split(":"))[0]}
SVMRBFKWARGSInit = {"0":map(int, args.CL_SVMR_C.split(":"))[0]}
SVMPolyKWARGSInit = {"0":map(int, args.CL_SVMP_C.split(":"))[0], '1':map(int, args.CL_SVMP_deg.split(":"))[0]}
DecisionTreeKWARGSInit = {"0":map(int, args.CL_DT_depth.split(":"))[0]}
SGDKWARGSInit = {"2": map(float, args.CL_SGD_alpha.split(":"))[0], "1": args.CL_SGD_penalty.split(":")[0],
"0":args.CL_SGD_loss.split(":")[0]}
KNNKWARGS = {"0": map(float, args.CL_KNN_neigh.split(":"))[0]}
AdaboostKWARGS = {"0": args.CL_Ada_n_est.split(":")[0], "1": args.CL_Ada_b_est.split(":")[0]}
KNNKWARGSInit = {"0": map(float, args.CL_KNN_neigh.split(":"))[0]}
AdaboostKWARGSInit = {"0": args.CL_Ada_n_est.split(":")[0], "1": args.CL_Ada_b_est.split(":")[0]}
dataBaseTime = time.time()-start
argumentDictionaries = {"Monoview": {}, "Multiview": []}
print benchmark
try:
if benchmark["Monoview"]:
argumentDictionaries["Monoview"] = []
for view in views:
for classifier in benchmark["Monoview"]:
arguments = {"args":{classifier+"KWARGS": globals()[classifier+"KWARGS"], "feat":view, "fileFeat": args.fileFeat,
arguments = {"args":{classifier+"KWARGS": globals()[classifier+"KWARGSInit"], "feat":view, "fileFeat": args.fileFeat,
"fileCL": args.fileCL, "fileCLD": args.fileCLD, "CL_type": classifier}, "viewIndex":views.index(view)}
argumentDictionaries["Monoview"].append(arguments)
except:
......@@ -287,7 +287,7 @@ if nbCores>1:
for stepIndex in range(int(math.ceil(float(nbExperiments)/nbCores))):
resultsMonoview+=(Parallel(n_jobs=nbCores)(
delayed(ExecMonoview_multicore)(args.name, args.CL_split, args.CL_nbFolds, coreIndex, args.type, args.pathF, gridSearch=gridSearch,
metric=metric, nIter=args.CL_GS_iter, **argumentDictionaries["Monoview"][coreIndex+stepIndex*nbCores])
metrics=metrics, nIter=args.CL_GS_iter, **argumentDictionaries["Monoview"][coreIndex + stepIndex * nbCores])
for coreIndex in range(min(nbCores, nbExperiments - (stepIndex + 1) * nbCores))))
accuracies = [[result[1][1] for result in resultsMonoview if result[0]==viewIndex] for viewIndex in range(NB_VIEW)]
classifiersNames = [[result[1][0] for result in resultsMonoview if result[0]==viewIndex] for viewIndex in range(NB_VIEW)]
......@@ -300,7 +300,7 @@ else:
resultsMonoview+=([ExecMonoview(DATASET.get("View"+str(arguments["viewIndex"])),
DATASET.get("labels").value, args.name,
args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF,
gridSearch=gridSearch, metric=metric, nIter=args.CL_GS_iter,
gridSearch=gridSearch, metrics=metrics, nIter=args.CL_GS_iter,
**arguments)
for arguments in argumentDictionaries["Monoview"]])
......@@ -357,7 +357,7 @@ try:
"LABELS_NAMES": args.CL_classes.split(":"),
"FusionKWARGS": {"fusionType":"EarlyFusion", "fusionMethod":method,
"classifiersNames": [classifier],
"classifiersConfigs": [globals()[classifier+"KWARGS"]],
"classifiersConfigs": [globals()[classifier+"KWARGSInit"]],
'fusionMethodConfig': fusionMethodConfig}}
argumentDictionaries["Multiview"].append(arguments)
except:
......
......@@ -27,7 +27,7 @@ def score(y_true, y_pred, **kwargs):
try:
average = kwargs["3"]
except:
average = "binary"
average = "micro"
score = metric(y_true, y_pred, sample_weight=sample_weight, labels=labels, pos_label=pos_label, average=average)
return score
......@@ -48,7 +48,7 @@ def get_scorer(**kwargs):
try:
average = kwargs["3"]
except:
average = "binary"
average = "micro"
return make_scorer(metric, greater_is_better=True, sample_weight=sample_weight, labels=labels,
pos_label=pos_label, average=average)
......@@ -69,7 +69,7 @@ def getConfig(**kwargs):
try:
average = kwargs["3"]
except:
average = "binary"
average = "micro"
configString = "F1 score using "+str(sample_weight)+" as sample_weights, "+str(labels)+" as labels, "+str(pos_label)\
+" as pos_label, "+average+" as average (higher is better)"
return configString
\ No newline at end of file
......@@ -26,7 +26,7 @@ def score(y_true, y_pred, **kwargs):
try:
average = kwargs["4"]
except:
average = "binary"
average = "micro"
score = metric(y_true, y_pred, beta, sample_weight=sample_weight, labels=labels, pos_label=pos_label, average=average)
return score
......@@ -51,7 +51,7 @@ def get_scorer(**kwargs):
try:
average = kwargs["4"]
except:
average = "binary"
average = "micro"
return make_scorer(metric, greater_is_better=True, beta=beta, sample_weight=sample_weight, labels=labels,
pos_label=pos_label, average=average)
......@@ -76,7 +76,7 @@ def getConfig(**kwargs):
try:
average = kwargs["3"]
except:
average = "binary"
average = "micro"
configString = "F-beta score using "+str(sample_weight)+" as sample_weights, "+str(labels)+" as labels, "+str(pos_label) \
+" as pos_label, "+average+" as average, "+str(beta)+" as beta (higher is better)"
return configString
\ No newline at end of file
......@@ -14,7 +14,7 @@ def score(y_true, y_pred, **kwargs):
try:
average = kwargs["1"]
except:
average = "binary"
average = "micro"
score = metric(y_true, y_pred, sample_weight=sample_weight, average=average)
return score
......@@ -27,7 +27,7 @@ def get_scorer(**kwargs):
try:
average = kwargs["1"]
except:
average = "binary"
average = "micro"
return make_scorer(metric, greater_is_better=True, sample_weight=sample_weight, average=average)
......@@ -39,6 +39,6 @@ def getConfig(**kwargs):
try:
average = kwargs["3"]
except:
average = "binary"
average = "micro"
configString = "ROS AUC score using "+str(sample_weight)+" as sample_weights, "+average+" as average (higher is better)"
return configString
\ No newline at end of file
......@@ -33,7 +33,6 @@ def ExecMonoview_multicore(name, learningRate, nbFolds, datasetFileIndex, databa
metrics=[["accuracy_score", None]], nIter=30, **args):
DATASET = h5py.File(path+name+str(datasetFileIndex)+".hdf5", "r")
kwargs = args["args"]
views = [DATASET.get("View"+str(viewIndex)).attrs["name"] for viewIndex in range(DATASET.get("Metadata").attrs["nbView"])]
neededViewIndex = views.index(kwargs["feat"])
X = DATASET.get("View"+str(neededViewIndex))
......@@ -45,7 +44,6 @@ def ExecMonoview_multicore(name, learningRate, nbFolds, datasetFileIndex, databa
def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path, gridSearch=True,
metrics=[["accuracy_score", None]], nIter=30, **args):
try:
kwargs = args["args"]
except:
......@@ -189,7 +187,7 @@ if __name__=='__main__':
groupClass.add_argument('--CL_Cores', metavar='INT', action='store', help='Number of cores, -1 for all', type=int, default=1)
groupClass.add_argument('--CL_split', metavar='FLOAT', action='store', help='Split ratio for train and test', type=float, default=0.9)
groupClass.add_argument('--CL_metrics', metavar='STRING', action='store',
help='Determine which metric to use, separate with ":" if multiple, if empty, considering all', default='')
help='Determine which metrics to use, separate with ":" if multiple, if empty, considering all', default='')
groupClassifier = parser.add_argument_group('Classifier Config')
......
......@@ -38,6 +38,7 @@ def getMetricScore(metric, y_train, y_train_pred, y_test, y_test_pred):
def execute(name, learningRate, nbFolds, nbCores, gridSearch, metrics, nIter, feat, CL_type, clKWARGS, classLabelsNames,
shape, y_train, y_train_pred, y_test, y_test_pred, time):
print metrics
metricsScores = {}
metricModule = getattr(Metrics, metrics[0][0])
train = metricModule.score(y_train, y_train_pred)
......
2016-09-06 11:00:31,349 DEBUG: Start: Creating 2 temporary datasets for multiprocessing
2016-09-06 11:00:31,350 WARNING: WARNING : /!\ This may use a lot of HDD storage space : 0.273145851562 Gbytes /!\
2016-09-06 11:00:40,496 DEBUG: Start: Creating datasets for multiprocessing
2016-09-06 11:02:43,168 DEBUG: Start: Creating 2 temporary datasets for multiprocessing
2016-09-06 11:02:43,168 WARNING: WARNING : /!\ This may use a lot of HDD storage space : 0.273145851562 Gbytes /!\
2016-09-06 11:02:57,719 DEBUG: Start: Creating 2 temporary datasets for multiprocessing
2016-09-06 11:02:57,720 WARNING: WARNING : /!\ This may use a lot of HDD storage space : 0.273145851562 Gbytes /!\
2016-09-06 11:03:04,610 DEBUG: Start: Creating datasets for multiprocessing
2016-09-06 11:03:04,612 INFO: Start: Finding all available mono- & multiview algorithms
2016-09-06 11:03:05,483 DEBUG: ### Main Programm for Classification MonoView
2016-09-06 11:03:05,483 DEBUG: ### Classification - Database:MultiOmic Feature:Methyl train_size:0.7, CrossValidation k-folds:5, cores:1, algorithm : DecisionTree
2016-09-06 11:03:05,483 DEBUG: Start: Determine Train/Test split
2016-09-06 11:03:05,539 DEBUG: Info: Shape X_train:(242, 25978), Length of y_train:242
2016-09-06 11:03:05,539 DEBUG: Info: Shape X_test:(105, 25978), Length of y_test:105
2016-09-06 11:03:05,539 DEBUG: Done: Determine Train/Test split
2016-09-06 11:03:05,539 DEBUG: Start: RandomSearch best settings with 1 iterations
2016-09-06 11:03:06,099 DEBUG: ### Main Programm for Classification MonoView
2016-09-06 11:03:06,099 DEBUG: ### Classification - Database:MultiOmic Feature:Methyl train_size:0.7, CrossValidation k-folds:5, cores:1, algorithm : Adaboost
2016-09-06 11:03:06,099 DEBUG: Start: Determine Train/Test split
2016-09-06 11:03:06,130 DEBUG: Info: Shape X_train:(242, 25978), Length of y_train:242
2016-09-06 11:03:06,130 DEBUG: Info: Shape X_test:(105, 25978), Length of y_test:105
2016-09-06 11:03:06,130 DEBUG: Done: Determine Train/Test split
2016-09-06 11:03:06,131 DEBUG: Start: RandomSearch best settings with 1 iterations
Classification on MultiOmic database for Methyl with DecisionTree
accuracy_score on train : 0.971074380165
accuracy_score on test : 0.828571428571
Database configuration :
- Database name : MultiOmic
- View name : Methyl View shape : (347, 25978)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- Decision Tree with max_depth : 3
- Executed on 1 core(s)
- Got configuration using randomized search with 1 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 0.971074380165
- Score on test : 0.828571428571
Classification took 0:00:09
\ No newline at end of file
Classification on MultiOmic database for Methyl with Adaboost
accuracy_score on train : 1.0
accuracy_score on test : 0.819047619048
Database configuration :
- Database name : MultiOmic
- View name : Methyl View shape : (347, 25978)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- Adaboost with num_esimators : 3, base_estimators : DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best')
- Executed on 1 core(s)
- Got configuration using randomized search with 1 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 1.0
- Score on test : 0.819047619048
Classification took 0:00:11
\ No newline at end of file
Classification on MultiOmic database for Methyl with RandomForest
accuracy_score on train : 0.921487603306
accuracy_score on test : 0.847619047619
Database configuration :
- Database name : MultiOmic
- View name : Methyl View shape : (347, 25978)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- Random Forest with num_esimators : 3, max_depth : 3
- Executed on 1 core(s)
- Got configuration using randomized search with 1 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 0.921487603306
- Score on test : 0.847619047619
Classification took 0:00:00
\ No newline at end of file
Classification on MultiOmic database for Methyl with KNN
accuracy_score on train : 0.892561983471
accuracy_score on test : 0.87619047619
Database configuration :
- Database name : MultiOmic
- View name : Methyl View shape : (347, 25978)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- K nearest Neighbors with n_neighbors: 35
- Executed on 1 core(s)
- Got configuration using randomized search with 1 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 0.892561983471
- Score on test : 0.87619047619
Classification took 0:00:02
\ No newline at end of file
Classification on MultiOmic database for Methyl with SGD
accuracy_score on train : 0.735537190083
accuracy_score on test : 0.72380952381
Database configuration :
- Database name : MultiOmic
- View name : Methyl View shape : (347, 25978)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- SGDClassifier with loss : log, penalty : l1
- Executed on 1 core(s)
- Got configuration using randomized search with 1 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 0.735537190083
- Score on test : 0.72380952381
Classification took 0:00:03
\ No newline at end of file
Classification on MultiOmic database for Methyl with SVMLinear
accuracy_score on train : 1.0
accuracy_score on test : 0.895238095238
Database configuration :
- Database name : MultiOmic
- View name : Methyl View shape : (347, 25978)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- SVM Linear with C : 5123
- Executed on 1 core(s)
- Got configuration using randomized search with 1 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 1.0
- Score on test : 0.895238095238
Classification took 0:00:09
\ No newline at end of file
Classification on MultiOmic database for MiRNA_ with DecisionTree
accuracy_score on train : 0.938016528926
accuracy_score on test : 0.819047619048
Database configuration :
- Database name : MultiOmic
- View name : MiRNA_ View shape : (347, 1046)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- Decision Tree with max_depth : 3
- Executed on 1 core(s)
- Got configuration using randomized search with 1 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 0.938016528926
- Score on test : 0.819047619048
Classification took 0:00:00
\ No newline at end of file
Classification on MultiOmic database for Methyl with SVMPoly
accuracy_score on train : 0.735537190083
accuracy_score on test : 0.72380952381
Database configuration :
- Database name : MultiOmic
- View name : Methyl View shape : (347, 25978)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- SVM Linear with C : 5123
- Executed on 1 core(s)
- Got configuration using randomized search with 1 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 0.735537190083
- Score on test : 0.72380952381
Classification took 0:00:11
\ No newline at end of file
Classification on MultiOmic database for Methyl with SVMRBF
accuracy_score on train : 1.0
accuracy_score on test : 0.895238095238
Database configuration :
- Database name : MultiOmic
- View name : Methyl View shape : (347, 25978)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- SVM Linear with C : 5123
- Executed on 1 core(s)
- Got configuration using randomized search with 1 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 1.0
- Score on test : 0.895238095238
Classification took 0:00:11
\ No newline at end of file
Classification on MultiOmic database for MiRNA_ with Adaboost
accuracy_score on train : 1.0
accuracy_score on test : 0.8
Database configuration :
- Database name : MultiOmic
- View name : MiRNA_ View shape : (347, 1046)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- Adaboost with num_esimators : 3, base_estimators : DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best')
- Executed on 1 core(s)
- Got configuration using randomized search with 1 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 1.0
- Score on test : 0.8
Classification took 0:00:00
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment