Commit 9e9e2f25 authored by bbauvin

Added multicore computation by multiplying hdf5 datasets

parent 9019e33a
Showing changes with 941 additions and 37 deletions
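Context for the change: an open h5py file handle cannot be shared safely across the worker processes that joblib spawns, so the commit sidesteps the problem by giving every core its own physical copy of the dataset file. A minimal sketch of that pattern as a hypothetical standalone example (the names make_copies/work and the "Data/" path are illustrative, not the project's API):

    import shutil
    import h5py
    from joblib import Parallel, delayed

    def make_copies(path, name, nb_cores):
        # One copy of <name>.hdf5 per core: <name>0.hdf5, <name>1.hdf5, ...
        for core_index in range(nb_cores):
            shutil.copy(path + name + ".hdf5", path + name + str(core_index) + ".hdf5")

    def work(path, name, core_index):
        # Each worker process opens only its own copy, read-only.
        with h5py.File(path + name + str(core_index) + ".hdf5", "r") as dataset:
            return dataset.get("Metadata").attrs["datasetLength"]

    make_copies("Data/", "MultiOmic", 2)
    lengths = Parallel(n_jobs=2)(delayed(work)("Data/", "MultiOmic", i) for i in range(2))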
@@ -12,11 +12,13 @@
 import numpy as np
 import logging
 import matplotlib
 matplotlib.use('Agg')
+import math
+import time
 # Import own modules
 import Multiview
-from Multiview.ExecMultiview import ExecMultiview
-from Monoview.ExecClassifMonoView import ExecMonoview
+from Multiview.ExecMultiview import ExecMultiview, ExecMultiview_multicore
+from Monoview.ExecClassifMonoView import ExecMonoview, ExecMonoview_multicore
 import Multiview.GetMultiviewDb as DB
 import Monoview
 from ResultAnalysis import resultAnalysis
@@ -160,6 +162,8 @@
 os.nice(args.nice)
 nbCores = args.CL_cores
 if args.name not in ["MultiOmic", "ModifiedMultiOmic", "Caltech"]:
     getDatabase = getattr(DB, "getClassicDB" + args.type[1:])
+else:
+    getDatabase = getattr(DB, "get" + args.name + "DB" + args.type[1:])
 try:
     gridSearch = args.CL_NoGS
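How the dynamic lookup resolves, assuming args.type carries a leading separator that the [1:] slice strips (e.g. ".hdf5"; illustrative values, not taken from a real run):

    args_name, args_type = "MultiOmic", ".hdf5"
    loader_name = "get" + args_name + "DB" + args_type[1:]   # "getMultiOmicDBhdf5"
    getDatabase = getattr(DB, loader_name)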
@@ -183,9 +187,17 @@
 logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', filename=lo...
 if args.log:
     logging.getLogger().addHandler(logging.StreamHandler())
-getDatabase = getattr(DB, "get" + args.name + "DB" + args.type[1:])
 DATASET, LABELS_DICTIONARY = getDatabase(args.views.split(":"), args.pathF, args.name, len(args.CL_classes), args.CL_classes)
 datasetLength = DATASET.get("Metadata").attrs["datasetLength"]
+if nbCores > 1:
+    logging.debug("Start:\t Creating "+str(nbCores)+" temporary datasets for multiprocessing")
+    logging.warning("WARNING : /!\ This may use a lot of HDD storage space : "+
+                    str(os.path.getsize(args.pathF+args.name+".hdf5")*nbCores/float(1024)/1000/1000)+" Gbytes /!\ ")
+    time.sleep(5)
+    DB.copyHDF5(args.pathF, args.name, nbCores)
+    logging.debug("Done:\t Creating datasets for multiprocessing")
 NB_VIEW = DATASET.get("Metadata").attrs["nbView"]
 views = [str(DATASET.get("View"+str(viewIndex)).attrs["name"]) for viewIndex in range(NB_VIEW)]
 NB_CLASS = DATASET.get("Metadata").attrs["nbClass"]
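The storage warning above estimates the footprint as the source file size times the number of copies, dividing once by 1024 and twice by 1000; a sketch of the same estimate with a single consistent unit (hypothetical helper, standard library only):

    import os

    def estimated_copies_size_gb(dataset_path, nb_cores):
        # Total space taken by nb_cores full copies, in gigabytes (1e9 bytes).
        return os.path.getsize(dataset_path) * nb_cores / 1e9

    # e.g. a 2 GB source file copied for 4 cores needs ~8 GB of free space.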
@@ -257,31 +269,50 @@
 AdaboostKWARGS = {"0": args.CL_Ada_n_est.split(":")[0], "1": args.CL_Ada_b_est.s...
 argumentDictionaries = {"Monoview": {}, "Multiview": []}
 try:
     if benchmark["Monoview"]:
+        argumentDictionaries["Monoview"] = []
         for view in views:
-            argumentDictionaries["Monoview"][str(view)] = []
             for classifier in benchmark["Monoview"]:
-                arguments = {classifier+"KWARGS": globals()[classifier+"KWARGS"], "feat":view, "fileFeat": args.fileFeat,
-                             "fileCL": args.fileCL, "fileCLD": args.fileCLD, "CL_type": classifier}
-                argumentDictionaries["Monoview"][str(view)].append(arguments)
+                arguments = {"args": {classifier+"KWARGS": globals()[classifier+"KWARGS"], "feat": view,
+                                      "fileFeat": args.fileFeat, "fileCL": args.fileCL,
+                                      "fileCLD": args.fileCLD, "CL_type": classifier},
+                             "viewIndex": views.index(view)}
+                argumentDictionaries["Monoview"].append(arguments)
 except:
     pass
 bestClassifiers = []
 bestClassifiersConfigs = []
 resultsMonoview = []
-for viewIndex, viewArguments in enumerate(argumentDictionaries["Monoview"].values()):
-    resultsMonoview.append((Parallel(n_jobs=nbCores)(
-        delayed(ExecMonoview)(DATASET.get("View"+str(viewIndex)), DATASET.get("labels").value, args.name,
-                              args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF, gridSearch=gridSearch,
-                              metric=metric, nIter=args.CL_GS_iter, **arguments)
-        for arguments in viewArguments)))
-    accuracies = [result[1] for result in resultsMonoview[viewIndex]]
-    classifiersNames = [result[0] for result in resultsMonoview[viewIndex]]
-    classifiersConfigs = [result[2] for result in resultsMonoview[viewIndex]]
-    bestClassifiers.append(classifiersNames[np.argmax(np.array(accuracies))])
-    bestClassifiersConfigs.append(classifiersConfigs[np.argmax(np.array(accuracies))])
+if nbCores > 1:
+    # Launch the experiments nbCores at a time; worker coreIndex reopens its own
+    # dataset copy inside ExecMonoview_multicore, so no HDF5 handle is shared.
+    nbExperiments = len(argumentDictionaries["Monoview"])
+    for stepIndex in range(int(math.ceil(float(nbExperiments)/nbCores))):
+        resultsMonoview += (Parallel(n_jobs=nbCores)(
+            delayed(ExecMonoview_multicore)(args.name, args.CL_split, args.CL_nbFolds, coreIndex, args.type,
+                                            args.pathF, gridSearch=gridSearch, metric=metric,
+                                            nIter=args.CL_GS_iter,
+                                            **argumentDictionaries["Monoview"][coreIndex+stepIndex*nbCores])
+            for coreIndex in range(min(nbCores, nbExperiments - stepIndex*nbCores))))
+else:
+    # Keep the same (viewIndex, result) pairs as the multicore branch returns.
+    resultsMonoview = [(arguments["viewIndex"],
+                        ExecMonoview(DATASET.get("View"+str(arguments["viewIndex"])),
+                                     DATASET.get("labels").value, args.name,
+                                     args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF,
+                                     gridSearch=gridSearch, metric=metric, nIter=args.CL_GS_iter,
+                                     **arguments["args"]))
+                       for arguments in argumentDictionaries["Monoview"]]
+accuracies = [[result[1][1] for result in resultsMonoview if result[0]==viewIndex] for viewIndex in range(NB_VIEW)]
+classifiersNames = [[result[1][0] for result in resultsMonoview if result[0]==viewIndex] for viewIndex in range(NB_VIEW)]
+classifiersConfigs = [[result[1][2] for result in resultsMonoview if result[0]==viewIndex] for viewIndex in range(NB_VIEW)]
+for viewIndex, view in enumerate(views):
+    bestClassifiers.append(classifiersNames[viewIndex][np.argmax(np.array(accuracies[viewIndex]))])
+    bestClassifiersConfigs.append(classifiersConfigs[viewIndex][np.argmax(np.array(accuracies[viewIndex]))])
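The dispatch above launches the experiments in batches of at most nbCores jobs, with ceil(nbExperiments/nbCores) Parallel() calls in total and the flat index coreIndex + stepIndex*nbCores selecting each experiment. A minimal sketch of the batching rule (hypothetical helper, standard library only):

    import math

    def batch_sizes(nb_experiments, nb_cores):
        nb_steps = int(math.ceil(float(nb_experiments) / nb_cores))
        return [min(nb_cores, nb_experiments - step * nb_cores) for step in range(nb_steps)]

    # batch_sizes(5, 2) == [2, 2, 1]: three rounds on two cores, and the flat
    # indices coreIndex + stepIndex*nbCores cover experiments 0..4 exactly once.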
 try:
     if benchmark["Multiview"]:
         try:
@@ -337,13 +368,24 @@
             pass
     except:
         pass
-resultsMultiview = Parallel(n_jobs=nbCores)(
-    delayed(ExecMultiview)(DATASET, args.name, args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF,
-                           LABELS_DICTIONARY, gridSearch=gridSearch,
-                           metrics=metrics, **arguments)
-    for arguments in argumentDictionaries["Multiview"])
+if nbCores > 1:
+    resultsMultiview = []
+    nbExperiments = len(argumentDictionaries["Multiview"])
+    for stepIndex in range(int(math.ceil(float(nbExperiments)/nbCores))):
+        resultsMultiview += Parallel(n_jobs=nbCores)(
+            delayed(ExecMultiview_multicore)(coreIndex, args.name, args.CL_split, args.CL_nbFolds, args.type,
+                                             args.pathF, LABELS_DICTIONARY, gridSearch=gridSearch, metrics=metrics,
+                                             **argumentDictionaries["Multiview"][stepIndex*nbCores+coreIndex])
+            for coreIndex in range(min(nbCores, nbExperiments - stepIndex*nbCores)))
+else:
+    resultsMultiview = [ExecMultiview(DATASET, args.name, args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF,
+                                      LABELS_DICTIONARY, gridSearch=gridSearch,
+                                      metrics=metrics, **arguments) for arguments in argumentDictionaries["Multiview"]]
+if nbCores > 1:
+    logging.debug("Start:\t Deleting "+str(nbCores)+" temporary datasets for multiprocessing")
+    DB.deleteHDF5(args.pathF, args.name, nbCores)
+    logging.debug("Done:\t Deleting datasets for multiprocessing")
 results = (resultsMonoview, resultsMultiview)
 resultAnalysis(benchmark, results)
...
@@ -29,7 +29,18 @@
 __date__ = 2016-03-25
-### Argument Parser
+def ExecMonoview_multicore(name, learningRate, nbFolds, datasetFileIndex, databaseType, path, gridSearch=True,
+                           metrics=[["accuracy_score", None]], nIter=30, **args):
+    # Reopen this worker's private copy of the dataset and resolve the view by name.
+    DATASET = h5py.File(path+name+str(datasetFileIndex)+".hdf5", "r")
+    kwargs = args["args"]
+    views = [DATASET.get("View"+str(viewIndex)).attrs["name"] for viewIndex in range(DATASET.get("Metadata").attrs["nbView"])]
+    neededViewIndex = views.index(kwargs["feat"])
+    X = DATASET.get("View"+str(neededViewIndex))
+    Y = DATASET.get("labels").value
+    returnedViewIndex = args["viewIndex"]
+    return returnedViewIndex, ExecMonoview(X, Y, name, learningRate, nbFolds, 1, databaseType, path,
+                                           gridSearch=gridSearch, metrics=metrics, nIter=nIter, **kwargs)
 def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path, gridSearch=True,
@@ -99,7 +110,6 @@
     labelsString = "-".join(classLabelsNames)
     timestr = time.strftime("%Y%m%d-%H%M%S")
     CL_type_string = CL_type
-    print CL_type_string
     outputFileName = "Results/" + timestr + "Results-" + CL_type_string + "-" + labelsString + \
                      '-learnRate' + str(learningRate) + '-' + name
...
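Since ExecMonoview_multicore returns (viewIndex, monoviewResult) pairs, the caller can regroup results per view regardless of the order Parallel() delivers them in; a sketch of the regrouping the main script performs with its comprehensions (hypothetical helper):

    def group_by_view(results_monoview, nb_view):
        # results_monoview: list of (viewIndex, resultTuple) pairs
        return [[res for idx, res in results_monoview if idx == view_index]
                for view_index in range(nb_view)]

    grouped = group_by_view([(1, "resB"), (0, "resA")], 2)   # [["resA"], ["resB"]]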
@@ -13,6 +13,7 @@
 import datetime
 import os
 import logging
 import time
+import h5py

 # Author-Info
@@ -21,6 +22,13 @@
+def ExecMultiview_multicore(coreIndex, name, learningRate, nbFolds, databaseType, path, LABELS_DICTIONARY,
+                            gridSearch=False, nbCores=1, metrics=None, nIter=30, **arguments):
+    # Reopen this worker's private dataset copy, then run the single-core path on it.
+    DATASET = h5py.File(path+name+str(coreIndex)+".hdf5", "r")
+    return ExecMultiview(DATASET, name, learningRate, nbFolds, 1, databaseType, path, LABELS_DICTIONARY,
+                         gridSearch=gridSearch, metrics=metrics, nIter=nIter, **arguments)
 def ExecMultiview(DATASET, name, learningRate, nbFolds, nbCores, databaseType, path, LABELS_DICTIONARY,
                   gridSearch=False, metrics=None, nIter=30, **kwargs):
...
@@ -355,6 +355,19 @@
     return datasetFile, labelDictionary

+def copyHDF5(pathF, name, nbCores):
+    datasetFile = h5py.File(pathF+name+".hdf5", "r")
+    for coreIndex in range(nbCores):
+        newDataSet = h5py.File(pathF+name+str(coreIndex)+".hdf5", "w")
+        for dataset in datasetFile:
+            datasetFile.copy("/"+dataset, newDataSet["/"])
+        newDataSet.close()
+
+def deleteHDF5(pathF, name, nbCores):
+    for coreIndex in range(nbCores):
+        os.remove(pathF+name+str(coreIndex)+".hdf5")
+
 # def getOneViewFromDB(viewName, pathToDB, DBName):
 #     view = np.genfromtxt(pathToDB + DBName +"-" + viewName, delimiter=';')
 #     return view
...
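A sketch of the copy/compute/delete lifecycle these two helpers enable (hypothetical driver code; "Data/" and "MultiOmic" are placeholder arguments):

    import Multiview.GetMultiviewDb as DB

    nbCores = 2
    DB.copyHDF5("Data/", "MultiOmic", nbCores)        # writes MultiOmic0.hdf5, MultiOmic1.hdf5
    try:
        pass  # run the Parallel() batches; worker i opens MultiOmic<i>.hdf5 itself
    finally:
        DB.deleteHDF5("Data/", "MultiOmic", nbCores)  # reclaim the temporary space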
@@ -14,14 +14,10 @@
 def resultAnalysis(benchmark, results):
     mono, multi = results
-    flattenedMono = []
-    for view in mono:
-        for res in view:
-            flattenedMono.append(res)
-    names = [res[0]+res[3] for res in flattenedMono]
+    names = [res[1][0]+res[1][3] for res in mono]
     names += [type_ if type_ != "Fusion" else type_+a["fusionType"]+a["fusionMethod"] for type_, a, b, c, d in multi]
-    nbResults = len(flattenedMono)+len(multi)
+    nbResults = len(mono)+len(multi)
-    accuracies = [float(res[1]) for res in flattenedMono]
+    accuracies = [float(res[1][1]) for res in mono]
     accuracies += [float(accuracy) for a, b, c, d, accuracy in multi]
     f = pylab.figure(figsize=(40, 30))
     fig = plt.gcf()
...
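With the reshaped mono results, each entry is a (viewIndex, resultTuple) pair, so the plot labels and scores are read from resultTuple directly; a sketch with a fabricated result (field contents are illustrative, only the positions match the code above):

    mono = [(0, ("DecisionTree", 0.82, {"max_depth": 3}, "-MiRNA_"))]
    names = [res[1][0] + res[1][3] for res in mono]    # ["DecisionTree-MiRNA_"]
    accuracies = [float(res[1][1]) for res in mono]    # [0.82]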
2016-09-02 15:01:57,259 DEBUG: Start: Creating 2 temporary datasets for multiprocessing
2016-09-02 15:01:57,259 WARNING: This may use a lot of HDD storage space : 0 Gbytes
2016-09-02 15:02:00,228 DEBUG: Start: Creating datasets for multiprocessing
2016-09-02 15:02:00,231 INFO: Start: Finding all available mono- & multiview algorithms
2016-09-02 15:02:35,409 DEBUG: Start: Creating 2 temporary datasets for multiprocessing
2016-09-02 15:02:35,409 WARNING: This may use a lot of HDD storage space : 0 Gbytes
2016-09-02 15:02:37,811 DEBUG: Start: Creating datasets for multiprocessing
2016-09-02 15:02:37,814 INFO: Start: Finding all available mono- & multiview algorithms
2016-09-02 15:02:54,797 DEBUG: Start: Creating 2 temporary datasets for multiprocessing
2016-09-02 15:02:54,797 WARNING: This may use a lot of HDD storage space : 0 Gbytes
2016-09-02 15:02:56,866 DEBUG: Start: Creating datasets for multiprocessing
2016-09-02 15:02:57,786 INFO: Start: Finding all available mono- & multiview algorithms
2016-09-02 15:04:32,394 DEBUG: Start: Creating 2 temporary datasets for multiprocessing
2016-09-02 15:04:32,394 WARNING: This may use a lot of HDD storage space : 0 Gbytes
2016-09-02 15:04:34,167 DEBUG: Start: Creating datasets for multiprocessing
2016-09-02 15:04:34,169 INFO: Start: Finding all available mono- & multiview algorithms
2016-09-02 15:05:40,604 DEBUG: Start: Creating 2 temporary datasets for multiprocessing
2016-09-02 15:05:40,604 WARNING: This may use a lot of HDD storage space : 0 Gbytes
2016-09-02 15:05:42,136 DEBUG: Start: Creating datasets for multiprocessing
2016-09-02 15:05:42,138 INFO: Start: Finding all available mono- & multiview algorithms
Classification on MultiOmic database for MiRNA_ with Adaboost
accuracy_score on train : 1.0
accuracy_score on test : 0.819047619048
Database configuration :
- Database name : MultiOmic
- View name : MiRNA_ View shape : (347, 1046)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- Adaboost with num_estimators : 8, base_estimator : DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best')
- Executed on 1 core(s)
- Got configuration using randomized search with 10 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 1.0
- Score on test : 0.819047619048
Classification took 0:00:02
\ No newline at end of file
Classification on MultiOmic database for MiRNA_ with DecisionTree
accuracy_score on train : 0.933884297521
accuracy_score on test : 0.819047619048
Database configuration :
- Database name : MultiOmic
- View name : MiRNA_ View shape : (347, 1046)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- Decision Tree with max_depth : 3
- Executed on 1 core(s)
- Got configuration using randomized search with 10 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 0.933884297521
- Score on test : 0.819047619048
Classification took 0:00:01
\ No newline at end of file
Classification on MultiOmic database for MiRNA_ with KNN
accuracy_score on train : 0.793388429752
accuracy_score on test : 0.8
Database configuration :
- Database name : MultiOmic
- View name : MiRNA_ View shape : (347, 1046)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- K nearest Neighbors with n_neighbors: 17
- Executed on 1 core(s)
- Got configuration using randomized search with 10 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 0.793388429752
- Score on test : 0.8
Classification took 0:00:01
\ No newline at end of file
Classification on MultiOmic database for MiRNA_ with RandomForest
accuracy_score on train : 0.995867768595
accuracy_score on test : 0.87619047619
Database configuration :
- Database name : MultiOmic
- View name : MiRNA_ View shape : (347, 1046)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- Random Forest with num_estimators : 15, max_depth : 24
- Executed on 1 core(s)
- Got configuration using randomized search with 10 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 0.995867768595
- Score on test : 0.87619047619
Classification took 0:00:02
\ No newline at end of file
Classification on MultiOmic database for MiRNA_ with SGD
accuracy_score on train : 0.797520661157
accuracy_score on test : 0.895238095238
Database configuration :
- Database name : MultiOmic
- View name : MiRNA_ View shape : (347, 1046)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- SGDClassifier with loss : log, penalty : l1
- Executed on 1 core(s)
- Got configuration using randomized search with 10 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 0.797520661157
- Score on test : 0.895238095238
Classification took 0:00:01
\ No newline at end of file
Classification on MultiOmic database for MiRNA_ with SVMLinear
accuracy_score on train : 0.570247933884
accuracy_score on test : 0.561904761905
Database configuration :
- Database name : MultiOmic
- View name : MiRNA_ View shape : (347, 1046)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- SVM Linear with C : 4921
- Executed on 1 core(s)
- Got configuration using randomized search with 10 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 0.570247933884
- Score on test : 0.561904761905
Classification took 0:00:02
\ No newline at end of file
Classification on MultiOmic database for MiRNA_ with SVMPoly
accuracy_score on train : 0.289256198347
accuracy_score on test : 0.219047619048
Database configuration :
- Database name : MultiOmic
- View name : MiRNA_ View shape : (347, 1046)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- SVM Linear with C : 4921
- Executed on 1 core(s)
- Got configuration using randomized search with 10 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 0.289256198347
- Score on test : 0.219047619048
Classification took 0:00:00
\ No newline at end of file
Classification on MultiOmic database for Clinic with Adaboost
accuracy_score on train : 1.0
accuracy_score on test : 0.733333333333
Database configuration :
- Database name : MultiOmic
- View name : Clinic View shape : (347, 127)
- Learning Rate : 0.7
- Labels used : Non, Oui
- Number of cross validation folds : 5
Classifier configuration :
- Adaboost with num_estimators : 1, base_estimator : DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best')
- Executed on 1 core(s)
- Got configuration using randomized search with 10 iterations
For Accuracy score using None as sample_weights (higher is better) :
- Score on train : 1.0
- Score on test : 0.733333333333
Classification took 0:00:00
\ No newline at end of file