Skip to content
Snippets Groups Projects
Commit 132f8710 authored by bbauvin's avatar bbauvin
Browse files

Added an option to repeat a training/classification several times and average the results

parent 1e9c7274
No related branches found
No related tags found
No related merge requests found
......@@ -79,6 +79,8 @@ groupClass.add_argument('--CL_algos_multiview', metavar='STRING', action='store'
help='Determine which multiview classifier to use, separate with : if multiple, if empty, considering all', default='')
groupClass.add_argument('--CL_cores', metavar='INT', action='store', help='Number of cores, -1 for all', type=int,
default=1)
groupClass.add_argument('--CL_statsiter', metavar='INT', action='store', help='Number of iteration for each algorithm to mean results', type=int,
default=1)
groupClass.add_argument('--CL_metrics', metavar='STRING', action='store', nargs="+",
help='Determine which metrics to use, separate metric and configuration with ":". If multiple, separate with space. If no metric is specified, considering all with accuracy for classification '
'first one will be used for classification', default=[''])
......@@ -164,6 +166,7 @@ groupFusion.add_argument('--FU_cl_config', metavar='STRING', action='store', nar
args = parser.parse_args()
os.nice(args.nice)
nbCores = args.CL_cores
statsIter = args.CL_statsiter
start = time.time()
if args.name not in ["MultiOmic", "ModifiedMultiOmic", "Caltech", "Fake", "Plausible"]:
getDatabase = getattr(DB, "getClassicDB" + args.type[1:])
......@@ -321,7 +324,7 @@ if nbCores>1:
nbExperiments = len(argumentDictionaries["Monoview"])
for stepIndex in range(int(math.ceil(float(nbExperiments)/nbCores))):
resultsMonoview+=(Parallel(n_jobs=nbCores)(
delayed(ExecMonoview_multicore)(args.name, args.CL_split, args.CL_nbFolds, coreIndex, args.type, args.pathF, gridSearch=gridSearch,
delayed(ExecMonoview_multicore)(args.name, args.CL_split, args.CL_nbFolds, coreIndex, args.type, args.pathF, statsIter, gridSearch=gridSearch,
metrics=metrics, nIter=args.CL_GS_iter, **argumentDictionaries["Monoview"][coreIndex + stepIndex * nbCores])
for coreIndex in range(min(nbCores, nbExperiments - stepIndex * nbCores))))
accuracies = [[result[1][1] for result in resultsMonoview if result[0]==viewIndex] for viewIndex in range(NB_VIEW)]
......@@ -334,7 +337,7 @@ if nbCores>1:
else:
resultsMonoview+=([ExecMonoview(DATASET.get("View"+str(arguments["viewIndex"])),
DATASET.get("Labels").value, args.name,
args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF,
args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF, statsIter,
gridSearch=gridSearch, metrics=metrics, nIter=args.CL_GS_iter,
**arguments)
for arguments in argumentDictionaries["Monoview"]])
......@@ -415,12 +418,12 @@ if nbCores>1:
for stepIndex in range(int(math.ceil(float(nbExperiments)/nbCores))):
resultsMultiview += Parallel(n_jobs=nbCores)(
delayed(ExecMultiview_multicore)(coreIndex, args.name, args.CL_split, args.CL_nbFolds, args.type, args.pathF,
LABELS_DICTIONARY, gridSearch=gridSearch,
LABELS_DICTIONARY, statsIter, gridSearch=gridSearch,
metrics=metrics, nIter=args.CL_GS_iter, **argumentDictionaries["Multiview"][stepIndex*nbCores+coreIndex])
for coreIndex in range(min(nbCores, nbExperiments - stepIndex * nbCores)))
else:
resultsMultiview = [ExecMultiview(DATASET, args.name, args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF,
LABELS_DICTIONARY, gridSearch=gridSearch,
LABELS_DICTIONARY, statsIter, gridSearch=gridSearch,
metrics=metrics, nIter=args.CL_GS_iter, **arguments) for arguments in argumentDictionaries["Multiview"]]
multiviewTime = time.time()-monoviewTime-dataBaseTime-start
if nbCores>1:
......
......@@ -30,7 +30,7 @@ __status__ = "Prototype" # Production, Development, Prototype
__date__ = 2016-03-25
def ExecMonoview_multicore(name, learningRate, nbFolds, datasetFileIndex, databaseType, path, gridSearch=True,
def ExecMonoview_multicore(name, learningRate, nbFolds, datasetFileIndex, databaseType, path, statsIter, gridSearch=True,
metrics=[["accuracy_score", None]], nIter=30, **args):
DATASET = h5py.File(path+name+str(datasetFileIndex)+".hdf5", "r")
kwargs = args["args"]
......@@ -38,11 +38,11 @@ def ExecMonoview_multicore(name, learningRate, nbFolds, datasetFileIndex, databa
neededViewIndex = views.index(kwargs["feat"])
X = DATASET.get("View"+str(neededViewIndex))
Y = DATASET.get("Labels").value
return ExecMonoview(X, Y, name, learningRate, nbFolds, 1, databaseType, path, gridSearch=gridSearch,
return ExecMonoview(X, Y, name, learningRate, nbFolds, 1, databaseType, path, statsIter, gridSearch=gridSearch,
metrics=metrics, nIter=nIter, **args)
def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path, gridSearch=True,
def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path, statsIter, gridSearch=True,
metrics=[["accuracy_score", None]], nIter=30, **args):
logging.debug("Start:\t Loading data")
try:
......@@ -64,7 +64,11 @@ def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path,
# Determine the Database to extract features
logging.debug("Info:\t Classification - Database:" + str(name) + " Feature:" + str(feat) + " train_size:" + str(learningRate) + ", CrossValidation k-folds:" + str(nbFolds) + ", cores:" + str(nbCores)+", algorithm : "+CL_type)
y_trains = []
y_tests = []
y_train_preds = []
y_test_preds = []
for poulet in range(statsIter):
# Calculate Train/Test data
logging.debug("Start:\t Determine Train/Test split")
testIndices = ClassifMonoView.splitDataset(Y, nbClass, learningRate, datasetLength)
......@@ -93,28 +97,31 @@ def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path,
logging.debug("Done:\t RandomSearch best settings")
logging.debug("Start:\t Training")
cl_res = classifierModule.fit(X_train, y_train, NB_CORES=nbCores, **clKWARGS)
t_end = time.time() - t_start
logging.debug("Info:\t Time for Training: " + str(t_end) + "[s]")
logging.debug("Done:\t Training")
logging.debug("Start:\t Predicting")
# Stats Result
y_train_pred = cl_res.predict(X_train)
y_test_pred = cl_res.predict(X_test)
y_trains.append(y_train)
y_train_preds.append(y_train_pred)
y_tests.append(y_test)
y_test_preds.append(y_test_pred)
t_end = time.time() - t_start
logging.debug("Done:\t Predicting")
logging.debug("Info:\t Time for training and predicting: " + str(t_end) + "[s]")
classLabelsDesc = pd.read_csv(path + fileCLD, sep=";", names=['label', 'name'])
classLabelsNames = classLabelsDesc.name
logging.debug("Done:\t Predicting")
#logging.debug("" + str(classLabelsNames))
classLabelsNamesList = classLabelsNames.values.tolist()
#logging.debug(""+ str(classLabelsNamesList))
logging.debug("Start:\t Getting Results")
#Accuracy classification score
stringAnalysis, imagesAnalysis, metricsScores = execute(name, learningRate, nbFolds, nbCores, gridSearch, metrics, nIter, feat, CL_type,
clKWARGS, classLabelsNames, X.shape,
y_train, y_train_pred, y_test, y_test_pred, t_end)
y_trains, y_train_preds, y_tests, y_test_preds, t_end, statsIter)
cl_desc = [value for key, value in sorted(clKWARGS.iteritems())]
logging.debug("Done:\t Getting Results")
logging.info(stringAnalysis)
......
from datetime import timedelta as hms
import numpy as np
import MonoviewClassifiers
import Metrics
......@@ -23,37 +24,39 @@ def getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS):
classifierConfigString += "\n\n"
return classifierConfigString
def getMetricScore(metric, y_trains, y_train_preds, y_tests, y_test_preds):
    """Build the report text for one metric, averaged over the repeated runs.

    metric is a pair: metric[0] is the name of an attribute of the Metrics
    package and metric[1] is either None or a sequence of configuration
    values for that metric.
    y_trains / y_train_preds and y_tests / y_test_preds are parallel lists
    holding one entry per statistical iteration.

    Returns a string with the metric description followed by the mean train
    score and the mean test score.
    """
    metricModule = getattr(Metrics, metric[0])
    if metric[1] is not None:
        # Keyword names expanded with ** must be strings, so the position of
        # each configuration value is stringified.
        # NOTE(review): confirm the Metrics modules read their configuration
        # under the keys "0", "1", ...
        metricKWARGS = dict((str(index), metricConfig) for index, metricConfig in enumerate(metric[1]))
    else:
        metricKWARGS = {}
    # The same configuration is given to score() and getConfig() so that the
    # reported description matches the numbers actually computed.
    metricScoreTrain = np.mean(np.array([metricModule.score(y_train, y_train_pred, **metricKWARGS)
                                         for y_train, y_train_pred in zip(y_trains, y_train_preds)]))
    metricScoreTest = np.mean(np.array([metricModule.score(y_test, y_test_pred, **metricKWARGS)
                                        for y_test, y_test_pred in zip(y_tests, y_test_preds)]))
    metricScoreString = "\tFor "+metricModule.getConfig(**metricKWARGS)+" : "
    metricScoreString += "\n\t\t- Score on train : "+str(metricScoreTrain)
    metricScoreString += "\n\t\t- Score on test : "+str(metricScoreTest)
    metricScoreString += "\n"
    return metricScoreString
def execute(name, learningRate, nbFolds, nbCores, gridSearch, metrics, nIter, feat, CL_type, clKWARGS, classLabelsNames,
shape, y_train, y_train_pred, y_test, y_test_pred, time):
shape, y_trains, y_train_preds, y_tests, y_test_preds, time, statsIter):
metricsScores = {}
metricModule = getattr(Metrics, metrics[0][0])
train = metricModule.score(y_train, y_train_pred)
val = metricModule.score(y_test, y_test_pred)
train = np.mean(np.array([metricModule.score(y_train, y_train_pred) for y_train, y_train_pred in zip(y_trains, y_train_preds)]))
val = np.mean(np.array([metricModule.score(y_test, y_test_pred) for y_test, y_test_pred in zip(y_tests, y_test_preds)]))
stringAnalysis = "Classification on "+name+" database for "+feat+" with "+CL_type+"\n\n"
stringAnalysis += metrics[0][0]+" on train : "+str(train)+"\n"+metrics[0][0]+" on test : "+str(val)+"\n\n"
stringAnalysis += getDBConfigString(name, feat, learningRate, shape, classLabelsNames, nbFolds)
stringAnalysis += getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS)
for metric in metrics:
stringAnalysis+=getMetricScore(metric, y_train, y_train_pred, y_test, y_test_pred)
stringAnalysis+=getMetricScore(metric, y_trains, y_train_preds, y_tests, y_test_preds)
if metric[1]!=None:
metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1]))
else:
metricKWARGS = {}
metricsScores[metric[0]] = [getattr(Metrics, metric[0]).score(y_train, y_train_pred, **metricKWARGS), "",
getattr(Metrics, metric[0]).score(y_test, y_test_pred, **metricKWARGS)]
metricsScores[metric[0]] = [np.mean(np.array([getattr(Metrics, metric[0]).score(y_test, y_test_pred) for y_test, y_test_pred in zip(y_tests, y_test_preds)])), "",
np.mean(np.array([getattr(Metrics, metric[0]).score(y_test, y_test_pred) for y_test, y_test_pred in zip(y_tests, y_test_preds)]))]
stringAnalysis += "\n\n Classification took "+ str(hms(seconds=int(time)))
imageAnalysis = {}
......
......@@ -22,14 +22,14 @@ __status__ = "Prototype" # Production, Development, P
def ExecMultiview_multicore(coreIndex, name, learningRate, nbFolds, databaseType, path, LABELS_DICTIONARY , statsIter,
                            gridSearch=False, nbCores=1, metrics=None, nIter=30, **arguments):
    """Run ExecMultiview on the dataset copy that belongs to one worker.

    The worker opens the file path + name + str(coreIndex) + ".hdf5" in
    read-only mode and forwards every other argument to ExecMultiview.
    The nbCores parameter is accepted but not forwarded: ExecMultiview is
    always called with a core count of 1.

    Returns whatever ExecMultiview returns.
    """
    workerDatasetPath = path+name+str(coreIndex)+".hdf5"
    workerDataset = h5py.File(workerDatasetPath, "r")
    singleCore = 1
    return ExecMultiview(workerDataset, name, learningRate, nbFolds, singleCore, databaseType, path,
                         LABELS_DICTIONARY, statsIter,
                         gridSearch=gridSearch, metrics=metrics, nIter=nIter, **arguments)
def ExecMultiview(DATASET, name, learningRate, nbFolds, nbCores, databaseType, path, LABELS_DICTIONARY,
def ExecMultiview(DATASET, name, learningRate, nbFolds, nbCores, databaseType, path, LABELS_DICTIONARY, statsIter,
gridSearch=False, metrics=None, nIter=30, **kwargs):
datasetLength = DATASET.get("Metadata").attrs["datasetLength"]
......
......@@ -369,41 +369,45 @@ def easyFactorize(nbGenes, factorizationParam, t=0):
factorLeft[matrixInf:, t_:t__+t_] = vectorLeftInf.reshape(factorLeft[matrixInf:, t_:t__+t_].shape)
factorLeft[:, t__+t_] = vectorLeft
factorSup = np.zeros((t_+t__+1, nbGenes), dtype=bool)
factorSup[:t_, :matrixSup] = vectorSupLeft.reshape(factorSup[:t_, :matrixSup].shape)
if nbGenes%2==1:
factorSup[t_:t__+t_, matrixInf-1:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf-1:].shape)
else:
factorSup[t_:t__+t_, matrixInf:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf:].shape)
factorSup[t__+t_, :] = vectorSup
return t__+t_+1, factorLeft, factorSup
# factorSup = np.zeros((t_+t__+1, nbGenes), dtype=bool)
#
# factorSup[:t_, :matrixSup] = vectorSupLeft.reshape(factorSup[:t_, :matrixSup].shape)
# if nbGenes%2==1:
# factorSup[t_:t__+t_, matrixInf-1:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf-1:].shape)
# else:
# factorSup[t_:t__+t_, matrixInf:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf:].shape)
# factorSup[t__+t_, :] = vectorSup
return t__+t_+1, factorLeft#, factorSup
def getBaseMatrices(nbGenes, factorizationParam, path):
    """Compute the left factor matrix, save it as CSV under path and return it.

    nbGenes and factorizationParam are forwarded to easyFactorize.
    path is prepended to the CSV file name, so it must end with a path
    separator when it designates a directory.

    The file is named
    "factorLeft--n-<nbGenes>--k-<factorizationParam>.csv".

    Returns the left factor matrix produced by easyFactorize.
    """
    # easyFactorize returns the left factor as its second element. Indexing
    # instead of unpacking keeps this working now that the third element
    # (factorSup) is no longer returned.
    factorLeft = easyFactorize(nbGenes, factorizationParam)[1]
    np.savetxt(path+"factorLeft--n-"+str(nbGenes)+"--k-"+str(factorizationParam)+".csv", factorLeft, delimiter=",")
    return factorLeft
def findParams(arrayLen, nbPatients, maxNbBins=2000, minNbBins = 10, maxLenBin=70000, minOverlapping=1, minNbBinsOverlapped=0, maxNbSolutions=30):
    """Pick a binning (nbBins, overlapping, lenBin) that exactly tiles arrayLen.

    A candidate is kept when
        arrayLen == (nbBins - 1) * (lenBin - overlapping) + lenBin
    with lenBin < maxLenBin, minNbBins * lenBin < arrayLen,
    overlapping > minOverlapping, lenBin divisible by (lenBin - overlapping)
    and nbBins < maxNbBins.

    Candidates are enumerated by increasing lenBin, then decreasing
    overlapping. As soon as maxNbSolutions candidates are collected, one of
    them is drawn at random and returned. If the search ends with fewer
    candidates, one of those found is drawn at random instead.

    Returns a dict with the keys "nbBins", "overlapping" and "lenBin", or
    None when no candidate exists.
    """
    results = []
    # `//` reproduces the integer division this Python 2 expression performs
    # on integer sizes.
    if arrayLen*arrayLen*10//100 <= minNbBinsOverlapped*nbPatients:
        return None
    for lenBin in range(1, arrayLen):
        if not (lenBin < maxLenBin and minNbBins*lenBin < arrayLen):
            continue
        for overlapping in range(lenBin-1, 0, -1):
            step = lenBin-overlapping
            if not (overlapping > minOverlapping and lenBin % step == 0):
                continue
            # For a given (lenBin, overlapping) at most one nbBins satisfies
            # the tiling equation, so it is solved directly rather than
            # scanned for.
            if (arrayLen-lenBin) % step != 0:
                continue
            nbBins = (arrayLen-lenBin)//step+1
            if nbBins < maxNbBins:
                results.append({"nbBins":nbBins, "overlapping":overlapping, "lenBin":lenBin})
                if len(results) == maxNbSolutions:
                    return results[random.randrange(len(results))]
    if results:
        # Fewer than maxNbSolutions candidates exist: still return one of them.
        return results[random.randrange(len(results))]
    return None
def findBins(nbBins, overlapping, lenBin):
def findBins(nbBins=142, overlapping=493, lenBin=986):
bins = []
for binIndex in range(nbBins+1):
bins.append([i+binIndex*(lenBin+1-overlapping+1) for i in range(lenBin+1)])
......@@ -421,6 +425,14 @@ def getBins(array, bins, lenBin, overlapping):
return np.array(binnedcoord)
def makeSortedBinsMatrix(nbBins, lenBins, overlapping, arrayLen, path):
    """Build the rank-to-bin membership matrix, save it as CSV and return it.

    The matrix has arrayLen rows and nbBins columns of uint8. Column b holds
    ones on the rows b*step .. b*step + lenBins - 1, where
    step = lenBins - overlapping, and zeros elsewhere. Rows past arrayLen
    are silently dropped by the slice.

    path is prepended to the CSV file name, so it must end with a path
    separator when it designates a directory. The file is named
    "sortedBinsMatrix--t-<lenBins>--n-<nbBins>--c-<overlapping>.csv".
    """
    sortedBinsMatrix = np.zeros((arrayLen, nbBins), dtype=np.uint8)
    step = lenBins-overlapping
    for binIndex in range(nbBins):
        firstRow = step*binIndex
        # Only the column of the current bin is filled. Assigning to the
        # whole row slice would need a (lenBins, nbBins) block.
        sortedBinsMatrix[firstRow:firstRow+lenBins, binIndex] = 1
    np.savetxt(path+"sortedBinsMatrix--t-"+str(lenBins)+"--n-"+str(nbBins)+"--c-"+str(overlapping)+".csv", sortedBinsMatrix, delimiter=",")
    return sortedBinsMatrix
def makeSparseTotalMatrix(sortedRNASeq):
nbPatients, nbGenes = sortedRNASeq.shape
params = findParams(nbGenes, nbPatients)
......@@ -533,10 +545,10 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES):
try:
factorizedLeftBaseMatrix = np.genfromtxt(path+"factorLeft--n-"+str(datasetFile.get("View2").shape[1])+"--k-"+str(100)+".csv", delimiter=',')
except:
factorizedSupBaseMatrix, factorizedLeftBaseMatrix = getBaseMatrices(rnaseqData.shape[1], k)
brnaseqDset = datasetFile.create_dataset("View5", (sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1]*k*2), dtype=bool)
factorizedLeftBaseMatrix = getBaseMatrices(rnaseqData.shape[1], k, path)
brnaseqDset = datasetFile.create_dataset("View5", (sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1]*k), dtype=np.uint8)
for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices):
patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], k * 2), dtype=bool)
patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], k), dtype=np.uint8)
for lineIndex, geneIndex in enumerate(patientSortedArray):
patientMatrix[geneIndex]= factorizedLeftBaseMatrix[lineIndex,:]
brnaseqDset[patientIndex] = patientMatrix.flatten()
......@@ -544,16 +556,23 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES):
brnaseqDset.attrs["sparse"] = False
logging.debug("Done:\t Getting Binarized RNASeq Data")
# logging.debug("Start:\t Getting Binned RNASeq Data")
# sparseBinnedRNASeq = makeSparseTotalMatrix(sortedRNASeqGeneIndices)
# sparseBinnedRNASeqGrp = datasetFile.create_group("View6")
# dataDset = sparseBinnedRNASeqGrp.create_dataset("data", sparseBinnedRNASeq.data.shape, data=sparseBinnedRNASeq.data)
# indicesDset = sparseBinnedRNASeqGrp.create_dataset("indices", sparseBinnedRNASeq.indices.shape, data=sparseBinnedRNASeq.indices)
# indptrDset = sparseBinnedRNASeqGrp.create_dataset("indptr", sparseBinnedRNASeq.indptr.shape, data=sparseBinnedRNASeq.indptr)
# sparseBinnedRNASeqGrp.attrs["name"]="BRNASeq"
# sparseBinnedRNASeqGrp.attrs["sparse"]=True
# sparseBinnedRNASeqGrp.attrs["shape"]=sparseBinnedRNASeq.shape
# logging.debug("Done:\t Getting Binned RNASeq Data")
logging.debug("Start:\t Getting Binned RNASeq Data")
lenBins = 986
nbBins = 142
overlapping = 493
try:
sortedBinsMatrix = np.genfromtxt(path+"sortedBinsMatrix--t-"+str(lenBins)+"--n-"+str(nbBins)+"--c-"+str(overlapping)+".csv", delimiter=",")
except:
sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View2").shape[1], path)
binnedRNASeq = datasetFile.create_dataset("View6", (sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1]*lenBins), dtype=np.uint8)
for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices):
patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], nbBins), dtype=np.uint8)
for lineIndex, geneIndex in enumerate(patientSortedArray):
patientMatrix[geneIndex]= sortedBinsMatrix[lineIndex,:]
brnaseqDset[patientIndex] = patientMatrix.flatten()
brnaseqDset.attrs["name"] = "bRNASeq"
brnaseqDset.attrs["sparse"] = False
logging.debug("Done:\t Getting Binned RNASeq Data")
# logging.debug("Start:\t Getting Adjacence RNASeq Data")
# sparseAdjRNASeq = getAdjacenceMatrix(RNASeqRanking, sortedRNASeqGeneIndices, k=findClosestPowerOfTwo(10)-1)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment