Added an option to repeat a training/classification several times and average the results

132f8710 · bbauvin · 1e9c7274 · 132f8710 · 132f8710 · 132f8710
Commit 132f8710 authored 8 years ago by bbauvin
--- a/Code/MonoMutliViewClassifiers/ExecClassif.py
+++ b/Code/MonoMutliViewClassifiers/ExecClassif.py
@@ -79,6 +79,8 @@ groupClass.add_argument('--CL_algos_multiview', metavar='STRING', action='store'
                        help='Determine which multiview classifier to use, separate with : if multiple, if empty, considering all', default='')
 groupClass.add_argument('--CL_cores', metavar='INT', action='store', help='Number of cores, -1 for all', type=int,
                        default=1)
+groupClass.add_argument('--CL_statsiter', metavar='INT', action='store', help='Number of iteration for each algorithm to mean results', type=int,
+                        default=1)
 groupClass.add_argument('--CL_metrics', metavar='STRING', action='store', nargs="+",
                        help='Determine which metrics to use, separate metric and configuration with ":". If multiple, separate with space. If no metric is specified, considering all with accuracy for classification '
                             'first one will be used for classification', default=[''])
@@ -164,6 +166,7 @@ groupFusion.add_argument('--FU_cl_config', metavar='STRING', action='store', nar
 args = parser.parse_args()
 os.nice(args.nice)
 nbCores = args.CL_cores
+statsIter = args.CL_statsiter
 start = time.time()
 if args.name not in ["MultiOmic", "ModifiedMultiOmic", "Caltech", "Fake", "Plausible"]:
    getDatabase = getattr(DB, "getClassicDB" + args.type[1:])
@@ -321,7 +324,7 @@ if nbCores>1:
    nbExperiments = len(argumentDictionaries["Monoview"])
    for stepIndex in range(int(math.ceil(float(nbExperiments)/nbCores))):
        resultsMonoview+=(Parallel(n_jobs=nbCores)(
-            delayed(ExecMonoview_multicore)(args.name, args.CL_split, args.CL_nbFolds, coreIndex, args.type, args.pathF, gridSearch=gridSearch,
+            delayed(ExecMonoview_multicore)(args.name, args.CL_split, args.CL_nbFolds, coreIndex, args.type, args.pathF, statsIter, gridSearch=gridSearch,
                                            metrics=metrics, nIter=args.CL_GS_iter, **argumentDictionaries["Monoview"][coreIndex + stepIndex * nbCores])
            for coreIndex in range(min(nbCores, nbExperiments - stepIndex  * nbCores))))
    accuracies = [[result[1][1] for result in resultsMonoview if result[0]==viewIndex] for viewIndex in range(NB_VIEW)]
@@ -334,7 +337,7 @@ if nbCores>1:
 else:
    resultsMonoview+=([ExecMonoview(DATASET.get("View"+str(arguments["viewIndex"])),
                                    DATASET.get("Labels").value, args.name,
-                                    args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF,
+                                    args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF, statsIter,
                                    gridSearch=gridSearch, metrics=metrics, nIter=args.CL_GS_iter,
                                    **arguments)
                       for arguments in argumentDictionaries["Monoview"]])
@@ -415,12 +418,12 @@ if nbCores>1:
    for stepIndex in range(int(math.ceil(float(nbExperiments)/nbCores))):
        resultsMultiview += Parallel(n_jobs=nbCores)(
            delayed(ExecMultiview_multicore)(coreIndex, args.name, args.CL_split, args.CL_nbFolds, args.type, args.pathF,
-                                   LABELS_DICTIONARY, gridSearch=gridSearch,
+                                   LABELS_DICTIONARY, statsIter, gridSearch=gridSearch,
                                   metrics=metrics, nIter=args.CL_GS_iter, **argumentDictionaries["Multiview"][stepIndex*nbCores+coreIndex])
            for coreIndex in range(min(nbCores, nbExperiments - stepIndex * nbCores)))
 else:
    resultsMultiview = [ExecMultiview(DATASET, args.name, args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF,
-                               LABELS_DICTIONARY, gridSearch=gridSearch,
+                               LABELS_DICTIONARY, statsIter, gridSearch=gridSearch,
                               metrics=metrics, nIter=args.CL_GS_iter, **arguments) for arguments in argumentDictionaries["Multiview"]]
 multiviewTime = time.time()-monoviewTime-dataBaseTime-start
 if nbCores>1:

--- a/Code/MonoMutliViewClassifiers/Monoview/ExecClassifMonoView.py
+++ b/Code/MonoMutliViewClassifiers/Monoview/ExecClassifMonoView.py
@@ -30,7 +30,7 @@ __status__ 	= "Prototype"           # Production, Development, Prototype
 __date__	= 2016-03-25


-def ExecMonoview_multicore(name, learningRate, nbFolds, datasetFileIndex, databaseType, path, gridSearch=True,
+def ExecMonoview_multicore(name, learningRate, nbFolds, datasetFileIndex, databaseType, path, statsIter, gridSearch=True,
                           metrics=[["accuracy_score", None]], nIter=30, **args):
    DATASET = h5py.File(path+name+str(datasetFileIndex)+".hdf5", "r")
    kwargs = args["args"]
@@ -38,11 +38,11 @@ def ExecMonoview_multicore(name, learningRate, nbFolds, datasetFileIndex, databa
    neededViewIndex = views.index(kwargs["feat"])
    X = DATASET.get("View"+str(neededViewIndex))
    Y = DATASET.get("Labels").value
-    return ExecMonoview(X, Y, name, learningRate, nbFolds, 1, databaseType, path, gridSearch=gridSearch,
+    return ExecMonoview(X, Y, name, learningRate, nbFolds, 1, databaseType, path, statsIter, gridSearch=gridSearch,
                        metrics=metrics, nIter=nIter, **args)


-def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path, gridSearch=True,
+def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path, statsIter, gridSearch=True,
                metrics=[["accuracy_score", None]], nIter=30, **args):
    logging.debug("Start:\t Loading data")
    try:
@@ -64,7 +64,11 @@ def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path,
    # Determine the Database to extract features
    logging.debug("Info:\t Classification - Database:" + str(name) + " Feature:" + str(feat) + " train_size:" + str(learningRate) + ", CrossValidation k-folds:" + str(nbFolds) + ", cores:" + str(nbCores)+", algorithm : "+CL_type)

-
+    y_trains = []
+    y_tests = []
+    y_train_preds = []
+    y_test_preds = []
+    for poulet in range(statsIter):
        # Calculate Train/Test data
        logging.debug("Start:\t Determine Train/Test split")
        testIndices = ClassifMonoView.splitDataset(Y, nbClass, learningRate, datasetLength)
@@ -93,28 +97,31 @@ def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path,
            logging.debug("Done:\t RandomSearch best settings")
        logging.debug("Start:\t Training")
        cl_res = classifierModule.fit(X_train, y_train, NB_CORES=nbCores, **clKWARGS)
-    t_end  = time.time() - t_start

-    logging.debug("Info:\t Time for Training: " + str(t_end) + "[s]")
        logging.debug("Done:\t Training")

        logging.debug("Start:\t Predicting")
        # Stats Result
        y_train_pred = cl_res.predict(X_train)
        y_test_pred = cl_res.predict(X_test)
+
+        y_trains.append(y_train)
+        y_train_preds.append(y_train_pred)
+        y_tests.append(y_test)
+        y_test_preds.append(y_test_pred)
+    t_end  = time.time() - t_start
+    logging.debug("Done:\t Predicting")
+    logging.debug("Info:\t Time for training and predicting: " + str(t_end) + "[s]")
    classLabelsDesc = pd.read_csv(path + fileCLD, sep=";", names=['label', 'name'])
    classLabelsNames = classLabelsDesc.name
-    logging.debug("Done:\t Predicting")
-    #logging.debug("" + str(classLabelsNames))
    classLabelsNamesList = classLabelsNames.values.tolist()
-    #logging.debug(""+ str(classLabelsNamesList))

    logging.debug("Start:\t Getting Results")

    #Accuracy classification score
    stringAnalysis, imagesAnalysis, metricsScores = execute(name, learningRate, nbFolds, nbCores, gridSearch, metrics, nIter, feat, CL_type,
                                         clKWARGS, classLabelsNames, X.shape,
-                                         y_train, y_train_pred, y_test, y_test_pred, t_end)
+                                         y_trains, y_train_preds, y_tests, y_test_preds, t_end, statsIter)
    cl_desc = [value for key, value in sorted(clKWARGS.iteritems())]
    logging.debug("Done:\t Getting Results")
    logging.info(stringAnalysis)

--- a/Code/MonoMutliViewClassifiers/Monoview/analyzeResult.py
+++ b/Code/MonoMutliViewClassifiers/Monoview/analyzeResult.py
 from datetime import timedelta as hms
+import numpy as np

 import MonoviewClassifiers
 import Metrics
@@ -23,37 +24,39 @@ def getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS):
    classifierConfigString += "\n\n"
    return classifierConfigString

-def getMetricScore(metric, y_train, y_train_pred, y_test, y_test_pred):
+def getMetricScore(metric, y_trains, y_train_preds, y_tests, y_test_preds):
    metricModule = getattr(Metrics, metric[0])
    if metric[1]!=None:
        metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1]))
    else:
        metricKWARGS = {}
+    metricScoreTrain = np.mean(np.array([metricModule.score(y_train, y_train_pred) for y_train, y_train_pred in zip(y_trains, y_train_preds)]))
+    metricScoreTest = np.mean(np.array([metricModule.score(y_test, y_test_pred) for y_test, y_test_pred in zip(y_tests, y_test_preds)]))
    metricScoreString = "\tFor "+metricModule.getConfig(**metricKWARGS)+" : "
-    metricScoreString += "\n\t\t- Score on train : "+str(metricModule.score(y_train, y_train_pred))
-    metricScoreString += "\n\t\t- Score on test : "+str(metricModule.score(y_test, y_test_pred))
+    metricScoreString += "\n\t\t- Score on train : "+str(metricScoreTrain)
+    metricScoreString += "\n\t\t- Score on test : "+str(metricScoreTest)
    metricScoreString += "\n"
    return metricScoreString


 def execute(name, learningRate, nbFolds, nbCores, gridSearch, metrics, nIter, feat, CL_type, clKWARGS, classLabelsNames,
-            shape, y_train, y_train_pred, y_test, y_test_pred, time):
+            shape, y_trains, y_train_preds, y_tests, y_test_preds, time, statsIter):
    metricsScores = {}
    metricModule = getattr(Metrics, metrics[0][0])
-    train = metricModule.score(y_train, y_train_pred)
-    val = metricModule.score(y_test, y_test_pred)
+    train = np.mean(np.array([metricModule.score(y_train, y_train_pred) for y_train, y_train_pred in zip(y_trains, y_train_preds)]))
+    val = np.mean(np.array([metricModule.score(y_test, y_test_pred) for y_test, y_test_pred in zip(y_tests, y_test_preds)]))
    stringAnalysis = "Classification on "+name+" database for "+feat+" with "+CL_type+"\n\n"
    stringAnalysis += metrics[0][0]+" on train : "+str(train)+"\n"+metrics[0][0]+" on test : "+str(val)+"\n\n"
    stringAnalysis += getDBConfigString(name, feat, learningRate, shape, classLabelsNames, nbFolds)
    stringAnalysis += getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS)
    for metric in metrics:
-        stringAnalysis+=getMetricScore(metric, y_train, y_train_pred, y_test, y_test_pred)
+        stringAnalysis+=getMetricScore(metric, y_trains, y_train_preds, y_tests, y_test_preds)
        if metric[1]!=None:
            metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1]))
        else:
            metricKWARGS = {}
-        metricsScores[metric[0]] = [getattr(Metrics, metric[0]).score(y_train, y_train_pred, **metricKWARGS), "",
-                                    getattr(Metrics, metric[0]).score(y_test, y_test_pred, **metricKWARGS)]
+        metricsScores[metric[0]] = [np.mean(np.array([getattr(Metrics, metric[0]).score(y_test, y_test_pred) for y_test, y_test_pred in zip(y_tests, y_test_preds)])), "",
+                                    np.mean(np.array([getattr(Metrics, metric[0]).score(y_test, y_test_pred) for y_test, y_test_pred in zip(y_tests, y_test_preds)]))]
    stringAnalysis += "\n\n Classification took "+ str(hms(seconds=int(time)))

    imageAnalysis = {}

--- a/Code/MonoMutliViewClassifiers/Multiview/ExecMultiview.py
+++ b/Code/MonoMutliViewClassifiers/Multiview/ExecMultiview.py
@@ -22,14 +22,14 @@ __status__ 	= "Prototype"                           # Production, Development, P



-def ExecMultiview_multicore(coreIndex, name, learningRate, nbFolds, databaseType, path, LABELS_DICTIONARY ,
+def ExecMultiview_multicore(coreIndex, name, learningRate, nbFolds, databaseType, path, LABELS_DICTIONARY , statsIter,
                            gridSearch=False, nbCores=1, metrics=None, nIter=30, **arguments):
    DATASET = h5py.File(path+name+str(coreIndex)+".hdf5", "r")
-    return ExecMultiview(DATASET, name, learningRate, nbFolds, 1, databaseType, path, LABELS_DICTIONARY,
+    return ExecMultiview(DATASET, name, learningRate, nbFolds, 1, databaseType, path, LABELS_DICTIONARY, statsIter,
                         gridSearch=gridSearch, metrics=metrics, nIter=nIter, **arguments)


-def ExecMultiview(DATASET, name, learningRate, nbFolds, nbCores, databaseType, path, LABELS_DICTIONARY,
+def ExecMultiview(DATASET, name, learningRate, nbFolds, nbCores, databaseType, path, LABELS_DICTIONARY, statsIter,
                  gridSearch=False, metrics=None, nIter=30, **kwargs):

    datasetLength = DATASET.get("Metadata").attrs["datasetLength"]

--- a/Code/MonoMutliViewClassifiers/Multiview/GetMultiviewDb.py
+++ b/Code/MonoMutliViewClassifiers/Multiview/GetMultiviewDb.py
@@ -369,41 +369,45 @@ def easyFactorize(nbGenes, factorizationParam, t=0):
        factorLeft[matrixInf:, t_:t__+t_] = vectorLeftInf.reshape(factorLeft[matrixInf:, t_:t__+t_].shape)
    factorLeft[:, t__+t_] = vectorLeft

-    factorSup = np.zeros((t_+t__+1, nbGenes), dtype=bool)
-
-    factorSup[:t_, :matrixSup] = vectorSupLeft.reshape(factorSup[:t_, :matrixSup].shape)
-    if nbGenes%2==1:
-        factorSup[t_:t__+t_, matrixInf-1:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf-1:].shape)
-    else:
-        factorSup[t_:t__+t_, matrixInf:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf:].shape)
-    factorSup[t__+t_, :] = vectorSup
-    return t__+t_+1, factorLeft, factorSup
+    # factorSup = np.zeros((t_+t__+1, nbGenes), dtype=bool)
+    #
+    # factorSup[:t_, :matrixSup] = vectorSupLeft.reshape(factorSup[:t_, :matrixSup].shape)
+    # if nbGenes%2==1:
+    #     factorSup[t_:t__+t_, matrixInf-1:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf-1:].shape)
+    # else:
+    #     factorSup[t_:t__+t_, matrixInf:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf:].shape)
+    # factorSup[t__+t_, :] = vectorSup
+    return t__+t_+1, factorLeft#, factorSup


-def getBaseMatrices(nbGenes, factorizationParam):
+def getBaseMatrices(nbGenes, factorizationParam, path):
    t, factorLeft, factorSup = easyFactorize(nbGenes, factorizationParam)
-    np.savetxt("factorSup--n-"+str(nbGenes)+"--k-"+str(factorizationParam)+".csv", factorSup, delimiter=",")
-    np.savetxt("factorLeft--n-"+str(nbGenes)+"--k-"+str(factorizationParam)+".csv", factorLeft, delimiter=",")
-    return factorSup, factorLeft
+    np.savetxt(path+"factorLeft--n-"+str(nbGenes)+"--k-"+str(factorizationParam)+".csv", factorLeft, delimiter=",")
+    return factorLeft


-def findParams(arrayLen, nbPatients, maxNbBins=5000, maxLenBin=300, minOverlapping=30, minNbBinsOverlapped=20, maxNbSolutions=30):
+def findParams(arrayLen, nbPatients, maxNbBins=2000, minNbBins = 10, maxLenBin=70000, minOverlapping=1, minNbBinsOverlapped=0, maxNbSolutions=30):
    results = []
    if arrayLen*arrayLen*10/100>minNbBinsOverlapped*nbPatients:
        for lenBin in range(arrayLen-1):
-            if lenBin+1<maxLenBin:
-                for overlapping in sorted(range(lenBin+1-1), reverse=True):
-                    if overlapping+1>minOverlapping and math.ceil(float(lenBin)/(lenBin-overlapping))>=minNbBinsOverlapped:
+            lenBin = lenBin+1
+            if lenBin<maxLenBin and minNbBins*lenBin<arrayLen:
+                print lenBin
+                print results
+                for overlapping in sorted(range(lenBin-1), reverse=True):
+                    overlapping = overlapping+1
+                    if overlapping>minOverlapping and lenBin%(lenBin-overlapping)==0:
                        for nbBins in sorted(range(arrayLen-1), reverse=True):
-                            if nbBins+1<maxNbBins:
-                                if arrayLen == (nbBins+1-1)*(lenBin+1-overlapping+1)+lenBin+1:
+                            nbBins = nbBins+1
+                            if nbBins<maxNbBins:
+                                if arrayLen == (nbBins-1)*(lenBin-overlapping)+lenBin:
                                    results.append({"nbBins":nbBins, "overlapping":overlapping, "lenBin":lenBin})
                                    if len(results)==maxNbSolutions:
                                        params = results[random.randrange(len(results))]
                                        return params


-def findBins(nbBins, overlapping, lenBin):
+def findBins(nbBins=142, overlapping=493, lenBin=986):
    bins = []
    for binIndex in range(nbBins+1):
        bins.append([i+binIndex*(lenBin+1-overlapping+1) for i in range(lenBin+1)])
@@ -421,6 +425,14 @@ def getBins(array, bins, lenBin, overlapping):
    return np.array(binnedcoord)


+def makeSortedBinsMatrix(nbBins, lenBins, overlapping, arrayLen, path):
+    sortedBinsMatrix = np.zeros((arrayLen, nbBins), dtype=np.uint8)
+    step = lenBins-overlapping
+    for binIndex in nbBins:
+        sortedBinsMatrix[step*binIndex:lenBins+(step*binIndex)] = np.ones(lenBins, dtype=np.uint8)
+    np.savetxt(path+"sortedBinsMatrix--t-"+str(lenBins)+"--n-"+str(nbBins)+"--c-"+str(overlapping)+".csv", sortedBinsMatrix, delimiter=",")
+    return sortedBinsMatrix
+
 def makeSparseTotalMatrix(sortedRNASeq):
    nbPatients, nbGenes = sortedRNASeq.shape
    params = findParams(nbGenes, nbPatients)
@@ -533,10 +545,10 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES):
    try:
        factorizedLeftBaseMatrix = np.genfromtxt(path+"factorLeft--n-"+str(datasetFile.get("View2").shape[1])+"--k-"+str(100)+".csv", delimiter=',')
    except:
-        factorizedSupBaseMatrix, factorizedLeftBaseMatrix = getBaseMatrices(rnaseqData.shape[1], k)
-    brnaseqDset = datasetFile.create_dataset("View5", (sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1]*k*2), dtype=bool)
+        factorizedLeftBaseMatrix = getBaseMatrices(rnaseqData.shape[1], k, path)
+    brnaseqDset = datasetFile.create_dataset("View5", (sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1]*k), dtype=np.uint8)
    for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices):
-        patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], k * 2), dtype=bool)
+        patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], k), dtype=np.uint8)
        for lineIndex, geneIndex in enumerate(patientSortedArray):
            patientMatrix[geneIndex]= factorizedLeftBaseMatrix[lineIndex,:]
        brnaseqDset[patientIndex] = patientMatrix.flatten()
@@ -544,16 +556,23 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES):
    brnaseqDset.attrs["sparse"] = False
    logging.debug("Done:\t Getting Binarized RNASeq Data")

-    # logging.debug("Start:\t Getting Binned RNASeq Data")
-    # sparseBinnedRNASeq = makeSparseTotalMatrix(sortedRNASeqGeneIndices)
-    # sparseBinnedRNASeqGrp = datasetFile.create_group("View6")
-    # dataDset = sparseBinnedRNASeqGrp.create_dataset("data", sparseBinnedRNASeq.data.shape, data=sparseBinnedRNASeq.data)
-    # indicesDset = sparseBinnedRNASeqGrp.create_dataset("indices", sparseBinnedRNASeq.indices.shape, data=sparseBinnedRNASeq.indices)
-    # indptrDset = sparseBinnedRNASeqGrp.create_dataset("indptr", sparseBinnedRNASeq.indptr.shape, data=sparseBinnedRNASeq.indptr)
-    # sparseBinnedRNASeqGrp.attrs["name"]="BRNASeq"
-    # sparseBinnedRNASeqGrp.attrs["sparse"]=True
-    # sparseBinnedRNASeqGrp.attrs["shape"]=sparseBinnedRNASeq.shape
-    # logging.debug("Done:\t Getting Binned RNASeq Data")
+    logging.debug("Start:\t Getting Binned RNASeq Data")
+    lenBins = 986
+    nbBins = 142
+    overlapping = 493
+    try:
+        sortedBinsMatrix = np.genfromtxt(path+"sortedBinsMatrix--t-"+str(lenBins)+"--n-"+str(nbBins)+"--c-"+str(overlapping)+".csv", delimiter=",")
+    except:
+        sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View2").shape[1], path)
+    binnedRNASeq = datasetFile.create_dataset("View6", (sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1]*lenBins), dtype=np.uint8)
+    for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices):
+        patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], nbBins), dtype=np.uint8)
+        for lineIndex, geneIndex in enumerate(patientSortedArray):
+            patientMatrix[geneIndex]= sortedBinsMatrix[lineIndex,:]
+        brnaseqDset[patientIndex] = patientMatrix.flatten()
+    brnaseqDset.attrs["name"] = "bRNASeq"
+    brnaseqDset.attrs["sparse"] = False
+    logging.debug("Done:\t Getting Binned RNASeq Data")

    # logging.debug("Start:\t Getting Adjacence RNASeq Data")
    # sparseAdjRNASeq = getAdjacenceMatrix(RNASeqRanking, sortedRNASeqGeneIndices, k=findClosestPowerOfTwo(10)-1)