Commit 99432b2a authored by bbauvin

Refactored and added subsampling file in classifiers module

parent 8ea80c0f
@@ -14,7 +14,7 @@ import logging # To create Log-Files
# Import own modules
import DBCrawl # Functions to read Images from Database
import Code.MonoView.ExportResults # Functions to render results
-import FeatExtraction # Functions to extract the features from Database
+import FeatExtraction # Functions to extract the views from Database
# Author-Info
__author__ = "Nikolas Huelsmann"
@@ -24,7 +24,7 @@ __date__ = 2016-03-25
### Argument Parser
parser = argparse.ArgumentParser(
-description='This method permits to export one or more features at the same time for a database of images (path, name). To extract one feature activate it by using the specific argument (e.g. -RGB). For each feature you can define the parameters by using the optional arguments (e.g. --RGB_Hist 32). The results will be exported to a CSV-File.',
+description='This method permits to export one or more views at the same time for a database of images (path, name). To extract one feature activate it by using the specific argument (e.g. -RGB). For each feature you can define the parameters by using the optional arguments (e.g. --RGB_Hist 32). The results will be exported to a CSV-File.',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
groupStandard = parser.add_argument_group('Standard arguments')
@@ -120,7 +120,7 @@ logging.debug("### Main Programm for Feature Extraction ###")
logging.debug("### Extraction - NameDB=" + nameDB + ", Path=" + path + ", Features=" + features)
################################ Read Images from Database
-# Determine the Database to extract features
+# Determine the Database to extract views
logging.debug("Start:\t Exportation of images from DB")
......
@@ -33,7 +33,7 @@ groupStandard = parser.add_argument_group('Standard arguments')
groupStandard.add_argument('-log', action='store_true', help='Use option to activate Logging to Console')
groupStandard.add_argument('--name', metavar='STRING', action='store', help='Name of Database (default: %(default)s)', default='DB')
groupStandard.add_argument('--feat', metavar='STRING', action='store', help='Name of Feature for Classification (default: %(default)s)', default='RGB')
-groupStandard.add_argument('--pathF', metavar='STRING', action='store', help='Path to the features (default: %(default)s)', default='Results-FeatExtr/')
+groupStandard.add_argument('--pathF', metavar='STRING', action='store', help='Path to the views (default: %(default)s)', default='Results-FeatExtr/')
groupStandard.add_argument('--fileCL', metavar='STRING', action='store', help='Name of classLabels CSV-file (default: %(default)s)', default='classLabels.csv')
groupStandard.add_argument('--fileCLD', metavar='STRING', action='store', help='Name of classLabels-Description CSV-file (default: %(default)s)', default='classLabels-Description.csv')
groupStandard.add_argument('--fileFeat', metavar='STRING', action='store', help='Name of feature CSV-file (default: %(default)s)', default='feature.csv')
......
@@ -25,10 +25,10 @@ groupStandard.add_argument('--name', metavar='STRING', action='store', help='Nam
default='Caltech')
groupStandard.add_argument('--type', metavar='STRING', action='store', help='Type of database : .hdf5 or .csv',
default='.csv')
-groupStandard.add_argument('--features', metavar='STRING', action='store',
-help='Name of the features selected for learning', default='RGB:HOG:SIFT')
+groupStandard.add_argument('--views', metavar='STRING', action='store',
+help='Name of the views selected for learning', default='RGB:HOG:SIFT')
groupStandard.add_argument('--pathF', metavar='STRING', action='store',
-help='Path to the features (default: %(default)s)',
+help='Path to the views (default: %(default)s)',
default='../FeatExtraction/Results-FeatExtr/')
groupClass = parser.add_argument_group('Classification arguments')
@@ -69,9 +69,9 @@ groupFusion.add_argument('--FU_cl_config', metavar='STRING', action='store',
help='Configuration for the monoview classifier', default='100:10:5')
args = parser.parse_args()
-features = args.features.split(":")
+views = args.views.split(":")
dataBaseType = args.type
-NB_VIEW = len(features)
+NB_VIEW = len(views)
mumboClassifierConfig = [argument.split(':') for argument in args.MU_config]
LEARNING_RATE = args.CL_split
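As a hypothetical illustration of the config parsing above (flag values invented, not taken from the commit), passing --MU_config 1:0.1 1:0.9 on the command line gives:

# Hypothetical example: args.MU_config arrives as ['1:0.1', '1:0.9'] and is split
# into one [tree depth, sub-sampling rate] pair per view, as consumed by the
# DecisionTree weak learner shown later in this commit (arg[0], arg[1]).
mumboClassifierConfig = [argument.split(':') for argument in ['1:0.1', '1:0.9']]
# -> [['1', '0.1'], ['1', '0.9']]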
@@ -88,7 +88,7 @@ MumboArguments = (mumboClassifierConfig, NB_ITER, classifierNames)
dir = os.path.dirname(os.path.abspath(__file__)) + "/Results/"
logFileName = datetime.datetime.now().strftime(
"%Y_%m_%d") + "-CMultiV-" + args.CL_type + "-" + "_".join(features) + "-" + args.name + "-LOG"
"%Y_%m_%d") + "-CMultiV-" + args.CL_type + "-" + "_".join(views) + "-" + args.name + "-LOG"
logFile = dir + logFileName
if os.path.isfile(logFile + ".log"):
for i in range(1, 20):
@@ -106,14 +106,14 @@ if (args.log):
t_start = time.time()
logging.info("### Main Programm for Multiview Classification")
logging.info("### Classification - Database : " + str(args.name) + " ; Views : " + ", ".join(features) +
logging.info("### Classification - Database : " + str(args.name) + " ; Views : " + ", ".join(views) +
" ; Algorithm : " + args.CL_type + " ; Cores : " + str(NB_CORES))
logging.info("Start:\t Read CSV Database Files for " + args.name)
logging.info("Start:\t Read "+str.upper(type[1:])+" Database Files for " + args.name)
getDatabase = getattr(DB, "get" + args.name + "DB" + dataBaseType[1:])
-DATASET, LABELS_DICTIONARY = getDatabase(features, args.pathF, args.name, NB_CLASS, LABELS_NAMES)
+DATASET, LABELS_DICTIONARY = getDatabase(views, args.pathF, args.name, NB_CLASS, LABELS_NAMES)
datasetLength = DATASET["/datasetLength"][...]
dataBaseType = "hdf5"
@@ -121,7 +121,7 @@ logging.info("Info:\t Labels used: " + ", ".join(LABELS_DICTIONARY.values()))
logging.info("Info:\t Length of dataset:" + str(datasetLength))
for viewIndex in range(NB_VIEW):
logging.info("Info:\t Shape of " + features[viewIndex] + " :" + str(
logging.info("Info:\t Shape of " + views[viewIndex] + " :" + str(
DATASET["View" + str(viewIndex) + "/shape"][...]))
logging.info("Done:\t Read Database Files")
@@ -192,11 +192,11 @@ times = (extractionTime, kFoldLearningTime, kFoldPredictionTime, classificationT
stringAnalysis, imagesAnalysis = analysisModule.execute(kFoldClassifier, kFoldPredictedTrainLabels,
kFoldPredictedTestLabels, kFoldPredictedValidationLabels, DATASET,
NB_CLASS, trainArguments, LEARNING_RATE, LABELS_DICTIONARY,
-features, NB_CORES, times, NB_VIEW, kFolds, args.name, nbFolds,
+views, NB_CORES, times, NB_VIEW, kFolds, args.name, nbFolds,
validationIndices, datasetLength)
labelsSet = set(LABELS_DICTIONARY.values())
logging.info(stringAnalysis)
featureString = "-".join(features)
featureString = "-".join(views)
labelsString = "-".join(labelsSet)
timestr = time.strftime("%Y%m%d-%H%M%S")
outputFileName = "Results/" + timestr + "Results-" + args.CL_type + "-" + ":".join(
......
@@ -25,7 +25,7 @@ def execute(classifier, predictedTrainLabels, predictedTestLabels, trainLabels,
fusionClassifierConfig)
#monoviewClassifierConfig+'\n\n '+ \
stringAnalysis = "\n"+fusionType+" classification using "+monoviewClassifier+ 'as monoview classifier '+ \
"Learning on \n\t- "+", ".join(features)+" as features\n\t- "+", ".join(LABELS_DICTIONARY.values())+ \
"Learning on \n\t- "+", ".join(features)+" as views\n\t- "+", ".join(LABELS_DICTIONARY.values())+ \
" as labels\n\t- "+str(trainingSetLength)+" training examples, "+str(testingSetLength)+ \
" testing examples ("+str(LEARNING_RATE)+" rate)\n\n With "+str(NB_CORES)+' cores used for computing.\n\n'
@@ -48,7 +48,7 @@ def execute(classifier, predictedTrainLabels, predictedTestLabels, trainLabels,
# stringAnalysis+= "\t- Iteration "+str(iterIndex+1)+"\n\t\t Accuracy on train : "+ \
# str(accuracy_score(trainLabels, predictedTrainLabelsByIter[iterIndex]))+'\n\t\t Accuracy on test : '+ \
# str(accuracy_score(testLabels, predictedTestLabelsByIter[iterIndex]))+'\n\t\t Selected View : '+ \
-# features[int(bestViews[iterIndex])]+"\n"
+# views[int(bestViews[iterIndex])]+"\n"
#
# name, image = plotAccuracyByIter(predictedTrainLabelsByIter, predictedTestLabelsByIter, trainLabels, testLabels, NB_ITER)
imagesAnalysis = {}
......
from sklearn import tree
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
# from sklearn.multiclass import OneVsRestClassifier
from ModifiedMulticlass import OneVsRestClassifier
import random
+from SubSampling import subSample
# Add weights
-def getLabelSupports(CLASS_LABELS):
-labels = set(CLASS_LABELS)
-supports = [CLASS_LABELS.tolist().count(label) for label in labels]
-return supports, dict((label, index) for label, index in zip(labels, range(len(labels))))
-def isUseful(nbTrainingExamples, index, CLASS_LABELS, labelDict):
-if nbTrainingExamples[labelDict[CLASS_LABELS[index]]] != 0:
-nbTrainingExamples[labelDict[CLASS_LABELS[index]]] -= 1
-return True, nbTrainingExamples
-else:
-return False, nbTrainingExamples
-def subSample(data, labels, weights, subSampling):
-nbExamples = len(labels)
-labelSupports, labelDict = getLabelSupports(labels)
-nbTrainingExamples = [int(support * subSampling) for support in labelSupports]
-trainingExamplesIndices = []
-while nbTrainingExamples != [0 for i in range(len(labelSupports))]:
-index = int(random.randint(0, nbExamples - 1))
-isUseFull, nbTrainingExamples = isUseful(nbTrainingExamples, index, labels, labelDict)
-if isUseFull:
-trainingExamplesIndices.append(index)
-subSampledData = []
-subSampledLabels = []
-subSampledWeights = []
-for index in trainingExamplesIndices:
-subSampledData.append(data[index])
-subSampledLabels.append(labels[index])
-subSampledWeights.append(weights[index])
-return np.array(subSampledData), np.array(subSampledLabels), np.array(subSampledWeights)
def DecisionTree(data, labels, arg, weights):
depth = int(arg[0])
subSampling = float(arg[1])
@@ -49,11 +14,9 @@ def DecisionTree(data, labels, arg, weights):
subSampledData, subSampledLabels, subSampledWeights = data, labels, weights
isBad = False
classifier = tree.DecisionTreeClassifier(max_depth=depth)
#classifier = OneVsRestClassifier(tree.DecisionTreeClassifier(max_depth=depth))
classifier.fit(subSampledData, subSampledLabels, subSampledWeights)
prediction = classifier.predict(data)
labelsSet = set(prediction)
pTr, r, f1, s = precision_recall_fscore_support(labels, prediction, sample_weight=weights)
if np.mean(pTr) < 0.5:
isBad = True
......
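For context, a minimal driver sketch for the refactored weak learner above. The arrays, the module name and the arg values are illustrative assumptions, not taken from the commit, and the function's return value is elided in this diff:

# Hypothetical driver for the DecisionTree weak learner shown in the hunk above.
import numpy as np
from DecisionTree import DecisionTree   # assumed module name for the file shown above

data = np.random.rand(200, 10)          # one view: 200 examples, 10 features
labels = np.random.randint(0, 2, 200)   # binary class labels
weights = np.ones(200) / 200            # uniform boosting weights

# arg = [max tree depth, sub-sampling rate], matching one "--MU_config 1:0.1"
# entry after ExecMultiview.py splits it on ':'.
result = DecisionTree(data, labels, ['1', '0.1'], weights)   # return value not shown in this diff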
import numpy as np
import random
def getLabelSupports(CLASS_LABELS):
labels = set(CLASS_LABELS)
supports = [CLASS_LABELS.tolist().count(label) for label in labels]
return supports, dict((label, index) for label, index in zip(labels, range(len(labels))))
def isUseful(nbTrainingExamples, index, CLASS_LABELS, labelDict):
if nbTrainingExamples[labelDict[CLASS_LABELS[index]]] != 0:
nbTrainingExamples[labelDict[CLASS_LABELS[index]]] -= 1
return True, nbTrainingExamples
else:
return False, nbTrainingExamples
def subSample(data, labels, weights, subSampling):
nbExamples = len(labels)
labelSupports, labelDict = getLabelSupports(labels)
nbTrainingExamples = [int(support * subSampling) for support in labelSupports]
trainingExamplesIndices = []
while nbTrainingExamples != [0 for i in range(len(labelSupports))]:
index = int(random.randint(0, nbExamples - 1))
isUseFull, nbTrainingExamples = isUseful(nbTrainingExamples, index, labels, labelDict)
if isUseFull:
trainingExamplesIndices.append(index)
subSampledData = []
subSampledLabels = []
subSampledWeights = []
for index in trainingExamplesIndices:
subSampledData.append(data[index])
subSampledLabels.append(labels[index])
subSampledWeights.append(weights[index])
return np.array(subSampledData), np.array(subSampledLabels), np.array(subSampledWeights)
\ No newline at end of file
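A quick usage sketch for the new SubSampling helper (the arrays below are invented; only the import mirrors the one added to the classifier module above). subSample keeps int(support * subSampling) examples per class, drawn at random (repeats are possible), and returns index-aligned data, label and weight arrays:

# Hypothetical example data for SubSampling.subSample
import numpy as np
from SubSampling import subSample

data = np.random.rand(100, 5)            # 100 examples, 5 features each
labels = np.array([0] * 60 + [1] * 40)   # imbalanced binary labels
weights = np.ones(100) / 100             # uniform boosting weights

# Keep 50% of each class: 30 examples of class 0 and 20 of class 1.
subData, subLabels, subWeights = subSample(data, labels, weights, 0.5)
print(subData.shape, np.bincount(subLabels))   # (50, 5) [30 20]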
@@ -10,7 +10,6 @@ import logging
def findMainView(bestViews):
views = list(set(bestViews))
mainView = ()
viewCount = np.array([list(bestViews).count(view) for view in views])
mainView = views[np.argmax(viewCount)]
return mainView
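A tiny hypothetical check of findMainView (values invented): the view index selected most often across iterations is returned as the main view.

# assumes the findMainView defined above is in scope
bestViews = [2, 0, 2, 1, 2, 2, 0]
print(findMainView(bestViews))   # -> 2, since view 2 was selected four times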
@@ -32,7 +31,7 @@ def plotAccuracyByIter(trainAccuracy, testAccuracy, validationAccuracy, NB_ITER,
# for label, x, y in zip(bestViews, x, trainAccuracy):
# if label != mainView:
# plt.annotate(
-# features[int(label)],
+# views[int(label)],
# xy=(x, y), xytext=(-20, 20),
# textcoords='offset points', ha='right', va='bottom',
# bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
......
import os
-os.system('python ExecMultiview.py -log --name MultiOmic --features Methyl:MiRNA:RNASEQ:Clinical --pathF /home/bbauvin/Documents/Data/Data_multi_omics/ --CL_split 5 --CL_nb_class 2 --CL_classes Positive:Negative --CL_type Mumbo --CL_cores 4 --MU_type DecisionTree:DecisionTree:DecisionTree:DecisionTree --MU_config 1:0.1 1:0.1 1:0.9 1:0.9 --MU_iter 100')
\ No newline at end of file
+os.system('python ExecMultiview.py -log --name MultiOmic --type .hdf5 --views Methyl:MiRNA:RNASEQ:Clinical --pathF /home/bbauvin/Documents/Data/Data_multi_omics/ --CL_split 0.3 --CL_nbFolds 5 --CL_nb_class 2 --CL_classes Positive:Negative --CL_type Mumbo --CL_cores 4 --MU_type DecisionTree:DecisionTree:DecisionTree:DecisionTree --MU_config 1:0.09 1:0.09 1:0.9 2:1.0 --MU_iter 100')