Commit e78c4e02 authored by Baptiste Bauvin

Worked on cluster adaptation for monoview classification

parent 61faae87
@@ -167,7 +167,7 @@ def execOneBenchmark(coreIndex=-1, LABELS_DICTIONARY=None, directory=None, class
     resultsMonoview += [ExecMonoview_multicore(directory, args.name, labelsNames, classificationIndices, kFolds,
                                                coreIndex, args.type, args.pathF, randomState, labels,
                                                hyperParamSearch=hyperParamSearch, metrics=metrics,
-                                               nIter=args.CL_GS_iter, **argument)
+                                               nIter=args.CL_HPS_iter, **argument)
                         for argument in argumentDictionaries["Monoview"]]
     logging.debug("Done:\t Monoview benchmark")
@@ -181,7 +181,7 @@ def execOneBenchmark(coreIndex=-1, LABELS_DICTIONARY=None, directory=None, class
     resultsMultiview += [
         ExecMultiview_multicore(directory, coreIndex, args.name, classificationIndices, kFolds, args.type,
                                 args.pathF, LABELS_DICTIONARY, randomState, labels, hyperParamSearch=hyperParamSearch,
-                                metrics=metrics, nIter=args.CL_GS_iter, **arguments)
+                                metrics=metrics, nIter=args.CL_HPS_iter, **arguments)
         for arguments in argumentDictionaries["Multiview"]]
     logging.debug("Done:\t Multiview benchmark")
@@ -220,7 +220,7 @@ def execOneBenchmark_multicore(nbCores=-1, LABELS_DICTIONARY=None, directory=Non
         delayed(ExecMonoview_multicore)(directory, args.name, labelsNames, classificationIndices, kFolds,
                                         coreIndex, args.type, args.pathF, randomState, labels,
                                         hyperParamSearch=hyperParamSearch,
-                                        metrics=metrics, nIter=args.CL_GS_iter,
+                                        metrics=metrics, nIter=args.CL_HPS_iter,
                                         **argumentDictionaries["Monoview"][coreIndex + stepIndex * nbCores])
         for coreIndex in range(min(nbCores, nbExperiments - stepIndex * nbCores))))
     logging.debug("Done:\t Monoview benchmark")
@@ -238,7 +238,7 @@ def execOneBenchmark_multicore(nbCores=-1, LABELS_DICTIONARY=None, directory=Non
     resultsMultiview += Parallel(n_jobs=nbCores)(
         delayed(ExecMultiview_multicore)(directory, coreIndex, args.name, classificationIndices, kFolds,
                                          args.type, args.pathF, LABELS_DICTIONARY, randomState, labels,
-                                         hyperParamSearch=hyperParamSearch, metrics=metrics, nIter=args.CL_GS_iter,
+                                         hyperParamSearch=hyperParamSearch, metrics=metrics, nIter=args.CL_HPS_iter,
                                          **argumentDictionaries["Multiview"][stepIndex * nbCores + coreIndex])
         for coreIndex in range(min(nbCores, nbExperiments - stepIndex * nbCores)))
     logging.debug("Done:\t Multiview benchmark")
@@ -280,7 +280,7 @@ def execOneBenchmarkMonoCore(DATASET=None, LABELS_DICTIONARY=None, directory=Non
         resultsMonoview += [ExecMonoview(directory, X, Y, args.name, labelsNames, classificationIndices, kFolds,
                                          1, args.type, args.pathF, randomState,
                                          hyperParamSearch=hyperParamSearch, metrics=metrics,
-                                         nIter=args.CL_GS_iter, **arguments)]
+                                         nIter=args.CL_HPS_iter, **arguments)]
     logging.debug("Done:\t Monoview benchmark")
     logging.debug("Start:\t Multiview arguments initialization")
@@ -294,7 +294,7 @@ def execOneBenchmarkMonoCore(DATASET=None, LABELS_DICTIONARY=None, directory=Non
         resultsMultiview += [
             ExecMultiview(directory, DATASET, args.name, classificationIndices, kFolds, 1, args.type,
                           args.pathF, LABELS_DICTIONARY, randomState, labels, hyperParamSearch=hyperParamSearch,
-                          metrics=metrics, nIter=args.CL_GS_iter, **arguments)]
+                          metrics=metrics, nIter=args.CL_HPS_iter, **arguments)]
     logging.debug("Done:\t Multiview benchmark")
     return [flag, resultsMonoview, resultsMultiview]
@@ -444,7 +444,7 @@ def execClassif(arguments):
 #             resultsMonoview += [ExecMonoview_multicore(directory, args.name, labelsNames, classificationIndices, kFolds,
 #                                                        coreIndex, args.type, args.pathF, randomState,
 #                                                        hyperParamSearch=hyperParamSearch,
-#                                                        metrics=metrics, nIter=args.CL_GS_iter,
+#                                                        metrics=metrics, nIter=args.CL_HPS_iter,
 #                                                        **arguments)
 #                                 for arguments in argumentDictionaries["Monoview"]]
 #             monoviewTime = time.time() - dataBaseTime - start
@@ -456,7 +456,7 @@ def execClassif(arguments):
 #             resultsMultiview += [
 #                 ExecMultiview_multicore(directory, coreIndex, args.name, classificationIndices, kFolds, args.type,
 #                                         args.pathF, LABELS_DICTIONARY, randomState, hyperParamSearch=hyperParamSearch,
-#                                         metrics=metrics, nIter=args.CL_GS_iter, **arguments)
+#                                         metrics=metrics, nIter=args.CL_HPS_iter, **arguments)
 #                 for arguments in argumentDictionaries["Multiview"]]
 #             multiviewTime = time.time() - monoviewTime - dataBaseTime - start
 #
@@ -501,14 +501,14 @@ def execClassif(arguments):
 #                                                       args.type,
 #                                                       args.pathF,
 #                                                       LABELS_DICTIONARY, randomState, hyperParamSearch=hyperParamSearch,
-#                                                       metrics=metrics, nIter=args.CL_GS_iter,
+#                                                       metrics=metrics, nIter=args.CL_HPS_iter,
 #                                                       **argumentDictionaries["Multiview"][stepIndex * nbCores + coreIndex])
 #                 for coreIndex in range(min(nbCores, nbExperiments - stepIndex * nbCores)))
 #         else:
 #             resultsMultiview = [
 #                 ExecMultiview(directory, DATASET, args.name, classificationIndices, kFolds, 1, args.type, args.pathF,
 #                               LABELS_DICTIONARY, randomState, hyperParamSearch=hyperParamSearch,
-#                               metrics=metrics, nIter=args.CL_GS_iter, **arguments) for arguments in
+#                               metrics=metrics, nIter=args.CL_HPS_iter, **arguments) for arguments in
 #                               argumentDictionaries["Multiview"]]
 #         multiviewTime = time.time() - monoviewTime - dataBaseTime - start
 #         if nbCores > 1:
...
@@ -196,85 +196,106 @@ if __name__ == '__main__':
     import argparse
     import pickle
+    from ..utils import Dataset
     parser = argparse.ArgumentParser(
         description='This methods is used to execute a multiclass classification with one single view. ',
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     groupStandard = parser.add_argument_group('Standard arguments')
     groupStandard.add_argument('-log', action='store_true', help='Use option to activate Logging to Console')
+    groupStandard.add_argument('--type', metavar='STRING', action='store', help='Type of Dataset', default=".hdf5")
     groupStandard.add_argument('--name', metavar='STRING', action='store',
-                               help='Name of Database (default: %(default)s)', default='DB')
+                               help='Name of Database', default='Plausible')
+    groupStandard.add_argument('--cl_name', metavar='STRING', action='store',
+                               help='THe name of the monoview classifier to use', default='DecisionTree')
     groupStandard.add_argument('--view', metavar='STRING', action='store',
-                               help='Name of Feature for Classification (default: %(default)s)', default='View0')
+                               help='Name of the view used', default='View0')
     groupStandard.add_argument('--pathF', metavar='STRING', action='store',
-                               help='Path to the views (default: %(default)s)', default='Results-FeatExtr/')
+                               help='Path to the database hdf5 file', default='../../../Data/Plausible')
     groupStandard.add_argument('--directory', metavar='STRING', action='store',
-                               help='Path to the views (default: %(default)s)', default='Results-FeatExtr/')
+                               help='Path of the output directory', default='')
     groupStandard.add_argument('--labelsNames', metavar='STRING', action='store', nargs='+',
-                               help='Name of classLabels CSV-file (default: %(default)s)', default='classLabels.csv')
+                               help='Name of the labels used for classification', default=['Yes', 'No'])
     groupStandard.add_argument('--classificationIndices', metavar='STRING', action='store',
-                               help='Name of classLabels-Description CSV-file (default: %(default)s)',
-                               default='classLabels-Description.csv')
-    groupStandard.add_argument('--nbCores', metavar='INT', action='store', help='Number of cores, -1 for all', type=int,
-                               default=1)
+                               help='Path to the classificationIndices pickle file',
+                               default='')
+    groupStandard.add_argument('--KFolds', metavar='STRING', action='store',
+                               help='Path to the kFolds pickle file',
+                               default='')
+    groupStandard.add_argument('--nbCores', metavar='INT', action='store', help='Number of cores, -1 for all',
+                               type=int, default=1)
     groupStandard.add_argument('--randomState', metavar='INT', action='store',
                                help='Seed for the random state or pickable randomstate file', default=42)
     groupStandard.add_argument('--hyperParamSearch', metavar='STRING', action='store',
-                               help='The type of method used tosearch the best set of hyper parameters', default='randomizedSearch')
-    groupStandard.add_argument('--metrics', metavar='STRING', action='store', nargs="+",
-                               help='Metrics used in the experimentation, the first will be the one used in CV',
-                               default=[''])
-    groupStandard.add_argument('--nIter', metavar='INT', action='store', help='Number of itetarion in hyper parameter search', type=int,
+                               help='The type of method used to search the best set of hyper parameters',
+                               default='randomizedSearch')
+    groupStandard.add_argument('--metrics', metavar='STRING', action='store',
+                               help='Path to the pickle file describing the metricsused to analyze the performance',
+                               default='')
+    groupStandard.add_argument('--kwargs', metavar='STRING', action='store',
+                               help='Path to the pickle file containing the key-words arguments used for classification',
+                               default='')
+    groupStandard.add_argument('--nIter', metavar='INT', action='store',
+                               help='Number of itetarion in hyper parameter search', type=int,
                                default=10)
     args = parser.parse_args()
     directory = args.directory
     name = args.name
+    classifierName = args.cl_name
     labelsNames = args.labelsNames
-    classificationIndices = args.classificationIndices
-    KFolds = args.KFolds
+    viewName = args.view
+    with open(args.classificationIndices, 'rb') as handle:
+        classificationIndices = pickle.load(handle)
+    with open(args.KFolds, 'rb') as handle:
+        KFolds = pickle.load(handle)
     nbCores = args.nbCores
-    databaseType = None
     path = args.pathF
-    randomState = args.randomState
+    with open(args.randomState, 'rb') as handle:
+        randomState = pickle.load(handle)
     hyperParamSearch = args.hyperParamSearch
-    metrics = args.metrics
+    with open(args.metrics, 'rb') as handle:
+        metrics = pickle.load(handle)
     nIter = args.nIter
-    kwargs = args.kwargs
-    # Extract the data using MPI
-    X = None
-    Y = None
-    logfilename = "gen a goodlogfilename"
+    with open(args.kwargs, 'rb') as handle:
+        kwargs = pickle.load(handle)
+    databaseType = None
+    # Extract the data using MPI
+    X, Y = Dataset.getMonoviewShared(path, name, viewName)
-    logfile = directory + logfilename
-    if os.path.isfile(logfile + ".log"):
+    # Init log
+    logFileName = time.strftime("%Y_%m_%d-%H:%M:%S") + "-" + name + "-"+ viewName +"-" + classifierName +'-LOG'
+    if not os.path.exists(os.path.dirname(directory + logFileName)):
+        try:
+            os.makedirs(os.path.dirname(directory + logFileName))
+        except OSError as exc:
+            if exc.errno != errno.EEXIST:
+                raise
+    logFile = directory + logFileName
+    if os.path.isfile(logFile + ".log"):
         for i in range(1, 20):
-            testFileName = logfilename + "-" + str(i) + ".log"
-            if not os.path.isfile(directory + testFileName):
-                logfile = directory + testFileName
+            testFileName = logFileName + "-" + str(i) + ".log"
+            if not (os.path.isfile(directory + testFileName)):
+                logFile = directory + testFileName
                 break
     else:
-        logfile += ".log"
-    logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', filename=logfile, level=logging.DEBUG,
+        logFile += ".log"
+    logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', filename=logFile, level=logging.DEBUG,
                         filemode='w')
     if args.log:
         logging.getLogger().addHandler(logging.StreamHandler())
-    # Computing on multiple cores
     res = ExecMonoview(directory, X, Y, name, labelsNames, classificationIndices, KFolds, nbCores, databaseType, path,
                        randomState, hyperParamSearch=hyperParamSearch,
                        metrics=metrics, nIter=nIter, **kwargs)
     with open(directory + "res.pickle", "wb") as handle:
-        pickle.dump(randomState, handle)
+        pickle.dump(res, handle)
     # Pickle the res in a file to be reused.
...
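In the hunk above, ExecMonoview's __main__ block becomes a standalone entry point whose non-trivial arguments (classification indices, folds, random state, metrics and classifier kwargs) are reloaded from pickle files, which is what lets a cluster scheduler launch each monoview task as its own process. Below is a minimal sketch of how a dispatcher might prepare those files and build the matching command line; the helper name, the file layout and the script name ExecMonoview.py are assumptions, not part of the commit.

import os
import pickle


def prepare_monoview_job(job_dir, classification_indices, k_folds, random_state, metrics, classifier_kwargs):
    """Hypothetical helper: pickle the arguments the standalone script reloads,
    then return a command line matching the argparse flags added in this commit."""
    os.makedirs(job_dir, exist_ok=True)
    pickled = {}
    for file_name, obj in [("classificationIndices.pck", classification_indices),
                           ("KFolds.pck", k_folds),
                           ("randomState.pck", random_state),
                           ("metrics.pck", metrics),
                           ("kwargs.pck", classifier_kwargs)]:
        file_path = os.path.join(job_dir, file_name)
        with open(file_path, "wb") as handle:
            pickle.dump(obj, handle)
        pickled[file_name] = file_path
    return ["python", "ExecMonoview.py",  # assumed script name
            "--name", "Plausible", "--cl_name", "DecisionTree", "--view", "View0",
            "--pathF", "../../../Data/Plausible", "--directory", job_dir + "/",
            "--classificationIndices", pickled["classificationIndices.pck"],
            "--KFolds", pickled["KFolds.pck"],
            "--randomState", pickled["randomState.pck"],
            "--metrics", pickled["metrics.pck"],
            "--kwargs", pickled["kwargs.pck"],
            "--nIter", "10"]

A scheduler wrapper could then hand the returned list to subprocess.Popen, one call per view/classifier pair.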
@@ -2,7 +2,7 @@ import logging
 import os
 import select
 import sys
+import h5py
 import numpy as np
 from scipy import sparse
@@ -114,3 +114,10 @@ def input_(timeout=15):
     else:
         return "y"
+
+
+def getMonoviewShared(path, name, viewName, labelsNames, classificationIndices):
+    """ATM is not used with shared memory, but soon :)"""
+    HDF5_dataset_file = h5py.File(path + name + ".hdf5", "w")
+    X = HDF5_dataset_file.get(viewName).value
+    Y = HDF5_dataset_file.get("Labels").value
+    return X, Y
\ No newline at end of file
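The new getMonoviewShared helper above lets each worker pull one view and the labels straight from the shared HDF5 dataset instead of receiving the arrays from a parent process. For comparison, here is a minimal read-side sketch of the same access pattern, assuming read-only mode ("r") and a context manager rather than the "w" mode used in the committed version; the function name is hypothetical.

import h5py


def load_monoview_data(path, name, view_name):
    """Sketch: read one view and the labels from the shared HDF5 dataset."""
    with h5py.File(path + name + ".hdf5", "r") as dataset_file:
        X = dataset_file[view_name][()]  # full feature matrix of the requested view
        Y = dataset_file["Labels"][()]   # label vector shared by all views
    return X, Y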
@@ -76,7 +76,7 @@ def parseTheArgs(arguments):
                             , default=[''])
     groupClass.add_argument('--CL_metric_princ', metavar='STRING', action='store',
                             help='Determine which metric to use for randomSearch and optimization', default="f1_score")
-    groupClass.add_argument('--CL_GS_iter', metavar='INT', action='store',
+    groupClass.add_argument('--CL_HPS_iter', metavar='INT', action='store',
                             help='Determine how many hyper parameters optimization tests to do', type=int, default=2)
     groupClass.add_argument('--CL_HPS_type', metavar='STRING', action='store',
                             help='Determine which hyperparamter search function use', default="randomizedSearch")
@@ -239,7 +239,7 @@ def initRandomState(randomStateArg, directory):
 def initLogFile(args):
     """Used to init the directory where the preds will be stored and the log file"""
     resultDirectory = "../Results/" + args.name + "/started_" + time.strftime("%Y_%m_%d-%H_%M") + "/"
-    logFileName = time.strftime("%Y%m%d-%H%M%S") + "-" + ''.join(args.CL_type) + "-" + "_".join(
+    logFileName = time.strftime("%Y_%m_%d-%H:%M:%S") + "-" + ''.join(args.CL_type) + "-" + "_".join(
         args.views) + "-" + args.name + "-LOG"
     if not os.path.exists(os.path.dirname(resultDirectory + logFileName)):
         try:
...