Skip to content
Snippets Groups Projects
Commit 9fd8694c authored by Baptiste Bauvin's avatar Baptiste Bauvin
Browse files

Wrote some doc

parent 60b6ce49
No related branches found
No related tags found
No related merge requests found
......@@ -10,6 +10,7 @@ from . import GetMultiviewDb as DB
def getV(DATASET, viewIndex, usedIndices=None):
"""Used to extract a view as a numpy array or a sparse mat from the HDF5 dataset"""
if usedIndices is None:
usedIndices = range(DATASET.get("Metadata").attrs["datasetLength"])
if type(usedIndices) is int:
......@@ -32,6 +33,7 @@ def getV(DATASET, viewIndex, usedIndices=None):
def getShape(DATASET, viewIndex):
"""Used to get the dataset shape even if it's sparse"""
if not DATASET.get("View" + str(viewIndex)).attrs["sparse"]:
return DATASET.get("View" + str(viewIndex)).shape
else:
......@@ -39,6 +41,7 @@ def getShape(DATASET, viewIndex):
def getValue(DATASET):
"""Used to get the value of a view in the HDF5 dataset even if it sparse"""
if not DATASET.attrs["sparse"]:
return DATASET.value
else:
......@@ -50,6 +53,7 @@ def getValue(DATASET):
def extractSubset(matrix, usedIndices):
"""Used to extract a subset of a matrix even if it's sparse"""
if sparse.issparse(matrix):
newIndptr = np.zeros(len(usedIndices) + 1, dtype=int)
oldindptr = matrix.indptr
......@@ -69,8 +73,7 @@ def extractSubset(matrix, usedIndices):
def initMultipleDatasets(args, nbCores):
"""Used to create copies of the dataset if multicore computation is used
Needs arg.pathF and arg.name"""
"""Used to create copies of the dataset if multicore computation is used"""
if nbCores > 1:
if DB.datasetsAlreadyExist(args.pathF, args.name, nbCores):
logging.debug("Info:\t Enough copies of the dataset are already available")
......@@ -90,6 +93,7 @@ def initMultipleDatasets(args, nbCores):
def confirm(resp=True, timeout=15):
"""Used to process answer"""
ans = input_(timeout)
if not ans:
return resp
......@@ -102,6 +106,7 @@ def confirm(resp=True, timeout=15):
def input_(timeout=15):
"""used as a UI to stop if too much HDD space will be used"""
print("You have " + str(timeout) + " seconds to stop the script by typing n")
i, o, e = select.select([sys.stdin], [], [], timeout)
if i:
......
This diff is collapsed.
......@@ -8,6 +8,7 @@ from .. import Metrics
def searchBestSettings(dataset, classifierPackage, classifierName, metrics, iLearningIndices, iKFolds, randomState, viewsIndices=None,
searchingTool="hyperParamSearch", nIter=1, **kwargs):
"""Used to select the right hyperparam optimization function to optimize hyper parameters"""
if viewsIndices is None:
viewsIndices = range(dataset.get("Metadata").attrs["nbView"])
thismodule = sys.modules[__name__]
......@@ -18,12 +19,13 @@ def searchBestSettings(dataset, classifierPackage, classifierName, metrics, iLea
def gridSearch(dataset, classifierName, viewsIndices=None, kFolds=None, nIter=1, **kwargs):
    """Perform a grid search over the classifier's hyper-parameters.

    Not implemented yet: this placeholder exists so the search dispatcher
    (which resolves the searching tool by name) can still look it up.
    Always returns None.
    """
    return None
def randomizedSearch(dataset, classifierPackage, classifierName, metrics, learningIndices, KFolds, randomState, viewsIndices=None, nIter=1,
nbCores=1, **classificationKWARGS):
"""Used to perform a random search on the classifiers to optimize hyper parameters"""
if viewsIndices is None:
viewsIndices = range(dataset.get("Metadata").attrs["nbView"])
metric = metrics[0]
......@@ -75,10 +77,12 @@ def randomizedSearch(dataset, classifierPackage, classifierName, metrics, learni
def spearMint(dataset, classifierName, viewsIndices=None, kFolds=None, nIter=1, **kwargs):
    """Optimize the classifier's hyper-parameters with Spearmint.

    Not implemented yet: this placeholder keeps the name resolvable by the
    hyper-parameter search dispatcher. Always returns None.
    """
    return None
def genHeatMaps(params, scoresArray, outputFileName):
"""Used to generate a heat map for each doublet of hyperparms optimized on the previous function"""
nbParams = len(params)
if nbParams > 2:
combinations = itertools.combinations(range(nbParams), 2)
......
......@@ -5,11 +5,12 @@ import pickle
def percent(x, pos):
    """Format a fractional tick value as a percentage label for the y axis.

    Matplotlib FuncFormatter callback: ``x`` is the tick value (a fraction,
    e.g. 0.5 for 50%), ``pos`` is the tick position and is unused but
    required by the FuncFormatter signature.

    Returns the formatted string, e.g. ``'50.0 %'``.
    """
    # NOTE(review): the original carried a second, redundant bare string
    # statement after the docstring (a no-op); consolidated into one docstring.
    return '%1.1f %%' % (x * 100)
def getFeatureImportance(classifier, directory, interpretString=""):
"""Used to generate a graph and a pickle dictionary representing feature importances"""
featureImportances = classifier.feature_importances_
sortedArgs = np.argsort(-featureImportances)
featureImportancesSorted = featureImportances[sortedArgs][:50]
......
......@@ -9,6 +9,7 @@ import sklearn
def parseTheArgs(arguments):
"""Used to parse the args entered by the user"""
parser = argparse.ArgumentParser(
description='This file is used to benchmark the scores fo multiple classification algorithm on multiview data.',
......@@ -183,6 +184,7 @@ def parseTheArgs(arguments):
return args
def initRandomState(randomStateArg, directory):
"""Used to init a random state and multiple if needed (multicore)"""
if randomStateArg is None:
randomState = np.random.RandomState(randomStateArg)
else:
......@@ -199,6 +201,7 @@ def initRandomState(randomStateArg, directory):
def initLogFile(args):
"""Used to init the directory where the results will be stored and the log file"""
resultDirectory = "../Results/" + args.name + "/started_" + time.strftime("%Y_%m_%d-%H_%M") + "/"
logFileName = time.strftime("%Y%m%d-%H%M%S") + "-" + ''.join(args.CL_type) + "-" + "_".join(
args.views) + "-" + args.name + "-LOG"
......@@ -226,11 +229,14 @@ def initLogFile(args):
def genSplits(statsIter, datasetlength, DATASET, splitRatio, statsIterRandomStates):
"""Used to gen the train/test splits using one or multiple random states"""
indices = np.arange(datasetlength)
if statsIter > 1:
splits = []
for randomState in statsIterRandomStates:
foldsObj = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, random_state=randomState, test_size=splitRatio)
foldsObj = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1,
random_state=randomState,
test_size=splitRatio)
folds = foldsObj.split(indices, DATASET.get("Labels").value)
for fold in folds:
train_fold, test_fold = fold
......@@ -249,6 +255,7 @@ def genSplits(statsIter, datasetlength, DATASET, splitRatio, statsIterRandomStat
def genKFolds(statsIter, nbFolds, statsIterRandomStates):
"""Used to generate folds indices for cross validation and multiple if needed"""
if statsIter > 1:
foldsList = []
for randomState in statsIterRandomStates:
......@@ -259,8 +266,7 @@ def genKFolds(statsIter, nbFolds, statsIterRandomStates):
def initViews(DATASET, args):
"""Used to return the views names that will be used by the algos, their indices and all the views names
Needs args.views"""
"""Used to return the views names that will be used by the algos, their indices and all the views names"""
NB_VIEW = DATASET.get("Metadata").attrs["nbView"]
if args.views != [""]:
allowedViews = args.views
......@@ -278,6 +284,7 @@ def initViews(DATASET, args):
def genDirecortiesNames(directory, statsIter):
"""Used to generate the different directories of each iteration if needed"""
if statsIter > 1:
directories = []
for i in range(statsIter):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment