Commit 9fd8694c authored by Baptiste Bauvin

Wrote some doc

parent 60b6ce49
@@ -10,6 +10,7 @@ from . import GetMultiviewDb as DB
 def getV(DATASET, viewIndex, usedIndices=None):
+    """Used to extract a view as a numpy array or a sparse mat from the HDF5 dataset"""
     if usedIndices is None:
         usedIndices = range(DATASET.get("Metadata").attrs["datasetLength"])
     if type(usedIndices) is int:
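For reference, a minimal sketch of the HDF5 layout these helpers assume, built with h5py. The names "Metadata", "datasetLength" and "View0" come straight from the calls visible in the hunks; the file name, shapes and values are toy stand-ins:

import h5py
import numpy as np

# Build a toy dataset with one dense view and the metadata the helpers read
with h5py.File("toy_dataset.hdf5", "w") as DATASET:
    view = DATASET.create_dataset("View0", data=np.random.rand(5, 3))
    view.attrs["sparse"] = False
    metadata = DATASET.create_group("Metadata")
    metadata.attrs["datasetLength"] = 5

# Rough equivalent of getV(DATASET, 0): slice the dense view at the used indices
with h5py.File("toy_dataset.hdf5", "r") as DATASET:
    usedIndices = range(DATASET.get("Metadata").attrs["datasetLength"])
    print(DATASET.get("View0")[list(usedIndices), :].shape)  # (5, 3)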
@@ -32,6 +33,7 @@ def getV(DATASET, viewIndex, usedIndices=None):
 def getShape(DATASET, viewIndex):
+    """Used to get the dataset shape even if it's sparse"""
     if not DATASET.get("View" + str(viewIndex)).attrs["sparse"]:
         return DATASET.get("View" + str(viewIndex)).shape
     else:
@@ -39,6 +41,7 @@ def getShape(DATASET, viewIndex):
 def getValue(DATASET):
+    """Used to get the value of a view in the HDF5 dataset even if it's sparse"""
     if not DATASET.attrs["sparse"]:
         return DATASET.value
     else:
@@ -50,6 +53,7 @@ def getValue(DATASET):
 def extractSubset(matrix, usedIndices):
+    """Used to extract a subset of a matrix even if it's sparse"""
     if sparse.issparse(matrix):
         newIndptr = np.zeros(len(usedIndices) + 1, dtype=int)
         oldindptr = matrix.indptr
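The indptr bookkeeping started here is the by-hand way to slice rows out of a CSR matrix. A self-contained sketch of the same technique on a toy matrix (in practice matrix[usedIndices, :] gives the same result via scipy's built-in fancy indexing):

import numpy as np
from scipy import sparse

matrix = sparse.csr_matrix(np.array([[1, 0], [0, 2], [3, 0], [0, 4]]))
usedIndices = [0, 2]

newIndptr = np.zeros(len(usedIndices) + 1, dtype=int)
oldindptr = matrix.indptr
for i, index in enumerate(usedIndices):
    # accumulate the number of stored values contributed by each kept row
    newIndptr[i + 1] = newIndptr[i] + (oldindptr[index + 1] - oldindptr[index])
newData = np.hstack([matrix.data[oldindptr[i]:oldindptr[i + 1]] for i in usedIndices])
newIndices = np.hstack([matrix.indices[oldindptr[i]:oldindptr[i + 1]] for i in usedIndices])
subset = sparse.csr_matrix((newData, newIndices, newIndptr),
                           shape=(len(usedIndices), matrix.shape[1]))
print(subset.toarray())  # [[1 0], [3 0]]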
@@ -69,8 +73,7 @@ def extractSubset(matrix, usedIndices):
 def initMultipleDatasets(args, nbCores):
-    """Used to create copies of the dataset if multicore computation is used
-    Needs arg.pathF and arg.name"""
+    """Used to create copies of the dataset if multicore computation is used"""
     if nbCores > 1:
         if DB.datasetsAlreadyExist(args.pathF, args.name, nbCores):
             logging.debug("Info:\t Enough copies of the dataset are already available")
@@ -90,6 +93,7 @@ def initMultipleDatasets(args, nbCores):
 def confirm(resp=True, timeout=15):
+    """Used to process the user's answer"""
     ans = input_(timeout)
     if not ans:
         return resp
@@ -102,6 +106,7 @@ def confirm(resp=True, timeout=15):
 def input_(timeout=15):
+    """Used as a UI to stop if too much HDD space will be used"""
     print("You have " + str(timeout) + " seconds to stop the script by typing n")
     i, o, e = select.select([sys.stdin], [], [], timeout)
     if i:
...
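select.select on sys.stdin is the portable POSIX way to wait for keyboard input with a timeout; it does not work on Windows consoles. A standalone sketch of the confirm/input_ pattern above — the readline handling and the n/no check are assumptions, since the rest of the hunk is collapsed:

import select
import sys

def input_(timeout=15):
    print("You have " + str(timeout) + " seconds to stop the script by typing n")
    # select returns a non-empty read list only if stdin is readable before timeout
    i, o, e = select.select([sys.stdin], [], [], timeout)
    if i:
        return sys.stdin.readline().strip()
    return ""  # timeout elapsed, treat as "no answer"

def confirm(resp=True, timeout=15):
    ans = input_(timeout)
    if not ans:
        return resp
    return ans.lower() not in ("n", "no")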
@@ -8,6 +8,7 @@ from .. import Metrics
 def searchBestSettings(dataset, classifierPackage, classifierName, metrics, iLearningIndices, iKFolds, randomState, viewsIndices=None,
                        searchingTool="hyperParamSearch", nIter=1, **kwargs):
+    """Used to select the right hyper-parameter optimization function"""
     if viewsIndices is None:
         viewsIndices = range(dataset.get("Metadata").attrs["nbView"])
     thismodule = sys.modules[__name__]
@@ -18,12 +19,13 @@ def searchBestSettings(dataset, classifierPackage, classifierName, metrics, iLea
 def gridSearch(dataset, classifierName, viewsIndices=None, kFolds=None, nIter=1, **kwargs):
-    # if grid search is selected, we want to test certain values
+    """Used to perform grid search on the classifiers"""
     pass
 
 def randomizedSearch(dataset, classifierPackage, classifierName, metrics, learningIndices, KFolds, randomState, viewsIndices=None, nIter=1,
                      nbCores=1, **classificationKWARGS):
+    """Used to perform a random search on the classifiers to optimize hyper parameters"""
     if viewsIndices is None:
         viewsIndices = range(dataset.get("Metadata").attrs["nbView"])
     metric = metrics[0]
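The collapsed body of randomizedSearch is repo-specific, but the underlying idea is standard: sample nIter random parameter sets, cross-validate each, keep the best. A generic hedged sketch with toy data and plain accuracy standing in for the Metrics module:

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

def randomizedSearchSketch(estimatorFactory, paramSampler, X, y, folds, nIter, randomState):
    bestScore, bestParams = -np.inf, None
    for _ in range(nIter):
        params = paramSampler(randomState)  # draw one random parameter set
        scores = []
        for train, test in folds:
            clf = estimatorFactory(**params).fit(X[train], y[train])
            scores.append(accuracy_score(y[test], clf.predict(X[test])))
        if np.mean(scores) > bestScore:  # keep the best-scoring parameter set
            bestScore, bestParams = np.mean(scores), params
    return bestParams, bestScore

X, y = np.random.rand(60, 4), np.random.randint(0, 2, 60)
folds = list(StratifiedKFold(n_splits=3).split(X, y))
rs = np.random.RandomState(42)
best, score = randomizedSearchSketch(DecisionTreeClassifier,
                                     lambda r: {"max_depth": int(r.randint(1, 6))},
                                     X, y, folds, nIter=5, randomState=rs)
print(best, score)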
@@ -75,10 +77,12 @@ def randomizedSearch(dataset, classifierPackage, classifierName, metrics, learni
 def spearMint(dataset, classifierName, viewsIndices=None, kFolds=None, nIter=1, **kwargs):
+    """Used to perform Spearmint optimization on the classifiers to optimize hyper parameters"""
     pass
 
 def genHeatMaps(params, scoresArray, outputFileName):
+    """Used to generate a heat map for each pair of hyperparams optimized in the previous function"""
     nbParams = len(params)
     if nbParams > 2:
         combinations = itertools.combinations(range(nbParams), 2)
...
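itertools.combinations(range(nbParams), 2) enumerates every pair of hyper-parameters, which is what yields one heat map per pair. A hedged sketch of that enumeration plus a single matplotlib heat map (parameter names, grid size and scores are illustrative; the real score aggregation is in the collapsed lines):

import itertools
import numpy as np
import matplotlib.pyplot as plt

paramNames = ["n_estimators", "max_depth", "learning_rate"]
# every pair of hyper-parameters gets its own heat map
for i, j in itertools.combinations(range(len(paramNames)), 2):
    print("heat map for", paramNames[i], "vs", paramNames[j])

# One toy heat map: scores over a 3x3 grid of the first pair of parameters
scoresArray = np.random.rand(3, 3)
fig, ax = plt.subplots()
im = ax.imshow(scoresArray, cmap="viridis")
ax.set_xlabel(paramNames[0])
ax.set_ylabel(paramNames[1])
fig.colorbar(im, ax=ax, label="score")
fig.savefig("heatmap_" + paramNames[0] + "_" + paramNames[1] + ".png")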
@@ -5,11 +5,12 @@ import pickle
 def percent(x, pos):
-    'The two args are the value and tick position'
+    """Used to print percentage of importance on the y axis"""
     return '%1.1f %%' % (x * 100)
 
 def getFeatureImportance(classifier, directory, interpretString=""):
+    """Used to generate a graph and a pickle dictionary representing feature importances"""
     featureImportances = classifier.feature_importances_
     sortedArgs = np.argsort(-featureImportances)
     featureImportancesSorted = featureImportances[sortedArgs][:50]
...
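percent takes the value and the tick position, the exact signature matplotlib.ticker.FuncFormatter expects, which suggests it formats the y axis of the importance graph. A hedged sketch of the plotting side of getFeatureImportance (toy fitted model; the pickle dump and the real figure layout are in the collapsed lines):

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from sklearn.ensemble import RandomForestClassifier

# Toy stand-in for a fitted classifier exposing feature_importances_
X, y = np.random.rand(100, 20), np.random.randint(0, 2, 100)
classifier = RandomForestClassifier(n_estimators=10).fit(X, y)

def percent(x, pos):
    """Used to print percentage of importance on the y axis"""
    return '%1.1f %%' % (x * 100)

featureImportances = classifier.feature_importances_
sortedArgs = np.argsort(-featureImportances)       # descending importance
featureImportancesSorted = featureImportances[sortedArgs][:50]

fig, ax = plt.subplots()
ax.bar(range(len(featureImportancesSorted)), featureImportancesSorted)
ax.yaxis.set_major_formatter(FuncFormatter(percent))
fig.savefig("feature_importances.png")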
@@ -9,6 +9,7 @@ import sklearn
 def parseTheArgs(arguments):
+    """Used to parse the args entered by the user"""
     parser = argparse.ArgumentParser(
         description='This file is used to benchmark the scores of multiple classification algorithms on multiview data.',
@@ -183,6 +184,7 @@ def parseTheArgs(arguments):
     return args
 
 def initRandomState(randomStateArg, directory):
+    """Used to init a random state, and multiple ones if needed (multicore)"""
     if randomStateArg is None:
         randomState = np.random.RandomState(randomStateArg)
     else:
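np.random.RandomState(None) seeds from OS entropy, so the branch above still yields a usable (just non-reproducible) state. A hedged sketch of both paths plus deriving one state per statistical iteration, as the docstring's "multiple if needed (multicore)" suggests; the derivation scheme and the seed bound are assumptions, not the repo's exact code:

import numpy as np

def initRandomStateSketch(randomStateArg, statsIter):
    if randomStateArg is None:
        randomState = np.random.RandomState(randomStateArg)  # seeded from the OS
    else:
        randomState = np.random.RandomState(int(randomStateArg))  # reproducible
    # one independent state per statistical iteration (bound of 500 is arbitrary)
    statsIterRandomStates = [np.random.RandomState(randomState.randint(500))
                             for _ in range(statsIter)]
    return randomState, statsIterRandomStates

randomState, perIterStates = initRandomStateSketch(42, statsIter=3)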
@@ -199,6 +201,7 @@ def initRandomState(randomStateArg, directory):
 def initLogFile(args):
+    """Used to init the directory where the results will be stored and the log file"""
     resultDirectory = "../Results/" + args.name + "/started_" + time.strftime("%Y_%m_%d-%H_%M") + "/"
     logFileName = time.strftime("%Y%m%d-%H%M%S") + "-" + ''.join(args.CL_type) + "-" + "_".join(
         args.views) + "-" + args.name + "-LOG"
@@ -226,11 +229,14 @@ def initLogFile(args):
 def genSplits(statsIter, datasetlength, DATASET, splitRatio, statsIterRandomStates):
+    """Used to generate the train/test splits using one or multiple random states"""
     indices = np.arange(datasetlength)
     if statsIter > 1:
         splits = []
         for randomState in statsIterRandomStates:
-            foldsObj = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, random_state=randomState, test_size=splitRatio)
+            foldsObj = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1,
+                                                                      random_state=randomState,
+                                                                      test_size=splitRatio)
             folds = foldsObj.split(indices, DATASET.get("Labels").value)
             for fold in folds:
                 train_fold, test_fold = fold
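StratifiedShuffleSplit with n_splits=1 yields a single stratified train/test partition per random state, which is how the loop above produces one split per statistical iteration. A standalone sketch with toy labels:

import numpy as np
import sklearn.model_selection

labels = np.array([0, 0, 0, 0, 1, 1, 1, 1])
indices = np.arange(len(labels))
splits = []
for seed in [1, 2, 3]:  # stands in for statsIterRandomStates
    foldsObj = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1,
                                                              random_state=seed,
                                                              test_size=0.25)
    # each split preserves the class proportions of the labels
    for train_fold, test_fold in foldsObj.split(indices, labels):
        splits.append((indices[train_fold], indices[test_fold]))
print(splits[0])  # six train indices, two test indices with one of each class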
@@ -249,6 +255,7 @@ def genSplits(statsIter, datasetlength, DATASET, splitRatio, statsIterRandomStat
 def genKFolds(statsIter, nbFolds, statsIterRandomStates):
+    """Used to generate fold indices for cross-validation, and multiple sets if needed"""
     if statsIter > 1:
         foldsList = []
         for randomState in statsIterRandomStates:
@@ -259,8 +266,7 @@ def genKFolds(statsIter, nbFolds, statsIterRandomStates):
 def initViews(DATASET, args):
-    """Used to return the view names that will be used by the algos, their indices and all the view names
-    Needs args.views"""
+    """Used to return the view names that will be used by the algos, their indices and all the view names"""
     NB_VIEW = DATASET.get("Metadata").attrs["nbView"]
     if args.views != [""]:
         allowedViews = args.views
@@ -278,6 +284,7 @@ def initViews(DATASET, args):
 def genDirecortiesNames(directory, statsIter):
+    """Used to generate the different directories for each iteration if needed"""
     if statsIter > 1:
         directories = []
         for i in range(statsIter):
...
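The visible lines only start the loop, but the shape suggests one result subdirectory per statistical iteration. A minimal sketch of that idea; the "iter_<n>/" naming is an assumption, not the repo's exact convention:

def genDirectoriesNamesSketch(directory, statsIter):
    # one subdirectory per iteration when several statistical runs are requested
    if statsIter > 1:
        return [directory + "iter_" + str(i + 1) + "/" for i in range(statsIter)]
    return [directory]

print(genDirectoriesNamesSketch("../Results/demo/", 3))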