diff --git a/Code/MonoMutliViewClassifiers/ExecClassif.py b/Code/MonoMutliViewClassifiers/ExecClassif.py index 081e6de06367a3cdd054758bc86b57e91960d738..13c59d1e3a5ed581e49447bad5193bc62b590e1f 100644 --- a/Code/MonoMutliViewClassifiers/ExecClassif.py +++ b/Code/MonoMutliViewClassifiers/ExecClassif.py @@ -47,13 +47,6 @@ groupStandard.add_argument('--views', metavar='STRING', action='store',help='Nam default='') groupStandard.add_argument('--pathF', metavar='STRING', action='store',help='Path to the views (default: %(default)s)', default='/home/bbauvin/Documents/Data/Data_multi_omics/') -groupStandard.add_argument('--fileCL', metavar='STRING', action='store', - help='Name of classLabels CSV-file (default: %(default)s)', default='classLabels.csv') -groupStandard.add_argument('--fileCLD', metavar='STRING', action='store', - help='Name of classLabels-Description CSV-file (default: %(default)s)', - default='classLabels-Description.csv') -groupStandard.add_argument('--fileFeat', metavar='STRING', action='store', - help='Name of feature CSV-file (default: %(default)s)', default='feature.csv') groupStandard.add_argument('--nice', metavar='INT', action='store', type=int, help='Niceness for the process', default=0) @@ -86,8 +79,8 @@ groupClass.add_argument('--CL_metrics', metavar='STRING', action='store', nargs= 'first one will be used for classification', default=['']) groupClass.add_argument('--CL_GS_iter', metavar='INT', action='store', help='Determine how many Randomized grid search tests to do', type=int, default=30) -groupClass.add_argument('--CL_NoGS', action='store_false', - help='Determine how many Randomized grid search tests to do') +groupClass.add_argument('--CL_GS_type', metavar='STRING', action='store', + help='Determine which hyperparamter search function use', default="randomizedSearch") groupRF = parser.add_argument_group('Random Forest arguments') groupRF.add_argument('--CL_RF_trees', metavar='STRING', action='store', help='GridSearch: Determine the trees', @@ -176,9 +169,9 @@ else: getDatabase = getattr(DB, "get" + args.name + "DB" + args.type[1:]) try: - gridSearch = args.CL_NoGS + gridSearch = args.CL_GS_type except: - gridSearch = True + gridSearch = "None" directory = os.path.dirname(os.path.abspath(__file__)) + "/Results/" logFileName = time.strftime("%Y%m%d-%H%M%S") + "-CMultiV-" + args.CL_type + "-" + "_".join(args.views.split(":")) + "-" + args.name + \ @@ -318,14 +311,14 @@ try: for classifier in benchmark["Monoview"]: if classifier=="SCM": if DATASET.get("View"+str(allViews.index(view))).attrs["binary"]: - arguments = {"args":{classifier+"KWARGS": globals()[classifier+"KWARGSInit"], "feat":view, "fileFeat": args.fileFeat, - "fileCL": args.fileCL, "fileCLD": args.fileCLD, "CL_type": classifier, "nbClass":NB_CLASS}, "viewIndex":allViews.index(view)} + arguments = {"args":{classifier+"KWARGS": globals()[classifier+"KWARGSInit"], "feat":view, + "CL_type": classifier, "nbClass":NB_CLASS}, "viewIndex":allViews.index(view)} argumentDictionaries["Monoview"].append(arguments) else: pass else: - arguments = {"args":{classifier+"KWARGS": globals()[classifier+"KWARGSInit"], "feat":view, "fileFeat": args.fileFeat, - "fileCL": args.fileCL, "fileCLD": args.fileCLD, "CL_type": classifier, "nbClass":NB_CLASS}, "viewIndex":allViews.index(view)} + arguments = {"args":{classifier+"KWARGS": globals()[classifier+"KWARGSInit"], "feat":view, + "CL_type": classifier, "nbClass":NB_CLASS}, "viewIndex":allViews.index(view)} argumentDictionaries["Monoview"].append(arguments) except: pass @@ -343,10 
+336,6 @@ if nbCores>1: accuracies = [[result[1][1] for result in resultsMonoview if result[0]==viewIndex] for viewIndex in range(NB_VIEW)] classifiersNames = [[result[1][0] for result in resultsMonoview if result[0]==viewIndex] for viewIndex in range(NB_VIEW)] classifiersConfigs = [[result[1][2] for result in resultsMonoview if result[0]==viewIndex] for viewIndex in range(NB_VIEW)] - # for viewIndex, view in enumerate(views): - # bestClassifiers.append(classifiersNames[viewIndex][np.argmax(np.array(accuracies[viewIndex]))]) - # bestClassifiersConfigs.append(classifiersConfigs[viewIndex][np.argmax(np.array(accuracies[viewIndex]))]) - else: resultsMonoview+=([ExecMonoview(DATASET.get("View"+str(arguments["viewIndex"])), DATASET.get("Labels").value, args.name, labelsNames, @@ -359,6 +348,7 @@ else: classifiersNames = [[result[1][0] for result in resultsMonoview if result[0]==viewIndex] for viewIndex in viewsIndices] classifiersConfigs = [[result[1][1][:-1] for result in resultsMonoview if result[0]==viewIndex] for viewIndex in viewsIndices] monoviewTime = time.time()-dataBaseTime-start +print classifiersConfigs if True: if benchmark["Multiview"]: try: @@ -374,16 +364,16 @@ if True: "MumboKWARGS": {"classifiersNames": mumboClassifiersNames, "maxIter":int(args.MU_iter[0]), "minIter":int(args.MU_iter[1]), "threshold":args.MU_iter[2], - "classifiersConfigs": [argument.split(":") for argument in args.MU_config]}} + "classifiersConfigs": [argument.split(":") for argument in args.MU_config], "nbView":(len(viewsIndices))}} argumentDictionaries["Multiview"].append(arguments) except: pass try: if benchmark["Multiview"]["Fusion"]: - if args.CL_algos_monoview !=['']: - monoClassifiers = args.CL_algos_monoview.split(":") - monoClassifiersConfigs = [classifier+"KWARGS" for classifier in monoClassifiers] + if args.FU_cl_names.split(':') !=['']: + monoClassifiers = args.FU_cl_names.split(":") + monoClassifiersConfigs = [globals()[classifier+"KWARGS"] for classifier in monoClassifiers] if args.FU_method_config != [""]: fusionMethodConfigs = [map(float,config.split(":")) for config in args.FU_method_config] elif not gridSearch: @@ -405,7 +395,7 @@ if True: "FusionKWARGS": {"fusionType":"LateFusion", "fusionMethod":method, "classifiersNames": args.FU_cl_names.split(":"), "classifiersConfigs": monoClassifiersConfigs, - 'fusionMethodConfig': fusionMethodConfigs[methodIndex]}} + 'fusionMethodConfig': fusionMethodConfigs[methodIndex], "nbView":(len(viewsIndices))}} argumentDictionaries["Multiview"].append(arguments) else: for combination in itertools.combinations_with_replacement(range(len(monoClassifiers)), NB_VIEW): @@ -420,7 +410,7 @@ if True: "FusionKWARGS": {"fusionType":"LateFusion", "fusionMethod":method, "classifiersNames": monoClassifiersNamesComb, "classifiersConfigs": monoClassifiersConfigsComb, - 'fusionMethodConfig': fusionMethodConfigs[methodIndex]}} + 'fusionMethodConfig': fusionMethodConfigs[methodIndex], "nbView":(len(viewsIndices))}} argumentDictionaries["Multiview"].append(arguments) except: pass @@ -469,6 +459,7 @@ if True: else: pass # resultsMultiview = [] +print argumentDictionaries["Multiview"] if nbCores>1: resultsMultiview = [] nbExperiments = len(argumentDictionaries["Multiview"]) diff --git a/Code/MonoMutliViewClassifiers/Monoview/ExecClassifMonoView.py b/Code/MonoMutliViewClassifiers/Monoview/ExecClassifMonoView.py index 92653a2ff7b4c04e67019cbf070b0fc813d76996..9919fbd67693fe6ad4fc0745cd9f9350cdd77843 100644 --- a/Code/MonoMutliViewClassifiers/Monoview/ExecClassifMonoView.py +++ 
b/Code/MonoMutliViewClassifiers/Monoview/ExecClassifMonoView.py @@ -50,11 +50,7 @@ def ExecMonoview(X, Y, name, labelsNames, learningRate, nbFolds, nbCores, databa except: kwargs = args t_start = time.time() - directory = os.path.dirname(os.path.abspath(__file__)) + "/Results-ClassMonoView/" feat = X.attrs["name"] - fileFeat = kwargs["fileFeat"] - fileCL = kwargs["fileCL"] - fileCLD = kwargs["fileCLD"] CL_type = kwargs["CL_type"] nbClass = kwargs["nbClass"] X = getValue(X) @@ -96,7 +92,6 @@ def ExecMonoview(X, Y, name, labelsNames, learningRate, nbFolds, nbCores, databa logging.debug("Done:\t RandomSearch best settings") logging.debug("Start:\t Training") cl_res = classifierModule.fit(X_train, y_train, NB_CORES=nbCores, **clKWARGS) - logging.debug("Done:\t Training") logging.debug("Start:\t Predicting") diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SCM.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SCM.py index 67f0ce4ba5a9404535e46b8f1b934483c8df62c9..2b28f22ef67f288356e671ed33b577c2a6d948fa 100644 --- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SCM.py +++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SCM.py @@ -33,6 +33,7 @@ def fit(DATASET, CLASS_LABELS, NB_CORES=1,**kwargs): binaryAttributes = kwargs["binaryAttributes"] except: attributeClassification, binaryAttributes, dsetFile, name = transformData(DATASET) + print kwargs classifier = pyscm.scm.SetCoveringMachine(p=p, max_attributes=max_attrtibutes, model_type=model_type, verbose=False) classifier.fit(binaryAttributes, CLASS_LABELS, X=None, attribute_classifications=attributeClassification, iteration_callback=None) try: diff --git a/Code/MonoMutliViewClassifiers/Multiview/ExecMultiview.py b/Code/MonoMutliViewClassifiers/Multiview/ExecMultiview.py index 77769d796b0e1e1a5946630793d416368d9b2625..02f82f891b253652b23ca8c1e0fc190145fe9534 100644 --- a/Code/MonoMutliViewClassifiers/Multiview/ExecMultiview.py +++ b/Code/MonoMutliViewClassifiers/Multiview/ExecMultiview.py @@ -15,6 +15,7 @@ import logging import time import h5py from utils.Dataset import getShape +from utils.HyperParameterSearch import searchBestSettings # Author-Info __author__ = "Baptiste Bauvin" @@ -39,7 +40,7 @@ def ExecMultiview(DATASET, name, learningRate, nbFolds, nbCores, databaseType, p NB_CLASS = DATASET.get("Metadata").attrs["nbClass"] if not metrics: metrics = [["accuracy_score", None]] - + metric = metrics[0] CL_type = kwargs["CL_type"] LABELS_NAMES = kwargs["LABELS_NAMES"] classificationKWARGS = kwargs[CL_type+"KWARGS"] @@ -55,14 +56,10 @@ def ExecMultiview(DATASET, name, learningRate, nbFolds, nbCores, databaseType, p logging.info("Done:\t Read Database Files") extractionTime = time.time() - t_start - kFoldPredictedTrainLabels = [] - kFoldPredictedTestLabels = [] - kFoldPredictedValidationLabels = [] - kFoldLearningTime = [] - kFoldPredictionTime = [] - kFoldClassifier = [] ivalidationIndices = [] - ikFolds = [] + trainLabelsIterations = [] + testLabelsIterations = [] + classifiersIterations = [] classifierPackage = globals()[CL_type] # Permet d'appeler un module avec une string classifierModule = getattr(classifierPackage, CL_type) classifierClass = getattr(classifierModule, CL_type) @@ -91,63 +88,30 @@ def ExecMultiview(DATASET, name, learningRate, nbFolds, nbCores, databaseType, p logging.info("Start:\t Learning with " + CL_type + " and " + str(len(kFolds)) + " folds") logging.info("Start:\t Classification") # Begin Classification - - kFoldPredictedTrainLabelsIter = [] - kFoldPredictedTestLabelsIter = [] - 
kFoldPredictedValidationLabelsIter = [] - kFoldLearningTimeIter = [] - kFoldPredictionTimeIter = [] - kFoldClassifierIter = [] - for foldIdx, fold in enumerate(kFolds): - if fold != range(classificationSetLength): - fold.sort() - logging.info("\tStart:\t Fold number " + str(foldIdx + 1)) - trainIndices = [index for index in range(datasetLength) if (index not in fold) and (index not in validationIndices)] - if gridSearch: - logging.info("Start:\t Randomsearching best settings for monoview classifiers") - bestSettings, fusionConfig = classifierGridSearch(DATASET, viewsIndices, classificationKWARGS, trainIndices - , metric=metrics[0], nIter=nIter) - classificationKWARGS["classifiersConfigs"] = bestSettings - try: - classificationKWARGS["fusionMethodConfig"] = fusionConfig - except: - pass - logging.info("Done:\t Randomsearching best settings for monoview classifiers") - DATASET_LENGTH = len(trainIndices) - classifier = classifierClass(NB_VIEW, DATASET_LENGTH, DATASET.get("Labels").value[trainIndices], NB_CORES=nbCores, **classificationKWARGS) - - classifier.fit_hdf5(DATASET, trainIndices=trainIndices, viewsIndices=viewsIndices) - kFoldClassifierIter.append(classifier) - - learningTime = time.time() - extractionTime - t_start - kFoldLearningTimeIter.append(learningTime) - kFoldPredictedTrainLabelsIter.append(classifier.predict_hdf5(DATASET, usedIndices=trainIndices, viewsIndices=viewsIndices)) - kFoldPredictedTestLabelsIter.append(classifier.predict_hdf5(DATASET, usedIndices=fold, viewsIndices=viewsIndices)) - kFoldPredictedValidationLabelsIter.append(classifier.predict_hdf5(DATASET, usedIndices=validationIndices, viewsIndices=viewsIndices)) - - kFoldPredictionTimeIter.append(time.time() - extractionTime - t_start - learningTime) - logging.info("\tDone: \t Fold number " + str(foldIdx + 1)) - kFoldPredictedTrainLabels.append(kFoldPredictedTrainLabelsIter) - kFoldPredictedTestLabels.append(kFoldPredictedTestLabelsIter) - kFoldPredictedValidationLabels.append(kFoldPredictedValidationLabelsIter) - kFoldLearningTime.append(kFoldLearningTimeIter) - kFoldPredictionTime.append(kFoldPredictionTimeIter) - kFoldClassifier.append(kFoldClassifierIter) - ikFolds.append(kFolds) + classifier = searchBestSettings(DATASET, CL_type, metrics, viewsIndices=viewsIndices, usedIndices=learningIndices, kFolds=kFolds, searchingTool=gridSearch, nIter=1, **classificationKWARGS) + classifier.fit_hdf5(DATASET, trainIndices=learningIndices, viewsIndices=viewsIndices) + trainLabels = classifier.predict_hdf5(DATASET, usedIndices=learningIndices, viewsIndices=viewsIndices) + testLabels = classifier.predict_hdf5(DATASET, usedIndices=validationIndices, viewsIndices=viewsIndices) + trainLabelsIterations.append(trainLabels) + testLabelsIterations.append(testLabels) ivalidationIndices.append(validationIndices) + classifiersIterations.append(classifier) + logging.info("Done:\t Classification") + classificationTime = time.time() - t_start - logging.info("Done:\t Classification") logging.info("Info:\t Time for Classification: " + str(int(classificationTime)) + "[s]") logging.info("Start:\t Result Analysis for " + CL_type) - times = (extractionTime, kFoldLearningTime, kFoldPredictionTime, classificationTime) + times = (extractionTime, classificationTime) - stringAnalysis, imagesAnalysis, metricsScores = analysisModule.execute(kFoldClassifier, kFoldPredictedTrainLabels, - kFoldPredictedTestLabels, kFoldPredictedValidationLabels, - DATASET, classificationKWARGS, learningRate, LABELS_DICTIONARY, - views, nbCores, times, ikFolds, name, 
nbFolds, - ivalidationIndices, gridSearch, nIter, metrics, statsIter, viewsIndices) + stringAnalysis, imagesAnalysis, metricsScores = analysisModule.execute(classifiersIterations, trainLabelsIterations, + testLabelsIterations, DATASET, + classificationKWARGS, learningRate, + LABELS_DICTIONARY,views, nbCores, times, + name, nbFolds, ivalidationIndices, + gridSearch, nIter, metrics, statsIter, + viewsIndices) labelsSet = set(LABELS_DICTIONARY.values()) logging.info(stringAnalysis) featureString = "-".join(views) diff --git a/Code/MonoMutliViewClassifiers/Multiview/Fusion/Fusion.py b/Code/MonoMutliViewClassifiers/Multiview/Fusion/Fusion.py index d30c4be38d3059731179d98a475da8a5eda53f2b..62c9d41a6c74849a4a5d730f91c4313bd4cd38c1 100644 --- a/Code/MonoMutliViewClassifiers/Multiview/Fusion/Fusion.py +++ b/Code/MonoMutliViewClassifiers/Multiview/Fusion/Fusion.py @@ -26,6 +26,15 @@ def makeMonoviewData_hdf5(DATASET, weights=None, usedIndices=None, viewsIndices= return monoviewData +def genParamsSets(classificationKWARGS, nIter=1): + fusionTypeName = classificationKWARGS["fusionType"] + fusionTypePackage = globals()[fusionTypeName+"Package"] + fusionMethodModuleName = classificationKWARGS["fusionMethod"] + fusionMethodModule = getattr(fusionTypePackage, fusionMethodModuleName) + fusionMethodConfig = fusionMethodModule.genParamsSets(classificationKWARGS, nIter=nIter) + return fusionMethodConfig + + def gridSearch_hdf5(DATASET, viewsIndices, classificationKWARGS, learningIndices, metric=None, nIter=30): if type(viewsIndices)==type(None): viewsIndices = np.arange(DATASET.get("Metadata").attrs["nbView"]) @@ -56,7 +65,7 @@ def gridSearch_hdf5(DATASET, viewsIndices, classificationKWARGS, learningIndices class Fusion: - def __init__(self, NB_VIEW, DATASET_LENGTH, CLASS_LABELS, NB_CORES=1,**kwargs): + def __init__(self, NB_CORES=1,**kwargs): fusionType = kwargs['fusionType'] fusionMethod = kwargs['fusionMethod'] fusionTypePackage = globals()[fusionType+"Package"] @@ -66,16 +75,20 @@ class Fusion: classifierKWARGS = dict((key, value) for key, value in kwargs.iteritems() if key not in ['fusionType', 'fusionMethod']) self.classifier = fusionMethodClass(NB_CORES=nbCores, **classifierKWARGS) + def setParams(self, paramsSet): + self.classifier.setParams(paramsSet) + + def fit_hdf5(self, DATASET, trainIndices=None, viewsIndices=None): self.classifier.fit_hdf5(DATASET, trainIndices=trainIndices, viewsIndices=viewsIndices) - def fit(self, DATASET, CLASS_LABELS, DATASET_LENGTH, NB_VIEW, NB_CLASS, NB_CORES, trainArguments): - fusionType, fusionMethod, fusionConfig, monoviewClassifier, monoviewClassifierConfig = trainArguments - fusionTypeModule = globals()[fusionType] # Early/late fusion - trainFusion = getattr(fusionTypeModule, fusionMethod+"Train") # linearWeighted for example - classifier = trainFusion(DATASET, CLASS_LABELS, DATASET_LENGTH, NB_VIEW, monoviewClassifier, - monoviewClassifierConfig, fusionConfig) - return fusionType, fusionMethod, classifier + # def fit(self, DATASET, CLASS_LABELS, DATASET_LENGTH, NB_VIEW, NB_CLASS, NB_CORES, trainArguments): + # fusionType, fusionMethod, fusionConfig, monoviewClassifier, monoviewClassifierConfig = trainArguments + # fusionTypeModule = globals()[fusionType] # Early/late fusion + # trainFusion = getattr(fusionTypeModule, fusionMethod+"Train") # linearWeighted for example + # classifier = trainFusion(DATASET, CLASS_LABELS, DATASET_LENGTH, NB_VIEW, monoviewClassifier, + # monoviewClassifierConfig, fusionConfig) + # return fusionType, fusionMethod, classifier def 
predict_hdf5(self, DATASET, usedIndices=None, viewsIndices=None): if usedIndices == None: @@ -97,12 +110,12 @@ class Fusion: predictedLabels = [] return predictedLabels - def predict(self, DATASET, classifier, NB_CLASS): - fusionType, fusionMethod, fusionClassifier = classifier - fusionType = globals()[fusionType] # Early/late fusion - predictFusion = getattr(fusionType, fusionMethod+"Predict") # linearWeighted for example - predictedLabels = predictFusion(DATASET, fusionClassifier) - return predictedLabels + # def predict(self, DATASET, classifier, NB_CLASS): + # fusionType, fusionMethod, fusionClassifier = classifier + # fusionType = globals()[fusionType] # Early/late fusion + # predictFusion = getattr(fusionType, fusionMethod+"Predict") # linearWeighted for example + # predictedLabels = predictFusion(DATASET, fusionClassifier) + # return predictedLabels diff --git a/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/EarlyFusionPackage/WeightedLinear.py b/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/EarlyFusionPackage/WeightedLinear.py index 31bdcc3596bbc70a554db1ec90440ae24637468f..09c6cae91958e39f3120ec4eb45f9d7a46fbf6f4 100644 --- a/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/EarlyFusionPackage/WeightedLinear.py +++ b/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/EarlyFusionPackage/WeightedLinear.py @@ -4,6 +4,16 @@ import numpy as np from sklearn.metrics import accuracy_score +def genParamsSets(classificationKWARGS, nIter=1): + nbView = classificationKWARGS["nbView"] + paramsSets = [] + for _ in range(nIter): + randomWeightsArray = np.random.random_sample(nbView) + normalizedArray = randomWeightsArray/np.sum(randomWeightsArray) + paramsSets.append([normalizedArray]) + return paramsSets + + def gridSearch(DATASET, classificationKWARGS, trainIndices, nIter=30, viewsIndices=None): if type(viewsIndices)==type(None): viewsIndices = np.arange(DATASET.get("Metadata").attrs["nbView"]) @@ -43,6 +53,9 @@ class WeightedLinear(EarlyFusionClassifier): NB_CORES=self.nbCores, #**self.monoviewClassifiersConfig) **self.monoviewClassifiersConfig) + def setParams(self, paramsSet): + self.weights = paramsSet[0] + def predict_hdf5(self, DATASET, usedIndices=None, viewsIndices=None): if type(viewsIndices)==type(None): viewsIndices = np.arange(DATASET.get("Metadata").attrs["nbView"]) diff --git a/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/BayesianInference.py b/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/BayesianInference.py index c9ff653f9293883f00e8d58d475e7c28c4835cb4..fc03ff7d80c849c349af418f0dfd4c93f5129a32 100644 --- a/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/BayesianInference.py +++ b/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/BayesianInference.py @@ -4,6 +4,16 @@ import numpy as np from sklearn.metrics import accuracy_score from utils.Dataset import getV + +def genParamsSets(classificationKWARGS, nIter=1): + nbView = classificationKWARGS["nbView"] + paramsSets = [] + for _ in range(nIter): + randomWeightsArray = np.random.random_sample(nbView) + normalizedArray = randomWeightsArray/np.sum(randomWeightsArray) + paramsSets.append([normalizedArray]) + return paramsSets + def gridSearch(DATASET, classificationKWARGS, trainIndices, nIter=30, viewsIndices=None): if type(viewsIndices)==type(None): viewsIndices = np.arange(DATASET.get("Metadata").attrs["nbView"]) @@ -29,7 +39,11 @@ class BayesianInference(LateFusionClassifier): def __init__(self, 
NB_CORES=1, **kwargs): LateFusionClassifier.__init__(self, kwargs['classifiersNames'], kwargs['classifiersConfigs'], NB_CORES=NB_CORES) - self.weights = np.array(map(float, kwargs['fusionMethodConfig'][0])) + + # self.weights = np.array(map(float, kwargs['fusionMethodConfig'][0])) + self.weights = None # To be modified !! + def setParams(self, paramsSet): + self.weights = paramsSet[0] def predict_hdf5(self, DATASET, usedIndices=None, viewsIndices=None): if type(viewsIndices)==type(None): diff --git a/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/MajorityVoting.py b/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/MajorityVoting.py index 4866cc05a33ed141be50a8c84b461149c23aeeef..4dacd67e43e71dfa63e0c8a597b50215c8b955e8 100644 --- a/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/MajorityVoting.py +++ b/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/MajorityVoting.py @@ -5,6 +5,15 @@ from sklearn.metrics import accuracy_score from utils.Dataset import getV +def genParamsSets(classificationKWARGS, nIter=1): + nbView = classificationKWARGS["nbView"] + paramsSets = [] + for _ in range(nIter): + randomWeightsArray = np.random.random_sample(nbView) + normalizedArray = randomWeightsArray/np.sum(randomWeightsArray) + paramsSets.append([normalizedArray]) + return paramsSets + def gridSearch(DATASET, classificationKWARGS, trainIndices, nIter=30, viewsIndices=None): if type(viewsIndices)==type(None): viewsIndices = np.arange(DATASET.get("Metadata").attrs["nbView"]) @@ -32,6 +41,9 @@ class MajorityVoting(LateFusionClassifier): NB_CORES=NB_CORES) self.weights = np.array(map(float, kwargs['fusionMethodConfig'][0])) + def setParams(self, paramsSet): + self.weights = paramsSet[0] + def predict_hdf5(self, DATASET, usedIndices=None, viewsIndices=None): if type(viewsIndices)==type(None): viewsIndices = np.arange(DATASET.get("Metadata").attrs["nbView"]) diff --git a/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SCMForLinear.py b/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SCMForLinear.py index 6d9bc56403a6c4978bf731b0ac29aac4a7c05b0a..08e5e5e4a5e771840dcac875904b223d8e81d0e7 100644 --- a/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SCMForLinear.py +++ b/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SCMForLinear.py @@ -14,6 +14,18 @@ from sklearn.metrics import accuracy_score import itertools +def genParamsSets(classificationKWARGS, nIter=1): + nbView = classificationKWARGS["nbView"] + paramsSets = [] + for _ in range(nIter): + max_attributes = random.randint(1, 20) + p = random.random() + model = random.choice(["conjunction", "disjunction"]) + order = random.randint(1,nbView) + paramsSets.append([p, max_attributes, model, order]) + return paramsSets + + def gridSearch(DATASET, classificationKWARGS, trainIndices, nIter=30, viewsIndices=None): if type(viewsIndices)==type(None): viewsIndices = np.arange(DATASET.get("Metadata").attrs["nbView"]) @@ -44,6 +56,16 @@ class SCMForLinear(LateFusionClassifier): NB_CORES=NB_CORES) self.SCMClassifier = None self.config = kwargs['fusionMethodConfig'][0] + self.p = None + self.maxAttributes = None + self.order = None + self.modelType = None + + def setParams(self, paramsSet): + self.p = paramsSet[0] + self.maxAttributes = paramsSet[1] + self.modelType = paramsSet[2] + self.order = paramsSet[3] def fit_hdf5(self, DATASET, trainIndices=None, viewsIndices=None): if
type(viewsIndices)==type(None): @@ -84,13 +106,17 @@ class SCMForLinear(LateFusionClassifier): def SCMForLinearFusionFit(self, DATASET, usedIndices=None, viewsIndices=None): if type(viewsIndices)==type(None): viewsIndices = np.arange(DATASET.get("Metadata").attrs["nbView"]) - p = float(self.config[0]) - maxAttributes = int(self.config[1]) - modelType = self.config[2] - self.order = self.config[3] + if self.p is None: + self.p = float(self.config[0]) + if self.maxAttributes is None: + self.maxAttributes = int(self.config[1]) + if self.modelType is None: + self.modelType = self.config[2] + if self.order is None: + self.order = self.config[3] nbView = len(viewsIndices) - self.SCMClassifier = pyscm.scm.SetCoveringMachine(p=p, max_attributes=maxAttributes, model_type=modelType, verbose=False) + self.SCMClassifier = pyscm.scm.SetCoveringMachine(p=self.p, max_attributes=self.maxAttributes, model_type=self.modelType, verbose=False) monoViewDecisions = np.zeros((len(usedIndices), nbView), dtype=int) for index, viewIndex in enumerate(viewsIndices): monoViewDecisions[:, index] = self.monoviewClassifiers[index].predict( diff --git a/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SVMForLinear.py b/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SVMForLinear.py index e75b23130428cc46e5c00670b38192f5306fd840..066cf3d1470a2c94e16388356417edf7e367d770 100644 --- a/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SVMForLinear.py +++ b/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SVMForLinear.py @@ -6,6 +6,13 @@ from sklearn.svm import SVC from utils.Dataset import getV +def genParamsSets(classificationKWARGS, nIter=1): + nbView = classificationKWARGS["nbView"] + paramsSets = [] + for _ in range(nIter): + paramsSets.append([]) + return paramsSets + def gridSearch(DATASET, classificationKWARGS, trainIndices, nIter=30, viewsIndices=None): return None @@ -31,6 +38,9 @@ class SVMForLinear(LateFusionClassifier): enumerate(self.monoviewClassifiersConfigs[index])))) self.SVMForLinearFusionFit(DATASET, usedIndices=trainIndices, viewsIndices=viewsIndices) + def setParams(self, paramsSet): + pass + def predict_hdf5(self, DATASET, usedIndices=None, viewsIndices=None): if type(viewsIndices)==type(None): viewsIndices = np.arange(DATASET.get("Metadata").attrs["nbView"]) diff --git a/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/WeightedLinear.py b/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/WeightedLinear.py index 780560e3a6b2855f6447072982f8d0119ba97c07..c8079e59e64f9228b81c4b9917dedccd65cd57d1 100644 --- a/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/WeightedLinear.py +++ b/Code/MonoMutliViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/WeightedLinear.py @@ -5,6 +5,16 @@ from sklearn.metrics import accuracy_score from utils.Dataset import getV +def genParamsSets(classificationKWARGS, nIter=1): + nbView = classificationKWARGS["nbView"] + paramsSets = [] + for _ in range(nIter): + randomWeightsArray = np.random.random_sample(nbView) + normalizedArray = randomWeightsArray/np.sum(randomWeightsArray) + paramsSets.append([normalizedArray]) + return paramsSets + + def gridSearch(DATASET, classificationKWARGS, trainIndices, nIter=30, viewsIndices=None): if type(viewsIndices)==type(None): viewsIndices = np.arange(DATASET.get("Metadata").attrs["nbView"]) @@ -35,6 +45,9 @@ class WeightedLinear(LateFusionClassifier): else: self.weights = 
np.array(map(float, kwargs['fusionMethodConfig'][0])) + def setParams(self, paramsSet): + self.weights = paramsSet[0] + def predict_hdf5(self, DATASET, usedIndices=None, viewsIndices=None): if type(viewsIndices)==type(None): viewsIndices = np.arange(DATASET.get("Metadata").attrs["nbView"]) diff --git a/Code/MonoMutliViewClassifiers/Multiview/GetMultiviewDb.py b/Code/MonoMutliViewClassifiers/Multiview/GetMultiviewDb.py index 34257b61b5ac1fd3f7755091e6d6ab6d50777d54..76b2af6aeb56528847538bd8d714cbebc9dafe45 100644 --- a/Code/MonoMutliViewClassifiers/Multiview/GetMultiviewDb.py +++ b/Code/MonoMutliViewClassifiers/Multiview/GetMultiviewDb.py @@ -13,7 +13,7 @@ __author__ = "Baptiste Bauvin" __status__ = "Prototype" # Production, Development, Prototype -def makeMeNoisy(viewData, percentage=5): +def makeMeNoisy(viewData, percentage=25): viewData = viewData.astype(bool) nbNoisyCoord = int(percentage/100.0*viewData.shape[0]*viewData.shape[1]) rows = range(viewData.shape[0]) @@ -26,7 +26,7 @@ def makeMeNoisy(viewData, percentage=5): return noisyViewData -def getPlausibleDBhdf5(features, pathF, name , NB_CLASS, LABELS_NAME, nbView=10, nbClass=2, datasetLength=500): +def getPlausibleDBhdf5(features, pathF, name , NB_CLASS, LABELS_NAME, nbView=3, nbClass=2, datasetLength=500): nbFeatures = 150 datasetFile = h5py.File(pathF+"Plausible.hdf5", "w") CLASS_LABELS = np.array([0 for i in range(datasetLength/2)]+[1 for i in range(datasetLength/2)]) diff --git a/Code/MonoMutliViewClassifiers/Multiview/Mumbo/Classifiers/SubSampling.py b/Code/MonoMutliViewClassifiers/Multiview/Mumbo/Classifiers/SubSampling.py index 52e4ad78b14ade1daaa32d5e1ffff71c15e9dd86..4dcd4ffeb747744687b85f728b1c77c668606ea9 100644 --- a/Code/MonoMutliViewClassifiers/Multiview/Mumbo/Classifiers/SubSampling.py +++ b/Code/MonoMutliViewClassifiers/Multiview/Mumbo/Classifiers/SubSampling.py @@ -16,7 +16,7 @@ def isUseful(nbTrainingExamples, index, CLASS_LABELS, labelDict): def subSample(data, labels, subSampling, weights=None): - if weights == None: + if weights is None: weights = np.ones(len(labels))/len(labels) nbExamples = len(labels) labelSupports, labelDict = getLabelSupports(labels) diff --git a/Code/MonoMutliViewClassifiers/Multiview/Mumbo/Mumbo.py b/Code/MonoMutliViewClassifiers/Multiview/Mumbo/Mumbo.py index 59809434b82ebacc36eb482d21399a6828db4c33..b4c7d5a01c456b768e78293a402cd64167685195 100644 --- a/Code/MonoMutliViewClassifiers/Multiview/Mumbo/Mumbo.py +++ b/Code/MonoMutliViewClassifiers/Multiview/Mumbo/Mumbo.py @@ -61,49 +61,51 @@ def gridSearch_hdf5(DATASET, viewIndices, classificationKWARGS, learningIndices, class Mumbo: - def __init__(self, NB_VIEW, DATASET_LENGTH, CLASS_LABELS, NB_CORES=1,**kwargs): + def __init__(self, NB_CORES=1, **kwargs): self.maxIter = kwargs["maxIter"] self.minIter = kwargs["minIter"] self.threshold = kwargs["threshold"] self.classifiersNames = kwargs["classifiersNames"] self.classifiersConfigs = kwargs["classifiersConfigs"] - nbClass = len(set(CLASS_LABELS)) + nbView = kwargs["nbView"] + self.edges = np.zeros((self.maxIter, nbView)) + self.alphas = np.zeros((self.maxIter, nbView)) + self.generalAlphas = np.zeros(self.maxIter) + self.nbCores = NB_CORES + self.iterIndex = 0 + self.bestClassifiers = [] + self.bestViews = np.zeros(self.maxIter, dtype=int) + self.averageAccuracies = np.zeros((self.maxIter, nbView)) + self.iterAccuracies = np.zeros(self.maxIter) + + def initDataDependant(self, datasetLength, nbView, nbClass, labels): self.costMatrices = np.array([ np.array([ np.array([ - np.array([1 if 
CLASS_LABELS[exampleIndice] != classe + np.array([1 if labels[exampleIndice] != classe else -(nbClass - 1) for classe in range(nbClass) - ]) for exampleIndice in range(DATASET_LENGTH) - ]) for viewIndice in range(NB_VIEW)]) + ]) for exampleIndice in range(datasetLength) + ]) for viewIndice in range(nbView)]) if iteration == 0 - else np.zeros((NB_VIEW, DATASET_LENGTH, nbClass)) + else np.zeros((nbView, datasetLength, nbClass)) for iteration in range(self.maxIter + 1) ]) self.generalCostMatrix = np.array([ np.array([ - np.array([1 if CLASS_LABELS[exampleIndice] != classe + np.array([1 if labels[exampleIndice] != classe else -(nbClass - 1) for classe in range(nbClass) - ]) for exampleIndice in range(DATASET_LENGTH) + ]) for exampleIndice in range(datasetLength) ]) for iteration in range(self.maxIter) ]) - self.fs = np.zeros((self.maxIter, NB_VIEW, DATASET_LENGTH, nbClass)) - self.ds = np.zeros((self.maxIter, NB_VIEW, DATASET_LENGTH)) - self.edges = np.zeros((self.maxIter, NB_VIEW)) - self.alphas = np.zeros((self.maxIter, NB_VIEW)) - self.predictions = np.zeros((self.maxIter, NB_VIEW, DATASET_LENGTH)) - self.generalAlphas = np.zeros(self.maxIter) - self.generalFs = np.zeros((self.maxIter, DATASET_LENGTH, nbClass)) - self.nbCores = NB_CORES - self.iterIndex = 0 - self.bestClassifiers = [] - self.bestViews = np.zeros(self.maxIter, dtype=int) - self.averageAccuracies = np.zeros((self.maxIter, NB_VIEW)) - self.iterAccuracies = np.zeros(self.maxIter) - + self.fs = np.zeros((self.maxIter, nbView, datasetLength, nbClass)) + self.ds = np.zeros((self.maxIter, nbView, datasetLength)) + self.predictions = np.zeros((self.maxIter, nbView, datasetLength)) + self.generalFs = np.zeros((self.maxIter, datasetLength, nbClass)) def fit_hdf5(self, DATASET, trainIndices=None, viewsIndices=None): + # Initialization if not trainIndices: trainIndices = range(DATASET.get("Metadata").attrs["datasetLength"]) @@ -113,7 +115,7 @@ class Mumbo: NB_VIEW = len(viewsIndices) DATASET_LENGTH = len(trainIndices) LABELS = DATASET["Labels"][trainIndices] - + self.initDataDependant(DATASET_LENGTH, NB_VIEW, NB_CLASS, LABELS) # Learning isStabilized=False self.iterIndex = 0 diff --git a/Code/MonoMutliViewClassifiers/Multiview/Mumbo/analyzeResults.py b/Code/MonoMutliViewClassifiers/Multiview/Mumbo/analyzeResults.py index 427c7c60cc05e5499ca305ec2c12a98b96ae3ac8..602d1e98e636d5d8aeb09069fed00758e01d539d 100644 --- a/Code/MonoMutliViewClassifiers/Multiview/Mumbo/analyzeResults.py +++ b/Code/MonoMutliViewClassifiers/Multiview/Mumbo/analyzeResults.py @@ -25,9 +25,8 @@ def findMainView(bestViews): return mainView -def plotAccuracyByIter(trainAccuracy, testAccuracy, validationAccuracy, NB_ITER, bestViews, features, classifierAnalysis): - x = range(NB_ITER) - mainView = findMainView(bestViews) +def plotAccuracyByIter(scoresOnTainByIter, scoresOnTestByIter, features, classifierAnalysis): + x = range(len(scoresOnTainByIter)) figure = plt.figure() ax1 = figure.add_subplot(111) axes = figure.gca() @@ -35,9 +34,8 @@ def plotAccuracyByIter(trainAccuracy, testAccuracy, validationAccuracy, NB_ITER, titleString = "" for view, classifierConfig in zip(features, classifierAnalysis): titleString += "\n" + view + " : " + classifierConfig - titleString += "\nBest view = " + features[int(mainView)] - ax1.set_title("Accuracy depending on iteration", fontsize=20) + ax1.set_title("Score depending on iteration", fontsize=20) plt.text(0.5, 1.08, titleString, horizontalalignment='center', fontsize=8, @@ -45,9 +43,8 @@ def plotAccuracyByIter(trainAccuracy, 
testAccuracy, validationAccuracy, NB_ITER, figure.subplots_adjust(top=0.8) ax1.set_xlabel("Iteration Index") ax1.set_ylabel("Accuracy") - ax1.plot(x, trainAccuracy, c='red', label='Train') - ax1.plot(x, testAccuracy, c='black', label='Test') - ax1.plot(x, validationAccuracy, c='blue', label='Validation') + ax1.plot(x, scoresOnTainByIter, c='red', label='Train') + ax1.plot(x, scoresOnTestByIter, c='black', label='Test') ax1.legend(loc='lower center', ncol=3, fancybox=True, shadow=True) @@ -98,13 +95,13 @@ def getAlgoConfig(initKWARGS, NB_CORES, viewNames, gridSearch, nIter, times): minIter = initKWARGS["minIter"] threshold = initKWARGS["threshold"] classifiersConfig = initKWARGS["classifiersConfigs"] - extractionTime, kFoldLearningTime, kFoldPredictionTime, classificationTime = times - kFoldLearningTime = [np.mean(np.array([kFoldLearningTime[statsIterIndex][foldIdx] - for statsIterIndex in range(len(kFoldLearningTime))])) - for foldIdx in range(len(kFoldLearningTime[0]))] - kFoldPredictionTime = [np.mean(np.array([kFoldPredictionTime[statsIterIndex][foldIdx] - for statsIterIndex in range(len(kFoldPredictionTime))])) - for foldIdx in range(len(kFoldPredictionTime[0]))] + extractionTime, classificationTime = times + # kFoldLearningTime = [np.mean(np.array([kFoldLearningTime[statsIterIndex][foldIdx] + # for statsIterIndex in range(len(kFoldLearningTime))])) + # for foldIdx in range(len(kFoldLearningTime[0]))] + # kFoldPredictionTime = [np.mean(np.array([kFoldPredictionTime[statsIterIndex][foldIdx] + # for statsIterIndex in range(len(kFoldPredictionTime))])) + # for foldIdx in range(len(kFoldPredictionTime[0]))] weakClassifierConfigs = [getattr(globals()[classifierName], 'getConfig')(classifiersConfig) for classifiersConfig, classifierName in zip(classifiersConfig, classifierNames)] @@ -123,18 +120,79 @@ def getAlgoConfig(initKWARGS, NB_CORES, viewNames, gridSearch, nIter, times): hms(seconds=int(extractionTime))) + "\n\t" row_format = "{:>15}" * 3 algoString += row_format.format("", *['Learn', 'Prediction']) - for index, (learningTime, predictionTime) in enumerate(zip(kFoldLearningTime, kFoldPredictionTime)): - algoString += '\n\t' - algoString += row_format.format("Fold " + str(index + 1), *[str(hms(seconds=int(learningTime))), - str(hms(seconds=int(predictionTime)))]) + # for index, (learningTime, predictionTime) in enumerate(zip(kFoldLearningTime, kFoldPredictionTime)): + # algoString += '\n\t' + # algoString += row_format.format("Fold " + str(index + 1), *[str(hms(seconds=int(learningTime))), + # str(hms(seconds=int(predictionTime)))]) algoString += '\n\t' - algoString += row_format.format("Total", *[str(hms(seconds=int(sum(kFoldLearningTime)))), - str(hms(seconds=int(sum(kFoldPredictionTime))))]) + # algoString += row_format.format("Total", *[str(hms(seconds=int(sum(kFoldLearningTime)))), + # str(hms(seconds=int(sum(kFoldPredictionTime))))]) algoString += "\n\tSo a total classification time of " + str(hms(seconds=int(classificationTime))) + ".\n\n" algoString += "\n\n" return algoString, classifierAnalysis +def getReport(classifiersIterations, CLASS_LABELS, iterationValidationIndices, DATASET, trainLabelsIterations, + testLabelsIterations, statsIter, viewIndices, metric): + nbView = len(viewIndices) + viewsDict = dict((viewIndex, index) for index, viewIndex in enumerate(viewIndices)) + DATASET_LENGTH = DATASET.get("Metadata").attrs["datasetLength"] + NB_CLASS = DATASET.get("Metadata").attrs["nbClass"] + metricModule = getattr(Metrics, metric[0]) + if metric[1]!=None: + metricKWARGS = 
dict((index, metricConfig) for index, metricConfig in enumerate(metric[1])) + else: + metricKWARGS = {} + trainScores = [] + testScores = [] + meanAverageAccuraciesIterations =[] + viewsStatsIteration = np.zeros((statsIter, nbView)) + scoresByIterIteration = [] + maxIter = 0 + for statIterIndex in range(statsIter): + predictedTrainLabels = trainLabelsIterations[statIterIndex] + predictedTestLabels = testLabelsIterations[statIterIndex] + validationIndices = iterationValidationIndices[statIterIndex] + learningIndices = [index for index in range(DATASET_LENGTH) if index not in validationIndices] + trainScore = metricModule.score(CLASS_LABELS[learningIndices], predictedTrainLabels) + testScore = metricModule.score(CLASS_LABELS[validationIndices], predictedTestLabels) + trainScores.append(trainScore) + testScores.append(testScore) + mumboClassifier = classifiersIterations[statIterIndex] + if mumboClassifier.iterIndex+1 > maxIter: + maxIter = mumboClassifier.iterIndex + meanAverageAccuraciesIterations.append(np.mean(mumboClassifier.averageAccuracies, axis=0)) + viewsStatsIteration[statIterIndex, :] = np.array([float(list(mumboClassifier.bestViews).count(viewIndex))/ + len(mumboClassifier.bestViews) for viewIndex in range(nbView)]) + PredictedTrainLabelsByIter = mumboClassifier.classifyMumbobyIter_hdf5(DATASET, usedIndices=learningIndices, + NB_CLASS=NB_CLASS) + PredictedTestLabelsByIter = mumboClassifier.classifyMumbobyIter_hdf5(DATASET, usedIndices=validationIndices, + NB_CLASS=NB_CLASS) + scoresByIter = np.zeros((len(PredictedTestLabelsByIter),2)) + for iterIndex,(iterPredictedTrainLabels, iterPredictedTestLabels) in enumerate(zip(PredictedTrainLabelsByIter, PredictedTestLabelsByIter)): + scoresByIter[iterIndex, 0] = metricModule.score(CLASS_LABELS[learningIndices],iterPredictedTrainLabels) + scoresByIter[iterIndex, 1] = metricModule.score(CLASS_LABELS[validationIndices],iterPredictedTestLabels) + scoresByIterIteration.append(scoresByIter) + + scoresOnTainByIter = [np.mean(np.array([scoresByIterIteration[statsIterIndex][iterIndex, 0] + for statsIterIndex in range(statsIter) + if scoresByIterIteration[statsIterIndex].shape[0]>iterIndex])) + for iterIndex in range(maxIter)] + + scoresOnTestByIter = [np.mean(np.array([scoresByIterIteration[statsIterIndex][iterIndex, 1] + for statsIterIndex in range(statsIter) + if scoresByIterIteration[statsIterIndex].shape[0]>iterIndex])) + for iterIndex in range(maxIter)] + + viewsStats = np.mean(viewsStatsIteration, axis=0) + meanAverageAccuracies = np.mean(np.array(meanAverageAccuraciesIterations), axis=0) + totalScoreOnTrain = np.mean(np.array(trainScores)) + totalScoreOnTest = np.mean(np.array(testScores)) + return (totalScoreOnTrain, totalScoreOnTest, meanAverageAccuracies, viewsStats, scoresOnTainByIter, + scoresOnTestByIter) + + + def getClassificationReport(kFolds, kFoldClassifier, CLASS_LABELS, validationIndices, DATASET, kFoldPredictedTrainLabels, kFoldPredictedTestLabels, kFoldPredictedValidationLabels,statsIter, viewIndices): nbView = len(viewIndices) @@ -319,16 +377,15 @@ def printMetricScore(metricScores, metrics): else: metricKWARGS = {} metricScoreString += "\tFor "+metricModule.getConfig(**metricKWARGS)+" : " - metricScoreString += "\n\t\t- Score on train : "+str(metricScores[metric[0]][0]) +" with STD : "+str(metricScores[metric[0]][3]) - metricScoreString += "\n\t\t- Score on test : "+str(metricScores[metric[0]][1]) +" with STD : "+str(metricScores[metric[0]][4]) - metricScoreString += "\n\t\t- Score on validation : "
"+str(metricScores[metric[0]][2]) +" with STD : "+str(metricScores[metric[0]][5]) + metricScoreString += "\n\t\t- Score on train : "+str(metricScores[metric[0]][0]) +" with STD : "+str(metricScores[metric[0]][2]) + metricScoreString += "\n\t\t- Score on test : "+str(metricScores[metric[0]][1]) +" with STD : "+str(metricScores[metric[0]][3]) metricScoreString += "\n\n" return metricScoreString -def getTotalMetricScores(metric, kFoldPredictedTrainLabels, kFoldPredictedTestLabels, - kFoldPredictedValidationLabels, DATASET, validationIndices, kFolds, statsIter): +def getTotalMetricScores(metric, trainLabelsIterations, testLabelsIterations, DATASET, iterationValidationIndices, statsIter): labels = DATASET.get("Labels").value + DATASET_LENGTH = DATASET.get("Metadata").attrs["datasetLength"] metricModule = getattr(Metrics, metric[0]) if metric[1]!=None: metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1])) @@ -338,18 +395,20 @@ def getTotalMetricScores(metric, kFoldPredictedTrainLabels, kFoldPredictedTestLa testScores = [] validationScores = [] for statsIterIndex in range(statsIter): - trainScores.append(np.mean(np.array([metricModule.score([label for index, label in enumerate(labels) if (index not in fold) and (index not in validationIndices[statsIterIndex])], predictedLabels, **metricKWARGS) for fold, predictedLabels in zip(kFolds[statsIterIndex], kFoldPredictedTrainLabels[statsIterIndex])]))) - testScores.append(np.mean(np.array([metricModule.score(labels[fold], predictedLabels, **metricKWARGS) for fold, predictedLabels in zip(kFolds[statsIterIndex], kFoldPredictedTestLabels[statsIterIndex])]))) - validationScores.append(np.mean(np.array([metricModule.score(labels[validationIndices[statsIterIndex]], predictedLabels, **metricKWARGS) for predictedLabels in kFoldPredictedValidationLabels[statsIterIndex]]))) - return [np.mean(np.array(trainScores)), np.mean(np.array(testScores)), np.mean(np.array(validationScores)), np.std(np.array(testScores)),np.std(np.array(validationScores)), np.std(np.array(trainScores))] + validationIndices = iterationValidationIndices[statsIterIndex] + learningIndices = [index for index in range(DATASET_LENGTH) if index not in validationIndices] + trainScores.append(metricModule.score(labels[learningIndices], trainLabelsIterations[statsIterIndex], **metricKWARGS)) + testScores.append(metricModule.score(labels[validationIndices], testLabelsIterations[statsIterIndex], **metricKWARGS)) + return [np.mean(np.array(trainScores)), np.mean(np.array(testScores)), np.std(np.array(trainScores)), + np.std(np.array(testScores))] -def getMetricsScores(metrics, kFoldPredictedTrainLabels, kFoldPredictedTestLabels, - kFoldPredictedValidationLabels, DATASET, validationIndices, kFolds, statsIter): +def getMetricsScores(metrics, trainLabelsIterations, testLabelsIterations, + DATASET, validationIndices, statsIter): metricsScores = {} for metric in metrics: - metricsScores[metric[0]] = getTotalMetricScores(metric, kFoldPredictedTrainLabels, kFoldPredictedTestLabels, - kFoldPredictedValidationLabels, DATASET, validationIndices, kFolds, statsIter) + metricsScores[metric[0]] = getTotalMetricScores(metric, trainLabelsIterations, testLabelsIterations, + DATASET, validationIndices, statsIter) return metricsScores @@ -357,73 +416,41 @@ def getMeanIterations(kFoldClassifierStats, foldIndex): iterations = np.array([kFoldClassifier[foldIndex].iterIndex+1 for kFoldClassifier in kFoldClassifierStats]) return np.mean(iterations) -def execute(kFoldClassifier, 
kFoldPredictedTrainLabels, kFoldPredictedTestLabels, kFoldPredictedValidationLabels, - DATASET, initKWARGS, LEARNING_RATE, LABELS_DICTIONARY, views, NB_CORES, times, kFolds, databaseName, - nbFolds, validationIndices, gridSearch, nIter, metrics, statsIter, viewIndices): +def execute(classifiersIterations, trainLabelsIterations,testLabelsIterations, DATASET,initKWARGS, + LEARNING_RATE,LABELS_DICTIONARY,views, NB_CORES, times,databaseName, nbFolds, validationIndices, gridSearch, + nIter, metrics, statsIter,viewIndices): + CLASS_LABELS = DATASET.get("Labels")[...] - maxIter = initKWARGS["maxIter"] - minIter = initKWARGS["minIter"] - nbView = len(viewIndices) + dbConfigurationString, viewNames = getDBConfig(DATASET, LEARNING_RATE, nbFolds, databaseName, validationIndices, LABELS_DICTIONARY) algoConfigurationString, classifierAnalysis = getAlgoConfig(initKWARGS, NB_CORES, viewNames, gridSearch, nIter, times) - (totalAccuracyOnTrain, totalAccuracyOnTest, totalAccuracyOnValidation, kFoldMeanAverageAccuracies, - kFoldBestViewsStats, kFoldAccuracyOnTrainByIter, kFoldAccuracyOnTestByIter, kFoldAccuracyOnValidationByIter, - kFoldBestViews) = getClassificationReport(kFolds, kFoldClassifier, CLASS_LABELS, validationIndices, DATASET, - kFoldPredictedTrainLabels, kFoldPredictedTestLabels, - kFoldPredictedValidationLabels, statsIter, viewIndices) - nbMinIter = maxIter - nbMaxIter = minIter - for classifiers in kFoldClassifier: - for classifier in classifiers: - if classifier.iterIndex+1<nbMinIter: - nbMinIter = classifier.iterIndex+1 - if classifier.iterIndex+1>nbMaxIter: - nbMaxIter = classifier.iterIndex+1 - formatedAccuracies = {"Train":np.zeros((nbFolds, nbMinIter)), "Test":np.zeros((nbFolds, nbMinIter)), - "Validation":np.zeros((nbFolds, nbMinIter))} - surplusAccuracies = {"Train":{}, "Test":{},"Validation":{}} - for classifierIndex, accuracies in enumerate(kFoldAccuracyOnTestByIter): - formatedAccuracies["Test"][classifierIndex] = np.array(kFoldAccuracyOnTestByIter[classifierIndex][:nbMinIter]) - formatedAccuracies["Train"][classifierIndex] = np.array(kFoldAccuracyOnTrainByIter[classifierIndex][:nbMinIter]) - formatedAccuracies["Validation"][classifierIndex] = np.array(kFoldAccuracyOnValidationByIter[classifierIndex][:nbMinIter]) - if len(accuracies)>nbMinIter: - surplusAccuracies["Train"][classifierIndex] = kFoldAccuracyOnTrainByIter[classifierIndex][nbMinIter:] - surplusAccuracies["Test"][classifierIndex] = kFoldAccuracyOnTestByIter[classifierIndex][nbMinIter:] - surplusAccuracies["Validation"][classifierIndex] = kFoldAccuracyOnValidationByIter[classifierIndex][nbMinIter:] - - - - bestViews = [findMainView(np.array(kFoldBestViews)[:, iterIdx]) for iterIdx in range(nbMinIter)] + + + (totalScoreOnTrain, totalScoreOnTest, meanAverageAccuracies, viewsStats, scoresOnTainByIter, + scoresOnTestByIter) = getReport(classifiersIterations, CLASS_LABELS, validationIndices, DATASET, + trainLabelsIterations, testLabelsIterations, statsIter, viewIndices, metrics[0]) + stringAnalysis = "\t\tResult for Multiview classification with Mumbo" \ - "\n\nAverage accuracy :\n\t-On Train : " + str(totalAccuracyOnTrain) + "\n\t-On Test : " + \ - str(totalAccuracyOnTest) + "\n\t-On Validation : " + str(totalAccuracyOnValidation) + "\n\nAverage "+metrics[0][0]+" :\n\t-On Train : " + str(totalScoreOnTrain) + "\n\t-On Test : " + \ + str(totalScoreOnTest) stringAnalysis += dbConfigurationString stringAnalysis += algoConfigurationString - metricsScores = getMetricsScores(metrics, kFoldPredictedTrainLabels, 
kFoldPredictedTestLabels, - kFoldPredictedValidationLabels, DATASET, validationIndices, kFolds, statsIter) + metricsScores = getMetricsScores(metrics, trainLabelsIterations, testLabelsIterations, + DATASET, validationIndices, statsIter) stringAnalysis += printMetricScore(metricsScores, metrics) - stringAnalysis += "Mean average accuracies and stats for each fold :" - for foldIdx in range(nbFolds): - stringAnalysis += "\n\t- Fold "+str(foldIdx)+", used "+str(getMeanIterations(kFoldClassifier, foldIdx)) - for viewIndex, (meanAverageAccuracy, bestViewStat) in enumerate(zip(kFoldMeanAverageAccuracies[foldIdx], kFoldBestViewsStats[foldIdx])): - stringAnalysis+="\n\t\t- On "+viewNames[viewIndex]+ \ - " : \n\t\t\t- Mean average Accuracy : "+str(meanAverageAccuracy)+ \ - "\n\t\t\t- Percentage of time chosen : "+str(bestViewStat) + stringAnalysis += "Mean average accuracies and stats :" + for viewIndex, (meanAverageAccuracy, bestViewStat) in enumerate(zip(meanAverageAccuracies,viewsStats)): + stringAnalysis+="\n\t- On "+viewNames[viewIndex]+ \ + " : \n\t\t- Mean average Accuracy : "+str(meanAverageAccuracy)+ \ + "\n\t\t- Percentage of time chosen : "+str(bestViewStat) stringAnalysis += "\n\n For each iteration : " - for iterIndex in range(maxIter): - if iterRelevant(iterIndex, kFoldClassifier).any(): - stringAnalysis += "\n\t- Iteration " + str(iterIndex + 1) - for foldIdx in [index for index, value in enumerate(iterRelevant(iterIndex, kFoldClassifier)) if value]: - stringAnalysis += "\n\t\t Fold " + str(foldIdx + 1) + "\n\t\t\tAccuracy on train : " + \ - str(kFoldAccuracyOnTrainByIter[foldIdx][iterIndex]) + '\n\t\t\tAccuracy on test : ' + \ - str(kFoldAccuracyOnTestByIter[foldIdx][iterIndex]) + '\n\t\t\tAccuracy on validation : '+ \ - str(kFoldAccuracyOnValidationByIter[foldIdx][iterIndex]) + '\n\t\t\tSelected View : ' + \ - str(DATASET["View"+str(int(kFoldBestViews[foldIdx][iterIndex]))].attrs["name"]) - - trainAccuracyByIter = list(formatedAccuracies["Train"].mean(axis=0))+modifiedMean(surplusAccuracies["Train"]) - testAccuracyByIter = list(formatedAccuracies["Test"].mean(axis=0))+modifiedMean(surplusAccuracies["Test"]) - validationAccuracyByIter = list(formatedAccuracies["Validation"].mean(axis=0))+modifiedMean(surplusAccuracies["Validation"]) - name, image = plotAccuracyByIter(trainAccuracyByIter, testAccuracyByIter, validationAccuracyByIter, nbMaxIter, - bestViews, views, classifierAnalysis) + for iterIndex in range(len(scoresOnTainByIter)): + stringAnalysis += "\n\t- Iteration " + str(iterIndex + 1) + stringAnalysis += "\n\t\tScore on train : " + \ + str(scoresOnTainByIter[iterIndex]) + '\n\t\tScore on test : ' + \ + str(scoresOnTestByIter[iterIndex]) + + + name, image = plotAccuracyByIter(scoresOnTainByIter, scoresOnTestByIter, views, classifierAnalysis) imagesAnalysis = {name: image} return stringAnalysis, imagesAnalysis, metricsScores diff --git a/Code/MonoMutliViewClassifiers/utils/HyperParameterSearch.py b/Code/MonoMutliViewClassifiers/utils/HyperParameterSearch.py new file mode 100644 index 0000000000000000000000000000000000000000..47d89bb920c6d2fd7cf9d1dbab7d885880702939 --- /dev/null +++ b/Code/MonoMutliViewClassifiers/utils/HyperParameterSearch.py @@ -0,0 +1,203 @@ +import h5py +import numpy as np +import sys +import Multiview +import Metrics + +def searchBestSettings(dataset, classifierName, metrics,viewsIndices=None, usedIndices=None, kFolds=None, searchingTool="gridSearch", nIter=1, **kwargs): + if viewsIndices is None: + viewsIndices = 
range(dataset.get("Metadata").attrs["nbView"])
+    thismodule = sys.modules[__name__]
+    searchingToolMethod = getattr(thismodule, searchingTool)
+    bestSettings = searchingToolMethod(dataset, classifierName, metrics, viewsIndices=viewsIndices, usedIndices=usedIndices, kFolds=kFolds, nIter=nIter, **kwargs)
+    return bestSettings  # or a fully configured classifier?
+
+
+def gridSearch(dataset, classifierName, viewsIndices=None, kFolds=None, nIter=1, **kwargs):
+    # if grid search is selected, a specific set of values should be tested
+    pass
+
+
+def randomizedSearch(dataset, classifierName, metrics, viewsIndices=None, usedIndices=None, kFolds=None, nIter=1, nbCores=1, **classificationKWARGS):
+    if viewsIndices is None:
+        viewsIndices = range(dataset.get("Metadata").attrs["nbView"])
+    metric = metrics[0]
+    metricModule = getattr(Metrics, metric[0])
+    if metric[1] != None:
+        metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1]))
+    else:
+        metricKWARGS = {}
+    if metricModule.getConfig()[-14] == "h":
+        baseScore = -1000.0
+        isBetter = "higher"
+    else:
+        baseScore = 1000.0
+        isBetter = "lower"
+    classifierPackage = getattr(Multiview, classifierName)  # allows calling a module from a string
+    classifierModule = getattr(classifierPackage, classifierName)
+    classifierClass = getattr(classifierModule, classifierName)
+
+    if classifierName != "Mumbo":
+        datasetLength = dataset.get("Metadata").attrs["datasetLength"]
+        paramsSets = classifierModule.genParamsSets(classificationKWARGS, nIter=nIter)
+        bestSettings = []
+        for paramsSet in paramsSets:
+            scores = []
+            for fold in kFolds:
+                fold.sort()
+                trainIndices = [index for index in range(datasetLength) if (index not in fold) and (index in usedIndices)]
+                classifier = classifierClass(NB_CORES=nbCores, **classificationKWARGS)
+                classifier.setParams(paramsSet)
+                classifier.fit_hdf5(dataset, trainIndices=trainIndices, viewsIndices=viewsIndices)
+                trainLabels = classifier.predict_hdf5(dataset, usedIndices=trainIndices, viewsIndices=viewsIndices)
+                testLabels = classifier.predict_hdf5(dataset, usedIndices=fold, viewsIndices=viewsIndices)
+                trainScore = metricModule.score(dataset.get("Labels").value[trainIndices], trainLabels)
+                testScore = metricModule.score(dataset.get("Labels").value[fold], testLabels)
+                scores.append(testScore)
+            crossValScore = np.mean(np.array(scores))
+
+            if isBetter == "higher" and crossValScore > baseScore:
+                baseScore = crossValScore
+                bestSettings = paramsSet
+            if isBetter == "lower" and crossValScore < baseScore:
+                baseScore = crossValScore
+                bestSettings = paramsSet
+        classifier = classifierClass(NB_CORES=nbCores, **classificationKWARGS)
+        classifier.setParams(bestSettings)
+
+    else:
+        bestConfigs, _ = classifierModule.gridSearch_hdf5(dataset, viewsIndices, classificationKWARGS, usedIndices, metric=metric, nIter=nIter)
+        classificationKWARGS["classifiersConfigs"] = bestConfigs
+        classifier = classifierClass(NB_CORES=nbCores, **classificationKWARGS)
+    return classifier
+
+
+def spearMint(dataset, classifierName, viewsIndices=None, kFolds=None, nIter=1, **kwargs):
+    pass
+
+# nohup python ~/dev/git/spearmint/spearmint/main.py .
& + +# import json +# import numpy as np +# import math +# +# from os import system +# from os.path import join +# +# +# def run_kover(dataset, split, model_type, p, max_rules, output_dir): +# outdir = join(output_dir, "%s_%f" % (model_type, p)) +# kover_command = "kover learn " \ +# "--dataset '%s' " \ +# "--split %s " \ +# "--model-type %s " \ +# "--p %f " \ +# "--max-rules %d " \ +# "--max-equiv-rules 10000 " \ +# "--hp-choice cv " \ +# "--random-seed 0 " \ +# "--output-dir '%s' " \ +# "--n-cpu 1 " \ +# "-v" % (dataset, +# split, +# model_type, +# p, +# max_rules, +# outdir) +# +# system(kover_command) +# +# return json.load(open(join(outdir, "results.json")))["cv"]["best_hp"]["score"] +# +# +# def main(job_id, params): +# print params +# +# max_rules = params["MAX_RULES"][0] +# +# species = params["SPECIES"][0] +# antibiotic = params["ANTIBIOTIC"][0] +# split = params["SPLIT"][0] +# +# model_type = params["model_type"][0] +# +# # LS31 +# if species == "saureus": +# dataset_path = "/home/droale01/droale01-ls31/projects/genome_scm/data/earle_2016/saureus/kover_datasets/%s.kover" % antibiotic +# else: +# dataset_path = "/home/droale01/droale01-ls31/projects/genome_scm/genome_scm_paper/data/%s/%s.kover" % (species, antibiotic) +# +# output_path = "/home/droale01/droale01-ls31/projects/genome_scm/manifold_scm/spearmint/vanilla_scm/%s/%s" % (species, antibiotic) +# +# # MacBook +# #dataset_path = "/Volumes/Einstein 1/kover_phylo/datasets/%s/%s.kover" % (species, antibiotic) +# #output_path = "/Volumes/Einstein 1/manifold_scm/version2/%s_spearmint" % antibiotic +# +# return run_kover(dataset=dataset_path, +# split=split, +# model_type=model_type, +# p=params["p"][0], +# max_rules=max_rules, +# output_dir=output_path) +# killall mongod && sleep 1 && rm -r database/* && rm mongo.log* +# mongod --fork --logpath mongo.log --dbpath database +# +# { +# "language" : "PYTHON", +# "experiment-name" : "vanilla_scm_cdiff_azithromycin", +# "polling-time" : 1, +# "resources" : { +# "my-machine" : { +# "scheduler" : "local", +# "max-concurrent" : 5, +# "max-finished-jobs" : 100 +# } +# }, +# "tasks": { +# "resistance" : { +# "type" : "OBJECTIVE", +# "likelihood" : "NOISELESS", +# "main-file" : "spearmint_wrapper", +# "resources" : ["my-machine"] +# } +# }, +# "variables": { +# +# "MAX_RULES" : { +# "type" : "ENUM", +# "size" : 1, +# "options": [10] +# }, +# +# +# "SPECIES" : { +# "type" : "ENUM", +# "size" : 1, +# "options": ["cdiff"] +# }, +# "ANTIBIOTIC" : { +# "type" : "ENUM", +# "size" : 1, +# "options": ["azithromycin"] +# }, +# "SPLIT" : { +# "type" : "ENUM", +# "size" : 1, +# "options": ["split_seed_2"] +# }, +# +# +# "model_type" : { +# "type" : "ENUM", +# "size" : 1, +# "options": ["conjunction", "disjunction"] +# }, +# "p" : { +# "type" : "FLOAT", +# "size" : 1, +# "min" : 0.01, +# "max" : 100 +# } +# } +# } \ No newline at end of file
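
Editor's note: the hunks above converge on one hyper-parameter-search protocol. Each fusion method now exposes genParamsSets(classificationKWARGS, nIter) to sample candidate parameter sets and setParams(paramsSet) to apply one, and the new utils/HyperParameterSearch.searchBestSettings dispatches on --CL_GS_type (default "randomizedSearch") to a function that cross-validates every sampled set over the k-folds and returns a configured classifier. The sketch below is a minimal, self-contained illustration of that interface for a hypothetical weighted late-fusion method; the name DemoWeightedFusion and its fields are illustrative only and are not part of the patch.

import numpy as np


def genParamsSets(classificationKWARGS, nIter=1):
    # Sample nIter candidate parameter sets: one normalized weight vector per set,
    # mirroring the genParamsSets functions added to WeightedLinear / MajorityVoting.
    nbView = classificationKWARGS["nbView"]
    paramsSets = []
    for _ in range(nIter):
        weights = np.random.random_sample(nbView)
        paramsSets.append([weights / np.sum(weights)])
    return paramsSets


class DemoWeightedFusion(object):
    # Hypothetical fusion method showing the two hooks randomizedSearch relies on.
    def __init__(self, NB_CORES=1, **kwargs):
        self.nbCores = NB_CORES
        self.weights = None  # filled in by setParams before fitting

    def setParams(self, paramsSet):
        # paramsSet is one element of the list returned by genParamsSets
        self.weights = paramsSet[0]


# Example: draw 5 candidate settings for a 3-view problem and apply the first one.
candidates = genParamsSets({"nbView": 3}, nIter=5)
clf = DemoWeightedFusion(NB_CORES=1)
clf.setParams(candidates[0])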