diff --git a/Code/MonoMutliViewClassifiers/ExecClassif.py b/Code/MonoMutliViewClassifiers/ExecClassif.py
index 7553e6b81dd5c645c11f7dec47790c4026694f33..3b91eb3169904a8515484fabadacac2c167a88d0 100644
--- a/Code/MonoMutliViewClassifiers/ExecClassif.py
+++ b/Code/MonoMutliViewClassifiers/ExecClassif.py
@@ -13,6 +13,7 @@ from joblib import Parallel, delayed
 import numpy as np
 import math
 import matplotlib
+import sklearn.model_selection
 
 # Import own modules
 import Multiview
@@ -263,7 +264,7 @@ groupStandard.add_argument('--randomState', metavar='INT', action='store', type=
 groupClass = parser.add_argument_group('Classification arguments')
 groupClass.add_argument('--CL_split', metavar='FLOAT', action='store',
                         help='Determine the split between learning and validation sets', type=float,
-                        default=0.7)
+                        default=0.3)
 groupClass.add_argument('--CL_nbFolds', metavar='INT', action='store', help='Number of folds in cross validation',
                         type=int, default=2)
 groupClass.add_argument('--CL_nb_class', metavar='INT', action='store', help='Number of classes, -1 for all', type=int,
@@ -424,6 +425,15 @@ directory = initLogFile(args)
 DATASET, LABELS_DICTIONARY = getDatabase(args.views, args.pathF, args.name, args.CL_nb_class, args.CL_classes,
                                          randomState)
 
+datasetLength = DATASET.get("Metadata").attrs["datasetLength"]
+indices = np.arange(datasetLength)
+trainIndices, testIndices, a, aa = sklearn.model_selection.train_test_split(indices, DATASET.get("Labels").value,
+                                                                            test_size=args.CL_split,
+                                                                            random_state=randomState)
+classificationIndices = (trainIndices, testIndices)
+kFolds = sklearn.model_selection.KFold(n_splits=args.CL_nbFolds, random_state=randomState)
+kFoldsIndices = kFolds.split(trainIndices)
+
 datasetFiles = initMultipleDatasets(args, nbCores)
 
 views, viewsIndices, allViews = initViews(DATASET, args)
@@ -465,7 +475,7 @@ if nbCores > 1:
     nbExperiments = len(argumentDictionaries["Monoview"])
     for stepIndex in range(int(math.ceil(float(nbExperiments) / nbCores))):
         resultsMonoview += (Parallel(n_jobs=nbCores)(
-            delayed(ExecMonoview_multicore)(directory, args.name, labelsNames, args.CL_split, args.CL_nbFolds,
+            delayed(ExecMonoview_multicore)(directory, args.name, labelsNames, classificationIndices, kFolds,
                                             coreIndex, args.type, args.pathF, statsIter, randomState,
                                             hyperParamSearch=hyperParamSearch,
                                             metrics=metrics, nIter=args.CL_GS_iter,
@@ -481,7 +491,7 @@ if nbCores > 1:
 else:
     resultsMonoview += ([ExecMonoview(directory, DATASET.get("View" + str(arguments["viewIndex"])),
                                       DATASET.get("Labels").value, args.name, labelsNames,
-                                      args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF, statsIter, randomState,
+                                      classificationIndices, kFolds, 1, args.type, args.pathF, statsIter, randomState,
                                       hyperParamSearch=hyperParamSearch, metrics=metrics, nIter=args.CL_GS_iter,
                                       **arguments)
                         for arguments in argumentDictionaries["Monoview"]])
@@ -501,7 +511,7 @@ if nbCores > 1:
     nbExperiments = len(argumentDictionaries["Multiview"])
     for stepIndex in range(int(math.ceil(float(nbExperiments) / nbCores))):
         resultsMultiview += Parallel(n_jobs=nbCores)(
-            delayed(ExecMultiview_multicore)(directory, coreIndex, args.name, args.CL_split, args.CL_nbFolds, args.type,
+            delayed(ExecMultiview_multicore)(directory, coreIndex, args.name, classificationIndices, kFolds, args.type,
                                              args.pathF, LABELS_DICTIONARY, statsIter, randomState,
                                              hyperParamSearch=hyperParamSearch,
                                              metrics=metrics, nIter=args.CL_GS_iter,
@@ -509,7 +519,7 @@ if nbCores > 1:
             for coreIndex in range(min(nbCores, nbExperiments - stepIndex * nbCores)))
 else:
     resultsMultiview = [
-        ExecMultiview(directory, DATASET, args.name, args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF,
+        ExecMultiview(directory, DATASET, args.name, classificationIndices, kFolds, 1, args.type, args.pathF,
                       LABELS_DICTIONARY, statsIter, randomState, hyperParamSearch=hyperParamSearch,
                       metrics=metrics, nIter=args.CL_GS_iter, **arguments)
        for arguments in argumentDictionaries["Multiview"]]
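
Note: the split-and-folds setup added above can be exercised on its own. This is a minimal sketch using only public scikit-learn calls; the label vector and seed are placeholders, not project values:

```python
import numpy as np
import sklearn.model_selection

randomState = np.random.RandomState(42)          # placeholder seed
labels = randomState.randint(0, 2, size=100)     # hypothetical label vector
indices = np.arange(len(labels))

# Hold out a test set once, up front. Passing the labels as a second array
# makes the second pair of return values carry the matching label split.
trainIndices, testIndices, trainLabels, testLabels = sklearn.model_selection.train_test_split(
    indices, labels, test_size=0.3, random_state=randomState)

# The KFold object is reusable: each call to .split() yields fresh
# (train, test) index arrays over whatever array is passed in.
# (random_state only takes effect when shuffle=True.)
kFolds = sklearn.model_selection.KFold(n_splits=2)
for foldTrain, foldTest in kFolds.split(trainIndices):
    pass  # a hyper-parameter search would evaluate one fold here
```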
diff --git a/Code/MonoMutliViewClassifiers/Monoview/ExecClassifMonoView.py b/Code/MonoMutliViewClassifiers/Monoview/ExecClassifMonoView.py
index 9a162b00b31746f6d6158c9bea9c94afd0fc34d4..7581d75509d0062329e595d7553a24abd349a10a 100644
--- a/Code/MonoMutliViewClassifiers/Monoview/ExecClassifMonoView.py
+++ b/Code/MonoMutliViewClassifiers/Monoview/ExecClassifMonoView.py
@@ -29,7 +29,7 @@ __status__ = "Prototype" # Production, Development, Prototype
 __date__ = 2016-03-25
 
 
-def ExecMonoview_multicore(directory, name, labelsNames, learningRate, nbFolds, datasetFileIndex, databaseType, path, statsIter, randomState, hyperParamSearch="randomizedSearch",
+def ExecMonoview_multicore(directory, name, labelsNames, classificationIndices, KFolds, datasetFileIndex, databaseType, path, statsIter, randomState, hyperParamSearch="randomizedSearch",
                            metrics=[["accuracy_score", None]], nIter=30, **args):
     DATASET = h5py.File(path+name+str(datasetFileIndex)+".hdf5", "r")
     kwargs = args["args"]
@@ -37,11 +37,11 @@ def ExecMonoview_multicore(directory, name, labelsNames, learningRate, nbFolds,
     neededViewIndex = views.index(kwargs["feat"])
     X = DATASET.get("View"+str(neededViewIndex))
     Y = DATASET.get("Labels").value
-    return ExecMonoview(directory, X, Y, name, labelsNames, learningRate, nbFolds, 1, databaseType, path, statsIter, randomState, hyperParamSearch=hyperParamSearch,
+    return ExecMonoview(directory, X, Y, name, labelsNames, classificationIndices, KFolds, 1, databaseType, path, statsIter, randomState, hyperParamSearch=hyperParamSearch,
                         metrics=metrics, nIter=nIter, **args)
 
 
-def ExecMonoview(directory, X, Y, name, labelsNames, learningRate, nbFolds, nbCores, databaseType, path, statsIter, randomState, hyperParamSearch="randomizedSearch",
+def ExecMonoview(directory, X, Y, name, labelsNames, classificationIndices, KFolds, nbCores, databaseType, path, statsIter, randomState, hyperParamSearch="randomizedSearch",
                  metrics=[["accuracy_score", None]], nIter=30, **args):
     logging.debug("Start:\t Loading data")
     try:
@@ -57,62 +57,63 @@ def ExecMonoview(directory, X, Y, name, labelsNames, learningRate, nbFolds, nbCo
     logging.debug("Done:\t Loading data")
 
     # Determine the Database to extract features
-    logging.debug("Info:\t Classification - Database:" + str(name) + " Feature:" + str(feat) + " train_size:" + str(learningRate) + ", CrossValidation k-folds:" + str(nbFolds) + ", cores:" + str(nbCores)+", algorithm : "+CL_type)
-
-    y_trains = []
-    y_tests = []
-    y_train_preds = []
-    y_test_preds = []
-    for iterationStat in range(statsIter):
-        # Calculate Train/Test data
-        logging.debug("Start:\t Determine Train/Test split"+" for iteration "+str(iterationStat+1))
-        testIndices = MonoviewUtils.splitDataset(Y, nbClass, learningRate, datasetLength, randomState)
-        trainIndices = [i for i in range(datasetLength) if i not in testIndices]
-        X_train = extractSubset(X,trainIndices)
-        X_test = extractSubset(X,testIndices)
-        y_train = Y[trainIndices]
-        y_test = Y[testIndices]
-
-        logging.debug("Info:\t Shape X_train:" + str(X_train.shape) + ", Length of y_train:" + str(len(y_train)))
-        logging.debug("Info:\t Shape X_test:" + str(X_test.shape) + ", Length of y_test:" + str(len(y_test)))
-        logging.debug("Done:\t Determine Train/Test split")
-
-        # Begin Classification RandomForest
-
-        classifierModule = getattr(MonoviewClassifiers, CL_type)
-
-        if hyperParamSearch != "None":
-            classifierGridSearch = getattr(classifierModule, hyperParamSearch)
-            logging.debug("Start:\t RandomSearch best settings with "+str(nIter)+" iterations for "+CL_type)
-            cl_desc = classifierGridSearch(X_train, y_train, randomState, nbFolds=nbFolds, nbCores=nbCores,
-                                           metric=metrics[0], nIter=nIter)
-            clKWARGS = dict((str(index), desc) for index, desc in enumerate(cl_desc))
-            logging.debug("Done:\t RandomSearch best settings")
-        else:
-            clKWARGS = kwargs[kwargs["CL_type"]+"KWARGS"]
-        logging.debug("Start:\t Training")
-        cl_res = classifierModule.fit(X_train, y_train, randomState, NB_CORES=nbCores, **clKWARGS)
-        logging.debug("Done:\t Training")
-
-        logging.debug("Start:\t Predicting")
-        # Stats Result
-        y_train_pred = cl_res.predict(X_train)
-        y_test_pred = cl_res.predict(X_test)
-
-        y_trains.append(y_train)
-        y_train_preds.append(y_train_pred)
-        y_tests.append(y_test)
-        y_test_preds.append(y_test_pred)
-        full_labels = cl_res.predict(X)
-        logging.debug("Done:\t Predicting")
-        t_end = time.time() - t_start
+    logging.debug("Info:\t Classification - Database:" + str(name) + " Feature:" + str(feat) + " train_size:"
+                  + str(len(classificationIndices[0])) + ", CrossValidation k-folds: " + str(KFolds.n_splits)
+                  + ", cores:" + str(nbCores) + ", algorithm : " + CL_type)
+
+    # y_trains = []
+    # y_tests = []
+    # y_train_preds = []
+    # y_test_preds = []
+    trainIndices, testIndices = classificationIndices
+    # for iterationStat in range(statsIter):
+    # Calculate Train/Test data
+    logging.debug("Start:\t Determine Train/Test split")  # +" for iteration "+str(iterationStat+1)
+    # testIndices = MonoviewUtils.splitDataset(Y, nbClass, classificationIndices, datasetLength, randomState)
+    # trainIndices = [i for i in range(datasetLength) if i not in testIndices]
+    X_train = extractSubset(X, trainIndices)
+    X_test = extractSubset(X, testIndices)
+    y_train = Y[trainIndices]
+    y_test = Y[testIndices]
+
+    logging.debug("Info:\t Shape X_train:" + str(X_train.shape) + ", Length of y_train:" + str(len(y_train)))
+    logging.debug("Info:\t Shape X_test:" + str(X_test.shape) + ", Length of y_test:" + str(len(y_test)))
+    logging.debug("Done:\t Determine Train/Test split")
+
+    classifierModule = getattr(MonoviewClassifiers, CL_type)
+
+    if hyperParamSearch != "None":
+        classifierHPSearch = getattr(classifierModule, hyperParamSearch)
+        logging.debug("Start:\t RandomSearch best settings with "+str(nIter)+" iterations for "+CL_type)
+        cl_desc = classifierHPSearch(X_train, y_train, randomState, KFolds=KFolds, nbCores=nbCores,
+                                     metric=metrics[0], nIter=nIter)
+        clKWARGS = dict((str(index), desc) for index, desc in enumerate(cl_desc))
+        logging.debug("Done:\t RandomSearch best settings")
+    else:
+        clKWARGS = kwargs[kwargs["CL_type"]+"KWARGS"]
+    logging.debug("Start:\t Training")
+    cl_res = classifierModule.fit(X_train, y_train, randomState, NB_CORES=nbCores, **clKWARGS)
+    logging.debug("Done:\t Training")
+
+    logging.debug("Start:\t Predicting")
+    # Stats Result
+    y_train_pred = cl_res.predict(X_train)
+    y_test_pred = cl_res.predict(X_test)
+
+    # y_trains.append(y_train)
+    # y_train_preds.append(y_train_pred)
+    # y_tests.append(y_test)
+    # y_test_preds.append(y_test_pred)
+    full_labels = cl_res.predict(X)
+    logging.debug("Done:\t Predicting")
+    t_end = time.time() - t_start
     logging.debug("Info:\t Time for training and predicting: " + str(t_end) + "[s]")
 
     logging.debug("Start:\t Getting Results")
-    stringAnalysis, imagesAnalysis, metricsScores = execute(name, learningRate, nbFolds, nbCores, hyperParamSearch, metrics, nIter, feat, CL_type,
+    stringAnalysis, imagesAnalysis, metricsScores = execute(name, classificationIndices, KFolds, nbCores, hyperParamSearch, metrics, nIter, feat, CL_type,
                                                             clKWARGS, labelsNames, X.shape,
-                                                            y_trains, y_train_preds, y_tests, y_test_preds, t_end, statsIter, randomState)
+                                                            y_train, y_train_pred, y_test, y_test_pred, t_end, statsIter, randomState)
     cl_desc = [value for key, value in sorted(clKWARGS.iteritems())]
     logging.debug("Done:\t Getting Results")
     logging.info(stringAnalysis)
@@ -120,7 +121,7 @@ def ExecMonoview(directory, X, Y, name, labelsNames, learningRate, nbFolds, nbCo
     timestr = time.strftime("%Y%m%d-%H%M%S")
     CL_type_string = CL_type
     outputFileName = directory + timestr + "Results-" + CL_type_string + "-" + labelsString + \
-                     '-learnRate' + str(learningRate) + '-' + name + "-" + feat
+                     '-learnRate' + str(len(classificationIndices[0])) + '-' + name + "-" + feat
 
     outputTextFile = open(outputFileName + '.txt', 'w')
     outputTextFile.write(stringAnalysis)
@@ -131,7 +132,7 @@ def ExecMonoview(directory, X, Y, name, labelsNames, learningRate, nbFolds, nbCo
             if os.path.isfile(outputFileName + imageName + ".png"):
                 for i in range(1,20):
                     testFileName = outputFileName + imageName + "-" + str(i) + ".png"
-                    if os.path.isfile(testFileName )!=True:
+                    if os.path.isfile(testFileName) != True:
                         imagesAnalysis[imageName].savefig(testFileName)
                         break
 
@@ -140,6 +141,7 @@ def ExecMonoview(directory, X, Y, name, labelsNames, learningRate, nbFolds, nbCo
     logging.info("Done:\t Result Analysis")
     viewIndex = args["viewIndex"]
     return viewIndex, [CL_type, cl_desc+[feat], metricsScores, full_labels]
+
 #     # Classification Report with Precision, Recall, F1 , Support
 #     logging.debug("Info:\t Classification report:")
 #     filename = datetime.datetime.now().strftime("%Y_%m_%d") + "-CMV-" + name + "-" + feat + "-Report"
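
Note: the rewritten ExecMonoview body reduces to one precomputed split rather than the old per-iteration resampling loop. A condensed sketch of that flow, with a generic scikit-learn estimator standing in for the project's classifierModule:

```python
import numpy as np
from sklearn.tree import DecisionTreeClassifier  # stand-in for a MonoviewClassifiers module

def run_single_split(X, Y, classificationIndices):
    # classificationIndices is the (trainIndices, testIndices) pair
    # built once in ExecClassif.py and shared by every experiment.
    trainIndices, testIndices = classificationIndices
    X_train, X_test = X[trainIndices], X[testIndices]
    y_train, y_test = Y[trainIndices], Y[testIndices]

    clf = DecisionTreeClassifier().fit(X_train, y_train)
    # One prediction per subset, plus labels for the whole dataset,
    # mirroring y_train_pred / y_test_pred / full_labels above.
    return clf.predict(X_train), clf.predict(X_test), clf.predict(X)
```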
diff --git a/Code/MonoMutliViewClassifiers/Monoview/analyzeResult.py b/Code/MonoMutliViewClassifiers/Monoview/analyzeResult.py
index 1e7e7df4344471ff9cedb7a063e153ba51decbd9..d495a776ec2006f55cd492f605fe1dc7d9a6e87f 100644
--- a/Code/MonoMutliViewClassifiers/Monoview/analyzeResult.py
+++ b/Code/MonoMutliViewClassifiers/Monoview/analyzeResult.py
@@ -4,13 +4,15 @@ import numpy as np
 import MonoviewClassifiers
 import Metrics
 
-def getDBConfigString(name, feat, learningRate, shape, classLabelsNames, nbFolds):
+
+def getDBConfigString(name, feat, classificationIndices, shape, classLabelsNames, KFolds):
+    learningRate = float(len(classificationIndices[0]))/(len(classificationIndices[0])+len(classificationIndices[1]))
     dbConfigString = "Database configuration : \n"
     dbConfigString += "\t- Database name : "+name+"\n"
     dbConfigString += "\t- View name : "+feat+"\t View shape : "+str(shape)+"\n"
-    dbConfigString += "\t- Learning Rate : "+str(learningRate)+"\n"
+    dbConfigString += "\t- Learning Rate : "+str(learningRate) + "\n"
     dbConfigString += "\t- Labels used : "+", ".join(classLabelsNames)+"\n"
-    dbConfigString += "\t- Number of cross validation folds : "+str(nbFolds)+"\n\n"
+    dbConfigString += "\t- Number of cross validation folds : "+str(KFolds.n_splits) + "\n\n"
     return dbConfigString
 
 
@@ -24,14 +26,15 @@ def getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS):
     classifierConfigString += "\n\n"
     return classifierConfigString
 
-def getMetricScore(metric, y_trains, y_train_preds, y_tests, y_test_preds):
+
+def getMetricScore(metric, y_train, y_train_pred, y_test, y_test_pred):
     metricModule = getattr(Metrics, metric[0])
-    if metric[1]!=None:
+    if metric[1] is not None:
         metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1]))
     else:
         metricKWARGS = {}
-    metricScoreTrain = np.mean(np.array([metricModule.score(y_train, y_train_pred) for y_train, y_train_pred in zip(y_trains, y_train_preds)]))
-    metricScoreTest = np.mean(np.array([metricModule.score(y_test, y_test_pred) for y_test, y_test_pred in zip(y_tests, y_test_preds)]))
+    metricScoreTrain = metricModule.score(y_train, y_train_pred)
+    metricScoreTest = metricModule.score(y_test, y_test_pred)
     metricScoreString = "\tFor "+metricModule.getConfig(**metricKWARGS)+" : "
     metricScoreString += "\n\t\t- Score on train : "+str(metricScoreTrain)
     metricScoreString += "\n\t\t- Score on test : "+str(metricScoreTest)
@@ -39,31 +42,29 @@ def getMetricScore(metric, y_trains, y_train_preds, y_tests, y_test_preds):
     return metricScoreString
 
 
-def execute(name, learningRate, nbFolds, nbCores, gridSearch, metrics, nIter, feat, CL_type, clKWARGS, classLabelsNames,
-            shape, y_trains, y_train_preds, y_tests, y_test_preds, time, statsIter, randomState):
+def execute(name, learningRate, KFolds, nbCores, gridSearch, metrics, nIter, feat, CL_type, clKWARGS, classLabelsNames,
+            shape, y_train, y_train_pred, y_test, y_test_pred, time, statsIter, randomState):
     metricsScores = {}
     metricModule = getattr(Metrics, metrics[0][0])
-    trainScores = np.array([metricModule.score(y_train, y_train_pred) for y_train, y_train_pred in zip(y_trains, y_train_preds)])
-    testScores = np.array([metricModule.score(y_test, y_test_pred) for y_test, y_test_pred in zip(y_tests, y_test_preds)])
-    train = np.mean(trainScores)
-    val = np.mean(testScores)
-    stdTrain = np.std(trainScores)
-    stdTest = np.std(testScores)
+    trainScore = metricModule.score(y_train, y_train_pred)
+    testScore = metricModule.score(y_test, y_test_pred)
+    # train = np.mean(trainScores)
+    # val = np.mean(testScores)
+    stdTrain = "nan"  # np.std(trainScores)
+    stdTest = "nan"  # np.std(testScores)
     stringAnalysis = "Classification on "+name+" database for "+feat+" with "+CL_type+", random state is "+str(randomState)+", and "+str(statsIter)+" statistical iterations\n\n"
-    stringAnalysis += metrics[0][0]+" on train : "+str(train)+", with STD : "+str(stdTrain)+"\n"+metrics[0][0]+" on test : "+str(val)+", with STD : "+str(stdTest)+"\n\n"
+    stringAnalysis += metrics[0][0]+" on train : "+str(trainScore)+", with STD : "+str(stdTrain)+"\n"+metrics[0][0]+" on test : "+str(testScore)+", with STD : "+str(stdTest)+"\n\n"
     stringAnalysis += getDBConfigString(name, feat, learningRate, shape, classLabelsNames, KFolds)
     stringAnalysis += getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS)
     for metric in metrics:
-        stringAnalysis+=getMetricScore(metric, y_trains, y_train_preds, y_tests, y_test_preds)
+        stringAnalysis+=getMetricScore(metric, y_train, y_train_pred, y_test, y_test_pred)
         if metric[1]!=None:
             metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1]))
         else:
             metricKWARGS = {}
-        metricsScores[metric[0]] = [np.mean(np.array([getattr(Metrics, metric[0]).score(y_train, y_train_pred) for y_train, y_train_pred in zip(y_trains, y_train_preds)])),
-                                    np.mean(np.array([getattr(Metrics, metric[0]).score(y_test, y_test_pred) for y_test, y_test_pred in zip(y_tests, y_test_preds)])),
-                                    np.std(np.array([getattr(Metrics, metric[0]).score(y_train, y_train_pred) for y_train, y_train_pred in zip(y_trains, y_train_preds)])),
-                                    np.std(np.array([getattr(Metrics, metric[0]).score(y_test, y_test_pred) for y_test, y_test_pred in zip(y_tests, y_test_preds)]))]
-        stringAnalysis += "\n\n Classification took "+ str(hms(seconds=int(time)))
+        metricsScores[metric[0]] = [getattr(Metrics, metric[0]).score(y_train, y_train_pred),
+                                    getattr(Metrics, metric[0]).score(y_test, y_test_pred)]
+    stringAnalysis += "\n\n Classification took " + str(hms(seconds=int(time)))
     imageAnalysis = {}
     return stringAnalysis, imageAnalysis, metricsScores
\ No newline at end of file
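
Note: the learning rate reported by getDBConfigString is the training fraction, |train| / (|train| + |test|); the denominator must be parenthesized, otherwise Python's operator precedence divides by |train| first and then adds |test|. A quick check with hypothetical index arrays:

```python
import numpy as np

trainIndices = np.arange(70)        # hypothetical 70-sample training split
testIndices = np.arange(70, 100)    # hypothetical 30-sample test split
classificationIndices = (trainIndices, testIndices)

learningRate = float(len(classificationIndices[0])) / (len(classificationIndices[0])
                                                       + len(classificationIndices[1]))
assert learningRate == 0.7  # 70 / (70 + 30)
```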
diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/Adaboost.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/Adaboost.py
index 54fa6cd29767d2daa2001dad12aa99a598ad5a38..13ebfa6c84cf1fd534790037fe9e6770e75e9e75 100644
--- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/Adaboost.py
+++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/Adaboost.py
@@ -32,7 +32,7 @@ def getKWARGS(kwargsList):
     return kwargsDict
 
 
-def randomizedSearch(X_train, y_train, randomState, nbFolds=4, metric=["accuracy_score", None], nIter=30, nbCores=1):
+def randomizedSearch(X_train, y_train, randomState, KFolds=4, metric=["accuracy_score", None], nIter=30, nbCores=1):
     pipeline = Pipeline([('classifier', AdaBoostClassifier())])
 
     param= {"classifier__n_estimators": randint(1, 15),
@@ -44,7 +44,7 @@ def randomizedSearch(X_train, y_train, randomState, nbFolds=4, metric=["accuracy
         metricKWARGS = {}
     scorer = metricModule.get_scorer(**metricKWARGS)
     grid = RandomizedSearchCV(pipeline, n_iter=nIter, param_distributions=param, refit=True, n_jobs=nbCores,
-                              scoring=scorer, cv=nbFolds, random_state=randomState)
+                              scoring=scorer, cv=KFolds, random_state=randomState)
     detector = grid.fit(X_train, y_train)
     desc_estimators = [detector.best_params_["classifier__n_estimators"],
                        detector.best_params_["classifier__base_estimator"]]
diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/DecisionTree.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/DecisionTree.py
index a378bb0a5e135296095e4507ab42dc1ef3f8ff17..5700bd0aea81552b1f017008169398be7c06712f 100644
--- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/DecisionTree.py
+++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/DecisionTree.py
@@ -35,7 +35,7 @@ def getKWARGS(kwargsList):
     return kwargsDict
 
 
-def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30):
+def randomizedSearch(X_train, y_train, randomState, KFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30):
     pipeline_DT = Pipeline([('classifier', DecisionTreeClassifier())])
     param_DT = {"classifier__max_depth": randint(1, 30),
                 "classifier__criterion": ["gini", "entropy"],
@@ -47,7 +47,7 @@ def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric
         metricKWARGS = {}
     scorer = metricModule.get_scorer(**metricKWARGS)
     grid_DT = RandomizedSearchCV(pipeline_DT, n_iter=nIter, param_distributions=param_DT, refit=True, n_jobs=nbCores, scoring=scorer,
-                                 cv=nbFolds, random_state=randomState)
+                                 cv=KFolds, random_state=randomState)
     DT_detector = grid_DT.fit(X_train, y_train)
     desc_params = [DT_detector.best_params_["classifier__max_depth"], DT_detector.best_params_["classifier__criterion"],
                    DT_detector.best_params_["classifier__splitter"]]
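
Note: the same cv=KFolds substitution recurs in every classifier module below. scikit-learn's RandomizedSearchCV accepts either an integer fold count or a cross-validation splitter object for cv, so passing the shared KFold instance keeps fold assignments identical across classifiers. A self-contained sketch with placeholder data:

```python
import numpy as np
from scipy.stats import randint
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier

X_train = np.random.rand(60, 5)             # hypothetical data
y_train = np.random.randint(0, 2, size=60)

pipeline = Pipeline([('classifier', AdaBoostClassifier())])
param = {"classifier__n_estimators": randint(1, 15)}
KFolds = KFold(n_splits=4)  # one splitter object shared by all searches

# cv accepts an int or a splitter object; here the KFold instance is reused.
grid = RandomizedSearchCV(pipeline, n_iter=5, param_distributions=param,
                          refit=True, cv=KFolds)
detector = grid.fit(X_train, y_train)
```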
diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/KNN.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/KNN.py
index fd16f1fbe6c5c8cf40b2e2dbbdd82bf89fdd2da4..a3c39983262e8c54c72b3099640c877ea5d03b15 100644
--- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/KNN.py
+++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/KNN.py
@@ -39,7 +39,7 @@ def getKWARGS(kwargsList):
     return kwargsDict
 
 
-def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30 ):
+def randomizedSearch(X_train, y_train, randomState, KFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30):
     pipeline_KNN = Pipeline([('classifier', KNeighborsClassifier())])
     param_KNN = {"classifier__n_neighbors": randint(1, 50),
                  "classifier__weights": ["uniform", "distance"],
@@ -53,7 +53,7 @@ def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric
         metricKWARGS = {}
     scorer = metricModule.get_scorer(**metricKWARGS)
     grid_KNN = RandomizedSearchCV(pipeline_KNN, n_iter=nIter, param_distributions=param_KNN, refit=True, n_jobs=nbCores, scoring=scorer,
-                                  cv=nbFolds, random_state=randomState)
+                                  cv=KFolds, random_state=randomState)
     KNN_detector = grid_KNN.fit(X_train, y_train)
     desc_params = [KNN_detector.best_params_["classifier__n_neighbors"],
                    KNN_detector.best_params_["classifier__weights"],
diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/RandomForest.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/RandomForest.py
index fe95c8dab96d261c8bc93d0da309271a642fb139..76ad16b92c9a089a6be40dfcebc02aeae31a1abe 100644
--- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/RandomForest.py
+++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/RandomForest.py
@@ -35,7 +35,7 @@ def getKWARGS(kwargsList):
     return kwargsDict
 
 
-def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30):
+def randomizedSearch(X_train, y_train, randomState, KFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30):
     pipeline_rf = Pipeline([('classifier', RandomForestClassifier())])
     param_rf = {"classifier__n_estimators": randint(1, 30),
                 "classifier__max_depth": randint(1, 30),
@@ -46,8 +46,8 @@ def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric
     else:
         metricKWARGS = {}
     scorer = metricModule.get_scorer(**metricKWARGS)
-    grid_rf = RandomizedSearchCV(pipeline_rf, n_iter=nIter,param_distributions=param_rf,refit=True,n_jobs=nbCores,
-                                 scoring=scorer,cv=nbFolds, random_state=randomState)
+    grid_rf = RandomizedSearchCV(pipeline_rf, n_iter=nIter, param_distributions=param_rf, refit=True, n_jobs=nbCores,
+                                 scoring=scorer, cv=KFolds, random_state=randomState)
     rf_detector = grid_rf.fit(X_train, y_train)
 
     desc_estimators = [rf_detector.best_params_["classifier__n_estimators"],
diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SCM.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SCM.py
index 06668c7e6b82ff3eeaee3a91ab408c2dbeae7968..62c5b170b0b048a2285acf9f7ca38f3021ccc256 100644
--- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SCM.py
+++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SCM.py
@@ -54,7 +54,7 @@ def getKWARGS(kwargsList):
     return kwargsDict
 
 
-def randomizedSearch(X_train, y_train, randomState, nbFolds=4, metric=["accuracy_score", None], nIter=30, nbCores=1):
+def randomizedSearch(X_train, y_train, randomState, KFolds=None, metric=["accuracy_score", None], nIter=30, nbCores=1):
     metricModule = getattr(Metrics, metric[0])
 
     if metric[1]!=None:
@@ -68,42 +68,44 @@ def randomizedSearch(X_train, y_train, randomState, nbFolds=4, metric=["accuracy
     baseScore = 1000.0
     isBetter = "lower"
     config = []
-    for iterIndex in range(nIter):
-        max_attributes = randomState.randint(1, 20)
-        p = randomState.random_sample()
-        model = randomState.choice(["conjunction", "disjunction"])
-        classifier = pyscm.scm.SetCoveringMachine(p=p, max_attributes=max_attributes, model_type=model, verbose=False)
-        if nbFolds != 1:
-            kFolds = DB.getKFoldIndices(nbFolds, y_train, len(set(y_train)), range(len(y_train)), randomState)
-        else:
-            kFolds = [[], range(len(y_train))]
-        scores = []
-        for foldIdx, fold in enumerate(kFolds):
-            if fold != range(len(y_train)):
-                fold.sort()
-                trainIndices = [index for index in range(len(y_train)) if (index not in fold)]
-                attributeClassification, binaryAttributes, dsetFile, name = transformData(X_train[trainIndices])
-                try:
-                    classifier.fit(binaryAttributes, y_train[trainIndices], X=None, attribute_classifications=attributeClassification, iteration_callback=None)
-
-                    predictedLabels = classifier.predict(X_train[fold])
-                    score = metricModule.score(y_train[fold], predictedLabels)
-                    scores.append(score)
-                except:
-                    pass
-                dsetFile.close()
-                os.remove(name)
-        if scores==[]:
-            score = baseScore
-        else:
-            score = np.mean(np.array(scores))
-
-        if isBetter=="higher" and score>baseScore:
-            baseScore = score
-            config = [max_attributes, p, model]
-        if isBetter=="lower" and score<baseScore:
-            baseScore = score
-            config = [max_attributes, p, model]
+    # for iterIndex in range(nIter):
+    max_attributes = randomState.randint(1, 20)
+    p = randomState.random_sample()
+    model = randomState.choice(["conjunction", "disjunction"])
+    classifier = pyscm.scm.SetCoveringMachine(p=p, max_attributes=max_attributes, model_type=model, verbose=False)
+    # if nbFolds != 1:
+    #     kFolds = DB.getKFoldIndices(nbFolds, y_train, len(set(y_train)), range(len(y_train)), randomState)
+    # else:
+    #     kFolds = [[], range(len(y_train))]
+    scores = []
+    KFolds = KFolds.split(X_train, y_train)
+    for foldIdx, (trainIndices, testIndices) in enumerate(KFolds):
+        # if fold != range(len(y_train)):
+        #     fold.sort()
+        #     trainIndices = [index for index in range(len(y_train)) if (index not in fold)]
+        attributeClassification, binaryAttributes, dsetFile, name = transformData(X_train[trainIndices])
+        try:
+            classifier.fit(binaryAttributes, y_train[trainIndices], X=None,
+                           attribute_classifications=attributeClassification, iteration_callback=None)
+
+            predictedLabels = classifier.predict(X_train[testIndices])
+            score = metricModule.score(y_train[testIndices], predictedLabels)
+            scores.append(score)
+        except:
+            pass
+        dsetFile.close()
+        os.remove(name)
+    if scores==[]:
+        score = baseScore
+    else:
+        score = np.mean(np.array(scores))
+
+    if isBetter=="higher" and score>baseScore:
+        baseScore = score
+        config = [max_attributes, p, model]
+    if isBetter=="lower" and score<baseScore:
+        baseScore = score
+        config = [max_attributes, p, model]
 
     assert config!=[], "No good configuration found for SCM"
     return config
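
Note: since pyscm's SetCoveringMachine is not a scikit-learn estimator, SCM.py cannot go through RandomizedSearchCV and instead consumes the KFold object directly. A minimal illustration of that manual fold loop, with an ordinary classifier standing in for pyscm:

```python
import numpy as np
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier  # stand-in for pyscm's SetCoveringMachine
from sklearn.metrics import accuracy_score

X_train = np.random.rand(40, 4)             # hypothetical data
y_train = np.random.randint(0, 2, size=40)

scores = []
# KFold.split yields (trainIndices, testIndices) pairs over the rows of
# X_train, matching the rewritten fold loop in SCM.randomizedSearch.
for foldIdx, (trainIndices, testIndices) in enumerate(KFold(n_splits=3).split(X_train, y_train)):
    clf = DecisionTreeClassifier().fit(X_train[trainIndices], y_train[trainIndices])
    scores.append(accuracy_score(y_train[testIndices], clf.predict(X_train[testIndices])))
score = np.mean(scores)
```

One design note: .split() returns a one-shot generator, so code that rebinds the KFolds name to it (as SCM.randomizedSearch does above) cannot iterate the folds a second time.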
diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SGD.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SGD.py
index 4006470a3a68606452e3adc9959e75a51b4a0dfe..f936eff0da29661bc2be5c15ff05ead9c9c18993 100644
--- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SGD.py
+++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SGD.py
@@ -39,7 +39,7 @@ def getKWARGS(kwargsList):
     return kwargsDict
 
 
-def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30):
+def randomizedSearch(X_train, y_train, randomState, KFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30):
     pipeline_SGD = Pipeline([('classifier', SGDClassifier())])
     losses = ['log', 'modified_huber']
     penalties = ["l1", "l2", "elasticnet"]
@@ -53,7 +53,7 @@ def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric
         metricKWARGS = {}
     scorer = metricModule.get_scorer(**metricKWARGS)
     grid_SGD = RandomizedSearchCV(pipeline_SGD, n_iter=nIter, param_distributions=param_SGD, refit=True,
-                                  n_jobs=nbCores, scoring=scorer, cv=nbFolds, random_state=randomState)
+                                  n_jobs=nbCores, scoring=scorer, cv=KFolds, random_state=randomState)
     SGD_detector = grid_SGD.fit(X_train, y_train)
     desc_params = [SGD_detector.best_params_["classifier__loss"], SGD_detector.best_params_["classifier__penalty"],
                    SGD_detector.best_params_["classifier__alpha"]]
diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMLinear.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMLinear.py
index 2e378edd36667200ec4893ec982109df81370b13..28d010c15ef5b62fa5ee7efde37c5a5a3007d61c 100644
--- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMLinear.py
+++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMLinear.py
@@ -29,7 +29,7 @@ def getKWARGS(kwargsList):
     return kwargsDict
 
 
-def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30):
+def randomizedSearch(X_train, y_train, randomState, KFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30):
     pipeline_SVMLinear = Pipeline([('classifier', SVC(kernel="linear", max_iter=1000))])
     param_SVMLinear = {"classifier__C": randint(1, 10000)}
     metricModule = getattr(Metrics, metric[0])
@@ -38,8 +38,8 @@ def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric
     else:
         metricKWARGS = {}
     scorer = metricModule.get_scorer(**metricKWARGS)
-    grid_SVMLinear = RandomizedSearchCV(pipeline_SVMLinear, n_iter=nIter,param_distributions=param_SVMLinear,
-                                        refit=True, n_jobs=nbCores, scoring=scorer, cv=nbFolds,
+    grid_SVMLinear = RandomizedSearchCV(pipeline_SVMLinear, n_iter=nIter, param_distributions=param_SVMLinear,
+                                        refit=True, n_jobs=nbCores, scoring=scorer, cv=KFolds,
                                         random_state=randomState)
 
     SVMLinear_detector = grid_SVMLinear.fit(X_train, y_train)
diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMPoly.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMPoly.py
index 7b72417332b08cfa903bd18fdd24cbc9dc24f77b..9f5ec44cd7bb213435dc09ea8623bb2322548115 100644
--- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMPoly.py
+++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMPoly.py
@@ -33,7 +33,7 @@ def getKWARGS(kwargsList):
     return kwargsDict
 
 
-def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30):
+def randomizedSearch(X_train, y_train, randomState, KFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30):
     pipeline_SVMPoly = Pipeline([('classifier', SVC(kernel="poly", max_iter=1000))])
     param_SVMPoly = {"classifier__C": randint(1, 10000),
                      "classifier__degree": randint(1, 30)}
@@ -44,7 +44,7 @@ def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric
         metricKWARGS = {}
     scorer = metricModule.get_scorer(**metricKWARGS)
     grid_SVMPoly = RandomizedSearchCV(pipeline_SVMPoly, n_iter=nIter, param_distributions=param_SVMPoly, refit=True,
-                                      n_jobs=nbCores, scoring=scorer, cv=nbFolds, random_state=randomState)
+                                      n_jobs=nbCores, scoring=scorer, cv=KFolds, random_state=randomState)
     SVMRBF_detector = grid_SVMPoly.fit(X_train, y_train)
     desc_params = [SVMRBF_detector.best_params_["classifier__C"], SVMRBF_detector.best_params_["classifier__degree"]]
     return desc_params
diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMRBF.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMRBF.py
index 7e32e9cc2577806e6dc7fdbac48495c86ab12e2c..230074683608dcbc6bb4c9008e1085b86eb41c6b 100644
--- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMRBF.py
+++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMRBF.py
@@ -29,7 +29,7 @@ def getKWARGS(kwargsList):
     return kwargsDict
 
 
-def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30):
+def randomizedSearch(X_train, y_train, randomState, KFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30):
     pipeline_SVMRBF = Pipeline([('classifier', SVC(kernel="rbf", max_iter=1000))])
     param_SVMRBF = {"classifier__C": randint(1, 10000)}
     metricModule = getattr(Metrics, metric[0])
@@ -39,7 +39,7 @@ def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric
         metricKWARGS = {}
     scorer = metricModule.get_scorer(**metricKWARGS)
     grid_SVMRBF = RandomizedSearchCV(pipeline_SVMRBF, n_iter=nIter, param_distributions=param_SVMRBF, refit=True,
-                                     n_jobs=nbCores, scoring=scorer, cv=nbFolds, random_state=randomState)
+                                     n_jobs=nbCores, scoring=scorer, cv=KFolds, random_state=randomState)
     SVMRBF_detector = grid_SVMRBF.fit(X_train, y_train)
     desc_params = [SVMRBF_detector.best_params_["classifier__C"]]
     return desc_params
diff --git a/Code/MonoMutliViewClassifiers/Multiview/ExecMultiview.py b/Code/MonoMutliViewClassifiers/Multiview/ExecMultiview.py
index e5f723a82f6f5b3bda2b4b918b8f60db1284f05f..0aaa77b248802c13871e386b333c7eea36c81ab4 100644
--- a/Code/MonoMutliViewClassifiers/Multiview/ExecMultiview.py
+++ b/Code/MonoMutliViewClassifiers/Multiview/ExecMultiview.py
@@ -53,6 +53,7 @@ def ExecMultiview(directory, DATASET, name, learningRate, nbFolds, nbCores, data
     extractionTime = time.time() - t_start
 
     ivalidationIndices = []
+    trainLabelsIterations = []
     testLabelsIterations = []
     classifiersIterations = []
 
@@ -87,6 +88,7 @@ def ExecMultiview(directory, DATASET, name, learningRate, nbFolds, nbCores, data
     else:
         classifier = classifierClass(NB_CORES=nbCores, **classificationKWARGS)
     for _ in range(statsIter):
+        learningIndices, validationIndices = learningRate
         classifier.fit_hdf5(DATASET, trainIndices=learningIndices, viewsIndices=viewsIndices)
         trainLabels = classifier.predict_hdf5(DATASET, usedIndices=learningIndices, viewsIndices=viewsIndices)
         testLabels = classifier.predict_hdf5(DATASET, usedIndices=validationIndices, viewsIndices=viewsIndices)
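
Note: in the multiview path the old learningRate parameter now carries the (train, validation) index pair, unpacked once per statistical iteration. Schematically (a sketch reusing only the fit_hdf5/predict_hdf5 calls shown in the diff above):

```python
def run_multiview_iteration(classifier, DATASET, learningRate, viewsIndices):
    # `classifier` is any object exposing the project's fit_hdf5/predict_hdf5
    # interface; `learningRate` is the classificationIndices tuple from ExecClassif.py.
    learningIndices, validationIndices = learningRate
    classifier.fit_hdf5(DATASET, trainIndices=learningIndices, viewsIndices=viewsIndices)
    trainLabels = classifier.predict_hdf5(DATASET, usedIndices=learningIndices, viewsIndices=viewsIndices)
    testLabels = classifier.predict_hdf5(DATASET, usedIndices=validationIndices, viewsIndices=viewsIndices)
    return trainLabels, testLabels
```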