From 8f1c9a3e596501458d3bcdf32f519e24163f2933 Mon Sep 17 00:00:00 2001
From: Baptiste Bauvin <baptiste.bauvin.1@ulaval.ca>
Date: Tue, 24 Oct 2017 18:25:57 -0400
Subject: [PATCH] Refactored

---
 .../Monoview/ExecClassifMonoView.py | 325 ++++++++++--------
 Code/Versions.py                    |  58 +---
 2 files changed, 189 insertions(+), 194 deletions(-)

diff --git a/Code/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py b/Code/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py
index 661b44cd..4d1925da 100644
--- a/Code/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py
+++ b/Code/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py
@@ -24,29 +24,10 @@
 __author__ = "Nikolas Huelsmann, Baptiste BAUVIN"
 __status__ = "Prototype" # Production, Development, Prototype
 # __date__ = 2016 - 03 - 25
-
-def ExecMonoview_multicore(directory, name, labelsNames, classificationIndices, KFolds, datasetFileIndex, databaseType,
-                           path, randomState, hyperParamSearch="randomizedSearch",
-                           metrics=[["accuracy_score", None]], nIter=30, **args):
-    DATASET = h5py.File(path + name + str(datasetFileIndex) + ".hdf5", "r")
-    kwargs = args["args"]
-    views = [DATASET.get("View" + str(viewIndex)).attrs["name"] for viewIndex in
-             range(DATASET.get("Metadata").attrs["nbView"])]
-    neededViewIndex = views.index(kwargs["feat"])
-    X = DATASET.get("View" + str(neededViewIndex))
-    Y = DATASET.get("Labels").value
-    return ExecMonoview(directory, X, Y, name, labelsNames, classificationIndices, KFolds, 1, databaseType, path,
-                        randomState, hyperParamSearch=hyperParamSearch,
-                        metrics=metrics, nIter=nIter, **args)
-
-
-def ExecMonoview(directory, X, Y, name, labelsNames, classificationIndices, KFolds, nbCores, databaseType, path,
-                 randomState, hyperParamSearch="randomizedSearch",
-                 metrics=[["accuracy_score", None]], nIter=30, **args):
-    logging.debug("Start:\t Loading data")
+def initConstants(args, X, classificationIndices, labelsNames, name, directory):
     try:
         kwargs = args["args"]
-    except:
+    except KeyError:
         kwargs = args
     t_start = time.time()
     feat = X.attrs["name"]
@@ -56,6 +37,7 @@
     labelsString = "-".join(labelsNames)
     timestr = time.strftime("%Y%m%d-%H%M%S")
     CL_type_string = CL_type
+
     outputFileName = directory + "/" + CL_type_string + "/" + "/" + feat + "/" + timestr + "Results-" + CL_type_string + "-" + labelsString + \
                      '-learnRate' + str(learningRate) + '-' + name + "-" + feat + "-"
     if not os.path.exists(os.path.dirname(outputFileName)):
@@ -64,26 +46,18 @@
         except OSError as exc:
             if exc.errno != errno.EEXIST:
                 raise
-    logging.debug("Done:\t Loading data")
-
-    # Determine the Database to extract features
-    logging.debug("Info:\t Classification - Database:" + str(name) + " Feature:" + str(feat) + " train ratio:"
-                  + str(learningRate) + ", CrossValidation k-folds: " + str(KFolds.n_splits) + ", cores:"
-                  + str(nbCores) + ", algorithm : " + CL_type)
+    return kwargs, t_start, feat, CL_type, X, learningRate, labelsString, timestr, outputFileName
 
+
+def initTrainTest(X, Y, classificationIndices):
     trainIndices, testIndices = classificationIndices
-    # Calculate Train/Test data
-    logging.debug("Start:\t Determine Train/Test split")
     X_train = extractSubset(X, trainIndices)
     X_test = extractSubset(X, testIndices)
     y_train = Y[trainIndices]
     y_test = Y[testIndices]
+    return X_train, y_train, X_test, y_test
 
-    logging.debug("Info:\t Shape X_train:" + str(X_train.shape) + ", Length of y_train:" + str(len(y_train)))
-    logging.debug("Info:\t Shape X_test:" + str(X_test.shape) + ", Length of y_test:" + str(len(y_test)))
-    logging.debug("Done:\t Determine Train/Test split")
-
-    classifierModule = getattr(MonoviewClassifiers, CL_type)
-
+
+def getKWARGS(classifierModule, hyperParamSearch, nIter, CL_type, X_train, y_train, randomState,
+              outputFileName, KFolds, nbCores, metrics, kwargs):
     if hyperParamSearch != "None":
         classifierHPSearch = getattr(classifierModule, hyperParamSearch)
         logging.debug("Start:\t RandomSearch best settings with " + str(nIter) + " iterations for " + CL_type)
@@ -92,42 +66,15 @@
         clKWARGS = dict((str(index), desc) for index, desc in enumerate(cl_desc))
         logging.debug("Done:\t RandomSearch best settings")
     else:
-        clKWARGS = kwargs[kwargs["CL_type"] + "KWARGS"]
-    logging.debug("Start:\t Training")
-    cl_res = classifierModule.fit(X_train, y_train, randomState, NB_CORES=nbCores, **clKWARGS)
-    logging.debug("Done:\t Training")
-
-    logging.debug("Start:\t Predicting")
-    # Stats Result
-    y_train_pred = cl_res.predict(X_train)
-    y_test_pred = cl_res.predict(X_test)
-
-    full_labels = cl_res.predict(X)
-    logging.debug("Done:\t Predicting")
-    t_end = time.time() - t_start
-    logging.debug("Info:\t Time for training and predicting: " + str(t_end) + "[s]")
-
-    logging.debug("Start:\t Getting Results")
+        clKWARGS = kwargs[CL_type + "KWARGS"]
+    return clKWARGS
 
-    stringAnalysis, imagesAnalysis, metricsScores = execute(name, classificationIndices, KFolds, nbCores,
-                                                            hyperParamSearch, metrics, nIter, feat, CL_type,
-                                                            clKWARGS, labelsNames, X.shape,
-                                                            y_train, y_train_pred, y_test, y_test_pred, t_end,
-                                                            randomState, cl_res, outputFileName)
-    cl_desc = [value for key, value in sorted(clKWARGS.items())]
-    logging.debug("Done:\t Getting Results")
 
+def saveResults(stringAnalysis, outputFileName, full_labels_pred, y_train_pred, y_train, imagesAnalysis):
     logging.info(stringAnalysis)
-    # labelsString = "-".join(labelsNames)
-    # timestr = time.strftime("%Y%m%d-%H%M%S")
-    # CL_type_string = CL_type
-    # outputFileName = directory + "/"+CL_type_string+"/"+"/"+feat+"/"+timestr +"Results-" + CL_type_string + "-" + labelsString + \
-    #                  '-learnRate' + str(learningRate) + '-' + name + "-" + feat + "-"
-
     outputTextFile = open(outputFileName + '.txt', 'w')
     outputTextFile.write(stringAnalysis)
     outputTextFile.close()
-    np.savetxt(outputFileName + "full_pred.csv", full_labels.astype(np.int16), delimiter=",")
+    np.savetxt(outputFileName + "full_pred.csv", full_labels_pred.astype(np.int16), delimiter=",")
     np.savetxt(outputFileName + "train_pred.csv", y_train_pred.astype(np.int16), delimiter=",")
     np.savetxt(outputFileName + "train_labels.csv", y_train.astype(np.int16), delimiter=",")
 
@@ -142,92 +89,170 @@
             imagesAnalysis[imageName].savefig(outputFileName + imageName + '.png')
 
-    logging.info("Done:\t Result Analysis")
+
+def ExecMonoview_multicore(directory, name, labelsNames, classificationIndices, KFolds, datasetFileIndex, databaseType,
+                           path, randomState, hyperParamSearch="randomizedSearch",
+                           metrics=[["accuracy_score", None]], nIter=30, **args):
+    DATASET = h5py.File(path + name + str(datasetFileIndex) + ".hdf5", "r")
+    kwargs = args["args"]
+    views = [DATASET.get("View" + str(viewIndex)).attrs["name"] for viewIndex in
+             range(DATASET.get("Metadata").attrs["nbView"])]
+    neededViewIndex = views.index(kwargs["feat"])
+    X = DATASET.get("View" + str(neededViewIndex))
+    Y = DATASET.get("Labels").value
+    return ExecMonoview(directory, X, Y, name, labelsNames, classificationIndices, KFolds, 1, databaseType, path,
+                        randomState, hyperParamSearch=hyperParamSearch,
+                        metrics=metrics, nIter=nIter, **args)
+
+
+def ExecMonoview(directory, X, Y, name, labelsNames, classificationIndices, KFolds, nbCores, databaseType, path,
+                 randomState, hyperParamSearch="randomizedSearch",
+                 metrics=[["accuracy_score", None]], nIter=30, **args):
+
+    logging.debug("Start:\t Loading data")
+    kwargs, \
+    t_start, \
+    feat, \
+    CL_type, \
+    X, \
+    learningRate, \
+    labelsString, \
+    timestr, \
+    outputFileName = initConstants(args, X, classificationIndices, labelsNames, name, directory)
+    logging.debug("Done:\t Loading data")
+
+    logging.debug("Info:\t Classification - Database:" + str(name) + " Feature:" + str(feat) + " train ratio:"
+                  + str(learningRate) + ", CrossValidation k-folds: " + str(KFolds.n_splits) + ", cores:"
+                  + str(nbCores) + ", algorithm : " + CL_type)
+
+    logging.debug("Start:\t Determine Train/Test split")
+    X_train, y_train, X_test, y_test = initTrainTest(X, Y, classificationIndices)
+    logging.debug("Info:\t Shape X_train:" + str(X_train.shape) + ", Length of y_train:" + str(len(y_train)))
+    logging.debug("Info:\t Shape X_test:" + str(X_test.shape) + ", Length of y_test:" + str(len(y_test)))
+    logging.debug("Done:\t Determine Train/Test split")
+
+    logging.debug("Start:\t Generate classifier args")
+    classifierModule = getattr(MonoviewClassifiers, CL_type)
+    clKWARGS = getKWARGS(classifierModule, hyperParamSearch,
+                         nIter, CL_type, X_train, y_train,
+                         randomState, outputFileName,
+                         KFolds, nbCores, metrics, kwargs)
+    logging.debug("Done:\t Generate classifier args")
+
+    logging.debug("Start:\t Training")
+    cl_res = classifierModule.fit(X_train, y_train, randomState, NB_CORES=nbCores, **clKWARGS)
+    logging.debug("Done:\t Training")
+
+    logging.debug("Start:\t Predicting")
+    full_labels_pred = cl_res.predict(X)
+    y_train_pred = full_labels_pred[classificationIndices[0]]
+    y_test_pred = full_labels_pred[classificationIndices[1]]
+    logging.debug("Done:\t Predicting")
+
+    t_end = time.time() - t_start
+    logging.debug("Info:\t Time for training and predicting: " + str(t_end) + "[s]")
+
+    logging.debug("Start:\t Getting Results")
+    stringAnalysis, \
+    imagesAnalysis, \
+    metricsScores = execute(name, classificationIndices, KFolds, nbCores,
+                            hyperParamSearch, metrics, nIter, feat, CL_type,
+                            clKWARGS, labelsNames, X.shape,
+                            y_train, y_train_pred, y_test, y_test_pred, t_end,
+                            randomState, cl_res, outputFileName)
+    cl_desc = [value for key, value in sorted(clKWARGS.items())]
+    logging.debug("Done:\t Getting Results")
+
+    logging.debug("Start:\t Saving results")
+    saveResults(stringAnalysis, outputFileName, full_labels_pred, y_train_pred, y_train, imagesAnalysis)
+    logging.info("Done:\t Saving Results")
+
     viewIndex = args["viewIndex"]
-    return viewIndex, [CL_type, cl_desc + [feat], metricsScores, full_labels, clKWARGS]
+    return viewIndex, [CL_type, cl_desc + [feat], metricsScores, full_labels_pred, clKWARGS]
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='This methods permits to execute a multiclass classification with one single view. At this point the used classifier is a RandomForest. The GridSearch permits to vary the number of trees and CrossValidation with k-folds. The result will be a plot of the score per class and a CSV with the best classifier found by the GridSearch.',
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
-    groupStandard = parser.add_argument_group('Standard arguments')
-    groupStandard.add_argument('-log', action='store_true', help='Use option to activate Logging to Console')
-    groupStandard.add_argument('--type', metavar='STRING', action='store', help='Type of Dataset', default=".hdf5")
-    groupStandard.add_argument('--name', metavar='STRING', action='store',
-                               help='Name of Database (default: %(default)s)', default='DB')
-    groupStandard.add_argument('--feat', metavar='STRING', action='store',
-                               help='Name of Feature for Classification (default: %(default)s)', default='RGB')
-    groupStandard.add_argument('--pathF', metavar='STRING', action='store',
-                               help='Path to the views (default: %(default)s)', default='Results-FeatExtr/')
-    groupStandard.add_argument('--fileCL', metavar='STRING', action='store',
-                               help='Name of classLabels CSV-file (default: %(default)s)', default='classLabels.csv')
-    groupStandard.add_argument('--fileCLD', metavar='STRING', action='store',
-                               help='Name of classLabels-Description CSV-file (default: %(default)s)',
-                               default='classLabels-Description.csv')
-    groupStandard.add_argument('--fileFeat', metavar='STRING', action='store',
-                               help='Name of feature CSV-file (default: %(default)s)', default='feature.csv')
-
-    groupClass = parser.add_argument_group('Classification arguments')
-    groupClass.add_argument('--CL_type', metavar='STRING', action='store', help='Classifier to use',
-                            default="RandomForest")
-    groupClass.add_argument('--CL_CV', metavar='INT', action='store', help='Number of k-folds for CV', type=int,
-                            default=10)
-    groupClass.add_argument('--CL_Cores', metavar='INT', action='store', help='Number of cores, -1 for all', type=int,
-                            default=1)
-    groupClass.add_argument('--CL_split', metavar='FLOAT', action='store', help='Split ratio for train and test',
-                            type=float, default=0.9)
-    groupClass.add_argument('--CL_metrics', metavar='STRING', action='store',
-                            help='Determine which metrics to use, separate with ":" if multiple, if empty, considering all',
-                            default='')
-
-    groupClassifier = parser.add_argument_group('Classifier Config')
-    groupClassifier.add_argument('--CL_config', metavar='STRING', nargs="+", action='store',
-                                 help='GridSearch: Determine the trees', default=['25:75:125:175'])
-
-    args = parser.parse_args()
-
-    classifierKWARGS = dict((key, value) for key, value in enumerate([arg.split(":") for arg in args.CL_config]))
-    ### Main Programm
-
-
-    # Configure Logger
-    directory = os.path.dirname(os.path.abspath(__file__)) + "/Results-ClassMonoView/"
-    logfilename = datetime.datetime.now().strftime("%Y_%m_%d") + "-CMV-" + args.name + "-" + args.feat + "-LOG"
-    logfile = directory + logfilename
-    if os.path.isfile(logfile + ".log"):
-        for i in range(1, 20):
-            testFileName = logfilename + "-" + str(i) + ".log"
-            if not os.path.isfile(directory + testFileName):
-                logfile = directory + testFileName
-                break
-    else:
-        logfile += ".log"
-
-    logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', filename=logfile, level=logging.DEBUG,
-                        filemode='w')
-
-    if args.log:
-        logging.getLogger().addHandler(logging.StreamHandler())
-
-    # Read the features
-    logging.debug("Start:\t Read " + args.type + " Files")
-
-    if args.type == ".csv":
-        X = np.genfromtxt(args.pathF + args.fileFeat, delimiter=';')
-        Y = np.genfromtxt(args.pathF + args.fileCL, delimiter=';')
-    elif args.type == ".hdf5":
-        dataset = h5py.File(args.pathF + args.name + ".hdf5", "r")
-        viewsDict = dict((dataset.get("View" + str(viewIndex)).attrs["name"], viewIndex) for viewIndex in
-                         range(dataset.get("Metadata").attrs["nbView"]))
-        X = dataset["View" + str(viewsDict[args.feat])][...]
-        Y = dataset["Labels"][...]
-
-    logging.debug("Info:\t Shape of Feature:" + str(X.shape) + ", Length of classLabels vector:" + str(Y.shape))
-    logging.debug("Done:\t Read CSV Files")
-
-    arguments = {args.CL_type + "KWARGS": classifierKWARGS, "feat": args.feat, "fileFeat": args.fileFeat,
-                 "fileCL": args.fileCL, "fileCLD": args.fileCLD, "CL_type": args.CL_type}
-    ExecMonoview(X, Y, args.name, args.CL_split, args.CL_CV, args.CL_Cores, args.type, args.pathF,
-                 metrics=args.CL_metrics, **arguments)
+    pass
+    # parser = argparse.ArgumentParser(
+    #     description='This methods permits to execute a multiclass classification with one single view. At this point the used classifier is a RandomForest. The GridSearch permits to vary the number of trees and CrossValidation with k-folds. The result will be a plot of the score per class and a CSV with the best classifier found by the GridSearch.',
+    #     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    #
+    # groupStandard = parser.add_argument_group('Standard arguments')
+    # groupStandard.add_argument('-log', action='store_true', help='Use option to activate Logging to Console')
+    # groupStandard.add_argument('--type', metavar='STRING', action='store', help='Type of Dataset', default=".hdf5")
+    # groupStandard.add_argument('--name', metavar='STRING', action='store',
+    #                            help='Name of Database (default: %(default)s)', default='DB')
+    # groupStandard.add_argument('--feat', metavar='STRING', action='store',
+    #                            help='Name of Feature for Classification (default: %(default)s)', default='RGB')
+    # groupStandard.add_argument('--pathF', metavar='STRING', action='store',
+    #                            help='Path to the views (default: %(default)s)', default='Results-FeatExtr/')
+    # groupStandard.add_argument('--fileCL', metavar='STRING', action='store',
+    #                            help='Name of classLabels CSV-file (default: %(default)s)', default='classLabels.csv')
+    # groupStandard.add_argument('--fileCLD', metavar='STRING', action='store',
+    #                            help='Name of classLabels-Description CSV-file (default: %(default)s)',
+    #                            default='classLabels-Description.csv')
+    # groupStandard.add_argument('--fileFeat', metavar='STRING', action='store',
+    #                            help='Name of feature CSV-file (default: %(default)s)', default='feature.csv')
+    #
+    # groupClass = parser.add_argument_group('Classification arguments')
+    # groupClass.add_argument('--CL_type', metavar='STRING', action='store', help='Classifier to use',
+    #                         default="RandomForest")
+    # groupClass.add_argument('--CL_CV', metavar='INT', action='store', help='Number of k-folds for CV', type=int,
+    #                         default=10)
+    # groupClass.add_argument('--CL_Cores', metavar='INT', action='store', help='Number of cores, -1 for all', type=int,
+    #                         default=1)
+    # groupClass.add_argument('--CL_split', metavar='FLOAT', action='store', help='Split ratio for train and test',
+    #                         type=float, default=0.9)
+    # groupClass.add_argument('--CL_metrics', metavar='STRING', action='store',
+    #                         help='Determine which metrics to use, separate with ":" if multiple, if empty, considering all',
+    #                         default='')
+    #
+    # groupClassifier = parser.add_argument_group('Classifier Config')
+    # groupClassifier.add_argument('--CL_config', metavar='STRING', nargs="+", action='store',
+    #                              help='GridSearch: Determine the trees', default=['25:75:125:175'])
+    #
+    # args = parser.parse_args()
+    #
+    # classifierKWARGS = dict((key, value) for key, value in enumerate([arg.split(":") for arg in args.CL_config]))
+    # ### Main Programm
+    #
+    #
+    # # Configure Logger
+    # directory = os.path.dirname(os.path.abspath(__file__)) + "/Results-ClassMonoView/"
+    # logfilename = datetime.datetime.now().strftime("%Y_%m_%d") + "-CMV-" + args.name + "-" + args.feat + "-LOG"
+    # logfile = directory + logfilename
+    # if os.path.isfile(logfile + ".log"):
+    #     for i in range(1, 20):
+    #         testFileName = logfilename + "-" + str(i) + ".log"
+    #         if not os.path.isfile(directory + testFileName):
+    #             logfile = directory + testFileName
+    #             break
+    # else:
+    #     logfile += ".log"
+    #
+    # logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', filename=logfile, level=logging.DEBUG,
+    #                     filemode='w')
+    #
+    # if args.log:
+    #     logging.getLogger().addHandler(logging.StreamHandler())
+    #
+    # # Read the features
+    # logging.debug("Start:\t Read " + args.type + " Files")
+    #
+    # if args.type == ".csv":
+    #     X = np.genfromtxt(args.pathF + args.fileFeat, delimiter=';')
+    #     Y = np.genfromtxt(args.pathF + args.fileCL, delimiter=';')
+    # elif args.type == ".hdf5":
+    #     dataset = h5py.File(args.pathF + args.name + ".hdf5", "r")
+    #     viewsDict = dict((dataset.get("View" + str(viewIndex)).attrs["name"], viewIndex) for viewIndex in
+    #                      range(dataset.get("Metadata").attrs["nbView"]))
+    #     X = dataset["View" + str(viewsDict[args.feat])][...]
+    #     Y = dataset["Labels"][...]
+    #
+    # logging.debug("Info:\t Shape of Feature:" + str(X.shape) + ", Length of classLabels vector:" + str(Y.shape))
+    # logging.debug("Done:\t Read CSV Files")
+    #
+    # arguments = {args.CL_type + "KWARGS": classifierKWARGS, "feat": args.feat, "fileFeat": args.fileFeat,
+    #              "fileCL": args.fileCL, "fileCLD": args.fileCLD, "CL_type": args.CL_type}
+    # ExecMonoview(X, Y, args.name, args.CL_split, args.CL_CV, args.CL_Cores, args.type, args.pathF,
+    #              metrics=args.CL_metrics, **arguments)
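For reference, the one behavioral change buried in the refactor above: ExecMonoview no longer calls predict separately on X_train and X_test; it predicts the full view once and slices the train/test predictions out with the two index arrays in classificationIndices. A minimal sketch of that pattern, with a scikit-learn classifier and random arrays standing in for the project's MonoviewClassifiers modules and HDF5 views (none of this is the commit's code):

import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Stand-in data: 10 examples, 3 features, binary labels.
rng = np.random.RandomState(42)
X = rng.rand(10, 3)
Y = rng.randint(0, 2, 10)
classificationIndices = (np.arange(7), np.arange(7, 10))

clf = DecisionTreeClassifier(random_state=42)
clf.fit(X[classificationIndices[0]], Y[classificationIndices[0]])

# One predict call over the whole view, then slice by index, mirroring
# full_labels_pred[classificationIndices[0]] / [1] in the patch.
full_labels_pred = clf.predict(X)
y_train_pred = full_labels_pred[classificationIndices[0]]
y_test_pred = full_labels_pred[classificationIndices[1]]

A side effect of this ordering is that the full_pred.csv written by saveResults and the per-split predictions are guaranteed to come from the same predict call.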
print("Logging: " + logging.__version__) - except: - # print "Please install logging module" + import logging + except ImportError: isUpToDate = False toInstall.append("logging") try: import joblib - # print("joblib: " + joblib.__version__) - except: - # print "Pease install joblib module" + except ImportError: isUpToDate = False toInstall.append("joblib") try: import argparse - # print("argparse: " + argparse.__version__) - except: - # print "Pease install argparse module" + except ImportError: isUpToDate = False toInstall.append("argparse") try: import h5py # - # print("h5py: " + h5py.__version__) - except: - # print "Pease install h5py module" + except ImportError: isUpToDate = False toInstall.append("h5py") try: import graphviz # - except: + except ImportError: isUpToDate = False toInstall.append("graphviz") try: import pickle # - except: + except ImportError: isUpToDate = False toInstall.append("pickle") -- GitLab