From 01af11ef2f9ee6b4cbcea6594147175bc3c21d02 Mon Sep 17 00:00:00 2001 From: bbauvin <baptiste.bauvin@centrale-marseille.fr> Date: Tue, 21 Nov 2017 12:45:19 -0500 Subject: [PATCH] Using full dataset --- Code/MonoMultiViewClassifiers/ExecClassif.py | 2 +- .../Monoview/ExecClassifMonoView.py | 6 +- .../utils/GetMultiviewDb.py | 55 +++++++++++-------- .../utils/Interpret.py | 2 +- .../utils/execution.py | 2 + 5 files changed, 40 insertions(+), 27 deletions(-) diff --git a/Code/MonoMultiViewClassifiers/ExecClassif.py b/Code/MonoMultiViewClassifiers/ExecClassif.py index 302eab9e..094dbcdf 100644 --- a/Code/MonoMultiViewClassifiers/ExecClassif.py +++ b/Code/MonoMultiViewClassifiers/ExecClassif.py @@ -356,7 +356,7 @@ def execClassif(arguments): getDatabase = getattr(DB, "get" + args.name + "DB" + args.type[1:]) DATASET, LABELS_DICTIONARY = getDatabase(args.views, args.pathF, args.name, args.CL_nbClass, - args.CL_classes, randomState) + args.CL_classes, randomState, args.full) classificationIndices = execution.genSplits(DATASET.get("Labels").value, args.CL_split, statsIterRandomStates) diff --git a/Code/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py b/Code/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py index 23592ea1..ee0005fa 100644 --- a/Code/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py +++ b/Code/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py @@ -29,12 +29,14 @@ def initConstants(args, X, classificationIndices, labelsNames, name, directory): except KeyError: kwargs = args t_start = time.time() - feat = X.attrs["name"] + if type(X.attrs["name"]) == bytes: + feat = X.attrs["name"].decode("utf-8") + else: + feat = X.attrs["name"] CL_type = kwargs["CL_type"] X = getValue(X) learningRate = float(len(classificationIndices[0])) / (len(classificationIndices[0]) + len(classificationIndices[1])) labelsString = "-".join(labelsNames) - timestr = time.strftime("%Y%m%d-%H%M%S") CL_type_string = CL_type outputFileName = directory + CL_type_string + "/" + feat + "/" + "Results-" + CL_type_string + "-" + labelsString + \ diff --git a/Code/MonoMultiViewClassifiers/utils/GetMultiviewDb.py b/Code/MonoMultiViewClassifiers/utils/GetMultiviewDb.py index 1114d4b3..981533ec 100644 --- a/Code/MonoMultiViewClassifiers/utils/GetMultiviewDb.py +++ b/Code/MonoMultiViewClassifiers/utils/GetMultiviewDb.py @@ -52,7 +52,7 @@ def makeMeNoisy(viewData, randomState, percentage=15): return noisyViewData -def getPlausibleDBhdf5(features, pathF, name, NB_CLASS=3, LABELS_NAME="", nbView=3, +def getPlausibleDBhdf5(features, pathF, name, NB_CLASS=3, LABELS_NAME="", randomState=None, full=True, nbView=3, nbClass=2, datasetLength=347, randomStateInt=None): """Used to generate a plausible dataset to test the algorithms""" randomStateInt = 42 @@ -257,6 +257,9 @@ def filterLabels(labelsSet, askedLabelsNamesSet, fullLabels, availableLabelsName def filterViews(datasetFile, temp_dataset, views, usedIndices): newViewIndex = 0 + if views == [""]: + for viewIndex in range(datasetFile.get("Metadata").attrs["nbView"]): + copyhdf5Dataset(datasetFile, temp_dataset, "View" + str(viewIndex), "View" + str(viewIndex), usedIndices) for askedViewName in views: for viewIndex in range(datasetFile.get("Metadata").attrs["nbView"]): viewName = datasetFile.get("View" + str(viewIndex)).attrs["name"] @@ -286,32 +289,38 @@ def copyhdf5Dataset(sourceDataFile, destinationDataFile, sourceDatasetName, dest newDset.attrs[key] = value -def getClassicDBhdf5(views, pathF, nameDB, NB_CLASS, askedLabelsNames, randomState): +def getClassicDBhdf5(views, pathF, nameDB, NB_CLASS, askedLabelsNames, randomState, full=False): """Used to load a hdf5 database""" - askedLabelsNames = [askedLabelName.encode("utf8") for askedLabelName in askedLabelsNames] - datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r") - fullLabels = datasetFile.get("Labels").value - temp_dataset = h5py.File(pathF+nameDB+"_temp_view_label_select.hdf5", "w") - datasetFile.copy("Metadata", temp_dataset) - labelsSet = getClasses(fullLabels) - availableLabelsNames = list(datasetFile.get("Labels").attrs["names"]) - askedLabelsNames, askedLabelsNamesSet = fillLabelNames(NB_CLASS, askedLabelsNames, - randomState, availableLabelsNames) + if full: + datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r") + labelsDictionary = dict((labelIndex, labelName.decode("utf-8")) for labelIndex, labelName in + enumerate(datasetFile.get("Labels").attrs["names"])) + return datasetFile, labelsDictionary + else: + askedLabelsNames = [askedLabelName.encode("utf8") for askedLabelName in askedLabelsNames] + datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r") + fullLabels = datasetFile.get("Labels").value + temp_dataset = h5py.File(pathF+nameDB+"_temp_view_label_select.hdf5", "w") + datasetFile.copy("Metadata", temp_dataset) + labelsSet = getClasses(fullLabels) + availableLabelsNames = list(datasetFile.get("Labels").attrs["names"]) + askedLabelsNames, askedLabelsNamesSet = fillLabelNames(NB_CLASS, askedLabelsNames, + randomState, availableLabelsNames) - newLabels, newLabelsNames, usedIndices = filterLabels(labelsSet, askedLabelsNamesSet, fullLabels, - availableLabelsNames, askedLabelsNames) - temp_dataset.get("Metadata").attrs["datasetLength"] = len(usedIndices) - temp_dataset.get("Metadata").attrs["nbClass"] = NB_CLASS - temp_dataset.create_dataset("Labels", data=newLabels) - temp_dataset.get("Labels").attrs["names"] = newLabelsNames - filterViews(datasetFile, temp_dataset, views, usedIndices) + newLabels, newLabelsNames, usedIndices = filterLabels(labelsSet, askedLabelsNamesSet, fullLabels, + availableLabelsNames, askedLabelsNames) + temp_dataset.get("Metadata").attrs["datasetLength"] = len(usedIndices) + temp_dataset.get("Metadata").attrs["nbClass"] = NB_CLASS + temp_dataset.create_dataset("Labels", data=newLabels) + temp_dataset.get("Labels").attrs["names"] = newLabelsNames + filterViews(datasetFile, temp_dataset, views, usedIndices) - labelsDictionary = dict((labelIndex, labelName.decode("utf-8")) for labelIndex, labelName in - enumerate(temp_dataset.get("Labels").attrs["names"])) - return temp_dataset, labelsDictionary + labelsDictionary = dict((labelIndex, labelName.decode("utf-8")) for labelIndex, labelName in + enumerate(temp_dataset.get("Labels").attrs["names"])) + return temp_dataset, labelsDictionary -def getClassicDBcsv(views, pathF, nameDB, NB_CLASS, askedLabelsNames, randomState, delimiter=","): +def getClassicDBcsv(views, pathF, nameDB, NB_CLASS, askedLabelsNames, randomState, full=False, delimiter=","): # TODO : Update this one labelsNames = np.genfromtxt(pathF + nameDB + "-labels-names.csv", dtype='str', delimiter=delimiter) datasetFile = h5py.File(pathF + nameDB + ".hdf5", "w") @@ -333,7 +342,7 @@ def getClassicDBcsv(views, pathF, nameDB, NB_CLASS, askedLabelsNames, randomStat metaDataGrp.attrs["nbClass"] = len(labelsNames) metaDataGrp.attrs["datasetLength"] = len(labels) datasetFile.close() - datasetFile, labelsDictionary = getClassicDBhdf5(views, pathF, nameDB, NB_CLASS, askedLabelsNames, randomState) + datasetFile, labelsDictionary = getClassicDBhdf5(views, pathF, nameDB, NB_CLASS, askedLabelsNames, randomState, full) return datasetFile, labelsDictionary diff --git a/Code/MonoMultiViewClassifiers/utils/Interpret.py b/Code/MonoMultiViewClassifiers/utils/Interpret.py index e83b2e55..03bf3c7d 100644 --- a/Code/MonoMultiViewClassifiers/utils/Interpret.py +++ b/Code/MonoMultiViewClassifiers/utils/Interpret.py @@ -16,7 +16,7 @@ def getFeatureImportance(classifier, directory, interpretString=""): featureImportancesSorted = featureImportances[sortedArgs][:50] featureIndicesSorted = sortedArgs[:50] fig, ax = plt.subplots() - x = np.arange(50) + x = np.arange(len(featureIndicesSorted)) formatter = FuncFormatter(percent) ax.yaxis.set_major_formatter(formatter) plt.bar(x, featureImportancesSorted) diff --git a/Code/MonoMultiViewClassifiers/utils/execution.py b/Code/MonoMultiViewClassifiers/utils/execution.py index 206f2eea..840f4da7 100644 --- a/Code/MonoMultiViewClassifiers/utils/execution.py +++ b/Code/MonoMultiViewClassifiers/utils/execution.py @@ -38,6 +38,8 @@ def parseTheArgs(arguments): type=int, default=2) groupStandard.add_argument('--machine', metavar='STRING', action='store', help='Type of machine on which the script runs', default="PC") + groupStandard.add_argument('-full', action='store_true', help='Use option to use full dataset and no labels or view filtering') + groupClass = parser.add_argument_group('Classification arguments') groupClass.add_argument('--CL_multiclassMethod', metavar='STRING', action='store', -- GitLab