Commit b4d13a7a authored by bbauvin

moved to the graal

parent 6a7c80e5
@@ -6,6 +6,7 @@ import logging
 import h5py
 import operator
 import errno
+import csv

 # Author-Info
 __author__ = "Baptiste Bauvin"
@@ -77,11 +78,11 @@ def getPlausibleDBhdf5(features, pathF, name, NB_CLASS, LABELS_NAME, nbView=3,
         viewDset = datasetFile.create_dataset("View" + str(viewIndex), viewData.shape, data=viewData.astype(np.uint8))
         viewDset.attrs["name"] = "View" + str(viewIndex)
         viewDset.attrs["sparse"] = False
-        viewDset.attrs["binary"] = True
+        # viewDset.attrs["binary"] = True
     labelsDset = datasetFile.create_dataset("Labels", CLASS_LABELS.shape)
     labelsDset[...] = CLASS_LABELS
     labelsDset.attrs["name"] = "Labels"
-    labelsDset.attrs["names"] = ["No", "Yes"]
+    labelsDset.attrs["names"] = ["No".encode(), "Yes".encode()]
     metaDataGrp = datasetFile.create_group("Metadata")
     metaDataGrp.attrs["nbView"] = nbView
     metaDataGrp.attrs["nbClass"] = 2
@@ -223,8 +224,9 @@ def filterLabels(labelsSet, askedLabelsNamesSet, fullLabels, availableLabelsName
 def filterViews(datasetFile, temp_dataset, views, usedIndices):
     newViewIndex = 0
-    for viewIndex in range(datasetFile.get("Metadata").attrs["nbView"]):
-        if datasetFile.get("View" + str(viewIndex)).attrs["name"] in views:
-            copyhdf5Dataset(datasetFile, temp_dataset, "View" + str(viewIndex), "View" + str(newViewIndex), usedIndices)
-            newViewIndex += 1
-        else:
+    for askedViewName in views:
+        for viewIndex in range(datasetFile.get("Metadata").attrs["nbView"]):
+            if datasetFile.get("View" + str(viewIndex)).attrs["name"] == askedViewName:
+                copyhdf5Dataset(datasetFile, temp_dataset, "View" + str(viewIndex), "View" + str(newViewIndex), usedIndices)
+                newViewIndex += 1
+            else:
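
The `filterViews` rewrite changes more than the comparison operator: iterating over `views` in the outer loop renumbers the copied views in the order the caller asked for, instead of the order they happen to be stored in the file. A standalone sketch of that selection logic, with plain dicts standing in for the HDF5 files (all names hypothetical):

def filter_views_order(available, asked):
    """Return (new_index, source_key) pairs in the order the caller asked for.

    `available` maps stored dataset keys to their "name" attribute;
    `asked` is the list of view names the user requested.
    """
    pairs = []
    new_index = 0
    for asked_name in asked:              # outer loop fixes the output order
        for key, name in available.items():
            if name == asked_name:        # exact match, one view per asked name
                pairs.append((new_index, key))
                new_index += 1
    return pairs

available = {"View0": "RNA", "View1": "Methyl", "View2": "MiRNA"}
print(filter_views_order(available, ["MiRNA", "RNA"]))
# [(0, 'View2'), (1, 'View0')] -> requested order, not storage order
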
@@ -232,6 +234,18 @@ def filterViews(datasetFile, temp_dataset, views, usedIndices):
     temp_dataset.get("Metadata").attrs["nbView"] = len(views)

+
+def copyhdf5Dataset(sourceDataFile, destinationDataFile, sourceDatasetName, destinationDatasetName, usedIndices):
+    """Used to copy a view in a new dataset file using only the examples of usedIndices, and copying the args"""
+    newDset = destinationDataFile.create_dataset(destinationDatasetName,
+                                                 data=sourceDataFile.get(sourceDatasetName).value[usedIndices, :])
+    if "sparse" in sourceDataFile.get(sourceDatasetName).attrs.keys() and sourceDataFile.get(sourceDatasetName).attrs["sparse"]:
+        # TODO : Support sparse
+        pass
+    else:
+        for key, value in sourceDataFile.get(sourceDatasetName).attrs.items():
+            newDset.attrs[key] = value
+
+
 def getClassicDBhdf5(views, pathF, nameDB, NB_CLASS, askedLabelsNames, randomState):
     """Used to load a hdf5 database"""
     askedLabelsNames = [askedLabelName.encode("utf8") for askedLabelName in askedLabelsNames]
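
`copyhdf5Dataset` reads the whole source view, keeps only the rows listed in `usedIndices` through NumPy fancy indexing, and mirrors every attribute onto the copy. One caveat: `Dataset.value` is deprecated in recent h5py releases and removed in h5py 3.0. A sketch of the same copy written against the plain-slicing API (function and variable names are illustrative, not from the commit):

import h5py

def copy_subset(sourceFile, destinationFile, sourceName, destinationName, usedIndices):
    sourceDset = sourceFile[sourceName]
    # dset[...] reads the full array into memory; fancy indexing then keeps
    # only the requested examples (rows).
    newDset = destinationFile.create_dataset(destinationName,
                                             data=sourceDset[...][usedIndices, :])
    # Mirror the metadata ("name", "sparse", ...) onto the filtered copy.
    for key, value in sourceDset.attrs.items():
        newDset.attrs[key] = value
    return newDset
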
@@ -247,27 +261,65 @@ def getClassicDBhdf5(views, pathF, nameDB, NB_CLASS, askedLabelsNames, randomSta
     newLabels, newLabelsNames, usedIndices = filterLabels(labelsSet, askedLabelsNamesSet, fullLabels,
                                                           availableLabelsNames, askedLabelsNames)
     temp_dataset.get("Metadata").attrs["datasetLength"] = len(usedIndices)
+    temp_dataset.get("Metadata").attrs["nbClass"] = NB_CLASS
     temp_dataset.create_dataset("Labels", data=newLabels)
     temp_dataset.get("Labels").attrs["names"] = newLabelsNames
     filterViews(datasetFile, temp_dataset, views, usedIndices)
-    labelsDictionary = dict((labelIndex, labelName) for labelIndex, labelName in
+    labelsDictionary = dict((labelIndex, labelName.decode("utf-8")) for labelIndex, labelName in
                             enumerate(temp_dataset.get("Labels").attrs["names"]))
-    return datasetFile, labelsDictionary
+    return temp_dataset, labelsDictionary


-def copyhdf5Dataset(sourceDataFile, destinationDataFile, sourceDatasetName, destinationDatasetName, usedIndices):
-    """Used to copy a view in a new dataset file using only the examples of usedIndices, and copying the args"""
-    newDset = destinationDataFile.create_dataset(destinationDatasetName,
-                                                 data=sourceDataFile.get(sourceDatasetName).value[usedIndices, :])
-    if "sparse" in sourceDataFile.get(sourceDatasetName).attrs.keys() and sourceDataFile.get(sourceDatasetName).attrs["sparse"]:
-        # TODO : Support sparse
-        pass
-    else:
-        for key, value in sourceDataFile.get(sourceDatasetName).attrs.items():
-            newDset.attrs[key] = value
+def getClassicDBcsv(views, pathF, nameDB, NB_CLASS, askedLabelsNames, randomState, delimiter=","):
+    # TODO : Update this one
+    labelsNames = np.genfromtxt(pathF + nameDB + "-labels-names.csv", dtype='str', delimiter=delimiter)
+    datasetFile = h5py.File(pathF + nameDB + ".hdf5", "w")
+    labels = np.genfromtxt(pathF + nameDB + "-labels.csv", delimiter=delimiter)
+    labelsDset = datasetFile.create_dataset("Labels", labels.shape, data=labels)
+    labelsDset.attrs["names"] = [labelName.encode() for labelName in labelsNames]
+    viewFileNames = [viewFileName for viewFileName in os.listdir(pathF + "Views/")]
+    # import pdb;pdb.set_trace()
+    for viewIndex, viewFileName in enumerate(os.listdir(pathF + "Views/")):
+        viewFile = pathF + "Views/" + viewFileName
+        if viewFileName[-6:] != "-s.csv":
+            viewMatrix = np.genfromtxt(viewFile, delimiter=delimiter)
+            viewDset = datasetFile.create_dataset("View" + str(viewIndex), viewMatrix.shape, data=viewMatrix)
+            viewDset.attrs["name"] = viewFileName[:-4]
+            viewDset.attrs["sparse"] = False
+        else:
+            pass
+    metaDataGrp = datasetFile.create_group("Metadata")
+    metaDataGrp.attrs["nbView"] = len(viewFileNames)
+    metaDataGrp.attrs["nbClass"] = len(labelsNames)
+    metaDataGrp.attrs["datasetLength"] = len(labels)
+    datasetFile.close()
+    datasetFile, labelsDictionary = getClassicDBhdf5(views, pathF, nameDB, NB_CLASS, askedLabelsNames, randomState)
+    # if len(askedLabelsNames) != NB_CLASS:
+    #     nbLabelsAvailable = 0
+    #     for l in labelsNamesFile:
+    #         nbLabelsAvailable += 1
+    #     askedLabelsNames = [line.strip().split(";")[1] for lineIdx, line in enumerate(labelsNamesFile) if
+    #                         lineIdx in randomState.randint(nbLabelsAvailable, size=NB_CLASS)]
+    # fullLabels = np.genfromtxt(pathF + nameDB + '-ClassLabels.csv', delimiter=',').astype(int)
+    # labelsDictionary = dict((labelIndex, labelName) for labelIndex, labelName in enumerate(labelsNames))
+    # if len(set(fullLabels)) > NB_CLASS:
+    #     usedIndices = getPositions(labelsDictionary.keys(), fullLabels)
+    # else:
+    #     usedIndices = range(len(fullLabels))
+    # for viewIndex, view in enumerate(views):
+    #     viewFile = pathF + nameDB + "-" + view + '.csv'
+    #     viewMatrix = np.array(np.genfromtxt(viewFile, delimiter=','))[usedIndices, :]
+    #     viewDset = datasetFile.create_dataset("View" + str(viewIndex), viewMatrix.shape, data=viewMatrix)
+    #     viewDset.attrs["name"] = view
+    #     viewDset.attrs["sparse"] = False
+    #     viewDset.attrs["binary"] = False
+    # labelsDset.attrs["labels_indices"] = [labelIndex for labelIndex, labelName in labelsDictionary.iteritems()]
+    # datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r")
+    return datasetFile, labelsDictionary

 # def getLabelSupports(CLASS_LABELS):
 #     """Used to get the number of example for each label"""
@@ -333,43 +385,6 @@ def copyhdf5Dataset(sourceDataFile, destinationDataFile, sourceDatasetName, dest
 #     return usedIndices

-# def getClassicDBcsv(views, pathF, nameDB, NB_CLASS, LABELS_NAMES, randomState):
-#     TODO : Update this one
-#     labelsNamesFile = open(pathF + nameDB + '-ClassLabels-Description.csv')
-#     datasetFile = h5py.File(pathF + nameDB + ".hdf5", "w")
-#     if len(LABELS_NAMES) != NB_CLASS:
-#         nbLabelsAvailable = 0
-#         for l in labelsNamesFile:
-#             nbLabelsAvailable += 1
-#         LABELS_NAMES = [line.strip().split(";")[1] for lineIdx, line in enumerate(labelsNamesFile) if
-#                         lineIdx in randomState.randint(nbLabelsAvailable, size=NB_CLASS)]
-#     fullLabels = np.genfromtxt(pathF + nameDB + '-ClassLabels.csv', delimiter=',').astype(int)
-#     labelsDictionary = dict((classIndex, labelName) for (classIndex, labelName) in
-#                             [(int(line.strip().split(";")[0]), line.strip().split(";")[1]) for lineIndex, line in
-#                              enumerate(labelsNamesFile) if line.strip().split(";")[0] in LABELS_NAMES])
-#     if len(set(fullLabels)) > NB_CLASS:
-#         usedIndices = getPositions(labelsDictionary.keys(), fullLabels)
-#     else:
-#         usedIndices = range(len(fullLabels))
-#     for viewIndex, view in enumerate(views):
-#         viewFile = pathF + nameDB + "-" + view + '.csv'
-#         viewMatrix = np.array(np.genfromtxt(viewFile, delimiter=','))[usedIndices, :]
-#         viewDset = datasetFile.create_dataset("View" + str(viewIndex), viewMatrix.shape, data=viewMatrix)
-#         viewDset.attrs["name"] = view
-#         viewDset.attrs["sparse"] = False
-#         viewDset.attrs["binary"] = False
-#
-#     labelsDset = datasetFile.create_dataset("Labels", fullLabels[usedIndices].shape, data=fullLabels[usedIndices])
-#     labelsDset.attrs["labels"] = [labelName for index, labelName in labelsDictionary.iteritems()]
-#     labelsDset.attrs["labels_indices"] = [labelIndex for labelIndex, labelName in labelsDictionary.iteritems()]
-#
-#     metaDataGrp = datasetFile.create_group("Metadata")
-#     metaDataGrp.attrs["nbView"] = len(views)
-#     metaDataGrp.attrs["nbClass"] = NB_CLASS
-#     metaDataGrp.attrs["datasetLength"] = len(fullLabels[usedIndices])
-#     datasetFile.close()
-#     datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r")
-#     return datasetFile, labelsDictionary

 # def getCaltechDBcsv(views, pathF, nameDB, NB_CLASS, LABELS_NAMES, randomState):
 #     datasetFile = h5py.File(pathF + nameDB + ".hdf5", "w")
...