# GetMultiviewDb.py
import logging
import math
import operator
import os
from collections import Counter

import h5py
import numpy as np
from scipy import sparse
# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
def makeMeNoisy(viewData, randomState, percentage=15):
    """Return a noisy uint8 copy of a binary view matrix.

    A number of cells equal to ``percentage`` percent of the matrix size is
    drawn uniformly at random (with replacement, so a cell drawn twice is
    flipped back) and each drawn cell is inverted.
    """
    asBool = viewData.astype(bool)
    nbRows, nbCols = asBool.shape
    nbFlips = int(percentage / 100.0 * nbRows * nbCols)
    rowChoices = range(nbRows)
    colChoices = range(nbCols)
    for _ in range(nbFlips):
        flipRow = randomState.choice(rowChoices)
        flipCol = randomState.choice(colChoices)
        asBool[flipRow, flipCol] = not asBool[flipRow, flipCol]
    return asBool.astype(np.uint8)
def getPlausibleDBhdf5(features, pathF, name, NB_CLASS, LABELS_NAME, nbView=3,
                       nbClass=2, datasetLength=347, randomStateInt=42):
    """Generate a plausible random multiview dataset and store it as HDF5.

    Builds ``nbView`` binary views of 250 features where the first half of
    the examples is mostly zeros (class 0) and the second half mostly ones
    (class 1), with some examples swapped across the boundary and noise
    added. Returns the file reopened read-only and the label dictionary.

    Note: ``features``, ``name``, ``NB_CLASS`` and ``LABELS_NAME`` are
    currently unused, and the stored ``nbClass`` metadata is hard-coded to 2.
    """
    randomState = np.random.RandomState(randomStateInt)
    nbFeatures = 250
    datasetFile = h5py.File(pathF + "Plausible.hdf5", "w")
    # Integer division: on Python 3 "/" yields a float, which range() rejects.
    halfLength = datasetLength // 2
    CLASS_LABELS = np.array([0 for i in range(halfLength)] + [1 for i in range(halfLength)])
    for viewIndex in range(nbView):
        viewData = np.array([np.zeros(nbFeatures) for i in range(halfLength)] +
                            [np.ones(nbFeatures) for i in range(halfLength)])
        # Swap some examples across the class boundary so the classes overlap.
        fakeTrueIndices = randomState.randint(0, halfLength - 1, datasetLength // 5)
        fakeFalseIndices = randomState.randint(halfLength, datasetLength - 1, datasetLength // 5)
        viewData[fakeTrueIndices] = np.ones((len(fakeTrueIndices), nbFeatures))
        viewData[fakeFalseIndices] = np.zeros((len(fakeFalseIndices), nbFeatures))
        viewData = makeMeNoisy(viewData, randomState)
        viewDset = datasetFile.create_dataset("View" + str(viewIndex), viewData.shape, data=viewData.astype(np.uint8))
        viewDset.attrs["name"] = "View" + str(viewIndex)
        viewDset.attrs["sparse"] = False
        viewDset.attrs["binary"] = True
    labelsDset = datasetFile.create_dataset("Labels", CLASS_LABELS.shape)
    labelsDset[...] = CLASS_LABELS
    labelsDset.attrs["name"] = "Labels"
    metaDataGrp = datasetFile.create_group("Metadata")
    metaDataGrp.attrs["nbView"] = nbView
    metaDataGrp.attrs["nbClass"] = 2
    metaDataGrp.attrs["datasetLength"] = len(CLASS_LABELS)
    datasetFile.close()
    datasetFile = h5py.File(pathF + "Plausible.hdf5", "r")
    LABELS_DICTIONARY = {0: "No", 1: "Yes"}
    return datasetFile, LABELS_DICTIONARY
def getFakeDBhdf5(features, pathF, name, NB_CLASS, LABELS_NAME, randomState):
    """Generate a small random multiview dataset ("Fake") and store it as HDF5.

    Creates four views (one binary dense, one sparse CSR, two dense float)
    with random binary labels, writes them to Fake.hdf5 and returns the file
    reopened read-only plus the label dictionary. ``features``, ``name``,
    ``NB_CLASS`` and ``LABELS_NAME`` are unused.
    """
    NB_VIEW = 4
    DATASET_LENGTH = 30
    NB_CLASS = 2
    # randint(low, high + 1) is the documented replacement for the deprecated
    # random_integers(low, high) and draws the same stream.
    VIEW_DIMENSIONS = randomState.randint(5, 20 + 1, NB_VIEW)
    DATA = dict((indx,
                 np.array([
                     randomState.normal(0.0, 2, viewDimension)
                     for i in np.arange(DATASET_LENGTH)]))
                for indx, viewDimension in enumerate(VIEW_DIMENSIONS))
    CLASS_LABELS = randomState.randint(0, NB_CLASS, DATASET_LENGTH)
    datasetFile = h5py.File(pathF + "Fake.hdf5", "w")
    for index, viewData in enumerate(DATA.values()):
        if index == 0:
            # NOTE(review): randint(0, 1, ...) always draws 0 (high is
            # exclusive), so this view is all zeros — looks intentional as a
            # binary placeholder, but confirm.
            viewData = randomState.randint(0, 1, (DATASET_LENGTH, 300)).astype(
                np.uint8)
            viewDset = datasetFile.create_dataset("View" + str(index), viewData.shape)
            viewDset[...] = viewData
            viewDset.attrs["name"] = "View" + str(index)
            viewDset.attrs["sparse"] = False
        elif index == 1:
            # h5py has no native sparse type: store the CSR members separately.
            viewData = sparse.csr_matrix(viewData)
            viewGrp = datasetFile.create_group("View" + str(index))
            dataDset = viewGrp.create_dataset("data", viewData.data.shape, data=viewData.data)
            indicesDset = viewGrp.create_dataset("indices", viewData.indices.shape, data=viewData.indices)
            indptrDset = viewGrp.create_dataset("indptr", viewData.indptr.shape, data=viewData.indptr)
            viewGrp.attrs["name"] = "View" + str(index)
            viewGrp.attrs["sparse"] = True
            viewGrp.attrs["shape"] = viewData.shape
        else:
            viewDset = datasetFile.create_dataset("View" + str(index), viewData.shape)
            viewDset[...] = viewData
            viewDset.attrs["name"] = "View" + str(index)
            viewDset.attrs["sparse"] = False
    labelsDset = datasetFile.create_dataset("Labels", CLASS_LABELS.shape)
    labelsDset[...] = CLASS_LABELS
    labelsDset.attrs["name"] = "Labels"
    metaDataGrp = datasetFile.create_group("Metadata")
    metaDataGrp.attrs["nbView"] = NB_VIEW
    metaDataGrp.attrs["nbClass"] = NB_CLASS
    metaDataGrp.attrs["datasetLength"] = len(CLASS_LABELS)
    LABELS_DICTIONARY = {0: "No", 1: "Yes"}
    datasetFile.close()
    datasetFile = h5py.File(pathF + "Fake.hdf5", "r")
    return datasetFile, LABELS_DICTIONARY
def getLabelSupports(CLASS_LABELS):
    """Count the number of examples of each label.

    Returns the per-label counts and a dict mapping each label to its
    position in the counts list. Labels are processed in sorted order so the
    result is deterministic (plain ``set`` iteration order is unspecified),
    and a single Counter pass replaces the O(n^2) per-label ``.count``.
    """
    labels = sorted(set(CLASS_LABELS))
    counts = Counter(CLASS_LABELS.tolist())
    supports = [counts[label] for label in labels]
    return supports, dict((label, index) for index, label in enumerate(labels))
def isUseful(labelSupports, index, CLASS_LABELS, labelDict):
    """Consume one remaining slot for the label of example ``index``.

    Returns (True, supports) with the label's remaining count decremented
    when it is still positive, (False, supports) unchanged otherwise.
    """
    supportPosition = labelDict[CLASS_LABELS[index]]
    if labelSupports[supportPosition] == 0:
        return False, labelSupports
    labelSupports[supportPosition] -= 1
    return True, labelSupports
def splitDataset(DATASET, LEARNING_RATE, DATASET_LENGTH, randomState):
    """Draw a validation split covering (1 - LEARNING_RATE) of the dataset.

    Returns the sorted list of validation example indices, sampled in a
    class-balanced way from the dataset's stored labels.
    """
    labels = DATASET.get("Labels")[...]
    nbClass = int(DATASET["Metadata"].attrs["nbClass"])
    return sorted(extractRandomTrainingSet(labels, 1 - LEARNING_RATE, DATASET_LENGTH, nbClass, randomState))
def extractRandomTrainingSet(CLASS_LABELS, LEARNING_RATE, DATASET_LENGTH, NB_CLASS, randomState):
    """Draw a class-balanced random subset of example indices.

    Keeps ``int(support * LEARNING_RATE)`` examples per class, drawing
    indices uniformly until every per-class quota is filled.
    """
    labelSupports, labelDict = getLabelSupports(np.array(CLASS_LABELS))
    nbTrainingExamples = [int(support * LEARNING_RATE) for support in labelSupports]
    trainingExamplesIndices = []
    usedIndices = set()  # set: O(1) membership instead of a list scan
    # any() terminates as soon as every quota is zero; the original compared
    # against [0] * NB_CLASS, which never matches (infinite loop) when fewer
    # than NB_CLASS labels are present.
    while any(nbTrainingExamples):
        # randint's upper bound is exclusive: the original DATASET_LENGTH - 1
        # could never draw the last example.
        index = int(randomState.randint(0, DATASET_LENGTH))
        if index not in usedIndices:
            isUseFull, nbTrainingExamples = isUseful(nbTrainingExamples, index, CLASS_LABELS, labelDict)
            if isUseFull:
                trainingExamplesIndices.append(index)
                usedIndices.add(index)
    return trainingExamplesIndices
def getKFoldIndices(nbFolds, CLASS_LABELS, NB_CLASS, learningIndices, randomState):
    """Split ``learningIndices`` into ``nbFolds`` disjoint class-balanced folds.

    Each fold receives ``support // nbFolds`` examples of every class, drawn
    at random without reuse across folds.
    """
    labelSupports, labelDict = getLabelSupports(np.array(CLASS_LABELS[learningIndices]))
    nbTrainingExamples = [[int(support / nbFolds) for support in labelSupports] for fold in range(nbFolds)]
    trainingExamplesIndices = []
    usedIndices = set()  # set: O(1) membership instead of a list scan
    for foldIndex, fold in enumerate(nbTrainingExamples):
        trainingExamplesIndices.append([])
        # any() terminates once every per-class quota reaches zero; the
        # original compared against [0] * NB_CLASS, which never matches
        # (infinite loop) when fewer than NB_CLASS labels are present.
        while any(fold):
            index = randomState.randint(0, len(learningIndices))
            if learningIndices[index] not in usedIndices:
                isUseFull, fold = isUseful(fold, learningIndices[index], CLASS_LABELS, labelDict)
                if isUseFull:
                    trainingExamplesIndices[foldIndex].append(learningIndices[index])
                    usedIndices.add(learningIndices[index])
    return trainingExamplesIndices
def getPositions(labelsUsed, fullLabels):
    """Return the indices of ``fullLabels`` entries that appear in ``labelsUsed``."""
    return [position for position, label in enumerate(fullLabels) if label in labelsUsed]
# def getClassicDBcsv(views, pathF, nameDB, NB_CLASS, LABELS_NAMES, randomState):
# labelsNamesFile = open(pathF + nameDB + '-ClassLabels-Description.csv')
# datasetFile = h5py.File(pathF + nameDB + ".hdf5", "w")
# if len(LABELS_NAMES) != NB_CLASS:
# nbLabelsAvailable = 0
# for l in labelsNamesFile:
# nbLabelsAvailable += 1
# LABELS_NAMES = [line.strip().split(";")[1] for lineIdx, line in enumerate(labelsNamesFile) if
# lineIdx in randomState.randint(nbLabelsAvailable, size=NB_CLASS)]
# fullLabels = np.genfromtxt(pathF + nameDB + '-ClassLabels.csv', delimiter=',').astype(int)
# labelsDictionary = dict((classIndex, labelName) for (classIndex, labelName) in
# [(int(line.strip().split(";")[0]), line.strip().split(";")[1]) for lineIndex, line in
# enumerate(labelsNamesFile) if line.strip().split(";")[0] in LABELS_NAMES])
# if len(set(fullLabels)) > NB_CLASS:
# usedIndices = getPositions(labelsDictionary.keys(), fullLabels)
# else:
# usedIndices = range(len(fullLabels))
# for viewIndex, view in enumerate(views):
# viewFile = pathF + nameDB + "-" + view + '.csv'
# viewMatrix = np.array(np.genfromtxt(viewFile, delimiter=','))[usedIndices, :]
# viewDset = datasetFile.create_dataset("View" + str(viewIndex), viewMatrix.shape, data=viewMatrix)
# viewDset.attrs["name"] = view
# viewDset.attrs["sparse"] = False
# viewDset.attrs["binary"] = False
#
# labelsDset = datasetFile.create_dataset("Labels", fullLabels[usedIndices].shape, data=fullLabels[usedIndices])
# labelsDset.attrs["labels"] = [labelName for index, labelName in labelsDictionary.iteritems()]
# labelsDset.attrs["labels_indices"] = [labelIndex for labelIndex, labelName in labelsDictionary.iteritems()]
#
# metaDataGrp = datasetFile.create_group("Metadata")
# metaDataGrp.attrs["nbView"] = len(views)
# metaDataGrp.attrs["nbClass"] = NB_CLASS
# metaDataGrp.attrs["datasetLength"] = len(fullLabels[usedIndices])
# datasetFile.close()
# datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r")
# return datasetFile, labelsDictionary
def getClassicDBhdf5(views, pathF, nameDB, NB_CLASS, LABELS_NAMES):
    """Open an existing HDF5 multiview dataset read-only.

    The label dictionary is rebuilt from the ``labels_indices`` and
    ``labels`` attributes stored on the "Labels" dataset. ``views``,
    ``NB_CLASS`` and ``LABELS_NAMES`` are unused here.
    """
    datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r")
    labelsAttrs = datasetFile.get("Labels").attrs
    labelsDictionary = dict(zip(labelsAttrs["labels_indices"], labelsAttrs["labels"]))
    return datasetFile, labelsDictionary
def getCaltechDBcsv(views, pathF, nameDB, NB_CLASS, LABELS_NAMES, randomState):
    """Build an HDF5 multiview dataset from the Caltech CSV files.

    Reads ``<nameDB>-ClassLabels-Description.csv`` ("index;name" per line),
    draws NB_CLASS label names at random when the supplied LABELS_NAMES do
    not match, keeps only the examples of the retained labels, stores one
    dataset per view, and returns the file reopened read-only together with
    the {class index: label name} dictionary.
    """
    datasetFile = h5py.File(pathF + nameDB + ".hdf5", "w")
    # Read the description file once up front: the original iterated the open
    # handle several times without seek(0), so every pass after the first saw
    # an exhausted file (and the handle was never closed).
    with open(pathF + nameDB + '-ClassLabels-Description.csv') as labelsNamesFile:
        labelLines = [line.strip().split(";") for line in labelsNamesFile]
    if len(LABELS_NAMES) != NB_CLASS:
        nbLabelsAvailable = len(labelLines)
        LABELS_NAMES = [lineParts[1] for lineIdx, lineParts in enumerate(labelLines) if
                        lineIdx in randomState.randint(nbLabelsAvailable, size=NB_CLASS)]
    fullLabels = np.genfromtxt(pathF + nameDB + '-ClassLabels.csv', delimiter=';').astype(int)
    # Keep the (index, name) pairs whose *name* was retained. The original
    # compared the index field against the name list and iterated the file
    # without enumerate (unpacking each line string into two variables), so
    # it could never build this dictionary correctly.
    labelsDictionary = dict((int(lineParts[0]), lineParts[1]) for lineParts in labelLines
                            if lineParts[1] in LABELS_NAMES)
    if len(set(fullLabels)) > NB_CLASS:
        usedIndices = getPositions(labelsDictionary.keys(), fullLabels)
    else:
        usedIndices = range(len(fullLabels))
    for viewIndex, view in enumerate(views):
        viewFile = pathF + nameDB + "-" + view + '.csv'
        viewMatrix = np.array(np.genfromtxt(viewFile, delimiter=';'))[usedIndices, :]
        viewDset = datasetFile.create_dataset("View" + str(viewIndex), viewMatrix.shape, data=viewMatrix)
        viewDset.attrs["name"] = view
    labelsDset = datasetFile.create_dataset("Labels", fullLabels[usedIndices].shape, data=fullLabels[usedIndices])
    metaDataGrp = datasetFile.create_group("Metadata")
    metaDataGrp.attrs["nbView"] = len(views)
    metaDataGrp.attrs["nbClass"] = NB_CLASS
    metaDataGrp.attrs["datasetLength"] = len(fullLabels[usedIndices])
    datasetFile.close()
    datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r")
    return datasetFile, labelsDictionary
def getMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES, randomState):
    """Build the MultiOmic HDF5 dataset from the matching CSV matrices.

    Loads the methylation, miRNA, RNASeq (dropping all-zero genes) and
    clinical matrices plus the triple-negative BRCA labels, writes everything
    to MultiOmic.hdf5 and returns it reopened read-only with the label
    dictionary. ``features``, ``name``, ``NB_CLASS``, ``LABELS_NAMES`` and
    ``randomState`` are unused.
    """
    datasetFile = h5py.File(path + "MultiOmic.hdf5", "w")
    logging.debug("Start:\t Getting Methylation Data")
    methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',')
    methylDset = datasetFile.create_dataset("View0", methylData.shape)
    methylDset[...] = methylData
    methylDset.attrs["name"] = "Methyl"
    methylDset.attrs["sparse"] = False
    methylDset.attrs["binary"] = False
    logging.debug("Done:\t Getting Methylation Data")
    logging.debug("Start:\t Getting MiRNA Data")
    mirnaData = np.genfromtxt(path + "matching_mirna.csv", delimiter=',')
    mirnaDset = datasetFile.create_dataset("View1", mirnaData.shape)
    mirnaDset[...] = mirnaData
    mirnaDset.attrs["name"] = "MiRNA_"
    mirnaDset.attrs["sparse"] = False
    mirnaDset.attrs["binary"] = False
    logging.debug("Done:\t Getting MiRNA Data")
    logging.debug("Start:\t Getting RNASeq Data")
    rnaseqData = np.genfromtxt(path + "matching_rnaseq.csv", delimiter=',')
    # Keep only the columns (genes) with at least one non-zero value; the
    # vectorized any() replaces the original per-column Python loop.
    usefulColumns = np.where(rnaseqData.any(axis=0))[0]
    rnaseqDset = datasetFile.create_dataset("View2", (rnaseqData.shape[0], len(usefulColumns)))
    rnaseqDset[...] = rnaseqData[:, usefulColumns]
    rnaseqDset.attrs["name"] = "RNASeq_"
    rnaseqDset.attrs["sparse"] = False
    rnaseqDset.attrs["binary"] = False
    logging.debug("Done:\t Getting RNASeq Data")
    logging.debug("Start:\t Getting Clinical Data")
    clinical = np.genfromtxt(path + "clinicalMatrix.csv", delimiter=',')
    clinicalDset = datasetFile.create_dataset("View3", clinical.shape)
    clinicalDset[...] = clinical
    clinicalDset.attrs["name"] = "Clinic"
    clinicalDset.attrs["sparse"] = False
    clinicalDset.attrs["binary"] = False
    logging.debug("Done:\t Getting Clinical Data")
    # "with" guarantees the label file is closed (the original leaked it).
    with open(path + 'brca_labels_triple-negatif.csv') as labelFile:
        labels = np.array([int(line.strip().split(',')[1]) for line in labelFile])
    labelsDset = datasetFile.create_dataset("Labels", labels.shape)
    labelsDset[...] = labels
    labelsDset.attrs["name"] = "Labels"
    metaDataGrp = datasetFile.create_group("Metadata")
    metaDataGrp.attrs["nbView"] = 4
    metaDataGrp.attrs["nbClass"] = 2
    metaDataGrp.attrs["datasetLength"] = len(labels)
    labelDictionary = {0: "No", 1: "Yes"}
    datasetFile.close()
    datasetFile = h5py.File(path + "MultiOmic.hdf5", "r")
    # datasetFile = getPseudoRNASeq(datasetFile)
    return datasetFile, labelDictionary
def getVector(nbGenes):
    """Find the best split point of ``nbGenes`` genes for factorization.

    Maximizes (i + 1) * (nbGenes - j) over split pairs j == i + 1 (first
    maximum wins, as in the original strict comparison) and returns the
    boolean "left" indicator vector together with the sizes of the two
    sub-problems (matrixSup = j, matrixInf = nbGenes - j).
    """
    argmax = [0, 0]
    maxi = 0
    # Only pairs with j == i + 1 ever contribute, so a single O(n) scan
    # replaces the original O(n^2) double loop.
    for i in range(nbGenes - 1):
        j = i + 1
        value = (i + 1) * (nbGenes - j)
        if value > maxi:
            maxi = value
            argmax = [i, j]
    i, j = argmax
    vectorLeft = np.zeros(nbGenes, dtype=bool)
    vectorLeft[:i + 1] = np.ones(i + 1, dtype=bool)
    # The original also built an unused "vectorSup" array; dead code removed.
    matrixSup = j
    matrixInf = nbGenes - j
    return vectorLeft, matrixSup, matrixInf
def findClosestPowerOfTwo(factorizationParam):
    """Return the power of two closest to ``factorizationParam``.

    Ties between the two surrounding powers resolve to the smaller one.
    Floor division keeps the result an int (on Python 3 the original "/"
    returned a float).
    """
    power = 1
    while factorizationParam - power > 0:
        power *= 2
    if abs(factorizationParam - power) < abs(factorizationParam - power // 2):
        return power
    return power // 2
def easyFactorize(nbGenes, factorizationParam, t=0):
    """Recursively build a boolean left-factor matrix for ``nbGenes`` genes.

    ``factorizationParam`` is rounded down to (a power of two) - 1 so the
    recursion halves evenly. Returns the number of columns t and the
    (nbGenes, t) boolean factor; the nbGenes == 2/3 base cases return a
    single indicator vector instead of a matrix.
    """
    if math.log(factorizationParam + 1, 2) % 1 == 0.0:
        pass
    else:
        factorizationParam = findClosestPowerOfTwo(factorizationParam) - 1
    if nbGenes == 2:
        return 1, np.array([True, False])
    if nbGenes == 3:
        return 1, np.array([True, True, False])
    if factorizationParam == 1:
        t = 1
        return t, getVector(nbGenes)[0]
    vectorLeft, matrixSup, matrixInf = getVector(nbGenes)
    # Floor division keeps the parameter an int on Python 3; halving
    # 2^k - 1 gives exactly 2^(k-1) - 1, so no precision is lost.
    t_, vectorLeftSup = easyFactorize(matrixSup, (factorizationParam - 1) // 2, t=t)
    t__, vectorLeftInf = easyFactorize(matrixInf, (factorizationParam - 1) // 2, t=t)
    factorLeft = np.zeros((nbGenes, t_ + t__ + 1), dtype=bool)
    factorLeft[:matrixSup, :t_] = vectorLeftSup.reshape(factorLeft[:matrixSup, :t_].shape)
    # Odd gene counts share the middle gene between both halves.
    if nbGenes % 2 == 1:
        factorLeft[matrixInf - 1:, t_:t__ + t_] = vectorLeftInf.reshape(factorLeft[matrixInf - 1:, t_:t__ + t_].shape)
    else:
        factorLeft[matrixInf:, t_:t__ + t_] = vectorLeftInf.reshape(factorLeft[matrixInf:, t_:t__ + t_].shape)
    factorLeft[:, t__ + t_] = vectorLeft
    return t__ + t_ + 1, factorLeft
def getBaseMatrices(nbGenes, factorizationParam, path):
    """Compute the left factor matrix and cache it as a CSV under ``path``."""
    _, factorLeft = easyFactorize(nbGenes, factorizationParam)
    fileName = "factorLeft--n-" + str(nbGenes) + "--k-" + str(factorizationParam) + ".csv"
    np.savetxt(path + fileName, factorLeft, delimiter=",")
    return factorLeft
def findParams(arrayLen, nbPatients, randomState, maxNbBins=2000, minNbBins=10, maxLenBin=70000, minOverlapping=1,
               minNbBinsOverlapped=0, maxNbSolutions=30):
    """Search (lenBin, overlapping, nbBins) combinations compatible with arrayLen.

    Collects up to ``maxNbSolutions`` parameter triples satisfying
    arrayLen == (nbBins - 1) * (lenBin - overlapping) + lenBin and returns
    one picked at random. Returns None when fewer than ``maxNbSolutions``
    are found before the search space is exhausted (original behavior).
    """
    results = []
    if arrayLen * arrayLen * 10 / 100 > minNbBinsOverlapped * nbPatients:
        for lenBin in range(1, arrayLen):
            if lenBin < maxLenBin and minNbBins * lenBin < arrayLen:
                for overlapping in range(lenBin - 1, 0, -1):
                    if overlapping > minOverlapping and lenBin % (lenBin - overlapping) == 0:
                        for nbBins in range(arrayLen - 1, 0, -1):
                            if nbBins < maxNbBins:
                                if arrayLen == (nbBins - 1) * (lenBin - overlapping) + lenBin:
                                    results.append({"nbBins": nbBins, "overlapping": overlapping, "lenBin": lenBin})
                                    if len(results) == maxNbSolutions:
                                        # randint: np.random.RandomState has no
                                        # randrange (the original raised
                                        # AttributeError here).
                                        return results[randomState.randint(len(results))]
def findBins(nbBins=142, overlapping=493, lenBin=986):
    """Build the list of coordinate indices covered by each sliding bin."""
    step = lenBin - overlapping
    return [[offset + binIndex * step for offset in range(lenBin)]
            for binIndex in range(nbBins)]
def getBins(array, bins, lenBin, overlapping):
    """Map each coordinate of ``array`` to the column index of every bin containing it.

    For each coordinate position found in a bin, emits
    binIndex + coord * len(bins). ``lenBin`` and ``overlapping`` are unused
    but kept for interface compatibility. (The original's ``nbBinsFull``
    counter was never used and has been removed.)
    """
    nbBins = len(bins)
    binnedcoord = []
    for coordIndex, coord in enumerate(array):
        for binIndex, bin_ in enumerate(bins):
            if coordIndex in bin_:
                binnedcoord.append(binIndex + (coord * nbBins))
    return np.array(binnedcoord)
def makeSortedBinsMatrix(nbBins, lenBins, overlapping, arrayLen, path):
    """Build the (arrayLen, nbBins) 0/1 bin-membership matrix and cache it as CSV.

    Column ``b`` is one on the ``lenBins`` rows starting at
    b * (lenBins - overlapping).
    """
    step = lenBins - overlapping
    membership = np.zeros((arrayLen, nbBins), dtype=np.uint8)
    for column in range(nbBins):
        start = column * step
        membership[start:start + lenBins, column] = np.ones(lenBins, dtype=np.uint8)
    fileName = "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv"
    np.savetxt(path + fileName, membership, delimiter=",")
    return membership
def makeSparseTotalMatrix(sortedRNASeq, randomState):
    """Build one sparse binary matrix of binned coordinates for all patients.

    Bin parameters are searched with findParams and each patient's
    coordinates are mapped to bin columns with getBins. Raises TypeError by
    subscripting None if findParams finds no solution (unchanged behavior).
    """
    nbPatients, nbGenes = sortedRNASeq.shape
    params = findParams(nbGenes, nbPatients, randomState)
    nbBins = params["nbBins"]
    overlapping = params["overlapping"]
    lenBin = params["lenBin"]
    bins = findBins(nbBins, overlapping, lenBin)
    # Accumulate every coordinate once and build a single sparse matrix
    # instead of summing one matrix per patient (which is quadratic in the
    # number of stored entries). Duplicate coordinates still sum, exactly as
    # repeated matrix additions did.
    rowIndices = []
    columnIndices = []
    for patientIndex, patient in enumerate(sortedRNASeq):
        patientColumns = getBins(patient, bins, lenBin, overlapping)
        columnIndices.extend(patientColumns)
        rowIndices.extend([patientIndex] * len(patientColumns))
    data = np.ones(len(columnIndices), dtype=bool)
    sparseFull = sparse.csc_matrix((data, (rowIndices, columnIndices)),
                                   shape=(nbPatients, nbGenes * nbBins))
    return sparseFull
def getAdjacenceMatrix(RNASeqRanking, sotredRNASeq, k=2):
    """Build a sparse boolean rank-neighbourhood encoding per patient.

    For each gene position i, links it with the k/2 entries just below and
    just above the value ``sotredRNASeq[patient, i]`` used as a column index
    into ``RNASeqRanking``. Out-of-range lookups on the high side are
    skipped; negative ones wrap around (NumPy semantics, kept from the
    original).
    """
    k = int(k) // 2 * 2  # force an even int; Python 3 "/" would yield a float
    nbPatients, nbGenes = RNASeqRanking.shape
    maxEntries = nbPatients * k * nbGenes
    indices = np.zeros(maxEntries, dtype=int)
    indptr = np.zeros(nbPatients + 1, dtype=int)
    pointer = 0
    for patientIndex in range(nbPatients):
        for i in range(nbGenes):
            for j in range(k // 2):
                # Neighbour ranked below; a negative index wraps silently
                # (original behavior, preserved).
                try:
                    indices[pointer] = RNASeqRanking[
                                           patientIndex, (sotredRNASeq[patientIndex, i] - (j + 1))] + i * nbGenes
                    pointer += 1
                except IndexError:  # narrowed from bare except: only skip out-of-range lookups
                    pass
                # Neighbour ranked above; past-the-end lookups are skipped.
                try:
                    indices[pointer] = RNASeqRanking[
                                           patientIndex, (sotredRNASeq[patientIndex, i] + (j + 1))] + i * nbGenes
                    pointer += 1
                except IndexError:
                    pass
        indptr[patientIndex + 1] = pointer
    # Trim to the entries actually filled so data/indices lengths agree with
    # indptr (the original passed the full pre-allocated arrays).
    data = np.ones(pointer, dtype=bool)
    mat = sparse.csr_matrix((data, indices[:pointer], indptr),
                            shape=(nbPatients, nbGenes * nbGenes), dtype=bool)
    return mat
def getKMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES):
    """Build the KMultiOmic HDF5 dataset from the methylation CSV matrix.

    Creates four derived views of the methylation data — two "binarized"
    views built from a boolean factorization matrix (k = 7 and k = 15 bits
    per gene) and two "binned" views built from sliding-bin membership
    matrices — plus the BRCA triple-negative labels, then returns the file
    reopened read-only with the label dictionary. ``features``, ``name``,
    ``NB_CLASS`` and ``LABELS_NAMES`` are unused.
    """
    datasetFile = h5py.File(path + "KMultiOmic.hdf5", "w")
    # logging.debug("Start:\t Getting Methylation Data")
    methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',')
    logging.debug("Done:\t Getting Methylation Data")
    logging.debug("Start:\t Getting Sorted Methyl Data")
    Methyl = methylData
    # sortedMethylGeneIndices[i] holds example i's gene indices ordered by
    # increasing methylation value; MethylRanking is the inverse permutation.
    sortedMethylGeneIndices = np.zeros(methylData.shape, dtype=int)
    MethylRanking = np.zeros(methylData.shape, dtype=int)
    for exampleIndex, exampleArray in enumerate(Methyl):
        sortedMethylDictionary = dict((index, value) for index, value in enumerate(exampleArray))
        sortedMethylIndicesDict = sorted(sortedMethylDictionary.items(), key=operator.itemgetter(1))
        sortedMethylIndicesArray = np.array([index for (index, value) in sortedMethylIndicesDict], dtype=int)
        sortedMethylGeneIndices[exampleIndex] = sortedMethylIndicesArray
        for geneIndex in range(Methyl.shape[1]):
            MethylRanking[exampleIndex, sortedMethylIndicesArray[geneIndex]] = geneIndex
    logging.debug("Done:\t Getting Sorted Methyl Data")
    logging.debug("Start:\t Getting Binarized Methyl Data")
    k = findClosestPowerOfTwo(9) - 1
    # Reuse a cached factor matrix when present; rebuild (and cache) otherwise.
    try:
        factorizedLeftBaseMatrix = np.genfromtxt(
            path + "factorLeft--n-" + str(methylData.shape[1]) + "--k-" + str(k) + ".csv", delimiter=',')
    except:
        factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path)
    bMethylDset = datasetFile.create_dataset("View0",
                                             (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k),
                                             dtype=np.uint8)
    # Each gene receives the k-bit factor row matching its rank, flattened
    # into one long binary vector per patient.
    for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
        patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8)
        for lineIndex, geneIndex in enumerate(patientSortedArray):
            patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :]
        bMethylDset[patientIndex] = patientMatrix.flatten()
    bMethylDset.attrs["name"] = "BMethyl" + str(k)
    bMethylDset.attrs["sparse"] = False
    bMethylDset.attrs["binary"] = True
    logging.debug("Done:\t Getting Binarized Methyl Data")
    logging.debug("Start:\t Getting Binned Methyl Data")
    # Bin parameters presumably precomputed for this dataset's gene count
    # (see findParams) — TODO confirm they match matching_methyl.csv.
    lenBins = 3298
    nbBins = 9
    overlapping = 463
    try:
        sortedBinsMatrix = np.genfromtxt(
            path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv",
            delimiter=",")
    except:
        sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, methylData.shape[1], path)
    binnedMethyl = datasetFile.create_dataset("View1", (
        sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8)
    # Same flattening scheme as above, with bin membership rows instead of
    # factor rows.
    for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
        patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8)
        for lineIndex, geneIndex in enumerate(patientSortedArray):
            patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :]
        binnedMethyl[patientIndex] = patientMatrix.flatten()
    binnedMethyl.attrs["name"] = "bMethyl" + str(nbBins)
    binnedMethyl.attrs["sparse"] = False
    binnedMethyl.attrs["binary"] = True
    logging.debug("Done:\t Getting Binned Methyl Data")
    logging.debug("Start:\t Getting Binarized Methyl Data")
    # Second binarized view, wider factorization (k = 15).
    k = findClosestPowerOfTwo(17) - 1
    try:
        factorizedLeftBaseMatrix = np.genfromtxt(
            path + "factorLeft--n-" + str(methylData.shape[1]) + "--k-" + str(k) + ".csv", delimiter=',')
    except:
        factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path)
    bMethylDset = datasetFile.create_dataset("View2",
                                             (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k),
                                             dtype=np.uint8)
    for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
        patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8)
        for lineIndex, geneIndex in enumerate(patientSortedArray):
            patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :]
        bMethylDset[patientIndex] = patientMatrix.flatten()
    bMethylDset.attrs["name"] = "BMethyl" + str(k)
    bMethylDset.attrs["sparse"] = False
    bMethylDset.attrs["binary"] = True
    logging.debug("Done:\t Getting Binarized Methyl Data")
    logging.debug("Start:\t Getting Binned Methyl Data")
    # Second binned view with a finer bin grid.
    lenBins = 2038
    nbBins = 16
    overlapping = 442
    try:
        sortedBinsMatrix = np.genfromtxt(
            path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv",
            delimiter=",")
    except:
        sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, methylData.shape[1], path)
    binnedMethyl = datasetFile.create_dataset("View3", (
        sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8)
    for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
        patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8)
        for lineIndex, geneIndex in enumerate(patientSortedArray):
            patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :]
        binnedMethyl[patientIndex] = patientMatrix.flatten()
    binnedMethyl.attrs["name"] = "bMethyl" + str(nbBins)
    binnedMethyl.attrs["sparse"] = False
    binnedMethyl.attrs["binary"] = True
    logging.debug("Done:\t Getting Binned Methyl Data")
    # Labels are the second CSV column of the BRCA triple-negative file.
    labelFile = open(path + 'brca_labels_triple-negatif.csv')
    labels = np.array([int(line.strip().split(',')[1]) for line in labelFile])
    labelsDset = datasetFile.create_dataset("Labels", labels.shape)
    labelsDset[...] = labels
    labelsDset.attrs["name"] = "Labels"
    metaDataGrp = datasetFile.create_group("Metadata")
    metaDataGrp.attrs["nbView"] = 4
    metaDataGrp.attrs["nbClass"] = 2
    metaDataGrp.attrs["datasetLength"] = len(labels)
    labelDictionary = {0: "No", 1: "Yes"}
    datasetFile.close()
    datasetFile = h5py.File(path + "KMultiOmic.hdf5", "r")
    return datasetFile, labelDictionary
def getKMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES):
    """Open the previously generated KMultiOmic HDF5 dataset read-only."""
    labelDictionary = {0: "No", 1: "Yes"}
    return h5py.File(path + "KMultiOmic.hdf5", "r"), labelDictionary
def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES):
datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "w")
logging.debug("Start:\t Getting Methylation Data")
methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',')
methylDset = datasetFile.create_dataset("View0", methylData.shape)
methylDset[...] = methylData
methylDset.attrs["name"] = "Methyl_"
methylDset.attrs["sparse"] = False
methylDset.attrs["binary"] = False
logging.debug("Done:\t Getting Methylation Data")
logging.debug("Start:\t Getting Sorted Methyl Data")
Methyl = datasetFile["View0"][...]
sortedMethylGeneIndices = np.zeros(datasetFile.get("View0").shape, dtype=int)
MethylRanking = np.zeros(datasetFile.get("View0").shape, dtype=int)
for exampleIndex, exampleArray in enumerate(Methyl):
sortedMethylDictionary = dict((index, value) for index, value in enumerate(exampleArray))
sortedMethylIndicesDict = sorted(sortedMethylDictionary.items(), key=operator.itemgetter(1))
sortedMethylIndicesArray = np.array([index for (index, value) in sortedMethylIndicesDict], dtype=int)
sortedMethylGeneIndices[exampleIndex] = sortedMethylIndicesArray
for geneIndex in range(Methyl.shape[1]):
MethylRanking[exampleIndex, sortedMethylIndicesArray[geneIndex]] = geneIndex
mMethylDset = datasetFile.create_dataset("View10", sortedMethylGeneIndices.shape, data=sortedMethylGeneIndices)
mMethylDset.attrs["name"] = "SMethyl"
mMethylDset.attrs["sparse"] = False
mMethylDset.attrs["binary"] = False
logging.debug("Done:\t Getting Sorted Methyl Data")
logging.debug("Start:\t Getting Binarized Methyl Data")
k = findClosestPowerOfTwo(58) - 1
try:
factorizedLeftBaseMatrix = np.genfromtxt(
path + "factorLeft--n-" + str(datasetFile.get("View0").shape[1]) + "--k-" + str(k) + ".csv", delimiter=',')
except:
factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path)
bMethylDset = datasetFile.create_dataset("View11",
(sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k),
dtype=np.uint8)
for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8)
for lineIndex, geneIndex in enumerate(patientSortedArray):
patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :]
bMethylDset[patientIndex] = patientMatrix.flatten()
bMethylDset.attrs["name"] = "BMethyl"
bMethylDset.attrs["sparse"] = False
bMethylDset.attrs["binary"] = True
logging.debug("Done:\t Getting Binarized Methyl Data")
logging.debug("Start:\t Getting Binned Methyl Data")
lenBins = 2095
nbBins = 58
overlapping = 1676
try:
sortedBinsMatrix = np.genfromtxt(
path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv",
delimiter=",")
except:
sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View0").shape[1], path)
binnedMethyl = datasetFile.create_dataset("View12", (
sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8)
for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8)
for lineIndex, geneIndex in enumerate(patientSortedArray):
patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :]
binnedMethyl[patientIndex] = patientMatrix.flatten()
binnedMethyl.attrs["name"] = "bMethyl"
binnedMethyl.attrs["sparse"] = False
binnedMethyl.attrs["binary"] = True
logging.debug("Done:\t Getting Binned Methyl Data")
logging.debug("Start:\t Getting MiRNA Data")
mirnaData = np.genfromtxt(path + "matching_mirna.csv", delimiter=',')
mirnaDset = datasetFile.create_dataset("View1", mirnaData.shape)
mirnaDset[...] = mirnaData
mirnaDset.attrs["name"] = "MiRNA__"
mirnaDset.attrs["sparse"] = False
mirnaDset.attrs["binary"] = False
logging.debug("Done:\t Getting MiRNA Data")
logging.debug("Start:\t Getting Sorted MiRNA Data")
MiRNA = datasetFile["View1"][...]
sortedMiRNAGeneIndices = np.zeros(datasetFile.get("View1").shape, dtype=int)
MiRNARanking = np.zeros(datasetFile.get("View1").shape, dtype=int)
for exampleIndex, exampleArray in enumerate(MiRNA):
sortedMiRNADictionary = dict((index, value) for index, value in enumerate(exampleArray))
sortedMiRNAIndicesDict = sorted(sortedMiRNADictionary.items(), key=operator.itemgetter(1))
sortedMiRNAIndicesArray = np.array([index for (index, value) in sortedMiRNAIndicesDict], dtype=int)
sortedMiRNAGeneIndices[exampleIndex] = sortedMiRNAIndicesArray
for geneIndex in range(MiRNA.shape[1]):
MiRNARanking[exampleIndex, sortedMiRNAIndicesArray[geneIndex]] = geneIndex
mmirnaDset = datasetFile.create_dataset("View7", sortedMiRNAGeneIndices.shape, data=sortedMiRNAGeneIndices)
mmirnaDset.attrs["name"] = "SMiRNA_"
mmirnaDset.attrs["sparse"] = False
mmirnaDset.attrs["binary"] = False
logging.debug("Done:\t Getting Sorted MiRNA Data")
logging.debug("Start:\t Getting Binarized MiRNA Data")
k = findClosestPowerOfTwo(517) - 1
try:
factorizedLeftBaseMatrix = np.genfromtxt(
path + "factorLeft--n-" + str(datasetFile.get("View1").shape[1]) + "--k-" + str(k) + ".csv", delimiter=',')
except:
factorizedLeftBaseMatrix = getBaseMatrices(mirnaData.shape[1], k, path)
bmirnaDset = datasetFile.create_dataset("View8",
(sortedMiRNAGeneIndices.shape[0], sortedMiRNAGeneIndices.shape[1] * k),
dtype=np.uint8)
for patientIndex, patientSortedArray in enumerate(sortedMiRNAGeneIndices):
patientMatrix = np.zeros((sortedMiRNAGeneIndices.shape[1], k), dtype=np.uint8)
for lineIndex, geneIndex in enumerate(patientSortedArray):
patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :]
bmirnaDset[patientIndex] = patientMatrix.flatten()
bmirnaDset.attrs["name"] = "BMiRNA_"
bmirnaDset.attrs["sparse"] = False
bmirnaDset.attrs["binary"] = True
logging.debug("Done:\t Getting Binarized MiRNA Data")
logging.debug("Start:\t Getting Binned MiRNA Data")
lenBins = 14
nbBins = 517
overlapping = 12
try:
sortedBinsMatrix = np.genfromtxt(
path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv",
delimiter=",")
except:
sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View1").shape[1], path)
binnedMiRNA = datasetFile.create_dataset("View9", (
sortedMiRNAGeneIndices.shape[0], sortedMiRNAGeneIndices.shape[1] * nbBins), dtype=np.uint8)
for patientIndex, patientSortedArray in enumerate(sortedMiRNAGeneIndices):
patientMatrix = np.zeros((sortedMiRNAGeneIndices.shape[1], nbBins), dtype=np.uint8)
for lineIndex, geneIndex in enumerate(patientSortedArray):
patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :]
binnedMiRNA[patientIndex] = patientMatrix.flatten()
binnedMiRNA.attrs["name"] = "bMiRNA_"
binnedMiRNA.attrs["sparse"] = False
binnedMiRNA.attrs["binary"] = True
logging.debug("Done:\t Getting Binned MiRNA Data")
logging.debug("Start:\t Getting RNASeq Data")
rnaseqData = np.genfromtxt(path + "matching_rnaseq.csv", delimiter=',')
uselessRows = []
for rowIndex, row in enumerate(np.transpose(rnaseqData)):
if not row.any():
uselessRows.append(rowIndex)
usefulRows = [usefulRowIndex for usefulRowIndex in range(rnaseqData.shape[1]) if usefulRowIndex not in uselessRows]
rnaseqDset = datasetFile.create_dataset("View2", (rnaseqData.shape[0], len(usefulRows)))
rnaseqDset[...] = rnaseqData[:, usefulRows]
rnaseqDset.attrs["name"] = "RNASeq_"
rnaseqDset.attrs["sparse"] = False
rnaseqDset.attrs["binary"] = False
logging.debug("Done:\t Getting RNASeq Data")
logging.debug("Start:\t Getting Sorted RNASeq Data")
RNASeq = datasetFile["View2"][...]
sortedRNASeqGeneIndices = np.zeros(datasetFile.get("View2").shape, dtype=int)
RNASeqRanking = np.zeros(datasetFile.get("View2").shape, dtype=int)
for exampleIndex, exampleArray in enumerate(RNASeq):
sortedRNASeqDictionary = dict((index, value) for index, value in enumerate(exampleArray))
sortedRNASeqIndicesDict = sorted(sortedRNASeqDictionary.items(), key=operator.itemgetter(1))
sortedRNASeqIndicesArray = np.array([index for (index, value) in sortedRNASeqIndicesDict], dtype=int)
sortedRNASeqGeneIndices[exampleIndex] = sortedRNASeqIndicesArray
for geneIndex in range(RNASeq.shape[1]):
RNASeqRanking[exampleIndex, sortedRNASeqIndicesArray[geneIndex]] = geneIndex
mrnaseqDset = datasetFile.create_dataset("View4", sortedRNASeqGeneIndices.shape, data=sortedRNASeqGeneIndices)
mrnaseqDset.attrs["name"] = "SRNASeq"
mrnaseqDset.attrs["sparse"] = False
mrnaseqDset.attrs["binary"] = False
logging.debug("Done:\t Getting Sorted RNASeq Data")
logging.debug("Start:\t Getting Binarized RNASeq Data")
k = findClosestPowerOfTwo(100) - 1
try:
factorizedLeftBaseMatrix = np.genfromtxt(
path + "factorLeft--n-" + str(datasetFile.get("View2").shape[1]) + "--k-" + str(100) + ".csv",
delimiter=',')
except:
factorizedLeftBaseMatrix = getBaseMatrices(rnaseqData.shape[1], k, path)
brnaseqDset = datasetFile.create_dataset("View5",
(sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1] * k),
dtype=np.uint8)
for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices):
patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], k), dtype=np.uint8)
for lineIndex, geneIndex in enumerate(patientSortedArray):
patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :]
brnaseqDset[patientIndex] = patientMatrix.flatten()
brnaseqDset.attrs["name"] = "BRNASeq"
brnaseqDset.attrs["sparse"] = False
brnaseqDset.attrs["binary"] = True
logging.debug("Done:\t Getting Binarized RNASeq Data")
logging.debug("Start:\t Getting Binned RNASeq Data")
lenBins = 986
nbBins = 142
overlapping = 493
try:
sortedBinsMatrix = np.genfromtxt(
path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv",
delimiter=",")
except:
sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View2").shape[1], path)
binnedRNASeq = datasetFile.create_dataset("View6", (
sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1] * nbBins), dtype=np.uint8)
for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices):
patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], nbBins), dtype=np.uint8)
for lineIndex, geneIndex in enumerate(patientSortedArray):
patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :]
binnedRNASeq[patientIndex] = patientMatrix.flatten()
binnedRNASeq.attrs["name"] = "bRNASeq"
binnedRNASeq.attrs["sparse"] = False
binnedRNASeq.attrs["binary"] = True
logging.debug("Done:\t Getting Binned RNASeq Data")
logging.debug("Start:\t Getting Clinical Data")
clinical = np.genfromtxt(path + "clinicalMatrix.csv", delimiter=',')
clinicalDset = datasetFile.create_dataset("View3", clinical.shape)
clinicalDset[...] = clinical
clinicalDset.attrs["name"] = "Clinic_"
clinicalDset.attrs["sparse"] = False
clinicalDset.attrs["binary"] = False
logging.debug("Done:\t Getting Clinical Data")
logging.debug("Start:\t Getting Binarized Clinical Data")
binarized_clinical = np.zeros((347, 1951), dtype=np.uint8)
nb_already_done = 0
for feqtureIndex, feature in enumerate(np.transpose(clinical)):
featureSet = set(feature)
featureDict = dict((val, valIndex) for valIndex, val in enumerate(list(featureSet)))
for valueIndex, value in enumerate(feature):
binarized_clinical[valueIndex, featureDict[value] + nb_already_done] = 1
nb_already_done += len(featureSet)
bClinicalDset = datasetFile.create_dataset("View13", binarized_clinical.shape, dtype=np.uint8,
data=binarized_clinical)
bClinicalDset.attrs["name"] = "bClinic"
bClinicalDset.attrs["sparse"] = False
bClinicalDset.attrs["binary"] = True
logging.debug("Done:\t Getting Binarized Clinical Data")
# logging.debug("Start:\t Getting Adjacence RNASeq Data")
# sparseAdjRNASeq = getAdjacenceMatrix(RNASeqRanking, sortedRNASeqGeneIndices, k=findClosestPowerOfTwo(10)-1)
# sparseAdjRNASeqGrp = datasetFile.create_group("View6")
# dataDset = sparseAdjRNASeqGrp.create_dataset("data", sparseAdjRNASeq.data.shape, data=sparseAdjRNASeq.data)
# indicesDset = sparseAdjRNASeqGrp.create_dataset("indices",
# sparseAdjRNASeq.indices.shape, data=sparseAdjRNASeq.indices)
# indptrDset = sparseAdjRNASeqGrp.create_dataset("indptr",
# sparseAdjRNASeq.indptr.shape, data=sparseAdjRNASeq.indptr)
# sparseAdjRNASeqGrp.attrs["name"]="ARNASeq"
# sparseAdjRNASeqGrp.attrs["sparse"]=True
# sparseAdjRNASeqGrp.attrs["shape"]=sparseAdjRNASeq.shape
# logging.debug("Done:\t Getting Adjacence RNASeq Data")
labelFile = open(path + 'brca_labels_triple-negatif.csv')
labels = np.array([int(line.strip().split(',')[1]) for line in labelFile])
labelsDset = datasetFile.create_dataset("Labels", labels.shape)
labelsDset[...] = labels
labelsDset.attrs["name"] = "Labels"
metaDataGrp = datasetFile.create_group("Metadata")
metaDataGrp.attrs["nbView"] = 14
metaDataGrp.attrs["nbClass"] = 2
metaDataGrp.attrs["datasetLength"] = len(labels)
labelDictionary = {0: "No", 1: "Yes"}
datasetFile.close()
datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "r")
return datasetFile, labelDictionary
def getModifiedMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES):
    """Open the pre-built ModifiedMultiOmic HDF5 dataset read-only.

    Returns the open h5py File handle together with the fixed binary
    label dictionary {0: "No", 1: "Yes"}.  The extra parameters are kept
    for signature compatibility with the other get*DBhdf5 loaders and
    are not used here.
    """
    labelDictionary = {0: "No", 1: "Yes"}
    datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "r")
    return datasetFile, labelDictionary
def getMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES):
    """Open the pre-built MultiOmic HDF5 dataset read-only.

    Returns the open h5py File handle and the binary label dictionary
    {0: "No", 1: "Yes"}.  Unused parameters exist only to match the
    common loader signature.
    """
    labelDictionary = {0: "No", 1: "Yes"}
    datasetFile = h5py.File(path + "MultiOmic.hdf5", "r")
    return datasetFile, labelDictionary
def copyHDF5(pathF, name, nbCores):
    """Duplicate an HDF5 dataset once per core for parallel processing.

    Reads ``pathF + name + ".hdf5"`` and writes ``nbCores`` copies named
    ``pathF + name + str(coreIndex) + ".hdf5"``, each containing every
    top-level dataset of the source file.

    Fix: the source file handle was never closed (and a per-core file
    leaked if copying raised); both are now released via try/finally.
    """
    datasetFile = h5py.File(pathF + name + ".hdf5", "r")
    try:
        for coreIndex in range(nbCores):
            newDataSet = h5py.File(pathF + name + str(coreIndex) + ".hdf5", "w")
            try:
                # h5py's copy duplicates each top-level object into the
                # root group of the per-core file.
                for dataset in datasetFile:
                    datasetFile.copy("/" + dataset, newDataSet["/"])
            finally:
                newDataSet.close()
    finally:
        datasetFile.close()
def datasetsAlreadyExist(pathF, name, nbCores):
    """Return True iff every per-core dataset copy already exists on disk.

    Checks for ``pathF + name + str(coreIndex) + ".hdf5"`` for each core
    index in ``range(nbCores)``.  Vacuously True when nbCores == 0,
    matching the original accumulator behavior.

    Fixes: the loop re-imported os.path on every iteration and
    accumulated a boolean with ``*=`` (yielding an int); ``all(...)``
    short-circuits and returns a real bool, which is truthiness-
    compatible with the previous 0/1 return.
    """
    return all(os.path.isfile(pathF + name + str(coreIndex) + ".hdf5")
               for coreIndex in range(nbCores))
def deleteHDF5(pathF, name, nbCores):
    """Remove the per-core dataset copies created by copyHDF5.

    Deletes ``pathF + name + str(coreIndex) + ".hdf5"`` for every core
    index; raises OSError if any copy is missing, as before.
    """
    copyPaths = (pathF + name + str(copyIndex) + ".hdf5"
                 for copyIndex in range(nbCores))
    for copyPath in copyPaths:
        os.remove(copyPath)
# def getOneViewFromDB(viewName, pathToDB, DBName):
# view = np.genfromtxt(pathToDB + DBName +"-" + viewName, delimiter=';')
# return view
# def getClassLabels(pathToDB, DBName):
# labels = np.genfromtxt(pathToDB + DBName + "-" + "ClassLabels.csv", delimiter=';')
# return labels
# def getDataset(pathToDB, viewNames, DBName):
# dataset = []
# for viewName in viewNames:
# dataset.append(getOneViewFromDB(viewName, pathToDB, DBName))
# return np.array(dataset)
# def getAwaLabels(nbLabels, pathToAwa):
# labelsFile = open(pathToAwa + 'Animals_with_Attributes/classes.txt', 'U')
# linesFile = [''.join(line.strip().split()).translate(None, digits) for line in labelsFile.readlines()]
# return linesFile
# def getAwaDBcsv(views, pathToAwa, nameDB, nbLabels, LABELS_NAMES):
# awaLabels = getAwaLabels(nbLabels, pathToAwa)
# nbView = len(views)
# nbMaxLabels = len(awaLabels)
# if nbLabels == -1:
# nbLabels = nbMaxLabels
# nbNamesGiven = len(LABELS_NAMES)
# if nbNamesGiven > nbLabels:
# labelDictionary = {i:LABELS_NAMES[i] for i in np.arange(nbLabels)}
# elif nbNamesGiven < nbLabels and nbLabels <= nbMaxLabels:
# if LABELS_NAMES != ['']:
# labelDictionary = {i:LABELS_NAMES[i] for i in np.arange(nbNamesGiven)}
# else:
# labelDictionary = {}
# nbNamesGiven = 0
# nbLabelsToAdd = nbLabels-nbNamesGiven
# while nbLabelsToAdd > 0:
# currentLabel = random.choice(awaLabels)
# if currentLabel not in labelDictionary.values():
# labelDictionary[nbLabels-nbLabelsToAdd]=currentLabel
# nbLabelsToAdd -= 1
# else:
# pass
# else:
# labelDictionary = {i: LABELS_NAMES[i] for i in np.arange(nbNamesGiven)}
# viewDictionary = {i: views[i] for i in np.arange(nbView)}
# rawData = []
# labels = []
# nbExample = 0
# for view in np.arange(nbView):
# viewData = []
# for labelIndex in np.arange(nbLabels):
# pathToExamples = pathToAwa + 'Animals_with_Attributes/Features/' + viewDictionary[view] + '/' + \
# labelDictionary[labelIndex] + '/'
# examples = os.listdir(pathToExamples)
# if view == 0:
# nbExample += len(examples)
# for example in examples:
# if viewDictionary[view]=='decaf':
# exampleFile = open(pathToExamples + example)
# viewData.append([float(line.strip()) for line in exampleFile])
# else:
# exampleFile = open(pathToExamples + example)
# viewData.append([[float(coordinate) for coordinate in raw.split()] for raw in exampleFile][0])
# if view == 0:
# labels.append(labelIndex)
#
# rawData.append(np.array(viewData))
# data = rawData
# DATASET_LENGTH = len(labels)
# return data, labels, labelDictionary, DATASET_LENGTH
#
#
# def getDbfromCSV(path):
# files = os.listdir(path)
# DATA = np.zeros((3,40,2))
# for file in files:
# if file[-9:]=='moins.csv' and file[:7]=='sample1':
# X = open(path+file)
# for x, i in zip(X, range(20)):
# DATA[0, i] = np.array([float(coord) for coord in x.strip().split('\t')])
# if file[-9:]=='moins.csv' and file[:7]=='sample2':
# X = open(path+file)
# for x, i in zip(X, range(20)):
# DATA[1, i] = np.array([float(coord) for coord in x.strip().split('\t')])
# if file[-9:]=='moins.csv' and file[:7]=='sample3':
# X = open(path+file)
# for x, i in zip(X, range(20)):
# DATA[2, i] = np.array([float(coord) for coord in x.strip().split('\t')])
#
# for file in files:
# if file[-8:]=='plus.csv' and file[:7]=='sample1':
# X = open(path+file)
# for x, i in zip(X, range(20)):
# DATA[0, i+20] = np.array([float(coord) for coord in x.strip().split('\t')])
# if file[-8:]=='plus.csv' and file[:7]=='sample2':
# X = open(path+file)
# for x, i in zip(X, range(20)):
# DATA[1, i+20] = np.array([float(coord) for coord in x.strip().split('\t')])
# if file[-8:]=='plus.csv' and file[:7]=='sample3':
# X = open(path+file)
# for x, i in zip(X, range(20)):
# DATA[2, i+20] = np.array([float(coord) for coord in x.strip().split('\t')])
# LABELS = np.zeros(40)
# LABELS[:20]=LABELS[:20]+1
# return DATA, LABELS
# def makeArrayFromTriangular(pseudoRNASeqMatrix):
# matrixShape = len(pseudoRNASeqMatrix[0,:])
# exampleArray = np.array(((matrixShape-1)*matrixShape)/2)
# arrayIndex = 0
# for i in range(matrixShape-1):
# for j in range(i+1, matrixShape):
# exampleArray[arrayIndex]=pseudoRNASeqMatrix[i,j]
# arrayIndex += 1
# return exampleArray
# def getPseudoRNASeq(dataset):
# nbGenes = len(dataset["/View2/matrix"][0, :])
# pseudoRNASeq = np.zeros((dataset["/datasetlength"][...], ((nbGenes - 1) * nbGenes) / 2), dtype=bool_)
# for exampleIndex in xrange(dataset["/datasetlength"][...]):
# arrayIndex = 0
# for i in xrange(nbGenes):
# for j in xrange(nbGenes):
# if i > j:
# pseudoRNASeq[exampleIndex, arrayIndex] =
# dataset["/View2/matrix"][exampleIndex, j] < dataset["/View2/matrix"][exampleIndex, i]
# arrayIndex += 1
# dataset["/View4/matrix"] = pseudoRNASeq
# dataset["/View4/name"] = "pseudoRNASeq"
# return dataset
# def allSame(array):
# value = array[0]
# areAllSame = True
# for i in array:
# if i != value:
# areAllSame = False
# return areAllSame