# GetMultiviewDb.py
import logging
import math
import operator
import os
from collections import Counter

import h5py
import numpy as np
from scipy import sparse
# Author-Info
__author__ = "Baptiste Bauvin"
__status__ = "Prototype" # Production, Development, Prototype
def makeMeNoisy(viewData, randomState, percentage=15):
    """Return a noisy uint8 copy of a binary view matrix.

    A number of cells equal to ``percentage`` percent of the matrix size is
    drawn uniformly at random (with replacement, so a cell drawn twice is
    flipped back) and each drawn cell is inverted.
    """
    asBool = viewData.astype(bool)
    nbRows, nbCols = asBool.shape
    nbFlips = int(percentage / 100.0 * nbRows * nbCols)
    rowChoices = range(nbRows)
    colChoices = range(nbCols)
    for _ in range(nbFlips):
        flipRow = randomState.choice(rowChoices)
        flipCol = randomState.choice(colChoices)
        asBool[flipRow, flipCol] = not asBool[flipRow, flipCol]
    return asBool.astype(np.uint8)
def getPlausibleDBhdf5(features, pathF, name, NB_CLASS, LABELS_NAME, nbView=3,
                       nbClass=2, datasetLength=347, randomStateInt=42):
    """Generate a plausible random multiview dataset and store it as HDF5.

    Builds ``nbView`` binary views of 250 features where the first half of
    the examples is mostly zeros (class 0) and the second half mostly ones
    (class 1), with some examples swapped across the boundary and noise
    added. Returns the file reopened read-only and the label dictionary.

    Note: ``features``, ``name``, ``NB_CLASS`` and ``LABELS_NAME`` are
    currently unused, and the stored ``nbClass`` metadata is hard-coded to 2.
    """
    randomState = np.random.RandomState(randomStateInt)
    nbFeatures = 250
    datasetFile = h5py.File(pathF + "Plausible.hdf5", "w")
    # Integer division: on Python 3 "/" yields a float, which range() rejects.
    halfLength = datasetLength // 2
    CLASS_LABELS = np.array([0 for i in range(halfLength)] + [1 for i in range(halfLength)])
    for viewIndex in range(nbView):
        viewData = np.array([np.zeros(nbFeatures) for i in range(halfLength)] +
                            [np.ones(nbFeatures) for i in range(halfLength)])
        # Swap some examples across the class boundary so the classes overlap.
        fakeTrueIndices = randomState.randint(0, halfLength - 1, datasetLength // 5)
        fakeFalseIndices = randomState.randint(halfLength, datasetLength - 1, datasetLength // 5)
        viewData[fakeTrueIndices] = np.ones((len(fakeTrueIndices), nbFeatures))
        viewData[fakeFalseIndices] = np.zeros((len(fakeFalseIndices), nbFeatures))
        viewData = makeMeNoisy(viewData, randomState)
        viewDset = datasetFile.create_dataset("View" + str(viewIndex), viewData.shape, data=viewData.astype(np.uint8))
        viewDset.attrs["name"] = "View" + str(viewIndex)
        viewDset.attrs["sparse"] = False
        viewDset.attrs["binary"] = True
    labelsDset = datasetFile.create_dataset("Labels", CLASS_LABELS.shape)
    labelsDset[...] = CLASS_LABELS
    labelsDset.attrs["name"] = "Labels"
    metaDataGrp = datasetFile.create_group("Metadata")
    metaDataGrp.attrs["nbView"] = nbView
    metaDataGrp.attrs["nbClass"] = 2
    metaDataGrp.attrs["datasetLength"] = len(CLASS_LABELS)
    datasetFile.close()
    datasetFile = h5py.File(pathF + "Plausible.hdf5", "r")
    LABELS_DICTIONARY = {0: "No", 1: "Yes"}
    return datasetFile, LABELS_DICTIONARY
def getFakeDBhdf5(features, pathF, name, NB_CLASS, LABELS_NAME, randomState):
    """Generate a small random multiview dataset ("Fake") and store it as HDF5.

    Creates four views (one binary dense, one sparse CSR, two dense float)
    with random binary labels, writes them to Fake.hdf5 and returns the file
    reopened read-only plus the label dictionary. ``features``, ``name``,
    ``NB_CLASS`` and ``LABELS_NAME`` are unused.
    """
    NB_VIEW = 4
    DATASET_LENGTH = 30
    NB_CLASS = 2
    # randint(low, high + 1) is the documented replacement for the deprecated
    # random_integers(low, high) and draws the same stream.
    VIEW_DIMENSIONS = randomState.randint(5, 20 + 1, NB_VIEW)
    DATA = dict((indx,
                 np.array([
                     randomState.normal(0.0, 2, viewDimension)
                     for i in np.arange(DATASET_LENGTH)]))
                for indx, viewDimension in enumerate(VIEW_DIMENSIONS))
    CLASS_LABELS = randomState.randint(0, NB_CLASS, DATASET_LENGTH)
    datasetFile = h5py.File(pathF + "Fake.hdf5", "w")
    for index, viewData in enumerate(DATA.values()):
        if index == 0:
            # NOTE(review): randint(0, 1, ...) always draws 0 (high is
            # exclusive), so this view is all zeros — looks intentional as a
            # binary placeholder, but confirm.
            viewData = randomState.randint(0, 1, (DATASET_LENGTH, 300)).astype(
                np.uint8)
            viewDset = datasetFile.create_dataset("View" + str(index), viewData.shape)
            viewDset[...] = viewData
            viewDset.attrs["name"] = "View" + str(index)
            viewDset.attrs["sparse"] = False
        elif index == 1:
            # h5py has no native sparse type: store the CSR members separately.
            viewData = sparse.csr_matrix(viewData)
            viewGrp = datasetFile.create_group("View" + str(index))
            dataDset = viewGrp.create_dataset("data", viewData.data.shape, data=viewData.data)
            indicesDset = viewGrp.create_dataset("indices", viewData.indices.shape, data=viewData.indices)
            indptrDset = viewGrp.create_dataset("indptr", viewData.indptr.shape, data=viewData.indptr)
            viewGrp.attrs["name"] = "View" + str(index)
            viewGrp.attrs["sparse"] = True
            viewGrp.attrs["shape"] = viewData.shape
        else:
            viewDset = datasetFile.create_dataset("View" + str(index), viewData.shape)
            viewDset[...] = viewData
            viewDset.attrs["name"] = "View" + str(index)
            viewDset.attrs["sparse"] = False
    labelsDset = datasetFile.create_dataset("Labels", CLASS_LABELS.shape)
    labelsDset[...] = CLASS_LABELS
    labelsDset.attrs["name"] = "Labels"
    metaDataGrp = datasetFile.create_group("Metadata")
    metaDataGrp.attrs["nbView"] = NB_VIEW
    metaDataGrp.attrs["nbClass"] = NB_CLASS
    metaDataGrp.attrs["datasetLength"] = len(CLASS_LABELS)
    LABELS_DICTIONARY = {0: "No", 1: "Yes"}
    datasetFile.close()
    datasetFile = h5py.File(pathF + "Fake.hdf5", "r")
    return datasetFile, LABELS_DICTIONARY
def getLabelSupports(CLASS_LABELS):
    """Count the number of examples of each label.

    Returns the per-label counts and a dict mapping each label to its
    position in the counts list. Labels are processed in sorted order so the
    result is deterministic (plain ``set`` iteration order is unspecified),
    and a single Counter pass replaces the O(n^2) per-label ``.count``.
    """
    labels = sorted(set(CLASS_LABELS))
    counts = Counter(CLASS_LABELS.tolist())
    supports = [counts[label] for label in labels]
    return supports, dict((label, index) for index, label in enumerate(labels))
def isUseful(labelSupports, index, CLASS_LABELS, labelDict):
    """Consume one remaining slot for the label of example ``index``.

    Returns (True, supports) with the label's remaining count decremented
    when it is still positive, (False, supports) unchanged otherwise.
    """
    supportPosition = labelDict[CLASS_LABELS[index]]
    if labelSupports[supportPosition] == 0:
        return False, labelSupports
    labelSupports[supportPosition] -= 1
    return True, labelSupports
def splitDataset(DATASET, LEARNING_RATE, DATASET_LENGTH, randomState):
    """Draw a validation split covering (1 - LEARNING_RATE) of the dataset.

    Returns the sorted list of validation example indices, sampled in a
    class-balanced way from the dataset's stored labels.
    """
    labels = DATASET.get("Labels")[...]
    nbClass = int(DATASET["Metadata"].attrs["nbClass"])
    return sorted(extractRandomTrainingSet(labels, 1 - LEARNING_RATE, DATASET_LENGTH, nbClass, randomState))
def extractRandomTrainingSet(CLASS_LABELS, LEARNING_RATE, DATASET_LENGTH, NB_CLASS, randomState):
    """Draw a class-balanced random subset of example indices.

    Keeps ``int(support * LEARNING_RATE)`` examples per class, drawing
    indices uniformly until every per-class quota is filled.
    """
    labelSupports, labelDict = getLabelSupports(np.array(CLASS_LABELS))
    nbTrainingExamples = [int(support * LEARNING_RATE) for support in labelSupports]
    trainingExamplesIndices = []
    usedIndices = set()  # set: O(1) membership instead of a list scan
    # any() terminates as soon as every quota is zero; the original compared
    # against [0] * NB_CLASS, which never matches (infinite loop) when fewer
    # than NB_CLASS labels are present.
    while any(nbTrainingExamples):
        # randint's upper bound is exclusive: the original DATASET_LENGTH - 1
        # could never draw the last example.
        index = int(randomState.randint(0, DATASET_LENGTH))
        if index not in usedIndices:
            isUseFull, nbTrainingExamples = isUseful(nbTrainingExamples, index, CLASS_LABELS, labelDict)
            if isUseFull:
                trainingExamplesIndices.append(index)
                usedIndices.add(index)
    return trainingExamplesIndices
def getKFoldIndices(nbFolds, CLASS_LABELS, NB_CLASS, learningIndices, randomState):
    """Split ``learningIndices`` into ``nbFolds`` disjoint class-balanced folds.

    Each fold receives ``support // nbFolds`` examples of every class, drawn
    at random without reuse across folds.
    """
    labelSupports, labelDict = getLabelSupports(np.array(CLASS_LABELS[learningIndices]))
    nbTrainingExamples = [[int(support / nbFolds) for support in labelSupports] for fold in range(nbFolds)]
    trainingExamplesIndices = []
    usedIndices = set()  # set: O(1) membership instead of a list scan
    for foldIndex, fold in enumerate(nbTrainingExamples):
        trainingExamplesIndices.append([])
        # any() terminates once every per-class quota reaches zero; the
        # original compared against [0] * NB_CLASS, which never matches
        # (infinite loop) when fewer than NB_CLASS labels are present.
        while any(fold):
            index = randomState.randint(0, len(learningIndices))
            if learningIndices[index] not in usedIndices:
                isUseFull, fold = isUseful(fold, learningIndices[index], CLASS_LABELS, labelDict)
                if isUseFull:
                    trainingExamplesIndices[foldIndex].append(learningIndices[index])
                    usedIndices.add(learningIndices[index])
    return trainingExamplesIndices
def getPositions(labelsUsed, fullLabels):
    """Return the indices of ``fullLabels`` entries that appear in ``labelsUsed``."""
    return [position for position, label in enumerate(fullLabels) if label in labelsUsed]
# def getClassicDBcsv(views, pathF, nameDB, NB_CLASS, LABELS_NAMES, randomState):
# labelsNamesFile = open(pathF + nameDB + '-ClassLabels-Description.csv')
# datasetFile = h5py.File(pathF + nameDB + ".hdf5", "w")
# if len(LABELS_NAMES) != NB_CLASS:
# nbLabelsAvailable = 0
# for l in labelsNamesFile:
# nbLabelsAvailable += 1
# LABELS_NAMES = [line.strip().split(";")[1] for lineIdx, line in enumerate(labelsNamesFile) if
# lineIdx in randomState.randint(nbLabelsAvailable, size=NB_CLASS)]
# fullLabels = np.genfromtxt(pathF + nameDB + '-ClassLabels.csv', delimiter=',').astype(int)
# labelsDictionary = dict((classIndex, labelName) for (classIndex, labelName) in
# [(int(line.strip().split(";")[0]), line.strip().split(";")[1]) for lineIndex, line in
# enumerate(labelsNamesFile) if line.strip().split(";")[0] in LABELS_NAMES])
# if len(set(fullLabels)) > NB_CLASS:
# usedIndices = getPositions(labelsDictionary.keys(), fullLabels)
# else:
# usedIndices = range(len(fullLabels))
# for viewIndex, view in enumerate(views):
# viewFile = pathF + nameDB + "-" + view + '.csv'
# viewMatrix = np.array(np.genfromtxt(viewFile, delimiter=','))[usedIndices, :]
# viewDset = datasetFile.create_dataset("View" + str(viewIndex), viewMatrix.shape, data=viewMatrix)
# viewDset.attrs["name"] = view
# viewDset.attrs["sparse"] = False
# viewDset.attrs["binary"] = False
#
# labelsDset = datasetFile.create_dataset("Labels", fullLabels[usedIndices].shape, data=fullLabels[usedIndices])
# labelsDset.attrs["labels"] = [labelName for index, labelName in labelsDictionary.iteritems()]
# labelsDset.attrs["labels_indices"] = [labelIndex for labelIndex, labelName in labelsDictionary.iteritems()]
#
# metaDataGrp = datasetFile.create_group("Metadata")
# metaDataGrp.attrs["nbView"] = len(views)
# metaDataGrp.attrs["nbClass"] = NB_CLASS
# metaDataGrp.attrs["datasetLength"] = len(fullLabels[usedIndices])
# datasetFile.close()
# datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r")
# return datasetFile, labelsDictionary
def getClassicDBhdf5(views, pathF, nameDB, NB_CLASS, LABELS_NAMES):
    """Open an existing HDF5 multiview dataset read-only.

    The label dictionary is rebuilt from the ``labels_indices`` and
    ``labels`` attributes stored on the "Labels" dataset. ``views``,
    ``NB_CLASS`` and ``LABELS_NAMES`` are unused here.
    """
    datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r")
    labelsAttrs = datasetFile.get("Labels").attrs
    labelsDictionary = dict(zip(labelsAttrs["labels_indices"], labelsAttrs["labels"]))
    return datasetFile, labelsDictionary
def getCaltechDBcsv(views, pathF, nameDB, NB_CLASS, LABELS_NAMES, randomState):
    """Build an HDF5 multiview dataset from the Caltech CSV files.

    Reads ``<nameDB>-ClassLabels-Description.csv`` ("index;name" per line),
    draws NB_CLASS label names at random when the supplied LABELS_NAMES do
    not match, keeps only the examples of the retained labels, stores one
    dataset per view, and returns the file reopened read-only together with
    the {class index: label name} dictionary.
    """
    datasetFile = h5py.File(pathF + nameDB + ".hdf5", "w")
    # Read the description file once up front: the original iterated the open
    # handle several times without seek(0), so every pass after the first saw
    # an exhausted file (and the handle was never closed).
    with open(pathF + nameDB + '-ClassLabels-Description.csv') as labelsNamesFile:
        labelLines = [line.strip().split(";") for line in labelsNamesFile]
    if len(LABELS_NAMES) != NB_CLASS:
        nbLabelsAvailable = len(labelLines)
        LABELS_NAMES = [lineParts[1] for lineIdx, lineParts in enumerate(labelLines) if
                        lineIdx in randomState.randint(nbLabelsAvailable, size=NB_CLASS)]
    fullLabels = np.genfromtxt(pathF + nameDB + '-ClassLabels.csv', delimiter=';').astype(int)
    # Keep the (index, name) pairs whose *name* was retained. The original
    # compared the index field against the name list and iterated the file
    # without enumerate (unpacking each line string into two variables), so
    # it could never build this dictionary correctly.
    labelsDictionary = dict((int(lineParts[0]), lineParts[1]) for lineParts in labelLines
                            if lineParts[1] in LABELS_NAMES)
    if len(set(fullLabels)) > NB_CLASS:
        usedIndices = getPositions(labelsDictionary.keys(), fullLabels)
    else:
        usedIndices = range(len(fullLabels))
    for viewIndex, view in enumerate(views):
        viewFile = pathF + nameDB + "-" + view + '.csv'
        viewMatrix = np.array(np.genfromtxt(viewFile, delimiter=';'))[usedIndices, :]
        viewDset = datasetFile.create_dataset("View" + str(viewIndex), viewMatrix.shape, data=viewMatrix)
        viewDset.attrs["name"] = view
    labelsDset = datasetFile.create_dataset("Labels", fullLabels[usedIndices].shape, data=fullLabels[usedIndices])
    metaDataGrp = datasetFile.create_group("Metadata")
    metaDataGrp.attrs["nbView"] = len(views)
    metaDataGrp.attrs["nbClass"] = NB_CLASS
    metaDataGrp.attrs["datasetLength"] = len(fullLabels[usedIndices])
    datasetFile.close()
    datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r")
    return datasetFile, labelsDictionary
def getMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES, randomState):
    """Build the MultiOmic HDF5 dataset from the matching CSV matrices.

    Loads the methylation, miRNA, RNASeq (dropping all-zero genes) and
    clinical matrices plus the triple-negative BRCA labels, writes everything
    to MultiOmic.hdf5 and returns it reopened read-only with the label
    dictionary. ``features``, ``name``, ``NB_CLASS``, ``LABELS_NAMES`` and
    ``randomState`` are unused.
    """
    datasetFile = h5py.File(path + "MultiOmic.hdf5", "w")
    logging.debug("Start:\t Getting Methylation Data")
    methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',')
    methylDset = datasetFile.create_dataset("View0", methylData.shape)
    methylDset[...] = methylData
    methylDset.attrs["name"] = "Methyl"
    methylDset.attrs["sparse"] = False
    methylDset.attrs["binary"] = False
    logging.debug("Done:\t Getting Methylation Data")
    logging.debug("Start:\t Getting MiRNA Data")
    mirnaData = np.genfromtxt(path + "matching_mirna.csv", delimiter=',')
    mirnaDset = datasetFile.create_dataset("View1", mirnaData.shape)
    mirnaDset[...] = mirnaData
    mirnaDset.attrs["name"] = "MiRNA_"
    mirnaDset.attrs["sparse"] = False
    mirnaDset.attrs["binary"] = False
    logging.debug("Done:\t Getting MiRNA Data")
    logging.debug("Start:\t Getting RNASeq Data")
    rnaseqData = np.genfromtxt(path + "matching_rnaseq.csv", delimiter=',')
    # Keep only the columns (genes) with at least one non-zero value; the
    # vectorized any() replaces the original per-column Python loop.
    usefulColumns = np.where(rnaseqData.any(axis=0))[0]
    rnaseqDset = datasetFile.create_dataset("View2", (rnaseqData.shape[0], len(usefulColumns)))
    rnaseqDset[...] = rnaseqData[:, usefulColumns]
    rnaseqDset.attrs["name"] = "RNASeq_"
    rnaseqDset.attrs["sparse"] = False
    rnaseqDset.attrs["binary"] = False
    logging.debug("Done:\t Getting RNASeq Data")
    logging.debug("Start:\t Getting Clinical Data")
    clinical = np.genfromtxt(path + "clinicalMatrix.csv", delimiter=',')
    clinicalDset = datasetFile.create_dataset("View3", clinical.shape)
    clinicalDset[...] = clinical
    clinicalDset.attrs["name"] = "Clinic"
    clinicalDset.attrs["sparse"] = False
    clinicalDset.attrs["binary"] = False
    logging.debug("Done:\t Getting Clinical Data")
    # "with" guarantees the label file is closed (the original leaked it).
    with open(path + 'brca_labels_triple-negatif.csv') as labelFile:
        labels = np.array([int(line.strip().split(',')[1]) for line in labelFile])
    labelsDset = datasetFile.create_dataset("Labels", labels.shape)
    labelsDset[...] = labels
    labelsDset.attrs["name"] = "Labels"
    metaDataGrp = datasetFile.create_group("Metadata")
    metaDataGrp.attrs["nbView"] = 4
    metaDataGrp.attrs["nbClass"] = 2
    metaDataGrp.attrs["datasetLength"] = len(labels)
    labelDictionary = {0: "No", 1: "Yes"}
    datasetFile.close()
    datasetFile = h5py.File(path + "MultiOmic.hdf5", "r")
    # datasetFile = getPseudoRNASeq(datasetFile)
    return datasetFile, labelDictionary
def getVector(nbGenes):
    """Find the best split point of ``nbGenes`` genes for factorization.

    Maximizes (i + 1) * (nbGenes - j) over split pairs j == i + 1 (first
    maximum wins, as in the original strict comparison) and returns the
    boolean "left" indicator vector together with the sizes of the two
    sub-problems (matrixSup = j, matrixInf = nbGenes - j).
    """
    argmax = [0, 0]
    maxi = 0
    # Only pairs with j == i + 1 ever contribute, so a single O(n) scan
    # replaces the original O(n^2) double loop.
    for i in range(nbGenes - 1):
        j = i + 1
        value = (i + 1) * (nbGenes - j)
        if value > maxi:
            maxi = value
            argmax = [i, j]
    i, j = argmax
    vectorLeft = np.zeros(nbGenes, dtype=bool)
    vectorLeft[:i + 1] = np.ones(i + 1, dtype=bool)
    # The original also built an unused "vectorSup" array; dead code removed.
    matrixSup = j
    matrixInf = nbGenes - j
    return vectorLeft, matrixSup, matrixInf
def findClosestPowerOfTwo(factorizationParam):
    """Return the power of two closest to ``factorizationParam``.

    Ties between the two surrounding powers resolve to the smaller one.
    Floor division keeps the result an int (on Python 3 the original "/"
    returned a float).
    """
    power = 1
    while factorizationParam - power > 0:
        power *= 2
    if abs(factorizationParam - power) < abs(factorizationParam - power // 2):
        return power
    return power // 2
def easyFactorize(nbGenes, factorizationParam, t=0):
    """Recursively build a boolean left-factor matrix for ``nbGenes`` genes.

    ``factorizationParam`` is rounded down to (a power of two) - 1 so the
    recursion halves evenly. Returns the number of columns t and the
    (nbGenes, t) boolean factor; the nbGenes == 2/3 base cases return a
    single indicator vector instead of a matrix.
    """
    if math.log(factorizationParam + 1, 2) % 1 == 0.0:
        pass
    else:
        factorizationParam = findClosestPowerOfTwo(factorizationParam) - 1
    if nbGenes == 2:
        return 1, np.array([True, False])
    if nbGenes == 3:
        return 1, np.array([True, True, False])
    if factorizationParam == 1:
        t = 1
        return t, getVector(nbGenes)[0]
    vectorLeft, matrixSup, matrixInf = getVector(nbGenes)
    # Floor division keeps the parameter an int on Python 3; halving
    # 2^k - 1 gives exactly 2^(k-1) - 1, so no precision is lost.
    t_, vectorLeftSup = easyFactorize(matrixSup, (factorizationParam - 1) // 2, t=t)
    t__, vectorLeftInf = easyFactorize(matrixInf, (factorizationParam - 1) // 2, t=t)
    factorLeft = np.zeros((nbGenes, t_ + t__ + 1), dtype=bool)
    factorLeft[:matrixSup, :t_] = vectorLeftSup.reshape(factorLeft[:matrixSup, :t_].shape)
    # Odd gene counts share the middle gene between both halves.
    if nbGenes % 2 == 1:
        factorLeft[matrixInf - 1:, t_:t__ + t_] = vectorLeftInf.reshape(factorLeft[matrixInf - 1:, t_:t__ + t_].shape)
    else:
        factorLeft[matrixInf:, t_:t__ + t_] = vectorLeftInf.reshape(factorLeft[matrixInf:, t_:t__ + t_].shape)
    factorLeft[:, t__ + t_] = vectorLeft
    return t__ + t_ + 1, factorLeft
def getBaseMatrices(nbGenes, factorizationParam, path):
    """Compute the left factor matrix and cache it as a CSV under ``path``."""
    _, factorLeft = easyFactorize(nbGenes, factorizationParam)
    fileName = "factorLeft--n-" + str(nbGenes) + "--k-" + str(factorizationParam) + ".csv"
    np.savetxt(path + fileName, factorLeft, delimiter=",")
    return factorLeft
def findParams(arrayLen, nbPatients, randomState, maxNbBins=2000, minNbBins=10, maxLenBin=70000, minOverlapping=1,
               minNbBinsOverlapped=0, maxNbSolutions=30):
    """Search (lenBin, overlapping, nbBins) combinations compatible with arrayLen.

    Collects up to ``maxNbSolutions`` parameter triples satisfying
    arrayLen == (nbBins - 1) * (lenBin - overlapping) + lenBin and returns
    one picked at random. Returns None when fewer than ``maxNbSolutions``
    are found before the search space is exhausted (original behavior).
    """
    results = []
    if arrayLen * arrayLen * 10 / 100 > minNbBinsOverlapped * nbPatients:
        for lenBin in range(1, arrayLen):
            if lenBin < maxLenBin and minNbBins * lenBin < arrayLen:
                for overlapping in range(lenBin - 1, 0, -1):
                    if overlapping > minOverlapping and lenBin % (lenBin - overlapping) == 0:
                        for nbBins in range(arrayLen - 1, 0, -1):
                            if nbBins < maxNbBins:
                                if arrayLen == (nbBins - 1) * (lenBin - overlapping) + lenBin:
                                    results.append({"nbBins": nbBins, "overlapping": overlapping, "lenBin": lenBin})
                                    if len(results) == maxNbSolutions:
                                        # randint: np.random.RandomState has no
                                        # randrange (the original raised
                                        # AttributeError here).
                                        return results[randomState.randint(len(results))]
def findBins(nbBins=142, overlapping=493, lenBin=986):
    """Build the list of coordinate indices covered by each sliding bin."""
    step = lenBin - overlapping
    return [[offset + binIndex * step for offset in range(lenBin)]
            for binIndex in range(nbBins)]
def getBins(array, bins, lenBin, overlapping):
    """Map each coordinate of ``array`` to the column index of every bin containing it.

    For each coordinate position found in a bin, emits
    binIndex + coord * len(bins). ``lenBin`` and ``overlapping`` are unused
    but kept for interface compatibility. (The original's ``nbBinsFull``
    counter was never used and has been removed.)
    """
    nbBins = len(bins)
    binnedcoord = []
    for coordIndex, coord in enumerate(array):
        for binIndex, bin_ in enumerate(bins):
            if coordIndex in bin_:
                binnedcoord.append(binIndex + (coord * nbBins))
    return np.array(binnedcoord)
def makeSortedBinsMatrix(nbBins, lenBins, overlapping, arrayLen, path):
    """Build the (arrayLen, nbBins) 0/1 bin-membership matrix and cache it as CSV.

    Column ``b`` is one on the ``lenBins`` rows starting at
    b * (lenBins - overlapping).
    """
    step = lenBins - overlapping
    membership = np.zeros((arrayLen, nbBins), dtype=np.uint8)
    for column in range(nbBins):
        start = column * step
        membership[start:start + lenBins, column] = np.ones(lenBins, dtype=np.uint8)
    fileName = "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv"
    np.savetxt(path + fileName, membership, delimiter=",")
    return membership
def makeSparseTotalMatrix(sortedRNASeq, randomState):
    """Build one sparse binary matrix of binned coordinates for all patients.

    Bin parameters are searched with findParams and each patient's
    coordinates are mapped to bin columns with getBins. Raises TypeError by
    subscripting None if findParams finds no solution (unchanged behavior).
    """
    nbPatients, nbGenes = sortedRNASeq.shape
    params = findParams(nbGenes, nbPatients, randomState)
    nbBins = params["nbBins"]
    overlapping = params["overlapping"]
    lenBin = params["lenBin"]
    bins = findBins(nbBins, overlapping, lenBin)
    # Accumulate every coordinate once and build a single sparse matrix
    # instead of summing one matrix per patient (which is quadratic in the
    # number of stored entries). Duplicate coordinates still sum, exactly as
    # repeated matrix additions did.
    rowIndices = []
    columnIndices = []
    for patientIndex, patient in enumerate(sortedRNASeq):
        patientColumns = getBins(patient, bins, lenBin, overlapping)
        columnIndices.extend(patientColumns)
        rowIndices.extend([patientIndex] * len(patientColumns))
    data = np.ones(len(columnIndices), dtype=bool)
    sparseFull = sparse.csc_matrix((data, (rowIndices, columnIndices)),
                                   shape=(nbPatients, nbGenes * nbBins))
    return sparseFull
def getAdjacenceMatrix(RNASeqRanking, sotredRNASeq, k=2):
    """Build a sparse boolean rank-neighbourhood encoding per patient.

    For each gene position i, links it with the k/2 entries just below and
    just above the value ``sotredRNASeq[patient, i]`` used as a column index
    into ``RNASeqRanking``. Out-of-range lookups on the high side are
    skipped; negative ones wrap around (NumPy semantics, kept from the
    original).
    """
    k = int(k) // 2 * 2  # force an even int; Python 3 "/" would yield a float
    nbPatients, nbGenes = RNASeqRanking.shape
    maxEntries = nbPatients * k * nbGenes
    indices = np.zeros(maxEntries, dtype=int)
    indptr = np.zeros(nbPatients + 1, dtype=int)
    pointer = 0
    for patientIndex in range(nbPatients):
        for i in range(nbGenes):
            for j in range(k // 2):
                # Neighbour ranked below; a negative index wraps silently
                # (original behavior, preserved).
                try:
                    indices[pointer] = RNASeqRanking[
                                           patientIndex, (sotredRNASeq[patientIndex, i] - (j + 1))] + i * nbGenes
                    pointer += 1
                except IndexError:  # narrowed from bare except: only skip out-of-range lookups
                    pass
                # Neighbour ranked above; past-the-end lookups are skipped.
                try:
                    indices[pointer] = RNASeqRanking[
                                           patientIndex, (sotredRNASeq[patientIndex, i] + (j + 1))] + i * nbGenes
                    pointer += 1
                except IndexError:
                    pass
        indptr[patientIndex + 1] = pointer
    # Trim to the entries actually filled so data/indices lengths agree with
    # indptr (the original passed the full pre-allocated arrays).
    data = np.ones(pointer, dtype=bool)
    mat = sparse.csr_matrix((data, indices[:pointer], indptr),
                            shape=(nbPatients, nbGenes * nbGenes), dtype=bool)
    return mat
def getKMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES):
    """Build the KMultiOmic HDF5 dataset from the methylation CSV matrix.

    Creates four derived views of the methylation data — two "binarized"
    views built from a boolean factorization matrix (k = 7 and k = 15 bits
    per gene) and two "binned" views built from sliding-bin membership
    matrices — plus the BRCA triple-negative labels, then returns the file
    reopened read-only with the label dictionary. ``features``, ``name``,
    ``NB_CLASS`` and ``LABELS_NAMES`` are unused.
    """
    datasetFile = h5py.File(path + "KMultiOmic.hdf5", "w")
    # logging.debug("Start:\t Getting Methylation Data")
    methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',')
    logging.debug("Done:\t Getting Methylation Data")
    logging.debug("Start:\t Getting Sorted Methyl Data")
    Methyl = methylData
    # sortedMethylGeneIndices[i] holds example i's gene indices ordered by
    # increasing methylation value; MethylRanking is the inverse permutation.
    sortedMethylGeneIndices = np.zeros(methylData.shape, dtype=int)
    MethylRanking = np.zeros(methylData.shape, dtype=int)
    for exampleIndex, exampleArray in enumerate(Methyl):
        sortedMethylDictionary = dict((index, value) for index, value in enumerate(exampleArray))
        sortedMethylIndicesDict = sorted(sortedMethylDictionary.items(), key=operator.itemgetter(1))
        sortedMethylIndicesArray = np.array([index for (index, value) in sortedMethylIndicesDict], dtype=int)
        sortedMethylGeneIndices[exampleIndex] = sortedMethylIndicesArray
        for geneIndex in range(Methyl.shape[1]):
            MethylRanking[exampleIndex, sortedMethylIndicesArray[geneIndex]] = geneIndex
    logging.debug("Done:\t Getting Sorted Methyl Data")
    logging.debug("Start:\t Getting Binarized Methyl Data")
    k = findClosestPowerOfTwo(9) - 1
    # Reuse a cached factor matrix when present; rebuild (and cache) otherwise.
    try:
        factorizedLeftBaseMatrix = np.genfromtxt(
            path + "factorLeft--n-" + str(methylData.shape[1]) + "--k-" + str(k) + ".csv", delimiter=',')
    except:
        factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path)
    bMethylDset = datasetFile.create_dataset("View0",
                                             (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k),
                                             dtype=np.uint8)
    # Each gene receives the k-bit factor row matching its rank, flattened
    # into one long binary vector per patient.
    for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
        patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8)
        for lineIndex, geneIndex in enumerate(patientSortedArray):
            patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :]
        bMethylDset[patientIndex] = patientMatrix.flatten()
    bMethylDset.attrs["name"] = "BMethyl" + str(k)
    bMethylDset.attrs["sparse"] = False
    bMethylDset.attrs["binary"] = True
    logging.debug("Done:\t Getting Binarized Methyl Data")
    logging.debug("Start:\t Getting Binned Methyl Data")
    # Bin parameters presumably precomputed for this dataset's gene count
    # (see findParams) — TODO confirm they match matching_methyl.csv.
    lenBins = 3298
    nbBins = 9
    overlapping = 463
    try:
        sortedBinsMatrix = np.genfromtxt(
            path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv",
            delimiter=",")
    except:
        sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, methylData.shape[1], path)
    binnedMethyl = datasetFile.create_dataset("View1", (
        sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8)
    # Same flattening scheme as above, with bin membership rows instead of
    # factor rows.
    for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
        patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8)
        for lineIndex, geneIndex in enumerate(patientSortedArray):
            patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :]
        binnedMethyl[patientIndex] = patientMatrix.flatten()
    binnedMethyl.attrs["name"] = "bMethyl" + str(nbBins)
    binnedMethyl.attrs["sparse"] = False
    binnedMethyl.attrs["binary"] = True
    logging.debug("Done:\t Getting Binned Methyl Data")
    logging.debug("Start:\t Getting Binarized Methyl Data")
    # Second binarized view, wider factorization (k = 15).
    k = findClosestPowerOfTwo(17) - 1
    try:
        factorizedLeftBaseMatrix = np.genfromtxt(
            path + "factorLeft--n-" + str(methylData.shape[1]) + "--k-" + str(k) + ".csv", delimiter=',')
    except:
        factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path)
    bMethylDset = datasetFile.create_dataset("View2",
                                             (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k),
                                             dtype=np.uint8)
    for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
        patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8)
        for lineIndex, geneIndex in enumerate(patientSortedArray):
            patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :]
        bMethylDset[patientIndex] = patientMatrix.flatten()
    bMethylDset.attrs["name"] = "BMethyl" + str(k)
    bMethylDset.attrs["sparse"] = False
    bMethylDset.attrs["binary"] = True
    logging.debug("Done:\t Getting Binarized Methyl Data")
    logging.debug("Start:\t Getting Binned Methyl Data")
    # Second binned view with a finer bin grid.
    lenBins = 2038
    nbBins = 16
    overlapping = 442
    try:
        sortedBinsMatrix = np.genfromtxt(
            path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv",
            delimiter=",")
    except:
        sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, methylData.shape[1], path)
    binnedMethyl = datasetFile.create_dataset("View3", (
        sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8)
    for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
        patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8)
        for lineIndex, geneIndex in enumerate(patientSortedArray):
            patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :]
        binnedMethyl[patientIndex] = patientMatrix.flatten()
    binnedMethyl.attrs["name"] = "bMethyl" + str(nbBins)
    binnedMethyl.attrs["sparse"] = False
    binnedMethyl.attrs["binary"] = True
    logging.debug("Done:\t Getting Binned Methyl Data")
    # Labels are the second CSV column of the BRCA triple-negative file.
    labelFile = open(path + 'brca_labels_triple-negatif.csv')
    labels = np.array([int(line.strip().split(',')[1]) for line in labelFile])
    labelsDset = datasetFile.create_dataset("Labels", labels.shape)
    labelsDset[...] = labels
    labelsDset.attrs["name"] = "Labels"
    metaDataGrp = datasetFile.create_group("Metadata")
    metaDataGrp.attrs["nbView"] = 4
    metaDataGrp.attrs["nbClass"] = 2
    metaDataGrp.attrs["datasetLength"] = len(labels)
    labelDictionary = {0: "No", 1: "Yes"}
    datasetFile.close()
    datasetFile = h5py.File(path + "KMultiOmic.hdf5", "r")
    return datasetFile, labelDictionary
def getKMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES):
    """Open the previously generated KMultiOmic HDF5 dataset read-only."""
    labelDictionary = {0: "No", 1: "Yes"}
    return h5py.File(path + "KMultiOmic.hdf5", "r"), labelDictionary
def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES):
datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "w")
logging.debug("Start:\t Getting Methylation Data")
methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',')
methylDset = datasetFile.create_dataset("View0", methylData.shape)
methylDset[...] = methylData
methylDset.attrs["name"] = "Methyl_"
methylDset.attrs["sparse"] = False
methylDset.attrs["binary"] = False
logging.debug("Done:\t Getting Methylation Data")
logging.debug("Start:\t Getting Sorted Methyl Data")
Methyl = datasetFile["View0"][...]
sortedMethylGeneIndices = np.zeros(datasetFile.get("View0").shape, dtype=int)
MethylRanking = np.zeros(datasetFile.get("View0").shape, dtype=int)
for exampleIndex, exampleArray in enumerate(Methyl):
sortedMethylDictionary = dict((index, value) for index, value in enumerate(exampleArray))
sortedMethylIndicesDict = sorted(sortedMethylDictionary.items(), key=operator.itemgetter(1))
sortedMethylIndicesArray = np.array([index for (index, value) in sortedMethylIndicesDict], dtype=int)
sortedMethylGeneIndices[exampleIndex] = sortedMethylIndicesArray
for geneIndex in range(Methyl.shape[1]):
MethylRanking[exampleIndex, sortedMethylIndicesArray[geneIndex]] = geneIndex
mMethylDset = datasetFile.create_dataset("View10", sortedMethylGeneIndices.shape, data=sortedMethylGeneIndices)
mMethylDset.attrs["name"] = "SMethyl"
mMethylDset.attrs["sparse"] = False
mMethylDset.attrs["binary"] = False
logging.debug("Done:\t Getting Sorted Methyl Data")
logging.debug("Start:\t Getting Binarized Methyl Data")
k = findClosestPowerOfTwo(58) - 1
try:
factorizedLeftBaseMatrix = np.genfromtxt(
path + "factorLeft--n-" + str(datasetFile.get("View0").shape[1]) + "--k-" + str(k) + ".csv", delimiter=',')
except:
factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path)
bMethylDset = datasetFile.create_dataset("View11",
(sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k),
dtype=np.uint8)
for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8)
for lineIndex, geneIndex in enumerate(patientSortedArray):
patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :]
bMethylDset[patientIndex] = patientMatrix.flatten()
bMethylDset.attrs["name"] = "BMethyl"
bMethylDset.attrs["sparse"] = False
bMethylDset.attrs["binary"] = True
logging.debug("Done:\t Getting Binarized Methyl Data")
logging.debug("Start:\t Getting Binned Methyl Data")
lenBins = 2095
nbBins = 58
overlapping = 1676
try:
sortedBinsMatrix = np.genfromtxt(
path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv",
delimiter=",")
except:
sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View0").shape[1], path)
binnedMethyl = datasetFile.create_dataset("View12", (
sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8)
for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8)
for lineIndex, geneIndex in enumerate(patientSortedArray):
patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :]
binnedMethyl[patientIndex] = patientMatrix.flatten()
binnedMethyl.attrs["name"] = "bMethyl"
binnedMethyl.attrs["sparse"] = False
binnedMethyl.attrs["binary"] = True
logging.debug("Done:\t Getting Binned Methyl Data")
logging.debug("Start:\t Getting MiRNA Data")
mirnaData = np.genfromtxt(path + "matching_mirna.csv", delimiter=',')
mirnaDset = datasetFile.create_dataset("View1", mirnaData.shape)
mirnaDset[...] = mirnaData
mirnaDset.attrs["name"] = "MiRNA__"
mirnaDset.attrs["sparse"] = False
mirnaDset.attrs["binary"] = False
logging.debug("Done:\t Getting MiRNA Data")
logging.debug("Start:\t Getting Sorted MiRNA Data")
MiRNA = datasetFile["View1"][...]
sortedMiRNAGeneIndices = np.zeros(datasetFile.get("View1").shape, dtype=int)
MiRNARanking = np.zeros(datasetFile.get("View1").shape, dtype=int)
for exampleIndex, exampleArray in enumerate(MiRNA):
sortedMiRNADictionary = dict((index, value) for index, value in enumerate(exampleArray))
sortedMiRNAIndicesDict = sorted(sortedMiRNADictionary.items(), key=operator.itemgetter(1))
sortedMiRNAIndicesArray = np.array([index for (index, value) in sortedMiRNAIndicesDict], dtype=int)
sortedMiRNAGeneIndices[exampleIndex] = sortedMiRNAIndicesArray
for geneIndex in range(MiRNA.shape[1]):
MiRNARanking[exampleIndex, sortedMiRNAIndicesArray[geneIndex]] = geneIndex
mmirnaDset = datasetFile.create_dataset("View7", sortedMiRNAGeneIndices.shape, data=sortedMiRNAGeneIndices)
mmirnaDset.attrs["name"] = "SMiRNA_"
mmirnaDset.attrs["sparse"] = False
mmirnaDset.attrs["binary"] = False
logging.debug("Done:\t Getting Sorted MiRNA Data")
logging.debug("Start:\t Getting Binarized MiRNA Data")
k = findClosestPowerOfTwo(517) - 1
try:
factorizedLeftBaseMatrix = np.genfromtxt(
path + "factorLeft--n-" + str(datasetFile.get("View1").shape[1]) + "--k-" + str(k) + ".csv", delimiter=',')
except:
factorizedLeftBaseMatrix = getBaseMatrices(mirnaData.shape[1], k, path)
bmirnaDset = datasetFile.create_dataset("View8",
(sortedMiRNAGeneIndices.shape[0], sortedMiRNAGeneIndices.shape[1] * k),
dtype=np.uint8)
for patientIndex, patientSortedArray in enumerate(sortedMiRNAGeneIndices):
patientMatrix = np.zeros((sortedMiRNAGeneIndices.shape[1], k), dtype=np.uint8)
for lineIndex, geneIndex in enumerate(patientSortedArray):
patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :]
bmirnaDset[patientIndex] = patientMatrix.flatten()
bmirnaDset.attrs["name"] = "BMiRNA_"
bmirnaDset.attrs["sparse"] = False
bmirnaDset.attrs["binary"] = True
logging.debug("Done:\t Getting Binarized MiRNA Data")
logging.debug("Start:\t Getting Binned MiRNA Data")
lenBins = 14
nbBins = 517
overlapping = 12
try:
sortedBinsMatrix = np.genfromtxt(
path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv",
delimiter=",")
except:
sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View1").shape[1], path)
binnedMiRNA = datasetFile.create_dataset("View9", (
sortedMiRNAGeneIndices.shape[0], sortedMiRNAGeneIndices.shape[1] * nbBins), dtype=np.uint8)
for patientIndex, patientSortedArray in enumerate(sortedMiRNAGeneIndices):
patientMatrix = np.zeros((sortedMiRNAGeneIndices.shape[1], nbBins), dtype=np.uint8)
for lineIndex, geneIndex in enumerate(patientSortedArray):
patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :]
binnedMiRNA[patientIndex] = patientMatrix.flatten()
binnedMiRNA.attrs["name"] = "bMiRNA_"
binnedMiRNA.attrs["sparse"] = False
binnedMiRNA.attrs["binary"] = True
logging.debug("Done:\t Getting Binned MiRNA Data")
logging.debug("Start:\t Getting RNASeq Data")
rnaseqData = np.genfromtxt(path + "matching_rnaseq.csv", delimiter=',')
uselessRows = []
for rowIndex, row in enumerate(np.transpose(rnaseqData)):
if not row.any():
uselessRows.append(rowIndex)
usefulRows = [usefulRowIndex for usefulRowIndex in range(rnaseqData.shape[1]) if usefulRowIndex not in uselessRows]
rnaseqDset = datasetFile.create_dataset("View2", (rnaseqData.shape[0], len(usefulRows)))
rnaseqDset[...] = rnaseqData[:, usefulRows]
rnaseqDset.attrs["name"] = "RNASeq_"
rnaseqDset.attrs["sparse"] = False
rnaseqDset.attrs["binary"] = False
logging.debug("Done:\t Getting RNASeq Data")
logging.debug("Start:\t Getting Sorted RNASeq Data")
RNASeq = datasetFile["View2"][...]
sortedRNASeqGeneIndices = np.zeros(datasetFile.get("View2").shape, dtype=int)
RNASeqRanking = np.zeros(datasetFile.get("View2").shape, dtype=int)
for exampleIndex, exampleArray in enumerate(RNASeq):
sortedRNASeqDictionary = dict((index, value) for index, value in enumerate(exampleArray))
sortedRNASeqIndicesDict = sorted(sortedRNASeqDictionary.items(), key=operator.itemgetter(1))
sortedRNASeqIndicesArray = np.array([index for (index, value) in sortedRNASeqIndicesDict], dtype=int)
sortedRNASeqGeneIndices[exampleIndex] = sortedRNASeqIndicesArray
for geneIndex in range(RNASeq.shape[1]):
RNASeqRanking[exampleIndex, sortedRNASeqIndicesArray[geneIndex]] = geneIndex
mrnaseqDset = datasetFile.create_dataset("View4", sortedRNASeqGeneIndices.shape, data=sortedRNASeqGeneIndices)
mrnaseqDset.attrs["name"] = "SRNASeq"
mrnaseqDset.attrs["sparse"] = False
mrnaseqDset.attrs["binary"] = False
logging.debug("Done:\t Getting Sorted RNASeq Data")
logging.debug("Start:\t Getting Binarized RNASeq Data")
k = findClosestPowerOfTwo(100) - 1
try:
factorizedLeftBaseMatrix = np.genfromtxt(
path + "factorLeft--n-" + str(datasetFile.get("View2").shape[1]) + "--k-" + str(100) + ".csv",
delimiter=',')
except:
factorizedLeftBaseMatrix = getBaseMatrices(rnaseqData.shape[1], k, path)
brnaseqDset = datasetFile.create_dataset("View5",
(sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1] * k),
dtype=np.uint8)
for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices):
patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], k), dtype=np.uint8)
for lineIndex, geneIndex in enumerate(patientSortedArray):
patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :]
brnaseqDset[patientIndex] = patientMatrix.flatten()
brnaseqDset.attrs["name"] = "BRNASeq"
brnaseqDset.attrs["sparse"] = False
brnaseqDset.attrs["binary"] = True
logging.debug("Done:\t Getting Binarized RNASeq Data")
logging.debug("Start:\t Getting Binned RNASeq Data")
lenBins = 986
nbBins = 142
overlapping = 493
try:
sortedBinsMatrix = np.genfromtxt(
path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv",
delimiter=",")
except:
sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View2").shape[1], path)
binnedRNASeq = datasetFile.create_dataset("View6", (
sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1] * nbBins), dtype=np.uint8)
for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices):
patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], nbBins), dtype=np.uint8)
for lineIndex, geneIndex in enumerate(patientSortedArray):
patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :]
binnedRNASeq[patientIndex] = patientMatrix.flatten()
binnedRNASeq.attrs["name"] = "bRNASeq"
binnedRNASeq.attrs["sparse"] = False
binnedRNASeq.attrs["binary"] = True
logging.debug("Done:\t Getting Binned RNASeq Data")
logging.debug("Start:\t Getting Clinical Data")
clinical = np.genfromtxt(path + "clinicalMatrix.csv", delimiter=',')
clinicalDset = datasetFile.create_dataset("View3", clinical.shape)
clinicalDset[...] = clinical
clinicalDset.attrs["name"] = "Clinic_"
clinicalDset.attrs["sparse"] = False
clinicalDset.attrs["binary"] = False
logging.debug("Done:\t Getting Clinical Data")
logging.debug("Start:\t Getting Binarized Clinical Data")
binarized_clinical = np.zeros((347, 1951), dtype=np.uint8)
nb_already_done = 0
for feqtureIndex, feature in enumerate(np.transpose(clinical)):
featureSet = set(feature)
featureDict = dict((val, valIndex) for valIndex, val in enumerate(list(featureSet)))
for valueIndex, value in enumerate(feature):
binarized_clinical[valueIndex, featureDict[value] + nb_already_done] = 1
nb_already_done += len(featureSet)
bClinicalDset = datasetFile.create_dataset("View13", binarized_clinical.shape, dtype=np.uint8,
data=binarized_clinical)
bClinicalDset.attrs["name"] = "bClinic"
bClinicalDset.attrs["sparse"] = False
bClinicalDset.attrs["binary"] = True
logging.debug("Done:\t Getting Binarized Clinical Data")
# logging.debug("Start:\t Getting Adjacence RNASeq Data")
# sparseAdjRNASeq = getAdjacenceMatrix(RNASeqRanking, sortedRNASeqGeneIndices, k=findClosestPowerOfTwo(10)-1)
# sparseAdjRNASeqGrp = datasetFile.create_group("View6")
# dataDset = sparseAdjRNASeqGrp.create_dataset("data", sparseAdjRNASeq.data.shape, data=sparseAdjRNASeq.data)
# indicesDset = sparseAdjRNASeqGrp.create_dataset("indices",
# sparseAdjRNASeq.indices.shape, data=sparseAdjRNASeq.indices)
# indptrDset = sparseAdjRNASeqGrp.create_dataset("indptr",
# sparseAdjRNASeq.indptr.shape, data=sparseAdjRNASeq.indptr)
# sparseAdjRNASeqGrp.attrs["name"]="ARNASeq"
# sparseAdjRNASeqGrp.attrs["sparse"]=True
# sparseAdjRNASeqGrp.attrs["shape"]=sparseAdjRNASeq.shape
# logging.debug("Done:\t Getting Adjacence RNASeq Data")
labelFile = open(path + 'brca_labels_triple-negatif.csv')
labels = np.array([int(line.strip().split(',')[1]) for line in labelFile])
labelsDset = datasetFile.create_dataset("Labels", labels.shape)
labelsDset[...] = labels
labelsDset.attrs["name"] = "Labels"
metaDataGrp = datasetFile.create_group("Metadata")
metaDataGrp.attrs["nbView"] = 14
metaDataGrp.attrs["nbClass"] = 2
metaDataGrp.attrs["datasetLength"] = len(labels)
labelDictionary = {0: "No", 1: "Yes"}
datasetFile.close()
datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "r")
return datasetFile, labelDictionary
def getModifiedMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES):
    """Open the pre-built ModifiedMultiOmic HDF5 dataset read-only.

    Returns the open h5py File handle together with the fixed binary
    label dictionary {0: "No", 1: "Yes"}.  The extra parameters are kept
    for signature compatibility with the other get*DBhdf5 loaders and
    are not used here.
    """
    labelDictionary = {0: "No", 1: "Yes"}
    datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "r")
    return datasetFile, labelDictionary
def getMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES):
    """Open the pre-built MultiOmic HDF5 dataset read-only.

    Returns the open h5py File handle and the binary label dictionary
    {0: "No", 1: "Yes"}.  Unused parameters exist only to match the
    common loader signature.
    """
    labelDictionary = {0: "No", 1: "Yes"}
    datasetFile = h5py.File(path + "MultiOmic.hdf5", "r")
    return datasetFile, labelDictionary
def copyHDF5(pathF, name, nbCores):
    """Duplicate an HDF5 dataset once per core for parallel processing.

    Reads ``pathF + name + ".hdf5"`` and writes ``nbCores`` copies named
    ``pathF + name + str(coreIndex) + ".hdf5"``, each containing every
    top-level dataset of the source file.

    Fix: the source file handle was never closed (and a per-core file
    leaked if copying raised); both are now released via try/finally.
    """
    datasetFile = h5py.File(pathF + name + ".hdf5", "r")
    try:
        for coreIndex in range(nbCores):
            newDataSet = h5py.File(pathF + name + str(coreIndex) + ".hdf5", "w")
            try:
                # h5py's copy duplicates each top-level object into the
                # root group of the per-core file.
                for dataset in datasetFile:
                    datasetFile.copy("/" + dataset, newDataSet["/"])
            finally:
                newDataSet.close()
    finally:
        datasetFile.close()
def datasetsAlreadyExist(pathF, name, nbCores):
    """Return True iff every per-core dataset copy already exists on disk.

    Checks for ``pathF + name + str(coreIndex) + ".hdf5"`` for each core
    index in ``range(nbCores)``.  Vacuously True when nbCores == 0,
    matching the original accumulator behavior.

    Fixes: the loop re-imported os.path on every iteration and
    accumulated a boolean with ``*=`` (yielding an int); ``all(...)``
    short-circuits and returns a real bool, which is truthiness-
    compatible with the previous 0/1 return.
    """
    return all(os.path.isfile(pathF + name + str(coreIndex) + ".hdf5")
               for coreIndex in range(nbCores))
def deleteHDF5(pathF, name, nbCores):
    """Remove the per-core dataset copies created by copyHDF5.

    Deletes ``pathF + name + str(coreIndex) + ".hdf5"`` for every core
    index; raises OSError if any copy is missing, as before.
    """
    copyPaths = (pathF + name + str(copyIndex) + ".hdf5"
                 for copyIndex in range(nbCores))
    for copyPath in copyPaths:
        os.remove(copyPath)
# def getOneViewFromDB(viewName, pathToDB, DBName):
# view = np.genfromtxt(pathToDB + DBName +"-" + viewName, delimiter=';')
# return view
# def getClassLabels(pathToDB, DBName):
# labels = np.genfromtxt(pathToDB + DBName + "-" + "ClassLabels.csv", delimiter=';')
# return labels
# def getDataset(pathToDB, viewNames, DBName):
# dataset = []
# for viewName in viewNames:
# dataset.append(getOneViewFromDB(viewName, pathToDB, DBName))
# return np.array(dataset)
# def getAwaLabels(nbLabels, pathToAwa):
# labelsFile = open(pathToAwa + 'Animals_with_Attributes/classes.txt', 'U')
# linesFile = [''.join(line.strip().split()).translate(None, digits) for line in labelsFile.readlines()]
# return linesFile
# def getAwaDBcsv(views, pathToAwa, nameDB, nbLabels, LABELS_NAMES):
# awaLabels = getAwaLabels(nbLabels, pathToAwa)
# nbView = len(views)
# nbMaxLabels = len(awaLabels)
# if nbLabels == -1:
# nbLabels = nbMaxLabels
# nbNamesGiven = len(LABELS_NAMES)
# if nbNamesGiven > nbLabels:
# labelDictionary = {i:LABELS_NAMES[i] for i in np.arange(nbLabels)}
# elif nbNamesGiven < nbLabels and nbLabels <= nbMaxLabels:
# if LABELS_NAMES != ['']:
# labelDictionary = {i:LABELS_NAMES[i] for i in np.arange(nbNamesGiven)}
# else:
# labelDictionary = {}
# nbNamesGiven = 0
# nbLabelsToAdd = nbLabels-nbNamesGiven
# while nbLabelsToAdd > 0:
# currentLabel = random.choice(awaLabels)
# if currentLabel not in labelDictionary.values():
# labelDictionary[nbLabels-nbLabelsToAdd]=currentLabel
# nbLabelsToAdd -= 1
# else:
# pass
# else:
# labelDictionary = {i: LABELS_NAMES[i] for i in np.arange(nbNamesGiven)}
# viewDictionary = {i: views[i] for i in np.arange(nbView)}
# rawData = []
# labels = []
# nbExample = 0
# for view in np.arange(nbView):
# viewData = []
# for labelIndex in np.arange(nbLabels):
# pathToExamples = pathToAwa + 'Animals_with_Attributes/Features/' + viewDictionary[view] + '/' + \
# labelDictionary[labelIndex] + '/'
# examples = os.listdir(pathToExamples)
# if view == 0:
# nbExample += len(examples)
# for example in examples:
# if viewDictionary[view]=='decaf':
# exampleFile = open(pathToExamples + example)
# viewData.append([float(line.strip()) for line in exampleFile])
# else:
# exampleFile = open(pathToExamples + example)
# viewData.append([[float(coordinate) for coordinate in raw.split()] for raw in exampleFile][0])
# if view == 0:
# labels.append(labelIndex)
#
# rawData.append(np.array(viewData))
# data = rawData
# DATASET_LENGTH = len(labels)
# return data, labels, labelDictionary, DATASET_LENGTH
#
#
# def getDbfromCSV(path):
# files = os.listdir(path)
# DATA = np.zeros((3,40,2))
# for file in files:
# if file[-9:]=='moins.csv' and file[:7]=='sample1':
# X = open(path+file)
# for x, i in zip(X, range(20)):
# DATA[0, i] = np.array([float(coord) for coord in x.strip().split('\t')])
# if file[-9:]=='moins.csv' and file[:7]=='sample2':
# X = open(path+file)
# for x, i in zip(X, range(20)):
# DATA[1, i] = np.array([float(coord) for coord in x.strip().split('\t')])
# if file[-9:]=='moins.csv' and file[:7]=='sample3':
# X = open(path+file)
# for x, i in zip(X, range(20)):
# DATA[2, i] = np.array([float(coord) for coord in x.strip().split('\t')])
#
# for file in files:
# if file[-8:]=='plus.csv' and file[:7]=='sample1':
# X = open(path+file)
# for x, i in zip(X, range(20)):
# DATA[0, i+20] = np.array([float(coord) for coord in x.strip().split('\t')])
# if file[-8:]=='plus.csv' and file[:7]=='sample2':
# X = open(path+file)
# for x, i in zip(X, range(20)):
# DATA[1, i+20] = np.array([float(coord) for coord in x.strip().split('\t')])
# if file[-8:]=='plus.csv' and file[:7]=='sample3':
# X = open(path+file)
# for x, i in zip(X, range(20)):
# DATA[2, i+20] = np.array([float(coord) for coord in x.strip().split('\t')])
# LABELS = np.zeros(40)
# LABELS[:20]=LABELS[:20]+1
# return DATA, LABELS
# def makeArrayFromTriangular(pseudoRNASeqMatrix):
# matrixShape = len(pseudoRNASeqMatrix[0,:])
# exampleArray = np.array(((matrixShape-1)*matrixShape)/2)
# arrayIndex = 0
# for i in range(matrixShape-1):
# for j in range(i+1, matrixShape):
# exampleArray[arrayIndex]=pseudoRNASeqMatrix[i,j]
# arrayIndex += 1
# return exampleArray
# def getPseudoRNASeq(dataset):
# nbGenes = len(dataset["/View2/matrix"][0, :])
# pseudoRNASeq = np.zeros((dataset["/datasetlength"][...], ((nbGenes - 1) * nbGenes) / 2), dtype=bool_)
# for exampleIndex in xrange(dataset["/datasetlength"][...]):
# arrayIndex = 0
# for i in xrange(nbGenes):
# for j in xrange(nbGenes):
# if i > j:
# pseudoRNASeq[exampleIndex, arrayIndex] =
# dataset["/View2/matrix"][exampleIndex, j] < dataset["/View2/matrix"][exampleIndex, i]
# arrayIndex += 1
# dataset["/View4/matrix"] = pseudoRNASeq
# dataset["/View4/name"] = "pseudoRNASeq"
# return dataset
# def allSame(array):
# value = array[0]
# areAllSame = True
# for i in array:
# if i != value:
# areAllSame = False
# return areAllSame