diff --git a/Code/MonoMultiViewClassifiers/utils/Dataset.py b/Code/MonoMultiViewClassifiers/utils/Dataset.py
index 7c74fc76664c5612ed903657f8bebfa6254c7fcb..75866388d0b1f2210a8396b64fb52385bbdbb435 100644
--- a/Code/MonoMultiViewClassifiers/utils/Dataset.py
+++ b/Code/MonoMultiViewClassifiers/utils/Dataset.py
@@ -10,6 +10,7 @@ from . import GetMultiviewDb as DB
 def getV(DATASET, viewIndex, usedIndices=None):
+    """Used to extract a view as a numpy array or a sparse matrix from the HDF5 dataset"""
     if usedIndices is None:
         usedIndices = range(DATASET.get("Metadata").attrs["datasetLength"])
     if type(usedIndices) is int:
@@ -32,6 +33,7 @@ def getV(DATASET, viewIndex, usedIndices=None):
 def getShape(DATASET, viewIndex):
+    """Used to get a view's shape even if it's sparse"""
     if not DATASET.get("View" + str(viewIndex)).attrs["sparse"]:
         return DATASET.get("View" + str(viewIndex)).shape
     else:
@@ -39,6 +41,7 @@ def getShape(DATASET, viewIndex):
 def getValue(DATASET):
+    """Used to get the value of a view in the HDF5 dataset even if it's sparse"""
     if not DATASET.attrs["sparse"]:
         return DATASET.value
     else:
@@ -50,6 +53,7 @@ def getValue(DATASET):
 def extractSubset(matrix, usedIndices):
+    """Used to extract a subset of a matrix even if it's sparse"""
     if sparse.issparse(matrix):
         newIndptr = np.zeros(len(usedIndices) + 1, dtype=int)
         oldindptr = matrix.indptr
@@ -69,8 +73,7 @@ def extractSubset(matrix, usedIndices):
 def initMultipleDatasets(args, nbCores):
-    """Used to create copies of the dataset if multicore computation is used
-    Needs arg.pathF and arg.name"""
+    """Used to create copies of the dataset if multicore computation is used"""
     if nbCores > 1:
         if DB.datasetsAlreadyExist(args.pathF, args.name, nbCores):
             logging.debug("Info:\t Enough copies of the dataset are already available")
@@ -90,6 +93,7 @@ def initMultipleDatasets(args, nbCores):
 def confirm(resp=True, timeout=15):
+    """Used to process the user's answer to the confirmation prompt"""
     ans = input_(timeout)
     if not ans:
         return resp
@@ -102,6 +106,7 @@ def confirm(resp=True, timeout=15):
 def input_(timeout=15):
+    """Used as a timed prompt so the user can stop the script if too much HDD space would be used"""
     print("You have " + str(timeout) + " seconds to stop the script by typing n")
     i, o, e = select.select([sys.stdin], [], [], timeout)
     if i:
diff --git a/Code/MonoMultiViewClassifiers/utils/GetMultiviewDb.py b/Code/MonoMultiViewClassifiers/utils/GetMultiviewDb.py
index 28139df754f29c2ccc1fd6f728c2e7c3feb455a0..044620afa87a49a7f6f38b7b2d07fe4ea122665b 100644
--- a/Code/MonoMultiViewClassifiers/utils/GetMultiviewDb.py
+++ b/Code/MonoMultiViewClassifiers/utils/GetMultiviewDb.py
@@ -13,6 +13,7 @@ __status__ = "Prototype" # Production, Development, Prototype
 def makeMeNoisy(viewData, randomState, percentage=15):
+    """Used to introduce some noise into the generated data"""
     viewData = viewData.astype(bool)
     nbNoisyCoord = int(percentage / 100.0 * viewData.shape[0] * viewData.shape[1])
     rows = range(viewData.shape[0])
@@ -27,6 +28,7 @@ def makeMeNoisy(viewData, randomState, percentage=15):
 def getPlausibleDBhdf5(features, pathF, name, NB_CLASS, LABELS_NAME, nbView=3,
                        nbClass=2, datasetLength=347, randomStateInt=42):
+    """Used to generate a plausible dataset to test the algorithms"""
     randomState = np.random.RandomState(randomStateInt)
     nbFeatures = 250
     if not os.path.exists(os.path.dirname(pathF + "Plausible.hdf5")):
@@ -65,6 +67,7 @@ def getPlausibleDBhdf5(features, pathF, name, NB_CLASS, LABELS_NAME, nbView=3,
 def getFakeDBhdf5(features, pathF, name, NB_CLASS, LABELS_NAMES, randomState):
+    """Was used to generate a fake dataset to run
tests""" NB_VIEW = 4 DATASET_LENGTH = 30 NB_CLASS = 2 @@ -115,70 +118,72 @@ def getFakeDBhdf5(features, pathF, name, NB_CLASS, LABELS_NAME, randomState): return datasetFile, LABELS_DICTIONARY -def getLabelSupports(CLASS_LABELS): - labels = set(CLASS_LABELS) - supports = [CLASS_LABELS.tolist().count(label) for label in labels] - return supports, dict((label, index) for label, index in zip(labels, range(len(labels)))) +# def getLabelSupports(CLASS_LABELS): +# """Used to get the number of example for each label""" +# labels = set(CLASS_LABELS) +# supports = [CLASS_LABELS.tolist().count(label) for label in labels] +# return supports, dict((label, index) for label, index in zip(labels, range(len(labels)))) -def isUseful(labelSupports, index, CLASS_LABELS, labelDict): - if labelSupports[labelDict[CLASS_LABELS[index]]] != 0: - labelSupports[labelDict[CLASS_LABELS[index]]] -= 1 - return True, labelSupports - else: - return False, labelSupports - - -def splitDataset(DATASET, LEARNING_RATE, DATASET_LENGTH, randomState): - LABELS = DATASET.get("Labels")[...] - NB_CLASS = int(DATASET["Metadata"].attrs["nbClass"]) - validationIndices = extractRandomTrainingSet(LABELS, 1 - LEARNING_RATE, DATASET_LENGTH, NB_CLASS, randomState) - validationIndices.sort() - return validationIndices - - -def extractRandomTrainingSet(CLASS_LABELS, LEARNING_RATE, DATASET_LENGTH, NB_CLASS, randomState): - labelSupports, labelDict = getLabelSupports(np.array(CLASS_LABELS)) - nbTrainingExamples = [int(support * LEARNING_RATE) for support in labelSupports] - trainingExamplesIndices = [] - usedIndices = [] - while nbTrainingExamples != [0 for i in range(NB_CLASS)]: - isUseFull = False - index = int(randomState.randint(0, DATASET_LENGTH - 1)) - if index not in usedIndices: - isUseFull, nbTrainingExamples = isUseful(nbTrainingExamples, index, CLASS_LABELS, labelDict) - if isUseFull: - trainingExamplesIndices.append(index) - usedIndices.append(index) - return trainingExamplesIndices - - -def getKFoldIndices(nbFolds, CLASS_LABELS, NB_CLASS, learningIndices, randomState): - labelSupports, labelDict = getLabelSupports(np.array(CLASS_LABELS[learningIndices])) - nbTrainingExamples = [[int(support / nbFolds) for support in labelSupports] for fold in range(nbFolds)] - trainingExamplesIndices = [] - usedIndices = [] - for foldIndex, fold in enumerate(nbTrainingExamples): - trainingExamplesIndices.append([]) - while fold != [0 for i in range(NB_CLASS)]: - index = randomState.randint(0, len(learningIndices)) - if learningIndices[index] not in usedIndices: - isUseFull, fold = isUseful(fold, learningIndices[index], CLASS_LABELS, labelDict) - if isUseFull: - trainingExamplesIndices[foldIndex].append(learningIndices[index]) - usedIndices.append(learningIndices[index]) - return trainingExamplesIndices - - -def getPositions(labelsUsed, fullLabels): - usedIndices = [] - for labelIndex, label in enumerate(fullLabels): - if label in labelsUsed: - usedIndices.append(labelIndex) - return usedIndices +# def isUseful(labelSupports, index, CLASS_LABELS, labelDict): + # if labelSupports[labelDict[CLASS_LABELS[index]]] != 0: + # labelSupports[labelDict[CLASS_LABELS[index]]] -= 1 + # return True, labelSupports + # else: + # return False, labelSupports + + +# def splitDataset(DATASET, LEARNING_RATE, DATASET_LENGTH, randomState): +# LABELS = DATASET.get("Labels")[...] 
+# NB_CLASS = int(DATASET["Metadata"].attrs["nbClass"]) +# validationIndices = extractRandomTrainingSet(LABELS, 1 - LEARNING_RATE, DATASET_LENGTH, NB_CLASS, randomState) +# validationIndices.sort() +# return validationIndices + + +# def extractRandomTrainingSet(CLASS_LABELS, LEARNING_RATE, DATASET_LENGTH, NB_CLASS, randomState): +# labelSupports, labelDict = getLabelSupports(np.array(CLASS_LABELS)) +# nbTrainingExamples = [int(support * LEARNING_RATE) for support in labelSupports] +# trainingExamplesIndices = [] +# usedIndices = [] +# while nbTrainingExamples != [0 for i in range(NB_CLASS)]: +# isUseFull = False +# index = int(randomState.randint(0, DATASET_LENGTH - 1)) +# if index not in usedIndices: +# isUseFull, nbTrainingExamples = isUseful(nbTrainingExamples, index, CLASS_LABELS, labelDict) +# if isUseFull: +# trainingExamplesIndices.append(index) +# usedIndices.append(index) +# return trainingExamplesIndices + + +# def getKFoldIndices(nbFolds, CLASS_LABELS, NB_CLASS, learningIndices, randomState): +# labelSupports, labelDict = getLabelSupports(np.array(CLASS_LABELS[learningIndices])) +# nbTrainingExamples = [[int(support / nbFolds) for support in labelSupports] for fold in range(nbFolds)] +# trainingExamplesIndices = [] +# usedIndices = [] +# for foldIndex, fold in enumerate(nbTrainingExamples): +# trainingExamplesIndices.append([]) +# while fold != [0 for i in range(NB_CLASS)]: +# index = randomState.randint(0, len(learningIndices)) +# if learningIndices[index] not in usedIndices: +# isUseFull, fold = isUseful(fold, learningIndices[index], CLASS_LABELS, labelDict) +# if isUseFull: +# trainingExamplesIndices[foldIndex].append(learningIndices[index]) +# usedIndices.append(learningIndices[index]) +# return trainingExamplesIndices +# +# +# def getPositions(labelsUsed, fullLabels): +# usedIndices = [] +# for labelIndex, label in enumerate(fullLabels): +# if label in labelsUsed: +# usedIndices.append(labelIndex) +# return usedIndices # def getClassicDBcsv(views, pathF, nameDB, NB_CLASS, LABELS_NAMES, randomState): +# TODO : Update this one # labelsNamesFile = open(pathF + nameDB + '-ClassLabels-Description.csv') # datasetFile = h5py.File(pathF + nameDB + ".hdf5", "w") # if len(LABELS_NAMES) != NB_CLASS: @@ -217,6 +222,7 @@ def getPositions(labelsUsed, fullLabels): def getClassicDBhdf5(views, pathF, nameDB, NB_CLASS, LABELS_NAMES): + """Used to load a hdf5 database""" datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r") fullLabels = datasetFile.get("Labels") labelsDictionary = dict((labelIndex, labelName) for labelIndex, labelName in @@ -224,850 +230,856 @@ def getClassicDBhdf5(views, pathF, nameDB, NB_CLASS, LABELS_NAMES): return datasetFile, labelsDictionary -def getCaltechDBcsv(views, pathF, nameDB, NB_CLASS, LABELS_NAMES, randomState): - datasetFile = h5py.File(pathF + nameDB + ".hdf5", "w") - labelsNamesFile = open(pathF + nameDB + '-ClassLabels-Description.csv') - if len(LABELS_NAMES) != NB_CLASS: - nbLabelsAvailable = 0 - for l in labelsNamesFile: - nbLabelsAvailable += 1 - LABELS_NAMES = [line.strip().split(";")[1] for lineIdx, line in enumerate(labelsNamesFile) if - lineIdx in randomState.randint(nbLabelsAvailable, size=NB_CLASS)] - fullLabels = np.genfromtxt(pathF + nameDB + '-ClassLabels.csv', delimiter=';').astype(int) - labelsDictionary = dict((classIndice, labelName) for (classIndice, labelName) in - [(int(line.strip().split(";")[0]), line.strip().split(";")[1]) for lineIndex, line in - labelsNamesFile if line.strip().split(";")[0] in LABELS_NAMES]) - if 
len(set(fullLabels)) > NB_CLASS: - usedIndices = getPositions(labelsDictionary.keys(), fullLabels) - else: - usedIndices = range(len(fullLabels)) - for viewIndex, view in enumerate(views): - viewFile = pathF + nameDB + "-" + view + '.csv' - viewMatrix = np.array(np.genfromtxt(viewFile, delimiter=';'))[usedIndices, :] - viewDset = datasetFile.create_dataset("View" + str(viewIndex), viewMatrix.shape, data=viewMatrix) - viewDset.attrs["name"] = view - - labelsDset = datasetFile.create_dataset("Labels", fullLabels[usedIndices].shape, data=fullLabels[usedIndices]) - - metaDataGrp = datasetFile.create_group("Metadata") - metaDataGrp.attrs["nbView"] = len(views) - metaDataGrp.attrs["nbClass"] = NB_CLASS - metaDataGrp.attrs["datasetLength"] = len(fullLabels[usedIndices]) - datasetFile.close() - datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r") - return datasetFile, labelsDictionary - - -def getMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES, randomState): - datasetFile = h5py.File(path + "MultiOmic.hdf5", "w") - - logging.debug("Start:\t Getting Methylation Data") - methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',') - methylDset = datasetFile.create_dataset("View0", methylData.shape) - methylDset[...] = methylData - methylDset.attrs["name"] = "Methyl" - methylDset.attrs["sparse"] = False - methylDset.attrs["binary"] = False - logging.debug("Done:\t Getting Methylation Data") - - logging.debug("Start:\t Getting MiRNA Data") - mirnaData = np.genfromtxt(path + "matching_mirna.csv", delimiter=',') - mirnaDset = datasetFile.create_dataset("View1", mirnaData.shape) - mirnaDset[...] = mirnaData - mirnaDset.attrs["name"] = "MiRNA_" - mirnaDset.attrs["sparse"] = False - mirnaDset.attrs["binary"] = False - logging.debug("Done:\t Getting MiRNA Data") - - logging.debug("Start:\t Getting RNASeq Data") - rnaseqData = np.genfromtxt(path + "matching_rnaseq.csv", delimiter=',') - uselessRows = [] - for rowIndex, row in enumerate(np.transpose(rnaseqData)): - if not row.any(): - uselessRows.append(rowIndex) - usefulRows = [usefulRowIndex for usefulRowIndex in range(rnaseqData.shape[1]) if usefulRowIndex not in uselessRows] - rnaseqDset = datasetFile.create_dataset("View2", (rnaseqData.shape[0], len(usefulRows))) - rnaseqDset[...] = rnaseqData[:, usefulRows] - rnaseqDset.attrs["name"] = "RNASeq_" - rnaseqDset.attrs["sparse"] = False - rnaseqDset.attrs["binary"] = False - logging.debug("Done:\t Getting RNASeq Data") - - logging.debug("Start:\t Getting Clinical Data") - clinical = np.genfromtxt(path + "clinicalMatrix.csv", delimiter=',') - clinicalDset = datasetFile.create_dataset("View3", clinical.shape) - clinicalDset[...] = clinical - clinicalDset.attrs["name"] = "Clinic" - clinicalDset.attrs["sparse"] = False - clinicalDset.attrs["binary"] = False - logging.debug("Done:\t Getting Clinical Data") - - labelFile = open(path + 'brca_labels_triple-negatif.csv') - labels = np.array([int(line.strip().split(',')[1]) for line in labelFile]) - labelsDset = datasetFile.create_dataset("Labels", labels.shape) - labelsDset[...] 
= labels - labelsDset.attrs["name"] = "Labels" - - metaDataGrp = datasetFile.create_group("Metadata") - metaDataGrp.attrs["nbView"] = 4 - metaDataGrp.attrs["nbClass"] = 2 - metaDataGrp.attrs["datasetLength"] = len(labels) - labelDictionary = {0: "No", 1: "Yes"} - datasetFile.close() - datasetFile = h5py.File(path + "MultiOmic.hdf5", "r") - # datasetFile = getPseudoRNASeq(datasetFile) - return datasetFile, labelDictionary - - -def getVector(nbGenes): - argmax = [0, 0] - maxi = 0 - for i in range(nbGenes): - for j in range(nbGenes): - if j == i + 1: - value = (i + 1) * (nbGenes - j) - if value > maxi: - maxi = value - argmax = [i, j] - i, j = argmax - vectorLeft = np.zeros(nbGenes, dtype=bool) - vectorLeft[:i + 1] = np.ones(i + 1, dtype=bool) - vectorSup = np.zeros(nbGenes, dtype=bool) - vectorSup[j:] = np.ones(nbGenes - j, dtype=bool) - matrixSup = j - matrixInf = nbGenes - j - return vectorLeft, matrixSup, matrixInf - - -def findClosestPowerOfTwo(factorizationParam): - power = 1 - while factorizationParam - power > 0: - power *= 2 - if abs(factorizationParam - power) < abs(factorizationParam - power / 2): - return power - else: - return power / 2 - - -def easyFactorize(nbGenes, factorizationParam, t=0): - if math.log(factorizationParam + 1, 2) % 1 == 0.0: - pass - else: - factorizationParam = findClosestPowerOfTwo(factorizationParam) - 1 - - if nbGenes == 2: - return 1, np.array([True, False]) - - if nbGenes == 3: - return 1, np.array([True, True, False]) - - if factorizationParam == 1: - t = 1 - return t, getVector(nbGenes)[0] - - vectorLeft, matrixSup, matrixInf = getVector(nbGenes) - - t_, vectorLeftSup = easyFactorize(matrixSup, (factorizationParam - 1) / 2, t=t) - t__, vectorLeftInf = easyFactorize(matrixInf, (factorizationParam - 1) / 2, t=t) - - factorLeft = np.zeros((nbGenes, t_ + t__ + 1), dtype=bool) - - factorLeft[:matrixSup, :t_] = vectorLeftSup.reshape(factorLeft[:matrixSup, :t_].shape) - if nbGenes % 2 == 1: - factorLeft[matrixInf - 1:, t_:t__ + t_] = vectorLeftInf.reshape(factorLeft[matrixInf - 1:, t_:t__ + t_].shape) - else: - factorLeft[matrixInf:, t_:t__ + t_] = vectorLeftInf.reshape(factorLeft[matrixInf:, t_:t__ + t_].shape) - factorLeft[:, t__ + t_] = vectorLeft - - # factorSup = np.zeros((t_+t__+1, nbGenes), dtype=bool) - # - # factorSup[:t_, :matrixSup] = vectorSupLeft.reshape(factorSup[:t_, :matrixSup].shape) - # if nbGenes%2==1: - # factorSup[t_:t__+t_, matrixInf-1:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf-1:].shape) - # else: - # factorSup[t_:t__+t_, matrixInf:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf:].shape) - # factorSup[t__+t_, :] = vectorSup - return t__ + t_ + 1, factorLeft # , factorSup - - -def getBaseMatrices(nbGenes, factorizationParam, path): - t, factorLeft = easyFactorize(nbGenes, factorizationParam) - np.savetxt(path + "factorLeft--n-" + str(nbGenes) + "--k-" + str(factorizationParam) + ".csv", factorLeft, - delimiter=",") - return factorLeft - - -def findParams(arrayLen, nbPatients, randomState, maxNbBins=2000, minNbBins=10, maxLenBin=70000, minOverlapping=1, - minNbBinsOverlapped=0, maxNbSolutions=30): - results = [] - if arrayLen * arrayLen * 10 / 100 > minNbBinsOverlapped * nbPatients: - for lenBin in range(arrayLen - 1): - lenBin += 1 - if lenBin < maxLenBin and minNbBins * lenBin < arrayLen: - for overlapping in sorted(range(lenBin - 1), reverse=True): - overlapping += 1 - if overlapping > minOverlapping and lenBin % (lenBin - overlapping) == 0: - for nbBins in sorted(range(arrayLen - 1), reverse=True): - nbBins 
+= 1 - if nbBins < maxNbBins: - if arrayLen == (nbBins - 1) * (lenBin - overlapping) + lenBin: - results.append({"nbBins": nbBins, "overlapping": overlapping, "lenBin": lenBin}) - if len(results) == maxNbSolutions: - params = results[randomState.randrange(len(results))] - return params - - -def findBins(nbBins=142, overlapping=493, lenBin=986): - bins = [] - for binIndex in range(nbBins): - bins.append([i + binIndex * (lenBin - overlapping) for i in range(lenBin)]) - return bins - - -def getBins(array, bins, lenBin, overlapping): - binnedcoord = [] - for coordIndex, coord in enumerate(array): - nbBinsFull = 0 - for binIndex, bin_ in enumerate(bins): - if coordIndex in bin_: - binnedcoord.append(binIndex + (coord * len(bins))) - - return np.array(binnedcoord) - - -def makeSortedBinsMatrix(nbBins, lenBins, overlapping, arrayLen, path): - sortedBinsMatrix = np.zeros((arrayLen, nbBins), dtype=np.uint8) - step = lenBins - overlapping - for binIndex in range(nbBins): - sortedBinsMatrix[step * binIndex:lenBins + (step * binIndex), binIndex] = np.ones(lenBins, dtype=np.uint8) - np.savetxt(path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", - sortedBinsMatrix, delimiter=",") - return sortedBinsMatrix - - -def makeSparseTotalMatrix(sortedRNASeq, randomState): - nbPatients, nbGenes = sortedRNASeq.shape - params = findParams(nbGenes, nbPatients, randomState) - nbBins = params["nbBins"] - overlapping = params["overlapping"] - lenBin = params["lenBin"] - bins = findBins(nbBins, overlapping, lenBin) - sparseFull = sparse.csc_matrix((nbPatients, nbGenes * nbBins)) - for patientIndex, patient in enumerate(sortedRNASeq): - columnIndices = getBins(patient, bins, lenBin, overlapping) - rowIndices = np.zeros(len(columnIndices), dtype=int) + patientIndex - data = np.ones(len(columnIndices), dtype=bool) - sparseFull = sparseFull + sparse.csc_matrix((data, (rowIndices, columnIndices)), - shape=(nbPatients, nbGenes * nbBins)) - return sparseFull - - -def getAdjacenceMatrix(RNASeqRanking, sotredRNASeq, k=2): - k = int(k) / 2 * 2 - indices = np.zeros((RNASeqRanking.shape[0] * k * RNASeqRanking.shape[1]), dtype=int) - data = np.ones((RNASeqRanking.shape[0] * k * RNASeqRanking.shape[1]), dtype=bool) - indptr = np.zeros(RNASeqRanking.shape[0] + 1, dtype=int) - nbGenes = RNASeqRanking.shape[1] - pointer = 0 - for patientIndex in range(RNASeqRanking.shape[0]): - for i in range(nbGenes): - for j in range(k / 2): - try: - indices[pointer] = RNASeqRanking[ - patientIndex, (sotredRNASeq[patientIndex, i] - (j + 1))] + i * nbGenes - pointer += 1 - except: - pass - try: - indices[pointer] = RNASeqRanking[ - patientIndex, (sotredRNASeq[patientIndex, i] + (j + 1))] + i * nbGenes - pointer += 1 - except: - pass - # elif i<=k: - # indices.append(patient[1]+patient[i]*nbGenes) - # data.append(True) - # elif i==nbGenes-1: - # indices.append(patient[i-1]+patient[i]*nbGenes) - # data.append(True) - indptr[patientIndex + 1] = pointer - - mat = sparse.csr_matrix((data, indices, indptr), - shape=(RNASeqRanking.shape[0], RNASeqRanking.shape[1] * RNASeqRanking.shape[1]), dtype=bool) - return mat - - -def getKMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): - datasetFile = h5py.File(path + "KMultiOmic.hdf5", "w") - - # logging.debug("Start:\t Getting Methylation Data") - methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',') - logging.debug("Done:\t Getting Methylation Data") - - logging.debug("Start:\t Getting Sorted Methyl Data") - Methyl = methylData - 
sortedMethylGeneIndices = np.zeros(methylData.shape, dtype=int) - MethylRanking = np.zeros(methylData.shape, dtype=int) - for exampleIndex, exampleArray in enumerate(Methyl): - sortedMethylDictionary = dict((index, value) for index, value in enumerate(exampleArray)) - sortedMethylIndicesDict = sorted(sortedMethylDictionary.items(), key=operator.itemgetter(1)) - sortedMethylIndicesArray = np.array([index for (index, value) in sortedMethylIndicesDict], dtype=int) - sortedMethylGeneIndices[exampleIndex] = sortedMethylIndicesArray - for geneIndex in range(Methyl.shape[1]): - MethylRanking[exampleIndex, sortedMethylIndicesArray[geneIndex]] = geneIndex - logging.debug("Done:\t Getting Sorted Methyl Data") - - logging.debug("Start:\t Getting Binarized Methyl Data") - k = findClosestPowerOfTwo(9) - 1 - try: - factorizedLeftBaseMatrix = np.genfromtxt( - path + "factorLeft--n-" + str(methylData.shape[1]) + "--k-" + str(k) + ".csv", delimiter=',') - except: - factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path) - bMethylDset = datasetFile.create_dataset("View0", - (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k), - dtype=np.uint8) - for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): - patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8) - for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :] - bMethylDset[patientIndex] = patientMatrix.flatten() - bMethylDset.attrs["name"] = "BMethyl" + str(k) - bMethylDset.attrs["sparse"] = False - bMethylDset.attrs["binary"] = True - logging.debug("Done:\t Getting Binarized Methyl Data") - - logging.debug("Start:\t Getting Binned Methyl Data") - lenBins = 3298 - nbBins = 9 - overlapping = 463 - try: - sortedBinsMatrix = np.genfromtxt( - path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", - delimiter=",") - except: - sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, methylData.shape[1], path) - binnedMethyl = datasetFile.create_dataset("View1", ( - sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8) - for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): - patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8) - for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :] - binnedMethyl[patientIndex] = patientMatrix.flatten() - binnedMethyl.attrs["name"] = "bMethyl" + str(nbBins) - binnedMethyl.attrs["sparse"] = False - binnedMethyl.attrs["binary"] = True - logging.debug("Done:\t Getting Binned Methyl Data") - - logging.debug("Start:\t Getting Binarized Methyl Data") - k = findClosestPowerOfTwo(17) - 1 - try: - factorizedLeftBaseMatrix = np.genfromtxt( - path + "factorLeft--n-" + str(methylData.shape[1]) + "--k-" + str(k) + ".csv", delimiter=',') - except: - factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path) - bMethylDset = datasetFile.create_dataset("View2", - (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k), - dtype=np.uint8) - for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): - patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8) - for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :] - 
bMethylDset[patientIndex] = patientMatrix.flatten() - bMethylDset.attrs["name"] = "BMethyl" + str(k) - bMethylDset.attrs["sparse"] = False - bMethylDset.attrs["binary"] = True - logging.debug("Done:\t Getting Binarized Methyl Data") - - logging.debug("Start:\t Getting Binned Methyl Data") - lenBins = 2038 - nbBins = 16 - overlapping = 442 - try: - sortedBinsMatrix = np.genfromtxt( - path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", - delimiter=",") - except: - sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, methylData.shape[1], path) - binnedMethyl = datasetFile.create_dataset("View3", ( - sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8) - for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): - patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8) - for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :] - binnedMethyl[patientIndex] = patientMatrix.flatten() - binnedMethyl.attrs["name"] = "bMethyl" + str(nbBins) - binnedMethyl.attrs["sparse"] = False - binnedMethyl.attrs["binary"] = True - logging.debug("Done:\t Getting Binned Methyl Data") - - labelFile = open(path + 'brca_labels_triple-negatif.csv') - labels = np.array([int(line.strip().split(',')[1]) for line in labelFile]) - labelsDset = datasetFile.create_dataset("Labels", labels.shape) - labelsDset[...] = labels - labelsDset.attrs["name"] = "Labels" - - metaDataGrp = datasetFile.create_group("Metadata") - metaDataGrp.attrs["nbView"] = 4 - metaDataGrp.attrs["nbClass"] = 2 - metaDataGrp.attrs["datasetLength"] = len(labels) - labelDictionary = {0: "No", 1: "Yes"} - - datasetFile.close() - datasetFile = h5py.File(path + "KMultiOmic.hdf5", "r") - - return datasetFile, labelDictionary - - -def getKMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES): - datasetFile = h5py.File(path + "KMultiOmic.hdf5", "r") - labelDictionary = {0: "No", 1: "Yes"} - return datasetFile, labelDictionary - - -def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): - datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "w") - - logging.debug("Start:\t Getting Methylation Data") - methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',') - methylDset = datasetFile.create_dataset("View0", methylData.shape) - methylDset[...] = methylData - methylDset.attrs["name"] = "Methyl_" - methylDset.attrs["sparse"] = False - methylDset.attrs["binary"] = False - logging.debug("Done:\t Getting Methylation Data") - - logging.debug("Start:\t Getting Sorted Methyl Data") - Methyl = datasetFile["View0"][...] 
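# A minimal sketch (toy data, not part of the original change) of what the ranking
# loops below compute with a dict + sorted + operator.itemgetter: for each example,
# sortedGeneIndices is the argsort of the row, and the Ranking matrix is its inverse
# permutation, i.e. the rank of every gene.
import numpy as np

row = np.array([0.5, 0.1, 0.9, 0.3])                   # one hypothetical example
sortedGeneIndices = np.argsort(row)                    # -> array([1, 3, 0, 2])
ranking = np.empty_like(sortedGeneIndices)
ranking[sortedGeneIndices] = np.arange(row.shape[0])   # -> array([2, 0, 3, 1])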
- sortedMethylGeneIndices = np.zeros(datasetFile.get("View0").shape, dtype=int) - MethylRanking = np.zeros(datasetFile.get("View0").shape, dtype=int) - for exampleIndex, exampleArray in enumerate(Methyl): - sortedMethylDictionary = dict((index, value) for index, value in enumerate(exampleArray)) - sortedMethylIndicesDict = sorted(sortedMethylDictionary.items(), key=operator.itemgetter(1)) - sortedMethylIndicesArray = np.array([index for (index, value) in sortedMethylIndicesDict], dtype=int) - sortedMethylGeneIndices[exampleIndex] = sortedMethylIndicesArray - for geneIndex in range(Methyl.shape[1]): - MethylRanking[exampleIndex, sortedMethylIndicesArray[geneIndex]] = geneIndex - mMethylDset = datasetFile.create_dataset("View10", sortedMethylGeneIndices.shape, data=sortedMethylGeneIndices) - mMethylDset.attrs["name"] = "SMethyl" - mMethylDset.attrs["sparse"] = False - mMethylDset.attrs["binary"] = False - logging.debug("Done:\t Getting Sorted Methyl Data") - - logging.debug("Start:\t Getting Binarized Methyl Data") - k = findClosestPowerOfTwo(58) - 1 - try: - factorizedLeftBaseMatrix = np.genfromtxt( - path + "factorLeft--n-" + str(datasetFile.get("View0").shape[1]) + "--k-" + str(k) + ".csv", delimiter=',') - except: - factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path) - bMethylDset = datasetFile.create_dataset("View11", - (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k), - dtype=np.uint8) - for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): - patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8) - for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :] - bMethylDset[patientIndex] = patientMatrix.flatten() - bMethylDset.attrs["name"] = "BMethyl" - bMethylDset.attrs["sparse"] = False - bMethylDset.attrs["binary"] = True - logging.debug("Done:\t Getting Binarized Methyl Data") - - logging.debug("Start:\t Getting Binned Methyl Data") - lenBins = 2095 - nbBins = 58 - overlapping = 1676 - try: - sortedBinsMatrix = np.genfromtxt( - path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", - delimiter=",") - except: - sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View0").shape[1], path) - binnedMethyl = datasetFile.create_dataset("View12", ( - sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8) - for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): - patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8) - for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :] - binnedMethyl[patientIndex] = patientMatrix.flatten() - binnedMethyl.attrs["name"] = "bMethyl" - binnedMethyl.attrs["sparse"] = False - binnedMethyl.attrs["binary"] = True - logging.debug("Done:\t Getting Binned Methyl Data") - - logging.debug("Start:\t Getting MiRNA Data") - mirnaData = np.genfromtxt(path + "matching_mirna.csv", delimiter=',') - mirnaDset = datasetFile.create_dataset("View1", mirnaData.shape) - mirnaDset[...] = mirnaData - mirnaDset.attrs["name"] = "MiRNA__" - mirnaDset.attrs["sparse"] = False - mirnaDset.attrs["binary"] = False - logging.debug("Done:\t Getting MiRNA Data") - - logging.debug("Start:\t Getting Sorted MiRNA Data") - MiRNA = datasetFile["View1"][...] 
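# A toy illustration (not part of the original change) of the overlapping-bins encoding
# behind the kept makeSortedBinsMatrix: column b marks the lenBins consecutive ranks of
# bin b, and consecutive bins share `overlapping` ranks. The real methyl parameters
# used above are lenBins=2095, nbBins=58, overlapping=1676.
import numpy as np

arrayLen, nbBins, lenBins, overlapping = 6, 3, 4, 3   # toy sizes
step = lenBins - overlapping                          # 1
binsMatrix = np.zeros((arrayLen, nbBins), dtype=np.uint8)
for binIndex in range(nbBins):
    binsMatrix[step * binIndex:lenBins + (step * binIndex), binIndex] = 1
# Columns cover ranks 0-3, 1-4 and 2-5; note arrayLen == (nbBins - 1) * step + lenBins.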
- sortedMiRNAGeneIndices = np.zeros(datasetFile.get("View1").shape, dtype=int) - MiRNARanking = np.zeros(datasetFile.get("View1").shape, dtype=int) - for exampleIndex, exampleArray in enumerate(MiRNA): - sortedMiRNADictionary = dict((index, value) for index, value in enumerate(exampleArray)) - sortedMiRNAIndicesDict = sorted(sortedMiRNADictionary.items(), key=operator.itemgetter(1)) - sortedMiRNAIndicesArray = np.array([index for (index, value) in sortedMiRNAIndicesDict], dtype=int) - sortedMiRNAGeneIndices[exampleIndex] = sortedMiRNAIndicesArray - for geneIndex in range(MiRNA.shape[1]): - MiRNARanking[exampleIndex, sortedMiRNAIndicesArray[geneIndex]] = geneIndex - mmirnaDset = datasetFile.create_dataset("View7", sortedMiRNAGeneIndices.shape, data=sortedMiRNAGeneIndices) - mmirnaDset.attrs["name"] = "SMiRNA_" - mmirnaDset.attrs["sparse"] = False - mmirnaDset.attrs["binary"] = False - logging.debug("Done:\t Getting Sorted MiRNA Data") - - logging.debug("Start:\t Getting Binarized MiRNA Data") - k = findClosestPowerOfTwo(517) - 1 - try: - factorizedLeftBaseMatrix = np.genfromtxt( - path + "factorLeft--n-" + str(datasetFile.get("View1").shape[1]) + "--k-" + str(k) + ".csv", delimiter=',') - except: - factorizedLeftBaseMatrix = getBaseMatrices(mirnaData.shape[1], k, path) - bmirnaDset = datasetFile.create_dataset("View8", - (sortedMiRNAGeneIndices.shape[0], sortedMiRNAGeneIndices.shape[1] * k), - dtype=np.uint8) - for patientIndex, patientSortedArray in enumerate(sortedMiRNAGeneIndices): - patientMatrix = np.zeros((sortedMiRNAGeneIndices.shape[1], k), dtype=np.uint8) - for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :] - bmirnaDset[patientIndex] = patientMatrix.flatten() - bmirnaDset.attrs["name"] = "BMiRNA_" - bmirnaDset.attrs["sparse"] = False - bmirnaDset.attrs["binary"] = True - logging.debug("Done:\t Getting Binarized MiRNA Data") - - logging.debug("Start:\t Getting Binned MiRNA Data") - lenBins = 14 - nbBins = 517 - overlapping = 12 - try: - sortedBinsMatrix = np.genfromtxt( - path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", - delimiter=",") - except: - sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View1").shape[1], path) - binnedMiRNA = datasetFile.create_dataset("View9", ( - sortedMiRNAGeneIndices.shape[0], sortedMiRNAGeneIndices.shape[1] * nbBins), dtype=np.uint8) - for patientIndex, patientSortedArray in enumerate(sortedMiRNAGeneIndices): - patientMatrix = np.zeros((sortedMiRNAGeneIndices.shape[1], nbBins), dtype=np.uint8) - for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :] - binnedMiRNA[patientIndex] = patientMatrix.flatten() - binnedMiRNA.attrs["name"] = "bMiRNA_" - binnedMiRNA.attrs["sparse"] = False - binnedMiRNA.attrs["binary"] = True - logging.debug("Done:\t Getting Binned MiRNA Data") - - logging.debug("Start:\t Getting RNASeq Data") - rnaseqData = np.genfromtxt(path + "matching_rnaseq.csv", delimiter=',') - uselessRows = [] - for rowIndex, row in enumerate(np.transpose(rnaseqData)): - if not row.any(): - uselessRows.append(rowIndex) - usefulRows = [usefulRowIndex for usefulRowIndex in range(rnaseqData.shape[1]) if usefulRowIndex not in uselessRows] - rnaseqDset = datasetFile.create_dataset("View2", (rnaseqData.shape[0], len(usefulRows))) - rnaseqDset[...] 
= rnaseqData[:, usefulRows] - rnaseqDset.attrs["name"] = "RNASeq_" - rnaseqDset.attrs["sparse"] = False - rnaseqDset.attrs["binary"] = False - logging.debug("Done:\t Getting RNASeq Data") - - logging.debug("Start:\t Getting Sorted RNASeq Data") - RNASeq = datasetFile["View2"][...] - sortedRNASeqGeneIndices = np.zeros(datasetFile.get("View2").shape, dtype=int) - RNASeqRanking = np.zeros(datasetFile.get("View2").shape, dtype=int) - for exampleIndex, exampleArray in enumerate(RNASeq): - sortedRNASeqDictionary = dict((index, value) for index, value in enumerate(exampleArray)) - sortedRNASeqIndicesDict = sorted(sortedRNASeqDictionary.items(), key=operator.itemgetter(1)) - sortedRNASeqIndicesArray = np.array([index for (index, value) in sortedRNASeqIndicesDict], dtype=int) - sortedRNASeqGeneIndices[exampleIndex] = sortedRNASeqIndicesArray - for geneIndex in range(RNASeq.shape[1]): - RNASeqRanking[exampleIndex, sortedRNASeqIndicesArray[geneIndex]] = geneIndex - mrnaseqDset = datasetFile.create_dataset("View4", sortedRNASeqGeneIndices.shape, data=sortedRNASeqGeneIndices) - mrnaseqDset.attrs["name"] = "SRNASeq" - mrnaseqDset.attrs["sparse"] = False - mrnaseqDset.attrs["binary"] = False - logging.debug("Done:\t Getting Sorted RNASeq Data") - - logging.debug("Start:\t Getting Binarized RNASeq Data") - k = findClosestPowerOfTwo(100) - 1 - try: - factorizedLeftBaseMatrix = np.genfromtxt( - path + "factorLeft--n-" + str(datasetFile.get("View2").shape[1]) + "--k-" + str(100) + ".csv", - delimiter=',') - except: - factorizedLeftBaseMatrix = getBaseMatrices(rnaseqData.shape[1], k, path) - brnaseqDset = datasetFile.create_dataset("View5", - (sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1] * k), - dtype=np.uint8) - for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices): - patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], k), dtype=np.uint8) - for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :] - brnaseqDset[patientIndex] = patientMatrix.flatten() - brnaseqDset.attrs["name"] = "BRNASeq" - brnaseqDset.attrs["sparse"] = False - brnaseqDset.attrs["binary"] = True - logging.debug("Done:\t Getting Binarized RNASeq Data") - - logging.debug("Start:\t Getting Binned RNASeq Data") - lenBins = 986 - nbBins = 142 - overlapping = 493 - try: - sortedBinsMatrix = np.genfromtxt( - path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", - delimiter=",") - except: - sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View2").shape[1], path) - binnedRNASeq = datasetFile.create_dataset("View6", ( - sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1] * nbBins), dtype=np.uint8) - for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices): - patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], nbBins), dtype=np.uint8) - for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :] - binnedRNASeq[patientIndex] = patientMatrix.flatten() - binnedRNASeq.attrs["name"] = "bRNASeq" - binnedRNASeq.attrs["sparse"] = False - binnedRNASeq.attrs["binary"] = True - logging.debug("Done:\t Getting Binned RNASeq Data") - - logging.debug("Start:\t Getting Clinical Data") - clinical = np.genfromtxt(path + "clinicalMatrix.csv", delimiter=',') - clinicalDset = datasetFile.create_dataset("View3", clinical.shape) - clinicalDset[...] 
= clinical - clinicalDset.attrs["name"] = "Clinic_" - clinicalDset.attrs["sparse"] = False - clinicalDset.attrs["binary"] = False - logging.debug("Done:\t Getting Clinical Data") - - logging.debug("Start:\t Getting Binarized Clinical Data") - binarized_clinical = np.zeros((347, 1951), dtype=np.uint8) - nb_already_done = 0 - for feqtureIndex, feature in enumerate(np.transpose(clinical)): - featureSet = set(feature) - featureDict = dict((val, valIndex) for valIndex, val in enumerate(list(featureSet))) - for valueIndex, value in enumerate(feature): - binarized_clinical[valueIndex, featureDict[value] + nb_already_done] = 1 - nb_already_done += len(featureSet) - bClinicalDset = datasetFile.create_dataset("View13", binarized_clinical.shape, dtype=np.uint8, - data=binarized_clinical) - bClinicalDset.attrs["name"] = "bClinic" - bClinicalDset.attrs["sparse"] = False - bClinicalDset.attrs["binary"] = True - logging.debug("Done:\t Getting Binarized Clinical Data") - - # logging.debug("Start:\t Getting Adjacence RNASeq Data") - # sparseAdjRNASeq = getAdjacenceMatrix(RNASeqRanking, sortedRNASeqGeneIndices, k=findClosestPowerOfTwo(10)-1) - # sparseAdjRNASeqGrp = datasetFile.create_group("View6") - # dataDset = sparseAdjRNASeqGrp.create_dataset("data", sparseAdjRNASeq.data.shape, data=sparseAdjRNASeq.data) - # indicesDset = sparseAdjRNASeqGrp.create_dataset("indices", - # sparseAdjRNASeq.indices.shape, data=sparseAdjRNASeq.indices) - # indptrDset = sparseAdjRNASeqGrp.create_dataset("indptr", - # sparseAdjRNASeq.indptr.shape, data=sparseAdjRNASeq.indptr) - # sparseAdjRNASeqGrp.attrs["name"]="ARNASeq" - # sparseAdjRNASeqGrp.attrs["sparse"]=True - # sparseAdjRNASeqGrp.attrs["shape"]=sparseAdjRNASeq.shape - # logging.debug("Done:\t Getting Adjacence RNASeq Data") - - labelFile = open(path + 'brca_labels_triple-negatif.csv') - labels = np.array([int(line.strip().split(',')[1]) for line in labelFile]) - labelsDset = datasetFile.create_dataset("Labels", labels.shape) - labelsDset[...] 
= labels - labelsDset.attrs["name"] = "Labels" - - metaDataGrp = datasetFile.create_group("Metadata") - metaDataGrp.attrs["nbView"] = 14 - metaDataGrp.attrs["nbClass"] = 2 - metaDataGrp.attrs["datasetLength"] = len(labels) - labelDictionary = {0: "No", 1: "Yes"} - - datasetFile.close() - datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "r") - - return datasetFile, labelDictionary - - -def getModifiedMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES): - datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "r") - labelDictionary = {0: "No", 1: "Yes"} - return datasetFile, labelDictionary - - -def getMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES): - datasetFile = h5py.File(path + "MultiOmic.hdf5", "r") - labelDictionary = {0: "No", 1: "Yes"} - return datasetFile, labelDictionary - - -def copyHDF5(pathF, name, nbCores): - datasetFile = h5py.File(pathF + name + ".hdf5", "r") - for coreIndex in range(nbCores): - newDataSet = h5py.File(pathF + name + str(coreIndex) + ".hdf5", "w") - for dataset in datasetFile: - datasetFile.copy("/" + dataset, newDataSet["/"]) - newDataSet.close() - - -def datasetsAlreadyExist(pathF, name, nbCores): - allDatasetExist = True - for coreIndex in range(nbCores): - import os.path - allDatasetExist *= os.path.isfile(pathF + name + str(coreIndex) + ".hdf5") - return allDatasetExist - - -def deleteHDF5(pathF, name, nbCores): - for coreIndex in range(nbCores): - os.remove(pathF + name + str(coreIndex) + ".hdf5") - -# def getOneViewFromDB(viewName, pathToDB, DBName): -# view = np.genfromtxt(pathToDB + DBName +"-" + viewName, delimiter=';') -# return view - - -# def getClassLabels(pathToDB, DBName): -# labels = np.genfromtxt(pathToDB + DBName + "-" + "ClassLabels.csv", delimiter=';') -# return labels - - -# def getDataset(pathToDB, viewNames, DBName): -# dataset = [] -# for viewName in viewNames: -# dataset.append(getOneViewFromDB(viewName, pathToDB, DBName)) -# return np.array(dataset) - - -# def getAwaLabels(nbLabels, pathToAwa): -# labelsFile = open(pathToAwa + 'Animals_with_Attributes/classes.txt', 'U') -# linesFile = [''.join(line.strip().split()).translate(None, digits) for line in labelsFile.readlines()] -# return linesFile - - -# def getAwaDBcsv(views, pathToAwa, nameDB, nbLabels, LABELS_NAMES): -# awaLabels = getAwaLabels(nbLabels, pathToAwa) -# nbView = len(views) -# nbMaxLabels = len(awaLabels) -# if nbLabels == -1: -# nbLabels = nbMaxLabels -# nbNamesGiven = len(LABELS_NAMES) -# if nbNamesGiven > nbLabels: -# labelDictionary = {i:LABELS_NAMES[i] for i in np.arange(nbLabels)} -# elif nbNamesGiven < nbLabels and nbLabels <= nbMaxLabels: -# if LABELS_NAMES != ['']: -# labelDictionary = {i:LABELS_NAMES[i] for i in np.arange(nbNamesGiven)} -# else: -# labelDictionary = {} -# nbNamesGiven = 0 -# nbLabelsToAdd = nbLabels-nbNamesGiven -# while nbLabelsToAdd > 0: -# currentLabel = random.choice(awaLabels) -# if currentLabel not in labelDictionary.values(): -# labelDictionary[nbLabels-nbLabelsToAdd]=currentLabel -# nbLabelsToAdd -= 1 -# else: -# pass +# def getCaltechDBcsv(views, pathF, nameDB, NB_CLASS, LABELS_NAMES, randomState): +# datasetFile = h5py.File(pathF + nameDB + ".hdf5", "w") +# labelsNamesFile = open(pathF + nameDB + '-ClassLabels-Description.csv') +# if len(LABELS_NAMES) != NB_CLASS: +# nbLabelsAvailable = 0 +# for l in labelsNamesFile: +# nbLabelsAvailable += 1 +# LABELS_NAMES = [line.strip().split(";")[1] for lineIdx, line in enumerate(labelsNamesFile) if +# lineIdx in randomState.randint(nbLabelsAvailable, 
size=NB_CLASS)] +# fullLabels = np.genfromtxt(pathF + nameDB + '-ClassLabels.csv', delimiter=';').astype(int) +# labelsDictionary = dict((classIndice, labelName) for (classIndice, labelName) in +# [(int(line.strip().split(";")[0]), line.strip().split(";")[1]) for lineIndex, line in +# labelsNamesFile if line.strip().split(";")[0] in LABELS_NAMES]) +# if len(set(fullLabels)) > NB_CLASS: +# usedIndices = getPositions(labelsDictionary.keys(), fullLabels) # else: -# labelDictionary = {i: LABELS_NAMES[i] for i in np.arange(nbNamesGiven)} -# viewDictionary = {i: views[i] for i in np.arange(nbView)} -# rawData = [] -# labels = [] -# nbExample = 0 -# for view in np.arange(nbView): -# viewData = [] -# for labelIndex in np.arange(nbLabels): -# pathToExamples = pathToAwa + 'Animals_with_Attributes/Features/' + viewDictionary[view] + '/' + \ -# labelDictionary[labelIndex] + '/' -# examples = os.listdir(pathToExamples) -# if view == 0: -# nbExample += len(examples) -# for example in examples: -# if viewDictionary[view]=='decaf': -# exampleFile = open(pathToExamples + example) -# viewData.append([float(line.strip()) for line in exampleFile]) -# else: -# exampleFile = open(pathToExamples + example) -# viewData.append([[float(coordinate) for coordinate in raw.split()] for raw in exampleFile][0]) -# if view == 0: -# labels.append(labelIndex) -# -# rawData.append(np.array(viewData)) -# data = rawData -# DATASET_LENGTH = len(labels) -# return data, labels, labelDictionary, DATASET_LENGTH -# -# -# def getDbfromCSV(path): -# files = os.listdir(path) -# DATA = np.zeros((3,40,2)) -# for file in files: -# if file[-9:]=='moins.csv' and file[:7]=='sample1': -# X = open(path+file) -# for x, i in zip(X, range(20)): -# DATA[0, i] = np.array([float(coord) for coord in x.strip().split('\t')]) -# if file[-9:]=='moins.csv' and file[:7]=='sample2': -# X = open(path+file) -# for x, i in zip(X, range(20)): -# DATA[1, i] = np.array([float(coord) for coord in x.strip().split('\t')]) -# if file[-9:]=='moins.csv' and file[:7]=='sample3': -# X = open(path+file) -# for x, i in zip(X, range(20)): -# DATA[2, i] = np.array([float(coord) for coord in x.strip().split('\t')]) -# -# for file in files: -# if file[-8:]=='plus.csv' and file[:7]=='sample1': -# X = open(path+file) -# for x, i in zip(X, range(20)): -# DATA[0, i+20] = np.array([float(coord) for coord in x.strip().split('\t')]) -# if file[-8:]=='plus.csv' and file[:7]=='sample2': -# X = open(path+file) -# for x, i in zip(X, range(20)): -# DATA[1, i+20] = np.array([float(coord) for coord in x.strip().split('\t')]) -# if file[-8:]=='plus.csv' and file[:7]=='sample3': -# X = open(path+file) -# for x, i in zip(X, range(20)): -# DATA[2, i+20] = np.array([float(coord) for coord in x.strip().split('\t')]) -# LABELS = np.zeros(40) -# LABELS[:20]=LABELS[:20]+1 -# return DATA, LABELS - -# def makeArrayFromTriangular(pseudoRNASeqMatrix): -# matrixShape = len(pseudoRNASeqMatrix[0,:]) -# exampleArray = np.array(((matrixShape-1)*matrixShape)/2) -# arrayIndex = 0 -# for i in range(matrixShape-1): -# for j in range(i+1, matrixShape): -# exampleArray[arrayIndex]=pseudoRNASeqMatrix[i,j] -# arrayIndex += 1 -# return exampleArray - +# usedIndices = range(len(fullLabels)) +# for viewIndex, view in enumerate(views): +# viewFile = pathF + nameDB + "-" + view + '.csv' +# viewMatrix = np.array(np.genfromtxt(viewFile, delimiter=';'))[usedIndices, :] +# viewDset = datasetFile.create_dataset("View" + str(viewIndex), viewMatrix.shape, data=viewMatrix) +# viewDset.attrs["name"] = view +# +# labelsDset = 
datasetFile.create_dataset("Labels", fullLabels[usedIndices].shape, data=fullLabels[usedIndices]) +# +# metaDataGrp = datasetFile.create_group("Metadata") +# metaDataGrp.attrs["nbView"] = len(views) +# metaDataGrp.attrs["nbClass"] = NB_CLASS +# metaDataGrp.attrs["datasetLength"] = len(fullLabels[usedIndices]) +# datasetFile.close() +# datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r") +# return datasetFile, labelsDictionary -# def getPseudoRNASeq(dataset): -# nbGenes = len(dataset["/View2/matrix"][0, :]) -# pseudoRNASeq = np.zeros((dataset["/datasetlength"][...], ((nbGenes - 1) * nbGenes) / 2), dtype=bool_) -# for exampleIndex in xrange(dataset["/datasetlength"][...]): -# arrayIndex = 0 -# for i in xrange(nbGenes): -# for j in xrange(nbGenes): -# if i > j: -# pseudoRNASeq[exampleIndex, arrayIndex] = -# dataset["/View2/matrix"][exampleIndex, j] < dataset["/View2/matrix"][exampleIndex, i] -# arrayIndex += 1 -# dataset["/View4/matrix"] = pseudoRNASeq -# dataset["/View4/name"] = "pseudoRNASeq" -# return dataset +#--------------------------------------------# +# All the functions below are not useful # +# anymore but the binarization methods in # +# it must be kept # +#--------------------------------------------# -# def allSame(array): -# value = array[0] -# areAllSame = True -# for i in array: -# if i != value: -# areAllSame = False -# return areAllSame +# def getMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES, randomState): +# datasetFile = h5py.File(path + "MultiOmic.hdf5", "w") +# +# logging.debug("Start:\t Getting Methylation Data") +# methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',') +# methylDset = datasetFile.create_dataset("View0", methylData.shape) +# methylDset[...] = methylData +# methylDset.attrs["name"] = "Methyl" +# methylDset.attrs["sparse"] = False +# methylDset.attrs["binary"] = False +# logging.debug("Done:\t Getting Methylation Data") +# +# logging.debug("Start:\t Getting MiRNA Data") +# mirnaData = np.genfromtxt(path + "matching_mirna.csv", delimiter=',') +# mirnaDset = datasetFile.create_dataset("View1", mirnaData.shape) +# mirnaDset[...] = mirnaData +# mirnaDset.attrs["name"] = "MiRNA_" +# mirnaDset.attrs["sparse"] = False +# mirnaDset.attrs["binary"] = False +# logging.debug("Done:\t Getting MiRNA Data") +# +# logging.debug("Start:\t Getting RNASeq Data") +# rnaseqData = np.genfromtxt(path + "matching_rnaseq.csv", delimiter=',') +# uselessRows = [] +# for rowIndex, row in enumerate(np.transpose(rnaseqData)): +# if not row.any(): +# uselessRows.append(rowIndex) +# usefulRows = [usefulRowIndex for usefulRowIndex in range(rnaseqData.shape[1]) if usefulRowIndex not in uselessRows] +# rnaseqDset = datasetFile.create_dataset("View2", (rnaseqData.shape[0], len(usefulRows))) +# rnaseqDset[...] = rnaseqData[:, usefulRows] +# rnaseqDset.attrs["name"] = "RNASeq_" +# rnaseqDset.attrs["sparse"] = False +# rnaseqDset.attrs["binary"] = False +# logging.debug("Done:\t Getting RNASeq Data") +# +# logging.debug("Start:\t Getting Clinical Data") +# clinical = np.genfromtxt(path + "clinicalMatrix.csv", delimiter=',') +# clinicalDset = datasetFile.create_dataset("View3", clinical.shape) +# clinicalDset[...] 
= clinical +# clinicalDset.attrs["name"] = "Clinic" +# clinicalDset.attrs["sparse"] = False +# clinicalDset.attrs["binary"] = False +# logging.debug("Done:\t Getting Clinical Data") +# +# labelFile = open(path + 'brca_labels_triple-negatif.csv') +# labels = np.array([int(line.strip().split(',')[1]) for line in labelFile]) +# labelsDset = datasetFile.create_dataset("Labels", labels.shape) +# labelsDset[...] = labels +# labelsDset.attrs["name"] = "Labels" +# +# metaDataGrp = datasetFile.create_group("Metadata") +# metaDataGrp.attrs["nbView"] = 4 +# metaDataGrp.attrs["nbClass"] = 2 +# metaDataGrp.attrs["datasetLength"] = len(labels) +# labelDictionary = {0: "No", 1: "Yes"} +# datasetFile.close() +# datasetFile = h5py.File(path + "MultiOmic.hdf5", "r") +# # datasetFile = getPseudoRNASeq(datasetFile) +# return datasetFile, labelDictionary +# +# +# def getVector(nbGenes): +# argmax = [0, 0] +# maxi = 0 +# for i in range(nbGenes): +# for j in range(nbGenes): +# if j == i + 1: +# value = (i + 1) * (nbGenes - j) +# if value > maxi: +# maxi = value +# argmax = [i, j] +# i, j = argmax +# vectorLeft = np.zeros(nbGenes, dtype=bool) +# vectorLeft[:i + 1] = np.ones(i + 1, dtype=bool) +# vectorSup = np.zeros(nbGenes, dtype=bool) +# vectorSup[j:] = np.ones(nbGenes - j, dtype=bool) +# matrixSup = j +# matrixInf = nbGenes - j +# return vectorLeft, matrixSup, matrixInf +# +# +# def findClosestPowerOfTwo(factorizationParam): +# power = 1 +# while factorizationParam - power > 0: +# power *= 2 +# if abs(factorizationParam - power) < abs(factorizationParam - power / 2): +# return power +# else: +# return power / 2 +# +# +# def easyFactorize(nbGenes, factorizationParam, t=0): +# if math.log(factorizationParam + 1, 2) % 1 == 0.0: +# pass +# else: +# factorizationParam = findClosestPowerOfTwo(factorizationParam) - 1 +# +# if nbGenes == 2: +# return 1, np.array([True, False]) +# +# if nbGenes == 3: +# return 1, np.array([True, True, False]) +# +# if factorizationParam == 1: +# t = 1 +# return t, getVector(nbGenes)[0] +# +# vectorLeft, matrixSup, matrixInf = getVector(nbGenes) +# +# t_, vectorLeftSup = easyFactorize(matrixSup, (factorizationParam - 1) / 2, t=t) +# t__, vectorLeftInf = easyFactorize(matrixInf, (factorizationParam - 1) / 2, t=t) +# +# factorLeft = np.zeros((nbGenes, t_ + t__ + 1), dtype=bool) +# +# factorLeft[:matrixSup, :t_] = vectorLeftSup.reshape(factorLeft[:matrixSup, :t_].shape) +# if nbGenes % 2 == 1: +# factorLeft[matrixInf - 1:, t_:t__ + t_] = vectorLeftInf.reshape(factorLeft[matrixInf - 1:, t_:t__ + t_].shape) +# else: +# factorLeft[matrixInf:, t_:t__ + t_] = vectorLeftInf.reshape(factorLeft[matrixInf:, t_:t__ + t_].shape) +# factorLeft[:, t__ + t_] = vectorLeft +# +# # factorSup = np.zeros((t_+t__+1, nbGenes), dtype=bool) +# # +# # factorSup[:t_, :matrixSup] = vectorSupLeft.reshape(factorSup[:t_, :matrixSup].shape) +# # if nbGenes%2==1: +# # factorSup[t_:t__+t_, matrixInf-1:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf-1:].shape) +# # else: +# # factorSup[t_:t__+t_, matrixInf:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf:].shape) +# # factorSup[t__+t_, :] = vectorSup +# return t__ + t_ + 1, factorLeft # , factorSup +# +# +# def getBaseMatrices(nbGenes, factorizationParam, path): +# t, factorLeft = easyFactorize(nbGenes, factorizationParam) +# np.savetxt(path + "factorLeft--n-" + str(nbGenes) + "--k-" + str(factorizationParam) + ".csv", factorLeft, +# delimiter=",") +# return factorLeft +# +# +# def findParams(arrayLen, nbPatients, randomState, maxNbBins=2000, 
minNbBins=10, maxLenBin=70000, minOverlapping=1, +# minNbBinsOverlapped=0, maxNbSolutions=30): +# results = [] +# if arrayLen * arrayLen * 10 / 100 > minNbBinsOverlapped * nbPatients: +# for lenBin in range(arrayLen - 1): +# lenBin += 1 +# if lenBin < maxLenBin and minNbBins * lenBin < arrayLen: +# for overlapping in sorted(range(lenBin - 1), reverse=True): +# overlapping += 1 +# if overlapping > minOverlapping and lenBin % (lenBin - overlapping) == 0: +# for nbBins in sorted(range(arrayLen - 1), reverse=True): +# nbBins += 1 +# if nbBins < maxNbBins: +# if arrayLen == (nbBins - 1) * (lenBin - overlapping) + lenBin: +# results.append({"nbBins": nbBins, "overlapping": overlapping, "lenBin": lenBin}) +# if len(results) == maxNbSolutions: +# params = results[randomState.randrange(len(results))] +# return params +# +# +# def findBins(nbBins=142, overlapping=493, lenBin=986): +# bins = [] +# for binIndex in range(nbBins): +# bins.append([i + binIndex * (lenBin - overlapping) for i in range(lenBin)]) +# return bins +# +# +# def getBins(array, bins, lenBin, overlapping): +# binnedcoord = [] +# for coordIndex, coord in enumerate(array): +# nbBinsFull = 0 +# for binIndex, bin_ in enumerate(bins): +# if coordIndex in bin_: +# binnedcoord.append(binIndex + (coord * len(bins))) +# +# return np.array(binnedcoord) +# +# +# def makeSortedBinsMatrix(nbBins, lenBins, overlapping, arrayLen, path): +# sortedBinsMatrix = np.zeros((arrayLen, nbBins), dtype=np.uint8) +# step = lenBins - overlapping +# for binIndex in range(nbBins): +# sortedBinsMatrix[step * binIndex:lenBins + (step * binIndex), binIndex] = np.ones(lenBins, dtype=np.uint8) +# np.savetxt(path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", +# sortedBinsMatrix, delimiter=",") +# return sortedBinsMatrix +# +# +# def makeSparseTotalMatrix(sortedRNASeq, randomState): +# nbPatients, nbGenes = sortedRNASeq.shape +# params = findParams(nbGenes, nbPatients, randomState) +# nbBins = params["nbBins"] +# overlapping = params["overlapping"] +# lenBin = params["lenBin"] +# bins = findBins(nbBins, overlapping, lenBin) +# sparseFull = sparse.csc_matrix((nbPatients, nbGenes * nbBins)) +# for patientIndex, patient in enumerate(sortedRNASeq): +# columnIndices = getBins(patient, bins, lenBin, overlapping) +# rowIndices = np.zeros(len(columnIndices), dtype=int) + patientIndex +# data = np.ones(len(columnIndices), dtype=bool) +# sparseFull = sparseFull + sparse.csc_matrix((data, (rowIndices, columnIndices)), +# shape=(nbPatients, nbGenes * nbBins)) +# return sparseFull +# +# +# def getAdjacenceMatrix(RNASeqRanking, sotredRNASeq, k=2): +# k = int(k) / 2 * 2 +# indices = np.zeros((RNASeqRanking.shape[0] * k * RNASeqRanking.shape[1]), dtype=int) +# data = np.ones((RNASeqRanking.shape[0] * k * RNASeqRanking.shape[1]), dtype=bool) +# indptr = np.zeros(RNASeqRanking.shape[0] + 1, dtype=int) +# nbGenes = RNASeqRanking.shape[1] +# pointer = 0 +# for patientIndex in range(RNASeqRanking.shape[0]): +# for i in range(nbGenes): +# for j in range(k / 2): +# try: +# indices[pointer] = RNASeqRanking[ +# patientIndex, (sotredRNASeq[patientIndex, i] - (j + 1))] + i * nbGenes +# pointer += 1 +# except: +# pass +# try: +# indices[pointer] = RNASeqRanking[ +# patientIndex, (sotredRNASeq[patientIndex, i] + (j + 1))] + i * nbGenes +# pointer += 1 +# except: +# pass +# # elif i<=k: +# # indices.append(patient[1]+patient[i]*nbGenes) +# # data.append(True) +# # elif i==nbGenes-1: +# # indices.append(patient[i-1]+patient[i]*nbGenes) +# # 
data.append(True) +# indptr[patientIndex + 1] = pointer +# +# mat = sparse.csr_matrix((data, indices, indptr), +# shape=(RNASeqRanking.shape[0], RNASeqRanking.shape[1] * RNASeqRanking.shape[1]), dtype=bool) +# return mat +# +# +# def getKMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): +# datasetFile = h5py.File(path + "KMultiOmic.hdf5", "w") +# +# # logging.debug("Start:\t Getting Methylation Data") +# methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',') +# logging.debug("Done:\t Getting Methylation Data") +# +# logging.debug("Start:\t Getting Sorted Methyl Data") +# Methyl = methylData +# sortedMethylGeneIndices = np.zeros(methylData.shape, dtype=int) +# MethylRanking = np.zeros(methylData.shape, dtype=int) +# for exampleIndex, exampleArray in enumerate(Methyl): +# sortedMethylDictionary = dict((index, value) for index, value in enumerate(exampleArray)) +# sortedMethylIndicesDict = sorted(sortedMethylDictionary.items(), key=operator.itemgetter(1)) +# sortedMethylIndicesArray = np.array([index for (index, value) in sortedMethylIndicesDict], dtype=int) +# sortedMethylGeneIndices[exampleIndex] = sortedMethylIndicesArray +# for geneIndex in range(Methyl.shape[1]): +# MethylRanking[exampleIndex, sortedMethylIndicesArray[geneIndex]] = geneIndex +# logging.debug("Done:\t Getting Sorted Methyl Data") +# +# logging.debug("Start:\t Getting Binarized Methyl Data") +# k = findClosestPowerOfTwo(9) - 1 +# try: +# factorizedLeftBaseMatrix = np.genfromtxt( +# path + "factorLeft--n-" + str(methylData.shape[1]) + "--k-" + str(k) + ".csv", delimiter=',') +# except: +# factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path) +# bMethylDset = datasetFile.create_dataset("View0", +# (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k), +# dtype=np.uint8) +# for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): +# patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8) +# for lineIndex, geneIndex in enumerate(patientSortedArray): +# patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :] +# bMethylDset[patientIndex] = patientMatrix.flatten() +# bMethylDset.attrs["name"] = "BMethyl" + str(k) +# bMethylDset.attrs["sparse"] = False +# bMethylDset.attrs["binary"] = True +# logging.debug("Done:\t Getting Binarized Methyl Data") +# +# logging.debug("Start:\t Getting Binned Methyl Data") +# lenBins = 3298 +# nbBins = 9 +# overlapping = 463 +# try: +# sortedBinsMatrix = np.genfromtxt( +# path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", +# delimiter=",") +# except: +# sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, methylData.shape[1], path) +# binnedMethyl = datasetFile.create_dataset("View1", ( +# sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8) +# for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): +# patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8) +# for lineIndex, geneIndex in enumerate(patientSortedArray): +# patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :] +# binnedMethyl[patientIndex] = patientMatrix.flatten() +# binnedMethyl.attrs["name"] = "bMethyl" + str(nbBins) +# binnedMethyl.attrs["sparse"] = False +# binnedMethyl.attrs["binary"] = True +# logging.debug("Done:\t Getting Binned Methyl Data") +# +# logging.debug("Start:\t Getting Binarized Methyl Data") +# k = findClosestPowerOfTwo(17) - 1 +# 
+#
+#
+# def getKMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES):
+#     datasetFile = h5py.File(path + "KMultiOmic.hdf5", "w")
+#
+#     # logging.debug("Start:\t Getting Methylation Data")
+#     methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',')
+#     logging.debug("Done:\t Getting Methylation Data")
+#
+#     logging.debug("Start:\t Getting Sorted Methyl Data")
+#     Methyl = methylData
+#     sortedMethylGeneIndices = np.zeros(methylData.shape, dtype=int)
+#     MethylRanking = np.zeros(methylData.shape, dtype=int)
+#     for exampleIndex, exampleArray in enumerate(Methyl):
+#         sortedMethylDictionary = dict((index, value) for index, value in enumerate(exampleArray))
+#         sortedMethylIndicesDict = sorted(sortedMethylDictionary.items(), key=operator.itemgetter(1))
+#         sortedMethylIndicesArray = np.array([index for (index, value) in sortedMethylIndicesDict], dtype=int)
+#         sortedMethylGeneIndices[exampleIndex] = sortedMethylIndicesArray
+#         for geneIndex in range(Methyl.shape[1]):
+#             MethylRanking[exampleIndex, sortedMethylIndicesArray[geneIndex]] = geneIndex
+#     logging.debug("Done:\t Getting Sorted Methyl Data")
+#
+#     logging.debug("Start:\t Getting Binarized Methyl Data")
+#     k = findClosestPowerOfTwo(9) - 1
+#     try:
+#         factorizedLeftBaseMatrix = np.genfromtxt(
+#             path + "factorLeft--n-" + str(methylData.shape[1]) + "--k-" + str(k) + ".csv", delimiter=',')
+#     except:
+#         factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path)
+#     bMethylDset = datasetFile.create_dataset("View0",
+#                                              (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k),
+#                                              dtype=np.uint8)
+#     for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
+#         patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8)
+#         for lineIndex, geneIndex in enumerate(patientSortedArray):
+#             patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :]
+#         bMethylDset[patientIndex] = patientMatrix.flatten()
+#     bMethylDset.attrs["name"] = "BMethyl" + str(k)
+#     bMethylDset.attrs["sparse"] = False
+#     bMethylDset.attrs["binary"] = True
+#     logging.debug("Done:\t Getting Binarized Methyl Data")
+#
+#     logging.debug("Start:\t Getting Binned Methyl Data")
+#     lenBins = 3298
+#     nbBins = 9
+#     overlapping = 463
+#     try:
+#         sortedBinsMatrix = np.genfromtxt(
+#             path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv",
+#             delimiter=",")
+#     except:
+#         sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, methylData.shape[1], path)
+#     binnedMethyl = datasetFile.create_dataset("View1", (
+#         sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8)
+#     for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
+#         patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8)
+#         for lineIndex, geneIndex in enumerate(patientSortedArray):
+#             patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :]
+#         binnedMethyl[patientIndex] = patientMatrix.flatten()
+#     binnedMethyl.attrs["name"] = "bMethyl" + str(nbBins)
+#     binnedMethyl.attrs["sparse"] = False
+#     binnedMethyl.attrs["binary"] = True
+#     logging.debug("Done:\t Getting Binned Methyl Data")
+#
+#     logging.debug("Start:\t Getting Binarized Methyl Data")
+#     k = findClosestPowerOfTwo(17) - 1
+#     try:
+#         factorizedLeftBaseMatrix = np.genfromtxt(
+#             path + "factorLeft--n-" + str(methylData.shape[1]) + "--k-" + str(k) + ".csv", delimiter=',')
+#     except:
+#         factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path)
+#     bMethylDset = datasetFile.create_dataset("View2",
+#                                              (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k),
+#                                              dtype=np.uint8)
+#     for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
+#         patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8)
+#         for lineIndex, geneIndex in enumerate(patientSortedArray):
+#             patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :]
+#         bMethylDset[patientIndex] = patientMatrix.flatten()
+#     bMethylDset.attrs["name"] = "BMethyl" + str(k)
+#     bMethylDset.attrs["sparse"] = False
+#     bMethylDset.attrs["binary"] = True
+#     logging.debug("Done:\t Getting Binarized Methyl Data")
+#
+#     logging.debug("Start:\t Getting Binned Methyl Data")
+#     lenBins = 2038
+#     nbBins = 16
+#     overlapping = 442
+#     try:
+#         sortedBinsMatrix = np.genfromtxt(
+#             path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv",
+#             delimiter=",")
+#     except:
+#         sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, methylData.shape[1], path)
+#     binnedMethyl = datasetFile.create_dataset("View3", (
+#         sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8)
+#     for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
+#         patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8)
+#         for lineIndex, geneIndex in enumerate(patientSortedArray):
+#             patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :]
+#         binnedMethyl[patientIndex] = patientMatrix.flatten()
+#     binnedMethyl.attrs["name"] = "bMethyl" + str(nbBins)
+#     binnedMethyl.attrs["sparse"] = False
+#     binnedMethyl.attrs["binary"] = True
+#     logging.debug("Done:\t Getting Binned Methyl Data")
+#
+#     labelFile = open(path + 'brca_labels_triple-negatif.csv')
+#     labels = np.array([int(line.strip().split(',')[1]) for line in labelFile])
+#     labelsDset = datasetFile.create_dataset("Labels", labels.shape)
+#     labelsDset[...] = labels
+#     labelsDset.attrs["name"] = "Labels"
+#
+#     metaDataGrp = datasetFile.create_group("Metadata")
+#     metaDataGrp.attrs["nbView"] = 4
+#     metaDataGrp.attrs["nbClass"] = 2
+#     metaDataGrp.attrs["datasetLength"] = len(labels)
+#     labelDictionary = {0: "No", 1: "Yes"}
+#
+#     datasetFile.close()
+#     datasetFile = h5py.File(path + "KMultiOmic.hdf5", "r")
+#
+#     return datasetFile, labelDictionary
+#
+#
+# def getKMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES):
+#     datasetFile = h5py.File(path + "KMultiOmic.hdf5", "r")
+#     labelDictionary = {0: "No", 1: "Yes"}
+#     return datasetFile, labelDictionary
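The dictionary-and-sorted loop above (repeated below for every view) computes what numpy's argsort gives directly; a sketch of the equivalence, assuming numerical view data:

import numpy as np

values = np.array([[0.3, 0.1, 0.2]])
sortedGeneIndices = np.argsort(values, axis=1)   # [[1, 2, 0]]: columns ordered by value
ranking = np.argsort(sortedGeneIndices, axis=1)  # [[2, 0, 1]]: rank of each original column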
+#
+#
+# def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES):
+#     datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "w")
+#
+#     logging.debug("Start:\t Getting Methylation Data")
+#     methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',')
+#     methylDset = datasetFile.create_dataset("View0", methylData.shape)
+#     methylDset[...] = methylData
+#     methylDset.attrs["name"] = "Methyl_"
+#     methylDset.attrs["sparse"] = False
+#     methylDset.attrs["binary"] = False
+#     logging.debug("Done:\t Getting Methylation Data")
+#
+#     logging.debug("Start:\t Getting Sorted Methyl Data")
+#     Methyl = datasetFile["View0"][...]
+#     sortedMethylGeneIndices = np.zeros(datasetFile.get("View0").shape, dtype=int)
+#     MethylRanking = np.zeros(datasetFile.get("View0").shape, dtype=int)
+#     for exampleIndex, exampleArray in enumerate(Methyl):
+#         sortedMethylDictionary = dict((index, value) for index, value in enumerate(exampleArray))
+#         sortedMethylIndicesDict = sorted(sortedMethylDictionary.items(), key=operator.itemgetter(1))
+#         sortedMethylIndicesArray = np.array([index for (index, value) in sortedMethylIndicesDict], dtype=int)
+#         sortedMethylGeneIndices[exampleIndex] = sortedMethylIndicesArray
+#         for geneIndex in range(Methyl.shape[1]):
+#             MethylRanking[exampleIndex, sortedMethylIndicesArray[geneIndex]] = geneIndex
+#     mMethylDset = datasetFile.create_dataset("View10", sortedMethylGeneIndices.shape, data=sortedMethylGeneIndices)
+#     mMethylDset.attrs["name"] = "SMethyl"
+#     mMethylDset.attrs["sparse"] = False
+#     mMethylDset.attrs["binary"] = False
+#     logging.debug("Done:\t Getting Sorted Methyl Data")
+#
+#     logging.debug("Start:\t Getting Binarized Methyl Data")
+#     k = findClosestPowerOfTwo(58) - 1
+#     try:
+#         factorizedLeftBaseMatrix = np.genfromtxt(
+#             path + "factorLeft--n-" + str(datasetFile.get("View0").shape[1]) + "--k-" + str(k) + ".csv", delimiter=',')
+#     except:
+#         factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path)
+#     bMethylDset = datasetFile.create_dataset("View11",
+#                                              (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k),
+#                                              dtype=np.uint8)
+#     for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
+#         patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8)
+#         for lineIndex, geneIndex in enumerate(patientSortedArray):
+#             patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :]
+#         bMethylDset[patientIndex] = patientMatrix.flatten()
+#     bMethylDset.attrs["name"] = "BMethyl"
+#     bMethylDset.attrs["sparse"] = False
+#     bMethylDset.attrs["binary"] = True
+#     logging.debug("Done:\t Getting Binarized Methyl Data")
+#
+#     logging.debug("Start:\t Getting Binned Methyl Data")
+#     lenBins = 2095
+#     nbBins = 58
+#     overlapping = 1676
+#     try:
+#         sortedBinsMatrix = np.genfromtxt(
+#             path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv",
+#             delimiter=",")
+#     except:
+#         sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View0").shape[1], path)
+#     binnedMethyl = datasetFile.create_dataset("View12", (
+#         sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8)
+#     for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices):
+#         patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8)
+#         for lineIndex, geneIndex in enumerate(patientSortedArray):
+#             patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :]
+#         binnedMethyl[patientIndex] = patientMatrix.flatten()
+#     binnedMethyl.attrs["name"] = "bMethyl"
+#     binnedMethyl.attrs["sparse"] = False
+#     binnedMethyl.attrs["binary"] = True
+#     logging.debug("Done:\t Getting Binned Methyl Data")
+#
+#     logging.debug("Start:\t Getting MiRNA Data")
+#     mirnaData = np.genfromtxt(path + "matching_mirna.csv", delimiter=',')
+#     mirnaDset = datasetFile.create_dataset("View1", mirnaData.shape)
+#     mirnaDset[...] = mirnaData
+#     mirnaDset.attrs["name"] = "MiRNA__"
+#     mirnaDset.attrs["sparse"] = False
+#     mirnaDset.attrs["binary"] = False
+#     logging.debug("Done:\t Getting MiRNA Data")
+#
+#     logging.debug("Start:\t Getting Sorted MiRNA Data")
+#     MiRNA = datasetFile["View1"][...]
+#     sortedMiRNAGeneIndices = np.zeros(datasetFile.get("View1").shape, dtype=int)
+#     MiRNARanking = np.zeros(datasetFile.get("View1").shape, dtype=int)
+#     for exampleIndex, exampleArray in enumerate(MiRNA):
+#         sortedMiRNADictionary = dict((index, value) for index, value in enumerate(exampleArray))
+#         sortedMiRNAIndicesDict = sorted(sortedMiRNADictionary.items(), key=operator.itemgetter(1))
+#         sortedMiRNAIndicesArray = np.array([index for (index, value) in sortedMiRNAIndicesDict], dtype=int)
+#         sortedMiRNAGeneIndices[exampleIndex] = sortedMiRNAIndicesArray
+#         for geneIndex in range(MiRNA.shape[1]):
+#             MiRNARanking[exampleIndex, sortedMiRNAIndicesArray[geneIndex]] = geneIndex
+#     mmirnaDset = datasetFile.create_dataset("View7", sortedMiRNAGeneIndices.shape, data=sortedMiRNAGeneIndices)
+#     mmirnaDset.attrs["name"] = "SMiRNA_"
+#     mmirnaDset.attrs["sparse"] = False
+#     mmirnaDset.attrs["binary"] = False
+#     logging.debug("Done:\t Getting Sorted MiRNA Data")
+#
+#     logging.debug("Start:\t Getting Binarized MiRNA Data")
+#     k = findClosestPowerOfTwo(517) - 1
+#     try:
+#         factorizedLeftBaseMatrix = np.genfromtxt(
+#             path + "factorLeft--n-" + str(datasetFile.get("View1").shape[1]) + "--k-" + str(k) + ".csv", delimiter=',')
+#     except:
+#         factorizedLeftBaseMatrix = getBaseMatrices(mirnaData.shape[1], k, path)
+#     bmirnaDset = datasetFile.create_dataset("View8",
+#                                             (sortedMiRNAGeneIndices.shape[0], sortedMiRNAGeneIndices.shape[1] * k),
+#                                             dtype=np.uint8)
+#     for patientIndex, patientSortedArray in enumerate(sortedMiRNAGeneIndices):
+#         patientMatrix = np.zeros((sortedMiRNAGeneIndices.shape[1], k), dtype=np.uint8)
+#         for lineIndex, geneIndex in enumerate(patientSortedArray):
+#             patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :]
+#         bmirnaDset[patientIndex] = patientMatrix.flatten()
+#     bmirnaDset.attrs["name"] = "BMiRNA_"
+#     bmirnaDset.attrs["sparse"] = False
+#     bmirnaDset.attrs["binary"] = True
+#     logging.debug("Done:\t Getting Binarized MiRNA Data")
+#
+#     logging.debug("Start:\t Getting Binned MiRNA Data")
+#     lenBins = 14
+#     nbBins = 517
+#     overlapping = 12
+#     try:
+#         sortedBinsMatrix = np.genfromtxt(
+#             path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv",
+#             delimiter=",")
+#     except:
+#         sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View1").shape[1], path)
+#     binnedMiRNA = datasetFile.create_dataset("View9", (
+#         sortedMiRNAGeneIndices.shape[0], sortedMiRNAGeneIndices.shape[1] * nbBins), dtype=np.uint8)
+#     for patientIndex, patientSortedArray in enumerate(sortedMiRNAGeneIndices):
+#         patientMatrix = np.zeros((sortedMiRNAGeneIndices.shape[1], nbBins), dtype=np.uint8)
+#         for lineIndex, geneIndex in enumerate(patientSortedArray):
+#             patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :]
+#         binnedMiRNA[patientIndex] = patientMatrix.flatten()
+#     binnedMiRNA.attrs["name"] = "bMiRNA_"
+#     binnedMiRNA.attrs["sparse"] = False
+#     binnedMiRNA.attrs["binary"] = True
+#     logging.debug("Done:\t Getting Binned MiRNA Data")
+#
+#     logging.debug("Start:\t Getting RNASeq Data")
+#     rnaseqData = np.genfromtxt(path + "matching_rnaseq.csv", delimiter=',')
+#     uselessRows = []
+#     for rowIndex, row in enumerate(np.transpose(rnaseqData)):
+#         if not row.any():
+#             uselessRows.append(rowIndex)
+#     usefulRows = [usefulRowIndex for usefulRowIndex in range(rnaseqData.shape[1]) if usefulRowIndex not in uselessRows]
+#     rnaseqDset = datasetFile.create_dataset("View2", (rnaseqData.shape[0], len(usefulRows)))
+#     rnaseqDset[...] = rnaseqData[:, usefulRows]
+#     rnaseqDset.attrs["name"] = "RNASeq_"
+#     rnaseqDset.attrs["sparse"] = False
+#     rnaseqDset.attrs["binary"] = False
+#     logging.debug("Done:\t Getting RNASeq Data")
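The uselessRows loop above walks the transposed matrix, so despite its name it collects indices of all-zero columns; the same filter can be written with a single mask (a sketch, not the repository's code):

import numpy as np

rnaseqData = np.array([[1.0, 0.0, 2.0],
                       [3.0, 0.0, 0.0]])
usefulColumns = rnaseqData.any(axis=0)   # False only for the all-zero column
filtered = rnaseqData[:, usefulColumns]  # shape (2, 2)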
+#
+#     logging.debug("Start:\t Getting Sorted RNASeq Data")
+#     RNASeq = datasetFile["View2"][...]
+#     sortedRNASeqGeneIndices = np.zeros(datasetFile.get("View2").shape, dtype=int)
+#     RNASeqRanking = np.zeros(datasetFile.get("View2").shape, dtype=int)
+#     for exampleIndex, exampleArray in enumerate(RNASeq):
+#         sortedRNASeqDictionary = dict((index, value) for index, value in enumerate(exampleArray))
+#         sortedRNASeqIndicesDict = sorted(sortedRNASeqDictionary.items(), key=operator.itemgetter(1))
+#         sortedRNASeqIndicesArray = np.array([index for (index, value) in sortedRNASeqIndicesDict], dtype=int)
+#         sortedRNASeqGeneIndices[exampleIndex] = sortedRNASeqIndicesArray
+#         for geneIndex in range(RNASeq.shape[1]):
+#             RNASeqRanking[exampleIndex, sortedRNASeqIndicesArray[geneIndex]] = geneIndex
+#     mrnaseqDset = datasetFile.create_dataset("View4", sortedRNASeqGeneIndices.shape, data=sortedRNASeqGeneIndices)
+#     mrnaseqDset.attrs["name"] = "SRNASeq"
+#     mrnaseqDset.attrs["sparse"] = False
+#     mrnaseqDset.attrs["binary"] = False
+#     logging.debug("Done:\t Getting Sorted RNASeq Data")
+#
+#     logging.debug("Start:\t Getting Binarized RNASeq Data")
+#     k = findClosestPowerOfTwo(100) - 1
+#     try:
+#         factorizedLeftBaseMatrix = np.genfromtxt(
+#             path + "factorLeft--n-" + str(datasetFile.get("View2").shape[1]) + "--k-" + str(100) + ".csv",
+#             delimiter=',')
+#     except:
+#         factorizedLeftBaseMatrix = getBaseMatrices(rnaseqData.shape[1], k, path)
+#     brnaseqDset = datasetFile.create_dataset("View5",
+#                                              (sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1] * k),
+#                                              dtype=np.uint8)
+#     for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices):
+#         patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], k), dtype=np.uint8)
+#         for lineIndex, geneIndex in enumerate(patientSortedArray):
+#             patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :]
+#         brnaseqDset[patientIndex] = patientMatrix.flatten()
+#     brnaseqDset.attrs["name"] = "BRNASeq"
+#     brnaseqDset.attrs["sparse"] = False
+#     brnaseqDset.attrs["binary"] = True
+#     logging.debug("Done:\t Getting Binarized RNASeq Data")
+#
+#     logging.debug("Start:\t Getting Binned RNASeq Data")
+#     lenBins = 986
+#     nbBins = 142
+#     overlapping = 493
+#     try:
+#         sortedBinsMatrix = np.genfromtxt(
+#             path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv",
+#             delimiter=",")
+#     except:
+#         sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View2").shape[1], path)
+#     binnedRNASeq = datasetFile.create_dataset("View6", (
+#         sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1] * nbBins), dtype=np.uint8)
+#     for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices):
+#         patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], nbBins), dtype=np.uint8)
+#         for lineIndex, geneIndex in enumerate(patientSortedArray):
+#             patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :]
+#         binnedRNASeq[patientIndex] = patientMatrix.flatten()
+#     binnedRNASeq.attrs["name"] = "bRNASeq"
+#     binnedRNASeq.attrs["sparse"] = False
+#     binnedRNASeq.attrs["binary"] = True
+#     logging.debug("Done:\t Getting Binned RNASeq Data")
+#
+#     logging.debug("Start:\t Getting Clinical Data")
+#     clinical = np.genfromtxt(path + "clinicalMatrix.csv", delimiter=',')
+#     clinicalDset = datasetFile.create_dataset("View3", clinical.shape)
+#     clinicalDset[...] = clinical
+#     clinicalDset.attrs["name"] = "Clinic_"
+#     clinicalDset.attrs["sparse"] = False
+#     clinicalDset.attrs["binary"] = False
+#     logging.debug("Done:\t Getting Clinical Data")
+#
+#     logging.debug("Start:\t Getting Binarized Clinical Data")
+#     binarized_clinical = np.zeros((347, 1951), dtype=np.uint8)
+#     nb_already_done = 0
+#     for feqtureIndex, feature in enumerate(np.transpose(clinical)):
+#         featureSet = set(feature)
+#         featureDict = dict((val, valIndex) for valIndex, val in enumerate(list(featureSet)))
+#         for valueIndex, value in enumerate(feature):
+#             binarized_clinical[valueIndex, featureDict[value] + nb_already_done] = 1
+#         nb_already_done += len(featureSet)
+#     bClinicalDset = datasetFile.create_dataset("View13", binarized_clinical.shape, dtype=np.uint8,
+#                                                data=binarized_clinical)
+#     bClinicalDset.attrs["name"] = "bClinic"
+#     bClinicalDset.attrs["sparse"] = False
+#     bClinicalDset.attrs["binary"] = True
+#     logging.debug("Done:\t Getting Binarized Clinical Data")
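The clinical binarization above one-hot encodes each feature by enumerating its distinct values; a compact sketch of the same idea on a toy feature (sorted here for determinism, unlike the set iteration above):

import numpy as np

feature = np.array([2, 5, 2, 7])
valueToColumn = {value: column for column, value in enumerate(sorted(set(feature)))}
oneHot = np.zeros((len(feature), len(valueToColumn)), dtype=np.uint8)
for row, value in enumerate(feature):
    oneHot[row, valueToColumn[value]] = 1
# the columns of oneHot correspond to the values 2, 5 and 7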
+#
+#     # logging.debug("Start:\t Getting Adjacence RNASeq Data")
+#     # sparseAdjRNASeq = getAdjacenceMatrix(RNASeqRanking, sortedRNASeqGeneIndices, k=findClosestPowerOfTwo(10)-1)
+#     # sparseAdjRNASeqGrp = datasetFile.create_group("View6")
+#     # dataDset = sparseAdjRNASeqGrp.create_dataset("data", sparseAdjRNASeq.data.shape, data=sparseAdjRNASeq.data)
+#     # indicesDset = sparseAdjRNASeqGrp.create_dataset("indices",
+#     #                                                 sparseAdjRNASeq.indices.shape, data=sparseAdjRNASeq.indices)
+#     # indptrDset = sparseAdjRNASeqGrp.create_dataset("indptr",
+#     #                                                sparseAdjRNASeq.indptr.shape, data=sparseAdjRNASeq.indptr)
+#     # sparseAdjRNASeqGrp.attrs["name"]="ARNASeq"
+#     # sparseAdjRNASeqGrp.attrs["sparse"]=True
+#     # sparseAdjRNASeqGrp.attrs["shape"]=sparseAdjRNASeq.shape
+#     # logging.debug("Done:\t Getting Adjacence RNASeq Data")
+#
+#     labelFile = open(path + 'brca_labels_triple-negatif.csv')
+#     labels = np.array([int(line.strip().split(',')[1]) for line in labelFile])
+#     labelsDset = datasetFile.create_dataset("Labels", labels.shape)
+#     labelsDset[...] = labels
+#     labelsDset.attrs["name"] = "Labels"
+#
+#     metaDataGrp = datasetFile.create_group("Metadata")
+#     metaDataGrp.attrs["nbView"] = 14
+#     metaDataGrp.attrs["nbClass"] = 2
+#     metaDataGrp.attrs["datasetLength"] = len(labels)
+#     labelDictionary = {0: "No", 1: "Yes"}
+#
+#     datasetFile.close()
+#     datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "r")
+#
+#     return datasetFile, labelDictionary
+#
+#
+# def getModifiedMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES):
+#     datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "r")
+#     labelDictionary = {0: "No", 1: "Yes"}
+#     return datasetFile, labelDictionary
+#
+#
+# def getMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES):
+#     datasetFile = h5py.File(path + "MultiOmic.hdf5", "r")
+#     labelDictionary = {0: "No", 1: "Yes"}
+#     return datasetFile, labelDictionary
+#
+#
+# def copyHDF5(pathF, name, nbCores):
+#     datasetFile = h5py.File(pathF + name + ".hdf5", "r")
+#     for coreIndex in range(nbCores):
+#         newDataSet = h5py.File(pathF + name + str(coreIndex) + ".hdf5", "w")
+#         for dataset in datasetFile:
+#             datasetFile.copy("/" + dataset, newDataSet["/"])
+#         newDataSet.close()
+#
+#
+# def datasetsAlreadyExist(pathF, name, nbCores):
+#     allDatasetExist = True
+#     for coreIndex in range(nbCores):
+#         import os.path
+#         allDatasetExist *= os.path.isfile(pathF + name + str(coreIndex) + ".hdf5")
+#     return allDatasetExist
+#
+#
+# def deleteHDF5(pathF, name, nbCores):
+#     for coreIndex in range(nbCores):
+#         os.remove(pathF + name + str(coreIndex) + ".hdf5")
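Together, copyHDF5, datasetsAlreadyExist and deleteHDF5 implement the per-core dataset copies that Dataset.py's initMultipleDatasets relies on; a hypothetical call sequence, assuming the live (uncommented) versions of these helpers and placeholder paths:

# pathF and name are placeholders; nbCores matches the benchmark's core count.
if not datasetsAlreadyExist("/data/", "MyDataset", nbCores=4):
    copyHDF5("/data/", "MyDataset", nbCores=4)  # writes MyDataset0.hdf5 ... MyDataset3.hdf5
# ... run the multicore benchmark ...
deleteHDF5("/data/", "MyDataset", nbCores=4)    # removes the per-core copies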
+#
+# # def getOneViewFromDB(viewName, pathToDB, DBName):
+# #     view = np.genfromtxt(pathToDB + DBName +"-" + viewName, delimiter=';')
+# #     return view
+#
+#
+# # def getClassLabels(pathToDB, DBName):
+# #     labels = np.genfromtxt(pathToDB + DBName + "-" + "ClassLabels.csv", delimiter=';')
+# #     return labels
+#
+#
+# # def getDataset(pathToDB, viewNames, DBName):
+# #     dataset = []
+# #     for viewName in viewNames:
+# #         dataset.append(getOneViewFromDB(viewName, pathToDB, DBName))
+# #     return np.array(dataset)
+#
+#
+# # def getAwaLabels(nbLabels, pathToAwa):
+# #     labelsFile = open(pathToAwa + 'Animals_with_Attributes/classes.txt', 'U')
+# #     linesFile = [''.join(line.strip().split()).translate(None, digits) for line in labelsFile.readlines()]
+# #     return linesFile
+#
+#
+# # def getAwaDBcsv(views, pathToAwa, nameDB, nbLabels, LABELS_NAMES):
+# #     awaLabels = getAwaLabels(nbLabels, pathToAwa)
+# #     nbView = len(views)
+# #     nbMaxLabels = len(awaLabels)
+# #     if nbLabels == -1:
+# #         nbLabels = nbMaxLabels
+# #     nbNamesGiven = len(LABELS_NAMES)
+# #     if nbNamesGiven > nbLabels:
+# #         labelDictionary = {i:LABELS_NAMES[i] for i in np.arange(nbLabels)}
+# #     elif nbNamesGiven < nbLabels and nbLabels <= nbMaxLabels:
+# #         if LABELS_NAMES != ['']:
+# #             labelDictionary = {i:LABELS_NAMES[i] for i in np.arange(nbNamesGiven)}
+# #         else:
+# #             labelDictionary = {}
+# #             nbNamesGiven = 0
+# #         nbLabelsToAdd = nbLabels-nbNamesGiven
+# #         while nbLabelsToAdd > 0:
+# #             currentLabel = random.choice(awaLabels)
+# #             if currentLabel not in labelDictionary.values():
+# #                 labelDictionary[nbLabels-nbLabelsToAdd]=currentLabel
+# #                 nbLabelsToAdd -= 1
+# #             else:
+# #                 pass
+# #     else:
+# #         labelDictionary = {i: LABELS_NAMES[i] for i in np.arange(nbNamesGiven)}
+# #     viewDictionary = {i: views[i] for i in np.arange(nbView)}
+# #     rawData = []
+# #     labels = []
+# #     nbExample = 0
+# #     for view in np.arange(nbView):
+# #         viewData = []
+# #         for labelIndex in np.arange(nbLabels):
+# #             pathToExamples = pathToAwa + 'Animals_with_Attributes/Features/' + viewDictionary[view] + '/' + \
+# #                              labelDictionary[labelIndex] + '/'
+# #             examples = os.listdir(pathToExamples)
+# #             if view == 0:
+# #                 nbExample += len(examples)
+# #             for example in examples:
+# #                 if viewDictionary[view]=='decaf':
+# #                     exampleFile = open(pathToExamples + example)
+# #                     viewData.append([float(line.strip()) for line in exampleFile])
+# #                 else:
+# #                     exampleFile = open(pathToExamples + example)
+# #                     viewData.append([[float(coordinate) for coordinate in raw.split()] for raw in exampleFile][0])
+# #             if view == 0:
+# #                 labels.append(labelIndex)
+# #
+# #         rawData.append(np.array(viewData))
+# #     data = rawData
+# #     DATASET_LENGTH = len(labels)
+# #     return data, labels, labelDictionary, DATASET_LENGTH
+# #
+# #
+# # def getDbfromCSV(path):
+# #     files = os.listdir(path)
+# #     DATA = np.zeros((3,40,2))
+# #     for file in files:
+# #         if file[-9:]=='moins.csv' and file[:7]=='sample1':
+# #             X = open(path+file)
+# #             for x, i in zip(X, range(20)):
+# #                 DATA[0, i] = np.array([float(coord) for coord in x.strip().split('\t')])
+# #         if file[-9:]=='moins.csv' and file[:7]=='sample2':
+# #             X = open(path+file)
+# #             for x, i in zip(X, range(20)):
+# #                 DATA[1, i] = np.array([float(coord) for coord in x.strip().split('\t')])
+# #         if file[-9:]=='moins.csv' and file[:7]=='sample3':
+# #             X = open(path+file)
+# #             for x, i in zip(X, range(20)):
+# #                 DATA[2, i] = np.array([float(coord) for coord in x.strip().split('\t')])
+# #
+# #     for file in files:
+# #         if file[-8:]=='plus.csv' and file[:7]=='sample1':
+# #             X = open(path+file)
+# #             for x, i in zip(X, range(20)):
+# #                 DATA[0, i+20] = np.array([float(coord) for coord in x.strip().split('\t')])
+# #         if file[-8:]=='plus.csv' and file[:7]=='sample2':
+# #             X = open(path+file)
+# #             for x, i in zip(X, range(20)):
+# #                 DATA[1, i+20] = np.array([float(coord) for coord in x.strip().split('\t')])
+# #         if file[-8:]=='plus.csv' and file[:7]=='sample3':
+# #             X = open(path+file)
+# #             for x, i in zip(X, range(20)):
+# #                 DATA[2, i+20] = np.array([float(coord) for coord in x.strip().split('\t')])
+# #     LABELS = np.zeros(40)
+# #     LABELS[:20]=LABELS[:20]+1
+# #     return DATA, LABELS
+#
+# # def makeArrayFromTriangular(pseudoRNASeqMatrix):
+# #     matrixShape = len(pseudoRNASeqMatrix[0,:])
+# #     exampleArray = np.array(((matrixShape-1)*matrixShape)/2)
+# #     arrayIndex = 0
+# #     for i in range(matrixShape-1):
+# #         for j in range(i+1, matrixShape):
+# #             exampleArray[arrayIndex]=pseudoRNASeqMatrix[i,j]
+# #             arrayIndex += 1
+# #     return exampleArray
+#
+#
+# # def getPseudoRNASeq(dataset):
+# #     nbGenes = len(dataset["/View2/matrix"][0, :])
+# #     pseudoRNASeq = np.zeros((dataset["/datasetlength"][...], ((nbGenes - 1) * nbGenes) / 2), dtype=bool_)
+# #     for exampleIndex in xrange(dataset["/datasetlength"][...]):
+# #         arrayIndex = 0
+# #         for i in xrange(nbGenes):
+# #             for j in xrange(nbGenes):
+# #                 if i > j:
+# #                     pseudoRNASeq[exampleIndex, arrayIndex] =
+# #                         dataset["/View2/matrix"][exampleIndex, j] < dataset["/View2/matrix"][exampleIndex, i]
+# #                     arrayIndex += 1
+# #     dataset["/View4/matrix"] = pseudoRNASeq
+# #     dataset["/View4/name"] = "pseudoRNASeq"
+# #     return dataset
+#
+#
+# # def allSame(array):
+# #     value = array[0]
+# #     areAllSame = True
+# #     for i in array:
+# #         if i != value:
+# #             areAllSame = False
+# #     return areAllSame
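The commented-out allSame helper above is the loop form of a one-line numpy check; a sketch:

import numpy as np

def all_same(array):
    # True when every entry equals the first one, as in the loop above.
    return bool(np.all(array == array[0]))

assert all_same(np.array([3, 3, 3])) and not all_same(np.array([3, 1, 3]))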
diff --git a/Code/MonoMultiViewClassifiers/utils/HyperParameterSearch.py b/Code/MonoMultiViewClassifiers/utils/HyperParameterSearch.py
index bdc430ad5113b365bee0a4c93ceca93ea22d9219..a30b1b7a1b35bc726fddd21a26bcbae75f4e8c88 100644
--- a/Code/MonoMultiViewClassifiers/utils/HyperParameterSearch.py
+++ b/Code/MonoMultiViewClassifiers/utils/HyperParameterSearch.py
@@ -8,6 +8,7 @@ from .. import Metrics
 
 def searchBestSettings(dataset, classifierPackage, classifierName, metrics, iLearningIndices, iKFolds, randomState,
                        viewsIndices=None, searchingTool="hyperParamSearch", nIter=1, **kwargs):
+    """Used to select the right hyperparameter optimization function"""
     if viewsIndices is None:
         viewsIndices = range(dataset.get("Metadata").attrs["nbView"])
     thismodule = sys.modules[__name__]
@@ -18,12 +19,13 @@
 
 
 def gridSearch(dataset, classifierName, viewsIndices=None, kFolds=None, nIter=1, **kwargs):
-    # si grid search est selectionne, on veut tester certaines valeurs
+    """Used to perform grid search on the classifiers"""
     pass
 
 
 def randomizedSearch(dataset, classifierPackage, classifierName, metrics, learningIndices, KFolds, randomState,
                      viewsIndices=None, nIter=1, nbCores=1, **classificationKWARGS):
+    """Used to perform a random search on the classifiers to optimize hyperparameters"""
     if viewsIndices is None:
         viewsIndices = range(dataset.get("Metadata").attrs["nbView"])
     metric = metrics[0]
@@ -75,10 +77,12 @@
 
 
 def spearMint(dataset, classifierName, viewsIndices=None, kFolds=None, nIter=1, **kwargs):
+    """Used to run Spearmint on the classifiers to optimize hyperparameters"""
     pass
 
 
 def genHeatMaps(params, scoresArray, outputFileName):
+    """Used to generate a heat map for each pair of hyperparameters optimized in the previous function"""
     nbParams = len(params)
     if nbParams > 2:
         combinations = itertools.combinations(range(nbParams), 2)
@@ -110,128 +114,128 @@
         plt.savefig(outputFileName + "heat_map-" + paramName1 + "-" + paramName2 + ".png")
         plt.close()
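genHeatMaps enumerates unordered parameter pairs with itertools.combinations, producing one heat map per pair; a minimal sketch of that enumeration (the parameter names are invented for illustration):

import itertools

params = ["maxDepth", "nEstimators", "subSampling"]
for index1, index2 in itertools.combinations(range(len(params)), 2):
    # One heat map per unordered pair of hyperparameters.
    print(params[index1], "vs", params[index2])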
-    # nohup python ~/dev/git/spearmint/spearmint/main.py . &
-
-    # import json
-    # import numpy as np
-    # import math
-    #
-    # from os import system
-    # from os.path import join
-    #
-    #
-    # def run_kover(dataset, split, model_type, p, max_rules, output_dir):
-    #     outdir = join(output_dir, "%s_%f" % (model_type, p))
-    #     kover_command = "kover learn " \
-    #                     "--dataset '%s' " \
-    #                     "--split %s " \
-    #                     "--model-type %s " \
-    #                     "--p %f " \
-    #                     "--max-rules %d " \
-    #                     "--max-equiv-rules 10000 " \
-    #                     "--hp-choice cv " \
-    #                     "--random-seed 0 " \
-    #                     "--output-dir '%s' " \
-    #                     "--n-cpu 1 " \
-    #                     "-v" % (dataset,
-    #                             split,
-    #                             model_type,
-    #                             p,
-    #                             max_rules,
-    #                             outdir)
-    #
-    #     system(kover_command)
-    #
-    #     return json.load(open(join(outdir, "results.json")))["cv"]["best_hp"]["score"]
-    #
-    #
-    # def main(job_id, params):
-    #     print params
-    #
-    #     max_rules = params["MAX_RULES"][0]
-    #
-    #     species = params["SPECIES"][0]
-    #     antibiotic = params["ANTIBIOTIC"][0]
-    #     split = params["SPLIT"][0]
-    #
-    #     model_type = params["model_type"][0]
-    #
-    #     # LS31
-    #     if species == "saureus":
-    #         dataset_path = "/home/droale01/droale01-ls31/projects/genome_scm/data/earle_2016/saureus/kover_datasets/%s.kover" % antibiotic
-    #     else:
-    #         dataset_path = "/home/droale01/droale01-ls31/projects/genome_scm/genome_scm_paper/data/%s/%s.kover" % (species, antibiotic)
-    #
-    #     output_path = "/home/droale01/droale01-ls31/projects/genome_scm/manifold_scm/spearmint/vanilla_scm/%s/%s" % (species, antibiotic)
-    #
-    #     # MacBook
-    #     #dataset_path = "/Volumes/Einstein 1/kover_phylo/datasets/%s/%s.kover" % (species, antibiotic)
-    #     #output_path = "/Volumes/Einstein 1/manifold_scm/version2/%s_spearmint" % antibiotic
-    #
-    #     return run_kover(dataset=dataset_path,
-    #                      split=split,
-    #                      model_type=model_type,
-    #                      p=params["p"][0],
-    #                      max_rules=max_rules,
-    #                      output_dir=output_path)
-    # killall mongod && sleep 1 && rm -r database/* && rm mongo.log*
-    # mongod --fork --logpath mongo.log --dbpath database
-    #
-    # {
-    #     "language" : "PYTHON",
-    #     "experiment-name" : "vanilla_scm_cdiff_azithromycin",
-    #     "polling-time" : 1,
-    #     "resources" : {
-    #         "my-machine" : {
-    #             "scheduler" : "local",
-    #             "max-concurrent" : 5,
-    #             "max-finished-jobs" : 100
-    #         }
-    #     },
-    #     "tasks": {
-    #         "resistance" : {
-    #             "type" : "OBJECTIVE",
-    #             "likelihood" : "NOISELESS",
-    #             "main-file" : "spearmint_wrapper",
-    #             "resources" : ["my-machine"]
-    #         }
-    #     },
-    #     "variables": {
-    #
-    #         "MAX_RULES" : {
-    #             "type" : "ENUM",
-    #             "size" : 1,
-    #             "options": [10]
-    #         },
-    #
-    #
-    #         "SPECIES" : {
-    #             "type" : "ENUM",
-    #             "size" : 1,
-    #             "options": ["cdiff"]
-    #         },
-    #         "ANTIBIOTIC" : {
-    #             "type" : "ENUM",
-    #             "size" : 1,
-    #             "options": ["azithromycin"]
-    #         },
-    #         "SPLIT" : {
-    #             "type" : "ENUM",
-    #             "size" : 1,
-    #             "options": ["split_seed_2"]
-    #         },
-    #
-    #
-    #         "model_type" : {
-    #             "type" : "ENUM",
-    #             "size" : 1,
-    #             "options": ["conjunction", "disjunction"]
-    #         },
-    #         "p" : {
-    #             "type" : "FLOAT",
-    #             "size" : 1,
-    #             "min" : 0.01,
-    #             "max" : 100
-    #         }
-    #     }
-    # }
+# nohup python ~/dev/git/spearmint/spearmint/main.py . &
+
+# import json
+# import numpy as np
+# import math
+#
+# from os import system
+# from os.path import join
+#
+#
+# def run_kover(dataset, split, model_type, p, max_rules, output_dir):
+#     outdir = join(output_dir, "%s_%f" % (model_type, p))
+#     kover_command = "kover learn " \
+#                     "--dataset '%s' " \
+#                     "--split %s " \
+#                     "--model-type %s " \
+#                     "--p %f " \
+#                     "--max-rules %d " \
+#                     "--max-equiv-rules 10000 " \
+#                     "--hp-choice cv " \
+#                     "--random-seed 0 " \
+#                     "--output-dir '%s' " \
+#                     "--n-cpu 1 " \
+#                     "-v" % (dataset,
+#                             split,
+#                             model_type,
+#                             p,
+#                             max_rules,
+#                             outdir)
+#
+#     system(kover_command)
+#
+#     return json.load(open(join(outdir, "results.json")))["cv"]["best_hp"]["score"]
+#
+#
+# def main(job_id, params):
+#     print params
+#
+#     max_rules = params["MAX_RULES"][0]
+#
+#     species = params["SPECIES"][0]
+#     antibiotic = params["ANTIBIOTIC"][0]
+#     split = params["SPLIT"][0]
+#
+#     model_type = params["model_type"][0]
+#
+#     # LS31
+#     if species == "saureus":
+#         dataset_path = "/home/droale01/droale01-ls31/projects/genome_scm/data/earle_2016/saureus/kover_datasets/%s.kover" % antibiotic
+#     else:
+#         dataset_path = "/home/droale01/droale01-ls31/projects/genome_scm/genome_scm_paper/data/%s/%s.kover" % (species, antibiotic)
+#
+#     output_path = "/home/droale01/droale01-ls31/projects/genome_scm/manifold_scm/spearmint/vanilla_scm/%s/%s" % (species, antibiotic)
+#
+#     # MacBook
+#     #dataset_path = "/Volumes/Einstein 1/kover_phylo/datasets/%s/%s.kover" % (species, antibiotic)
+#     #output_path = "/Volumes/Einstein 1/manifold_scm/version2/%s_spearmint" % antibiotic
+#
+#     return run_kover(dataset=dataset_path,
+#                      split=split,
+#                      model_type=model_type,
+#                      p=params["p"][0],
+#                      max_rules=max_rules,
+#                      output_dir=output_path)
+# killall mongod && sleep 1 && rm -r database/* && rm mongo.log*
+# mongod --fork --logpath mongo.log --dbpath database
+#
+# {
+#     "language" : "PYTHON",
+#     "experiment-name" : "vanilla_scm_cdiff_azithromycin",
+#     "polling-time" : 1,
+#     "resources" : {
+#         "my-machine" : {
+#             "scheduler" : "local",
+#             "max-concurrent" : 5,
+#             "max-finished-jobs" : 100
+#         }
+#     },
+#     "tasks": {
+#         "resistance" : {
+#             "type" : "OBJECTIVE",
+#             "likelihood" : "NOISELESS",
+#             "main-file" : "spearmint_wrapper",
+#             "resources" : ["my-machine"]
+#         }
+#     },
+#     "variables": {
+#
+#         "MAX_RULES" : {
+#             "type" : "ENUM",
+#             "size" : 1,
+#             "options": [10]
+#         },
+#
+#
+#         "SPECIES" : {
+#             "type" : "ENUM",
+#             "size" : 1,
+#             "options": ["cdiff"]
+#         },
+#         "ANTIBIOTIC" : {
+#             "type" : "ENUM",
+#             "size" : 1,
+#             "options": ["azithromycin"]
+#         },
+#         "SPLIT" : {
+#             "type" : "ENUM",
+#             "size" : 1,
+#             "options": ["split_seed_2"]
+#         },
+#
+#
+#         "model_type" : {
+#             "type" : "ENUM",
+#             "size" : 1,
+#             "options": ["conjunction", "disjunction"]
+#         },
+#         "p" : {
+#             "type" : "FLOAT",
+#             "size" : 1,
+#             "min" : 0.01,
+#             "max" : 100
+#         }
+#     }
+# }
diff --git a/Code/MonoMultiViewClassifiers/utils/Interpret.py b/Code/MonoMultiViewClassifiers/utils/Interpret.py
index 00562135e6cd34d359f1bce174a9072681365d66..e83b2e55b9d9f00adedb65cbd1b44cb8ebb29db1 100644
--- a/Code/MonoMultiViewClassifiers/utils/Interpret.py
+++ b/Code/MonoMultiViewClassifiers/utils/Interpret.py
@@ -5,11 +5,12 @@ import pickle
 
 
 def percent(x, pos):
-    'The two args are the value and tick position'
+    """Used to display percentages of importance on the y axis"""
     return '%1.1f %%' % (x * 100)
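percent has the (value, position) signature that matplotlib tick formatters expect; a hedged wiring sketch, assuming an Axes named ax:

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

def percent(x, pos):
    return '%1.1f %%' % (x * 100)

fig, ax = plt.subplots()
ax.bar(range(3), [0.5, 0.3, 0.2])
ax.yaxis.set_major_formatter(FuncFormatter(percent))  # 0.5 is rendered as "50.0 %"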
 
 
 def getFeatureImportance(classifier, directory, interpretString=""):
+    """Used to generate a graph and a pickle dictionary representing feature importances"""
     featureImportances = classifier.feature_importances_
     sortedArgs = np.argsort(-featureImportances)
     featureImportancesSorted = featureImportances[sortedArgs][:50]
diff --git a/Code/MonoMultiViewClassifiers/utils/execution.py b/Code/MonoMultiViewClassifiers/utils/execution.py
index 7552f068f7fcafc341ab2c669b9b9229064b67e6..1a2e287705cc8f1eaca7d1ad573b040af99391d9 100644
--- a/Code/MonoMultiViewClassifiers/utils/execution.py
+++ b/Code/MonoMultiViewClassifiers/utils/execution.py
@@ -9,6 +9,7 @@ import sklearn
 
 
 def parseTheArgs(arguments):
+    """Used to parse the args entered by the user"""
     parser = argparse.ArgumentParser(
         description='This file is used to benchmark the scores fo multiple classification algorithm on multiview data.',
@@ -183,6 +184,7 @@
     return args
 
 def initRandomState(randomStateArg, directory):
+    """Used to init a random state, and multiple ones if needed (multicore)"""
     if randomStateArg is None:
         randomState = np.random.RandomState(randomStateArg)
     else:
@@ -199,6 +201,7 @@
 
 
 def initLogFile(args):
+    """Used to init the directory where the results will be stored and the log file"""
     resultDirectory = "../Results/" + args.name + "/started_" + time.strftime("%Y_%m_%d-%H_%M") + "/"
     logFileName = time.strftime("%Y%m%d-%H%M%S") + "-" + ''.join(args.CL_type) + "-" + "_".join(
         args.views) + "-" + args.name + "-LOG"
@@ -226,11 +229,14 @@
 
 
 def genSplits(statsIter, datasetlength, DATASET, splitRatio, statsIterRandomStates):
+    """Used to generate the train/test splits using one or multiple random states"""
     indices = np.arange(datasetlength)
     if statsIter > 1:
         splits = []
         for randomState in statsIterRandomStates:
-            foldsObj = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, random_state=randomState, test_size=splitRatio)
+            foldsObj = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1,
+                                                                      random_state=randomState,
+                                                                      test_size=splitRatio)
             folds = foldsObj.split(indices, DATASET.get("Labels").value)
             for fold in folds:
                 train_fold, test_fold = fold
@@ -249,6 +255,7 @@
 
 
 def genKFolds(statsIter, nbFolds, statsIterRandomStates):
+    """Used to generate the folds indices for cross-validation, and multiple sets if needed"""
     if statsIter > 1:
         foldsList = []
         for randomState in statsIterRandomStates:
@@ -259,8 +266,7 @@
 
 
 def initViews(DATASET, args):
-    """Used to return the views names that will be used by the algos, their indices and all the views names
-    Needs args.views"""
+    """Used to return the names of the views that will be used by the algos, their indices and all the view names"""
     NB_VIEW = DATASET.get("Metadata").attrs["nbView"]
     if args.views != [""]:
         allowedViews = args.views
@@ -278,6 +284,7 @@
 
 
 def genDirecortiesNames(directory, statsIter):
+    """Used to generate the different directories for each iteration if needed"""
     if statsIter > 1:
         directories = []
         for i in range(statsIter):