Skip to content
Snippets Groups Projects
Commit 132f8710 authored by bbauvin's avatar bbauvin
Browse files

Added an option to repeat a training/classification run and average the results

parent 1e9c7274
Branches
Tags
No related merge requests found
...@@ -79,6 +79,8 @@ groupClass.add_argument('--CL_algos_multiview', metavar='STRING', action='store' ...@@ -79,6 +79,8 @@ groupClass.add_argument('--CL_algos_multiview', metavar='STRING', action='store'
help='Determine which multiview classifier to use, separate with : if multiple, if empty, considering all', default='') help='Determine which multiview classifier to use, separate with : if multiple, if empty, considering all', default='')
groupClass.add_argument('--CL_cores', metavar='INT', action='store', help='Number of cores, -1 for all', type=int, groupClass.add_argument('--CL_cores', metavar='INT', action='store', help='Number of cores, -1 for all', type=int,
default=1) default=1)
groupClass.add_argument('--CL_statsiter', metavar='INT', action='store', help='Number of iteration for each algorithm to mean results', type=int,
default=1)
groupClass.add_argument('--CL_metrics', metavar='STRING', action='store', nargs="+", groupClass.add_argument('--CL_metrics', metavar='STRING', action='store', nargs="+",
help='Determine which metrics to use, separate metric and configuration with ":". If multiple, separate with space. If no metric is specified, considering all with accuracy for classification ' help='Determine which metrics to use, separate metric and configuration with ":". If multiple, separate with space. If no metric is specified, considering all with accuracy for classification '
'first one will be used for classification', default=['']) 'first one will be used for classification', default=[''])
...@@ -164,6 +166,7 @@ groupFusion.add_argument('--FU_cl_config', metavar='STRING', action='store', nar ...@@ -164,6 +166,7 @@ groupFusion.add_argument('--FU_cl_config', metavar='STRING', action='store', nar
args = parser.parse_args() args = parser.parse_args()
os.nice(args.nice) os.nice(args.nice)
nbCores = args.CL_cores nbCores = args.CL_cores
statsIter = args.CL_statsiter
start = time.time() start = time.time()
if args.name not in ["MultiOmic", "ModifiedMultiOmic", "Caltech", "Fake", "Plausible"]: if args.name not in ["MultiOmic", "ModifiedMultiOmic", "Caltech", "Fake", "Plausible"]:
getDatabase = getattr(DB, "getClassicDB" + args.type[1:]) getDatabase = getattr(DB, "getClassicDB" + args.type[1:])
...@@ -321,7 +324,7 @@ if nbCores>1: ...@@ -321,7 +324,7 @@ if nbCores>1:
nbExperiments = len(argumentDictionaries["Monoview"]) nbExperiments = len(argumentDictionaries["Monoview"])
for stepIndex in range(int(math.ceil(float(nbExperiments)/nbCores))): for stepIndex in range(int(math.ceil(float(nbExperiments)/nbCores))):
resultsMonoview+=(Parallel(n_jobs=nbCores)( resultsMonoview+=(Parallel(n_jobs=nbCores)(
delayed(ExecMonoview_multicore)(args.name, args.CL_split, args.CL_nbFolds, coreIndex, args.type, args.pathF, gridSearch=gridSearch, delayed(ExecMonoview_multicore)(args.name, args.CL_split, args.CL_nbFolds, coreIndex, args.type, args.pathF, statsIter, gridSearch=gridSearch,
metrics=metrics, nIter=args.CL_GS_iter, **argumentDictionaries["Monoview"][coreIndex + stepIndex * nbCores]) metrics=metrics, nIter=args.CL_GS_iter, **argumentDictionaries["Monoview"][coreIndex + stepIndex * nbCores])
for coreIndex in range(min(nbCores, nbExperiments - stepIndex * nbCores)))) for coreIndex in range(min(nbCores, nbExperiments - stepIndex * nbCores))))
accuracies = [[result[1][1] for result in resultsMonoview if result[0]==viewIndex] for viewIndex in range(NB_VIEW)] accuracies = [[result[1][1] for result in resultsMonoview if result[0]==viewIndex] for viewIndex in range(NB_VIEW)]
...@@ -334,7 +337,7 @@ if nbCores>1: ...@@ -334,7 +337,7 @@ if nbCores>1:
else: else:
resultsMonoview+=([ExecMonoview(DATASET.get("View"+str(arguments["viewIndex"])), resultsMonoview+=([ExecMonoview(DATASET.get("View"+str(arguments["viewIndex"])),
DATASET.get("Labels").value, args.name, DATASET.get("Labels").value, args.name,
args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF, args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF, statsIter,
gridSearch=gridSearch, metrics=metrics, nIter=args.CL_GS_iter, gridSearch=gridSearch, metrics=metrics, nIter=args.CL_GS_iter,
**arguments) **arguments)
for arguments in argumentDictionaries["Monoview"]]) for arguments in argumentDictionaries["Monoview"]])
...@@ -415,12 +418,12 @@ if nbCores>1: ...@@ -415,12 +418,12 @@ if nbCores>1:
for stepIndex in range(int(math.ceil(float(nbExperiments)/nbCores))): for stepIndex in range(int(math.ceil(float(nbExperiments)/nbCores))):
resultsMultiview += Parallel(n_jobs=nbCores)( resultsMultiview += Parallel(n_jobs=nbCores)(
delayed(ExecMultiview_multicore)(coreIndex, args.name, args.CL_split, args.CL_nbFolds, args.type, args.pathF, delayed(ExecMultiview_multicore)(coreIndex, args.name, args.CL_split, args.CL_nbFolds, args.type, args.pathF,
LABELS_DICTIONARY, gridSearch=gridSearch, LABELS_DICTIONARY, statsIter, gridSearch=gridSearch,
metrics=metrics, nIter=args.CL_GS_iter, **argumentDictionaries["Multiview"][stepIndex*nbCores+coreIndex]) metrics=metrics, nIter=args.CL_GS_iter, **argumentDictionaries["Multiview"][stepIndex*nbCores+coreIndex])
for coreIndex in range(min(nbCores, nbExperiments - stepIndex * nbCores))) for coreIndex in range(min(nbCores, nbExperiments - stepIndex * nbCores)))
else: else:
resultsMultiview = [ExecMultiview(DATASET, args.name, args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF, resultsMultiview = [ExecMultiview(DATASET, args.name, args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF,
LABELS_DICTIONARY, gridSearch=gridSearch, LABELS_DICTIONARY, statsIter, gridSearch=gridSearch,
metrics=metrics, nIter=args.CL_GS_iter, **arguments) for arguments in argumentDictionaries["Multiview"]] metrics=metrics, nIter=args.CL_GS_iter, **arguments) for arguments in argumentDictionaries["Multiview"]]
multiviewTime = time.time()-monoviewTime-dataBaseTime-start multiviewTime = time.time()-monoviewTime-dataBaseTime-start
if nbCores>1: if nbCores>1:
......
...@@ -30,7 +30,7 @@ __status__ = "Prototype" # Production, Development, Prototype ...@@ -30,7 +30,7 @@ __status__ = "Prototype" # Production, Development, Prototype
__date__ = 2016-03-25 __date__ = 2016-03-25
def ExecMonoview_multicore(name, learningRate, nbFolds, datasetFileIndex, databaseType, path, gridSearch=True, def ExecMonoview_multicore(name, learningRate, nbFolds, datasetFileIndex, databaseType, path, statsIter, gridSearch=True,
metrics=[["accuracy_score", None]], nIter=30, **args): metrics=[["accuracy_score", None]], nIter=30, **args):
DATASET = h5py.File(path+name+str(datasetFileIndex)+".hdf5", "r") DATASET = h5py.File(path+name+str(datasetFileIndex)+".hdf5", "r")
kwargs = args["args"] kwargs = args["args"]
...@@ -38,11 +38,11 @@ def ExecMonoview_multicore(name, learningRate, nbFolds, datasetFileIndex, databa ...@@ -38,11 +38,11 @@ def ExecMonoview_multicore(name, learningRate, nbFolds, datasetFileIndex, databa
neededViewIndex = views.index(kwargs["feat"]) neededViewIndex = views.index(kwargs["feat"])
X = DATASET.get("View"+str(neededViewIndex)) X = DATASET.get("View"+str(neededViewIndex))
Y = DATASET.get("Labels").value Y = DATASET.get("Labels").value
return ExecMonoview(X, Y, name, learningRate, nbFolds, 1, databaseType, path, gridSearch=gridSearch, return ExecMonoview(X, Y, name, learningRate, nbFolds, 1, databaseType, path, statsIter, gridSearch=gridSearch,
metrics=metrics, nIter=nIter, **args) metrics=metrics, nIter=nIter, **args)
def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path, gridSearch=True, def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path, statsIter, gridSearch=True,
metrics=[["accuracy_score", None]], nIter=30, **args): metrics=[["accuracy_score", None]], nIter=30, **args):
logging.debug("Start:\t Loading data") logging.debug("Start:\t Loading data")
try: try:
...@@ -64,7 +64,11 @@ def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path, ...@@ -64,7 +64,11 @@ def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path,
# Determine the Database to extract features # Determine the Database to extract features
logging.debug("Info:\t Classification - Database:" + str(name) + " Feature:" + str(feat) + " train_size:" + str(learningRate) + ", CrossValidation k-folds:" + str(nbFolds) + ", cores:" + str(nbCores)+", algorithm : "+CL_type) logging.debug("Info:\t Classification - Database:" + str(name) + " Feature:" + str(feat) + " train_size:" + str(learningRate) + ", CrossValidation k-folds:" + str(nbFolds) + ", cores:" + str(nbCores)+", algorithm : "+CL_type)
y_trains = []
y_tests = []
y_train_preds = []
y_test_preds = []
for poulet in range(statsIter):
# Calculate Train/Test data # Calculate Train/Test data
logging.debug("Start:\t Determine Train/Test split") logging.debug("Start:\t Determine Train/Test split")
testIndices = ClassifMonoView.splitDataset(Y, nbClass, learningRate, datasetLength) testIndices = ClassifMonoView.splitDataset(Y, nbClass, learningRate, datasetLength)
...@@ -93,28 +97,31 @@ def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path, ...@@ -93,28 +97,31 @@ def ExecMonoview(X, Y, name, learningRate, nbFolds, nbCores, databaseType, path,
logging.debug("Done:\t RandomSearch best settings") logging.debug("Done:\t RandomSearch best settings")
logging.debug("Start:\t Training") logging.debug("Start:\t Training")
cl_res = classifierModule.fit(X_train, y_train, NB_CORES=nbCores, **clKWARGS) cl_res = classifierModule.fit(X_train, y_train, NB_CORES=nbCores, **clKWARGS)
t_end = time.time() - t_start
logging.debug("Info:\t Time for Training: " + str(t_end) + "[s]")
logging.debug("Done:\t Training") logging.debug("Done:\t Training")
logging.debug("Start:\t Predicting") logging.debug("Start:\t Predicting")
# Stats Result # Stats Result
y_train_pred = cl_res.predict(X_train) y_train_pred = cl_res.predict(X_train)
y_test_pred = cl_res.predict(X_test) y_test_pred = cl_res.predict(X_test)
y_trains.append(y_train)
y_train_preds.append(y_train_pred)
y_tests.append(y_test)
y_test_preds.append(y_test_pred)
t_end = time.time() - t_start
logging.debug("Done:\t Predicting")
logging.debug("Info:\t Time for training and predicting: " + str(t_end) + "[s]")
classLabelsDesc = pd.read_csv(path + fileCLD, sep=";", names=['label', 'name']) classLabelsDesc = pd.read_csv(path + fileCLD, sep=";", names=['label', 'name'])
classLabelsNames = classLabelsDesc.name classLabelsNames = classLabelsDesc.name
logging.debug("Done:\t Predicting")
#logging.debug("" + str(classLabelsNames))
classLabelsNamesList = classLabelsNames.values.tolist() classLabelsNamesList = classLabelsNames.values.tolist()
#logging.debug(""+ str(classLabelsNamesList))
logging.debug("Start:\t Getting Results") logging.debug("Start:\t Getting Results")
#Accuracy classification score #Accuracy classification score
stringAnalysis, imagesAnalysis, metricsScores = execute(name, learningRate, nbFolds, nbCores, gridSearch, metrics, nIter, feat, CL_type, stringAnalysis, imagesAnalysis, metricsScores = execute(name, learningRate, nbFolds, nbCores, gridSearch, metrics, nIter, feat, CL_type,
clKWARGS, classLabelsNames, X.shape, clKWARGS, classLabelsNames, X.shape,
y_train, y_train_pred, y_test, y_test_pred, t_end) y_trains, y_train_preds, y_tests, y_test_preds, t_end, statsIter)
cl_desc = [value for key, value in sorted(clKWARGS.iteritems())] cl_desc = [value for key, value in sorted(clKWARGS.iteritems())]
logging.debug("Done:\t Getting Results") logging.debug("Done:\t Getting Results")
logging.info(stringAnalysis) logging.info(stringAnalysis)
......
from datetime import timedelta as hms from datetime import timedelta as hms
import numpy as np
import MonoviewClassifiers import MonoviewClassifiers
import Metrics import Metrics
...@@ -23,37 +24,39 @@ def getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS): ...@@ -23,37 +24,39 @@ def getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS):
classifierConfigString += "\n\n" classifierConfigString += "\n\n"
return classifierConfigString return classifierConfigString
def getMetricScore(metric, y_train, y_train_pred, y_test, y_test_pred): def getMetricScore(metric, y_trains, y_train_preds, y_tests, y_test_preds):
metricModule = getattr(Metrics, metric[0]) metricModule = getattr(Metrics, metric[0])
if metric[1]!=None: if metric[1]!=None:
metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1])) metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1]))
else: else:
metricKWARGS = {} metricKWARGS = {}
metricScoreTrain = np.mean(np.array([metricModule.score(y_train, y_train_pred) for y_train, y_train_pred in zip(y_trains, y_train_preds)]))
metricScoreTest = np.mean(np.array([metricModule.score(y_test, y_test_pred) for y_test, y_test_pred in zip(y_tests, y_test_preds)]))
metricScoreString = "\tFor "+metricModule.getConfig(**metricKWARGS)+" : " metricScoreString = "\tFor "+metricModule.getConfig(**metricKWARGS)+" : "
metricScoreString += "\n\t\t- Score on train : "+str(metricModule.score(y_train, y_train_pred)) metricScoreString += "\n\t\t- Score on train : "+str(metricScoreTrain)
metricScoreString += "\n\t\t- Score on test : "+str(metricModule.score(y_test, y_test_pred)) metricScoreString += "\n\t\t- Score on test : "+str(metricScoreTest)
metricScoreString += "\n" metricScoreString += "\n"
return metricScoreString return metricScoreString
def execute(name, learningRate, nbFolds, nbCores, gridSearch, metrics, nIter, feat, CL_type, clKWARGS, classLabelsNames, def execute(name, learningRate, nbFolds, nbCores, gridSearch, metrics, nIter, feat, CL_type, clKWARGS, classLabelsNames,
shape, y_train, y_train_pred, y_test, y_test_pred, time): shape, y_trains, y_train_preds, y_tests, y_test_preds, time, statsIter):
metricsScores = {} metricsScores = {}
metricModule = getattr(Metrics, metrics[0][0]) metricModule = getattr(Metrics, metrics[0][0])
train = metricModule.score(y_train, y_train_pred) train = np.mean(np.array([metricModule.score(y_train, y_train_pred) for y_train, y_train_pred in zip(y_trains, y_train_preds)]))
val = metricModule.score(y_test, y_test_pred) val = np.mean(np.array([metricModule.score(y_test, y_test_pred) for y_test, y_test_pred in zip(y_tests, y_test_preds)]))
stringAnalysis = "Classification on "+name+" database for "+feat+" with "+CL_type+"\n\n" stringAnalysis = "Classification on "+name+" database for "+feat+" with "+CL_type+"\n\n"
stringAnalysis += metrics[0][0]+" on train : "+str(train)+"\n"+metrics[0][0]+" on test : "+str(val)+"\n\n" stringAnalysis += metrics[0][0]+" on train : "+str(train)+"\n"+metrics[0][0]+" on test : "+str(val)+"\n\n"
stringAnalysis += getDBConfigString(name, feat, learningRate, shape, classLabelsNames, nbFolds) stringAnalysis += getDBConfigString(name, feat, learningRate, shape, classLabelsNames, nbFolds)
stringAnalysis += getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS) stringAnalysis += getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS)
for metric in metrics: for metric in metrics:
stringAnalysis+=getMetricScore(metric, y_train, y_train_pred, y_test, y_test_pred) stringAnalysis+=getMetricScore(metric, y_trains, y_train_preds, y_tests, y_test_preds)
if metric[1]!=None: if metric[1]!=None:
metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1])) metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1]))
else: else:
metricKWARGS = {} metricKWARGS = {}
metricsScores[metric[0]] = [getattr(Metrics, metric[0]).score(y_train, y_train_pred, **metricKWARGS), "", metricsScores[metric[0]] = [np.mean(np.array([getattr(Metrics, metric[0]).score(y_test, y_test_pred) for y_test, y_test_pred in zip(y_tests, y_test_preds)])), "",
getattr(Metrics, metric[0]).score(y_test, y_test_pred, **metricKWARGS)] np.mean(np.array([getattr(Metrics, metric[0]).score(y_test, y_test_pred) for y_test, y_test_pred in zip(y_tests, y_test_preds)]))]
stringAnalysis += "\n\n Classification took "+ str(hms(seconds=int(time))) stringAnalysis += "\n\n Classification took "+ str(hms(seconds=int(time)))
imageAnalysis = {} imageAnalysis = {}
......
...@@ -22,14 +22,14 @@ __status__ = "Prototype" # Production, Development, P ...@@ -22,14 +22,14 @@ __status__ = "Prototype" # Production, Development, P
def ExecMultiview_multicore(coreIndex, name, learningRate, nbFolds, databaseType, path, LABELS_DICTIONARY , def ExecMultiview_multicore(coreIndex, name, learningRate, nbFolds, databaseType, path, LABELS_DICTIONARY , statsIter,
gridSearch=False, nbCores=1, metrics=None, nIter=30, **arguments): gridSearch=False, nbCores=1, metrics=None, nIter=30, **arguments):
DATASET = h5py.File(path+name+str(coreIndex)+".hdf5", "r") DATASET = h5py.File(path+name+str(coreIndex)+".hdf5", "r")
return ExecMultiview(DATASET, name, learningRate, nbFolds, 1, databaseType, path, LABELS_DICTIONARY, return ExecMultiview(DATASET, name, learningRate, nbFolds, 1, databaseType, path, LABELS_DICTIONARY, statsIter,
gridSearch=gridSearch, metrics=metrics, nIter=nIter, **arguments) gridSearch=gridSearch, metrics=metrics, nIter=nIter, **arguments)
def ExecMultiview(DATASET, name, learningRate, nbFolds, nbCores, databaseType, path, LABELS_DICTIONARY, def ExecMultiview(DATASET, name, learningRate, nbFolds, nbCores, databaseType, path, LABELS_DICTIONARY, statsIter,
gridSearch=False, metrics=None, nIter=30, **kwargs): gridSearch=False, metrics=None, nIter=30, **kwargs):
datasetLength = DATASET.get("Metadata").attrs["datasetLength"] datasetLength = DATASET.get("Metadata").attrs["datasetLength"]
......
...@@ -369,41 +369,45 @@ def easyFactorize(nbGenes, factorizationParam, t=0): ...@@ -369,41 +369,45 @@ def easyFactorize(nbGenes, factorizationParam, t=0):
factorLeft[matrixInf:, t_:t__+t_] = vectorLeftInf.reshape(factorLeft[matrixInf:, t_:t__+t_].shape) factorLeft[matrixInf:, t_:t__+t_] = vectorLeftInf.reshape(factorLeft[matrixInf:, t_:t__+t_].shape)
factorLeft[:, t__+t_] = vectorLeft factorLeft[:, t__+t_] = vectorLeft
factorSup = np.zeros((t_+t__+1, nbGenes), dtype=bool) # factorSup = np.zeros((t_+t__+1, nbGenes), dtype=bool)
#
factorSup[:t_, :matrixSup] = vectorSupLeft.reshape(factorSup[:t_, :matrixSup].shape) # factorSup[:t_, :matrixSup] = vectorSupLeft.reshape(factorSup[:t_, :matrixSup].shape)
if nbGenes%2==1: # if nbGenes%2==1:
factorSup[t_:t__+t_, matrixInf-1:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf-1:].shape) # factorSup[t_:t__+t_, matrixInf-1:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf-1:].shape)
else: # else:
factorSup[t_:t__+t_, matrixInf:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf:].shape) # factorSup[t_:t__+t_, matrixInf:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf:].shape)
factorSup[t__+t_, :] = vectorSup # factorSup[t__+t_, :] = vectorSup
return t__+t_+1, factorLeft, factorSup return t__+t_+1, factorLeft#, factorSup
def getBaseMatrices(nbGenes, factorizationParam): def getBaseMatrices(nbGenes, factorizationParam, path):
t, factorLeft, factorSup = easyFactorize(nbGenes, factorizationParam) t, factorLeft, factorSup = easyFactorize(nbGenes, factorizationParam)
np.savetxt("factorSup--n-"+str(nbGenes)+"--k-"+str(factorizationParam)+".csv", factorSup, delimiter=",") np.savetxt(path+"factorLeft--n-"+str(nbGenes)+"--k-"+str(factorizationParam)+".csv", factorLeft, delimiter=",")
np.savetxt("factorLeft--n-"+str(nbGenes)+"--k-"+str(factorizationParam)+".csv", factorLeft, delimiter=",") return factorLeft
return factorSup, factorLeft
def findParams(arrayLen, nbPatients, maxNbBins=5000, maxLenBin=300, minOverlapping=30, minNbBinsOverlapped=20, maxNbSolutions=30): def findParams(arrayLen, nbPatients, maxNbBins=2000, minNbBins = 10, maxLenBin=70000, minOverlapping=1, minNbBinsOverlapped=0, maxNbSolutions=30):
results = [] results = []
if arrayLen*arrayLen*10/100>minNbBinsOverlapped*nbPatients: if arrayLen*arrayLen*10/100>minNbBinsOverlapped*nbPatients:
for lenBin in range(arrayLen-1): for lenBin in range(arrayLen-1):
if lenBin+1<maxLenBin: lenBin = lenBin+1
for overlapping in sorted(range(lenBin+1-1), reverse=True): if lenBin<maxLenBin and minNbBins*lenBin<arrayLen:
if overlapping+1>minOverlapping and math.ceil(float(lenBin)/(lenBin-overlapping))>=minNbBinsOverlapped: print lenBin
print results
for overlapping in sorted(range(lenBin-1), reverse=True):
overlapping = overlapping+1
if overlapping>minOverlapping and lenBin%(lenBin-overlapping)==0:
for nbBins in sorted(range(arrayLen-1), reverse=True): for nbBins in sorted(range(arrayLen-1), reverse=True):
if nbBins+1<maxNbBins: nbBins = nbBins+1
if arrayLen == (nbBins+1-1)*(lenBin+1-overlapping+1)+lenBin+1: if nbBins<maxNbBins:
if arrayLen == (nbBins-1)*(lenBin-overlapping)+lenBin:
results.append({"nbBins":nbBins, "overlapping":overlapping, "lenBin":lenBin}) results.append({"nbBins":nbBins, "overlapping":overlapping, "lenBin":lenBin})
if len(results)==maxNbSolutions: if len(results)==maxNbSolutions:
params = results[random.randrange(len(results))] params = results[random.randrange(len(results))]
return params return params
def findBins(nbBins, overlapping, lenBin): def findBins(nbBins=142, overlapping=493, lenBin=986):
bins = [] bins = []
for binIndex in range(nbBins+1): for binIndex in range(nbBins+1):
bins.append([i+binIndex*(lenBin+1-overlapping+1) for i in range(lenBin+1)]) bins.append([i+binIndex*(lenBin+1-overlapping+1) for i in range(lenBin+1)])
...@@ -421,6 +425,14 @@ def getBins(array, bins, lenBin, overlapping): ...@@ -421,6 +425,14 @@ def getBins(array, bins, lenBin, overlapping):
return np.array(binnedcoord) return np.array(binnedcoord)
def makeSortedBinsMatrix(nbBins, lenBins, overlapping, arrayLen, path):
    """Build, save and return the rank-to-bin membership matrix.

    The matrix has one row per rank (``arrayLen`` rows) and one column per
    bin (``nbBins`` columns). Bin ``b`` covers the ``lenBins`` consecutive
    ranks starting at ``b * (lenBins - overlapping)``; those entries are set
    to 1 in column ``b`` and every other entry stays 0.

    Parameters:
        nbBins      -- number of bins (columns of the matrix)
        lenBins     -- number of consecutive ranks covered by one bin
        overlapping -- number of ranks shared by two consecutive bins
        arrayLen    -- total number of ranks (rows of the matrix)
        path        -- prefix prepended to the CSV file name (string
                       concatenation, so it must end with a separator)

    Returns:
        The ``(arrayLen, nbBins)`` ``np.uint8`` matrix, which is also written
        to ``path + "sortedBinsMatrix--t-<lenBins>--n-<nbBins>--c-<overlapping>.csv"``.

    Raises:
        ValueError -- if ``overlapping >= lenBins`` (bins would never advance).
    """
    step = lenBins - overlapping
    if step <= 0:
        raise ValueError("overlapping must be strictly smaller than lenBins")
    sortedBinsMatrix = np.zeros((arrayLen, nbBins), dtype=np.uint8)
    # Iterate over bin indices (the original iterated over the int itself,
    # which raises TypeError) and fill only the column of the current bin.
    for binIndex in range(nbBins):
        start = step * binIndex
        # Slicing clamps at arrayLen, so a bin that runs past the end of the
        # array is simply truncated instead of raising.
        sortedBinsMatrix[start:start + lenBins, binIndex] = 1
    np.savetxt(path+"sortedBinsMatrix--t-"+str(lenBins)+"--n-"+str(nbBins)+"--c-"+str(overlapping)+".csv", sortedBinsMatrix, delimiter=",")
    return sortedBinsMatrix
def makeSparseTotalMatrix(sortedRNASeq): def makeSparseTotalMatrix(sortedRNASeq):
nbPatients, nbGenes = sortedRNASeq.shape nbPatients, nbGenes = sortedRNASeq.shape
params = findParams(nbGenes, nbPatients) params = findParams(nbGenes, nbPatients)
...@@ -533,10 +545,10 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): ...@@ -533,10 +545,10 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES):
try: try:
factorizedLeftBaseMatrix = np.genfromtxt(path+"factorLeft--n-"+str(datasetFile.get("View2").shape[1])+"--k-"+str(100)+".csv", delimiter=',') factorizedLeftBaseMatrix = np.genfromtxt(path+"factorLeft--n-"+str(datasetFile.get("View2").shape[1])+"--k-"+str(100)+".csv", delimiter=',')
except: except:
factorizedSupBaseMatrix, factorizedLeftBaseMatrix = getBaseMatrices(rnaseqData.shape[1], k) factorizedLeftBaseMatrix = getBaseMatrices(rnaseqData.shape[1], k, path)
brnaseqDset = datasetFile.create_dataset("View5", (sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1]*k*2), dtype=bool) brnaseqDset = datasetFile.create_dataset("View5", (sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1]*k), dtype=np.uint8)
for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices): for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices):
patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], k * 2), dtype=bool) patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], k), dtype=np.uint8)
for lineIndex, geneIndex in enumerate(patientSortedArray): for lineIndex, geneIndex in enumerate(patientSortedArray):
patientMatrix[geneIndex]= factorizedLeftBaseMatrix[lineIndex,:] patientMatrix[geneIndex]= factorizedLeftBaseMatrix[lineIndex,:]
brnaseqDset[patientIndex] = patientMatrix.flatten() brnaseqDset[patientIndex] = patientMatrix.flatten()
...@@ -544,16 +556,23 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): ...@@ -544,16 +556,23 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES):
brnaseqDset.attrs["sparse"] = False brnaseqDset.attrs["sparse"] = False
logging.debug("Done:\t Getting Binarized RNASeq Data") logging.debug("Done:\t Getting Binarized RNASeq Data")
# logging.debug("Start:\t Getting Binned RNASeq Data") logging.debug("Start:\t Getting Binned RNASeq Data")
# sparseBinnedRNASeq = makeSparseTotalMatrix(sortedRNASeqGeneIndices) lenBins = 986
# sparseBinnedRNASeqGrp = datasetFile.create_group("View6") nbBins = 142
# dataDset = sparseBinnedRNASeqGrp.create_dataset("data", sparseBinnedRNASeq.data.shape, data=sparseBinnedRNASeq.data) overlapping = 493
# indicesDset = sparseBinnedRNASeqGrp.create_dataset("indices", sparseBinnedRNASeq.indices.shape, data=sparseBinnedRNASeq.indices) try:
# indptrDset = sparseBinnedRNASeqGrp.create_dataset("indptr", sparseBinnedRNASeq.indptr.shape, data=sparseBinnedRNASeq.indptr) sortedBinsMatrix = np.genfromtxt(path+"sortedBinsMatrix--t-"+str(lenBins)+"--n-"+str(nbBins)+"--c-"+str(overlapping)+".csv", delimiter=",")
# sparseBinnedRNASeqGrp.attrs["name"]="BRNASeq" except:
# sparseBinnedRNASeqGrp.attrs["sparse"]=True sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View2").shape[1], path)
# sparseBinnedRNASeqGrp.attrs["shape"]=sparseBinnedRNASeq.shape binnedRNASeq = datasetFile.create_dataset("View6", (sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1]*lenBins), dtype=np.uint8)
# logging.debug("Done:\t Getting Binned RNASeq Data") for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices):
patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], nbBins), dtype=np.uint8)
for lineIndex, geneIndex in enumerate(patientSortedArray):
patientMatrix[geneIndex]= sortedBinsMatrix[lineIndex,:]
brnaseqDset[patientIndex] = patientMatrix.flatten()
brnaseqDset.attrs["name"] = "bRNASeq"
brnaseqDset.attrs["sparse"] = False
logging.debug("Done:\t Getting Binned RNASeq Data")
# logging.debug("Start:\t Getting Adjacence RNASeq Data") # logging.debug("Start:\t Getting Adjacence RNASeq Data")
# sparseAdjRNASeq = getAdjacenceMatrix(RNASeqRanking, sortedRNASeqGeneIndices, k=findClosestPowerOfTwo(10)-1) # sparseAdjRNASeq = getAdjacenceMatrix(RNASeqRanking, sortedRNASeqGeneIndices, k=findClosestPowerOfTwo(10)-1)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment