diff --git a/Code/MonoMutliViewClassifiers/ExecClassif.py b/Code/MonoMutliViewClassifiers/ExecClassif.py index 0524f53f99b3aa5014bbbb74d8cd5b65b0fb2a3c..5e3ff07f7369adfa88ce9007d1fce5a8621cbebe 100644 --- a/Code/MonoMutliViewClassifiers/ExecClassif.py +++ b/Code/MonoMutliViewClassifiers/ExecClassif.py @@ -3,7 +3,6 @@ import argparse import pkgutil # for TimeStamp in CSVFile import os import time -import itertools import sys import select import logging @@ -35,8 +34,7 @@ matplotlib.use('Agg') # Anti-Grain Geometry C++ library to make a raster (pixel def initLogFile(args): resultDirectory = "../../Results/" + args.name + "/started_" + time.strftime("%Y_%m_%d-%H_%M") + "/" logFileName = time.strftime("%Y%m%d-%H%M%S") + "-" + ''.join(args.CL_type) + "-" + "_".join( - args.views) + "-" + args.name + \ - "-LOG" + args.views) + "-" + args.name + "-LOG" if not os.path.exists(os.path.dirname(resultDirectory + logFileName)): try: os.makedirs(os.path.dirname(resultDirectory + logFileName)) @@ -48,7 +46,7 @@ def initLogFile(args): for i in range(1, 20): testFileName = logFileName + "-" + str(i) + ".log" if not (os.path.isfile(resultDirectory + testFileName)): - logfile = resultDirectory + testFileName + logFile = resultDirectory + testFileName break else: logFile += ".log" @@ -60,7 +58,7 @@ def initLogFile(args): return resultDirectory -def input(timeout=15): +def input_(timeout=15): print "You have " + str(timeout) + " seconds to stop the script by typing n" i, o, e = select.select([sys.stdin], [], [], timeout) @@ -72,7 +70,7 @@ def input(timeout=15): def confirm(resp=True, timeout=15): - ans = input(timeout) + ans = input_(timeout) if not ans: return resp if ans not in ['y', 'Y', 'n', 'N']: @@ -136,27 +134,11 @@ def initBenchmark(args): pkgutil.iter_modules(['MonoviewClassifiers']) if (not isPackage)] benchmark["Monoview"] = allMonoviewAlgos - benchmark["Multiview"]=dict((multiviewPackageName, "_") for multiviewPackageName in allMultiviewPackages) + benchmark["Multiview"] = 
dict((multiviewPackageName, "_") for multiviewPackageName in allMultiviewPackages) for multiviewPackageName in allMultiviewPackages: multiviewPackage = getattr(Multiview, multiviewPackageName) multiviewModule = getattr(multiviewPackage, multiviewPackageName) benchmark = multiviewModule.getBenchmark(benchmark, args=args) - # fusionModulesNames = [name for _, name, isPackage - # in pkgutil.iter_modules(['Multiview/Fusion/Methods']) if not isPackage] - # fusionModules = [getattr(Multiview.Fusion.Methods, fusionModulesName) - # for fusionModulesName in fusionModulesNames] - # fusionClasses = [getattr(fusionModule, fusionModulesName + "Classifier") - # for fusionModulesName, fusionModule in zip(fusionModulesNames, fusionModules)] - # fusionMethods = dict((fusionModulesName, [name for _, name, isPackage in - # pkgutil.iter_modules( - # ["Multiview/Fusion/Methods/" + fusionModulesName + "Package"]) - # if not isPackage]) - # for fusionModulesName, fusionClasse in zip(fusionModulesNames, fusionClasses)) - # fusionMonoviewClassifiers = allMonoviewAlgos - # allFusionAlgos = {"Methods": fusionMethods, "Classifiers": fusionMonoviewClassifiers} - # # allMumboAlgos = - # allMultiviewAlgos = {"Fusion": allFusionAlgos, "Mumbo": allMumboAlgos} - # benchmark = {"Monoview": allMonoviewAlgos, "Multiview": allMultiviewAlgos} if "Multiview" in args.CL_type: benchmark["Multiview"] = {} @@ -169,37 +151,6 @@ def initBenchmark(args): multiviewPackage = getattr(Multiview, multiviewPackageName) multiviewModule = getattr(multiviewPackage, multiviewPackageName) benchmark = multiviewModule.getBenchmark(benchmark, args=args) - # if "Mumbo" in algosMutliview: - # benchmark["Multiview"]["Mumbo"] = args.MU_types - # if "Fusion" in algosMutliview: - # benchmark["Multiview"]["Fusion"] = {} - # benchmark["Multiview"]["Fusion"]["Methods"] = dict( - # (fusionType, []) for fusionType in args.FU_types) - # if "LateFusion" in args.FU_types: - # if args.FU_late_methods== [""]: - # 
benchmark["Multiview"]["Fusion"]["Methods"]["LateFusion"] = [name for _, name, isPackage in - # pkgutil.iter_modules([ - # "Multiview/Fusion/Methods/LateFusionPackage"]) - # if not isPackage] - # else: - # benchmark["Multiview"]["Fusion"]["Methods"]["LateFusion"] = args.FU_late_methods - # if "EarlyFusion" in args.FU_types: - # if args.FU_early_methods == [""]: - # benchmark["Multiview"]["Fusion"]["Methods"]["EarlyFusion"] = [name for _, name, isPackage in - # pkgutil.iter_modules([ - # "Multiview/Fusion/Methods/EarlyFusionPackage"]) - # if not isPackage] - # else: - # benchmark["Multiview"]["Fusion"]["Methods"]["EarlyFusion"] = args.FU_early_methods - # if args.CL_algos_monoview == ['']: - # benchmark["Multiview"]["Fusion"]["Classifiers"] = [name for _, name, isPackage in - # pkgutil.iter_modules(['MonoviewClassifiers']) - # if (not isPackage) and (name != "SGD") and ( - # name[:3] != "SVM") - # and (name != "SCM")] - # else: - # benchmark["Multiview"]["Fusion"]["Classifiers"] = args.CL_algos_monoview - if "Monoview" in args.CL_type: if args.CL_algos_monoview == ['']: benchmark["Monoview"] = [name for _, name, isPackage in pkgutil.iter_modules(["MonoviewClassifiers"]) @@ -235,30 +186,16 @@ def initMonoviewKWARGS(args, classifiersNames): monoviewKWARGS = {} for classifiersName in classifiersNames: classifierModule = getattr(MonoviewClassifiers, classifiersName) - monoviewKWARGS[classifiersName+"KWARGSInit"] = classifierModule.getKWARGS([(key, value) for key, value in vars(args).iteritems() if key.startswith("CL_"+classifiersName)]) + monoviewKWARGS[classifiersName + "KWARGSInit"] = classifierModule.getKWARGS( + [(key, value) for key, value in vars(args).iteritems() if key.startswith("CL_" + classifiersName)]) return monoviewKWARGS def initKWARGS(args, benchmark): if "Monoview" in benchmark: monoviewKWARGS = initMonoviewKWARGS(args, benchmark["Monoview"]) - - - - # kwargsInit = { - # "RandomForestKWARGSInit": {"0": map(int, args.CL_RF_trees.split())[0], - # "1": 
map(int, args.CL_RF_max_depth.split(":"))[0]}, - # "SVMLinearKWARGSInit": {"0": map(int, args.CL_SVML_C.split(":"))[0]}, - # "SVMRBFKWARGSInit": {"0": map(int, args.CL_SVMR_C.split(":"))[0]}, - # "SVMPolyKWARGSInit": {"0": map(int, args.CL_SVMP_C.split(":"))[0], - # '1': map(int, args.CL_SVMP_deg.split(":"))[0]}, - # "DecisionTreeKWARGSInit": {"0": map(int, args.CL_DT_depth.split(":"))[0]}, - # "SGDKWARGSInit": {"2": map(float, args.CL_SGD_alpha.split(":"))[0], "1": args.CL_SGD_penalty.split(":")[0], - # "0": args.CL_SGD_loss.split(":")[0]}, - # "KNNKWARGSInit": {"0": map(float, args.CL_KNN_neigh.split(":"))[0]}, - # "AdaboostKWARGSInit": {"0": args.CL_Ada_n_est.split(":")[0], "1": args.CL_Ada_b_est.split(":")[0]}, - # "SCMKWARGSInit": {"0": args.CL_SCM_max_rules.split(":")[0]}, - # } + else: + monoviewKWARGS = {} return monoviewKWARGS @@ -280,109 +217,12 @@ def lateFusionSetArgs(views, viewsIndices, classes, method, def initMultiviewArguments(args, benchmark, views, viewsIndices, scores, classifiersConfigs, classifiersNames, NB_VIEW, metrics, argumentDictionaries): - # metricModule = getattr(Metrics, metrics[0]) multiviewArguments = [] if "Multiview" in benchmark: for multiviewAlgoName in benchmark["Multiview"]: multiviewPackage = getattr(Multiview, multiviewAlgoName) mutliviewModule = getattr(multiviewPackage, multiviewAlgoName) - multiviewArguments+= mutliviewModule.getArgs(args, benchmark, views, viewsIndices) - # if benchmark["Multiview"]: - # for multiviewAlgoName in benchmark["Multiview"]: - # multiviewPackage = getattr(Multiview, multiviewAlgoName) - # multiviewArguments[] - # if "Fusion" in benchmark["Multiview"]: - # for method in benchmark["Multiview"]["Fusion"]["Methods"]["LateFusion"]: - # import pdb; pdb.set_trace() - # if args.FU_cl_names != ['']: - # monoClassifiers = args.FU_cl_names - # monoClassifiersConfigs = [globals()[classifier + "KWARGS"] for classifier in monoClassifiers] - # if args.FU_method_config != [""]: - # fusionMethodConfigs = 
[map(float, config.split(":")) for config in args.FU_method_config] - # elif not hyperParamSearch: - # raise ValueError("No config for fusion method given and no gridearch wanted") - # else: - # try: - # fusionMethodConfigs = [["config"] for method in - # benchmark["Multiview"]["Fusion"]["Methods"]["LateFusion"]] - # except: - # pass - # try: - # for methodIndex, method in enumerate(benchmark["Multiview"]["Fusion"]["Methods"]["LateFusion"]): - # if args.FU_fixed: - # arguments = lateFusionSetArgs(views, viewsIndices, args.CL_classes, method, - # args.FU_cl_names, monoClassifiersConfigs, - # fusionMethodConfigs[methodIndex]) - # argumentDictionaries["Multiview"].append(arguments) - # else: - # for combination in itertools.combinations_with_replacement(range(len(monoClassifiers)), - # NB_VIEW): - # monoClassifiersNamesComb = [monoClassifiers[index] for index in combination] - # monoClassifiersConfigsComb = [monoClassifiersConfigs[index] for index in - # combination] - # arguments = lateFusionSetArgs(views, viewsIndices, args.CL_classes, method, - # monoClassifiersNamesComb, monoClassifiersConfigsComb, - # fusionMethodConfigs[methodIndex]) - # argumentDictionaries["Multiview"].append(arguments) - # except: - # pass - # else: - # if "LateFusion" in benchmark["Multiview"]["Fusion"]["Methods"] and \ - # "Classifiers" in benchmark["Multiview"]["Fusion"]: - # bestClassifiers = [] - # bestClassifiersConfigs = [] - # if argumentDictionaries["Monoview"] != {}: - # for viewIndex, view in enumerate(views): - # if metricModule.getConfig()[-14] == "h": - # bestClassifiers.append( - # classifiersNames[viewIndex][np.argmax(np.array(scores[viewIndex]))]) - # bestClassifiersConfigs.append( - # classifiersConfigs[viewIndex][np.argmax(np.array(scores[viewIndex]))]) - # else: - # bestClassifiers.append( - # classifiersNames[viewIndex][np.argmin(np.array(scores[viewIndex]))]) - # bestClassifiersConfigs.append( - # classifiersConfigs[viewIndex][np.argmin(np.array(scores[viewIndex]))]) - # 
else: - # raise AttributeError("No Monoview classifiers asked in args and no monoview benchmark done.") - # for method in benchmark["Multiview"]["Fusion"]["Methods"]["LateFusion"]: - # arguments = lateFusionSetArgs(views, viewsIndices, args.CL_classes, method, - # bestClassifiers, bestClassifiersConfigs, - # fusionMethodConfig) - # argumentDictionaries["Multiview"].append(arguments) - # if "EarlyFusion" in benchmark["Multiview"]["Fusion"]["Methods"] and \ - # "Classifiers" in benchmark["Multiview"]["Fusion"]: - # for method in benchmark["Multiview"]["Fusion"]["Methods"]["EarlyFusion"]: - # for classifier in benchmark["Multiview"]["Fusion"]["Classifiers"]: - # arguments = {"CL_type": "Fusion", - # "views": views, - # "NB_VIEW": len(views), - # "viewsIndices": viewsIndices, - # "NB_CLASS": len(args.CL_classes), - # "LABELS_NAMES": args.CL_classes, - # "FusionKWARGS": {"fusionType": "EarlyFusion", "fusionMethod": method, - # "classifiersNames": [classifier], - # "classifiersConfigs": [ - # initKWARGS[classifier + "KWARGSInit"]], - # 'fusionMethodConfig': fusionMethodConfig, - # "nbView": (len(viewsIndices))}} - # argumentDictionaries["Multiview"].append(arguments) - # if "Mumbo" in benchmark["Multiview"]: - # for combination in itertools.combinations_with_replacement(range(len(benchmark["Multiview"]["Mumbo"])), - # NB_VIEW): - # mumboClassifiersNames = [benchmark["Multiview"]["Mumbo"][index] for index in combination] - # arguments = {"CL_type": "Mumbo", - # "views": views, - # "NB_VIEW": len(views), - # "viewsIndices": viewsIndices, - # "NB_CLASS": len(args.CL_classes), - # "LABELS_NAMES": args.CL_classes, - # "MumboKWARGS": {"classifiersNames": mumboClassifiersNames, - # "maxIter": int(args.MU_iter[0]), "minIter": int(args.MU_iter[1]), - # "threshold": args.MU_iter[2], - # "classifiersConfigs": [argument.split(":") for argument in - # args.MU_config], "nbView": (len(viewsIndices))}} - # argumentDictionaries["Multiview"].append(arguments) + multiviewArguments += 
mutliviewModule.getArgs(args, benchmark, views, viewsIndices) argumentDictionaries["Multiview"] = multiviewArguments return argumentDictionaries @@ -391,10 +231,10 @@ def arangeMetrics(metrics, metricPrinc): if [metricPrinc] in metrics: metricIndex = metrics.index([metricPrinc]) firstMetric = metrics[0] - metrics[0]=[metricPrinc] - metrics[metricIndex]=firstMetric + metrics[0] = [metricPrinc] + metrics[metricIndex] = firstMetric else: - raise AttributeError(metricPrinc+" not in metric pool") + raise AttributeError(metricPrinc + " not in metric pool") return metrics @@ -417,6 +257,8 @@ groupStandard.add_argument('--pathF', metavar='STRING', action='store', help='Pa default='/home/bbauvin/Documents/Data/Data_multi_omics/') groupStandard.add_argument('--nice', metavar='INT', action='store', type=int, help='Niceness for the process', default=0) +groupStandard.add_argument('--randomState', metavar='INT', action='store', type=int, + help='Niceness for the process', default=None) groupClass = parser.add_argument_group('Classification arguments') groupClass.add_argument('--CL_split', metavar='FLOAT', action='store', @@ -428,8 +270,8 @@ groupClass.add_argument('--CL_nb_class', metavar='INT', action='store', help='Nu default=2) groupClass.add_argument('--CL_classes', metavar='STRING', action='store', nargs="+", help='Classes used in the dataset (names of the folders) if not filled, random classes will be ' - 'selected ex. walrus mole leopard', default=["yes","no"]) -groupClass.add_argument('--CL_type', metavar='STRING', action='store', nargs ="+", + 'selected ex. 
walrus mole leopard', default=["yes", "no"]) +groupClass.add_argument('--CL_type', metavar='STRING', action='store', nargs="+", help='Determine whether to use Multiview and/or Monoview, or Benchmark', default=['Benchmark']) groupClass.add_argument('--CL_algos_monoview', metavar='STRING', action='store', nargs="+", @@ -444,10 +286,12 @@ groupClass.add_argument('--CL_statsiter', metavar='INT', action='store', help='Number of iteration for each algorithm to mean results', type=int, default=2) groupClass.add_argument('--CL_metrics', metavar='STRING', action='store', nargs="+", - help='Determine which metrics to use, separate metric and configuration with ":". If multiple, separate with space. If no metric is specified, considering all with accuracy for classification ' - , default=['']) + help='Determine which metrics to use, separate metric and configuration with ":".' + ' If multiple, separate with space. If no metric is specified, ' + 'considering all with accuracy for classification ' + , default=['']) groupClass.add_argument('--CL_metric_princ', metavar='STRING', action='store', - help='Determine which metric to use for randomSearch and optimization' , default="f1_score") + help='Determine which metric to use for randomSearch and optimization', default="f1_score") groupClass.add_argument('--CL_GS_iter', metavar='INT', action='store', help='Determine how many Randomized grid search tests to do', type=int, default=2) groupClass.add_argument('--CL_HPS_type', metavar='STRING', action='store', @@ -456,7 +300,8 @@ groupClass.add_argument('--CL_HPS_type', metavar='STRING', action='store', groupRF = parser.add_argument_group('Random Forest arguments') groupRF.add_argument('--CL_RandomForest_trees', metavar='INT', type=int, action='store', help='Number max trees', default=25) -groupRF.add_argument('--CL_RandomForest_max_depth', metavar='INT', type=int, action='store', help='Max depth for the trees', +groupRF.add_argument('--CL_RandomForest_max_depth', metavar='INT', 
type=int, action='store', + help='Max depth for the trees', default=5) groupRF.add_argument('--CL_RandomForest_criterion', metavar='STRING', action='store', help='Criterion for the trees', default="entropy") @@ -489,7 +334,6 @@ groupDT.add_argument('--CL_DecisionTree_criterion', metavar='STRING', action='st groupDT.add_argument('--CL_DecisionTree_splitter', metavar='STRING', action='store', help='Determine criterion for Decision Trees', default="random") - groupSGD = parser.add_argument_group('SGD arguments') groupSGD.add_argument('--CL_SGD_alpha', metavar='FLOAT', type=float, action='store', help='Determine alpha for SGDClassifier', default=0.1) @@ -516,10 +360,10 @@ groupSCM.add_argument('--CL_SCM_p', metavar='FLOAT', type=float, action='store', groupSCM.add_argument('--CL_SCM_model_type', metavar='STRING', action='store', help='Max number of rules for SCM', default="conjunction") - groupMumbo = parser.add_argument_group('Mumbo arguments') groupMumbo.add_argument('--MU_types', metavar='STRING', action='store', nargs="+", - help='Determine which monoview classifier to use with Mumbo', default=['DecisionTree', 'DecisionTree', 'DecisionTree']) + help='Determine which monoview classifier to use with Mumbo', + default=['DecisionTree', 'DecisionTree', 'DecisionTree']) groupMumbo.add_argument('--MU_config', metavar='STRING', action='store', nargs='+', help='Configuration for the monoview classifier in Mumbo', default=['2:0.5', '2:0.5', '2:0.5']) @@ -533,36 +377,41 @@ groupFusion.add_argument('--FU_types', metavar='STRING', action='store', nargs=" default=['']) groupEarlyFusion = parser.add_argument_group('Early Fusion arguments') groupEarlyFusion.add_argument('--FU_early_methods', metavar='STRING', action='store', nargs="+", - help='Determine which early fusion method of fusion to use', - default=['']) + help='Determine which early fusion method of fusion to use', + default=['']) groupEarlyFusion.add_argument('--FU_E_method_configs', metavar='STRING', action='store', 
nargs='+', - help='Configuration for the early fusion methods separate method by space and values by :', + help='Configuration for the early fusion methods separate ' + 'method by space and values by :', default=['']) groupEarlyFusion.add_argument('--FU_E_cl_config', metavar='STRING', action='store', nargs='+', - help='Configuration for the monoview classifiers used separate classifier by space ' - 'and configs must be of form argument1_name:value,argument2_name:value', + help='Configuration for the monoview classifiers used separate classifier by space ' + 'and configs must be of form argument1_name:value,argument2_name:value', default=['']) groupEarlyFusion.add_argument('--FU_E_cl_names', metavar='STRING', action='store', nargs='+', - help='Name of the classifiers used for each early fusion method', default=['']) - + help='Name of the classifiers used for each early fusion method', default=['']) groupLateFusion = parser.add_argument_group('Late Early Fusion arguments') groupLateFusion.add_argument('--FU_late_methods', metavar='STRING', action='store', nargs="+", - help='Determine which late fusion method of fusion to use', - default=['']) + help='Determine which late fusion method of fusion to use', + default=['']) groupLateFusion.add_argument('--FU_L_method_config', metavar='STRING', action='store', nargs='+', - help='Configuration for the fusion method', default=['']) + help='Configuration for the fusion method', default=['']) groupLateFusion.add_argument('--FU_L_cl_config', metavar='STRING', action='store', nargs='+', - help='Configuration for the monoview classifiers used', default=['']) + help='Configuration for the monoview classifiers used', default=['']) groupLateFusion.add_argument('--FU_L_cl_names', metavar='STRING', action='store', nargs="+", - help='Names of the classifier used for late fusion', default=['']) + help='Names of the classifier used for late fusion', default=['']) groupLateFusion.add_argument('--FU_L_select_monoview', metavar='STRING', 
action='store', - help='Determine which method to use to select the monoview classifiers', default="intersect") + help='Determine which method to use to select the monoview classifiers', + default="intersect") args = parser.parse_args() + os.nice(args.nice) nbCores = args.CL_cores statsIter = args.CL_statsiter +randomState = np.random.RandomState(args.randomState) +hyperParamSearch = args.CL_HPS_type + start = time.time() if args.name not in ["MultiOmic", "ModifiedMultiOmic", "Caltech", "Fake", "Plausible", "KMultiOmic"]: @@ -570,12 +419,10 @@ if args.name not in ["MultiOmic", "ModifiedMultiOmic", "Caltech", "Fake", "Plaus else: getDatabase = getattr(DB, "get" + args.name + "DB" + args.type[1:]) -hyperParamSearch = args.CL_HPS_type - directory = initLogFile(args) DATASET, LABELS_DICTIONARY = getDatabase(args.views, args.pathF, args.name, args.CL_nb_class, - args.CL_classes) + args.CL_classes, randomState) datasetFiles = initMultipleDatasets(args, nbCores) @@ -619,7 +466,8 @@ if nbCores > 1: for stepIndex in range(int(math.ceil(float(nbExperiments) / nbCores))): resultsMonoview += (Parallel(n_jobs=nbCores)( delayed(ExecMonoview_multicore)(directory, args.name, labelsNames, args.CL_split, args.CL_nbFolds, - coreIndex, args.type, args.pathF, statsIter, hyperParamSearch=hyperParamSearch, + coreIndex, args.type, args.pathF, statsIter, randomState, + hyperParamSearch=hyperParamSearch, metrics=metrics, nIter=args.CL_GS_iter, **argumentDictionaries["Monoview"][coreIndex + stepIndex * nbCores]) for coreIndex in range(min(nbCores, nbExperiments - stepIndex * nbCores)))) @@ -633,7 +481,7 @@ if nbCores > 1: else: resultsMonoview += ([ExecMonoview(directory, DATASET.get("View" + str(arguments["viewIndex"])), DATASET.get("Labels").value, args.name, labelsNames, - args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF, statsIter, + args.CL_split, args.CL_nbFolds, 1, args.type, args.pathF, statsIter, randomState, hyperParamSearch=hyperParamSearch, metrics=metrics, 
nIter=args.CL_GS_iter, **arguments) for arguments in argumentDictionaries["Monoview"]]) @@ -645,7 +493,6 @@ else: viewsIndices] monoviewTime = time.time() - dataBaseTime - start - argumentDictionaries = initMultiviewArguments(args, benchmark, views, viewsIndices, scores, classifiersConfigs, classifiersNames, NB_VIEW, metrics[0], argumentDictionaries) diff --git a/Code/MonoMutliViewClassifiers/Monoview/ExecClassifMonoView.py b/Code/MonoMutliViewClassifiers/Monoview/ExecClassifMonoView.py index f4a2f3ea9afb0a7f64eb9e9f7a3b320fb146e1fc..58445abf8ca09c9da7a01e693d4d7f207860a068 100644 --- a/Code/MonoMutliViewClassifiers/Monoview/ExecClassifMonoView.py +++ b/Code/MonoMutliViewClassifiers/Monoview/ExecClassifMonoView.py @@ -21,7 +21,7 @@ import ExportResults # Functions to render results import MonoviewClassifiers import Metrics from analyzeResult import execute -from utils.Dataset import getV, getValue, extractSubset +from ..utils.Dataset import getV, getValue, extractSubset # Author-Info __author__ = "Nikolas Huelsmann, Baptiste BAUVIN" @@ -29,7 +29,7 @@ __status__ = "Prototype" # Production, Development, Prototype __date__ = 2016-03-25 -def ExecMonoview_multicore(directory, name, labelsNames, learningRate, nbFolds, datasetFileIndex, databaseType, path, statsIter, hyperParamSearch="randomizedSearch", +def ExecMonoview_multicore(directory, name, labelsNames, learningRate, nbFolds, datasetFileIndex, databaseType, path, statsIter, randomState, hyperParamSearch="randomizedSearch", metrics=[["accuracy_score", None]], nIter=30, **args): DATASET = h5py.File(path+name+str(datasetFileIndex)+".hdf5", "r") kwargs = args["args"] @@ -37,11 +37,11 @@ def ExecMonoview_multicore(directory, name, labelsNames, learningRate, nbFolds, neededViewIndex = views.index(kwargs["feat"]) X = DATASET.get("View"+str(neededViewIndex)) Y = DATASET.get("Labels").value - return ExecMonoview(directory, X, Y, name, labelsNames, learningRate, nbFolds, 1, databaseType, path, statsIter, 
hyperParamSearch=hyperParamSearch, + return ExecMonoview(directory, X, Y, name, labelsNames, learningRate, nbFolds, 1, databaseType, path, statsIter, randomState, hyperParamSearch=hyperParamSearch, metrics=metrics, nIter=nIter, **args) -def ExecMonoview(directory, X, Y, name, labelsNames, learningRate, nbFolds, nbCores, databaseType, path, statsIter, hyperParamSearch="randomizedSearch", +def ExecMonoview(directory, X, Y, name, labelsNames, learningRate, nbFolds, nbCores, databaseType, path, statsIter, randomState, hyperParamSearch="randomizedSearch", metrics=[["accuracy_score", None]], nIter=30, **args): logging.debug("Start:\t Loading data") try: @@ -66,7 +66,7 @@ def ExecMonoview(directory, X, Y, name, labelsNames, learningRate, nbFolds, nbCo for iterationStat in range(statsIter): # Calculate Train/Test data logging.debug("Start:\t Determine Train/Test split"+" for iteration "+str(iterationStat+1)) - testIndices = MonoviewUtils.splitDataset(Y, nbClass, learningRate, datasetLength) + testIndices = MonoviewUtils.splitDataset(Y, nbClass, learningRate, datasetLength, randomState) trainIndices = [i for i in range(datasetLength) if i not in testIndices] X_train = extractSubset(X,trainIndices) X_test = extractSubset(X,testIndices) @@ -84,13 +84,14 @@ def ExecMonoview(directory, X, Y, name, labelsNames, learningRate, nbFolds, nbCo if hyperParamSearch != "None": classifierGridSearch = getattr(classifierModule, hyperParamSearch) logging.debug("Start:\t RandomSearch best settings with "+str(nIter)+" iterations for "+CL_type) - cl_desc = classifierGridSearch(X_train, y_train, nbFolds=nbFolds, nbCores=nbCores, metric=metrics[0], nIter=nIter) + cl_desc = classifierGridSearch(X_train, y_train, randomState, nbFolds=nbFolds, nbCores=nbCores, + metric=metrics[0], nIter=nIter) clKWARGS = dict((str(index), desc) for index, desc in enumerate(cl_desc)) logging.debug("Done:\t RandomSearch best settings") else: clKWARGS = kwargs[kwargs["CL_type"]+"KWARGS"] logging.debug("Start:\t 
Training") - cl_res = classifierModule.fit(X_train, y_train, NB_CORES=nbCores, **clKWARGS) + cl_res = classifierModule.fit(X_train, y_train, randomState, NB_CORES=nbCores, **clKWARGS) logging.debug("Done:\t Training") logging.debug("Start:\t Predicting") @@ -111,7 +112,7 @@ def ExecMonoview(directory, X, Y, name, labelsNames, learningRate, nbFolds, nbCo stringAnalysis, imagesAnalysis, metricsScores = execute(name, learningRate, nbFolds, nbCores, hyperParamSearch, metrics, nIter, feat, CL_type, clKWARGS, labelsNames, X.shape, - y_trains, y_train_preds, y_tests, y_test_preds, t_end, statsIter) + y_trains, y_train_preds, y_tests, y_test_preds, t_end, statsIter, randomState) cl_desc = [value for key, value in sorted(clKWARGS.iteritems())] logging.debug("Done:\t Getting Results") logging.info(stringAnalysis) diff --git a/Code/MonoMutliViewClassifiers/Monoview/MonoviewUtils.py b/Code/MonoMutliViewClassifiers/Monoview/MonoviewUtils.py index 487708e992560c66626ad2523a0de6e2afb8c3a3..626890dfc587be75ffd0df7b698761492698e6f8 100644 --- a/Code/MonoMutliViewClassifiers/Monoview/MonoviewUtils.py +++ b/Code/MonoMutliViewClassifiers/Monoview/MonoviewUtils.py @@ -3,7 +3,6 @@ """ Library: MultiClass Classification with MonoView """ # Import built-in modules -import pandas as pd # For DataFrames # Import sci-kit learn party modules #from sklearn.tests import train_test_split # For calculating the train/test split @@ -12,7 +11,6 @@ from sklearn.model_selection import GridSearchCV # GridSearch for par from sklearn.ensemble import RandomForestClassifier # RandomForest-Classifier import sklearn import numpy as np -import random # Import own modules @@ -36,20 +34,20 @@ def getLabelSupports(CLASS_LABELS): return supports, dict((label, index) for label, index in zip(labels, range(len(labels)))) -def splitDataset(LABELS, NB_CLASS, LEARNING_RATE, DATASET_LENGTH): - validationIndices = extractRandomTrainingSet(LABELS, 1-LEARNING_RATE, DATASET_LENGTH, NB_CLASS) +def splitDataset(LABELS, 
NB_CLASS, LEARNING_RATE, DATASET_LENGTH, randomState): + validationIndices = extractRandomTrainingSet(LABELS, 1-LEARNING_RATE, DATASET_LENGTH, NB_CLASS, randomState) validationIndices.sort() return validationIndices -def extractRandomTrainingSet(CLASS_LABELS, LEARNING_RATE, DATASET_LENGTH, NB_CLASS): +def extractRandomTrainingSet(CLASS_LABELS, LEARNING_RATE, DATASET_LENGTH, NB_CLASS, randomState): labelSupports, labelDict = getLabelSupports(np.array(CLASS_LABELS)) nbTrainingExamples = [int(support * LEARNING_RATE) for support in labelSupports] trainingExamplesIndices = [] usedIndices = [] while nbTrainingExamples != [0 for i in range(NB_CLASS)]: isUseFull = False - index = int(random.randint(0, DATASET_LENGTH-1)) + index = int(randomState.randint(0, DATASET_LENGTH-1)) if index not in usedIndices: isUseFull, nbTrainingExamples = isUseful(nbTrainingExamples, index, CLASS_LABELS, labelDict) if isUseFull: diff --git a/Code/MonoMutliViewClassifiers/Monoview/analyzeResult.py b/Code/MonoMutliViewClassifiers/Monoview/analyzeResult.py index 47e71752705c696b20ce69611b1ca020ba3d59cb..1e7e7df4344471ff9cedb7a063e153ba51decbd9 100644 --- a/Code/MonoMutliViewClassifiers/Monoview/analyzeResult.py +++ b/Code/MonoMutliViewClassifiers/Monoview/analyzeResult.py @@ -40,7 +40,7 @@ def getMetricScore(metric, y_trains, y_train_preds, y_tests, y_test_preds): def execute(name, learningRate, nbFolds, nbCores, gridSearch, metrics, nIter, feat, CL_type, clKWARGS, classLabelsNames, - shape, y_trains, y_train_preds, y_tests, y_test_preds, time, statsIter): + shape, y_trains, y_train_preds, y_tests, y_test_preds, time, statsIter, randomState): metricsScores = {} metricModule = getattr(Metrics, metrics[0][0]) trainScores = np.array([metricModule.score(y_train, y_train_pred) for y_train, y_train_pred in zip(y_trains, y_train_preds)]) @@ -49,7 +49,7 @@ def execute(name, learningRate, nbFolds, nbCores, gridSearch, metrics, nIter, fe val = np.mean(testScores) stdTrain = np.std(trainScores) stdTest = 
np.std(testScores) - stringAnalysis = "Classification on "+name+" database for "+feat+" with "+CL_type+", and "+str(statsIter)+" statistical iterations\n\n" + stringAnalysis = "Classification on "+name+" database for "+feat+" with "+CL_type+", random state is "+str(randomState)+", and "+str(statsIter)+" statistical iterations\n\n" stringAnalysis += metrics[0][0]+" on train : "+str(train)+", with STD : "+str(stdTrain)+"\n"+metrics[0][0]+" on test : "+str(val)+", with STD : "+str(stdTest)+"\n\n" stringAnalysis += getDBConfigString(name, feat, learningRate, shape, classLabelsNames, nbFolds) stringAnalysis += getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS) diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/Adaboost.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/Adaboost.py index fb5e6a741edb775c6864988566290bd028aca45d..567cfa786b2666939167d67b09b9d06e39b6d3ca 100644 --- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/Adaboost.py +++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/Adaboost.py @@ -3,7 +3,6 @@ from sklearn.pipeline import Pipeline from sklearn.model_selection import RandomizedSearchCV from sklearn.tree import DecisionTreeClassifier import Metrics -from scipy.stats import randint # Author-Info __author__ = "Baptiste Bauvin" @@ -14,10 +13,11 @@ def canProbas(): return True -def fit(DATASET, CLASS_LABELS, NB_CORES=1,**kwargs): +def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1,**kwargs): num_estimators = int(kwargs['0']) base_estimators = DecisionTreeClassifier()#kwargs['1'] - classifier = AdaBoostClassifier(n_estimators=num_estimators, base_estimator=base_estimators) + classifier = AdaBoostClassifier(n_estimators=num_estimators, base_estimator=base_estimators, + random_state=randomState) classifier.fit(DATASET, CLASS_LABELS) return classifier @@ -32,10 +32,10 @@ def getKWARGS(kwargsList): return kwargsDict -def randomizedSearch(X_train, y_train, nbFolds=4, metric=["accuracy_score", None], nIter=30, 
nbCores=1): +def randomizedSearch(X_train, y_train, randomState, nbFolds=4, metric=["accuracy_score", None], nIter=30, nbCores=1): pipeline = Pipeline([('classifier', AdaBoostClassifier())]) - param= {"classifier__n_estimators": randint(1, 15), + param= {"classifier__n_estimators": randomState.randint(1, 15), "classifier__base_estimator": [DecisionTreeClassifier()]} metricModule = getattr(Metrics, metric[0]) if metric[1]!=None: @@ -43,7 +43,8 @@ def randomizedSearch(X_train, y_train, nbFolds=4, metric=["accuracy_score", None else: metricKWARGS = {} scorer = metricModule.get_scorer(**metricKWARGS) - grid = RandomizedSearchCV(pipeline, n_iter=nIter, param_distributions=param, refit=True, n_jobs=nbCores, scoring=scorer, cv=nbFolds) + grid = RandomizedSearchCV(pipeline, n_iter=nIter, param_distributions=param, refit=True, n_jobs=nbCores, + scoring=scorer, cv=nbFolds, random_state=randomState) detector = grid.fit(X_train, y_train) desc_estimators = [detector.best_params_["classifier__n_estimators"], detector.best_params_["classifier__base_estimator"]] diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/DecisionTree.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/DecisionTree.py index e3c92498983f92b4ff47a552c0d80b36d6a54ee3..2c9cbc4b9b3578b419767a3109b48299189f00eb 100644 --- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/DecisionTree.py +++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/DecisionTree.py @@ -2,7 +2,6 @@ from sklearn.tree import DecisionTreeClassifier from sklearn.pipeline import Pipeline # Pipelining in classification from sklearn.model_selection import RandomizedSearchCV import Metrics -from scipy.stats import randint # Author-Info @@ -14,11 +13,12 @@ def canProbas(): return True -def fit(DATASET, CLASS_LABELS, NB_CORES=1, **kwargs): +def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1, **kwargs): maxDepth = int(kwargs['0']) criterion = kwargs['1'] splitter = kwargs['2'] - classifier = 
DecisionTreeClassifier(max_depth=maxDepth, criterion=criterion, splitter=splitter) + classifier = DecisionTreeClassifier(max_depth=maxDepth, criterion=criterion, splitter=splitter, + random_state=randomState) classifier.fit(DATASET, CLASS_LABELS) return classifier @@ -35,9 +35,9 @@ def getKWARGS(kwargsList): return kwargsDict -def randomizedSearch(X_train, y_train, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30): +def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30): pipeline_DT = Pipeline([('classifier', DecisionTreeClassifier())]) - param_DT = {"classifier__max_depth": randint(1, 30), + param_DT = {"classifier__max_depth": randomState.randint(1, 30), "classifier__criterion": ["gini", "entropy"], "classifier__splitter": ["best", "random"]} metricModule = getattr(Metrics, metric[0]) @@ -47,7 +47,7 @@ def randomizedSearch(X_train, y_train, nbFolds=4, nbCores=1, metric=["accuracy_s metricKWARGS = {} scorer = metricModule.get_scorer(**metricKWARGS) grid_DT = RandomizedSearchCV(pipeline_DT, n_iter=nIter, param_distributions=param_DT, refit=True, n_jobs=nbCores, scoring=scorer, - cv=nbFolds) + cv=nbFolds, random_state=randomState) DT_detector = grid_DT.fit(X_train, y_train) desc_params = [DT_detector.best_params_["classifier__max_depth"], DT_detector.best_params_["classifier__criterion"], DT_detector.best_params_["classifier__splitter"]] diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/KNN.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/KNN.py index 707c81dab7103963c8252a1972ff1c3209c8882d..8953721c0705f37b68204888857b68452b9dd2dc 100644 --- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/KNN.py +++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/KNN.py @@ -2,7 +2,6 @@ from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import Pipeline # Pipelining in classification from sklearn.model_selection import RandomizedSearchCV import Metrics -from 
scipy.stats import randint # Author-Info @@ -13,12 +12,14 @@ __status__ = "Prototype" # Production, Development, P def canProbas(): return True -def fit(DATASET, CLASS_LABELS, NB_CORES=1,**kwargs): + +def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1,**kwargs): nNeighbors = int(kwargs['0']) weights = kwargs["1"] algorithm = kwargs["2"] p = int(kwargs["3"]) - classifier = KNeighborsClassifier(n_neighbors=nNeighbors, weights=weights, algorithm=algorithm, p=p, n_jobs=NB_CORES) + classifier = KNeighborsClassifier(n_neighbors=nNeighbors, weights=weights, algorithm=algorithm, p=p, + n_jobs=NB_CORES, ) classifier.fit(DATASET, CLASS_LABELS) return classifier @@ -37,9 +38,9 @@ def getKWARGS(kwargsList): return kwargsDict -def randomizedSearch(X_train, y_train, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30 ): +def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30 ): pipeline_KNN = Pipeline([('classifier', KNeighborsClassifier())]) - param_KNN = {"classifier__n_neighbors": randint(1, 50), + param_KNN = {"classifier__n_neighbors": randomState.randint(1, 50), "classifier__weights": ["uniform", "distance"], "classifier__algorithm": ["auto", "ball_tree", "kd_tree", "brute"], "classifier__p": [1,2], @@ -51,7 +52,7 @@ def randomizedSearch(X_train, y_train, nbFolds=4, nbCores=1, metric=["accuracy_s metricKWARGS = {} scorer = metricModule.get_scorer(**metricKWARGS) grid_KNN = RandomizedSearchCV(pipeline_KNN, n_iter=nIter, param_distributions=param_KNN, refit=True, n_jobs=nbCores, scoring=scorer, - cv=nbFolds) + cv=nbFolds, random_state=randomState) KNN_detector = grid_KNN.fit(X_train, y_train) desc_params = [KNN_detector.best_params_["classifier__n_neighbors"], KNN_detector.best_params_["classifier__weights"], diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/RandomForest.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/RandomForest.py index 
28011651d9299388d52dcdd2713d9b0e486d6d50..ee85e6fde487ea060e15dbcb77eb983aa30dcf46 100644 --- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/RandomForest.py +++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/RandomForest.py @@ -2,7 +2,6 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.pipeline import Pipeline from sklearn.model_selection import RandomizedSearchCV import Metrics -from scipy.stats import randint # Author-Info @@ -14,11 +13,12 @@ def canProbas(): return True -def fit(DATASET, CLASS_LABELS, NB_CORES=1,**kwargs): +def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1,**kwargs): num_estimators = int(kwargs['0']) maxDepth = int(kwargs['1']) criterion = kwargs["2"] - classifier = RandomForestClassifier(n_estimators=num_estimators, max_depth=maxDepth, criterion=criterion, n_jobs=NB_CORES) + classifier = RandomForestClassifier(n_estimators=num_estimators, max_depth=maxDepth, criterion=criterion, + n_jobs=NB_CORES, random_state=randomState) classifier.fit(DATASET, CLASS_LABELS) return classifier @@ -35,18 +35,19 @@ def getKWARGS(kwargsList): return kwargsDict -def randomizedSearch(X_train, y_train, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30): +def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30): pipeline_rf = Pipeline([('classifier', RandomForestClassifier())]) - param_rf = {"classifier__n_estimators": randint(1, 30), - "classifier__max_depth":randint(1, 30), - "classifier__criterion":["gini", "entropy"]} + param_rf = {"classifier__n_estimators": randomState.randint(1, 30), + "classifier__max_depth": randomState.randint(1, 30), + "classifier__criterion": ["gini", "entropy"]} metricModule = getattr(Metrics, metric[0]) if metric[1]!=None: metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1])) else: metricKWARGS = {} scorer = metricModule.get_scorer(**metricKWARGS) - grid_rf = RandomizedSearchCV(pipeline_rf, 
n_iter=nIter,param_distributions=param_rf,refit=True,n_jobs=nbCores,scoring=scorer,cv=nbFolds) + grid_rf = RandomizedSearchCV(pipeline_rf, n_iter=nIter,param_distributions=param_rf,refit=True,n_jobs=nbCores, + scoring=scorer,cv=nbFolds, random_state=randomState) rf_detector = grid_rf.fit(X_train, y_train) desc_estimators = [rf_detector.best_params_["classifier__n_estimators"], diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SCM.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SCM.py index 89e240fcb56343a893d6fc3fa0ca82ce4eafc36a..afc365b9e805857aca6147cf3f2758268f2345e7 100644 --- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SCM.py +++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SCM.py @@ -1,17 +1,10 @@ -from sklearn.ensemble import AdaBoostClassifier -from sklearn.pipeline import Pipeline -from sklearn.grid_search import RandomizedSearchCV -from sklearn.tree import DecisionTreeClassifier import Metrics -import random from pyscm.utils import _pack_binary_bytes_to_ints import pyscm -from scipy.stats import randint -from utils.Dataset import getShape +from ..utils.Dataset import getShape import h5py from Multiview import GetMultiviewDb as DB from pyscm.binary_attributes.base import BaseBinaryAttributeList -import logging import os # Author-Info __author__ = "Baptiste Bauvin" @@ -38,7 +31,8 @@ def fit(DATASET, CLASS_LABELS, NB_CORES=1,**kwargs): except: attributeClassification, binaryAttributes, dsetFile, name = transformData(DATASET) classifier = pyscm.scm.SetCoveringMachine(p=p, max_attributes=max_attrtibutes, model_type=model_type, verbose=False) - classifier.fit(binaryAttributes, CLASS_LABELS, X=None, attribute_classifications=attributeClassification, iteration_callback=None) + classifier.fit(binaryAttributes, CLASS_LABELS, X=None, attribute_classifications=attributeClassification, + iteration_callback=None) try: dsetFile.close() os.remove(name) @@ -58,9 +52,7 @@ def getKWARGS(kwargsList): return kwargsDict - - -def 
randomizedSearch(X_train, y_train, nbFolds=4, metric=["accuracy_score", None], nIter=30, nbCores=1): +def randomizedSearch(X_train, y_train, randomState, nbFolds=4, metric=["accuracy_score", None], nIter=30, nbCores=1): metricModule = getattr(Metrics, metric[0]) if metric[1]!=None: @@ -75,9 +67,9 @@ def randomizedSearch(X_train, y_train, nbFolds=4, metric=["accuracy_score", None isBetter = "lower" config = [] for iterIndex in range(nIter): - max_attributes = random.randint(1, 20) - p = random.random() - model = random.choice(["conjunction", "disjunction"]) + max_attributes = randomState.randint(1, 20) + p = randomState.random() + model = randomState.choice(["conjunction", "disjunction"]) classifier = pyscm.scm.SetCoveringMachine(p=p, max_attributes=max_attributes, model_type=model, verbose=False) if nbFolds != 1: kFolds = DB.getKFoldIndices(nbFolds, y_train, len(set(y_train)), range(len(y_train))) diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SGD.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SGD.py index 59a772b5606260ef622f9d05cf377a0a2efa4dcb..3d861549b4b7cbe5f806d346d9a3773809fdcadb 100644 --- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SGD.py +++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SGD.py @@ -2,7 +2,6 @@ from sklearn.linear_model import SGDClassifier from sklearn.pipeline import Pipeline # Pipelining in classification from sklearn.model_selection import RandomizedSearchCV import Metrics -from scipy.stats import uniform # Author-Info @@ -13,14 +12,15 @@ __status__ = "Prototype" # Production, Development, P def canProbas(): return True -def fit(DATASET, CLASS_LABELS, NB_CORES=1,**kwargs): + +def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1,**kwargs): loss = kwargs['0'] penalty = kwargs['1'] try: alpha = float(kwargs['2']) except: alpha = 0.15 - classifier = SGDClassifier(loss=loss, penalty=penalty, alpha=alpha) + classifier = SGDClassifier(loss=loss, penalty=penalty, alpha=alpha, random_state=randomState, 
n_jobs=NB_CORES) classifier.fit(DATASET, CLASS_LABELS) return classifier @@ -37,11 +37,11 @@ def getKWARGS(kwargsList): return kwargsDict -def randomizedSearch(X_train, y_train, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30): +def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30): pipeline_SGD = Pipeline([('classifier', SGDClassifier())]) losses = ['log', 'modified_huber'] penalties = ["l1", "l2", "elasticnet"] - alphas = uniform() + alphas = randomState.uniform() param_SGD = {"classifier__loss": losses, "classifier__penalty": penalties, "classifier__alpha": alphas} metricModule = getattr(Metrics, metric[0]) @@ -50,8 +50,8 @@ def randomizedSearch(X_train, y_train, nbFolds=4, nbCores=1, metric=["accuracy_s else: metricKWARGS = {} scorer = metricModule.get_scorer(**metricKWARGS) - grid_SGD = RandomizedSearchCV(pipeline_SGD, n_iter=nIter, param_distributions=param_SGD, refit=True, n_jobs=nbCores, scoring=scorer, - cv=nbFolds) + grid_SGD = RandomizedSearchCV(pipeline_SGD, n_iter=nIter, param_distributions=param_SGD, refit=True, + n_jobs=nbCores, scoring=scorer, cv=nbFolds, random_state=randomState) SGD_detector = grid_SGD.fit(X_train, y_train) desc_params = [SGD_detector.best_params_["classifier__loss"], SGD_detector.best_params_["classifier__penalty"], SGD_detector.best_params_["classifier__alpha"]] diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMLinear.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMLinear.py index 3b98ef6697de2b03070aab088ea690638f918556..4dab4815c46a69e88aa9b976e79a28258432f02f 100644 --- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMLinear.py +++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMLinear.py @@ -2,7 +2,6 @@ from sklearn.svm import SVC from sklearn.pipeline import Pipeline # Pipelining in classification from sklearn.model_selection import RandomizedSearchCV import Metrics -from scipy.stats import randint # 
Author-Info @@ -13,9 +12,10 @@ __status__ = "Prototype" # Production, Development, P def canProbas(): return True -def fit(DATASET, CLASS_LABELS, NB_CORES=1,**kwargs): + +def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1,**kwargs): C = int(kwargs['0']) - classifier = SVC(C=C, kernel='linear', probability=True, max_iter=1000) + classifier = SVC(C=C, kernel='linear', probability=True, max_iter=1000, random_state=randomState) classifier.fit(DATASET, CLASS_LABELS) return classifier @@ -28,17 +28,18 @@ def getKWARGS(kwargsList): return kwargsDict -def randomizedSearch(X_train, y_train, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30): +def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30): pipeline_SVMLinear = Pipeline([('classifier', SVC(kernel="linear", max_iter=1000))]) - param_SVMLinear = {"classifier__C":randint(1, 10000)} + param_SVMLinear = {"classifier__C": randomState.randint(1, 10000)} metricModule = getattr(Metrics, metric[0]) if metric[1]!=None: metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1])) else: metricKWARGS = {} scorer = metricModule.get_scorer(**metricKWARGS) - grid_SVMLinear = RandomizedSearchCV(pipeline_SVMLinear, n_iter=nIter,param_distributions=param_SVMLinear, refit=True, n_jobs=nbCores, scoring=scorer, - cv=nbFolds) + grid_SVMLinear = RandomizedSearchCV(pipeline_SVMLinear, n_iter=nIter,param_distributions=param_SVMLinear, + refit=True, n_jobs=nbCores, scoring=scorer, cv=nbFolds, + random_state=randomState) SVMLinear_detector = grid_SVMLinear.fit(X_train, y_train) desc_params = [SVMLinear_detector.best_params_["classifier__C"]] diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMPoly.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMPoly.py index 2300e481653249d3275e6308c4f2bdc2c20ccacb..454ad82826b3586b3b345bd194aff77b6928ad45 100644 --- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMPoly.py 
+++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMPoly.py @@ -2,7 +2,6 @@ from sklearn.svm import SVC from sklearn.pipeline import Pipeline # Pipelining in classification from sklearn.model_selection import RandomizedSearchCV import Metrics -from scipy.stats import randint # Author-Info @@ -13,10 +12,11 @@ __status__ = "Prototype" # Production, Development, P def canProbas(): return True -def fit(DATASET, CLASS_LABELS, NB_CORES=1,**kwargs): + +def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1,**kwargs): C = int(kwargs['0']) degree = int(kwargs['1']) - classifier = SVC(C=C, kernel='poly', degree=degree, probability=True, max_iter=1000) + classifier = SVC(C=C, kernel='poly', degree=degree, probability=True, max_iter=1000, random_state=randomState) classifier.fit(DATASET, CLASS_LABELS) return classifier @@ -31,17 +31,18 @@ def getKWARGS(kwargsList): return kwargsDict -def randomizedSearch(X_train, y_train, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30): +def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30): pipeline_SVMPoly = Pipeline([('classifier', SVC(kernel="poly", max_iter=1000))]) - param_SVMPoly = {"classifier__C": randint(1, 10000), "classifier__degree":randint(1, 30)} + param_SVMPoly = {"classifier__C": randomState.randint(1, 10000), + "classifier__degree": randomState.randint(1, 30)} metricModule = getattr(Metrics, metric[0]) if metric[1]!=None: metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1])) else: metricKWARGS = {} scorer = metricModule.get_scorer(**metricKWARGS) - grid_SVMPoly = RandomizedSearchCV(pipeline_SVMPoly, n_iter=nIter, param_distributions=param_SVMPoly, refit=True, n_jobs=nbCores, scoring=scorer, - cv=nbFolds) + grid_SVMPoly = RandomizedSearchCV(pipeline_SVMPoly, n_iter=nIter, param_distributions=param_SVMPoly, refit=True, + n_jobs=nbCores, scoring=scorer, cv=nbFolds, random_state=randomState) SVMRBF_detector 
= grid_SVMPoly.fit(X_train, y_train) desc_params = [SVMRBF_detector.best_params_["classifier__C"], SVMRBF_detector.best_params_["classifier__degree"]] return desc_params diff --git a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMRBF.py b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMRBF.py index 11415da2ab637fc5d558be51ab308cb7422d8230..24cc8681b2c08a9e3643c569bc5a2e39e5d7b6dc 100644 --- a/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMRBF.py +++ b/Code/MonoMutliViewClassifiers/MonoviewClassifiers/SVMRBF.py @@ -2,7 +2,6 @@ from sklearn.svm import SVC from sklearn.pipeline import Pipeline # Pipelining in classification from sklearn.model_selection import RandomizedSearchCV import Metrics -from scipy.stats import randint # Author-Info @@ -14,9 +13,9 @@ def canProbas(): return True -def fit(DATASET, CLASS_LABELS, NB_CORES=1,**kwargs): +def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1,**kwargs): C = int(kwargs['0']) - classifier = SVC(C=C, kernel='rbf', probability=True, max_iter=1000) + classifier = SVC(C=C, kernel='rbf', probability=True, max_iter=1000, random_state=randomState) classifier.fit(DATASET, CLASS_LABELS) return classifier @@ -29,17 +28,17 @@ def getKWARGS(kwargsList): return kwargsDict -def randomizedSearch(X_train, y_train, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30): +def randomizedSearch(X_train, y_train, randomState, nbFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30): pipeline_SVMRBF = Pipeline([('classifier', SVC(kernel="rbf", max_iter=1000))]) - param_SVMRBF = {"classifier__C": randint(1, 10000)} + param_SVMRBF = {"classifier__C": randomState.randint(1, 10000)} metricModule = getattr(Metrics, metric[0]) if metric[1]!=None: metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1])) else: metricKWARGS = {} scorer = metricModule.get_scorer(**metricKWARGS) - grid_SVMRBF = RandomizedSearchCV(pipeline_SVMRBF, n_iter=nIter, param_distributions=param_SVMRBF, 
refit=True, n_jobs=nbCores, scoring=scorer, - cv=nbFolds) + grid_SVMRBF = RandomizedSearchCV(pipeline_SVMRBF, n_iter=nIter, param_distributions=param_SVMRBF, refit=True, + n_jobs=nbCores, scoring=scorer, cv=nbFolds, random_state=randomState) SVMRBF_detector = grid_SVMRBF.fit(X_train, y_train) desc_params = [SVMRBF_detector.best_params_["classifier__C"]] return desc_params diff --git a/Code/MonoMutliViewClassifiers/Multiview/GetMultiviewDb.py b/Code/MonoMutliViewClassifiers/Multiview/GetMultiviewDb.py index 1d49fcfff32d45902f798530916431cc57c604a5..d3b04b4388fef64cb8fa42d5ddf3ed1164385d84 100644 --- a/Code/MonoMutliViewClassifiers/Multiview/GetMultiviewDb.py +++ b/Code/MonoMutliViewClassifiers/Multiview/GetMultiviewDb.py @@ -1,54 +1,50 @@ import numpy as np import math -from scipy import sparse, io -from string import digits +from scipy import sparse +# from string import digits import os -import random +# import random import logging import h5py import operator # Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype +__author__ = "Baptiste Bauvin" +__status__ = "Prototype" # Production, Development, Prototype -def makeMeNoisy(viewData, percentage=15): +def makeMeNoisy(viewData, randomState, percentage=15): viewData = viewData.astype(bool) - nbNoisyCoord = int(percentage/100.0*viewData.shape[0]*viewData.shape[1]) + nbNoisyCoord = int(percentage / 100.0 * viewData.shape[0] * viewData.shape[1]) rows = range(viewData.shape[0]) cols = range(viewData.shape[1]) for _ in range(nbNoisyCoord): - rowIdx = random.choice(rows) - colIdx = random.choice(cols) + rowIdx = randomState.choice(rows) + colIdx = randomState.choice(cols) viewData[rowIdx, colIdx] = not viewData[rowIdx, colIdx] noisyViewData = viewData.astype(np.uint8) return noisyViewData -def getPlausibleDBhdf5(features, pathF, name , NB_CLASS, LABELS_NAME, nbView=3, nbClass=2, datasetLength=347): +def getPlausibleDBhdf5(features, pathF, name, NB_CLASS, LABELS_NAME, 
randomState, nbView=3, + nbClass=2, datasetLength=347): nbFeatures = 250 - datasetFile = h5py.File(pathF+"Plausible.hdf5", "w") - CLASS_LABELS = np.array([0 for i in range(datasetLength/2)]+[1 for i in range(datasetLength/2)]) + datasetFile = h5py.File(pathF + "Plausible.hdf5", "w") + CLASS_LABELS = np.array([0 for i in range(datasetLength / 2)] + [1 for i in range(datasetLength / 2)]) for viewIndex in range(nbView): - # if viewIndex== 0 : - viewData = np.array([np.zeros(nbFeatures) for i in range(datasetLength/2)]+[np.ones(nbFeatures) for i in range(datasetLength/2)]) - fakeTrueIndices = np.random.randint(0, datasetLength/2-1, datasetLength/5) - fakeFalseIndices = np.random.randint(datasetLength/2, datasetLength-1, datasetLength/5) + viewData = np.array([np.zeros(nbFeatures) for i in range(datasetLength / 2)] + [np.ones(nbFeatures) + for i in + range(datasetLength / 2)]) + fakeTrueIndices = randomState.randint(0, datasetLength / 2 - 1, datasetLength / 5) + fakeFalseIndices = randomState.randint(datasetLength / 2, datasetLength - 1, datasetLength / 5) viewData[fakeTrueIndices] = np.ones((len(fakeTrueIndices), nbFeatures)) viewData[fakeFalseIndices] = np.zeros((len(fakeFalseIndices), nbFeatures)) - viewData = makeMeNoisy(viewData) - viewDset = datasetFile.create_dataset("View"+str(viewIndex), viewData.shape, data=viewData.astype(np.uint8)) - viewDset.attrs["name"] = "View"+str(viewIndex) + viewData = makeMeNoisy(viewData, randomState) + viewDset = datasetFile.create_dataset("View" + str(viewIndex), viewData.shape, data=viewData.astype(np.uint8)) + viewDset.attrs["name"] = "View" + str(viewIndex) viewDset.attrs["sparse"] = False viewDset.attrs["binary"] = True - # else: - # viewData = np.array([np.random.normal(float((viewIndex+1)*10), 0.42, nbFeatures) for i in range(datasetLength/2)]+[np.random.normal(-float((viewIndex+1)*10),0.42,nbFeatures) for j in range(datasetLength/2)]) - # viewDset = datasetFile.create_dataset("View"+str(viewIndex), viewData.shape) - # 
viewDset.attrs["name"] = "View"+str(viewIndex) - # viewDset.attrs["sparse"] = False - # viewDset.attrs["binary"] = False labelsDset = datasetFile.create_dataset("Labels", CLASS_LABELS.shape) labelsDset[...] = CLASS_LABELS labelsDset.attrs["name"] = "Labels" @@ -57,45 +53,47 @@ def getPlausibleDBhdf5(features, pathF, name , NB_CLASS, LABELS_NAME, nbView=3, metaDataGrp.attrs["nbClass"] = 2 metaDataGrp.attrs["datasetLength"] = len(CLASS_LABELS) datasetFile.close() - datasetFile = h5py.File(pathF+"Plausible.hdf5", "r") - LABELS_DICTIONARY = {0:"No", 1:"Yes"} + datasetFile = h5py.File(pathF + "Plausible.hdf5", "r") + LABELS_DICTIONARY = {0: "No", 1: "Yes"} return datasetFile, LABELS_DICTIONARY -def getFakeDBhdf5(features, pathF, name , NB_CLASS, LABELS_NAME): +def getFakeDBhdf5(features, pathF, name, NB_CLASS, LABELS_NAME, randomState): NB_VIEW = 4 DATASET_LENGTH = 30 NB_CLASS = 2 - VIEW_DIMENSIONS = np.random.random_integers(5, 20, NB_VIEW) + VIEW_DIMENSIONS = randomState.random_integers(5, 20, NB_VIEW) DATA = dict((indx, - np.array([ - np.random.normal(0.0, 2, viewDimension) - for i in np.arange(DATASET_LENGTH)])) - for indx, viewDimension in enumerate(VIEW_DIMENSIONS)) + np.array([ + randomState.normal(0.0, 2, viewDimension) + for i in np.arange(DATASET_LENGTH)])) + for indx, viewDimension in enumerate(VIEW_DIMENSIONS)) - CLASS_LABELS = np.random.random_integers(0, NB_CLASS-1, DATASET_LENGTH) - datasetFile = h5py.File(pathF+"Fake.hdf5", "w") + CLASS_LABELS = randomState.random_integers(0, NB_CLASS - 1, DATASET_LENGTH) + datasetFile = h5py.File(pathF + "Fake.hdf5", "w") for index, viewData in enumerate(DATA.values()): - if index==0: - viewData = np.random.randint(0, 1, (DATASET_LENGTH,300)).astype(np.uint8)#np.zeros(viewData.shape, dtype=bool)+np.ones((viewData.shape[0], viewData.shape[1]/2), dtype=bool) - viewDset = datasetFile.create_dataset("View"+str(index), viewData.shape) + if index == 0: + viewData = randomState.randint(0, 1, (DATASET_LENGTH, 300)).astype( + 
np.uint8) + # np.zeros(viewData.shape, dtype=bool)+np.ones((viewData.shape[0], viewData.shape[1]/2), dtype=bool) + viewDset = datasetFile.create_dataset("View" + str(index), viewData.shape) viewDset[...] = viewData - viewDset.attrs["name"] = "View"+str(index) + viewDset.attrs["name"] = "View" + str(index) viewDset.attrs["sparse"] = False elif index == 1: viewData = sparse.csr_matrix(viewData) - viewGrp = datasetFile.create_group("View"+str(index)) + viewGrp = datasetFile.create_group("View" + str(index)) dataDset = viewGrp.create_dataset("data", viewData.data.shape, data=viewData.data) indicesDset = viewGrp.create_dataset("indices", viewData.indices.shape, data=viewData.indices) indptrDset = viewGrp.create_dataset("indptr", viewData.indptr.shape, data=viewData.indptr) - viewGrp.attrs["name"] = "View"+str(index) + viewGrp.attrs["name"] = "View" + str(index) viewGrp.attrs["sparse"] = True viewGrp.attrs["shape"] = viewData.shape else: - viewDset = datasetFile.create_dataset("View"+str(index), viewData.shape) + viewDset = datasetFile.create_dataset("View" + str(index), viewData.shape) viewDset[...] = viewData - viewDset.attrs["name"] = "View"+str(index) + viewDset.attrs["name"] = "View" + str(index) viewDset.attrs["sparse"] = False labelsDset = datasetFile.create_dataset("Labels", CLASS_LABELS.shape) labelsDset[...] 
= CLASS_LABELS @@ -105,9 +103,9 @@ def getFakeDBhdf5(features, pathF, name , NB_CLASS, LABELS_NAME): metaDataGrp.attrs["nbView"] = NB_VIEW metaDataGrp.attrs["nbClass"] = NB_CLASS metaDataGrp.attrs["datasetLength"] = len(CLASS_LABELS) - LABELS_DICTIONARY = {0:"No", 1:"Yes"} + LABELS_DICTIONARY = {0: "No", 1: "Yes"} datasetFile.close() - datasetFile = h5py.File(pathF+"Fake.hdf5", "r") + datasetFile = h5py.File(pathF + "Fake.hdf5", "r") return datasetFile, LABELS_DICTIONARY @@ -117,7 +115,7 @@ def getLabelSupports(CLASS_LABELS): return supports, dict((label, index) for label, index in zip(labels, range(len(labels)))) -def isUseful (labelSupports, index, CLASS_LABELS, labelDict): +def isUseful(labelSupports, index, CLASS_LABELS, labelDict): if labelSupports[labelDict[CLASS_LABELS[index]]] != 0: labelSupports[labelDict[CLASS_LABELS[index]]] -= 1 return True, labelSupports @@ -125,22 +123,22 @@ def isUseful (labelSupports, index, CLASS_LABELS, labelDict): return False, labelSupports -def splitDataset(DATASET, LEARNING_RATE, DATASET_LENGTH): +def splitDataset(DATASET, LEARNING_RATE, DATASET_LENGTH, randomState): LABELS = DATASET.get("Labels")[...] 
NB_CLASS = int(DATASET["Metadata"].attrs["nbClass"]) - validationIndices = extractRandomTrainingSet(LABELS, 1-LEARNING_RATE, DATASET_LENGTH, NB_CLASS) + validationIndices = extractRandomTrainingSet(LABELS, 1 - LEARNING_RATE, DATASET_LENGTH, NB_CLASS, randomState) validationIndices.sort() return validationIndices -def extractRandomTrainingSet(CLASS_LABELS, LEARNING_RATE, DATASET_LENGTH, NB_CLASS): +def extractRandomTrainingSet(CLASS_LABELS, LEARNING_RATE, DATASET_LENGTH, NB_CLASS, randomState): labelSupports, labelDict = getLabelSupports(np.array(CLASS_LABELS)) nbTrainingExamples = [int(support * LEARNING_RATE) for support in labelSupports] trainingExamplesIndices = [] usedIndices = [] while nbTrainingExamples != [0 for i in range(NB_CLASS)]: isUseFull = False - index = int(random.randint(0, DATASET_LENGTH-1)) + index = int(randomState.randint(0, DATASET_LENGTH - 1)) if index not in usedIndices: isUseFull, nbTrainingExamples = isUseful(nbTrainingExamples, index, CLASS_LABELS, labelDict) if isUseFull: @@ -149,7 +147,7 @@ def extractRandomTrainingSet(CLASS_LABELS, LEARNING_RATE, DATASET_LENGTH, NB_CLA return trainingExamplesIndices -def getKFoldIndices(nbFolds, CLASS_LABELS, NB_CLASS, learningIndices): +def getKFoldIndices(nbFolds, CLASS_LABELS, NB_CLASS, learningIndices, randomState): labelSupports, labelDict = getLabelSupports(np.array(CLASS_LABELS[learningIndices])) nbTrainingExamples = [[int(support / nbFolds) for support in labelSupports] for fold in range(nbFolds)] trainingExamplesIndices = [] @@ -157,7 +155,7 @@ def getKFoldIndices(nbFolds, CLASS_LABELS, NB_CLASS, learningIndices): for foldIndex, fold in enumerate(nbTrainingExamples): trainingExamplesIndices.append([]) while fold != [0 for i in range(NB_CLASS)]: - index = random.randint(0, len(learningIndices)-1) + index = randomState.randint(0, len(learningIndices) - 1) if learningIndices[index] not in usedIndices: isUseFull, fold = isUseful(fold, learningIndices[index], CLASS_LABELS, labelDict) if isUseFull: 
@@ -174,25 +172,27 @@ def getPositions(labelsUsed, fullLabels): return usedIndices -def getClassicDBcsv(views, pathF, nameDB, NB_CLASS, LABELS_NAMES): - labelsNamesFile = open(pathF+nameDB+'-ClassLabels-Description.csv') - datasetFile = h5py.File(pathF+nameDB+".hdf5", "w") - if len(LABELS_NAMES)!=NB_CLASS: +def getClassicDBcsv(views, pathF, nameDB, NB_CLASS, LABELS_NAMES, randomState): + labelsNamesFile = open(pathF + nameDB + '-ClassLabels-Description.csv') + datasetFile = h5py.File(pathF + nameDB + ".hdf5", "w") + if len(LABELS_NAMES) != NB_CLASS: nbLabelsAvailable = 0 for l in labelsNamesFile: - nbLabelsAvailable+=1 - LABELS_NAMES = [line.strip().split(";")[1] for lineIdx, line in enumerate(labelsNamesFile) if lineIdx in np.random.randint(nbLabelsAvailable, size=NB_CLASS)] + nbLabelsAvailable += 1 + LABELS_NAMES = [line.strip().split(";")[1] for lineIdx, line in enumerate(labelsNamesFile) if + lineIdx in randomState.randint(nbLabelsAvailable, size=NB_CLASS)] fullLabels = np.genfromtxt(pathF + nameDB + '-ClassLabels.csv', delimiter=',').astype(int) labelsDictionary = dict((classIndex, labelName) for (classIndex, labelName) in - [(int(line.strip().split(";")[0]),line.strip().split(";")[1])for lineIndex, line in enumerate(labelsNamesFile) if line.strip().split(";")[0] in LABELS_NAMES]) - if len(set(fullLabels))>NB_CLASS: + [(int(line.strip().split(";")[0]), line.strip().split(";")[1]) for lineIndex, line in + enumerate(labelsNamesFile) if line.strip().split(";")[0] in LABELS_NAMES]) + if len(set(fullLabels)) > NB_CLASS: usedIndices = getPositions(labelsDictionary.keys(), fullLabels) else: usedIndices = range(len(fullLabels)) for viewIndex, view in enumerate(views): viewFile = pathF + nameDB + "-" + view + '.csv' viewMatrix = np.array(np.genfromtxt(viewFile, delimiter=','))[usedIndices, :] - viewDset = datasetFile.create_dataset("View"+str(viewIndex), viewMatrix.shape, data=viewMatrix) + viewDset = datasetFile.create_dataset("View" + str(viewIndex), 
viewMatrix.shape, data=viewMatrix) viewDset.attrs["name"] = view viewDset.attrs["sparse"] = False viewDset.attrs["binary"] = False @@ -206,37 +206,39 @@ def getClassicDBcsv(views, pathF, nameDB, NB_CLASS, LABELS_NAMES): metaDataGrp.attrs["nbClass"] = NB_CLASS metaDataGrp.attrs["datasetLength"] = len(fullLabels[usedIndices]) datasetFile.close() - datasetFile = h5py.File(pathF+nameDB+".hdf5", "r") + datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r") return datasetFile, labelsDictionary -def getClassicDBhdf5(views, pathF, nameDB, NB_CLASS, LABELS_NAMES): - datasetFile = h5py.File(pathF+nameDB+".hdf5", "r") +def getClassicDBhdf5(views, pathF, nameDB, NB_CLASS, LABELS_NAMES, randomState): + datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r") fullLabels = datasetFile.get("Labels") labelsDictionary = dict((labelIndex, labelName) for labelIndex, labelName in - zip(fullLabels.attrs["labels_indices"], fullLabels.attrs["labels"])) + zip(fullLabels.attrs["labels_indices"], fullLabels.attrs["labels"])) return datasetFile, labelsDictionary -def getCaltechDBcsv(views, pathF, nameDB, NB_CLASS, LABELS_NAMES): - datasetFile = h5py.File(pathF+nameDB+".hdf5", "w") - labelsNamesFile = open(pathF+nameDB+'-ClassLabels-Description.csv') - if len(LABELS_NAMES)!=NB_CLASS: +def getCaltechDBcsv(views, pathF, nameDB, NB_CLASS, LABELS_NAMES, randomState): + datasetFile = h5py.File(pathF + nameDB + ".hdf5", "w") + labelsNamesFile = open(pathF + nameDB + '-ClassLabels-Description.csv') + if len(LABELS_NAMES) != NB_CLASS: nbLabelsAvailable = 0 for l in labelsNamesFile: - nbLabelsAvailable+=1 - LABELS_NAMES = [line.strip().split(";")[1] for lineIdx, line in enumerate(labelsNamesFile) if lineIdx in np.random.randint(nbLabelsAvailable, size=NB_CLASS)] + nbLabelsAvailable += 1 + LABELS_NAMES = [line.strip().split(";")[1] for lineIdx, line in enumerate(labelsNamesFile) if + lineIdx in randomState.randint(nbLabelsAvailable, size=NB_CLASS)] fullLabels = np.genfromtxt(pathF + nameDB + 
'-ClassLabels.csv', delimiter=';').astype(int) labelsDictionary = dict((classIndice, labelName) for (classIndice, labelName) in - [(int(line.strip().split(";")[0]),line.strip().split(";")[1])for lineIndex, line in labelsNamesFile if line.strip().split(";")[0] in LABELS_NAMES]) - if len(set(fullLabels))>NB_CLASS: + [(int(line.strip().split(";")[0]), line.strip().split(";")[1]) for lineIndex, line in + enumerate(labelsNamesFile) if line.strip().split(";")[0] in LABELS_NAMES]) + if len(set(fullLabels)) > NB_CLASS: usedIndices = getPositions(labelsDictionary.keys(), fullLabels) else: usedIndices = range(len(fullLabels)) for viewIndex, view in enumerate(views): viewFile = pathF + nameDB + "-" + view + '.csv' viewMatrix = np.array(np.genfromtxt(viewFile, delimiter=';'))[usedIndices, :] - viewDset = datasetFile.create_dataset("View"+str(viewIndex), viewMatrix.shape, data=viewMatrix) + viewDset = datasetFile.create_dataset("View" + str(viewIndex), viewMatrix.shape, data=viewMatrix) viewDset.attrs["name"] = view labelsDset = datasetFile.create_dataset("Labels", fullLabels[usedIndices].shape, data=fullLabels[usedIndices]) @@ -246,16 +248,15 @@ def getCaltechDBcsv(views, pathF, nameDB, NB_CLASS, LABELS_NAMES): metaDataGrp.attrs["nbClass"] = NB_CLASS metaDataGrp.attrs["datasetLength"] = len(fullLabels[usedIndices]) datasetFile.close() - datasetFile = h5py.File(pathF+nameDB+".hdf5", "r") + datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r") return datasetFile, labelsDictionary -def getMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): - - datasetFile = h5py.File(path+"MultiOmic.hdf5", "w") +def getMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES, randomState): + datasetFile = h5py.File(path + "MultiOmic.hdf5", "w") logging.debug("Start:\t Getting Methylation Data") - methylData = np.genfromtxt(path+"matching_methyl.csv", delimiter=',') + methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',') methylDset = datasetFile.create_dataset("View0", 
methylData.shape) methylDset[...] = methylData methylDset.attrs["name"] = "Methyl" @@ -264,16 +265,16 @@ def getMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): logging.debug("Done:\t Getting Methylation Data") logging.debug("Start:\t Getting MiRNA Data") - mirnaData = np.genfromtxt(path+"matching_mirna.csv", delimiter=',') + mirnaData = np.genfromtxt(path + "matching_mirna.csv", delimiter=',') mirnaDset = datasetFile.create_dataset("View1", mirnaData.shape) mirnaDset[...] = mirnaData - mirnaDset.attrs["name"]="MiRNA_" + mirnaDset.attrs["name"] = "MiRNA_" mirnaDset.attrs["sparse"] = False mirnaDset.attrs["binary"] = False logging.debug("Done:\t Getting MiRNA Data") logging.debug("Start:\t Getting RNASeq Data") - rnaseqData = np.genfromtxt(path+"matching_rnaseq.csv", delimiter=',') + rnaseqData = np.genfromtxt(path + "matching_rnaseq.csv", delimiter=',') uselessRows = [] for rowIndex, row in enumerate(np.transpose(rnaseqData)): if not row.any(): @@ -281,13 +282,13 @@ def getMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): usefulRows = [usefulRowIndex for usefulRowIndex in range(rnaseqData.shape[1]) if usefulRowIndex not in uselessRows] rnaseqDset = datasetFile.create_dataset("View2", (rnaseqData.shape[0], len(usefulRows))) rnaseqDset[...] = rnaseqData[:, usefulRows] - rnaseqDset.attrs["name"]="RNASeq_" + rnaseqDset.attrs["name"] = "RNASeq_" rnaseqDset.attrs["sparse"] = False rnaseqDset.attrs["binary"] = False logging.debug("Done:\t Getting RNASeq Data") logging.debug("Start:\t Getting Clinical Data") - clinical = np.genfromtxt(path+"clinicalMatrix.csv", delimiter=',') + clinical = np.genfromtxt(path + "clinicalMatrix.csv", delimiter=',') clinicalDset = datasetFile.create_dataset("View3", clinical.shape) clinicalDset[...] 
= clinical clinicalDset.attrs["name"] = "Clinic" @@ -295,7 +296,7 @@ def getMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): clinicalDset.attrs["binary"] = False logging.debug("Done:\t Getting Clinical Data") - labelFile = open(path+'brca_labels_triple-negatif.csv') + labelFile = open(path + 'brca_labels_triple-negatif.csv') labels = np.array([int(line.strip().split(',')[1]) for line in labelFile]) labelsDset = datasetFile.create_dataset("Labels", labels.shape) labelsDset[...] = labels @@ -305,57 +306,57 @@ def getMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): metaDataGrp.attrs["nbView"] = 4 metaDataGrp.attrs["nbClass"] = 2 metaDataGrp.attrs["datasetLength"] = len(labels) - labelDictionary = {0:"No", 1:"Yes"} + labelDictionary = {0: "No", 1: "Yes"} datasetFile.close() - datasetFile = h5py.File(path+"MultiOmic.hdf5", "r") + datasetFile = h5py.File(path + "MultiOmic.hdf5", "r") # datasetFile = getPseudoRNASeq(datasetFile) return datasetFile, labelDictionary def getVector(nbGenes): - argmax = [0,0] + argmax = [0, 0] maxi = 0 for i in range(nbGenes): for j in range(nbGenes): - if j==i+1: - value = (i+1)*(nbGenes-j) - if value>maxi: - maxi= value - argmax = [i,j] - i,j = argmax + if j == i + 1: + value = (i + 1) * (nbGenes - j) + if value > maxi: + maxi = value + argmax = [i, j] + i, j = argmax vectorLeft = np.zeros(nbGenes, dtype=bool) - vectorLeft[:i+1] = np.ones(i+1, dtype=bool) + vectorLeft[:i + 1] = np.ones(i + 1, dtype=bool) vectorSup = np.zeros(nbGenes, dtype=bool) - vectorSup[j:] = np.ones(nbGenes-j, dtype=bool) + vectorSup[j:] = np.ones(nbGenes - j, dtype=bool) matrixSup = j - matrixInf = nbGenes-j + matrixInf = nbGenes - j return vectorLeft, matrixSup, matrixInf def findClosestPowerOfTwo(factorizationParam): - power=1 - while factorizationParam-power>0: - power = 2*power - if abs(factorizationParam-power)<abs(factorizationParam-power/2): + power = 1 + while factorizationParam - power > 0: + power *= 2 + if abs(factorizationParam - 
power) < abs(factorizationParam - power / 2): return power else: - return power/2 + return power / 2 def easyFactorize(nbGenes, factorizationParam, t=0): - if math.log(factorizationParam+1, 2)%1==0.0: + if math.log(factorizationParam + 1, 2) % 1 == 0.0: pass else: factorizationParam = findClosestPowerOfTwo(factorizationParam) - 1 - if nbGenes==2: + if nbGenes == 2: return 1, np.array([True, False]) - if nbGenes==3: - return 1, np.array([True, True, False]) + if nbGenes == 3: + return 1, np.array([True, True, False]) - if factorizationParam==1: - t=1 + if factorizationParam == 1: + t = 1 return t, getVector(nbGenes)[0] vectorLeft, matrixSup, matrixInf = getVector(nbGenes) @@ -363,14 +364,14 @@ def easyFactorize(nbGenes, factorizationParam, t=0): t_, vectorLeftSup = easyFactorize(matrixSup, (factorizationParam - 1) / 2, t=t) t__, vectorLeftInf = easyFactorize(matrixInf, (factorizationParam - 1) / 2, t=t) - factorLeft = np.zeros((nbGenes,t_+t__+1), dtype=bool) + factorLeft = np.zeros((nbGenes, t_ + t__ + 1), dtype=bool) factorLeft[:matrixSup, :t_] = vectorLeftSup.reshape(factorLeft[:matrixSup, :t_].shape) - if nbGenes%2==1: - factorLeft[matrixInf-1:, t_:t__+t_] = vectorLeftInf.reshape(factorLeft[matrixInf-1:, t_:t__+t_].shape) + if nbGenes % 2 == 1: + factorLeft[matrixInf - 1:, t_:t__ + t_] = vectorLeftInf.reshape(factorLeft[matrixInf - 1:, t_:t__ + t_].shape) else: - factorLeft[matrixInf:, t_:t__+t_] = vectorLeftInf.reshape(factorLeft[matrixInf:, t_:t__+t_].shape) - factorLeft[:, t__+t_] = vectorLeft + factorLeft[matrixInf:, t_:t__ + t_] = vectorLeftInf.reshape(factorLeft[matrixInf:, t_:t__ + t_].shape) + factorLeft[:, t__ + t_] = vectorLeft # factorSup = np.zeros((t_+t__+1, nbGenes), dtype=bool) # @@ -380,38 +381,40 @@ def easyFactorize(nbGenes, factorizationParam, t=0): # else: # factorSup[t_:t__+t_, matrixInf:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf:].shape) # factorSup[t__+t_, :] = vectorSup - return t__+t_+1, factorLeft#, factorSup + return t__ 
+ t_ + 1, factorLeft # , factorSup def getBaseMatrices(nbGenes, factorizationParam, path): t, factorLeft = easyFactorize(nbGenes, factorizationParam) - np.savetxt(path+"factorLeft--n-"+str(nbGenes)+"--k-"+str(factorizationParam)+".csv", factorLeft, delimiter=",") + np.savetxt(path + "factorLeft--n-" + str(nbGenes) + "--k-" + str(factorizationParam) + ".csv", factorLeft, + delimiter=",") return factorLeft -def findParams(arrayLen, nbPatients, maxNbBins=2000, minNbBins = 10, maxLenBin=70000, minOverlapping=1, minNbBinsOverlapped=0, maxNbSolutions=30): +def findParams(arrayLen, nbPatients, randomState, maxNbBins=2000, minNbBins=10, maxLenBin=70000, minOverlapping=1, + minNbBinsOverlapped=0, maxNbSolutions=30): results = [] - if arrayLen*arrayLen*10/100>minNbBinsOverlapped*nbPatients: - for lenBin in range(arrayLen-1): - lenBin = lenBin+1 - if lenBin<maxLenBin and minNbBins*lenBin<arrayLen: - for overlapping in sorted(range(lenBin-1), reverse=True): - overlapping = overlapping+1 - if overlapping>minOverlapping and lenBin%(lenBin-overlapping)==0: - for nbBins in sorted(range(arrayLen-1), reverse=True): - nbBins = nbBins+1 - if nbBins<maxNbBins: - if arrayLen == (nbBins-1)*(lenBin-overlapping)+lenBin: - results.append({"nbBins":nbBins, "overlapping":overlapping, "lenBin":lenBin}) - if len(results)==maxNbSolutions: - params = results[random.randrange(len(results))] + if arrayLen * arrayLen * 10 / 100 > minNbBinsOverlapped * nbPatients: + for lenBin in range(arrayLen - 1): + lenBin += 1 + if lenBin < maxLenBin and minNbBins * lenBin < arrayLen: + for overlapping in sorted(range(lenBin - 1), reverse=True): + overlapping += 1 + if overlapping > minOverlapping and lenBin % (lenBin - overlapping) == 0: + for nbBins in sorted(range(arrayLen - 1), reverse=True): + nbBins += 1 + if nbBins < maxNbBins: + if arrayLen == (nbBins - 1) * (lenBin - overlapping) + lenBin: + results.append({"nbBins": nbBins, "overlapping": overlapping, "lenBin": lenBin}) + if len(results) == 
maxNbSolutions: + params = results[randomState.randint(len(results))] return params def findBins(nbBins=142, overlapping=493, lenBin=986): bins = [] for binIndex in range(nbBins): - bins.append([i+binIndex*(lenBin-overlapping) for i in range(lenBin)]) + bins.append([i + binIndex * (lenBin - overlapping) for i in range(lenBin)]) return bins @@ -419,56 +422,60 @@ def getBins(array, bins, lenBin, overlapping): binnedcoord = [] for coordIndex, coord in enumerate(array): nbBinsFull = 0 - for binIndex, bin in enumerate(bins): - if coordIndex in bin: - binnedcoord.append(binIndex+(coord*len(bins))) + for binIndex, bin_ in enumerate(bins): + if coordIndex in bin_: + binnedcoord.append(binIndex + (coord * len(bins))) return np.array(binnedcoord) def makeSortedBinsMatrix(nbBins, lenBins, overlapping, arrayLen, path): sortedBinsMatrix = np.zeros((arrayLen, nbBins), dtype=np.uint8) - step = lenBins-overlapping + step = lenBins - overlapping for binIndex in range(nbBins): - sortedBinsMatrix[step*binIndex:lenBins+(step*binIndex), binIndex] = np.ones(lenBins, dtype=np.uint8) - np.savetxt(path+"sortedBinsMatrix--t-"+str(lenBins)+"--n-"+str(nbBins)+"--c-"+str(overlapping)+".csv", sortedBinsMatrix, delimiter=",") + sortedBinsMatrix[step * binIndex:lenBins + (step * binIndex), binIndex] = np.ones(lenBins, dtype=np.uint8) + np.savetxt(path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", + sortedBinsMatrix, delimiter=",") return sortedBinsMatrix -def makeSparseTotalMatrix(sortedRNASeq): +def makeSparseTotalMatrix(sortedRNASeq, randomState): nbPatients, nbGenes = sortedRNASeq.shape - params = findParams(nbGenes, nbPatients) + params = findParams(nbGenes, nbPatients, randomState) nbBins = params["nbBins"] overlapping = params["overlapping"] lenBin = params["lenBin"] bins = findBins(nbBins, overlapping, lenBin) - sparseFull = sparse.csc_matrix((nbPatients, nbGenes*nbBins)) + sparseFull = sparse.csc_matrix((nbPatients, nbGenes * 
nbBins)) for patientIndex, patient in enumerate(sortedRNASeq): columnIndices = getBins(patient, bins, lenBin, overlapping) - rowIndices = np.zeros(len(columnIndices), dtype=int)+patientIndex + rowIndices = np.zeros(len(columnIndices), dtype=int) + patientIndex data = np.ones(len(columnIndices), dtype=bool) - sparseFull = sparseFull+sparse.csc_matrix((data, (rowIndices, columnIndices)), shape=(nbPatients, nbGenes*nbBins)) + sparseFull = sparseFull + sparse.csc_matrix((data, (rowIndices, columnIndices)), + shape=(nbPatients, nbGenes * nbBins)) return sparseFull def getAdjacenceMatrix(RNASeqRanking, sotredRNASeq, k=2): - k=int(k)/2*2 - indices = np.zeros((RNASeqRanking.shape[0]*k*RNASeqRanking.shape[1]), dtype=int) - data = np.ones((RNASeqRanking.shape[0]*k*RNASeqRanking.shape[1]), dtype=bool) - indptr = np.zeros(RNASeqRanking.shape[0]+1, dtype=int) + k = int(k) / 2 * 2 + indices = np.zeros((RNASeqRanking.shape[0] * k * RNASeqRanking.shape[1]), dtype=int) + data = np.ones((RNASeqRanking.shape[0] * k * RNASeqRanking.shape[1]), dtype=bool) + indptr = np.zeros(RNASeqRanking.shape[0] + 1, dtype=int) nbGenes = RNASeqRanking.shape[1] pointer = 0 for patientIndex in range(RNASeqRanking.shape[0]): for i in range(nbGenes): - for j in range(k/2): + for j in range(k / 2): try: - indices[pointer]=RNASeqRanking[patientIndex, (sotredRNASeq[patientIndex, i]-(j+1))]+i*nbGenes - pointer+=1 + indices[pointer] = RNASeqRanking[ + patientIndex, (sotredRNASeq[patientIndex, i] - (j + 1))] + i * nbGenes + pointer += 1 except: pass try: - indices[pointer]=RNASeqRanking[patientIndex, (sotredRNASeq[patientIndex, i]+(j+1))]+i*nbGenes - pointer+=1 + indices[pointer] = RNASeqRanking[ + patientIndex, (sotredRNASeq[patientIndex, i] + (j + 1))] + i * nbGenes + pointer += 1 except: pass # elif i<=k: @@ -477,18 +484,18 @@ def getAdjacenceMatrix(RNASeqRanking, sotredRNASeq, k=2): # elif i==nbGenes-1: # indices.append(patient[i-1]+patient[i]*nbGenes) # data.append(True) - indptr[patientIndex+1] = pointer 
+ indptr[patientIndex + 1] = pointer - mat = sparse.csr_matrix((data, indices, indptr), shape=(RNASeqRanking.shape[0], RNASeqRanking.shape[1]*RNASeqRanking.shape[1]), dtype=bool) + mat = sparse.csr_matrix((data, indices, indptr), + shape=(RNASeqRanking.shape[0], RNASeqRanking.shape[1] * RNASeqRanking.shape[1]), dtype=bool) return mat def getKMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): - - datasetFile = h5py.File(path+"KMultiOmic.hdf5", "w") + datasetFile = h5py.File(path + "KMultiOmic.hdf5", "w") # logging.debug("Start:\t Getting Methylation Data") - methylData = np.genfromtxt(path+"matching_methyl.csv", delimiter=',') + methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',') logging.debug("Done:\t Getting Methylation Data") logging.debug("Start:\t Getting Sorted Methyl Data") @@ -504,20 +511,22 @@ def getKMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): MethylRanking[exampleIndex, sortedMethylIndicesArray[geneIndex]] = geneIndex logging.debug("Done:\t Getting Sorted Methyl Data") - logging.debug("Start:\t Getting Binarized Methyl Data") - k=findClosestPowerOfTwo(9)-1 + k = findClosestPowerOfTwo(9) - 1 try: - factorizedLeftBaseMatrix = np.genfromtxt(path+"factorLeft--n-"+str(methylData.shape[1])+"--k-"+str(k)+".csv", delimiter=',') + factorizedLeftBaseMatrix = np.genfromtxt( + path + "factorLeft--n-" + str(methylData.shape[1]) + "--k-" + str(k) + ".csv", delimiter=',') except: factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path) - bMethylDset = datasetFile.create_dataset("View0", (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1]*k), dtype=np.uint8) + bMethylDset = datasetFile.create_dataset("View0", + (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k), + dtype=np.uint8) for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8) for lineIndex, geneIndex in 
enumerate(patientSortedArray): - patientMatrix[geneIndex]= factorizedLeftBaseMatrix[lineIndex,:] + patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :] bMethylDset[patientIndex] = patientMatrix.flatten() - bMethylDset.attrs["name"] = "BMethyl"+str(k) + bMethylDset.attrs["name"] = "BMethyl" + str(k) bMethylDset.attrs["sparse"] = False bMethylDset.attrs["binary"] = True logging.debug("Done:\t Getting Binarized Methyl Data") @@ -527,33 +536,39 @@ def getKMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): nbBins = 9 overlapping = 463 try: - sortedBinsMatrix = np.genfromtxt(path+"sortedBinsMatrix--t-"+str(lenBins)+"--n-"+str(nbBins)+"--c-"+str(overlapping)+".csv", delimiter=",") + sortedBinsMatrix = np.genfromtxt( + path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", + delimiter=",") except: sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, methylData.shape[1], path) - binnedMethyl = datasetFile.create_dataset("View1", (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1]*nbBins), dtype=np.uint8) + binnedMethyl = datasetFile.create_dataset("View1", ( + sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8) for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8) for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex]= sortedBinsMatrix[lineIndex,:] + patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :] binnedMethyl[patientIndex] = patientMatrix.flatten() - binnedMethyl.attrs["name"] = "bMethyl"+str(nbBins) + binnedMethyl.attrs["name"] = "bMethyl" + str(nbBins) binnedMethyl.attrs["sparse"] = False binnedMethyl.attrs["binary"] = True logging.debug("Done:\t Getting Binned Methyl Data") logging.debug("Start:\t Getting Binarized Methyl Data") - k=findClosestPowerOfTwo(17)-1 + k = 
findClosestPowerOfTwo(17) - 1 try: - factorizedLeftBaseMatrix = np.genfromtxt(path+"factorLeft--n-"+str(methylData.shape[1])+"--k-"+str(k)+".csv", delimiter=',') + factorizedLeftBaseMatrix = np.genfromtxt( + path + "factorLeft--n-" + str(methylData.shape[1]) + "--k-" + str(k) + ".csv", delimiter=',') except: factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path) - bMethylDset = datasetFile.create_dataset("View2", (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1]*k), dtype=np.uint8) + bMethylDset = datasetFile.create_dataset("View2", + (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k), + dtype=np.uint8) for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8) for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex]= factorizedLeftBaseMatrix[lineIndex,:] + patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :] bMethylDset[patientIndex] = patientMatrix.flatten() - bMethylDset.attrs["name"] = "BMethyl"+str(k) + bMethylDset.attrs["name"] = "BMethyl" + str(k) bMethylDset.attrs["sparse"] = False bMethylDset.attrs["binary"] = True logging.debug("Done:\t Getting Binarized Methyl Data") @@ -563,24 +578,24 @@ def getKMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): nbBins = 16 overlapping = 442 try: - sortedBinsMatrix = np.genfromtxt(path+"sortedBinsMatrix--t-"+str(lenBins)+"--n-"+str(nbBins)+"--c-"+str(overlapping)+".csv", delimiter=",") + sortedBinsMatrix = np.genfromtxt( + path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", + delimiter=",") except: sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, methylData.shape[1], path) - binnedMethyl = datasetFile.create_dataset("View3", (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1]*nbBins), dtype=np.uint8) + binnedMethyl = 
datasetFile.create_dataset("View3", ( + sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8) for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8) for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex]= sortedBinsMatrix[lineIndex,:] + patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :] binnedMethyl[patientIndex] = patientMatrix.flatten() - binnedMethyl.attrs["name"] = "bMethyl"+str(nbBins) + binnedMethyl.attrs["name"] = "bMethyl" + str(nbBins) binnedMethyl.attrs["sparse"] = False binnedMethyl.attrs["binary"] = True logging.debug("Done:\t Getting Binned Methyl Data") - - - - labelFile = open(path+'brca_labels_triple-negatif.csv') + labelFile = open(path + 'brca_labels_triple-negatif.csv') labels = np.array([int(line.strip().split(',')[1]) for line in labelFile]) labelsDset = datasetFile.create_dataset("Labels", labels.shape) labelsDset[...] 
= labels @@ -590,26 +605,25 @@ def getKMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): metaDataGrp.attrs["nbView"] = 4 metaDataGrp.attrs["nbClass"] = 2 metaDataGrp.attrs["datasetLength"] = len(labels) - labelDictionary = {0:"No", 1:"Yes"} + labelDictionary = {0: "No", 1: "Yes"} datasetFile.close() - datasetFile = h5py.File(path+"KMultiOmic.hdf5", "r") + datasetFile = h5py.File(path + "KMultiOmic.hdf5", "r") return datasetFile, labelDictionary def getKMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES): - datasetFile = h5py.File(path+"KMultiOmic.hdf5", "r") - labelDictionary = {0:"No", 1:"Yes"} + datasetFile = h5py.File(path + "KMultiOmic.hdf5", "r") + labelDictionary = {0: "No", 1: "Yes"} return datasetFile, labelDictionary def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): - - datasetFile = h5py.File(path+"ModifiedMultiOmic.hdf5", "w") + datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "w") logging.debug("Start:\t Getting Methylation Data") - methylData = np.genfromtxt(path+"matching_methyl.csv", delimiter=',') + methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',') methylDset = datasetFile.create_dataset("View0", methylData.shape) methylDset[...] 
= methylData methylDset.attrs["name"] = "Methyl_" @@ -634,18 +648,20 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): mMethylDset.attrs["binary"] = False logging.debug("Done:\t Getting Sorted Methyl Data") - logging.debug("Start:\t Getting Binarized Methyl Data") - k=findClosestPowerOfTwo(58)-1 + k = findClosestPowerOfTwo(58) - 1 try: - factorizedLeftBaseMatrix = np.genfromtxt(path+"factorLeft--n-"+str(datasetFile.get("View0").shape[1])+"--k-"+str(k)+".csv", delimiter=',') + factorizedLeftBaseMatrix = np.genfromtxt( + path + "factorLeft--n-" + str(datasetFile.get("View0").shape[1]) + "--k-" + str(k) + ".csv", delimiter=',') except: factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path) - bMethylDset = datasetFile.create_dataset("View11", (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1]*k), dtype=np.uint8) + bMethylDset = datasetFile.create_dataset("View11", + (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k), + dtype=np.uint8) for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8) for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex]= factorizedLeftBaseMatrix[lineIndex,:] + patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :] bMethylDset[patientIndex] = patientMatrix.flatten() bMethylDset.attrs["name"] = "BMethyl" bMethylDset.attrs["sparse"] = False @@ -657,14 +673,17 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): nbBins = 58 overlapping = 1676 try: - sortedBinsMatrix = np.genfromtxt(path+"sortedBinsMatrix--t-"+str(lenBins)+"--n-"+str(nbBins)+"--c-"+str(overlapping)+".csv", delimiter=",") + sortedBinsMatrix = np.genfromtxt( + path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", + delimiter=",") except: sortedBinsMatrix = 
makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View0").shape[1], path) - binnedMethyl = datasetFile.create_dataset("View12", (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1]*nbBins), dtype=np.uint8) + binnedMethyl = datasetFile.create_dataset("View12", ( + sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8) for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8) for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex]= sortedBinsMatrix[lineIndex,:] + patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :] binnedMethyl[patientIndex] = patientMatrix.flatten() binnedMethyl.attrs["name"] = "bMethyl" binnedMethyl.attrs["sparse"] = False @@ -672,11 +691,11 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): logging.debug("Done:\t Getting Binned Methyl Data") logging.debug("Start:\t Getting MiRNA Data") - mirnaData = np.genfromtxt(path+"matching_mirna.csv", delimiter=',') + mirnaData = np.genfromtxt(path + "matching_mirna.csv", delimiter=',') mirnaDset = datasetFile.create_dataset("View1", mirnaData.shape) mirnaDset[...] 
= mirnaData - mirnaDset.attrs["name"]="MiRNA__" - mirnaDset.attrs["sparse"]=False + mirnaDset.attrs["name"] = "MiRNA__" + mirnaDset.attrs["sparse"] = False mirnaDset.attrs["binary"] = False logging.debug("Done:\t Getting MiRNA Data") @@ -697,18 +716,20 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): mmirnaDset.attrs["binary"] = False logging.debug("Done:\t Getting Sorted MiRNA Data") - logging.debug("Start:\t Getting Binarized MiRNA Data") - k=findClosestPowerOfTwo(517)-1 + k = findClosestPowerOfTwo(517) - 1 try: - factorizedLeftBaseMatrix = np.genfromtxt(path+"factorLeft--n-"+str(datasetFile.get("View1").shape[1])+"--k-"+str(k)+".csv", delimiter=',') + factorizedLeftBaseMatrix = np.genfromtxt( + path + "factorLeft--n-" + str(datasetFile.get("View1").shape[1]) + "--k-" + str(k) + ".csv", delimiter=',') except: factorizedLeftBaseMatrix = getBaseMatrices(mirnaData.shape[1], k, path) - bmirnaDset = datasetFile.create_dataset("View8", (sortedMiRNAGeneIndices.shape[0], sortedMiRNAGeneIndices.shape[1]*k), dtype=np.uint8) + bmirnaDset = datasetFile.create_dataset("View8", + (sortedMiRNAGeneIndices.shape[0], sortedMiRNAGeneIndices.shape[1] * k), + dtype=np.uint8) for patientIndex, patientSortedArray in enumerate(sortedMiRNAGeneIndices): patientMatrix = np.zeros((sortedMiRNAGeneIndices.shape[1], k), dtype=np.uint8) for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex]= factorizedLeftBaseMatrix[lineIndex,:] + patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :] bmirnaDset[patientIndex] = patientMatrix.flatten() bmirnaDset.attrs["name"] = "BMiRNA_" bmirnaDset.attrs["sparse"] = False @@ -720,14 +741,17 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): nbBins = 517 overlapping = 12 try: - sortedBinsMatrix = np.genfromtxt(path+"sortedBinsMatrix--t-"+str(lenBins)+"--n-"+str(nbBins)+"--c-"+str(overlapping)+".csv", delimiter=",") + sortedBinsMatrix = np.genfromtxt( + path + 
"sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", + delimiter=",") except: sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View1").shape[1], path) - binnedMiRNA = datasetFile.create_dataset("View9", (sortedMiRNAGeneIndices.shape[0], sortedMiRNAGeneIndices.shape[1]*nbBins), dtype=np.uint8) + binnedMiRNA = datasetFile.create_dataset("View9", ( + sortedMiRNAGeneIndices.shape[0], sortedMiRNAGeneIndices.shape[1] * nbBins), dtype=np.uint8) for patientIndex, patientSortedArray in enumerate(sortedMiRNAGeneIndices): patientMatrix = np.zeros((sortedMiRNAGeneIndices.shape[1], nbBins), dtype=np.uint8) for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex]= sortedBinsMatrix[lineIndex,:] + patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :] binnedMiRNA[patientIndex] = patientMatrix.flatten() binnedMiRNA.attrs["name"] = "bMiRNA_" binnedMiRNA.attrs["sparse"] = False @@ -735,7 +759,7 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): logging.debug("Done:\t Getting Binned MiRNA Data") logging.debug("Start:\t Getting RNASeq Data") - rnaseqData = np.genfromtxt(path+"matching_rnaseq.csv", delimiter=',') + rnaseqData = np.genfromtxt(path + "matching_rnaseq.csv", delimiter=',') uselessRows = [] for rowIndex, row in enumerate(np.transpose(rnaseqData)): if not row.any(): @@ -743,8 +767,8 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): usefulRows = [usefulRowIndex for usefulRowIndex in range(rnaseqData.shape[1]) if usefulRowIndex not in uselessRows] rnaseqDset = datasetFile.create_dataset("View2", (rnaseqData.shape[0], len(usefulRows))) rnaseqDset[...] 
= rnaseqData[:, usefulRows] - rnaseqDset.attrs["name"]="RNASeq_" - rnaseqDset.attrs["sparse"]=False + rnaseqDset.attrs["name"] = "RNASeq_" + rnaseqDset.attrs["sparse"] = False rnaseqDset.attrs["binary"] = False logging.debug("Done:\t Getting RNASeq Data") @@ -766,16 +790,20 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): logging.debug("Done:\t Getting Sorted RNASeq Data") logging.debug("Start:\t Getting Binarized RNASeq Data") - k=findClosestPowerOfTwo(100)-1 + k = findClosestPowerOfTwo(100) - 1 try: - factorizedLeftBaseMatrix = np.genfromtxt(path+"factorLeft--n-"+str(datasetFile.get("View2").shape[1])+"--k-"+str(100)+".csv", delimiter=',') + factorizedLeftBaseMatrix = np.genfromtxt( + path + "factorLeft--n-" + str(datasetFile.get("View2").shape[1]) + "--k-" + str(100) + ".csv", + delimiter=',') except: factorizedLeftBaseMatrix = getBaseMatrices(rnaseqData.shape[1], k, path) - brnaseqDset = datasetFile.create_dataset("View5", (sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1]*k), dtype=np.uint8) + brnaseqDset = datasetFile.create_dataset("View5", + (sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1] * k), + dtype=np.uint8) for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices): patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], k), dtype=np.uint8) for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex]= factorizedLeftBaseMatrix[lineIndex,:] + patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :] brnaseqDset[patientIndex] = patientMatrix.flatten() brnaseqDset.attrs["name"] = "BRNASeq" brnaseqDset.attrs["sparse"] = False @@ -787,14 +815,17 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): nbBins = 142 overlapping = 493 try: - sortedBinsMatrix = np.genfromtxt(path+"sortedBinsMatrix--t-"+str(lenBins)+"--n-"+str(nbBins)+"--c-"+str(overlapping)+".csv", delimiter=",") + sortedBinsMatrix = np.genfromtxt( + 
path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", + delimiter=",") except: sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View2").shape[1], path) - binnedRNASeq = datasetFile.create_dataset("View6", (sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1]*nbBins), dtype=np.uint8) + binnedRNASeq = datasetFile.create_dataset("View6", ( + sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1] * nbBins), dtype=np.uint8) for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices): patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], nbBins), dtype=np.uint8) for lineIndex, geneIndex in enumerate(patientSortedArray): - patientMatrix[geneIndex]= sortedBinsMatrix[lineIndex,:] + patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :] binnedRNASeq[patientIndex] = patientMatrix.flatten() binnedRNASeq.attrs["name"] = "bRNASeq" binnedRNASeq.attrs["sparse"] = False @@ -802,7 +833,7 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): logging.debug("Done:\t Getting Binned RNASeq Data") logging.debug("Start:\t Getting Clinical Data") - clinical = np.genfromtxt(path+"clinicalMatrix.csv", delimiter=',') + clinical = np.genfromtxt(path + "clinicalMatrix.csv", delimiter=',') clinicalDset = datasetFile.create_dataset("View3", clinical.shape) clinicalDset[...] 
= clinical clinicalDset.attrs["name"] = "Clinic_" @@ -811,15 +842,16 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): logging.debug("Done:\t Getting Clinical Data") logging.debug("Start:\t Getting Binarized Clinical Data") - binarized_clinical = np.zeros((347,1951), dtype=np.uint8) + binarized_clinical = np.zeros((347, 1951), dtype=np.uint8) nb_already_done = 0 for feqtureIndex, feature in enumerate(np.transpose(clinical)): featureSet = set(feature) - featureDict = dict((val,valIndex) for valIndex, val in enumerate(list(featureSet))) + featureDict = dict((val, valIndex) for valIndex, val in enumerate(list(featureSet))) for valueIndex, value in enumerate(feature): - binarized_clinical[valueIndex, featureDict[value]+nb_already_done] = 1 - nb_already_done+= len(featureSet) - bClinicalDset = datasetFile.create_dataset("View13", binarized_clinical.shape, dtype=np.uint8, data=binarized_clinical) + binarized_clinical[valueIndex, featureDict[value] + nb_already_done] = 1 + nb_already_done += len(featureSet) + bClinicalDset = datasetFile.create_dataset("View13", binarized_clinical.shape, dtype=np.uint8, + data=binarized_clinical) bClinicalDset.attrs["name"] = "bClinic" bClinicalDset.attrs["sparse"] = False bClinicalDset.attrs["binary"] = True @@ -829,14 +861,16 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): # sparseAdjRNASeq = getAdjacenceMatrix(RNASeqRanking, sortedRNASeqGeneIndices, k=findClosestPowerOfTwo(10)-1) # sparseAdjRNASeqGrp = datasetFile.create_group("View6") # dataDset = sparseAdjRNASeqGrp.create_dataset("data", sparseAdjRNASeq.data.shape, data=sparseAdjRNASeq.data) - # indicesDset = sparseAdjRNASeqGrp.create_dataset("indices", sparseAdjRNASeq.indices.shape, data=sparseAdjRNASeq.indices) - # indptrDset = sparseAdjRNASeqGrp.create_dataset("indptr", sparseAdjRNASeq.indptr.shape, data=sparseAdjRNASeq.indptr) + # indicesDset = sparseAdjRNASeqGrp.create_dataset("indices", + # 
sparseAdjRNASeq.indices.shape, data=sparseAdjRNASeq.indices) + # indptrDset = sparseAdjRNASeqGrp.create_dataset("indptr", + # sparseAdjRNASeq.indptr.shape, data=sparseAdjRNASeq.indptr) # sparseAdjRNASeqGrp.attrs["name"]="ARNASeq" # sparseAdjRNASeqGrp.attrs["sparse"]=True # sparseAdjRNASeqGrp.attrs["shape"]=sparseAdjRNASeq.shape # logging.debug("Done:\t Getting Adjacence RNASeq Data") - labelFile = open(path+'brca_labels_triple-negatif.csv') + labelFile = open(path + 'brca_labels_triple-negatif.csv') labels = np.array([int(line.strip().split(',')[1]) for line in labelFile]) labelsDset = datasetFile.create_dataset("Labels", labels.shape) labelsDset[...] = labels @@ -846,32 +880,32 @@ def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): metaDataGrp.attrs["nbView"] = 14 metaDataGrp.attrs["nbClass"] = 2 metaDataGrp.attrs["datasetLength"] = len(labels) - labelDictionary = {0:"No", 1:"Yes"} + labelDictionary = {0: "No", 1: "Yes"} datasetFile.close() - datasetFile = h5py.File(path+"ModifiedMultiOmic.hdf5", "r") + datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "r") return datasetFile, labelDictionary def getModifiedMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES): - datasetFile = h5py.File(path+"ModifiedMultiOmic.hdf5", "r") - labelDictionary = {0:"No", 1:"Yes"} + datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "r") + labelDictionary = {0: "No", 1: "Yes"} return datasetFile, labelDictionary def getMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES): - datasetFile = h5py.File(path+"MultiOmic.hdf5", "r") - labelDictionary = {0:"No", 1:"Yes"} + datasetFile = h5py.File(path + "MultiOmic.hdf5", "r") + labelDictionary = {0: "No", 1: "Yes"} return datasetFile, labelDictionary def copyHDF5(pathF, name, nbCores): - datasetFile = h5py.File(pathF+name+".hdf5", "r") + datasetFile = h5py.File(pathF + name + ".hdf5", "r") for coreIndex in range(nbCores): - newDataSet = h5py.File(pathF+name+str(coreIndex)+".hdf5", "w") + 
newDataSet = h5py.File(pathF + name + str(coreIndex) + ".hdf5", "w") for dataset in datasetFile: - datasetFile.copy("/"+dataset, newDataSet["/"]) + datasetFile.copy("/" + dataset, newDataSet["/"]) newDataSet.close() @@ -879,13 +913,13 @@ def datasetsAlreadyExist(pathF, name, nbCores): allDatasetExist = True for coreIndex in range(nbCores): import os.path - allDatasetExist *= os.path.isfile(pathF+name+str(coreIndex)+".hdf5") + allDatasetExist *= os.path.isfile(pathF + name + str(coreIndex) + ".hdf5") return allDatasetExist def deleteHDF5(pathF, name, nbCores): for coreIndex in range(nbCores): - os.remove(pathF+name+str(coreIndex)+".hdf5") + os.remove(pathF + name + str(coreIndex) + ".hdf5") # def getOneViewFromDB(viewName, pathToDB, DBName): # view = np.genfromtxt(pathToDB + DBName +"-" + viewName, delimiter=';') @@ -1016,7 +1050,8 @@ def deleteHDF5(pathF, name, nbCores): # for i in xrange(nbGenes): # for j in xrange(nbGenes): # if i > j: -# pseudoRNASeq[exampleIndex, arrayIndex] = dataset["/View2/matrix"][exampleIndex, j] < dataset["/View2/matrix"][exampleIndex, i] +# pseudoRNASeq[exampleIndex, arrayIndex] = +# dataset["/View2/matrix"][exampleIndex, j] < dataset["/View2/matrix"][exampleIndex, i] # arrayIndex += 1 # dataset["/View4/matrix"] = pseudoRNASeq # dataset["/View4/name"] = "pseudoRNASeq" @@ -1030,4 +1065,3 @@ def deleteHDF5(pathF, name, nbCores): # if i != value: # areAllSame = False # return areAllSame -