Commit e3e36cb1 authored by bbauvin

Mumbo can use as many classifiers as needed and the grid search is now sklearn's

parent ff0b76cd
@@ -537,13 +537,17 @@ groupSCM.add_argument('--CL_SCM_model_type', metavar='STRING', action='store',
 groupMumbo = parser.add_argument_group('Mumbo arguments')
 groupMumbo.add_argument('--MU_types', metavar='STRING', action='store', nargs="+",
                         help='Determine which monoview classifier to use with Mumbo',
-                        default=['DecisionTree', 'DecisionTree', 'DecisionTree'])
+                        default=[''])
 groupMumbo.add_argument('--MU_config', metavar='STRING', action='store', nargs='+',
-                        help='Configuration for the monoview classifier in Mumbo',
-                        default=['2:0.5', '2:0.5', '2:0.5'])
+                        help='Configuration for the monoview classifiers in Mumbo; separate classifiers with a space and their arguments with ":"',
+                        default=[''])
 groupMumbo.add_argument('--MU_iter', metavar='INT', action='store', nargs=3,
                         help='Max number of iterations, min number of iterations, convergence threshold', type=float,
                         default=[10, 1, 0.01])
+groupMumbo.add_argument('--MU_combination', action='store_true',
+                        help='Try all the monoview classifier combinations for each view',
+                        default=False)
 groupFusion = parser.add_argument_group('Fusion arguments')
 groupFusion.add_argument('--FU_types', metavar='STRING', action='store', nargs="+",
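The reworked help string says each entry of --MU_config is one space-separated token whose fields are separated by ':'. A minimal sketch of how such tokens could be split into per-classifier argument lists; the field order (depth, criterion, splitter, subSampling) is taken from the DecisionTree getKWARGS further down this diff, and the example values are made up:

# Illustrative values, not defaults from the repository.
mu_types = ["DecisionTree", "DecisionTree"]
mu_config = ["3:gini:best:1.0", "1:entropy:random:0.5"]

# One argument list per classifier, in the same order as --MU_types.
configs = [token.split(":") for token in mu_config]
for name, argList in zip(mu_types, configs):
    print(name + " -> " + str(argList))
# DecisionTree -> ['3', 'gini', 'best', '1.0']
# DecisionTree -> ['1', 'entropy', 'random', '0.5']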
@@ -670,6 +674,15 @@ if statsIter > 1:
     else:
         iterResults = []
         for iterIndex in range(statsIter):
+            if not os.path.exists(os.path.dirname(directories[iterIndex] + "train_labels.csv")):
+                try:
+                    os.makedirs(os.path.dirname(directories[iterIndex] + "train_labels.csv"))
+                except OSError as exc:
+                    if exc.errno != errno.EEXIST:
+                        raise
+            trainIndices, testIndices = classificationIndices[iterIndex]
+            trainLabels = DATASET.get("Labels").value[trainIndices]
+            np.savetxt(directories[iterIndex] + "train_labels.csv", trainLabels, delimiter=",")
             iterResults.append(
                 classifyOneIter(LABELS_DICTIONARY, argumentDictionaries, nbCores, directories[iterIndex], args,
                                 classificationIndices[iterIndex], kFolds[iterIndex], statsIterRandomStates[iterIndex],
@@ -678,6 +691,15 @@ if statsIter > 1:
     analyzeIterResults(iterResults, args.name, metrics, directory)
 else:
+    if not os.path.exists(os.path.dirname(directories + "train_labels.csv")):
+        try:
+            os.makedirs(os.path.dirname(directories + "train_labels.csv"))
+        except OSError as exc:
+            if exc.errno != errno.EEXIST:
+                raise
+    trainIndices, testIndices = classificationIndices
+    trainLabels = DATASET.get("Labels").value[trainIndices]
+    np.savetxt(directories + "train_labels.csv", trainLabels, delimiter=",")
     res = classifyOneIter(LABELS_DICTIONARY, argumentDictionaries, nbCores, directories, args, classificationIndices,
                           kFolds,
                           statsIterRandomStates, hyperParamSearch, metrics, DATASET, viewsIndices, dataBaseTime, start,
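Both branches now repeat the same "create the output directory if needed, then dump the training labels" block. A small helper along these lines (the name is mine, not part of the commit) would keep the two call sites identical:

import errno
import os

import numpy as np


def saveTrainLabels(directory, trainLabels):
    # Write train_labels.csv in `directory`, creating the directory if it is missing.
    path = directory + "train_labels.csv"
    if not os.path.exists(os.path.dirname(path)):
        try:
            os.makedirs(os.path.dirname(path))
        except OSError as exc:
            # A concurrent process may have created the directory in the meantime.
            if exc.errno != errno.EEXIST:
                raise
    np.savetxt(path, trainLabels, delimiter=",")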
@@ -66,7 +66,7 @@ def ExecMultiview(directory, DATASET, name, classificationIndices, KFolds, nbCor
     else:
         classifier = classifierClass(randomState, NB_CORES=nbCores, **classificationKWARGS)
-    classifier.fit_hdf5(DATASET, trainIndices=learningIndices, viewsIndices=viewsIndices)
+    classifier.fit_hdf5(DATASET, trainIndices=learningIndices, viewsIndices=viewsIndices, metric=metrics[0])
     trainLabels = classifier.predict_hdf5(DATASET, usedIndices=learningIndices, viewsIndices=viewsIndices)
     testLabels = classifier.predict_hdf5(DATASET, usedIndices=validationIndices, viewsIndices=viewsIndices)
     fullLabels = classifier.predict_hdf5(DATASET, viewsIndices=viewsIndices)
@@ -160,7 +160,7 @@ class Fusion:
     def setParams(self, paramsSet):
         self.classifier.setParams(paramsSet)

-    def fit_hdf5(self, DATASET, trainIndices=None, viewsIndices=None):
+    def fit_hdf5(self, DATASET, trainIndices=None, viewsIndices=None, metric=["f1_score", None]):
         self.classifier.fit_hdf5(DATASET, trainIndices=trainIndices, viewsIndices=viewsIndices)

     def predict_hdf5(self, DATASET, usedIndices=None, viewsIndices=None):
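Fusion.fit_hdf5 now takes the metric as a two-element list, [metricName, metricConfig], the convention used throughout this commit; the name is later resolved with getattr against the project's Metrics package (see the DecisionTree changes below). A rough stand-alone sketch of that lookup, using sklearn.metrics as a stand-in for the repository's Metrics module:

import sklearn.metrics

# [metricName, metricConfig]; the config part is unused here because it is None.
metric = ["f1_score", None]
metricFunction = getattr(sklearn.metrics, metric[0])
print(metricFunction([0, 1, 1, 0], [0, 1, 0, 0]))   # ~0.667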
# The sklearn submodules used below must be imported explicitly.
import sklearn.tree
import sklearn.model_selection
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
from ModifiedMulticlass import OneVsRestClassifier
from SubSampling import subSample
import logging

# Add weights
import Metrics


class DecisionTree(BaseEstimator, ClassifierMixin):
    def __init__(self, depth=10, criterion="gini", splitter="best", subSampling=1.0, randomState=None, **kwargs):
        # Accept either explicit keyword arguments or a ready-made kwargs dict
        # (as produced by getKWARGS below).
        if kwargs:
            self.depth = kwargs["depth"]
            self.criterion = kwargs["criterion"]
            self.splitter = kwargs["splitter"]
            self.subSampling = kwargs["subSampling"]
            self.randomState = kwargs["randomState"]
        else:
            self.depth = depth
            self.criterion = criterion
            self.splitter = splitter
            self.subSampling = subSampling
            if randomState is None:
                self.randomState = np.random.RandomState()
            else:
                self.randomState = randomState
        self.decisionTree = sklearn.tree.DecisionTreeClassifier(splitter=self.splitter, criterion=self.criterion,
                                                                 max_depth=self.depth)

    def fit(self, data, labels, sample_weight=None):
        if sample_weight is None:
            sample_weight = np.ones(len(data)) / len(data)
        if self.subSampling != 1.0:
            subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, self.subSampling,
                                                                            self.randomState, weights=sample_weight)
        else:
            subSampledData, subSampledLabels, subSampledWeights = data, labels, sample_weight
        self.decisionTree.fit(subSampledData, subSampledLabels, sample_weight=subSampledWeights)
        return self

    def fit_hdf5(self, data, labels, weights, metric):
        # Boosting-style entry point used by Mumbo: fit on (possibly sub-sampled) weighted data,
        # then score the predictions on the full data and flag the learner as bad if it is
        # worse than chance.
        metricModule = getattr(Metrics, metric[0])
        if metric[1] is not None:
            metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1]))
        else:
            metricKWARGS = {}
        if weights is None:
            weights = np.ones(len(data)) / len(data)
        if self.subSampling != 1.0:
            subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, self.subSampling,
                                                                            self.randomState, weights=weights)
        else:
            subSampledData, subSampledLabels, subSampledWeights = data, labels, weights
        self.decisionTree.fit(subSampledData, subSampledLabels, sample_weight=subSampledWeights)
        prediction = self.decisionTree.predict(data)
        metricKWARGS = {"0": weights}
        averageScore = metricModule.score(labels, prediction, **metricKWARGS)
        if averageScore < 0.5:
            isBad = True
        else:
            isBad = False
        return self.decisionTree, prediction, isBad, averageScore

    def predict(self, data):
        predictedLabels = self.decisionTree.predict(data)
        return predictedLabels

    def get_params(self, deep=True):
        return {"depth": self.depth, "criterion": self.criterion, "splitter": self.splitter,
                "subSampling": self.subSampling}

    def set_params(self, **parameters):
        self.depth = parameters["depth"]
        self.criterion = parameters["criterion"]
        self.splitter = parameters["splitter"]
        self.subSampling = parameters["subSampling"]
        # Rebuild the underlying sklearn tree so the updated hyper-parameters take effect
        # (GridSearchCV sets candidate parameters through this method).
        self.decisionTree = sklearn.tree.DecisionTreeClassifier(splitter=self.splitter, criterion=self.criterion,
                                                                 max_depth=self.depth)
        return self
# def DecisionTree(data, labels, arg, weights, randomState):
# depth = int(arg[0])
# subSampling = float(arg[1])
# if subSampling != 1.0:
# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, subSampling, randomState,
# weights=weights)
# else:
# subSampledData, subSampledLabels, subSampledWeights = data, labels, weights
# isBad = False
# classifier = sklearn.tree.DecisionTreeClassifier(max_depth=depth)
# # classifier = OneVsRestClassifier(tree.DecisionTreeClassifier(max_depth=depth))
# classifier.fit(subSampledData, subSampledLabels, sample_weight=subSampledWeights)
# prediction = classifier.predict(data)
# accuracy = accuracy_score(labels, prediction)
# if accuracy < 0.5:
# isBad = True
#
# return classifier, prediction, isBad, accuracy
def getKWARGS(argList, randomState):
    kwargs = {"depth": int(argList[0]), "criterion": argList[1], "splitter": argList[2],
              "subSampling": float(argList[3]), "randomState": randomState}
    return kwargs
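DecisionTree is now a scikit-learn style estimator (BaseEstimator/ClassifierMixin), so it can be fitted, cloned and grid-searched like any sklearn classifier. A minimal usage sketch with made-up toy data; the config list mirrors the getKWARGS field order above:

import numpy as np

# Toy data, purely illustrative.
rng = np.random.RandomState(42)
data = rng.rand(20, 4)
labels = rng.randint(0, 2, 20)

# Build the keyword dict the way getKWARGS does from one parsed --MU_config token,
# then drive the estimator through the usual scikit-learn interface.
kwargs = getKWARGS(["3", "gini", "best", "1.0"], rng)
clf = DecisionTree(**kwargs).fit(data, labels)
print(clf.predict(data)[:5])
print(clf.get_params())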
def getConfig(classifierConfig):
    try:
        depth = classifierConfig["depth"]
        splitter = classifierConfig["splitter"]
        criterion = classifierConfig["criterion"]
        subSampling = classifierConfig["subSampling"]
        return 'with depth ' + str(depth) + ', ' + \
               'with splitter ' + splitter + ', ' + \
               'with criterion ' + criterion + ', ' + \
               ' sub-sampled at ' + str(subSampling) + ' '
    except KeyError:
        print classifierConfig


def findClosest(scores, base=0.5):
    # Return the index of the score closest to `base`.
    diffToBase = 100.0
    bestSettingsIndex = 0
    for resultIndex, result in enumerate(scores):
        if abs(base - result) < diffToBase:
            diffToBase = abs(base - result)
            bestResult = result
            bestSettingsIndex = resultIndex
    return bestSettingsIndex


def hyperParamSearch(data, labels, randomState, metric=["accuracy_score", None], nbSubSamplingTests=20):
    metricModule = getattr(Metrics, metric[0])
    if metric[1] is not None:
        metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1]))
    else:
        metricKWARGS = {}
    scorer = metricModule.get_scorer(**metricKWARGS)
    subSamplingRatios = np.arange(nbSubSamplingTests, dtype=float) / nbSubSamplingTests
    maxDepths = np.arange(1) + 1
    criterions = ["gini", "entropy"]
    splitters = ["best", "random"]
    parameters = {"depth": maxDepths, "criterion": criterions, "splitter": splitters,
                  "subSampling": subSamplingRatios}
    classifier = DecisionTree()
    grid = sklearn.model_selection.GridSearchCV(classifier, parameters, scoring=scorer)
    grid.fit(data, labels)
    GSSubSamplingRatios = grid.cv_results_["param_subSampling"]
    GSMaxDepths = grid.cv_results_["param_depth"]
    GSCriterions = grid.cv_results_["param_criterion"]
    GSSplitters = grid.cv_results_["param_splitter"]
    GSScores = grid.cv_results_["mean_test_score"]
    configIndex = findClosest(GSScores)
    return {"depth": GSMaxDepths[configIndex], "criterion": GSCriterions[configIndex],
            "splitter": GSSplitters[configIndex], "subSampling": GSSubSamplingRatios[configIndex],
            "randomState": randomState}

# bestSettings = []
# bestResults = []
# classifier = sklearn.tree.DecisionTreeClassifier(max_depth=1)
# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, 0.05, randomState)
# classifier.fit(subSampledData, subSampledLabels)
# prediction = classifier.predict(data)
# preliminary_accuracy = accuracy_score(labels, prediction)
# if preliminary_accuracy < 0.50:
# for max_depth in np.arange(10) + 1:
# for subSampling in sorted((np.arange(20, dtype=float) + 1) / 20, reverse=True):
# if subSampling > minSubSampling:
# accuracies = np.zeros(50)
# for i in range(50):
# if subSampling != 1.0:
# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, subSampling,
# randomState)
# else:
# subSampledData, subSampledLabels, = data, labels
# classifier = tree.DecisionTreeClassifier(max_depth=max_depth)
# classifier.fit(subSampledData, subSampledLabels)
# prediction = classifier.predict(data)
# accuracies[i] = accuracy_score(labels, prediction)
# accuracy = np.mean(accuracies)
# if 0.5 < accuracy < 0.60:
# bestSettings.append([max_depth, subSampling])
# bestResults.append(accuracy)
# else:
# preliminary_accuracies = np.zeros(50)
# if minSubSampling < 0.01:
# for i in range(50):
# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, 0.01, randomState)
# classifier.fit(subSampledData, subSampledLabels)
# prediction = classifier.predict(data)
# preliminary_accuracies[i] = accuracy_score(labels, prediction)
# preliminary_accuracy = np.mean(preliminary_accuracies)
# if preliminary_accuracy < 0.50:
# for subSampling in sorted((np.arange(19, dtype=float) + 1) / 200, reverse=True):
# if minSubSampling < subSampling:
# accuracies = np.zeros(50)
# for i in range(50):
# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, subSampling,
# randomState)
# classifier = tree.DecisionTreeClassifier(max_depth=1)
# classifier.fit(subSampledData, subSampledLabels)
# prediction = classifier.predict(data)
# accuracies[i] = accuracy_score(labels, prediction)
# accuracy = np.mean(accuracies)
# if 0.5 < accuracy < 0.60:
# bestSettings.append([1, subSampling])
# bestResults.append(accuracy)
# else:
# for subSampling in sorted((np.arange(19, dtype=float) + 1) / 2000, reverse=True):
# accuracies = np.zeros(50)
# for i in range(50):
# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, subSampling,
# randomState)
# if minSubSampling < subSampling:
# classifier1 = tree.DecisionTreeClassifier(max_depth=1)
# classifier1.fit(subSampledData, subSampledLabels)
# prediction = classifier1.predict(data)
# accuracies[i] = accuracy_score(labels, prediction)
# accuracy = np.mean(accuracies)
# if 0.5 < accuracy < 0.60:
# bestSettings.append([1, subSampling])
# bestResults.append(accuracy)
#
# # assert bestResults != [], "No good settings found for Decision Tree!"
# if bestResults == []:
# bestSetting = None
# else:
# bestSetting = getBestSetting(bestSettings, bestResults)
# return bestSetting
def getBestSetting(bestSettings, bestResults):
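As the commit message says, hyper-parameter search for the Mumbo monoview classifiers is now delegated to sklearn's GridSearchCV; findClosest, defined above, then keeps the configuration whose mean cross-validation score is closest to base=0.5, presumably so that the returned learner stays weak, as boosting schemes like Mumbo expect learners only slightly better than chance. A quick illustration of that selection step on made-up scores:

import numpy as np

# Hypothetical mean_test_score values from a grid search; findClosest returns the index
# of the score nearest to 0.5, i.e. the weakest acceptable configuration.
scores = np.array([0.92, 0.55, 0.71, 0.49])
print(findClosest(scores))   # -> 3, since |0.49 - 0.5| is the smallest gap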
@@ -130,7 +130,7 @@ def getReport(classifier, CLASS_LABELS, classificationIndices, DATASET, trainLab
     testScore = metricModule.score(CLASS_LABELS[validationIndices], testLabels)
     mumboClassifier = classifier
     maxIter = mumboClassifier.iterIndex
-    meanAverageAccuracies = np.mean(mumboClassifier.averageAccuracies, axis=0)
+    meanAverageAccuracies = np.mean(mumboClassifier.averageScores, axis=0)
     viewsStats = np.array([float(list(mumboClassifier.bestViews).count(viewIndex)) /
                            len(mumboClassifier.bestViews) for viewIndex in range(nbView)])
     PredictedTrainLabelsByIter = mumboClassifier.classifyMumbobyIter_hdf5(DATASET, fakeViewsIndicesDict,
@@ -230,6 +230,11 @@ def execute(classifier, trainLabels,
             hyperParamSearch, nIter, metrics,
             viewsIndices, randomState):
     learningIndices, validationIndices = classificationIndices
+    if classifier.classifiersConfigs is None:
+        metricsScores = getMetricsScores(metrics, trainLabels, testLabels,
+                                         DATASET, validationIndices, learningIndices)
+        return "No good setting for monoview classifier", None, metricsScores
+    else:
     LEARNING_RATE = len(learningIndices) / (len(learningIndices) + len(validationIndices))
     nbFolds = KFolds.n_splits