diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py index 47621800f7e62dfa2cfe2624f1b9026387dc7b91..f4a3a5e392e80796186cba60d1c7de36db9d3b6b 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py @@ -110,6 +110,8 @@ def ExecMonoview(directory, X, Y, name, labelsNames, classificationIndices, KFol logging.info("Done:\t Saving Results") viewIndex = args["viewIndex"] + if testFoldsPreds is None: + testFoldsPreds = y_train_pred return viewIndex, [CL_type, cl_desc + [feat], metricsScores, full_labels_pred, clKWARGS, y_test_multiclass_pred, testFoldsPreds] @@ -165,6 +167,7 @@ def getHPs(classifierModule, hyperParamSearch, nIter, CL_type, X_train, y_train, logging.debug("Done:\t " + hyperParamSearch + " best settings") else: clKWARGS = kwargs[CL_type + "KWARGS"] + testFoldsPreds = None return clKWARGS, testFoldsPreds diff --git a/multiview_platform/MonoMultiViewClassifiers/Multiview/Additions/diversity_utils.py b/multiview_platform/MonoMultiViewClassifiers/Multiview/Additions/diversity_utils.py index b1478cd6949b4e60c38a9a6d4adf5eab5a203982..eba14b61079537a1b23c284ee00bd02fd9804eff 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Multiview/Additions/diversity_utils.py +++ b/multiview_platform/MonoMultiViewClassifiers/Multiview/Additions/diversity_utils.py @@ -16,18 +16,29 @@ def getClassifiersDecisions(allClassifersNames, viewsIndices, resultsMonoview): """ nbViews = len(viewsIndices) nbClassifiers = len(allClassifersNames) - nbFolds = len(resultsMonoview[0][1][6]) - foldsLen = len(resultsMonoview[0][1][6][0]) classifiersNames = [[] for _ in viewsIndices] - classifiersDecisions = np.zeros((nbViews, nbClassifiers, nbFolds, foldsLen)) - - for resultMonoview in resultsMonoview: - if resultMonoview[1][0] in classifiersNames[viewsIndices.index(resultMonoview[0])]: - pass - else: - classifiersNames[viewsIndices.index(resultMonoview[0])].append(resultMonoview[1][0]) - classifierIndex = classifiersNames[viewsIndices.index(resultMonoview[0])].index(resultMonoview[1][0]) - classifiersDecisions[viewsIndices.index(resultMonoview[0]), classifierIndex] = resultMonoview[1][6] + if len(resultsMonoview[0][1][6].shape) != 1: + nbFolds = resultsMonoview[0][1][6].shape[0] + foldsLen = resultsMonoview[0][1][6].shape[1] + classifiersDecisions = np.zeros((nbViews, nbClassifiers, nbFolds, foldsLen)) + + for resultMonoview in resultsMonoview: + if resultMonoview[1][0] in classifiersNames[viewsIndices.index(resultMonoview[0])]: + pass + else: + classifiersNames[viewsIndices.index(resultMonoview[0])].append(resultMonoview[1][0]) + classifierIndex = classifiersNames[viewsIndices.index(resultMonoview[0])].index(resultMonoview[1][0]) + classifiersDecisions[viewsIndices.index(resultMonoview[0]), classifierIndex] = resultMonoview[1][6] + else: + train_len = resultsMonoview[0][1][6].shape[0] + classifiersDecisions = np.zeros((nbViews, nbClassifiers, 1, train_len)) + for resultMonoview in resultsMonoview: + if resultMonoview[1][0] in classifiersNames[viewsIndices.index(resultMonoview[0])]: + pass + else: + classifiersNames[viewsIndices.index(resultMonoview[0])].append(resultMonoview[1][0]) + classifierIndex = classifiersNames[viewsIndices.index(resultMonoview[0])].index(resultMonoview[1][0]) + classifiersDecisions[viewsIndices.index(resultMonoview[0]), classifierIndex] = resultMonoview[1][6] 
return classifiersDecisions, classifiersNames @@ -42,8 +53,7 @@ def couple_div_measure(allClassifersNames, viewsIndices, resultsMonoview, measur viewsIndices, resultsMonoview) - nbViews = len(viewsIndices) - nbClassifiers = len(allClassifersNames) + nbViews, nbClassifiers, nbFolds, foldsLen = classifiersDecisions.shape combinations = itertools.combinations_with_replacement(range(nbClassifiers), nbViews) nbCombinations = int(math.factorial(nbClassifiers+nbViews-1) / math.factorial(nbViews) / math.factorial(nbClassifiers-1)) div_measure = np.zeros(nbCombinations) @@ -78,9 +88,7 @@ def global_div_measure(allClassifersNames, viewsIndices, resultsMonoview, measur viewsIndices, resultsMonoview) - foldsLen = len(resultsMonoview[0][1][6][0]) - nbViews = len(viewsIndices) - nbClassifiers = len(allClassifersNames) + nbViews, nbClassifiers, nbFolds, foldsLen = classifiersDecisions.shape combinations = itertools.combinations_with_replacement(range(nbClassifiers), nbViews) nbCombinations = int(math.factorial(nbClassifiers + nbViews - 1) / math.factorial(nbViews) / math.factorial( nbClassifiers - 1)) @@ -104,9 +112,7 @@ def CQ_div_measure(allClassifersNames, viewsIndices, resultsMonoview, measuremen classifiersDecisions, classifiersNames = getClassifiersDecisions(allClassifersNames, viewsIndices, resultsMonoview) - foldsLen = len(resultsMonoview[0][1][6][0]) - nbViews = len(viewsIndices) - nbClassifiers = len(allClassifersNames) + nbViews, nbClassifiers, nbFolds, foldsLen = classifiersDecisions.shape combinations = itertools.combinations_with_replacement(range(nbClassifiers), nbViews) nbCombinations = int( math.factorial(nbClassifiers + nbViews - 1) / math.factorial(nbViews) / math.factorial(nbClassifiers - 1)) @@ -134,23 +140,33 @@ def CQ_div_measure(allClassifersNames, viewsIndices, resultsMonoview, measuremen bestCombiIndex] -def getFoldsGroundTruth(directory): +def getFoldsGroundTruth(directory, folds=True): """This function is used to get the labels of each fold example used in the measurements foldsGroundTruth is formatted as foldsGroundTruth[foldIndex, exampleIndex]""" - foldsFilesNames = os.listdir(directory+"folds/") - foldLen = len(np.genfromtxt(directory+"folds/"+foldsFilesNames[0], delimiter=',')) - foldsGroudTruth = np.zeros((len(foldsFilesNames), foldLen), dtype=int) - for fileName in foldsFilesNames: - foldIndex = int(fileName[-5]) - foldsGroudTruth[foldIndex] = np.genfromtxt(directory+"folds/"+fileName, delimiter=',') - return foldsGroudTruth + if folds: + foldsFilesNames = os.listdir(directory+"folds/") + foldLen = len(np.genfromtxt(directory+"folds/"+foldsFilesNames[0], delimiter=',')) + foldsGroudTruth = np.zeros((len(foldsFilesNames), foldLen), dtype=int) + for fileName in foldsFilesNames: + foldIndex = int(fileName[-5]) + foldsGroudTruth[foldIndex] = np.genfromtxt(directory+"folds/"+fileName, delimiter=',') + return foldsGroudTruth + else: + train_labels = np.genfromtxt(directory+"train_labels.csv", delimiter=',') + foldsGroudTruth = np.zeros((1, train_labels.shape[0])) + foldsGroudTruth[0] = train_labels + return foldsGroudTruth + + def getArgs(args, benchmark, views, viewsIndices, randomState, directory, resultsMonoview, classificationIndices, measurement, name): """This function is a general function to get the args for all the measurements used""" - foldsGroundTruth = getFoldsGroundTruth(directory) + if len(resultsMonoview[0][1][6].shape) != 1: + foldsGroundTruth = getFoldsGroundTruth(directory, folds=True) + else: + foldsGroundTruth = getFoldsGroundTruth(directory, 
folds=False) monoviewClassifierModulesNames = benchmark["Monoview"] if name in ['DisagreeFusion', 'DoubleFaultFusion']: classifiersNames, div_measure = couple_div_measure(monoviewClassifierModulesNames, diff --git a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/FatLateFusion/FatLateFusionModule.py b/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/FatLateFusion/FatLateFusionModule.py index e2b886c16b9d8f1e69684f928dd07328490cab76..fc6c22643c1053212961e951fbefa792cfcbba43 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/FatLateFusion/FatLateFusionModule.py +++ b/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/FatLateFusion/FatLateFusionModule.py @@ -19,6 +19,10 @@ def getArgs(args, benchmark, views, viewsIndices, randomState, directory, result monoviewDecisions = np.array([monoviewResult[1][3] for monoviewResult in resultsMonoview]) else: monoviewDecisions = np.array([genMulticlassMonoviewDecision(monoviewResult, classificationIndices) for monoviewResult in resultsMonoview]) + if len(args.FLF_weights) == 0: + weights = [1.0 for _ in range(monoviewDecisions.shape[0])] + else: + weights = args.FLF_weights arguments = {"CL_type": "FatLateFusion", "views": views, "NB_VIEW": len(resultsMonoview), @@ -27,7 +31,7 @@ def getArgs(args, benchmark, views, viewsIndices, randomState, directory, result "LABELS_NAMES": args.CL_classes, "FatLateFusionKWARGS": { "monoviewDecisions": monoviewDecisions, - "weights": args.FLF_weights + "weights": weights } } argumentsList.append(arguments) @@ -63,6 +67,9 @@ class FatLateFusionClass: votes = np.zeros((len(usedIndices), DATASET.get("Metadata").attrs["nbClass"]), dtype=float) for usedIndex, exampleIndex in enumerate(usedIndices): for monoviewDecisionIndex, monoviewDecision in enumerate(self.monoviewDecisions): + print(monoviewDecision[exampleIndex]) + print(self.weights[monoviewDecisionIndex]) + print(votes[usedIndex, monoviewDecision[exampleIndex]]) votes[usedIndex, monoviewDecision[exampleIndex]] += self.weights[monoviewDecisionIndex] predictedLabels = np.argmax(votes, axis=1) return predictedLabels diff --git a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/Classifiers/DecisionTree.py b/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/Classifiers/DecisionTree.py deleted file mode 100644 index 874f3ce549dba3ffd5054779f8a9feac93b34190..0000000000000000000000000000000000000000 --- a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/Classifiers/DecisionTree.py +++ /dev/null @@ -1,261 +0,0 @@ -import sklearn -from sklearn.base import BaseEstimator, ClassifierMixin -import numpy as np -# from ModifiedMulticlass import OneVsRestClassifier -from .SubSampling import subSample -import logging - -# Add weights - -from .... 
import Metrics - - -class DecisionTree(BaseEstimator, ClassifierMixin): - def __init__(self, depth=10, criterion="gini", splitter="best", subSampling=1.0, randomState=None, **kwargs): - if kwargs: - self.depth = kwargs["depth"] - self.criterion = kwargs["criterion"] - self.splitter = kwargs["splitter"] - self.subSampling = kwargs["subSampling"] - self.randomState = kwargs["randomState"] - else: - self.depth = depth - self.criterion = criterion - self.splitter = splitter - self.subSampling = subSampling - if randomState is None: - self.randomState=np.random.RandomState() - else: - self.randomState=randomState - self.decisionTree = sklearn.tree.DecisionTreeClassifier(splitter=self.splitter, criterion=self.criterion, max_depth=self.depth) - - def fit(self, data, labels, sample_weight=None): - if sample_weight is None: - sample_weight = np.ones(len(data))/len(data) - - if self.subSampling != 1.0: - subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, self.subSampling, self.randomState, - weights=sample_weight) - else: - subSampledData, subSampledLabels, subSampledWeights = data, labels, sample_weight - - self.decisionTree.fit(subSampledData, subSampledLabels, sample_weight=subSampledWeights) - - return self - - def fit_hdf5(self, data, labels, weights, metric): - metricModule = getattr(Metrics, metric[0]) - if metric[1] is not None: - metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1])) - else: - metricKWARGS = {} - if weights is None: - weights = np.ones(len(data))/len(data) - - # Check that X and y have correct shape - if self.subSampling != 1.0: - subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, self.subSampling, self.randomState, - weights=weights) - else: - subSampledData, subSampledLabels, subSampledWeights = data, labels, weights - # self.subSampledData = subSampledData - # self. - # self. 
- # Store the classes seen during fit - self.decisionTree.fit(subSampledData, subSampledLabels, sample_weight=subSampledWeights) - prediction = self.decisionTree.predict(data) - metricKWARGS = {"0":weights} - averageScore = metricModule.score(labels, prediction, **metricKWARGS) - if averageScore < 0.5: - isBad = True - else: - isBad = False - - # self.X_ = X - # self.y_ = y - # Return the classifier - # self.decisionTree, prediction, isBad, averageScore - return self.decisionTree, prediction, isBad, averageScore - - def predict(self, data): - - # Check is fit had been called - # check_is_fitted(self, ['X_', 'y_']) - - # Input validation - # X = check_array(X) - predictedLabels = self.decisionTree.predict(data) - # closest = np.argmin(euclidean_distances(X, self.X_), axis=1) - return predictedLabels - - def get_params(self, deep=True): - # suppose this estimator has parameters "alpha" and "recursive" - return {"depth": self.depth, "criterion": self.criterion, "splitter": self.splitter, "subSampling": self.subSampling} - - def set_params(self, **parameters): - self.depth = parameters["depth"] - self.criterion = parameters["criterion"] - self.splitter = parameters["splitter"] - self.subSampling = parameters["subSampling"] - # for parameter, value in parameters.items(): - # print parameter, value - # self.setattr(parameter, value) - return self - -# def DecisionTree(data, labels, arg, weights, randomState): -# depth = int(arg[0]) -# subSampling = float(arg[1]) -# if subSampling != 1.0: -# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, subSampling, randomState, -# weights=weights) -# else: -# subSampledData, subSampledLabels, subSampledWeights = data, labels, weights -# isBad = False -# classifier = sklearn.tree.DecisionTreeClassifier(max_depth=depth) -# # classifier = OneVsRestClassifier(tree.DecisionTreeClassifier(max_depth=depth)) -# classifier.fit(subSampledData, subSampledLabels, sample_weight=subSampledWeights) -# prediction = classifier.predict(data) -# accuracy = accuracy_score(labels, prediction) -# if accuracy < 0.5: -# isBad = True -# -# return classifier, prediction, isBad, accuracy - - -def getKWARGS(argList, randomState): - kwargs = {"depth":int(argList[0]), "criterion":argList[1], "splitter":argList[2], "subSampling":float(argList[3]), "randomState":randomState} - return kwargs - - -def getConfig(classifierConfig): - try: - depth = classifierConfig["depth"] - splitter = classifierConfig["splitter"] - criterion = classifierConfig["criterion"] - subSampling = classifierConfig["subSampling"] - return 'with depth ' + str(depth) + ', ' + \ - 'with splitter ' + splitter + ', ' + \ - 'with criterion ' + criterion + ', ' + \ - ' sub-sampled at ' + str(subSampling) + ' ' - except KeyError: - return "Go back, you're drunk" - - -def findClosest(scores, base=0.5): - diffToBase = 100.0 - bestSettingsIndex = 0 - for resultIndex, result in enumerate(scores): - if abs(base - result) < diffToBase: - diffToBase = abs(base - result) - bestResult = result - bestSettingsIndex = resultIndex - return bestSettingsIndex - - -def hyperParamSearch(data, labels, randomState, metric=["accuracy_score", None], nbSubSamplingTests=20): - metricModule = getattr(Metrics, metric[0]) - if metric[1] is not None: - metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1])) - else: - metricKWARGS = {} - scorer = metricModule.get_scorer(**metricKWARGS) - subSamplingRatios = np.arange(nbSubSamplingTests, dtype=float)/nbSubSamplingTests - maxDepths = 
np.arange(1)+1 - criterions = ["gini", "entropy"] - splitters = ["best", "random"] - parameters = {"depth":maxDepths, "criterion":criterions, "splitter":splitters, "subSampling":subSamplingRatios} - classifier = DecisionTree() - grid = sklearn.model_selection.GridSearchCV(classifier, parameters, scoring=scorer) - grid.fit(data, labels) - GSSubSamplingRatios = grid.cv_results_["param_subSampling"] - GSMaxDepths = grid.cv_results_["param_depth"] - GSCriterions = grid.cv_results_["param_criterion"] - GSSplitters = grid.cv_results_["param_splitter"] - GSScores = grid.cv_results_["mean_test_score"] - configIndex = findClosest(GSScores) - return {"depth":GSMaxDepths[configIndex], "criterion":GSCriterions[configIndex], "splitter":GSSplitters[configIndex], "subSampling":GSSubSamplingRatios[configIndex], "randomState":randomState} - # bestSettings = [] - # bestResults = [] - # classifier = sklearn.tree.DecisionTreeClassifier(max_depth=1) - # subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, 0.05, randomState) - # classifier.fit(subSampledData, subSampledLabels) - # prediction = classifier.predict(data) - # preliminary_accuracy = accuracy_score(labels, prediction) - # if preliminary_accuracy < 0.50: - # for max_depth in np.arange(10) + 1: - # for subSampling in sorted((np.arange(20, dtype=float) + 1) / 20, reverse=True): - # if subSampling > minSubSampling: - # accuracies = np.zeros(50) - # for i in range(50): - # if subSampling != 1.0: - # subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, subSampling, - # randomState) - # else: - # subSampledData, subSampledLabels, = data, labels - # classifier = tree.DecisionTreeClassifier(max_depth=max_depth) - # classifier.fit(subSampledData, subSampledLabels) - # prediction = classifier.predict(data) - # accuracies[i] = accuracy_score(labels, prediction) - # accuracy = np.mean(accuracies) - # if 0.5 < accuracy < 0.60: - # bestSettings.append([max_depth, subSampling]) - # bestResults.append(accuracy) - # else: - # preliminary_accuracies = np.zeros(50) - # if minSubSampling < 0.01: - # for i in range(50): - # subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, 0.01, randomState) - # classifier.fit(subSampledData, subSampledLabels) - # prediction = classifier.predict(data) - # preliminary_accuracies[i] = accuracy_score(labels, prediction) - # preliminary_accuracy = np.mean(preliminary_accuracies) - # if preliminary_accuracy < 0.50: - # for subSampling in sorted((np.arange(19, dtype=float) + 1) / 200, reverse=True): - # if minSubSampling < subSampling: - # accuracies = np.zeros(50) - # for i in range(50): - # subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, subSampling, - # randomState) - # classifier = tree.DecisionTreeClassifier(max_depth=1) - # classifier.fit(subSampledData, subSampledLabels) - # prediction = classifier.predict(data) - # accuracies[i] = accuracy_score(labels, prediction) - # accuracy = np.mean(accuracies) - # if 0.5 < accuracy < 0.60: - # bestSettings.append([1, subSampling]) - # bestResults.append(accuracy) - # else: - # for subSampling in sorted((np.arange(19, dtype=float) + 1) / 2000, reverse=True): - # accuracies = np.zeros(50) - # for i in range(50): - # subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, subSampling, - # randomState) - # if minSubSampling < subSampling: - # classifier1 = tree.DecisionTreeClassifier(max_depth=1) - # classifier1.fit(subSampledData, subSampledLabels) - # prediction = 
classifier1.predict(data) - # accuracies[i] = accuracy_score(labels, prediction) - # accuracy = np.mean(accuracies) - # if 0.5 < accuracy < 0.60: - # bestSettings.append([1, subSampling]) - # bestResults.append(accuracy) - # - # # assert bestResults != [], "No good settings found for Decision Tree!" - # if bestResults == []: - # bestSetting = None - # else: - # bestSetting = getBestSetting(bestSettings, bestResults) - # return bestSetting - - -def getBestSetting(bestSettings, bestResults): - diffTo52 = 100.0 - bestSettingsIndex = 0 - for resultIndex, result in enumerate(bestResults): - if abs(0.55 - result) < diffTo52: - diffTo52 = abs(0.55 - result) - bestResult = result - bestSettingsIndex = resultIndex - logging.debug("\t\tInfo:\t Best Result : " + str(result)) - - return map(lambda p: round(p, 4), bestSettings[bestSettingsIndex]) diff --git a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/Classifiers/Kover.py b/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/Classifiers/Kover.py deleted file mode 100644 index 596368ac998720265ed433490688b45d3daf5f7b..0000000000000000000000000000000000000000 --- a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/Classifiers/Kover.py +++ /dev/null @@ -1,124 +0,0 @@ -# from sklearn.metrics import precision_recall_fscore_support -# from sklearn.cross_validation import StratifiedShuffleSplit as split -# import numpy as np -# # from sklearn.multiclass import OneVsRestClassifier -# from ModifiedMulticlass import OneVsRestClassifier -# -# from sklearn import tree -# from sklearn.metrics import accuracy_score -# import numpy as np -# from ModifiedMulticlass import OneVsRestClassifier -# from SubSampling import subSample -# import logging -# # Add weights -# -# def DecisionTree(data, labels, arg, weights): -# depth = int(arg[0]) -# subSampling = float(arg[1]) -# if subSampling != 1.0: -# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, subSampling, weights=weights) -# else: -# subSampledData, subSampledLabels, subSampledWeights = data, labels, weights -# isBad = False -# classifier = tree.DecisionTreeClassifier(max_depth=depth) -# #classifier = OneVsRestClassifier(tree.DecisionTreeClassifier(max_depth=depth)) -# classifier.fit(subSampledData, subSampledLabels, subSampledWeights) -# prediction = classifier.predict(data) -# accuracy = accuracy_score(labels, prediction) -# if accuracy < 0.5: -# isBad = True -# -# return classifier, prediction, isBad, accuracy -# -# -# def getConfig(classifierConfig): -# depth = classifierConfig[0] -# subSampling = classifierConfig[1] -# return 'with depth ' + str(depth) + ', ' + ' sub-sampled at ' + str(subSampling) + ' ' -# -# -# def gridSearch(data, labels, metric="accuracy_score"): -# minSubSampling = 1.0/(len(labels)/2) -# bestSettings = [] -# bestResults = [] -# classifier = tree.DecisionTreeClassifier(max_depth=1) -# preliminary_accuracies = np.zeros(50) -# for i in range(50): -# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, 0.05) -# classifier.fit(subSampledData, subSampledLabels) -# prediction = classifier.predict(data) -# preliminary_accuracies[i] = accuracy_score(labels, prediction) -# preliminary_accuracy = np.mean(preliminary_accuracies) -# if preliminary_accuracy < 0.50: -# for max_depth in np.arange(10)+1: -# for subSampling in sorted(np.arange(20, dtype=float)+1/20, reverse=True): -# if subSampling > minSubSampling: -# accuracies = np.zeros(50) -# for i in range(50): -# if subSampling != 
1.0: -# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, subSampling) -# else: -# subSampledData, subSampledLabels, = data, labels -# classifier = tree.DecisionTreeClassifier(max_depth=max_depth) -# classifier.fit(subSampledData, subSampledLabels) -# prediction = classifier.predict(data) -# accuracies[i] = accuracy_score(labels, prediction) -# accuracy = np.mean(accuracies) -# if 0.5 < accuracy < 0.60: -# bestSettings.append([max_depth, subSampling]) -# bestResults.append(accuracy) -# else: -# preliminary_accuracies = np.zeros(50) -# if minSubSampling < 0.01: -# for i in range(50): -# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, 0.01) -# classifier.fit(subSampledData, subSampledLabels) -# prediction = classifier.predict(data) -# preliminary_accuracies[i] = accuracy_score(labels, prediction) -# preliminary_accuracy = np.mean(preliminary_accuracies) -# if preliminary_accuracy < 0.50: -# for subSampling in sorted((np.arange(19, dtype=float)+1)/200, reverse=True): -# if minSubSampling < subSampling: -# accuracies = np.zeros(50) -# for i in range(50): -# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, subSampling) -# classifier = tree.DecisionTreeClassifier(max_depth=1) -# classifier.fit(subSampledData, subSampledLabels) -# prediction = classifier.predict(data) -# accuracies[i] = accuracy_score(labels, prediction) -# accuracy = np.mean(accuracies) -# if 0.5 < accuracy < 0.60: -# bestSettings.append([1, subSampling]) -# bestResults.append(accuracy) -# else: -# for subSampling in sorted((np.arange(19, dtype=float)+1)/2000, reverse=True): -# accuracies = np.zeros(50) -# for i in range(50): -# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, subSampling) -# if minSubSampling < subSampling: -# classifier1 = tree.DecisionTreeClassifier(max_depth=1) -# classifier1.fit(subSampledData, subSampledLabels) -# prediction = classifier1.predict(data) -# accuracies[i] = accuracy_score(labels, prediction) -# accuracy = np.mean(accuracies) -# if 0.5 < accuracy < 0.60: -# bestSettings.append([1, subSampling]) -# bestResults.append(accuracy) -# -# assert bestResults!=[], "No good settings found for Decision Tree!" 
-# -# return getBestSetting(bestSettings, bestResults) -# -# -# def getBestSetting(bestSettings, bestResults): -# diffTo52 = 100.0 -# bestSettingsIndex = 0 -# for resultIndex, result in enumerate(bestResults): -# if abs(0.55-result) < diffTo52: -# diffTo52 = abs(0.55-result) -# bestResult = result -# bestSettingsIndex = resultIndex -# logging.debug("\t\tInfo:\t Best Reslut : "+str(result)) -# -# return map(lambda p: round(p, 4), bestSettings[bestSettingsIndex]) -# # return map(round(,4), bestSettings[bestSettingsIndex]) diff --git a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/Classifiers/ModifiedMulticlass.py b/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/Classifiers/ModifiedMulticlass.py deleted file mode 100644 index 2f965e7d38e143ee7ef62cde028a51e69a6beb17..0000000000000000000000000000000000000000 --- a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/Classifiers/ModifiedMulticlass.py +++ /dev/null @@ -1,716 +0,0 @@ -""" -Multiclass and multilabel classification strategies -=================================================== -This module implements multiclass learning algorithms: - - one-vs-the-rest / one-vs-all - - one-vs-one - - error correcting output codes -The estimators provided in this module are meta-estimators: they require a base -estimator to be provided in their constructor. For example, it is possible to -use these estimators to turn a binary classifier or a regressor into a -multiclass classifier. It is also possible to use these estimators with -multiclass estimators in the hope that their accuracy or runtime performance -improves. -All classifiers in scikit-learn implement multiclass classification; you -only need to use this module if you want to experiment with custom multiclass -strategies. -The one-vs-the-rest meta-classifier also implements a `predict_proba` method, -so long as such a method is implemented by the base classifier. This method -returns probabilities of class membership in both the single label and -multilabel case. Note that in the multilabel case, probabilities are the -marginal probability that a given sample falls in the given class. As such, in -the multilabel case the sum of these probabilities over all possible labels -for a given sample *will not* sum to unity, as they do in the single label -case. -""" - -# Author: Mathieu Blondel <mathieu@mblondel.org> -# Author: Hamzeh Alsalhi <93hamsal@gmail.com> -# -# License: BSD 3 clause - -import array -import numpy as np -import warnings -import scipy.sparse as sp - -from sklearn.base import BaseEstimator, ClassifierMixin, clone, is_classifier -from sklearn.base import MetaEstimatorMixin, is_regressor -from sklearn.preprocessing import LabelBinarizer -from sklearn.metrics.pairwise import euclidean_distances -from sklearn.utils import check_random_state -from sklearn.utils.validation import _num_samples -from sklearn.utils.validation import check_consistent_length -from sklearn.utils.validation import check_is_fitted -from sklearn.utils import deprecated -from sklearn.externals.joblib import Parallel -from sklearn.externals.joblib import delayed - -__all__ = [ - "OneVsRestClassifier", - "OneVsOneClassifier", - "OutputCodeClassifier", -] - - -def _fit_binary(estimator, X, y, classes=None, sample_weight=None): - """Fit a single binary estimator.""" - unique_y = np.unique(y) - if len(unique_y) == 1: - if classes is not None: - if y[0] == -1: - c = 0 - else: - c = y[0] - warnings.warn("Label %s is present in all training examples." 
% - str(classes[c])) - estimator = _ConstantPredictor().fit(X, unique_y) - else: - estimator = clone(estimator) - estimator.fit(X, y, sample_weight=sample_weight) - return estimator - - -def _predict_binary(estimator, X): - """Make predictions using a single binary estimator.""" - if is_regressor(estimator): - return estimator.predict(X) - try: - score = np.ravel(estimator.decision_function(X)) - except (AttributeError, NotImplementedError): - # probabilities of the positive class - score = estimator.predict_proba(X)[:, 1] - return score - - -def _check_estimator(estimator): - """Make sure that an estimator implements the necessary methods.""" - if (not hasattr(estimator, "decision_function") and - not hasattr(estimator, "predict_proba")): - raise ValueError("The base estimator should implement " - "decision_function or predict_proba!") - - -@deprecated("fit_ovr is deprecated and will be removed in 0.18." - "Use the OneVsRestClassifier instead.") -def fit_ovr(estimator, X, y, n_jobs=1): - """Fit a one-vs-the-rest strategy. - Parameters - ---------- - estimator : estimator object - An estimator object implementing `fit` and one of `decision_function` - or `predict_proba`. - X : (sparse) array-like, shape = [n_samples, n_features] - Data. - y : (sparse) array-like, shape = [n_samples] or [n_samples, n_classes] - Multi-class targets. An indicator matrix turns on multilabel - classification. - Returns - ------- - estimators : list of estimators object - The list of fitted estimator. - lb : fitted LabelBinarizer - """ - ovr = OneVsRestClassifier(estimator, n_jobs=n_jobs).fit(X, y) - return ovr.estimators_, ovr.label_binarizer_ - - -@deprecated("predict_ovr is deprecated and will be removed in 0.18." - "Use the OneVsRestClassifier instead.") -def predict_ovr(estimators, label_binarizer, X): - """Predict multi-class targets using the one vs rest strategy. - Parameters - ---------- - estimators : list of `n_classes` estimators, Estimators used for - predictions. The list must be homogeneous with respect to the type of - estimators. fit_ovr supplies this list as part of its output. - label_binarizer : LabelBinarizer object, Object used to transform - multiclass labels to binary labels and vice-versa. fit_ovr supplies - this object as part of its output. - X : (sparse) array-like, shape = [n_samples, n_features] - Data. - Returns - ------- - y : (sparse) array-like, shape = [n_samples] or [n_samples, n_classes]. - Predicted multi-class targets. - """ - e_types = set([type(e) for e in estimators if not - isinstance(e, _ConstantPredictor)]) - if len(e_types) > 1: - raise ValueError("List of estimators must contain estimators of the" - " same type but contains types {0}".format(e_types)) - - ovr = OneVsRestClassifier(clone(estimators[0])) - ovr.estimators_ = estimators - ovr.label_binarizer_ = label_binarizer - - return ovr.predict(X) - - -@deprecated("predict_proba_ovr is deprecated and will be removed in 0.18." - "Use the OneVsRestClassifier instead.") -def predict_proba_ovr(estimators, X, is_multilabel): - e_types = set([type(e) for e in estimators if not - isinstance(e, _ConstantPredictor)]) - if len(e_types) > 1: - raise ValueError("List of estimators must contain estimators of the" - " same type but contains types {0}".format(e_types)) - - Y = np.array([e.predict_proba(X)[:, 1] for e in estimators]).T - - if not is_multilabel: - # Then, probabilities should be normalized to 1. 
- Y /= np.sum(Y, axis=1)[:, np.newaxis] - - return Y - - -class _ConstantPredictor(BaseEstimator): - def fit(self, X, y): - self.y_ = y - return self - - def predict(self, X): - check_is_fitted(self, 'y_') - - return np.repeat(self.y_, X.shape[0]) - - def decision_function(self, X): - check_is_fitted(self, 'y_') - - return np.repeat(self.y_, X.shape[0]) - - def predict_proba(self, X): - check_is_fitted(self, 'y_') - - return np.repeat([np.hstack([1 - self.y_, self.y_])], - X.shape[0], axis=0) - - -class OneVsRestClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin): - """One-vs-the-rest (OvR) multiclass/multilabel strategy - Also known as one-vs-all, this strategy consists in fitting one classifier - per class. For each classifier, the class is fitted against all the other - classes. In addition to its computational efficiency (only `n_classes` - classifiers are needed), one advantage of this approach is its - interpretability. Since each class is represented by one and one classifier - only, it is possible to gain knowledge about the class by inspecting its - corresponding classifier. This is the most commonly used strategy for - multiclass classification and is a fair default choice. - This strategy can also be used for multilabel learning, where a classifier - is used to predict multiple labels for instance, by fitting on a 2-d matrix - in which cell [i, j] is 1 if sample i has label j and 0 otherwise. - In the multilabel learning literature, OvR is also known as the binary - relevance method. - Read more in the :ref:`User Guide <ovr_classification>`. - Parameters - ---------- - estimator : estimator object - An estimator object implementing `fit` and one of `decision_function` - or `predict_proba`. - n_jobs : int, optional, default: 1 - The number of jobs to use for the computation. If -1 all CPUs are used. - If 1 is given, no parallel computing code is used at all, which is - useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are - used. Thus for n_jobs = -2, all CPUs but one are used. - Attributes - ---------- - estimators_ : list of `n_classes` estimators - Estimators used for predictions. - classes_ : array, shape = [`n_classes`] - Class labels. - label_binarizer_ : LabelBinarizer object - Object used to transform multiclass labels to binary labels and - vice-versa. - multilabel_ : boolean - Whether a OneVsRestClassifier is a multilabel classifier. - """ - - def __init__(self, estimator, n_jobs=1): - self.estimator = estimator - self.n_jobs = n_jobs - - def fit(self, X, y, sample_weight=None): - """Fit underlying estimators. - Parameters - ---------- - X : (sparse) array-like, shape = [n_samples, n_features] - Data. - y : (sparse) array-like, shape = [n_samples] or [n_samples, n_classes] - Multi-class targets. An indicator matrix turns on multilabel - classification. - Returns - ------- - self - """ - # A sparse LabelBinarizer, with sparse_output=True, has been shown to - # outpreform or match a dense label binarizer in all cases and has also - # resulted in less or equal memory consumption in the fit_ovr function - # overall. - self.label_binarizer_ = LabelBinarizer(sparse_output=True) - Y = self.label_binarizer_.fit_transform(y) - Y = Y.tocsc() - columns = (col.toarray().ravel() for col in Y.T) - # In cases where individual estimators are very fast to train setting - # n_jobs > 1 in can results in slower performance due to the overhead - # of spawning threads. See joblib issue #112. 
- self.estimators_ = Parallel(n_jobs=self.n_jobs)(delayed(_fit_binary)( - self.estimator, X, column, classes=[ - "not %s" % self.label_binarizer_.classes_[i], - self.label_binarizer_.classes_[i]], sample_weight=sample_weight) - for i, column in enumerate(columns)) - - return self - - def predict(self, X): - """Predict multi-class targets using underlying estimators. - Parameters - ---------- - X : (sparse) array-like, shape = [n_samples, n_features] - Data. - Returns - ------- - y : (sparse) array-like, shape = [n_samples] or [n_samples, n_classes]. - Predicted multi-class targets. - """ - check_is_fitted(self, 'estimators_') - if (hasattr(self.estimators_[0], "decision_function") and - is_classifier(self.estimators_[0])): - thresh = 0 - else: - thresh = .5 - - n_samples = _num_samples(X) - if self.label_binarizer_.y_type_ == "multiclass": - maxima = np.empty(n_samples, dtype=float) - maxima.fill(-np.inf) - argmaxima = np.zeros(n_samples, dtype=int) - for i, e in enumerate(self.estimators_): - pred = _predict_binary(e, X) - np.maximum(maxima, pred, out=maxima) - argmaxima[maxima == pred] = i - return self.label_binarizer_.classes_[np.array(argmaxima.T)] - else: - indices = array.array('i') - indptr = array.array('i', [0]) - for e in self.estimators_: - indices.extend(np.where(_predict_binary(e, X) > thresh)[0]) - indptr.append(len(indices)) - data = np.ones(len(indices), dtype=int) - indicator = sp.csc_matrix((data, indices, indptr), - shape=(n_samples, len(self.estimators_))) - return self.label_binarizer_.inverse_transform(indicator) - - def predict_proba(self, X): - """Probability estimates. - The returned estimates for all classes are ordered by label of classes. - Note that in the multilabel case, each sample can have any number of - labels. This returns the marginal probability that the given sample has - the label in question. For example, it is entirely consistent that two - labels both have a 90% probability of applying to a given sample. - In the single label multiclass case, the rows of the returned matrix - sum to 1. - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Returns - ------- - T : (sparse) array-like, shape = [n_samples, n_classes] - Returns the probability of the sample for each class in the model, - where classes are ordered as they are in `self.classes_`. - """ - check_is_fitted(self, 'estimators_') - # Y[i,j] gives the probability that sample i has the label j. - # In the multi-label case, these are not disjoint. - Y = np.array([e.predict_proba(X)[:, 1] for e in self.estimators_]).T - - if len(self.estimators_) == 1: - # Only one estimator, but we still want to return probabilities - # for two classes. - Y = np.concatenate(((1 - Y), Y), axis=1) - - if not self.multilabel_: - # Then, probabilities should be normalized to 1. - Y /= np.sum(Y, axis=1)[:, np.newaxis] - return Y - - def decision_function(self, X): - """Returns the distance of each sample from the decision boundary for - each class. This can only be used with estimators which implement the - decision_function method. 
- Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Returns - ------- - T : array-like, shape = [n_samples, n_classes] - """ - check_is_fitted(self, 'estimators_') - if not hasattr(self.estimators_[0], "decision_function"): - raise AttributeError( - "Base estimator doesn't have a decision_function attribute.") - return np.array([est.decision_function(X).ravel() - for est in self.estimators_]).T - - @property - def multilabel_(self): - """Whether this is a multilabel classifier""" - return self.label_binarizer_.y_type_.startswith('multilabel') - - @property - def classes_(self): - return self.label_binarizer_.classes_ - - @property - def coef_(self): - check_is_fitted(self, 'estimators_') - if not hasattr(self.estimators_[0], "coef_"): - raise AttributeError( - "Base estimator doesn't have a coef_ attribute.") - coefs = [e.coef_ for e in self.estimators_] - if sp.issparse(coefs[0]): - return sp.vstack(coefs) - return np.vstack(coefs) - - @property - def intercept_(self): - check_is_fitted(self, 'estimators_') - if not hasattr(self.estimators_[0], "intercept_"): - raise AttributeError( - "Base estimator doesn't have an intercept_ attribute.") - return np.array([e.intercept_.ravel() for e in self.estimators_]) - - -def _fit_ovo_binary(estimator, X, y, i, j): - """Fit a single binary estimator (one-vs-one).""" - cond = np.logical_or(y == i, y == j) - y = y[cond] - y_binary = np.empty(y.shape, np.int) - y_binary[y == i] = 0 - y_binary[y == j] = 1 - ind = np.arange(X.shape[0]) - return _fit_binary(estimator, X[ind[cond]], y_binary, classes=[i, j]) - - -@deprecated("fit_ovo is deprecated and will be removed in 0.18." - "Use the OneVsOneClassifier instead.") -def fit_ovo(estimator, X, y, n_jobs=1): - ovo = OneVsOneClassifier(estimator, n_jobs=n_jobs).fit(X, y) - return ovo.estimators_, ovo.classes_ - - -@deprecated("predict_ovo is deprecated and will be removed in 0.18." - "Use the OneVsOneClassifier instead.") -def predict_ovo(estimators, classes, X): - """Make predictions using the one-vs-one strategy.""" - - e_types = set([type(e) for e in estimators if not - isinstance(e, _ConstantPredictor)]) - if len(e_types) > 1: - raise ValueError("List of estimators must contain estimators of the" - " same type but contains types {0}".format(e_types)) - - ovo = OneVsOneClassifier(clone(estimators[0])) - ovo.estimators_ = estimators - ovo.classes_ = classes - return ovo.predict(X) - - -class OneVsOneClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin): - """One-vs-one multiclass strategy - This strategy consists in fitting one classifier per class pair. - At prediction time, the class which received the most votes is selected. - Since it requires to fit `n_classes * (n_classes - 1) / 2` classifiers, - this method is usually slower than one-vs-the-rest, due to its - O(n_classes^2) complexity. However, this method may be advantageous for - algorithms such as kernel algorithms which don't scale well with - `n_samples`. This is because each individual learning problem only involves - a small subset of the data whereas, with one-vs-the-rest, the complete - dataset is used `n_classes` times. - Read more in the :ref:`User Guide <ovo_classification>`. - Parameters - ---------- - estimator : estimator object - An estimator object implementing `fit` and one of `decision_function` - or `predict_proba`. - n_jobs : int, optional, default: 1 - The number of jobs to use for the computation. If -1 all CPUs are used. 
- If 1 is given, no parallel computing code is used at all, which is - useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are - used. Thus for n_jobs = -2, all CPUs but one are used. - Attributes - ---------- - estimators_ : list of `n_classes * (n_classes - 1) / 2` estimators - Estimators used for predictions. - classes_ : numpy array of shape [n_classes] - Array containing labels. - """ - - def __init__(self, estimator, n_jobs=1): - self.estimator = estimator - self.n_jobs = n_jobs - - def fit(self, X, y): - """Fit underlying estimators. - Parameters - ---------- - X : (sparse) array-like, shape = [n_samples, n_features] - Data. - y : array-like, shape = [n_samples] - Multi-class targets. - Returns - ------- - self - """ - y = np.asarray(y) - check_consistent_length(X, y) - - self.classes_ = np.unique(y) - n_classes = self.classes_.shape[0] - self.estimators_ = Parallel(n_jobs=self.n_jobs)( - delayed(_fit_ovo_binary)( - self.estimator, X, y, self.classes_[i], self.classes_[j]) - for i in range(n_classes) for j in range(i + 1, n_classes)) - - return self - - def predict(self, X): - """Estimate the best class label for each sample in X. - This is implemented as ``argmax(decision_function(X), axis=1)`` which - will return the label of the class with most votes by estimators - predicting the outcome of a decision for each possible class pair. - Parameters - ---------- - X : (sparse) array-like, shape = [n_samples, n_features] - Data. - Returns - ------- - y : numpy array of shape [n_samples] - Predicted multi-class targets. - """ - Y = self.decision_function(X) - return self.classes_[Y.argmax(axis=1)] - - def decision_function(self, X): - """Decision function for the OneVsOneClassifier. - The decision values for the samples are computed by adding the - normalized sum of pair-wise classification confidence levels to the - votes in order to disambiguate between the decision values when the - votes for all the classes are equal leading to a tie. - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Returns - ------- - Y : array-like, shape = [n_samples, n_classes] - """ - check_is_fitted(self, 'estimators_') - - predictions = np.vstack([est.predict(X) for est in self.estimators_]).T - confidences = np.vstack([_predict_binary(est, X) for est in self.estimators_]).T - return _ovr_decision_function(predictions, confidences, - len(self.classes_)) - - -def _ovr_decision_function(predictions, confidences, n_classes): - """Compute a continuous, tie-breaking ovr decision function. - It is important to include a continuous value, not only votes, - to make computing AUC or calibration meaningful. - Parameters - ---------- - predictions : array-like, shape (n_samples, n_classifiers) - Predicted classes for each binary classifier. - confidences : array-like, shape (n_samples, n_classifiers) - Decision functions or predicted probabilities for positive class - for each binary classifier. - n_classes : int - Number of classes. 
n_classifiers must be - ``n_classes * (n_classes - 1 ) / 2`` - """ - n_samples = predictions.shape[0] - votes = np.zeros((n_samples, n_classes)) - sum_of_confidences = np.zeros((n_samples, n_classes)) - - k = 0 - for i in range(n_classes): - for j in range(i + 1, n_classes): - sum_of_confidences[:, i] -= confidences[:, k] - sum_of_confidences[:, j] += confidences[:, k] - votes[predictions[:, k] == 0, i] += 1 - votes[predictions[:, k] == 1, j] += 1 - k += 1 - - max_confidences = sum_of_confidences.max() - min_confidences = sum_of_confidences.min() - - if max_confidences == min_confidences: - return votes - - # Scale the sum_of_confidences to (-0.5, 0.5) and add it with votes. - # The motivation is to use confidence levels as a way to break ties in - # the votes without switching any decision made based on a difference - # of 1 vote. - eps = np.finfo(sum_of_confidences.dtype).eps - max_abs_confidence = max(abs(max_confidences), abs(min_confidences)) - scale = (0.5 - eps) / max_abs_confidence - return votes + sum_of_confidences * scale - - -@deprecated("fit_ecoc is deprecated and will be removed in 0.18." - "Use the OutputCodeClassifier instead.") -def fit_ecoc(estimator, X, y, code_size=1.5, random_state=None, n_jobs=1): - """Fit an error-correcting output-code strategy. - Parameters - ---------- - estimator : estimator object - An estimator object implementing `fit` and one of `decision_function` - or `predict_proba`. - code_size : float, optional - Percentage of the number of classes to be used to create the code book. - random_state : numpy.RandomState, optional - The generator used to initialize the codebook. Defaults to - numpy.random. - Returns - -------- - estimators : list of `int(n_classes * code_size)` estimators - Estimators used for predictions. - classes : numpy array of shape [n_classes] - Array containing labels. - code_book_ : numpy array of shape [n_classes, code_size] - Binary array containing the code of each class. - """ - ecoc = OutputCodeClassifier(estimator, random_state=random_state, - n_jobs=n_jobs).fit(X, y) - return ecoc.estimators_, ecoc.classes_, ecoc.code_book_ - - -@deprecated("predict_ecoc is deprecated and will be removed in 0.18." - "Use the OutputCodeClassifier instead.") -def predict_ecoc(estimators, classes, code_book, X): - """Make predictions using the error-correcting output-code strategy.""" - ecoc = OutputCodeClassifier(clone(estimators[0])) - ecoc.classes_ = classes - ecoc.estimators_ = estimators - ecoc.code_book_ = code_book - - return ecoc.predict(X) - - -class OutputCodeClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin): - """(Error-Correcting) Output-multiview_platform multiclass strategy - Output-code based strategies consist in representing each class with a - binary code (an array of 0s and 1s). At fitting time, one binary - classifier per bit in the code book is fitted. At prediction time, the - classifiers are used to project new points in the class space and the class - closest to the points is chosen. The main advantage of these strategies is - that the number of classifiers used can be controlled by the user, either - for compressing the model (0 < code_size < 1) or for making the model more - robust to errors (code_size > 1). See the documentation for more details. - Read more in the :ref:`User Guide <ecoc>`. - Parameters - ---------- - estimator : estimator object - An estimator object implementing `fit` and one of `decision_function` - or `predict_proba`. 
- code_size : float - Percentage of the number of classes to be used to create the code book. - A number between 0 and 1 will require fewer classifiers than - one-vs-the-rest. A number greater than 1 will require more classifiers - than one-vs-the-rest. - random_state : numpy.RandomState, optional - The generator used to initialize the codebook. Defaults to - numpy.random. - n_jobs : int, optional, default: 1 - The number of jobs to use for the computation. If -1 all CPUs are used. - If 1 is given, no parallel computing code is used at all, which is - useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are - used. Thus for n_jobs = -2, all CPUs but one are used. - Attributes - ---------- - estimators_ : list of `int(n_classes * code_size)` estimators - Estimators used for predictions. - classes_ : numpy array of shape [n_classes] - Array containing labels. - code_book_ : numpy array of shape [n_classes, code_size] - Binary array containing the code of each class. - References - ---------- - .. [1] "Solving multiclass learning problems via error-correcting output - codes", - Dietterich T., Bakiri G., - Journal of Artificial Intelligence Research 2, - 1995. - .. [2] "The error coding method and PICTs", - James G., Hastie T., - Journal of Computational and Graphical statistics 7, - 1998. - .. [3] "The Elements of Statistical Learning", - Hastie T., Tibshirani R., Friedman J., page 606 (second-edition) - 2008. - """ - - def __init__(self, estimator, code_size=1.5, random_state=None, n_jobs=1): - self.estimator = estimator - self.code_size = code_size - self.random_state = random_state - self.n_jobs = n_jobs - - def fit(self, X, y): - """Fit underlying estimators. - Parameters - ---------- - X : (sparse) array-like, shape = [n_samples, n_features] - Data. - y : numpy array of shape [n_samples] - Multi-class targets. - Returns - ------- - self - """ - if self.code_size <= 0: - raise ValueError("code_size should be greater than 0, got {1}" - "".format(self.code_size)) - - _check_estimator(self.estimator) - random_state = check_random_state(self.random_state) - - self.classes_ = np.unique(y) - n_classes = self.classes_.shape[0] - code_size_ = int(n_classes * self.code_size) - - # FIXME: there are more elaborate methods than generating the codebook - # randomly. - self.code_book_ = random_state.random_sample((n_classes, code_size_)) - self.code_book_[self.code_book_ > 0.5] = 1 - - if hasattr(self.estimator, "decision_function"): - self.code_book_[self.code_book_ != 1] = -1 - else: - self.code_book_[self.code_book_ != 1] = 0 - - classes_index = dict((c, i) for i, c in enumerate(self.classes_)) - - Y = np.array([self.code_book_[classes_index[y[i]]] - for i in range(X.shape[0])], dtype=np.int) - - self.estimators_ = Parallel(n_jobs=self.n_jobs)( - delayed(_fit_binary)(self.estimator, X, Y[:, i]) - for i in range(Y.shape[1])) - - return self - - def predict(self, X): - """Predict multi-class targets using underlying estimators. - Parameters - ---------- - X : (sparse) array-like, shape = [n_samples, n_features] - Data. - Returns - ------- - y : numpy array of shape [n_samples] - Predicted multi-class targets. 
- """ - check_is_fitted(self, 'estimators_') - Y = np.array([_predict_binary(e, X) for e in self.estimators_]).T - pred = euclidean_distances(Y, self.code_book_).argmin(axis=1) - return self.classes_[pred] diff --git a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/Classifiers/SubSampling.py b/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/Classifiers/SubSampling.py deleted file mode 100644 index d8f2bd5cc87fe78c7f3e1d3bda9fcfb04aee1d9b..0000000000000000000000000000000000000000 --- a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/Classifiers/SubSampling.py +++ /dev/null @@ -1,41 +0,0 @@ -import numpy as np - - -def getLabelSupports(CLASS_LABELS): - labels = set(CLASS_LABELS) - supports = [CLASS_LABELS.tolist().count(label) for label in labels] - return supports, dict((label, index) for label, index in zip(labels, range(len(labels)))) - - -def isUseful(nbTrainingExamples, index, CLASS_LABELS, labelDict): - if nbTrainingExamples[labelDict[CLASS_LABELS[index]]] != 0: - nbTrainingExamples[labelDict[CLASS_LABELS[index]]] -= 1 - return True, nbTrainingExamples - else: - return False, nbTrainingExamples - - -def subSample(data, labels, subSampling, randomState, weights=None): - if weights is None: - weights = np.ones(len(labels)) / len(labels) - nbExamples = len(labels) - labelSupports, labelDict = getLabelSupports(labels) - - nbTrainingExamples = [int(support * subSampling) if int(support * subSampling) > 0 else 1 - for support in labelSupports] - trainingExamplesIndices = [] - usedIndices = [] - while nbTrainingExamples != [0 for i in range(len(labelSupports))]: - index = int(randomState.randint(0, nbExamples - 1)) - isUseFull, nbTrainingExamples = isUseful(nbTrainingExamples, index, labels, labelDict) - if isUseFull and index not in usedIndices: - trainingExamplesIndices.append(index) - usedIndices.append(index) - subSampledData = [] - subSampledLabels = [] - subSampledWeights = [] - for index in trainingExamplesIndices: - subSampledData.append(data[index]) - subSampledLabels.append(labels[index]) - subSampledWeights.append(weights[index]) - return np.array(subSampledData), np.array(subSampledLabels), np.array(subSampledWeights) diff --git a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/Classifiers/__init__.py b/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/Classifiers/__init__.py deleted file mode 100644 index 8db5fdbaef4faa4a8cb43a126042d1abb03bbc24..0000000000000000000000000000000000000000 --- a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/Classifiers/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# from os import listdir -# from os.path import isfile, join -# mypath="." 
-# modules = [f[:-3] for f in listdir(mypath) if isfile(join(mypath, f)) and f[-3:] == ".py" and f!="__init__.py" ] -# __all__ = modules - -import os -for module in os.listdir(os.path.dirname(os.path.realpath(__file__))): - if module == '__init__.py' or module[-3:] != '.py': - continue - __import__(module[:-3], locals(), globals(), [], 1) -del module -del os diff --git a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/MumboModule.py b/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/MumboModule.py deleted file mode 100644 index dbb7ced858c5d6a692b62e83d93e197e42458fef..0000000000000000000000000000000000000000 --- a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/MumboModule.py +++ /dev/null @@ -1,523 +0,0 @@ -import itertools -import logging -import math -import pkgutil - -import numpy as np -from joblib import Parallel, delayed -from sklearn.metrics import accuracy_score - -from ...utils.Dataset import getV -from . import Classifiers - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -# Data shape : ((Views, Examples, Corrdinates)) - -def genName(config): - return "Mumbo" - - -def getBenchmark(benchmark, args=None): - allAlgos = [name for _, name, isPackage in - pkgutil.iter_modules("./MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/Classifiers") - if not isPackage and not name in ["SubSampling", "ModifiedMulticlass", "Kover"]] - if args is None or args.MU_types != ['']: - benchmark["Multiview"]["Mumbo"] = allAlgos - else: - benchmark["Multiview"]["Mumbo"] = args.MU_types - return benchmark - - -def getArgs(args, benchmark, views, viewsIndices, randomState, directory, resultsMonoview, classificationIndices): - argumentsList = [] - nbViews = len(views) - if args.MU_combination and args.MU_types != [""]: - classifiersCombinations = itertools.combinations_with_replacement(args.MU_types, nbViews) - for classifierCombination in classifiersCombinations: - arguments = {"CL_type": "Mumbo", - "views": views, - "NB_VIEW": len(views), - "viewsIndices": viewsIndices, - "NB_CLASS": len(args.CL_classes), - "LABELS_NAMES": args.CL_classes, - "MumboKWARGS": {"classifiersNames": classifierCombination, - "maxIter": int(args.MU_iter[0]), "minIter": int(args.MU_iter[1]), - "threshold": args.MU_iter[2], - "classifiersConfigs": [], - "nbView": (len(viewsIndices))}} - argumentsList.append(arguments) - else: - if len(args.MU_types) == nbViews: - pass - elif len(args.MU_types) < nbViews and args.MU_types != ['']: - while len(args.MU_types) < nbViews: - args.MU_types.append(args.MU_types[0]) - elif len(args.MU_types) > nbViews: - args.MU_types = args.MU_types[:nbViews] - else: - args.MU_types = ["DecisionTree" for _ in views] - classifiersModules = [getattr(Classifiers, classifierName) for classifierName in args.MU_types] - if args.MU_config != [""]: - arguments = {"CL_type": "Mumbo", - "views": views, - "NB_VIEW": len(views), - "viewsIndices": viewsIndices, - "NB_CLASS": len(args.CL_classes), - "LABELS_NAMES": args.CL_classes, - "MumboKWARGS": {"classifiersNames": args.MU_types, - "maxIter": int(args.MU_iter[0]), "minIter": int(args.MU_iter[1]), - "threshold": args.MU_iter[2], - "classifiersConfigs": [classifierModule.getKWARGS(argument.split(":"), randomState) for argument, classifierModule in - zip(args.MU_config, classifiersModules)], - "nbView": (len(viewsIndices))}} - else: - arguments = {"CL_type": "Mumbo", - "views": views, - "NB_VIEW": len(views), - "viewsIndices": 
viewsIndices, - "NB_CLASS": len(args.CL_classes), - "LABELS_NAMES": args.CL_classes, - "MumboKWARGS": {"classifiersNames": args.MU_types, - "maxIter": int(args.MU_iter[0]), "minIter": int(args.MU_iter[1]), - "threshold": args.MU_iter[2], - "classifiersConfigs": [], - "nbView": (len(viewsIndices))}} - argumentsList.append(arguments) - return argumentsList - - -def computeWeights(DATASET_LENGTH, iterIndex, viewIndice, CLASS_LABELS, costMatrices): - dist = np.sum(costMatrices[iterIndex, viewIndice]) - dist = dist - np.sum(np.array( - [costMatrices[iterIndex, viewIndice, exampleIndice, int(CLASS_LABELS[exampleIndice])] for exampleIndice in - range(DATASET_LENGTH)])) - - weights = np.array([-costMatrices[iterIndex, viewIndice, - exampleIndice, int(CLASS_LABELS[exampleIndice])] / dist - for exampleIndice in range(DATASET_LENGTH)]) - return weights - -# -# def trainWeakClassifier(classifierName, monoviewDataset, CLASS_LABELS, -# DATASET_LENGTH, viewIndice, classifier_config, iterIndex, costMatrices): -# weights = computeWeights(DATASET_LENGTH, iterIndex, viewIndice, CLASS_LABELS, costMatrices) -# classifierModule = globals()[classifierName] # Permet d'appeler une fonction avec une string -# classifierMethod = getattr(classifierModule, classifierName) -# classifier, classes, isBad, averageAccuracy = classifierMethod(monoviewDataset, CLASS_LABELS, classifier_config, -# weights) -# logging.debug("\t\t\tView " + str(viewIndice) + " : " + str(averageAccuracy)) -# return classifier, classes, isBad, averageAccuracy - - -def trainWeakClassifier_hdf5(classifier, classifierName, monoviewDataset, CLASS_LABELS, DATASET_LENGTH, - viewIndice, classifier_config, viewName, iterIndex, costMatrices, classifierIndex, - randomState, metric): - weights = computeWeights(DATASET_LENGTH, iterIndex, classifierIndex, CLASS_LABELS, costMatrices) - classifier, classes, isBad, averageScore = classifier.fit_hdf5(monoviewDataset, CLASS_LABELS, weights, metric) - if type(viewName) == bytes: - viewName = viewName.decode("utf-8") - logging.debug("\t\t\t"+viewName + " : " + str(averageScore)) - return classifier, classes, isBad, averageScore - - -def gridSearch_hdf5(DATASET, labels, viewIndices, classificationKWARGS, learningIndices, randomState, metric=None, nIter=None): - classifiersNames = classificationKWARGS["classifiersNames"] - bestSettings = [] - for classifierIndex, classifierName in enumerate(classifiersNames): - logging.debug("\tStart:\t Random search for " + classifierName + " on View" + str(viewIndices[classifierIndex])) - classifierModule = getattr(Classifiers, classifierName) # Permet d'appeler une fonction avec une string - classifierGridSearch = getattr(classifierModule, "hyperParamSearch") - bestSettings.append(classifierGridSearch(getV(DATASET, viewIndices[classifierIndex], learningIndices), - labels[learningIndices], randomState, - metric=metric)) - logging.debug("\tDone:\t Gridsearch for " + classifierName) - if None in bestSettings: - return None, None - else: - return bestSettings, None - - -class MumboClass: - def __init__(self, randomState, NB_CORES=1, **kwargs): - self.maxIter = kwargs["maxIter"] - self.minIter = kwargs["minIter"] - self.threshold = kwargs["threshold"] - classifiersClasses = [] - for classifierName in kwargs["classifiersNames"]: - classifierModule = getattr(Classifiers, classifierName) - classifiersClasses.append(getattr(classifierModule, classifierName)) - self.monoviewClassifiers = [classifierClass(**classifierConfig) - for classifierClass, classifierConfig - in zip(classifiersClasses, 
kwargs["classifiersConfigs"])] - self.classifiersNames = kwargs["classifiersNames"] - self.classifiersConfigs = kwargs["classifiersConfigs"] - nbView = kwargs["nbView"] - self.nbCores = NB_CORES - self.iterIndex = 0 - self.edges = np.zeros((self.maxIter, nbView)) - self.alphas = np.zeros((self.maxIter, nbView)) - self.generalAlphas = np.zeros(self.maxIter) - self.bestClassifiers = [] - self.bestViews = np.zeros(self.maxIter, dtype=int) - 1 - self.averageScores = np.zeros((self.maxIter, nbView)) - self.iterAccuracies = np.zeros(self.maxIter) - self.randomState = randomState - - def initDataDependant(self, trainLength, nbView, nbClass, labels): - self.edges = np.zeros((self.maxIter, nbView)) - self.alphas = np.zeros((self.maxIter, nbView)) - self.generalAlphas = np.zeros(self.maxIter) - self.bestClassifiers = [] - self.bestViews = np.zeros(self.maxIter, dtype=int) - 1 - self.averageScores = np.zeros((self.maxIter, nbView)) - self.costMatrices = np.array([ - np.array([ - np.array([ - np.array([1 if labels[exampleIndice] != classe - else -(nbClass - 1) - for classe in range(nbClass) - ]) for exampleIndice in - range(trainLength) - ]) for _ in range(nbView)]) - if iteration == 0 - else np.zeros((nbView, trainLength, nbClass)) - for iteration in range(self.maxIter + 1) - ]) - self.generalCostMatrix = np.array([ - np.array([ - np.array([1 if labels[exampleIndice] != classe - else -(nbClass - 1) - for classe in range(nbClass) - ]) for exampleIndice in range(trainLength) - ]) for _ in range(self.maxIter) - ]) - self.fs = np.zeros((self.maxIter, nbView, trainLength, nbClass)) - self.ds = np.zeros((self.maxIter, nbView, trainLength)) - self.predictions = np.zeros((self.maxIter, nbView, trainLength)) - self.generalFs = np.zeros((self.maxIter, trainLength, nbClass)) - - def fit_hdf5(self, DATASET, labels, trainIndices=None, viewsIndices=None, metric=["f1_score", None]): - - # Initialization - if self.classifiersConfigs is None: - pass - else: - if trainIndices is None: - trainIndices = range(DATASET.get("Metadata").attrs["datasetLength"]) - if type(viewsIndices) == type(None): - viewsIndices = range(DATASET.get("Metadata").attrs["nbView"]) - NB_CLASS = len(set(labels[trainIndices])) - NB_VIEW = len(viewsIndices) - trainLength = len(trainIndices) - LABELS = labels[trainIndices] - self.initDataDependant(trainLength, NB_VIEW, NB_CLASS, LABELS) - # Learning - isStabilized = False - self.iterIndex = 0 - while not isStabilized and not self.iterIndex >= self.maxIter - 1: - if self.iterIndex > self.minIter: - coeffs = np.polyfit(np.log(np.arange(self.iterIndex) + 0.00001), self.iterAccuracies[:self.iterIndex], - 1) - if abs(coeffs[0]) / self.iterIndex < self.threshold: - isStabilized = True - else: - pass - - logging.debug('\t\tStart:\t Iteration ' + str(self.iterIndex + 1)) - classifiers, predictedLabels, areBad = self.trainWeakClassifiers_hdf5(DATASET, labels, trainIndices, NB_CLASS, - trainLength, viewsIndices, metric) - if areBad.all(): - logging.warning("\t\tWARNING:\tAll bad for iteration " + str(self.iterIndex)) - - self.predictions[self.iterIndex] = predictedLabels - - for viewFakeIndex in range(NB_VIEW): - self.computeEdge(viewFakeIndex, trainLength, LABELS) - if areBad[viewFakeIndex]: - self.alphas[self.iterIndex, viewFakeIndex] = 0. 
- else: - self.alphas[self.iterIndex, viewFakeIndex] = self.computeAlpha( - self.edges[self.iterIndex, viewFakeIndex]) - - self.updateDs(LABELS, NB_VIEW, trainLength) - self.updateFs(NB_VIEW, trainLength, NB_CLASS) - - self.updateCostmatrices(NB_VIEW, trainLength, NB_CLASS, LABELS) - bestView, edge, bestFakeView = self.chooseView(viewsIndices, LABELS, trainLength) - self.bestViews[self.iterIndex] = bestView - logging.debug("\t\t\t Best view : \t\t View" + str(bestView)) - if areBad.all(): - self.generalAlphas[self.iterIndex] = 0. - else: - self.generalAlphas[self.iterIndex] = self.computeAlpha(edge) - self.bestClassifiers.append(classifiers[bestFakeView]) - self.updateGeneralFs(trainLength, NB_CLASS, bestFakeView) - self.updateGeneralCostMatrix(trainLength, NB_CLASS, LABELS) - predictedLabels = self.predict_hdf5(DATASET, usedIndices=trainIndices, viewsIndices=viewsIndices) - accuracy = accuracy_score(labels[trainIndices], predictedLabels) - self.iterAccuracies[self.iterIndex] = accuracy - - self.iterIndex += 1 - - def predict_hdf5(self, DATASET, usedIndices=None, viewsIndices=None): - NB_CLASS = 2 # DATASET.get("Metadata").attrs["nbClass"] #change if multiclass - if usedIndices is None: - usedIndices = range(DATASET.get("Metadata").attrs["datasetLength"]) - if viewsIndices is None: - viewsIndices = range(DATASET.get("Metadata").attrs["nbView"]) - if self.classifiersConfigs is None: - return np.zeros(len(usedIndices), dtype=int) - else: - viewDict = dict((viewIndex, index) for index, viewIndex in enumerate(viewsIndices)) - if usedIndices is not None: - DATASET_LENGTH = len(usedIndices) - predictedLabels = np.zeros(DATASET_LENGTH) - - for labelIndex, exampleIndex in enumerate(usedIndices): - votes = np.zeros(2) #change if multiclass - for classifier, alpha, view in zip(self.bestClassifiers, self.alphas, self.bestViews): - if view != -1: - data = getV(DATASET, int(view), int(exampleIndex)) - votes[int(classifier.predict(np.array([data])))] += alpha[viewDict[view]] - else: - pass - predictedLabels[labelIndex] = np.argmax(votes) - else: - predictedLabels = [] - return predictedLabels - - def predict_proba_hdf5(self, DATASET, usedIndices=None): - NB_CLASS = 2 # DATASET.get("Metadata").attrs["nbClass"] #change if multiclass - if usedIndices is None: - usedIndices = range(DATASET.get("Metadata").attrs["datasetLength"]) - DATASET_LENGTH = len(usedIndices) - predictedProbas = np.zeros((DATASET_LENGTH, NB_CLASS)) - if self.classifiersConfigs is None: - predictedProbas[:,0]=1.0 - else: - for labelIndex, exampleIndex in enumerate(usedIndices): - for classifier, alpha, view in zip(self.bestClassifiers, self.alphas, self.bestViews): - data = getV(DATASET, int(view), exampleIndex) - predictedProbas[labelIndex, int(classifier.predict(np.array([data])))] += alpha[view] - predictedProbas[labelIndex, :] = predictedProbas[labelIndex, :] / np.sum(predictedProbas[labelIndex, :]) - return predictedProbas - - # def trainWeakClassifiers(self, DATASET, CLASS_LABELS, NB_CLASS, DATASET_LENGTH, NB_VIEW): - # trainedClassifiers = [] - # labelsMatrix = [] - # areBad = [] - # if self.nbCores > NB_VIEW: - # NB_JOBS = NB_VIEW - # else: - # NB_JOBS = self.nbCores - # classifiersConfigs = self.classifiersConfigs - # costMatrices = self.costMatrices - # classifiersNames = self.classifiersNames - # iterIndex = self.iterIndex - # trainedClassifiersAndLabels = Parallel(n_jobs=NB_JOBS)( - # delayed(trainWeakClassifier)(classifiersNames[viewIndice], DATASET[viewIndice], CLASS_LABELS, - # DATASET_LENGTH, viewIndice, 
classifiersConfigs[viewIndice], iterIndex, - # costMatrices) - # for viewIndice in range(NB_VIEW)) - # - # for viewIndex, (classifier, labelsArray, isBad, averageAccuracy) in enumerate(trainedClassifiersAndLabels): - # self.averageScores[self.iterIndex, viewIndex] = averageAccuracy - # trainedClassifiers.append(classifier) - # labelsMatrix.append(labelsArray) - # areBad.append(isBad) - # return np.array(trainedClassifiers), np.array(labelsMatrix), np.array(areBad) - - def trainWeakClassifiers_hdf5(self, DATASET, labels, trainIndices, NB_CLASS, - DATASET_LENGTH, viewIndices, metric): - NB_VIEW = len(viewIndices) - trainedClassifiers = [] - labelsMatrix = [] - areBad = [] - if self.nbCores > NB_VIEW: - NB_JOBS = NB_VIEW - else: - NB_JOBS = self.nbCores - classifiersConfigs = self.classifiersConfigs - costMatrices = self.costMatrices - classifiersNames = self.classifiersNames - classifiers = self.monoviewClassifiers - iterIndex = self.iterIndex - trainedClassifiersAndLabels = Parallel(n_jobs=NB_JOBS)( - delayed(trainWeakClassifier_hdf5)(classifiers[classifierIndex], classifiersNames[classifierIndex], - getV(DATASET, viewIndex, trainIndices), - labels[trainIndices], - DATASET_LENGTH, - viewIndex, classifiersConfigs[classifierIndex], - DATASET.get("View" + str(viewIndex)).attrs["name"], iterIndex, - costMatrices, classifierIndex, self.randomState, metric) - for classifierIndex, viewIndex in enumerate(viewIndices)) - - for viewFakeIndex, (classifier, labelsArray, isBad, averageScore) in enumerate(trainedClassifiersAndLabels): - self.averageScores[self.iterIndex, viewFakeIndex] = averageScore - trainedClassifiers.append(classifier) - labelsMatrix.append(labelsArray) - areBad.append(isBad) - return np.array(trainedClassifiers), np.array(labelsMatrix), np.array(areBad) - - def computeEdge(self, viewFakeIndex, DATASET_LENGTH, CLASS_LABELS): - predictionMatrix = self.predictions[self.iterIndex, viewFakeIndex] - costMatrix = self.costMatrices[self.iterIndex, viewFakeIndex] - cCost = float(np.sum(np.array( - [costMatrix[exampleIndice, int(predictionMatrix[exampleIndice])] for exampleIndice in - range(DATASET_LENGTH)]))) - tCost = float(np.sum( - np.array([-costMatrix[exampleIndice, int(CLASS_LABELS[exampleIndice])] for exampleIndice in - range(DATASET_LENGTH)]))) - if tCost == 0.: - self.edges[self.iterIndex, viewFakeIndex] = -cCost - else: - self.edges[self.iterIndex, viewFakeIndex] = -cCost / tCost - - def computeAlpha(self, edge): - if 1 > edge > -1: - return 0.5 * math.log((1 + edge) / (1 - edge)) - else: - return 0 - - def allViewsClassifyBadly(self, predictions, pastIterIndice, NB_VIEW, CLASS_LABEL, exampleIndice): - boolean = True - for viewIndice in range(NB_VIEW): - if predictions[pastIterIndice, viewIndice, exampleIndice] == CLASS_LABEL: - boolean = False - return boolean - - def updateDs(self, CLASS_LABELS, NB_VIEW, DATASET_LENGTH): - for viewIndice in range(NB_VIEW): - for exampleIndice in range(DATASET_LENGTH): - for pastIterIndice in range(self.iterIndex): - - if self.predictions[pastIterIndice, viewIndice, exampleIndice] \ - == \ - CLASS_LABELS[exampleIndice] \ - or self.allViewsClassifyBadly(self.predictions, pastIterIndice, - NB_VIEW, CLASS_LABELS[exampleIndice], - exampleIndice): - - self.ds[pastIterIndice, viewIndice, exampleIndice] = 1 - else: - self.ds[pastIterIndice, viewIndice, exampleIndice] = 0 - - def updateFs(self, NB_VIEW, DATASET_LENGTH, NB_CLASS): - for viewIndice in range(NB_VIEW): - for exampleIndice in range(DATASET_LENGTH): - for classe in range(NB_CLASS): - 
self.fs[self.iterIndex, viewIndice, exampleIndice, classe] \ - = np.sum(np.array([self.alphas[pastIterIndice, viewIndice] - * self.ds[pastIterIndice, viewIndice, exampleIndice] - if self.predictions[pastIterIndice, viewIndice, - exampleIndice] - == - classe - else 0 - for pastIterIndice in range(self.iterIndex)])) - if np.amax(np.absolute(self.fs)) != 0: - self.fs /= np.amax(np.absolute(self.fs)) - - def updateCostmatrices(self, NB_VIEW, DATASET_LENGTH, NB_CLASS, CLASS_LABELS): - for viewIndice in range(NB_VIEW): - for exampleIndice in range(DATASET_LENGTH): - for classe in range(NB_CLASS): - if classe != CLASS_LABELS[exampleIndice]: - self.costMatrices[self.iterIndex + 1, viewIndice, exampleIndice, classe] \ - = 1.0 * math.exp(self.fs[self.iterIndex, viewIndice, exampleIndice, classe] - - self.fs[self.iterIndex, viewIndice, exampleIndice, int( - CLASS_LABELS[exampleIndice])]) - else: - self.costMatrices[self.iterIndex + 1, viewIndice, exampleIndice, classe] \ - = -1. * np.sum(np.exp(self.fs[self.iterIndex, viewIndice, exampleIndice] - - self.fs[self.iterIndex, viewIndice, exampleIndice, classe])) - self.costMatrices /= np.amax(np.absolute(self.costMatrices)) - - def chooseView(self, viewIndices, CLASS_LABELS, DATASET_LENGTH): - for viewIndex in range(len(viewIndices)): - self.computeEdge(viewIndex, DATASET_LENGTH, CLASS_LABELS) - - bestFakeView = np.argmax(self.edges[self.iterIndex, :]) - bestView = viewIndices[np.argmax(self.edges[self.iterIndex, :])] - return bestView, self.edges[self.iterIndex, bestFakeView], bestFakeView - - def updateGeneralFs(self, DATASET_LENGTH, NB_CLASS, bestView): - for exampleIndice in range(DATASET_LENGTH): - for classe in range(NB_CLASS): - self.generalFs[self.iterIndex, exampleIndice, classe] \ - = np.sum(np.array([self.generalAlphas[pastIterIndice] - if self.predictions[pastIterIndice, - bestView, - exampleIndice] - == - classe - else 0 - for pastIterIndice in range(self.iterIndex) - ]) - ) - if np.amax(np.absolute(self.generalFs)) != 0: - self.generalFs /= np.amax(np.absolute(self.generalFs)) - - def updateGeneralCostMatrix(self, DATASET_LENGTH, NB_CLASS, CLASS_LABELS): - for exampleIndice in range(DATASET_LENGTH): - for classe in range(NB_CLASS): - if classe != CLASS_LABELS[exampleIndice]: - self.generalCostMatrix[self.iterIndex, exampleIndice, classe] \ - = math.exp(self.generalFs[self.iterIndex, exampleIndice, classe] - - self.generalFs[self.iterIndex, exampleIndice, int(CLASS_LABELS[exampleIndice])]) - else: - self.generalCostMatrix[self.iterIndex, exampleIndice, classe] \ - = -1 * np.sum(np.exp(self.generalFs[self.iterIndex, exampleIndice] - - self.generalFs[self.iterIndex, exampleIndice, classe])) - - def predict(self, DATASET, NB_CLASS=2): - DATASET_LENGTH = len(DATASET[0]) - predictedLabels = np.zeros(DATASET_LENGTH) - - for exampleIndice in range(DATASET_LENGTH): - votes = np.zeros(NB_CLASS) - for classifier, alpha, view in zip(self.bestClassifiers, self.alphas, self.bestViews): - data = DATASET[int(view)][exampleIndice] - votes[int(classifier.predict(np.array([data])))] += alpha - predictedLabels[exampleIndice] = np.argmax(votes) - return predictedLabels - - def classifyMumbobyIter(self, DATASET, NB_CLASS=2): - DATASET_LENGTH = len(DATASET[0]) - NB_ITER = len(self.bestClassifiers) - predictedLabels = np.zeros((DATASET_LENGTH, NB_ITER)) - votes = np.zeros((DATASET_LENGTH, NB_CLASS)) - - for classifier, alpha, view, iterIndice in zip(self.bestClassifiers, self.alphas, self.bestViews, - range(NB_ITER)): - votesByIter = np.zeros((DATASET_LENGTH, 
NB_CLASS)) - - for exampleIndice in range(DATASET_LENGTH): - data = np.array([np.array(DATASET[int(view)][exampleIndice])]) - votesByIter[exampleIndice, int(self.predict(data, NB_CLASS))] += alpha - votes[exampleIndice] = votes[exampleIndice] + np.array(votesByIter[exampleIndice]) - predictedLabels[exampleIndice, iterIndice] = np.argmax(votes[exampleIndice]) - - return np.transpose(predictedLabels) - - def classifyMumbobyIter_hdf5(self, DATASET, fakeViewsIndicesDict, usedIndices=None, NB_CLASS=2): - if usedIndices is None: - usedIndices = range(DATASET.get("Metadata").attrs["datasetLength"]) - DATASET_LENGTH = len(usedIndices) - predictedLabels = np.zeros((DATASET_LENGTH, self.maxIter)) - votes = np.zeros((DATASET_LENGTH, NB_CLASS)) - - for iterIndex, (classifier, alpha, view) in enumerate(zip(self.bestClassifiers, self.alphas, self.bestViews)): - votesByIter = np.zeros((DATASET_LENGTH, NB_CLASS)) - - for usedExampleIndex, exampleIndex in enumerate(usedIndices): - data = np.array([np.array(getV(DATASET, int(view), int(exampleIndex)))]) - votesByIter[usedExampleIndex, int(classifier.predict(data))] += alpha[fakeViewsIndicesDict[view]] - votes[usedExampleIndex] = votes[usedExampleIndex] + np.array(votesByIter[usedExampleIndex]) - predictedLabels[usedExampleIndex, iterIndex] = np.argmax(votes[usedExampleIndex]) - - return np.transpose(predictedLabels) diff --git a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/__init__.py b/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/__init__.py deleted file mode 100644 index 059b8c7195747f0c29f238811631d756bd47d67a..0000000000000000000000000000000000000000 --- a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from . import MumboModule, analyzeResults - -__all__ = ["MumboModule.py", "Classifiers"] diff --git a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/analyzeResults.py b/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/analyzeResults.py deleted file mode 100644 index db5777a06e899bb94aaaab3e33aa02ce63624d37..0000000000000000000000000000000000000000 --- a/multiview_platform/MonoMultiViewClassifiers/MultiviewClassifiers/Mumbo/analyzeResults.py +++ /dev/null @@ -1,235 +0,0 @@ -import operator -from datetime import timedelta as hms - -import matplotlib.pyplot as plt -import numpy as np - -from ... import Metrics -from ...utils.Dataset import getV, getShape -from . 
import Classifiers -from ...utils.MultiviewResultAnalysis import printMetricScore, getMetricsScores - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def findMainView(bestViews): - views = list(set(bestViews)) - viewCount = np.array([list(bestViews).count(view) for view in views]) - mainView = views[np.argmax(viewCount)] - return mainView - - -def plotAccuracyByIter(scoresOnTainByIter, scoresOnTestByIter, features, classifierAnalysis): - x = range(len(scoresOnTainByIter)) - figure = plt.figure() - ax1 = figure.add_subplot(111) - axes = figure.gca() - axes.set_ylim([0.40, 1.00]) - titleString = "" - for view, classifierConfig in zip(features, classifierAnalysis): - titleString += "\n" + view + " : " + classifierConfig - - ax1.set_title("Score depending on iteration", fontsize=20) - plt.text(0.5, 1.08, titleString, - horizontalalignment='center', - fontsize=8, - transform=ax1.transAxes) - figure.subplots_adjust(top=0.8) - ax1.set_xlabel("Iteration Index") - ax1.set_ylabel("Accuracy") - ax1.plot(x, scoresOnTainByIter, c='red', label='Train') - ax1.plot(x, scoresOnTestByIter, c='black', label='Test') - - ax1.legend(loc='lower center', - ncol=3, fancybox=True, shadow=True) - - return '-accuracyByIteration', figure - - -def classifyMumbobyIter_hdf5(usedIndices, DATASET, classifiers, alphas, views, NB_CLASS): - DATASET_LENGTH = len(usedIndices) - NB_ITER = len(classifiers) - predictedLabels = np.zeros((DATASET_LENGTH, NB_ITER)) - votes = np.zeros((DATASET_LENGTH, NB_CLASS)) - - for classifier, alpha, view, iterIndex in zip(classifiers, alphas, views, range(NB_ITER)): - votesByIter = np.zeros((DATASET_LENGTH, NB_CLASS)) - - for usedExampleIndex, exampleIndex in enumerate(usedIndices): - data = np.array([np.array(getV(DATASET, int(view), exampleIndex))]) - votesByIter[usedExampleIndex, int(classifier.predict(data))] += alpha - votes[usedExampleIndex] = votes[usedExampleIndex] + np.array(votesByIter[usedExampleIndex]) - predictedLabels[usedExampleIndex, iterIndex] = np.argmax(votes[usedExampleIndex]) - - return np.transpose(predictedLabels) - - -def error(testLabels, computedLabels): - error = sum(map(operator.ne, computedLabels, testLabels)) - return float(error) * 100 / len(computedLabels) - - -def getDBConfig(DATASET, LEARNING_RATE, nbFolds, databaseName, validationIndices, LABELS_DICTIONARY): - nbView = DATASET.get("Metadata").attrs["nbView"] - viewNames = [DATASET.get("View" + str(viewIndex)).attrs["name"] - if type(DATASET.get("View" + str(viewIndex)).attrs["name"]) != bytes - else DATASET.get("View" + str(viewIndex)).attrs["name"].decode("utf-8") - for viewIndex in range(nbView)] - viewShapes = [getShape(DATASET, viewIndex) for viewIndex in range(nbView)] - DBString = "Dataset info :\n\t-Dataset name : " + databaseName - DBString += "\n\t-Labels : " + ', '.join(LABELS_DICTIONARY.values()) - DBString += "\n\t-Views : " + ', '.join([viewName + " of shape " + str(viewShape) - for viewName, viewShape in zip(viewNames, viewShapes)]) - DBString += "\n\t-" + str(nbFolds) + " folds" - DBString += "\n\t- Validation set length : " + str(len(validationIndices)) + " for learning rate : " + str( - LEARNING_RATE) + " on a total number of examples of " + str(DATASET.get("Metadata").attrs["datasetLength"]) - DBString += "\n\n" - return DBString, viewNames - - -def getAlgoConfig(classifier, classificationKWARGS, nbCores, viewNames, hyperParamSearch, nIter, times): - maxIter = classificationKWARGS["maxIter"] - minIter = 
classificationKWARGS["minIter"] - threshold = classificationKWARGS["threshold"] - extractionTime, classificationTime = times - weakClassifierConfigs = [getattr(getattr(Classifiers, classifierName), 'getConfig')(classifiersConfig) for classifiersConfig, - classifierName - in zip(classifier.classifiersConfigs, classifier.classifiersNames)] - classifierAnalysis = [classifierName + " " + weakClassifierConfig + "on " + feature for classifierName, - weakClassifierConfig, - feature - in zip(classifier.classifiersNames, weakClassifierConfigs, viewNames)] - gridSearchString = "" - if hyperParamSearch: - gridSearchString += "Configurations found by randomized search with " + str(nIter) + " iterations" - algoString = "\n\nMumbo configuration : \n\t-Used " + str(nbCores) + " core(s)" - algoString += "\n\t-Iterations : min " + str(minIter) + ", max " + str(maxIter) + ", threshold " + str(threshold) - algoString += "\n\t-Weak Classifiers : " + "\n\t\t-".join(classifierAnalysis) - algoString += "\n\n" - algoString += "\n\nComputation time on " + str(nbCores) + " cores : \n\tDatabase extraction time : " + str( - hms(seconds=int(extractionTime))) + "\n\t" - algoString += "\n\tSo a total classification time of " + str(hms(seconds=int(classificationTime))) + ".\n\n" - algoString += "\n\n" - return algoString, classifierAnalysis - - -def getReport(classifier, CLASS_LABELS, classificationIndices, DATASET, trainLabels, - testLabels, viewIndices, metric): - learningIndices, validationIndices, multiviewTestIndices = classificationIndices - nbView = len(viewIndices) - NB_CLASS = len(set(CLASS_LABELS)) - metricModule = getattr(Metrics, metric[0]) - fakeViewsIndicesDict = dict( - (viewIndex, fakeViewIndex) for viewIndex, fakeViewIndex in zip(viewIndices, range(nbView))) - trainScore = metricModule.score(CLASS_LABELS[learningIndices], trainLabels) - testScore = metricModule.score(CLASS_LABELS[validationIndices], testLabels) - mumboClassifier = classifier - maxIter = mumboClassifier.iterIndex - meanAverageAccuracies = np.mean(mumboClassifier.averageScores, axis=0) - viewsStats = np.array([float(list(mumboClassifier.bestViews).count(viewIndex)) / - len(mumboClassifier.bestViews) for viewIndex in range(nbView)]) - PredictedTrainLabelsByIter = mumboClassifier.classifyMumbobyIter_hdf5(DATASET, fakeViewsIndicesDict, - usedIndices=learningIndices, - NB_CLASS=NB_CLASS) - PredictedTestLabelsByIter = mumboClassifier.classifyMumbobyIter_hdf5(DATASET, fakeViewsIndicesDict, - usedIndices=validationIndices, - NB_CLASS=NB_CLASS) - scoresByIter = np.zeros((len(PredictedTestLabelsByIter), 2)) - for iterIndex, (iterPredictedTrainLabels, iterPredictedTestLabels) in enumerate( - zip(PredictedTrainLabelsByIter, PredictedTestLabelsByIter)): - scoresByIter[iterIndex, 0] = metricModule.score(CLASS_LABELS[learningIndices], iterPredictedTrainLabels) - scoresByIter[iterIndex, 1] = metricModule.score(CLASS_LABELS[validationIndices], iterPredictedTestLabels) - - scoresOnTainByIter = [scoresByIter[iterIndex, 0] for iterIndex in range(maxIter)] - - scoresOnTestByIter = [scoresByIter[iterIndex, 1] for iterIndex in range(maxIter)] - - return (trainScore, testScore, meanAverageAccuracies, viewsStats, scoresOnTainByIter, - scoresOnTestByIter) - - -def iterRelevant(iterIndex, kFoldClassifierStats): - relevants = np.zeros(len(kFoldClassifierStats[0]), dtype=bool) - for statsIterIndex, kFoldClassifier in enumerate(kFoldClassifierStats): - for classifierIndex, classifier in enumerate(kFoldClassifier): - if classifier.iterIndex >= iterIndex: - 
relevants[classifierIndex] = True - return relevants - - -def modifiedMean(surplusAccuracies): - maxLen = 0 - for foldAccuracies in surplusAccuracies.values(): - if len(foldAccuracies) > maxLen: - maxLen = len(foldAccuracies) - meanAccuracies = [] - for accuracyIndex in range(maxLen): - accuraciesToMean = [] - for foldIndex in surplusAccuracies.keys(): - try: - accuraciesToMean.append(surplusAccuracies[foldIndex][accuracyIndex]) - except: - pass - meanAccuracies.append(np.mean(np.array(accuraciesToMean))) - return meanAccuracies - - -def getMeanIterations(kFoldClassifierStats, foldIndex): - iterations = np.array([kFoldClassifier[foldIndex].iterIndex + 1 for kFoldClassifier in kFoldClassifierStats]) - return np.mean(iterations) - - -def execute(classifier, trainLabels, - testLabels, DATASET, - classificationKWARGS, classificationIndices, - LABELS_DICTIONARY, views, nbCores, times, - databaseName, KFolds, - hyperParamSearch, nIter, metrics, - viewsIndices, randomState, labels, classifierModule): - - learningIndices, validationIndices, testIndicesMulticlass = classificationIndices - if classifier.classifiersConfigs is None: - metricsScores = getMetricsScores(metrics, trainLabels, testLabels, - validationIndices, learningIndices, labels) - return "No good setting for monoview classifier", None, metricsScores - else: - LEARNING_RATE = len(learningIndices) / (len(learningIndices) + len(validationIndices)) - nbFolds = KFolds.n_splits - - CLASS_LABELS = labels - - dbConfigurationString, viewNames = getDBConfig(DATASET, LEARNING_RATE, nbFolds, databaseName, validationIndices, - LABELS_DICTIONARY) - algoConfigurationString, classifierAnalysis = getAlgoConfig(classifier, classificationKWARGS, nbCores, viewNames, - hyperParamSearch, nIter, times) - - (totalScoreOnTrain, totalScoreOnTest, meanAverageAccuracies, viewsStats, scoresOnTainByIter, - scoresOnTestByIter) = getReport(classifier, CLASS_LABELS, classificationIndices, DATASET, - trainLabels, testLabels, viewsIndices, metrics[0]) - - stringAnalysis = "\t\tResult for Multiview classification with Mumbo with random state : " + str(randomState) + \ - "\n\nAverage " + metrics[0][0] + " :\n\t-On Train : " + str( - totalScoreOnTrain) + "\n\t-On Test : " + \ - str(totalScoreOnTest) - stringAnalysis += dbConfigurationString - stringAnalysis += algoConfigurationString - metricsScores = getMetricsScores(metrics, trainLabels, testLabels, - validationIndices, learningIndices, labels) - stringAnalysis += printMetricScore(metricsScores, metrics) - stringAnalysis += "Mean average scores and stats :" - for viewIndex, (meanAverageAccuracy, bestViewStat) in enumerate(zip(meanAverageAccuracies, viewsStats)): - stringAnalysis += "\n\t- On " + viewNames[viewIndex] + \ - " : \n\t\t- Mean average Accuracy : " + str(meanAverageAccuracy) + \ - "\n\t\t- Percentage of time chosen : " + str(bestViewStat) - stringAnalysis += "\n\n For each iteration : " - for iterIndex in range(len(scoresOnTainByIter)): - stringAnalysis += "\n\t- Iteration " + str(iterIndex + 1) - stringAnalysis += "\n\t\tScore on train : " + \ - str(scoresOnTainByIter[iterIndex]) + '\n\t\tScore on test : ' + \ - str(scoresOnTestByIter[iterIndex]) - - name, image = plotAccuracyByIter(scoresOnTainByIter, scoresOnTestByIter, views, classifierAnalysis) - imagesAnalysis = {name: image} - return stringAnalysis, imagesAnalysis, metricsScores diff --git a/multiview_platform/MonoMultiViewClassifiers/utils/HyperParameterSearch.py b/multiview_platform/MonoMultiViewClassifiers/utils/HyperParameterSearch.py index 
54cc0a376610871dc5864666d917db0c945e2212..60d61816e5d5c33a004fefcf2c1c1f71e783737c 100644 --- a/multiview_platform/MonoMultiViewClassifiers/utils/HyperParameterSearch.py +++ b/multiview_platform/MonoMultiViewClassifiers/utils/HyperParameterSearch.py @@ -7,11 +7,12 @@ from .. import Metrics def searchBestSettings(dataset, labels, classifierPackage, classifierName, metrics, iLearningIndices, iKFolds, randomState, viewsIndices=None, - searchingTool="hyperParamSearch", nIter=1, **kwargs): + searchingTool="randomizedSearch", nIter=1, **kwargs): """Used to select the right hyperparam optimization function to optimize hyper parameters""" if viewsIndices is None: viewsIndices = range(dataset.get("Metadata").attrs["nbView"]) thismodule = sys.modules[__name__] + searchingTool = "randomizedSearch" # TODO: find a nicer way to configure multiview classifiers without a hyperparameter search searchingToolMethod = getattr(thismodule, searchingTool) bestSettings = searchingToolMethod(dataset, labels, classifierPackage, classifierName, metrics, iLearningIndices, iKFolds, randomState, viewsIndices=viewsIndices, nIter=nIter, **kwargs)
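Note on the hunk above: with the added override, searchBestSettings always resolves the search tool to randomizedSearch via getattr on the current module, regardless of the searchingTool value the caller passes. A minimal, self-contained sketch of that getattr dispatch pattern follows; the stand-in function bodies are illustrative only and are not the platform's real implementations.

import sys


def randomizedSearch(dataset, labels, nIter=1, **kwargs):
    # Stand-in for the real randomizedSearch defined in HyperParameterSearch.py.
    return {"tool": "randomizedSearch", "nIter": nIter}


def searchBestSettings_sketch(dataset, labels, searchingTool="randomizedSearch", nIter=1, **kwargs):
    # The patch pins the tool name until multiview classifiers can be
    # configured without a hyperparameter search.
    searchingTool = "randomizedSearch"
    thismodule = sys.modules[__name__]
    # getattr-based dispatch: the string must name a function defined in this module.
    searchingToolMethod = getattr(thismodule, searchingTool)
    return searchingToolMethod(dataset, labels, nIter=nIter, **kwargs)


# Even if a caller asks for another tool, the override wins:
print(searchBestSettings_sketch(None, None, searchingTool="gridSearch", nIter=5))
# {'tool': 'randomizedSearch', 'nIter': 5}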