Commit e3e36cb1 authored by bbauvin

Mumbo can use as many classifiers as needed and the grid search is now sklearn's

parent ff0b76cd
@@ -537,13 +537,17 @@ groupSCM.add_argument('--CL_SCM_model_type', metavar='STRING', action='store',
 groupMumbo = parser.add_argument_group('Mumbo arguments')
 groupMumbo.add_argument('--MU_types', metavar='STRING', action='store', nargs="+",
                         help='Determine which monoview classifier to use with Mumbo',
-                        default=['DecisionTree', 'DecisionTree', 'DecisionTree'])
+                        default=[''])
 groupMumbo.add_argument('--MU_config', metavar='STRING', action='store', nargs='+',
-                        help='Configuration for the monoview classifier in Mumbo',
-                        default=['2:0.5', '2:0.5', '2:0.5'])
+                        help='Configuration for the monoview classifiers in Mumbo; separate classifiers with a space and their arguments with ":"',
+                        default=[''])
 groupMumbo.add_argument('--MU_iter', metavar='INT', action='store', nargs=3,
                         help='Max number of iterations, min number of iterations, convergence threshold', type=float,
                         default=[10, 1, 0.01])
+groupMumbo.add_argument('--MU_combination', action='store_true',
+                        help='Try all the monoview classifier combinations for each view',
+                        default=False)
 groupFusion = parser.add_argument_group('Fusion arguments')
 groupFusion.add_argument('--FU_types', metavar='STRING', action='store', nargs="+",
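The reworked help string says each entry of --MU_config is one space-separated token whose fields are separated by ':'. A minimal sketch of how such tokens could be split into per-classifier argument lists; the field order (depth, criterion, splitter, subSampling) is taken from the DecisionTree getKWARGS further down this diff, and the example values are made up:

# Illustrative values, not defaults from the repository.
mu_types = ["DecisionTree", "DecisionTree"]
mu_config = ["3:gini:best:1.0", "1:entropy:random:0.5"]

# One argument list per classifier, in the same order as --MU_types.
configs = [token.split(":") for token in mu_config]
for name, argList in zip(mu_types, configs):
    print(name + " -> " + str(argList))
# DecisionTree -> ['3', 'gini', 'best', '1.0']
# DecisionTree -> ['1', 'entropy', 'random', '0.5']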
@@ -670,6 +674,15 @@ if statsIter > 1:
     else:
         iterResults = []
         for iterIndex in range(statsIter):
+            if not os.path.exists(os.path.dirname(directories[iterIndex] + "train_labels.csv")):
+                try:
+                    os.makedirs(os.path.dirname(directories[iterIndex] + "train_labels.csv"))
+                except OSError as exc:
+                    if exc.errno != errno.EEXIST:
+                        raise
+            trainIndices, testIndices = classificationIndices[iterIndex]
+            trainLabels = DATASET.get("Labels").value[trainIndices]
+            np.savetxt(directories[iterIndex] + "train_labels.csv", trainLabels, delimiter=",")
             iterResults.append(
                 classifyOneIter(LABELS_DICTIONARY, argumentDictionaries, nbCores, directories[iterIndex], args,
                                 classificationIndices[iterIndex], kFolds[iterIndex], statsIterRandomStates[iterIndex],
@@ -678,6 +691,15 @@ if statsIter > 1:
     analyzeIterResults(iterResults, args.name, metrics, directory)
 else:
+    if not os.path.exists(os.path.dirname(directories + "train_labels.csv")):
+        try:
+            os.makedirs(os.path.dirname(directories + "train_labels.csv"))
+        except OSError as exc:
+            if exc.errno != errno.EEXIST:
+                raise
+    trainIndices, testIndices = classificationIndices
+    trainLabels = DATASET.get("Labels").value[trainIndices]
+    np.savetxt(directories + "train_labels.csv", trainLabels, delimiter=",")
     res = classifyOneIter(LABELS_DICTIONARY, argumentDictionaries, nbCores, directories, args, classificationIndices,
                           kFolds,
                           statsIterRandomStates, hyperParamSearch, metrics, DATASET, viewsIndices, dataBaseTime, start,
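Both branches now repeat the same "create the output directory if needed, then dump the training labels" block. A small helper along these lines (the name is mine, not part of the commit) would keep the two call sites identical:

import errno
import os

import numpy as np


def saveTrainLabels(directory, trainLabels):
    # Write train_labels.csv in `directory`, creating the directory if it is missing.
    path = directory + "train_labels.csv"
    if not os.path.exists(os.path.dirname(path)):
        try:
            os.makedirs(os.path.dirname(path))
        except OSError as exc:
            # A concurrent process may have created the directory in the meantime.
            if exc.errno != errno.EEXIST:
                raise
    np.savetxt(path, trainLabels, delimiter=",")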
@@ -66,7 +66,7 @@ def ExecMultiview(directory, DATASET, name, classificationIndices, KFolds, nbCor
     else:
         classifier = classifierClass(randomState, NB_CORES=nbCores, **classificationKWARGS)
-    classifier.fit_hdf5(DATASET, trainIndices=learningIndices, viewsIndices=viewsIndices)
+    classifier.fit_hdf5(DATASET, trainIndices=learningIndices, viewsIndices=viewsIndices, metric=metrics[0])
     trainLabels = classifier.predict_hdf5(DATASET, usedIndices=learningIndices, viewsIndices=viewsIndices)
     testLabels = classifier.predict_hdf5(DATASET, usedIndices=validationIndices, viewsIndices=viewsIndices)
     fullLabels = classifier.predict_hdf5(DATASET, viewsIndices=viewsIndices)
@@ -160,7 +160,7 @@ class Fusion:
     def setParams(self, paramsSet):
         self.classifier.setParams(paramsSet)

-    def fit_hdf5(self, DATASET, trainIndices=None, viewsIndices=None):
+    def fit_hdf5(self, DATASET, trainIndices=None, viewsIndices=None, metric=["f1_score", None]):
         self.classifier.fit_hdf5(DATASET, trainIndices=trainIndices, viewsIndices=viewsIndices)

     def predict_hdf5(self, DATASET, usedIndices=None, viewsIndices=None):
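Fusion.fit_hdf5 now takes the metric as a two-element list, [metricName, metricConfig], the convention used throughout this commit; the name is later resolved with getattr against the project's Metrics package (see the DecisionTree changes below). A rough stand-alone sketch of that lookup, using sklearn.metrics as a stand-in for the repository's Metrics module:

import sklearn.metrics

# [metricName, metricConfig]; the config part is unused here because it is None.
metric = ["f1_score", None]
metricFunction = getattr(sklearn.metrics, metric[0])
print(metricFunction([0, 1, 1, 0], [0, 1, 0, 0]))   # ~0.667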
# The sklearn submodules used below must be imported explicitly.
import sklearn.tree
import sklearn.model_selection
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
from ModifiedMulticlass import OneVsRestClassifier
from SubSampling import subSample
import logging

# Add weights
import Metrics


class DecisionTree(BaseEstimator, ClassifierMixin):
    def __init__(self, depth=10, criterion="gini", splitter="best", subSampling=1.0, randomState=None, **kwargs):
        # Accept either explicit keyword arguments or a ready-made kwargs dict
        # (as produced by getKWARGS below).
        if kwargs:
            self.depth = kwargs["depth"]
            self.criterion = kwargs["criterion"]
            self.splitter = kwargs["splitter"]
            self.subSampling = kwargs["subSampling"]
            self.randomState = kwargs["randomState"]
        else:
            self.depth = depth
            self.criterion = criterion
            self.splitter = splitter
            self.subSampling = subSampling
            if randomState is None:
                self.randomState = np.random.RandomState()
            else:
                self.randomState = randomState
        self.decisionTree = sklearn.tree.DecisionTreeClassifier(splitter=self.splitter, criterion=self.criterion,
                                                                 max_depth=self.depth)

    def fit(self, data, labels, sample_weight=None):
        if sample_weight is None:
            sample_weight = np.ones(len(data)) / len(data)
        if self.subSampling != 1.0:
            subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, self.subSampling,
                                                                            self.randomState, weights=sample_weight)
        else:
            subSampledData, subSampledLabels, subSampledWeights = data, labels, sample_weight
        self.decisionTree.fit(subSampledData, subSampledLabels, sample_weight=subSampledWeights)
        return self

    def fit_hdf5(self, data, labels, weights, metric):
        # Boosting-style entry point used by Mumbo: fit on (possibly sub-sampled) weighted data,
        # then score the predictions on the full data and flag the learner as bad if it is
        # worse than chance.
        metricModule = getattr(Metrics, metric[0])
        if metric[1] is not None:
            metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1]))
        else:
            metricKWARGS = {}
        if weights is None:
            weights = np.ones(len(data)) / len(data)
        if self.subSampling != 1.0:
            subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, self.subSampling,
                                                                            self.randomState, weights=weights)
        else:
            subSampledData, subSampledLabels, subSampledWeights = data, labels, weights
        self.decisionTree.fit(subSampledData, subSampledLabels, sample_weight=subSampledWeights)
        prediction = self.decisionTree.predict(data)
        metricKWARGS = {"0": weights}
        averageScore = metricModule.score(labels, prediction, **metricKWARGS)
        if averageScore < 0.5:
            isBad = True
        else:
            isBad = False
        return self.decisionTree, prediction, isBad, averageScore

    def predict(self, data):
        predictedLabels = self.decisionTree.predict(data)
        return predictedLabels

    def get_params(self, deep=True):
        return {"depth": self.depth, "criterion": self.criterion, "splitter": self.splitter,
                "subSampling": self.subSampling}

    def set_params(self, **parameters):
        self.depth = parameters["depth"]
        self.criterion = parameters["criterion"]
        self.splitter = parameters["splitter"]
        self.subSampling = parameters["subSampling"]
        # Rebuild the underlying sklearn tree so the updated hyper-parameters take effect
        # (GridSearchCV sets candidate parameters through this method).
        self.decisionTree = sklearn.tree.DecisionTreeClassifier(splitter=self.splitter, criterion=self.criterion,
                                                                 max_depth=self.depth)
        return self
# def DecisionTree(data, labels, arg, weights, randomState):
# depth = int(arg[0])
# subSampling = float(arg[1])
# if subSampling != 1.0:
# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, subSampling, randomState,
# weights=weights)
# else:
# subSampledData, subSampledLabels, subSampledWeights = data, labels, weights
# isBad = False
# classifier = sklearn.tree.DecisionTreeClassifier(max_depth=depth)
# # classifier = OneVsRestClassifier(tree.DecisionTreeClassifier(max_depth=depth))
# classifier.fit(subSampledData, subSampledLabels, sample_weight=subSampledWeights)
# prediction = classifier.predict(data)
# accuracy = accuracy_score(labels, prediction)
# if accuracy < 0.5:
# isBad = True
#
# return classifier, prediction, isBad, accuracy
def getKWARGS(argList, randomState):
    kwargs = {"depth": int(argList[0]), "criterion": argList[1], "splitter": argList[2],
              "subSampling": float(argList[3]), "randomState": randomState}
    return kwargs
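DecisionTree is now a scikit-learn style estimator (BaseEstimator/ClassifierMixin), so it can be fitted, cloned and grid-searched like any sklearn classifier. A minimal usage sketch with made-up toy data; the config list mirrors the getKWARGS field order above:

import numpy as np

# Toy data, purely illustrative.
rng = np.random.RandomState(42)
data = rng.rand(20, 4)
labels = rng.randint(0, 2, 20)

# Build the keyword dict the way getKWARGS does from one parsed --MU_config token,
# then drive the estimator through the usual scikit-learn interface.
kwargs = getKWARGS(["3", "gini", "best", "1.0"], rng)
clf = DecisionTree(**kwargs).fit(data, labels)
print(clf.predict(data)[:5])
print(clf.get_params())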
def getConfig(classifierConfig):
    try:
        depth = classifierConfig["depth"]
        splitter = classifierConfig["splitter"]
        criterion = classifierConfig["criterion"]
        subSampling = classifierConfig["subSampling"]
        return 'with depth ' + str(depth) + ', ' + \
               'with splitter ' + splitter + ', ' + \
               'with criterion ' + criterion + ', ' + \
               ' sub-sampled at ' + str(subSampling) + ' '
    except KeyError:
        print classifierConfig


def findClosest(scores, base=0.5):
    # Return the index of the score closest to `base`.
    diffToBase = 100.0
    bestSettingsIndex = 0
    for resultIndex, result in enumerate(scores):
        if abs(base - result) < diffToBase:
            diffToBase = abs(base - result)
            bestResult = result
            bestSettingsIndex = resultIndex
    return bestSettingsIndex


def hyperParamSearch(data, labels, randomState, metric=["accuracy_score", None], nbSubSamplingTests=20):
    metricModule = getattr(Metrics, metric[0])
    if metric[1] is not None:
        metricKWARGS = dict((index, metricConfig) for index, metricConfig in enumerate(metric[1]))
    else:
        metricKWARGS = {}
    scorer = metricModule.get_scorer(**metricKWARGS)
    subSamplingRatios = np.arange(nbSubSamplingTests, dtype=float) / nbSubSamplingTests
    maxDepths = np.arange(1) + 1
    criterions = ["gini", "entropy"]
    splitters = ["best", "random"]
    parameters = {"depth": maxDepths, "criterion": criterions, "splitter": splitters,
                  "subSampling": subSamplingRatios}
    classifier = DecisionTree()
    grid = sklearn.model_selection.GridSearchCV(classifier, parameters, scoring=scorer)
    grid.fit(data, labels)
    GSSubSamplingRatios = grid.cv_results_["param_subSampling"]
    GSMaxDepths = grid.cv_results_["param_depth"]
    GSCriterions = grid.cv_results_["param_criterion"]
    GSSplitters = grid.cv_results_["param_splitter"]
    GSScores = grid.cv_results_["mean_test_score"]
    configIndex = findClosest(GSScores)
    return {"depth": GSMaxDepths[configIndex], "criterion": GSCriterions[configIndex],
            "splitter": GSSplitters[configIndex], "subSampling": GSSubSamplingRatios[configIndex],
            "randomState": randomState}

# bestSettings = []
# bestResults = []
# classifier = sklearn.tree.DecisionTreeClassifier(max_depth=1)
# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, 0.05, randomState)
# classifier.fit(subSampledData, subSampledLabels)
# prediction = classifier.predict(data)
# preliminary_accuracy = accuracy_score(labels, prediction)
# if preliminary_accuracy < 0.50:
# for max_depth in np.arange(10) + 1:
# for subSampling in sorted((np.arange(20, dtype=float) + 1) / 20, reverse=True):
# if subSampling > minSubSampling:
# accuracies = np.zeros(50)
# for i in range(50):
# if subSampling != 1.0:
# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, subSampling,
# randomState)
# else:
# subSampledData, subSampledLabels, = data, labels
# classifier = tree.DecisionTreeClassifier(max_depth=max_depth)
# classifier.fit(subSampledData, subSampledLabels)
# prediction = classifier.predict(data)
# accuracies[i] = accuracy_score(labels, prediction)
# accuracy = np.mean(accuracies)
# if 0.5 < accuracy < 0.60:
# bestSettings.append([max_depth, subSampling])
# bestResults.append(accuracy)
# else:
# preliminary_accuracies = np.zeros(50)
# if minSubSampling < 0.01:
# for i in range(50):
# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, 0.01, randomState)
# classifier.fit(subSampledData, subSampledLabels)
# prediction = classifier.predict(data)
# preliminary_accuracies[i] = accuracy_score(labels, prediction)
# preliminary_accuracy = np.mean(preliminary_accuracies)
# if preliminary_accuracy < 0.50:
# for subSampling in sorted((np.arange(19, dtype=float) + 1) / 200, reverse=True):
# if minSubSampling < subSampling:
# accuracies = np.zeros(50)
# for i in range(50):
# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, subSampling,
# randomState)
# classifier = tree.DecisionTreeClassifier(max_depth=1)
# classifier.fit(subSampledData, subSampledLabels)
# prediction = classifier.predict(data)
# accuracies[i] = accuracy_score(labels, prediction)
# accuracy = np.mean(accuracies)
# if 0.5 < accuracy < 0.60:
# bestSettings.append([1, subSampling])
# bestResults.append(accuracy)
# else:
# for subSampling in sorted((np.arange(19, dtype=float) + 1) / 2000, reverse=True):
# accuracies = np.zeros(50)
# for i in range(50):
# subSampledData, subSampledLabels, subSampledWeights = subSample(data, labels, subSampling,
# randomState)
# if minSubSampling < subSampling:
# classifier1 = tree.DecisionTreeClassifier(max_depth=1)
# classifier1.fit(subSampledData, subSampledLabels)
# prediction = classifier1.predict(data)
# accuracies[i] = accuracy_score(labels, prediction)
# accuracy = np.mean(accuracies)
# if 0.5 < accuracy < 0.60:
# bestSettings.append([1, subSampling])
# bestResults.append(accuracy)
#
# # assert bestResults != [], "No good settings found for Decision Tree!"
# if bestResults == []:
# bestSetting = None
# else:
# bestSetting = getBestSetting(bestSettings, bestResults)
# return bestSetting
def getBestSetting(bestSettings, bestResults):
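As the commit message says, hyper-parameter search for the Mumbo monoview classifiers is now delegated to sklearn's GridSearchCV; findClosest, defined above, then keeps the configuration whose mean cross-validation score is closest to base=0.5, presumably so that the returned learner stays weak, as boosting schemes like Mumbo expect learners only slightly better than chance. A quick illustration of that selection step on made-up scores:

import numpy as np

# Hypothetical mean_test_score values from a grid search; findClosest returns the index
# of the score nearest to 0.5, i.e. the weakest acceptable configuration.
scores = np.array([0.92, 0.55, 0.71, 0.49])
print(findClosest(scores))   # -> 3, since |0.49 - 0.5| is the smallest gap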
@@ -130,7 +130,7 @@ def getReport(classifier, CLASS_LABELS, classificationIndices, DATASET, trainLab
     testScore = metricModule.score(CLASS_LABELS[validationIndices], testLabels)
     mumboClassifier = classifier
     maxIter = mumboClassifier.iterIndex
-    meanAverageAccuracies = np.mean(mumboClassifier.averageAccuracies, axis=0)
+    meanAverageAccuracies = np.mean(mumboClassifier.averageScores, axis=0)
     viewsStats = np.array([float(list(mumboClassifier.bestViews).count(viewIndex)) /
                            len(mumboClassifier.bestViews) for viewIndex in range(nbView)])
     PredictedTrainLabelsByIter = mumboClassifier.classifyMumbobyIter_hdf5(DATASET, fakeViewsIndicesDict,
@@ -230,6 +230,11 @@ def execute(classifier, trainLabels,
             hyperParamSearch, nIter, metrics,
             viewsIndices, randomState):
     learningIndices, validationIndices = classificationIndices
+    if classifier.classifiersConfigs is None:
+        metricsScores = getMetricsScores(metrics, trainLabels, testLabels,
+                                         DATASET, validationIndices, learningIndices)
+        return "No good setting for monoview classifier", None, metricsScores
+    else:
     LEARNING_RATE = len(learningIndices) / (len(learningIndices) + len(validationIndices))
     nbFolds = KFolds.n_splits