From 7fb057942628d853f1f11c9d81a7513c91710d9a Mon Sep 17 00:00:00 2001 From: Baptiste Bauvin <baptiste.bauvin.1@ulaval.ca> Date: Thu, 19 Oct 2017 15:10:30 -0400 Subject: [PATCH] Added interpret for adaboost and DT --- .../Monoview/ExecClassifMonoView.py | 2 +- .../Monoview/analyzeResult.py | 12 +++++--- .../MonoviewClassifiers/Adaboost.py | 18 ++++++++++++ .../MonoviewClassifiers/DecisionTree.py | 28 +++++++++++++++++-- .../Methods/LateFusionPackage/SCMForLinear.py | 11 -------- Code/Versions.py | 7 +++++ 6 files changed, 59 insertions(+), 19 deletions(-) diff --git a/Code/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py b/Code/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py index 312f4fe4..e8d0b308 100644 --- a/Code/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py +++ b/Code/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py @@ -113,7 +113,7 @@ def ExecMonoview(directory, X, Y, name, labelsNames, classificationIndices, KFol hyperParamSearch, metrics, nIter, feat, CL_type, clKWARGS, labelsNames, X.shape, y_train, y_train_pred, y_test, y_test_pred, t_end, - randomState) + randomState, cl_res, outputFileName) cl_desc = [value for key, value in sorted(clKWARGS.iteritems())] logging.debug("Done:\t Getting Results") logging.info(stringAnalysis) diff --git a/Code/MonoMultiViewClassifiers/Monoview/analyzeResult.py b/Code/MonoMultiViewClassifiers/Monoview/analyzeResult.py index 998c3c86..e86c88d3 100644 --- a/Code/MonoMultiViewClassifiers/Monoview/analyzeResult.py +++ b/Code/MonoMultiViewClassifiers/Monoview/analyzeResult.py @@ -16,7 +16,7 @@ def getDBConfigString(name, feat, classificationIndices, shape, classLabelsNames return dbConfigString -def getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS): +def getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS, classififer, directory): classifierModule = getattr(MonoviewClassifiers, CL_type) classifierConfigString = "Classifier configuration : \n" classifierConfigString += "\t- " + classifierModule.getConfig(clKWARGS)[5:] + "\n" @@ -24,7 +24,8 @@ def getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS): if gridSearch: classifierConfigString += "\t- Got configuration using randomized search with " + str(nIter) + " iterations \n" classifierConfigString += "\n\n" - return classifierConfigString + classifierInterpretString = classifierModule.getInterpret(classififer, directory) + return classifierConfigString, classifierInterpretString def getMetricScore(metric, y_train, y_train_pred, y_test, y_test_pred): @@ -43,7 +44,7 @@ def getMetricScore(metric, y_train, y_train_pred, y_test, y_test_pred): def execute(name, learningRate, KFolds, nbCores, gridSearch, metrics, nIter, feat, CL_type, clKWARGS, classLabelsNames, - shape, y_train, y_train_pred, y_test, y_test_pred, time, randomState): + shape, y_train, y_train_pred, y_test, y_test_pred, time, randomState, classifier, directory): metricsScores = {} metricModule = getattr(Metrics, metrics[0][0]) trainScore = metricModule.score(y_train, y_train_pred) @@ -53,7 +54,8 @@ def execute(name, learningRate, KFolds, nbCores, gridSearch, metrics, nIter, fea stringAnalysis += metrics[0][0] + " on train : " + str(trainScore) + "\n" + metrics[0][0] + " on test : " + str( testScore) + "\n\n" stringAnalysis += getDBConfigString(name, feat, learningRate, shape, classLabelsNames, KFolds) - stringAnalysis += getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS) + classifierConfigString, classifierIntepretString = getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS, classifier, directory) + stringAnalysis += classifierConfigString for metric in metrics: stringAnalysis += getMetricScore(metric, y_train, y_train_pred, y_test, y_test_pred) if metric[1] is not None: @@ -63,6 +65,8 @@ def execute(name, learningRate, KFolds, nbCores, gridSearch, metrics, nIter, fea metricsScores[metric[0]] = [getattr(Metrics, metric[0]).score(y_train, y_train_pred), getattr(Metrics, metric[0]).score(y_test, y_test_pred)] stringAnalysis += "\n\n Classification took " + str(hms(seconds=int(time))) + stringAnalysis += "\n\n Classifier Interpretation : \n" + stringAnalysis+= classifierIntepretString imageAnalysis = {} return stringAnalysis, imageAnalysis, metricsScores diff --git a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/Adaboost.py b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/Adaboost.py index cad2b46b..66a39edd 100644 --- a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/Adaboost.py +++ b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/Adaboost.py @@ -4,6 +4,7 @@ from sklearn.model_selection import RandomizedSearchCV from sklearn.tree import DecisionTreeClassifier from scipy.stats import randint import numpy as np +import cPickle from .. import Metrics from ..utils.HyperParameterSearch import genHeatMaps @@ -79,3 +80,20 @@ def getConfig(config): except: return "\n\t\t- Adaboost with num_esimators : " + str(config["0"]) + ", base_estimators : " + str( config["1"]) + +def getInterpret(classifier, directory): + featureImportances = classifier.feature_importances_ + sortedArgs = np.argsort(-featureImportances) + featureImportancesSorted = featureImportances[sortedArgs][:50] + featureIndicesSorted = sortedArgs[:50] + featuresImportancesDict = dict((featureIndex, featureImportance) + for featureIndex, featureImportance in enumerate(featureImportances) + if featureImportance != 0) + with open(directory+'-feature_importances.pickle', 'wb') as handle: + cPickle.dump(featuresImportancesDict, handle) + interpretString = "Feature importances : \n" + for featureIndex, featureImportance in zip(featureIndicesSorted, featureImportancesSorted): + if featureImportance>0: + interpretString+="- Feature index : "+str(featureIndex)+\ + ", feature importance : "+str(featureImportance)+"\n" + return interpretString \ No newline at end of file diff --git a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/DecisionTree.py b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/DecisionTree.py index 9b813c53..152e9e7d 100644 --- a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/DecisionTree.py +++ b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/DecisionTree.py @@ -1,8 +1,10 @@ -from sklearn.tree import DecisionTreeClassifier +from sklearn import tree from sklearn.pipeline import Pipeline # Pipelining in classification from sklearn.model_selection import RandomizedSearchCV from scipy.stats import randint import numpy as np +import graphviz +import cPickle from .. import Metrics from ..utils.HyperParameterSearch import genHeatMaps @@ -20,7 +22,7 @@ def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1, **kwargs): maxDepth = int(kwargs['0']) criterion = kwargs['1'] splitter = kwargs['2'] - classifier = DecisionTreeClassifier(max_depth=maxDepth, criterion=criterion, splitter=splitter, + classifier = tree.DecisionTreeClassifier(max_depth=maxDepth, criterion=criterion, splitter=splitter, random_state=randomState) classifier.fit(DATASET, CLASS_LABELS) return classifier @@ -48,7 +50,7 @@ def getKWARGS(kwargsList): def randomizedSearch(X_train, y_train, randomState, outputFileName, KFolds=4, nbCores=1, metric=["accuracy_score", None], nIter=30): - pipeline_DT = Pipeline([('classifier', DecisionTreeClassifier())]) + pipeline_DT = Pipeline([('classifier', tree.DecisionTreeClassifier())]) param_DT = {"classifier__max_depth": randint(1, 300), "classifier__criterion": ["gini", "entropy"], "classifier__splitter": ["best", "random"]} @@ -85,3 +87,23 @@ def getConfig(config): except: return "\n\t\t- Decision Tree with max_depth : " + str(config["0"]) + ", criterion : " + config[ "1"] + ", splitter : " + config["2"] + +def getInterpret(classifier, directory): + dot_data = tree.export_graphviz(classifier, out_file=None) + graph = graphviz.Source(dot_data) + graph.render(directory+"-tree.pdf") + featureImportances = classifier.feature_importances_ + sortedArgs = np.argsort(-featureImportances) + featureImportancesSorted = featureImportances[sortedArgs][:50] + featureIndicesSorted = sortedArgs[:50] + featuresImportancesDict = dict((featureIndex, featureImportance) + for featureIndex, featureImportance in enumerate(featureImportances) + if featureImportance != 0) + with open(directory + '-feature_importances.pickle', 'wb') as handle: + cPickle.dump(featuresImportancesDict, handle) + interpretString = "Feature importances : \n" + for featureIndex, featureImportance in zip(featureIndicesSorted, featureImportancesSorted): + if featureImportance > 0: + interpretString += "- Feature index : " + str(featureIndex) + \ + ", feature importance : " + str(featureImportance) + "\n" + return interpretString \ No newline at end of file diff --git a/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SCMForLinear.py b/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SCMForLinear.py index 29a879d4..43a22a2f 100644 --- a/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SCMForLinear.py +++ b/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SCMForLinear.py @@ -1,22 +1,11 @@ import numpy as np -import pyscm -# from pyscm.utils import _pack_binary_bytes_to_ints -import os -import h5py -# from pyscm.binary_attributes.classifications.popcount import inplace_popcount_32, inplace_popcount_64 -# from pyscm.utils import _unpack_binary_bytes_from_ints from pyscm.scm import SetCoveringMachineClassifier as scm from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.pipeline import Pipeline -from sklearn.model_selection import RandomizedSearchCV from sklearn.externals.six import iteritems, iterkeys, itervalues -from math import ceil -import random from sklearn.metrics import accuracy_score import itertools -import pkgutil from ..LateFusion import LateFusionClassifier, getClassifiers, getConfig from ..... import MonoviewClassifiers diff --git a/Code/Versions.py b/Code/Versions.py index 8a83f563..b3a02c11 100644 --- a/Code/Versions.py +++ b/Code/Versions.py @@ -96,8 +96,15 @@ def testVersions(): isUpToDate = False toInstall.append("h5py") + try: + import graphviz # + except: + isUpToDate = False + toInstall.append("graphviz") + if not isUpToDate: print "You can't run at the moment, please install the following modules : \n"+ "\n".join(toInstall) + quit() if __name__== "__main__": testVersions() \ No newline at end of file -- GitLab