From 7fb057942628d853f1f11c9d81a7513c91710d9a Mon Sep 17 00:00:00 2001
From: Baptiste Bauvin <baptiste.bauvin.1@ulaval.ca>
Date: Thu, 19 Oct 2017 15:10:30 -0400
Subject: [PATCH] Added interpret for adaboost and DT

---
 .../Monoview/ExecClassifMonoView.py           |  2 +-
 .../Monoview/analyzeResult.py                 | 12 +++++---
 .../MonoviewClassifiers/Adaboost.py           | 18 ++++++++++++
 .../MonoviewClassifiers/DecisionTree.py       | 28 +++++++++++++++++--
 .../Methods/LateFusionPackage/SCMForLinear.py | 11 --------
 Code/Versions.py                              |  7 +++++
 6 files changed, 59 insertions(+), 19 deletions(-)

diff --git a/Code/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py b/Code/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py
index 312f4fe4..e8d0b308 100644
--- a/Code/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py
+++ b/Code/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py
@@ -113,7 +113,7 @@ def ExecMonoview(directory, X, Y, name, labelsNames, classificationIndices, KFol
                                                             hyperParamSearch, metrics, nIter, feat, CL_type,
                                                             clKWARGS, labelsNames, X.shape,
                                                             y_train, y_train_pred, y_test, y_test_pred, t_end,
-                                                            randomState)
+                                                            randomState, cl_res, outputFileName)
     cl_desc = [value for key, value in sorted(clKWARGS.iteritems())]
     logging.debug("Done:\t Getting Results")
     logging.info(stringAnalysis)
diff --git a/Code/MonoMultiViewClassifiers/Monoview/analyzeResult.py b/Code/MonoMultiViewClassifiers/Monoview/analyzeResult.py
index 998c3c86..e86c88d3 100644
--- a/Code/MonoMultiViewClassifiers/Monoview/analyzeResult.py
+++ b/Code/MonoMultiViewClassifiers/Monoview/analyzeResult.py
@@ -16,7 +16,7 @@ def getDBConfigString(name, feat, classificationIndices, shape, classLabelsNames
     return dbConfigString
 
 
-def getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS):
+def getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS, classifier, directory):
     classifierModule = getattr(MonoviewClassifiers, CL_type)
     classifierConfigString = "Classifier configuration : \n"
     classifierConfigString += "\t- " + classifierModule.getConfig(clKWARGS)[5:] + "\n"
@@ -24,7 +24,8 @@ def getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS):
     if gridSearch:
         classifierConfigString += "\t- Got configuration using randomized search with " + str(nIter) + " iterations \n"
     classifierConfigString += "\n\n"
-    return classifierConfigString
+    classifierInterpretString = classifierModule.getInterpret(classifier, directory)
+    return classifierConfigString, classifierInterpretString
 
 
 def getMetricScore(metric, y_train, y_train_pred, y_test, y_test_pred):
@@ -43,7 +44,7 @@ def getMetricScore(metric, y_train, y_train_pred, y_test, y_test_pred):
 
 
 def execute(name, learningRate, KFolds, nbCores, gridSearch, metrics, nIter, feat, CL_type, clKWARGS, classLabelsNames,
-            shape, y_train, y_train_pred, y_test, y_test_pred, time, randomState):
+            shape, y_train, y_train_pred, y_test, y_test_pred, time, randomState, classifier, directory):
     metricsScores = {}
     metricModule = getattr(Metrics, metrics[0][0])
     trainScore = metricModule.score(y_train, y_train_pred)
@@ -53,7 +54,8 @@ def execute(name, learningRate, KFolds, nbCores, gridSearch, metrics, nIter, fea
     stringAnalysis += metrics[0][0] + " on train : " + str(trainScore) + "\n" + metrics[0][0] + " on test : " + str(
         testScore) + "\n\n"
     stringAnalysis += getDBConfigString(name, feat, learningRate, shape, classLabelsNames, KFolds)
-    stringAnalysis += getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS)
+    classifierConfigString, classifierInterpretString = getClassifierConfigString(CL_type, gridSearch, nbCores, nIter, clKWARGS, classifier, directory)
+    stringAnalysis += classifierConfigString
     for metric in metrics:
         stringAnalysis += getMetricScore(metric, y_train, y_train_pred, y_test, y_test_pred)
         if metric[1] is not None:
@@ -63,6 +65,8 @@ def execute(name, learningRate, KFolds, nbCores, gridSearch, metrics, nIter, fea
         metricsScores[metric[0]] = [getattr(Metrics, metric[0]).score(y_train, y_train_pred),
                                     getattr(Metrics, metric[0]).score(y_test, y_test_pred)]
     stringAnalysis += "\n\n Classification took " + str(hms(seconds=int(time)))
+    stringAnalysis += "\n\n Classifier Interpretation : \n"
+    stringAnalysis += classifierInterpretString
 
     imageAnalysis = {}
     return stringAnalysis, imageAnalysis, metricsScores
diff --git a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/Adaboost.py b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/Adaboost.py
index cad2b46b..66a39edd 100644
--- a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/Adaboost.py
+++ b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/Adaboost.py
@@ -4,6 +4,7 @@ from sklearn.model_selection import RandomizedSearchCV
 from sklearn.tree import DecisionTreeClassifier
 from scipy.stats import randint
 import numpy as np
+import cPickle
 
 from .. import Metrics
 from ..utils.HyperParameterSearch import genHeatMaps
@@ -79,3 +80,20 @@ def getConfig(config):
         except:
             return "\n\t\t- Adaboost with num_esimators : " + str(config["0"]) + ", base_estimators : " + str(
                 config["1"])
+
+def getInterpret(classifier, directory):
+    featureImportances = classifier.feature_importances_
+    sortedArgs = np.argsort(-featureImportances)
+    featureImportancesSorted = featureImportances[sortedArgs][:50]
+    featureIndicesSorted = sortedArgs[:50]
+    featuresImportancesDict = dict((featureIndex, featureImportance)
+                                   for featureIndex, featureImportance in enumerate(featureImportances)
+                                   if featureImportance != 0)
+    with open(directory+'-feature_importances.pickle', 'wb') as handle:
+        cPickle.dump(featuresImportancesDict, handle)
+    interpretString = "Feature importances : \n"
+    for featureIndex, featureImportance in zip(featureIndicesSorted, featureImportancesSorted):
+        if featureImportance>0:
+            interpretString+="- Feature index : "+str(featureIndex)+\
+                             ", feature importance : "+str(featureImportance)+"\n"
+    return interpretString
\ No newline at end of file
diff --git a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/DecisionTree.py b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/DecisionTree.py
index 9b813c53..152e9e7d 100644
--- a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/DecisionTree.py
+++ b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/DecisionTree.py
@@ -1,8 +1,10 @@
-from sklearn.tree import DecisionTreeClassifier
+from sklearn import tree
 from sklearn.pipeline import Pipeline  # Pipelining in classification
 from sklearn.model_selection import RandomizedSearchCV
 from scipy.stats import randint
 import numpy as np
+import graphviz
+import cPickle
 
 from .. import Metrics
 from ..utils.HyperParameterSearch import genHeatMaps
@@ -20,7 +22,7 @@ def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1, **kwargs):
     maxDepth = int(kwargs['0'])
     criterion = kwargs['1']
     splitter = kwargs['2']
-    classifier = DecisionTreeClassifier(max_depth=maxDepth, criterion=criterion, splitter=splitter,
+    classifier = tree.DecisionTreeClassifier(max_depth=maxDepth, criterion=criterion, splitter=splitter,
                                         random_state=randomState)
     classifier.fit(DATASET, CLASS_LABELS)
     return classifier
@@ -48,7 +50,7 @@ def getKWARGS(kwargsList):
 
 def randomizedSearch(X_train, y_train, randomState, outputFileName, KFolds=4, nbCores=1,
                      metric=["accuracy_score", None], nIter=30):
-    pipeline_DT = Pipeline([('classifier', DecisionTreeClassifier())])
+    pipeline_DT = Pipeline([('classifier', tree.DecisionTreeClassifier())])
     param_DT = {"classifier__max_depth": randint(1, 300),
                 "classifier__criterion": ["gini", "entropy"],
                 "classifier__splitter": ["best", "random"]}
@@ -85,3 +87,23 @@ def getConfig(config):
         except:
             return "\n\t\t- Decision Tree with max_depth : " + str(config["0"]) + ", criterion : " + config[
                 "1"] + ", splitter : " + config["2"]
+
+def getInterpret(classifier, directory):
+    dot_data = tree.export_graphviz(classifier, out_file=None)
+    graph = graphviz.Source(dot_data)
+    graph.render(directory+"-tree")
+    featureImportances = classifier.feature_importances_
+    sortedArgs = np.argsort(-featureImportances)
+    featureImportancesSorted = featureImportances[sortedArgs][:50]
+    featureIndicesSorted = sortedArgs[:50]
+    featuresImportancesDict = dict((featureIndex, featureImportance)
+                                   for featureIndex, featureImportance in enumerate(featureImportances)
+                                   if featureImportance != 0)
+    with open(directory + '-feature_importances.pickle', 'wb') as handle:
+        cPickle.dump(featuresImportancesDict, handle)
+    interpretString = "Feature importances : \n"
+    for featureIndex, featureImportance in zip(featureIndicesSorted, featureImportancesSorted):
+        if featureImportance > 0:
+            interpretString += "- Feature index : " + str(featureIndex) + \
+                               ", feature importance : " + str(featureImportance) + "\n"
+    return interpretString
\ No newline at end of file
diff --git a/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SCMForLinear.py b/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SCMForLinear.py
index 29a879d4..43a22a2f 100644
--- a/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SCMForLinear.py
+++ b/Code/MonoMultiViewClassifiers/Multiview/Fusion/Methods/LateFusionPackage/SCMForLinear.py
@@ -1,22 +1,11 @@
 import numpy as np
-import pyscm
-# from pyscm.utils import _pack_binary_bytes_to_ints
-import os
-import h5py
-# from pyscm.binary_attributes.classifications.popcount import inplace_popcount_32, inplace_popcount_64
-# from pyscm.utils import _unpack_binary_bytes_from_ints
 
 from pyscm.scm import SetCoveringMachineClassifier as scm
 from sklearn.base import BaseEstimator, ClassifierMixin
-from sklearn.pipeline import Pipeline
-from sklearn.model_selection import RandomizedSearchCV
 from sklearn.externals.six import iteritems, iterkeys, itervalues
 
-from math import ceil
-import random
 from sklearn.metrics import accuracy_score
 import itertools
-import pkgutil
 
 from ..LateFusion import LateFusionClassifier, getClassifiers, getConfig
 from ..... import MonoviewClassifiers
diff --git a/Code/Versions.py b/Code/Versions.py
index 8a83f563..b3a02c11 100644
--- a/Code/Versions.py
+++ b/Code/Versions.py
@@ -96,8 +96,15 @@ def testVersions():
         isUpToDate = False
         toInstall.append("h5py")
 
+    try:
+        import graphviz
+    except:
+        isUpToDate = False
+        toInstall.append("graphviz")
+
     if not isUpToDate:
         print "You can't run at the moment, please install the following modules : \n"+ "\n".join(toInstall)
+        quit()
 
 if __name__== "__main__":
     testVersions()
\ No newline at end of file
-- 
GitLab