From b279acb5bfc1623fa33a315a9bf027881e7dd765 Mon Sep 17 00:00:00 2001
From: Baptiste Bauvin <baptiste.bauvin.1@ulaval.ca>
Date: Sat, 21 Oct 2017 19:05:35 -0400
Subject: [PATCH] Added graphs to visualize feature importances

---
 .../MonoviewClassifiers/Adaboost.py           | 20 +++--------
 .../MonoviewClassifiers/DecisionTree.py       | 18 ++--------
 .../MonoviewClassifiers/RandomForest.py       | 18 ++--------
 .../MonoviewClassifiers/SCM.py                | 10 +++---
 .../MonoviewClassifiers/SGD.py                |  2 ++
 .../MonoviewClassifiers/SVMLinear.py          |  1 +
 .../utils/Interpret.py                        | 35 +++++++++++++++++++
 7 files changed, 54 insertions(+), 50 deletions(-)
 create mode 100644 Code/MonoMultiViewClassifiers/utils/Interpret.py

diff --git a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/Adaboost.py b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/Adaboost.py
index 66a39edd..b5f75f49 100644
--- a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/Adaboost.py
+++ b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/Adaboost.py
@@ -4,10 +4,13 @@ from sklearn.model_selection import RandomizedSearchCV
 from sklearn.tree import DecisionTreeClassifier
 from scipy.stats import randint
 import numpy as np
-import cPickle
+# import cPickle
+# import matplotlib.pyplot as plt
+# from matplotlib.ticker import FuncFormatter
 
 from .. import Metrics
 from ..utils.HyperParameterSearch import genHeatMaps
+from ..utils.Interpret import getFeatureImportance
 
 # Author-Info
 __author__ = "Baptiste Bauvin"
@@ -82,18 +85,5 @@ def getConfig(config):
                 config["1"])
 
 def getInterpret(classifier, directory):
-    featureImportances = classifier.feature_importances_
-    sortedArgs = np.argsort(-featureImportances)
-    featureImportancesSorted = featureImportances[sortedArgs][:50]
-    featureIndicesSorted = sortedArgs[:50]
-    featuresImportancesDict = dict((featureIndex, featureImportance)
-                                   for featureIndex, featureImportance in enumerate(featureImportances)
-                                   if featureImportance != 0)
-    with open(directory+'-feature_importances.pickle', 'wb') as handle:
-        cPickle.dump(featuresImportancesDict, handle)
-    interpretString = "Feature importances : \n"
-    for featureIndex, featureImportance in zip(featureIndicesSorted, featureImportancesSorted):
-        if featureImportance>0:
-            interpretString+="- Feature index : "+str(featureIndex)+\
-                             ", feature importance : "+str(featureImportance)+"\n"
+    interpretString = getFeatureImportance(classifier, directory)
     return interpretString
\ No newline at end of file
diff --git a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/DecisionTree.py b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/DecisionTree.py
index 152e9e7d..1dbd83ad 100644
--- a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/DecisionTree.py
+++ b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/DecisionTree.py
@@ -4,10 +4,11 @@ from sklearn.model_selection import RandomizedSearchCV
 from scipy.stats import randint
 import numpy as np
 import graphviz
-import cPickle
+# import cPickle
 
 from .. import Metrics
 from ..utils.HyperParameterSearch import genHeatMaps
+from ..utils.Interpret import getFeatureImportance
 
 # Author-Info
 __author__ = "Baptiste Bauvin"
@@ -92,18 +93,5 @@ def getInterpret(classifier, directory):
     dot_data = tree.export_graphviz(classifier, out_file=None)
     graph = graphviz.Source(dot_data)
     graph.render(directory+"-tree.pdf")
-    featureImportances = classifier.feature_importances_
-    sortedArgs = np.argsort(-featureImportances)
-    featureImportancesSorted = featureImportances[sortedArgs][:50]
-    featureIndicesSorted = sortedArgs[:50]
-    featuresImportancesDict = dict((featureIndex, featureImportance)
-                                   for featureIndex, featureImportance in enumerate(featureImportances)
-                                   if featureImportance != 0)
-    with open(directory + '-feature_importances.pickle', 'wb') as handle:
-        cPickle.dump(featuresImportancesDict, handle)
-    interpretString = "Feature importances : \n"
-    for featureIndex, featureImportance in zip(featureIndicesSorted, featureImportancesSorted):
-        if featureImportance > 0:
-            interpretString += "- Feature index : " + str(featureIndex) + \
-                               ", feature importance : " + str(featureImportance) + "\n"
+    interpretString = getFeatureImportance(classifier, directory)
     return interpretString
\ No newline at end of file
diff --git a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/RandomForest.py b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/RandomForest.py
index 66caa2d4..79ea71e4 100644
--- a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/RandomForest.py
+++ b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/RandomForest.py
@@ -3,10 +3,11 @@ from sklearn.pipeline import Pipeline
 from sklearn.model_selection import RandomizedSearchCV
 from scipy.stats import randint
 import numpy as np
-import cPickle
+# import cPickle
 
 from .. import Metrics
 from ..utils.HyperParameterSearch import genHeatMaps
+from ..utils.Interpret import getFeatureImportance
 
 # Author-Info
 __author__ = "Baptiste Bauvin"
@@ -90,18 +91,5 @@ def getConfig(config):
 
 
 def getInterpret(classifier, directory):
-    featureImportances = classifier.feature_importances_
-    sortedArgs = np.argsort(-featureImportances)
-    featureImportancesSorted = featureImportances[sortedArgs][:50]
-    featureIndicesSorted = sortedArgs[:50]
-    featuresImportancesDict = dict((featureIndex, featureImportance)
-                                   for featureIndex, featureImportance in enumerate(featureImportances)
-                                   if featureImportance != 0)
-    with open(directory+'-feature_importances.pickle', 'wb') as handle:
-        cPickle.dump(featuresImportancesDict, handle)
-    interpretString = "Feature importances : \n"
-    for featureIndex, featureImportance in zip(featureIndicesSorted, featureImportancesSorted):
-        if featureImportance>0:
-            interpretString+="- Feature index : "+str(featureIndex)+ \
-                             ", feature importance : "+str(featureImportance)+"\n"
+    interpretString = getFeatureImportance(classifier, directory)
     return interpretString
diff --git a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/SCM.py b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/SCM.py
index ce998ed4..56084400 100644
--- a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/SCM.py
+++ b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/SCM.py
@@ -78,10 +78,6 @@ def paramsToSet(nIter, randomState):
     return paramsSet
 
 
-def getInterpret(classifier, directory):
-    return ""
-
-
 def getKWARGS(kwargsList):
     kwargsDict = {}
     for (kwargName, kwargValue) in kwargsList:
@@ -133,4 +129,8 @@ def getConfig(config):
                    str(config[2])
         except:
             return "\n\t\t- SCM with model_type: " + config["0"] + ", max_rules : " + str(config["1"]) + ", p : " + \
-                   str(config["2"])
\ No newline at end of file
+                   str(config["2"])
+
+
+def getInterpret(classifier, directory):
+    return ""
diff --git a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/SGD.py b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/SGD.py
index e3182787..0bbd424d 100644
--- a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/SGD.py
+++ b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/SGD.py
@@ -91,4 +91,6 @@ def getConfig(config):
                 "1"] + ", alpha : " + str(config["2"])
 
 def getInterpret(classifier, directory):
+    # TODO : coeffs
     return ""
+# 
\ No newline at end of file
diff --git a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/SVMLinear.py b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/SVMLinear.py
index 63872d5f..87ad608a 100644
--- a/Code/MonoMultiViewClassifiers/MonoviewClassifiers/SVMLinear.py
+++ b/Code/MonoMultiViewClassifiers/MonoviewClassifiers/SVMLinear.py
@@ -75,4 +75,5 @@ def getConfig(config):
             return "\n\t\t- SVM Linear with C : " + str(config["0"])
 
 def getInterpret(classifier, directory):
+    # TODO : coeffs
     return ""
diff --git a/Code/MonoMultiViewClassifiers/utils/Interpret.py b/Code/MonoMultiViewClassifiers/utils/Interpret.py
new file mode 100644
index 00000000..6f1882a5
--- /dev/null
+++ b/Code/MonoMultiViewClassifiers/utils/Interpret.py
@@ -0,0 +1,35 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.ticker import FuncFormatter
+import cPickle
+
+
+def percent(x, pos):
+    """Tick formatter: render the fraction x as a percentage (pos is unused)."""
+    return '{0:1.1f} %'.format(x * 100)
+
+
+def getFeatureImportance(classifier, directory, interpretString=""):
+    """Plot, pickle and describe the feature importances of `classifier`.
+
+    Writes directory + "-feature_importances.png" (top 50 bars) and ".pickle"
+    (non-zero importances by index); returns interpretString plus a text report."""
+    featureImportances = classifier.feature_importances_
+    featureIndicesSorted = np.argsort(-featureImportances)[:50]
+    featureImportancesSorted = featureImportances[featureIndicesSorted]
+    fig, ax = plt.subplots()
+    ax.yaxis.set_major_formatter(FuncFormatter(percent))
+    # Size x from the data: a fixed np.arange(50) fails with fewer than 50 features.
+    plt.bar(np.arange(len(featureImportancesSorted)), featureImportancesSorted)
+    plt.title("Importance depending on feature")
+    fig.savefig(directory + "-feature_importances.png")
+    plt.close(fig)
+    featuresImportancesDict = {i: v for i, v in enumerate(featureImportances) if v != 0}
+    with open(directory+'-feature_importances.pickle', 'wb') as handle:
+        cPickle.dump(featuresImportancesDict, handle)
+    interpretString += "Feature importances : \n"
+    for featureIndex, featureImportance in zip(featureIndicesSorted, featureImportancesSorted):
+        if featureImportance>0:
+            interpretString+="- Feature index : "+str(featureIndex)+\
+                             ", feature importance : "+str(featureImportance)+"\n"
+    return interpretString
\ No newline at end of file
-- 
GitLab