diff --git a/Code/Baptiste/mumbo.py b/Code/Baptiste/mumbo.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb778c82731972269cbbbc6a602dcd59f259c4ea
--- /dev/null
+++ b/Code/Baptiste/mumbo.py
@@ -0,0 +1,4 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8
+
+# Work in progress
\ No newline at end of file
diff --git a/Code/MultiView/GetMutliviewDb.py b/Code/MultiView/GetMutliviewDb.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/Code/MultiView/Mumbo/Classifiers/DecisionTree.py b/Code/MultiView/Mumbo/Classifiers/DecisionTree.py
index c7223b371bcb72bbd06b5c81b18101b5e60d62ca..64d28b6f6a23de2d53d4af1e44b986841b5ca7b6 100644
--- a/Code/MultiView/Mumbo/Classifiers/DecisionTree.py
+++ b/Code/MultiView/Mumbo/Classifiers/DecisionTree.py
@@ -1,6 +1,9 @@
 from sklearn import tree
 from sklearn.multiclass import OneVsRestClassifier
 
+
+# Add weights 
+
 def decisionTree(data, labels, arg):
     classifier = OneVsRestClassifier(tree.DecisionTreeClassifier(max_depth=arg))
     classifier.fit(data, labels)
diff --git a/Code/MultiView/Mumbo/Mumbo.py b/Code/MultiView/Mumbo/Mumbo.py
index ee31b12ad79349ee57c6dfc04a9caa0bc718e8b8..d49511ba846e3868aef52f0f47cbbe7d7dacca39 100644
--- a/Code/MultiView/Mumbo/Mumbo.py
+++ b/Code/MultiView/Mumbo/Mumbo.py
@@ -1,5 +1,8 @@
 import numpy as np
 import math
+from joblib import Parallel, delayed
+import Classifers
+
 
 def initialize(NB_CLASS, NB_VIEW, NB_ITER, DATASET_LENGTH, CLASS_LABELS):
     costMatrices = np.array([
@@ -9,7 +12,10 @@ def initialize(NB_CLASS, NB_VIEW, NB_ITER, DATASET_LENGTH, CLASS_LABELS):
                                        else -(NB_CLASS-1)
                                        for classe in range(NB_CLASS)
                                     ]) for exampleIndice in range(DATASET_LENGTH)
-                            ]) for viewIndice in range(NB_VIEW)]) if iteration==0 else np.zeros((NB_VIEW, DATASET_LENGTH, NB_CLASS)) for iteration in range(NB_ITER+1)
+                            ]) for viewIndice in range(NB_VIEW)]) \
+                        if iteration==0 \
+                        else np.zeros((NB_VIEW, DATASET_LENGTH, NB_CLASS)) \
+                        for iteration in range(NB_ITER+1)
                     ])
     generalCostMatrix = np.array([
                             np.array([
@@ -26,11 +32,43 @@ def initialize(NB_CLASS, NB_VIEW, NB_ITER, DATASET_LENGTH, CLASS_LABELS):
     predictions = np.zeros((NB_ITER, NB_VIEW, DATASET_LENGTH))
     generalAlphas = np.zeros(NB_ITER)
     generalFs = np.zeros((NB_ITER, DATASET_LENGTH, NB_CLASS))
-    return costMatrices, generalCostMatrix, fs, ds, edges, alphas, predictions, generalAlphas, generalFs
+    return costMatrices, generalCostMatrix, fs, ds, edges, alphas, \
+            predictions, generalAlphas, generalFs
+
+
+def computeWeights(costMatrices, NB_CLASS, DATASET_LENGTH, iterIndice, 
+                    viewIndice, CLASS_LABELS):
+    dist = np.sum(costMatrices[iterIndice, viewIndice]) # ATTENTION
+    weights = np.array([costMatrices[iterIndice, viewIndice, 
+                                    exampleIndice, CLASS_LABELS[exampleIndice]]/dist\
+                         for exampleIndice in range(DATASET_LENGTH)])
+    return weights
+
+
+def trainWeakClassifier(classifierName, DATASET, CLASS_LABELS, costMatrices, 
+                        NB_CLASS, DATASET_LENGTH, iterIndice, viewIndice, 
+                        classifier_config):
+    weights = computeWeights(costMatrices, NB_CLASS, DATASET_LENGTH, 
+                            iterIndice, viewIndice)
+    #Train classifier(classifierName, weights, DATASET, CLASS_LABEL, classifier_config)
+    return classifier, classes
 
 
-def trainWeakClassifers(classifer, costMatrices):
-    return
+
+def trainWeakClassifers(classifierName, DATASET, CLASS_LABELS, costMatrices, 
+                        NB_CLASS, DATASET_LENGTH, iterIndice, classifier_config,
+                        NB_CORES):
+    if NB_CORES > NB_VIEW:
+        NB_JOBS = NB_VIEW
+    else:
+        NB_JOBS = NB_CORES
+
+    trainedClassifiers, classesMatrix = Parallel(n_jobs=NB_JOBS)(
+        delayed(trainWeakClassifier)(classifierName, DATASET, CLASS_LABELS, 
+                                    costMatrices, NB_CLASS, DATASET_LENGTH, 
+                                    iterIndice, viewIndice, classifier_config) 
+        for viewIndice in range(NB_VIEW))
+        return trainedClassifiers, classesMatrix
 
 
 def computeEdge (predictionMatrix, costMatrix):
@@ -41,7 +79,8 @@ def computeAlpha(edge):
     return 0.5*math.log((1+edge)/(1-edge))
 
 
-def allViewsClassifyWell(predictions, pastIterIndice, NB_VIEW, CLASS_LABEL, exampleIndice):
+def allViewsClassifyWell(predictions, pastIterIndice, NB_VIEW, CLASS_LABEL, 
+                        exampleIndice):
     bool = True
     for viewIndice in range(NB_VIEW):
         if predictions[pastIterIndice, viewIndice, exampleIndice]!=CLASS_LABEL:
@@ -49,70 +88,185 @@ def allViewsClassifyWell(predictions, pastIterIndice, NB_VIEW, CLASS_LABEL, exam
     return bool
 
 
-def updateDs(ds, predictions, CLASS_LABELS, NB_VIEW, DATASET_LENGTH, NB_CLASS, iterIndice):
+def updateDs(ds, predictions, CLASS_LABELS, NB_VIEW, DATASET_LENGTH, NB_CLASS,
+             iterIndice):
     for viewIndice in range(NB_VIEW):
         for exampleIndice in range(DATASET_LENGTH):
             for pastIterIndice in range(iterIndice):
-                if predictions[pastIterIndice, viewIndice, exampleIndice]==CLASS_LABELS[exampleIndice] or allViewsClassifyWell(predictions, pastIterIndice, NB_VIEW, CLASS_LABELS[exampleIndice], exampleIndice):
+                
+                if predictions[pastIterIndice, viewIndice, exampleIndice]\
+                 ==\
+                   CLASS_LABELS[exampleIndice]\
+                 or allViewsClassifyWell(predictions, pastIterIndice, 
+                                        NB_VIEW, CLASS_LABELS[exampleIndice],
+                                        exampleIndice):
+
                     ds[pastIterIndice, viewIndice, exampleIndice]=1
                 else:
                     ds[pastIterIndice, viewIndice, exampleIndice]=0
     return ds
 
 
-def updateFs(predictions, ds, alphas, fs, iterIndice, NB_VIEW, DATASET_LENGTH, NB_CLASS, CLASS_LABELS):
+def updateFs(predictions, ds, alphas, fs, iterIndice, NB_VIEW, DATASET_LENGTH,
+             NB_CLASS, CLASS_LABELS):
     for viewIndice in range(NB_VIEW):
         for exampleIndice in range(DATASET_LENGTH):
             for classe in range(NB_CLASS):
-                fs[iterIndice, viewIndice, exampleIndice, classe] = np.sum(np.array([alphas[pastIterIndice, viewIndice]*ds[pastIterIndice, viewIndice, exampleIndice] if predictions[pastIterIndice, viewIndice, exampleIndice]==CLASS_LABELS[exampleIndice] else 0 for pastIterIndice in range(iterIndice)]))
+                fs[iterIndice, viewIndice, exampleIndice, classe] \
+                = np.sum(np.array([alphas[pastIterIndice, viewIndice]\
+                                    *ds[pastIterIndice, viewIndice, exampleIndice] \
+                                    if predictions[pastIterIndice, viewIndice, 
+                                                    exampleIndice]\
+                                        ==\
+                                       CLASS_LABELS[exampleIndice] \
+                                    else 0 \
+                                    for pastIterIndice in range(iterIndice)]))
     return fs
 
-def updateCostmatrices(costMatrices, fs, iterIndice, NB_VIEW, DATASET_LENGTH, NB_CLASS, CLASS_LABELS):
+def updateCostmatrices(costMatrices, fs, iterIndice, NB_VIEW, DATASET_LENGTH, 
+                        NB_CLASS, CLASS_LABELS):
     for viewIndice in range(NB_VIEW):
         for exampleIndice in range(DATASET_LENGTH):
             for classe in range(NB_CLASS):
                 if classe != CLASS_LABELS[exampleIndice]:
-                    costMatrices[iterIndice, viewIndice, exampleIndice, classe] = math.exp(fs[iterIndice, viewIndice, exampleIndice, classe] - fs[iterIndice, viewIndice, exampleIndice, CLASS_LABELS[exampleIndice]])
+                    costMatrices[iterIndice, viewIndice, exampleIndice, classe] \
+                    = math.exp(fs[iterIndice, viewIndice, exampleIndice, classe] - \
+                      fs[iterIndice, viewIndice, 
+                        exampleIndice, CLASS_LABELS[exampleIndice]])
                 else:
-                    costMatrices[iterIndice, viewIndice, exampleIndice, classe] = -1*np.sum(np.exp(fs[iterIndice, viewIndice, exampleIndice] - fs[iterIndice, viewIndice, exampleIndice, classe]))
-
+                    costMatrices[iterIndice, viewIndice, exampleIndice, classe] \
+                    = -1*np.sum(np.exp(fs[iterIndice, viewIndice, exampleIndice] - \
+                        fs[iterIndice, viewIndice, exampleIndice, classe]))
     return costMatrices
 
 
 def chooseView(predictions, generalCostMatrix, iterIndice, NB_VIEW):
-    edges = np.array([computeEdge(predictions[iterIndice, viewIndice], generalCostMatrix) for viewIndice in range(NB_VIEW)])
+    edges = np.array([computeEdge(predictions[iterIndice, viewIndice], 
+                                    generalCostMatrix) \
+                      for viewIndice in range(NB_VIEW)])
     bestView = np.argmax(edges)
     return bestView, edges[bestView]
 
 
-def updateGeneralFs(generalFs, iterIndice, predictions, alphas, DATASET_LENGTH, NB_CLASS, bestView, generalAlphas, CLASS_LABELS):
+def updateGeneralFs(generalFs, iterIndice, predictions, alphas,
+                    DATASET_LENGTH, NB_CLASS, bestView, generalAlphas, 
+                    CLASS_LABELS):
     for exampleIndice in range(DATASET_LENGTH):
         for classe in range(NB_CLASS):
-            generalFs[iterIndice, exampleIndice, classe]=np.sum(np.array([generalAlphas[pastIterIndice] if  predictions[pastIterIndice, bestView, exampleIndice]==CLASS_LABELS[exampleIndice] else 0 for pastIterIndice in range(iterIndice)]))
+            generalFs[iterIndice, exampleIndice, classe]\
+            = np.sum(np.array([generalAlphas[pastIterIndice] \
+                                if predictions[pastIterIndice, 
+                                                bestView, 
+                                                exampleIndice]\
+                                    ==\
+                                    CLASS_LABELS[exampleIndice]\
+                                 else 0 \
+                                 for pastIterIndice in range(iterIndice)\
+                            ])\
+                    )
         return generalFs
 
 
-def updateGeneralCostMatrix(generalCostMatrix, generalFs, iterIndice, DATASET_LENGTH, NB_CLASS, CLASS_LABELS):
+def updateGeneralCostMatrix(generalCostMatrix, generalFs, iterIndice, 
+                            DATASET_LENGTH, NB_CLASS, CLASS_LABELS):
     for exampleIndice in range(DATASET_LENGTH):
         for classe in range(NB_CLASS):
             if classe != CLASS_LABELS[exampleIndice]:
-                generalCostMatrix[iterIndice, exampleIndice, classe]=math.exp(generalFs[iterIndice, exampleIndice, classe] - generalFs[iterIndice, exampleIndice, CLASS_LABELS[exampleIndice]])
+                generalCostMatrix[iterIndice, exampleIndice, classe]\
+                = math.exp(generalFs[iterIndice, exampleIndice, classe] - \
+                  generalFs[iterIndice, exampleIndice, CLASS_LABELS[exampleIndice]])
             else:
-                generalCostMatrix[iterIndice, exampleIndice, classe]= -1 * np.sum(np.exp(generalFs[iterIndice, exampleIndice] - generalFs[iterIndice, exampleIndice, classe]))
+                generalCostMatrix[iterIndice, exampleIndice, classe]\
+                = -1 * np.sum(np.exp(generalFs[iterIndice, exampleIndice] - \
+                  generalFs[iterIndice, exampleIndice, classe]))
     return generalCostMatrix
 
-def mumbo(DATASET, CLASS_LABELS, NB_CLASS, NB_VIEW, NB_ITER, DATASET_LENGTH, classifier):
-    costMatrices, generalCostMatrix, fs, ds, edges, alphas, predictions, generalAlphas, generalFs = initialize(NB_CLASS, NB_VIEW, NB_ITER, DATASET_LENGTH, CLASS_LABELS)
+
+def computeFinalFs(DATASET_LENGTH, NB_CLASS, generalAlphas, predictions, 
+                    bestViews, CLASS_LABELS, NB_ITER):
+    finalFs = np.zeros((DATASET_LENGTH, NB_CLASS))
+    for exampleIndice in range(DATASET_LENGTH):
+        for classe in range(NB_CLASS):
+            finalFs[exampleIndice, classe] = np.sum(np.array([\
+                                                        generalAlphas[iterIndice]\
+                                                        if predictions[iterIndice, 
+                                                                    bestViews[iterIndice],
+                                                                    exampleIndice]\
+                                                                ==\
+                                                            CLASS_LABELS[exampleIndice]\
+                                                        else 0 \
+                                                        for iterIndice in range(NB_ITER)\
+                                                    ])\
+                                                )
+    return finalFs
+
+
+def trainMumbo(DATASET, CLASS_LABELS, NB_CLASS, NB_VIEW, NB_ITER, DATASET_LENGTH,
+                 classifierName, NB_CORES):
+    
+    # Initialization
+    costMatrices, \
+    generalCostMatrix, fs, ds, edges, alphas, \
+    predictions, generalAlphas, generalFs = initialize(NB_CLASS, NB_VIEW, 
+                                                        NB_ITER, DATASET_LENGTH, 
+                                                        CLASS_LABELS)
+    bestViews = np.zeros(NB_ITER)
+    bestClassifiers = []
+
+    # Learning
     for iterIndice in range(NB_ITER):
-        poulet = trainWeakClassifers(classifier, costMatrices)
+
+        classifiers, predictedLabels = trainWeakClassifers(classifierName, 
+                                                            DATASET, 
+                                                            CLASS_LABELS, 
+                                                            costMatrices, 
+                                                            NB_CLASS, 
+                                                            DATASET_LENGTH, 
+                                                            iterIndice, 
+                                                            classifier_config, 
+                                                            NB_CORES)
+        predictions[iterIndice] = predictedLabels
+        
         for viewIndice in range(NB_VIEW):
-            edges[iterIndice, viewIndice] = computeEdge(predictions[iterIndice, viewIndice], costMatrices[iterIndice+1, viewIndice])
-            alphas[iterIndice, viewIndice] = computeAlpha(edges[iterIndice, viewIndice])
-        ds = updateDs(ds, predictions, CLASS_LABELS, NB_VIEW, DATASET_LENGTH, NB_CLASS, iterIndice)
-        fs = updateFs(predictions, ds, alphas, fs, iterIndice, NB_VIEW, DATASET_LENGTH, NB_CLASS, CLASS_LABELS)
-        costMatrices = updateCostmatrices(costMatrices, fs, iterIndice, NB_VIEW, DATASET_LENGTH, NB_CLASS, CLASS_LABELS)
-        bestView, edge = chooseView(predictions, generalCostMatrix, iterIndice, NB_VIEW)
+        
+            edges[iterIndice, viewIndice] = computeEdge(predictions[iterIndice,
+                                                                     viewIndice], 
+                                                        costMatrices[iterIndice+1, 
+                                                                    viewIndice])
+            alphas[iterIndice, viewIndice] = computeAlpha(edges[iterIndice, 
+                                                                viewIndice])
+        
+        ds = updateDs(ds, predictions, CLASS_LABELS, NB_VIEW, DATASET_LENGTH, 
+                        NB_CLASS, iterIndice)
+        fs = updateFs(predictions, ds, alphas, fs, iterIndice, NB_VIEW, 
+                        DATASET_LENGTH, NB_CLASS, CLASS_LABELS)
+        costMatrices = updateCostmatrices(costMatrices, fs, iterIndice, 
+                                            NB_VIEW, DATASET_LENGTH, 
+                                            NB_CLASS, CLASS_LABELS)
+        
+        bestView, edge = chooseView(predictions, generalCostMatrix, 
+                                    iterIndice, NB_VIEW)
+        
+        bestViews[iterIndice] = bestView
         generalAlphas[iterIndice] = computeAlpha(edge)
-        generalFs = updateGeneralFs(generalFs, iterIndice, predictions, alphas, DATASET_LENGTH, NB_CLASS, bestView, generalAlphas, CLASS_LABELS)
-        generalCostMatrix = updateGeneralCostMatrix(generalCostMatrix, generalFs, iterIndice, DATASET_LENGTH, NB_CLASS, CLASS_LABELS)
-    return
\ No newline at end of file
+        bestClassifiers.append(classifiers[bestView])
+        generalFs = updateGeneralFs(generalFs, iterIndice, predictions, alphas,
+                                     DATASET_LENGTH, NB_CLASS, bestView, 
+                                     generalAlphas, CLASS_LABELS)
+        generalCostMatrix = updateGeneralCostMatrix(generalCostMatrix, 
+                                                    generalFs, iterIndice, 
+                                                    DATASET_LENGTH, NB_CLASS, 
+                                                    CLASS_LABELS)
+    
+    # finalFs = computeFinalFs(DATASET_LENGTH, NB_CLASS, generalAlphas, predictions, bestViews, CLASS_LABELS, NB_ITER)
+    return bestClassifiers, generalAlphas
+
+def classifyMumbo(DATASET, classifiers, alphas, NB_CLASS):
+    DATASET_LENGTH = len(DATASET)
+    predictedLabels = np.zeros(DATASET_LENGTH)
+    for exampleIndice in range(DATASET_LENGTH):
+        votes = np.zeros(NB_CLASS)
+        for classifier, alpha in zip(classifiers, alphas):
+            votes[int(classifier.predict(DATASET[exampleIndice]))]+=alpha
+        predictedLabels[exampleIndice] = np.argmax(votes)
+    return predictedLabels
\ No newline at end of file
diff --git a/ipynb/.ipynb_checkpoints/FeatureExtraction-All-checkpoint.ipynb b/ipynb/.ipynb_checkpoints/FeatureExtraction-All-checkpoint.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..34e8e06ce8f26758b19fe9dd7232c4659903e4fe
--- /dev/null
+++ b/ipynb/.ipynb_checkpoints/FeatureExtraction-All-checkpoint.ipynb
@@ -0,0 +1,707 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Code to Extract all Features from Database"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Author: Nikolas Hülsmann\n",
+    "#### Date: 2015-12-06"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Functions for Extract Data\n",
+    "\n",
+    "### Function to iterate through given directory and return images paths and classLabels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def imgCrawl(path, sClassLabels): #path to 'highest' folder\n",
+    "    rootdir = path\n",
+    "    df = pd.DataFrame()\n",
+    "        \n",
+    "    for subdir, dirs, files in os.walk(rootdir): # loop through subdirectories\n",
+    "        for file in files:\n",
+    "            pathOfFile = os.path.join(subdir, file) #path of file\n",
+    "            head, classLabel = os.path.split(os.path.split(pathOfFile)[0]) # get directoryname of file as classLabel\n",
+    "            \n",
+    "            # assign integer label for dataframe\n",
+    "            classLabel = sClassLabels[sClassLabels == classLabel].index[0]\n",
+    "            df = df.append({'classLabel': classLabel, 'pathOfFile': pathOfFile}, ignore_index=True) \n",
+    "            \n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Function to determine Class-Labels with Integer representation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# function to determine Class-labels and return Series\n",
+    "def getClassLabels(path):\n",
+    "    data = os.listdir(path) # listdir returns all subdirectories\n",
+    "    index = range(0,len(data))\n",
+    "    \n",
+    "    return pd.Series(data,index)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Function to calculate the RGBColorHistogram for given Images "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#### Calculate RGBColorHistograms for all images\n",
+    "\n",
+    "### Points to improve: \n",
+    "\n",
+    "# dfImages: Dataframe with paths to all images - use function imgCrawl\n",
+    "# numberOfBins_: Number of bins Histogram\n",
+    "def calcRGBColorHisto(dfImages_, numberOfBins_):\n",
+    "    # Initialize function\n",
+    "\n",
+    "    npImages = dfImages_.values\n",
+    "    numberOfBins = numberOfBins_\n",
+    "    npColorHist = np.zeros((len(npImages), numberOfBins*3), \"float32\")\n",
+    "    i=0\n",
+    "    \n",
+    "    ## algo\n",
+    "    for images in npImages:\n",
+    "        image = cv2.imread(images[1])\n",
+    "        \n",
+    "        # Image Size for Normalization\n",
+    "        #height, width, channels = image.shape\n",
+    "        #img_size = height * width\n",
+    "        \n",
+    "        # Split into color chanels rgb\n",
+    "        chans = cv2.split(image)\n",
+    "        colors = (\"b\", \"g\", \"r\")\n",
+    "        \n",
+    "        histogram = []\n",
+    "\n",
+    "        ########### Feature Color Histogram (cf. http://docs.opencv.org/2.4/doc/tutorials/imgproc/histograms/histogram_calculation/histogram_calculation.html)     # loop over the image channels\n",
+    "        for (chan, color) in zip(chans, colors):         \n",
+    "            \n",
+    "            # Calculate Color Histogram - 16 bins cf. paper (test with 64 has shown that die result is similair in score)\n",
+    "            # Seperates the intesity for each color from 0 to 256, and creates 16 bins of same size: 0-15, 16-31, .. , 241-256\n",
+    "            hist = cv2.calcHist([chan], [0], None, [numberOfBins], [0, 256])\n",
+    "\n",
+    "            # to get raw values\n",
+    "            hist = hist[:,0]\n",
+    "            \n",
+    "            # Normalize to a Distrubution from 0 to 1 throug calculating for each color channel (red/blue/green): \n",
+    "            #        (number of pixels in bin)/(sum of pixels in histogram)\n",
+    "            #hist[:] = [x / img_size for x in hist]\n",
+    "            sumHist = sum(hist)\n",
+    "            if(sumHist>0):\n",
+    "                hist[:] = [x / sumHist for x in hist]\n",
+    "            else:\n",
+    "                print colored(\"WARNING NORMELIZATION: sumHIST is zero (0)\", 'red')\n",
+    "                print colored(\"image: \" + images[1] + \"\\n\", 'red')\n",
+    "            \n",
+    "\n",
+    "            # Normalize with MinMax from 0 to 1 -> feature scaling\n",
+    "            #cv2.normalize(hist, hist, 0, 1, cv2.NORM_MINMAX)\n",
+    "            \n",
+    "            histogram.extend(hist)\n",
+    "\n",
+    "        # append features_colHist to npColorHist\n",
+    "        npColorHist[i] = histogram\n",
+    "        i = i+1\n",
+    "        \n",
+    "    \n",
+    "    return npColorHist"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#### Calculate HSVColorHistograms for all images\n",
+    "\n",
+    "### Points to improve: \n",
+    "\n",
+    "# dfImages: Dataframe with paths to all images - use function imgCrawl\n",
+    "# histSize: Number of bins in Histogram (differenz for each channel H,S,V)\n",
+    "def calcHSVColorHisto(dfImages_, histSize_):\n",
+    "    # Initialize function\n",
+    "    npImages = dfImages_.values\n",
+    "    histSize = histSize_\n",
+    "    npColorHist = np.zeros((len(npImages), int(histSize[0]+histSize[1]+histSize[2])), \"float32\")\n",
+    "    i=0\n",
+    "\n",
+    "    h_ranges = [ 0, 180 ]\n",
+    "    s_ranges = [ 0, 256 ]\n",
+    "    v_ranges = [ 0, 256 ]\n",
+    "\n",
+    "    ranges = []\n",
+    "\n",
+    "    ranges.append(h_ranges)\n",
+    "    ranges.append(s_ranges)\n",
+    "    ranges.append(v_ranges)\n",
+    "\n",
+    "    channels = [ 0, 1, 2]\n",
+    "    \n",
+    "    histogram = []\n",
+    "    \n",
+    "    ## algo\n",
+    "    for images in npImages:\n",
+    "        image = cv2.imread(images[1])\n",
+    "        hsv = cv2.cvtColor(img,cv2.COLOR_BGR2HSV)\n",
+    "        \n",
+    "        \n",
+    "        # Split into color chanels rgb\n",
+    "        chans = cv2.split(image)\n",
+    "        colors = (\"H\", \"S\", \"V\")\n",
+    "        \n",
+    "        histogram = []\n",
+    "        \n",
+    "        ########### Feature Color Histogram (cf. http://docs.opencv.org/2.4/doc/tutorials/imgproc/histograms/histogram_calculation/histogram_calculation.html)     # loop over the image channels\n",
+    "        for (chan, color, binsize, range_chan) in zip(chans, colors, histSize, ranges): \n",
+    "            hist = cv2.calcHist([chan], [0], None, [binsize], range_chan )\n",
+    "            \n",
+    "            # to get raw values\n",
+    "            hist = hist[:,0]\n",
+    "            \n",
+    "            \n",
+    "            # Normalize to a Distrubution from 0 to 1 throug calculating for each color channel (H/S/V): \n",
+    "            #        (number of pixels in bin)/(sum of pixels in histogram)\n",
+    "            #hist[:] = [x / img_size for x in hist]\n",
+    "            sumHist = sum(hist)\n",
+    "            if(sumHist>0):\n",
+    "                hist[:] = [x / sumHist for x in hist]\n",
+    "            else:\n",
+    "                print colored(\"WARNING NORMELIZATION: sumHIST is zero (0)\", 'red')\n",
+    "                print colored(\"image: \" + images[1] + \"\\n\", 'red')\n",
+    "                        \n",
+    "            histogram.extend(hist)\n",
+    "        \n",
+    "        \n",
+    "        # append features_colHist to npColorHist\n",
+    "        npColorHist[i] = histogram\n",
+    "        i = i+1\n",
+    "        \n",
+    "    \n",
+    "    return npColorHist\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Function to calculate Surf Histogram"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "################# FEATURE SURF (cf. http://docs.opencv.org/3.0-beta/doc/py_tutorials/py_feature2d/py_surf_intro/py_surf_intro.html#surf)\n",
+    "# API cf. http://docs.opencv.org/2.4/modules/nonfree/doc/feature_detection.html\n",
+    "\n",
+    "#### Calculate Histogramm of SURF Descripteurs with Bag Of Words appraoch for all images\n",
+    "\n",
+    "### Points to improve: \n",
+    "# - use spatial histogram: http://www.di.ens.fr/willow/events/cvml2011/materials/practical-classification/\n",
+    "# - change function: parameter how many K clustes/feature length (in regard of overfitting)\n",
+    "\n",
+    "\n",
+    "# path to higehst folder\n",
+    "# dfImages: Dataframe with paths to all images - use function imgCrawl\n",
+    "# k: number of K-Cluster -> length of feature vector\n",
+    "def calcSurfHisto(dfImages_, k_):\n",
+    "    \n",
+    "    # Initialize function\n",
+    "    npImages = dfImages_.values\n",
+    "    k = k_\n",
+    "    \n",
+    "    # List where all the descriptors are stored\n",
+    "    des_list = []\n",
+    "    \n",
+    "    #### Feature Detection and Description (Surf): \n",
+    "    # Detect (localize) for each image the keypoints (Points of Interest in an image - Surf uses therefore like SIFT corners)\n",
+    "    # Pro: SIFT/SURF are scale and rotation invariant!\n",
+    "    for images in npImages:\n",
+    "        # Read image\n",
+    "        image = cv2.imread(images[1], cv2.CV_LOAD_IMAGE_COLOR)\n",
+    "        #cv2.cvtColor(image, gray, cv.CV_BGR2GRAY);\n",
+    "        \n",
+    "        # Method to detect keypoints (kp) and calculate the descripteurs (des) with one function call\n",
+    "        # Each image has different amount of kp, but each kp has a describteur of fixed length (128)\n",
+    "        kp, des = sift.detectAndCompute(image,None)\n",
+    "        des_list.append(des)\n",
+    "    \n",
+    "    # Stack all the descriptors vertically in a numpy array\n",
+    "    descriptors = des_list[0]\n",
+    "    for descriptor in des_list[0:]:\n",
+    "        descriptors = np.vstack((descriptors, descriptor)) \n",
+    "    \n",
+    "    #### Bag of Words Approach\n",
+    "    ### 1. Step: using K-means cluster to create dictionary/vocabulary/codebook:\n",
+    "    # Encoding is the quantization of the image kp/des that constitute the image to be classified. \n",
+    "    # Basic encoding schemes work by first running K-means on the set of all des that you collect \n",
+    "    # across multiple images.\n",
+    "    # This builds what is known a dictionary/vocabulary/codebook represented by the centroids obtained from the clustering.\n",
+    "    \n",
+    "    # Perform k-means clustering -> creates the words from all describteurs -> this is the (dic) dictionary/vocabulary/codebook\n",
+    "    # k: amount of different clusters to build! Will result in a feature length k\n",
+    "    dic, variance = kmeans(descriptors, k, 1) \n",
+    "    \n",
+    "    ### 2. Step: encoding/coding/vector quantization(vq) to assign each descripteur the closest \"visual word\" from dictionary:\n",
+    "    # At the end of this process, you end up with K representative \"visual words\" (the centroid of each cluster after \n",
+    "    # K means ends) of your image descripteurs. These \"visual words\" represent what is usually understood as your \n",
+    "    # visual dictionary. Once you have these visual words, encoding is the process of assigning \n",
+    "    # each descripteur within your image the \"visual word\" (nearest neighbor) in the dictionary.\n",
+    "    \n",
+    "    npSurfHist = np.zeros((len(npImages), k), \"float32\")\n",
+    "    for i in xrange(len(npImages)):\n",
+    "        # vq: (Encoding) Assign words from the dictionary to each descripteur\n",
+    "        words, distance = vq(des_list[i],dic)\n",
+    "        \n",
+    "        ### 3. Step: Pooling - calculate a histogram for each image\n",
+    "        # Pooling refers to the process of representing an image as a \"bag of words\". \n",
+    "        # The word bag here is meant to convey that once you have encoded each descripteur with a word  (a number between 1 and K), \n",
+    "        # you build a new representation (a bag) that discards the spatial relationship between the words that \n",
+    "        # constitute your image.\n",
+    "\n",
+    "        # This representation is often a histogram or a collection of spatially adjacent histograms of the desribteurs \n",
+    "        # (i.e. histograms of values 1 to K) that together form your image. \"Pooling\" is thus the process of \n",
+    "        # building a histogram of words (i.e. pooling ~ \"sampling\" words from the image to build a probability \n",
+    "        # mass function of words)\n",
+    "\n",
+    "        # To clarify, the purpose of pooling is two fold:\n",
+    "        #           By building a feature vector that is a histogram of words (as opposed to putting the full \"sentence of words\" \n",
+    "        #           in the feature vector), your descriptor will be invariant to changes in \"the ordering of words\". \n",
+    "        #           In computer vision this translates into invariance with respect to rotations and distortions of the image \n",
+    "        #           and object, which is a desirable thing to have.\n",
+    "\n",
+    "        #           If the dictionary is small compared to the length of the sentence, a histogram of words has less dimensions \n",
+    "        #           than the original vector. Less dimensions makes learning (training) much easier.\n",
+    "        \n",
+    "        \n",
+    "        # Count the accuarance of each word (w) in image (i) to build histogram\n",
+    "        for w in words:\n",
+    "            npSurfHist[i][w] += 1\n",
+    "        \n",
+    "        #### 4. Step: Normalization of features vector (Can be changed to distribution like ColorHisto)\n",
+    "        # Frequency divided by amount of words (k)\n",
+    "        sumHist = sum(npSurfHist[i])\n",
+    "        if(sumHist>0):\n",
+    "            for x in range(0,k):\n",
+    "                #npSurfHist[i][x] = npSurfHist[i][x]/k\n",
+    "                npSurfHist[i][x] = npSurfHist[i][x]/sumHist #sumHist can be zero...change system\n",
+    "        else:      \n",
+    "                print colored(\"WARNING NORMELIZATION: sumHIST is zero (0)\", 'red')\n",
+    "                print colored(\"image: \" + images[1] + \"\\n\", 'red')\n",
+    "                        \n",
+    "        \n",
+    "        #stdSlr = StandardScaler().fit(npSurfHist)\n",
+    "        #npSurfHist = stdSlr.transform(npSurfHist)\n",
+    "    \n",
+    "    return npSurfHist"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### SIFT Experimental - use SURF "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(100L, 128L)\n"
+     ]
+    }
+   ],
+   "source": [
+    "########### Feature SIFT (Scale-invariant feature transform cf. http://docs.opencv.org/master/da/df5/tutorial_py_sift_intro.html#gsc.tab=0)\n",
+    "# Api cf. http://docs.opencv.org/2.4/modules/nonfree/doc/feature_detection.html\n",
+    "import cv2\n",
+    "import numpy as np\n",
+    "\n",
+    "img = cv2.imread('D:/Caltech//airplanes//image_0306.jpg')\n",
+    "gray= cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)\n",
+    "\n",
+    "sift = cv2.SIFT(nfeatures=100)\n",
+    "#sift = cv2.xfeatures2d.SIFT_create()\n",
+    "\n",
+    "# Detector which detects the Keypoints in the Image\n",
+    "#kp = sift.detect(gray,None)\n",
+    "\n",
+    "# Just a visualization of the Keypoints in the Image\n",
+    "#img=cv2.drawKeypoints(gray,kp)\n",
+    "#cv2.imwrite('D:\\Sift-test\\sift_keypoints.jpg',img)\n",
+    "\n",
+    "# Another visualization with FLAG: draw a circle with size of keypoint and it will even show its orientation\n",
+    "#img=cv2.drawKeypoints(gray,kp,flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)\n",
+    "#cv2.imwrite('D:\\Sift-test\\sift_keypoints.jpg',img)\n",
+    "\n",
+    "# Method to compute the descripteurs after one has already detected the keypoints\n",
+    "#kp,des = sift.compute(gray,kp)\n",
+    "\n",
+    "#sift = cv2.xfeatures2d.SIFT_create()\n",
+    "#sift = cv2.SIFT()\n",
+    "\n",
+    "# Method to detect keypoints (kp) and calculate the descripteurs (des) with one function call\n",
+    "kp, des = sift.detectAndCompute(gray,None)\n",
+    "\n",
+    "print des.shape\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Functions to export calculated Data to csv "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#### Export Features to csv\n",
+    "def exportToCSV(pandasSorDF, filename):\n",
+    "    #filename = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Feature\"\n",
+    "    path = os.getcwdu() + \"\\\\\" + filename\n",
+    "    \n",
+    "    if os.path.isfile(path + \".csv\"):\n",
+    "        for i in range(1,20):\n",
+    "            testFileName = filename  + \"-\" + str(i) + \".csv\"\n",
+    "            if os.path.isfile(os.getcwdu() + \"\\\\\" +  testFileName)!=True:\n",
+    "                pandasSorDF.to_csv(testFileName)\n",
+    "                break\n",
+    "\n",
+    "    else:\n",
+    "        pandasSorDF.to_csv(filename + \".csv\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def exportNumpyToCSV(numpyArray, filename):\n",
+    "    #filename = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Feature\"\n",
+    "    path = os.getcwdu() + \"\\\\\" + filename\n",
+    "    \n",
+    "    if os.path.isfile(path + \".csv\"):\n",
+    "        for i in range(1,20):\n",
+    "            testFileName = filename  + \"-\" + str(i) + \".csv\"\n",
+    "            if os.path.isfile(os.getcwdu() + \"\\\\\" +  testFileName)!=True:\n",
+    "                np.savetxt(testFileName, numpyArray, delimiter=\",\")\n",
+    "                break\n",
+    "\n",
+    "    else:\n",
+    "        np.savetxt(filename + \".csv\", numpyArray, delimiter=\",\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Main Programm\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "import os           # for iteration throug directories\n",
+    "import pandas as pd # for Series and DataFrames\n",
+    "import cv2          # for OpenCV \n",
+    "import cv           # for OpenCV\n",
+    "import datetime     # for TimeStamp in CSVFile\n",
+    "from scipy.cluster.vq import * # for Clustering http://docs.scipy.org/doc/scipy/reference/cluster.vq.html\n",
+    "import numpy as np  # for arrays\n",
+    "import time       # for time calculations\n",
+    "from termcolor import colored #to color Warnings - PACKAGE MIGHT NOT BE AVAILABLE"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Extract images path from DataBase "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(9144L,)\n",
+      "Time to extract all images: 26.3950002193\n"
+     ]
+    }
+   ],
+   "source": [
+    "start = time.time()\n",
+    "\n",
+    "# Determine the Database to extract features\n",
+    "path ='D:\\Caltech'\n",
+    "\n",
+    "# get dictionary to link classLabels Text to Integers\n",
+    "sClassLabels = getClassLabels(path)\n",
+    "\n",
+    "# Get all path from all images inclusive classLabel as Integer\n",
+    "dfImages = imgCrawl(path, sClassLabels)\n",
+    "print dfImages.classLabel.shape\n",
+    "\n",
+    "end = time.time()\n",
+    "print \"Time to extract all images: \" + str(end - start)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# CSV Output\n",
+    "fileNameClassLabels = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Caltech-ClassLabels\"\n",
+    "exportNumpyToCSV(dfImages.classLabel, fileNameClassLabels)\n",
+    "\n",
+    "fileNameClassLabels = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Caltech-ClassLabels-Description\"\n",
+    "exportToCSV(sClassLabels, fileNameClassLabels)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### RGB Color Histogram "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(9144L, 48L)\n",
+      "Time to calculate RGBColorHistogram: 19.3360002041\n"
+     ]
+    }
+   ],
+   "source": [
+    "start = time.time()\n",
+    "\n",
+    "# Calculate RGB Color Histogram wit 16 bins for each color -> feature length = 3 x 16 = 48\n",
+    "npRGBColorHistogram = calcRGBColorHisto(dfImages, 16)\n",
+    "print npRGBColorHistogram.shape\n",
+    "\n",
+    "end = time.time()\n",
+    "print \"Time to calculate RGBColorHistogram: \" + str(end - start)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# CSV Output\n",
+    "fileNameColorHis = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Caltech-Feature-RGBColorHistogram\"\n",
+    "exportNumpyToCSV(npRGBColorHistogram, fileNameColorHis)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### HSV Color Histogram "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[31mWARNING NORMELIZATION: sumHIST is zero (0)\u001b[0m\n",
+      "\u001b[31mimage: D:\\Caltech\\BACKGROUND_Google\\image_0031.jpg\n",
+      "\u001b[0m\n",
+      "\u001b[31mWARNING NORMELIZATION: sumHIST is zero (0)\u001b[0m\n",
+      "\u001b[31mimage: D:\\Caltech\\menorah\\image_0022.jpg\n",
+      "\u001b[0m\n",
+      "(9144L, 14L)\n",
+      "Time to calculate HSVColorHistogram: 22.4220001698\n"
+     ]
+    }
+   ],
+   "source": [
+    "start = time.time()\n",
+    "\n",
+    "# Calculate HSV Color Histogramm -> feature length = 8+3+3 = 14\n",
+    "\n",
+    "h_bins = 8 \n",
+    "s_bins = 3\n",
+    "v_bins = 3\n",
+    "histSize = [ h_bins, s_bins, v_bins ]\n",
+    "\n",
+    "npHSVColorHistogram = calcHSVColorHisto(dfImages, histSize)\n",
+    "print npHSVColorHistogram.shape\n",
+    "\n",
+    "end = time.time()\n",
+    "print \"Time to calculate HSVColorHistogram: \" + str(end - start)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# CSV Output\n",
+    "fileNameColorHis = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Caltech-Feature-HSVColorHistogram\"\n",
+    "exportNumpyToCSV(npHSVColorHistogram, fileNameColorHis)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### SURF Histogram "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "start = time.time()\n",
+    "\n",
+    "# Calculate Surf Histogramm with K=100 Cluster\n",
+    "npSurfHistogram = calcSurfHisto(dfImages, 100)\n",
+    "print npSurfHistogram.shape\n",
+    "\n",
+    "end = time.time()\n",
+    "print \"Time to calculate SurfHistogram: \" + str(end - start)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# CSV Output\n",
+    "fileNameSurfHis = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Caltech-Feature-SurfHistogram\"\n",
+    "exportNumpyToCSV(npSurfHistogram, fileNameSurfHis)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/ipynb/.ipynb_checkpoints/HOG Extraction-checkpoint.ipynb b/ipynb/.ipynb_checkpoints/HOG Extraction-checkpoint.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..baafb339c6d77094303a9b186d45f1009c3501f0
--- /dev/null
+++ b/ipynb/.ipynb_checkpoints/HOG Extraction-checkpoint.ipynb	
@@ -0,0 +1,354 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h1>Imports </h1>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import os as os        # for iteration throug directories\n",
+    "import pandas as pd # for Series and DataFrames\n",
+    "import cv2          # for OpenCV \n",
+    "import numpy as np  # for arrays\n",
+    "import time       # for time calculations\n",
+    "from feature_extraction_try import imgCrawl, getClassLabels #for fetching images\n",
+    "from skimage.feature import hog #calculates HOGs \n",
+    "from sklearn.cluster import MiniBatchKMeans #for clustering"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h1>HOG computation :</h1>\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this file, we aim to compute a HOG descriptor thas has the same dimension for each image. So we used a bag of word approach : \n",
+    "<ul>\n",
+    "    <li>Resizing images to a 0mod5 height and width</li>\n",
+    "    <li>Sequencing each image in 5*5 cells</li>\n",
+    "    <li>Computing local histograms on each cell</li>\n",
+    "    <li>Clustering the local histograms</li>\n",
+    "    <li>Binning the local histograms to create our feature</li>\n",
+    "</ul>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h2>Resizing images : </h2>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We decided to resize images by duplicating the missing pixels on each side (width and height) because it seem to be the less disturbing way for gradient computation. \n",
+    "Another possible way is to crop each image, but it may delete useful information. \n",
+    "Returns the resized image as a np.array "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def reSize(image, CELL_DIMENSION):\n",
+    "  height, width, channels = image.shape\n",
+    "  \n",
+    "  if height%CELL_DIMENSION==0 and width%CELL_DIMENSION==0:\n",
+    "    resizedImage = image\n",
+    "  \n",
+    "  elif width%CELL_DIMENSION==0:\n",
+    "    missingPixels = CELL_DIMENSION-height%CELL_DIMENSION\n",
+    "    resizedImage = cv2.copyMakeBorder(image,0,missingPixels,0,\\\n",
+    "                                      0,cv2.BORDER_REPLICATE)\n",
+    "  \n",
+    "  elif height%CELL_DIMENSION==0:\n",
+    "    missingPixels = CELL_DIMENSION-width%CELL_DIMENSION\n",
+    "    resizedImage = cv2.copyMakeBorder(image,0,0,0,missingPixels,\\\n",
+    "                                      cv2.BORDER_REPLICATE)\n",
+    "  \n",
+    "  else:\n",
+    "    missingWidthPixels = CELL_DIMENSION-width%CELL_DIMENSION\n",
+    "    missingHeightPixels = CELL_DIMENSION-height%CELL_DIMENSION\n",
+    "    resizedImage = cv2.copyMakeBorder(image,0,missingHeightPixels,0,\\\n",
+    "                                      missingWidthPixels,cv2.BORDER_REPLICATE)\n",
+    "  return resizedImage"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h2>Sequencing images : </h2>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here, we just tranfrom our corpus in $(5*5)$ cells to be able to execute the bag of words approach. \n",
+    "\n",
+    "Returns a (nbImages$*$imageHeight mod 5$*$imageWidth mod 5$*$nbChannels) np.array"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def imageSequencing(npImages, CELL_DIMENSION):\n",
+    "  cells=[]\n",
+    "  \n",
+    "  for k in range(len(npImages)):\n",
+    "    image = cv2.imread(npImages[k][1])\n",
+    "    resizedImage = reSize(image, CELL_DIMENSION)\n",
+    "    height, width, channels = resizedImage.shape\n",
+    "    cells.append(\\\n",
+    "      np.array([\\\n",
+    "        resizedImage[\\\n",
+    "          j*CELL_DIMENSION:j*CELL_DIMENSION+CELL_DIMENSION,\\\n",
+    "          i*CELL_DIMENSION:i*CELL_DIMENSION+CELL_DIMENSION] \\\n",
+    "        for i in range(width/CELL_DIMENSION) \\\n",
+    "        for j in range(height/CELL_DIMENSION)\\\n",
+    "      ])\\\n",
+    "    )\n",
+    "  return np.array(cells)  "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h2>Compute HOG descriptor on each cell : </h2>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here we convert our images in grayscale toi be able to apply our localHistogram function (doc : http://scikit-image.org/docs/dev/auto_examples/plot_hog.html)\n",
+    "\n",
+    "Then we compute local HOGs on our cells with NB_ORIENTATIONS orientations"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def computeLocalHistograms(cells, NB_ORIENTATIONS, CELL_DIMENSION):\n",
+    "  localHistograms = np.array([\\\n",
+    "                      np.array([\\\n",
+    "                        hog(cv2.cvtColor( cell, \\\n",
+    "                                          cv2.COLOR_BGR2GRAY), \\\n",
+    "                                          orientations=NB_ORIENTATIONS, \\\n",
+    "                                          pixels_per_cell=(CELL_DIMENSION,\\\n",
+    "                                                          CELL_DIMENSION),\\\n",
+    "                                          cells_per_block=(1,1)) \\\n",
+    "                        for cell in image]) \\\n",
+    "                      for image in cells])\n",
+    "  return localHistograms"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h2>Clustering local HOGs : </h2>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here we search for NB_CLUSTERS clusters in all our localHistograms to be able to bin later our localHistograms.\n",
+    "\n",
+    "doc for MiniBatchKMeans : http://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def clusterGradients(localHistograms, NB_CLUSTERS, MAXITER):\n",
+    "  sizes = np.array([len(localHistogram) for localHistogram in localHistograms])\n",
+    "  nbImages =  len(localHistograms)\n",
+    "  flattenedHogs = np.array([cell for image in localHistograms for cell in image])\n",
+    "  miniBatchKMeans = MiniBatchKMeans(n_clusters=NB_CLUSTERS, max_iter=MAXITER, \\\n",
+    "                    compute_labels=True)\n",
+    "  localHistogramLabels = miniBatchKMeans.fit_predict(flattenedHogs)\n",
+    "  return localHistogramLabels, sizes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h2>Binning local HOGs : </h2>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then we just need to bin our local HOGs to avec a constant-sized feature based on HOG to describe our images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def makeHistograms(labels, NB_CLUSTERS, sizes):\n",
+    "  indiceInLabels = 0\n",
+    "  hogs = []\n",
+    "  for image in sizes:\n",
+    "    histogram = np.zeros(NB_CLUSTERS)\n",
+    "    for i in range(image):\n",
+    "      histogram[labels[indiceInLabels+i]] += 1\n",
+    "    hogs.append(histogram)\n",
+    "    indiceInLabels+=i \n",
+    "  return np.array(hogs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h2>Global function</h2>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def extractHOGFeature(npImages, CELL_DIMENSION, NB_ORIENTATIONS, \\\n",
+    "                      NB_CLUSTERS, MAXITER):\n",
+    "  cells = imageSequencing(npImages, CELL_DIMENSION)\n",
+    "  localHistograms = computeLocalHistograms(cells)\n",
+    "  localHistogramLabels, sizes = clusterGradients(localHistograms, \\\n",
+    "                                                NB_CLUSTERS, MAXITER)\n",
+    "  hogs = makeHistograms(localHistogramLabels, NB_CLUSTERS, sizes)\n",
+    "  return hogs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h1>Test zone</h1>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "if __name__ == '__main__':\n",
+    "\n",
+    "\n",
+    "  start = time.time()\n",
+    "  path ='/home/doob/Dropbox/Marseille/OMIS-Projet/03-jeux-de-donnees/101_ObjectCategories'\n",
+    "  testNpImages = [ [1,'testImage.jpg'] ]\n",
+    "  CELL_DIMENSION = 5\n",
+    "  NB_ORIENTATIONS = 8\n",
+    "  NB_CLUSTERS = 12\n",
+    "  MAXITER = 100\n",
+    "\n",
+    "  print \"Fetching Images in \" + path\n",
+    "  # get dictionary to link classLabels Text to Integers\n",
+    "  sClassLabels = getClassLabels(path)\n",
+    "  # Get all path from all images inclusive classLabel as Integer\n",
+    "  dfImages = imgCrawl(path, sClassLabels)\n",
+    "  npImages = dfImages.values\n",
+    "  extractedTime = time.time()\n",
+    "  print \"Extracted images in \" + str(extractedTime-start) +'sec'\n",
+    "  print \"Sequencing Images ...\"\n",
+    "  blocks = imageSequencing(testNpImages, 5)\n",
+    "  sequencedTime = time.time()\n",
+    "  print \"Sequenced images in \" + str(sequencedTime-extractedTime) +'sec'\n",
+    "  print \"Computing gradient on each block ...\"\n",
+    "  gradients = computeLocalHistograms(blocks, NB_ORIENTATIONS, CELL_DIMENSION)\n",
+    "  hogedTime = time.time()\n",
+    "  print \"Computed gradients in \" + str(hogedTime - sequencedTime) + 'sec'\n",
+    "  print \"Clustering gradients ...\"\n",
+    "  gradientLabels, sizes = clusterGradients(gradients, NB_CLUSTERS, MAXITER)\n",
+    "  clusteredItme = time.time()\n",
+    "  print \"Clustered gradients in \" + str(hogedTime - sequencedTime) + 'sec'\n",
+    "  print \"Computing histograms ...\"\n",
+    "  histograms = makeHistograms(gradientLabels, NB_CLUSTERS, sizes)\n",
+    "  end = time.time()\n",
+    "  print \"Computed histograms in \" + str(int(end - hogedTime)) + 'sec'\n",
+    "  print \"Histogram shape : \" +str(histograms.shape)\n",
+    "  print \"Total time : \" + str(end-start) + 'sec'\n",
+    "  #hogs = extractHOGFeature(testNpImages, CELL_DIMENSION, \\\n",
+    "  #                         NB_ORIENTATIONS, NB_CLUSTERS, MAXITER)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/ipynb/FeatureExtraction-All.ipynb b/ipynb/FeatureExtraction-All.ipynb
index f3e70d63ff286ebc5eda82d6380bbd217a2b25da..34e8e06ce8f26758b19fe9dd7232c4659903e4fe 100644
--- a/ipynb/FeatureExtraction-All.ipynb
+++ b/ipynb/FeatureExtraction-All.ipynb
@@ -1,707 +1,707 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Code to Extract all Features from Database"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Author: Nikolas Hülsmann\n",
-    "#### Date: 2015-12-06"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Functions for Extract Data\n",
-    "\n",
-    "### Function to iterate through given directory and return images paths and classLabels"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "def imgCrawl(path, sClassLabels): #path to 'highest' folder\n",
-    "    rootdir = path\n",
-    "    df = pd.DataFrame()\n",
-    "        \n",
-    "    for subdir, dirs, files in os.walk(rootdir): # loop through subdirectories\n",
-    "        for file in files:\n",
-    "            pathOfFile = os.path.join(subdir, file) #path of file\n",
-    "            head, classLabel = os.path.split(os.path.split(pathOfFile)[0]) # get directoryname of file as classLabel\n",
-    "            \n",
-    "            # assign integer label for dataframe\n",
-    "            classLabel = sClassLabels[sClassLabels == classLabel].index[0]\n",
-    "            df = df.append({'classLabel': classLabel, 'pathOfFile': pathOfFile}, ignore_index=True) \n",
-    "            \n",
-    "    return df"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Function to determine Class-Labels with Integer representation"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "# function to determine Class-labels and return Series\n",
-    "def getClassLabels(path):\n",
-    "    data = os.listdir(path) # listdir returns all subdirectories\n",
-    "    index = range(0,len(data))\n",
-    "    \n",
-    "    return pd.Series(data,index)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Function to calculate the RGBColorHistogram for given Images "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 44,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "#### Calculate RGBColorHistograms for all images\n",
-    "\n",
-    "### Points to improve: \n",
-    "\n",
-    "# dfImages: Dataframe with paths to all images - use function imgCrawl\n",
-    "# numberOfBins_: Number of bins Histogram\n",
-    "def calcRGBColorHisto(dfImages_, numberOfBins_):\n",
-    "    # Initialize function\n",
-    "\n",
-    "    npImages = dfImages_.values\n",
-    "    numberOfBins = numberOfBins_\n",
-    "    npColorHist = np.zeros((len(npImages), numberOfBins*3), \"float32\")\n",
-    "    i=0\n",
-    "    \n",
-    "    ## algo\n",
-    "    for images in npImages:\n",
-    "        image = cv2.imread(images[1])\n",
-    "        \n",
-    "        # Image Size for Normalization\n",
-    "        #height, width, channels = image.shape\n",
-    "        #img_size = height * width\n",
-    "        \n",
-    "        # Split into color chanels rgb\n",
-    "        chans = cv2.split(image)\n",
-    "        colors = (\"b\", \"g\", \"r\")\n",
-    "        \n",
-    "        histogram = []\n",
-    "\n",
-    "        ########### Feature Color Histogram (cf. http://docs.opencv.org/2.4/doc/tutorials/imgproc/histograms/histogram_calculation/histogram_calculation.html)     # loop over the image channels\n",
-    "        for (chan, color) in zip(chans, colors):         \n",
-    "            \n",
-    "            # Calculate Color Histogram - 16 bins cf. paper (test with 64 has shown that die result is similair in score)\n",
-    "            # Seperates the intesity for each color from 0 to 256, and creates 16 bins of same size: 0-15, 16-31, .. , 241-256\n",
-    "            hist = cv2.calcHist([chan], [0], None, [numberOfBins], [0, 256])\n",
-    "\n",
-    "            # to get raw values\n",
-    "            hist = hist[:,0]\n",
-    "            \n",
-    "            # Normalize to a Distrubution from 0 to 1 throug calculating for each color channel (red/blue/green): \n",
-    "            #        (number of pixels in bin)/(sum of pixels in histogram)\n",
-    "            #hist[:] = [x / img_size for x in hist]\n",
-    "            sumHist = sum(hist)\n",
-    "            if(sumHist>0):\n",
-    "                hist[:] = [x / sumHist for x in hist]\n",
-    "            else:\n",
-    "                print colored(\"WARNING NORMELIZATION: sumHIST is zero (0)\", 'red')\n",
-    "                print colored(\"image: \" + images[1] + \"\\n\", 'red')\n",
-    "            \n",
-    "\n",
-    "            # Normalize with MinMax from 0 to 1 -> feature scaling\n",
-    "            #cv2.normalize(hist, hist, 0, 1, cv2.NORM_MINMAX)\n",
-    "            \n",
-    "            histogram.extend(hist)\n",
-    "\n",
-    "        # append features_colHist to npColorHist\n",
-    "        npColorHist[i] = histogram\n",
-    "        i = i+1\n",
-    "        \n",
-    "    \n",
-    "    return npColorHist"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 43,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "#### Calculate HSVColorHistograms for all images\n",
-    "\n",
-    "### Points to improve: \n",
-    "\n",
-    "# dfImages: Dataframe with paths to all images - use function imgCrawl\n",
-    "# histSize: Number of bins in Histogram (differenz for each channel H,S,V)\n",
-    "def calcHSVColorHisto(dfImages_, histSize_):\n",
-    "    # Initialize function\n",
-    "    npImages = dfImages_.values\n",
-    "    histSize = histSize_\n",
-    "    npColorHist = np.zeros((len(npImages), int(histSize[0]+histSize[1]+histSize[2])), \"float32\")\n",
-    "    i=0\n",
-    "\n",
-    "    h_ranges = [ 0, 180 ]\n",
-    "    s_ranges = [ 0, 256 ]\n",
-    "    v_ranges = [ 0, 256 ]\n",
-    "\n",
-    "    ranges = []\n",
-    "\n",
-    "    ranges.append(h_ranges)\n",
-    "    ranges.append(s_ranges)\n",
-    "    ranges.append(v_ranges)\n",
-    "\n",
-    "    channels = [ 0, 1, 2]\n",
-    "    \n",
-    "    histogram = []\n",
-    "    \n",
-    "    ## algo\n",
-    "    for images in npImages:\n",
-    "        image = cv2.imread(images[1])\n",
-    "        hsv = cv2.cvtColor(img,cv2.COLOR_BGR2HSV)\n",
-    "        \n",
-    "        \n",
-    "        # Split into color chanels rgb\n",
-    "        chans = cv2.split(image)\n",
-    "        colors = (\"H\", \"S\", \"V\")\n",
-    "        \n",
-    "        histogram = []\n",
-    "        \n",
-    "        ########### Feature Color Histogram (cf. http://docs.opencv.org/2.4/doc/tutorials/imgproc/histograms/histogram_calculation/histogram_calculation.html)     # loop over the image channels\n",
-    "        for (chan, color, binsize, range_chan) in zip(chans, colors, histSize, ranges): \n",
-    "            hist = cv2.calcHist([chan], [0], None, [binsize], range_chan )\n",
-    "            \n",
-    "            # to get raw values\n",
-    "            hist = hist[:,0]\n",
-    "            \n",
-    "            \n",
-    "            # Normalize to a Distrubution from 0 to 1 throug calculating for each color channel (H/S/V): \n",
-    "            #        (number of pixels in bin)/(sum of pixels in histogram)\n",
-    "            #hist[:] = [x / img_size for x in hist]\n",
-    "            sumHist = sum(hist)\n",
-    "            if(sumHist>0):\n",
-    "                hist[:] = [x / sumHist for x in hist]\n",
-    "            else:\n",
-    "                print colored(\"WARNING NORMELIZATION: sumHIST is zero (0)\", 'red')\n",
-    "                print colored(\"image: \" + images[1] + \"\\n\", 'red')\n",
-    "                        \n",
-    "            histogram.extend(hist)\n",
-    "        \n",
-    "        \n",
-    "        # append features_colHist to npColorHist\n",
-    "        npColorHist[i] = histogram\n",
-    "        i = i+1\n",
-    "        \n",
-    "    \n",
-    "    return npColorHist\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Function to calculate Surf Histogram"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 42,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "################# FEATURE SURF (cf. http://docs.opencv.org/3.0-beta/doc/py_tutorials/py_feature2d/py_surf_intro/py_surf_intro.html#surf)\n",
-    "# API cf. http://docs.opencv.org/2.4/modules/nonfree/doc/feature_detection.html\n",
-    "\n",
-    "#### Calculate Histogramm of SURF Descripteurs with Bag Of Words appraoch for all images\n",
-    "\n",
-    "### Points to improve: \n",
-    "# - use spatial histogram: http://www.di.ens.fr/willow/events/cvml2011/materials/practical-classification/\n",
-    "# - change function: parameter how many K clustes/feature length (in regard of overfitting)\n",
-    "\n",
-    "\n",
-    "# path to higehst folder\n",
-    "# dfImages: Dataframe with paths to all images - use function imgCrawl\n",
-    "# k: number of K-Cluster -> length of feature vector\n",
-    "def calcSurfHisto(dfImages_, k_):\n",
-    "    \n",
-    "    # Initialize function\n",
-    "    npImages = dfImages_.values\n",
-    "    k = k_\n",
-    "    \n",
-    "    # List where all the descriptors are stored\n",
-    "    des_list = []\n",
-    "    \n",
-    "    #### Feature Detection and Description (Surf): \n",
-    "    # Detect (localize) for each image the keypoints (Points of Interest in an image - Surf uses therefore like SIFT corners)\n",
-    "    # Pro: SIFT/SURF are scale and rotation invariant!\n",
-    "    for images in npImages:\n",
-    "        # Read image\n",
-    "        image = cv2.imread(images[1], cv2.CV_LOAD_IMAGE_COLOR)\n",
-    "        #cv2.cvtColor(image, gray, cv.CV_BGR2GRAY);\n",
-    "        \n",
-    "        # Method to detect keypoints (kp) and calculate the descripteurs (des) with one function call\n",
-    "        # Each image has different amount of kp, but each kp has a describteur of fixed length (128)\n",
-    "        kp, des = sift.detectAndCompute(image,None)\n",
-    "        des_list.append(des)\n",
-    "    \n",
-    "    # Stack all the descriptors vertically in a numpy array\n",
-    "    descriptors = des_list[0]\n",
-    "    for descriptor in des_list[0:]:\n",
-    "        descriptors = np.vstack((descriptors, descriptor)) \n",
-    "    \n",
-    "    #### Bag of Words Approach\n",
-    "    ### 1. Step: using K-means cluster to create dictionary/vocabulary/codebook:\n",
-    "    # Encoding is the quantization of the image kp/des that constitute the image to be classified. \n",
-    "    # Basic encoding schemes work by first running K-means on the set of all des that you collect \n",
-    "    # across multiple images.\n",
-    "    # This builds what is known a dictionary/vocabulary/codebook represented by the centroids obtained from the clustering.\n",
-    "    \n",
-    "    # Perform k-means clustering -> creates the words from all describteurs -> this is the (dic) dictionary/vocabulary/codebook\n",
-    "    # k: amount of different clusters to build! Will result in a feature length k\n",
-    "    dic, variance = kmeans(descriptors, k, 1) \n",
-    "    \n",
-    "    ### 2. Step: encoding/coding/vector quantization(vq) to assign each descripteur the closest \"visual word\" from dictionary:\n",
-    "    # At the end of this process, you end up with K representative \"visual words\" (the centroid of each cluster after \n",
-    "    # K means ends) of your image descripteurs. These \"visual words\" represent what is usually understood as your \n",
-    "    # visual dictionary. Once you have these visual words, encoding is the process of assigning \n",
-    "    # each descripteur within your image the \"visual word\" (nearest neighbor) in the dictionary.\n",
-    "    \n",
-    "    npSurfHist = np.zeros((len(npImages), k), \"float32\")\n",
-    "    for i in xrange(len(npImages)):\n",
-    "        # vq: (Encoding) Assign words from the dictionary to each descripteur\n",
-    "        words, distance = vq(des_list[i],dic)\n",
-    "        \n",
-    "        ### 3. Step: Pooling - calculate a histogram for each image\n",
-    "        # Pooling refers to the process of representing an image as a \"bag of words\". \n",
-    "        # The word bag here is meant to convey that once you have encoded each descripteur with a word  (a number between 1 and K), \n",
-    "        # you build a new representation (a bag) that discards the spatial relationship between the words that \n",
-    "        # constitute your image.\n",
-    "\n",
-    "        # This representation is often a histogram or a collection of spatially adjacent histograms of the desribteurs \n",
-    "        # (i.e. histograms of values 1 to K) that together form your image. \"Pooling\" is thus the process of \n",
-    "        # building a histogram of words (i.e. pooling ~ \"sampling\" words from the image to build a probability \n",
-    "        # mass function of words)\n",
-    "\n",
-    "        # To clarify, the purpose of pooling is two fold:\n",
-    "        #           By building a feature vector that is a histogram of words (as opposed to putting the full \"sentence of words\" \n",
-    "        #           in the feature vector), your descriptor will be invariant to changes in \"the ordering of words\". \n",
-    "        #           In computer vision this translates into invariance with respect to rotations and distortions of the image \n",
-    "        #           and object, which is a desirable thing to have.\n",
-    "\n",
-    "        #           If the dictionary is small compared to the length of the sentence, a histogram of words has less dimensions \n",
-    "        #           than the original vector. Less dimensions makes learning (training) much easier.\n",
-    "        \n",
-    "        \n",
-    "        # Count the accuarance of each word (w) in image (i) to build histogram\n",
-    "        for w in words:\n",
-    "            npSurfHist[i][w] += 1\n",
-    "        \n",
-    "        #### 4. Step: Normalization of features vector (Can be changed to distribution like ColorHisto)\n",
-    "        # Frequency divided by amount of words (k)\n",
-    "        sumHist = sum(npSurfHist[i])\n",
-    "        if(sumHist>0):\n",
-    "            for x in range(0,k):\n",
-    "                #npSurfHist[i][x] = npSurfHist[i][x]/k\n",
-    "                npSurfHist[i][x] = npSurfHist[i][x]/sumHist #sumHist can be zero...change system\n",
-    "        else:      \n",
-    "                print colored(\"WARNING NORMELIZATION: sumHIST is zero (0)\", 'red')\n",
-    "                print colored(\"image: \" + images[1] + \"\\n\", 'red')\n",
-    "                        \n",
-    "        \n",
-    "        #stdSlr = StandardScaler().fit(npSurfHist)\n",
-    "        #npSurfHist = stdSlr.transform(npSurfHist)\n",
-    "    \n",
-    "    return npSurfHist"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### SIFT Experimental - use SURF "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(100L, 128L)\n"
-     ]
-    }
-   ],
-   "source": [
-    "########### Feature SIFT (Scale-invariant feature transform cf. http://docs.opencv.org/master/da/df5/tutorial_py_sift_intro.html#gsc.tab=0)\n",
-    "# Api cf. http://docs.opencv.org/2.4/modules/nonfree/doc/feature_detection.html\n",
-    "import cv2\n",
-    "import numpy as np\n",
-    "\n",
-    "img = cv2.imread('D:/Caltech//airplanes//image_0306.jpg')\n",
-    "gray= cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)\n",
-    "\n",
-    "sift = cv2.SIFT(nfeatures=100)\n",
-    "#sift = cv2.xfeatures2d.SIFT_create()\n",
-    "\n",
-    "# Detector which detects the Keypoints in the Image\n",
-    "#kp = sift.detect(gray,None)\n",
-    "\n",
-    "# Just a visualization of the Keypoints in the Image\n",
-    "#img=cv2.drawKeypoints(gray,kp)\n",
-    "#cv2.imwrite('D:\\Sift-test\\sift_keypoints.jpg',img)\n",
-    "\n",
-    "# Another visualization with FLAG: draw a circle with size of keypoint and it will even show its orientation\n",
-    "#img=cv2.drawKeypoints(gray,kp,flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)\n",
-    "#cv2.imwrite('D:\\Sift-test\\sift_keypoints.jpg',img)\n",
-    "\n",
-    "# Method to compute the descripteurs after one has already detected the keypoints\n",
-    "#kp,des = sift.compute(gray,kp)\n",
-    "\n",
-    "#sift = cv2.xfeatures2d.SIFT_create()\n",
-    "#sift = cv2.SIFT()\n",
-    "\n",
-    "# Method to detect keypoints (kp) and calculate the descripteurs (des) with one function call\n",
-    "kp, des = sift.detectAndCompute(gray,None)\n",
-    "\n",
-    "print des.shape\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Functions to export calculated Data to csv "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "#### Export Features to csv\n",
-    "def exportToCSV(pandasSorDF, filename):\n",
-    "    #filename = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Feature\"\n",
-    "    path = os.getcwdu() + \"\\\\\" + filename\n",
-    "    \n",
-    "    if os.path.isfile(path + \".csv\"):\n",
-    "        for i in range(1,20):\n",
-    "            testFileName = filename  + \"-\" + str(i) + \".csv\"\n",
-    "            if os.path.isfile(os.getcwdu() + \"\\\\\" +  testFileName)!=True:\n",
-    "                pandasSorDF.to_csv(testFileName)\n",
-    "                break\n",
-    "\n",
-    "    else:\n",
-    "        pandasSorDF.to_csv(filename + \".csv\")\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "def exportNumpyToCSV(numpyArray, filename):\n",
-    "    #filename = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Feature\"\n",
-    "    path = os.getcwdu() + \"\\\\\" + filename\n",
-    "    \n",
-    "    if os.path.isfile(path + \".csv\"):\n",
-    "        for i in range(1,20):\n",
-    "            testFileName = filename  + \"-\" + str(i) + \".csv\"\n",
-    "            if os.path.isfile(os.getcwdu() + \"\\\\\" +  testFileName)!=True:\n",
-    "                np.savetxt(testFileName, numpyArray, delimiter=\",\")\n",
-    "                break\n",
-    "\n",
-    "    else:\n",
-    "        np.savetxt(filename + \".csv\", numpyArray, delimiter=\",\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Main Programm\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 40,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "# Imports\n",
-    "import os           # for iteration throug directories\n",
-    "import pandas as pd # for Series and DataFrames\n",
-    "import cv2          # for OpenCV \n",
-    "import cv           # for OpenCV\n",
-    "import datetime     # for TimeStamp in CSVFile\n",
-    "from scipy.cluster.vq import * # for Clustering http://docs.scipy.org/doc/scipy/reference/cluster.vq.html\n",
-    "import numpy as np  # for arrays\n",
-    "import time       # for time calculations\n",
-    "from termcolor import colored #to color Warnings - PACKAGE MIGHT NOT BE AVAILABLE"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Extract images path from DataBase "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(9144L,)\n",
-      "Time to extract all images: 26.3950002193\n"
-     ]
-    }
-   ],
-   "source": [
-    "start = time.time()\n",
-    "\n",
-    "# Determine the Database to extract features\n",
-    "path ='D:\\Caltech'\n",
-    "\n",
-    "# get dictionary to link classLabels Text to Integers\n",
-    "sClassLabels = getClassLabels(path)\n",
-    "\n",
-    "# Get all path from all images inclusive classLabel as Integer\n",
-    "dfImages = imgCrawl(path, sClassLabels)\n",
-    "print dfImages.classLabel.shape\n",
-    "\n",
-    "end = time.time()\n",
-    "print \"Time to extract all images: \" + str(end - start)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "# CSV Output\n",
-    "fileNameClassLabels = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Caltech-ClassLabels\"\n",
-    "exportNumpyToCSV(dfImages.classLabel, fileNameClassLabels)\n",
-    "\n",
-    "fileNameClassLabels = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Caltech-ClassLabels-Description\"\n",
-    "exportToCSV(sClassLabels, fileNameClassLabels)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### RGB Color Histogram "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 47,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(9144L, 48L)\n",
-      "Time to calculate RGBColorHistogram: 19.3360002041\n"
-     ]
-    }
-   ],
-   "source": [
-    "start = time.time()\n",
-    "\n",
-    "# Calculate RGB Color Histogram wit 16 bins for each color -> feature length = 3 x 16 = 48\n",
-    "npRGBColorHistogram = calcRGBColorHisto(dfImages, 16)\n",
-    "print npRGBColorHistogram.shape\n",
-    "\n",
-    "end = time.time()\n",
-    "print \"Time to calculate RGBColorHistogram: \" + str(end - start)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "# CSV Output\n",
-    "fileNameColorHis = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Caltech-Feature-RGBColorHistogram\"\n",
-    "exportNumpyToCSV(npRGBColorHistogram, fileNameColorHis)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### HSV Color Histogram "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 45,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\u001b[31mWARNING NORMELIZATION: sumHIST is zero (0)\u001b[0m\n",
-      "\u001b[31mimage: D:\\Caltech\\BACKGROUND_Google\\image_0031.jpg\n",
-      "\u001b[0m\n",
-      "\u001b[31mWARNING NORMELIZATION: sumHIST is zero (0)\u001b[0m\n",
-      "\u001b[31mimage: D:\\Caltech\\menorah\\image_0022.jpg\n",
-      "\u001b[0m\n",
-      "(9144L, 14L)\n",
-      "Time to calculate HSVColorHistogram: 22.4220001698\n"
-     ]
-    }
-   ],
-   "source": [
-    "start = time.time()\n",
-    "\n",
-    "# Calculate HSV Color Histogramm -> feature length = 8+3+3 = 14\n",
-    "\n",
-    "h_bins = 8 \n",
-    "s_bins = 3\n",
-    "v_bins = 3\n",
-    "histSize = [ h_bins, s_bins, v_bins ]\n",
-    "\n",
-    "npHSVColorHistogram = calcHSVColorHisto(dfImages, histSize)\n",
-    "print npHSVColorHistogram.shape\n",
-    "\n",
-    "end = time.time()\n",
-    "print \"Time to calculate HSVColorHistogram: \" + str(end - start)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 46,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "# CSV Output\n",
-    "fileNameColorHis = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Caltech-Feature-HSVColorHistogram\"\n",
-    "exportNumpyToCSV(npHSVColorHistogram, fileNameColorHis)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### SURF Histogram "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "start = time.time()\n",
-    "\n",
-    "# Calculate Surf Histogramm with K=100 Cluster\n",
-    "npSurfHistogram = calcSurfHisto(dfImages, 100)\n",
-    "print npSurfHistogram.shape\n",
-    "\n",
-    "end = time.time()\n",
-    "print \"Time to calculate SurfHistogram: \" + str(end - start)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "# CSV Output\n",
-    "fileNameSurfHis = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Caltech-Feature-SurfHistogram\"\n",
-    "exportNumpyToCSV(npSurfHistogram, fileNameSurfHis)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 2",
-   "language": "python",
-   "name": "python2"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 2
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Code to Extract all Features from Database"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Author: Nikolas Hülsmann\n",
+    "#### Date: 2015-12-06"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Functions for Extract Data\n",
+    "\n",
+    "### Function to iterate through given directory and return images paths and classLabels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def imgCrawl(path, sClassLabels): #path to 'highest' folder\n",
+    "    rootdir = path\n",
+    "    df = pd.DataFrame()\n",
+    "        \n",
+    "    for subdir, dirs, files in os.walk(rootdir): # loop through subdirectories\n",
+    "        for file in files:\n",
+    "            pathOfFile = os.path.join(subdir, file) #path of file\n",
+    "            head, classLabel = os.path.split(os.path.split(pathOfFile)[0]) # get directoryname of file as classLabel\n",
+    "            \n",
+    "            # assign integer label for dataframe\n",
+    "            classLabel = sClassLabels[sClassLabels == classLabel].index[0]\n",
+    "            df = df.append({'classLabel': classLabel, 'pathOfFile': pathOfFile}, ignore_index=True) \n",
+    "            \n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Function to determine Class-Labels with Integer representation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# function to determine Class-labels and return Series\n",
+    "def getClassLabels(path):\n",
+    "    data = os.listdir(path) # listdir returns all subdirectories\n",
+    "    index = range(0,len(data))\n",
+    "    \n",
+    "    return pd.Series(data,index)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Function to calculate the RGBColorHistogram for given Images "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#### Calculate RGBColorHistograms for all images\n",
+    "\n",
+    "### Points to improve: \n",
+    "\n",
+    "# dfImages: Dataframe with paths to all images - use function imgCrawl\n",
+    "# numberOfBins_: Number of bins Histogram\n",
+    "def calcRGBColorHisto(dfImages_, numberOfBins_):\n",
+    "    # Initialize function\n",
+    "\n",
+    "    npImages = dfImages_.values\n",
+    "    numberOfBins = numberOfBins_\n",
+    "    npColorHist = np.zeros((len(npImages), numberOfBins*3), \"float32\")\n",
+    "    i=0\n",
+    "    \n",
+    "    ## algo\n",
+    "    for images in npImages:\n",
+    "        image = cv2.imread(images[1])\n",
+    "        \n",
+    "        # Image Size for Normalization\n",
+    "        #height, width, channels = image.shape\n",
+    "        #img_size = height * width\n",
+    "        \n",
+    "        # Split into color chanels rgb\n",
+    "        chans = cv2.split(image)\n",
+    "        colors = (\"b\", \"g\", \"r\")\n",
+    "        \n",
+    "        histogram = []\n",
+    "\n",
+    "        ########### Feature Color Histogram (cf. http://docs.opencv.org/2.4/doc/tutorials/imgproc/histograms/histogram_calculation/histogram_calculation.html)     # loop over the image channels\n",
+    "        for (chan, color) in zip(chans, colors):         \n",
+    "            \n",
+    "            # Calculate Color Histogram - 16 bins cf. paper (test with 64 has shown that die result is similair in score)\n",
+    "            # Seperates the intesity for each color from 0 to 256, and creates 16 bins of same size: 0-15, 16-31, .. , 241-256\n",
+    "            hist = cv2.calcHist([chan], [0], None, [numberOfBins], [0, 256])\n",
+    "\n",
+    "            # to get raw values\n",
+    "            hist = hist[:,0]\n",
+    "            \n",
+    "            # Normalize to a Distrubution from 0 to 1 throug calculating for each color channel (red/blue/green): \n",
+    "            #        (number of pixels in bin)/(sum of pixels in histogram)\n",
+    "            #hist[:] = [x / img_size for x in hist]\n",
+    "            sumHist = sum(hist)\n",
+    "            if(sumHist>0):\n",
+    "                hist[:] = [x / sumHist for x in hist]\n",
+    "            else:\n",
+    "                print colored(\"WARNING NORMELIZATION: sumHIST is zero (0)\", 'red')\n",
+    "                print colored(\"image: \" + images[1] + \"\\n\", 'red')\n",
+    "            \n",
+    "\n",
+    "            # Normalize with MinMax from 0 to 1 -> feature scaling\n",
+    "            #cv2.normalize(hist, hist, 0, 1, cv2.NORM_MINMAX)\n",
+    "            \n",
+    "            histogram.extend(hist)\n",
+    "\n",
+    "        # append features_colHist to npColorHist\n",
+    "        npColorHist[i] = histogram\n",
+    "        i = i+1\n",
+    "        \n",
+    "    \n",
+    "    return npColorHist"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#### Calculate HSVColorHistograms for all images\n",
+    "\n",
+    "### Points to improve: \n",
+    "\n",
+    "# dfImages: Dataframe with paths to all images - use function imgCrawl\n",
+    "# histSize: Number of bins in Histogram (differenz for each channel H,S,V)\n",
+    "def calcHSVColorHisto(dfImages_, histSize_):\n",
+    "    # Initialize function\n",
+    "    npImages = dfImages_.values\n",
+    "    histSize = histSize_\n",
+    "    npColorHist = np.zeros((len(npImages), int(histSize[0]+histSize[1]+histSize[2])), \"float32\")\n",
+    "    i=0\n",
+    "\n",
+    "    h_ranges = [ 0, 180 ]\n",
+    "    s_ranges = [ 0, 256 ]\n",
+    "    v_ranges = [ 0, 256 ]\n",
+    "\n",
+    "    ranges = []\n",
+    "\n",
+    "    ranges.append(h_ranges)\n",
+    "    ranges.append(s_ranges)\n",
+    "    ranges.append(v_ranges)\n",
+    "\n",
+    "    channels = [ 0, 1, 2]\n",
+    "    \n",
+    "    histogram = []\n",
+    "    \n",
+    "    ## algo\n",
+    "    for images in npImages:\n",
+    "        image = cv2.imread(images[1])\n",
+    "        hsv = cv2.cvtColor(img,cv2.COLOR_BGR2HSV)\n",
+    "        \n",
+    "        \n",
+    "        # Split into color chanels rgb\n",
+    "        chans = cv2.split(image)\n",
+    "        colors = (\"H\", \"S\", \"V\")\n",
+    "        \n",
+    "        histogram = []\n",
+    "        \n",
+    "        ########### Feature Color Histogram (cf. http://docs.opencv.org/2.4/doc/tutorials/imgproc/histograms/histogram_calculation/histogram_calculation.html)     # loop over the image channels\n",
+    "        for (chan, color, binsize, range_chan) in zip(chans, colors, histSize, ranges): \n",
+    "            hist = cv2.calcHist([chan], [0], None, [binsize], range_chan )\n",
+    "            \n",
+    "            # to get raw values\n",
+    "            hist = hist[:,0]\n",
+    "            \n",
+    "            \n",
+    "            # Normalize to a Distrubution from 0 to 1 throug calculating for each color channel (H/S/V): \n",
+    "            #        (number of pixels in bin)/(sum of pixels in histogram)\n",
+    "            #hist[:] = [x / img_size for x in hist]\n",
+    "            sumHist = sum(hist)\n",
+    "            if(sumHist>0):\n",
+    "                hist[:] = [x / sumHist for x in hist]\n",
+    "            else:\n",
+    "                print colored(\"WARNING NORMELIZATION: sumHIST is zero (0)\", 'red')\n",
+    "                print colored(\"image: \" + images[1] + \"\\n\", 'red')\n",
+    "                        \n",
+    "            histogram.extend(hist)\n",
+    "        \n",
+    "        \n",
+    "        # append features_colHist to npColorHist\n",
+    "        npColorHist[i] = histogram\n",
+    "        i = i+1\n",
+    "        \n",
+    "    \n",
+    "    return npColorHist\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Function to calculate Surf Histogram"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "################# FEATURE SURF (cf. http://docs.opencv.org/3.0-beta/doc/py_tutorials/py_feature2d/py_surf_intro/py_surf_intro.html#surf)\n",
+    "# API cf. http://docs.opencv.org/2.4/modules/nonfree/doc/feature_detection.html\n",
+    "\n",
+    "#### Calculate Histogramm of SURF Descripteurs with Bag Of Words appraoch for all images\n",
+    "\n",
+    "### Points to improve: \n",
+    "# - use spatial histogram: http://www.di.ens.fr/willow/events/cvml2011/materials/practical-classification/\n",
+    "# - change function: parameter how many K clustes/feature length (in regard of overfitting)\n",
+    "\n",
+    "\n",
+    "# path to higehst folder\n",
+    "# dfImages: Dataframe with paths to all images - use function imgCrawl\n",
+    "# k: number of K-Cluster -> length of feature vector\n",
+    "def calcSurfHisto(dfImages_, k_):\n",
+    "    \n",
+    "    # Initialize function\n",
+    "    npImages = dfImages_.values\n",
+    "    k = k_\n",
+    "    \n",
+    "    # List where all the descriptors are stored\n",
+    "    des_list = []\n",
+    "    \n",
+    "    #### Feature Detection and Description (Surf): \n",
+    "    # Detect (localize) for each image the keypoints (Points of Interest in an image - Surf uses therefore like SIFT corners)\n",
+    "    # Pro: SIFT/SURF are scale and rotation invariant!\n",
+    "    for images in npImages:\n",
+    "        # Read image\n",
+    "        image = cv2.imread(images[1], cv2.CV_LOAD_IMAGE_COLOR)\n",
+    "        #cv2.cvtColor(image, gray, cv.CV_BGR2GRAY);\n",
+    "        \n",
+    "        # Method to detect keypoints (kp) and calculate the descripteurs (des) with one function call\n",
+    "        # Each image has different amount of kp, but each kp has a describteur of fixed length (128)\n",
+    "        kp, des = sift.detectAndCompute(image,None)\n",
+    "        des_list.append(des)\n",
+    "    \n",
+    "    # Stack all the descriptors vertically in a numpy array\n",
+    "    descriptors = des_list[0]\n",
+    "    for descriptor in des_list[0:]:\n",
+    "        descriptors = np.vstack((descriptors, descriptor)) \n",
+    "    \n",
+    "    #### Bag of Words Approach\n",
+    "    ### 1. Step: using K-means cluster to create dictionary/vocabulary/codebook:\n",
+    "    # Encoding is the quantization of the image kp/des that constitute the image to be classified. \n",
+    "    # Basic encoding schemes work by first running K-means on the set of all des that you collect \n",
+    "    # across multiple images.\n",
+    "    # This builds what is known a dictionary/vocabulary/codebook represented by the centroids obtained from the clustering.\n",
+    "    \n",
+    "    # Perform k-means clustering -> creates the words from all describteurs -> this is the (dic) dictionary/vocabulary/codebook\n",
+    "    # k: amount of different clusters to build! Will result in a feature length k\n",
+    "    dic, variance = kmeans(descriptors, k, 1) \n",
+    "    \n",
+    "    ### 2. Step: encoding/coding/vector quantization(vq) to assign each descripteur the closest \"visual word\" from dictionary:\n",
+    "    # At the end of this process, you end up with K representative \"visual words\" (the centroid of each cluster after \n",
+    "    # K means ends) of your image descripteurs. These \"visual words\" represent what is usually understood as your \n",
+    "    # visual dictionary. Once you have these visual words, encoding is the process of assigning \n",
+    "    # each descripteur within your image the \"visual word\" (nearest neighbor) in the dictionary.\n",
+    "    \n",
+    "    npSurfHist = np.zeros((len(npImages), k), \"float32\")\n",
+    "    for i in xrange(len(npImages)):\n",
+    "        # vq: (Encoding) Assign words from the dictionary to each descripteur\n",
+    "        words, distance = vq(des_list[i],dic)\n",
+    "        \n",
+    "        ### 3. Step: Pooling - calculate a histogram for each image\n",
+    "        # Pooling refers to the process of representing an image as a \"bag of words\". \n",
+    "        # The word bag here is meant to convey that once you have encoded each descripteur with a word  (a number between 1 and K), \n",
+    "        # you build a new representation (a bag) that discards the spatial relationship between the words that \n",
+    "        # constitute your image.\n",
+    "\n",
+    "        # This representation is often a histogram or a collection of spatially adjacent histograms of the desribteurs \n",
+    "        # (i.e. histograms of values 1 to K) that together form your image. \"Pooling\" is thus the process of \n",
+    "        # building a histogram of words (i.e. pooling ~ \"sampling\" words from the image to build a probability \n",
+    "        # mass function of words)\n",
+    "\n",
+    "        # To clarify, the purpose of pooling is two fold:\n",
+    "        #           By building a feature vector that is a histogram of words (as opposed to putting the full \"sentence of words\" \n",
+    "        #           in the feature vector), your descriptor will be invariant to changes in \"the ordering of words\". \n",
+    "        #           In computer vision this translates into invariance with respect to rotations and distortions of the image \n",
+    "        #           and object, which is a desirable thing to have.\n",
+    "\n",
+    "        #           If the dictionary is small compared to the length of the sentence, a histogram of words has less dimensions \n",
+    "        #           than the original vector. Less dimensions makes learning (training) much easier.\n",
+    "        \n",
+    "        \n",
+    "        # Count the accuarance of each word (w) in image (i) to build histogram\n",
+    "        for w in words:\n",
+    "            npSurfHist[i][w] += 1\n",
+    "        \n",
+    "        #### 4. Step: Normalization of features vector (Can be changed to distribution like ColorHisto)\n",
+    "        # Frequency divided by amount of words (k)\n",
+    "        sumHist = sum(npSurfHist[i])\n",
+    "        if(sumHist>0):\n",
+    "            for x in range(0,k):\n",
+    "                #npSurfHist[i][x] = npSurfHist[i][x]/k\n",
+    "                npSurfHist[i][x] = npSurfHist[i][x]/sumHist #sumHist can be zero...change system\n",
+    "        else:      \n",
+    "                print colored(\"WARNING NORMELIZATION: sumHIST is zero (0)\", 'red')\n",
+    "                print colored(\"image: \" + images[1] + \"\\n\", 'red')\n",
+    "                        \n",
+    "        \n",
+    "        #stdSlr = StandardScaler().fit(npSurfHist)\n",
+    "        #npSurfHist = stdSlr.transform(npSurfHist)\n",
+    "    \n",
+    "    return npSurfHist"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### SIFT Experimental - use SURF "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(100L, 128L)\n"
+     ]
+    }
+   ],
+   "source": [
+    "########### Feature SIFT (Scale-invariant feature transform cf. http://docs.opencv.org/master/da/df5/tutorial_py_sift_intro.html#gsc.tab=0)\n",
+    "# Api cf. http://docs.opencv.org/2.4/modules/nonfree/doc/feature_detection.html\n",
+    "import cv2\n",
+    "import numpy as np\n",
+    "\n",
+    "img = cv2.imread('D:/Caltech//airplanes//image_0306.jpg')\n",
+    "gray= cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)\n",
+    "\n",
+    "sift = cv2.SIFT(nfeatures=100)\n",
+    "#sift = cv2.xfeatures2d.SIFT_create()\n",
+    "\n",
+    "# Detector which detects the Keypoints in the Image\n",
+    "#kp = sift.detect(gray,None)\n",
+    "\n",
+    "# Just a visualization of the Keypoints in the Image\n",
+    "#img=cv2.drawKeypoints(gray,kp)\n",
+    "#cv2.imwrite('D:\\Sift-test\\sift_keypoints.jpg',img)\n",
+    "\n",
+    "# Another visualization with FLAG: draw a circle with size of keypoint and it will even show its orientation\n",
+    "#img=cv2.drawKeypoints(gray,kp,flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)\n",
+    "#cv2.imwrite('D:\\Sift-test\\sift_keypoints.jpg',img)\n",
+    "\n",
+    "# Method to compute the descripteurs after one has already detected the keypoints\n",
+    "#kp,des = sift.compute(gray,kp)\n",
+    "\n",
+    "#sift = cv2.xfeatures2d.SIFT_create()\n",
+    "#sift = cv2.SIFT()\n",
+    "\n",
+    "# Method to detect keypoints (kp) and calculate the descripteurs (des) with one function call\n",
+    "kp, des = sift.detectAndCompute(gray,None)\n",
+    "\n",
+    "print des.shape\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Functions to export calculated Data to csv "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "#### Export Features to csv\n",
+    "def exportToCSV(pandasSorDF, filename):\n",
+    "    #filename = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Feature\"\n",
+    "    path = os.getcwdu() + \"\\\\\" + filename\n",
+    "    \n",
+    "    if os.path.isfile(path + \".csv\"):\n",
+    "        for i in range(1,20):\n",
+    "            testFileName = filename  + \"-\" + str(i) + \".csv\"\n",
+    "            if os.path.isfile(os.getcwdu() + \"\\\\\" +  testFileName)!=True:\n",
+    "                pandasSorDF.to_csv(testFileName)\n",
+    "                break\n",
+    "\n",
+    "    else:\n",
+    "        pandasSorDF.to_csv(filename + \".csv\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def exportNumpyToCSV(numpyArray, filename):\n",
+    "    #filename = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Feature\"\n",
+    "    path = os.getcwdu() + \"\\\\\" + filename\n",
+    "    \n",
+    "    if os.path.isfile(path + \".csv\"):\n",
+    "        for i in range(1,20):\n",
+    "            testFileName = filename  + \"-\" + str(i) + \".csv\"\n",
+    "            if os.path.isfile(os.getcwdu() + \"\\\\\" +  testFileName)!=True:\n",
+    "                np.savetxt(testFileName, numpyArray, delimiter=\",\")\n",
+    "                break\n",
+    "\n",
+    "    else:\n",
+    "        np.savetxt(filename + \".csv\", numpyArray, delimiter=\",\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Main Programm\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "import os           # for iteration throug directories\n",
+    "import pandas as pd # for Series and DataFrames\n",
+    "import cv2          # for OpenCV \n",
+    "import cv           # for OpenCV\n",
+    "import datetime     # for TimeStamp in CSVFile\n",
+    "from scipy.cluster.vq import * # for Clustering http://docs.scipy.org/doc/scipy/reference/cluster.vq.html\n",
+    "import numpy as np  # for arrays\n",
+    "import time       # for time calculations\n",
+    "from termcolor import colored #to color Warnings - PACKAGE MIGHT NOT BE AVAILABLE"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Extract images path from DataBase "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(9144L,)\n",
+      "Time to extract all images: 26.3950002193\n"
+     ]
+    }
+   ],
+   "source": [
+    "start = time.time()\n",
+    "\n",
+    "# Determine the Database to extract features\n",
+    "path ='D:\\Caltech'\n",
+    "\n",
+    "# get dictionary to link classLabels Text to Integers\n",
+    "sClassLabels = getClassLabels(path)\n",
+    "\n",
+    "# Get all path from all images inclusive classLabel as Integer\n",
+    "dfImages = imgCrawl(path, sClassLabels)\n",
+    "print dfImages.classLabel.shape\n",
+    "\n",
+    "end = time.time()\n",
+    "print \"Time to extract all images: \" + str(end - start)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# CSV Output\n",
+    "fileNameClassLabels = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Caltech-ClassLabels\"\n",
+    "exportNumpyToCSV(dfImages.classLabel, fileNameClassLabels)\n",
+    "\n",
+    "fileNameClassLabels = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Caltech-ClassLabels-Description\"\n",
+    "exportToCSV(sClassLabels, fileNameClassLabels)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### RGB Color Histogram "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(9144L, 48L)\n",
+      "Time to calculate RGBColorHistogram: 19.3360002041\n"
+     ]
+    }
+   ],
+   "source": [
+    "start = time.time()\n",
+    "\n",
+    "# Calculate RGB Color Histogram wit 16 bins for each color -> feature length = 3 x 16 = 48\n",
+    "npRGBColorHistogram = calcRGBColorHisto(dfImages, 16)\n",
+    "print npRGBColorHistogram.shape\n",
+    "\n",
+    "end = time.time()\n",
+    "print \"Time to calculate RGBColorHistogram: \" + str(end - start)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# CSV Output\n",
+    "fileNameColorHis = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Caltech-Feature-RGBColorHistogram\"\n",
+    "exportNumpyToCSV(npRGBColorHistogram, fileNameColorHis)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### HSV Color Histogram "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[31mWARNING NORMELIZATION: sumHIST is zero (0)\u001b[0m\n",
+      "\u001b[31mimage: D:\\Caltech\\BACKGROUND_Google\\image_0031.jpg\n",
+      "\u001b[0m\n",
+      "\u001b[31mWARNING NORMELIZATION: sumHIST is zero (0)\u001b[0m\n",
+      "\u001b[31mimage: D:\\Caltech\\menorah\\image_0022.jpg\n",
+      "\u001b[0m\n",
+      "(9144L, 14L)\n",
+      "Time to calculate HSVColorHistogram: 22.4220001698\n"
+     ]
+    }
+   ],
+   "source": [
+    "start = time.time()\n",
+    "\n",
+    "# Calculate HSV Color Histogramm -> feature length = 8+3+3 = 14\n",
+    "\n",
+    "h_bins = 8 \n",
+    "s_bins = 3\n",
+    "v_bins = 3\n",
+    "histSize = [ h_bins, s_bins, v_bins ]\n",
+    "\n",
+    "npHSVColorHistogram = calcHSVColorHisto(dfImages, histSize)\n",
+    "print npHSVColorHistogram.shape\n",
+    "\n",
+    "end = time.time()\n",
+    "print \"Time to calculate HSVColorHistogram: \" + str(end - start)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# CSV Output\n",
+    "fileNameColorHis = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Caltech-Feature-HSVColorHistogram\"\n",
+    "exportNumpyToCSV(npHSVColorHistogram, fileNameColorHis)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### SURF Histogram "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "start = time.time()\n",
+    "\n",
+    "# Calculate Surf Histogramm with K=100 Cluster\n",
+    "npSurfHistogram = calcSurfHisto(dfImages, 100)\n",
+    "print npSurfHistogram.shape\n",
+    "\n",
+    "end = time.time()\n",
+    "print \"Time to calculate SurfHistogram: \" + str(end - start)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# CSV Output\n",
+    "fileNameSurfHis = datetime.datetime.now().strftime(\"%Y_%m_%d\") + \"-Caltech-Feature-SurfHistogram\"\n",
+    "exportNumpyToCSV(npSurfHistogram, fileNameSurfHis)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/ipynb/HOG Extraction.ipynb b/ipynb/HOG Extraction.ipynb
index b3caf051559839802e6b0b5dca39cfb11e515e50..baafb339c6d77094303a9b186d45f1009c3501f0 100644
--- a/ipynb/HOG Extraction.ipynb	
+++ b/ipynb/HOG Extraction.ipynb	
@@ -346,7 +346,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython2",
-   "version": "2.7.10"
+   "version": "2.7.11"
   }
  },
  "nbformat": 4,