Commit 300ba3d2 authored by bbauvin

try that

%% Cell type:markdown id: tags:
# Code to Extract ColorHistograms for Database
%% Cell type:markdown id: tags:
#### Author: Nikolas Hülsmann
#### Date: 2015-11-22
%% Cell type:markdown id: tags:
## Functions for Data Extraction
### Function to iterate through a given directory and return image paths and classLabels
%% Cell type:code id: tags:
``` python
def imgCrawl(path): # path to 'highest' folder
    rootdir = path
    df = pd.DataFrame()
    for subdir, dirs, files in os.walk(rootdir): # loop through subdirectories
        for file in files:
            pathOfFile = os.path.join(subdir, file) # path of file
            head, classLabel = os.path.split(os.path.split(pathOfFile)[0]) # use the directory name of the file as classLabel
            df = df.append({'classLabel': classLabel, 'pathOfFile': pathOfFile}, ignore_index=True)
    return df
```
%% Cell type:markdown id: tags:
### Function to determine Class-Labels with Integer representation
%% Cell type:code id: tags:
``` python
# function to determine class labels and return them as a Series
def getClassLabels(path):
    data = os.listdir(path) # listdir returns all subdirectories
    index = range(0, len(data))
    return pd.Series(data, index)
```
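%% Cell type:markdown id: tags:
For a hypothetical Caltech-style folder containing the class subdirectories `airplanes`, `ant` and `brain`, the returned Series maps integer indices to class names (`0 -> airplanes`, `1 -> ant`, `2 -> brain`); these integers are the class label representation used later on.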
%% Cell type:markdown id: tags:
### Function to calculate the ColorHistogram for given Images
%% Cell type:code id: tags:
``` python
#### Calculate ColorHistograms for all images
# path: path to the highest folder
# dfImages: DataFrame with paths to all images - use function imgCrawl
# sClassLabels: Series with class labels - use function getClassLabels
def calcColorHisto(path_, dfImages_, sClassLabels_):
    # Initialize function
    df = pd.DataFrame()
    path = path_
    npImages = dfImages_.values
    sClassLabels = sClassLabels_
    ## algo
    for images in npImages:
        image = cv2.imread(images[1])
        # image size for normalization
        height, width, channels = image.shape
        img_size = height * width
        # split into the color channels b, g, r
        chans = cv2.split(image)
        colors = ("b", "g", "r")
        features = []
        i = 1
        # loop over the image channels
        for chan in chans:
            # calculate the color histogram - 16 bins cf. paper (a test with 64 bins gave a similar score)
            hist = cv2.calcHist([chan], [0], None, [16], [0, 256])
            print i
            i = i + 1
            # get the raw values
            hist = hist[:, 0]
            # normalize to a distribution from 0 to 1 by calculating, for each color channel (red/blue/green):
            # (number of pixels in bin) / (pixel size of image)
            hist[:] = [x / img_size for x in hist]
            # normalize with MinMax from 0 to 1 -> feature scaling
            #cv2.normalize(hist, hist, 0, 1, cv2.NORM_MINMAX)
            features.extend(hist)
        # assign the integer label for the DataFrame
        classLabel = sClassLabels[sClassLabels == images[0]].index[0]
        # append features to df
        df = df.append({'classLabel': classLabel, 'ColHisto': features}, ignore_index=True)
    return df
```
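%% Cell type:markdown id: tags:
A quick sanity check of the (bin count) / (image size) normalization above, using a hypothetical toy image (not part of the pipeline): each colour channel's 16-bin histogram should sum to 1.
%% Cell type:code id: tags:
``` python
import numpy as np
import cv2

# hypothetical 4x4 BGR toy image with values spread over [0, 235]
toy = np.arange(48, dtype=np.uint8).reshape(4, 4, 3) * 5
chan = cv2.split(toy)[0]                                   # one colour channel
hist = cv2.calcHist([chan], [0], None, [16], [0, 256])[:, 0]
hist = hist / float(toy.shape[0] * toy.shape[1])           # divide by pixel count
print(hist.sum())                                          # expected: 1.0
```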
%% Cell type:markdown id: tags:
### Function to export calculated Data to csv
%% Cell type:code id: tags:
``` python
#### Export ColorHistogram to csv
def exportToCSV(pandasSorDF, filename):
    #filename = datetime.datetime.now().strftime("%Y_%m_%d") + "-ColorHistogram"
    path = os.getcwdu() + "\\" + filename
    if os.path.isfile(path + ".csv"):
        for i in range(1, 20):
            testFileName = filename + "-" + str(i) + ".csv"
            if not os.path.isfile(os.getcwdu() + "\\" + testFileName):
                pandasSorDF.to_csv(testFileName)
                break
    else:
        pandasSorDF.to_csv(filename + ".csv")
```
%% Cell type:markdown id: tags:
## Main Program
%% Cell type:code id: tags:
``` python
# Imports
import os # for iterating through directories
import pandas as pd # for Series and DataFrames
import cv2 # for OpenCV
import datetime # for TimeStamp in CSVFile
```
%% Cell type:code id: tags:
``` python
#### Calculate Color Histogram
path = 'D:\CaltechMini'
dfImages = imgCrawl(path)
sClassLabels = getClassLabels(path)
dfColorHistogram = calcColorHisto(path, dfImages, sClassLabels)
fileNameColorHis = datetime.datetime.now().strftime("%Y_%m_%d") + "-Features" + "-ColorHistogram"
exportToCSV(dfColorHistogram, fileNameColorHis)
fileNameClassLabels = datetime.datetime.now().strftime("%Y_%m_%d") + "-ClassLabels" + "-Caltech"
exportToCSV(sClassLabels, fileNameClassLabels)
```
%% Output
1
2
3
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-10-543bf34003a8> in <module>()
3 dfImages = imgCrawl(path)
4 sClassLabels = getClassLabels(path)
----> 5 dfColorHistogram = calcColorHisto(path, dfImages, sClassLabels)
6
7 fileNameColorHis = datetime.datetime.now().strftime("%Y_%m_%d") + "-Features" +"-ColorHistogram"
<ipython-input-9-3a59b70a518a> in calcColorHisto(path_, dfImages_, sClassLabels_)
47
48 # assign integer label for dataframe
---> 49 classLabel = sClassLabels[sClassLabels == images[0]].index[0]
50
51 # append features to df
D:\Programme\Anaconda\lib\site-packages\pandas\core\index.pyc in __getitem__(self, key)
1074
1075 if np.isscalar(key):
-> 1076 return getitem(key)
1077
1078 if isinstance(key, slice):
IndexError: index 0 is out of bounds for axis 0 with size 0
%% Cell type:markdown id: tags:
# Code to Extract ColorHistograms for Database
%% Cell type:markdown id: tags:
#### Author: Nikolas Hülsmann
#### Date: 2015-11-22
%% Cell type:markdown id: tags:
## Functions for Data Extraction
### Function to iterate through a given directory and return image paths and classLabels
%% Cell type:code id: tags:
``` python
def imgCrawl(path): # path to 'highest' folder
    rootdir = path
    df = pd.DataFrame()
    for subdir, dirs, files in os.walk(rootdir): # loop through subdirectories
        for file in files:
            pathOfFile = os.path.join(subdir, file) # path of file
            head, classLabel = os.path.split(os.path.split(pathOfFile)[0]) # use the directory name of the file as classLabel
            df = df.append({'classLabel': classLabel, 'pathOfFile': pathOfFile}, ignore_index=True)
    return df
```
%% Cell type:markdown id: tags:
### Function to determine Class-Labels with Integer representation
%% Cell type:code id: tags:
``` python
# function to determine class labels and return them as a Series
def getClassLabels(path):
    data = os.listdir(path) # listdir returns all subdirectories
    index = range(0, len(data))
    return pd.Series(data, index)
```
%% Cell type:markdown id: tags:
### Function to calculate the ColorHistogram for given Images
%% Cell type:code id: tags:
``` python
#### Calculate ColorHistograms for all images
# path: path to the highest folder
# dfImages: DataFrame with paths to all images - use function imgCrawl
# sClassLabels: Series with class labels - use function getClassLabels
def calcColorHisto(path_, dfImages_, sClassLabels_):
    # Initialize function
    df = pd.DataFrame()
    path = path_
    npImages = dfImages_.values
    sClassLabels = sClassLabels_
    ## algo
    for images in npImages:
        image = cv2.imread(images[1])
        chans = cv2.split(image) # split into the color channels b, g, r
        colors = ("b", "g", "r")
        features = []
        # loop over the image channels
        for (chan, color) in zip(chans, colors):
            # calculate the color histogram - 16 bins cf. paper
            hist = cv2.calcHist([chan], [0], None, [16], [0, 256])
            # get the raw values
            hist = hist[:, 0]
            # normalize with MinMax from 0 to 1 -> feature scaling
            cv2.normalize(hist, hist, 0, 1, cv2.NORM_MINMAX)
            features.extend(hist)
        # assign the integer label for the DataFrame
        classLabel = sClassLabels[sClassLabels == images[0]].index[0]
        # append features to df
        df = df.append({'classLabel': classLabel, 'ColHisto': features}, ignore_index=True)
    return df
```
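%% Cell type:markdown id: tags:
For reference, a toy illustration (hypothetical values, not taken from the dataset) of the MinMax feature scaling used above: the smallest bin maps to 0 and the largest to 1.
%% Cell type:code id: tags:
``` python
import numpy as np
import cv2

hist = np.array([4., 10., 2., 8.], dtype=np.float32)   # hypothetical raw bin counts
cv2.normalize(hist, hist, 0, 1, cv2.NORM_MINMAX)        # in-place MinMax scaling
print(hist)                                             # expected: [0.25 1.  0.  0.75]
```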
%% Cell type:markdown id: tags:
### Function to export calculated Data to csv
%% Cell type:code id: tags:
``` python
#### Export ColorHistogram to csv
def exportToCSV(pandasSorDF, filename):
    #filename = datetime.datetime.now().strftime("%Y_%m_%d") + "-ColorHistogram"
    path = os.getcwdu() + "\\" + filename
    if os.path.isfile(path + ".csv"):
        for i in range(1, 20):
            testFileName = filename + "-" + str(i) + ".csv"
            if not os.path.isfile(os.getcwdu() + "\\" + testFileName):
                pandasSorDF.to_csv(testFileName)
                break
    else:
        pandasSorDF.to_csv(filename + ".csv")
```
%% Cell type:markdown id: tags:
## Main Program
%% Cell type:code id: tags:
``` python
# Imports
import os # for iterating through directories
import pandas as pd # for Series and DataFrames
import cv2 # for OpenCV
import datetime # for TimeStamp in CSVFile
```
%% Cell type:code id: tags:
``` python
#### Calculate Color Histogram
path = 'D:\Caltech'
dfImages = imgCrawl(path)
sClassLabels = getClassLabels(path)
dfColorHistogram = calcColorHisto(path, dfImages, sClassLabels)
fileNameColorHis = datetime.datetime.now().strftime("%Y_%m_%d") + "-Features" + "-ColorHistogram"
exportToCSV(dfColorHistogram, fileNameColorHis)
fileNameClassLabels = datetime.datetime.now().strftime("%Y_%m_%d") + "-ClassLabels" + "-Caltech"
exportToCSV(sClassLabels, fileNameClassLabels)
```
# coding: utf-8
import os                       # for iterating through directories
import pandas as pd             # for Series and DataFrames
import cv2                      # for OpenCV
import datetime                 # for the timestamp in the CSV file
from scipy.cluster.vq import *  # for clustering, cf. http://docs.scipy.org/doc/scipy/reference/cluster.vq.html
import numpy as np              # for arrays
import time                     # for time measurements
# # Code to Extract ColorHistograms for Database
# #### Author: Nikolas Hülsmann
# #### Date: 2015-11-22
# ## Functions for Data Extraction
#
# ### Function to iterate through a given directory and return image paths and classLabels
# In[31]:
def imgCrawl(path, sClassLabels): # path to 'highest' folder
    rootdir = path
    df = pd.DataFrame()
    for subdir, dirs, files in os.walk(rootdir): # loop through subdirectories
        for file in files:
            pathOfFile = os.path.join(subdir, file) # path of file
            head, classLabel = os.path.split(os.path.split(pathOfFile)[0]) # use the directory name of the file as classLabel
            # assign the integer label for the DataFrame
            classLabel = sClassLabels[sClassLabels == classLabel].index[0]
            df = df.append({'classLabel': classLabel, 'pathOfFile': pathOfFile}, ignore_index=True)
    return df
# ### Function to determine Class-Labels with Integer representation
# In[32]:
# function to determine class labels and return them as a Series
def getClassLabels(path):
    data = os.listdir(path) # listdir returns all subdirectories
    index = range(0, len(data))
    return pd.Series(data, index)
# ### Function to calculate the ColorHistogram for given Images
# In[33]:
#### Calculate ColorHistograms for all images
### Points to improve:
# - use the HSV color space
# - change function: parameter for the number of ColorHistogram bins (feature length)
# dfImages: DataFrame with paths to all images - use function imgCrawl
# numberOfBins_: number of histogram bins
def calcColorHisto(dfImages_, numberOfBins_):
    # Initialize function
    df = pd.DataFrame()
    npImages = dfImages_.values
    numberOfBins = numberOfBins_
    npColorHist = np.zeros((len(npImages), numberOfBins * 3), "float32")
    i = 0
    ## algo
    for images in npImages:
        image = cv2.imread(images[1])
        # image size for normalization
        height, width, channels = image.shape
        img_size = height * width
        # split into the color channels b, g, r
        chans = cv2.split(image)
        colors = ("b", "g", "r")
        histogram = []
        ########### Feature Color Histogram (cf. http://docs.opencv.org/2.4/doc/tutorials/imgproc/histograms/histogram_calculation/histogram_calculation.html)
        # loop over the image channels
        for (chan, color) in zip(chans, colors):
            # calculate the color histogram - 16 bins cf. paper (a test with 64 bins gave a similar score)
            # separates the intensity of each color into numberOfBins bins of equal size, e.g. for 16 bins: 0-15, 16-31, ..., 240-255
            hist = cv2.calcHist([chan], [0], None, [numberOfBins], [0, 256])
            # get the raw values
            hist = hist[:, 0]
            # normalize to a distribution from 0 to 1 by calculating, for each color channel (red/blue/green):
            # (number of pixels in bin) / (pixel size of image)
            #hist[:] = [x / img_size for x in hist]
            hist[:] = [x / sum(hist) for x in hist]
            # normalize with MinMax from 0 to 1 -> feature scaling
            #cv2.normalize(hist, hist, 0, 1, cv2.NORM_MINMAX)
            histogram.extend(hist)
        # store the feature vector for this image
        npColorHist[i] = histogram
        i = i + 1
        #df = df.append({'ColHisto': features_colHist}, ignore_index=True)
    return npColorHist
# ### Function to calculate Surf Histogram
# In[34]:
################# FEATURE SURF (cf. http://docs.opencv.org/3.0-beta/doc/py_tutorials/py_feature2d/py_surf_intro/py_surf_intro.html#surf)
# API cf. http://docs.opencv.org/2.4/modules/nonfree/doc/feature_detection.html
#### Calculate a histogram of SURF descriptors with a Bag-of-Words approach for all images
### Points to improve:
# - use a spatial histogram: http://www.di.ens.fr/willow/events/cvml2011/materials/practical-classification/
# - change function: parameter for the number of K clusters / feature length (with regard to overfitting)
# dfImages: DataFrame with paths to all images - use function imgCrawl
# k: number of K-means clusters -> length of the feature vector
def calcSurfHisto(dfImages_, k_):
    # Initialize function
    df = pd.DataFrame()
    npImages = dfImages_.values
    k = k_
    # list where all the descriptors are stored
    des_list = []
    # SURF detector/descriptor from the OpenCV 2.4 nonfree module
    sift = cv2.SURF()
    #### Feature detection and description (SURF):
    # Detect (localize) the keypoints (points of interest) in each image - like SIFT, SURF uses e.g. corners
    # Pro: SIFT/SURF are scale and rotation invariant!
    for images in npImages:
        # read image
        image = cv2.imread(images[1])
        # detect keypoints (kp) and calculate the descriptors (des) with one function call
        # each image has a different number of kp, but each kp has a descriptor of fixed length (128)
        kp, des = sift.detectAndCompute(image, None)
        des_list.append(des)
    # stack all the descriptors vertically in a numpy array
    descriptors = des_list[0]
    for descriptor in des_list[1:]:
        descriptors = np.vstack((descriptors, descriptor))
    #### Bag-of-Words approach
    ### 1. Step: use K-means clustering to create the dictionary/vocabulary/codebook:
    # Encoding is the quantization of the image kp/des that constitute the image to be classified.
    # Basic encoding schemes work by first running K-means on the set of all des collected
    # across multiple images.
    # This builds what is known as a dictionary/vocabulary/codebook, represented by the centroids obtained from the clustering.
    # Perform K-means clustering -> creates the words from all descriptors -> this is the (dic) dictionary/vocabulary/codebook
    # k: number of different clusters to build! Results in a feature of length k
    dic, variance = kmeans(descriptors, k, 1)
    ### 2. Step: encoding/coding/vector quantization (vq) to assign each descriptor the closest "visual word" from the dictionary:
    # At the end of this process you end up with k representative "visual words" (the centroid of each cluster after
    # K-means ends) for your image descriptors. These "visual words" represent what is usually understood as your
    # visual dictionary. Once you have these visual words, encoding is the process of assigning
    # each descriptor within your image the "visual word" (nearest neighbor) in the dictionary.
    npSurfHist = np.zeros((len(npImages), k), "float32")
    for i in xrange(len(npImages)):
        # vq: (encoding) assign words from the dictionary to each descriptor
        words, distance = vq(des_list[i], dic)
        ### 3. Step: pooling - calculate a histogram for each image
        # Pooling refers to the process of representing an image as a "bag of words".
        # The word "bag" here is meant to convey that once you have encoded each descriptor with a word (a number between 1 and k),
        # you build a new representation (a bag) that discards the spatial relationship between the words that
        # constitute your image.
        # This representation is often a histogram, or a collection of spatially adjacent histograms of the descriptors
        # (i.e. histograms of values 1 to k), that together form your image. "Pooling" is thus the process of
        # building a histogram of words (i.e. pooling ~ "sampling" words from the image to build a probability
        # mass function of words).
        # To clarify, the purpose of pooling is twofold:
        # By building a feature vector that is a histogram of words (as opposed to putting the full "sentence of words"
        # in the feature vector), your descriptor will be invariant to changes in "the ordering of words".
        # In computer vision this translates into invariance with respect to rotations and distortions of the image
        # and object, which is a desirable thing to have.
        # If the dictionary is small compared to the length of the sentence, a histogram of words has fewer dimensions
        # than the original vector. Fewer dimensions make learning (training) much easier.
        # count the occurrences of each word (w) in image (i) to build the histogram
        for w in words:
            npSurfHist[i][w] += 1
        #### 4. Step: normalization of the feature vector (can be changed to a distribution like ColorHisto)
        # frequency divided by the total number of words
        summe = sum(npSurfHist[i])
        for x in range(0, k):
            #npSurfHist[i][x] = npSurfHist[i][x]/k
            npSurfHist[i][x] = npSurfHist[i][x] / summe
    #stdSlr = StandardScaler().fit(npSurfHist)
    #npSurfHist = stdSlr.transform(npSurfHist)
    return npSurfHist
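# A minimal standalone sketch of the dictionary / encoding / pooling steps above, using random
# descriptors instead of real SURF output (purely illustrative; names and values are made up):
#
#   import numpy as np
#   from scipy.cluster.vq import kmeans, vq
#   fake_des = np.random.rand(500, 128).astype("float32")   # stand-in for stacked SURF descriptors
#   dic, variance = kmeans(fake_des, 5, 1)                   # 1. build a codebook with k=5 visual words
#   words, distance = vq(fake_des[:40], dic)                 # 2. assign each descriptor its nearest word
#   hist, _ = np.histogram(words, bins=range(6))             # 3. pool the words of one "image" into a histogram
#   hist = hist / float(hist.sum())                          # 4. normalize to frequencies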
# ### SIFT Experimental - use SURF
# In[35]:
# ########### Feature SIFT (Scale-invariant feature transform cf. http://docs.opencv.org/master/da/df5/tutorial_py_sift_intro.html#gsc.tab=0)
# # Api cf. http://docs.opencv.org/2.4/modules/nonfree/doc/feature_detection.html
# import cv2
# import numpy as np
# img = cv2.imread('../../03-jeux-de-donnees/101_ObjectCategories/airplanes/image_0306.jpg')
# gray= cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
# sift = cv2.SIFT(nfeatures=100)
# #sift = cv2.xfeatures2d.SIFT_create()
# # Detector which detects the Keypoints in the Image
# #kp = sift.detect(gray,None)
# # Just a visualization of the Keypoints in the Image
# #img=cv2.drawKeypoints(gray,kp)
# #cv2.imwrite('D:\Sift-test\sift_keypoints.jpg',img)
# # Another visualization with FLAG: draw a circle with size of keypoint and it will even show its orientation
# #img=cv2.drawKeypoints(gray,kp,flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
# #cv2.imwrite('D:\Sift-test\sift_keypoints.jpg',img)
# # Method to compute the descriptors after the keypoints have already been detected
# #kp,des = sift.compute(gray,kp)
# #sift = cv2.xfeatures2d.SIFT_create()
# #sift = cv2.SIFT()
# # Method to detect keypoints (kp) and calculate the descriptors (des) with one function call
# kp, des = sift.detectAndCompute(gray,None)
# print (des.shape)
# ### Functions to export calculated Data to csv
# In[36]:
#### Export Features to csv
def exportToCSV(pandasSorDF, filename):
    #filename = datetime.datetime.now().strftime("%Y_%m_%d") + "-Feature"
    path = os.getcwdu() + "\\" + filename
    if os.path.isfile(path + ".csv"):
        for i in range(1, 20):
            testFileName = filename + "-" + str(i) + ".csv"
            if not os.path.isfile(os.getcwdu() + "\\" + testFileName):
                pandasSorDF.to_csv(testFileName)
                break
    else:
        pandasSorDF.to_csv(filename + ".csv")
# In[37]:
def exportNumpyToCSV(numpyArray, filename):
    #filename = datetime.datetime.now().strftime("%Y_%m_%d") + "-Feature"
    path = os.getcwdu() + "\\" + filename
    if os.path.isfile(path + ".csv"):
        for i in range(1, 20):
            testFileName = filename + "-" + str(i) + ".csv"
            if not os.path.isfile(os.getcwdu() + "\\" + testFileName):
                np.savetxt(testFileName, numpyArray, delimiter=",")
                break
    else:
        np.savetxt(filename + ".csv", numpyArray, delimiter=",")
# ## Main Program
#
# In[38]:
# # Imports
# import os # for iterating through directories
# import pandas as pd # for Series and DataFrames
# import cv2 # for OpenCV
# import datetime # for TimeStamp in CSVFile
# from scipy.cluster.vq import * # for Clustering http://docs.scipy.org/doc/scipy/reference/cluster.vq.html
# import numpy as np # for arrays
# import time # for time calculations
# # In[39]:
# start = time.time()
# # Determine the Database to extract features
# path ='../../03-jeux-de-donnees/101_ObjectCategories'
# # get dictionary to link classLabels Text to Integers
# sClassLabels = getClassLabels(path)
# # Get all path from all images inclusive classLabel as Integer
# dfImages = imgCrawl(path, sClassLabels)
# print dfImages.classLabel.shape
# fileNameClassLabels = datetime.datetime.now().strftime("%Y_%m_%d") + "-Caltech-ClassLabels"
# exportNumpyToCSV(dfImages.classLabel, fileNameClassLabels)
# fileNameClassLabels = datetime.datetime.now().strftime("%Y_%m_%d") + "-Caltech-ClassLabels-Description"
# #exportToCSV(sClassLabels, fileNameClassLabels)
# end = time.time()
# print "Time to extract all images: " + str(end - start)
# # In[ ]:
# start = time.time()
# # Calculate the Color Histogram with 16 bins for each color -> feature length = 3 x 16 = 48
# npColorHistogram = calcColorHisto(dfImages, 16)
# print npColorHistogram.shape
# fileNameColorHis = datetime.datetime.now().strftime("%Y_%m_%d") + "-Caltech-Feature-ColorHistogram"
# #exportNumpyToCSV(npColorHistogram, fileNameColorHis)
# end = time.time()
# print "Time to calculate ColorHistogram: " + str(end - start)
# # In[ ]:
# start = time.time()
# # Calculate the SURF Histogram with K clusters (here K=5)
# npSurfHistogram = calcSurfHisto(dfImages, 5)
# print npSurfHistogram.shape
# fileNameSurfHis = datetime.datetime.now().strftime("%Y_%m_%d") + "-Caltech-Feature-SurfHistogram"
# #exportNumpyToCSV(npSurfHistogram, fileNameSurfHis)
# end = time.time()
# print "Time to calculate SurfHistogram: " + str(end - start)
# Imports
import os # for iterating through directories
import pandas as pd # for Series and DataFrames
import cv2 # for OpenCV
import datetime # for TimeStamp in CSVFile
from scipy.cluster.vq import * # for Clustering http://docs.scipy.org/doc/scipy/reference/cluster.vq.html
import numpy as np # for arrays
import time # for time calculations
from feature_extraction_try import imgCrawl, getClassLabels
# in: npImages, CELL_DIMENSION
# In order to calculate HOG, we will use a bag-of-words approach: cf. the SURF function, which is well documented.
def imageSequencing(npImages, CELL_DIMENSION):
    blocksList = []
    for i in range(1):
        print npImages[i][1]
        image = cv2.imread(npImages[i][1])
        cv2.imshow("image", image)
        resizedImage = reSize(image, CELL_DIMENSION)
        height, width, channels = resizedImage.shape
        # cut the resized image into CELL_DIMENSION x CELL_DIMENSION blocks
        blocksList.append(np.array(
            [resizedImage[row * CELL_DIMENSION:(row + 1) * CELL_DIMENSION - 1,
                          col * CELL_DIMENSION:(col + 1) * CELL_DIMENSION - 1, :]
             for col in range(width / CELL_DIMENSION)
             for row in range(height / CELL_DIMENSION)]))
    return np.array(blocksList)
def reSize(image, CELL_DIMENSION):
    height, width, channels = image.shape
    if height % CELL_DIMENSION == 0 and width % CELL_DIMENSION == 0:
        resizedImage = image
    elif width % CELL_DIMENSION == 0:
        missingPixels = CELL_DIMENSION - height % CELL_DIMENSION
        resizedImage = cv2.copyMakeBorder(image, 0, missingPixels, 0, 0, cv2.BORDER_REPLICATE)
    elif height % CELL_DIMENSION == 0:
        missingPixels = CELL_DIMENSION - width % CELL_DIMENSION
        resizedImage = cv2.copyMakeBorder(image, 0, 0, 0, missingPixels, cv2.BORDER_REPLICATE)
    else:
        missingWidthPixels = CELL_DIMENSION - width % CELL_DIMENSION
        missingHeightPixels = CELL_DIMENSION - height % CELL_DIMENSION
        resizedImage = cv2.copyMakeBorder(image, 0, missingHeightPixels, 0, missingWidthPixels, cv2.BORDER_REPLICATE)
    # height, width, channels = resizedImage.shape
    # if height % CELL_DIMENSION == 0 and width % CELL_DIMENSION == 0:
    #     print ("My job has been done")
    return resizedImage
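# Quick illustrative check of reSize (hypothetical sizes, not executed by this script):
# a 203x101 image padded with CELL_DIMENSION = 5 should come back as 205x105, so it splits
# evenly into 5x5 cells.
#
#   dummy = np.zeros((203, 101, 3), np.uint8)
#   padded = reSize(dummy, 5)
#   print padded.shape   # expected: (205, 105, 3)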
start = time.time()
path ='../../03-jeux-de-donnees/101_ObjectCategories'
print "Fetching Images in " + path
# get dictionary to link classLabels Text to Integers
sClassLabels = getClassLabels(path)
# Get all path from all images inclusive classLabel as Integer
dfImages = imgCrawl(path, sClassLabels)
npImages = dfImages.values
middle = time.time()
print "Extracted images in " + str(middle-start)
print "Sequencing Images ..."
sequencedCorpus = imageSequencing(npImages, 5)
end = time.time()
print "Sequenced images in " + str(end-middle)
print sequencedCorpus.shape
cv2.imshow("cell", sequencedCorpus[0][0])
# def even(difference):
# return not(difference % 2)
# def findmaxDim(npImages):
# max_height = 0
# max_width = 0
# for npImage in npImages:
# height, width, channels = cv2.imread(npImage[1]).shape
# if height > max_height:
# max_height=height
# if width > max_width:
# max_width=width
# return [max_height, max_width]
# def resizeImage(image, height, width):
# ratio = float(8000000)/(width*height)
# smallImage = cv2.resize(image, (0,0), fx=ratio, fy=ratio)
# return smallImage
# def enlarge(image, maxDimension, color):
# height, width, channels = image.shape
# [height_difference, width_difference] = np.array(maxDimension) - np.array([height, width])
# print(height_difference, width_difference)
# if even(height_difference) and even(width_difference):
# treatedImage = cv2.copyMakeBorder(image, height_difference/2, height_difference/2, width_difference/2, width_difference/2, cv2.BORDER_CONSTANT, value=color)
# elif even(height_difference):
# treatedImage = cv2.copyMakeBorder(image, height_difference/2, height_difference/2, width_difference/2+1, width_difference/2, cv2.BORDER_CONSTANT, value=color)
# elif even(width_difference):
# treatedImage = cv2.copyMakeBorder(image, height_difference/2+1, height_difference/2, width_difference/2, width_difference/2, cv2.BORDER_CONSTANT, value=color)
# else:
# treatedImage = cv2.copyMakeBorder(image, height_difference/2+1, height_difference/2, width_difference/2+1, width_difference/2, cv2.BORDER_CONSTANT, value=color)
# return treatedImage
# def calcHog(npImages, color, maxDimension):
# list_hog = []
# hog = cv2.HOGDescriptor()
# # poulet = preTreat(cv2.imread(npImages[0][1]), maxDimension, color)
# for npImage in npImages:
# image = cv2.imread(npImage[1])
# height, width, channels = image.shape
# if height * width > 8000000:
# g = hog.compute(resizeImage(image, height, width))
# else:
# g = hog.compute(enlarge(image, maxDimension, color))
# print g.shape
# # list_hog = [hog.compute(cv2.imread(npImage[1])) for npImage in npImages]
# return list_hog
# color=[0,0,0]
# path ='../../03-jeux-de-donnees/101_ObjectCategories'
# # get dictionary to link classLabels Text to Integers
# sClassLabels = getClassLabels(path)
# # Get all path from all images inclusive classLabel as Integer
# dfImages = imgCrawl(path, sClassLabels)
# npImages = dfImages.values
# maxDimension = findmaxDim(npImages)
# list_hog = calcHog(npImages, color, maxDimension)
# print len(list_hog)
import os # for iterating through directories
import pandas as pd # for Series and DataFrames
import cv2 # for OpenCV
import datetime # for TimeStamp in CSVFile
from scipy.cluster.vq import * # for Clustering http://docs.scipy.org/doc/scipy/reference/cluster.vq.html
import numpy as np # for arrays
import time # for time calculations
from feature_extraction_try import imgCrawl, getClassLabels
def findmaxDim(npImages):
    max_height = 0
    max_width = 0
    heights = []
    widths = []
    totals = []
    count = 0
    poulet = 0
    for npImage in npImages:
        height, width, channels = cv2.imread(npImage[1]).shape
        heights.append(height)
        widths.append(width)
        totals.append(width * height)
        if width * height > 500000:
            count += 1
        if not(abs(height - 200) < 200):
            poulet += 1
            print (npImage[1])
        if width > max_width:
            max_width = width
    print float(poulet) * 100 / len(heights)
    # print float(count)*100/len(heights)
    return heights, widths, totals
path ='../../03-jeux-de-donnees/101_ObjectCategories'
# get dictionary to link classLabels Text to Integers
sClassLabels = getClassLabels(path)
# Get all path from all images inclusive classLabel as Integer
dfImages = imgCrawl(path, sClassLabels)
npImages = dfImages.values
heights, widths, totals = findmaxDim(npImages)
heights_ = sorted(list(set(heights)), reverse=True)
widths_ = sorted(list(set(widths)), reverse=True)
totals_ = sorted(totals, reverse=True)
# print (totals_[len(totals_)/2])
# print ("height", sum(heights)/len(heights), "width", sum(widths)/len(widths) )
# print heights_
# print("poulmmet")
# print widths_
res 0 → 100644
('height', 244, 'width', 301)
[3999, 2955, 1406, 1280, 1200, 1154, 1071, 1024, 974, 960, 927, 919, 889, 870, 854, 832, 821, 817, 781, 780, 768, 764, 750, 742, 740, 723, 720, 709, 700, 689, 682, 663, 659, 656, 655, 650, 648, 630, 629, 624, 622, 617, 612, 600, 596, 594, 592, 585, 581, 576, 567, 566, 565, 560, 539, 534, 529, 528, 526, 522, 514, 510, 509, 504, 502, 500, 494, 487, 480, 477, 473, 468, 455, 452, 451, 450, 448, 445, 444, 437, 435, 430, 426, 424, 420, 418, 415, 414, 412, 409, 407, 406, 403, 400, 396, 395, 394, 393, 392, 390, 389, 388, 386, 385, 384, 382, 381, 380, 379, 378, 377, 376, 375, 374, 373, 372, 371, 370, 369, 367, 366, 365, 364, 363, 362, 361, 360, 359, 358, 356, 355, 354, 353, 352, 351, 350, 349, 348, 347, 346, 345, 344, 343, 342, 341, 340, 339, 338, 337, 336, 335, 334, 333, 332, 331, 330, 329, 328, 327, 326, 325, 324, 323, 322, 321, 320, 319, 318, 317, 316, 315, 314, 313, 312, 311, 310, 309, 308, 307, 306, 305, 304, 303, 302, 301, 300, 299, 298, 297, 296, 295, 294, 293, 292, 291, 290, 289, 288, 287, 286, 285, 284, 283, 282, 281, 280, 279, 278, 277, 276, 275, 274, 273, 272, 271, 270, 269, 268, 267, 266, 265, 264, 263, 262, 261, 260, 259, 258, 257, 256, 255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240, 239, 238, 237, 236, 235, 234, 233, 232, 231, 230, 229, 228, 227, 226, 225, 224, 223, 222, 221, 220, 219, 218, 217, 216, 215, 214, 213, 212, 211, 210, 209, 208, 207, 206, 205, 204, 203, 202, 201, 200, 199, 198, 197, 196, 195, 194, 193, 192, 191, 190, 189, 188, 187, 186, 185, 184, 183, 182, 181, 180, 179, 178, 177, 176, 175, 174, 173, 172, 171, 170, 169, 168, 167, 166, 165, 164, 163, 162, 161, 160, 159, 158, 157, 156, 155, 154, 153, 152, 151, 150, 149, 148, 147, 146, 145, 144, 143, 142, 141, 140, 139, 138, 137, 136, 135, 134, 133, 132, 131, 130, 129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 108, 107, 106, 104, 103, 102, 101, 97, 92]
poulmmet
[3481, 2799, 1792, 1312, 1280, 1221, 1152, 1132, 1024, 989, 969, 960, 940, 911, 909, 889, 857, 842, 832, 800, 792, 768, 747, 727, 726, 721, 720, 718, 708, 705, 700, 695, 688, 685, 679, 672, 656, 655, 652, 649, 648, 642, 640, 633, 631, 628, 623, 619, 616, 615, 613, 610, 606, 604, 600, 598, 595, 594, 590, 589, 588, 583, 582, 578, 576, 574, 569, 567, 566, 565, 563, 561, 560, 559, 558, 556, 555, 552, 550, 549, 548, 546, 545, 544, 542, 540, 538, 536, 535, 534, 532, 530, 528, 526, 525, 524, 522, 520, 519, 518, 517, 516, 515, 513, 511, 510, 509, 508, 507, 506, 504, 503, 502, 501, 500, 499, 498, 497, 495, 494, 492, 490, 489, 487, 485, 484, 482, 481, 479, 477, 476, 475, 474, 473, 472, 471, 470, 469, 468, 467, 466, 465, 464, 463, 462, 461, 460, 459, 458, 457, 456, 455, 454, 453, 452, 451, 450, 449, 448, 447, 445, 444, 443, 441, 440, 439, 437, 436, 435, 434, 433, 432, 431, 430, 429, 428, 427, 426, 425, 424, 423, 422, 421, 420, 419, 418, 417, 416, 415, 414, 413, 412, 411, 410, 409, 408, 407, 406, 405, 404, 403, 402, 401, 400, 399, 398, 397, 396, 395, 394, 393, 392, 391, 390, 388, 387, 382, 376, 371, 370, 368, 365, 362, 360, 358, 356, 351, 350, 347, 346, 340, 339, 338, 333, 330, 327, 324, 320, 319, 313, 309, 306, 305, 302, 300, 299, 298, 297, 296, 295, 294, 293, 292, 291, 290, 289, 288, 287, 286, 285, 284, 283, 282, 281, 280, 279, 278, 277, 276, 275, 274, 273, 272, 271, 270, 269, 268, 267, 266, 265, 264, 263, 262, 261, 260, 259, 258, 257, 256, 255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240, 239, 238, 237, 236, 235, 234, 233, 232, 231, 230, 229, 228, 227, 226, 225, 224, 223, 222, 221, 220, 219, 218, 217, 216, 215, 214, 213, 212, 211, 210, 209, 208, 207, 206, 205, 204, 203, 202, 201, 200, 199, 198, 197, 196, 195, 194, 193, 192, 191, 190, 189, 188, 187, 186, 185, 184, 183, 182, 181, 180, 179, 178, 177, 176, 175, 173, 172, 171, 170, 169, 168, 167, 166, 165, 164, 163, 162, 161, 160, 159, 158, 157, 155, 154, 153, 152, 151, 150, 149, 148, 147, 146, 145, 144, 143, 141, 140, 137, 136, 134, 131, 128, 124, 119, 114, 105, 80]