Commit cb7dcfc2 authored by Denis Arrivault

Doc cleaning + refactoring Splearn_array into SplearnArray + change Spectral._polulate_dictionnaries into Spectral.polulate_dictionnaries
parent a330c613
%% Cell type:code id: tags:
``` python
from splearn.datasets.base import load_data_sample
from splearn.tests.datasets.get_dataset_path import get_dataset_path
from splearn import Spectral, Automaton, Serializer
train_file = '3.pautomac_light.train'
data = load_data_sample(adr=get_dataset_path(train_file))
sp = Spectral()
sp.fit(X=data.data)
```
%% Output
Start Hankel matrix computation
End of Hankel matrix computation
Start Building Automaton from Hankel matrix
End of Automaton computation
Spectral(lcolumns=7, lrows=7, mode_quiet=False, partial=True, rank=5,
smooth_method='none', sparse=True, version='classic')
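%% Cell type:markdown id: tags:
With the estimator fitted, the usual prediction entry points are available. A minimal sketch mirroring the `Spectral` docstring shown in the diff below (the exact numbers depend on the sample):
%% Cell type:code id: tags:
``` python
# Weight assigned by the learned automaton to each word of the sample.
preds = sp.predict(data.data)
# Mean log-likelihood based loss, and the score, which is its opposite.
print(sp.loss(data.data, normalize=True), sp.score(data.data))
```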
%% Cell type:code id: tags:
``` python
Automaton.write(sp.automaton, train_file + ".json")
```
%% Cell type:code id: tags:
``` python
str1 = Serializer.data_to_json(sp.automaton)
```
%% Cell type:code id: tags:
``` python
A = Serializer.json_to_data(str1)
```
%% Cell type:code id: tags:
``` python
str2 = Serializer.data_to_json(A)
```
%% Cell type:code id: tags:
``` python
str1 == str2
```
%% Output
True
%% Cell type:code id: tags:
``` python
str1
```
%% Output
'{"automaton": {"nbL": 4, "nbS": 5, "initial": {"numpy.ndarray": {"values": [-0.0004934419970497512, 0.0030634697107912346, -0.044073932015580415, -0.1077770261654714, -0.0866391379316952], "dtype": "float64"}}, "final": {"numpy.ndarray": {"values": [0.07757136847945045, -0.024220294003132026, -0.4468125366321221, 0.627732084089759, -0.554674433356224], "dtype": "float64"}}, "transitions": [{"numpy.ndarray": {"values": [[0.04512120959511772, -0.24038969827844062, 0.34944999592135334, -0.2811680730534579, -0.21402523377497645], [0.0692580056243761, -0.30062293462829204, 0.20641375368520157, -0.14960814319756124, -0.5580573163749153], [0.02980115192176571, -0.13866480809160409, 0.18362212572805459, -0.20969545230657607, -0.14481622025561292], [0.005699344003198349, -0.023385825120201414, -0.06600665373981851, 0.10749935271466007, -0.15103654604159977], [-0.02008655193147911, 0.09026347555230492, -0.005525585655539262, -0.031355317090308935, 0.2432902242047721]], "dtype": "float64"}}, {"numpy.ndarray": {"values": [[0.0774477207917058, 0.09007073705762021, -0.3047220063293013, 0.2767624549859105, 0.20289396030628148], [-0.09902980483670844, -0.08061846818727973, 0.25853170692250554, -0.12086330214608881, -0.11085207725068251], [-0.061710792028537534, -0.06244151779954751, 0.12007654564862075, 0.0025063746277943564, -0.1567967473145572], [-0.002736973749965403, -0.009005721984277787, -0.00046003295909181354, -0.008550426472005344, -0.053754646789681754], [0.030987327588710728, 0.03972680066723246, -0.04997113350910248, 0.0035769411874962344, 0.1418257620585633]], "dtype": "float64"}}, {"numpy.ndarray": {"values": [[-0.06791915236220235, -0.11357937659088102, 0.37955392604054394, -0.21784979894046635, -0.22977695089938127], [0.11596642335411328, 0.14914956804629287, -0.13357508376686902, -0.008916063072034974, 0.3484153673774836], [0.011730817547426673, 0.019273800531955612, 0.0414265834586712, -0.035346588560982, 0.02316491010895583], [0.007328911075541707, 0.005536509132796312, -0.022456082950666856, 0.03611543477693187, -0.038514339001406585], [-0.010589894686551544, -0.010626616553723532, -0.000543105645661794, -0.025567476700160314, 0.04984888818929034]], "dtype": "float64"}}, {"numpy.ndarray": {"values": [[0.07276211427780357, -0.0157195576855797, 0.07428592814590385, -0.10369861539249735, 0.024753473688328077], [-0.05607105449779142, -0.08896207276035666, 0.27638225397521243, -0.2371125582838589, 0.07372294122306285], [-0.007391294007753122, -0.048741797963875705, -0.6291239733858526, 0.46816276521577677, 0.09251699239093385], [-0.007110224931878467, -0.05623317735898056, -0.36606658567620365, -0.013297798115225407, 0.6491033177492604], [0.002335515008556511, -0.021561151264484414, 0.09096243479437888, -0.38438823493062646, 0.6616477207948602]], "dtype": "float64"}}], "type": "classic"}}'
%% Cell type:code id: tags:
``` python
str2
```
%% Output
'{"automaton": {"nbL": 4, "nbS": 5, "initial": {"numpy.ndarray": {"values": [-0.0004934419970497512, 0.0030634697107912346, -0.044073932015580415, -0.1077770261654714, -0.0866391379316952], "dtype": "float64"}}, "final": {"numpy.ndarray": {"values": [0.07757136847945045, -0.024220294003132026, -0.4468125366321221, 0.627732084089759, -0.554674433356224], "dtype": "float64"}}, "transitions": [{"numpy.ndarray": {"values": [[0.04512120959511772, -0.24038969827844062, 0.34944999592135334, -0.2811680730534579, -0.21402523377497645], [0.0692580056243761, -0.30062293462829204, 0.20641375368520157, -0.14960814319756124, -0.5580573163749153], [0.02980115192176571, -0.13866480809160409, 0.18362212572805459, -0.20969545230657607, -0.14481622025561292], [0.005699344003198349, -0.023385825120201414, -0.06600665373981851, 0.10749935271466007, -0.15103654604159977], [-0.02008655193147911, 0.09026347555230492, -0.005525585655539262, -0.031355317090308935, 0.2432902242047721]], "dtype": "float64"}}, {"numpy.ndarray": {"values": [[0.0774477207917058, 0.09007073705762021, -0.3047220063293013, 0.2767624549859105, 0.20289396030628148], [-0.09902980483670844, -0.08061846818727973, 0.25853170692250554, -0.12086330214608881, -0.11085207725068251], [-0.061710792028537534, -0.06244151779954751, 0.12007654564862075, 0.0025063746277943564, -0.1567967473145572], [-0.002736973749965403, -0.009005721984277787, -0.00046003295909181354, -0.008550426472005344, -0.053754646789681754], [0.030987327588710728, 0.03972680066723246, -0.04997113350910248, 0.0035769411874962344, 0.1418257620585633]], "dtype": "float64"}}, {"numpy.ndarray": {"values": [[-0.06791915236220235, -0.11357937659088102, 0.37955392604054394, -0.21784979894046635, -0.22977695089938127], [0.11596642335411328, 0.14914956804629287, -0.13357508376686902, -0.008916063072034974, 0.3484153673774836], [0.011730817547426673, 0.019273800531955612, 0.0414265834586712, -0.035346588560982, 0.02316491010895583], [0.007328911075541707, 0.005536509132796312, -0.022456082950666856, 0.03611543477693187, -0.038514339001406585], [-0.010589894686551544, -0.010626616553723532, -0.000543105645661794, -0.025567476700160314, 0.04984888818929034]], "dtype": "float64"}}, {"numpy.ndarray": {"values": [[0.07276211427780357, -0.0157195576855797, 0.07428592814590385, -0.10369861539249735, 0.024753473688328077], [-0.05607105449779142, -0.08896207276035666, 0.27638225397521243, -0.2371125582838589, 0.07372294122306285], [-0.007391294007753122, -0.048741797963875705, -0.6291239733858526, 0.46816276521577677, 0.09251699239093385], [-0.007110224931878467, -0.05623317735898056, -0.36606658567620365, -0.013297798115225407, 0.6491033177492604], [0.002335515008556511, -0.021561151264484414, 0.09096243479437888, -0.38438823493062646, 0.6616477207948602]], "dtype": "float64"}}], "type": "classic"}}'
%% Cell type:code id: tags:
``` python
str3 = Serializer.data_to_yaml(sp.automaton)
```
%% Cell type:code id: tags:
``` python
print(str3)
```
%% Output
automaton:
final:
numpy.ndarray:
dtype: float64
values: [0.07757136847945045, -0.024220294003132026, -0.4468125366321221, 0.627732084089759,
-0.554674433356224]
initial:
numpy.ndarray:
dtype: float64
values: [-0.0004934419970497512, 0.0030634697107912346, -0.044073932015580415,
-0.1077770261654714, -0.0866391379316952]
nbL: 4
nbS: 5
transitions:
- numpy.ndarray:
dtype: float64
values:
- [0.04512120959511772, -0.24038969827844062, 0.34944999592135334, -0.2811680730534579,
-0.21402523377497645]
- [0.0692580056243761, -0.30062293462829204, 0.20641375368520157, -0.14960814319756124,
-0.5580573163749153]
- [0.02980115192176571, -0.13866480809160409, 0.18362212572805459, -0.20969545230657607,
-0.14481622025561292]
- [0.005699344003198349, -0.023385825120201414, -0.06600665373981851, 0.10749935271466007,
-0.15103654604159977]
- [-0.02008655193147911, 0.09026347555230492, -0.005525585655539262, -0.031355317090308935,
0.2432902242047721]
- numpy.ndarray:
dtype: float64
values:
- [0.0774477207917058, 0.09007073705762021, -0.3047220063293013, 0.2767624549859105,
0.20289396030628148]
- [-0.09902980483670844, -0.08061846818727973, 0.25853170692250554, -0.12086330214608881,
-0.11085207725068251]
- [-0.061710792028537534, -0.06244151779954751, 0.12007654564862075, 0.0025063746277943564,
-0.1567967473145572]
- [-0.002736973749965403, -0.009005721984277787, -0.00046003295909181354, -0.008550426472005344,
-0.053754646789681754]
- [0.030987327588710728, 0.03972680066723246, -0.04997113350910248, 0.0035769411874962344,
0.1418257620585633]
- numpy.ndarray:
dtype: float64
values:
- [-0.06791915236220235, -0.11357937659088102, 0.37955392604054394, -0.21784979894046635,
-0.22977695089938127]
- [0.11596642335411328, 0.14914956804629287, -0.13357508376686902, -0.008916063072034974,
0.3484153673774836]
- [0.011730817547426673, 0.019273800531955612, 0.0414265834586712, -0.035346588560982,
0.02316491010895583]
- [0.007328911075541707, 0.005536509132796312, -0.022456082950666856, 0.03611543477693187,
-0.038514339001406585]
- [-0.010589894686551544, -0.010626616553723532, -0.000543105645661794, -0.025567476700160314,
0.04984888818929034]
- numpy.ndarray:
dtype: float64
values:
- [0.07276211427780357, -0.0157195576855797, 0.07428592814590385, -0.10369861539249735,
0.024753473688328077]
- [-0.05607105449779142, -0.08896207276035666, 0.27638225397521243, -0.2371125582838589,
0.07372294122306285]
- [-0.007391294007753122, -0.048741797963875705, -0.6291239733858526, 0.46816276521577677,
0.09251699239093385]
- [-0.007110224931878467, -0.05623317735898056, -0.36606658567620365, -0.013297798115225407,
0.6491033177492604]
- [0.002335515008556511, -0.021561151264484414, 0.09096243479437888, -0.38438823493062646,
0.6616477207948602]
type: classic
%% Cell type:code id: tags:
``` python
Automaton.write(sp.automaton, train_file + ".json")
```
%% Cell type:code id: tags:
``` python
Ajs = Automaton.read(train_file + ".json")
```
%% Cell type:code id: tags:
``` python
Ajs.transitions
```
%% Output
[array([[ 0.04512121, -0.2403897 , 0.34945 , -0.28116807, -0.21402523],
[ 0.06925801, -0.30062293, 0.20641375, -0.14960814, -0.55805732],
[ 0.02980115, -0.13866481, 0.18362213, -0.20969545, -0.14481622],
[ 0.00569934, -0.02338583, -0.06600665, 0.10749935, -0.15103655],
[-0.02008655, 0.09026348, -0.00552559, -0.03135532, 0.24329022]]),
array([[ 0.07744772, 0.09007074, -0.30472201, 0.27676245, 0.20289396],
[-0.0990298 , -0.08061847, 0.25853171, -0.1208633 , -0.11085208],
[-0.06171079, -0.06244152, 0.12007655, 0.00250637, -0.15679675],
[-0.00273697, -0.00900572, -0.00046003, -0.00855043, -0.05375465],
[ 0.03098733, 0.0397268 , -0.04997113, 0.00357694, 0.14182576]]),
array([[-0.06791915, -0.11357938, 0.37955393, -0.2178498 , -0.22977695],
[ 0.11596642, 0.14914957, -0.13357508, -0.00891606, 0.34841537],
[ 0.01173082, 0.0192738 , 0.04142658, -0.03534659, 0.02316491],
[ 0.00732891, 0.00553651, -0.02245608, 0.03611543, -0.03851434],
[-0.01058989, -0.01062662, -0.00054311, -0.02556748, 0.04984889]]),
array([[ 0.07276211, -0.01571956, 0.07428593, -0.10369862, 0.02475347],
[-0.05607105, -0.08896207, 0.27638225, -0.23711256, 0.07372294],
[-0.00739129, -0.0487418 , -0.62912397, 0.46816277, 0.09251699],
[-0.00711022, -0.05623318, -0.36606659, -0.0132978 , 0.64910332],
[ 0.00233552, -0.02156115, 0.09096243, -0.38438823, 0.66164772]])]
%% Cell type:code id: tags:
``` python
Automaton.write(sp.automaton, train_file + ".yaml", "yaml")
```
%% Cell type:code id: tags:
``` python
Ayl = Automaton.read(train_file + ".yaml", "yaml")
```
%% Cell type:code id: tags:
``` python
Ayl.transitions
```
%% Output
[array([[ 0.04512121, -0.2403897 , 0.34945 , -0.28116807, -0.21402523],
[ 0.06925801, -0.30062293, 0.20641375, -0.14960814, -0.55805732],
[ 0.02980115, -0.13866481, 0.18362213, -0.20969545, -0.14481622],
[ 0.00569934, -0.02338583, -0.06600665, 0.10749935, -0.15103655],
[-0.02008655, 0.09026348, -0.00552559, -0.03135532, 0.24329022]]),
array([[ 0.07744772, 0.09007074, -0.30472201, 0.27676245, 0.20289396],
[-0.0990298 , -0.08061847, 0.25853171, -0.1208633 , -0.11085208],
[-0.06171079, -0.06244152, 0.12007655, 0.00250637, -0.15679675],
[-0.00273697, -0.00900572, -0.00046003, -0.00855043, -0.05375465],
[ 0.03098733, 0.0397268 , -0.04997113, 0.00357694, 0.14182576]]),
array([[-0.06791915, -0.11357938, 0.37955393, -0.2178498 , -0.22977695],
[ 0.11596642, 0.14914957, -0.13357508, -0.00891606, 0.34841537],
[ 0.01173082, 0.0192738 , 0.04142658, -0.03534659, 0.02316491],
[ 0.00732891, 0.00553651, -0.02245608, 0.03611543, -0.03851434],
[-0.01058989, -0.01062662, -0.00054311, -0.02556748, 0.04984889]]),
array([[ 0.07276211, -0.01571956, 0.07428593, -0.10369862, 0.02475347],
[-0.05607105, -0.08896207, 0.27638225, -0.23711256, 0.07372294],
[-0.00739129, -0.0487418 , -0.62912397, 0.46816277, 0.09251699],
[-0.00711022, -0.05623318, -0.36606659, -0.0132978 , 0.64910332],
[ 0.00233552, -0.02156115, 0.09096243, -0.38438823, 0.66164772]])]
%% Cell type:code id: tags:
``` python
from splearn import Hankel
Hankel.write(sp.hankel, train_file + "_hankel.json", "json")
```
%% Cell type:code id: tags:
``` python
Hb = Hankel.read(train_file + "_hankel.json", "json")
```
%% Cell type:code id: tags:
``` python
# Comparing Hankel instances prints the "Hankel equality check" message below;
# then check that the json- and yaml-read automata agree transition by transition.
Hb == sp.hankel
import numpy as np
for i in range(4):
    print(np.array_equal(Ajs.transitions[i], Ayl.transitions[i]))
```
%% Output
Hankel equality check
True
True
True
True
%% Cell type:code id: tags:
``` python
Hb.lhankel
```
%% Output
[<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 8251 stored elements in Dictionary Of Keys format>,
<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 2199 stored elements in Dictionary Of Keys format>,
<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 2122 stored elements in Dictionary Of Keys format>,
<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 1091 stored elements in Dictionary Of Keys format>,
<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 3489 stored elements in Dictionary Of Keys format>]
%% Cell type:code id: tags:
``` python
sp.hankel.lhankel
```
%% Output
[<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 8251 stored elements in Dictionary Of Keys format>,
<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 2199 stored elements in Dictionary Of Keys format>,
<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 2122 stored elements in Dictionary Of Keys format>,
<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 1091 stored elements in Dictionary Of Keys format>,
<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 3489 stored elements in Dictionary Of Keys format>]
%% Cell type:code id: tags:
``` python
import scipy.sparse as sps
import numpy as np
```
%% Cell type:code id: tags:
``` python
# Mimic the serializer's DOK encoding: each (i, j) key becomes a "(i,j)" string.
data = sp.hankel.lhankel[0]
k_str = "({0:d},{1:d})"
dico = dict(zip([k_str.format(i, j) for (i, j) in data.keys()], data.values()))
from splearn import Hankel
Hankel.write(sp.hankel, train_file + "_hankel.json", "json")
```
%% Cell type:code id: tags:
``` python
# Rebuild the DOK matrix from the string-keyed dictionary.
dok = sps.dok_matrix(data.shape, dtype=data.dtype)
for k, val in dico.items():
    k = k.replace("(", "").replace(")", "")
    ind1, ind2 = k.split(",")
    dok[(int(ind1), int(ind2))] = val
Hb = Hankel.read(train_file + "_hankel.json", "json")
```
%% Cell type:code id: tags:
``` python
print(dok)
Hb == sp.hankel
```
%% Output
(0, 2950) 6.0
(1, 1141) 6.0
(2, 820) 6.0
(9, 192) 6.0
(35, 75) 6.0
(123, 12) 6.0
(358, 4) 6.0
(832, 0) 6.0
(0, 20) 551.0
(1, 4) 551.0
(5, 0) 551.0
(0, 837) 9.0
(1, 212) 9.0
(4, 33) 9.0
(14, 17) 9.0
(56, 1) 9.0
(183, 0) 9.0
(0, 254) 7.0
(1, 25) 7.0
(2, 9) 7.0
(7, 1) 7.0
(26, 0) 7.0
(0, 3160) 5.0
(1, 1601) 5.0
(5, 323) 5.0
: :
(607, 109) 1.0
(1270, 48) 1.0
(34, 2382) 1.0
(117, 1262) 1.0
(336, 580) 1.0
(761, 265) 1.0
(464, 3272) 1.0
(1015, 1821) 1.0
(338, 2911) 1.0
(770, 1090) 1.0
(0, 2926) 1.0
(1, 1113) 1.0
(2, 767) 1.0
(9, 131) 1.0
(34, 70) 1.0
(119, 7) 1.0
(343, 3) 1.0
(786, 0) 1.0
(1073, 2555) 1.0
(0, 825) 1.0
(1, 197) 1.0
(3, 80) 1.0
(13, 17) 1.0
(53, 1) 1.0
(175, 0) 1.0
True
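%% Cell type:markdown id: tags:
The two cells above spell out the scheme used for DOK matrices: each (i, j) key becomes a "(i,j)" string so the dictionary is JSON-serializable. A self-contained round trip of that scheme on a toy matrix (illustrative sketch, independent of splearn):
%% Cell type:code id: tags:
``` python
import json
import numpy as np
import scipy.sparse as sps

toy = sps.dok_matrix((2, 3), dtype=np.float64)
toy[0, 1] = 2.0
toy[1, 2] = 5.0
# Encode: tuple keys -> "(i,j)" strings.
payload = json.dumps({"({0:d},{1:d})".format(i, j): v for (i, j), v in toy.items()})
# Decode: parse the strings back into indices.
back = sps.dok_matrix(toy.shape, dtype=toy.dtype)
for k, v in json.loads(payload).items():
    i, j = (int(x) for x in k.strip("()").split(","))
    back[i, j] = v
(back.toarray() == toy.toarray()).all()
```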
%% Cell type:code id: tags:
``` python
from splearn import Hankel
Hankel.write(sp.hankel, train_file + "_hankel.yaml", "yaml")
```
%% Cell type:code id: tags:
``` python
Hb = Hankel.read(train_file + "_hankel.yaml", "yaml")
```
%% Cell type:code id: tags:
``` python
Hb == sp.hankel
```
%% Output
Hankel equality check
True
%% Cell type:code id: tags:
``` python
yamlstr = "- scipy.dok_matrix:\n dtype: float64\n shape:\n tuple: [1, 1]\n values: {'(0,0)': 1.0}"
```
%% Cell type:code id: tags:
``` python
print(yamlstr)
```
%% Output
- scipy.dok_matrix:
dtype: float64
shape:
tuple: [1, 1]
values: {'(0,0)': 1.0}
%% Cell type:code id: tags:
``` python
Serializer.yaml_to_data(yamlstr)
```
%% Output
[<1x1 sparse matrix of type '<class 'numpy.float64'>'
with 1 stored elements in Dictionary Of Keys format>]
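%% Cell type:markdown id: tags:
The same string can also be unpacked by hand, which makes the YAML convention explicit. A sketch using PyYAML and scipy directly, assuming only the `scipy.dok_matrix` / `tuple` wrappers visible above:
%% Cell type:code id: tags:
``` python
import yaml
import scipy.sparse as sps

entry = yaml.safe_load(yamlstr)[0]["scipy.dok_matrix"]
shape = tuple(entry["shape"]["tuple"])
mat = sps.dok_matrix(shape, dtype=entry["dtype"])
for key, val in entry["values"].items():
    i, j = (int(x) for x in key.strip("()").split(","))
    mat[i, j] = val
mat.toarray()
```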
%% Cell type:code id: tags:
``` python
# `dy` was defined in a cell not shown here; presumably something like
# dy = Serializer.data_to_yaml(yamlstr), which yaml-quotes the whole string.
dy
```
%% Output
'"- scipy.dok_matrix:\\n dtype: float64\\n shape:\\n tuple: [1, 1]\\n values:\\\n \\ {\'(0,0)\': 1.0}"\n'
%% Cell type:code id: tags:
``` python
issubclass(TypeError, ValueError)
```
%% Output
False
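%% Cell type:markdown id: tags:
Because `TypeError` is not a subclass of `ValueError`, an `except ValueError` clause will not intercept the `TypeError` raised by, for instance, the `DataSample.data` setter in the diff below. A minimal illustration:
%% Cell type:code id: tags:
``` python
try:
    raise TypeError("sample should be a SplearnArray.")
except ValueError:
    print("caught as ValueError")  # never reached
except TypeError:
    print("caught as TypeError")   # this branch runs
```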
%% Cell type:code id: tags:
``` python
"The input data string (" + data_str + ") should contain the following keys : \"" + '\", \"'.join(keys) + "\""
```
......
from splearn.datasets.base import *
from splearn.datasets.data_sample import DataSample, Splearn_array
\ No newline at end of file
from splearn.datasets.data_sample import DataSample, SplearnArray
\ No newline at end of file
......@@ -3,38 +3,25 @@ import numpy as np
from splearn.datasets.data_sample import DataSample
def load_data_sample(adr, type='SPiCe', pickle=False):
def load_data_sample(adr, filetype='SPiCe', pickle=False):
"""Load a sample from file and returns a dictionary
(word,count)
- Input:
:param lrows: number or list of rows,
a list of strings if partial=True;
otherwise, based on pref if version="classic" or
"prefix", fact otherwise
:type lrows: int or list of int
:param lcolumns: number or list of columns
a list of strings if partial=True ;
otherwise, based on suff if version="classic" or "suffix",
fact otherwise
:type lcolumns: int or list of int
:param string version: (default = "classic") version name
:param boolean partial: (default value = False) build of partial
if True partial dictionaries are loaded based
on nrows and lcolumns
:param str adr: address and name of the loaded file
:param str filetype: (default value = 'SPiCe') indicates
the structure of the file. Should be either 'SPiCe' or 'Pautomac'
:param boolean pickle: if enabled, a pickle file is created from the loaded file. Default is False.
- Output:
:returns: nbL , nbEx , dsample , dpref , dsuff , dfact
:rtype: int , int , dict , dict , dict , dict
:returns: corresponding DataSample
:rtype: DataSample
:Example:
Let's say you are interested in the samples 10, 25, and 50, and want to
know their class name.
>>> from splearn.datasets.base import load_data_sample
>>> from splearn.tests.datasets.get_dataset_path import get_dataset_path
>>> train_file = '3.pautomac_light.train' # '4.spice.train'
......@@ -54,13 +41,13 @@ def load_data_sample(adr, type='SPiCe', pickle=False):
"""
if type == 'SPiCe' or type == 'Pautomac':
if filetype == 'SPiCe' or filetype == 'Pautomac':
data = _load_file_doublelecture(adr=adr, pickle=pickle)
return DataSample(data=data)
def _load_file_doublelecture(adr, pickle=False):
dsample = {} # dictionary (word,count)
nb_sample, max_length = _read_dimension(adr=adr)
_, max_length = _read_dimension(adr=adr)
f = open(adr, "r")
line = f.readline()
l = line.split()
......@@ -107,49 +94,6 @@ def _read_dimension(adr):
"do not match number of samples " + str(nb_sample))
return nb_sample , max_length
# def _load_file_1lecture(adr, pickle=False):
# dsample = {} # dictionary (word,count)
# f = open(adr, "r")
# line = f.readline()
# l = line.split()
# nbEx = int(l[0])
# nbL = int(l[1])
# line = f.readline()
# data1 = np.zeros((0,0))
# length = 0
# while line:
# l = line.split()
# # w = () if int(l[0]) == 0 else tuple([int(x) for x in l[1:]])
# # dsample[w] = dsample[w] + 1 if w in dsample else 1
# # handling of the empty word for prefixes, suffixes and factors
# w = [] if int(l[0]) == 0 else [int(x) for x in l[1:]]
# word = np.array(w, ndmin=2, dtype=np.uint32)
# diff = abs(int(l[0]) - length)
# if len(w) > length and not np.array_equal(data1, np.zeros((0,0))):
# data1 = _add_empty(data1, diff)
# elif word.shape[0] < length and not np.array_equal(data1, np.zeros((0,0))):
# word = _add_empty(word, diff)
#
# if np.array_equal(data1, np.zeros((0,0))):
# data1 = word
# else:
# data1 = np.concatenate((data1, word), axis=0)
# length = data1.shape[1]
# line = f.readline()
#
# f.close()
# if pickle:
# _create_pickle_files(adr=adr, dsample=dsample)
# return nbL, nbEx, data1
# def _add_empty(data, diff):
# empty = np.zeros((data.shape[0], diff))
# empty += -1
# data = np.concatenate((data, empty), axis=1)
# return data
def _create_pickle_files(self, adr, dsample):
f = open(adr + ".sample.pkl", "wb")
pickle.dump(dsample, f)
......
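The hunk above renames the `type` argument of `load_data_sample` to `filetype`, avoiding the shadowing of the `type` builtin. A hedged usage sketch against the new signature (file name taken from the notebook above):

``` python
from splearn.datasets.base import load_data_sample
from splearn.tests.datasets.get_dataset_path import get_dataset_path

# In this commit both 'SPiCe' and 'Pautomac' route to the same
# double-read loader, so either value works here.
data = load_data_sample(adr=get_dataset_path('3.pautomac_light.train'),
                        filetype='Pautomac')
print(data.nbL, data.nbEx)
```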
......@@ -33,29 +33,58 @@
#
#
# ######### COPYRIGHT #########
"""This module contains the DataSample class and Splearn_array class
The DataSample class encapsulates a sample 's components
nbL and nbEx numbers,
Splearn_array class inherit from numpy ndarray and contains a 2d data ndarray
with the shape
==== ==== ==== ==== ====
x x x x -1
x x x x x
x x -1 -1 -1
x -1 -1 -1 -1
-1 -1 -1 -1 -1
==== ==== ==== ==== ====
where -1 a indicates a empty cell,
the number nbL and nbEx and , the fourth dictionaries for sample,
prefix, suffix and factor where they are computed
"""This module contains the DataSample class and SplearnArray class.
"""
import numpy as np
class Splearn_array(np.ndarray):
"""Splearn_array inherit from numpy ndarray
class SplearnArray(np.ndarray):
"""Sample data array used by the splearn spectral estimation
**SplearnArray** class inherit from numpy ndarray as a 2d data ndarray.
Example of a possible 2d shape:
+---+---+---+---+---+
| 0| 1| 0| 3| -1|
+---+---+---+---+---+
| 0| 0| 3| 3| 1|
+---+---+---+---+---+
| 1| 1| -1| -1| -1|
+---+---+---+---+---+
| 5| -1| -1| -1| -1|
+---+---+---+---+---+
| -1| -1| -1| -1| -1|
+---+---+---+---+---+
is equivalent to:
- word (0103) or abad
- word (00331) or aaddb
- word (11) or bb
- word (5) or f
- word () or empty
Each row represents a word of the sample. Words are written with integer letters (0->a, 1->b, 2->c ...) and
-1 marks the end of the word. The number of rows is the total number of words in the sample (= nbEx) and the number of columns
is given by the size of the longest word. Notice that words are not deduplicated:
if a word appears twice in the sample, it is counted as two different examples.
The DataSample class also encapsulates the sample's parameters 'nbL' and 'nbEx' (the number of letters in the alphabet and
the number of examples) and the four dictionaries 'sample', 'prefix', 'suffix' and 'factor' that are populated during the fit
estimations.
- Input:
:param nd.array input_array: input ndarray that will be converted into **SplearnArray**
:param int nbL: the number of letters
:param int nbEx: total number of examples.
:param dict sample: the keys are the words and the values are the number of times each appears in the sample.
:param dict pref: the keys are the prefixes and the values are the number of times each appears in the sample.
:param dict suff: the keys are the suffixes and the values are the number of times each appears in the sample.
:param dict fact: the keys are the factors and the values are the number of times each appears in the sample.
:Example:
......@@ -66,7 +95,7 @@ class Splearn_array(np.ndarray):
>>> print(data.__class__)
>>> data.data
<class 'splearn.datasets.data_sample.DataSample'>
GSplearn_array([[ 3., 0., 3., ..., -1., -1., -1.],
SplearnArray([[ 3., 0., 3., ..., -1., -1., -1.],
[ 3., 3., -1., ..., -1., -1., -1.],
[ 3., 2., 0., ..., -1., -1., -1.],
...,
......@@ -96,150 +125,15 @@ class Splearn_array(np.ndarray):
self.suff = getattr(obj, 'suff', None)
self.fact = getattr(obj, 'fact', None)
# def select_rows(self, nb_rows_max=1000, version='classic'):
# """define lrows
#
# - Input:
#
# :param int nb_rows_max: (default = 1000) number of maximum rows
# :param string version: (default = "classic") version name
#
# - Output:
#
# :returns: list lrows, list of rows
# :rtype: list
# """
# lRows = [] # list to return
# nbRows = 0
# lLeafs = [([], self.nbEx )]
# # pref[()] the list of (frontier prefix, occurrence count) pairs,
# # initialised with the empty prefix
# if version == 'classic':
# while lLeafs and nbRows < nb_rows_max:
# lastWord = lLeafs.pop()[
# 0] # the most frequent frontier prefix
# lRows.append(tuple(lastWord))
# nbRows += 1
# for i in range(self.nbL):
# newWord = lastWord + [i] # successor of lastWord
# tnewWord = tuple(newWord) # associated tuple
# if tnewWord in self.pref:
# # add a new frontier prefix
# lLeafs.append((newWord, self.pref[tnewWord]))
# lLeafs = sorted(lLeafs, key=lambda x: x[1])
# elif version == 'prefix':
# while lLeafs and nbRows < nb_rows_max:
# lastWord = lLeafs.pop()[
# 0] # the most frequent frontier prefix
# lRows.append(tuple(lastWord))
# nbRows += 1
# for i in range(self.nbL):
# newWord = lastWord + [i] # successeur de lastword
# tnewWord = tuple(newWord) # tuple associé
# if tnewWord in self.pref:
# # ajout d'un nouveau prefixe frontière
# nb = 0
# for u in self.sample:
# if tnewWord <= u:
# nb += self.sample[u] * (
# len(u) - len(tnewWord) + 1)
# lLeafs.append((newWord, nb))
# lLeafs = sorted(lLeafs, key=lambda x: x[1])
# elif version == 'factor':
# while lLeafs and nbRows < nb_rows_max:
# lastWord = lLeafs.pop()[
# 0] # the most frequent frontier prefix
# lRows.append(tuple(lastWord))
# nbRows += 1
# for i in range(self.nbL):
# newWord = lastWord + [i] # successor of lastWord
# tnewWord = tuple(newWord) # associated tuple
# if tnewWord in self.fact:
# # add a new frontier prefix
# nb = 0
# lw = len(tnewWord)
# for u in self.sample:
# if len(u) >= lw:
# for i in range(lw, len(u) + 1):
# if u[:i][-lw:] == tnewWord:
# nb += self.sample[u] * (len(u) - i + 1)
# lLeafs.append((newWord, nb))
# lLeafs = sorted(lLeafs, key=lambda x: x[1])
# # print(lLeafs)
# return lRows
# def select_columns(self, nb_columns_max=1000, version='classic'):
# """define lcolumns
#
# - Input:
#
# :param int nb_columns_max: (default = 1000) number of maximum columns
# :param string version: (default = "classic") version name
#
# - Output:
#
# :returns:list lcolumns, list of columns
# :rtype: list
# """
# lColumns = [] # list to return
# lLeafs = [([], self.nbEx)] # the list of (frontier suffix,
# # occurrence count) pairs, initialised with the empty suffix
#
# nbColumns = 0
# if version == 'classic':
# while lLeafs and nbColumns < nb_columns_max:
# lastWord = lLeafs.pop()[
# 0] # the most frequent frontier suffix
# lColumns.append(tuple(lastWord))
# nbColumns += 1
# for i in range(self.nbL):
# newWord = lastWord + [i] # successor of lastWord
# tnewWord = tuple(newWord) # associated tuple
# if tnewWord in self.suff:
# # add a new frontier suffix
# lLeafs.append((newWord, self.suff[tnewWord]))
# lLeafs = sorted(lLeafs, key=lambda x: x[
# 1]) # most frequent suffix last
# # print(lLeafs)
# elif version == 'prefix':
# while lLeafs and nbColumns < nb_columns_max:
# lastWord = lLeafs.pop()[
# 0] # the most frequent frontier prefix
# lColumns.append(tuple(lastWord))
# nbColumns += 1
# for i in range(self.nbL):
# newWord = lastWord + [i] # successor of lastWord
# tnewWord = tuple(newWord) # associated tuple
# if tnewWord in self.fact:
# # add a new frontier suffix
# lLeafs.append((newWord, self.fact[tnewWord]))
# lLeafs = sorted(lLeafs, key=lambda x: x[1])
# elif version == 'factor':
# while lLeafs and nbColumns < nb_columns_max:
# lastWord = lLeafs.pop()[
# 0] # the most frequent frontier prefix
# lColumns.append(tuple(lastWord))
# nbColumns += 1
# for i in range(self.nbL):
# newWord = lastWord + [i] # successor of lastWord
# tnewWord = tuple(newWord) # associated tuple
# if tnewWord in self.fact:
# # add a new frontier prefix
# nb = 0
# lw = len(tnewWord)
# for u in self.sample:
# if len(u) >= lw:
# for i in range(lw, len(u) + 1):
# if u[:i][-lw:] == tnewWord:
# nb += self.sample[u] * (i - lw + 1)
# lLeafs.append((newWord, nb))
# lLeafs = sorted(lLeafs, key=lambda x: x[1])
# # print(lLeafs)
# return lColumns
class DataSample(dict):
""" A DataSample instance
- Input:
:param tuple data: a tuple of (int, int, numpy.array) for the corresponding three elements
(nbL, nbEx, data) where nbL is the number of letters in the alphabet, nbEx is the number
of samples and data is the 2d data array
:Example:
>>> from splearn.datasets.base import load_data_sample
......@@ -254,46 +148,20 @@ class DataSample(dict):
5000
>>> data.data
- Input:
:param string adr: adresse and name of the loaden file
:param string type: (default value = 'SPiCe') indicate
the structure of the file
:param lrows: number or list of rows,
a list of strings if partial=True;
otherwise, based on self.pref if version="classic" or
"prefix", self.fact otherwise
:type lrows: int or list of int
:param lcolumns: number or list of columns
a list of strings if partial=True ;
otherwise, based on self.suff if version="classic" or "suffix",
self.fact otherwise
:type lcolumns: int or list of int
:param string version: (default = "classic") version name
:param boolean partial: (default value = False) build of partial
"""
def __init__(self, data=None, type='SPiCe', **kwargs):
# Size of the alphabet
self._nbL = 0
# Number of samples
self._nbEx = 0
def __init__(self, data=None, **kwargs):
# The dictionary that contains the sample
self._data = Splearn_array(np.zeros((0,0)))
self._data = SplearnArray(np.zeros((0,0)))
if data is not None:
self.nbL = data[0]
self.nbEx = data[1]
self.data = Splearn_array(data[2], nbL=data[0], nbEx=data[1])
self.data = SplearnArray(data[2], nbL=data[0], nbEx=data[1])
super(DataSample, self).__init__(kwargs)
@property
def nbL(self):
"""Number of letters"""
return self._nbL
return self.data.nbL
@nbL.setter
def nbL(self, nbL):
......@@ -302,13 +170,12 @@ class DataSample(dict):
if nbL < 0:
raise ValueError("The size of the alphabet should " +
"an integer >= 0")
self._nbL = nbL
self.data.nbL = nbL
@property
def nbEx(self):
"""Number of examples"""
return self._nbEx
return self.data.nbEx
@nbEx.setter
def nbEx(self, nbEx):
......@@ -317,21 +184,17 @@ class DataSample(dict):
if nbEx < 0:
raise ValueError("The number of examples should be " +
" an integer >= 0")
self._nbEx = nbEx
self.data.nbEx = nbEx
@property
def data(self):
"""Splearn_array"""
"""SplearnArray"""
return self._data
@data.setter
def data(self, data):
if isinstance(data, (Splearn_array, np.ndarray, np.generic)):
if isinstance(data, (SplearnArray, np.ndarray, np.generic)):
self._data = data
else:
raise TypeError("sample should be a Splearn_array.")
raise TypeError("sample should be a SplearnArray.")
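The new `SplearnArray` docstring above fully specifies the encoding: a 2d ndarray of integer letters, right-padded with -1. A minimal construction sketch against the new class (values are illustrative; the constructor call matches the one used in `DataSample.__init__` above):

``` python
import numpy as np
from splearn.datasets.data_sample import SplearnArray

# Three words over a 4-letter alphabet: (0,1,0,3), (1,1) and the empty
# word, padded with -1 as in the docstring's table.
words = np.array([[ 0,  1,  0,  3],
                  [ 1,  1, -1, -1],
                  [-1, -1, -1, -1]])
X = SplearnArray(words, nbL=4, nbEx=3)
print(X.nbL, X.nbEx)
```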
......@@ -44,21 +44,9 @@ import numpy as np
class Hankel(object):
""" A Hankel instance , compute the list of Hankel matrices
:Example:
>>> from splearn import Learning, Hankel , Spectral
>>> train_file = '0.spice.train'
>>> pT = load_data_sample(adr=train_file)
>>> sp = Spectral()
>>> sp.fit(X=pT.data)
>>> lhankel = Hankel( sample_instance=pT.sample,
>>> nbL=pT.nbL, nbEx=pT.nbEx,
>>> lrows=6, lcolumns=6, version="classic",
>>> partial=True, sparse=True, mode_quiet=True).lhankel
- Input:
:param Splearn_array sample_instance: instance of Splearn_array
:param SplearnArray sample_instance: instance of SplearnArray
:param lrows: number or list of rows,
a list of strings if partial=True;
otherwise, based on self.pref if version="classic" or
......@@ -79,6 +67,19 @@ class Hankel(object):
*sample_instance* or *lhankel* must not be None. If *sample_instance* is given,
the **Hankel** instance is built directly from the sample dictionary,
else it is deduced from the *lhankel* list of matrices.
:Example:
>>> from splearn import Learning, Hankel , Spectral
>>> train_file = '0.spice.train'
>>> pT = load_data_sample(adr=train_file)
>>> sp = Spectral()
>>> sp.fit(X=pT.data)
>>> lhankel = Hankel( sample_instance=pT.sample,
>>> nbL=pT.nbL, nbEx=pT.nbEx,
>>> lrows=6, lcolumns=6, version="classic",
>>> partial=True, sparse=True, mode_quiet=True).lhankel
"""
def __init__(
......@@ -177,6 +178,8 @@ class Hankel(object):
@property
def build_from_sample(self):
"""Boolean that indicates if the matrices have been build form sample or not
(directly build from an Automaton in this case) """
return self._build_from_sample
@build_from_sample.setter
......@@ -193,10 +196,10 @@ class Hankel(object):
- Input:
:param dict sample: sample dictionary
:param dict pref: prefix dictionary
:param dict suff: suffix dictionary
:param dict fact: factor dictionary
:param dict sample: the keys are the words and the values are the number of times each appears in the sample.
:param dict pref: the keys are the prefixes and the values are the number of times each appears in the sample.
:param dict suff: the keys are the suffixes and the values are the number of times each appears in the sample.
:param dict fact: the keys are the factors and the values are the number of times each appears in the sample.
:param lrows: number or list of rows,
a list of strings if partial=True;
otherwise, based on self.pref if version="classic" or
......
......@@ -41,7 +41,10 @@
from __future__ import division, print_function
import numpy as np
import math
from splearn.datasets.data_sample import Splearn_array
import threading
lock = threading.Lock()
from splearn.datasets.data_sample import SplearnArray
from splearn.hankel import Hankel
from sklearn.base import BaseEstimator
from sklearn.utils import check_array
......@@ -51,29 +54,6 @@ import warnings
class Spectral(BaseEstimator):
"""A Spectral estimator instance
:Example:
>>> from splearn.spectral import Spectral
>>> sp = Spectral()
>>> sp.set_params(partial=True, lcolumns=6, lrows=6, smooth_method='trigram')
Spectral(lcolumns=6, lrows=6, mode_quiet=False, partial=True, rank=5,
smooth_method='trigram', sparse=True, version='classic')
>>> sp.fit(data.data)
Start Hankel matrix computation
End of Hankel matrix computation
Start Building Automaton from Hankel matrix
End of Automaton computation
Spectral(lcolumns=6, lrows=6, partial=True, rank=5, smooth_method='trigram', sparse=True, version='classic')
>>> sp.automaton.initial
array([-0.00049249, 0.00304676, -0.04405996, -0.10765322, -0.08660063])
>>> sp.predict(data.data)
array([ 4.38961058e-04, 1.10616861e-01, 1.35569353e-03, ...,
4.66041996e-06, 4.68177275e-02, 5.24287604e-20])
>>> sp.loss(data.data, normalize=True)
-10.530029936056017
>>> sp.score(data.data)
10.530029936056017
- Input:
:param int rank: the rank of the truncated singular value decomposition of the Hankel matrix
......@@ -106,6 +86,28 @@ class Spectral(BaseEstimator):
:param boolean mode_quiet: (default value = False) True for no
output message.
:Example:
>>> from splearn.spectral import Spectral
>>> sp = Spectral()
>>> sp.set_params(partial=True, lcolumns=6, lrows=6, smooth_method='trigram')
Spectral(lcolumns=6, lrows=6, mode_quiet=False, partial=True, rank=5,
smooth_method='trigram', sparse=True, version='classic')
>>> sp.fit(data.data)
Start Hankel matrix computation
End of Hankel matrix computation
Start Building Automaton from Hankel matrix
End of Automaton computation
Spectral(lcolumns=6, lrows=6, partial=True, rank=5, smooth_method='trigram', sparse=True, version='classic')
>>> sp.automaton.initial
array([-0.00049249, 0.00304676, -0.04405996, -0.10765322, -0.08660063])
>>> sp.predict(data.data)
array([ 4.38961058e-04, 1.10616861e-01, 1.35569353e-03, ...,
4.66041996e-06, 4.68177275e-02, 5.24287604e-20])
>>> sp.loss(data.data, normalize=True)
-10.530029936056017
>>> sp.score(data.data)
10.530029936056017
"""
def __init__(self, rank=5, lrows=7, lcolumns=7,
......@@ -172,8 +174,7 @@ class Spectral(BaseEstimator):
self.smooth = 0
def set_params(self, **parameters):
"""
set the values of Spectral estimator parameters
"""set the values of Spectral estimator parameters
- Output:
......@@ -186,12 +187,12 @@ class Spectral(BaseEstimator):
self._rule_smooth_method(value)
return self
def fit(self, X, y=None): #, gram
def fit(self, X, y=None):
"""Fit the model
- Input:
:param Splearn_array X: object of shape [n_samples,n_features]
:param SplearnArray X: object of shape [n_samples,n_features]
Training data
:param ndarray y: (default value = None) not used by Spectral estimator
numpy array of shape [n_samples] Target values
......@@ -206,11 +207,11 @@ class Spectral(BaseEstimator):
"""
check_array(X)
if not isinstance(X, Splearn_array):
if not isinstance(X, SplearnArray):
self._hankel = None
self._automaton = None
return self
X = self._polulate_dictionnaries(X)
X = self.polulate_dictionnaries(X)
self._hankel = Hankel(sample_instance=X,
lrows=self.lrows, lcolumns=self.lcolumns,
version=self.version,
......@@ -232,8 +233,108 @@ class Spectral(BaseEstimator):
dsample[w] = dsample[w] + 1 if w in dsample else 1
return dsample
def _polulate_dictionnaries(self, X):
if not isinstance(X, Splearn_array):
# def _populate_new_word(self, X, i, lrowsmax=None, version_rows_int=None,
# lcolumnsmax=None, version_columns_int=None, lmax=None):
# w = X[i, :]
# w = w[w >= 0]
# w = tuple([int(x) for x in w[0:]])
# with lock:
# X.sample[w] = X.sample.setdefault(w, 0) + 1
# if self.version == "prefix" or self.version == "classic":
# # empty word treatment for the prefix, suffix, and factor dictionaries
# with lock:
# X.pref[()] = X.pref[()] + 1 if () in X.pref else 1
# if self.version == "suffix" or self.version == "classic":
# with lock:
# X.suff[()] = X.suff[()] + 1 if () in X.suff else 1
# if self.version == "factor" or self.version == "suffix" \
# or self.version == "prefix":
# with lock:
# X.fact[()] = X.fact[()] + len(w) + 1 if () in X.fact else len(w) + 1
#
# if self.partial:
# for i in range(len(w)):
# if self.version == "classic":
# if (version_rows_int is True and
# i + 1 <= lrowsmax) or \
# (version_rows_int is False and
# w[:i + 1] in self.lrows):
# with lock:
# X.pref[w[:i + 1]] = \
# X.pref[w[:i + 1]] + 1 if w[:i + 1] in X.pref else 1
# if (version_columns_int is True and i + 1 <= lcolumnsmax) or \
# (version_columns_int is False and w[-( i + 1):] in self.lcolumns):
# with lock:
# X.suff[w[-(i + 1):]] = X.suff[w[-(i + 1):]] + 1 if \
# w[-(i + 1):] in X.suff else 1
# if self.version == "prefix":
# # dictionaries dpref is populated until
# # lmax = lrows + lcolumns
# # dictionaries dfact is populated until lcolumns
# if ((version_rows_int is True or
# version_columns_int is True) and
# i + 1 <= lmax) or \
# (version_rows_int is False and
# (w[:i + 1] in self.lrows)) or \
# (version_columns_int is False and
# (w[:i + 1] in self.lcolumns)):
# X.pref[w[:i + 1]] = X.pref[w[:i + 1]] + 1 \
# if w[:i + 1] in X.pref else 1
# for j in range(i + 1, len(w) + 1):
# if (version_columns_int is True and (
# j - i) <= lmax) or \
# (version_columns_int is False and
# (w[i:j] in self.lcolumns)):
# X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \
# if w[i:j] in X.fact else 1
# if self.version == "suffix":
# if ((version_rows_int is True or
# version_columns_int is True) and
# i <= lmax) or \
# (version_rows_int is False and
# (w[-(i + 1):] in self.lrows)) or \
# (version_columns_int is False and
# (w[-(i + 1):] in self.lcolumns)):
# X.suff[w[-(i + 1):]] = X.suff[w[-(i + 1):]] + 1 \
# if w[-(i + 1):] in X.suff else 1
# for j in range(i + 1, len(w) + 1):
# if (version_rows_int is True and (
# j - i) <= lmax) or \
# (version_rows_int is False and
# (w[i:j] in self.lrows)):
# X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \
# if w[i:j] in X.fact else 1
# if self.version == "factor":
# for j in range(i + 1, len(w) + 1):
# if ((version_rows_int is True or
# version_columns_int is True) and
# (j - i) <= lmax) or \
# (version_rows_int is False and
# (w[i:j] in self.lrows)) or \
# (version_columns_int is False and
# (w[i:j] in self.lcolumns)):
# X.fact[w[i:j]] = \
# X.fact[w[i:j]] + 1 if w[i:j] in X.fact else 1
#
# else: # not partial
# for i in range(len(w)):
# X.pref[w[:i + 1]] = X.pref[w[:i + 1]] + 1 \
# if w[:i + 1] in X.pref else 1
# X.suff[w[i:]] = X.suff[w[i:]] + 1 if w[i:] in X.suff else 1
# for j in range(i + 1, len(w) + 1):
# X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \
# if w[i:j] in X.fact else 1
def polulate_dictionnaries(self, X):
"""Populates the *sample*, *pref*, *suff*, *fact* dictionnaries of X
- Input:
:param SplearnArray X: object of shape [n_samples,n_features]
Training data
"""
if not isinstance(X, SplearnArray):
return X
dsample = {} # dictionary (word,count)
dpref = {} # dictionary (prefix,count)
......@@ -459,7 +560,7 @@ class Spectral(BaseEstimator):
- Input:
:param Splearn_array X : of shape data shape = (n_samples, n_features)
:param SplearnArray X : of shape data shape = (n_samples, n_features)
Samples.
......@@ -489,7 +590,7 @@ class Spectral(BaseEstimator):
- Input:
:param Splearn_array X : Samples, data shape = (n_samples, n_features)
:param SplearnArray X : Samples, data shape = (n_samples, n_features)
- Output:
......@@ -537,18 +638,17 @@ class Spectral(BaseEstimator):
return Y
def loss(self, X, y=None, normalize=True):
"""
Log probability using the Spectral model
"""Log probability using the Spectral model
- Input:
:param Splearn_array X : of shape data shape = (n_samples, n_features)
:param SplearnArray X: of shape data shape = (n_samples, n_features)
Samples. X is validation data.
:param ndarray y: (default value = None)
numpy array of shape [n_samples] Target values,
is the ground truth target for X (in the supervised case) or
None (in the unsupervised case)
:param boolean normalize (default value = True) calculation are
:param boolean normalize: (default value = True) if True, the result
is normalized by the number of samples
- Output:
......@@ -584,7 +684,7 @@ class Spectral(BaseEstimator):
- Input:
:param Splearn_array X: of shape data shape = (n_samples, n_features)
:param SplearnArray X: of shape data shape = (n_samples, n_features)
Samples.
:param ndarray y: (default value = None)
numpy array of shape [n_samples] Target values,
......
......@@ -38,7 +38,7 @@ from __future__ import division, print_function
import numpy as np
import unittest
from splearn.datasets.base import load_data_sample
from splearn.datasets.data_sample import DataSample, Splearn_array
from splearn.datasets.data_sample import DataSample, SplearnArray
from splearn.tests.datasets.get_dataset_path import get_dataset_path
from splearn.spectral import Spectral
......@@ -68,7 +68,7 @@ class UnitaryTest(unittest.TestCase):
s = load_data_sample(adr=adr)
cl = Spectral()
cl._polulate_dictionnaries(s.data)
cl.polulate_dictionnaries(s.data)
self.assertEqual(s.nbL,s.data.nbL)
self.assertEqual(s.nbEx, s.data.nbEx)
with self.assertRaises(TypeError):
......@@ -88,7 +88,7 @@ class UnitaryTest(unittest.TestCase):
data = load_data_sample(adr=adr)
cl = Spectral(partial=False)
cl._polulate_dictionnaries(data.data)
cl.polulate_dictionnaries(data.data)
nbL = data.data.nbL
nbEx = data.data.nbEx
sample = data.data.sample
......@@ -107,7 +107,7 @@ class UnitaryTest(unittest.TestCase):
self.assertEqual(nbSuff1, nbSuff2)
cl = Spectral(version = 'factor', partial=False)
cl._polulate_dictionnaries(data.data)
cl.polulate_dictionnaries(data.data)
fact = data.data.fact
nbFact1 = sum([sample[w]*(len(w)+1)*(len(w)+2)/2 for w in sample])
nbFact2 = sum([fact[w] for w in fact])
......@@ -117,7 +117,7 @@ class UnitaryTest(unittest.TestCase):
adr = get_dataset_path("0.spice.train")
pT = load_data_sample(adr=adr)
cl = Spectral(partial=False)
cl._polulate_dictionnaries(pT.data)
cl.polulate_dictionnaries(pT.data)
# lR = pT.data.select_rows(nb_rows_max = 10, version = 'classic')
# lC = pT.data.select_columns(nb_columns_max = 10, version = 'classic')
# self.assertEqual(lR, [(), (3,), (3, 0), (3, 3), (3, 0, 3), (3, 1),
......@@ -127,7 +127,7 @@ class UnitaryTest(unittest.TestCase):
# (1,), (1, 3), (3, 0, 3)])
cl = Spectral(version = 'prefix', partial=False)
cl._polulate_dictionnaries(pT.data)
cl.polulate_dictionnaries(pT.data)
# lRp = pT.data.select_rows(nb_rows_max = 10, version = 'prefix')
# lCp = pT.data.select_columns(nb_columns_max = 10, version = 'prefix')
# self.assertEqual(lRp, [(), (3,), (3, 0), (3, 0, 0), (3, 0, 1),
......@@ -137,7 +137,7 @@ class UnitaryTest(unittest.TestCase):
# (0, 3), (1, 3), (3, 1)])
cl = Spectral(version = 'factor', partial=False)
cl._polulate_dictionnaries(pT.data)
cl.polulate_dictionnaries(pT.data)
# lRf = pT.data.select_rows(nb_rows_max = 10, version = 'factor')
# lCf = pT.data.select_columns(nb_columns_max = 10, version = 'factor')
# self.assertEqual(lRf, [(), (3,), (0,), (1,), (3, 0), (3, 3), (2,),
......
......@@ -53,7 +53,7 @@ class HankelTest(unittest.TestCase):
# adr = get_dataset_path("3.pautomac.train")
data = load_data_sample(adr=adr)
cl = Spectral(partial=False)
cl._polulate_dictionnaries(data.data)
cl.polulate_dictionnaries(data.data)
lprefix = [()]
lprefix = lprefix + [(i,) for i in range(data.data.nbL)]
lprefix = lprefix+[(i, j) for i in range(data.data.nbL)
......@@ -123,7 +123,7 @@ class HankelTest(unittest.TestCase):
# adr = get_dataset_path("3.pautomac.train")
data = load_data_sample(adr=adr)
cl = Spectral(partial=False, version="prefix")
cl._polulate_dictionnaries(data.data)
cl.polulate_dictionnaries(data.data)
lprefix = [()]
lprefix = lprefix + [(i,) for i in range(data.data.nbL)]
lprefix = lprefix + [(i, j) for i in range(data.data.nbL)
......@@ -196,7 +196,7 @@ class HankelTest(unittest.TestCase):
# adr = get_dataset_path("3.pautomac.train")
data = load_data_sample(adr=adr)
cl = Spectral(partial=False, version="suffix")
cl._polulate_dictionnaries(data.data)
cl.polulate_dictionnaries(data.data)
lprefix = [()]
lprefix = lprefix + [(i,) for i in range(data.data.nbL)]
lprefix = lprefix + [(i, j) for i in range(data.data.nbL)
......@@ -266,7 +266,7 @@ class HankelTest(unittest.TestCase):
# adr = get_dataset_path("3.pautomac.train")
data = load_data_sample(adr=adr)
cl = Spectral(partial=False, version="factor")
cl._polulate_dictionnaries(data.data)
cl.polulate_dictionnaries(data.data)
lprefix = [()]
lprefix = lprefix + [(i,) for i in range(data.data.nbL)]
lprefix = lprefix + [(i, j) for i in range(data.data.nbL)
......@@ -336,7 +336,7 @@ class HankelTest(unittest.TestCase):
# adr = get_dataset_path("3.pautomac.train")
data = load_data_sample(adr=adr)
cl = Spectral()
cl._polulate_dictionnaries(data.data)
cl.polulate_dictionnaries(data.data)
h = Hankel(sample_instance=data.data, lrows=1, lcolumns=1,
version="classic", partial=False, sparse=False)
with self.assertRaises(TypeError):
......@@ -349,7 +349,7 @@ class HankelTest(unittest.TestCase):
adr = get_dataset_path("essai")
data = load_data_sample(adr=adr)
cl = Spectral()
cl._polulate_dictionnaries(data.data)
cl.polulate_dictionnaries(data.data)
h1 = Hankel(sample_instance=data.data, lrows=1, lcolumns=1,
version="classic", partial=False, sparse=False)
h2 = Hankel(sample_instance=data.data, lrows=1, lcolumns=1,
......
......@@ -41,9 +41,10 @@ from splearn.datasets.base import load_data_sample
from splearn.automaton import Automaton
from splearn.spectral import Spectral
from splearn.tests.datasets.get_dataset_path import get_dataset_path
from sklearn.linear_model.tests.test_passive_aggressive import random_state
class UnitaryTest(unittest.TestCase):
class SpectralTest(unittest.TestCase):
def test_version(self):
adr = get_dataset_path("essai")
......@@ -238,6 +239,25 @@ class UnitaryTest(unittest.TestCase):
np.testing.assert_almost_equal(A.val([0, 1, 0, 1, 1]),
B.val([0, 1, 0, 1, 1]))
def test_sklearn_compatibility(self):
from sklearn.utils.estimator_checks import check_estimator
from sklearn.model_selection import train_test_split, cross_val_score
check_estimator(Spectral)
adr = get_dataset_path("3.pautomac_light.train")
data = load_data_sample(adr=adr)
sp = Spectral(lrows=6, lcolumns=6, rank = 5, sparse=False,
partial=True, smooth_method='trigram')
X_train, X_test = train_test_split(data.data, test_size=0.4, random_state=0)
sp.fit(X_train)
single_predicted_weights = sp.predict(X_test)
print(single_predicted_weights)
self.assertAlmostEqual(single_predicted_weights[0], 6.76217667e-02, delta = 1e-5)
scores = cross_val_score(sp, data.data, cv=4)
print(scores)
scores_expected = [-10.65272755, -10.7090267, -10.78404758, -11.08453211]
for s1, s2 in zip(scores, scores_expected):
self.assertAlmostEqual(s1, s2, delta=0.1)
# def test_Perplexity(self):
# adr = get_dataset_path("3.pautomac")
# P = Learning()
......