Commit cb7dcfc2 authored by Denis Arrivault

Doc cleaning + refactoring Splearn_array into SplearnArray + change Spectral._polulate_dictionnaries into Spectral.polulate_dictionnaries
parent a330c613
%% Cell type:code id: tags:
``` python
from splearn.datasets.base import load_data_sample
from splearn.tests.datasets.get_dataset_path import get_dataset_path
from splearn import Spectral, Automaton, Serializer
train_file = '3.pautomac_light.train'
data = load_data_sample(adr=get_dataset_path(train_file))
sp = Spectral()
sp.fit(X=data.data)
```
%% Output
Start Hankel matrix computation
End of Hankel matrix computation
Start Building Automaton from Hankel matrix
End of Automaton computation
Spectral(lcolumns=7, lrows=7, mode_quiet=False, partial=True, rank=5,
smooth_method='none', sparse=True, version='classic')
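%% Cell type:markdown id: tags:
With the estimator fitted, the usual prediction entry points are available. A minimal sketch mirroring the `Spectral` docstring shown in the diff below (the exact numbers depend on the sample):
%% Cell type:code id: tags:
``` python
# Weight assigned by the learned automaton to each word of the sample.
preds = sp.predict(data.data)
# Mean log-likelihood based loss, and the score, which is its opposite.
print(sp.loss(data.data, normalize=True), sp.score(data.data))
```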
%% Cell type:code id: tags:
``` python
Automaton.write(sp.automaton, train_file + ".json")
```
%% Cell type:code id: tags:
``` python
str1 = Serializer.data_to_json(sp.automaton)
```
%% Cell type:code id: tags:
``` python
A = Serializer.json_to_data(str1)
```
%% Cell type:code id: tags:
``` python
str2 = Serializer.data_to_json(A)
```
%% Cell type:code id: tags:
``` python
str1 == str2
```
%% Output
True
%% Cell type:code id: tags:
``` python
str1
```
%% Output
'{"automaton": {"nbL": 4, "nbS": 5, "initial": {"numpy.ndarray": {"values": [-0.0004934419970497512, 0.0030634697107912346, -0.044073932015580415, -0.1077770261654714, -0.0866391379316952], "dtype": "float64"}}, "final": {"numpy.ndarray": {"values": [0.07757136847945045, -0.024220294003132026, -0.4468125366321221, 0.627732084089759, -0.554674433356224], "dtype": "float64"}}, "transitions": [{"numpy.ndarray": {"values": [[0.04512120959511772, -0.24038969827844062, 0.34944999592135334, -0.2811680730534579, -0.21402523377497645], [0.0692580056243761, -0.30062293462829204, 0.20641375368520157, -0.14960814319756124, -0.5580573163749153], [0.02980115192176571, -0.13866480809160409, 0.18362212572805459, -0.20969545230657607, -0.14481622025561292], [0.005699344003198349, -0.023385825120201414, -0.06600665373981851, 0.10749935271466007, -0.15103654604159977], [-0.02008655193147911, 0.09026347555230492, -0.005525585655539262, -0.031355317090308935, 0.2432902242047721]], "dtype": "float64"}}, {"numpy.ndarray": {"values": [[0.0774477207917058, 0.09007073705762021, -0.3047220063293013, 0.2767624549859105, 0.20289396030628148], [-0.09902980483670844, -0.08061846818727973, 0.25853170692250554, -0.12086330214608881, -0.11085207725068251], [-0.061710792028537534, -0.06244151779954751, 0.12007654564862075, 0.0025063746277943564, -0.1567967473145572], [-0.002736973749965403, -0.009005721984277787, -0.00046003295909181354, -0.008550426472005344, -0.053754646789681754], [0.030987327588710728, 0.03972680066723246, -0.04997113350910248, 0.0035769411874962344, 0.1418257620585633]], "dtype": "float64"}}, {"numpy.ndarray": {"values": [[-0.06791915236220235, -0.11357937659088102, 0.37955392604054394, -0.21784979894046635, -0.22977695089938127], [0.11596642335411328, 0.14914956804629287, -0.13357508376686902, -0.008916063072034974, 0.3484153673774836], [0.011730817547426673, 0.019273800531955612, 0.0414265834586712, -0.035346588560982, 0.02316491010895583], [0.007328911075541707, 0.005536509132796312, -0.022456082950666856, 0.03611543477693187, -0.038514339001406585], [-0.010589894686551544, -0.010626616553723532, -0.000543105645661794, -0.025567476700160314, 0.04984888818929034]], "dtype": "float64"}}, {"numpy.ndarray": {"values": [[0.07276211427780357, -0.0157195576855797, 0.07428592814590385, -0.10369861539249735, 0.024753473688328077], [-0.05607105449779142, -0.08896207276035666, 0.27638225397521243, -0.2371125582838589, 0.07372294122306285], [-0.007391294007753122, -0.048741797963875705, -0.6291239733858526, 0.46816276521577677, 0.09251699239093385], [-0.007110224931878467, -0.05623317735898056, -0.36606658567620365, -0.013297798115225407, 0.6491033177492604], [0.002335515008556511, -0.021561151264484414, 0.09096243479437888, -0.38438823493062646, 0.6616477207948602]], "dtype": "float64"}}], "type": "classic"}}'
%% Cell type:code id: tags:
``` python
str2
```
%% Output
'{"automaton": {"nbL": 4, "nbS": 5, "initial": {"numpy.ndarray": {"values": [-0.0004934419970497512, 0.0030634697107912346, -0.044073932015580415, -0.1077770261654714, -0.0866391379316952], "dtype": "float64"}}, "final": {"numpy.ndarray": {"values": [0.07757136847945045, -0.024220294003132026, -0.4468125366321221, 0.627732084089759, -0.554674433356224], "dtype": "float64"}}, "transitions": [{"numpy.ndarray": {"values": [[0.04512120959511772, -0.24038969827844062, 0.34944999592135334, -0.2811680730534579, -0.21402523377497645], [0.0692580056243761, -0.30062293462829204, 0.20641375368520157, -0.14960814319756124, -0.5580573163749153], [0.02980115192176571, -0.13866480809160409, 0.18362212572805459, -0.20969545230657607, -0.14481622025561292], [0.005699344003198349, -0.023385825120201414, -0.06600665373981851, 0.10749935271466007, -0.15103654604159977], [-0.02008655193147911, 0.09026347555230492, -0.005525585655539262, -0.031355317090308935, 0.2432902242047721]], "dtype": "float64"}}, {"numpy.ndarray": {"values": [[0.0774477207917058, 0.09007073705762021, -0.3047220063293013, 0.2767624549859105, 0.20289396030628148], [-0.09902980483670844, -0.08061846818727973, 0.25853170692250554, -0.12086330214608881, -0.11085207725068251], [-0.061710792028537534, -0.06244151779954751, 0.12007654564862075, 0.0025063746277943564, -0.1567967473145572], [-0.002736973749965403, -0.009005721984277787, -0.00046003295909181354, -0.008550426472005344, -0.053754646789681754], [0.030987327588710728, 0.03972680066723246, -0.04997113350910248, 0.0035769411874962344, 0.1418257620585633]], "dtype": "float64"}}, {"numpy.ndarray": {"values": [[-0.06791915236220235, -0.11357937659088102, 0.37955392604054394, -0.21784979894046635, -0.22977695089938127], [0.11596642335411328, 0.14914956804629287, -0.13357508376686902, -0.008916063072034974, 0.3484153673774836], [0.011730817547426673, 0.019273800531955612, 0.0414265834586712, -0.035346588560982, 0.02316491010895583], [0.007328911075541707, 0.005536509132796312, -0.022456082950666856, 0.03611543477693187, -0.038514339001406585], [-0.010589894686551544, -0.010626616553723532, -0.000543105645661794, -0.025567476700160314, 0.04984888818929034]], "dtype": "float64"}}, {"numpy.ndarray": {"values": [[0.07276211427780357, -0.0157195576855797, 0.07428592814590385, -0.10369861539249735, 0.024753473688328077], [-0.05607105449779142, -0.08896207276035666, 0.27638225397521243, -0.2371125582838589, 0.07372294122306285], [-0.007391294007753122, -0.048741797963875705, -0.6291239733858526, 0.46816276521577677, 0.09251699239093385], [-0.007110224931878467, -0.05623317735898056, -0.36606658567620365, -0.013297798115225407, 0.6491033177492604], [0.002335515008556511, -0.021561151264484414, 0.09096243479437888, -0.38438823493062646, 0.6616477207948602]], "dtype": "float64"}}], "type": "classic"}}'
%% Cell type:code id: tags:
``` python
str3 = Serializer.data_to_yaml(sp.automaton)
```
%% Cell type:code id: tags:
``` python
print(str3)
```
%% Output
automaton:
final:
numpy.ndarray:
dtype: float64
values: [0.07757136847945045, -0.024220294003132026, -0.4468125366321221, 0.627732084089759,
-0.554674433356224]
initial:
numpy.ndarray:
dtype: float64
values: [-0.0004934419970497512, 0.0030634697107912346, -0.044073932015580415,
-0.1077770261654714, -0.0866391379316952]
nbL: 4
nbS: 5
transitions:
- numpy.ndarray:
dtype: float64
values:
- [0.04512120959511772, -0.24038969827844062, 0.34944999592135334, -0.2811680730534579,
-0.21402523377497645]
- [0.0692580056243761, -0.30062293462829204, 0.20641375368520157, -0.14960814319756124,
-0.5580573163749153]
- [0.02980115192176571, -0.13866480809160409, 0.18362212572805459, -0.20969545230657607,
-0.14481622025561292]
- [0.005699344003198349, -0.023385825120201414, -0.06600665373981851, 0.10749935271466007,
-0.15103654604159977]
- [-0.02008655193147911, 0.09026347555230492, -0.005525585655539262, -0.031355317090308935,
0.2432902242047721]
- numpy.ndarray:
dtype: float64
values:
- [0.0774477207917058, 0.09007073705762021, -0.3047220063293013, 0.2767624549859105,
0.20289396030628148]
- [-0.09902980483670844, -0.08061846818727973, 0.25853170692250554, -0.12086330214608881,
-0.11085207725068251]
- [-0.061710792028537534, -0.06244151779954751, 0.12007654564862075, 0.0025063746277943564,
-0.1567967473145572]
- [-0.002736973749965403, -0.009005721984277787, -0.00046003295909181354, -0.008550426472005344,
-0.053754646789681754]
- [0.030987327588710728, 0.03972680066723246, -0.04997113350910248, 0.0035769411874962344,
0.1418257620585633]
- numpy.ndarray:
dtype: float64
values:
- [-0.06791915236220235, -0.11357937659088102, 0.37955392604054394, -0.21784979894046635,
-0.22977695089938127]
- [0.11596642335411328, 0.14914956804629287, -0.13357508376686902, -0.008916063072034974,
0.3484153673774836]
- [0.011730817547426673, 0.019273800531955612, 0.0414265834586712, -0.035346588560982,
0.02316491010895583]
- [0.007328911075541707, 0.005536509132796312, -0.022456082950666856, 0.03611543477693187,
-0.038514339001406585]
- [-0.010589894686551544, -0.010626616553723532, -0.000543105645661794, -0.025567476700160314,
0.04984888818929034]
- numpy.ndarray:
dtype: float64
values:
- [0.07276211427780357, -0.0157195576855797, 0.07428592814590385, -0.10369861539249735,
0.024753473688328077]
- [-0.05607105449779142, -0.08896207276035666, 0.27638225397521243, -0.2371125582838589,
0.07372294122306285]
- [-0.007391294007753122, -0.048741797963875705, -0.6291239733858526, 0.46816276521577677,
0.09251699239093385]
- [-0.007110224931878467, -0.05623317735898056, -0.36606658567620365, -0.013297798115225407,
0.6491033177492604]
- [0.002335515008556511, -0.021561151264484414, 0.09096243479437888, -0.38438823493062646,
0.6616477207948602]
type: classic
%% Cell type:code id: tags:
``` python
Automaton.write(sp.automaton, train_file + ".json")
```
%% Cell type:code id: tags:
``` python
Ajs = Automaton.read(train_file + ".json")
```
%% Cell type:code id: tags:
``` python
Ajs.transitions
```
%% Output
[array([[ 0.04512121, -0.2403897 , 0.34945 , -0.28116807, -0.21402523],
[ 0.06925801, -0.30062293, 0.20641375, -0.14960814, -0.55805732],
[ 0.02980115, -0.13866481, 0.18362213, -0.20969545, -0.14481622],
[ 0.00569934, -0.02338583, -0.06600665, 0.10749935, -0.15103655],
[-0.02008655, 0.09026348, -0.00552559, -0.03135532, 0.24329022]]),
array([[ 0.07744772, 0.09007074, -0.30472201, 0.27676245, 0.20289396],
[-0.0990298 , -0.08061847, 0.25853171, -0.1208633 , -0.11085208],
[-0.06171079, -0.06244152, 0.12007655, 0.00250637, -0.15679675],
[-0.00273697, -0.00900572, -0.00046003, -0.00855043, -0.05375465],
[ 0.03098733, 0.0397268 , -0.04997113, 0.00357694, 0.14182576]]),
array([[-0.06791915, -0.11357938, 0.37955393, -0.2178498 , -0.22977695],
[ 0.11596642, 0.14914957, -0.13357508, -0.00891606, 0.34841537],
[ 0.01173082, 0.0192738 , 0.04142658, -0.03534659, 0.02316491],
[ 0.00732891, 0.00553651, -0.02245608, 0.03611543, -0.03851434],
[-0.01058989, -0.01062662, -0.00054311, -0.02556748, 0.04984889]]),
array([[ 0.07276211, -0.01571956, 0.07428593, -0.10369862, 0.02475347],
[-0.05607105, -0.08896207, 0.27638225, -0.23711256, 0.07372294],
[-0.00739129, -0.0487418 , -0.62912397, 0.46816277, 0.09251699],
[-0.00711022, -0.05623318, -0.36606659, -0.0132978 , 0.64910332],
[ 0.00233552, -0.02156115, 0.09096243, -0.38438823, 0.66164772]])]
%% Cell type:code id: tags:
``` python
Automaton.write(sp.automaton, train_file + ".yaml", "yaml")
```
%% Cell type:code id: tags:
``` python
Ayl = Automaton.read(train_file + ".yaml", "yaml")
```
%% Cell type:code id: tags:
``` python
Ayl.transitions
```
%% Output
[array([[ 0.04512121, -0.2403897 , 0.34945 , -0.28116807, -0.21402523],
[ 0.06925801, -0.30062293, 0.20641375, -0.14960814, -0.55805732],
[ 0.02980115, -0.13866481, 0.18362213, -0.20969545, -0.14481622],
[ 0.00569934, -0.02338583, -0.06600665, 0.10749935, -0.15103655],
[-0.02008655, 0.09026348, -0.00552559, -0.03135532, 0.24329022]]),
array([[ 0.07744772, 0.09007074, -0.30472201, 0.27676245, 0.20289396],
[-0.0990298 , -0.08061847, 0.25853171, -0.1208633 , -0.11085208],
[-0.06171079, -0.06244152, 0.12007655, 0.00250637, -0.15679675],
[-0.00273697, -0.00900572, -0.00046003, -0.00855043, -0.05375465],
[ 0.03098733, 0.0397268 , -0.04997113, 0.00357694, 0.14182576]]),
array([[-0.06791915, -0.11357938, 0.37955393, -0.2178498 , -0.22977695],
[ 0.11596642, 0.14914957, -0.13357508, -0.00891606, 0.34841537],
[ 0.01173082, 0.0192738 , 0.04142658, -0.03534659, 0.02316491],
[ 0.00732891, 0.00553651, -0.02245608, 0.03611543, -0.03851434],
[-0.01058989, -0.01062662, -0.00054311, -0.02556748, 0.04984889]]),
array([[ 0.07276211, -0.01571956, 0.07428593, -0.10369862, 0.02475347],
[-0.05607105, -0.08896207, 0.27638225, -0.23711256, 0.07372294],
[-0.00739129, -0.0487418 , -0.62912397, 0.46816277, 0.09251699],
[-0.00711022, -0.05623318, -0.36606659, -0.0132978 , 0.64910332],
[ 0.00233552, -0.02156115, 0.09096243, -0.38438823, 0.66164772]])]
%% Cell type:code id: tags:
``` python
from splearn import Hankel
Hankel.write(sp.hankel, train_file + "_hankel.json", "json")
```
%% Cell type:code id: tags:
``` python
Hb = Hankel.read(train_file + "_hankel.json", "json")
```
%% Cell type:code id: tags:
``` python
# Comparing Hankel instances prints the "Hankel equality check" message below;
# then check that the json- and yaml-read automata agree transition by transition.
Hb == sp.hankel
import numpy as np
for i in range(4):
    print(np.array_equal(Ajs.transitions[i], Ayl.transitions[i]))
```
%% Output
Hankel equality check
True
True
True
True
%% Cell type:code id: tags:
``` python
Hb.lhankel
```
%% Output
[<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 8251 stored elements in Dictionary Of Keys format>,
<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 2199 stored elements in Dictionary Of Keys format>,
<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 2122 stored elements in Dictionary Of Keys format>,
<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 1091 stored elements in Dictionary Of Keys format>,
<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 3489 stored elements in Dictionary Of Keys format>]
%% Cell type:code id: tags:
``` python
sp.hankel.lhankel
```
%% Output
[<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 8251 stored elements in Dictionary Of Keys format>,
<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 2199 stored elements in Dictionary Of Keys format>,
<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 2122 stored elements in Dictionary Of Keys format>,
<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 1091 stored elements in Dictionary Of Keys format>,
<1310x3308 sparse matrix of type '<class 'numpy.float64'>'
with 3489 stored elements in Dictionary Of Keys format>]
%% Cell type:code id: tags:
``` python
import scipy.sparse as sps
import numpy as np
```
%% Cell type:code id: tags:
``` python
# Mimic the serializer's DOK encoding: each (i, j) key becomes a "(i,j)" string.
data = sp.hankel.lhankel[0]
k_str = "({0:d},{1:d})"
dico = dict(zip([k_str.format(i, j) for (i, j) in data.keys()], data.values()))
from splearn import Hankel
Hankel.write(sp.hankel, train_file + "_hankel.json", "json")
```
%% Cell type:code id: tags:
``` python
# Rebuild the DOK matrix from the string-keyed dictionary.
dok = sps.dok_matrix(data.shape, dtype=data.dtype)
for k, val in dico.items():
    k = k.replace("(", "").replace(")", "")
    ind1, ind2 = k.split(",")
    dok[(int(ind1), int(ind2))] = val
Hb = Hankel.read(train_file + "_hankel.json", "json")
```
%% Cell type:code id: tags:
``` python
print(dok)
Hb == sp.hankel
```
%% Output
(0, 2950) 6.0
(1, 1141) 6.0
(2, 820) 6.0
(9, 192) 6.0
(35, 75) 6.0
(123, 12) 6.0
(358, 4) 6.0
(832, 0) 6.0
(0, 20) 551.0
(1, 4) 551.0
(5, 0) 551.0
(0, 837) 9.0
(1, 212) 9.0
(4, 33) 9.0
(14, 17) 9.0
(56, 1) 9.0
(183, 0) 9.0
(0, 254) 7.0
(1, 25) 7.0
(2, 9) 7.0
(7, 1) 7.0
(26, 0) 7.0
(0, 3160) 5.0
(1, 1601) 5.0
(5, 323) 5.0
: :
(607, 109) 1.0
(1270, 48) 1.0
(34, 2382) 1.0
(117, 1262) 1.0
(336, 580) 1.0
(761, 265) 1.0
(464, 3272) 1.0
(1015, 1821) 1.0
(338, 2911) 1.0
(770, 1090) 1.0
(0, 2926) 1.0
(1, 1113) 1.0
(2, 767) 1.0
(9, 131) 1.0
(34, 70) 1.0
(119, 7) 1.0
(343, 3) 1.0
(786, 0) 1.0
(1073, 2555) 1.0
(0, 825) 1.0
(1, 197) 1.0
(3, 80) 1.0
(13, 17) 1.0
(53, 1) 1.0
(175, 0) 1.0
True
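%% Cell type:markdown id: tags:
The two cells above spell out the scheme used for DOK matrices: each (i, j) key becomes a "(i,j)" string so the dictionary is JSON-serializable. A self-contained round trip of that scheme on a toy matrix (illustrative sketch, independent of splearn):
%% Cell type:code id: tags:
``` python
import json
import numpy as np
import scipy.sparse as sps

toy = sps.dok_matrix((2, 3), dtype=np.float64)
toy[0, 1] = 2.0
toy[1, 2] = 5.0
# Encode: tuple keys -> "(i,j)" strings.
payload = json.dumps({"({0:d},{1:d})".format(i, j): v for (i, j), v in toy.items()})
# Decode: parse the strings back into indices.
back = sps.dok_matrix(toy.shape, dtype=toy.dtype)
for k, v in json.loads(payload).items():
    i, j = (int(x) for x in k.strip("()").split(","))
    back[i, j] = v
(back.toarray() == toy.toarray()).all()
```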
%% Cell type:code id: tags:
``` python
from splearn import Hankel
Hankel.write(sp.hankel, train_file + "_hankel.yaml", "yaml")
```
%% Cell type:code id: tags:
``` python
Hb = Hankel.read(train_file + "_hankel.yaml", "yaml")
```
%% Cell type:code id: tags:
``` python
Hb == sp.hankel
```
%% Output
Hankel equality check
True
%% Cell type:code id: tags:
``` python
yamlstr = "- scipy.dok_matrix:\n dtype: float64\n shape:\n tuple: [1, 1]\n values: {'(0,0)': 1.0}"
```
%% Cell type:code id: tags:
``` python
print(yamlstr)
```
%% Output
- scipy.dok_matrix:
dtype: float64
shape:
tuple: [1, 1]
values: {'(0,0)': 1.0}
%% Cell type:code id: tags:
``` python
Serializer.yaml_to_data(yamlstr)
```
%% Output
[<1x1 sparse matrix of type '<class 'numpy.float64'>'
with 1 stored elements in Dictionary Of Keys format>]
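%% Cell type:markdown id: tags:
The same string can also be unpacked by hand, which makes the YAML convention explicit. A sketch using PyYAML and scipy directly, assuming only the `scipy.dok_matrix` / `tuple` wrappers visible above:
%% Cell type:code id: tags:
``` python
import yaml
import scipy.sparse as sps

entry = yaml.safe_load(yamlstr)[0]["scipy.dok_matrix"]
shape = tuple(entry["shape"]["tuple"])
mat = sps.dok_matrix(shape, dtype=entry["dtype"])
for key, val in entry["values"].items():
    i, j = (int(x) for x in key.strip("()").split(","))
    mat[i, j] = val
mat.toarray()
```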
%% Cell type:code id: tags:
``` python
# `dy` was defined in a cell not shown here; presumably something like
# dy = Serializer.data_to_yaml(yamlstr), which yaml-quotes the whole string.
dy
```
%% Output
'"- scipy.dok_matrix:\\n dtype: float64\\n shape:\\n tuple: [1, 1]\\n values:\\\n \\ {\'(0,0)\': 1.0}"\n'
%% Cell type:code id: tags:
``` python
issubclass(TypeError, ValueError)
```
%% Output
False
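%% Cell type:markdown id: tags:
Because `TypeError` is not a subclass of `ValueError`, an `except ValueError` clause will not intercept the `TypeError` raised by, for instance, the `DataSample.data` setter in the diff below. A minimal illustration:
%% Cell type:code id: tags:
``` python
try:
    raise TypeError("sample should be a SplearnArray.")
except ValueError:
    print("caught as ValueError")  # never reached
except TypeError:
    print("caught as TypeError")   # this branch runs
```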
%% Cell type:code id: tags:
``` python
"The input data string (" + data_str + ") should contain the following keys : \"" + '\", \"'.join(keys) + "\""
```
......
from splearn.datasets.base import *
from splearn.datasets.data_sample import DataSample, Splearn_array
\ No newline at end of file
from splearn.datasets.data_sample import DataSample, SplearnArray
\ No newline at end of file
......@@ -3,38 +3,25 @@ import numpy as np
from splearn.datasets.data_sample import DataSample
def load_data_sample(adr, type='SPiCe', pickle=False):
def load_data_sample(adr, filetype='SPiCe', pickle=False):
"""Load a sample from file and returns a dictionary
(word,count)
- Input:
:param lrows: number or list of rows,
a list of strings if partial=True;
otherwise, based on pref if version="classic" or
"prefix", fact otherwise
:type lrows: int or list of int
:param lcolumns: number or list of columns
a list of strings if partial=True ;
otherwise, based on suff if version="classic" or "suffix",
fact otherwise
:type lcolumns: int or list of int
:param string version: (default = "classic") version name
:param boolean partial: (default value = False) build of partial
if True partial dictionaries are loaded based
on nrows and lcolumns
:param str adr: address and name of the loaded file
:param str filetype: (default value = 'SPiCe') indicates
the structure of the file. Should be either 'SPiCe' or 'Pautomac'
:param boolean pickle: if enabled, a pickle file is created from the loaded file. Default is False.
- Output:
:returns: nbL , nbEx , dsample , dpref , dsuff , dfact
:rtype: int , int , dict , dict , dict , dict
:returns: corresponding DataSample
:rtype: DataSample
:Example:
Let's say you are interested in the samples 10, 25, and 50, and want to
know their class name.
>>> from splearn.datasets.base import load_data_sample
>>> from splearn.tests.datasets.get_dataset_path import get_dataset_path
>>> train_file = '3.pautomac_light.train' # '4.spice.train'
......@@ -54,13 +41,13 @@ def load_data_sample(adr, type='SPiCe', pickle=False):
"""
if type == 'SPiCe' or type == 'Pautomac':
if filetype == 'SPiCe' or filetype == 'Pautomac':
data = _load_file_doublelecture(adr=adr, pickle=pickle)
return DataSample(data=data)
def _load_file_doublelecture(adr, pickle=False):
dsample = {} # dictionary (word,count)
nb_sample, max_length = _read_dimension(adr=adr)
_, max_length = _read_dimension(adr=adr)
f = open(adr, "r")
line = f.readline()
l = line.split()
......@@ -107,49 +94,6 @@ def _read_dimension(adr):
"do not match number of samples " + str(nb_sample))
return nb_sample , max_length
# def _load_file_1lecture(adr, pickle=False):
# dsample = {} # dictionary (word,count)
# f = open(adr, "r")
# line = f.readline()
# l = line.split()
# nbEx = int(l[0])
# nbL = int(l[1])
# line = f.readline()
# data1 = np.zeros((0,0))
# length = 0
# while line:
# l = line.split()
# # w = () if int(l[0]) == 0 else tuple([int(x) for x in l[1:]])
# # dsample[w] = dsample[w] + 1 if w in dsample else 1
# # handling of the empty word for prefixes, suffixes and factors
# w = [] if int(l[0]) == 0 else [int(x) for x in l[1:]]
# word = np.array(w, ndmin=2, dtype=np.uint32)
# diff = abs(int(l[0]) - length)
# if len(w) > length and not np.array_equal(data1, np.zeros((0,0))):
# data1 = _add_empty(data1, diff)
# elif word.shape[0] < length and not np.array_equal(data1, np.zeros((0,0))):
# word = _add_empty(word, diff)
#
# if np.array_equal(data1, np.zeros((0,0))):
# data1 = word
# else:
# data1 = np.concatenate((data1, word), axis=0)
# length = data1.shape[1]
# line = f.readline()
#
# f.close()
# if pickle:
# _create_pickle_files(adr=adr, dsample=dsample)
# return nbL, nbEx, data1
# def _add_empty(data, diff):
# empty = np.zeros((data.shape[0], diff))
# empty += -1
# data = np.concatenate((data, empty), axis=1)
# return data
def _create_pickle_files(self, adr, dsample):
f = open(adr + ".sample.pkl", "wb")
pickle.dump(dsample, f)
......
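The hunk above renames the `type` argument of `load_data_sample` to `filetype`, avoiding the shadowing of the `type` builtin. A hedged usage sketch against the new signature (file name taken from the notebook above):

``` python
from splearn.datasets.base import load_data_sample
from splearn.tests.datasets.get_dataset_path import get_dataset_path

# In this commit both 'SPiCe' and 'Pautomac' route to the same
# double-read loader, so either value works here.
data = load_data_sample(adr=get_dataset_path('3.pautomac_light.train'),
                        filetype='Pautomac')
print(data.nbL, data.nbEx)
```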
......@@ -33,29 +33,58 @@
#
#
# ######### COPYRIGHT #########
"""This module contains the DataSample class and Splearn_array class
The DataSample class encapsulates a sample 's components
nbL and nbEx numbers,
Splearn_array class inherit from numpy ndarray and contains a 2d data ndarray
with the shape
==== ==== ==== ==== ====
x x x x -1
x x x x x
x x -1 -1 -1
x -1 -1 -1 -1
-1 -1 -1 -1 -1
==== ==== ==== ==== ====
where -1 a indicates a empty cell,
the number nbL and nbEx and , the fourth dictionaries for sample,
prefix, suffix and factor where they are computed
"""This module contains the DataSample class and SplearnArray class.
"""
import numpy as np
class Splearn_array(np.ndarray):
"""Splearn_array inherit from numpy ndarray
class SplearnArray(np.ndarray):
"""Sample data array used by the splearn spectral estimation
**SplearnArray** class inherit from numpy ndarray as a 2d data ndarray.
Example of a possible 2d shape:
+---+---+---+---+---+
| 0| 1| 0| 3| -1|
+---+---+---+---+---+
| 0| 0| 3| 3| 1|
+---+---+---+---+---+
| 1| 1| -1| -1| -1|
+---+---+---+---+---+
| 5| -1| -1| -1| -1|
+---+---+---+---+---+
| -1| -1| -1| -1| -1|
+---+---+---+---+---+
is equivalent to:
- word (0103) or abad
- word (00331) or aaddb
- word (11) or bb
- word (5) or f
- word () or empty
Each row represents a word of the sample. Words are written with integer letters (0->a, 1->b, 2->c ...) and
-1 marks the end of the word. The number of rows is the total number of words in the sample (= nbEx) and the number of columns
is given by the size of the longest word. Notice that words are not deduplicated:
if a word appears twice in the sample, it is counted as two different examples.
The DataSample class also encapsulates the sample's parameters 'nbL' and 'nbEx' (the number of letters in the alphabet and
the number of examples) and the four dictionaries 'sample', 'prefix', 'suffix' and 'factor' that are populated during the fit
estimations.
- Input:
:param nd.array input_array: input ndarray that will be converted into **SplearnArray**
:param int nbL: the number of letters
:param int nbEx: total number of examples.
:param dict sample: the keys are the words and the values are the number of times each appears in the sample.
:param dict pref: the keys are the prefixes and the values are the number of times each appears in the sample.
:param dict suff: the keys are the suffixes and the values are the number of times each appears in the sample.
:param dict fact: the keys are the factors and the values are the number of times each appears in the sample.
:Example:
......@@ -66,7 +95,7 @@ class Splearn_array(np.ndarray):
>>> print(data.__class__)
>>> data.data
<class 'splearn.datasets.data_sample.DataSample'>
GSplearn_array([[ 3., 0., 3., ..., -1., -1., -1.],
SplearnArray([[ 3., 0., 3., ..., -1., -1., -1.],
[ 3., 3., -1., ..., -1., -1., -1.],
[ 3., 2., 0., ..., -1., -1., -1.],
...,
......@@ -96,150 +125,15 @@ class Splearn_array(np.ndarray):
self.suff = getattr(obj, 'suff', None)
self.fact = getattr(obj, 'fact', None)
# def select_rows(self, nb_rows_max=1000, version='classic'):
# """define lrows
#
# - Input:
#
# :param int nb_rows_max: (default = 1000) number of maximum rows
# :param string version: (default = "classic") version name
#
# - Output:
#
# :returns: list lrows, list of rows
# :rtype: list
# """
# lRows = [] # list to return
# nbRows = 0
# lLeafs = [([], self.nbEx )]
# # pref[()] the list of (frontier prefix, occurrence count) pairs,
# # initialised with the empty prefix
# if version == 'classic':
# while lLeafs and nbRows < nb_rows_max:
# lastWord = lLeafs.pop()[
# 0] # the most frequent frontier prefix
# lRows.append(tuple(lastWord))
# nbRows += 1
# for i in range(self.nbL):
# newWord = lastWord + [i] # successor of lastWord
# tnewWord = tuple(newWord) # associated tuple
# if tnewWord in self.pref:
# # add a new frontier prefix
# lLeafs.append((newWord, self.pref[tnewWord]))
# lLeafs = sorted(lLeafs, key=lambda x: x[1])
# elif version == 'prefix':
# while lLeafs and nbRows < nb_rows_max:
# lastWord = lLeafs.pop()[
# 0] # the most frequent frontier prefix
# lRows.append(tuple(lastWord))
# nbRows += 1
# for i in range(self.nbL):
# newWord = lastWord + [i] # successeur de lastword
# tnewWord = tuple(newWord) # tuple associé
# if tnewWord in self.pref:
# # ajout d'un nouveau prefixe frontière
# nb = 0
# for u in self.sample:
# if tnewWord <= u:
# nb += self.sample[u] * (
# len(u) - len(tnewWord) + 1)
# lLeafs.append((newWord, nb))
# lLeafs = sorted(lLeafs, key=lambda x: x[1])
# elif version == 'factor':
# while lLeafs and nbRows < nb_rows_max:
# lastWord = lLeafs.pop()[
# 0] # the most frequent frontier prefix
# lRows.append(tuple(lastWord))
# nbRows += 1
# for i in range(self.nbL):
# newWord = lastWord + [i] # successor of lastWord
# tnewWord = tuple(newWord) # associated tuple
# if tnewWord in self.fact:
# # add a new frontier prefix
# nb = 0
# lw = len(tnewWord)
# for u in self.sample:
# if len(u) >= lw:
# for i in range(lw, len(u) + 1):
# if u[:i][-lw:] == tnewWord:
# nb += self.sample[u] * (len(u) - i + 1)
# lLeafs.append((newWord, nb))
# lLeafs = sorted(lLeafs, key=lambda x: x[1])
# # print(lLeafs)
# return lRows
# def select_columns(self, nb_columns_max=1000, version='classic'):
# """define lcolumns
#
# - Input:
#
# :param int nb_columns_max: (default = 1000) number of maximum columns
# :param string version: (default = "classic") version name
#
# - Output:
#
# :returns:list lcolumns, list of columns
# :rtype: list
# """
# lColumns = [] # list to return
# lLeafs = [([], self.nbEx)] # the list of (frontier suffix,
# # occurrence count) pairs, initialised with the empty suffix
#
# nbColumns = 0
# if version == 'classic':
# while lLeafs and nbColumns < nb_columns_max:
# lastWord = lLeafs.pop()[
# 0] # the most frequent frontier suffix
# lColumns.append(tuple(lastWord))
# nbColumns += 1
# for i in range(self.nbL):
# newWord = lastWord + [i] # successor of lastWord
# tnewWord = tuple(newWord) # associated tuple
# if tnewWord in self.suff:
# # add a new frontier suffix
# lLeafs.append((newWord, self.suff[tnewWord]))
# lLeafs = sorted(lLeafs, key=lambda x: x[
# 1]) # most frequent suffix last
# # print(lLeafs)
# elif version == 'prefix':
# while lLeafs and nbColumns < nb_columns_max:
# lastWord = lLeafs.pop()[
# 0] # the most frequent frontier prefix
# lColumns.append(tuple(lastWord))
# nbColumns += 1
# for i in range(self.nbL):
# newWord = lastWord + [i] # successor of lastWord
# tnewWord = tuple(newWord) # associated tuple
# if tnewWord in self.fact:
# # add a new frontier suffix
# lLeafs.append((newWord, self.fact[tnewWord]))
# lLeafs = sorted(lLeafs, key=lambda x: x[1])
# elif version == 'factor':
# while lLeafs and nbColumns < nb_columns_max:
# lastWord = lLeafs.pop()[
# 0] # the most frequent frontier prefix
# lColumns.append(tuple(lastWord))
# nbColumns += 1
# for i in range(self.nbL):
# newWord = lastWord + [i] # successor of lastWord
# tnewWord = tuple(newWord) # associated tuple
# if tnewWord in self.fact:
# # add a new frontier prefix
# nb = 0
# lw = len(tnewWord)
# for u in self.sample:
# if len(u) >= lw:
# for i in range(lw, len(u) + 1):
# if u[:i][-lw:] == tnewWord:
# nb += self.sample[u] * (i - lw + 1)
# lLeafs.append((newWord, nb))
# lLeafs = sorted(lLeafs, key=lambda x: x[1])
# # print(lLeafs)
# return lColumns
class DataSample(dict):
""" A DataSample instance
- Input:
:param tuple data: a tuple of (int, int, numpy.array) for the corresponding three elements
(nbL, nbEx, data) where nbL is the number of letters in the alphabet, nbEx is the number
of samples and data is the 2d data array
:Example:
>>> from splearn.datasets.base import load_data_sample
......@@ -254,46 +148,20 @@ class DataSample(dict):
5000
>>> data.data
- Input:
:param string adr: adresse and name of the loaden file
:param string type: (default value = 'SPiCe') indicate
the structure of the file
:param lrows: number or list of rows,
a list of strings if partial=True;
otherwise, based on self.pref if version="classic" or
"prefix", self.fact otherwise
:type lrows: int or list of int
:param lcolumns: number or list of columns
a list of strings if partial=True ;
otherwise, based on self.suff if version="classic" or "suffix",
self.fact otherwise
:type lcolumns: int or list of int
:param string version: (default = "classic") version name
:param boolean partial: (default value = False) build of partial
"""
def __init__(self, data=None, type='SPiCe', **kwargs):
# Size of the alphabet
self._nbL = 0
# Number of samples
self._nbEx = 0
def __init__(self, data=None, **kwargs):
# The dictionary that contains the sample
self._data = Splearn_array(np.zeros((0,0)))
self._data = SplearnArray(np.zeros((0,0)))
if data is not None:
self.nbL = data[0]
self.nbEx = data[1]
self.data = Splearn_array(data[2], nbL=data[0], nbEx=data[1])
self.data = SplearnArray(data[2], nbL=data[0], nbEx=data[1])
super(DataSample, self).__init__(kwargs)
@property
def nbL(self):
"""Number of letters"""
return self._nbL
return self.data.nbL
@nbL.setter
def nbL(self, nbL):
......@@ -302,13 +170,12 @@ class DataSample(dict):
if nbL < 0:
raise ValueError("The size of the alphabet should " +
"an integer >= 0")
self._nbL = nbL
self.data.nbL = nbL
@property
def nbEx(self):
"""Number of examples"""
return self._nbEx
return self.data.nbEx
@nbEx.setter
def nbEx(self, nbEx):
......@@ -317,21 +184,17 @@ class DataSample(dict):
if nbEx < 0:
raise ValueError("The number of examples should be " +
" an integer >= 0")
self._nbEx = nbEx
self.data.nbEx = nbEx
@property
def data(self):
"""Splearn_array"""
"""SplearnArray"""
return self._data
@data.setter
def data(self, data):
if isinstance(data, (Splearn_array, np.ndarray, np.generic)):
if isinstance(data, (SplearnArray, np.ndarray, np.generic)):
self._data = data
else:
raise TypeError("sample should be a Splearn_array.")
raise TypeError("sample should be a SplearnArray.")
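The new `SplearnArray` docstring above fully specifies the encoding: a 2d ndarray of integer letters, right-padded with -1. A minimal construction sketch against the new class (values are illustrative; the constructor call matches the one used in `DataSample.__init__` above):

``` python
import numpy as np
from splearn.datasets.data_sample import SplearnArray

# Three words over a 4-letter alphabet: (0,1,0,3), (1,1) and the empty
# word, padded with -1 as in the docstring's table.
words = np.array([[ 0,  1,  0,  3],
                  [ 1,  1, -1, -1],
                  [-1, -1, -1, -1]])
X = SplearnArray(words, nbL=4, nbEx=3)
print(X.nbL, X.nbEx)
```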
......@@ -44,21 +44,9 @@ import numpy as np
class Hankel(object):
""" A Hankel instance , compute the list of Hankel matrices
:Example:
>>> from splearn import Learning, Hankel , Spectral
>>> train_file = '0.spice.train'
>>> pT = load_data_sample(adr=train_file)
>>> sp = Spectral()
>>> sp.fit(X=pT.data)
>>> lhankel = Hankel( sample_instance=pT.sample,
>>> nbL=pT.nbL, nbEx=pT.nbEx,
>>> lrows=6, lcolumns=6, version="classic",
>>> partial=True, sparse=True, mode_quiet=True).lhankel
- Input:
:param Splearn_array sample_instance: instance of Splearn_array
:param SplearnArray sample_instance: instance of SplearnArray
:param lrows: number or list of rows,
a list of strings if partial=True;
otherwise, based on self.pref if version="classic" or
......@@ -79,6 +67,19 @@ class Hankel(object):
*sample_instance* or *lhankel* must not be None. If *sample_instance* is given,
the **Hankel** instance is built directly from the sample dictionary,
else it is deduced from the *lhankel* list of matrices.
:Example:
>>> from splearn import Learning, Hankel , Spectral
>>> train_file = '0.spice.train'
>>> pT = load_data_sample(adr=train_file)
>>> sp = Spectral()
>>> sp.fit(X=pT.data)
>>> lhankel = Hankel( sample_instance=pT.sample,
>>> nbL=pT.nbL, nbEx=pT.nbEx,
>>> lrows=6, lcolumns=6, version="classic",
>>> partial=True, sparse=True, mode_quiet=True).lhankel
"""
def __init__(
......@@ -177,6 +178,8 @@ class Hankel(object):
@property
def build_from_sample(self):
"""Boolean that indicates if the matrices have been build form sample or not
(directly build from an Automaton in this case) """
return self._build_from_sample
@build_from_sample.setter
......@@ -193,10 +196,10 @@ class Hankel(object):
- Input:
:param dict sample: sample dictionary
:param dict pref: prefix dictionary
:param dict suff: suffix dictionary
:param dict fact: factor dictionary
:param dict sample: the keys are the words and the values are the number of times each appears in the sample.
:param dict pref: the keys are the prefixes and the values are the number of times each appears in the sample.
:param dict suff: the keys are the suffixes and the values are the number of times each appears in the sample.
:param dict fact: the keys are the factors and the values are the number of times each appears in the sample.
:param lrows: number or list of rows,
a list of strings if partial=True;
otherwise, based on self.pref if version="classic" or
......
......@@ -41,7 +41,10 @@
from __future__ import division, print_function
import numpy as np
import math
from splearn.datasets.data_sample import Splearn_array
import threading
lock = threading.Lock()
from splearn.datasets.data_sample import SplearnArray
from splearn.hankel import Hankel
from sklearn.base import BaseEstimator
from sklearn.utils import check_array
......@@ -51,29 +54,6 @@ import warnings
class Spectral(BaseEstimator):
"""A Spectral estimator instance
:Example:
>>> from splearn.spectral import Spectral
>>> sp = Spectral()
>>> sp.set_params(partial=True, lcolumns=6, lrows=6, smooth_method='trigram')
Spectral(lcolumns=6, lrows=6, mode_quiet=False, partial=True, rank=5,
smooth_method='trigram', sparse=True, version='classic')
>>> sp.fit(data.data)
Start Hankel matrix computation
End of Hankel matrix computation
Start Building Automaton from Hankel matrix
End of Automaton computation
Spectral(lcolumns=6, lrows=6, partial=True, rank=5, smooth_method='trigram', sparse=True, version='classic')
>>> sp.automaton.initial
array([-0.00049249, 0.00304676, -0.04405996, -0.10765322, -0.08660063])
>>> sp.predict(data.data)
array([ 4.38961058e-04, 1.10616861e-01, 1.35569353e-03, ...,
4.66041996e-06, 4.68177275e-02, 5.24287604e-20])
>>> sp.loss(data.data, normalize=True)
-10.530029936056017
>>> sp.score(data.data)
10.530029936056017
- Input:
:param int rank: the rank of the truncated singular value decomposition of the Hankel matrix
......@@ -106,6 +86,28 @@ class Spectral(BaseEstimator):
:param boolean mode_quiet: (default value = False) True for no
output message.
:Example:
>>> from splearn.spectral import Spectral
>>> sp = Spectral()
>>> sp.set_params(partial=True, lcolumns=6, lrows=6, smooth_method='trigram')
Spectral(lcolumns=6, lrows=6, mode_quiet=False, partial=True, rank=5,
smooth_method='trigram', sparse=True, version='classic')
>>> sp.fit(data.data)
Start Hankel matrix computation
End of Hankel matrix computation
Start Building Automaton from Hankel matrix
End of Automaton computation
Spectral(lcolumns=6, lrows=6, partial=True, rank=5, smooth_method='trigram', sparse=True, version='classic')
>>> sp.automaton.initial
array([-0.00049249, 0.00304676, -0.04405996, -0.10765322, -0.08660063])
>>> sp.predict(data.data)
array([ 4.38961058e-04, 1.10616861e-01, 1.35569353e-03, ...,
4.66041996e-06, 4.68177275e-02, 5.24287604e-20])
>>> sp.loss(data.data, normalize=True)
-10.530029936056017
>>> sp.score(data.data)
10.530029936056017
"""
def __init__(self, rank=5, lrows=7, lcolumns=7,
......@@ -172,8 +174,7 @@ class Spectral(BaseEstimator):
self.smooth = 0
def set_params(self, **parameters):
"""
set the values of Spectral estimator parameters
"""set the values of Spectral estimator parameters
- Output:
......@@ -186,12 +187,12 @@ class Spectral(BaseEstimator):
self._rule_smooth_method(value)
return self
def fit(self, X, y=None): #, gram
def fit(self, X, y=None):
"""Fit the model
- Input:
:param Splearn_array X: object of shape [n_samples,n_features]
:param SplearnArray X: object of shape [n_samples,n_features]
Training data
:param ndarray y: (default value = None) not used by Spectral estimator
numpy array of shape [n_samples] Target values
......@@ -206,11 +207,11 @@ class Spectral(BaseEstimator):
"""
check_array(X)
if not isinstance(X, Splearn_array):
if not isinstance(X, SplearnArray):
self._hankel = None
self._automaton = None
return self
X = self._polulate_dictionnaries(X)
X = self.polulate_dictionnaries(X)
self._hankel = Hankel(sample_instance=X,
lrows=self.lrows, lcolumns=self.lcolumns,
version=self.version,
......@@ -232,8 +233,108 @@ class Spectral(BaseEstimator):
dsample[w] = dsample[w] + 1 if w in dsample else 1
return dsample
def _polulate_dictionnaries(self, X):
if not isinstance(X, Splearn_array):
# def _populate_new_word(self, X, i, lrowsmax=None, version_rows_int=None,
# lcolumnsmax=None, version_columns_int=None, lmax=None):
# w = X[i, :]
# w = w[w >= 0]
# w = tuple([int(x) for x in w[0:]])
# with lock:
# X.sample[w] = X.sample.setdefault(w, 0) + 1
# if self.version == "prefix" or self.version == "classic":
# # empty word treatment for the prefix, suffix, and factor dictionaries
# with lock:
# X.pref[()] = X.pref[()] + 1 if () in X.pref else 1
# if self.version == "suffix" or self.version == "classic":
# with lock:
# X.suff[()] = X.suff[()] + 1 if () in X.suff else 1
# if self.version == "factor" or self.version == "suffix" \
# or self.version == "prefix":
# with lock:
# X.fact[()] = X.fact[()] + len(w) + 1 if () in X.fact else len(w) + 1
#
# if self.partial:
# for i in range(len(w)):
# if self.version == "classic":
# if (version_rows_int is True and
# i + 1 <= lrowsmax) or \
# (version_rows_int is False and
# w[:i + 1] in self.lrows):
# with lock:
# X.pref[w[:i + 1]] = \
# X.pref[w[:i + 1]] + 1 if w[:i + 1] in X.pref else 1
# if (version_columns_int is True and i + 1 <= lcolumnsmax) or \
# (version_columns_int is False and w[-( i + 1):] in self.lcolumns):
# with lock:
# X.suff[w[-(i + 1):]] = X.suff[w[-(i + 1):]] + 1 if \
# w[-(i + 1):] in X.suff else 1
# if self.version == "prefix":
# # dictionaries dpref is populated until
# # lmax = lrows + lcolumns
# # dictionaries dfact is populated until lcolumns
# if ((version_rows_int is True or
# version_columns_int is True) and
# i + 1 <= lmax) or \
# (version_rows_int is False and
# (w[:i + 1] in self.lrows)) or \
# (version_columns_int is False and
# (w[:i + 1] in self.lcolumns)):
# X.pref[w[:i + 1]] = X.pref[w[:i + 1]] + 1 \
# if w[:i + 1] in X.pref else 1
# for j in range(i + 1, len(w) + 1):
# if (version_columns_int is True and (
# j - i) <= lmax) or \
# (version_columns_int is False and
# (w[i:j] in self.lcolumns)):
# X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \
# if w[i:j] in X.fact else 1
# if self.version == "suffix":
# if ((version_rows_int is True or
# version_columns_int is True) and
# i <= lmax) or \
# (version_rows_int is False and
# (w[-(i + 1):] in self.lrows)) or \
# (version_columns_int is False and
# (w[-(i + 1):] in self.lcolumns)):
# X.suff[w[-(i + 1):]] = X.suff[w[-(i + 1):]] + 1 \
# if w[-(i + 1):] in X.suff else 1
# for j in range(i + 1, len(w) + 1):
# if (version_rows_int is True and (
# j - i) <= lmax) or \
# (version_rows_int is False and
# (w[i:j] in self.lrows)):
# X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \
# if w[i:j] in X.fact else 1
# if self.version == "factor":
# for j in range(i + 1, len(w) + 1):
# if ((version_rows_int is True or
# version_columns_int is True) and
# (j - i) <= lmax) or \
# (version_rows_int is False and
# (w[i:j] in self.lrows)) or \
# (version_columns_int is False and
# (w[i:j] in self.lcolumns)):
# X.fact[w[i:j]] = \
# X.fact[w[i:j]] + 1 if w[i:j] in X.fact else 1
#
# else: # not partial
# for i in range(len(w)):
# X.pref[w[:i + 1]] = X.pref[w[:i + 1]] + 1 \
# if w[:i + 1] in X.pref else 1
# X.suff[w[i:]] = X.suff[w[i:]] + 1 if w[i:] in X.suff else 1
# for j in range(i + 1, len(w) + 1):
# X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \
# if w[i:j] in X.fact else 1
def polulate_dictionnaries(self, X):
"""Populates the *sample*, *pref*, *suff*, *fact* dictionnaries of X
- Input:
:param SplearnArray X: object of shape [n_samples,n_features]
Training data
"""
if not isinstance(X, SplearnArray):
return X
dsample = {} # dictionary (word,count)
dpref = {} # dictionary (prefix,count)
......@@ -459,7 +560,7 @@ class Spectral(BaseEstimator):
- Input:
:param Splearn_array X : of shape data shape = (n_samples, n_features)
:param SplearnArray X : of shape data shape = (n_samples, n_features)
Samples.
......@@ -489,7 +590,7 @@ class Spectral(BaseEstimator):
- Input:
:param Splearn_array X : Samples, data shape = (n_samples, n_features)
:param SplearnArray X : Samples, data shape = (n_samples, n_features)
- Output:
......@@ -537,18 +638,17 @@ class Spectral(BaseEstimator):
return Y
def loss(self, X, y=None, normalize=True):
"""
Log probability using the Spectral model
"""Log probability using the Spectral model
- Input:
:param Splearn_array X : of shape data shape = (n_samples, n_features)
:param SplearnArray X: of shape data shape = (n_samples, n_features)
Samples. X is validation data.
:param ndarray y: (default value = None)
numpy array of shape [n_samples] Target values,
is the ground truth target for X (in the supervised case) or
None (in the unsupervised case)
:param boolean normalize (default value = True) calculation are
:param boolean normalize: (default value = True) if True, the result
is normalized by the number of samples
- Output:
......@@ -584,7 +684,7 @@ class Spectral(BaseEstimator):
- Input:
:param Splearn_array X: of shape data shape = (n_samples, n_features)
:param SplearnArray X: of shape data shape = (n_samples, n_features)
Samples.
:param ndarray y: (default value = None)
numpy array of shape [n_samples] Target values,
......
......@@ -38,7 +38,7 @@ from __future__ import division, print_function
import numpy as np
import unittest
from splearn.datasets.base import load_data_sample
from splearn.datasets.data_sample import DataSample, Splearn_array
from splearn.datasets.data_sample import DataSample, SplearnArray
from splearn.tests.datasets.get_dataset_path import get_dataset_path
from splearn.spectral import Spectral
......@@ -68,7 +68,7 @@ class UnitaryTest(unittest.TestCase):
s = load_data_sample(adr=adr)
cl = Spectral()
cl._polulate_dictionnaries(s.data)
cl.polulate_dictionnaries(s.data)
self.assertEqual(s.nbL,s.data.nbL)
self.assertEqual(s.nbEx, s.data.nbEx)
with self.assertRaises(TypeError):
......@@ -88,7 +88,7 @@ class UnitaryTest(unittest.TestCase):
data = load_data_sample(adr=adr)
cl = Spectral(partial=False)
cl._polulate_dictionnaries(data.data)
cl.polulate_dictionnaries(data.data)
nbL = data.data.nbL
nbEx = data.data.nbEx
sample = data.data.sample
......@@ -107,7 +107,7 @@ class UnitaryTest(unittest.TestCase):
self.assertEqual(nbSuff1, nbSuff2)
cl = Spectral(version = 'factor', partial=False)
cl._polulate_dictionnaries(data.data)
cl.polulate_dictionnaries(data.data)
fact = data.data.fact
nbFact1 = sum([sample[w]*(len(w)+1)*(len(w)+2)/2 for w in sample])
nbFact2 = sum([fact[w] for w in fact])
......@@ -117,7 +117,7 @@ class UnitaryTest(unittest.TestCase):
adr = get_dataset_path("0.spice.train")
pT = load_data_sample(adr=adr)
cl = Spectral(partial=False)
cl._polulate_dictionnaries(pT.data)
cl.polulate_dictionnaries(pT.data)
# lR = pT.data.select_rows(nb_rows_max = 10, version = 'classic')
# lC = pT.data.select_columns(nb_columns_max = 10, version = 'classic')
# self.assertEqual(lR, [(), (3,), (3, 0), (3, 3), (3, 0, 3), (3, 1),
......@@ -127,7 +127,7 @@ class UnitaryTest(unittest.TestCase):
# (1,), (1, 3), (3, 0, 3)])
cl = Spectral(version = 'prefix', partial=False)
cl._polulate_dictionnaries(pT.data)
cl.polulate_dictionnaries(pT.data)
# lRp = pT.data.select_rows(nb_rows_max = 10, version = 'prefix')
# lCp = pT.data.select_columns(nb_columns_max = 10, version = 'prefix')
# self.assertEqual(lRp, [(), (3,), (3, 0), (3, 0, 0), (3, 0, 1),
......@@ -137,7 +137,7 @@ class UnitaryTest(unittest.TestCase):
# (0, 3), (1, 3), (3, 1)])
cl = Spectral(version = 'factor', partial=False)
cl._polulate_dictionnaries(pT.data)
cl.polulate_dictionnaries(pT.data)
# lRf = pT.data.select_rows(nb_rows_max = 10, version = 'factor')
# lCf = pT.data.select_columns(nb_columns_max = 10, version = 'factor')
# self.assertEqual(lRf, [(), (3,), (0,), (1,), (3, 0), (3, 3), (2,),
......
......@@ -53,7 +53,7 @@ class HankelTest(unittest.TestCase):
# adr = get_dataset_path("3.pautomac.train")
data = load_data_sample(adr=adr)
cl = Spectral(partial=False)
cl._polulate_dictionnaries(data.data)
cl.polulate_dictionnaries(data.data)
lprefix = [()]
lprefix = lprefix + [(i,) for i in range(data.data.nbL)]
lprefix = lprefix+[(i, j) for i in range(data.data.nbL)
......@@ -123,7 +123,7 @@ class HankelTest(unittest.TestCase):
# adr = get_dataset_path("3.pautomac.train")
data = load_data_sample(adr=adr)
cl = Spectral(partial=False, version="prefix")
cl._polulate_dictionnaries(data.data)
cl.polulate_dictionnaries(data.data)
lprefix = [()]
lprefix = lprefix + [(i,) for i in range(data.data.nbL)]
lprefix = lprefix + [(i, j) for i in range(data.data.nbL)
......@@ -196,7 +196,7 @@ class HankelTest(unittest.TestCase):
# adr = get_dataset_path("3.pautomac.train")
data = load_data_sample(adr=adr)
cl = Spectral(partial=False, version="suffix")
cl._polulate_dictionnaries(data.data)
cl.polulate_dictionnaries(data.data)
lprefix = [()]
lprefix = lprefix + [(i,) for i in range(data.data.nbL)]
lprefix = lprefix + [(i, j) for i in range(data.data.nbL)
......@@ -266,7 +266,7 @@ class HankelTest(unittest.TestCase):
# adr = get_dataset_path("3.pautomac.train")
data = load_data_sample(adr=adr)
cl = Spectral(partial=False, version="factor")
cl._polulate_dictionnaries(data.data)
cl.polulate_dictionnaries(data.data)
lprefix = [()]
lprefix = lprefix + [(i,) for i in range(data.data.nbL)]
lprefix = lprefix + [(i, j) for i in range(data.data.nbL)
......@@ -336,7 +336,7 @@ class HankelTest(unittest.TestCase):
# adr = get_dataset_path("3.pautomac.train")
data = load_data_sample(adr=adr)
cl = Spectral()
cl._polulate_dictionnaries(data.data)
cl.polulate_dictionnaries(data.data)
h = Hankel(sample_instance=data.data, lrows=1, lcolumns=1,
version="classic", partial=False, sparse=False)
with self.assertRaises(TypeError):
......@@ -349,7 +349,7 @@ class HankelTest(unittest.TestCase):
adr = get_dataset_path("essai")
data = load_data_sample(adr=adr)
cl = Spectral()
cl._polulate_dictionnaries(data.data)
cl.polulate_dictionnaries(data.data)
h1 = Hankel(sample_instance=data.data, lrows=1, lcolumns=1,
version="classic", partial=False, sparse=False)
h2 = Hankel(sample_instance=data.data, lrows=1, lcolumns=1,
......
......@@ -41,9 +41,10 @@ from splearn.datasets.base import load_data_sample
from splearn.automaton import Automaton
from splearn.spectral import Spectral
from splearn.tests.datasets.get_dataset_path import get_dataset_path
from sklearn.linear_model.tests.test_passive_aggressive import random_state
class UnitaryTest(unittest.TestCase):
class SpectralTest(unittest.TestCase):
def test_version(self):
adr = get_dataset_path("essai")
......@@ -238,6 +239,25 @@ class UnitaryTest(unittest.TestCase):
np.testing.assert_almost_equal(A.val([0, 1, 0, 1, 1]),
B.val([0, 1, 0, 1, 1]))
def test_sklearn_compatibility(self):
from sklearn.utils.estimator_checks import check_estimator
from sklearn.model_selection import train_test_split, cross_val_score
check_estimator(Spectral)
adr = get_dataset_path("3.pautomac_light.train")
data = load_data_sample(adr=adr)
sp = Spectral(lrows=6, lcolumns=6, rank = 5, sparse=False,
partial=True, smooth_method='trigram')
X_train, X_test = train_test_split(data.data, test_size=0.4, random_state=0)
sp.fit(X_train)
single_predicted_weights = sp.predict(X_test)
print(single_predicted_weights)
self.assertAlmostEqual(single_predicted_weights[0], 6.76217667e-02, delta = 1e-5)
scores = cross_val_score(sp, data.data, cv=4)
print(scores)
scores_expected = [-10.65272755, -10.7090267, -10.78404758, -11.08453211]
for s1, s2 in zip(scores, scores_expected):
self.assertAlmostEqual(s1, s2, delta=0.1)
# def test_Perplexity(self):
# adr = get_dataset_path("3.pautomac")
# P = Learning()
......