diff --git a/examples/json_save.ipynb b/examples/json_save.ipynb index 61db044db842d9e54fd1ed9cfb5ce9cfb4f5b629..03094aa242234a18b120b31afcebbb141204aee7 100644 --- a/examples/json_save.ipynb +++ b/examples/json_save.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -22,7 +22,7 @@ " smooth_method='none', sparse=True, version='classic')" ] }, - "execution_count": 1, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -39,16 +39,7 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "Automaton.write(sp.automaton, train_file + \".json\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -57,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -66,33 +57,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "str2 = Serializer.data_to_json(A)" ] }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "str1 == str2" - ] - }, { "cell_type": "code", "execution_count": 7, @@ -101,7 +72,7 @@ { "data": { "text/plain": [ - "'{\"automaton\": {\"nbL\": 4, \"nbS\": 5, \"initial\": {\"numpy.ndarray\": {\"values\": [-0.0004934419970497512, 0.0030634697107912346, -0.044073932015580415, -0.1077770261654714, -0.0866391379316952], \"dtype\": \"float64\"}}, \"final\": {\"numpy.ndarray\": {\"values\": [0.07757136847945045, -0.024220294003132026, -0.4468125366321221, 0.627732084089759, -0.554674433356224], \"dtype\": \"float64\"}}, \"transitions\": [{\"numpy.ndarray\": {\"values\": [[0.04512120959511772, -0.24038969827844062, 0.34944999592135334, -0.2811680730534579, -0.21402523377497645], [0.0692580056243761, -0.30062293462829204, 0.20641375368520157, -0.14960814319756124, -0.5580573163749153], [0.02980115192176571, -0.13866480809160409, 0.18362212572805459, -0.20969545230657607, -0.14481622025561292], [0.005699344003198349, -0.023385825120201414, -0.06600665373981851, 0.10749935271466007, -0.15103654604159977], [-0.02008655193147911, 0.09026347555230492, -0.005525585655539262, -0.031355317090308935, 0.2432902242047721]], \"dtype\": \"float64\"}}, {\"numpy.ndarray\": {\"values\": [[0.0774477207917058, 0.09007073705762021, -0.3047220063293013, 0.2767624549859105, 0.20289396030628148], [-0.09902980483670844, -0.08061846818727973, 0.25853170692250554, -0.12086330214608881, -0.11085207725068251], [-0.061710792028537534, -0.06244151779954751, 0.12007654564862075, 0.0025063746277943564, -0.1567967473145572], [-0.002736973749965403, -0.009005721984277787, -0.00046003295909181354, -0.008550426472005344, -0.053754646789681754], [0.030987327588710728, 0.03972680066723246, -0.04997113350910248, 0.0035769411874962344, 0.1418257620585633]], \"dtype\": \"float64\"}}, {\"numpy.ndarray\": {\"values\": [[-0.06791915236220235, -0.11357937659088102, 0.37955392604054394, -0.21784979894046635, -0.22977695089938127], [0.11596642335411328, 0.14914956804629287, -0.13357508376686902, -0.008916063072034974, 0.3484153673774836], [0.011730817547426673, 0.019273800531955612, 0.0414265834586712, -0.035346588560982, 0.02316491010895583], [0.007328911075541707, 
0.005536509132796312, -0.022456082950666856, 0.03611543477693187, -0.038514339001406585], [-0.010589894686551544, -0.010626616553723532, -0.000543105645661794, -0.025567476700160314, 0.04984888818929034]], \"dtype\": \"float64\"}}, {\"numpy.ndarray\": {\"values\": [[0.07276211427780357, -0.0157195576855797, 0.07428592814590385, -0.10369861539249735, 0.024753473688328077], [-0.05607105449779142, -0.08896207276035666, 0.27638225397521243, -0.2371125582838589, 0.07372294122306285], [-0.007391294007753122, -0.048741797963875705, -0.6291239733858526, 0.46816276521577677, 0.09251699239093385], [-0.007110224931878467, -0.05623317735898056, -0.36606658567620365, -0.013297798115225407, 0.6491033177492604], [0.002335515008556511, -0.021561151264484414, 0.09096243479437888, -0.38438823493062646, 0.6616477207948602]], \"dtype\": \"float64\"}}], \"type\": \"classic\"}}'" + "True" ] }, "execution_count": 7, @@ -110,33 +81,13 @@ } ], "source": [ - "str1" + "str1 == str2" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'{\"automaton\": {\"nbL\": 4, \"nbS\": 5, \"initial\": {\"numpy.ndarray\": {\"values\": [-0.0004934419970497512, 0.0030634697107912346, -0.044073932015580415, -0.1077770261654714, -0.0866391379316952], \"dtype\": \"float64\"}}, \"final\": {\"numpy.ndarray\": {\"values\": [0.07757136847945045, -0.024220294003132026, -0.4468125366321221, 0.627732084089759, -0.554674433356224], \"dtype\": \"float64\"}}, \"transitions\": [{\"numpy.ndarray\": {\"values\": [[0.04512120959511772, -0.24038969827844062, 0.34944999592135334, -0.2811680730534579, -0.21402523377497645], [0.0692580056243761, -0.30062293462829204, 0.20641375368520157, -0.14960814319756124, -0.5580573163749153], [0.02980115192176571, -0.13866480809160409, 0.18362212572805459, -0.20969545230657607, -0.14481622025561292], [0.005699344003198349, -0.023385825120201414, -0.06600665373981851, 0.10749935271466007, -0.15103654604159977], [-0.02008655193147911, 0.09026347555230492, -0.005525585655539262, -0.031355317090308935, 0.2432902242047721]], \"dtype\": \"float64\"}}, {\"numpy.ndarray\": {\"values\": [[0.0774477207917058, 0.09007073705762021, -0.3047220063293013, 0.2767624549859105, 0.20289396030628148], [-0.09902980483670844, -0.08061846818727973, 0.25853170692250554, -0.12086330214608881, -0.11085207725068251], [-0.061710792028537534, -0.06244151779954751, 0.12007654564862075, 0.0025063746277943564, -0.1567967473145572], [-0.002736973749965403, -0.009005721984277787, -0.00046003295909181354, -0.008550426472005344, -0.053754646789681754], [0.030987327588710728, 0.03972680066723246, -0.04997113350910248, 0.0035769411874962344, 0.1418257620585633]], \"dtype\": \"float64\"}}, {\"numpy.ndarray\": {\"values\": [[-0.06791915236220235, -0.11357937659088102, 0.37955392604054394, -0.21784979894046635, -0.22977695089938127], [0.11596642335411328, 0.14914956804629287, -0.13357508376686902, -0.008916063072034974, 0.3484153673774836], [0.011730817547426673, 0.019273800531955612, 0.0414265834586712, -0.035346588560982, 0.02316491010895583], [0.007328911075541707, 0.005536509132796312, -0.022456082950666856, 0.03611543477693187, -0.038514339001406585], [-0.010589894686551544, -0.010626616553723532, -0.000543105645661794, -0.025567476700160314, 0.04984888818929034]], \"dtype\": \"float64\"}}, {\"numpy.ndarray\": {\"values\": [[0.07276211427780357, -0.0157195576855797, 0.07428592814590385, -0.10369861539249735, 0.024753473688328077], [-0.05607105449779142, -0.08896207276035666, 
0.27638225397521243, -0.2371125582838589, 0.07372294122306285], [-0.007391294007753122, -0.048741797963875705, -0.6291239733858526, 0.46816276521577677, 0.09251699239093385], [-0.007110224931878467, -0.05623317735898056, -0.36606658567620365, -0.013297798115225407, 0.6491033177492604], [0.002335515008556511, -0.021561151264484414, 0.09096243479437888, -0.38438823493062646, 0.6616477207948602]], \"dtype\": \"float64\"}}], \"type\": \"classic\"}}'" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "str2" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, "outputs": [], "source": [ "str3 = Serializer.data_to_yaml(sp.automaton)" @@ -144,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -226,6 +177,15 @@ "print(str3)" ] }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "Automaton.write(sp.automaton, train_file + \".json\")" + ] + }, { "cell_type": "code", "execution_count": 11, @@ -331,249 +291,59 @@ "Ayl.transitions" ] }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "from splearn import Hankel\n", - "Hankel.write(sp.hankel, train_file + \"_hankel.json\", \"json\")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "Hb = Hankel.read(train_file + \"_hankel.json\", \"json\")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Hankel equality check\n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Hb == sp.hankel" - ] - }, { "cell_type": "code", "execution_count": 19, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[<1310x3308 sparse matrix of type '<class 'numpy.float64'>'\n", - "\twith 8251 stored elements in Dictionary Of Keys format>,\n", - " <1310x3308 sparse matrix of type '<class 'numpy.float64'>'\n", - "\twith 2199 stored elements in Dictionary Of Keys format>,\n", - " <1310x3308 sparse matrix of type '<class 'numpy.float64'>'\n", - "\twith 2122 stored elements in Dictionary Of Keys format>,\n", - " <1310x3308 sparse matrix of type '<class 'numpy.float64'>'\n", - "\twith 1091 stored elements in Dictionary Of Keys format>,\n", - " <1310x3308 sparse matrix of type '<class 'numpy.float64'>'\n", - "\twith 3489 stored elements in Dictionary Of Keys format>]" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Hb.lhankel" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[<1310x3308 sparse matrix of type '<class 'numpy.float64'>'\n", - "\twith 8251 stored elements in Dictionary Of Keys format>,\n", - " <1310x3308 sparse matrix of type '<class 'numpy.float64'>'\n", - "\twith 2199 stored elements in Dictionary Of Keys format>,\n", - " <1310x3308 sparse matrix of type '<class 'numpy.float64'>'\n", - "\twith 2122 stored elements in Dictionary Of Keys format>,\n", - " <1310x3308 sparse matrix of type '<class 'numpy.float64'>'\n", - "\twith 1091 stored elements in Dictionary Of Keys format>,\n", - " <1310x3308 sparse matrix of type '<class 'numpy.float64'>'\n", - "\twith 3489 stored elements in 
Dictionary Of Keys format>]" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sp.hankel.lhankel" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "import scipy.sparse as sps\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "data = sp.hankel.lhankel[0]\n", - "k_str = \"({0:d},{1:d})\"\n", - "dico = dict(zip([k_str.format(i, j) for (i,j) in data.keys()], data.values()))" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "dok = sps.dok_matrix(data.shape, dtype=data.dtype)\n", - "for k, val in dico.items():\n", - " k = k.replace(\"(\",\"\").replace(\")\",\"\")\n", - " ind1, ind2 = k.split(\",\")\n", - " dok[(int(ind1), int(ind2))] = val" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " (0, 2950)\t6.0\n", - " (1, 1141)\t6.0\n", - " (2, 820)\t6.0\n", - " (9, 192)\t6.0\n", - " (35, 75)\t6.0\n", - " (123, 12)\t6.0\n", - " (358, 4)\t6.0\n", - " (832, 0)\t6.0\n", - " (0, 20)\t551.0\n", - " (1, 4)\t551.0\n", - " (5, 0)\t551.0\n", - " (0, 837)\t9.0\n", - " (1, 212)\t9.0\n", - " (4, 33)\t9.0\n", - " (14, 17)\t9.0\n", - " (56, 1)\t9.0\n", - " (183, 0)\t9.0\n", - " (0, 254)\t7.0\n", - " (1, 25)\t7.0\n", - " (2, 9)\t7.0\n", - " (7, 1)\t7.0\n", - " (26, 0)\t7.0\n", - " (0, 3160)\t5.0\n", - " (1, 1601)\t5.0\n", - " (5, 323)\t5.0\n", - " :\t:\n", - " (607, 109)\t1.0\n", - " (1270, 48)\t1.0\n", - " (34, 2382)\t1.0\n", - " (117, 1262)\t1.0\n", - " (336, 580)\t1.0\n", - " (761, 265)\t1.0\n", - " (464, 3272)\t1.0\n", - " (1015, 1821)\t1.0\n", - " (338, 2911)\t1.0\n", - " (770, 1090)\t1.0\n", - " (0, 2926)\t1.0\n", - " (1, 1113)\t1.0\n", - " (2, 767)\t1.0\n", - " (9, 131)\t1.0\n", - " (34, 70)\t1.0\n", - " (119, 7)\t1.0\n", - " (343, 3)\t1.0\n", - " (786, 0)\t1.0\n", - " (1073, 2555)\t1.0\n", - " (0, 825)\t1.0\n", - " (1, 197)\t1.0\n", - " (3, 80)\t1.0\n", - " (13, 17)\t1.0\n", - " (53, 1)\t1.0\n", - " (175, 0)\t1.0\n" + "True\n", + "True\n", + "True\n", + "True\n" ] } ], "source": [ - "print(dok)" + "import numpy as np\n", + "for i in range(4):\n", + " print(np.array_equal(Ajs.transitions[i], Ayl.transitions[i]))" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "from splearn import Hankel\n", - "Hankel.write(sp.hankel, train_file + \"_hankel.yaml\", \"yaml\")" + "Hankel.write(sp.hankel, train_file + \"_hankel.json\", \"json\")" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ - "Hb = Hankel.read(train_file + \"_hankel.yaml\", \"yaml\")" + "Hb = Hankel.read(train_file + \"_hankel.json\", \"json\")" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 22, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Hankel equality check\n" - ] - }, { "data": { "text/plain": [ "True" ] }, - "execution_count": 27, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -584,102 +354,40 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ - "yamlstr = \"- scipy.dok_matrix:\\n dtype: float64\\n shape:\\n tuple: [1, 1]\\n values: {'(0,0)': 1.0}\"" - ] - }, - { 
- "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "- scipy.dok_matrix:\n", - " dtype: float64\n", - " shape:\n", - " tuple: [1, 1]\n", - " values: {'(0,0)': 1.0}\n" - ] - } - ], - "source": [ - "print(yamlstr)" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[<1x1 sparse matrix of type '<class 'numpy.float64'>'\n", - "\twith 1 stored elements in Dictionary Of Keys format>]" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Serializer.yaml_to_data(yamlstr)" + "Hankel.write(sp.hankel, train_file + \"_hankel.yaml\", \"yaml\")" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 24, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\"- scipy.dok_matrix:\\\\n dtype: float64\\\\n shape:\\\\n tuple: [1, 1]\\\\n values:\\\\\\n \\\\ {\\'(0,0)\\': 1.0}\"\\n'" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "dy" + "Hb = Hankel.read(train_file + \"_hankel.yaml\", \"yaml\")" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "False" + "True" ] }, - "execution_count": 47, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "issubclass(TypeError, ValueError)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"The input data string (\" + data_str + \") should contain the following keys : \\\"\" + '\\\", \\\"'.join(keys) + \"\\\"\"" + "Hb == sp.hankel" ] } ], diff --git a/splearn/datasets/__init__.py b/splearn/datasets/__init__.py index 1b09d4a13650cc352a8e4b04a592576072d7e6b2..f118dff034cf7297a0b143ec45dbbf186ffdeb80 100644 --- a/splearn/datasets/__init__.py +++ b/splearn/datasets/__init__.py @@ -1,2 +1,2 @@ from splearn.datasets.base import * -from splearn.datasets.data_sample import DataSample, Splearn_array \ No newline at end of file +from splearn.datasets.data_sample import DataSample, SplearnArray \ No newline at end of file diff --git a/splearn/datasets/base.py b/splearn/datasets/base.py index 6693e2dba4d812fe559df53939dcdc1440ca8315..d7781bdd40e5dfc92641d3bd6c0a4790e511552e 100644 --- a/splearn/datasets/base.py +++ b/splearn/datasets/base.py @@ -3,38 +3,25 @@ import numpy as np from splearn.datasets.data_sample import DataSample -def load_data_sample(adr, type='SPiCe', pickle=False): +def load_data_sample(adr, filetype='SPiCe', pickle=False): """Load a sample from file and returns a dictionary (word,count) - Input: - :param lrows: number or list of rows, - a list of strings if partial=True; - otherwise, based on pref if version="classic" or - "prefix", fact otherwise - :type lrows: int or list of int - :param lcolumns: number or list of columns - a list of strings if partial=True ; - otherwise, based on suff if version="classic" or "suffix", - fact otherwise - :type lcolumns: int or list of int - :param string version: (default = "classic") version name - :param boolean partial: (default value = False) build of partial - if True partial dictionaries are loaded based - on nrows and lcolumns + :param str adr: address and name of the loaded file + :param str filetype: (default value = 'SPiCe') indicate + the structure of the file. 
Should be either 'SPiCe' or 'Pautomac' + :param boolean pickle: if enabled, a pickle file is created from the loaded file. Default is False. - Output: - :returns: nbL , nbEx , dsample , dpref , dsuff , dfact - :rtype: int , int , dict , dict , dict , dict + :returns: corresponding DataSample + :rtype: DataSample :Example: - Let's say you are interested in the samples 10, 25, and 50, and want to - know their class name. - >>> from splearn.datasets.base import load_data_sample >>> from splearn.tests.datasets.get_dataset_path import get_dataset_path >>> train_file = '3.pautomac_light.train' # '4.spice.train' @@ -54,13 +41,13 @@ def load_data_sample(adr, type='SPiCe', pickle=False): """ - if type == 'SPiCe' or type == 'Pautomac': + if filetype == 'SPiCe' or filetype == 'Pautomac': data = _load_file_doublelecture(adr=adr, pickle=pickle) return DataSample(data=data) def _load_file_doublelecture(adr, pickle=False): dsample = {} # dictionary (word,count) - nb_sample, max_length = _read_dimension(adr=adr) + _, max_length = _read_dimension(adr=adr) f = open(adr, "r") line = f.readline() l = line.split() @@ -107,49 +94,6 @@ def _read_dimension(adr): "do not match number of samples " + str(nb_sample)) return nb_sample , max_length -# def _load_file_1lecture(adr, pickle=False): -# dsample = {} # dictionary (word,count) -# f = open(adr, "r") -# line = f.readline() -# l = line.split() -# nbEx = int(l[0]) -# nbL = int(l[1]) -# line = f.readline() -# data1 = np.zeros((0,0)) -# length = 0 -# while line: -# l = line.split() -# # w = () if int(l[0]) == 0 else tuple([int(x) for x in l[1:]]) -# # dsample[w] = dsample[w] + 1 if w in dsample else 1 -# # traitement du mot vide pour les préfixes, suffixes et facteurs -# w = [] if int(l[0]) == 0 else [int(x) for x in l[1:]] -# word = np.array(w, ndmin=2, dtype=np.uint32) -# diff = abs(int(l[0]) - length) -# if len(w) > length and not np.array_equal(data1, np.zeros((0,0))): -# data1 = _add_empty(data1, diff) -# elif word.shape[0] < length and not np.array_equal(data1, np.zeros((0,0))): -# word = _add_empty(word, diff) -# -# if np.array_equal(data1, np.zeros((0,0))): -# data1 = word -# else: -# data1 = np.concatenate((data1, word), axis=0) -# length = data1.shape[1] -# line = f.readline() -# -# f.close() -# if pickle: -# _create_pickle_files(adr=adr, dsample=dsample) -# return nbL, nbEx, data1 - - -# def _add_empty(data, diff): -# empty = np.zeros((data.shape[0], diff)) -# empty += -1 -# data = np.concatenate((data, empty), axis=1) -# return data - - def _create_pickle_files(self, adr, dsample): f = open(adr + ".sample.pkl", "wb") pickle.dump(dsample, f) diff --git a/splearn/datasets/data_sample.py b/splearn/datasets/data_sample.py index 3122630e07138773178dcb3f6ee76fca2a19041b..0c9f31f9b88ad66e62d7f013340b523f3355d398 100644 --- a/splearn/datasets/data_sample.py +++ b/splearn/datasets/data_sample.py @@ -33,29 +33,58 @@ # # # ######### COPYRIGHT ######### -"""This module contains the DataSample class and Splearn_array class -The DataSample class encapsulates a sample 's components -nbL and nbEx numbers, -Splearn_array class inherit from numpy ndarray and contains a 2d data ndarray -with the shape +"""This module contains the DataSample class and SplearnArray class.
-==== ==== ==== ==== ==== -x x x x -1 -x x x x x -x x -1 -1 -1 -x -1 -1 -1 -1 --1 -1 -1 -1 -1 -==== ==== ==== ==== ==== -where -1 a indicates a empty cell, -the number nbL and nbEx and , the fourth dictionaries for sample, -prefix, suffix and factor where they are computed """ import numpy as np -class Splearn_array(np.ndarray): - """Splearn_array inherit from numpy ndarray +class SplearnArray(np.ndarray): + """Sample data array used for splearn spectral estimation + + The **SplearnArray** class inherits from numpy ndarray as a 2d data ndarray. + + Example of a possible 2d shape: + + +---+---+---+---+---+ + | 0| 1| 0| 3| -1| + +---+---+---+---+---+ + | 0| 0| 3| 3| 1| + +---+---+---+---+---+ + | 1| 1| -1| -1| -1| + +---+---+---+---+---+ + | 5| -1| -1| -1| -1| + +---+---+---+---+---+ + | -1| -1| -1| -1| -1| + +---+---+---+---+---+ + + is equivalent to: + + - word (0103) or abad + - word (00331) or aaddb + - word (11) or bb + - word (5) or f + - word () or empty + + Each row represents a word of the sample. The words are represented by integer letters (0->a, 1->b, 2->c ...). + -1 indicates the end of the word. The number of rows is the total number of words in the sample (=nbEx) and the number of columns + is given by the size of the longest word. Note that duplicated words are not merged: + if a word appears twice in the sample, it is counted as two different examples. + + The DataSample class also encapsulates the sample's parameters 'nbL', 'nbEx' (number of letters in the alphabet and + number of samples) and the four dictionaries 'sample', 'prefix', 'suffix' and 'factor' that will be populated during the fit + estimations. + + - Input: + + :param ndarray input_array: input ndarray that will be converted into **SplearnArray** + :param int nbL: the number of letters + :param int nbEx: total number of examples. + :param dict sample: the keys are the words and the values are the number of times each word appears in the sample. + :param dict pref: the keys are the prefixes and the values are the number of times each prefix appears in the sample. + :param dict suff: the keys are the suffixes and the values are the number of times each suffix appears in the sample. + :param dict fact: the keys are the factors and the values are the number of times each factor appears in the sample.
:Example: @@ -66,7 +95,7 @@ class Splearn_array(np.ndarray): >>> print(data.__class__) >>> data.data <class 'splearn.datasets.data_sample.DataSample'> - GSplearn_array([[ 3., 0., 3., ..., -1., -1., -1.], + SplearnArray([[ 3., 0., 3., ..., -1., -1., -1.], [ 3., 3., -1., ..., -1., -1., -1.], [ 3., 2., 0., ..., -1., -1., -1.], ..., @@ -96,150 +125,15 @@ class Splearn_array(np.ndarray): self.suff = getattr(obj, 'suff', None) self.fact = getattr(obj, 'fact', None) - # def select_rows(self, nb_rows_max=1000, version='classic'): - # """define lrows - # - # - Input: - # - # :param int nb_rows_max: (default = 1000) number of maximum rows - # :param string version: (default = "classic") version name - # - # - Output: - # - # :returns: list lrows, list of rows - # :rtype: list - # """ - # lRows = [] # liste à renvoyer - # nbRows = 0 - # lLeafs = [([], self.nbEx )] - # # pref[()]la liste de couples (prefixes frontières, nb occ) - # # initialisée au prefixe vide - # if version == 'classic': - # while lLeafs and nbRows < nb_rows_max: - # lastWord = lLeafs.pop()[ - # 0] # le prefixe frontière le plus fréquent - # lRows.append(tuple(lastWord)) - # nbRows += 1 - # for i in range(self.nbL): - # newWord = lastWord + [i] # successeur de lastword - # tnewWord = tuple(newWord) # tuple associé - # if tnewWord in self.pref: - # # ajout d'un nouveau prefixe frontière - # lLeafs.append((newWord, self.pref[tnewWord])) - # lLeafs = sorted(lLeafs, key=lambda x: x[1]) - # elif version == 'prefix': - # while lLeafs and nbRows < nb_rows_max: - # lastWord = lLeafs.pop()[ - # 0] # le prefixe frontière le plus fréquent - # lRows.append(tuple(lastWord)) - # nbRows += 1 - # for i in range(self.nbL): - # newWord = lastWord + [i] # successeur de lastword - # tnewWord = tuple(newWord) # tuple associé - # if tnewWord in self.pref: - # # ajout d'un nouveau prefixe frontière - # nb = 0 - # for u in self.sample: - # if tnewWord <= u: - # nb += self.sample[u] * ( - # len(u) - len(tnewWord) + 1) - # lLeafs.append((newWord, nb)) - # lLeafs = sorted(lLeafs, key=lambda x: x[1]) - # elif version == 'factor': - # while lLeafs and nbRows < nb_rows_max: - # lastWord = lLeafs.pop()[ - # 0] # le prefixe frontière le plus fréquent - # lRows.append(tuple(lastWord)) - # nbRows += 1 - # for i in range(self.nbL): - # newWord = lastWord + [i] # successeur de lastword - # tnewWord = tuple(newWord) # tuple associé - # if tnewWord in self.fact: - # # ajout d'un nouveau prefixe frontière - # nb = 0 - # lw = len(tnewWord) - # for u in self.sample: - # if len(u) >= lw: - # for i in range(lw, len(u) + 1): - # if u[:i][-lw:] == tnewWord: - # nb += self.sample[u] * (len(u) - i + 1) - # lLeafs.append((newWord, nb)) - # lLeafs = sorted(lLeafs, key=lambda x: x[1]) - # # print(lLeafs) - # return lRows - - # def select_columns(self, nb_columns_max=1000, version='classic'): - # """define lcolumns - # - # - Input: - # - # :param int nb_columns_max: (default = 1000) number of maximum columns - # :param string version: (default = "classic") version name - # - # - Output: - # - # :returns:list lcolumns, list of columns - # :rtype: list - # """ - # lColumns = [] # liste à renvoyer - # lLeafs = [([], self.nbEx)] # la liste de couples (suffixes frontières, - # # nb occ) initialisée au suffixe vide - # - # nbColumns = 0 - # if version == 'classic': - # while lLeafs and nbColumns < nb_columns_max: - # lastWord = lLeafs.pop()[ - # 0] # le suffixe frontière le plus fréquent - # lColumns.append(tuple(lastWord)) - # nbColumns += 1 - # for i in range(self.nbL): - # newWord = 
lastWord + [i] # successeur de lastword - # tnewWord = tuple(newWord) # tuple associé - # if tnewWord in self.suff: - # # ajout d'un nouveau suffixe frontière - # lLeafs.append((newWord, self.suff[tnewWord])) - # lLeafs = sorted(lLeafs, key=lambda x: x[ - # 1]) # suffixe le plus fréquent en dernier - # # print(lLeafs) - # elif version == 'prefix': - # while lLeafs and nbColumns < nb_columns_max: - # lastWord = lLeafs.pop()[ - # 0] # le prefixe frontière le plus fréquent - # lColumns.append(tuple(lastWord)) - # nbColumns += 1 - # for i in range(self.nbL): - # newWord = lastWord + [i] # successeur de lastword - # tnewWord = tuple(newWord) # tuple associé - # if tnewWord in self.fact: - # # ajout d'un nouveau suffixe frontière - # lLeafs.append((newWord, self.fact[tnewWord])) - # lLeafs = sorted(lLeafs, key=lambda x: x[1]) - # elif version == 'factor': - # while lLeafs and nbColumns < nb_columns_max: - # lastWord = lLeafs.pop()[ - # 0] # le prefixe frontière le plus fréquent - # lColumns.append(tuple(lastWord)) - # nbColumns += 1 - # for i in range(self.nbL): - # newWord = lastWord + [i] # successeur de lastword - # tnewWord = tuple(newWord) # tuple associé - # if tnewWord in self.fact: - # # ajout d'un nouveau prefixe frontière - # nb = 0 - # lw = len(tnewWord) - # for u in self.sample: - # if len(u) >= lw: - # for i in range(lw, len(u) + 1): - # if u[:i][-lw:] == tnewWord: - # nb += self.sample[u] * (i - lw + 1) - # lLeafs.append((newWord, nb)) - # lLeafs = sorted(lLeafs, key=lambda x: x[1]) - # # print(lLeafs) - # return lColumns - class DataSample(dict): """ A DataSample instance + - Input: + + :param tuple data: a tuple of (int, int, numpy.array) for the corresponding three elements + (nbL, nbEx, data) where nbL is the number of letters in the alphabet, nbEx is the number + of samples and data is the 2d data array + :Example: >>> from splearn.datasets.base import load_data_sample @@ -254,46 +148,20 @@ class DataSample(dict): 5000 >>> data.data - - Input: - - :param string adr: adresse and name of the loaden file - :param string type: (default value = 'SPiCe') indicate - the structure of the file - :param lrows: number or list of rows, - a list of strings if partial=True; - otherwise, based on self.pref if version="classic" or - "prefix", self.fact otherwise - :type lrows: int or list of int - :param lcolumns: number or list of columns - a list of strings if partial=True ; - otherwise, based on self.suff if version="classic" or "suffix", - self.fact otherwise - :type lcolumns: int or list of int - :param string version: (default = "classic") version name - :param boolean partial: (default value = False) build of partial - """ - def __init__(self, data=None, type='SPiCe', **kwargs): - - # Size of the alphabet - self._nbL = 0 - # Number of samples - self._nbEx = 0 + def __init__(self, data=None, **kwargs): # The dictionary that contains the sample - self._data = Splearn_array(np.zeros((0,0))) + self._data = SplearnArray(np.zeros((0,0))) if data is not None: - self.nbL = data[0] - self.nbEx = data[1] - self.data = Splearn_array(data[2], nbL=data[0], nbEx=data[1]) - + self.data = SplearnArray(data[2], nbL=data[0], nbEx=data[1]) super(DataSample, self).__init__(kwargs) @property def nbL(self): """Number of letters""" - return self._nbL + return self.data.nbL @nbL.setter def nbL(self, nbL): @@ -302,13 +170,12 @@ class DataSample(dict): if nbL < 0: raise ValueError("The size of the alphabet should " + "an integer >= 0") - self._nbL = nbL + self.data.nbL = nbL @property def nbEx(self): """Number 
of examples""" - - return self._nbEx + return self.data.nbEx @nbEx.setter def nbEx(self, nbEx): @@ -317,21 +184,17 @@ if nbEx < 0: raise ValueError("The number of examples should be " + " an integer >= 0") - self._nbEx = nbEx + self.data.nbEx = nbEx @property def data(self): - """Splearn_array""" - + """SplearnArray""" return self._data @data.setter def data(self, data): - if isinstance(data, (Splearn_array, np.ndarray, np.generic)): + if isinstance(data, (SplearnArray, np.ndarray, np.generic)): self._data = data else: - raise TypeError("sample should be a Splearn_array.") - - - + raise TypeError("sample should be a SplearnArray.") diff --git a/splearn/hankel.py b/splearn/hankel.py index 6e629a265785726d476e2749a40a55088704ddf7..627203eb19c9116da30ee6f903faec3f7a8fe87c 100644 --- a/splearn/hankel.py +++ b/splearn/hankel.py @@ -44,21 +44,9 @@ import numpy as np class Hankel(object): """ A Hankel instance , compute the list of Hankel matrices - :Example: - - >>> from splearn import Learning, Hankel , Spectral - >>> train_file = '0.spice.train' - >>> pT = load_data_sample(adr=train_file) - >>> sp = Spectral() - >>> sp.fit(X=pT.data) - >>> lhankel = Hankel( sample_instance=pT.sample, - >>> nbL=pT.nbL, nbEx=pT.nbEx, - >>> lrows=6, lcolumns=6, version="classic", - >>> partial=True, sparse=True, mode_quiet=True).lhankel - - Input: - :param Splearn_array sample_instance: instance of Splearn_array + :param SplearnArray sample_instance: instance of SplearnArray :param lrows: number or list of rows, a list of strings if partial=True; otherwise, based on self.pref if version="classic" or @@ -79,6 +67,19 @@ class Hankel(object): *sample_instance* or *lhankel* has to be not None. If *sample_instance* is given, the **Hankel** instance is built directly from the sample dictionnary, else it is deduced from the *lhankels* list of matrices. + + :Example: + + >>> from splearn import Learning, Hankel , Spectral + >>> train_file = '0.spice.train' + >>> pT = load_data_sample(adr=train_file) + >>> sp = Spectral() + >>> sp.fit(X=pT.data) + >>> lhankel = Hankel( sample_instance=pT.sample, + >>> nbL=pT.nbL, nbEx=pT.nbEx, + >>> lrows=6, lcolumns=6, version="classic", + >>> partial=True, sparse=True, mode_quiet=True).lhankel + """ def __init__( @@ -177,6 +178,8 @@ class Hankel(object): @property def build_from_sample(self): + """Boolean that indicates whether the matrices have been built from the sample or not + (in the latter case they are built directly from an Automaton) """ return self._build_from_sample @build_from_sample.setter @@ -193,10 +196,10 @@ class Hankel(object): - Input: - :param dict sample: sample dictionary - :param dict pref: prefix dictionary - :param dict suff: suffix dictionary - :param dict fact: factor dictionary + :param dict sample: the keys are the words and the values are the number of times each word appears in the sample. + :param dict pref: the keys are the prefixes and the values are the number of times each prefix appears in the sample. + :param dict suff: the keys are the suffixes and the values are the number of times each suffix appears in the sample. + :param dict fact: the keys are the factors and the values are the number of times each factor appears in the sample.
:param lrows: number or list of rows, a list of strings if partial=True; otherwise, based on self.pref if version="classic" or diff --git a/splearn/spectral.py b/splearn/spectral.py index 1bc76ee2e0f8d951992035514cf4a3009d2e5e85..bf56fa6ea2c8c894530634e1fb9d404477f9f45c 100644 --- a/splearn/spectral.py +++ b/splearn/spectral.py @@ -41,7 +41,10 @@ from __future__ import division, print_function import numpy as np import math -from splearn.datasets.data_sample import Splearn_array +import threading +lock = threading.Lock() + +from splearn.datasets.data_sample import SplearnArray from splearn.hankel import Hankel from sklearn.base import BaseEstimator from sklearn.utils import check_array @@ -51,29 +54,6 @@ import warnings class Spectral(BaseEstimator): """A Spectral estimator instance - :Example: - - >>> from splearn.spectral import Spectral - >>> sp = Spectral() - >>> sp.set_params(partial=True, lcolumns=6, lrows=6, smooth_method='trigram') - Spectral(lcolumns=6, lrows=6, mode_quiet=False, partial=True, rank=5, - smooth_method='trigram', sparse=True, version='classic') - >>> sp.fit(data.data) - Start Hankel matrix computation - End of Hankel matrix computation - Start Building Automaton from Hankel matrix - End of Automaton computation - Spectral(lcolumns=6, lrows=6, partial=True, rank=5, smooth_method='trigram', sparse=True, version='classic') - >>> sp.automaton.initial - array([-0.00049249, 0.00304676, -0.04405996, -0.10765322, -0.08660063]) - >>> sp.predict(data.data) - array([ 4.38961058e-04, 1.10616861e-01, 1.35569353e-03, ..., - 4.66041996e-06, 4.68177275e-02, 5.24287604e-20]) - >>> sp.loss(data.data, normalize=True) - -10.530029936056017 - >>> sp.score(data.data) - 10.530029936056017 - - Input: :param int rank: the ranking number @@ -106,6 +86,28 @@ class Spectral(BaseEstimator): :param boolean mode_quiet: (default value = False) True for no output message. 
+ :Example: + + >>> from splearn.spectral import Spectral + >>> sp = Spectral() + >>> sp.set_params(partial=True, lcolumns=6, lrows=6, smooth_method='trigram') + Spectral(lcolumns=6, lrows=6, mode_quiet=False, partial=True, rank=5, + smooth_method='trigram', sparse=True, version='classic') + >>> sp.fit(data.data) + Start Hankel matrix computation + End of Hankel matrix computation + Start Building Automaton from Hankel matrix + End of Automaton computation + Spectral(lcolumns=6, lrows=6, partial=True, rank=5, smooth_method='trigram', sparse=True, version='classic') + >>> sp.automaton.initial + array([-0.00049249, 0.00304676, -0.04405996, -0.10765322, -0.08660063]) + >>> sp.predict(data.data) + array([ 4.38961058e-04, 1.10616861e-01, 1.35569353e-03, ..., + 4.66041996e-06, 4.68177275e-02, 5.24287604e-20]) + >>> sp.loss(data.data, normalize=True) + -10.530029936056017 + >>> sp.score(data.data) + 10.530029936056017 """ def __init__(self, rank=5, lrows=7, lcolumns=7, @@ -172,8 +174,7 @@ class Spectral(BaseEstimator): self.smooth = 0 def set_params(self, **parameters): - """ - set the values of Spectral estimator parameters + """set the values of Spectral estimator parameters - Output: @@ -186,12 +187,12 @@ class Spectral(BaseEstimator): self._rule_smooth_method(value) return self - def fit(self, X, y=None): #, gram + def fit(self, X, y=None): """Fit the model - Input: - :param Splearn_array X: object of shape [n_samples,n_features] + :param SplearnArray X: object of shape [n_samples,n_features] Training data :param ndarray y: (default value = None) not used by Spectral estimator numpy array of shape [n_samples] Target values @@ -206,11 +207,11 @@ class Spectral(BaseEstimator): """ check_array(X) - if not isinstance(X, Splearn_array): + if not isinstance(X, SplearnArray): self._hankel = None self._automaton = None return self - X = self._polulate_dictionnaries(X) + X = self.polulate_dictionnaries(X) self._hankel = Hankel(sample_instance=X, lrows=self.lrows, lcolumns=self.lcolumns, version=self.version, @@ -232,8 +233,108 @@ class Spectral(BaseEstimator): dsample[w] = dsample[w] + 1 if w in dsample else 1 return dsample - def _polulate_dictionnaries(self, X): - if not isinstance(X, Splearn_array): +# def _populate_new_word(self, X, i, lrowsmax=None, version_rows_int=None, +# lcolumnsmax=None, version_columns_int=None, lmax=None): +# w = X[i, :] +# w = w[w >= 0] +# w = tuple([int(x) for x in w[0:]]) +# with lock: +# X.sample[w] = X.sample.setdefault(w, 0) + 1 +# if self.version == "prefix" or self.version == "classic": +# # empty word treatment for prefixe, suffix, and factor dictionnaries +# with lock: +# X.pref[()] = X.pref[()] + 1 if () in X.pref else 1 +# if self.version == "suffix" or self.version == "classic": +# with lock: +# X.suff[()] = X.suff[()] + 1 if () in X.suff else 1 +# if self.version == "factor" or self.version == "suffix" \ +# or self.version == "prefix": +# with lock: +# X.fact[()] = X.fact[()] + len(w) + 1 if () in X.fact else len(w) + 1 +# +# if self.partial: +# for i in range(len(w)): +# if self.version == "classic": +# if (version_rows_int is True and +# i + 1 <= lrowsmax) or \ +# (version_rows_int is False and +# w[:i + 1] in self.lrows): +# with lock: +# X.pref[w[:i + 1]] = \ +# X.pref[w[:i + 1]] + 1 if w[:i + 1] in X.pref else 1 +# if (version_columns_int is True and i + 1 <= lcolumnsmax) or \ +# (version_columns_int is False and w[-( i + 1):] in self.lcolumns): +# with lock: +# X.suff[w[-(i + 1):]] = X.suff[w[-(i + 1):]] + 1 if \ +# w[-(i + 1):] in X.suff else 1 +# if 
self.version == "prefix": +# # dictionaries dpref is populated until +# # lmax = lrows + lcolumns +# # dictionaries dfact is populated until lcolumns +# if ((version_rows_int is True or +# version_columns_int is True) and +# i + 1 <= lmax) or \ +# (version_rows_int is False and +# (w[:i + 1] in self.lrows)) or \ +# (version_columns_int is False and +# (w[:i + 1] in self.lcolumns)): +# X.pref[w[:i + 1]] = X.pref[w[:i + 1]] + 1 \ +# if w[:i + 1] in X.pref else 1 +# for j in range(i + 1, len(w) + 1): +# if (version_columns_int is True and ( +# j - i) <= lmax) or \ +# (version_columns_int is False and +# (w[i:j] in self.lcolumns)): +# X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \ +# if w[i:j] in X.fact else 1 +# if self.version == "suffix": +# if ((version_rows_int is True or +# version_columns_int is True) and +# i <= lmax) or \ +# (version_rows_int is False and +# (w[-(i + 1):] in self.lrows)) or \ +# (version_columns_int is False and +# (w[-(i + 1):] in self.lcolumns)): +# X.suff[w[-(i + 1):]] = X.suff[w[-(i + 1):]] + 1 \ +# if w[-(i + 1):] in X.suff else 1 +# for j in range(i + 1, len(w) + 1): +# if (version_rows_int is True and ( +# j - i) <= lmax) or \ +# (version_rows_int is False and +# (w[i:j] in self.lrows)): +# X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \ +# if w[i:j] in X.fact else 1 +# if self.version == "factor": +# for j in range(i + 1, len(w) + 1): +# if ((version_rows_int is True or +# version_columns_int is True) and +# (j - i) <= lmax) or \ +# (version_rows_int is False and +# (w[i:j] in self.lrows)) or \ +# (version_columns_int is False and +# (w[i:j] in self.lcolumns)): +# X.fact[w[i:j]] = \ +# X.fact[w[i:j]] + 1 if w[i:j] in X.fact else 1 +# +# else: # not partial +# for i in range(len(w)): +# X.pref[w[:i + 1]] = X.pref[w[:i + 1]] + 1 \ +# if w[:i + 1] in X.pref else 1 +# X.suff[w[i:]] = X.suff[w[i:]] + 1 if w[i:] in X.suff else 1 +# for j in range(i + 1, len(w) + 1): +# X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \ +# if w[i:j] in X.fact else 1 + def polulate_dictionnaries(self, X): + """Populates the *sample*, *pref*, *suff*, *fact* dictionaries of X + + - Input: + + :param SplearnArray X: object of shape [n_samples,n_features] + Training data + + """ + if not isinstance(X, SplearnArray): return X dsample = {} # dictionary (word,count) dpref = {} # dictionary (prefix,count) @@ -459,7 +560,7 @@ - Input: - :param Splearn_array X : of shape data shape = (n_samples, n_features) + :param SplearnArray X : of shape data shape = (n_samples, n_features) Samples. @@ -489,7 +590,7 @@ - Input: - :param Splearn_array X : Samples, data shape = (n_samples, n_features) + :param SplearnArray X : Samples, data shape = (n_samples, n_features) - Output: @@ -537,18 +638,17 @@ return Y def loss(self, X, y=None, normalize=True): - """ - Log probability using the Spectral model + """Log probability using the Spectral model - Input: - :param Splearn_array X : of shape data shape = (n_samples, n_features) + :param SplearnArray X: of shape data shape = (n_samples, n_features) Samples. X is validation data. 
- :param ndarray y : (default value = Null) + :param ndarray y: (default value = Null) numpy array of shape [n_samples] Target values, is the ground truth target for X (in the supervised case) or None (in the unsupervised case) - :param boolean normalize (default value = True) calculation are + :param boolean normalize: (default value = True) calculation are performed and normalize by the number of sample in case of True - Output: @@ -584,7 +684,7 @@ class Spectral(BaseEstimator): - Input: - :param Splearn_array X: of shape data shape = (n_samples, n_features) + :param SplearnArray X: of shape data shape = (n_samples, n_features) Samples. :param ndarray y: (default value = None) numpy array of shape [n_samples] Target values, diff --git a/splearn/tests/test_data_sample.py b/splearn/tests/test_data_sample.py index 1eae3ba20ab49018b0ed668c0593104487bcba93..affb871f03f7f1a339efb30c432b55fd7abea1cc 100644 --- a/splearn/tests/test_data_sample.py +++ b/splearn/tests/test_data_sample.py @@ -38,7 +38,7 @@ from __future__ import division, print_function import numpy as np import unittest from splearn.datasets.base import load_data_sample -from splearn.datasets.data_sample import DataSample, Splearn_array +from splearn.datasets.data_sample import DataSample, SplearnArray from splearn.tests.datasets.get_dataset_path import get_dataset_path from splearn.spectral import Spectral @@ -68,7 +68,7 @@ class UnitaryTest(unittest.TestCase): s = load_data_sample(adr=adr) cl = Spectral() - cl._polulate_dictionnaries(s.data) + cl.polulate_dictionnaries(s.data) self.assertEqual(s.nbL,s.data.nbL) self.assertEqual(s.nbEx, s.data.nbEx) with self.assertRaises(TypeError): @@ -88,7 +88,7 @@ class UnitaryTest(unittest.TestCase): data = load_data_sample(adr=adr) cl = Spectral(partial=False) - cl._polulate_dictionnaries(data.data) + cl.polulate_dictionnaries(data.data) nbL = data.data.nbL nbEx = data.data.nbEx sample = data.data .sample @@ -107,7 +107,7 @@ class UnitaryTest(unittest.TestCase): self.assertEqual(nbSuff1, nbSuff2) cl = Spectral(version = 'factor', partial=False) - cl._polulate_dictionnaries(data.data) + cl.polulate_dictionnaries(data.data) fact = data.data.fact nbFact1 = sum([sample[w]*(len(w)+1)*(len(w)+2)/2 for w in sample]) nbFact2 = sum([fact[w] for w in fact]) @@ -117,7 +117,7 @@ class UnitaryTest(unittest.TestCase): adr = get_dataset_path("0.spice.train") pT = load_data_sample(adr=adr) cl = Spectral(partial=False) - cl._polulate_dictionnaries(pT.data) + cl.polulate_dictionnaries(pT.data) # lR = pT.data.select_rows(nb_rows_max = 10, version = 'classic') # lC = pT.data.select_columns(nb_columns_max = 10, version = 'classic') # self.assertEqual(lR, [(), (3,), (3, 0), (3, 3), (3, 0, 3), (3, 1), @@ -127,7 +127,7 @@ class UnitaryTest(unittest.TestCase): # (1,), (1, 3), (3, 0, 3)]) cl = Spectral(version = 'prefix', partial=False) - cl._polulate_dictionnaries(pT.data) + cl.polulate_dictionnaries(pT.data) # lRp = pT.data.select_rows(nb_rows_max = 10, version = 'prefix') # lCp = pT.data.select_columns(nb_columns_max = 10, version = 'prefix') # self.assertEqual(lRp, [(), (3,), (3, 0), (3, 0, 0), (3, 0, 1), @@ -137,7 +137,7 @@ class UnitaryTest(unittest.TestCase): # (0, 3), (1, 3), (3, 1)]) cl = Spectral(version = 'factor', partial=False) - cl._polulate_dictionnaries(pT.data) + cl.polulate_dictionnaries(pT.data) # lRf = pT.data.select_rows(nb_rows_max = 10, version = 'factor') # lCf = pT.data.select_columns(nb_columns_max = 10, version = 'factor') # self.assertEqual(lRf, [(), (3,), (0,), (1,), (3, 0), (3, 3), 
(2,), diff --git a/splearn/tests/test_hankel.py b/splearn/tests/test_hankel.py index 2f2a4bc1cb6d43aa8faf681db67347f50ab6eded..a1cb6c27bf1aab24b19e1c09f7bbe8420f86112d 100644 --- a/splearn/tests/test_hankel.py +++ b/splearn/tests/test_hankel.py @@ -53,7 +53,7 @@ class HankelTest(unittest.TestCase): # adr = get_dataset_path("3.pautomac.train") data = load_data_sample(adr=adr) cl = Spectral(partial=False) - cl._polulate_dictionnaries(data.data) + cl.polulate_dictionnaries(data.data) lprefix = [()] lprefix = lprefix + [(i,) for i in range(data.data.nbL)] lprefix = lprefix+[(i, j) for i in range(data.data.nbL) @@ -123,7 +123,7 @@ class HankelTest(unittest.TestCase): # adr = get_dataset_path("3.pautomac.train") data = load_data_sample(adr=adr) cl = Spectral(partial=False, version="prefix") - cl._polulate_dictionnaries(data.data) + cl.polulate_dictionnaries(data.data) lprefix = [()] lprefix = lprefix + [(i,) for i in range(data.data.nbL)] lprefix = lprefix + [(i, j) for i in range(data.data.nbL) @@ -196,7 +196,7 @@ class HankelTest(unittest.TestCase): # adr = get_dataset_path("3.pautomac.train") data = load_data_sample(adr=adr) cl = Spectral(partial=False, version="suffix") - cl._polulate_dictionnaries(data.data) + cl.polulate_dictionnaries(data.data) lprefix = [()] lprefix = lprefix + [(i,) for i in range(data.data.nbL)] lprefix = lprefix + [(i, j) for i in range(data.data.nbL) @@ -266,7 +266,7 @@ class HankelTest(unittest.TestCase): # adr = get_dataset_path("3.pautomac.train") data = load_data_sample(adr=adr) cl = Spectral(partial=False, version="factor") - cl._polulate_dictionnaries(data.data) + cl.polulate_dictionnaries(data.data) lprefix = [()] lprefix = lprefix + [(i,) for i in range(data.data.nbL)] lprefix = lprefix + [(i, j) for i in range(data.data.nbL) @@ -336,7 +336,7 @@ class HankelTest(unittest.TestCase): # adr = get_dataset_path("3.pautomac.train") data = load_data_sample(adr=adr) cl = Spectral() - cl._polulate_dictionnaries(data.data) + cl.polulate_dictionnaries(data.data) h = Hankel(sample_instance=data.data, lrows=1, lcolumns=1, version="classic", partial=False, sparse=False) with self.assertRaises(TypeError): @@ -349,7 +349,7 @@ adr = get_dataset_path("essai") data = load_data_sample(adr=adr) cl = Spectral() - cl._polulate_dictionnaries(data.data) + cl.polulate_dictionnaries(data.data) h1 = Hankel(sample_instance=data.data, lrows=1, lcolumns=1, version="classic", partial=False, sparse=False) h2 = Hankel(sample_instance=data.data, lrows=1, lcolumns=1, diff --git a/splearn/tests/test_spectral.py b/splearn/tests/test_spectral.py index 3ab9abb16289316d96c1c581c415ec19cb70b5d3..314aa17b4f0724699e488314f9c2d4d246a9bc94 100644 --- a/splearn/tests/test_spectral.py +++ b/splearn/tests/test_spectral.py @@ -41,9 +41,9 @@ from splearn.datasets.base import load_data_sample from splearn.automaton import Automaton from splearn.spectral import Spectral from splearn.tests.datasets.get_dataset_path import get_dataset_path -class UnitaryTest(unittest.TestCase): +class SpectralTest(unittest.TestCase): def test_version(self): adr = get_dataset_path("essai") @@ -238,6 +238,25 @@ class UnitaryTest(unittest.TestCase): np.testing.assert_almost_equal(A.val([0, 1, 0, 1, 1]), B.val([0, 1, 0, 1, 1])) + def test_sklearn_compatibility(self): + from sklearn.utils.estimator_checks import check_estimator + from sklearn.model_selection import train_test_split, cross_val_score + 
check_estimator(Spectral) + adr = get_dataset_path("3.pautomac_light.train") + data = load_data_sample(adr=adr) + sp = Spectral(lrows=6, lcolumns=6, rank = 5, sparse=False, + partial=True, smooth_method='trigram') + X_train, X_test = train_test_split(data.data, test_size=0.4, random_state=0) + sp.fit(X_train) + single_predicted_weights = sp.predict(X_test) + print(single_predicted_weights) + self.assertAlmostEqual(single_predicted_weights[0], 6.76217667e-02, delta = 1e-5) + scores = cross_val_score(sp, data.data, cv=4) + print(scores) + scores_expected = [-10.65272755, -10.7090267, -10.78404758, -11.08453211] + for s1, s2 in zip(scores, scores_expected): + self.assertAlmostEqual(s1, s2, delta=0.1) + # def test_Perplexity(self): # adr = get_dataset_path("3.pautomac") # P = Learning()
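
Reviewer note: for readers unfamiliar with the word encoding documented in the new `SplearnArray` docstring above (one word per row over an integer alphabet, 0->a, 1->b, ..., padded with -1), here is a minimal standalone sketch. It mirrors the `w = w[w >= 0]` padding-strip used in the commented-out `_populate_new_word` in this patch; the `decode_rows` helper is illustrative only and not part of splearn.

```python
import numpy as np

# Rows follow the SplearnArray convention described in the docstring:
# each row is one word over an integer alphabet (0 -> 'a', 1 -> 'b', ...),
# padded with -1 after the word ends. Same toy data as the docstring table.
X = np.array([[ 0,  1,  0,  3, -1],
              [ 0,  0,  3,  3,  1],
              [ 1,  1, -1, -1, -1],
              [ 5, -1, -1, -1, -1],
              [-1, -1, -1, -1, -1]])

def decode_rows(data):
    """Hypothetical helper: strip the -1 padding and return each row as a tuple word."""
    words = []
    for row in data:
        row = row[row >= 0]                  # drop end-of-word padding, as in w = w[w >= 0]
        words.append(tuple(int(x) for x in row))
    return words

print(decode_rows(X))
# [(0, 1, 0, 3), (0, 0, 3, 3, 1), (1, 1), (5,), ()]
# i.e. abad, aaddb, bb, f and the empty word -- nbEx = 5 examples in total.
```

The same tuples are the keys of the `sample`, `pref`, `suff` and `fact` dictionaries that `polulate_dictionnaries` fills in before the Hankel matrices are built.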