Skip to content
Snippets Groups Projects
Commit f14ae87c authored by Denis Arrivault's avatar Denis Arrivault
Browse files

populate_dictionnary rewrite in progress

parent cb7dcfc2
Branches
No related tags found
No related merge requests found
Pipeline #
# -*- coding: utf-8 -*-
'''
Created on 20 févr. 2018
@author: arrivault
'''
import sys
from timeit import default_timer as timer
from splearn import Spectral
from splearn.tests.datasets.get_dataset_path import get_dataset_path
from splearn.datasets.base import load_data_sample
def test():
    """Time ``Spectral.fit`` against ``Spectral.fit_opt`` on one sample.

    Loads the "3.pautomac.train" dataset, fits a fresh estimator with each
    method, prints how long each call took, and then prints whether the
    ``hankel`` attributes of the two fitted estimators compare equal (the
    mismatch message goes to stderr).
    """
    sample = load_data_sample(adr=get_dataset_path("3.pautomac.train"))
    X = sample.data

    def timed_fit(label, fit):
        # Only the fit call itself sits between the two clock reads.
        began = timer()
        fitted = fit(X)
        elapsed = timer() - began
        print(label + str(elapsed))
        return fitted

    sp1 = timed_fit("Classic : ", Spectral().fit)
    sp2 = timed_fit("Opt : ", Spectral().fit_opt)

    if not (sp1.hankel == sp2.hankel):
        print("The result is different", file=sys.stderr)
        return
    print("Same result.")
# Run the timing comparison only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    test()
...@@ -41,15 +41,13 @@ ...@@ -41,15 +41,13 @@
from __future__ import division, print_function from __future__ import division, print_function
import numpy as np import numpy as np
import math import math
import threading import warnings
lock = threading.Lock()
from splearn.datasets.data_sample import SplearnArray from splearn.datasets.data_sample import SplearnArray
from splearn.hankel import Hankel from splearn.hankel import Hankel
from sklearn.base import BaseEstimator from sklearn.base import BaseEstimator
from sklearn.utils import check_array from sklearn.utils import check_array
from sklearn.utils.validation import NotFittedError from sklearn.utils.validation import NotFittedError
import warnings
class Spectral(BaseEstimator): class Spectral(BaseEstimator):
"""A Spectral estimator instance """A Spectral estimator instance
...@@ -224,106 +222,135 @@ class Spectral(BaseEstimator): ...@@ -224,106 +222,135 @@ class Spectral(BaseEstimator):
return self return self
def _populate_sample_dict(self, X): def fit_opt(self, X, y=None):
dsample = {} # dictionary (word,count) """Fit the model in a optimal way
- Input:
:param SplearnArray X: object of shape [n_samples,n_features]
Training data
:param ndarray y: (default value = None) not used by Spectral estimator
numpy array of shape [n_samples] Target values
- Output:
:returns: Spectral itself with an automaton attribute instanced
returns an instance of self.
:rtype: Spectral
"""
check_array(X)
if not isinstance(X, SplearnArray):
self._hankel = None
self._automaton = None
return self
X = self.polulate_dictionnaries_opt(X)
self._hankel = Hankel(sample_instance=X,
lrows=self.lrows, lcolumns=self.lcolumns,
version=self.version,
partial=self.partial, sparse=self.sparse,
mode_quiet=self.mode_quiet)
self._automaton = self._hankel.to_automaton(self.rank, self.mode_quiet)
# for smooth option compute trigram dictionnary
if self.smooth == 1:
self.trigram = self._threegramdict(X.sample)
return self
def polulate_dictionnaries_opt(self, X):
    """Reset and fill the *sample*, *pref*, *suff*, *fact* dictionaries of X

    Every row of X is handed to ``_populate_a_word``. In partial mode the
    row/column limits are worked out once here and passed along with each
    call; otherwise ``_populate_a_word`` is called with its defaults.

    - Input:

    :param SplearnArray X: object of shape [n_samples,n_features]
           Training data

    - Output:

    :returns: X itself; returned untouched when it is not a SplearnArray
    """
    if not isinstance(X, SplearnArray):
        return X

    # Start from empty (key, count) dictionaries.
    X.sample = {}  # word -> count
    X.pref = {}  # prefix -> count
    X.suff = {}  # suffix -> count
    X.fact = {}  # factor -> count

    # Extra positional arguments for _populate_a_word; empty when not partial
    # so that the helper falls back to its None defaults.
    limits = ()
    if self.partial:
        # lrows / lcolumns are either an int bound or a sized collection;
        # the boolean tells the helper which of the two it received.
        rows_are_int = isinstance(self.lrows, int)
        columns_are_int = isinstance(self.lcolumns, int)
        rows_max = self.lrows if rows_are_int else len(self.lrows)
        columns_max = self.lcolumns if columns_are_int else len(self.lcolumns)
        limits = (rows_max, rows_are_int, columns_max, columns_are_int,
                  rows_max + columns_max)

    for row_index in range(X.shape[0]):
        self._populate_a_word(X, row_index, *limits)
    return X
def _populate_a_word(self, X, line, lrowsmax=None, version_rows_int=None,
lcolumnsmax=None, version_columns_int=None, lmax=None):
w = X[line, :] w = X[line, :]
w = w[w >= 0] w = w[w >= 0]
w = tuple([int(x) for x in w[0:]]) w = tuple([int(x) for x in w[0:]])
dsample[w] = dsample[w] + 1 if w in dsample else 1 X.sample[w] = X.sample.setdefault(w, 0) + 1
return dsample if self.version == "prefix" or self.version == "classic":
# empty word treatment for prefixe, suffix, and factor dictionnaries
# def _populate_new_word(self, X, i, lrowsmax=None, version_rows_int=None, X.pref[()] = X.pref.setdefault((),0) + 1
# lcolumnsmax=None, version_columns_int=None, lmax=None): if self.version == "suffix" or self.version == "classic":
# w = X[i, :] X.suff[()] = X.suff.setdefault((),0) + 1
# w = w[w >= 0] if (self.version == "factor" or self.version == "suffix" or
# w = tuple([int(x) for x in w[0:]]) self.version == "prefix"):
# with lock: X.fact[()] = X.fact.setdefault((),0) + len(w) + 1
# X.sample[w] = X.sample.setdefault(w, 0) + 1 if self.partial:
# if self.version == "prefix" or self.version == "classic": for i in range(len(w)):
# # empty word treatment for prefixe, suffix, and factor dictionnaries if self.version == "classic":
# with lock: if ((version_rows_int and i + 1 <= lrowsmax) or
# X.pref[()] = X.pref[()] + 1 if () in X.pref else 1 (not version_rows_int and w[:i + 1] in self.lrows)):
# if self.version == "suffix" or self.version == "classic": X.pref[w[:i + 1]] = X.pref.setdefault(w[:i + 1], 0) + 1
# with lock: if ((version_columns_int and i + 1 <= lcolumnsmax) or
# X.suff[()] = X.suff[()] + 1 if () in X.suff else 1 (not version_columns_int and w[-( i + 1):] in self.lcolumns)):
# if self.version == "factor" or self.version == "suffix" \ X.suff[w[-(i + 1):]] = X.suff.setdefault(w[-(i + 1):], 0) + 1
# or self.version == "prefix": elif self.version == "prefix":
# with lock: # dictionaries dpref is populated until
# X.fact[()] = X.fact[()] + len(w) + 1 if () in X.fact else len(w) + 1 # lmax = lrows + lcolumns
# # dictionaries dfact is populated until lcolumns
# if self.partial: if (((version_rows_int or version_columns_int) and i + 1 <= lmax) or
# for i in range(len(w)): (not version_rows_int and w[:i + 1] in self.lrows) or
# if self.version == "classic": (not version_columns_int and w[:i + 1] in self.lcolumns)):
# if (version_rows_int is True and X.pref[w[:i + 1]] = X.pref.setdefault(w[:i + 1], 0) + 1
# i + 1 <= lrowsmax) or \ for j in range(i + 1, len(w) + 1):
# (version_rows_int is False and if ((version_columns_int and (j - i) <= lmax) or
# w[:i + 1] in self.lrows): (not version_columns_int and w[i:j] in self.lcolumns)):
# with lock: X.fact[w[i:j]] = X.fact.setdefault(w[i:j], 0) + 1
# X.pref[w[:i + 1]] = \ elif self.version == "suffix":
# X.pref[w[:i + 1]] + 1 if w[:i + 1] in X.pref else 1 if (((version_rows_int or version_columns_int) and i <= lmax) or
# if (version_columns_int is True and i + 1 <= lcolumnsmax) or \ (not version_rows_int and w[-(i + 1):] in self.lrows) or
# (version_columns_int is False and w[-( i + 1):] in self.lcolumns): (not version_columns_int and w[-(i + 1):] in self.lcolumns)):
# with lock: X.suff[w[-(i + 1):]] = X.suff.setdefault(w[-(i + 1):], 0) + 1
# X.suff[w[-(i + 1):]] = X.suff[w[-(i + 1):]] + 1 if \ for j in range(i + 1, len(w) + 1):
# w[-(i + 1):] in X.suff else 1 if ((version_rows_int and (j - i) <= lmax) or
# if self.version == "prefix": (not version_rows_int and w[i:j] in self.lrows)):
# # dictionaries dpref is populated until X.fact[w[i:j]] = X.fact.setdefault(w[i:j], 0) + 1
# # lmax = lrows + lcolumns elif self.version == "factor":
# # dictionaries dfact is populated until lcolumns for j in range(i + 1, len(w) + 1):
# if ((version_rows_int is True or if (((version_rows_int or version_columns_int) and (j - i) <= lmax) or
# version_columns_int is True) and (not version_rows_int and w[i:j] in self.lrows) or
# i + 1 <= lmax) or \ (not version_columns_int and w[i:j] in self.lcolumns)):
# (version_rows_int is False and X.fact[w[i:j]] = X.fact.setdefault(w[i:j], 0) + 1
# (w[:i + 1] in self.lrows)) or \ else: # not partial
# (version_columns_int is False and for i in range(len(w)):
# (w[:i + 1] in self.lcolumns)): X.pref[w[:i + 1]] = X.pref.setdefault(w[:i + 1], 0) + 1
# X.pref[w[:i + 1]] = X.pref[w[:i + 1]] + 1 \ X.suff[w[i:]] = X.suff.setdefault(w[i:], 0) + 1
# if w[:i + 1] in X.pref else 1 for j in range(i + 1, len(w) + 1):
# for j in range(i + 1, len(w) + 1): X.fact[w[i:j]] = X.fact.setdefault(w[i:j], 0) + 1
# if (version_columns_int is True and (
# j - i) <= lmax) or \
# (version_columns_int is False and
# (w[i:j] in self.lcolumns)):
# X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \
# if w[i:j] in X.fact else 1
# if self.version == "suffix":
# if ((version_rows_int is True or
# version_columns_int is True) and
# i <= lmax) or \
# (version_rows_int is False and
# (w[-(i + 1):] in self.lrows)) or \
# (version_columns_int is False and
# (w[-(i + 1):] in self.lcolumns)):
# X.suff[w[-(i + 1):]] = X.suff[w[-(i + 1):]] + 1 \
# if w[-(i + 1):] in X.suff else 1
# for j in range(i + 1, len(w) + 1):
# if (version_rows_int is True and (
# j - i) <= lmax) or \
# (version_rows_int is False and
# (w[i:j] in self.lrows)):
# X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \
# if w[i:j] in X.fact else 1
# if self.version == "factor":
# for j in range(i + 1, len(w) + 1):
# if ((version_rows_int is True or
# version_columns_int is True) and
# (j - i) <= lmax) or \
# (version_rows_int is False and
# (w[i:j] in self.lrows)) or \
# (version_columns_int is False and
# (w[i:j] in self.lcolumns)):
# X.fact[w[i:j]] = \
# X.fact[w[i:j]] + 1 if w[i:j] in X.fact else 1
#
# else: # not partial
# for i in range(len(w)):
# X.pref[w[:i + 1]] = X.pref[w[:i + 1]] + 1 \
# if w[:i + 1] in X.pref else 1
# X.suff[w[i:]] = X.suff[w[i:]] + 1 if w[i:] in X.suff else 1
# for j in range(i + 1, len(w) + 1):
# X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \
# if w[i:j] in X.fact else 1
def polulate_dictionnaries(self, X): def polulate_dictionnaries(self, X):
"""Populates the *sample*, *pref*, *suff*, *fact* dictionnaries of X """Populates the *sample*, *pref*, *suff*, *fact* dictionnaries of X
...@@ -460,6 +487,15 @@ class Spectral(BaseEstimator): ...@@ -460,6 +487,15 @@ class Spectral(BaseEstimator):
X.pref = {} X.pref = {}
return X return X
def _populate_sample_dict(self, X):
dsample = {} # dictionary (word,count)
for line in range(X.shape[0]):
w = X[line, :]
w = w[w >= 0]
w = tuple([int(x) for x in w[0:]])
dsample[w] = dsample[w] + 1 if w in dsample else 1
return dsample
@property @property
def trigram(self): def trigram(self):
"""The trigram dictionary""" """The trigram dictionary"""
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment