# Patch summary (leading text; patch tools skip everything before the first
# "diff --git" header).
#
# examples/performances_calculation.py (new file):
#   Small script whose test() loads the "3.pautomac.train" sample, times
#   Spectral.fit and Spectral.fit_opt with timeit.default_timer, prints both
#   durations, and compares the two resulting `hankel` attributes with `==`.
#
# splearn/spectral.py:
#   * drops `import threading` and the module-level `lock`, and moves
#     `import warnings` up next to the other stdlib imports;
#   * adds Spectral.fit_opt, which fills the dictionaries through
#     polulate_dictionnaries_opt before building the Hankel and the automaton;
#   * adds Spectral.polulate_dictionnaries_opt, which resets X.sample / X.pref /
#     X.suff / X.fact and calls _populate_a_word once per row of X;
#   * replaces the commented-out, lock-based _populate_new_word with an active
#     _populate_a_word that updates the four dictionaries via dict.setdefault;
#   * moves _populate_sample_dict (body unchanged) to just before the `trigram`
#     property.
#
# NOTE(review): the original line breaks and leading indentation of this patch
#   were lost; they are reconstructed here from the hunk line counts and Python
#   syntax. Verify against the target tree before applying (for example with
#   `git apply --ignore-whitespace`).
# NOTE(review): the three `w = ...` lines in the second spectral.py hunk are
#   unchanged context shared by the old `for` loop body and the new
#   _populate_a_word body, so that body is shown at the old loop's indentation
#   depth -- confirm against the real file.
# NOTE(review): in _populate_a_word the "suffix" branch tests `i <= lmax` while
#   the "prefix" branch tests `i + 1 <= lmax`; the removed commented-out code has
#   the same asymmetry -- confirm it is intended.
# NOTE(review): the "prefix" branch comment says dfact is populated until
#   lcolumns, but the condition compares `(j - i)` with lmax -- confirm.
# NOTE(review): the example relies on `sp1.hankel == sp2.hankel`; whether Hankel
#   defines equality is not visible in this patch -- verify.

diff --git a/examples/performances_calculation.py b/examples/performances_calculation.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eb6d59da95bada73e78a0bfc48006389eaa4f44
--- /dev/null
+++ b/examples/performances_calculation.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+
+'''
+Created on 20 févr. 2018
+
+@author: arrivault
+'''
+import sys
+from timeit import default_timer as timer
+
+from splearn import Spectral
+from splearn.tests.datasets.get_dataset_path import get_dataset_path
+from splearn.datasets.base import load_data_sample
+
+def test():
+    adr = get_dataset_path("3.pautomac.train")
+    data = load_data_sample(adr=adr)
+    X = data.data
+    sp1 = Spectral()
+    start = timer()
+    sp1 = sp1.fit(X)
+    duration = timer() - start
+    print("Classic : " + str(duration))
+
+    sp2 = Spectral()
+    start = timer()
+    sp2 = sp2.fit_opt(X)
+    duration = timer() - start
+    print("Opt : " + str(duration))
+
+    if sp1.hankel == sp2.hankel:
+        print("Same result.")
+    else:
+        print("The result is different", file=sys.stderr)
+
+if __name__ == '__main__':
+    test()
diff --git a/splearn/spectral.py b/splearn/spectral.py
index bf56fa6ea2c8c894530634e1fb9d404477f9f45c..19c7f09616b84e73f0a139dc11799165aa344d7b 100644
--- a/splearn/spectral.py
+++ b/splearn/spectral.py
@@ -41,15 +41,13 @@ from __future__ import division, print_function
 
 import numpy as np
 import math
-import threading
-lock = threading.Lock()
+import warnings
 
 from splearn.datasets.data_sample import SplearnArray
 from splearn.hankel import Hankel
 from sklearn.base import BaseEstimator
 from sklearn.utils import check_array
 from sklearn.utils.validation import NotFittedError
-import warnings
 
 class Spectral(BaseEstimator):
     """A Spectral estimator instance
@@ -223,107 +221,136 @@ class Spectral(BaseEstimator):
             self.trigram = self._threegramdict(X.sample)
 
         return self
+
+    def fit_opt(self, X, y=None):
+        """Fit the model in a optimal way
 
-    def _populate_sample_dict(self, X):
-        dsample = {} # dictionary (word,count)
-        for line in range(X.shape[0]):
+        - Input:
+
+        :param SplearnArray X: object of shape [n_samples,n_features]
+               Training data
+        :param ndarray y: (default value = None) not used by Spectral estimator
+               numpy array of shape [n_samples] Target values
+
+
+        - Output:
+
+        :returns: Spectral itself with an automaton attribute instanced
+                  returns an instance of self.
+        :rtype: Spectral
+
+        """
+        check_array(X)
+
+        if not isinstance(X, SplearnArray):
+            self._hankel = None
+            self._automaton = None
+            return self
+        X = self.polulate_dictionnaries_opt(X)
+        self._hankel = Hankel(sample_instance=X,
+                              lrows=self.lrows, lcolumns=self.lcolumns,
+                              version=self.version,
+                              partial=self.partial, sparse=self.sparse,
+                              mode_quiet=self.mode_quiet)
+        self._automaton = self._hankel.to_automaton(self.rank, self.mode_quiet)
+        # for smooth option compute trigram dictionnary
+        if self.smooth == 1:
+            self.trigram = self._threegramdict(X.sample)
+
+        return self
+
+    def polulate_dictionnaries_opt(self, X):
+        """Populates the *sample*, *pref*, *suff*, *fact* dictionnaries of X
+
+        - Input:
+
+        :param SplearnArray X: object of shape [n_samples,n_features]
+               Training data
+
+        """
+        if not isinstance(X, SplearnArray):
+            return X
+        X.sample = {} # dictionary (word,count)
+        X.pref = {} # dictionary (prefix,count)
+        X.suff = {} # dictionary (suffix,count)
+        X.fact = {} # dictionary (factor,count)
+        if self.partial:
+            if isinstance(self.lrows, int):
+                lrowsmax = self.lrows
+                version_rows_int = True
+            else:
+                version_rows_int = False
+                lrowsmax = self.lrows.__len__()
+            if isinstance(self.lcolumns, int):
+                lcolumnsmax = self.lcolumns
+                version_columns_int = True
+            else:
+                lcolumnsmax = self.lcolumns.__len__()
+                version_columns_int = False
+            lmax = lrowsmax + lcolumnsmax
+            for line in range(X.shape[0]):
+                self._populate_a_word(X, line, lrowsmax, version_rows_int, lcolumnsmax, version_columns_int, lmax)
+        else:
+            for line in range(X.shape[0]):
+                self._populate_a_word(X, line)
+        return X
+
+    def _populate_a_word(self, X, line, lrowsmax=None, version_rows_int=None,
+                         lcolumnsmax=None, version_columns_int=None, lmax=None):
             w = X[line, :]
             w = w[w >= 0]
             w = tuple([int(x) for x in w[0:]])
-            dsample[w] = dsample[w] + 1 if w in dsample else 1
-        return dsample
-
-#    def _populate_new_word(self, X, i, lrowsmax=None, version_rows_int=None,
-#                           lcolumnsmax=None, version_columns_int=None, lmax=None):
-#        w = X[i, :]
-#        w = w[w >= 0]
-#        w = tuple([int(x) for x in w[0:]])
-#        with lock:
-#            X.sample[w] = X.sample.setdefault(w, 0) + 1
-#        if self.version == "prefix" or self.version == "classic":
-#            # empty word treatment for prefixe, suffix, and factor dictionnaries
-#            with lock:
-#                X.pref[()] = X.pref[()] + 1 if () in X.pref else 1
-#        if self.version == "suffix" or self.version == "classic":
-#            with lock:
-#                X.suff[()] = X.suff[()] + 1 if () in X.suff else 1
-#        if self.version == "factor" or self.version == "suffix" \
-#                or self.version == "prefix":
-#            with lock:
-#                X.fact[()] = X.fact[()] + len(w) + 1 if () in X.fact else len(w) + 1
-#
-#        if self.partial:
-#            for i in range(len(w)):
-#                if self.version == "classic":
-#                    if (version_rows_int is True and
-#                        i + 1 <= lrowsmax) or \
-#                       (version_rows_int is False and
-#                        w[:i + 1] in self.lrows):
-#                        with lock:
-#                            X.pref[w[:i + 1]] = \
-#                                X.pref[w[:i + 1]] + 1 if w[:i + 1] in X.pref else 1
-#                    if (version_columns_int is True and i + 1 <= lcolumnsmax) or \
-#                       (version_columns_int is False and w[-( i + 1):] in self.lcolumns):
-#                        with lock:
-#                            X.suff[w[-(i + 1):]] = X.suff[w[-(i + 1):]] + 1 if \
-#                                w[-(i + 1):] in X.suff else 1
-#                if self.version == "prefix":
-#                    # dictionaries dpref is populated until
-#                    # lmax = lrows + lcolumns
-#                    # dictionaries dfact is populated until lcolumns
-#                    if ((version_rows_int is True or
-#                         version_columns_int is True) and
-#                        i + 1 <= lmax) or \
-#                       (version_rows_int is False and
-#                        (w[:i + 1] in self.lrows)) or \
-#                       (version_columns_int is False and
-#                        (w[:i + 1] in self.lcolumns)):
-#                        X.pref[w[:i + 1]] = X.pref[w[:i + 1]] + 1 \
-#                            if w[:i + 1] in X.pref else 1
-#                    for j in range(i + 1, len(w) + 1):
-#                        if (version_columns_int is True and (
-#                            j - i) <= lmax) or \
-#                           (version_columns_int is False and
-#                            (w[i:j] in self.lcolumns)):
-#                            X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \
-#                                if w[i:j] in X.fact else 1
-#                if self.version == "suffix":
-#                    if ((version_rows_int is True or
-#                         version_columns_int is True) and
-#                        i <= lmax) or \
-#                       (version_rows_int is False and
-#                        (w[-(i + 1):] in self.lrows)) or \
-#                       (version_columns_int is False and
-#                        (w[-(i + 1):] in self.lcolumns)):
-#                        X.suff[w[-(i + 1):]] = X.suff[w[-(i + 1):]] + 1 \
-#                            if w[-(i + 1):] in X.suff else 1
-#                    for j in range(i + 1, len(w) + 1):
-#                        if (version_rows_int is True and (
-#                            j - i) <= lmax) or \
-#                           (version_rows_int is False and
-#                            (w[i:j] in self.lrows)):
-#                            X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \
-#                                if w[i:j] in X.fact else 1
-#                if self.version == "factor":
-#                    for j in range(i + 1, len(w) + 1):
-#                        if ((version_rows_int is True or
-#                             version_columns_int is True) and
-#                            (j - i) <= lmax) or \
-#                           (version_rows_int is False and
-#                            (w[i:j] in self.lrows)) or \
-#                           (version_columns_int is False and
-#                            (w[i:j] in self.lcolumns)):
-#                            X.fact[w[i:j]] = \
-#                                X.fact[w[i:j]] + 1 if w[i:j] in X.fact else 1
-#
-#        else: # not partial
-#            for i in range(len(w)):
-#                X.pref[w[:i + 1]] = X.pref[w[:i + 1]] + 1 \
-#                    if w[:i + 1] in X.pref else 1
-#                X.suff[w[i:]] = X.suff[w[i:]] + 1 if w[i:] in X.suff else 1
-#                for j in range(i + 1, len(w) + 1):
-#                    X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \
-#                        if w[i:j] in X.fact else 1
+            X.sample[w] = X.sample.setdefault(w, 0) + 1
+            if self.version == "prefix" or self.version == "classic":
+                # empty word treatment for prefixe, suffix, and factor dictionnaries
+                X.pref[()] = X.pref.setdefault((),0) + 1
+            if self.version == "suffix" or self.version == "classic":
+                X.suff[()] = X.suff.setdefault((),0) + 1
+            if (self.version == "factor" or self.version == "suffix" or
+                    self.version == "prefix"):
+                X.fact[()] = X.fact.setdefault((),0) + len(w) + 1
+            if self.partial:
+                for i in range(len(w)):
+                    if self.version == "classic":
+                        if ((version_rows_int and i + 1 <= lrowsmax) or
+                                (not version_rows_int and w[:i + 1] in self.lrows)):
+                            X.pref[w[:i + 1]] = X.pref.setdefault(w[:i + 1], 0) + 1
+                        if ((version_columns_int and i + 1 <= lcolumnsmax) or
+                                (not version_columns_int and w[-( i + 1):] in self.lcolumns)):
+                            X.suff[w[-(i + 1):]] = X.suff.setdefault(w[-(i + 1):], 0) + 1
+                    elif self.version == "prefix":
+                        # dictionaries dpref is populated until
+                        # lmax = lrows + lcolumns
+                        # dictionaries dfact is populated until lcolumns
+                        if (((version_rows_int or version_columns_int) and i + 1 <= lmax) or
+                                (not version_rows_int and w[:i + 1] in self.lrows) or
+                                (not version_columns_int and w[:i + 1] in self.lcolumns)):
+                            X.pref[w[:i + 1]] = X.pref.setdefault(w[:i + 1], 0) + 1
+                        for j in range(i + 1, len(w) + 1):
+                            if ((version_columns_int and (j - i) <= lmax) or
+                                    (not version_columns_int and w[i:j] in self.lcolumns)):
+                                X.fact[w[i:j]] = X.fact.setdefault(w[i:j], 0) + 1
+                    elif self.version == "suffix":
+                        if (((version_rows_int or version_columns_int) and i <= lmax) or
+                                (not version_rows_int and w[-(i + 1):] in self.lrows) or
+                                (not version_columns_int and w[-(i + 1):] in self.lcolumns)):
+                            X.suff[w[-(i + 1):]] = X.suff.setdefault(w[-(i + 1):], 0) + 1
+                        for j in range(i + 1, len(w) + 1):
+                            if ((version_rows_int and (j - i) <= lmax) or
+                                    (not version_rows_int and w[i:j] in self.lrows)):
+                                X.fact[w[i:j]] = X.fact.setdefault(w[i:j], 0) + 1
+                    elif self.version == "factor":
+                        for j in range(i + 1, len(w) + 1):
+                            if (((version_rows_int or version_columns_int) and (j - i) <= lmax) or
+                                    (not version_rows_int and w[i:j] in self.lrows) or
+                                    (not version_columns_int and w[i:j] in self.lcolumns)):
+                                X.fact[w[i:j]] = X.fact.setdefault(w[i:j], 0) + 1
+            else: # not partial
+                for i in range(len(w)):
+                    X.pref[w[:i + 1]] = X.pref.setdefault(w[:i + 1], 0) + 1
+                    X.suff[w[i:]] = X.suff.setdefault(w[i:], 0) + 1
+                    for j in range(i + 1, len(w) + 1):
+                        X.fact[w[i:j]] = X.fact.setdefault(w[i:j], 0) + 1
 
     def polulate_dictionnaries(self, X):
         """Populates the *sample*, *pref*, *suff*, *fact* dictionnaries of X
@@ -460,6 +487,15 @@ class Spectral(BaseEstimator):
             X.pref = {}
         return X
 
+    def _populate_sample_dict(self, X):
+        dsample = {} # dictionary (word,count)
+        for line in range(X.shape[0]):
+            w = X[line, :]
+            w = w[w >= 0]
+            w = tuple([int(x) for x in w[0:]])
+            dsample[w] = dsample[w] + 1 if w in dsample else 1
+        return dsample
+
     @property
     def trigram(self):
         """The trigram dictionary"""