populate_dictionnary rewrite in progress

f14ae87c · Denis Arrivault · cb7dcfc2 · f14ae87c · f14ae87c
Commit f14ae87c authored 7 years ago by Denis Arrivault
--- a/examples/performances_calculation.py
+++ b/examples/performances_calculation.py
+# -*- coding: utf-8 -*-
+'''
+Created on 20 Feb. 2018
+@author: arrivault
+'''
+import sys
+from timeit import default_timer as timer
+from splearn import Spectral
+from splearn.tests.datasets.get_dataset_path import get_dataset_path
+from splearn.datasets.base import load_data_sample
+def test():
+    """Time Spectral.fit against Spectral.fit_opt on the same sample.
+    Loads the "3.pautomac.train" dataset, fits one fresh estimator with each
+    method, prints both elapsed times, then reports whether the ``hankel``
+    attributes of the two fitted estimators compare equal.
+    """
+    adr = get_dataset_path("3.pautomac.train")
+    data = load_data_sample(adr=adr)
+    X = data.data
+    # Reference run: the existing fit() implementation.
+    sp1 = Spectral()
+    start = timer()
+    sp1 = sp1.fit(X)
+    duration = timer() - start
+    print("Classic : " + str(duration))
+    # Candidate run: fit_opt() on a separate estimator, same input X.
+    sp2 = Spectral()
+    start = timer()
+    sp2 = sp2.fit_opt(X)
+    duration = timer() - start
+    print("Opt : " + str(duration))
+    # NOTE(review): this relies on the == defined for the hankel objects;
+    # confirm that it compares contents rather than identity.
+    if sp1.hankel == sp2.hankel:
+        print("Same result.")
+    else:
+        print("The result is different", file=sys.stderr)
+# Run the timing comparison only when this file is executed as a script.
+if __name__ == '__main__':
+    test()
--- a/splearn/spectral.py
+++ b/splearn/spectral.py
@@ -41,15 +41,13 @@
 from __future__ import division, print_function
 import numpy as np
 import math
-import threading
+import warnings
-lock = threading.Lock()
 from splearn.datasets.data_sample import SplearnArray
 from splearn.hankel import Hankel
 from sklearn.base import BaseEstimator
 from sklearn.utils import check_array
 from sklearn.utils.validation import NotFittedError
-import warnings
 class Spectral(BaseEstimator):
    """A Spectral estimator instance
@@ -224,106 +222,135 @@ class Spectral(BaseEstimator):
        return self
-    def _populate_sample_dict(self, X):
+    def fit_opt(self, X, y=None):
-        dsample = {}  # dictionary (word,count)
+        """Fit the model, building the dictionaries with polulate_dictionnaries_opt
+        - Input:
+        :param SplearnArray X: object of shape [n_samples,n_features]
+               Training data. If X is not a SplearnArray, the hankel and
+               automaton attributes are reset to None and nothing is fitted.
+        :param ndarray y: (default value = None) not used by Spectral estimator
+               numpy array of shape [n_samples] Target values
+        - Output:
+        :returns: Spectral itself with an automaton attribute instantiated;
+                  returns an instance of self.
+        :rtype: Spectral
+        """
+        # Validation only: the array returned by check_array is discarded.
+        check_array(X)
+        if not isinstance(X, SplearnArray):
+            self._hankel = None
+            self._automaton = None
+            return self
+        X = self.polulate_dictionnaries_opt(X)
+        self._hankel = Hankel(sample_instance=X,
+                         lrows=self.lrows, lcolumns=self.lcolumns,
+                         version=self.version,
+                         partial=self.partial, sparse=self.sparse,
+                         mode_quiet=self.mode_quiet)
+        self._automaton = self._hankel.to_automaton(self.rank, self.mode_quiet)
+        # for the smooth option, compute the trigram dictionary
+        if self.smooth == 1:
+            self.trigram = self._threegramdict(X.sample)
+        return self
+    def polulate_dictionnaries_opt(self, X):
+        """Populate the *sample*, *pref*, *suff*, *fact* dictionaries of X
+        - Input:
+        :param SplearnArray X: object of shape [n_samples,n_features]
+               Training data
+        - Output:
+        :returns: X with its four dictionaries rebuilt from its rows; X is
+                  returned untouched when it is not a SplearnArray.
+        """
+        if not isinstance(X, SplearnArray):
+            return X
+        X.sample = {}  # dictionary (word,count)
+        X.pref = {}  # dictionary (prefix,count)
+        X.suff = {}  # dictionary (suffix,count)
+        X.fact = {}  # dictionary (factor,count)
+        if self.partial:
+            # lrows / lcolumns are either an int, used directly as the bound,
+            # or a sized collection, in which case its length is the bound.
+            if isinstance(self.lrows, int):
+                lrowsmax = self.lrows
+                version_rows_int = True
+            else:
+                version_rows_int = False
+                lrowsmax = self.lrows.__len__()
+            if isinstance(self.lcolumns, int):
+                lcolumnsmax = self.lcolumns
+                version_columns_int = True
+            else:
+                lcolumnsmax = self.lcolumns.__len__()
+                version_columns_int = False
+            lmax = lrowsmax + lcolumnsmax
+            for line in range(X.shape[0]):
+                self._populate_a_word(X, line, lrowsmax, version_rows_int, lcolumnsmax, version_columns_int, lmax)
+        else:
            for line in range(X.shape[0]):
+                self._populate_a_word(X, line)
+        return X
+    def _populate_a_word(self, X, line, lrowsmax=None, version_rows_int=None,
+                         lcolumnsmax=None, version_columns_int=None, lmax=None):
+            """Add the word stored in row *line* of X to the dictionaries of X
+            - Input:
+            :param SplearnArray X: sample whose *sample*, *pref*, *suff* and
+                   *fact* dictionaries are updated in place
+            :param int line: index of the row of X to read; negative entries
+                   of that row are dropped before the word is built
+            :param int lrowsmax: bound for rows, only read when self.partial
+            :param bool version_rows_int: True when self.lrows is an int,
+                   as set by polulate_dictionnaries_opt
+            :param int lcolumnsmax: bound for columns, only read when self.partial
+            :param bool version_columns_int: True when self.lcolumns is an int,
+                   as set by polulate_dictionnaries_opt
+            :param int lmax: lrowsmax + lcolumnsmax, only read when self.partial
+            """
            w = X[line, :]
            w = w[w >= 0]
            w = tuple([int(x) for x in w[0:]])
-            dsample[w] = dsample[w] + 1 if w in dsample else 1
+            X.sample[w] = X.sample.setdefault(w, 0) + 1
-        return dsample
+            if self.version == "prefix" or self.version == "classic":
+                # empty word treatment for the prefix, suffix, and factor dictionaries
-#     def _populate_new_word(self, X, i, lrowsmax=None, version_rows_int=None,
+                X.pref[()] = X.pref.setdefault((),0) + 1
-#                            lcolumnsmax=None, version_columns_int=None, lmax=None):
+            if self.version == "suffix" or self.version == "classic":
-#         w = X[i, :]
+                X.suff[()] = X.suff.setdefault((),0) + 1
-#         w = w[w >= 0]
+            # the empty factor is credited len(w) + 1 occurrences for each word
+            if (self.version == "factor" or self.version == "suffix" or
-#         w = tuple([int(x) for x in w[0:]])
+                self.version == "prefix"):
-#         with lock:
+                X.fact[()] = X.fact.setdefault((),0) + len(w) + 1
-#             X.sample[w] = X.sample.setdefault(w, 0) + 1
+            if self.partial:
-#         if self.version == "prefix" or self.version == "classic":
+                for i in range(len(w)):
-#             # empty word treatment for prefixe, suffix, and factor dictionnaries
+                    if self.version == "classic":
-#             with lock:
+                        if ((version_rows_int and i + 1 <= lrowsmax) or
-#                 X.pref[()] = X.pref[()] + 1 if () in X.pref else 1
+                           (not version_rows_int and w[:i + 1] in self.lrows)):
-#         if self.version == "suffix" or self.version == "classic":
+                            X.pref[w[:i + 1]] = X.pref.setdefault(w[:i + 1], 0) + 1
-#             with lock:
+                        if ((version_columns_int and i + 1 <= lcolumnsmax) or
-#                 X.suff[()] = X.suff[()] + 1 if () in X.suff else 1
+                           (not version_columns_int and w[-( i + 1):] in self.lcolumns)):
-#         if self.version == "factor" or self.version == "suffix" \
+                            X.suff[w[-(i + 1):]] = X.suff.setdefault(w[-(i + 1):], 0) + 1
-#                 or self.version == "prefix":
+                    elif self.version == "prefix":
-#             with lock:
+                        # the prefix dictionary is populated up to length
-#                 X.fact[()] = X.fact[()] + len(w) + 1 if () in X.fact else len(w) + 1
+                        # lmax = lrows + lcolumns
-# 
+                        # NOTE(review): factors were documented as bounded by lcolumns, but the test below uses lmax - confirm
-#         if self.partial:
+                        if (((version_rows_int or version_columns_int) and i + 1 <= lmax) or
-#             for i in range(len(w)):
+                             (not version_rows_int and w[:i + 1] in self.lrows) or
-#                 if self.version == "classic":
+                             (not version_columns_int  and w[:i + 1] in self.lcolumns)):
-#                     if (version_rows_int is True and
+                            X.pref[w[:i + 1]] = X.pref.setdefault(w[:i + 1], 0) + 1
-#                                     i + 1 <= lrowsmax) or \
+                        for j in range(i + 1, len(w) + 1):
-#                        (version_rows_int is False and
+                            if ((version_columns_int and (j - i) <= lmax) or 
-#                                     w[:i + 1] in self.lrows):
+                                (not version_columns_int and w[i:j] in self.lcolumns)):
-#                         with lock:
+                                X.fact[w[i:j]] = X.fact.setdefault(w[i:j], 0) + 1
-#                             X.pref[w[:i + 1]] = \
+                    elif self.version == "suffix":
+                        # NOTE(review): the bound here is i <= lmax, while the prefix branch uses i + 1 <= lmax - confirm
-#                                 X.pref[w[:i + 1]] + 1 if w[:i + 1] in X.pref else 1
+                        if (((version_rows_int or version_columns_int) and i <= lmax) or
-#                     if (version_columns_int is True and i + 1 <= lcolumnsmax) or \
+                             (not version_rows_int and w[-(i + 1):] in self.lrows) or
-#                        (version_columns_int is False and w[-( i + 1):] in self.lcolumns):
+                             (not version_columns_int and w[-(i + 1):] in self.lcolumns)):
-#                         with lock:
+                            X.suff[w[-(i + 1):]] = X.suff.setdefault(w[-(i + 1):], 0) + 1
-#                             X.suff[w[-(i + 1):]] = X.suff[w[-(i + 1):]] + 1 if \
+                        for j in range(i + 1, len(w) + 1):
-#                                 w[-(i + 1):] in X.suff else 1
+                            if ((version_rows_int and (j - i) <= lmax) or
-#                 if self.version == "prefix":
+                                (not version_rows_int and w[i:j] in self.lrows)):
-#                     # dictionaries dpref is populated until
+                                X.fact[w[i:j]] = X.fact.setdefault(w[i:j], 0) + 1
-#                     # lmax = lrows + lcolumns
+                    elif self.version == "factor":
-#                     # dictionaries dfact is populated until lcolumns
+                        for j in range(i + 1, len(w) + 1):
-#                     if ((version_rows_int is True or
+                            if (((version_rows_int or version_columns_int) and (j - i) <= lmax) or
-#                                  version_columns_int is True) and
+                                 (not version_rows_int and w[i:j] in self.lrows) or
-#                                     i + 1 <= lmax) or \
+                                 (not version_columns_int and w[i:j] in self.lcolumns)):
-#                             (version_rows_int is False and
+                                X.fact[w[i:j]] = X.fact.setdefault(w[i:j], 0) + 1
-#                                  (w[:i + 1] in self.lrows)) or \
+            else:  # not partial
-#                             (version_columns_int is False and
+                for i in range(len(w)):
-#                                  (w[:i + 1] in self.lcolumns)):
+                    X.pref[w[:i + 1]] = X.pref.setdefault(w[:i + 1], 0) + 1
-#                         X.pref[w[:i + 1]] = X.pref[w[:i + 1]] + 1 \
+                    X.suff[w[i:]] = X.suff.setdefault(w[i:], 0) + 1
-#                             if w[:i + 1] in X.pref else 1
+                    for j in range(i + 1, len(w) + 1):
-#                     for j in range(i + 1, len(w) + 1):
+                        X.fact[w[i:j]] = X.fact.setdefault(w[i:j], 0) + 1
-#                         if (version_columns_int is True and (
-#                             j - i) <= lmax) or \
-#                                 (version_columns_int is False and
-#                                      (w[i:j] in self.lcolumns)):
-#                             X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \
-#                                 if w[i:j] in X.fact else 1
-#                 if self.version == "suffix":
-#                     if ((version_rows_int is True or
-#                                  version_columns_int is True) and
-#                                 i <= lmax) or \
-#                             (version_rows_int is False and
-#                                  (w[-(i + 1):] in self.lrows)) or \
-#                             (version_columns_int is False and
-#                                  (w[-(i + 1):] in self.lcolumns)):
-#                         X.suff[w[-(i + 1):]] = X.suff[w[-(i + 1):]] + 1 \
-#                             if w[-(i + 1):] in X.suff else 1
-#                     for j in range(i + 1, len(w) + 1):
-#                         if (version_rows_int is True and (
-#                             j - i) <= lmax) or \
-#                                 (version_rows_int is False and
-#                                      (w[i:j] in self.lrows)):
-#                             X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \
-#                                 if w[i:j] in X.fact else 1
-#                 if self.version == "factor":
-#                     for j in range(i + 1, len(w) + 1):
-#                         if ((version_rows_int is True or
-#                                      version_columns_int is True) and
-#                                     (j - i) <= lmax) or \
-#                                 (version_rows_int is False and
-#                                      (w[i:j] in self.lrows)) or \
-#                                 (version_columns_int is False and
-#                                      (w[i:j] in self.lcolumns)):
-#                             X.fact[w[i:j]] = \
-#                                 X.fact[w[i:j]] + 1 if w[i:j] in X.fact else 1
-# 
-#         else:  # not partial
-#             for i in range(len(w)):
-#                 X.pref[w[:i + 1]] = X.pref[w[:i + 1]] + 1 \
-#                     if w[:i + 1] in X.pref else 1
-#                 X.suff[w[i:]] = X.suff[w[i:]] + 1 if w[i:] in X.suff else 1
-#                 for j in range(i + 1, len(w) + 1):
-#                     X.fact[w[i:j]] = X.fact[w[i:j]] + 1 \
-#                         if w[i:j] in X.fact else 1
    def polulate_dictionnaries(self, X):
        """Populates the *sample*, *pref*, *suff*, *fact* dictionnaries of X
@@ -460,6 +487,15 @@ class Spectral(BaseEstimator):
            X.pref = {}
        return X
+    def _populate_sample_dict(self, X):
+        """Count how many times each word occurs among the rows of X
+        - Input:
+        :param X: two-dimensional array indexed as X[line, :]; negative
+               entries of a row are dropped before the word is built
+        - Output:
+        :returns: dictionary mapping each word (tuple of int) to its count
+        :rtype: dict
+        """
+        dsample = {}  # dictionary (word,count)
+        for line in range(X.shape[0]):
+            w = X[line, :]
+            # keep only the non-negative entries of the row
+            w = w[w >= 0]
+            # tuples are hashable, so the word can be used as a dictionary key
+            w = tuple([int(x) for x in w[0:]])
+            dsample[w] = dsample[w] + 1 if w in dsample else 1
+        return dsample
    @property
    def trigram(self):
        """The trigram dictionary"""