From ca59fee91a568bdb7082ec2d708103d7d89d55f5 Mon Sep 17 00:00:00 2001
From: Denis Arrivault <denis.arrivault@lif.univ-mrs.fr>
Date: Fri, 23 Feb 2018 18:51:25 +0100
Subject: [PATCH] Some "bad" (sic) ideas to improve the populate_dictionnary
 functions

---
 splearn/spectral.py | 152 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 145 insertions(+), 7 deletions(-)

diff --git a/splearn/spectral.py b/splearn/spectral.py
index 22f27f1..3ff0e4e 100644
--- a/splearn/spectral.py
+++ b/splearn/spectral.py
@@ -42,16 +42,18 @@ from __future__ import division, print_function
 import numpy as np
 import math
 import warnings
+from concurrent.futures import ThreadPoolExecutor, wait
 import threading
-
 lock = threading.RLock()
 
+
 from splearn.datasets.data_sample import SplearnArray
 from splearn.hankel import Hankel
 from sklearn.base import BaseEstimator
 from sklearn.utils import check_array
 from sklearn.utils.validation import NotFittedError
 
+
 class Spectral(BaseEstimator):
     """A Spectral estimator instance
 
@@ -249,7 +251,8 @@ class Spectral(BaseEstimator):
             self._hankel = None
             self._automaton = None
             return self
-        X = self.polulate_dictionnaries_opt(X)
+        #self.polulate_dictionnaries_opt(X)
+        self.polulate_dictionnaries_async(X)
         self._hankel = Hankel(sample_instance=X,
                          lrows=self.lrows, lcolumns=self.lcolumns,
                          version=self.version,
@@ -277,6 +280,8 @@ class Spectral(BaseEstimator):
         X.pref = {}  # dictionary (prefix,count)
         X.suff = {}  # dictionary (suffix,count)
         X.fact = {}  # dictionary (factor,count)
+        futures = []
+        pool = ThreadPoolExecutor(1)
         if self.partial:
             if isinstance(self.lrows, int):
                 lrowsmax = self.lrows
@@ -293,8 +298,10 @@ class Spectral(BaseEstimator):
             lmax = lrowsmax + lcolumnsmax
             #threads = []
             for line in range(X.shape[0]):
-                self._populate_a_word(X, line, lrowsmax, version_rows_int,
-                                      lcolumnsmax, version_columns_int, lmax)
+                futures.append(pool.submit(self._populate_a_word, X, line, lrowsmax, version_rows_int,
+                               lcolumnsmax, version_columns_int, lmax))
+#                 self._populate_a_word(X, line, lrowsmax, version_rows_int,
+#                                       lcolumnsmax, version_columns_int, lmax)
 #                                                 )
 #                 threads.append(threading.Thread(target = self._populate_a_word,
 #                                                 args=(X, line, lrowsmax, version_rows_int,
@@ -302,9 +309,10 @@ class Spectral(BaseEstimator):
 #                                                 ).start())
         else:
             for line in range(X.shape[0]):
-                self._populate_a_word(X, line)
-        return X
-    
+                futures.append(pool.submit(self._populate_a_word, X, line))
+#                self._populate_a_word(X, line)
+        wait(futures)
+
     def _populate_a_word_locked(self, X, line, lrowsmax=None, version_rows_int=None,
                          lcolumnsmax=None, version_columns_int=None, lmax=None):
         w = X[line, :]
@@ -431,6 +439,136 @@ class Spectral(BaseEstimator):
                 for j in range(i + 1, len(w) + 1):
                     X.fact[w[i:j]] = X.fact.setdefault(w[i:j], 0) + 1
 
+    def _populate_generator(self, X, lrowsmax=None, version_rows_int=None,
+                            lcolumnsmax=None, version_columns_int=None, lmax=None):
+        """Yield one ``(dict_name, key, extra)`` event per count to record."""
+        rows_int, cols_int = version_rows_int, version_columns_int
+        any_int = rows_int or cols_int
+        for row in range(X.shape[0]):
+            raw = X[row, :]
+            w = tuple([int(x) for x in raw[raw >= 0]])
+            size = len(w)
+            yield ('sample', w, 0)
+            # empty word: ``extra`` carries len(w) for the factor dictionary only
+            if self.version in ("prefix", "classic"):
+                yield ('pref', (), 0)
+            if self.version in ("suffix", "classic"):
+                yield ('suff', (), 0)
+            if self.version in ("factor", "suffix", "prefix"):
+                yield ('fact', (), size)
+            if not self.partial:
+                for i in range(size):
+                    yield ('pref', w[:i + 1], 0)
+                    yield ('suff', w[i:], 0)
+                    for j in range(i + 1, size + 1):
+                        yield ('fact', w[i:j], 0)
+                continue
+            for i in range(size):
+                head, tail = w[:i + 1], w[-(i + 1):]
+                if self.version == "classic":
+                    if ((rows_int and i + 1 <= lrowsmax) or
+                            (not rows_int and head in self.lrows)):
+                        yield ('pref', head, 0)
+                    if ((cols_int and i + 1 <= lcolumnsmax) or
+                            (not cols_int and tail in self.lcolumns)):
+                        yield ('suff', tail, 0)
+                elif self.version == "prefix":
+                    # with integer bounds, prefixes are kept up to length lmax
+                    if ((any_int and i + 1 <= lmax) or
+                            (not rows_int and head in self.lrows) or
+                            (not cols_int and head in self.lcolumns)):
+                        yield ('pref', head, 0)
+                    for j in range(i + 1, size + 1):
+                        if ((cols_int and (j - i) <= lmax) or
+                                (not cols_int and w[i:j] in self.lcolumns)):
+                            yield ('fact', w[i:j], 0)
+                elif self.version == "suffix":
+                    if ((any_int and i <= lmax) or
+                            (not rows_int and tail in self.lrows) or
+                            (not cols_int and tail in self.lcolumns)):
+                        yield ('suff', tail, 0)
+                    for j in range(i + 1, size + 1):
+                        if ((rows_int and (j - i) <= lmax) or
+                                (not rows_int and w[i:j] in self.lrows)):
+                            yield ('fact', w[i:j], 0)
+                elif self.version == "factor":
+                    for j in range(i + 1, size + 1):
+                        if ((any_int and (j - i) <= lmax) or
+                                (not rows_int and w[i:j] in self.lrows) or
+                                (not cols_int and w[i:j] in self.lcolumns)):
+                            yield ('fact', w[i:j], 0)
+    def _populate_coroutine(self, d):
+        """Sink coroutine (prime with next()): send (key, val) to add val + 1 to d[key]."""
+        while True:
+            key, val = (yield)
+            d[key] = d.setdefault(key, 0) + val + 1
+
+    def _populate_each_value(self, d, s, key, val):
+        """Add ``val + 1`` to ``d[key]``, treating a falsy ``val`` as 0."""
+        # ``s`` is accepted for the caller's convenience but never read here.
+        extra = val or 0
+        d[key] = d.setdefault(key, 0) + extra + 1
+
+    def polulate_dictionnaries_async(self, X):
+        """Populates the *sample*, *pref*, *suff*, *fact* dictionaries of X
+
+        The dictionaries are reset, then every ``(name, key, val)`` event
+        produced by ``_populate_generator`` is sent to the coroutine bound
+        to the dictionary called ``name``, which adds ``val + 1`` to the
+        count stored under ``key``.
+
+        When ``self.partial`` is set, the generator also receives the row and
+        column bounds: an integer ``lrows`` / ``lcolumns`` is passed as is,
+        any other value is replaced by its length, and ``lmax`` is the sum of
+        the two bounds.
+
+        - Input:
+
+        :param SplearnArray X: object of shape [n_samples,n_features]
+               Training data
+
+        - Output:
+
+        :returns: X itself when it is not a SplearnArray (nothing is
+                  populated in that case), None otherwise
+
+        """
+        if not isinstance(X, SplearnArray):
+            return X
+        X.sample = {}  # dictionary (word,count)
+        X.pref = {}  # dictionary (prefix,count)
+        X.suff = {}  # dictionary (suffix,count)
+        X.fact = {}  # dictionary (factor,count)
+        # One coroutine per dictionary, created and primed in the order
+        # sample, pref, suff, fact.
+        sinks = {}
+        for name, counts in (('sample', X.sample), ('pref', X.pref),
+                             ('suff', X.suff), ('fact', X.fact)):
+            sink = self._populate_coroutine(counts)
+            next(sink)  # advance to the first ``yield`` so send() works
+            sinks[name] = sink
+
+        # Extra positional arguments for the generator; empty when the
+        # population is not partial, so its defaults (None) apply.
+        bounds = ()
+        if self.partial:
+            version_rows_int = isinstance(self.lrows, int)
+            if version_rows_int:
+                lrowsmax = self.lrows
+            else:
+                lrowsmax = len(self.lrows)
+            version_columns_int = isinstance(self.lcolumns, int)
+            if version_columns_int:
+                lcolumnsmax = self.lcolumns
+            else:
+                lcolumnsmax = len(self.lcolumns)
+            bounds = (lrowsmax, version_rows_int,
+                      lcolumnsmax, version_columns_int,
+                      lrowsmax + lcolumnsmax)
+
+        for name, key, val in self._populate_generator(X, *bounds):
+            # an unrecognised name is counted in the sample dictionary
+            sinks.get(name, sinks['sample']).send((key, val))
 
     def polulate_dictionnaries(self, X):
         """Populates the *sample*, *pref*, *suff*, *fact* dictionnaries of X
-- 
GitLab