From ca59fee91a568bdb7082ec2d708103d7d89d55f5 Mon Sep 17 00:00:00 2001 From: Denis Arrivault <denis.arrivault@lif.univ-mrs.fr> Date: Fri, 23 Feb 2018 18:51:25 +0100 Subject: [PATCH] Some "bad" (sic) ideas to improve the populate_dictionnary functions --- splearn/spectral.py | 152 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 145 insertions(+), 7 deletions(-) diff --git a/splearn/spectral.py b/splearn/spectral.py index 22f27f1..3ff0e4e 100644 --- a/splearn/spectral.py +++ b/splearn/spectral.py @@ -42,16 +42,18 @@ from __future__ import division, print_function import numpy as np import math import warnings +from concurrent.futures import ThreadPoolExecutor, wait import threading - lock = threading.RLock() + from splearn.datasets.data_sample import SplearnArray from splearn.hankel import Hankel from sklearn.base import BaseEstimator from sklearn.utils import check_array from sklearn.utils.validation import NotFittedError + class Spectral(BaseEstimator): """A Spectral estimator instance @@ -249,7 +251,8 @@ class Spectral(BaseEstimator): self._hankel = None self._automaton = None return self - X = self.polulate_dictionnaries_opt(X) + #self.polulate_dictionnaries_opt(X) + self.polulate_dictionnaries_async(X) self._hankel = Hankel(sample_instance=X, lrows=self.lrows, lcolumns=self.lcolumns, version=self.version, @@ -277,6 +280,8 @@ class Spectral(BaseEstimator): X.pref = {} # dictionary (prefix,count) X.suff = {} # dictionary (suffix,count) X.fact = {} # dictionary (factor,count) + futures = [] + pool = ThreadPoolExecutor(1) if self.partial: if isinstance(self.lrows, int): lrowsmax = self.lrows @@ -293,8 +298,10 @@ class Spectral(BaseEstimator): lmax = lrowsmax + lcolumnsmax #threads = [] for line in range(X.shape[0]): - self._populate_a_word(X, line, lrowsmax, version_rows_int, - lcolumnsmax, version_columns_int, lmax) + futures.append(pool.submit(self._populate_a_word, X, line, lrowsmax, version_rows_int, + lcolumnsmax, version_columns_int, lmax)) +# self._populate_a_word(X, line, lrowsmax, version_rows_int, +# lcolumnsmax, version_columns_int, lmax) # ) # threads.append(threading.Thread(target = self._populate_a_word, # args=(X, line, lrowsmax, version_rows_int, @@ -302,9 +309,10 @@ class Spectral(BaseEstimator): # ).start()) else: for line in range(X.shape[0]): - self._populate_a_word(X, line) - return X - + futures.append(pool.submit(self._populate_a_word, X, line)) +# self._populate_a_word(X, line) + wait(futures) + def _populate_a_word_locked(self, X, line, lrowsmax=None, version_rows_int=None, lcolumnsmax=None, version_columns_int=None, lmax=None): w = X[line, :] @@ -431,6 +439,136 @@ class Spectral(BaseEstimator): for j in range(i + 1, len(w) + 1): X.fact[w[i:j]] = X.fact.setdefault(w[i:j], 0) + 1 + def _populate_generator(self, X, lrowsmax=None, version_rows_int=None, + lcolumnsmax=None, version_columns_int=None, lmax=None): + for line in range(X.shape[0]): + w = X[line, :] + w = w[w >= 0] + w = tuple([int(x) for x in w[0:]]) + yield ('sample', w, 0) + if self.version == "prefix" or self.version == "classic": + # empty word treatment for prefixe, suffix, and factor dictionnaries + yield ('pref', (), 0) + if self.version == "suffix" or self.version == "classic": + yield ('suff', (), 0) + if (self.version == "factor" or self.version == "suffix" or + self.version == "prefix"): + yield ('fact', (), len(w)) + if self.partial: + for i in range(len(w)): + if self.version == "classic": + if ((version_rows_int and i + 1 <= lrowsmax) or + (not version_rows_int and w[:i + 1] in self.lrows)): + yield ('pref', w[:i + 1], 0) + if ((version_columns_int and i + 1 <= lcolumnsmax) or + (not version_columns_int and w[-( i + 1):] in self.lcolumns)): + yield ('suff', w[-(i + 1):], 0) + elif self.version == "prefix": + # dictionaries dpref is populated until + # lmax = lrows + lcolumns + # dictionaries dfact is populated until lcolumns + if (((version_rows_int or version_columns_int) and i + 1 <= lmax) or + (not version_rows_int and w[:i + 1] in self.lrows) or + (not version_columns_int and w[:i + 1] in self.lcolumns)): + yield ('pref', w[:i + 1], 0) + for j in range(i + 1, len(w) + 1): + if ((version_columns_int and (j - i) <= lmax) or + (not version_columns_int and w[i:j] in self.lcolumns)): + yield ('fact', w[i:j], 0) + elif self.version == "suffix": + if (((version_rows_int or version_columns_int) and i <= lmax) or + (not version_rows_int and w[-(i + 1):] in self.lrows) or + (not version_columns_int and w[-(i + 1):] in self.lcolumns)): + yield ('suff', w[-(i + 1):], 0) + for j in range(i + 1, len(w) + 1): + if ((version_rows_int and (j - i) <= lmax) or + (not version_rows_int and w[i:j] in self.lrows)): + yield ('fact', w[i:j], 0) + elif self.version == "factor": + for j in range(i + 1, len(w) + 1): + if (((version_rows_int or version_columns_int) and (j - i) <= lmax) or + (not version_rows_int and w[i:j] in self.lrows) or + (not version_columns_int and w[i:j] in self.lcolumns)): + yield ('fact', w[i:j], 0) + else: # not partial + for i in range(len(w)): + yield ('pref', w[:i + 1], 0) + yield ('suff', w[i:], 0) + for j in range(i + 1, len(w) + 1): + yield ('fact', w[i:j], 0) + + def _populate_coroutine(self, d): + print("Ready to populate") + while True: + key, val = (yield) + d[key] = d.setdefault(key, 0) + val + 1 + + def _populate_each_value(self, d, s, key, val): + if val: + d[key] = d.setdefault(key, 0) + val + 1 + else: + d[key] = d.setdefault(key, 0) + 1 + + def polulate_dictionnaries_async(self, X): + """Populates the *sample*, *pref*, *suff*, *fact* dictionnaries of X + + - Input: + + :param SplearnArray X: object of shape [n_samples,n_features] + Training data + + """ + if not isinstance(X, SplearnArray): + return X + X.sample = {} # dictionary (word,count) + X.pref = {} # dictionary (prefix,count) + X.suff = {} # dictionary (suffix,count) + X.fact = {} # dictionary (factor,count) + rsample = self._populate_coroutine(X.sample) + next(rsample) + rpref = self._populate_coroutine(X.pref) + next(rpref) + rsuff = self._populate_coroutine(X.suff) + next(rsuff) + rfact = self._populate_coroutine(X.fact) + next(rfact) + if self.partial: + if isinstance(self.lrows, int): + lrowsmax = self.lrows + version_rows_int = True + else: + version_rows_int = False + lrowsmax = self.lrows.__len__() + if isinstance(self.lcolumns, int): + lcolumnsmax = self.lcolumns + version_columns_int = True + else: + lcolumnsmax = self.lcolumns.__len__() + version_columns_int = False + lmax = lrowsmax + lcolumnsmax + for s, key, val in self._populate_generator(X, lrowsmax, version_rows_int, lcolumnsmax, version_columns_int, lmax): +# d = getattr(X, s) +# self._populate_each_value(d, s, key, val) + if s == 'fact': + rfact.send((key, val)) + elif s == 'pref': + rpref.send((key, val)) + elif s == 'suff': + rsuff.send((key, val)) + else: + rsample.send((key, val)) + else: + for s, key, val in self._populate_generator(X): +# d = getattr(X, s) +# self._populate_each_value(d, s, key, val) + if s == 'fact': + rfact.send((key, val)) + elif s == 'pref': + rpref.send((key, val)) + elif s == 'suff': + rsuff.send((key, val)) + else: + rsample.send((key, val)) def polulate_dictionnaries(self, X): """Populates the *sample*, *pref*, *suff*, *fact* dictionnaries of X -- GitLab