Skip to content
Snippets Groups Projects
Commit 9d68b04f authored by Charly Lamothe
Browse files

Parallelize the kmeans forest regressor

parent 59e65276
Branches
No related tags found
1 merge request: !12 Resolve "integration-sota"
from bolsonaro.utils import tqdm_joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
......@@ -5,6 +7,8 @@ from sklearn.cluster import KMeans
from abc import abstractmethod, ABCMeta
import numpy as np
from scipy.stats import mode
from joblib import Parallel, delayed
from tqdm import tqdm
class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
......@@ -15,7 +19,7 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
def __init__(self, models_parameters):
    """Store the run parameters and build the underlying random forest.

    :param models_parameters: parameter object; this constructor reads its
        ``hyperparameters['n_estimators']``, ``seed`` and
        ``extracted_forest_size`` attributes.
    """
    self._models_parameters = models_parameters
    # Single, well-formed argument list: the forest is seeded from the
    # parameters and n_jobs=-1 is forwarded to scikit-learn.
    self._regressor = RandomForestRegressor(
        n_estimators=self._models_parameters.hyperparameters['n_estimators'],
        random_state=models_parameters.seed,
        n_jobs=-1)
    self._extracted_forest_size = self._models_parameters.extracted_forest_size
@property
......@@ -34,6 +38,8 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
labels = np.array(kmeans.labels_)
# for each cluster select the best tree on the validation set
"""
pruned_forest = list()
for c in range(self._extracted_forest_size):
index = np.where(labels == c)[0]
......@@ -43,10 +49,31 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
tree_pred = score_metric(y_val, y_val_pred)
cluster.append(tree_pred)
best_tree_index = np.argmax(cluster)
pruned_forest.append(self._regressor.estimators_[index[best_tree_index]])
pruned_forest.append(self._regressor.estimators_[index[best_tree_index]])"""
extracted_forest_sizes = list(range(self._extracted_forest_size))
with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=False)) as prune_forest_job_pb:
pruned_forest = Parallel(n_jobs=-1)(delayed(self._prune_forest_job)(prune_forest_job_pb,
extracted_forest_sizes[i], labels, X_val, y_val, score_metric)
for i in range(self._extracted_forest_size))
self._regressor.estimators_ = pruned_forest
def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
    """Return the tree of cluster ``c`` whose validation score is the largest.

    :param prune_forest_job_pb: progress bar of the caller's loop; its
        ``update()`` is called once when this cluster has been processed.
    :param c: cluster label to process.
    :param labels: array of cluster labels; positions in it are used as
        indices into ``self._regressor.estimators_``.
    :param X_val: validation inputs passed to each tree's ``predict``.
    :param y_val: validation targets passed to ``score_metric``.
    :param score_metric: callable invoked as ``score_metric(y_val, y_pred)``.
    :return: the selected element of ``self._regressor.estimators_``.
    :raises ValueError: if no entry of ``labels`` equals ``c``.
    """
    index = np.where(labels == c)[0]
    if index.size == 0:
        # np.argmax on an empty score list would fail with an opaque
        # "empty sequence" error; name the offending cluster instead.
        raise ValueError(
            'cluster {} contains no tree; cannot select a best tree'.format(c))

    # NOTE(review): this runs a second Parallel(n_jobs=-1) from inside a job
    # that is itself dispatched by Parallel -- confirm the nesting is wanted.
    with tqdm_joblib(tqdm(total=len(index), disable=False)) as cluster_job_pb:
        cluster = Parallel(n_jobs=-1)(
            delayed(self._cluster_job)(cluster_job_pb, tree_index, X_val, y_val, score_metric)
            for tree_index in index)

    # NOTE(review): argmax keeps the tree with the LARGEST metric value; if
    # score_metric is a loss (e.g. mean_squared_error) that is the worst
    # tree -- confirm the intended direction with the caller.
    best_tree_index = np.argmax(cluster)
    # NOTE(review): presumably tqdm_joblib already advances the bar on job
    # completion; verify this explicit update does not double count.
    prune_forest_job_pb.update()
    return self._regressor.estimators_[index[best_tree_index]]
def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric):
    """Score a single tree of the forest on the validation data.

    :param cluster_job_pb: progress bar; ``update()`` is called on it once,
        after the score has been computed.
    :param i: index of the tree in ``self._regressor.estimators_``.
    :param X_val: validation inputs passed to the tree's ``predict``.
    :param y_val: validation targets, first argument of ``score_metric``.
    :param score_metric: callable invoked as ``score_metric(y_val, y_pred)``.
    :return: the value returned by ``score_metric``.
    """
    y_val_pred = self._regressor.estimators_[i].predict(X_val)
    tree_pred = score_metric(y_val, y_val_pred)
    cluster_job_pb.update()
    return tree_pred
def predict(self, X):
    """Return the predictions of the wrapped regressor for ``X``.

    :param X: inputs forwarded unchanged to ``self._regressor.predict``.
    :return: whatever ``self._regressor.predict(X)`` returns.
    """
    return self._regressor.predict(X)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment