Skip to content
Snippets Groups Projects
Commit 9d68b04f authored by Charly Lamothe
Browse files

Parallelize the kmeans forest regressor

parent 59e65276
No related branches found
No related tags found
1 merge request: !12 Resolve "integration-sota"
from bolsonaro.utils import tqdm_joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
......@@ -5,6 +7,8 @@ from sklearn.cluster import KMeans
from abc import abstractmethod, ABCMeta
import numpy as np
from scipy.stats import mode
from joblib import Parallel, delayed
from tqdm import tqdm
class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
......@@ -15,7 +19,7 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
def __init__(self, models_parameters):
    """Store the configuration and build the underlying random forest.

    :param models_parameters: configuration object. This constructor reads
        its ``hyperparameters['n_estimators']``, ``seed`` and
        ``extracted_forest_size`` attributes.
    """
    self._models_parameters = models_parameters
    # n_jobs=-1 asks scikit-learn to use every available processor when
    # fitting and predicting with the forest.
    self._regressor = RandomForestRegressor(
        n_estimators=models_parameters.hyperparameters['n_estimators'],
        random_state=models_parameters.seed,
        n_jobs=-1,
    )
    self._extracted_forest_size = models_parameters.extracted_forest_size
@property
......@@ -34,6 +38,8 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
labels = np.array(kmeans.labels_)
# for each cluster select the best tree on the validation set
"""
pruned_forest = list()
for c in range(self._extracted_forest_size):
index = np.where(labels == c)[0]
......@@ -43,10 +49,31 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
tree_pred = score_metric(y_val, y_val_pred)
cluster.append(tree_pred)
best_tree_index = np.argmax(cluster)
pruned_forest.append(self._regressor.estimators_[index[best_tree_index]])
pruned_forest.append(self._regressor.estimators_[index[best_tree_index]])"""
extracted_forest_sizes = list(range(self._extracted_forest_size))
with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=False)) as prune_forest_job_pb:
pruned_forest = Parallel(n_jobs=-1)(delayed(self._prune_forest_job)(prune_forest_job_pb,
extracted_forest_sizes[i], labels, X_val, y_val, score_metric)
for i in range(self._extracted_forest_size))
self._regressor.estimators_ = pruned_forest
def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
    """Return the highest-scoring tree among those assigned to cluster ``c``.

    :param prune_forest_job_pb: progress bar of the calling loop; its
        ``update()`` is called once when this cluster has been processed.
    :param c: cluster label to process.
    :param labels: array of cluster labels, compared element-wise with ``c``.
        Matching positions are used as indices into
        ``self._regressor.estimators_``.
    :param X_val: validation inputs, forwarded to ``_cluster_job``.
    :param y_val: validation targets, forwarded to ``_cluster_job``.
    :param score_metric: callable forwarded to ``_cluster_job``; the tree
        with the largest returned value is kept.
    :return: the selected element of ``self._regressor.estimators_``.
    :raises ValueError: if no tree carries the label ``c``.
    """
    index = np.where(labels == c)[0]
    if index.size == 0:
        # Fail before dispatching any work: np.argmax on an empty score list
        # would raise a ValueError that does not say which cluster is empty.
        raise ValueError(f"no tree is assigned to cluster {c}")

    with tqdm_joblib(tqdm(total=len(index), disable=False)) as cluster_job_pb:
        cluster = Parallel(n_jobs=-1)(
            delayed(self._cluster_job)(cluster_job_pb, tree_index, X_val, y_val, score_metric)
            for tree_index in index
        )

    # NOTE(review): argmax keeps the tree with the LARGEST score_metric value.
    # If callers pass an error metric (lower is better), this selects the
    # worst tree of the cluster -- confirm against the caller.
    best_tree_index = np.argmax(cluster)
    prune_forest_job_pb.update()
    return self._regressor.estimators_[index[best_tree_index]]
def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric):
    """Score a single tree of the forest on the validation data.

    :param cluster_job_pb: progress bar; ``update()`` is called once after
        the score has been computed.
    :param i: index of the tree in ``self._regressor.estimators_``.
    :param X_val: validation inputs passed to the tree's ``predict``.
    :param y_val: validation targets, given to ``score_metric`` as its
        first argument.
    :param score_metric: callable invoked as ``score_metric(y_val, y_pred)``.
    :return: the value returned by ``score_metric``.
    """
    tree = self._regressor.estimators_[i]
    score = score_metric(y_val, tree.predict(X_val))
    cluster_job_pb.update()
    return score
def predict(self, X):
    """Predict targets for ``X`` by delegating to the wrapped regressor.

    :param X: inputs forwarded unchanged to ``self._regressor.predict``.
    :return: whatever ``self._regressor.predict(X)`` returns.
    """
    forest = self._regressor
    return forest.predict(X)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment