diff --git a/code/bolsonaro/models/kmeans_forest_regressor.py b/code/bolsonaro/models/kmeans_forest_regressor.py
index 181332d990b0fad85b521a139a4a6b610431b2bf..dc82b3b03c02c23a05e0cb41e0adf8ae1d9f8416 100644
--- a/code/bolsonaro/models/kmeans_forest_regressor.py
+++ b/code/bolsonaro/models/kmeans_forest_regressor.py
@@ -1,3 +1,5 @@
+from bolsonaro.utils import tqdm_joblib
+
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_squared_error
 from sklearn.base import BaseEstimator
@@ -5,6 +7,8 @@ from sklearn.cluster import KMeans
 from abc import abstractmethod, ABCMeta
 import numpy as np
 from scipy.stats import mode
+from joblib import Parallel, delayed
+from tqdm import tqdm
 
 
 class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
@@ -15,7 +19,7 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
     def __init__(self, models_parameters):
         self._models_parameters = models_parameters
         self._regressor = RandomForestRegressor(n_estimators=self._models_parameters.hyperparameters['n_estimators'],
-            random_state=models_parameters.seed)
+            random_state=models_parameters.seed, n_jobs=-1)
         self._extracted_forest_size = self._models_parameters.extracted_forest_size
 
     @property
@@ -34,6 +38,8 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
         labels = np.array(kmeans.labels_)
 
         # for each cluster select the best tree on the validation set
+
+        """
         pruned_forest = list()
         for c in range(self._extracted_forest_size):
             index = np.where(labels == c)[0]
@@ -43,10 +49,61 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
                 tree_pred = score_metric(y_val, y_val_pred)
                 cluster.append(tree_pred)
             best_tree_index = np.argmax(cluster)
-            pruned_forest.append(self._regressor.estimators_[index[best_tree_index]])
-
+            pruned_forest.append(self._regressor.estimators_[index[best_tree_index]])"""
+
+        # One joblib task per cluster label; each task returns the tree kept for that cluster.
+        with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=False)) as prune_forest_job_pb:
+            pruned_forest = Parallel(n_jobs=-1)(delayed(self._prune_forest_job)(prune_forest_job_pb,
+                c, labels, X_val, y_val, score_metric)
+                for c in range(self._extracted_forest_size))
+
         self._regressor.estimators_ = pruned_forest
 
+    def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
+        """
+        Pick, among the trees assigned to cluster `c`, the one whose validation
+        predictions get the largest `score_metric` value.
+
+        :param prune_forest_job_pb: progress bar; `update()` is called on it once the cluster is processed
+        :param c: label of the cluster to process
+        :param labels: array compared against `c` to find the positions of the trees in this cluster
+        :param X_val: validation inputs passed to each tree's `predict`
+        :param y_val: validation targets passed to `score_metric`
+        :param score_metric: callable invoked as `score_metric(y_val, y_val_pred)`
+        :return: the selected estimator taken from `self._regressor.estimators_`
+        :raises ValueError: if no tree is assigned to cluster `c`
+        """
+        index = np.where(labels == c)[0]
+        if len(index) == 0:
+            # np.argmax would fail on an empty list with a far less helpful message.
+            raise ValueError('No tree is assigned to cluster {}.'.format(c))
+        with tqdm_joblib(tqdm(total=len(index), disable=False)) as cluster_job_pb:
+            cluster = Parallel(n_jobs=-1)(delayed(self._cluster_job)(cluster_job_pb, tree_index, X_val,
+                y_val, score_metric) for tree_index in index)
+        # NOTE(review): argmax keeps the tree with the largest metric value; if score_metric is an
+        # error measure (lower is better), argmin would be needed - confirm against the callers.
+        best_tree_index = np.argmax(cluster)
+        # NOTE(review): tqdm_joblib is defined in bolsonaro.utils (not visible here); if it already
+        # advances the bar when a job completes, this explicit update() is redundant - confirm.
+        prune_forest_job_pb.update()
+        return self._regressor.estimators_[index[best_tree_index]]
+
+    def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric):
+        """
+        Score tree number `i` of the forest on the validation set.
+
+        :param cluster_job_pb: progress bar; `update()` is called on it once the tree is scored
+        :param i: position of the tree in `self._regressor.estimators_`
+        :param X_val: validation inputs passed to the tree's `predict`
+        :param y_val: validation targets passed to `score_metric`
+        :param score_metric: callable invoked as `score_metric(y_val, y_val_pred)`
+        :return: the value returned by `score_metric` for this tree
+        """
+        y_val_pred = self._regressor.estimators_[i].predict(X_val)
+        tree_pred = score_metric(y_val, y_val_pred)
+        cluster_job_pb.update()
+        return tree_pred
+
     def predict(self, X):
         return self._regressor.predict(X)
 