Skip to content
Snippets Groups Projects
Commit 8ec2871e authored by Luc Giffon
Browse files

optimize + fix bug kmeans forest regressor

parent 4d4c0848
No related branches found
No related tags found
1 merge request: !23 Resolve "integration-sota"
import time
from bolsonaro.utils import tqdm_joblib from bolsonaro.utils import tqdm_joblib
from sklearn.ensemble import RandomForestRegressor from sklearn.ensemble import RandomForestRegressor
...@@ -35,29 +37,50 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta): ...@@ -35,29 +37,50 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
def fit(self, X_train, y_train, X_val, y_val):
    """Fit the forest, cluster its trees and keep one tree per cluster.

    The underlying estimator is fitted on the training data, every tree is
    described by its vector of predictions on ``X_train``, and KMeans groups
    the trees into ``self._extracted_forest_size`` clusters. Within each
    cluster, the tree with the lowest mean squared error on the validation
    set is kept. The selection is stored in ``self._selected_trees`` and
    replaces ``self._estimator.estimators_``.

    :param X_train: training samples; must expose ``.shape``.
    :param y_train: training targets.
    :param X_val: validation samples; must expose ``.shape``.
    :param y_val: validation targets, one value per validation sample.
    :raises ValueError: if ``self._score_metric`` is not
        ``mean_squared_error``, the only metric implemented here.
    """
    # Fail fast: previously the unsupported-metric error was raised without a
    # message, and only after the forest and KMeans had already been fitted.
    if self._score_metric != mean_squared_error:
        raise ValueError(
            "KMeansForestRegressor.fit only supports mean_squared_error "
            "as score metric, got {!r}".format(self._score_metric))

    self._estimator.fit(X_train, y_train)
    trees = self._estimator.estimators_

    # One row per tree: predictions on the validation and training sets.
    predictions_val = np.empty((len(trees), X_val.shape[0]))
    predictions = np.empty((len(trees), X_train.shape[0]))
    for i_tree, tree in enumerate(trees):
        predictions_val[i_tree, :] = tree.predict(X_val)
        predictions[i_tree, :] = tree.predict(X_train)

    kmeans = KMeans(n_clusters=self._extracted_forest_size,
                    random_state=self._models_parameters.seed).fit(predictions)
    labels = np.asarray(kmeans.labels_)

    # Compute the validation MSE of all trees at once using numpy machinery.
    # Flattening the targets keeps the broadcast against the
    # (n_trees, n_samples) matrix correct for column vectors and
    # pandas-like inputs as well.
    targets = np.asarray(y_val).ravel()
    mse_per_tree = np.mean((predictions_val - targets) ** 2, axis=1)

    pruned_forest = []
    for cluster_idx in range(self._extracted_forest_size):  # could be parallelized
        index_trees_cluster = np.where(labels == cluster_idx)[0]
        if index_trees_cluster.size == 0:
            # A cluster can be empty when fewer distinct trees exist than
            # requested clusters; argmin on an empty array would raise.
            continue
        # Best scoring tree of the cluster is the one with the lowest MSE.
        best_in_cluster = np.argmin(mse_per_tree[index_trees_cluster])
        pruned_forest.append(trees[index_trees_cluster[best_in_cluster]])

    self._selected_trees = pruned_forest
    self._estimator.estimators_ = pruned_forest
def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
    """Return the tree of cluster ``c`` with the highest ``_cluster_job`` value.

    :param prune_forest_job_pb: progress bar, updated once when the cluster is done.
    :param c: label of the cluster to process.
    :param labels: cluster label of every tree, compared element-wise with ``c``.
    :param X_val: validation samples forwarded to ``self._cluster_job``.
    :param y_val: validation targets forwarded to ``self._cluster_job``.
    :param score_metric: metric forwarded to ``self._cluster_job``.
    :return: the selected entry of ``self._estimator.estimators_``.
    """
    # Positions of the trees assigned to cluster ``c``.
    members = np.where(labels == c)[0]

    with tqdm_joblib(tqdm(total=len(members), disable=True)) as cluster_job_pb:
        scoring_jobs = (
            delayed(self._cluster_job)(cluster_job_pb, tree_idx, X_val, y_val, score_metric)
            for tree_idx in members
        )
        scores = Parallel(n_jobs=2)(scoring_jobs)

    # NOTE(review): argmax assumes ``_cluster_job`` returns a higher-is-better
    # value; that helper is not visible here -- confirm, since ``fit`` selects
    # trees by lowest mean squared error.
    winner = members[np.argmax(scores)]
    prune_forest_job_pb.update()
    return self._estimator.estimators_[winner]
...@@ -82,3 +105,14 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta): ...@@ -82,3 +105,14 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
def predict_base_estimator(self, X):
    """Return the predictions of the wrapped estimator for ``X``.

    :param X: samples passed unchanged to ``self._estimator.predict``.
    :return: whatever ``self._estimator.predict`` returns for ``X``.
    """
    base_estimator = self._estimator
    return base_estimator.predict(X)
if __name__ == "__main__":
    # Manual scaffold: loads a regression dataset and instantiates the model.
    # Nothing is fitted or evaluated here yet.
    from sklearn import datasets
    from bolsonaro.models.model_parameters import ModelParameters

    X, y = datasets.fetch_california_housing(return_X_y=True)

    # Keep a handle on the parameters; the instance used to be built and
    # immediately discarded.
    model_parameters = ModelParameters(extracted_forest_size=100,
                                       normalize_D=True,
                                       )

    # NOTE(review): the constructor of KMeansForestRegressor is not visible
    # from here; it presumably expects ``model_parameters`` -- confirm and
    # pass it before fitting on (X, y).
    k_reg = KMeansForestRegressor()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment