import time

from bolsonaro.utils import tqdm_joblib

from sklearn.ensemble import RandomForestRegressor

# NOTE(review): the patch this excerpt was taken from only shows the regions
# it touches. The remaining imports are not visible; the code below uses np,
# KMeans, Parallel, delayed, tqdm, BaseEstimator, ABCMeta and
# mean_squared_error and assumes they are imported in the hidden part.


class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
    # NOTE(review): members defined before `fit` and between
    # `_prune_forest_job` and `predict_base_estimator` are outside the patch
    # hunks and are not reproduced here (this includes whatever sets
    # `_estimator`, `_extracted_forest_size`, `_models_parameters` and
    # `_score_metric`, and the `_cluster_job` helper).

    def fit(self, X_train, y_train, X_val, y_val):
        """Fit the forest, cluster its trees with k-means, keep one tree per cluster.

        The underlying estimator is fitted on the training data. Every tree is
        then described by its vector of training predictions, and these vectors
        are clustered into ``self._extracted_forest_size`` groups. Inside each
        group the tree with the lowest mean squared error on the validation
        data is kept.

        Args:
            X_train: training inputs, passed to ``self._estimator.fit`` and to
                each tree's ``predict``.
            y_train: training targets, passed to ``self._estimator.fit``.
            X_val: validation inputs, passed to each tree's ``predict``.
            y_val: validation targets; flattened to one dimension before being
                compared with the per-tree validation predictions.

        Raises:
            ValueError: if ``self._score_metric`` is not
                ``mean_squared_error`` (the only metric implemented here), or
                if a cluster label has no tree assigned to it.

        Side effects:
            Sets ``self._selected_trees`` and overwrites
            ``self._estimator.estimators_`` with the selected trees.
        """
        # Only the vectorised MSE path exists, so reject anything else before
        # paying for the forest and k-means fits.
        if self._score_metric != mean_squared_error:
            raise ValueError(
                "KMeansForestRegressor.fit only supports mean_squared_error "
                "as score metric, got {!r}".format(self._score_metric))

        self._estimator.fit(X_train, y_train)
        trees = self._estimator.estimators_

        # Row i holds the predictions of tree i.
        predictions = np.array([tree.predict(X_train) for tree in trees])
        predictions_val = np.array([tree.predict(X_val) for tree in trees])

        kmeans = KMeans(n_clusters=self._extracted_forest_size,
                        random_state=self._models_parameters.seed).fit(predictions)
        labels = np.asarray(kmeans.labels_)

        # Validation MSE of every tree, computed once for the whole forest.
        # ravel() makes a column-vector y_val broadcast against the rows
        # instead of against the tree axis.
        tree_mse = np.mean((predictions_val - np.ravel(y_val)) ** 2, axis=1)

        selected_trees = []
        for cluster_idx in range(self._extracted_forest_size):  # could be parallelised
            members = np.flatnonzero(labels == cluster_idx)
            if members.size == 0:
                # argmin on an empty selection would raise an opaque NumPy
                # error; report which cluster is empty instead.
                raise ValueError(
                    "k-means cluster {} contains no tree; cannot extract {} "
                    "trees".format(cluster_idx, self._extracted_forest_size))
            # Lowest validation MSE wins inside the cluster.
            best_tree_index = members[np.argmin(tree_mse[members])]
            selected_trees.append(trees[best_tree_index])

        self._selected_trees = selected_trees
        self._estimator.estimators_ = selected_trees

    def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
        """Return the tree of cluster ``c`` with the highest ``_cluster_job`` result.

        Earlier joblib-based selection routine; ``fit`` above does not call it.

        Args:
            prune_forest_job_pb: progress object whose ``update()`` is called
                once the cluster has been processed.
            c: cluster label to process.
            labels: array of cluster labels, one per tree.
            X_val: validation inputs forwarded to ``_cluster_job``.
            y_val: validation targets forwarded to ``_cluster_job``.
            score_metric: metric forwarded to ``_cluster_job``.

        Returns:
            The element of ``self._estimator.estimators_`` selected for the
            cluster.
        """
        index = np.where(labels == c)[0]
        with tqdm_joblib(tqdm(total=len(index), disable=True)) as cluster_job_pb:
            cluster = Parallel(n_jobs=2)(
                delayed(self._cluster_job)(cluster_job_pb, tree_idx, X_val, y_val, score_metric)
                for tree_idx in index)
        # NOTE(review): this keeps the tree with the LARGEST value returned by
        # `_cluster_job`, while `fit` keeps the tree with the SMALLEST MSE.
        # `_cluster_job` is not visible here; if it returns a raw error such as
        # MSE, this should be argmin -- confirm before reusing this method.
        best_tree_index = np.argmax(cluster)
        prune_forest_job_pb.update()
        return self._estimator.estimators_[index[best_tree_index]]

    def predict_base_estimator(self, X):
        """Return ``self._estimator.predict(X)``."""
        return self._estimator.predict(X)


if __name__ == "__main__":
    from sklearn import datasets
    from bolsonaro.models.model_parameters import ModelParameters

    # NOTE(review): unfinished scratch code. X and y are loaded but never
    # used, the ModelParameters instance is discarded, and the regressor is
    # built without arguments although `fit` reads `self._models_parameters`.
    # The constructor signatures are not visible here -- TODO confirm and
    # complete, or remove.
    X, y = datasets.fetch_california_housing(return_X_y=True)
    ModelParameters(extracted_forest_size=100,
                    normalize_D=True,
                    )

    k_reg = KMeansForestRegressor()