Skip to content
Snippets Groups Projects
Commit 146a9900 authored by Léo Bouscarrat's avatar Léo Bouscarrat
Browse files

Correction of a HUGE issue

parent 3777cf65
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
# Groupe de travail
Le but de ce notebook est de tester l'idée de réduction des random forest
%% Cell type:markdown id: tags:
## Import scikit-learn
%% Cell type:code id: tags:
``` python
from statistics import mean
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_boston, load_breast_cancer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors.kde import KernelDensity
```
%% Cell type:markdown id: tags:
## Variables globales
%% Cell type:code id: tags:
``` python
# Number of trees in the full random forest trained by the experiment cells below.
NB_TREES = 100
```
%% Cell type:markdown id: tags:
## Chargement du jeu de données
%% Cell type:code id: tags:
``` python
# Load the Boston dataset as an (inputs, targets) pair used by every experiment below.
# NOTE(review): load_boston may no longer be shipped by recent scikit-learn releases -
# confirm against the scikit-learn version pinned for this notebook.
X, y = load_boston(return_X_y=True)
```
%% Cell type:code id: tags:
``` python
def train_forest(X_train, y_train, nb_trees, random_seed):
    '''
    Fit a random forest regressor made of nb_trees trees.

    :param X_train: training inputs
    :param y_train: training targets
    :param nb_trees: int, number of trees in the forest
    :param random_seed: int, value used as the forest's random_state
    :return: the fitted RandomForestRegressor
    '''
    forest = RandomForestRegressor(random_state=random_seed, n_estimators=nb_trees)
    forest.fit(X_train, y_train)
    return forest
def extract_subforest(random_forest, X_train, y_train, nb_trees_extracted):
    '''
    Compute per-tree weights for a sub-forest of random_forest using
    Orthogonal Matching Pursuit (OMP).

    :param random_forest: a fitted RandomForestRegressor
    :param X_train: inputs used to fit the OMP
    :param y_train: targets used to fit the OMP
    :param nb_trees_extracted: int, number of non-zero coefficients requested from the OMP
    :return: array of float (omp.coef_), one weight per tree of random_forest
    '''
    # Dictionary matrix: one row per sample, one column per tree, each entry
    # being the prediction of that tree for that sample. Each tree predicts the
    # whole batch at once instead of one sample at a time.
    D = np.column_stack([tree.predict(X_train) for tree in random_forest.estimators_])

    # No intercept: the extracted forest is evaluated as a plain weighted sum of
    # tree predictions, so the fit must not rely on an additive constant.
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=nb_trees_extracted, fit_intercept=False, normalize=False)
    omp.fit(D, y_train)
    return omp.coef_
def compute_results(weights, random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test,
                    nb_trees, nb_trees_extracted, random_seed):
    '''
    Compute the MSE on (X_test, y_test) of the full forest and of several reduced variants.

    :param weights: weights given by the OMP, one per tree (at least nb_trees entries)
    :param random_forest: a fitted RandomForestRegressor
    :param X_train: list of inputs
    :param X_dev: list of inputs
    :param X_test: list of inputs
    :param y_train: list of results
    :param y_dev: list of results
    :param y_test: list of results
    :param nb_trees: int, number of trees in the main forest
    :param nb_trees_extracted: int, number of trees extracted from the main forest
    :param random_seed: int, seed for the random_states
    :return: tuple of 6 elements, in order: MSE of the main forest, MSE of the weighted
             sum of the trees, MSE of the weighted sum divided by the sum of the weights,
             MSE of the plain mean of the trees whose absolute weight is >= 0.01, MSE of a
             forest of nb_trees_extracted trees trained on train + dev, and the weights
             that were passed in
    :raises StatisticsError: if none of the first nb_trees weights reaches 0.01 in absolute value
    '''
    from statistics import StatisticsError

    # Full forest.
    res_base_forest = mean_squared_error(y_test, random_forest.predict(X_test))

    # Predictions of every tree for the whole test batch: shape (n_samples, nb_trees).
    trees = [random_forest.estimators_[i] for i in range(nb_trees)]
    tree_preds = np.column_stack([tree.predict(X_test) for tree in trees])
    tree_weights = np.asarray([weights[i] for i in range(nb_trees)], dtype=float)

    # Extracted forest, each tree multiplied by its OMP weight.
    weighted_pred = tree_preds.dot(tree_weights)
    res_extract_weight = mean_squared_error(y_test, weighted_pred)

    # Same weighted sum, normalized by the sum of all the weights.
    res_extract_weight_norm = mean_squared_error(y_test, weighted_pred / np.sum(weights))

    # Plain average of the extracted trees; a tree counts as extracted when its
    # absolute weight is at least 0.01.
    selected = np.abs(tree_weights) >= 0.01
    if not selected.any():
        # Same exception type statistics.mean raises on an empty selection.
        raise StatisticsError('mean requires at least one data point')
    res_extract_mean = mean_squared_error(y_test, tree_preds[:, selected].mean(axis=1))

    # Forest trained directly with as many trees as were extracted, on train + dev.
    small_forest = train_forest(np.concatenate((X_train, X_dev)), np.concatenate((y_train, y_dev)),
                                nb_trees_extracted, random_seed)
    res_small_forest = mean_squared_error(y_test, small_forest.predict(X_test))

    return res_base_forest, res_extract_weight, res_extract_weight_norm, res_extract_mean, res_small_forest, weights
def extract_and_get_results(random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test, nb_trees,
                            nb_trees_extracted, random_seed):
    '''
    Extract the subforest on the dev set and return the results of the different methods.

    :param random_forest: a fitted RandomForestRegressor
    :param X_train: list of inputs
    :param X_dev: list of inputs
    :param X_test: list of inputs
    :param y_train: list of results
    :param y_dev: list of results
    :param y_test: list of results
    :param nb_trees: int, number of trees in the main forest
    :param nb_trees_extracted: int, number of trees extracted from the main forest
    :param random_seed: int, seed for the random_states
    :return: the 6-tuple returned by compute_results: 5 scores followed by the OMP weights
    '''
    weights = extract_subforest(random_forest, X_dev, y_dev, nb_trees_extracted)
    # compute_results already returns the five scores followed by the weights, so its
    # 6-tuple is forwarded as is (unpacking it into five names raised ValueError).
    return compute_results(weights, random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test,
                           nb_trees, nb_trees_extracted, random_seed)
def train_extract_subforest(X_train, X_test, y_train, y_test, nb_trees, nb_trees_extracted, random_seed):
    '''
    Train a forest with nb_trees trees, extract nb_trees_extracted of them with OMP
    (fit on the training set itself) and compare the results of the different methods.

    :param X_train: list of inputs
    :param X_test: list of inputs
    :param y_train: list of results
    :param y_test: list of results
    :param nb_trees: int, number of trees in the main forest
    :param nb_trees_extracted: int, number of trees extracted from the main forest
    :param random_seed: int, seed for the random_states
    :return: 4 results of 4 different methods, in order: results of the main forest,
             results of the weighted results of the extracted trees, results of the mean results
             of the extracted trees, results of a random_forest train with nb_trees_extracted directly
    '''
    random_forest = train_forest(X_train, y_train, nb_trees, random_seed)
    weight = extract_subforest(random_forest, X_train, y_train, nb_trees_extracted)

    # This entry point has no dev split, but compute_results requires one: pass empty
    # dev arrays so that the small comparison forest is trained on X_train only.
    X_dev = np.asarray(X_train)[:0]
    y_dev = np.asarray(y_train)[:0]

    # compute_results returns 6 values; only the 4 documented above are forwarded.
    res_base_forest, res_extract_weight, _, res_extract_mean, res_small_forest, _ = \
        compute_results(weight, random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test,
                        nb_trees, nb_trees_extracted, random_seed)
    return res_base_forest, res_extract_weight, res_extract_mean, res_small_forest
```
%% Cell type:code id: tags:
``` python
# Run the whole experiment once per seed and collect, for every number of extracted
# trees, the scores on the test set (results_global) and on the dev set (results_dev_global).
results_global = []
results_dev_global = []
# Tied to NB_TREES so the value passed to compute_results always matches the size of
# the forest that is actually trained below.
nb_trees = NB_TREES
random_seeds = list(range(10))
for random_seed in random_seeds:
    # 80/20 train/test split, then the 80% part is halved into train and dev.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
    X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.5, random_state=random_seed)
    random_forest = train_forest(X_train, y_train, NB_TREES, random_seed)
    results = []
    results_dev = []
    for nb_trees_extracted in [int(NB_TREES/k) for k in [2, 5, 10, 20, 50, 100]]:
        # The OMP weights are fit on the dev set, never on the data the trees were trained on.
        weights = extract_subforest(random_forest, X_dev, y_dev, nb_trees_extracted)
        results.append(compute_results(weights, random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test,
                                       nb_trees, nb_trees_extracted, random_seed))
        results_dev.append(compute_results(weights, random_forest, X_train, X_dev, X_dev, y_train, y_dev, y_dev,
                                           nb_trees, nb_trees_extracted, random_seed))
    results_global.append(results)
    results_dev_global.append(results_dev)
print('over')
```
%% Cell type:code id: tags:
``` python
def plot_results(results_global, title_graph):
    '''
    Plot, for each of the 5 methods, the mean MSE over the experiments as a function of
    the number of extracted trees, with a shaded band of one standard deviation on each side.

    :param results_global: list (one entry per experiment) of lists (one entry per number of
                           extracted trees) of result tuples whose first 5 elements are scores
    :param title_graph: str, title of the figure
    '''
    def plot_mean_and_CI(mean, lb, ub, x_value, color_mean=None, color_shading=None, label=None):
        # plot the shaded range between the lower and upper bounds
        plt.fill_between(x_value, ub, lb,
                         color=color_shading, alpha=.5)
        # plot the mean on top
        plt.plot(x_value, mean, color=color_mean, label=label)

    # scores[e][i][k]: experiment e, i-th number of extracted trees, method k.
    scores = np.array([[res[:5] for res in results] for results in results_global], dtype=float)
    # Aggregate over the experiments, then put the method on the first axis.
    means_results = scores.mean(axis=0).T
    std_results = scores.std(axis=0).T

    x_value = [int(NB_TREES/k) for k in [2, 5, 10, 20, 50, 100]]

    # (color, legend label) of each method, in the order of the result tuples
    series = [
        ('k', 'Results of the base forest (on train set)'),
        ('darkorange', 'Weighted results of the extracted trees'),
        ('red', 'Weighted results of the extracted trees normalized'),
        ('b', 'Mean results of the extracted trees'),
        ('g', 'Results of a forest train with number of trees extracted (train+dev set)'),
    ]

    # plot the data
    plt.figure(1, figsize=(15, 10))
    for k, (color, label) in enumerate(series):
        plot_mean_and_CI(means_results[k],
                         means_results[k] - std_results[k],
                         means_results[k] + std_results[k],
                         x_value, color_mean=color, color_shading=color, label=label)
    plt.xlabel('Number of trees extracted')
    plt.ylabel('MSE')
    plt.title(title_graph)
    plt.legend(loc="upper right")
```
%% Cell type:code id: tags:
``` python
# results_global holds the scores computed on (X_test, y_test), hence "test set" in the title.
plot_results(results_global, 'Reduction of a forest with 100 trees, 10 iterations with different seed, score on test set')
```
%% Cell type:code id: tags:
``` python
# Same figure for the scores computed on the dev set (the set the OMP weights were fit on).
plot_results(results_dev_global, 'Reduction of a forest with 100 trees, 10 iterations with different seed, score on dev set')
```
%% Cell type:code id: tags:
``` python
# One figure per experiment (seed): MSE of every method against the number of extracted trees.
for results in results_global:
    # Same grid as the one used to produce results_global, so that every score is
    # plotted against the number of trees it was actually computed with.
    x_value = [int(NB_TREES/k) for k in [2, 5, 10, 20, 50, 100]]
    # Create/select the figure before drawing so that figsize is taken into account.
    plt.figure(1, figsize=(15, 10))
    plt.xlabel('Number of trees extracted')
    plt.ylabel('MSE')
    plt.plot(x_value, [elem[1] for elem in results], color='darkorange',
             label='Weighted results of the average trees')
    plt.plot(x_value, [elem[2] for elem in results], color='red',
             label='Weighted results of the average trees normalized')
    plt.plot(x_value, [elem[3] for elem in results], color='blue',
             label='Mean results of the average trees')
    plt.plot(x_value, [elem[4] for elem in results], color='green',
             label='Results of a forest train with number of trees extracted')
    plt.plot(x_value, [elem[0] for elem in results], color='black',
             label='Results of the base forest')
    plt.legend(loc="upper right")
    fig_acc_rec = plt.gcf()
    plt.show()
```
%% Cell type:code id: tags:
``` python
def weight_density(list_weight):
    '''
    Print list_weight and plot kernel density estimates of its exponentiated values
    for three kernels (gaussian, tophat, epanechnikov).

    :param list_weight: sequence of numeric weights
    '''
    print(list_weight)
    # KernelDensity works on a 2D (n_samples, n_features) array: build a single-feature
    # column, sorted so that the density curves are drawn from left to right.
    X_plot = np.sort(np.exp(np.asarray(list_weight, dtype=float)).ravel()).reshape(-1, 1)
    fig, ax = plt.subplots()
    for kernel in ['gaussian', 'tophat', 'epanechnikov']:
        kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X_plot)
        log_dens = kde.score_samples(X_plot)
        ax.plot(X_plot[:, 0], np.exp(log_dens), '-',
                label="kernel = '{0}'".format(kernel))
    ax.legend(loc='upper left')
    # Rug plot of the samples themselves, jittered just below the x axis.
    ax.plot(X_plot[:, 0], -0.005 - 0.01 * np.random.random(X_plot.shape[0]), '+k')
    ax.set_xlim(-4, 9)
    ax.set_ylim(-0.02, 0.4)
    plt.show()
```
%% Cell type:code id: tags:
``` python
# For every experiment, draw the density of the non-zero weights obtained for the
# second number of extracted trees (index 1); element 5 of a result tuple is the weights.
for results in results_global:
    ax = pd.Series([w for w in results[1][5] if w != 0]).plot.kde(figsize=(15, 10))
legends = ['OK'] * 10
legends[4] = 'Problème'
# ax.legend(legends)
```
%% Cell type:code id: tags:
``` python
# Scores as an array indexed by [method][number of extracted trees][experiment].
np.array([
    [[run[size_idx][method_idx] for run in results_global]
     for size_idx in range(len(results_global[0]))]
    for method_idx in range(5)
])
```
%% Cell type:code id: tags:
``` python
# Sum of the weights for every experiment (outer list) and number of extracted trees (inner list).
[[sum(entry[5]) for entry in run] for run in results_global]
```
%% Cell type:code id: tags:
``` python
# Display the raw result tuples of the first experiment.
results_global[0]
```
%% Cell type:markdown id: tags:
## Entraînement de la forêt aléatoire
%% Cell type:code id: tags:
``` python
# Train the full random forest on the training split.
# NOTE(review): RANDOM_SEED is not defined in any cell visible here - confirm it exists before running.
regressor = RandomForestRegressor(n_estimators=NB_TREES, random_state = RANDOM_SEED)
regressor.fit(X_train, y_train)
```
%% Cell type:code id: tags:
``` python
# Access the list of fitted trees of the forest
tree_list = regressor.estimators_
```
%% Cell type:markdown id: tags:
## Création de la matrice des prédictions de chaque arbre
%% Cell type:code id: tags:
``` python
# The scikit-learn implementation differs a little from the one seen in the meeting: D has
# as many rows as X and each row is made of d signals, hence the construction below, where
# each row holds the values predicted by every tree for one element.
# Each tree predicts the whole training batch at once; its predictions form one column of D.
D = np.column_stack([tree.predict(X_train) for tree in tree_list])
```
%% Cell type:code id: tags:
``` python
# Fit without intercept: the cells below rebuild the predictions as a plain weighted sum of
# the tree predictions, so an intercept learned here would be silently dropped.
# NOTE(review): NB_TREES_EXTRACTED is not defined in any cell visible here - confirm it exists before running.
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=NB_TREES_EXTRACTED, fit_intercept=False, normalize=False)
omp.fit(D, y_train)
```
%% Cell type:code id: tags:
``` python
# Matrice avec poids de chaque arbre
omp.coef_
```
%% Cell type:markdown id: tags:
## Calcul des résultats des différentes méthodes
%% Cell type:markdown id: tags:
### Résultat de la forêt de base
%% Cell type:code id: tags:
``` python
# MSE of the full forest on the test set.
mean_squared_error(regressor.predict(X_test), y_test)
```
%% Cell type:markdown id: tags:
### Résultat de la forêt extraite avec l'OMP, où chaque arbre est multiplié par son poids
%% Cell type:code id: tags:
``` python
# Prediction for each test element: sum over the trees of (tree prediction * OMP weight).
# Only omp.coef_ is used here; omp.intercept_ is not added.
y_pred = [sum([tree_list[i].predict([elem])[0] * omp.coef_[i] for i in range(NB_TREES)]) for elem in X_test]
```
%% Cell type:code id: tags:
``` python
# MSE of the current y_pred on the test set.
mean_squared_error(y_pred, y_test)
```
%% Cell type:markdown id: tags:
### Résultat de la forêt extraite avec l'OMP, où on prend la moyenne des arbres extraits
%% Cell type:code id: tags:
``` python
# Prediction for each test element: plain mean of the predictions of the trees whose
# OMP coefficient is non-zero, followed by the MSE of these predictions on the test set.
y_pred = [mean([tree_list[i].predict([elem])[0] for i in range(NB_TREES) if omp.coef_[i] != 0])for elem in X_test]
mean_squared_error(y_pred, y_test)
```
%% Cell type:markdown id: tags:
### Résultat d'une forêt avec le même nombre d'arbres que le nombre d'arbres extraits
%% Cell type:code id: tags:
``` python
# Forest trained directly with NB_TREES_EXTRACTED trees, on the same training split.
# NOTE(review): NB_TREES_EXTRACTED and RANDOM_SEED are not defined in any cell visible here - confirm.
regressor_small = RandomForestRegressor(n_estimators=NB_TREES_EXTRACTED, random_state=RANDOM_SEED)
regressor_small.fit(X_train, y_train)
```
%% Cell type:code id: tags:
``` python
# MSE of the small forest on the test set.
mean_squared_error(regressor_small.predict(X_test), y_test)
```
%% Cell type:code id: tags:
``` python
```
Source diff could not be displayed: it is too large. Options to address this: view the blob.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment