%% Cell type:markdown id: tags:
# Working group
The goal of this notebook is to test the idea of reducing a random forest to a smaller sub-forest.
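%% Cell type:markdown id: tags:
The core idea: treat each tree's predictions over a set of samples as one column of a dictionary matrix D, then use Orthogonal Matching Pursuit (OMP) to approximate y with a sparse combination of columns, i.e. a weighted subset of trees. A minimal self-contained sketch of that idea on synthetic data (illustrative only; the toy dataset and sizes are assumptions, not the experiment below):
%% Cell type:code id: tags:
``` python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit

# Toy data and a small forest (sizes are arbitrary assumptions)
X_toy, y_toy = make_regression(n_samples=200, n_features=5, random_state=0)
forest = RandomForestRegressor(n_estimators=20, random_state=0).fit(X_toy, y_toy)

# Dictionary: one column per tree, one row per sample
D_toy = np.column_stack([tree.predict(X_toy) for tree in forest.estimators_])

# Keep only 5 trees: OMP returns a sparse weight vector over the 20 trees
omp_toy = OrthogonalMatchingPursuit(n_nonzero_coefs=5).fit(D_toy, y_toy)
print(np.flatnonzero(omp_toy.coef_))  # indices of the selected trees
```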
%% Cell type:markdown id: tags:
## Imports
%% Cell type:code id: tags:
``` python
from statistics import mean
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KernelDensity
```
%% Cell type:markdown id: tags:
## Global variables
%% Cell type:code id: tags:
``` python
NB_TREES = 100
# These two constants are used by the step-by-step cells at the end of the
# notebook; the values here are assumptions for illustration.
NB_TREES_EXTRACTED = 10
RANDOM_SEED = 0
```
%% Cell type:markdown id: tags:
## Load the dataset
%% Cell type:code id: tags:
``` python
X, y = fetch_california_housing(return_X_y=True)
```
%% Output
Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /home/l_bouscarrat/scikit_learn_data
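%% Cell type:markdown id: tags:
A quick sanity check on the loaded data (California housing: 20640 samples, 8 features):
%% Cell type:code id: tags:
``` python
X.shape, y.shape
```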
%% Cell type:code id: tags:
``` python
def train_forest(X_train, y_train, nb_trees, random_seed):
    '''
    Train a random forest with nb_trees trees.
    :param X_train: list of inputs
    :param y_train: list of targets
    :param nb_trees: int, number of trees in the forest
    :param random_seed: int, seed for the random_states
    :return: a fitted RandomForestRegressor
    '''
    # Train the random forest
    regressor = RandomForestRegressor(n_estimators=nb_trees, random_state=random_seed)
    regressor.fit(X_train, y_train)
    return regressor

def extract_subforest(random_forest, X_train, y_train, nb_trees_extracted):
    '''
    Compute the weight vector of a sub-forest of size nb_trees_extracted of random_forest,
    using OMP.
    :param random_forest: a fitted RandomForestRegressor
    :param X_train: list of inputs
    :param y_train: list of targets
    :param nb_trees_extracted: int, number of trees to extract
    :return: a list of floats, the weight of each tree (zero for unselected trees)
    '''
    # Access the list of trees
    tree_list = random_forest.estimators_
    # Build the matrix of per-tree predictions.
    # scikit-learn's implementation is slightly different from the one seen in the meeting:
    # D has as many rows as X, and each row holds the values predicted by every tree
    # for that sample, hence the construction below.
    D = [[tree.predict([elem])[0] for tree in tree_list] for elem in X_train]
    # OMP
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=nb_trees_extracted, fit_intercept=False, normalize=False)
    omp.fit(D, y_train)
    weights = omp.coef_
    return weights

def compute_results(weights, random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test,
                    nb_trees, nb_trees_extracted, random_seed):
    '''
    Compute the score of the different techniques.
    :param weights: weights given by the OMP
    :param random_forest: a fitted RandomForestRegressor
    :param X_train: list of inputs
    :param X_dev: list of inputs
    :param X_test: list of inputs
    :param y_train: list of targets
    :param y_dev: list of targets
    :param y_test: list of targets
    :param nb_trees: int, number of trees in the main forest
    :param nb_trees_extracted: int, number of trees extracted from the main forest
    :param random_seed: int, seed for the random_states
    :return: the results of 5 methods plus the weights, in order: result of the main forest,
             weighted result of the extracted trees, weighted result of the extracted trees
             normalized by the sum of the weights, mean result of the extracted trees,
             result of a random forest trained with nb_trees_extracted trees directly,
             and the weight vector
    '''
    # Result of the base forest
    res_base_forest = mean_squared_error(random_forest.predict(X_test), y_test)
    # Result of the forest extracted with OMP, where each tree is multiplied by its weight
    y_pred = [sum([random_forest.estimators_[i].predict([elem])[0] * weights[i] for i in range(nb_trees)])
              for elem in X_test]
    res_extract_weight = mean_squared_error(y_pred, y_test)
    # Same as above, but normalized by the sum of the weights
    y_pred = [sum([random_forest.estimators_[i].predict([elem])[0] * weights[i] for i in range(nb_trees)]) / sum(weights)
              for elem in X_test]
    res_extract_weight_norm = mean_squared_error(y_pred, y_test)
    # Result of the forest extracted with OMP, taking the unweighted mean of the extracted trees
    y_pred = [mean([random_forest.estimators_[i].predict([elem])[0] for i in range(nb_trees) if abs(weights[i]) >= 0.01])
              for elem in X_test]
    res_extract_mean = mean_squared_error(y_pred, y_test)
    # Result of a forest with as many trees as were extracted
    small_forest = train_forest(np.concatenate((X_train, X_dev)), np.concatenate((y_train, y_dev)),
                                nb_trees_extracted, random_seed)
    res_small_forest = mean_squared_error(small_forest.predict(X_test), y_test)
    return res_base_forest, res_extract_weight, res_extract_weight_norm, res_extract_mean, res_small_forest, weights

def extract_and_get_results(random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test, nb_trees,
                            nb_trees_extracted, random_seed):
    '''
    Extract the sub-forest and return the results of the different methods.
    :param random_forest: a fitted RandomForestRegressor
    :param X_train: list of inputs
    :param X_dev: list of inputs
    :param X_test: list of inputs
    :param y_train: list of targets
    :param y_dev: list of targets
    :param y_test: list of targets
    :param nb_trees: int, number of trees in the main forest
    :param nb_trees_extracted: int, number of trees extracted from the main forest
    :param random_seed: int, seed for the random_states
    :return: the results of 5 methods plus the weights (see compute_results)
    '''
    weights = extract_subforest(random_forest, X_dev, y_dev, nb_trees_extracted)
    res_base_forest, res_extract_weight, res_extract_weight_norm, res_extract_mean, res_small_forest, _ = \
        compute_results(weights, random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test,
                        nb_trees, nb_trees_extracted, random_seed)
    return res_base_forest, res_extract_weight, res_extract_weight_norm, res_extract_mean, res_small_forest, weights

def train_extract_subforest(X_train, X_test, y_train, y_test, nb_trees, nb_trees_extracted, random_seed):
    '''
    Train a forest with nb_trees trees, extract nb_trees_extracted of them with OMP,
    and compare the results of the different methods.
    :param X_train: list of inputs
    :param X_test: list of inputs
    :param y_train: list of targets
    :param y_test: list of targets
    :param nb_trees: int, number of trees in the main forest
    :param nb_trees_extracted: int, number of trees extracted from the main forest
    :param random_seed: int, seed for the random_states
    :return: the results of 5 methods (see compute_results)
    '''
    random_forest = train_forest(X_train, y_train, nb_trees, random_seed)
    weights = extract_subforest(random_forest, X_train, y_train, nb_trees_extracted)
    # No separate dev set here: the train set also plays the dev role
    res_base_forest, res_extract_weight, res_extract_weight_norm, res_extract_mean, res_small_forest, _ = \
        compute_results(weights, random_forest, X_train, X_train, X_test, y_train, y_train, y_test,
                        nb_trees, nb_trees_extracted, random_seed)
    return res_base_forest, res_extract_weight, res_extract_weight_norm, res_extract_mean, res_small_forest
```
%% Cell type:code id: tags:
``` python
results_global = []
results_dev_global = []
results_without_dev_global = []
nb_trees = NB_TREES
random_seeds = list(range(10))
for random_seed in random_seeds:
    # Train/test split with random_state, then a dev split off the train set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
    X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.2, random_state=random_seed)
    random_forest = train_forest(X_train, y_train, NB_TREES, random_seed)
    results = []
    results_dev = []
    results_without_dev = []
    for nb_trees_extracted in [int(NB_TREES / k) for k in [2, 5, 10, 20, 50, 100]]:
        # Weights fitted on the dev set, plus a variant fitted on the train set
        weights = extract_subforest(random_forest, X_dev, y_dev, nb_trees_extracted)
        weights_train = extract_subforest(random_forest, X_train, y_train, nb_trees_extracted)
        # Scores on the test set
        results.append(compute_results(weights, random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test,
                                       nb_trees, nb_trees_extracted, random_seed))
        # Scores when there is no dev set (weights fitted on the train set)
        results_without_dev.append(compute_results(weights_train, random_forest, X_train, X_train,
                                                   X_test, y_train, y_train, y_test,
                                                   nb_trees, nb_trees_extracted, random_seed))
        # Scores on the dev set itself
        results_dev.append(compute_results(weights, random_forest, X_train, X_dev, X_dev, y_train, y_dev, y_dev,
                                           nb_trees, nb_trees_extracted, random_seed))
    results_global.append(results)
    results_dev_global.append(results_dev)
    results_without_dev_global.append(results_without_dev)
    print('over')
```
%% Output
over
over
over
over
over
over
over
%% Cell type:code id: tags:
``` python
def plot_results(results_global, title_graph):
    def plot_mean_and_CI(mean, lb, ub, x_value, color_mean=None, color_shading=None, label=None):
        # Plot the shaded range of the confidence intervals
        plt.fill_between(x_value, ub, lb, color=color_shading, alpha=.5)
        # Plot the mean on top
        plt.plot(x_value, mean, color_mean, label=label)

    means_results = np.array(
        [[mean([results[i][k] for results in results_global])  # loop over the different experiments
          for i in range(len(results_global[0]))]              # loop over the numbers of trees extracted
         for k in range(5)])                                   # loop over the different methods
    std_results = np.array(
        [[np.std([results[i][k] for results in results_global])
          for i in range(len(results_global[0]))]
         for k in range(5)])
    x_value = [int(NB_TREES / k) for k in [2, 5, 10, 20, 50, 100]]

    # Plot the data
    fig = plt.figure(1, figsize=(15, 10))
    plot_mean_and_CI(means_results[0], means_results[0] + std_results[0], means_results[0] - std_results[0],
                     x_value, color_mean='k', color_shading='k',
                     label='Results of the base forest (trained on the train set)')
    plot_mean_and_CI(means_results[1], means_results[1] + std_results[1], means_results[1] - std_results[1],
                     x_value, color_mean='darkorange', color_shading='darkorange',
                     label='Weighted results of the extracted trees')
    plot_mean_and_CI(means_results[2], means_results[2] + std_results[2], means_results[2] - std_results[2],
                     x_value, color_mean='red', color_shading='red',
                     label='Weighted results of the extracted trees, normalized')
    plot_mean_and_CI(means_results[3], means_results[3] + std_results[3], means_results[3] - std_results[3],
                     x_value, color_mean='b', color_shading='b',
                     label='Mean results of the extracted trees')
    plot_mean_and_CI(means_results[4], means_results[4] + std_results[4], means_results[4] - std_results[4],
                     x_value, color_mean='g', color_shading='g',
                     label='Results of a forest trained with the number of trees extracted (train+dev set)')
    plt.xlabel('Number of trees extracted')
    plt.ylabel('MSE')
    plt.title(title_graph)
    plt.legend(loc="upper right")
```
%% Cell type:code id: tags:
``` python
plot_results(results_global, 'Reduction of a forest with 100 trees, 10 iterations with different seeds, score on test set')
```
%% Cell type:code id: tags:
``` python
plot_results(results_dev_global, 'Reduction of a forest with 100 trees, 10 iterations with different seeds, score on dev set')
```
%% Cell type:code id: tags:
``` python
plot_results(results_without_dev_global,
             'Reduction of a forest with 100 trees, 10 iterations with different seeds, score when there is no dev set')
```
%% Cell type:code id: tags:
``` python
plt.figure(1, figsize=(15, 10))
# The x values must match the extraction sizes used in the experiment loop
x_value = [int(NB_TREES / k) for k in [2, 5, 10, 20, 50, 100]]
# One curve per seed, for each method
for results in results_global:
    plt.plot(x_value, [elem[1] for elem in results], color='darkorange',
             label='Weighted results of the extracted trees')
    plt.plot(x_value, [elem[2] for elem in results], color='red',
             label='Weighted results of the extracted trees, normalized')
    plt.plot(x_value, [elem[3] for elem in results], color='blue',
             label='Mean results of the extracted trees')
    plt.plot(x_value, [elem[4] for elem in results], color='green',
             label='Results of a forest trained with the number of trees extracted')
    plt.plot(x_value, [elem[0] for elem in results], color='black',
             label='Results of the base forest')
plt.xlabel('Number of trees extracted')
plt.ylabel('MSE')
plt.legend(loc="upper right")
fig_acc_rec = plt.gcf()
plt.show()
```
%% Output
%% Cell type:code id: tags:
``` python
def weight_density(list_weight):
    print(list_weight)
    # KernelDensity expects a 2D array of shape (n_samples, n_features)
    X_plot = np.array([np.exp(elem) for elem in list_weight])[:, np.newaxis]
    fig, ax = plt.subplots()
    for kernel in ['gaussian', 'tophat', 'epanechnikov']:
        kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X_plot)
        log_dens = kde.score_samples(X_plot)
        ax.plot(X_plot[:, 0], np.exp(log_dens), '-',
                label="kernel = '{0}'".format(kernel))
    ax.legend(loc='upper left')
    # Rug plot of the (exponentiated) weights
    ax.plot(X_plot[:, 0], -0.005 - 0.01 * np.random.random(X_plot.shape[0]), '+k')
    ax.set_xlim(-4, 9)
    ax.set_ylim(-0.02, 0.4)
    plt.show()
```
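%% Cell type:markdown id: tags:
`weight_density` is never called above; a minimal usage sketch (our assumption, not in the original), using the non-zero weights of the first seed's first extraction size (`results_global[0][0][5]` is the weight vector returned by `compute_results`):
%% Cell type:code id: tags:
``` python
weight_density([w for w in results_global[0][0][5] if w != 0])
```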
%% Cell type:code id: tags:
``` python
# KDE of the non-zero weights, one curve per seed
for results in results_global:
    ax = pd.Series([[e for e in test[5] if e != 0] for test in results][1]).plot.kde(figsize=(15, 10))
legends = ['Experiment ' + str(i + 1) for i in range(10)]
ax.legend(legends)
```
%% Output
<matplotlib.legend.Legend at 0x7f1437754c10>
%% Cell type:code id: tags:
``` python
np.array(
[
[
[results[i][k] for results in results_global]
for i in range(len(results_global[0]))]
for k in range(5)])
```
%% Output
array([[[0.26899689, 0.26359377, 0.2780403 , 0.25029723, 0.26674508,
0.25602716, 0.28057576, 0.2761758 , 0.25817293, 0.26356801],
[0.26899689, 0.26359377, 0.2780403 , 0.25029723, 0.26674508,
0.25602716, 0.28057576, 0.2761758 , 0.25817293, 0.26356801],
[0.26899689, 0.26359377, 0.2780403 , 0.25029723, 0.26674508,
0.25602716, 0.28057576, 0.2761758 , 0.25817293, 0.26356801],
[0.26899689, 0.26359377, 0.2780403 , 0.25029723, 0.26674508,
0.25602716, 0.28057576, 0.2761758 , 0.25817293, 0.26356801],
[0.26899689, 0.26359377, 0.2780403 , 0.25029723, 0.26674508,
0.25602716, 0.28057576, 0.2761758 , 0.25817293, 0.26356801],
[0.26899689, 0.26359377, 0.2780403 , 0.25029723, 0.26674508,
0.25602716, 0.28057576, 0.2761758 , 0.25817293, 0.26356801]],
[[0.27542295, 0.27749768, 0.28513058, 0.26038702, 0.27043376,
0.2655008 , 0.28448981, 0.28333658, 0.27387447, 0.2769381 ],
[0.27746557, 0.27723817, 0.28723859, 0.26434651, 0.27067318,
0.26330039, 0.28196962, 0.28240111, 0.27509222, 0.28088583],
[0.28995364, 0.29198289, 0.29873153, 0.27618004, 0.2848853 ,
0.27857491, 0.29298835, 0.30077324, 0.2886711 , 0.28905086],
[0.32365526, 0.32322906, 0.32710513, 0.29903915, 0.31318329,
0.30669926, 0.32434317, 0.32110736, 0.31463418, 0.321466 ],
[0.39986111, 0.42484653, 0.42855969, 0.39370378, 0.39935977,
0.38460084, 0.41563938, 0.40814036, 0.39929003, 0.38932494],
[0.58523066, 0.55891364, 0.59428021, 0.60547191, 0.5266932 ,
0.54086835, 0.57100958, 0.54292164, 0.53241884, 0.59593718]],
[[0.27521601, 0.27770826, 0.28523181, 0.26038166, 0.27049563,
0.26550442, 0.28434805, 0.28336574, 0.27317227, 0.27730912],
[0.27744027, 0.27741863, 0.28736133, 0.26435666, 0.2708491 ,
0.26327205, 0.28195336, 0.28236196, 0.2744054 , 0.28125145],
[0.29039242, 0.2925128 , 0.29908397, 0.27622541, 0.28540109,
0.27863392, 0.2930088 , 0.30074197, 0.28805612, 0.28951146],
[0.32484297, 0.32460791, 0.32853218, 0.29964306, 0.31475447,
0.30729241, 0.3249432 , 0.32151156, 0.31435099, 0.32265734],
[0.40624496, 0.4302276 , 0.43530539, 0.39867506, 0.40663919,
0.38977503, 0.42116368, 0.41272401, 0.40457793, 0.39346472],
[0.61110865, 0.57982481, 0.62469263, 0.63199171, 0.54630055,
0.56558963, 0.59272349, 0.56177889, 0.55353829, 0.61516357]],
[[0.27065035, 0.26652983, 0.27690332, 0.25221968, 0.26834736,
0.25713719, 0.27970688, 0.27633639, 0.26457284, 0.26459536],
[0.27745036, 0.27779328, 0.28618091, 0.26025249, 0.27057945,
0.26101474, 0.28177424, 0.27897506, 0.27042052, 0.27522275],
[0.29080491, 0.29277669, 0.29888537, 0.27686572, 0.28509987,
0.2785929 , 0.29306583, 0.29749568, 0.28659403, 0.28920128],
[0.32643403, 0.32525994, 0.32820348, 0.30076919, 0.31430383,
0.30854091, 0.3247721 , 0.31945693, 0.31341696, 0.32260233],
[0.40894501, 0.43023553, 0.43584108, 0.40142063, 0.40520717,
0.39108446, 0.4205632 , 0.41085099, 0.40442852, 0.39712488],
[0.61110865, 0.57982481, 0.62469263, 0.63199171, 0.54630055,
0.56558963, 0.59272349, 0.56177889, 0.55353829, 0.61516357]],
[[0.26184144, 0.25626252, 0.26511056, 0.24293248, 0.25853787,
0.24899595, 0.27433988, 0.27443584, 0.24968665, 0.25521777],
[0.27008137, 0.26487907, 0.27600019, 0.25217323, 0.26622961,
0.25721161, 0.28795328, 0.28488014, 0.25274144, 0.26086461],
[0.28205227, 0.2757543 , 0.29624245, 0.27094063, 0.29016011,
0.27193868, 0.30978997, 0.29614998, 0.26827511, 0.27353369],
[0.30842355, 0.30288144, 0.32913279, 0.30527809, 0.32101279,
0.31426529, 0.3350261 , 0.33831256, 0.30012365, 0.30287159],
[0.39608144, 0.38109562, 0.41662116, 0.40638002, 0.41791456,
0.40641226, 0.44955332, 0.44545138, 0.40050687, 0.39659829],
[0.52593732, 0.54251735, 0.57760869, 0.56082674, 0.56808121,
0.55389761, 0.59152879, 0.62432776, 0.52053009, 0.54411424]]])
%% Cell type:code id: tags:
``` python
[[sum(elem[5]) for elem in results] for results in results_global]
```
%% Output
[[1.0019333893291256,
1.0002339744254798,
0.9965284128922761,
0.9930020768164572,
0.9713515521464255,
0.9355587965148584],
[0.9964172479701133,
0.9965133913097529,
0.9921861297905141,
0.9842416540561515,
0.9682575760922218,
0.933691714442025],
[0.9983332596273122,
0.9980542966631237,
0.9953384960946179,
0.9863737744133908,
0.9713936104999941,
0.9322186005941735],
[1.0038094772994455,
1.0018194366490565,
0.9977426702506628,
0.9901515373986598,
0.9656704215772347,
0.9225711413699638],
[0.9990610710125596,
0.997516190665299,
0.994208907140429,
0.9866287246076226,
0.9654844823617819,
0.9349411984209011],
[0.9988517681736824,
0.9978719264484801,
0.9951808165785492,
0.9869717461401798,
0.96626061534528,
0.9275270848174226],
[1.0036372042892556,
1.0021281690286359,
0.9992184310564347,
0.9936213667248184,
0.9726814533989055,
0.9287908504721321],
[1.0072738855021581,
1.0058719271210204,
1.0008091849171328,
0.9924905567327407,
0.9726743101033102,
0.9357752656379753],
[1.0124396297482838,
1.0124081517188515,
1.008883039483941,
1.0028553033696677,
0.9743542569764307,
0.942597999044375],
[0.9944072082781984,
0.9943119815126942,
0.9936514348889314,
0.9866366329633924,
0.9668343182393178,
0.9188473811851859]]
%% Cell type:code id: tags:
``` python
results_global[0]
```
%% Cell type:markdown id: tags:
## Training the random forest
%% Cell type:code id: tags:
``` python
regressor = RandomForestRegressor(n_estimators=NB_TREES, random_state=RANDOM_SEED)
regressor.fit(X_train, y_train)
```
%% Cell type:code id: tags:
``` python
# Access the list of trees
tree_list = regressor.estimators_
```
%% Cell type:markdown id: tags:
## Building the matrix of per-tree predictions
%% Cell type:code id: tags:
``` python
# scikit-learn's implementation is slightly different from the one seen in the meeting:
# D has the same number of rows as X, and each row holds d signals; hence the construction
# of D below, where for each sample we build the list of values predicted by every tree
D = [[tree.predict([elem])[0] for tree in tree_list] for elem in X_train]
```
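%% Cell type:markdown id: tags:
The row-by-row construction above calls `predict` once per sample per tree, which is slow. An equivalent, much faster sketch (our addition, not in the original) predicts the whole set with one call per tree and stacks the results as columns:
%% Cell type:code id: tags:
``` python
# Same D, built with one vectorized predict call per tree
D_fast = np.column_stack([tree.predict(X_train) for tree in tree_list])
```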
%% Cell type:code id: tags:
``` python
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=NB_TREES_EXTRACTED)
omp.fit(D, y_train)
```
%% Cell type:code id: tags:
``` python
# Vector with the weight of each tree
omp.coef_
```
%% Cell type:markdown id: tags:
## Computing the results of the different methods
%% Cell type:markdown id: tags:
### Result of the base forest
%% Cell type:code id: tags:
``` python
mean_squared_error(regressor.predict(X_test), y_test)
```
%% Cell type:markdown id: tags:
### Result of the forest extracted with OMP, where each tree is multiplied by its weight
%% Cell type:code id: tags:
``` python
y_pred = [sum([tree_list[i].predict([elem])[0] * omp.coef_[i] for i in range(NB_TREES)]) for elem in X_test]
```
%% Cell type:code id: tags:
``` python
mean_squared_error(y_pred, y_test)
```
%% Cell type:markdown id: tags:
### Result of the forest extracted with OMP, taking the mean of the extracted trees
%% Cell type:code id: tags:
``` python
y_pred = [mean([tree_list[i].predict([elem])[0] for i in range(NB_TREES) if omp.coef_[i] != 0]) for elem in X_test]
mean_squared_error(y_pred, y_test)
```
%% Cell type:markdown id: tags:
### Result of a forest with as many trees as the number extracted
%% Cell type:code id: tags:
``` python
regressor_small = RandomForestRegressor(n_estimators=NB_TREES_EXTRACTED, random_state=RANDOM_SEED)
regressor_small.fit(X_train, y_train)
```
%% Cell type:code id: tags:
``` python
mean_squared_error(regressor_small.predict(X_test), y_test)
```
%% Cell type:code id: tags:
``` python
```