%% Cell type:markdown id: tags:
# Working group
The goal of this notebook is to test the idea of pruning random forests: train a large forest, then extract a small weighted subforest from it.
%% Cell type:markdown id: tags:
## scikit-learn imports
%% Cell type:code id: tags:
``` python
from statistics import mean

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_boston, load_breast_cancer, fetch_california_housing
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KernelDensity  # sklearn.neighbors.kde is deprecated
```
%% Cell type:markdown id: tags:
## Global variables
%% Cell type:code id: tags:
``` python
NB_TREES = 100
```
%% Cell type:markdown id: tags:
## Load the dataset
%% Cell type:code id: tags:
``` python
X, y = fetch_california_housing(return_X_y=True)
```
%% Output
Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /home/l_bouscarrat/scikit_learn_data
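%% Cell type:markdown id: tags:
As a quick sanity check (cell added for illustration), California housing should come back with 20,640 samples and 8 features:
%% Cell type:code id: tags:
``` python
# Expected: (20640, 8) (20640,)
print(X.shape, y.shape)
```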
%% Cell type:code id: tags:
``` python
def train_forest(X_train, y_train, nb_trees, random_seed):
    '''
    Train a random forest with nb_trees trees
    :param X_train: list of inputs
    :param y_train: list of target values
    :param nb_trees: int, number of trees in the forest
    :param random_seed: int, seed for the random_states
    :return: a fitted RandomForestRegressor
    '''
    # Train the random forest
    regressor = RandomForestRegressor(n_estimators=nb_trees, random_state=random_seed)
    regressor.fit(X_train, y_train)
    return regressor
def extract_subforest(random_forest, X_train, y_train, nb_trees_extracted):
    '''
    Compute the weight vector of a subforest of random_forest of size nb_trees_extracted,
    using OMP.
    :param random_forest: a RandomForestRegressor
    :param X_train: list of inputs
    :param y_train: list of target values
    :param nb_trees_extracted: int, number of trees extracted
    :return: a list of floats, the weight of each tree
    '''
    # Access the list of trees
    tree_list = random_forest.estimators_
    # Build the matrix of the predictions of each tree.
    # The scikit-learn implementation is slightly different from the one seen in the meeting: D has the
    # same length as X and each element holds d signals, hence the construction below where, for each
    # element, we build the list of the values predicted by each tree
    D = [[tree.predict([elem])[0] for tree in tree_list] for elem in X_train]
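    # Added note: the same matrix can be built without the per-sample Python loop, which is much
    # faster on large X_train (scikit-learn trees accept the whole array in predict()):
    # D = np.array([tree.predict(X_train) for tree in tree_list]).T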
    # OMP: fit a sparse weight vector with at most nb_trees_extracted non-zero coefficients
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=nb_trees_extracted, fit_intercept=False, normalize=False)
    omp.fit(D, y_train)
    weights = omp.coef_
    return weights
def compute_results(weights, random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test,
                    nb_trees, nb_trees_extracted, random_seed):
    '''
    Compute the scores of the different techniques
    :param weights: weights given by the OMP
    :param random_forest: a RandomForestRegressor
    :param X_train: list of inputs
    :param X_dev: list of inputs
    :param X_test: list of inputs
    :param y_train: list of target values
    :param y_dev: list of target values
    :param y_test: list of target values
    :param nb_trees: int, number of trees in the main forest
    :param nb_trees_extracted: int, number of trees extracted from the main forest
    :param random_seed: int, seed for the random_states
    :return: the results of the different methods, in order: result of the main forest,
        weighted result of the extracted trees, weighted result of the extracted trees
        normalized, mean result of the extracted trees, result of a random forest trained
        with nb_trees_extracted trees directly, and the weights themselves
    '''
    # Compute the different results
    res_base_forest = mean_squared_error(random_forest.predict(X_test), y_test)
    # Result of the forest extracted with OMP, where each tree is multiplied by its weight
    y_pred = [sum([random_forest.estimators_[i].predict([elem])[0] * weights[i] for i in range(nb_trees)])
              for elem in X_test]
    res_extract_weight = mean_squared_error(y_pred, y_test)
    # Same weighted combination, normalized by the sum of the weights
    y_pred = [sum([random_forest.estimators_[i].predict([elem])[0] * weights[i] for i in range(nb_trees)]) / sum(weights)
              for elem in X_test]
    res_extract_weight_norm = mean_squared_error(y_pred, y_test)
    # Result of the forest extracted with OMP, taking the plain mean of the extracted trees
    y_pred = [mean([random_forest.estimators_[i].predict([elem])[0] for i in range(nb_trees) if abs(weights[i]) >= 0.01])
              for elem in X_test]
    res_extract_mean = mean_squared_error(y_pred, y_test)
    # Result of a forest with as many trees as were extracted
    small_forest = train_forest(np.concatenate((X_train, X_dev)), np.concatenate((y_train, y_dev)), nb_trees_extracted, random_seed)
    res_small_forest = mean_squared_error(small_forest.predict(X_test), y_test)
    return res_base_forest, res_extract_weight, res_extract_weight_norm, res_extract_mean, res_small_forest, weights
def extract_and_get_results(random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test, nb_trees,
                            nb_trees_extracted, random_seed):
    '''
    Extract the subforest and return the results of the different methods
    :param random_forest: a RandomForestRegressor
    :param X_train: list of inputs
    :param X_dev: list of inputs
    :param X_test: list of inputs
    :param y_train: list of target values
    :param y_dev: list of target values
    :param y_test: list of target values
    :param nb_trees: int, number of trees in the main forest
    :param nb_trees_extracted: int, number of trees extracted from the main forest
    :param random_seed: int, seed for the random_states
    :return: the results of the different methods (see compute_results) and the weights
    '''
    weights = extract_subforest(random_forest, X_dev, y_dev, nb_trees_extracted)
    # compute_results also returns the weights; discard that copy since we already have them
    res_base_forest, res_extract_weight, res_extract_weight_norm, res_extract_mean, res_small_forest, _ = \
        compute_results(weights, random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test,
                        nb_trees, nb_trees_extracted, random_seed)
    return res_base_forest, res_extract_weight, res_extract_weight_norm, res_extract_mean, res_small_forest, weights
def train_extract_subforest(X_train, X_test, y_train, y_test, nb_trees, nb_trees_extracted, random_seed):
    '''
    Train a forest with nb_trees trees, extract nb_trees_extracted of them with OMP and
    compare the results of the different methods; there is no dev set here, so the train
    set is reused for the extraction
    :param X_train: list of inputs
    :param X_test: list of inputs
    :param y_train: list of target values
    :param y_test: list of target values
    :param nb_trees: int, number of trees in the main forest
    :param nb_trees_extracted: int, number of trees extracted from the main forest
    :param random_seed: int, seed for the random_states
    :return: the results of the different methods (see compute_results)
    '''
    random_forest = train_forest(X_train, y_train, nb_trees, random_seed)
    weights = extract_subforest(random_forest, X_train, y_train, nb_trees_extracted)
    # compute_results expects a dev set: pass the train set in its place, as in the
    # "without dev" experiment below
    res_base_forest, res_extract_weight, res_extract_weight_norm, res_extract_mean, res_small_forest, _ = \
        compute_results(weights, random_forest, X_train, X_train, X_test, y_train, y_train, y_test,
                        nb_trees, nb_trees_extracted, random_seed)
    return res_base_forest, res_extract_weight, res_extract_weight_norm, res_extract_mean, res_small_forest
```
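%% Cell type:markdown id: tags:
To make the extraction step concrete, here is a minimal, self-contained sketch (cell added for illustration; `D_toy`, `y_toy` and the other names are hypothetical) of what the OMP call in `extract_subforest` does: it selects the `n_nonzero_coefs` columns of the prediction matrix that best reconstruct the targets and leaves every other weight at zero.
%% Cell type:code id: tags:
``` python
# Synthetic dictionary: 200 samples, 20 "trees" (columns), only 3 of which matter
rng = np.random.RandomState(0)
D_toy = rng.randn(200, 20)
true_w = np.zeros(20)
true_w[[3, 7, 12]] = [0.5, 0.3, 0.2]
y_toy = D_toy @ true_w + 0.01 * rng.randn(200)
omp_toy = OrthogonalMatchingPursuit(n_nonzero_coefs=3, fit_intercept=False)
omp_toy.fit(D_toy, y_toy)
# Indices of the selected "trees"; should recover [3, 7, 12]
print(np.nonzero(omp_toy.coef_)[0])
```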
%% Cell type:code id: tags:
``` python
results_global = []
results_dev_global = []
results_without_dev_global = []
nb_trees = 100
random_seeds = list(range(10))
for random_seed in random_seeds:
    # Train/test split with random_state, then a train/dev split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
    X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.2, random_state=random_seed)
    random_forest = train_forest(X_train, y_train, NB_TREES, random_seed)
    results = []
    results_dev = []
    results_without_dev = []
    for nb_trees_extracted in [int(NB_TREES/k) for k in [2, 5, 10, 20, 50, 100]]:
        weights = extract_subforest(random_forest, X_dev, y_dev, nb_trees_extracted)
        weights_train = extract_subforest(random_forest, X_train, y_train, nb_trees_extracted)
        results.append(compute_results(weights, random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test,
                                       nb_trees, nb_trees_extracted, random_seed))
        results_without_dev.append(compute_results(weights_train, random_forest, X_train, X_train,
                                                   X_test, y_train, y_train, y_test,
                                                   nb_trees, nb_trees_extracted, random_seed))
        results_dev.append(compute_results(weights, random_forest, X_train, X_dev, X_dev, y_train, y_dev, y_dev,
                                           nb_trees, nb_trees_extracted, random_seed))
    results_global.append(results)
    results_dev_global.append(results_dev)
    results_without_dev_global.append(results_without_dev)
    print('over')
```
%% Output
over
over
over
over
over
over
over
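%% Cell type:markdown id: tags:
The nesting of `results_global` can be checked directly (cell added for illustration): 10 seeds, 6 extraction sizes, and the 6 items returned by `compute_results` (5 scores plus the weight vector).
%% Cell type:code id: tags:
``` python
# Expected: 10 6 6
print(len(results_global), len(results_global[0]), len(results_global[0][0]))
```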
%% Cell type:code id: tags:
``` python
def plot_results(results_global, title_graph):
    def plot_mean_and_CI(mean, lb, ub, x_value, color_mean=None, color_shading=None, label=None):
        # plot the shaded range of the confidence intervals
        plt.fill_between(x_value, ub, lb, color=color_shading, alpha=.5)
        # plot the mean on top
        plt.plot(x_value, mean, color_mean, label=label)
    means_results = np.array(
        [
            [mean(
                [results[i][k] for results in results_global]  # loop over the different experiments
            ) for i in range(len(results_global[0]))]  # loop over the different numbers of trees extracted
            for k in range(5)])  # loop over the different methods
    std_results = np.array(
        [
            [np.std(
                [results[i][k] for results in results_global]
            ) for i in range(len(results_global[0]))]
            for k in range(5)])
    x_value = [int(NB_TREES/k) for k in [2, 5, 10, 20, 50, 100]]
    # plot the data
    fig = plt.figure(1, figsize=(15, 10))
    plot_mean_and_CI(means_results[0], means_results[0] + std_results[0], means_results[0] - std_results[0],
                     x_value, color_mean='k', color_shading='k', label='Results of the base forest (on train set)')
    plot_mean_and_CI(means_results[1], means_results[1] + std_results[1], means_results[1] - std_results[1],
                     x_value, color_mean='darkorange', color_shading='darkorange',
                     label='Weighted results of the extracted trees')
    plot_mean_and_CI(means_results[2], means_results[2] + std_results[2], means_results[2] - std_results[2],
                     x_value, color_mean='red', color_shading='red',
                     label='Weighted results of the extracted trees, normalized')
    plot_mean_and_CI(means_results[3], means_results[3] + std_results[3], means_results[3] - std_results[3],
                     x_value, color_mean='b', color_shading='b',
                     label='Mean results of the extracted trees')
    plot_mean_and_CI(means_results[4], means_results[4] + std_results[4], means_results[4] - std_results[4],
                     x_value, color_mean='g', color_shading='g',
                     label='Results of a forest trained with the number of trees extracted (train+dev set)')
    plt.xlabel('Number of trees extracted')
    plt.ylabel('MSE')
    plt.title(title_graph)
    plt.legend(loc="upper right")
```
%% Cell type:code id: tags:
``` python
plot_results(results_global, 'Reduction of a forest with 100 trees, 10 iterations with different seeds, score on test set')
```
%% Cell type:code id: tags:
``` python
plot_results(results_dev_global, 'Reduction of a forest with 100 trees, 10 iterations with different seeds, score on dev set')
```
%% Cell type:code id: tags:
``` python
plot_results(results_without_dev_global,
             'Reduction of a forest with 100 trees, 10 iterations with different seeds, score when there is no dev set')
```
%% Cell type:code id: tags:
``` python
for results in results_global:
    # Must match the list of extraction sizes used in the experiment loop above
    x_value = [int(NB_TREES/k) for k in [2, 5, 10, 20, 50, 100]]
    plt.xlabel('Number of trees extracted')
    plt.ylabel('MSE')
    plt.plot(x_value, [elem[1] for elem in results], color='darkorange',
             label='Weighted results of the extracted trees')
    plt.plot(x_value, [elem[2] for elem in results], color='red',
             label='Weighted results of the extracted trees, normalized')
    plt.plot(x_value, [elem[3] for elem in results], color='blue',
             label='Mean results of the extracted trees')
    plt.plot(x_value, [elem[4] for elem in results], color='green',
             label='Results of a forest trained with the number of trees extracted')
    plt.plot(x_value, [elem[0] for elem in results], color='black',
             label='Results of the base forest')
plt.figure(1, figsize=(15, 10))
plt.legend(loc="upper right")
fig_acc_rec = plt.gcf()
plt.show()
```
%% Output
%% Cell type:code id: tags:
``` python
def weight_density(list_weight):
    '''
    Plot kernel density estimates of the tree weights, for several kernels
    :param list_weight: list of floats, the weights of the trees
    '''
    # KernelDensity expects a 2D array of shape (n_samples, n_features);
    # sort the weights so the line plot reads left to right
    X_plot = np.sort(np.array(list_weight))[:, np.newaxis]
    fig, ax = plt.subplots()
    for kernel in ['gaussian', 'tophat', 'epanechnikov']:
        kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X_plot)
        log_dens = kde.score_samples(X_plot)
        ax.plot(X_plot[:, 0], np.exp(log_dens), '-',
                label="kernel = '{0}'".format(kernel))
    ax.legend(loc='upper left')
    # Rug plot of the individual weights
    ax.plot(X_plot[:, 0], -0.005 - 0.01 * np.random.random(X_plot.shape[0]), '+k')
    plt.show()
```
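%% Cell type:markdown id: tags:
As a usage sketch (cell added for illustration; the indexing is an assumption about which weights to inspect), each tuple returned by `compute_results` stores the weight vector at position 5, so the non-zero weights of the first experiment's first extraction size can be passed directly:
%% Cell type:code id: tags:
``` python
# Weights of seed 0, first extraction size (50 trees); keep only the selected trees
weight_density([w for w in results_global[0][0][5] if w != 0])
```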
%% Cell type:code id: tags:
``` python
for results in results_global:
    ax = pd.Series([[e for e in test[5] if e != 0] for test in results][1]).plot.kde(figsize=(15, 10))
legends = ['Experiment ' + str(i+1) for i in range(10)]
ax.legend(legends)
```
%% Output
<matplotlib.legend.Legend at 0x7f1437754c10>
%% Cell type:code id: tags:
``` python
np.array(
    [
        [
            [results[i][k] for results in results_global]
            for i in range(len(results_global[0]))]
        for k in range(5)])
```
%% Output
array([[[0.26899689, 0.26359377, 0.2780403 , 0.25029723, 0.26674508,
         0.25602716, 0.28057576, 0.2761758 , 0.25817293, 0.26356801],
        [0.26899689, 0.26359377, 0.2780403 , 0.25029723, 0.26674508,
         0.25602716, 0.28057576, 0.2761758 , 0.25817293, 0.26356801],
        [0.26899689, 0.26359377, 0.2780403 , 0.25029723, 0.26674508,
         0.25602716, 0.28057576, 0.2761758 , 0.25817293, 0.26356801],
        [0.26899689, 0.26359377, 0.2780403 , 0.25029723, 0.26674508,
         0.25602716, 0.28057576, 0.2761758 , 0.25817293, 0.26356801],
        [0.26899689, 0.26359377, 0.2780403 , 0.25029723, 0.26674508,
         0.25602716, 0.28057576, 0.2761758 , 0.25817293, 0.26356801],
        [0.26899689, 0.26359377, 0.2780403 , 0.25029723, 0.26674508,
         0.25602716, 0.28057576, 0.2761758 , 0.25817293, 0.26356801]],
       [[0.27542295, 0.27749768, 0.28513058, 0.26038702, 0.27043376,
         0.2655008 , 0.28448981, 0.28333658, 0.27387447, 0.2769381 ],
        [0.27746557, 0.27723817, 0.28723859, 0.26434651, 0.27067318,
         0.26330039, 0.28196962, 0.28240111, 0.27509222, 0.28088583],
        [0.28995364, 0.29198289, 0.29873153, 0.27618004, 0.2848853 ,
         0.27857491, 0.29298835, 0.30077324, 0.2886711 , 0.28905086],
        [0.32365526, 0.32322906, 0.32710513, 0.29903915, 0.31318329,
         0.30669926, 0.32434317, 0.32110736, 0.31463418, 0.321466  ],
        [0.39986111, 0.42484653, 0.42855969, 0.39370378, 0.39935977,
         0.38460084, 0.41563938, 0.40814036, 0.39929003, 0.38932494],
        [0.58523066, 0.55891364, 0.59428021, 0.60547191, 0.5266932 ,
         0.54086835, 0.57100958, 0.54292164, 0.53241884, 0.59593718]],
       [[0.27521601, 0.27770826, 0.28523181, 0.26038166, 0.27049563,
         0.26550442, 0.28434805, 0.28336574, 0.27317227, 0.27730912],
        [0.27744027, 0.27741863, 0.28736133, 0.26435666, 0.2708491 ,
         0.26327205, 0.28195336, 0.28236196, 0.2744054 , 0.28125145],
        [0.29039242, 0.2925128 , 0.29908397, 0.27622541, 0.28540109,
         0.27863392, 0.2930088 , 0.30074197, 0.28805612, 0.28951146],
        [0.32484297, 0.32460791, 0.32853218, 0.29964306, 0.31475447,
         0.30729241, 0.3249432 , 0.32151156, 0.31435099, 0.32265734],
        [0.40624496, 0.4302276 , 0.43530539, 0.39867506, 0.40663919,
         0.38977503, 0.42116368, 0.41272401, 0.40457793, 0.39346472],
        [0.61110865, 0.57982481, 0.62469263, 0.63199171, 0.54630055,
         0.56558963, 0.59272349, 0.56177889, 0.55353829, 0.61516357]],
       [[0.27065035, 0.26652983, 0.27690332, 0.25221968, 0.26834736,
         0.25713719, 0.27970688, 0.27633639, 0.26457284, 0.26459536],
        [0.27745036, 0.27779328, 0.28618091, 0.26025249, 0.27057945,
         0.26101474, 0.28177424, 0.27897506, 0.27042052, 0.27522275],
        [0.29080491, 0.29277669, 0.29888537, 0.27686572, 0.28509987,
         0.2785929 , 0.29306583, 0.29749568, 0.28659403, 0.28920128],
        [0.32643403, 0.32525994, 0.32820348, 0.30076919, 0.31430383,
         0.30854091, 0.3247721 , 0.31945693, 0.31341696, 0.32260233],
        [0.40894501, 0.43023553, 0.43584108, 0.40142063, 0.40520717,
         0.39108446, 0.4205632 , 0.41085099, 0.40442852, 0.39712488],
        [0.61110865, 0.57982481, 0.62469263, 0.63199171, 0.54630055,
         0.56558963, 0.59272349, 0.56177889, 0.55353829, 0.61516357]],
       [[0.26184144, 0.25626252, 0.26511056, 0.24293248, 0.25853787,
         0.24899595, 0.27433988, 0.27443584, 0.24968665, 0.25521777],
        [0.27008137, 0.26487907, 0.27600019, 0.25217323, 0.26622961,
         0.25721161, 0.28795328, 0.28488014, 0.25274144, 0.26086461],
        [0.28205227, 0.2757543 , 0.29624245, 0.27094063, 0.29016011,
         0.27193868, 0.30978997, 0.29614998, 0.26827511, 0.27353369],
        [0.30842355, 0.30288144, 0.32913279, 0.30527809, 0.32101279,
         0.31426529, 0.3350261 , 0.33831256, 0.30012365, 0.30287159],
        [0.39608144, 0.38109562, 0.41662116, 0.40638002, 0.41791456,
         0.40641226, 0.44955332, 0.44545138, 0.40050687, 0.39659829],
        [0.52593732, 0.54251735, 0.57760869, 0.56082674, 0.56808121,
         0.55389761, 0.59152879, 0.62432776, 0.52053009, 0.54411424]]])
%% Cell type:code id: tags:
``` python
# Sum of the OMP weights for each experiment and each extraction size
[[sum(elem[5]) for elem in results] for results in results_global]
```
%% Output
[[1.0019333893291256,
  1.0002339744254798,
  0.9965284128922761,
  0.9930020768164572,
  0.9713515521464255,
  0.9355587965148584],
 [0.9964172479701133,
  0.9965133913097529,
  0.9921861297905141,
  0.9842416540561515,
  0.9682575760922218,
  0.933691714442025],
 [0.9983332596273122,
  0.9980542966631237,
  0.9953384960946179,
  0.9863737744133908,
  0.9713936104999941,
  0.9322186005941735],
 [1.0038094772994455,
  1.0018194366490565,
  0.9977426702506628,
  0.9901515373986598,
  0.9656704215772347,
  0.9225711413699638],
 [0.9990610710125596,
  0.997516190665299,
  0.994208907140429,
  0.9866287246076226,
  0.9654844823617819,
  0.9349411984209011],
 [0.9988517681736824,
  0.9978719264484801,
  0.9951808165785492,
  0.9869717461401798,
  0.96626061534528,
  0.9275270848174226],
 [1.0036372042892556,
  1.0021281690286359,
  0.9992184310564347,
  0.9936213667248184,
  0.9726814533989055,
  0.9287908504721321],
 [1.0072738855021581,
  1.0058719271210204,
  1.0008091849171328,
  0.9924905567327407,
  0.9726743101033102,
  0.9357752656379753],
 [1.0124396297482838,
  1.0124081517188515,
  1.008883039483941,
  1.0028553033696677,
  0.9743542569764307,
  0.942597999044375],
 [0.9944072082781984,
  0.9943119815126942,
  0.9936514348889314,
  0.9866366329633924,
  0.9668343182393178,
  0.9188473811851859]]
%% Cell type:code id: tags:
``` python
results_global[0]
```
%% Cell type:markdown id: tags:
## Training the random forest
%% Cell type:code id: tags:
``` python
# RANDOM_SEED and NB_TREES_EXTRACTED are not defined anywhere above;
# set them here so these scratch cells run (the values are arbitrary)
RANDOM_SEED = 0
NB_TREES_EXTRACTED = 10
regressor = RandomForestRegressor(n_estimators=NB_TREES, random_state=RANDOM_SEED)
regressor.fit(X_train, y_train)
```
%% Cell type:code id: tags:
``` python
# Access the list of trees
tree_list = regressor.estimators_
```
%% Cell type:markdown id: tags:
## Building the matrix of the predictions of each tree
%% Cell type:code id: tags:
``` python
# The scikit-learn implementation is slightly different from the one seen in the meeting: D has the
# same length as X and each element holds d signals, hence the construction below where, for each
# element, we build the list of the values predicted by each tree
D = [[tree.predict([elem])[0] for tree in tree_list] for elem in X_train]
```
%% Cell type:code id: tags:
``` python
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=NB_TREES_EXTRACTED)
omp.fit(D, y_train)
```
%% Cell type:code id: tags:
``` python
# Vector with the weight of each tree
omp.coef_
```
%% Cell type:markdown id: tags:
## Computing the results of the different methods
%% Cell type:markdown id: tags:
### Result of the base forest
%% Cell type:code id: tags:
``` python
mean_squared_error(regressor.predict(X_test), y_test)
```
%% Cell type:markdown id: tags:
### Result of the forest extracted with OMP, where each tree is multiplied by its weight
%% Cell type:code id: tags:
``` python
y_pred = [sum([tree_list[i].predict([elem])[0] * omp.coef_[i] for i in range(NB_TREES)]) for elem in X_test]
```
%% Cell type:code id: tags:
``` python
mean_squared_error(y_pred, y_test)
```
%% Cell type:markdown id: tags:
### Result of the forest extracted with OMP, taking the mean of the extracted trees
%% Cell type:code id: tags:
``` python
y_pred = [mean([tree_list[i].predict([elem])[0] for i in range(NB_TREES) if omp.coef_[i] != 0]) for elem in X_test]
mean_squared_error(y_pred, y_test)
```
%% Cell type:markdown id: tags:
### Result of a forest with as many trees as were extracted
%% Cell type:code id: tags:
``` python
regressor_small = RandomForestRegressor(n_estimators=NB_TREES_EXTRACTED, random_state=RANDOM_SEED)
regressor_small.fit(X_train, y_train)
```
%% Cell type:code id: tags:
``` python
mean_squared_error(regressor_small.predict(X_test), y_test)
```
%% Cell type:code id: tags:
``` python
```