Réduction_de_fôrets_aléatoires_with_dev-Copy1.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Groupe de travail\n",
    "\n",
    "Le but de ce notebook est de tester l'idée de réduction des random forest"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from statistics import mean \n",
    "\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from sklearn.datasets import load_boston, load_breast_cancer\n",
    "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n",
    "from sklearn.linear_model import OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.neighbors.kde import KernelDensity"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Variables globales"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "NB_TREES = 100"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load jeu de donnée"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, y = load_boston(return_X_y=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_forest(X_train, y_train, nb_trees, random_seed):\n",
    "    '''\n",
    "    Function that will train a random forest with nb_tress\n",
    "    :param X_train: list of inputs\n",
    "    :param y_train: list of results\n",
    "    :param nb_trees: int, number of trees in the forest\n",
    "    :param random_seed: int, seed for the random_states\n",
    "    :return: a RandomForestRegressor\n",
    "    '''\n",
    "    \n",
    "     # Entraînement de la forêt aléatoire\n",
    "    \n",
    "    regressor = RandomForestRegressor(n_estimators=nb_trees, random_state = random_seed)\n",
    "    regressor.fit(X_train, y_train)\n",
    "    return regressor\n",
    "\n",
    "\n",
    "def extract_subforest(random_forest, X_train, y_train, nb_trees_extracted):\n",
    "    '''\n",
    "    Function use to get the weight list of a subforest of size nb_trees_extracted for random_forest\n",
    "    using OMP.\n",
    "    :param random_forest: a RandomForestRegressor\n",
    "    :param X_train: list of inputs\n",
    "    :param y_train: list of results\n",
    "    :param nb_trees_extracted: int, number of trees extracted \n",
    "    :return: a list of int, weight of each tree\n",
    "    '''\n",
    "    \n",
    "    # Accès à la la liste des arbres\n",
    "\n",
    "    tree_list = random_forest.estimators_\n",
    "    \n",
    "    # Création de la matrice des prédictions de chaque arbre\n",
    "    \n",
    "    # L'implémentation de scikit-learn est un peu différente que celle vue en réunion, D est de même taille que X \n",
    "    # et chaque élément est composé de d signaux, d'où la création suivante de D où on créé une liste pour chaque\n",
    "    # élément comprenant les valeurs prédites par chaque arbre\n",
    "\n",
    "    D = [[tree.predict([elem])[0] for tree in tree_list] for elem in X_train]\n",
    "    \n",
    "    # OMP\n",
    "    \n",
    "    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=nb_trees_extracted, fit_intercept = False, normalize=False)\n",
    "    omp.fit(D, y_train)\n",
    "    \n",
    "    weights = omp.coef_\n",
    "    \n",
    "    return weights\n",
    "\n",
    "\n",
    "def compute_results(weights, random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test, \n",
    "                    nb_trees, nb_trees_extracted, random_seed):\n",
    "    '''\n",
    "    Compute the score of the different techniques\n",
    "    :param weights: weights given by the OMP\n",
    "    :param random_forest: a RandomForestRegressor\n",
    "    :param X_train: list of inputs\n",
    "    :param X_dev: list of inputs\n",
    "    :param X_test: list of inputs\n",
    "    :param y_train: list of results\n",
    "    :param y_dev: list of results\n",
    "    :param y_test: list of results\n",
    "    :param nb_trees: int, number of trees in the main forest\n",
    "    :param nb_trees_extracted: int, number of trees extracted from the main forest\n",
    "    :param random_seed: int, seed for the random_states\n",
    "    :return: 4 results of 4 different methods, in order: results of the main forest, \n",
    "    results of the weighted results of the extracted trees, results of the mean results \n",
    "    of the extracted trees, results of a random_forest train with nb_trees_extracted directly\n",
    "    '''\n",
    "    \n",
    "    # Calcul des différents résultats\n",
    "    \n",
    "    res_base_forest = mean_squared_error(random_forest.predict(X_test), y_test)\n",
    "    \n",
    "    # Résultat de la forêt extraite avec l'OMP, où chaque arbre est multiplié par son poids\n",
    "    \n",
    "    y_pred = [sum([random_forest.estimators_[i].predict([elem])[0] * weights[i] for i in range(nb_trees)]) \n",
    "              for elem in X_test]\n",
    "    res_extract_weight = mean_squared_error(y_pred, y_test)\n",
    "    \n",
    "    # Résultat de la forêt extraite avec l'OMP, où chaque arbre est multiplié par son poids\n",
    "    \n",
    "    y_pred = [sum([random_forest.estimators_[i].predict([elem])[0] * weights[i] for i in range(nb_trees)])/sum(weights) \n",
    "              for elem in X_test]\n",
    "    res_extract_weight_norm = mean_squared_error(y_pred, y_test)\n",
    "    \n",
    "    # Résultat de la forêt extraite avec l'OMP, où on prends la moyenne des arbres extraits\n",
    "    \n",
    "    y_pred = [mean([random_forest.estimators_[i].predict([elem])[0] for i in range(nb_trees) if abs(weights[i]) >= 0.01])\n",
    "              for elem in X_test]\n",
    "    res_extract_mean = mean_squared_error(y_pred, y_test)\n",
    "    \n",
    "    # Résultat d'une forêt avec le même nombre d'arbre que le nombre d'arbre extrait\n",
    "\n",
    "    small_forest = train_forest(np.concatenate((X_train, X_dev)), np.concatenate((y_train, y_dev)), nb_trees_extracted, random_seed)\n",
    "    res_small_forest = mean_squared_error(small_forest.predict(X_test), y_test)\n",
    "    \n",
    "    return res_base_forest, res_extract_weight, res_extract_weight_norm, res_extract_mean, res_small_forest, weights\n",
    "\n",
    "\n",
    "def extract_and_get_results(random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test, nb_trees, \n",
    "                            nb_trees_extracted, random_seed):\n",
    "    '''\n",
    "    Extract the subforest and returns the resuts of the different methods\n",
    "    :param X_train: list of inputs\n",
    "    :param X_dev: list of inputs\n",
    "    :param X_test: list of inputs\n",
    "    :param y_train: list of results\n",
    "    :param y_dev: list of results\n",
    "    :param y_test: list of results\n",
    "    :param nb_trees: int, number of trees in the main forest\n",
    "    :param nb_trees_extracted: int, number of trees extracted from the main forest\n",
    "    :param random_seed: int, seed for the random_states\n",
    "    :return: 4 results of 4 different methods, in order: results of the main forest, \n",
    "    results of the weighted results of the extracted trees, results of the mean results \n",
    "    of the extracted trees, results of a random_forest train with nb_trees_extracted directly\n",
    "    '''\n",
    "    \n",
    "    weights = extract_subforest(random_forest, X_dev, y_dev, nb_trees_extracted)\n",
    "    \n",
    "    res_base_forest, res_extract_weight, res_extract_weight_norm, res_extract_mean, res_small_forest = \\\n",
    "        compute_results(weights, random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test, \n",
    "                        nb_trees, nb_trees_extracted, random_seed)\n",
    "    \n",
    "    return res_base_forest, res_extract_weight, res_extract_weight_norm, res_extract_mean, res_small_forest, weights\n",
    "    \n",
    "    \n",
    "\n",
    "def train_extract_subforest(X_train, X_test, y_train, y_test, nb_trees, nb_trees_extracted, random_seed):\n",
    "    '''\n",
    "    Function that takes data, number of trees and a random seed. Train a forest with nb_trees, extract\n",
    "    with OMP nb_trees_extracted and compare the results of the different method\n",
    "    :param X_train: list of inputs\n",
    "    :param X_test: list of inputs\n",
    "    :param y_train: list of results\n",
    "    :param y_test: list of results\n",
    "    :param nb_trees: int, number of trees in the main forest\n",
    "    :param nb_trees_extracted: int, number of trees extracted from the main forest\n",
    "    :param random_seed: int, seed for the random_states\n",
    "    :return: 4 results of 4 different methods, in order: results of the main forest, \n",
    "    results of the weighted results of the extracted trees, results of the mean results \n",
    "    of the extracted trees, results of a random_forest train with nb_trees_extracted directly\n",
    "    '''\n",
    "    \n",
    "    random_forest = train_forest(X_train, y_train, nb_trees, random_seed)\n",
    "    \n",
    "    weight = extract_subforest(random_forest, X_train, y_train, nb_trees_extracted)\n",
    "    \n",
    "    res_base_forest, res_extract_weight, res_extract_mean, res_small_forest = \\\n",
    "        compute_results(weight, random_forest, X_train, X_test, y_train, y_test,\n",
    "                        nb_trees, nb_trees_extracted, random_seed)\n",
    "    \n",
    "    \n",
    "    return res_base_forest, res_extract_weight, res_extract_mean, res_small_forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results_global = []\n",
    "results_dev_global = []\n",
    "\n",
    "nb_trees = 100\n",
    "random_seeds = list(range(10))\n",
    "\n",
    "for random_seed in random_seeds:\n",
    "    \n",
    "    # Séparation train_test avec random_state\n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_seed)\n",
    "    X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size = 0.5, random_state = random_seed)\n",
    "\n",
    "    random_forest = train_forest(X_train, y_train, NB_TREES, random_seed)\n",
    "    \n",
    "    results = []\n",
    "    results_dev = []\n",
    "\n",
    "    for nb_trees_extracted in [int(NB_TREES/k) for k in [2, 5, 10, 20, 50, 100]]:\n",
    "        \n",
    "        weights = extract_subforest(random_forest, X_dev, y_dev, nb_trees_extracted)\n",
    "\n",
    "        results.append(compute_results(weights, random_forest, X_train, X_dev, X_test, y_train, y_dev, y_test, \n",
    "                        nb_trees, nb_trees_extracted, random_seed))\n",
    "        \n",
    "        \n",
    "        results_dev.append(compute_results(weights, random_forest, X_train, X_dev, X_dev, y_train, y_dev, y_dev, \n",
    "                           nb_trees, nb_trees_extracted, random_seed))\n",
    "\n",
    "    results_global.append(results)\n",
    "    results_dev_global.append(results_dev)\n",
    "    print('over')\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_results(results_global, title_graph):\n",
    "    def plot_mean_and_CI(mean, lb, ub, x_value, color_mean=None, color_shading=None, label=None):\n",
    "        # plot the shaded range of the confidence intervals\n",
    "        plt.fill_between(x_value, ub, lb,\n",
    "                         color=color_shading, alpha=.5)\n",
    "        # plot the mean on top\n",
    "        plt.plot(x_value, mean, color_mean, label = label)\n",
    "\n",
    "    means_results = np.array(\n",
    "        [\n",
    "            [mean(\n",
    "                [results[i][k] for results in results_global] # loop over the different experiments\n",
    "            ) for i in range(len(results_global[0]))] # loop over the different number of trees extracted\n",
    "        for k in range(5)]) # loop over the different methods\n",
    "    std_results = np.array(\n",
    "        [\n",
    "            [np.std(\n",
    "                [results[i][k] for results in results_global]\n",
    "            ) for i in range(len(results_global[0]))]\n",
    "        for k in range(5)])\n",
    "\n",
    "    x_value = [int(NB_TREES/k) for k in [2, 5, 10, 20, 50, 100]]\n",
    "    # plot the data\n",
    "    fig = plt.figure(1, figsize=(15, 10))\n",
    "    plot_mean_and_CI(means_results[0], means_results[0] + std_results[0], means_results[0] - std_results[0],\n",
    "                     x_value, color_mean='k', color_shading='k', label='Results of the base forest (on train set)')\n",
    "\n",
    "    plot_mean_and_CI(means_results[1], means_results[1] + std_results[1], means_results[1] - std_results[1],\n",
    "                     x_value, color_mean='darkorange', color_shading='darkorange', \n",
    "                     label='Weighted results of the extracted trees')\n",
    "    plot_mean_and_CI(means_results[2], means_results[2] + std_results[2], means_results[2] - std_results[2], \n",
    "                     x_value, color_mean='red', color_shading='red',\n",
    "x                    label='Weighted results of the extracted trees normalized')\n",
    "\n",
    "    plot_mean_and_CI(means_results[3], means_results[3] + std_results[3], means_results[3] - std_results[3], \n",
    "                     x_value, color_mean='b', color_shading='b',\n",
    "                    label='Mean results of the extracted trees')\n",
    "    plot_mean_and_CI(means_results[4], means_results[4] + std_results[4], means_results[4] - std_results[4], \n",
    "                     x_value, color_mean='g', color_shading='g',\n",
    "                    label='Results of a forest train with number of trees extracted (train+dev set)')\n",
    "    plt.xlabel('Number of trees extracted')\n",
    "    plt.ylabel('MSE')\n",
    "    plt.title(title_graph)\n",
    "\n",
    "    plt.legend(loc=\"upper right\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "plot_results(results_global, 'Reduction of a forest with 100 trees, 10 iterations with different seed, score on train set')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_results(results_dev_global, 'Reduction of a forest with 100 trees, 10 iterations with different seed, score on dev set')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for results in results_global:\n",
    "    x_value = [int(NB_TREES/k) for k in [5, 10, 50, 100, 500, 1000]]\n",
    "    plt.xlabel('Number of trees extracted')\n",
    "    plt.ylabel('MSE')\n",
    "    plt.plot(x_value, [elem[1] for elem in results], color='darkorange',\n",
    "             label='Weighted results of the average trees')\n",
    "    plt.plot(x_value, [elem[2] for elem in results], color='red',\n",
    "            label='Weighted results of the average trees normalized')\n",
    "    plt.plot(x_value, [elem[3] for elem in results], color='blue',\n",
    "             label='Mean results of the average trees')\n",
    "    plt.plot(x_value, [elem[4] for elem in results], color='green',\n",
    "             label='Results of a forest train with number of trees extracted')\n",
    "    plt.plot(x_value, [elem[0] for elem in results], color='black',\n",
    "             label='Results of the base forest')\n",
    "    plt.figure(1, figsize=(15, 10))\n",
    "\n",
    "    plt.legend(loc=\"upper right\")\n",
    "    \n",
    "\n",
    "\n",
    "    fig_acc_rec = plt.gcf()\n",
    "\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def weight_density(list_weight):\n",
    "    print(list_weight)\n",
    "    X_plot = [np.exp(elem) for elem in list_weight]\n",
    "    fig, ax = plt.subplots()\n",
    "\n",
    "    for kernel in ['gaussian', 'tophat', 'epanechnikov']:\n",
    "        kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X_plot)\n",
    "        log_dens = kde.score_samples(X_plot)\n",
    "        ax.plot(X_plot[:, 0], np.exp(log_dens), '-',\n",
    "                label=\"kernel = '{0}'\".format(kernel))\n",
    "\n",
    "    ax.legend(loc='upper left')\n",
    "    ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')\n",
    "\n",
    "    ax.set_xlim(-4, 9)\n",
    "    ax.set_ylim(-0.02, 0.4)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for results in results_global:\n",
    "    ax = pd.Series([[e for e in test[5] if e != 0] for test in results][1]).plot.kde(figsize=(15, 10))\n",
    "    \n",
    "legends = ['OK'] * 10\n",
    "legends[4] = 'Problème'\n",
    "# ax.legend(legends)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "np.array(\n",
    "    [\n",
    "        [\n",
    "            [results[i][k] for results in results_global]\n",
    "        for i in range(len(results_global[0]))]\n",
    "    for k in range(5)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "[[sum(elem[5]) for elem in results] for results in results_global] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results_global[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Entraînement de la forêt aléatoire"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "regressor = RandomForestRegressor(n_estimators=NB_TREES, random_state = RANDOM_SEED)\n",
    "\n",
    "regressor.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Accès à la la liste des arbres\n",
    "\n",
    "tree_list = regressor.estimators_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Création de la matrice des prédictions de chaque arbre"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# L'implémentation de scikit-learn est un peu différente que celle vue en réunion, D est de même taille que X \n",
    "# et chaque élément est composé de d signaux, d'où la création suivante de D où on créé une liste pour chaque\n",
    "# élément comprenant les valeurs prédites par chaque arbre\n",
    "\n",
    "D = [[tree.predict([elem])[0] for tree in tree_list] for elem in X_train]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "omp = OrthogonalMatchingPursuit(n_nonzero_coefs=NB_TREES_EXTRACTED)\n",
    "omp.fit(D, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Matrice avec poids de chaque arbre\n",
    "\n",
    "omp.coef_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calcul des résultats des différentes méthodes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Résultat de la forêt de base"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mean_squared_error(regressor.predict(X_test), y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Résultat de la forêt extraite avec l'OMP, où chaque arbre est multiplié par son poids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = [sum([tree_list[i].predict([elem])[0] * omp.coef_[i] for i in range(NB_TREES)]) for elem in X_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mean_squared_error(y_pred, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Résultat de la forêt extraite avec l'OMP, où on prends la moyenne des arbres extraits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = [mean([tree_list[i].predict([elem])[0] for i in range(NB_TREES) if omp.coef_[i] != 0])for elem in X_test]\n",
    "mean_squared_error(y_pred, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Résultat d'une forêt avec le même nombre d'arbre que le nombre d'arbre extrait"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "regressor_small = RandomForestRegressor(n_estimators=NB_TREES_EXTRACTED, random_state=RANDOM_SEED)\n",
    "regressor_small.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mean_squared_error(regressor_small.predict(X_test), y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}