\documentclass[11pt]{article}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{mathtools}
\usepackage{algpseudocode}
\usepackage{algorithm}
\usepackage{float}
\algnewcommand\algorithmicforeach{\textbf{for each}}
\algdef{S}[FOR]{ForEach}[1]{\algorithmicforeach\ #1\ \algorithmicdo}
\makeatletter
\renewcommand{\ALG@beginalgorithmic}{\small}
\makeatother
\title{bolsonaro}
\date{September 2019}
\begin{document}
\maketitle
\section{Introduction}
\subsection{Notation}
Let $S = \{(x_i, y_i)\}^n_{i=1}$ be the dataset, with $x_i \in X$ and $y_i \in Y$. Let $T = \{t_1, t_2, \dots, t_d\}$ be the random forest of $d$ trees, where each $t_j : X \rightarrow Y$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Orthogonal Matching Pursuit (OMP)}
Let $y \in \mathbb{R}^n$ be a signal and $D \in \mathbb{R}^{n \times d}$ a dictionary whose columns are the atoms $d_j \in \mathbb{R}^n$. Goal: find $w \in \mathbb{R}^d$ such that $y = Dw$ and $||w||_0 < k$. We write $\text{span}(\{v_1, \dots, v_n\}) = \{u : u = \sum^n_{i=1} \alpha_i v_i \ | \ \alpha_i \in \mathbb{R}\}$.
\begin{algorithm}[htb]
\caption{Orthogonal Matching Pursuit}
\begin{algorithmic}[1]
\State $w_0 \gets 0$
\State $r_0 \gets y$
\State $\lambda \gets \emptyset$
\ForEach {$k \in \{0, \dots, K\}$}
\State $d^* \gets \underset{d \in \{d_1, \dots, d_d\}}{\text{argmax}} \ |\langle d, r_k \rangle|$
\State $\lambda \gets \lambda \cup \{d^*\}$
\State $w_{k+1} \gets \underset{\substack{\alpha \ \in \ \mathbb{R}^d \text{ s.t. } \\ D\alpha \ \in \ \text{span}(\lambda)}}{\text{argmin}} \ ||y - D\alpha||^2_2$
\State $r_{k + 1} \gets y - Dw_{k+1}$
\EndFor
\end{algorithmic}
\end{algorithm}
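To make the procedure concrete, here is a minimal Python sketch of the OMP loop above (NumPy only; \texttt{D}, \texttt{y} and \texttt{K} are placeholders for the dictionary, the signal and the number of iterations, not names used elsewhere in this project). Restricting the least-squares step to the selected atoms is equivalent to the span constraint in the pseudocode.
\begin{verbatim}
import numpy as np

def omp(D, y, K):
    """Greedy OMP: approximate y with at most K columns (atoms) of D."""
    n, d = D.shape
    support = []             # indices of selected atoms (the set lambda)
    w = np.zeros(d)
    r = y.copy()             # residual r_0 = y
    for _ in range(K):
        j = int(np.argmax(np.abs(D.T @ r)))   # atom most correlated with r
        if j not in support:
            support.append(j)
        # least squares restricted to the selected atoms (span constraint)
        w_s, *_ = np.linalg.lstsq(D[:, support], y, rcond=None)
        w = np.zeros(d)
        w[support] = w_s
        r = y - D @ w        # updated residual
    return w
\end{verbatim}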
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Our problem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Related Work}
\begin{itemize}
\item \cite{Yang2012}: once the forest $F = \{t_1, \dots, t_n\}$ is built, each tree is given a score measuring its importance in the forest. The tree with the lowest score is removed from the forest; to eliminate the next tree, all scores are recomputed and again the tree with the lowest score is removed, and so on.\\
The paper presents four different tree scores. For each tree $t_i$, we compute (a small Python sketch of the margin and of $score_1$ is given after this list):
\begin{itemize}
\item $score_1 = mean_{(x,y) \in train}( margin((x,y), F) - margin((x,y),F\backslash t_i))$
\item $score_2 = min_{(x,y) \in train}( margin((x,y), F)) - min_{(x,y) \in train} (margin((x,y), F\backslash t_i))$
%\item $score_3 = min_{(x,y) \in train}( margin((x,y), F) - min_{(x,y) \in train} (margin(F\backslash t_i)))$
\end{itemize}
where:
$$ margin((x, y), F) = \frac{1}{|F|} \sum_{i = 1}^{|F|} I(t_i(x) = y) - \max_{l \neq y} \frac{1}{|F|} \sum_{i = 1}^{|F|} I(t_i(x) = l)$$
They run experiments on several UCI classification data sets (most of them binary), with numbers of attributes ranging from 5 to 61: Diabetes, Heart, Hearts, Iris, Ionosphere, Monks, Sonar, Steel, Tic, Wine.
They build a random forest of size 100, then prune it with their algorithm and obtain smaller forests with sizes ranging from 99 down to 20. The performance of their algorithm is compared with random forests of the corresponding sizes (i.e. forests directly constructed with 99 down to 20 trees).
On all the data sets except the colon and diabetes data sets, the more trees are pruned, the better the performance.
They do not show the variance of the models. They also compare their method with similarity-based pruning (Sim-P) and distance minimization (MarDistM). Except on diabetes, their method outperforms the other two algorithms.
%
\item \cite{Zhang}: this paper presents three measures to determine the importance of a tree in a forest. Trees with the lowest importance are removed from the forest.
\begin{itemize}
\item $measure_1$ focuses on the prediction. A tree can be removed if its removal from the forest has the smallest impact on the prediction accuracy. Let $F = \{t_1, \dots, t_n\}$ be a forest. For every tree $t_i$, we compute the importance score $\Delta_{F \backslash t_i}$, which is the difference between the prediction accuracy of $F$ and that of $F \backslash t_i$:
$$ \Delta_{F \backslash t_i} = predictAccuracy(F) - predictAccuracy(F \backslash t_i) $$
The tree that is removed is $t = \underset{t_i \in F}{argmin} \ \Delta_{F \backslash t_i}$.
\item $measure_2$ removes a tree if it is similar to the other trees in the forest. The similarity between the tree $t_i$ and the rest of the forest is measured by:
$$\rho_{t_i} = \frac{1}{|F|} \sum_{t \in F; \ t \neq t_i} cor_{t_i, t}$$
where $cor_{t_i, t_j} = correlation(predict_{t_i}, predict_{t_j})$ is the correlation between the predictions of the tree $t_i$ and the tree $t_j$. With this measure, the deleted tree is the one most correlated with the rest of the forest, i.e. the one that maximizes $\rho$.
\item $measure_3$
\end{itemize}
For the experiments, they use a breast cancer prognosis data set. They reduce a forest of 100 trees to a forest of, on average, 26 trees while keeping the same error rate. A sketch of $measure_1$ and $measure_2$ is given after this list.
\item \cite{Fawagreh2015}: the goal is to get a much smaller forest while staying accurate and diverse. To do so, they use a clustering algorithm. Let $C(t_i, T) = \{c_{i1}, \dots, c_{im}\}$ denote the vector of class labels obtained by having $t_i$ classify the training set $T$ of size $m$, with $t_i \in F$, where $F$ is the forest of size $n$. Let $\mathcal{C} = \bigcup^n_{i=1} C(t_i, T)$ be the super vector of all class vectors produced by the trees. They then apply a clustering algorithm on $\mathcal{C}$ to find $k = \sqrt{\frac{n}{2}}$ clusters. Finally, the pruned forest $F'$ is composed of the most representative tree of each cluster, so if you have 100 trees and 7 clusters, the final number of trees will be 7. They obtain at least similar performances to those of the regular RF algorithm. A sketch of this clustering-based selection is given after the list.
\end{itemize}
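As announced above, here is a small Python sketch of the margin and of $score_1$ from \cite{Yang2012}, assuming scikit-learn-style trees with a \texttt{predict} method; the variable names are ours, and $score_2$ follows the same pattern with minima instead of a mean.
\begin{verbatim}
import numpy as np

def margin(x, y, forest, labels):
    """Margin of the forest on a single example (x, y)."""
    votes = np.array([t.predict([x])[0] for t in forest])
    p_true = np.mean(votes == y)
    p_other = max(np.mean(votes == l) for l in labels if l != y)
    return p_true - p_other

def score_1(t_i, forest, X, Y, labels):
    """Mean margin loss over the training set when t_i is removed."""
    reduced = [t for t in forest if t is not t_i]
    return np.mean([margin(x, y, forest, labels) - margin(x, y, reduced, labels)
                    for x, y in zip(X, Y)])
\end{verbatim}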
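Similarly, a sketch of $measure_1$ and $measure_2$ from \cite{Zhang}, under the same assumptions (majority-vote accuracy for the forest, integer class labels so that a correlation between prediction vectors is well defined):
\begin{verbatim}
import numpy as np

def forest_accuracy(forest, X, Y):
    """Majority-vote accuracy of the forest on (X, Y)."""
    preds = np.array([t.predict(X) for t in forest])   # (n_trees, n_samples)
    majority = np.array([np.bincount(col).argmax()
                         for col in preds.T.astype(int)])
    return np.mean(majority == Y)

def measure_1(t_i, forest, X, Y):
    """Accuracy drop when t_i is removed (smallest drop = first removed)."""
    reduced = [t for t in forest if t is not t_i]
    return forest_accuracy(forest, X, Y) - forest_accuracy(reduced, X, Y)

def measure_2(t_i, forest, X):
    """Mean correlation between t_i's predictions and the other trees'."""
    p_i = t_i.predict(X)
    others = [t for t in forest if t is not t_i]
    return np.mean([np.corrcoef(p_i, t.predict(X))[0, 1] for t in others])
\end{verbatim}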
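Finally, a rough sketch of the clustering-based selection of \cite{Fawagreh2015}, using scikit-learn's \texttt{KMeans}; taking as representative the tree whose class vector is closest to its cluster centroid is our reading of ``most representative'' and may differ from the paper.
\begin{verbatim}
import numpy as np
from sklearn.cluster import KMeans

def cluster_prune(forest, X_train):
    """Keep one representative tree per cluster of class-label vectors."""
    n = len(forest)
    C = np.array([t.predict(X_train) for t in forest])  # one class vector per tree
    k = max(1, int(round(np.sqrt(n / 2))))               # k = sqrt(n / 2) clusters
    km = KMeans(n_clusters=k, n_init=10).fit(C)
    pruned = []
    for c in range(k):
        members = np.where(km.labels_ == c)[0]
        # representative = tree whose class vector is closest to the centroid
        dists = np.linalg.norm(C[members] - km.cluster_centers_[c], axis=1)
        pruned.append(forest[members[np.argmin(dists)]])
    return pruned
\end{verbatim}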
\section{References}
\nocite{*}
\bibliographystyle{plain}
\bibliography{bolsonaro_biblio}
\end{document}