From 8265d59656f0abe4411fa89b9dd6ad963fb482af Mon Sep 17 00:00:00 2001
From: "farah.cherfaoui" <farah.cherfaoui.lis-lab.fr>
Date: Sat, 2 Nov 2019 19:52:43 +0100
Subject: [PATCH] notation table and OMP algo

---
 reports/bolsonaro.tex | 60 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 12 deletions(-)

diff --git a/reports/bolsonaro.tex b/reports/bolsonaro.tex
index 24e7fb1..fc71047 100644
--- a/reports/bolsonaro.tex
+++ b/reports/bolsonaro.tex
@@ -51,30 +51,55 @@ We will need to define the vector prediction of a forest for all the data matrix
    F_{t_1, \dots, t_l}(x_1) \\
    \dots \\
    F_{t_1, \dots, t_l}(x_n) 
-\end{pmatrix}.$
+\end{pmatrix}.$\\
+%
+%
+%
+All these notations are summarized in the following table:\\
+\begin{tabular}{l c}%\caption{Notation table}
+  %\hline
+  $\textbf{x} \in {\cal X}$ & a data vector \\
+  $X \in {\cal X}^n$ & the data matrix \\
+  ${\cal X}$ & the data representation space \\
+  ${\cal Y}$ & the label representation space \\
+  $n$ & the number of data points \\
+  $d$ & the data dimension \\
+  $l$ & the forest size \\
+  $k$ & the desired (pruned) forest size \\
+  $F_{t_1, \dots, t_l}$ & a forest of $l$ trees \\
+  $F_{t_1, \dots, t_l}(\textbf{x}) \in {\cal Y}$ & the label of $\textbf{x}$ predicted by the forest $F_{t_1, \dots, t_l}$ \\
+  $F_{t_1, \dots, t_l}(X) \in {\cal Y}^n$ & the predicted labels of all the data points in $X$ \\
+  %\hline
+\end{tabular}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{Orthogonal Matching Pursuit (OMP)}
+Given a matrix $D = [d_1, \dots , d_l] \in \mathbb{R}^{n \times l}$ (also called a dictionary) and a signal $\textbf{y}\in \mathbb{R}^n$, finding a $k$-sparse vector $\textbf{w} \in \mathbb{R}^l$ (i.e. $|| \textbf{w} ||_0 \leq k$) that minimizes $|| D\textbf{w} - \textbf{y}||$ is an NP-hard problem (ref).
+The Orthogonal Matching Pursuit (OMP) algorithm is a greedy algorithm that aims to provide an approximate solution to this problem.
+The approximation of $\textbf{y}$ is built one term at a time. Denoting by $\textbf{y}_k$ the current
+ approximation and by $r_k = \textbf{y} - \textbf{y}_k$ the so-called residual, we select at each time step the atom (i.e. the column of $D$) which has the largest inner product with $r_k$, and update the approximation accordingly.
+This step is repeated until the approximation is satisfactory. The procedure is summarized in Algorithm \ref{algo: OMP}.
 
-$y \in \mathbb{R}^n$ a signal. $D \in \mathbb{R}^{n \times d}$ a dictionnary with $d_j \in \mathbb{R^n}$. Goal: find $w \in \mathbb{R}^d$, such that $y = Dw$ and $||w||_0 < k$. $\text{span}(\{v_1, \dots, v_n\}) \{u : u = \sum^n_{i=1} \alpha_i v_i \ | \ \alpha_i \in \mathbb{R}\}$.
+
+
 
 \begin{algorithm}[htb]
-    \caption{Orthogonal Matching Pursuit}
+    \caption{Orthogonal Matching Pursuit}\label{algo: OMP}
     \begin{algorithmic}[1]
         \State $w_0 \gets 0$
-        \State $r \gets y$
+        \State $r_0 \gets \textbf{y}$
         \State $\lambda \gets \emptyset$
         \ForEach {$k \in \{0, \dots, K\}$}
-            \State $d^* \gets \underset{d \in \{d_1, \dots, d_d\}}{\text{argmax}} \ |<d, r_k>|$
+            \State $d^* \gets \underset{d \in \{d_1, \dots, d_l\}}{\text{argmax}} \ |\langle d, r_k \rangle|$
             \State $\lambda \gets \lambda \cup \{d^*\}$
-            \State $w_{k+1} \gets \underset{\substack{\alpha \text{ s.c. } \\ D\alpha \ \in \ \text{span}(d) \\ \alpha \ \in \ \mathbb{R}^d}}{\text{argmin}} \ ||y - D\alpha||^2_2$
-            \State $r_{k + 1} \rightarrow y - D_{w_{k+1}}$
+            \State $w_{k+1} \gets \underset{\substack{\alpha \in \mathbb{R}^l \text{ s.t. } \\ D\alpha \ \in \ \text{span}(\lambda)}}{\text{argmin}} \ ||\textbf{y} - D\alpha||^2_2$
+            \State $r_{k + 1} \gets \textbf{y} - Dw_{k+1}$
         \EndFor
     \end{algorithmic}
 \end{algorithm}
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\subsection{Our problem}
-
+%
+In general, the OMP algorithm can be seen as an algorithm that selects the columns of the dictionary that are the most useful for expressing the signal $\textbf{y}$.
+In this paper, we use this algorithm to reduce the forest's size by selecting the most informative trees of the forest (see Section \ref{sec: forest pruning} for more details).
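+%
+As an illustration, here is a minimal NumPy sketch of Algorithm \ref{algo: OMP} (the function \texttt{omp} and its interface are ours, for illustration only, not a reference implementation):
+\begin{verbatim}
+import numpy as np
+
+def omp(D, y, k):
+    """Greedy OMP: select k columns of D to approximate the signal y."""
+    n, l = D.shape
+    residual = y.copy()
+    selected = []                  # indices of the chosen atoms (lambda)
+    for _ in range(k):
+        # pick the atom most correlated with the current residual
+        j = int(np.argmax(np.abs(D.T @ residual)))
+        selected.append(j)
+        # re-fit the coefficients by least squares on the selected atoms
+        coef, *_ = np.linalg.lstsq(D[:, selected], y, rcond=None)
+        residual = y - D[:, selected] @ coef
+    w = np.zeros(l)
+    w[selected] = coef             # k-sparse solution
+    return w, selected
+\end{verbatim}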
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Related Work}
 \begin{itemize}
@@ -106,7 +131,18 @@ For the experiments, they use breast cancer prognosis. They reduce the size of a
 
 \item \cite{Fawagreh2015}: The goal is to get a much smaller forest while staying accurate and diverse. To do so, they used a clustering algorithm. Let $C(t_i, T) = \{c_{i1}, \dots, c_{im}\}$ denote the vector of class labels obtained by having $t_i$ classify the training set $T$ of size $m$, with $t_i \in F$, $F$ the forest of size $n$. Let $\mathcal{C} = \bigcup^n_{i=1} C(t_i, T)$ be the super vector of all the class vectors produced by the trees. They then applied a clustering algorithm on $\mathcal{C}$ to find $k = \sqrt{\frac{n}{2}}$ clusters. Finally, the pruned forest $F'$ is composed of the most representative tree of each cluster: with 100 trees and 7 clusters, the final number of trees is 7. They obtained performances at least similar to those of the regular RF algorithm (a minimal sketch of this procedure is given after this list).
 \end{itemize}
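+%
+As an illustration of this clustering-based pruning, here is a minimal sketch with scikit-learn (the toy data, the use of $k$-means and the ``closest to the centroid'' representativeness criterion are our assumptions, since the summary above does not fix them):
+\begin{verbatim}
+import numpy as np
+from sklearn.cluster import KMeans
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
+
+X, y = make_classification(n_samples=200, random_state=0)  # toy set T
+n_trees = 100                                # initial forest size
+forest = RandomForestClassifier(n_estimators=n_trees).fit(X, y)
+
+# C[i] = vector of class labels predicted by tree t_i on T
+C = np.array([t.predict(X) for t in forest.estimators_])
+k = int(round(np.sqrt(n_trees / 2)))         # number of clusters
+km = KMeans(n_clusters=k, random_state=0).fit(C)
+
+# keep, for each cluster, the tree closest to its centroid
+dists = km.transform(C)                      # tree-to-centroid distances
+kept = [int(np.argmin(dists[:, j])) for j in range(k)]
+pruned_forest = [forest.estimators_[i] for i in kept]
+\end{verbatim}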
-
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Forest pruning}\label{sec: forest pruning}
+In this section, we describe our method for pruning the forest, and thus reducing its size. \\
+Consider a forest $F_{t_1, \dots, t_l}$ of $l = 100$ trees, trained on the training data set, which consists of 60\% of the data. For every $i \in \{ 1, \dots , l\}$, we denote the vector of predictions of the tree $t_i$ on all the $n$ data points by:
+$$\textbf{y}_i =  \begin{pmatrix}
+   t_i(\textbf{x}_1) \\
+   \vdots \\
+   t_i(\textbf{x}_n)
+\end{pmatrix},$$
+and the matrix of the predictions of the whole forest on all the data by:
+$$Y =  [\textbf{y}_1 , \dots , \textbf{y}_l ] \in \mathbb{R}^{n \times l}.$$
+We apply the OMP algorithm to the matrix $Y$ and to the true labels vector $\textbf{y}$: we thus look for the $k$ trees that are the most informative for predicting the true labels.
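+%
+As a minimal sketch of this pruning step, assuming scikit-learn's standard \texttt{RandomForestRegressor} and \texttt{OrthogonalMatchingPursuit} estimators (the toy data and hyperparameters are illustrative, not our experimental setup):
+\begin{verbatim}
+import numpy as np
+from sklearn.datasets import make_regression
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import OrthogonalMatchingPursuit
+
+# toy stand-in for the 60% training split described above
+X_train, y_train = make_regression(n_samples=600, n_features=10,
+                                   random_state=0)
+
+l, k = 100, 10                    # forest size and desired pruned size
+forest = RandomForestRegressor(n_estimators=l).fit(X_train, y_train)
+
+# Y[:, i] holds y_i, the predictions of tree t_i on all n data points
+Y = np.column_stack([t.predict(X_train) for t in forest.estimators_])
+
+# find a k-sparse w such that Y @ w is close to the true labels y
+omp = OrthogonalMatchingPursuit(n_nonzero_coefs=k).fit(Y, y_train)
+kept_trees = np.flatnonzero(omp.coef_)  # indices of the k selected trees
+\end{verbatim}
+The nonzero coefficients of $\textbf{w}$ also give a weight to each selected tree, which can be used to combine their predictions.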
  
  \section{References}
 
-- 
GitLab