From 03e513c1e8eed84ec53dccb48417d4db49fc38cb Mon Sep 17 00:00:00 2001 From: "farah.cherfaoui" <farah.cherfaoui.lis-lab.fr> Date: Sun, 6 Oct 2019 18:10:33 +0200 Subject: [PATCH] add description of a 2eme paper --- reports/bolsonaro.tex | 12 +++++++++++- reports/bolsonaro_biblio.bib | 9 +++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/reports/bolsonaro.tex b/reports/bolsonaro.tex index 9ea3d5b..04b5827 100644 --- a/reports/bolsonaro.tex +++ b/reports/bolsonaro.tex @@ -64,7 +64,17 @@ They construct a random forest model of size 100, then prune it with their Algor On all the data sets except colon and diabetes data sets, the more the number of trees pruned, the better the performance. They does not show the variance of the models. They also compare their method with similarity based pruning ( Sim-P) and distance minimization(MarDistM) . Except for diabetes, their method outperforms the other two algorithms. % -\item \cite{Ren2015}: coming soon :-) +\item \cite{Zhang}: This paper presents 3 measures to determine the importance of a tree in a forest. Trees with less importance will be removed from the forest. +\begin{itemize} +\item $measure_1$ focuses on the prediction. A tree can be removed if its removal from the forest has the smallest impact on the prediction accuracy. Let $F = (t_1, \dots, t_n)$ be a forest. For every tree $t_i$, we calculate the importance score $\Delta_{F \backslash t_i}$ which is the difference between the prediction accuracy of $F$ and $F \backslash t_i$: +$$ \Delta_{F \backslash t_i} = predictAccuracy(F) - predictAccuracy(F \backslash t_i) $$ +The tree that will be removed is $t = argmin_{t \in F} ( \Delta_{F \backslash t})$. +\item $measure_2$ will try to remove a tree if it is similar to other trees in the forest. 
The measure of similarity between the tree $t_i$ and the forest is denoted by: +$$\rho_{t_i} = \frac{1}{|F|} \sum_{t \in F; \ t \neq t_i} cor_{t_i, t}$$ +where: $cor_{t_i, t_j} = correlation(predict_{t_i}, predict_{t_j} ) $ is the correlation between the prediction of the tree $t_i$ and the tree $t_j$. In this method, the deleted tree is the one that minimizes $\rho$. +\item $measure_3$ +\end{itemize} +For the experiments, they use breast cancer prognosis data. They reduce the size of a forest of 100 trees to a forest of, on average, 26 trees while keeping the same error rate. \end{itemize} diff --git a/reports/bolsonaro_biblio.bib b/reports/bolsonaro_biblio.bib index 5e661c8..b4e1683 100644 --- a/reports/bolsonaro_biblio.bib +++ b/reports/bolsonaro_biblio.bib @@ -8,10 +8,11 @@ publisher={Elsevier} } -@inproceedings{Ren2015, - title={Global refinement of random forest}, - author={Ren, Shaoqing and Cao, Xudong and Wei, Yichen and Sun, Jian}, - booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, - pages={723--730}, - year={2015} +@article{Zhang, + title={Search for the smallest random forest}, + author={Zhang, Heping and Wang, Minghui}, + journal={Statistics and Its Interface}, + volume={2}, + pages={381--388}, + year={2009} } \ No newline at end of file -- GitLab