\documentclass{chapman}
%%% copy Sweave.sty definitions
%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave} 
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
                                              fontshape=it,
                                              fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}
%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
    \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
                                                  fontshape=it,
                                                  fontsize=\small}
    \rawSinput
}
%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
  \renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
  \end{figure} }
  \DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
  \DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline, 
                                                samepage = true,
                                                fontfamily=courier,
                                                fontshape=it,
                                                fontsize=\relsize{-1}}
}
%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%%  numbers=left
}
\newcommand{\numberSinput}{
    \DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
    \DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}
%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}}
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}
%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}} 
\newcommand{\stress}[1]{\textit{#1}} 
\newcommand{\booktitle}[1]{\textit{#1}} %%'
%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
\newcommand{\E}{\mathsf{E}}   
\newcommand{\Var}{\mathsf{Var}}   
\newcommand{\Cov}{\mathsf{Cov}}   
\newcommand{\Cor}{\mathsf{Cor}}   
\newcommand{\x}{\mathbf{x}}   
\newcommand{\y}{\mathbf{y}}   
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}   
\newcommand{\C}{\mathbf{C}}   
\renewcommand{\H}{\mathbf{H}}   
\newcommand{\X}{\mathbf{X}}   
\newcommand{\B}{\mathbf{B}}   
\newcommand{\V}{\mathbf{V}}   
\newcommand{\I}{\mathbf{I}}   
\newcommand{\D}{\mathbf{D}}   
\newcommand{\bS}{\mathbf{S}}   
\newcommand{\N}{\mathcal{N}}   
\renewcommand{\P}{\mathsf{P}}   
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
%%% links
\usepackage{hyperref}
\hypersetup{%
  pdftitle = {A Handbook of Statistical Analyses Using R},
  pdfsubject = {Book},
  pdfauthor = {Brian S. Everitt and Torsten Hothorn},
  colorlinks = {black},
  linkcolor = {black},
  citecolor = {black},
  urlcolor = {black},
  hyperindex = {true},
  linktocpage = {true},
}
%%% captions & tables
%% : conflics with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%% 
\usepackage{longtable}
\usepackage[figuresright]{rotating}
%%% R symbol in chapter 1
\usepackage{wrapfig}
%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse
%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}
%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\item{\stepcounter{exercise} Ex.
                       \arabic{chapter}.\arabic{exercise} }}
%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}
%%% for manual corrections
%\renewcommand{\baselinestretch}{2}
%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}
%%% color
\usepackage{color}
%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}
%%% new bidirectional quotes need 
\usepackage[utf8]{inputenc}
\begin{document}
%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 2nd Edition}
\author{Brian S. Everitt and Torsten Hothorn}
\maketitle
%%\VignetteIndexEntry{Chapter Recursive Partitioning}
%%\VignetteDepends{vcd,lattice,randomForest,party,partykit,TH.data}
\setcounter{chapter}{8}
\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE} 
<>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
                "package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+  ",
    width = 63, # digits = 4, 
    show.signif.stars = FALSE,
    SweaveHooks = list(leftpar = function() 
        par(mai = par("mai") * c(1, 1.05, 1, 1)),
        bigleftpar = function()
        par(mai = par("mai") * c(1, 1.7, 1, 1))))
HSAURpkg <- require("HSAUR2")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR2"))
rm(HSAURpkg)
 ###  hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
 ### 
book <- TRUE
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM", 
                "DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA", 
                "MDS", "CA"), 1:18)
ch <- function(x) {
    ch <- refs[which(refs[,1] == x),]
    if (book) {
        return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
    } else {
        return(paste("Chapter~", ch[2], sep = ""))
    }
}
if (file.exists("deparse.R"))
    source("deparse.R")
setHook(packageEvent("lattice", "attach"), function(...) {
    lattice.options(default.theme = 
        function()
            standard.theme("pdf", color = FALSE))
    })
@
\pagestyle{headings}
<>=
book <- FALSE
@
<>=
library("vcd")
library("lattice")
library("randomForest")
library("party")
library("partykit")
ltheme <- canonical.theme(color = FALSE)     ## in-built B&W theme  
ltheme$strip.background$col <- "transparent" ## change strip bg  
lattice.options(default.theme = ltheme) 
mai <- par("mai")
options(SweaveHooks = list(nullmai = function() { par(mai = rep(0, 4)) },
                           twomai = function() { par(mai = c(0, mai[2], 0, 0)) },
                           threemai = function() { par(mai = c(0, mai[2], 0.1, 0)) }))
numbers <- c("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine")
@
\chapter[Recursive Partitioning]{Recursive Partitioning: 
                                 Predicting Body Fat and Glaucoma Diagnosis \label{RP}}
\section{Introduction}
\section{Recursive Partitioning}
\section{Analysis Using \R{}}
\subsection{Predicting Body Fat Content}
The \Rcmd{rpart} function from \Rpackage{rpart} can be used to grow a
regression tree. The response variable and the covariates are defined by a
model formula in the same way as for \Rcmd{lm}, say. By default, a large
initial tree is grown, we restrict the number of observations required to
establish a potential binary split to at least ten:
<>=
library("rpart")
data("bodyfat", package = "TH.data")
bodyfat_rpart <- rpart(DEXfat ~ age + waistcirc + hipcirc + 
    elbowbreadth + kneebreadth, data = bodyfat, 
    control = rpart.control(minsplit = 10))
@
A \Rcmd{print} method for \Rclass{rpart} objects is available; however, a
graphical representation \citep[here utilising functionality offered 
from package \Rpackage{partykit},][]{PKG:partykit} shown in Figure~\ref{RP-bodyfat-plot} is more
convenient. Observations that satisfy the condition shown for each node 
go to the left and
observations that don't are element of the right branch in each node. %'
As expected, higher values for waist- and hip circumferences and wider knees
correspond to higher values of body fat content. The rightmost terminal node
consists of only three rather extreme observations.
\begin{figure}
\begin{center}
<>=
library("partykit")
plot(as.party(bodyfat_rpart), tp_args = list(id = FALSE))
@
\caption{Initial tree for the body fat data with the distribution
         of body fat in terminal nodes visualised via boxplots. \label{RP-bodyfat-plot}}
\end{center}
\end{figure}
\index{Cross-validation}
To determine if the tree is appropriate or if some of the branches need to 
be subjected to pruning we can use the \Robject{cptable}
element of the \Rclass{rpart} object:
<>=
print(bodyfat_rpart$cptable)
opt <- which.min(bodyfat_rpart$cptable[,"xerror"])
@
The \Robject{xerror} column contains of estimates of cross-validated prediction error
for different numbers of splits (\Robject{nsplit}). The best tree has
\Sexpr{numbers[bodyfat_rpart$cptable[opt, "nsplit"] + 1]}
splits. Now we can prune back the large initial tree using
<>=
cp <- bodyfat_rpart$cptable[opt, "CP"]
bodyfat_prune <- prune(bodyfat_rpart, cp = cp)
@
The result is shown in Figure~\ref{RP-bodyfat-pruneplot}. Note that
the inner nodes three and six have been removed from the tree. Still, the
rightmost terminal node might give very unreliable extreme predictions.
\begin{figure}
\begin{center}
<>=
plot(as.party(bodyfat_prune), tp_args = list(id = FALSE))
@
\caption{Pruned regression tree for body fat data.
         \label{RP-bodyfat-pruneplot}}
\end{center}
\end{figure}
Given this model, one can predict the (unknown, in real circumstances) body fat
content based on the covariate measurements. Here, using the known values of
the response variable, we compare the model predictions with the actually measured
body fat as shown in Figure~\ref{RP-bodyfat-predict}. The three observations
with large body fat measurements in the rightmost terminal node can be identified
easily.
\begin{figure}
\begin{center}
<>=
DEXfat_pred <- predict(bodyfat_prune, newdata = bodyfat)
xlim <- range(bodyfat$DEXfat)
plot(DEXfat_pred ~ DEXfat, data = bodyfat, xlab = "Observed", 
     ylab = "Predicted", ylim = xlim, xlim = xlim)
abline(a = 0, b = 1)
@
\caption{Observed and predicted DXA measurements.
         \label{RP-bodyfat-predict}}
\end{center}
\end{figure}
\subsection{Glaucoma Diagnosis}
<>=
set.seed(290875)
@
<>=
data("GlaucomaM", package = "TH.data")
glaucoma_rpart <- rpart(Class ~ ., data = GlaucomaM, 
    control = rpart.control(xval = 100))
glaucoma_rpart$cptable
opt <- which.min(glaucoma_rpart$cptable[,"xerror"])
cp <- glaucoma_rpart$cptable[opt, "CP"]
glaucoma_prune <- prune(glaucoma_rpart, cp = cp)
@
\setkeys{Gin}{width = 0.7\textwidth}
\begin{figure}
\begin{center}
<>=
plot(as.party(glaucoma_prune), tp_args = list(id = FALSE))
@
\caption{Pruned classification tree of the glaucoma data with class
         distribution in the leaves. \label{RP:gl}}
\end{center}
\end{figure}
\setkeys{Gin}{width=0.95\textwidth}
\index{Classification tree!choice of tree size}
\index{Tree size}
As we discussed earlier, the choice of the appropriatly sized tree is not a
trivial problem. For the glaucoma data, the above choice of three leaves is
very unstable across multiple runs of cross-validation. As an illustration
of this problem we repeat the very same analysis as shown above and record
the optimal number of splits as suggested by the cross-validation runs.
<>=
nsplitopt <- vector(mode = "integer", length = 25)
for (i in 1:length(nsplitopt)) {
    cp <- rpart(Class ~ ., data = GlaucomaM)$cptable
    nsplitopt[i] <- cp[which.min(cp[,"xerror"]), "nsplit"]
}
table(nsplitopt)
@
Although for \Sexpr{sum(nsplitopt == 1)} runs of cross-validation a simple
tree with one split only is suggested, larger trees would have been favoured
in \Sexpr{sum(nsplitopt > 1)} of the cases. This short analysis shows that
we should not trust the tree in Figure~\ref{RP:gl} too much.
\index{Bagging}
\index{Bootstrap approach!glaucoma diagnosis data}
One way out of this dilemma is the aggregation of multiple trees via bagging. 
In \R{}, the bagging idea can be implemented by three or four lines
of code. Case count or weight vectors representing the bootstrap samples can be drawn from the
multinominal distribution with parameters $n$ and $p_1 = 1/n, \dots, p_n =
1/n$ via the \Rcmd{rmultinom} function. For each weight vector, one large tree is
constructed without pruning and the \Rclass{rpart} objects are stored in a
list, here called \Robject{trees}:
<>=
trees <- vector(mode = "list", length = 25)
n <- nrow(GlaucomaM)
bootsamples <- rmultinom(length(trees), n, rep(1, n)/n)
mod <- rpart(Class ~ ., data = GlaucomaM, 
             control = rpart.control(xval = 0))
for (i in 1:length(trees))
    trees[[i]] <- update(mod, weights = bootsamples[,i])
@
The \Rcmd{update} function re-evaluates the call of \Robject{mod}, however,
with the weights being altered, i.e., fits a tree to a bootstrap sample
specified by the weights.
It is interesting to have a look at the structures of the multiple trees.
For example, the variable selected for splitting in the root of the tree is
not unique as can be seen by
<>=
table(sapply(trees, function(x) as.character(x$frame$var[1])))
@
Although \Robject{varg} is selected most of the time, other variables such
as \Robject{vari} occur as well -- a further indication that the tree in
Figure~\ref{RP:gl} is
questionable and that hard decisions are not appropriate for the glaucoma
data.
In order to make use of the ensemble of trees in the list \Robject{trees}
we estimate the conditional probability of suffering from glaucoma given the
covariates for each observation in the original data set by
<>=
classprob <- matrix(0, nrow = n, ncol = length(trees))
for (i in 1:length(trees)) {
    classprob[,i] <- predict(trees[[i]], 
                             newdata = GlaucomaM)[,1]
    classprob[bootsamples[,i] > 0,i] <- NA
}
@
Thus, for each observation we get \Sexpr{length(trees)} estimates. However,
each observation has been used for growing one of the trees with
probability $0.632$ and thus was not used with probability $0.368$.
Consequently, the estimate from a tree where an observation was not used for
growing is better for judging the quality of the predictions and we label
the other estimates with \Robject{NA}.
Now, we can average the estimates and we vote for glaucoma when the average
of the estimates of the conditional glaucoma probability exceeds $0.5$. The
comparison between the observed and the predicted classes does not suffer from
overfitting since the predictions are computed from those trees for which 
each single observation was \stress{not} used for growing.
<>=
avg <- rowMeans(classprob, na.rm = TRUE)
predictions <- factor(ifelse(avg > 0.5, "glaucoma", 
                                        "normal"))
predtab <- table(predictions, GlaucomaM$Class)
predtab
@
Thus, an honest estimate of the probability of
a glaucoma prediction when the patient is actually suffering from glaucoma is
<>=
round(predtab[1,1] / colSums(predtab)[1] * 100)
@
per cent. For 
<>=
round(predtab[2,2] / colSums(predtab)[2] * 100)
@
per cent of normal eyes, the ensemble does not predict a glaucomateous
damage.
\begin{figure}
\begin{center}
<>=
library("lattice")
gdata <- data.frame(avg = rep(avg, 2), 
    class = rep(as.numeric(GlaucomaM$Class), 2),
    obs = c(GlaucomaM[["varg"]], GlaucomaM[["vari"]]),
    var = factor(c(rep("varg", nrow(GlaucomaM)), 
                   rep("vari", nrow(GlaucomaM)))))
panelf <- function(x, y) {
           panel.xyplot(x, y, pch = gdata$class)
           panel.abline(h = 0.5, lty = 2)
       }
print(xyplot(avg ~ obs | var, data = gdata, 
       panel = panelf,
       scales = "free", xlab = "", 
       ylab = "Estimated Class Probability Glaucoma"))
@
\caption{Estimated class probabilities depending on two important
         variables. The $0.5$ cut-off for the estimated glaucoma probability
         is depicted as a horizontal line. Glaucomateous eyes are plotted as
         circles and normal eyes are triangles. \label{RP:glplot}}
\end{center}
\end{figure}
\index{Random forest}
The bagging procedure is a special case of a more general approach
called \stress{random forest} \citep{HSAUR:Breiman2001b}. The package
\Rpackage{randomForest} \citep{PKG:randomForest}
can be used to compute such ensembles via
<>=
library("randomForest")
rf <- randomForest(Class ~ ., data = GlaucomaM)
@
and we obtain out-of-bag estimates for the prediction error via
<>=
table(predict(rf), GlaucomaM$Class)
@
\subsection{Trees Revisited}
<>=
detach("package:partykit")
@
For the body fat data, such a \stress{conditional inference tree} 
can be computed using
the \Rcmd{ctree} function
\index{Conditional tree}
<>=
library("party")
bodyfat_ctree <- ctree(DEXfat ~ age + waistcirc + hipcirc +
    elbowbreadth + kneebreadth, data = bodyfat)
@
This tree doesn't require a pruning procedure because
an internal stop criterion based on formal statistical
tests prevents the procedure from overfitting the data. 
The tree structure is shown in Figure~\ref{RP-bodyfat-ctree-plot}. Although
the structure of this tree and the tree depicted in Figure~\ref{RP-bodyfat-pruneplot}
are rather different, the corresponding predictions don't vary too much.
\begin{figure}
\begin{center}
<>=
plot(bodyfat_ctree)
@
\caption{Conditional inference tree with the distribution 
         of body fat content shown for each terminal leaf.
         \label{RP-bodyfat-ctree-plot}}
\end{center}
\end{figure}
Very much the same code is needed to grow a tree on the glaucoma data:
<>=
glaucoma_ctree <- ctree(Class ~ ., data = GlaucomaM)
@
and a graphical representation is depicted in Figure~\ref{RP-glaucoma-ctree-plot} showing
both the cutpoints and the $p$-values of the associated independence tests for each node. 
The first split is performed using a cutpoint defined with respect to the
volume of the optic nerve above some reference plane, but in the inferior part of
the eye only (\Robject{vari}).
\begin{figure}
\begin{center}
<>=
plot(glaucoma_ctree)
@
\caption{Conditional inference tree with the distribution 
         of glaucomateous eyes shown for each terminal leaf.
         \label{RP-glaucoma-ctree-plot}}
\end{center}
\end{figure}
\bibliographystyle{LaTeXBibTeX/refstyle}
\bibliography{LaTeXBibTeX/HSAUR}   
\end{document}