EIX: Titanic data

Ewelina Karbowiak

2018-03-29

Data Info

This vignette shows how to use the EIX package on the Titanic data. The dataset was copied from the stablelearner package. With the EIX package we explain an XGBoost classification model for the survival problem. More details about the EIX package here.

# Install EIX from GitHub if it is not available yet:
# devtools::install_github("ModelOriented/EIX")
library("EIX")
library("data.table")

# Fix the random number generator seed.
set.seed(4)

# Drop rows containing missing values and store the result as a data.table.
titanic_data <- data.table(na.omit(titanic_data))

# Preview the first rows as a formatted table.
knitr::kable(head(titanic_data))
gender age class embarked country fare sibsp parch survived
male 42 3rd Southampton United States 7.11 0 0 no
male 13 3rd Southampton United States 20.05 0 2 no
male 16 3rd Southampton United States 20.05 1 1 no
female 39 3rd Southampton England 20.05 1 1 yes
female 16 3rd Southampton Norway 7.13 0 0 yes
male 25 3rd Southampton United States 7.13 0 0 yes
library("Matrix")

# Build a sparse design matrix from every column except the response
# `survived`; the `- 1` term removes the intercept from the formula.
sparse_matrix <- sparse.model.matrix(
  survived ~ . - 1,
  data = titanic_data
)

XGBoost model creation

library("xgboost")

# Fit the classifier on the sparse design matrix with `survived` as the
# target: logistic objective, trees of depth 2, 50 boosting rounds, and
# `verbosity = 0` for the training log.
xgb_model <- xgboost(
  sparse_matrix,
  titanic_data[, "survived"],
  objective = "binary:logistic",
  max_depth = 2,
  nrounds = 50,
  verbosity = 0
)

Model visualization

First let’s plot the model.

# Prepare the lollipop representation of the fitted model.
lolli <- lollipop(xgb_model, sparse_matrix)

# Draw it; `threshold` presumably controls which nodes get labelled --
# see the EIX documentation for plot.lollipop to confirm.
plot(lolli, threshold = 0.02)

Interactions

Next we explore interactions using the interactions() function and its plot method.

# Table of parent/child feature pairs found in the model's trees.
# Note: the result is stored under the same name as the function.
interactions <- interactions(
  xgb_model,
  sparse_matrix,
  option = "interactions"
)

# Show the first 15 rows.
head(interactions, 15)
##                 Parent               Child   sumGain frequency
##                 <char>              <char>     <num>     <int>
##  1:           class3rd                fare 48.020927         5
##  2:              sibsp                 age 31.215267         5
##  3:           class3rd                 age 30.158607         6
##  4:       genderfemale                 age 15.022622         3
##  5:       genderfemale                fare 10.176903         3
##  6:     classdeck crew        genderfemale  8.730529         1
##  7:                age                fare  8.694904         3
##  8: countrySwitzerland                fare  6.475769         1
##  9:               fare               sibsp  5.967306         2
## 10:       genderfemale embarkedSouthampton  5.925262         1
## 11:       genderfemale            class2nd  4.943701         1
## 12:                age embarkedSouthampton  4.549278         1
## 13:           class3rd        genderfemale  4.512232         1
## 14:     classdeck crew                 age  4.182668         1
## 15:                age            class2nd  3.924453         2
# Plot the `interactions` table created by the interactions() call.
plot(interactions)

Variables’ and interactions’ importance

# Importance measures for single variables and for interaction pairs
# (`option = "both"`). The result is stored under the function's own name.
importance <- importance(
  xgb_model,
  sparse_matrix,
  option = "both"
)

# Show the first 15 rows.
head(importance, 15)
##                         Feature sumGain sumCover meanGain meanCover frequency
##                          <char>   <num>    <num>    <num>     <num>     <num>
##  1:                genderfemale 863.200   3773.0   78.470     343.0        11
##  2:                    class3rd 196.600   2517.0   17.870     228.9        11
##  3:              classdeck crew 118.100   1993.0   19.690     332.2         6
##  4:                         age 105.600   4297.0    5.031     204.6        21
##  5:                        fare  56.480   3373.0    3.530     210.8        16
##  6:           embarkedCherbourg  51.550   1058.0   17.180     352.6         3
##  7:               class3rd:fare  48.020    691.2    9.604     138.2         5
##  8:       classrestaurant staff  34.100   1861.0    5.684     310.2         6
##  9:                   sibsp:age  31.220   1001.0    6.243     200.2         5
## 10:                class3rd:age  30.160   1087.0    5.026     181.2         6
## 11:        countryUnited States  28.140    499.9   14.070     249.9         2
## 12:                       sibsp  17.860   1330.0    4.465     332.6         4
## 13:            genderfemale:age  15.020    581.0    5.008     193.7         3
## 14:           genderfemale:fare  10.180    386.8    3.392     128.9         3
## 15: classdeck crew:genderfemale   8.731    335.9    8.731     335.9         1
##     mean5Gain
##         <num>
##  1:   167.100
##  2:    34.420
##  3:    23.400
##  4:    12.950
##  5:     7.115
##  6:    17.180
##  7:     9.604
##  8:     6.470
##  9:     6.243
## 10:     5.592
## 11:    14.070
## 12:     4.465
## 13:     5.008
## 14:     3.392
## 15:     8.731
# Importance plot with the radar layout switched off.
plot(importance, radar = FALSE)

# Importance plot with the default settings.
plot(importance)
## Ignoring unknown labels:
## • fill : "Measures"

Explanation of the single prediction including interactions

Let’s see an explanation of the prediction for an 18-year-old from England who traveled in 3rd class.

# Row 27 of the raw data and the same row of the sparse design matrix.
data <- titanic_data[27, ]
new_observation <- sparse_matrix[27, ]

# Break the model's prediction for that row into contributions of single
# variables and of interactions, then print the resulting table.
wf <- waterfall(
  xgb_model,
  new_observation,
  data,
  option = "interactions"
)
wf
##                                 contribution
## xgboost: intercept                    -0.030
## xgboost: gender = 2                    1.634
## xgboost: class = 3                    -1.381
## xgboost: country = 15                 -1.277
## xgboost: embarked = 4                  0.449
## xgboost: country:fare = 15:8.06       -0.420
## xgboost: age = 18                     -0.177
## xgboost: embarked:fare = 4:8.06        0.126
## xgboost: class:fare = 3:8.06           0.101
## xgboost: age:country = 18:15           0.086
## xgboost: age:sibsp = 18:0             -0.080
## xgboost: age:fare = 18:8.06            0.068
## xgboost: gender:embarked = 2:4        -0.061
## xgboost: class:age = 3:18              0.034
## xgboost: gender:fare = 2:8.06         -0.030
## xgboost: age:class = 18:3              0.030
## xgboost: sibsp:age = 0:18             -0.028
## xgboost: age:embarked = 18:4          -0.025
## xgboost: sibsp = 0                     0.021
## xgboost: gender:age = 2:18             0.020
## xgboost: fare:age = 8.06:18            0.018
## xgboost: sibsp:fare = 0:8.06          -0.013
## xgboost: fare = 8.06                   0.010
## xgboost: prediction                   -0.926
# Plot the waterfall breakdown stored in `wf`.
plot(wf)
## `height` was translated to `width`.