Tweedie regression prediction from link value

simonc · May 22, 2020, 1:11am

Hi everyone,
This is a somewhat niche question, but I really don’t get it.

When I run a Tweedie GLM, I can get the prediction from the link by doing exp(link). To get the prediction for a Tweedie GBM (xgboost), I have to get it from the link by doing exp(link)/2. I don’t understand why I need to divide by 2.

Minimal reproducible example below, inspired from the tweedie regression demo at https://github.com/dmlc/xgboost/blob/master/R-package/demo/tweedie_regression.R

cross-posted to stackoverflow: https://stackoverflow.com/questions/61945750/xgboost-tweedie-why-is-the-formula-to-get-the-prediction-from-the-link-expli

> library(xgboost)
> library(data.table)
> library(cplm) # for insurance data
> library(statmod) # for tweedie glm
> data(AutoClaim)
> 
> # auto insurance dataset analyzed by Yip and Yau (2005)
> dt <- data.table(AutoClaim)
> 
> # exclude these columns from the model matrix
> exclude <-  c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY')
> 
> # retains the missing values
> # NOTE: this dataset comes ready out of the box
> options(na.action = 'na.pass')
> x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = F])
> options(na.action = 'na.omit')
> 
> # response
> y <- dt[, CLM_AMT5]
> 
> d_train <- xgb.DMatrix(data = x, label = y, missing = NA)
> 
> # the tweedie_variance_power parameter determines the shape of
> # the distribution
> # - closer to 1 is more poisson like and the mass
> #   is more concentrated near zero
> # - closer to 2 is more gamma like and the mass spreads to
> #   the right with less concentration near zero
> 
> params <- list(
>   objective = 'reg:tweedie',
>   eval_metric = 'rmse',
>   tweedie_variance_power = 1.4,
>   max_depth = 2,
>   eta = 1)
> set.seed(42)
> bst <- xgb.train(
>   data = d_train,
>   params = params,
>   maximize = FALSE,
>   watchlist = list(train = d_train),
>   nrounds = 3)
> 
> 
> xgb.plot.tree(model = bst)
> ```
> 
> # Manually extract the values for the first record :
> x[1,]
> 
> # travtime < 102, bluebook <61645 -->tree #1 value= 2.49922585
> # revolkedyes <  -9.53674316e-07,   npolicy < 5.5 --> tree #2  value= 2.48586464
> # REVOLKEDYes <  -9.53674316e-07, areaurban >  -9.53674316e-07 --> tree #3 value =  2.36028123
> 
> link_gbm <-  2.49922585 +2.48586464+  2.36028123
> link_gbm # 7.345372
> 
> # Take exp(link_gbm), divide by 2
> exp(link_gbm ) / 2 # 774.5053
> 
> # Compare with getting prediction directly from GBM.
> 
> 
>  predict(bst, d_train)[1] # 774.5053
> 
> 
> # Let's do the same with a GLM:  
> dt2 <-  dt[, -exclude, with = F]
> dt2$CLM_AMT5 <-  dt$CLM_AMT5
> 
> tweedie_fit <-
>   glm(CLM_AMT5 ~ .,
>       family=tweedie(var.power=1.4, link.power=0),
>       data = dt2)
> 
> summary(tweedie_fit)
> # Manually get the link value for the first record
> 
> dt2[1,]
> link_glm <- tweedie_fit$coefficients["(Intercept)"] +
>   14 * tweedie_fit$coefficients["TRAVTIME"] +
>   14230 * tweedie_fit$coefficients["BLUEBOOK"] +
>   11 * tweedie_fit$coefficients["RETAINED"]  +
>   1 * tweedie_fit$coefficients["NPOLICY"] +
>   1 * tweedie_fit$coefficients["CAR_TYPESedan"] +
>   1 * tweedie_fit$coefficients["RED_CARyes"] +
>   3 * tweedie_fit$coefficients["MVR_PTS"] +
>   60 * tweedie_fit$coefficients["AGE"] +
>   11 * tweedie_fit$coefficients["YOJ"] +
>   67349 * tweedie_fit$coefficients["INCOME"] +
>   1 * tweedie_fit$coefficients["GENDERM"] +
>   1 * tweedie_fit$coefficients["JOBCLASSProfessional"] +
>   1 * tweedie_fit$coefficients["MAX_EDUCPhD"] +
>   18 * tweedie_fit$coefficients["SAMEHOME"] +
>   1 * tweedie_fit$coefficients["AREAUrban"]
> 
> link_glm #  8.299899
> 
> # prediction is exp(link_glm)
> 
> exp(link_glm) # 4023.466
> 
> # compare with link and  prediction from glm ... yes, it's identical
> 
> predict(tweedie_fit, type="link")[1]
> 
> predict(tweedie_fit, type="response")[1] # 4023.466
> ```