Hi everyone,
This is a somewhat niche question, but I really don’t get it.
When I fit a Tweedie GLM, I can get the prediction from the link by doing exp(link). But to get the prediction for a Tweedie GBM (xgboost with objective = 'reg:tweedie'), where the link is the sum of the leaf values of the trees, I have to do exp(link)/2. I don’t understand why I need to divide by 2.
A minimal reproducible example is below, inspired by the Tweedie regression demo at https://github.com/dmlc/xgboost/blob/master/R-package/demo/tweedie_regression.R
cross-posted to stackoverflow: https://stackoverflow.com/questions/61945750/xgboost-tweedie-why-is-the-formula-to-get-the-prediction-from-the-link-expli
> library(xgboost)
> library(data.table)
> library(cplm) # for insurance data
> library(statmod) # for tweedie glm
> data(AutoClaim)
>
> # auto insurance dataset analyzed by Yip and Yau (2005)
> dt <- data.table(AutoClaim)
>
> # exclude these columns from the model matrix
> exclude <- c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY')
>
> # retains the missing values
> # NOTE: this dataset comes ready out of the box
> options(na.action = 'na.pass')
> x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = F])
> options(na.action = 'na.omit')
>
> # response
> y <- dt[, CLM_AMT5]
>
> d_train <- xgb.DMatrix(data = x, label = y, missing = NA)
>
> # the tweedie_variance_power parameter determines the shape of
> # distribution
> # - closer to 1 is more poisson like and the mass
> # is more concentrated near zero
> # - closer to 2 is more gamma like and the mass spreads to the
> # right with less concentration near zero
>
> params <- list(
> objective = 'reg:tweedie',
> eval_metric = 'rmse',
> tweedie_variance_power = 1.4,
> max_depth = 2,
> eta = 1)
> set.seed(42)
> bst <- xgb.train(
> data = d_train,
> params = params,
> maximize = FALSE,
> watchlist = list(train = d_train),
> nrounds = 3)
>
>
> xgb.plot.tree(model = bst)
> ```
>
> # Manually extract the values for the first record :
> x[1,]
>
> # TRAVTIME < 102, BLUEBOOK < 61645 --> tree #1 value = 2.49922585
> # REVOLKEDYes < -9.53674316e-07, NPOLICY < 5.5 --> tree #2 value = 2.48586464
> # REVOLKEDYes < -9.53674316e-07, AREAUrban > -9.53674316e-07 --> tree #3 value = 2.36028123
>
> link_gbm <- 2.49922585 +2.48586464+ 2.36028123
> link_gbm # 7.345372
>
> # Take exp(link_gbm), divide by 2
> exp(link_gbm ) / 2 # 774.5053
>
> # Compare with getting prediction directly from GBM.
>
>
> predict(bst, d_train)[1] # 774.5053
>
>
> # Let's do the same with a GLM:
> dt2 <- dt[, -exclude, with = F]
> dt2$CLM_AMT5 <- dt$CLM_AMT5
>
> tweedie_fit <-
> glm(CLM_AMT5 ~ .,
> family=tweedie(var.power=1.4, link.power=0),
> data = dt2)
>
> summary(tweedie_fit)
> # Manually get the link value for the first record
>
> dt2[1,]
> link_glm <- tweedie_fit$coefficients["(Intercept)"] +
> 14 * tweedie_fit$coefficients["TRAVTIME"] +
> 14230 * tweedie_fit$coefficients["BLUEBOOK"] +
> 11 * tweedie_fit$coefficients["RETAINED"] +
> 1 * tweedie_fit$coefficients["NPOLICY"] +
> 1 * tweedie_fit$coefficients["CAR_TYPESedan"] +
> 1 * tweedie_fit$coefficients["RED_CARyes"] +
> 3 * tweedie_fit$coefficients["MVR_PTS"] +
> 60 * tweedie_fit$coefficients["AGE"] +
> 11 * tweedie_fit$coefficients["YOJ"] +
> 67349 * tweedie_fit$coefficients["INCOME"] +
> 1 * tweedie_fit$coefficients["GENDERM"] +
> 1 * tweedie_fit$coefficients["JOBCLASSProfessional"] +
> 1 * tweedie_fit$coefficients["MAX_EDUCPhD"] +
> 18 * tweedie_fit$coefficients["SAMEHOME"] +
> 1 * tweedie_fit$coefficients["AREAUrban"]
>
> link_glm # 8.299899
>
> # prediction is exp(link_glm)
>
> exp(link_glm) # 4023.466
>
> # compare with link and prediction from glm ... yes, it's identical
>
> predict(tweedie_fit, type="link")[1]
>
> predict(tweedie_fit, type="response")[1] # 4023.466
> ```