# Reset the session and point R at the folder that holds the lab data.
# NOTE(review): rm(list = ls()) and a hard-coded setwd() tie the script to
# one machine and one session; consider a project-relative path instead.
rm(list = ls())
setwd("~/Dropbox/SMDA/Parte B/labs/data")

# Import the data set; the first row of the CSV holds the column names.
db <- read.csv("medpar.csv", header = TRUE)
str(db)

# Recode the four grouping variables as factors in a single pass.
db[c("hmo", "white", "age80", "type")] <-
  lapply(db[c("hmo", "white", "age80", "type")], as.factor)
# Descriptive analysis ----
# The formulas use `data = db` rather than `db$...` terms, so axis labels and
# test headers read "los by hmo" instead of "db$los by db$hmo"; the computed
# statistics are the same.

# Overall summary of every column and the distribution of the response (los).
summary(db)
hist(db$los, main = "Histogram of los", xlab = "los")

# los by hmo: boxplot and two-sample Wilcoxon rank-sum test.
boxplot(los ~ hmo, data = db)
wilcox.test(los ~ hmo, data = db) # p=.09 > 5%

# los by white.
boxplot(los ~ white, data = db)
wilcox.test(los ~ white, data = db) # p=.04 < 5%

# los by age80.
boxplot(los ~ age80, data = db)
wilcox.test(los ~ age80, data = db) # p=.6 > 5% - NS

# los by type: Kruskal-Wallis test across the levels of type.
boxplot(los ~ type, data = db)
kruskal.test(los ~ type, data = db) # p<0.001 < 5%

# Poisson regression model
# los_i ~ Poisson(mu_i)
# g(mu_i) = eta_i = B0+B1*HMO+B2*WHITE+B3*AGE80+B4*TYPE_2+B5*TYPE_3
#
# The covariates are written out instead of `los ~ .` so that the fitted
# model is exactly the one stated above, even if the CSV carries additional
# columns (e.g. a row index); these are the same four predictors used by
# every later model in this script.
mod_full <- glm(
  los ~ hmo + white + age80 + type,
  data = db, family = "poisson"
)
summary(mod_full)
library(sjPlot)
tab_model(mod_full)

# Backward elimination starting from the main-effects model.
step(mod_full, direction = "backward")
# the model is good, all the variables are significant
# and we keep everything

# Forward search over two-way interactions: add1() tests every interaction
# not yet in the model, then one interaction is added and the search is
# repeated (presumably the most significant one from the preceding table).
add1(mod_full, ~ .^2, test = "Chisq")
mod_full2 <- glm(
  los ~ hmo + white * age80 + type,
  data = db, family = "poisson"
)
add1(mod_full2, ~ .^2, test = "Chisq")
mod_full3 <- glm(
  los ~ hmo + white * age80 + white * type,
  data = db, family = "poisson"
)
add1(mod_full3, ~ .^2, test = "Chisq")
mod_full4 <- glm(
  los ~ hmo * type + white * age80 + white * type,
  data = db, family = "poisson"
)
add1(mod_full4, ~ .^2, test = "Chisq")

# Summary table (sjPlot) for the Poisson model with the three interactions.
tab_model(mod_full4)

library(ggeffects)
library(ggplot2)

# Predicted values from mod_full4, one plot per interaction in the model:
# hmo x type, white x age80 and white x type.
plot(ggpredict(mod_full4, terms = c("hmo", "type")))
plot(ggpredict(mod_full4, terms = c("age80", "white")))
plot(ggpredict(mod_full4, terms = c("white", "type")))
### DIAGNOSTIC
# Collinearity among the predictors: variance inflation factors computed on
# the main-effects model mod_full.
library(car)
car::vif(mod_full) # the model is good

# Default diagnostic plots for mod_full4 laid out on a 2 x 2 grid; the
# plotting layout is set back to a single panel afterwards.
par(mfrow = c(2, 2))
plot(mod_full4)
par(mfrow = c(1, 1))
# Observations 1452 and 1466 are set aside and the analysis is repeated
# without them.

# Same specification as mod_full4 (formula, family and data), refitted with
# those two rows of db excluded.
mod_full4b <- update(mod_full4, subset = -c(1452, 1466))

# Tests for the model terms, then the same 2 x 2 panel of diagnostic plots.
car::Anova(mod_full4b)
par(mfrow = c(2, 2))
plot(mod_full4b)
par(mfrow = c(1, 1))

# Allow for overdispersion: refit the specification of mod_full4 with the
# quasipoisson family instead of poisson.
mod_full4c <- update(mod_full4, family = "quasipoisson")

car::Anova(mod_full4c)

# Reduced model: the hmo:type and white:type interactions are removed.
mod_full4d <- glm(
  los ~ hmo + type + white * age80,
  family = "quasipoisson", data = db
)
car::Anova(mod_full4d)

# Further reduction: hmo is removed as well.
mod_full4e <- glm(
  los ~ type + white * age80,
  family = "quasipoisson", data = db
)
car::Anova(mod_full4e)

# Diagnostic plots for the reduced model on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(mod_full4e)
par(mfrow = c(1, 1))

# Predicted values from the reduced model for the age80 x white interaction.
plot(ggpredict(mod_full4e, terms = c("age80", "white")))
