# Set environment ---------------------------------------------------------
# Start from an empty workspace: removes every visible object in the current environment.
# NOTE(review): this also deletes objects the user already had in the session.
rm(list = ls())
# Move to the course folder that is expected to hold "Rats.RData". The path is
# specific to the author's machine, so the directory is changed only when it
# exists; elsewhere the script keeps the current working directory instead of
# stopping with a "cannot change working directory" error.
# local() keeps the helper variable out of the (just cleaned) workspace.
local({
  course_dir <- "/home/antonio/MEGA/Lavoro_sync/Didattica/2021_2022/glms/"
  if (dir.exists(course_dir)) {
    setwd(course_dir)
  } else {
    message("Course directory not found; keeping working directory: ", getwd())
  }
})


# Data --------------------------------------------------------------------
# Restore the saved objects from the working directory; the file is expected
# to provide the data frame `Rats` used throughout the script.
load(file = "Rats.RData")
# Quick inspection: column types first, then the leading rows.
str(Rats)
head(Rats)
# The dataframe has 58 rows and 5 columns about the effects of dietary regimens on the fetal
# development of rats in a laboratory setting. There are four groups. In particular, rats in group 1 were given placebo
# injections, and rats in other groups were given injections of an iron supplement.
# This was done on days 7 and 10 in group 2, on days 0 and 7 in group 3, and weekly in group 4.
# The 58 rats were made pregnant, sacrificed after 3 weeks, and then the total number of dead fetuses
# was counted in each litter, as was the mother’s hemoglobin level.
# 
# The variables are as follows:
# litter: litter index (a litter is the set of fetuses carried by one mother; Italian: 'cucciolata')
# group: group index (1,2,3,4)
# h: hemoglobin level of the mother
# n: number of fetuses in the litter
# s: number of dead fetuses in the litter

# We may dichotomize the group assignment as follows:
# placebo = 1 for group 1 (placebo injections), 0 for the iron-supplement groups (2, 3, 4).
Rats[["placebo"]] <- as.numeric(Rats[["group"]] == 1)
# y = observed proportion of dead fetuses in each litter (s dead out of n fetuses).
Rats[["y"]] <- with(Rats, s / n)
# Next, we can consider whether the probability of fetal death depends on
# placebo vs treatment and on the mother's hemoglobin level.


# Models ------------------------------------------------------------------
# Models for overdispersed binomial data (quasi-binomial and Beta-Binomial)
# Useful comments: https://rpubs.com/cakapourani/beta-binomial

# From https://online.stat.psu.edu/stat504/lesson/7/7.2/7.2.2:
# Overdispersion can occur because the mean and variance components of a GLM are related and 
# depend on the same parameter, which is predicted through the vector of explanatory variables.
# With discrete response variables the possibility for overdispersion exists because the commonly used distributions 
# specify particular relationships between the variance and the mean.
# For the binomial response, if Y_i ~ Bin(m_i,pi_i), the mean is E[Y_i] = mu_i = pi_i*m_i and the variance is 
# VAR[Y_i] = (mu_i*m_i-mu_i^2)/m_i = m_i*pi_i*(1-pi_i)
# Overdispersion means that the data show evidence that the (sample) variance of the response is greater than VAR[Y_i] (expected variance)
# Overdispersion arises when the m_i Bernoulli trials that are summarized in a line of the dataset are:
## not identically distributed (i.e. the success probabilities vary from one trial to the next), or
## not independent (i.e. the outcome of one trial influences the outcomes of other trials).
# The usual way to correct for overdispersion in a logit model is to assume that:
## E[Y_i] = m_i*pi_i
## VAR[Y_i] ~= m_i*pi_i*(1-pi_i)*phi
# where phi is a scale parameter (overdispersion parameter).

# Binomial logit GLM for the per-litter proportion y (defined above as s / n).
# The litter sizes n are passed as prior weights: with a proportion as response,
# glm() uses the weights as the number of trials behind each proportion.
mod1 <- glm(
  y ~ placebo + h,
  family = binomial(link = logit),
  weights = n,
  data = Rats
)
summary(mod1)
# Interpretation: the probability of fetal death is higher in the placebo group
# (controlling for the hemoglobin level 'h').
# Effect displays for every term in the model:
plot(effects::allEffects(mod1))

# The residual variance looks too high, so we check whether adding a dispersion
# parameter would improve the fit of the current model.
# A Pearson-based statistic is useful here: the sum of squared Pearson residuals
# divided by the residual degrees of freedom.
pears_stat <- sum(residuals(mod1, type = "pearson")^2) / df.residual(mod1)
print(pears_stat)
# Generally, values of this statistic higher than 1 may indicate the presence of over-dispersion.
# We can proceed further with a model that takes overdispersion into account.
# Note: aod::quasibin() fits a quasi-binomial model by quasi-likelihood, not a
# Beta-Binomial likelihood. Per the aod documentation its variance is
# VAR[Y_i] = m_i*pi_i*(1-pi_i)*[1+(m_i-1)*phi], the same mean-variance form as the
# Beta-Binomial; the fully parametric Beta-Binomial fit is aod::betabin().
# The response is given as cbind(successes, failures) = cbind(dead, alive) per litter.
mod2 <- aod::quasibin(cbind(s, n - s) ~ placebo + h, data = Rats)
print(mod2)
# Also under the overdispersed model, the probability of fetal death is higher
# for litters whose mother received placebo (controlling for 'h').






