# Set environment ---------------------------------------------------------
# Start from a clean workspace so that objects left over from a previous
# session cannot leak into the analyses below.
rm(list = ls())
# Move to the project root: the data files are loaded with paths relative to it.
# The path is machine-specific, so switch only when it actually exists;
# otherwise keep the current working directory (with a warning) instead of
# aborting the whole script.
local({
  project_dir <- "/home/antonio/MEGA/Lavoro_sync/Didattica/2020_2021/GLMs/"
  if (dir.exists(project_dir)) {
    setwd(project_dir)
  } else {
    warning("Project directory not found: ", project_dir,
            " -- keeping the current working directory: ", getwd(),
            call. = FALSE)
  }
})


# Case study 1: Data ------------------------------------------------------
# Load the ungrouped beetle data, then inspect its structure and its first rows
load("data/Beetles10.RData")
str(Beetles10)
head(Beetles10)
# The data consist of n=481 observations, one for each beetle exposed for 5 hours to gaseous carbon disulphide.
# Variables:
# logdose10: continuous variable about concentration of carbon disulphide (mg. per litre) in log scale
# ucciso (killed): integer variable about beetles' death (0: survived, 1: dead)

# The goal here is to define a linear model for 'ucciso' as a function of log.dose10. As the response variable is dichotomous
# (it represents samples from a Binomial random variable), we need to use a Binomial linear model. Here, the term linear refers
# to the so-called linear predictor of the mean Eta_i = Xb. Note also that in the standard Binomial linear model, mean and variance are both
# functions of the same explanatory variables and they do not account for possible overdispersion (a condition where VAR[Y_i] is larger than the variance implied by the Binomial model).

# In the case of Beetles10, data are not grouped. This means that the response variable Y_i is represented as a collection of zero and one:
# Y_i ~ Bin(1,pi_i)
# with pi_i = g^-1(Xb), g^-1 is the inverse link function (e.g., logistic function)
# Similarly, the Binomial linear model works for grouped data as well (i.e., for each observation the total number of cases are reported).
# For instance:
# Cross-tabulate the two columns of Beetles10: one row per distinct log-dose,
# one column per outcome (first column: survived, second column: killed)
X <- table(Beetles10)
Beetles <- data.frame(
  logdose = as.numeric(rownames(X)),
  m = X[, 1] + X[, 2],  # 'm' is the total number of beetles at each log-dose
  killed = X[, 2],
  row.names = NULL
)
print(Beetles)
# In this case, the Binomial linear model is:
# m_i*Y_i ~ Bin(m_i,pi_i)
# with pi_i being defined as above.

# This is to say that we can use the same GLM for both grouped and ungrouped dichotomous responses. We will use the grouped one (i.e., Beetles)
# for the next analyses.
# In the case of grouped data the response variable is the observed proportion of killed beetles:
Beetles$y <- Beetles$killed / Beetles$m


# Case study 1: Models ----------------------------------------------------
# Logit-link Binomial GLM on the grouped data: the response is the observed
# proportion and the group sizes 'm' enter as prior weights
mod1_logit <- glm(y ~ logdose, family = binomial(link = "logit"), data = Beetles, weights = m)
summary(mod1_logit)
# As we are dealing with grouped data, the total number of cases 'm' has to be used as weights for the model.
# This is the same as
# mod1_logit = glm(data=Beetles10,formula = ucciso~log.dose10,family = binomial(link=logit))
# for the case of ungrouped data (weights=1 and it can be omitted).
# Similarly, the syntax produces the same results as:
# mod1_logit = glm(data=Beetles,formula = cbind(killed,m-killed)~logdose,family = binomial(link=logit)) 

# We may try using different link functions
# Same linear predictor and weights as mod1_logit, but with a probit link
mod1_probit <- glm(y ~ logdose, family = binomial(link = "probit"), data = Beetles, weights = m)
# Compare the two link functions in terms of AIC
AIC(mod1_logit, mod1_probit)
# Overall, according to the AIC, the probit link seems to work slightly better than the standard logit.
# In addition, the probit link yields a smaller residual deviance than the logit one.

# Observed proportions against log-dose, with the fitted curves of both models
plot(Beetles$logdose, Beetles$y, bty = "n", xlab = "logdose", ylab = "y", pch = 20, lwd = 3)
lines(Beetles$logdose, mod1_probit$fitted.values, lty = 2, col = "gray", lwd = 2)
lines(Beetles$logdose, mod1_logit$fitted.values, lty = 4, col = "gray", lwd = 2)
# The legend keys use the same colour and line width as the curves they describe
legend("topleft", legend = c("logit", "probit"), bty = "n", lty = c(4, 2), col = "gray", lwd = 2)

# Alternatively:
# plot(effects::allEffects(mod1_probit)) 

# Interpretation of the coefficients should be made according to the link function scale:
coef(mod1_probit)
# mod1_probit uses the probit link, so its linear predictor lives on the z-score scale Phi^-1(pi_i), not on the log-odds scale:
# beta = 19.73 is the estimated amount by which Phi^-1(pi) (the standard normal quantile of the probability of death) would increase if logdose were one unit higher.
# The value of Phi^-1(pi) when logdose is 0 is just the Intercept. In this case, as beta > 0, increasing the logdose increases the probability to kill beetles.
# The log-odds interpretation applies instead to the coefficients of the logit model, i.e. coef(mod1_logit).
# Useful discussion (logit case): https://stats.stackexchange.com/questions/34636/interpretation-of-simple-predictions-to-odds-ratios-in-logistic-regression/34638#34638

# Some useful machineries (for the logit case):
# Do not forget that pi_i = g^-1(eta_i), with eta_i = b0 + logdose_i*b1, and conversely eta_i = g(pi_i),
# where for the logit link g(u):=log(u/(1-u)) whereas g^-1(z):=exp(z)/(1+exp(z))
# Indeed,
# Linear predictor computed by hand: (intercept, logdose) design matrix times the estimated coefficients
eta <- cbind(1, Beetles$logdose) %*% coef(mod1_logit)
print(eta)
# which is the same as
predict(mod1_logit, type = "link")
# Instead, passing the linear predictor through the logistic function gives the fitted probabilities
mu <- cbind(1, Beetles$logdose) %*% coef(mod1_logit)
probs <- exp(mu) / (1 + exp(mu))
# which is the same as
predict(mod1_logit, type = "response")
# and taking the log-odds of the probabilities gives back the linear predictor
log(probs / (1 - probs))

# The overall fit of the model can be evaluated in terms of AUC of the ROC curve.
# The ROC curve requires dis-aggregated data, so the grouped dataset Beetles is
# expanded back to one row per beetle with two columns:
#   y       : observed response (for each log-dose, 'killed' ones followed by 'm - killed' zeros)
#   yfitted : probability fitted by mod1_probit for the log-dose of that beetle
# The expansion is vectorised with rep() instead of growing a matrix row-block by
# row-block inside a loop; rows come out in the same order.
Beetles_long <- data.frame(
  # rbind() interleaves the counts as (killed_1, survived_1, killed_2, survived_2, ...),
  # matching the repeated (1, 0) pattern element by element
  y = rep(rep(c(1, 0), times = nrow(Beetles)),
          times = as.vector(rbind(Beetles$killed, Beetles$m - Beetles$killed))),
  # drop the names of the fitted values so they cannot be picked up as row names
  yfitted = rep(unname(mod1_probit$fitted.values), times = Beetles$m),
  row.names = NULL
)

# ROC curve of the probit model: observed 0/1 outcomes against fitted probabilities
mod1_probit_roc <- pROC::roc(response = Beetles_long$y, predictor = Beetles_long$yfitted)
plot(mod1_probit_roc, print.auc = TRUE)
# The AUC indicates that the predictions of mod1_probit closely resemble the observed data.

# Residual analysis
# Draw the four standard glm diagnostic plots on a 2x2 grid, then restore the
# previous graphical settings so that later plots are not squeezed into one panel
old_par <- par(mfrow = c(2, 2))
plot(mod1_probit, which = 1:4)
par(old_par)

# NOTE(review): the half-normal plot is drawn for mod1_logit, whereas the diagnostics above use mod1_probit -- confirm this is intended
binomTools::halfnorm(mod1_logit, resType = "pearson", env = TRUE)
# halfnormal plot: All the residuals are well inside the simulated envelope.

# Given a certain logdose of carbon disulphide, what is the probability for a beetle to die?
## predicting on link scale (with standard error)
linpred <- predict(mod1_probit, newdata = data.frame(logdose = 1.8002), se.fit = TRUE, type = "link")
print(linpred)

## 95% Wald CI for the prediction on link scale
linpred_ci <- linpred$fit + c(-1, 1) * qnorm(0.975) * linpred$se.fit

## CI for the probability for a beetle to die given logdose:
## the link-scale bounds are mapped to the probability scale through the inverse link
prob_ci <- mod1_probit$family$linkinv(linpred_ci)

## Report the answer: point estimate and CI on the probability scale
print(c(fit = unname(mod1_probit$family$linkinv(linpred$fit)),
        lower = prob_ci[1],
        upper = prob_ci[2]))


# Case study 2: Data ------------------------------------------------------
# Load the rat litter data, then inspect its structure and its first rows
load(file = "data/Rats.RData")
str(Rats)
head(Rats)
# The dataframe has 58 rows and 5 columns about the effects of dietary regimens on the fetal
# development of rats in a laboratory setting. There are four groups. In particular, rats in group 1 were given placebo
# injections, and rats in other groups were given injections of an iron supplement.
# This was done on days 7 and 10 in group 2, on days 0 and 7 in group 3, and weekly in group 4.
# The 58 rats were made pregnant, sacrificed after 3 weeks, and then the total number of dead fetuses
# was counted in each litter, as was the mother’s hemoglobin level.
# 
# The variables are as follows:
# litter: litter index ('lettiga')
# group: group index (1,2,3,4)
# h: hemoglobin level of the mother
# n: number of fetuses in the litter
# s: number of dead fetuses in the litter

# We may dichotomize the group assignments as follows:
Rats$placebo <- as.numeric(Rats$group == 1)  # 1: placebo (group 1), 0: any of the treatment groups
Rats$y <- Rats$s / Rats$n                    # proportion of dead fetuses in each litter
# Next, we can consider whether the probability for a fetus to die may depend on placebo vs treatment and on the hemoglobin level of the mother.

# Beta-Binomial model for overdispersion
# Useful comments: https://rpubs.com/cakapourani/beta-binomial

# From https://online.stat.psu.edu/stat504/lesson/7/7.2/7.2.2:
# Overdispersion occurs because the mean and variance components of a GLM are related and 
# depends on the same parameter that is being predicted through the independent vector.
# With discrete response variables the possibility for overdispersion exists because the commonly used distributions 
# specify particular relationships between the variance and the mean.
# For the binomial response, if Y_i ~ Bin(m_i,pi_i), the mean is E[Y_i] = mu_i = pi_i*m_i and the variance is 
# VAR[Y_i] = (mu_i*m_i-mu_i^2)/m_i
# Overdispersion means that the data show evidence that the (sample) variance of the response is greater than VAR[Y_i] (expected variance)
# Overdispersion arises when the m_i Bernoulli trials that are summarized in a line of the dataset are:
## not identically distributed (i.e. the success probabilities vary from one trial to the next), or
## not independent (i.e. the outcome of one trial influences the outcomes of other trials).
# The usual way to correct for overdispersion in a logit model is to assume that:
## E[Y_i] = m_i*pi_i
## VAR[Y_i] ~= m_i*pi_i*(1-pi_i)*phi
# where phi is a scale parameter (overdispersion parameter).


# Case study 2: Models ----------------------------------------------------
# Logit-link Binomial GLM for the proportion of dead fetuses, with the litter sizes 'n' as weights
mod1 <- glm(y ~ placebo + h, family = binomial(link = "logit"), data = Rats, weights = n)
summary(mod1)
# The probability for a fetus to die is higher in the placebo group (controlling for variable 'h')
plot(effects::allEffects(mod1))

# The residual deviance is too high. We may want to consider whether adding a dispersion parameter would improve the fit of the current model.
# The Pearson-based statistic is useful in this context: sum of squared Pearson residuals over the residual degrees of freedom
pears_stat <- sum(residuals(mod1, type = "pearson")^2) / df.residual(mod1)
print(pears_stat)
# Generally, values of this statistic higher than 1 may indicate the presence of over-dispersion.
# We can proceed further by fitting a model that takes overdispersion into account.
# Note: aod::quasibin fits a quasi-likelihood (quasi-binomial) model for overdispersed proportions,
# estimating an overdispersion parameter phi; it is not a full-likelihood Beta-binomial fit.
# The Beta-binomial counterpart in the same package would be:
# aod::betabin(formula = cbind(s, n - s) ~ placebo + h, random = ~ 1, data = Rats)
mod2 <- aod::quasibin(formula = cbind(s, n - s) ~ placebo + h, data = Rats)
print(mod2)
# Also when overdispersion is accounted for, the probability for a fetus to die is higher for litters receiving placebo (controlling for 'h')








