# Set environment ---------------------------------------------------------
rm(list=ls())
setwd("/home/antonio/MEGA/Lavoro_sync/Didattica/2020_2021/GLMs/")

# Case study 1: Data ------------------------------------------------------
load(file = "data/Ants.RData")
# The dataset refers to an experiment involving ants and sandwiches. The general aim
# was to evaluate whether ants preferred one of the four types of sandwiches. There were 
# other controlling variables in the experiment concerning the type of sandwich.
# The variables are as follows:
# Bread: integer indicator for the kind of bread (1=rye, 2=wheatmeal, 3=multigrain, 4=white)
# Filling: integer indicator for the kind of filling (1=vegemite, 2=peanut butter, 3=ham and pickles)
# Butter: indicator for butter (1=butter, -1=no butter)
# Ant_count: number of captured ants
# Order: order of the experiment

# The goal is to define a Poisson linear model to represent the number of captured ants as a function of the other variables:
# Y_i ~ Poi(lambda_i), lambda_i = g^-1(Xb), g^-1 is the inverse link function (e.g., log function)

# We can start by running a few of exploratory graphical analyses.
str(Ants)
Ants$Bread=as.factor(Ants$Bread)
Ants$Filling=as.factor(Ants$Filling)
Ants$Butter=factor(x = Ants$Butter,levels = c(-1,1),labels = c("no","yes"))

head(Ants)
psych::describeBy(x = Ants$Ant_count,group = Ants$Bread)
psych::describeBy(x = Ants$Ant_count,group = Ants$Butter)

par(mfrow=c(2,2))
boxplot(Ants$Ant_count~Ants$Bread,frame="n",xlab="Bread",ylab="Counts")
boxplot(Ants$Ant_count~Ants$Filling,frame="n",xlab="Filling",ylab="Counts")
boxplot(Ants$Ant_count~Ants$Butter,frame="n",xlab="Butter",ylab="Counts")
boxplot(Ants$Ant_count~Ants$Bread*Ants$Butter,frame="n",xlab="Bread * Butter",ylab="Counts")


# Case study 1: Models ----------------------------------------------------
# We can start by defining and fitting a poisson linear model with canonical link function
mod1 = glm(data = Ants,formula = Ant_count~Bread+Filling+Butter,family = poisson(link=log))
summary(mod1)
# which shows a higher residual deviance. Still, we can explore if mod1 is better (in terms of fit) as opposed to
# other high-order models (e.g., models where interactions have been added). Comparisons are carried out by means of LR test
add1(object = mod1,scope = .~(.)^2,test = "Chisq")
# The procedure shows that adding the interaction term 'Filling:Butter' will improve the fit of 'mod1' in terms of LR statistic:
mod2 = update(object = mod1,formula. = .~. + Filling:Butter)
summary(mod2)
# We can now repeat the add1() procedure to see if adding other high-order terms will improve the fit of the model.
add1(object = mod2,scope = .~(.)^2,test = "Chisq")
# There is no evidence that adding higher-order terms will increase the overall fit of mod2. 
# In this case, we have used a forward procedure to prune our glm. Similarly, we may also start from the full model (i.e., that containing all the higher-order terms)
# and proceed by removing the uninfluential terms by adopting a backward procedure via drop1() function.
# Final model:
summary(mod2)
par(mfrow=c(2,2))
plot(mod2,which=1:4) #the observation i=19 is the most influential in the dataset (it can be removed!)
# The overall fit is still not satisfactory, as shown by the higher residual deviance and the residual plots of the model.
plot(effects::allEffects(mod2))

# Let's check for overdispersion:
pears_stat = sum(residuals(mod2,type = "pearson")^2) / mod2$df.residual
#pchisq(q = sum(residuals(mod2,type = "pearson")^2),df = mod2$df.residual,lower.tail = FALSE) --alternatively
print(pears_stat)
# The Pearson-based statistic indicates the presence of overdispersion in the data.
# An alternative model for overdispersed poisson counts if the Negative Binomial model
# where Y_i ∼ NegBin(k, pi_i)
# E[Y_i] = k*(1-pi_i)/pi_i i 
# VAR[Y_i] = lambda_i(1 + lambda_i/k) with lambda_i being the Poisson means of the model
# and overdispersion parameter equals to tau = 1/k 
mod1_nb = MASS::glm.nb(data = Ants,formula = Ant_count~Bread+Filling+Butter)
mod2_nb = MASS::glm.nb(data = Ants,formula = Ant_count~Filling+Butter)
anova(mod1_nb,mod2_nb,test = "Chisq") #there is no evidence that adding Bread will increase the fit of the model
mod3_nb = MASS::glm.nb(data = Ants,formula = Ant_count~Filling+Butter+Filling:Butter) 
anova(mod2_nb,mod3_nb,test = "Chisq") #there is no evidence that adding Filling:Butter will increase the fit of the model
# The final model is then:
summary(mod2_nb)
# In the Negative-Binomial model, the interaction term Filling:Butter - which has previously been significant - is no longer significant.
# The overdispersion parameter is 11.30.


# Case study 2: Data ------------------------------------------------------
rm(list=ls()); graphics.off()
load(file = "data/Clotting.RData")
# Let us now concentrate on Clotting dataset again (it has been used in Day 1).
Clotting$lotto = relevel(Clotting$lotto,"uno")

plot(log(Clotting$u[Clotting$lotto=="uno"]),Clotting$tempo[Clotting$lotto=="uno"],bty="n",xlab="u",ylab="time",pch=20,lwd=2,col=1)
points(log(Clotting$u[Clotting$lotto=="due"]),Clotting$tempo[Clotting$lotto=="due"],bty="n",xlab="u",ylab="time",pch=20,lwd=2,col=2)
legend("topright",legend=c("lot 1","lot 2"),bty="n",pch=20,col=c(1,2))

# We observe that clotting time is a non-linear function of plasma concentration for both groups. 
# Instead of linearizing the response variable (as it has been done in Day 1), we now try defining a Gamma linear model:
# Y_i ~ Gam(mu_i,phi)
# E[Y_i] = g^-1(Xb), g^-1 is the inverse link function (e.g., inverse function)
# V[Y_i] = phi*V(Y_i)
# Interesting topic: https://stats.stackexchange.com/questions/67547/when-to-use-gamma-glms#67550


# Case study 2: Models ----------------------------------------------------
mod1a = glm(data = Clotting, formula = tempo~log(u)+lotto,family = Gamma(link=inverse))
mod1b = glm(data = Clotting, formula = tempo~log(u)+lotto,family = Gamma(link=identity))
AIC(mod1a,mod1b) #mod1a is to be preferred

mod2 = glm(data = Clotting, formula = tempo~log(u)*lotto,family = Gamma(link=inverse))
summary(mod2)
anova(mod1a,mod2,test="F")

# observed vs predicted response values
plot(log(Clotting$u[Clotting$lotto=="uno"]),Clotting$tempo[Clotting$lotto=="uno"],bty="n",xlab="u",ylab="time",pch=20,lwd=2,col=1)
points(log(Clotting$u[Clotting$lotto=="due"]),Clotting$tempo[Clotting$lotto=="due"],pch=20,lwd=2,col=2)
legend("topright",legend=c("lot 1","lot 2"),bty="n",pch=20,col=c(1,2))

points(log(Clotting$u[Clotting$lotto=="uno"]),mod2$fitted.values[Clotting$lotto=="uno"],pch=25,lwd=2,col="gray")
points(log(Clotting$u[Clotting$lotto=="due"]),mod2$fitted.values[Clotting$lotto=="due"],pch=22,lwd=2,col="gray")


# Model checking via Monte Carlo ------------------------------------------
# Let consider a Poisson linear model for given counts (y) and a categorical variable (z)

## Simulate the true model
set.seed(191020)
n=250 #number of observations
alpha=0; beta=1.11 #known parameters for the linear predictor
z = rbinom(n = n,size = 1,prob = 0.5) #simulate a categorical variable {0:Control,1:Experimental condition}
mu = alpha+z*beta #linear predictor
y = rpois(n,exp(mu)) #simulate n counts with means mu

boxplot(y[z==0],y[z==1],names = c("E","C")) 
head(cbind(y,z),n=10) # current dataset

# A common (but wrong) way to analyse count data is to represent them using a Normal model. This is frequently done, for instance, in the 
# ANOVA case and related settings. Here, we will consider what is the effect of treating count data as they are sampled from a Normal model.
mod1 = lm(formula = y~z) 
summary(mod1)
alpha_est = summary(mod1)$coefficients[1,1]
beta_est = summary(mod1)$coefficients[2,1]
s_est = summary(mod1)$sigma 

# Next, let generate data from the fitted model
M = 1000 # number of replicates
Ypred = matrix(NA,nrow = M,ncol = n)
mu_pred = alpha_est+z*beta_est #estimated linear predictor for the mean of the model
for(m in 1:M){
  print(paste("Prediction no.:",m,sep=" "))
  Ypred[m,] = rnorm(n,mu_pred,s_est)
}
print(Ypred[1:5,1:10]) #predicted data

# Plot the observed data
suppy = min(y):(max(y)-1) #discrete domain of the response variable y
py = table(y)/n #observed counts (relative counts) 
plot(x=suppy,y=py,bty="n",xlim=c(-2,max(suppy)+2.5),ylim=c(0,max(py)+0.1),ylab="counts",xlab="y",col=10,lwd=2.5); segments(x0=suppy,x1=suppy,y0=0,y1=py,lwd=1.5) 

# ..add the predicted data
for(m in 1:M){lines(density(Ypred[m,]),col="gray")} #densities
points(x=suppy,y=py,col=10,lwd=2.5);segments(x0=suppy,x1=suppy,y0=0,y1=py,lwd=1.5) #just to highlight the observed data
# We can observe how the predicted data from the Normal linear model goes beyond the natural lower bound for the counts (i.e., they are less then zero!)
# The Normal linear model, when applied on counts data, predicts observations beyond the natural range of the response variable y.
# The percentage of the data outside the range of y is as follows:
sum(as.vector(Ypred)<0)/length(as.vector(Ypred))*100

# To compare sample data y with regards to predicted data {y1,..,yM} we may use some statistics t(y) of the form t: Y^n -> R and
# compare them to the predicted data. The rationale is to evaluate where t(y) is with respect to the predicted distribution F(t(y1,..,yM)).
# By assessing how much the predicted distribution 'contains' the statistic t(y), one can quantify the plausibility of the predictions given the observations.

# First statistic: mean and variance
yt_pred = apply(Ypred,1,mean) #predicted means
yt_obs = mean(y) #observed mean
hist(yt_pred,main=""); abline(v = yt_obs,col=2,lwd=2,lty=2)
delta_t = min(sum(yt_pred >= yt_obs)/M, sum(yt_pred < yt_obs)/M)
print(delta_t) 
# when delta_t approximates zero then the model is not completely adequate to represent the characteristics of the data (in terms of the given t(y))

yt_pred = apply(Ypred,1,var) #predicted variances
yt_obs = var(y) #observed variance
hist(yt_pred,main=""); abline(v = yt_obs,col=2,lwd=2,lty=2)
delta_t = min(sum(yt_pred >= yt_obs)/M, sum(yt_pred < yt_obs)/M)
print(delta_t)
# Mean and variance of counts are adequately represented by the Normal model

# Second statistic: minumum
yt_pred = apply(Ypred,1,min)
yt_obs = min(y)
hist(yt_pred,main=""); abline(v = yt_obs,col=2,lwd=2,lty=2)
delta_t = min(sum(yt_pred >= yt_obs)/M, sum(yt_pred < yt_obs)/M)
print(delta_t)
# The Normal model does not correctly represent the mininum of the counts

# Third statistic: maximum
yt_pred = apply(Ypred,1,max)
yt_obs = max(y)
hist(yt_pred,main=""); abline(v = yt_obs,col=2,lwd=2,lty=2)
delta_t = min(sum(yt_pred >= yt_obs)/M, sum(yt_pred < yt_obs)/M)
print(delta_t)
# The Normal model does not correctly represent the maximum of the counts

# Fourth statistic: skewness
yt_pred = apply(Ypred,1,function(x)psych::skew(x)) 
yt_obs = psych::skew(y)
hist(yt_pred,main=""); abline(v = yt_obs,col=2,lwd=2,lty=2)
delta_t = min(sum(yt_pred >= yt_obs)/M, sum(yt_pred < yt_obs)/M)
print(delta_t)
# Observed data show positive skewness (yt_obs>0): The right tail is longer and the mass of the distribution is concentrated on the left of the distribution

# Fifth statistic: kurtosis
yt_pred = apply(Ypred,1,function(x)psych::kurtosi(x)) 
yt_obs = psych::kurtosi(y)
hist(yt_pred,main=""); abline(v = yt_obs,col=2,lwd=2,lty=2)
delta_t = min(sum(yt_pred >= yt_obs)/M, sum(yt_pred < yt_obs)/M)
print(delta_t)

# In general, a Normal model in this case fails in reproducing some characteristics of the observed counts. Although it is still enough 
# when the interest lies in modeling mean or variance of the data, it is not enough when the interest is to represent quantiles of the
# distribution such as minimum/maximum or shape of the distribution (skewness/kurtosis).
# In this case, a Poisson model should be preferred.





