###########################################################################
### Statistical methods and data analysis in developmental psychology
### A.Y. 2021/2022
### prof. Antonio Calcagni' (antonio.calcagni@unipd.it)
###########################################################################


### CONTENTS ###########################################
## (A) ANOVA model
## (A.1) Data
## (A.2) Descriptive statistics
## (A.3) Evaluating nested models
## (A.4) Some diagnostics

## (B) ANCOVA model
## (B.1) Data
## (B.2) Descriptive statistics
## (B.3) Evaluating nested models
## (B.4) Some diagnostics
########################################################


# Set environment ---------------------------------------------------------
# NOTE: the next two lines wipe every object from the workspace and move to a
# machine-specific folder; adapt the path to your own copy of the labs before
# running the script.
rm(list = ls())
setwd("/home/antonio/MEGA/Lavoro_sync/Didattica/2021_2022/smda/labs/")
# Course helpers: diffbeta_plot() and posterior_pcheck_Normal(), called later in
# this script, are not defined here and presumably come from this file.
source("utilities.R")


# (A) ANOVA model ---------------------------------------------------------
## ANOVA models can be defined and fitted using the lm() function as done
## till now for the Normal linear models. Indeed, the ANOVA is a Normal linear
## model where the predictors are categorical variables. We can use the syntax
## and the statistical theory of Normal linear models to conveniently fit
## ANOVA models.

## (A.1) Data
## Data refers to n=80 participants who were tested on a cognitive task about reading performance.
## The variables are as follows:
## y: Measure of cognitive performance in reading normalized in [-10,10] (the higher the score, the better the performance).
## group: Categorical variable indicating whether a participant belongs to the experimental group ("Exp") or control group ("Control").
## task: Four types of cognitive tasks.
# Load the data set for section (A); the code below expects it to provide the
# data frame 'datax'.
load(file = "data/data6.Rda")
# Sample size (number of rows).
n <- NROW(datax)
# Number of columns minus one (presumably the predictors, outcome 'y' excluded).
J <- NCOL(datax) - 1
# Print the first rows to check the content.
head(x = datax)

## (A.2) Descriptive statistics
## The data refers to a factorial design involving two categorical variables (group, task) with two and four levels, respectively.
## We can start by inspecting the marginal distribution of the outcome variable w.r.t. the two factors:
# New graphics window split into two side-by-side panels.
x11()
par(mfrow = c(1, 2))
# First margin of the factorial design: outcome by 'group' (regardless of 'task').
boxplot(datax$y ~ datax$group, frame = FALSE)
# Second margin of the factorial design: outcome by 'task' (regardless of 'group').
boxplot(datax$y ~ datax$task, frame = FALSE)
# The marginal distributions do not show skewness and, in general, there seems to be no effect of the 'group' variable
# on the outcome (except for a greater variability in the control group), whereas there are some effects of 'task'.
## The interactions can be inspected with the following commands:
# New graphics window with a single panel.
x11()
par(mfrow = c(1, 1))
# Cell means with 'group' on the x-axis and one trace per level of 'task'.
interaction.plot(
  x.factor = datax$group, trace.factor = datax$task,
  response = datax$y, bty = "n"
)
# Cell means with 'task' on the x-axis and one trace per level of 'group'.
interaction.plot(
  x.factor = datax$task, trace.factor = datax$group,
  response = datax$y, bty = "n"
)
# It seems that an interaction exists as the differences in the mean of the outcome change over the 'group' and 'task' variables.
# The second plot shows that the control group has an inverse pattern on the outcome variable as opposed to the exp group.


## (A.3) Evaluating nested models
## We can evaluate the effect of the factorial design on the outcome variable by defining and running the following submodels:
# Additive model: main effects of 'group' and 'task', no interaction.
mod1 <- lm(formula = y ~ group + task, data = datax)
# Complete model: main effects plus the group-by-task interaction.
mod2 <- lm(formula = y ~ group * task, data = datax)
# F test comparing the two nested models, i.e. a test of the interaction term.
anova(mod1, mod2, test = "F")
# There is poor evidence for an interaction effect of the factors on the outcome.
# Then, proceed by keeping the additive model.
anova(mod1, test = "F")
# Overall, there is no evidence on a statistical effect of the experimental variables on the outcome.
# The coefficients for each level of the variables are as follows:
summary(mod1)
# Graphical display of the effects estimated by the additive model.
plot(effects::allEffects(mod1))
## Note that the same results could have been achieved if the following commands were run:
# The formula must use the columns of 'datax' ('group', 'task'): this is the
# same specification as 'mod2', fitted through aov() instead of lm().
mod_aov <- aov(formula = y ~ group * task, data = datax)
summary(mod_aov) # the same as anova(mod2)
## where the function aov() has been used instead of lm()


## (A.4) Some diagnostics
# Normality of the residuals of the selected (additive) model.
x11()
plot(performance::check_normality(mod1))

# Homogeneity of the residual variance of the selected model.
x11()
plot(performance::check_heteroscedasticity(mod1))
# Breusch-Pagan test, run on the fitted model under diagnosis so that it uses
# the predictors actually present in 'datax' ('group' and 'task').
lmtest::bptest(mod1)
# Bartlett test of equal variances across the levels of 'group'
# (on the categorical predictor only).
bartlett.test(formula = y ~ group, data = datax)

# Influential observations.
x11()
car::influencePlot(mod1, bty = "n")
# Helper not defined in this file (presumably from utilities.R); judging by its
# name it plots DFBETA-type influence measures - confirm in utilities.R.
diffbeta_plot(fitted_model = mod1, new_window = TRUE)

# Helper not defined in this file (presumably from utilities.R); judging by its
# name it runs a predictive check for the Normal model - confirm in utilities.R.
posterior_pcheck_Normal(fitted_model = mod1, new_window = TRUE)



# (B) ANCOVA model --------------------------------------------------------
## ANCOVA models can be defined and fitted using the lm() function as done
## for the ANOVA case. Indeed, the ANCOVA is a Normal linear
## model where the predictors are both categorical and continuous variables. 

## (B.1) Data
## Data are the same as those used for the previous section (A). However, in this case the variable 'task' has been 
## replaced by the numeric variable 'age'.
## Then, variables are as follows:
## y: Measure of cognitive performance in reading normalized in [-10,10] (the higher the score, the better the performance).
## group: Categorical variable indicating whether a participant belongs to the experimental group ("Exp") or control group ("Control").
## age: Numeric variable referring to age of participants (in years).
# Load the data set for section (B); the code below expects it to provide the
# data frame 'datax' (replacing the one used in section A).
load(file = "data/data7.Rda")
# Sample size (number of rows).
n <- NROW(datax)
# Number of columns minus one (presumably the predictors, outcome 'y' excluded).
J <- NCOL(datax) - 1
# Print the first rows to check the content.
head(x = datax)


## (B.2) Descriptive statistics
## The data refers to a factorial design involving one categorical variable ('group') with two levels and a continuous variable ('age').
## We can start by inspecting the marginal distribution of the outcome variable w.r.t. the two predictors:
# New graphics window split into two side-by-side panels.
x11()
par(mfrow = c(1, 2))
# Left panel: outcome by 'group' (regardless of 'age').
boxplot(datax$y ~ datax$group, frame = FALSE)
# Right panel: outcome against 'age'.
## Note: type="n" sets up the axes without drawing any point. It just creates
## the graph for the next two commands, as points() requires that plot() has
## been called before.
plot(datax$y ~ datax$age, type = "n", bty = "n")
# We indicate with the shape "E" and color "red" those observations belonging to the Experimental group.
points(
  datax$age[datax$group == "Exp"], datax$y[datax$group == "Exp"],
  col = "red", pch = "E"
)
# Similarly, we indicate with the shape "C" and color "blue" those observations belonging to the Control group.
points(
  datax$age[datax$group == "Control"], datax$y[datax$group == "Control"],
  col = "blue", pch = "C"
)


## (B.3) Evaluating nested models
## As for the ANOVA case, we can run the ANCOVA model by defining as many submodels as hypotheses:
# Additive model: 'group' and 'age' without interaction (parallel lines).
mod1 <- lm(formula = y ~ group + age, data = datax)
# Complete model: 'group', 'age' and their interaction (group-specific slopes).
mod2 <- lm(formula = y ~ group * age, data = datax)
# F test comparing the two nested models, i.e. a test of the interaction term.
anova(mod1, mod2, test = "F")
# Then, select 'mod2':
# coefficient table with t tests ...
summary(mod2)
# ... and analysis-of-variance table with F tests.
anova(mod2)
# As we can notice from summary(mod2), both 'age' and the interaction are significant whereas 'group' is not.
# This indicates that the differences between Exp and Control emerge as soon as 'age' is added as an additional covariate.
## Note: The output of anova() shows that 'group' is overall significant although the t test provided by summary(mod2)
## indicates the opposite result. This should not be surprising: anova() implements sequential F tests, each indicating whether
## adding a term improves (or not) the fit of the model containing the preceding terms only, whereas the t test on 'group' in
## summary(mod2) refers to the difference between the groups at age = 0, given all the other terms (interaction included).

# Effect displays for the complete model, each in its own graphics window.
# Term 'group':
x11()
plot(effects::effect(term = "group", mod = mod2))
# Term 'age':
x11()
plot(effects::effect(term = "age", mod = mod2))
# Interaction term 'group:age':
x11()
plot(effects::effect(term = "group:age", mod = mod2))

## We may also draw the submodels related to the interaction term by using the abline() command after having calculated
## the coefficients for the two submodels. With b = coef(mod2), ordered as (intercept, group, age, group:age), and
## assuming 'Control' is the reference level of 'group' (as the colours used below imply):
# submodel 1 (Control): b[1] + b[3]*age
#   e.g. 3.5288 - 0.1084*age with the estimates quoted in the original notes
# submodel 2 (Exp):     (b[1] + b[2]) + (b[3] + b[4])*age
#   e.g. intercept 3.5288 + 1.7729; the slope is the 'age' coefficient plus the interaction coefficient

x11()
# type = "n" sets up the axes without drawing any point; the two groups are
# then added with their own colour and symbol.
plot(datax$y ~ datax$age, type = "n", bty = "n")
points(
  datax$age[datax$group == "Exp"], datax$y[datax$group == "Exp"],
  col = "red", pch = "E"
)
points(
  datax$age[datax$group == "Control"], datax$y[datax$group == "Control"],
  col = "blue", pch = "C"
)
# Regression line of submodel 1 (blue, same colour as the "C" points).
abline(a = coef(mod2)[1], b = coef(mod2)[3], col = "blue")
# Regression line of submodel 2 (red, same colour as the "E" points).
abline(
  a = coef(mod2)[1] + coef(mod2)[2],
  b = coef(mod2)[3] + coef(mod2)[4],
  col = "red"
)

## The same results could have been achieved by running the two submodels separately.
## The fitted lines coincide with those implied by the interaction model; only the
## standard errors differ, because each separate fit estimates its own residual variance.
# Regression of 'y' on 'age' within the Control group only.
mod2a <- lm(formula = y ~ age, data = datax, subset = group == "Control")
# Regression of 'y' on 'age' within the Experimental group only.
mod2b <- lm(formula = y ~ age, data = datax, subset = group == "Exp")
## Then, again:
x11()
# type = "n" sets up the axes without drawing any point; the two groups are
# then added with their own colour and symbol.
plot(datax$y ~ datax$age, type = "n", bty = "n")
points(
  datax$age[datax$group == "Exp"], datax$y[datax$group == "Exp"],
  col = "red", pch = "E"
)
points(
  datax$age[datax$group == "Control"], datax$y[datax$group == "Control"],
  col = "blue", pch = "C"
)
# abline() reads intercept and slope directly from each fitted model.
abline(mod2a, col = "blue")
abline(mod2b, col = "red")


## (B.4) Some diagnostics
## Diagnostics can be performed as for the previous case.




