###########################################################################
### Statistical methods and data analysis in developmental psychology
### A.Y. 2021/2022
### prof. Antonio Calcagni' (antonio.calcagni@unipd.it)
###########################################################################


### CONTENTS ###########################################
## (A) Data
## (B) A first model
## (B.1) Descriptive statistics
## (B.2) Model building
## (B.3) Final model
## (B.4) Diagnostics

## (C) A second model
## (C.1) Transform the data
## (C.2) Model building
## (C.3) Final model
## (C.4) Diagnostics
########################################################


# Set environment ---------------------------------------------------------
# Start from an empty workspace so that objects left over from previous
# sessions cannot interfere with the analysis.
rm(list = ls())
# Lab folder on the author's machine. The relative paths used in this script
# ('utilities.R', 'data/data2.Rda') are resolved against the working directory.
# If the folder does not exist (e.g., on another computer) the current working
# directory is kept instead of stopping with an error: in that case run the
# script from the folder that contains the lab material.
lab_dir <- "/home/antonio/MEGA/Lavoro_sync/Didattica/2021_2022/smda/labs/"
if (dir.exists(lab_dir)) {
  setwd(lab_dir)
} else {
  message("Lab folder not found: '", lab_dir, "'. Using the current working directory: ", getwd())
}
# Load the helper functions of the course
source("utilities.R")


# (A) Data ----------------------------------------------------------------
## Data refer to a study of reaction times in sec ('rts') under sleep deprivation. Reaction times were computed using the Multi-Attribute Task Battery, 
## a set of tasks used in laboratory studies of operator performance and workload. The task is sensitive to sleep-deprivation effects. 
## The sample consists of n=120 units and J=9 variables about personality measures as follows: 
## traits: novelty seeking ('novelty_seek'), harm avoidance ('harm_avoid'), reward dependence ('reward_dep');
## states: 'anger', 'confusion', 'tension', 'fatigue', 'depression', 'activity'.
## They have been measured on 10-point scales.
## The goal is to predict rts under sleep deprivation w.r.t. personality measures.
# Restore the objects saved in the .Rda file (the rest of the script expects a data set called 'datax')
load("data/data2.Rda")
# Sample size (number of rows)
n <- NROW(datax)
# Number of predictors: all the columns except one (the outcome)
J <- NCOL(datax) - 1


# (B) A first model -------------------------------------------------------

## (B.1) Descriptive statistics
## A first look at the data: first rows, structure, and numerical summaries of each variable.
head(datax)
str(datax)
psych::describe(datax)

## Scatter plots of the outcome (first column) against each of the remaining columns,
## arranged in a 3x3 grid within a new graphical window.
x11()
par(mfrow = c(3, 3))
for (j in 2:NCOL(datax)) {
  plot(x = datax[, j], y = datax[, 1], bty = "n", xlab = names(datax)[j], ylab = "RTs")
}
# Each panel shows the outcome variable ('rts') as a function of one personality predictor. As expected, the response
# data are aggregated w.r.t. the levels/categories of the predictors, which in this case are not continuous
# (they are categorical with several distinct levels).
## Note: Alternatively, you can run exploratory_plots(y=datax[,1],X=datax[,2:10]) from utilities.R, which produces all these exploratory plots in a simplified way.

## (B.2) Model building
## Full model: 'rts' regressed on all the other columns of the data set.
mod_full <- lm(formula = rts ~ ., data = datax)
summary(mod_full)
# The adjusted R2 index is low, indicating that the model is currently explaining about 10% of the total variability.
## We can look for a better model w.r.t. the overall fit. As described in lab2.R, we may use the add1() function
## or, alternatively, the leaps() function of the 'leaps' library. Here the best submodel is the one maximizing the adjusted R2 index.
out <- leaps::leaps(
  x = datax[, 2:NCOL(datax)],
  y = datax[, 1],
  method = "adjr2",
  names = names(datax)[2:NCOL(datax)],
  int = TRUE
)
## Note that 'int=TRUE' always adds the intercept to each submodel.
print(out)
## The output of the procedure is a list where:
## $which contains the Boolean matrix of the selected variables
## $label is the array with the names of the variables
## $size is the number of included variables for each submodel being tested
## $adjr2 is the adjusted R2 index for each submodel being tested
adjr2 <- out$adjr2
# Total number of submodels being tested:
length(adjr2)
# Adjusted R2 of each submodel; the best one is marked by a red dot and by the two reference lines
x11()
plot(1:length(adjr2), adjr2, bty = "n", xlab = "model index", ylab = "adjR2", pch = 1)
points(x = which.max(adjr2), y = max(adjr2), pch = 20, col = "red")
abline(h = max(adjr2), v = which.max(adjr2))
# The model with the largest adjusted R2 is the submodel number
which.max(adjr2)
# which achieved an adjusted R2 index equal to
max(adjr2)
# which is, again, not so large (about 5% of the variability is currently explained by the model)
# NOTE(review): presumably the full model is among the candidate subsets, in which case the best adjusted R2 cannot be
# lower than the one of the full model; the 'about 10%' and 'about 5%' figures should be checked against the actual output.
# The subset of variables of the best submodel is:
xvars <- out$which[which.max(adjr2), ]
# where we need to select those flagged as TRUE
names(xvars)[xvars]
## Note: Alternatively, you can use the function leaps_r2(y=datax[,1],X=datax[,2:10]) from utilities.R, which runs a simplified leaps() function.

## (B.3) Final model
## Refit the linear model using the two predictors 'confusion' and 'depress' only.
mod_best <- lm(formula = rts ~ confusion + depress, data = datax)
summary(mod_best)
## Effect plots for each term of the fitted model (new graphical window)
x11()
plot(effects::allEffects(mod_best))
# Reaction times (in sec) increase as a function of confusion and depression.

## (B.4) Diagnostics
## All the six built-in diagnostic plots of the fitted model at once (2x3 grid):
x11()
par(mfrow = c(2, 3))
plot(mod_best, which = 1:6)
## Alternatively, the assumptions of the model can be evaluated step-by-step (slides 43-45, Module B).

## Normality of residuals (slide 46, Module B)
x11()
plot(performance::check_normality(mod_best)) ## density estimation of the residuals
x11()
plot(mod_best, which = 2, bty = "n")
# The distribution of the residuals does not resemble a Normal curve (heavy tails)

## Homoscedasticity (slide 47, Module B)
x11()
plot(performance::check_heteroscedasticity(mod_best))
x11()
plot(mod_best, which = 3, bty = "n")
# There is a slight trend in the plot although it is negligible

## Leverage points (slides 50-51, Module B)
x11()
car::leveragePlots(mod_best)
# The observation i=29 should be carefully inspected (it seems to be an outlier point).

## Identifying outliers and/or influential points (slides 52-54, Module B)
x11()
car::influencePlot(mod_best, bty = "n")
car::outlierTest(mod_best) # Bonferroni-corrected t-test for outliers
# Again, the observation i=29 should be carefully inspected.
# influential_plot() is not defined in this script (presumably it comes from utilities.R)
influential_plot(fitted_model = mod_best, new_window = TRUE)

## Diff-beta statistics (slide 55, Module B)
out <- dfbetas(mod_best)
# diffbeta_plot() is not defined in this script (presumably it comes from utilities.R)
diffbeta_plot(fitted_model = mod_best, new_window = TRUE)
# The observation i=29 is again the most influential w.r.t. the diff-beta statistics for both the predictors.



# (C) A second model ------------------------------------------------------
## The previous model did not fit the data accurately and, as the diagnostics indicated, the Normal linear model was not 
## a good choice for the outcome being analysed (i.e., 'rts'). To improve the model, we have two possibilities: 
## (i) change the probabilistic model of the outcome (e.g., using GLMs -> see the second part of the course)
## (ii) transform the outcome variable
## We will proceed by implementing the second strategy.

## (C.1) Transform the data
## Let's try the log transformation of the times: the new outcome is stored as an additional column of the data set.
datax[["logrts"]] <- log(datax[["rts"]])

## (C.2) Model building
## Predictors are selected by name (every column except the raw and the log-transformed outcome) rather than by
## position, so that this step does not depend on 'logrts' being stored in column 11 or on 'rts' being the first column.
pred_names <- setdiff(names(datax), c("rts", "logrts"))

## Full model for the log-transformed outcome (the raw outcome 'rts' is left out of the data passed to lm())
mod_full2 <- lm(formula = logrts ~ ., data = datax[, c(pred_names, "logrts")]) # full model
summary(mod_full2)

## Best-subset search based on the adjusted R2 index, now using 'logrts' as outcome
out <- leaps::leaps(x = datax[, pred_names], y = datax$logrts, method = "adjr2", names = pred_names, int = TRUE)
## Index of the submodel with the largest adjusted R2 and the corresponding value of the index
c(which.max(out$adjr2), max(out$adjr2))
## Variables included in that submodel (those flagged as TRUE)
xvars <- out$which[which.max(out$adjr2), ]
names(xvars[xvars == TRUE])

## (C.3) Final model
## Same predictors as in (B.3), with the log-transformed times as outcome.
mod_best2 <- lm(formula = logrts ~ confusion + depress, data = datax)
summary(mod_best2)
## Effect plots for each term of the fitted model (new graphical window)
x11()
plot(effects::allEffects(mod_best2))
# Log reaction times increase as a function of confusion and depression.

## (C.4) Diagnostics
## Same step-by-step checks as in (B.4), applied to the model fitted on the log-transformed outcome.

## Normality of residuals (slide 46, Module B)
x11()
plot(performance::check_normality(mod_best2)) ## density estimation of the residuals
x11()
plot(mod_best2, which = 2, bty = "n")

## Homoscedasticity (slide 47, Module B)
x11()
plot(performance::check_heteroscedasticity(mod_best2))
x11()
plot(mod_best2, which = 3, bty = "n")

## Leverage points (slides 50-51, Module B)
x11()
car::leveragePlots(mod_best2)

## Identifying outliers and/or influential points (slides 52-54, Module B)
x11()
car::influencePlot(mod_best2, bty = "n")
# influential_plot() is not defined in this script (presumably it comes from utilities.R)
influential_plot(fitted_model = mod_best2, new_window = TRUE)

## Diff-beta statistics (slide 55, Module B)
out <- dfbetas(mod_best2)
# diffbeta_plot() is not defined in this script (presumably it comes from utilities.R)
diffbeta_plot(fitted_model = mod_best2, new_window = TRUE, adjText = -0.5)

## We may try refitting the current model by excluding the influential observations, e.g. i=29 and i=82.










