###########################################################################
### Statistical methods and data analysis in developmental psychology
### A.Y. 2021/2022
### prof. Antonio Calcagni' (antonio.calcagni@unipd.it)
###########################################################################


### CONTENTS ###########################################
## (A) Data
## (B) Model 1
## (B.1) Explorative analysis
## (B.2) Model definition and fit
## (B.3) Plotting results
## (B.4) Predictive check of the model

## (C) Model 2
## (C.1) Model definition and fit
## (C.2) Plotting results
## (C.3) Predictive check of the model
## (C.4) Comparisons between Model 1 and Model 2
########################################################


# Set environment ---------------------------------------------------------
## Start from an empty workspace so that objects left by previous sessions cannot interfere with the lab.
rm(list = ls())
## The folder below is machine-specific: an unconditional setwd() stops the script with an error on any
## other computer. We move there only when it exists; otherwise the relative paths used in this script
## ("utilities.R", "data/data3.Rda") are resolved against the current working directory.
lab_dir <- "/home/antonio/MEGA/Lavoro_sync/Didattica/2021_2022/smda/labs/"
if (dir.exists(lab_dir)) {
  setwd(lab_dir)
} else {
  message("Folder '", lab_dir, "' not found: using the current working directory (", getwd(), ")")
}
## Helper functions of the labs (presumably including posterior_pcheck_Normal() used below - defined outside this file)
source("utilities.R")


# (A) Data ----------------------------------------------------------------
## The dataset comes from a study on reaction times in seconds ('rts') under sleep deprivation (see lab3.R).
## The current sample consists of n=250 units and J=3 variables as follows:
## 'pers_traits': total score obtained by averaging novelty seeking ('novelty_seek'), harm avoidance ('harm_avoid'), reward dependence ('reward_dep');
## 'pers_states': total score obtained by averaging 'anger', 'confusion', 'tension', 'fatigue', 'depression', 'activity';
## 'group': boolean variable for gender.
## Goal: predicting rts under sleep deprivation as a function of the personality measures and group.
load(file = "data/data3.Rda") # makes the data frame 'datax' available in the workspace
utils::head(datax)            # first rows of the dataset
utils::str(datax)             # type of each variable

## As done in lab3.R, the outcome 'rts' is analysed on the logarithmic scale, which makes the Normal
## linear model suitable for the current data.
## Note: log() is the natural logarithm (base e), not log10; this is why the estimates are later
## back-transformed with exp().
datax$logrts <- log(datax$rts)




# (B) Model 1 -------------------------------------------------------------

## (B.1) Explorative analyses
## New graphical device with a 2x3 layout hosting the five plots below (numbered #1-#5)
x11()
par(mfrow = c(2, 3))
cols <- c("darkorange3", "darkolivegreen") # first colour for "M", second for "F"; see: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf

with(datax, {
  # masks selecting the units of each group (local to this block)
  is_m <- group == "M"
  is_f <- group == "F"

  # #1 and #2: outcome against each personality score
  plot(pers_traits, logrts, bty = "n", ylab = "log RTs", xlab = "personality traits")
  plot(pers_states, logrts, bty = "n", ylab = "log RTs", xlab = "personality states")

  # #3: distribution of the outcome within each group
  boxplot(logrts ~ group, frame = FALSE, ylab = "log RTs", xlab = "group")

  # #4: outcome against personality traits, stratified by group
  plot(pers_traits[is_m], logrts[is_m], bty = "n", ylab = "log RTs", xlab = "personality traits", pch = 20, col = cols[1])
  points(pers_traits[is_f], logrts[is_f], pch = 20, col = cols[2])
  legend("topright", legend = c("M", "F"), col = cols, cex = 1.25, pch = 20)

  # #5: outcome against personality states, stratified by group
  plot(pers_states[is_m], logrts[is_m], bty = "n", ylab = "log RTs", xlab = "personality states", pch = 20, col = cols[1])
  points(pers_states[is_f], logrts[is_f], pch = 20, col = cols[2])
  legend("topleft", legend = c("M", "F"), col = cols, cex = 1.25, pch = 20)
})

# Plot #2 suggests a non-linear relationship between the outcome and the predictor 'pers_states'.
# The same pattern is still visible when the units are stratified by 'group' (plot #5).


## (B.2) Model definition and fit
## Forward selection by incremental F-tests. We first define and fit the two extremes of the search: the
## baseline (intercept-only) model and the full model with all the additive and interaction terms.
## Note: datax[, -1] drops the first column of the dataset (presumably the raw outcome 'rts' - check with
## str(datax)) so that it is not picked up by the '.' of the full-model formula.
mod0 <- lm(logrts ~ 1, data = datax[, -1])
mod_full <- lm(logrts ~ .^2, data = datax[, -1]) # .^2 stands for all the additive and (two-way) interaction terms
## At each step add1() tests the terms of 'mod_full' not yet included in the current model, and the
## selected term is added to the model via update()
add1(mod0, scope = mod_full, test = "F")
# step 1 - include the term 'pers_states':
mod1 <- update(mod0, . ~ . + pers_states) # '. ~ .' means same response and same terms as the previous model
add1(mod1, scope = mod_full, test = "F")
# step 2 - include the term 'group':
mod2 <- update(mod1, . ~ . + group)
add1(mod2, scope = mod_full, test = "F")
# step 3 - include the term 'pers_traits':
mod3 <- update(mod2, . ~ . + pers_traits)
add1(mod3, scope = mod_full, test = "F")
# step 4 - include the interaction pers_traits:group
mod4 <- update(mod3, . ~ . + pers_traits:group)
add1(mod4, scope = mod_full, test = "F")
# we can stop here:
summary(mod4)
## Note: the regression coeffs refer to 'logrts' and not to 'rts'!
## To read them on the scale of 'rts' the estimates need to be back-transformed via exp(beta)
## Further info: https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faqhow-do-i-interpret-a-regression-model-when-some-variables-are-log-transformed/

## As an alternative, we can use the procedure implemented in leaps() (see lab3.R) or some specialized
## functions from the 'olsrr' library, e.g. best subset selection:
out <- olsrr::ols_step_best_subset(mod_full)
print(out)
#plot(out)
## Another option is the AIC criterion used in a forward manner:
out <- olsrr::ols_step_forward_aic(mod_full)
print(out)
## Note: the results may differ from those given by the incremental F-test!
## Further info: https://cran.r-project.org/web/packages/olsrr/vignettes/variable_selection.html

## (B.3) Plotting results
plot(effects::allEffects(mod4)) # graphical display of the marginal effects of 'mod4'
effects::allEffects(mod4)       # the same marginal effects, printed numerically
car::avPlots(mod4)              # partial regression plots

## (B.4) Predictive check of the model
## Diagnostics can always be run with the functions studied in lab3.R. In addition, the fitted model can
## be evaluated by means of a posterior predictive check, i.e. by assessing how well the model reproduces
## some statistics of the original data (see: Gelman et al., 2020).
posterior_pcheck_Normal(fitted_model = mod4, M = 100) # note: M should usually be larger (e.g., M>1000)
## The routine simulates M new datasets from the fitted model and compares them with the observed one.
## In this example the densities of the simulated data are drawn in gray and the density of the observed data in blue.
## Two summaries are also returned: the degree of overlapping between the two densities (the larger the OV
## index, the better the fit) and the delta_t index, which is interpreted as a probability for a set of statistics.
## Here the model fails in representing the min/max statistics of the observed data, whereas the mean
## (and the variance too) is well represented.



# (C) Model 2 -------------------------------------------------------------

## The relationship between the outcome variable ('logrts') and the predictor 'pers_states' seems
## to be non-linear. Observed and estimated relationships are compared side by side:
x11()
par(mfrow = c(1, 2))
plot(datax$pers_states, datax$logrts, bty = "n", ylab = "log RTs", xlab = "personality states") # observed relationship
car::avPlot(mod4, variable = "pers_states") # relationship as estimated by 'mod4'
## The previous model ('mod4') treats this relationship as linear. In this section we build a new model
## where the relationship is non-linear (i.e., quadratic, cubic) and then compare it with the previous one.

## (C.1) Model definition and fit
## Starting from the terms selected in the previous section, the best candidate is chosen via the AIC criterion.
mod4$call # formula of the previous model
## NOTE(review): in both candidates only the power term of 'pers_states' enters the model, without the
## lower-order terms - confirm that this is the intended specification.
mod5 <- lm(logrts ~ I(pers_states^2) + group + pers_traits + group:pers_traits, data = datax) # quadratic term
mod6 <- lm(logrts ~ I(pers_states^3) + group + pers_traits + group:pers_traits, data = datax) # cubic term
performance::compare_performance(mod5, mod6, bayesfactor = FALSE)
# The model with the lowest AIC (and the lowest RMSE) is 'mod6', which includes a cubic term for 'pers_states'.

## (C.2) Plotting results
plot(effects::allEffects(mod6)) # marginal effects of 'mod6', computed by effects::allEffects(mod6)
car::avPlot(mod6, variable = "I(pers_states^3)") # estimated relationship for the cubic term (in particular)

## (C.3) Predictive check of the model
posterior_pcheck_Normal(fitted_model = mod6, M = 100) # note: M should usually be larger (e.g., M>1000)


## (C.4) Comparisons between Model 1 and Model 2
## Fit indices of the linear ('mod4') and cubic ('mod6') models
performance::compare_performance(mod4, mod6, bayesfactor = FALSE)

## Predictive checks of the two models, one panel each on a new device
x11()
par(mfrow = c(1, 2))
posterior_pcheck_Normal(fitted_model = mod4, M = 150, titlep = "mod4") # note: M should usually be larger (e.g., M>1000)
posterior_pcheck_Normal(fitted_model = mod6, M = 150, titlep = "mod6") # note: M should usually be larger (e.g., M>1000)


