#######################################################################
## Social Network Analysis
## Master in 'Data Science per il Welfare'
## prof. Antonio Calcagnì (antonio.calcagni@unipd.it)
#######################################################################


## CONTENTS ###########################################################
# (A) Creating the graph/network
# (B) ERG-based data analysis
# (C) ERG model fit
#######################################################################


# Initial settings --------------------------------------------------------
# Start from an empty workspace and close every open graphics device.
rm(list = ls())
graphics.off()
# Working directory of the course material: change it according to your local path!
setwd("~/MEGA/Lavoro_sync/Didattica/2023_2024/socNetwork_analysis/")
# Packages used throughout the script.
library(ergm)
library(network)

# Disclaimer: 
# The materials covered here are mostly based on the following resource https://rstudio-pubs-static.s3.amazonaws.com/471073_d45a4acd780b4987932dc8fc47c46dd5.html


# (A) Creating the graph/network ------------------------------------------

## Import the datasets (semicolon-separated text files with a header row)
El <- read.table(file = 'labs/data/SC_edgelist.csv', header = TRUE, sep = ";") # edgelist
Dt <- read.table(file = 'labs/data/SC_students.csv', header = TRUE, sep = ";") # node characteristics

# Check whether the node table has missing values.
anyNA(Dt)
# The dataset contains NA: drop every row holding at least one missing value.
Dt <- na.omit(Dt)

## Create the adjacency matrix
# Units for the analysis: every ID appearing as sender or receiver of a tie (repetitions removed).
students <- unique(c(El$studentID, El$friend.ID.code))
# Empty (all-zero) square matrix with one row and one column per student, labelled by ID.
A <- matrix(data = 0, nrow = length(students), ncol = length(students))
rownames(A) <- as.character(students)
colnames(A) <- as.character(students)
# Position of each tie's sender (row) and receiver (column) within `students`.
# The columns are addressed by name, consistently with how `students` is built,
# and missing IDs are left unmatched so that they never generate a tie.
sender_idx <- match(El$studentID, students, incomparables = NA)
receiver_idx <- match(El$friend.ID.code, students, incomparables = NA)
is_complete <- !is.na(sender_idx) & !is.na(receiver_idx)
# Fill all the observed ties at once (two-column matrix indexing); this also works with an empty edgelist.
A[cbind(sender_idx[is_complete], receiver_idx[is_complete])] <- 1
head(A)

## Build a directed graph with the library 'network' from the adjacency matrix, then draw it
gnet <- network(x = A, directed = TRUE)
x11()
plot(gnet)

## Adding covariates as network attributes
# Attribute values are assigned to the vertices by position, so Dt is expected to hold exactly
# one row per vertex of gnet. Rows with NA were dropped from Dt, hence the two sizes may differ:
# flag the mismatch instead of letting it go unnoticed.
# NOTE(review): even when the sizes agree, the rows of Dt are assumed to follow the vertex order
# (the order of `students`) -- confirm, or re-order Dt by its student ID column before this step.
if (nrow(Dt) != network.size(gnet)) {
  warning("Dt has ", nrow(Dt), " rows but gnet has ", network.size(gnet),
          " vertices: vertex attributes may be misaligned.", call. = FALSE)
}
set.vertex.attribute(gnet, 'sex', as.character(Dt$sex)) # sex
set.vertex.attribute(gnet, 'class', as.character(Dt$class)) # class
set.vertex.attribute(gnet, 'hsport', as.numeric(Dt$sport.hours)) # sport hours
set.vertex.attribute(gnet, 'grade', as.numeric(Dt$marks.overall.num)) # final grade
set.vertex.attribute(gnet, 'game', as.character(Dt$game)) # gamer or not
set.vertex.attribute(gnet, 'likeschool', as.character(Dt$liking.school)) # school pleasant or not
set.vertex.attribute(gnet, 'tvh', as.numeric(Dt$hr.tv)) # tv hours



# (B) ERG-based data analysis ---------------------------------------------
# In what follows we will show how to define and fit an ERG model accounting for endogenous network statistics and exogenous variables/covariates.

# Statistic: Edges (null model)
# Baseline ERG model containing the edges term only.
m0 <- ergm(gnet ~ edges)
summary(m0)

# Statistic: Mutual
# This statistic is used in directed networks to indicate how many ties are reciprocated
summary(gnet ~ mutual) # observed value of the statistic (no model is fitted here)
# Let us test whether there are more mutual ties than expected in our network.
# The model is estimated via MCMC: the seed is fixed to make the results reproducible.
m1 <- ergm(formula = gnet ~ edges + mutual, control = control.ergm(seed = 12345))
summary(m1)
# The number of mutual ties is compared to generated random networks with the same density.
# The effect is positive and significant, indicating there are more mutual links in our network than what one would expect
# from a random network with 134 edges.

# Let's explore how differences in gender affect the reciprocity of friendships
m1b <- ergm(formula = gnet ~ edges + mutual('sex', diff = TRUE), control = control.ergm(seed = 12345))
summary(m1b)
# Female students have a slightly higher propensity to engage in mutual behavior.
# Overall, the effects are quite similar for the two groups

# Statistic: Degree
# It checks whether the number of nodes with a specific degree is more or less likely than in a random network.
# Let's see how many nodes have an in/out-degree between 0 and 10.
summary(gnet ~ idegree(0:10)) # IN-degrees
summary(gnet ~ odegree(0:10)) # OUT-degrees

# Instead of directly using degrees in the model, a more appealing way to measure the in- or out-degree distributions is based on the
# gwdegree statistic. It stands for 'geometrically weighted' degree distribution and measures a node's tendency to have multiple outgoing or incoming ties.
# The weights are applied to the increasing number of connections each node has. As such, one of the parameters (the α value) of the term is the decay
# in weight: Values close to 0 give more relative weight to smaller degree-counts. The decay behavior is similar to that of the alternating k-star (altkstar) statistic.

# We can explore different weights to see how they change the estimation (use AIC or BIC to select the best one).
# Both models are estimated via MCMC: the seed is fixed to make the fits (and their AIC) reproducible.
m2a <- ergm(formula = gnet ~ edges + mutual + gwodegree(0.1, fixed = TRUE) + gwidegree(0.1, fixed = TRUE),
            control = control.ergm(seed = 12345)) # α=0.1
m2b <- ergm(formula = gnet ~ edges + mutual + gwodegree(0.25, fixed = TRUE) + gwidegree(0.25, fixed = TRUE),
            control = control.ergm(seed = 12345)) # α=0.25
AIC(m2a, m2b)
# Changing the decay parameter does not seem to affect the results much.
AIC(m1, m2b) # let's proceed with the model m2b

# Statistic: Triadic structures and ESP
# The goal is to measure the number of triads in the network and compare it to the number of triads in random networks.
# Note that this statistic has a tendency to produce 'model degeneracy' (a kind of non-convergent model).
# In this case, the simulations based on the triads term produce networks too dissimilar from the observed network.
# A degenerate model is of no use for the data analysis.

# Instead of considering the limiting triangles in a network, we move on to the edgewise shared partner (ESP) statistics.
# This set of terms measures how many connections two nodes have in common. They can be quite complex structures.
summary(gnet ~ esp(0:10))
# We can use the geometrically weighted ESP term.
# As for the previous weighted statistic, it includes a parameter to determine the weight for the relationships with a higher number of shared partners.
# The seed is fixed for reproducibility. Since this fit is known to be problematic (see the note below),
# an error raised by ergm() is caught and reported so that the rest of the script can still run when sourced.
m3 <- tryCatch(
  ergm(formula = gnet ~ edges + mutual + gwodegree(0.25, fixed = TRUE) + gwidegree(0.25, fixed = TRUE) + gwesp(0.5, fixed = TRUE),
       control = control.ergm(seed = 12345)),
  error = function(e) {
    message("The gwesp model could not be fitted: ", conditionMessage(e))
    NULL
  }
)
# Note: The model does not converge at all. So we need to exclude this term from the analysis.

# Adding covariates
# All the models below are estimated via MCMC: the seed is fixed to make the fits (and their AIC) reproducible.
# NOTE(review): unlike m2b, these models do not include gwidegree(0.25, fixed = TRUE), so the AIC
# comparisons with m2b involve models differing in more than the added covariates -- confirm this is intended.

# First: Let's explore the differences in the way connections are made between nodes given a categorical covariate.
m3 <- ergm(formula = gnet ~ edges + mutual + gwodegree(0.25, fixed = TRUE) + nodefactor('sex'),
           control = control.ergm(seed = 12345))
summary(m3)
# There is not a statistically significant difference between how men and women make connections in this particular network.
# As this is a directed network, we can differentiate between in-degree and out-degree to see how the attribute affects incoming and outgoing ties:
m3b <- ergm(formula = gnet ~ edges + mutual + gwodegree(0.25, fixed = TRUE) + nodeifactor('sex') + nodeofactor('sex'),
            control = control.ergm(seed = 12345))
summary(m3b)
AIC(m2b, m3) # use m2b

# Since gender does not seem to affect (incoming or outgoing) friendship formation rates, let's see if other categorical or continuous variables matter.
m4 <- ergm(formula = gnet ~ edges + mutual + gwodegree(0.25, fixed = TRUE) + nodefactor('game') + nodefactor('likeschool') + nodecov('hsport'),
           control = control.ergm(seed = 12345))
summary(m4)
AIC(m2b, m4)

# Statistic: Dyadic terms
# It helps us measure whether students have a tendency to nominate friends with whom they share a particular attribute.
# In other words, it measures homophily in a network based on a categorical variable.
# The model is estimated via MCMC: the seed is fixed to make the results reproducible.
m5 <- ergm(formula = gnet ~ edges + mutual + gwodegree(0.25, fixed = TRUE) + nodefactor('class') + nodematch('class', diff = TRUE),
           control = control.ergm(seed = 12345))
summary(m5)
# In general, being in the same class does not increase the probability of a connection between two nodes.

# A candidate final model: shared partners, class and sex effects, grades,
# gaming, liking school, TV hours, reciprocity and density.
m6_formula <- gnet ~
  gwesp(.25, fixed = TRUE) +
  nodematch('class') + nodefactor('class') +
  nodematch('sex') + nodeifactor('sex') +
  nodeicov('grade') +
  nodeofactor('game') +
  nodeofactor('likeschool') +
  absdiff('tvh') +
  mutual +
  edges

# Fit with a fixed seed so that the MCMC-based estimates are reproducible.
m6_control <- control.ergm(seed = 12345)
m6 <- ergm(formula = m6_formula, control = m6_control)
summary(m6)
AIC(m2b, m6) # again, choose m2b



# (C) ERG model fit -------------------------------------------------------
# Goodness of fit of the selected model (m2b) based on 500 simulated networks.
out <- btergm::gof(m2b, nsim = 500)
x11()
plot(out) # it's fine!
# MCMC diagnostics: no evidence of degeneracy (the MC chains mixed up very well)
mcmc.diagnostics(m2b)




