#######################################################################
## Social Network Analysis
## Master in 'Data Science per il Welfare'
## prof. Antonio Calcagnì (antonio.calcagni@unipd.it)
#######################################################################


## CONTENTS ###########################################################
# (A) Import the data and create the graph
# (B) A few exploratory analyses
# (C) Exponential Random Graph model
#######################################################################


# Initial settings --------------------------------------------------------
# Start from a clean session: empty the workspace and close any open graphics device.
rm(list = ls())
graphics.off()

# All data paths used below are relative to this folder.
# Change it according to your local path!
setwd("~/MEGA/Lavoro_sync/Didattica/2023_2024/socNetwork_analysis/")

# igraph: graph construction and descriptive analyses; ergm: ERG models.
library(igraph)
library(ergm)



# (A) Import the data and create the graph --------------------------------
# Dataset description: The dataset contains information on conflicts among 66 world countries during the Cold War period.
# The dataset is structured as a matrix where each cell contains the number of conflicts registered from 1950 to 1985.

# Read the space-separated table of conflict counts and coerce it to a matrix.
war <- read.csv(file = "labs/data/war.txt", sep = " ")
war <- as.matrix(war)
head(war) # the original matrix contains counts

# Dichotomize the counts: 1 = at least one conflict, 0 = no conflict.
war <- 1 * (war > 0)
# Drop self-conflicts (no loops in the graph).
diag(war) <- 0

x11()
lattice::levelplot(war)
# The observed matrix of co-occurrences is sparse.

# Note: the basic formulation of Exponential Random Graph Models (ERGMs) is primarily
# designed for unweighted networks. To accommodate weights, there are extensions of ERGMs
# that can handle weighted networks, such as Generalized Exponential Random Graph
# Models (GERGMs).

# Create the undirected, unweighted graph from the adjacency matrix.
gwar <- graph_from_adjacency_matrix(war, mode = "undirected", weighted = NULL, diag = FALSE)
str(gwar)

# Fruchterman-Reingold layout, network plot and degree distribution.
gwarl <- layout_with_fr(graph = gwar)
x11()
plot(gwar, layout = gwarl, vertex.label.font = 2, vertex.size = 15)
x11()
hist(degree(gwar), breaks = 10)

# The graph is not fully connected. Indeed, there are countries that show no
# connections/conflicts with the others. We can therefore decide to prune the graph
# by removing the vertices with no edges (degree lower than 1).
iid <- which(degree(gwar) < 1)
gwar <- delete_vertices(gwar, iid)

is_connected(gwar) # the network is still not connected
any_multiple(gwar) # no multiple connections
any_loop(gwar)     # no loops
# Then, the graph gwar is said to be 'simple'.



# (B) A few exploratory analyses ------------------------------------------

# We can run a clique-based analysis and look for maximal cliques of size 2 and 3.
# The maximal cliques are computed once and reused below.
mcl <- max_cliques(gwar)
out <- vapply(mcl, length, integer(1))
table(out) # how many maximal cliques for each size

mcl3 <- mcl[out == 3] # retrieve the maximal cliques of size 3
mcl2 <- mcl[out == 2] # retrieve the maximal cliques of size 2

x11()
par(mfrow = c(1, 2))

# Plot 2-cliques
V(gwar)$color <- rep("lightblue", vcount(gwar))
V(gwar)$color[unlist(mcl2)] <- "orange"
gwarl <- layout_with_fr(graph = gwar) # layout recomputed on the pruned graph
plot(gwar, layout = gwarl, vertex.label.font = 2, vertex.size = 15, main = "2-cliques")

# Plot 3-cliques
V(gwar)$color <- rep("lightblue", vcount(gwar))
V(gwar)$color[unlist(mcl3)] <- "red"
plot(gwar, layout = gwarl, vertex.label.font = 2, vertex.size = 15, main = "3-cliques")

# We can improve the graphical representation by highlighting those nodes
# that show a high level of centrality.
btw <- betweenness(graph = gwar)
sort(btw, decreasing = TRUE) # as expected, USA and USR are the countries with the highest betweenness
iid <- as.numeric(V(gwar)[c("USA", "USR")])
V(gwar)$color[iid] <- "gray"
x11()
plot(gwar, layout = gwarl, vertex.label.font = 2, vertex.size = 15, main = "3-cliques")

# Global and local transitivity
transitivity(graph = gwar, type = "global")
transitivity(graph = gwar, type = "local", vids = iid) # using the previously identified central nodes

# Looking for communities
gwar_cl <- cluster_louvain(graph = gwar)
x11()
plot(gwar_cl, gwar, layout = gwarl, vertex.label.font = 2, vertex.size = 15)
modularity(gwar_cl)

# Observed number of Louvain communities. It is stored because the Louvain algorithm
# is stochastic: the value must come from the partition actually obtained above.
n_comm_obs <- max(gwar_cl$membership)
n_comm_obs
# Too many communities? Let's try another method:

gwar_cl2 <- cluster_infomap(graph = gwar)
x11()
plot(gwar_cl2, gwar, layout = gwarl, vertex.label.font = 2, vertex.size = 15)
modularity(gwar_cl2)
max(gwar_cl2$membership)

# Is the number of communities not a random result?
# Use the Erdos-Renyi graph (same number of vertices and edges) as null model.
set.seed(121)
B <- 1000
x <- numeric(B) # number of Louvain communities found in each random graph
for (b in seq_len(B)) {
  gb <- sample_gnm(n = vcount(gwar), m = ecount(gwar), directed = FALSE, loops = FALSE)
  x[b] <- max(cluster_louvain(graph = gb)$membership)
}
x11()
plot(table(x), type = "h", bty = "n")
summary(x)
# Probability, under the null distribution, of a statistic at least as large as the observed one
sum(x >= n_comm_obs) / B



# (C) Exponential Random Graph model --------------------------------------
# User's guide: https://cran.r-project.org/web/packages/ergm/vignettes/ergm.pdf
# A list of network statistics (e.g., triangles, k-stars, ...) that can be used in the 'ergm'
# library can be found here: https://cran.r-project.org/web/packages/ergm/vignettes/ergm-term-crossRef.html

# Let's start by visualizing again the network 'war'
x11()
plot(gwar, layout = gwarl, vertex.label.font = 2, vertex.size = 15)

# To detect the components of the graph
out <- components(gwar)
str(out)
# We can recognize from the output that - in terms of connections - the graph is composed of
# three components, with 58, 2, and 2 elements respectively. The two small components are
# exactly the disconnected pairs formed by ARG-CHL and DOM-HAI.
# Let's remove them from the graph. The largest component is identified by its size
# rather than by assuming that it carries the label 1.
main_comp <- which.max(out$csize)
iid <- which(out$membership != main_comp)
gwar <- igraph::delete_vertices(gwar, iid)
summary(gwar)

# Now we can turn to the problem of defining and estimating the ERG model for the network 'gwar'.
library(network)
# Detach libraries to avoid name conflicts with ergm and related libraries.
# Only packages that are actually attached are detached: detach() raises an error
# otherwise (e.g., 'sand' is not loaded by this script).
for (pkg in c("package:sand", "package:igraph")) {
  if (pkg %in% search()) detach(pkg, character.only = TRUE)
}
# We need to transform the 'igraph' object into a 'network' object. So,
A <- as.matrix(igraph::as_adjacency_matrix(graph = gwar)) # compute the adjacency matrix
gnet <- network(x = A, directed = FALSE)
network.vertex.names(gnet) <- colnames(A) # add vertex names
print(gnet)

# The simplest ERG model you can run includes only an EDGES term (baseline/null model).
# This basic model only accounts for the density of the network.
m0 <- ergm(formula = gnet ~ edges, estimate = "MPLE") # Maximum Pseudo-Likelihood Estimation
summary(m0)
b_edges <- coef(m0)["edges"]
plogis(b_edges) # same as exp(b_edges) / (1 + exp(b_edges))
# Similarly to logistic models, the coefficient is a log-odds: the inverse-logit transform
# maps it back to a probability, i.e. the probability of a tie.
# This probability is equal to the observed density of the network, namely the number of
# edges divided by the total number of dyads (pairs of nodes):
network::network.edgecount(gnet) / network::network.dyadcount(gnet)

# We can also evaluate whether nodes with specific DEGREES are more or less likely than
# in a random network.
# Let's compute the degree distribution using the ergm functions:
summary(gnet ~ degree(0:20))
m1 <- ergm(formula = gnet ~ edges + degree(2:7), estimate = "MPLE")
summary(m1)
# The second model gets a lower AIC, so this endogenous (structural) feature should be
# included into the model.
plogis(coef(m1)) # coef() is the accessor for the estimated coefficients

# Let's add a measure of clustering, namely the number of (completed) triads/triangles.
# Any model containing the ergm term 'triangle' has the property that dyads are not
# probabilistically independent of one another. As a result, the default estimation
# algorithm would be MCMC-based. The degree terms of 'm1' are dropped here for the sake
# of simplicity.
# Note that MCMC can take a while before convergence. So, to save time, let's use the
# pseudo-likelihood method (although we know it is not appropriate in this case).
# In the MCMC case, one also needs to adjust the control parameters of the chains, e.g.:
# control = control.ergm(seed = 12345, MCMC.burnin = 500, MCMC.samplesize = 2000, parallel = 8)
m2 <- ergm(formula = gnet ~ edges + triangle, estimate = "MPLE")
summary(m2)
# Interpretation of the coefficients is as follows (values from the author's fit):
# The conditional log-odds of two nodes having a tie - keeping the rest of the network fixed - is
## For a tie with no triangle: -3.63 (prob=0.025)
## For 1 triangle: -3.63 + 1*0.70 = -2.93 (prob=0.050)
## For 2 triangles: -3.63 + 2*0.70 = -2.23 (prob=0.097)
## For 3 triangles: -3.63 + 3*0.70 = -1.53 (prob=0.178)

# Let's add a measure of popularity, namely the number of k-stars:
summary(gnet ~ kstar(3:4)) # consider 3- and 4-stars only
m3 <- ergm(formula = gnet ~ edges + triangle + kstar(3:4), estimate = "MPLE")
summary(m3)

# Compare and assess the models being fitted
AIC(m0, m1, m2, m3)
# The model 'm3' is the one reaching the minimum AIC (best model).
# However, it poorly fits the data, as the goodness-of-fit (GOF) procedure indicates.
x11()
par(mfrow = c(2, 2))
# The fitted model is passed as first argument and the statistics to be checked are given
# through the 'GOF' formula (documented interface of gof() for 'ergm' objects).
plot(gof(m3, GOF = ~ model + degree + esp + distance))
# The boxplots show the distribution of descriptive statistics for the simulated networks.
# The black line shows the statistic for the observed network.
# In general, a model that fits well is a model where the black line is generally within
# the gray boxplots.
# Top left: shows the descriptive statistics that are related to all the parameters you've included.
# Top right: shows the degree distribution.
# Bottom left: edge-wise shared partners distribution (it measures how many connections two
#              connected nodes have in common).
# Bottom right: distribution of geodesic distances.

# Notes on GOF:
# The ERG model can be seen as a generative machinery which tries to represent the process
# that governs the global patterns of ties from a 'local perspective'. The locally generated
# processes in turn aggregate up to produce 'global' network properties, even though these
# global properties are not explicit terms in the model.
#
# To assess whether a local model 'fits the data' one can check how well it reproduces the
# observed global network properties that are not in the model. Then, one compares the value
# of the statistics observed in the original network to the distribution of values obtained
# from networks simulated from the current network model.
#
# The gof() call above checks three 'global characteristics' of a network, i.e.:
# degree, edge-wise shared partners (esp), and geodesic distance.
# Each of these terms captures an aggregate network distribution, at either
# the node level (degree), the edge level (esp), or the dyad level (distance).

# Try out the effect of exogenous variables or covariates.
war_covs <- read.csv(file = "labs/data/war_covs.txt", header = FALSE)
# For each country we have a numeric value between 0 and 1 indicating their likelihood of
# getting involved in a worldwide war during the period 1950-1985.
head(war_covs)
# We can dichotomize the node-level covariate:
prob_war <- ifelse(war_covs[, 2] < 0.5, "low", "high")

# The network has been pruned (isolates and small components removed), so the covariate
# must be aligned to the vertices that are still in 'gnet'. Assigning a longer vector would
# be silently truncated and attach values to the wrong countries.
vnames <- network.vertex.names(gnet)
# Presumably the first column of the file holds the country codes - TODO confirm.
cov_rows <- match(vnames, trimws(as.character(war_covs[, 1])))
if (anyNA(cov_rows)) {
  if (nrow(war_covs) == network.size(gnet)) {
    # Same length as the network: assume the rows already follow the vertex order.
    cov_rows <- seq_len(nrow(war_covs))
  } else if (exists("war") && nrow(war_covs) == ncol(war)) {
    # One row per country of the original matrix: assume the same ordering as its columns.
    cov_rows <- match(vnames, colnames(war))
  } else {
    stop("Cannot align 'war_covs' with the vertices of 'gnet'", call. = FALSE)
  }
}
# Now, let's treat the covariate as node attribute:
gnet %v% "riskWar" <- as.character(prob_war[cov_rows])

# We are going to explore the differences in the way connections are made between nodes
# given the categorical covariate.
# levels = 1: only the statistic of the first level (alphabetically, "high") is included,
# so the level "low" is used as baseline.
m4 <- ergm(
  formula = gnet ~ edges + triangle + kstar(3:4) + nodefactor("riskWar", levels = 1),
  estimate = "MPLE"
)
summary(m4)
# Results suggest that there is not a statistically significant difference between how low
# and high risk countries make connections in this particular network.

# Let's add a node-level numeric covariate.
# Consider the raw data 'war.txt' (square matrix of conflict counts, one row/column per country)
wardata <- as.matrix(read.csv(file = "labs/data/war.txt", sep = " "))
diag(wardata) <- 0
# and create a variable about the 'propensity' of each country to get involved into a conflict
x <- rowSums(wardata)
x <- x / max(x) # x->0 indicates lower risk/propensity
names(x) <- colnames(wardata) # the vertex names of the network come from these column names

# The raw matrix still contains all the countries, whereas 'gnet' only contains those kept
# after pruning: select the values by vertex name so that each value goes to its own country
# (a longer vector would be silently truncated and misaligned).
conflicts <- x[network.vertex.names(gnet)]
if (anyNA(conflicts)) {
  stop("Some vertices of 'gnet' were not found among the columns of 'war.txt'", call. = FALSE)
}
gnet %v% "conflicts" <- unname(conflicts) # add the new variable as node attribute

# Include (additively) the new variable into the model now.
# We check whether countries with higher propensity tend to have more ties in the network.
m5 <- ergm(
  formula = gnet ~ edges + triangle + kstar(3:4) + nodefactor("riskWar", levels = 1) +
    nodecov("conflicts"),
  estimate = "MPLE"
)
summary(m5)
# No significant effect for the continuous covariate.

# Is there any interaction between the covariates?
m6 <- ergm(
  formula = gnet ~ edges + triangle + kstar(3:4) +
    nodefactor("riskWar", levels = 1) * nodecov("conflicts"),
  estimate = "MPLE"
)
anova(m5, m6)
AIC(m5, m6)
# No, there is not.

# Adding a measure of homophily in the analysis (dyadic-level covariates).
# Are nodes with the same attribute levels more likely to be connected?
# Particularly, do countries tend to have conflicts with countries of the same risk level?
# Note: 'm6' is overwritten here; the interaction model above is no longer available.
m6 <- ergm(
  formula = gnet ~ edges + triangle + kstar(3:4) + nodefactor("riskWar", levels = 1) +
    nodematch("riskWar"),
  estimate = "MPLE"
)
summary(m6)
# No dyadic effect produced by the categorical covariate.
# Note: the function nodematch() allows for quantifying the homophily in a network based on
# an external variable.

# For continuous covariates, the homophily feature can be assessed using the function absdiff():
m7 <- ergm(
  formula = gnet ~ edges + triangle + kstar(3:4) + nodecov("conflicts") + absdiff("conflicts"),
  estimate = "MPLE"
)
summary(m7)

# Note:
# These dyadic terms should be used while also including the 'main effect' of the same
# variable (nodefactor() for categorical covariates, nodecov() for continuous ones).
# The reason for this is controlling the possible over-representation of ties between
# nodes that share an attribute.



