#### This script runs part of the analysis presented in Stephenson et al. 'Parasites of Trinidadian guppies: Evidence for sex- and age-specific trait-mediated indirect effects of predators'. This analysis uses a model averaging approach to the data because we found during stepwise deletion of the model terms that there are several equally well supported models. We also have a number of explanatory variables and wanted to be confident in the parameter estimates and relative importance of these. Please see the text of the paper for more details, and references.

dframe1 <- read.csv(file.choose())    # Select "Trini_data.csv"

###### Check the data is being read correctly
summary(dframe1)

##### Make sure variables are read as either numeric or factors - this also gives a description of each of the columns in the datafile
dframe1$W <- as.numeric(dframe1$wgt) # fish weight
dframe1$gyro<-as.numeric(dframe1$gyro) # count of gyrodactylid parasites on that fish
dframe1$year<-as.factor(dframe1$year) # year the fish was sampled

dframe1$presG<-as.factor(dframe1$presG) # presence/absence of gyrodactylid parasites
dframe1$prestrich<-as.factor(dframe1$prestrich) # presence/absence of trichodina
dframe1$presWS<-as.factor(dframe1$presWS) # presence/absence white spot
dframe1$presApio<-as.factor(dframe1$presApio) # presence/absence apiosoma
dframe1$presfung<-as.factor(dframe1$presfung) # presence/absence fungal infection 
dframe1$presDig<-as.factor(dframe1$presDig) # presence/absence of digenean metacercariae 
dframe1$presCN<-as.factor(dframe1$presCN) # presence/absence of camallanus nematodes 

dframe1$richness.no.gyro<-as.numeric(dframe1$richness.no.gyro) # richness score not including gyrodactylid parasites
dframe1$richness.w.gyro<-as.numeric(dframe1$richness.w.gyro) # richness score including gyrodactylid parasites

dframe1$site<-as.factor(dframe1$site) # arbitrary site number within the course of each river - not useful alone but necessary for the next step:

###### Create nested spatial variables
dframe1 <- within(dframe1, {nriver<-factor(drainage:river)
                            ncourse <- factor(drainage:river:course)
                            nsite <- factor(drainage:river:course:site) })

###### create a subsetted dataframe with just 2003, 2004, 2008 because of model convergence issues
dfrn89<-subset(dframe1, year %in% c('2003','2004','2006'))# because the sample sizes for 2008 and 2009 are so much smaller than for the other years there are problems with the model convergence.

###### rescale variables to aid model fitting
library("arm")
dfrn89$presG<-rescale(dfrn89$presG, binary.inputs="0/1")
dfrn89$W<-rescale(dfrn89$W)
dfrn89$prestrich<-rescale(dfrn89$prestrich,binary.inputs="0/1")

###### remove any missing values
dfrn89<-na.omit(dfrn89)

####### construct the global model with all the terms about which you have an a priori hypothesis

library(lme4)

### N.B. The new versions of lme4 have become more sensitive to non-convergence, to the point of producing false positives. This is especially likely in large models like those below. Please note that the messages are warnings only, not error messages, and that lme4 developers are aware that the convergence tests are too sensitive.

model1<-glmer(presG~class+W+year+course+course:class+course:W+class:W+prestrich+prestrich:class+(1|drainage/river/course/site), data=dfrn89, family=binomial, na.action=na.fail, control=glmerControl(optimizer="bobyqa")) ### This model elicits warning messages.

######## RANDOM EFFECTS: simplifying the hierarchical spatial term:

#model 2 has 'drainage' removed, so we have to use the 'nriver' (nested river) term. model 3 and 4 have successively simpler random terms.

model2<-glmer(presG~class+W+year+course+course:class+course:W+class:W+prestrich+prestrich:class+(1|nriver/course/site), data=dfrn89, family=binomial, na.action=na.fail, control=glmerControl(optimizer="bobyqa"))

model3<-glmer(presG~class+W+year+course+course:class+course:W+class:W+prestrich+prestrich:class+(1|ncourse/site), data=dfrn89, family=binomial, na.action=na.fail, control=glmerControl(optimizer="bobyqa"))

model4<-glmer(presG~class+W+year+course+course:class+course:W+class:W+prestrich+prestrich:class+(1|nsite), data=dfrn89, family=binomial, na.action=na.fail, control=glmerControl(optimizer="bobyqa"))

AIC(model1, model2, model3, model4) # compare the AIC values for the models that vary in the random term - model4 has the lowest AIC and is therefore the best.

######## R squared for the models - following Nagakawa and Schielzeth (2013) - compares the effects of the different random terms
library("MuMIn")
r.squaredGLMM(model1) # Gives the marginal (fixed effects only), and conditional (random+fixed effects) R2 for the model - i.e. the proportion of the variation in the data explained by the model
r.squaredGLMM(model2)
r.squaredGLMM(model3)
r.squaredGLMM(model4)


########## Likelihood Ratio Tests for the models to compare random terms

anova(model1, model2, test="Chisq") # LRT to compare two models - they only differ in that model1 contains 'drainage' in the spatial term, so the P value refers to the importance of 'drainage' in the model.

anova(model2, model3, test="Chisq") # LRT to compare two models - they only differ in that model2 contains 'river' in the spatial term, so the P value refers to the importance of 'river' in the model.

anova(model3, model4, test="Chisq") # LRT to compare two models - they only differ in that model3 contains 'course' in the spatial term, so the P value refers to the importance of 'course' in the model.

## N.B. because site is the smallest spatial scale we cannot run the model without it without substantially changing the model (i.e. it would no longer have a random term). We assessed the importance of site by calculating the percentage of variance it explained using the model output. As follows:

summary(model1) # focus on the variance portion of the output: 'Random effects'. The exact numbers might differ depending on the version of lme4 used.


100*(4.42/(4.42+1.89+0.55+0.36)) # percentage explained by site
100*(1.89/(4.42+1.89+0.55+0.36)) # percentage explained by course
100*(0.55/(4.42+1.89+0.55+0.36)) # percentage explained by river
100*(0.36/(4.42+1.89+0.55+0.36)) # percentage explained by drainage

### So putting all of this information together for the random effects - site is the only spatial level which explains a significant portion of the variation - model4 is the best

############FIXED EFFECTS: model averaging

####### Standardise the model so all factors have a mean of 0 and a standard deviation of 0.5 so that you can easily compare/average between models ######

library("arm")
stdz.model <- standardize(model4, standardize.y = FALSE)## this could take some time
summary(stdz.model)


######## Use dredge to go through all the models that are encorporated by the global model, and rank them according to AIC so you can pick the best ones #########

library('MuMIn')
model.set <- dredge(stdz.model, rank='AIC') ## this step could take some time
model.set

######## Select the top models, e.g. define top models as having delta AIC < 4

top.models <- get.models(model.set, subset = delta <4) ## this step could take some time
top.models
summary(top.models)


######## Obtain parameter estimates averaged across the top.models set
averaged.model <- model.avg(top.models,revised.var = TRUE,
                            adjusted = TRUE, rank = "AIC")         # recalculates the model weights, for the set of top models
averaged.model
summary(averaged.model)
confint(averaged.model)