# ********************** Preparing the data **********************************

# In the following, the file "dreissena_in_lakes_of_belarus.csv" is assumed to be stored 
# in the current R working directory.

# Load the data in:
spread <- read.csv(""dreissena_in_lakes_of_belarus.csv"")

# Make sure "ZMpresence" is treated as factor:
spread$ZMpresence = factor(spread$ZMpresence)

# Remove MAXD, TRANSP, and PermOx from the data frame (see paper for explanation):
dat = spread[, -c(5, 8, 22)]

# Create a subset with infested lakes only:
true.positive = subset(dat, ZMpresence == 1)

# Create a subset with non-infested lakes only:
all.zeros = subset(dat, ZMpresence != 1)

# Create a subset of lakes that can potentially be infested (see paper for details):
potential.positive = subset(all.zeros,
                            pH >= 7.47 &
                            Ca >= 23.1 & TDS >= 158)

# Combine invaded lakes and lakes suitable but not yet invaded:
temporary.data = rbind(true.positive, potential.positive)

# Load the caret package, which is used to remove highly collinear variables:
library(caret)

# Calculate correlation matrix for all quantitative predictors:
predictorsCor = cor(temporary.data[, -c(1, 2, 20, 21)], # remove
                    # variables Lake_Code, ZMpresence, N and E
                    # from correlation calculations
                    method = "spearman", use = "complete.obs")

# Find indices of the highly correlated, redundant predictors:
highCorr = findCorrelation(predictorsCor, 0.80)

# Create a vector with names of the predictors to be kept in analysis:
AbioVarsSelected = names(data.frame(
  temporary.data[, -c(1, 2)][, -highCorr]) )

# Create a data frame for the Random Forest analysis:
dat.RF = temporary.data[, c("ZMpresence", AbioVarsSelected)]

# For each lake, calculate the distance (km) to all the invaded lakes,
# and select the shortest of those distances (Dmin):
library(geosphere)

dat.RF$Dmin <- 0

for(i in 1:dim(dat.RF)[1]){
  
  p1 <- c(dat.RF[i, "E"], dat.RF[i, "N"])
  
  temp.data <- dat.RF[-i, c("ZMpresence", "E", "N")]
  temp.data <- temp.data[temp.data$ZMpresence == 1, ]
  
  p2 <- cbind(temp.data[, "E"], temp.data[, "N"])
  
  # distances according to haversine formula:
  d <- round(distHaversine(p1, p2)/1000, 2)
  
  dat.RF[i, "Dmin"] <- min(d)
}


# ****************** Random Forest classifier ********************

# Load the randomForest library
library(randomForest)

# Set seed for the random number generator to reproduce the results:
set.seed(1200)

# Build the Random Forests classifier (note: SPECWATSHED and NO2 are
# excluded - see paper for details; the latitude and longitude are also exluded):
RF = randomForest(ZMpresence ~ .,
                  data = dat.RF[, -c(4, 15, 17, 18)],
                  importance = TRUE,
                  keep.forest = TRUE,
                  do.trace = 1000,
                  ntree = 20000,
                  na.action = na.exclude,
                  sampsize = c(80, 95))


# Commands to reproduce Figure 2:
imp.RF <- data.frame(Parameter = rownames(RF$importance),
                      MDA = RF$importance[, 3],
                      MDAsd = RF$importanceSD[, 3])
rownames(imp.RF) <- NULL
imp.RF <- imp.RF[order(imp.RF$MDA), ]
imp.RF$Parameter <- as.character(imp.RF$Parameter)
imp.RF$Parameter[11] <- "Average depth"
imp.RF$Parameter[12] <- "Color"
imp.RF$Parameter[13] <- "Lake area"
imp.RF$Parameter[14] <- "Shortest distance"
imp.RF$Parameter <- as.factor(imp.RF$Parameter)
imp.RF$Parameter <- ordered(imp.RF$Parameter,
                             levels = imp.RF$Parameter)

# Load the ggplot2 package:
library(ggplot2)

ggplot(data = imp.RF, aes(x = 0, xend = MDA,
                           y = Parameter, yend = Parameter)) + 
  geom_point(aes(MDA, Parameter), shape = 15, size = 3) + 
  xlab("MDA index") +
  geom_segment(linetype = 2) + theme_bw()


# Commands to reproduce Figure 3:
ggplot(data = dat.RF, aes(Dmin, fill = factor(ZMpresence))) + 
  geom_histogram(position = "dodge") +
  geom_histogram(position = "dodge", color = I("black"),
                 show_guide = FALSE) +
  scale_x_log10() + 
  theme_bw() + xlab("log(km)") + 
  theme(legend.position = c(0.8, 0.8),
        legend.background = element_blank()) +
  ylab("Frequency") +
  scale_fill_manual(values=c("gray80", "gray40"), 
                    name = "Invasion status:",
                    breaks = c("0", "1"),
                    labels = c("free", "invaded"))

# Commands to reproduce Figure 4:
par(mar = c(4.4, 4.3, 1, 1), mfrow = c(2, 4))
partialPlot(RF, na.omit(dat.RF), x.var = "Dmin",
            which.class = "1", main = "", las = 1,
            cex.lab = 0.9, cex.axis = 0.8, rug = F,
            xlab = "Shortest distance (km)", ylab = "logit[P]/2")
partialPlot(RF, na.omit(dat.RF), x.var = "LAREA",
            which.class = "1", main = "", las = 1,
            cex.lab = 0.9, cex.axis = 0.8, rug = F,
            xlab = expression(paste("Lake area (",
            km^2, ")")), ylab = "logit[P]/2")
partialPlot(RF, na.omit(dat.RF), x.var = "COLOR",
            which.class = "1", main = "", las = 1,
            cex.lab = 0.9, cex.axis = 0.8, rug = F,
            xlab = "Color (degrees)", ylab = "logit[P]/2")
partialPlot(RF, na.omit(dat.RF), x.var = "AVED",
            which.class = "1", main = "", las = 1,
            cex.lab = 0.9, cex.axis = 0.8, rug = F,
            xlab = "Average depth (m)", ylab = "logit[P]/2")
partialPlot(RF, na.omit(dat.RF), x.var = "Cl",
            which.class = "1", main = "", las = 1,
            cex.lab = 0.9, cex.axis = 0.8, rug = F,
            xlab = "Cl (mg/L)", ylab = "logit[P]/2")
partialPlot(RF, na.omit(dat.RF), x.var = "Mg",
            which.class = "1", main = "", las = 1,
            cex.lab = 0.9, cex.axis = 0.8, rug = F,
            xlab = "Mg (mg/L)", ylab = "logit[P]/2")
partialPlot(RF, na.omit(dat.RF), x.var = "HCO3",
            which.class = "1", main = "", las = 1,
            cex.lab = 0.9, cex.axis = 0.8, rug = F,
            xlab = "HCO3 (mg/L)", ylab = "logit[P]/2")

# *********************** END OF SCRIPT *********************************