# ********************** Preparing the data ********************************** # In the following, the file "dreissena_in_lakes_of_belarus.csv" is assumed to be stored # in the current R working directory. # Load the data in: spread <- read.csv(""dreissena_in_lakes_of_belarus.csv"") # Make sure "ZMpresence" is treated as factor: spread$ZMpresence = factor(spread$ZMpresence) # Remove MAXD, TRANSP, and PermOx from the data frame (see paper for explanation): dat = spread[, -c(5, 8, 22)] # Create a subset with infested lakes only: true.positive = subset(dat, ZMpresence == 1) # Create a subset with non-infested lakes only: all.zeros = subset(dat, ZMpresence != 1) # Create a subset of lakes that can potentially be infested (see paper for details): potential.positive = subset(all.zeros, pH >= 7.47 & Ca >= 23.1 & TDS >= 158) # Combine invaded lakes and lakes suitable but not yet invaded: temporary.data = rbind(true.positive, potential.positive) # Load the caret package, which is used to remove highly collinear variables: library(caret) # Calculate correlation matrix for all quantitative predictors: predictorsCor = cor(temporary.data[, -c(1, 2, 20, 21)], # remove # variables Lake_Code, ZMpresence, N and E # from correlation calculations method = "spearman", use = "complete.obs") # Find indices of the highly correlated, redundant predictors: highCorr = findCorrelation(predictorsCor, 0.80) # Create a vector with names of the predictors to be kept in analysis: AbioVarsSelected = names(data.frame( temporary.data[, -c(1, 2)][, -highCorr]) ) # Create a data frame for the Random Forest analysis: dat.RF = temporary.data[, c("ZMpresence", AbioVarsSelected)] # For each lake, calculate the distance (km) to all the invaded lakes, # and select the shortest of those distances (Dmin): library(geosphere) dat.RF$Dmin <- 0 for(i in 1:dim(dat.RF)[1]){ p1 <- c(dat.RF[i, "E"], dat.RF[i, "N"]) temp.data <- dat.RF[-i, c("ZMpresence", "E", "N")] temp.data <- temp.data[temp.data$ZMpresence == 1, ] p2 <- cbind(temp.data[, "E"], temp.data[, "N"]) # distances according to haversine formula: d <- round(distHaversine(p1, p2)/1000, 2) dat.RF[i, "Dmin"] <- min(d) } # ****************** Random Forest classifier ******************** # Load the randomForest library library(randomForest) # Set seed for the random number generator to reproduce the results: set.seed(1200) # Build the Random Forests classifier (note: SPECWATSHED and NO2 are # excluded - see paper for details; the latitude and longitude are also exluded): RF = randomForest(ZMpresence ~ ., data = dat.RF[, -c(4, 15, 17, 18)], importance = TRUE, keep.forest = TRUE, do.trace = 1000, ntree = 20000, na.action = na.exclude, sampsize = c(80, 95)) # Commands to reproduce Figure 2: imp.RF <- data.frame(Parameter = rownames(RF$importance), MDA = RF$importance[, 3], MDAsd = RF$importanceSD[, 3]) rownames(imp.RF) <- NULL imp.RF <- imp.RF[order(imp.RF$MDA), ] imp.RF$Parameter <- as.character(imp.RF$Parameter) imp.RF$Parameter[11] <- "Average depth" imp.RF$Parameter[12] <- "Color" imp.RF$Parameter[13] <- "Lake area" imp.RF$Parameter[14] <- "Shortest distance" imp.RF$Parameter <- as.factor(imp.RF$Parameter) imp.RF$Parameter <- ordered(imp.RF$Parameter, levels = imp.RF$Parameter) # Load the ggplot2 package: library(ggplot2) ggplot(data = imp.RF, aes(x = 0, xend = MDA, y = Parameter, yend = Parameter)) + geom_point(aes(MDA, Parameter), shape = 15, size = 3) + xlab("MDA index") + geom_segment(linetype = 2) + theme_bw() # Commands to reproduce Figure 3: ggplot(data = dat.RF, aes(Dmin, fill = factor(ZMpresence))) + geom_histogram(position = "dodge") + geom_histogram(position = "dodge", color = I("black"), show_guide = FALSE) + scale_x_log10() + theme_bw() + xlab("log(km)") + theme(legend.position = c(0.8, 0.8), legend.background = element_blank()) + ylab("Frequency") + scale_fill_manual(values=c("gray80", "gray40"), name = "Invasion status:", breaks = c("0", "1"), labels = c("free", "invaded")) # Commands to reproduce Figure 4: par(mar = c(4.4, 4.3, 1, 1), mfrow = c(2, 4)) partialPlot(RF, na.omit(dat.RF), x.var = "Dmin", which.class = "1", main = "", las = 1, cex.lab = 0.9, cex.axis = 0.8, rug = F, xlab = "Shortest distance (km)", ylab = "logit[P]/2") partialPlot(RF, na.omit(dat.RF), x.var = "LAREA", which.class = "1", main = "", las = 1, cex.lab = 0.9, cex.axis = 0.8, rug = F, xlab = expression(paste("Lake area (", km^2, ")")), ylab = "logit[P]/2") partialPlot(RF, na.omit(dat.RF), x.var = "COLOR", which.class = "1", main = "", las = 1, cex.lab = 0.9, cex.axis = 0.8, rug = F, xlab = "Color (degrees)", ylab = "logit[P]/2") partialPlot(RF, na.omit(dat.RF), x.var = "AVED", which.class = "1", main = "", las = 1, cex.lab = 0.9, cex.axis = 0.8, rug = F, xlab = "Average depth (m)", ylab = "logit[P]/2") partialPlot(RF, na.omit(dat.RF), x.var = "Cl", which.class = "1", main = "", las = 1, cex.lab = 0.9, cex.axis = 0.8, rug = F, xlab = "Cl (mg/L)", ylab = "logit[P]/2") partialPlot(RF, na.omit(dat.RF), x.var = "Mg", which.class = "1", main = "", las = 1, cex.lab = 0.9, cex.axis = 0.8, rug = F, xlab = "Mg (mg/L)", ylab = "logit[P]/2") partialPlot(RF, na.omit(dat.RF), x.var = "HCO3", which.class = "1", main = "", las = 1, cex.lab = 0.9, cex.axis = 0.8, rug = F, xlab = "HCO3 (mg/L)", ylab = "logit[P]/2") # *********************** END OF SCRIPT *********************************