library(tidyverse)
library(lubridate)
library(DT)
library(viridis)
library(janitor)
library(plotly)
library(respirometry)Lab 13
NEON_soilMAGs_soilChem <- read.csv("NEON_soilMAGs_soilChem.csv")#install.packages("vegan")
#install.packages("corrplot")
# Load the package
library(corrplot)
library(vegan) # for ecological analyses
library(sf) # for spatial data
library(ggplot2)# Replace with your actual file path
soil_data <- read_csv("NEON_soilMAGs_soilChem.csv")
# Peek at the structure
glimpse(soil_data)Rows: 5,572
Columns: 46
$ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
$ genomicsSampleID <chr> "BONA_001-O-20230710", "BONA_001-O-20230710", "B…
$ soilMoisture.x <dbl> 1.546, 1.546, 1.546, 1.546, 1.546, 1.546, 1.546,…
$ decimalLatitude <dbl> 65.17445, 65.17445, 65.17445, 65.17445, 65.17445…
$ decimalLongitude <dbl> -147.4782, -147.4782, -147.4782, -147.4782, -147…
$ elevation <dbl> 374.1, 374.1, 374.1, 374.1, 374.1, 374.1, 374.1,…
$ soilTemp <dbl> 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1…
$ soilMoisture.y <dbl> 1.546, 1.546, 1.546, 1.546, 1.546, 1.546, 1.546,…
$ soilInWaterpH <dbl> 3.993976, 3.993976, 3.993976, 3.993976, 3.993976…
$ soilInCaClpH <dbl> 3.449951, 3.449951, 3.449951, 3.449951, 3.449951…
$ siteID <chr> "BONA", "BONA", "BONA", "BONA", "BONA", "BONA", …
$ bin_oid <chr> "3300079481_s0", "3300079481_s1", "3300079481_s1…
$ bin_id <chr> "3300079481_s0", "3300079481_s1", "3300079481_s1…
$ site <chr> "Caribou-Poker Creeks Research Watershed NEON Fi…
$ site_ID <chr> "BONA", "BONA", "BONA", "BONA", "BONA", "BONA", …
$ subplot <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ layer <chr> "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"…
$ quadrant <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ date <dbl> 20230710, 20230710, 20230710, 20230710, 20230710…
$ img_genome_id <dbl> 3300079481, 3300079481, 3300079481, 3300079481, …
$ bin_quality <chr> "MQ", "HQ", "MQ", "MQ", "MQ", "MQ", "MQ", "MQ", …
$ bin_lineage <chr> "Bacteria; Acidobacteriota; Terriglobia", "Bacte…
$ gtdb_taxonomy_lineage <chr> "Bacteria; Acidobacteriota; Terriglobia; Terrigl…
$ domain <chr> "Bacteria", "Bacteria", "Bacteria", "Bacteria", …
$ phylum <chr> "Acidobacteriota", "Actinomycetota", "Actinomyce…
$ class <chr> "Terriglobia", "Thermoleophilia", "Thermoleophil…
$ order <chr> "Terriglobales", "Solirubrobacterales", "Solirub…
$ family <chr> "SbA1", "Solirubrobacteraceae", "Solirubrobacter…
$ genus <chr> "Sulfotelmatobacter", "Palsa-744", "Palsa-744", …
$ bin_methods <chr> "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0…
$ created_by <chr> "IMG_PIPELINE", "IMG_PIPELINE", "IMG_PIPELINE", …
$ date_added <date> 2025-02-03, 2025-02-03, 2025-02-03, 2025-02-03,…
$ bin_completeness <dbl> 95.61, 94.66, 51.78, 60.83, 81.54, 88.11, 77.19,…
$ bin_contamination <dbl> 4.96, 1.99, 9.42, 0.04, 0.18, 0.19, 4.25, 0.66, …
$ average_coverage <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ total_number_of_bases <dbl> 6486507, 3237739, 2712910, 2720369, 3539081, 395…
$ x5s_r_rna <dbl> 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, …
$ x16s_r_rna <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
$ x23s_r_rna <dbl> 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, …
$ t_rna_genes <dbl> 54, 51, 30, 10, 30, 29, 36, 21, 24, 37, 20, 21, …
$ gene_count <dbl> 5785, 3054, 3039, 2539, 3314, 3753, 5943, 2297, …
$ scaffold_count <dbl> 100, 52, 513, 152, 245, 91, 497, 188, 470, 227, …
$ gold_study_id <chr> "Gs0166454", "Gs0166454", "Gs0166454", "Gs016645…
$ community <chr> "Soil microbial communities", "Soil microbial co…
$ type <chr> "Soil", "Soil", "Soil", "Soil", "Soil", "Soil", …
$ sample_unknown <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
colnames(soil_data) [1] "...1" "genomicsSampleID" "soilMoisture.x"
[4] "decimalLatitude" "decimalLongitude" "elevation"
[7] "soilTemp" "soilMoisture.y" "soilInWaterpH"
[10] "soilInCaClpH" "siteID" "bin_oid"
[13] "bin_id" "site" "site_ID"
[16] "subplot" "layer" "quadrant"
[19] "date" "img_genome_id" "bin_quality"
[22] "bin_lineage" "gtdb_taxonomy_lineage" "domain"
[25] "phylum" "class" "order"
[28] "family" "genus" "bin_methods"
[31] "created_by" "date_added" "bin_completeness"
[34] "bin_contamination" "average_coverage" "total_number_of_bases"
[37] "x5s_r_rna" "x16s_r_rna" "x23s_r_rna"
[40] "t_rna_genes" "gene_count" "scaffold_count"
[43] "gold_study_id" "community" "type"
[46] "sample_unknown"
Environmental correlations
# Environmental variables
env_vars <- soil_data %>%
select(soilMoisture.x, soilMoisture.y, soilTemp,
soilInWaterpH, soilInCaClpH,
decimalLatitude, decimalLongitude, elevation)
# Genomic metadata (MAG features)
mag_vars <- soil_data %>%
select(bin_completeness, bin_contamination,
average_coverage, gene_count, scaffold_count)Correlation Matrix
mag_vars <- soil_data %>%
select(bin_completeness, bin_contamination,
gene_count, scaffold_count)
cor_matrix <- cor(env_vars, mag_vars, use = "pairwise.complete.obs")
# Heatmap option
heatmap(cor_matrix, scale = "none")# Now rerun your correlation plot
corrplot(cor_matrix, method = "color", tl.cex = 0.9, cl.cex = 0.9)Ordination analysis
# Combine both sets to drop rows with NA in either
combined <- bind_cols(mag_vars, env_vars) %>% drop_na()
# Split back into MAG and environmental matrices
mag_clean <- combined %>% select(bin_completeness, bin_contamination,
gene_count, scaffold_count)
env_clean <- combined %>% select(soilMoisture.x, soilMoisture.y, soilTemp,
soilInWaterpH, soilInCaClpH,
decimalLatitude, decimalLongitude, elevation)
rda_model <- rda(mag_clean ~ ., data = env_clean)# Base RDA plot with arrows
plot(rda_model, main = "RDA: MAG features vs Environmental Variables")library(ggrepel) # every session
# --- Extract scores ---
sp <- scores(rda_model, display = "species", scaling = 2) # species/MAG features
st <- scores(rda_model, display = "sites", scaling = 2) # samples
bp <- scores(rda_model, display = "bp", scaling = 2) # environmental arrows
sp_df <- as.data.frame(sp)
sp_df$label <- rownames(sp_df)
st_df <- as.data.frame(st)
st_df$label <- rownames(st_df)
bp_df <- as.data.frame(bp)
bp_df$label <- rownames(bp_df)
# --- Variance explained percentages ---
var <- summary(rda_model)$cont$importance[2,1:2] * 100
# --- Build ggplot ---
ggplot() +
# site points
geom_point(data = st_df,
aes(x = RDA1, y = RDA2),
color = "black", alpha = 0.7, size = 2) +
# species points
geom_point(data = sp_df,
aes(x = RDA1, y = RDA2),
color = "red", size = 2) +
# species labels (non-overlapping)
geom_text_repel(data = sp_df,
aes(x = RDA1, y = RDA2, label = label),
color = "red", size = 3, max.overlaps = Inf) +
# environmental arrows
geom_segment(data = bp_df,
aes(x = 0, y = 0, xend = RDA1, yend = RDA2),
arrow = arrow(length = unit(0.25,"cm")),
color = "blue", linewidth = 0.7) +
# environmental labels (non-overlapping)
geom_text_repel(data = bp_df,
aes(x = RDA1, y = RDA2, label = label),
color = "blue", size = 3, max.overlaps = Inf) +
# axis labels using explained variance
xlab(paste0("RDA1 (", round(var[1], 1), "%)")) +
ylab(paste0("RDA2 (", round(var[2], 1), "%)")) +
theme_classic() +
ggtitle("RDA: MAG features vs Environmental Variables")Geographical analysis
taxonomy_split <- soil_data %>%
separate(phylum, into = c("Domain","Phylum","Class","Order","Family","Genus"),
sep = ";", fill = "right", remove = FALSE)comm_matrix <- taxonomy_split %>%
group_by(site, phylum) %>%
summarise(n = n(), .groups = "drop") %>%
pivot_wider(names_from = phylum, values_from = n, values_fill = 0)comm_matrix <- comm_matrix %>% filter(!is.na(site))
comm_matrix$site <- make.unique(comm_matrix$site)
site_taxa <- comm_matrix %>% column_to_rownames("site")
site_taxa <- comm_matrix %>% column_to_rownames("site")site_taxa <- site_taxa %>% mutate(across(everything(), as.numeric))
site_taxa <- as.data.frame(site_taxa)
site_taxa[is.na(site_taxa)] <- 0#install.packages("ggrepel")
library(ggrepel)
nmds <- metaMDS(site_taxa, distance = "bray", k = 2, trymax = 100)Square root transformation
Wisconsin double standardization
Run 0 stress 0.1669422
Run 1 stress 0.1760092
Run 2 stress 0.1769485
Run 3 stress 0.1665285
... New best solution
... Procrustes: rmse 0.01174433 max resid 0.03724345
Run 4 stress 0.1804953
Run 5 stress 0.1735792
Run 6 stress 0.1662938
... New best solution
... Procrustes: rmse 0.07933971 max resid 0.314819
Run 7 stress 0.1636713
... New best solution
... Procrustes: rmse 0.03951047 max resid 0.1817833
Run 8 stress 0.1614353
... New best solution
... Procrustes: rmse 0.104599 max resid 0.3461183
Run 9 stress 0.1766572
Run 10 stress 0.1806274
Run 11 stress 0.1716798
Run 12 stress 0.2038341
Run 13 stress 0.167602
Run 14 stress 0.1673046
Run 15 stress 0.1834984
Run 16 stress 0.1666108
Run 17 stress 0.1804953
Run 18 stress 0.1820159
Run 19 stress 0.1750122
Run 20 stress 0.1643972
Run 21 stress 0.1735792
Run 22 stress 0.1636655
Run 23 stress 0.1662587
Run 24 stress 0.1820062
Run 25 stress 0.1788415
Run 26 stress 0.1637626
Run 27 stress 0.1639418
Run 28 stress 0.1634574
Run 29 stress 0.1899823
Run 30 stress 0.1804955
Run 31 stress 0.1904707
Run 32 stress 0.1669422
Run 33 stress 0.1672211
Run 34 stress 0.1671559
Run 35 stress 0.1981508
Run 36 stress 0.1799928
Run 37 stress 0.1899826
Run 38 stress 0.1721773
Run 39 stress 0.1946032
Run 40 stress 0.1783752
Run 41 stress 0.1769485
Run 42 stress 0.1799781
Run 43 stress 0.1721975
Run 44 stress 0.180663
Run 45 stress 0.1818906
Run 46 stress 0.1716798
Run 47 stress 0.1712751
Run 48 stress 0.1614354
... Procrustes: rmse 3.102261e-05 max resid 9.514876e-05
... Similar to previous best
*** Best solution repeated 1 times
site_scores <- as.data.frame(scores(nmds, display = "sites"))
site_scores$SiteID <- rownames(site_scores)
taxa_scores <- as.data.frame(scores(nmds, display = "species"))
taxa_scores$Taxon <- rownames(taxa_scores)
ggplot() +
geom_point(data = site_scores, aes(x = NMDS1, y = NMDS2), color = "black") +
geom_text_repel(data = site_scores, aes(x = NMDS1, y = NMDS2, label = SiteID), color = "black") +
geom_text_repel(data = taxa_scores, aes(x = NMDS1, y = NMDS2, label = Taxon), color = "red") +
labs(title = "NMDS of Microbial Communities", x = "NMDS1", y = "NMDS2") +
theme_minimal()# Community matrix
comm_matrix <- taxonomy_split %>%
filter(!is.na(siteID), !is.na(phylum)) %>%
group_by(siteID, phylum) %>%
summarise(n = n(), .groups = "drop") %>%
pivot_wider(names_from = phylum, values_from = n, values_fill = 0)
site_taxa <- comm_matrix %>%
column_to_rownames("siteID") %>%
as.data.frame()
# Collapse coords to unique sites
coords <- soil_data %>%
distinct(siteID, decimalLatitude, decimalLongitude)
# Keep only sites present in site_taxa
common_sites <- intersect(rownames(site_taxa), coords$siteID)
coords <- coords %>%
filter(siteID %in% common_sites)
# Reorder coords to match site_taxa
coords <- coords[match(common_sites, coords$siteID), ]
# Check alignment
all(coords$siteID == rownames(site_taxa)) # should return TRUE[1] TRUE
comm_dist <- vegdist(site_taxa, method = "bray")
geo_dist <- dist(coords %>% select(decimalLatitude, decimalLongitude))
mantel_result <- mantel(comm_dist, geo_dist)
print(mantel_result)
Mantel statistic based on Pearson's product-moment correlation
Call:
mantel(xdis = comm_dist, ydis = geo_dist)
Mantel statistic r: 0.1608
Significance: 0.083
Upper quantiles of permutations (null model):
90% 95% 97.5% 99%
0.146 0.195 0.233 0.263
Permutation: free
Number of permutations: 999
# Convert distance objects to matrices
comm_mat <- as.matrix(comm_dist)
geo_mat <- as.matrix(geo_dist)
# Extract lower triangle (unique site pairs only)
comm_vec <- comm_mat[lower.tri(comm_mat)]
geo_vec <- geo_mat[lower.tri(geo_mat)]
# Build tidy data frame
df_decay <- data.frame(
GeographicDistance = geo_vec,
CommunitySimilarity = 1 - comm_vec
)
# Plot with ggplot2
ggplot(df_decay, aes(x = GeographicDistance, y = CommunitySimilarity)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = TRUE, color = "blue") +
labs(
title = "Distance–Decay Relationship",
x = "Geographic Distance",
y = "Community Similarity (1 - Bray-Curtis)"
) +
theme_minimal()Taxonomy analysis
taxonomy_split <- soil_data %>%
separate(gtdb_taxonomy_lineage, into = c("Domain","Phylum","Class","Order","Family","Genus"),
sep = ";", fill = "right", remove = FALSE)
# --- Summarize composition at Phylum level ---
phylum_counts <- taxonomy_split %>%
count(phylum, sort = TRUE) %>%
mutate(RelativeAbundance = n / sum(n) * 100)
# --- Plot relative abundance ---
ggplot(phylum_counts, aes(x = reorder(phylum, -RelativeAbundance),
y = RelativeAbundance)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Taxonomic Composition at Phylum Level",
x = "Phylum",
y = "Relative Abundance (%)") +
theme_minimal()# --- Compare across locations (example) ---
# Replace 'Location' with the actual column name in your dataset
phylum_by_location <- taxonomy_split %>%
group_by(site, Phylum) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(site) %>%
mutate(RelativeAbundance = n / sum(n) * 100)
ggplot(phylum_by_location, aes(x = site, y = RelativeAbundance, fill = Phylum)) +
geom_bar(stat = "identity") +
labs(title = "Phylum Composition by Location",
x = "Location",
y = "Relative Abundance (%)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))