Lab 13

Author

Iris Thesmar

library(tidyverse)
library(lubridate)
library(DT)
library(viridis)
library(janitor)
library(plotly)
library(respirometry)
NEON_soilMAGs_soilChem <- read.csv("NEON_soilMAGs_soilChem.csv")
#install.packages("vegan")

#install.packages("corrplot")

# Load the package
library(corrplot)
library(vegan)   # for ecological analyses
library(sf)      # for spatial data
library(ggplot2)
# Replace with your actual file path
soil_data <- read_csv("NEON_soilMAGs_soilChem.csv")

# Peek at the structure
glimpse(soil_data)
Rows: 5,572
Columns: 46
$ ...1                  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
$ genomicsSampleID      <chr> "BONA_001-O-20230710", "BONA_001-O-20230710", "B…
$ soilMoisture.x        <dbl> 1.546, 1.546, 1.546, 1.546, 1.546, 1.546, 1.546,…
$ decimalLatitude       <dbl> 65.17445, 65.17445, 65.17445, 65.17445, 65.17445…
$ decimalLongitude      <dbl> -147.4782, -147.4782, -147.4782, -147.4782, -147…
$ elevation             <dbl> 374.1, 374.1, 374.1, 374.1, 374.1, 374.1, 374.1,…
$ soilTemp              <dbl> 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1…
$ soilMoisture.y        <dbl> 1.546, 1.546, 1.546, 1.546, 1.546, 1.546, 1.546,…
$ soilInWaterpH         <dbl> 3.993976, 3.993976, 3.993976, 3.993976, 3.993976…
$ soilInCaClpH          <dbl> 3.449951, 3.449951, 3.449951, 3.449951, 3.449951…
$ siteID                <chr> "BONA", "BONA", "BONA", "BONA", "BONA", "BONA", …
$ bin_oid               <chr> "3300079481_s0", "3300079481_s1", "3300079481_s1…
$ bin_id                <chr> "3300079481_s0", "3300079481_s1", "3300079481_s1…
$ site                  <chr> "Caribou-Poker Creeks Research Watershed NEON Fi…
$ site_ID               <chr> "BONA", "BONA", "BONA", "BONA", "BONA", "BONA", …
$ subplot               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ layer                 <chr> "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"…
$ quadrant              <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ date                  <dbl> 20230710, 20230710, 20230710, 20230710, 20230710…
$ img_genome_id         <dbl> 3300079481, 3300079481, 3300079481, 3300079481, …
$ bin_quality           <chr> "MQ", "HQ", "MQ", "MQ", "MQ", "MQ", "MQ", "MQ", …
$ bin_lineage           <chr> "Bacteria; Acidobacteriota; Terriglobia", "Bacte…
$ gtdb_taxonomy_lineage <chr> "Bacteria; Acidobacteriota; Terriglobia; Terrigl…
$ domain                <chr> "Bacteria", "Bacteria", "Bacteria", "Bacteria", …
$ phylum                <chr> "Acidobacteriota", "Actinomycetota", "Actinomyce…
$ class                 <chr> "Terriglobia", "Thermoleophilia", "Thermoleophil…
$ order                 <chr> "Terriglobales", "Solirubrobacterales", "Solirub…
$ family                <chr> "SbA1", "Solirubrobacteraceae", "Solirubrobacter…
$ genus                 <chr> "Sulfotelmatobacter", "Palsa-744", "Palsa-744", …
$ bin_methods           <chr> "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0…
$ created_by            <chr> "IMG_PIPELINE", "IMG_PIPELINE", "IMG_PIPELINE", …
$ date_added            <date> 2025-02-03, 2025-02-03, 2025-02-03, 2025-02-03,…
$ bin_completeness      <dbl> 95.61, 94.66, 51.78, 60.83, 81.54, 88.11, 77.19,…
$ bin_contamination     <dbl> 4.96, 1.99, 9.42, 0.04, 0.18, 0.19, 4.25, 0.66, …
$ average_coverage      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ total_number_of_bases <dbl> 6486507, 3237739, 2712910, 2720369, 3539081, 395…
$ x5s_r_rna             <dbl> 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, …
$ x16s_r_rna            <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
$ x23s_r_rna            <dbl> 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, …
$ t_rna_genes           <dbl> 54, 51, 30, 10, 30, 29, 36, 21, 24, 37, 20, 21, …
$ gene_count            <dbl> 5785, 3054, 3039, 2539, 3314, 3753, 5943, 2297, …
$ scaffold_count        <dbl> 100, 52, 513, 152, 245, 91, 497, 188, 470, 227, …
$ gold_study_id         <chr> "Gs0166454", "Gs0166454", "Gs0166454", "Gs016645…
$ community             <chr> "Soil microbial communities", "Soil microbial co…
$ type                  <chr> "Soil", "Soil", "Soil", "Soil", "Soil", "Soil", …
$ sample_unknown        <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
colnames(soil_data)
 [1] "...1"                  "genomicsSampleID"      "soilMoisture.x"       
 [4] "decimalLatitude"       "decimalLongitude"      "elevation"            
 [7] "soilTemp"              "soilMoisture.y"        "soilInWaterpH"        
[10] "soilInCaClpH"          "siteID"                "bin_oid"              
[13] "bin_id"                "site"                  "site_ID"              
[16] "subplot"               "layer"                 "quadrant"             
[19] "date"                  "img_genome_id"         "bin_quality"          
[22] "bin_lineage"           "gtdb_taxonomy_lineage" "domain"               
[25] "phylum"                "class"                 "order"                
[28] "family"                "genus"                 "bin_methods"          
[31] "created_by"            "date_added"            "bin_completeness"     
[34] "bin_contamination"     "average_coverage"      "total_number_of_bases"
[37] "x5s_r_rna"             "x16s_r_rna"            "x23s_r_rna"           
[40] "t_rna_genes"           "gene_count"            "scaffold_count"       
[43] "gold_study_id"         "community"             "type"                 
[46] "sample_unknown"       

Environmental correlations

# Environmental variables
env_vars <- soil_data %>%
  select(soilMoisture.x, soilMoisture.y, soilTemp,
         soilInWaterpH, soilInCaClpH,
         decimalLatitude, decimalLongitude, elevation)

# Genomic metadata (MAG features)
mag_vars <- soil_data %>%
  select(bin_completeness, bin_contamination,
         average_coverage, gene_count, scaffold_count)

Correlation Matrix

mag_vars <- soil_data %>%
  select(bin_completeness, bin_contamination,
         gene_count, scaffold_count)

cor_matrix <- cor(env_vars, mag_vars, use = "pairwise.complete.obs")

# Heatmap option
heatmap(cor_matrix, scale = "none")

# Now rerun your correlation plot
corrplot(cor_matrix, method = "color", tl.cex = 0.9, cl.cex = 0.9)

Ordination analysis

# Combine both sets to drop rows with NA in either
combined <- bind_cols(mag_vars, env_vars) %>% drop_na()

# Split back into MAG and environmental matrices
mag_clean <- combined %>% select(bin_completeness, bin_contamination,
                                 gene_count, scaffold_count)
env_clean <- combined %>% select(soilMoisture.x, soilMoisture.y, soilTemp,
                                 soilInWaterpH, soilInCaClpH,
                                 decimalLatitude, decimalLongitude, elevation)

rda_model <- rda(mag_clean ~ ., data = env_clean)
# Base RDA plot with arrows
plot(rda_model, main = "RDA: MAG features vs Environmental Variables")

library(ggrepel)              # every session
# --- Extract scores ---
sp  <- scores(rda_model, display = "species", scaling = 2)  # species/MAG features
st  <- scores(rda_model, display = "sites",   scaling = 2)  # samples
bp  <- scores(rda_model, display = "bp",      scaling = 2)  # environmental arrows

sp_df <- as.data.frame(sp)
sp_df$label <- rownames(sp_df)

st_df <- as.data.frame(st)
st_df$label <- rownames(st_df)

bp_df <- as.data.frame(bp)
bp_df$label <- rownames(bp_df)

# --- Variance explained percentages ---
var <- summary(rda_model)$cont$importance[2,1:2] * 100

# --- Build ggplot ---
ggplot() +
  
  # site points
  geom_point(data = st_df,
             aes(x = RDA1, y = RDA2),
             color = "black", alpha = 0.7, size = 2) +
  
  # species points
  geom_point(data = sp_df,
             aes(x = RDA1, y = RDA2),
             color = "red", size = 2) +
  
  # species labels (non-overlapping)
  geom_text_repel(data = sp_df,
                  aes(x = RDA1, y = RDA2, label = label),
                  color = "red", size = 3, max.overlaps = Inf) +
  
  # environmental arrows
  geom_segment(data = bp_df,
               aes(x = 0, y = 0, xend = RDA1, yend = RDA2),
               arrow = arrow(length = unit(0.25,"cm")),
               color = "blue", linewidth = 0.7) +
  
  # environmental labels (non-overlapping)
  geom_text_repel(data = bp_df,
                  aes(x = RDA1, y = RDA2, label = label),
                  color = "blue", size = 3, max.overlaps = Inf) +
  
  # axis labels using explained variance
  xlab(paste0("RDA1 (", round(var[1], 1), "%)")) +
  ylab(paste0("RDA2 (", round(var[2], 1), "%)")) +
  
  theme_classic() +
  ggtitle("RDA: MAG features vs Environmental Variables")

Geographical analysis

taxonomy_split <- soil_data %>%
  separate(phylum, into = c("Domain","Phylum","Class","Order","Family","Genus"),
           sep = ";", fill = "right", remove = FALSE)
comm_matrix <- taxonomy_split %>%
  group_by(site, phylum) %>%
  summarise(n = n(), .groups = "drop") %>%
  pivot_wider(names_from = phylum, values_from = n, values_fill = 0)
comm_matrix <- comm_matrix %>% filter(!is.na(site))
comm_matrix$site <- make.unique(comm_matrix$site)


site_taxa <- comm_matrix %>% column_to_rownames("site")
site_taxa <- comm_matrix %>% column_to_rownames("site")
site_taxa <- site_taxa %>% mutate(across(everything(), as.numeric))
site_taxa <- as.data.frame(site_taxa)
site_taxa[is.na(site_taxa)] <- 0
#install.packages("ggrepel")
library(ggrepel)

nmds <- metaMDS(site_taxa, distance = "bray", k = 2, trymax = 100)
Square root transformation
Wisconsin double standardization
Run 0 stress 0.1669422 
Run 1 stress 0.1760092 
Run 2 stress 0.1769485 
Run 3 stress 0.1665285 
... New best solution
... Procrustes: rmse 0.01174433  max resid 0.03724345 
Run 4 stress 0.1804953 
Run 5 stress 0.1735792 
Run 6 stress 0.1662938 
... New best solution
... Procrustes: rmse 0.07933971  max resid 0.314819 
Run 7 stress 0.1636713 
... New best solution
... Procrustes: rmse 0.03951047  max resid 0.1817833 
Run 8 stress 0.1614353 
... New best solution
... Procrustes: rmse 0.104599  max resid 0.3461183 
Run 9 stress 0.1766572 
Run 10 stress 0.1806274 
Run 11 stress 0.1716798 
Run 12 stress 0.2038341 
Run 13 stress 0.167602 
Run 14 stress 0.1673046 
Run 15 stress 0.1834984 
Run 16 stress 0.1666108 
Run 17 stress 0.1804953 
Run 18 stress 0.1820159 
Run 19 stress 0.1750122 
Run 20 stress 0.1643972 
Run 21 stress 0.1735792 
Run 22 stress 0.1636655 
Run 23 stress 0.1662587 
Run 24 stress 0.1820062 
Run 25 stress 0.1788415 
Run 26 stress 0.1637626 
Run 27 stress 0.1639418 
Run 28 stress 0.1634574 
Run 29 stress 0.1899823 
Run 30 stress 0.1804955 
Run 31 stress 0.1904707 
Run 32 stress 0.1669422 
Run 33 stress 0.1672211 
Run 34 stress 0.1671559 
Run 35 stress 0.1981508 
Run 36 stress 0.1799928 
Run 37 stress 0.1899826 
Run 38 stress 0.1721773 
Run 39 stress 0.1946032 
Run 40 stress 0.1783752 
Run 41 stress 0.1769485 
Run 42 stress 0.1799781 
Run 43 stress 0.1721975 
Run 44 stress 0.180663 
Run 45 stress 0.1818906 
Run 46 stress 0.1716798 
Run 47 stress 0.1712751 
Run 48 stress 0.1614354 
... Procrustes: rmse 3.102261e-05  max resid 9.514876e-05 
... Similar to previous best
*** Best solution repeated 1 times
site_scores <- as.data.frame(scores(nmds, display = "sites"))
site_scores$SiteID <- rownames(site_scores)

taxa_scores <- as.data.frame(scores(nmds, display = "species"))
taxa_scores$Taxon <- rownames(taxa_scores)

ggplot() +
  geom_point(data = site_scores, aes(x = NMDS1, y = NMDS2), color = "black") +
  geom_text_repel(data = site_scores, aes(x = NMDS1, y = NMDS2, label = SiteID), color = "black") +
  geom_text_repel(data = taxa_scores, aes(x = NMDS1, y = NMDS2, label = Taxon), color = "red") +
  labs(title = "NMDS of Microbial Communities", x = "NMDS1", y = "NMDS2") +
  theme_minimal()

# Community matrix
comm_matrix <- taxonomy_split %>%
  filter(!is.na(siteID), !is.na(phylum)) %>%
  group_by(siteID, phylum) %>%
  summarise(n = n(), .groups = "drop") %>%
  pivot_wider(names_from = phylum, values_from = n, values_fill = 0)

site_taxa <- comm_matrix %>%
  column_to_rownames("siteID") %>%
  as.data.frame()

# Collapse coords to unique sites
coords <- soil_data %>%
  distinct(siteID, decimalLatitude, decimalLongitude)

# Keep only sites present in site_taxa
common_sites <- intersect(rownames(site_taxa), coords$siteID)

coords <- coords %>%
  filter(siteID %in% common_sites)

# Reorder coords to match site_taxa
coords <- coords[match(common_sites, coords$siteID), ]

# Check alignment
all(coords$siteID == rownames(site_taxa))  # should return TRUE
[1] TRUE
comm_dist <- vegdist(site_taxa, method = "bray")
geo_dist  <- dist(coords %>% select(decimalLatitude, decimalLongitude))

mantel_result <- mantel(comm_dist, geo_dist)
print(mantel_result)

Mantel statistic based on Pearson's product-moment correlation 

Call:
mantel(xdis = comm_dist, ydis = geo_dist) 

Mantel statistic r: 0.1608 
      Significance: 0.083 

Upper quantiles of permutations (null model):
  90%   95% 97.5%   99% 
0.146 0.195 0.233 0.263 
Permutation: free
Number of permutations: 999
# Convert distance objects to matrices
comm_mat <- as.matrix(comm_dist)
geo_mat  <- as.matrix(geo_dist)

# Extract lower triangle (unique site pairs only)
comm_vec <- comm_mat[lower.tri(comm_mat)]
geo_vec  <- geo_mat[lower.tri(geo_mat)]

# Build tidy data frame
df_decay <- data.frame(
  GeographicDistance = geo_vec,
  CommunitySimilarity = 1 - comm_vec
)

# Plot with ggplot2
ggplot(df_decay, aes(x = GeographicDistance, y = CommunitySimilarity)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(
    title = "Distance–Decay Relationship",
    x = "Geographic Distance",
    y = "Community Similarity (1 - Bray-Curtis)"
  ) +
  theme_minimal()

Taxonomy analysis

taxonomy_split <- soil_data %>%
  separate(gtdb_taxonomy_lineage, into = c("Domain","Phylum","Class","Order","Family","Genus"),
           sep = ";", fill = "right", remove = FALSE)

# --- Summarize composition at Phylum level ---
phylum_counts <- taxonomy_split %>%
  count(phylum, sort = TRUE) %>%
  mutate(RelativeAbundance = n / sum(n) * 100)

# --- Plot relative abundance ---
ggplot(phylum_counts, aes(x = reorder(phylum, -RelativeAbundance),
                          y = RelativeAbundance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Taxonomic Composition at Phylum Level",
       x = "Phylum",
       y = "Relative Abundance (%)") +
  theme_minimal()

# --- Compare across locations (example) ---
# Replace 'Location' with the actual column name in your dataset
phylum_by_location <- taxonomy_split %>%
  group_by(site, Phylum) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(site) %>%
  mutate(RelativeAbundance = n / sum(n) * 100)

ggplot(phylum_by_location, aes(x = site, y = RelativeAbundance, fill = Phylum)) +
  geom_bar(stat = "identity") +
  labs(title = "Phylum Composition by Location",
       x = "Location",
       y = "Relative Abundance (%)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))