Comprehensive Microbial Co-occurrence and Network Analysis

Author

NEON Soil MAGs Analysis

Published

December 3, 2025

Introduction

This comprehensive report includes: - Data preprocessing and normalization. - Static network visualization. - Interactive network visualization with taxonomy tooltips and module color coding. - Module detection visualization. - Interactive soil chemistry correlation heatmap.

Required Packages

R code
library(tidyverse)
library(igraph)
library(ggraph)
library(tidygraph)
library(readr)
library(visNetwork)
library(plotly)

1. Load and Inspect Data

R code
mags_data <- read_csv("NEON_soilMAGs_soilChem.csv")
glimpse(mags_data)
Rows: 5,573
Columns: 43
$ ...1                  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
$ genomicsSampleID      <chr> "BONA_001-O-20230710", "BONA_001-O-20230710", "B…
$ soilMoisture          <dbl> 1.546, 1.546, 1.546, 1.546, 1.546, 1.546, 1.546,…
$ decimalLatitude       <dbl> 65.17445, 65.17445, 65.17445, 65.17445, 65.17445…
$ decimalLongitude      <dbl> -147.4782, -147.4782, -147.4782, -147.4782, -147…
$ elevation             <dbl> 374.1, 374.1, 374.1, 374.1, 374.1, 374.1, 374.1,…
$ soilTemp              <dbl> 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1…
$ soilInWaterpH         <dbl> 3.993976, 3.993976, 3.993976, 3.993976, 3.993976…
$ soilInCaClpH          <dbl> 3.449951, 3.449951, 3.449951, 3.449951, 3.449951…
$ bin_oid               <chr> "3300079481_s0", "3300079481_s1", "3300079481_s1…
$ bin_id                <chr> "3300079481_s0", "3300079481_s1", "3300079481_s1…
$ site                  <chr> "Caribou-Poker Creeks Research Watershed NEON Fi…
$ site_ID               <chr> "BONA", "BONA", "BONA", "BONA", "BONA", "BONA", …
$ subplot               <chr> "001", "001", "001", "001", "001", "001", "001",…
$ layer                 <chr> "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"…
$ quadrant              <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ date                  <dbl> 20230710, 20230710, 20230710, 20230710, 20230710…
$ img_genome_id         <dbl> 3300079481, 3300079481, 3300079481, 3300079481, …
$ bin_quality           <chr> "MQ", "HQ", "MQ", "MQ", "MQ", "MQ", "MQ", "MQ", …
$ bin_lineage           <chr> "Bacteria; Acidobacteriota; Terriglobia", "Bacte…
$ gtdb_taxonomy_lineage <chr> "Bacteria; Acidobacteriota; Terriglobia; Terrigl…
$ domain                <chr> "Bacteria", "Bacteria", "Bacteria", "Bacteria", …
$ phylum                <chr> "Acidobacteriota", "Actinomycetota", "Actinomyce…
$ class                 <chr> "Terriglobia", "Thermoleophilia", "Thermoleophil…
$ order                 <chr> "Terriglobales", "Solirubrobacterales", "Solirub…
$ family                <chr> "SbA1", "Solirubrobacteraceae", "Solirubrobacter…
$ genus                 <chr> "Sulfotelmatobacter", "Palsa-744", "Palsa-744", …
$ bin_methods           <chr> "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0…
$ created_by            <chr> "IMG_PIPELINE", "IMG_PIPELINE", "IMG_PIPELINE", …
$ date_added            <date> 2025-02-03, 2025-02-03, 2025-02-03, 2025-02-03,…
$ bin_completeness      <dbl> 95.61, 94.66, 51.78, 60.83, 81.54, 88.11, 77.19,…
$ bin_contamination     <dbl> 4.96, 1.99, 9.42, 0.04, 0.18, 0.19, 4.25, 0.66, …
$ average_coverage      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ total_number_of_bases <dbl> 6486507, 3237739, 2712910, 2720369, 3539081, 395…
$ x5s_r_rna             <dbl> 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, …
$ x16s_r_rna            <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
$ x23s_r_rna            <dbl> 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, …
$ t_rna_genes           <dbl> 54, 51, 30, 10, 30, 29, 36, 21, 24, 37, 20, 21, …
$ gene_count            <dbl> 5785, 3054, 3039, 2539, 3314, 3753, 5943, 2297, …
$ scaffold_count        <dbl> 100, 52, 513, 152, 245, 91, 497, 188, 470, 227, …
$ gold_study_id         <chr> "Gs0166454", "Gs0166454", "Gs0166454", "Gs016645…
$ community             <chr> "Soil microbial communities", "Soil microbial co…
$ type                  <chr> "Soil", "Soil", "Soil", "Soil", "Soil", "Soil", …

2. Preprocessing and Aggregation

We use gene_count as abundance metric and normalize by sample.

R code
data_filtered <- mags_data %>%
  filter(!is.na(genus), !is.na(gene_count)) %>%
  select(genomicsSampleID, genus, gene_count, gtdb_taxonomy_lineage)

abundance_matrix <- data_filtered %>%
  group_by(genomicsSampleID, genus) %>%
  summarise(abundance = sum(gene_count), .groups = "drop") %>%
  pivot_wider(names_from = genus, values_from = abundance, values_fill = 0)

abundance_matrix <- abundance_matrix %>%
  mutate(across(-genomicsSampleID, ~ .x / sum(.x)))

3. Compute Co-occurrence Correlations

R code
abundance_only <- abundance_matrix %>% select(-genomicsSampleID)
cor_matrix <- cor(abundance_only, method = "spearman")
cor_edges <- as.data.frame(as.table(cor_matrix)) %>%
  filter(Var1 != Var2, Freq > 0.6)

4. Build Network and Detect Modules

R code
network_graph <- graph_from_data_frame(d = cor_edges, directed = FALSE)
communities <- cluster_louvain(network_graph)
graph_tbl <- as_tbl_graph(network_graph) %>%
  mutate(module = communities$membership)

5. Static Network Visualization

R code
ggraph(network_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = Freq), show.legend = FALSE) +
  geom_node_point(color = "steelblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

6. Module Detection Visualization

R code
ggraph(graph_tbl, layout = "fr") +
  geom_edge_link(alpha = 0.3) +
  geom_node_point(aes(color = as.factor(module)), size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

7. Interactive Network Visualization with Tooltips and Color Coding

R code
node_info <- graph_tbl %>%
  as_tibble() %>%
  rename(genus = name) %>%
  left_join(data_filtered %>% select(genus, gtdb_taxonomy_lineage) %>% distinct(), by = "genus")

node_info <- node_info %>% distinct(genus, .keep_all = TRUE)

nodes <- data.frame(
  id = node_info$genus,
  label = node_info$genus,
  title = paste("Taxonomy:", node_info$gtdb_taxonomy_lineage),
  color = as.factor(node_info$module)
)

edges <- cor_edges %>% distinct(Var1, Var2, .keep_all = TRUE) %>%
  rename(from = Var1, to = Var2, value = Freq)
R code
visNetwork(nodes, edges) %>%
  visOptions(highlightNearest = TRUE, nodesIdSelection = TRUE) %>%
  visEdges(smooth = FALSE) %>%
  visLegend() %>%
  visLayout(randomSeed = 123)

8. Interactive Soil Chemistry Correlation Heatmap

R code
module_abundance <- graph_tbl %>%
  as_tibble() %>%
  select(name, module) %>%
  inner_join(data_filtered, by = c("name" = "genus")) %>%
  group_by(genomicsSampleID, module) %>%
  summarise(module_abundance = sum(gene_count), .groups = "drop") %>%
  pivot_wider(names_from = module, values_from = module_abundance, values_fill = 0)

soil_data <- mags_data %>% select(genomicsSampleID, soilMoisture, soilTemp, soilInWaterpH)
merged_data <- inner_join(module_abundance, soil_data, by = "genomicsSampleID")

cor_matrix_soil <- cor(merged_data %>% select(-genomicsSampleID), method = "spearman")

cor_long <- as.data.frame(as.table(cor_matrix_soil))

plot_ly(
  x = unique(cor_long$Var2),
  y = unique(cor_long$Var1),
  z = matrix(cor_long$Freq, nrow = length(unique(cor_long$Var1)), byrow = TRUE),
  type = "heatmap",
  colorscale = "RdBu",
  reversescale = TRUE
) %>%
  layout(title = "Module vs Soil Chemistry Correlation Heatmap", xaxis = list(title = "Variables"), yaxis = list(title = "Modules"))

Conclusion

This comprehensive report combines static and interactive visualizations, module detection, and environmental correlations for microbial co-occurrence analysis.