R code
library(tidyverse)
library(igraph)
library(ggraph)
library(tidygraph)
library(readr)
library(visNetwork)
library(plotly)NEON Soil MAGs Analysis
December 3, 2025
This comprehensive report includes: - Data preprocessing and normalization. - Static network visualization. - Interactive network visualization with taxonomy tooltips and module color coding. - Module detection visualization. - Interactive soil chemistry correlation heatmap.
Rows: 5,573
Columns: 43
$ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
$ genomicsSampleID <chr> "BONA_001-O-20230710", "BONA_001-O-20230710", "B…
$ soilMoisture <dbl> 1.546, 1.546, 1.546, 1.546, 1.546, 1.546, 1.546,…
$ decimalLatitude <dbl> 65.17445, 65.17445, 65.17445, 65.17445, 65.17445…
$ decimalLongitude <dbl> -147.4782, -147.4782, -147.4782, -147.4782, -147…
$ elevation <dbl> 374.1, 374.1, 374.1, 374.1, 374.1, 374.1, 374.1,…
$ soilTemp <dbl> 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1…
$ soilInWaterpH <dbl> 3.993976, 3.993976, 3.993976, 3.993976, 3.993976…
$ soilInCaClpH <dbl> 3.449951, 3.449951, 3.449951, 3.449951, 3.449951…
$ bin_oid <chr> "3300079481_s0", "3300079481_s1", "3300079481_s1…
$ bin_id <chr> "3300079481_s0", "3300079481_s1", "3300079481_s1…
$ site <chr> "Caribou-Poker Creeks Research Watershed NEON Fi…
$ site_ID <chr> "BONA", "BONA", "BONA", "BONA", "BONA", "BONA", …
$ subplot <chr> "001", "001", "001", "001", "001", "001", "001",…
$ layer <chr> "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"…
$ quadrant <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ date <dbl> 20230710, 20230710, 20230710, 20230710, 20230710…
$ img_genome_id <dbl> 3300079481, 3300079481, 3300079481, 3300079481, …
$ bin_quality <chr> "MQ", "HQ", "MQ", "MQ", "MQ", "MQ", "MQ", "MQ", …
$ bin_lineage <chr> "Bacteria; Acidobacteriota; Terriglobia", "Bacte…
$ gtdb_taxonomy_lineage <chr> "Bacteria; Acidobacteriota; Terriglobia; Terrigl…
$ domain <chr> "Bacteria", "Bacteria", "Bacteria", "Bacteria", …
$ phylum <chr> "Acidobacteriota", "Actinomycetota", "Actinomyce…
$ class <chr> "Terriglobia", "Thermoleophilia", "Thermoleophil…
$ order <chr> "Terriglobales", "Solirubrobacterales", "Solirub…
$ family <chr> "SbA1", "Solirubrobacteraceae", "Solirubrobacter…
$ genus <chr> "Sulfotelmatobacter", "Palsa-744", "Palsa-744", …
$ bin_methods <chr> "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0…
$ created_by <chr> "IMG_PIPELINE", "IMG_PIPELINE", "IMG_PIPELINE", …
$ date_added <date> 2025-02-03, 2025-02-03, 2025-02-03, 2025-02-03,…
$ bin_completeness <dbl> 95.61, 94.66, 51.78, 60.83, 81.54, 88.11, 77.19,…
$ bin_contamination <dbl> 4.96, 1.99, 9.42, 0.04, 0.18, 0.19, 4.25, 0.66, …
$ average_coverage <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ total_number_of_bases <dbl> 6486507, 3237739, 2712910, 2720369, 3539081, 395…
$ x5s_r_rna <dbl> 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, …
$ x16s_r_rna <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
$ x23s_r_rna <dbl> 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, …
$ t_rna_genes <dbl> 54, 51, 30, 10, 30, 29, 36, 21, 24, 37, 20, 21, …
$ gene_count <dbl> 5785, 3054, 3039, 2539, 3314, 3753, 5943, 2297, …
$ scaffold_count <dbl> 100, 52, 513, 152, 245, 91, 497, 188, 470, 227, …
$ gold_study_id <chr> "Gs0166454", "Gs0166454", "Gs0166454", "Gs016645…
$ community <chr> "Soil microbial communities", "Soil microbial co…
$ type <chr> "Soil", "Soil", "Soil", "Soil", "Soil", "Soil", …
We use gene_count as abundance metric and normalize by sample.
data_filtered <- mags_data %>%
filter(!is.na(genus), !is.na(gene_count)) %>%
select(genomicsSampleID, genus, gene_count, gtdb_taxonomy_lineage)
abundance_matrix <- data_filtered %>%
group_by(genomicsSampleID, genus) %>%
summarise(abundance = sum(gene_count), .groups = "drop") %>%
pivot_wider(names_from = genus, values_from = abundance, values_fill = 0)
abundance_matrix <- abundance_matrix %>%
mutate(across(-genomicsSampleID, ~ .x / sum(.x)))node_info <- graph_tbl %>%
as_tibble() %>%
rename(genus = name) %>%
left_join(data_filtered %>% select(genus, gtdb_taxonomy_lineage) %>% distinct(), by = "genus")
node_info <- node_info %>% distinct(genus, .keep_all = TRUE)
nodes <- data.frame(
id = node_info$genus,
label = node_info$genus,
title = paste("Taxonomy:", node_info$gtdb_taxonomy_lineage),
color = as.factor(node_info$module)
)
edges <- cor_edges %>% distinct(Var1, Var2, .keep_all = TRUE) %>%
rename(from = Var1, to = Var2, value = Freq)module_abundance <- graph_tbl %>%
as_tibble() %>%
select(name, module) %>%
inner_join(data_filtered, by = c("name" = "genus")) %>%
group_by(genomicsSampleID, module) %>%
summarise(module_abundance = sum(gene_count), .groups = "drop") %>%
pivot_wider(names_from = module, values_from = module_abundance, values_fill = 0)
soil_data <- mags_data %>% select(genomicsSampleID, soilMoisture, soilTemp, soilInWaterpH)
merged_data <- inner_join(module_abundance, soil_data, by = "genomicsSampleID")
cor_matrix_soil <- cor(merged_data %>% select(-genomicsSampleID), method = "spearman")
cor_long <- as.data.frame(as.table(cor_matrix_soil))
plot_ly(
x = unique(cor_long$Var2),
y = unique(cor_long$Var1),
z = matrix(cor_long$Freq, nrow = length(unique(cor_long$Var1)), byrow = TRUE),
type = "heatmap",
colorscale = "RdBu",
reversescale = TRUE
) %>%
layout(title = "Module vs Soil Chemistry Correlation Heatmap", xaxis = list(title = "Variables"), yaxis = list(title = "Modules"))This comprehensive report combines static and interactive visualizations, module detection, and environmental correlations for microbial co-occurrence analysis.