R code
df <- read_csv("NEON_soilMAGs_soilChem.csv")
df <- df %>%
filter(!is.na(genomicsSampleID)) %>%
mutate(siteID = site_ID) %>%
select(siteID, phylum, soilMoisture, soilTemp, soilInWaterpH, soilInCaClpH)Tutorial
December 3, 2025
This tutorial aggregates data by siteID instead of sampleID and includes: - Taxonomic profiling - Alpha and beta diversity - Ordination (PCoA, NMDS) - Constrained ordination (CCA, RDA) with soil chemistry - Color-coded biplots and interactive plots - Variance explained and significance tests
Explanation: Aggregate MAG counts and soil chemistry by siteID.
Explanation: Summarize phylum-level counts per site.
tax_profile <- df %>%
group_by(siteID, phylum) %>%
summarise(count = n(), .groups = "drop") %>%
mutate(rel_abundance = count / sum(count))
ggplot(tax_profile, aes(x = siteID, y = rel_abundance, fill = phylum)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(title = "Taxonomic Profile by Site", y = "Relative Abundance") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), legend.position = "right")Explanation: Compute diversity metrics at site level.
abundance_matrix <- tax_profile %>%
pivot_wider(names_from = phylum, values_from = count, values_fill = 0)
abundance_matrix <- abundance_matrix %>% distinct(siteID, .keep_all = TRUE)
abundance_matrix <- abundance_matrix %>% filter(!is.na(siteID))
rn <- abundance_matrix$siteID
abundance_matrix <- abundance_matrix %>% select(-siteID)
abundance_matrix <- as.data.frame(lapply(abundance_matrix, as.numeric))
rownames(abundance_matrix) <- rn
abundance_matrix <- abundance_matrix[rowSums(abundance_matrix) > 0, ]
shannon <- diversity(abundance_matrix, index = "shannon")
simpson <- diversity(abundance_matrix, index = "simpson")
richness <- specnumber(abundance_matrix)
alpha_div <- tibble(siteID = rownames(abundance_matrix), Shannon = shannon, Simpson = simpson, Richness = richness)
alpha_div# A tibble: 27 × 4
siteID Shannon Simpson Richness
<chr> <dbl> <dbl> <int>
1 BONA 0.00173 0.000359 2
2 CLBJ 0.00173 0.000359 2
3 CPER 0.00173 0.000359 2
4 DCFS 0.00173 0.000359 2
5 DEJU 0.694 0.500 3
6 DSNY 0.00173 0.000359 2
7 GUAN 0.00173 0.000359 2
8 HARV 0.00173 0.000359 2
9 KONZ 0.00173 0.000359 2
10 LENO 0.00173 0.000359 2
# ℹ 17 more rows
Explanation: Bray-Curtis dissimilarity between sites.
BONA CLBJ CPER DCFS DEJU
BONA 0.00000000 0.06930693 0.7419355 0.4210526 0.6223501
CLBJ 0.06930693 0.00000000 0.7090909 0.3623188 0.6630573
CPER 0.74193548 0.70909091 0.0000000 0.4666667 0.9333276
DCFS 0.42105263 0.36231884 0.4666667 0.0000000 0.8267575
DEJU 0.62235014 0.66305730 0.9333276 0.8267575 0.0000000
Explanation: Visualize site-level differences.
Square root transformation
Wisconsin double standardization
Run 0 stress 1.219798e-16
Run 1 stress 0
... New best solution
... Procrustes: rmse 0.08431368 max resid 0.2164455
Run 2 stress 0.005368253
Run 3 stress 3.036322e-05
... Procrustes: rmse 0.1075216 max resid 0.1751653
Run 4 stress 9.836936e-05
... Procrustes: rmse 0.1207031 max resid 0.1831196
Run 5 stress 0
... Procrustes: rmse 0.1444153 max resid 0.3680746
Run 6 stress 8.45748e-05
... Procrustes: rmse 0.1232783 max resid 0.3293707
Run 7 stress 9.932887e-05
... Procrustes: rmse 0.1059297 max resid 0.1652366
Run 8 stress 0
... Procrustes: rmse 0.1599825 max resid 0.3953014
Run 9 stress 0
... Procrustes: rmse 0.1540484 max resid 0.4326244
Run 10 stress 0.004081708
Run 11 stress 0
... Procrustes: rmse 0.09780662 max resid 0.1555036
Run 12 stress 0
... Procrustes: rmse 0.1569152 max resid 0.4531087
Run 13 stress 0
... Procrustes: rmse 0.1328415 max resid 0.3239635
Run 14 stress 0
... Procrustes: rmse 0.1280671 max resid 0.2942607
Run 15 stress 0.007794782
Run 16 stress 0
... Procrustes: rmse 0.1082688 max resid 0.2118214
Run 17 stress 0
... Procrustes: rmse 0.09806647 max resid 0.1613522
Run 18 stress 0
... Procrustes: rmse 0.08747574 max resid 0.1432149
Run 19 stress 0
... Procrustes: rmse 0.1318983 max resid 0.2894873
Run 20 stress 0
... Procrustes: rmse 0.1281027 max resid 0.2842546
*** Best solution was not repeated -- monoMDS stopping criteria:
3: no. of iterations >= maxit
17: stress < smin
Explanation: Aggregate soil chemistry by site and visualize environmental influence.
soil_data <- df %>% group_by(siteID) %>% summarise(across(c(soilMoisture, soilTemp, soilInWaterpH, soilInCaClpH), mean, na.rm = TRUE))
soil_data <- soil_data %>% filter(siteID %in% rownames(abundance_matrix))
soil_data <- column_to_rownames(soil_data, "siteID")
soil_data <- as.data.frame(lapply(soil_data, as.numeric))
rownames(soil_data) <- rownames(abundance_matrix)
variable_types <- tibble(variable = c("soilMoisture", "soilTemp", "soilInWaterpH", "soilInCaClpH"),
type = c("Moisture", "Temperature", "pH", "pH"))
# CCA
cca_model <- cca(abundance_matrix ~ ., data = soil_data)
cca_var <- summary(cca_model)$concont$importance[2,]
cca_sites <- scores(cca_model, display = "sites")
cca_env <- scores(cca_model, display = "bp")
cca_df <- as.data.frame(cca_sites)
cca_df$siteID <- rownames(cca_df)
cca_env_df <- as.data.frame(cca_env)
cca_env_df$variable <- rownames(cca_env_df)
cca_env_df <- left_join(cca_env_df, variable_types, by = "variable")
p_cca <- ggplot() +
geom_point(data = cca_df, aes(x = CCA1, y = CCA2, text = siteID), color = "blue") +
geom_segment(data = cca_env_df, aes(x = 0, y = 0, xend = CCA1, yend = CCA2, color = type, text = variable),
arrow = arrow(length = unit(0.3, "cm"))) +
geom_text(data = cca_env_df, aes(x = CCA1, y = CCA2, label = variable), vjust = -0.5) +
scale_color_manual(values = c("Moisture" = "blue", "Temperature" = "orange", "pH" = "green")) +
labs(title = paste("CCA Interactive Biplot (Variance Explained: CCA1 =", round(cca_var[1]*100, 1), "%, CCA2 =", round(cca_var[2]*100, 1), "% )"),
color = "Variable Type") + theme_minimal()
ggplotly(p_cca, tooltip = c("text"))# RDA
rda_model <- rda(abundance_matrix ~ ., data = soil_data)
rda_var <- summary(rda_model)$cont$importance[2,]
rda_sites <- scores(rda_model, display = "sites")
rda_env <- scores(rda_model, display = "bp")
rda_df <- as.data.frame(rda_sites)
rda_df$siteID <- rownames(rda_df)
rda_env_df <- as.data.frame(rda_env)
rda_env_df$variable <- rownames(rda_env_df)
rda_env_df <- left_join(rda_env_df, variable_types, by = "variable")
p_rda <- ggplot() +
geom_point(data = rda_df, aes(x = RDA1, y = RDA2, text = siteID), color = "darkgreen") +
geom_segment(data = rda_env_df, aes(x = 0, y = 0, xend = RDA1, yend = RDA2, color = type, text = variable),
arrow = arrow(length = unit(0.3, "cm"))) +
geom_text(data = rda_env_df, aes(x = RDA1, y = RDA2, label = variable), vjust = -0.5) +
scale_color_manual(values = c("Moisture" = "blue", "Temperature" = "orange", "pH" = "green")) +
labs(title = paste("RDA Interactive Biplot (Variance Explained: RDA1 =", round(rda_var[1]*100, 1), "%, RDA2 =", round(rda_var[2]*100, 1), "% )"),
color = "Variable Type") + theme_minimal()
ggplotly(p_rda, tooltip = c("text"))