Access NEON Data for Metagenomics

From "Access NEON Data for Metagenomics". See also:

* Update on the changing NEON microbial data
* Soil microbe metagenome sequences

Install packages

R code
# Install the NEON helper packages only when they are not already available,
# so re-running this chunk does not reinstall them every time.
for (pkg in c("neonUtilities", "neonOS")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
R code
library(neonUtilities)
library(tidyverse)
library(lubridate)
library(DT)
library(viridis)
R code
# Sites to download. Only HARV is active; swap in the commented vector to pull
# the five-site set instead. (Previously both vectors were assigned in a row,
# so the first one was always overwritten.)
# soilTrialSites <- c("BONA", "DEJU", "HEAL", "TOOL", "BARR")
soilTrialSites <- c("HARV")

# Download the expanded package of data product DP1.10086.001 for the selected
# sites, 2017-01 through 2019-12. check.size = FALSE skips the interactive
# download-size confirmation so the chunk can run unattended.
soilChem <- loadByProduct(
  dpID = "DP1.10086.001",
  site = soilTrialSites,
  startdate = "2017-01",
  enddate = "2019-12",
  package = "expanded",
  check.size = FALSE
)
R code
# Open the sls_metagenomicsPooling table from the soil download in the viewer.
soilChem$sls_metagenomicsPooling |> View()
R code
# Download the expanded package of data product DP1.10107.001. No site or date
# arguments are passed, so the request is not restricted to particular sites
# or months.
metaGdata <- loadByProduct(
  dpID = "DP1.10107.001",
  package = "expanded",
  check.size = FALSE
)
R code
# Pull the mms_metagenomeSequencing table out of the download and cache it as
# a CSV so later chunks can start from the file instead of re-downloading.
metaGdata_mms_metagenomeSequencing <- metaGdata$mms_metagenomeSequencing
metaGdata_mms_metagenomeSequencing |>
  write_csv("data/NEON_metadata/metaGdata_mms_metagenomeSequencing.csv")

This has HARV data collected up until 2022. The 2021 and 2022 data are not in the phyloNEON data.

R code
# Subset the metagenome sequencing table to HARV and save it.
# The original chunk piped the table straight into write_csv(), so the filter
# step was missing and the call referenced the not-yet-defined _HARV object.
# NOTE(review): HARV rows are selected by the "HARV_" prefix of dnaSampleID;
# confirm this matches the filter that was originally intended.
metaGdata_mms_metagenomeSequencing_HARV <- metaGdata$mms_metagenomeSequencing |>
  filter(str_detect(dnaSampleID, "^HARV_"))
write_csv(
  metaGdata_mms_metagenomeSequencing_HARV,
  "data/NEON_metadata/metaGdata_mms_metagenomeSequencing_HARV.csv"
)

Read in the saved file (all chunks above are set to eval = FALSE)

R code
# Reload the cached metagenome sequencing table from disk.
metaGdata_mms_metagenomeSequencing <-
  read_csv(file = "data/NEON_metadata/metaGdata_mms_metagenomeSequencing.csv")

Create data frame from dnaSampleID

R code
# Keep only the sample identifier column from the sequencing table.
metaGdata_dnaSampleID <- select(metaGdata_mms_metagenomeSequencing, dnaSampleID)

# Read in 2023 and 2024 data that is not in the metagenome data product yet
neon_ay23_jgi_samples <-
  read_csv(file = "data/NEON_metadata/neon_ay23_jgi_samples_soil.csv")
neon_ay24_jgi_samples <-
  read_csv(file = "data/NEON_metadata/neon_ay24_jgi_samples_soil.csv")

# Stack the three sources into one table of sample IDs.
# NOTE(review): rbind() needs identical columns in all inputs, so this assumes
# both CSVs hold just a dnaSampleID column -- confirm against the files.
neon_dnaSampleID <- rbind(
  metaGdata_dnaSampleID,
  neon_ay23_jgi_samples,
  neon_ay24_jgi_samples
)

Full table

R code
# Break each dnaSampleID into its components (site, plot, layer, subplot,
# date, sample type) by repeatedly marking a boundary with "_" and splitting
# on it. The remainder still to be parsed lives in dnaSampleID.sub.
neon_dnaSampleID_split <- neon_dnaSampleID |>
  # Site code is everything before the underscore; keep the original ID column.
  separate(
    dnaSampleID,
    c("dnaSampleID.site", "dnaSampleID.sub"),
    "_",
    remove = FALSE
  ) |>
  # Sample-type suffix: turn the dash before comp/COMP/GEN into "_" (and
  # upper-case "comp"), then split the suffix off.
  mutate(
    dnaSampleID.sub = dnaSampleID.sub |>
      str_replace("-comp", "_COMP") |>
      str_replace("-COMP", "_COMP") |>
      str_replace("-GEN", "_GEN")
  ) |>
  separate(dnaSampleID.sub, c("dnaSampleID.sub", "dnaSampleID.type"), "_") |>
  # Plot: everything before the first "-M" / "-O".
  mutate(
    dnaSampleID.sub = dnaSampleID.sub |>
      str_replace("-M", "_M") |>
      str_replace("-O", "_O")
  ) |>
  separate(dnaSampleID.sub, c("dnaSampleID.plot", "dnaSampleID.sub"), "_") |>
  # Layer: the M / O code that now leads the remainder.
  mutate(
    dnaSampleID.sub = dnaSampleID.sub |>
      str_replace("M-", "M_") |>
      str_replace("O-", "O_")
  ) |>
  separate(dnaSampleID.sub, c("dnaSampleID.layer", "dnaSampleID.sub"), "_") |>
  # Date: put "_" in front of the 8-digit 201x/202x date, swallowing the dash
  # before it when there is one. Matching the whole date at once (rather than
  # one replacement per year) needs no yearly maintenance and cannot fire on
  # digits that merely look like a year inside the preceding text.
  mutate(
    dnaSampleID.sub = str_replace(
      dnaSampleID.sub,
      "-?(20[12][0-9]{5})(?![0-9])",
      "_\\1"
    )
  ) |>
  separate(
    dnaSampleID.sub,
    c("dnaSampleID.subplot", "dnaSampleID.date"),
    "_"
  ) |>
  # plotID is site and plot joined back together; the parts are kept as well.
  unite(
    plotID,
    c(dnaSampleID.site, dnaSampleID.plot),
    sep = "_",
    remove = FALSE
  ) |>
  # Coerce to numeric first so anything that is not purely digits becomes NA,
  # then parse the yyyymmdd number as a Date.
  mutate(dnaSampleID.date = ymd(as.numeric(dnaSampleID.date)))

Full table

R code
# Add the parsed dnaSampleID components (site, plot, layer, subplot, date,
# sample type) to the full metagenome sequencing table. Boundaries are marked
# with "_" and split off one at a time; dnaSampleID.sub holds the remainder.
metaGdata_mms_metagenomeSequencing <- metaGdata_mms_metagenomeSequencing |>
  # Site code is everything before the underscore; keep the original ID column.
  separate(
    dnaSampleID,
    c("dnaSampleID.site", "dnaSampleID.sub"),
    "_",
    remove = FALSE
  ) |>
  # Sample-type suffix: turn the dash before comp/COMP/GEN into "_" (and
  # upper-case "comp"), then split the suffix off.
  mutate(
    dnaSampleID.sub = dnaSampleID.sub |>
      str_replace("-comp", "_COMP") |>
      str_replace("-COMP", "_COMP") |>
      str_replace("-GEN", "_GEN")
  ) |>
  separate(dnaSampleID.sub, c("dnaSampleID.sub", "dnaSampleID.type"), "_") |>
  # Plot: everything before the first "-M" / "-O".
  mutate(
    dnaSampleID.sub = dnaSampleID.sub |>
      str_replace("-M", "_M") |>
      str_replace("-O", "_O")
  ) |>
  separate(dnaSampleID.sub, c("dnaSampleID.plot", "dnaSampleID.sub"), "_") |>
  # Layer: the M / O code that now leads the remainder.
  mutate(
    dnaSampleID.sub = dnaSampleID.sub |>
      str_replace("M-", "M_") |>
      str_replace("O-", "O_")
  ) |>
  separate(dnaSampleID.sub, c("dnaSampleID.layer", "dnaSampleID.sub"), "_") |>
  # Date: put "_" in front of the 8-digit 201x/202x date, swallowing the dash
  # before it when there is one. The earlier first-match replacement of "201"
  # hit the middle of October-December 2020 dates (e.g. "20201015" became
  # "20_201015"), which corrupted both subplot and date; matching the whole
  # 8-digit date avoids that.
  mutate(
    dnaSampleID.sub = str_replace(
      dnaSampleID.sub,
      "-?(20[12][0-9]{5})(?![0-9])",
      "_\\1"
    )
  ) |>
  separate(
    dnaSampleID.sub,
    c("dnaSampleID.subplot", "dnaSampleID.date"),
    "_"
  ) |>
  # plotID is site and plot joined back together; the parts are kept as well.
  unite(
    plotID,
    c(dnaSampleID.site, dnaSampleID.plot),
    sep = "_",
    remove = FALSE
  ) |>
  # Coerce to numeric, then parse the yyyymmdd number as a Date. The numeric
  # step used to be written to a misspelled "dnaSampleID.data" column, leaving
  # a stray column behind; it now applies to dnaSampleID.date itself.
  mutate(dnaSampleID.date = ymd(as.numeric(dnaSampleID.date)))

Plot of HARV samples per plot per year

R code
# Heat map of the number of HARV metagenome samples per plot and year.
neon_dnaSampleID_split |>
  filter(dnaSampleID.site == "HARV") |>
  # One row per year/plot with its sample count.
  count(
    Year = lubridate::year(dnaSampleID.date),
    plot = dnaSampleID.plot,
    name = "metagenomes"
  ) |>
  # Make year/plot combinations without samples explicit zeros so every tile
  # is drawn. This replaces the pivot_wider() -> mutate_all(funs()) ->
  # pivot_longer() round trip, which relied on the deprecated funs().
  complete(Year, plot, fill = list(metagenomes = 0L)) |>
  ggplot(aes(x = Year, y = plot)) +
  geom_tile(aes(fill = metagenomes)) +
  scale_fill_viridis(discrete = FALSE, direction = -1) +
  scale_x_continuous(breaks = seq(2013, 2024, by = 1))

Plot of samples per plot per year at all sites

R code
# Heat map of metagenome samples per plot and year, one facet per site.
neon_dnaSampleID_split |>
  # One row per site/plot/year with its sample count. Year is kept as
  # character so the x axis stays discrete, as it was when the years came
  # back from column names.
  count(
    dnaSampleID.site,
    dnaSampleID.plot,
    Year = as.character(lubridate::year(dnaSampleID.date)),
    name = "metagenomes"
  ) |>
  # Give every existing site/plot pair a row for every observed year, with 0
  # where no samples exist. This replaces the pivot_wider() ->
  # mutate_all(funs()) -> pivot_longer() round trip, which relied on the
  # deprecated funs().
  complete(
    nesting(dnaSampleID.site, dnaSampleID.plot),
    Year,
    fill = list(metagenomes = 0L)
  ) |>
  ggplot(aes(x = Year, y = dnaSampleID.plot)) +
  geom_tile(aes(fill = metagenomes)) +
  scale_fill_viridis(discrete = FALSE, direction = -1) +
  facet_wrap(~dnaSampleID.site, scales = "free_y", ncol = 3) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))