R code
# Load Libraries
library(tidyverse)bioconductor and gwascat take a long time to install (20-30 min).data folder.Note if you are working on your own computer (and not RStudio Cloud) you will need to install Bioconductor before installing ggbio. See the directions for installing Bioconductor packages in the previous lab.
# Load Libraries
library(tidyverse)There are three separate code snippets here. The first is for the 23andMe file that we have been working with. The second is for your 23andME results file which has a hashtag on the column header line. The third is for the AncestryDNA resuts.
# Load SNP file and convert genotype and chromosome to factors
mySNPs <- read_tsv("data/23andMe_raw.tsv", comment = '#', col_names = FALSE) |>
setNames(c("rsid", "chromosome", "position", "genotype")) |>
mutate(chromosome = as.factor(chromosome)) |>
mutate(genotype = as.factor(genotype)) |>
mutate(position = as.numeric(position))# Load Ancestry DNA file
mySNPs <- read_tsv("data/AncestryDNA.txt", comment = '#',
col_types =
cols(
rsid = col_character(),
chromosome = col_factor(),
position = col_integer()
)) |>
mutate(genotype= paste(allele1, allele2, sep = '')) |>
select(-c(allele1, allele2)) |>
mutate(across(genotype, as_factor))mySNPs <- read_csv("data/MyHeritage_raw.csv", comment = '#') |>
# rename column labels to be consistent with 23andME and AncestryDNA
rename(rsid = RSID) |>
rename(chromosome = CHROMOSOME) |>
rename(position = POSITION) |>
rename(genotype = RESULT) |>
# change to factors
mutate(chromosome = as.factor(chromosome)) |>
mutate(genotype = as.factor(genotype))mySNPs <- read_csv("data/FamilyTreeDNA_raw.csv", comment = '#') |>
# rename column labels to be consistent with 23andME and AncestryDNA
rename(rsid = RSID) |>
rename(chromosome = CHROMOSOME) |>
rename(position = POSITION) |>
rename(genotype = RESULT) |>
# change to factors
mutate(chromosome = as.factor(chromosome)) |>
mutate(genotype = as.factor(genotype)) As we have seen this file holds just the rsid, position, chromosome and genotype
head(mySNPs)# A tibble: 6 × 4
rsid chromosome position genotype
<chr> <fct> <dbl> <fct>
1 rs548049170 1 69869 TT
2 rs4477212 1 82154 AA
3 rs9326622 1 567092 CC
4 rs116587930 1 727841 GG
5 rs3094315 1 752566 AA
6 rs3131972 1 752721 GG
These files are available in your data folder on Posit Cloud, in the file tab in CANVAS and through direct download from EBI as described in the previous lab. There is an issue with the 2024 download and one of the below graphs, but that graph works if you join the your SNPs with the 2023 gwas data set.
# Oct 2024 gwas table
# updated_gwas_data <- read_tsv("data/updated_gwas_data.tsv") |>
# mutate(CHR_POS = as.numeric(CHR_POS))
# Oct 2023 gwas table
updated_gwas_data <- read_tsv("data/gwas_data_2023.tsv") |>
mutate(CHR_POS = as.numeric(CHR_POS)) mySNPs_gwas_table <- inner_join(mySNPs, updated_gwas_data, by = c("rsid" = "SNPS"))Add columns for your alleles
mySNPs_gwas_table$risk_allele_clean <- str_sub(mySNPs_gwas_table$`STRONGEST.SNP.RISK.ALLELE`, -1)
mySNPs_gwas_table$my_allele_1 <- str_sub(mySNPs_gwas_table$genotype, 1, 1)
mySNPs_gwas_table$my_allele_2 <- str_sub(mySNPs_gwas_table$genotype, 2, 2)
mySNPs_gwas_table$have_risk_allele_count <- if_else(mySNPs_gwas_table$my_allele_1 == mySNPs_gwas_table$risk_allele_clean, 1, 0) + if_else(mySNPs_gwas_table$my_allele_2 == mySNPs_gwas_table$risk_allele_clean, 1, 0)# Load Libraries
library(DT)
library(gwascat) # bioconductor package
library(ggtext)
library(ggbio) # bioconductor package
library(GenomicRanges) # bioconductor package
data("CRC", package = "biovizBase")# Change CONTEXT to a factor
mySNPs_gwas_table$CONTEXT <- as.factor(mySNPs_gwas_table$CONTEXT)The below code uses the function fct_infreq from the forcats package in tidyverse to order the bars according to abundance.
ggplot(mySNPs_gwas_table, aes(x = fct_infreq(CONTEXT))) +
geom_bar() +
coord_flip() +
labs(y = "SNP Count", x = "Genome Context", title="Plot of Genome Context SNP Counts")ggplot(mySNPs_gwas_table, aes(x = "", fill = fct_infreq(CONTEXT))) +
geom_bar(width = 1) +
coord_polar(theta = "y", start=0) +
theme(legend.position = "bottom") +
labs(y = "SNP Count", fill = "Genome Context", title="Plot of Genome Context SNP Counts")With help from Daniel Roelfs’ tutorial below is some exmaples of making Manhattan plots with the GWASCAT/mySNP data.
# This is to create a continuous y-axis for plotting
data_cum <- mySNPs_gwas_table |>
group_by(chromosome) |>
summarise(max_bp = max(CHR_POS)) |>
mutate(bp_add = lag(cumsum(max_bp), default = 0)) |>
select(chromosome, bp_add)
mySNPs_gwas_Manh_data <- mySNPs_gwas_table |>
inner_join(data_cum, by = "chromosome") |>
mutate(bp_cum = CHR_POS + bp_add)# This is for printing the chromosome label in the center position of each region
axis_set <- mySNPs_gwas_Manh_data |>
group_by(chromosome) |>
summarize(center = mean(bp_cum))
# This is for creating the ylim for ploting
ylim <- mySNPs_gwas_Manh_data |>
filter(P.VALUE == min(P.VALUE)) |>
mutate(ylim = abs(floor(log10(P.VALUE))) + 2) |>
pull(ylim)
sig <- 5e-8# Making the Manhattan plot
ggplot(mySNPs_gwas_Manh_data, aes(x = bp_cum, y = -log10(P.VALUE),
color = as_factor(chromosome))) +
geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") +
geom_point(alpha = 0.75) +
scale_x_continuous(label = axis_set$chromosome, breaks = axis_set$center) +
scale_y_continuous(expand = c(0,0), limits = c(0, ylim)) +
scale_color_manual(values = rep(c("#276FBF", "#183059"), unique(length(axis_set$chromosome)))) +
scale_size_continuous(range = c(0.5,3)) +
labs(x = NULL,
y = "-log<sub>10</sub>(P.VALUE)") +
theme_minimal() +
theme(
legend.position = "none",
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
axis.title.y = element_markdown(),
axis.text.x = element_text(angle = 60, size = 8, vjust = 0.5)
)# Making the Manhattan plot
mySNPs_gwas_Manh_data_ED <- mySNPs_gwas_Manh_data |>
filter(DISEASE.TRAIT == "Type 2 diabetes")
mySNPs_gwas_Manh_data_ED$have_risk_allele_count <- as.factor(mySNPs_gwas_Manh_data_ED$have_risk_allele_count)
ggplot(mySNPs_gwas_Manh_data_ED, aes(x = bp_cum, y = -log10(P.VALUE),
color = have_risk_allele_count)) +
geom_point(alpha = 0.75) +
scale_x_continuous(label = axis_set$chromosome, breaks = axis_set$center) +
scale_y_continuous(expand = c(0,0), limits = c(0, ylim)) +
scale_size_continuous(range = c(0.5,3)) +
labs(x = NULL,
y = "-log<sub>10</sub>(P.VALUE)") +
theme(
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
axis.title.y = element_markdown(),
axis.text.x = element_text(angle = 60, size = 8, vjust = 0.5)
)ggbio is a Bioconductor package building on top of ggplot2. It provides a genomic visualization framework that facilitates graphing of SNP, RNAseq and other data in a genome context. ’ggbio` is available through Bioconductor https://bioconductor.org/packages/release/bioc/html/ggbio.html . There is a brand new tutorial (Nov 1) that could be used to build on the below example visualizations.
Note: This graph is not currently working with the 2024 gwas data set. Thus, skip unless you are interested in doing this with the 2023 gwascat file in our data folder.
mySNPs_gwas_table_1_22 <- mySNPs_gwas_table |>
filter(seqnames %in% 1:22 & have_risk_allele_count > 1) |>
distinct(rsid, .keep_all = TRUE)
mySNPs_gwas_plot_1_22 <- GRanges(seqnames = mySNPs_gwas_table_1_22$seqnames, ranges = mySNPs_gwas_table_1_22$start, seqlengths = seqlengths(hg19sub), seqinfo = seqinfo(hg19sub))
ggbio() +
circle(mySNPs_gwas_plot_1_22, geom = "rect", color = "steelblue") +
circle(hg19sub, geom = "scale", size = 2) +
circle(hg19sub, geom = "text", aes(label = seqnames), vjust = -2, size = 3)mySNPs_gwas_table_disease <- mySNPs_gwas_table |>
filter(seqnames %in% 1:22 & have_risk_allele_count > 1 & DISEASE.TRAIT == "Type 2 diabetes") |>
distinct(rsid, .keep_all = TRUE)
mySNPs_gwas_plot_disease <- GRanges(seqnames = mySNPs_gwas_table_disease$seqnames, ranges = mySNPs_gwas_table_disease$start, seqlengths = seqlengths(hg19sub), seqinfo = seqinfo(hg19sub))
ggbio() +
circle(mySNPs_gwas_plot_disease, geom = "rect", color = "maroon4") +
circle(mySNPs_gwas_plot_1_22, geom = "rect", color = "steelblue") +
circle(hg19sub, geom = "scale", size = 2) +
circle(hg19sub, geom = "text", aes(label = seqnames), vjust = -2, size = 3)Create a Manhattan plot using a different disease trait.
Using ggbio map a disease traut to a circularized human genome.
Be creative. Make a graph using another aspect of the GWAS data.
Special thanks to former MCB student Dr. Kirk MacKinnon for ideas of further using the GWASCAT data and visualizaitions.