R code
# Load Libraries
library(tidyverse)
bioconductor
and gwascat
take a long time to install (20-30 min).data
folder.Note if you are working on your own computer (and not RStudio Cloud) you will need to install Bioconductor before installing ggbio
. See the directions for installing Bioconductor packages in the previous lab.
# Load Libraries
library(tidyverse)
There are three separate code snippets here. The first is for the 23andMe file that we have been working with. The second is for your 23andME results file which has a hashtag on the column header line. The third is for the AncestryDNA resuts.
# Load SNP file and convert genotype and chromosome to factors
<- read_tsv("data/23andMe_raw.tsv", comment = '#', col_names = FALSE) |>
mySNPs setNames(c("rsid", "chromosome", "position", "genotype")) |>
mutate(chromosome = as.factor(chromosome)) |>
mutate(genotype = as.factor(genotype)) |>
mutate(position = as.numeric(position))
# Load Ancestry DNA file
<- read_tsv("data/AncestryDNA.txt", comment = '#',
mySNPs col_types =
cols(
rsid = col_character(),
chromosome = col_factor(),
position = col_integer()
|>
)) mutate(genotype= paste(allele1, allele2, sep = '')) |>
select(-c(allele1, allele2)) |>
mutate(across(genotype, as_factor))
<- read_csv("data/MyHeritage_raw.csv", comment = '#') |>
mySNPs # rename column labels to be consistent with 23andME and AncestryDNA
rename(rsid = RSID) |>
rename(chromosome = CHROMOSOME) |>
rename(position = POSITION) |>
rename(genotype = RESULT) |>
# change to factors
mutate(chromosome = as.factor(chromosome)) |>
mutate(genotype = as.factor(genotype))
<- read_csv("data/FamilyTreeDNA_raw.csv", comment = '#') |>
mySNPs # rename column labels to be consistent with 23andME and AncestryDNA
rename(rsid = RSID) |>
rename(chromosome = CHROMOSOME) |>
rename(position = POSITION) |>
rename(genotype = RESULT) |>
# change to factors
mutate(chromosome = as.factor(chromosome)) |>
mutate(genotype = as.factor(genotype))
As we have seen this file holds just the rsid, position, chromosome and genotype
head(mySNPs)
# A tibble: 6 × 4
rsid chromosome position genotype
<chr> <fct> <dbl> <fct>
1 rs548049170 1 69869 TT
2 rs4477212 1 82154 AA
3 rs9326622 1 567092 CC
4 rs116587930 1 727841 GG
5 rs3094315 1 752566 AA
6 rs3131972 1 752721 GG
These files are available in your data folder on Posit Cloud, in the file tab in CANVAS and through direct download from EBI as described in the previous lab. There is an issue with the 2024 download and one of the below graphs, but that graph works if you join the your SNPs with the 2023 gwas data set.
# Oct 2024 gwas table
# updated_gwas_data <- read_tsv("data/updated_gwas_data.tsv") |>
# mutate(CHR_POS = as.numeric(CHR_POS))
# Oct 2023 gwas table
<- read_tsv("data/gwas_data_2023.tsv") |>
updated_gwas_data mutate(CHR_POS = as.numeric(CHR_POS))
<- inner_join(mySNPs, updated_gwas_data, by = c("rsid" = "SNPS")) mySNPs_gwas_table
Add columns for your alleles
$risk_allele_clean <- str_sub(mySNPs_gwas_table$`STRONGEST.SNP.RISK.ALLELE`, -1)
mySNPs_gwas_table$my_allele_1 <- str_sub(mySNPs_gwas_table$genotype, 1, 1)
mySNPs_gwas_table$my_allele_2 <- str_sub(mySNPs_gwas_table$genotype, 2, 2)
mySNPs_gwas_table$have_risk_allele_count <- if_else(mySNPs_gwas_table$my_allele_1 == mySNPs_gwas_table$risk_allele_clean, 1, 0) + if_else(mySNPs_gwas_table$my_allele_2 == mySNPs_gwas_table$risk_allele_clean, 1, 0) mySNPs_gwas_table
# Load Libraries
library(DT)
library(gwascat) # bioconductor package
library(ggtext)
library(ggbio) # bioconductor package
library(GenomicRanges) # bioconductor package
data("CRC", package = "biovizBase")
# Change CONTEXT to a factor
$CONTEXT <- as.factor(mySNPs_gwas_table$CONTEXT) mySNPs_gwas_table
The below code uses the function fct_infreq
from the forcats
package in tidyverse
to order the bars according to abundance.
ggplot(mySNPs_gwas_table, aes(x = fct_infreq(CONTEXT))) +
geom_bar() +
coord_flip() +
labs(y = "SNP Count", x = "Genome Context", title="Plot of Genome Context SNP Counts")
ggplot(mySNPs_gwas_table, aes(x = "", fill = fct_infreq(CONTEXT))) +
geom_bar(width = 1) +
coord_polar(theta = "y", start=0) +
theme(legend.position = "bottom") +
labs(y = "SNP Count", fill = "Genome Context", title="Plot of Genome Context SNP Counts")
With help from Daniel Roelfs’ tutorial below is some exmaples of making Manhattan plots with the GWASCAT/mySNP data.
# This is to create a continuous y-axis for plotting
<- mySNPs_gwas_table |>
data_cum group_by(chromosome) |>
summarise(max_bp = max(CHR_POS)) |>
mutate(bp_add = lag(cumsum(max_bp), default = 0)) |>
select(chromosome, bp_add)
<- mySNPs_gwas_table |>
mySNPs_gwas_Manh_data inner_join(data_cum, by = "chromosome") |>
mutate(bp_cum = CHR_POS + bp_add)
# This is for printing the chromosome label in the center position of each region
<- mySNPs_gwas_Manh_data |>
axis_set group_by(chromosome) |>
summarize(center = mean(bp_cum))
# This is for creating the ylim for ploting
<- mySNPs_gwas_Manh_data |>
ylim filter(P.VALUE == min(P.VALUE)) |>
mutate(ylim = abs(floor(log10(P.VALUE))) + 2) |>
pull(ylim)
<- 5e-8 sig
# Making the Manhattan plot
ggplot(mySNPs_gwas_Manh_data, aes(x = bp_cum, y = -log10(P.VALUE),
color = as_factor(chromosome))) +
geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") +
geom_point(alpha = 0.75) +
scale_x_continuous(label = axis_set$chromosome, breaks = axis_set$center) +
scale_y_continuous(expand = c(0,0), limits = c(0, ylim)) +
scale_color_manual(values = rep(c("#276FBF", "#183059"), unique(length(axis_set$chromosome)))) +
scale_size_continuous(range = c(0.5,3)) +
labs(x = NULL,
y = "-log<sub>10</sub>(P.VALUE)") +
theme_minimal() +
theme(
legend.position = "none",
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
axis.title.y = element_markdown(),
axis.text.x = element_text(angle = 60, size = 8, vjust = 0.5)
)
# Making the Manhattan plot
<- mySNPs_gwas_Manh_data |>
mySNPs_gwas_Manh_data_ED filter(DISEASE.TRAIT == "Type 2 diabetes")
$have_risk_allele_count <- as.factor(mySNPs_gwas_Manh_data_ED$have_risk_allele_count)
mySNPs_gwas_Manh_data_ED
ggplot(mySNPs_gwas_Manh_data_ED, aes(x = bp_cum, y = -log10(P.VALUE),
color = have_risk_allele_count)) +
geom_point(alpha = 0.75) +
scale_x_continuous(label = axis_set$chromosome, breaks = axis_set$center) +
scale_y_continuous(expand = c(0,0), limits = c(0, ylim)) +
scale_size_continuous(range = c(0.5,3)) +
labs(x = NULL,
y = "-log<sub>10</sub>(P.VALUE)") +
theme(
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
axis.title.y = element_markdown(),
axis.text.x = element_text(angle = 60, size = 8, vjust = 0.5)
)
ggbio is a Bioconductor package building on top of ggplot2. It provides a genomic visualization framework that facilitates graphing of SNP, RNAseq and other data in a genome context. ’ggbio` is available through Bioconductor https://bioconductor.org/packages/release/bioc/html/ggbio.html . There is a brand new tutorial (Nov 1) that could be used to build on the below example visualizations.
Note: This graph is not currently working with the 2024 gwas data set. Thus, skip unless you are interested in doing this with the 2023 gwascat file in our data folder.
<- mySNPs_gwas_table |>
mySNPs_gwas_table_1_22 filter(seqnames %in% 1:22 & have_risk_allele_count > 1) |>
distinct(rsid, .keep_all = TRUE)
<- GRanges(seqnames = mySNPs_gwas_table_1_22$seqnames, ranges = mySNPs_gwas_table_1_22$start, seqlengths = seqlengths(hg19sub), seqinfo = seqinfo(hg19sub))
mySNPs_gwas_plot_1_22
ggbio() +
circle(mySNPs_gwas_plot_1_22, geom = "rect", color = "steelblue") +
circle(hg19sub, geom = "scale", size = 2) +
circle(hg19sub, geom = "text", aes(label = seqnames), vjust = -2, size = 3)
<- mySNPs_gwas_table |>
mySNPs_gwas_table_disease filter(seqnames %in% 1:22 & have_risk_allele_count > 1 & DISEASE.TRAIT == "Type 2 diabetes") |>
distinct(rsid, .keep_all = TRUE)
<- GRanges(seqnames = mySNPs_gwas_table_disease$seqnames, ranges = mySNPs_gwas_table_disease$start, seqlengths = seqlengths(hg19sub), seqinfo = seqinfo(hg19sub))
mySNPs_gwas_plot_disease
ggbio() +
circle(mySNPs_gwas_plot_disease, geom = "rect", color = "maroon4") +
circle(mySNPs_gwas_plot_1_22, geom = "rect", color = "steelblue") +
circle(hg19sub, geom = "scale", size = 2) +
circle(hg19sub, geom = "text", aes(label = seqnames), vjust = -2, size = 3)
Create a Manhattan plot using a different disease trait.
Using ggbio map a disease traut to a circularized human genome.
Be creative. Make a graph using another aspect of the GWAS data.
Special thanks to former MCB student Dr. Kirk MacKinnon for ideas of further using the GWASCAT data and visualizaitions.