Skip to content

Instantly share code, notes, and snippets.

@mikelove
Created December 18, 2025 14:09
Show Gist options
  • Select an option

  • Save mikelove/97dd70d89e9a5b32973b2dd1f6ee1d05 to your computer and use it in GitHub Desktop.

Select an option

Save mikelove/97dd70d89e9a5b32973b2dd1f6ee1d05 to your computer and use it in GitHub Desktop.
backup for variant table creation script for liver IGVF
library(here)
library(plyranges)
source(here("analysis/common/import.R"))
# This script imports variant-level summary statistics and adds QC variables
# e.g.
# - sufficient activity of at least one of the alleles
# - balanced representation of reference and
# alternate alleles in the input DNA
library(ggplot2)
if (FALSE) {
# negative control violin plot
e |>
filter(class == "element inactive control") |>
ggplot(aes(origin, log2FoldChange)) +
geom_violin() +
geom_jitter(width = 0.1, size = 1.5, alpha = 0.2) +
stat_summary(fun = median, geom = "point", size = 4, color = "red") +
ggtitle("Negative controls, activity across sublibraries")
}
neg_ctrl <- e |> filter(class == "element inactive control") |>
group_by(origin) |>
summarize(mean = median(log2FoldChange), sd = mad(log2FoldChange)) |>
tibble::column_to_rownames("origin")
### max activity (of the two alleles) ###
v <- v |> mutate(
max_activity = pmax(
log2(outputCountRef) - log2(inputCountRef),
log2(outputCountAlt) - log2(inputCountAlt)
),
# quantile version, using per sub-lib distributions:
max_activity_Q = pnorm(
q = max_activity,
mean = neg_ctrl[origin,"mean"],
sd = neg_ctrl[origin,"sd"]
)
)
if (FALSE) {
# plot max activity and its quantiled version, split by sub-library
v |> arrange(origin, max_activity) |>
ggplot(aes(max_activity, max_activity_Q, color=origin)) +
geom_line() +
coord_cartesian(xlim = c(-2, 1))
}
### adding columns: input stats and displayable columns for allelic sig
v <- v |> mutate(
delta_log10_input = abs(log10(inputCountRef) - log10(inputCountAlt)),
meanInput = 0.5 * (log10(inputCountRef) + log10(inputCountAlt)),
displayP = round(minusLog10PValue, 1),
displayQ = round(minusLog10QValue, 1),
)
if (FALSE) {
# examine delta log10 input diagnostic (y-axis), with qvalue on x-axis
v |>
filter(minusLog10QValue > 1) |>
filter(meanInput > 1) |>
ggplot(aes(
minusLog10QValue, delta_log10_input,
color = meanInput
)) +
geom_point() +
geom_hline(yintercept = log10(5))
# max activity (y-axis) and qvalue on x-axis
# faceting by sub-library
v |>
filter(minusLog10QValue > 1) |>
ggplot(aes(max_activity_Q, minusLog10QValue)) +
geom_point(size=.1) +
facet_wrap(~origin) +
coord_cartesian(xlim=c(0.5,1), ylim=c(0,8))
}
# how many variants both active and balanced in activity
v |>
mutate(
signif = minusLog10QValue > 1,
input_balance = delta_log10_input < log10(5),
active = max_activity_Q > 0.9,
) |>
group_by(origin) |>
count(signif, active, input_balance) |>
print(n=32)
# add significance column (FDR < 10%)
v <- v |>
mutate(
active = max_activity_Q > 0.9,
input_balance = delta_log10_input < log10(5),
signif = minusLog10QValue > 1 & active & input_balance
)
### add nearest gene ###
stopifnot(all.equal(v$SPDI, vranges$SPDI))
encode_dir <- "external_data/encode/HepG2_2016_GSE90322"
exprs <- read_delim(here(
encode_dir,
"GSE90322_norm_counts_TPM_GRCh38.p13_NCBI.tsv.gz"
))
exprs <- exprs |>
mutate(tpm = rowMeans(exprs[, 2:3])) |>
select(GeneID, tpm)
source(here("analysis/common/refseq_to_chr.R")) # for the refseq_to_chr() function
genes <- read_delim(here(
encode_dir,"Human.GRCh38.p13.annot.tsv.gz")) |>
filter(!stringr::str_detect(ChrStart,";")) |>
mutate(
seqnames = refseq_to_chr(ChrAcc),
start = as.numeric(ChrStart),
end = as.numeric(ChrStop)
) |>
filter(!is.na(start) & !is.na(seqnames)) |>
left_join(exprs) |>
as_granges()
# get the exons from UCSC known gene table
#library(TxDb.Hsapiens.UCSC.hg38.knownGene)
#ebg <- exonsBy(TxDb.Hsapiens.UCSC.hg38.knownGene, "gene")
#saveRDS(ebg, file=here("external_data/conservation/exons_by_gene.rds"))
ebg <- readRDS(here("external_data/conservation/exons_by_gene.rds"))
# nearest gene (body)
vranges_near <- vranges |>
join_nearest(
genes |>
filter(GeneType == "protein-coding") |>
select(nearestGene=Symbol, nearestTPM=tpm)
)
# nearest moderately expressed (TPM > 5) TSS
vranges_near_exprs_tss <- vranges |>
join_nearest(
genes |>
filter(GeneType == "protein-coding", tpm > 5) |>
anchor_5p() |>
mutate(width=1) |>
select(nearestExprsGene=Symbol, nearestExprsTPM=tpm)
)
stopifnot(all.equal(v$SPDI, vranges$SPDI))
# add all four to the variant table
v$nearestGene <- vranges_near$nearestGene
v$nearestTPM <- vranges_near$nearestTPM
v$nearestExprsGene <- vranges_near_exprs_tss$nearestExprsGene
v$nearestExprsTPM <- vranges_near_exprs_tss$nearestExprsTPM
### add conservation ###
# conservation information is from cactus (method) 241-way track from zoonomia
# PhyloP Basewise Conservation of Zoonomia 241 Placental Mammals
# https://genome.ucsc.edu/cgi-bin/hgTrackUi?db=hg38&g=cons241way
# downloaded from UCSC:
# https://hgdownload.soe.ucsc.edu/goldenPath/hg38/cactus241way/cactus241way.phyloP.bw
# values were extracted from this 9 GB file using
# rtracklayer::import.bw(con, which=GRanges, as="NumericList")
# then this was saved as a GRanges object
# compute max and mean in the variant-centered ranges, but first need to zero out exonic regions
phyloP <- readRDS(here("external_data/conservation/igvf_lib1_variant_phyloP.rds"))
phyloP <- updateObject(phyloP)
phyloP <- phyloP %>%
mutate(
in_exon = as.numeric(overlapsAny(., ebg)),
mean_cons = mean(phyloP241way),
max_cons = max(phyloP241way)
)
# some plots and diagnostics
if (FALSE) {
table(phyloP$in_exon)
hist(phyloP$mean_cons, breaks=50)
hist(phyloP$max_cons)
with(mcols(phyloP), table(mean_cons=cut(mean_cons, c(-2,0,1,2,10)), in_exon))
with(mcols(phyloP), table(max_cons=cut(max_cons, 0:9), in_exon))
with(mcols(phyloP), round(prop.table(table(
mean_cons=cut(mean_cons, c(-2,0,1,2,10)), in_exon),2),2))
with(mcols(phyloP), round(prop.table(table(
max_cons=cut(max_cons, 0:9), in_exon),2),2))
}
# cut off conservation scores if the range overlaps an exon (UCSC known genes)
phylo_chop <- phyloP |>
mutate(
mean_cons = mean_cons * (1 - in_exon),
max_cons = max_cons * (1 - in_exon)
)
# check that we've zeroed out conservation in exons
with(mcols(phylo_chop), table(cons=mean_cons > 1, in_exon))
with(mcols(phylo_chop), table(cons=max_cons > 8, in_exon))
phylo_dat <- phylo_chop |>
select(name, in_exon, mean_cons, max_cons, .drop_ranges=TRUE) |>
as_tibble() |>
rename(SPDI = name) |>
filter(!duplicated(SPDI))
# add conservation data to the variant table
v <- v |>
left_join(phylo_dat)
if (FALSE) {
# add repeat elements? not for now
library(AnnotationHub)
ah <- AnnotationHub()
query(ah, "Kundaje.GRCh38_unified_Excludable")
excl <- ah[["AH107305"]]
stopifnot(all(genome(excl) == "hg38"))
vranges %>%
mutate(near_excluded = pmin(count_overlaps(., excl, maxgap = 150), 1))
}
# cleaning
v <- v |> rename(chr = seqnames, sublib = origin)
v <- v |> relocate(name, sequence, .after = last_col())
# make a TSV for easy viewing of variant results
write_tsv(v, file=here("analysis/common/igvf_lib1_variant_table.tsv"))
# also as a tibble
saveRDS(v, file=here("analysis/common/igvf_lib1_variant_tibble.rds"))
# save the GRanges object
stopifnot(all.equal(v$SPDI, vranges$SPDI))
mcols(vranges) <- v |> select(-c(chr, start, width))
saveRDS(vranges, file=here("analysis/common/igvf_lib1_variant_GRanges.rds"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment