Created
December 18, 2025 14:09
-
-
Save mikelove/97dd70d89e9a5b32973b2dd1f6ee1d05 to your computer and use it in GitHub Desktop.
backup for variant table creation script for liver IGVF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(here) | |
| library(plyranges) | |
| source(here("analysis/common/import.R")) | |
| # This script imports variant-level summary statistics and adds QC variables | |
| # e.g. | |
| # - sufficient activity of at least one of the alleles | |
| # - balanced representation of reference and | |
| # alternate alleles in the input DNA | |
| library(ggplot2) | |
| if (FALSE) { | |
| # negative control violin plot | |
| e |> | |
| filter(class == "element inactive control") |> | |
| ggplot(aes(origin, log2FoldChange)) + | |
| geom_violin() + | |
| geom_jitter(width = 0.1, size = 1.5, alpha = 0.2) + | |
| stat_summary(fun = median, geom = "point", size = 4, color = "red") + | |
| ggtitle("Negative controls, activity across sublibraries") | |
| } | |
| neg_ctrl <- e |> filter(class == "element inactive control") |> | |
| group_by(origin) |> | |
| summarize(mean = median(log2FoldChange), sd = mad(log2FoldChange)) |> | |
| tibble::column_to_rownames("origin") | |
| ### max activity (of the two alleles) ### | |
| v <- v |> mutate( | |
| max_activity = pmax( | |
| log2(outputCountRef) - log2(inputCountRef), | |
| log2(outputCountAlt) - log2(inputCountAlt) | |
| ), | |
| # quantile version, using per sub-lib distributions: | |
| max_activity_Q = pnorm( | |
| q = max_activity, | |
| mean = neg_ctrl[origin,"mean"], | |
| sd = neg_ctrl[origin,"sd"] | |
| ) | |
| ) | |
| if (FALSE) { | |
| # plot max activity and its quantiled version, split by sub-library | |
| v |> arrange(origin, max_activity) |> | |
| ggplot(aes(max_activity, max_activity_Q, color=origin)) + | |
| geom_line() + | |
| coord_cartesian(xlim = c(-2, 1)) | |
| } | |
| ### adding columns: input stats and displayable columns for allelic sig | |
| v <- v |> mutate( | |
| delta_log10_input = abs(log10(inputCountRef) - log10(inputCountAlt)), | |
| meanInput = 0.5 * (log10(inputCountRef) + log10(inputCountAlt)), | |
| displayP = round(minusLog10PValue, 1), | |
| displayQ = round(minusLog10QValue, 1), | |
| ) | |
| if (FALSE) { | |
| # examine delta log10 input diagnostic (y-axis), with qvalue on x-axis | |
| v |> | |
| filter(minusLog10QValue > 1) |> | |
| filter(meanInput > 1) |> | |
| ggplot(aes( | |
| minusLog10QValue, delta_log10_input, | |
| color = meanInput | |
| )) + | |
| geom_point() + | |
| geom_hline(yintercept = log10(5)) | |
| # max activity (y-axis) and qvalue on x-axis | |
| # faceting by sub-library | |
| v |> | |
| filter(minusLog10QValue > 1) |> | |
| ggplot(aes(max_activity_Q, minusLog10QValue)) + | |
| geom_point(size=.1) + | |
| facet_wrap(~origin) + | |
| coord_cartesian(xlim=c(0.5,1), ylim=c(0,8)) | |
| } | |
| # how many variants both active and balanced in activity | |
| v |> | |
| mutate( | |
| signif = minusLog10QValue > 1, | |
| input_balance = delta_log10_input < log10(5), | |
| active = max_activity_Q > 0.9, | |
| ) |> | |
| group_by(origin) |> | |
| count(signif, active, input_balance) |> | |
| print(n=32) | |
| # add significance column (FDR < 10%) | |
| v <- v |> | |
| mutate( | |
| active = max_activity_Q > 0.9, | |
| input_balance = delta_log10_input < log10(5), | |
| signif = minusLog10QValue > 1 & active & input_balance | |
| ) | |
| ### add nearest gene ### | |
| stopifnot(all.equal(v$SPDI, vranges$SPDI)) | |
| encode_dir <- "external_data/encode/HepG2_2016_GSE90322" | |
| exprs <- read_delim(here( | |
| encode_dir, | |
| "GSE90322_norm_counts_TPM_GRCh38.p13_NCBI.tsv.gz" | |
| )) | |
| exprs <- exprs |> | |
| mutate(tpm = rowMeans(exprs[, 2:3])) |> | |
| select(GeneID, tpm) | |
| source(here("analysis/common/refseq_to_chr.R")) # for the refseq_to_chr() function | |
| genes <- read_delim(here( | |
| encode_dir,"Human.GRCh38.p13.annot.tsv.gz")) |> | |
| filter(!stringr::str_detect(ChrStart,";")) |> | |
| mutate( | |
| seqnames = refseq_to_chr(ChrAcc), | |
| start = as.numeric(ChrStart), | |
| end = as.numeric(ChrStop) | |
| ) |> | |
| filter(!is.na(start) & !is.na(seqnames)) |> | |
| left_join(exprs) |> | |
| as_granges() | |
| # get the exons from UCSC known gene table | |
| #library(TxDb.Hsapiens.UCSC.hg38.knownGene) | |
| #ebg <- exonsBy(TxDb.Hsapiens.UCSC.hg38.knownGene, "gene") | |
| #saveRDS(ebg, file=here("external_data/conservation/exons_by_gene.rds")) | |
| ebg <- readRDS(here("external_data/conservation/exons_by_gene.rds")) | |
| # nearest gene (body) | |
| vranges_near <- vranges |> | |
| join_nearest( | |
| genes |> | |
| filter(GeneType == "protein-coding") |> | |
| select(nearestGene=Symbol, nearestTPM=tpm) | |
| ) | |
| # nearest moderately expressed (TPM > 5) TSS | |
| vranges_near_exprs_tss <- vranges |> | |
| join_nearest( | |
| genes |> | |
| filter(GeneType == "protein-coding", tpm > 5) |> | |
| anchor_5p() |> | |
| mutate(width=1) |> | |
| select(nearestExprsGene=Symbol, nearestExprsTPM=tpm) | |
| ) | |
| stopifnot(all.equal(v$SPDI, vranges$SPDI)) | |
| # add all four to the variant table | |
| v$nearestGene <- vranges_near$nearestGene | |
| v$nearestTPM <- vranges_near$nearestTPM | |
| v$nearestExprsGene <- vranges_near_exprs_tss$nearestExprsGene | |
| v$nearestExprsTPM <- vranges_near_exprs_tss$nearestExprsTPM | |
| ### add conservation ### | |
| # conservation information is from cactus (method) 241-way track from zoonomia | |
| # PhyloP Basewise Conservation of Zoonomia 241 Placental Mammals | |
| # https://genome.ucsc.edu/cgi-bin/hgTrackUi?db=hg38&g=cons241way | |
| # downloaded from UCSC: | |
| # https://hgdownload.soe.ucsc.edu/goldenPath/hg38/cactus241way/cactus241way.phyloP.bw | |
| # values were extracted from this 9 GB file using | |
| # rtracklayer::import.bw(con, which=GRanges, as="NumericList") | |
| # then this was saved as a GRanges object | |
| # compute max and mean in the variant-centered ranges, but first need to zero out exonic regions | |
| phyloP <- readRDS(here("external_data/conservation/igvf_lib1_variant_phyloP.rds")) | |
| phyloP <- updateObject(phyloP) | |
| phyloP <- phyloP %>% | |
| mutate( | |
| in_exon = as.numeric(overlapsAny(., ebg)), | |
| mean_cons = mean(phyloP241way), | |
| max_cons = max(phyloP241way) | |
| ) | |
| # some plots and diagnostics | |
| if (FALSE) { | |
| table(phyloP$in_exon) | |
| hist(phyloP$mean_cons, breaks=50) | |
| hist(phyloP$max_cons) | |
| with(mcols(phyloP), table(mean_cons=cut(mean_cons, c(-2,0,1,2,10)), in_exon)) | |
| with(mcols(phyloP), table(max_cons=cut(max_cons, 0:9), in_exon)) | |
| with(mcols(phyloP), round(prop.table(table( | |
| mean_cons=cut(mean_cons, c(-2,0,1,2,10)), in_exon),2),2)) | |
| with(mcols(phyloP), round(prop.table(table( | |
| max_cons=cut(max_cons, 0:9), in_exon),2),2)) | |
| } | |
| # cut off conservation scores if the range overlaps an exon (UCSC known genes) | |
| phylo_chop <- phyloP |> | |
| mutate( | |
| mean_cons = mean_cons * (1 - in_exon), | |
| max_cons = max_cons * (1 - in_exon) | |
| ) | |
| # check that we've zeroed out conservation in exons | |
| with(mcols(phylo_chop), table(cons=mean_cons > 1, in_exon)) | |
| with(mcols(phylo_chop), table(cons=max_cons > 8, in_exon)) | |
| phylo_dat <- phylo_chop |> | |
| select(name, in_exon, mean_cons, max_cons, .drop_ranges=TRUE) |> | |
| as_tibble() |> | |
| rename(SPDI = name) |> | |
| filter(!duplicated(SPDI)) | |
| # add conservation data to the variant table | |
| v <- v |> | |
| left_join(phylo_dat) | |
| if (FALSE) { | |
| # add repeat elements? not for now | |
| library(AnnotationHub) | |
| ah <- AnnotationHub() | |
| query(ah, "Kundaje.GRCh38_unified_Excludable") | |
| excl <- ah[["AH107305"]] | |
| stopifnot(all(genome(excl) == "hg38")) | |
| vranges %>% | |
| mutate(near_excluded = pmin(count_overlaps(., excl, maxgap = 150), 1)) | |
| } | |
| # cleaning | |
| v <- v |> rename(chr = seqnames, sublib = origin) | |
| v <- v |> relocate(name, sequence, .after = last_col()) | |
| # make a TSV for easy viewing of variant results | |
| write_tsv(v, file=here("analysis/common/igvf_lib1_variant_table.tsv")) | |
| # also as a tibble | |
| saveRDS(v, file=here("analysis/common/igvf_lib1_variant_tibble.rds")) | |
| # save the GRanges object | |
| stopifnot(all.equal(v$SPDI, vranges$SPDI)) | |
| mcols(vranges) <- v |> select(-c(chr, start, width)) | |
| saveRDS(vranges, file=here("analysis/common/igvf_lib1_variant_GRanges.rds")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment