Created
December 19, 2025 19:47
-
-
Save dinovski/cdfb87014e6f4d7c2d6029b78aa5c400 to your computer and use it in GitHub Desktop.
Parse BLASTN query results and output stacked alignments using ggmsa in R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(ggmsa) | |
| library(ggplot2) | |
| library(patchwork) | |
| inPath ='' | |
| setwd(inPath) | |
| #infile = paste0(inPath, 'MB9W5P2C014-Alignment.txt') #1 query | |
| infile = paste0(inPath, 'MB56R3DS014-Alignment.txt') #2 queries | |
| lines <- readLines(infile) | |
| window_size <- 50 | |
| # ---------- helpers ---------- | |
| get_query_sections <- function(lines) { | |
| q_starts <- grep("^Query #[0-9]+:", lines) | |
| if (length(q_starts) == 0) stop("No 'Query #<n>:' sections found in file.") | |
| q_ends <- c(q_starts[-1] - 1, length(lines)) | |
| sections <- lapply(seq_along(q_starts), function(i) lines[q_starts[i]:q_ends[i]]) | |
| q_names <- sub("^Query #([0-9]+):.*$", "Query\\1", lines[q_starts]) | |
| names(sections) <- q_names | |
| sections | |
| } | |
| split_hits_by_score <- function(section_lines) { | |
| score_idx <- grep("^Score:", section_lines) # NOTE: colon! | |
| if (length(score_idx) == 0) { | |
| return(list(section_lines)) | |
| } | |
| starts <- score_idx | |
| ends <- c(score_idx[-1] - 1, length(section_lines)) | |
| lapply(seq_along(starts), function(i) section_lines[starts[i]:ends[i]]) | |
| } | |
| extract_gapped_alignment <- function(hit_lines) { | |
| q_lines <- grep("^Query[[:space:]]+[0-9]+", hit_lines, value = TRUE) | |
| s_lines <- grep("^Sbjct[[:space:]]+[0-9]+", hit_lines, value = TRUE) | |
| if (length(q_lines) == 0 || length(s_lines) == 0) return(NULL) | |
| q_parts <- vapply(strsplit(q_lines, "[[:space:]]+"), function(x) x[3], character(1)) | |
| s_parts <- vapply(strsplit(s_lines, "[[:space:]]+"), function(x) x[3], character(1)) | |
| q_seq <- paste0(q_parts, collapse = "") | |
| s_seq <- paste0(s_parts, collapse = "") | |
| if (nchar(q_seq) != nchar(s_seq)) { | |
| stop("Query/Sbjct gapped sequences have different lengths inside a hit block. Block boundaries may be wrong.") | |
| } | |
| list(human = q_seq, nhp = s_seq) | |
| } | |
| extract_identities_gaps <- function(hit_lines) { | |
| # Example BLAST line often looks like: | |
| # "Identities: 172/187(92%), Gaps: 1/187(0%)" | |
| id_line <- grep("Identities:", hit_lines, value = TRUE) | |
| if (length(id_line) == 0) return("Identities/Gaps: not found") | |
| # Keep it clean: just return "Identities: ... , Gaps: ..." | |
| # (strip leading/trailing spaces) | |
| x <- trimws(id_line[1]) | |
| # Sometimes there is extra text after; grab the part containing Identities and Gaps | |
| m <- regexpr("Identities:[^\\r\\n]*", x) | |
| if (m[1] == -1) return("Identities/Gaps: not found") | |
| substr(x, m[1], m[1] + attr(m, "match.length") - 1) | |
| } | |
| write_gapped_fasta <- function(path, human_seq, nhp_seq, human_name="Human", nhp_name="NHP") { | |
| writeLines(c( | |
| paste0(">", human_name), | |
| human_seq, | |
| paste0(">", nhp_name), | |
| nhp_seq | |
| ), con = path) | |
| } | |
| plot_stacked_windows_pdf <- function(aln_fa, out_prefix, hit_subtitle, | |
| win=50, | |
| color_scheme="Chemistry_NT", | |
| char_width=0.45, | |
| width=14, height_per_row=0.60) { | |
| fa <- readLines(aln_fa) | |
| seqs <- fa[!grepl("^>", fa)] | |
| aln_len <- nchar(seqs[1]) | |
| starts <- seq(1, aln_len, by = win) | |
| ends <- pmin(starts + win - 1, aln_len) | |
| plots <- lapply(seq_along(starts), function(i) { | |
| st <- starts[i]; en <- ends[i] | |
| ggmsa(aln_fa, start=st, end=en, | |
| color=color_scheme, | |
| seq_name=TRUE, | |
| char_width=char_width) + | |
| theme_void() + | |
| theme( | |
| legend.position="none", | |
| plot.margin = margin(2, 6, 2, 6), | |
| axis.text.y = element_text(size = 10) | |
| ) + | |
| ggtitle(paste0(st, "–", en, " / ", aln_len)) + | |
| theme(plot.title = element_text(size = 9, hjust = 0)) | |
| }) | |
| combined <- wrap_plots(plots, ncol = 1) + | |
| plot_annotation( | |
| title = out_prefix, | |
| subtitle = hit_subtitle, | |
| theme = theme( | |
| plot.title = element_text(size = 12, face = "bold"), | |
| plot.subtitle = element_text(size = 10), | |
| plot.margin = margin(8, 8, 8, 8) | |
| ) | |
| ) | |
| height <- max(2, length(plots) * height_per_row + 1.2) # + space for title/subtitle | |
| ggsave(paste0(out_prefix, "_stacked.pdf"), combined, width=width, height=height) | |
| combined | |
| } | |
| pad_alignment_to_window <- function(aln_fa, win=50, padded_fa=NULL) { | |
| fa <- readLines(aln_fa) | |
| headers <- fa[grepl("^>", fa)] | |
| seqs <- fa[!grepl("^>", fa)] | |
| if (length(seqs) != 2) stop("Expected exactly 2 sequences in ", aln_fa) | |
| aln_len <- nchar(seqs[1]) | |
| if (nchar(seqs[2]) != aln_len) stop("Sequence lengths differ in ", aln_fa) | |
| target_len <- ceiling(aln_len / win) * win | |
| pad_len <- target_len - aln_len | |
| if (pad_len > 0) { | |
| pad <- paste(rep("-", pad_len), collapse = "") | |
| seqs <- paste0(seqs, pad) | |
| } | |
| if (is.null(padded_fa)) { | |
| padded_fa <- sub("\\.fa(sta)?$", "", aln_fa) | |
| padded_fa <- paste0(padded_fa, "_padded.fa") | |
| } | |
| writeLines(c(headers[1], seqs[1], headers[2], seqs[2]), padded_fa) | |
| padded_fa | |
| } | |
| # ---------- main ---------- | |
| sections <- get_query_sections(lines) | |
| for (qname in names(sections)) { | |
| sec <- sections[[qname]] | |
| hit_blocks <- split_hits_by_score(sec) | |
| hit_plots <- list() | |
| hit_count <- 0 | |
| for (hb in hit_blocks) { | |
| aln <- extract_gapped_alignment(hb) | |
| if (is.null(aln)) next | |
| hit_count <- hit_count + 1 | |
| hit_meta <- extract_identities_gaps(hb) | |
| fa_name <- paste0(qname, "_hit", hit_count, "_aln.fa") | |
| write_gapped_fasta( | |
| fa_name, | |
| aln$human, aln$nhp, | |
| human_name = "Human", | |
| nhp_name = "NHP" | |
| ) | |
| hit_prefix <- paste0(qname, " — Hit ", hit_count) | |
| fa_padded <- pad_alignment_to_window(fa_name, win = window_size) | |
| hit_plots[[hit_count]] <- plot_stacked_windows_pdf( | |
| fa_padded, out_prefix = hit_prefix, hit_subtitle = hit_meta, win = window_size | |
| ) | |
| } | |
| if (hit_count == 0) { | |
| message("No alignments found for ", qname) | |
| next | |
| } | |
| # Combine all hits for this query into a single PDF (Hit1 on top, Hit2 below, etc.) | |
| if (hit_count > 1) { | |
| combined_q <- wrap_plots(hit_plots, ncol = 1) + | |
| plot_annotation( | |
| title = paste0(qname, " — all hits"), | |
| theme = theme(plot.title = element_text(size = 14, face = "bold")) | |
| ) | |
| ggsave(paste0(qname, "_ALL_hits_stacked.pdf"), combined_q, width=14, height=6 * hit_count + 2) | |
| } | |
| } | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment