Skip to content

Instantly share code, notes, and snippets.

@dinovski
Created December 19, 2025 19:47
Show Gist options
  • Select an option

  • Save dinovski/cdfb87014e6f4d7c2d6029b78aa5c400 to your computer and use it in GitHub Desktop.

Select an option

Save dinovski/cdfb87014e6f4d7c2d6029b78aa5c400 to your computer and use it in GitHub Desktop.
Parse BLASTN query results and output stacked alignments using ggmsa in R
library(ggmsa)
library(ggplot2)
library(patchwork)
inPath =''
setwd(inPath)
#infile = paste0(inPath, 'MB9W5P2C014-Alignment.txt') #1 query
infile = paste0(inPath, 'MB56R3DS014-Alignment.txt') #2 queries
lines <- readLines(infile)
window_size <- 50
# ---------- helpers ----------
get_query_sections <- function(lines) {
q_starts <- grep("^Query #[0-9]+:", lines)
if (length(q_starts) == 0) stop("No 'Query #<n>:' sections found in file.")
q_ends <- c(q_starts[-1] - 1, length(lines))
sections <- lapply(seq_along(q_starts), function(i) lines[q_starts[i]:q_ends[i]])
q_names <- sub("^Query #([0-9]+):.*$", "Query\\1", lines[q_starts])
names(sections) <- q_names
sections
}
split_hits_by_score <- function(section_lines) {
score_idx <- grep("^Score:", section_lines) # NOTE: colon!
if (length(score_idx) == 0) {
return(list(section_lines))
}
starts <- score_idx
ends <- c(score_idx[-1] - 1, length(section_lines))
lapply(seq_along(starts), function(i) section_lines[starts[i]:ends[i]])
}
extract_gapped_alignment <- function(hit_lines) {
q_lines <- grep("^Query[[:space:]]+[0-9]+", hit_lines, value = TRUE)
s_lines <- grep("^Sbjct[[:space:]]+[0-9]+", hit_lines, value = TRUE)
if (length(q_lines) == 0 || length(s_lines) == 0) return(NULL)
q_parts <- vapply(strsplit(q_lines, "[[:space:]]+"), function(x) x[3], character(1))
s_parts <- vapply(strsplit(s_lines, "[[:space:]]+"), function(x) x[3], character(1))
q_seq <- paste0(q_parts, collapse = "")
s_seq <- paste0(s_parts, collapse = "")
if (nchar(q_seq) != nchar(s_seq)) {
stop("Query/Sbjct gapped sequences have different lengths inside a hit block. Block boundaries may be wrong.")
}
list(human = q_seq, nhp = s_seq)
}
extract_identities_gaps <- function(hit_lines) {
# Example BLAST line often looks like:
# "Identities: 172/187(92%), Gaps: 1/187(0%)"
id_line <- grep("Identities:", hit_lines, value = TRUE)
if (length(id_line) == 0) return("Identities/Gaps: not found")
# Keep it clean: just return "Identities: ... , Gaps: ..."
# (strip leading/trailing spaces)
x <- trimws(id_line[1])
# Sometimes there is extra text after; grab the part containing Identities and Gaps
m <- regexpr("Identities:[^\\r\\n]*", x)
if (m[1] == -1) return("Identities/Gaps: not found")
substr(x, m[1], m[1] + attr(m, "match.length") - 1)
}
write_gapped_fasta <- function(path, human_seq, nhp_seq, human_name="Human", nhp_name="NHP") {
writeLines(c(
paste0(">", human_name),
human_seq,
paste0(">", nhp_name),
nhp_seq
), con = path)
}
plot_stacked_windows_pdf <- function(aln_fa, out_prefix, hit_subtitle,
win=50,
color_scheme="Chemistry_NT",
char_width=0.45,
width=14, height_per_row=0.60) {
fa <- readLines(aln_fa)
seqs <- fa[!grepl("^>", fa)]
aln_len <- nchar(seqs[1])
starts <- seq(1, aln_len, by = win)
ends <- pmin(starts + win - 1, aln_len)
plots <- lapply(seq_along(starts), function(i) {
st <- starts[i]; en <- ends[i]
ggmsa(aln_fa, start=st, end=en,
color=color_scheme,
seq_name=TRUE,
char_width=char_width) +
theme_void() +
theme(
legend.position="none",
plot.margin = margin(2, 6, 2, 6),
axis.text.y = element_text(size = 10)
) +
ggtitle(paste0(st, "–", en, " / ", aln_len)) +
theme(plot.title = element_text(size = 9, hjust = 0))
})
combined <- wrap_plots(plots, ncol = 1) +
plot_annotation(
title = out_prefix,
subtitle = hit_subtitle,
theme = theme(
plot.title = element_text(size = 12, face = "bold"),
plot.subtitle = element_text(size = 10),
plot.margin = margin(8, 8, 8, 8)
)
)
height <- max(2, length(plots) * height_per_row + 1.2) # + space for title/subtitle
ggsave(paste0(out_prefix, "_stacked.pdf"), combined, width=width, height=height)
combined
}
pad_alignment_to_window <- function(aln_fa, win=50, padded_fa=NULL) {
fa <- readLines(aln_fa)
headers <- fa[grepl("^>", fa)]
seqs <- fa[!grepl("^>", fa)]
if (length(seqs) != 2) stop("Expected exactly 2 sequences in ", aln_fa)
aln_len <- nchar(seqs[1])
if (nchar(seqs[2]) != aln_len) stop("Sequence lengths differ in ", aln_fa)
target_len <- ceiling(aln_len / win) * win
pad_len <- target_len - aln_len
if (pad_len > 0) {
pad <- paste(rep("-", pad_len), collapse = "")
seqs <- paste0(seqs, pad)
}
if (is.null(padded_fa)) {
padded_fa <- sub("\\.fa(sta)?$", "", aln_fa)
padded_fa <- paste0(padded_fa, "_padded.fa")
}
writeLines(c(headers[1], seqs[1], headers[2], seqs[2]), padded_fa)
padded_fa
}
# ---------- main ----------
sections <- get_query_sections(lines)
for (qname in names(sections)) {
sec <- sections[[qname]]
hit_blocks <- split_hits_by_score(sec)
hit_plots <- list()
hit_count <- 0
for (hb in hit_blocks) {
aln <- extract_gapped_alignment(hb)
if (is.null(aln)) next
hit_count <- hit_count + 1
hit_meta <- extract_identities_gaps(hb)
fa_name <- paste0(qname, "_hit", hit_count, "_aln.fa")
write_gapped_fasta(
fa_name,
aln$human, aln$nhp,
human_name = "Human",
nhp_name = "NHP"
)
hit_prefix <- paste0(qname, " — Hit ", hit_count)
fa_padded <- pad_alignment_to_window(fa_name, win = window_size)
hit_plots[[hit_count]] <- plot_stacked_windows_pdf(
fa_padded, out_prefix = hit_prefix, hit_subtitle = hit_meta, win = window_size
)
}
if (hit_count == 0) {
message("No alignments found for ", qname)
next
}
# Combine all hits for this query into a single PDF (Hit1 on top, Hit2 below, etc.)
if (hit_count > 1) {
combined_q <- wrap_plots(hit_plots, ncol = 1) +
plot_annotation(
title = paste0(qname, " — all hits"),
theme = theme(plot.title = element_text(size = 14, face = "bold"))
)
ggsave(paste0(qname, "_ALL_hits_stacked.pdf"), combined_q, width=14, height=6 * hit_count + 2)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment