The FWR4 region in a germline J gene allele is expected to start with amino acid motif WGXG on the heavy chain and FGXG on the light chain. Note that X represents any amino acid.
However, we've found that some of the germline J gene alleles provided by IMGT or AIRR-community/OGRDB do not conform to this.
The igblastr package provides some utilities that we are going to use to identify the non-conforming J alleles.
See https://github.com/HyrienLab/igblastr?tab=readme-ov-file#1-install-and-load-igblastr for how to install igblastr.
Then load igblastr and install IgBLAST with:
library(igblastr)
if (!has_igblast()) install_igblast()

The "auxiliary data" included in IgBLAST annotates the known germline J gene alleles for the 5 organisms that IgBLAST supports out of the box: human, mouse, rat, rhesus_monkey, and rabbit.
Let's take a look at IgBLAST auxiliary data for human:
auxdata <- load_auxdata("human")
head(auxdata)
# allele_name coding_frame_start chain_type cdr3_end extra_bps
# 1 IGHJ1*01 0 JH 17 1
# 2 IGHJ1P*01 2 JH NA NA
# 3 IGHJ2*01 1 JH 18 1
# 4 IGHJ2P*01 0 JH NA NA
# 5 IGHJ3*01 1 JH 15 1
# 6 IGHJ3*02 1 JH 15 1

The cdr3_end column indicates the 0-based position of the last nucleotide of the CDR3 region.
The 12 nucleotides immediately following that position are the first 4 codons of the FWR4 region.
They're expected to translate to WGXG for alleles located on the IGH locus, and to FGXG for alleles
located on the IGK and IGL loci.
In the next section we'll identify the non-conforming germline J gene alleles provided by IMGT for human, mouse, rat, rhesus_monkey, and rabbit. Let's download and install germline dbs for these organisms:
## BCR germline dbs:
install_IMGT_germline_db("202531-1", "Homo_sapiens", force=TRUE)
install_IMGT_germline_db("202531-1", "Mus_musculus", force=TRUE)
install_IMGT_germline_db("202531-1", "Rattus_norvegicus", force=TRUE)
install_IMGT_germline_db("202531-1", "Macaca_mulatta", force=TRUE)
install_IMGT_germline_db("202531-1", "Oryctolagus_cuniculus", force=TRUE)
## TCR germline dbs:
install_IMGT_germline_db("202531-1", "Homo_sapiens", tcr.db=TRUE, force=TRUE)
install_IMGT_germline_db("202531-1", "Mus_musculus", tcr.db=TRUE, force=TRUE)
install_IMGT_germline_db("202531-1", "Macaca_mulatta", tcr.db=TRUE, force=TRUE)
install_IMGT_germline_db("202531-1", "Oryctolagus_cuniculus", tcr.db=TRUE, force=TRUE)

Note that IMGT does not provide TCR germline gene alleles for rat.
We'll also need the following helper function:
## Flag FWR4 heads that break the expected motif.
##
## fwr4_head: vector of translated FWR4 leading residues (as produced by
##   translate_fwr4() in the snippets that follow); entries may be NA.
## Returns a logical vector parallel to fwr4_head. An entry is TRUE only
##   when it is not NA and does not start with F or W, followed by G, any
##   single character, and G. NA entries always yield FALSE, i.e. they are
##   not reported as non-conforming.
is_not_conforming <- function(fwr4_head) {
  has_motif <- grepl("^[FW]G.G", fwr4_head)
  is_known <- !is.na(fwr4_head)
  is_known & !has_motif
}

## Load IgBLAST auxiliary data:
auxdata <- load_auxdata("human")
## Get germline J gene allele sequences:
db_name <- "IMGT-202531-1.Homo_sapiens.IGH+IGK+IGL"
J_alleles <- load_germline_db(db_name, region_types="J")
## Translate the first 4 codons of the FWR4 region:
fwr4_head <- translate_fwr4(J_alleles, auxdata, max.codons=4)
table(fwr4_head)
# FGEG FGGG FGPG FGQG FGSG FGTG WGKG WGQG WGRG
# 2 8 1 6 1 1 2 10 1

All alleles are conforming:
table(is_not_conforming(fwr4_head))
# FALSE
# 34

auxdata <- load_auxdata("mouse")
db_name <- "IMGT-202531-1.Mus_musculus.IGH+IGK+IGL"
J_alleles <- load_germline_db(db_name, region_types="J")
fwr4_head <- translate_fwr4(J_alleles, auxdata, max.codons=4)
table(fwr4_head)
# FGAG FGGG FGSG FGTG FSDG FSSN WGAG WGQG WGTG
# 1 6 4 1 2 1 2 6 1
table(is_not_conforming(fwr4_head))
# FALSE TRUE
# 24 3

Non-conforming alleles:
fwr4_head[is_not_conforming(fwr4_head)]
# IGKJ3*01 IGKJ3*02 IGLJ3P*01
# "FSDG" "FSDG" "FSSN"

auxdata <- load_auxdata("rat")
db_name <- "IMGT-202531-1.Rattus_norvegicus.IGH+IGK+IGL"
J_alleles <- load_germline_db(db_name, region_types="J")
fwr4_head <- translate_fwr4(J_alleles, auxdata, max.codons=4)
table(fwr4_head)
# FGAG FGGG FGSG LGKG WGPG WGQG
# 3 3 2 2 1 3
table(is_not_conforming(fwr4_head))
# FALSE TRUE
# 13 2

Non-conforming alleles:
fwr4_head[is_not_conforming(fwr4_head)]
# IGLJ2*01 IGLJ4*01
# "LGKG" "LGKG"

auxdata <- load_auxdata("rhesus_monkey")
db_name <- "IMGT-202531-1.Macaca_mulatta.IGH+IGK+IGL"
J_alleles <- load_germline_db(db_name, region_types="J")
fwr4_head <- translate_fwr4(J_alleles, auxdata, max.codons=4)
table(fwr4_head)
# FCGG FGAG FGEG FGGG FGPG FGQG FGRG FGSG LGRG WGPG WGQG WGRG
# 1 1 1 4 1 3 1 1 1 3 6 1
table(is_not_conforming(fwr4_head))
# FALSE TRUE
# 22 2

Non-conforming alleles:
fwr4_head[is_not_conforming(fwr4_head)]
# IGLJ4*01 IGLJ7*02
# "FCGG" "LGRG"

auxdata <- load_auxdata("rabbit")
db_name <- "IMGT-202531-1.Oryctolagus_cuniculus.IGH+IGK+IGL"
J_alleles <- load_germline_db(db_name, region_types="J")
fwr4_head <- translate_fwr4(J_alleles, auxdata, max.codons=4)
table(fwr4_head)
# FGAG FGEE FGGG FGKG FGSG FGSR FSRG LGPG RGPG WGPG WGQG WGTG
# 5 3 6 1 3 1 1 2 1 6 4 1
table(is_not_conforming(fwr4_head))
# FALSE TRUE
# 26 8

Non-conforming alleles:
fwr4_head[is_not_conforming(fwr4_head)]
# IGKJ1-3*01 IGKJ1-3*02 IGKJ1-3*03 IGKJ1-5*01 IGKJ1-5*02 IGKJ1-5*03 IGLJ1*01 IGLJ3*01
# "LGPG" "LGPG" "RGPG" "FGEE" "FGEE" "FGEE" "FGSR" "FSRG"

auxdata <- load_auxdata("human")
db_name <- "IMGT-202531-1.Homo_sapiens.TRA+TRB+TRG+TRD"
J_alleles <- load_germline_db(db_name, region_types="J")
fwr4_head <- translate_fwr4(J_alleles, auxdata, max.codons=4)
table(fwr4_head)
# CGSG FAEG FAKG FARG FFGT FGAG FGAN FGDG FGEG FGGG FGIG FGKG FGMG FGNG FGPG FGQG FGRG FGSG FGTG GRLG VGPG WGAG WGLG
# 1 1 1 2 1 11 1 3 3 2 1 20 1 2 7 7 2 11 13 1 1 1 1
table(is_not_conforming(fwr4_head))
# FALSE TRUE
# 88 9

Non-conforming alleles:
fwr4_head[is_not_conforming(fwr4_head)]
# TRAJ16*01 TRAJ16*02 TRAJ35*01 TRAJ61*01 TRBJ2-2P*01 TRBJ2-7*02 TRDJ3*01 TRGJP1*01 TRGJP2*01
# "FARG" "FARG" "CGSG" "FGAN" "GRLG" "VGPG" "FFGT" "FAEG" "FAKG"

auxdata <- load_auxdata("mouse")
db_name <- "IMGT-202531-1.Mus_musculus.TRA+TRB+TRG+TRD"
J_alleles <- load_germline_db(db_name, region_types="J")
fwr4_head <- translate_fwr4(J_alleles, auxdata, max.codons=4)
table(fwr4_head)
# CGLG FAAG FAEG FAKG FATG FGAE FGAG FGDG FGEG FGGG FGHG FGIG FGKG FGLG FGPG FGQG FGRG FGSG FGTG FGTW HGLG LAEA LGAG
# 1 1 3 1 1 1 8 4 7 1 3 2 13 3 4 5 1 9 12 1 1 1 1
# LGKG LGRE SGIE VESV WGLG WGSG
# 1 1 1 1 1 1
table(is_not_conforming(fwr4_head))
# FALSE TRUE
# 80 16

Non-conforming alleles:
fwr4_head[is_not_conforming(fwr4_head)]
# TRAJ19*01 TRAJ20*01 TRAJ25*01 TRAJ29*01 TRAJ3*01 TRAJ41*01 TRAJ44*01 TRAJ47*01 TRAJ59*01 TRAJ7*01
# "SGIE" "VESV" "FGTW" "LGRE" "FGAE" "LAEA" "LGAG" "CGLG" "FATG" "LGKG"
# TRBJ1-6*01 TRBJ1-7*01 TRGJ1*01 TRGJ2*01 TRGJ3*01 TRGJ4*01
# "FAAG" "HGLG" "FAEG" "FAEG" "FAEG" "FAKG"

IMGT does not provide TCR germline gene alleles for rat:
install_IMGT_germline_db("202531-1", "Rattus_norvegicus", tcr.db=TRUE, force=TRUE)
# Error in .path_to_IMGT_germline_fasta_store(local_store, organism, loci_prefix) :
# cannot find TR germline sequences for Rattus_norvegicus in IMGT release 202531-1

Unfortunately, the TCR germline J gene alleles provided by IMGT are not annotated in the IgBLAST auxiliary data for rhesus monkey, so we cannot identify the non-conforming alleles:
auxdata <- load_auxdata("rhesus_monkey")
db_name <- "IMGT-202531-1.Macaca_mulatta.TRA+TRB+TRG+TRD"
J_alleles <- load_germline_db(db_name, region_types="J")
any(names(J_alleles) %in% auxdata$allele_name)
# [1] FALSE

Unfortunately, the TCR germline J gene alleles provided by IMGT are not annotated in the IgBLAST auxiliary data for rabbit, so we cannot identify the non-conforming alleles:
auxdata <- load_auxdata("rabbit")
db_name <- "IMGT-202531-1.Oryctolagus_cuniculus.TRA+TRB+TRG+TRD"
J_alleles <- load_germline_db(db_name, region_types="J")
any(names(J_alleles) %in% auxdata$allele_name)
# [1] FALSE

Output of sessionInfo():
R version 4.5.1 (2025-06-13)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.3 LTS
Matrix products: default
BLAS: /home/hpages/R/R-4.5.1/lib/libRblas.so
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-openmp/liblapack.so.3; LAPACK version 3.12.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: America/Los_Angeles
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] igblastr_1.0.8 Biostrings_2.78.0 Seqinfo_1.0.0
[4] XVector_0.50.0 IRanges_2.44.0 S4Vectors_0.48.0
[7] BiocGenerics_0.56.0 generics_0.1.4 tibble_3.3.0
loaded via a namespace (and not attached):
[1] crayon_1.5.3 vctrs_0.6.5 httr_1.4.7
[4] cli_3.6.5 rlang_1.1.6 UCSC.utils_1.6.1
[7] jsonlite_2.0.0 xtable_1.8-4 glue_1.8.0
[10] GenomeInfoDb_1.46.2 lifecycle_1.0.4 compiler_4.5.1
[13] rvest_1.0.5 pkgconfig_2.0.3 R.oo_1.27.1
[16] R.utils_2.13.0 R6_2.6.1 pillar_1.11.1
[19] curl_7.0.0 magrittr_2.0.4 R.methodsS3_1.8.2
[22] tools_4.5.1 xml2_1.5.1