Skip to content

Instantly share code, notes, and snippets.

@simon-anders
Created December 18, 2025 15:33
Show Gist options
  • Select an option

  • Save simon-anders/7fca7e234d7370e8f9500b538ab331bc to your computer and use it in GitHub Desktop.

Select an option

Save simon-anders/7fca7e234d7370e8f9500b538ab331bc to your computer and use it in GitHub Desktop.
# Load "anndata" Python package via reticulate
# so that we can read the h5ad file
library( reticulate )
use_virtualenv("~/pyenvs/main/")
anndata <- import("anndata")
# Load the h5ad file from ZebraHub
# https://figshare.com/ndownloader/files/36736206
ad <- anndata$read_h5ad("zf_atlas_full_v1_release.h5ad")
# Take the count matrix from `ad$X` and the cell meta data from `ad$obs`,
# make a Seurat object out of this
library( Seurat )
CreateSeuratObject( t(ad$X), meta.data=ad$obs ) -> seu
library( tidyverse )
# Standard Seurat preprocessing
seu |>
NormalizeData() |>
FindVariableFeatures() |>
ScaleData() |>
RunPCA() |>
RunUMAP(dims=1:50) -> seu
UMAPPlot( seu, group.by="zebrafish_anatomy_ontology_class") + coord_equal()
# RNA-Seq count matrix from our 48/96h data
readRDS("~/tmp/zf/count_matrix.rds") -> count_matrix
# Log-normalize
log1p( t( t(count_matrix) / colSums(count_matrix) ) * 1e4 ) -> lexpr_matrix
# Set rownames of both data sets to gene IDs
rownames(lexpr_matrix) <- names(rownames(lexpr_matrix))
rownames(seu) <- ad$var$gene_ids
# variable genes found by Seurat in atlas
VariableFeaturePlot(seu)
# Use only the variable gene from atlas for comparison, and subset to those that appear also
# in our count matrix
common_genes <- intersect( VariableFeatures(seu), rownames(lexpr_matrix) )
# Calculate correlation between atrlas cells and our cells, using the common genes
cor(
as.matrix( LayerData(seu)[ common_genes, ] ),
as.matrix(lexpr_matrix[ common_genes, ]) ) -> rna_corrmat
# For each of our cells, find a "matched" atlas cell, namely the one with highest correlation
apply( rna_corrmat, 2, which.max ) -> matched_cell
# Plot atlas UMAP and mark matched cells in red
plot( Embeddings(seu,"umap"), pch=20, cex=.1, asp=1, col="#00000005" )
points( Embeddings(seu,"umap")[matched_cell,], pch=20, cex=.3, col="#ff000080" )
# Make table of annotation of atlas cells
seu$zebrafish_anatomy_ontology_class[matched_cell] |> table()
# Subset atlas data to opnly CNS cells, rerun Seurat pipeline
seu[ , seu$zebrafish_anatomy_ontology_class == "central_nervous_system" ] |>
NormalizeData() |>
FindVariableFeatures() |>
ScaleData() |>
RunPCA() |>
RunUMAP(dims=1:50) -> seu_cns
UMAPPlot( seu_cns, group.by="timepoint") + coord_equal()
# Chose the common genes again, now using the CNS-variable genes
common_genes <- intersect( VariableFeatures(seu_cns), rownames(lexpr_matrix) )
# Recalculate the correlation matrix (is this good?)
cor(
as.matrix( LayerData(seu_cns)[ common_genes, ] ),
as.matrix(lexpr_matrix[ common_genes, ]) ) -> rna_corrmat
apply( rna_corrmat, 2, which.max ) -> matched_cell
# Mark matched cells in CNS UMAP
plot( Embeddings(seu_cns,"umap"), pch=20, cex=.1, asp=1, col="#00000005" )
points( Embeddings(seu_cns,"umap")[matched_cell,], pch=20, cex=.3, col="#ff000080" )
# Load the matrix of DCM deviance values
readRDS("~/tmp/zf/dev_count_matrix.rds") -> devmatrix
# Load meta data for our cells
read_csv("~/tmp/zf/meta_table.csv") -> metatbl
# Further subset common genes to only thos that also appear in devmatrix
common_genes <- intersect( VariableFeatures(seu_cns),
intersect( rownames(lexpr_matrix), rownames(devmatrix) ) )
# Calculate correlations betqween atlas RNA-Seq and our cells' DCM deviance
cor(
as.matrix( LayerData(seu_cns)[ common_genes, ] ),
as.matrix(devmatrix[ common_genes, ]) ) -> dcm_corrmat
# Pick a random cell
cell <- sample( colnames(devmatrix), 1 )
cell
# Colour UMAP according to the correlation of each atlas cell to
# the picked cell's RNA expression
Embeddings(seu_cns,"umap") |>
as_tibble() |>
mutate( score = ((rna_corrmat[,cell]+.05)/.84)^2. ) |>
arrange( score ) |>
ggplot() + geom_point( aes(x=umap_1, y=umap_2, col=score ), size=.1 ) +
coord_equal() + scale_color_viridis_c(direction=1,option="A",limits=c(0,.35),oob=scales::oob_squish) +
ggtitle(str_c( "corr of atlas RNA-Seq with RNA-Seq of cell ", cell ) ) -> p1
# Colour UMAP according to the correlation of each atlas cell to
# the picked cell's DCM deviance
Embeddings(seu_cns,"umap") |>
as_tibble() |>
mutate( score=((dcm_corrmat[,cell]+.15)/.46)^3. ) |>
arrange( score ) |>
ggplot() + geom_point( aes(x=umap_1, y=umap_2, col=score ), size=.1 ) +
coord_equal() + scale_color_viridis_c(direction=1,option="A",limits=c(0,.35),oob=scales::oob_squish) +
ggtitle(str_c( "corr of atlas RNA-Seq with DCM dev of cell ", cell ) ) -> p2
cowplot::plot_grid(p2,p1)
# For each of our cells, find the average timepoint value of its correlated partners
tibble(
rna_tp = sapply( colnames(dcm_corrmat), function(cell) weighted.mean( as.integer(seu_cns$timepoint), pmax( 0, rna_corrmat[,cell] ) ) ),
dcm_tp = sapply( colnames(dcm_corrmat), function(cell) weighted.mean( as.integer(seu_cns$timepoint), pmax( 0, dcm_corrmat[,cell] ) ) ) ) -> b
plot(b)
abline(0,1)
# 3D UMAP
RunUMAP(seu_cns,1:50,reduction.name="umap3d",n.components=3, n.neighbors=40, min.dist=.4) -> seu_cns
babyplots::pointCloud( Embeddings(seu_cns,"umap3d"), "values", as.integer(seu_cns$timepoint) )
# Cycling cells
FeaturePlot(seu_cns, "ENSDARG00000091150",) # mki67
# Translate cell-cycle markers to zebrafish
s_genes <- biomaRt::getBM(c("drerio_homolog_ensembl_gene"), filters="hgnc_symbol",
values=cc.genes.updated.2019$s.genes, mart=biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl"))[,1]
g2m_genes <- biomaRt::getBM(c("drerio_homolog_ensembl_gene"), filters="hgnc_symbol",
values=cc.genes.updated.2019$g2m.genes, mart=biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl"))[,1]
CellCycleScoring( seu_cns, s.features=s_genes, g2m.features=g2m_genes ) -> seu_cns
# Mark cell cycle in 3D UMAPP
unitrange <- function(x) (x-min(x))/(max(x)-min(x))
babyplots::pointCloud( Embeddings(seu_cns,"umap3d"), "direct",
rgb( unitrange(seu_cns$S.Score), 0, unitrange(seu_cns$S.Score) ) )
# Random stuff that didn't work
ScaleData( seu_cns, vars.to.regress = c(s_genes,g2m_genes) ) -> seu_cns_ccr
c(s_genes,g2m_genes) %in% rownames(seu_cns)
Embeddings(seu_cns,"umap") |>
as_tibble() |>
mutate( score=as.integer(seu_cns$timepoint) ) |>
arrange( score ) |>
ggplot() + geom_point( aes(x=umap_1, y=umap_2, col=score ), size=.1 ) +
coord_equal() + scale_color_viridis_c(direction=1,option="A")
plot( cor( Embeddings(seu_cns,"pca"), seu_cns$S.Score ) )
limma::lmFit( t(Embeddings(seu_cns,"pca")), cbind( seu_cns$S.Score, seu_cns$G2M.Score) ) -> fit
uwot::umap( Embeddings(seu_cns,"pca") - fit$design %*% t(fit$coefficients) ) -> ump
sapply( 1:ncol(Embeddings(seu_cns,"pca")), function(i) {
fit <- lm.fit( cbind( 1, seu_cns$S.Score, seu_cns$G2M.Score), Embeddings(seu_cns,"pca")[,i] )
Embeddings(seu_cns,"pca")[,i] - cbind( seu_cns$S.Score, seu_cns$G2M.Score) %*% fit$coefficients[-1] } ) -> a
uwot::umap( cbind( as.integer(seu$timepoint), Embeddings(seu_cns,"pca") ), n_neighbors=30, metric="cosine", min_dist=.3 ) -> ump
plot( ump, asp=1, pch=20, cex=.1, col="#00000040" )
ump |>
as_tibble() |>
mutate( score=as.integer(seu_cns$timepoint) ) |>
arrange( score ) |>
ggplot() + geom_point( aes(x=V1, y=V2, col=score ), size=.1 ) +
coord_equal() + scale_color_viridis_c(direction=1,option="A")
FindNeighbors(seu_cns, return.neighbor = TRUE) -> seu_cns
ump |>
as_tibble() |>
mutate( score=seu_cns@neighbors$RNA.nn@nn.dist[,15] ) |>
arrange( score ) |>
ggplot() + geom_point( aes(x=V1, y=V2, col=score ), size=.1 ) +
coord_equal() + scale_color_viridis_c(direction=1,option="A",trans="log10")
babyplots::pointCloud( Embeddings(seu_cns,"umap3d"), "values",
log10(seu_cns@neighbors$RNA.nn@nn.dist[,15]) )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment