Created
December 18, 2025 15:33
-
-
Save simon-anders/7fca7e234d7370e8f9500b538ab331bc to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Load "anndata" Python package via reticulate | |
| # so that we can read the h5ad file | |
| library( reticulate ) | |
| use_virtualenv("~/pyenvs/main/") | |
| anndata <- import("anndata") | |
| # Load the h5ad file from ZebraHub | |
| # https://figshare.com/ndownloader/files/36736206 | |
| ad <- anndata$read_h5ad("zf_atlas_full_v1_release.h5ad") | |
| # Take the count matrix from `ad$X` and the cell meta data from `ad$obs`, | |
| # make a Seurat object out of this | |
| library( Seurat ) | |
| CreateSeuratObject( t(ad$X), meta.data=ad$obs ) -> seu | |
| library( tidyverse ) | |
| # Standard Seurat preprocessing | |
| seu |> | |
| NormalizeData() |> | |
| FindVariableFeatures() |> | |
| ScaleData() |> | |
| RunPCA() |> | |
| RunUMAP(dims=1:50) -> seu | |
| UMAPPlot( seu, group.by="zebrafish_anatomy_ontology_class") + coord_equal() | |
| # RNA-Seq count matrix from our 48/96h data | |
| readRDS("~/tmp/zf/count_matrix.rds") -> count_matrix | |
| # Log-normalize | |
| log1p( t( t(count_matrix) / colSums(count_matrix) ) * 1e4 ) -> lexpr_matrix | |
| # Set rownames of both data sets to gene IDs | |
| rownames(lexpr_matrix) <- names(rownames(lexpr_matrix)) | |
| rownames(seu) <- ad$var$gene_ids | |
| # variable genes found by Seurat in atlas | |
| VariableFeaturePlot(seu) | |
| # Use only the variable gene from atlas for comparison, and subset to those that appear also | |
| # in our count matrix | |
| common_genes <- intersect( VariableFeatures(seu), rownames(lexpr_matrix) ) | |
| # Calculate correlation between atrlas cells and our cells, using the common genes | |
| cor( | |
| as.matrix( LayerData(seu)[ common_genes, ] ), | |
| as.matrix(lexpr_matrix[ common_genes, ]) ) -> rna_corrmat | |
| # For each of our cells, find a "matched" atlas cell, namely the one with highest correlation | |
| apply( rna_corrmat, 2, which.max ) -> matched_cell | |
| # Plot atlas UMAP and mark matched cells in red | |
| plot( Embeddings(seu,"umap"), pch=20, cex=.1, asp=1, col="#00000005" ) | |
| points( Embeddings(seu,"umap")[matched_cell,], pch=20, cex=.3, col="#ff000080" ) | |
| # Make table of annotation of atlas cells | |
| seu$zebrafish_anatomy_ontology_class[matched_cell] |> table() | |
| # Subset atlas data to opnly CNS cells, rerun Seurat pipeline | |
| seu[ , seu$zebrafish_anatomy_ontology_class == "central_nervous_system" ] |> | |
| NormalizeData() |> | |
| FindVariableFeatures() |> | |
| ScaleData() |> | |
| RunPCA() |> | |
| RunUMAP(dims=1:50) -> seu_cns | |
| UMAPPlot( seu_cns, group.by="timepoint") + coord_equal() | |
| # Chose the common genes again, now using the CNS-variable genes | |
| common_genes <- intersect( VariableFeatures(seu_cns), rownames(lexpr_matrix) ) | |
| # Recalculate the correlation matrix (is this good?) | |
| cor( | |
| as.matrix( LayerData(seu_cns)[ common_genes, ] ), | |
| as.matrix(lexpr_matrix[ common_genes, ]) ) -> rna_corrmat | |
| apply( rna_corrmat, 2, which.max ) -> matched_cell | |
| # Mark matched cells in CNS UMAP | |
| plot( Embeddings(seu_cns,"umap"), pch=20, cex=.1, asp=1, col="#00000005" ) | |
| points( Embeddings(seu_cns,"umap")[matched_cell,], pch=20, cex=.3, col="#ff000080" ) | |
| # Load the matrix of DCM deviance values | |
| readRDS("~/tmp/zf/dev_count_matrix.rds") -> devmatrix | |
| # Load meta data for our cells | |
| read_csv("~/tmp/zf/meta_table.csv") -> metatbl | |
| # Further subset common genes to only thos that also appear in devmatrix | |
| common_genes <- intersect( VariableFeatures(seu_cns), | |
| intersect( rownames(lexpr_matrix), rownames(devmatrix) ) ) | |
| # Calculate correlations betqween atlas RNA-Seq and our cells' DCM deviance | |
| cor( | |
| as.matrix( LayerData(seu_cns)[ common_genes, ] ), | |
| as.matrix(devmatrix[ common_genes, ]) ) -> dcm_corrmat | |
| # Pick a random cell | |
| cell <- sample( colnames(devmatrix), 1 ) | |
| cell | |
| # Colour UMAP according to the correlation of each atlas cell to | |
| # the picked cell's RNA expression | |
| Embeddings(seu_cns,"umap") |> | |
| as_tibble() |> | |
| mutate( score = ((rna_corrmat[,cell]+.05)/.84)^2. ) |> | |
| arrange( score ) |> | |
| ggplot() + geom_point( aes(x=umap_1, y=umap_2, col=score ), size=.1 ) + | |
| coord_equal() + scale_color_viridis_c(direction=1,option="A",limits=c(0,.35),oob=scales::oob_squish) + | |
| ggtitle(str_c( "corr of atlas RNA-Seq with RNA-Seq of cell ", cell ) ) -> p1 | |
| # Colour UMAP according to the correlation of each atlas cell to | |
| # the picked cell's DCM deviance | |
| Embeddings(seu_cns,"umap") |> | |
| as_tibble() |> | |
| mutate( score=((dcm_corrmat[,cell]+.15)/.46)^3. ) |> | |
| arrange( score ) |> | |
| ggplot() + geom_point( aes(x=umap_1, y=umap_2, col=score ), size=.1 ) + | |
| coord_equal() + scale_color_viridis_c(direction=1,option="A",limits=c(0,.35),oob=scales::oob_squish) + | |
| ggtitle(str_c( "corr of atlas RNA-Seq with DCM dev of cell ", cell ) ) -> p2 | |
| cowplot::plot_grid(p2,p1) | |
| # For each of our cells, find the average timepoint value of its correlated partners | |
| tibble( | |
| rna_tp = sapply( colnames(dcm_corrmat), function(cell) weighted.mean( as.integer(seu_cns$timepoint), pmax( 0, rna_corrmat[,cell] ) ) ), | |
| dcm_tp = sapply( colnames(dcm_corrmat), function(cell) weighted.mean( as.integer(seu_cns$timepoint), pmax( 0, dcm_corrmat[,cell] ) ) ) ) -> b | |
| plot(b) | |
| abline(0,1) | |
| # 3D UMAP | |
| RunUMAP(seu_cns,1:50,reduction.name="umap3d",n.components=3, n.neighbors=40, min.dist=.4) -> seu_cns | |
| babyplots::pointCloud( Embeddings(seu_cns,"umap3d"), "values", as.integer(seu_cns$timepoint) ) | |
| # Cycling cells | |
| FeaturePlot(seu_cns, "ENSDARG00000091150",) # mki67 | |
| # Translate cell-cycle markers to zebrafish | |
| s_genes <- biomaRt::getBM(c("drerio_homolog_ensembl_gene"), filters="hgnc_symbol", | |
| values=cc.genes.updated.2019$s.genes, mart=biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl"))[,1] | |
| g2m_genes <- biomaRt::getBM(c("drerio_homolog_ensembl_gene"), filters="hgnc_symbol", | |
| values=cc.genes.updated.2019$g2m.genes, mart=biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl"))[,1] | |
| CellCycleScoring( seu_cns, s.features=s_genes, g2m.features=g2m_genes ) -> seu_cns | |
| # Mark cell cycle in 3D UMAPP | |
| unitrange <- function(x) (x-min(x))/(max(x)-min(x)) | |
| babyplots::pointCloud( Embeddings(seu_cns,"umap3d"), "direct", | |
| rgb( unitrange(seu_cns$S.Score), 0, unitrange(seu_cns$S.Score) ) ) | |
| # Random stuff that didn't work | |
| ScaleData( seu_cns, vars.to.regress = c(s_genes,g2m_genes) ) -> seu_cns_ccr | |
| c(s_genes,g2m_genes) %in% rownames(seu_cns) | |
| Embeddings(seu_cns,"umap") |> | |
| as_tibble() |> | |
| mutate( score=as.integer(seu_cns$timepoint) ) |> | |
| arrange( score ) |> | |
| ggplot() + geom_point( aes(x=umap_1, y=umap_2, col=score ), size=.1 ) + | |
| coord_equal() + scale_color_viridis_c(direction=1,option="A") | |
| plot( cor( Embeddings(seu_cns,"pca"), seu_cns$S.Score ) ) | |
| limma::lmFit( t(Embeddings(seu_cns,"pca")), cbind( seu_cns$S.Score, seu_cns$G2M.Score) ) -> fit | |
| uwot::umap( Embeddings(seu_cns,"pca") - fit$design %*% t(fit$coefficients) ) -> ump | |
| sapply( 1:ncol(Embeddings(seu_cns,"pca")), function(i) { | |
| fit <- lm.fit( cbind( 1, seu_cns$S.Score, seu_cns$G2M.Score), Embeddings(seu_cns,"pca")[,i] ) | |
| Embeddings(seu_cns,"pca")[,i] - cbind( seu_cns$S.Score, seu_cns$G2M.Score) %*% fit$coefficients[-1] } ) -> a | |
| uwot::umap( cbind( as.integer(seu$timepoint), Embeddings(seu_cns,"pca") ), n_neighbors=30, metric="cosine", min_dist=.3 ) -> ump | |
| plot( ump, asp=1, pch=20, cex=.1, col="#00000040" ) | |
| ump |> | |
| as_tibble() |> | |
| mutate( score=as.integer(seu_cns$timepoint) ) |> | |
| arrange( score ) |> | |
| ggplot() + geom_point( aes(x=V1, y=V2, col=score ), size=.1 ) + | |
| coord_equal() + scale_color_viridis_c(direction=1,option="A") | |
| FindNeighbors(seu_cns, return.neighbor = TRUE) -> seu_cns | |
| ump |> | |
| as_tibble() |> | |
| mutate( score=seu_cns@neighbors$RNA.nn@nn.dist[,15] ) |> | |
| arrange( score ) |> | |
| ggplot() + geom_point( aes(x=V1, y=V2, col=score ), size=.1 ) + | |
| coord_equal() + scale_color_viridis_c(direction=1,option="A",trans="log10") | |
| babyplots::pointCloud( Embeddings(seu_cns,"umap3d"), "values", | |
| log10(seu_cns@neighbors$RNA.nn@nn.dist[,15]) ) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment