Skip to content

Instantly share code, notes, and snippets.

@simon-anders
Created January 29, 2026 14:44
Show Gist options
  • Select an option

  • Save simon-anders/2262caad7fff10d498f7682f3031ff9f to your computer and use it in GitHub Desktop.

Select an option

Save simon-anders/2262caad7fff10d498f7682f3031ff9f to your computer and use it in GitHub Desktop.
library( tidyverse )
read_tsv("~/tmp/truth_gene_expression_protein_coding_summed_by_tissue_negative_strand.csv") %>%
mutate( gene = coalesce( `Gene name`, `Gene stable ID` ) ) %>%
column_to_rownames("gene") %>% select( -(1:4) ) %>% as.matrix -> true_expr
read_tsv("~/tmp/predicted_gene_expression_protein_coding_summed_by_tissue_negative_strand.csv") %>%
mutate( gene = coalesce( `Gene name`, `Gene stable ID` ) ) %>%
column_to_rownames("gene") %>% select( -(1:4) ) %>% as.matrix -> pred_expr
# Compare means across tissues (each point is a gene, x and y coordinates are means
# over observed or predicted expression on log1p scale)
plot( rowMeans(log1p(true_expr)), rowMeans(log1p(pred_expr)), asp=1)
# Next try: Pick a tissue, subtract the mean over all tissues, and divide this "deviation from
# the average" by the gene's standard deviation across tissues. Compare predicted vs observed
# colour by average expression (as proxy for power)
plot(
( log1p(true_expr)[,"liver"] - apply( log1p(true_expr), 1, mean ) ) / apply( log1p(true_expr), 1, sd ),
( log1p(pred_expr)[,"liver"] - apply( log1p(pred_expr), 1, mean ) ) / apply( log1p(pred_expr), 1, sd ),
col=viridis::viridis(120,dir=-1)[ round(10*apply( log1p(true_expr), 1, median )) ], asp=1 )
abline(h=0,v=0)
# Pick one gene, plot predicted vs observed expression. (Now, each point is one tissue)
gene <- sample( rownames(true_expr), 1 )
plot( log1p( true_expr[gene,] ), log1p( pred_expr[gene,] ), main=gene,
sub=sprintf("Corr: %f.2", cor( log1p( true_expr[gene,] ), log1p( pred_expr[gene,] ) ) ) )
abline(0,1)
# (Run this code several times to see several genes)
# Which genes have expression in at least one sample?
goodgenes <- names(which(rowMeans(true_expr)>0))
# Calculate the correlation, as shown by the scatter plots for individual genes above,
# for all "good" genes
sapply( goodgenes, function(gene)
cor( log1p( true_expr[gene,] ), log1p( pred_expr[gene,] ) ) ) -> cor1
#sapply( goodgenes, function(gene)
# cor( true_expr[gene,], pred_expr[gene,] ) ) -> cor1
# Histogram of these
hist(cor1)
# Generate a null distribution by calculating correlation of a gene's observed expression
# with a randomly picked other gene's predicetd expression
perm_pred_expr <- pred_expr
rownames(perm_pred_expr) <- sample(rownames(perm_pred_expr))
sapply( goodgenes, function(gene)
cor( log1p( true_expr[gene,] ), log1p( perm_pred_expr[gene,] ) ) ) -> cor0
hist(cor0)
# Run this 1000 times to get a smoother histogram
replicate( 1000, {
perm_pred_expr <- pred_expr
rownames(perm_pred_expr) <- sample(rownames(perm_pred_expr))
sapply( goodgenes, function(gene)
cor( log1p( true_expr[gene,] ), log1p( perm_pred_expr[gene,] ) ) ) } ) -> cor0
hist(cor0)
# Make a QQ plot of observed correlation coefficients versus their null distribution
plot(
sort(cor0)[ seq( 500, by=1000, length.out=length(cor1) ) ],
sort(cor1) )
abline(0,1)
# Plot correlation coefficient against SD (as measure of variation across tissues)
# and against the latters' ranks
plot( apply( log1p(true_expr), 1, sd )[goodgenes], cor1 )
plot( rank(apply( log1p(true_expr), 1, sd )[goodgenes]), cor1 )
# One might wonder why the null distribution (cor0) above has this skew.
# It might be illustrative to look at the histogram of correlation coefficients
# of two independent draws of 10 values each from a gamma distribution
# (which looks somewhat similar to the distribution of our expression values)
corgamma <- replicate( 10000, cor(rgamma(10,1,1),rgamma(10,1,1)) )
hist( corgamma )
# However, these have mean 0, while our null distribution has a mean >0
mean( corgamma )
mean( cor0 )
# I don't quite understand that.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment