simon-anders · January 29, 2026 14:44
diff --git a/corgi_test.R b/corgi_test.R
 library( tidyverse )

 read_tsv("~/tmp/truth_gene_expression_protein_coding_summed_by_tissue_negative_strand.csv") %>%
 mutate( gene = coalesce( `Gene name`, `Gene stable ID` ) ) %>%
 column_to_rownames("gene") %>% select( -(1:4) ) %>% as.matrix -> true_expr
 read_tsv("~/tmp/predicted_gene_expression_protein_coding_summed_by_tissue_negative_strand.csv") %>%
 mutate( gene = coalesce( `Gene name`, `Gene stable ID` ) ) %>%
 column_to_rownames("gene") %>% select( -(1:4) ) %>% as.matrix -> pred_expr


 # Compare means across tissues (each point is a gene, x and y coordinates are means
 # over observed or predicted expression on log1p scale)
 plot( rowMeans(log1p(true_expr)), rowMeans(log1p(pred_expr)), asp=1)


 # Next try: Pick a tissue, subtract the mean over all tissues, and divide this "deviation from
 # the average" by the gene's standard deviation across tissues. Compare predicted vs observed
 # colour by average expression (as proxy for power)
 plot(
  ( log1p(true_expr)[,"liver"] - apply( log1p(true_expr), 1, mean ) ) / apply( log1p(true_expr), 1, sd ),
  ( log1p(pred_expr)[,"liver"] - apply( log1p(pred_expr), 1, mean ) ) / apply( log1p(pred_expr), 1, sd ),
  col=viridis::viridis(120,dir=-1)[ round(10*apply( log1p(true_expr), 1, median )) ], asp=1 )
 abline(h=0,v=0)

 # Pick one gene, plot predicted vs observed expression. (Now, each point is one tissue)
 gene <- sample( rownames(true_expr), 1 )
 plot( log1p( true_expr[gene,] ), log1p( pred_expr[gene,] ), main=gene, 
      sub=sprintf("Corr: %f.2", cor( log1p( true_expr[gene,] ), log1p( pred_expr[gene,] ) ) ) )
 abline(0,1)
 # (Run this code several times to see several genes)

 # Which genes have expression in at least one sample?
 goodgenes <- names(which(rowMeans(true_expr)>0))

 # Calculate the correlation, as shown by the scatter plots for individual genes above,
 # for all "good" genes
 sapply( goodgenes, function(gene)
  cor( log1p( true_expr[gene,] ), log1p( pred_expr[gene,] ) ) ) -> cor1

 #sapply( goodgenes, function(gene)
 #  cor( true_expr[gene,], pred_expr[gene,] ) ) -> cor1

 # Histogram of these
 hist(cor1)

 # Generate a null distribution by calculating correlation of a gene's observed expression
 # with a randomly picked other gene's predicetd expression
 perm_pred_expr <- pred_expr
 rownames(perm_pred_expr) <- sample(rownames(perm_pred_expr))
 sapply( goodgenes, function(gene)
  cor( log1p( true_expr[gene,] ), log1p( perm_pred_expr[gene,] ) ) ) -> cor0
 hist(cor0)

 # Run this 1000 times to get a smoother histogram
 replicate( 1000, {
  perm_pred_expr <- pred_expr
  rownames(perm_pred_expr) <- sample(rownames(perm_pred_expr))
  sapply( goodgenes, function(gene)
    cor( log1p( true_expr[gene,] ), log1p( perm_pred_expr[gene,] ) ) ) } ) -> cor0
 hist(cor0)

 # Make a QQ plot of observed correlation coefficients versus their null distribution
 plot(
  sort(cor0)[ seq( 500, by=1000, length.out=length(cor1) ) ],
  sort(cor1) )
 abline(0,1)

 # Plot correlation coefficient against SD (as measure of variation across tissues)
 # and against the latters' ranks
 plot( apply( log1p(true_expr), 1, sd )[goodgenes], cor1 )
 plot( rank(apply( log1p(true_expr), 1, sd )[goodgenes]), cor1 )


 # One might wonder why the null distribution (cor0) above has this skew.
 # It might be illustrative to look at the histogram of correlation coefficients
 # of two independent draws of 10 values each from a gamma distribution
 # (which looks somewhat similar to the distribution of our expression values)

 corgamma <- replicate( 10000, cor(rgamma(10,1,1),rgamma(10,1,1)) )
 hist( corgamma )

 # However, these have mean 0, while our null distribution has a mean >0
 mean( corgamma )
 mean( cor0 )
 # I don't quite understand that.
	library( tidyverse )

	read_tsv("~/tmp/truth_gene_expression_protein_coding_summed_by_tissue_negative_strand.csv") %>%
	mutate( gene = coalesce( `Gene name`, `Gene stable ID` ) ) %>%
	column_to_rownames("gene") %>% select( -(1:4) ) %>% as.matrix -> true_expr
	read_tsv("~/tmp/predicted_gene_expression_protein_coding_summed_by_tissue_negative_strand.csv") %>%
	mutate( gene = coalesce( `Gene name`, `Gene stable ID` ) ) %>%
	column_to_rownames("gene") %>% select( -(1:4) ) %>% as.matrix -> pred_expr


	# Compare means across tissues (each point is a gene, x and y coordinates are means
	# over observed or predicted expression on log1p scale)
	plot( rowMeans(log1p(true_expr)), rowMeans(log1p(pred_expr)), asp=1)


	# Next try: Pick a tissue, subtract the mean over all tissues, and divide this "deviation from
	# the average" by the gene's standard deviation across tissues. Compare predicted vs observed
	# colour by average expression (as proxy for power)
	plot(
	( log1p(true_expr)[,"liver"] - apply( log1p(true_expr), 1, mean ) ) / apply( log1p(true_expr), 1, sd ),
	( log1p(pred_expr)[,"liver"] - apply( log1p(pred_expr), 1, mean ) ) / apply( log1p(pred_expr), 1, sd ),
	col=viridis::viridis(120,dir=-1)[ round(10*apply( log1p(true_expr), 1, median )) ], asp=1 )
	abline(h=0,v=0)

	# Pick one gene, plot predicted vs observed expression. (Now, each point is one tissue)
	gene <- sample( rownames(true_expr), 1 )
	plot( log1p( true_expr[gene,] ), log1p( pred_expr[gene,] ), main=gene,
	sub=sprintf("Corr: %f.2", cor( log1p( true_expr[gene,] ), log1p( pred_expr[gene,] ) ) ) )
	abline(0,1)
	# (Run this code several times to see several genes)

	# Which genes have expression in at least one sample?
	goodgenes <- names(which(rowMeans(true_expr)>0))

	# Calculate the correlation, as shown by the scatter plots for individual genes above,
	# for all "good" genes
	sapply( goodgenes, function(gene)
	cor( log1p( true_expr[gene,] ), log1p( pred_expr[gene,] ) ) ) -> cor1

	#sapply( goodgenes, function(gene)
	# cor( true_expr[gene,], pred_expr[gene,] ) ) -> cor1

	# Histogram of these
	hist(cor1)

	# Generate a null distribution by calculating correlation of a gene's observed expression
	# with a randomly picked other gene's predicetd expression
	perm_pred_expr <- pred_expr
	rownames(perm_pred_expr) <- sample(rownames(perm_pred_expr))
	sapply( goodgenes, function(gene)
	cor( log1p( true_expr[gene,] ), log1p( perm_pred_expr[gene,] ) ) ) -> cor0
	hist(cor0)

	# Run this 1000 times to get a smoother histogram
	replicate( 1000, {
	perm_pred_expr <- pred_expr
	rownames(perm_pred_expr) <- sample(rownames(perm_pred_expr))
	sapply( goodgenes, function(gene)
	cor( log1p( true_expr[gene,] ), log1p( perm_pred_expr[gene,] ) ) ) } ) -> cor0
	hist(cor0)

	# Make a QQ plot of observed correlation coefficients versus their null distribution
	plot(
	sort(cor0)[ seq( 500, by=1000, length.out=length(cor1) ) ],
	sort(cor1) )
	abline(0,1)

	# Plot correlation coefficient against SD (as measure of variation across tissues)
	# and against the latters' ranks
	plot( apply( log1p(true_expr), 1, sd )[goodgenes], cor1 )
	plot( rank(apply( log1p(true_expr), 1, sd )[goodgenes]), cor1 )


	# One might wonder why the null distribution (cor0) above has this skew.
	# It might be illustrative to look at the histogram of correlation coefficients
	# of two independent draws of 10 values each from a gamma distribution
	# (which looks somewhat similar to the distribution of our expression values)

	corgamma <- replicate( 10000, cor(rgamma(10,1,1),rgamma(10,1,1)) )
	hist( corgamma )

	# However, these have mean 0, while our null distribution has a mean >0
	mean( corgamma )
	mean( cor0 )
	# I don't quite understand that.
No results found