Created
January 29, 2026 14:44
-
-
Save simon-anders/2262caad7fff10d498f7682f3031ff9f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library( tidyverse ) | |
| read_tsv("~/tmp/truth_gene_expression_protein_coding_summed_by_tissue_negative_strand.csv") %>% | |
| mutate( gene = coalesce( `Gene name`, `Gene stable ID` ) ) %>% | |
| column_to_rownames("gene") %>% select( -(1:4) ) %>% as.matrix -> true_expr | |
| read_tsv("~/tmp/predicted_gene_expression_protein_coding_summed_by_tissue_negative_strand.csv") %>% | |
| mutate( gene = coalesce( `Gene name`, `Gene stable ID` ) ) %>% | |
| column_to_rownames("gene") %>% select( -(1:4) ) %>% as.matrix -> pred_expr | |
| # Compare means across tissues (each point is a gene, x and y coordinates are means | |
| # over observed or predicted expression on log1p scale) | |
| plot( rowMeans(log1p(true_expr)), rowMeans(log1p(pred_expr)), asp=1) | |
| # Next try: Pick a tissue, subtract the mean over all tissues, and divide this "deviation from | |
| # the average" by the gene's standard deviation across tissues. Compare predicted vs observed | |
| # colour by average expression (as proxy for power) | |
| plot( | |
| ( log1p(true_expr)[,"liver"] - apply( log1p(true_expr), 1, mean ) ) / apply( log1p(true_expr), 1, sd ), | |
| ( log1p(pred_expr)[,"liver"] - apply( log1p(pred_expr), 1, mean ) ) / apply( log1p(pred_expr), 1, sd ), | |
| col=viridis::viridis(120,dir=-1)[ round(10*apply( log1p(true_expr), 1, median )) ], asp=1 ) | |
| abline(h=0,v=0) | |
| # Pick one gene, plot predicted vs observed expression. (Now, each point is one tissue) | |
| gene <- sample( rownames(true_expr), 1 ) | |
| plot( log1p( true_expr[gene,] ), log1p( pred_expr[gene,] ), main=gene, | |
| sub=sprintf("Corr: %f.2", cor( log1p( true_expr[gene,] ), log1p( pred_expr[gene,] ) ) ) ) | |
| abline(0,1) | |
| # (Run this code several times to see several genes) | |
| # Which genes have expression in at least one sample? | |
| goodgenes <- names(which(rowMeans(true_expr)>0)) | |
| # Calculate the correlation, as shown by the scatter plots for individual genes above, | |
| # for all "good" genes | |
| sapply( goodgenes, function(gene) | |
| cor( log1p( true_expr[gene,] ), log1p( pred_expr[gene,] ) ) ) -> cor1 | |
| #sapply( goodgenes, function(gene) | |
| # cor( true_expr[gene,], pred_expr[gene,] ) ) -> cor1 | |
| # Histogram of these | |
| hist(cor1) | |
| # Generate a null distribution by calculating correlation of a gene's observed expression | |
| # with a randomly picked other gene's predicetd expression | |
| perm_pred_expr <- pred_expr | |
| rownames(perm_pred_expr) <- sample(rownames(perm_pred_expr)) | |
| sapply( goodgenes, function(gene) | |
| cor( log1p( true_expr[gene,] ), log1p( perm_pred_expr[gene,] ) ) ) -> cor0 | |
| hist(cor0) | |
| # Run this 1000 times to get a smoother histogram | |
| replicate( 1000, { | |
| perm_pred_expr <- pred_expr | |
| rownames(perm_pred_expr) <- sample(rownames(perm_pred_expr)) | |
| sapply( goodgenes, function(gene) | |
| cor( log1p( true_expr[gene,] ), log1p( perm_pred_expr[gene,] ) ) ) } ) -> cor0 | |
| hist(cor0) | |
| # Make a QQ plot of observed correlation coefficients versus their null distribution | |
| plot( | |
| sort(cor0)[ seq( 500, by=1000, length.out=length(cor1) ) ], | |
| sort(cor1) ) | |
| abline(0,1) | |
| # Plot correlation coefficient against SD (as measure of variation across tissues) | |
| # and against the latters' ranks | |
| plot( apply( log1p(true_expr), 1, sd )[goodgenes], cor1 ) | |
| plot( rank(apply( log1p(true_expr), 1, sd )[goodgenes]), cor1 ) | |
| # One might wonder why the null distribution (cor0) above has this skew. | |
| # It might be illustrative to look at the histogram of correlation coefficients | |
| # of two independent draws of 10 values each from a gamma distribution | |
| # (which looks somewhat similar to the distribution of our expression values) | |
| corgamma <- replicate( 10000, cor(rgamma(10,1,1),rgamma(10,1,1)) ) | |
| hist( corgamma ) | |
| # However, these have mean 0, while our null distribution has a mean >0 | |
| mean( corgamma ) | |
| mean( cor0 ) | |
| # I don't quite understand that. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment