# Signature-based analysis Here is the method used to explore glioblastoma heterogeneity. The aim is to separate cells based on a tumorigenic score (determined by previous analyses) to be able to find functional states shared among patients. ### Data To test this method, we used a scRNAseq dataset from Darmanis analysis ( [article](https://doi.org/10.1016/j.celrep.2017.10.030) ). ### Steps - Filter cells : - Filter out cells based on : - library size - number of detected genes in the cells Generate Fig. S1A and S1B - Transform the raw counts matrix in log2(CPM+1) - Gene filtering : - Filter out genes that aren't detected in at least 3% of the remaining cells - Calculation of a Tumorigenic score : - Retrieve genes of interest in the log2(CPM+1) filtered matrix - Remove genes that are poorly expressed (default : 20% of detection) - Score = Geometric mean of the retained genes - Separate cells in two groups with signature scores : - lower or higher than the signature score mean (for single cell data), respectively - lower than the first quartile and higher than the third quartile (for bulk tumor sample), respectively Generated Fig 3A (left panel), 3C and S3 - Differential analysis : - Mann Whitney test between low/high groups - Benjamini Hochberg correction : FDR = 0.01 - Correlation analysis : - Retrieve significantly differentially expressed genes (all genes with a Benjamini-Hotchberg adjusted p-values < FDR) - Pearson Correlation between tumorigenic score and gene expression across all cells - Additional output : Pearson correlation between gene expressions across all cells ##################### # Filter cells # ##################### # First step of the signature-based workflow # Using cutoff based on number of transcripts and # number of detected genes to remove low quality cells # Example of command (that generates output files) : # Rscript filter_cells.R -f --sep "/t" --absolute_genes 1700 --absolute_counts 90000 --pdfplot --output --output_metada # load packages that are provided in the conda env options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") warnings() library(optparse) library(ggplot2) # Arguments option_list = list( make_option(c("-f", "--file"), default=NA, type='character', help="Input file that contains values to filter : Rows = genes and Columns = Samples"), make_option("--sep", default="\t", type='character', help="File column separator [default : '%default' ]"), make_option("--percentile_genes", default=0, type='integer', help="nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"), make_option("--percentile_counts", default=0, type='integer', help="nth Percentile of the total counts per cell distribution [default : '%default' ]"), make_option("--absolute_genes", default=0, type='integer', help="Remove cells that didn't express at least this number of genes [default : '%default' ]"), make_option("--absolute_counts", default=0, type='integer', help="Number of transcript threshold for cell filtering [default : '%default' ]"), make_option("--manage_cutoffs", default="intersect", type='character', help="combine or intersect cutoffs for filtering"), make_option("--pdfplot", type = 'character', default = "plotfile.pdf", help="Path to pdf file of the plots"), make_option("--output", type = 'character', default = "filtered_cells.tsv", help="Path to tsv file of filtered cell data"), make_option("--output_metada", type = 'character', default = "filtered_metadata.tsv", help="Path to tsv file of filtered cell metadata") ) opt = parse_args(OptionParser(option_list=option_list), args = commandArgs(trailingOnly = TRUE)) if (opt$sep == "tab") {opt$sep = "\t"} if (opt$sep == "comma") {opt$sep = ","} if (opt$sep == "space") {opt$sep = " "} # check consistency of filtering options if ((opt$percentile_counts > 0) & (opt$absolute_counts > 0)) { opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering # if ((opt$percentile_counts == 0) & (opt$absolute_counts == 0)) { # opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering if ((opt$percentile_genes > 0) & (opt$absolute_genes > 0)) { opt$percentile_genes = 0 } # since input parameters are not consistent (one or either method, not both), no filtering # if ((opt$percentile_genes == 0) & (opt$absolute_genes == 0)) { # opt$percentile_genes = 100 } # since input parameters are not consistent (one or either method, not both), no filtering # Import datasets data.counts <- read.table( opt$file, header = TRUE, stringsAsFactors = F, sep = opt$sep, check.names = FALSE, row.names = 1 ) QC_metrics <- data.frame(cell_id = colnames(data.counts), nGenes = colSums(data.counts != 0), # nGenes : Number of detected genes for each cell total_counts = colSums(data.counts), # total_counts : Total counts per cell stringsAsFactors = F) plot_hist <- function(mydata, variable, title, cutoff){ mybinwidth = round(max(mydata[, variable]) * 5 / 100) mylabel = paste0("cutoff= ", cutoff) hist_plot <- qplot( mydata[, variable], main = title, xlab = variable, geom="histogram", binwidth = mybinwidth, col = I("white")) + geom_vline(xintercept = cutoff) + annotate(geom="text", x=cutoff + mybinwidth, y=1, label=mylabel, color="white") plot(hist_plot) } # returns the highest value such as the sum of the ordered values including this highest value # is lower (below) than the percentile threshold (n) percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...){ p = n / 100 percentile_threshold = quantile(qcmetrics[, variable], p)[[1]] plot_hist(qcmetrics, variable, plot_title, percentile_threshold) return(percentile_threshold) } pdf(file = opt$pdfplot) # Determine thresholds based on percentile if (opt$percentile_counts > 0) { counts_threshold <- percentile_cutoff( opt$percentile_counts, QC_metrics, "total_counts", "Histogram of Aligned read counts per cell" )} else { counts_threshold <- opt$absolute_counts plot_hist(QC_metrics, variable = "total_counts", title = "Histogram of Total counts per cell", cutoff = counts_threshold) } if (opt$percentile_genes > 0) { genes_threshold <- percentile_cutoff( opt$percentile_genes, QC_metrics, "nGenes", "Histogram of Number of detected genes per cell" )} else { genes_threshold <- opt$absolute_genes plot_hist(QC_metrics, variable = "nGenes", title = "Histogram of Number of detected genes per cell", cutoff = genes_threshold) } # Filter out rows below thresholds (genes and read counts) if (opt$manage_cutoffs == 'union'){ QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold) } else { QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold) } ## Plot the results # Determine title from the parameter logics if (opt$percentile_counts > 0){ part_one = paste0("Cells with aligned reads counts below the ", opt$percentile_counts, "th percentile of aligned read counts")} else { part_one = paste0("Cells with aligned read counts below ", opt$absolute_counts) } if(opt$percentile_genes > 0){ part_two = paste0("with number of detected genes below the ", opt$percentile_genes, "th percentile of detected gene counts")} else { part_two = paste0("with number of detected genes below ", opt$absolute_genes) } if (opt$manage_cutoffs == "intersect") { conjunction = " and\n" } else { conjunction = " or\n" } # plot with ggplot2 ggplot(QC_metrics, aes(nGenes, total_counts, colour = filtered)) + geom_point() + scale_y_log10() + scale_colour_discrete(name = "", breaks= c(FALSE, TRUE), labels= c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"), paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)"))) + xlab("Detected genes per cell") + ylab("Aligned reads per cell (log10 scale)") + geom_vline(xintercept = genes_threshold) + geom_hline(yintercept = counts_threshold) + ggtitle( paste0(part_one, conjunction, part_two, "\nwere filtered out")) + theme(plot.title = element_text(size = 8, face = "bold")) dev.off() # Retrieve identifier of kept cells kept.cells <- QC_metrics$cell_id[!QC_metrics$filtered] data.counts <- data.frame(Genes=rownames(data.counts[,kept.cells]), data.counts[,kept.cells], check.names = FALSE) # Save filtered cells write.table( data.counts, opt$output, sep = "\t", quote = F, col.names = T, row.names = F ) # Add QC metrics of filtered cells to a metadata file metadata <- QC_metrics[kept.cells,] # Save the metadata (QC metrics) file write.table( metadata, opt$output_metada, sep = "\t", quote = F, col.names = T, row.names = F ) ######################### # log2(CPM +1) # # transformation # ######################### # Second step of the signature-based workflow # Transform raw counts into log2(CPM +1) # Example of command # Rscript 2-log2CPM1p.R -d ../1-filter_cells/filterCells.tsv -o log2CPM1p.tsv # Load necessary packages (install them if it's not the case) requiredPackages = c('optparse') for (p in requiredPackages) { if (!require(p, character.only = TRUE, quietly = T)) { install.packages(p) } suppressPackageStartupMessages(suppressMessages(library(p, character.only = TRUE))) } #Arguments option_list = list( make_option( c("-d", "--data"), default = NA, type = 'character', help = "Input file that contains count values to transform : Rows = genes and Columns = Samples" ), make_option( c("-s", "--sep"), default = '\t', type = 'character', help = "File separator [default : '%default' ]" ), make_option( c("-c", "--colnames"), default = TRUE, type = 'logical', help = "Consider first line as header ? [default : '%default' ]" ), make_option( c("-o", "--out"), default = "res.tab", type = 'character', help = "Output name [default : '%default' ]" ) ) opt = parse_args(OptionParser(option_list = option_list), args = commandArgs(trailingOnly = TRUE)) if (opt$data == "" & !(opt$help)) { stop("At least one argument must be supplied (count data).\n", call. = FALSE) } #Open files data.counts <- read.table( opt$data, h = opt$colnames, row.names = 1, sep = opt$sep, check.names = F ) #Raw counts to CPM cpm <- t(t(data.counts) / colSums(data.counts)) * 1000000 #CPM to log2(CPM +1) log2cpm1p <- log2(cpm +1) #Save file write.table( log2cpm1p, paste(opt$out), col.names = opt$colnames, row.names = T, quote = F, sep = "\t" ) ######################### # filter genes # ######################### # Third step of the signature-based workflow # Filter low expressed genes # Example of command (used for generate output file) : # Rscript 3-filter_genes.R -f ../log2CPM1p.tsv -o filterGenes.tsv --absolute_detection 3 # load packages that are provided in the conda env options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") library(optparse) # Arguments option_list = list( make_option( c("-f", "--input"), default = NA, type = 'character', help = "Input file that contains expression values to filter : Rows = genes and Columns = Samples" ), make_option( c("-s", "--sep"), default = '\t', type = 'character', help = "File separator [default : '%default' ]" ), make_option( c("-c", "--colnames"), default = TRUE, type = 'logical', help = "first line is a header [default : '%default' ]" ), make_option( "--percentile_detection", default = 0, type = 'numeric', help = "Include genes with detected expression in at least \ this fraction of cells [default : '%default' ]" ), make_option( "--absolute_detection", default = 0, type = 'numeric', help = "Include genes with detected expression in at least \ this number of cells [default : '%default' ]" ), make_option( c("-o", "--output"), default = NA, type = 'character', help = "Output name [default : '%default' ]" ) ) opt = parse_args(OptionParser(option_list = option_list), args = commandArgs(trailingOnly = TRUE)) if (opt$sep == "tab") {opt$sep = "\t"} if (opt$sep == "comma") {opt$sep = ","} # Open files data.counts <- read.table( opt$input, h = opt$colnames, row.names = 1, sep = opt$sep, check.names = F ) # note the [if else] below, to handle percentile_detection=absolute_detection=0 # Search for genes that are expressed in a certain percent of cells if (opt$percentile_detection > 0) { kept_genes <- rowSums(data.counts != 0) >= (opt$percentile_detection * ncol(data.counts)) } else { # Search for genes that are expressed in more than an absolute number of cells kept_genes <- rowSums(data.counts != 0) >= (opt$absolute_detection) } # Filter matrix data.counts <- data.counts[kept_genes,] # Save filtered matrix write.table( data.counts, opt$output, sep = "\t", quote = F, col.names = T, row.names = T ) ######################### # Signature score # ######################### # Fourth step of the signature-based workflow # Compute the signature score based on the # geometric mean of the target gene expression # and separate cells thanks to this signature #score into 2 groups (high/low). # Example of command # Rscript 4-signature_score.R --input --genes # --output --pdf --type sc # load packages that are provided in the conda env options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") warnings() library(optparse) library(psych) library(ggplot2) library(gridExtra) # Arguments option_list = list( make_option( "--input", default = NA, type = 'character', help = "Input file that contains log2(CPM +1) values : Rows = genes and Columns = Samples" ), make_option( "--sep", default = '\t', type = 'character', help = "File separator [default : '%default' ]" ), make_option( "--colnames", default = TRUE, type = 'logical', help = "Consider first line as header ? [default : '%default' ]" ), make_option( "--genes", default = NA, type = 'character', help = "File that contains the genes of interest (no header) : one gene per line" ), make_option( "--percentile_threshold", default = 20, type = 'integer', help = "Percentage of dectection threshold [default : '%default' ]" ), make_option( "--type", default = "sc", type = 'character', help="Type of data you want to process : bulk or sc"), make_option( "--output", default = "~/output.tab", type = 'character', help = "Output path [default : '%default' ]" ), make_option( "--pdf", default = "~/output.pdf", type = 'character', help = "Output path [default : '%default' ]" ) ) opt = parse_args(OptionParser(option_list = option_list), args = commandArgs(trailingOnly = TRUE)) if (opt$sep == "tab") {opt$sep = "\t"} if (opt$sep == "comma") {opt$sep = ","} # Take input data data.counts <- read.table( opt$input, h = opt$colnames, row.names = 1, sep = opt$sep, check.names = F ) # Get vector of target genes gene_list <- read.csv( opt$genes, header = FALSE, stringsAsFactors = FALSE ) genes <- as.character(gene_list[,1]) if (length(unique(genes %in% rownames(data.counts))) == 1) { if (unique(genes %in% rownames(data.counts)) == F) stop("None of these genes are in your dataset: ", opt$genes) } logical_genes <- rownames(data.counts) %in% genes # Retrieve target genes in counts data signature.counts <- subset(data.counts, logical_genes) ## Descriptive Statistics Function descriptive_stats = function(InputData) { SummaryData = data.frame( mean = rowMeans(InputData), SD = apply(InputData, 1, sd), Variance = apply(InputData, 1, var), Percentage_Detection = apply(InputData, 1, function(x, y = InputData) { (sum(x != 0) / ncol(y)) * 100 }) ) return(SummaryData) } signature_stats <- descriptive_stats(signature.counts) # Find poorly detected genes from the signature kept_genes <- signature_stats$Percentage_Detection >= opt$percentile_threshold # Add warnings if (length(unique(kept_genes)) > 1) { cat( "WARNINGS ! Following genes were removed from further analysis due to low gene expression :", paste(paste(rownames(signature.counts)[!kept_genes], round(signature_stats$Percentage_Detection[!kept_genes], 2), sep = " : "), collapse = ", "), "\n" ) } else { if (unique(kept_genes) == F) { stop( "None of these genes are detected in ", opt$percent, "% of your cells: ", paste(rownames(signature_stats), collapse = ", "), ". You can be less stringent thanks to --percent parameter." ) } } # Remove genes poorly detected in the dataset signature.counts <- signature.counts[kept_genes,] # Replace 0 by 1 counts signature.counts[signature.counts == 0] <- 1 # Geometric mean by cell score <- apply(signature.counts, 2, geometric.mean) # geometric.mean requires psych # Add results in signature_output if(opt$type == "sc"){ signature_output <- data.frame( cell = names(score), score = score, rate = ifelse(score > mean(score), "HIGH", "LOW"), nGenes = colSums(data.counts != 0), total_counts = colSums(data.counts) ) # Re-arrange score matrix for plots score <- data.frame(score = score, order = rank(score, ties.method = "first"), signature = signature_output$rate, stringsAsFactors = F) pdf(file = opt$pdf) dist_plot <- ggplot(score, aes(x = order, y = score)) + geom_line() + geom_segment(x = 0, xend = max(score$order[score$signature == "LOW"]), y = mean(score$score), yend = mean(score$score)) + geom_area(aes(fill = signature), alpha = .7) + scale_fill_manual(values=c("#ff0000", "#08661e")) + geom_text(aes(x = 1, y = mean(score)), label = "Mean", vjust = -0.3, colour = "black") + labs(title = "Ordered cell signature scores", x = "Cell index", y = "Score") plot(dist_plot) density_score <- density(score$score) dens_plot <- ggplot(data.frame(density_score[1:2]), aes(x, y, fill = ifelse(x < mean(score$score), "LOW", "HIGH"))) + geom_line() + geom_vline(xintercept = mean(score$score)) + geom_text(x = mean(score$score), y = max(density_score$y), label = "Mean", hjust = -0.3, colour = "black") + geom_area(alpha = .7) + scale_fill_manual(values=c("#ff0000", "#08661e")) + ylim(0, max(density_score$y)) + labs( title = "Distribution of Cell signature scores", x = paste("N =", density_score$n, "Bandwidth =", density_score$bw), y = "Density", fill = "Signature" ) plot(dens_plot) # Check score independant of low expression p_gene <- ggplot(signature_output, aes(rate, nGenes)) + geom_violin(aes(fill = rate), alpha = .5, trim = F, show.legend = F) + scale_fill_manual(values=c("#ff0000", "#08661e")) + geom_jitter() + labs(y = "Number of detected genes", x = "Signature") p_counts <- ggplot(signature_output, aes(rate, total_counts)) + geom_violin(aes(fill = rate), alpha = .5, trim = F, show.legend = F) + scale_fill_manual(values=c("#ff0000", "#08661e")) + geom_jitter() + labs(y = "Total counts", x = "Signature") grid.arrange(p_gene, p_counts, ncol = 2, top = "Influence of library sequencing depth on cell signature scores") dev.off() }else { signature_output <- data.frame( cell = names(score), score = score, rate = "middle", nGenes = colSums(data.counts != 0), total_counts = colSums(data.counts), stringsAsFactors = F ) signature_output$rate[signature_output$score <= quantile(signature_output$score)[2]] <- "LOW" signature_output$rate[signature_output$score >= quantile(signature_output$score)[4]] <- "HIGH" # Re-arrange score matrix for plots score <- data.frame(score = score, order = rank(score, ties.method = "first"), signature = signature_output$rate, stringsAsFactors = F) pdf(file = opt$pdf) distri_plot <- ggplot(score, aes(x = order, y = score, fill = signature)) + geom_line() + geom_area(aes(fill = signature), alpha = .7) + scale_fill_manual(values=c("#e67300", "#00b32d", "grey")) + labs(title = "Ordered bulk tumor signature scores", x = "Tumor index", y = "Score") plot(distri_plot) dev.off() } # Save file write.table( signature_output, opt$output, sep = "\t", quote = F, col.names = T, row.names = F ) #################### # Differential # # analysis # #################### # Fifth step of the signature-based workflow # Perform a differential analysis between 2 # groups high/low. # Example of command # Rscript 5-differential_analysis.R --input ../3-filter_genes/filterGenes_3.tsv --sep tab --colnames TRUE --comparison_factor_file signatureCategories.tsv --fdr 0.01 --log --output diff_analysis.tsv # load packages that are provided in the conda env options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") warnings() library(optparse) #Arguments option_list = list( make_option( "--input", default = NA, type = 'character', help = "Input file that contains log2(CPM +1) values : Rows = genes and Columns = Samples" ), make_option( "--sep", default = '\t', type = 'character', help = "File separator [default : '%default' ]" ), make_option( "--colnames", default = TRUE, type = 'logical', help = "Consider first line as header ? [default : '%default' ]" ), make_option( "--comparison_factor_file", default = NA, type = 'character', help = " A two column table : cell identifiers and a comparison factor that split cells in two categories (high/low, HOM/HET,...)" ), make_option( "--factor1", default = "HIGH", type = 'character', help = "First factor of rate category in comparison factor file" ), make_option( "--factor2", default = "LOW", type = 'character', help = "Second factor of rate category in comparison factor file" ), make_option( "--fdr", default = 0.01, type = 'numeric', help = "FDR threshold [default : '%default' ]" ), make_option( "--log", default=FALSE, action="store_true", type = 'logical', help = "Expression data are log-transformed [default : '%default' ]" ), make_option( "--output", default = "results.tsv", type = 'character', help = "Output name [default : '%default' ]" ) ) opt = parse_args(OptionParser(option_list = option_list), args = commandArgs(trailingOnly = TRUE)) if (opt$sep == "tab") {opt$sep = "\t"} if (opt$sep == "comma") {opt$sep = ","} #Open files data.counts <- read.table( opt$input, h = opt$colnames, row.names = 1, sep = opt$sep, check.names = F ) metadata <- read.table( opt$comparison_factor_file, header = TRUE, stringsAsFactors = F, sep = "\t", check.names = FALSE, row.names = 1 ) metadata <- subset(metadata, rownames(metadata) %in% colnames(data.counts)) # Create two logical named vectors for each factor level of cell signature factor1_cells <- setNames(metadata[,1] == opt$factor1, rownames(metadata)) factor2_cells <- setNames(metadata[,1] == opt$factor2, rownames(metadata)) ## Mann-Whitney test (Two-sample Wilcoxon test) MW_test <- data.frame(t(apply(data.counts, 1, function(x) { do.call("cbind", wilcox.test(x[names(factor1_cells)[factor1_cells]], x[names(factor2_cells)[factor2_cells]]))[, 1:2] })), stringsAsFactors = F) # Benjamini-Hochberg correction and significativity MW_test$p.adjust <- p.adjust(as.numeric(MW_test$p.value), method = "BH" , n = nrow(MW_test)) # MW_test$Critical.value <- (rank(MW_test$p.value) / nrow(MW_test)) * opt$fdr MW_test$Significant <- MW_test$p.adjust < opt$fdr ## Descriptive Statistics Function descriptive_stats <- function(InputData) { SummaryData = data.frame( mean = rowMeans(InputData), SD = apply(InputData, 1, sd), Variance = apply(InputData, 1, var), Percentage_Detection = apply(InputData, 1, function(x, y = InputData) { (sum(x != 0) / ncol(y)) * 100 }), mean_factor2 = rowMeans(InputData[,factor2_cells]), mean_factor1 = rowMeans(InputData[, factor1_cells]) ) if(opt$log) { SummaryData$fold_change <- SummaryData$mean_factor1 - SummaryData$mean_factor2 } else { SummaryData$fold_change <- SummaryData$mean_factor1 / SummaryData$mean_factor2 } return(SummaryData) } gene_stats <- descriptive_stats(data.counts) results <- merge(gene_stats, MW_test, by = "row.names") colnames(results)[1] <- "genes" # Save files write.table( results, opt$output, sep = "\t", quote = F, col.names = T, row.names = F ) #################### # Correlation # # analysis # #################### # Sixth step of the signature-based workflow # Performs multi-correlation analysis between the vectors of gene expressions # in single cell RNAseq libraries and the vectors of signature scores in these # same single cell RNAseq libraries. # All data file used in input must have the same delimiter. # Example of command # Rscript 6-correlation.R --expression_file ../3-filter_genes/filterGenes.tsv --signatures_file signatureScore.tsv --gene_metadata ../5-differential_analysis/diff_analysis.tsv # load packages that are provided in the conda env options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") requiredPackages = c('optparse', 'Hmisc') warnings() library(optparse) library(Hmisc) # Arguments option_list = list( make_option( "--sep", default = '\t', type = 'character', help = "File separator, must be the same for all input files [default : '%default' ]" ), make_option( "--colnames", default = TRUE, type = 'logical', help = "Consider first lines as header (must stand for all input files) [default : '%default' ]" ), make_option( "--expression_file", default = NA, type = 'character', help = "Input file that contains log2(CPM +1) expression values : Rows = genes and Columns = Samples" ), make_option( "--signatures_file", default = NA, type = 'character', help = "Input file that contains cell signature : two column table (with header) of cell identifiers and a signature score value for each cell" ), make_option( c("-g", "--gene_metadata"), default = NA, type = 'character', help = "Input file that contains genes metadata" ), make_option( "--sig_corr", default = "sig_corr.tsv", type = 'character', help = "signature correlations output [default : '%default' ]" ), make_option( "--gene_corr", default = "gene_corr.tsv", type = 'character', help = "gene-gene correlations output [default : '%default' ]" ), make_option( "--gene_corr_pval", default = "gene_corr_pval.tsv", type = 'character', help = "gene-gene correlation pvalues output [default : '%default' ]" ) ) opt = parse_args(OptionParser(option_list = option_list), args = commandArgs(trailingOnly = TRUE)) if (opt$sep == "tab") {opt$sep = "\t"} if (opt$sep == "comma") {opt$sep = ","} # Open files data <- read.table( opt$expression_file, header = opt$colnames, row.names = 1, sep = opt$sep, check.names = F ) signature <- read.delim( opt$signatures_file, header = T, stringsAsFactors = F, row.names = 1, sep = opt$sep, check.names = F ) gene_metadata <- read.delim( opt$gene_metadata, header = TRUE, stringsAsFactors = F, sep = opt$sep, check.names = FALSE, row.names = 1 ) #Retrieve significantly differentially expressed genes DE_genes <- rownames(subset(gene_metadata, Significant == TRUE)) #Filter expression matrix data <- data[DE_genes,] # keep only signatures that are in the expression dataframe signature <- subset(signature, rownames(signature) %in% colnames(data)) # Add signature score to expression matrix data <- rbind(t(signature), data) # Gene correlation gene_corr <- rcorr(t(data), type = "pearson") # transpose because we correlate genes, not cells # Gene correlation with signature score gene_signature_corr <- cbind.data.frame(gene = colnames(gene_corr$r), Pearson_correlation = gene_corr$r[, 1], p_value = gene_corr$P[, 1]) gene_signature_corr <- gene_signature_corr[ order(gene_signature_corr[,2], decreasing = T), ] # Save files write.table( gene_signature_corr, file = opt$sig_corr, sep = "\t", quote = F, col.names = T, row.names = F ) r_genes <- data.frame(gene=rownames(gene_corr$r), gene_corr$r) # add rownames as a variable for output p_genes <- data.frame(gene=rownames(gene_corr$P), gene_corr$P) # add rownames as a variable for output write.table( r_genes[-1,-2], file = opt$gene_corr, sep = "\t", quote = F, col.names = T, row.names = F ) write.table( p_genes[-1,-2], file = opt$gene_corr_pval, sep = "\t", quote = F, col.names = T, row.names = F )