# Signature-based analysis
Here is the method used to explore glioblastoma heterogeneity. The aim is to separate cells based on a tumorigenic score (determined by previous analyses) to be able to find functional states shared among patients.
### Data
To test this method, we used a scRNAseq dataset from Darmanis analysis ( [article](https://doi.org/10.1016/j.celrep.2017.10.030) ).
### Steps
- Filter cells :
- Filter out cells based on :
- library size
- number of detected genes in the cells
Generate Fig. S1A and S1B
- Transform the raw counts matrix in log2(CPM+1)
- Gene filtering :
- Filter out genes that aren't detected in at least 3% of the remaining cells
- Calculation of a Tumorigenic score :
- Retrieve genes of interest in the log2(CPM+1) filtered matrix
- Remove genes that are poorly expressed (default : 20% of detection)
- Score = Geometric mean of the retained genes
- Separate cells in two groups with signature scores :
- lower or higher than the signature score mean (for single cell data), respectively
- lower than the first quartile and higher than the third quartile (for bulk tumor sample), respectively
Generated Fig 3A (left panel), 3C and S3
- Differential analysis :
- Mann Whitney test between low/high groups
- Benjamini Hochberg correction : FDR = 0.01
- Correlation analysis :
- Retrieve significantly differentially expressed genes (all genes with a Benjamini-Hotchberg adjusted p-values < FDR)
- Pearson Correlation between tumorigenic score and gene expression across all cells
- Additional output : Pearson correlation between gene expressions across all cells
#####################
# Filter cells #
#####################
# First step of the signature-based workflow
# Using cutoff based on number of transcripts and
# number of detected genes to remove low quality cells
# Example of command (that generates output files) :
# Rscript filter_cells.R -f --sep "/t" --absolute_genes 1700 --absolute_counts 90000 --pdfplot --output --output_metada
# load packages that are provided in the conda env
options( show.error.messages=F,
error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
warnings()
library(optparse)
library(ggplot2)
# Arguments
option_list = list(
make_option(c("-f", "--file"), default=NA, type='character',
help="Input file that contains values to filter : Rows = genes and Columns = Samples"),
make_option("--sep", default="\t", type='character',
help="File column separator [default : '%default' ]"),
make_option("--percentile_genes", default=0, type='integer',
help="nth Percentile of the number of genes detected by a cell distribution [default : '%default' ]"),
make_option("--percentile_counts", default=0, type='integer',
help="nth Percentile of the total counts per cell distribution [default : '%default' ]"),
make_option("--absolute_genes", default=0, type='integer',
help="Remove cells that didn't express at least this number of genes [default : '%default' ]"),
make_option("--absolute_counts", default=0, type='integer',
help="Number of transcript threshold for cell filtering [default : '%default' ]"),
make_option("--manage_cutoffs", default="intersect", type='character',
help="combine or intersect cutoffs for filtering"),
make_option("--pdfplot", type = 'character',
default = "plotfile.pdf",
help="Path to pdf file of the plots"),
make_option("--output", type = 'character',
default = "filtered_cells.tsv",
help="Path to tsv file of filtered cell data"),
make_option("--output_metada", type = 'character',
default = "filtered_metadata.tsv",
help="Path to tsv file of filtered cell metadata")
)
opt = parse_args(OptionParser(option_list=option_list), args = commandArgs(trailingOnly = TRUE))
if (opt$sep == "tab") {opt$sep = "\t"}
if (opt$sep == "comma") {opt$sep = ","}
if (opt$sep == "space") {opt$sep = " "}
# check consistency of filtering options
if ((opt$percentile_counts > 0) & (opt$absolute_counts > 0)) {
opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering
# if ((opt$percentile_counts == 0) & (opt$absolute_counts == 0)) {
# opt$percentile_counts = 0 } # since input parameters are not consistent (one or either method, not both), no filtering
if ((opt$percentile_genes > 0) & (opt$absolute_genes > 0)) {
opt$percentile_genes = 0 } # since input parameters are not consistent (one or either method, not both), no filtering
# if ((opt$percentile_genes == 0) & (opt$absolute_genes == 0)) {
# opt$percentile_genes = 100 } # since input parameters are not consistent (one or either method, not both), no filtering
# Import datasets
data.counts <- read.table(
opt$file,
header = TRUE,
stringsAsFactors = F,
sep = opt$sep,
check.names = FALSE,
row.names = 1
)
QC_metrics <-
data.frame(cell_id = colnames(data.counts),
nGenes = colSums(data.counts != 0), # nGenes : Number of detected genes for each cell
total_counts = colSums(data.counts), # total_counts : Total counts per cell
stringsAsFactors = F)
plot_hist <- function(mydata, variable, title, cutoff){
mybinwidth = round(max(mydata[, variable]) * 5 / 100)
mylabel = paste0("cutoff= ", cutoff)
hist_plot <- qplot(
mydata[, variable],
main = title,
xlab = variable,
geom="histogram",
binwidth = mybinwidth,
col = I("white")) +
geom_vline(xintercept = cutoff) +
annotate(geom="text",
x=cutoff + mybinwidth, y=1,
label=mylabel, color="white")
plot(hist_plot)
}
# returns the highest value such as the sum of the ordered values including this highest value
# is lower (below) than the percentile threshold (n)
percentile_cutoff <- function(n, qcmetrics, variable, plot_title, ...){
p = n / 100
percentile_threshold = quantile(qcmetrics[, variable], p)[[1]]
plot_hist(qcmetrics,
variable,
plot_title,
percentile_threshold)
return(percentile_threshold)
}
pdf(file = opt$pdfplot)
# Determine thresholds based on percentile
if (opt$percentile_counts > 0) {
counts_threshold <- percentile_cutoff(
opt$percentile_counts,
QC_metrics,
"total_counts",
"Histogram of Aligned read counts per cell"
)} else {
counts_threshold <- opt$absolute_counts
plot_hist(QC_metrics,
variable = "total_counts",
title = "Histogram of Total counts per cell",
cutoff = counts_threshold)
}
if (opt$percentile_genes > 0) {
genes_threshold <- percentile_cutoff(
opt$percentile_genes,
QC_metrics,
"nGenes",
"Histogram of Number of detected genes per cell"
)} else {
genes_threshold <- opt$absolute_genes
plot_hist(QC_metrics,
variable = "nGenes",
title = "Histogram of Number of detected genes per cell",
cutoff = genes_threshold)
}
# Filter out rows below thresholds (genes and read counts)
if (opt$manage_cutoffs == 'union'){
QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) | (QC_metrics$total_counts < counts_threshold)
} else {
QC_metrics$filtered <- (QC_metrics$nGenes < genes_threshold) & (QC_metrics$total_counts < counts_threshold)
}
## Plot the results
# Determine title from the parameter logics
if (opt$percentile_counts > 0){
part_one = paste0("Cells with aligned reads counts below the ",
opt$percentile_counts,
"th percentile of aligned read counts")} else {
part_one = paste0("Cells with aligned read counts below ",
opt$absolute_counts)
}
if(opt$percentile_genes > 0){
part_two = paste0("with number of detected genes below the ",
opt$percentile_genes,
"th percentile of detected gene counts")} else {
part_two = paste0("with number of detected genes below ",
opt$absolute_genes)
}
if (opt$manage_cutoffs == "intersect") {
conjunction = " and\n" } else {
conjunction = " or\n"
}
# plot with ggplot2
ggplot(QC_metrics, aes(nGenes, total_counts, colour = filtered)) +
geom_point() + scale_y_log10() +
scale_colour_discrete(name = "",
breaks= c(FALSE, TRUE),
labels= c(paste0("Not filtered (", table(QC_metrics$filtered)[1], " cells)"),
paste0("Filtered (", table(QC_metrics$filtered)[2], " cells)"))) +
xlab("Detected genes per cell") + ylab("Aligned reads per cell (log10 scale)") +
geom_vline(xintercept = genes_threshold) + geom_hline(yintercept = counts_threshold) +
ggtitle( paste0(part_one, conjunction, part_two, "\nwere filtered out")) +
theme(plot.title = element_text(size = 8, face = "bold"))
dev.off()
# Retrieve identifier of kept cells
kept.cells <- QC_metrics$cell_id[!QC_metrics$filtered]
data.counts <- data.frame(Genes=rownames(data.counts[,kept.cells]), data.counts[,kept.cells], check.names = FALSE)
# Save filtered cells
write.table(
data.counts,
opt$output,
sep = "\t",
quote = F,
col.names = T,
row.names = F
)
# Add QC metrics of filtered cells to a metadata file
metadata <- QC_metrics[kept.cells,]
# Save the metadata (QC metrics) file
write.table(
metadata,
opt$output_metada,
sep = "\t",
quote = F,
col.names = T,
row.names = F
)
#########################
# log2(CPM +1) #
# transformation #
#########################
# Second step of the signature-based workflow
# Transform raw counts into log2(CPM +1)
# Example of command
# Rscript 2-log2CPM1p.R -d ../1-filter_cells/filterCells.tsv -o log2CPM1p.tsv
# Load necessary packages (install them if it's not the case)
requiredPackages = c('optparse')
for (p in requiredPackages) {
if (!require(p, character.only = TRUE, quietly = T)) {
install.packages(p)
}
suppressPackageStartupMessages(suppressMessages(library(p, character.only = TRUE)))
}
#Arguments
option_list = list(
make_option(
c("-d", "--data"),
default = NA,
type = 'character',
help = "Input file that contains count values to transform : Rows = genes and Columns = Samples"
),
make_option(
c("-s", "--sep"),
default = '\t',
type = 'character',
help = "File separator [default : '%default' ]"
),
make_option(
c("-c", "--colnames"),
default = TRUE,
type = 'logical',
help = "Consider first line as header ? [default : '%default' ]"
),
make_option(
c("-o", "--out"),
default = "res.tab",
type = 'character',
help = "Output name [default : '%default' ]"
)
)
opt = parse_args(OptionParser(option_list = option_list),
args = commandArgs(trailingOnly = TRUE))
if (opt$data == "" & !(opt$help)) {
stop("At least one argument must be supplied (count data).\n",
call. = FALSE)
}
#Open files
data.counts <- read.table(
opt$data,
h = opt$colnames,
row.names = 1,
sep = opt$sep,
check.names = F
)
#Raw counts to CPM
cpm <- t(t(data.counts) / colSums(data.counts)) * 1000000
#CPM to log2(CPM +1)
log2cpm1p <- log2(cpm +1)
#Save file
write.table(
log2cpm1p,
paste(opt$out),
col.names = opt$colnames,
row.names = T,
quote = F,
sep = "\t"
)
#########################
# filter genes #
#########################
# Third step of the signature-based workflow
# Filter low expressed genes
# Example of command (used for generate output file) :
# Rscript 3-filter_genes.R -f ../log2CPM1p.tsv -o filterGenes.tsv --absolute_detection 3
# load packages that are provided in the conda env
options( show.error.messages=F,
error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
library(optparse)
# Arguments
option_list = list(
make_option(
c("-f", "--input"),
default = NA,
type = 'character',
help = "Input file that contains expression values to filter : Rows = genes and Columns = Samples"
),
make_option(
c("-s", "--sep"),
default = '\t',
type = 'character',
help = "File separator [default : '%default' ]"
),
make_option(
c("-c", "--colnames"),
default = TRUE,
type = 'logical',
help = "first line is a header [default : '%default' ]"
),
make_option(
"--percentile_detection",
default = 0,
type = 'numeric',
help = "Include genes with detected expression in at least \
this fraction of cells [default : '%default' ]"
),
make_option(
"--absolute_detection",
default = 0,
type = 'numeric',
help = "Include genes with detected expression in at least \
this number of cells [default : '%default' ]"
),
make_option(
c("-o", "--output"),
default = NA,
type = 'character',
help = "Output name [default : '%default' ]"
)
)
opt = parse_args(OptionParser(option_list = option_list),
args = commandArgs(trailingOnly = TRUE))
if (opt$sep == "tab") {opt$sep = "\t"}
if (opt$sep == "comma") {opt$sep = ","}
# Open files
data.counts <- read.table(
opt$input,
h = opt$colnames,
row.names = 1,
sep = opt$sep,
check.names = F
)
# note the [if else] below, to handle percentile_detection=absolute_detection=0
# Search for genes that are expressed in a certain percent of cells
if (opt$percentile_detection > 0) {
kept_genes <- rowSums(data.counts != 0) >= (opt$percentile_detection * ncol(data.counts))
} else {
# Search for genes that are expressed in more than an absolute number of cells
kept_genes <- rowSums(data.counts != 0) >= (opt$absolute_detection)
}
# Filter matrix
data.counts <- data.counts[kept_genes,]
# Save filtered matrix
write.table(
data.counts,
opt$output,
sep = "\t",
quote = F,
col.names = T,
row.names = T
)
#########################
# Signature score #
#########################
# Fourth step of the signature-based workflow
# Compute the signature score based on the
# geometric mean of the target gene expression
# and separate cells thanks to this signature
#score into 2 groups (high/low).
# Example of command
# Rscript 4-signature_score.R --input --genes
# --output --pdf --type sc
# load packages that are provided in the conda env
options( show.error.messages=F,
error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
warnings()
library(optparse)
library(psych)
library(ggplot2)
library(gridExtra)
# Arguments
option_list = list(
make_option(
"--input",
default = NA,
type = 'character',
help = "Input file that contains log2(CPM +1) values : Rows = genes and Columns = Samples"
),
make_option(
"--sep",
default = '\t',
type = 'character',
help = "File separator [default : '%default' ]"
),
make_option(
"--colnames",
default = TRUE,
type = 'logical',
help = "Consider first line as header ? [default : '%default' ]"
),
make_option(
"--genes",
default = NA,
type = 'character',
help = "File that contains the genes of interest (no header) : one gene per line"
),
make_option(
"--percentile_threshold",
default = 20,
type = 'integer',
help = "Percentage of dectection threshold [default : '%default' ]"
),
make_option(
"--type",
default = "sc",
type = 'character',
help="Type of data you want to process : bulk or sc"),
make_option(
"--output",
default = "~/output.tab",
type = 'character',
help = "Output path [default : '%default' ]"
),
make_option(
"--pdf",
default = "~/output.pdf",
type = 'character',
help = "Output path [default : '%default' ]"
)
)
opt = parse_args(OptionParser(option_list = option_list),
args = commandArgs(trailingOnly = TRUE))
if (opt$sep == "tab") {opt$sep = "\t"}
if (opt$sep == "comma") {opt$sep = ","}
# Take input data
data.counts <- read.table(
opt$input,
h = opt$colnames,
row.names = 1,
sep = opt$sep,
check.names = F
)
# Get vector of target genes
gene_list <- read.csv(
opt$genes,
header = FALSE,
stringsAsFactors = FALSE
)
genes <- as.character(gene_list[,1])
if (length(unique(genes %in% rownames(data.counts))) == 1) {
if (unique(genes %in% rownames(data.counts)) == F)
stop("None of these genes are in your dataset: ", opt$genes)
}
logical_genes <- rownames(data.counts) %in% genes
# Retrieve target genes in counts data
signature.counts <- subset(data.counts, logical_genes)
## Descriptive Statistics Function
descriptive_stats = function(InputData) {
SummaryData = data.frame(
mean = rowMeans(InputData),
SD = apply(InputData, 1, sd),
Variance = apply(InputData, 1, var),
Percentage_Detection = apply(InputData, 1, function(x, y = InputData) {
(sum(x != 0) / ncol(y)) * 100
})
)
return(SummaryData)
}
signature_stats <- descriptive_stats(signature.counts)
# Find poorly detected genes from the signature
kept_genes <- signature_stats$Percentage_Detection >= opt$percentile_threshold
# Add warnings
if (length(unique(kept_genes)) > 1) {
cat(
"WARNINGS ! Following genes were removed from further analysis due to low gene expression :",
paste(paste(rownames(signature.counts)[!kept_genes], round(signature_stats$Percentage_Detection[!kept_genes], 2), sep = " : "), collapse = ", "),
"\n"
)
} else {
if (unique(kept_genes) == F) {
stop(
"None of these genes are detected in ",
opt$percent,
"% of your cells: ",
paste(rownames(signature_stats), collapse = ", "),
". You can be less stringent thanks to --percent parameter."
)
}
}
# Remove genes poorly detected in the dataset
signature.counts <- signature.counts[kept_genes,]
# Replace 0 by 1 counts
signature.counts[signature.counts == 0] <- 1
# Geometric mean by cell
score <- apply(signature.counts, 2, geometric.mean) # geometric.mean requires psych
# Add results in signature_output
if(opt$type == "sc"){
signature_output <- data.frame(
cell = names(score),
score = score,
rate = ifelse(score > mean(score), "HIGH", "LOW"),
nGenes = colSums(data.counts != 0),
total_counts = colSums(data.counts)
)
# Re-arrange score matrix for plots
score <- data.frame(score = score,
order = rank(score, ties.method = "first"),
signature = signature_output$rate,
stringsAsFactors = F)
pdf(file = opt$pdf)
dist_plot <- ggplot(score, aes(x = order, y = score)) +
geom_line() +
geom_segment(x = 0, xend = max(score$order[score$signature == "LOW"]), y = mean(score$score), yend = mean(score$score)) +
geom_area(aes(fill = signature), alpha = .7) +
scale_fill_manual(values=c("#ff0000", "#08661e")) +
geom_text(aes(x = 1, y = mean(score)), label = "Mean", vjust = -0.3, colour = "black") +
labs(title = "Ordered cell signature scores", x = "Cell index", y = "Score")
plot(dist_plot)
density_score <- density(score$score)
dens_plot <- ggplot(data.frame(density_score[1:2]), aes(x, y, fill = ifelse(x < mean(score$score), "LOW", "HIGH"))) +
geom_line() +
geom_vline(xintercept = mean(score$score)) +
geom_text(x = mean(score$score), y = max(density_score$y), label = "Mean", hjust = -0.3, colour = "black") +
geom_area(alpha = .7) +
scale_fill_manual(values=c("#ff0000", "#08661e")) +
ylim(0, max(density_score$y)) +
labs(
title = "Distribution of Cell signature scores",
x = paste("N =", density_score$n, "Bandwidth =", density_score$bw),
y = "Density",
fill = "Signature"
)
plot(dens_plot)
# Check score independant of low expression
p_gene <- ggplot(signature_output, aes(rate, nGenes)) +
geom_violin(aes(fill = rate), alpha = .5, trim = F, show.legend = F) +
scale_fill_manual(values=c("#ff0000", "#08661e")) +
geom_jitter() + labs(y = "Number of detected genes", x = "Signature")
p_counts <- ggplot(signature_output, aes(rate, total_counts)) +
geom_violin(aes(fill = rate), alpha = .5, trim = F, show.legend = F) +
scale_fill_manual(values=c("#ff0000", "#08661e")) +
geom_jitter() + labs(y = "Total counts", x = "Signature")
grid.arrange(p_gene, p_counts, ncol = 2, top = "Influence of library sequencing depth on cell signature scores")
dev.off()
}else {
signature_output <- data.frame(
cell = names(score),
score = score,
rate = "middle",
nGenes = colSums(data.counts != 0),
total_counts = colSums(data.counts),
stringsAsFactors = F
)
signature_output$rate[signature_output$score <= quantile(signature_output$score)[2]] <- "LOW"
signature_output$rate[signature_output$score >= quantile(signature_output$score)[4]] <- "HIGH"
# Re-arrange score matrix for plots
score <- data.frame(score = score,
order = rank(score, ties.method = "first"),
signature = signature_output$rate,
stringsAsFactors = F)
pdf(file = opt$pdf)
distri_plot <- ggplot(score, aes(x = order, y = score, fill = signature)) +
geom_line() +
geom_area(aes(fill = signature), alpha = .7) +
scale_fill_manual(values=c("#e67300", "#00b32d", "grey")) +
labs(title = "Ordered bulk tumor signature scores", x = "Tumor index", y = "Score")
plot(distri_plot)
dev.off()
}
# Save file
write.table(
signature_output,
opt$output,
sep = "\t",
quote = F,
col.names = T,
row.names = F
)
####################
# Differential #
# analysis #
####################
# Fifth step of the signature-based workflow
# Perform a differential analysis between 2
# groups high/low.
# Example of command
# Rscript 5-differential_analysis.R --input ../3-filter_genes/filterGenes_3.tsv --sep tab --colnames TRUE --comparison_factor_file signatureCategories.tsv --fdr 0.01 --log --output diff_analysis.tsv
# load packages that are provided in the conda env
options( show.error.messages=F,
error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
warnings()
library(optparse)
#Arguments
option_list = list(
make_option(
"--input",
default = NA,
type = 'character',
help = "Input file that contains log2(CPM +1) values : Rows = genes and Columns = Samples"
),
make_option(
"--sep",
default = '\t',
type = 'character',
help = "File separator [default : '%default' ]"
),
make_option(
"--colnames",
default = TRUE,
type = 'logical',
help = "Consider first line as header ? [default : '%default' ]"
),
make_option(
"--comparison_factor_file",
default = NA,
type = 'character',
help = " A two column table : cell identifiers and a comparison factor that split cells in two categories (high/low, HOM/HET,...)"
),
make_option(
"--factor1",
default = "HIGH",
type = 'character',
help = "First factor of rate category in comparison factor file"
),
make_option(
"--factor2",
default = "LOW",
type = 'character',
help = "Second factor of rate category in comparison factor file"
),
make_option(
"--fdr",
default = 0.01,
type = 'numeric',
help = "FDR threshold [default : '%default' ]"
),
make_option(
"--log",
default=FALSE,
action="store_true",
type = 'logical',
help = "Expression data are log-transformed [default : '%default' ]"
),
make_option(
"--output",
default = "results.tsv",
type = 'character',
help = "Output name [default : '%default' ]"
)
)
opt = parse_args(OptionParser(option_list = option_list),
args = commandArgs(trailingOnly = TRUE))
if (opt$sep == "tab") {opt$sep = "\t"}
if (opt$sep == "comma") {opt$sep = ","}
#Open files
data.counts <- read.table(
opt$input,
h = opt$colnames,
row.names = 1,
sep = opt$sep,
check.names = F
)
metadata <- read.table(
opt$comparison_factor_file,
header = TRUE,
stringsAsFactors = F,
sep = "\t",
check.names = FALSE,
row.names = 1
)
metadata <- subset(metadata, rownames(metadata) %in% colnames(data.counts))
# Create two logical named vectors for each factor level of cell signature
factor1_cells <- setNames(metadata[,1] == opt$factor1, rownames(metadata))
factor2_cells <- setNames(metadata[,1] == opt$factor2, rownames(metadata))
## Mann-Whitney test (Two-sample Wilcoxon test)
MW_test <- data.frame(t(apply(data.counts, 1, function(x) {
do.call("cbind", wilcox.test(x[names(factor1_cells)[factor1_cells]], x[names(factor2_cells)[factor2_cells]]))[, 1:2]
})), stringsAsFactors = F)
# Benjamini-Hochberg correction and significativity
MW_test$p.adjust <- p.adjust(as.numeric(MW_test$p.value), method = "BH" , n = nrow(MW_test))
# MW_test$Critical.value <- (rank(MW_test$p.value) / nrow(MW_test)) * opt$fdr
MW_test$Significant <- MW_test$p.adjust < opt$fdr
## Descriptive Statistics Function
descriptive_stats <- function(InputData) {
SummaryData = data.frame(
mean = rowMeans(InputData),
SD = apply(InputData, 1, sd),
Variance = apply(InputData, 1, var),
Percentage_Detection = apply(InputData, 1, function(x, y = InputData) {
(sum(x != 0) / ncol(y)) * 100
}),
mean_factor2 = rowMeans(InputData[,factor2_cells]),
mean_factor1 = rowMeans(InputData[, factor1_cells])
)
if(opt$log) {
SummaryData$fold_change <- SummaryData$mean_factor1 - SummaryData$mean_factor2
} else {
SummaryData$fold_change <- SummaryData$mean_factor1 / SummaryData$mean_factor2
}
return(SummaryData)
}
gene_stats <- descriptive_stats(data.counts)
results <- merge(gene_stats, MW_test, by = "row.names")
colnames(results)[1] <- "genes"
# Save files
write.table(
results,
opt$output,
sep = "\t",
quote = F,
col.names = T,
row.names = F
)
####################
# Correlation #
# analysis #
####################
# Sixth step of the signature-based workflow
# Performs multi-correlation analysis between the vectors of gene expressions
# in single cell RNAseq libraries and the vectors of signature scores in these
# same single cell RNAseq libraries.
# All data file used in input must have the same delimiter.
# Example of command
# Rscript 6-correlation.R --expression_file ../3-filter_genes/filterGenes.tsv --signatures_file signatureScore.tsv --gene_metadata ../5-differential_analysis/diff_analysis.tsv
# load packages that are provided in the conda env
options( show.error.messages=F,
error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
requiredPackages = c('optparse', 'Hmisc')
warnings()
library(optparse)
library(Hmisc)
# Arguments
option_list = list(
make_option(
"--sep",
default = '\t',
type = 'character',
help = "File separator, must be the same for all input files [default : '%default' ]"
),
make_option(
"--colnames",
default = TRUE,
type = 'logical',
help = "Consider first lines as header (must stand for all input files) [default : '%default' ]"
),
make_option(
"--expression_file",
default = NA,
type = 'character',
help = "Input file that contains log2(CPM +1) expression values : Rows = genes and Columns = Samples"
),
make_option(
"--signatures_file",
default = NA,
type = 'character',
help = "Input file that contains cell signature : two column table (with header) of cell identifiers and a signature score value for each cell"
),
make_option(
c("-g", "--gene_metadata"),
default = NA,
type = 'character',
help = "Input file that contains genes metadata"
),
make_option(
"--sig_corr",
default = "sig_corr.tsv",
type = 'character',
help = "signature correlations output [default : '%default' ]"
),
make_option(
"--gene_corr",
default = "gene_corr.tsv",
type = 'character',
help = "gene-gene correlations output [default : '%default' ]"
),
make_option(
"--gene_corr_pval",
default = "gene_corr_pval.tsv",
type = 'character',
help = "gene-gene correlation pvalues output [default : '%default' ]"
)
)
opt = parse_args(OptionParser(option_list = option_list),
args = commandArgs(trailingOnly = TRUE))
if (opt$sep == "tab") {opt$sep = "\t"}
if (opt$sep == "comma") {opt$sep = ","}
# Open files
data <- read.table(
opt$expression_file,
header = opt$colnames,
row.names = 1,
sep = opt$sep,
check.names = F
)
signature <- read.delim(
opt$signatures_file,
header = T,
stringsAsFactors = F,
row.names = 1,
sep = opt$sep,
check.names = F
)
gene_metadata <- read.delim(
opt$gene_metadata,
header = TRUE,
stringsAsFactors = F,
sep = opt$sep,
check.names = FALSE,
row.names = 1
)
#Retrieve significantly differentially expressed genes
DE_genes <- rownames(subset(gene_metadata, Significant == TRUE))
#Filter expression matrix
data <- data[DE_genes,]
# keep only signatures that are in the expression dataframe
signature <- subset(signature, rownames(signature) %in% colnames(data))
# Add signature score to expression matrix
data <- rbind(t(signature), data)
# Gene correlation
gene_corr <- rcorr(t(data), type = "pearson") # transpose because we correlate genes, not cells
# Gene correlation with signature score
gene_signature_corr <- cbind.data.frame(gene = colnames(gene_corr$r),
Pearson_correlation = gene_corr$r[, 1],
p_value = gene_corr$P[, 1])
gene_signature_corr <- gene_signature_corr[ order(gene_signature_corr[,2], decreasing = T), ]
# Save files
write.table(
gene_signature_corr,
file = opt$sig_corr,
sep = "\t",
quote = F,
col.names = T,
row.names = F
)
r_genes <- data.frame(gene=rownames(gene_corr$r), gene_corr$r) # add rownames as a variable for output
p_genes <- data.frame(gene=rownames(gene_corr$P), gene_corr$P) # add rownames as a variable for output
write.table(
r_genes[-1,-2],
file = opt$gene_corr,
sep = "\t",
quote = F,
col.names = T,
row.names = F
)
write.table(
p_genes[-1,-2],
file = opt$gene_corr_pval,
sep = "\t",
quote = F,
col.names = T,
row.names = F
)