rm(list=ls()) quad <- read.table("pqs-tss-feature.csv", sep=",",header=TRUE,stringsAsFactors=FALSE) # import dataset for Bloom Syndrome Patients bdat <- read.table("Bloom Syndrome patient data.csv", sep=",", header=TRUE) # split gene sets into up- and down-regulated blm.up1 <- as.data.frame(bdat[with(bdat, log2.ER. > 0), ]$gene) blm.down1 <- as.data.frame(bdat[with(bdat, log2.ER. < 0), ]$gene) colnames(blm.up1) <- colnames(blm.down1) <- "gene_name" # merge blm.up1 and blm.down1 datasets with quad --> full TSS and PQS lists blm.up2 <- merge(quad, blm.up1, by="gene_name") blm.down2 <- merge(quad, blm.down1, by="gene_name") # calculate numbers of TSS in resepctive datasets genomic.tss <- length(unique(quad$unique.tss.number)) blm.up.tss <- length(unique(blm.up2$unique.tss.number)) blm.down.tss <- length(unique(blm.down2$unique.tss.number)) # print datasets to file ####################### sink("Bloom Syndrome dataset stats.txt") print("number of unique genes") print(length(unique(quad$gene_name))) print("number of unique TSS") print(length(unique(quad$unique.tss.number))) print("number of PQS per TSS genome-wide") print(sum(!is.na(quad$delta))/genomic.tss) print("number of genes in bdat dataset") print(nrow(bdat)) print("number of up-regulated genes") print(nrow(blm.up1)) print("number of TSS in up-regulated gene set") print(blm.up.tss) print("number of PQS in up-regulated gene set") print(sum(!is.na(blm.up2$delta))) print("number of PQS per TSS in up-regulated gene set") print(sum(!is.na(blm.up2$delta))/blm.up.tss) print("number of down-regulated genes") print(nrow(blm.down1)) print("number of TSS in down-regulated gene set") print(blm.down.tss) print("number of PQS in down-regulated gene set") print(sum(!is.na(blm.down2$delta))) print("number of PQS per TSS in down-regulated gene set") print(sum(!is.na(blm.down2$delta))/blm.down.tss) sink() ###################### # split into template strand (TS) andnon-template strand (NTS) strand datasets ###################### all <- quad[with(quad, is.na(delta) == FALSE), ]$delta plus <- quad[with(quad, strand.orient == "t" & is.na(delta) == FALSE), ]$delta minus <- quad[with(quad, strand.orient == "nt" & is.na(delta) == FALSE), ]$delta blm.up <- blm.up2[with(blm.up2, is.na(delta) == FALSE), ]$delta blm.up.plus <- blm.up2[with(blm.up2, strand.orient == "t" & is.na(delta) == FALSE), ]$delta blm.up.minus <- blm.up2[with(blm.up2, strand.orient == "nt" & is.na(delta) == FALSE), ]$delta blm.down <- blm.down2[with(blm.down2, is.na(delta) == FALSE), ]$delta blm.down.plus <- blm.down2[with(blm.down2, strand.orient == "t" & is.na(delta) == FALSE), ]$delta blm.down.minus <- blm.down2[with(blm.down2, strand.orient == "nt" & is.na(delta) == FALSE), ]$delta ## histogram analysis of all quadruplexes in control and differentially-expressed genes bins <- seq(from=-2000, to=2000, by=100) h1 <- hist(as.numeric(all), breaks=bins, plot=FALSE) h2 <- hist(as.numeric(blm.up), breaks=bins, plot=FALSE) h3 <- hist(as.numeric(blm.down), breaks=bins, plot=FALSE) h4 <- hist(as.numeric(plus), breaks=bins, plot=FALSE) h5 <- hist(as.numeric(blm.up.plus), breaks=bins, plot=FALSE) h6 <- hist(as.numeric(blm.down.plus), breaks=bins, plot=FALSE) h7 <- hist(as.numeric(minus), breaks=bins, plot=FALSE) h8 <- hist(as.numeric(blm.up.minus), breaks=bins, plot=FALSE) h9 <- hist(as.numeric(blm.down.minus), breaks=bins, plot=FALSE) pdf("Bloom Syndrome Histograms.pdf") # bottom left top right par(oma=c(0,0,4,0)) par(mfcol=c(3,2)) par(cex.main=1.5) xlim <- c(-2000,2000) par(font=2) plot(h4, col="gray", xlab="Distance Downstream of TSS (bp)", main="Whole Genome", xlim=xlim, font.axis=2, font.lab=2,xaxt="n") axis(side=1, at=c(-2000,0,2000),font=2) plot(h5, col="gray", xlab="Distance Downstream of TSS (bp)", main="Up-Regulated Genes", xlim=xlim, font.axis=2, font.lab=2,xaxt="n") axis(side=1, at=c(-2000,0,2000),font=2) plot(h6, col="gray", xlab="Distance Downstream of TSS (bp)", main="Down-Regulated Genes", xlim=xlim, font.axis=2, font.lab=2,xaxt="n") axis(side=1, at=c(-2000,0,2000),font=2) plot(h7, col="gray", xlab="Distance Downstream of TSS (bp)", main="Whole Genome", xlim=xlim, font.axis=2, font.lab=2,xaxt="n") axis(side=1, at=c(-2000,0,2000),font=2) plot(h8, col="gray", xlab="Distance Downstream of TSS (bp)", main="Up-Regulated Genes", xlim=xlim, font.axis=2, font.lab=2,xaxt="n") axis(side=1, at=c(-2000,0,2000),font=2) plot(h9, col="gray", xlab="Distance Downstream of TSS (bp)", main="Down-Regulated Genes", xlim=xlim, font.axis=2, font.lab=2,xaxt="n") axis(side=1, at=c(-2000,0,2000),font=2) mtext(side=3, line=0, outer=TRUE, adj=0, padj= NA, cex=1.5, font=2, text=" TS NTS") dev.off() par(mfrow=c(1,1)) par(cex.main=1) par(oma=c(0,0,0,0)) ################# # run 100 simulations to figure out what the threshold p-value should be for determining significance width <- 200 iter <- 100 by=10 sequence <- seq(from=-2000+width/2, to=2000-width/2, by= by) p.up.rand.out <- NULL p.down.rand.out <- NULL p.up.rand.plus.out <- NULL p.up.rand.minus.out <- NULL p.down.rand.plus.out <- NULL p.down.rand.minus.out <- NULL for(k in 1:6){ for(j in seq(from=1, to=iter, by=1)){ if(k == 1){ dat <- as.vector(blm.up) # delta only test dataset dat2 <- blm.up2 # full test dataset containing all TSS and PQS dat3 <- blm.up.tss cdat <- as.vector(all) # delta only control dataset cdat2 <- quad # full control dataset containing all TSS and PQS cdat3 <- genomic.tss sample.pool <- as.vector(unique(cdat2$gene_name)) samples <- sample(sample.pool, length(unique(dat2$gene_name))) rdat2 <- cdat2[with(cdat2, gene_name %in% samples), ] rdat3 <- length(unique(rdat2$unique.tss.number)) rdat <- as.vector(rdat2[with(rdat2, is.na(delta) == FALSE), ]$delta) p.up <- NULL p.rand.up <- NULL mean.up <- NULL # loop for up-regulated genes for(i in sequence){ subset.ctl <- cdat[cdat > i-(width/2) & cdat < i+(width/2)] ctl.prob <- length(subset.ctl)/cdat3 subset.test <- dat[dat > i-(width/2) & dat < i+(width/2)] subset.prob <- length(subset.test)/dat3 mean.up <- c(mean.up,(subset.prob/ctl.prob)) p <- prop.test(length(subset.test), dat3, p = ctl.prob, alternative = c("two.sided", "less", "greater"), conf.level = 0.95, correct = TRUE) p.up <- c(p.up, p$p.value) subset.rand <- rdat[rdat > i-(width/2) & rdat < i+(width/2)] rand.prob <- length(subset.rand)/rdat3 p.r <- prop.test(length(subset.rand), rdat3, p = ctl.prob, alternative = c("two.sided", "less", "greater"), conf.level = 0.95, correct = TRUE) p.rand.up <- c(p.rand.up, p.r$p.value) } plot(sequence, p.rand.up, log="y",ylim=c(1E-5,1), pch=4,main=paste("up-regulated, ",j)) points(sequence, p.up, pch=16) abline(h=0.005) p.up.rand.out <- c(p.up.rand.out, p.rand.up) } if(k == 2){ dat <- as.vector(blm.up.plus) # delta only test dataset dat2 <- blm.up2 # full test dataset containing all TSS and PQS dat3 <- blm.up.tss cdat <- as.vector(plus) # delta only control dataset cdat2 <- quad # full control dataset containing all TSS and PQS cdat3 <- genomic.tss sample.pool <- as.vector(unique(cdat2$gene_name)) samples <- sample(sample.pool, length(unique(dat2$gene_name))) rdat2 <- cdat2[with(cdat2, gene_name %in% samples), ] rdat3 <- length(unique(rdat2$unique.tss.number)) rdat <- as.vector(rdat2[with(rdat2, is.na(delta) == FALSE & strand.orient == "t"), ]$delta) p.up.plus <- NULL p.rand.up.plus <- NULL mean.up.plus <- NULL # loop for up-regulated genes for(i in sequence){ subset.ctl <- cdat[cdat > i-(width/2) & cdat < i+(width/2)] ctl.prob <- length(subset.ctl)/cdat3 subset.test <- dat[dat > i-(width/2) & dat < i+(width/2)] subset.prob <- length(subset.test)/dat3 mean.up.plus <- c(mean.up.plus,(subset.prob/ctl.prob)) p <- prop.test(length(subset.test), dat3, p = ctl.prob, alternative = c("two.sided", "less", "greater"), conf.level = 0.95, correct = TRUE) p.up.plus <- c(p.up.plus, p$p.value) subset.rand <- rdat[rdat > i-(width/2) & rdat < i+(width/2)] rand.prob <- length(subset.rand)/rdat3 p.r <- prop.test(length(subset.rand), rdat3, p = ctl.prob, alternative = c("two.sided", "less", "greater"), conf.level = 0.95, correct = TRUE) p.rand.up.plus <- c(p.rand.up.plus, p.r$p.value) } plot(sequence, p.rand.up.plus, log="y",ylim=c(1E-5,1), pch=4,main=paste("up-regulated, ",j," + strand")) points(sequence, p.up.plus, pch=16) abline(h=0.005) p.up.rand.plus.out <- c(p.up.rand.plus.out, p.rand.up.plus) } if(k == 3){ dat <- as.vector(blm.up.minus) # delta only test dataset dat2 <- blm.up2 # full test dataset containing all TSS and PQS dat3 <- blm.up.tss cdat <- as.vector(minus) # delta only control dataset cdat2 <- quad # full control dataset containing all TSS and PQS cdat3 <- genomic.tss sample.pool <- as.vector(unique(cdat2$gene_name)) samples <- sample(sample.pool, length(unique(dat2$gene_name))) rdat2 <- cdat2[with(cdat2, gene_name %in% samples), ] rdat3 <- length(unique(rdat2$unique.tss.number)) rdat <- as.vector(rdat2[with(rdat2, is.na(delta) == FALSE & strand.orient == "nt"), ]$delta) p.up.minus <- NULL p.rand.up.minus <- NULL mean.up.minus <- NULL # loop for up-regulated genes for(i in sequence){ subset.ctl <- cdat[cdat > i-(width/2) & cdat < i+(width/2)] ctl.prob <- length(subset.ctl)/cdat3 subset.test <- dat[dat > i-(width/2) & dat < i+(width/2)] subset.prob <- length(subset.test)/dat3 mean.up.minus <- c(mean.up.minus,(subset.prob/ctl.prob)) p <- prop.test(length(subset.test), dat3, p = ctl.prob, alternative = c("two.sided", "less", "greater"), conf.level = 0.95, correct = TRUE) p.up.minus <- c(p.up.minus, p$p.value) subset.rand <- rdat[rdat > i-(width/2) & rdat < i+(width/2)] rand.prob <- length(subset.rand)/rdat3 p.r <- prop.test(length(subset.rand), rdat3, p = ctl.prob, alternative = c("two.sided", "less", "greater"), conf.level = 0.95, correct = TRUE) p.rand.up.minus <- c(p.rand.up.minus, p.r$p.value) } plot(sequence, p.rand.up.minus, log="y",ylim=c(1E-5,1), pch=4,main=paste("up-regulated, ",j," - strand")) points(sequence, p.up.minus, pch=16) abline(h=0.005) p.up.rand.minus.out <- c(p.up.rand.minus.out, p.rand.up.minus) } if(k == 4){ dat <- as.vector(blm.down) # delta only test dataset dat2 <- blm.down2 # full test dataset containing all TSS and PQS dat3 <- blm.down.tss cdat <- as.vector(all) # delta only control dataset cdat2 <- quad # full control dataset containing all TSS and PQS cdat3 <- genomic.tss sample.pool <- as.vector(unique(cdat2$gene_name)) samples <- sample(sample.pool, length(unique(dat2$gene_name))) rdat2 <- cdat2[with(cdat2, gene_name %in% samples), ] rdat3 <- length(unique(rdat2$unique.tss.number)) rdat <- as.vector(rdat2[with(rdat2, is.na(delta) == FALSE), ]$delta) p.down <- NULL p.rand.down <- NULL mean.down <- NULL # loop for down-regulated genes for(i in sequence){ subset.ctl <- cdat[cdat > i-(width/2) & cdat < i+(width/2)] ctl.prob <- length(subset.ctl)/cdat3 subset.test <- dat[dat > i-(width/2) & dat < i+(width/2)] subset.prob <- length(subset.test)/dat3 mean.down <- c(mean.down,(subset.prob/ctl.prob)) p <- prop.test(length(subset.test), dat3, p = ctl.prob, alternative = c("two.sided", "less", "greater"), conf.level = 0.95, correct = TRUE) p.down <- c(p.down, p$p.value) subset.rand <- rdat[rdat > i-(width/2) & rdat < i+(width/2)] rand.prob <- length(subset.rand)/rdat3 p.r <- prop.test(length(subset.rand), rdat3, p = ctl.prob, alternative = c("two.sided", "less", "greater"), conf.level = 0.95, correct = TRUE) p.rand.down <- c(p.rand.down, p.r$p.value) } plot(sequence, p.rand.down, log="y",ylim=c(1E-5,1), pch=4,main=paste("down-regulated, ",j)) points(sequence, p.down, pch=16) abline(h=0.005) p.down.rand.out <- c(p.down.rand.out, p.rand.down) } if(k == 5){ dat <- as.vector(blm.down.plus) # delta only test dataset dat2 <- blm.down2 # full test dataset containing all TSS and PQS dat3 <- blm.down.tss cdat <- as.vector(plus) # delta only control dataset cdat2 <- quad # full control dataset containing all TSS and PQS cdat3 <- genomic.tss sample.pool <- as.vector(unique(cdat2$gene_name)) samples <- sample(sample.pool, length(unique(dat2$gene_name))) rdat2 <- cdat2[with(cdat2, gene_name %in% samples), ] rdat3 <- length(unique(rdat2$unique.tss.number)) rdat <- as.vector(rdat2[with(rdat2, is.na(delta) == FALSE & strand.orient == "t"), ]$delta) p.down.plus <- NULL p.rand.down.plus <- NULL mean.down.plus <- NULL # loop for down-regulated genes for(i in sequence){ subset.ctl <- cdat[cdat > i-(width/2) & cdat < i+(width/2)] ctl.prob <- length(subset.ctl)/cdat3 subset.test <- dat[dat > i-(width/2) & dat < i+(width/2)] subset.prob <- length(subset.test)/dat3 mean.down.plus <- c(mean.down.plus,(subset.prob/ctl.prob)) p <- prop.test(length(subset.test), dat3, p = ctl.prob, alternative = c("two.sided", "less", "greater"), conf.level = 0.95, correct = TRUE) p.down.plus <- c(p.down.plus, p$p.value) subset.rand <- rdat[rdat > i-(width/2) & rdat < i+(width/2)] rand.prob <- length(subset.rand)/rdat3 p.r <- prop.test(length(subset.rand), rdat3, p = ctl.prob, alternative = c("two.sided", "less", "greater"), conf.level = 0.95, correct = TRUE) p.rand.down.plus <- c(p.rand.down.plus, p.r$p.value) } plot(sequence, p.rand.down.plus, log="y",ylim=c(1E-5,1), pch=4,main=paste("down-regulated, ",j," + strand")) points(sequence, p.down.plus, pch=16) abline(h=0.005) p.down.rand.plus.out <- c(p.down.rand.plus.out, p.rand.down.plus) } if(k == 6){ dat <- as.vector(blm.down.minus) # delta only test dataset dat2 <- blm.down2 # full test dataset containing all TSS and PQS dat3 <- blm.down.tss cdat <- as.vector(minus) # delta only control dataset cdat2 <- quad # full control dataset containing all TSS and PQS cdat3 <- genomic.tss sample.pool <- as.vector(unique(cdat2$gene_name)) samples <- sample(sample.pool, length(unique(dat2$gene_name))) rdat2 <- cdat2[with(cdat2, gene_name %in% samples), ] rdat3 <- length(unique(rdat2$unique.tss.number)) rdat <- as.vector(rdat2[with(rdat2, is.na(delta) == FALSE & strand.orient == "nt"), ]$delta) p.down.minus <- NULL p.rand.down.minus <- NULL mean.down.minus <- NULL # loop for down-regulated genes for(i in sequence){ subset.ctl <- cdat[cdat > i-(width/2) & cdat < i+(width/2)] ctl.prob <- length(subset.ctl)/cdat3 subset.test <- dat[dat > i-(width/2) & dat < i+(width/2)] subset.prob <- length(subset.test)/dat3 mean.down.minus <- c(mean.down.minus,(subset.prob/ctl.prob)) p <- prop.test(length(subset.test), dat3, p = ctl.prob, alternative = c("two.sided", "less", "greater"), conf.level = 0.95, correct = TRUE) p.down.minus <- c(p.down.minus, p$p.value) subset.rand <- rdat[rdat > i-(width/2) & rdat < i+(width/2)] rand.prob <- length(subset.rand)/rdat3 p.r <- prop.test(length(subset.rand), rdat3, p = ctl.prob, alternative = c("two.sided", "less", "greater"), conf.level = 0.95, correct = TRUE) p.rand.down.minus <- c(p.rand.down.minus, p.r$p.value) } plot(sequence, p.rand.down.minus, log="y",ylim=c(1E-5,1), pch=4,main=paste("down-regulated, ",j," - strand")) points(sequence, p.down.minus, pch=16) abline(h=0.005) p.down.rand.minus.out <- c(p.down.rand.minus.out, p.rand.down.minus) } print(j) } } output.df <- data.frame(sequence,mean.up,p.up,mean.down,p.down,mean.up.plus,p.up.plus,mean.down.plus,p.down.plus,mean.up.minus,p.up.minus,mean.down.minus,p.down.minus) write.table(output.df, file="Bloom Syndrome quad numbers ratios and p-value map.csv", sep=",", col.names=colnames(output.df),row.names=FALSE) output2.df <- data.frame(p.up.rand.out,p.down.rand.out,p.up.rand.plus.out,p.up.rand.minus.out,p.down.rand.plus.out,p.down.rand.minus.out) write.table(output2.df, file="Bloom Syndrome random sampling p-values.csv", sep=",", col.names=colnames(output2.df), row.names=FALSE) # export FDR and dataset stats ################################ # calculate p-value threshold for FDR=0.01 (1% chance of a p-value in the random dataser being below this value) fdr.thresh <- 0.01 fdr.up1 <- sort(p.up.rand.out, decreasing=FALSE) fdr.up.plus1 <- sort(p.up.rand.plus.out, decreasing=FALSE) fdr.up.minus1 <- sort(p.up.rand.minus.out, decreasing=FALSE) fdr.down1 <- sort(p.down.rand.out, decreasing=FALSE) fdr.down.plus1 <- sort(p.down.rand.plus.out, decreasing=FALSE) fdr.down.minus1 <- sort(p.down.rand.minus.out, decreasing=FALSE) fdr.up.coord <- floor(length(fdr.up1)*fdr.thresh) fdr.up.plus.coord <- floor(length(fdr.up.plus1)*fdr.thresh) fdr.up.minus.coord <- floor(length(fdr.up.minus1)*fdr.thresh) fdr.down.coord <- floor(length(fdr.down1)*fdr.thresh) fdr.down.plus.coord <- floor(length(fdr.down.plus1)*fdr.thresh) fdr.down.minus.coord <- floor(length(fdr.down.minus1)*fdr.thresh) # p.cut is the p-value threshold, below which there is a 1% chance of a a point in a # randomly-sampled dataset achieving significance ... so 1% of the points that achieve signifance # in the test dataset are "false positives" p.up.cut <- fdr.up1[fdr.up.coord] p.up.plus.cut <- fdr.up.plus1[fdr.up.plus.coord] p.up.minus.cut <- fdr.up.minus1[fdr.up.minus.coord] p.down.cut <- fdr.down1[fdr.down.coord] p.down.plus.cut <- fdr.down.plus1[fdr.down.plus.coord] p.down.minus.cut <- fdr.down.minus1[fdr.down.minus.coord] print(p.up.cut) print(p.up.plus.cut) print(p.up.minus.cut) print(p.down.cut) print(p.down.plus.cut) print(p.down.minus.cut) # calculate # of discoveries in test dataset discoveries.up <- p.up[p.up <= p.up.cut] discoveries.up.plus <- p.up.plus[p.up.plus <= p.up.plus.cut] discoveries.up.minus <- p.up.minus[p.up.minus <= p.up.minus.cut] disc.rate.up <- length(discoveries.up)/length(p.up) disc.rate.up.plus <- length(discoveries.up.plus)/length(p.up.plus) disc.rate.up.minus <- length(discoveries.up.minus)/length(p.up.minus) discoveries.down <- p.down[p.down <= p.down.cut] discoveries.down.plus <- p.down.plus[p.down.plus <= p.down.plus.cut] discoveries.down.minus <- p.down.minus[p.down.minus <= p.down.minus.cut] disc.rate.down <- length(discoveries.down)/length(p.down) disc.rate.down.plus <- length(discoveries.down.plus)/length(p.down.plus) disc.rate.down.minus <- length(discoveries.down.minus)/length(p.down.minus) # calculate FDR (proportion of discoveries that are likely "false") fdr.up <- fdr.thresh/disc.rate.up fdr.up.plus <- fdr.thresh/disc.rate.up.plus fdr.up.minus <- fdr.thresh/disc.rate.up.minus fdr.down <- fdr.thresh/disc.rate.down fdr.down.plus <- fdr.thresh/disc.rate.down.plus fdr.down.minus <- fdr.thresh/disc.rate.down.minus # export FDR and threshold P-Value to file for future use sink("Bloom Syndrome dataset and FDR stats.txt") print("FD fraction of data points=") print(fdr.thresh) print("threshold p-value for specified FDR (up-regulated genes)=") print(p.up.cut) print("number of false discoveries (up-regulated genes)=") print(length(sequence)*fdr.thresh) print("number of discoveries in test dataset (up-regulated genes)=") print(length(discoveries.up)) print("discovery fraction (up-regulated genes)=") print(disc.rate.up) print("FDR (up-regulated genes)=") print(fdr.up) print("") print("FD fraction of data points=") print(fdr.thresh) print("threshold p-value for specified FDR (up-regulated genes, plus strand)=") print(p.up.plus.cut) print("number of false discoveries (up-regulated genes, plus strand)=") print(length(sequence)*fdr.thresh) print("number of discoveries in test dataset (up-regulated genes, plus strand)=") print(length(discoveries.up.plus)) print("discovery fraction (up-regulated genes, plus strand)=") print(disc.rate.up.plus) print("FDR (up-regulated genes, plus strand)=") print(fdr.up.plus) print("") print("FD fraction of data points=") print(fdr.thresh) print("threshold p-value for specified FDR (up-regulated genes, minus strand)=") print(p.up.minus.cut) print("number of false discoveries (up-regulated genes, minus strand)=") print(length(sequence)*fdr.thresh) print("number of discoveries in test dataset (up-regulated genes, minus strand)=") print(length(discoveries.up.minus)) print("discovery fraction (up-regulated genes, minus strand)=") print(disc.rate.up.minus) print("FDR (up-regulated genes, minus strand)=") print(fdr.up.minus) print("") print("FD fraction of data points (down-regulated genes)=") print(fdr.thresh) print("threshold p-value for specified FDR (down-regulated genes)=") print(p.down.cut) print("number of false discoveries (down-regulated genes)=") print(length(sequence)*fdr.thresh) print("number of discoveries in test dataset (down-regulated genes)=") print(length(discoveries.down)) print("discovery fraction (down-regulated genes)=") print(disc.rate.down) print("FDR (down-regulated genes)=") print(fdr.down) print("") print("FD fraction of data points (down-regulated genes, plus strand)=") print(fdr.thresh) print("threshold p-value for specified FDR (down-regulated genes, plus strand)=") print(p.down.plus.cut) print("number of false discoveries (down-regulated genes, plus strand)=") print(length(sequence)*fdr.thresh) print("number of discoveries in test dataset (down-regulated genes, plus strand)=") print(length(discoveries.down.plus)) print("discovery fraction (down-regulated genes, plus strand)=") print(disc.rate.down.plus) print("FDR (down-regulated genes, plus strand)=") print(fdr.down.plus) print("") print("FD fraction of data points (down-regulated genes, minus strand)=") print(fdr.thresh) print("threshold p-value for specified FDR (down-regulated genes, minus strand)=") print(p.down.minus.cut) print("number of false discoveries (down-regulated genes, minus strand)=") print(length(sequence)*fdr.thresh) print("number of discoveries in test dataset (down-regulated genes, minus strand)=") print(length(discoveries.down.minus)) print("discovery fraction (down-regulated genes, minus strand)=") print(disc.rate.down.minus) print("FDR (down-regulated genes, minus strand)=") print(fdr.down.minus) sink() ################## p.plus.up <- p.up.plus p.minus.up <- p.up.minus p.plus.down <- p.down.plus p.minus.down <- p.down.minus mean.plus.up <- mean.up.plus mean.minus.up <- mean.up.minus mean.plus.down <- mean.down.plus mean.minus.down <- mean.down.minus # calculate subset of map.points for which the p-value is below the specified threshold for p-value # goal is to plot shaded areas on PE plot where data is statistically significant sig1.plus.df <- data.frame(seq(from=1, to=length(sequence), by=1),sequence, mean.up, p.up) sig1.plus.p.df <- data.frame(seq(from=1, to=length(sequence), by=1),sequence, mean.plus.up, p.plus.up) sig1.plus.m.df <- data.frame(seq(from=1, to=length(sequence), by=1),sequence, mean.minus.up, p.minus.up) sig1.minus.df <- data.frame(seq(from=1, to=length(sequence), by=1),sequence, mean.down, p.down) sig1.minus.p.df <- data.frame(seq(from=1, to=length(sequence), by=1),sequence, mean.plus.down, p.plus.down) sig1.minus.m.df <- data.frame(seq(from=1, to=length(sequence), by=1),sequence, mean.minus.down, p.minus.down) # keep only entries below the significance threshold sig2.plus.df <- sig1.plus.df[with(sig1.plus.df, p.up <= p.up.cut),] sig2.plus.p.df <- sig1.plus.p.df[with(sig1.plus.p.df, p.plus.up <= p.up.plus.cut),] sig2.plus.m.df <- sig1.plus.m.df[with(sig1.plus.m.df, p.minus.up <= p.up.minus.cut),] sig2.minus.df <- sig1.minus.df[with(sig1.minus.df, p.down <= p.down.cut),] sig2.minus.p.df <- sig1.minus.p.df[with(sig1.minus.p.df, p.plus.down <= p.down.plus.cut),] sig2.minus.m.df <- sig1.minus.m.df[with(sig1.minus.m.df, p.minus.down <= p.down.minus.cut),] colnames(sig2.plus.df) <- colnames(sig2.minus.df) <- colnames(sig2.plus.p.df) <- colnames(sig2.plus.m.df) <- colnames(sig2.minus.p.df) <- colnames(sig2.minus.m.df) <- c("number","map.points","pe.out","p2.out") # identify continuous regions (plot each continuous region separately) cont.start.plus <- NULL for(i in 1:nrow(sig2.plus.df)){ if(i ==1){ sig.start <- sig2.plus.df[i,]$number cont.start.plus <- i sig.run <- sig.start } if(i > 1){ num <- sig2.plus.df[i,]$number # do this if the row is the beginning of a new run if(num != sig.run+1){ sig.start <- sig2.plus.df[i,]$number cont.start.plus <- c(cont.start.plus,i) sig.run <- sig.start } # do this if the row is part of a continuous run if(num == sig.run+1){ sig.run <- sig2.plus.df[i,]$number } } print(i) } cont.start.plus.p <- NULL for(i in 1:nrow(sig2.plus.p.df)){ if(i ==1){ sig.start <- sig2.plus.p.df[i,]$number cont.start.plus.p <- i sig.run <- sig.start } if(i > 1){ num <- sig2.plus.p.df[i,]$number # do this if the row is the beginning of a new run if(num != sig.run+1){ sig.start <- sig2.plus.p.df[i,]$number cont.start.plus.p <- c(cont.start.plus.p,i) sig.run <- sig.start } # do this if the row is part of a continuous run if(num == sig.run+1){ sig.run <- sig2.plus.p.df[i,]$number } } print(i) } cont.start.plus.m <- NULL for(i in 1:nrow(sig2.plus.m.df)){ if(i ==1){ sig.start <- sig2.plus.m.df[i,]$number cont.start.plus.m <- i sig.run <- sig.start } if(i > 1){ num <- sig2.plus.m.df[i,]$number # do this if the row is the beginning of a new run if(num != sig.run+1){ sig.start <- sig2.plus.m.df[i,]$number cont.start.plus.m <- c(cont.start.plus.m,i) sig.run <- sig.start } # do this if the row is part of a continuous run if(num == sig.run+1){ sig.run <- sig2.plus.m.df[i,]$number } } print(i) } cont.start.minus <- NULL for(i in 1:nrow(sig2.minus.df)){ if(i ==1){ sig.start <- sig2.minus.df[i,]$number cont.start.minus <- i sig.run <- sig.start } if(i > 1){ num <- sig2.minus.df[i,]$number # do this if the row is the beginning of a new run if(num != sig.run+1){ sig.start <- sig2.minus.df[i,]$number cont.start.minus <- c(cont.start.minus,i) sig.run <- sig.start } # do this if the row is part of a continuous run if(num == sig.run+1){ sig.run <- sig2.minus.df[i,]$number } } print(i) } cont.start.minus.p <- NULL for(i in 1:nrow(sig2.minus.p.df)){ if(i ==1){ sig.start <- sig2.minus.p.df[i,]$number cont.start.minus.p <- i sig.run <- sig.start } if(i > 1){ num <- sig2.minus.p.df[i,]$number # do this if the row is the beginning of a new run if(num != sig.run+1){ sig.start <- sig2.minus.p.df[i,]$number cont.start.minus.p <- c(cont.start.minus.p,i) sig.run <- sig.start } # do this if the row is part of a continuous run if(num == sig.run+1){ sig.run <- sig2.minus.p.df[i,]$number } } print(i) } cont.start.minus.m <- NULL for(i in 1:nrow(sig2.minus.m.df)){ if(i ==1){ sig.start <- sig2.minus.m.df[i,]$number cont.start.minus.m <- i sig.run <- sig.start } if(i > 1){ num <- sig2.minus.m.df[i,]$number # do this if the row is the beginning of a new run if(num != sig.run+1){ sig.start <- sig2.minus.m.df[i,]$number cont.start.minus.m <- c(cont.start.minus.m,i) sig.run <- sig.start } # do this if the row is part of a continuous run if(num == sig.run+1){ sig.run <- sig2.minus.m.df[i,]$number } } print(i) } # split sig2.df into continuous regions regions.plus <- list() for(i in 1:length(cont.start.plus)){ if(i < length(cont.start.plus)){ df <- sig2.plus.df[(cont.start.plus[i]:(cont.start.plus[i+1]-1)),] regions.plus[i][[1]] <- df } if(i == length(cont.start.plus)){ df <- sig2.plus.df[(cont.start.plus[i]:nrow(sig2.plus.df)),] regions.plus[i][[1]] <- df } } regions.plus.p <- list() for(i in 1:length(cont.start.plus.p)){ if(i < length(cont.start.plus.p)){ df <- sig2.plus.p.df[(cont.start.plus.p[i]:(cont.start.plus.p[i+1]-1)),] regions.plus.p[i][[1]] <- df } if(i == length(cont.start.plus.p)){ df <- sig2.plus.p.df[(cont.start.plus.p[i]:nrow(sig2.plus.p.df)),] regions.plus.p[i][[1]] <- df } } regions.plus.m <- list() for(i in 1:length(cont.start.plus.m)){ if(i < length(cont.start.plus.m)){ df <- sig2.plus.m.df[(cont.start.plus.m[i]:(cont.start.plus.m[i+1]-1)),] regions.plus.m[i][[1]] <- df } if(i == length(cont.start.plus.m)){ df <- sig2.plus.m.df[(cont.start.plus.m[i]:nrow(sig2.plus.m.df)),] regions.plus.m[i][[1]] <- df } } regions.minus <- list() for(i in 1:length(cont.start.minus)){ if(i < length(cont.start.minus)){ df <- sig2.minus.df[(cont.start.minus[i]:(cont.start.minus[i+1]-1)),] regions.minus[i][[1]] <- df } if(i == length(cont.start.minus)){ df <- sig2.minus.df[(cont.start.minus[i]:nrow(sig2.minus.df)),] regions.minus[i][[1]] <- df } } regions.minus.p <- list() for(i in 1:length(cont.start.minus.p)){ if(i < length(cont.start.minus.p)){ df <- sig2.minus.p.df[(cont.start.minus.p[i]:(cont.start.minus.p[i+1]-1)),] regions.minus.p[i][[1]] <- df } if(i == length(cont.start.minus.p)){ df <- sig2.minus.p.df[(cont.start.minus.p[i]:nrow(sig2.minus.p.df)),] regions.minus.p[i][[1]] <- df } } regions.minus.m <- list() for(i in 1:length(cont.start.minus.m)){ if(i < length(cont.start.minus.m)){ df <- sig2.minus.m.df[(cont.start.minus.m[i]:(cont.start.minus.m[i+1]-1)),] regions.minus.m[i][[1]] <- df } if(i == length(cont.start.minus.m)){ df <- sig2.minus.m.df[(cont.start.minus.m[i]:nrow(sig2.minus.m.df)),] regions.minus.m[i][[1]] <- df } } # define series of x and y coordinates for plotting shaded regions sink("Bloom Syndrome regions of statistically-significant difference.txt") print("up-regulated") print(regions.plus) print("up-regulated + strand") print(regions.plus.p) print("up-regulated - strand") print(regions.plus.m) print("down-regulated") print(regions.minus) print("down-regulated + strand") print(regions.minus.p) print("down-regulated - strand") print(regions.minus.m) sink() cord.xl.plus <- list() cord.yl.plus <- list() cord.xl.plus.p <- list() cord.yl.plus.p <- list() cord.xl.plus.m <- list() cord.yl.plus.m <- list() cord.xl.minus <- list() cord.yl.minus <- list() cord.xl.minus.p <- list() cord.yl.minus.p <- list() cord.xl.minus.m <- list() cord.yl.minus.m <- list() for(i in 1:length(regions.plus)){ df <- regions.plus[i][[1]] # do this if the df does not start at the leftmost moundary of the map or end at the rightmost boundary of the map if(df[1,]$map.points != -1000+width/2 & df[nrow(df),]$map.points != 1000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x,max.x) cord.xl.plus[i][[1]] <- cord.x low.outside <- as.numeric(sig1.plus.df[with(sig1.plus.df, sequence == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.plus.df[with(sig1.plus.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),1) cord.yl.plus[i][[1]] <- cord.y } # do this if the region if touching the lefmost boundary of the map if(df[1,]$map.points == -1000+width/2){ min.x <- as.numeric(df[1,]$map.points) max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,as.numeric(df$map.points),max.x,max.x) cord.xl.plus[i][[1]] <- cord.x low.outside <- as.numeric(sig1.plus.df[with(sig1.plus.df, sequence == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.plus.df[with(sig1.plus.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),1) cord.yl.plus[i][[1]] <- cord.y } # do this if the region is touching the rightmost boundary of the map if(df[nrow(df),]$map.points == 1000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points) cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x) cord.xl.plus[i][[1]] <- cord.x low.outside <- as.numeric(sig1.plus.df[with(sig1.plus.df, sequence== (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.plus.df[with(sig1.plus.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),1) cord.yl.plus[i][[1]] <- cord.y } } for(i in 1:length(regions.plus.p)){ df <- regions.plus.p[i][[1]] # do this if the df does not start at the leftmost moundary of the map or end at the rightmost boundary of the map if(df[1,]$map.points != -1000+width/2 & df[nrow(df),]$map.points != 1000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x,max.x) cord.xl.plus.p[i][[1]] <- cord.x low.outside <- as.numeric(sig1.plus.p.df[with(sig1.plus.p.df, sequence == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.plus.p.df[with(sig1.plus.p.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),1) cord.yl.plus.p[i][[1]] <- cord.y } # do this if the region if touching the lefmost boundary of the map if(df[1,]$map.points == -1000+width/2){ min.x <- as.numeric(df[1,]$map.points) max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,as.numeric(df$map.points),max.x,max.x) cord.xl.plus.p[i][[1]] <- cord.x low.outside <- as.numeric(sig1.plus.p.df[with(sig1.plus.p.df, sequence == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.plus.p.df[with(sig1.plus.p.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),1) cord.yl.plus.p[i][[1]] <- cord.y } # do this if the region is touching the rightmost boundary of the map if(df[nrow(df),]$map.points == 1000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points) cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x) cord.xl.plus.p[i][[1]] <- cord.x low.outside <- as.numeric(sig1.plus.p.df[with(sig1.plus.p.df, sequence== (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.plus.p.df[with(sig1.plus.p.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),1) cord.yl.plus.p[i][[1]] <- cord.y } } for(i in 1:length(regions.plus.m)){ df <- regions.plus.m[i][[1]] # do this if the df does not start at the leftmost moundary of the map or end at the rightmost boundary of the map if(df[1,]$map.points != -1000+width/2 & df[nrow(df),]$map.points != 1000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x,max.x) cord.xl.plus.m[i][[1]] <- cord.x low.outside <- as.numeric(sig1.plus.m.df[with(sig1.plus.m.df, sequence == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.plus.m.df[with(sig1.plus.m.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),1) cord.yl.plus.m[i][[1]] <- cord.y } # do this if the region if touching the lefmost boundary of the map if(df[1,]$map.points == -1000+width/2){ min.x <- as.numeric(df[1,]$map.points) max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,as.numeric(df$map.points),max.x,max.x) cord.xl.plus.m[i][[1]] <- cord.x low.outside <- as.numeric(sig1.plus.m.df[with(sig1.plus.m.df, sequence == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.plus.m.df[with(sig1.plus.m.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),1) cord.yl.plus.m[i][[1]] <- cord.y } # do this if the region is touching the rightmost boundary of the map if(df[nrow(df),]$map.points == 1000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points) cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x) cord.xl.plus.m[i][[1]] <- cord.x low.outside <- as.numeric(sig1.plus.m.df[with(sig1.plus.m.df, sequence== (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.plus.m.df[with(sig1.plus.m.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),1) cord.yl.plus.m[i][[1]] <- cord.y } } for(i in 1:length(regions.minus)){ df <- regions.minus[i][[1]] # do this if the df does not start at the leftmost moundary of the map or end at the rightmost boundary of the map if(df[1,]$map.points != -1000+width/2 & df[nrow(df),]$map.points != 1000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x,max.x) cord.xl.minus[i][[1]] <- cord.x low.outside <- as.numeric(sig1.minus.df[with(sig1.minus.df, sequence == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.minus.df[with(sig1.minus.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),1) cord.yl.minus[i][[1]] <- cord.y } # do this if the region if touching the lefmost boundary of the map if(df[1,]$map.points == -1000+width/2){ min.x <- as.numeric(df[1,]$map.points) max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,as.numeric(df$map.points),max.x,max.x) cord.xl.minus[i][[1]] <- cord.x low.outside <- as.numeric(sig1.minus.df[with(sig1.minus.df, sequence == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.minus.df[with(sig1.minus.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),1) cord.yl.minus[i][[1]] <- cord.y } # do this if the region is touching the rightmost boundary of the map if(df[nrow(df),]$map.points == 1000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points) cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x) cord.xl.minus[i][[1]] <- cord.x low.outside <- as.numeric(sig1.minus.df[with(sig1.minus.df, sequence == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.minus.df[with(sig1.minus.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),1) cord.yl.minus[i][[1]] <- cord.y } } for(i in 1:length(regions.minus.p)){ df <- regions.minus.p[i][[1]] # do this if the df does not start at the leftmost moundary of the map or end at the rightmost boundary of the map if(df[1,]$map.points != -1000+width/2 & df[nrow(df),]$map.points != 1000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x,max.x) cord.xl.minus.p[i][[1]] <- cord.x low.outside <- as.numeric(sig1.minus.p.df[with(sig1.minus.p.df, sequence == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.minus.p.df[with(sig1.minus.p.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),1) cord.yl.minus.p[i][[1]] <- cord.y } # do this if the region if touching the lefmost boundary of the map if(df[1,]$map.points == -1000+width/2){ min.x <- as.numeric(df[1,]$map.points) max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,as.numeric(df$map.points),max.x,max.x) cord.xl.minus.p[i][[1]] <- cord.x low.outside <- as.numeric(sig1.minus.p.df[with(sig1.minus.p.df, sequence == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.minus.p.df[with(sig1.minus.p.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),1) cord.yl.minus.p[i][[1]] <- cord.y } # do this if the region is touching the rightmost boundary of the map if(df[nrow(df),]$map.points == 1000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points) cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x) cord.xl.minus.p[i][[1]] <- cord.x low.outside <- as.numeric(sig1.minus.p.df[with(sig1.minus.p.df, sequence == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.minus.p.df[with(sig1.minus.p.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),1) cord.yl.minus.p[i][[1]] <- cord.y } } for(i in 1:length(regions.minus.m)){ df <- regions.minus.m[i][[1]] # do this if the df does not start at the leftmost moundary of the map or end at the rightmost boundary of the map if(df[1,]$map.points != -1000+width/2 & df[nrow(df),]$map.points != 1000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x,max.x) cord.xl.minus.m[i][[1]] <- cord.x low.outside <- as.numeric(sig1.minus.m.df[with(sig1.minus.m.df, sequence == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.minus.m.df[with(sig1.minus.m.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),1) cord.yl.minus.m[i][[1]] <- cord.y } # do this if the region if touching the lefmost boundary of the map if(df[1,]$map.points == -1000+width/2){ min.x <- as.numeric(df[1,]$map.points) max.x <- as.numeric(df[nrow(df),]$map.points)+by/2 cord.x <- c(min.x,as.numeric(df$map.points),max.x,max.x) cord.xl.minus.m[i][[1]] <- cord.x low.outside <- as.numeric(sig1.minus.m.df[with(sig1.minus.m.df, sequence == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.minus.m.df[with(sig1.minus.m.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,as.numeric(df$pe.out),mean(c(high.outside,as.numeric(df[nrow(df),]$pe.out))),1) cord.yl.minus.m[i][[1]] <- cord.y } # do this if the region is touching the rightmost boundary of the map if(df[nrow(df),]$map.points == 1000-width/2){ min.x <- as.numeric(df[1,]$map.points)-by/2 max.x <- as.numeric(df[nrow(df),]$map.points) cord.x <- c(min.x,min.x,as.numeric(df$map.points),max.x) cord.xl.minus.m[i][[1]] <- cord.x low.outside <- as.numeric(sig1.minus.m.df[with(sig1.minus.m.df, sequence == (min.x-by/2)),]$pe.out) high.outside <- as.numeric(sig1.minus.m.df[with(sig1.minus.m.df, sequence == (max.x+by/2)),]$pe.out) cord.y <- c(1,mean(c(low.outside,as.numeric(df[1,]$pe.out))),as.numeric(df$pe.out),1) cord.yl.minus.m[i][[1]] <- cord.y } } library(stringr) title <- str_c("Bloom Syndrome PQS Distribution Analysis (four panel) ",width,".pdf") #### plot six panel figure showing aggregate strand analysis as well as the more granular analysis of individual strands pdf(title) par(mfcol=c(2,2)) # mar: margins- bottom, left, top, right; oma: c(bottom, left, top) size of the outer margins in lines of text par(mar = c(0, 0, 4, 0), oma = c(4, 7, 4, 1)) # tcl: length of tick marks as a fraction of the height ofa line of text, defaults to -0.5 par(tcl = -0.25) # mgp: margin line for axis title, axis labels, and axis lines (default c(3,1,0) par(mgp = c(3, 1, 0)) clegend <- 1.5 cmain <- 1.8 clabel <- 1.5 caxis <- 1.5 fline <- "azure3" vline <- 1 bcol <- "white" ylim <- c(-0.5,2.8) #################################################################################################################### ###### plot template strand (TS) data par(mar = c(0, 0, 4, 0)) plot(sequence,p.plus.up, type="l",log="y",lty=1,lwd=2,main="TS", ylab="",xlab="",xaxt="n",yaxt="n",cex.axis=caxis,cex.lab=clabel,ylim=c(1E-32,1), font=2,font.lab=2,font.main=2, font.axis=2,cex.main=cmain, col="white") abline(v=seq(-2000,2000,400),col=fline,lwd=1) par(font=2) # abline(h=p.up.plus.cut,lty=3, lwd=2, col="red") # abline(h=p.down.plus.cut,lty=3, lwd=2, col="blue") abline(h=p.up.plus.cut, lty=3,lwd=3,col="red") abline(h=p.down.plus.cut, lty=3,lwd=3,col="blue") abline(v=0,lty=1,lwd=vline) points(sequence,p.plus.up, type="l",lty=1,lwd=2, col="red") points(sequence, p.plus.down, type="l", lty=1, lwd=2, col="blue") axis(side=2, at=c(1E-40,1E-36,1E-32,1E-28,1E-24,1E-20,1E-16,1E-12,1E-8,1E-4,1), las=1, font=2, cex.axis=1.5) # legend("bottomright",legend=c(paste("Threshold=",round(p.up.plus.cut,3)," "),paste("Threshold=",round(p.down.plus.cut,3)," ")), # lty=3,lwd=2,cex=clegend, bg=NULL,box.col=NULL, bty="n", text.col=c("red","blue"), col=c("red","blue")) # legend("bottomright",legend=c(paste("Threshold=",signif(p.up.plus.cut,1)), # paste("Threshold=",signif(p.down.plus.cut,1))), # lty=3,lwd=2,cex=clegend, bg=NULL,box.col=NULL, bty="n", text.col=c("red","blue"), col=c("red","blue")) box(lwd=2) par(mar = c(4, 0, 0, 0)) plot(sequence, mean.plus.up, type="l",lty=1,lwd=2,main="", xlab="",ylab="",xaxt="n",, yaxt="n",cex.axis=caxis,cex.lab=clabel, ylim=ylim,font=2,font.lab=2,font.main=2,cex.main=cmain, col="white") abline(v=seq(-2000,2000,400), col=fline,lwd=1) abline(h=1,lty=1,lwd=1) abline(v=0,lty=1,lwd=vline) points(sequence, mean.plus.up, type="l", lty=1, lwd=2, col="red") points(sequence, mean.plus.down, type="l", lty=1, lwd=2, col="blue") for(i in 1:length(regions.plus.p)){ polygon(cord.xl.plus.p[i][[1]],cord.yl.plus.p[i][[1]],col="red", density=50, angle=-45) } for(i in 1:length(regions.minus.p)){ polygon(cord.xl.minus.p[i][[1]],cord.yl.minus.p[i][[1]],col="blue", density=50, angle=45) } axis(side=2, at=c(0,0.5, 1,1.5,2,2.5,3,3.5,4,4.5), las=1, font=2, cex.axis=1.5) axis(1, at=seq(from=-1600, to=1600, by=400),cex.axis=caxis,font=2,las=2) legend("bottom",legend=c(paste("FDR=",round(fdr.up.plus,3)),paste("FDR=",round(fdr.down.plus,3))),cex=clegend, bty="n", text.col=c("red","blue")) box(lwd=2) ############### plot - Strand Data par(mar = c(0, 0, 4, 0)) plot(sequence,p.minus.up, type="l",log="y",lty=1,lwd=2,main="NTS", ylab="",xlab="",xaxt="n",yaxt="n",cex.axis=caxis,cex.lab=clabel,ylim=c(1E-32,1), font=2,font.lab=2,font.main=2, font.axis=2,cex.main=cmain, col="white") abline(v=seq(-2000,2000,400),col=fline,lwd=1) par(font=2) # abline(h=p.up.minus.cut,lty=3, lwd=2, col="red") # abline(h=p.down.minus.cut,lty=3, lwd=2, col="blue") abline(h=p.up.minus.cut, lty=3,lwd=3,col="red") abline(h=p.down.minus.cut, lty=3,lwd=3,col="blue") abline(v=0,lty=1,lwd=vline) points(sequence,p.minus.up, type="l",lty=1,lwd=2, col="red") points(sequence, p.minus.down, type="l", lty=1, lwd=2, col="blue") # legend("bottomright",legend=c(paste("Threshold=",round(p.up.minus.cut,3)," "),paste("Threshold=",round(p.down.minus.cut,3)," ")), # lty=3,lwd=2,cex=clegend, bg=NULL,box.col=NULL, bty="n", text.col=c("red","blue"),col=c("red","blue")) # legend("bottomright",legend=c(paste("Threshold=",signif(p.up.minus.cut,1)), # paste("Threshold=",signif(p.down.minus.cut,1))), # lty=3,lwd=2,cex=clegend, bg=NULL,box.col=NULL, bty="n", text.col=c("red","blue"), col=c("red","blue")) box(lwd=2) par(mar = c(4, 0, 0, 0)) plot(sequence, mean.minus.up, type="l",lty=1,lwd=2,main="", xlab="",ylab="",xaxt="n",, yaxt="n",cex.axis=caxis,cex.lab=clabel, ylim=ylim,font=2,font.lab=2,font.main=2,cex.main=cmain, col="white") abline(v=seq(-2000,2000,400), col=fline,lwd=1) abline(h=1,lty=1,lwd=1) abline(v=0,lty=1,lwd=vline) points(sequence, mean.minus.up, type="l", lty=1, lwd=2, col="red") points(sequence, mean.minus.down, type="l", lty=1, lwd=2, col="blue") for(i in 1:length(regions.plus.m)){ polygon(cord.xl.plus.m[i][[1]],cord.yl.plus.m[i][[1]],col="red", density=50, angle=-45) } for(i in 1:length(regions.minus.m)){ polygon(cord.xl.minus.m[i][[1]],cord.yl.minus.m[i][[1]],col="blue", density=50, angle=45) } axis(1, at=seq(from=-1600, to=1600, by=400),cex.axis=caxis,font=2,las=2) legend("bottom",legend=c(paste(" FDR=",round(fdr.up.minus,3)),paste(" FDR=",round(fdr.down.minus,3))),cex=clegend, bty="n", text.col=c("red","blue")) box(lwd=2) mtext(text=" Enrichment P-Value", line=5, font=2, side=2, cex=1.5, outer=TRUE, adj=0) mtext(text=" Distance Downstream of TSS (bp)", line=2, font=2, side=1, cex=1.5, outer=TRUE, adj=0) dev.off()