##MethylKit and pvclust R code for generating bootstrapped dendrogram

#!/usr/bin/Rscript

library("methylKit")
library("pvclust")

# Row-wise standard deviation of a numeric matrix.
#
# Arguments:
#   x      - numeric matrix (or object accepted by rowSums/rowMeans).
#   center - optional vector of per-row centres; when NULL the row means
#            are computed with rowMeans(x, ...).
#   ...    - forwarded unchanged to rowMeans() and rowSums()
#            (e.g. na.rm = TRUE).
#
# Returns a numeric vector with one standard deviation per row. The
# denominator is (number of non-NA values in the row) - 1; rows with at
# most one non-NA value yield NA.
rowSds <- function(x, center=NULL, ...) {
  # Per-row count of observed (non-missing) values.
  n_obs <- rowSums(!is.na(x))
  # A standard deviation is undefined for fewer than two observations.
  n_obs[n_obs <= 1] <- NA

  if (is.null(center)) {
    center <- rowMeans(x, ...)
  }

  # center has one entry per row, so column-major recycling subtracts the
  # matching row centre from every element.
  deviations <- x - center
  sum_sq <- rowSums(deviations * deviations, ...)

  sqrt(sum_sq / (n_obs - 1))
}

# Column-wise standard deviation: transposes x and delegates to rowSds().
# Any further arguments (center, na.rm, ...) are passed straight through.
colSds <- function(x, ...) {
  rowSds(t(x), ...)
}

# ---- methylKit: read per-sample Bismark coverage files ----
# Each sample ID is also the prefix of its input file. IDs ending in "H" are
# treatment 1 and IDs ending in "L" are treatment 0. Deriving the file list
# and the treatment vector from one vector keeps the three in sync.
sample.ids <- c("16H", "16L", "22H", "22L", "5H", "5L",
                "7H", "7L", "8H", "8L", "9H", "9L")
file.list <- as.list(paste0(sample.ids, ".bsseq.input.txt"))

myobj <- methRead(
  file.list,
  sample.id = as.list(sample.ids),
  assembly = "Dm2.4",
  treatment = as.numeric(endsWith(sample.ids, "H")),
  context = "CpG",
  pipeline = "bismarkCoverage"
)

# Drop sites with fewer than 10 reads and sites above the 99.9th coverage
# percentile.
filtered.myobj <- filterByCoverage(
  myobj,
  lo.count = 10, lo.perc = NULL,
  hi.count = NULL, hi.perc = 99.9
)

# FIX: unite the coverage-filtered object. The original united `myobj`, which
# silently discarded the filtering step above.
meth <- unite(filtered.myobj, destrand = FALSE)

# ---- Per-site methylation fraction matrix (sites x samples) ----
mat <- getData(meth)
mat <- mat[rowSums(is.na(mat)) == 0, ]
meth.mat <- mat[, meth@numCs.index] /
  (mat[, meth@numCs.index] + mat[, meth@numTs.index])
names(meth.mat) <- meth@sample.ids

# Keep only the more variable half of the sites (row SD above the median).
sds <- rowSds(as.matrix(meth.mat))
cutoff <- quantile(sds, 0.5)
meth.mat <- meth.mat[sds > cutoff, ]

# ---- Bootstrapped hierarchical clustering of the samples ----
dist.boot.10000 <- pvclust(
  meth.mat,
  method.dist = "cor",
  method.hclust = "ward.D2",
  nboot = 10000,
  parallel = TRUE
)

# Output file names (including the "wald2" spelling) are kept unchanged so
# that anything consuming these files keeps working.
svg(filename = "pvclust.10000.wald2.svg")
plot(dist.boot.10000)
dev.off()

pdf("pvclust.10000.wald2.pdf")
plot(dist.boot.10000)
dev.off()


##Bsseq Rscript:

#!/usr/bin/Rscript
library("bsseq")

# ---- bsseq: read Bismark coverage files (6 "H" samples, then 6 "L") ----
meth <- read.bismark(
  files = c("16H.bsseq.input.txt", "22H.bsseq.input.txt", "5H.bsseq.input.txt",
            "7H.bsseq.input.txt", "8H.bsseq.input.txt", "9H.bsseq.input.txt",
            "16L.bsseq.input.txt", "22L.bsseq.input.txt", "5L.bsseq.input.txt",
            "7L.bsseq.input.txt", "8L.bsseq.input.txt", "9L.bsseq.input.txt"),
  sampleNames = c("16H", "22H", "5H", "7H", "8H", "9H",
                  "16L", "22L", "5L", "7L", "8L", "9L"),
  fileType = "cov",
  mc.cores = 10,
  strandCollapse = FALSE
)

# Sample sheet: condition (H/L) and mother ID; H and L of the same mother
# occupy matching positions within their group.
Design <- data.frame(
  row.names = sampleNames(meth),
  condition = rep(c("H", "L"), each = 6),
  mother = rep(c("16", "22", "5", "7", "8", "9"), times = 2)
)
pData(meth) <- Design

# ---- Smoothing and locus filtering ----
BS.meth.smooth <- BSmooth(meth, mc.cores = 60, verbose = TRUE)
BS.cov <- getCoverage(BS.meth.smooth)
BS.meth <- getMeth(BS.meth.smooth)

# Keep loci with coverage >= 6 in at least two samples of each condition and
# a non-missing smoothed methylation estimate in every sample.
is.H <- BS.meth.smooth$condition == "H"
is.L <- BS.meth.smooth$condition == "L"
keepLoci.ex <- which(
  rowSums(BS.cov[, is.H] >= 6) >= 2 &
    rowSums(BS.cov[, is.L] >= 6) >= 2 &
    rowSums(is.na(BS.meth)) == 0
)
BS.meth.smooth <- BS.meth.smooth[keepLoci.ex, ]
summary(getMeth(BS.meth.smooth))

# ---- Paired t-statistics and DMR calling ----
# group1/group2 are listed in the same mother order, as the paired variance
# estimate pairs them by position.
meth.tstat <- BSmooth.tstat(
  BS.meth.smooth,
  group1 = c("16H", "22H", "5H", "7H", "8H", "9H"),
  group2 = c("16L", "22L", "5L", "7L", "8L", "9L"),
  estimate.var = "paired",
  local.correct = TRUE,
  verbose = TRUE
)
#write.csv(as.data.frame(meth.tstat), file="meth.tsat.2.csv")

# DMRs from a fixed t-statistic cutoff.
dmrs0 <- dmrFinder(meth.tstat, cutoff = c(-4.6, 4.6))
write.csv(as.data.frame(dmrs0), file = "dmrs0.2.filtered.csv")
dmrs <- subset(dmrs0, n >= 3 & abs(meanDiff) >= 0.1)
write.csv(as.data.frame(dmrs), file = "dmrs.filtered.csv")
# NOTE: despite the ".R" extension this is a binary save() image; it is read
# back with load() by the q-value step.
save(meth.tstat, file = "meth.tstat.filtered.R")

# ---- Plot the top DMRs ----
pData <- pData(BS.meth.smooth)
# FIX: colour by condition. The original rep(c("red", "blue"), each = 3)
# produced only 6 values for 12 samples, so colours did not follow the groups.
pData$col <- ifelse(pData$condition == "H", "red", "blue")
pData(BS.meth.smooth) <- pData
pdf(file = "dmrs_top200.2.pdf", width = 10, height = 5)
# FIX: head() avoids NA rows when fewer than 200 DMRs pass the filter.
plotManyRegions(BS.meth.smooth, head(dmrs, 200), extend = 5000,
                addRegions = dmrs)
dev.off()

# ---- DMRs from quantile-based cutoffs ----
dmrs1 <- dmrFinder(meth.tstat, qcutoff = c(0.01, 0.99))
# FIX: write dmrs1; the original wrote dmrs0 into this file.
write.csv(as.data.frame(dmrs1), file = "dmrs1.filtered.csv")
dmrsDiff <- subset(dmrs1, n >= 3 & abs(meanDiff) >= 0.1)
write.csv(as.data.frame(dmrsDiff), file = "dmrsDiff.filtered.csv")

##Generate per CpG p-values and convert to q-values
library(bsseq)
library(qvalue)

# Restore the `meth.tstat` object written by save() in the bsseq step.
load("meth.tstat.filtered.R")

# Corrected per-CpG t-statistics.
tstats <- getStats(meth.tstat)[, "tstat.corrected"]

# Two-sided p-value from a t distribution with 5 degrees of freedom.
p.value <- 2 * pt(-abs(as.vector(tstats)), df = 5)

# Convert the p-values to q-values.
qobj <- qvalue(p = p.value)
qvals <- qobj$qvalues

# One row per CpG: genomic coordinates, all t-statistic columns, p and q.
Values <- cbind(
  data.frame(meth.tstat@gr),
  data.frame(meth.tstat@stats),
  p.value,
  qvals
)

write.csv(Values, file = "AllPvalues.Nov18.InRevision.csv")

##TopGO code used to identify significantly enriched terms

library("topGO")
# ---- Command-line arguments ----
# Usage: Rscript <script> <universeFile> <interestingGenesFile> <algorithm>
#   universeFile         - gene-to-GO mapping file read by readMappings()
#   interestingGenesFile - one gene ID per line; also the output file prefix
#   algorithm            - topGO algorithm name passed to runTest()
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 3) {
  stop("Usage: Rscript <script> <universeFile> <interestingGenesFile> <algorithm>",
       call. = FALSE)
}
universeFile <- args[1]
interestingGenesFile <- args[2]
algorithm <- args[3]

# ---- Gene universe and genes of interest ----
geneID2GO <- readMappings(file = universeFile)
geneNames <- names(geneID2GO)

myInterestingGenes <- readLines(interestingGenesFile)
isInteresting <- geneNames %in% myInterestingGenes
if (!any(isInteresting)) {
  stop("None of the genes in '", interestingGenesFile,
       "' occur in the universe '", universeFile, "'.", call. = FALSE)
}
# 0/1 factor named by gene: 1 marks a gene of interest.
geneList <- factor(as.integer(isInteresting))
names(geneList) <- geneNames

# ---- topGO data objects and Fisher tests, one per ontology ----
GOdataBP <- new("topGOdata", ontology = "BP", allGenes = geneList,
                annot = annFUN.gene2GO, gene2GO = geneID2GO, nodeSize = 5)
GOdataMF <- new("topGOdata", ontology = "MF", allGenes = geneList,
                annot = annFUN.gene2GO, gene2GO = geneID2GO, nodeSize = 5)
GOdataCC <- new("topGOdata", ontology = "CC", allGenes = geneList,
                annot = annFUN.gene2GO, gene2GO = geneID2GO, nodeSize = 5)

resultFisherBP <- runTest(GOdataBP, algorithm = algorithm, statistic = "fisher")
resultFisherMF <- runTest(GOdataMF, algorithm = algorithm, statistic = "fisher")
resultFisherCC <- runTest(GOdataCC, algorithm = algorithm, statistic = "fisher")

# Write all outputs for one ontology:
#   <prefix>.<algorithm>.<ontology>.txt           - table of significant terms
#   <prefix>.<algorithm>.<ontology>_*             - GO graphs (via printGraph)
#   <prefix>.<algorithm>.<ontology>.GO2genes.txt  - genes annotated per term
#
# Arguments:
#   go_data   - topGOdata object for the ontology
#   result    - topGOresult returned by runTest() on go_data
#   ontology  - ontology label ("BP", "MF" or "CC") used in file names
#   prefix    - output file prefix
#   algorithm - algorithm label used in file names
#   cutoff    - terms with score <= cutoff are treated as significant
#
# Returns (invisibly) the number of significant terms. When there are none,
# a notice is printed and no files are written.
reportEnrichedTerms <- function(go_data, result, ontology, prefix, algorithm,
                                cutoff = 0.01) {
  # Count significant terms directly. The previous length(summary(...)) == 3
  # check misfired when every term was significant or when scores held NAs.
  n_signif <- sum(score(result) <= cutoff, na.rm = TRUE)
  if (n_signif == 0) {
    print(paste0("TRUE in mysummary", ontology, " does not exist!"))
    return(invisible(n_signif))
  }

  stem <- paste(prefix, algorithm, ontology, sep = ".")

  # NOTE(review): the p-value column is always labelled "weight01", whichever
  # algorithm was requested; kept as-is for output compatibility.
  all_res <- GenTable(go_data, weight01 = result, orderBy = algorithm,
                      topNodes = n_signif)
  write.table(all_res, file = paste0(stem, ".txt"), row.names = FALSE)

  # Called twice, once with pdfSW = FALSE and once with pdfSW = TRUE, exactly
  # as in the original script.
  for (pdf_switch in c(FALSE, TRUE)) {
    printGraph(go_data, result, firstSigNodes = n_signif, fn.prefix = stem,
               useInfo = "all", pdfSW = pdf_switch)
  }

  # One line per significant term: "<GO id> <gene1>,<gene2>,...".
  go_ids <- as.character(all_res$GO.ID)
  genes_by_term <- genesInTerm(go_data, go_ids)
  term_lines <- vapply(
    go_ids,
    function(term) {
      paste(term, paste(genes_by_term[term][[1L]], collapse = ","))
    },
    character(1),
    USE.NAMES = FALSE
  )
  write.table(matrix(term_lines, ncol = 1),
              file = paste0(stem, ".GO2genes.txt"),
              row.names = FALSE, col.names = FALSE)

  invisible(n_signif)
}

# Report in the original order: CC, BP, MF.
reportEnrichedTerms(GOdataCC, resultFisherCC, "CC", interestingGenesFile, algorithm)
reportEnrichedTerms(GOdataBP, resultFisherBP, "BP", interestingGenesFile, algorithm)
reportEnrichedTerms(GOdataMF, resultFisherMF, "MF", interestingGenesFile, algorithm)