library(scales)

####################
# Every relative path used below is resolved against this directory.
setwd("/Users/acd13/Desktop/ATAC/Analysis/conservation")

# Read the per-peak conservation score table for each stage (EE, L3, YA).
# Author's note: the data should be sorted by gene name (the names were made
# unique and descriptive when the coverage data were generated). Nothing in
# this section re-sorts the tables, so the files are presumably already sorted.
# conservation scores were generated by: intersectBed -a <score bedGraph> -b <region of interest bed> | coverageBed -a - -b <region of interest bed> | awk '{if($NF > 0.5) print $0}' | sortBed -i - | mapBed -a - -b <score bedGraph> -c 4 -o mean
#
# For every stage a second table is kept with the rows whose first column is
# 'chrM' removed; which() also discards rows where that comparison is NA.
phastConsScores.ee <- read.table(
  file = 'EE_consensusGdnaMaskedPeaks_phastCons7way.noRefSeqExons.noRepeats.Score_min0.5Overlap.bed'
)
phastConsScores.ee.noChrM <- phastConsScores.ee[
  which(phastConsScores.ee[, 1] != 'chrM'), , drop = FALSE
]

phastConsScores.l3 <- read.table(
  file = 'L3_consensusGdnaMaskedPeaks_phastCons7way.noRefSeqExons.noRepeats.Score_min0.5Overlap.bed'
)
phastConsScores.l3.noChrM <- phastConsScores.l3[
  which(phastConsScores.l3[, 1] != 'chrM'), , drop = FALSE
]

phastConsScores.ya <- read.table(
  file = 'YA_consensusGdnaMaskedPeaks_phastCons7way.noRefSeqExons.noRepeats.Score_min0.5Overlap.bed'
)
phastConsScores.ya.noChrM <- phastConsScores.ya[
  which(phastConsScores.ya[, 1] != 'chrM'), , drop = FALSE
]

# Read the null distributions: one gzipped, header-less table per stage
# (allTogether.txt.gz — presumably all shuffles pooled into one file; confirm).
ee.nullData.asOne <- read.table(
  file = gzfile('./nullDists_consensusGdnaMaskedPeaks_phastCons7way.noRefSeqExons.noRepeats.Score_min0.5Overlap/EE/allTogether.txt.gz'),
  header = FALSE
)
l3.nullData.asOne <- read.table(
  file = gzfile('./nullDists_consensusGdnaMaskedPeaks_phastCons7way.noRefSeqExons.noRepeats.Score_min0.5Overlap/L3/allTogether.txt.gz'),
  header = FALSE
)
ya.nullData.asOne <- read.table(
  file = gzfile('./nullDists_consensusGdnaMaskedPeaks_phastCons7way.noRefSeqExons.noRepeats.Score_min0.5Overlap/YA/allTogether.txt.gz'),
  header = FALSE
)

# Median of the 4th column of the tab-separated, header-less bedGraph; used
# further down as the denominator when normalizing scores.
genomeWideMedian <- median(
  read.delim(
    "/Volumes/extra/Genomic_files/worm/ce10/ce10_conservations/ce10.phastCons7way.noRefSeqExons.noRepeats.singleBp.bedGraph",
    header = FALSE
  )[[4]]
)

# now normalize all values to be relative to the genomewide median:
#   log2(score / genomeWideMedian)
# Real scores come from the last column of each chrM-filtered peak table, null
# scores from the first column of each pooled null table.
#
# Only the unfiltered peak tables are removed afterwards. The chrM-filtered
# tables (phastConsScores.*.noChrM) and the pooled null tables
# (*.nullData.asOne) are deliberately kept: the KS tests later in this script
# read them, and removing them here makes those tests fail with
# "object not found".
realEENormdVals <- log2(phastConsScores.ee.noChrM[, ncol(phastConsScores.ee.noChrM)] / genomeWideMedian)
nullEENormdVals <- log2(ee.nullData.asOne[, 1] / genomeWideMedian)
rm(phastConsScores.ee)

realL3NormdVals <- log2(phastConsScores.l3.noChrM[, ncol(phastConsScores.l3.noChrM)] / genomeWideMedian)
nullL3NormdVals <- log2(l3.nullData.asOne[, 1] / genomeWideMedian)
rm(phastConsScores.l3)

realYANormdVals <- log2(phastConsScores.ya.noChrM[, ncol(phastConsScores.ya.noChrM)] / genomeWideMedian)
nullYANormdVals <- log2(ya.nullData.asOne[, 1] / genomeWideMedian)
rm(phastConsScores.ya)


# Draw notched boxplots of the real vs. null normalized scores for each stage
# into a 5x5 PDF. Outlier points are not drawn (outline = FALSE).
pdf(file = 'conservation_noOutliers.pdf', width = 5, height = 5)

# One (real, null) pair per stage. The 0s are for spacing: a lone 0 sits
# between consecutive stages and is filled white.
boxGroups <- list(
  realEENormdVals, nullEENormdVals,
  0,
  realL3NormdVals, nullL3NormdVals,
  0,
  realYANormdVals, nullYANormdVals
)

# Real data gets the solid stage colour, its null the same colour at alpha 0.5.
boxFills <- c(
  'darkorchid4', alpha('darkorchid4', 0.5),
  'white',
  'goldenrod2', alpha('goldenrod2', 0.5),
  'white',
  'darkgreen', alpha('darkgreen', 0.5)
)

# Axis labels, in the same order as boxGroups; spacers get an empty label.
boxLabels <- c(
  'Embryo', 'Null',
  "",
  "Larval", 'Null',
  '',
  'Adult', 'Null'
)

boxplot(
  boxGroups,
  notch = TRUE,
  col = boxFills,
  names = boxLabels,
  ylab = 'Log2(Mean bp PhastCons score / distal and non-coding genome median)',
  outline = FALSE
)
dev.off()

## Now check significance
# Two-sample Kolmogorov-Smirnov test per stage: the raw (un-normalized) real
# scores, taken from the last column of the chrM-filtered peak table, against
# the pooled null scores in the first column of the null table. Both tables
# must still be present in the workspace when this section runs.
# format.pval(p, 6, 1e-323) reports any p-value below 1e-323 as a "< ..." string.
# Each stage's p-value is printed explicitly so that every recorded output
# below has the call that produced it.
eeKSTest <- ks.test(as.numeric(phastConsScores.ee.noChrM[, ncol(phastConsScores.ee.noChrM)]), as.numeric(ee.nullData.asOne[, 1]))
format.pval(eeKSTest$p.value, 6, 1e-323)
#[1] "< 9.881e-324"
l3KSTest <- ks.test(as.numeric(phastConsScores.l3.noChrM[, ncol(phastConsScores.l3.noChrM)]), as.numeric(l3.nullData.asOne[, 1]))
format.pval(l3KSTest$p.value, 6, 1e-323)
#[1] "< 9.881e-324"
yaKSTest <- ks.test(as.numeric(phastConsScores.ya.noChrM[, ncol(phastConsScores.ya.noChrM)]), as.numeric(ya.nullData.asOne[, 1]))
format.pval(yaKSTest$p.value, 6, 1e-323)
#[1] "< 9.881e-324"

# I also want to do a test against each individual shuffle, just to make sure the pvals aren't being inflated because of the numbers
#
# list.files() takes a regular expression, not a shell glob. Written as a
# regex, "L3*" means "an L followed by any number of 3s" and matches every
# file name that contains an "L"; "^L3" keeps only names starting with "L3".
# NOTE(review): assumes the per-shuffle files carry an "L3" prefix — confirm.
l3NullDir <- './nullDists_consensusGdnaMaskedPeaks_phastCons7way.noRefSeqExons.noRepeats.Score_min0.5Overlap/L3/'
allL3Nulls <- list.files(path = l3NullDir, pattern = "^L3")
# format.pval() returns strings, so the result vector is character.
L3Pvals <- character(length(allL3Nulls))
# seq_along() gives zero iterations for an empty listing (1:0 would run twice).
for (i in seq_along(allL3Nulls)) {
  nullData <- read.table(gzfile(paste0(l3NullDir, allL3Nulls[i])), header = FALSE)
  # Real L3 scores (last column) against this one shuffle (first column).
  ksTest <- ks.test(as.numeric(phastConsScores.l3.noChrM[, ncol(phastConsScores.l3.noChrM)]), as.numeric(nullData[, 1]))
  L3Pvals[i] <- format.pval(ksTest$p.value, 6, 1e-323)
}
# Number of shuffles whose p-value fell below the 1e-323 reporting floor.
length(which(L3Pvals == "< 9.881e-324"))
#[1] 10000

# KS test of the real YA scores against each individual YA shuffle.
#
# list.files() takes a regular expression, not a shell glob. Written as a
# regex, "YA*" means "a Y followed by any number of As" and matches every
# file name that contains a "Y"; "^YA" keeps only names starting with "YA".
# NOTE(review): assumes the per-shuffle files carry a "YA" prefix — confirm.
yaNullDir <- './nullDists_consensusGdnaMaskedPeaks_phastCons7way.noRefSeqExons.noRepeats.Score_min0.5Overlap/YA/'
allYANulls <- list.files(path = yaNullDir, pattern = "^YA")
# format.pval() returns strings, so the result vector is character.
YAPvals <- character(length(allYANulls))
# seq_along() gives zero iterations for an empty listing (1:0 would run twice).
for (i in seq_along(allYANulls)) {
  nullData <- read.table(gzfile(paste0(yaNullDir, allYANulls[i])), header = FALSE)
  # Real YA scores (last column) against this one shuffle (first column).
  ksTest <- ks.test(as.numeric(phastConsScores.ya.noChrM[, ncol(phastConsScores.ya.noChrM)]), as.numeric(nullData[, 1]))
  YAPvals[i] <- format.pval(ksTest$p.value, 6, 1e-323)
}
# Number of shuffles whose p-value fell below the 1e-323 reporting floor.
length(which(YAPvals == "< 9.881e-324"))
#[1] 10000

#allMetaNulls <- list.files(path="./nullDists_consensusGdnaMaskedPeaks_phastCons7way.noRefSeqExons.noRepeats.Score_min0.5Overlap/meta/", pattern="all4Stages*")
#MetaPvals <- vector('numeric',length=length(allMetaNulls))
#for (i in 1:length(allMetaNulls)){
#  nullData <- read.table(gzfile(paste('./nullDists_consensusGdnaMaskedPeaks_phastCons7way.noRefSeqExons.noRepeats.Score_min0.5Overlap/meta/',allMetaNulls[i],sep="")), header=F)
#  ksTest <- ks.test(as.numeric(phastConsScores.meta.noChrM[,ncol(phastConsScores.meta.noChrM)]),as.numeric(nullData[,1]))
#  MetaPvals[i] <- format.pval(ksTest$p.value,6,1e-323)
#}

# KS test of the real EE scores against each individual EE shuffle.
#
# list.files() takes a regular expression, not a shell glob. Written as a
# regex, "EE*" means "an E followed by any number of Es" and matches every
# file name that contains an "E"; "^EE" keeps only names starting with "EE".
# NOTE(review): assumes the per-shuffle files carry an "EE" prefix — confirm.
eeNullDir <- './nullDists_consensusGdnaMaskedPeaks_phastCons7way.noRefSeqExons.noRepeats.Score_min0.5Overlap/EE/'
allEENulls <- list.files(path = eeNullDir, pattern = "^EE")
# format.pval() returns strings, so the result vector is character.
EEPvals <- character(length(allEENulls))
# seq_along() gives zero iterations for an empty listing (1:0 would run twice).
for (i in seq_along(allEENulls)) {
  nullData <- read.table(gzfile(paste0(eeNullDir, allEENulls[i])), header = FALSE)
  # Real EE scores (last column) against this one shuffle (first column).
  ksTest <- ks.test(as.numeric(phastConsScores.ee.noChrM[, ncol(phastConsScores.ee.noChrM)]), as.numeric(nullData[, 1]))
  EEPvals[i] <- format.pval(ksTest$p.value, 6, 1e-323)
}
# Number of shuffles whose p-value fell below the 1e-323 reporting floor.
length(which(EEPvals == "< 9.881e-324"))

# Every single one, for every single sample had a p < 9.881e-324
