#!/usr/bin/env Rscript
library(gplots)
library(ChIPseeker)
library(ChIPpeakAnno)
library(org.Hs.eg.db)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(clusterProfiler)
library(VennDiagram)
library(namespace)


## Positional command-line arguments:
##   1. input   - directory that holds the peak bed files (becomes the working dir)
##   2. time    - time-point label; matched as a pattern against bed file names
##   3. shuffle - number of shuffling rounds used to build the null distribution
##   4. out     - directory the result table is written to
##   5. IDX     - tag inserted into the output file name (NA when omitted)
args <- commandArgs(TRUE)

# Fail early with a usage message instead of a cryptic error further down.
if (length(args) < 4) {
  stop(
    "Usage: Rscript <script> <input_dir> <time> <n_shuffle> <out_dir> [idx]",
    call. = FALSE
  )
}

input <- args[1]
time <- as.character(args[2])
shuffle <- as.numeric(as.character(args[3]))
out <- args[4]
IDX <- args[5]

# A missing/non-numeric count becomes NA, and `1:shuffle` would run two rounds
# (s = 1, then s = 0) for shuffle == 0, so reject both here.
if (is.na(shuffle) || shuffle < 1) {
  stop("<n_shuffle> (3rd argument) must be a number >= 1", call. = FALSE)
}

txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
# Earlier variant that mapped a numeric index onto a fixed set of labels:
#TIME<-c("00h","01h","04h","08h","12h")[time]
TIME <- as.factor(as.character(time))

## Read bed files
##
## Collects the bed files in `input`, keeps the antiFLAG files of the AP-1
## subunits listed below, loads each one with toGRanges() and finally selects
## the peak sets whose file name matches the requested time point (`TIME`).

setwd(input)

# Return the n-th `sep`-delimited field of every file name. vapply() keeps the
# result a character vector even when `x` is empty (sapply() would return a list).
name_field <- function(x, sep, n) {
  vapply(strsplit(as.character(x), sep), `[[`, character(1), n)
}

list <- list.files(pattern = "bed$")
list <- list[name_field(list, "\\.", 2) == "antiFLAG"] # select only antiFLAG
list <- list[name_field(list, "\\_", 2) %in% c("cJun", "FOSL1", "FOSL2", "JUND", "JunB")]
print(list)
files <- as.list(list)

# Each peak set is named "<2nd underscore field>_<4th dot field>" of its file name.
names(files) <- paste(name_field(list, "\\_", 2),
                      name_field(list, "\\.", 4),
                      sep = "_")

## Read bed files

bed_list <- lapply(files, toGRanges)
print(names(bed_list))
#sub_bed_list<-bed_list[c(time,time+5,time+10,time+15,time+20)]

# `list` and `bed_list` are parallel, so a match on the file name selects the
# corresponding peak set.
idx <- grepl(TIME, list, perl = TRUE)
sub_bed_list <- bed_list[idx]

print(TIME)
print(class(TIME))

# An overlap needs at least two peak sets; stop with a readable message rather
# than letting the overlap computation fail on an empty/singleton list.
if (length(sub_bed_list) < 2) {
  stop(
    sprintf(
      "Only %d antiFLAG peak file(s) match time point '%s'; at least 2 are required",
      length(sub_bed_list), as.character(TIME)
    ),
    call. = FALSE
  )
}

## Observed overlaps between the selected peak sets (findOverlapsOfPeaks is
## called with connectedPeaks = "merge"); only the "Counts" column of the
## Venn count table is kept.
ovp <- findOverlapsOfPeaks(sub_bed_list, connectedPeaks = "merge")
venn_observed <- as.matrix(ovp$venn_cnt)
observed.overlap <- venn_observed[, "Counts"]


#print(ovp)
## population of subunits: every selected peak set concatenated into one object
## NOTE(review): relies on a c() method being registered for "GenomicRanges";
## confirm this still resolves with the installed GenomicRanges version.
peak_union <- do.call(getMethod(c, "GenomicRanges"), sub_bed_list)
peak_union

## Flatten the population into a plain table. `starts` holds start - 1,
## i.e. 0-based start coordinates; exact duplicate rows are dropped.
df <- data.frame(
  seqnames = seqnames(peak_union),
  starts = start(peak_union) - 1,
  ends = end(peak_union)
)
is_repeat <- duplicated(df)
df <- df[!is_repeat, ]

## Getting the number of peaks for each CHR
print("Getting the number of peaks for each CHR")

### for A GIVEN TIME POINTS
###
### Empirical null distribution of the overlap counts. In every round, each
### subunit is replaced by a random draw from the population peaks (`df`) that
### has the same number of peaks per chromosome as the real subunit; the
### overlap counts of these shuffled sets are then recorded.
###
### Result: `mat` - the Venn count table of the first round followed by one
### extra "Counts" column for every further round.

# `1:shuffle` would iterate over c(1, 0) for shuffle == 0, so validate and use
# seq_len() instead. as.integer() truncates exactly like `1:shuffle` did.
n_shuffle <- as.integer(shuffle)
if (is.na(n_shuffle) || n_shuffle < 1L) {
  stop("The number of shuffling rounds must be at least 1", call. = FALSE)
}

# Peaks per chromosome for every subunit. Identical in every round, so it is
# computed once instead of inside the loop.
peak_counts <- lapply(sub_bed_list, function(x) table(seqnames = seqnames(x)))

# Row names of the population peaks grouped by chromosome (also invariant).
# Row names are character, which keeps sample() from expanding a single
# numeric value n into 1:n.
rows_by_chr <- split(row.names(df), df$seqnames)

# Draw, for one subunit, as many population peaks from each chromosome as the
# subunit has there (`counts` is its per-chromosome table). Returns the drawn
# rows of `df`, chromosome by chromosome.
sample_matched_peaks <- function(counts) {
  picked <- lapply(seq_along(counts), function(j) {
    pool <- rows_by_chr[[names(counts)[j]]]
    if (is.null(pool)) {
      pool <- character(0) # chromosome absent from the population
    }
    sample(pool, counts[[j]], replace = FALSE)
  })
  df[unlist(picked), ]
}

shuffled_counts <- vector("list", n_shuffle)

for (s in seq_len(n_shuffle)) { # number of shuffling
  # One sampled data frame per subunit (i.e. fosl2, jun, ...)
  query_df <- lapply(peak_counts, function(counts) {
    sampled <- sample_matched_peaks(counts)
    cat("Original subunit factor peak distribution", counts, "\n")
    cat("Sampled peaks from population peaks according to Original subunit factor peak distribution", table(sampled$seqnames), "\n")
    sampled
  })

  cat("Starting looking at ovelap with shuffling", "\n")
  # `starts` in `df` was built as start - 1 (0-based), so it has to be declared
  # as such; otherwise every shuffled peak is 1 bp wider than its source peak.
  shuffle_df <- lapply(query_df, function(x) {
    makeGRangesFromDataFrame(x,
      keep.extra.columns = FALSE,
      seqinfo = NULL,
      seqnames.field = "seqnames",
      start.field = "starts",
      end.field = "ends",
      starts.in.df.are.0based = TRUE
    )
  })

  ## this is randomly selected peaks that matches to the number of peaks in
  ## each chromosome in original factor (fosl2, jun, etc.) numbers
  cat("Starting sorting shuffled peaks", "\n")
  shuffle_df <- lapply(shuffle_df, function(x) sort(sortSeqlevels(x))) ## sorting bed files

  ## compute overlap counts from shuffling
  cat("Starting computing shuffled peaks", "\n")
  ovp <- findOverlapsOfPeaks(shuffle_df, connectedPeaks = "merge")
  venn <- as.matrix(ovp$venn_cnt)

  if (s == 1) {
    mat <- venn
  }
  # Pick the count column by name: its position depends on how many peak sets
  # are compared, so a fixed column index only works for exactly five sets.
  shuffled_counts[[s]] <- venn[, "Counts"]

  cat("Done with computing overlaps for shuffling", s, "\n")
}

# Append the counts of rounds 2..n in one step instead of growing `mat` with
# cbind() on every iteration.
extra_counts <- shuffled_counts[-1]
if (length(extra_counts) > 0) {
  names(extra_counts) <- rep("Counts", length(extra_counts))
  mat <- do.call(cbind, c(list(mat), extra_counts))
}

## Final table: observed overlap counts in the first column, followed by the
## columns collected from the shuffling rounds.
MAT <- cbind(observed.overlap, mat)
head(MAT)

## Write the table, tab-separated with a header and without row names, into
## the output directory. The file name carries the number of shuffles, the
## time point and the index tag.
setwd(out)

out_file <- sprintf(
  "empirical_overlap_dist_shuffle_%s_%s.idx%s.txt",
  shuffle, TIME, IDX
)
write.table(
  MAT,
  file = out_file,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)



