
source("../headers.R")
source("../functions.R")

# Sample file prefixes analysed by this script: four libraries listed here,
# followed by the `prefixes` and `aphi_prefixes` vectors that must already
# exist when this statement runs (presumably defined by the sourced files —
# confirm).
prefixes <- c(
  "MicGSC_BCP_AAIOSF_2_HiSeq",
  "PGM-1_FACS_ANLG",
  "Ezl174-2_RNAi_r1",
  "DCL2_3_RNAi_r1_HBJ-1",
  prefixes,
  aphi_prefixes
)

# Display labels, position-matched with `prefixes` (labels[i] is used for
# prefixes[i] further down in this script).
labels <- c(
  "MIC",
  "PGM",
  "EZL1",
  "DCL23",
  labels,
  aphi_labels
)


# Sequence lengths of the reference: a tab-separated table whose two columns
# are named SEQ_ID and SEQ_LENGTH below.
seqlength <- read.table("../data/ptetraurelia_mic2.seqlength", sep = "\t")
colnames(seqlength) <- c("SEQ_ID", "SEQ_LENGTH")

# Index the table by sequence id. The previous code used `seqlength$ID`, which
# matches no column (`$` only partial-matches on a name *prefix*), so it
# evaluated to NULL and the row names were silently left as 1..n.
# NOTE(review): this requires SEQ_ID to be unique, otherwise R raises
# "duplicate 'row.names' are not allowed".
rownames(seqlength) <- seqlength$SEQ_ID

# Ids of the sequences strictly longer than 1000; used later to restrict the
# windows that are analysed.
seq_ids <- seqlength[seqlength$SEQ_LENGTH > 1000, ]$SEQ_ID



# Reference identifiers: `ref_pref` is the short tag embedded in per-sample
# file names, `base_genome` the full name used in directory and file names.
ref_pref <- "mic2"
base_genome <- "ptetraurelia_mic2"

# Output directory for the figures; created (with parents) when missing, and
# without a warning when it already exists.
base_img_dir <- "images/"
dir.create(base_img_dir, showWarnings = FALSE, recursive = TRUE)


# Location of the per-window coverage data for the chosen genome.
data_dir <- paste0("/data/PARAMECIUM/COVERAGE/tetraurelia/", base_genome, "/WINDOWS/")
window_size <- 1000   # window size, also embedded in the input file names
quality <- 30         # value embedded in the multicov file names (MIN_QUAL_<quality>)

# Per-window compartment annotation (tab-separated, with a header line).
compartments <- read.table(
  paste0(data_dir, base_genome, ".windows", window_size, "_MIC_MAC_compartments.tab"),
  sep = "\t",
  header = TRUE
)

# Keep annotated windows of full length only. The length is compared with
# `window_size` (previously a hard-coded 1000) so the filter stays consistent
# with the file that was loaded if the window size is ever changed.
compartments <- compartments[compartments$COMPARTMENT != "NONE" & compartments$LENGTH == window_size, ]

# Restrict to windows located on the sequences selected in `seq_ids`.
compartments <- compartments[which(is.element(compartments$SEQ_ID, seq_ids)), ]

# Sequencing type and root directory searched for the mapping logs.
seqtype <- "DNAseq"
mapping_base_dir <- paste0("/data/PARAMECIUM/MAPPING/tetraurelia/", base_genome, "/", seqtype, "/")


# Number of mapped reads per sample, read from each sample's mapping log
# (<prefix>.<mapper>.<ref_pref>.log, searched recursively under
# mapping_base_dir). The value is taken from the first line containing
# "MAPPED_READS=" among the first 10 lines of the log.
#
# The vector is preallocated with NA so that every sample keeps its own slot:
# previously a missing log or a missing/duplicated MAPPED_READS line changed
# the length of the vector, which made the `names<-` call below fail or
# misaligned counts and labels.
mapper <- "BOWTIE"
nb_mapped_reads <- rep(NA_real_, length(prefixes))
for (p in seq_along(prefixes)) {
    prefix <- prefixes[p]
    log_name <- paste0(prefix, ".", mapper, ".", ref_pref, ".log")

    # Arguments are shell-quoted so unusual characters in paths or prefixes
    # are passed to `find` verbatim; only the first hit is used.
    find_cmd <- paste("find", shQuote(mapping_base_dir), "-name", shQuote(log_name), "| head -1")
    log_file <- try(system(find_cmd, intern = TRUE))
    print(paste(prefix, log_file))

    if (inherits(log_file, "try-error") || length(log_file) != 1 || !nzchar(log_file)) {
        warning("No mapping log found for ", prefix, "; mapped read count set to NA", call. = FALSE)
        next
    }

    # Same selection as `head <log> | grep 'MAPPED_READS=' | perl -pe 's/MAPPED_READS=//'`,
    # done in R so no second shell pipeline is needed.
    log_head <- tryCatch(
        readLines(log_file, n = 10, warn = FALSE),
        error = function(e) character(0)
    )
    hits <- grep("MAPPED_READS=", log_head, fixed = TRUE, value = TRUE)
    if (length(hits) == 0) {
        warning("No MAPPED_READS entry in ", log_file, "; mapped read count set to NA", call. = FALSE)
        next
    }
    if (length(hits) > 1) {
        warning("Several MAPPED_READS entries in ", log_file, "; using the first one", call. = FALSE)
    }
    nb_mapped_reads[p] <- as.numeric(sub("MAPPED_READS=", "", hits[1], fixed = TRUE))
}
names(nb_mapped_reads) <- labels
nb_mapped_reads







# Build two window x sample tables:
#   rpm      : reads per million mapped reads in each window
#   rpm_norm : rpm divided by the sample's median rpm over the kept windows
# Both carry SEQ_ID, START, END, LENGTH plus one column per sample label, and
# end up indexed by "SEQ_ID:START:END".
res_dir <- data_dir
rpm <- c()
rpm_norm <- c()
for (p in seq_along(prefixes)) {
   prefix <- prefixes[p]
   cov_file <- paste0(res_dir,"/",base_genome,".windows",window_size,"/",seqtype,"/multicov/",prefix,".BOWTIE.",ref_pref,".MIN_QUAL_",quality,".multicov.tab")
   print(cov_file)

   # Headerless, tab-separated table: sequence id, window start, window end,
   # read count.
   tab <- read.table(cov_file, header = FALSE, sep = "\t")
   colnames(tab) <- c("SEQ_ID","START","END","NB_READs")
   tab$UNIQUENAME <- paste(tab$SEQ_ID, tab$START, tab$END, sep = ":")
   rownames(tab) <- tab$UNIQUENAME

   # Keep only the windows retained in `compartments`, in that order.
   # NOTE(review): this assumes rownames(compartments) use the same
   # "SEQ_ID:START:END" keys; windows absent from this file become NA rows.
   tab <- tab[rownames(compartments), ]
   tab$LENGTH <- (tab$END - tab$START)

   # Scale by the sample's number of mapped reads (in millions), then by the
   # sample median.
   tab$RPM <- tab$NB_READs / (nb_mapped_reads[labels[p]] / 1e6)
   tab$RPM_NORM <- tab$RPM / median(tab$RPM)

   if (is.null(rpm)) {
      # First sample: start the tables with the window coordinates.
      rpm <- tab[, c("UNIQUENAME","SEQ_ID","START","END","LENGTH","RPM")]
      rpm_norm <- tab[, c("UNIQUENAME","SEQ_ID","START","END","LENGTH","RPM_NORM")]
   } else {
      # Following samples: append one value column, matched on the window key.
      rpm <- merge(rpm, tab[, c("UNIQUENAME","RPM")], by = "UNIQUENAME")
      rpm_norm <- merge(rpm_norm, tab[, c("UNIQUENAME","RPM_NORM")], by = "UNIQUENAME")
   }

   # The column just added is always the last one; name it after the sample.
   colnames(rpm)[ncol(rpm)] <- labels[p]
   colnames(rpm_norm)[ncol(rpm_norm)] <- labels[p]
}

# Move the window key from the first column to the row names.
rownames(rpm) <- rpm$UNIQUENAME
rpm <- rpm[, -1]

rownames(rpm_norm) <- rpm_norm$UNIQUENAME
rpm_norm <- rpm_norm[, -1]

head(rpm)
head(rpm_norm)




# One coverage histogram per sample (5 x 5 panels in a single PDF) plus a
# summary table of the fraction of the genome covered at >= `cutoff` RPM.
cutoff <- 2.5   # RPM at or above which a window counts as covered
bin <- 1/2      # histogram bin width (RPM)
pdf(paste0(base_img_dir,"hist_genome_coverage.pdf"),width=20,height=23)
par(mfrow=c(5,5))

# Reference complexity (Mb): total length of the windows covered in the "MIC"
# sample. It uses the same inclusive threshold (>= cutoff) as the per-sample
# values below; it previously used a strict ">" so windows sitting exactly on
# the cutoff were counted for the samples but not for the reference.
max_cplx <- round(sum(rpm[rpm[,"MIC"]>=cutoff,]$LENGTH)/1e6,1)

# Window ids of the highlighted compartments; they do not depend on the
# sample, so they are computed once.
mic_limited_windows_ids <- rownames(compartments[compartments$COMPARTMENT=="MIC-limited",])
mic_only_windows_ids <- rownames(compartments[compartments$SUB_COMPARTMENT=="MIC_only",])

# Character matrix with one row per sample, filled in the loop.
percent_covered_genome <- matrix(
    NA_character_,
    nrow = length(labels),
    ncol = 3,
    dimnames = list(labels, c("Percent MIC covered","MAC-destined","MIC-limited"))
)
for (l in seq_along(labels)) {
    label <- labels[l]
    coverage <- rpm[, label]
    covered <- rpm[coverage >= cutoff, ]

    # Covered length in Mb, and as a percentage of the MIC reference.
    cplx <- round(sum(covered$LENGTH)/1e6, 1)
    pcplx <- round(cplx/max_cplx*100, 0)

    # Same breaks for the three overlaid histograms so the bars line up.
    hist_breaks <- seq(0, round(max(coverage)+1, 0), bin)

    # All windows in grey, then MIC-limited (red) and MIC_only (blue) on top.
    hist(coverage,col="grey",main=paste0(label," ",cplx,"Mb (",pcplx,"%)"),xlim=c(0,30),border="black",breaks=hist_breaks,xlab="Coverage (RPM)",axes=FALSE)
    axis(2)
    #axis(1,col=exp_colors[l])
    axis(1)
    hist(rpm[mic_limited_windows_ids,label],col="red",xlim=c(0,30),main="",border="white",breaks=hist_breaks,xlab="",add=TRUE)
    hist(rpm[mic_only_windows_ids,label],col="blue",xlim=c(0,30),main="",border="white",breaks=hist_breaks,xlab="",add=TRUE)

    abline(v=cutoff,lwd=2,col="black")

    # Split the covered percentage between the two compartments, in
    # proportion to the number of covered windows falling in each.
    compartment_counts <- table(compartments[rownames(covered),]$COMPARTMENT)[c("MAC-destined","MIC-limited")]
    percent_covered_genome[l, ] <- c(paste0(pcplx,"%"), round(c(compartment_counts/nrow(covered) * pcplx), 1))

    # Alternative breakdown by sub-compartment (disabled):
    #percent_covered_genome=rbind(percent_covered_genome,table(compartments[rownames(rpm[rpm[,labels[l]]>=cutoff,]),]$SUB_COMPARTMENT)[c("MAC","MIC_PGM","MIC_only","NONE")]/nrow(rpm[rpm[,labels[l]]>=cutoff,]) * pcplx)
}
dev.off()

write.table(percent_covered_genome,"percent_covered_genome.txt",quote=FALSE,sep="\t")






