# All relative paths in this script are resolved against the project directory.
setwd("/misc/paras/data/arne/hydra/hydra_mutation_rate")

# library() aborts immediately when data.table is missing; require() would only
# return FALSE and the script would fail later at the first fread() call.
library(data.table)
# Minimum read depth applied to variant calls and to coverage look-ups below.
read_threshold <- 20

# Single-cell sample identifiers, grouped by cell type.
I_cells <- c("sample1", "sample2", "sample3")
E_cells <- c("sample5", "sample6_2nd+3rd_run", "sample7")
# Lookup from sample identifier to the name of its cell-type group.
sample2celltype <- setNames(c(rep("I_cells", 3), rep("E_cells", 3)), c(I_cells, E_cells))

# Coverage histograms per single-cell sample (columns: reads, bases, fraction).
# I-cell samples use the "sample5-7_vs_" file prefix, E-cell samples use
# "sample1-3_vs_".
min_coverage <- c(
     sapply(simplify = FALSE, I_cells, function(sample) {
          fread(paste0("sample5-7_vs_", sample, ".min.coverage"), col.names = c("reads", "bases", "fraction"))
     }),
     sapply(simplify = FALSE, E_cells, function(sample) {
          fread(paste0("sample1-3_vs_", sample, ".min.coverage"), col.names = c("reads", "bases", "fraction"))
     })
)
# Add cumulative columns: for every row, the sum of bases / fraction over all
# rows whose read count is at least that row's read count. Each table is
# rebuilt and returned instead of being modified through `<<-` from inside
# sapply(), which also avoids dumping the assigned vectors to the console.
min_coverage <- lapply(min_coverage, function(tbl) {
     rows_at_least <- lapply(tbl$reads, function(min_reads) which(tbl$reads >= min_reads))
     tbl$cum_bases <- sapply(rows_at_least, function(rows) sum(tbl$bases[rows]))
     tbl$cum_fraction <- sapply(rows_at_least, function(rows) sum(tbl$fraction[rows]))
     tbl
})
# Show the histogram row at the read threshold for every sample.
sapply(min_coverage, function(x) x[which(x$reads == read_threshold), ])

# MuTect call statistics per single-cell sample, reduced to rows judged "KEEP"
# whose total read count reaches the read threshold.
mutect <- sapply(c(I_cells, E_cells), simplify = FALSE, function(sample) {
     calls <- fread(paste0("mutect.", sample, ".call_stats.txt"), sep = "\t", header = TRUE)
     deep_enough <- calls$total_reads >= read_threshold
     kept_by_mutect <- calls$judgement == "KEEP"
     calls[which(deep_enough & kept_by_mutect), ]
})
# Number of retained MuTect calls per sample.
sapply(mutect, nrow)

# VarScan2 SNP calls per single-cell sample. Two helper columns hold the summed
# read counts (reads1 + reads2) of the normal and the tumor sample; rows are
# kept when they are flagged "Somatic", have normal_reads2 == 0 and reach the
# read threshold in both samples.
# NOTE(review): reads2 presumably counts variant-supporting reads -- confirm
# against the VarScan2 output description.
varscan <- sapply(c(I_cells, E_cells), simplify = FALSE, function(sample) {
     calls <- fread(paste0("VarScan2.", sample, ".snp"), sep = "\t", header = TRUE)
     calls$normal_reads <- as.integer(calls[["normal_reads1"]]) + as.integer(calls[["normal_reads2"]])
     calls$tumor_reads <- as.integer(calls[["tumor_reads1"]]) + as.integer(calls[["tumor_reads2"]])
     is_somatic <- calls$somatic_status == "Somatic"
     clean_normal <- calls$normal_reads2 == 0
     tumor_deep <- calls$tumor_reads >= read_threshold
     normal_deep <- calls$normal_reads >= read_threshold
     calls[which(is_somatic & clean_normal & tumor_deep & normal_deep), ]
})
# Number of retained VarScan2 calls per sample.
sapply(varscan, nrow)

# UnifiedGenotyper VCF tables for every single-cell sample and for the two
# pooled samples "sample1-3" and "sample5-7".
# The first 5556 lines of each file are skipped, so the line after them must
# be the column header -- assumes all VCFs share the same header length.
ugt <- sapply(
     c(I_cells, E_cells, "sample1-3", "sample5-7"),
     function(sample) {
          vcf_path <- paste0(sample, ".srt.rmdup.RG.realigned.BQSR.UnifiedGenotyper.vcf")
          fread(vcf_path, header = TRUE, sep = "\t", skip = 5556)
     },
     simplify = FALSE
)
# Number of VCF records per sample.
sapply(ugt, nrow)

# Calls supported by all three callers: for every sample present in the MuTect,
# VarScan2 and UnifiedGenotyper lists, join the three tables on
# contig/chromosome and position.
merged <- sapply(
     Reduce(intersect, list(names(ugt), names(varscan)), names(mutect)),
     function(sample) {
          mutect_and_varscan <- merge(
               mutect[[sample]], varscan[[sample]],
               by.x = c("contig", "position"), by.y = c("chrom", "position")
          )
          merge(
               mutect_and_varscan, ugt[[sample]],
               by.x = c("contig", "position"), by.y = c("#CHROM", "POS")
          )
     },
     simplify = FALSE
)
# Number of three-caller calls per sample.
sapply(merged, nrow)

# Pooled sample whose UnifiedGenotyper calls are subtracted from each
# single-cell sample.
sample2bulk <- setNames(
     rep(c("sample5-7", "sample1-3"), each = 3),
     c("sample1", "sample2", "sample3", "sample5", "sample6_2nd+3rd_run", "sample7")
)

# Anti-join: return the rows of `x` whose key does not occur in `y`.
#
# x, y       data frames (or matrices) to compare.
# by.x, by.y key columns (names or indices) of `x` and `y`; same length.
# by         optional shortcut that sets both by.x and by.y.
#
# Returns `x` restricted to rows whose combination of key values is absent
# from `y` (always with drop = FALSE, so a table comes back even when a
# single row or column remains).
#
# The previous version tested every key column on its own
# (x$a %in% y$a and x$b %in% y$b), which also removed rows whose values occur
# in `y` only on *different* rows. It also failed in rowSums() whenever a
# single key column was used. Keys are now compared as whole tuples.
subtract <- function(x,y,by.x=1,by.y=1,by=NULL){
     if (!is.null(by)){by.x=by;by.y=by}
     if (!((length(by.x)==length(by.y)))) stop("by.x and by.y must have same length")
     if (length(by.x) == 0) stop("at least one key column is required")

     # Encode each key column of x and y against a shared set of values so
     # that integer/double/factor representations of the same value compare
     # equal, then glue the per-column codes into one key per row.
     codes_x <- vector("list", length(by.x))
     codes_y <- vector("list", length(by.x))
     for (z in seq_along(by.x)) {
          values_x <- x[, by.x[z]]
          values_y <- y[, by.y[z]]
          if (is.factor(values_x)) values_x <- as.character(values_x)
          if (is.factor(values_y)) values_y <- as.character(values_y)
          shared_values <- unique(c(values_x, values_y))
          codes_x[[z]] <- match(values_x, shared_values)
          codes_y[[z]] <- match(values_y, shared_values)
     }
     key_x <- do.call(paste, c(codes_x, sep = "_"))
     key_y <- do.call(paste, c(codes_y, sep = "_"))

     x[!(key_x %in% key_y), , drop = FALSE]
}
# Remove from every single-cell call set all positions that UnifiedGenotyper
# also called in the pooled sample assigned to it by sample2bulk.
merged_sub_bulk <- sapply(names(merged), simplify = FALSE, function(sample) {
     cell_calls <- as.data.frame(merged[[sample]])
     bulk_calls <- as.data.frame(ugt[[sample2bulk[sample]]])
     subtract(cell_calls, bulk_calls, by.x = c("contig", "position"), by.y = c("#CHROM", "POS"))
})
# Remaining calls per sample.
sapply(merged_sub_bulk, nrow)

# Cell-cycle length in days per cell type and the number of mitoses derived
# from it.
# NOTE(review): 44 * 365 looks like an age of 44 years in days and
# log2(100000) like the divisions needed to reach 100000 cells -- confirm.
cell_cycle_days <- c(I_cells = 1.125, E_cells = 3.5)
estimated_mitoses <- 44 * 365 / cell_cycle_days + log(100000, base = 2)
# Per sample: mutation count, bases covered at the read threshold, mutations
# per covered base, and that value divided by the estimated mitoses of the
# sample's cell type. One column per sample.
mut_rate_merged_sub_bulk <- sapply(names(merged_sub_bulk), function(x) {
     threshold_row <- which(min_coverage[[x]]$reads == read_threshold)
     covered_bases <- unname(unlist(min_coverage[[x]][threshold_row, "cum_bases"]))
     n_mutations <- nrow(merged_sub_bulk[[x]])
     mitoses <- unname(estimated_mitoses[sample2celltype[x]])
     c(
          mutations = n_mutations,
          covered_bases = covered_bases,
          mutations_per_base = n_mutations / covered_bases,
          mutation_rate = n_mutations / covered_bases / mitoses
     )
})
View(mut_rate_merged_sub_bulk)


# Write one headerless, tab-separated file per sample with the columns
# contig, position, position (CRLF line endings).
dir.create("merged_sub_bulk2")
for (x in names(merged_sub_bulk)) {
     bed_rows <- merged_sub_bulk[[x]][, c("contig", "position", "position")]
     bed_path <- paste0("merged_sub_bulk2/merged_sub_bulk_", x, ".bed")
     write.table(
          bed_rows, bed_path,
          col.names = FALSE, row.names = FALSE, quote = FALSE,
          sep = "\t", eol = "\r\n"
     )
}

library(data.table)
library(VariantAnnotation)
library(GenomicFeatures)
# Transcript database built from the gene-model GFF3; passed to predictCoding() below.
hydra_TxDb=makeTxDbFromGFF("hydra2.0_genemodels.gff3")
# Per sample: annotate the calls that fall into coding sequence.
# The "merged_sub_bulk_CDS_<sample>.bed" files are read here but not written by
# this script -- presumably produced externally from the BED files exported
# above (TODO confirm).
merged_sub_bulk_CDS=sapply(simplify=F,names(merged_sub_bulk),function(x) {
     # Join the CDS BED rows with the call table on chromosome/position and keep
     # columns 1:4, 6 and 7 of the join, renamed to chrom/start/stop/gene/REF/ALT.
     # NOTE(review): assumes columns 6 and 7 of the join hold the reference and
     # alternative allele -- verify against the merged column order.
     ret=setNames(merge(fread(paste0("merged_sub_bulk2/merged_sub_bulk_CDS_",x,".bed"),col.names=c("chrom","start","stop","gene")),merged_sub_bulk[[x]],by.x=c("chrom","start"),by.y=c("contig","position"))[,c(1:4,6,7)],c("chrom","start","stop","gene","REF","ALT"))
     # Single-base ranges (start == end) with the ALT allele as variant sequence.
     # The local variable reuses the function's name; R still resolves the call
     # to the function.
     predictCoding=predictCoding(makeGRangesFromDataFrame(ret,seqnames.field = "chrom",start.field="start",end.field = "start"),hydra_TxDb,FaFile("Hm105_Dovetail_Assembly_1.0.fa"),DNAStringSet(ret$ALT))
     consequence=as.character(predictCoding$CONSEQUENCE)
     PROTEINLOC=as.numeric(predictCoding$PROTEINLOC)
     REFAA=predictCoding$REFAA
     VARAA=predictCoding$VARAA
     # For every row of ret: the fraction of its predictCoding hits (matched via
     # QUERYID) that fall into each category. A row without any hit divides by
     # zero and yields NaN.
     ret=cbind(ret,t(sapply(1:nrow(ret),function(x) {
          x=which(predictCoding$QUERYID==x)
          c(nonsense=length(which(consequence[x]=="nonsense")),nonsynonymous=length(which(consequence[x]=="nonsynonymous")),synonymous=length(which(consequence[x]=="synonymous")),start_loss=sum(PROTEINLOC[x]==1),stop_loss=sum(REFAA[x]=="*" & VARAA[x]!="*"))/length(x)
     })))
     # Codon and amino-acid columns are taken from the first hit of each row only.
     i=sapply(1:nrow(ret),function(x) which(predictCoding$QUERYID==x)[1])
     ret=cbind(ret,as.data.frame(predictCoding)[i,c("REFCODON","VARCODON","REFAA","VARAA")])
     ret
})

# Coverage histograms restricted to coding sequence (columns: reads, bases,
# fraction), read from the ".min.coverage_CDS" files. I-cell samples use the
# "sample5-7_vs_" file prefix, E-cell samples use "sample1-3_vs_".
min_coverage_CDS <- c(
     sapply(simplify = FALSE, c(I_cells), function(sample) {
          fread(paste0("sample5-7_vs_", sample, ".min.coverage_CDS"), col.names = c("reads", "bases", "fraction"))
     }),
     sapply(simplify = FALSE, c(E_cells), function(sample) {
          fread(paste0("sample1-3_vs_", sample, ".min.coverage_CDS"), col.names = c("reads", "bases", "fraction"))
     })
)

# Add cumulative columns: for every row, the sum of bases / fraction over all
# rows whose read count is at least that row's read count. Each table is
# rebuilt and returned instead of being modified through `<<-` from inside
# sapply(), which also avoids dumping the assigned vectors to the console.
min_coverage_CDS <- lapply(min_coverage_CDS, function(tbl) {
     rows_at_least <- lapply(tbl$reads, function(min_reads) which(tbl$reads >= min_reads))
     tbl$cum_bases <- sapply(rows_at_least, function(rows) sum(tbl$bases[rows]))
     tbl$cum_fraction <- sapply(rows_at_least, function(rows) sum(tbl$fraction[rows]))
     tbl
})
# Show the histogram row at the read threshold for every sample.
sapply(min_coverage_CDS, function(x) x[which(x$reads == read_threshold), ])


# Same per-sample summary as for the whole genome, using the CDS call tables
# and the CDS coverage histograms. One column per sample.
mut_rate_merged_sub_bulk_CDS <- sapply(names(merged_sub_bulk_CDS), function(x) {
     threshold_row <- which(min_coverage_CDS[[x]]$reads == read_threshold)
     covered_bases <- unname(unlist(min_coverage_CDS[[x]][threshold_row, "cum_bases"]))
     n_mutations <- nrow(merged_sub_bulk_CDS[[x]])
     mitoses <- unname(estimated_mitoses[sample2celltype[x]])
     c(
          mutations = n_mutations,
          covered_bases = covered_bases,
          mutations_per_base = n_mutations / covered_bases,
          mutation_rate = n_mutations / covered_bases / mitoses
     )
})
View(mut_rate_merged_sub_bulk_CDS)

# Genome-wide versus CDS mutation rate, first for columns 1:3, then for 4:6.
t.test(
     mut_rate_merged_sub_bulk["mutation_rate", 1:3],
     mut_rate_merged_sub_bulk_CDS["mutation_rate", 1:3]
)
t.test(
     mut_rate_merged_sub_bulk["mutation_rate", 4:6],
     mut_rate_merged_sub_bulk_CDS["mutation_rate", 4:6]
)
# Long table (rate, CDS indicator, I-cell indicator) for a two-factor ANOVA.
genome_wide_rows <- cbind(rate = mut_rate_merged_sub_bulk["mutation_rate", 1:6], CDS = 0, I_cell = c(1, 1, 1, 0, 0, 0))
cds_rows <- cbind(rate = mut_rate_merged_sub_bulk_CDS["mutation_rate", 1:6], CDS = 1, I_cell = c(1, 1, 1, 0, 0, 0))
h <- as.data.frame(rbind(genome_wide_rows, cds_rows))
summary(aov(rate ~ CDS + I_cell, h))

# Reference mutation-rate estimates for human and mouse, plus one CDS value
# per species obtained by scaling the species mean.
# NOTE(review): the origin of the scaling factors 0.96/1.4 and 1.23/1.2 is not
# visible in this script -- confirm against the source publication.
mut_rate_human <- c(3.5*10^-9, 3.2*10^-9, 1.4*10^-9, 2.8*10^-9, 2.8*10^-9, 2.5*10^-9, 2.8*10^-9, 2.2*10^-9, 1.6*10^-9, 2.6*10^-9)
mut_rate_human_CDS <- mean(mut_rate_human) * 0.96 / 1.4
mut_rate_mouse <- c(7.1*10^-9, 1.0*10^-8, 8.2*10^-9, 6.7*10^-9, 8.1*10^-9)
mut_rate_mouse_CDS <- mean(mut_rate_mouse) * 1.23 / 1.2

# Question from Matthias (3.): is the ALL-vs-CDS difference in human
# significant, at least in comparison to mouse?
# Each species' CDS value is compared with that species' own genome-wide
# estimates. The mouse term previously subtracted mut_rate_human, which mixed
# the two species in the second sample.
wilcox.test(mut_rate_human_CDS - mut_rate_human, mut_rate_mouse_CDS - mut_rate_mouse)
# Further questions from Matthias: columns 1:3 versus columns 4:6, genome-wide
# and CDS, each with a Wilcoxon test and a t-test.
wilcox.test(
     mut_rate_merged_sub_bulk["mutation_rate", 1:3],
     mut_rate_merged_sub_bulk["mutation_rate", 4:6]
)
wilcox.test(
     mut_rate_merged_sub_bulk_CDS["mutation_rate", 1:3],
     mut_rate_merged_sub_bulk_CDS["mutation_rate", 4:6]
)
t.test(
     mut_rate_merged_sub_bulk["mutation_rate", 1:3],
     mut_rate_merged_sub_bulk["mutation_rate", 4:6]
)
t.test(
     mut_rate_merged_sub_bulk_CDS["mutation_rate", 1:3],
     mut_rate_merged_sub_bulk_CDS["mutation_rate", 4:6]
)


# Mean mutations per base of each cell type divided by every candidate number
# of mitoses from 1 to 100000.
x <- 1:100000
mean_load_I_cells <- mean(mut_rate_merged_sub_bulk["mutations_per_base", I_cells])
mean_load_E_cells <- mean(mut_rate_merged_sub_bulk["mutations_per_base", E_cells])
mut_rates_by_estimated_number_of_mitoses <- cbind(
     estimated_number_of_mitoses = x,
     I_cells = mean_load_I_cells / x,
     E_cells = mean_load_E_cells / x
)

# Count single-nucleotide substitutions by strand-collapsed class.
#
# mutation_table  table with one row per mutation (data frame, data.table or
#                 character matrix with column names).
# ref_allele      name of the column holding the reference base.
# alt_allele      name of the column holding the alternative base.
#
# Returns a one-column data frame (`count`) with the six rows
# "AT->CG", "AT->GC", "AT->TA", "CG->AT", "CG->GC", "CG->TA". Each class
# counts a substitution together with its reverse complement (e.g. A->C and
# T->G both count as "AT->CG"). Rows whose alleles are not single bases match
# no class and are ignored.
#
# The two copy-pasted loops and their dead c(...) statements are folded into
# one helper, the count column gets a stable name instead of one derived from
# the deparsed expression, and columns are accessed so that data.table input
# works as well.
mutation_nucl_distr=function(mutation_table,ref_allele="ref_allele",alt_allele="alt_allele"){
     nucl_pairs <- c(C = "G", A = "T", G = "C", T = "A")
     tbl <- as.data.frame(mutation_table)
     if (!all(c(ref_allele, alt_allele) %in% names(tbl))) stop("undefined columns selected")
     ref <- as.character(tbl[[ref_allele]])
     alt <- as.character(tbl[[alt_allele]])

     # Occurrences of from->to plus occurrences of its reverse complement.
     count_class <- function(from, to) {
          sum(ref == from & alt == to, na.rm = TRUE) +
               sum(ref == nucl_pairs[[from]] & alt == nucl_pairs[[to]], na.rm = TRUE)
     }

     # Reference bases A and C, each against the three other bases, in the
     # order given by names(nucl_pairs).
     from <- rep(c("A", "C"), each = 3)
     to <- c(setdiff(names(nucl_pairs), "A"), setdiff(names(nucl_pairs), "C"))
     counts <- unname(mapply(count_class, from, to))

     data.frame(
          count = counts,
          row.names = paste0(from, nucl_pairs[from], "->", to, nucl_pairs[to])
     )
}

# Substitution-class counts per sample, one column per sample.
per_sample_class_counts <- sapply(merged_sub_bulk, mutation_nucl_distr, simplify = FALSE)
merged_sub_bulk_mut_nucl_distr <- setNames(
     data.frame(do.call(cbind, per_sample_class_counts)),
     names(merged_sub_bulk)
)
# Convert every column to percentages of its own total.
merged_sub_bulk_mut_nucl_distr_percent <- apply(
     merged_sub_bulk_mut_nucl_distr, 2,
     function(class_counts) class_counts / sum(class_counts) * 100
)
# Append reference percentages for human and mouse.
# from: https://www.nature.com/articles/ncomms15183/tables/1
merged_sub_bulk_mut_nucl_distr_percent <- cbind(
     merged_sub_bulk_mut_nucl_distr_percent,
     human = c(8.41, 28.4, 8.03, 19.68, 9.52, 25.97),
     mouse = c(25.12, 30.62, 13.02, 7.1, 4.84, 19.3)
)

# One "contig_position_alt" key per mutation and sample, used to intersect
# mutation sets between samples.
# The keys are built column-wise with paste(). apply() on a data frame first
# converts it to a character matrix, which runs numeric columns through
# format() and pads positions to the widest value of that sample, so the same
# site could get different keys in different samples.
# simplify = FALSE keeps a named list even when all samples happen to have the
# same number of mutations (sapply would otherwise return a matrix that
# cannot be indexed by sample name).
mutations_merged_sub_bulk <- sapply(c(I_cells, E_cells), simplify = FALSE, function(sample) {
     calls <- merged_sub_bulk[[sample]]
     paste(calls[["contig"]], calls[["position"]], calls[["alt_allele"]], sep = "_")
})




# Everything needed to draw Figure 1, grouped by panel, and saved as RDS.
Fig_1_data <- list(
     # A: mutation rates per sample (genome-wide and CDS) plus human/mouse values.
     A = list(
          I_cells = mut_rate_merged_sub_bulk["mutation_rate", I_cells],
          I_cells_CDS = mut_rate_merged_sub_bulk_CDS["mutation_rate", I_cells],
          E_cells = mut_rate_merged_sub_bulk["mutation_rate", E_cells],
          E_cells_CDS = mut_rate_merged_sub_bulk_CDS["mutation_rate", E_cells],
          human = mut_rate_human,
          human_CDS = mut_rate_human_CDS,
          mouse = mut_rate_mouse,
          mouse_CDS = mut_rate_mouse_CDS
     ),
     # B: rate curves over the assumed number of mitoses and the two points at
     # the estimated number of mitoses (2 x 2 matrix, one column per cell type).
     B = list(
          lines = mut_rates_by_estimated_number_of_mitoses,
          points = matrix(
               c(
                    estimated_mitoses["I_cells"],
                    mean(mut_rate_merged_sub_bulk["mutation_rate", I_cells]),
                    estimated_mitoses["E_cells"],
                    mean(mut_rate_merged_sub_bulk["mutation_rate", E_cells])
               ),
               nrow = 2,
               dimnames = list(
                    c("estimated_number_of_mitoses", "mutation_rate"),
                    c("I_cells", "E_cells")
               )
          )
     ),
     # C: substitution-class percentages and the sample-to-cell-type lookup.
     C = list(
          distribution_samples = merged_sub_bulk_mut_nucl_distr_percent,
          sample2celltype = sample2celltype
     ),
     # D: mutation keys per sample, split by cell type.
     D = list(
          I_cells = mutations_merged_sub_bulk[I_cells],
          E_cells = mutations_merged_sub_bulk[E_cells]
     )
)

saveRDS(Fig_1_data, "Fig_1_data.RDS")

library(VennDiagram)
# Draw all panels of Figure 1 into one multi-page PDF.
pdf("Figure_1.pdf")
# Panel 1A: empty log-scaled frame with four x positions, one per group.
plot(main="1A",1,type="n",xlim=c(1,4),ylim=c(min(unlist(Fig_1_data$A))*0.9,max(unlist(Fig_1_data$A))*1.1),xlab="",ylab="Mutation rate",xaxt="n",log="y")
axis(1,1:4,c("Interstitial cells","Epithelial cells","Human","Mouse"))
# Individual values: elements 1:5 and 7 of panel A (the scalar human_CDS and
# mouse_CDS entries are left out here). pch 16 marks genome-wide values,
# pch 15 marks CDS values.
points(c(rep(1,length(Fig_1_data$A$I_cells)+length(Fig_1_data$A$I_cells_CDS)),rep(2,length(Fig_1_data$A$E_cells)+length(Fig_1_data$A$E_cells_CDS)),rep(3,length(Fig_1_data$A$human)),rep(4,length(Fig_1_data$A$mouse))),unlist(Fig_1_data$A[c(1:5,7)])
       ,pch=c(rep(16,length(Fig_1_data$A$I_cells)),rep(15,length(Fig_1_data$A$I_cells_CDS)),rep(16,length(Fig_1_data$A$E_cells)),rep(15,length(Fig_1_data$A$E_cells_CDS)),rep(16,length(Fig_1_data$A$human)+length(Fig_1_data$A$mouse))))
# Row 1: mean of every panel-A element; row 2: its x position.
x=rbind(sapply(Fig_1_data$A,mean),c(1,1,2,2,3,3,4,4))
# Horizontal mean bars; the global counter y makes every second bar (the CDS
# entries) dashed.
y=0;apply(x,2,function(x) {y<<-y+1;lines(c(x[2]-0.2,x[2]+0.2),c(x[1],x[1]),lty=if(y%%2==0) 2 else 1)})
legend("topright",c("Total genome","Coding sequence"),pch=c(16,15),lty=c(1,2),bty="y",cex=0.8)

# Panel 1B: rate curves over the number of mitoses (E cells blue, I cells red)
# with one large point per cell type.
plot(main="1B",Fig_1_data$B$lines[,"estimated_number_of_mitoses"],Fig_1_data$B$lines[,"E_cells"],type="l",col="blue",log="y",xlab="Estimated number of mitoses",ylab="Mutation rate per mitosis")
lines(Fig_1_data$B$lines[,"estimated_number_of_mitoses"],Fig_1_data$B$lines[,"I_cells"],col="red")
points(Fig_1_data$B$points["estimated_number_of_mitoses",],Fig_1_data$B$points["mutation_rate",],col=c("red","blue"),pch=20,cex=3)
legend("topright",c("Interstitial cells","Epithelial cells"),pch=20,col=c("red","blue"))

# Panel 1C: stacked bars of substitution-class percentages; wider right margin
# and xpd so the legend can sit outside the plot region.
par(mar=c(5, 4, 4, 4) + 0.1,xpd=T)
barplot(main="1C",args.legend=list(x=11.2,y=100,xpd=T,bty="n",cex=0.7),cex.names=0.9,Fig_1_data$C$distribution_samples,col=rainbow(nrow(Fig_1_data$C$distribution_samples)),legend=rownames(Fig_1_data$C$distribution_samples),ylab="% of mutations",names.arg = c(NA,"Interstitial cells",NA,NA,"Epithelial cells",NA,"Human","Mouse"))
# Brackets below the first three and the next three bars.
lines(c(0,3.6),c(-3,-3));lines(c(3.8,7.3),c(-3,-3))

# Panels 1D / 1E: Venn diagrams of the mutation keys; an empty base plot opens
# a new page before the grid object is drawn onto it.
par(mar=c(5, 4, 4, 2) + 0.1)
plot(0,type="n",axes=F,ann=F)
grid.draw(venn.diagram(setNames(Fig_1_data$D$I_cells,c("I-cell 1","I-cell 2","I-cell 3")),NULL,print.mode = "raw"))
title(main="1D")
plot(0,type="n",axes=F,ann=F)
grid.draw(venn.diagram(setNames(Fig_1_data$D$E_cells,c("E-cell 1","E-cell 2","E-cell 3")),NULL,print.mode = "raw"))
title(main="1E")

dev.off()


# Birth date per individual.
birth_dates <- c(
     Hm_175 = "2006-06-28",
     Hm_175_508 = "2006-10-15",
     Hm_175_734 = "2007-02-07",
     Hm_175_1170 = "2008-01-16",
     Hm_175_2003 = "2011-01-04",
     Hm_175_2419 = "2011-11-04",
     Hm_175_2450 = "2011-11-25",
     Hm_175bud1 = "2018-06-05",
     Hm_175bud2 = "2018-06-05",
     Hm_175bud3 = "2018-06-05"
)
# Death date per individual: the first seven share one date, the three buds
# share another.
death_dates <- setNames(
     c(rep("2019-02-06", 7), rep("2018-07-03", 3)),
     names(birth_dates)
)
# Lifespan in years (days between birth and death divided by 365).
individual_ids <- names(birth_dates)
days_alive <- as.numeric(as.Date(death_dates[individual_ids]) - as.Date(birth_dates[individual_ids]))
lifespans <- setNames(days_alive / 365, individual_ids)

# Mutation-count tables (stats.tsv) for the four call sets.
individuals_mut_numbers <- sapply(
     c("somatic", "somatic_merged", "germline", "somatic_downsampled"),
     function(call_set) read.table(paste0("../hydra_genetic_distance_kons/", call_set, "/stats.tsv")),
     simplify = FALSE
)

# Per call set and individual: total bases and genome fraction covered by at
# least 10 reads, summed from the individual's coverage histogram (column 1 =
# read depth, column 2 = bases, column 3 = fraction). One row per individual.
individuals_coverage <- sapply(names(individuals_mut_numbers[1:3]), simplify = FALSE, function(call_set) {
     file_suffix <- if (call_set == "germline") ".cov" else ".min.cov"
     per_individual <- sapply(rownames(individuals_mut_numbers[[call_set]]), function(individual) {
          depth_hist <- read.table(paste0("../hydra_genetic_distance_kons/mapping_stats/", individual, ".", call_set, file_suffix))
          deep_rows <- which(depth_hist[, 1] >= 10)
          c(total = sum(depth_hist[deep_rows, 2]), fraction = sum(depth_hist[deep_rows, 3]))
     })
     t(per_individual)
})
# The downsampled somatic call set uses a different file name pattern.
downsampled_coverage <- sapply(rownames(individuals_mut_numbers$somatic_downsampled), function(individual) {
     depth_hist <- read.table(paste0("../hydra_genetic_distance_kons/mapping_stats/", individual, ".downsampled.somatic.min.cov"))
     deep_rows <- which(depth_hist[, 1] >= 10)
     c(total = sum(depth_hist[deep_rows, 2]), fraction = sum(depth_hist[deep_rows, 3]))
})
individuals_coverage[["somatic_downsampled"]] <- t(downsampled_coverage)


# Genome size = covered bases divided by the covered fraction of the genome
# (first individual of the downsampled somatic set). The previous expression
# divided "total" by "total" and therefore always evaluated to 1.
genome_size <- individuals_coverage$somatic_downsampled[1, "total"] / individuals_coverage$somatic_downsampled[1, "fraction"]

# Mutation count against covered bases, columns 1:3 first, then columns 4:6.
# The first call used to repeat the 4:6 plot; it now follows the 1:3 / 4:6
# order of the two extrapolations below.
plot(mut_rate_merged_sub_bulk["covered_bases", 1:3], mut_rate_merged_sub_bulk["mutations", 1:3])
plot(mut_rate_merged_sub_bulk["covered_bases", 4:6], mut_rate_merged_sub_bulk["mutations", 4:6])
# Mean mutations per base scaled to the whole genome.
genome_size * mean(mut_rate_merged_sub_bulk["mutations_per_base", 1:3]) # I-Cells
genome_size * mean(mut_rate_merged_sub_bulk["mutations_per_base", 4:6]) # E-Cells

# Interval table with one logical-like column per individual; the bookkeeping
# columns "num" and "list" are dropped.
family_175_coverage_overlap_table <- read.table(header = TRUE, "../hydra_genetic_distance_kons/mapping_stats/Family_Hm_175.downsampled.genomecov.filter_minReads10.multi_bed")
family_175_coverage_overlap_table <- family_175_coverage_overlap_table[, setdiff(colnames(family_175_coverage_overlap_table), c("num", "list"))]

# This section needs a parallel cluster, but the script only loads `parallel`
# and creates `cluster` further down. When no cluster object exists yet, load
# the package, start a temporary cluster here and shut it down again at the
# end of the section, so a top-to-bottom run no longer stops at clusterExport().
library(parallel)
owns_cluster <- !(exists("cluster", inherits = FALSE) && inherits(cluster, "cluster"))
if (owns_cluster) cluster <- makeCluster(60)

clusterExport(cluster, "family_175_coverage_overlap_table")
# All combinations of 1..n individuals whose column name contains "Hm_175";
# element k is a matrix with one combination of size k per column.
h <- grep("Hm_175", colnames(family_175_coverage_overlap_table), value = TRUE)
h <- sapply(seq_along(h), function(x) combn(h, x), simplify = FALSE)
# For every combination: summed length of the intervals flagged for all of
# its individuals.
family_175_coverage_overlaps <- sapply(h, function(mat)
     parApply(cluster, mat, 2, function(individuals) {
          x <- family_175_coverage_overlap_table[which(apply(as.data.frame(family_175_coverage_overlap_table[, individuals]), 1, all)), ]
          # bedtools merge adds 1 additional base at start and stop if feature length is greater 1
          # NOTE(review): 2 is subtracted from every interval here, including
          # length-1 intervals, while the per-gene coverage computation later
          # in this script leaves length-1 features unchanged -- confirm which
          # variant is intended.
          lengths <- abs(x$end - x$start) + 1 - 2
          sum(lengths)
     }))

if (owns_cluster) {
     stopCluster(cluster)
     rm(cluster)
}
rm(owns_cluster)

# Median overlap per combination size, and a scatter plot of every overlap as
# a percentage of total/fraction of the first downsampled individual.
sapply(family_175_coverage_overlaps, median)
x <- unlist(sapply(seq_along(family_175_coverage_overlaps), function(x) rep(x, length(family_175_coverage_overlaps[[x]]))))
y <- unlist(family_175_coverage_overlaps) / (individuals_coverage$somatic_downsampled[, 1] / individuals_coverage$somatic_downsampled[, 2])[1] * 100
plot(x, y, pch = 20, xlab = "#Individuals", ylab = "% of genome covered by overlap", cex = 0.5)



# Mutation counts divided by each individual's covered bases ("total"), for
# the three call sets that have a matching coverage table.
individuals_mut_fracs <- sapply(c("somatic", "somatic_merged", "germline"), simplify = FALSE, function(call_set) {
     counts <- individuals_mut_numbers[[call_set]]
     covered_total <- individuals_coverage[[call_set]][rownames(counts), "total"]
     apply(counts, 2, function(column) column / covered_total)
})

# Correlation between column 16 of the somatic fractions and the covered
# genome fraction (second coverage column).
cor(individuals_mut_fracs$somatic[, 16], individuals_coverage$somatic[, 2])
cor.test(individuals_mut_fracs$somatic[, 16], individuals_coverage$somatic[, 2])

# Filtered call table per individual of the downsampled somatic set, read from
# gzipped tab-separated files with five columns.
three_of_six <- sapply(rownames(individuals_mut_numbers$somatic_downsampled), simplify = FALSE, function(sample) {
     vcf_path <- paste0("../hydra_genetic_distance_kons/somatic_downsampled/three_of_six/", sample, ".fixed.nomulti.normed.simple_filter.vcf.gz")
     gz <- gzfile(vcf_path)
     calls <- read.table(gz, sep = "\t", stringsAsFactors = FALSE)
     names(calls) <- c("CHROM", "POS", "REF", "ALT", "TOOLS_CODE")
     calls
})

# Pairwise number of positions (CHROM + POS) shared between individuals.
three_of_six_overlaps <- sapply(three_of_six, function(calls_a) {
     sapply(three_of_six, function(calls_b) {
          nrow(merge(calls_a, calls_b, by = c("CHROM", "POS")))
     })
})

# Venn diagram of the positions called in the individuals whose name contains
# "Hm_22"; everything from "_DNA" onwards is stripped from the set names.
x <- lapply(three_of_six[grep("Hm_22", names(three_of_six))], function(calls) {
     paste(calls$CHROM, calls$POS)
})
names(x) <- sub("_DNA.*", "", names(x))
dir.create("Venn_diagrams")
venn.diagram(x, "Venn_diagrams/Individuals_Hm_22_overlaps.tiff")

# Positions present in every individual whose name contains "Hm_175":
# successive inner joins on CHROM and POS.
x <- three_of_six[grep("Hm_175", names(three_of_six))]
names(x) <- sub("_DNA.*", "", names(x))
Individuals_Hm_175_overlaps_all_samples <- Reduce(
     function(joined, next_calls) merge(joined, next_calls, by = c("CHROM", "POS")),
     x
)

# Position keys ("CHROM POS") per individual whose name contains "Hm_175".
x <- lapply(three_of_six[grep("Hm_175", names(three_of_six))], function(x) paste(x$CHROM, x$POS))
y <- unique(unlist(x))
# For every distinct position: the number of individuals that carry it.
# One vectorised %in% per individual replaces the former per-position loop,
# which ran a separate membership test for every position/individual pair.
# The result is still an integer vector named by position.
Individuals_Hm_175_overlaps_counter <- setNames(
     as.integer(Reduce(`+`, lapply(x, function(a) y %in% a), integer(length(y)))),
     y
)
# Number of positions found in at least 1, 2, ... individuals.
sapply(seq_along(x), function(x) length(which(Individuals_Hm_175_overlaps_counter >= x)))

# Sample sets for the single-cell data: both cell types, their union, and
# every sample on its own.
x <- c(
     list(I_cells = I_cells, E_cells = E_cells, single_cells = c(I_cells, E_cells)),
     setNames(c(I_cells, E_cells), c(I_cells, E_cells))
)


# Distinct position keys ("contig position") found in any member of each set.
mutations_in_sample_sets <- lapply(x, function(set_members) {
     member_keys <- lapply(merged_sub_bulk[set_members], function(calls) paste(calls$contig, calls$position))
     unique(unlist(member_keys))
})

# Sample sets for the individuals: all of them, the two families (names
# starting with "Hm_175" / "Hm_22"), and every individual on its own.
x <- c(
     list(
          individuals = names(three_of_six),
          family_175 = grep("^Hm_175", (names(three_of_six)), value = TRUE),
          family_22 = grep("^Hm_22", (names(three_of_six)), value = TRUE)
     ),
     setNames(names(three_of_six), names(three_of_six))
)
individual_set_keys <- lapply(x, function(set_members) {
     member_keys <- lapply(three_of_six[set_members], function(calls) paste(calls$CHROM, calls$POS))
     unique(unlist(member_keys))
})
mutations_in_sample_sets <- c(mutations_in_sample_sets, individual_set_keys)
# Size of every set.
sapply(mutations_in_sample_sets, length)

# Pairwise number of shared position keys between all sample sets.
mutations_in_sample_sets_overlaps <- sapply(mutations_in_sample_sets, function(set_a) {
     sapply(mutations_in_sample_sets, function(set_b) length(intersect(set_a, set_b)))
})

# Venn diagrams for selected combinations of sample sets, one TIFF each.
venn.diagram(
     mutations_in_sample_sets[c("I_cells", "E_cells", "family_175", "family_22")],
     filename = "Venn_diagrams/I_cells_E_cells_175_22.tiff"
)
venn.diagram(
     mutations_in_sample_sets[c("I_cells", "E_cells", "individuals")],
     filename = "Venn_diagrams/I_cells_E_cells_individuals.tiff"
)
venn.diagram(
     mutations_in_sample_sets[c("single_cells", "individuals")],
     filename = "Venn_diagrams/single_cells_individuals.tiff"
)
venn.diagram(
     mutations_in_sample_sets[c("single_cells", "family_175", "family_22")],
     filename = "Venn_diagrams/single_cells_175_22.tiff"
)
venn.diagram(
     mutations_in_sample_sets[c("family_175", "family_22")],
     filename = "Venn_diagrams/175_22.tiff"
)
venn.diagram(
     mutations_in_sample_sets[c("single_cells", "family_175")],
     filename = "Venn_diagrams/single_cells_175.tiff"
)

# All combinations of 1..n sample sets whose name contains "Hm_175"; element k
# is a matrix with one combination of size k per column.
h <- grep("Hm_175", names(mutations_in_sample_sets), value = TRUE)
h <- sapply(1:length(h), function(k) combn(h, k), simplify = FALSE)

# Position keys shared by all members of one combination.
shared_by_all <- function(individuals) Reduce(intersect, mutations_in_sample_sets[individuals])

# Per combination size: number of shared keys for every combination ...
Hm_175_overlaps_numbers <- sapply(h, function(mat) {
     apply(mat, 2, function(individuals) length(shared_by_all(individuals)))
})

# ... and the shared keys themselves.
Hm_175_overlaps <- sapply(h, function(mat) {
     apply(mat, 2, function(individuals) shared_by_all(individuals))
})

# Per combination size: number of distinct keys shared by at least one
# combination of that size ...
Hm_175_overlaps_numbers_2 <- sapply(h, function(mat) {
     keys_per_combination <- apply(mat, 2, function(individuals) shared_by_all(individuals))
     length(unique(unlist(keys_per_combination)))
})

# ... and those distinct keys.
Hm_175_overlaps_2 <- sapply(h, function(mat) {
     keys_per_combination <- apply(mat, 2, function(individuals) shared_by_all(individuals))
     unique(unlist(keys_per_combination))
})



# Fit the model overlap(n) = K * P^n to the mean observed overlap for
# combination sizes n >= 2. The objective is the square root of the summed
# squared relative errors. (Earlier variants of the error term -- plain
# squared error per combination or per mean -- are no longer used.)
overlap_model_error <- function(v) {
     K <- v[1]
     P <- v[2]
     squared_relative_errors <- sapply(2:length(Hm_175_overlaps_numbers), function(n) {
          expected <- P^n * K
          observed_mean <- mean(Hm_175_overlaps_numbers[[n]])
          ((expected - observed_mean) / observed_mean)^2
     })
     sum(squared_relative_errors)^0.5
}
# Box-constrained optimisation; starts at five times the largest observed
# overlap for K and at 0.10 for P.
h <- optim(
     c(max(unlist(Hm_175_overlaps_numbers)) * 5, 0.10),
     fn = overlap_model_error,
     method = "L-BFGS-B",
     lower = c(10000, 0),
     upper = c(100000000, 1)
)
h$par
prod(h$par)
h$value

# Observed overlaps (black) against the fitted model K * P^n (red), written to
# a PNG file.
png("Hydra_estimated_vs_actual_overlaps_210726.png")
# x: combination size repeated once per combination; y: the observed overlaps.
x=unlist(sapply(1:length(Hm_175_overlaps_numbers),function(x) rep(x,length(Hm_175_overlaps_numbers[[x]]))))
y=unlist(Hm_175_overlaps_numbers)
plot(x,y,pch=20,xlab="#Individuals",ylab="#Overlap",cex=0.5,log="y",ylim=c(1,30000))  
# Model prediction for every combination size, using the fitted parameters
# h$par[1] (K) and h$par[2] (P).
x=1:length(Hm_175_overlaps_numbers)
y=sapply(x,function(n) h$par[2]^n*h$par[1])
points(x,y,col="red",pch=20)
lines(x,y,col="red")
dev.off()


# Write one headerless, tab-separated file per individual with the columns
# CHROM, POS, POS (CRLF line endings).
for (x in names(three_of_six)) {
     bed_rows <- three_of_six[[x]][, c("CHROM", "POS", "POS")]
     bed_path <- paste0("../hydra_genetic_distance_kons/somatic_downsampled/three_of_six/", x, ".bed")
     write.table(
          bed_rows, bed_path,
          col.names = FALSE, row.names = FALSE, quote = FALSE,
          sep = "\t", eol = "\r\n"
     )
}

# Per individual: annotate the calls that fall into coding sequence.
# The "<sample>.CDS.bed" files are read here but not written by this script --
# presumably produced externally from the BED files exported above (TODO confirm).
three_of_six_CDS=sapply(simplify = F,c(names(three_of_six)),function(sample) {
     # Join the CDS BED rows (CHROM/start/stop/gene) with the individual's call
     # table on chromosome and position.
     ret=merge(setNames(read.table(paste0("../hydra_genetic_distance_kons/somatic_downsampled/three_of_six/",sample,".CDS.bed"),sep="\t",stringsAsFactors = F),c("CHROM","start","stop","gene")),three_of_six[[sample]],by.x=c("CHROM","start"),by.y=c("CHROM","POS"))
     # Single-base ranges (start == end) with the ALT allele as variant sequence.
     # NOTE(review): seqnames.field is "chrom" while the column is named "CHROM";
     # this relies on makeGRangesFromDataFrame matching names case-insensitively
     # -- confirm.
     predictCoding=predictCoding(makeGRangesFromDataFrame(ret,seqnames.field = "chrom",start.field="start",end.field = "start"),hydra_TxDb,FaFile("Hm105_Dovetail_Assembly_1.0.fa"),DNAStringSet(ret$ALT))
     consequence=as.character(predictCoding$CONSEQUENCE)
     PROTEINLOC=as.numeric(predictCoding$PROTEINLOC)
     REFAA=predictCoding$REFAA
     VARAA=predictCoding$VARAA
     # For every row of ret: the fraction of its predictCoding hits (matched via
     # QUERYID) that fall into each category. A row without any hit divides by
     # zero and yields NaN.
     ret=cbind(ret,t(sapply(1:nrow(ret),function(x) {
          x=which(predictCoding$QUERYID==x)
          c(nonsense=length(which(consequence[x]=="nonsense")),nonsynonymous=length(which(consequence[x]=="nonsynonymous")),synonymous=length(which(consequence[x]=="synonymous")),start_loss=sum(PROTEINLOC[x]==1),stop_loss=sum(REFAA[x]=="*" & VARAA[x]!="*"))/length(x)
     })))
     # Codon and amino-acid columns are taken from the first hit of each row only.
     i=sapply(1:nrow(ret),function(x) which(predictCoding$QUERYID==x)[1])
     ret=cbind(ret,as.data.frame(predictCoding)[i,c("REFCODON","VARCODON","REFAA","VARAA")])
     ret
})

# Rebuild the combinations of "Hm_175" sample sets (h was overwritten by the
# model fit) and recount the keys shared by every combination.
h <- grep("Hm_175", names(mutations_in_sample_sets), value = TRUE)
h <- sapply(1:length(h), function(k) combn(h, k), simplify = FALSE)

Hm_175_overlaps_numbers <- sapply(h, function(mat) {
     apply(mat, 2, function(individuals) {
          shared_keys <- Reduce(intersect, mutations_in_sample_sets[individuals])
          length(shared_keys)
     })
})


# For every combination size: each "175" individual's CDS rows whose position
# key ("CHROM start") is among the shared keys of that size.
three_of_six_CDS_Hm_175_overlaps <- sapply(Hm_175_overlaps_2, simplify = FALSE, function(overlaps) {
     lapply(three_of_six_CDS[grep("175", names(three_of_six_CDS))], function(mut_table) {
          row_keys <- paste(mut_table$CHROM, mut_table$start)
          mut_table[which(row_keys %in% overlaps), ]
     })
})
# Rows per individual and combination size.
sapply(three_of_six_CDS_Hm_175_overlaps, function(per_individual) sapply(per_individual, nrow))

# For every combination size: number of distinct shared keys that occur in the
# CDS table of at least one "175" individual.
three_of_six_CDS_Hm_175_overlap_numbers <- sapply(Hm_175_overlaps_2, function(overlaps) {
     genic_keys <- lapply(three_of_six_CDS[grep("175", names(three_of_six_CDS))], function(mut_table) {
          row_keys <- paste(mut_table$CHROM, mut_table$start)
          overlaps[which(overlaps %in% row_keys)]
     })
     length(unique(unlist(genic_keys)))
})
# Summary table: all shared keys, genic shared keys, and their percentage.
Hm_175_overlaps_numbers_2 <- rbind(
     all = Hm_175_overlaps_numbers_2,
     genic = three_of_six_CDS_Hm_175_overlap_numbers,
     `%genic` = 100 * three_of_six_CDS_Hm_175_overlap_numbers / Hm_175_overlaps_numbers_2
)

# Summed nonsynonymous fraction per individual and combination size.
sapply(three_of_six_CDS_Hm_175_overlaps, function(per_individual) {
     sapply(per_individual, function(mut_table) sum(mut_table$nonsynonymous))
})

# For every combination size: the distinct genic shared keys themselves.
three_of_six_CDS_Hm_175_overlaps_2 <- sapply(Hm_175_overlaps_2, function(overlaps) {
     genic_keys <- lapply(three_of_six_CDS[grep("175", names(three_of_six_CDS))], function(mut_table) {
          row_keys <- paste(mut_table$CHROM, mut_table$start)
          overlaps[which(overlaps %in% row_keys)]
     })
     unique(unlist(genic_keys))
})

# For every combination size: sum, over the genic shared positions, of the
# nonsynonymous value averaged over the "175" individuals that carry the
# position.
three_of_six_CDS_Hm_175_overlap_nonsynonymous=sapply(three_of_six_CDS_Hm_175_overlaps_2,function(overlaps) sum(sapply(overlaps,function(overlap) {
     # y: 2 x individuals matrix with the rows "present" and "nonsynonymous".
     # NOTE(review): this shape requires at most one CDS row per position and
     # individual; several matching rows would return vectors of different
     # length and break the row lookup below -- confirm the tables are unique
     # per position.
     y=sapply(three_of_six_CDS[grep("175",names(three_of_six_CDS))],function(mut_table){
          x=paste(mut_table$CHROM,mut_table$start)
          #return(mut_table[which(x==overlap),])
          if(!(overlap%in%x)) return(c(present=0,nonsynonymous=0))
          else return(c(present=1,nonsynonymous=mut_table[which(x==overlap),"nonsynonymous"]))
     })
     # Mean nonsynonymous value over the individuals in which the position occurs.
     sum(y["nonsynonymous",])/sum(y["present",])
})))
# Append the nonsynonymous totals and their percentage of the genic overlaps.
Hm_175_overlaps_numbers_2=rbind(Hm_175_overlaps_numbers_2,nonsynonymous=three_of_six_CDS_Hm_175_overlap_nonsynonymous,`%nonsynonymous`=three_of_six_CDS_Hm_175_overlap_nonsynonymous/Hm_175_overlaps_numbers_2["genic",]*100)

# For every combination size: each "175" individual's call rows whose position
# key ("CHROM POS") is among the shared keys of that size.
three_of_six_Hm_175_overlaps <- sapply(Hm_175_overlaps_2, simplify = FALSE, function(overlaps) {
     lapply(three_of_six[grep("175", names(three_of_six))], function(mut_table) {
          row_keys <- paste(mut_table$CHROM, mut_table$POS)
          mut_table[which(row_keys %in% overlaps), ]
     })
})
# Rows per individual and combination size.
sapply(three_of_six_Hm_175_overlaps, function(per_individual) sapply(per_individual, nrow))

# Substitution-class counts of the pooled rows, one column per combination size.
three_of_six_Hm_175_overlaps_mut_distr <- sapply(three_of_six_Hm_175_overlaps, function(per_individual) {
     pooled_calls <- do.call(rbind, per_individual)
     class_counts <- mutation_nucl_distr(pooled_calls, ref_allele = "REF", alt_allele = "ALT")
     setNames(unlist(class_counts), rownames(class_counts))
})

# The same counts as percentages of each column total.
three_of_six_Hm_175_overlaps_mut_distr_percent <- apply(
     three_of_six_Hm_175_overlaps_mut_distr, 2,
     function(class_counts) class_counts / sum(class_counts) * 100
)

# Sample sets for the single-cell data: both cell types, their union, and
# every sample on its own.
x <- c(
     list(I_cells = I_cells, E_cells = E_cells, single_cells = c(I_cells, E_cells)),
     setNames(c(I_cells, E_cells), c(I_cells, E_cells))
)
# Distinct CDS position keys ("chrom start") found in any member of each set.
mutations_in_sample_sets_CDS <- lapply(x, function(set_members) {
     member_keys <- lapply(merged_sub_bulk_CDS[set_members], function(calls) paste(calls$chrom, calls$start))
     unique(unlist(member_keys))
})

# Sample sets for the individuals: all of them, the two families, and every
# individual on its own.
x <- c(
     list(
          individuals = names(three_of_six_CDS),
          family_175 = grep("^Hm_175", (names(three_of_six_CDS)), value = TRUE),
          family_22 = grep("^Hm_22", (names(three_of_six)), value = TRUE)
     ),
     setNames(names(three_of_six), names(three_of_six))
)
individual_set_keys_CDS <- lapply(x, function(set_members) {
     member_keys <- lapply(three_of_six_CDS[set_members], function(calls) paste(calls$CHROM, calls$start))
     unique(unlist(member_keys))
})
mutations_in_sample_sets_CDS <- c(mutations_in_sample_sets_CDS, individual_set_keys_CDS)
# Size of every set.
sapply(mutations_in_sample_sets_CDS, length)




# Combinations of the sample sets whose name contains "Hm_175" (names taken
# from the genome-wide set list).
h <- grep("Hm_175", names(mutations_in_sample_sets), value = TRUE)
h <- sapply(1:length(h), function(k) combn(h, k), simplify = FALSE)



# Per combination size: number of CDS keys shared by every combination.
Hm_175_overlaps_numbers_CDS <- sapply(h, function(mat) {
     apply(mat, 2, function(individuals) {
          shared_keys <- Reduce(intersect, mutations_in_sample_sets_CDS[individuals])
          length(shared_keys)
     })
})


# Per combination size: distinct CDS keys shared by at least one combination.
Hm_175_overlaps_2_CDS <- sapply(h, function(mat) {
     keys_per_combination <- apply(mat, 2, function(individuals) {
          Reduce(intersect, mutations_in_sample_sets_CDS[individuals])
     })
     unique(unlist(keys_per_combination))
})


# Sample sets for the single-cell data: both cell types, their union, and
# every sample on its own.
x <- c(
     list(I_cells = I_cells, E_cells = E_cells, single_cells = c(I_cells, E_cells)),
     setNames(c(I_cells, E_cells), c(I_cells, E_cells))
)
# Distinct CDS position keys of rows whose nonsynonymous value equals 1.
mutations_in_sample_sets_nonsynonymous <- lapply(x, function(set_members) {
     member_keys <- lapply(merged_sub_bulk_CDS[set_members], function(calls) {
          nonsyn_calls <- calls[which(calls$nonsynonymous == 1), ]
          paste(nonsyn_calls$chrom, nonsyn_calls$start)
     })
     unique(unlist(member_keys))
})

# Sample sets for the individuals: all of them, the two families, and every
# individual on its own.
x <- c(
     list(
          individuals = names(three_of_six_CDS),
          family_175 = grep("^Hm_175", (names(three_of_six_CDS)), value = TRUE),
          family_22 = grep("^Hm_22", (names(three_of_six)), value = TRUE)
     ),
     setNames(names(three_of_six), names(three_of_six))
)
individual_set_keys_nonsynonymous <- lapply(x, function(set_members) {
     member_keys <- lapply(three_of_six_CDS[set_members], function(calls) {
          nonsyn_calls <- calls[which(calls$nonsynonymous == 1), ]
          paste(nonsyn_calls$CHROM, nonsyn_calls$start)
     })
     unique(unlist(member_keys))
})
mutations_in_sample_sets_nonsynonymous <- c(mutations_in_sample_sets_nonsynonymous, individual_set_keys_nonsynonymous)
# Size of every set.
sapply(mutations_in_sample_sets_nonsynonymous, length)

# Per combination size: number of nonsynonymous keys shared by every combination.
Hm_175_overlaps_numbers_nonsynonymous <- sapply(h, function(mat) {
     apply(mat, 2, function(individuals) {
          shared_keys <- Reduce(intersect, mutations_in_sample_sets_nonsynonymous[individuals])
          length(shared_keys)
     })
})

# Comma-separated member names of every combination, per combination size.
Hm_175_overlaps_numbers_names <- sapply(h, function(mat) {
     apply(mat, 2, function(individuals) paste(individuals, collapse = ","))
})

# Bundle the overlap counts (all / genic / nonsynonymous) with the combination
# names, save them as RDS and read the file back into x.
Hm_175_overlaps_numbers_combined <- list(
     all = Hm_175_overlaps_numbers,
     genic = Hm_175_overlaps_numbers_CDS,
     nonsynonymous = Hm_175_overlaps_numbers_nonsynonymous,
     names = Hm_175_overlaps_numbers_names
)

saveRDS(Hm_175_overlaps_numbers_combined, "../hydra_genetic_distance_kons/data_for_hans_and_johann.RDS")
x <- readRDS("../hydra_genetic_distance_kons/data_for_hans_and_johann.RDS")


# CDS intervals of the gene models, one row per interval.
gene_model_CDS_table <- read.table("hydra2.0_genemodels.CDS.bed", sep = "\t", stringsAsFactors = FALSE, col.names = c("Chrom", "Start", "Stop", "Gene_ID"))

# Total CDS length per gene: interval lengths (|Stop - Start| + 1) summed by
# gene, in order of first appearance of each Gene_ID.
# The lengths are grouped once with split() instead of rescanning the whole
# table for every gene; vapply() always returns a named numeric vector, also
# for an empty table.
cds_interval_lengths <- abs(gene_model_CDS_table$Stop - gene_model_CDS_table$Start) + 1
gene_groups <- factor(gene_model_CDS_table$Gene_ID, levels = unique(gene_model_CDS_table$Gene_ID))
gene2length <- vapply(split(cds_interval_lengths, gene_groups), sum, numeric(1))
# Total CDS length, mean CDS length per gene, and number of genes.
sum(gene2length);mean(gene2length);length(gene2length)

# BED file with the covered CDS positions for every sample set: cell-type
# pools, single cells, individuals (all, per family, each on its own) and the
# germline set.
x <- c(
     I_cells = "genes_CDS_covered_positions/sample1-3_and_complement.bed",
     E_cells = "genes_CDS_covered_positions/sample5-7_and_complement.bed",
     single_cells = "genes_CDS_covered_positions/sample1-3+5-7_and_complement.bed",
     setNames(
          paste0("genes_CDS_covered_positions/", c(I_cells, E_cells), "_and_complement.bed"),
          c(I_cells, E_cells)
     ),
     individuals = "../hydra_genetic_distance_kons/somatic_downsampled/three_of_six/all.genomecov_CDS_and_complement.bed",
     family_175 = "../hydra_genetic_distance_kons/somatic_downsampled/three_of_six/Hm_175_all.genomecov_CDS_and_complement.bed",
     family_22 = "../hydra_genetic_distance_kons/somatic_downsampled/three_of_six/Hm_22_all.genomecov_CDS_and_complement.bed",
     setNames(
          paste0("../hydra_genetic_distance_kons/somatic_downsampled/three_of_six/", names(three_of_six_CDS), ".genomecov_CDS_and_complement.bed"),
          names(three_of_six_CDS)
     ),
     germline = "../hydra_genetic_distance_kons/germline/three_of_six/Hm_175_all.genomecov_CDS.bed"
)
# Read every file as a four-column table (Chrom, Start, Stop, Gene_ID).
min_coverage_genes <- sapply(names(x), simplify = FALSE, function(set_name) {
     covered <- read.table(x[set_name], sep = "\t", stringsAsFactors = FALSE)
     setNames(covered, c("Chrom", "Start", "Stop", "Gene_ID"))
})
library(parallel)
# Start 60 worker processes and copy the coverage tables to each of them.
cluster=makeCluster(60)
clusterExport(cluster,c("min_coverage_genes"))
# Per sample set (one worker task each) and gene: number of covered CDS bases.
# The worker function only refers to the exported min_coverage_genes, so it
# can run on the workers without further exports.
genes2covered_bases=parSapply(cluster,simplify = F, names(min_coverage_genes),function(sample)
     sapply(unique(min_coverage_genes[[sample]]$Gene_ID),function(Gene_ID) {
          x=min_coverage_genes[[sample]][which(min_coverage_genes[[sample]]$Gene_ID==Gene_ID),]
          lengths=abs(x$Stop-x$Start)+1
          # Features longer than one base are shortened by 2; length-1 features
          # are kept as they are.
          lengths=ifelse(lengths==1,1,lengths-2)#bedtools merge adds 1 additional base at start and stop if feature length is greater 1
          sum(lengths)
     })
)
# Every "Hm_175_overlaps_min_<k>" set reuses the per-gene coverage of
# family_175. The set names are derived from Hm_175_overlaps_2_CDS (one entry
# per combination size), which already exists at this point. The previous code
# took them from three_of_six_CDS_Hm_175_overlaps_2_tables, which is only
# created further down in the script, and assigned through `<<-` inside
# sapply().
for (overlap_set in paste0("Hm_175_overlaps_min_", seq_along(Hm_175_overlaps_2_CDS))) {
     genes2covered_bases[[overlap_set]] <- genes2covered_bases$family_175
}
# The I-cell overlap set reuses the I-cell coverage.
genes2covered_bases$I_cells_overlap <- genes2covered_bases$I_cells

# Total and median covered CDS bases per sample set.
sapply(genes2covered_bases, sum);sapply(genes2covered_bases, median)


# Gene -> GO annotation. The commented-out line is the alternative input file
# (four columns, including Category); the active line reads blast2go_results.tsv with
# the columns Gene_ID, GO_ID and Description.
#GO_table=setNames(read.table("Hmag_2.0_DT.fa.gff.go",sep="\t",stringsAsFactors = F),c("Gene_ID","GO_ID","Category","Description"))#from /misc/paras/data/genomes/Hmag/
GO_table=setNames(read.table("blast2go_results.tsv",sep="\t",stringsAsFactors = F),c("Gene_ID","GO_ID","Description"))#https://research.nhgri.nih.gov/hydra/download/?dl=fa
# Drop everything from the first ".t" onwards in the gene id
# (presumably a transcript suffix -- TODO confirm against the input file).
GO_table$Gene_ID=sub("[.]t.*","",GO_table$Gene_ID)
library(GO.db)
# Keep only rows whose GO_ID is a GOID key of GO.db, then take the description from GO.db.
GO_table=GO_table[which(GO_table$GO_ID%in%keys(GO.db,"GOID")),]
GO_table$Description=Term(GOTERM[GO_table$GO_ID])
# GO_overview: one row per unique GO_ID holding columns 2..ncol(GO_table) of the first
# matching row; it is looked up later as GO_overview[GO_ID,"Description"].
GO_overview=t(sapply(unique(GO_table$GO_ID),function(GO_ID) GO_table[which(GO_table$GO_ID==GO_ID)[1],2:ncol(GO_table)]))
# GO2genes: list mapping each GO_ID to the Gene_IDs annotated with it.
GO2genes=sapply(simplify = F,unique(GO_table$GO_ID),function(GO_ID) GO_table[which(GO_table$GO_ID==GO_ID),"Gene_ID"])
# Console check: number of GO terms with at least 1, 2, ..., 20 genes.
sapply(1:20,function(x) length(which(sapply(GO2genes,length)>=x)))
# genes2GO: one row per gene with its comma-joined GO_IDs and Descriptions.
genes2GO=t(sapply(unique(GO_table$Gene_ID),function(Gene_ID) c(GO_IDs=paste(GO_table[which(GO_table$Gene_ID==Gene_ID),"GO_ID"],collapse = ","),Descriptions=paste(GO_table[which(GO_table$Gene_ID==Gene_ID),"Description"],collapse = ","))))

# Per-gene site counts, read with the first column ("gene") as row names.
# check.names=F keeps the '#'-prefixed column names as given.
syn_and_nonSyn_sites=read.table(check.names=F,"hydra2.0_genemodels.nt.longest_iso.syn_and_nonSyn.tsv",sep="\t",stringsAsFactors = F,row.names=1,col.names=c("gene","#synonymous_sites","#nonsynonymous_sites","#total_sites"))
# Jukes-Cantor style correction of a distance: -(3/4)*log(1-(4/3)*dist).
# dist: numeric vector. A distance of exactly 0 is returned as 0 (not as the -0 the
# formula would produce); NA stays NA.
jukes_cantor <- function(dist) {
     corrected <- -(3/4) * log(1 - (4/3) * dist)
     ifelse(dist == 0, 0, corrected)
}

# Pool the CDS mutation tables of all three_of_six_CDS entries whose name contains
# "175", sort them by position and give every row a position id "<CHROM> <start>".
h <- do.call(rbind, three_of_six_CDS[grep("175", names(three_of_six_CDS))])
h <- h[order(h$CHROM, h$start), ]
h$id <- paste(h$CHROM, h$start)
# Keep the first row of every position id. duplicated() replaces the former
# row-by-row scan over 1:nrow(h) with `<<-` state, which was slow and produced an
# all-NA row for an empty table (1:0 is c(1, 0)).
x <- which(!duplicated(h$id))
h <- h[x, ]
# One table per overlap level: the rows of h whose id is in the corresponding entry of
# Hm_175_overlaps_2_CDS.
three_of_six_CDS_Hm_175_overlaps_2_tables <- setNames(sapply(simplify = FALSE, Hm_175_overlaps_2_CDS, function(overlaps)
     h[which(h$id %in% overlaps), ]), paste0("Hm_175_overlaps_min_", 1:8))
sapply(three_of_six_CDS_Hm_175_overlaps_2_tables, nrow)

# Pool the CDS mutation tables of the three I-cell samples, sort by position and give
# every row a position id "<chrom> <start>".
h <- do.call(rbind, merged_sub_bulk_CDS[c("sample1", "sample2", "sample3")])
h <- h[order(h$chrom, h$start), ]
h$id <- paste(h$chrom, h$start)
# Keep the first row of every position id. duplicated() replaces the former
# row-by-row scan over 1:nrow(h) with `<<-` state, which was slow and produced an
# all-NA row for an empty table (1:0 is c(1, 0)).
x <- which(!duplicated(h$id))
h <- h[x, ]
# Ids present in all three samples, and the pooled rows belonging to them.
x <- Reduce(intersect, mutations_in_sample_sets_CDS[c("sample1", "sample2", "sample3")])
I_cells_overlap_CDS_table <- h[which(h$id %in% x), ]


# h: one named list of all CDS mutation tables (single cells, individuals, Hm_175
# overlap levels, I-cell overlap). Read as a global by gene_mutation_count_fun.
h=c(merged_sub_bulk_CDS,three_of_six_CDS,three_of_six_CDS_Hm_175_overlaps_2_tables,I_cells_overlap=list(I_cells_overlap_CDS_table))
# x: sample-set name -> names of the tables in h that are pooled for that set.
# Only overlap levels 2..8 are included ([2:8]). Also read as a global by
# gene_mutation_count_fun; names(x) is the list of sample sets that gets processed.
x=c(list(I_cells=I_cells,E_cells=E_cells,single_cells=c(I_cells,E_cells)),setNames(c(I_cells,E_cells),c(I_cells,E_cells)),individuals=list(names(three_of_six_CDS)),family_175=list(grep("Hm_175",names(three_of_six_CDS),value = T)),family_22=list(grep("Hm_22",names(three_of_six_CDS),value = T)),setNames(names(three_of_six_CDS),names(three_of_six_CDS)),setNames(names(three_of_six_CDS_Hm_175_overlaps_2_tables)[2:8],names(three_of_six_CDS_Hm_175_overlaps_2_tables)[2:8]),I_cells_overlap="I_cells_overlap")

# Build the per-gene mutation table for one sample set.
# sample_set_name: name of an entry of the global list x; x[[sample_set_name]] selects
#   the tables of the global list h that are pooled, and the same name selects the
#   coverage vector in genes2covered_bases.
# Returns a data.frame with one row per gene (mutated genes plus all remaining names of
# gene2length) holding counts, coverage, expected counts, Fisher p-values/FDR, Ka/Ks and
# the GO annotation.
# Reads the globals h, x, gene2length, genes2covered_bases, syn_and_nonSyn_sites,
# jukes_cantor and genes2GO.
gene_mutation_count_fun=function(sample_set_name){
     # Pool the mutation tables of this sample set and count mutations per gene.
     mutation_table=data.frame(do.call(rbind,h[x[[sample_set_name]]]))
     ret=setNames(data.frame(table(mutation_table$gene)),c("Gene_ID","#mutations"))
     ret$Gene_ID=as.character(ret$Gene_ID)
     genes=unique(ret$Gene_ID)
     # Per gene: column sums of the nonsense / nonsynonymous / synonymous columns.
     ret=cbind(ret,t(sapply(genes,function(gene) colSums(mutation_table[which(mutation_table$gene==gene),c("nonsense","nonsynonymous","synonymous")])) ))
     # Append all genes of gene2length that have no mutation, with zero counts.
     ret=rbind(ret,cbind(Gene_ID=setdiff(names(gene2length),ret$Gene_ID),`#mutations`=0,nonsense=0,nonsynonymous=0,synonymous=0))
     # The cbind() above builds a character matrix, so the count columns are converted
     # back to numeric; `<<-` writes into ret of this function.
     sapply(c("#mutations","nonsense","nonsynonymous","synonymous"), function(x) ret[,x]<<-as.numeric(ret[,x]))
     # Covered bases per gene; genes missing from the coverage vector get 0.
     y=genes2covered_bases[[sample_set_name]]
     z=setdiff(ret$Gene_ID,names(y))
     y=c(y,setNames(rep(0,length(z)),z))
     ret=cbind(ret,`#bases`= gene2length[ret$Gene_ID],syn_and_nonSyn_sites[ret$Gene_ID,c("#synonymous_sites","#nonsynonymous_sites")],`#covered_bases`=y[ret$Gene_ID])
     ret=cbind(ret,`%covered_bases`=ret$`#covered_bases`/ret$`#bases`*100)
     # Expected mutations per gene = overall rate (mutations per covered base) * covered bases.
     total_mutations=sum(ret$`#mutations`)
     total_mutation_rate=total_mutations/sum(ret$`#covered_bases`)
     ret=cbind(ret,`#expected_mutations`=total_mutation_rate*ret$`#covered_bases`)
     ret=cbind(ret,direction=ifelse(ret$`#mutations`>ret$`#expected_mutations`,"HIGHER","LOWER"))
     #ret=cbind(ret,binomial.pvalue=mapply(function(x,n,p) if (n<=0) NA else binom.test(x,n,p)$p.value,ret$`#mutations`,ret$`#covered_bases`,total_mutation_rate))
     #ret=cbind(ret,binomial.fdr=p.adjust(ret$binomial.pvalue,method="fdr"))
     # Fisher test on the 2x2 table (mutated, non-mutated covered bases) of this gene
     # versus all other genes; NA when the gene has no non-mutated covered base (b<=0).
     total_non_mutations=sum(ret$`#covered_bases`)-total_mutations
     ret=cbind(ret,fisher.pvalue=mapply(function(a,b,c,d) if (b<=0) NA else fisher.test(matrix(c(a,b,c,d),nrow=2))$p.value,ret$`#mutations`,ret$`#covered_bases`-ret$`#mutations`,total_mutations-ret$`#mutations`,total_non_mutations-ret$`#covered_bases`+ret$`#mutations`))
     ret=cbind(ret,fisher.fdr=p.adjust(ret$fisher.pvalue,method="fdr")) 
     # Same scheme for the nonsynonymous share: expected count from the overall
     # nonsynonymous/(nonsynonymous+synonymous) ratio, Fisher test of the gene's
     # (nonsynonymous, synonymous) counts versus all other genes; NA when the gene has
     # neither.
     total_nonsynonymous_rate=sum(ret$nonsynonymous)/sum(ret$nonsynonymous+ret$synonymous)
     ret=cbind(ret,`#expected_nonsynonymous`=total_nonsynonymous_rate*(ret$nonsynonymous+ret$synonymous))
     ret=cbind(ret,nonsynonymous_direction=ifelse(ret$nonsynonymous>ret$`#expected_nonsynonymous`,"HIGHER","LOWER"))
     ret=cbind(ret,nonsynonymous.fisher.pvalue=mapply(function(a,b,c,d) if (a+b<=0) NA else fisher.test(matrix(c(a,b,c,d),nrow=2))$p.value,ret$nonsynonymous,ret$synonymous,sum(ret$nonsynonymous)-ret$nonsynonymous,sum(ret$synonymous)-ret$synonymous))
     ret=cbind(ret,nonsynonymous.fisher.fdr=p.adjust(ret$nonsynonymous.fisher.pvalue,method="fdr")) 
     # Ka/Ks: corrected nonsynonymous rate per nonsynonymous site divided by corrected
     # synonymous rate per synonymous site.
     ret=cbind(ret,`Ka/Ks`=jukes_cantor(ret$nonsynonymous/ret$`#nonsynonymous_sites`)/jukes_cantor(ret$synonymous/ret$`#synonymous_sites`) )
     # Attach the GO annotation; genes without an entry in genes2GO get empty strings.
     z=setdiff(ret$Gene_ID,rownames(genes2GO))
     y=rbind(genes2GO,matrix(rep("",2*length(z)),nrow=length(z),ncol=2,dimnames=list(z,colnames(genes2GO))))
     ret=cbind(ret,y[ret$Gene_ID,])
     ret
}
# Copy the globals that gene_mutation_count_fun reads to the workers, then build one
# per-gene table per sample set (names(x)) in parallel.
clusterExport(cluster,c("h","genes2covered_bases","x","gene2length","genes2GO","syn_and_nonSyn_sites","jukes_cantor","gene_mutation_count_fun"))
gene_mutation_count=parSapply(cluster,simplify = F,names(x),gene_mutation_count_fun)

# Aggregate one per-gene mutation table (as returned by gene_mutation_count_fun) to GO
# terms.
# gene_mutation_table: data.frame with the columns Gene_ID, #mutations, #covered_bases,
#   nonsense, nonsynonymous, synonymous, #bases, #synonymous_sites, #nonsynonymous_sites.
# Returns a data.frame with one row per GO term of GO2genes, ordered by fisher.pvalue.
# Reads the globals GO2genes, GO_overview and jukes_cantor.
GO_mutation_count_fun=function(gene_mutation_table){
     #cat(gene_mutation_table,"\n")
     #gene_mutation_table=gene_mutation_count[[gene_mutation_table]]
     # Totals over all genes; they form the background of the tests below.
     total_mutations=sum(gene_mutation_table$`#mutations`)
     total_mutation_rate=sum(gene_mutation_table$`#mutations`)/sum(gene_mutation_table$`#covered_bases`)
     total_non_mutations=sum(gene_mutation_table$`#covered_bases`)-total_mutations
     total_nonsynonymous=sum(gene_mutation_table$nonsynonymous)
     total_synonymous=sum(gene_mutation_table$synonymous)
     total_nonsynonymous_rate=total_nonsynonymous/(total_synonymous+total_nonsynonymous)
     GO_mutation_count=do.call(rbind,sapply(simplify = F,names(GO2genes),function(GO_ID){
          genes=GO2genes[[GO_ID]]
          # Column sums over all genes of this GO term.
          ret=colSums(gene_mutation_table[which(gene_mutation_table$Gene_ID%in%genes),c("#mutations","#covered_bases","nonsense","nonsynonymous","synonymous","#bases","#synonymous_sites","#nonsynonymous_sites")])
          ret["%covered_bases"]=ret["#covered_bases"]/ret["#bases"]*100
          ret=as.list(ret)
          ret$`#expected_mutations`=total_mutation_rate*ret$`#covered_bases`
          ret$direction=if(ret$`#mutations`>ret$`#expected_mutations`) "HIGHER" else "LOWER"
          #ret$binomial.pvalue=if (ret$`#covered_bases`==0) NA else binom.test(ret$`#mutations`,ret$`#covered_bases`,total_mutation_rate)$p.value
          # Fisher test of (mutated, non-mutated covered bases) in the term versus the
          # rest; NA when the term has no covered base.
          ret$fisher.pvalue=if (ret$`#covered_bases`==0) NA else fisher.test(matrix(c(ret$`#mutations`,ret$`#covered_bases`-ret$`#mutations`,total_mutations-ret$`#mutations`,total_non_mutations-ret$`#covered_bases`+ret$`#mutations`),nrow=2))$p.value 
          # Same for the nonsynonymous share; NA when the term has neither synonymous
          # nor nonsynonymous mutations.
          ret$`#expected_nonsynonymous`=total_nonsynonymous_rate*(ret$nonsynonymous+ret$synonymous)
          ret$nonsynonymous_direction=if(ret$nonsynonymous>ret$`#expected_nonsynonymous`) "HIGHER" else "LOWER"
          ret$nonsynonymous.fisher.pvalue=if(ret$synonymous+ret$nonsynonymous<=0) NA else fisher.test(matrix(c(ret$nonsynonymous,ret$synonymous,total_nonsynonymous-ret$nonsynonymous,total_synonymous-ret$synonymous),nrow=2))$p.value
          ret$`Ka/Ks`=jukes_cantor(ret$nonsynonymous/ret$`#nonsynonymous_sites`)/jukes_cantor(ret$synonymous/ret$`#synonymous_sites`)
          # Prepend the term id, its description, the number of genes and the gene list.
          ret=c(GO_ID=GO_ID,Description=GO_overview[GO_ID,"Description"],`#genes`=length(genes),genes=paste0(genes,collapse = ","),ret)
          ret=data.frame(ret,stringsAsFactors = F,check.names = F)
     }))
     #GO_mutation_count$binomial.fdr=p.adjust(GO_mutation_count$binomial.pvalue,method="fdr")
     # FDR correction across all GO terms, then sort by the raw Fisher p-value.
     GO_mutation_count$fisher.fdr=p.adjust(GO_mutation_count$fisher.pvalue,method="fdr")
     GO_mutation_count$nonsynonymous.fisher.fdr=p.adjust(GO_mutation_count$nonsynonymous.fisher.pvalue,method="fdr")
     GO_mutation_count[order(GO_mutation_count$fisher.pvalue),]
}
# Copy the GO lookups and the function to the workers, then build one GO-term table per
# per-gene table in gene_mutation_count.
clusterExport(cluster,c("GO2genes","GO_overview","GO_mutation_count_fun"))
GO_mutation_count=parSapply(cluster,simplify = F,gene_mutation_count, GO_mutation_count_fun)


# Write the list of GO-term tables to an xlsx workbook.
library(openxlsx)
#write.xlsx(GO_mutation_count,"GO_mutation_count.xlsx")#for Hmag_2.0_DT.fa.gff.go
write.xlsx(GO_mutation_count,"GO_mutation_count2.xlsx")#for blast2go_results.tsv


# Pool the mutation tables of all three_of_six entries whose name contains "175", sort
# them by position and give every row a position id "<CHROM> <POS>".
h <- do.call(rbind, three_of_six[grep("175", names(three_of_six))])
h <- h[order(h$CHROM, h$POS), ]
h$id <- paste(h$CHROM, h$POS)
# Keep the first row of every position id. duplicated() replaces the former
# row-by-row scan over 1:nrow(h) with `<<-` state, which was slow and produced an
# all-NA row for an empty table (1:0 is c(1, 0)).
x <- which(!duplicated(h$id))
h <- h[x, ]
# One table per overlap level: the rows of h whose id is in the corresponding entry of
# Hm_175_overlaps_2.
three_of_six_Hm_175_overlaps_2_tables <- setNames(sapply(simplify = FALSE, Hm_175_overlaps_2, function(overlaps)
     h[which(h$id %in% overlaps), ]), paste0("Hm_175_overlaps_min_", 1:8))
sapply(three_of_six_Hm_175_overlaps_2_tables, nrow)

# Presence matrix: one logical column per three_of_six table whose name starts with
# "Hm_175", one row per id of Hm_175_overlaps_2[[1]]; TRUE when the id occurs in that
# table.
Hm_175_overlaps_overview=as.data.frame(sapply(three_of_six[grep("^Hm_175",names(three_of_six),value=T)],function(y) {
     id=paste(y$CHROM,y$POS)
     setNames(Hm_175_overlaps_2[[1]]%in%id,Hm_175_overlaps_2[[1]])
}))
# Add the id as a column and join the variant columns of the first overlap table by id.
Hm_175_overlaps_overview$id=rownames(Hm_175_overlaps_overview)
Hm_175_overlaps_overview=merge(Hm_175_overlaps_overview,three_of_six_Hm_175_overlaps_2_tables[[1]],by="id",all=T)
rownames(Hm_175_overlaps_overview)=Hm_175_overlaps_overview$id
# Flag ids that are in the first CDS overlap set, then join the CDS table
# (no `by` given, so merge() joins on all shared column names).
Hm_175_overlaps_overview$CDS=rownames(Hm_175_overlaps_overview)%in%Hm_175_overlaps_2_CDS[[1]]
Hm_175_overlaps_overview=merge(Hm_175_overlaps_overview,three_of_six_CDS_Hm_175_overlaps_2_tables[[1]],all=T)
# a: the presence columns only; p: fraction of TRUE cells in a.
h=grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)
a=Hm_175_overlaps_overview[,h]
p=sum(unlist(a))/(nrow(a)*ncol(a))

# Linkage score of two logical vectors: the number of positions at which both are TRUE.
simple_linkage <- function(x, y) {
     both_present <- x & y
     sum(both_present)
}
# mutation_linkage calls the method by name on the workers, so the function has to
# exist there.
clusterExport(cluster,"simple_linkage")

# Pairwise linkage between all rows of a mutation matrix.
# mut_matrix: matrix or data.frame, one row per mutation.
# method: function, or name of a function, that takes two rows and returns one score.
#   When a name is given and a cluster is used, the function must exist on the workers.
# cluster: cluster object from the parallel package. With the default NULL the
#   registered default cluster is used if there is one; otherwise the computation runs
#   sequentially in this session (formerly the call simply failed in that case).
# Returns the matrix of scores of every row against every row.
mutation_linkage <- function(mut_matrix, method = "simple_linkage", cluster = NULL) {
     row_vs_all_rows <- function(x)
          apply(mut_matrix, 1, function(y) do.call(method, list(x, y)))
     if (is.null(cluster)) cluster <- parallel::getDefaultCluster()
     if (is.null(cluster)) return(apply(mut_matrix, 1, row_vs_all_rows))
     parallel::parApply(cluster, mut_matrix, 1, row_vs_all_rows)
}

mutation_linkage_table=mutation_linkage(a,method="simple_linkage",cluster=cluster) #-> takes too long -> aborted -> maybe per gene or per pathway?

# Add the per-individual presence columns of Hm_175_overlaps_overview to every overlap
# table (joined by id).
three_of_six_Hm_175_overlaps_2_tables=sapply(simplify = F,three_of_six_Hm_175_overlaps_2_tables,function(x)
     merge(x,Hm_175_overlaps_overview[,c("id",grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T))],by="id"))



# Replace the worker pool. The previous cluster is stopped first; simply overwriting
# the variable would leave its worker processes running for the rest of the session.
if (exists("cluster") && inherits(cluster, "cluster")) try(stopCluster(cluster), silent = TRUE)
# At least one worker, also when detectCores() returns 1 or NA.
cluster <- makeCluster(max(1, detectCores() - 1, na.rm = TRUE))
ids <- three_of_six_Hm_175_overlaps_2_tables$Hm_175_overlaps_min_1$id
h <- grep("^Hm_175", colnames(Hm_175_overlaps_overview), value = TRUE)[1]
clusterExport(cluster, c("ids", "h"))
# For every Hm_175 individual: read its VarDict VCF, keep the positions listed in ids
# and extract the MAF and COV fields. Column V9 holds the field names, V10 is stored
# as the mother's values and V11 as the values of the individual `name`.
# Result: list named by individual; each element is a data.frame with the columns
# id, mother_MAF, mother_COV, <name>.MAF and <name>.COV.
a <- parSapply(cluster, simplify = FALSE, grep("^Hm_175", colnames(Hm_175_overlaps_overview), value = TRUE), function(name){
     x <- read.table(gzfile(paste0("../hydra_genetic_distance_kons/kons_analysis/results/somatic/downsampled_single/variants/bwa/vardict/",name,".fixed.nomulti.normed.vcf.gz")),sep="\t",stringsAsFactors = FALSE)
     x$id <- paste(x$V1, x$V2)
     x <- x[which(x$id %in% ids), c("id","V9","V10","V11")]
     # Positions of MAF and COV are taken from the first row's field list and applied
     # to all rows. Descriptive names avoid shadowing base::c().
     format_keys <- strsplit(x[1, "V9"], ":")[[1]]
     maf_index <- which(format_keys == "MAF")
     cov_index <- which(format_keys == "COV")
     mother_fields <- strsplit(x[, "V10"], ":")
     individual_fields <- strsplit(x[, "V11"], ":")
     x$mother_MAF <- as.numeric(sapply(mother_fields, function(fields) fields[maf_index]))
     x$mother_COV <- as.numeric(sapply(mother_fields, function(fields) fields[cov_index]))
     x[, paste0(name, ".MAF")] <- as.numeric(sapply(individual_fields, function(fields) fields[maf_index]))
     x[, paste0(name, ".COV")] <- as.numeric(sapply(individual_fields, function(fields) fields[cov_index]))
     #  sapply(simplify=F, names(three_of_six_Hm_175_overlaps_2_tables),function(y){
     #    table=a[[y]]
     #    ids=table[which(table[,name]),"id"]
     #    x=x[which(x$id%in%ids),if (h) c("id","mother_MAF","mother_COV","MAF","COV") else c("id","MAF","COV")]
     #   rownames(table)=table$id
     #    table[x$id,if (h) c("mother_MAF","mother_COV",) else paste0(name,c(".MAF",".COV"))]=x[,2:ncol(x)]
     #    table
     #    a[[y]]<<-table
     #  })
     x[, c("id","mother_MAF","mother_COV",paste0(name,c(".MAF",".COV")))]
})
# Copy the MAF/COV values from the per-individual tables in `a` into every overlap
# table. The inner function changes x of the enclosing function through `<<-`; the
# outer function then returns the modified x.
three_of_six_Hm_175_overlaps_2_tables=sapply(simplify = F,three_of_six_Hm_175_overlaps_2_tables,function(x) {
     x[,c("mother_MAF","mother_COV")]=NA
     rownames(x)=x$id
     sapply(simplify = F,a,function(y) {
          # Restrict the individual's table to ids of this overlap table.
          y=y[which(y$id%in%x$id),]
          # Mother values are filled only where they are still NA, i.e. the first
          # individual that provides a position wins.
          h=which(is.na(x[y$id,"mother_MAF"]))
          x[y$id[h],c("mother_MAF","mother_COV")]<<-y[h,c("mother_MAF","mother_COV")]
          # Columns 4:5 of y are the individual's own <name>.MAF and <name>.COV.
          x[y$id,colnames(y)[4:5]]<<-y[,4:5]
     });x})

# Flag the positions that also occur in the first CDS overlap table.
three_of_six_Hm_175_overlaps_2_tables <- lapply(three_of_six_Hm_175_overlaps_2_tables, function(overlap_table) {
     cds_ids <- three_of_six_CDS_Hm_175_overlaps_2_tables[[1]]$id
     overlap_table$CDS <- overlap_table$id %in% cds_ids
     overlap_table
})

# Copy the coding annotation (nonsense / nonsynonymous / synonymous / gene) from the
# first CDS overlap table; positions without a CDS entry keep NA.
three_of_six_Hm_175_overlaps_2_tables <- lapply(three_of_six_Hm_175_overlaps_2_tables, function(overlap_table) {
     annotation_columns <- c("nonsense","nonsynonymous","synonymous","gene")
     overlap_table$nonsense <- NA
     overlap_table$nonsynonymous <- NA
     overlap_table$synonymous <- NA
     overlap_table$gene <- NA
     rownames(overlap_table) <- overlap_table$id
     cds_table <- three_of_six_CDS_Hm_175_overlaps_2_tables[[1]]
     rownames(cds_table) <- cds_table$id
     shared_ids <- intersect(cds_table$id, overlap_table$id)
     overlap_table[shared_ids, annotation_columns] <- cds_table[shared_ids, annotation_columns]
     overlap_table
})

# Number of individuals in which each position is present (row sum of the presence columns).
three_of_six_Hm_175_overlaps_2_tables <- lapply(three_of_six_Hm_175_overlaps_2_tables, function(overlap_table) {
     presence_columns <- grep("^Hm_175", colnames(Hm_175_overlaps_overview), value = TRUE)
     overlap_table$individuals_count <- rowSums(overlap_table[, presence_columns])
     overlap_table
})

# Flag the positions that also occur in any of the merged_sub_bulk tables.
h <- unlist(sapply(merged_sub_bulk, function(cell_table) cell_table$id))
three_of_six_Hm_175_overlaps_2_tables <- lapply(three_of_six_Hm_175_overlaps_2_tables, function(overlap_table) {
     overlap_table$single_cell <- overlap_table$id %in% h
     overlap_table
})
length(which(three_of_six_Hm_175_overlaps_2_tables[[1]]$single_cell))


table <- three_of_six_Hm_175_overlaps_2_tables[[1]]
# Per overlap table: collect the MAF and COV values, split by whether the position is
# flagged as present in the individual ("filtered") or not ("other"), plus the
# mother's values and per-position mean/median MAFs.
MAF_COV_data <- sapply(simplify = FALSE, three_of_six_Hm_175_overlaps_2_tables, function(table) {
     presence_columns <- grep("^Hm_175", colnames(Hm_175_overlaps_overview), value = TRUE)
     maf_values <- table[, grep("Hm_175.*[.]MAF", colnames(table), perl = TRUE)]
     cov_values <- table[, grep("Hm_175.*[.]COV", colnames(table), perl = TRUE)]
     is_present <- as.matrix(table[, presence_columns])

     # Flattened (column by column) vectors over all individuals.
     MAFs <- unlist(maf_values)
     COVs <- unlist(cov_values)
     MUTs <- unlist(table[, presence_columns])

     # MAF tables with the cells of the respective other group set to NA.
     filtered_MAFs_table <- ifelse(is_present, TRUE, NA) * maf_values
     other_MAFs_table <- ifelse(is_present, NA, TRUE) * maf_values

     list(
          filtered_MAFs = MAFs[MUTs],
          other_MAFs = MAFs[!MUTs],
          mother_MAFs = table$mother_MAF,
          filtered_MAFs_mean = rowMeans(filtered_MAFs_table, na.rm = TRUE),
          filtered_MAFs_median = apply(filtered_MAFs_table, 1, function(row_values) median(row_values, na.rm = TRUE)),
          other_MAFs_mean = rowMeans(other_MAFs_table, na.rm = TRUE),
          other_MAFs_median = apply(other_MAFs_table, 1, function(row_values) median(row_values, na.rm = TRUE)),
          filtered_COVs = COVs[MUTs],
          other_COVs = COVs[!MUTs],
          mother_COVs = table$mother_COV
     )
})
# Interactive exploration of MAF_COV_data; all results go to the console or the
# current graphics device.
# Mean and median mother coverage per overlap level.
sapply(MAF_COV_data,function(x) mean(x$mother_COVs,na.rm=T))
sapply(MAF_COV_data,function(x) median(x$mother_COVs,na.rm=T))
# Coverage densities (values below 100 only) for overlap levels 1 and 8.
plot(density(MAF_COV_data$Hm_175_overlaps_min_1$filtered_COVs[which(MAF_COV_data$Hm_175_overlaps_min_1$filtered_COVs<100)]))
plot(density(MAF_COV_data$Hm_175_overlaps_min_1$other_COVs[which(MAF_COV_data$Hm_175_overlaps_min_1$other_COVs<100)]))
plot(density(MAF_COV_data$Hm_175_overlaps_min_1$mother_COVs[which(MAF_COV_data$Hm_175_overlaps_min_1$mother_COVs<100)]))
plot(density(MAF_COV_data$Hm_175_overlaps_min_8$filtered_COVs[which(MAF_COV_data$Hm_175_overlaps_min_8$filtered_COVs<100)]))
plot(density(MAF_COV_data$Hm_175_overlaps_min_8$mother_COVs[which(MAF_COV_data$Hm_175_overlaps_min_8$mother_COVs<100)]))
# Two-sample t-tests of coverage between overlap levels.
t.test(MAF_COV_data$Hm_175_overlaps_min_1$filtered_COVs,MAF_COV_data$Hm_175_overlaps_min_8$filtered_COVs)
t.test(MAF_COV_data$Hm_175_overlaps_min_7$filtered_COVs,MAF_COV_data$Hm_175_overlaps_min_8$filtered_COVs)
t.test(MAF_COV_data$Hm_175_overlaps_min_7$mother_COVs,MAF_COV_data$Hm_175_overlaps_min_8$mother_COVs)

# Distribution of the per-position MAF difference to the mother (overlap level 1).
plot(density(na.omit(MAF_COV_data$Hm_175_overlaps_min_1$filtered_MAFs_median-MAF_COV_data$Hm_175_overlaps_min_1$mother_MAFs)))
plot(density(na.omit(MAF_COV_data$Hm_175_overlaps_min_1$filtered_MAFs_mean-MAF_COV_data$Hm_175_overlaps_min_1$mother_MAFs)))
plot(density(na.omit(MAF_COV_data$Hm_175_overlaps_min_1$other_MAFs_mean-MAF_COV_data$Hm_175_overlaps_min_1$mother_MAFs)))
# Deciles of the same differences.
quantile(na.omit(MAF_COV_data$Hm_175_overlaps_min_1$filtered_MAFs_mean-MAF_COV_data$Hm_175_overlaps_min_1$mother_MAFs),probs = seq(0,1,0.1))
quantile(na.omit(MAF_COV_data$Hm_175_overlaps_min_1$other_MAFs_mean-MAF_COV_data$Hm_175_overlaps_min_1$mother_MAFs),probs = seq(0,1,0.1))

# One panel per overlap level (4x2 grid) with the density of the filtered MAFs (solid),
# the other MAFs (dashed) and the mother MAFs (orange); written to
# Variant_overlaps_MAFs.png.
png("Variant_overlaps_MAFs.png",width=1000,height=1000)
par(mfrow=c(4,2))
sapply(MAF_COV_data,function(x){
     with(x,{
          filtered_density <- density(filtered_MAFs)
          # Stays NULL when there are no "other" MAFs. Formerly the variable was left
          # undefined in that case although the ylim computation below still read it.
          other_density <- NULL
          if(length(other_MAFs)>0) other_density <- density(other_MAFs)
          mother_density <- density(mother_MAFs)
          # max() ignores the NULL contributed by a missing "other" density.
          plot(ylim=c(0,max(filtered_density$y,other_density$y,mother_density$y)),filtered_density,main=paste0("Filtered MAFs, n=",length(filtered_MAFs)," mean=",format(mean(filtered_MAFs),digits=2),", sd=",format(sd(filtered_MAFs),digits=2),
                                                        "\nOther MAFs n=",length(other_MAFs),", mean=",format(mean(other_MAFs),digits=2),", sd=",format(sd(other_MAFs),digits=2),
                                                        "\nMother MAFs n=",length(mother_MAFs),", mean=",format(mean(mother_MAFs),digits=2),", sd=",format(sd(mother_MAFs),digits=2)))
          if(!is.null(other_density)) lines(other_density,lty=2)
          lines(mother_density,col="orange")
          legend("topright",c("Filtered MAFs","Other MAFs","Mother MAFs"),col=c("black","black","orange"),lty=c(1,2,1))
     })
})
dev.off()

# Coverage box plots per overlap level, written to Variant_overlaps_COVs.png.
# The list elements are addressed by their full names (filtered_COVs, other_COVs,
# mother_COVs); the former x$filtered_COV etc. only worked through partial matching of
# `$`. The box labels follow the number of overlap levels instead of a fixed 1:8.
png("Variant_overlaps_COVs.png",width=960,height=960)
par(mfrow=c(3,1))
boxplot(sapply(MAF_COV_data,function(x) x[["filtered_COVs"]]),ylim=c(0,50),main="Coverage filtered variants",names=seq_along(MAF_COV_data))
boxplot(sapply(MAF_COV_data,function(x) x[["other_COVs"]]),ylim=c(0,50),main="Coverage other variants (removed during filtering)",names=seq_along(MAF_COV_data))
boxplot(sapply(MAF_COV_data,function(x) x[["mother_COVs"]]),ylim=c(0,50),main="Coverage mother (at variant positions)",names=seq_along(MAF_COV_data))
dev.off()

# Add one <individual>.pval column per Hm_175 individual to every overlap table:
# one-sided Fisher test (alternative "greater") on the 2x2 table of variant and
# non-variant read counts of the individual versus the mother.
three_of_six_Hm_175_overlaps_2_tables <- sapply(simplify = FALSE, three_of_six_Hm_175_overlaps_2_tables, function(table) {
     MAFs_table <- table[, grep("Hm_175.*[.]MAF", colnames(table), perl = TRUE)]
     COVs_table <- table[, grep("Hm_175.*[.]COV", colnames(table), perl = TRUE)]
     # Read counts are reconstructed from allele frequency * coverage.
     buds_tumor_reads <- round(MAFs_table * COVs_table)
     buds_normal_reads <- COVs_table - buds_tumor_reads
     mother_tumor_reads <- round(table$mother_MAF * table$mother_COV)
     mother_normal_reads <- table$mother_COV - mother_tumor_reads
     clusterExport(cluster, c("buds_tumor_reads","buds_normal_reads","mother_tumor_reads","mother_normal_reads"), envir = environment())
     ret <- t(parSapply(cluster, seq_len(nrow(buds_tumor_reads)), function(x) sapply(seq_len(ncol(buds_tumor_reads)), function(y){
          counts <- c(buds_tumor_reads[x, y], mother_tumor_reads[x], buds_normal_reads[x, y], mother_normal_reads[x])
          # NA as soon as any of the four counts is missing. Formerly only the
          # individual's count was checked (twice), so a missing mother count made
          # fisher.test() fail.
          if (anyNA(counts)) NA
          else fisher.test(matrix(nrow = 2, counts), alternative = "greater")$p.value
     })))
     colnames(ret) <- paste0(grep("^Hm_175", colnames(Hm_175_overlaps_overview), value = TRUE), ".pval")
     cbind(table, ret)
})
# Console check: distribution of the number of individuals per position (overlap level 1).
table(rowSums(three_of_six_Hm_175_overlaps_2_tables[[1]][,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)]))

# Add one <individual>.FDR column per Hm_175 individual to every overlap table.
three_of_six_Hm_175_overlaps_2_tables=sapply(simplify = F,three_of_six_Hm_175_overlaps_2_tables,function(table) {
     # Keep the p-value only where the position is flagged as present in the
     # individual; all other cells become NA.
     pvalue_table=ifelse(as.matrix(table[,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)]),T,NA)*table[,grep("Hm_175.*[.]pval",colnames(table),perl=T)]
     fdr_table=sapply(grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T),function(name) {
          # The number of tests for the correction is taken from
          # individuals_mut_numbers$germline, not from the number of p-values here.
          fdr_n=individuals_mut_numbers$germline[name,"three_of_six.simple_filter"]
          p.adjust(pvalue_table[,paste0(name,".pval")],method="fdr",n=fdr_n)
     })
     colnames(fdr_table)=paste0(grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T),".FDR")
     # NOTE(review): only the first 40 columns of the table are kept; this hard-coded
     # count presumably drops FDR columns from an earlier run -- confirm that it
     # matches the current column layout.
     cbind(table[,1:40],fdr_table)
})


# Filter variant 1: a position counts as present in an individual only when its FDR is
# below 0.05 (NA counts as not present); keep positions present in at least one
# individual.
three_of_six_Hm_175_overlaps_2_tables_filtered=three_of_six_Hm_175_overlaps_2_tables[[1]] 
h=three_of_six_Hm_175_overlaps_2_tables_filtered[,grep("Hm_175.*[.]FDR",colnames(three_of_six_Hm_175_overlaps_2_tables_filtered),perl=T)]<0.05
three_of_six_Hm_175_overlaps_2_tables_filtered[,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)]=ifelse(is.na(h),F,h)
three_of_six_Hm_175_overlaps_2_tables_filtered=three_of_six_Hm_175_overlaps_2_tables_filtered[which(rowSums(three_of_six_Hm_175_overlaps_2_tables_filtered[,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)])>=1),]
# Console check: remaining positions, and remaining positions per individual.
nrow(three_of_six_Hm_175_overlaps_2_tables_filtered)
colSums(three_of_six_Hm_175_overlaps_2_tables_filtered[,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)])  


# Remaining positions that lie in CDS, with their coding annotation and GO terms.
h=three_of_six_Hm_175_overlaps_2_tables_filtered[which(three_of_six_Hm_175_overlaps_2_tables_filtered$id%in%Hm_175_overlaps_overview[which(Hm_175_overlaps_overview$CDS),"id"]),]
nrow(h)
h=merge(h,Hm_175_overlaps_overview[,c("id","gene","nonsense","nonsynonymous","synonymous")],by="id")
View(h)
h$gene
genes2GO[h$gene[which(h$gene%in%rownames(genes2GO))],]#-> not related to any GO terms

# Filter variant 2: a position counts as present in an individual when its MAF exceeds
# the mother's MAF by more than 0.4 (NA counts as not present).
three_of_six_Hm_175_overlaps_2_tables_filtered2=three_of_six_Hm_175_overlaps_2_tables[[1]] 
h=three_of_six_Hm_175_overlaps_2_tables_filtered2[,grep("Hm_175.*[.]MAF",colnames(three_of_six_Hm_175_overlaps_2_tables_filtered2),perl=T)]
h=(h-MAF_COV_data$Hm_175_overlaps_min_1$mother_MAFs)>0.4
three_of_six_Hm_175_overlaps_2_tables_filtered2[,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)]=ifelse(is.na(h),F,h)
# Filter variant 3: as variant 2, and additionally the mother's MAF has to be exactly 0.
three_of_six_Hm_175_overlaps_2_tables_filtered3=three_of_six_Hm_175_overlaps_2_tables_filtered2
three_of_six_Hm_175_overlaps_2_tables_filtered3[,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)]=three_of_six_Hm_175_overlaps_2_tables_filtered3[,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)] & MAF_COV_data$Hm_175_overlaps_min_1$mother_MAFs==0

# Variant 2: keep positions present in at least one individual and print the counts
# (positions, positions per individual, distribution of individuals per position).
three_of_six_Hm_175_overlaps_2_tables_filtered2=three_of_six_Hm_175_overlaps_2_tables_filtered2[which(rowSums(three_of_six_Hm_175_overlaps_2_tables_filtered2[,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)])>=1),]
nrow(three_of_six_Hm_175_overlaps_2_tables_filtered2)
colSums(three_of_six_Hm_175_overlaps_2_tables_filtered2[,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)])  
table(rowSums(three_of_six_Hm_175_overlaps_2_tables_filtered2[,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)]))  

# Variant 3: the same reduction and counts.
three_of_six_Hm_175_overlaps_2_tables_filtered3=three_of_six_Hm_175_overlaps_2_tables_filtered3[which(rowSums(three_of_six_Hm_175_overlaps_2_tables_filtered3[,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)])>=1),]
nrow(three_of_six_Hm_175_overlaps_2_tables_filtered3)
colSums(three_of_six_Hm_175_overlaps_2_tables_filtered3[,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)])  
table(rowSums(three_of_six_Hm_175_overlaps_2_tables_filtered3[,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)]))

# Density of the mother coverage (values up to 100) for overlap level 1.
plot(density(na.omit(three_of_six_Hm_175_overlaps_2_tables[[1]][which(three_of_six_Hm_175_overlaps_2_tables[[1]]$mother_COV<=100),"mother_COV"])))
# Filter variant 4: keep positions with a mother coverage of at least t.
three_of_six_Hm_175_overlaps_2_tables_filtered4=three_of_six_Hm_175_overlaps_2_tables[[1]] 
# NOTE: this global variable is named like base::t(); calls to t() still resolve to
# the function.
t=40
three_of_six_Hm_175_overlaps_2_tables_filtered4=three_of_six_Hm_175_overlaps_2_tables_filtered4[which(three_of_six_Hm_175_overlaps_2_tables_filtered4$mother_COV>=t),]
nrow(three_of_six_Hm_175_overlaps_2_tables_filtered4)
table(rowSums(three_of_six_Hm_175_overlaps_2_tables_filtered4[,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)]))  

# Number of CDS positions after and before the coverage filter.
length(which(three_of_six_Hm_175_overlaps_2_tables_filtered4$id%in%three_of_six_CDS_Hm_175_overlaps_2_tables$Hm_175_overlaps_min_1$id))
length(which(three_of_six_Hm_175_overlaps_2_tables[[1]]$id%in%three_of_six_CDS_Hm_175_overlaps_2_tables$Hm_175_overlaps_min_1$id))

# Mother coverage (log x axis) against the mother MAF and against the MAF difference.
# NOTE: `$mother_MAF` resolves to the list element mother_MAFs through partial
# matching of `$`.
plot(MAF_COV_data$Hm_175_overlaps_min_1$mother_COVs,MAF_COV_data$Hm_175_overlaps_min_1$mother_MAF,pch=20,cex=0.1,log="x")
plot(MAF_COV_data$Hm_175_overlaps_min_1$mother_COVs,MAF_COV_data$Hm_175_overlaps_min_1$filtered_MAFs_mean-MAF_COV_data$Hm_175_overlaps_min_1$mother_MAF,pch=20,cex=0.1,log="x")

# MAF_COV_data2: long-format table with one row per (position, individual) pair in
# which the position is flagged as present. h holds the indices of those pairs in the
# column-by-column flattening of the presence columns; per-position values are repeated
# once per individual with rep() before they are indexed with h.
h=which(unlist(three_of_six_Hm_175_overlaps_2_tables[[1]][,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)]))
# MAF difference individual - mother.
x=three_of_six_Hm_175_overlaps_2_tables[[1]][,grep("Hm_175.*[.]MAF",colnames(three_of_six_Hm_175_overlaps_2_tables[[1]]),perl=T)]
x=unlist(x-MAF_COV_data$Hm_175_overlaps_min_1$mother_MAFs)[h]
y=rep(three_of_six_Hm_175_overlaps_2_tables[[1]]$mother_MAF,length(grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)))[h]
z=unlist(three_of_six_Hm_175_overlaps_2_tables[[1]][,grep("Hm_175.*[.]pval",colnames(three_of_six_Hm_175_overlaps_2_tables[[1]]),perl=T)])[h]
MAF_COV_data2=as.data.frame(cbind(mother_MAF=y,MAF_dif=x,pval=z,logp=-log(z)))
# Coverage of mother and individual, and the smaller of the two.
y=rep(three_of_six_Hm_175_overlaps_2_tables[[1]]$mother_COV,length(grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)))[h]
MAF_COV_data2=as.data.frame(cbind(MAF_COV_data2,mother_COV=y))
x=unlist(three_of_six_Hm_175_overlaps_2_tables[[1]][,grep("Hm_175.*[.]COV",colnames(three_of_six_Hm_175_overlaps_2_tables[[1]]),perl=T)])[h]
MAF_COV_data2=cbind(MAF_COV_data2,bud_COV=x,min_mother_bud_cov=pmin(MAF_COV_data2$mother_COV,x))
x=unlist(three_of_six_Hm_175_overlaps_2_tables[[1]][,grep("Hm_175.*[.]MAF",colnames(three_of_six_Hm_175_overlaps_2_tables[[1]]),perl=T)])[h]
MAF_COV_data2=cbind(MAF_COV_data2,bud_MAF=x)
# Number of individuals sharing the position, FDR and the significance flag (FDR < 0.05).
MAF_COV_data2$individuals_count=rep(rowSums(three_of_six_Hm_175_overlaps_2_tables[[1]][,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)]),length(grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)))[h]
MAF_COV_data2$FDR=unlist(three_of_six_Hm_175_overlaps_2_tables[[1]][,grep("Hm_175.*[.]FDR",colnames(three_of_six_Hm_175_overlaps_2_tables[[1]]),perl=T)])[h]
MAF_COV_data2$significant=MAF_COV_data2$FDR<0.05
# Position id and membership flags: CDS, found in single-cell data, found homozygous in
# single-cell data, part of the I-cell overlap.
MAF_COV_data2$id=rep(three_of_six_Hm_175_overlaps_2_tables[[1]]$id,length(grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)))[h]
MAF_COV_data2$CDS=MAF_COV_data2$id%in%three_of_six_CDS_Hm_175_overlaps_2_tables[[1]]$id
x=unlist(sapply(merged_sub_bulk,function(x) x$id))
MAF_COV_data2$single_cell=MAF_COV_data2$id%in%x
x=unlist(sapply(merged_sub_bulk,function(x) x[which(x$homozygous),"id"]))
MAF_COV_data2$single_cell_homozygous=MAF_COV_data2$id%in%x
# NOTE(review): I_cells_overlap is not defined in this part of the script; presumably
# a vector of position ids -- confirm.
MAF_COV_data2$I_cell_overlap=MAF_COV_data2$id%in%I_cells_overlap
MAF_COV_data2$nonsynonymous=rep(as.logical(three_of_six_Hm_175_overlaps_2_tables[[1]]$nonsynonymous),length(grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)))[h]
# Hover text for the interactive plots.
MAF_COV_data2$hover=paste0(MAF_COV_data2$id,"\nMother Cov=",MAF_COV_data2$mother_COV,"\nBud Cov=",MAF_COV_data2$bud_COV,"\nMother MAF=",MAF_COV_data2$mother_MAF,"\nBud MAF=",MAF_COV_data2$bud_MAF,"\nIndividual Count=",MAF_COV_data2$individuals_count,"\nCDS=",MAF_COV_data2$CDS,"\nnonsynonymous=",MAF_COV_data2$nonsynonymous)

# Map numeric values to colours of a gradient.
# x: numeric vector; NA values yield NA.
# colors: colours spanning the gradient, from the smallest to the largest value of x.
# colsteps: number of colours in the gradient.
# Returns a character vector of colour codes with one entry per element of x.
color_gradient <- function(x, colors=c("green","red"), colsteps=100) {
     ramp <- colorRampPalette(colors)(colsteps)
     lower <- min(x, na.rm = TRUE)
     upper <- max(x, na.rm = TRUE)
     # Equally spaced break points over the observed range; findInterval() gives the
     # index of the step each value falls into.
     breaks <- seq(lower, upper, length.out = colsteps)
     ramp[findInterval(x, breaks)]
}
# Point colour for the base-graphics plots below: one colour per row, taken
# from color_gradient() applied to the number of individuals carrying the variant.
# (Alternative kept for reference: colour by p-value rank.)
#MAF_COV_data2$col=color_gradient(order(MAF_COV_data2$pval))
MAF_COV_data2$col=color_gradient(MAF_COV_data2$individuals_count)


library(plotly)
# Scatter plots of coverage (log-scaled x axis, starting at 10) against MAF or
# MAF difference, for the mother, the bud, and the minimum of both coverages.
# MAF_dif and mother_MAF are columns created earlier in the script.
plot(MAF_COV_data2$mother_COV,MAF_COV_data2$mother_MAF,pch=20,cex=0.1,log="x",col=MAF_COV_data2$col,xlim = c(10,max(MAF_COV_data2$mother_COV,na.rm = T)))
plot(MAF_COV_data2$mother_COV,MAF_COV_data2$MAF_dif,pch=20,cex=0.1,log="x",col=MAF_COV_data2$col,xlim = c(10,max(MAF_COV_data2$mother_COV,na.rm = T)))
plot(MAF_COV_data2$bud_COV,MAF_COV_data2$bud_MAF,pch=20,cex=0.1,log="x",col=MAF_COV_data2$col,xlim = c(10,max(MAF_COV_data2$bud_COV,na.rm = T)))
plot(MAF_COV_data2$bud_COV,MAF_COV_data2$MAF_dif,pch=20,cex=0.1,log="x",col=MAF_COV_data2$col,xlim = c(10,max(MAF_COV_data2$bud_COV,na.rm = T)))
plot(MAF_COV_data2$min_mother_bud_cov,MAF_COV_data2$MAF_dif,pch=20,cex=0.1,log="x",col=MAF_COV_data2$col,xlim = c(10,max(MAF_COV_data2$min_mother_bud_cov,na.rm = T)))

# Interactive 3D views: mother coverage (x) vs. bud coverage (y) vs. MAF
# difference (z). All three plots share the same axis layout.
coverage_scene <- list(
  xaxis = list(
    title = "Coverage mother", type = "log",
    range = log(c(10, max(MAF_COV_data2$mother_COV, na.rm = TRUE)), base = 10)
  ),
  yaxis = list(
    type = "log", title = "Coverage bud",
    range = log(c(10, max(MAF_COV_data2$bud_COV, na.rm = TRUE)), base = 10)
  ),
  zaxis = list(title = "MAF dif")
)
abs_maf_dif <- abs(MAF_COV_data2$MAF_dif)
significance_size <- ifelse(MAF_COV_data2$significant, 1, 2)
round_marker <- list(symbol = "circle", sizemode = "diameter")

# 1) All variants, coloured by the number of individuals carrying them;
#    marker size distinguishes FDR-significant from other variants.
a <- plot_ly(
  MAF_COV_data2,
  x = ~mother_COV, y = ~bud_COV, z = abs_maf_dif,
  color = ~individuals_count, colorscale = c('#FFE1A1', '#683531'),
  showscale = TRUE, marker = round_marker,
  size = significance_size, sizes = c(10, 3),
  text = MAF_COV_data2$hover, hoverinfo = "text"
)
a <- add_markers(a)
a <- layout(a, scene = coverage_scene)
a

# 2) All variants, coloured by class: FDR significant, confirmed in single
#    cells, or neither.
variant_class <- ifelse(
  MAF_COV_data2$significant,
  "FDR significant",
  ifelse(MAF_COV_data2$single_cell, "single cell confirmed", "other")
)
a <- plot_ly(
  MAF_COV_data2,
  x = ~mother_COV, y = ~bud_COV, z = abs_maf_dif,
  color = variant_class,
  showscale = TRUE, marker = round_marker,
  size = significance_size, sizes = c(10, 3)
)
a <- add_markers(a)
a <- layout(a, scene = coverage_scene)
a

# 3) FDR-significant variants only, signed MAF difference on z; CDS variants
#    are drawn with an "x" symbol.
significant_rows <- MAF_COV_data2[which(MAF_COV_data2$significant), ]
a <- plot_ly(
  significant_rows,
  x = ~mother_COV, y = ~bud_COV, z = ~MAF_dif,
  color = ~individuals_count, colorscale = c('#FFE1A1', '#683531'),
  showscale = TRUE,
  marker = list(symbol = ifelse(significant_rows$CDS, "x", "circle"), sizemode = "diameter"),
  size = 1, sizes = c(10, 3),
  text = significant_rows$hover, hoverinfo = "text"
)
a <- add_markers(a)
a <- layout(a, scene = coverage_scene)
a


# Median p-value overall, and scaled by the number of germline variants of the
# reference sample.
germline_variant_total <- individuals_mut_numbers$germline["Hm_175_DNA_v4_S4", "three_of_six.simple_filter"]
overall_median_pval <- median(MAF_COV_data2$pval, na.rm = TRUE)
overall_median_pval
overall_median_pval * germline_variant_total
nrow(three_of_six_Hm_175_overlaps_2_tables[[1]])

# Among FDR-significant variants: does the FDR correlate with the number of
# individuals carrying the variant?
x <- MAF_COV_data2[which(MAF_COV_data2$significant), ]
cor(x$FDR, x$individuals_count, method = "spearman")
cor.test(x$FDR,x$individuals_count,method = "spearman")

# CDS variants: count, significant ones, median p-value inside vs. outside CDS.
in_cds <- MAF_COV_data2$CDS
sum(in_cds, na.rm = TRUE)
MAF_COV_data2[which(in_cds & MAF_COV_data2$significant), ]
median(MAF_COV_data2$pval[which(in_cds)], na.rm = TRUE)
median(MAF_COV_data2$pval[which(!in_cds)], na.rm = TRUE)

# FDR correction restricted to CDS variants, using 3 % of all germline variants
# as the number of comparisons.
x <- p.adjust(
  MAF_COV_data2$pval[which(in_cds)],
  method = "fdr",
  n = germline_variant_total * 0.03
)
sum(x < 0.05, na.rm = TRUE)


# Nonsynonymous variants: count, significant ones, median p-value vs. the rest.
is_nonsyn <- MAF_COV_data2$nonsynonymous
pval_nonsyn <- MAF_COV_data2$pval[which(is_nonsyn)]
pval_not_nonsyn <- MAF_COV_data2$pval[which(!is_nonsyn)]
sum(is_nonsyn, na.rm = TRUE)
MAF_COV_data2[which(is_nonsyn & MAF_COV_data2$significant), ]
median(pval_nonsyn, na.rm = TRUE)
median(pval_not_nonsyn, na.rm = TRUE)

# 3D view coloured by coding class (not CDS / CDS synonymous / CDS nonsynonymous).
coding_class <- ifelse(
  MAF_COV_data2$CDS,
  ifelse(as.logical(is_nonsyn), "CDS Nonsynonymous", "CDS Synonymous"),
  "not CDS"
)
a <- plot_ly(
  MAF_COV_data2,
  x = ~mother_COV, y = ~bud_COV, z = abs(MAF_COV_data2$MAF_dif),
  color = coding_class,
  showscale = TRUE,
  marker = list(symbol = "circle", sizemode = "diameter"),
  size = ifelse(MAF_COV_data2$significant, 1, 2), sizes = c(10, 3),
  text = MAF_COV_data2$hover, hoverinfo = "text"
)
a <- add_markers(a)
a <- layout(a, scene = list(
  xaxis = list(
    title = "Coverage mother", type = "log",
    range = log(c(10, max(MAF_COV_data2$mother_COV, na.rm = TRUE)), base = 10)
  ),
  yaxis = list(
    type = "log", title = "Coverage bud",
    range = log(c(10, max(MAF_COV_data2$bud_COV, na.rm = TRUE)), base = 10)
  ),
  zaxis = list(title = "MAF dif")
))
a

# p-value deciles, then a finer grid over the lowest 10 %.
decile_probs <- seq(0, 1, 0.1)
low_tail_probs <- seq(0, 0.1, 0.005)
quantile(pval_nonsyn, decile_probs, na.rm = TRUE)
quantile(pval_not_nonsyn, decile_probs, na.rm = TRUE)
quantile(pval_nonsyn, low_tail_probs, na.rm = TRUE)
quantile(pval_not_nonsyn, low_tail_probs, na.rm = TRUE)

# Variants that were also found in the single-cell data.
in_single_cell <- MAF_COV_data2$single_cell
sum(in_single_cell, na.rm = TRUE)
MAF_COV_data2[which(in_single_cell & MAF_COV_data2$significant), ]
median(MAF_COV_data2$pval[which(!in_single_cell)], na.rm = TRUE)
median(MAF_COV_data2$pval[which(in_single_cell)], na.rm = TRUE)

# ... and those that are homozygous in the single-cell data.
sc_homozygous <- MAF_COV_data2$single_cell_homozygous
sum(sc_homozygous, na.rm = TRUE)
MAF_COV_data2[which(sc_homozygous & MAF_COV_data2$significant), ]
median(MAF_COV_data2$pval[which(!sc_homozygous)], na.rm = TRUE)
median(MAF_COV_data2$pval[which(sc_homozygous)], na.rm = TRUE)

# Variants shared by all I-cell samples: count, median and individual p-values.
in_i_cell_overlap <- MAF_COV_data2$I_cell_overlap
sum(in_i_cell_overlap, na.rm = TRUE)
median(MAF_COV_data2$pval[which(in_i_cell_overlap)], na.rm = TRUE)
MAF_COV_data2$pval[which(in_i_cell_overlap)]

# Single-cell variants that are not homozygous there.
sc_not_homozygous <- in_single_cell & !sc_homozygous
sum(sc_not_homozygous, na.rm = TRUE)
median(MAF_COV_data2$pval[which(!sc_not_homozygous)], na.rm = TRUE)
median(MAF_COV_data2$pval[which(sc_not_homozygous)], na.rm = TRUE)

# p-value densities of all variants and of selected subsets, overlaid on a
# log-scaled x axis. Curve colours follow the legend below.
plot(density(na.omit(MAF_COV_data2[,"pval"])),log="x")
lines(density(na.omit(MAF_COV_data2[which(MAF_COV_data2$CDS),"pval"])),col="red")
# Violet is the "in CDS & nonsyn" curve of the legend, so it must be restricted
# to nonsynonymous CDS variants rather than repeating the CDS curve.
lines(density(na.omit(MAF_COV_data2[which(MAF_COV_data2$CDS & MAF_COV_data2$nonsynonymous),"pval"])),col="violet")
lines(density(na.omit(MAF_COV_data2[which(MAF_COV_data2$single_cell),"pval"])),col="blue")
lines(density(na.omit(MAF_COV_data2[which(MAF_COV_data2$single_cell & !MAF_COV_data2$single_cell_homozygous),"pval"])),col="green")
legend("topright",c("all","in CDS","in CDS & nonsyn","in single cells","in single Cells & not homo"),col = c("black","red","violet","blue","green"),lty=1)

# 3D view coloured by single-cell status: not in single cells, in single cells
# but not homozygous there, or in single cells and homozygous.
single_cell_class <- ifelse(
  MAF_COV_data2$single_cell,
  ifelse(MAF_COV_data2$single_cell_homozygous, "sc and homo", "sc but not homo"),
  "not single cell"
)
a <- plot_ly(
  MAF_COV_data2,
  x = ~mother_COV, y = ~bud_COV, z = abs(MAF_COV_data2$MAF_dif),
  color = single_cell_class,
  showscale = TRUE,
  marker = list(symbol = "circle", sizemode = "diameter"),
  size = ifelse(MAF_COV_data2$significant, 1, 2), sizes = c(10, 3),
  text = MAF_COV_data2$hover, hoverinfo = "text"
)
a <- add_markers(a)
a <- layout(a, scene = list(
  xaxis = list(
    title = "Coverage mother", type = "log",
    range = log(c(10, max(MAF_COV_data2$mother_COV, na.rm = TRUE)), base = 10)
  ),
  yaxis = list(
    type = "log", title = "Coverage bud",
    range = log(c(10, max(MAF_COV_data2$bud_COV, na.rm = TRUE)), base = 10)
  ),
  zaxis = list(title = "MAF dif")
))
a


# How many single-cell variants (and single-cell CDS variants) does each
# Hm_175 individual carry, and how do the individuals overlap?
hm175_table <- three_of_six_Hm_175_overlaps_2_tables[[1]]
hm175_individuals <- grep("^Hm_175", colnames(Hm_175_overlaps_overview), value = TRUE)

length(which(MAF_COV_data2$single_cell))
length(which(MAF_COV_data2$single_cell & MAF_COV_data2$CDS))
colSums(hm175_table[which(hm175_table$single_cell), hm175_individuals])
colSums(hm175_table[which(hm175_table$single_cell & hm175_table$CDS), hm175_individuals])

# Ids of the single-cell variants present in each individual.
# The presence flags are positions within the single-cell subset, so the ids
# must be looked up in that same subset (not in the full table).
hm175_single_cell <- hm175_table[which(hm175_table$single_cell), ]
h <- lapply(
  hm175_single_cell[, hm175_individuals, drop = FALSE],
  function(present) hm175_single_cell$id[which(present)]
)

# Pairwise number of shared variants between individuals.
sapply(h, function(x) sapply(h, function(y) length(intersect(x, y))))
# Distribution of the number of individuals sharing each variant.
hist(table(unlist(h,recursive = T)))




# Manual inspection of the GO-term and gene enrichment results for the variants
# shared by all I-cell samples (findings recorded in the trailing comments).
View(GO_mutation_count$I_cells_overlap)#-> the three mutations of most terms trace back to gene Sc4wPfr_672.g32778
View(gene_mutation_count$I_cells_overlap)#-> Sc4wPfr_672.g32778 is the only one of the three mutation-enriched genes that is associated with GO terms
merged_sub_bulk_CDS$sample1[which(merged_sub_bulk_CDS$sample1$gene=="Sc4wPfr_672.g32778"),] #-> three contiguous mutations

# Number of CDS variants in the I-cell overlap and number of distinct genes hit.
nrow(I_cells_overlap_CDS_table)
length(unique(I_cells_overlap_CDS_table$gene))
# Append the GO annotation of each variant's gene: the genes2GO row where the
# gene is listed there, otherwise NA for both GO_IDs and Descriptions.
I_cells_overlap_CDS_table=cbind(I_cells_overlap_CDS_table,t(sapply(I_cells_overlap_CDS_table$gene,function(gene) if (gene%in%rownames(genes2GO)) genes2GO[gene,] else c(GO_IDs=NA,Descriptions=NA))),stringsAsFactors=F)

# Density of the tumor_f column at successive filtering stages: the three
# I-cell samples after bulk subtraction (merged_sub_bulk), then sample1 in the
# merged and the MuTect call sets.
plot(density(merged_sub_bulk$sample1$tumor_f));median(merged_sub_bulk$sample1$tumor_f)
plot(density(merged_sub_bulk$sample2$tumor_f))
plot(density(merged_sub_bulk$sample3$tumor_f))
plot(density(merged$sample1$tumor_f))
plot(density(mutect$sample1$tumor_f))
# VarScan has no tumor_f column; the equivalent fraction is computed from its
# two tumor read-count columns.
plot(density(varscan$sample1$tumor_reads2/(varscan$sample1$tumor_reads2+varscan$sample1$tumor_reads1)))

# Give every variant a position id ("<contig> <position>") in each sample table.
merged_sub_bulk <- lapply(merged_sub_bulk, function(variants) {
     variants$id <- paste(variants$contig, variants$position)
     variants
})
# Ids found in all I-cell samples. lapply() keeps one id vector per sample even
# when all samples happen to have the same number of variants; sapply() would
# collapse them into a matrix in that case and Reduce() would then intersect
# single elements instead of whole samples.
I_cells_overlap <- Reduce(intersect, lapply(merged_sub_bulk[I_cells], function(variants) variants$id))
# tumor_f distribution of the shared variants, as seen in sample1.
x=merged_sub_bulk$sample1[which(merged_sub_bulk$sample1$id%in%I_cells_overlap),"tumor_f"]
plot(density(x));median(x)


# Read the filtered germline VCFs of Hm_175_DNA_v4_S4 and of every Hm_175
# individual in parallel (`cluster` is created elsewhere in the script),
# keeping only chromosome, position and alternative allele.
germline_three_of_six_175=parSapply(cluster,simplify = F,c("Hm_175_DNA_v4_S4",grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)),function(sample) {
     gz=gzfile(paste0("../hydra_genetic_distance_kons/germline/three_of_six/",sample,".fixed.nomulti.normed.simple_filter.vcf.gz"))
     setNames(read.table(gz,sep="\t",stringsAsFactors = F),c("CHROM","POS","REF","ALT","TOOLS_CODE"))[,c("CHROM","POS","ALT")]
})

# Reduce every germline table to position ids in the same "<chrom> <pos>" format
# as merged_sub_bulk[[...]]$id.
germline_three_of_six_175=parSapply(cluster,simplify = F,germline_three_of_six_175,function(x) paste(x$CHROM,x$POS))

# For every individual (outer, parallel) and every single-cell sample (inner):
# logical vector telling which single-cell variants occur in that individual.
# NOTE(review): `a` is presumably a list-matrix with one row per single-cell
# sample and one column per individual - the apply() over rows below relies on
# that shape; confirm.
clusterExport(cluster,"merged_sub_bulk")
a=parSapply(cluster,germline_three_of_six_175,function(y) sapply(merged_sub_bulk, function(x) x$id%in%y))
rm(germline_three_of_six_175)
# Per single-cell sample: data.frame with one logical column per individual.
y=apply(a,1,function(x) as.data.frame(do.call(cbind,x)))
# individuals_count = number of individuals in which the variant was found.
sapply(names(y),function (x) y[[x]]$individuals_count<<-rowSums(y[[x]]))
# Attach the membership columns to each sample table, replacing columns of the
# same name left over from an earlier run.
merged_sub_bulk=sapply(simplify=F,names(merged_sub_bulk),function(x) cbind(merged_sub_bulk[[x]][,setdiff(colnames(merged_sub_bulk[[x]]),colnames(y[[x]]))],y[[x]]))
# Per single-cell sample: histogram of individuals_count for all variants (red)
# with the variants having tumor_f >= 1 drawn on top (blue); mean, median and
# sd of individuals_count are shown in the title.
sapply(names(merged_sub_bulk), function(sample){
     x=merged_sub_bulk[[sample]][,"individuals_count"]
     hist(x,main=paste0(sample,", ", "mean=",format(digits=2,mean(x)),", median=",format(digits=2,median(x)),", sd=",format(digits=2,sd(x))),col="red")
     y=merged_sub_bulk[[sample]][which(merged_sub_bulk[[sample]]$tumor_f>=1),"individuals_count"]
     # Overlay: the red bars cover all variants, so only the part not hidden by
     # blue corresponds to the "heterozygous" legend entry.
     hist(y,add=T,col="blue")
     legend("top",c("homozygous","heterozygous"),col = c("blue","red"),bty="n",pch=20)
})

# Rows of each I-cell sample that belong to the overlap of all I-cell samples.
I_cell_overlap_tables <- lapply(merged_sub_bulk[I_cells], function(variants) {
  variants[which(variants$id %in% I_cells_overlap), ]
})
sapply(I_cell_overlap_tables, nrow)
# tumor_f of the shared variants per sample, and how many of them equal 1.
I_cell_overlap_MAFs <- sapply(I_cell_overlap_tables, function(variants) variants$tumor_f)
sum(I_cell_overlap_MAFs == 1)

# Flag variants as homozygous when tumor_f reaches 1, in the full per-sample
# tables as well as in the overlap tables.
flag_homozygous <- function(variants) {
  variants$homozygous <- (variants$tumor_f >= 1)
  variants
}
merged_sub_bulk <- lapply(merged_sub_bulk, flag_homozygous)
I_cell_overlap_tables <- lapply(I_cell_overlap_tables, flag_homozygous)


# Summary per I-cell sample, once for the variants shared by all I-cell samples
# (I_cell_overlap.*) and once for all variants of the sample (I_cells.*):
# median/mean/sd of individuals_count, number and fraction of homozygous
# variants, and the homozygous number/fraction among variants found in 0 and in
# 9 individuals. All homozygous counts use the `homozygous` flag so that counts
# and fractions are based on the same definition.
sapply(names(I_cell_overlap_tables),function(x) {
     overlap <- I_cell_overlap_tables[[x]]
     variants <- merged_sub_bulk[[x]]
     in_none <- variants$individuals_count==0
     in_all <- variants$individuals_count==9
     n_hom_overlap <- length(which(overlap$homozygous))
     n_hom <- length(which(variants$homozygous))
     n_hom_none <- length(which(variants$homozygous & in_none))
     n_hom_all <- length(which(variants$homozygous & in_all))
     c(I_cell_overlap.individuals_count.median=median(overlap$individuals_count),
       I_cell_overlap.individuals_count.mean=mean(overlap$individuals_count),
       I_cell_overlap.individuals_count.sd=sd(overlap$individuals_count),
       I_cell_overlap.homozygous=n_hom_overlap,
       I_cell_overlap.homozygous_fraction=n_hom_overlap/nrow(overlap),
       I_cells.individuals_count.median=median(variants$individuals_count),
       I_cells.individuals_count.mean=mean(variants$individuals_count),
       I_cells.individuals_count.sd=sd(variants$individuals_count),
       I_cells.homozygous=n_hom,
       I_cells.homozygous_fraction=n_hom/nrow(variants),
       I_cells.0_individuals.homozygous=n_hom_none,
       I_cells.0_individuals.homozygous_fraction=n_hom_none/length(which(in_none)),
       I_cells.9_individuals.homozygous=n_hom_all,
       I_cells.9_individuals.homozygous_fraction=n_hom_all/length(which(in_all)))
})


# Stack all per-sample variant tables into one long table with a `sample` column.
# Column 82 is given the common name "sample_info" in every table first.
# NOTE(review): presumably that column is named differently per sample, which
# would stop rbind() from lining the tables up - confirm.
h <- Map(
  function(variants, sample_name) {
    colnames(variants)[82] <- "sample_info"
    variants$sample <- sample_name
    variants
  },
  merged_sub_bulk,
  names(merged_sub_bulk)
)
merged_sub_bulk_table <- do.call(rbind, h)

# Flag variants whose position id occurs in any of the per-sample CDS tables.
h <- unlist(lapply(merged_sub_bulk_CDS, function(cds) paste(cds$chrom, cds$start)))
merged_sub_bulk_table$CDS <- merged_sub_bulk_table$id %in% h
sum(merged_sub_bulk_table$CDS)
nrow(merged_sub_bulk_table)

# Copy the coding annotation (nonsense / nonsynonymous / synonymous / gene) of
# each sample's CDS table onto the matching CDS rows of merged_sub_bulk_table.
# Rows are paired by position id with match(), so every target row receives the
# annotation of its own position regardless of row order (merge() would return
# its result sorted by id), and a position listed more than once in the CDS
# table contributes only its first entry instead of extra rows.
cds_annotation_cols <- c("nonsense","nonsynonymous","synonymous","gene")
for (sample_name in names(merged_sub_bulk)) {
     cds_annotation <- merged_sub_bulk_CDS[[sample_name]]
     cds_annotation$id <- paste(cds_annotation$chrom, cds_annotation$start)
     target_rows <- which(merged_sub_bulk_table$sample==sample_name & merged_sub_bulk_table$CDS)
     # Nothing to annotate for this sample.
     if (length(target_rows) == 0L) next
     # Positions without an entry in this sample's CDS table get NA.
     annotation_row <- match(merged_sub_bulk_table$id[target_rows], cds_annotation$id)
     merged_sub_bulk_table[target_rows, cds_annotation_cols] <- cds_annotation[annotation_row, cds_annotation_cols]
}

# merged_sub_bulk_table_2: one row per variant position.
# Unlike merged_sub_bulk_table, every position is unique here: that table lists
# all variants in all samples, this one lists all positions of variants.
# After sorting by id, the first row of each id is kept.
h <- merged_sub_bulk_table[order(merged_sub_bulk_table$id), ]
h <- h[!duplicated(h$id), ]
merged_sub_bulk_table_2 <- h

# One logical column per single-cell sample: is the position called there?
for (sample_name in names(merged_sub_bulk)) {
     merged_sub_bulk_table_2[[sample_name]] <- merged_sub_bulk_table_2$id %in% merged_sub_bulk[[sample_name]]$id
}
# Number of samples sharing the position, and its distribution.
merged_sub_bulk_table_2$sample_count <- rowSums(merged_sub_bulk_table_2[, names(merged_sub_bulk)])
table(merged_sub_bulk_table_2$sample_count)
# Found in any I-cell sample / any E-cell sample / all three I-cell samples.
merged_sub_bulk_table_2$I_cell <- as.logical(rowSums(merged_sub_bulk_table_2[, I_cells]))
merged_sub_bulk_table_2$E_cell <- as.logical(rowSums(merged_sub_bulk_table_2[, E_cells]))
merged_sub_bulk_table_2$I_cell_overlap <- (3 == rowSums(merged_sub_bulk_table_2[, I_cells]))

# Per I-cell sample: one-sided Wilcoxon test whether variants shared by all
# I-cell samples are found in more individuals than the sample's other variants.
sapply(names(I_cell_overlap_tables),function(sample) 
     wilcox.test(I_cell_overlap_tables[[sample]]$individuals_count,merged_sub_bulk[[sample]][which(!(merged_sub_bulk[[sample]]$id%in%I_cell_overlap_tables[[sample]]$id)),"individuals_count"],alternative="greater")$p.value)
# The same comparisons on the position-level table, each as group means followed
# by a two-sided Wilcoxon test:
# I-cell overlap vs. everything else
mean(merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$I_cell_overlap),"individuals_count"]);mean(merged_sub_bulk_table_2[which(!merged_sub_bulk_table_2$I_cell_overlap),"individuals_count"])
wilcox.test(merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$I_cell_overlap),"individuals_count"],merged_sub_bulk_table_2[which(!merged_sub_bulk_table_2$I_cell_overlap),"individuals_count"])
# I-cell positions vs. E-cell positions
mean(merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$I_cell),"individuals_count"]);mean(merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$E_cell),"individuals_count"])
wilcox.test(merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$I_cell),"individuals_count"],merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$E_cell),"individuals_count"])
# I-cell overlap vs. I-cell positions outside the overlap
mean(merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$I_cell_overlap),"individuals_count"]);mean(merged_sub_bulk_table_2[which(!merged_sub_bulk_table_2$I_cell_overlap & merged_sub_bulk_table_2$I_cell),"individuals_count"])
wilcox.test(merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$I_cell_overlap),"individuals_count"],merged_sub_bulk_table_2[which(!merged_sub_bulk_table_2$I_cell_overlap & merged_sub_bulk_table_2$I_cell),"individuals_count"])


# Per I-cell sample: one-sided Wilcoxon test whether variants shared by all
# I-cell samples are more often homozygous than the sample's other variants.
sapply(names(I_cell_overlap_tables),function(sample) 
     wilcox.test(as.numeric(I_cell_overlap_tables[[sample]]$homozygous),as.numeric(merged_sub_bulk[[sample]][which(!(merged_sub_bulk[[sample]]$id%in%I_cell_overlap_tables[[sample]]$id)),"homozygous"]),alternative="greater")$p.value)
# I-cell positions vs. all positions. The row numbers come from
# merged_sub_bulk_table_2, so the values must be taken from that same table.
wilcox.test(merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$I_cell),"individuals_count"],merged_sub_bulk_table_2$individuals_count)

# Stepwise comparison of individuals_count: E-cell positions vs. positions found
# in exactly one I-cell sample, then one vs. two, then two vs. three I-cell samples.
wilcox.test(merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$E_cell),"individuals_count"],merged_sub_bulk_table_2[which((merged_sub_bulk_table_2$sample1+merged_sub_bulk_table_2$sample2+merged_sub_bulk_table_2$sample3)==1),"individuals_count"])
wilcox.test(merged_sub_bulk_table_2[which((merged_sub_bulk_table_2$sample1+merged_sub_bulk_table_2$sample2+merged_sub_bulk_table_2$sample3)==1),"individuals_count"],merged_sub_bulk_table_2[which((merged_sub_bulk_table_2$sample1+merged_sub_bulk_table_2$sample2+merged_sub_bulk_table_2$sample3)==2),"individuals_count"])
wilcox.test(merged_sub_bulk_table_2[which((merged_sub_bulk_table_2$sample1+merged_sub_bulk_table_2$sample2+merged_sub_bulk_table_2$sample3)==2),"individuals_count"],merged_sub_bulk_table_2[which((merged_sub_bulk_table_2$sample1+merged_sub_bulk_table_2$sample2+merged_sub_bulk_table_2$sample3)==3),"individuals_count"])

# Per single-cell sample: one-sided Fisher test whether variants found in 9
# individuals are more often homozygous than variants found in none.
sapply(names(merged_sub_bulk), function(sample) {
  variants <- merged_sub_bulk[[sample]]
  in_all <- variants$individuals_count == 9
  in_none <- variants$individuals_count == 0
  is_hom <- variants$homozygous
  # 2x2 table, filled column-wise: rows = found in 9 / in 0 individuals,
  # columns = homozygous / not homozygous.
  counts <- c(
    sum(in_all & is_hom, na.rm = TRUE),
    sum(in_none & is_hom, na.rm = TRUE),
    sum(in_all & !is_hom, na.rm = TRUE),
    sum(in_none & !is_hom, na.rm = TRUE)
  )
  fisher.test(matrix(counts, nrow = 2), alternative = "greater")$p.value
})

# Pooled over all positions: Fisher test on homozygous vs. not homozygous among
# variants found in 9 vs. in 0 individuals, followed by the four cell counts.
with(merged_sub_bulk_table_2, fisher.test(matrix(nrow=2,c(sum(individuals_count==9 & homozygous),sum(individuals_count==9 & !homozygous),sum(individuals_count==0 & homozygous),sum(individuals_count==0 & !homozygous)))))
with(merged_sub_bulk_table_2,c(sum(individuals_count==9 & homozygous),sum(individuals_count==9 & !homozygous),sum(individuals_count==0 & homozygous),sum(individuals_count==0 & !homozygous)))

sum(merged_sub_bulk_table_2$homozygous)/nrow(merged_sub_bulk_table_2)# fraction homozygous overall -> ~19%
sum(merged_sub_bulk_table_2$homozygous & merged_sub_bulk_table_2$individuals_count==9)/sum(merged_sub_bulk_table_2$individuals_count==9)# fraction homozygous among single-cell variants found in 9 individuals -> ~27%
sum(merged_sub_bulk_table_2$homozygous & merged_sub_bulk_table_2$individuals_count==0)/sum(merged_sub_bulk_table_2$individuals_count==0)# fraction homozygous among single-cell variants found in 0 individuals -> ~9%


# Fraction of homozygous and of non-homozygous positions (in that order) among
# E-cell, I-cell and I-cell-overlap positions.
sapply(c("E_cell","I_cell","I_cell_overlap"), simplify = FALSE, function(group) {
  is_hom <- merged_sub_bulk_table_2[which(merged_sub_bulk_table_2[, group]), "homozygous"]
  n_hom <- sum(is_hom)
  n_not_hom <- sum(!is_hom)
  c(n_hom, n_not_hom) / (n_hom + n_not_hom)
})

# Sample combinations (pairs and triples within I cells and within E cells)
# for which overlaps are evaluated.
h <- list(
  sample_1_2 = c("sample1", "sample2"),
  sample_1_3 = c("sample1", "sample3"),
  sample_2_3 = c("sample2", "sample3"),
  sample_1_2_3 = c("sample1", "sample2", "sample3"),
  sample_5_6 = c("sample5", "sample6_2nd+3rd_run"),
  sample_5_7 = c("sample5", "sample7"),
  sample_6_7 = c("sample6_2nd+3rd_run", "sample7"),
  sample_5_6_7 = c("sample5", "sample6_2nd+3rd_run", "sample7")
)
dir.create("merged_sub_bulk_unions")
# For every combination, write the union of the variant positions of its
# samples as a tab-separated file with columns contig, position, position.
sapply(names(h), function(combination) {
  positions_per_sample <- lapply(h[[combination]], function(sample_col) {
    merged_sub_bulk_table_2[which(merged_sub_bulk_table_2[, sample_col]), c("contig", "position")]
  })
  positions <- unique(do.call(rbind, positions_per_sample))
  write.table(
    cbind(positions, positions[, 2]),
    paste0("merged_sub_bulk_unions/", combination, ".bed"),
    col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t"
  )
})

# Read, for every sample combination in `h`, the "potential_<combination>.bed"
# file from merged_sub_bulk_unions/. These files are not written by this script.
# NOTE(review): presumably they are derived externally from the union BED files
# written above and hold the positions that could be called in all samples of
# the combination - confirm.
merged_sub_bulk_potential_overlaps=sapply(names(h),simplify = F,function(x) read.table(paste0("merged_sub_bulk_unions/potential_",x,".bed"),sep="\t",stringsAsFactors = F))
# Position keys built from the first two columns. They are pasted without a
# separator, unlike the "<contig> <position>" ids used elsewhere; only their
# number is used below.
merged_sub_bulk_potential_overlaps_2=sapply(merged_sub_bulk_potential_overlaps,simplify = F,function(x) paste0(x[,1],x[,2]))
#sapply(c("sample_1_2","sample_2_3","sample_1_3"),function(x) merged_sub_bulk_potential_overlaps_2[[x]]<<-setdiff(merged_sub_bulk_potential_overlaps_2[[x]],merged_sub_bulk_potential_overlaps_2$sample_1_2_3))
#sapply(c("sample_5_6","sample_6_7","sample_5_7"),function(x) merged_sub_bulk_potential_overlaps_2[[x]]<<-setdiff(merged_sub_bulk_potential_overlaps_2[[x]],merged_sub_bulk_potential_overlaps_2$sample_5_6_7))


# Observed overlap: number of positions called in every sample of a combination.
merged_sub_bulk_overlap_overview=sapply(names(h),function(x) 
     length(Reduce(intersect,sapply(h[[x]],simplify = F,function(y) merged_sub_bulk_table_2[which(merged_sub_bulk_table_2[,y]),"id"]))))
#merged_sub_bulk_overlap_overview[c("sample_1_2","sample_1_3","sample_2_3")]=merged_sub_bulk_overlap_overview[c("sample_1_2","sample_1_3","sample_2_3")]-merged_sub_bulk_overlap_overview["sample_1_2_3"]
# Overview matrix with rows: overlap, potential_overlap, % (overlap as a
# percentage of the potential overlap) and %time.
merged_sub_bulk_overlap_overview=rbind(overlap=merged_sub_bulk_overlap_overview,potential_overlap=sapply(merged_sub_bulk_potential_overlaps_2,length))
merged_sub_bulk_overlap_overview=rbind(merged_sub_bulk_overlap_overview,`%`=merged_sub_bulk_overlap_overview["overlap",]/merged_sub_bulk_overlap_overview["potential_overlap",]*100)
# %time = 2 * % / (% / 100 + 1).
# NOTE(review): the rationale for this transformation (and for the factor 3
# applied to the triple below) is not documented in the script - confirm.
merged_sub_bulk_overlap_overview=rbind(merged_sub_bulk_overlap_overview,`%time`=merged_sub_bulk_overlap_overview["%",]*2/(merged_sub_bulk_overlap_overview["%",]/100+1))
# No %time is reported for the E-cell triple; the I-cell triple is transformed
# a second time with factor 3.
merged_sub_bulk_overlap_overview["%time","sample_5_6_7"]=NA
merged_sub_bulk_overlap_overview["%time","sample_1_2_3"]=merged_sub_bulk_overlap_overview["%time","sample_1_2_3"]*3/(merged_sub_bulk_overlap_overview["%time","sample_1_2_3"]/100+1)
merged_sub_bulk_overlap_overview



# Fig. 2
# Data behind the figure:
#   A  per-sample entries of mutations_merged_sub_bulk, split into I and E cells
#   B  individuals_count and homozygous flag of every variant position
#   C  individuals_count of E-cell positions and of positions found in exactly
#      1, 2 or 3 I-cell samples
i_cell_sample_hits <- with(merged_sub_bulk_table_2, sample1 + sample2 + sample3)
individuals_count_for_hits <- function(n_samples) {
  merged_sub_bulk_table_2[which(i_cell_sample_hits == n_samples), "individuals_count"]
}
Fig_2_data <- list(
  A = list(
    I_cells = mutations_merged_sub_bulk[I_cells],
    E_cells = mutations_merged_sub_bulk[E_cells]
  ),
  B = merged_sub_bulk_table_2[, c("individuals_count", "homozygous")],
  C = list(
    E_cells = merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$E_cell), "individuals_count"],
    I_cells_1_overlap = individuals_count_for_hits(1),
    I_cells_2_overlap = individuals_count_for_hits(2),
    I_cells_3_overlap = individuals_count_for_hits(3)
  )
)
saveRDS(Fig_2_data, "Fig_2.RDS")

library(grid)
library(VennDiagram)
# Draft of Figure 2, written to Figure_2.pdf.
pdf("Figure_2.pdf")# we decided against a second figure and instead simply use part of it as a further part of Fig_1
par(mar=c(5, 4, 4, 2) + 0.1)


# 2A: in how many individuals were the single-cell variants found again
# (all variants in red, homozygous ones drawn on top in blue).
hist(Fig_2_data$B$individuals_count,col="red",xlab="#Individuals in which single cell variants were found again",main="2A")
hist(Fig_2_data$B[which(Fig_2_data$B$homozygous),"individuals_count"],col="blue",add=T)
legend("top",c("homozygous","heterozygous"),col = c("blue","red"),bty="n",pch=20)

# 2B: individuals_count per variant group. The groups are the entries of
# Fig_2_data$C (E cells; positions in 1, 2 or 3 I-cell samples); each box is
# labelled with its group name and number of variants.
boxplot(main="2B",Fig_2_data$C,ylab="#Individuals in which single cell variants were found again",names=paste0(names(Fig_2_data$C),"\nn=",sapply(Fig_2_data$C,length)))


dev.off()

# Second version of the figure data: panel C holds, per variant group, the
# percentage of variants found in 0..9 individuals. Groups are positions found
# in exactly 1 or 2 E-cell samples and in exactly 1, 2 or 3 I-cell samples.
e_cell_sample_hits <- with(merged_sub_bulk_table_2, sample5 + `sample6_2nd+3rd_run` + sample7)
i_cell_sample_hits <- with(merged_sub_bulk_table_2, sample1 + sample2 + sample3)
x <- list(
  E_cells_1_overlap = merged_sub_bulk_table_2[which(e_cell_sample_hits == 1), "individuals_count"],
  E_cells_2_overlap = merged_sub_bulk_table_2[which(e_cell_sample_hits == 2), "individuals_count"],
  I_cells_1_overlap = merged_sub_bulk_table_2[which(i_cell_sample_hits == 1), "individuals_count"],
  I_cells_2_overlap = merged_sub_bulk_table_2[which(i_cell_sample_hits == 2), "individuals_count"],
  I_cells_3_overlap = merged_sub_bulk_table_2[which(i_cell_sample_hits == 3), "individuals_count"]
)
# Rows = individuals_count 0..9, columns = variant groups.
x <- vapply(
  x,
  function(counts) {
    vapply(0:9, function(k) sum(counts == k, na.rm = TRUE) / length(counts) * 100, numeric(1))
  },
  numeric(10)
)

Fig_2v2_data <- list(
  A = list(
    I_cells = mutations_merged_sub_bulk[I_cells],
    E_cells = mutations_merged_sub_bulk[E_cells]
  ),
  B = merged_sub_bulk_table_2[, c("individuals_count", "homozygous")],
  C = x
)
saveRDS(Fig_2v2_data, "Fig_2v2.RDS")

# Fig_2C.pdf: stacked bars, one per variant group (columns of Fig_2v2_data$C),
# showing the percentage of variants found in 0..9 individuals (light to dark
# blue). xpd=T and the wider right margin let the legend sit outside the plot.
pdf("Fig_2C.pdf")
par(mar=c(5, 4, 4, 4) + 0.1,xpd=T)
barplot(main="",args.legend=list(x=7.2,y=100,xpd=T,bty="n",cex=0.9),cex.names=0.9,Fig_2v2_data$C,col=(colorRampPalette(c("lightblue","darkblue")))(10),legend=as.character(0:9),ylab="% of variants")
dev.off()


# NOTE(review): these two horizontal lines are drawn after dev.off(), so they go
# to whatever graphics device is active at that point (or fail if none is open)
# and never reach Fig_2C.pdf. They look like group brackets from an interactive
# session - confirm whether they belong before dev.off() or can be dropped.
lines(c(0,3.6),c(-3,-3));lines(c(3.8,7.3),c(-3,-3))


# CDS variant sets for the per-gene mutation statistics: subsets of the
# position-level single-cell table plus two sets from the Hm_175 individuals.
# `h` and `x` are exported to the cluster by name below, so these names must
# stay as they are.
h=list(I_cells=merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$CDS & merged_sub_bulk_table_2$I_cell),]
       ,I_cell_overlap=merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$CDS & merged_sub_bulk_table_2$I_cell_overlap),]
       ,single_cell_0_individuals=merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$CDS & merged_sub_bulk_table_2$individuals_count==0),]
       ,single_cell_9_individuals=merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$CDS  & merged_sub_bulk_table_2$individuals_count==9),]
       ,E_cells=merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$CDS & merged_sub_bulk_table_2$E_cell),]
       ,single_cells=merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$CDS),]
       ,single_cell_homozygous=merged_sub_bulk_table_2[which(merged_sub_bulk_table_2$CDS & merged_sub_bulk_table_2$homozygous),]
       ,individuals_175_filtered=three_of_six_Hm_175_overlaps_2_tables_filtered[which(three_of_six_Hm_175_overlaps_2_tables_filtered$CDS),]
       ,individuals_175_single_cell=three_of_six_Hm_175_overlaps_2_tables[[1]][which(three_of_six_Hm_175_overlaps_2_tables[[1]]$single_cell & three_of_six_Hm_175_overlaps_2_tables[[1]]$CDS),]
)
sapply(h,nrow)
# Analyses to run: one per set in `h`, plus one combining the two Hm_175 sets.
x=c(as.list(setNames(names(h),names(h))),individuals_175_filtered_or_single_cell=list(c("individuals_175_filtered","individuals_175_single_cell")))

#cluster=makeCluster(detectCores())
# The 0- and 9-individual subsets reuse the covered-bases entry of single_cells.
genes2covered_bases$single_cell_0_individuals=genes2covered_bases$single_cell_9_individuals=genes2covered_bases$single_cells
# gene_mutation_count_fun (defined elsewhere) runs on the workers, once per
# entry of `x`; everything it needs is exported first.
clusterExport(cluster,c("h","genes2covered_bases","x","gene2length","genes2GO","syn_and_nonSyn_sites","jukes_cantor","gene_mutation_count_fun"))
gene_mutation_count_2=parSapply(cluster,simplify = F,names(x),gene_mutation_count_fun)

# GO-level statistics for four of the sets, restricted to genes whose nonsense
# column is 0.
clusterExport(cluster,c("GO2genes","GO_overview","GO_mutation_count_fun"))
GO_mutation_count_2=parSapply(cluster,simplify = F,gene_mutation_count_2[c("I_cells","single_cells","single_cell_0_individuals","single_cell_9_individuals")],function(x) GO_mutation_count_fun(x[which(x$nonsense==0),]))
#GO_mutation_count_2=list(I_cells=GO_mutation_count_fun(gene_mutation_count_2$I_cells[which(gene_mutation_count_2$I_cells$nonsense==0),]))#I_cell_overlap, I_cells_0_individuals, I_cells_9_individuals E_Cells, individuals_175_single_cell, individuals_175_filtered omitted because the function fails for lack of data

# Union of germline variants over the mother sample and all Hm_175 individuals (genome-wide).
# h is used as a 1-based counter: the first file initialises the table, later files only add ids not seen yet.
h=1
germline_three_of_six_175=c()
sapply(simplify = F,c("Hm_175_DNA_v4_S4",grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)),function(sample) {
     gz=gzfile(paste0("../hydra_genetic_distance_kons/germline/three_of_six/",sample,".fixed.nomulti.normed.simple_filter.vcf.gz"))
     # The file is read as five tab-separated columns; only CHROM, POS and ALT are kept.
     x=setNames(read.table(gz,sep="\t",stringsAsFactors = F),c("CHROM","POS","REF","ALT","TOOLS_CODE"))[,c("CHROM","POS","ALT")]
     # A variant is identified by "CHROM POS" only (ALT is not part of the id).
     x$id=paste(x$CHROM,x$POS)
     if (h==1) germline_three_of_six_175<<-x
     else germline_three_of_six_175<<-rbind(germline_three_of_six_175,x[which(!(x$id%in%germline_three_of_six_175$id)),])
     h<<-h+1 
})
nrow(germline_three_of_six_175)#in total 6343345 germline mutations -> delete for size and examine CDS only
rm(germline_three_of_six_175)

# Same union as above, but built from the per-sample ".CDS.bed" files (six tab-separated columns).
# h again counts the files: the first one initialises the table, later ones add only unseen ids.
h=1
germline_three_of_six_175_CDS=c()
sapply(simplify = F,c("Hm_175_DNA_v4_S4",grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)),function(sample) {
     x=setNames(read.table(paste0("../hydra_genetic_distance_kons/germline/three_of_six/",sample,".CDS.bed"),sep="\t",stringsAsFactors = F),c("CHROM","start","stop","REF","ALT","gene"))
     # Id is "CHROM start"; the same position with a different ALT in a later sample is not added again.
     x$id=paste(x$CHROM,x$start)
     if (h==1) germline_three_of_six_175_CDS<<-x
     else germline_three_of_six_175_CDS<<-rbind(germline_three_of_six_175_CDS,x[which(!(x$id%in%germline_three_of_six_175_CDS$id)),])
     h<<-h+1 
})
nrow(germline_three_of_six_175_CDS)

# Annotate the coding consequence of every germline CDS variant with VariantAnnotation::predictCoding.
library(data.table)
library(VariantAnnotation)
library(GenomicFeatures)
hydra_TxDb=makeTxDbFromGFF("hydra2.0_genemodels.gff3")
# Each variant is passed as a 1-bp range (start == end == "start" column) with its ALT allele as the variant sequence.
x=predictCoding(makeGRangesFromDataFrame(germline_three_of_six_175_CDS,seqnames.field = "CHROM",start.field="start",end.field = "start"),hydra_TxDb,FaFile("Hm105_Dovetail_Assembly_1.0.fa"),DNAStringSet(germline_three_of_six_175_CDS$ALT))
h=data.frame(x)
clusterExport(cluster,c("h","x"))
# Per query variant: fraction of its result rows in each consequence class
# (counts are divided by the number of rows returned for that QUERYID).
y=t(parSapply(cluster,unique(x$QUERYID),function(y) {
     x=which(x$QUERYID==y)
     c(nonsense=length(which(h[x,"CONSEQUENCE"]=="nonsense")),nonsynonymous=length(which(h[x,"CONSEQUENCE"]=="nonsynonymous")),synonymous=length(which(h[x,"CONSEQUENCE"]=="synonymous")),start_loss=sum(h[x,"PROTEINLOC"]==1),stop_loss=sum(h[x,"REFAA"]=="*" & h[x,"VARAA"]!="*"))/length(x)
}))
# NOTE(review): cbind() pairs rows by position; this assumes every input variant produced at least one
# predictCoding row and that unique(QUERYID) is in input order - confirm.
germline_three_of_six_175_CDS=cbind(germline_three_of_six_175_CDS,y)
# Per query variant: codons and amino acids taken from its first result row only.
y=t(parSapply(cluster,unique(x$QUERYID),function(y) {
     x=which(x$QUERYID==y)[1]
     setNames(as.character(h[x,c("REFCODON","VARCODON","REFAA","VARAA")]),c("REFCODON","VARCODON","REFAA","VARAA"))
}))
germline_three_of_six_175_CDS=cbind(germline_three_of_six_175_CDS,y)



# Run the per-gene and per-GO counting for the germline table.
# gene_mutation_count_fun is called locally here and is given h and x through the global environment.
h <- list(germline = germline_three_of_six_175_CDS)
x <- c(germline = "germline")
gene_mutation_count_2$germline <- gene_mutation_count_fun("germline")
GO_mutation_count_2$germline <- GO_mutation_count_fun(gene_mutation_count_2$germline[which(gene_mutation_count_2$germline$nonsense == 0), ])

# Keep GO terms with at least 10000 covered bases and at least 10 (non)synonymous mutations.
GO_mutation_count_2_filtered <- sapply(GO_mutation_count_2, simplify = FALSE, function(x) {
  x[which(x$`#covered_bases` >= 10000 & (x$nonsynonymous + x$synonymous) >= 10), ]
})

# Overall ratio of Jukes-Cantor corrected nonsynonymous and synonymous rates, per analysis.
sapply(gene_mutation_count_2, function(x) jukes_cantor(sum(x$nonsynonymous) / sum(x$`#nonsynonymous_sites`)) / jukes_cantor(sum(x$synonymous) / sum(x$`#synonymous_sites`)))
sum(gene_mutation_count_2$individuals_175_single_cell$nonsynonymous)
sum(gene_mutation_count_2$individuals_175_single_cell$synonymous)
sum(gene_mutation_count_2$individuals_175_single_cell$nonsense)
sapply(GO_mutation_count_2, function(x) jukes_cantor(sum(x$nonsynonymous) / sum(x$`#nonsynonymous_sites`)) / jukes_cantor(sum(x$synonymous) / sum(x$`#synonymous_sites`)))

# Nonsense summary for I cells: genes in table, genes with a nonsense mutation, nonsense per covered base.
# (The first line used to be a verbatim copy of the second; it now mirrors the germline summary below.)
nrow(gene_mutation_count_2$I_cells)
length(which(gene_mutation_count_2$I_cells$nonsense > 0))
sum(gene_mutation_count_2$I_cells$nonsense) / sum(gene_mutation_count_2$I_cells$`#covered_bases`)
# Same three numbers for the germline table.
nrow(gene_mutation_count_2$germline)
length(which(gene_mutation_count_2$germline$nonsense > 0))
sum(gene_mutation_count_2$germline$nonsense) / sum(gene_mutation_count_2$germline$`#covered_bases`)

# Test every GO term for an excess of genes that carry at least one nonsense mutation.
#
# gene_mutation_table: per-gene table with the columns Gene_ID, `#covered_bases`, `#bases` and nonsense.
# Globals read: GO2genes (named list, GO ID -> gene IDs) and GO_overview (row names are GO IDs,
# column Description).
# Returns a data frame with one row per GO term, ordered by the one-sided Fisher p-value, with the
# FDR-adjusted p-value in fisher.fdr.
#
# Only genes that are present in gene_mutation_table enter the 2x2 table. GO genes missing from the
# table are still reported in `#genes`/genes but are no longer counted as "other" genes; counting
# them could make a contingency cell negative and abort fisher.test().
GO_nonsense_fun <- function(gene_mutation_table) {
  nonsense_genes <- gene_mutation_table[which(gene_mutation_table$nonsense > 0), "Gene_ID"]
  other_genes <- gene_mutation_table[which(gene_mutation_table$nonsense == 0), "Gene_ID"]
  expected_nonsense_frac <- length(nonsense_genes) / (length(nonsense_genes) + length(other_genes))

  per_term <- lapply(setNames(nm = names(GO2genes)), function(GO_ID) {
    genes <- GO2genes[[GO_ID]]
    in_term <- which(gene_mutation_table$Gene_ID %in% genes)
    ret <- colSums(gene_mutation_table[in_term, c("#covered_bases", "#bases", "nonsense")])
    ret["%covered_bases"] <- ret["#covered_bases"] / ret["#bases"] * 100
    ret <- as.list(ret)

    n_nonsense <- length(intersect(genes, nonsense_genes))
    n_other <- length(intersect(genes, other_genes))
    ret$expected_nonsense_genes <- expected_nonsense_frac * (n_nonsense + n_other)
    ret$nonsense_genes <- n_nonsense
    ret$other_genes <- n_other
    # Columns: nonsense / other genes; rows: inside / outside the GO term.
    contingency <- matrix(
      c(n_nonsense, length(nonsense_genes) - n_nonsense, n_other, length(other_genes) - n_other),
      nrow = 2
    )
    ret$fisher.pvalue <- fisher.test(contingency, alternative = "greater")$p.value

    ret <- c(
      GO_ID = GO_ID,
      Description = GO_overview[GO_ID, "Description"],
      `#genes` = length(genes),
      genes = paste0(genes, collapse = ","),
      ret
    )
    data.frame(ret, stringsAsFactors = FALSE, check.names = FALSE)
  })

  GO_nonsense <- do.call(rbind, per_term)
  GO_nonsense$fisher.fdr <- p.adjust(GO_nonsense$fisher.pvalue, method = "fdr")
  GO_nonsense[order(GO_nonsense$fisher.pvalue), ]
}
# Run the nonsense enrichment for every per-gene table on the worker cluster.
clusterExport(cluster,c("GO2genes","GO_overview","GO_nonsense_fun"))
GO_nonsense_2=parSapply(cluster,gene_mutation_count_2,simplify = F,GO_nonsense_fun)

# Number of rows of merged_sub_bulk_table per single-cell sample (I cells first, then E cells).
x=sapply(c(I_cells,E_cells), function(x) sum(merged_sub_bulk_table$sample==x))
x
sum(merged_sub_bulk_table_2$I_cell)
mean(x[1:3])
sum(merged_sub_bulk_table_2$E_cell)
mean(x[4:6])
mut_rate_merged_sub_bulk
# h = sum of the "bases" column of sample1's coverage table; used as the denominator for the
# percentage of covered bases (columns 1:3 and 4:6 of mut_rate_merged_sub_bulk).
# NOTE(review): presumably the total genome size - confirm.
h=sum(min_coverage$sample1$bases)
mean(mut_rate_merged_sub_bulk["covered_bases",1:3])/h*100
mean(mut_rate_merged_sub_bulk["covered_bases",4:6])/h*100

# Per Hm_175 individual: number of calls that pass the FDR filter, are confirmed by single cells,
# or either of the two (genome-wide and restricted to CDS).
x <- sapply(grep("^Hm_175", colnames(Hm_175_overlaps_overview), value = TRUE), function(individual) {
  calls <- three_of_six_Hm_175_overlaps_2_tables[[1]]
  present <- calls[, individual]
  supported <- calls$single_cell | calls[, paste0(individual, ".FDR")] < 0.05
  c(
    FDR_filtered = sum(three_of_six_Hm_175_overlaps_2_tables_filtered[, individual]),
    single_cell_confirmed = sum(present & calls$single_cell),
    combined = length(which(present & supported)),
    combined_CDS = length(which(present & calls$CDS & supported))
  )
})
x
# Row 3 is the "combined" count.
y <- x[3, ]
min(y)
max(y)
mean(y)
# Number of distinct variant ids supported in at least one individual.
length(unique(unlist(sapply(grep("^Hm_175", colnames(Hm_175_overlaps_overview), value = TRUE), function(individual) {
  calls <- three_of_six_Hm_175_overlaps_2_tables[[1]]
  keep <- which(calls[, individual] & (calls$single_cell | calls[, paste0(individual, ".FDR")] < 0.05))
  calls[keep, "id"]
}))))
# "fraction" column of the downsampled somatic coverage table for the Hm_175 rows, in percent.
x <- individuals_coverage$somatic_downsampled[grep("^Hm_175", rownames(individuals_coverage$somatic_downsampled)), "fraction"] * 100
x
mean(x)

# Per Hm_175 individual: ids of the calls present in that individual and confirmed by single cells.
x=sapply(simplify = F,grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T),function (x) 
     three_of_six_Hm_175_overlaps_2_tables[[1]][three_of_six_Hm_175_overlaps_2_tables[[1]][,x] & three_of_six_Hm_175_overlaps_2_tables[[1]]$single_cell,"id"]
)

# Germline mutation numbers ("three_of_six.simple_filter" column) for the Hm_175 individuals.
y=individuals_mut_numbers$germline[grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T),"three_of_six.simple_filter"]
names(y)=grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)
y
min(y);max(y);mean(y)
# "fraction" column of the germline coverage table for the Hm_175 rows, in percent.
x=individuals_coverage$germline[grep("^Hm_175",rownames(individuals_coverage$germline)),"fraction"]*100
x
mean(x)

# Number of rows in each sample's ".CDS.bed" file (mother sample first).
y=sapply(c("Hm_175_DNA_v4_S4",grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)),function(sample) {
     x=setNames(read.table(paste0("../hydra_genetic_distance_kons/germline/three_of_six/",sample,".CDS.bed"),sep="\t",stringsAsFactors = F),c("CHROM","start","stop","REF","ALT","gene"))
     nrow(x)
})


mut_rate_merged_sub_bulk
# Mean, standard deviation and reciprocal of the mean of row 3, for columns 1:3 and 4:6.
# NOTE(review): row 3 is presumably mutations_per_base and the column groups I cells / E cells - confirm.
mean(mut_rate_merged_sub_bulk[3,1:3]);sd(mut_rate_merged_sub_bulk[3,1:3]);1/mean(mut_rate_merged_sub_bulk[3,1:3])
mean(mut_rate_merged_sub_bulk[3,4:6]);sd(mut_rate_merged_sub_bulk[3,4:6]);1/mean(mut_rate_merged_sub_bulk[3,4:6])


# Load one two-column (gene, TPM) file per RNA sample and merge them into one gene x sample table.
# The file pattern escapes the dot: a bare ".tpm" is a regular expression in which "." matches any character.
gene_expression_tpms <- Reduce(
  function(x, y) merge(x, y, by = "Gene"),
  sapply(simplify = FALSE, list.files("../hydra_expression/results/counted/segemehl/", "[.]tpm"), function(tpm_file) {
    tpm <- read.table(
      paste0("../hydra_expression/results/counted/segemehl/", tpm_file),
      colClasses = c("character", "numeric"), header = FALSE
    )
    # The sample name is the file name up to its first dot.
    setNames(tpm, c("Gene", strsplit(tpm_file, "[.]")[[1]][1]))
  })
)
rownames(gene_expression_tpms) <- gene_expression_tpms$Gene
gene_expression_tpms <- gene_expression_tpms[, setdiff(colnames(gene_expression_tpms), "Gene")]
colSums(gene_expression_tpms) # 1 million per sample, as expected

# Per-gene mean, median and sd over all samples except Hm_175bud2_RNA_S12_R1_001.
gene_expression_tpms <- cbind(
  gene_expression_tpms,
  t(apply(gene_expression_tpms[, setdiff(colnames(gene_expression_tpms), "Hm_175bud2_RNA_S12_R1_001")], 1, function(x) c(mean = mean(x), median = median(x), sd = sd(x))))
)
colMeans(gene_expression_tpms)
colSums(gene_expression_tpms)
library(plotly)
# One density trace per column; built with Reduce instead of mutating p from inside sapply().
p <- Reduce(function(plt, column) {
  d <- density(gene_expression_tpms[, column])
  add_trace(plt, x = d$x, y = d$y, name = column, mode = "lines")
}, colnames(gene_expression_tpms), plot_ly())
layout(p, xaxis = list(type = "log"))
# Number of genes at or above several TPM thresholds, per column.
sapply(gene_expression_tpms, function(x) sapply(c(1, 2, 5, 10, 20, 50, 100), function(threshold) sum(x >= threshold)))
# A gene counts as expressed when its median TPM is at least 5.
gene_expression_tpms$expressed <- gene_expression_tpms$median >= 5
expressed_genes <- rownames(gene_expression_tpms)[which(gene_expression_tpms$expressed)]
length(expressed_genes)
nrow(gene_expression_tpms)
length(expressed_genes) / nrow(gene_expression_tpms)

# For every per-gene table: are mutated genes more often expressed than expected?
# Reports the counts, the ratios, a one-sided hypergeometric p-value and a two-sided Fisher p-value.
View(sapply(gene_mutation_count_2, function(gene_table) {
  n_total_genes <- nrow(gene_table)
  mutated_genes <- rownames(gene_table)[which(gene_table$`#mutations` > 0)]
  n_mutated_genes <- length(mutated_genes)
  n_expressed_mutated_genes <- length(intersect(mutated_genes, expressed_genes))
  n_expressed_genes <- length(expressed_genes)
  # phyper(q, lower.tail = FALSE) is P[X > q]; q = observed - 1 gives P[X >= observed],
  # so the observed count itself is part of the tail.
  pval_greater <- phyper(
    n_expressed_mutated_genes - 1, n_expressed_genes,
    n_total_genes - n_expressed_genes, n_mutated_genes,
    lower.tail = FALSE
  )
  # 2x2 table: expressed & mutated, expressed only, mutated only, neither.
  n_expressed_only <- n_expressed_genes - n_expressed_mutated_genes
  n_mutated_only <- n_mutated_genes - n_expressed_mutated_genes
  n_neither <- n_total_genes - n_expressed_mutated_genes - n_expressed_only - n_mutated_only
  pval_two_sided <- fisher.test(matrix(nrow = 2, c(n_expressed_mutated_genes, n_expressed_only, n_mutated_only, n_neither)))$p.value
  c(
    expressed_mutated_genes = n_expressed_mutated_genes,
    mutated_genes = n_mutated_genes,
    ratio = n_expressed_mutated_genes / n_mutated_genes,
    expressed_total_genes = n_expressed_genes,
    total_genes = n_total_genes,
    ratio_total = n_expressed_genes / n_total_genes,
    pvalue_greater = pval_greater,
    pvalue_two_sided = pval_two_sided
  )
}))

# Supplement tables: filtered GO tables sorted by decreasing Ka/Ks without the test columns,
# plus the germline nonsense enrichment table.
supplement_tables <- local({
  dropped_columns <- c("fisher.pvalue", "nonsynonymous_direction", "nonsynonymous.fisher.pvalue", "fisher.fdr", "nonsynonymous.fisher.fdr", "direction", "#expected_mutations", "#mutations", "nonsense")
  by_KaKs <- function(go_table) {
    go_table[order(go_table$`Ka/Ks`, decreasing = TRUE), setdiff(colnames(go_table), dropped_columns)]
  }
  list(
    GO_single_cells = by_KaKs(GO_mutation_count_2_filtered$single_cells),
    # Deliberately not exported:
    # GO_sc_9_ind = by_KaKs(GO_mutation_count_2_filtered$single_cell_9_individuals),
    # GO_sc_0_ind = by_KaKs(GO_mutation_count_2_filtered$single_cell_0_individuals),
    GO_reference_comparison = by_KaKs(GO_mutation_count_2_filtered$germline),
    GO_nonsense_reference = GO_nonsense_2$germline
  )
})

# GO terms with Ka/Ks > 1 both in all single cells and in the 0-individuals subset.
# The 0-individuals table is read from GO_mutation_count_2_filtered because supplement_tables has no
# GO_sc_0_ind entry (it is commented out above); reading it from there yielded NULL and an empty overlap.
x <- supplement_tables$GO_single_cells[which(supplement_tables$GO_single_cells$`Ka/Ks` > 1), "GO_ID"]
y <- GO_mutation_count_2_filtered$single_cell_0_individuals[which(GO_mutation_count_2_filtered$single_cell_0_individuals$`Ka/Ks` > 1), "GO_ID"]
View(supplement_tables$GO_single_cells[which(supplement_tables$GO_single_cells$GO_ID %in% intersect(x, y)), ])

# Add expression information to every supplement table: the expressed genes of each GO term and a
# one-sided hypergeometric test for "more expressed genes than expected".
supplement_tables <- sapply(simplify = FALSE, supplement_tables, function(go_table) {
  id_columns <- c("GO_ID", "Description", "#genes", "genes")
  genes <- strsplit(go_table$genes, ",")
  # lapply keeps one character vector per term; sapply could collapse equal-length results into a matrix.
  expressed <- lapply(genes, intersect, expressed_genes)
  n_expressed <- lengths(expressed)
  # q = observed - 1 so that the upper tail P[X > q] includes the observed count.
  pvalues <- phyper(
    n_expressed - 1, length(expressed_genes),
    nrow(gene_expression_tpms) - length(expressed_genes), lengths(genes),
    lower.tail = FALSE
  )
  # p.adjust() defaults to Holm; the column is an FDR, as in GO_nonsense_fun.
  fdrs <- p.adjust(pvalues, method = "fdr")
  cbind(
    go_table[, id_columns],
    `#expressed_genes` = n_expressed,
    expressed_genes = vapply(expressed, paste, character(1), collapse = ","),
    `p_value_more_expressed_than_expected` = pvalues,
    fdr_more_expressed_than_expected = fdrs,
    go_table[, setdiff(colnames(go_table), id_columns)]
  )
})
# Spearman correlation (estimate and full test result) between Ka/Ks and the number of expressed genes.
sapply(supplement_tables[c("GO_single_cells","GO_reference_comparison")],function(table) c(cor.spearman=cor(table$`Ka/Ks`,table$`#expressed_genes`,method="spearman"),cor.spearman.test=cor.test(table$`Ka/Ks`,table$`#expressed_genes`,method="spearman")))
# Number of GO terms with Ka/Ks >= 1 (or <= 0.1) and how many of them have an expression FDR below 0.05.
sum(supplement_tables$GO_single_cells$`Ka/Ks`>=1);sum(supplement_tables$GO_single_cells$`Ka/Ks`>=1 & supplement_tables$GO_single_cells$fdr_more_expressed_than_expected<0.05)
sum(supplement_tables$GO_single_cells$`Ka/Ks`<=0.1);sum(supplement_tables$GO_single_cells$`Ka/Ks`<=0.1 & supplement_tables$GO_single_cells$fdr_more_expressed_than_expected<0.05)
sum(supplement_tables$GO_reference_comparison$`Ka/Ks`>=1);sum(supplement_tables$GO_reference_comparison$`Ka/Ks`>=1 & supplement_tables$GO_reference_comparison$fdr_more_expressed_than_expected<0.05)
sum(supplement_tables$GO_reference_comparison$`Ka/Ks`<=0.1);sum(supplement_tables$GO_reference_comparison$`Ka/Ks`<=0.1 & supplement_tables$GO_reference_comparison$fdr_more_expressed_than_expected<0.05)
# Distinct row names (over both tables) with an extreme Ka/Ks value.
length(unique(c(rownames(supplement_tables$GO_single_cells)[which(supplement_tables$GO_single_cells$`Ka/Ks`>=1 | supplement_tables$GO_single_cells$`Ka/Ks`<=0.1)],rownames(supplement_tables$GO_reference_comparison)[which(supplement_tables$GO_reference_comparison$`Ka/Ks`>=1 | supplement_tables$GO_reference_comparison$`Ka/Ks`<=0.1)])))
# Same, additionally requiring the expression FDR condition.
# NOTE(review): the Ka/Ks<=0.1 branch uses an FDR cutoff of 1 while the Ka/Ks>=1 branch uses 0.05 - confirm this is intended.
length(unique(c(rownames(supplement_tables$GO_single_cells)[which((supplement_tables$GO_single_cells$`Ka/Ks`>=1 & supplement_tables$GO_single_cells$fdr_more_expressed_than_expected<0.05) | (supplement_tables$GO_single_cells$`Ka/Ks`<=0.1 & supplement_tables$GO_single_cells$fdr_more_expressed_than_expected<1))],rownames(supplement_tables$GO_reference_comparison)[which((supplement_tables$GO_reference_comparison$`Ka/Ks`>=1 & supplement_tables$GO_reference_comparison$fdr_more_expressed_than_expected<0.05) | (supplement_tables$GO_reference_comparison$`Ka/Ks`<=0.1 & supplement_tables$GO_reference_comparison$fdr_more_expressed_than_expected<1))])))


# Write all supplement tables into one workbook (one sheet per list entry).
library(openxlsx)
write.xlsx(supplement_tables,"Supplement_tables.xlsx")


#Fig. 3
# For both REVIGO result tables: one row per representative term with its gene count, Ka/Ks value,
# the genes of all terms it represents, and a test whether those genes are expressed more often
# than expected.
Fig_3_data <- setNames(sapply(c("single_cell_positive_selection_revigo", "germline_negative_selection_revigo"), simplify = FALSE, function(revigo_name) {
  revigo <- readRDS(paste0(revigo_name, ".RDS"))
  for (column in c("term_ID", "description", "representative")) {
    revigo[, column] <- as.character(revigo[, column])
  }
  per_representative <- data.frame(t(sapply(unique(revigo$representative), function(representative) {
    member_terms <- revigo[which(revigo$representative == representative), ]
    # Row of the representative term itself within its group.
    rep_i <- which(member_terms$description == representative)
    all_genes <- unique(unlist(sapply(member_terms$term_ID, function(term) GO2genes[[term]])))
    all_genes_expressed <- intersect(all_genes, expressed_genes)
    # q = observed - 1 so that the upper tail P[X > q] includes the observed count.
    expressed_pvalue <- phyper(
      length(all_genes_expressed) - 1, length(expressed_genes),
      nrow(gene_expression_tpms) - length(expressed_genes), length(all_genes),
      lower.tail = FALSE
    )
    c(
      rep_genes = length(GO2genes[[member_terms[rep_i, "term_ID"]]]),
      rep_KaKs = member_terms[rep_i, "value"],
      all_genes = length(all_genes),
      all_genes_expressed = length(all_genes_expressed),
      expressed_pvalue = expressed_pvalue,
      terms = nrow(member_terms)
    )
  })))
  # p.adjust() defaults to Holm; the column is an FDR, as in GO_nonsense_fun.
  per_representative[, "expressed_fdr"] <- p.adjust(per_representative[, "expressed_pvalue"], method = "fdr")
  per_representative[, "expressed_significant"] <- per_representative[, "expressed_fdr"] < 0.05
  per_representative[order(per_representative$rep_KaKs), ]
}), c("A", "B"))
# Panel B: only representatives that stand for more than one term, highest Ka/Ks first.
Fig_3_data$B <- Fig_3_data$B[which(Fig_3_data$B[, "terms"] > 1), ]
Fig_3_data$B <- Fig_3_data$B[order(Fig_3_data$B$rep_KaKs, decreasing = TRUE), ]

# Draw Fig. 3 as two stacked horizontal bar plots (panel A on top, panel B below) into Fig_3.pdf.
pdf("Fig_3.pdf",width=10,height=9)
par(mfrow=c(2,1),mar=c(4,22,4,1))
# Each bar has total length rep_KaKs and is split by the share of expressed genes:
# first segment (dark blue) = rep_KaKs * expressed share, second segment (dark red) = the remainder.
x=rbind(Fig_3_data$A$rep_KaKs*Fig_3_data$A$all_genes_expressed/Fig_3_data$A$all_genes,Fig_3_data$A$rep_KaKs-Fig_3_data$A$rep_KaKs*Fig_3_data$A$all_genes_expressed/Fig_3_data$A$all_genes)
# Bar labels: term name, (expressed genes, all genes) and "*" when expressed_significant is TRUE.
barplot(cex.names=0.8,cex.axis=0.8,xaxt="n",x,horiz = T,col=c("darkblue","darkred"),las=1,names=paste0(rownames(Fig_3_data$A), " (",Fig_3_data$A$all_genes_expressed,", ",Fig_3_data$A$all_genes,")",ifelse(Fig_3_data$A$expressed_significant,"*","")),xlab="Ka/Ks",ylim=c(0,6))
axis(1,seq(0,10,2),col="darkred",col.axis="darkred")
# Top axis relabels positions 0..10 as 0..1 in dark blue.
axis(3,seq(0,10,2),label=seq(0,1,0.2),col="darkblue",col.axis="darkblue")
#"#C83737"
x=rbind(Fig_3_data$B$rep_KaKs*Fig_3_data$B$all_genes_expressed/Fig_3_data$B$all_genes,Fig_3_data$B$rep_KaKs-Fig_3_data$B$rep_KaKs*Fig_3_data$B$all_genes_expressed/Fig_3_data$B$all_genes)
barplot(cex.names=0.8,cex.axis=0.8,x,xaxt="n",horiz = T,col=c("darkblue","darkred"),las=1,names=paste0(rownames(Fig_3_data$B), " (",Fig_3_data$B$all_genes_expressed,", ",Fig_3_data$B$all_genes,")",ifelse(Fig_3_data$B$expressed_significant,"*","")),xlab="Ka/Ks",xlim=c(0,0.05),ylim=c(0,6))
axis(1,seq(0,0.05,0.01),col="darkred",col.axis="darkred")
dev.off()
# Source data of both panels go into one TSV; panel B is appended below panel A (with its own header line).
write.table(Fig_3_data$A[,c("all_genes","rep_KaKs")],"Fig_3.tsv",sep="\t",quote=F,col.names = NA)
write.table(Fig_3_data$B[,c("all_genes","rep_KaKs")],"Fig_3.tsv",sep="\t",quote=F,append=T,col.names = NA)

# Fill the md5 columns of the ENA submission table from a separate md5 list and write the result.
ENA_read_data_table=read.table("../hydra_ENA_mutationrate_genetic_distance/ENA_filled_without_md5.tsv",sep="\t",skip = 1,header=T,stringsAsFactors = F)
ENA_read_data_md5=read.table("../hydra_ENA_mutationrate_genetic_distance/md5sums.tsv",sep=" ",stringsAsFactors = F)
# Lookup vector: values from column 1, names from column 3.
# NOTE(review): presumably md5sum output (checksum, two spaces, file name), which makes column 2 empty - confirm.
ENA_read_data_md5=setNames(ENA_read_data_md5[,1],ENA_read_data_md5[,3])
# Row-wise: look up the md5 by the second "/"-separated component of the forward/reverse file name.
# apply() works on a character matrix, so every column of the resulting data frame is character.
ENA_read_data_table=as.data.frame(t(apply(ENA_read_data_table,1,function(x) {
     x["forward_file_md5"]=ENA_read_data_md5[strsplit(x["forward_file_name"],"/")[[1]][2]]
     x["reverse_file_md5"]=ENA_read_data_md5[strsplit(x["reverse_file_name"],"/")[[1]][2]]
     x
})),stringsAsFactors = F)
ENA_read_data_table$library_name=""
# Override the instrument model for the sample6 and sample7 runs.
ENA_read_data_table[which(ENA_read_data_table$forward_file_name=="hydra/sample6_R1.fastq.gz"),"instrument_model"]="Illumina NovaSeq 6000"
ENA_read_data_table[which(ENA_read_data_table$forward_file_name=="hydra/sample7_R1.fastq.gz"),"instrument_model"]="Illumina NovaSeq 6000"
write.table(ENA_read_data_table,"../hydra_ENA_mutationrate_genetic_distance/ENA_filled_with_md5.tsv",quote = F,sep="\t",row.names=F)

# Export the somatic calls as minimal VCF files into UCSC_tracks/.
dir.create("UCSC_tracks")
# One table per Hm_175 individual: calls present in the individual that are single-cell confirmed
# or have an FDR below 0.05, in VCF column layout, sorted by chromosome and position.
UCSC_buds_vs_mother=sapply(simplify = F, grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T),function (x) {
     y=three_of_six_Hm_175_overlaps_2_tables[[1]][which(three_of_six_Hm_175_overlaps_2_tables[[1]][,x] & (three_of_six_Hm_175_overlaps_2_tables[[1]]$single_cell | three_of_six_Hm_175_overlaps_2_tables[[1]][,paste0(x,".FDR")]<0.05)),]
     # INFO carries the individual's ".MAF" column as AF; the remaining VCF fields are placeholders (".").
     y=cbind(`#CHROM`=y$CHROM,POS=y$POS,ID=".",y[,c("REF","ALT")],QUAL=".",FILTER=".",INFO=paste0("AF=",y[,paste0(x,".MAF")]),FORMAT=".",X=".")
     y[order(y$`#CHROM`,y$POS),]
})
# Write the fileformat line first, then append the table including its column header.
sapply(names(UCSC_buds_vs_mother),function(x) {
     write("##fileformat=VCFv4.3",paste0("UCSC_tracks/",x,".somatic.vcf"))
     write.table(append=T,UCSC_buds_vs_mother[[x]],paste0("UCSC_tracks/",x,".somatic.vcf"),quote=F,sep="\t",row.names = F)
})
# One table per single-cell sample from merged_sub_bulk_table; AF is taken from the tumor_f column.
UCSC_single_cells=sapply(simplify = F,c(I_cells,E_cells), function(x) {
     y=merged_sub_bulk_table[which(merged_sub_bulk_table$sample==x),]
     y=cbind(`#CHROM`=y$contig,POS=y$position,ID=".",setNames(y[,c("ref_allele","alt_allele")],c("REF","ALT")),QUAL=".",FILTER=".",INFO=paste0("AF=",y$tumor_f),FORMAT=".",X=".")
     y[order(y$`#CHROM`,y$POS),]
})
sapply(names(UCSC_single_cells),function(x) {
     write("##fileformat=VCFv4.3",paste0("UCSC_tracks/",x,".vcf"))
     write.table(append=T,UCSC_single_cells[[x]],paste0("UCSC_tracks/",x,".vcf"),quote=F,sep="\t",row.names = F)
})


# Hierarchical clustering of the Hm_175 individuals by their somatic call columns (Manhattan distance).
x <- three_of_six_Hm_175_overlaps_2_tables[[1]][, grep("^Hm_175", colnames(Hm_175_overlaps_overview), value = TRUE)]
colnames(x) <- sub("_DNA.*", "", colnames(x))
y <- dist(t(x), method = "manhattan")
z <- hclust(y)
pdf("Supplement_Fig_Cluster")
plot(z)
# Close the device so the file is finalised and later plots do not end up in it.
dev.off()
# Exploratory PCA of the same matrix.
a <- prcomp(t(x))
library("factoextra")
fviz_eig(a) # 7 of 8 dimensions explain almost the same share of variance --> PCA is not informative
fviz_pca_ind(a, repel = TRUE, axes = c(3, 5))

# Start a worker cluster with one worker per detected core and load the per-sample VarScan germline VCFs.
library(parallel)
cluster=makeCluster(detectCores())
germline_varscan=parSapply(cluster,simplify = F,c("Hm_175_DNA_v4_S4",grep("^Hm_175",colnames(Hm_175_overlaps_overview),value=T)),function(sample) {
     gz=gzfile(paste0("../hydra_genetic_distance_kons/germline/varscan/",sample,".fixed.nomulti.normed.simple_filter.vcf.gz"))
     # Ten tab-separated columns; the last (sample) column is named MAF and parsed below.
     x=setNames(read.table(gz,sep="\t",stringsAsFactors = F),c("CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT","MAF"))[,c("CHROM","POS","ALT","MAF")]
     x=as.data.frame(x)
     # MAF = 4th ":"-separated field of the sample column, converted to numeric.
     # NOTE(review): which FORMAT key is at position 4 is not visible here - confirm it is the allele frequency.
     x$MAF=as.numeric(sapply(strsplit(x$MAF,":"),function(x) x[4]))
     x$id=paste(x$CHROM,x$POS)
     x
})

# Left-join one "<sample>.MAF" column per germline sample onto the single-cell tables (by id)
# and set the MAF of variants without a match to 0.
merged_sub_bulk_table_2 <- Reduce(
  function(acc, maf_table) merge(acc, maf_table, by = "id", all.x = TRUE),
  lapply(names(germline_varscan), function(individual) {
    setNames(germline_varscan[[individual]][, c("id", "MAF")], c("id", paste0(individual, ".MAF")))
  }),
  merged_sub_bulk_table_2
)
merged_sub_bulk_table_2[, paste0(names(germline_varscan), ".MAF")] <- local({
  maf <- as.matrix(merged_sub_bulk_table_2[, paste0(names(germline_varscan), ".MAF")])
  maf[is.na(maf)] <- 0
  maf
})

merged_sub_bulk_table <- Reduce(
  function(acc, maf_table) merge(acc, maf_table, by = "id", all.x = TRUE),
  lapply(names(germline_varscan), function(individual) {
    setNames(germline_varscan[[individual]][, c("id", "MAF")], c("id", paste0(individual, ".MAF")))
  }),
  merged_sub_bulk_table
)
merged_sub_bulk_table[, paste0(names(germline_varscan), ".MAF")] <- local({
  maf <- as.matrix(merged_sub_bulk_table[, paste0(names(germline_varscan), ".MAF")])
  maf[is.na(maf)] <- 0
  maf
})

# Compare the germline MAF values of homozygous (a) and non-homozygous (b) single-cell variants:
# density plot of both distributions plus a Wilcoxon rank-sum test.
a=unlist(merged_sub_bulk_table[which(merged_sub_bulk_table$homozygous),paste0(names(germline_varscan),".MAF")])
b=unlist(merged_sub_bulk_table[which(!merged_sub_bulk_table$homozygous),paste0(names(germline_varscan),".MAF")])
# Note: the variable named c holds a density object; calls such as c(0, ...) still resolve to base::c.
c=density(a)
d=density(b)
plot(c,ylim=c(0,max(c$y,d$y)),main="Distribution of MAFs",col="red")
lines(d,col="black")
legend("topright",cex=0.85,bty="n",c(paste0("Homozygous, n=",length(a),", median=",round(median(a),2)),paste0("Heterozygous, n=",length(b),", median=",round(median(b),2))),lty=1,col=c("red","black"))
wilcox.test(a,b)
# The per-sample VarScan tables are no longer needed.
rm(germline_varscan)

# I-cell variants: total, number with individuals_count > 0, and that share.
sum(merged_sub_bulk_table_2$I_cell, na.rm = TRUE)
sum(merged_sub_bulk_table_2$I_cell & merged_sub_bulk_table_2$individuals_count > 0, na.rm = TRUE)
sum(merged_sub_bulk_table_2$I_cell & merged_sub_bulk_table_2$individuals_count > 0, na.rm = TRUE) / sum(merged_sub_bulk_table_2$I_cell, na.rm = TRUE)
# E-cell variants: the same three numbers.
sum(merged_sub_bulk_table_2$E_cell, na.rm = TRUE)
sum(merged_sub_bulk_table_2$E_cell & merged_sub_bulk_table_2$individuals_count > 0, na.rm = TRUE)
sum(merged_sub_bulk_table_2$E_cell & merged_sub_bulk_table_2$individuals_count > 0, na.rm = TRUE) / sum(merged_sub_bulk_table_2$E_cell, na.rm = TRUE)



# Counts of homozygous (column 1) and non-homozygous (column 2) SNVs from Fig_2v2_data$B,
# one row per individuals_count value 0..9.
x=sapply(c(T,F),function(x) sapply(0:9,function(y) sum(Fig_2v2_data$B$homozygous==x & Fig_2v2_data$B$individuals_count==y)))
# Expected vs. observed homozygous frequency, based on the heterozygous frequency:
# expected = (heterozygous SNVs / mean of row 2 of mut_rate_merged_sub_bulk)^2
# y = observed homozygous frequency / expected homozygous frequency
# NOTE(review): row 2 of mut_rate_merged_sub_bulk is presumably "covered_bases" - confirm.
y=(x[,1]/mean(mut_rate_merged_sub_bulk[2,]))/((x[,2]/mean(mut_rate_merged_sub_bulk[2,]))^2)
format(y,scientific = T)

# Assemble the supplement table: raw counts, their ratio, the three frequencies and y.
z=cbind(x,x[,1]/x[,2],(x[,2]/mean(mut_rate_merged_sub_bulk[2,])),(x[,2]/mean(mut_rate_merged_sub_bulk[2,]))^2,(x[,1]/mean(mut_rate_merged_sub_bulk[2,])),y)
rownames(z)=0:9
colnames(z)=c("Homozygous SNVs","Heterozygous SNVs","Homozygous SNVs / Heterozygous SNVs","Heterozygous mutation frequency [Heterozygous SNVs / Sufficiently covered genome bases]","Expected homozygous mutation frequency [ = (Heterozygous mutation frequency)^2 ]","Observed homozygous mutation frequency [= Homozygous SNVs / Sufficiently covered genome bases]"," Observed / expected Homozygous mutation frequency")
write.table(z,"Supplement_Table_homozygous_heterozygous.tsv",sep="\t",quote=F)

# Germline nonsense (stop-gain) GO enrichment as a separate supplement table.
write.table(GO_nonsense_2$germline,"Supplement_Table_stop_gain_wild.tsv",sep="\t",quote=F)


# Load the SCcaller VCF of every single-cell sample and keep sufficiently covered single-nucleotide calls.
# The CELL001 column is split at ":"; field 2 must be "True" and field 3 holds "ref,alt" read counts.
sccaller <- sapply(simplify = FALSE, c(I_cells, E_cells), function(sample) {
  # sccaller=sapply(simplify = F,c("sample1","sample3",E_cells),function(sample) {
  x <- read.table(paste0("sccaller.", sample, ".vcf"), sep = "\t", header = TRUE, skip = 5538, comment.char = "")
  # Only SNVs: REF and ALT must both be a single base. Checking ALT alone also lets deletions
  # through, because a VCF deletion has a multi-base REF and a one-base ALT.
  x <- x[which(nchar(as.character(x$REF)) == 1 & nchar(as.character(x$ALT)) == 1), ]
  # x=x[which(x[,"FILTER"]=="."),]
  cell_fields <- strsplit(as.character(x$CELL001), ":")
  flagged_true <- which(vapply(cell_fields, function(fields) fields[2] == "True", logical(1)))
  x <- x[flagged_true, ]
  read_counts <- strsplit(vapply(cell_fields[flagged_true], function(fields) fields[3], character(1)), ",")
  x$reads_ref <- as.numeric(vapply(read_counts, function(counts) counts[1], character(1)))
  x$reads_alt <- as.numeric(vapply(read_counts, function(counts) counts[2], character(1)))
  x <- x[which((x$reads_ref + x$reads_alt) >= read_threshold), ]

  # Not filtered on genotype here: visual inspection showed that the genotype info makes no sense.
  # x$homo_ref=sapply(strsplit(x$CELL001,":"),function(x) x[1]=="0/0")
  # x=x[which(!x$homo_ref),]
  x
})
sapply(sccaller, nrow)
sapply(merged_sub_bulk, nrow)


# Stricter SCcaller set: drop calls whose first CELL001 field is "0/0" and keep only FILTER == ".".
sccaller_2=sapply(simplify = F,sccaller,function(x) x[which((!sapply(strsplit(x$CELL001,":"),function(x) x[1]=="0/0")) & x[,"FILTER"]=="."),])

sapply(sccaller,nrow); sapply(sccaller_2,nrow);sapply(merged_sub_bulk,nrow)

# Per sample: number of calls, bases covered by at least read_threshold reads (cum_bases),
# calls per covered base, and that value divided by the estimated mitoses of the sample's cell type.
mut_rate_sccaller_2=sapply(names(sccaller_2),function(x) {
     covered_bases=unname(unlist(min_coverage[[x]][which(min_coverage[[x]]$reads==read_threshold),"cum_bases"]))
     c(mutations=nrow(sccaller_2[[x]]),covered_bases=covered_bases,mutations_per_base=nrow(sccaller_2[[x]])/covered_bases,mutation_rate=nrow(sccaller_2[[x]])/covered_bases/unname(estimated_mitoses[sample2celltype[x]]))
})
mut_rate_sccaller_2
mut_rate_merged_sub_bulk

# Per sample: overlap between the SCcaller calls and the merged sub-bulk calls, keyed by the first
# two columns of each table; the relative overlap refers to the smaller of the two call sets.
sapply(names(sccaller_2), function(name) {
  ids_sccaller <- paste(sccaller_2[[name]][, 1], sccaller_2[[name]][, 2])
  ids_sub_bulk <- paste(merged_sub_bulk[[name]][, 1], merged_sub_bulk[[name]][, 2])
  n_shared <- length(intersect(ids_sccaller, ids_sub_bulk))
  c(
    sccaller_2 = length(ids_sccaller),
    merged_sub_bulk = length(ids_sub_bulk),
    intersect = n_shared,
    intersect_relative = n_shared / min(length(ids_sccaller), length(ids_sub_bulk))
  )
})


# Load the SCcaller VCF of every single-cell sample and keep sufficiently covered indel calls,
# using the same genotype, FILTER, "True" flag and coverage criteria as for sccaller_2.
sccaller_indels <- sapply(simplify = FALSE, c(I_cells, E_cells), function(sample) {
  x <- read.table(paste0("sccaller.", sample, ".vcf"), sep = "\t", header = TRUE, skip = 5538, comment.char = "")
  # Only indels: at least one ALT allele differs in length from REF. Testing nchar(ALT) > 1 alone
  # misses deletions (multi-base REF, one-base ALT) and counts multi-allelic SNVs such as "A,C".
  ref_length <- nchar(as.character(x$REF))
  alt_alleles <- strsplit(as.character(x$ALT), ",")
  is_indel <- vapply(seq_along(alt_alleles), function(i) any(nchar(alt_alleles[[i]]) != ref_length[i]), logical(1))
  x <- x[which(is_indel), ]
  cell_fields <- strsplit(as.character(x$CELL001), ":")
  not_hom_ref <- !vapply(cell_fields, function(fields) fields[1] == "0/0", logical(1))
  flagged_true <- vapply(cell_fields, function(fields) fields[2] == "True", logical(1))
  keep <- which(not_hom_ref & x[, "FILTER"] == "." & flagged_true)
  x <- x[keep, ]
  read_counts <- strsplit(vapply(cell_fields[keep], function(fields) fields[3], character(1)), ",")
  x$reads_ref <- as.numeric(vapply(read_counts, function(counts) counts[1], character(1)))
  x$reads_alt <- as.numeric(vapply(read_counts, function(counts) counts[2], character(1)))
  x[which((x$reads_ref + x$reads_alt) >= read_threshold), ]
})
sapply(sccaller_2, nrow)
sapply(sccaller_indels, nrow)

# Per sample: number of indel calls, bases covered by at least read_threshold reads (cum_bases),
# calls per covered base, and that value divided by the estimated mitoses of the sample's cell type.
mut_rate_sccaller_indels=sapply(names(sccaller_indels),function(x) {
     covered_bases=unname(unlist(min_coverage[[x]][which(min_coverage[[x]]$reads==read_threshold),"cum_bases"]))
     c(mutations=nrow(sccaller_indels[[x]]),covered_bases=covered_bases,mutations_per_base=nrow(sccaller_indels[[x]])/covered_bases,mutation_rate=nrow(sccaller_indels[[x]])/covered_bases/unname(estimated_mitoses[sample2celltype[x]]))
})
mut_rate_sccaller_indels

# Binned coverage per single-cell sample: four tab-separated columns without a header.
mapd_raw_counts <- sapply(simplify = FALSE, c(I_cells, E_cells), function(sample) {
  bins <- read.table(paste0(sample, ".srt.rmdup.RG.realigned.BQSR.coverage.tpm.bed"), stringsAsFactors = FALSE, header = FALSE, sep = "\t")
  setNames(bins, c("chrom", "start", "stop", "count"))
})

# Per sample and chromosome: absolute log2 difference between neighbouring bins
# (NA for chromosomes with at most one bin).
mapd_chromsome_wise <- sapply(mapd_raw_counts, simplify = FALSE, function(bins) {
  sapply(unique(bins$chrom), function(chrom) {
    counts <- bins[which(bins$chrom == chrom), "count"]
    if (length(counts) <= 1) {
      return(NA)
    }
    abs(diff(log2(counts)))
  })
})

# Median over all chromosomes of a sample.
sapply(mapd_chromsome_wise, function(per_chrom) median(na.exclude(unlist(per_chrom)))) # MAPD -> all below 2

# Install sigfit from GitHub only when it is missing, so that re-running the script does not
# reinstall the package (and does not need network access) every time.
if (!requireNamespace("sigfit", quietly = TRUE)) {
  devtools::install_github("kgori/sigfit", build_vignettes = TRUE,
                           build_opts = c("--no-resave-data", "--no-manual"))
}
library(sigfit)
data("cosmic_signatures_v2")

# Smoke test of the fitting workflow: simulate 20000 mutations from a known mixture of
# four COSMIC v2 signatures and fit all v2 signatures to the simulated counts.
probs <- c(0.4, 0.3, 0.2, 0.1) %*% as.matrix(cosmic_signatures_v2[c(1, 3, 7, 11), ])
x <- matrix(rmultinom(1, 20000, probs), nrow = 1)
colnames(x) <- colnames(cosmic_signatures_v2)
mcmc_samples_fit <- fit_signatures(counts = x,
                                   signatures = cosmic_signatures_v2,
                                   iter = 2000,
                                   warmup = 1000,
                                   chains = 1,
                                   seed = 1756)
exposures <- retrieve_pars(mcmc_samples_fit,
                           par = "exposures",
                           hpd_prob = 0.90)
names(exposures)
exposures$mean
# Remove the demo objects again.
rm(exposures, probs, x, mcmc_samples_fit)

# Reverse complement of one DNA string.
#
# seq: character; only its first element is used. Allowed characters are A, C, G, T and N
#      (N is its own complement).
# Returns a single string; stops with "Invalid nucleotide" on any other character.
reverse_complement_single_string <- function(seq) {
  complement_of <- c(A = "T", T = "A", C = "G", G = "C", N = "N")
  reversed_bases <- rev(strsplit(seq, NULL)[[1]])
  complemented <- complement_of[reversed_bases]
  # A lookup miss yields NA, which corresponds to a character outside the table.
  if (anyNA(complemented)) {
    stop("Invalid nucleotide")
  }
  paste(unname(complemented), collapse = "")
}


data("cosmic_signatures_v3")
# Add a "mutation_context" column ("XRY>XAY" style) to every table in merged_sub_bulk.
# The bases at positions 3 and 5 of the context string flank the mutated base. Mutations whose
# reference base is a purine (A or G) are reported on the opposite strand, so that the reference
# base of every context is C or T.
merged_sub_bulk <- lapply(merged_sub_bulk, function(calls) {
  upstream <- substr(calls$context, 3, 3)
  downstream <- substr(calls$context, 5, 5)
  from <- paste0(upstream, calls$ref_allele, downstream)
  to <- paste0(upstream, calls$alt_allele, downstream)
  purine_ref <- which(calls$ref_allele == "A" | calls$ref_allele == "G")
  # Uses reverse_complement_single_string (defined above); the previously called name
  # reverse_complement is not defined in this part of the script.
  from[purine_ref] <- vapply(from[purine_ref], reverse_complement_single_string, character(1), USE.NAMES = FALSE)
  to[purine_ref] <- vapply(to[purine_ref], reverse_complement_single_string, character(1), USE.NAMES = FALSE)
  calls$mutation_context <- paste0(from, ">", to)
  calls
})

# Extend the COSMIC v3 signatures by the two signatures SBS.sc_E and SBS.sc_F from sheet 2 of mmc1.xlsx.
# The workbook is downloaded only once; repeated wget calls would pile up mmc1.xlsx.1, mmc1.xlsx.2, ...
if (!file.exists("mmc1.xlsx")) {
  system("wget https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6424819/bin/mmc1.xlsx")
}
library(openxlsx)
x <- read.xlsx("mmc1.xlsx", 2)
# Build the context label in the column-name format of the signature matrix, e.g. "ACA>AAA".
x$mutation_context <- paste0(x$Mutation.Subtype, ">", substr(x$Mutation.Subtype, 1, 1), substr(x$Mutation.Type, 3, 3), substr(x$Mutation.Subtype, 3, 3))
my_mutation_signatures <- cosmic_signatures_v3
# Align the workbook rows to the signature columns by label. An elementwise `==` only works when
# both are already in identical order and silently drops rows otherwise.
stopifnot(all(colnames(my_mutation_signatures) %in% x$mutation_context))
my_mutation_signatures <- rbind(
  my_mutation_signatures,
  t(x[match(colnames(my_mutation_signatures), x$mutation_context), c("SBS.sc_E", "SBS.sc_F")])
)

# Fit the extended signature set to the mutation-context counts of every single-cell sample.
exposures=sapply(names(merged_sub_bulk),simplify = F,function(sample){
     # 1 x n count matrix with one column per context of cosmic_signatures_v3.
     x=matrix(0,nrow=1,ncol=ncol(cosmic_signatures_v3),dimnames = list(NULL,colnames(cosmic_signatures_v3)))
     # Increment the matching column once per mutation; `<<-` updates x in the enclosing function.
     # A context that is not a column name raises a subscript error.
     sapply(merged_sub_bulk[[sample]]$mutation_context,function(mutation_context) x[,mutation_context]<<- x[,mutation_context]+1)
     mcmc_samples_fit <- fit_signatures(counts = x, 
                                        signatures = my_mutation_signatures,
                                        iter = 2000, 
                                        warmup = 1000, 
                                        chains = 1, 
                                        seed = 1756)
     exposures <- retrieve_pars(mcmc_samples_fit, 
                                par = "exposures", 
                                hpd_prob = 0.90)
     exposures
})
# Mean exposures of the two added signatures per sample, and their sum.
sapply(exposures,function(x) x$mean[c("SBS.sc_E","SBS.sc_F")])
sapply(exposures,function(x) x$mean["SBS.sc_E"]+x$mean["SBS.sc_F"])


# Add per-gene "start_loss" / "stop_loss" totals to both tables stored in
# `gene_mutation_count_2`. In each half: sum the two columns per gene, append
# two zero-filled columns to the gene table, then copy the per-gene sums in
# by row name. The sapply() calls doing the copy are evaluated at top level,
# so the values they return are printed.

# --- single cells: all per-sample CDS tables stacked into one data frame
x <- data.frame(do.call(rbind, merged_sub_bulk_CDS))
x <- t(sapply(unique(x$gene), function(g) {
     colSums(x[which(x$gene == g), c("start_loss", "stop_loss")])
}))
y <- gene_mutation_count_2$single_cells
y <- cbind(y, start_loss = 0, stop_loss = 0)
sapply(rownames(x), function(g) {
     y[g, c("start_loss", "stop_loss")] <<- x[g, c("start_loss", "stop_loss")]
})
gene_mutation_count_2$single_cells <- y

# --- germline: same procedure on the germline CDS table
x <- germline_three_of_six_175_CDS
x <- t(sapply(unique(x$gene), function(g) {
     colSums(x[which(x$gene == g), c("start_loss", "stop_loss")])
}))
y <- gene_mutation_count_2$germline
y <- cbind(y, start_loss = 0, stop_loss = 0)
sapply(rownames(x), function(g) {
     y[g, c("start_loss", "stop_loss")] <<- x[g, c("start_loss", "stop_loss")]
})
gene_mutation_count_2$germline <- y

# Number of germline genes with at least one event, per column.
sapply(c("#mutations", "nonsense", "nonsynonymous", "synonymous", "start_loss", "stop_loss"),
       function(column) sum(gene_mutation_count_2$germline[, column] > 0, na.rm = TRUE))

require(ggpubr)

# One row per mutation type with the number of genes that have a count > 0
# in the corresponding column of `counts`, tagged with `analysis_label`.
affected_genes_frame <- function(counts, analysis_label) {
     count_columns <- c("synonymous", "nonsynonymous", "nonsense", "start_loss", "stop_loss")
     data.frame(check.names = FALSE,
                `# genes affected by mutation type` = sapply(count_columns, function(column) {
                     sum(counts[, column] > 0, na.rm = TRUE)
                }),
                `mutation type` = c("synonymous", "nonsynonymous", "stop gain", "start loss", "stop loss"),
                analysis = analysis_label)
}

x <- rbind(affected_genes_frame(gene_mutation_count_2$germline, "In the wild (germline analysis)"),
           affected_genes_frame(gene_mutation_count_2$single_cells, "In captivity (single cell analysis)"))

# Bar plot of the counts, one facet row per analysis, legend removed.
bars <- ggbarplot(x, "mutation type", "# genes affected by mutation type",
                  label = TRUE, fill = "mutation type", lab.size = 6,
                  font.x = c(size = 18, face = "bold"),
                  font.y = c(size = 18, face = "bold"))
facet(bars, facet.by = "analysis", nrow = 2, scales = "free_y",
      panel.labs.font = list(size = 18, face = "bold")) +
     font("axis.text", size = 18) +
     rremove("legend")


# Per-GO-term enrichment of genes that carry mutations of type X.
#
# Arguments:
#   gene_mutation_table  per-gene table with the columns "Gene_ID",
#                        "#covered_bases", "#bases" and the column named by X
#   X                    name of the count column to test (e.g. "stop_loss")
#
# Reads the globals `GO2genes` (list: GO ID -> gene IDs) and `GO_overview`
# (indexed by GO ID, column "Description").
#
# Returns a data frame with one row per GO term, ordered by the one-sided
# ("greater") Fisher p-value, with the FDR-adjusted value in `fisher.fdr`.
GO_X_fun=function(gene_mutation_table,X){
     # genes with at least one X event vs. genes with none; rows where X is
     # NA fall into neither group
     X_genes=gene_mutation_table[which(gene_mutation_table[,X]>0),"Gene_ID"]
     other_genes=gene_mutation_table[which(gene_mutation_table[,X]==0),"Gene_ID"]
     expected_X_frac=length(X_genes)/(length(X_genes)+length(other_genes))
     GO_X=do.call(rbind,sapply(simplify = F,names(GO2genes),function(GO_ID){
          genes=GO2genes[[GO_ID]]
          ret=colSums(gene_mutation_table[which(gene_mutation_table$Gene_ID%in%genes),c("#covered_bases","#bases",X)])
          ret["%covered_bases"]=ret["#covered_bases"]/ret["#bases"]*100
          ret=as.list(ret)
          ret$expected_X_genes=expected_X_frac*length(genes)
          ret$X_genes=length(intersect(genes,X_genes))
          # Count only term genes that really are in the "other" group. Using
          # length(genes)-X_genes would also count term genes that are missing
          # from the table, which distorts the contingency table and can make
          # its last cell negative (fisher.test then fails).
          ret$other_genes=length(intersect(genes,other_genes))
          # 2x2 table, filled column-wise:
          #   X genes in term / X genes outside / other in term / other outside
          ret$fisher.pvalue=fisher.test(matrix(c(ret$X_genes,length(X_genes)-ret$X_genes,ret$other_genes,length(other_genes)-ret$other_genes),nrow=2),alternative="greater")$p.value 
          ret=c(GO_ID=GO_ID,Description=GO_overview[GO_ID,"Description"],`#genes`=length(genes),genes=paste0(genes,collapse = ","),ret)
          ret=data.frame(ret,stringsAsFactors = F,check.names = F)
     }))
     GO_X$fisher.fdr=p.adjust(GO_X$fisher.pvalue,method="fdr")
     GO_X[order(GO_X$fisher.pvalue),]
}
# Make the GO lookup objects and the enrichment function available on the
# workers of `cluster`.
clusterExport(cluster, c("GO2genes", "GO_overview", "GO_X_fun"))
# GO enrichment of the germline table for each loss type; the result is a
# list named "start_loss" / "stop_loss".
GO_germline_X <- lapply(setNames(nm = c("start_loss", "stop_loss")), function(loss_type) {
     GO_X_fun(gene_mutation_count_2$germline, loss_type)
})


# Codon counts over all gene models in hydra2.0_genemodels.nt, using coRdon.
# Install the package only when it is missing instead of on every run.
if(!requireNamespace("coRdon",quietly=TRUE)) BiocManager::install("coRdon")
library(coRdon)
x=readSet(file="hydra2.0_genemodels.nt")
# NOTE: the variables below deliberately keep the names of the coRdon
# functions they are built with; R still resolves the function in call
# position.
codonTable=codonTable(x)
codonCounts=codonCounts(codonTable)
# column sums of the count table: one total per codon
codonCounts_overall=colSums(codonCounts)

# Turn a table of codon occurrences into a numeric vector that has exactly
# the entries `codons`, in that order; codons missing from the table get 0.
complete_codon_counts <- function(codon_table, codons) {
     counts <- setNames(as.numeric(codon_table), names(codon_table))
     absent <- setdiff(codons, names(counts))
     counts <- c(counts, setNames(rep(0, length(absent)), absent))
     counts[codons]
}

# Variant codons of synonymous germline changes (REFAA equals VARAA),
# excluding changes at methionine.
codonCounts_germline_syn <- complete_codon_counts(
     table(germline_three_of_six_175_CDS[which(germline_three_of_six_175_CDS$REFAA == germline_three_of_six_175_CDS$VARAA &
                                               germline_three_of_six_175_CDS$REFAA != "M"), "VARCODON"]),
     names(codonCounts_overall))

# Same selection, pooled over all per-sample single-cell CDS tables.
codonCounts_single_cells_syn <- complete_codon_counts(
     table(unlist(sapply(merged_sub_bulk_CDS, function(cds) {
          cds[which(cds$REFAA == cds$VARAA & cds$REFAA != "M"), "VARCODON"]
     }))),
     names(codonCounts_overall))

# Amino acid -> the distinct reference codons observed for it in the
# germline CDS table.
AA2Codon <- sapply(unique(germline_three_of_six_175_CDS$REFAA), function(aa) {
     unique(germline_three_of_six_175_CDS[which(germline_three_of_six_175_CDS$REFAA == aa), "REFCODON"])
})

# Chi-square p-value per amino acid (M and W excluded) comparing the codon
# counts of two sets, restricted to the codons listed for that amino acid.
codon_usage_pvalues <- function(counts_a, counts_b) {
     sapply(setdiff(names(AA2Codon), c("M", "W")), function(AA) {
          chisq.test(rbind(counts_a[AA2Codon[[AA]]], counts_b[AA2Codon[[AA]]]))$p.value
     })
}

# genome vs. synonymous germline
x <- codon_usage_pvalues(codonCounts_overall, codonCounts_germline_syn)
x
p.adjust(x)

# genome vs. synonymous single cells
x <- codon_usage_pvalues(codonCounts_overall, codonCounts_single_cells_syn)
x
p.adjust(x)

# synonymous germline vs. synonymous single cells
x <- codon_usage_pvalues(codonCounts_germline_syn, codonCounts_single_cells_syn)
x
p.adjust(x)

#Fig. S8
# Long-format table of codon usage: for every amino acid (M and W excluded)
# the share of each of its codons within `counts`, labelled with the name of
# the codon set.
codon_fraction_frame <- function(counts, set_label) {
     amino_acids <- setdiff(names(AA2Codon), c("M", "W"))
     data.frame(check.names = FALSE,
                fraction = unlist(sapply(amino_acids, function(AA) {
                     counts[AA2Codon[[AA]]] / sum(counts[AA2Codon[[AA]]])
                }), use.names = FALSE),
                codon = unlist(AA2Codon[amino_acids]),
                `amino acid` = unlist(sapply(amino_acids, function(a) rep(a, length(AA2Codon[[a]])))),
                set = set_label)
}

x <- rbind(codon_fraction_frame(codonCounts_overall, "genome"),
           codon_fraction_frame(codonCounts_germline_syn, "synonymous wild (germline)"),
           codon_fraction_frame(codonCounts_single_cells_syn, "synonymous captivity (single cells)"))

require(rstatix)

# All (amino acid, set, set) combinations with two different sets ...
y <- expand.grid(unique(x$`amino acid`), unique(x$set), unique(x$set), stringsAsFactors = FALSE)
y <- setNames(y[which(y[, 2] != y[, 3]), ], c("amino acid", "group1", "group2"))
# ... reduced to one row per unordered pair of sets: the key combines the
# amino acid with the two sorted set names, and the first row per key is kept.
y$dummy <- apply(y, 1, function(r) paste(r[1], sort(c(r[2], r[3])), collapse = ","))
y <- do.call(rbind, sapply(simplify = FALSE, unique(y$dummy), function(key) {
     y[which(y$dummy == key), ][1, ]
}))[, 1:3]

# Codon counts per set, keyed by the set labels used in `x$set`.
z <- list(genome = codonCounts_overall,
          `synonymous wild (germline)` = codonCounts_germline_syn,
          `synonymous captivity (single cells)` = codonCounts_single_cells_syn)

# 2 x k count table for one row of `y` (amino acid, group1, group2).
pair_counts <- function(r) {
     rbind(z[[r[2]]][AA2Codon[[r[1]]]], z[[r[3]]][AA2Codon[[r[1]]]])
}
a <- apply(y, 1, pair_counts)
y$p <- apply(y, 1, function(r) chisq.test(pair_counts(r))$p.value)
y$p.adj <- p.adjust(y$p)
y <- add_significance(y)
# Vertical position of the significance label in the plot: the
# captivity-vs-genome comparison is drawn higher than the others.
y$y.position <- ifelse(y$group1 == "synonymous captivity (single cells)" & y$group2 == "genome", 0.95, 0.75)

# Earlier single-call variant of the plot, kept for reference:
#ggbarplot(x,"set","fraction",fill="codon",position = position_dodge(0.8),xlab="",facet.by="amino acid",legend="right")+rotate_x_text(angle = 90) +
#     stat_pvalue_manual(y,label="p.adj.signif",y.position=0.9)

# Facet title for one amino acid: its code followed by the summed codon
# counts in the genome, germline and single-cell sets.
aa_facet_label <- function(AA) {
     paste0(AA, " (n=", sum(codonCounts_overall[AA2Codon[[AA]]]), ", ",
            sum(codonCounts_germline_syn[AA2Codon[[AA]]]), ", ",
            sum(codonCounts_single_cells_syn[AA2Codon[[AA]]]), ")")
}

# The same relabelling is applied to the plot data and to the p-value table
# so that both end up in the same facets; "*" is spelled out as STOP.
x$`amino acid` <- sapply(x$`amino acid`, aa_facet_label)
x$`amino acid` <- sub("[*]", "STOP", x$`amino acid`)

y$`amino acid` <- sapply(y$`amino acid`, aa_facet_label)
y$`amino acid` <- sub("[*]", "STOP", y$`amino acid`)

bars <- ggbarplot(x, "set", "fraction", fill = "codon", position = position_dodge(0.8), legend = "right")
facet(bars, xlab = "", facet.by = "amino acid") +
     rotate_x_text(angle = 90) +
     stat_pvalue_manual(y, label = "p.adj.signif") +
     rremove("xlab")


# Fraction of mutations per mutation context for sample1, ordered by
# substitution type (characters 2 and 6 of the label). Kept because `a` may
# be inspected elsewhere; it is no longer used to build the table below.
x=as.data.frame(table(merged_sub_bulk$sample1$mutation_context))
x=setNames(x$Freq,x$Var1)
x=x/sum(x)
y=paste0(substr(names(x),2,2),substr(names(x),6,6))
a=x[order(y)]

# Context x sample matrix of mutation fractions.
#
# All samples share one context set (the union of the contexts seen in any
# sample) in one fixed order: by substitution type, then alphabetically.
# Padding with the contexts of sample1 only and ordering each sample on its
# own put observed contexts before padded ones, so values could end up under
# the wrong row name, and a context missing from sample1 produced vectors of
# different lengths.
all_contexts=sort(unique(unlist(lapply(merged_sub_bulk,function(calls) as.character(calls$mutation_context)))))
all_contexts=all_contexts[order(paste0(substr(all_contexts,2,2),substr(all_contexts,6,6)),all_contexts)]

mutation_context_table=sapply(merged_sub_bulk,function(calls){
     counts=table(factor(calls$mutation_context,levels=all_contexts))
     # fractions of this sample's mutations; unobserved contexts are 0
     setNames(as.numeric(counts)/sum(counts),all_contexts)
})

library(ggpubr)


# Long-format plot data: one row per (mutation context, sample).
# `mutation_context_table` holds fractions, so they are scaled by 100 to
# match the "% of mutations" axis label. The former stand-alone data frame
# for the first sample was overwritten right away and has been dropped.
# Labels are assigned by column position (assumes the columns are the three
# I-cell samples followed by the three E-cell samples -- TODO confirm).
sample_labels=c("I-cell 1","I-cell 2","I-cell 3","E-cell 1","E-cell 2","E-cell 3")
dfs=lapply(seq_along(sample_labels), function(i){
     df=data.frame(check.names = F,from_to=rownames(mutation_context_table),`% of mutations`=mutation_context_table[,i]*100)
     # substitution type, e.g. "C>T": the middle base before and after
     df$from_to_middle=paste0(substr(df$from_to,2,2),">",substr(df$from_to,6,6))
     df$sample=sample_labels[i]
     df
})
df=do.call(rbind,dfs)


# One colour per substitution type, used for the bars and the x-axis labels.
palette=setNames(c("cyan","black","darkred","grey","green","pink"),unique(df$from_to_middle))
p=ggbarplot(df,x="from_to",y="% of mutations",fill="from_to_middle",palette = palette,xlab=F)+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5,color=palette[df$from_to_middle]))+ theme(legend.title=element_blank())
facet(p,ncol=1,facet.by = "sample")

require(data.table)
# For every file in the working directory whose name matches
# ".min.coverage$": read it as a (reads, bases, fraction) table and sum the
# "bases" column over the rows with reads >= 20. The result is named by file.
min_coverage_2 <- sapply(list.files(".", ".min.coverage$"), function(coverage_file) {
     coverage <- fread(coverage_file, col.names = c("reads", "bases", "fraction"))
     sum(coverage[which(coverage$reads >= 20), "bases"])
})




