#!/usr/bin/env Rscript

setwd("~/myproject/inputdata")
library(vcfR)

# Auxiliary tables read from the working directory.
GenomeInfo <- read.delim("GenomeInfoOrenil2.txt")
centromers <- read.delim("EstimatedCentromerPositions.txt")

# Biallelic SNP calls; extract.gt() returns one genotype column per sample.
benithos <- read.vcfR("BenithosSNPsOnly.final.vcf.gz")
benithos_gt <- extract.gt(benithos)

# Genotype table with one row per site. "/" separators are rewritten to "|"
# so that every genotype uses the same separator.
benithos_gt_num <- as.data.frame(benithos_gt)
colnames(benithos_gt_num) <- c("WildP", "Mother", "child1", "child2")
unified_calls <- lapply(
  benithos_gt_num,
  function(calls) gsub("/", "|", calls)
)
benithos_gt_num <- data.frame(unified_calls)
rownames(benithos_gt_num) <- rownames(benithos_gt)

# Recover position and chromosome from the site identifiers (the row names).
# NOTE(review): this assumes identifiers of the form "<chrom>.2_<pos>", i.e.
# a sequence-version suffix of ".2" - confirm against the VCF.
site_ids <- row.names(benithos_gt_num)
pos_text <- vapply(
  strsplit(site_ids, "\\.2_"),
  function(pieces) pieces[2],
  character(1)
)
benithos_gt_num$POS <- as.numeric(pos_text)
benithos_gt_num$CHROM <- vapply(
  strsplit(site_ids, "\\.2"),
  function(pieces) pieces[1],
  character(1)
)

### Windows to keep; used further down to remove SNPs in low quality windows.
allcallsites_df_tokeep <- read.csv("allcallsites_df_tokeep.txt", sep = "")
colnames(allcallsites_df_tokeep) <- c(
  "row", "sumSNPs", "start", "stop", "CHROM"
)
# Window bounds and SNP counts are coerced to numeric in one pass.
for (numeric_col in c("start", "stop", "sumSNPs")) {
  allcallsites_df_tokeep[[numeric_col]] <-
    as.numeric(allcallsites_df_tokeep[[numeric_col]])
}

# Consider only SNPs placed on chromosomes: drop everything labelled
# "UNPLACED" and the "NC_013663" sequence.
# NOTE(review): NC_013663 is presumably the mitochondrial genome - confirm.
excluded_sequences <- c("UNPLACED", "NC_013663")
on_chromosome <- !is.na(benithos_gt_num$CHROM) &
  !(benithos_gt_num$CHROM %in% excluded_sequences)
benithos_gt_num_chr <- benithos_gt_num[which(on_chromosome), , drop = FALSE]
benithos_gt_num_chr$POS <- as.numeric(benithos_gt_num_chr$POS)

# Return the rows of `fragment` whose POS lies inside any row of `windows`
# (inclusive on both ends). One chunk is produced per window, in window order,
# so a site covered by several windows appears once per window, exactly as in
# the previous loop-based version. A chromosome without any window yields a
# zero-row table instead of failing.
sites_within_windows <- function(fragment, windows) {
  if (nrow(windows) == 0L) {
    return(fragment[0, , drop = FALSE])
  }
  per_window <- lapply(seq_len(nrow(windows)), function(j) {
    inside <- fragment$POS >= windows$start[j] & fragment$POS <= windows$stop[j]
    # which() drops rows whose comparison is NA (missing POS or bounds).
    fragment[which(inside), , drop = FALSE]
  })
  do.call(rbind, per_window)
}

chromosomes <- unique(benithos_gt_num_chr$CHROM)
sites_to_keep <- lapply(
  stats::setNames(chromosomes, chromosomes),
  function(chrom) {
    myfragment <- benithos_gt_num_chr[
      which(benithos_gt_num_chr$CHROM == chrom), ,
      drop = FALSE
    ]
    # Window chromosome labels are matched by substring, as before, but the
    # name is now taken literally rather than interpreted as a regex.
    windows <- allcallsites_df_tokeep[
      grepl(chrom, allcallsites_df_tokeep$CHROM, fixed = TRUE), ,
      drop = FALSE
    ]
    sites_within_windows(myfragment, windows)
  }
)

sites_to_keep_df <- do.call(rbind, sites_to_keep)
# Genotype counts per individual among the retained sites (first 4 columns).
apply(sites_to_keep_df[1:4], 2, table)


# Site-level INFO annotations, one numeric column per field, keyed by SNP_ID.
info_fields <- c(
  "DP", "ExcessHet", "FS", "MQ", "MQRankSum",
  "QD", "ReadPosRankSum", "SOR", "MLEAC", "MLEAF"
)
infofields_full <- do.call(
  cbind,
  lapply(info_fields, function(field) extract.info(benithos, field))
)
colnames(infofields_full) <- info_fields
infofields_full <- as.data.frame(infofields_full)
# as.character() first so the conversion is also correct if the columns were
# created as factors.
infofields_full[] <- lapply(
  infofields_full,
  function(x) as.numeric(as.character(x))
)
# Rows are labelled with the site identifiers of the genotype table; this
# relies on both tables being in the same (VCF) order.
rownames(infofields_full) <- rownames(benithos_gt_num)
infofields_full$SNP_ID <- rownames(infofields_full)

# Rebuild the site identifier "<CHROM>.2_<POS>" for the retained sites.
# POS is a double, and pasting it directly renders round positions in
# scientific notation (100000 becomes "1e+05"), which never matches the
# identifiers in infofields_full; format it as a plain integer string instead.
sites_to_keep_df$SNP_ID <- paste0(
  sites_to_keep_df$CHROM, ".2_", sprintf("%.0f", sites_to_keep_df$POS)
)
sites_to_keep_df_withstats <- merge(
  sites_to_keep_df, infofields_full,
  by = "SNP_ID", all.x = TRUE, all.y = FALSE
)


# Extract one per-sample FORMAT element (e.g. "DP" or "GQ") from `vcf` as a
# numeric data frame with the given column names, rows labelled by `site_ids`
# and an extra SNP_ID column holding those labels.
#
# The values are converted via as.character() because as.data.frame() on a
# character matrix creates factors under R < 4.0 defaults, and as.numeric() on
# a factor returns level codes rather than the recorded values.
extract_sample_metric <- function(vcf, element, column_names, site_ids) {
  values <- extract.gt(vcf, element = element)
  colnames(values) <- column_names
  values <- as.data.frame(values, stringsAsFactors = FALSE)
  values[] <- lapply(values, function(x) as.numeric(as.character(x)))
  rownames(values) <- site_ids
  values$SNP_ID <- rownames(values)
  values
}

ind_depth <- extract_sample_metric(
  benithos, "DP",
  c("DP_wildI", "DP_mother", "DP_child1", "DP_child2"),
  rownames(benithos_gt_num)
)
ind_GQ <- extract_sample_metric(
  benithos, "GQ",
  c("GQ_wildI", "GQ_mother", "GQ_child1", "GQ_child2"),
  rownames(benithos_gt_num)
)

# Attach per-individual depth and genotype quality to the retained sites.
sites_to_keep_df_withstats <- merge(
  sites_to_keep_df_withstats, ind_depth,
  by = "SNP_ID", all.x = TRUE, all.y = FALSE
)
sites_to_keep_df_withstats <- merge(
  sites_to_keep_df_withstats, ind_GQ,
  by = "SNP_ID", all.x = TRUE, all.y = FALSE
)

# Per-individual depth split by that individual's genotype, one panel each.
pdf("IndividualCoverageSiteTypes.pdf")
par(mfrow = c(2, 2))
# Genotype column -> matching depth column. The formulas are assembled from
# the same expressions as before so the default axis labels do not change.
depth_by_genotype <- c(
  WildP = "DP_wildI",
  Mother = "DP_mother",
  child1 = "DP_child1",
  child2 = "DP_child2"
)
for (genotype_col in names(depth_by_genotype)) {
  panel_formula <- as.formula(paste0(
    "sites_to_keep_df_withstats$", depth_by_genotype[[genotype_col]],
    " ~ sites_to_keep_df_withstats$", genotype_col
  ))
  boxplot(panel_formula)
}
dev.off()



#######################
## offspring vs. mother genotype comparison
######################

# Split genotype strings of the form "a|b" into an n x 2 character matrix of
# alleles. The separator is matched literally (fixed = TRUE); a regex "|" is an
# empty alternation and would split the string into single characters, which
# only works by accident for single-digit alleles.
split_genotype_alleles <- function(genotypes) {
  allele_list <- strsplit(genotypes, "|", fixed = TRUE)
  if (any(lengths(allele_list) != 2L)) {
    stop("expected diploid genotypes of the form 'a|b'", call. = FALSE)
  }
  matrix(as.character(unlist(allele_list)), ncol = 2L, byrow = TRUE)
}

# Compare one offspring column of `sites` with the "Mother" column.
#
# sites       - table with columns WildP, Mother, child1, child2 (+ stats)
# child_col   - name of the offspring genotype column to analyse
# sibling_col - name of the other offspring column (dropped from notsame_IN)
#
# Genotypes are compared as character strings, so the result does not depend
# on whether the columns are factors or on how their levels are ordered, and
# `sites` itself is never modified. Sites where either call is missing are in
# neither `same` nor `notsame`.
#
# Returns a list with:
#   same             - rows where offspring and mother genotypes are identical
#   notsame          - rows where they differ
#   notsame_IN       - notsame without WildP/sibling, genotypes as character
#   testDF           - per differing site: X1/X2 = is offspring allele 1/2
#                      present in the mother, IN = number of offspring alleles
#                      absent from the mother, SNP = row name in `sites`
#   not_in_moth      - rows of testDF with IN > 0
#   not_in_moth_full - rows of `sites` listed in not_in_moth
compare_offspring_to_mother <- function(sites, child_col, sibling_col) {
  mother_gt <- as.character(sites[["Mother"]])
  child_gt <- as.character(sites[[child_col]])
  is_same <- child_gt == mother_gt # NA where either genotype is missing

  same <- sites[which(is_same), , drop = FALSE]
  notsame <- sites[which(!is_same), , drop = FALSE]

  notsame_IN <- notsame
  notsame_IN[["WildP"]] <- NULL
  notsame_IN[[sibling_col]] <- NULL
  notsame_IN[["Mother"]] <- as.character(notsame_IN[["Mother"]])
  notsame_IN[[child_col]] <- as.character(notsame_IN[[child_col]])

  child_alleles <- split_genotype_alleles(notsame_IN[[child_col]])
  mother_alleles <- split_genotype_alleles(notsame_IN[["Mother"]])
  # The mother's allele vectors recycle down the rows of the offspring matrix,
  # so each offspring allele is tested against both alleles of its own mother.
  in_mother <- child_alleles == mother_alleles[, 1] |
    child_alleles == mother_alleles[, 2]

  testDF <- data.frame(X1 = in_mother[, 1], X2 = in_mother[, 2])
  testDF$IN <- rowSums(!in_mother)
  testDF$SNP <- row.names(notsame_IN)

  not_in_moth <- testDF[which(testDF$IN > 0), , drop = FALSE]
  not_in_moth_full <- sites[
    rownames(sites) %in% not_in_moth$SNP, ,
    drop = FALSE
  ]

  list(
    same = same,
    notsame = notsame,
    notsame_IN = notsame_IN,
    testDF = testDF,
    not_in_moth = not_in_moth,
    not_in_moth_full = not_in_moth_full
  )
}

#######################
## offspring 1
######################
child1_result <- compare_offspring_to_mother(
  sites_to_keep_df_withstats, "child1", "child2"
)
child1_same <- child1_result[["same"]]
dim(child1_same)
dim(sites_to_keep_df_withstats) - dim(child1_same)
child1_notsame <- child1_result[["notsame"]]
## how many sites are incompatible with parthenogenesis/selfing
child1_notsame_IN <- child1_result[["notsame_IN"]]
child1testDF <- child1_result[["testDF"]]
## original author's note: 12926 sites with an allele not in the mother
not_in_moth_child1 <- child1_result[["not_in_moth"]]
## original author's note: ~ 3.8%
nrow(not_in_moth_child1) / nrow(sites_to_keep_df_withstats)
not_in_moth_child1_full <- child1_result[["not_in_moth_full"]]

#######################
## offspring 2
######################
child2_result <- compare_offspring_to_mother(
  sites_to_keep_df_withstats, "child2", "child1"
)
child2_same <- child2_result[["same"]]
dim(child2_same)
dim(sites_to_keep_df_withstats) - dim(child2_same)
child2_notsame <- child2_result[["notsame"]]
## how many sites are incompatible with parthenogenesis/selfing
child2_notsame_IN <- child2_result[["notsame_IN"]]
child2testDF <- child2_result[["testDF"]]
not_in_moth_child2 <- child2_result[["not_in_moth"]]
nrow(not_in_moth_child2) / nrow(sites_to_keep_df_withstats)
not_in_moth_child2_full <- child2_result[["not_in_moth_full"]]

# Sites that are incompatible with the mother in both offspring.
intersec_notinbothchild <- intersect(
  rownames(not_in_moth_child1_full), rownames(not_in_moth_child2_full)
)
length(intersec_notinbothchild)


# Depth at the sites carrying an allele absent from the mother, grouped by the
# offspring genotype: offspring depth (top row) and mother depth (bottom row).
pdf("NotInParentDepth.pdf")
par(mfrow = c(2, 2))
depth_panels <- list(
  c(frame = "not_in_moth_child1_full", depth = "DP_child1",
    genotype = "child1", title = "Child1"),
  c(frame = "not_in_moth_child2_full", depth = "DP_child2",
    genotype = "child2", title = "Child2"),
  c(frame = "not_in_moth_child1_full", depth = "DP_mother",
    genotype = "child1", title = "DP mother at impossible sites Child1"),
  c(frame = "not_in_moth_child2_full", depth = "DP_mother",
    genotype = "child2", title = "DP mother at impossible sites Child2")
)
for (panel in depth_panels) {
  # Built from the same expressions as before, so default axis labels match.
  panel_formula <- as.formula(paste0(
    panel[["frame"]], "$", panel[["depth"]],
    " ~ ",
    panel[["frame"]], "$", panel[["genotype"]]
  ))
  boxplot(panel_formula, varwidth = TRUE, main = panel[["title"]])
}
dev.off()


## Reasoning behind the checks below:
## - at an impossible 0|1 site the parent is either 0|0 or 1|1;
## - at an impossible 0|0 site the parent is 1|1, and at an impossible 1|1
##   site the parent is 0|0. In both cases check whether the parent has lower
##   DP there, since a true 0|1 call in the parent might have been missed.

## Compare mother coverage at the impossible sites with the possible ones;
## first, the offspring genotype counts at the impossible sites.
table(not_in_moth_child1_full$child1)
table(not_in_moth_child2_full$child2)


## Check whether the sites that are possible have higher coverage (and
## genotype quality) in the parent and in the offspring than the impossible
## ones.

# "Good" sites for an offspring: every retained site that is not in its
# impossible set.
goodsiteschild1 <- subset(
  sites_to_keep_df_withstats,
  !(SNP_ID %in% not_in_moth_child1_full$SNP_ID)
)
goodsiteschild2 <- subset(
  sites_to_keep_df_withstats,
  !(SNP_ID %in% not_in_moth_child2_full$SNP_ID)
)

# Largest depth across all groups. na.rm = TRUE so that a single missing depth
# does not turn the result into NA. Not used by the plots below, which keep a
# fixed y range of 0-40 for the depth panels.
ymax <- max(
  not_in_moth_child1_full$DP_child1, not_in_moth_child1_full$DP_mother,
  not_in_moth_child2_full$DP_mother, not_in_moth_child2_full$DP_child2,
  goodsiteschild1$DP_child1, goodsiteschild1$DP_mother,
  goodsiteschild2$DP_child2, goodsiteschild2$DP_mother,
  na.rm = TRUE
)

pdf("CoverageAndGQImpossPossOffspringParent.pdf", paper = "a4")
par(mfrow = c(2, 2))
# Top row: depth (DP).
boxplot(
  not_in_moth_child1_full$DP_child1, not_in_moth_child1_full$DP_mother,
  goodsiteschild1$DP_child1, goodsiteschild1$DP_mother,
  main = "Child1",
  names = c("DP_child1_impossible", "DP_mother_impossible",
            "DP_child1_possible", "DP_mother_possible"),
  las = 2, outline = FALSE, ylim = c(0, 40)
)
boxplot(
  not_in_moth_child2_full$DP_child2, not_in_moth_child2_full$DP_mother,
  goodsiteschild2$DP_child2, goodsiteschild2$DP_mother,
  main = "Child2",
  names = c("DP_child2_impossible", "DP_mother_impossible",
            "DP_child2_possible", "DP_mother_possible"),
  las = 2, outline = FALSE, ylim = c(0, 40)
)
# Bottom row: genotype quality (GQ). These panels were previously labelled
# "DP_..." although they show the GQ columns.
boxplot(
  not_in_moth_child1_full$GQ_child1, not_in_moth_child1_full$GQ_mother,
  goodsiteschild1$GQ_child1, goodsiteschild1$GQ_mother,
  main = "Child1",
  names = c("GQ_child1_impossible", "GQ_mother_impossible",
            "GQ_child1_possible", "GQ_mother_possible"),
  las = 2, outline = FALSE
)
boxplot(
  not_in_moth_child2_full$GQ_child2, not_in_moth_child2_full$GQ_mother,
  goodsiteschild2$GQ_child2, goodsiteschild2$GQ_mother,
  main = "Child2",
  names = c("GQ_child2_impossible", "GQ_mother_impossible",
            "GQ_child2_possible", "GQ_mother_possible"),
  las = 2, outline = FALSE
)
dev.off()

# One-sided tests: is depth lower at impossible sites than at good sites?
wilcox.test(not_in_moth_child1_full$DP_child1, goodsiteschild1$DP_child1, alternative = "less")
wilcox.test(not_in_moth_child1_full$DP_mother, goodsiteschild1$DP_mother, alternative = "less")
wilcox.test(not_in_moth_child2_full$DP_child2, goodsiteschild2$DP_child2, alternative = "less")
wilcox.test(not_in_moth_child2_full$DP_mother, goodsiteschild2$DP_mother, alternative = "less")

# Same question for genotype quality.
wilcox.test(not_in_moth_child1_full$GQ_child1, goodsiteschild1$GQ_child1, alternative = "less")
wilcox.test(not_in_moth_child1_full$GQ_mother, goodsiteschild1$GQ_mother, alternative = "less")
wilcox.test(not_in_moth_child2_full$GQ_child2, goodsiteschild2$GQ_child2, alternative = "less")
wilcox.test(not_in_moth_child2_full$GQ_mother, goodsiteschild2$GQ_mother, alternative = "less")


# Site-level quality annotations for offspring 1: impossible sites versus
# good sites, one boxplot panel per annotation.
pdf("StatsOverallIncomOffspring1.pdf", paper = "a4")
par(mfrow = c(3, 4))
# Annotation column -> panel title.
child1_panel_titles <- c(
  DP = "Depth", QD = "QD", ExcessHet = "ExcessHet", FS = "FS", MQ = "MQ",
  MQRankSum = "MQRankSum", ReadPosRankSum = "ReadPosRankSum", SOR = "SOR",
  MLEAC = "MLEAC", MLEAF = "MLEAF"
)
for (stat_col in names(child1_panel_titles)) {
  boxplot(
    not_in_moth_child1_full[[stat_col]], goodsiteschild1[[stat_col]],
    main = child1_panel_titles[[stat_col]],
    names = c("impossible", "good")
  )
}
dev.off()

# One-sided Wilcoxon tests of impossible versus good sites. The first five
# annotations are tested for "less", FS and SOR for "greater".
child1_alternatives <- c(
  DP = "less", QD = "less", MQ = "less", MQRankSum = "less",
  ReadPosRankSum = "less", FS = "greater", SOR = "greater"
)
for (stat_col in names(child1_alternatives)) {
  child1_test <- wilcox.test(
    not_in_moth_child1_full[[stat_col]], goodsiteschild1[[stat_col]],
    alternative = child1_alternatives[[stat_col]]
  )
  # Keep the "data:" line of the printed result as it was with `$` access.
  child1_test$data.name <- paste0(
    "not_in_moth_child1_full$", stat_col, " and goodsiteschild1$", stat_col
  )
  print(child1_test)
}


# Site-level quality annotations for offspring 2: impossible sites versus
# good sites, one boxplot panel per annotation.
pdf("StatsOverallIncomOffspring2.pdf", paper = "a4")
par(mfrow = c(3, 4))
# Annotation column -> panel title.
child2_panel_titles <- c(
  DP = "Depth", QD = "QD", ExcessHet = "ExcessHet", FS = "FS", MQ = "MQ",
  MQRankSum = "MQRankSum", ReadPosRankSum = "ReadPosRankSum", SOR = "SOR",
  MLEAC = "MLEAC", MLEAF = "MLEAF"
)
for (stat_col in names(child2_panel_titles)) {
  boxplot(
    not_in_moth_child2_full[[stat_col]], goodsiteschild2[[stat_col]],
    main = child2_panel_titles[[stat_col]],
    names = c("impossible", "good")
  )
}
dev.off()

# One-sided Wilcoxon tests of impossible versus good sites. The first five
# annotations are tested for "less", FS and SOR for "greater".
child2_alternatives <- c(
  DP = "less", QD = "less", MQ = "less", MQRankSum = "less",
  ReadPosRankSum = "less", FS = "greater", SOR = "greater"
)
for (stat_col in names(child2_alternatives)) {
  child2_test <- wilcox.test(
    not_in_moth_child2_full[[stat_col]], goodsiteschild2[[stat_col]],
    alternative = child2_alternatives[[stat_col]]
  )
  # Keep the "data:" line of the printed result as it was with `$` access.
  child2_test$data.name <- paste0(
    "not_in_moth_child2_full$", stat_col, " and goodsiteschild2$", stat_col
  )
  print(child2_test)
}
