#!/usr/bin/env Rscript

library(vcfR)
# All input files below are read relative to this directory.
setwd("~/myproject/inputdata")
GenomeInfo <- read.delim("GenomeInfoOrenil2.txt")
centromers <- read.delim("EstimatedCentromerPositions.txt")

# Genotype calls at the biallelic SNP sites.
benithos <- read.vcfR("BenithosSNPsOnly.final.vcf.gz")
benithos_gt <- extract.gt(benithos)

# One column per sample; every genotype is rewritten with "|" as the allele
# separator so that "0/1" and "0|1" compare equal further down.
sample_calls <- as.data.frame(benithos_gt)
colnames(sample_calls) <- c("WildP", "Mother", "child1", "child2")
sample_calls <- lapply(sample_calls, function(calls) {
  gsub("/", "|", calls)
})
benithos_gt_num <- data.frame(sample_calls)
rownames(benithos_gt_num) <- rownames(benithos_gt)

# Row names are expected to look like "<CHROM>.2_<POS>"; the position is the
# part after ".2_" and the chromosome the part before the first ".2".
# POS must be added before CHROM: later code relies on this column order.
snp_ids <- row.names(benithos_gt_num)
benithos_gt_num$POS <- as.numeric(
  sapply(strsplit(snp_ids, '\\.2_'), function(parts) parts[2])
)
benithos_gt_num$CHROM <- sapply(
  strsplit(snp_ids, '\\.2'),
  function(parts) parts[1]
)

# Table of genomic windows to keep: one row per window with its SNP count,
# start/stop coordinates and chromosome (whitespace-separated file).
allcallsites_df_tokeep <- read.csv("allcallsites_df_tokeep.txt", sep = "")
names(allcallsites_df_tokeep) <- c("row", "sumSNPs", "start", "stop", "CHROM")

# Make sure the coordinate and count columns are numeric.
for (numeric_col in c("start", "stop", "sumSNPs")) {
  allcallsites_df_tokeep[[numeric_col]] <-
    as.numeric(allcallsites_df_tokeep[[numeric_col]])
}

# Keep only sites placed on chromosomes: drop everything labelled "UNPLACED"
# and everything on NC_013663 (presumably the mitochondrial genome -- confirm).
# which() discards rows whose comparison is NA, exactly like subset() does.
on_chromosome <- benithos_gt_num$CHROM != "UNPLACED" &
  benithos_gt_num$CHROM != "NC_013663"
benithos_gt_num_chr <- benithos_gt_num[which(on_chromosome), ]
benithos_gt_num_chr$POS <- as.numeric(benithos_gt_num_chr$POS)

# Keep only the sites that fall inside one of the retained windows.
# For every chromosome, the sites are matched against that chromosome's
# windows (start and stop inclusive) and the hits are stacked per chromosome.
sites_to_keep <- list()
for (chrom in unique(benithos_gt_num_chr$CHROM)) {
  chrom_sites <- benithos_gt_num_chr[
    which(benithos_gt_num_chr$CHROM == chrom),
  ]
  # Literal substring match: the window table may carry a longer chromosome
  # label than the site table, but the name must not be read as a regular
  # expression (a "." would otherwise match any character).
  chrom_windows <- allcallsites_df_tokeep[
    grepl(chrom, allcallsites_df_tokeep$CHROM, fixed = TRUE),
  ]
  # seq_len() yields an empty loop for a chromosome without windows; the
  # former 1:length(...) iterated over 1 and 0 and failed on index 0.
  sites_per_window <- lapply(seq_len(nrow(chrom_windows)), function(w) {
    in_window <- chrom_sites$POS >= chrom_windows$start[w] &
      chrom_sites$POS <= chrom_windows$stop[w]
    chrom_sites[which(in_window), ]
  })
  # rbind of an empty list is NULL, which simply leaves the chromosome out.
  sites_to_keep[[chrom]] <- do.call(rbind, sites_per_window)
}

sites_to_keep_df <- do.call(rbind, sites_to_keep)
# Genotype counts per sample among the retained sites (printed when run).
apply(sites_to_keep_df[1:4], 2, table)

# Rebuild the "<CHROM>.2_<POS>" identifier for every retained site.
sites_to_keep_df$SNP_ID <- paste0(
  sites_to_keep_df$CHROM, ".2_", sites_to_keep_df$POS
)

# Pool of genotypes the simulated brother is drawn from:
# 20.4 % homozygous reference, 19.6 % homozygous alternative, 60 % heterozygous
# (expressed as counts relative to the number of retained sites).
n_retained_sites <- nrow(sites_to_keep_df)
homreflength <- round((n_retained_sites / 100) * 20.4)
homaltlength <- round((n_retained_sites / 100) * 19.6)
hetlength <- round((n_retained_sites / 100) * 60)

ref <- rep("0|0", times = homreflength)
alt <- rep("1|1", times = homaltlength)
het <- rep("0|1", times = hetlength)

# Simulate a brother of the mother and their F1 offspring.
# Each round draws a brother genotype per site from the ref/alt/het pool,
# forces brother and mother to agree at half of all sites, and then creates
# children by drawing one random allele from each parent.
n_simulations <- 10
n_children <- 10

# Return the k-th allele of genotype strings written as "a|b".
# The separator is matched literally; as a regular expression "|" matches the
# empty string and would split the genotype into single characters.
genotype_allele <- function(genotype, k) {
  vapply(
    strsplit(genotype, "|", fixed = TRUE),
    function(alleles) alleles[k],
    character(1)
  )
}

# For every site pick one of the two alleles with equal probability.
pick_random_allele <- function(allele_1, allele_2) {
  take_first <- sample(c(TRUE, FALSE), length(allele_1), replace = TRUE)
  ifelse(take_first, allele_1, allele_2)
}

allbrothers <- vector("list", n_simulations)
allF1 <- vector("list", n_simulations)
for (i in seq_len(n_simulations)) {
  trial2 <- sites_to_keep_df
  trial2$Brother <- sample(
    x = c(ref, alt, het), size = nrow(trial2), replace = TRUE
  )

  # Sites where the comparison is NA (missing mother genotype) end up in
  # neither group, as with subset().
  same_as_mother <- trial2$Brother == trial2$Mother
  trial2_ident <- trial2[which(same_as_mother), ]
  trial2_notident <- trial2[which(!same_as_mother), ]

  # Number of differing sites that must be switched to the mother's genotype
  # so that half of all retained sites are identical.
  # NOTE(review): the target is half of ALL retained sites, while sites with a
  # missing mother genotype were dropped above -- confirm this is intended.
  n_to_match <- floor(nrow(sites_to_keep_df) / 2 - nrow(trial2_ident))
  if (n_to_match < 0) {
    stop(
      "Brother already matches the mother at more than half of the sites; ",
      "cannot reduce identity to 50%.",
      call. = FALSE
    )
  }
  if (n_to_match > nrow(trial2_notident)) {
    stop(
      "Not enough differing sites to raise brother/mother identity to 50%.",
      call. = FALSE
    )
  }

  make_identic <- trial2_notident[
    sample.int(nrow(trial2_notident), n_to_match),
  ]
  make_identic$Brother <- make_identic$Mother
  keep_nonidentic <- trial2_notident[
    !(trial2_notident$SNP_ID %in% make_identic$SNP_ID),
  ]

  # The brother is now identical to the mother at half of the retained sites.
  final_df <- rbind(make_identic, keep_nonidentic, trial2_ident)

  final_df$brother_a1 <- genotype_allele(final_df$Brother, 1)
  final_df$brother_a2 <- genotype_allele(final_df$Brother, 2)
  final_df$mother_a1 <- genotype_allele(final_df$Mother, 1)
  final_df$mother_a2 <- genotype_allele(final_df$Mother, 2)

  # Simulate the children: first allele from the brother, second from the
  # mother. Columns are addressed by name instead of by position, and the
  # draw is vectorised over sites instead of a row-wise apply().
  fullsibF1s <- lapply(seq_len(n_children), function(child) {
    paste0(
      pick_random_allele(final_df$brother_a1, final_df$brother_a2),
      "|",
      pick_random_allele(final_df$mother_a1, final_df$mother_a2)
    )
  })

  # One column per child, one row per site.
  all_F1_fullsib <- do.call(cbind, fullsibF1s)
  allbrothers[[i]] <- final_df$Brother
  allF1[[i]] <- all_F1_fullsib
}
    
  
# Proportion of heterozygous genotypes per simulated child.
# NOTE(review): only `all_F1_fullsib`, i.e. the children of the last
# simulation round, is summarised here; `allF1` holds every round -- confirm
# that this is intended.
genotype_classes <- c("0|0", "0|1", "1|0", "1|1")

# Count every class explicitly (0 when absent). A plain apply(..., table)
# returns a ragged list as soon as one child lacks a class, which breaks the
# column arithmetic below.
overall_het <- vapply(
  seq_len(ncol(all_F1_fullsib)),
  function(child) {
    as.vector(table(factor(all_F1_fullsib[, child], levels = genotype_classes)))
  },
  integer(length(genotype_classes))
)
dimnames(overall_het) <- list(
  genotype_classes,
  paste0("X", seq_len(ncol(overall_het)))
)

# One row per child; data.frame() turns "0|1" into the column name "X0.1".
overall_het <- data.frame(t(overall_het))
overall_het$hetsum <- overall_het$X0.1 + overall_het$X1.0
overall_het$het_proportion <- overall_het$hetsum /
  (overall_het$X0.0 + overall_het$X0.1 + overall_het$X1.0 + overall_het$X1.1)

mean(overall_het$het_proportion)

# Genotype counts per sample at the retained sites, and a two-sided binomial
# test (p = 0.5) per sample on two of the genotype counts.
genotype_columns <- sites_to_keep_df[1:4]
observed_genotypes <- sort(unique(
  unlist(lapply(genotype_columns, as.character), use.names = FALSE)
))

# Count against the full set of observed classes so that every sample has the
# same rows, even if a class is absent in one sample.
contsites <- data.frame(
  lapply(genotype_columns, function(calls) {
    as.vector(table(factor(calls, levels = observed_genotypes)))
  }),
  row.names = observed_genotypes
)

# Look the class up by label; 0 when the class was never observed.
genotype_count <- function(counts, genotype) {
  hit <- match(genotype, observed_genotypes)
  if (is.na(hit)) 0L else counts[hit]
}

# The counts were previously taken as the 1st and 3rd table entry, which is
# "0|0" and "1|1" when the classes are 0|0, 0|1, 1|1 -- presumably the intended
# homozygous-reference vs homozygous-alternative comparison (confirm).
# Selecting by label keeps that comparison even if "1|0" is present.
allbinomtest <- lapply(contsites, function(counts) {
  binom.test(
    c(genotype_count(counts, "0|0"), genotype_count(counts, "1|1")),
    p = 0.5
  )
})

# generate Bxy and Mxy input (Demerelate format)
# `testvcf` is not created anywhere in this script. The code below treats it
# as a genotype matrix with one row per SNP and the four samples as columns,
# holding "a|b" genotype strings. Fail early with a clear message instead of
# an "object not found" error.
if (!exists("testvcf")) {
  stop(
    "`testvcf` is not defined: provide the genotype matrix (SNPs in rows, ",
    "the four samples in columns) before generating the Demerelate input.",
    call. = FALSE
  )
}

# Samples become rows; the two alleles of a genotype are separated by a space
# so that they are read back as two columns.
testvcf <- t(testvcf)
testvcf <- sub("\\|", " ", testvcf)

# The row names hold "<sample> <population>"; written unquoted they are read
# back as two separate columns.
# NOTE(review): "WildI" differs from the sample label "WildP" used when the
# genotype table is built -- confirm which one is intended.
row.names(testvcf) <- c("WildI pop1", "Mother pop1", "child1 pop1", "child2  pop1")
write.table(testvcf, file = "temp.txt", sep = " ",
            row.names = TRUE, col.names = FALSE, quote = FALSE)

# generate header row for Demerelate: every locus appears twice, once per
# allele, with the suffixes "_a" and "_b"
locus_header <- paste0(rep(colnames(testvcf), each = 2), c("_a", "_b"))
remove(testvcf)

# Reading the file back splits the sample/population labels and the alleles
# into separate whitespace-delimited columns.
# NOTE(review): a missing genotype is written as a single "NA" token and would
# shift the columns of that row -- confirm the input has no missing calls.
testvcf2 <- read.table("temp.txt", quote = "\"", comment.char = "")
colnames(testvcf2) <- c("Sample_ID", "population", locus_header)

# 0 as an allele value does not work in Demerelate, so add 1 to each value
testvcf3 <- data.frame(testvcf2[1:2], testvcf2[, 3:ncol(testvcf2)] + 1)
write.table(testvcf3, "BenithosSNPsOnly.Multi.final.DemeIn.txt", append = FALSE, sep = "\t",
            row.names = FALSE, col.names = TRUE, quote = FALSE)
