#!/usr/bin/env Rscript

library(vcfR)
library(grDevices)
library(rstatix)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(tidyr)

# All input files below are read relative to this directory.
setwd("~/myproject/inputdata")

# Estimated centromere positions. `chromshort` holds the part of the
# chromosome name that precedes the first ".2" match.
centromers <- read.delim("EstimatedCentromerPositions.txt")
centromers$chrom <- as.character(centromers$chrom)
centromers$chromshort <- sapply(
  strsplit(centromers$chrom, "\\.2"),
  function(pieces) pieces[1]
)

# Genome information table; `chromshort` is derived from the RefSeq
# accession in the same way as for the centromere table.
GenomeInfo <- read.delim("GenomeInfoOrenil2.txt")
GenomeInfo$RefSeq.Accn <- as.character(GenomeInfo$RefSeq.Accn)
GenomeInfo$chromshort <- sapply(
  strsplit(GenomeInfo$RefSeq.Accn, "\\.2"),
  function(pieces) pieces[1]
)

# Read the VCF and extract the genotype matrix (one row per SNP, one column
# per individual).
benithos <- read.vcfR("BenithosSNPsOnly.final.vcf.gz")
benithos_gt <- extract.gt(benithos)

# Genotype table with fixed sample names; every "/" separator is rewritten
# to "|" so that all genotypes share one notation.
benithos_gt_num <- as.data.frame(benithos_gt)
colnames(benithos_gt_num) <- c("WildP", "Mother", "child1", "child2")
benithos_gt_num <- data.frame(
  lapply(benithos_gt_num, function(genotype) gsub("/", "|", genotype))
)
rownames(benithos_gt_num) <- rownames(benithos_gt)

# The row names are split on "_": the third field becomes POS and the second
# field, prefixed with "NC_", becomes CHROM.
snp_id_fields <- strsplit(row.names(benithos_gt_num), "\\_")
benithos_gt_num$POS <- sapply(snp_id_fields, function(fields) fields[3])
benithos_gt_num$CHROM <- paste0(
  "NC_",
  sapply(snp_id_fields, function(fields) fields[2])
)

# Rows whose name contains "UNPL" are labelled as unplaced.
is_unplaced <- grepl("UNPL", rownames(benithos_gt_num))
benithos_gt_num$CHROM[is_unplaced] <- "UNPLACED"
benithos_gt_num$SNP_ID <- rownames(benithos_gt_num)

# Table of the windows to keep (used below to drop low-quality windows).
allcallsites_df_tokeep <- read.csv("allcallsites_df_tokeep.txt", sep = "")
colnames(allcallsites_df_tokeep) <- c("row", "sumSNPs", "start", "stop", "CHROM")
for (window_col in c("start", "stop", "sumSNPs")) {
  allcallsites_df_tokeep[[window_col]] <-
    as.numeric(allcallsites_df_tokeep[[window_col]])
}

# Consider only SNPs placed on chromosomes: drop the "UNPLACED" label and the
# sequence labelled "NC_013663".
# NOTE(review): CHROM values elsewhere in this script carry a version suffix
# (e.g. "NC_031965.2"); confirm that "NC_013663" matches the intended rows.
on_chromosome <- benithos_gt_num$CHROM != "UNPLACED" &
  benithos_gt_num$CHROM != "NC_013663"
benithos_gt_num_chr <- benithos_gt_num[which(on_chromosome), , drop = FALSE]
benithos_gt_num_chr$POS <- as.numeric(benithos_gt_num_chr$POS)

# Keep only the SNPs that fall inside one of the retained windows of their
# chromosome. The result, `sites_to_keep_df`, stacks the kept SNPs of all
# chromosomes.
# Note: rbind() on the named list prefixes the row names with the chromosome
# name, so downstream code should treat the SNP_ID column as the stable
# identifier.
sites_to_keep <- list()
for (i in unique(benithos_gt_num_chr$CHROM)) {
  myfragment <- benithos_gt_num_chr[
    which(benithos_gt_num_chr$CHROM == i), , drop = FALSE
  ]

  # fixed = TRUE: the chromosome name may contain ".", which must be matched
  # literally rather than as a regex wildcard.
  windows <- allcallsites_df_tokeep[
    grepl(i, allcallsites_df_tokeep$CHROM, fixed = TRUE), , drop = FALSE
  ]

  # A chromosome without any retained window contributes no sites.
  if (nrow(windows) == 0L) {
    sites_to_keep[[i]] <- myfragment[0, , drop = FALSE]
    next
  }

  # seq_len() instead of 1:length(): never iterates over a non-existent window.
  out_count <- lapply(seq_len(nrow(windows)), function(j) {
    in_window <- myfragment$POS >= windows$start[j] &
      myfragment$POS <= windows$stop[j]
    # which() drops comparisons that are NA, as subset() did.
    myfragment[which(in_window), , drop = FALSE]
  })

  sites_to_keep[[i]] <- do.call(rbind, out_count)
}

sites_to_keep_df <- do.call(rbind, sites_to_keep)


# ---- Mendelian consistency of each child against the mother ----

# Split diploid genotype strings such as "0|1" into a two-column character
# matrix (first allele, second allele).
# fixed = TRUE matters: as a regular expression "|" is an empty alternation
# that splits every single character, which only happened to work for
# single-digit alleles.
split_genotype_alleles <- function(genotypes) {
  parts <- strsplit(as.character(genotypes), "|", fixed = TRUE)
  cbind(
    vapply(parts, function(p) p[1], character(1)),
    vapply(parts, function(p) p[2], character(1))
  )
}

# Compare one child column of `sites` with the `Mother` column.
#
# Returns a list with:
#   same    - rows where child and mother genotype strings are identical
#   notsame - rows where they differ (rows with a missing genotype are dropped)
#   test    - one row per `notsame` site: X1 / X2 tell whether the child's
#             first / second allele occurs in the mother, IN counts the child
#             alleles absent from the mother, SNP is the row name of the site
check_child_against_mother <- function(sites, child_col) {
  child_gt <- as.character(sites[[child_col]])
  mother_gt <- as.character(sites$Mother)

  same <- sites[which(child_gt == mother_gt), , drop = FALSE]
  notsame <- sites[which(child_gt != mother_gt), , drop = FALSE]

  mother_alleles <- split_genotype_alleles(notsame$Mother)
  child_alleles <- split_genotype_alleles(notsame[[child_col]])

  in_mother <- cbind(
    child_alleles[, 1] == mother_alleles[, 1] |
      child_alleles[, 1] == mother_alleles[, 2],
    child_alleles[, 2] == mother_alleles[, 1] |
      child_alleles[, 2] == mother_alleles[, 2]
  )

  test <- data.frame(X1 = in_mother[, 1], X2 = in_mother[, 2])
  test$IN <- rowSums(!in_mother)
  test$SNP <- row.names(notsame)

  list(same = same, notsame = notsame, test = test)
}

# The bare expressions below are kept at top level on purpose so that they
# are auto-printed when the script runs.

## child 1
child1_check <- check_child_against_mother(sites_to_keep_df, "child1")
child1_same <- child1_check$same
dim(child1_same)
(nrow(child1_same) / nrow(sites_to_keep_df)) * 100
dim(sites_to_keep_df) - dim(child1_same)
child1_notsame <- child1_check$notsame
child1testDF <- child1_check$test
# sites where at least one / both child alleles are absent from the mother
any_not_in_moth_child1 <- child1testDF[which(child1testDF$IN > 0), , drop = FALSE]
nrow(any_not_in_moth_child1) / nrow(sites_to_keep_df) * 100
both_not_in_moth_child1 <- child1testDF[which(child1testDF$IN > 1), , drop = FALSE]
nrow(both_not_in_moth_child1) / nrow(sites_to_keep_df) * 100

## child 2
child2_check <- check_child_against_mother(sites_to_keep_df, "child2")
child2_same <- child2_check$same
dim(child2_same)
(nrow(child2_same) / nrow(sites_to_keep_df)) * 100
dim(sites_to_keep_df) - dim(child2_same)
child2_notsame <- child2_check$notsame
child2testDF <- child2_check$test
any_not_in_moth_child2 <- child2testDF[which(child2testDF$IN > 0), , drop = FALSE]
# Same denominator as for child 1 (the window-filtered sites); the original
# divided by the unfiltered table here, which made the two children
# incomparable.
nrow(any_not_in_moth_child2) / nrow(sites_to_keep_df) * 100
both_not_in_moth_child2 <- child2testDF[which(child2testDF$IN > 1), , drop = FALSE]
nrow(both_not_in_moth_child2) / nrow(sites_to_keep_df) * 100

# Drop every site at which both alleles of child1, or both alleles of child2,
# are absent from the mother, then write the remaining positions to disk.
impossible_snps <- c(both_not_in_moth_child1$SNP, both_not_in_moth_child2$SNP)
is_possible <- !(rownames(sites_to_keep_df) %in% impossible_snps)
benithos_gt_num_filt_df_2 <- sites_to_keep_df[is_possible, ]
nrow(benithos_gt_num_filt_df_2) #335753

positions_to_keep <- cbind(
  benithos_gt_num_filt_df_2$POS,
  benithos_gt_num_filt_df_2$CHROM,
  benithos_gt_num_filt_df_2$SNP_ID
)
write.table(
  positions_to_keep,
  file = "PositionsToKeepFiltImpossible.txt",
  sep = "\t",
  col.names = TRUE,
  quote = FALSE
)

# Per-individual read depth (DP) as a numeric data frame, keyed by SNP_ID.
depth_matrix <- extract.gt(benithos, element = "DP")
ind_depth <- as.data.frame(apply(as.data.frame(depth_matrix), 2, as.numeric))
names(ind_depth) <- c("DP_wildI", "DP_mother", "DP_child1", "DP_child2")
rownames(ind_depth) <- rownames(benithos_gt_num)
ind_depth$SNP_ID <- rownames(ind_depth)

# Per-individual genotype quality (GQ), built the same way.
qual_matrix <- extract.gt(benithos, element = "GQ")
ind_qual <- as.data.frame(apply(as.data.frame(qual_matrix), 2, as.numeric))
names(ind_qual) <- c("GQ_wildI", "GQ_mother", "GQ_child1", "GQ_child2")
rownames(ind_qual) <- rownames(benithos_gt_num)
ind_qual$SNP_ID <- rownames(ind_qual)

## Attach depth (DP) and genotype quality (GQ) to the filtered genotypes
benithos_gt_num_filt_df_3 <- merge(
  merge(benithos_gt_num_filt_df_2, ind_depth, by = "SNP_ID"),
  ind_qual,
  by = "SNP_ID"
)

# Box-plot statistics of the depth of each individual. The first call draws
# on the current device and returns the statistics; the second call writes
# the same plot to SuppFig3a.pdf.
DPstats <- with(
  benithos_gt_num_filt_df_3,
  boxplot(DP_child1, DP_child2, DP_mother, DP_wildI)
)$stats
pdf(file = "SuppFig3a.pdf", paper = "a4")
with(
  benithos_gt_num_filt_df_3,
  boxplot(DP_child1, DP_child2, DP_mother, DP_wildI)
)
dev.off()

# One column of statistics per individual, in the order plotted above.
DPstats <- as.data.frame(DPstats)
names(DPstats) <- c("DPst_child1", "DPst_child2", "DPst_mother", "DPst_wildInd")

## Overall heterozygosity after filtering

# Rows of `sites` whose genotype in column `sample_col` equals `genotype`;
# rows with a missing genotype are dropped.
rows_with_genotype <- function(sites, sample_col, genotype) {
  sites[which(sites[[sample_col]] == genotype), , drop = FALSE]
}

# The bare percentage expressions are left at top level so that they are
# auto-printed when the script runs.
het_mother <- rows_with_genotype(benithos_gt_num_filt_df_3, "Mother", "0|1")
homref_mother <- rows_with_genotype(benithos_gt_num_filt_df_3, "Mother", "0|0")
homalt_mother <- rows_with_genotype(benithos_gt_num_filt_df_3, "Mother", "1|1")
nrow(het_mother) / nrow(benithos_gt_num_filt_df_3) * 100

het_child1 <- rows_with_genotype(benithos_gt_num_filt_df_3, "child1", "0|1")
homref_child1 <- rows_with_genotype(benithos_gt_num_filt_df_3, "child1", "0|0")
homalt_child1 <- rows_with_genotype(benithos_gt_num_filt_df_3, "child1", "1|1")
nrow(het_child1) / nrow(benithos_gt_num_filt_df_3) * 100

het_child2 <- rows_with_genotype(benithos_gt_num_filt_df_3, "child2", "0|1")
homref_child2 <- rows_with_genotype(benithos_gt_num_filt_df_3, "child2", "0|0")
homalt_child2 <- rows_with_genotype(benithos_gt_num_filt_df_3, "child2", "1|1")
nrow(het_child2) / nrow(benithos_gt_num_filt_df_3) * 100

het_wildI <- rows_with_genotype(benithos_gt_num_filt_df_3, "WildP", "0|1")
nrow(het_wildI) / nrow(benithos_gt_num_filt_df_3) * 100
homref_wildI <- rows_with_genotype(benithos_gt_num_filt_df_3, "WildP", "0|0")
homalt_wildI <- rows_with_genotype(benithos_gt_num_filt_df_3, "WildP", "1|1")

## Retained heterozygosity: genotypes of the other individuals at the sites
## where the mother is heterozygous
het_mother <- rows_with_genotype(benithos_gt_num_filt_df_3, "Mother", "0|1")
ret_het_child1 <- rows_with_genotype(het_mother, "child1", "0|1")
homref_child1 <- rows_with_genotype(het_mother, "child1", "0|0")
homalt_child1 <- rows_with_genotype(het_mother, "child1", "1|1")
nrow(ret_het_child1) / nrow(het_mother) * 100

ret_het_child2 <- rows_with_genotype(het_mother, "child2", "0|1")
homref_child2 <- rows_with_genotype(het_mother, "child2", "0|0")
homalt_child2 <- rows_with_genotype(het_mother, "child2", "1|1")
nrow(ret_het_child2) / nrow(het_mother) * 100

ret_het_wildI <- rows_with_genotype(het_mother, "WildP", "0|1")
nrow(ret_het_wildI) / nrow(het_mother) * 100
homref_wildI <- rows_with_genotype(het_mother, "WildP", "0|0")
homalt_wildI <- rows_with_genotype(het_mother, "WildP", "1|1")


# Supplementary figure 3b: a 2 x 2 grid with one panel per individual (wild
# individual, mother, child1, child2), each showing that individual's read
# depth (DP) grouped by its genotype call. The figure is written to
# SuppFig3b.pdf on A4 paper.
pdf(file="SuppFig3b.pdf", paper="a4")
# four panels on one page, filled row by row
par(mfrow=c(2,2))
boxplot(benithos_gt_num_filt_df_3$DP_wildI ~ benithos_gt_num_filt_df_3$WildP)
boxplot(benithos_gt_num_filt_df_3$DP_mother ~ benithos_gt_num_filt_df_3$Mother)
boxplot(benithos_gt_num_filt_df_3$DP_child1 ~ benithos_gt_num_filt_df_3$child1)
boxplot(benithos_gt_num_filt_df_3$DP_child2 ~ benithos_gt_num_filt_df_3$child2)
# close the PDF device so the file is flushed to disk
dev.off()

# For every individual, test whether read depth differs between genotype
# classes: a Kruskal-Wallis test followed by Dunn's pairwise test with
# Bonferroni correction. The test calls are bare top-level expressions so
# that their results are auto-printed when the script runs.

wildInd <- data.frame(
  genotype = benithos_gt_num_filt_df_3$WildP,
  depth = benithos_gt_num_filt_df_3$DP_wildI
)
kruskal.test(wildInd$depth ~ wildInd$genotype)
dunn_test(depth ~ genotype, data = wildInd, p.adjust.method = "bonferroni")

mother <- data.frame(
  genotype = benithos_gt_num_filt_df_3$Mother,
  depth = benithos_gt_num_filt_df_3$DP_mother
)
kruskal.test(mother$depth ~ mother$genotype)
dunn_test(depth ~ genotype, data = mother, p.adjust.method = "bonferroni")

child1 <- data.frame(
  genotype = benithos_gt_num_filt_df_3$child1,
  depth = benithos_gt_num_filt_df_3$DP_child1
)
kruskal.test(child1$depth ~ child1$genotype)
dunn_test(depth ~ genotype, data = child1, p.adjust.method = "bonferroni")

# Fixed copy-paste error: this table was previously built from the child1
# genotype and DP_child1 columns, so the child2 tests repeated child1.
child2 <- data.frame(
  genotype = benithos_gt_num_filt_df_3$child2,
  depth = benithos_gt_num_filt_df_3$DP_child2
)
kruskal.test(child2$depth ~ child2$genotype)
dunn_test(depth ~ genotype, data = child2, p.adjust.method = "bonferroni")




############## PART 4 "Cycle through primary and auxiliary data matrices" ############## 

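# Two nested for loops follow:
# - the outer loop runs once per site subset, i.e. once for the sites where
#   the mother is heterozygous ("het_mother") and once for all sites
#   ("all_sites");
# - the inner loop runs once for each of the 12 depth-filtered (DP) datasets.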

# Recode the genotype strings of every individual: "0|1" -> 1, "1|1" -> 2,
# "0|0" -> 0 (any other value is left as it is), then store the column as a
# factor.
all_sites <- benithos_gt_num_filt_df_3
for (genotype_col in c("Mother", "child1", "child2", "WildP")) {
  recoded <- all_sites[[genotype_col]]
  recoded <- ifelse(recoded == "0|1", 1, recoded)
  recoded <- ifelse(recoded == "1|1", 2, recoded)
  recoded <- ifelse(recoded == "0|0", 0, recoded)
  all_sites[[genotype_col]] <- as.factor(recoded)
}

# Rename the columns by position: the mother becomes "parent", the children
# become "offspring1" / "offspring2".
names(all_sites) <- c(
  "SNP_ID", "wild", "parent", "offspring1", "offspring2", "POS", "CHROM",
  "DP_wild", "DP_parent", "DP_offspring1", "DP_offspring2",
  "GQ_wild", "GQ_parent", "GQ_offspring1", "GQ_offspring2"
)

# `all_sites` holds every site; `het_mother` holds the subset of sites at
# which the parent is heterozygous (recoded genotype 1).
het_mother <- all_sites[which(all_sites$parent == 1), , drop = FALSE]


########################### Engage the creation of functioning data arrays ########################### 

# ---- Helpers used by the per-subset loop below ----

# Reshape one wide site table (one row per SNP) into long format (one row per
# SNP and sample) with the columns SNP_ID, Sample, POS, CHROM, Allele, DP, GQ.
# Rows whose CHROM contains "UNPLACED" are removed.
build_long_sites <- function(site_df) {
  sample_names <- c("wild", "parent", "offspring1", "offspring2")
  wide <- site_df[, c("SNP_ID", "POS", "CHROM", sample_names,
                      paste0("DP_", sample_names),
                      paste0("GQ_", sample_names)),
                  drop = FALSE]

  # Give the genotype columns an "Allele_" prefix so that all three
  # measurements share the "<measurement>_<sample>" naming scheme and can be
  # pivoted in one step.
  genotype_cols <- match(sample_names, names(wide))
  names(wide)[genotype_cols] <- paste0("Allele_", sample_names)

  long <- pivot_longer(
    wide,
    cols = -c(SNP_ID, POS, CHROM),
    names_to = c(".value", "Sample"),
    names_sep = "_"
  )
  long <- as.data.frame(long)
  long <- long[, c("SNP_ID", "Sample", "POS", "CHROM", "Allele", "DP", "GQ")]

  # numeric columns for plotting and filtering
  long$POS <- as.numeric(long$POS)
  long$DP <- as.numeric(long$DP)
  long$GQ <- as.numeric(long$GQ)

  long <- long[!grepl("UNPLACED", long$CHROM), , drop = FALSE]
  long$Sample <- factor(long$Sample, levels = sample_names)
  long
}

# Keep only the sites at which no sample present in `long_df` has a depth
# below `min_dp` or above `max_dp`.
# Sites are identified by SNP_ID rather than by POS, because the same
# position can occur on several chromosomes. Samples with a missing DP do not
# flag their site.
filter_sites_by_depth <- function(long_df, min_dp, max_dp = Inf) {
  out_of_range <- which(long_df$DP < min_dp | long_df$DP > max_dp)
  bad_sites <- unique(long_df$SNP_ID[out_of_range])
  long_df[!(long_df$SNP_ID %in% bad_sites), , drop = FALSE]
}

# Calculate the retained heterozygosity between a parent and its two
# offspring: the number of sites heterozygous (Allele == 1) in both parent
# and offspring, divided by the number of sites heterozygous in the parent.
#
# dataframe_name: name (a string) of an existing long-format data frame with
#   at least the columns 'Sample', 'Allele' and 'POS'.
# Returns a one-row data frame with the columns Database, Offspring1 and
#   Offspring2.
# Stops with an error if the name does not refer to an existing object or if
#   the required columns are missing.
calc_ret_het <- function(dataframe_name) {
  if (!is.character(dataframe_name) || length(dataframe_name) != 1L ||
      !exists(dataframe_name)) {
    stop("Input should be a string representing an existing data frame")
  }

  # Converts the input string name into the actual dataframe
  dataframe <- get(dataframe_name)

  if (!all(c("Sample", "Allele", "POS") %in% names(dataframe))) {
    stop("The dataframe must contain 'Sample', 'Allele', and 'POS' columns.")
  }

  # Identify a site unambiguously: POS alone is shared between chromosomes.
  site_key <- if ("SNP_ID" %in% names(dataframe)) {
    as.character(dataframe$SNP_ID)
  } else if ("CHROM" %in% names(dataframe)) {
    paste(dataframe$CHROM, dataframe$POS, sep = ":")
  } else {
    as.character(dataframe$POS)
  }

  is_het <- !is.na(dataframe$Allele) & dataframe$Allele == 1
  het_parent_sites <- site_key[is_het & dataframe$Sample == "parent"]

  retained_fraction <- function(offspring) {
    het_offspring_sites <- site_key[is_het & dataframe$Sample == offspring]
    sum(het_offspring_sites %in% het_parent_sites) / length(het_parent_sites)
  }

  data.frame(
    Database = dataframe_name,
    Offspring1 = retained_fraction("offspring1"),
    Offspring2 = retained_fraction("offspring2")
  )
}

# Frequency with which a site that is heterozygous in the parent is
# homozygous reference (0) or homozygous alternative (2) in each offspring.
# Returns a one-row data frame; every frequency is NA when the parent has no
# heterozygous site.
calc_het_to_hom_freq <- function(long_df, database_name) {
  # Allele is stored as a factor; go through character to recover the codes.
  allele <- as.numeric(as.character(long_df$Allele))
  site <- as.character(long_df$SNP_ID)
  sample_name <- as.character(long_df$Sample)

  parent_rows <- which(sample_name == "parent")
  parent_sites <- site[parent_rows]
  parent_allele <- allele[parent_rows]
  n_het_parent <- sum(parent_allele == 1)

  # allele codes of one offspring, aligned to the parent's sites
  offspring_allele <- function(offspring) {
    rows <- which(sample_name == offspring)
    allele[rows][match(parent_sites, site[rows])]
  }

  conversion_freq <- function(offspring, target) {
    if (is.na(n_het_parent) || n_het_parent == 0) {
      return(NA_real_)
    }
    sum(offspring_allele(offspring) == target & parent_allele == 1) /
      n_het_parent
  }

  data.frame(
    Offspring1_0_when_Parent_1 = conversion_freq("offspring1", 0),
    Offspring1_2_when_Parent_1 = conversion_freq("offspring1", 2),
    Offspring2_0_when_Parent_1 = conversion_freq("offspring2", 0),
    Offspring2_2_when_Parent_1 = conversion_freq("offspring2", 2),
    Database = database_name,
    stringsAsFactors = FALSE
  )
}

# The 12 depth-filtered datasets, in the order in which they are processed.
# max_dp = Inf means "minimum depth only".
depth_filter_specs <- data.frame(
  name = c("site_df_long_no_wild_DP3",
           "site_df_long_no_wild_DP8",
           "site_df_long_no_wild_DP15",
           "site_df_long_no_wild_DP3_20",
           "site_df_long_no_wild_DP8_20",
           "site_df_long_no_wild_DP15_20",
           "site_df_long_no_wild_DP3_25",
           "site_df_long_no_wild_DP8_25",
           "site_df_long_no_wild_DP15_25",
           "site_df_long_no_wild_DP3_30",
           "site_df_long_no_wild_DP8_30",
           "site_df_long_no_wild_DP15_30"),
  min_dp = rep(c(3, 8, 15), times = 4),
  max_dp = rep(c(Inf, 20, 25, 30), each = 3),
  stringsAsFactors = FALSE
)

# ---- One pass per site subset ----
# For each subset ("het_mother", "all_sites") this builds the long table,
# derives the 12 depth-filtered datasets, stores each of them as
# "<subset>_<dataset name>" for the plotting code further down, and stores the
# summary tables as "<subset>_ret_het" and "<subset>_het_to_hom_freq".
for (site_split in c("het_mother", "all_sites")) {
  site_df <- get(site_split)

  site_df_long <- build_long_sites(site_df)

  # same table without the wild individual
  site_df_long_no_wild <-
    site_df_long[site_df_long$Sample != "wild", , drop = FALSE]
  site_df_long_no_wild$Sample <- factor(
    site_df_long_no_wild$Sample,
    levels = c("parent", "offspring1", "offspring2")
  )

  ret_het_rows <- vector("list", nrow(depth_filter_specs))
  het_to_hom_rows <- vector("list", nrow(depth_filter_specs))

  for (k in seq_len(nrow(depth_filter_specs))) {
    my_db <- depth_filter_specs$name[k]

    # NOTE(review): as in the original code, the minimum-only datasets (DP3,
    # DP8, DP15) are derived from the table that still contains the wild
    # individual, despite the "no_wild" in their names; the ranged datasets
    # exclude it. Confirm whether that is intended.
    source_df <- if (is.finite(depth_filter_specs$max_dp[k])) {
      site_df_long_no_wild
    } else {
      site_df_long
    }
    filtered <- filter_sites_by_depth(
      source_df,
      min_dp = depth_filter_specs$min_dp[k],
      max_dp = depth_filter_specs$max_dp[k]
    )

    # Store the dataset under its plain name (looked up by calc_ret_het) and
    # under the subset-prefixed name (used after this loop).
    assign(my_db, filtered)
    assign(paste0(site_split, "_", my_db), filtered)

    ret_het_rows[[k]] <- calc_ret_het(my_db)
    het_to_hom_rows[[k]] <- calc_het_to_hom_freq(filtered, my_db)
  }

  ret_het <- do.call(rbind, ret_het_rows)
  het_to_hom_freq <- do.call(rbind, het_to_hom_rows)

  # keep the summary tables of this subset available after the loop
  assign(paste0(site_split, "_ret_het"), ret_het)
  assign(paste0(site_split, "_het_to_hom_freq"), het_to_hom_freq)
}

# TODO: move the summary calculations out of the per-subset loop above.
# NOTE(review): only the "all_sites" summaries are written to disk; the
# "het_mother" equivalents are computed but not saved.
write.table(all_sites_ret_het, file = "ret_het.txt", sep = "\t",
            col.names = TRUE, quote = FALSE)
write.table(all_sites_het_to_hom_freq, file = "het_to_hom_freq.txt",
            sep = "\t", col.names = TRUE, quote = FALSE)

# Names of all depth-filtered datasets: the 12 depth filters for "all_sites",
# followed by the same 12 for "het_mother".
depth_filter_suffixes <- c("DP3", "DP8", "DP15",
                           "DP3_20", "DP8_20", "DP15_20",
                           "DP3_25", "DP8_25", "DP15_25",
                           "DP3_30", "DP8_30", "DP15_30")
all_my_datasets <- c(
  paste0("all_sites_site_df_long_no_wild_", depth_filter_suffixes),
  paste0("het_mother_site_df_long_no_wild_", depth_filter_suffixes)
)

# Number of distinct sites per dataset. A site is a chromosome / position
# pair: counting unique POS values alone would merge sites that share a
# position on different chromosomes.
site_counts <- vapply(
  all_my_datasets,
  function(dataset) {
    dataset_df <- get(dataset)
    length(unique(paste(dataset_df$CHROM, dataset_df$POS, sep = ":")))
  },
  integer(1)
)
new_db <- data.frame(
  dataset = all_my_datasets,
  Number_of_sites = unname(site_counts),
  stringsAsFactors = FALSE
)

write.table(new_db, file = "total_sites_per_db.txt", sep = "\t",
            col.names = TRUE, row.names = FALSE, quote = FALSE)


############################################## PLOT  ############################################## 




# For each dataset this builds, per chromosome, an allele heatmap, a depth
# (DP) plot and a genotype-quality (GQ) plot, arranges the per-chromosome
# plots on one A4 page each and saves the three pages as PDF files.
#
# The plots of one dataset are collected in lists instead of global variables,
# so a chromosome that is absent from a dataset is skipped rather than
# silently filled with the plot of a previous dataset.

# Chromosomes in page order, with the linkage-group label shown on the page.
lg_labels <- c(
  "NC_031965.2" = "LG01", "NC_031966.2" = "LG02", "NC_031967.2" = "LG03",
  "NC_031969.2" = "LG04", "NC_031970.2" = "LG05", "NC_031971.2" = "LG06",
  "NC_031972.2" = "LG07", "NC_031973.2" = "LG08", "NC_031974.2" = "LG09",
  "NC_031975.2" = "LG10", "NC_031976.2" = "LG11", "NC_031977.2" = "LG12",
  "NC_031978.2" = "LG13", "NC_031979.2" = "LG14", "NC_031980.2" = "LG15",
  "NC_031987.2" = "LG16", "NC_031981.2" = "LG17", "NC_031982.2" = "LG18",
  "NC_031983.2" = "LG19", "NC_031984.2" = "LG20", "NC_031985.2" = "LG22",
  "NC_031986.2" = "LG23"
)

# Legend labels and colours for the allele codes.
allele_labels <- c("Homozygous reference", "Heterozygous",
                   "Homozygous alternative")
allele_colours <- c("springgreen", "grey18", "orchid1")

# Heatmap of the allele code of every sample along one chromosome.
# A vertical red dashed line marks the centromere position.
make_allele_heatmap <- function(chrom_df, centro_pos, chrom_length) {
  ggplot(chrom_df, aes(x = POS, y = Sample, color = Allele, fill = Allele)) +
    geom_tile(size = 0.05) +
    geom_vline(xintercept = centro_pos, linetype = "dashed", col = "red") +
    theme_minimal() +
    scale_color_manual(labels = allele_labels, values = allele_colours) +
    scale_fill_manual(labels = allele_labels, values = allele_colours) +
    scale_x_continuous("bp", limits = c(0, chrom_length)) +
    theme(legend.key.width = unit(0.5, "cm"),
          legend.key.size = unit(0.5, "cm"),
          text = element_text(size = 9),
          plot.margin = margin(8, 10, 8, 8, "points")) +
    scale_y_discrete(limits = rev(levels(chrom_df$Sample)))
}

# Per-site vertical lines of one numeric column (`metric`, "DP" or "GQ")
# along one chromosome, coloured by allele code, one facet row per sample.
# A vertical red dashed line marks the centromere position.
make_metric_plot <- function(chrom_df, metric, centro_pos, chrom_length) {
  # na.rm = TRUE: a single missing value must not break the axis breaks
  metric_max <- max(chrom_df[[metric]], na.rm = TRUE)

  ggplot(chrom_df, aes(x = POS, y = .data[[metric]],
                       group = Allele, color = Allele)) +
    geom_linerange(aes(ymin = 0, ymax = .data[[metric]]),
                   size = 0.05, show.legend = FALSE) +
    # second, empty linerange: only there to get thicker lines in the legend
    geom_linerange(data = NULL, aes(ymin = 0, ymax = 0), size = 5) +
    geom_vline(xintercept = centro_pos, linetype = "dashed", col = "red") +
    theme_minimal() +
    scale_color_manual(labels = allele_labels, values = allele_colours) +
    scale_x_continuous("bp", limits = c(0, chrom_length)) +
    scale_y_continuous(
      metric,
      breaks = as.integer(seq.int(from = 0, to = metric_max, length.out = 4))
    ) +
    theme(legend.key.width = unit(0.5, "cm"),
          legend.key.size = unit(0.5, "cm"),
          text = element_text(size = 4),
          axis.text = element_text(size = 5),
          legend.text = element_text(size = 5),
          legend.title = element_text(size = 5),
          plot.margin = margin(8, 10, 8, 8, "points")) +
    facet_grid(Sample ~ .)
}

# Arrange a named list of per-chromosome plots (names are chromosome names)
# on a 3 x 8 grid with a shared legend; extra arguments go to ggarrange().
arrange_chromosome_plots <- function(plots, label_size, ...) {
  ggarrange(plotlist = unname(plots),
            labels = unname(lg_labels[names(plots)]),
            hjust = -0.45, vjust = 0.45,
            common.legend = TRUE, font.label = list(size = label_size),
            ncol = 3, nrow = 8, ...)
}

for (dataset in all_my_datasets) {
  dataset_df <- get(dataset)

  # chromosomes of this dataset that have a place on the page, in page order
  chroms <- intersect(names(lg_labels), unique(as.character(dataset_df$CHROM)))
  if (length(chroms) == 0L) {
    warning("No chromosome to plot for dataset ", dataset, "; skipping it.",
            call. = FALSE)
    next
  }

  heatmaps <- setNames(vector("list", length(chroms)), chroms)
  dp_plots <- heatmaps
  gq_plots <- heatmaps

  for (chrom in chroms) {
    chrom_loop <- dataset_df[dataset_df$CHROM == chrom, , drop = FALSE]

    # length of the chromosome and position of its centromere
    chrom_length <- GenomeInfo$Sequence.Length[GenomeInfo$RefSeq.Accn == chrom]
    centro_pos <- centromers$median_blast[centromers$chrom == chrom]

    heatmaps[[chrom]] <- make_allele_heatmap(chrom_loop, centro_pos,
                                             chrom_length)
    dp_plots[[chrom]] <- make_metric_plot(chrom_loop, "DP", centro_pos,
                                          chrom_length)
    gq_plots[[chrom]] <- make_metric_plot(chrom_loop, "GQ", centro_pos,
                                          chrom_length)
  }

  all_chroms_heatmap <- arrange_chromosome_plots(heatmaps, label_size = 10,
                                                 heights = 7, widths = 3)
  all_chroms_DP_PLOT <- arrange_chromosome_plots(dp_plots, label_size = 5)
  all_chroms_GQ_PLOT <- arrange_chromosome_plots(gq_plots, label_size = 5)

  # one A4 page (210 x 297 mm) per plot type
  ggsave(paste0(dataset, "_AlleleHeatmap.pdf"), plot = all_chroms_heatmap,
         width = 210, height = 297, units = "mm", device = pdf)
  ggsave(paste0(dataset, "_DP_PLOT.pdf"), plot = all_chroms_DP_PLOT,
         width = 210, height = 297, units = "mm", device = pdf)
  ggsave(paste0(dataset, "_GQ_PLOT.pdf"), plot = all_chroms_GQ_PLOT,
         width = 210, height = 297, units = "mm", device = pdf)
}

