# Bioinformatics pipeline and perl scripts (at the end of the file) used for data analysis in
# Whole-genome re-sequencing of temporally stratified samples reveal substantial loss of haplotype diversity in the highly inbred Scandinavian wolf population
# Viluma et al.



##########################################     FILTERING VARIANT CALLS     ##################################################


#1. Prepare a bed file with segmental duplications of the reference genome CanFam3.1

	#a) extract reference sequence of each chromosome

module load bioinfo-tools samtools/1.8

for chr in {1..38} X
do
samtools faidx Canis_familiaris.CanFam3.1.dna.toplevel.fa $chr > Canis_familiaris.CanFam3.1.dna.toplevel_chr${chr}.fa
done

	#b) do self alignment of each chromosome with LastZ
 
for chr in {1..38} X
do
inp="Canis_familiaris.CanFam3.1.dna.toplevel_chr${chr}.fa"

module load bioinfo-tools lastz/1.04.00

lastz $inp --self --nomirror --notransition --step=20 --nogapped --hspthresh=6000 --matchcount=500 --format=maf > ${inp%.fa}.lastz.selfaln.maf
done

#for X chromosome increase --hspthresh=60000 in the run_lastz_selfaln2.sh

for chr in X
do
inp="Canis_familiaris.CanFam3.1.dna.toplevel_chr${chr}.fa"

module load bioinfo-tools lastz/1.04.00

lastz $inp --self --nomirror --notransition --step=20 --nogapped --hspthresh=60000 --matchcount=500 --format=maf > ${inp%.fa}.lastz.selfaln.maf
done


	#c) convert maf file (output from LastZ alignment) to bed file

for chr in X {1..38}
do
perl maf2bed.pl Canis_familiaris.CanFam3.1.dna.toplevel_chr${chr}.lastz.selfaln.maf Canis_familiaris.CanFam3.1.dna.toplevel_chr${chr}.lastz.selfaln.bed
done




#2. Join genomic coordinates of the segmental duplications with manually downloaded RepeatMasker and StarDust+WM bed files from UCSC genome browser. 

	#a) Combine coordinate files in one, sort it
for chr in {1..38} X
do
#combine all
cat CanFam3_chr${chr}_WM_SDust_UCSC.bed CanFam3_chr${chr}_RepeatMasker_UCSC.bed Canis_familiaris.CanFam3.1.dna.toplevel_chr${chr}.lastz.selfaln.bed > CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.bed
#keep only first three columns of the merged file
cut -f1,2,3 CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.bed > tmp
mv tmp CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.bed
#Sort the final bed file 
inp="CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.bed"
out="CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.sorted.bed"

module load bioinfo-tools BEDTools/2.27.1
bedtools sort -i  $inp > $out
done

	#b) merge overlapping entries, with -d 100 option (if the window between two masked regions is <100, mask it as well)

module load bioinfo-tools BEDTools/2.27.1

for chr in {1..38} X
do
bedtools merge -d 100 -i CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.sorted.bed >tmp
mv tmp CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.bed
rm -f CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.sorted.bed
done

#combine all chromosomes in one file (remove all separate chromosome files):
for chr in {1..38} X
do
cat CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.bed >> CanFam3_RepeatMasker_WM_SDust_UCSC_selfaln.bed
done


#!!!!  IMPORTANT  !!!! add a header line to the final bed file, othervise vcftools is not masking the first line of the bed file!!!
sed  -i '1i CHR START END' CanFam3_RepeatMasker_WM_SDust_UCSC_selfaln.bed



#3.  Run filtering pipeline 

#	(remove SNP calls within the repetitive regions of the CanFam3.1 assembly)
#   (set a treshold for SNP and genotype quality, read depth)
#	(include only sites with Minor Allele Count greater than or equal to 1: keeping only bi-allelic SNPs)


#this is done on the entire data set of Scandinavian and Finnish wolves

#total No. of variants in 76Scand.98Finnish.recode.vcf.gz (chr1-38, X)
zcat 76Scand.98Finnish.recode.vcf.gz | grep "^[1234567890X]" | wc -l
38,188,956


inp="76Scand.98Finnish.vcf.gz"
tmp="76Scand.98Finnish"
rep="CanFam3_RepeatMasker_WM_SDust_UCSC_selfaln.bed"

module load bioinfo-tools vcftools/0.1.15

#Filter away all repetitive regions of the CanFam3 assembly
vcftools --gzvcf $inp --recode --out ${tmp} --exclude-bed $rep
#output file $tmp.no.repeats.recode.vcf

#then apply other filtering criteria
vcftools --vcf $tmp.recode.vcf --remove-indels --mac 1 --min-alleles 2 --max-alleles 2 --minGQ 30 --minQ 300 --maxDP 80 --recode --out ${tmp}.filtered.v6
rm $tmp.recode.vcf

# Number of remaining variants after filtering:
zcat 76Scand.98Finnish.filtered.v6.vcf.gz |grep "^[1234567890X]" | wc -l 
5,884,691





#4. Remove regions of high SNP density

	#a) build 1kb windows

module load bioinfo-tools BEDTools/2.27.1
bedtools makewindows -g CanFam3_chromosome_lenghth.txt -w 1000 > CanFam3_chromosome_lenghth_1Kb_windows.bed

	#b) count nr of SNPs in each window

module load bioinfo-tools BEDTools/2.27.1
bed="CanFam3_chromosome_lenghth_1Kb_windows.bed"
vcf="76Scand.98Finnish.filtered.v6.vcf"

bedtools coverage -a $bed -b $vcf -counts > ${vcf%.vcf}.SNPdensity_1KbW.txt

#Number of SNPs coinciding with each window is given in the last column (column 3)


	#c)  set a treshold of SNP density

#take last column and plot a histogram of SNP density across all windows

module load perl_modules/5.24.1
inp="76Scand.98Finnish.filtered.v6.SNPdensity_1KbW.txt"
perl ./plot.histogram.pl $inp 3


#output:
___________________________________________________________________________________
Count: 2327652
Range:  0.000 - 230.000; Mean:  2.528; Median:  1.000; Stddev:  3.609
Percentiles:  90th:  7.000; 95th:  9.000; 99th: 15.000
   0.000 -    2.216: 665280 #####################################################
   2.216 -    4.171: 397327 ################################
   4.171 -    7.314: 272654 ######################
   7.314 -   12.368: 128199 ##########
  12.368 -   20.494: 33953 ###
  20.494 -   33.560:  7148 #
  33.560 -   54.570:  1230 |
  54.570 -   88.350:   290 |
  88.350 -  142.666:   123 |
 142.666 -  230.000:    23 |
_____________________________________________________________________________________
# Choosing 90th Percentile as a boundary and extracting coordinates for windows with more than 7 SNPs per 1Kb

inp="76Scand.98Finnish.filtered.v6.SNPdensity_1KbW.txt"
out="76Scand.98Finnish.filtered.v6.High_SNPdensity_windows.bed"

awk -F'\t' '$4>7' $inp >$out
sed  -i '1i CHR START END SCORE' $out


#For statistical phasing exclusion of high SNP density windows was incorporated into phasing pipeline

#However, later this filtering was also necessary for individual SNP allele comparison and the entire vcf file was filtered:

inp="76Scand.98Finnish.filtered.v6.vcf"
bed="76Scand.98Finnish.filtered.v6.High_SNPdensity_windows.bed"
filter="noHighSNPdensity"

module load bioinfo-tools vcftools/0.1.15
vcftools --vcf $inp --recode --out ${inp%.vcf}.$filter --exclude-bed $bed 


#Number of SNPs left after high density filtering
cat 76Scand.98Finnish.filtered.v6.noHighSNPdensity.recode.vcf |  grep "^[1234567890X]" | wc -l 
3,900,583

______________________________________________
Summary of used tools:
custom perl scripts (see at the end of file)
samtools/1.8
lastz/1.04.00
BEDTools/2.27.1
vcftools/0.1.15






##########################################     TWO-STEP PHASING     ##################################################

#1. Step one: phasing of individual 1 MB windows across the genome

	#a) defining non-overlapping 1 Mb windows across the CanFam3.1 genome 

module load bioinfo-tools BEDTools/2.27.1
bedtools makewindows -g CanFam3_chromosome_lenghth.txt -w 1000000 > CanFam3_chromosome_lenghth_1Mb_windows.bed


	#b) Phasing individual 1 MB windows chromosome by chromosome

for chr in {1..38} X
do
mkdir -p chr${chr}/PHASE_1_newfilt/
inp="76Scand.98Finnish.filtered.v6.vcf"
out="chr${chr}/PHASE_1_newfilt/"
bed="76Scand.98Finnish.filtered.v6.High_SNPdensity_windows.bed"
windows="CanFam3_chromosome_lenghth_1Mb_windows.bed"

module load bioinfo-tools vcftools/0.1.15

rm -f ${out}Markerpos.txt
grep -P "^$chr""\\t" $windows | while read line
do
start_c=$(echo $line | cut -f2 -d " ")
end_c=$(echo $line | cut -f3 -d " ")
echo $chr $start_c $end_c

#Make an individual vcf file for each window from the windows.bed file
#and simultaneously exclude SNPs from 1 Kb windows of too high SNP density
vcftools --vcf $inp --recode --out ${out}tmp1 --exclude-bed $bed --chr $chr --from-bp $start_c --to-bp $end_c

#remove SNPs with more than 25% missing genotypes and thin markers to be 20 000 bp apart
vcftools --vcf ${out}tmp1.recode.vcf --recode --out ${out}tmp --max-missing 0.75 --thin 20000

#count and display marker positions that have been used for phasing of each window
grep -P "^$chr""\\t" ${out}tmp.recode.vcf | cut -f1,2 >> ${out}Markerpos.txt

cd $out

#transform final vcf file as input file for PHASE
perl Transform_VCF2PHASE.pl ${out}tmp.recode.vcf tmp.PHASE.inp $chr

#phase
PHASE tmp.PHASE.inp Scand_wolf.markers.for.phasing1_1Mb_${start_c}.PHASE.out

rm -f ${out}tmp1.recode.vcf
rm -f ${out}tmp.recode.vcf
rm -f tmp.PHASE.inp
done
done


	#c) count number of markers used for PHASE step 1:

for chr in {1..38} X
do
wc -l /proj/sllstore2017034/nobackup/work/agnese/TwoStepPHASE_Scand_Finn_wolf/chr${chr}/PHASE_1_newfilt/Markerpos.txt
done




#2. Step two: chromosome-level phasing of 1 MB window alleles

#As our data set has a large number of individuals (including individuals from Finnish population), second PHASEing step initially resulted in an error of too high KMAX (50 was the default maximum)
#To solve this, I downloaded the source code from github (https://github.com/stephens999/phase/archive/master.zip), increased KMAX in constants.h to 100 and recompiled.

	#a) Collect genotypes from the first phasing step and prepare input files for the second phasing step

module load perl_modules/5.24.1

for chr in {1..38} X
do
out="chr${chr}/Summary.PHASE_1.chr${chr}.haplotype.pairs.txt"
out2="chr${chr}/1Mb_Haplotype_alleles.for.PHASE_2"
windows=$(ll chr${chr}/PHASE_1_newfilt/Scand_wolf.markers.for.phasing1_1Mb_*0.PHASE.out | wc -l)

perl Extract_haplotype_pairs_PHASE.v2.pl "chr${chr}/PHASE_1_newfilt/Scand_wolf.markers.for.phasing1_1Mb_*0.PHASE.out" $out
awk '{a[(NR-1)%n]=a[(NR-1)%n]==""?$1:a[(NR-1)%n] OFS $1}END{for (i=0;i<n;i++) print a[i]}' n=403 OFS="\t" $out > tmp
sed -i 's/000000.PHASE.out//g' tmp
sed -i 's/.PHASE.out//g' tmp
mv tmp $out

sed "s/SampleID\t/201\n${windows}\nP/g; s/_1\t/\n/g; s/.*_2\t//g" $out > ${out2}.inp
sed -i "4i $(printf '%.0sM' $(eval echo "{1..$windows}")})"  ${out2}.inp
done


	#b) Proceed with 2nd phasing step, parse results to be excel friendly

for chr in {1..38} X
do
inp="chr${chr}/1Mb_Haplotype_alleles.for.PHASE_2.inp"

module load perl_modules/5.24.1

PHASE $inp ${inp%.inp}.out

#make it excel friendly
perl Collect.PHASE.haps.pl ${inp%.inp}.out ${inp%.inp}.out.forExcel.txt
done


	#c) collect final results from all chr in one folder

mkdir PHASE_2_summary

for chr in {1..38} X
do
cp chr${chr}/1Mb_Haplotype_alleles.for.PHASE_2.out.forExcel.txt PHASE_2_summary/FinnoScand_all_chr${chr}_PHASE_1MB
done


	#d) Sort phased individuals in an ordered manner (based on the subgroup assignment and birth year)

#list of subgroups and haplotype assignment
FinnoScand_all_timeordered_info.txt
#list of wolf IDs corresponding to the list of subgroups and haplotype assignment
FinnoScand_all_timeordered_IDs.txt



ind="FinnoScand_all_timeordered_IDs.txt"
for chr in {1..38} X
do
inp="PHASE_2_summary/FinnoScand_all_chr${chr}_PHASE_1MB"
cat $ind | while read line
do
grep -P "^$line" $inp | head -2 | sed '1,117!d; s/[()]//g' - >> tmp
done
paste FinnoScand_all_timeordered_info.txt tmp > PHASE_2_summary/FinnoScand_all_chr${chr}_PHASE_1MB_timeorder
rm -f tmp
perl -i -p -E 's/ /\t/g' PHASE_2_summary/FinnoScand_all_chr${chr}_PHASE_1MB_timeorder
done

#Downloaded all _timeorder files, extracted 76 Scandinavian individuals and imported data into specially prepared excel sheet template (Supplementary File 1).
#Color asignment was achieved by use of conditional formatting option in MS Excel creating hierarchical colouring sheme.
#Proceeded with manually inferring male founder haplotypes and performed manual curation of the data set.
#Excel was also used to calculate frequency of founder haplotypes in each subsample and to determine clusters of 1 Mb founder haplotypes absent in each subgroup.



______________________________________________
Summary of used tools:
custom perl scripts (see at the end of file)
BEDTools/2.27.1
vcftools/0.1.15
PHASE 2.1.1.






#############################################     COMPARISON OF FOUNDER HAPLOTYPES     ##################################################

# A pairwise comparison of the 6 founder haplotypes (FF1/FF2/FM11/FM12/FM21/FM22).
# Input file Founder_haplotypes_All_chr_removed_11_discarded_widows.txt was obtained by compiling assigned founder haplotypes from all chromosomes and transforming original haplotype IDs to a scale of 1 to 6.
# Each row of the file contains per chromosome sequence of phased 1 Mb haplotype IDs (tab separated) of a particular founder haplotype:
# chromosome	Total_windows_per_chromosome	Founder_haplotype	haplotype_ID_window_0	haplotype_ID_window_1 ...


file="Founder_haplotypes_All_chr_removed_11_discarded_widows.txt"
out="Founder_haplotypes_All_chr_removed_11_discarded_widows.pairwise.comp.txt"

#if such outputfile already exists, don't forget to remove or rename it
rm -f $out
        
for chr in {1..38} PAR X
do 
FF1=( `echo $(grep -P "^${chr}\t[0-9]*\tFF1" $file | cut -f4- | sed -e $'s/\t/ /g; s/ 0/ /g' )` )
FF2=( `echo $(grep -P "^${chr}\t[0-9]*\tFF2" $file | cut -f4- | sed -e $'s/\t/ /g; s/ 0/ /g' )` )
FM11=( `echo $(grep -P "^${chr}\t[0-9]*\tFM11" $file | cut -f4- | sed -e $'s/\t/ /g; s/ 0/ /g' )` )
FM12=( `echo $(grep -P "^${chr}\t[0-9]*\tFM12" $file | cut -f4- | sed -e $'s/\t/ /g; s/ 0/ /g' )` )
FM21=( `echo $(grep -P "^${chr}\t[0-9]*\tFM21" $file | cut -f4- | sed -e $'s/\t/ /g; s/ 0/ /g' )` )
FM22=( `echo $(grep -P "^${chr}\t[0-9]*\tFM22" $file | cut -f4- | sed -e $'s/\t/ /g; s/ 0/ /g' )` )

FF1_FF2=0
FF1_FM11=0
FF1_FM12=0
FF1_FM21=0
FF1_FM22=0
FF2_FM11=0
FF2_FM12=0
FF2_FM21=0
FF2_FM22=0
FM11_FM12=0
FM11_FM21=0
FM11_FM22=0
FM12_FM21=0
FM12_FM22=0
FM21_FM22=0

l=${#FF1[@]}
for ((i=0;i<${#FF1[@]};++i))
  do  
  [[  "${FF1[i]}" == "${FF2[i]}" ]] && FF1_FF2=$(expr $FF1_FF2 + 1)
  [[  "${FF1[i]}" == "${FM11[i]}" ]] && FF1_FM11=$(expr $FF1_FM11 + 1)
  [[  "${FF1[i]}" == "${FM12[i]}" ]] && FF1_FM12=$(expr $FF1_FM12 + 1)
  [[  "${FF1[i]}" == "${FM21[i]}" ]] && FF1_FM21=$(expr $FF1_FM21 + 1)
  [[  "${FF1[i]}" == "${FM22[i]}" ]] && FF1_FM22=$(expr $FF1_FM22 + 1)
  [[  "${FF2[i]}" == "${FM11[i]}" ]] && FF2_FM11=$(expr $FF2_FM11 + 1)
  [[  "${FF2[i]}" == "${FM12[i]}" ]] && FF2_FM12=$(expr $FF2_FM12 + 1)
  [[  "${FF2[i]}" == "${FM21[i]}" ]] && FF2_FM21=$(expr $FF2_FM21 + 1)
  [[  "${FF2[i]}" == "${FM22[i]}" ]] && FF2_FM22=$(expr $FF2_FM22 + 1)
  [[  "${FM11[i]}" == "${FM12[i]}" ]] && FM11_FM12=$(expr $FM11_FM12 + 1)
  [[  "${FM11[i]}" == "${FM21[i]}" ]] && FM11_FM21=$(expr $FM11_FM21 + 1)
  [[  "${FM11[i]}" == "${FM22[i]}" ]] && FM11_FM22=$(expr $FM11_FM22 + 1)
  [[  "${FM12[i]}" == "${FM21[i]}" ]] && FM12_FM21=$(expr $FM12_FM21 + 1)
  [[  "${FM12[i]}" == "${FM22[i]}" ]] && FM12_FM22=$(expr $FM12_FM22 + 1)
  [[  "${FM21[i]}" == "${FM22[i]}" ]] && FM21_FM22=$(expr $FM21_FM22 + 1)
   done
total=`echo $(grep -P "^${chr}\t[0-9]*\tFF1" $file | cut -f 2)`
   
FF1_FF2_p=$(echo "scale=3; $FF1_FF2 / $total * 100" | bc -l)
FF1_FM11_p=$(echo "scale=3; $FF1_FM11 / $total * 100" | bc -l)
FF1_FM12_p=$(echo "scale=3; $FF1_FM12 / $total * 100" | bc -l)
FF1_FM21_p=$(echo "scale=3; $FF1_FM21 / $total * 100" | bc -l)
FF1_FM22_p=$(echo "scale=3; $FF1_FM22 / $total * 100" | bc -l)
FF2_FM11_p=$(echo "scale=3; $FF2_FM11 / $total * 100" | bc -l)
FF2_FM12_p=$(echo "scale=3; $FF2_FM12 / $total * 100" | bc -l)
FF2_FM21_p=$(echo "scale=3; $FF2_FM21 / $total * 100" | bc -l)
FF2_FM22_p=$(echo "scale=3; $FF2_FM22 / $total * 100" | bc -l)
FM11_FM12_p=$(echo "scale=3; $FM11_FM12 / $total * 100" | bc -l)
FM11_FM21_p=$(echo "scale=3; $FM11_FM21 / $total * 100" | bc -l)
FM11_FM22_p=$(echo "scale=3; $FM11_FM22 / $total * 100" | bc -l)
FM12_FM21_p=$(echo "scale=3; $FM12_FM21 / $total * 100" | bc -l)
FM12_FM22_p=$(echo "scale=3; $FM12_FM22 / $total * 100" | bc -l)
FM21_FM22_p=$(echo "scale=3; $FM21_FM22 / $total * 100" | bc -l)


echo -e "$chr\tFF1\tFF2\tFM11\tFM12\tFM21\tFM22\nFF1\t$total\t$FF1_FF2\t$FF1_FM11\t$FF1_FM12\t$FF1_FM21\t$FF1_FM22\nFF2\t$FF1_FF2_p\t$total\t$FF2_FM11\t$FF2_FM12\t$FF2_FM21\t$FF2_FM22\nFM11\t$FF1_FM11_p\t$FF2_FM11_p\t$total\t$FM11_FM12\t$FM11_FM21\t$FM11_FM22\nFM12\t$FF1_FM12_p\t$FF2_FM12_p\t$FM11_FM12_p\t$total\t$FM12_FM21\t$FM12_FM22\nFM21\t$FF1_FM21_p\t$FF2_FM21_p\t$FM11_FM21_p\t$FM12_FM21_p\t$total\t$FM21_FM22\nFM22\t$FF1_FM22_p\t$FF2_FM22_p\t$FM11_FM22_p\t$FM12_FM22_p\t$FM21_FM22_p\t$total\n" >> $out
done


# Count number of homozygous windows per each founder
perl Count_homoz_founder_windows.pl $out Scand_wolf_founder_homoz_windows_11w_discarded.txt

# Between founder comparisons were done in MS Excel worksheet








##########################################      ABSENCE OF PHASED 1 MB HAPLOTYPES 		########################################### 


#Founder haplotype frequencies for each subpopulation were calculated individually for each chromosome in MS Excel.
#For further analysis and visualisation purposes coordinates of the absent haplotype segments were transferred to individual BED files - one per each founder haplotype and time period.
#(start coordinate is the first lost window of a segment; end coordinate is the window right after the last lost window of the segment)

#1. Analysing haplotypes of the female founder (FF1 and FF2)

#list of bed files:
Lost_segments_FF1_1983-1993.bed
Lost_segments_FF2_1983-1993.bed
Lost_segments_FF1_1994-2005.bed
Lost_segments_FF2_1994-2005.bed
Lost_segments_FF1_2006-2014.bed
Lost_segments_FF2_2006-2014.bed


		#a) count number of unique haplotype loss to each temporal subsample

module load bioinfo-tools BEDTools/2.29.2


##  uniq absence in 1983-1993 (TFF - meaning that absence of the haplotype is TRUE for 1983-1993 and FALSE for 1994-2005 and 2006-2014) #
# Haplotype FF1
input="Lost_segments_FF1_1983-1993.bed"
comp1="Lost_segments_FF1_1994-2005.bed"
comp2="Lost_segments_FF1_2006-2014.bed"
out="Lost_segments_FF1_1983-1993_present_in_1994-2005_and_2006-2014.bed"
subtractBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
#Result: 0

# Haplotype FF2
input="Lost_segments_FF2_1983-1993.bed"
comp1="Lost_segments_FF2_1994-2005.bed"
comp2="Lost_segments_FF2_2006-2014.bed"
out="Lost_segments_FF2_1983-1993_present_in_1994-2005_and_2006-2014.bed"
subtractBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
#Result: 2 #none of these haplotypes fall within the list of discarded windows

# Total TFF=0+2=2
 

##  uniq absence in 1994-2005 (FTF) #
# Haplotype FF1
input="Lost_segments_FF1_1994-2005.bed"
comp1="Lost_segments_FF1_1983-1993.bed"
comp2="Lost_segments_FF1_2006-2014.bed"
out="Lost_segments_FF1_1994-2005_present_in_1983-1993_and_2006-2014.bed"
subtractBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
#Result: 2


# Haplotype FF2
input="Lost_segments_FF2_1994-2005.bed"
comp1="Lost_segments_FF2_1983-1993.bed"
comp2="Lost_segments_FF2_2006-2014.bed"
out="Lost_segments_FF2_1994-2005_present_in_1983-1993_and_2006-2014.bed"
subtractBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
##Result: 11

# Total FTF=2+11=13



##  uniq absence in 2006-2014 (FFT) #
# Haplotype FF1
input="Lost_segments_FF1_2006-2014.bed"
comp1="Lost_segments_FF1_1983-1993.bed"
comp2="Lost_segments_FF1_1994-2005.bed"
out="Lost_segments_FF1_2006-2014_present_in_1983-1993_and_1994-2005.bed"
subtractBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc
##Result: 100 -1 haplotype from discarded windows


# Haplotype FF2
input="Lost_segments_FF2_2006-2014.bed"
comp1="Lost_segments_FF2_1983-1993.bed"
comp2="Lost_segments_FF2_1994-2005.bed"
out="Lost_segments_FF2_2006-2014_present_in_1983-1993_and_1994-2005.bed"
subtractBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
##Result: 153

# Total FFT=99+153=252


		#b) count common absence of haplotypes in all three subgroups

# Haplotype FF1
input="Lost_segments_FF1_1983-1993.bed"
comp1="Lost_segments_FF1_1994-2005.bed"
comp2="Lost_segments_FF1_2006-2014.bed"
out="TwoStepPHASE_Scand_Finn_wolf/Lost_segments_FF1_1983-1993_1994-2005_2006-2014.bed"
intersectBed -a $input -b $comp1 | intersectBed -a - -b $comp2 >$out

awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc
##Result: 29


# Haplotype FF2
input="Lost_segments_FF2_1983-1993.bed"
comp1="Lost_segments_FF2_1994-2005.bed"
comp2="Lost_segments_FF2_2006-2014.bed"
out="Lost_segments_FF2_1983-1993_1994-2005_2006-2014.bed"
intersectBed -a $input -b $comp1 | intersectBed -a - -b $comp2 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc
##Result: 127

# Total TTT=29+127 = 156




		#c) common absence in two subsamples and presence in one.

			# absent in 1983-1993 and 1994-2005, present in 2006-2014 (TTF)

# Haplotype FF1
input="Lost_segments_FF1_1983-1993.bed"
comp1="Lost_segments_FF1_1994-2005.bed"
comp2="Lost_segments_FF1_2006-2014.bed"
out="Lost_segments_FF1_1983-1993_1994-2005_but_not_in_2006-2014.bed"
intersectBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc
##Result: 0


# Haplotype FF2
input="Lost_segments_FF2_1983-1993.bed"
comp1="Lost_segments_FF2_1994-2005.bed"
comp2="Lost_segments_FF2_2006-2014.bed"
out="Lost_segments_FF2_1983-1993_1994-2005_but_not_in_2006-2014.bed"
intersectBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc
##Result:3

# Total TTF=3


				# absent in 1983-1993 and 2006-2014, present in 1994-2005 (TFT)

# Haplotype FF1
input="Lost_segments_FF1_1983-1993.bed"
comp1="Lost_segments_FF1_2006-2014.bed"
comp2="Lost_segments_FF1_1994-2005.bed"
out="Lost_segments_FF1_1983-1993_2006-2014_but_not_in_1994-2005.bed"
intersectBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc
##Result: 0

# Haplotype FF2
input="Lost_segments_FF2_1983-1993.bed"
comp1="Lost_segments_FF2_2006-2014.bed"
comp2="Lost_segments_FF2_1994-2005.bed"
out="Lost_segments_FF2_1983-1993_2006-2014_but_not_in_1994-2005.bed"
intersectBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc
##Result: 0

# Total TFT=0


				#  absent in 1994-2005 and 2006-2014, present in 1983-1993 (FTT)

# Haplotype FF1
input="Lost_segments_FF1_1994-2005.bed"
comp1="Lost_segments_FF1_2006-2014.bed"
comp2="Lost_segments_FF1_1983-1993.bed"
out="Lost_segments_FF1_1994-2005_2006-2014_but_not_in_1983-1993.bed"
intersectBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc
##Result: 365 	
 
# Haplotype FF2
input="Lost_segments_FF2_1994-2005.bed"
comp1="Lost_segments_FF2_2006-2014.bed"
comp2="Lost_segments_FF2_1983-1993.bed"
out="Lost_segments_FF2_1994-2005_2006-2014_but_not_in_1983-1993.bed"
intersectBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc
##Result: 351 -1 (coinciding with discarded segment)

# Total FTT= 365+350=715




#2. Analysing first founder male (FM1) 

#List of files:
Lost_segments_untill2005_FM11.bed
Lost_segments_untill2005_FM12.bed
Lost_segments_after2005_FM11.bed
Lost_segments_after2005_FM12.bed
 
module load bioinfo-tools BEDTools/2.29.2

		#a) count number of windows absent only in 1994-2005 (-TF) #

# Haplotype FM11
input="Lost_segments_FM11_1994-2005.bed"
comp1="Lost_segments_FM11_2006-2014.bed"
out="Lost_segments_FM11_1994-2005_present_in_2006-2014.bed"
subtractBed -a $input -b $comp1 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
##Results: 8

# Haplotype FM12
input="Lost_segments_FM12_1994-2005.bed"
comp1="Lost_segments_FM12_2006-2014.bed"
out="Lost_segments_FM12_1994-2005_present_in_2006-2014.bed"
subtractBed -a $input -b $comp1 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
##Results:18

# Total -TF=8+18=26



		#b) count number of windows absent only in 2006-2014 (-FT) #

# Haplotype FM11
input="Lost_segments_FM11_2006-2014.bed"
comp1="Lost_segments_FM11_1994-2005.bed"
out="Lost_segments_FM11_2006-2014_present_in_1994-2005.bed"
subtractBed -a $input -b $comp1 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
##Results: 191


# Haplotype FM12
input="Lost_segments_FM12_2006-2014.bed"
comp1="Lost_segments_FM12_1994-2005.bed"
out="Lost_segments_FM12_2006-2014_present_in_1994-2005.bed"
subtractBed -a $input -b $comp1 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
##Results: 93

# Total -FT=191+93=284


		#c count number of windows absent in both time periods (-TT)

# Haplotype FM11
input="Lost_segments_FM11_2006-2014.bed"
comp1="Lost_segments_FM11_1994-2005.bed"
out="Lost_segments_FM11_2006-2014_1994-2005.bed"
intersectBed -a $input -b $comp1 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
##Results: 182


# Haplotype FM12
input="Lost_segments_FM12_2006-2014.bed"
comp1="Lost_segments_FM12_1994-2005.bed"
out="Lost_segments_FM12_2006-2014_1994-2005.bed"
intersectBed -a $input -b $comp1 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
##Results: 240

# Total -TT = 182+240=422




#3. Analysing second founder male (FM2)

#List of files:
Lost_segments_untill2005_FM21.bed
Lost_segments_untill2005_FM22.bed
Lost_segments_after2005_FM21.bed
Lost_segments_after2005_FM22.bed
 

module load bioinfo-tools BEDTools/2.29.2

		#a) count number of windows absent only in 1994-2005 (-TF) #

# Haplotype FM21
input="Lost_segments_FM21_1994-2005.bed"
comp1="Lost_segments_FM21_2006-2014.bed"
out="Lost_segments_FM21_1994-2005_present_in_2006-2014.bed"
subtractBed -a $input -b $comp1 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
##Result: 93	

# Haplotype FM22
input="Lost_segments_FM22_1994-2005.bed"
comp1="Lost_segments_FM22_2006-2014.bed"
out="Lost_segments_FM22_1994-2005_present_in_2006-2014.bed"
subtractBed -a $input -b $comp1 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
##Result: 28

# Total -TF = 93+28 = 121



		#b) count number of windows absent only in 2006-2014 (-FT) #

# Haplotype FM21
input="Lost_segments_FM21_2006-2014.bed"
comp1="Lost_segments_FM21_1994-2005.bed"
out="Lost_segments_FM21_2006-2014_present_in_1994-2005.bed"
subtractBed -a $input -b $comp1 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
##Result: 171 -1 conc. with disc. windows

# Haplotype FM22
input="Lost_segments_FM22_2006-2014.bed"
comp1="Lost_segments_FM22_1994-2005.bed"
out="Lost_segments_FM22_2006-2014_present_in_1994-2005.bed"
subtractBed -a $input -b $comp1 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
##Result: 61 -1 conc. with disc. windows

# Total -FT=170+60=230



		#c) count number of windows absent in both time periods (-TT)

# Haplotype FM21
input="Lost_segments_FM21_2006-2014.bed"
comp1="Lost_segments_FM21_1994-2005.bed"
out="Lost_segments_FM21_2006-2014_1994-2005.bed"
intersectBed -a $input -b $comp1 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
##Result: 85


# Haplotype FM22
input="Lost_segments_FM22_2006-2014.bed"
comp1="Lost_segments_FM22_1994-2005.bed"
out="Lost_segments_FM22_2006-2014_1994-2005.bed"
intersectBed -a $input -b $comp1 >$out
awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc
##Result: 166

# Total -TT= 85+166=251

__________________________________________________________________
Tools used:
BEDTools/2.27.1






########################################     ABSENCE ANALYSIS OF INDIVIDUAL SNP ALLELES     ##############################################

#1. Data preparation and extra filtering steps

	#a) Extract a list of variants where 1983-1993 sample (Founder population) has no missing genotypes

ind="Scand_wolf_1983-1993_pop_IDs.txt"
inp="76Scand.98Finnish.filtered.v6.noHighSNPdensity.recode.vcf" 

module load bioinfo-tools vcftools/0.1.15

vcftools --vcf $inp  --recode --out ${inp}.founder_pop.noNAs --keep $ind --max-missing-count 0

	#b) Make a file with all individuals, but only for those positions where 1983-1993 sample doesn't have any missing values

inp="76Scand.98Finnish.filtered.v6.noHighSNPdensity.recode.vcf" 
snps_to_keep="${inp}.founder_pop.noNAs.recode.vcf"

vcftools --vcf $inp  --recode --out ${inp%.recode.vcf}.founder_pop.noNAs --positions $snps_to_keep


	#c) Extract a vcf file with 76 Scandinavian wolves and keep only those SNPs that are segregating in this population.

inp="76Scand.98Finnish.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf"
out="76Scand.subset.filtered.v6.noHighSNPdensity.founder_pop.noNAs"
keep="Scand_wolf_76_individuals_IDs.txt"

module load bioinfo-tools vcftools/0.1.15 htslib/1.6

vcftools --vcf $inp --keep $keep --mac 1 --min-alleles 2 --max-alleles 2  --recode --out $out
bgzip $out.recode.vcf
tabix $out.recode.vcf.gz



#2. Extracting vcf file for each subsample of interest.

module load bioinfo-tools vcftools/0.1.15 htslib/1.6


	#a) For a female founder:


inp="76Scand.subset.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz"
out="D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs"
keep="Scand_wolf_FF.txt"

vcftools --vcf $inp --keep $keep --mac 1 --min-alleles 2 --max-alleles 2  --recode --out $out
bgzip $out.recode.vcf
tabix $out.recode.vcf.gz


	#b) offspring of the first male founder:

inp="76Scand.subset.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz"
out="FM1-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs"
keep="Scand_wolf_FM1_F1-3_offspring.txt"

vcftools --vcf $inp --keep $keep --mac 1 --min-alleles 2 --max-alleles 2  --recode --out $out
bgzip $out.recode.vcf
tabix $out.recode.vcf.gz


	#c) offspring of the second male founder:
	
inp="76Scand.subset.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz"
out="FM2-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs"
keep="Scand_wolf_FM2_F1_offspring.txt"

vcftools --vcf $inp --keep $keep --mac 1 --min-alleles 2 --max-alleles 2  --recode --out $out
bgzip $out.recode.vcf
tabix $out.recode.vcf.gz


	#d) 1983-1993 subsample:

inp="76Scand.subset.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz"
out="1983-1993.filtered.v6.noHighSNPdensity.founder_pop.noNAs"
keep="Scand_wolf_born_1983-1993.txt"

vcftools --vcf $inp --keep $keep --mac 1 --min-alleles 2 --max-alleles 2  --recode --out $out
bgzip $out.recode.vcf
tabix $out.recode.vcf.gz


	#e) 1994-2005 subsample

inp="76Scand.subset.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz"
out="1994-2005.filtered.v6.noHighSNPdensity.founder_pop.noNAs"
keep="Scand_wolf_born_1994-2005.txt"

vcftools --vcf $inp --keep $keep --mac 1 --min-alleles 2 --max-alleles 2  --recode --out $out
bgzip $out.recode.vcf
tabix $out.recode.vcf.gz


	#f) 2006-2014 subsample
	
inp="76Scand.subset.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz"
out="2006-2014.filtered.v6.noHighSNPdensity.founder_pop.noNAs"
keep="Scand_wolf_born_2006-2014.txt"

vcftools --vcf $inp --keep $keep --mac 1 --min-alleles 2 --max-alleles 2  --recode --out $out
bgzip $out.recode.vcf
tabix $out.recode.vcf.gz



#3. Converting vcf files to allele tables [chr	pos	Ind1_Allele1	Ind1_Allele2, Ind2_Allele1	Ind2_Allele2, ... ]:

perl VCF2table.v2.pl D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table
perl VCF2table.v2.pl FM1-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz FM1-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table
perl VCF2table.v2.pl FM2-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz FM2-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table
perl VCF2table.v2.pl 1983-1993.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz 1983-1993.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table
perl VCF2table.v2.pl 1994-2005.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz 1994-2005.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table
perl VCF2table.v2.pl 2006-2014.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz 2006-2014.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table



#4. Female founder SNP allele comparisons

	#a) Count the occurrence of each female founder's (FF) allele in the three subgroups 1983-1993; 1994-2005; and 2006-2014

inp="D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
group1="1983-1993.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
group2="1994-2005.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
group3="2006-2014.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
out="D-85-01.allele.counts.txt"

perl allele.counts.v2.pl $inp $group1 $group2 $group3 $out

	#b) Bin all of the female founder alleles into 8 bins according to presence/absence pattern:

perl Summary.FF.lost.alleles.pl $out ${out%.txt}.summary.txt




#5. Infering the portion of male founder alleles: unique alleles to the first (FM1) and second male founder (FM2) + those common in both, but absent in female founder

#!!NB!! all allele.table files have to contain the same marker positions in the same order.


	#a) Extract SNP alleles that are unique to FM1 offspring, but not to FM2 offspring and FF

presence="FM1-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
absence1="FM2-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
absence2="D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
out="FM1-offsp.unique.allele.table"

perl presence.absence.analysis.pl $presence $absence1 $absence2 $out


	#b) Extract SNP alleles that are unique to FM2 offspring, but not to FM1 offspring and FF

presence="FM2-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
absence1="FM1-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
absence2="D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
out="FM2-offsp.unique.allele.table"

perl presence.absence.analysis.pl $presence $absence1 $absence2 $out


	#c) Extract SNP alleles that are in MF1 and MF2 offspring (1983-1993 sample) that are not in FF. 

# As technically script requires two populations of absence, I supplied FF alleles twice

presence="1983-1993.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
absence1="D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
absence2="D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
out="1983-1993_FM1_FM2.allele.table"

perl presence.absence.analysis.pl $presence $absence1 $absence2 $out

	
#6.  Male founder allele comparisons: binning into 4 bins, according to wether they are present or absent in the 1994-2005; 2006-2014 subgroups.

	#a) Unique alleles of the first male founder

inp="FM1-offsp.unique.allele.table"
group2="1994-2005.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
group3="2006-2014.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"

perl FM1_allele.comparisons.pl $inp $group2 $group3 $inp.bin.summary


	#b) Unique alleles of the second male founder

inp="FM2-offsp.unique.allele.table"
group2="1994-2005.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
group3="2006-2014.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"

perl FM2_allele.comparisons.pl $inp $group2 $group3 $inp.bin.summary

	#c) Common alleles to both male founders
	
inp="1983-1993_FM1_FM2_without_uniq_FM1_FM2_offsp.allele.table"
group2="1994-2005.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
group3="2006-2014.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"

perl 1983-1993_FM1_FM2_common_allele.comparisons.pl $inp $group2 $group3 $inp.bin.summary


	#d) alleles present in 1994-2005; 2006-2014 that are absent in FF and in 1983-1993

#locate alleles present in 1994-2005 but not in 1983-1993 and FF
presence="1994-2005.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
absence1="D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
absence2="1983-1993.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
out="1994-2005.not.inFFand1983-1993.allele.table"

perl presence.absence.analysis.pl $presence $absence1 $absence2 $out

#locate alleles present in 2006-2014 but not in 1983-1993 and FF
presence="2006-2014.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
absence1="D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
absence2="1983-1993.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table"
out="2006-2014.not.inFFand1983-1993.allele.table"

perl presence.absence.analysis.pl $presence $absence1 $absence2 $out



#7  Collecting allele lists that are not present at certain subsample

#FF_alleles not present in 1983-1993
sed 's/$/\tTTT/' D-85-01.allele.counts.bin7.TTT >> D-85-01.alleles.not.in.1983-1993
sed 's/$/\tTTF/' D-85-01.allele.counts.bin1.TTF >> D-85-01.alleles.not.in.1983-1993
sed 's/$/\tTFT/' D-85-01.allele.counts.bin2.TFT >> D-85-01.alleles.not.in.1983-1993
sed 's/$/\tTFF/' D-85-01.allele.counts.bin4.TFF >> D-85-01.alleles.not.in.1983-1993

wc -l D-85-01.alleles.not.in.1983-1993
10108 D-85-01.alleles.not.in.1983-1993



#FF_alleles not present in 1994-2005
sed 's/$/\tTTT/' D-85-01.allele.counts.bin7.TTT >> D-85-01.alleles.not.in.1994-2005
sed 's/$/\tTTF/' D-85-01.allele.counts.bin1.TTF >> D-85-01.alleles.not.in.1994-2005
sed 's/$/\tFTT/' D-85-01.allele.counts.bin3.FTT >> D-85-01.alleles.not.in.1994-2005
sed 's/$/\tFTF/' D-85-01.allele.counts.bin6.FTF >> D-85-01.alleles.not.in.1994-2005

wc -l D-85-01.alleles.not.in.1994-2005
71588 D-85-01.alleles.not.in.1994-2005


#FF_alleles not present in 2006-2014
sed 's/$/\tTTT/' D-85-01.allele.counts.bin7.TTT >> D-85-01.alleles.not.in.2006-2014
sed 's/$/\tTFT/' D-85-01.allele.counts.bin2.TFT >> D-85-01.alleles.not.in.2006-2014
sed 's/$/\tFTT/' D-85-01.allele.counts.bin3.FTT >> D-85-01.alleles.not.in.2006-2014
sed 's/$/\tFFT/' D-85-01.allele.counts.bin5.FFT >> D-85-01.alleles.not.in.2006-2014

wc -l D-85-01.alleles.not.in.2006-2014
91766 D-85-01.alleles.not.in.2006-2014



#FM1_alleles not present in 1994-2005
sed 's/$/\tTT/' FM1-offsp.unique.allele.bin1.TT >> FM1-offsp.alleles.not.in.1994-2005
sed 's/$/\tTF/' FM1-offsp.unique.allele.bin3.TF >> FM1-offsp.alleles.not.in.1994-2005

wc -l FM1-offsp.alleles.not.in.1994-2005
20002 FM1-offsp.alleles.not.in.1994-2005

#FM1_alleles not present in 2006-2014
sed 's/$/\tTT/' FM1-offsp.unique.allele.bin1.TT >> FM1-offsp.alleles.not.in.2006-2014
sed 's/$/\tFT/' FM1-offsp.unique.allele.bin2.FT >> FM1-offsp.alleles.not.in.2006-2014

wc -l FM1-offsp.alleles.not.in.2006-2014
34189 FM1-offsp.alleles.not.in.2006-2014


#FM2_alleles not present in 1994-2005
sed 's/$/\tTT/' FM2-offsp.unique.allele.bin1.TT >> FM2-offsp.alleles.not.in.1994-2005
sed 's/$/\tTF/' FM2-offsp.unique.allele.bin3.TF >> FM2-offsp.alleles.not.in.1994-2005

wc -l FM2-offsp.alleles.not.in.1994-2005
24070 FM2-offsp.alleles.not.in.1994-2005


#FM1_alleles not present in 2006-2014
sed 's/$/\tTT/' FM2-offsp.unique.allele.bin1.TT >> FM2-offsp.alleles.not.in.2006-2014
sed 's/$/\tFT/' FM2-offsp.unique.allele.bin2.FT >> FM2-offsp.alleles.not.in.2006-2014

wc -l FM2-offsp.alleles.not.in.2006-2014
28735 FM2-offsp.alleles.not.in.2006-2014

__________________________________________________________________
Tools used:
custom perl scripts
vcftools/0.1.15






########################################     CONCORDANCE BETWEEN THE LOST SNPs AND 1 MB WINDOWS    ####################################

#1. Count total nr of analysed individual SNPs per chromosome (for statistical overview):


echo -e "chr\tmarkers_total\tFF_unique_lost\tFF_not_in_1994-2005\tFF_not_in_2006-20014\tFM1_unique_kept\tFM1_not_in_1994-2005\tFM1_not_in_2006-2014\tFM2_unique_kept\tFM2_not_in_1994-2005\tFM2_not_in_2006-2014" > number_of_markers_per_chr_new.txt

for chr in {1..38} X
do
FF_unique=(`echo $(grep -P "^$chr\t" D-85-01.alleles.not.in.1983-1993 |wc -l)` )
FM1_unique=(`echo $(grep -P "^$chr\t" FM1-offsp.unique.allele.table | wc -l )` )
FM2_unique=(`echo $(grep -P "^$chr\t" FM2-offsp.unique.allele.table | wc -l)` )

FF_not_in_pop1=(`echo $(grep -P "^$chr\t" D-85-01.alleles.not.in.1994-2005 |wc -l)` )
FM1_not_in_pop1=(`echo $(grep -P "^$chr\t" FM1-offsp.alleles.not.in.1994-2005 |wc -l )` )
FM2_not_in_pop1=(`echo $(grep -P "^$chr\t" FM2-offsp.alleles.not.in.1994-2005 |wc -l )` )

FF_not_in_pop2=(`echo $(grep -P "^$chr\t" D-85-01.alleles.not.in.2006-2014 | wc -l )` )
FM1_not_in_pop2=(`echo $(grep -P "^$chr\t" FM1-offsp.alleles.not.in.2006-2014 | wc -l )` )
FM2_not_in_pop2=(`echo $(grep -P "^$chr\t" FM2-offsp.alleles.not.in.2006-2014 | wc -l )` )

markers=(`echo $(zcat 76Scand.subset.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz | grep -P "^${chr}\t" - | wc -l )` )
echo -e "$chr\t$markers\t$FF_unique\t$FF_not_in_pop1\t$FF_not_in_pop2\t$FM1_unique\t$FM1_not_in_pop1\t$FM1_not_in_pop2\t$FM2_unique\t$FM2_not_in_pop1\t$FM2_not_in_pop2" >> number_of_markers_per_chr_new.txt
done


#2. Counting SNPs that coinside with absent 1 Mb windows per each founder per period

	#a) Alleles absent in 1994-2005 subsample

perl sort_lost_variation_according_to_bed.v2.pl Lost_segments_untill2005.txt  Lost_segments_untill2005_SNP+marker_count.txt

	#b) Alleles absent in 2006-2014 subsample

perl sort_lost_variation_according_to_bed.v3.pl Lost_segments_after2005.txt  Lost_segments_after2005_SNP+marker_count.txt

	#c) Alleles absent in 1983-1993 subsample

perl sort_lost_variation_according_to_bed.v4.pl Lost_segments_FFonly.txt  Lost_segments_FFonly_SNP+marker_count.txt

	#d) Count unique alleles only, produce summary table of the number of concordant SNPs
	
#the issue with this calculation is that if both homologous parts of the same chromosome have been lost, it counts SNPs within these regions twice.
#after execution of the script loop through .concordantSNPs files and count only unique values:


echo -e "chr\tFF_unique\tFF_not_in_pop1\tFF_not_in_pop2\tFM1_not_in_pop1\tFM1_not_in_pop2\tFM2_not_in_pop1\tFM2_not_in_pop2" > number_of_Concordant_markers_per_chr_new.txt

for chr in {1..38} X
do
FF_unique=(`echo $(grep -P "^$chr\t" D-85-01.alleles.not.in.1983-1993.concordantSNPs | cut -f2 |sort | uniq |wc -l)` )
FF_not_in_pop1=(`echo $(grep -P "^$chr\t" D-85-01.alleles.not.in.1994-2005.concordantSNPs | cut -f2 |sort | uniq |wc -l)` )
FF_not_in_pop2=(`echo $(grep -P "^$chr\t" D-85-01.alleles.not.in.2006-2014.concordantSNPs | cut -f2 |sort | uniq |wc -l)` )
FM1_not_in_pop1=(`echo $(grep -P "^$chr\t" FM1-offsp.alleles.not.in.1994-2005.concordantSNPs | cut -f2 |sort | uniq |wc -l)` )
FM1_not_in_pop2=(`echo $(grep -P "^$chr\t" FM1-offsp.alleles.not.in.2006-2014.concordantSNPs | cut -f2 |sort | uniq |wc -l)` )
FM2_not_in_pop1=(`echo $(grep -P "^$chr\t" FM2-offsp.alleles.not.in.1994-2005.concordantSNPs| cut -f2 |sort | uniq |wc -l)` )
FM2_not_in_pop2=(`echo $(grep -P "^$chr\t" FM2-offsp.alleles.not.in.2006-2014.concordantSNPs | cut -f2 |sort | uniq |wc -l)` )
echo -e "$chr\t$FF_unique\t$FF_not_in_pop1\t$FF_not_in_pop2\t$FM1_not_in_pop1\t$FM1_not_in_pop2\t$FM2_not_in_pop1\t$FM2_not_in_pop2" >> number_of_Concordant_markers_per_chr_new.txt
done
 
#then contrast these values to the number of SNPs lost at each period (in MS Excel).

#To pinpoint more precise boundaries of the lost 1 Mb chromosome segments they were manually updated by using first/last individual SNP allele of the SNP cluster that coincided with the absent 1 Mb haplotype cluster (if possible).
__________________________________________________________________
Tools used:
custom perl scripts
BEDTools/2.27.1
vcftools/0.1.15








##############################################    COMPILATION OF THE USED PERL SCRIPTS    ####################################


#1#
maf2bed.pl 
#!usr/bin/perl
#This script reads in .maf file from lastz alignment and converts it to a bed file with coordinates of the aligned sequences 

use strict;
use warnings;

open (FH, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; #maf file
open (FH2, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; #bed file



while (<FH>) {
        if ($_ =~ /^s/ && $_ =~ /\+/){
                #print STDOUT $_;
                chomp $_;
                my @data = split(/[\s\t]+/,$_);
                #print STDOUT "$data[2]\t$data[3]\n";
                my $end=$data[2]+$data[3];
                my $start=$data[2];
                print FH2 "$data[1]\t$start\t$end\n";
        }
    elsif ($_ =~ /^s/ && $_ =~ /\-/){
                chomp $_;
                my @data = split(/[\s\t]+/,$_);
                my $end=$data[5]-$data[2];
                my $start=$data[5]-$data[2]-$data[3];
                print FH2 "$data[1]\t$start\t$end\n";
        }
    }





#2#
plot.histogram.pl
#execution: module load perl_modules/5.24.1
#perl ./plot.histogram.pl infile columNr-1
use strict;
use warnings;
use Statistics::Histogram;

open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n";

my @hist = ();

while (<FH1>) {
		chomp $_;
		my @data = split(/\t/,$_);
        push @hist, $data[$ARGV[1]];
        } 
print STDOUT get_histogram(\@hist);






#3#
Transform_VCF2PHASE.pl
#!usr/bin/perl
#Script reads in a vcf file with SNP genotypes and transforms it into an inputfile for PHASE 
#Chromosome needs to be specified
#Usage: perl Transform_VCF2PHASE.pl <vcf.file> <PHASE.inp> <chromosome>

use strict;
use warnings;
local $" = "\t";

open (FH, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; # input vcf 
open (FH2, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; # output file

my $chr=$ARGV[2];
my @samples;
my @colums;
my @POS;
my $type = '';;
my $nr_of_samples = '';
my $nr_of_SNPs = '';

while (<FH>) {
        my $line = $_;
        my @data;
        chomp $line;
        if ($_ =~ /^#CHROM/){
        
                @colums = split(/\t/,$line);
                $nr_of_samples = scalar @colums - 9;
                @samples = @colums[9..scalar @colums -1];
                                                
                #print STDOUT "@colums\n$nr_of_samples\t@samples";              
                }

    elsif ($_ =~ /^$chr/){
  
        @data = split(/\t/,$line);
        $type = join '', $type, 'S';
        push @POS, $data[1];
        }
        $nr_of_SNPs = scalar @POS;
     
     }
#print STDOUT "@POS\n";
print FH2 "$nr_of_samples\n$nr_of_SNPs\nP@POS\n$type\n";

my $i = 9;
foreach my $ind (@samples){
     
     open (FH, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n";

     my $geno= '';
     my @alleleA;
     my @alleleB;
    
     
     while (<FH>) {
        my @data;
        my $line = $_;
                if ($_ =~ /^$chr/){
                        chomp $line;
                @data = split(/\t/,$line);
                $geno = $data[$i];
                #print STDOUT "$geno and $data[$i]\n";
                }       
                if ($geno =~ /^0[\/\|]0/){
                        push @alleleA, $data[3];
                        push @alleleB, $data[3];
                        }
                elsif ($geno=~ /^1[\/\|]0/){
                        push @alleleA, $data[4];
                        push @alleleB, $data[3];
                        }
                elsif ($geno =~ /^0[\/\|]1/){
                        push @alleleA, $data[3];
                        push @alleleB, $data[4];
                        }
                elsif ($geno =~ /^1[\/\|]1/){
                        push @alleleA, $data[4];
                        push @alleleB, $data[4];
                        }
                elsif ($geno =~ /^\.[\/\|]\./){
                        push @alleleA, "?";
                        push @alleleB, "?";
                        }
                }       
        print FH2 "#$ind\n@alleleA\n@alleleB\n";
    $i++;
    #print STDOUT "$i\n";               
    }



'#4#
Extract_haplotype_pairs_PHASE.v2.pl
#!usr/bin/perl
#Script reads in a set of .PHASE.out files (for example, all output files of individual 1 Mb window phasing of a chromosome) and extracts BESTPAIRS_SUMMARY
#Output is a table, where first column is a list of individual sample IDs and remaining columns contain haplotype IDs of particular windows from the 1st step of phasing.
#Intended for collecting results after running PHASE on sliding windows
#Usage: perl Extract_haplotype_pairs_PHASE.v2.pl <pattern of .PHASE.out files> <outputfile>
#example of pattern: PHASING/1MB_windows/chr${chr}/PHASE_1_newfilt/Scand_wolf.markers.for.phasing1_1Mb_*0.PHASE.out


use strict;
use warnings;
use Sort::Naturally;

print STDOUT "$ARGV[0] \n";
my @FILES = nsort <$ARGV[0]>;


print STDOUT "@FILES \n";

open (FH2, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n";
open (FH3, '<', $FILES[0]) or die "Can't open $FILES[0]: $!\n";
print FH2 "SampleID\n";

while (<FH3>) {
                my @data =();
                my @haps =();
                if ($_ =~/^#/){
                #print STDOUT $_;
                chomp $_;
                @data = split(/:/,$_);
                print FH2 "$data[0]_1\n$data[0]_2\n";
        }
}
foreach my $file (@FILES) {
                my @window = split(/b_/,$file);
        print FH2 "$window[1]\n";
        open (FH, '<', $file ) or die "Can't open $ARGV[0]: $!\n";
        while (<FH>) {
                my @data =();
                my @haps =();
                if ($_ =~/^#/){
                #print STDOUT $_;
                chomp $_;
                @data = split(/:/,$_);
                @haps = split(/[,\(\)]/,$data[1]);
                print FH2 "$haps[1]\n$haps[2]\n";
        }
    }

}


#5#
Collect.PHASE.haps.pl
#!usr/bin/perl
#Script is intended to be used after 2nd phasing step (PHASE results for chromosome-level haplotypes)
#It reads in a PHASE.out file and extracts info from BESTPAIRS1 section
#output file is formated to be imported in Excel for manual curration
#Usage: perl Collect.PHASE.haps.pl <PHASE.out> <.out.forExcel.txt>
#If several runs are compared, PHASE.out can also be a pattern match of file names


use strict;
use warnings;

my @FILES = <"$ARGV[0]">; # PHASE.out file (can also be used a patern match if several runs compared)
open (FH2, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n";

foreach my $file (@FILES) {
        print FH2 "$file\n";
        open (FH, '<', $file ) or die "Can't open $ARGV[0]: $!\n";
        while (<FH>) {
        my @data=();
                if ($_ =~/^0 #/){
                        #print STDOUT $_;
                        chomp $_;
                        @data = split(/#/,$_);
                        #print STDOUT "@data";
                        my $hap1 = <FH>;
                        my $hap2 = <FH>;
                        print FH2 "$data[1] $hap1$data[1] $hap2";
        }
    }

}



#6#
Count_homoz_founder_windows.pl
#!usr/bin/perl
#This script reads in a text file with pairwise comparison matrices of all six founder haplotypes for all chromosomes and outputs a table of homozygous windows per founder per chromosome
#Usage: perl Count_homoz_founer_windows.pl <pairwise_comparison_matrices.txt> <output.txt>

use strict;
use warnings;


open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n";
open (FH2, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n";

print FH2 "Chr\tFF\tFM1\tFM2\n";

while (<FH1>) {

	for (my $i=1; $i <= 39; $i++) {	

		next unless $_ =~ /^$i\t/; 
		my $FF_line = <FH1>;
		chomp $FF_line;
		my @data_FF = split(/\t/,$FF_line);
		#print STDOUT "@data_FF\n";
		<FH1>; 
		my $FM1_line = <FH1>;
		chomp $FM1_line;
		my @data_FM1 = split(/\t/,$FM1_line);
		<FH1>; 
		my $FM2_line = <FH1>;
		chomp $FM2_line;
		my @data_FM2 = split(/\t/,$FM2_line);
		print FH2 "$i\t$data_FF[2]\t$data_FM1[4]\t$data_FM2[6]\n";
	}
}







#7#
VCF2table.v2.pl
#!usr/bin/perl
#Script transforms a vcf file to a table of alleles
#Usage: perl VCF2table.v2.pl <input.vcf> <output.allele.table>

use strict;
use warnings;
local $" = "\t";

open (FH, "-|", "gzip -dc $ARGV[0]") or die "Can't open $ARGV[0]: $!\n"; # input vcf
open (FH2, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; # output file

#print STDOUT "opened file\n";

my $length='';

while (<FH>) {
    chomp $_;
#       print STDOUT "$_\n";

    if ($_ =~ /^#CHR/){
        print STDOUT "got CHR\n";
        my @heder = split(/\t/,$_);
        print STDOUT "heder is @heder\n";
        $length = scalar @heder;
        print STDOUT "length of heder is $length\n";
        my @indiv = @heder[9 .. $length - 1];
        print STDOUT "exporting individuals @indiv to a table\n";
        print FH2 join("\t", $heder[0], $heder[1], map { $_, $_ }  @indiv) . "\n";
        }

     if ($_ =~ /^[\dX]/){
        
        my @data = split(/\t/,$_);
        my $ref_allele = $data[3];
        my $alt_allele = $data[4];
        my @alleles = ();

        for(my $i=9; $i <= $length - 1; $i++) {                       
                my @geno = split /[:\/\|]/, $data[$i];
                #print STDOUT "my geno is @geno\n";

                                #assign allele 1  
                                if ($geno[0] =~ 0){
                        push @alleles, $ref_allele;
                } elsif ($geno[0]=~ 1){
                    push @alleles, $alt_allele;
                                } else {
                                        push @alleles, ".";
                                }
                        
                                #assign allele 2  
                                if ($geno[1] =~ 0){
                        push @alleles, $ref_allele;
                }
                                elsif ($geno[1]=~ 1){
                    push @alleles, $alt_allele;
                                }
                                else {
                                        push @alleles, ".";
                                }
                        }
        print FH2 "$data[0]\t$data[1]\t@alleles\n";
        }
}            







#8#
allele.counts.v2.pl
#!usr/bin/perl
#Script reads in allele.table of a single individual and counts the occurance of each allele in 3 different subgroups
#Usage: perl allele.counts.v2.pl <input.allele.table> <subgroup1.allele.table> <subgroup2.allele.table> <subgroup3.allele.table> <output.allele.counts.txt>

use strict;
use warnings;
local $" = "\t";

open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n";
open (FH2, '<', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n";
open (FH3, '<', $ARGV[2]) or die "Can't open $ARGV[2]: $!\n";
open (FH4, '<', $ARGV[3]) or die "Can't open $ARGV[3]: $!\n";
open (FH5, '>', $ARGV[4]) or die "Can't open $ARGV[4]: $!\n";

print FH5 "CHR\tPOS\tFF1\tGroup1\tGroup2\tGroup3\tFF2\tGroup1\tGroup2\tGroup3\n";

while (<FH1>) {
        
        my $input = $_;
        my $subgroup1 = <FH2>;
        my $subgroup2 = <FH3>;
        my $subgroup3 = <FH4>;
        
        if ($input !~ /^#/) {
        
                chomp $input;
                chomp $subgroup1;
                chomp $subgroup2;
                chomp $subgroup3;

                my @input = split(/\t/, $input);
                my @subgroup1 = split(/\t/, $subgroup1);
                my @subgroup2 = split(/\t/, $subgroup2);
                my @subgroup3 = split(/\t/, $subgroup3);

                my $sg1Lgth = scalar @subgroup1;
                my $sg2Lgth = scalar @subgroup2;
                my $sg3Lgth = scalar @subgroup3;

                my @sg1Alleles = @subgroup1[2..$sg1Lgth - 1];
                my @sg2Alleles = @subgroup2[2..$sg2Lgth - 1];
                my @sg3Alleles = @subgroup3[2..$sg3Lgth - 1];

                my $FF1_in_SG1 = 0;
                my $FF1_in_SG2 = 0;
                my $FF1_in_SG3 = 0;
                my $FF2_in_SG1 = 0;
                my $FF2_in_SG2 = 0;
                my $FF2_in_SG3 = 0;

                my @out = @input[0..1];
        
                #count acureances of FF1 allele in three subgroups, store counts in @out array
                if ( grep( /^$input[2]$/, @sg1Alleles ) ) {
                $FF1_in_SG1 = grep (/^$input[2]$/, @sg1Alleles);
                }
                if ( grep( /^$input[2]$/, @sg2Alleles ) ) {
                $FF1_in_SG2 = grep (/^$input[2]$/, @sg2Alleles);
                }
                if ( grep( /^$input[2]$/, @sg3Alleles ) ) {
                $FF1_in_SG3 = grep (/^$input[2]$/, @sg3Alleles);
                }
                push @out, ($input[2], $FF1_in_SG1, $FF1_in_SG2, $FF1_in_SG3);

                #count acureances of FF2 allele in three subgroups, store counts in @out array
                if ( grep( /^$input[3]$/, @sg1Alleles ) ) {
                $FF2_in_SG1 = grep (/^$input[3]$/, @sg1Alleles);
                }
                if ( grep( /^$input[3]$/, @sg2Alleles ) ) {
                $FF2_in_SG2 = grep (/^$input[3]$/, @sg2Alleles);
                }
                if ( grep( /^$input[3]$/, @sg3Alleles ) ) {
                $FF2_in_SG3 = grep (/^$input[3]$/, @sg3Alleles);
                }
                push @out, ($input[3], $FF2_in_SG1, $FF2_in_SG2, $FF2_in_SG3);
                print FH5 "@out\n";

        }
}






#9#
Summary.FF.lost.alleles.pl
# Script tailored specifically to count lost alleles of the female founder
# Script reads in the output from allele.counts.v2.pl and bins all alleles into 8 different files according to presence/absence pattern within 3 subgroups
# T stands for "absence of allele is TRUE"; F stands for "absence of allele is FALSE"
# usage: perl Summary.FF.lost.alleles.pl <allele.counts.txt> <allele.counts.summary.txt>

use strict;
use warnings;
local $" = "\t";

open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n";
open (FH2, '>', $ARGV[1]) or die "Can't open $ARGV[3]: $!\n";

#make output files to store coordinates of alleles of a particular bin
open (BIN1, '>', "D-85-01.allele.counts.bin1.TTF") or die "Can't open BIN1: $!\n";
open (BIN2, '>', "D-85-01.allele.counts.bin2.TFT") or die "Can't open BIN2: $!\n";
open (BIN3, '>', "D-85-01.allele.counts.bin3.FTT") or die "Can't open BIN3: $!\n";
open (BIN4, '>', "D-85-01.allele.counts.bin4.TFF") or die "Can't open BIN4: $!\n";
open (BIN5, '>', "D-85-01.allele.counts.bin5.FFT") or die "Can't open BIN5: $!\n";
open (BIN6, '>', "D-85-01.allele.counts.bin6.FTF") or die "Can't open BIN6: $!\n";
open (BIN7, '>', "D-85-01.allele.counts.bin7.TTT") or die "Can't open BIN7: $!\n";
open (BIN8, '>', "D-85-01.allele.counts.bin8.FFF") or die "Can't open BIN8: $!\n";


#define 8 bins based on presence/absence pattern
#3 reference periods are in consecutive order
my $TTF=0;
my $TFT=0;
my $FTT=0;
my $TFF=0;
my $FFT=0;
my $FTF=0;
my $TTT=0;
my $FFF=0;

my $total=0;

<FH1>;
while (<FH1>) {
        chomp $_;
#       print "$_";
        my @data = split (/\t/, $_);
        my @FF1alleles = @data[3..5];
        my @FF2alleles = @data[7..9];
        
#       print "alleles to count:\n@FF1alleles\n@FF2alleles\n";

        if ($FF1alleles[0] == 0 && $FF1alleles[1] == 0 && $FF1alleles[2] != 0) { $TTF++; $total++; print BIN1 "@data[0..2]\n";}
        if ($FF1alleles[0] == 0 && $FF1alleles[1] != 0 && $FF1alleles[2] == 0) { $TFT++; $total++; print BIN2 "@data[0..2]\n";}
        if ($FF1alleles[0] != 0 && $FF1alleles[1] == 0 && $FF1alleles[2] == 0) { $FTT++; $total++; print BIN3 "@data[0..2]\n";}
        if ($FF1alleles[0] == 0 && $FF1alleles[1] != 0 && $FF1alleles[2] != 0) { $TFF++; $total++; print BIN4 "@data[0..2]\n";}
        if ($FF1alleles[0] != 0 && $FF1alleles[1] != 0 && $FF1alleles[2] == 0) { $FFT++; $total++; print BIN5 "@data[0..2]\n";}
        if ($FF1alleles[0] != 0 && $FF1alleles[1] == 0 && $FF1alleles[2] != 0) { $FTF++; $total++; print BIN6 "@data[0..2]\n";}
        if ($FF1alleles[0] == 0 && $FF1alleles[1] == 0 && $FF1alleles[2] == 0) { $TTT++; $total++; print BIN7 "@data[0..2]\n";}
        if ($FF1alleles[0] != 0 && $FF1alleles[1] != 0 && $FF1alleles[2] != 0) { $FFF++; $total++; print BIN8 "@data[0..2]\n";}
        
        if ($FF2alleles[0] == 0 && $FF2alleles[1] == 0 && $FF2alleles[2] != 0) { $TTF++; $total++; print BIN1 "@data[0..1]\t$data[6]\n";}
        if ($FF2alleles[0] == 0 && $FF2alleles[1] != 0 && $FF2alleles[2] == 0) { $TFT++; $total++; print BIN2 "@data[0..1]\t$data[6]\n";}
        if ($FF2alleles[0] != 0 && $FF2alleles[1] == 0 && $FF2alleles[2] == 0) { $FTT++; $total++; print BIN3 "@data[0..1]\t$data[6]\n";}
        if ($FF2alleles[0] == 0 && $FF2alleles[1] != 0 && $FF2alleles[2] != 0) { $TFF++; $total++; print BIN4 "@data[0..1]\t$data[6]\n";}
        if ($FF2alleles[0] != 0 && $FF2alleles[1] != 0 && $FF2alleles[2] == 0) { $FFT++; $total++; print BIN5 "@data[0..1]\t$data[6]\n";}
        if ($FF2alleles[0] != 0 && $FF2alleles[1] == 0 && $FF2alleles[2] != 0) { $FTF++; $total++; print BIN6 "@data[0..1]\t$data[6]\n";}
        if ($FF2alleles[0] == 0 && $FF2alleles[1] == 0 && $FF2alleles[2] == 0) { $TTT++; $total++; print BIN7 "@data[0..1]\t$data[6]\n";}
        if ($FF2alleles[0] != 0 && $FF2alleles[1] != 0 && $FF2alleles[2] != 0) { $FFF++; $total++; print BIN8 "@data[0..1]\t$data[6]\n";}

}
print FH2 "Summarizing file $ARGV[0]\n";
print FH2 "Total number of alleles across bins: $total\n";      
print FH2 "TTF\t$TTF\nTFT\t$TFT\nFTT\t$FTT\nTFF\t$TFF\nFFT\t$FFT\nFTF\t$FTF\nTTT\t$TTT\nFFF\t$FFF\n";





#10#
presence.absence.analysis.pl
# Script was tailored for obtaining unique alleles of male founders by comparing allele presence/absence in different sets of their offspring.
# Script reads in allele tables of several subgroups and compares allele content between those
# the output file has a list of alleles present in one, but absent in other two groups. 
# Usage: perl presence.absence.analysis.pl <presece.allele.table> <absence1.allele.table> <absence2.allele.table> <output>

use strict;
use warnings;
local $" = "\t";

open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n";
open (FH2, '<', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n";
open (FH3, '<', $ARGV[2]) or die "Can't open $ARGV[2]: $!\n";
open (FH4, '>', $ARGV[3]) or die "Can't open $ARGV[3]: $!\n";


while (<FH1>) {
        
        my $presence = $_;
        my $absence1 = <FH2>;
        my $absence2 = <FH3>;
        
        if ($presence !~ /^#/) {
        
                chomp $presence;
                chomp $absence1;
                chomp $absence2;

                my @presence = split(/\t/, $presence);
                my @absence1 = split(/\t/, $absence1);
                my @absence2 = split(/\t/, $absence2);

                print STDOUT "$presence[1]\t$absence1[1]\t$absence2[1]\n";

                my $presLgth = scalar @presence;
                my @alleles = @presence[2..$presLgth - 1];

                my %seen =();
                my @uniq =();

                #make an array with unique alleles
                foreach my $item (@alleles) {
                        push(@uniq, $item) unless $seen{$item}++;
                }

                foreach my $item (@uniq) {
                        if ( grep( /^$item$/, @absence1 ) || grep( /^$item$/, @absence2 )) {
                                print STDOUT "$item\n@absence1\n@absence2\n";
                                } else {
                                print FH4 "$presence[0]\t$presence[1]\t$item\n";
                                print STDOUT "unique value detected\n$item\n@absence1\n@absence2\n";
                                }
                }
        }
}





#11#
FM1_allele.comparisons.pl
#script individually tailored for the first male founder analysis
#it reads in a file with unique alleles of the first male founder and determines if each particular allele 
#is present in the two temporal subgroups and bins unique alleles of the first male founder into 4 output files:
#TT (absent in both subgroups); FT (present in first, absent in the other, TF (vice versa), FF (present in both)
#usage: perl FM_allele.comparisons.pl <FM1_uniq_allele.table> <subgroup2.alele.table> <subgroup3.allele.table> <out.summary>

use strict;
use warnings;
local $" = "\t";

open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n";
open (FH2, '>', $ARGV[3]) or die "Can't open $ARGV[3]: $!\n";

#make output files to store coordinates of alleles in a particular bin
open (BIN1, '>', "FM1-offsp.unique.allele.bin1.TT") or die "Can't open BIN1: $!\n";
open (BIN2, '>', "FM1-offsp.unique.allele.bin2.FT") or die "Can't open BIN2: $!\n";
open (BIN3, '>', "FM1-offsp.unique.allele.bin3.TF") or die "Can't open BIN3: $!\n";
open (BIN4, '>', "FM1-offsp.unique.allele.bin4.FF") or die "Can't open BIN4: $!\n";


#define 4 bins (T stands for "absence of the allele is TRUE"; F  - "absence of the allele is FALSE")
#(T -> TRUE == 0; F -> False != 0)
my $TT=0;
my $FT=0;
my $TF=0;
my $FF=0;

my $total=0;

while (<FH1>) {
        chomp $_;
        my @data = split (/\t/, $_);
        my @sg2Alleles = ();
        my @sg3Alleles = ();
        my $FM_in_SG2 = 0;
        my $FM_in_SG3 = 0;
        
        #print "@data\n";
        
        #loop through the subgroup2 and look for the line with corresponding chr and position of the allele
        open (FH3, '<', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n";

        my $LINE2 = '';
        do {$LINE2 = <FH3>} until $LINE2 =~ /^$data[0]\t$data[1]/ || eof; 
        chomp $LINE2;
        my @subgroup2 = split(/\t/, $LINE2);
        print "SG2:@subgroup2[0..1]\n";
        my $sg2Lgth = scalar @subgroup2;
        @sg2Alleles = @subgroup2[2..$sg2Lgth - 1];


        #loop through the subgroup3 and find the line with corresponding chr and position of the allele
        open (FH4, '<', $ARGV[2]) or die "Can't open $ARGV[2]: $!\n";
        my $LINE3 = '';
        do {$LINE3 = <FH4>} until $LINE3 =~ /^$data[0]\t$data[1]/ || eof; 
        chomp $LINE3;
        my @subgroup3 = split(/\t/, $LINE3);
        print "SG3:@subgroup3[0..1]\n";
        my $sg3Lgth = scalar @subgroup3;
        @sg3Alleles = @subgroup3[2..$sg3Lgth - 1];

        print "$data[2]\n@sg2Alleles\n@sg3Alleles\n";
        my $g2 = 0;
        my $g3 = 0;

        if ( grep /^$data[2]$/, @sg2Alleles){$g2 = 1;}
        if ( grep /^$data[2]$/, @sg3Alleles){$g3 = 1;}

        if ($g2 == 0 && $g3 == 0) {print BIN1 "@data\n"; $TT++; $total++;}
        if ($g2 == 1 && $g3 == 0) {print BIN2 "@data\n"; $FT++; $total++;}
        if ($g2 == 0 && $g3 == 1) {print BIN3 "@data\n"; $TF++; $total++;}
        if ($g2 == 1 && $g3 == 1) {print BIN4 "@data\n"; $FF++; $total++;}

}

print FH2 "Summarizing file $ARGV[0]\n";
print FH2 "Total number of alleles across bins: $total\n";      
print FH2 "TT\t$TT\nFT\t$FT\nTF\t$TF\nFF\t$FF\n";






#12#
FM2_allele.comparisons.pl
#script individually tailored for the second male founder
#it reads in a file with unique alleles of the second male founder and determines if each particular allele 
#is present in the two temporal subgroups and bins unique alleles of the second male founder into 4 output files:
#TT (absent in both subgroups); FT (present in first, absent in the other, TF (vice versa), FF (present in both)
#usage: perl FM2_allele.comparisons.pl <FM2_uniq_allele.table> <subgroup2.alele.table> <subgroup3.allele.table> <out.summary>

use strict;
use warnings;
local $" = "\t";

open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n";
open (FH2, '>', $ARGV[3]) or die "Can't open $ARGV[3]: $!\n";

#make output files to store coordinates of alleles in particular bin
open (BIN1, '>', "FM2-offsp.unique.allele.bin1.TT") or die "Can't open BIN1: $!\n";
open (BIN2, '>', "FM2-offsp.unique.allele.bin2.FT") or die "Can't open BIN2: $!\n";
open (BIN3, '>', "FM2-offsp.unique.allele.bin3.TF") or die "Can't open BIN3: $!\n";
open (BIN4, '>', "FM2-offsp.unique.allele.bin4.FF") or die "Can't open BIN4: $!\n";


#define 4 bins (T stands for "absence of the allele is TRUE"; F  - "absence of the allele is FALSE")
#(T -> TRUE == 0; F -> False != 0)
my $TT=0;
my $FT=0;
my $TF=0;
my $FF=0;

my $total=0;

while (<FH1>) {
        chomp $_;
        my @data = split (/\t/, $_);
        my @sg2Alleles = ();
        my @sg3Alleles = ();
        my $FM_in_SG2 = 0;
        my $FM_in_SG3 = 0;
        
        print "@data\n";
        
        #loop through the subgroup2 and look for the line with corresponding chr and position of the allele
        open (FH3, '<', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n";

        my $LINE2 = '';
        do {$LINE2 = <FH3>} until $LINE2 =~ /^$data[0]\t$data[1]/ || eof; 
        chomp $LINE2;
        my @subgroup2 = split(/\t/, $LINE2);
        print "SG2:@subgroup2[0..1]\n";
        my $sg2Lgth = scalar @subgroup2;
        @sg2Alleles = @subgroup2[2..$sg2Lgth - 1];


        #loop through the subgroup3 and look for the line with corresponding chr and position of the allele
        open (FH4, '<', $ARGV[2]) or die "Can't open $ARGV[2]: $!\n";
        my $LINE3 = '';
        do {$LINE3 = <FH4>} until $LINE3 =~ /^$data[0]\t$data[1]/ || eof; 
        chomp $LINE3;
        my @subgroup3 = split(/\t/, $LINE3);
        print "SG3:@subgroup3[0..1]\n";
        my $sg3Lgth = scalar @subgroup3;
        @sg3Alleles = @subgroup3[2..$sg3Lgth - 1];

        print "$data[2]\n@sg2Alleles\n@sg3Alleles\n";
        my $g2 = 0;
        my $g3 = 0;

        if ( grep /^$data[2]$/, @sg2Alleles){$g2 = 1;}
        if ( grep /^$data[2]$/, @sg3Alleles){$g3 = 1;}

        if ($g2 == 0 && $g3 == 0) {print BIN1 "@data\n"; $TT++; $total++;}
        if ($g2 == 1 && $g3 == 0) {print BIN2 "@data\n"; $FT++; $total++;}
        if ($g2 == 0 && $g3 == 1) {print BIN3 "@data\n"; $TF++; $total++;}
        if ($g2 == 1 && $g3 == 1) {print BIN4 "@data\n"; $FF++; $total++;}

}

print FH2 "Summarizing file $ARGV[0]\n";
print FH2 "Total number of alleles across bins: $total\n";      
print FH2 "TT\t$TT\nFT\t$FT\nTF\t$TF\nFF\t$FF\n";



#13#
1983-1993_FM1_FM2_common_allele.comparisons.pl
#script individually tailored for the common alleles of the first and second male founders
#it reads in a file with common alleles of both male founders and determines if each particular allele 
#is present in the two temporal subgroups and bins common male alleles into 4 output files:
#TT (absent in both subgroups); FT (present in first, absent in the other, TF (vice versa), FF (present in both)
#usage: perl FM2_allele.comparisons.pl <FM1_FM2_common_allele.table> <subgroup2.alele.table> <subgroup3.allele.table> <out.summary>

use strict;
use warnings;
local $" = "\t";

open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n";
open (FH2, '>', $ARGV[3]) or die "Can't open $ARGV[3]: $!\n";

#make output files to store coordinates of alleles in particular bin
open (BIN1, '>', "1983-1993_FM1_FM2_without_uniq_FM1_FM2_offsp.bin1.TT") or die "Can't open BIN1: $!\n";
open (BIN2, '>', "1983-1993_FM1_FM2_without_uniq_FM1_FM2_offsp.bin2.FT") or die "Can't open BIN2: $!\n";
open (BIN3, '>', "1983-1993_FM1_FM2_without_uniq_FM1_FM2_offsp.bin3.TF") or die "Can't open BIN3: $!\n";
open (BIN4, '>', "1983-1993_FM1_FM2_without_uniq_FM1_FM2_offsp.bin4.FF") or die "Can't open BIN4: $!\n";


#define 4 bins (T stands for "absence of the allele is TRUE"; F  - "absence of the allele is FALSE")
#(T -> TRUE == 0; F -> False != 0)
my $TT=0;
my $FT=0;
my $TF=0;
my $FF=0;

my $total=0;

while (<FH1>) {
        chomp $_;
        my @data = split (/\t/, $_);
        my @sg2Alleles = ();
        my @sg3Alleles = ();
        my $FM_in_SG2 = 0;
        my $FM_in_SG3 = 0;
        
        print "@data\n";
        
        #loop through the subgroup2 and look for the line with corresponding chr and position of the allele
        open (FH3, '<', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n";

        my $LINE2 = '';
        do {$LINE2 = <FH3>} until $LINE2 =~ /^$data[0]\t$data[1]/ || eof; 
        chomp $LINE2;
        my @subgroup2 = split(/\t/, $LINE2);
        print "SG2:@subgroup2[0..1]\n";
        my $sg2Lgth = scalar @subgroup2;
        @sg2Alleles = @subgroup2[2..$sg2Lgth - 1];


        #loop through the subgroup3 and look for the line with corresponding chr and position of the allele
        open (FH4, '<', $ARGV[2]) or die "Can't open $ARGV[2]: $!\n";
        my $LINE3 = '';
        do {$LINE3 = <FH4>} until $LINE3 =~ /^$data[0]\t$data[1]/ || eof; 
        chomp $LINE3;
        my @subgroup3 = split(/\t/, $LINE3);
        print "SG3:@subgroup3[0..1]\n";
        my $sg3Lgth = scalar @subgroup3;
        @sg3Alleles = @subgroup3[2..$sg3Lgth - 1];

        print "$data[2]\n@sg2Alleles\n@sg3Alleles\n";
        my $g2 = 0;
        my $g3 = 0;

        if ( grep /^$data[2]$/, @sg2Alleles){$g2 = 1;}
        if ( grep /^$data[2]$/, @sg3Alleles){$g3 = 1;}

        if ($g2 == 0 && $g3 == 0) {print BIN1 "@data\n"; $TT++; $total++;}
        if ($g2 == 1 && $g3 == 0) {print BIN2 "@data\n"; $FT++; $total++;}
        if ($g2 == 0 && $g3 == 1) {print BIN3 "@data\n"; $TF++; $total++;}
        if ($g2 == 1 && $g3 == 1) {print BIN4 "@data\n"; $FF++; $total++;}

}

print FH2 "Summarizing file $ARGV[0]\n";
print FH2 "Total number of alleles across bins: $total\n";      
print FH2 "TT\t$TT\nFT\t$FT\nTF\t$TF\nFF\t$FF\n";





#14#
sort_lost_variation_according_to_bed.v2.pl
#!usr/bin/perl
#Individually tailored script to count the number of concordant SNPs between the lost 1 Mb haplotype clusters (bed file) and lost individual founder alleles in 1994-2005.
#Update 21.08.2019; removed the option of printing disconcordant SNPs to outfile, instead counting them afterwards
#Update 02.07.2020; analyzing only chromosomes where there are lost segments
#This script reads in a list of SNP positions and compares it to a bed file in order to detrmine concordant SNPs - those that fall into regions defined in bed file.
#Positions of concordant SNPs are printed to separate file for each founder (ends with .concordantSNPs) and number of SNPs per each 1 Mb haplotype cluster are printed in output file
#perl sort_lost_variation_according_to_bed.pl <input BED file intervals>  <out>

use strict;
use warnings;

open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; 
open (FH4, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; 

#Define all SNP lists that need to be compared
my $infile_FF = "D-85-01.alleles.not.in.1994-2005";
my $outfile_FF = "D-85-01.alleles.not.in.1994-2005.concordantSNPs";
my $infile_FM1 = "FM1-offsp.alleles.not.in.1994-2005";
my $outfile_FM1 = "FM1-offsp.alleles.not.in.1994-2005.concordantSNPs";
my $infile_FM2 = "FM2-offsp.alleles.not.in.1994-2005";
my $outfile_FM2 = "FM2-offsp.alleles.not.in.1994-2005.concordantSNPs";


my $FF_concordant_SNPs = '';
my $FM1_concordant_SNPs = '';
my $FM2_concordant_SNPs = '';

while (<FH1>) {
        next if ($_ =~ /chr/);
        chomp $_;
        my @data = split(/\t/,$_);
        my $chr = $data[0];
        my $begc = $data[1];
        my $endc = $data[2];
        my $founder = $data[3];
        my $nr_of_SNPs = '';

        next if ( $data[1] == 0 && $data[2] == 0 );
        if ($founder =~ /^FF/) {
                open (FH2, '<', $infile_FF ) or die "Can't open $infile_FF: $!\n";
                open (FH3, '>>', $outfile_FF ) or die "Can't open $outfile_FF: $!\n";
                print STDOUT "$chr\t$begc\t$endc\t$founder\n";          
                print STDOUT "Start processing new FF region on chr $chr...\n";
                while (<FH2>) {
                        chomp $_;
                        my @snp_data = split(/\t/,$_);
                        my $snp_chr = $snp_data[0];
                        my $snp_pos = $snp_data[1];
                        #print STDOUT "$snp_chr\t$chr:$snp_pos\t$begc\t$endc\n";
                        if ($snp_chr eq $chr && $snp_pos >= $begc && $snp_pos <= $endc) {                       
                                print FH3 "$snp_chr\t$snp_pos\t$chr:$begc:$endc:$founder\n";
                                ++$FF_concordant_SNPs;
                                ++$nr_of_SNPs;
                        }
                }
        } elsif ($founder =~ /^FM1/) {
                open (FH2, '<', $infile_FM1 ) or die "Can't open $infile_FM1: $!\n";
                open (FH3, '>>', $outfile_FM1 ) or die "Can't open $outfile_FM1: $!\n";
                print STDOUT "$chr\t$begc\t$endc\t$founder\n";          
                print STDOUT "Start processing new FM1 region on chr $chr...\n";
                while (<FH2>) {
                        chomp $_;
                        my @snp_data = split(/\t/,$_);
                        my $snp_chr = $snp_data[0];
                        my $snp_pos = $snp_data[1];
                        if ($snp_chr eq $chr && $snp_pos >= $begc && $snp_pos  <= $endc) {                              
                                print FH3 "$snp_chr\t$snp_pos\t$chr:$begc:$endc:$founder\n";
                                ++$FM1_concordant_SNPs;
                                ++$nr_of_SNPs;
                        }
                }
        } elsif ($founder =~ /^FM2/) {
                open (FH2, '<', $infile_FM2 ) or die "Can't open $infile_FM2: $!\n";
                open (FH3, '>>', $outfile_FM2 ) or die "Can't open $outfile_FM2: $!\n";
                print STDOUT "$chr\t$begc\t$endc\t$founder\n";
                print STDOUT "Start processing new FM2 region on chr $chr...\n";
                while (<FH2>) {
                        chomp $_;
                        my @snp_data = split(/\t/,$_);
                        my $snp_chr = $snp_data[0];
                        my $snp_pos = $snp_data[1];
                        if ($snp_chr eq $chr && $snp_pos >= $begc && $snp_pos  <= $endc) {
                                print FH3 "$snp_chr\t$snp_pos\t$chr:$begc:$endc:$founder\n";
                                ++$FM2_concordant_SNPs;
                                ++$nr_of_SNPs;
                        } 
                }
        }
print FH4 "@data\t$nr_of_SNPs\n"; 

}
  
print STDOUT "FF_concordant_SNPs\t$FF_concordant_SNPs\nFM1_concordant_SNPs\t$FM1_concordant_SNPs\nFM2_concordant_SNPs:$FM2_concordant_SNPs\n";




#15#
sort_lost_variation_according_to_bed.v3.pl
#!usr/bin/perl
#Individually tailored script to count the number of concordant SNPs between the lost 1 Mb haplotype clusters (bed file) and lost individual founder alleles in 2006-2014.
#Update 21.08.2019; removed the option of printing disconcordant SNPs to outfile, instead counting them afterwards
#Update 02.07.2020; analyzing only chromosomes where there are lost segments
#This script reads in a list of SNP positions and compares it to a bed file in order to detrmine concordant SNPs - those that fall into regions defined in bed file.
#Positions of concordant SNPs are printed to separate file for each founder (ends with .concordantSNPs) and number of SNPs per each 1 Mb haplotype cluster are printed in output file
#perl sort_lost_variation_according_to_bed.v3.pl <input BED file intervals>  <out>


use strict;
use warnings;

open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; 
open (FH4, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; 

#Define all SNP lists that need to be compared
my $infile_FF = "D-85-01.alleles.not.in.2006-2014";
my $outfile_FF = "D-85-01.alleles.not.in.2006-2014.SNPresolution.concordantSNPs";
my $infile_FM1 = "FM1-offsp.alleles.not.in.2006-2014";
my $outfile_FM1 = "FM1-offsp.alleles.not.in.2006-2014.SNPresolution.concordantSNPs";
my $infile_FM2 = "FM2-offsp.alleles.not.in.2006-2014";
my $outfile_FM2 = "FM2-offsp.alleles.not.in.2006-2014.SNPresolution.concordantSNPs";


my $FF_concordant_SNPs = '';
my $FM1_concordant_SNPs = '';
my $FM2_concordant_SNPs = '';

while (<FH1>) {
        next if ($_ =~ /chr/);
        chomp $_;
        my @data = split(/\t/,$_);
        my $chr = $data[0];
        my $begc = $data[1];
        my $endc = $data[2];
        my $founder = $data[3];
        my $nr_of_SNPs = '';
        
        next if ( $data[1] == 0 && $data[2] == 0 );

        if ($founder =~ /^FF/) {
                open (FH2, '<', $infile_FF ) or die "Can't open $infile_FF: $!\n";
                open (FH3, '>>', $outfile_FF ) or die "Can't open $outfile_FF: $!\n";
                print STDOUT "$chr\t$begc\t$endc\t$founder\n";          
                print STDOUT "Start processing new FF region on chr $chr...\n";
                while (<FH2>) {
                        chomp $_;
                        my @snp_data = split(/\t/,$_);
                        my $snp_chr = $snp_data[0];
                        my $snp_pos = $snp_data[1];
                        #print STDOUT "$snp_chr\t$chr:$snp_pos\t$begc\t$endc\n";
                        if ($snp_chr eq $chr && $snp_pos >= $begc && $snp_pos <= $endc) {                       
                                print FH3 "$snp_chr\t$snp_pos\t$chr:$begc:$endc:$founder\n";
                                ++$FF_concordant_SNPs;
                                ++$nr_of_SNPs;
                        }
                }
        } elsif ($founder =~ /^FM1/) {
                open (FH2, '<', $infile_FM1 ) or die "Can't open $infile_FM1: $!\n";
                open (FH3, '>>', $outfile_FM1 ) or die "Can't open $outfile_FM1: $!\n";
                print STDOUT "$chr\t$begc\t$endc\t$founder\n";          
                print STDOUT "Start processing new FM1 region on chr $chr...\n";
                while (<FH2>) {
                        chomp $_;
                        my @snp_data = split(/\t/,$_);
                        my $snp_chr = $snp_data[0];
                        my $snp_pos = $snp_data[1];
                        if ($snp_chr eq $chr && $snp_pos >= $begc && $snp_pos  <= $endc) {                              
                                print FH3 "$snp_chr\t$snp_pos\t$chr:$begc:$endc:$founder\n";
                                ++$FM1_concordant_SNPs;
                                ++$nr_of_SNPs;
                        }
                }

        } elsif ($founder =~ /^FM2/) {
                open (FH2, '<', $infile_FM2 ) or die "Can't open $infile_FM2: $!\n";
                open (FH3, '>>', $outfile_FM2 ) or die "Can't open $outfile_FM2: $!\n";
                print STDOUT "$chr\t$begc\t$endc\t$founder\n";
                print STDOUT "Start processing new FM2 region on chr $chr...\n";
                while (<FH2>) {
                        chomp $_;
                        my @snp_data = split(/\t/,$_);
                        my $snp_chr = $snp_data[0];
                        my $snp_pos = $snp_data[1];
                        if ($snp_chr eq $chr && $snp_pos >= $begc && $snp_pos  <= $endc) {
                                print FH3 "$snp_chr\t$snp_pos\t$chr:$begc:$endc:$founder\n";
                                ++$FM2_concordant_SNPs;
                                ++$nr_of_SNPs;
                        } 
                }
        }
print FH4 "@data\t$nr_of_SNPs\n"; 

}
 
print STDOUT "FF_concordant_SNPs\t$FF_concordant_SNPs\nFM1_concordant_SNPs\t$FM1_concordant_SNPs\nFM2_concordant_SNPs\t$FM2_concordant_SNPs\n";


#16#
sort_lost_variation_according_to_bed.v4.pl
#!usr/bin/perl
#Individually tailored script to count the number of concordant SNPs between the lost 1 Mb haplotype clusters (bed file) and lost individual female founder's alleles in 1983-1993.
#Update 21.08.2019; removed the option of printing disconcordant SNPs to outfile, instead counting them afterwards
#Update 02.07.2020; analyzing only chromosomes where there are lost segments
#This script reads in a list of SNP positions and compares it to a bed file in order to detrmine concordant SNPs - those that fall into regions defined in bed file.
#Positions of concordant SNPs are printed to separate file for each founder (ends with .concordantSNPs) and number of SNPs per each 1 Mb haplotype cluster are printed in output file
#perl sort_lost_variation_according_to_bed.v4.pl <input BED file intervals>  <out>


use strict;
use warnings;

open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; 
open (FH4, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; 

#Define all SNP lists that need to be compared
my $infile_FF = "D-85-01.alleles.not.in.1983-1993";
my $outfile_FF = "D-85-01.alleles.not.in.1983-1993.SNPresolution.concordantSNPs";


my $FF_concordant_SNPs = '';
my $FM1_concordant_SNPs = '';
my $FM2_concordant_SNPs = '';

while (<FH1>) {
        
        next if ($_ =~ /chr/);
        chomp $_;
        my @data = split(/\t/,$_);
        my $chr = $data[0];
        my $begc = $data[1];
        my $endc = $data[2];
        my $founder = $data[3];
        my $nr_of_SNPs = '';
        
        next if ( $data[1] == 0 && $data[2] == 0 );     

        if ($founder =~ /^FF/) {
                open (FH2, '<', $infile_FF ) or die "Can't open $infile_FF: $!\n";
                open (FH3, '>>', $outfile_FF ) or die "Can't open $outfile_FF: $!\n";
                print STDOUT "$chr\t$begc\t$endc\t$founder\n";          
                print STDOUT "Start processing new FF region on chr $chr...\n";
                while (<FH2>) {
                        chomp $_;
                        my @snp_data = split(/\t/,$_);
                        my $snp_chr = $snp_data[0];
                        my $snp_pos = $snp_data[1];
                        #print STDOUT "$snp_chr\t$chr:$snp_pos\t$begc\t$endc\n";
                        if ($snp_chr eq $chr && $snp_pos >= $begc && $snp_pos <= $endc) {                       
                                print FH3 "$snp_chr\t$snp_pos\t$chr:$begc:$endc:$founder\n";
                                ++$FF_concordant_SNPs;
                                ++$nr_of_SNPs;
                        }
                }
}
print FH4 "@data\t$nr_of_SNPs\n"; 

}
 
print STDOUT "FF_concordant_SNPs\t$FF_concordant_SNPs\n";