# Bioinformatics pipeline and perl scripts (at the end of the file) used for data analysis in # Whole-genome re-sequencing of temporally stratified samples reveal substantial loss of haplotype diversity in the highly inbred Scandinavian wolf population # Viluma et al. ########################################## FILTERING VARIANT CALLS ################################################## #1. Prepare a bed file with segmental duplications of the reference genome CanFam3.1 #a) extract reference sequence of each chromosome module load bioinfo-tools samtools/1.8 for chr in {1..38} X do samtools faidx Canis_familiaris.CanFam3.1.dna.toplevel.fa $chr > Canis_familiaris.CanFam3.1.dna.toplevel_chr${chr}.fa done #b) do self alignment of each chromosome with LastZ for chr in {1..38} X do inp="Canis_familiaris.CanFam3.1.dna.toplevel_chr${chr}.fa" module load bioinfo-tools lastz/1.04.00 lastz $inp --self --nomirror --notransition --step=20 --nogapped --hspthresh=6000 --matchcount=500 --format=maf > ${inp%.fa}.lastz.selfaln.maf done #for X chromosome increase --hspthresh=60000 in the run_lastz_selfaln2.sh for chr in X do inp="Canis_familiaris.CanFam3.1.dna.toplevel_chr${chr}.fa" module load bioinfo-tools lastz/1.04.00 lastz $inp --self --nomirror --notransition --step=20 --nogapped --hspthresh=60000 --matchcount=500 --format=maf > ${inp%.fa}.lastz.selfaln.maf done #c) convert maf file (output from LastZ alignment) to bed file for chr in X {1..38} do perl maf2bed.pl Canis_familiaris.CanFam3.1.dna.toplevel_chr${chr}.lastz.selfaln.maf Canis_familiaris.CanFam3.1.dna.toplevel_chr${chr}.lastz.selfaln.bed done #2. Join genomic coordinates of the segmental duplications with manually downloaded RepeatMasker and StarDust+WM bed files from UCSC genome browser. #a) Combine coordinate files in one, sort it for chr in {1..38} X do #combine all cat CanFam3_chr${chr}_WM_SDust_UCSC.bed CanFam3_chr${chr}_RepeatMasker_UCSC.bed Canis_familiaris.CanFam3.1.dna.toplevel_chr${chr}.lastz.selfaln.bed > CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.bed #keep only first three columns of the merged file cut -f1,2,3 CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.bed > tmp mv tmp CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.bed #Sort the final bed file inp="CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.bed" out="CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.sorted.bed" module load bioinfo-tools BEDTools/2.27.1 bedtools sort -i $inp > $out done #b) merge overlapping entries, with -d 100 option (if the window between two masked regions is <100, mask it as well) module load bioinfo-tools BEDTools/2.27.1 for chr in {1..38} X do bedtools merge -d 100 -i CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.sorted.bed >tmp mv tmp CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.bed rm -f CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.sorted.bed done #combine all chromosomes in one file (remove all separate chromosome files): for chr in {1..38} X do cat CanFam3_chr${chr}_RepeatMasker_WM_SDust_UCSC_selfaln.bed >> CanFam3_RepeatMasker_WM_SDust_UCSC_selfaln.bed done #!!!! IMPORTANT !!!! add a header line to the final bed file, othervise vcftools is not masking the first line of the bed file!!! sed -i '1i CHR START END' CanFam3_RepeatMasker_WM_SDust_UCSC_selfaln.bed #3. Run filtering pipeline # (remove SNP calls within the repetitive regions of the CanFam3.1 assembly) # (set a treshold for SNP and genotype quality, read depth) # (include only sites with Minor Allele Count greater than or equal to 1: keeping only bi-allelic SNPs) #this is done on the entire data set of Scandinavian and Finnish wolves #total No. of variants in 76Scand.98Finnish.recode.vcf.gz (chr1-38, X) zcat 76Scand.98Finnish.recode.vcf.gz | grep "^[1234567890X]" | wc -l 38,188,956 inp="76Scand.98Finnish.vcf.gz" tmp="76Scand.98Finnish" rep="CanFam3_RepeatMasker_WM_SDust_UCSC_selfaln.bed" module load bioinfo-tools vcftools/0.1.15 #Filter away all repetitive regions of the CanFam3 assembly vcftools --gzvcf $inp --recode --out ${tmp} --exclude-bed $rep #output file $tmp.no.repeats.recode.vcf #then apply other filtering criteria vcftools --vcf $tmp.recode.vcf --remove-indels --mac 1 --min-alleles 2 --max-alleles 2 --minGQ 30 --minQ 300 --maxDP 80 --recode --out ${tmp}.filtered.v6 rm $tmp.recode.vcf # Number of remaining variants after filtering: zcat 76Scand.98Finnish.filtered.v6.vcf.gz |grep "^[1234567890X]" | wc -l 5,884,691 #4. Remove regions of high SNP density #a) build 1kb windows module load bioinfo-tools BEDTools/2.27.1 bedtools makewindows -g CanFam3_chromosome_lenghth.txt -w 1000 > CanFam3_chromosome_lenghth_1Kb_windows.bed #b) count nr of SNPs in each window module load bioinfo-tools BEDTools/2.27.1 bed="CanFam3_chromosome_lenghth_1Kb_windows.bed" vcf="76Scand.98Finnish.filtered.v6.vcf" bedtools coverage -a $bed -b $vcf -counts > ${vcf%.vcf}.SNPdensity_1KbW.txt #Number of SNPs coinciding with each window is given in the last column (column 3) #c) set a treshold of SNP density #take last column and plot a histogram of SNP density across all windows module load perl_modules/5.24.1 inp="76Scand.98Finnish.filtered.v6.SNPdensity_1KbW.txt" perl ./plot.histogram.pl $inp 3 #output: ___________________________________________________________________________________ Count: 2327652 Range: 0.000 - 230.000; Mean: 2.528; Median: 1.000; Stddev: 3.609 Percentiles: 90th: 7.000; 95th: 9.000; 99th: 15.000 0.000 - 2.216: 665280 ##################################################### 2.216 - 4.171: 397327 ################################ 4.171 - 7.314: 272654 ###################### 7.314 - 12.368: 128199 ########## 12.368 - 20.494: 33953 ### 20.494 - 33.560: 7148 # 33.560 - 54.570: 1230 | 54.570 - 88.350: 290 | 88.350 - 142.666: 123 | 142.666 - 230.000: 23 | _____________________________________________________________________________________ # Choosing 90th Percentile as a boundary and extracting coordinates for windows with more than 7 SNPs per 1Kb inp="76Scand.98Finnish.filtered.v6.SNPdensity_1KbW.txt" out="76Scand.98Finnish.filtered.v6.High_SNPdensity_windows.bed" awk -F'\t' '$4>7' $inp >$out sed -i '1i CHR START END SCORE' $out #For statistical phasing exclusion of high SNP density windows was incorporated into phasing pipeline #However, later this filtering was also necessary for individual SNP allele comparison and the entire vcf file was filtered: inp="76Scand.98Finnish.filtered.v6.vcf" bed="76Scand.98Finnish.filtered.v6.High_SNPdensity_windows.bed" filter="noHighSNPdensity" module load bioinfo-tools vcftools/0.1.15 vcftools --vcf $inp --recode --out ${inp%.vcf}.$filter --exclude-bed $bed #Number of SNPs left after high density filtering cat 76Scand.98Finnish.filtered.v6.noHighSNPdensity.recode.vcf | grep "^[1234567890X]" | wc -l 3,900,583 ______________________________________________ Summary of used tools: custom perl scripts (see at the end of file) samtools/1.8 lastz/1.04.00 BEDTools/2.27.1 vcftools/0.1.15 ########################################## TWO-STEP PHASING ################################################## #1. Step one: phasing of individual 1 MB windows across the genome #a) defining non-overlapping 1 Mb windows across the CanFam3.1 genome module load bioinfo-tools BEDTools/2.27.1 bedtools makewindows -g CanFam3_chromosome_lenghth.txt -w 1000000 > CanFam3_chromosome_lenghth_1Mb_windows.bed #b) Phasing individual 1 MB windows chromosome by chromosome for chr in {1..38} X do mkdir -p chr${chr}/PHASE_1_newfilt/ inp="76Scand.98Finnish.filtered.v6.vcf" out="chr${chr}/PHASE_1_newfilt/" bed="76Scand.98Finnish.filtered.v6.High_SNPdensity_windows.bed" windows="CanFam3_chromosome_lenghth_1Mb_windows.bed" module load bioinfo-tools vcftools/0.1.15 rm -f ${out}Markerpos.txt grep -P "^$chr""\\t" $windows | while read line do start_c=$(echo $line | cut -f2 -d " ") end_c=$(echo $line | cut -f3 -d " ") echo $chr $start_c $end_c #Make an individual vcf file for each window from the windows.bed file #and simultaneously exclude SNPs from 1 Kb windows of too high SNP density vcftools --vcf $inp --recode --out ${out}tmp1 --exclude-bed $bed --chr $chr --from-bp $start_c --to-bp $end_c #remove SNPs with more than 25% missing genotypes and thin markers to be 20 000 bp apart vcftools --vcf ${out}tmp1.recode.vcf --recode --out ${out}tmp --max-missing 0.75 --thin 20000 #count and display marker positions that have been used for phasing of each window grep -P "^$chr""\\t" ${out}tmp.recode.vcf | cut -f1,2 >> ${out}Markerpos.txt cd $out #transform final vcf file as input file for PHASE perl Transform_VCF2PHASE.pl ${out}tmp.recode.vcf tmp.PHASE.inp $chr #phase PHASE tmp.PHASE.inp Scand_wolf.markers.for.phasing1_1Mb_${start_c}.PHASE.out rm -f ${out}tmp1.recode.vcf rm -f ${out}tmp.recode.vcf rm -f tmp.PHASE.inp done done #c) count number of markers used for PHASE step 1: for chr in {1..38} X do wc -l /proj/sllstore2017034/nobackup/work/agnese/TwoStepPHASE_Scand_Finn_wolf/chr${chr}/PHASE_1_newfilt/Markerpos.txt done #2. Step two: chromosome-level phasing of 1 MB window alleles #As our data set has a large number of individuals (including individuals from Finnish population), second PHASEing step initially resulted in an error of too high KMAX (50 was the default maximum) #To solve this, I downloaded the source code from github (https://github.com/stephens999/phase/archive/master.zip), increased KMAX in constants.h to 100 and recompiled. #a) Collect genotypes from the first phasing step and prepare input files for the second phasing step module load perl_modules/5.24.1 for chr in {1..38} X do out="chr${chr}/Summary.PHASE_1.chr${chr}.haplotype.pairs.txt" out2="chr${chr}/1Mb_Haplotype_alleles.for.PHASE_2" windows=$(ll chr${chr}/PHASE_1_newfilt/Scand_wolf.markers.for.phasing1_1Mb_*0.PHASE.out | wc -l) perl Extract_haplotype_pairs_PHASE.v2.pl "chr${chr}/PHASE_1_newfilt/Scand_wolf.markers.for.phasing1_1Mb_*0.PHASE.out" $out awk '{a[(NR-1)%n]=a[(NR-1)%n]==""?$1:a[(NR-1)%n] OFS $1}END{for (i=0;i tmp sed -i 's/000000.PHASE.out//g' tmp sed -i 's/.PHASE.out//g' tmp mv tmp $out sed "s/SampleID\t/201\n${windows}\nP/g; s/_1\t/\n/g; s/.*_2\t//g" $out > ${out2}.inp sed -i "4i $(printf '%.0sM' $(eval echo "{1..$windows}")})" ${out2}.inp done #b) Proceed with 2nd phasing step, parse results to be excel friendly for chr in {1..38} X do inp="chr${chr}/1Mb_Haplotype_alleles.for.PHASE_2.inp" module load perl_modules/5.24.1 PHASE $inp ${inp%.inp}.out #make it excel friendly perl Collect.PHASE.haps.pl ${inp%.inp}.out ${inp%.inp}.out.forExcel.txt done #c) collect final results from all chr in one folder mkdir PHASE_2_summary for chr in {1..38} X do cp chr${chr}/1Mb_Haplotype_alleles.for.PHASE_2.out.forExcel.txt PHASE_2_summary/FinnoScand_all_chr${chr}_PHASE_1MB done #d) Sort phased individuals in an ordered manner (based on the subgroup assignment and birth year) #list of subgroups and haplotype assignment FinnoScand_all_timeordered_info.txt #list of wolf IDs corresponding to the list of subgroups and haplotype assignment FinnoScand_all_timeordered_IDs.txt ind="FinnoScand_all_timeordered_IDs.txt" for chr in {1..38} X do inp="PHASE_2_summary/FinnoScand_all_chr${chr}_PHASE_1MB" cat $ind | while read line do grep -P "^$line" $inp | head -2 | sed '1,117!d; s/[()]//g' - >> tmp done paste FinnoScand_all_timeordered_info.txt tmp > PHASE_2_summary/FinnoScand_all_chr${chr}_PHASE_1MB_timeorder rm -f tmp perl -i -p -E 's/ /\t/g' PHASE_2_summary/FinnoScand_all_chr${chr}_PHASE_1MB_timeorder done #Downloaded all _timeorder files, extracted 76 Scandinavian individuals and imported data into specially prepared excel sheet template (Supplementary File 1). #Color asignment was achieved by use of conditional formatting option in MS Excel creating hierarchical colouring sheme. #Proceeded with manually inferring male founder haplotypes and performed manual curation of the data set. #Excel was also used to calculate frequency of founder haplotypes in each subsample and to determine clusters of 1 Mb founder haplotypes absent in each subgroup. ______________________________________________ Summary of used tools: custom perl scripts (see at the end of file) BEDTools/2.27.1 vcftools/0.1.15 PHASE 2.1.1. ############################################# COMPARISON OF FOUNDER HAPLOTYPES ################################################## # A pairwise comparison of the 6 founder haplotypes (FF1/FF2/FM11/FM12/FM21/FM22). # Input file Founder_haplotypes_All_chr_removed_11_discarded_widows.txt was obtained by compiling assigned founder haplotypes from all chromosomes and transforming original haplotype IDs to a scale of 1 to 6. # Each row of the file contains per chromosome sequence of phased 1 Mb haplotype IDs (tab separated) of a particular founder haplotype: # chromosome Total_windows_per_chromosome Founder_haplotype haplotype_ID_window_0 haplotype_ID_window_1 ... file="Founder_haplotypes_All_chr_removed_11_discarded_widows.txt" out="Founder_haplotypes_All_chr_removed_11_discarded_widows.pairwise.comp.txt" #if such outputfile already exists, don't forget to remove or rename it rm -f $out for chr in {1..38} PAR X do FF1=( `echo $(grep -P "^${chr}\t[0-9]*\tFF1" $file | cut -f4- | sed -e $'s/\t/ /g; s/ 0/ /g' )` ) FF2=( `echo $(grep -P "^${chr}\t[0-9]*\tFF2" $file | cut -f4- | sed -e $'s/\t/ /g; s/ 0/ /g' )` ) FM11=( `echo $(grep -P "^${chr}\t[0-9]*\tFM11" $file | cut -f4- | sed -e $'s/\t/ /g; s/ 0/ /g' )` ) FM12=( `echo $(grep -P "^${chr}\t[0-9]*\tFM12" $file | cut -f4- | sed -e $'s/\t/ /g; s/ 0/ /g' )` ) FM21=( `echo $(grep -P "^${chr}\t[0-9]*\tFM21" $file | cut -f4- | sed -e $'s/\t/ /g; s/ 0/ /g' )` ) FM22=( `echo $(grep -P "^${chr}\t[0-9]*\tFM22" $file | cut -f4- | sed -e $'s/\t/ /g; s/ 0/ /g' )` ) FF1_FF2=0 FF1_FM11=0 FF1_FM12=0 FF1_FM21=0 FF1_FM22=0 FF2_FM11=0 FF2_FM12=0 FF2_FM21=0 FF2_FM22=0 FM11_FM12=0 FM11_FM21=0 FM11_FM22=0 FM12_FM21=0 FM12_FM22=0 FM21_FM22=0 l=${#FF1[@]} for ((i=0;i<${#FF1[@]};++i)) do [[ "${FF1[i]}" == "${FF2[i]}" ]] && FF1_FF2=$(expr $FF1_FF2 + 1) [[ "${FF1[i]}" == "${FM11[i]}" ]] && FF1_FM11=$(expr $FF1_FM11 + 1) [[ "${FF1[i]}" == "${FM12[i]}" ]] && FF1_FM12=$(expr $FF1_FM12 + 1) [[ "${FF1[i]}" == "${FM21[i]}" ]] && FF1_FM21=$(expr $FF1_FM21 + 1) [[ "${FF1[i]}" == "${FM22[i]}" ]] && FF1_FM22=$(expr $FF1_FM22 + 1) [[ "${FF2[i]}" == "${FM11[i]}" ]] && FF2_FM11=$(expr $FF2_FM11 + 1) [[ "${FF2[i]}" == "${FM12[i]}" ]] && FF2_FM12=$(expr $FF2_FM12 + 1) [[ "${FF2[i]}" == "${FM21[i]}" ]] && FF2_FM21=$(expr $FF2_FM21 + 1) [[ "${FF2[i]}" == "${FM22[i]}" ]] && FF2_FM22=$(expr $FF2_FM22 + 1) [[ "${FM11[i]}" == "${FM12[i]}" ]] && FM11_FM12=$(expr $FM11_FM12 + 1) [[ "${FM11[i]}" == "${FM21[i]}" ]] && FM11_FM21=$(expr $FM11_FM21 + 1) [[ "${FM11[i]}" == "${FM22[i]}" ]] && FM11_FM22=$(expr $FM11_FM22 + 1) [[ "${FM12[i]}" == "${FM21[i]}" ]] && FM12_FM21=$(expr $FM12_FM21 + 1) [[ "${FM12[i]}" == "${FM22[i]}" ]] && FM12_FM22=$(expr $FM12_FM22 + 1) [[ "${FM21[i]}" == "${FM22[i]}" ]] && FM21_FM22=$(expr $FM21_FM22 + 1) done total=`echo $(grep -P "^${chr}\t[0-9]*\tFF1" $file | cut -f 2)` FF1_FF2_p=$(echo "scale=3; $FF1_FF2 / $total * 100" | bc -l) FF1_FM11_p=$(echo "scale=3; $FF1_FM11 / $total * 100" | bc -l) FF1_FM12_p=$(echo "scale=3; $FF1_FM12 / $total * 100" | bc -l) FF1_FM21_p=$(echo "scale=3; $FF1_FM21 / $total * 100" | bc -l) FF1_FM22_p=$(echo "scale=3; $FF1_FM22 / $total * 100" | bc -l) FF2_FM11_p=$(echo "scale=3; $FF2_FM11 / $total * 100" | bc -l) FF2_FM12_p=$(echo "scale=3; $FF2_FM12 / $total * 100" | bc -l) FF2_FM21_p=$(echo "scale=3; $FF2_FM21 / $total * 100" | bc -l) FF2_FM22_p=$(echo "scale=3; $FF2_FM22 / $total * 100" | bc -l) FM11_FM12_p=$(echo "scale=3; $FM11_FM12 / $total * 100" | bc -l) FM11_FM21_p=$(echo "scale=3; $FM11_FM21 / $total * 100" | bc -l) FM11_FM22_p=$(echo "scale=3; $FM11_FM22 / $total * 100" | bc -l) FM12_FM21_p=$(echo "scale=3; $FM12_FM21 / $total * 100" | bc -l) FM12_FM22_p=$(echo "scale=3; $FM12_FM22 / $total * 100" | bc -l) FM21_FM22_p=$(echo "scale=3; $FM21_FM22 / $total * 100" | bc -l) echo -e "$chr\tFF1\tFF2\tFM11\tFM12\tFM21\tFM22\nFF1\t$total\t$FF1_FF2\t$FF1_FM11\t$FF1_FM12\t$FF1_FM21\t$FF1_FM22\nFF2\t$FF1_FF2_p\t$total\t$FF2_FM11\t$FF2_FM12\t$FF2_FM21\t$FF2_FM22\nFM11\t$FF1_FM11_p\t$FF2_FM11_p\t$total\t$FM11_FM12\t$FM11_FM21\t$FM11_FM22\nFM12\t$FF1_FM12_p\t$FF2_FM12_p\t$FM11_FM12_p\t$total\t$FM12_FM21\t$FM12_FM22\nFM21\t$FF1_FM21_p\t$FF2_FM21_p\t$FM11_FM21_p\t$FM12_FM21_p\t$total\t$FM21_FM22\nFM22\t$FF1_FM22_p\t$FF2_FM22_p\t$FM11_FM22_p\t$FM12_FM22_p\t$FM21_FM22_p\t$total\n" >> $out done # Count number of homozygous windows per each founder perl Count_homoz_founder_windows.pl $out Scand_wolf_founder_homoz_windows_11w_discarded.txt # Between founder comparisons were done in MS Excel worksheet ########################################## ABSENCE OF PHASED 1 MB HAPLOTYPES ########################################### #Founder haplotype frequencies for each subpopulation were calculated individually for each chromosome in MS Excel. #For further analysis and visualisation purposes coordinates of the absent haplotype segments were transferred to individual BED files - one per each founder haplotype and time period. #(start coordinate is the first lost window of a segment; end coordinate is the window right after the last lost window of the segment) #1. Analysing haplotypes of the female founder (FF1 and FF2) #list of bed files: Lost_segments_FF1_1983-1993.bed Lost_segments_FF2_1983-1993.bed Lost_segments_FF1_1994-2005.bed Lost_segments_FF2_1994-2005.bed Lost_segments_FF1_2006-2014.bed Lost_segments_FF2_2006-2014.bed #a) count number of unique haplotype loss to each temporal subsample module load bioinfo-tools BEDTools/2.29.2 ## uniq absence in 1983-1993 (TFF - meaning that absence of the haplotype is TRUE for 1983-1993 and FALSE for 1994-2005 and 2006-2014) # # Haplotype FF1 input="Lost_segments_FF1_1983-1993.bed" comp1="Lost_segments_FF1_1994-2005.bed" comp2="Lost_segments_FF1_2006-2014.bed" out="Lost_segments_FF1_1983-1993_present_in_1994-2005_and_2006-2014.bed" subtractBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc #Result: 0 # Haplotype FF2 input="Lost_segments_FF2_1983-1993.bed" comp1="Lost_segments_FF2_1994-2005.bed" comp2="Lost_segments_FF2_2006-2014.bed" out="Lost_segments_FF2_1983-1993_present_in_1994-2005_and_2006-2014.bed" subtractBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc #Result: 2 #none of these haplotypes fall within the list of discarded windows # Total TFF=0+2=2 ## uniq absence in 1994-2005 (FTF) # # Haplotype FF1 input="Lost_segments_FF1_1994-2005.bed" comp1="Lost_segments_FF1_1983-1993.bed" comp2="Lost_segments_FF1_2006-2014.bed" out="Lost_segments_FF1_1994-2005_present_in_1983-1993_and_2006-2014.bed" subtractBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc #Result: 2 # Haplotype FF2 input="Lost_segments_FF2_1994-2005.bed" comp1="Lost_segments_FF2_1983-1993.bed" comp2="Lost_segments_FF2_2006-2014.bed" out="Lost_segments_FF2_1994-2005_present_in_1983-1993_and_2006-2014.bed" subtractBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc ##Result: 11 # Total FTF=2+11=13 ## uniq absence in 2006-2014 (FFT) # # Haplotype FF1 input="Lost_segments_FF1_2006-2014.bed" comp1="Lost_segments_FF1_1983-1993.bed" comp2="Lost_segments_FF1_1994-2005.bed" out="Lost_segments_FF1_2006-2014_present_in_1983-1993_and_1994-2005.bed" subtractBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc ##Result: 100 -1 haplotype from discarded windows # Haplotype FF2 input="Lost_segments_FF2_2006-2014.bed" comp1="Lost_segments_FF2_1983-1993.bed" comp2="Lost_segments_FF2_1994-2005.bed" out="Lost_segments_FF2_2006-2014_present_in_1983-1993_and_1994-2005.bed" subtractBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc ##Result: 153 # Total FFT=99+153=252 #b) count common absence of haplotypes in all three subgroups # Haplotype FF1 input="Lost_segments_FF1_1983-1993.bed" comp1="Lost_segments_FF1_1994-2005.bed" comp2="Lost_segments_FF1_2006-2014.bed" out="TwoStepPHASE_Scand_Finn_wolf/Lost_segments_FF1_1983-1993_1994-2005_2006-2014.bed" intersectBed -a $input -b $comp1 | intersectBed -a - -b $comp2 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc ##Result: 29 # Haplotype FF2 input="Lost_segments_FF2_1983-1993.bed" comp1="Lost_segments_FF2_1994-2005.bed" comp2="Lost_segments_FF2_2006-2014.bed" out="Lost_segments_FF2_1983-1993_1994-2005_2006-2014.bed" intersectBed -a $input -b $comp1 | intersectBed -a - -b $comp2 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc ##Result: 127 # Total TTT=29+127 = 156 #c) common absence in two subsamples and presence in one. # absent in 1983-1993 and 1994-2005, present in 2006-2014 (TTF) # Haplotype FF1 input="Lost_segments_FF1_1983-1993.bed" comp1="Lost_segments_FF1_1994-2005.bed" comp2="Lost_segments_FF1_2006-2014.bed" out="Lost_segments_FF1_1983-1993_1994-2005_but_not_in_2006-2014.bed" intersectBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc ##Result: 0 # Haplotype FF2 input="Lost_segments_FF2_1983-1993.bed" comp1="Lost_segments_FF2_1994-2005.bed" comp2="Lost_segments_FF2_2006-2014.bed" out="Lost_segments_FF2_1983-1993_1994-2005_but_not_in_2006-2014.bed" intersectBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc ##Result:3 # Total TTF=3 # absent in 1983-1993 and 2006-2014, present in 1994-2005 (TFT) # Haplotype FF1 input="Lost_segments_FF1_1983-1993.bed" comp1="Lost_segments_FF1_2006-2014.bed" comp2="Lost_segments_FF1_1994-2005.bed" out="Lost_segments_FF1_1983-1993_2006-2014_but_not_in_1994-2005.bed" intersectBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc ##Result: 0 # Haplotype FF2 input="Lost_segments_FF2_1983-1993.bed" comp1="Lost_segments_FF2_2006-2014.bed" comp2="Lost_segments_FF2_1994-2005.bed" out="Lost_segments_FF2_1983-1993_2006-2014_but_not_in_1994-2005.bed" intersectBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc ##Result: 0 # Total TFT=0 # absent in 1994-2005 and 2006-2014, present in 1983-1993 (FTT) # Haplotype FF1 input="Lost_segments_FF1_1994-2005.bed" comp1="Lost_segments_FF1_2006-2014.bed" comp2="Lost_segments_FF1_1983-1993.bed" out="Lost_segments_FF1_1994-2005_2006-2014_but_not_in_1983-1993.bed" intersectBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc ##Result: 365 # Haplotype FF2 input="Lost_segments_FF2_1994-2005.bed" comp1="Lost_segments_FF2_2006-2014.bed" comp2="Lost_segments_FF2_1983-1993.bed" out="Lost_segments_FF2_1994-2005_2006-2014_but_not_in_1983-1993.bed" intersectBed -a $input -b $comp1 | subtractBed -a - -b $comp2 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}'| paste -s -d+ - | bc ##Result: 351 -1 (coinciding with discarded segment) # Total FTT= 365+350=715 #2. Analysing first founder male (FM1) #List of files: Lost_segments_untill2005_FM11.bed Lost_segments_untill2005_FM12.bed Lost_segments_after2005_FM11.bed Lost_segments_after2005_FM12.bed module load bioinfo-tools BEDTools/2.29.2 #a) count number of windows absent only in 1994-2005 (-TF) # # Haplotype FM11 input="Lost_segments_FM11_1994-2005.bed" comp1="Lost_segments_FM11_2006-2014.bed" out="Lost_segments_FM11_1994-2005_present_in_2006-2014.bed" subtractBed -a $input -b $comp1 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc ##Results: 8 # Haplotype FM12 input="Lost_segments_FM12_1994-2005.bed" comp1="Lost_segments_FM12_2006-2014.bed" out="Lost_segments_FM12_1994-2005_present_in_2006-2014.bed" subtractBed -a $input -b $comp1 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc ##Results:18 # Total -TF=8+18=26 #b) count number of windows absent only in 2006-2014 (-FT) # # Haplotype FM11 input="Lost_segments_FM11_2006-2014.bed" comp1="Lost_segments_FM11_1994-2005.bed" out="Lost_segments_FM11_2006-2014_present_in_1994-2005.bed" subtractBed -a $input -b $comp1 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc ##Results: 191 # Haplotype FM12 input="Lost_segments_FM12_2006-2014.bed" comp1="Lost_segments_FM12_1994-2005.bed" out="Lost_segments_FM12_2006-2014_present_in_1994-2005.bed" subtractBed -a $input -b $comp1 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc ##Results: 93 # Total -FT=191+93=284 #c count number of windows absent in both time periods (-TT) # Haplotype FM11 input="Lost_segments_FM11_2006-2014.bed" comp1="Lost_segments_FM11_1994-2005.bed" out="Lost_segments_FM11_2006-2014_1994-2005.bed" intersectBed -a $input -b $comp1 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc ##Results: 182 # Haplotype FM12 input="Lost_segments_FM12_2006-2014.bed" comp1="Lost_segments_FM12_1994-2005.bed" out="Lost_segments_FM12_2006-2014_1994-2005.bed" intersectBed -a $input -b $comp1 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc ##Results: 240 # Total -TT = 182+240=422 #3. Analysing second founder male (FM2) #List of files: Lost_segments_untill2005_FM21.bed Lost_segments_untill2005_FM22.bed Lost_segments_after2005_FM21.bed Lost_segments_after2005_FM22.bed module load bioinfo-tools BEDTools/2.29.2 #a) count number of windows absent only in 1994-2005 (-TF) # # Haplotype FM21 input="Lost_segments_FM21_1994-2005.bed" comp1="Lost_segments_FM21_2006-2014.bed" out="Lost_segments_FM21_1994-2005_present_in_2006-2014.bed" subtractBed -a $input -b $comp1 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc ##Result: 93 # Haplotype FM22 input="Lost_segments_FM22_1994-2005.bed" comp1="Lost_segments_FM22_2006-2014.bed" out="Lost_segments_FM22_1994-2005_present_in_2006-2014.bed" subtractBed -a $input -b $comp1 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc ##Result: 28 # Total -TF = 93+28 = 121 #b) count number of windows absent only in 2006-2014 (-FT) # # Haplotype FM21 input="Lost_segments_FM21_2006-2014.bed" comp1="Lost_segments_FM21_1994-2005.bed" out="Lost_segments_FM21_2006-2014_present_in_1994-2005.bed" subtractBed -a $input -b $comp1 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc ##Result: 171 -1 conc. with disc. windows # Haplotype FM22 input="Lost_segments_FM22_2006-2014.bed" comp1="Lost_segments_FM22_1994-2005.bed" out="Lost_segments_FM22_2006-2014_present_in_1994-2005.bed" subtractBed -a $input -b $comp1 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc ##Result: 61 -1 conc. with disc. windows # Total -FT=170+60=230 #c) count number of windows absent in both time periods (-TT) # Haplotype FM21 input="Lost_segments_FM21_2006-2014.bed" comp1="Lost_segments_FM21_1994-2005.bed" out="Lost_segments_FM21_2006-2014_1994-2005.bed" intersectBed -a $input -b $comp1 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc ##Result: 85 # Haplotype FM22 input="Lost_segments_FM22_2006-2014.bed" comp1="Lost_segments_FM22_1994-2005.bed" out="Lost_segments_FM22_2006-2014_1994-2005.bed" intersectBed -a $input -b $comp1 >$out awk -F '\t' 'FNR == NR {print ($3-$2)/1000000}' $out | awk '{printf "%.0f\n", $1+0.49}' | paste -s -d+ - | bc ##Result: 166 # Total -TT= 85+166=251 __________________________________________________________________ Tools used: BEDTools/2.27.1 ######################################## ABSENCE ANALYSIS OF INDIVIDUAL SNP ALLELES ############################################## #1. Data preparation and extra filtering steps #a) Extract a list of variants where 1983-1993 sample (Founder population) has no missing genotypes ind="Scand_wolf_1983-1993_pop_IDs.txt" inp="76Scand.98Finnish.filtered.v6.noHighSNPdensity.recode.vcf" module load bioinfo-tools vcftools/0.1.15 vcftools --vcf $inp --recode --out ${inp}.founder_pop.noNAs --keep $ind --max-missing-count 0 #b) Make a file with all individuals, but only for those positions where 1983-1993 sample doesn't have any missing values inp="76Scand.98Finnish.filtered.v6.noHighSNPdensity.recode.vcf" snps_to_keep="${inp}.founder_pop.noNAs.recode.vcf" vcftools --vcf $inp --recode --out ${inp%.recode.vcf}.founder_pop.noNAs --positions $snps_to_keep #c) Extract a vcf file with 76 Scandinavian wolves and keep only those SNPs that are segregating in this population. inp="76Scand.98Finnish.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf" out="76Scand.subset.filtered.v6.noHighSNPdensity.founder_pop.noNAs" keep="Scand_wolf_76_individuals_IDs.txt" module load bioinfo-tools vcftools/0.1.15 htslib/1.6 vcftools --vcf $inp --keep $keep --mac 1 --min-alleles 2 --max-alleles 2 --recode --out $out bgzip $out.recode.vcf tabix $out.recode.vcf.gz #2. Extracting vcf file for each subsample of interest. module load bioinfo-tools vcftools/0.1.15 htslib/1.6 #a) For a female founder: inp="76Scand.subset.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz" out="D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs" keep="Scand_wolf_FF.txt" vcftools --vcf $inp --keep $keep --mac 1 --min-alleles 2 --max-alleles 2 --recode --out $out bgzip $out.recode.vcf tabix $out.recode.vcf.gz #b) offspring of the first male founder: inp="76Scand.subset.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz" out="FM1-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs" keep="Scand_wolf_FM1_F1-3_offspring.txt" vcftools --vcf $inp --keep $keep --mac 1 --min-alleles 2 --max-alleles 2 --recode --out $out bgzip $out.recode.vcf tabix $out.recode.vcf.gz #c) offspring of the second male founder: inp="76Scand.subset.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz" out="FM2-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs" keep="Scand_wolf_FM2_F1_offspring.txt" vcftools --vcf $inp --keep $keep --mac 1 --min-alleles 2 --max-alleles 2 --recode --out $out bgzip $out.recode.vcf tabix $out.recode.vcf.gz #d) 1983-1993 subsample: inp="76Scand.subset.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz" out="1983-1993.filtered.v6.noHighSNPdensity.founder_pop.noNAs" keep="Scand_wolf_born_1983-1993.txt" vcftools --vcf $inp --keep $keep --mac 1 --min-alleles 2 --max-alleles 2 --recode --out $out bgzip $out.recode.vcf tabix $out.recode.vcf.gz #e) 1994-2005 subsample inp="76Scand.subset.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz" out="1994-2005.filtered.v6.noHighSNPdensity.founder_pop.noNAs" keep="Scand_wolf_born_1994-2005.txt" vcftools --vcf $inp --keep $keep --mac 1 --min-alleles 2 --max-alleles 2 --recode --out $out bgzip $out.recode.vcf tabix $out.recode.vcf.gz #f) 2006-2014 subsample inp="76Scand.subset.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz" out="2006-2014.filtered.v6.noHighSNPdensity.founder_pop.noNAs" keep="Scand_wolf_born_2006-2014.txt" vcftools --vcf $inp --keep $keep --mac 1 --min-alleles 2 --max-alleles 2 --recode --out $out bgzip $out.recode.vcf tabix $out.recode.vcf.gz #3. Converting vcf files to allele tables [chr pos Ind1_Allele1 Ind1_Allele2, Ind2_Allele1 Ind2_Allele2, ... ]: perl VCF2table.v2.pl D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table perl VCF2table.v2.pl FM1-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz FM1-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table perl VCF2table.v2.pl FM2-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz FM2-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table perl VCF2table.v2.pl 1983-1993.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz 1983-1993.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table perl VCF2table.v2.pl 1994-2005.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz 1994-2005.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table perl VCF2table.v2.pl 2006-2014.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz 2006-2014.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table #4. Female founder SNP allele comparisons #a) Count the occurrence of each female founder's (FF) allele in the three subgroups 1983-1993; 1994-2005; and 2006-2014 inp="D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" group1="1983-1993.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" group2="1994-2005.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" group3="2006-2014.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" out="D-85-01.allele.counts.txt" perl allele.counts.v2.pl $inp $group1 $group2 $group3 $out #b) Bin all of the female founder alleles into 8 bins according to presence/absence pattern: perl Summary.FF.lost.alleles.pl $out ${out%.txt}.summary.txt #5. Infering the portion of male founder alleles: unique alleles to the first (FM1) and second male founder (FM2) + those common in both, but absent in female founder #!!NB!! all allele.table files have to contain the same marker positions in the same order. #a) Extract SNP alleles that are unique to FM1 offspring, but not to FM2 offspring and FF presence="FM1-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" absence1="FM2-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" absence2="D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" out="FM1-offsp.unique.allele.table" perl presence.absence.analysis.pl $presence $absence1 $absence2 $out #b) Extract SNP alleles that are unique to FM2 offspring, but not to FM1 offspring and FF presence="FM2-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" absence1="FM1-offsp.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" absence2="D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" out="FM2-offsp.unique.allele.table" perl presence.absence.analysis.pl $presence $absence1 $absence2 $out #c) Extract SNP alleles that are in MF1 and MF2 offspring (1983-1993 sample) that are not in FF. # As technically script requires two populations of absence, I supplied FF alleles twice presence="1983-1993.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" absence1="D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" absence2="D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" out="1983-1993_FM1_FM2.allele.table" perl presence.absence.analysis.pl $presence $absence1 $absence2 $out #6. Male founder allele comparisons: binning into 4 bins, according to wether they are present or absent in the 1994-2005; 2006-2014 subgroups. #a) Unique alleles of the first male founder inp="FM1-offsp.unique.allele.table" group2="1994-2005.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" group3="2006-2014.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" perl FM1_allele.comparisons.pl $inp $group2 $group3 $inp.bin.summary #b) Unique alleles of the second male founder inp="FM2-offsp.unique.allele.table" group2="1994-2005.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" group3="2006-2014.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" perl FM2_allele.comparisons.pl $inp $group2 $group3 $inp.bin.summary #c) Common alleles to both male founders inp="1983-1993_FM1_FM2_without_uniq_FM1_FM2_offsp.allele.table" group2="1994-2005.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" group3="2006-2014.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" perl 1983-1993_FM1_FM2_common_allele.comparisons.pl $inp $group2 $group3 $inp.bin.summary #d) alleles present in 1994-2005; 2006-2014 that are absent in FF and in 1983-1993 #locate alleles present in 1994-2005 but not in 1983-1993 and FF presence="1994-2005.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" absence1="D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" absence2="1983-1993.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" out="1994-2005.not.inFFand1983-1993.allele.table" perl presence.absence.analysis.pl $presence $absence1 $absence2 $out #locate alleles present in 2006-2014 but not in 1983-1993 and FF presence="2006-2014.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" absence1="D-85-01.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" absence2="1983-1993.filtered.v6.noHighSNPdensity.founder_pop.noNAs.allele.table" out="2006-2014.not.inFFand1983-1993.allele.table" perl presence.absence.analysis.pl $presence $absence1 $absence2 $out #7 Collecting allele lists that are not present at certain subsample #FF_alleles not present in 1983-1993 sed 's/$/\tTTT/' D-85-01.allele.counts.bin7.TTT >> D-85-01.alleles.not.in.1983-1993 sed 's/$/\tTTF/' D-85-01.allele.counts.bin1.TTF >> D-85-01.alleles.not.in.1983-1993 sed 's/$/\tTFT/' D-85-01.allele.counts.bin2.TFT >> D-85-01.alleles.not.in.1983-1993 sed 's/$/\tTFF/' D-85-01.allele.counts.bin4.TFF >> D-85-01.alleles.not.in.1983-1993 wc -l D-85-01.alleles.not.in.1983-1993 10108 D-85-01.alleles.not.in.1983-1993 #FF_alleles not present in 1994-2005 sed 's/$/\tTTT/' D-85-01.allele.counts.bin7.TTT >> D-85-01.alleles.not.in.1994-2005 sed 's/$/\tTTF/' D-85-01.allele.counts.bin1.TTF >> D-85-01.alleles.not.in.1994-2005 sed 's/$/\tFTT/' D-85-01.allele.counts.bin3.FTT >> D-85-01.alleles.not.in.1994-2005 sed 's/$/\tFTF/' D-85-01.allele.counts.bin6.FTF >> D-85-01.alleles.not.in.1994-2005 wc -l D-85-01.alleles.not.in.1994-2005 71588 D-85-01.alleles.not.in.1994-2005 #FF_alleles not present in 2006-2014 sed 's/$/\tTTT/' D-85-01.allele.counts.bin7.TTT >> D-85-01.alleles.not.in.2006-2014 sed 's/$/\tTFT/' D-85-01.allele.counts.bin2.TFT >> D-85-01.alleles.not.in.2006-2014 sed 's/$/\tFTT/' D-85-01.allele.counts.bin3.FTT >> D-85-01.alleles.not.in.2006-2014 sed 's/$/\tFFT/' D-85-01.allele.counts.bin5.FFT >> D-85-01.alleles.not.in.2006-2014 wc -l D-85-01.alleles.not.in.2006-2014 91766 D-85-01.alleles.not.in.2006-2014 #FM1_alleles not present in 1994-2005 sed 's/$/\tTT/' FM1-offsp.unique.allele.bin1.TT >> FM1-offsp.alleles.not.in.1994-2005 sed 's/$/\tTF/' FM1-offsp.unique.allele.bin3.TF >> FM1-offsp.alleles.not.in.1994-2005 wc -l FM1-offsp.alleles.not.in.1994-2005 20002 FM1-offsp.alleles.not.in.1994-2005 #FM1_alleles not present in 2006-2014 sed 's/$/\tTT/' FM1-offsp.unique.allele.bin1.TT >> FM1-offsp.alleles.not.in.2006-2014 sed 's/$/\tFT/' FM1-offsp.unique.allele.bin2.FT >> FM1-offsp.alleles.not.in.2006-2014 wc -l FM1-offsp.alleles.not.in.2006-2014 34189 FM1-offsp.alleles.not.in.2006-2014 #FM2_alleles not present in 1994-2005 sed 's/$/\tTT/' FM2-offsp.unique.allele.bin1.TT >> FM2-offsp.alleles.not.in.1994-2005 sed 's/$/\tTF/' FM2-offsp.unique.allele.bin3.TF >> FM2-offsp.alleles.not.in.1994-2005 wc -l FM2-offsp.alleles.not.in.1994-2005 24070 FM2-offsp.alleles.not.in.1994-2005 #FM1_alleles not present in 2006-2014 sed 's/$/\tTT/' FM2-offsp.unique.allele.bin1.TT >> FM2-offsp.alleles.not.in.2006-2014 sed 's/$/\tFT/' FM2-offsp.unique.allele.bin2.FT >> FM2-offsp.alleles.not.in.2006-2014 wc -l FM2-offsp.alleles.not.in.2006-2014 28735 FM2-offsp.alleles.not.in.2006-2014 __________________________________________________________________ Tools used: custom perl scripts vcftools/0.1.15 ######################################## CONCORDANCE BETWEEN THE LOST SNPs AND 1 MB WINDOWS #################################### #1. Count total nr of analysed individual SNPs per chromosome (for statistical overview): echo -e "chr\tmarkers_total\tFF_unique_lost\tFF_not_in_1994-2005\tFF_not_in_2006-20014\tFM1_unique_kept\tFM1_not_in_1994-2005\tFM1_not_in_2006-2014\tFM2_unique_kept\tFM2_not_in_1994-2005\tFM2_not_in_2006-2014" > number_of_markers_per_chr_new.txt for chr in {1..38} X do FF_unique=(`echo $(grep -P "^$chr\t" D-85-01.alleles.not.in.1983-1993 |wc -l)` ) FM1_unique=(`echo $(grep -P "^$chr\t" FM1-offsp.unique.allele.table | wc -l )` ) FM2_unique=(`echo $(grep -P "^$chr\t" FM2-offsp.unique.allele.table | wc -l)` ) FF_not_in_pop1=(`echo $(grep -P "^$chr\t" D-85-01.alleles.not.in.1994-2005 |wc -l)` ) FM1_not_in_pop1=(`echo $(grep -P "^$chr\t" FM1-offsp.alleles.not.in.1994-2005 |wc -l )` ) FM2_not_in_pop1=(`echo $(grep -P "^$chr\t" FM2-offsp.alleles.not.in.1994-2005 |wc -l )` ) FF_not_in_pop2=(`echo $(grep -P "^$chr\t" D-85-01.alleles.not.in.2006-2014 | wc -l )` ) FM1_not_in_pop2=(`echo $(grep -P "^$chr\t" FM1-offsp.alleles.not.in.2006-2014 | wc -l )` ) FM2_not_in_pop2=(`echo $(grep -P "^$chr\t" FM2-offsp.alleles.not.in.2006-2014 | wc -l )` ) markers=(`echo $(zcat 76Scand.subset.filtered.v6.noHighSNPdensity.founder_pop.noNAs.recode.vcf.gz | grep -P "^${chr}\t" - | wc -l )` ) echo -e "$chr\t$markers\t$FF_unique\t$FF_not_in_pop1\t$FF_not_in_pop2\t$FM1_unique\t$FM1_not_in_pop1\t$FM1_not_in_pop2\t$FM2_unique\t$FM2_not_in_pop1\t$FM2_not_in_pop2" >> number_of_markers_per_chr_new.txt done #2. Counting SNPs that coinside with absent 1 Mb windows per each founder per period #a) Alleles absent in 1994-2005 subsample perl sort_lost_variation_according_to_bed.v2.pl Lost_segments_untill2005.txt Lost_segments_untill2005_SNP+marker_count.txt #b) Alleles absent in 2006-2014 subsample perl sort_lost_variation_according_to_bed.v3.pl Lost_segments_after2005.txt Lost_segments_after2005_SNP+marker_count.txt #c) Alleles absent in 1983-1993 subsample perl sort_lost_variation_according_to_bed.v4.pl Lost_segments_FFonly.txt Lost_segments_FFonly_SNP+marker_count.txt #d) Count unique alleles only, produce summary table of the number of concordant SNPs #the issue with this calculation is that if both homologous parts of the same chromosome have been lost, it counts SNPs within these regions twice. #after execution of the script loop through .concordantSNPs files and count only unique values: echo -e "chr\tFF_unique\tFF_not_in_pop1\tFF_not_in_pop2\tFM1_not_in_pop1\tFM1_not_in_pop2\tFM2_not_in_pop1\tFM2_not_in_pop2" > number_of_Concordant_markers_per_chr_new.txt for chr in {1..38} X do FF_unique=(`echo $(grep -P "^$chr\t" D-85-01.alleles.not.in.1983-1993.concordantSNPs | cut -f2 |sort | uniq |wc -l)` ) FF_not_in_pop1=(`echo $(grep -P "^$chr\t" D-85-01.alleles.not.in.1994-2005.concordantSNPs | cut -f2 |sort | uniq |wc -l)` ) FF_not_in_pop2=(`echo $(grep -P "^$chr\t" D-85-01.alleles.not.in.2006-2014.concordantSNPs | cut -f2 |sort | uniq |wc -l)` ) FM1_not_in_pop1=(`echo $(grep -P "^$chr\t" FM1-offsp.alleles.not.in.1994-2005.concordantSNPs | cut -f2 |sort | uniq |wc -l)` ) FM1_not_in_pop2=(`echo $(grep -P "^$chr\t" FM1-offsp.alleles.not.in.2006-2014.concordantSNPs | cut -f2 |sort | uniq |wc -l)` ) FM2_not_in_pop1=(`echo $(grep -P "^$chr\t" FM2-offsp.alleles.not.in.1994-2005.concordantSNPs| cut -f2 |sort | uniq |wc -l)` ) FM2_not_in_pop2=(`echo $(grep -P "^$chr\t" FM2-offsp.alleles.not.in.2006-2014.concordantSNPs | cut -f2 |sort | uniq |wc -l)` ) echo -e "$chr\t$FF_unique\t$FF_not_in_pop1\t$FF_not_in_pop2\t$FM1_not_in_pop1\t$FM1_not_in_pop2\t$FM2_not_in_pop1\t$FM2_not_in_pop2" >> number_of_Concordant_markers_per_chr_new.txt done #then contrast these values to the number of SNPs lost at each period (in MS Excel). #To pinpoint more precise boundaries of the lost 1 Mb chromosome segments they were manually updated by using first/last individual SNP allele of the SNP cluster that coincided with the absent 1 Mb haplotype cluster (if possible). __________________________________________________________________ Tools used: custom perl scripts BEDTools/2.27.1 vcftools/0.1.15 ############################################## COMPILATION OF THE USED PERL SCRIPTS #################################### #1# maf2bed.pl #!usr/bin/perl #This script reads in .maf file from lastz alignment and converts it to a bed file with coordinates of the aligned sequences use strict; use warnings; open (FH, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; #maf file open (FH2, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; #bed file while () { if ($_ =~ /^s/ && $_ =~ /\+/){ #print STDOUT $_; chomp $_; my @data = split(/[\s\t]+/,$_); #print STDOUT "$data[2]\t$data[3]\n"; my $end=$data[2]+$data[3]; my $start=$data[2]; print FH2 "$data[1]\t$start\t$end\n"; } elsif ($_ =~ /^s/ && $_ =~ /\-/){ chomp $_; my @data = split(/[\s\t]+/,$_); my $end=$data[5]-$data[2]; my $start=$data[5]-$data[2]-$data[3]; print FH2 "$data[1]\t$start\t$end\n"; } } #2# plot.histogram.pl #execution: module load perl_modules/5.24.1 #perl ./plot.histogram.pl infile columNr-1 use strict; use warnings; use Statistics::Histogram; open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; my @hist = (); while () { chomp $_; my @data = split(/\t/,$_); push @hist, $data[$ARGV[1]]; } print STDOUT get_histogram(\@hist); #3# Transform_VCF2PHASE.pl #!usr/bin/perl #Script reads in a vcf file with SNP genotypes and transforms it into an inputfile for PHASE #Chromosome needs to be specified #Usage: perl Transform_VCF2PHASE.pl use strict; use warnings; local $" = "\t"; open (FH, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; # input vcf open (FH2, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; # output file my $chr=$ARGV[2]; my @samples; my @colums; my @POS; my $type = '';; my $nr_of_samples = ''; my $nr_of_SNPs = ''; while () { my $line = $_; my @data; chomp $line; if ($_ =~ /^#CHROM/){ @colums = split(/\t/,$line); $nr_of_samples = scalar @colums - 9; @samples = @colums[9..scalar @colums -1]; #print STDOUT "@colums\n$nr_of_samples\t@samples"; } elsif ($_ =~ /^$chr/){ @data = split(/\t/,$line); $type = join '', $type, 'S'; push @POS, $data[1]; } $nr_of_SNPs = scalar @POS; } #print STDOUT "@POS\n"; print FH2 "$nr_of_samples\n$nr_of_SNPs\nP@POS\n$type\n"; my $i = 9; foreach my $ind (@samples){ open (FH, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; my $geno= ''; my @alleleA; my @alleleB; while () { my @data; my $line = $_; if ($_ =~ /^$chr/){ chomp $line; @data = split(/\t/,$line); $geno = $data[$i]; #print STDOUT "$geno and $data[$i]\n"; } if ($geno =~ /^0[\/\|]0/){ push @alleleA, $data[3]; push @alleleB, $data[3]; } elsif ($geno=~ /^1[\/\|]0/){ push @alleleA, $data[4]; push @alleleB, $data[3]; } elsif ($geno =~ /^0[\/\|]1/){ push @alleleA, $data[3]; push @alleleB, $data[4]; } elsif ($geno =~ /^1[\/\|]1/){ push @alleleA, $data[4]; push @alleleB, $data[4]; } elsif ($geno =~ /^\.[\/\|]\./){ push @alleleA, "?"; push @alleleB, "?"; } } print FH2 "#$ind\n@alleleA\n@alleleB\n"; $i++; #print STDOUT "$i\n"; } '#4# Extract_haplotype_pairs_PHASE.v2.pl #!usr/bin/perl #Script reads in a set of .PHASE.out files (for example, all output files of individual 1 Mb window phasing of a chromosome) and extracts BESTPAIRS_SUMMARY #Output is a table, where first column is a list of individual sample IDs and remaining columns contain haplotype IDs of particular windows from the 1st step of phasing. #Intended for collecting results after running PHASE on sliding windows #Usage: perl Extract_haplotype_pairs_PHASE.v2.pl #example of pattern: PHASING/1MB_windows/chr${chr}/PHASE_1_newfilt/Scand_wolf.markers.for.phasing1_1Mb_*0.PHASE.out use strict; use warnings; use Sort::Naturally; print STDOUT "$ARGV[0] \n"; my @FILES = nsort <$ARGV[0]>; print STDOUT "@FILES \n"; open (FH2, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; open (FH3, '<', $FILES[0]) or die "Can't open $FILES[0]: $!\n"; print FH2 "SampleID\n"; while () { my @data =(); my @haps =(); if ($_ =~/^#/){ #print STDOUT $_; chomp $_; @data = split(/:/,$_); print FH2 "$data[0]_1\n$data[0]_2\n"; } } foreach my $file (@FILES) { my @window = split(/b_/,$file); print FH2 "$window[1]\n"; open (FH, '<', $file ) or die "Can't open $ARGV[0]: $!\n"; while () { my @data =(); my @haps =(); if ($_ =~/^#/){ #print STDOUT $_; chomp $_; @data = split(/:/,$_); @haps = split(/[,\(\)]/,$data[1]); print FH2 "$haps[1]\n$haps[2]\n"; } } } #5# Collect.PHASE.haps.pl #!usr/bin/perl #Script is intended to be used after 2nd phasing step (PHASE results for chromosome-level haplotypes) #It reads in a PHASE.out file and extracts info from BESTPAIRS1 section #output file is formated to be imported in Excel for manual curration #Usage: perl Collect.PHASE.haps.pl <.out.forExcel.txt> #If several runs are compared, PHASE.out can also be a pattern match of file names use strict; use warnings; my @FILES = <"$ARGV[0]">; # PHASE.out file (can also be used a patern match if several runs compared) open (FH2, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; foreach my $file (@FILES) { print FH2 "$file\n"; open (FH, '<', $file ) or die "Can't open $ARGV[0]: $!\n"; while () { my @data=(); if ($_ =~/^0 #/){ #print STDOUT $_; chomp $_; @data = split(/#/,$_); #print STDOUT "@data"; my $hap1 = ; my $hap2 = ; print FH2 "$data[1] $hap1$data[1] $hap2"; } } } #6# Count_homoz_founder_windows.pl #!usr/bin/perl #This script reads in a text file with pairwise comparison matrices of all six founder haplotypes for all chromosomes and outputs a table of homozygous windows per founder per chromosome #Usage: perl Count_homoz_founer_windows.pl use strict; use warnings; open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; open (FH2, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; print FH2 "Chr\tFF\tFM1\tFM2\n"; while () { for (my $i=1; $i <= 39; $i++) { next unless $_ =~ /^$i\t/; my $FF_line = ; chomp $FF_line; my @data_FF = split(/\t/,$FF_line); #print STDOUT "@data_FF\n"; ; my $FM1_line = ; chomp $FM1_line; my @data_FM1 = split(/\t/,$FM1_line); ; my $FM2_line = ; chomp $FM2_line; my @data_FM2 = split(/\t/,$FM2_line); print FH2 "$i\t$data_FF[2]\t$data_FM1[4]\t$data_FM2[6]\n"; } } #7# VCF2table.v2.pl #!usr/bin/perl #Script transforms a vcf file to a table of alleles #Usage: perl VCF2table.v2.pl use strict; use warnings; local $" = "\t"; open (FH, "-|", "gzip -dc $ARGV[0]") or die "Can't open $ARGV[0]: $!\n"; # input vcf open (FH2, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; # output file #print STDOUT "opened file\n"; my $length=''; while () { chomp $_; # print STDOUT "$_\n"; if ($_ =~ /^#CHR/){ print STDOUT "got CHR\n"; my @heder = split(/\t/,$_); print STDOUT "heder is @heder\n"; $length = scalar @heder; print STDOUT "length of heder is $length\n"; my @indiv = @heder[9 .. $length - 1]; print STDOUT "exporting individuals @indiv to a table\n"; print FH2 join("\t", $heder[0], $heder[1], map { $_, $_ } @indiv) . "\n"; } if ($_ =~ /^[\dX]/){ my @data = split(/\t/,$_); my $ref_allele = $data[3]; my $alt_allele = $data[4]; my @alleles = (); for(my $i=9; $i <= $length - 1; $i++) { my @geno = split /[:\/\|]/, $data[$i]; #print STDOUT "my geno is @geno\n"; #assign allele 1 if ($geno[0] =~ 0){ push @alleles, $ref_allele; } elsif ($geno[0]=~ 1){ push @alleles, $alt_allele; } else { push @alleles, "."; } #assign allele 2 if ($geno[1] =~ 0){ push @alleles, $ref_allele; } elsif ($geno[1]=~ 1){ push @alleles, $alt_allele; } else { push @alleles, "."; } } print FH2 "$data[0]\t$data[1]\t@alleles\n"; } } #8# allele.counts.v2.pl #!usr/bin/perl #Script reads in allele.table of a single individual and counts the occurance of each allele in 3 different subgroups #Usage: perl allele.counts.v2.pl use strict; use warnings; local $" = "\t"; open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; open (FH2, '<', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; open (FH3, '<', $ARGV[2]) or die "Can't open $ARGV[2]: $!\n"; open (FH4, '<', $ARGV[3]) or die "Can't open $ARGV[3]: $!\n"; open (FH5, '>', $ARGV[4]) or die "Can't open $ARGV[4]: $!\n"; print FH5 "CHR\tPOS\tFF1\tGroup1\tGroup2\tGroup3\tFF2\tGroup1\tGroup2\tGroup3\n"; while () { my $input = $_; my $subgroup1 = ; my $subgroup2 = ; my $subgroup3 = ; if ($input !~ /^#/) { chomp $input; chomp $subgroup1; chomp $subgroup2; chomp $subgroup3; my @input = split(/\t/, $input); my @subgroup1 = split(/\t/, $subgroup1); my @subgroup2 = split(/\t/, $subgroup2); my @subgroup3 = split(/\t/, $subgroup3); my $sg1Lgth = scalar @subgroup1; my $sg2Lgth = scalar @subgroup2; my $sg3Lgth = scalar @subgroup3; my @sg1Alleles = @subgroup1[2..$sg1Lgth - 1]; my @sg2Alleles = @subgroup2[2..$sg2Lgth - 1]; my @sg3Alleles = @subgroup3[2..$sg3Lgth - 1]; my $FF1_in_SG1 = 0; my $FF1_in_SG2 = 0; my $FF1_in_SG3 = 0; my $FF2_in_SG1 = 0; my $FF2_in_SG2 = 0; my $FF2_in_SG3 = 0; my @out = @input[0..1]; #count acureances of FF1 allele in three subgroups, store counts in @out array if ( grep( /^$input[2]$/, @sg1Alleles ) ) { $FF1_in_SG1 = grep (/^$input[2]$/, @sg1Alleles); } if ( grep( /^$input[2]$/, @sg2Alleles ) ) { $FF1_in_SG2 = grep (/^$input[2]$/, @sg2Alleles); } if ( grep( /^$input[2]$/, @sg3Alleles ) ) { $FF1_in_SG3 = grep (/^$input[2]$/, @sg3Alleles); } push @out, ($input[2], $FF1_in_SG1, $FF1_in_SG2, $FF1_in_SG3); #count acureances of FF2 allele in three subgroups, store counts in @out array if ( grep( /^$input[3]$/, @sg1Alleles ) ) { $FF2_in_SG1 = grep (/^$input[3]$/, @sg1Alleles); } if ( grep( /^$input[3]$/, @sg2Alleles ) ) { $FF2_in_SG2 = grep (/^$input[3]$/, @sg2Alleles); } if ( grep( /^$input[3]$/, @sg3Alleles ) ) { $FF2_in_SG3 = grep (/^$input[3]$/, @sg3Alleles); } push @out, ($input[3], $FF2_in_SG1, $FF2_in_SG2, $FF2_in_SG3); print FH5 "@out\n"; } } #9# Summary.FF.lost.alleles.pl # Script tailored specifically to count lost alleles of the female founder # Script reads in the output from allele.counts.v2.pl and bins all alleles into 8 different files according to presence/absence pattern within 3 subgroups # T stands for "absence of allele is TRUE"; F stands for "absence of allele is FALSE" # usage: perl Summary.FF.lost.alleles.pl use strict; use warnings; local $" = "\t"; open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; open (FH2, '>', $ARGV[1]) or die "Can't open $ARGV[3]: $!\n"; #make output files to store coordinates of alleles of a particular bin open (BIN1, '>', "D-85-01.allele.counts.bin1.TTF") or die "Can't open BIN1: $!\n"; open (BIN2, '>', "D-85-01.allele.counts.bin2.TFT") or die "Can't open BIN2: $!\n"; open (BIN3, '>', "D-85-01.allele.counts.bin3.FTT") or die "Can't open BIN3: $!\n"; open (BIN4, '>', "D-85-01.allele.counts.bin4.TFF") or die "Can't open BIN4: $!\n"; open (BIN5, '>', "D-85-01.allele.counts.bin5.FFT") or die "Can't open BIN5: $!\n"; open (BIN6, '>', "D-85-01.allele.counts.bin6.FTF") or die "Can't open BIN6: $!\n"; open (BIN7, '>', "D-85-01.allele.counts.bin7.TTT") or die "Can't open BIN7: $!\n"; open (BIN8, '>', "D-85-01.allele.counts.bin8.FFF") or die "Can't open BIN8: $!\n"; #define 8 bins based on presence/absence pattern #3 reference periods are in consecutive order my $TTF=0; my $TFT=0; my $FTT=0; my $TFF=0; my $FFT=0; my $FTF=0; my $TTT=0; my $FFF=0; my $total=0; ; while () { chomp $_; # print "$_"; my @data = split (/\t/, $_); my @FF1alleles = @data[3..5]; my @FF2alleles = @data[7..9]; # print "alleles to count:\n@FF1alleles\n@FF2alleles\n"; if ($FF1alleles[0] == 0 && $FF1alleles[1] == 0 && $FF1alleles[2] != 0) { $TTF++; $total++; print BIN1 "@data[0..2]\n";} if ($FF1alleles[0] == 0 && $FF1alleles[1] != 0 && $FF1alleles[2] == 0) { $TFT++; $total++; print BIN2 "@data[0..2]\n";} if ($FF1alleles[0] != 0 && $FF1alleles[1] == 0 && $FF1alleles[2] == 0) { $FTT++; $total++; print BIN3 "@data[0..2]\n";} if ($FF1alleles[0] == 0 && $FF1alleles[1] != 0 && $FF1alleles[2] != 0) { $TFF++; $total++; print BIN4 "@data[0..2]\n";} if ($FF1alleles[0] != 0 && $FF1alleles[1] != 0 && $FF1alleles[2] == 0) { $FFT++; $total++; print BIN5 "@data[0..2]\n";} if ($FF1alleles[0] != 0 && $FF1alleles[1] == 0 && $FF1alleles[2] != 0) { $FTF++; $total++; print BIN6 "@data[0..2]\n";} if ($FF1alleles[0] == 0 && $FF1alleles[1] == 0 && $FF1alleles[2] == 0) { $TTT++; $total++; print BIN7 "@data[0..2]\n";} if ($FF1alleles[0] != 0 && $FF1alleles[1] != 0 && $FF1alleles[2] != 0) { $FFF++; $total++; print BIN8 "@data[0..2]\n";} if ($FF2alleles[0] == 0 && $FF2alleles[1] == 0 && $FF2alleles[2] != 0) { $TTF++; $total++; print BIN1 "@data[0..1]\t$data[6]\n";} if ($FF2alleles[0] == 0 && $FF2alleles[1] != 0 && $FF2alleles[2] == 0) { $TFT++; $total++; print BIN2 "@data[0..1]\t$data[6]\n";} if ($FF2alleles[0] != 0 && $FF2alleles[1] == 0 && $FF2alleles[2] == 0) { $FTT++; $total++; print BIN3 "@data[0..1]\t$data[6]\n";} if ($FF2alleles[0] == 0 && $FF2alleles[1] != 0 && $FF2alleles[2] != 0) { $TFF++; $total++; print BIN4 "@data[0..1]\t$data[6]\n";} if ($FF2alleles[0] != 0 && $FF2alleles[1] != 0 && $FF2alleles[2] == 0) { $FFT++; $total++; print BIN5 "@data[0..1]\t$data[6]\n";} if ($FF2alleles[0] != 0 && $FF2alleles[1] == 0 && $FF2alleles[2] != 0) { $FTF++; $total++; print BIN6 "@data[0..1]\t$data[6]\n";} if ($FF2alleles[0] == 0 && $FF2alleles[1] == 0 && $FF2alleles[2] == 0) { $TTT++; $total++; print BIN7 "@data[0..1]\t$data[6]\n";} if ($FF2alleles[0] != 0 && $FF2alleles[1] != 0 && $FF2alleles[2] != 0) { $FFF++; $total++; print BIN8 "@data[0..1]\t$data[6]\n";} } print FH2 "Summarizing file $ARGV[0]\n"; print FH2 "Total number of alleles across bins: $total\n"; print FH2 "TTF\t$TTF\nTFT\t$TFT\nFTT\t$FTT\nTFF\t$TFF\nFFT\t$FFT\nFTF\t$FTF\nTTT\t$TTT\nFFF\t$FFF\n"; #10# presence.absence.analysis.pl # Script was tailored for obtaining unique alleles of male founders by comparing allele presence/absence in different sets of their offspring. # Script reads in allele tables of several subgroups and compares allele content between those # the output file has a list of alleles present in one, but absent in other two groups. # Usage: perl presence.absence.analysis.pl use strict; use warnings; local $" = "\t"; open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; open (FH2, '<', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; open (FH3, '<', $ARGV[2]) or die "Can't open $ARGV[2]: $!\n"; open (FH4, '>', $ARGV[3]) or die "Can't open $ARGV[3]: $!\n"; while () { my $presence = $_; my $absence1 = ; my $absence2 = ; if ($presence !~ /^#/) { chomp $presence; chomp $absence1; chomp $absence2; my @presence = split(/\t/, $presence); my @absence1 = split(/\t/, $absence1); my @absence2 = split(/\t/, $absence2); print STDOUT "$presence[1]\t$absence1[1]\t$absence2[1]\n"; my $presLgth = scalar @presence; my @alleles = @presence[2..$presLgth - 1]; my %seen =(); my @uniq =(); #make an array with unique alleles foreach my $item (@alleles) { push(@uniq, $item) unless $seen{$item}++; } foreach my $item (@uniq) { if ( grep( /^$item$/, @absence1 ) || grep( /^$item$/, @absence2 )) { print STDOUT "$item\n@absence1\n@absence2\n"; } else { print FH4 "$presence[0]\t$presence[1]\t$item\n"; print STDOUT "unique value detected\n$item\n@absence1\n@absence2\n"; } } } } #11# FM1_allele.comparisons.pl #script individually tailored for the first male founder analysis #it reads in a file with unique alleles of the first male founder and determines if each particular allele #is present in the two temporal subgroups and bins unique alleles of the first male founder into 4 output files: #TT (absent in both subgroups); FT (present in first, absent in the other, TF (vice versa), FF (present in both) #usage: perl FM_allele.comparisons.pl use strict; use warnings; local $" = "\t"; open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; open (FH2, '>', $ARGV[3]) or die "Can't open $ARGV[3]: $!\n"; #make output files to store coordinates of alleles in a particular bin open (BIN1, '>', "FM1-offsp.unique.allele.bin1.TT") or die "Can't open BIN1: $!\n"; open (BIN2, '>', "FM1-offsp.unique.allele.bin2.FT") or die "Can't open BIN2: $!\n"; open (BIN3, '>', "FM1-offsp.unique.allele.bin3.TF") or die "Can't open BIN3: $!\n"; open (BIN4, '>', "FM1-offsp.unique.allele.bin4.FF") or die "Can't open BIN4: $!\n"; #define 4 bins (T stands for "absence of the allele is TRUE"; F - "absence of the allele is FALSE") #(T -> TRUE == 0; F -> False != 0) my $TT=0; my $FT=0; my $TF=0; my $FF=0; my $total=0; while () { chomp $_; my @data = split (/\t/, $_); my @sg2Alleles = (); my @sg3Alleles = (); my $FM_in_SG2 = 0; my $FM_in_SG3 = 0; #print "@data\n"; #loop through the subgroup2 and look for the line with corresponding chr and position of the allele open (FH3, '<', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; my $LINE2 = ''; do {$LINE2 = } until $LINE2 =~ /^$data[0]\t$data[1]/ || eof; chomp $LINE2; my @subgroup2 = split(/\t/, $LINE2); print "SG2:@subgroup2[0..1]\n"; my $sg2Lgth = scalar @subgroup2; @sg2Alleles = @subgroup2[2..$sg2Lgth - 1]; #loop through the subgroup3 and find the line with corresponding chr and position of the allele open (FH4, '<', $ARGV[2]) or die "Can't open $ARGV[2]: $!\n"; my $LINE3 = ''; do {$LINE3 = } until $LINE3 =~ /^$data[0]\t$data[1]/ || eof; chomp $LINE3; my @subgroup3 = split(/\t/, $LINE3); print "SG3:@subgroup3[0..1]\n"; my $sg3Lgth = scalar @subgroup3; @sg3Alleles = @subgroup3[2..$sg3Lgth - 1]; print "$data[2]\n@sg2Alleles\n@sg3Alleles\n"; my $g2 = 0; my $g3 = 0; if ( grep /^$data[2]$/, @sg2Alleles){$g2 = 1;} if ( grep /^$data[2]$/, @sg3Alleles){$g3 = 1;} if ($g2 == 0 && $g3 == 0) {print BIN1 "@data\n"; $TT++; $total++;} if ($g2 == 1 && $g3 == 0) {print BIN2 "@data\n"; $FT++; $total++;} if ($g2 == 0 && $g3 == 1) {print BIN3 "@data\n"; $TF++; $total++;} if ($g2 == 1 && $g3 == 1) {print BIN4 "@data\n"; $FF++; $total++;} } print FH2 "Summarizing file $ARGV[0]\n"; print FH2 "Total number of alleles across bins: $total\n"; print FH2 "TT\t$TT\nFT\t$FT\nTF\t$TF\nFF\t$FF\n"; #12# FM2_allele.comparisons.pl #script individually tailored for the second male founder #it reads in a file with unique alleles of the second male founder and determines if each particular allele #is present in the two temporal subgroups and bins unique alleles of the second male founder into 4 output files: #TT (absent in both subgroups); FT (present in first, absent in the other, TF (vice versa), FF (present in both) #usage: perl FM2_allele.comparisons.pl use strict; use warnings; local $" = "\t"; open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; open (FH2, '>', $ARGV[3]) or die "Can't open $ARGV[3]: $!\n"; #make output files to store coordinates of alleles in particular bin open (BIN1, '>', "FM2-offsp.unique.allele.bin1.TT") or die "Can't open BIN1: $!\n"; open (BIN2, '>', "FM2-offsp.unique.allele.bin2.FT") or die "Can't open BIN2: $!\n"; open (BIN3, '>', "FM2-offsp.unique.allele.bin3.TF") or die "Can't open BIN3: $!\n"; open (BIN4, '>', "FM2-offsp.unique.allele.bin4.FF") or die "Can't open BIN4: $!\n"; #define 4 bins (T stands for "absence of the allele is TRUE"; F - "absence of the allele is FALSE") #(T -> TRUE == 0; F -> False != 0) my $TT=0; my $FT=0; my $TF=0; my $FF=0; my $total=0; while () { chomp $_; my @data = split (/\t/, $_); my @sg2Alleles = (); my @sg3Alleles = (); my $FM_in_SG2 = 0; my $FM_in_SG3 = 0; print "@data\n"; #loop through the subgroup2 and look for the line with corresponding chr and position of the allele open (FH3, '<', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; my $LINE2 = ''; do {$LINE2 = } until $LINE2 =~ /^$data[0]\t$data[1]/ || eof; chomp $LINE2; my @subgroup2 = split(/\t/, $LINE2); print "SG2:@subgroup2[0..1]\n"; my $sg2Lgth = scalar @subgroup2; @sg2Alleles = @subgroup2[2..$sg2Lgth - 1]; #loop through the subgroup3 and look for the line with corresponding chr and position of the allele open (FH4, '<', $ARGV[2]) or die "Can't open $ARGV[2]: $!\n"; my $LINE3 = ''; do {$LINE3 = } until $LINE3 =~ /^$data[0]\t$data[1]/ || eof; chomp $LINE3; my @subgroup3 = split(/\t/, $LINE3); print "SG3:@subgroup3[0..1]\n"; my $sg3Lgth = scalar @subgroup3; @sg3Alleles = @subgroup3[2..$sg3Lgth - 1]; print "$data[2]\n@sg2Alleles\n@sg3Alleles\n"; my $g2 = 0; my $g3 = 0; if ( grep /^$data[2]$/, @sg2Alleles){$g2 = 1;} if ( grep /^$data[2]$/, @sg3Alleles){$g3 = 1;} if ($g2 == 0 && $g3 == 0) {print BIN1 "@data\n"; $TT++; $total++;} if ($g2 == 1 && $g3 == 0) {print BIN2 "@data\n"; $FT++; $total++;} if ($g2 == 0 && $g3 == 1) {print BIN3 "@data\n"; $TF++; $total++;} if ($g2 == 1 && $g3 == 1) {print BIN4 "@data\n"; $FF++; $total++;} } print FH2 "Summarizing file $ARGV[0]\n"; print FH2 "Total number of alleles across bins: $total\n"; print FH2 "TT\t$TT\nFT\t$FT\nTF\t$TF\nFF\t$FF\n"; #13# 1983-1993_FM1_FM2_common_allele.comparisons.pl #script individually tailored for the common alleles of the first and second male founders #it reads in a file with common alleles of both male founders and determines if each particular allele #is present in the two temporal subgroups and bins common male alleles into 4 output files: #TT (absent in both subgroups); FT (present in first, absent in the other, TF (vice versa), FF (present in both) #usage: perl FM2_allele.comparisons.pl use strict; use warnings; local $" = "\t"; open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; open (FH2, '>', $ARGV[3]) or die "Can't open $ARGV[3]: $!\n"; #make output files to store coordinates of alleles in particular bin open (BIN1, '>', "1983-1993_FM1_FM2_without_uniq_FM1_FM2_offsp.bin1.TT") or die "Can't open BIN1: $!\n"; open (BIN2, '>', "1983-1993_FM1_FM2_without_uniq_FM1_FM2_offsp.bin2.FT") or die "Can't open BIN2: $!\n"; open (BIN3, '>', "1983-1993_FM1_FM2_without_uniq_FM1_FM2_offsp.bin3.TF") or die "Can't open BIN3: $!\n"; open (BIN4, '>', "1983-1993_FM1_FM2_without_uniq_FM1_FM2_offsp.bin4.FF") or die "Can't open BIN4: $!\n"; #define 4 bins (T stands for "absence of the allele is TRUE"; F - "absence of the allele is FALSE") #(T -> TRUE == 0; F -> False != 0) my $TT=0; my $FT=0; my $TF=0; my $FF=0; my $total=0; while () { chomp $_; my @data = split (/\t/, $_); my @sg2Alleles = (); my @sg3Alleles = (); my $FM_in_SG2 = 0; my $FM_in_SG3 = 0; print "@data\n"; #loop through the subgroup2 and look for the line with corresponding chr and position of the allele open (FH3, '<', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; my $LINE2 = ''; do {$LINE2 = } until $LINE2 =~ /^$data[0]\t$data[1]/ || eof; chomp $LINE2; my @subgroup2 = split(/\t/, $LINE2); print "SG2:@subgroup2[0..1]\n"; my $sg2Lgth = scalar @subgroup2; @sg2Alleles = @subgroup2[2..$sg2Lgth - 1]; #loop through the subgroup3 and look for the line with corresponding chr and position of the allele open (FH4, '<', $ARGV[2]) or die "Can't open $ARGV[2]: $!\n"; my $LINE3 = ''; do {$LINE3 = } until $LINE3 =~ /^$data[0]\t$data[1]/ || eof; chomp $LINE3; my @subgroup3 = split(/\t/, $LINE3); print "SG3:@subgroup3[0..1]\n"; my $sg3Lgth = scalar @subgroup3; @sg3Alleles = @subgroup3[2..$sg3Lgth - 1]; print "$data[2]\n@sg2Alleles\n@sg3Alleles\n"; my $g2 = 0; my $g3 = 0; if ( grep /^$data[2]$/, @sg2Alleles){$g2 = 1;} if ( grep /^$data[2]$/, @sg3Alleles){$g3 = 1;} if ($g2 == 0 && $g3 == 0) {print BIN1 "@data\n"; $TT++; $total++;} if ($g2 == 1 && $g3 == 0) {print BIN2 "@data\n"; $FT++; $total++;} if ($g2 == 0 && $g3 == 1) {print BIN3 "@data\n"; $TF++; $total++;} if ($g2 == 1 && $g3 == 1) {print BIN4 "@data\n"; $FF++; $total++;} } print FH2 "Summarizing file $ARGV[0]\n"; print FH2 "Total number of alleles across bins: $total\n"; print FH2 "TT\t$TT\nFT\t$FT\nTF\t$TF\nFF\t$FF\n"; #14# sort_lost_variation_according_to_bed.v2.pl #!usr/bin/perl #Individually tailored script to count the number of concordant SNPs between the lost 1 Mb haplotype clusters (bed file) and lost individual founder alleles in 1994-2005. #Update 21.08.2019; removed the option of printing disconcordant SNPs to outfile, instead counting them afterwards #Update 02.07.2020; analyzing only chromosomes where there are lost segments #This script reads in a list of SNP positions and compares it to a bed file in order to detrmine concordant SNPs - those that fall into regions defined in bed file. #Positions of concordant SNPs are printed to separate file for each founder (ends with .concordantSNPs) and number of SNPs per each 1 Mb haplotype cluster are printed in output file #perl sort_lost_variation_according_to_bed.pl use strict; use warnings; open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; open (FH4, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; #Define all SNP lists that need to be compared my $infile_FF = "D-85-01.alleles.not.in.1994-2005"; my $outfile_FF = "D-85-01.alleles.not.in.1994-2005.concordantSNPs"; my $infile_FM1 = "FM1-offsp.alleles.not.in.1994-2005"; my $outfile_FM1 = "FM1-offsp.alleles.not.in.1994-2005.concordantSNPs"; my $infile_FM2 = "FM2-offsp.alleles.not.in.1994-2005"; my $outfile_FM2 = "FM2-offsp.alleles.not.in.1994-2005.concordantSNPs"; my $FF_concordant_SNPs = ''; my $FM1_concordant_SNPs = ''; my $FM2_concordant_SNPs = ''; while () { next if ($_ =~ /chr/); chomp $_; my @data = split(/\t/,$_); my $chr = $data[0]; my $begc = $data[1]; my $endc = $data[2]; my $founder = $data[3]; my $nr_of_SNPs = ''; next if ( $data[1] == 0 && $data[2] == 0 ); if ($founder =~ /^FF/) { open (FH2, '<', $infile_FF ) or die "Can't open $infile_FF: $!\n"; open (FH3, '>>', $outfile_FF ) or die "Can't open $outfile_FF: $!\n"; print STDOUT "$chr\t$begc\t$endc\t$founder\n"; print STDOUT "Start processing new FF region on chr $chr...\n"; while () { chomp $_; my @snp_data = split(/\t/,$_); my $snp_chr = $snp_data[0]; my $snp_pos = $snp_data[1]; #print STDOUT "$snp_chr\t$chr:$snp_pos\t$begc\t$endc\n"; if ($snp_chr eq $chr && $snp_pos >= $begc && $snp_pos <= $endc) { print FH3 "$snp_chr\t$snp_pos\t$chr:$begc:$endc:$founder\n"; ++$FF_concordant_SNPs; ++$nr_of_SNPs; } } } elsif ($founder =~ /^FM1/) { open (FH2, '<', $infile_FM1 ) or die "Can't open $infile_FM1: $!\n"; open (FH3, '>>', $outfile_FM1 ) or die "Can't open $outfile_FM1: $!\n"; print STDOUT "$chr\t$begc\t$endc\t$founder\n"; print STDOUT "Start processing new FM1 region on chr $chr...\n"; while () { chomp $_; my @snp_data = split(/\t/,$_); my $snp_chr = $snp_data[0]; my $snp_pos = $snp_data[1]; if ($snp_chr eq $chr && $snp_pos >= $begc && $snp_pos <= $endc) { print FH3 "$snp_chr\t$snp_pos\t$chr:$begc:$endc:$founder\n"; ++$FM1_concordant_SNPs; ++$nr_of_SNPs; } } } elsif ($founder =~ /^FM2/) { open (FH2, '<', $infile_FM2 ) or die "Can't open $infile_FM2: $!\n"; open (FH3, '>>', $outfile_FM2 ) or die "Can't open $outfile_FM2: $!\n"; print STDOUT "$chr\t$begc\t$endc\t$founder\n"; print STDOUT "Start processing new FM2 region on chr $chr...\n"; while () { chomp $_; my @snp_data = split(/\t/,$_); my $snp_chr = $snp_data[0]; my $snp_pos = $snp_data[1]; if ($snp_chr eq $chr && $snp_pos >= $begc && $snp_pos <= $endc) { print FH3 "$snp_chr\t$snp_pos\t$chr:$begc:$endc:$founder\n"; ++$FM2_concordant_SNPs; ++$nr_of_SNPs; } } } print FH4 "@data\t$nr_of_SNPs\n"; } print STDOUT "FF_concordant_SNPs\t$FF_concordant_SNPs\nFM1_concordant_SNPs\t$FM1_concordant_SNPs\nFM2_concordant_SNPs:$FM2_concordant_SNPs\n"; #15# sort_lost_variation_according_to_bed.v3.pl #!usr/bin/perl #Individually tailored script to count the number of concordant SNPs between the lost 1 Mb haplotype clusters (bed file) and lost individual founder alleles in 2006-2014. #Update 21.08.2019; removed the option of printing disconcordant SNPs to outfile, instead counting them afterwards #Update 02.07.2020; analyzing only chromosomes where there are lost segments #This script reads in a list of SNP positions and compares it to a bed file in order to detrmine concordant SNPs - those that fall into regions defined in bed file. #Positions of concordant SNPs are printed to separate file for each founder (ends with .concordantSNPs) and number of SNPs per each 1 Mb haplotype cluster are printed in output file #perl sort_lost_variation_according_to_bed.v3.pl use strict; use warnings; open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; open (FH4, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; #Define all SNP lists that need to be compared my $infile_FF = "D-85-01.alleles.not.in.2006-2014"; my $outfile_FF = "D-85-01.alleles.not.in.2006-2014.SNPresolution.concordantSNPs"; my $infile_FM1 = "FM1-offsp.alleles.not.in.2006-2014"; my $outfile_FM1 = "FM1-offsp.alleles.not.in.2006-2014.SNPresolution.concordantSNPs"; my $infile_FM2 = "FM2-offsp.alleles.not.in.2006-2014"; my $outfile_FM2 = "FM2-offsp.alleles.not.in.2006-2014.SNPresolution.concordantSNPs"; my $FF_concordant_SNPs = ''; my $FM1_concordant_SNPs = ''; my $FM2_concordant_SNPs = ''; while () { next if ($_ =~ /chr/); chomp $_; my @data = split(/\t/,$_); my $chr = $data[0]; my $begc = $data[1]; my $endc = $data[2]; my $founder = $data[3]; my $nr_of_SNPs = ''; next if ( $data[1] == 0 && $data[2] == 0 ); if ($founder =~ /^FF/) { open (FH2, '<', $infile_FF ) or die "Can't open $infile_FF: $!\n"; open (FH3, '>>', $outfile_FF ) or die "Can't open $outfile_FF: $!\n"; print STDOUT "$chr\t$begc\t$endc\t$founder\n"; print STDOUT "Start processing new FF region on chr $chr...\n"; while () { chomp $_; my @snp_data = split(/\t/,$_); my $snp_chr = $snp_data[0]; my $snp_pos = $snp_data[1]; #print STDOUT "$snp_chr\t$chr:$snp_pos\t$begc\t$endc\n"; if ($snp_chr eq $chr && $snp_pos >= $begc && $snp_pos <= $endc) { print FH3 "$snp_chr\t$snp_pos\t$chr:$begc:$endc:$founder\n"; ++$FF_concordant_SNPs; ++$nr_of_SNPs; } } } elsif ($founder =~ /^FM1/) { open (FH2, '<', $infile_FM1 ) or die "Can't open $infile_FM1: $!\n"; open (FH3, '>>', $outfile_FM1 ) or die "Can't open $outfile_FM1: $!\n"; print STDOUT "$chr\t$begc\t$endc\t$founder\n"; print STDOUT "Start processing new FM1 region on chr $chr...\n"; while () { chomp $_; my @snp_data = split(/\t/,$_); my $snp_chr = $snp_data[0]; my $snp_pos = $snp_data[1]; if ($snp_chr eq $chr && $snp_pos >= $begc && $snp_pos <= $endc) { print FH3 "$snp_chr\t$snp_pos\t$chr:$begc:$endc:$founder\n"; ++$FM1_concordant_SNPs; ++$nr_of_SNPs; } } } elsif ($founder =~ /^FM2/) { open (FH2, '<', $infile_FM2 ) or die "Can't open $infile_FM2: $!\n"; open (FH3, '>>', $outfile_FM2 ) or die "Can't open $outfile_FM2: $!\n"; print STDOUT "$chr\t$begc\t$endc\t$founder\n"; print STDOUT "Start processing new FM2 region on chr $chr...\n"; while () { chomp $_; my @snp_data = split(/\t/,$_); my $snp_chr = $snp_data[0]; my $snp_pos = $snp_data[1]; if ($snp_chr eq $chr && $snp_pos >= $begc && $snp_pos <= $endc) { print FH3 "$snp_chr\t$snp_pos\t$chr:$begc:$endc:$founder\n"; ++$FM2_concordant_SNPs; ++$nr_of_SNPs; } } } print FH4 "@data\t$nr_of_SNPs\n"; } print STDOUT "FF_concordant_SNPs\t$FF_concordant_SNPs\nFM1_concordant_SNPs\t$FM1_concordant_SNPs\nFM2_concordant_SNPs\t$FM2_concordant_SNPs\n"; #16# sort_lost_variation_according_to_bed.v4.pl #!usr/bin/perl #Individually tailored script to count the number of concordant SNPs between the lost 1 Mb haplotype clusters (bed file) and lost individual female founder's alleles in 1983-1993. #Update 21.08.2019; removed the option of printing disconcordant SNPs to outfile, instead counting them afterwards #Update 02.07.2020; analyzing only chromosomes where there are lost segments #This script reads in a list of SNP positions and compares it to a bed file in order to detrmine concordant SNPs - those that fall into regions defined in bed file. #Positions of concordant SNPs are printed to separate file for each founder (ends with .concordantSNPs) and number of SNPs per each 1 Mb haplotype cluster are printed in output file #perl sort_lost_variation_according_to_bed.v4.pl use strict; use warnings; open (FH1, '<', $ARGV[0]) or die "Can't open $ARGV[0]: $!\n"; open (FH4, '>', $ARGV[1]) or die "Can't open $ARGV[1]: $!\n"; #Define all SNP lists that need to be compared my $infile_FF = "D-85-01.alleles.not.in.1983-1993"; my $outfile_FF = "D-85-01.alleles.not.in.1983-1993.SNPresolution.concordantSNPs"; my $FF_concordant_SNPs = ''; my $FM1_concordant_SNPs = ''; my $FM2_concordant_SNPs = ''; while () { next if ($_ =~ /chr/); chomp $_; my @data = split(/\t/,$_); my $chr = $data[0]; my $begc = $data[1]; my $endc = $data[2]; my $founder = $data[3]; my $nr_of_SNPs = ''; next if ( $data[1] == 0 && $data[2] == 0 ); if ($founder =~ /^FF/) { open (FH2, '<', $infile_FF ) or die "Can't open $infile_FF: $!\n"; open (FH3, '>>', $outfile_FF ) or die "Can't open $outfile_FF: $!\n"; print STDOUT "$chr\t$begc\t$endc\t$founder\n"; print STDOUT "Start processing new FF region on chr $chr...\n"; while () { chomp $_; my @snp_data = split(/\t/,$_); my $snp_chr = $snp_data[0]; my $snp_pos = $snp_data[1]; #print STDOUT "$snp_chr\t$chr:$snp_pos\t$begc\t$endc\n"; if ($snp_chr eq $chr && $snp_pos >= $begc && $snp_pos <= $endc) { print FH3 "$snp_chr\t$snp_pos\t$chr:$begc:$endc:$founder\n"; ++$FF_concordant_SNPs; ++$nr_of_SNPs; } } } print FH4 "@data\t$nr_of_SNPs\n"; } print STDOUT "FF_concordant_SNPs\t$FF_concordant_SNPs\n";