# Adapter and quality trimming of one paired-end library with Trimmomatic.
#
# Two things matter for this command line to work:
#   * Trimmomatic parses options (-threads, -validatePairs, -baseout, ...) only
#     before the input files, so -baseout has to come ahead of the FASTQ names.
#     Left at the end it is taken as a trimming step, and the first four steps
#     are taken as output file names.
#   * The shell expands "~" only at the start of a word. Inside
#     "ILLUMINACLIP:~/..." it would reach Java as a literal "~", so the adapter
#     path is built from $HOME instead.
#
# With -baseout pop_clean.fastq.gz the paired/unpaired output names are derived
# from that base (pop_clean_1P/1U/2P/2U.fastq.gz per the Trimmomatic manual).
trimmomatic_dir="$HOME/software/Trimmomatic-0.39"
java -jar "${trimmomatic_dir}/trimmomatic-0.39.jar" PE \
  -threads 20 -validatePairs \
  -baseout pop_clean.fastq.gz \
  SRR14766075_1.fastq.gz SRR14766075_2.fastq.gz \
  "ILLUMINACLIP:${trimmomatic_dir}/adapters/TruSeq3-PE-2.fa:2:30:10:8:true" \
  LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:50
# Build the reference indexes for genome.fasta: Picard sequence dictionary,
# BWA index and samtools FASTA index (in that order).
genome_fa=genome.fasta
picard_jar=~/02.software/picard/picard.jar
java -jar "$picard_jar" CreateSequenceDictionary "R=${genome_fa}"
bwa index "$genome_fa"
samtools faidx "$genome_fa"
# Bwa alignment
#
# 1. sample.list : sample<TAB>read1<TAB>read2 for every <sample>_1.fq.gz in ../data
# 2. 1.bwa.sh    : one "bwa mem | samtools sort" command line per sample
# 3. run the command lines with parallel

#######################################
# Print one "sample<TAB>fq1<TAB>fq2" line per *_1.fq.gz file in a directory.
# The sample name is the file name minus the trailing "_1.fq.gz", so sample
# names that themselves contain "_" stay intact.
# Arguments: $1 - directory holding the paired FASTQ files (default: ../data)
# Outputs:   tab-separated sample table on stdout (nothing if no file matches)
#######################################
list_paired_samples() {
  local data_dir=${1:-../data}
  local fq1 sp
  for fq1 in "$data_dir"/*_1.fq.gz; do
    [[ -e "$fq1" ]] || continue   # unmatched glob stays literal: skip it
    sp=${fq1##*/}
    sp=${sp%_1.fq.gz}
    printf '%s\t%s\t%s\n' "$sp" "${data_dir}/${sp}_1.fq.gz" "${data_dir}/${sp}_2.fq.gz"
  done
}

#######################################
# Turn a sample table (stdin: sample fq1 fq2) into alignment commands (stdout).
# The "\t" inside -R is written literally on purpose; bwa mem converts it to a
# TAB when it writes the @RG header line.
#######################################
make_bwa_commands() {
  local sp fq1 fq2
  while read -r sp fq1 fq2; do
    [[ -n "$sp" ]] || continue    # ignore blank lines
    printf '%s\n' "bwa mem -t 8 -R '@RG\tID:${sp}\tSM:${sp}\tPL:illumina' ../ref/genome.fasta ${fq1} ${fq2} 2>${sp}.bwa.log |samtools sort -@ 8 -m 5G -o ${sp}.sort.bam -"
  done
}

list_paired_samples ../data >sample.list
make_bwa_commands <sample.list >1.bwa.sh
# NOTE(review): 20 concurrent jobs, each with 8 bwa threads plus 8 sort threads
# at 5G per sort thread -- confirm the host has the cores and memory for this.
parallel -j 20 < 1.bwa.sh
## Remove duplicates
# Write one Picard MarkDuplicates command per *.sort.bam to 2.picard.sh and run
# them 8 at a time. Each command reads <sample>.sort.bam and writes
# <sample>.sort.markdup.bam plus <sample>.marked_dup_metrics.txt.
mkdir -p tmp   # Java temp dir (-Djava.io.tmpdir); -p so a rerun does not fail
for sorted_bam in *.sort.bam; do
  [[ -e "$sorted_bam" ]] || continue      # no BAMs: leave 2.picard.sh empty
  # Strip only the known suffix so dots inside a sample name are preserved.
  sample_id=${sorted_bam%.sort.bam}
  printf '%s\n' "java -Xmx4g -XX:ParallelGCThreads=2 -Djava.io.tmpdir=./tmp -jar ~/02.software/picard/picard.jar MarkDuplicates I=${sample_id}.sort.bam O=${sample_id}.sort.markdup.bam CREATE_INDEX=true REMOVE_DUPLICATES=true M=${sample_id}.marked_dup_metrics.txt"
done >2.picard.sh
parallel -j 8 < 2.picard.sh

# Stat alignment ratio
# Generate one "samtools flagstat" command per sorted BAM (output goes to
# <bam>.flagstat) and run them 20 at a time.
ls *.sort.bam \
  | awk '{ print "samtools flagstat " $0 " >" $0 ".flagstat" }' >map_ratio_stat.sh
parallel -j 20 < map_ratio_stat.sh
# Same pattern for "samtools coverage" (output goes to <bam>.coverage), using
# the samtools 1.16.1 binary under ~/software.
ls *.sort.bam \
  | awk '{ print "~/software/samtools-1.16.1/samtools coverage " $0 " >" $0 ".coverage" }' >coverage_stat.sh
parallel -j 20 < coverage_stat.sh
### Run GATK
# Prepare the GATK working directory:
#   chr.list        - one contig name per line, taken from the reference FASTA
#   sample_bam.list - sample<TAB>deduplicated BAM, excluding samples whose name
#                     contains "bowfin"
# then run gatk1.sh to generate the per-sample HaplotypeCaller scripts.

#######################################
# Print the contig name of every FASTA header line.
# Only the first whitespace-delimited token after ">" is kept, so headers that
# carry a description (">chr1 some text") still give a single-word name that
# can be passed to "-L" and used in file names.
# Arguments: FASTA file(s); reads stdin when none are given
# Outputs:   one contig name per line on stdout
#######################################
fasta_contig_names() {
  awk '/^>/ { sub(/^>/, "", $1); print $1 }' "$@"
}

# -p keeps a rerun from failing; stop if the directory cannot be entered so
# nothing below runs in the wrong place.
mkdir -p 2.GATK && cd 2.GATK || { echo "cannot enter 2.GATK" >&2; exit 1; }
mkdir -p tmp
fasta_contig_names ../ref/genome.fasta >chr.list
for markdup_bam in ../1.Readalign/*.sort.markdup.bam; do
  [[ -e "$markdup_bam" ]] || continue            # glob matched nothing
  sample_id=${markdup_bam##*/}
  sample_id=${sample_id%.sort.markdup.bam}       # keeps dots inside the name
  [[ "$sample_id" == *bowfin* ]] && continue
  printf '%s\t%s\n' "$sample_id" "../1.Readalign/${sample_id}.sort.markdup.bam"
done >sample_bam.list
bash gatk1.sh
##############
# gatk1.sh
#!/usr/bin/bash
# For every line of sample_bam.list (columns: sample name, BAM path) create a
# directory named after the sample and write step1.<sample>.HaplotypeCaller.sh,
# holding one HaplotypeCaller (-ERC GVCF) command per contig in chr.list.
# Each command writes <sample>/<sample>.<contig>.g.vcf.gz and a matching
# .HC.log. The generated scripts are only written here, not executed.
ref=../ref/genome.fasta
while read -r sample bam; do
	[[ -n "$sample" ]] || continue   # skip blank lines
	mkdir -p "$sample"               # -p: rerunning must not fail on an existing dir
	while read -r chr; do
		[[ -n "$chr" ]] || continue
		printf '%s\n' "gatk --java-options \"-Xmx10g -Djava.io.tmpdir=./tmp\" HaplotypeCaller -R $ref -I $bam -L $chr -ERC GVCF -O $sample/$sample.$chr.g.vcf.gz 1>$sample/$sample.$chr.HC.log 2>&1"
	done <chr.list >"step1.$sample.HaplotypeCaller.sh"
done <sample_bam.list
#####################
bash gatk2.sh
########
# gatk2.sh
#!/usr/bin/bash
# For every contig in chr.list:
#   * gvcf.<contig>.list lists the per-sample GVCFs written by step 1
#   * step2.CombineGVCFs.sh gets one CombineGVCFs command (-> <contig>.g.vcf.gz)
#   * step3.GenotypeGVCFs_gvcf.sh gets one GenotypeGVCFs command
#     (-> <contig>.raw.vcf.gz)
# This section only writes the step2/step3 scripts; it does not run them.
ref=../ref/genome.fasta
# The loop appends to these two files, so empty them first. Otherwise every
# rerun would add a second copy of each command.
: >step2.CombineGVCFs.sh
: >step3.GenotypeGVCFs_gvcf.sh
while read -r chr; do
	[[ -n "$chr" ]] || continue      # skip blank lines
	while read -r sample _; do       # first column of sample_bam.list
		[[ -n "$sample" ]] || continue
		printf '%s\n' "$sample/$sample.$chr.g.vcf.gz"
	done <sample_bam.list >"gvcf.$chr.list"

	printf '%s\n' "gatk --java-options \"-Xmx10g -Djava.io.tmpdir=./tmp\" CombineGVCFs -R $ref -V gvcf.$chr.list -O $chr.g.vcf.gz 1>$chr.CombineGVCFs.log 2>&1" >>step2.CombineGVCFs.sh

	printf '%s\n' "gatk --java-options \"-Xmx10g -Djava.io.tmpdir=./tmp\" GenotypeGVCFs -R $ref -V $chr.g.vcf.gz -O $chr.raw.vcf.gz 1>$chr.GenotypeGVCFs.log 2>&1" >>step3.GenotypeGVCFs_gvcf.sh

done <chr.list
#########################
## combine vcf
# raw_vcf.list names the per-contig <contig>.raw.vcf.gz files in chr.list
# order; MergeVcfs joins them into all.merge_raw.vcf.
while read -r contig _; do
  printf '%s.raw.vcf.gz\n' "$contig"
done >raw_vcf.list <chr.list
gatk --java-options "-Xmx10g -Djava.io.tmpdir=./tmp" MergeVcfs \
  -I raw_vcf.list \
  -O all.merge_raw.vcf
### filtering SNP
# Three passes over the merged call set:
#   1. keep SNP records only                    -> all.raw.snp.vcf
#   2. tag records matching the filter
#      expression with the name "SNP_filter"    -> all.filter.snp.vcf
#   3. drop every tagged record                 -> all.filtered.snp.vcf
snp_ref=../ref/genome.fasta
snp_java_opts="-Xmx4g -Djava.io.tmpdir=./tmp"

gatk --java-options "$snp_java_opts" SelectVariants \
  -R "$snp_ref" \
  -V all.merge_raw.vcf \
  --select-type SNP \
  -O all.raw.snp.vcf

gatk --java-options "$snp_java_opts" VariantFiltration \
  -R "$snp_ref" \
  -V all.raw.snp.vcf \
  --filter-expression "QD < 2.0 || MQ < 40.0 || FS > 60.0 || SOR > 3.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" \
  --filter-name 'SNP_filter' \
  -O all.filter.snp.vcf

gatk --java-options "$snp_java_opts" SelectVariants \
  -R "$snp_ref" \
  -V all.filter.snp.vcf \
  --exclude-filtered \
  -O all.filtered.snp.vcf
## Use vcftools to compute summary statistics on the VCF; the results guide the
## filtering thresholds applied in the next step.
# One vcftools run per statistic, all writing under the "gar" output prefix.
for stat_flag in --depth --site-mean-depth --missing-indv --missing-site --site-quality --het; do
  vcftools --gzvcf gar.vcf.gz "$stat_flag" --out gar
done
# Allele frequencies, limited to sites with at most two alleles.
vcftools --gzvcf gar.vcf.gz --freq2 --out gar --max-alleles 2
### Run statvcf.r on the reports written above
## Filter
# Apply the site/genotype filters in a single vcftools pass and stream the
# recoded VCF through gzip into filtered.vcf.gz.
# NOTE(review): --maf 0.3 is a very strict allele-frequency cut-off -- confirm
# it is intended.
vcftools --gzvcf gar.vcf.gz \
  --remove-indels \
  --maf 0.3 \
  --max-missing 0.9 \
  --minQ 30 \
  --minDP 15 \
  --maxDP 60 \
  --recode --stdout \
  | gzip -c >filtered.vcf.gz
## Plink
# Keep only the variants listed in tmp.ld.prune.in and write them back out as
# VCF under the all.LDfilter prefix. tmp.ld.prune.in must already exist; the
# command that creates it is not part of this section.
plink --vcf ../gar.NEWfiltered.vcf.gz \
  --allow-extra-chr \
  --keep-allele-order \
  --set-missing-var-ids '@:#' \
  --extract tmp.ld.prune.in \
  --recode vcf-iid \
  --out all.LDfilter
## Build tree
# Convert the VCF to an interleaved PHYLIP alignment with TASSEL, then infer
# trees from that alignment with FastTree (out.nwk) and IQ-TREE (iqtreeout.*).
biosoft_dir=~/biosoft
perl "${biosoft_dir}/tassel-5-standalone/run_pipeline.pl" -Xms1G -Xmx5G \
  -importGuess all.LDfilter.sort.vcf \
  -ExportPlugin -saveAs sequences.phy -format Phylip_Inter
"${biosoft_dir}/FastTree" -nt -gtr sequences.phy >out.nwk
"${biosoft_dir}/iqtree-1.6.12-Linux/bin/iqtree" \
  -s sequences.phy \
  -nt 12 \
  -m GTR+ASC \
  -bb 1000 \
  -pre iqtreeout
## LDdecay
# Run PopLDdecay for the samples listed in pop.CW.table, then plot every
# *.stat2.stat.gz result found in the current directory together.
PopLDdecay \
  -InVCF ../4.PopStructure/gar.NEWfiltered.vcf.gz \
  -SubPop pop.CW.table \
  -MaxDist 500 \
  -OutStat CW.stat2
# ld_stat.list: <stat file><TAB><label>; the label is the part of the file name
# before the first dot.
ls *.stat2.stat.gz | awk -F. -v OFS='\t' '{ print $0, $1 }' >ld_stat.list
Plot_MultiPop.pl -inList ld_stat.list -output ld_stat.multi -keepR
# pi
# Windowed nucleotide diversity (100 kb windows, 10 kb step) for the samples
# listed in pop.BD.table.
pop_vcf=../4.PopStructure/gar.NEWfiltered.vcf.gz
vcftools --gzvcf "$pop_vcf" \
  --keep ../5.LDdecay/pop.BD.table \
  --window-pi 100000 \
  --window-pi-step 10000 \
  --out Pi.BD
# Fst
# Windowed Weir Fst between the BD and CW sample lists, with the same window
# size and step.
vcftools --gzvcf "$pop_vcf" \
  --weir-fst-pop ../5.LDdecay/pop.BD.table \
  --weir-fst-pop ../5.LDdecay/pop.CW.table \
  --fst-window-size 100000 \
  --fst-window-step 10000 \
  --out Fst.BD.CW
# ROH
# 1. Convert the VCF to PLINK binary files (Gar_plink.*).
# 2. Detect runs of homozygosity; plink writes new50roh.hom and
#    new50roh.hom.indiv.
# 3. Summarise the results into tab-separated tables:
#      roh.result.txt   - columns 2 and 9 of roh.hom
#      roh100.indiv.txt - IID, NSEG, KB, FAM
#      roh.chr.txt      - IID, summed length of the chromosomes carrying ROH
#      roh.value.txt    - IID, total ROH length (KB * 1000)
#      Froh.txt         - ID, roh, chr, Fam (join of the two files above)
#
# The helpers assume the PLINK 1.9 column layout -- confirm against your build:
#   .hom       : FID IID PHE CHR SNP1 SNP2 POS1 POS2 KB ...
#   .hom.indiv : FID IID PHE NSEG KB KBAVG
# Both files start with a header line, which the helpers skip or relabel.

#######################################
# Family label for a sample ID: IDs starting with BD, CW or EQS lose their
# first run of digits (BD12 -> BD); any other ID is used unchanged.
# This awk snippet is shared by the helpers below.
#######################################
roh_fam_awk='function fam_of(id) { if (id ~ /^(BD|CW|EQS)/) sub(/[0-9]+/, "", id); return id }'

#######################################
# Per-individual summary: IID, NSEG, KB, FAM (header included).
# Arguments: $1 - plink .hom.indiv file
#######################################
roh_indiv_table() {
  awk -v OFS='\t' "$roh_fam_awk"'
    FNR == 1 { print $2, $4, $5, "FAM"; next }
    { print $2, $4, $5, fam_of($2) }
  ' "$1"
}

#######################################
# For each individual, sum the lengths of the distinct chromosomes on which it
# has at least one ROH segment. Prints "IID<TAB>length", one line per
# individual in .hom.indiv order (0 when the individual has no segment).
# Arguments: $1 - FASTA index (.fai: name, length, ...)
#            $2 - plink .hom file
#            $3 - plink .hom.indiv file
#######################################
roh_chr_totals() {
  awk '
    FILENAME == ARGV[1] { chr_len[$1] = $2; next }
    FILENAME == ARGV[2] {
      # count each (individual, chromosome) pair once
      if (FNR > 1 && !(($2, $4) in seen)) {
        seen[$2, $4] = 1
        chr_total[$2] += chr_len[$4]
      }
      next
    }
    FNR > 1 { printf "%s\t%.0f\n", $2, chr_total[$2] }
  ' "$1" "$2" "$3"
}

#######################################
# Total ROH length per individual: "IID<TAB>KB*1000".
# %.0f is used because awk's default number-to-string conversion (%.6g) would
# turn a product that is not exactly integral into e.g. 5.23461e+07.
# Arguments: $1 - plink .hom.indiv file
#######################################
roh_bp_totals() {
  awk 'FNR > 1 { printf "%s\t%.0f\n", $2, $5 * 1000 }' "$1"
}

#######################################
# Join the two per-individual tables line by line into the Froh table.
# Arguments: $1 - output of roh_chr_totals, $2 - output of roh_bp_totals
# Outputs:   header "ID roh chr Fam" followed by one row per individual
#######################################
roh_froh_table() {
  paste "$1" "$2" | awk -v OFS='\t' "$roh_fam_awk"'
    BEGIN { print "ID", "roh", "chr", "Fam" }
    { print $3, $4, $2, fam_of($1) }
  '
}

/hpcdata/clad/wangcheng/biosoft/plink2 --vcf ../8.het/gar.NEWfiltered.vcf \
  --allow-extra-chr --max-alleles 2 --allow-no-sex \
  --make-bed --out Gar_plink

plink --bfile Gar_plink --allow-extra-chr \
  --homozyg \
  --homozyg-density 50 --homozyg-gap 100 --homozyg-kb 100 --homozyg-snp 50 \
  --homozyg-window-het 1 --homozyg-window-snp 50 --homozyg-window-threshold 0.05 \
  --out new50roh

# NOTE(review): this reads roh.hom, while the run above writes new50roh.hom --
# confirm which file is intended.
awk '{print $2"\t"$9}' roh.hom >roh.result.txt
roh_indiv_table new50roh.hom.indiv >roh100.indiv.txt
roh_chr_totals genome.fasta.fai new50roh.hom new50roh.hom.indiv >roh.chr.txt
roh_bp_totals new50roh.hom.indiv >roh.value.txt
roh_froh_table roh.chr.txt roh.value.txt >Froh.txt
# F3
# Run qp3Pop with the parameter file par.PED.EIGENSTRAT.gar and keep its
# standard output in 3pop_qp3pop.
admixtools_bin=~/01.software/AdmixTools-8.0.2/bin
"${admixtools_bin}/qp3Pop" -p par.PED.EIGENSTRAT.gar >3pop_qp3pop

# Momi2
# Split gar.vcf with SnpSift, then build the BED regions used for allele
# counting: EQS.chr.bed minus EQS.repeat.bed, minus EQS.cds.bed.
snpsift_java=~/software/jdk-21.0.5/bin/java
"$snpsift_java" -jar ~/software/snpEff/SnpSift.jar split gar.vcf
bedtools subtract -a EQS.chr.bed -b EQS.repeat.bed >norepeat.bed
bedtools subtract -a norepeat.bed -b EQS.cds.bed >norepeat_cds.bed
# One BED file per chromosome, chr1 .. chr28.
for ((chr_no = 1; chr_no <= 28; chr_no++)); do
  grep -w "chr${chr_no}" norepeat_cds.bed >"chr${chr_no}.bed"
done
### Compute allele counts
# One command line per chromosome: compress and index the split VCF (expected
# under vcf/), then run momi.read_vcf restricted to that chromosome's BED file.
for chr_no in {1..28}; do
  printf '%s\n' "bgzip vcf/gar.chr${chr_no}.vcf;tabix vcf/gar.chr${chr_no}.vcf.gz;python3 -m momi.read_vcf vcf/gar.chr${chr_no}.vcf.gz pop.txt gar.chr${chr_no}.snpAlleleCounts.gz --bed chr${chr_no}.bed --no_aa"
done >momi1.sh
parallel -j 6 < momi1.sh
## Extract combined SFS
### Combine the SFS across the per-chromosome files and split it into 100
### equally sized blocks for jackknifing and bootstrapping.
python3 -m momi.extract_sfs sfs.gz 100 gar.chr*.snpAlleleCounts.gz


