# Prepare reference genomes
Download reference genomes and annotations from NCBI and Ensembl, then sort and index them for downstream tools.

## calliptera
```sh
# Working directory for the NCBI copy of the fAstCal1.2 assembly.
# Only change into it if it could actually be created.
mkdir -p genome/astCal1.2/ && cd genome/astCal1.2/

# download from NCBI: genome FASTA plus GTF and GFF annotation
# --fail     exit non-zero on an HTTP error instead of saving the error page under a .gz name
# --location follow redirects
ncbi_prefix='https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/246/225/GCF_900246225.1_fAstCal1.2/GCF_900246225.1_fAstCal1.2_genomic'
curl --fail --location "${ncbi_prefix}.fna.gz" -o astCal_v1.2.fa.gz
curl --fail --location "${ncbi_prefix}.gtf.gz" -o astCal_v1.2.gtf.gz
curl --fail --location "${ncbi_prefix}.gff.gz" -o astCal_v1.2.gff.gz

# genome file indexing
# decompress the download, recompress it with bgzip and index the bgzip-compressed FASTA
gunzip astCal_v1.2.fa.gz
bgzip -c astCal_v1.2.fa > astCal_v1.2.fa.gz # keep uncompressed file too
samtools faidx astCal_v1.2.fa.gz

# GTF sorting and indexing
# drop '#' comment lines, then order records by sequence name (column 1) and numeric start (column 4)
gunzip -c astCal_v1.2.gtf.gz | grep -v '^#' | sort -k1,1 -k4,4n | bgzip > annotation.gtf.gz
tabix -p gff annotation.gtf.gz

# GTF sorting and indexing (ensembl)
# astCal_v1.2.gtf.ensembl.gz is not downloaded by any command in this section and
# must already be present in this directory. Without it the pipeline would still
# write an empty annotation_ensembl.gtf.gz, so skip the step with a warning instead.
if [ -f astCal_v1.2.gtf.ensembl.gz ]; then
  gunzip -c astCal_v1.2.gtf.ensembl.gz | grep -v '^#' | sort -k1,1 -k4,4n | bgzip > annotation_ensembl.gtf.gz
  tabix -p gff annotation_ensembl.gtf.gz
else
  echo 'astCal_v1.2.gtf.ensembl.gz not found: skipping Ensembl GTF indexing' >&2
fi

# count chromosome sizes
# the first two columns of the faidx index are written out as chrom.sizes
cut -f1,2 astCal_v1.2.fa.gz.fai > chrom.sizes

# index for minimap alignment (built with the map-pb preset)
minimap2 -x map-pb -d astCal_v1.2.fa.gz.mmi astCal_v1.2.fa.gz
```

## zebra
```sh
# Working directory for the NCBI copy of the M_zebra_UMD2a assembly.
# Only change into it if it could actually be created.
mkdir -p genome/mayZeb2.0/ && cd genome/mayZeb2.0/

# download from NCBI: genome FASTA, GTF and GFF annotation, and the assembly report
# --fail     exit non-zero on an HTTP error instead of saving the error page under the output name
# --location follow redirects
ncbi_prefix='https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/238/955/GCF_000238955.4_M_zebra_UMD2a/GCF_000238955.4_M_zebra_UMD2a'
curl --fail --location "${ncbi_prefix}_genomic.fna.gz" -o mayZeb_v2.0.fa.gz
curl --fail --location "${ncbi_prefix}_genomic.gtf.gz" -o mayZeb_v2.0.gtf.gz
curl --fail --location "${ncbi_prefix}_genomic.gff.gz" -o mayZeb_v2.0.gff.gz
curl --fail --location "${ncbi_prefix}_assembly_report.txt" -o assembly_report.txt

# genome file indexing
# decompress the download, recompress it with bgzip (the uncompressed copy stays on disk)
# and index the bgzip-compressed FASTA
gunzip mayZeb_v2.0.fa.gz
bgzip -c mayZeb_v2.0.fa > mayZeb_v2.0.fa.gz
samtools faidx mayZeb_v2.0.fa.gz

# GTF sorting and indexing
# drop '#' comment lines, then order records by sequence name (column 1) and numeric start (column 4)
gunzip -c mayZeb_v2.0.gtf.gz | grep -v '^#' | sort -k1,1 -k4,4n | bgzip > annotation.gtf.gz
tabix -p gff annotation.gtf.gz

# count chromosome sizes
# the first two columns of the faidx index are written out as chrom.sizes
cut -f1,2 mayZeb_v2.0.fa.gz.fai > chrom.sizes

# index for minimap alignment (built with the map-pb preset)
minimap2 -x map-pb -d mayZeb_v2.0.fa.gz.mmi mayZeb_v2.0.fa.gz
```

## ensembl versions
Download genomes and annotations.
```sh
# Ensembl release 103 genome FASTA, GTF and GFF3 for both species.
# Each download is written INTO the directory created for it; with a bare
# file name after -o, curl would drop everything into the current directory
# and leave the new directories empty.
# --fail     exit non-zero on an HTTP error instead of saving the error page under a .gz name
# --location follow redirects
# Run any follow-up commands on these files from inside the matching directory.
ensembl_release='http://ftp.ensembl.org/pub/release-103'

astcal_dir='genome/astCal1.2_ensembl'
mkdir -p "${astcal_dir}/"
curl --fail --location "${ensembl_release}/fasta/astatotilapia_calliptera/dna/Astatotilapia_calliptera.fAstCal1.2.dna.toplevel.fa.gz" -o "${astcal_dir}/astCal_v1.2.fa.gz"
curl --fail --location "${ensembl_release}/gtf/astatotilapia_calliptera/Astatotilapia_calliptera.fAstCal1.2.103.gtf.gz" -o "${astcal_dir}/astCal_v1.2.gtf.gz"
curl --fail --location "${ensembl_release}/gff3/astatotilapia_calliptera/Astatotilapia_calliptera.fAstCal1.2.103.gff3.gz" -o "${astcal_dir}/astCal_v1.2.gff.gz"

mayzeb_dir='genome/mayZeb2.0_ensembl'
mkdir -p "${mayzeb_dir}/"
curl --fail --location "${ensembl_release}/fasta/maylandia_zebra/dna/Maylandia_zebra.M_zebra_UMD2a.dna.toplevel.fa.gz" -o "${mayzeb_dir}/mayZeb_v2.0.fa.gz"
curl --fail --location "${ensembl_release}/gtf/maylandia_zebra/Maylandia_zebra.M_zebra_UMD2a.103.gtf.gz" -o "${mayzeb_dir}/mayZeb_v2.0.gtf.gz"
curl --fail --location "${ensembl_release}/gff3/maylandia_zebra/Maylandia_zebra.M_zebra_UMD2a.103.gff3.gz" -o "${mayzeb_dir}/mayZeb_v2.0.gff.gz"
```

Then, create sorted and indexed versions of the GFF and GTF for viewing in IGV.
```sh
# Unpack the GFF to a temporary plain-text copy so it can be re-ordered.
gunzip -c astCal_v1.2.gff.gz > annotation_unsorted.gff
# Sort on the command line (instead of interactively in IGV): keep the '#'
# header/directive lines at the top, then order the feature lines by
# sequence name (column 1) and numeric start (column 4).
(grep '^#' annotation_unsorted.gff; grep -v '^#' annotation_unsorted.gff | sort -k1,1 -k4,4n) > annotation.gff
# Block-compress (produces annotation.gff.gz) and build the tabix index,
# stating the GFF preset explicitly.
bgzip annotation.gff
tabix -p gff annotation.gff.gz
# The temporary unsorted copy is no longer needed.
rm annotation_unsorted.gff
```

Also run the indexing and sorting commands from above.

### additional things
```sh
# list the sequence names (column 1) of the Ensembl GTF in file order,
# collapsing consecutive repeats
# '^#' drops only comment lines; an unanchored "#" would also drop feature
# lines that merely contain a '#' somewhere in their attributes
gunzip -c mayZeb2.0_ensembl.gtf.gz | grep -v '^#' | cut -f1 | uniq

# assembly report and aliases between different IDs
# The report is tab-separated and values may contain spaces, so split on tabs
# only (-F'\t'); whitespace splitting would shift the column numbers on such rows.
# Columns used: 1, 5 and 7 - presumably Sequence-Name, GenBank accession and
# RefSeq accession; check against the '# Sequence-Name ...' header line of the report.
awk -F'\t' -v OFS='\t' '!/^#/ { print $1, $7 }' assembly_report.txt > alias_num_ncbi.txt
awk -F'\t' -v OFS='\t' '!/^#/ { print $1, $5 }' assembly_report.txt > alias_num_ensembl.txt
awk -F'\t' -v OFS='\t' '!/^#/ { print $5, $7 }' assembly_report.txt > alias_ensembl_ncbi.txt
```

## Rename Malawi chromosomes
For `minigraph` genome graphs, the genomes need to be renamed so that contig names are not repeated across assemblies. As an additional step, I manually filter out scaffolds smaller than 10,000 bp using `vim`.

```sh
cd genome/longread-decompress/
# Prefix every FASTA header line ('>' at line start) with an assembly-specific
# tag. Each row below is "<tag> <assembly>": <assembly>.ref.fa is read and
# <assembly>.renamed.fa is written, with headers starting '><tag>_'.
while read -r tag assembly; do
  sed "/^>/ s/>/>${tag}_/" "${assembly}.ref.fa" > "${assembly}.renamed.fa"
done <<'EOF'
astCal astCal1.2
aulStu aulStu5.0
mayZeb mayZeb2.0
rhaChi rhaChi1.0
troMau troMau2.0
copChr copChr1.0
otoArg otoArg1.0
rhaChiTwo rhaChi2.0
astCalONT astCal1.2_ONT
EOF
```