#!/bin/bash
#
# SUPPLEMENT
#
# analysis code
# uses seqtk
# SK


# abort as soon as a command exits with a non-zero status
set -o errexit

# remember the directory the script was started from;
# the sections below change into sub-directories and return here afterwards
WD="${PWD}"


##############################
# data
##############################
#
# input locations - adjust these to the local setup

# reads: nanopore (WT / SL2) and Illumina (WT / SL2), below the working directory
WTREADS="${WD}/reads/nanopore/2017-09-05_coronavirus_WT.rna.fastq"
SL2READS="${WD}/reads/nanopore/2017-09-29_coronavirus_SL2.rna.fastq"
WTILLU="${WD}/reads/illumina/160_WT_II_p21_TGACCA_L004_R1_001.fastq"
SL2ILLU="${WD}/reads/illumina/SL2_SARS_II_p12pool_ACTGAT_L004_R1_001.fastq"

# genomes
# HCoV:
# WT:    WT_229E_reference.fa
# SL2:   WT_229E_SL2_SARS.fa
# S.cerev. enolase 2 mRNA / ONT calibration strand:  ENO2.fa
ENO2_REF="${WD}/ENO2.fa"
# Homo sapiens:  set a path to the Ensembl release 93 primary assembly fasta
HSA_REF="/genomes/homo_sapiens_done/ensembl-release-93/Homo_sapiens.GRCh38.dna.primary_assembly.fa"


##############################
# reference mappings
##############################
# needs genomes (looked up relative to the current directory):
# WT:    WT_229E_reference.fa
# SL2:   WT_229E_SL2_SARS.fa
#
# minimap2:  -ax splice  spliced-alignment preset, SAM output
#            -k14        k-mer size 14
#            -un         no attempt to match GT-AG splice signals
# samtools view -hF4 keeps the header and drops unmapped reads (flag bit 4)
#
# pipefail is enabled for these two pipelines only:  with plain `set -e` a
# failing minimap2 goes unnoticed as long as samtools exits 0, leaving a
# truncated SAM file behind.  It is switched off again afterwards so the rest
# of the script keeps its original semantics.
set -o pipefail
minimap2 -ax splice -k14 -un WT_229E_reference.fa "$WTREADS" | samtools view -hF4 > WT.sam
minimap2 -ax splice -k14 -un WT_229E_SL2_SARS.fa "$SL2READS" | samtools view -hF4 > SL2.sam
set +o pipefail
# convert to bam and index
./misc_scripts/bamify WT.sam SL2.sam



##############################
# coverage plot
##############################
cd covplot
# get data in pydict (covdata.Pydict is handed to the plotting step below)
python3 plotCoverageExons.py -P covdata.Pydict ../WT.bam ../SL2.bam
# plot it
python3 plot_coverage_annot.py covdata.Pydict
# quoted so that a working directory containing whitespace still works
cd "$WD"



##############################
# consensus sequence with ococo and reference
##############################
# with ococo callable from environment
# inputs:   filtered alignments (../*.sam) and the matching reference (-f)
# outputs:  one VCF (-V) and one consensus FASTA (-F) per sample, called with -t majority
cd consensus
ococo -i ../WT.sam -f ../WT_229E_reference.fa -V WT_ref_ococo.vcf -F WT_ref_ococo.fa -t majority
ococo -i ../SL2.sam -f ../WT_229E_SL2_SARS.fa -V SL2_ref_ococo.vcf -F SL2_ref_ococo.fa -t majority
# quoted so that a working directory containing whitespace still works
cd "$WD"



##############################
# HG-CoLoR
##############################
# needs to be installed in ./hgcolor/HG-CoLoR/
cd hgcolor

# prepare 20k+ reads
# awk program shared by both samples:  reads the FASTQ as 4-line records
# (header, sequence, '+' line, qualities) and prints only the records whose
# sequence line is at least 20000 characters long.
# NOTE: assumes strictly 4 lines per record (no wrapped sequences).
MIN20K_FILTER='BEGIN {OFS = "\n"} {header = $0 ; getline seq ; getline qheader ; getline qseq ; if (length(seq) >= 20000) {print header, seq, qheader, qseq}}'
awk "$MIN20K_FILTER" < "$WTREADS" > WT_min20k.fastq
awk "$MIN20K_FILTER" < "$SL2READS" > SL2_min20k.fastq
# FASTQ -> FASTA
seqtk seq -A WT_min20k.fastq > WT_min20k.fa
seqtk seq -A SL2_min20k.fastq > SL2_min20k.fa
# RNA -> DNA alphabet (U -> T);  header lines ('>') are skipped so that read
# names containing an upper-case U are not altered
sed -i '/^>/!s/U/T/g' WT_min20k.fa
sed -i '/^>/!s/U/T/g' SL2_min20k.fa

# run:  long reads of each sample together with the Illumina reads of the same sample
./HG-CoLoR/HG-CoLoR -j 40 --maxorder 50 --longreads WT_min20k.fa \
  --shortreads "$WTILLU" --out WT_min20k_hgc.fa --tmpdir tmp
./HG-CoLoR/HG-CoLoR -j 40 --maxorder 50 --longreads SL2_min20k.fa \
  --shortreads "$SL2ILLU" --out SL2_min20k_hgc.fa --tmpdir tmp

cd "$WD"



##############################
# raw data squiggle plot
##############################
cd squiggle
# random chosen sample read:  sampleread_mRNA7_WT.fast5
python3 plot_squiggle.py sampleread_mRNA7_WT.fast5
# quoted so that a working directory containing whitespace still works
cd "$WD"



##############################
# read origin
##############################
cd origin
HCOV_REF_WT='../WT_229E_reference.fa'
HCOV_REF_SL2='../WT_229E_SL2_SARS.fa'
# reference sequence name of HCoV as it is matched in SAM field 3 below
HCOV_ID='gi|12082738|gb|AF304460.1|_HCoV_229E'


# map to HCoV variant, yeast enolase 2, human concatenated
# (the three references are concatenated on the fly via process substitution)
minimap2 -ax splice -k14 -un <(cat "$HCOV_REF_WT" "$ENO2_REF" "$HSA_REF") "$WTREADS" > all_WT.sam
minimap2 -ax splice -k14 -un <(cat "$HCOV_REF_SL2" "$ENO2_REF" "$HSA_REF") "$SL2READS" > all_SL2.sam

# select only primary (best) alignment for every read
# field 2 (flags) should be only 0 or 16:  primary alignment forward or reverse
# field 3 is the reference sequence name
# SAM header lines do not pass these filters, so the output files are header-less
# HCoV
awk -v ref="$HCOV_ID" '$3==ref && ($2=="0" || $2=="16")' all_WT.sam > HCoV_WT.sam
awk -v ref="$HCOV_ID" '$3==ref && ($2=="0" || $2=="16")' all_SL2.sam > HCoV_SL2.sam

# ENO2
awk '$3=="ENO2" && ($2=="0" || $2=="16")' all_WT.sam > ENO2_WT.sam
awk '$3=="ENO2" && ($2=="0" || $2=="16")' all_SL2.sam > ENO2_SL2.sam

# H.sapiens:  everything that is neither HCoV nor ENO2
awk -v ref="$HCOV_ID" '($3!=ref && $3!="ENO2") && ($2=="0" || $2=="16")' all_WT.sam > HSapi_WT.sam
awk -v ref="$HCOV_ID" '($3!=ref && $3!="ENO2") && ($2=="0" || $2=="16")' all_SL2.sam > HSapi_SL2.sam

# unmapped, 4 bit set in flags
# the bit is tested arithmetically instead of with and(), which is a gawk
# extension and not available in other awk implementations such as mawk
awk 'int($2/4)%2==1' all_WT.sam > unmapped_WT.sam
awk 'int($2/4)%2==1' all_SL2.sam > unmapped_SL2.sam

# check that the number of reads is correct
# (the numbers are only printed for manual inspection, nothing is enforced)
WT_PARTS=(HCoV_WT.sam ENO2_WT.sam HSapi_WT.sam unmapped_WT.sam)
SL2_PARTS=(HCoV_SL2.sam ENO2_SL2.sam HSapi_SL2.sam unmapped_SL2.sam)
wc -l "${WT_PARTS[@]}"
wc -l "${SL2_PARTS[@]}"
# per file:  name, number of alignment lines, number of distinct read ids
for F in *.sam; do
  N_LINES=$(grep -v '^@' "$F" | cut -f1 | wc -l)
  N_IDS=$(grep -v '^@' "$F" | cut -f1 | sort | uniq | wc -l)
  echo "$F" "$N_LINES" "$N_IDS"
done
# check for duplicates:  per sample, total vs. distinct read ids over the four
# categories;  both numbers are equal if no read ended up in two categories
echo WT "$(cut -f1 "${WT_PARTS[@]}" | wc -l)" "$(cut -f1 "${WT_PARTS[@]}" | sort | uniq | wc -l)"
echo SL2 "$(cut -f1 "${SL2_PARTS[@]}" | wc -l)" "$(cut -f1 "${SL2_PARTS[@]}" | sort | uniq | wc -l)"

# get general stats (shown on the terminal and kept in stats_*.txt)
../misc_scripts/read_stats.py "$WTREADS" all_WT.sam "${WT_PARTS[@]}" | tee stats_WT.txt
../misc_scripts/read_stats.py "$SL2READS" all_SL2.sam "${SL2_PARTS[@]}" | tee stats_SL2.txt

# get read ids:  one sorted, de-duplicated id list (<name>.ids) per SAM file
for F in *.sam; do
  grep -v '^@' "$F" | cut -f1 | sort | uniq > "${F%.sam}.ids"
done

# restplot
# mkdir and cd are separate commands on purpose:  in `mkdir X && cd X` a failing
# mkdir (e.g. directory already exists) is NOT caught by `set -e`, the cd is
# skipped and the script silently continues in the wrong directory.
# -p makes the step repeatable when the directory is already present.
mkdir -p restplot
cd restplot
# arguments per sample:  read file, then the id lists all / ENO2 / HSapi / HCoV / unmapped
python3 restplot.py "$WTREADS" ../all_WT.ids ../ENO2_WT.ids ../HSapi_WT.ids ../HCoV_WT.ids ../unmapped_WT.ids "$SL2READS" ../all_SL2.ids ../ENO2_SL2.ids ../HSapi_SL2.ids ../HCoV_SL2.ids ../unmapped_SL2.ids
cd ..


# BLAST it against nt
# mkdir and cd are separate commands on purpose:  in `mkdir X && cd X` a failing
# mkdir (e.g. directory already exists) is NOT caught by `set -e`, the cd is
# skipped and the script silently continues in the wrong directory.
mkdir -p blast
cd blast
# get sequences of the unmapped reads as FASTA
seqtk subseq "$WTREADS" ../unmapped_WT.ids | seqtk seq -A > unmapped_WT.fa
seqtk subseq "$SL2READS" ../unmapped_SL2.ids | seqtk seq -A > unmapped_SL2.fa
# run
blastn -version # 2.5.0
# put path to blast DB here:  NCBI nt downloaded on 2018-07-25
BLAST_DB_NT='/database/ncbi_nt_2018-07-25/nt'
# tabular output (format 6):  standard columns plus taxonomy id, scientific and common name
nice blastn -num_threads 20 -db "$BLAST_DB_NT" -outfmt="6 std staxids ssciname scomname" -query unmapped_WT.fa -out unmapped_WT.blastout
nice blastn -num_threads 20 -db "$BLAST_DB_NT" -outfmt="6 std staxids ssciname scomname" -query unmapped_SL2.fa -out unmapped_SL2.blastout

# get NCBI taxonomy names (downloaded on 2019-05-21)
# wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
# tar xf taxdump.tar.gz
# rm taxdump.tar.gz readme.txt nodes.dmp merged.dmp gencode.dmp gc.prt division.dmp delnodes.dmp citations.dmp
# => names.dmp remains
# only decompress when names.dmp is not there yet, so a rerun does not abort here
if [[ ! -f names.dmp ]]; then
  gunzip names.dmp.gz
fi

# high confidence hits <= 1e-20
# low confidence hits  <= 1e-10
# rest > 1e-10
python3 parse_blast_output.py names.dmp unmapped_WT.blastout > taxid_counts_WT.csv
python3 parse_blast_output.py names.dmp unmapped_SL2.blastout > taxid_counts_SL2.csv


cd "$WD"



##############################
# raw read length & aligned read length
##############################
cd readlength
# get reads that mapped to HCoV (selected by the id lists written in ../origin)
seqtk subseq "$WTREADS" ../origin/HCoV_WT.ids > aligned_WT.fastq
seqtk subseq "$SL2READS" ../origin/HCoV_SL2.ids > aligned_SL2.fastq
# raw
python3 raw_readlength.py aligned_WT.fastq aligned_SL2.fastq

# aligned
python3 new_distrib_rev01.py ../WT.bam ../SL2.bam
# quoted so that a working directory containing whitespace still works
cd "$WD"



##############################
# error rates
##############################
cd errorrates

# re-add header to alignments
# the per-origin SAM files in ../origin are header-less;  take the header of the
# combined mapping and prepend it.  [EH]* matches the ENO2_*, HCoV_* and
# HSapi_* files;  the result is written under the same file name into this directory.
samtools view -H ../origin/all_WT.sam > all_WT.header
samtools view -H ../origin/all_SL2.sam > all_SL2.header
for F in ../origin/[EH]*WT.sam; do
  cat all_WT.header "$F" > "${F##*/}"
done
for F in ../origin/[EH]*SL2.sam; do
  cat all_SL2.header "$F" > "${F##*/}"
done

# run indel correction script to output error statistics, but suppress output of corrected reads
# (stdout is discarded, stderr is kept as <name>_indelcorr.log)

# HCoV_229E
# take care:  the SL2 sample uses a different reference
./correct_indels.py HCoV_WT.sam ../WT_229E_reference.fa > /dev/null 2> HCoV_WT_indelcorr.log
./correct_indels.py HCoV_SL2.sam ../WT_229E_SL2_SARS.fa > /dev/null 2> HCoV_SL2_indelcorr.log

# S.Cerev. enolase 2
./correct_indels.py ENO2_WT.sam "$ENO2_REF" > /dev/null 2> ENO2_WT_indelcorr.log
./correct_indels.py ENO2_SL2.sam "$ENO2_REF" > /dev/null 2> ENO2_SL2_indelcorr.log

# H. sapiens
# reference:  $HSA_REF
./correct_indels.py HSapi_WT.sam "$HSA_REF" > /dev/null 2> HSapi_WT_indelcorr.log
./correct_indels.py HSapi_SL2.sam "$HSA_REF" > /dev/null 2> HSapi_SL2_indelcorr.log

cd "$WD"



##############################
# HCoV mRNA numbers
##############################
cd mrnas
# canonical mRNA sequences
# WT:   mRNAs_noA.fa
# SL2:  exchange SL2 sequence in leader
#       (first occurrence per line of TTAGACTTTGTGTCTACT -> TGATCTCTTGTAGATCCT)
cp mRNAs_noA.fa SL2_mRNAs_noA.fa
sed -i 's/TTAGACTTTGTGTCTACT/TGATCTCTTGTAGATCCT/' SL2_mRNAs_noA.fa

# map to mRNAs / select only primary forward alignments
# samtools view -F4095 drops every record that has any of the twelve flag bits
# set, i.e. only records with flag 0 remain.
# pipefail is enabled for these two pipelines only:  with plain `set -e` a
# failing minimap2 goes unnoticed as long as samtools exits 0.  It is switched
# off again afterwards so the rest of the script keeps its original semantics.
set -o pipefail
minimap2 -ax map-ont -k14 mRNAs_noA.fa "$WTREADS" | samtools view -hF4095 > WT_mRNAs_noA.sam
minimap2 -ax map-ont -k14 SL2_mRNAs_noA.fa "$SL2READS" | samtools view -hF4095 > SL2_mRNAs_noA.sam
set +o pipefail

# count mRNAs
python3 mrna_analysis.py WT_mRNAs_noA.sam > counts_mRNA_WT.csv
python3 mrna_analysis.py SL2_mRNAs_noA.sam > counts_mRNA_SL2.csv

# plot
python3 plot_mrna_counts.py counts_mRNA_WT.csv counts_mRNA_SL2.csv

cd "$WD"



##############################
# homopolymer deletion stats
##############################
cd homopolymer

# get data
# input:  the SAM files with header from ../errorrates and the matching reference;
# stdout is discarded, stderr is kept as <name>.stats
python3 homopolymer.py ../errorrates/HCoV_WT.sam ../WT_229E_reference.fa > /dev/null 2> HCoV_WT.stats
python3 homopolymer.py ../errorrates/HCoV_SL2.sam ../WT_229E_SL2_SARS.fa > /dev/null 2> HCoV_SL2.stats
python3 homopolymer.py ../errorrates/ENO2_WT.sam "$ENO2_REF" > /dev/null 2> ENO2_WT.stats
python3 homopolymer.py ../errorrates/ENO2_SL2.sam "$ENO2_REF" > /dev/null 2> ENO2_SL2.stats
python3 homopolymer.py ../errorrates/HSapi_WT.sam "$HSA_REF" > /dev/null 2> HSapi_WT.stats
python3 homopolymer.py ../errorrates/HSapi_SL2.sam "$HSA_REF" > /dev/null 2> HSapi_SL2.stats
# plot it
python3 plot_homopolymer_stats.py ENO2_WT.stats ENO2_SL2.stats HSapi_WT.stats HSapi_SL2.stats HCoV_WT.stats HCoV_SL2.stats
cd "$WD"


##############################
# substitutions vs modified bases
##############################

cd modsubs
# inputs expected in this directory:
# mRNAs reference:         mRNAs_noA.fa
# reads aligned to mRNA2:  WT_mRNAs_noA.sam
# tombo result for mRNA2:  tombo_result_S_mRNA2.wig
# run
python3 modsubs.py tombo_result_S_mRNA2.wig mRNAs_noA.fa WT_mRNAs_noA.sam

# quoted so that a working directory containing whitespace still works
cd "$WD"


##############################
# 5' basecalling errors
##############################
cd fiveprime
# parse and plot, once per sample with its own reference
python3 fiveprime_errors.py ../WT.bam ../WT_229E_reference.fa
python3 fiveprime_errors.py ../SL2.bam ../WT_229E_SL2_SARS.fa

# quoted so that a working directory containing whitespace still works
cd "$WD"


##############################
# mapping to human transcriptome
##############################

cd hsa_transcriptome
# put path to transcriptome assembly here
TRANS='/assemblies/hsa/trinity/Trinity.fasta'

# unspliced mapping, only keep aligned reads
# (samtools view -hF4 keeps the header and drops unmapped reads)
# pipefail is enabled for these two pipelines only:  with plain `set -e` a
# failing minimap2 goes unnoticed as long as samtools exits 0.  It is switched
# off again afterwards because grep in the counting pipelines below exits 1
# when it selects no line.
set -o pipefail
minimap2 -ax map-ont -k14 "$TRANS" "$WTREADS" | samtools view -hF4 > WT_trans.sam
minimap2 -ax map-ont -k14 "$TRANS" "$SL2READS" | samtools view -hF4 > SL2_trans.sam
set +o pipefail
# number of aligned reads (distinct read ids, header lines excluded)
grep -v '^@' WT_trans.sam | cut -f1 | sort | uniq | wc -l
# 97414
grep -v '^@' SL2_trans.sam | cut -f1 | sort | uniq | wc -l
# 87943

cd "$WD"