#!/bin/bash
# This script uses bash-only constructs ([[ ]], let, pushd/popd, $'\t'),
# so it must be interpreted by bash rather than a POSIX /bin/sh.

#SBATCH --job-name=Pipeline
#SBATCH --mail-type=END,FAIL
#SBATCH --mail-user=kecostello@coh.org
#SBATCH -n 16
#SBATCH -N 1-1
#SBATCH --mem=100G
#SBATCH --time=24:00:00
#SBATCH --output=active.log


#=============================================#
#  Transposable Element Derived RNA analysis  # 
#=============================================#


### Run configuration ###
# Reference genome build and gene annotation source; both are used to build
# paths under Reference/<species>/.
species='hg19'
geneDataBase='refseq'

# Aligner selector: "STAR" for multimapping, "HIsat2" for unique mapping.
aligner='HIsat2'

### Absolute paths to the external tools ###
samtools='/opt/SAMtools/1.6/bin/samtools'
stringtie='/opt/stringtie/1.3.4d/stringtie'
bedtools='/opt/bedtools/2.27.1-sys/bin/bedtools'
STAR='/opt/STAR/2.6.0.a/bin/Linux_x86_64/STAR'
hisat2='/opt/easybuild/software/HISAT2/2.1.0-foss-2017a/bin/hisat2'

# Python used: 2.7.14

# Parallelism handed to the aligner wrapper scripts and to samtools (-@).
# If unknown leave "1".
nodes=16

### The only argument is the sample list (<project>.list); refuse to start without it ###
if [[ $# -lt 1 || ! -r $1 ]]; then
    echo "Usage: $0 <project>.list  (the sample list must exist and be readable)" >&2
    exit 1
fi

# The project name is the list file name without its ".list" suffix.
# Every path in the pipeline is anchored at the directory the job started in.
# Both are set before the banner so that $project expands inside it.
project=$(basename "$1" .list)
dir=$(pwd)

echo " 
    ___________        ___________
  /|           |     /|           |
 | |  o     o  |    | |  o        |
 | |           |    | |     o     |
 | |  o     o  |    | |        o  |
 | |___________|    | |___________|
 |/___________/     |/___________/


This is the TE-Derived RNA Analysis

Requires at least one replicate for the control
Remember to change project name and species to match experiment

Good luck 


Running TE-Derived RNA Analysis for $project with $aligner $species $geneDataBase

"

module load Python/2.7.14-foss-2017a

### Create the project directory and its scratch area (run-space) ###
# run-space holds temporary scripts, name lists and "signal-flare" files;
# the clean-up step at the end of the pipeline deletes it.
# -p makes this safe on reruns and when projects/ does not exist yet.
run_space="$dir/projects/$project/run-space"
mkdir -p "$run_space" || { echo "cannot create $run_space" >&2; exit 1; }
# NOTE(review): the pause is kept from the original; presumably it lets a
# shared filesystem catch up before the directory is used -- confirm.
sleep 2

# The BAM lists are only ever appended to, so drop any left behind by an
# interrupted earlier run to avoid listing the same BAM twice.
rm -f "$run_space/control-bam.txt" "$run_space/treatment-bam.txt"

### Seed the edgeR script and split the sample list ###
# Sample list columns as read below: 1 = sample name, 2 = group
# (1 = control, 2 = treatment), 3 = first reads file, 4 = second reads file.
cat "$dir/scripts/top-edgeR.r" > "$run_space/edgeR.r"
echo "colnames(dat) <- c(" > "$run_space/edgeR-name-list.txt"
awk '{print $1}' "$1" > "$run_space/name-list"
awk '{if ($2 == 2) print $1}' "$1" > "$run_space/test-name-list"
awk '{if ($2 == 1) print $1}' "$1" > "$run_space/control-name-list"
awk '{if ($2 == 2) print $4}' "$1" > "$run_space/test-read2"
awk '{if ($2 == 1) print $4}' "$1" > "$run_space/control-read2"

# tot counts the samples handed to the aligner; later wait loops compare it
# with the number of lines in the signal-flare files.
tot=0
# Column count of the first row: 4 selects paired-end, 3 single-end alignment.
num=$(awk '{print NF; exit}' "$1")
# First line of the alignment flare file; each finished sample adds one more.
echo first > "$run_space/signal-flare-bam"

#===========#
# Alignment #
#===========#

### Performs alignments (HISAT2 or STAR, per $aligner) for FASTQs if not already done and customizes temporary scripts for later use ###

# The loop below changes directory, so anchor a relative sample list at $dir
# while still accepting an absolute path.
case "$1" in
    /*) sample_list=$1 ;;
    *)  sample_list="$dir/$1" ;;
esac

#######################################
# Align every sample of one group and record it for the later steps.
# Globals:
#   dir, project, species, geneDataBase, aligner, num, nodes,
#   hisat2, STAR, samtools, sample_list (read); tot (incremented per sample)
# Arguments:
#   $1 - file with one sample name per line
#   $2 - edgeR group label written for each sample (group1 / group2)
#   $3 - BAM list file that receives one BAM path per sample
# Outputs:
#   progress messages on stdout; appends to edgeR.r, edgeR-name-list.txt,
#   signal-flare-bam and the BAM list
#######################################
align_group() {
    local names_file=$1 group_label=$2 bam_list=$3
    local run_space="$dir/projects/$project/run-space"
    local flare="$run_space/signal-flare-bam"
    local ref="$dir/Reference/$species"
    local gtf="$ref/$species.$geneDataBase.gtf"
    local splice="$ref/$species-$geneDataBase-hisat-splice.txt"
    local name reads1 reads2 ext

    while read -r name; do
        # Match the sample name exactly on column 1 so that names which are
        # substrings of other rows or of file paths cannot select extra lines.
        reads1=$(awk -v n="$name" '$1 == n {print $3}' "$sample_list")
        # File extension of the reads file; passed on to the STAR wrappers.
        ext=${reads1##*.}
        tot=$((tot + 1))

        if [[ ! -s "$dir/files/$name/${name}Aligned.sortedByCoord.out.bam" ]]; then
            mkdir -p "$dir/files/$name"
            # The wrapper scripts are run from inside the sample directory.
            cd "$dir/files/$name" || { echo "cannot enter $dir/files/$name" >&2; exit 1; }

            ### if used on a high performance cluster, run the wrapper lines with an executable i.e. qsub or sbatch to have jobs run all at once
            if [[ $num == 4 ]]; then
                reads2=$(awk -v n="$name" '$1 == n {print $4}' "$sample_list")
                if [[ $aligner == "HIsat2" ]]; then
                    "$dir/scripts/HIsat2.sh" "$reads1" "$reads2" "$name" "$flare" "$ref/${species}_HIsat2/$species" "$splice" "$hisat2" "$nodes" "$samtools"
                fi
                if [[ $aligner == "STAR" ]]; then
                    "$dir/scripts/STAR.sh" "$reads1" "$reads2" "$name" "$flare" "$ref/${species}_STAR" "$gtf" "$ext" "$STAR" "$nodes"
                fi
                cd "$dir" || exit 1
                echo " Aligning paired $name $ext files to $species "
            elif [[ $num == 3 ]]; then
                if [[ $aligner == "HIsat2" ]]; then
                    "$dir/scripts/HIsat2single.sh" "$reads1" "$name" "$flare" "$ref/${species}_HIsat2/$species" "$splice" "$hisat2" "$nodes" "$samtools"
                fi
                if [[ $aligner == "STAR" ]]; then
                    "$dir/scripts/STARsingle.sh" "$reads1" "$name" "$flare" "$ref/${species}_STAR" "$gtf" "$ext" "$STAR" "$nodes"
                fi
                cd "$dir" || exit 1
                echo " Aligning single $name $ext files to $species "
            else
                # Nothing would ever signal completion for this sample, so the
                # wait loop after alignment would spin forever; stop here.
                echo "sample list must have 3 (single-end) or 4 (paired-end) columns, found '$num'" >&2
                exit 1
            fi
        else
            echo "already done"
            echo done >> "$flare"
        fi

        # Record the BAM for the merge step for every sample, not only for
        # samples that were already aligned.
        # NOTE(review): the skip test above looks for <name>Aligned...bam while
        # this list uses <name>-Aligned...bam; both spellings are kept as they
        # were -- confirm which one the wrapper scripts actually write.
        echo "$dir/files/$name/$name-Aligned.sortedByCoord.out.bam" >> "$bam_list"

        printf '"%s",\n' "$group_label" >> "$run_space/edgeR.r"
        printf "'%s',\n" "$name" >> "$run_space/edgeR-name-list.txt"
    done < "$names_file"
}

## control group first, then treatment group (this order is reused when counts are merged)
align_group "$dir/projects/$project/run-space/control-name-list" group1 "$dir/projects/$project/run-space/control-bam.txt"
align_group "$dir/projects/$project/run-space/test-name-list" group2 "$dir/projects/$project/run-space/treatment-bam.txt"

## finish writing the temporary scripts for statistical analysis

#######################################
# Turn a comma-terminated list file into a closed R expression: drop the last
# character of the last line (the trailing comma) and append a closing line.
# Arguments:
#   $1 - file to rewrite in place
#   $2 - closing text appended as the new last line
#######################################
close_r_vector() {
    local target=$1 closer=$2
    # Work next to the target instead of in the current directory.
    local scratch="$target.tmp"
    sed '$ s/.$//' "$target" > "$scratch" \
        && echo "$closer" >> "$scratch" \
        && mv "$scratch" "$target"
}

close_r_vector "$dir/projects/$project/run-space/edgeR-name-list.txt" ")"
close_r_vector "$dir/projects/$project/run-space/edgeR.r" ")) "

# edgeR.r = top-edgeR.r + group labels + column names + bottom-edgeR.r
cat "$dir/projects/$project/run-space/edgeR-name-list.txt" >> "$dir/projects/$project/run-space/edgeR.r"
cat "$dir/scripts/bottom-edgeR.r" >> "$dir/projects/$project/run-space/edgeR.r"

# One extra so that tot also covers the initial line of the signal-flare
# files; the wait loops compare their line counts against tot.
tot=$((tot + 1))



### waits for BAMs to finish aligning ###
# signal-flare-bam gains one line per finished (or already aligned) sample;
# poll every two minutes until its line count reaches tot.
bam_flare="$dir/projects/$project/run-space/signal-flare-bam"
flarebam=$(wc -l < "$bam_flare")
while [[ $flarebam -lt $tot ]]; do
    sleep 120
    flarebam=$(wc -l < "$bam_flare")
done


echo "

Alignment Done

"
#====================#
# Stringtie assembly #
#====================#

### runs StringTie and records the location of every GTF for merging ###

run_space="$dir/projects/$project/run-space"
annotation="$dir/Reference/$species/$species.$geneDataBase.gtf"
gtf_list="$run_space/assembly_GTF_list.txt"

# First line of the StringTie flare file; the STAR branch waits on its length.
echo "Light the Great Beakon!!" > "$run_space/signal-flare1"
# Start from an empty GTF list (it is only appended to below).
rm -f "$gtf_list"

## HIsat2: pool the BAMs of each group and assemble one GTF per group
if [[ $aligner == "HIsat2" ]]; then
    pushd "$run_space" || exit 1
    for group in control treatment; do
        # -f lets a rerun overwrite a merged BAM left by an earlier attempt.
        "$samtools" merge -f -@ "$nodes" -b "$run_space/$group-bam.txt" "$run_space/$group.bam"
        "$samtools" sort -@ "$nodes" "$run_space/$group.bam" > "$run_space/$group.bam-temp"
        mv "$run_space/$group.bam-temp" "$run_space/$group.bam"
        "$stringtie" "$run_space/$group.bam" -G "$annotation" -o "$run_space/$group.gtf"
        echo "$run_space/$group.gtf" >> "$gtf_list"
    done
    popd
fi

## STAR: assemble each sample separately through StringTie.sh
if [[ $aligner == "STAR" ]]; then
    while read -r name; do
        sample="$dir/files/$name"
        if [[ ! -s "$sample/stringtie/transcripts.gtf" ]]; then
            # StringTie.sh is run from inside the sample directory.
            cd "$sample" || { echo "cannot enter $sample" >&2; exit 1; }
            echo "
Running StringTie for $name ..."
            ### if used on a high performance cluster, run next line with executable i.e. qsub or sbatch to have jobs run all at once
            "$dir/scripts/StringTie.sh" "$run_space" "$annotation" "$samtools" "$stringtie"
        else
            # Already assembled: add the flare line here so the wait loop
            # below still sees one line for this sample.
            echo "StringTie alreay done" >> "$run_space/signal-flare1"
            echo "StringTie already done for $name"
        fi
        echo "$sample/stringtie/transcripts.gtf" >> "$gtf_list"
    done < "$run_space/name-list"

    ### waits for StringTie to finish
    flare1=$(wc -l < "$run_space/signal-flare1")
    while [[ $flare1 -lt $tot ]]; do
        sleep 120
        flare1=$(wc -l < "$run_space/signal-flare1")
    done
fi
echo "

StringTie is done!!

"

#==================================#
# Selects TE-initiated Transcripts #
#==================================#

### Runs StringTie --merge on the generated assemblies, reduces the merged
### transcripts to one interval each, and keeps those whose 5' end overlaps a
### TE. The result is projects/<project>/TE-transcripts.bed; the whole step is
### skipped when that file already exists.


if [ ! -f "$dir/projects/$project/TE-transcripts.bed" ]
then
module load Python/2.7.14-foss-2017a
module load bedtools/2.27.1-foss-2017a-Python-2.7.14
    echo "
Making Chimera files"
        # All relative outputs below (merged.gtf, $project-merged-transcripts,
        # $project-ends) are written into merged_asm/.
        mkdir $dir/projects/$project/merged_asm/ ; cd $dir/projects/$project/merged_asm/
        # Merge every GTF listed in assembly_GTF_list.txt, guided by the reference annotation.
        $stringtie --merge -G $dir/Reference/$species/$species.$geneDataBase.gtf -o merged.gtf $dir/projects/$project/run-space/assembly_GTF_list.txt 
        # Keep whitespace-separated fields 1, 4, 5, 7 and 12 of the filter-exons.py
        # output, cut everything after the first quoted value, and drop duplicate lines.
        # NOTE(review): presumably these are chrom, start, end, strand and
        # transcript_id of GTF exon lines -- confirm against filter-exons.py.
        $dir/scripts/filter-exons.py $dir/projects/$project/merged_asm/merged.gtf | awk '{ print $1 "\t" $4 "\t" $5 "\t" $7 "\t" $12}' | awk -F '"' '{print $1,$2}' | awk '!seen[$0]++' >$dir/projects/$project/merged_asm/merged.bed
### reports transcripts with 5' ends overlapping TEs to TE-transcripts.bed ###
### per the original note: merges all transcripts with the same start position and reports the largest exon starting on the TE, to remove duplicates ###
        # After transcript-merge.py: remove repeated ";"-separated fields within each
        # line, strip the last character of each line, and drop duplicate lines.
        $dir/scripts/transcript-merge.py $dir/projects/$project/merged_asm/merged.bed | awk -F ";" '{ while(++i<=NF) printf (!a[$i]++) ? $i FS : ""; i=split("",a); print ""}' | sed 's/.$//' | awk '!seen[$0]++' > $project-merged-transcripts  
        # Build a 1-bp interval at the strand-aware 5' end: [start, start+1] on "+",
        # [end-1, end] on "-". The opposite coordinate is kept in column 6.
        awk '{if ($4 == "+") print $1 "\t" $2 "\t" ($2) +1 "\t" $4 "\t"  $5 "\t" $3} { if ($4 == "-") print $1 "\t" ($3) -1 "\t" $3 "\t" $4 "\t" $5 "\t" $2}' $project-merged-transcripts > $project-ends;
        # Intersect the 5' ends with the TE annotation, restore the full interval from
        # column 6, and label it "<col10>:<col11>:<col5>"; then sort by chromosome and
        # start and drop duplicate lines.
        # NOTE(review): columns 10 and 11 are the 4th and 5th columns of <species>-TE.bed;
        # presumably TE name/family -- confirm against the reference file.
        $bedtools intersect -a $project-ends -b $dir/Reference/$species/$species-TE.bed -wa -wb | awk '{ if (($4 == "+")) print $1 "\t" $2 "\t" $6 "\t" $4 "\t" $10 ":" $11 ":" $5 } { if ($4 == "-") print $1 "\t" $6 "\t" $3 "\t" $4 "\t" $10 ":" $11 ":" $5}' | sort -k1,1 -k2,2n | awk '!seen[$0]++' > $dir/projects/$project/TE-transcripts.bed

else
    echo "
TE-transcripts already exist"
fi


### Report how many TE-transcripts were found and stop if there are none ###
te_bed="$dir/projects/$project/TE-transcripts.bed"
# A missing BED file (earlier step failed) is treated the same as an empty
# one, so the pipeline stops here instead of carrying on with no input.
if [[ -f "$te_bed" ]]; then
    num=$(awk 'END {print NR}' "$te_bed")
else
    num=0
fi
echo "Found $num TE-transcripts!
"
# -p: the results directory may already exist from an earlier run.
mkdir -p "$dir/projects/$project/results"

if [[ $num -eq 0 ]]; then
    echo "No TE-transcripts to analyse for $project; stopping." >&2
    exit 1
fi


#=======================#
# Get normalized counts #
#=======================#

### Gets counts for the first exon of each TE transcript and merges them into
### results/merged.counts (controls first, treatments second). Per the original
### note: StringTie -eB for multimapped reads, raw counts for uniquely mapped
### reads; the counting itself is done by Count.sh / coverage.sh.

#######################################
# Append column 2 of one sample's counts file as a new column of
# merged.counts, then delete the per-sample counts file.
# Globals:   dir, project (read)
# Arguments: $1 - sample name
#######################################
append_counts_column() {
    local sample_name=$1
    local work="$dir/projects/$project/run-space"
    local merged="$dir/projects/$project/results/merged.counts"
    local counts="$dir/files/$sample_name/$project-$sample_name-norm.counts"

    # Scratch files live in run-space rather than in whatever directory
    # happens to be current.
    cut -f2 "$counts" > "$work/$project-temp1"
    paste "$merged" "$work/$project-temp1" > "$work/$project-temp"
    # Re-emit every row with tab as the output separator.
    awk -v OFS="\t" '$1=$1' "$work/$project-temp" > "$merged"
    rm "$work/$project-temp1" "$work/$project-temp" "$counts"
}

run_space="$dir/projects/$project/run-space"
results="$dir/projects/$project/results"
te_bed="$dir/projects/$project/TE-transcripts.bed"

if [[ ! -f "$results/merged.counts" ]]; then
    con_num=$(wc -l < "$run_space/control-name-list")
    treat_num=$(wc -l < "$run_space/test-name-list")
    # The merge below joins the first two control samples, so at least two
    # controls are needed; fail before the expensive counting starts.
    if (( con_num < 2 )); then
        echo "Need at least two control samples (found ${con_num:-0}); cannot merge counts." >&2
        exit 1
    fi

    echo "
Getting counts
"
    echo "Light the great beakon!!" > "$run_space/signal-flare2"
    while read -r name; do
        # The counting scripts are run from inside the sample directory.
        cd "$dir/files/$name" || { echo "cannot enter $dir/files/$name" >&2; exit 1; }
        ### if used on a high performance cluster, run next line with executable i.e. qsub or sbatch to have jobs run all at once
        if [[ $aligner == "HIsat2" ]]; then
            "$dir/scripts/coverage.sh" "$run_space/signal-flare2" "$project" "$te_bed"
        fi
        if [[ $aligner == "STAR" ]]; then
            "$dir/scripts/Count.sh" "$run_space/signal-flare2" "$name" "$dir/projects/$project/merged_asm/merged.gtf" "$dir/files/$name" "$te_bed" "$project"
        fi
    done < "$run_space/name-list"

    ### waits until counts are finished ###
    # Check before sleeping so that counts which are already complete do not
    # cost a needless minute.
    flare2=$(wc -l < "$run_space/signal-flare2")
    while [[ $flare2 -lt $tot ]]; do
        sleep 60
        flare2=$(wc -l < "$run_space/signal-flare2")
    done
    echo "Counting is done!
"

    ### join reads together into one file with control first and treatments second ###
    # The first two controls are joined on column 1 to create merged.counts;
    # every further sample contributes only its second column.
    con1=$(sed -n '1p' "$run_space/control-name-list")
    con2=$(sed -n '2p' "$run_space/control-name-list")
    join --nocheck-order -1 1 -2 1 -t $'\t' \
        "$dir/files/$con1/$project-$con1-norm.counts" \
        "$dir/files/$con2/$project-$con2-norm.counts" > "$results/merged.counts"
    rm "$dir/files/$con1/$project-$con1-norm.counts" "$dir/files/$con2/$project-$con2-norm.counts"

    # Remaining controls (third onward), then all treatment samples.
    sed '1,2d' "$run_space/control-name-list" > "$run_space/control-name-list-2.txt"
    while read -r name; do
        append_counts_column "$name"
    done < "$run_space/control-name-list-2.txt"

    while read -r name; do
        append_counts_column "$name"
    done < "$run_space/test-name-list"

    ### Filters reads and selects possible cryptic transcripts ###
    cd "$results" || exit 1
    # check-counts.py receives the merged table and the group sizes; duplicate
    # output lines are dropped once (a second identical pass would be a no-op).
    "$dir/scripts/check-counts.py" "$results/merged.counts" "$con_num" "$treat_num" | awk '!seen[$0]++' > "$project-temp"
    mv "$project-temp" "$results/merged.counts"

else
    echo "Counts already exist
"
fi


#============================#
# EdgeR statistical analysis #
#============================#

results="$dir/projects/$project/results"

# Run the generated edgeR script from inside the results directory.
cd "$results" || { echo "cannot enter $results" >&2; exit 1; }
mv "$dir/projects/$project/run-space/edgeR.r" .

module load R

echo "Running statistical tests
"
# Stop on failure so that clean-up does not remove run-space after a broken run.
Rscript edgeR.r || { echo "edgeR.r failed" >&2; exit 1; }

### prints total number of classes, subfamilies, and families of significantly changing TE-Transcripts ###

# Split pvals.txt on column 5 < 0.05 and the sign of column 2, dropping any
# line that contains "PValue" (the header).
# NOTE(review): column 5 is presumably the FDR and column 2 the log fold
# change, judging by the output file names -- confirm against edgeR.r.
awk '{if (($5 < 0.05) && ($2 > 0)) print $0}' "$results/pvals.txt" | grep -v "PValue" > "$results/up-fdr-0.05.txt"
awk '{if (($5 < 0.05) && ($2 < 0)) print $0}' "$results/pvals.txt" | grep -v "PValue" > "$results/down-fdr-0.05.txt"

# Re-split each line on ":" into "<f1>:<f2>" and "<f3>:<f4>" for the .list files.
awk -F ":" '{print  $1 ":" $2 "\t " $3 ":" $4 }' "$results/up-fdr-0.05.txt" > "$results/up-fdr-0.05.list"
awk -F ":" '{print  $1 ":" $2 "\t " $3 ":" $4 }' "$results/down-fdr-0.05.txt" > "$results/down-fdr-0.05.list"

"$dir/scripts/count-TEs.py" "$results/up-fdr-0.05.txt" "$results/down-fdr-0.05.txt"


#=============#
# Cleaning up #
#=============#

### Gets rid of temporary scripts and moves active.log to the log folder ###

echo "cleaning up ...

Finished running for $project 
"
# ${project:?} aborts instead of deleting projects//run-space if the project
# name is ever empty.
rm -r -- "$dir/projects/${project:?}/run-space"
# The log folder may not exist on a first run.
mkdir -p "$dir/logs"
mv "$dir/active.log" "$dir/logs/$project.log"



