
#!/bin/sh
#
# Read-filtering pipeline, part 1: environment setup and poly-N removal.
#
# Usage: <this script> <reads>.fastq.uq     (e.g. OSS_C.fastq.uq)
#
# Variables set here and used by the rest of the script:
#   GHOME, RHOME  installation roots for the tools and helper scripts
#   input         the file given on the command line
#   prefix        $1 with the shortest trailing ".fastq*" removed
#                 (OSS_C.fastq.uq -> OSS_C); used to name all later outputs
#   output        name of the poly-N filtered file ($1.ployn)

GHOME="/nlmusr/gchirn/linux"
RHOME="/nlmusr/reazur/linux"
# NOTE(review): HOME is overridden for the remainder of the run. The reason is
# not visible in this script -- presumably one of the tools looks up files
# relative to HOME. Confirm before removing.
HOME=$GHOME

# Refuse to run without an input file: every later file name is derived from
# $1, so an empty argument would make each stage operate on names such as
# ".ployn" and ".virus".
if [ "$#" -lt 1 ] || [ -z "$1" ]; then
    printf 'Usage: %s <reads>.fastq.uq\n' "$0" >&2
    exit 2
fi

# Input for this pipeline is the .uq file
# $1 = OSS_C.fastq.uq
input=$1
prefix=${1%.fastq*}
# NOTE(review): read_len is not referenced anywhere else in the visible part
# of this script and is not exported; kept as-is.
read_len=75

#---------------------------------------
# remove poly-N sequences ...
#---------------------------------------
# The filtered reads are expected in "$1.ployn" afterwards (the spelling
# "ployn" is the real file suffix, not a typo to fix here).
"$RHOME/CORE/remove_ployN_seq.sh" "$1"
output="$1.ployn"

# Record QC for this step: file before, file after, step label.
"$RHOME/CORE/writeqc.sh" "$input" "$output" ployN_filter

##############################
# Subtractive alignment filters
#
# Each stage aligns the reads that survived the previous stage against one
# bowtie index. Reads that align are set aside under their own name and only
# the unaligned reads are handed to the next stage:
#
#   $1.ployn -> .novirus -> .nostrna -> .nomask -> .norep -> .noTE -> .noGEN
#
# Relies on GHOME, RHOME, prefix and output (the poly-N filtered file) having
# been set earlier in this script.
##############################

# run_filter_stage MISMATCH DATABASE INPUT ALIGNED_OUT UNALIGNED_OUT QC_LABEL
#
#   MISMATCH       value passed to bowtie -v
#   DATABASE       bowtie index to align against
#   INPUT          reads to align; the alignment is written to INPUT.sam
#   ALIGNED_OUT    new name for INPUT.al
#   UNALIGNED_OUT  new name for INPUT.ual
#   QC_LABEL       step label passed to writeqc.sh
#
# Returns 0 on success, 2 on a usage error, 1 if bowtie, the separation script
# or one of the renames fails. A failing writeqc.sh only produces a warning,
# because QC bookkeeping does not affect the files later stages consume.
#
# Only positional parameters are used, so no global variable is modified
# (the script runs under /bin/sh, where "local" is not guaranteed).
run_filter_stage() {
    if [ "$#" -ne 6 ]; then
        printf 'run_filter_stage: expected 6 arguments, got %s\n' "$#" >&2
        return 2
    fi

    if ! "$GHOME/TOOLS/bin/bowtie" -f -v "$1" -S -k 100000 -m 100000 \
            --strata --best -p 12 "$2" "$3" "$3.sam"; then
        printf '%s: bowtie failed on %s\n' "$6" "$3" >&2
        return 1
    fi

    # The separation step is expected to leave INPUT.al and INPUT.ual behind;
    # they are renamed immediately below.
    if ! perl "$RHOME/CORE/separate_aligned_unaligned.pl" -f "$3" -s "$3.sam"; then
        printf '%s: separate_aligned_unaligned.pl failed on %s\n' "$6" "$3" >&2
        return 1
    fi

    mv "$3.al" "$4" || return 1
    mv "$3.ual" "$5" || return 1

    # QC: reads going into the stage vs. reads surviving it.
    "$RHOME/CORE/writeqc.sh" "$3" "$5" "$6" ||
        printf '%s: warning: writeqc.sh failed\n' "$6" >&2
    return 0
}

# Stop the pipeline as soon as a stage fails: every later stage would otherwise
# run on a missing or stale input file.

# 1. remove virus sequences (no mismatches allowed)
run_filter_stage 0 "/nlmusr/reazur/linux/DATABASE/fly_virus" \
    "$output" "$prefix.virus" "$prefix.novirus" virus_filter || exit 1

# 2. remove structural RNA sequences (no mismatches allowed)
run_filter_stage 0 "/nlmusr/reazur/linux/DATABASE/fly_structure_rna" \
    "$prefix.novirus" "$prefix.strna" "$prefix.nostrna" struct_filter || exit 1

# 3. align to the masked genome and remove those sequences
run_filter_stage 3 "/nlmusr/reazur/linux/NELSON/Genome_resequence/dm3_mask" \
    "$prefix.nostrna" "$prefix.mask" "$prefix.nomask" mask_filter || exit 1

# 4. align to genomic repeats (REPBASE)
run_filter_stage 3 "/nlmusr/gchirn/linux/REPBASE/repeat_fly" \
    "$prefix.nomask" "$prefix.rep" "$prefix.norep" rep_filter || exit 1

# 5. align to consensus TE
run_filter_stage 3 "/nlmusr/reazur/linux/NELSON/Genome_resequence/dm_TE" \
    "$prefix.norep" "$prefix.TE" "$prefix.noTE" TE_filter || exit 1

# 6. align to the genome to remove the false positives
run_filter_stage 1 "/nlmusr/gchirn/linux/UCSC_GENOME/fly/ucsc_fly_genome" \
    "$prefix.noTE" "$prefix.GEN" "$prefix.noGEN" Genome_filter || exit 1

#-------------------------------------------------------

#---------------------------------------------
# Get the 5' and 3' ends of the surviving reads.
#
# get_front_end_reads.pl is run on "$prefix.noGEN" with -l 22; the two files
# named below ("$prefix.noGEN.front" and "$prefix.noGEN.end") are what the
# following steps consume, so they are expected to exist after this call.
#---------------------------------------------
length=22
if ! perl "$RHOME/NELSON/Genome_resequence/get_front_end_reads.pl" \
        -l "$length" "$prefix.noGEN"; then
    # Without the front/end files nothing below can produce meaningful output.
    printf 'get_front_end_reads.pl failed on %s\n' "$prefix.noGEN" >&2
    exit 1
fi

frontfile="$prefix.noGEN.front"
endfile="$prefix.noGEN.end"

# Align the read ends.
# NOTE(review): both helpers are invoked relative to the current directory, so
# the script only works when started from the directory that contains them.
# Their exit-status conventions are not visible here, so they are deliberately
# left unchecked.
./part_match.sh "$frontfile"

./part_match_end.sh "$endfile"

#---------------
# Analysis of the SAM files to identify candidate insertion sites.
#
# Inputs are the four SAM files named after the front/end read files
# (<file>.gen.sam and <file>.TE.sam for each); the combined report goes to
# "${prefix}_insert.txt".
frontgensam="$frontfile.gen.sam"
frontTEsam="$frontfile.TE.sam"
endgensam="$endfile.gen.sam"
endTEsam="$endfile.TE.sam"
candidate_insert="${prefix}_insert.txt"
perl "$RHOME/NELSON/Genome_resequence/unify_front_end_reads.pl" \
    -f "$frontgensam" -e "$endTEsam" -n "$endgensam" -r "$frontTEsam" \
    > "$candidate_insert"

# QC
# Lines starting with "Ident" are skipped (presumably the report header).
# orig: the text after the first ':' in column 1 of every remaining line, fed
#       to the site-local "sum" tool (presumably a per-entry read count that
#       gets totalled -- confirm against unify_front_end_reads.pl).
# uqr:  number of remaining lines.
orig=$(grep -v "^Ident" "$candidate_insert" | cut -f1 | cut -d':' -f2 | /nlmusr/gchirn/linux/CORE/bin/sum)
uqr=$(grep -v "^Ident" "$candidate_insert" | wc -l)

# printf instead of "echo -ne": under /bin/sh the echo options are not
# portable (some shells print "-ne" literally into the summary file).
# NOTE(review): "uqreads" and "Number of Candidate sites" report the same
# value, exactly as before.
{
    printf 'Candidate insertion sites in file: %s\n' "$candidate_insert"
    printf '  reads:\t%s\n' "$orig"
    printf '  uqreads:\t%s\n' "$uqr"
    printf '  Number of Candidate sites:\t%s\n\n' "$uqr"
} >> summary




exit
#-----------------------------------newrun.sh 
