#!/usr/bin/env bash
# Usage: <this script> <sample>.fastq
#
# Derives a file prefix from the FASTQ name given as $1 and expects
# "<prefix>_insert.txt" to exist already; every file produced below is named
# after that prefix.

# Installation roots of the two tool trees used by the pipeline.
GHOME="/nlmusr/gchirn/linux"
RHOME="/nlmusr/reazur/linux"
# HOME is redirected to GHOME for every command started from here on.
# NOTE(review): presumably so the tools pick up per-user config from that
# account -- confirm before removing.
HOME=$GHOME

# $1 = input FASTQ, e.g. OSS_C.fastq
# Abort early when it is missing: an empty prefix would make the pipeline
# operate on files literally named "_insert.txt", "_insert.fa", ...
: "${1:?usage: ${0##*/} <sample>.fastq}"
input=$1
# Strip ".fastq" and anything after it (OSS_C.fastq.gz -> OSS_C).
prefix=${1%.fastq*}
# Handed to add_blat_score_to_insert.pl through its -l option.
read_len=150

old_candidate_insert="${prefix}_insert.txt"
blat_query="${prefix}_insert.fa"

# Create the BLAT query (FASTA) from the insert.txt file.
perl "$RHOME/NELSON/Genome_resequence/insert_reads_to_fasta.pl" \
  "$old_candidate_insert" > "$blat_query"



# Run BLAT with the query FASTA against the fly genome; the result is written
# in PSL format to $blat_out. All expansions are quoted so that a prefix
# containing whitespace or glob characters is passed through intact.
GENOME="$GHOME/UCSC_GENOME/fly/ucsc_fly_genome.fa"
blat_out="${prefix}_insert.psl"
blat "$GENOME" "$blat_query" \
  -stepSize=3 -repMatch=2253 -minScore=0 -minIdentity=0 -maxIntron=10 \
  "$blat_out"

# BLAT option currently not in use, kept for reference:
#-makeOoc=11.ooc

# Convert the PSL result to a BED file.
blat_bed="${prefix}_insert.bed"
perl "$RHOME/NELSON/Genome_resequence/psl_to_bed_best_score.pl" "$blat_out" "$blat_bed"


# Combine the original candidate insert table with the BLAT BED file; the
# script's stdout becomes the new candidate table used by the later steps.
candidate_insert="${prefix}_new_insert.txt"
perl "$RHOME/NELSON/Genome_resequence/add_blat_score_to_insert.pl" \
  -i "$old_candidate_insert" -b "$blat_bed" -l "$read_len" > "$candidate_insert"

# Create a boxplot of the BLAT score.
# The R code is fed on stdin and the data file is given as a trailing
# command-line argument; its position is left untouched.
# NOTE(review): presumably the R script fetches the file name through
# commandArgs() -- confirm before reordering the arguments.
# The option is spelled --no-restore (two dashes); with three dashes R does
# not recognise it and still restores a saved workspace.
R --no-save --no-restore --no-readline "$candidate_insert" \
  < "$RHOME/NELSON/Genome_resequence/boxplot_blatscore.R"


# ---- Parameters for the level-1 collapse ----------------------------------

# Appears in the level-1 output file name as "base<length>".
length="22"

# Passed to collapse_insert_sites.pl as -l and embedded in the output file
# name as "W<lim>".
lim="300"

# Passed to collapse_insert_sites.pl as -r.
read_threshold="4"

# Span threshold (82-22), or 60-22; passed to collapse_insert_sites.pl as -d.
chr_distance_threshold="60"

# Passed to collapse_insert_sites.pl as -t.
BSR_threshold="83"

# Level 1 collapse: collapsing the same transposons in the same region.
# The collapse script prints its table on stdout, which is stored in
# $level1file.
level1file="${prefix}_insert_base${length}_W${lim}_level1.xls"
perl "$RHOME/NELSON/Genome_resequence/collapse_insert_sites.pl" \
  -l "$lim" -r "$read_threshold" -d "$chr_distance_threshold" -t "$BSR_threshold" \
  "$candidate_insert" > "$level1file"

printf '%s\n' "Level 1 collapse of insertion sites in file: $level1file" >> summary
# Count every line that does not start with "Insertion".
# NOTE(review): presumably that prefix marks the header row -- confirm.
uqr=$(grep -vc "^Insertion" "$level1file")
# printf instead of `echo -ne`: the echo flags are not portable and are
# printed literally by some /bin/sh implementations.
printf '  Number of Level 1 collapsed sites:\t%s\n\n' "$uqr" >> summary
#--------------------------------------
 
# Annotate the level-1 sites; the annotation script's stdout is saved to
# $level1sitesannotation and the file name is recorded in the summary.
level1sitesannotation="${prefix}_level1siteannotation.xls"
# Disabled alternative annotation script, kept for reference:
#perl $RHOME/NELSON/Genome_resequence/annotation/rough_annotation.pl $level1file > $level1sitesannotation
perl "$RHOME/NELSON/Genome_resequence/annotation/rough_annotation_insertion_sites.pl" \
  "$level1file" > "$level1sitesannotation"
printf '%s\n' "Level 1 annotation file: $level1sitesannotation" >> summary

# Disabled run with window=1000, kept for reference:
#window=1000
##outputfile=$prefix"_insert_fixed_bin_"$window".xls"
#perl $RHOME/NELSON/Genome_resequence/fixed_bin_sites_mod.pl -w $window $level1file > $outputfile
#
#echo "Insertion sites in fixed bin:" >> summary
#echo "$outputfile" >> summary


# Fixed-bin report of the level-1 sites: $window is handed to the script via
# -w and also becomes part of the output file name.
window=5000
outputfile="${prefix}_insert_fixed_bin_${window}.xls"
# Disabled alternative script, kept for reference:
#perl $RHOME/NELSON/Genome_resequence/fixed_bin_sites_mod.pl -w $window $level1file > $outputfile
perl "$RHOME/NELSON/Genome_resequence/fixed_bin_sites_insertion.pl" \
  -w "$window" "$level1file" > "$outputfile"
printf '%s\n' "Insertion sites in fixed bin:" >> summary
printf '%s\n' "$outputfile" >> summary

# Boxplot of avg_blat_score.
# The R code is fed on stdin and the level-1 file is given as a trailing
# command-line argument; its position is left untouched.
# NOTE(review): presumably the R script fetches the file name through
# commandArgs() -- confirm before reordering the arguments.
# The option is spelled --no-restore (two dashes); with three dashes R does
# not recognise it and still restores a saved workspace.
R --no-save --no-restore --no-readline "$level1file" \
  < "$RHOME/NELSON/Genome_resequence/boxplot_avg_blatscore.R"



#---------------------------------------
# This part is used for the heterogeneity score at the insertion site.
# The genome path is built from GHOME (same location as before) so it stays
# in step with the GENOME path used for BLAT. Note there is no ".fa" suffix
# here.
genomefile="$GHOME/UCSC_GENOME/fly/ucsc_fly_genome"
clusterfile=$level1file
# Passed to get_sequence_from_genome.pl as -l and reused later in the name of
# the cluster coverage file.
buffer="24"
outfile="selected_seq.fa"

# Write the sequences for the level-1 clusters to $outfile.
perl "$RHOME/NELSON/Genome_resequence/get_sequence_from_genome.pl" \
  -l "$buffer" -g "$genomefile" -c "$clusterfile" > "$outfile"
#

# Build a bowtie index named $database from the extracted sequences.
database="select_seqdb"
bowtie-build "$outfile" "$database"


# From here on $input is the "<prefix>.nostrna" file, not the FASTQ that was
# given on the command line.
input="${prefix}.nostrna"

# Align the reads against the index built from the selected sequences.
"$RHOME/CORE/align_to_gene_list_mod.sh" "$input" "$database" "Insertion_site_coverage"

# Take the second ':'-separated field of every ">" header line and pipe the
# values into the sum helper.
# NOTE(review): presumably that field is a per-sequence read count and the
# helper adds the values up -- confirm.
readnum=$(grep "^>" "$input" | cut -d':' -f2 | "$GHOME/CORE/bin/sum")

cluster_coverage_file="${prefix}_cluster_coverage_${buffer}_bp.xls"
perl "$RHOME/CORE/parse_bedfile_single_read_library.pl" \
  -q "$input" -r "$readnum" -s "$outfile" > "$cluster_coverage_file"

#---------------------------------

# Merge the level-1 cluster table (-c) with the cluster read coverage (-r);
# the merge script's stdout is stored in $combinedlevel1file.
combinedlevel1file="${prefix}_combined_level1.xls"
perl "$RHOME/NELSON/Genome_resequence/merge_cluster_with_read_coverage.pl" \
  -c "$level1file" -r "$cluster_coverage_file" > "$combinedlevel1file"



# Use the level1 file for annotation, instead of level2.

printf '%s\n' "End of pipeline" >> summary

# Keep a per-sample copy of the shared "summary" log; the destination is
# quoted so that a prefix containing whitespace stays a single argument.
cp -- summary "${prefix}_summary.xls"

