# Positional arguments:
#   $1  ibam        input BAM file
#   $2  iouterexon  outer-exon BED file (column 7 is compared with "5end"/"3end" further down)
#   $3  confintron  intron BED file (strand is read from column 6 further down)
#   $4  nCutoff     cutoff in bp for realignment candidates (see note below)
#   $5  odir        output directory; everything is written to $odir/realn_candidate
if [ "$#" -lt 5 ]; then
  echo "Usage: ${0##*/} <in.bam> <outer_exon.bed> <intron.bed> <nCutoff> <outdir>" >&2
  exit 1
fi

ibam=$1
iouterexon=$2
confintron=$3
# Reads protruding by up to nCutoff bp, or lying up to nCutoff bp from the
# upstream intron, are considered candidates for realignment. If the protruding
# part is too large, or the distance to the upstream intron is too far, the
# read is unlikely to be realigned successfully.
nCutoff=$4
odir=$5

# -p: do not fail when the directory already exists (re-runs) and create
# missing parents. Abort if the directory cannot be created, because every
# later step writes into it.
mkdir -p -- "$odir/realn_candidate" || {
  echo "ERROR: cannot create $odir/realn_candidate" >&2
  exit 1
}
odir=$odir/realn_candidate

# Base names (directory part stripped) used to build output file names.
ibam_fname=${ibam##*/}
iouterexon_fname=${iouterexon##*/}
confintron_fname=${confintron##*/}

##For 5' end realignment candidate identification
###1) Find the 5' end exon first
###2) Find if it overlaps an SA (A) and calculate the distance from the upstream intron (B)
###3) For case A, a length >0 makes the read a realn candidate
#     For case B, if the softclip length > distance to the upstream intron, the read is a realn candidate


###Reduce the dimension first
# Collapse the outer-exon BED to 6 columns, overwriting column 4 with the end
# label and column 5 with 0, then sort and drop duplicate lines.
# Rows whose column 7 is not "5end" go to the 3' end set; rows whose column 7
# is not "3end" go to the 5' end set (a row with any other value lands in both).
# Paths are quoted so that directories/file names containing whitespace work.
# NOTE: `sort ... | uniq` is kept on purpose; `sort -u` with -k would compare
# only the key fields and could drop distinct records.
awk -v OFS='\t' '$7!="5end"{ $4="3end"; $5=0; print $1,$2,$3,$4,$5,$6; }' "$iouterexon" \
  | sort -k1,1 -V -k2,2n \
  | uniq > "$odir/${iouterexon_fname%.bed}.3endexon.bed"
awk -v OFS='\t' '$7!="3end"{ $4="5end"; $5=0; print $1,$2,$3,$4,$5,$6; }' "$iouterexon" \
  | sort -k1,1 -V -k2,2n \
  | uniq > "$odir/${iouterexon_fname%.bed}.5endexon.bed"

#Concept
##Exon coordinate (0-based)
##SA: 5' end of exon
##SD: 3' end of exon
# Each command below emits a 1-bp, 6-column BED record per input line, then
# sorts and removes duplicate lines. Paths are quoted so that names containing
# whitespace work.

# SA from introns: '+' strand -> [end, end+1); '-' strand -> [start-1, start).
awk -v OFS='\t' '{ if($6=="+"){ print $1,$3,$3+1,"SA","0",$6; }else{ print $1,$2-1,$2,"SA","0",$6;} }' "$confintron" \
  | sort -k1,1 -V -k2,2n \
  | uniq > "$odir/${confintron_fname%.bed}.SA.bed"

# SD from introns: '+' strand -> [start-1, start); '-' strand -> [end, end+1).
awk -v OFS='\t' '{ if($6=="+"){ print $1,$2-1,$2,"SD","0",$6;}else{ print $1,$3,$3+1,"SD","0",$6;} }' "$confintron" \
  | sort -k1,1 -V -k2,2n \
  | uniq > "$odir/${confintron_fname%.bed}.SD.bed"

# CTSS from outer exons: '+' strand -> [start, start+1); '-' strand -> [end-1, end).
awk -v OFS='\t' '{ if($6=="+"){ print $1,$2,$2+1,"CTSS","0",$6;}else{ print $1,$3-1,$3,"CTSS","0", $6; }}' "$iouterexon" \
  | sort -k1,1 -V -k2,2n \
  | uniq > "$odir/${iouterexon_fname%.bed}.CTSS.bed"

# CTES from outer exons: '+' strand -> [end-1, end); '-' strand -> [start, start+1).
awk -v OFS='\t' '{ if($6=="+"){ print $1,$3-1,$3,"CTES","0", $6;}else{ print $1,$2,$2+1,"CTES","0",$6;}}' "$iouterexon" \
  | sort -k1,1 -V -k2,2n \
  | uniq > "$odir/${iouterexon_fname%.bed}.CTES.bed"

#####Distance from TSS to inner exon SA
###1) identify first which of exons are overlapping with SA/SD
###2) What is the distance from TSS/TES to SA/SD
# All paths are quoted so that names containing whitespace work.
# `uniq` only removes adjacent duplicate lines; the inputs are not re-sorted here.

# 5' end exons overlapping an SA on the same strand (-s). The exon records have
# 6 columns, so $8/$9 are the start/end of the overlapping SA record.
# delta = SA start - exon start on '+', SA end - exon end otherwise.
bedtools intersect -wo -s \
    -a "$odir/${iouterexon_fname%.bed}.5endexon.bed" \
    -b "$odir/${confintron_fname%.bed}.SA.bed" \
  | awk -v OFS='\t' -v delta=0 '{ if($6=="+"){ delta=$8-$2; }else{ delta=$9-$3; }  print $1,$2,$3,$4,$5,$6,delta; }' \
  | uniq > "$odir/${iouterexon_fname%.bed}.5endexon_innerSA.bed"

# Closest same-strand intron for each CTSS, using the -fu option; -D a reports
# the distance relative to the strand of the A feature.
# NOTE(review): taking $14 as the distance presumes $confintron has 7 columns
# (6 CTSS columns + 7 intron columns + distance) - confirm against the input.
bedtools closest -s -D a -fu \
    -a "$odir/${iouterexon_fname%.bed}.CTSS.bed" \
    -b "$confintron" \
  | awk -v OFS='\t' '{ print $1,$2,$3,$4,$5,$6,$14; }' \
  | uniq > "$odir/${iouterexon_fname%.bed}.CTSS_upperintron.bed"

# 3' end exons overlapping an SD on the same strand.
# delta = SD end - exon end on '+', SD start - exon start otherwise.
bedtools intersect -wo -s \
    -a "$odir/${iouterexon_fname%.bed}.3endexon.bed" \
    -b "$odir/${confintron_fname%.bed}.SD.bed" \
  | awk -v OFS='\t' -v delta=0 '{ if($6=="+"){ delta=$9-$3; }else{ delta=$8-$2; } print $1,$2,$3,$4,$5,$6,delta; }' \
  | uniq > "$odir/${iouterexon_fname%.bed}.3endexon_innerSD.bed"

# Closest same-strand intron for each CTES, using the -fd option.
# NOTE(review): same 7-column assumption for $14 as above.
bedtools closest -s -D a -fd \
    -a "$odir/${iouterexon_fname%.bed}.CTES.bed" \
    -b "$confintron" \
  | awk -v OFS='\t' '{ print $1,$2,$3,$4,$5,$6,$14; }' \
  | uniq > "$odir/${iouterexon_fname%.bed}.CTES_downintron.bed"

#Print protruding read length
##only output reads with either 5' or 3' end has protruding bases
protrud_info="$odir/${ibam_fname%.bam}.protrud_length.info"
# Arguments: input BAM, outer-exon BED, the four distance BED files, output path.
python /script/findCandidateRead.v3.py \
  "$ibam" \
  "$iouterexon" \
  "$odir/${iouterexon_fname%.bed}.5endexon_innerSA.bed" \
  "$odir/${iouterexon_fname%.bed}.CTSS_upperintron.bed" \
  "$odir/${iouterexon_fname%.bed}.3endexon_innerSD.bed" \
  "$odir/${iouterexon_fname%.bed}.CTES_downintron.bed" \
  "$protrud_info"

# Remove the four intermediate distance files handed to the Python script.
# The previous version listed CTSS_upperintron.bed twice (the second delete
# fails with "No such file") and never removed 3endexon_innerSD.bed.
rm -- \
  "$odir/${iouterexon_fname%.bed}.5endexon_innerSA.bed" \
  "$odir/${iouterexon_fname%.bed}.CTSS_upperintron.bed" \
  "$odir/${iouterexon_fname%.bed}.3endexon_innerSD.bed" \
  "$odir/${iouterexon_fname%.bed}.CTES_downintron.bed"

# Build the realignment-candidate table: the header line, then one row per
# data row of $protrud_info (its first line is skipped) with two flag columns.
realn_info="$odir/${ibam_fname%.bam}.realn_cand.info"
# printf instead of `echo -e`: escape handling of echo differs between shells.
printf '#readname\t5end_protrud\t5end_protrudbysc\t3end_protrud\t3end_protrudbysc\trealn_cand_5\trealn_cand_3\n' > "$realn_info"
# A side is flagged "O" when either of its two length columns is in
# (0, cutoff]; otherwise "X". Columns 2-3 decide the 5' flag, columns 4-5 the
# 3' flag.
awk -v OFS='\t' -v cutoff="$nCutoff" '
  NR > 1 {
    if (($2 > 0 && $2 <= cutoff) || ($3 > 0 && $3 <= cutoff)) { Realn5 = "O" } else { Realn5 = "X" }
    if (($4 > 0 && $4 <= cutoff) || ($5 > 0 && $5 <= cutoff)) { Realn3 = "O" } else { Realn3 = "X" }
    print $1, $2, $3, $4, $5, Realn5, Realn3
  }' "$protrud_info" >> "$realn_info"

##Split bam files into two
# List of read names flagged "O" on either side in $realn_info (header skipped).
# NOTE(review): unlike the other outputs this name keeps the ".bam" suffix of
# the input; left unchanged so anything reading this file keeps working.
readname_candonly="$odir/${ibam_fname}.realn_cand_readname.txt"
awk -v OFS='\t' 'NR>1{if($6=="O"||$7=="O"){ print $1; }}' "$realn_info" > "$readname_candonly"

# Debug output: current working directory and its contents.
pwd
ls

# Same read list, once to keep the listed reads and once to exclude them.
java -jar /picard.jar FilterSamReads \
  I="$ibam" \
  O="$odir/${ibam_fname%.bam}.realn_cand.bam" \
  READ_LIST_FILE="$readname_candonly" \
  FILTER=includeReadList
java -jar /picard.jar FilterSamReads \
  I="$ibam" \
  O="$odir/${ibam_fname%.bam}.realn_notcand.bam" \
  READ_LIST_FILE="$readname_candonly" \
  FILTER=excludeReadList

samtools index "$odir/${ibam_fname%.bam}.realn_cand.bam"
samtools index "$odir/${ibam_fname%.bam}.realn_notcand.bam"


#change ln -s to mv
# Move the non-candidate BAM and its index one level up, out of the
# realn_candidate directory. Paths are quoted so that names containing
# whitespace work.
mv -- "$odir/${ibam_fname%.bam}.realn_notcand.bam" "$odir/../${ibam_fname%.bam}.realn_notcand.bam"
mv -- "$odir/${ibam_fname%.bam}.realn_notcand.bam.bai" "$odir/../${ibam_fname%.bam}.realn_notcand.bam.bai"






  












