use strict;
use warnings;


# Require exactly one command-line argument (the .bsp input read via <>);
# otherwise print the usage text to STDERR and exit with status 1.
unless(@ARGV==1)
{
	warn "Usage: <*.bsp> \n";
	warn "CH only\n";
	exit 1;
}

# Label printed at the start of the summary line on STDERR.
my $type = 'CH';

# Declared here but not referenced anywhere in the visible code.
my (%h_CH, %h_CG);
my $N;
my %jagged_end_size_distr;

# %JE_distr: jagged-end length => number of calls (key 0 is used for blunt ends).
# %stat:     running counters; the keys used are 'tot' and 'exact'.
my (%JE_distr, %stat);

# Main loop: consume the input two lines at a time (line 1 = read 1, line 2 = its
# mate). Every complete pair is counted in $stat{tot}. Pairs whose names match,
# whose column 8 (index 7) is positive and whose strand columns are '++'/'+-'
# (handled as 'W') or '-+'/'--' (handled as 'C') are passed to deduce_CH_MD();
# one tab-separated row per such pair is printed to STDOUT.
while(my $ln1=<>)
{
	chomp $ln1;
	my $ln2=<>;
	unless(defined $ln2)
	{
		# Odd number of lines: the last record has no mate. Stop here instead of
		# chomping/splitting undef; the orphan line is not counted as a pair.
		warn "Incomplete read pair at end of input (mate line missing); last line ignored\n";
		last;
	}
	chomp $ln2;
	my @F1=split /\t/, $ln1;
	my @F2=split /\t/, $ln2;
#M01624:132:000000000-C66CF:1:2107:17969:1048	AATACTTAATATCAAGTAACGATACAAAACCCTTTTATTTAAAATAACAA	GGGGFEGGEGGFC,<,@GGGGGGGGGGFGGGFCGFGGGGGGGGGGCCCCC	UM	chr2	67010471	-+	158	aaGATGCTTAGTATCAATTGACGGTGCAGGGCCCTTTTATTTGAAGTAGCAGaa	1	0:1:0:0:0
#M01624:132:000000000-C66CF:1:2107:17969:1048	NAACGCCATAAAAAAAACTACTAACTCTATAAATTCAAACAATAAAAAAT	#8ACCFFEFFGGGGGCGGFGGGGGGGGGGGGGGGGGGGGGGGFFGGGGGG	UM	chr2	67010363	--	158	aaCAACGCCATAAAGAAAACTGCTGACTCTATAAGTTCAAGCAATAGGAGATtt	0	1:0:0:0:0
	$stat{tot}++;

	# Columns 0..8 are read below; a shorter record cannot be evaluated, so it is
	# skipped quietly (it still counts towards 'tot', as before).
	next if @F1<9 || @F2<9;

	# Both lines must describe the same read and carry a positive value in column 8.
	next unless $F1[0] eq $F2[0] && $F1[7]>0;
	my $len=$F1[7];

	# Strand combination decides the orientation flag and the BED start coordinate.
	my ($strand_flag, $bed_start);
	if($F1[6] eq '++' && $F2[6] eq '+-')
	{
		$strand_flag='W';
		$bed_start=$F1[5]-1;
	}
	elsif($F1[6] eq '-+' && $F2[6] eq '--')
	{
		$strand_flag='C';
		#need double-check the bed region is correct: $F1[5]-$len, $F1[5]
		$bed_start=$F1[5]+length($F1[1])-1-$len;
	}
	else
	{
		next;
	}

	my ($R1_C, $R1_T, $R1_CH, $R2_C, $R2_T, $R2_CH, $strand, $pos, $JE_len, $pass, $is_blunt)=deduce_CH_MD($F1[1], $F1[8], $F2[1], $F2[8], $strand_flag);
	#join("\t", qw/chr, start end R1_C R1_T R1_CH R2_C R2_T R2_CH strand pos JE_len pass is_blunt/)
	print join("\t", $F1[4], $bed_start, $bed_start+$len, $F1[1], $F1[8], $F2[1], $F2[8],
	"R1C:$R1_C", "R1T:$R1_T", "R1CH:$R1_CH", "R2C:$R2_C", "R2T:$R2_T", "R2CH:$R2_CH", "STRAND:$strand", "JPOS:$pos", "JLEN:$JE_len", "JPASS:$pass", "BLUNT:$is_blunt"), "\n";

	# NOTE(review): the two tests below are independent, so a pair with both
	# $pass==1 and $is_blunt==1 is counted twice in $stat{exact} (once in its
	# JLEN bin and once in bin 0). Kept as in the original - confirm intended.
	if($pass==1)
	{
		$stat{exact}++;
		$JE_distr{$JE_len}++;
	}
	if($is_blunt==1)
	{
		$JE_distr{0}++;
		$stat{exact}++;
	}
}

# Summary on STDERR: one header line (label, pair count, number of jagged-end
# calls, calls as a percentage of pairs) followed by one line per length 0..75
# (length, count, percentage of all calls).
# Counters that were never incremented are undef; default them to 0 and guard
# both divisions so empty input or zero calls no longer dies with
# "Illegal division by zero".
my $tot_pairs=$stat{tot}||0;
my $exact_calls=$stat{exact}||0;
my $r=sprintf "%.2f", ($tot_pairs ? $exact_calls/$tot_pairs*100 : 0);
warn join("\t", "$type:", "tot:$tot_pairs",  "exactJE:$exact_calls", $r), "\n";

for my $JE_len (0..75)
{
	my $n_calls=$JE_distr{$JE_len}||0;
	warn join("\t", $JE_len, $n_calls, ($exact_calls ? $n_calls/$exact_calls*100 : 0)), "\n";
}

###R1 and R2 CH methylation difference
# Arguments (positional):
#   1. read 1 sequence
#   2. read 1 reference string (upper-cased here before use)
#   3. read 2 sequence
#   4. read 2 reference string (upper-cased here before use)
#   5. strand flag; when it is 'C' all four strings are reverse-complemented first
# Reference index i is compared with read index i-2 throughout (presumably the
# reference string carries two extra flanking bases on each side - confirm).
# Returns the 11-element list
#   (R1_C, R1_T, R1_CH, R2_C, R2_T, R2_CH, strand, pos, JE_len, pass, is_blunt)
# where *_CH is the number of reference C positions not followed by G, *_C / *_T
# are how many of those the read shows as C / T, pos/JE_len/pass are filled in by
# deduce_CH() on read 2 (defaults -1/0/0) and is_blunt comes from detect_blunt_end().
sub deduce_CH_MD
{
	my ($read1, $read1_ref, $read2, $read2_ref, $strand) = @_;
	$read1_ref = uc $read1_ref;
	$read2_ref = uc $read2_ref;

	if( $strand eq 'C' )
	{
		($read1, $read2, $read1_ref, $read2_ref) =
			map { revcom($_) } ($read1, $read2, $read1_ref, $read2_ref);
	}

	# Shared counter for both reads: walk the candidate reference C positions and
	# tally the read base at (position - 2) for every C that is not followed by G.
	my $tally_CH = sub
	{
		my ($bases, $ref_bases, $c_sites) = @_;
		my ($n_C, $n_T, $n_sites) = (0, 0, 0);
		for my $at (@$c_sites)
		{
			next unless $ref_bases->[$at] eq 'C' && $ref_bases->[$at+1] ne 'G';
			my $called = $bases->[$at-2];
			if($called eq 'C')
			{
				$n_C++;
			}
			elsif($called eq 'T')
			{
				$n_T++;
			}
			$n_sites++;
		}
		return ($n_C, $n_T, $n_sites);
	};

	# Read 1
	my @R1     = split //, $read1;
	my @R1_ref = split //, $read1_ref;
	my @C1_idx = findC($read1_ref, 2);
	my ($R1_C, $R1_T, $R1_CH) = $tally_CH->(\@R1, \@R1_ref, \@C1_idx);

	# Read 2
	my @R2     = split //, $read2;
	my @R2_ref = split //, $read2_ref;
	my @C2_idx = findC($read2_ref, 2);
	my ($R2_C, $R2_T, $R2_CH) = $tally_CH->(\@R2, \@R2_ref, \@C2_idx);

	# Jagged-end call on read 2; deduce_CH() writes through the three references
	# only when it finds a qualifying position.
	my ($pos, $JE_len, $pass) = (-1, 0, 0);
	deduce_CH(\@C2_idx, \@R2_ref, \@R2, \$pos, length($read2), \$JE_len, \$pass);

	#blunt ends
	my $is_blunt = detect_blunt_end($read2, $read2_ref);

	return ($R1_C, $R1_T, $R1_CH, $R2_C, $R2_T, $R2_CH, $strand, $pos, $JE_len, $pass, $is_blunt);
}

# Classify the 3' end of a read from its last base and the reference context.
# Arguments: read sequence, reference string (compared case-sensitively; the
# visible caller passes an upper-cased reference).
# The two reference characters starting three from the end are inspected
# (presumably the base under the read's last position plus the next genomic
# base, given two flanking bases on the reference - confirm).
# Returns  1 if that dinucleotide is 'CC' and the read ends in 'T',
#          0 if it is 'CC' and the read ends in 'C',
#         -1 in every other case, including undefined, empty or too-short input
#            (which previously produced substr/uninitialized warnings first).
sub detect_blunt_end
{
	my ($seq, $ref) = @_;

	# Nothing to inspect: same answer as before, but without warnings.
	return -1 unless defined $seq && defined $ref;
	return -1 if length($seq) < 1 || length($ref) < 3;

	my $di_tag    = substr($ref, -3, 2);   # same span as offset length-3, width 2
	my $last_base = substr($seq, -1, 1);

	return -1 unless $di_tag eq 'CC';
	return 1 if $last_base eq 'T';
	return 0 if $last_base eq 'C';
	return -1;
}

# Find the first candidate position that marks a jagged end in CH context.
# Arguments:
#   1. array ref of candidate reference indices
#   2. array ref of reference bases (one character per element)
#   3. array ref of read bases (one character per element)
#   4. scalar ref, set to the read index of the call
#   5. read length
#   6. scalar ref, set to read length minus the read index of the call
#   7. scalar ref, set to 1 when a call is made
# Reference index i is paired with read index i-2. A call is made at the first i
# where the reference has C at i-1 and i with a non-G at i+1, the read shows C at
# i-2 and T at i-3. The three output references are left untouched otherwise.
sub deduce_CH
{
	my ($C2_idx, $R2_ref, $R2, $pos, $read_len, $JE_len, $pass) = @_;

	for my $i (@$C2_idx)
	{
		my $read_i = $i - 2;

		# The preceding read base (read_i - 1) must really exist. A negative
		# subscript would silently address the END of the read in Perl, which
		# made the old "defined" guard ineffective for $i == 2.
		next if $read_i < 1;

		next unless $R2_ref->[$i] eq 'C' && $R2_ref->[$i+1] ne 'G'
			&& defined $R2_ref->[$i-1] && $R2_ref->[$i-1] eq 'C';

		# Skip positions that fall beyond the read.
		next unless defined $R2->[$read_i] && defined $R2->[$read_i-1];

		if($R2->[$read_i] eq 'C' && $R2->[$read_i-1] eq 'T')
		{
			$$pos    = $read_i;
			$$pass   = 1;
			$$JE_len = $read_len - $read_i;
			last;
		}
	}
}


# CG-context counterpart of the jagged-end search.
# Arguments:
#   1. array ref of candidate reference indices
#   2. array ref of reference bases (one character per element)
#   3. array ref of read bases (one character per element)
#   4. scalar ref, set to the read index of the call
#   5. read length
#   6. scalar ref, set to read length minus the read index of the call
#   7. scalar ref, set to 1 when a call is made
# Reference index i is paired with read index i-2. A call is made at the first i
# where the reference reads C,G,C,G at i-2..i+1, the read shows C at i-2 and T at
# i-4. The three output references are left untouched otherwise.
sub deduce_CG
{
	my ($C2_idx, $R2_ref, $R2, $pos, $read_len, $JE_len, $pass) = @_;

	for my $i (@$C2_idx)
	{
		my $read_i = $i - 2;

		# The read base two positions upstream (read_i - 2) must really exist.
		# A negative subscript would address the END of the read in Perl, so the
		# old "defined" guard did not protect $i == 2 or $i == 3.
		next if $read_i < 2;

		next unless $R2_ref->[$i] eq 'C' && $R2_ref->[$i+1] eq 'G'
			&& defined $R2_ref->[$i-2] && $R2_ref->[$i-2] eq 'C'
			&& defined $R2_ref->[$i-1] && $R2_ref->[$i-1] eq 'G';

		# Skip positions that fall beyond the read.
		next unless defined $R2->[$read_i] && defined $R2->[$read_i-2];

		if($R2->[$read_i] eq 'C' && $R2->[$read_i-2] eq 'T')
		{
			$$pos    = $read_i;
			$$pass   = 1;
			$$JE_len = $read_len - $$pos;
			last;
		}
	}
}


# Reverse-complement a nucleotide string.
# A/C/G/T are swapped with T/G/C/A in either case (case is preserved); every
# other character is only reversed, not translated.
sub revcom
{
	my ($dna) = @_;
	# Scalar assignment forces string reversal; tr/// then complements in place.
	(my $rc = reverse $dna) =~ tr/ACGTacgt/TGCAtgca/;
	return $rc;
}



# List the 0-based positions of upper-case 'C' in a string.
# Arguments: the string, and the offset at which the search starts.
# Only positions smaller than length-2 are reported, i.e. the last two
# characters are never returned; the scan stops at the first 'C' at or beyond
# that limit. Returns the positions in ascending order (empty list if none).
sub findC
{
	my ($seq, $offset) = @_;
	my $limit = length($seq) - 2;
	my @hits;
	for (my $at = index($seq, 'C', $offset);
	     $at != -1 && $at < $limit;
	     $at = index($seq, 'C', $at + 1))
	{
		push @hits, $at;
	}
	return @hits;
}
