#!/usr/bin/perl -w
#===============================================================================
#
#         FILE:  methylation_vs_size.mm9.pl 
#        USAGE:  methylation_vs_size.mm9.pl  
#  DESCRIPTION:  calculate size band methylation
# 		 If read1 and read2 overlap, consistent CpGs in the overlapped region are
# 		 counted only once, while read pairs whose overlapping sequences are
# 		 inconsistent are skipped.
#       AUTHOR:  lemon (NIMeng), nimeng@cuhk.edu.hk
#      COMPANY:  CPY@CUHK
#      VERSION:  0.1
#      CREATED:  06/03/2020 22:07:22 PM
#     REVISION:  0
#===============================================================================

use strict;

# An output prefix plus at least one bsalign input file are required.
die ("perl $0 <outprefix> <bsalign>\n") unless @ARGV >=2;

my $W_CG_bed="./mm9.Chr1.W.CG.bed";   #### user-defined genome.W.CG.bed 
my $C_CG_bed="./mm9.Chr1.C.rev.CG.bed";   #### user-defined genome.C.rev.CG.bed 
my $op=shift;	# output prefix; whatever remains in @ARGV is the list of input files
my %h;	# strand label ("W" or "C") => hash ref of CpG site keys from load_cpg

# Three-argument open so the prefix is never parsed as a mode, and fail now
# rather than after all the counting if the result file cannot be created.
open(OUT, '>', "$op.meth") or die "Could not open $op.meth for writing: $!\n";
# Load the CpG sites listed in a BED file into a lookup table.
#
# Argument: $bed - path of a whitespace-separated BED file; only the first
#                  (chromosome) and third (end) columns are read.
# Returns:  a hash ref with one key per record, "<chrom>\t<end>". The value is
#           always "" because callers only test for key existence.
# Dies:     when the file cannot be opened.
sub load_cpg
{
	my $bed=shift;
	# Lexical handle + three-argument open: the path is taken literally (no
	# mode characters or pipes are interpreted) and no global handle is shared.
	open(my $bed_fh, '<', $bed) or die "Could not open $bed $!";
	my %ref;
	while(my $record=<$bed_fh>)
	{
		my @F=split ' ',$record;
		# Blank or truncated records have no end column; indexing them would
		# only add bogus keys and raise undef warnings.
		next if @F<3;
		# BED starts are 0 based, so the end column is used as the site coordinate.
		$ref{$F[0]."\t".$F[2]}="";
	}
	close $bed_fh;
	return \%ref;
}


# Fill the per-strand CpG lookup tables: the "W" table first, then the "C" table.
for my $strand_bed (["W",$W_CG_bed],["C",$C_CG_bed])
{
	my ($strand,$bed)=@$strand_bed;
	$h{$strand}=load_cpg($bed);
}

my %meth;	# fragment size => { C => methylated calls, T => unmethylated calls }

# Tally CpG calls for one stretch of read sequence.
#
# Arguments: $sites - hash ref of known CpG keys ("<chrom>\t<position>")
#            $chr   - chromosome name used to build the lookup key
#            $first - genomic position of the first base of $seq
#            $last  - last genomic position to inspect
#            $seq   - read bases; base i is assumed to sit at $first+i
# Returns:   (known CpG positions seen, "C" calls, "T" calls)
sub _tally_cpg_calls
{
	my ($sites,$chr,$first,$last,$seq)=@_;
	my ($covered,$c_calls,$t_calls)=(0,0,0);
	my @bases=split //,$seq;
	my $idx=0;
	for my $pos ($first..$last)
	{
		if(exists $sites->{$chr."\t".$pos})
		{
			# A position past the end of $seq still counts as covered but
			# contributes neither a C nor a T call.
			my $bas=$idx<@bases ? $bases[$idx] : "";
			$covered++;
			$c_calls++ if $bas eq "C";
			$t_calls++ if $bas eq "T";
		}
		$idx++;
	}
	return ($covered,$c_calls,$t_calls);
}

# Each input file holds read pairs on consecutive lines: read1, then read2.
# Tab-separated columns as used here: [1] read sequence, [7] chromosome,
# [8] leftmost position, last column = strand label used as the key into %h.
for my $file(@ARGV)
{
	open(my $in, '<', $file) or die "Could not open $file $!\n";
	while (defined(my $line1=<$in>))
	{
		my $line2=<$in>;
		unless (defined $line2)
		{
			# Odd number of lines: the final read has no mate to pair with.
			warn "$file: last read has no mate line and is ignored\n";
			last;
		}
		chomp($line1,$line2);
		my @sp1=split /\t/,$line1;
		my @sp2=split /\t/,$line2;
		# Column 8 (the position) must exist on both mates; shorter records
		# cannot be placed on the genome.
		next if @sp1<9 || @sp2<9;

		my ($seq1,$chr1,$start1,$strand1)=@sp1[1,7,8,-1];
		my ($seq2,$chr2,$start2,$strand2)=@sp2[1,7,8,-1];
		my $len1=length($seq1);
		my $len2=length($seq2);
		my $end1=$start1+$len1-1;
		my $end2=$start2+$len2-1;
		my $size=$start2+$len2-$start1; #stranded sequencing
		# An unknown strand label simply has no CpG sites.
		my $sites1=$h{$strand1}||{};
		my $sites2=$h{$strand2}||{};
		my($meth_nu, $unmeth_nu, $tot)=(0,0,0);
		#HWI-ST1049:1:1101:1220:2101#0/1 TTTTTGTTTTTTGAGTTTAATTAAGTTTTTTGATTGGGTTTTTCGAGTAGTTGGGATTATAGGTATGTGTTATTA	1       a       75      +       chr9    5849104	0       75M     75      W
		#HWI-ST1049:1:1101:1220:2101#0/2 ATAGGTATGTGTTATTATATTTAGTTAATTTTTGTATTTTTAGTAGAGATAGGGTTTTATTATGTTGGTTAGGAG	1       a       75      -       chr9    5849162	0       75M     75      W

		if ($end1 >= $start2)#read1 and read2 overlap
		{
			my $ol=$end1-$start2+1;
			my $subR1=substr $seq1,-$ol;
			my $subR2=substr $seq2,0,$ol;
			# Mates that disagree inside the overlap are dropped entirely.
			next if $subR1 ne $subR2;
			# Both reads are tallied in full below, so the overlap would be
			# counted twice; subtract one copy of it up front.
			my ($n,$c,$t)=_tally_cpg_calls($sites1,$chr1,$start2,$end1,$subR1);
			$tot-=$n;
			$meth_nu-=$c;
			$unmeth_nu-=$t;
		}

		for my $read ([$sites1,$chr1,$start1,$end1,$seq1],
		              [$sites2,$chr2,$start2,$end2,$seq2])
		{
			my ($n,$c,$t)=_tally_cpg_calls(@$read);
			$tot+=$n;
			$meth_nu+=$c;
			$unmeth_nu+=$t;
		}

		# Only fragments covering at least one known CpG contribute.
		if($tot>=1)
		{
			$meth{$size}{'C'}+=$meth_nu;
			$meth{$size}{'T'}+=$unmeth_nu;
		}
	}
	close $in;
}

# Write one row per fragment size from 0 to 600: size, methylated (C) calls,
# unmethylated (T) calls, and methylation percentage C/(C+T)*100.
# Sizes outside 0..600 are not reported.
for my $i (0..600)
{
	my $c=$meth{$i}{C}||0;
	my $t=$meth{$i}{T}||0;
	# "||1" keeps sizes without any calls from dividing by zero; they print 0.0000.
	my $m=sprintf "%.4f",$c/(($t+$c)||1)*100;
	print OUT join("\t",$i,$c,$t,$m),"\n";
}
# Buffered write errors (e.g. a full disk) only surface at close, so check it.
close(OUT) or die "Could not finish writing $op.meth: $!\n";
