#!/usr/bin/perl -w use Statistics::Distributions; while() { chomp; my ($dist,$word,$count) = split; $$dist{$word} = $count; if (length $word == 2) { $ditotal{$dist} += $count; } else { $monototal{$dist} += $count; } } # check independence of background and target --- ratio of dinucs to mono products print "Observed-expected ratios of the target\n"; print " "; foreach my $s (qw/A C G T/) { printf "%-6s ",$s; } print "\n"; %rowsum = (); %colsum = (); foreach my $f (qw/A C G T/) { printf "%-6s ",$f; foreach my $s (qw/A C G T/) { my $d = "$f$s"; my $o = $t{$d} / $ditotal{t}; $rowsum{$f} += $t{$d}; $colsum{$s} += $t{$d}; my $e = ($t{$f} * $t{$s}) / ($monototal{t}**2); printf ("%5.4f ",($o/$e)); $cell += $t{$d} * log $t{$d}; } $rowcol += $rowsum{$f} * log ($rowsum{$f}); print "\n"; } print "\n"; foreach my $s (qw/A C G T/) { $rowcol += $colsum{$s} * log ($colsum{$s}); } $G = 2 * ($cell - $rowcol + ($ditotal{t} * log $ditotal{t})); printf "The G-test statistic G for the target = %6.4f with 9 degrees of freedom.\n",$G; $cp = Statistics::Distributions::chisqrprob (9,$G); printf "The probability that sites are independent in the target is %6.4f\n",$cp; undef $G; undef $cell; undef $rowcol; print "\n"; print "Observed-expected ratios of the background\n"; print " "; foreach my $s (qw/A C G T/) { printf "%-6s ",$s; } print "\n"; %rowsum = (); %colsum = (); foreach my $f (qw/A C G T/) { printf "%-6s ",$f; foreach my $s (qw/A C G T/) { my $d = "$f$s"; my $o = $b{$d} / $ditotal{b}; $rowsum{$f} += $b{$d}; $colsum{$s} += $b{$d}; my $e = ($b{$f} * $b{$s}) / ($monototal{b}**2); printf ("%5.4f ",($o/$e)); $cell += $b{$d} * log $b{$d}; } $rowcol += $rowsum{$f} * log ($rowsum{$f}); print "\n"; } print "\n"; foreach my $s (qw/A C G T/) { $rowcol += $colsum{$s} * log ($colsum{$s}); } $G = 2 * ($cell - $rowcol + ($ditotal{b} * log $ditotal{b})); printf "The G-test statistic G for the target = %6.4f with 9 degrees of freedom.\n",$G; $cp = Statistics::Distributions::chisqrprob (9,$G); printf "The probability that sites are independent in the target is %6.4f\n",$cp; undef $G; print "\n"; print "Conditional dinucleotide ratios of target over background\n"; print " "; foreach my $s (qw/A C G T/) { printf "%-6s ",$s; } print "\n"; foreach my $f (qw/A C G T/) { printf "%-6s ",$f; foreach my $s (qw/A C G T/) { my $d = "$f$s"; printf ("%5.4f ",(($t{$d}/$ditotal{t})/($t{$f}/$monototal{t})) / (($b{$d}/$ditotal{b})/($b{$f}/$monototal{b}))); } print "\n"; } print "\n"; __DATA__ t A 9162 t C 8630 t G 10694 t T 10880 t AA 2471 t AC 1421 t AG 2768 t AT 2502 t CA 1958 t CC 1995 t CG 2254 t CT 2423 t GA 2490 t GC 2244 t GG 2991 t GT 2969 t TA 2322 t TC 3007 t TG 2462 t TT 3089 b A 5137356 b C 825895 b G 823707 b T 5139932 b AA 2982119 b AC 314202 b AG 257294 b AT 1583741 b CA 380463 b CC 138835 b CG 46357 b CT 260240 b GA 312201 b GC 58223 b GG 139153 b GT 314130 b TA 1464940 b TC 314553 b TG 381002 b TT 2979437