#!/usr/bin/perl -w my $usage = "perl parse_rm_align_file_paml_format.pl chromosome\n"; my $Chr = shift or die $usage; #Declare variables my @AlignArray = (); my %OutHash = (); my $Left = 0; my $RepName = ''; my $StartRow = 0; my $EndRow = 0; my $ConsString = ''; my $ChrString = ''; my $LoopNumber = 0; my $LastFound = 'E'; my $Complement = 'N'; #Create the align file name my $AlignFileName="<$Chr.fa.align"; #Create the out file name my $OutFileName="<$Chr.fa.out"; #Open the .align file for input open(FILE1, $AlignFileName); #Open the .out file for input open(FILE2, $OutFileName); print "Opening the align file\n"; #Load the align file into an array while() { #Push the entire line into the array row push(@AlignArray, $_); } print "Done\n"; print "Opening the out file\n"; #Load the out file into a hash while() { #Get only the left and the repname from the .out file and add them to the %OutHash my($Key, $Value) = (split)[7,9]; if (defined($Key)) { $OutHash{$Key} = $Value; } } print "Done\n"; #Loop through the AlignArray and parse it for (my $i = 0; $i <= $#AlignArray; $i++) { #Check to see if the row begins with a number. If so, this is the beginning of a new section if ((substr($AlignArray[$i], 0, 1) =~ /[0-9]/) && ($LastFound eq 'E')) { #Get the left and the repname my($Temp1, $Temp2, $Temp3, $Temp4, $Temp5, $Temp6, $Temp7, $Left, $Temp8, $Temp9, $Temp10, $Temp11, $Temp12); ($Temp1, $Temp2, $Temp3, $Temp4, $Temp5, $Temp6, $Temp7, $Left, $Temp8, $Temp9, $Temp10, $Temp11, $Temp12) = split(' ', $AlignArray[$i]); $RepName = ''; #Now that we have the left, we can get the correct repname from the %OutHash if(defined($OutHash{$Left})) { $RepName = $OutHash{$Left}; } #Set $StartRow, used to mark the beginning of a section $StartRow = $i; $LastFound = 'S'; } #Now we need to find the end of the section if (((substr($AlignArray[$i], 0, 5)) eq 'Trans') && ($LastFound eq 'S')) { #Set $EndRow, used to mark the end of a section $EndRow = $i; #Now, do a "sub-loop", from $StartRow to $EndRow, finding the sequences and writing them to files #First, figure out how many sequence sections need to be looped through $LoopNumber = (($EndRow - $StartRow - 2) / 4); #Begin at the first row of the consensus sequence $Counter1 = $StartRow + 4; #Begin at the first row of the chromosome sequence $Counter2 = $StartRow + 2; #Clear $ConsString $ConsString = ''; #Clear $ChrString $ChrString = ''; for (my $j = 1; $j <= $LoopNumber; $j++) { my ($TempString1, $Temp1) = split(' ', substr($AlignArray[$Counter1], 25)); $ConsString = $ConsString.$TempString1; #Remove all but ATGC from $ConsString $ConsString =~ s/[^ATGC]/-/g; if (substr($AlignArray[$Counter1], 0, 1) eq 'C') { $Complement = 'Y'; } else { $Complement = 'N'; } $Counter1 = $Counter1 + 4; } for (my $k = 1; $k <= $LoopNumber; $k++) { my ($TempString2, $Temp2) = split(' ', substr($AlignArray[$Counter2], 25)); $ChrString = $ChrString.$TempString2; #Remove all but ATGC from $ConsString $ChrString =~ s/[^ATGC]/-/g; $Counter2 = $Counter2 + 4; } #If the Consensus sequence is the complement, I need to remove the GC dinucleotides and #the corresponding sequences in the chromosome sequence if ($Complement eq 'Y') { for (my $c = 0; $c < length($ConsString); $c++) { my $IndexPlace = index($ConsString, 'GC'); if ($IndexPlace != -1) { $ConsString = substr($ConsString, 0, ($IndexPlace)).substr($ConsString, $IndexPlace + 2); $ChrString = substr($ChrString, 0, ($IndexPlace)).substr($ChrString, $IndexPlace + 2); } } for (my $e = 0; $e < length($ConsString); $e++) { my $IndexPlace = index($ConsString, '-'); if ($IndexPlace != -1) { $ConsString = substr($ConsString, 0, ($IndexPlace)).substr($ConsString, $IndexPlace + 1); $ChrString = substr($ChrString, 0, ($IndexPlace)).substr($ChrString, $IndexPlace + 1); } } for (my $f = 0; $f < length($ChrString); $f++) { my $IndexPlace = index($ChrString, '-'); if ($IndexPlace != -1) { $ConsString = substr($ConsString, 0, ($IndexPlace)).substr($ConsString, $IndexPlace + 1); $ChrString = substr($ChrString, 0, ($IndexPlace)).substr($ChrString, $IndexPlace + 1); } } } #If the Consensus sequence is not the complement, I need to remove the CG dinucleotides and #the corresponding sequences in the chromosome sequence if ($Complement eq 'N') { for (my $d = 0; $d < length($ConsString); $d++) { my $IndexPlace2 = index($ConsString, 'CG'); if ($IndexPlace2 != -1) { $ConsString = substr($ConsString, 0, ($IndexPlace2)).substr($ConsString, $IndexPlace2 + 2); $ChrString = substr($ChrString, 0, ($IndexPlace2)).substr($ChrString, $IndexPlace2 + 2); } } for (my $g = 0; $g < length($ConsString); $g++) { my $IndexPlace = index($ConsString, '-'); if ($IndexPlace != -1) { $ConsString = substr($ConsString, 0, ($IndexPlace)).substr($ConsString, $IndexPlace + 1); $ChrString = substr($ChrString, 0, ($IndexPlace)).substr($ChrString, $IndexPlace + 1); } } for (my $h = 0; $h < length($ChrString); $h++) { my $IndexPlace = index($ChrString, '-'); if ($IndexPlace != -1) { $ConsString = substr($ConsString, 0, ($IndexPlace)).substr($ConsString, $IndexPlace + 1); $ChrString = substr($ChrString, 0, ($IndexPlace)).substr($ChrString, $IndexPlace + 1); } } } #Open the file to print the chromosome sequences to #Make sure there are no slashes in the RepName $RepName =~ s/\//-/; $FileName = ">>".$RepName.".chr"; open(CHRFILE, $FileName) or die "Could not open $FileName\n"; #Print the $ChrString print CHRFILE "$ChrString"; #Close the file close(CHRFILE); #Open the file to print the consensus sequences to $FileName = ">>".$RepName.".con"; open(CONFILE, $FileName) or die "Could not open $FileName\n";; #Print the $ConsString print CONFILE "$ConsString"; #Close the file close(CONFILE); $LastFound = 'E'; } } close(FILE1); close(FILE2); exit;