#!/usr/bin/perl -w # # This file looks for nested TEs using the RepeatMasker rmsk files as input. # A TE has suffered an insertion if another TE has inserted into it. # The TE that suffered the insertion is the nesting TE. The TE that inserted # is the nested TE. # To find a TE that has nested within another TE, the script loops through # each rmsk file. For each TE, the previous and next TE in the file are # compared. If the previous and next TEs have the same repName and strand, # the genoEnd of the previous TE is within 50 bp of the genoStart of the # nested TE, the genoStart of the next TE is within 50 bp of the genoEnd of # the nested TE, and the repEnd of the previous TE is within +/- 20 bp of # the repStart of the next TE, then the TE is determined to be nested # within the previous/next TE. # The data is output to a comma separated value file. # my $usage = "perl find_nestings.pl chromosome\n"; my $Chr = shift or die $usage; # Declare the variables that will be needed @nesting_array = (); # Open the files that will be needed. # The input files used in this analysis were shortened .rmsk files. # Each .rmsk file had all but the LINEs, SINEs, DNAs and LTRs removed. open(FILE1, "<$Chr"."_rmsk.short.txt"); # The output file is a comma separated value file that can be imported into Excel. open(FILE2, ">>$Chr".".nested.csv"); # Open the rmsk file and put all of the lines into an array. while() { chomp($temp = $_); push(@nesting_array, $temp); } close(FILE1); # Loop through the array and look for nestings for (my $i = 1; $i < $#nesting_array; $i++) { #Get the fields for the current record my ($milliDiv, $milliDel, $milliIns, $genoName, $genoStart, $genoEnd, $genoLeft, $strand, $repName, $repClass, $repFamily, $repStart, $repEnd, $repLeft) = split('\t', $nesting_array[$i]); #Get the fields for the previous record my ($milliDivU, $milliDelU, $milliInsU, $genoNameU, $genoStartU, $genoEndU, $genoLeftU, $strandU, $repNameU, $repClassU, $repFamilyU, $repStartU, $repEndU, $repLeftU) = split('\t', $nesting_array[$i - 1]); #Get the fields for the next record my ($milliDivD, $milliDelD, $milliInsD, $genoNameD, $genoStartD, $genoEndD, $genoLeftD, $strandD, $repNameD, $repClassD, $repFamilyD, $repStartD, $repEndD, $repLeftD) = split('\t', $nesting_array[$i + 1]); # Tests for nesting, cluster, or unknown # Find potential nestings # Check to see if the upstream and downstream repNames and strands are the same if (($repNameU eq $repNameD) && ($strandU eq $strandD)) { # Check to see if the upstream genoEnd is within 50 bp of the genoStart of this TE and if the # downstream genoStart is within 50 bp of the genoEnd of this TE if ((($genoStart - $genoEndU) <= 50) && (($genoStartD - $genoEnd) <= 50)) { # Check to see if the upstream and downstream TE repeat positions are within 20 bp of each other. # First, check the + strand if (($strandU eq '+') && (($repStartD - $repEndU) <= 20) && (($repStartD - $repEndU) >= -20)) { # Print the data if ($strand eq '+') { print FILE2 "$milliDiv,$genoName,$genoStart,$genoEnd,$strand,$repName,$repClass,$repStart,$repEnd,$repNameU,$repClassU,$milliDivU,$genoEndU,$strandU,$repStartU,$repEndU,$repNameD,$repClassD,$milliDivD,$genoStartD,$strandD,$repStartD,$repEndD\n"; } if ($strand eq '-') { print FILE2 "$milliDiv,$genoName,$genoStart,$genoEnd,$strand,$repName,$repClass,$repLeft,$repEnd,$repNameU,$repClassU,$milliDivU,$genoEndU,$strandU,$repStartU,$repEndU,$repNameD,$repClassD,$milliDivD,$genoStartD,$strandD,$repStartD,$repEndD\n"; } } # Next, check the - strand if (($strandU eq '-') && (($repLeftU - $repEndD) <= 20) && (($repLeftU - $repEndD) >= -20)) { # Print the data if ($strand eq '+') { print FILE2 "$milliDiv,$genoName,$genoStart,$genoEnd,$strand,$repName,$repClass,$repStart,$repEnd,$repNameU,$repClassU,$milliDivU,$genoEndU,$strandU,$repLeftU,$repEndU,$repNameD,$repClassD,$milliDivD,$genoStartD,$strandD,$repLeftD,$repEndD\n"; } if ($strand eq '-') { print FILE2 "$milliDiv,$genoName,$genoStart,$genoEnd,$strand,$repName,$repClass,$repLeft,$repEnd,$repNameU,$repClassU,$milliDivU,$genoEndU,$strandU,$repLeftU,$repEndU,$repNameD,$repClassD,$milliDivD,$genoStartD,$strandD,$repLeftD,$repEndD\n"; } } } } } close(FILE2); exit;