#Ruby script that extracts insertion sites from mapped sam file #!/usr/bin/ruby ################################################################################ # # Extract insertion points from mappings in SAM format. # # This script parses a list of read mappings in SAM format and produces genome # annotations, where each annotation represents an insertion point implied by # one or more read mappings. The annotations are generated in GFF3 and bedGraph # formats. The script also displays insertion point counts for each chromosome # in the genome. # # The translation from read mappings to insertion points is specific to the Tn7 # insertion library project. For details, please see the PowerPoint slides # describing this project. # # This script can easily be modified to report only those insertion points # satisfying criteria such as a minimal number of read mappings or the presence # of read mappings in both directions. # # Usage: extract_insertion_points.rb [--stack] [--score] # # Parameters: # # sam-path # # The path of a file in SAM format that contains mappings of short reads on # a target genome. # # Options: # # --stack # # Produce GFF3 output that contains an entry for each read mapping at an # insertion point, instead of just one entry for each insertion point. This # might be convenient for use with some annotation viewers that stack # overlapping annotations; the height of the stack will then indicate the # number of mappings found at each insertion point. # # --score # # Produce GFF3 output that stores the read mapping count at each insertion # point in the GFF3 'score' field. Otherwise the field contains a period, # which is the GFF3 way of indicating that no value is intended. # ################################################################################ require 'pp' class InsertionAnnotator attr_accessor :forward_insertions attr_accessor :reverse_insertions attr_accessor :combined_insertions public def initialize(sam_path, stack = false, score = false) @sam_path = sam_path @stack = stack @score = score @forward_insertions = {} @reverse_insertions = {} @combined_insertions = {} end public def annotate @max_count = 0 File.open(@sam_path, 'r') do |sam_file| line_count = 0 while line = sam_file.gets next if line.empty? || (line[0] == '@') qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual = line.split("\t") flag = flag.to_i if flag rname = rname.intern if rname pos = pos.to_i if pos seq = seq.strip if seq next if !flag || !rname || !pos || !seq direction = (flag && (flag & 0x10 != 0)) ? :reverse : :forward # The sequence length can also be obtained from the CIGAR field, e.g. for input # files in which the seq field is not provided. length = seq.length if direction == :reverse insertion_pos = pos + length - 6 else insertion_pos = pos - 1 end insertions = (direction == :reverse) ? @reverse_insertions : @forward_insertions insertions[rname] ||= {} insertion = insertions[rname][insertion_pos] if insertion insertion[:count] += 1 @max_count = [@max_count, insertion[:count]].max else insertions[rname][insertion_pos] = { :chromosome => rname, :position => insertion_pos, :direction => direction, :count => 1 } @max_count = 1 if @max_count < 1 end line_count += 1 puts "Processed #{line_count} lines\n" if line_count % 100 == 0 end end @combined_insertions = Marshal.load(Marshal.dump(@forward_insertions)) @reverse_insertions.each do |chromosome, insertions| insertions.each do |position, insertion| @combined_insertions[chromosome] ||= {} existing_insertion = @combined_insertions[chromosome][position] if existing_insertion existing_insertion[:direction] = :both existing_insertion[:count] += insertion[:count] else @combined_insertions[chromosome][position] = Marshal.load(Marshal.dump(insertion)) end end end @forward_insertions = @forward_insertions.values.collect { |insertions| insertions.values }.flatten(1).sort_by { | insertion| [ insertion[:chromosome].to_s, insertion[:position] ] } @reverse_insertions = @reverse_insertions.values.collect { |insertions| insertions.values }.flatten(1).sort_by { | insertion| [ insertion[:chromosome].to_s, insertion[:position] ] } @combined_insertions = @combined_insertions.values.collect { |insertions| insertions.values }.flatten(1).sort_by { | insertion| [ insertion[:chromosome].to_s, insertion[:position] ] } print_all_counts create_gff3_insertion_annotations('forward_insertions.gff3', @forward_insertions) create_gff3_insertion_annotations('reverse_insertions.gff3', @reverse_insertions) create_gff3_insertion_annotations('insertions.gff3', @combined_insertions) create_bedGraph_insertion_annotations('forward_insertions.bedGraph', @forward_insertions) create_bedGraph_insertion_annotations('reverse_insertions.bedGraph', @reverse_insertions) create_bedGraph_insertion_annotations('insertions.bedGraph', @combined_insertions) end protected def print_all_counts counts = {} @combined_insertions.each do |i| c = i[:chromosome] if counts[c] counts[c] += 1 else counts[c] = 1 end end puts "Insertion point counts (all insertions)\n" print_counts(counts) puts "\n\n" counts = {} @combined_insertions.each do |i| next if i[:direction] != :both c=i[:chromosome] if counts[c] counts[c] += 1 else counts[c] = 1 end end puts "Insertion point counts (insertions read in both directions)\n" print_counts(counts) puts "\n\n" counts = {} @combined_insertions.each do |i| next if i[:count] <= 10 c=i[:chromosome] if counts[c] counts[c] += 1 else counts[c] = 1 end end puts "Insertion point counts (insertions with > 10 reads)\n" print_counts(counts) puts "\n\n" counts = {} @combined_insertions.each do |i| next if i[:direction] != :both next if i[:count] <= 10 c=i[:chromosome] if counts[c] counts[c] += 1 else counts[c] = 1 end end puts "Insertion point counts (insertions read in both directions, with > 10 reads)\n" print_counts(counts) puts "\n\n" end protected def print_counts(counts) total = 0 counts.keys.sort_by { |chromosome| chromosome.to_s.split(/(\d+)/) }.each do |chromosome| total += counts[chromosome] puts "#{chromosome}\t#{counts[chromosome]}\n" end puts "Total\t#{total}\n" end protected def create_bedGraph_insertion_annotations(path, insertions) File.open(path, 'w') do |file| insertions.each do |insertion| count = insertion[:count] bg_chrom = insertion[:chromosome] # Coordinates in the bedGraph format are zero-based, half-open. bg_chromStart = insertion[:position] bg_chromEnd = bg_chromStart bg_dataValue = count line = "#{bg_chrom}\t#{bg_chromStart}\t#{bg_chromEnd}\t#{bg_dataValue}\n" file.write line end end end protected def create_gff3_insertion_annotations(path, insertions) File.open(path, 'w') do |file| insertions.each do |insertion| count = insertion[:count] direction = insertion[:direction].to_s # The GFF3 seqid field contains a "landmark" relative to which positions are # expressed; this is the name of the sequence, e.g. the chromosome. gff3_seqid = insertion[:chromosome] gff3_type = 'insertion_site' # aka SO:0000366 # In GFF3 annotations of insertion points, the position given should indicate # the base to the left of the site. gff3_start = insertion[:position] gff3_end = gff3_start # The GFF3 score field is deliberately ill-defined and can store any # floating-point score. We write a period, the GFF way of indicating an # undefined field value. gff3_score = '.' # If the score option was enabled, we give a score in the range 0-100. The # score is a percant value equal to this insertion's read count divided by # the maximum read count of any insertion. # gff3_score = (count / @max_count * 100).to_s if @score gff3_score = count.to_s if @score # The GFF3 phase field is not used for insertions. gff3_phase = '.' line = "#{gff3_seqid}\tmrsfast-jeremy\t#{gff3_type}\t#{gff3_start}\t#{gff3_end}\t#{gff3_score}\t+\t#{gff3_phase}\tCount=#{count};Direction=#{direction}\n" if @stack count.times { file.write line } else file.write line end end end end end sam_path = ARGV[0] stack = ARGV.include?('--stack') score = ARGV.include?('--score') InsertionAnnotator.new(sam_path, stack, score).annotate #Python script that counts number of insertions/gene import sys insertFilePath = sys.argv[1] chromFilePath = sys.argv[2] inserts = [] with open(insertFilePath, 'r') as insertFile: for line in insertFile: if line: line = line.strip() inserts.append(int(line.split('\t')[1])) countMap = {} with open(chromFilePath, 'r') as chromFile: for line in chromFile: if line: line = line.strip() values = line.split('\t') chromRange = (int(values[2]), int(values[3]), values[4], values[1]) countMap[chromRange] = 0 for insert in inserts: if (insert >= chromRange[0]) and (insert <= chromRange[1]): countMap[chromRange] += 1 print insert for key in countMap.keys(): print '%s\t%d' % (str(key), countMap[key])