[BioRuby-cvs] bioruby/lib/bio/appl/hmmer report.rb,1.8,1.9
Katayama Toshiaki
k at pub.open-bio.org
Mon Oct 31 04:12:05 EST 2005
Update of /home/repository/bioruby/bioruby/lib/bio/appl/hmmer
In directory pub.open-bio.org:/tmp/cvs-serv31374/lib/bio/appl/hmmer
Modified Files:
report.rb
Log Message:
* Rewrited and contributed by Masashi Fujita
* DELIMITER, RS is defined for multiple reports
* Fixed for multiple reports - not expect first few banner lines
* Fixed for a bug that hangs on no hit
* Added support for rare tags such as CS and RF
* Bio::HMMER::Report#Hsp#csline, rfline methods are added
* Bio::HMMER::Report#hsps method is added for the result which contains
HSP but no Hit (caused by some combinations of threshold parameters of
HSP and Hit).
* Returned strings of description etc. is now stripped
* Changed unnecessary accessers to readers
Index: report.rb
===================================================================
RCS file: /home/repository/bioruby/bioruby/lib/bio/appl/hmmer/report.rb,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** report.rb 26 Sep 2005 13:00:05 -0000 1.8
--- report.rb 31 Oct 2005 09:12:03 -0000 1.9
***************
*** 1,6 ****
#
! # bio/appl/hmmer/report.rb - hmmsearch, hmmpfam parser
#
# Copyright (C) 2002 Hiroshi Suga <suga at biophys.kyoto-u.ac.jp>
#
# This library is free software; you can redistribute it and/or
--- 1,7 ----
#
! # bio/appl/hmmer/report.rb - hmmsearch, hmmpfam parserer
#
# Copyright (C) 2002 Hiroshi Suga <suga at biophys.kyoto-u.ac.jp>
+ # Copyright (C) 2005 Masashi Fujita <fujita at kuicr.kyoto-u.ac.jp>
#
# This library is free software; you can redistribute it and/or
***************
*** 25,123 ****
module Bio
class HMMER
class Report
def initialize(data)
- # HMM and sequence profiles
- data.sub!(/(.+ -$\n)(.+ -$\n)\n(.+?\n\n)Scores/m, '')
! @program = parse_program($1)
! @parameter = parse_parameter($2)
! @query_info =parse_query_info($3)
! case @program['name']
! when /hmmsearch/
! is_hmmsearch = true
else
! is_hmmsearch = false # hmmpfam
end
! # Scores for complete sequences. (parsed to Hit objects)
! data.sub!(/.+-$\n(.+?)\n\nParsed/m, '')
! @hits = []
! $1.each do |l|
! @hits.push(Hit.new(l))
! end
! # Scores for domains. (parsed to Hsp objects)
! data.sub!(/.+-$\n(.+?)\n\nAlignments of top-scoring domains:\n/m, '')
! hsps=[]
! $1.each do |l|
! hsps.push(Hsp.new(l,is_hmmsearch))
! end
! # Alignments
! if is_hmmsearch
! data.sub!(/(.+?)\n\n\nHistogram of all scores:\n/m, '')
! else
! data.sub!(/(.+?)\n\n\/\//m, '')
end
! $1.split(/^\S+.*?\n/).slice(1..-1).each_with_index do |al,k|
! al2 = al.gsub(/\n\n/,"\n").to_s.collect { |l|
! l.sub(/^.{19}/,'').sub(/\s(\d+|-)\s*$/,'')
! }
! align = ['', '', '']
! al2.each_with_index { |s,i| align[i%3] += s.chomp }
! align.each { |a| a.sub!(/^.{3}(.*).{3}$/, '\1') }
! hsps[k].hmmseq << align[0]
! hsps[k].midline << align[1]
! hsps[k].flatseq << align[2]
end
! hsps.each do |s|
! @hits.each do |h|
! if h.accession == s.accession
! h.hsps.push(s)
! next
! end
end
end
if is_hmmsearch
! data.sub!(/(.+?)\n\n\n%/m, '')
! @histogram = $1
! @statistical_detail = {}
! data.sub!(/(.+?)\n\n/m, '')
! $1.each do |l|
! @statistical_detail[$1] = $2.to_f if /^\s*(.+)\s*=\s*(\S+)/ =~ l
! end
! @total_seq_searched = nil
! data.sub!(/(.+?)\n\n/m, '')
! $1.each do |l|
! @total_seq_searched = $2.to_i if /^\s*(.+)\s*:\s*(\S+)/ =~ l
end
! @whole_seq_top_hits = {}
! data.sub!(/(.+?)\n\n/m, '')
! $1.each do |l|
! @whole_seq_top_hits[$1] = $2.to_i if /^\s*(.+)\s*:\s*(\S+)/ =~ l
end
! @domain_top_hits = {}
! data.each do |l|
! @domain_top_hits[$1] = $2.to_i if /^\s*(.+)\s*:\s*(\S+)/ =~ l
end
end
- end
- attr_reader :program, :parameter, :query_info, :hits,
- :histogram, :statistical_detail, :total_seq_searched,
- :whole_seq_top_hits, :domain_top_hits
! def each
! @hits.each do |x|
! yield x
end
end
class Hsp
def initialize(data, is_hmmsearch)
--- 26,147 ----
module Bio
class HMMER
+
+ def self.reports(input)
+ ary = []
+ input.each("\n//\n") do |data|
+ if block_given?
+ yield Report.new(data)
+ else
+ ary << Report.new(data)
+ end
+ end
+ return ary
+ end
+
+
+ # Bio::HMMER::Report
class Report
+ # for Bio::FlatFile support
+ DELIMITER = RS = "\n//\n"
+
def initialize(data)
! # The input data is divided into six data fields, i.e. header,
! # query infomation, hits, HSPs, alignments and search statistics.
! # However, header and statistics data don't necessarily exist.
! subdata, is_hmmsearch = get_subdata(data)
! # if header exists, parse it
! if subdata["header"]
! @program, @parameter = parse_header_data(subdata["header"])
else
! @program, @parameter = [{}, {}]
end
! @query_info = parse_query_info(subdata["query"])
! @hits = parse_hit_data(subdata["hit"])
! @hsps = parse_hsp_data(subdata["hsp"], is_hmmsearch)
! if @hsps != []
! # split alignment subdata into an array of alignments
! aln_ary = subdata["alignment"].split(/^\S+.*?\n/).slice(1..-1)
! # append alignment information to corresponding Hsp
! aln_ary.each_with_index do |aln, i|
! @hsps[i].set_alignment(aln)
! end
end
!
! # assign each Hsp object to its parent Hit
! hits_hash = {}
! @hits.each do |hit|
! hits_hash[hit.accession] = hit
end
! @hsps.each do |hsp|
! if hits_hash.has_key?(hsp.accession)
! hits_hash[hsp.accession].append_hsp(hsp)
end
end
+
+ # parse statistics (for hmmsearch)
if is_hmmsearch
! @histogram, @statistical_detail, @total_seq_searched, \
! @whole_seq_top_hits, @domain_top_hits = \
! parse_stat_data(subdata["statistics"])
! end
! end
! attr_reader :program, :parameter, :query_info, :hits, :hsps,
! :histogram, :statistical_detail, :total_seq_searched,
! :whole_seq_top_hits, :domain_top_hits
!
! def each
! @hits.each do |x|
! yield x
! end
! end
!
!
! # Bio::HMMER::Report::Hit
! class Hit
! def initialize(data)
! @hsps = Array.new
! if /^(\S+)\s+(.*?)\s+(\S+)\s+(\S+)\s+(\S+)$/ =~ data
! @accession, @description, @score, @evalue, @num =
! [$1, $2, $3.to_f, $4.to_f, $5.to_i]
end
+ end
+ attr_reader :hsps, :accession, :description, :score, :evalue, :num
! def each
! @hsps.each do |x|
! yield x
end
+ end
! alias target_id accession
! alias hit_id accession
! alias entry_id accession
! alias definition description
! alias bit_score score
!
! def target_def
! if @hsps.size == 1
! "<#{@hsps[0].domain}> #{@description}"
! else
! "<#{@num.to_s}> #{@description}"
end
end
! def append_hsp(hsp)
! @hsps << hsp
end
+
end
+ # Bio::HMMER::Report::Hsp
class Hsp
def initialize(data, is_hmmsearch)
***************
*** 137,229 ****
@query_frame = 1
@target_frame = 1
end
! attr_accessor :accession, :domain, :seq_f, :seq_t, :seq_ft,
:hmm_f, :hmm_t, :hmm_ft, :score, :evalue, :midline, :hmmseq,
! :flatseq, :query_frame, :target_frame
!
! def query_seq
! if @is_hmmsearch; @hmmseq else; @flatseq end
! end
!
! def target_seq
! if @is_hmmsearch; @flatseq else; @hmmseq end
! end
!
! def target_from
! if @is_hmmsearch; @seq_f else; @hmm_f end
! end
! def target_to
! if @is_hmmsearch; @seq_t else; @hmm_t end
end
! def query_from
! if @is_hmmsearch; @hmm_f else; @seq_f end
! end
! def query_to
! if @is_hmmsearch; @hmm_t else; @seq_t end
! end
- def bit_score; @score; end
- def target_id; @accession; end
end
! class Hit
! def initialize(data)
! @hsps = Array.new
! if /^(\S+)\s+(.*)\s+(\S+)\s+(\S+)\s+(\S+)$/ =~ data
! @accession, @description, @score, @evalue, @num =
! [$1, $2, $3.to_f, $4.to_f, $5.to_i]
! end
! end
! attr_accessor :hsps, :accession, :description, :score, :evalue, :num
! def each
! @hsps.each do |x|
! yield x
! end
end
! def target_id; @accession; end
! def hit_id; @accession; end
! def entry_id; @accession; end
! def definition; @description; end
! def bit_score; @score; end
! def target_def
! if @hsps.size == 1
! "<#{@hsps[0].domain}> #{@description}"
! else
! "<#{@num.to_s}> #{@description}"
! end
end
- end
! private
! def parse_program(data)
! hash = {}
! hash['name'], hash['version'], hash['copyright'], hash['license'] =
! data.split(/\n/)
! hash
end
! def parse_parameter(data)
! hash = {}
! data.each do |x|
! if /(.+):\s+(.*)/ =~ x
! hash[$1] = $2
end
end
! hash
end
def parse_query_info(data)
hash = {}
data.each do |x|
! if /(.+):\s+(.*)/ =~ x
hash[$1] = $2
elsif /\s+\[(.+)\]/ =~ x
--- 161,278 ----
@query_frame = 1
@target_frame = 1
+ # CS and RF lines are rarely used.
+ @csline = nil
+ @rfline = nil
end
! attr_reader :accession, :domain, :seq_f, :seq_t, :seq_ft,
:hmm_f, :hmm_t, :hmm_ft, :score, :evalue, :midline, :hmmseq,
! :flatseq, :query_frame, :target_frame, :csline, :rfline
! def set_alignment(aln)
! # First, split the input alignment into an array of
! # "alignment blocks." One block usually has three lines,
! # i.e. hmmseq, midline and flatseq.
! # However, although infrequent, it can contain CS or RF lines.
! aln.split(/ (?:\d+|-)\s*\n\n/).each do |blk|
! lines = blk.split(/\n/)
! cstmp = (lines[0] =~ /^ {16}CS/) ? lines.shift : nil
! rftmp = (lines[0] =~ /^ {16}RF/) ? lines.shift : nil
! aln_width = lines[0][/\S+/].length
! @csline = @csline.to_s + cstmp[19, aln_width] if cstmp
! @rfline = @rfline.to_s + rftmp[19, aln_width] if rftmp
! @hmmseq += lines[0][19, aln_width]
! @midline += lines[1][19, aln_width]
! @flatseq += lines[2][19, aln_width]
! end
! @csline = @csline[3...-3] if @csline
! @rfline = @rfline[3...-3] if @rfline
! @hmmseq = @hmmseq[3...-3]
! @midline = @midline[3...-3]
! @flatseq = @flatseq[3...-3]
end
! def query_seq; @is_hmmsearch ? @hmmseq : @flatseq; end
! def target_seq; @is_hmmsearch ? @flatseq : @hmmseq; end
! def target_from; @is_hmmsearch ? @seq_f : @hmm_f; end
! def target_to; @is_hmmsearch ? @seq_t : @hmm_t; end
! def query_from; @is_hmmsearch ? @hmm_f : @seq_f; end
! def query_to; @is_hmmsearch ? @hmm_t : @seq_t; end
! alias bit_score score
! alias target_id accession
end
! # Bio::HMMER::Report#get_subdata
! def get_subdata(data)
! subdata = {}
! header_prefix = '\Ahmm(search|pfam) - search'
! query_prefix = '^Query (HMM|sequence): .*\nAccession: '
! hit_prefix = '^Scores for (complete sequences|sequence family)'
! hsp_prefix = '^Parsed for domains:'
! aln_prefix = '^Alignments of top-scoring domains:\n'
! stat_prefix = '^\nHistogram of all scores:'
! # if header exists, get it
! if data =~ /#{header_prefix}/
! is_hmmsearch = ($1 == "search") # hmmsearch or hmmpfam
! subdata["header"] = data[/(\A.+?)(?=#{query_prefix})/m]
! else
! is_hmmsearch = false # if no header, assumed to be hmmpfam
end
! # get query, Hit and Hsp data
! subdata["query"] = data[/(#{query_prefix}.+?)(?=#{hit_prefix})/m]
! subdata["hit"] = data[/(#{hit_prefix}.+?)(?=#{hsp_prefix})/m]
! subdata["hsp"] = data[/(#{hsp_prefix}.+?)(?=#{aln_prefix})/m]
! # get alignment data
! if is_hmmsearch
! data =~ /#{aln_prefix}(.+?)#{stat_prefix}/m
! subdata["alignment"] = $1
! else
! data =~ /#{aln_prefix}(.+?)\/\/\n/m
! subdata["alignment"] = $1
! raise "multiple reports found" if $'.length > 0
end
+ # handle -A option of HMMER
+ cutoff_line = '\t\[output cut off at A = \d+ top alignments\]\n\z'
+ subdata["alignment"].sub!(/#{cutoff_line}/, '')
! # get statistics data
! subdata["statistics"] = data[/(#{stat_prefix}.+)\z/m]
! [subdata, is_hmmsearch]
end
+ private :get_subdata
+
+ # Bio::HMMER::Report#parse_header_data
+ def parse_header_data(data)
+ data =~ /\A(.+? - - -$\n)(.+? - - -$\n)\n\z/m
+ program_data = $1
+ parameter_data = $2
! program = {}
! program['name'], program['version'], program['copyright'], \
! program['license'] = program_data.split(/\n/)
!
! parameter = {}
! parameter_data.each do |x|
! if /^(.+?):\s+(.*?)\s*$/ =~ x
! parameter[$1] = $2
end
end
!
! [program, parameter]
end
+ private :parse_header_data
+ # Bio::HMMER::Report#parse_query_info
def parse_query_info(data)
hash = {}
data.each do |x|
! if /^(.+?):\s+(.*?)\s*$/ =~ x
hash[$1] = $2
elsif /\s+\[(.+)\]/ =~ x
***************
*** 233,236 ****
--- 282,351 ----
hash
end
+ private :parse_query_info
+
+ # Bio::HMMER::Report#parse_hit_data
+ def parse_hit_data(data)
+ data.sub!(/.+?---\n/m, '').chop!
+ hits = []
+ return hits if data == "\t[no hits above thresholds]\n"
+ data.each do |l|
+ hits.push(Hit.new(l))
+ end
+ hits
+ end
+ private :parse_hit_data
+
+ # Bio::HMMER::Report#parse_hsp_data
+ def parse_hsp_data(data, is_hmmsearch)
+ data.sub!(/.+?---\n/m, '').chop!
+ hsps=[]
+ return hsps if data == "\t[no hits above thresholds]\n"
+ data.each do |l|
+ hsps.push(Hsp.new(l, is_hmmsearch))
+ end
+ return hsps
+ end
+ private :parse_hsp_data
+
+ # Bio::HMMER::Report#parse_stat_data
+ def parse_stat_data(data)
+ data.sub!(/\nHistogram of all scores:\n(.+?)\n\n\n%/m, '')
+ histogram = $1
+
+ statistical_detail = {}
+ data.sub!(/(.+?)\n\n/m, '')
+ $1.each do |l|
+ statistical_detail[$1] = $2.to_f if /^\s*(.+?)\s*=\s*(\S+)/ =~ l
+ end
+
+ total_seq_searched = nil
+ data.sub!(/(.+?)\n\n/m, '')
+ $1.each do |l|
+ total_seq_searched = $2.to_i if /^\s*(.+)\s*:\s*(\S+)/ =~ l
+ end
+
+ whole_seq_top_hits = {}
+ data.sub!(/(.+?)\n\n/m, '')
+ $1.each do |l|
+ if /^\s*(.+?):\s*(\d+)\s*$/ =~ l
+ whole_seq_top_hits[$1] = $2.to_i
+ elsif /^\s*(.+?):\s*(\S+)\s*$/ =~ l
+ whole_seq_top_hits[$1] = $2
+ end
+ end
+
+ domain_top_hits = {}
+ data.each do |l|
+ if /^\s*(.+?):\s*(\d+)\s*$/ =~ l
+ domain_top_hits[$1] = $2.to_i
+ elsif /^\s*(.+?):\s*(\S+)\s*$/ =~ l
+ domain_top_hits[$1] = $2
+ end
+ end
+
+ [histogram, statistical_detail, total_seq_searched, \
+ whole_seq_top_hits, domain_top_hits]
+ end
+ private :parse_stat_data
end
***************
*** 242,245 ****
--- 357,374 ----
if __FILE__ == $0
+ =begin
+
+ #
+ # for multiple reports in a single output file (hmmpfam)
+ #
+ Bio::HMMER.reports(ARGF.read) do |report|
+ report.hits.each do |hit|
+ hit.hsps.each do |hsp|
+ end
+ end
+ end
+
+ =end
+
begin
require 'pp'
***************
*** 387,390 ****
--- 516,526 ----
== Bio::HMMER::Report::Hsp
+ --- Bio::HMMER::Report#hsps
+
+ Returns an Array of Bio::HMMER::Report::Hsp objects.
+ Under special circumstances, some HSPs do not have
+ parent Hit objects. If you want to access such HSPs,
+ use this method.
+
--- Bio::HMMER::Report::Hsp#target_id
--- Bio::HMMER::Report::Hsp#accession
***************
*** 413,416 ****
--- 549,555 ----
--- Bio::HMMER::Report::Hsp#target_from
--- Bio::HMMER::Report::Hsp#target_to
+
+ --- Bio::HMMER::Report::Hsp#csline
+ --- Bio::HMMER::Report::Hsp#rfline
=end
More information about the bioruby-cvs
mailing list