| Class | Bio::REBASE |
| In: |
lib/bio/db/rebase.rb
|
| Parent: | Object |
bio/db/rebase.rb - Interface for EMBOSS formatted REBASE files
| Author: | Trevor Wennblom <trevor@corevx.com> |
| Copyright: | Copyright (c) 2005-2007 Midwinter Laboratories, LLC (midwinterlabs.com) |
| License: | The Ruby License |
Bio::REBASE provides utilities for interacting with REBASE data in EMBOSS format. REBASE is the Restriction Enzyme Database; more information can be found here: http://rebase.neb.com
EMBOSS formatted files located at:
These files are the same as the "emboss_?.???" files located at: ftp://ftp.neb.com/pub/rebase/
To easily get started with the data you can simply type this command at your shell prompt:
% wget ftp://ftp.neb.com/pub/rebase/emboss*
require 'bio'
require 'pp'
# Read the raw contents of the three REBASE EMBOSS files into strings.
enz = File.read('emboss_e')
ref = File.read('emboss_r')
sup = File.read('emboss_s')
# When creating a new instance of Bio::REBASE
# the contents of the enzyme file must be passed.
# The references and suppliers file contents
# may also be passed.
rebase = Bio::REBASE.new( enz )
rebase = Bio::REBASE.new( enz, ref )
rebase = Bio::REBASE.new( enz, ref, sup )
# The 'read' class method allows you to read in files
# that are REBASE EMBOSS formatted
rebase = Bio::REBASE.read( 'emboss_e' )
rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r' )
rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r', 'emboss_s' )
# The data loaded may be saved in YAML format
rebase.save_yaml( 'enz.yaml' )
rebase.save_yaml( 'enz.yaml', 'ref.yaml' )
rebase.save_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
# YAML formatted files can also be read with the
# class method 'load_yaml'
rebase = Bio::REBASE.load_yaml( 'enz.yaml' )
rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml' )
rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
# Query the loaded data. The trailing comments appear to show sample
# output for one particular REBASE release; actual values depend on the
# files that were loaded.
pp rebase.enzymes[0..4] # ["AarI", "AasI", "AatI", "AatII", "Acc16I"]
pp rebase.enzyme_name?('aasi') # true
pp rebase['AarI'].pattern # "CACCTGC"
pp rebase['AarI'].blunt? # false
pp rebase['AarI'].organism # "Arthrobacter aurescens SS2-322"
pp rebase['AarI'].source # "A. Janulaitis"
pp rebase['AarI'].primary_strand_cut1 # 11
pp rebase['AarI'].primary_strand_cut2 # 0
pp rebase['AarI'].complementary_strand_cut1 # 15
pp rebase['AarI'].complementary_strand_cut2 # 0
pp rebase['AarI'].suppliers # ["F"]
pp rebase['AarI'].supplier_names # ["Fermentas International Inc."]
pp rebase['AarI'].isoschizomers # Currently none stored in the references file
pp rebase['AarI'].methylation # ""
pp rebase['EcoRII'].methylation # "2(5)"
pp rebase['EcoRII'].suppliers # ["F", "J", "M", "O", "S"]
pp rebase['EcoRII'].supplier_names # ["Fermentas International Inc.", "Nippon Gene Co., Ltd.",
# "Roche Applied Science", "Toyobo Biochemicals",
# "Sigma Chemical Corporation"]
# Number of enzymes in the database
pp rebase.size # 673
pp rebase.enzymes.size # 673
# Print the name and methylation string of every enzyme whose
# methylation field is not empty.
rebase.each do |name, info|
pp "#{name}: #{info.methylation}" unless info.methylation.empty?
end
Read YAML formatted files
rebase = Bio::REBASE.load_yaml( 'enz.yaml' )
rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml' )
rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
Arguments
| Returns: | Bio::REBASE object |
# File lib/bio/db/rebase.rb, line 261
# Build an object from data that was previously written out as YAML.
#
# ---
# *Arguments*
# * +f_enzyme+: path of the YAML file with the enzyme data (always loaded)
# * +f_reference+: (_optional_) path of the YAML file with the reference data
# * +f_supplier+: (_optional_) path of the YAML file with the supplier data
# *Returns*:: the result of +new+, called with its +yaml+ flag set to +true+
def self.load_yaml( f_enzyme, f_reference=nil, f_supplier=nil )
  enzyme_data = YAML.load_file(f_enzyme)
  # An omitted (nil/false) path simply yields nil for that data set.
  reference_data, supplier_data = [f_reference, f_supplier].map do |path|
    YAML.load_file(path) if path
  end
  self.new(enzyme_data, reference_data, supplier_data, true)
end
Constructor
Arguments
| Returns: | Bio::REBASE |
# File lib/bio/db/rebase.rb, line 174
# Constructor.
#
# ---
# *Arguments*
# * +enzyme_lines+: enzyme data; raw text handed to +parse_enzymes+, or
#   already-parsed data when +yaml+ is true
# * +reference_lines+: (_optional_) reference data, treated the same way
#   via +parse_references+
# * +supplier_lines+: (_optional_) supplier data, treated the same way
#   via +parse_suppliers+
# * +yaml+: (_optional_) when true the three arguments are stored as given
#   instead of being parsed
def initialize( enzyme_lines, reference_lines = nil, supplier_lines = nil, yaml = false )
  @enzyme_data, @reference_data, @supplier_data =
    if yaml
      # Data restored from YAML is already in parsed form.
      [enzyme_lines, reference_lines, supplier_lines]
    else
      [
        parse_enzymes(enzyme_lines),
        parse_references(reference_lines),
        parse_suppliers(supplier_lines)
      ]
    end

  # Hand the supplier table to EnzymeEntry before the entries are built.
  EnzymeEntry.supplier_data = @supplier_data
  setup_enzyme_data
end
Read REBASE EMBOSS-formatted files
rebase = Bio::REBASE.read( 'emboss_e' )
rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r' )
rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r', 'emboss_s' )
Arguments
| Returns: | Bio::REBASE object |
# File lib/bio/db/rebase.rb, line 243
# Read REBASE EMBOSS-formatted files and build an object from them.
#
# Each file is read as a single String. The parsers in this class iterate
# their input with +each_line+, which a String provides but the Array
# returned by IO.readlines does not.
#
# ---
# *Arguments*
# * +f_enzyme+: path of the enzyme file (always read)
# * +f_reference+: (_optional_) path of the references file
# * +f_supplier+: (_optional_) path of the suppliers file
# *Returns*:: the result of +new+ called with the file contents
def self.read( f_enzyme, f_reference=nil, f_supplier=nil )
  e = File.read(f_enzyme)
  r = f_reference ? File.read(f_reference) : nil
  s = f_supplier ? File.read(f_supplier) : nil
  self.new(e,r,s)
end
Check if supplied name is the name of an available enzyme
Arguments
| Returns: | +true/false+ |
# File lib/bio/db/rebase.rb, line 207
# Check, ignoring letter case, whether +name+ matches one of the names
# returned by +enzymes+.
#
# ---
# *Arguments*
# * +name+: the enzyme name to look for
# *Returns*:: +true+ if a match is found, +false+ otherwise
def enzyme_name?(name)
  enzymes.any? { |candidate| candidate.downcase == name.downcase }
end
Save the current data
rebase.save_yaml( 'enz.yaml' )
rebase.save_yaml( 'enz.yaml', 'ref.yaml' )
rebase.save_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
Arguments
| Returns: | nothing |
# File lib/bio/db/rebase.rb, line 225
# Write the current data out as YAML.
#
# ---
# *Arguments*
# * +f_enzyme+: path that receives the enzyme data (always written)
# * +f_reference+: (_optional_) path that receives the reference data
# * +f_supplier+: (_optional_) path that receives the supplier data
# *Returns*:: +nil+
def save_yaml( f_enzyme, f_reference=nil, f_supplier=nil )
  outputs = [[f_enzyme, @enzyme_data]]
  outputs << [f_reference, @reference_data] if f_reference
  outputs << [f_supplier, @supplier_data] if f_supplier

  outputs.each do |path, payload|
    File.open(path, 'w') { |io| io.puts YAML.dump(payload) }
  end
  nil
end
data is a hash indexed by the :name of each entry which is also a hash
# File lib/bio/db/rebase.rb, line 315
# Parse the text of an enzyme file.
#
# Lines whose first character is '#' are comments, and lines holding only
# whitespace are skipped so that they do not create an entry keyed by nil.
# Every other line is split on whitespace and its first nine fields are
# stored, as Strings, under :name, :pattern, :len, :ncuts, :blunt, :c1,
# :c2, :c3 and :c4 (missing fields become nil).
#
# ---
# *Arguments*
# * +lines+: a String holding the whole file, an Array of line Strings,
#   or nil
# *Returns*:: a Hash indexed by enzyme name whose values are Hashes with
#             the keys listed above; empty when +lines+ is nil
def parse_enzymes( lines )
  data = {}
  return data if lines.nil?

  keys = [:name, :pattern, :len, :ncuts, :blunt, :c1, :c2, :c3, :c4]
  # A String is walked line by line; anything else (e.g. the Array that
  # IO.readlines returns) is assumed to already be a list of lines.
  source = lines.respond_to?(:each_line) ? lines.each_line : lines.each

  source.each do |raw_line|
    next if raw_line[0, 1] == '#'

    # Whitespace split also discards the trailing newline, and the
    # caller's string is left unmodified.
    fields = raw_line.split
    next if fields.empty?

    entry = {}
    keys.each_with_index { |key, i| entry[key] = fields[i] }
    data[fields[0]] = entry
  end
  data
end
data is a hash indexed by the :name of each entry which is also a hash
# File lib/bio/db/rebase.rb, line 342
# Parse the text of a references file.
#
# Lines starting with '#' (comments) or '//' (end-of-entry markers) are
# ignored. The remaining lines are consumed in order: seven header lines
# (name, organism, isoschizomers, methylation, source, suppliers, number
# of references) followed by as many reference lines as the seventh
# header line announces. Blank lines are kept because they are read as
# empty header fields.
#
# An entry announcing zero (or fewer) references is stored right after
# its header with an empty :references list. Otherwise the reference
# counter would never return to zero and every following line would be
# absorbed as a reference of that entry.
#
# ---
# *Arguments*
# * +lines+: a String holding the whole file, an Array of line Strings,
#   or nil
# *Returns*:: a Hash indexed by enzyme name; each value is a Hash with
#             the keys :name, :organism, :isoschizomers, :methylation,
#             :source, :suppliers, :number_of_references (all Strings)
#             and :references (Array of Strings). Empty when +lines+ is nil.
def parse_references( lines )
  data = {}
  return data if lines.nil?

  # A String is walked line by line; anything else (e.g. the Array that
  # IO.readlines returns) is assumed to already be a list of lines.
  source = lines.respond_to?(:each_line) ? lines.each_line : lines.each

  index = 1            # which field of the current entry comes next
  h = {}               # fields of the current entry, keyed 1..8
  references_left = 0  # reference lines still expected for this entry

  source.each do |raw_line|
    next if raw_line[0, 1] == '#'   # Comment
    next if raw_line[0, 2] == '//'  # End of entry marker
    line = raw_line.chomp           # non-destructive: caller's data is untouched

    if index <= 7
      h[index] = line
      index += 1
      next unless index == 8        # more header lines to come

      references_left = line.to_i
      h[8] = []
      next if references_left > 0   # reference lines follow
    else
      h[8] << line
      references_left -= 1
      next if references_left > 0
    end

    # The entry is complete: store it and start over.
    data[ h[1] ] = {
      :name                 => h[1],
      :organism             => h[2],
      :isoschizomers        => h[3],
      :methylation          => h[4],
      :source               => h[5],
      :suppliers            => h[6],
      :number_of_references => h[7],
      :references           => h[8]
    }
    index = 1
    h = {}
  end
  data
end
data is a hash indexed by the supplier code
data[supplier_code] returns the suppliers name
# File lib/bio/db/rebase.rb, line 388
# Parse the text of a suppliers file.
#
# Lines starting with '#' are comments. On every other line the text
# before the first whitespace character (after at least one leading
# character) is the supplier code and the rest of the line is the
# supplier name. Lines that do not have both parts are ignored. The line
# terminator is removed first, so a "\r\n" ending does not leave a
# carriage return in the name.
#
# ---
# *Arguments*
# * +lines+: a String holding the whole file, an Array of line Strings,
#   or nil
# *Returns*:: a Hash mapping supplier code to supplier name; empty when
#             +lines+ is nil
def parse_suppliers( lines )
  data = {}
  return data if lines.nil?

  # A String is walked line by line; anything else (e.g. the Array that
  # IO.readlines returns) is assumed to already be a list of lines.
  source = lines.respond_to?(:each_line) ? lines.each_line : lines.each

  source.each do |raw_line|
    next if raw_line[0, 1] == '#'

    match = %r{(.+?)\s(.+)}.match(raw_line.chomp)
    data[match[1]] = match[2] if match
  end
  data
end
Takes a string in one of the three formats listed below and returns a Bio::Reference object
# File lib/bio/db/rebase.rb, line 404
# Turn one raw reference line into a Bio::Reference.
#
# The line is split on ', '. Its tail determines the layout:
# * a last field of 'Unpublished observations.' becomes the title (final
#   period removed); pages, volume, year and journal are empty strings
# * otherwise the last field (final character removed) holds the pages
#   when it contains 'pp. ', preceded by a volume field, or else it is
#   the volume itself; the field before that is "(year) journal"
# The fields that remain are combined in pairs into "first, second"
# author strings; an unpaired trailing field is dropped.
#
# ---
# *Arguments*
# * +line+: the raw reference String
# *Returns*:: Bio::Reference built from a Hash with the keys 'title',
#             'pages', 'volume', 'year', 'journal' and 'authors'
def raw_to_reference( line )
  fields = line.split(', ')

  if fields.last == 'Unpublished observations.'
    title = fields.pop.chop
    pages = volume = year = journal = ''
  else
    title = ''

    tail = fields.pop.chop
    if tail =~ %r{pp\.\s}
      pages = tail.gsub('pp. ', '')
      volume = fields.pop
    else
      pages = ''
      volume = tail
    end
    volume.gsub!('vol. ', '')

    year_and_journal = fields.pop
    year_and_journal =~ %r{\((\d+)\)\s(.+)}
    year, journal = $1, $2
  end

  # Whatever is left alternates between the two halves of an author name.
  authors = fields.each_slice(2).select { |pair| pair.size == 2 }.map do |first, second|
    "#{first}, #{second}"
  end

  Bio::Reference.new(
    'title'   => title,
    'pages'   => pages,
    'volume'  => volume,
    'year'    => year,
    'journal' => journal,
    'authors' => authors
  )
end
# File lib/bio/db/rebase.rb, line 296
# Copy the parsed reference data onto the matching entries in @data.
#
# For every name in @reference_data that also exists in @data, the
# :organism, :isoschizomers, :methylation and :source values are copied,
# the :suppliers string is split into single characters, and each raw
# reference line is converted with +raw_to_reference+.
#
# Reference entries whose name has no counterpart in @data are skipped;
# otherwise the lookup would yield nil and the first assignment would
# raise NoMethodError.
#
# Does nothing (and returns nil) when @reference_data is nil or false.
def setup_enzyme_and_reference_association
  return unless @reference_data
  @reference_data.each do |name, hash|
    d = @data[name]
    next unless d  # reference without a matching enzyme entry

    [:organism, :isoschizomers,
     :methylation, :source].each { |k| d[k] = hash[k] }
    d.suppliers = hash[:suppliers].split('')
    d.references = hash[:references].map { |raw| raw_to_reference(raw) }
  end
end
# File lib/bio/db/rebase.rb, line 272
# Rebuild @data from @enzyme_data: one EnzymeEntry per enzyme name.
#
# Each entry receives the pattern, the blunt flag (true only when the
# :blunt field converts to the integer 1) and the four cut positions
# converted to integers. The reference-derived fields start out empty so
# they exist even when no reference data was supplied; afterwards
# +setup_enzyme_and_reference_association+ is called.
def setup_enzyme_data
  @data = {}

  @enzyme_data.each do |name, attrs|
    entry = EnzymeEntry.new
    @data[name] = entry

    entry.pattern = attrs[:pattern]
    # Stored through []= because "entry.blunt?=" is not valid Ruby syntax.
    entry[:blunt?] = attrs[:blunt].to_i == 1

    # :c1/:c3 feed the primary strand, :c2/:c4 the complementary strand.
    entry.primary_strand_cut1 = attrs[:c1].to_i
    entry.complementary_strand_cut1 = attrs[:c2].to_i
    entry.primary_strand_cut2 = attrs[:c3].to_i
    entry.complementary_strand_cut2 = attrs[:c4].to_i

    # Defaults, in case there is no reference data to fill these in.
    [:organism, :isoschizomers, :methylation, :source].each do |key|
      entry[key] = ''
    end
    entry.suppliers = []
    entry.references = []
  end

  setup_enzyme_and_reference_association
end