| Class | Bio::FlatFile::AutoDetect |
| In: |
lib/bio/io/flatfile/autodetection.rb
|
| Parent: | Object |
AutoDetect automatically determines the database class of the given data.
| TopRule | = | RuleSpecial.new('top') | Special element that is always top priority. | |
| BottomRule | = | RuleSpecial.new('bottom') | Special element that is always bottom priority. |
Makes a new autodetect object and adds each of the given elements to it.
# File lib/bio/io/flatfile/autodetection.rb, line 361
# Creates a new autodetect object and registers every given element with it.
#
# *Arguments*:
# * _arg_: elements, each of which is handed to #add in the order given
# *Returns*:: the newly created object
def self.[](*arg)
  detector = new
  arg.each do |rule|
    detector.add(rule)
  end
  detector
end
Returns the default autodetect object, creating it on first use.
# File lib/bio/io/flatfile/autodetection.rb, line 348
# Returns the default autodetect object.
#
# The object is built with make_default the first time it is needed
# (or whenever the stored value is nil/false) and is then reused.
def self.default
  @default ||= self.make_default
end
Sets the default autodetect object.
# File lib/bio/io/flatfile/autodetection.rb, line 356
# Replaces the stored default autodetect object.
#
# *Arguments*:
# * _ad_: object to store as the default; a nil/false value causes
#   the next call to default to build a fresh one
def self.default=(ad)
  @default = ad
end
Makes the object that is used as the initial default autodetect object.
# File lib/bio/io/flatfile/autodetection.rb, line 368
# Builds the autodetect object that serves as the initial default.
#
# All rules are created inline as arguments to self[...]; each one is also
# bound to a local variable so that the priority relations further down can
# refer to it. Returns the populated autodetect object.
#
# NOTE(review): in this listing runs of spaces inside the regexp literals may
# have been collapsed by the documentation extraction -- verify against
# lib/bio/io/flatfile/autodetection.rb before copying any pattern.
368: def self.make_default
369: a = self[
370: genbank = RuleRegexp[ 'Bio::GenBank',
371: /^LOCUS .+ bp .*[a-z]*[DR]?NA/ ],
372: genpept = RuleRegexp[ 'Bio::GenPept',
373: /^LOCUS .+ aa .+/ ],
374: medline = RuleRegexp[ 'Bio::MEDLINE',
375: /^PMID\- [0-9]+$/ ],
376: embl = RuleRegexp[ 'Bio::EMBL',
377: /^ID .+\; .*(DNA|RNA|XXX)\;/ ],
378: sptr = RuleRegexp2[ 'Bio::SPTR',
379: /^ID .+\; *PRT\;/,
380: /^ID [-A-Za-z0-9_\.]+ .+\; *[0-9]+ *AA\./ ],
381: prosite = RuleRegexp[ 'Bio::PROSITE',
382: /^ID [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/ ],
383: transfac = RuleRegexp[ 'Bio::TRANSFAC',
384: /^AC [-A-Za-z0-9_\.]+$/ ],
385:
# AAindex: the block needs an "H ..." line first; it then picks AAindex2 when
# an "M r"/"M c" line is found, AAindex1 when an "I A/L" line is found, and
# false otherwise. Without an "H ..." line it yields nil.
386: aaindex = RuleProc.new('Bio::AAindex1', 'Bio::AAindex2') do |text|
387: if /^H [-A-Z0-9_\.]+$/ =~ text then
388: if text =~ /^M [rc]/ then
389: Bio::AAindex2
390: elsif text =~ /^I A\/L/ then
391: Bio::AAindex1
392: else
393: false #fail to determine
394: end
395: else
396: nil
397: end
398: end,
399:
400: litdb = RuleRegexp[ 'Bio::LITDB',
401: /^CODE [0-9]+$/ ],
402: brite = RuleRegexp[ 'Bio::KEGG::BRITE',
403: /^Entry [A-Z0-9]+/ ],
404: orthology = RuleRegexp[ 'Bio::KEGG::ORTHOLOGY',
405: /^ENTRY .+ KO\s*/ ],
406: drug = RuleRegexp[ 'Bio::KEGG::DRUG',
407: /^ENTRY .+ Drug\s*/ ],
408: glycan = RuleRegexp[ 'Bio::KEGG::GLYCAN',
409: /^ENTRY .+ Glycan\s*/ ],
410: enzyme = RuleRegexp2[ 'Bio::KEGG::ENZYME',
411: /^ENTRY EC [0-9\.]+$/,
412: /^ENTRY .+ Enzyme\s*/
413: ],
414: compound = RuleRegexp2[ 'Bio::KEGG::COMPOUND',
415: /^ENTRY C[A-Za-z0-9\._]+$/,
416: /^ENTRY .+ Compound\s*/
417: ],
418: reaction = RuleRegexp2[ 'Bio::KEGG::REACTION',
419: /^ENTRY R[A-Za-z0-9\._]+$/,
420: /^ENTRY .+ Reaction\s*/
421: ],
422: genes = RuleRegexp[ 'Bio::KEGG::GENES',
423: /^ENTRY .+ (CDS|gene|.*RNA|Contig) / ],
424: genome = RuleRegexp[ 'Bio::KEGG::GENOME',
425: /^ENTRY [a-z]+$/ ],
426:
# FANTOM MaXML: the class is chosen from the word captured in the DOCTYPE
# declaration ("clusters" or "sequences"); nil when there is no such DOCTYPE.
427: fantom = RuleProc.new('Bio::FANTOM::MaXML::Cluster',
428: 'Bio::FANTOM::MaXML::Sequence') do |text|
429: if /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/ =~ text
430: case $1
431: when 'clusters'
432: Bio::FANTOM::MaXML::Cluster
433: when 'sequences'
434: Bio::FANTOM::MaXML::Sequence
435: else
436: nil #unknown
437: end
438: else
439: nil
440: end
441: end,
442:
443: pdb = RuleRegexp[ 'Bio::PDB',
444: /^HEADER .{40}\d\d\-[A-Z]{3}\-\d\d [0-9A-Z]{4}/ ],
445: het = RuleRegexp[ 'Bio::PDB::ChemicalComponent',
446: /^RESIDUE +.+ +\d+\s*$/ ],
447:
448: clustal = RuleRegexp2[ 'Bio::ClustalW::Report',
449: /^CLUSTAL .*\(.*\).*sequence +alignment/,
450: /^CLUSTAL FORMAT for T-COFFEE/ ],
451:
452: gcg_msf = RuleRegexp[ 'Bio::GCG::Msf',
453: /^!!(N|A)A_MULTIPLE_ALIGNMENT .+/ ],
454:
455: gcg_seq = RuleRegexp[ 'Bio::GCG::Seq',
456: /^!!(N|A)A_SEQUENCE .+/ ],
457:
458: blastxml = RuleRegexp[ 'Bio::Blast::Report',
459: /\<\!DOCTYPE BlastOutput PUBLIC / ],
460: wublast = RuleRegexp[ 'Bio::Blast::WU::Report',
461: /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
462: wutblast = RuleRegexp[ 'Bio::Blast::WU::Report_TBlast',
463: /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
464: blast = RuleRegexp[ 'Bio::Blast::Default::Report',
465: /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
466: tblast = RuleRegexp[ 'Bio::Blast::Default::Report_TBlast',
467: /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
468: rpsblast = RuleRegexp[ 'Bio::Blast::RPSBlast::Report',
469: /^RPS\-BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
470:
471: blat = RuleRegexp[ 'Bio::Blat::Report',
472: /^psLayout version \d+/ ],
473: spidey = RuleRegexp[ 'Bio::Spidey::Report',
474: /^\-\-SPIDEY version .+\-\-$/ ],
475: hmmer = RuleRegexp[ 'Bio::HMMER::Report',
476: /^HMMER +\d+\./ ],
477: sim4 = RuleRegexp[ 'Bio::Sim4::Report',
478: /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/ ],
479:
# FASTA-like formats: requires a ">" header line. The case branches are tried
# in order (NBRF, then FastaFormat, then FastaNumericFormat); false when a
# header exists but no branch matches, nil when there is no header at all.
480: fastaformat = RuleProc.new('Bio::FastaFormat',
481: 'Bio::NBRF',
482: 'Bio::FastaNumericFormat') do |text|
483: if /^>.+$/ =~ text
484: case text
485: when /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/
486: Bio::NBRF
487: when /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/
488: Bio::FastaFormat
489: when /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/
490: Bio::FastaNumericFormat
491: else
492: false
493: end
494: else
495: nil
496: end
497: end
498: ]
499:
# Priority relations between rules whose patterns can overlap.
# NOTE(review): is_prior_to is defined outside this listing; presumably the
# receiver is tried before the argument -- confirm against the rule classes.
500: # dependencies
501: # NCBI
502: genbank.is_prior_to genpept
503: # EMBL/UniProt
504: embl.is_prior_to sptr
505: sptr.is_prior_to prosite
506: prosite.is_prior_to transfac
507: # KEGG
508: #aaindex.is_prior_to litdb
509: #litdb.is_prior_to brite
510: brite.is_prior_to orthology
511: orthology.is_prior_to drug
512: drug.is_prior_to glycan
513: glycan.is_prior_to enzyme
514: enzyme.is_prior_to compound
515: compound.is_prior_to reaction
516: reaction.is_prior_to genes
517: genes.is_prior_to genome
518: # PDB
519: pdb.is_prior_to het
520: # BLAST
521: wublast.is_prior_to wutblast
522: wutblast.is_prior_to blast
523: blast.is_prior_to tblast
524: # FastaFormat
# The FASTA rule is the only one registered after BottomRule.
525: BottomRule.is_prior_to(fastaformat)
526:
527: # for debug
528: #debug_first = RuleDebug.new('debug_first')
529: #a.add(debug_first)
530: #debug_first.is_prior_to(TopRule)
531:
532: ## for debug
533: #debug_last = RuleDebug.new('debug_last')
534: #a.add(debug_last)
535: #BottomRule.is_prior_to(debug_last)
536: #fastaformat.is_prior_to(debug_last)
537:
# rehash is called after the priorities above were declared; it rehashes the
# rule table and drops the cached element list.
538: a.rehash
539: return a
540: end
Autodetects the database class from the text. Returns a database class on success, or nil on failure.
# File lib/bio/io/flatfile/autodetection.rb, line 305
# Asks each element, in the order given by +elements+, to guess the
# database class of +text+ and stops at the first truthy answer.
#
# *Arguments*:
# * _text_: text passed to every element's +guess+
# * _meta_: extra information passed unchanged to every +guess+ call
# *Returns*:: the first truthy guess; otherwise the (nil or false) answer
#             of the last element tried, or nil when there are no elements
def autodetect(text, meta = {})
  guessed = nil
  # The block's value is the guess itself, so find stops on the first hit
  # while +guessed+ always holds the most recent answer.
  elements.find { |rule| guessed = rule.guess(text, meta) }
  guessed
end
Autodetects the database class from the FlatFile object. Returns a database class on success, or nil on failure.
# File lib/bio/io/flatfile/autodetection.rb, line 318
# Autodetects the database class by reading ahead in a FlatFile object.
#
# *Arguments*:
# * _ff_: object whose <tt>@stream</tt> instance variable provides
#   +prefetch_gets+ and +prefetch_buffer+ (and optionally +path+)
# * _lines_: maximum number of lines to prefetch (default 31)
# *Returns*:: the detected class, or nil when nothing was detected
def autodetect_flatfile(ff, lines = 31)
  stream = ff.instance_eval { @stream }

  # A stream without a usable #path simply contributes no path information.
  path = begin
    stream.path
  rescue NameError
    nil
  end

  meta = {}
  if path
    meta[:path] = path
    # call autodetect once with meta and without any read action
    found = self.autodetect(stream.prefetch_buffer, meta)
    return found if found
  end

  # reading stream
  1.upto(lines) do |_|
    line = stream.prefetch_gets
    break unless line
    # Blank lines add nothing new, so detection is not retried for them.
    next if line.strip.empty?

    found = self.autodetect(stream.prefetch_buffer, meta)
    return found if found
  end
  nil
end
Iterates over each element.
# File lib/bio/io/flatfile/autodetection.rb, line 298
# Calls the given block once for every element, in the order of +elements+.
# The return value is whatever <tt>elements.each</tt> returns.
def each_rule(&block) #:yields: elem
  elements.each(&block)
end
Visualizes the object (mainly for debugging).
# File lib/bio/io/flatfile/autodetection.rb, line 291
# Returns a debugging string of the form
# <tt><ClassName "name1" "name2" ...></tt>, listing the inspected name of
# every element in the order of +elements+.
def inspect
  names = self.elements.map { |e| e.name.inspect }
  "<#{self.class} #{names.join(' ')}>"
end
Rebuilds the object and clears the internal cache.
# File lib/bio/io/flatfile/autodetection.rb, line 285
# Rehashes the rule table and discards the cached element list stored
# in <tt>@elements</tt>. Always returns nil.
def rehash
  @rules.rehash
  @elements = nil
end
(required by TSort.) For a given element, yields each child (= lower priority elements) of the element.
# File lib/bio/io/flatfile/autodetection.rb, line 253
# (required by TSort.) Yields each child (= lower priority element) of
# the given element.
#
# * For TopRule: every registered rule except TopRule itself and rules
#   that list TopRule among their lower priority elements.
# * For BottomRule: every registered rule that lists BottomRule among
#   its higher priority elements.
# * For any other element: its lower priority elements except BottomRule,
#   followed by BottomRule itself unless BottomRule is one of the
#   element's higher priority elements.
def tsort_each_child(elem)
  if elem == TopRule
    @rules.each_value do |rule|
      next if rule == TopRule || rule.lower_priority_elements.index(TopRule)

      yield rule
    end
  elsif elem == BottomRule
    @rules.each_value do |rule|
      yield rule if rule.higher_priority_elements.index(BottomRule)
    end
  else
    elem.lower_priority_elements.each do |child|
      yield child if child != BottomRule
    end
    # BottomRule is yielded separately here, never from the list above.
    yield BottomRule unless elem.higher_priority_elements.index(BottomRule)
  end
end
271: end