| Class | Bio::FastaDefline |
| In: |
lib/bio/db/fasta/defline.rb
|
| Parent: | Object |
Parses a FASTA defline and extracts IDs and other information. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or ":"-separated IDs.
The specifications are described in: ftp.ncbi.nih.gov/blast/documents/README.formatdb blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
rub.entry_id ==> 'gi|671595'
rub.get('emb') ==> 'CAA85678.1'
rub.emb ==> 'CAA85678.1'
rub.gi ==> '671595'
rub.accession ==> 'CAA85678'
rub.accessions ==> [ 'CAA85678' ]
rub.acc_version ==> 'CAA85678.1'
rub.locus ==> nil
rub.list_ids ==> [["gi", "671595"],
["emb", "CAA85678.1", nil],
["Perovskia abrotanoides"]]
ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
ckr.entry_id ==> "gi|2495000"
ckr.sp ==> "CCKR_CAVPO"
ckr.pir ==> "I51898"
ckr.gb ==> "AAB29504.1"
ckr.gi ==> "2495000"
ckr.accession ==> "AAB29504"
ckr.accessions ==> ["Q63931", "AAB29504"]
ckr.acc_version ==> "AAB29504.1"
ckr.locus ==> nil
ckr.description ==>
"CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
ckr.descriptions ==>
["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
"cholecystokinin A receptor - guinea pig",
"cholecystokinin A receptor; CCK-A receptor [Cavia]"]
ckr.words ==>
["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
"receptor", "type"]
ckr.id_strings ==>
["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
"544724", "AAB29504.1", "Cavia"]
ckr.list_ids ==>
[["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
["gb", "AAB29504.1", nil], ["Cavia"]]
| NSIDs | = | { # NCBI and WU-BLAST 'gi' => [ 'gi' ], # NCBI GI 'gb' => [ 'acc_version', 'locus' ], # GenBank 'emb' => [ 'acc_version', 'locus' ], # EMBL 'dbj' => [ 'acc_version', 'locus' ], # DDBJ 'sp' => [ 'accession', 'entry_id' ], # SWISS-PROT 'pdb' => [ 'entry_id', 'chain' ], # PDB 'bbs' => [ 'number' ], # GenInfo Backbone Id 'gnl' => [ 'database' , 'entry_id' ], # General database identifier 'ref' => [ 'acc_version' , 'locus' ], # NCBI Reference Sequence 'lcl' => [ 'entry_id' ], # Local Sequence identifier # WU-BLAST and NCBI 'pir' => [ 'accession', 'entry_id' ], # PIR 'prf' => [ 'accession', 'entry_id' ], # Protein Research Foundation 'pat' => [ 'country', 'number', 'serial' ], # Patents # WU-BLAST only 'bbm' => [ 'number' ], # NCBI GenInfo Backbone database identifier 'gim' => [ 'number' ], # NCBI GenInfo Import identifier 'gp' => [ 'acc_version', 'locus' ], # GenPept 'oth' => [ 'accession', 'name', 'release' ], # Other (user-definable) identifier 'tpd' => [ 'accession', 'name' ], # Third party annotation, DDBJ 'tpe' => [ 'accession', 'name' ], # Third party annotation, EMBL 'tpg' => [ 'accession', 'name' ], # Third party annotation, GenBank # Original 'ri' => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB } |
| KillWords | = | [ 'an', 'the', 'this', 'that', 'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might', 'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with', 'from', 'and', 'or', 'not', 'dna', 'rna', 'mrna', 'cdna', 'orf', 'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp', 'similar', 'involved', 'identical', 'identity', 'cds', 'clone', 'library', 'contig', 'contigs', 'homolog', 'homologue', 'homologs', 'homologous', 'protein', 'proteins', 'gene', 'genes', 'product', 'products', 'sequence', 'sequences', 'strain', 'strains', 'region', 'regions', ] |
| KillWordsHash | = | {} |
| KillRegexpArray | = | [ /\A\d{1,3}\%?\z/, /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/, /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/ ] |
| entry_id | [R] | Shows a possibly unique identifier. Returns a string. |
| list_ids | [R] | Shows array that contains IDs (or ID-like strings). Returns an array of arrays of strings. |
Parses given string.
# File lib/bio/db/fasta/defline.rb, line 176
# Builds a defline object from a raw FASTA header string.
#
# str may hold several deflines concatenated with the "\x01" (Ctrl-A)
# separator; each piece is handed to add_defline in order.
def initialize(str)
  @entry_id = nil
  @info     = {}
  @list_ids = []
  @deflines = []

  str.split("\x01").each { |line| add_defline(line) }
end
Shows the accession with version number. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.
# File lib/bio/db/fasta/defline.rb, line 489
# Accession with version number (e.g. "CAA85678.1"), or nil.
#
# The lookup is done once; the result is cached even when it is nil,
# which is why defined? is used instead of ||=.
def acc_version
  return @acc_version if defined?(@acc_version)

  @acc_version = get_by_type('acc_version')
end
Shows accession numbers. Returns an array of strings.
# File lib/bio/db/fasta/defline.rb, line 498
# All accession numbers found in the defline, with any ".version"
# suffix removed. Returns an array of strings (computed once, then cached).
def accessions
  return @accessions if defined?(@accessions)

  @accessions = get_all_by_type('accession', 'acc_version')
  # Drop everything from the first dot on: "AAB29504.1" -> "AAB29504".
  @accessions.map! { |acc| acc.sub(/\..*\z/, '') }
  @accessions
end
Parses given string and adds parsed data.
# File lib/bio/db/fasta/defline.rb, line 190
# Parses one defline string and appends the result to @deflines.
#
# str is matched against three shapes, in this order:
#   1. "|"-separated NSIDs        (>gi|123|sp|Q9UWG2|RL3_METVA description)
#   2. a single "db:id" token     (>sce:YBR160W description)
#   3. a bare first word          (>ABC12345 description)
# Anything else is stored verbatim as the ID with an empty description.
#
# Each stored line has the layout [ separator, id_array..., description ];
# to_s relies on the separator being first and description/descriptions
# rely on the description being last.
#
# Side effects: appends to @deflines and @list_ids, may set
# @info['organism'], and sets @entry_id if it is still unset.
190: def add_defline(str)
191: case str
192: when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
193: # NSIDs
194: # examples:
195: # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
196: #
197: # note: regexp (?:) means grouping without backreferences
198: i = $1
199: d = $2
200: tks = i.split('|')
     # String#split drops a trailing empty field, so restore it when the
     # ID part ended with '|'.
201: tks << '' if i[-1,1] == '|'
     # NOTE(review): parse_NSIDs is not visible here; it presumably consumes
     # the tokens it recognises from tks, leaving the rest behind -- confirm.
202: a = parse_NSIDs(tks)
     # The first parsed ID becomes this line's entry-id candidate.
203: i = a[0].join('|')
204: a.unshift('|')
     # Tokens still left in tks are folded back into the description.
205: d = tks.join('|') + ' ' + d unless tks.empty?
206: a << d
207: this_line = a
208: match_EC(d)
     # Bracketed text that is not an EC number and starts with a capital
     # letter is kept as an ID-like entry; the first one is recorded as
     # the organism.
209: parse_square_brackets(d).each do |x|
210: if !match_EC(x, false) and x =~ /\A[A-Z]/ then
211: di = [ x ]
212: @list_ids << di
213: @info['organism'] = x unless @info['organism']
214: end
215: end
216:
217: when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
218: # examples:
219: # >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
220: # >emb:CACDC28 [X80034] C.albicans CDC28 gene
221: i = $1
222: d = $2
223: a = parse_ColonSepID(i)
224: i = a.join(':')
225: this_line = [ ':', a , d ]
226: match_EC(d)
     # Bracketed "db:id" cross references are parsed like the main ID;
     # a bare upper-case token in brackets is kept as an ID-like string.
227: parse_square_brackets(d).each do |x|
228: if !match_EC(x, false) and x =~ /:/ then
229: parse_ColonSepID(x)
230: elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
231: @list_ids << [ $1 ]
232: end
233: end
234:
235: when /^\>?\s*(\S+)(?:\s+(.+))?$/
236: # examples:
237: # >ABC12345 this is test
238: i = $1
     # $2 is nil when there is no description, hence to_s.
239: d = $2.to_s
     # The ID list gets the token without a trailing '.', while the stored
     # line keeps the token as written.
240: @list_ids << [ i.chomp('.') ]
241: this_line = [ '', [ i ], d ]
242: match_EC(d)
243: else
     # No pattern matched: keep the whole string as the ID.
244: i = str
245: d = ''
246: match_EC(i)
247: this_line = [ '', [ i ], d ]
248: end
249:
250: @deflines << this_line
     # Only the first defline added decides the entry id.
251: @entry_id = i unless @entry_id
252: end
Shows description.
# File lib/bio/db/fasta/defline.rb, line 332
# Description text of the first defline, or nil when no defline has
# been added yet.
def description
  first_line = @deflines.first
  # nil.to_a is [], so an empty object yields nil here instead of raising.
  first_line.to_a.last
end
Returns descriptions.
# File lib/bio/db/fasta/defline.rb, line 337
# Description text of every defline, in the order the deflines were added.
# Returns an array of strings.
def descriptions
  # The description is always the last element of a stored line.
  @deflines.map { |line| line.last }
end
Returns an identifier by database name.
# File lib/bio/db/fasta/defline.rb, line 413
# Returns the identifier stored for a database name, or nil.
#
# dbname:: database tag as a String or Symbol (e.g. 'gi', :emb, 'sp').
#
# Lookup order:
# 1. a value already present in @info under that name;
# 2. the first @list_ids entry whose tag equals the name. For entries
#    with at most two elements the last element is returned. For longer
#    entries the field is picked by label, preferring acc_version,
#    entry_id, locus, accession, number (in that order), and finally
#    falling back to the first non-nil field.
#
# A value found through @list_ids is cached in @info.
def get(dbname)
  db = dbname.to_s
  cached = @info[db]
  return cached if cached

  di = @list_ids.find { |x| x[0] == db }
  return nil unless di

  if di.size <= 2
    r = di[-1]
  else
    # A tag that is not listed in NSIDs has no field labels. Treat that as
    # "no labels" so the first-non-nil fallback below still applies instead
    # of calling nil.index.
    labels = self.class::NSIDs[db] || []
    r = nil
    ['acc_version', 'entry_id', 'locus', 'accession', 'number'].each do |x|
      i = labels.index(x)
      next unless i
      # Field i of the label list lives at di[i + 1]; di[0] is the tag.
      r = di[i + 1]
      break if r
    end
    r ||= di[1..-1].find { |x| x }
  end

  @info[db] = r if r
  r
end
Returns identifiers by given type.
# File lib/bio/db/fasta/defline.rb, line 449
# Collects every identifier of the given field type(s).
#
# type_strarg:: one or more NSIDs field labels, e.g. 'accession',
#               'acc_version'.
#
# Entries are visited in @list_ids order and, within one entry, the
# labels are tried in the order given. Entries whose tag is not in NSIDs
# and fields that are nil are skipped. Returns an array of strings.
def get_all_by_type(*type_strarg)
  found = []
  @list_ids.each do |id_entry|
    labels = self.class::NSIDs[id_entry[0]]
    next unless labels

    type_strarg.each do |type|
      pos = labels.index(type)
      next unless pos

      # id_entry[0] is the tag, so label number pos is at pos + 1.
      value = id_entry[pos + 1]
      found << value if value
    end
  end
  found
end
Returns an identifier by given type.
# File lib/bio/db/fasta/defline.rb, line 437
# Returns the first identifier of the given field type, or nil.
#
# type_str:: an NSIDs field label such as 'gi', 'acc_version', 'locus'.
#
# Entries are scanned in @list_ids order. An entry whose field for this
# type is empty (nil, as in "pir||I51898") is skipped so that it cannot
# hide a later entry that does carry the identifier; this matches how
# get_all_by_type treats empty fields.
def get_by_type(type_str)
  @list_ids.each do |x|
    labels = self.class::NSIDs[x[0]]
    next unless labels

    i = labels.index(type_str)
    next unless i

    # x[0] is the database tag, so label number i is stored at x[i + 1].
    value = x[i + 1]
    return value if value
  end
  nil
end
Shows the GI. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.
# File lib/bio/db/fasta/defline.rb, line 478
# NCBI GI number of the entry as a string, or nil.
#
# The lookup is done once; a nil result is cached as well, which is why
# defined? is used instead of ||=.
def gi
  return @gi if defined?(@gi)

  @gi = get_by_type('gi')
end
Shows ID-like strings. Returns an array of strings.
# File lib/bio/db/fasta/defline.rb, line 345
# Lists every ID-like string of the entry. Returns an array of strings.
#
# The result is built from two sources, in this order:
# * @list_ids: for tagged entries all non-empty fields after the tag;
#   for single-element entries the element itself, provided it consists
#   only of letters, digits, '.', '-' and '_';
# * description words (case preserved, no kill-regexps applied) that
#   look like identifiers.
def id_strings
  ids = []
  @list_ids.each do |entry|
    if entry.size >= 2
      ids.concat(entry[1..-1].select { |field| field })
    elsif entry[0].to_s.size > 0 and entry[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
      ids << entry[0]
    end
  end

  id_like_words = words(true, []).select do |word|
    word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ or
      word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  end
  ids.concat(id_like_words)
end
# File lib/bio/db/fasta/defline.rb, line 518
# Lets database names be used as reader methods, e.g. defline.gi,
# defline.emb, defline.sp. The call is forwarded to get.
#
# Returns whatever get returns. When get finds nothing, nil is returned
# for names that are NSIDs tags; for any other name a RuntimeError whose
# message starts with "NameError:" is raised (kept as-is so that existing
# rescue clauses keep working).
def method_missing(name, *args)
  r = get(name, *args)
  if !r and !(self.class::NSIDs[name.to_s]) then
    raise "NameError: undefined method `#{name.inspect}'"
  end
  r
end

# Keeps respond_to? in step with method_missing: every NSIDs tag is
# answered as a reader method because calling it never raises.
def respond_to_missing?(name, include_private = false)
  self.class::NSIDs.key?(name.to_s) || super
end
Shows original string. Note that the result of this method may be different from original string which is given in FastaDefline.new method.
# File lib/bio/db/fasta/defline.rb, line 324
# Rebuilds the defline text from the parsed data.
#
# Every stored line is [ separator, id_array..., description ]. The ID
# arrays are joined with the line's own separator, followed by a space
# and the description; surrounding whitespace is stripped. Multiple
# deflines are joined with "\x01". The result may differ from the string
# originally given to new.
def to_s
  rendered = @deflines.map do |line|
    sep = line[0]
    id_part = line[1..-2].map { |id| id.join(sep) }.join(sep)
    (id_part + ' ' + line[-1]).strip
  end
  rendered.join("\x01")
end
Shows words used in the defline. Returns an Array.
# File lib/bio/db/fasta/defline.rb, line 387
# Extracts the distinct words used in the descriptions.
#
# case_sensitive:: when false/nil (default) the words are downcased.
# kill_regexp::    array of regexps; a word matching any of them is dropped.
# kwhash::         hash keyed by downcased stop words; a word found in it
#                  is dropped.
#
# Words are split on punctuation, whitespace and control characters,
# stripped of leading "$*-+" and trailing "$*-=" runs, and words of one
# character or less are discarded. Returns a sorted array without
# duplicates.
def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
          kwhash = self.class::KillWordsHash)
  tokens = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)

  trimmed = tokens.map do |token|
    token.sub(/\A[\$\*\-\+]+/, '').sub(/[\$\*\-\=]+\z/, '')
  end

  kept = trimmed.reject do |word|
    word.size <= 1 or kwhash[word.downcase] or
      kill_regexp.find { |expr| expr =~ word }
  end

  kept = kept.map { |word| word.downcase } unless case_sensitive
  kept.sort.uniq
end