| Module | Bio::Alignment::Output |
| In: |
lib/bio/alignment.rb
|
Common routine shared by the interleaved and non-interleaved PHYLIP format writers.
# File lib/bio/alignment.rb, line 1099
# Common routine for the interleaved and non-interleaved PHYLIP writers.
#
# Options (all optional):
# :replace_space   :: when true, whitespace in names is replaced by '_'
# :escape          :: unless explicitly false, the characters : ; , ( )
#                     in names are replaced by '_'
# :split           :: unless explicitly false, only the first
#                     whitespace-separated word of each name is kept
# :avoid_same_name :: unless explicitly false, names are passed through
#                     __clustal_avoid_same_name(names, 10)
# :width           :: residues per output line (default 60); rounded down
#                     to a multiple of 10 and never less than 10
# :gap_char        :: replacement for gap characters (default '-')
# :case            :: a value matching /lower/i or /upper/i changes the
#                     case of the sequences
#
# Returns a three-element array [ header, rows, lines ]:
# header :: an Array holding the single header line
# rows   :: one Array of output lines per sequence; the first line of
#           each starts with the 10-column name field
# lines  :: number of width-sized chunks needed for the alignment length
def __output_phylip_common(options = {})
  len = self.alignment_length
  aln = [ " #{self.number_of_sequences} #{len}\n" ]
  sn = self.sequence_names.collect { |x| x.to_s.gsub(/[\r\n\x00]/, ' ') }
  if options[:replace_space]
    sn.collect! { |x| x.gsub(/\s/, '_') }
  end
  if !options.has_key?(:escape) or options[:escape]
    sn.collect! { |x| x.gsub(/[\:\;\,\(\)]/, '_') }
  end
  if !options.has_key?(:split) or options[:split]
    sn.collect! { |x| x.split(/\s/)[0].to_s }
  end
  if !options.has_key?(:avoid_same_name) or options[:avoid_same_name]
    sn = __clustal_avoid_same_name(sn, 10)
  end

  namewidth = 10
  seqwidth = (options[:width] or 60)
  # Round down to a multiple of 10, but never below 10: a width of 0
  # would give an unusable chunk regexp and a division by zero below.
  seqwidth = [ seqwidth.div(10) * 10, 10 ].max
  # Every group of 10 residues gets a leading blank, hence 11 characters
  # per group on an output line.
  seqregexp = Regexp.new("(.{1,#{seqwidth.div(10) * 11}})")
  gchar = (options[:gap_char] or '-')

  aseqs = []
  self.each_seq do |s|
    aseqs << s.to_s.gsub(self.gap_regexp, gchar)
  end
  case options[:case].to_s
  when /lower/i
    aseqs.each { |s| s.downcase! }
  when /upper/i
    aseqs.each { |s| s.upcase! }
  end

  aseqs.collect! do |s|
    snx = sn.shift
    # left-justify the name and cut it to exactly namewidth columns
    head = sprintf("%*s", -namewidth, snx.to_s)[0, namewidth]
    head2 = ' ' * namewidth
    # pad short sequences with gap characters up to the alignment length
    s << (gchar * (len - s.length))
    s.gsub!(/(.{1,10})/n, " \\1")
    s.gsub!(seqregexp, "\\1\n")
    a = s.split(/^/)
    # A zero-length sequence yields no lines at all; still terminate
    # the name line instead of failing on nil.
    head += (a.shift || "\n")
    ret = a.collect { |x| head2 + x }
    ret.unshift(head)
    ret
  end
  lines = (len + seqwidth - 1).div(seqwidth)
  [ aln, aseqs, lines ]
end
# File lib/bio/alignment.rb, line 873
# Dispatches to the output_* method that matches +format+.
#
# format :: one of :clustal, :fasta, :phylip, :phylipnon, :msf, :molphy
# arg    :: passed through unchanged to the selected method
#
# Returns whatever the selected method returns. Any other +format+
# raises a RuntimeError whose message contains the inspected value.
def output(format, *arg)
  writer =
    case format
    when :clustal   then :output_clustal
    when :fasta     then :output_fasta
    when :phylip    then :output_phylip
    when :phylipnon then :output_phylipnon
    when :msf       then :output_msf
    when :molphy    then :output_molphy
    else
      raise "Unknown format: #{format.inspect}"
    end
  __send__(writer, *arg)
end
Generates fasta format text and returns a string.
# File lib/bio/alignment.rb, line 1059
# Generates fasta format text and returns a string.
#
# Options:
# :width           :: residues per line (default 70); a value of 0 or
#                     less writes each sequence on a single line
# :avoid_same_name :: when true, names are taken from
#                     __clustal_avoid_same_name(sequence_names, 30);
#                     otherwise CR, LF and NUL in names become blanks
def output_fasta(options={})
  width = options[:width] || 70
  names =
    if options[:avoid_same_name]
      __clustal_avoid_same_name(self.sequence_names, 30)
    else
      self.sequence_names.collect { |k| k.to_s.gsub(/[\r\n\x00]/, ' ') }
    end

  # nil means "do not wrap"
  wrap = (width and width > 0) ? Regexp.new(".{1,#{width}}") : nil

  records = self.collect do |s|
    title = ">#{names.shift}\n"
    body = wrap ? s.to_s.gsub(wrap, "\\0\n") : s.to_s + "\n"
    title + body
  end
  records.join('')
end
Generates Molphy alignment format text as a string
# File lib/bio/alignment.rb, line 1151
# Generates Molphy alignment format text as a string.
#
# Options (all optional):
# :replace_space   :: when true, whitespace in names is replaced by '_'
# :escape          :: unless explicitly false, the characters : ; , ( )
#                     in names are replaced by '_'
# :split           :: unless explicitly false, only the first
#                     whitespace-separated word of each name is kept
# :avoid_same_name :: unless explicitly false, names are passed through
#                     __clustal_avoid_same_name(names, 30)
# :width           :: residues per output line (default 60)
# :gap_char        :: replacement for gap characters (default '-')
# :case            :: a value matching /lower/i or /upper/i changes the
#                     case of the sequences
#
# The result is a header line "<number of sequences> <length>" followed,
# for every sequence, by its name on one line and the wrapped sequence.
def output_molphy(options = {})
  len = self.alignment_length
  header = "#{self.number_of_sequences} #{len}\n"
  sn = self.sequence_names.collect { |x| x.to_s.gsub(/[\r\n\x00]/, ' ') }
  if options[:replace_space]
    sn.collect! { |x| x.gsub(/\s/, '_') }
  end
  if !options.has_key?(:escape) or options[:escape]
    sn.collect! { |x| x.gsub(/[\:\;\,\(\)]/, '_') }
  end
  if !options.has_key?(:split) or options[:split]
    sn.collect! { |x| x.split(/\s/)[0].to_s }
  end
  if !options.has_key?(:avoid_same_name) or options[:avoid_same_name]
    sn = __clustal_avoid_same_name(sn, 30)
  end

  seqwidth = (options[:width] or 60)
  seqregexp = Regexp.new("(.{1,#{seqwidth}})")
  gchar = (options[:gap_char] or '-')

  # Start from an empty list; there is no need to allocate (and then
  # discard) one slot per alignment column.
  aseqs = []
  self.each_seq do |s|
    aseqs << s.to_s.gsub(self.gap_regexp, gchar)
  end
  case options[:case].to_s
  when /lower/i
    aseqs.each { |s| s.downcase! }
  when /upper/i
    aseqs.each { |s| s.upcase! }
  end

  aseqs.collect! do |s|
    # pad short sequences with gap characters up to the alignment length
    s << (gchar * (len - s.length))
    s.gsub!(seqregexp, "\\1\n")
    sn.shift + "\n" + s
  end
  aseqs.unshift(header)
  aseqs.join('')
end
Generates msf formatted text as a string
# File lib/bio/alignment.rb, line 1193
# Generates msf formatted text as a string.
#
# Options (all optional):
# :avoid_same_name :: unless explicitly false, names are taken from
#                     __clustal_avoid_same_name(sequence_names)
# :replace_space   :: unless explicitly false, whitespace in names is
#                     replaced by '_'
# :escape          :: unless explicitly false, the characters : ; , ( )
#                     in names are replaced by '_'
# :split           :: unless explicitly false, only the first
#                     whitespace-separated word of each name is kept
# :gap_char        :: character written for internal gaps (default '.')
# :padding_char    :: character written for leading and trailing gaps
#                     (default '~')
# :case            :: a value matching /lower/i gives lower case;
#                     anything else gives upper case
# :type            :: a value matching /protein/i or /aa/i forces
#                     protein, /na/i forces nucleotide; otherwise the
#                     type is derived from seqclass or from the residues
# :entry_id        :: name written in the header line
#                     (default: "<object id>.msf")
# :time            :: Time written in the header line (default Time.now)
def output_msf(options = {})
  len = self.seq_length

  if !options.has_key?(:avoid_same_name) or options[:avoid_same_name]
    sn = __clustal_avoid_same_name(self.sequence_names)
  else
    sn = self.sequence_names.collect do |x|
      x.to_s.gsub(/[\r\n\x00]/, ' ')
    end
  end
  if !options.has_key?(:replace_space) or options[:replace_space]
    sn.collect! { |x| x.gsub(/\s/, '_') }
  end
  if !options.has_key?(:escape) or options[:escape]
    sn.collect! { |x| x.gsub(/[\:\;\,\(\)]/, '_') }
  end
  if !options.has_key?(:split) or options[:split]
    sn.collect! { |x| x.split(/\s/)[0].to_s }
  end

  seqwidth = 50
  # the name column is as wide as the longest name, capped at 31
  namewidth = [31, sn.collect { |x| x.length }.max ].min
  sep = ' ' * 2

  seqregexp = Regexp.new("(.{1,#{seqwidth}})")
  gchar = (options[:gap_char] or '.')
  pchar = (options[:padding_char] or '~')

  aseqs = []
  self.each_seq do |s|
    aseqs << s.to_s.gsub(self.gap_regexp, gchar)
  end
  aseqs.each do |s|
    # leading gaps become padding characters ...
    s.sub!(/\A#{Regexp.escape(gchar)}+/) { |x| pchar * x.length }
    # ... and trailing gaps are removed, then refilled with padding
    s.sub!(/#{Regexp.escape(gchar)}+\z/, '')
    s << (pchar * (len - s.length))
  end

  case options[:case].to_s
  when /lower/i
    aseqs.each { |s| s.downcase! }
  when /upper/i
    aseqs.each { |s| s.upcase! }
  else #default upcase
    aseqs.each { |s| s.upcase! }
  end

  case options[:type].to_s
  when /protein/i, /aa/i
    amino = true
  when /na/i
    amino = false
  else
    if seqclass == Bio::Sequence::AA then
      amino = true
    elsif seqclass == Bio::Sequence::NA then
      amino = false
    else
      # If we can't determine the type, we assume protein, unless every
      # sequence is made up solely of A/C/G/T once the gap and padding
      # characters are removed. The whole sequence has to be tested,
      # not just one-character sequences.
      filler = Regexp.union(gchar, pchar)
      amino = aseqs.any? { |x| /\A[acgt]*\z/i !~ x.gsub(filler, '') }
    end
  end

  seq_type = (amino ? 'P' : 'N')

  fn = (options[:entry_id] or self.__id__.abs.to_s + '.msf')
  dt = (options[:time] or Time.now).strftime('%B %d, %Y %H:%M')

  sums = aseqs.collect { |s| GCG::Seq.calc_checksum(s) }
  #sums = aseqs.collect { |s| 0 }
  # overall checksum: sum of the per-sequence checksums modulo 10000
  sum = 0; sums.each { |x| sum += x }; sum %= 10000
  msf =
    [
     "#{seq_type == 'N' ? 'N' : 'A' }A_MULTIPLE_ALIGNMENT 1.0\n",
     "\n",
     "\n",
     " #{fn} MSF: #{len} Type: #{seq_type} #{dt} Check: #{sum} ..\n",
     "\n"
    ]

  sn.each do |snx|
    msf << ' Name: ' +
      sprintf('%*s', -namewidth, snx.to_s)[0, namewidth] +
      " Len: #{len} Check: #{sums.shift} Weight: 1.00\n"
  end
  msf << "\n//\n"

  # turn every sequence into a list of "name  chunk\n" lines
  aseqs.collect! do |s|
    snx = sn.shift
    head = sprintf("%*s", namewidth, snx.to_s)[0, namewidth] + sep
    s.gsub!(seqregexp, "\\1\n")
    a = s.split(/^/)
    a.collect { |x| head + x }
  end
  lines = (len + seqwidth - 1).div(seqwidth)
  i = 1
  lines.times do
    msf << "\n"
    # first and last column number covered by this block
    n_l = i
    n_r = [ i + seqwidth - 1, len ].min
    if n_l != n_r then
      w = [ n_r - n_l + 1 - n_l.to_s.length - n_r.to_s.length, 1 ].max
      msf << (' ' * namewidth + sep + n_l.to_s +
              ' ' * w + n_r.to_s + "\n")
    else
      msf << (' ' * namewidth + sep + n_l.to_s + "\n")
    end
    aseqs.each { |a| msf << a.shift }
    i += seqwidth
  end
  msf << "\n"
  msf.join('')
end
Generates PHYLIP interleaved alignment format text as a string.
# File lib/bio/alignment.rb, line 1082
# Generates phylip interleaved alignment format as a string.
#
# +options+ is handed to __output_phylip_common unchanged. The header
# comes first; then, block by block, the next line of every sequence
# is written, with an empty line between consecutive blocks.
def output_phylip(options = {})
  out, per_seq_lines, n_blocks = __output_phylip_common(options)
  n_blocks.times do
    per_seq_lines.each do |pending|
      out.push(pending.shift)
    end
    out.push("\n")
  end
  # no separator after the final block
  out.pop if out.last == "\n"
  out.join('')
end
Generates PHYLIP 3.2 (old) non-interleaved format text as a string.
# File lib/bio/alignment.rb, line 1093
# Generates Phylip3.2 (old) non-interleaved format as a string.
#
# +options+ is handed to __output_phylip_common unchanged. The header
# line is followed by all lines of the first sequence, then all lines
# of the second one, and so on.
def output_phylipnon(options = {})
  header, per_seq_lines, _lines = __output_phylip_common(options)
  body = per_seq_lines.join('')
  header[0] + body
end
to_clustal is deprecated. Instead, please use output_clustal.
# File lib/bio/alignment.rb, line 1053
# Deprecated: emits a deprecation warning and then returns the result
# of output_clustal called with the same arguments.
def to_clustal(*arg)
  warn("to_clustal is deprecated. Please use output_clustal.")
  __send__(:output_clustal, *arg)
end