| Class | Parser::Lexer |
| In: |
lib/parser/lexer.rb
|
| Parent: | Object |
line 3 "lib/parser/lexer.rl"
Read the Ruby Hacking Guide chapter 11, available in English at http://whitequark.org/blog/2013/04/01/ruby-hacking-guide-ch-11-finite-state-lexer/
Remember two things about Ragel scanners:
1) Longest match wins.
2) If two matches have the same length, the first
in source code wins.
General rules of making Ragel and Bison happy:
* `p` (position) and `@te` contain the index of the character
they're pointing to ("current"), plus one. `@ts` contains the exact index
of the character it points to, with no offset. The code for extracting the matched token is:
@source_buffer.slice(@ts...@te)
* If your input is `foooooooobar` and the rule is:
'f' 'o'+
the result will be:
foooooooobar
^ ts=0   ^ p=te=9
* A Ragel lexer action should not emit more than one token, unless
you know what you are doing.
* All Ragel commands (fnext, fgoto, ...) end with a semicolon.
* If an action emits the token and transitions to another state, use
these Ragel commands:
emit($whatever)
fnext $next_state; fbreak;
If you perform `fgoto` in an action which neither emits a token nor
rewinds the stream pointer, the parser's side-effectful,
context-sensitive lookahead actions will break in a hard to detect
and debug way.
* If an action does not emit a token:
fgoto $next_state;
* If an action features lookbehind, i.e. matches characters with the
intent of passing them to another action:
p = @ts - 1
fgoto $next_state;
or, if the lookbehind consists of a single character:
fhold; fgoto $next_state;
* Ragel merges actions. So, if you have `e_lparen = '(' %act` and
`c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result
_will_ invoke the action `act`.
e_something stands for "something with **e**mbedded action".
* EOF is explicit and is matched by `c_eof`. If you want to introspect
the state of the lexer, add this rule to the state:
c_eof => do_eof;
* If you proceed past EOF, the lexer will complain:
NoMethodError: undefined method `ord' for nil:NilClass
| ESCAPES | = | { ?a.ord => "\a", ?b.ord => "\b", ?e.ord => "\e", ?f.ord => "\f", ?n.ord => "\n", ?r.ord => "\r", ?s.ord => "\s", ?t.ord => "\t", ?v.ord => "\v", ?\\.ord => "\\" } | line 82 "lib/parser/lexer.rl" % | |
| REGEXP_META_CHARACTERS | = | Regexp.union(*"\\$()*+.<>?[]^{|}".chars).freeze | ||
| RBRACE_OR_RBRACK | = | %w"} ]".freeze | ||
| LEX_STATES | = | { :line_begin => lex_en_line_begin, :expr_dot => lex_en_expr_dot, :expr_fname => lex_en_expr_fname, :expr_value => lex_en_expr_value, :expr_beg => lex_en_expr_beg, :expr_mid => lex_en_expr_mid, :expr_arg => lex_en_expr_arg, :expr_cmdarg => lex_en_expr_cmdarg, :expr_end => lex_en_expr_end, :expr_endarg => lex_en_expr_endarg, :expr_endfn => lex_en_expr_endfn, :expr_labelarg => lex_en_expr_labelarg, :interp_string => lex_en_interp_string, :interp_words => lex_en_interp_words, :plain_string => lex_en_plain_string, :plain_words => lex_en_plain_string, } | ||
| PUNCTUATION | = | { '=' => :tEQL, '&' => :tAMPER2, '|' => :tPIPE, '!' => :tBANG, '^' => :tCARET, '+' => :tPLUS, '-' => :tMINUS, '*' => :tSTAR2, '/' => :tDIVIDE, '%' => :tPERCENT, '~' => :tTILDE, ',' => :tCOMMA, ';' => :tSEMI, '.' => :tDOT, '..' => :tDOT2, '...' => :tDOT3, '[' => :tLBRACK2, ']' => :tRBRACK, '(' => :tLPAREN2, ')' => :tRPAREN, '?' => :tEH, ':' => :tCOLON, '&&' => :tANDOP, '||' => :tOROP, '-@' => :tUMINUS, '+@' => :tUPLUS, '~@' => :tTILDE, '**' => :tPOW, '->' => :tLAMBDA, '=~' => :tMATCH, '!~' => :tNMATCH, '==' => :tEQ, '!=' => :tNEQ, '>' => :tGT, '>>' => :tRSHFT, '>=' => :tGEQ, '<' => :tLT, '<<' => :tLSHFT, '<=' => :tLEQ, '=>' => :tASSOC, '::' => :tCOLON2, '===' => :tEQQ, '<=>' => :tCMP, '[]' => :tAREF, '[]=' => :tASET, '{' => :tLCURLY, '}' => :tRCURLY, '`' => :tBACK_REF2, '!@' => :tBANG, '&.' => :tANDDOT, } | Mapping of strings to parser tokens. | |
| PUNCTUATION_BEGIN | = | { '&' => :tAMPER, '*' => :tSTAR, '**' => :tDSTAR, '+' => :tUPLUS, '-' => :tUMINUS, '::' => :tCOLON3, '(' => :tLPAREN, '{' => :tLBRACE, '[' => :tLBRACK, } | ||
| KEYWORDS | = | { 'if' => :kIF_MOD, 'unless' => :kUNLESS_MOD, 'while' => :kWHILE_MOD, 'until' => :kUNTIL_MOD, 'rescue' => :kRESCUE_MOD, 'defined?' => :kDEFINED, 'BEGIN' => :klBEGIN, 'END' => :klEND, } | ||
| KEYWORDS_BEGIN | = | { 'if' => :kIF, 'unless' => :kUNLESS, 'while' => :kWHILE, 'until' => :kUNTIL, 'rescue' => :kRESCUE, 'defined?' => :kDEFINED, } |
| _lex_eof_trans | [RW] | |
| _lex_from_state_actions | [RW] | |
| _lex_index_offsets | [RW] | |
| _lex_indicies | [RW] | |
| _lex_key_spans | [RW] | |
| _lex_to_state_actions | [RW] | |
| _lex_trans_actions | [RW] | |
| _lex_trans_keys | [RW] | |
| _lex_trans_targs | [RW] | |
| cmdarg | [RW] | |
| comments | [RW] | |
| cond | [RW] | |
| diagnostics | [RW] | |
| force_utf32 | [RW] | |
| in_kwarg | [RW] | |
| lex_en_expr_arg | [RW] | |
| lex_en_expr_beg | [RW] | |
| lex_en_expr_cmdarg | [RW] | |
| lex_en_expr_dot | [RW] | |
| lex_en_expr_end | [RW] | |
| lex_en_expr_endarg | [RW] | |
| lex_en_expr_endfn | [RW] | |
| lex_en_expr_fname | [RW] | |
| lex_en_expr_labelarg | [RW] | |
| lex_en_expr_mid | [RW] | |
| lex_en_expr_value | [RW] | |
| lex_en_expr_variable | [RW] | |
| lex_en_interp_backslash_delimited | [RW] | |
| lex_en_interp_backslash_delimited_words | [RW] | |
| lex_en_interp_string | [RW] | |
| lex_en_interp_words | [RW] | |
| lex_en_leading_dot | [RW] | |
| lex_en_line_begin | [RW] | |
| lex_en_line_comment | [RW] | |
| lex_en_plain_backslash_delimited | [RW] | |
| lex_en_plain_backslash_delimited_words | [RW] | |
| lex_en_plain_string | [RW] | |
| lex_en_plain_words | [RW] | |
| lex_en_regexp_modifiers | [RW] | |
| lex_error | [RW] | |
| lex_start | [RW] | |
| source_buffer | [R] | |
| static_env | [RW] | |
| tokens | [RW] |