The following is a small lexer which only matches `echo`, quoted and unquoted strings, and comments, and prints out the resulting tokens:
{
(* Tokens handed back by the lexer rules. *)
type token =
  | NEWLINE
  | ECHO
  | QUOTED of string
  | UNQUOTED of string
  | COMMENT of string

(* Raised by the rules when the end of the input is reached. *)
exception Eof

(* Selects which rule the next call to the lexer should start in. *)
type state =
  | CODE
  | LINE_COMMENT

(* Current lexer state; starts in CODE. *)
let state = ref CODE
}
(* Named regular expressions used by the rules. *)
let newline = '\n'
let alphanum = [ 'A'-'Z' 'a'-'z' '0'-'9' '_' ]
(* A whole line comment; not referenced by the rules visible in this file. *)
let comment_line = "//"([^ '\n' ]+)
let space = [ ' ' '\t' ]
(* A double-quoted string. The body may be empty, so that an empty quoted
   string is lexed as a token instead of matching no rule at all. *)
let quoted = '"'([^ '"' ]*)'"'
(* A path-like word; not referenced by the rules visible in this file, which
   build unquoted words piece by piece instead. *)
let unquoted = ('/'?(alphanum+'/'?)+)
(* Default entry point. Skips blanks and newlines, recognises the echo
   keyword and quoted strings directly, and hands over to the other rules
   for comments and unquoted words. A quoted token keeps its surrounding
   quote characters. Raises Eof at the end of the input. *)
rule code = parse
  (space | newline)+ { code lexbuf }
| "echo" { ECHO }
| quoted as literal { QUOTED literal }
| "//" { line_comment "" lexbuf }
| ('/' | alphanum+) as word { unquoted word lexbuf }
| eof { raise Eof }
(* Accumulates an unquoted word in [buff], one slash or alphanumeric run at
   a time. The word ends at a newline, at blanks, at a // comment marker or
   at the end of the input. When a comment marker ends the word, the marker
   is consumed here and [state] is switched to LINE_COMMENT so that the next
   call to the lexer resumes inside the comment. *)
and unquoted buff = parse
newline { UNQUOTED buff }
| "//" { state := LINE_COMMENT; if buff = "" then line_comment "" lexbuf else UNQUOTED buff }
| ('/'|alphanum+) { unquoted (buff ^ Lexing.lexeme lexbuf) lexbuf }
| space+ { UNQUOTED buff }
(* Emit the pending word instead of dropping it; end of input is matched
   again by the next lexer call. *)
| eof { UNQUOTED buff }
(* Accumulates the text of a line comment in [buff] until the end of the
   line, then resets [state] to CODE and returns the comment token. The
   terminating newline is consumed and is not part of the token. *)
and line_comment buff = parse
newline { state := CODE; COMMENT buff }
(* A comment that ends the input without a trailing newline is still
   returned; without this case no pattern matches at end of input. *)
| eof { state := CODE; COMMENT buff }
| _ { line_comment (buff ^ Lexing.lexeme lexbuf) lexbuf }
{
(* Runs the rule selected by the current lexer state on [lb]. *)
let lexer lb =
  match !state with
  | LINE_COMMENT -> line_comment "" lb
  | CODE -> code lb

(* Prints one token on standard output. *)
let print_token = function
  | NEWLINE -> Printf.printf "\n"
  | ECHO -> Printf.printf "ECHO\n"
  | QUOTED text -> Printf.printf "QUOTED(%s)\n" text
  | UNQUOTED text -> Printf.printf "UNQUOTED(%s)\n" text
  | COMMENT text -> Printf.printf "COMMENT(%s)\n" text

(* Driver: lexes standard input and prints every token, flushing after each
   one, until a rule raises Eof; then exits with status 0. *)
let () =
  try
    let lexbuf = Lexing.from_channel stdin in
    while true do
      print_token (lexer lexbuf);
      flush stdout
    done
  with Eof -> exit 0
}
It's a trick that I used in a project of mine to overcome that same limitation in ocamllex (compared to the original C lex program, which lets one match patterns in "look-ahead mode"). Basically, it splits the ambiguous rules into their distinct radicals and switches the lexer to a different parser accordingly. It also keeps track of the currently used parser as the next entry point.
In your situation, the only states it needs to keep track of are the default one (`CODE`) and comment mode (`LINE_COMMENT`). This could be expanded to support other states if needed.
echo foo
, shouldn't that be enough for it to postpone comment lexing? – Persecution