Lua long strings in fslex
I've been working on a Lua fslex lexer in my spare time, using the ocamllex manual as a reference.

I hit a few snags while trying to tokenize long strings correctly. "Long strings" are delimited by '[' ('=')* '[' and ']' ('=')* ']' tokens; the number of = signs must be the same.
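
For example (plain Lua input, not lexer code), each of these is a single long string; a closing bracket only terminates the string when its level, i.e. its number of = signs, matches the opening one:

[[a level-0 long string]]
[=[level 1; the inner ]] does not close it]=]
[==[level 2]==]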

In the first implementation, the lexer failed to recognize [[ patterns, producing two LBRACKET tokens despite the longest match rule, whereas [=[ and its variations were recognized correctly. In addition, the regular expression could not ensure that the closing token matched the opening one, stopping at the first ']' ('=')* ']' capture no matter the actual long string "level". Also, fslex does not seem to support the "as" construct in regular expressions.


let lualongstring =    '[' ('=')* '[' ( escapeseq | [^ '\\' '[' ] )* ']' ('=')* ']'

(* ... *)
    | lualongstring    { (* ... *) }
    | '['              { LBRACKET }
    | ']'              { RBRACKET }
(* ... *)


I've been trying to solve the issue with another rule in the lexer:


rule tokenize = parse
    (* ... *)
    | '[' ('=')* '['   { longstring (getLongStringLevel(lexeme lexbuf)) lexbuf }
    (* ... *)

and longstring level = parse 
    | ']' ('=')* ']'   { (* check level, do something *) }
    | _                { (* aggregate other chars *) }

    (* or *)

    | _    {
               let c = lexbuf.LexemeChar 0
               (* ... *)
           }

But I'm stuck for two reasons: first, I don't think I can "push", so to speak, a token to the next rule once I'm done reading the long string; second, I don't like the idea of reading char by char until the right closing token is found, which makes the current design useless.

How can I tokenize Lua long strings in fslex? Thanks for reading.

Blagoveshchensk answered 4/12, 2010 at 0:11 Comment(5)
Offhand, just wanted to mention: you could always choose to parse it, rather than lex it. – Jackscrew
@Brian, can you please elaborate? :) I'm a bit at a loss trying to understand how to parse a sequence of unrelated tokens to recreate the original long string, provided the lexer can produce tokens for all the content of the string. Thanks for your comment. – Blagoveshchensk
Yeah, it's probably not a good strategy, I was just throwing it out there. – Jackscrew
@Jackscrew thanks all the same, I'm still coming to grips with F# and fslex, and every little bit helps. – Blagoveshchensk
@Raine In any case, keep us informed; I'm also interested in both F# and Lua. – Summon

Apologies for answering my own question, but I'd like to contribute my own solution to the problem for future reference.

I am keeping state across lexer function calls with the LexBuffer<_>.BufferLocalStore property, which is simply a writable IDictionary<string, obj> instance.

Note: long brackets are used both by long strings and by multiline comments, an often overlooked part of the Lua grammar; a sketch of handling comments with the same machinery follows the lexer rules below.



let beginlongbracket =    '[' ('=')* '['
let endlongbracket =      ']' ('=')* ']'

rule tokenize = parse
    | beginlongbracket 
    { longstring (longBracketLevel(lexeme lexbuf)) lexbuf }

(* ... *)

and longstring level = parse
    | endlongbracket
    { if longBracketLevel(lexeme lexbuf) = level then
          LUASTRING(endLongString(lexbuf))
      else
          (* a closing bracket of a different level is part of the string *)
          (toLongString lexbuf (lexeme lexbuf); longstring level lexbuf)
    }

    | _
    { toLongString lexbuf (lexeme lexbuf); longstring level lexbuf }

    | eof
    { failwith "Unexpected end of file in string." }
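
Since long brackets also delimit multiline comments, the same machinery can be reused. Here is a sketch under the same naming assumptions, with a hypothetical longcomment rule that discards lexemes instead of accumulating them:

rule tokenize = parse
    (* ... *)
    | "--" beginlongbracket
    { longcomment (longBracketLevel(lexeme lexbuf)) lexbuf }

and longcomment level = parse
    | endlongbracket
    { if longBracketLevel(lexeme lexbuf) = level then
          tokenize lexbuf   (* comment fully consumed, resume normal lexing *)
      else
          longcomment level lexbuf }

    | _
    { longcomment level lexbuf }

    | eof
    { failwith "Unexpected end of file in comment." }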


Here are the functions I use to simplify storing data into the BufferLocalStore:

open System.Text
open System.Linq
open Microsoft.FSharp.Text.Lexing

(* the level of a long bracket is its number of '=' signs *)
let longBracketLevel (str : string) =
    str.Count(fun c -> c = '=')

(* create the StringBuilder that accumulates the string's contents *)
let createLongStringStorage (lexbuf : LexBuffer<_>) =
    let sb = new StringBuilder(1000)
    lexbuf.BufferLocalStore.["longstring"] <- box sb
    sb

(* append a lexeme to the accumulated string, creating the storage on first use *)
let toLongString (lexbuf : LexBuffer<_>) (s : string) =
    let hasString, sb = lexbuf.BufferLocalStore.TryGetValue("longstring")
    let storage = if hasString then (sb :?> StringBuilder) else (createLongStringStorage lexbuf)
    storage.Append(s) |> ignore

(* return the accumulated string and clear the storage for the next long string *)
let endLongString (lexbuf : LexBuffer<_>) : string =
    let hasString, sb = lexbuf.BufferLocalStore.TryGetValue("longstring")
    let ret = if not hasString then "" else (sb :?> StringBuilder).ToString()
    lexbuf.BufferLocalStore.Remove("longstring") |> ignore
    ret
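
A quick standalone check of the helpers (a hypothetical snippet, not part of the lexer; LexBuffer<char>.FromString merely provides a buffer to hang the BufferLocalStore on):

let lexbuf = LexBuffer<char>.FromString "dummy"
toLongString lexbuf "a"
toLongString lexbuf "bc"
assert (endLongString lexbuf = "abc")
assert (endLongString lexbuf = "")   (* the storage is cleared after each string *)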

Perhaps it's not very functional, but it seems to get the job done. To summarize:

  • Use the tokenize rule until the beginning of a long bracket is found.
  • Switch to the longstring rule and loop until a closing long bracket of the same level is found.
  • Store every lexeme that does not match a closing long bracket of the same level into a StringBuilder, which is in turn stored in the LexBuffer's BufferLocalStore.
  • Once the long string is over, clear the BufferLocalStore.
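
As a concrete trace (rule and token names as above), lexing the input [==[a ]] b]==] proceeds like this:

[==[     beginlongbracket, level 2 -> enter longstring
a, ' '   single characters, accumulated by the _ rule
]]       matches endlongbracket, but level 0 <> 2 -> accumulated as well
' ', b   single characters, accumulated
]==]     endlongbracket, level 2 -> longstring returns LUASTRING("a ]] b")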

Edit: You can find the project at http://ironlua.codeplex.com. Lexing and parsing should be okay. I am planning on using the DLR. Comments and constructive criticism welcome.

Blagoveshchensk answered 5/12, 2010 at 13:17 Comment(0)
