----------------------------------------------------------------------
--
-- This sample shows how to build and use a different lexer in
-- Metalua. A lexer is essentially an object with a [newstream]
-- method, which takes a source code as a parameter and returns a
-- lexstream as a result; a lexstream is an object with methods
-- [peek], [next], [add] and [is_keyword].
--
-- Although you can rewrite such an object from scratch, it's often
-- best to extend the original lexer provided with Metalua; this is
-- what's done in this sample.
--
-- Here, we introduce an alternative syntax which structures code
-- blocks according to indentation, in a way similar to what Python
-- does. Since we don't want to modify the parser in depth, here is
-- how it works:
--
-- * a colon at the end of a line, followed by an indentation,
--   begins a block;
--
-- * when a line begins with an indentation less than the indentation
--   of some unterminated blocks, all of these blocks are closed.
--   Closing them is done by injecting as many `Keyword{ "end" }
--   tokens in the stream as necessary.
--
-- * Under such conditions, the Lua keywords which introduce new
--   blocks become quite annoying, as they're redundant with the
--   colon. Therefore, we suppress "then", as well as "do" in
--   loops.
--
-- * It's now possible to mix indentation-induced blocks, which
--   generate implicit "end"s, with normal blocks. For instance, these
--   two statements are equivalent:
--
--     for i=1,10 print(i) end
--
--     for i=1,10:
--        print(i)
--
--   Also notice that the former version, with explicit "end"s, could
--   span on several lines, as long as each of them is indented enough
--   not to close any surrounding colon-induced block.
--
-- For a usage sample of this, look at pysample.lua
--
----------------------------------------------------------------------

----------------------------------------------------------------------
-- I assume that the regular lexer is loaded; this is always the
-- case at metalevel zero.
----------------------------------------------------------------------
assert (mlp.lexer)

----------------------------------------------------------------------
-- Brutal inheritance by full cloning: there's no need to keep shared
-- behavior between the original lexer and this one, as I don't plan
-- to modify the latter.
--
-- Notice that [table.deep_copy()] takes care of also copying
-- the metatable, therefore providing a real object-cloning
-- facility.
--
-- [pylex] is deliberately left global, as in the original sample.
----------------------------------------------------------------------
local super = mlp.lexer
pylex = table.deep_copy (super)
pylex.stream_mt = { __index=pylex }

----------------------------------------------------------------------
-- We need to be less forgetful, now that whitespaces are significant.
-- therefore we completely stop considering tabs as whitespaces:
----------------------------------------------------------------------
pylex.patterns.spaces = "^[ \r\n]*()"

----------------------------------------------------------------------
-- These are the remaining mandatory block end markers: if they
-- appear alone on a line, we must not generate an implicit "end".
----------------------------------------------------------------------
pylex.explicit_block_terminators = {
   ["elseif"]=1, ["else"]=1, ["until"]=1 }

----------------------------------------------------------------------
-- Handling whitespaces: it calls the normal whitespace handler, then
-- checks if it's at a beginning of line. If so, and if indentation is
-- less than some block indentation levels, add as many "end" keywords
-- as required to close all blocks that must be closed. These "end"
-- keywords are added by incrementing [self.pending_ends], which will
-- actually be converted back into real keywords by the *following*
-- calls to the whitespace handler.
--
-- Returns:
--  * "Keyword", "end" when an implicit block terminator is injected;
--  * whatever the original handler returned when it reports "Eof"
--    and no block remains open;
--  * nothing otherwise, so that the next extractor gets a chance.
----------------------------------------------------------------------
function pylex:skip_whitespaces_and_comments()

   ---------------------------------------------------------
   -- If there are some "end"s to generate, do it before
   -- munching more data:
   ---------------------------------------------------------
   if self.pending_ends > 0 then
      printf("%i pending ends to generate", self.pending_ends)
      self.pending_ends = self.pending_ends - 1
      return "Keyword", "end"
   end

   local previous_i = self.i
   local tag, content = super.skip_whitespaces_and_comments(self)

   ---------------------------------------------------------
   -- When Eof happens, we need to close all open blocks
   -- before actually returning Eof.
   ---------------------------------------------------------
   if tag=="Eof" then
      -- [indent_levels] always keeps a trailing 0 for the top level,
      -- hence the "-1".
      local unclosed = #self.indent_levels-1
      printf ("There are %i ends to close", unclosed)
      if unclosed>0 then
         self.indent_levels = { 0 }
         -- One "end" is returned right now, the others are queued.
         self.pending_ends = unclosed-1
         return "Keyword", "end"
      else
         return tag, content
      end
   end
   assert (not tag, "Original lexer returned a non-Eof whitespace value!?")

   ---------------------------------------------------------
   -- Check if this is a line's first token, and if so,
   -- check indentation. We walk backward over the spaces
   -- preceding the token: if we bump into a line break, the
   -- token is the first one of its line.
   ---------------------------------------------------------
   local j, k = self.i
   repeat j=j-1; k = self.src:sub(j,j) until j==previous_i or k~=" "

   if k=="\r" or k=="\n" then -- This is indeed the first token of a line

      ------------------------------------------------------
      -- Pop every block which is indented deeper than the
      -- current line. The trailing 0 in [indent_levels] is
      -- never popped, since [indent_level] can't be negative.
      ------------------------------------------------------
      local indent_level = self.i-j-1
      local closed = 0
      while self.indent_levels[1] > indent_level do
         table.remove(self.indent_levels, 1)
         closed = closed + 1
      end

      ------------------------------------------------------
      -- There are some explicit keywords which close a block
      -- explicitly. If one of them is going to be read,
      -- cancel one implicit "end" -- but only if an implicit
      -- "end" was actually going to be generated for this
      -- line. Otherwise [pending_ends] would become negative
      -- and silently swallow the "end" of a later block.
      ------------------------------------------------------
      local next_word = self.src:match("^([%a_][%w_]+)", self.i)
      if closed > 0 and next_word
         and self.explicit_block_terminators[next_word] then
         closed = closed - 1
      end
      self.pending_ends = self.pending_ends + closed

      ------------------------------------------------------
      -- We have to do it again, as some "end"s might
      -- have been added just above:
      ------------------------------------------------------
      if self.pending_ends > 0 then
         printf("%i pending ends to generate [trail]", self.pending_ends)
         self.pending_ends = self.pending_ends - 1
         return "Keyword", "end"
      end
   end
end

----------------------------------------------------------------------
-- Find the colons followed by a line break (optionally with a
-- one-line "--" comment in between) and an indented line: they begin
-- a new block. The new indentation level is pushed on top of
-- [self.indent_levels] and the colon, the line break and the
-- indentation are consumed.
--
-- This extractor never returns a token: it only moves [self.i], and
-- lets the following extractors read the first token of the block.
----------------------------------------------------------------------
function pylex:extract_block_begin()
   -- [x] is the position just after the line break(s), [y] the
   -- position just after the indentation spaces.
   local x, y = self.src:match("^: *[\r\n]+() +()", self.i)
   if not x then
      -- Same thing with a short comment between the colon and the
      -- line break; any number of spaces may separate them.
      x, y = self.src:match("^: *%-%-[^\n]*[\r\n]+() +()", self.i)
   end
   if not x then return end

   local ilevel = y-x
   -- NOTE: a new block which is not indented deeper than the
   -- enclosing one (ilevel <= self.indent_levels[1]) is tolerated;
   -- the "Messed up indentation" error is intentionally disabled.
   table.insert (self.indent_levels, 1, ilevel)
   self.i = y
   -- There might be comments to skip after indent:
   super.skip_whitespaces_and_comments (self)
end

----------------------------------------------------------------------
-- This is some internal hacking of the lexer: when [mlp_lexer] tries
-- to extract a lexeme, it tries all the methods of this list in order,
-- until one of them actually returns a (tag, content) pair.
-- Therefore, we introduce [extract_block_begin], before
-- [extract_symbol] (which would wrongly accept line-terminating ":")
----------------------------------------------------------------------
table.insert (pylex.extractors, 2, "extract_block_begin")

----------------------------------------------------------------------
-- Constructor: there's a couple of extra instance fields to set up
--  * [indent_levels]: stack of the open blocks' indentations,
--    innermost first; the final 0 stands for the top level.
--  * [pending_ends]: number of implicit "end" keywords still to emit.
----------------------------------------------------------------------
function pylex:newstream(src)
   print "Opening pythonic lexer stream"
   local s = super.newstream (self, src)
   s.indent_levels = { 0 }
   s.pending_ends = 0
   return s
end

----------------------------------------------------------------------
-- Now, about syntax tuning: we remove some redundant keywords, since
-- they will overlap with ":" indenters:
-- * "then"s in if statements
-- * "do" in for and while loops
-- We do a little bit of extra checking, just in case some other
-- extension already hacked the parsers we're about to fiddle with.
-- The check is done *before* removing anything, so that a parser
-- with an unexpected shape is left untouched when the assertion fails.
--
-- In a real extension, you'd rather make these keywords optional
-- instead of just deleting them, but here I try to avoid unnecessary
-- clutter.
----------------------------------------------------------------------
local x_then_y = mlp.stat:get("if")[2].primary
assert (x_then_y[2] == "then", "Not the regular if/then/else parser?!")
table.remove (x_then_y, 2)

local for_parser = mlp.stat:get("for")
assert (for_parser[3] == "do", "Not the regular for/do parser?!")
table.remove (for_parser, 3)

local while_parser = mlp.stat:get("while")
assert (while_parser[3] == "do", "Not the regular while/do parser?!")
table.remove (while_parser, 3)

----------------------------------------------------------------------
-- Finally, change the lexer used by the compiler!
----------------------------------------------------------------------
mlp.lexer = pylex