----------------------------------------------------------------------
--
-- This sample shows how to build and use a different lexer in
-- Metalua. A lexer is essentially an object with a [newstream]
-- method, which takes some source code as a parameter and returns a
-- lexstream as a result; a lexstream is an object with methods
-- [peek], [next], [add] and [is_keyword].
--
-- Although you can rewrite such an object from scratch, it's often
-- best to extend the original lexer provided with Metalua; this is
-- what's done in this sample.
--
-- Here, we introduce an alternative syntax which structures code blocks
-- according to indentation, in a way similar to what Python
-- does. Since we don't want to modify the parser in depth, here is
-- how it works:
--
-- * a colon at the end of a line, followed by an indentation,
--   begins a block;
--
-- * when a line begins with an indentation less than the indentation
--   of some unterminated blocks, all of these blocks are closed.
--   Closing them is done by injecting as many `Keyword{ "end" }
--   tokens in the stream as necessary.
--
-- * Under such conditions, the Lua keywords which introduce new
--   blocks become quite annoying, as they're redundant with the
-- colon. Therefore, we suppress "then", as well as "do" in
--   loops.
--
-- * It's now possible to mix indentation-induced blocks, which
--   generate implicit "end"s, with normal blocks. For instance, these
--   two statements are equivalent:
--
--   for i=1,10 print(i) end
--
--   for i=1,10:
--      print(i)
--
--   Also notice that the former version, with explicit "end"s, could
--   span several lines, as long as each of them is indented enough
--   not to close any surrounding colon-induced block.
--
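--   As an illustration of such mixing (a sketch in the extended
--   syntax, not taken from pysample.lua), both styles nest freely:
--
--   for i=1,10:
--      if i%2==0 print "even" end
--      print(i)
--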
--   For a usage sample of this, look at pysample.lua
--
----------------------------------------------------------------------

----------------------------------------------------------------------
-- I assume that the regular lexer is loaded; this is always the
-- case at metalevel zero.
----------------------------------------------------------------------
assert (mlp.lexer)

----------------------------------------------------------------------
-- Brutal inheritance by full cloning: there's no need to keep shared
-- behavior between the original lexer and this one, as the original
-- won't be modified after the copy is taken.
--
-- Notice that [table.deep_copy()] takes care of also copying
-- the metatable, therefore providing a real object-cloning
-- facility.
----------------------------------------------------------------------
local super = mlp.lexer
pylex = table.deep_copy (super)
pylex.stream_mt = { __index=pylex }
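
----------------------------------------------------------------------
-- (For reference, a minimal sketch of what such a cloning helper has
-- to do. Metalua provides its own [table.deep_copy]; this naive
-- version only illustrates the idea, and assumes acyclic tables:
--
--   local function deep_copy (t)
--      if type(t) ~= "table" then return t end
--      local copy = { }
--      for k, v in pairs (t) do copy[deep_copy(k)] = deep_copy(v) end
--      return setmetatable (copy, getmetatable (t))
--   end
-- )
----------------------------------------------------------------------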

----------------------------------------------------------------------
-- We need to be less forgiving, now that whitespace is significant;
-- therefore we completely stop considering tabs as whitespace:
----------------------------------------------------------------------
pylex.patterns.spaces = "^[ \r\n]*()"
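-- For instance, ("  \tx"):match (pylex.patterns.spaces) returns 3:
-- the position capture stops right before the tab, which is no
-- longer skipped as blank space.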

----------------------------------------------------------------------
-- These are the remaining mandatory block terminators: when one of
-- them starts a dedented line, it closes a block by itself, so no
-- implicit "end" must be generated for it.
----------------------------------------------------------------------
pylex.explicit_block_terminators = {
   ["elseif"]=1, ["else"]=1, ["until"]=1 }

----------------------------------------------------------------------
-- Whitespace handling: call the normal whitespace handler, then
-- check whether we're at the beginning of a line. If so, and if the
-- indentation is smaller than some open blocks' levels, add as many
-- "end" keywords as required to close all the blocks that must be
-- closed. These "end" keywords are added by incrementing
-- [self.pending_ends], which is converted back into actual keywords
-- by *subsequent* calls to the whitespace handler.
----------------------------------------------------------------------
function pylex:skip_whitespaces_and_comments()

   ---------------------------------------------------------
   -- If there are some "end"s to generate, do it before
   -- munching more data:
   ---------------------------------------------------------
   if self.pending_ends > 0 then
      -- There were some pending "end" keywords to generate.
      printf("%i pending ends to generate", self.pending_ends)
      self.pending_ends = self.pending_ends - 1
      return "Keyword", "end"
   end

   local previous_i = self.i
   local tag, content = super.skip_whitespaces_and_comments(self)

   ---------------------------------------------------------
   -- When Eof happens, we need to close all open blocks
   -- before actually returning Eof.
   ---------------------------------------------------------
   if tag=="Eof" then
      local unclosed = #self.indent_levels-1
      printf ("There are %i ends to close", unclosed);
      if unclosed>0 then
         self.indent_levels = { 0 }
         self.pending_ends = unclosed-1
         return "Keyword", "end" 
      else
         return tag, content
      end
   end      

   assert (not tag, "Original lexer returned a non-Eof whitespace value!?")

   ---------------------------------------------------------
   -- Check if this is a line's first token, and if so,
   -- check indentation
   ---------------------------------------------------------
   local j, k = self.i
   repeat j=j-1;  k = self.src:sub(j,j) until j==previous_i or k~=" "
   if k=="\r" or k=="\n" then -- This is indeed the first token of a line

      ---------------------------------------------------------
      -- Some keywords close a block by themselves; if one of
      -- them is about to be read, cancel one implicit "end".
      ---------------------------------------------------------
      local next_word = self.src:match("^([%a_][%w_]*)", self.i)
      if next_word and self.explicit_block_terminators[next_word] then
         self.pending_ends = self.pending_ends-1 end
      ---------------------------------------------------------
      -- Generate enough "end"s to match the new indent level.
      ---------------------------------------------------------
      local indent_level = self.i-j-1
      while true do
         local block_level = self.indent_levels[1]
         if block_level > indent_level then 
            self.pending_ends = self.pending_ends+1 
            table.remove(self.indent_levels, 1)
         else break end
      end
      
      ------------------------------------------------------
      -- We have to do it again, as some "end"s might
      -- have been added just above:
      ------------------------------------------------------
      if self.pending_ends > 0 then
         printf("%i pending ends to generate [trail]", self.pending_ends)
         self.pending_ends = self.pending_ends - 1
         return "Keyword", "end"
      end
   end
end
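
----------------------------------------------------------------------
-- For example, on the following source:
--
--   while x:
--      if y:
--         f()
--   g()
--
-- the line holding g() dedents below two open blocks, so the next two
-- calls to this method each return ("Keyword", "end") before g()
-- itself gets lexed; the parser effectively sees:
--
--   while x if y f() end end g()
----------------------------------------------------------------------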

----------------------------------------------------------------------
-- Find the colons followed by a line break, possibly with a
-- comment in between
----------------------------------------------------------------------
function pylex:extract_block_begin()
   local x, y = self.src:match("^: *[\r\n]+() +()", self.i)
   if not x then x, y = self.src:match("^: %-%-[^\n]*[\r\n]+() +()", self.i) end
   if x then 
      local ilevel = y-x
      -- Sanity check, currently disabled:
      -- if ilevel <= self.indent_levels[1] then
      --    error (string.format ("Messed up indentation: %i->%i",
      --                          self.indent_levels[1], ilevel))
      -- end
      table.insert (self.indent_levels, 1, ilevel) 
      self.i = y
      -- There might be comments to skip after indent:
      super.skip_whitespaces_and_comments (self)
      return
   end
end
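
----------------------------------------------------------------------
-- For instance, on the source ":\n   foo" with [self.i] at the ":",
-- [x] captures the position right after the newline and [y] the
-- position of "foo", hence an indentation level of y-x = 3.
----------------------------------------------------------------------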
   
----------------------------------------------------------------------
-- This is some internal hacking of the lexer: when [mlp_lexer] tries
-- to extract a lexeme, it tries all the methods of this list in order,
-- until one of them actually returns a (tag, content) pair.
-- Therefore, we introduce [extract_block_begin] before
-- [extract_symbol] (which would wrongly accept a line-terminating ":").
----------------------------------------------------------------------
table.insert (pylex.extractors, 2, "extract_block_begin")
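
----------------------------------------------------------------------
-- (For reference, each entry in [extractors] is a method name; each
-- method is called on the stream, and reports a match by returning a
-- (tag, content) pair. A hypothetical extractor recognizing "@"
-- tokens, for illustration only, would look like:
--
--   function pylex:extract_at()
--      if self.src:sub(self.i, self.i) == "@" then
--         self.i = self.i + 1
--         return "Keyword", "@"
--      end
--   end
--
-- [extract_block_begin] above is unusual in that it returns nothing
-- even on success: it only repositions [self.i] and updates the
-- indentation bookkeeping.)
----------------------------------------------------------------------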

----------------------------------------------------------------------
-- Constructor: there are a couple of extra instance fields to set up
----------------------------------------------------------------------
function pylex:newstream(src)
   print "Opening pythonic lexer stream"
   local s = super.newstream (self, src)
   s.indent_levels = { 0 }
   s.pending_ends = 0
   return s
end
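
----------------------------------------------------------------------
-- A stream built this way can be driven by hand; for instance (an
-- illustrative sketch, the exact shape of tokens depends on the
-- stock lexer):
--
--   local stream = pylex:newstream "if x:\n   f()"
--   local tok = stream:peek()  -- first token, not consumed
--   tok = stream:next()        -- same token, consumed this time
----------------------------------------------------------------------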

----------------------------------------------------------------------
-- Now, about syntax tuning: we remove some redundant keywords, since
-- they will overlap with ":" indenters: 
--  * "then"s in if statements
--  * "do" in for and while loops
-- We do a little bit of extra checking, just in case some other
-- extension already hacked the parsers we're about to fiddle with.
--
-- In a real extension, you'd rather make these keywords optional
-- instead of just deleting them, but here I try to avoid unnecessary
-- clutter.
----------------------------------------------------------------------
local x_then_y = mlp.stat:get("if")[2].primary
assert (table.remove(x_then_y, 2) == "then", 
        "Not the regular if/then/else parser?!")
local for_parser = mlp.stat:get("for")
assert (table.remove(for_parser, 3) == "do", 
        "Not the regular for/do parser?!")
local while_parser = mlp.stat:get("while")
assert (table.remove(while_parser, 3) == "do", 
        "Not the regular while/do parser?!")

----------------------------------------------------------------------
-- Finally, change the lexer used by the compiler!
----------------------------------------------------------------------
mlp.lexer = pylex