----------------------------------------------------------------------
-- Metalua:  $Id$
--
-- Summary: Source file lexer. Currently only works on strings.
-- Some API refactoring is needed.
--
----------------------------------------------------------------------
--
-- Copyright (c) 2006, Fabien Fleutot <metalua@gmail.com>.
--
-- This software is released under the MIT Licence, see licence.txt
-- for details.
--
----------------------------------------------------------------------

module ("mlp", package.seeall)
--require "std"

lexer = { }

local debugf = function() end
--local debugf=printf

----------------------------------------------------------------------
-- Patterns used by [lexer:extract] to decompose the raw string into
-- correctly tagged tokens.
----------------------------------------------------------------------
lexer.patterns = {
   spaces              = "^[ \r\n\t]*()",
   short_comment       = "^%-%-([^\n]*)()\n",
   final_short_comment = "^%-%-([^\n]*)()$",
   long_comment        = "^%-%-%[(=*)%[\n?(.-)%]%1%]()",
   long_string         = "^%[(=*)%[\n?(.-)%]%1%]()",
   number_mantissa     = {
      "^%d+%.?%d*()",   -- digits with optional fraction, e.g. "42", "3.14"
      "^%d*%.%d+()" },  -- fraction with optional integer part, e.g. ".5"
   number_exponant = "^[eE][%+%-]?%d+()",
   word            = "^([%a_][%w_]*)()"
}
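
-- Example (illustrative, not executed by the module): each pattern
-- ends with a position capture [()], so a successful match also
-- returns the index of the first unconsumed character:
--   ("   foo"):match (lexer.patterns.spaces, 1)  --> 4
--   ("foo = 1"):match (lexer.patterns.word, 1)   --> "foo", 4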

----------------------------------------------------------------------
-- Take a letter [x] and return the character represented by the 
-- sequence ['\\'..x], e.g. [unesc_letter "n" == "\n"].
----------------------------------------------------------------------
local function unesc_letter(x)
   local t = { 
      a = "\a", b = "\b", f = "\f",
      n = "\n", r = "\r", t = "\t", v = "\v",
      ["\\"] = "\\", ["'"] = "'", ['"'] = '"' }
   return t[x] or error("Unknown escape sequence \\"..x)
end

----------------------------------------------------------------------
-- Turn the digits of an escape sequence into the corresponding
-- character, e.g. [unesc_digits("123") == string.char(123)].
----------------------------------------------------------------------
local function unesc_digits (x)
   local k, j, i = x:reverse():byte(1, 3)
   local z = _G.string.byte "0"
   return _G.string.char ((k or z) + 10*(j or z) + 100*(i or z) - 111*z)
end
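
-- Example (illustrative): [unesc_digits "65"] reverses the digits to
-- "56", reads their byte values, and computes 6*10 + 5 = 65, hence:
--   assert (unesc_digits "65" == string.char (65))  -- "A"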

----------------------------------------------------------------------
-- Unescape a whole string, applying [unesc_digits] and [unesc_letter]
-- as many times as required.
----------------------------------------------------------------------
local function unescape_string (s)
   -- Process numeric escapes first (at most three digits, as in Lua),
   -- then letter escapes. FIXME: an escaped backslash followed by
   -- digits is still mishandled by this two-pass approach.
   return s:gsub("\\([0-9][0-9]?[0-9]?)", unesc_digits):gsub("\\(.)",unesc_letter)
end
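
-- Example (illustrative): both kinds of escape in one string:
--   assert (unescape_string [[a\65\nb]] == "aA\nb")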

lexer.extractors = {
   "skip_whitespaces_and_comments",
   "extract_short_string", "extract_word", "extract_number", 
   "extract_long_string", "extract_symbol" }

----------------------------------------------------------------------
-- Really extract the next token from the raw string 
-- (and update the index).
----------------------------------------------------------------------
function lexer:extract ()

   local previous_i = self.i
   local loc = self.i -- token position; refined after whitespace is skipped

   local function tk (tag, content)
      assert (tag and content)
      local i, ln = previous_i, self.line
      -- update line numbers
      while true do
         i = self.src:find("\n", i+1, true)
         if not i then break end
         if loc and i <= loc then ln = ln+1 end
         if i <= self.i then self.line = self.line+1 else break end
      end
      local a = { tag = tag, char=loc, line=ln, content }
      if #self.attached_comments > 0 then 
         a.comments = self.attached_comments 
         self.attached_comments = nil
      end
      local amt = { 
         __tostring = function() 
            return string.format ("`%s{'%s'}",a.tag, a[1]) end } 
      setmetatable (a, amt)
      return a
   end

   self.attached_comments = { }
   
   for i = 1, #self.extractors do
      -- printf("method = %s", method)
      local tag, content = self[self.extractors[i]](self)
      -- [loc] is placed just after the leading whitespaces and comments
      if i==1 then loc = self.i end

      if tag then 
         --printf("`%s{ %q }\t%i", tag, content, loc);
         return tk (tag, content) 
      end
   end

   error "Cant extract anything!"
end   
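
-- Example (illustrative): given a stream [s] over the source
-- "local x", the first call to [s:extract()] returns a token table
-- { tag="Keyword", char=1, line=1, "local" }, whose metatable makes
-- it print as `Keyword{'local'}.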

----------------------------------------------------------------------
-- skip whites and comments
-- FIXME: doesn't take into account:
-- - unterminated long comments
-- - short comments without a final \n
----------------------------------------------------------------------
function lexer:skip_whitespaces_and_comments()
   repeat
      local _, j
      local again = false
      local last_comment_content = nil
      -- skip spaces
      self.i = self.src:match (self.patterns.spaces, self.i)
      -- skip a long comment if any
      _, last_comment_content, j = self.src:match (self.patterns.long_comment, self.i)
      if j then 
         _G.table.insert(self.attached_comments, 
                         {last_comment_content, self.i, j, "long"})
         self.i=j; again=true 
      end
      -- skip a short comment if any
      last_comment_content, j = self.src:match (self.patterns.short_comment, self.i)
      if j then
         _G.table.insert(self.attached_comments, 
                         {last_comment_content, self.i, j, "short"})
         self.i=j; again=true 
      end
      if self.i>#self.src then return "Eof", "eof" end
   until not again

   if self.src:match (self.patterns.final_short_comment, self.i) then 
      return "Eof", "eof" end
   --assert (not self.src:match(self.patterns.short_comment, self.i))
   --assert (not self.src:match(self.patterns.long_comment, self.i))
   -- --assert (not self.src:match(self.patterns.spaces, self.i))
   return
end
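
-- Example (illustrative): skipping over the source "-- note\nfoo"
-- from position 1 appends {" note", 1, 8, "short"} to
-- [self.attached_comments] and leaves [self.i] on the "f" of "foo".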

----------------------------------------------------------------------
-- Extract a short (quoted) string literal, if the next token is one.
----------------------------------------------------------------------
function lexer:extract_short_string()
   -- [k] is the first unread char, [self.i] points to [k] in [self.src]
   local j, k = self.i, self.src:sub (self.i,self.i)
   if k=="'" or k=='"' then
      -- short string
      repeat
         self.i=self.i+1; 
         local kk = self.src:sub (self.i, self.i)
         if kk=="\\" then 
            self.i=self.i+1; 
            kk = self.src:sub (self.i, self.i)
         end
         if self.i > #self.src then error "Unterminated string" end
         if kk == "\r" or kk == "\n" then error "no \\n in short strings!" end
      until self.src:sub (self.i, self.i) == k 
         and ( self.src:sub (self.i-1, self.i-1) ~= '\\' 
         or self.src:sub (self.i-2, self.i-2) == '\\')
      self.i=self.i+1
      return "String", unescape_string (self.src:sub (j+1,self.i-2))
   end   
end
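
-- Example (illustrative): on the raw source text  "a\"b"  (six
-- characters, quotes included), the loop stops at the closing
-- unescaped quote and the method returns "String", 'a"b'.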

----------------------------------------------------------------------
-- Extract an identifier or keyword, if the next token is one.
----------------------------------------------------------------------
function lexer:extract_word()
   -- Id / keyword
   local word, j = self.src:match (self.patterns.word, self.i)
   if word then
      self.i = j
      if self.alpha [word] then return "Keyword", word
      else return "Id", word end
   end
end
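
-- Example (illustrative): at the beginning of "foo = 1" this returns
-- "Id", "foo"; at the beginning of "function f()" it returns
-- "Keyword", "function", since "function" is in [lexer.alpha].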

----------------------------------------------------------------------
-- Extract a number literal, if the next token is one.
----------------------------------------------------------------------
function lexer:extract_number()
   -- Number
   local j = self.src:match (self.patterns.number_mantissa[1], self.i) or
             self.src:match (self.patterns.number_mantissa[2], self.i)
   if j then 
      j = self.src:match (self.patterns.number_exponant, j) or j;
      local n = tonumber (self.src:sub (self.i, j-1))
      self.i = j
      return "Number", n
   end
end
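
-- Example (illustrative): at the beginning of "3.14e2 + x", the
-- mantissa pattern matches "3.14", the exponent pattern extends the
-- match to "3.14e2", and the method returns "Number", 314.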

----------------------------------------------------------------------
-- Extract a long string [[...]], if the next token is one.
----------------------------------------------------------------------
function lexer:extract_long_string()
   -- Long string
   local _, content, j = self.src:match (self.patterns.long_string, self.i)
   if j then self.i = j; return "String", content end
end
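
-- Example (illustrative): on the source [==[a]b]==] (written here
-- with "==" delimiters), the %1 back-reference ensures the string
-- only closes on "]==]", so the method returns "String", "a]b".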

----------------------------------------------------------------------
-- Extract a symbol, trying registered compound symbols first.
----------------------------------------------------------------------
function lexer:extract_symbol()
   -- compound symbol
   local k = self.src:sub (self.i,self.i)
   local syms = self.sym [k]
   if not syms then 
      self.i = self.i + 1
      return "Keyword", k
   end
   for _, sym in pairs (syms) do
      if sym == self.src:sub (self.i, self.i + #sym - 1) then 
         self.i = self.i + #sym; 
         return "Keyword", sym
      end
   end
   -- single char symbol
   self.i = self.i+1
   return "Keyword", k
end
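
-- Example (illustrative): since registering "==" put it in
-- [lexer.sym["="]], the source "==x" yields "Keyword", "==", while
-- "=x" falls through to the single-char case and yields "Keyword", "=".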

----------------------------------------------------------------------
-- Add a keyword to the list of keywords recognized by the lexer.
----------------------------------------------------------------------
function lexer:add (w)
   if type (w) == "table" then
      for _, x in ipairs (w) do self:add (x) end
   else
      if w:match (self.patterns.word .. "$") then self.alpha [w] = true
      elseif w:match "^%p%p+$" then 
         local k = w:sub(1,1)
         local list = self.sym [k]
         if not list then list = { }; self.sym [k] = list end
         _G.table.insert (list, w)
      elseif w:match "^%p$" then return -- single-char symbols are always recognized
      else error "Invalid keyword" end
   end
end
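
-- Example (hypothetical extension): [lexer:add ">>"] would register
-- ">>" under [lexer.sym[">"]], so that ">>" is subsequently lexed as
-- one Keyword token rather than two ">" tokens.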

----------------------------------------------------------------------
-- Return the [n]th next token, without consuming it.
-- [n] defaults to 1. If it goes past the end of the stream, an EOF
-- token is returned.
----------------------------------------------------------------------
function lexer:peek (n)
   assert(self)
   if not n then n=1 end
   if n > #self.peeked then
      for i = #self.peeked+1, n do
         self.peeked [i] = self:extract()
      end
   end
   return self.peeked [n]
end

----------------------------------------------------------------------
-- Return the [n]th next token, removing it as well as the n-1
-- tokens that precede it. [n] defaults to 1. If it goes past the
-- end of the stream, an EOF token is returned.
----------------------------------------------------------------------
function lexer:next (n)
   if not n then n=1 end
   self:peek (n)
   local a
   for i=1,n do 
      a = _G.table.remove (self.peeked, 1) 
      if a then debugf ("[L:%i K:%i %s]",  a.line or -1, a.char or -1, a[1]) end
   end
   return a or eof_token
end
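
-- Example (illustrative): on a stream over "local x", [s:peek()]
-- returns the `Keyword{'local'} token without advancing, so a
-- following [s:next()] returns that same token and moves on to
-- `Id{'x'}.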

----------------------------------------------------------------------
-- Returns an object which saves the stream's current state.
----------------------------------------------------------------------
function lexer:save () return { self.i; _G.table.cat(self.peeked) } end

----------------------------------------------------------------------
-- Restore the stream's state, as saved by method [save].
----------------------------------------------------------------------
function lexer:restore (s) self.i=s[1]; self.peeked=s[2] end
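
-- Example (illustrative backtracking sketch):
--   local state = s:save ()
--   -- ... speculative parsing that consumes tokens ...
--   s:restore (state)  -- rewind the stream to where it was saved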

----------------------------------------------------------------------
-- Default keywords and symbols recognized by the lexer.
----------------------------------------------------------------------
local keywords = {
    "and", "break", "do", "else", "elseif",
    "end", "false", "for", "function", "if",
    "in", "local", "nil", "not", "or", "repeat",
    "return", "then", "true", "until", "while",
    "...", "..", "==", ">=", "<=", "~=", 
    "+{", "-{", "let" }
 
lexer.alpha  = { }; lexer.sym    = { }
for _, w in ipairs(keywords) do lexer:add(w) end

----------------------------------------------------------------------
-- Create a new lexer stream on a source string.
----------------------------------------------------------------------
lexer.stream_mt = { __index=lexer }

function lexer:newstream (src)
   local stream = { 
      src    = src; 
      peeked = { }; 
      i      = 1;
      line   = 1;
   }
   setmetatable (stream, self.stream_mt)

   -- skip initial sharp-bang for unix scripts
   if src:sub(1,2)=="#!" then stream.i = (src:find "\n" or #src) + 1 end
   return stream
end
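
-- Example (illustrative):
--   local s = lexer:newstream [[local x = 1]]
--   local tok = s:next ()  --> `Keyword{'local'}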

----------------------------------------------------------------------
-- If [a] is a keyword token, return its content when it matches one
-- of the given words (or any keyword when no word is given);
-- otherwise return false.
----------------------------------------------------------------------
function lexer:is_keyword (a, ...)
   if not a or a.tag ~= "Keyword" then return false end
   local words = {...}
   if #words == 0 then return a[1] end
   for _, w in ipairs (words) do
      if w == a[1] then return w end
   end
   return false
end
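
-- Example (illustrative): [lexer:is_keyword (tok, "then", "do")]
-- returns "then" or "do" if [tok] is one of those keywords, and
-- false otherwise; with no word list, any keyword's content is
-- returned.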

----------------------------------------------------------------------
-- Consume the next token and return its content if it is one of the
-- expected keywords; raise an error otherwise.
----------------------------------------------------------------------
function lexer:check (...)
   local words = {...}
   local a = self:next()
   local function err ()
      error ("Got " .. tostring (a) .. 
             ", expected one of these keywords : '" ..
             _G.table.concat (words,"', '") .. "'") end
          
   if not a or a.tag ~= "Keyword" then err () end
   if #words == 0 then return a[1] end
   for _, w in ipairs (words) do
       if w == a[1] then return w end
   end
   err ()
end
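
-- Example (illustrative): [s:check "end"] consumes the next token and
-- returns "end" if that token is the keyword "end"; otherwise it
-- raises an error listing the expected keywords.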

----------------------------------------------------------------------
-- Check whether [x] is a lexer stream.
----------------------------------------------------------------------
function is_stream (x)
   return getmetatable(x) == lexer.stream_mt
end