%
%
%

As a temporary stop gap - a (hand written) lexer
for Green Card. Sigh.

\begin{code}
module Lex ( topLex, runLexer ) where

import Prelude hiding ( lex )
import GCToken
import LexM
import FiniteMap
import Char	( isAlpha, isDigit, isLower, isSpace )
import Casm     ( lookupKind )
import List     ( intersperse )
import Decl
import IO (hPutStrLn, stderr )

\end{code}

This lexer for Green Card is not particularly eventful.
The only interesting thing going on is that we may call
the lexer from two different parser `states': one for
lexing the contents of a Green Card directive, the other
for passing on Haskell code verbatim.

The @topLex@ action is the entry point for the parser,
and it takes care of invoking the right sub-state. (The
lexer moves back to into the `verbatim' state upon exit
from the lexing of a directive.)

It could be argued that a stream-based interface between lexer
and parser would solve this entering and exit from sub-states
more cleanly, but it comes at a cost (error recovery and the
passing of info such as line numbers is more expensive.)

\begin{code}
runLexer :: String -> LexM [Decl] -> String -> IO [Decl]
runLexer fname m str = runLexM fname str (setLexState lex >> m)

type LexCont = Token -> LexM [Decl]
type Lexer   = LexCont -> LexM [Decl]

topLex :: Lexer
topLex cont = do 
 eof <- isEOF
 if eof
  then cont T_eof
  else do
   contf <- getLexState
   contf cont

\end{code}

Initial state of the lexer when starting on a new line.

\begin{code}
lex :: Lexer
lex cont = do 
   c <- getNextChar
   case c of
    '%' -> -- looks directive-like this..
          lexDirective cont
    '{' -> do
      c1 <- getNextChar
      case c1 of
        '-' -> lex_nested_comment ["{-"] 1{-seen-} cont
	_   -> haskellCode [c,c1] cont
    _ -> do
      putBackChar c
      haskellCode [] cont
\end{code}

Gobble up a line of Haskell, taking care of dealing with strings
and comments correctly.

\begin{code}
haskellCode :: [Char] -> Lexer
haskellCode as cont = do
  cs <- getStream
   -- mind-bogglingly quick this.
  case break (\x -> x == '{'
                 || x == '\n'
                 || x == '-'
                 || x == '\\'
		 || x == '"'{-"-} 
  	     ) cs of
   (ls,rs) ->
    case rs of     
     ('"'{-"-}:cs1) -> do
       let (str,cs2) = lex_string cs1
       setStream cs2
       haskellCode (as++ls++'"'{-"-}:str) cont
     ('\n':cs1) -> -- end of the line.
       incLineNo $ do
       setStream cs1
       cont (T_haskell (as++ls))
     ('\\':'"'{-"-}:cs1) -> do  -- escaped quote, eat it.
       setStream cs1
       haskellCode (as++"\\\"") cont
     ('{':'-':cs1) -> do  -- entering block comment.
       setStream cs1
       lex_nested_comment (["{-",as,ls]) 1{-seen-} cont
     ('-':'-':cs1) -> do  -- one-liner, spool until end of line.
       case break (=='\n') cs1 of
        (ls1,cs2) -> do
	   let cs3 = case cs2 of [] -> cs2; (_:cs3) -> cs3
	   setStream cs3
	   incLineNo $ do
	   cont (T_haskell (as++ls++'-':'-':ls1))
     (c:cs1) -> do
       setStream cs1
       haskellCode (as++ls++[c]) cont
     [] -> do
       setStream []
       cont (T_haskell (as++ls))
\end{code}

\begin{code}
lexDirective :: Lexer
lexDirective cont = do
  c <- getNextChar
  case c of
   'C' -> cCode cont
   '-' -> cCode cont
   '{' -> codeChunk (\ code c -> c (T_ccode code)) cont
   '#' -> do
      cs <- getStream
      case span (isAlpha) cs of
       (xs,cs1)
         | xs /= "include" -> do
	     setStream cs1
	     cont (T_unknown xs)
         | otherwise -> do
	     let (fname,cs2) = break (=='\n') cs1
	     setStream (tail cs2)
	     incLineNo (cont (T_hinclude fname))
   _ -> do      
      cs <- getStream
      case span (isAlpha) cs of
       (xs,cs1) -> do
            setStream cs1
       	    case lookupFM kwordsFM (c:xs) of
	      Nothing    -> cont (T_unknown (c:xs))
	      Just contf -> do
	           setLexState lexDirect
		   contf cont

kwordsFM :: FiniteMap String Lexer
kwordsFM = listToFM $
 [ ("code",     codeChunk (\ code c -> c (T_ccode code)))
 , ("safecode", codeChunk (\ code c -> c (T_safecode code)))
 , ("end",      codeChunk (\ code c -> c (T_end code)))
 , ("prefix",   \ c -> c T_prefix)
 , ("fun",      \ c -> c T_fun)
 , ("call",     \ c -> c T_call)
 , ("result",   \ c -> c T_result)
 , ("fail",     \ c -> c T_fail)
 , ("const",    \ c -> c T_const)
 , ("enum",     \ c -> c T_enum)
 , ("dis",      \ c -> c T_dis)
 , ("dllname",  \ c -> c T_dllname)
 , ("callconv", \ c -> c T_callconv)
 ]
\end{code}

A code chunk consist of sequence of characters,
possibly spanning multiple lines. The 'continuation'
character on a new line is '%' in column 0.

\begin{code}
codeChunk :: (String -> Lexer) -> Lexer
codeChunk retcont cont = do
 cs <- getStream
 go [] cs
  where
   go chunk_acc ls = do
     (line_acc, res) <- go_line [] ls
     incLineNo (
       case res of
         Left the_end -> do
             setStream the_end
	     ret (line_acc:chunk_acc)
         Right rs -> do
             go (line_acc:chunk_acc) rs
       )

   go_line acc ls =
    case span (\x -> x /= '\n' && x /= '%') ls of
      (as,[]) -> return (acc++as, Left [])
      (as,'\n':'%':c1:cs1)
          | isSpace c1 -> return (acc++as, Right (c1:cs1))
 	  | otherwise  -> return (acc++as, Left  ('%':c1:cs1))
      (as,'\n':cs1)    -> return (acc++as, Left cs1)
      (as,'%':'}':cs1) -> return (acc++as, Left (tail (dropWhile (/='\n') cs1)))
      (as,'%':cs1)     -> go_line (acc++as++"%") cs1

   ret ccode = do
     let ccode' = concat (intersperse "\n" (reverse ccode))
     setLexState lex
     retcont ccode' cont
\end{code}

\begin{code}
lex_string :: String -> ({-the-}String, String)
lex_string cs = 
 case go [] cs of
   (acc, str) -> (concat (reverse acc), str)
 where
  go acc cs =
   case break (\x -> x == '\\' || x == '"'{-"-}) cs of
    (ls,[]) -> (ls:acc,[])
    (ls,'\\':'"'{-"-}:cs1) -> -- escaped quote
       go ("\\\"":ls:acc) cs1
    (ls,'\\':'\\':cs1) -> -- escaped slash
       go ("\\\\":ls:acc) cs1
    (ls,'\\':x:cs1) -> -- OK, so it doesn't cope with a file ending with a double slash
       go (['\\',x]:ls:acc) cs1
    (ls,'"'{-"-}:cs1) ->
       ("\"":ls:acc, cs1)
\end{code}

\begin{code}
lex_nested_comment :: [String] -> Int -> Lexer
lex_nested_comment acc count cont = go acc count
 where
  go acc count = do
   cs <- getStream
   case span (\ x -> x /='{' && x /= '-' && x /= '\n') cs of
    (ls,rs) ->
     case rs of
      [] -> do
       setStream []
       cont (T_haskell (concat (reverse (ls:acc))))
      ('\n':cs1)   -> do
       setStream cs1
       incLineNo (go ("\n":ls:acc) count)
      ('{':'-':cs1) -> do
        setStream cs1
        go ("{-":ls:acc) (count+1)
      ('-':'}':cs1) -> do
        setStream cs1
	let acc' = ("-}":ls:acc)
        if count == 1 
	 then haskellCode (concat (reverse acc')) cont
	 else go acc' (count-1)
      (c:cs1)     -> do
         setStream cs1
         go ([c]:ls:acc) count

\end{code}

\begin{code}
lexDirect :: Lexer
lexDirect cont = do
 cs <- getStream
 case dropWhile (\x -> isSpace x && x/='\n') cs of
  [] -> do
    setStream cs
    cont T_eof
  (c:cs1) -> do
   setStream cs1
   case c of
    '}' -> cont T_ccurly
    '(' -> cont T_oparen
    ')' -> cont T_cparen
    '[' -> cont T_osquare
    ']' -> cont T_csquare
    ',' -> cont T_comma
    '.' -> cont T_dot
    '/' -> cont T_slash
    '"' -> cont T_dquote
    '=' -> do
      c1 <- getNextChar
      case c1 of
        '>' -> cont T_darrow
        _   -> do
          putBackChar c1
	  cont T_equal
    '<' -> do
      c1 <- getNextChar
      setLexState lexUserDIS
      case c1 of
       '<' -> cont T_odangle
       _   -> do
         putBackChar c1
	 cont T_oangle
    '>' -> do
      c1 <- getNextChar
      case c1 of
       '>' -> cont T_cdangle
       _   -> do
         putBackChar c1
	 cont T_cangle
    '-' -> do
      c1 <- getNextChar
      case c1 of
       '>' -> cont T_arrow
       '-' -> do
         cs <- getStream
	 case dropWhile (/='\n') cs of
	  ('\n':'%':c1:cs1)
	     |isSpace c1 ->
	         incLineNo $ do
		 setStream (c1:cs1)
		 lexDirect cont 
	     | otherwise -> do
	         incLineNo $ do
		 setLexState lex
		 setStream ('%':c1:cs1)
		 lex cont 
	  ('\n':cs1) ->  do
	         incLineNo $ do
		 setLexState lex
		 setStream cs1
		 lex cont 
	  [] -> cont T_eof
       _   -> do
         putBackChar c1
	 cont (T_unknown cs)
    ':' -> do
      c1 <- getNextChar
      case c1 of
       ':' -> cont T_dcolon
       _   -> do
         putBackChar c1
	 cont (T_unknown cs)
    '%' -> do
      c1 <- getNextChar
      case c1 of
       '%' -> do
         cs <- getStream
	 case span (isAlpha) cs of
	   (ls,cs1) -> do
	     setStream cs1
	     case lookupKind ls of
	       Nothing -> cont (T_unknown cs1)
	       Just k  -> cont (T_kind k)
       _   -> do
	 putBackChar c1
	 lexName (T_disname) c cont
     {-
      The syntax of green-card is currently ambiguous,

         Foo { res1 = y } 
	 
      could either mean the constructor Foo applied to the
      result of C expression "res1=y" or the record constructor
      Foo with the field res1 equal/set to y.
      
      To resolve the two, we insist on there not being any 
      whitespace between a record constructor and the obrace.
      (It's too late in the day for GC to start making
       deep-rooted syntax changes..)
     -}
    '{' -> do
      cExp cont 
    '\n' -> do
     catchEOF (cont T_eof) $
      incLineNo $ do
      c1 <- getNextChar
      case c1 of
       '%' -> do
         c2 <- getNextChar
         putBackChar c2
	 if isSpace c2 
	  then lexDirect cont
	  else do
            putBackChar c1
	    setLexState lex
	    lex cont
	    
       _   -> do
         putBackChar c1
	 setLexState lex
	 lex cont

    _ 
     | isLower c -> lexName (T_disname) c cont
     | otherwise -> lexName (T_name) c cont

lexName :: (String -> Token) -> Char -> Lexer
lexName fo c cont = do
  cs <- getStream
  case span (\x -> isAlpha x    || 
		   isDigit x    || 
                   x == '\''    ||
		   x == '`'     ||
		   x == '-'     ||
		   x == '_') cs of
    (ls,rs) -> do
     let ls' = c:ls
     setStream rs
     case ls' of
      "declare" -> cont T_declare
      "in"      -> cont T_in
      _         -> 
        case rs of
	 ('{':rs1) -> do
	    setStream rs1
	    cont (T_reccon ls')
         [] -> cont (fo ls')
         _ -> cont (fo ls')

cCode :: Lexer
cCode cont = do
 cs <- getStream
 go [] cs
  where
   go acc ls =
    case span (/= '\n') ls of
      (as,[]) -> do
         setStream []
	 ret  (as:acc)
      (as,'\n':'%':cs1) ->
         incLineNo (go (as:acc) cs1)
      (as,'\n':cs1) ->
         incLineNo $ do
         setStream cs1
	 ret (as:acc)

   ret ccode = 
       let ccode' = concat (intersperse "\n" (reverse ccode)) in
       cont (T_c ccode')

{- Note: isn't clever about nested braces nor strings. -}
cExp :: Lexer
cExp cont = do
 cs <- getStream
 case span (/='}') cs of
  (ce,cs1) -> do
    let cs2 = case cs1 of [] -> [] ; (_:cs2) -> cs2
    setStream cs2
    cont (T_cexp ce)
\end{code}

\begin{code}
lexUserDIS :: Lexer
lexUserDIS cont = do
  cs <- getStream
  case span (/='/') cs of
   (ms,[]) -> do
      setStream []
      setLexState lex
      cont (T_unknown cs)
   (ms,'/':cs1) ->
      go ms cs1

 where
  go_one acc cs = 
    case span (\x -> x /= '/' && x /= '-' && x /= '>') cs of
     (us,[]) -> return (Left (concat (reverse (us:acc)), []))
     (us,'-':'>':cs1) -> go_one ("->":us:acc) cs1
     (us,'/':cs1) -> return (Left  (concat (reverse (us:acc)), cs1))
     (us,'>':cs1) -> return (Right (concat (reverse (us:acc)), cs1))
     
  go ms cs1 = do
    res <- go_one [] cs1
    case res of
     Right (us,cs2) -> do
        setStream ('>':cs2)
	setLexState lexDirect
	cont (T_user (ms,us,Nothing))
     Left  (us,cs2) -> do
        res <- go_one [] cs2
	case res of
	 Right (k,cs3) -> do
	   setStream ('>':cs3)
  	   setLexState lexDirect
	   case lookupKind k of
	      Nothing -> cont (T_unknown cs2)
	      Just k  -> cont (T_user (ms, us, Just k))
         Left (_,cs3) -> do
           setStream cs3
	   setLexState lex
	   cont (T_unknown cs2)
\end{code}
