0 | module Idrall.Lexer
   1 | 
   2 | import Control.Monad.Identity
   3 | import Control.Monad.Trans
   4 | 
   5 | import Data.Vect
   6 | import Data.List
   7 | import Data.Nat
   8 | import Data.String.Parser
   9 | import Data.String
  10 | import Data.List1
  11 | 
  12 | -- Not a real Lexer, more a collection of small parsing utilities.
  13 | 
  14 | export
  15 | commaSep1' : Monad m => ParseT m a -> ParseT m (List1 a)
  16 | commaSep1' p = p `sepBy1` (token ",")
  17 | 
  18 | mutual
  19 |   blockCommentChunk : Parser String
  20 |   blockCommentChunk =
  21 |     blockComment <|> characters <|> character <|> endOfLine
  22 |     where
  23 |       characters : Parser String
  24 |       characters = (takeWhile1 predicate)
  25 |         where
  26 |           predicate : Char -> Bool
  27 |           predicate c =
  28 |                   '\x20' <= c && c <= '\x10FFFF' && c /= '-' && c /= '{'
  29 |               ||  c == '\n'
  30 |               ||  c == '\t'
  31 | 
  32 |       character : Parser String
  33 |       character = do x <- satisfy predicate
  34 |                      pure $ singleton x
  35 |         where
  36 |           predicate : Char -> Bool
  37 |           predicate c = '\x20' <= c && c <= '\x10FFFF' || c == '\n' || c == '\t'
  38 | 
  39 |       endOfLine : Parser String
  40 |       endOfLine = (string "\r\n" <?> "newline")
  41 | 
  42 |   blockCommentContinue : Parser String
  43 |   blockCommentContinue = endOfComment <|> continue
  44 |     where
  45 |       endOfComment : Parser String
  46 |       endOfComment = token "-}" *> pure ""
  47 | 
  48 |       continue : Parser String
  49 |       continue = do
  50 |           c <- blockCommentChunk
  51 |           c' <- blockCommentContinue
  52 |           pure (c <+> c')
  53 | 
  54 |   public export
  55 |   blockComment : Parser String
  56 |   blockComment = do
  57 |       _ <- string "{-"
  58 |       c <- blockCommentContinue
  59 |       pure c
  60 | 
  61 | public export
  62 | lineComment : Parser String
  63 | lineComment = token "--" *> takeWhile (\c => c /= '\n') <* token "\n"
  64 | 
  65 | public export
  66 | whitespace : Parser ()
  67 | whitespace =  skip blockComment <|> skip (some lineComment) <|> spaces
  68 | 
  69 | public export
  70 | hexNumber : Parser Int
  71 | hexNumber = choice (the (List (Lazy (Parser Int))) [ hexDigit, hexUpper, hexLower ])
  72 | where
  73 |   hexDigit : Parser Int
  74 |   hexDigit = do
  75 |     c <- satisfy (\c => '0' <= c && c <= '9')
  76 |     pure (ord c - ord '0') -- TODO understand this
  77 |   hexUpper : Parser Int
  78 |   hexUpper = do
  79 |     c <- satisfy (\c => 'A' <= c && c <= 'F')
  80 |     pure (10 + ord c - ord 'A')
  81 |   hexLower : Parser Int
  82 |   hexLower = do
  83 |     c <- satisfy (\c => 'a' <= c && c <= 'f')
  84 |     pure (10 + ord c - ord 'a')
  85 | 
  86 | -- https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs
  87 | public export
  88 | isSurrogate : Int -> Bool
  89 | isSurrogate x =
  90 |   ((0xD800 <= x) && (x <= 0xDBFF))
  91 |     || ((0xDC00 <= x) && (x <= 0xDFFF))
  92 | 
  93 | {- TODO fix when idris2 has `Bits`
  94 | public export
  95 | bitAnd : Int -> Int -> Bits 1
  96 | bitAnd x y = and (cast (toInteger x)) (cast (toInteger y))
  97 | where
  98 |   toInteger : Int -> Integer
  99 |   toInteger x = cast x
 100 |   -}
 101 | 
 102 | public export
 103 | validCodepoint : Int -> Bool
 104 | validCodepoint c = not (isSurrogate c
 105 |                        --  TODO fix when idris2 has `Bits`
 106 |                        || True -- (bitsToInt (bitAnd c 0xFFFE)) == 0xFFFE
 107 |                        || True) -- (bitsToInt (bitAnd c 0xFFFF)) == 0xFFFF)
 108 | 
 109 | public export
 110 | unicode : Parser Char
 111 | unicode = do
 112 |   _ <- char 'u'
 113 |   n <- bracedEscapeSequence <|> fourCharacterEscapeSequence
 114 |   pure (chr n)
 115 | where
 116 |   toNumber : List Int -> Int
 117 |   toNumber = foldl (\x,y => x * 16 + y) 0
 118 |   vectToList : Vect m a -> List a
 119 |   vectToList [] = []
 120 |   vectToList (y :: xs) = y :: toList xs
 121 |   fourCharacterEscapeSequence : Parser Int
 122 |   fourCharacterEscapeSequence = do
 123 |     ns <- ntimes 4 hexNumber
 124 |     guard (validCodepoint (toNumber (vectToList ns))) -- TODO use let with idris2 to DRY
 125 |       <|> fail "Invalid Unicode code point"
 126 |     pure (toNumber (vectToList ns))
 127 |   bracedEscapeSequence : Parser Int
 128 |   bracedEscapeSequence = do
 129 |     _ <- char '{'
 130 |     ns <- some hexNumber
 131 |     guard (validCodepoint (toNumber ns)
 132 |           && (toNumber ns) <= 0x10fffd) -- TODO use let with idris2 to DRY
 133 |           <|> fail "Invalid Unicode code point"
 134 |     _ <- char '}'
 135 |     pure (toNumber ns)
 136 |