0 | |||This is a slightly modified copy of the same module from `contrib` package.
   1 | module TyRE.Text.Lexer
   2 | 
   3 | import public Data.Bool
   4 | import public Data.List
   5 | import public Data.Nat
   6 | 
   7 | import public TyRE.Text.Lexer.Core
   8 | import public TyRE.Text.Quantity
   9 | import public TyRE.Text.Token
  10 | 
  11 | public export
  12 | toTokenMap : List (Lexer, k) -> TokenMap (Token k)
  13 | toTokenMap = map $ \(l, kind) => (l, Tok kind)
  14 | 
  15 | ||| Recognise any character.
  16 | ||| /./
  17 | public export
  18 | any : Lexer
  19 | any = pred (const True)
  20 | 
  21 | ||| Recognise a lexer or recognise no input. This is not guaranteed
  22 | ||| to consume input.
  23 | ||| /`l`?/
  24 | public export
  25 | opt : (l : Lexer) -> Recognise False
  26 | opt l = l <|> empty
  27 | 
  28 | ||| Recognise any character if the sub-lexer `l` fails.
  29 | ||| /(?!`l`)./
  30 | public export
  31 | non : (l : Lexer) -> Lexer
  32 | non l = reject l <+> any
  33 | 
  34 | ||| Produce recognisers by applying a function to elements of a container, and
  35 | ||| recognise the first match. Consumes input if the function produces consuming
  36 | ||| recognisers. Fails if the container is empty.
  37 | public export
  38 | choiceMap : {c : Bool} ->
  39 |             Foldable t => (a -> Recognise c) -> t a -> Recognise c
  40 | choiceMap f xs = foldr (\x, acc => rewrite sym (andSameNeutral c) in
  41 |                                            f x <|> acc)
  42 |                        fail xs
  43 | 
  44 | ||| Recognise the first matching recogniser in a container. Consumes input if
  45 | ||| recognisers in the list consume. Fails if the container is empty.
  46 | public export
  47 | choice : {c : _} ->
  48 |          Foldable t => t (Recognise c) -> Recognise c
  49 | choice = Lexer.choiceMap id
  50 | 
  51 | ||| Sequence a list of recognisers. Guaranteed to consume input if the list is
  52 | ||| non-empty and the recognisers consume.
  53 | public export
  54 | concat : {c : _} ->
  55 |          (xs : List (Recognise c)) -> Recognise (c && isCons xs)
  56 | concat = Lexer.Core.concatMap id
  57 | 
  58 | ||| Recognise a specific character.
  59 | ||| /[`x`]/
  60 | public export
  61 | is : (x : Char) -> Lexer
  62 | is x = pred (==x)
  63 | 
  64 | ||| Recognise anything but the given character.
  65 | ||| /[\^`x`]/
  66 | public export
  67 | isNot : (x : Char) -> Lexer
  68 | isNot x = pred (/=x)
  69 | 
  70 | ||| Recognise a specific character (case-insensitive).
  71 | ||| /[`x`]/i
  72 | public export
  73 | like : (x : Char) -> Lexer
  74 | like x = pred (\y => toUpper x == toUpper y)
  75 | 
  76 | ||| Recognise anything but the given character (case-insensitive).
  77 | ||| /[\^`x`]/i
  78 | public export
  79 | notLike : (x : Char) -> Lexer
  80 | notLike x = pred (\y => toUpper x /= toUpper y)
  81 | 
  82 | ||| Recognise a specific string.
  83 | ||| Fails if the string is empty.
  84 | ||| /`str`/
  85 | public export
  86 | exact : (str : String) -> Lexer
  87 | exact str = case unpack str of
  88 |                  [] => fail
  89 |                  (x :: xs) => concatMap is (x :: xs)
  90 | 
  91 | ||| Recognise a specific string (case-insensitive).
  92 | ||| Fails if the string is empty.
  93 | ||| /`str`/i
  94 | public export
  95 | approx : (str : String) -> Lexer
  96 | approx str = case unpack str of
  97 |                   [] => fail
  98 |                   (x :: xs) => concatMap like (x :: xs)
  99 | 
 100 | ||| Recognise any of the characters in the given string.
 101 | ||| /[`chars`]/
 102 | public export
 103 | oneOf : (chars : String) -> Lexer
 104 | oneOf chars = pred (\x => x `elem` unpack chars)
 105 | 
 106 | ||| Recognise a character range. Also works in reverse!
 107 | ||| /[`start`-`end`]/
 108 | public export
 109 | range : (start : Char) -> (end : Char) -> Lexer
 110 | range start end = pred (\x => (x >= min start end)
 111 |                            && (x <= max start end))
 112 | 
 113 | mutual
 114 |   ||| Recognise a sequence of at least one sub-lexers
 115 |   ||| /`l`+/
 116 |   public export
 117 |   some : Lexer -> Lexer
 118 |   some l = l <+> many l
 119 | 
 120 |   ||| Recognise a sequence of at zero or more sub-lexers. This is not
 121 |   ||| guaranteed to consume input
 122 |   ||| /`l`\*/
 123 |   public export
 124 |   many : Lexer -> Recognise False
 125 |   many l = opt (some l)
 126 | 
 127 | ||| Repeat the sub-lexer `l` zero or more times until the lexer
 128 | ||| `stopBefore` is encountered. `stopBefore` will not be consumed.
 129 | ||| Not guaranteed to consume input.
 130 | ||| /((?!`stopBefore`)`l`)\*/
 131 | public export
 132 | manyUntil : (stopBefore : Recognise c) -> (l : Lexer) -> Recognise False
 133 | manyUntil stopBefore l = many (reject stopBefore <+> l)
 134 | 
 135 | ||| Repeat the sub-lexer `l` zero or more times until the lexer
 136 | ||| `stopAfter` is encountered, and consume it. Guaranteed to
 137 | ||| consume if `stopAfter` consumes.
 138 | ||| /`l`\*?`stopAfter`/
 139 | public export
 140 | manyThen : (stopAfter : Recognise c) -> (l : Lexer) -> Recognise c
 141 | manyThen stopAfter l = manyUntil stopAfter l <+> stopAfter
 142 | 
 143 | ||| Recognise many instances of `l` until an instance of `end` is
 144 | ||| encountered.
 145 | |||
 146 | ||| Useful for defining comments.
 147 | public export
 148 | manyTill : (l : Lexer) -> (end : Lexer) -> Recognise False
 149 | manyTill l end = end <|> opt (l <+> manyTill l end)
 150 | 
 151 | ||| Recognise a sub-lexer repeated as specified by `q`. Fails if `q` has
 152 | ||| `min` and `max` in the wrong order. Consumes input unless `min q` is zero.
 153 | ||| /`l`{`q`}/
 154 | public export
 155 | count : (q : Quantity) -> (l : Lexer) -> Recognise (isSucc (min q))
 156 | count (Qty Z Nothing) l = many l
 157 | count (Qty Z (Just Z)) _ = empty
 158 | count (Qty Z (Just (S max))) l = opt $ l <+> count (atMost max) l
 159 | count (Qty (S min) Nothing) l = l <+> count (atLeast min) l
 160 | count (Qty (S min) (Just Z)) _ = fail
 161 | count (Qty (S min) (Just (S max))) l = l <+> count (between min max) l
 162 | 
 163 | ||| Recognise a single digit 0-9
 164 | ||| /[0-9]/
 165 | public export
 166 | digit : Lexer
 167 | digit = pred isDigit
 168 | 
 169 | ||| Recognise one or more digits
 170 | ||| /[0-9]+/
 171 | public export
 172 | digits : Lexer
 173 | digits = some digit
 174 | 
 175 | ||| Recognise a single hexidecimal digit
 176 | ||| /[0-9A-Fa-f]/
 177 | public export
 178 | hexDigit : Lexer
 179 | hexDigit = pred isHexDigit
 180 | 
 181 | ||| Recognise one or more hexidecimal digits
 182 | ||| /[0-9A-Fa-f]+/
 183 | public export
 184 | hexDigits : Lexer
 185 | hexDigits = some hexDigit
 186 | 
 187 | ||| Recognise a single octal digit
 188 | ||| /[0-8]/
 189 | public export
 190 | octDigit : Lexer
 191 | octDigit = pred isHexDigit
 192 | 
 193 | ||| Recognise one or more octal digits
 194 | ||| /[0-8]+/
 195 | public export
 196 | octDigits : Lexer
 197 | octDigits = some hexDigit
 198 | 
 199 | ||| Recognise a single alpha character
 200 | ||| /[A-Za-z]/
 201 | public export
 202 | alpha : Lexer
 203 | alpha = pred isAlpha
 204 | 
 205 | ||| Recognise one or more alpha characters
 206 | ||| /[A-Za-z]+/
 207 | public export
 208 | alphas : Lexer
 209 | alphas = some alpha
 210 | 
 211 | ||| Recognise a lowercase alpha character
 212 | ||| /[a-z]/
 213 | public export
 214 | lower : Lexer
 215 | lower = pred isLower
 216 | 
 217 | ||| Recognise one or more lowercase alpha characters
 218 | ||| /[a-z]+/
 219 | public export
 220 | lowers : Lexer
 221 | lowers = some lower
 222 | 
 223 | ||| Recognise an uppercase alpha character
 224 | ||| /[A-Z]/
 225 | public export
 226 | upper : Lexer
 227 | upper = pred isUpper
 228 | 
 229 | ||| Recognise one or more uppercase alpha characters
 230 | ||| /[A-Z]+/
 231 | public export
 232 | uppers : Lexer
 233 | uppers = some upper
 234 | 
 235 | ||| Recognise an alphanumeric character
 236 | ||| /[A-Za-z0-9]/
 237 | public export
 238 | alphaNum : Lexer
 239 | alphaNum = pred isAlphaNum
 240 | 
 241 | ||| Recognise one or more alphanumeric characters
 242 | ||| /[A-Za-z0-9]+/
 243 | public export
 244 | alphaNums : Lexer
 245 | alphaNums = some alphaNum
 246 | 
 247 | ||| Recognise a single whitespace character
 248 | ||| /\\s/
 249 | public export
 250 | space : Lexer
 251 | space = pred isSpace
 252 | 
 253 | ||| Recognise one or more whitespace characters
 254 | ||| /\\s+/
 255 | public export
 256 | spaces : Lexer
 257 | spaces = some space
 258 | 
 259 | ||| Recognise a single newline sequence. Understands CRLF, CR, and LF
 260 | ||| /\\r\\n|[\\r\\n]/
 261 | public export
 262 | newline : Lexer
 263 | newline = let crlf = "\r\n" in
 264 |               exact crlf <|> oneOf crlf
 265 | 
 266 | ||| Recognise one or more newline sequences. Understands CRLF, CR, and LF
 267 | ||| /(\\r\\n|[\\r\\n])+)/
 268 | public export
 269 | newlines : Lexer
 270 | newlines = some newline
 271 | 
 272 | ||| Recognise a single non-whitespace, non-alphanumeric character
 273 | ||| /[\^\\sA-Za-z0-9]/
 274 | public export
 275 | symbol : Lexer
 276 | symbol = pred (\x => not (isSpace x || isAlphaNum x))
 277 | 
 278 | ||| Recognise one or more non-whitespace, non-alphanumeric characters
 279 | ||| /[\^\\sA-Za-z0-9]+/
 280 | public export
 281 | symbols : Lexer
 282 | symbols = some symbol
 283 | 
 284 | ||| Recognise a single control character
 285 | ||| /[\\x00-\\x1f\\x7f-\\x9f]/
 286 | public export
 287 | control : Lexer
 288 | control = pred isControl
 289 | 
 290 | ||| Recognise one or more control characters
 291 | ||| /[\\x00-\\x1f\\x7f-\\x9f]+/
 292 | public export
 293 | controls : Lexer
 294 | controls = some control
 295 | 
 296 | ||| Recognise zero or more occurrences of a sub-lexer between
 297 | ||| delimiting lexers
 298 | ||| /`start`(`l`)\*?`end`/
 299 | public export
 300 | surround : (start : Lexer) -> (end : Lexer) -> (l : Lexer) -> Lexer
 301 | surround start end l = start <+> manyThen end l
 302 | 
 303 | ||| Recognise zero or more occurrences of a sub-lexer surrounded
 304 | ||| by the same quote lexer on both sides (useful for strings)
 305 | ||| /`q`(`l`)\*?`q`/
 306 | public export
 307 | quote : (q : Lexer) -> (l : Lexer) -> Lexer
 308 | quote q l = surround q q l
 309 | 
 310 | ||| Recognise an escape character (often '\\') followed by a sub-lexer
 311 | ||| /[`esc`]`l`/
 312 | public export
 313 | escape : (esc : Char) -> Lexer -> Lexer
 314 | escape esc l = is esc <+> l
 315 | 
 316 | ||| Recognise a string literal, including escaped characters.
 317 | ||| (Note: doesn't yet handle escape sequences such as \123)
 318 | ||| /"(\\\\.|.)\*?"/
 319 | public export
 320 | stringLit : Lexer
 321 | stringLit = quote (is '"') (escape '\\' any <|> any)
 322 | 
 323 | ||| Recognise a character literal, including escaped characters.
 324 | ||| (Note: doesn't yet handle escape sequences such as \123)
 325 | ||| /'(\\\\.|[\^'])'/
 326 | public export
 327 | charLit : Lexer
 328 | charLit = let q = '\'' in
 329 |               is q <+> (escape '\\' (control <|> any) <|> isNot q) <+> is q
 330 |   where
 331 |     lexStr : List String -> Lexer
 332 |     lexStr [] = fail
 333 |     lexStr (t :: ts) = exact t <|> lexStr ts
 334 | 
 335 |     control : Lexer
 336 |     control = lexStr ["NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL",
 337 |                       "BS",  "HT",  "LF",  "VT",  "FF",  "CR",  "SO",  "SI",
 338 |                       "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB",
 339 |                       "CAN", "EM",  "SUB", "ESC", "FS",  "GS",  "RS",  "US",
 340 |                       "SP",  "DEL"]
 341 |                 <|> (is 'x' <+> hexDigits)
 342 |                 <|> (is 'o' <+> octDigits)
 343 |                 <|> digits
 344 | 
 345 | ||| Recognise an integer literal (possibly with a '-' prefix)
 346 | ||| /-?[0-9]+/
 347 | public export
 348 | intLit : Lexer
 349 | intLit = opt (is '-') <+> digits
 350 | 
 351 | ||| Recognise a hexidecimal literal, prefixed by "0x" or "0X"
 352 | ||| /0[Xx][0-9A-Fa-f]+/
 353 | public export
 354 | hexLit : Lexer
 355 | hexLit = approx "0x" <+> hexDigits
 356 | 
 357 | ||| Recognise `start`, then recognise all input until a newline is encountered,
 358 | ||| and consume the newline. Will succeed if end-of-input is encountered before
 359 | ||| a newline.
 360 | ||| /`start`[\^\\r\\n]+(\\r\\n|[\\r\\n])?/
 361 | public export
 362 | lineComment : (start : Lexer) -> Lexer
 363 | lineComment start = start <+> manyUntil newline any <+> opt newline
 364 | 
 365 | ||| Recognise all input between `start` and `end` lexers.
 366 | ||| Supports balanced nesting.
 367 | |||
 368 | ||| For block comments that don't support nesting (such as C-style comments),
 369 | ||| use `surround`
 370 | public export
 371 | blockComment : (start : Lexer) -> (end : Lexer) -> Lexer
 372 | blockComment start end = start <+> middle <+> end
 373 |   where
 374 |     middle : Recognise False
 375 |     middle = manyUntil end (blockComment start end <|> any)
 376 |