0 | module Text.Lex.Util
   1 | 
   2 | import Text.Lex.Core
   3 | import Text.Lex.Manual
   4 | 
   5 | --------------------------------------------------------------------------------
   6 | --          (Snoc)List Utilities
   7 | --------------------------------------------------------------------------------
   8 | 
   9 | public export
  10 | stripQuotes : SnocList Char -> String
  11 | stripQuotes (sx :< _) = case sx <>> [] of
  12 |   _ :: t => pack t
  13 |   _      => ""
  14 | stripQuotes [<]       = ""
  15 | 
  16 | namespace List
  17 |   export
  18 |   ltrim : List Char -> List Char
  19 |   ltrim (c :: cs) = if isSpace c then ltrim cs else (c :: cs)
  20 |   ltrim []        = []
  21 |   
  22 |   export
  23 |   countHashtag : List Char -> Nat
  24 |   countHashtag ('#' :: t) = S $ countHashtag t
  25 |   countHashtag _          = 0
  26 | 
  27 | namespace SnocList
  28 |   export
  29 |   rtrim : SnocList Char -> SnocList Char
  30 |   rtrim (sc :< c) = if isSpace c then rtrim sc else (sc :< c)
  31 |   rtrim [<]       = [<]
  32 | 
  33 |   export
  34 |   dropHead : Nat -> SnocList Char -> List Char
  35 |   dropHead n = drop n . (<>> [])
  36 |   
  37 |   export
  38 |   countHashtag : SnocList Char -> Nat
  39 |   countHashtag = countHashtag . (<>> [])
  40 | 
  41 | --------------------------------------------------------------------------------
  42 | --          Single-Character Lexers
  43 | --------------------------------------------------------------------------------
  44 | 
  45 | export
  46 | any : Lexer
  47 | any = autoLift tail
  48 | 
  49 | ||| Recognise any character if the sub-lexer `l` fails.
  50 | ||| /(?!`l`)./
  51 | export
  52 | non : (l : Lexer) -> Lexer
  53 | non l = reject l <+> any
  54 | 
  55 | ||| Recognise a specific item.
  56 | ||| /[`x`]/
  57 | export %inline
  58 | is : (c : Char) -> Lexer
  59 | is x = pred (==x)
  60 | 
  61 | ||| Recognise anything but the given item.
  62 | ||| /[\^`x`]/
  63 | export %inline
  64 | isNot : (x : Char) -> Lexer
  65 | isNot x = pred (/=x)
  66 | 
  67 | ||| Recognise a single whitespace character.
  68 | export
  69 | space : Lexer
  70 | space = pred isSpace
  71 | 
  72 | ||| Recognise a single digit.
  73 | export
  74 | digit : Lexer
  75 | digit = pred isDigit
  76 | 
  77 | ||| Recognise a specific character (case-insensitive).
  78 | ||| /[`x`]/i
  79 | export
  80 | like : (x : Char) -> Lexer
  81 | like x = let x' := toUpper x in pred ((x' ==) . toUpper)
  82 | 
  83 | ||| Recognise anything but the given character (case-insensitive).
  84 | ||| /[\^`x`]/i
  85 | export
  86 | notLike : (x : Char) -> Lexer
  87 | notLike x = let x' := toUpper x in pred ((x' /=) . toUpper)
  88 | 
  89 | ||| Recognises one of the given characters.
  90 | export %inline
  91 | oneOf : List Char -> Lexer
  92 | oneOf ts = pred (`elem` ts)
  93 | 
  94 | ||| Recognise a character in a range. Also works in reverse!
  95 | ||| /[`start`-`end`]/
  96 | export %inline
  97 | range : (start : Char) -> (end : Char) -> Lexer
  98 | range start end =
  99 |   let s := min start end
 100 |       e := max start end
 101 |    in pred (\x => x >= s && x <= e)
 102 | 
 103 | --------------------------------------------------------------------------------
 104 | --          Multi-Character Lexers
 105 | --------------------------------------------------------------------------------
 106 | 
 107 | export
 108 | prefixBy : (fs : List (Char -> Bool)) -> Lexer
 109 | prefixBy (f :: []) = pred f
 110 | prefixBy (f :: fs) = pred f <+> prefixBy fs
 111 | prefixBy []        = fail
 112 | 
 113 | export
 114 | exact : String -> Lexer
 115 | exact s =
 116 |   let cs@(_ :: _) := unpack s | [] => fail
 117 |    in autoLift $ exact cs
 118 | 
 119 | export
 120 | approx : String -> Lexer
 121 | approx = prefixBy . map check . unpack
 122 | 
 123 |   where
 124 |     check : Char -> Char -> Bool
 125 |     check c d = toLower c == toLower d
 126 | 
 127 | ||| Recognise a non-empty sequence of digits.
 128 | export
 129 | digits : Lexer
 130 | digits = autoLift digits1
 131 | 
 132 | ||| Recognise a single non-whitespace, non-alphanumeric character
 133 | ||| /[\^\\sA-Za-z0-9]/
 134 | export
 135 | symbol : Lexer
 136 | symbol = pred (\x => not (isSpace x || isAlphaNum x))
 137 | 
 138 | ||| Recognise one or more non-whitespace, non-alphanumeric characters
 139 | ||| /[\^\\sA-Za-z0-9]+/
 140 | export
 141 | symbols : Lexer
 142 | symbols = preds (\x => not (isSpace x || isAlphaNum x))
 143 | 
 144 | ||| Recognise a single control character
 145 | ||| /[\\x00-\\x1f\\x7f-\\x9f]/
 146 | export
 147 | control : Lexer
 148 | control = pred isControl
 149 | 
 150 | ||| Recognise one or more control characters
 151 | ||| /[\\x00-\\x1f\\x7f-\\x9f]+/
 152 | export
 153 | controls : Lexer
 154 | controls = preds isControl
 155 | 
 156 | ||| Recognises a non-empty sequence of the given items
 157 | export %inline
 158 | someOf : List Char -> Lexer
 159 | someOf ts = preds (`elem` ts)
 160 | 
 161 | ||| Recognise some items in a range. Also works in reverse!
 162 | ||| /[`start`-`end`]/
 163 | export %inline
 164 | ranges : (start, end : Char) -> Lexer
 165 | ranges start end =
 166 |   let s := min start end
 167 |       e := max start end
 168 |    in preds (\x => x >= s && x <= e)
 169 | 
 170 | ||| Recognise a non-empty sequence of whitespace characters.
 171 | export
 172 | spaces : Lexer
 173 | spaces = preds isSpace
 174 | 
 175 | ||| Recognise a non-empty sequence of space characters.
 176 | export
 177 | spaceChars : Lexer
 178 | spaceChars = preds isSpaceChar
 179 | 
 180 | ||| Recognise a single newline character sequence
 181 | export
 182 | newline : Lexer
 183 | newline = Lift $ \sc,cs => case cs of
 184 |   '\r' :: '\n' :: t => Succ t
 185 |   '\n' ::         t => Succ t
 186 |   '\r' ::         t => Succ t
 187 |   _                 => fail sc cs
 188 | 
 189 | ||| Reads characters until the next newline character is
 190 | ||| encountered.
 191 | export
 192 | manyTillNewline : Recognise False
 193 | manyTillNewline = preds0 $ \case {'\n' => False; '\r' => False; _ => True}
 194 | 
 195 | ||| Reads characters until the next linefeed character (`'\n'`) is
 196 | ||| encountered.
 197 | export
 198 | manyTillLineFeed : Recognise False
 199 | manyTillLineFeed = preds0 $ \case {'\n' => False; _ => True}
 200 | 
 201 | ||| Lexer for single line comments starting with the given prefix.
 202 | |||
 203 | ||| This reads until (but does not include) the first newline
 204 | ||| character `'\n'` or `'\r'`.
 205 | export
 206 | lineComment : (pre : Lexer) -> Lexer
 207 | lineComment pre = pre <+> manyTillNewline
 208 | 
 209 | ||| Lexer for single line comments starting with the given prefix.
 210 | |||
 211 | ||| This reads until (but does not include) the first line feed
 212 | ||| character (`'\n'`).
 213 | export
 214 | linefeedComment : (pre : Lexer) -> Lexer
 215 | linefeedComment pre = pre <+> manyTillLineFeed
 216 | 
 217 | --------------------------------------------------------------------------------
 218 | --          Combinators
 219 | --------------------------------------------------------------------------------
 220 | 
 221 | export
 222 | atLeast : (n : Nat) -> Lexer -> Recognise (isSucc n)
 223 | atLeast 0     f = many f
 224 | atLeast (S k) f = f <+> atLeast k f
 225 | 
 226 | export
 227 | exactly : (n : Nat) -> Lexer -> {auto 0 _ : IsSucc n} -> Lexer
 228 | exactly 1           f = f
 229 | exactly (S k@(S _)) f = f <+> exactly k f
 230 | 
 231 | export
 232 | manyUntil : (stopBefore : Recognise c) -> Lexer -> Recognise False
 233 | manyUntil sb l = many (reject sb <+> l)
 234 | 
 235 | export
 236 | someUntil : (stopBefore : Recognise c) -> Lexer -> Lexer
 237 | someUntil sb l = some (reject sb <+> l)
 238 | 
 239 | export
 240 | manyThen : (stopAfter : Recognise c) -> (l : Lexer) -> Recognise c
 241 | manyThen stopAfter l = manyUntil stopAfter l <+> stopAfter
 242 | 
 243 | ||| Recognise zero or more occurrences of a sub-lexer between
 244 | ||| delimiting lexers
 245 | ||| /`start`(`l`)\*?`end`/
 246 | export
 247 | surround : (start : Lexer) -> (end : Lexer) -> (l : Lexer) -> Lexer
 248 | surround start end l = start <+> manyThen end l
 249 | 
 250 | ||| Recognise zero or more occurrences of a sub-lexer surrounded
 251 | ||| by the same quote lexer on both sides (useful for strings)
 252 | ||| /`q`(`l`)\*?`q`/
 253 | export
 254 | quote : (q : Lexer) -> (l : Lexer) -> Lexer
 255 | quote q l = surround q q l
 256 | 
 257 | ||| Recognise an escape sub-lexer (often '\\') followed by
 258 | ||| another sub-lexer
 259 | ||| /[`esc`]`l`/
 260 | export
 261 | escape : (esc : Lexer) -> Lexer -> Lexer
 262 | escape esc l = esc <+> l
 263 | 
 264 | --------------------------------------------------------------------------------
 265 | --          Literals
 266 | --------------------------------------------------------------------------------
 267 | 
 268 | export
 269 | stringLit : Lexer
 270 | stringLit = Lift string
 271 | 
 272 | ||| Recognise an integer literal (possibly with a '-' prefix)
 273 | ||| /-?[0-9]+/
 274 | export %inline
 275 | intLit : Lexer
 276 | intLit = autoLift int
 277 | 
 278 | ||| Recognise an integer literal (possibly with a '+' or '-' prefix)
 279 | export %inline
 280 | intLitPlus : Lexer
 281 | intLitPlus = autoLift intPlus
 282 | 
 283 | export %inline
 284 | binDigits : Lexer
 285 | binDigits = preds isBinDigit
 286 | 
 287 | export %inline
 288 | hexDigits : Lexer
 289 | hexDigits = preds isHexDigit
 290 | 
 291 | export %inline
 292 | octDigits : Lexer
 293 | octDigits = preds isOctDigit
 294 | 
 295 | ||| Recognise a binary literal, prefixed by "0b"
 296 | ||| /0b[0-1]+/
 297 | export
 298 | binLit : Lexer
 299 | binLit = exact "0b" <+> binDigits
 300 | 
 301 | ||| Recognise a hexidecimal literal, prefixed by "0x" or "0X"
 302 | ||| /0[Xx][0-9A-Fa-f]+/
 303 | export
 304 | hexLit : Lexer
 305 | hexLit = approx "0x" <+> hexDigits
 306 | 
 307 | ||| Recognise an octal literal, prefixed by "0o"
 308 | ||| /0o[0-9A-Fa-f]+/
 309 | export
 310 | octLit : Lexer
 311 | octLit = exact "0o" <+> preds isOctDigit
 312 | 
 313 | ||| Recognise a decimal integer literal with optional undescores for
 314 | ||| improved readability.
 315 | export
 316 | digitsUnderscoredLit : Lexer
 317 | digitsUnderscoredLit = digits <+> many (is '_' <+> digits)
 318 | 
 319 | ||| Recognise a binary literal with optional undescores for
 320 | ||| improved readability.
 321 | export
 322 | binUnderscoredLit : Lexer
 323 | binUnderscoredLit = binLit <+> many (is '_' <+> binDigits)
 324 | 
 325 | ||| Recognise a hexadecimal literal with optional undescores for
 326 | ||| improved readability.
 327 | export
 328 | hexUnderscoredLit : Lexer
 329 | hexUnderscoredLit = hexLit <+> many (is '_' <+> hexDigits)
 330 | 
 331 | ||| Recognise an octal literal with optional undescores for
 332 | ||| improved readability.
 333 | export
 334 | octUnderscoredLit : Lexer
 335 | octUnderscoredLit = octLit <+> many (is '_' <+> octDigits)
 336 | 
 337 | ||| Recognise a character literal, including escaped characters.
 338 | ||| (Note: doesn't yet handle escape sequences such as \123)
 339 | ||| /'(\\\\.|[\^'])'/
 340 | export
 341 | charLit : Lexer
 342 | charLit = let q = '\'' in
 343 |               is q <+> (escape (is '\\') (control <|> any) <|> isNot q) <+> is q
 344 |   where
 345 |     lexStr : List String -> Lexer
 346 |     lexStr [] = fail
 347 |     lexStr (t :: ts) = exact t <|> lexStr ts
 348 | 
 349 |     control : Lexer
 350 |     control =
 351 |       lexStr
 352 |         ["NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL",
 353 |         "BS",  "HT",  "LF",  "VT",  "FF",  "CR",  "SO",  "SI",
 354 |         "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB",
 355 |         "CAN", "EM",  "SUB", "ESC", "FS",  "GS",  "RS",  "US",
 356 |         "SP",  "DEL"]
 357 |         <|> (is 'x' <+> hexDigits)
 358 |         <|> (is 'o' <+> octDigits)
 359 |         <|> digits
 360 | 
 361 | export
 362 | doubleLit : Lexer
 363 | doubleLit = Lift number
 364 |