1 | module Text.Regex.Parser.ERE
3 | import Control.Monad.Error.Interface
6 | import public Data.Either
7 | import public Data.DPair
9 | import Decidable.Equality
11 | import public Language.Reflection
13 | import public Text.Regex.Interface
14 | import public Text.Regex.Parser
30 | data PostfixOp : Type where
34 | RepN : Nat -> PostfixOp
35 | RepN_ : Nat -> PostfixOp
36 | RepNM : (l, r : Nat) -> (0 lr : l `LTE` r) => PostfixOp
37 | Rep_M : Nat -> PostfixOp
42 | | Cs Bool (List BracketChars)
43 | | Group Bool (SnocList RxLex)
44 | | Edge LineMode EdgeSide
48 | | Post RxLex PostfixOp
51 | postfixOp : Regex rx => PostfixOp -> rx a -> Exists rx
52 | postfixOp Rep0 = Evidence _ . rep
53 | postfixOp Rep1 = Evidence _ . rep1
54 | postfixOp Opt = Evidence _ . opt
55 | postfixOp (RepN n) = Evidence _ . repeatN n
56 | postfixOp (RepN_ n) = Evidence _ . repeatNOrMore n
57 | postfixOp (RepNM n m) = Evidence _ . repeatNM n m
58 | postfixOp (Rep_M m) = Evidence _ . repeatNOrLess m
60 | data CtxtNesting : Type
61 | record LexCtxt where
62 | constructor MkLexCtxt
63 | nesting : CtxtNesting
64 | lexemes : SnocList RxLex
66 | data CtxtNesting : Type where
68 | G : LexCtxt -> (matching : Bool) -> (openingPos : Nat) -> CtxtNesting
70 | lexERE : List Char -> Either BadRegex $
SnocList RxLex
71 | lexERE orig = go (MkLexCtxt E [<]) orig where
74 | pos : (left : List Char) -> Nat
75 | pos xs = orL `minus` length xs
76 | quotableChars : List Char
77 | quotableChars = unpack ".^$|(){}?*+"
79 | push : LexCtxt -> RxLex -> LexCtxt
80 | push (MkLexCtxt ch $
ls :< C l) (C r) = MkLexCtxt ch $
ls :< C (l ++ r)
81 | push (MkLexCtxt ch ls) l = MkLexCtxt ch $
ls :< l
83 | pushPostfix : (tail : List Char) -> LexCtxt -> PostfixOp -> Either BadRegex LexCtxt
84 | pushPostfix _ (MkLexCtxt ch $
sx :< C (cs@(_:<_) :< last)) op = pure $
MkLexCtxt ch $
sx :< C cs :< Post (C [<last]) op
85 | pushPostfix tail (MkLexCtxt ch $
sx :< Alt ) _ = Left $
RegexIsBad (pos tail) "illegal postfix operator after `|`"
86 | pushPostfix tail (MkLexCtxt ch $
sx :< Post {} ) _ = Left $
RegexIsBad (pos tail) "illegal or unsupported double postfix operator"
87 | pushPostfix _ (MkLexCtxt ch $
sx :< x ) op = pure $
MkLexCtxt ch $
sx :< Post x op
88 | pushPostfix tail _ _ = Left $
RegexIsBad (pos tail) "nothing to apply the postfix operator"
90 | go : LexCtxt -> List Char -> Either BadRegex $
SnocList RxLex
91 | go (MkLexCtxt E curr) [] = pure curr
92 | go (MkLexCtxt (G _ _ op) _) [] = Left $
RegexIsBad op "unmatched opening parenthesis"
93 | go ctx $
'.' :: xs = go (push ctx $
AnyC Line) xs
94 | go ctx $
'^' :: xs = go (push ctx $
Edge Line Start) xs
95 | go ctx $
'$' :: xs = go (push ctx $
Edge Line End) xs
96 | go ctx $
'|' :: xs = go (push ctx Alt) xs
97 | go ctx xxs@('('::'?'::':' :: xs) = go (MkLexCtxt (G ctx False (pos xxs)) [<]) xs
98 | go ctx xxs@('('::'?' :: xs) = Left $
RegexIsBad (pos xs) "unknown type of special group"
99 | go ctx xxs@('(' :: xs) = go (MkLexCtxt (G ctx True (pos xxs)) [<]) xs
100 | go (MkLexCtxt E _) xxs@(')' :: xs) = Left $
RegexIsBad (pos xxs) "unmatched closing parenthesis"
101 | go (MkLexCtxt (G ctx mtch op) ls) $
')' :: xs = go (push ctx $
Group mtch ls) xs
102 | go ctx xxs@('['::'^' :: xs) = parseCharsSet (pos xxs) orL True [<] xs >>= \(rest, cs) => go (push ctx $
Cs False cs) $
assert_smaller xs rest
103 | go ctx xxs@('[' :: xs) = parseCharsSet (pos xxs) orL True [<] xs >>= \(rest, cs) => go (push ctx $
Cs True cs) $
assert_smaller xs rest
104 | go ctx xxs@('*' :: xs) = go !(pushPostfix xxs ctx Rep0) xs
105 | go ctx xxs@('+' :: xs) = go !(pushPostfix xxs ctx Rep1) xs
106 | go ctx xxs@('?' :: xs) = go !(pushPostfix xxs ctx Opt) xs
107 | go ctx xxs@('{' :: xs) = do let (bnds, rest) = span (/= '}') xs
108 | let '}' :: rest = rest | _ => Left $
RegexIsBad (pos xxs) "unmatched opening curly bracket"
109 | let posxs : Lazy Nat := pos xs
110 | let l@(_::_):::r@(_::_)::[] = split (== ',') bnds
111 | | l:::[] => parseNat 10 posxs l >>= \n => go !(pushPostfix xxs ctx $
RepN n) $
assert_smaller xs rest
112 | | []:::r::[] => parseNat 10 posxs r >>= \m => go !(pushPostfix xxs ctx $
Rep_M m) $
assert_smaller xs rest
113 | | l:::[]::[] => parseNat 10 posxs l >>= \n => go !(pushPostfix xxs ctx $
RepN_ n) $
assert_smaller xs rest
114 | | _ => Left $
RegexIsBad posxs "too many commas in the bounds, zero or one is expected"
115 | r <- parseNat 10 (1 + posxs + length l) r;
l <- parseNat 10 posxs l
116 | let Yes lr = isLTE l r | No _ => Left $
RegexIsBad posxs "left bound must not be greater than right bound"
117 | go !(pushPostfix xxs ctx $
RepNM l r) $
assert_smaller xs rest
118 | go ctx $
'\\'::'X' :: xs = go (push ctx $
AnyC Text) xs
119 | go ctx $
'\\'::'A' :: xs = go (push ctx $
Edge Text Start) xs
120 | go ctx $
'\\'::'z' :: xs = go (push ctx $
Edge Text End) xs
121 | go ctx $
'\\'::'Z' :: xs = go (push ctx $
NL Final) xs
122 | go ctx $
'\\'::'R' :: xs = go (push ctx $
NL Generic) xs
123 | go ctx $
'\\'::'N' :: xs = go (push ctx $
NL NonNL) xs
124 | go ctx $
'\\'::'w' :: xs = go (push ctx $
Cs True [Class True Word]) xs
125 | go ctx $
'\\'::'W' :: xs = go (push ctx $
Cs True [Class False Word]) xs
126 | go ctx $
'\\'::'s' :: xs = go (push ctx $
Cs True [Class True Space]) xs
127 | go ctx $
'\\'::'S' :: xs = go (push ctx $
Cs True [Class False Space]) xs
128 | go ctx $
'\\'::'d' :: xs = go (push ctx $
Cs True [Class True Digit]) xs
129 | go ctx $
'\\'::'D' :: xs = go (push ctx $
Cs True [Class False Digit]) xs
130 | go ctx $
'\\'::'b' :: xs = go (push ctx $
WB True True) xs
131 | go ctx $
'\\'::'B' :: xs = go (push ctx $
WB False False) xs
132 | go ctx $
'\\'::'<' :: xs = go (push ctx $
WB True False) xs
133 | go ctx $
'\\'::'>' :: xs = go (push ctx $
WB False True) xs
134 | go ctx $
'\\'::'n' :: xs = go (push ctx $
C [<'\n']) xs
135 | go ctx $
'\\'::'r' :: xs = go (push ctx $
C [<'\r']) xs
136 | go ctx $
'\\'::'t' :: xs = go (push ctx $
C [<'\t']) xs
137 | go ctx $
'\\'::'f' :: xs = go (push ctx $
C [<'\f']) xs
138 | go ctx $
'\\'::'v' :: xs = go (push ctx $
C [<'\v']) xs
139 | go ctx $
'\\'::'0' :: xs = go (push ctx $
C [<'\0']) xs
140 | go ctx $
'\\'::'a' :: xs = go (push ctx $
C [<'\a']) xs
141 | go ctx $
'\\'::'\\':: xs = go (push ctx $
C [<'\\']) xs
142 | go ctx $
'\\'::'x'::'{' :: xs = do
143 | let (hexes, rest) = span (/= '}') xs
144 | let '}' :: rest = rest | _ => Left $
RegexIsBad (pos xs `minus` 1) "unmatched opening curly bracket"
145 | n <- parseNat 16 (pos xs) hexes
146 | let True = shiftR (cast {to=Integer} n) 32 == 0 | False => Left $
RegexIsBad (pos xs) "we expect no more than a 32-bit hex number"
147 | go (push ctx $
C [< chr $
cast n]) $
assert_smaller xs rest
148 | go ctx $
'\\'::'x'::ulxs@(u::l :: xs) = parseNat 16 (pos ulxs) [u,l] >>= \n => go (push ctx $
C [< chr $
cast n]) xs
149 | go ctx xxs@('\\'::'x':: xs) = Left $
RegexIsBad (pos xxs) "bad hex character command, use formats \xFF or \x{FFF...}"
150 | go ctx $
'\\'::xxs@(x::xs) = if x `elem` quotableChars
151 | then go (push ctx $
C [<x]) xs
152 | else Left $
RegexIsBad (pos xxs) "unknown quote character '\\\{show x}'"
153 | go ctx $
x :: xs = go (push ctx $
C [<x]) xs
160 | crumple : Monoid a => {0 f : _} ->
161 | List (
n ** f $
Vect n a)
->
162 | Exists $
\tys => (All f tys, (
n ** (All Prelude.id tys -> Vect n a, Any Prelude.id tys -> Vect n a))
)
163 | crumple [] = Evidence _ ([], MkDPair _ (\case [] => [], \case Here
impossible))
164 | crumple ((
n ** r)
:: rs) = do
165 | let Evidence tys' (rs, (
n' ** (cAll, cAny))
) = crumple {f} rs
166 | let spreadAny : Any Prelude.id (Vect n a :: tys') -> Vect (n + n') a
167 | spreadAny $
Here x = x ++ replicate n' neutral
168 | spreadAny $
There x = replicate n neutral ++ cAny x
169 | Evidence _ (r::rs, (
_ ** (\(x::xs) => x ++ cAll xs, spreadAny))
)
175 | parseRegex'' : Regex rx => List RxLex -> (
n ** rx $
Vect n String)
176 | parseRegex'' = alts where
177 | concatAll : List (
n ** rx $
Vect n String)
-> (
n ** rx $
Vect n String)
178 | concatAll xs = let Evidence _ (rs, MkDPair _ (conv, _)) = crumple xs in (
_ ** conv <$> all rs)
179 | alts : List RxLex -> (
n ** rx $
Vect n String)
180 | conseq : List RxLex -> List (
n ** rx $
Vect n String)
182 | let alts = forget $
concatAll . assert_total conseq <$> List.split (\case Alt => True;
_ => False) lxs
183 | let Evidence _ (alts, (
n ** (_, conv))
) = crumple alts
184 | MkDPair _ $
conv <$> exists alts
185 | conseq [] = pure (
_ ** pure [])
186 | conseq $
C [<c] :: xs = (:: conseq xs) $
MkDPair _ $
[] <$ char c
187 | conseq $
C cs :: xs = (:: conseq xs) $
MkDPair _ $
[] <$ string (pack $
cast cs)
188 | conseq $
WB l r :: xs = (:: conseq xs) $
MkDPair _ $
[] <$ wordBoundary l r
189 | conseq $
Cs nonneg cs :: xs = (:: conseq xs) $
MkDPair _ $
[] <$ bracketMatcher nonneg cs
190 | conseq $
Group False sx :: xs = (:: conseq xs) $
MkDPair _ $
[] <$ (alts $
cast sx).snd
191 | conseq $
Group True sx :: xs = (:: conseq xs) $
MkDPair 1 $
(::[]) <$> matchOf (alts $
cast sx).snd
192 | conseq $
Edge t s :: xs = (:: conseq xs) $
MkDPair _ $
[] <$ edge t s
193 | conseq $
NL NonNL :: xs = (:: conseq xs) $
MkDPair _ $
[] <$ sym (not . isNL)
194 | conseq $
NL Generic :: xs = (:: conseq xs) $
MkDPair _ $
[] <$ genericNL
195 | conseq $
NL Final :: xs = (:: conseq xs) $
MkDPair _ $
[] <$ all [optional genericNL, edge Text End]
196 | conseq $
AnyC m :: xs = (:: conseq xs) $
MkDPair _ $
[] <$ anyChar m
197 | conseq $
Post lx op :: xs = (:: conseq xs) $
MkDPair _ $
[] <$ (postfixOp op (alts [lx]).snd).snd
198 | conseq $
Alt :: xs = (:: conseq xs) $
MkDPair _ $
pure []
201 | parseRegex : Regex rx => String -> Either BadRegex (
n ** rx $
Vect n String)
202 | parseRegex str = parseRegex'' . cast <$> lexERE (unpack str)
205 | parseRegexN : Regex rx => (matchingGroupsCnt : Nat) -> String -> Either BadRegex $
rx $
Vect matchingGroupsCnt String
206 | parseRegexN n str = do
207 | (
n' ** r)
<- parseRegex str
208 | let Yes Refl = decEq n n' | No _ => throwError $
RegexIsBad 0 "Regex is expected to contain \{show n} matching groups, but contains \{show n'}"
213 | parseAnyRegex : Regex rx => String -> Either BadRegex $
Exists rx
214 | parseAnyRegex = map (\(_ ** r) => Evidence _ r) . parseRegex
216 | public export %inline
217 | toRegex : Regex rx => (s : String) -> (0 _ : IsRight $
parseRegex {rx} s) => rx $
Vect (fst $
fromRight $
parseRegex {rx} s) String
218 | toRegex s = snd $
fromRight $
parseRegex {rx} s
221 | (.erx) : Regex rx => String -> Elab $
rx a
223 | let patchFC : Nat -> FC -> FC
224 | patchFC ofs fc@(MkFC origin (l, c) (l', _)) = if l /= l' then fc else let p = (l, c + cast ofs) in MkFC origin p p
225 | patchFC ofs fc@(MkVirtualFC origin (l, c) (l', _)) = if l /= l' then fc else let p = (l, c + cast ofs) in MkVirtualFC origin p p
226 | patchFC ofs EmptyFC = EmptyFC
227 | let Right $
Evidence ty r = parseAnyRegex {rx} str
228 | | Left (RegexIsBad idx reason) => do failAt (patchFC idx $
getFC !(quote str)) "Bad regular expression at position \{show idx}: \{reason}"
229 | Just Refl <- catch $
check {expected = a = ty} `(Refl
)
230 | | Nothing => do fail "Unable to match expected type \{show !(quote a)} with the regex type \{show !(quote ty)}"