0 | module Text.ILex.Char.UTF8
3 | import Derive.Prelude
4 | import Text.ILex.RExp
5 | import Text.ParseError
8 | %language ElabReflection
12 | isAscii : Bits8 -> Bool
18 | isStartByte : Bits8 -> Bool
19 | isStartByte b = isAscii b || (b .&. 0b1100_0000) == 0b1100_0000
24 | record Codepoint where
27 | bytes : Vect (S len) Bits8
29 | go : (pre : Bits8) -> (n : Nat) -> Bits32 -> Vect n Bits8
32 | (pre + cast (prim__shr_Bits32 m (cast k * 6) .&. 63)) :: go 128 k m
34 | encode : Bits32 -> Codepoint
36 | if x < 0x80 then CP 0 [cast x]
37 | else if x < 0x800 then CP 1 $
go 0b1100_0000 _ x
38 | else if x < 0x10000 then CP 2 $
go 0b1110_0000 _ x
39 | else CP 3 $
go 0b1111_0000 _ x
47 | c1 : Vect (S len) Bits8
48 | c2 : Vect (S len) Bits8
50 | encodeSP : (n : Bits32) -> Vect (S $
len $
encode n) Bits8
51 | encodeSP n = bytes $
encode n
54 | spans : Vect m Bits8 -> Vect n Bits8 -> List Span
55 | spans x@[_] y@[_] = [SP x y]
56 | spans x@[_,_] y@[_,_] = [SP x y]
57 | spans x@[_,_,_] y@[_,_,_] = [SP x y]
58 | spans x@[_,_,_,_] y@[_,_,_,_] = [SP x y]
59 | spans x@[_] y = SP x (encodeSP 0x7f) :: spans (encodeSP 0x80) y
60 | spans x@[_,_] y = SP x (encodeSP 0x7ff) :: spans (encodeSP 0x800) y
61 | spans x@[_,_,_] y = SP x (encodeSP 0xffff) :: spans (encodeSP 0x10000) y
69 | MinAddByte, MaxAddByte : Bits8
70 | MinAddByte = 0b1000_0000
71 | MaxAddByte = 0b1011_1111
73 | bytes : (x,y : Bits8) -> RExp8 True
74 | bytes x y = Ch (range $
range (cast x) (cast y))
76 | anyBytes : Vect (S n) Bits8 -> RExp8 True
77 | anyBytes [_] = bytes MinAddByte MaxAddByte
78 | anyBytes (_ :: xs@(_::_)) = bytes MinAddByte MaxAddByte >> anyBytes xs
80 | fromSpan : Span -> RExp8 True
81 | fromSpan (SP xs ys) = go xs ys
83 | go : Vect (S n) Bits8 -> Vect (S n) Bits8 -> RExp8 True
84 | go [x] [y] = bytes x y
85 | go (x::xs@(_::_)) (y::ys@(_::_)) =
87 | True => bytes x x >> go xs ys
89 | let r1 := bytes x x >> go xs (ys $> MaxAddByte)
90 | r2 := bytes y y >> go (xs $> MinAddByte) ys
93 | True => r1 <|> r2 <|> (bytes (x+1) (y-1) >> anyBytes xs)
95 | fromRange : Range32 -> RExp8 True
99 | False => assert_total $
100 | case spans (encodeSP $
lowerBound r) (encodeSP $
upperBound r) of
102 | (s::ss) => foldr (\x,y => fromSpan x <|> y) (fromSpan s) ss
104 | convert : List Range32 -> RExp8 True
105 | convert [] = Ch empty
106 | convert (x :: xs) = foldr (\r,y => fromRange r <|> y) (fromRange x) xs
109 | toByteRanges : Set32 -> RExp8 True
110 | toByteRanges = convert . ranges . intersection unicode
113 | toUTF8 : (RExp b, a) -> (RExp8 b, a)
114 | toUTF8 (x,v) = (adjRanges toByteRanges x, v)