4 | import Data.ByteString
5 | import Data.Linear.Ref1
6 | import Derive.Prelude
8 | import IO.Async.Loop.Epoll
9 | import IO.Async.Loop.Posix
11 | import Text.ILex.Derive
14 | import public Text.ILex
17 | %language ElabReflection
24 | data CoordinateSystem = ZeroBased
27 | %runElab derive "CoordinateSystem" [Show,Eq]
34 | data FASTAValue : Type where
35 | NL : ByteString -> FASTAValue
36 | HeaderStart : FASTAValue
37 | HeaderValue : String -> FASTAValue
38 | Adenine : Nat -> FASTAValue
39 | Thymine : Nat -> FASTAValue
40 | Guanine : Nat -> FASTAValue
41 | Cytosine : Nat -> FASTAValue
43 | %runElab derive "FASTAValue" [Show,Eq]
45 | isHeader : FASTAValue -> Bool
46 | isHeader HeaderStart = True
47 | isHeader (HeaderValue _) = True
50 | isData : FASTAValue -> Bool
51 | isData (Adenine _) = True
52 | isData (Thymine _) = True
53 | isData (Guanine _) = True
54 | isData (Cytosine _) = True
62 | record FASTALine where
63 | constructor MkFASTALine
65 | values : List FASTAValue
67 | %runElab derive "FASTALine" [Show,Eq]
69 | Interpolation FASTALine where interpolate = show
77 | FASTA = List FASTALine
83 | linebreak : RExp True
84 | linebreak = '\n' <|> '\r' <|> "\r\n"
86 | nucleotide : RExp True
87 | nucleotide = 'A' <|> 'T' <|> 'G' <|> 'C'
98 | cytosine : RExp True
106 | record FSTCK (q : Type) where
108 | prev_ : Ref q ByteString
109 | cur_ : Ref q ByteString
110 | offset_ : Ref q Nat
111 | relpos_ : Ref q Integer
113 | positions_ : Ref q (SnocList BytePos)
114 | strs : Ref q (SnocList String)
115 | err : Ref q (Maybe $
BBErr Void)
116 | fastavalues : Ref q (SnocList FASTAValue)
117 | fastalines : Ref q (SnocList FASTALine)
118 | fastacounter : Ref q Nat
122 | HasBBErr FSTCK Void where
126 | HasStringLits FSTCK where
130 | HasStack FSTCK (SnocList FASTALine) where
134 | HasBytes FSTCK where
140 | positions = positions_
143 | fastainit : CoordinateSystem -> F1 q (FSTCK q)
144 | fastainit coordsys = T1.do
156 | fc <- case coordsys of
157 | ZeroBased => ref1 Z
158 | OneBased => ref1 (S Z)
160 | pure (F pr fl ro rr ll ps ss er fvs fls fc ln)
166 | %runElab deriveParserState "FSz" "FST"
167 | ["FIni", "FBroken", "FHdr", "FHdrToNLR", "FHdrToNLS", "FHdrDone", "FD", "FDNL", "FEmpty", "FComplete"]
173 | fastaErr : Arr32 FSz (FSTCK q -> F1 q (BBErr Void))
175 | arr32 FSz (unexpected [])
176 | [ E FBroken $
unexpected ["character other than '>'"]
177 | , E FEmpty $
unexpected ["sequence data"]
178 | , E FHdr $
unexpected ["sequence line"]
185 | onFASTAValueHdrS : (x : FSTCK q) => FASTAValue -> F1 q FST
186 | onFASTAValueHdrS v = push1 x.fastavalues v >> pure FHdrToNLS
188 | onFASTAValueHdrR : (x : FSTCK q) => FASTAValue -> F1 q FST
189 | onFASTAValueHdrR v = push1 x.fastavalues v >> pure FHdrToNLR
191 | onFASTAValueAdenine : (x : FSTCK q) => F1 q FST
192 | onFASTAValueAdenine = T1.do
193 | fc <- read1 x.fastacounter
194 | push1 x.fastavalues (Adenine fc) >> write1 x.fastacounter (S fc) >> pure FD
196 | onFASTAValueThymine : (x : FSTCK q) => F1 q FST
197 | onFASTAValueThymine = T1.do
198 | fc <- read1 x.fastacounter
199 | push1 x.fastavalues (Thymine fc) >> write1 x.fastacounter (S fc) >> pure FD
201 | onFASTAValueGuanine : (x : FSTCK q) => F1 q FST
202 | onFASTAValueGuanine = T1.do
203 | fc <- read1 x.fastacounter
204 | push1 x.fastavalues (Guanine fc) >> write1 x.fastacounter (S fc) >> pure FD
206 | onFASTAValueCytosine : (x : FSTCK q) => F1 q FST
207 | onFASTAValueCytosine = T1.do
208 | fc <- read1 x.fastacounter
209 | push1 x.fastavalues (Cytosine fc) >> write1 x.fastacounter (S fc) >> pure FD
211 | onNLFHdr : (x : FSTCK q) => ByteString -> F1 q FST
214 | push1 x.fastavalues (NL v)
215 | fvs@(_::_) <- getList x.fastavalues | [] => pure FEmpty
216 | case Prelude.any isHeader fvs && Prelude.any isData fvs of
217 | True => pure FBroken
220 | push1 x.fastalines (MkFASTALine ln fvs)
223 | onNLFD : (x : FSTCK q) => ByteString -> F1 q FST
226 | push1 x.fastavalues (NL v)
227 | fvs@(_::_) <- getList x.fastavalues | [] => pure FEmpty
228 | case Prelude.any isHeader fvs && Prelude.any isData fvs of
229 | True => pure FBroken
232 | push1 x.fastalines (MkFASTALine ln fvs)
235 | onEOI : (x : FSTCK q) => F1 q (Either (BBErr Void) FST)
238 | fvs@(_::_) <- getList x.fastavalues
239 | | [] => arrFail FSTCK fastaErr FEmpty x
241 | push1 x.fastalines (MkFASTALine ln fvs)
242 | pure (Right FComplete)
244 | fastaInit : DFA q FSz FSTCK
247 | [ string '>' (\_ => onFASTAValueHdrS HeaderStart)
250 | fastaHdrStrStart : DFA q FSz FSTCK
253 | [ string dot (onFASTAValueHdrR . HeaderValue)
256 | fastaHdrStrRest : DFA q FSz FSTCK
259 | [ string dot (onFASTAValueHdrR . HeaderValue)
260 | , bytes linebreak (\bs => onNLFHdr bs)
263 | fastaFDInit : DFA q FSz FSTCK
266 | [ step adenine onFASTAValueAdenine
267 | , step thymine onFASTAValueThymine
268 | , step guanine onFASTAValueGuanine
269 | , step cytosine onFASTAValueCytosine
272 | fastaFD : DFA q FSz FSTCK
275 | [ bytes linebreak onNLFD
276 | , step adenine onFASTAValueAdenine
277 | , step thymine onFASTAValueThymine
278 | , step guanine onFASTAValueGuanine
279 | , step cytosine onFASTAValueCytosine
282 | fastaSteps : Lex1 q FSz FSTCK
286 | , E FHdrToNLS fastaHdrStrStart
287 | , E FHdrToNLR fastaHdrStrRest
288 | , E FHdrDone fastaFDInit
289 | , E FDNL fastaFDInit
293 | fastaEOI : FST -> FSTCK q -> F1 q (Either (BBErr Void) FASTA)
295 | case st == FIni || st == FHdr || st == FEmpty || st == FBroken of
296 | True => arrFail FSTCK fastaErr st x
299 | fasta <- getList x.fastalines
307 | fasta : CoordinateSystem -> P1 q (BBErr Void) FASTA
308 | fasta coordsys = P FIni (fastainit coordsys) fastaSteps snocChunk fastaErr fastaEOI
311 | parseFASTA : CoordinateSystem -> Origin -> String -> Either (ParseError Void) FASTA
312 | parseFASTA coordsys origin str = parseString (fasta coordsys) origin str
318 | streamFASTA : CoordinateSystem
320 | -> AsyncPull Poll Void [BBErr Void, Errno] ()
321 | streamFASTA coordsys pth =
323 | |> streamParse (fasta coordsys)
325 | |> printLnTo Stdout
327 | streamFASTAFiles : CoordinateSystem
328 | -> AsyncPull Poll String [BBErr Void, Errno] ()
329 | -> AsyncPull Poll Void [BBErr Void, Errno] ()
330 | streamFASTAFiles coordsys pths =
331 | flatMap pths (\p => readBytes p |> streamParse (fasta coordsys))
333 | |> printLnTo Stdout