0 | ||| A simple module to process 'literate' documents.
   1 | |||
   2 | ||| The module uses a lexer to split the document into code blocks,
   3 | ||| delineated by user-defined markers, and code lines that are
   4 | ||| indicated be a line marker. The lexer returns a document stripped
   5 | ||| of non-code elements but preserving the original document's line
   6 | ||| count. Column numbering of code lines are not preserved.
   7 | |||
   8 | ||| The underlying tokeniser is greedy.
   9 | |||
  10 | ||| Once it identifies a line marker it reads a prettifying space then
  11 | ||| consumes until the end of line. Once identifies a starting code
  12 | ||| block marker, the lexer will consume input until the next
  13 | ||| identifiable end block is encountered. Any other content is
  14 | ||| treated as part of the original document.
  15 | |||
  16 | ||| Thus, the input literate files *must* be well-formed w.r.t
  17 | ||| to code line markers and code blocks.
  18 | |||
  19 | ||| A further restriction is that literate documents cannot contain
  20 | ||| the markers within the document's main text: This will confuse the
  21 | ||| lexer.
  22 | |||
  23 | module Libraries.Text.Literate
  24 | 
  25 | import Libraries.Text.Lexer
  26 | 
  27 | import Data.List1
  28 | import Data.List.Views
  29 | import Data.String
  30 | 
  31 | %default total
  32 | 
  33 | untilEOL : Recognise False
  34 | untilEOL = manyUntil newline any
  35 | 
  36 | line : String -> Lexer
  37 | line s = exact s <+> (newline <|> space <+> untilEOL)
  38 | 
  39 | block : String -> String -> Lexer
  40 | block s e = surround (exact s <+> untilEOL) (exact e <+> untilEOL) any
  41 | 
  42 | notCodeLine : Lexer
  43 | notCodeLine = newline
  44 |            <|> any <+> untilEOL
  45 | 
  46 | public export
  47 | data LitToken
  48 |   = CodeBlock String String String
  49 |   | Any String
  50 |   | CodeLine String String
  51 | 
  52 | Show LitToken where
  53 |   showPrec d (CodeBlock l r x) = showCon d "CodeBlock" $ showArg l ++ showArg r ++ showArg x
  54 |   showPrec d (Any x)           = showCon d "Any" $ showArg x
  55 |   showPrec d (CodeLine m x)    = showCon d "CodeLine" $ showArg m ++ showArg x
  56 | 
  57 | rawTokens : (delims  : List (String, String))
  58 |          -> (markers : List String)
  59 |          -> TokenMap LitToken
  60 | rawTokens delims ls =
  61 |           map (\(l,r) => (block l r, CodeBlock (trim l) (trim r))) delims
  62 |        ++ map (\m => (line m, CodeLine (trim m))) ls
  63 |        ++ [(notCodeLine, Any)]
  64 | 
  65 | ||| Merge the tokens into a single source file.
  66 | reduce : List (WithBounds LitToken) -> List String -> String
  67 | reduce [] acc = fastConcat (reverse acc)
  68 | reduce (MkBounded (Any x) _ _ :: rest) acc =
  69 |   -- newline will always be tokenized as a single token
  70 |   if x == "\n"
  71 |   then reduce rest ("\n"::acc)
  72 |   else reduce rest acc
  73 | 
  74 | reduce (MkBounded (CodeLine m src) _ _ :: rest) acc =
  75 |     if m == trim src
  76 |     then reduce rest ("\n"::acc)
  77 |     else reduce rest ((substr (length m + 1) -- remove space to right of marker.
  78 |                               (length src)
  79 |                               src
  80 |                       )::acc)
  81 | 
  82 | reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc with (lines src) -- Strip the deliminators surrounding the block.
  83 |   reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc | [] = reduce rest acc -- 1
  84 |   reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc | (s :: ys) with (snocList ys)
  85 |     reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc | (s :: []) | Empty = reduce rest acc -- 2
  86 |     reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc | (s :: (srcs ++ [f])) | (Snoc f srcs rec) =
  87 |         -- the "\n" counts for the open deliminator; the closing deliminator should always be followed by a (Any "\n"), so we don't add a newline
  88 |         reduce rest ((unlines srcs) :: "\n" :: acc)
  89 | 
  90 | -- [ NOTE ] 1 & 2 shouldn't happen as code blocks are well formed i.e. have two deliminators.
  91 | 
  92 | 
  93 | public export
  94 | record LiterateError where
  95 |   constructor MkLitErr
  96 |   line   : Int
  97 |   column : Int
  98 |   input  : String
  99 | 
 100 | export
 101 | Show LiterateError where
 102 |   show (MkLitErr line col input) = "\{input}:\{show line}:\{show col}"
 103 | 
 104 | ||| Description of literate styles.
 105 | |||
 106 | ||| A 'literate' style comprises of
 107 | |||
 108 | ||| + a list of code block deliminators (`deliminators`);
 109 | ||| + a list of code line markers (`line_markers`); and
 110 | ||| + a list of known file extensions `file_extensions`.
 111 | |||
 112 | ||| Some example specifications:
 113 | |||
 114 | ||| + Bird Style
 115 | |||
 116 | |||```
 117 | |||MkLitStyle Nil [">"] [".lidr"]
 118 | |||```
 119 | |||
 120 | ||| + Literate Haskell (for LaTeX)
 121 | |||
 122 | |||```
 123 | |||MkLitStyle [("\\begin{code}", "\\end{code}"),("\\begin{spec}","\\end{spec}")]
 124 | |||           Nil
 125 | |||           [".lhs", ".tex"]
 126 | |||```
 127 | |||
 128 | ||| + OrgMode
 129 | |||
 130 | |||```
 131 | |||MkLitStyle [("#+BEGIN_SRC idris","#+END_SRC"), ("#+COMMENT idris","#+END_COMMENT")]
 132 | |||           ["#+IDRIS:"]
 133 | |||           [".org"]
 134 | |||```
 135 | |||
 136 | ||| + Common Mark
 137 | |||
 138 | |||```
 139 | |||MkLitStyle [("```idris","```"), ("<!-- idris","--!>")]
 140 | |||           Nil
 141 | |||           [".md", ".markdown"]
 142 | |||```
 143 | |||
 144 | public export
 145 | record LiterateStyle where
 146 |   constructor MkLitStyle
 147 |   ||| The pairs of start and end tags for code blocks.
 148 |   deliminators : List (String, String)
 149 | 
 150 |   ||| Line markers that indicate a line contains code.
 151 |   line_markers : List String
 152 | 
 153 |   ||| Recognised file extensions. Not used by the module, but will be
 154 |   ||| of use when connecting to code that reads in the original source
 155 |   ||| files.
 156 |   file_extensions : List String
 157 | 
 158 | ||| Given a 'literate specification' lex the file to extract code blocks
 159 | |||
 160 | ||| @specification The literate specification to use.
 161 | ||| @litStr  The literate source file.
 162 | |||
 163 | ||| Returns a `LiterateError` if the literate file contains malformed
 164 | ||| code blocks or code lines.
 165 | export
 166 | lexLiterate : (specification : LiterateStyle)
 167 |            -> (litStr        : String)
 168 |            -> Either LiterateError (List (WithBounds LitToken))
 169 | lexLiterate (MkLitStyle delims markers exts) str =
 170 |       case lex (rawTokens delims markers) str of
 171 |         (toks, (_,_,"")) => Right toks
 172 |         (_, (l,c,i))     => Left (MkLitErr l c i)
 173 | 
 174 | ||| Given a 'literate specification' extract the code from the
 175 | ||| literate source file (`litStr`) that follows the presented style.
 176 | |||
 177 | ||| @specification The literate specification to use.
 178 | ||| @litStr The literate source file.
 179 | |||
 180 | ||| Returns a `LiterateError` if the literate file contains malformed
 181 | ||| code blocks or code lines.
 182 | |||
 183 | export
 184 | extractCode : (specification : LiterateStyle)
 185 |            -> (litStr        : String)
 186 |            -> Either LiterateError String
 187 | extractCode spec str = flip reduce Nil <$> lexLiterate spec str
 188 | 
 189 | ||| Synonym for `extractCode`.
 190 | export
 191 | unlit : (specification : LiterateStyle)
 192 |      -> (litStr        : String)
 193 |      -> Either LiterateError String
 194 | unlit = extractCode
 195 | 
 196 | ||| Is the provided line marked up using a line marker?
 197 | |||
 198 | ||| If the line is suffixed by any one of the style's set of line
 199 | ||| markers then return length of literate line marker, and the code stripped from the line
 200 | ||| marker. Otherwise, return Nothing and the unmarked line.
 201 | export
 202 | isLiterateLine : (specification : LiterateStyle)
 203 |               -> (str : String)
 204 |               -> Pair (Maybe String) String
 205 | isLiterateLine (MkLitStyle delims markers _) str with (lex (rawTokens delims markers) str)
 206 |   isLiterateLine (MkLitStyle delims markers _) str | ([MkBounded (CodeLine m str') _ _], (_,_, "")) = (Just m, str')
 207 |   isLiterateLine (MkLitStyle delims markers _) str | (_, _) = (Nothing, str)
 208 | 
 209 | ||| Given a 'literate specification' embed the given code using the
 210 | ||| literate style provided.
 211 | |||
 212 | ||| If the style uses deliminators to denote code blocks use the first
 213 | ||| pair of deliminators in the style. Otherwise use first linemarker
 214 | ||| in the style. If there is **no style** return the presented code
 215 | ||| string unembedded.
 216 | |||
 217 | |||
 218 | ||| @specification The literate specification to use.
 219 | ||| @code  The code to embed,
 220 | |||
 221 | |||
 222 | export
 223 | embedCode : (specification : LiterateStyle)
 224 |          -> (code : String)
 225 |          -> String
 226 | embedCode (MkLitStyle ((s,e)::delims) _            _) str = unlines [s,str,e]
 227 | embedCode (MkLitStyle Nil             (m::markers) _) str = unwords [m, str]
 228 | embedCode (MkLitStyle _               _            _) str = str
 229 | 
 230 | ||| Synonm for `embedCode`
 231 | export
 232 | relit : (specification : LiterateStyle)
 233 |      -> (code : String)
 234 |      -> String
 235 | relit = embedCode
 236 | 
 237 | -- --------------------------------------------------------------------- [ EOF ]
 238 |