0 | module Graphics.DOT.Lexer
   1 | 
   2 | import Libraries.Text.Lexer
   3 | import Data.String
   4 | import Data.List
   5 | 
   6 | %default total
   7 | 
   8 | public export
   9 | data DOTToken : Type where
  10 |   ||| A keyword
  11 |   Keyword : (kw : String) -> DOTToken
  12 | 
  13 |   ||| An identifier as a name
  14 |   NameID : (name : String) -> DOTToken
  15 |   ||| An identifier as a numeral
  16 |   NumeralID : (numeral : String) -> DOTToken
  17 |   ||| An identifier as a double-quoted string
  18 |   StringID : (str : String) -> DOTToken
  19 |   ||| An identifier as a HTML string
  20 |   HTML_ID : (html : String) -> DOTToken
  21 | 
  22 |   ||| An edge operation in a directed graph
  23 |   DiGrEdgeOp : DOTToken
  24 |   ||| An edge operation in a graph
  25 |   GrEdgeOp : DOTToken
  26 |   ||| A compass point (used in combination with ports)
  27 |   CompassPt : (pt : String) -> DOTToken
  28 |   ||| A comment
  29 |   Comment : DOTToken
  30 |   ||| A multiline indicator
  31 |   MultilineBackslash : DOTToken
  32 |   ||| A double-quoted-string concatenation operator ('+')
  33 |   StrConcat : DOTToken
  34 | 
  35 |   ||| Any amount of whitespace
  36 |   Whitespace : DOTToken
  37 | 
  38 |   ||| [
  39 |   LBracket : DOTToken
  40 |   ||| ]
  41 |   RBracket : DOTToken
  42 |   ||| {
  43 |   LBrace : DOTToken
  44 |   ||| }
  45 |   RBrace : DOTToken
  46 |   ||| ,
  47 |   Comma : DOTToken
  48 |   ||| ;
  49 |   Semicolon : DOTToken
  50 |   ||| :
  51 |   Colon : DOTToken
  52 |   ||| =
  53 |   Equal : DOTToken
  54 | 
  55 | export
  56 | Show DOTToken where
  57 |   show (Keyword kw) = "(KW " ++ kw ++ ")"
  58 |   show (NameID name) = "(NameID " ++ name ++ ")"
  59 |   show (NumeralID numeral) = "(NumeralID " ++ numeral ++ ")"
  60 |   show (StringID str) = "(StringID " ++ str ++ ")"
  61 |   show (HTML_ID html) = "(HTML_ID " ++ html ++ ")"
  62 |   show DiGrEdgeOp = "DiEO"
  63 |   show GrEdgeOp = "GrEO"
  64 |   show (CompassPt pt) = "(CPt " ++ pt ++ ")"
  65 |   show Comment = "COMMENT"
  66 |   show MultilineBackslash = "MLB"
  67 |   show StrConcat = "STR++"
  68 |   show Whitespace = "WS"
  69 |   show LBracket = "LBracket"
  70 |   show RBracket = "RBracket"
  71 |   show LBrace = "LBrace"
  72 |   show RBrace = "RBrace"
  73 |   show Comma = "Comma"
  74 |   show Semicolon = "Semicolon"
  75 |   show Colon = "Colon"
  76 |   show Equal = "EQ"
  77 | 
  78 | ------------------------------------------------------------------------
  79 | -- Keywords
  80 | 
  81 | -- DOT keywords are case-insensitive
  82 | 
  83 | nodeKW : Lexer
  84 | nodeKW = approx "node"
  85 | 
  86 | edgeKW : Lexer
  87 | edgeKW = approx "edge"
  88 | 
  89 | graphKW : Lexer
  90 | graphKW = approx "graph"
  91 | 
  92 | digraphKW : Lexer
  93 | digraphKW = approx "digraph"
  94 | 
  95 | subgraphKW : Lexer
  96 | subgraphKW = approx "subgraph"
  97 | 
  98 | strictKW : Lexer
  99 | strictKW = approx "strict"
 100 | 
 101 | ||| Keywords are "node", "edge", "graph", "digraph", "subgraph", and "strict"
 102 | ||| (without the quotes)
 103 | keyword : Lexer
 104 | keyword =  nodeKW
 105 |        <|> edgeKW
 106 |        <|> graphKW
 107 |        <|> digraphKW
 108 |        <|> subgraphKW
 109 |        <|> strictKW
 110 | 
 111 | 
 112 | ------------------------------------------------------------------------
 113 | -- Identifiers
 114 | 
 115 | underscore : Lexer
 116 | underscore = is '_'
 117 | 
 118 | nameIDHead : Lexer
 119 | nameIDHead =  alpha
 120 |           <|> underscore
 121 | 
 122 | nameIDRest : Lexer
 123 | nameIDRest =  alphaNum
 124 |           <|> underscore
 125 | 
 126 | ||| Any string of alphabetic characters, underscores, or digits, not beginning
 127 | ||| with a digit.
 128 | nameID : Lexer
 129 | nameID = nameIDHead <+> many nameIDRest
 130 | 
 131 | minus : Lexer
 132 | minus = is '-'
 133 | 
 134 | fullStop : Lexer
 135 | fullStop = is '.'
 136 | 
 137 | -- \.[0-9]+
 138 | dotNum : Lexer
 139 | dotNum = fullStop <+> digits
 140 | 
 141 | -- \.[0-9]*
 142 | decimalNumRest : Lexer
 143 | decimalNumRest = fullStop <+> many digit
 144 | 
 145 | -- [0-9]+(\.[0-9]*)?
 146 | decimalNum : Lexer
 147 | decimalNum = digits <+> opt decimalNumRest
 148 | 
 149 | -- \.[0-9]+|[0-9]+(\.[0-9]*)?
 150 | numeral_helper : Lexer
 151 | numeral_helper =  dotNum
 152 |               <|> decimalNum
 153 | 
 154 | ||| A numeral:
 155 | ||| [-]?(\.[0-9]+|[0-9]+(\.[0-9]*)?)
 156 | numeralID : Lexer
 157 | numeralID = opt minus <+> numeral_helper
 158 | 
 159 | ||| Any double-quoted string possibly containing escaped quotes.
 160 | stringID : Lexer
 161 | stringID = stringLit
 162 | 
 163 | --  v  this is probably a library on its own...
 164 | ||| An HTML string
 165 | htmlID : Lexer
 166 | htmlID = ?todo_htmlIDs_are_not_implemented_atm
 167 | 
 168 | 
 169 | ------------------------------------------------------------------------
 170 | -- Edge operations
 171 | 
 172 | digraphEdgeOp : Lexer
 173 | digraphEdgeOp = exact "->"
 174 | 
 175 | graphEdgeOp : Lexer
 176 | graphEdgeOp = exact "--"
 177 | 
 178 | 
 179 | ------------------------------------------------------------------------
 180 | -- Compass Points
 181 | 
 182 | northCPt : Lexer
 183 | northCPt =        is 'n'
 184 | 
 185 | northEastCPt : Lexer
 186 | northEastCPt = exact "ne"
 187 | 
 188 | eastCPt : Lexer
 189 | eastCPt =         is 'e'
 190 | 
 191 | southEastCPt : Lexer
 192 | southEastCPt = exact "se"
 193 | 
 194 | southCPt : Lexer
 195 | southCPt =        is 'e'
 196 | 
 197 | southWestCPt : Lexer
 198 | southWestCPt = exact "sw"
 199 | 
 200 | westCPt : Lexer
 201 | westCPt =         is 'w'
 202 | 
 203 | northWestCPt : Lexer
 204 | northWestCPt = exact "nw"
 205 | 
 206 | centerCPt : Lexer
 207 | centerCPt =       is 'c'
 208 | 
 209 | -- The compass point "_" specifies that an appropriate side of the port adjacent
 210 | -- to the exterioior of the node should be used, if such exists. Otherwise, the
 211 | -- center is used. If no compass point is used with a portname, the default
 212 | -- value is "_".
 213 | underCPt : Lexer
 214 | underCPt =        is '_'
 215 | 
 216 | -- (n | ne | e | se | s | sw | w | nw | c | _)
 217 | compassPt : Lexer
 218 | compassPt = northCPt
 219 |          <|> northEastCPt
 220 |          <|> eastCPt
 221 |          <|> southEastCPt
 222 |          <|> southCPt
 223 |          <|> southWestCPt
 224 |          <|> westCPt
 225 |          <|> northWestCPt
 226 |          <|> centerCPt
 227 |          <|> underCPt
 228 | 
 229 | 
 230 | ------------------------------------------------------------------------
 231 | -- Comments
 232 | 
 233 | -- "a line beginning with a '#' character is considered a line output from a C
 234 | -- preprocessor and discarded"
 235 | cPreProcessorOutput : Lexer
 236 | cPreProcessorOutput = lineComment (is '#')
 237 | 
 238 | cppLineComment : Lexer
 239 | cppLineComment = lineComment (exact "//")
 240 | 
 241 | cppBlockComment : Lexer
 242 | cppBlockComment = blockComment (exact "/*") (exact "*/")
 243 | 
 244 | 
 245 | ||| The language supports C++-style comments `/* */` and `//`. Line starting
 246 | ||| with a '#' are considered pre-processer output and ignored.
 247 | comment : Lexer
 248 | comment =  cppLineComment
 249 |        <|> cppBlockComment
 250 |        <|> cPreProcessorOutput
 251 | 
 252 | 
 253 | ------------------------------------------------------------------------
 254 | -- Misc
 255 | 
 256 | -- DOT allows double-quoted strings to span multiple physical lines using the
 257 | -- standard C convention of a backslash immediately preceding a newline
 258 | -- character
 259 | multilineBackslash : Lexer
 260 | multilineBackslash = (is '\\') <+> newline
 261 | 
 262 | -- double-quoted strings can be concatenated using a '+' operator
 263 | strConcat : Lexer
 264 | strConcat = is '+'
 265 | 
 266 | -- "any amount of whitespace may be inserted [...]"
 267 | whitespace : Lexer
 268 | whitespace = spaces
 269 | 
 270 | lBracket : Lexer
 271 | lBracket = is '['
 272 | 
 273 | rBracket : Lexer
 274 | rBracket = is ']'
 275 | 
 276 | lBrace : Lexer
 277 | lBrace = is '{'
 278 | 
 279 | rBrace : Lexer
 280 | rBrace = is '}'
 281 | 
 282 | comma : Lexer
 283 | comma = is ','
 284 | 
 285 | semicolon : Lexer
 286 | semicolon = is ';'
 287 | 
 288 | colon : Lexer
 289 | colon = is ':'
 290 | 
 291 | equal : Lexer
 292 | equal = is '='
 293 | 
 294 | 
 295 | ------------------------------------------------------------------------
 296 | -- TOKEN MAP
 297 | 
 298 | ||| A mapping from the @Lexer@s to a function of type String -> @DOTToken@
 299 | dotTokenMap : TokenMap DOTToken
 300 | dotTokenMap = [ (keyword,             \str => Keyword (toLower str))
 301 |               , (nameID,              \name => NameID name)
 302 |               , (numeralID,           \numeral => NumeralID numeral)
 303 |               , (stringID,            \str => StringID str)
 304 | --              , (htmlID,              \html => HTML_ID html)
 305 |               , (digraphEdgeOp,       const DiGrEdgeOp)
 306 |               , (graphEdgeOp,         const GrEdgeOp)
 307 |               , (compassPt,           \pt  => CompassPt pt)
 308 |               , (comment,             const Comment)
 309 |               , (multilineBackslash,  const MultilineBackslash)
 310 |               , (strConcat,           const StrConcat)
 311 |               , (whitespace,          const Whitespace)
 312 |               , (lBracket,            const LBracket)
 313 |               , (rBracket,            const RBracket)
 314 |               , (lBrace,              const LBrace)
 315 |               , (rBrace,              const RBrace)
 316 |               , (comma,               const Comma)
 317 |               , (semicolon,           const Semicolon)
 318 |               , (colon,               const Colon)
 319 |               , (equal,               const Equal)
 320 |               ]
 321 | 
 322 | -- comments and whitespace are irrelevant to the program
 323 | relevant : WithBounds DOTToken -> Bool
 324 | relevant (MkBounded Comment _ _)    = False
 325 | relevant (MkBounded Whitespace _ _) = False
 326 | relevant _                          = True
 327 | 
 328 | -- removes the irrelevant tokens
 329 | clean :  (List (WithBounds DOTToken), (Int, (Int, String)))
 330 |       -> (List (WithBounds DOTToken), (Int, (Int, String)))
 331 | clean (tokens, inputRemainder) = (filter relevant tokens, inputRemainder)
 332 | 
 333 | ||| Given a source file, return the token stream
 334 | export
 335 | lex : String -> (List (WithBounds DOTToken), (Int, (Int, String)))
 336 | lex fStr = clean $ lex dotTokenMap fStr
 337 | 
 338 |