aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/Markdown.hs
blob: 60ac40fd7fb61295622ed9775dfbaef340310ea5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
-- | Convert markdown to Pandoc document.
module Text.Pandoc.Readers.Markdown ( 
                                     readMarkdown 
                                    ) where

import Text.ParserCombinators.Pandoc
import Text.Pandoc.Definition
import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment )
import Text.Pandoc.Shared 
import Text.Pandoc.Readers.HTML ( rawHtmlInline, rawHtmlBlock, anyHtmlBlockTag, 
                                               anyHtmlInlineTag )
import Text.Pandoc.HtmlEntities ( decodeEntities )
import Text.Regex ( matchRegex, mkRegex )
import Text.ParserCombinators.Parsec

-- | Read markdown from an input string and return a Pandoc document.
readMarkdown :: ParserState -> String -> Pandoc
readMarkdown = readWith parseMarkdown

-- | Parse markdown string with default options and print result (for testing).
testString :: String -> IO ()
testString = testStringWith parseMarkdown 

--
-- Constants and data structure definitions
--

spaceChars = " \t"
endLineChars = "\n"
labelStart = '['
labelEnd = ']'
labelSep = ':'
srcStart = '('
srcEnd = ')'
imageStart = '!'
noteStart = '^'
codeStart = '`'
codeEnd = '`'
emphStart = '*'
emphEnd = '*'
emphStartAlt = '_'
emphEndAlt = '_'
autoLinkStart = '<'
autoLinkEnd = '>'
mathStart = '$'
mathEnd = '$'
bulletListMarkers = "*+-"
orderedListDelimiters = "."
escapeChar = '\\'
hruleChars = "*-_"
quoteChars = "'\""
atxHChar = '#'
titleOpeners = "\"'("
setextHChars = ['=','-']
blockQuoteChar = '>'
hyphenChar = '-'

-- treat these as potentially non-text when parsing inline:
specialChars = [escapeChar, labelStart, labelEnd, emphStart, emphEnd, emphStartAlt, 
                emphEndAlt, codeStart, codeEnd, autoLinkEnd, autoLinkStart, mathStart, 
                mathEnd, imageStart, noteStart, hyphenChar]

--
-- auxiliary functions
--

-- | Skip a single endline if there is one.
skipEndline = option Space endline

indentSpaces = do
  state <- getState
  let tabStop = stateTabStop state
  oneOfStrings [ "\t", (replicate tabStop ' ') ] <?> "indentation"

skipNonindentSpaces = do
  state <- getState
  let tabStop = stateTabStop state
  choice (map (\n -> (try (count n (char ' ')))) (reverse [0..(tabStop - 1)]))

--
-- document structure
--

titleLine = try (do
  char '%'
  skipSpaces
  line <- manyTill inline newline
  return line)

authorsLine = try (do
  char '%'
  skipSpaces
  authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;")
  newline
  return (map removeLeadingTrailingSpace authors))

dateLine = try (do
  char '%'
  skipSpaces
  date <- many (noneOf "\n")
  newline
  return (removeTrailingSpace date))

titleBlock = try (do
  title <- option [] titleLine
  author <- option [] authorsLine
  date <- option "" dateLine
  option "" blanklines
  return (title, author, date))

parseMarkdown = do
  updateState (\state -> state { stateParseRaw = True }) -- need to parse raw HTML
  (title, author, date) <- option ([],[],"") titleBlock
  blocks <- parseBlocks
  state <- getState
  let keys = reverse $ stateKeyBlocks state
  return (Pandoc (Meta title author date) (blocks ++ keys))

--
-- parsing blocks
--

parseBlocks = do
  result <- manyTill block eof
  return result

block = choice [ codeBlock, referenceKey, note, header, hrule, list, blockQuote, rawHtmlBlocks, 
                 rawLaTeXEnvironment, para, plain, blankBlock, nullBlock ] <?> "block"

--
-- header blocks
--

header = choice [ setextHeader, atxHeader ] <?> "header"

atxHeader = try (do
  lead <- many1 (char atxHChar)
  skipSpaces
  txt <- many1 (do {notFollowedBy' atxClosing; inline})
  atxClosing
  return (Header (length lead) (normalizeSpaces txt)))

atxClosing = try (do
  skipMany (char atxHChar)
  skipSpaces
  newline
  option "" blanklines)

setextHeader = choice (map (\x -> setextH x) (enumFromTo 1 (length setextHChars)))

setextH n = try (do
    txt <- many1 (do {notFollowedBy newline; inline})
    endline
    many1 (char (setextHChars !! (n-1)))
    skipSpaces
    newline
    option "" blanklines
    return (Header n (normalizeSpaces txt)))

--
-- hrule block
--

hruleWith chr = 
    try (do
           skipSpaces
           char chr
           skipSpaces
           char chr
           skipSpaces
           char chr
           skipMany (oneOf (chr:spaceChars))
           newline
           option "" blanklines
           return HorizontalRule)

hrule = choice (map hruleWith hruleChars) <?> "hrule"

--
-- code blocks
--

indentedLine = try (do
    indentSpaces
    result <- manyTill anyChar newline
    return (result ++ "\n"))

-- two or more indented lines, possibly separated by blank lines
indentedBlock = try (do 
  res1 <- indentedLine
  blanks <- many blankline 
  res2 <- choice [indentedBlock, indentedLine]
  return (res1 ++ blanks ++ res2))

codeBlock = do
    result <- choice [indentedBlock, indentedLine]
    option "" blanklines
    return (CodeBlock result)

--
-- note block
--

note = try (do
    (NoteRef ref) <- noteRef 
    skipSpaces
    raw <- sepBy (many (choice [nonEndline, 
                                (try (do {endline; notFollowedBy (char noteStart); return '\n'}))
                               ])) (try (do {newline; char noteStart; option ' ' (char ' ')}))
    newline
    blanklines
    -- parse the extracted block, which may contain various block elements:
    state <- getState
    let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState}) "block" ((joinWithSep "\n" raw) ++ "\n\n") of
                   Left err -> error $ "Raw block:\n" ++ show raw ++ "\nError:\n" ++ show err
                   Right result -> result
    return (Note ref parsed))

--
-- block quotes
--

emacsBoxQuote = try (do
    string ",----"
    manyTill anyChar newline
    raw <- manyTill (try (do{ char '|'; 
                              option ' ' (char ' '); 
                              result <- manyTill anyChar newline; 
                              return result})) 
                     (string "`----")
    manyTill anyChar newline
    option "" blanklines
    return raw)

emailBlockQuoteStart = try (do
  skipNonindentSpaces
  char blockQuoteChar
  option ' ' (char ' ')
  return "> ")

emailBlockQuote = try (do
    emailBlockQuoteStart
    raw <- sepBy (many (choice [nonEndline, 
                                (try (do{ endline; 
                                          notFollowedBy' emailBlockQuoteStart;
                                          return '\n'}))])) 
           (try (do {newline; emailBlockQuoteStart}))
    newline <|> (do{ eof; return '\n'})
    option "" blanklines
    return raw)

blockQuote = do 
    raw <- choice [ emailBlockQuote, emacsBoxQuote ]
    -- parse the extracted block, which may contain various block elements:
    state <- getState
    let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState}) "block" ((joinWithSep "\n" raw) ++ "\n\n") of
                   Left err -> error $ "Raw block:\n" ++ show raw ++ "\nError:\n" ++ show err
                   Right result -> result
    return (BlockQuote parsed)

--
-- list blocks
--

list = choice [ bulletList, orderedList ] <?> "list"

bulletListStart = 
    try (do
           option ' ' newline -- if preceded by a Plain block in a list context
           skipNonindentSpaces
           notFollowedBy' hrule  -- because hrules start out just like lists
           oneOf bulletListMarkers
           spaceChar
           skipSpaces)

orderedListStart = 
    try (do
           option ' ' newline -- if preceded by a Plain block in a list context
           skipNonindentSpaces
           many1 digit
           oneOf orderedListDelimiters
           oneOf spaceChars
           skipSpaces)

-- parse a line of a list item (start = parser for beginning of list item)
listLine start = try (do
  notFollowedBy' start
  notFollowedBy blankline
  notFollowedBy' (try (do{ indentSpaces; 
                           many (spaceChar);
                           choice [bulletListStart, orderedListStart]}))
  line <- manyTill anyChar newline
  return (line ++ "\n"))

-- parse raw text for one list item, excluding start marker and continuations
rawListItem start = 
    try (do
           start
           result <- many1 (listLine start)
           blanks <- many blankline
           return ((concat result) ++ blanks))

-- continuation of a list item - indented and separated by blankline 
-- or (in compact lists) endline.
-- note: nested lists are parsed as continuations
listContinuation start = 
    try (do
           followedBy' indentSpaces
           result <- many1 (listContinuationLine start)
           blanks <- many blankline
           return ((concat result) ++ blanks))

listContinuationLine start = try (do
    notFollowedBy blankline
    notFollowedBy' start
    option "" indentSpaces
    result <- manyTill anyChar newline
    return (result ++ "\n"))

listItem start = 
    try (do 
           first <- rawListItem start
           rest <- many (listContinuation start)
           -- parsing with ListItemState forces markers at beginning of lines to
           -- count as list item markers, even if not separated by blank space.
           -- see definition of "endline"
           state <- getState
           let parsed = case runParser parseBlocks (state {stateParserContext = ListItemState}) 
                        "block" raw of
                          Left err -> error $ "Raw block:\n" ++ raw ++ "\nError:\n" ++ show err
                          Right result -> result
                   where raw = concat (first:rest) 
           return parsed)

orderedList = 
    try (do
           items <- many1 (listItem orderedListStart)
           let items' = compactify items
           return (OrderedList items'))

bulletList = 
    try (do
           items <- many1 (listItem bulletListStart)
           let items' = compactify items
           return (BulletList items'))

--
-- paragraph block
--

para = try (do 
  result <- many1 inline
  newline
  choice [ (do{ followedBy' (oneOfStrings [">", ",----"]); return "" }), blanklines ]  
  let result' = normalizeSpaces result
  return (Para result'))

plain = do
  result <- many1 inline
  let result' = normalizeSpaces result
  return (Plain result')

-- 
-- raw html
--

rawHtmlBlocks = try (do
   htmlBlocks <- many1 rawHtmlBlock    
   let combined = concatMap (\(RawHtml str) -> str) htmlBlocks
   let combined' = if (last combined == '\n') then 
                       init combined  -- strip extra newline 
                   else 
                       combined 
   return (RawHtml combined'))

-- 
-- reference key
--

referenceKey = 
    try (do
           skipSpaces
           label <- reference
           char labelSep
           skipSpaces
           option ' ' (char autoLinkStart)
           src <- many (noneOf (titleOpeners ++ [autoLinkEnd] ++ endLineChars))
           option ' ' (char autoLinkEnd)
           tit <- option "" title 
           blanklines 
           return (Key label (Src (removeTrailingSpace src) tit))) 

-- 
-- inline
--

text = choice [ math, strong, emph, code2, code1, str, linebreak, tabchar, 
                whitespace, endline ] <?> "text"

inline = choice [ rawLaTeXInline, escapedChar, special, hyphens, text, ltSign, symbol ] <?> "inline"

special = choice [ link, referenceLink, rawHtmlInline, autoLink, 
                   image, noteRef ] <?> "link, inline html, note, or image"

escapedChar = escaped anyChar

ltSign = do
  notFollowedBy' rawHtmlBlocks -- don't return < if it starts html
  char '<'
  return (Str ['<'])

specialCharsMinusLt = filter (/= '<') specialChars

symbol = do 
  result <- oneOf specialCharsMinusLt
  return (Str [result])

hyphens = try (do
  result <- many1 (char '-')
  if (length result) == 1 then
      skipEndline   -- don't want to treat endline after hyphen as a space
    else
      do{ string ""; return Space }
  return (Str result))

-- parses inline code, between codeStart and codeEnd
code1 = 
    try (do 
           char codeStart
           result <- many (noneOf [codeEnd])
           char codeEnd
           let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result -- get rid of any internal newlines
           return (Code result'))

-- parses inline code, between 2 codeStarts and 2 codeEnds
code2 = 
    try (do
           string [codeStart, codeStart]
           result <- manyTill anyChar (try (string [codeEnd, codeEnd]))
           let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result -- get rid of any internal newlines
           return (Code result'))

mathWord = many1 (choice [(noneOf (" \t\n\\" ++ [mathEnd])), (try (do {c <- char '\\'; notFollowedBy (char mathEnd); return c}))])

math = try (do
  char mathStart
  notFollowedBy space
  words <- sepBy1 mathWord (many1 space)
  char mathEnd
  return (TeX ("$" ++ (joinWithSep " " words) ++ "$")))

emph = do
  result <- choice [ (enclosed (char emphStart) (char emphEnd) inline), 
                      (enclosed (char emphStartAlt) (char emphEndAlt) inline) ]
  return (Emph (normalizeSpaces result))

strong = do
  result <- choice [ (enclosed (count 2 (char emphStart)) (count 2 (char emphEnd)) inline), 
                     (enclosed (count 2 (char emphStartAlt)) (count 2 (char emphEndAlt)) inline)]
  return (Strong (normalizeSpaces result))

whitespace = do
  many1 (oneOf spaceChars) <?> "whitespace"
  return Space

tabchar = do
  tab
  return (Str "\t")

-- hard line break
linebreak = try (do
  oneOf spaceChars
  many1 (oneOf spaceChars) 
  endline
  return LineBreak )

nonEndline = noneOf endLineChars

str = do 
  result <- many1 ((noneOf (specialChars ++ spaceChars ++ endLineChars))) 
  return (Str (decodeEntities result))

-- an endline character that can be treated as a space, not a structural break
endline =
    try (do
           newline
           -- next line would allow block quotes without preceding blank line
           -- Markdown.pl does allow this, but there's a chance of a wrapped
           -- greater-than sign triggering a block quote by accident...
--         notFollowedBy (try (do { choice [emailBlockQuoteStart, string ",----"]; return ' ' }))  
           notFollowedBy blankline
           -- parse potential list starts at beginning of line differently if in a list:
           st <- getState
           if (stateParserContext st) == ListItemState then 
               do
                 notFollowedBy' orderedListStart
                 notFollowedBy' bulletListStart
             else
               option () pzero
           return Space)

--
-- links
--

-- a reference label for a link
reference = do
  char labelStart
  label <- manyTill inline (char labelEnd)
  return (normalizeSpaces label)

-- source for a link, with optional title
source = 
    try (do 
           char srcStart
           option ' ' (char autoLinkStart)
           src <- many (noneOf ([srcEnd, autoLinkEnd] ++ titleOpeners))
           option ' ' (char autoLinkEnd)
           tit <- option "" title
           skipSpaces
           char srcEnd
           return (Src (removeTrailingSpace src) tit))

titleWith startChar endChar =
    try (do
           skipSpaces
           skipEndline  -- a title can be on the next line from the source
           skipSpaces
           char startChar
           tit <- manyTill (choice [ try (do {char '\\'; char endChar}), 
                                     (noneOf (endChar:endLineChars)) ]) (char endChar) 
           let tit' = gsub "\"" "&quot;" tit
           return tit')

title = choice [titleWith '(' ')', titleWith '"' '"', titleWith '\'' '\''] <?> "title"

link = choice [explicitLink, referenceLink] <?> "link"

explicitLink = 
    try (do
           label <- reference
           src <- source 
           return (Link label src)) 

referenceLink = choice [referenceLinkDouble, referenceLinkSingle]

referenceLinkDouble =     -- a link like [this][/url/]
    try (do
           label <- reference
           skipSpaces
           skipEndline
           skipSpaces
           ref <- reference 
           return (Link label (Ref ref))) 

referenceLinkSingle =     -- a link like [this]
    try (do
           label <- reference
           return (Link label (Ref []))) 

autoLink =                -- a link <like.this.com>
    try (do
           notFollowedBy (do {anyHtmlBlockTag; return ' '})
           src <- between (char autoLinkStart) (char autoLinkEnd) 
                  (many (noneOf (spaceChars ++ endLineChars ++ [autoLinkEnd])))
           case (matchRegex emailAddress src) of
             Just _  -> return (Link [Str src] (Src ("mailto:" ++ src) ""))
             Nothing -> return (Link [Str src] (Src src ""))) 

emailAddress = mkRegex "([^@:/]+)@(([^.]+[.]?)*([^.]+))"  -- presupposes no whitespace

image = 
    try (do
           char imageStart
           (Link label src) <- link
           return (Image label src)) 

noteRef = try (do
    char noteStart
    ref <- between (char '(') (char ')') (many1 (noneOf " \t\n)"))
    return (NoteRef ref))