diff options
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 24 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 154 |
2 files changed, 140 insertions, 38 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 2bf75654c..9beaaacff 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -32,7 +32,12 @@ module Text.Pandoc.Readers.HTML ( rawHtmlInline, rawHtmlBlock, anyHtmlBlockTag, - anyHtmlInlineTag + anyHtmlInlineTag, + anyHtmlTag, + anyHtmlEndTag, + htmlEndTag, + extractTagType, + htmlBlockElement ) where import Text.Regex ( matchRegex, mkRegex ) @@ -78,17 +83,18 @@ inlinesTilEnd tag = try (do inlines <- manyTill inline (htmlEndTag tag) return inlines) --- extract type from a tag: e.g. br from <br>, < br >, </br>, etc. +-- | Extract type from a tag: e.g. 'br' from '<br>' extractTagType tag = case (matchRegex (mkRegex "<[[:space:]]*/?([A-Za-z0-9]+)") tag) of Just [match] -> (map toLower match) Nothing -> "" +-- | Parse any HTML tag (closing or opening) and return text of tag anyHtmlTag = try (do char '<' spaces tag <- many1 alphaNum - attribs <- htmlAttributes + attribs <- htmlAttributes spaces ender <- option "" (string "/") let ender' = if (null ender) then "" else " /" @@ -150,9 +156,10 @@ htmlRegularAttribute = try (do (do a <- many (alphaNum <|> (oneOf "-._:")) return (a,"")) ] - return (name, content, + return (name, content, (" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr))) +-- | Parse an end tag of type 'tag' htmlEndTag tag = try (do char '<' spaces @@ -174,20 +181,23 @@ anyHtmlInlineTag = try (do tag <- choice [ anyHtmlTag, anyHtmlEndTag ] if isInline tag then return tag else fail "not an inline tag") --- scripts must be treated differently, because they can contain <> etc. +-- | Parses material between script tags. +-- Scripts must be treated differently, because they can contain '<>' etc. htmlScript = try (do open <- string "<script" rest <- manyTill anyChar (htmlEndTag "script") return (open ++ rest ++ "</script>")) +htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ] + rawHtmlBlock = try (do notFollowedBy' (choice [htmlTag "/body", htmlTag "/html"]) - body <- choice [htmlScript, anyHtmlBlockTag, htmlComment, xmlDec, - definition] + body <- htmlBlockElement <|> anyHtmlBlockTag sp <- (many space) state <- getState if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null) +-- | Parses an HTML comment. htmlComment = try (do string "<!--" comment <- manyTill anyChar (try (string "-->")) diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 2556c0aac..0d58dd87f 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -36,8 +36,11 @@ import Text.ParserCombinators.Pandoc import Text.Pandoc.Definition import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment ) import Text.Pandoc.Shared -import Text.Pandoc.Readers.HTML ( rawHtmlInline, rawHtmlBlock, - anyHtmlBlockTag, anyHtmlInlineTag ) +import Text.Pandoc.Readers.HTML ( rawHtmlBlock, + anyHtmlBlockTag, anyHtmlInlineTag, + anyHtmlTag, anyHtmlEndTag, + htmlEndTag, extractTagType, + htmlBlockElement ) import Text.Pandoc.HtmlEntities ( decodeEntities ) import Text.Regex ( matchRegex, mkRegex ) import Text.ParserCombinators.Parsec @@ -107,6 +110,16 @@ skipNonindentSpaces = do let tabStop = stateTabStop state choice (map (\n -> (try (count n (char ' ')))) (reverse [0..(tabStop - 1)])) +-- | Fail if reader is in strict markdown syntax mode. +failIfStrict = do + state <- getState + if stateStrict state then fail "Strict markdown mode" else return () + +-- | Fail unless we're at beginning of a line. +failUnlessBeginningOfLine = do + pos <- getPosition + if sourceColumn pos == 1 then return () else fail "not beginning of line" + -- -- document structure -- @@ -132,6 +145,7 @@ dateLine = try (do return (removeTrailingSpace date)) titleBlock = try (do + failIfStrict title <- option [] titleLine author <- option [] authorsLine date <- option "" dateLine @@ -147,7 +161,14 @@ parseMarkdown = do updateState (\state -> state { stateParseRaw = True }) -- need to parse raw HTML, since markdown allows it (title, author, date) <- option ([],[],"") titleBlock - blocks <- parseBlocks + oldState <- getState + oldInput <- getInput + parseBlocks -- go through once just to get list of reference keys + newState <- getState + let keysUsed = stateKeysUsed newState + setInput oldInput + setState (oldState { stateKeysUsed = keysUsed }) + blocks <- parseBlocks -- go through again, for real let blocks' = filter (/= Null) blocks state <- getState let keys = reverse $ stateKeyBlocks state @@ -165,7 +186,7 @@ parseBlocks = do return result block = choice [ codeBlock, note, referenceKey, header, hrule, list, - blockQuote, rawHtmlBlocks, rawLaTeXEnvironment, para, + blockQuote, htmlBlock, rawLaTeXEnvironment', para, plain, blankBlock, nullBlock ] <?> "block" -- @@ -190,8 +211,7 @@ setextHeader = choice $ map (\x -> setextH x) (enumFromTo 1 (length setextHChars)) setextH n = try (do - txt <- many1 (do {notFollowedBy newline; inline}) - endline + txt <- many1Till inline newline many1 (char (setextHChars !! (n-1))) skipSpaces newline @@ -256,6 +276,7 @@ rawLines = do return (concat lines) note = try (do + failIfStrict ref <- noteMarker char ':' skipSpaces @@ -280,6 +301,7 @@ note = try (do -- emacsBoxQuote = try (do + failIfStrict string ",----" manyTill anyChar newline raw <- manyTill (try (do @@ -336,8 +358,9 @@ bulletListStart = try (do orderedListStart = try (do option ' ' newline -- if preceded by a Plain block in a list context skipNonindentSpaces - many1 digit <|> count 1 letter - oneOf orderedListDelimiters + many1 digit <|> (do{failIfStrict; count 1 letter}) + delim <- oneOf orderedListDelimiters + if delim /= '.' then failIfStrict else return () oneOf spaceChars skipSpaces) @@ -410,10 +433,12 @@ bulletList = try (do para = try (do result <- many1 inline newline - choice [ (do - followedBy' (oneOfStrings [">", ",----"]) - return "" ), - blanklines ] + st <- getState + if stateStrict st + then choice [followedBy' blockQuote, followedBy' header, + (do{blanklines; return ()})] + else choice [followedBy' emacsBoxQuote, + (do{blanklines; return ()})] let result' = normalizeSpaces result return (Para result')) @@ -426,6 +451,36 @@ plain = do -- raw html -- +htmlElement = choice [strictHtmlBlock, + htmlBlockElement] <?> "html element" + +htmlBlock = do + st <- getState + if stateStrict st + then do + failUnlessBeginningOfLine + first <- htmlElement + finalSpace <- many (oneOf spaceChars) + finalNewlines <- many newline + return (RawHtml (first ++ finalSpace ++ finalNewlines)) + else rawHtmlBlocks + +-- True if tag is self-closing +selfClosing tag = case (matchRegex (mkRegex "\\/[[:space:]]*>$") tag) of + Just _ -> True + Nothing -> False + +strictHtmlBlock = try (do + tag <- anyHtmlBlockTag + let tag' = extractTagType tag + if selfClosing tag || tag' == "hr" + then return tag + else do + contents <- many (do{notFollowedBy' (htmlEndTag tag'); + htmlElement <|> (count 1 anyChar)}) + end <- htmlEndTag tag' + return $ tag ++ (concat contents) ++ end) + rawHtmlBlocks = try (do htmlBlocks <- many1 rawHtmlBlock let combined = concatMap (\(RawHtml str) -> str) htmlBlocks @@ -448,7 +503,18 @@ referenceKey = try (do option ' ' (char autoLinkEnd) tit <- option "" title blanklines - return (Key label (Src (removeTrailingSpace src) tit))) + state <- getState + let keysUsed = stateKeysUsed state + updateState (\st -> st { stateKeysUsed = (label:keysUsed) }) + return $ Key label (Src (removeTrailingSpace src) tit)) + +-- +-- LaTeX +-- + +rawLaTeXEnvironment' = do + failIfStrict + rawLaTeXEnvironment -- -- inline @@ -457,10 +523,10 @@ referenceKey = try (do text = choice [ math, strong, emph, code2, code1, str, linebreak, tabchar, whitespace, endline ] <?> "text" -inline = choice [ rawLaTeXInline, escapedChar, special, hyphens, text, +inline = choice [ rawLaTeXInline', escapedChar, special, hyphens, text, ltSign, symbol ] <?> "inline" -special = choice [ noteRef, inlineNote, link, referenceLink, rawHtmlInline, +special = choice [ noteRef, inlineNote, link, referenceLink, rawHtmlInline', autoLink, image ] <?> "link, inline html, note, or image" escapedChar = escaped anyChar @@ -507,6 +573,7 @@ mathWord = many1 (choice [ (noneOf (" \t\n\\" ++ [mathEnd])), return c))]) math = try (do + failIfStrict char mathStart notFollowedBy space words <- sepBy1 mathWord (many1 space) @@ -549,18 +616,17 @@ str = do -- an endline character that can be treated as a space, not a structural break endline = try (do newline - -- next line would allow block quotes without preceding blank line - -- Markdown.pl does allow this, but there's a chance of a wrapped - -- greater-than sign triggering a block quote by accident... - -- notFollowedBy' (choice [emailBlockQuoteStart, string ",----"]) notFollowedBy blankline - -- parse potential list-starts differently if in a list: st <- getState + if stateStrict st + then do + notFollowedBy' emailBlockQuoteStart + notFollowedBy' header + else return () + -- parse potential list-starts differently if in a list: if (stateParserContext st) == ListItemState - then do - notFollowedBy' orderedListStart - notFollowedBy' bulletListStart - else option () pzero + then notFollowedBy' (orderedListStart <|> bulletListStart) + else return () return Space) -- @@ -571,8 +637,12 @@ endline = try (do reference = do char labelStart notFollowedBy (char noteStart) - label <- manyTill inline (char labelEnd) - return (normalizeSpaces label) + -- allow for embedded brackets: + label <- manyTill ((do{res <- reference; + return $ [Str "["] ++ res ++ [Str "]"]}) <|> + count 1 inline) + (char labelEnd) + return (normalizeSpaces (concat label)) -- source for a link, with optional title source = try (do @@ -590,8 +660,10 @@ titleWith startChar endChar = try (do skipEndline -- a title can be on the next line from the source skipSpaces char startChar - tit <- manyTill (choice [ try (do {char '\\'; char endChar}), - (noneOf (endChar:endLineChars)) ]) (char endChar) + tit <- manyTill anyChar (try (do + char endChar + skipSpaces + followedBy' (char ')' <|> newline))) let tit' = gsub "\"" """ tit return tit') @@ -608,19 +680,26 @@ explicitLink = try (do referenceLink = choice [referenceLinkDouble, referenceLinkSingle] --- a link like [this][/url/] +-- a link like [this][ref] referenceLinkDouble = try (do label <- reference skipSpaces skipEndline skipSpaces ref <- reference - return (Link label (Ref ref))) + let ref' = if null ref then label else ref + state <- getState + if ref' `elem` (stateKeysUsed state) + then return () else fail "no corresponding key" + return (Link label (Ref ref'))) -- a link like [this] referenceLinkSingle = try (do label <- reference - return (Link label (Ref []))) + state <- getState + if label `elem` (stateKeysUsed state) + then return () else fail "no corresponding key" + return (Link label (Ref label))) -- a link <like.this.com> autoLink = try (do @@ -645,6 +724,7 @@ noteMarker = try (do manyTill (noneOf " \t\n") (char labelEnd)) noteRef = try (do + failIfStrict ref <- noteMarker state <- getState let identifiers = (stateNoteIdentifiers state) ++ [ref] @@ -652,6 +732,7 @@ noteRef = try (do return (NoteRef (show (length identifiers)))) inlineNote = try (do + failIfStrict char noteStart char labelStart contents <- manyTill inline (char labelEnd) @@ -664,3 +745,14 @@ inlineNote = try (do (Note ref [Para contents]):noteBlocks}) return (NoteRef ref)) +rawLaTeXInline' = do + failIfStrict + rawLaTeXInline + +rawHtmlInline' = do + st <- getState + result <- if stateStrict st + then choice [htmlBlockElement, anyHtmlTag, anyHtmlEndTag] + else choice [htmlBlockElement, anyHtmlInlineTag] + return (HtmlInline result) + |