From 4ea1b2bdc0a86f135bae4ae95cfc3d45a9416604 Mon Sep 17 00:00:00 2001 From: fiddlosopher Date: Sat, 30 Dec 2006 22:51:49 +0000 Subject: Merged 'strict' branch from r324. This adds a '--strict' option to pandoc, which forces it to stay as close as possible to official Markdown syntax. git-svn-id: https://pandoc.googlecode.com/svn/trunk@347 788f1e2b-df1e-0410-8736-df70ead52e1b --- src/Text/Pandoc/Readers/HTML.hs | 24 ++++-- src/Text/Pandoc/Readers/Markdown.hs | 154 ++++++++++++++++++++++++++++-------- src/Text/Pandoc/Shared.hs | 11 ++- src/Text/Pandoc/Writers/HTML.hs | 19 +++-- src/Text/Pandoc/Writers/LaTeX.hs | 4 - src/Text/Pandoc/Writers/Markdown.hs | 14 ++-- src/Text/Pandoc/Writers/RST.hs | 6 -- src/Text/Pandoc/Writers/RTF.hs | 4 - 8 files changed, 162 insertions(+), 74 deletions(-) (limited to 'src/Text/Pandoc') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 2bf75654c..9beaaacff 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -32,7 +32,12 @@ module Text.Pandoc.Readers.HTML ( rawHtmlInline, rawHtmlBlock, anyHtmlBlockTag, - anyHtmlInlineTag + anyHtmlInlineTag, + anyHtmlTag, + anyHtmlEndTag, + htmlEndTag, + extractTagType, + htmlBlockElement ) where import Text.Regex ( matchRegex, mkRegex ) @@ -78,17 +83,18 @@ inlinesTilEnd tag = try (do inlines <- manyTill inline (htmlEndTag tag) return inlines) --- extract type from a tag: e.g. br from
, < br >,
, etc. +-- | Extract type from a tag: e.g. 'br' from '
' extractTagType tag = case (matchRegex (mkRegex "<[[:space:]]*/?([A-Za-z0-9]+)") tag) of Just [match] -> (map toLower match) Nothing -> "" +-- | Parse any HTML tag (closing or opening) and return text of tag anyHtmlTag = try (do char '<' spaces tag <- many1 alphaNum - attribs <- htmlAttributes + attribs <- htmlAttributes spaces ender <- option "" (string "/") let ender' = if (null ender) then "" else " /" @@ -150,9 +156,10 @@ htmlRegularAttribute = try (do (do a <- many (alphaNum <|> (oneOf "-._:")) return (a,"")) ] - return (name, content, + return (name, content, (" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr))) +-- | Parse an end tag of type 'tag' htmlEndTag tag = try (do char '<' spaces @@ -174,20 +181,23 @@ anyHtmlInlineTag = try (do tag <- choice [ anyHtmlTag, anyHtmlEndTag ] if isInline tag then return tag else fail "not an inline tag") --- scripts must be treated differently, because they can contain <> etc. +-- | Parses material between script tags. +-- Scripts must be treated differently, because they can contain '<>' etc. htmlScript = try (do open <- string "")) +htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ] + rawHtmlBlock = try (do notFollowedBy' (choice [htmlTag "/body", htmlTag "/html"]) - body <- choice [htmlScript, anyHtmlBlockTag, htmlComment, xmlDec, - definition] + body <- htmlBlockElement <|> anyHtmlBlockTag sp <- (many space) state <- getState if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null) +-- | Parses an HTML comment. htmlComment = try (do string "")) diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 2556c0aac..0d58dd87f 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -36,8 +36,11 @@ import Text.ParserCombinators.Pandoc import Text.Pandoc.Definition import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment ) import Text.Pandoc.Shared -import Text.Pandoc.Readers.HTML ( rawHtmlInline, rawHtmlBlock, - anyHtmlBlockTag, anyHtmlInlineTag ) +import Text.Pandoc.Readers.HTML ( rawHtmlBlock, + anyHtmlBlockTag, anyHtmlInlineTag, + anyHtmlTag, anyHtmlEndTag, + htmlEndTag, extractTagType, + htmlBlockElement ) import Text.Pandoc.HtmlEntities ( decodeEntities ) import Text.Regex ( matchRegex, mkRegex ) import Text.ParserCombinators.Parsec @@ -107,6 +110,16 @@ skipNonindentSpaces = do let tabStop = stateTabStop state choice (map (\n -> (try (count n (char ' ')))) (reverse [0..(tabStop - 1)])) +-- | Fail if reader is in strict markdown syntax mode. +failIfStrict = do + state <- getState + if stateStrict state then fail "Strict markdown mode" else return () + +-- | Fail unless we're at beginning of a line. +failUnlessBeginningOfLine = do + pos <- getPosition + if sourceColumn pos == 1 then return () else fail "not beginning of line" + -- -- document structure -- @@ -132,6 +145,7 @@ dateLine = try (do return (removeTrailingSpace date)) titleBlock = try (do + failIfStrict title <- option [] titleLine author <- option [] authorsLine date <- option "" dateLine @@ -147,7 +161,14 @@ parseMarkdown = do updateState (\state -> state { stateParseRaw = True }) -- need to parse raw HTML, since markdown allows it (title, author, date) <- option ([],[],"") titleBlock - blocks <- parseBlocks + oldState <- getState + oldInput <- getInput + parseBlocks -- go through once just to get list of reference keys + newState <- getState + let keysUsed = stateKeysUsed newState + setInput oldInput + setState (oldState { stateKeysUsed = keysUsed }) + blocks <- parseBlocks -- go through again, for real let blocks' = filter (/= Null) blocks state <- getState let keys = reverse $ stateKeyBlocks state @@ -165,7 +186,7 @@ parseBlocks = do return result block = choice [ codeBlock, note, referenceKey, header, hrule, list, - blockQuote, rawHtmlBlocks, rawLaTeXEnvironment, para, + blockQuote, htmlBlock, rawLaTeXEnvironment', para, plain, blankBlock, nullBlock ] "block" -- @@ -190,8 +211,7 @@ setextHeader = choice $ map (\x -> setextH x) (enumFromTo 1 (length setextHChars)) setextH n = try (do - txt <- many1 (do {notFollowedBy newline; inline}) - endline + txt <- many1Till inline newline many1 (char (setextHChars !! (n-1))) skipSpaces newline @@ -256,6 +276,7 @@ rawLines = do return (concat lines) note = try (do + failIfStrict ref <- noteMarker char ':' skipSpaces @@ -280,6 +301,7 @@ note = try (do -- emacsBoxQuote = try (do + failIfStrict string ",----" manyTill anyChar newline raw <- manyTill (try (do @@ -336,8 +358,9 @@ bulletListStart = try (do orderedListStart = try (do option ' ' newline -- if preceded by a Plain block in a list context skipNonindentSpaces - many1 digit <|> count 1 letter - oneOf orderedListDelimiters + many1 digit <|> (do{failIfStrict; count 1 letter}) + delim <- oneOf orderedListDelimiters + if delim /= '.' then failIfStrict else return () oneOf spaceChars skipSpaces) @@ -410,10 +433,12 @@ bulletList = try (do para = try (do result <- many1 inline newline - choice [ (do - followedBy' (oneOfStrings [">", ",----"]) - return "" ), - blanklines ] + st <- getState + if stateStrict st + then choice [followedBy' blockQuote, followedBy' header, + (do{blanklines; return ()})] + else choice [followedBy' emacsBoxQuote, + (do{blanklines; return ()})] let result' = normalizeSpaces result return (Para result')) @@ -426,6 +451,36 @@ plain = do -- raw html -- +htmlElement = choice [strictHtmlBlock, + htmlBlockElement] "html element" + +htmlBlock = do + st <- getState + if stateStrict st + then do + failUnlessBeginningOfLine + first <- htmlElement + finalSpace <- many (oneOf spaceChars) + finalNewlines <- many newline + return (RawHtml (first ++ finalSpace ++ finalNewlines)) + else rawHtmlBlocks + +-- True if tag is self-closing +selfClosing tag = case (matchRegex (mkRegex "\\/[[:space:]]*>$") tag) of + Just _ -> True + Nothing -> False + +strictHtmlBlock = try (do + tag <- anyHtmlBlockTag + let tag' = extractTagType tag + if selfClosing tag || tag' == "hr" + then return tag + else do + contents <- many (do{notFollowedBy' (htmlEndTag tag'); + htmlElement <|> (count 1 anyChar)}) + end <- htmlEndTag tag' + return $ tag ++ (concat contents) ++ end) + rawHtmlBlocks = try (do htmlBlocks <- many1 rawHtmlBlock let combined = concatMap (\(RawHtml str) -> str) htmlBlocks @@ -448,7 +503,18 @@ referenceKey = try (do option ' ' (char autoLinkEnd) tit <- option "" title blanklines - return (Key label (Src (removeTrailingSpace src) tit))) + state <- getState + let keysUsed = stateKeysUsed state + updateState (\st -> st { stateKeysUsed = (label:keysUsed) }) + return $ Key label (Src (removeTrailingSpace src) tit)) + +-- +-- LaTeX +-- + +rawLaTeXEnvironment' = do + failIfStrict + rawLaTeXEnvironment -- -- inline @@ -457,10 +523,10 @@ referenceKey = try (do text = choice [ math, strong, emph, code2, code1, str, linebreak, tabchar, whitespace, endline ] "text" -inline = choice [ rawLaTeXInline, escapedChar, special, hyphens, text, +inline = choice [ rawLaTeXInline', escapedChar, special, hyphens, text, ltSign, symbol ] "inline" -special = choice [ noteRef, inlineNote, link, referenceLink, rawHtmlInline, +special = choice [ noteRef, inlineNote, link, referenceLink, rawHtmlInline', autoLink, image ] "link, inline html, note, or image" escapedChar = escaped anyChar @@ -507,6 +573,7 @@ mathWord = many1 (choice [ (noneOf (" \t\n\\" ++ [mathEnd])), return c))]) math = try (do + failIfStrict char mathStart notFollowedBy space words <- sepBy1 mathWord (many1 space) @@ -549,18 +616,17 @@ str = do -- an endline character that can be treated as a space, not a structural break endline = try (do newline - -- next line would allow block quotes without preceding blank line - -- Markdown.pl does allow this, but there's a chance of a wrapped - -- greater-than sign triggering a block quote by accident... - -- notFollowedBy' (choice [emailBlockQuoteStart, string ",----"]) notFollowedBy blankline - -- parse potential list-starts differently if in a list: st <- getState + if stateStrict st + then do + notFollowedBy' emailBlockQuoteStart + notFollowedBy' header + else return () + -- parse potential list-starts differently if in a list: if (stateParserContext st) == ListItemState - then do - notFollowedBy' orderedListStart - notFollowedBy' bulletListStart - else option () pzero + then notFollowedBy' (orderedListStart <|> bulletListStart) + else return () return Space) -- @@ -571,8 +637,12 @@ endline = try (do reference = do char labelStart notFollowedBy (char noteStart) - label <- manyTill inline (char labelEnd) - return (normalizeSpaces label) + -- allow for embedded brackets: + label <- manyTill ((do{res <- reference; + return $ [Str "["] ++ res ++ [Str "]"]}) <|> + count 1 inline) + (char labelEnd) + return (normalizeSpaces (concat label)) -- source for a link, with optional title source = try (do @@ -590,8 +660,10 @@ titleWith startChar endChar = try (do skipEndline -- a title can be on the next line from the source skipSpaces char startChar - tit <- manyTill (choice [ try (do {char '\\'; char endChar}), - (noneOf (endChar:endLineChars)) ]) (char endChar) + tit <- manyTill anyChar (try (do + char endChar + skipSpaces + followedBy' (char ')' <|> newline))) let tit' = gsub "\"" """ tit return tit') @@ -608,19 +680,26 @@ explicitLink = try (do referenceLink = choice [referenceLinkDouble, referenceLinkSingle] --- a link like [this][/url/] +-- a link like [this][ref] referenceLinkDouble = try (do label <- reference skipSpaces skipEndline skipSpaces ref <- reference - return (Link label (Ref ref))) + let ref' = if null ref then label else ref + state <- getState + if ref' `elem` (stateKeysUsed state) + then return () else fail "no corresponding key" + return (Link label (Ref ref'))) -- a link like [this] referenceLinkSingle = try (do label <- reference - return (Link label (Ref []))) + state <- getState + if label `elem` (stateKeysUsed state) + then return () else fail "no corresponding key" + return (Link label (Ref label))) -- a link autoLink = try (do @@ -645,6 +724,7 @@ noteMarker = try (do manyTill (noneOf " \t\n") (char labelEnd)) noteRef = try (do + failIfStrict ref <- noteMarker state <- getState let identifiers = (stateNoteIdentifiers state) ++ [ref] @@ -652,6 +732,7 @@ noteRef = try (do return (NoteRef (show (length identifiers)))) inlineNote = try (do + failIfStrict char noteStart char labelStart contents <- manyTill inline (char labelEnd) @@ -664,3 +745,14 @@ inlineNote = try (do (Note ref [Para contents]):noteBlocks}) return (NoteRef ref)) +rawLaTeXInline' = do + failIfStrict + rawLaTeXInline + +rawHtmlInline' = do + st <- getState + result <- if stateStrict st + then choice [htmlBlockElement, anyHtmlTag, anyHtmlEndTag] + else choice [htmlBlockElement, anyHtmlInlineTag] + return (HtmlInline result) + diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 8418ecffd..7e4f63ffa 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -114,6 +114,7 @@ data ParserState = ParserState stateTitle :: [Inline], -- ^ Title of document stateAuthors :: [String], -- ^ Authors of document stateDate :: String, -- ^ Date of document + stateStrict :: Bool, -- ^ Use strict markdown syntax stateHeaderTable :: [HeaderType] -- ^ List of header types used, -- in what order (rst only) } @@ -132,6 +133,7 @@ defaultParserState = stateTitle = [], stateAuthors = [], stateDate = [], + stateStrict = False, stateHeaderTable = [] } -- | Consolidate @Str@s and @Space@s in an inline list into one big @Str@. @@ -325,10 +327,11 @@ data WriterOptions = WriterOptions , writerHeader :: String -- ^ Header for the document , writerIncludeBefore :: String -- ^ String to include before the body , writerIncludeAfter :: String -- ^ String to include after the body - , writerSmart :: Bool -- ^ If @True@, use smart typography - , writerS5 :: Bool -- ^ @True@ if we're writing S5 - , writerIncremental :: Bool -- ^ If @True@, inceremental S5 lists - , writerNumberSections :: Bool -- ^ If @True@, number sections in LaTeX + , writerSmart :: Bool -- ^ Use smart typography + , writerS5 :: Bool -- ^ We're writing S5 + , writerIncremental :: Bool -- ^ Incremental S5 lists + , writerNumberSections :: Bool -- ^ Number sections in LaTeX + , writerStrictMarkdown :: Bool -- ^ Use strict markdown syntax , writerTabStop :: Int -- ^ Tabstop for conversion between -- spaces and tabs } deriving Show diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index effede04c..4456a61b5 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -91,12 +91,15 @@ obfuscateLink options text src = then name ++ " at " ++ domain' else text' ++ " (" ++ name ++ " at " ++ domain' ++ ")" in - "" + if writerStrictMarkdown options + then "" ++ + obfuscateString text' ++ "" + else "" _ -> "" ++ text' ++ "" -- malformed email -- | Obfuscate character as entity. @@ -264,8 +267,6 @@ inlineToHtml options (Link text (Src src tit)) = else "" else ">") ++ (inlineListToHtml options text) ++ "" -inlineToHtml options (Link text (Ref [])) = - "[" ++ (inlineListToHtml options text) ++ "]" inlineToHtml options (Link text (Ref ref)) = "[" ++ (inlineListToHtml options text) ++ "][" ++ (inlineListToHtml options ref) ++ "]" @@ -276,8 +277,6 @@ inlineToHtml options (Image alt (Src source tit)) = "\""" -inlineToHtml options (Image alternate (Ref [])) = - "![" ++ (inlineListToHtml options alternate) ++ "]" inlineToHtml options (Image alternate (Ref ref)) = "![" ++ (inlineListToHtml options alternate) ++ "][" ++ (inlineListToHtml options ref) ++ "]" diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs index cb8e13305..e34b7b61e 100644 --- a/src/Text/Pandoc/Writers/LaTeX.hs +++ b/src/Text/Pandoc/Writers/LaTeX.hs @@ -180,15 +180,11 @@ inlineToLaTeX notes (LineBreak) = "\\\\\n" inlineToLaTeX notes Space = " " inlineToLaTeX notes (Link text (Src src tit)) = "\\href{" ++ src ++ "}{" ++ (inlineListToLaTeX notes (deVerb text)) ++ "}" -inlineToLaTeX notes (Link text (Ref [])) = "[" ++ - (inlineListToLaTeX notes text) ++ "]" inlineToLaTeX notes (Link text (Ref ref)) = "[" ++ (inlineListToLaTeX notes text) ++ "][" ++ (inlineListToLaTeX notes ref) ++ "]" -- this is what markdown does, for better or worse inlineToLaTeX notes (Image alternate (Src source tit)) = "\\includegraphics{" ++ source ++ "}" -inlineToLaTeX notes (Image alternate (Ref [])) = - "![" ++ (inlineListToLaTeX notes alternate) ++ "]" inlineToLaTeX notes (Image alternate (Ref ref)) = "![" ++ (inlineListToLaTeX notes alternate) ++ "][" ++ (inlineListToLaTeX notes ref) ++ "]" diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs index 0e0563ab3..bfebc71fe 100644 --- a/src/Text/Pandoc/Writers/Markdown.hs +++ b/src/Text/Pandoc/Writers/Markdown.hs @@ -168,11 +168,12 @@ inlineToMarkdown (Link txt (Src src tit)) = (if tit /= "" then text (" \"" ++ (escapeLinkTitle tit) ++ "\"") else empty) <> char ')' -inlineToMarkdown (Link txt (Ref [])) = - char '[' <> inlineListToMarkdown txt <> text "][]" inlineToMarkdown (Link txt (Ref ref)) = - char '[' <> inlineListToMarkdown txt <> char ']' <> char '[' <> - inlineListToMarkdown ref <> char ']' + let first = char '[' <> inlineListToMarkdown txt <> char ']' + second = if (txt == ref) + then empty + else char '[' <> inlineListToMarkdown ref <> char ']' in + first <> second inlineToMarkdown (Image alternate (Src source tit)) = let alt = if (null alternate) || (alternate == [Str ""]) then text "image" @@ -181,10 +182,7 @@ inlineToMarkdown (Image alternate (Src source tit)) = (if tit /= "" then text (" \"" ++ (escapeLinkTitle tit) ++ "\"") else empty) <> char ')' -inlineToMarkdown (Image alternate (Ref [])) = - char '!' <> char '[' <> inlineListToMarkdown alternate <> char ']' inlineToMarkdown (Image alternate (Ref ref)) = - char '!' <> char '[' <> inlineListToMarkdown alternate <> char ']' <> - char '[' <> inlineListToMarkdown ref <> char ']' + char '!' <> inlineToMarkdown (Link alternate (Ref ref)) inlineToMarkdown (NoteRef ref) = text "[^" <> text (escapeString ref) <> char ']' diff --git a/src/Text/Pandoc/Writers/RST.hs b/src/Text/Pandoc/Writers/RST.hs index 1c14a4d7f..8b2563eb4 100644 --- a/src/Text/Pandoc/Writers/RST.hs +++ b/src/Text/Pandoc/Writers/RST.hs @@ -202,9 +202,6 @@ inlineToRST (Link txt (Src src tit)) = else linktext' in let ref = text ".. _" <> text linktext'' <> text ": " <> text src in (link, ref' $$ ref) -inlineToRST (Link txt (Ref [])) = - let (linktext, refs) = inlineListToRST txt in - (char '[' <> linktext <> char ']', refs) inlineToRST (Link txt (Ref ref)) = let (linktext, refs1) = inlineListToRST txt (reftext, refs2) = inlineListToRST ref in @@ -216,9 +213,6 @@ inlineToRST (Image alternate (Src source tit)) = let link = char '|' <> alt <> char '|' in let ref = text ".. " <> link <> text " image:: " <> text source in (link, ref' $$ ref) -inlineToRST (Image alternate (Ref [])) = - let (alttext, refs) = inlineListToRST alternate in - (char '|' <> alttext <> char '|', refs) -- The following case won't normally occur... inlineToRST (Image alternate (Ref ref)) = let (alttext, refs1) = inlineListToRST alternate diff --git a/src/Text/Pandoc/Writers/RTF.hs b/src/Text/Pandoc/Writers/RTF.hs index 19b4a5934..28cbe2ee8 100644 --- a/src/Text/Pandoc/Writers/RTF.hs +++ b/src/Text/Pandoc/Writers/RTF.hs @@ -220,15 +220,11 @@ inlineToRTF notes Space = " " inlineToRTF notes (Link text (Src src tit)) = "{\\field{\\*\\fldinst{HYPERLINK \"" ++ (codeStringToRTF src) ++ "\"}}{\\fldrslt{\\ul\n" ++ (inlineListToRTF notes text) ++ "\n}}}\n" -inlineToRTF notes (Link text (Ref [])) = - "[" ++ (inlineListToRTF notes text) ++ "]" inlineToRTF notes (Link text (Ref ref)) = "[" ++ (inlineListToRTF notes text) ++ "][" ++ (inlineListToRTF notes ref) ++ "]" -- this is what markdown does inlineToRTF notes (Image alternate (Src source tit)) = "{\\cf1 [image: " ++ source ++ "]\\cf0}" -inlineToRTF notes (Image alternate (Ref [])) = - "![" ++ (inlineListToRTF notes alternate) ++ "]" inlineToRTF notes (Image alternate (Ref ref)) = "![" ++ (inlineListToRTF notes alternate) ++ "][" ++ (inlineListToRTF notes ref) ++ "]" -- cgit v1.2.3