diff options
author | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2006-11-26 07:01:37 +0000 |
---|---|---|
committer | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2006-11-26 07:01:37 +0000 |
commit | 986c1f9deec5acf01fbb594db00b2ec174296c77 (patch) | |
tree | 5af2a32672ed628cae694291b970d37c4e0cadb0 /src | |
parent | e417ceaa8d7064ad81adbeabceb7204483aac1a5 (diff) | |
download | pandoc-986c1f9deec5acf01fbb594db00b2ec174296c77.tar.gz |
Pandoc bug fixes:
+ LaTeX reader did not parse metadata correctly. Now the title,
author, and date are parsed correctly, and everything else in
the preamble is skipped.
+ Simplified parsing of LaTeX command arguments and options.
The function commandArgs now returns a list of arguments OR
options (in whatever order they appear). The brackets are
included, and a new stripFirstAndLast function is provided
to strip them off when needed. This fixes a problem in dealing
with \newcommand, etc.
+ Added a "try" before "parser" in definition of notFollowedBy'
combinator. Adjusted the code using this combinator accordingly.
+ Changed handling of code blocks. Previously, some readers allowed
trailing newlines, while others stripped them. Now, all readers
strip trailing newlines in code blocks; writers insert a newline
at the end of code blocks as needed.
+ Changed test suite to reflect these changes.
git-svn-id: https://pandoc.googlecode.com/svn/trunk@137 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src')
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 10 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/LaTeX.hs | 91 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 21 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/RST.hs | 2 | ||||
-rw-r--r-- | src/Text/Pandoc/Shared.hs | 5 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/HTML.hs | 2 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/LaTeX.hs | 2 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/Markdown.hs | 2 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/RST.hs | 2 | ||||
-rw-r--r-- | src/Text/ParserCombinators/Pandoc.hs | 5 |
10 files changed, 72 insertions, 70 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 054d9eb72..f9a738e94 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -153,15 +153,15 @@ htmlScript = try (do rest <- manyTill anyChar (htmlEndTag "script") return (open ++ rest ++ "</script>")) -rawHtmlBlock = do - notFollowedBy (do {choice [htmlTag "/body", htmlTag "/html"]; return ' '}) +rawHtmlBlock = try (do + notFollowedBy' (choice [htmlTag "/body", htmlTag "/html"]) body <- choice [htmlScript, anyHtmlBlockTag, htmlComment, xmlDec, definition] sp <- (many space) state <- getState if stateParseRaw state then return (RawHtml (body ++ sp)) else - return Null + return Null) htmlComment = try (do string "<!--" @@ -284,12 +284,12 @@ preCodeBlock = try (do result <- manyTill anyChar (htmlEndTag "code") spaces htmlEndTag "pre" - return (CodeBlock (decodeEntities result))) + return (CodeBlock (stripTrailingNewlines (decodeEntities result)))) bareCodeBlock = try (do htmlTag "code" result <- manyTill anyChar (htmlEndTag "code") - return (CodeBlock (decodeEntities result))) + return (CodeBlock (stripTrailingNewlines (decodeEntities result)))) -- -- block quotes diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 3bf3dfd23..bd91c5014 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -39,37 +39,33 @@ normalizeDashes = gsub "([0-9])--([0-9])" "\\1-\\2" normalizePunctuation :: String -> String normalizePunctuation = normalizeDashes . normalizeQuotes --- | Returns command option (between []) if any, or empty string. -commandOpt = option "" (between (char '[') (char ']') (many1 (noneOf "]"))) - -- | Returns text between brackets and its matching pair. -bracketedText = try (do - char '{' - result <- many (choice [ try (do{ char '\\'; - b <- oneOf "{}"; - return (['\\', b])}), -- escaped bracket - count 1 (noneOf "{}"), - do {text <- bracketedText; return ("{" ++ text ++ "}")} ]) - char '}' - return (concat result)) +bracketedText openB closeB = try (do + char openB + result <- many (choice [ oneOfStrings [ ['\\', openB], ['\\', closeB] ], + count 1 (noneOf [openB, closeB]), + bracketedText openB closeB ]) + char closeB + return ([openB] ++ (concat result) ++ [closeB])) --- | Parses list of arguments of LaTeX command. -commandArgs = many bracketedText +-- | Returns an option or argument of a LaTeX command +optOrArg = choice [ (bracketedText '{' '}'), (bracketedText '[' ']') ] --- | Parses LaTeX command, returns (name, star, option, list of arguments). +-- | Returns list of options and arguments of a LaTeX command +commandArgs = many optOrArg + +-- | Parses LaTeX command, returns (name, star, list of options/arguments). command = try (do char '\\' name <- many1 alphaNum star <- option "" (string "*") -- some commands have starred versions - opt <- commandOpt args <- commandArgs - return (name, star, opt, args)) + return (name, star, args)) begin name = try (do string "\\begin{" string name char '}' - option "" commandOpt option [] commandArgs spaces return name) @@ -93,7 +89,6 @@ anyEnvironment = try (do name <- many alphaNum star <- option "" (string "*") -- some environments have starred variants char '}' - option "" commandOpt option [] commandArgs spaces contents <- manyTill block (end (name ++ star)) @@ -103,15 +98,14 @@ anyEnvironment = try (do -- parsing documents -- --- | Skip everything up through \begin{document} -skipLaTeXHeader = try (do - manyTill anyChar (begin "document") +-- | Process LaTeX preamble, extracting metadata +processLaTeXPreamble = do + manyTill (choice [bibliographic, comment, unknownCommand]) (try (string "\\begin{document}")) spaces - return "") -- | Parse LaTeX and return 'Pandoc'. parseLaTeX = do - option "" skipLaTeXHeader -- if parsing a fragment, this might not be present + option () processLaTeXPreamble -- preamble might not be present, if a fragment blocks <- parseBlocks spaces option "" (string "\\end{document}") -- if parsing a fragment, this might not be present @@ -121,7 +115,10 @@ parseLaTeX = do let keyBlocks = stateKeyBlocks state let noteBlocks = stateNoteBlocks state let blocks' = filter (/= Null) blocks - return (Pandoc (Meta [] [] "") (blocks' ++ (reverse noteBlocks) ++ (reverse keyBlocks))) + let title' = stateTitle state + let authors' = stateAuthors state + let date' = stateDate state + return (Pandoc (Meta title' authors' date') (blocks' ++ (reverse noteBlocks) ++ (reverse keyBlocks))) -- -- parsing blocks @@ -209,7 +206,7 @@ mathBlockWith start end = try (do list = bulletList <|> orderedList <?> "list" listItem = try (do - ("item", _, _, _) <- command + ("item", _, _) <- command spaces state <- getState let oldParserContext = stateParserContext state @@ -265,7 +262,7 @@ authors = try (do string "\\author{" authors <- manyTill anyChar (char '}') spaces - let authors' = map removeLeadingTrailingSpace $ lines $ gsub "\\\\" "\n" authors + let authors' = map removeLeadingTrailingSpace $ lines $ gsub "\\\\\\\\" "\n" authors updateState (\state -> state { stateAuthors = authors' }) return Null) @@ -283,15 +280,15 @@ date = try (do -- this forces items to be parsed in different blocks itemBlock = try (do - ("item", _, opt, _) <- command + ("item", _, args) <- command state <- getState if (stateParserContext state == ListItemState) then fail "item should be handled by list block" else - if null opt then + if null args then return Null else - return (Plain [Str opt])) + return (Plain [Str (stripFirstAndLast (head args))])) -- -- raw LaTeX @@ -312,15 +309,13 @@ rawLaTeXEnvironment = try (do star <- option "" (string "*") -- for starred variants let name' = name ++ star char '}' - opt <- option "" commandOpt args <- option [] commandArgs - let optStr = if (null opt) then "" else "[" ++ opt ++ "]" - let argStr = concatMap (\arg -> ("{" ++ arg ++ "}")) args + let argStr = concat args contents <- manyTill (choice [(many1 (noneOf "\\")), (do{ (Para [TeX str]) <- rawLaTeXEnvironment; return str }), string "\\"]) (end name') spaces - return (Para [TeX ("\\begin{" ++ name' ++ "}" ++ optStr ++ argStr ++ + return (Para [TeX ("\\begin{" ++ name' ++ "}" ++ argStr ++ (concat contents) ++ "\\end{" ++ name' ++ "}")])) unknownEnvironment = try (do @@ -335,17 +330,16 @@ unknownCommand = try (do notFollowedBy' (string "\\end{itemize}") notFollowedBy' (string "\\end{enumerate}") notFollowedBy' (string "\\end{document}") - (name, star, opt, args) <- command + (name, star, args) <- command spaces - let optStr = if null opt then "" else "[" ++ opt ++ "]" - let argStr = concatMap (\arg -> ("{" ++ arg ++ "}")) args + let argStr = concat args state <- getState if (name == "item") && ((stateParserContext state) == ListItemState) then fail "should not be parsed as raw" else string "" if stateParseRaw state then - return (Plain [TeX ("\\" ++ name ++ star ++ optStr ++ argStr)]) + return (Plain [TeX ("\\" ++ name ++ star ++ argStr)]) else return (Plain [Str (joinWithSep " " args)])) @@ -554,13 +548,19 @@ link = try (do return (Link (normalizeSpaces label) ref)) image = try (do - ("includegraphics", _, _, (src:lst)) <- command - return (Image [Str "image"] (Src src ""))) + ("includegraphics", _, args) <- command + let args' = filter (\arg -> (take 1 arg) /= "[") args + let src = if null args' then + Src "" "" + else + Src (stripFirstAndLast (head args')) "" + return (Image [Str "image"] src)) footnote = try (do - ("footnote", _, _, (contents:[])) <- command + ("footnote", _, (contents:[])) <- command + let contents' = stripFirstAndLast contents let blocks = case runParser parseBlocks defaultParserState "footnote" contents of - Left err -> error $ "Input:\n" ++ show contents ++ + Left err -> error $ "Input:\n" ++ show contents' ++ "\nError:\n" ++ show err Right result -> result state <- getState @@ -574,12 +574,11 @@ footnote = try (do -- | Parse any LaTeX command and return it in a raw TeX inline element. rawLaTeXInline :: GenParser Char ParserState Inline rawLaTeXInline = try (do - (name, star, opt, args) <- command - let optStr = if (null opt) then "" else "[" ++ opt ++ "]" - let argStr = concatMap (\arg -> "{" ++ arg ++ "}") args + (name, star, args) <- command + let argStr = concat args state <- getState if ((name == "begin") || (name == "end") || (name == "item")) then fail "not an inline command" else string "" - return (TeX ("\\" ++ name ++ star ++ optStr ++ argStr))) + return (TeX ("\\" ++ name ++ star ++ argStr))) diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 60ac40fd7..df2f43e87 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -136,8 +136,7 @@ header = choice [ setextHeader, atxHeader ] <?> "header" atxHeader = try (do lead <- many1 (char atxHChar) skipSpaces - txt <- many1 (do {notFollowedBy' atxClosing; inline}) - atxClosing + txt <- manyTill inline atxClosing return (Header (length lead) (normalizeSpaces txt))) atxClosing = try (do @@ -195,7 +194,7 @@ indentedBlock = try (do codeBlock = do result <- choice [indentedBlock, indentedLine] option "" blanklines - return (CodeBlock result) + return (CodeBlock (stripTrailingNewlines result)) -- -- note block @@ -286,9 +285,9 @@ orderedListStart = listLine start = try (do notFollowedBy' start notFollowedBy blankline - notFollowedBy' (try (do{ indentSpaces; - many (spaceChar); - choice [bulletListStart, orderedListStart]})) + notFollowedBy' (do{ indentSpaces; + many (spaceChar); + choice [bulletListStart, orderedListStart]}) line <- manyTill anyChar newline return (line ++ "\n")) @@ -311,7 +310,7 @@ listContinuation start = return ((concat result) ++ blanks)) listContinuationLine start = try (do - notFollowedBy blankline + notFollowedBy' blankline notFollowedBy' start option "" indentSpaces result <- manyTill anyChar newline @@ -404,10 +403,10 @@ special = choice [ link, referenceLink, rawHtmlInline, autoLink, escapedChar = escaped anyChar -ltSign = do +ltSign = try (do notFollowedBy' rawHtmlBlocks -- don't return < if it starts html char '<' - return (Str ['<']) + return (Str ['<'])) specialCharsMinusLt = filter (/= '<') specialChars @@ -487,7 +486,7 @@ endline = -- next line would allow block quotes without preceding blank line -- Markdown.pl does allow this, but there's a chance of a wrapped -- greater-than sign triggering a block quote by accident... --- notFollowedBy (try (do { choice [emailBlockQuoteStart, string ",----"]; return ' ' })) +-- notFollowedBy' (choice [emailBlockQuoteStart, string ",----"]) notFollowedBy blankline -- parse potential list starts at beginning of line differently if in a list: st <- getState @@ -560,7 +559,7 @@ referenceLinkSingle = -- a link like [this] autoLink = -- a link <like.this.com> try (do - notFollowedBy (do {anyHtmlBlockTag; return ' '}) + notFollowedBy' anyHtmlBlockTag src <- between (char autoLinkStart) (char autoLinkEnd) (many (noneOf (spaceChars ++ endLineChars ++ [autoLinkEnd]))) case (matchRegex emailAddress src) of diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index 82e5ea303..69c7d9baa 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -292,7 +292,7 @@ indentedBlock variable = try (do codeBlock = try (do codeBlockStart result <- indentedBlock False -- the False means we want one tab stop indent on each line - return (CodeBlock result)) + return (CodeBlock (stripTrailingNewlines result))) -- -- raw html diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index c35e33d01..07afba00e 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -11,6 +11,7 @@ module Text.Pandoc.Shared ( removeLeadingTrailingSpace, removeLeadingSpace, removeTrailingSpace, + stripFirstAndLast, -- * Parsing readWith, testStringWith, @@ -227,6 +228,10 @@ removeLeadingSpace = dropWhile (\x -> (x == ' ') || (x == '\n') || (x == '\t')) removeTrailingSpace :: String -> String removeTrailingSpace = reverse . removeLeadingSpace . reverse +-- | Strip leading and trailing characters from string +stripFirstAndLast str = + drop 1 $ take ((length str) - 1) str + -- | Split list of inlines into groups separated by a space. splitBySpace :: [Inline] -> [[Inline]] splitBySpace lst = filter (\a -> (/= Space) (head a)) diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index 9eecf2761..6b0696817 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -132,7 +132,7 @@ blockToHtml options (Note ref lst) = "<div class=\"pandocNote\">\n" ++ contents' ++ "</div>\n" blockToHtml options (Key _ _) = "" blockToHtml options (CodeBlock str) = "<pre><code>" ++ (codeStringToHtml str) ++ - "</code></pre>\n" + "\n</code></pre>\n" blockToHtml options (RawHtml str) = str blockToHtml options (BulletList lst) = let attribs = if (writerIncremental options) then " class=\"incremental\"" else "" in diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs index 0674652cf..13eac31c2 100644 --- a/src/Text/Pandoc/Writers/LaTeX.hs +++ b/src/Text/Pandoc/Writers/LaTeX.hs @@ -107,7 +107,7 @@ blockToLaTeX notes (BlockQuote lst) = "\\begin{quote}\n" ++ (concatMap (blockToLaTeX notes) lst) ++ "\\end{quote}\n" blockToLaTeX notes (Note ref lst) = "" blockToLaTeX notes (Key _ _) = "" -blockToLaTeX notes (CodeBlock str) = "\\begin{verbatim}\n" ++ str ++ "\\end{verbatim}\n" +blockToLaTeX notes (CodeBlock str) = "\\begin{verbatim}\n" ++ str ++ "\n\\end{verbatim}\n" blockToLaTeX notes (RawHtml str) = "" blockToLaTeX notes (BulletList lst) = "\\begin{itemize}\n" ++ (concatMap (listItemToLaTeX notes) lst) ++ "\\end{itemize}\n" diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs index b73090f62..4ca131455 100644 --- a/src/Text/Pandoc/Writers/Markdown.hs +++ b/src/Text/Pandoc/Writers/Markdown.hs @@ -83,7 +83,7 @@ blockToMarkdown tabStop (Key txt (Src src tit)) = text " " <> char '[' <> inlineListToMarkdown txt <> char ']' <> text ": " <> text src <> (if tit /= "" then (text (" \"" ++ (escapeLinkTitle tit) ++ "\"")) else empty) blockToMarkdown tabStop (CodeBlock str) = (nest tabStop $ vcat $ map text (lines str)) <> - (if (endsWith '\n' str) then empty else text "\n") <> text "\n" + text "\n" blockToMarkdown tabStop (RawHtml str) = text str blockToMarkdown tabStop (BulletList lst) = vcat (map (bulletListItemToMarkdown tabStop) lst) <> text "\n" diff --git a/src/Text/Pandoc/Writers/RST.hs b/src/Text/Pandoc/Writers/RST.hs index 37d895336..7d08d152d 100644 --- a/src/Text/Pandoc/Writers/RST.hs +++ b/src/Text/Pandoc/Writers/RST.hs @@ -91,7 +91,7 @@ blockToRST tabStop (Note ref blocks) = blockToRST tabStop (Key txt (Src src tit)) = (text "ERROR - KEY FOUND", empty) -- shouldn't have a key here blockToRST tabStop (CodeBlock str) = - (hang (text "::\n") tabStop (vcat $ map text (lines ('\n':(str ++ "\n")))), empty) + (hang (text "::\n") tabStop (vcat $ map text (lines ('\n':(str ++ "\n\n")))), empty) blockToRST tabStop (RawHtml str) = let str' = if (endsWith '\n' str) then (str ++ "\n") else (str ++ "\n\n") in (hang (text "\n.. raw:: html\n") 3 (vcat $ map text (lines str')), empty) diff --git a/src/Text/ParserCombinators/Pandoc.hs b/src/Text/ParserCombinators/Pandoc.hs index 9bf0a76f7..a78b776d3 100644 --- a/src/Text/ParserCombinators/Pandoc.hs +++ b/src/Text/ParserCombinators/Pandoc.hs @@ -82,9 +82,8 @@ many1Till p end = try (do -- | A more general form of @notFollowedBy@. This one allows any type of parser to -- be specified, and succeeds only if that parser fails. It does not consume any input. notFollowedBy' :: Show b => GenParser a st b -> GenParser a st () -notFollowedBy' parser = try (do{ c <- parser; unexpected (show c) } - <|> return () - ) +notFollowedBy' parser = try (do { c <- try parser; unexpected (show c) } + <|> return ()) -- | The inverse of @notFollowedBy'@. Fails if parser will fail, otherwise -- returns @()@ (but does not consume any input). |