aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
authorfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2006-11-26 07:01:37 +0000
committerfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2006-11-26 07:01:37 +0000
commit986c1f9deec5acf01fbb594db00b2ec174296c77 (patch)
tree5af2a32672ed628cae694291b970d37c4e0cadb0 /src/Text/Pandoc/Readers
parente417ceaa8d7064ad81adbeabceb7204483aac1a5 (diff)
downloadpandoc-986c1f9deec5acf01fbb594db00b2ec174296c77.tar.gz
Pandoc bug fixes:
+ LaTeX reader did not parse metadata correctly. Now the title, author, and date are parsed correctly, and everything else in the preamble is skipped. + Simplified parsing of LaTeX command arguments and options. The function commandArgs now returns a list of arguments OR options (in whatever order they appear). The brackets are included, and a new stripFirstAndLast function is provided to strip them off when needed. This fixes a problem in dealing with \newcommand, etc. + Added a "try" before "parser" in definition of notFollowedBy' combinator. Adjusted the code using this combinator accordingly. + Changed handling of code blocks. Previously, some readers allowed trailing newlines, while others stripped them. Now, all readers strip trailing newlines in code blocks; writers insert a newline at the end of code blocks as needed. + Changed test suite to reflect these changes. git-svn-id: https://pandoc.googlecode.com/svn/trunk@137 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs10
-rw-r--r--src/Text/Pandoc/Readers/LaTeX.hs91
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs21
-rw-r--r--src/Text/Pandoc/Readers/RST.hs2
4 files changed, 61 insertions, 63 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 054d9eb72..f9a738e94 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -153,15 +153,15 @@ htmlScript = try (do
rest <- manyTill anyChar (htmlEndTag "script")
return (open ++ rest ++ "</script>"))
-rawHtmlBlock = do
- notFollowedBy (do {choice [htmlTag "/body", htmlTag "/html"]; return ' '})
+rawHtmlBlock = try (do
+ notFollowedBy' (choice [htmlTag "/body", htmlTag "/html"])
body <- choice [htmlScript, anyHtmlBlockTag, htmlComment, xmlDec, definition]
sp <- (many space)
state <- getState
if stateParseRaw state then
return (RawHtml (body ++ sp))
else
- return Null
+ return Null)
htmlComment = try (do
string "<!--"
@@ -284,12 +284,12 @@ preCodeBlock = try (do
result <- manyTill anyChar (htmlEndTag "code")
spaces
htmlEndTag "pre"
- return (CodeBlock (decodeEntities result)))
+ return (CodeBlock (stripTrailingNewlines (decodeEntities result))))
bareCodeBlock = try (do
htmlTag "code"
result <- manyTill anyChar (htmlEndTag "code")
- return (CodeBlock (decodeEntities result)))
+ return (CodeBlock (stripTrailingNewlines (decodeEntities result))))
--
-- block quotes
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
index 3bf3dfd23..bd91c5014 100644
--- a/src/Text/Pandoc/Readers/LaTeX.hs
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -39,37 +39,33 @@ normalizeDashes = gsub "([0-9])--([0-9])" "\\1-\\2"
normalizePunctuation :: String -> String
normalizePunctuation = normalizeDashes . normalizeQuotes
--- | Returns command option (between []) if any, or empty string.
-commandOpt = option "" (between (char '[') (char ']') (many1 (noneOf "]")))
-
-- | Returns text between brackets and its matching pair.
-bracketedText = try (do
- char '{'
- result <- many (choice [ try (do{ char '\\';
- b <- oneOf "{}";
- return (['\\', b])}), -- escaped bracket
- count 1 (noneOf "{}"),
- do {text <- bracketedText; return ("{" ++ text ++ "}")} ])
- char '}'
- return (concat result))
+bracketedText openB closeB = try (do
+ char openB
+ result <- many (choice [ oneOfStrings [ ['\\', openB], ['\\', closeB] ],
+ count 1 (noneOf [openB, closeB]),
+ bracketedText openB closeB ])
+ char closeB
+ return ([openB] ++ (concat result) ++ [closeB]))
--- | Parses list of arguments of LaTeX command.
-commandArgs = many bracketedText
+-- | Returns an option or argument of a LaTeX command
+optOrArg = choice [ (bracketedText '{' '}'), (bracketedText '[' ']') ]
--- | Parses LaTeX command, returns (name, star, option, list of arguments).
+-- | Returns list of options and arguments of a LaTeX command
+commandArgs = many optOrArg
+
+-- | Parses LaTeX command, returns (name, star, list of options/arguments).
command = try (do
char '\\'
name <- many1 alphaNum
star <- option "" (string "*") -- some commands have starred versions
- opt <- commandOpt
args <- commandArgs
- return (name, star, opt, args))
+ return (name, star, args))
begin name = try (do
string "\\begin{"
string name
char '}'
- option "" commandOpt
option [] commandArgs
spaces
return name)
@@ -93,7 +89,6 @@ anyEnvironment = try (do
name <- many alphaNum
star <- option "" (string "*") -- some environments have starred variants
char '}'
- option "" commandOpt
option [] commandArgs
spaces
contents <- manyTill block (end (name ++ star))
@@ -103,15 +98,14 @@ anyEnvironment = try (do
-- parsing documents
--
--- | Skip everything up through \begin{document}
-skipLaTeXHeader = try (do
- manyTill anyChar (begin "document")
+-- | Process LaTeX preamble, extracting metadata
+processLaTeXPreamble = do
+ manyTill (choice [bibliographic, comment, unknownCommand]) (try (string "\\begin{document}"))
spaces
- return "")
-- | Parse LaTeX and return 'Pandoc'.
parseLaTeX = do
- option "" skipLaTeXHeader -- if parsing a fragment, this might not be present
+ option () processLaTeXPreamble -- preamble might not be present, if a fragment
blocks <- parseBlocks
spaces
option "" (string "\\end{document}") -- if parsing a fragment, this might not be present
@@ -121,7 +115,10 @@ parseLaTeX = do
let keyBlocks = stateKeyBlocks state
let noteBlocks = stateNoteBlocks state
let blocks' = filter (/= Null) blocks
- return (Pandoc (Meta [] [] "") (blocks' ++ (reverse noteBlocks) ++ (reverse keyBlocks)))
+ let title' = stateTitle state
+ let authors' = stateAuthors state
+ let date' = stateDate state
+ return (Pandoc (Meta title' authors' date') (blocks' ++ (reverse noteBlocks) ++ (reverse keyBlocks)))
--
-- parsing blocks
@@ -209,7 +206,7 @@ mathBlockWith start end = try (do
list = bulletList <|> orderedList <?> "list"
listItem = try (do
- ("item", _, _, _) <- command
+ ("item", _, _) <- command
spaces
state <- getState
let oldParserContext = stateParserContext state
@@ -265,7 +262,7 @@ authors = try (do
string "\\author{"
authors <- manyTill anyChar (char '}')
spaces
- let authors' = map removeLeadingTrailingSpace $ lines $ gsub "\\\\" "\n" authors
+ let authors' = map removeLeadingTrailingSpace $ lines $ gsub "\\\\\\\\" "\n" authors
updateState (\state -> state { stateAuthors = authors' })
return Null)
@@ -283,15 +280,15 @@ date = try (do
-- this forces items to be parsed in different blocks
itemBlock = try (do
- ("item", _, opt, _) <- command
+ ("item", _, args) <- command
state <- getState
if (stateParserContext state == ListItemState) then
fail "item should be handled by list block"
else
- if null opt then
+ if null args then
return Null
else
- return (Plain [Str opt]))
+ return (Plain [Str (stripFirstAndLast (head args))]))
--
-- raw LaTeX
@@ -312,15 +309,13 @@ rawLaTeXEnvironment = try (do
star <- option "" (string "*") -- for starred variants
let name' = name ++ star
char '}'
- opt <- option "" commandOpt
args <- option [] commandArgs
- let optStr = if (null opt) then "" else "[" ++ opt ++ "]"
- let argStr = concatMap (\arg -> ("{" ++ arg ++ "}")) args
+ let argStr = concat args
contents <- manyTill (choice [(many1 (noneOf "\\")),
(do{ (Para [TeX str]) <- rawLaTeXEnvironment; return str }),
string "\\"]) (end name')
spaces
- return (Para [TeX ("\\begin{" ++ name' ++ "}" ++ optStr ++ argStr ++
+ return (Para [TeX ("\\begin{" ++ name' ++ "}" ++ argStr ++
(concat contents) ++ "\\end{" ++ name' ++ "}")]))
unknownEnvironment = try (do
@@ -335,17 +330,16 @@ unknownCommand = try (do
notFollowedBy' (string "\\end{itemize}")
notFollowedBy' (string "\\end{enumerate}")
notFollowedBy' (string "\\end{document}")
- (name, star, opt, args) <- command
+ (name, star, args) <- command
spaces
- let optStr = if null opt then "" else "[" ++ opt ++ "]"
- let argStr = concatMap (\arg -> ("{" ++ arg ++ "}")) args
+ let argStr = concat args
state <- getState
if (name == "item") && ((stateParserContext state) == ListItemState) then
fail "should not be parsed as raw"
else
string ""
if stateParseRaw state then
- return (Plain [TeX ("\\" ++ name ++ star ++ optStr ++ argStr)])
+ return (Plain [TeX ("\\" ++ name ++ star ++ argStr)])
else
return (Plain [Str (joinWithSep " " args)]))
@@ -554,13 +548,19 @@ link = try (do
return (Link (normalizeSpaces label) ref))
image = try (do
- ("includegraphics", _, _, (src:lst)) <- command
- return (Image [Str "image"] (Src src "")))
+ ("includegraphics", _, args) <- command
+ let args' = filter (\arg -> (take 1 arg) /= "[") args
+ let src = if null args' then
+ Src "" ""
+ else
+ Src (stripFirstAndLast (head args')) ""
+ return (Image [Str "image"] src))
footnote = try (do
- ("footnote", _, _, (contents:[])) <- command
+ ("footnote", _, (contents:[])) <- command
+ let contents' = stripFirstAndLast contents
let blocks = case runParser parseBlocks defaultParserState "footnote" contents of
- Left err -> error $ "Input:\n" ++ show contents ++
+ Left err -> error $ "Input:\n" ++ show contents' ++
"\nError:\n" ++ show err
Right result -> result
state <- getState
@@ -574,12 +574,11 @@ footnote = try (do
-- | Parse any LaTeX command and return it in a raw TeX inline element.
rawLaTeXInline :: GenParser Char ParserState Inline
rawLaTeXInline = try (do
- (name, star, opt, args) <- command
- let optStr = if (null opt) then "" else "[" ++ opt ++ "]"
- let argStr = concatMap (\arg -> "{" ++ arg ++ "}") args
+ (name, star, args) <- command
+ let argStr = concat args
state <- getState
if ((name == "begin") || (name == "end") || (name == "item")) then
fail "not an inline command"
else
string ""
- return (TeX ("\\" ++ name ++ star ++ optStr ++ argStr)))
+ return (TeX ("\\" ++ name ++ star ++ argStr)))
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 60ac40fd7..df2f43e87 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -136,8 +136,7 @@ header = choice [ setextHeader, atxHeader ] <?> "header"
atxHeader = try (do
lead <- many1 (char atxHChar)
skipSpaces
- txt <- many1 (do {notFollowedBy' atxClosing; inline})
- atxClosing
+ txt <- manyTill inline atxClosing
return (Header (length lead) (normalizeSpaces txt)))
atxClosing = try (do
@@ -195,7 +194,7 @@ indentedBlock = try (do
codeBlock = do
result <- choice [indentedBlock, indentedLine]
option "" blanklines
- return (CodeBlock result)
+ return (CodeBlock (stripTrailingNewlines result))
--
-- note block
@@ -286,9 +285,9 @@ orderedListStart =
listLine start = try (do
notFollowedBy' start
notFollowedBy blankline
- notFollowedBy' (try (do{ indentSpaces;
- many (spaceChar);
- choice [bulletListStart, orderedListStart]}))
+ notFollowedBy' (do{ indentSpaces;
+ many (spaceChar);
+ choice [bulletListStart, orderedListStart]})
line <- manyTill anyChar newline
return (line ++ "\n"))
@@ -311,7 +310,7 @@ listContinuation start =
return ((concat result) ++ blanks))
listContinuationLine start = try (do
- notFollowedBy blankline
+ notFollowedBy' blankline
notFollowedBy' start
option "" indentSpaces
result <- manyTill anyChar newline
@@ -404,10 +403,10 @@ special = choice [ link, referenceLink, rawHtmlInline, autoLink,
escapedChar = escaped anyChar
-ltSign = do
+ltSign = try (do
notFollowedBy' rawHtmlBlocks -- don't return < if it starts html
char '<'
- return (Str ['<'])
+ return (Str ['<']))
specialCharsMinusLt = filter (/= '<') specialChars
@@ -487,7 +486,7 @@ endline =
-- next line would allow block quotes without preceding blank line
-- Markdown.pl does allow this, but there's a chance of a wrapped
-- greater-than sign triggering a block quote by accident...
--- notFollowedBy (try (do { choice [emailBlockQuoteStart, string ",----"]; return ' ' }))
+-- notFollowedBy' (choice [emailBlockQuoteStart, string ",----"])
notFollowedBy blankline
-- parse potential list starts at beginning of line differently if in a list:
st <- getState
@@ -560,7 +559,7 @@ referenceLinkSingle = -- a link like [this]
autoLink = -- a link <like.this.com>
try (do
- notFollowedBy (do {anyHtmlBlockTag; return ' '})
+ notFollowedBy' anyHtmlBlockTag
src <- between (char autoLinkStart) (char autoLinkEnd)
(many (noneOf (spaceChars ++ endLineChars ++ [autoLinkEnd])))
case (matchRegex emailAddress src) of
diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs
index 82e5ea303..69c7d9baa 100644
--- a/src/Text/Pandoc/Readers/RST.hs
+++ b/src/Text/Pandoc/Readers/RST.hs
@@ -292,7 +292,7 @@ indentedBlock variable = try (do
codeBlock = try (do
codeBlockStart
result <- indentedBlock False -- the False means we want one tab stop indent on each line
- return (CodeBlock result))
+ return (CodeBlock (stripTrailingNewlines result)))
--
-- raw html