From df7b68225101966051f8b592a27127bf789eb81e Mon Sep 17 00:00:00 2001 From: fiddlosopher Date: Tue, 17 Oct 2006 14:22:29 +0000 Subject: initial import git-svn-id: https://pandoc.googlecode.com/svn/trunk@2 788f1e2b-df1e-0410-8736-df70ead52e1b --- src/Text/Pandoc/Readers/HTML.hs | 434 ++++++++++++++++++++++++ src/Text/Pandoc/Readers/LaTeX.hs | 585 ++++++++++++++++++++++++++++++++ src/Text/Pandoc/Readers/Markdown.hs | 582 ++++++++++++++++++++++++++++++++ src/Text/Pandoc/Readers/RST.hs | 644 ++++++++++++++++++++++++++++++++++++ 4 files changed, 2245 insertions(+) create mode 100644 src/Text/Pandoc/Readers/HTML.hs create mode 100644 src/Text/Pandoc/Readers/LaTeX.hs create mode 100644 src/Text/Pandoc/Readers/Markdown.hs create mode 100644 src/Text/Pandoc/Readers/RST.hs (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs new file mode 100644 index 000000000..054d9eb72 --- /dev/null +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -0,0 +1,434 @@ +-- | Converts HTML to 'Pandoc' document. +module Text.Pandoc.Readers.HTML ( + readHtml, + rawHtmlInline, + rawHtmlBlock, + anyHtmlBlockTag, + anyHtmlInlineTag + ) where + +import Text.Regex ( matchRegex, mkRegex ) +import Text.ParserCombinators.Parsec +import Text.ParserCombinators.Pandoc +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.Pandoc.HtmlEntities ( decodeEntities, htmlEntityToChar ) +import Maybe ( fromMaybe ) +import Char ( toUpper, toLower ) + +-- | Convert HTML-formatted string to 'Pandoc' document. +readHtml :: ParserState -- ^ Parser state + -> String -- ^ String to parse + -> Pandoc +readHtml = readWith parseHtml + +-- for testing +testString :: String -> IO () +testString = testStringWith parseHtml + +-- +-- Constants +-- + +inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", "br", "cite", + "code", "dfn", "em", "font", "i", "img", "input", "kbd", "label", "q", + "s", "samp", "select", "small", "span", "strike", "strong", "sub", + "sup", "textarea", "tt", "u", "var"] + +-- +-- HTML utility functions +-- + +-- | Read blocks until end tag. +blocksTilEnd tag = try (do + blocks <- manyTill (do {b <- block; spaces; return b}) (htmlEndTag tag) + return blocks) + +-- | Read inlines until end tag. +inlinesTilEnd tag = try (do + inlines <- manyTill inline (htmlEndTag tag) + return inlines) + +-- extract type from a tag: e.g. br from
, < br >,
, etc. +extractTagType tag = case (matchRegex (mkRegex "<[[:space:]]*/?([A-Za-z0-9]+)") tag) of + Just [match] -> (map toLower match) + Nothing -> "" + +anyHtmlTag = try (do + char '<' + spaces + tag <- many1 alphaNum + attribs <- htmlAttributes + spaces + ender <- option "" (string "/") + let ender' = if (null ender) then "" else " /" + spaces + char '>' + return ("<" ++ tag ++ attribs ++ ender' ++ ">")) + +anyHtmlEndTag = try (do + char '<' + spaces + char '/' + spaces + tagType <- many1 alphaNum + spaces + char '>' + return ("")) + +htmlTag :: String -> GenParser Char st (String, [(String, String)]) +htmlTag tag = try (do + char '<' + spaces + stringAnyCase tag + attribs <- many htmlAttribute + spaces + option "" (string "/") + spaces + char '>' + return (tag, (map (\(name, content, raw) -> (name, content)) attribs))) + +-- parses a quoted html attribute value +quoted quoteChar = do + result <- between (char quoteChar) (char quoteChar) (many (noneOf [quoteChar])) + return (result, [quoteChar]) + +htmlAttributes = do + attrList <- many htmlAttribute + return (concatMap (\(name, content, raw) -> raw) attrList) + +htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute + +-- minimized boolean attribute (no = and value) +htmlMinimizedAttribute = try (do + spaces + name <- many1 (choice [letter, oneOf ".-_:"]) + spaces + notFollowedBy (char '=') + let content = name + return (name, content, (" " ++ name))) + +htmlRegularAttribute = try (do + spaces + name <- many1 (choice [letter, oneOf ".-_:"]) + spaces + char '=' + spaces + (content, quoteStr) <- choice [ (quoted '\''), + (quoted '"'), + (do{ a <- (many (alphaNum <|> (oneOf "-._:"))); + return (a,"")} ) ] + return (name, content, (" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr))) + +htmlEndTag tag = try (do + char '<' + spaces + char '/' + spaces + stringAnyCase tag + spaces + char '>' + return ("")) + +-- | Returns @True@ if the tag is an inline tag. +isInline tag = (extractTagType tag) `elem` inlineHtmlTags + +anyHtmlBlockTag = try (do + tag <- choice [anyHtmlTag, anyHtmlEndTag] + if isInline tag then + fail "inline tag" + else + return tag) + +anyHtmlInlineTag = try (do + tag <- choice [ anyHtmlTag, anyHtmlEndTag ] + if isInline tag then + return tag + else + fail "not an inline tag") + +-- scripts must be treated differently, because they can contain <> etc. +htmlScript = try (do + open <- string "")) + +rawHtmlBlock = do + notFollowedBy (do {choice [htmlTag "/body", htmlTag "/html"]; return ' '}) + body <- choice [htmlScript, anyHtmlBlockTag, htmlComment, xmlDec, definition] + sp <- (many space) + state <- getState + if stateParseRaw state then + return (RawHtml (body ++ sp)) + else + return Null + +htmlComment = try (do + string "")) + return ("")) + +-- +-- parsing documents +-- + +xmlDec = try (do + string "') + return ("")) + +definition = try (do + string "') + return ("")) + +nonTitleNonHead = try (do + notFollowedBy' (htmlTag "title") + notFollowedBy' (htmlTag "/head") + result <- choice [do {rawHtmlBlock; return ' '}, anyChar] + return result) + +parseTitle = try (do + (tag, attribs) <- htmlTag "title" + contents <- inlinesTilEnd tag + spaces + return contents) + +-- parse header and return meta-information (for now, just title) +parseHead = try (do + htmlTag "head" + spaces + skipMany nonTitleNonHead + contents <- option [] parseTitle + skipMany nonTitleNonHead + htmlTag "/head" + return (contents, [], "")) + +skipHtmlTag tag = option ("",[]) (htmlTag tag) + +-- h1 class="title" representation of title in body +bodyTitle = try (do + (tag, attribs) <- htmlTag "h1" + cl <- case (extractAttribute "class" attribs) of + Just "title" -> do {return ""} + otherwise -> fail "not title" + inlinesTilEnd "h1" + return "") + +parseHtml = do + sepEndBy (choice [xmlDec, definition, htmlComment]) spaces + skipHtmlTag "html" + spaces + (title, authors, date) <- option ([], [], "") parseHead + spaces + skipHtmlTag "body" + spaces + option "" bodyTitle -- skip title in body, because it's represented in meta + blocks <- parseBlocks + spaces + option "" (htmlEndTag "body") + spaces + option "" (htmlEndTag "html") + many anyChar -- ignore anything after + eof + state <- getState + let keyBlocks = stateKeyBlocks state + return (Pandoc (Meta title authors date) (blocks ++ (reverse keyBlocks))) + +-- +-- parsing blocks +-- + +parseBlocks = do + spaces + result <- sepEndBy block spaces + return result + +block = choice [ codeBlock, header, hrule, list, blockQuote, para, plain, + rawHtmlBlock ] "block" + +-- +-- header blocks +-- + +header = choice (map headerLevel (enumFromTo 1 5)) "header" + +headerLevel n = try (do + let level = "h" ++ show n + (tag, attribs) <- htmlTag level + contents <- inlinesTilEnd level + return (Header n (normalizeSpaces contents))) + +-- +-- hrule block +-- + +hrule = try (do + (tag, attribs) <- htmlTag "hr" + state <- getState + if (not (null attribs)) && (stateParseRaw state) then + unexpected "attributes in hr" -- in this case we want to parse it as raw html + else + return HorizontalRule) + +-- +-- code blocks +-- + +codeBlock = choice [ preCodeBlock, bareCodeBlock ] "code block" + +preCodeBlock = try (do + htmlTag "pre" + spaces + htmlTag "code" + result <- manyTill anyChar (htmlEndTag "code") + spaces + htmlEndTag "pre" + return (CodeBlock (decodeEntities result))) + +bareCodeBlock = try (do + htmlTag "code" + result <- manyTill anyChar (htmlEndTag "code") + return (CodeBlock (decodeEntities result))) + +-- +-- block quotes +-- + +blockQuote = try (do + tag <- htmlTag "blockquote" + spaces + blocks <- blocksTilEnd "blockquote" + return (BlockQuote blocks)) + +-- +-- list blocks +-- + +list = choice [ bulletList, orderedList ] "list" + +orderedList = try (do + tag <- htmlTag "ol" + spaces + items <- sepEndBy1 listItem spaces + htmlEndTag "ol" + return (OrderedList items)) + +bulletList = try (do + tag <- htmlTag "ul" + spaces + items <- sepEndBy1 listItem spaces + htmlEndTag "ul" + return (BulletList items)) + +listItem = try (do + tag <- htmlTag "li" + spaces + blocks <- blocksTilEnd "li" + return blocks) + +-- +-- paragraph block +-- + +para = try (do + tag <- htmlTag "p" + result <- inlinesTilEnd "p" + return (Para (normalizeSpaces result))) + +-- +-- plain block +-- + +plain = do + result <- many1 inline + return (Plain (normalizeSpaces result)) + +-- +-- inline +-- + +inline = choice [ text, special ] "inline" + +text = choice [ entity, strong, emph, code, str, linebreak, whitespace ] "text" + +special = choice [ link, image, rawHtmlInline ] "link, inline html, or image" + +entity = try (do + char '&' + body <- choice [(many1 letter), + (try (do{ char '#'; num <- many1 digit; return ("#" ++ num)}))] + char ';' + return (Str [fromMaybe '?' (htmlEntityToChar ("&" ++ body ++ ";"))])) + +code = try (do + htmlTag "code" + result <- manyTill anyChar (htmlEndTag "code") + -- remove internal line breaks, leading and trailing space, and decode entities + let result' = decodeEntities $ removeLeadingTrailingSpace $ joinWithSep " " $ lines result + return (Code result')) + +rawHtmlInline = do + result <- choice [htmlScript, anyHtmlInlineTag] + state <- getState + if stateParseRaw state then + return (HtmlInline result) + else + return (Str "") + +betweenTags tag = try (do + htmlTag tag + result <- inlinesTilEnd tag + return (normalizeSpaces result)) + +emph = try (do + result <- choice [betweenTags "em", betweenTags "it"] + return (Emph result)) + +strong = try (do + result <- choice [betweenTags "b", betweenTags "strong"] + return (Strong result)) + +whitespace = do + many1 space + return Space + +-- hard line break +linebreak = do + htmlTag "br" + return LineBreak + +str = do + result <- many1 (noneOf "<& \t\n") + return (Str (decodeEntities result)) + +-- +-- links and images +-- + +-- extract contents of attribute (attribute names are case-insensitive) +extractAttribute name [] = Nothing +extractAttribute name ((attrName, contents):rest) = + let name' = map toLower name + attrName' = map toLower attrName in + if (attrName' == name') then Just contents else extractAttribute name rest + +link = try (do + (tag, attributes) <- htmlTag "a" + url <- case (extractAttribute "href" attributes) of + Just url -> do {return url} + Nothing -> fail "no href" + let title = fromMaybe "" (extractAttribute "title" attributes) + label <- inlinesTilEnd "a" + ref <- generateReference url title + return (Link (normalizeSpaces label) ref)) + +image = try (do + (tag, attributes) <- htmlTag "img" + url <- case (extractAttribute "src" attributes) of + Just url -> do {return url} + Nothing -> fail "no src" + let title = fromMaybe "" (extractAttribute "title" attributes) + let alt = fromMaybe "" (extractAttribute "alt" attributes) + ref <- generateReference url title + return (Image [Str alt] ref)) diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs new file mode 100644 index 000000000..3bf3dfd23 --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -0,0 +1,585 @@ +-- | Converts LaTeX to 'Pandoc' document. +module Text.Pandoc.Readers.LaTeX ( + readLaTeX, + rawLaTeXInline, + rawLaTeXEnvironment + ) where + +import Text.ParserCombinators.Parsec +import Text.ParserCombinators.Pandoc +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Maybe ( fromMaybe ) +import Char ( chr ) + +-- | Parse LaTeX from string and return 'Pandoc' document. +readLaTeX :: ParserState -- ^ Parser state, including options for parser + -> String -- ^ String to parse + -> Pandoc +readLaTeX = readWith parseLaTeX + +-- for testing +testString = testStringWith parseLaTeX + +-- characters with special meaning +specialChars = "\\$%&^&_~#{}\n \t|<>" + +-- +-- utility functions +-- + +-- | Change quotation marks in a string back to "basic" quotes. +normalizeQuotes :: String -> String +normalizeQuotes = gsub "''" "\"" . gsub "`" "'" + +-- | Change LaTeX En dashes between digits to hyphens. +normalizeDashes :: String -> String +normalizeDashes = gsub "([0-9])--([0-9])" "\\1-\\2" + +normalizePunctuation :: String -> String +normalizePunctuation = normalizeDashes . normalizeQuotes + +-- | Returns command option (between []) if any, or empty string. +commandOpt = option "" (between (char '[') (char ']') (many1 (noneOf "]"))) + +-- | Returns text between brackets and its matching pair. +bracketedText = try (do + char '{' + result <- many (choice [ try (do{ char '\\'; + b <- oneOf "{}"; + return (['\\', b])}), -- escaped bracket + count 1 (noneOf "{}"), + do {text <- bracketedText; return ("{" ++ text ++ "}")} ]) + char '}' + return (concat result)) + +-- | Parses list of arguments of LaTeX command. +commandArgs = many bracketedText + +-- | Parses LaTeX command, returns (name, star, option, list of arguments). +command = try (do + char '\\' + name <- many1 alphaNum + star <- option "" (string "*") -- some commands have starred versions + opt <- commandOpt + args <- commandArgs + return (name, star, opt, args)) + +begin name = try (do + string "\\begin{" + string name + char '}' + option "" commandOpt + option [] commandArgs + spaces + return name) + +end name = try (do + string "\\end{" + string name + char '}' + spaces + return name) + +-- | Returns a list of block elements containing the contents of an environment. +environment name = try (do + begin name + spaces + contents <- manyTill block (end name) + return contents) + +anyEnvironment = try (do + string "\\begin{" + name <- many alphaNum + star <- option "" (string "*") -- some environments have starred variants + char '}' + option "" commandOpt + option [] commandArgs + spaces + contents <- manyTill block (end (name ++ star)) + return (BlockQuote contents)) + +-- +-- parsing documents +-- + +-- | Skip everything up through \begin{document} +skipLaTeXHeader = try (do + manyTill anyChar (begin "document") + spaces + return "") + +-- | Parse LaTeX and return 'Pandoc'. +parseLaTeX = do + option "" skipLaTeXHeader -- if parsing a fragment, this might not be present + blocks <- parseBlocks + spaces + option "" (string "\\end{document}") -- if parsing a fragment, this might not be present + spaces + eof + state <- getState + let keyBlocks = stateKeyBlocks state + let noteBlocks = stateNoteBlocks state + let blocks' = filter (/= Null) blocks + return (Pandoc (Meta [] [] "") (blocks' ++ (reverse noteBlocks) ++ (reverse keyBlocks))) + +-- +-- parsing blocks +-- + +parseBlocks = do + spaces + result <- many block + return result + +block = choice [ hrule, codeBlock, header, list, blockQuote, mathBlock, comment, + bibliographic, para, specialEnvironment, itemBlock, unknownEnvironment, + unknownCommand ] "block" + +-- +-- header blocks +-- + +header = choice (map headerLevel (enumFromTo 1 5)) "header" + +headerLevel n = try (do + let subs = concat $ replicate (n - 1) "sub" + string ("\\" ++ subs ++ "section") + option ' ' (char '*') + char '{' + title <- manyTill inline (char '}') + spaces + return (Header n (normalizeSpaces title))) + +-- +-- hrule block +-- + +hrule = try (do + oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n", "\\newpage" ] + spaces + return HorizontalRule) + +-- +-- code blocks +-- + +codeBlock = try (do + string "\\begin{verbatim}" -- don't use begin function because it gobbles whitespace + option "" blanklines -- we want to gobble blank lines, but not leading space + contents <- manyTill anyChar (try (string "\\end{verbatim}")) + spaces + return (CodeBlock (stripTrailingNewlines contents))) + +-- +-- block quotes +-- + +blockQuote = choice [ blockQuote1, blockQuote2 ] "blockquote" + +blockQuote1 = try (do + blocks <- environment "quote" + spaces + return (BlockQuote blocks)) + +blockQuote2 = try (do + blocks <- environment "quotation" + spaces + return (BlockQuote blocks)) + +-- +-- math block +-- + +mathBlock = mathBlockWith (begin "equation") (end "equation") <|> + mathBlockWith (begin "displaymath") (end "displaymath") <|> + mathBlockWith (string "\\[") (string "\\]") "math block" + +mathBlockWith start end = try (do + start + spaces + result <- manyTill anyChar end + spaces + return (BlockQuote [Para [TeX ("$" ++ result ++ "$")]])) + +-- +-- list blocks +-- + +list = bulletList <|> orderedList "list" + +listItem = try (do + ("item", _, _, _) <- command + spaces + state <- getState + let oldParserContext = stateParserContext state + updateState (\state -> state {stateParserContext = ListItemState}) + blocks <- many block + updateState (\state -> state {stateParserContext = oldParserContext}) + return blocks) + +orderedList = try (do + begin "enumerate" + spaces + items <- many listItem + end "enumerate" + spaces + return (OrderedList items)) + +bulletList = try (do + begin "itemize" + spaces + items <- many listItem + end "itemize" + spaces + return (BulletList items)) + +-- +-- paragraph block +-- + +para = try (do + result <- many1 inline + spaces + return (Para (normalizeSpaces result))) + +-- +-- title authors date +-- + +bibliographic = choice [ maketitle, title, authors, date ] + +maketitle = try (do + string "\\maketitle" + spaces + return Null) + +title = try (do + string "\\title{" + tit <- manyTill inline (char '}') + spaces + updateState (\state -> state { stateTitle = tit }) + return Null) + +authors = try (do + string "\\author{" + authors <- manyTill anyChar (char '}') + spaces + let authors' = map removeLeadingTrailingSpace $ lines $ gsub "\\\\" "\n" authors + updateState (\state -> state { stateAuthors = authors' }) + return Null) + +date = try (do + string "\\date{" + date' <- manyTill anyChar (char '}') + spaces + updateState (\state -> state { stateDate = date' }) + return Null) + +-- +-- item block +-- for use in unknown environments that aren't being parsed as raw latex +-- + +-- this forces items to be parsed in different blocks +itemBlock = try (do + ("item", _, opt, _) <- command + state <- getState + if (stateParserContext state == ListItemState) then + fail "item should be handled by list block" + else + if null opt then + return Null + else + return (Plain [Str opt])) + +-- +-- raw LaTeX +-- + +specialEnvironment = do -- these are always parsed as raw + followedBy' (choice (map (\name -> begin name) ["tabular", "figure", "tabbing", "eqnarry", + "picture", "table", "verse", "theorem"])) + rawLaTeXEnvironment + +-- | Parse any LaTeX environment and return a Para block containing +-- the whole literal environment as raw TeX. +rawLaTeXEnvironment :: GenParser Char st Block +rawLaTeXEnvironment = try (do + string "\\begin" + char '{' + name <- many1 alphaNum + star <- option "" (string "*") -- for starred variants + let name' = name ++ star + char '}' + opt <- option "" commandOpt + args <- option [] commandArgs + let optStr = if (null opt) then "" else "[" ++ opt ++ "]" + let argStr = concatMap (\arg -> ("{" ++ arg ++ "}")) args + contents <- manyTill (choice [(many1 (noneOf "\\")), + (do{ (Para [TeX str]) <- rawLaTeXEnvironment; return str }), + string "\\"]) (end name') + spaces + return (Para [TeX ("\\begin{" ++ name' ++ "}" ++ optStr ++ argStr ++ + (concat contents) ++ "\\end{" ++ name' ++ "}")])) + +unknownEnvironment = try (do + state <- getState + result <- if stateParseRaw state then -- check to see whether we should include raw TeX + rawLaTeXEnvironment -- if so, get the whole raw environment + else + anyEnvironment -- otherwise just the contents + return result) + +unknownCommand = try (do + notFollowedBy' (string "\\end{itemize}") + notFollowedBy' (string "\\end{enumerate}") + notFollowedBy' (string "\\end{document}") + (name, star, opt, args) <- command + spaces + let optStr = if null opt then "" else "[" ++ opt ++ "]" + let argStr = concatMap (\arg -> ("{" ++ arg ++ "}")) args + state <- getState + if (name == "item") && ((stateParserContext state) == ListItemState) then + fail "should not be parsed as raw" + else + string "" + if stateParseRaw state then + return (Plain [TeX ("\\" ++ name ++ star ++ optStr ++ argStr)]) + else + return (Plain [Str (joinWithSep " " args)])) + +-- latex comment +comment = try (do + char '%' + result <- manyTill anyChar newline + spaces + return Null) + +-- +-- inline +-- + +inline = choice [ strong, emph, ref, lab, code, linebreak, math, ldots, accentedChar, + specialChar, specialInline, escapedChar, unescapedChar, str, + endline, whitespace ] "inline" + +specialInline = choice [ link, image, footnote, rawLaTeXInline ] + "link, raw TeX, note, or image" + +ldots = try (do + string "\\ldots" + return (Str "...")) + +accentedChar = normalAccentedChar <|> specialAccentedChar + +normalAccentedChar = try (do + char '\\' + accent <- oneOf "'`^\"~" + character <- choice [ between (char '{') (char '}') anyChar, anyChar ] + let table = fromMaybe [] $ lookup character accentTable + let result = case lookup accent table of + Just num -> chr num + Nothing -> '?' + return (Str [result])) + +-- an association list of letters and association list of accents +-- and decimal character numbers. +accentTable = + [ ('A', [('`', 192), ('\'', 193), ('^', 194), ('~', 195), ('"', 196)]), + ('E', [('`', 200), ('\'', 201), ('^', 202), ('"', 203)]), + ('I', [('`', 204), ('\'', 205), ('^', 206), ('"', 207)]), + ('N', [('~', 209)]), + ('O', [('`', 210), ('\'', 211), ('^', 212), ('~', 213), ('"', 214)]), + ('U', [('`', 217), ('\'', 218), ('^', 219), ('"', 220)]), + ('a', [('`', 224), ('\'', 225), ('^', 227), ('"', 228)]), + ('e', [('`', 232), ('\'', 233), ('^', 234), ('"', 235)]), + ('i', [('`', 236), ('\'', 237), ('^', 238), ('"', 239)]), + ('n', [('~', 241)]), + ('o', [('`', 242), ('\'', 243), ('^', 244), ('~', 245), ('"', 246)]), + ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ] + +specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig, oslash, pound, + euro, copyright, sect ] + +ccedil = try (do + char '\\' + letter <- choice [try (string "cc"), try (string "cC")] + let num = if letter == "cc" then 231 else 199 + return (Str [chr num])) + +aring = try (do + char '\\' + letter <- choice [try (string "aa"), try (string "AA")] + let num = if letter == "aa" then 229 else 197 + return (Str [chr num])) + +iuml = try (do + string "\\\"" + choice [try (string "\\i"), try (string "{\\i}")] + return (Str [chr 239])) + +icirc = try (do + string "\\^" + choice [try (string "\\i"), try (string "{\\i}")] + return (Str [chr 238])) + +szlig = try (do + string "\\ss" + return (Str [chr 223])) + +oslash = try (do + char '\\' + letter <- choice [char 'o', char 'O'] + let num = if letter == 'o' then 248 else 216 + return (Str [chr num])) + +aelig = try (do + char '\\' + letter <- choice [try (string "ae"), try (string "AE")] + let num = if letter == "ae" then 230 else 198 + return (Str [chr num])) + +pound = try (do + string "\\pounds" + return (Str [chr 163])) + +euro = try (do + string "\\euro" + return (Str [chr 8364])) + +copyright = try (do + string "\\copyright" + return (Str [chr 169])) + +sect = try (do + string "\\S" + return (Str [chr 167])) + +escapedChar = escaped (oneOf " $%^&_#{}") + +unescapedChar = do -- ignore standalone, nonescaped special characters + oneOf "$^&_#{}|<>" + return (Str "") + +specialChar = choice [ backslash, bar, lt, gt ] + +backslash = try (do + string "\\textbackslash" + return (Str "\\")) + +bar = try (do + string "\\textbar" + return (Str "\\")) + +lt = try (do + string "\\textless" + return (Str "<")) + +gt = try (do + string "\\textgreater" + return (Str ">")) + +code = try (do + string "\\verb" + marker <- anyChar + result <- manyTill anyChar (char marker) + let result' = removeLeadingTrailingSpace result + return (Code result')) + +emph = try (do + oneOfStrings [ "\\emph{", "\\textit{" ] + result <- manyTill inline (char '}') + return (Emph result)) + +lab = try (do + string "\\label{" + result <- manyTill anyChar (char '}') + return (Str ("(" ++ result ++ ")"))) + +ref = try (do + string "\\ref{" + result <- manyTill anyChar (char '}') + return (Str (result))) + +strong = try (do + string "\\textbf{" + result <- manyTill inline (char '}') + return (Strong result)) + +whitespace = do + many1 (oneOf "~ \t") + return Space + +-- hard line break +linebreak = try (do + string "\\\\" + return LineBreak) + +str = do + result <- many1 (noneOf specialChars) + return (Str (normalizePunctuation result)) + +-- endline internal to paragraph +endline = try (do + newline + notFollowedBy blankline + return Space) + +-- math +math = math1 <|> math2 "math" + +math1 = try (do + char '$' + result <- many (noneOf "$") + char '$' + return (TeX ("$" ++ result ++ "$"))) + +math2 = try (do + string "\\(" + result <- many (noneOf "$") + string "\\)" + return (TeX ("$" ++ result ++ "$"))) + +-- +-- links and images +-- + +link = try (do + string "\\href{" + url <- manyTill anyChar (char '}') + char '{' + label <- manyTill inline (char '}') + ref <- generateReference url "" + return (Link (normalizeSpaces label) ref)) + +image = try (do + ("includegraphics", _, _, (src:lst)) <- command + return (Image [Str "image"] (Src src ""))) + +footnote = try (do + ("footnote", _, _, (contents:[])) <- command + let blocks = case runParser parseBlocks defaultParserState "footnote" contents of + Left err -> error $ "Input:\n" ++ show contents ++ + "\nError:\n" ++ show err + Right result -> result + state <- getState + let notes = stateNoteBlocks state + let nextRef = case notes of + [] -> "1" + (Note ref body):rest -> (show ((read ref) + 1)) + setState (state { stateNoteBlocks = (Note nextRef blocks):notes }) + return (NoteRef nextRef)) + +-- | Parse any LaTeX command and return it in a raw TeX inline element. +rawLaTeXInline :: GenParser Char ParserState Inline +rawLaTeXInline = try (do + (name, star, opt, args) <- command + let optStr = if (null opt) then "" else "[" ++ opt ++ "]" + let argStr = concatMap (\arg -> "{" ++ arg ++ "}") args + state <- getState + if ((name == "begin") || (name == "end") || (name == "item")) then + fail "not an inline command" + else + string "" + return (TeX ("\\" ++ name ++ star ++ optStr ++ argStr))) diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs new file mode 100644 index 000000000..60ac40fd7 --- /dev/null +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -0,0 +1,582 @@ +-- | Convert markdown to Pandoc document. +module Text.Pandoc.Readers.Markdown ( + readMarkdown + ) where + +import Text.ParserCombinators.Pandoc +import Text.Pandoc.Definition +import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment ) +import Text.Pandoc.Shared +import Text.Pandoc.Readers.HTML ( rawHtmlInline, rawHtmlBlock, anyHtmlBlockTag, + anyHtmlInlineTag ) +import Text.Pandoc.HtmlEntities ( decodeEntities ) +import Text.Regex ( matchRegex, mkRegex ) +import Text.ParserCombinators.Parsec + +-- | Read markdown from an input string and return a Pandoc document. +readMarkdown :: ParserState -> String -> Pandoc +readMarkdown = readWith parseMarkdown + +-- | Parse markdown string with default options and print result (for testing). +testString :: String -> IO () +testString = testStringWith parseMarkdown + +-- +-- Constants and data structure definitions +-- + +spaceChars = " \t" +endLineChars = "\n" +labelStart = '[' +labelEnd = ']' +labelSep = ':' +srcStart = '(' +srcEnd = ')' +imageStart = '!' +noteStart = '^' +codeStart = '`' +codeEnd = '`' +emphStart = '*' +emphEnd = '*' +emphStartAlt = '_' +emphEndAlt = '_' +autoLinkStart = '<' +autoLinkEnd = '>' +mathStart = '$' +mathEnd = '$' +bulletListMarkers = "*+-" +orderedListDelimiters = "." +escapeChar = '\\' +hruleChars = "*-_" +quoteChars = "'\"" +atxHChar = '#' +titleOpeners = "\"'(" +setextHChars = ['=','-'] +blockQuoteChar = '>' +hyphenChar = '-' + +-- treat these as potentially non-text when parsing inline: +specialChars = [escapeChar, labelStart, labelEnd, emphStart, emphEnd, emphStartAlt, + emphEndAlt, codeStart, codeEnd, autoLinkEnd, autoLinkStart, mathStart, + mathEnd, imageStart, noteStart, hyphenChar] + +-- +-- auxiliary functions +-- + +-- | Skip a single endline if there is one. +skipEndline = option Space endline + +indentSpaces = do + state <- getState + let tabStop = stateTabStop state + oneOfStrings [ "\t", (replicate tabStop ' ') ] "indentation" + +skipNonindentSpaces = do + state <- getState + let tabStop = stateTabStop state + choice (map (\n -> (try (count n (char ' ')))) (reverse [0..(tabStop - 1)])) + +-- +-- document structure +-- + +titleLine = try (do + char '%' + skipSpaces + line <- manyTill inline newline + return line) + +authorsLine = try (do + char '%' + skipSpaces + authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;") + newline + return (map removeLeadingTrailingSpace authors)) + +dateLine = try (do + char '%' + skipSpaces + date <- many (noneOf "\n") + newline + return (removeTrailingSpace date)) + +titleBlock = try (do + title <- option [] titleLine + author <- option [] authorsLine + date <- option "" dateLine + option "" blanklines + return (title, author, date)) + +parseMarkdown = do + updateState (\state -> state { stateParseRaw = True }) -- need to parse raw HTML + (title, author, date) <- option ([],[],"") titleBlock + blocks <- parseBlocks + state <- getState + let keys = reverse $ stateKeyBlocks state + return (Pandoc (Meta title author date) (blocks ++ keys)) + +-- +-- parsing blocks +-- + +parseBlocks = do + result <- manyTill block eof + return result + +block = choice [ codeBlock, referenceKey, note, header, hrule, list, blockQuote, rawHtmlBlocks, + rawLaTeXEnvironment, para, plain, blankBlock, nullBlock ] "block" + +-- +-- header blocks +-- + +header = choice [ setextHeader, atxHeader ] "header" + +atxHeader = try (do + lead <- many1 (char atxHChar) + skipSpaces + txt <- many1 (do {notFollowedBy' atxClosing; inline}) + atxClosing + return (Header (length lead) (normalizeSpaces txt))) + +atxClosing = try (do + skipMany (char atxHChar) + skipSpaces + newline + option "" blanklines) + +setextHeader = choice (map (\x -> setextH x) (enumFromTo 1 (length setextHChars))) + +setextH n = try (do + txt <- many1 (do {notFollowedBy newline; inline}) + endline + many1 (char (setextHChars !! (n-1))) + skipSpaces + newline + option "" blanklines + return (Header n (normalizeSpaces txt))) + +-- +-- hrule block +-- + +hruleWith chr = + try (do + skipSpaces + char chr + skipSpaces + char chr + skipSpaces + char chr + skipMany (oneOf (chr:spaceChars)) + newline + option "" blanklines + return HorizontalRule) + +hrule = choice (map hruleWith hruleChars) "hrule" + +-- +-- code blocks +-- + +indentedLine = try (do + indentSpaces + result <- manyTill anyChar newline + return (result ++ "\n")) + +-- two or more indented lines, possibly separated by blank lines +indentedBlock = try (do + res1 <- indentedLine + blanks <- many blankline + res2 <- choice [indentedBlock, indentedLine] + return (res1 ++ blanks ++ res2)) + +codeBlock = do + result <- choice [indentedBlock, indentedLine] + option "" blanklines + return (CodeBlock result) + +-- +-- note block +-- + +note = try (do + (NoteRef ref) <- noteRef + skipSpaces + raw <- sepBy (many (choice [nonEndline, + (try (do {endline; notFollowedBy (char noteStart); return '\n'})) + ])) (try (do {newline; char noteStart; option ' ' (char ' ')})) + newline + blanklines + -- parse the extracted block, which may contain various block elements: + state <- getState + let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState}) "block" ((joinWithSep "\n" raw) ++ "\n\n") of + Left err -> error $ "Raw block:\n" ++ show raw ++ "\nError:\n" ++ show err + Right result -> result + return (Note ref parsed)) + +-- +-- block quotes +-- + +emacsBoxQuote = try (do + string ",----" + manyTill anyChar newline + raw <- manyTill (try (do{ char '|'; + option ' ' (char ' '); + result <- manyTill anyChar newline; + return result})) + (string "`----") + manyTill anyChar newline + option "" blanklines + return raw) + +emailBlockQuoteStart = try (do + skipNonindentSpaces + char blockQuoteChar + option ' ' (char ' ') + return "> ") + +emailBlockQuote = try (do + emailBlockQuoteStart + raw <- sepBy (many (choice [nonEndline, + (try (do{ endline; + notFollowedBy' emailBlockQuoteStart; + return '\n'}))])) + (try (do {newline; emailBlockQuoteStart})) + newline <|> (do{ eof; return '\n'}) + option "" blanklines + return raw) + +blockQuote = do + raw <- choice [ emailBlockQuote, emacsBoxQuote ] + -- parse the extracted block, which may contain various block elements: + state <- getState + let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState}) "block" ((joinWithSep "\n" raw) ++ "\n\n") of + Left err -> error $ "Raw block:\n" ++ show raw ++ "\nError:\n" ++ show err + Right result -> result + return (BlockQuote parsed) + +-- +-- list blocks +-- + +list = choice [ bulletList, orderedList ] "list" + +bulletListStart = + try (do + option ' ' newline -- if preceded by a Plain block in a list context + skipNonindentSpaces + notFollowedBy' hrule -- because hrules start out just like lists + oneOf bulletListMarkers + spaceChar + skipSpaces) + +orderedListStart = + try (do + option ' ' newline -- if preceded by a Plain block in a list context + skipNonindentSpaces + many1 digit + oneOf orderedListDelimiters + oneOf spaceChars + skipSpaces) + +-- parse a line of a list item (start = parser for beginning of list item) +listLine start = try (do + notFollowedBy' start + notFollowedBy blankline + notFollowedBy' (try (do{ indentSpaces; + many (spaceChar); + choice [bulletListStart, orderedListStart]})) + line <- manyTill anyChar newline + return (line ++ "\n")) + +-- parse raw text for one list item, excluding start marker and continuations +rawListItem start = + try (do + start + result <- many1 (listLine start) + blanks <- many blankline + return ((concat result) ++ blanks)) + +-- continuation of a list item - indented and separated by blankline +-- or (in compact lists) endline. +-- note: nested lists are parsed as continuations +listContinuation start = + try (do + followedBy' indentSpaces + result <- many1 (listContinuationLine start) + blanks <- many blankline + return ((concat result) ++ blanks)) + +listContinuationLine start = try (do + notFollowedBy blankline + notFollowedBy' start + option "" indentSpaces + result <- manyTill anyChar newline + return (result ++ "\n")) + +listItem start = + try (do + first <- rawListItem start + rest <- many (listContinuation start) + -- parsing with ListItemState forces markers at beginning of lines to + -- count as list item markers, even if not separated by blank space. + -- see definition of "endline" + state <- getState + let parsed = case runParser parseBlocks (state {stateParserContext = ListItemState}) + "block" raw of + Left err -> error $ "Raw block:\n" ++ raw ++ "\nError:\n" ++ show err + Right result -> result + where raw = concat (first:rest) + return parsed) + +orderedList = + try (do + items <- many1 (listItem orderedListStart) + let items' = compactify items + return (OrderedList items')) + +bulletList = + try (do + items <- many1 (listItem bulletListStart) + let items' = compactify items + return (BulletList items')) + +-- +-- paragraph block +-- + +para = try (do + result <- many1 inline + newline + choice [ (do{ followedBy' (oneOfStrings [">", ",----"]); return "" }), blanklines ] + let result' = normalizeSpaces result + return (Para result')) + +plain = do + result <- many1 inline + let result' = normalizeSpaces result + return (Plain result') + +-- +-- raw html +-- + +rawHtmlBlocks = try (do + htmlBlocks <- many1 rawHtmlBlock + let combined = concatMap (\(RawHtml str) -> str) htmlBlocks + let combined' = if (last combined == '\n') then + init combined -- strip extra newline + else + combined + return (RawHtml combined')) + +-- +-- reference key +-- + +referenceKey = + try (do + skipSpaces + label <- reference + char labelSep + skipSpaces + option ' ' (char autoLinkStart) + src <- many (noneOf (titleOpeners ++ [autoLinkEnd] ++ endLineChars)) + option ' ' (char autoLinkEnd) + tit <- option "" title + blanklines + return (Key label (Src (removeTrailingSpace src) tit))) + +-- +-- inline +-- + +text = choice [ math, strong, emph, code2, code1, str, linebreak, tabchar, + whitespace, endline ] "text" + +inline = choice [ rawLaTeXInline, escapedChar, special, hyphens, text, ltSign, symbol ] "inline" + +special = choice [ link, referenceLink, rawHtmlInline, autoLink, + image, noteRef ] "link, inline html, note, or image" + +escapedChar = escaped anyChar + +ltSign = do + notFollowedBy' rawHtmlBlocks -- don't return < if it starts html + char '<' + return (Str ['<']) + +specialCharsMinusLt = filter (/= '<') specialChars + +symbol = do + result <- oneOf specialCharsMinusLt + return (Str [result]) + +hyphens = try (do + result <- many1 (char '-') + if (length result) == 1 then + skipEndline -- don't want to treat endline after hyphen as a space + else + do{ string ""; return Space } + return (Str result)) + +-- parses inline code, between codeStart and codeEnd +code1 = + try (do + char codeStart + result <- many (noneOf [codeEnd]) + char codeEnd + let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result -- get rid of any internal newlines + return (Code result')) + +-- parses inline code, between 2 codeStarts and 2 codeEnds +code2 = + try (do + string [codeStart, codeStart] + result <- manyTill anyChar (try (string [codeEnd, codeEnd])) + let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result -- get rid of any internal newlines + return (Code result')) + +mathWord = many1 (choice [(noneOf (" \t\n\\" ++ [mathEnd])), (try (do {c <- char '\\'; notFollowedBy (char mathEnd); return c}))]) + +math = try (do + char mathStart + notFollowedBy space + words <- sepBy1 mathWord (many1 space) + char mathEnd + return (TeX ("$" ++ (joinWithSep " " words) ++ "$"))) + +emph = do + result <- choice [ (enclosed (char emphStart) (char emphEnd) inline), + (enclosed (char emphStartAlt) (char emphEndAlt) inline) ] + return (Emph (normalizeSpaces result)) + +strong = do + result <- choice [ (enclosed (count 2 (char emphStart)) (count 2 (char emphEnd)) inline), + (enclosed (count 2 (char emphStartAlt)) (count 2 (char emphEndAlt)) inline)] + return (Strong (normalizeSpaces result)) + +whitespace = do + many1 (oneOf spaceChars) "whitespace" + return Space + +tabchar = do + tab + return (Str "\t") + +-- hard line break +linebreak = try (do + oneOf spaceChars + many1 (oneOf spaceChars) + endline + return LineBreak ) + +nonEndline = noneOf endLineChars + +str = do + result <- many1 ((noneOf (specialChars ++ spaceChars ++ endLineChars))) + return (Str (decodeEntities result)) + +-- an endline character that can be treated as a space, not a structural break +endline = + try (do + newline + -- next line would allow block quotes without preceding blank line + -- Markdown.pl does allow this, but there's a chance of a wrapped + -- greater-than sign triggering a block quote by accident... +-- notFollowedBy (try (do { choice [emailBlockQuoteStart, string ",----"]; return ' ' })) + notFollowedBy blankline + -- parse potential list starts at beginning of line differently if in a list: + st <- getState + if (stateParserContext st) == ListItemState then + do + notFollowedBy' orderedListStart + notFollowedBy' bulletListStart + else + option () pzero + return Space) + +-- +-- links +-- + +-- a reference label for a link +reference = do + char labelStart + label <- manyTill inline (char labelEnd) + return (normalizeSpaces label) + +-- source for a link, with optional title +source = + try (do + char srcStart + option ' ' (char autoLinkStart) + src <- many (noneOf ([srcEnd, autoLinkEnd] ++ titleOpeners)) + option ' ' (char autoLinkEnd) + tit <- option "" title + skipSpaces + char srcEnd + return (Src (removeTrailingSpace src) tit)) + +titleWith startChar endChar = + try (do + skipSpaces + skipEndline -- a title can be on the next line from the source + skipSpaces + char startChar + tit <- manyTill (choice [ try (do {char '\\'; char endChar}), + (noneOf (endChar:endLineChars)) ]) (char endChar) + let tit' = gsub "\"" """ tit + return tit') + +title = choice [titleWith '(' ')', titleWith '"' '"', titleWith '\'' '\''] "title" + +link = choice [explicitLink, referenceLink] "link" + +explicitLink = + try (do + label <- reference + src <- source + return (Link label src)) + +referenceLink = choice [referenceLinkDouble, referenceLinkSingle] + +referenceLinkDouble = -- a link like [this][/url/] + try (do + label <- reference + skipSpaces + skipEndline + skipSpaces + ref <- reference + return (Link label (Ref ref))) + +referenceLinkSingle = -- a link like [this] + try (do + label <- reference + return (Link label (Ref []))) + +autoLink = -- a link + try (do + notFollowedBy (do {anyHtmlBlockTag; return ' '}) + src <- between (char autoLinkStart) (char autoLinkEnd) + (many (noneOf (spaceChars ++ endLineChars ++ [autoLinkEnd]))) + case (matchRegex emailAddress src) of + Just _ -> return (Link [Str src] (Src ("mailto:" ++ src) "")) + Nothing -> return (Link [Str src] (Src src ""))) + +emailAddress = mkRegex "([^@:/]+)@(([^.]+[.]?)*([^.]+))" -- presupposes no whitespace + +image = + try (do + char imageStart + (Link label src) <- link + return (Image label src)) + +noteRef = try (do + char noteStart + ref <- between (char '(') (char ')') (many1 (noneOf " \t\n)")) + return (NoteRef ref)) + diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs new file mode 100644 index 000000000..82e5ea303 --- /dev/null +++ b/src/Text/Pandoc/Readers/RST.hs @@ -0,0 +1,644 @@ +-- | Parse reStructuredText and return Pandoc document. +module Text.Pandoc.Readers.RST ( + readRST + ) where +import Text.Pandoc.Definition +import Text.ParserCombinators.Pandoc +import Text.Pandoc.Shared +import Text.Pandoc.Readers.HTML ( anyHtmlBlockTag, anyHtmlInlineTag ) +import Text.Regex ( matchRegex, mkRegex ) +import Text.ParserCombinators.Parsec +import Data.Maybe ( fromMaybe ) +import List ( findIndex ) +import Char ( toUpper ) + +-- | Parse reStructuredText string and return Pandoc document. +readRST :: ParserState -> String -> Pandoc +readRST = readWith parseRST + +-- | Parse a string and print result (for testing). +testString :: String -> IO () +testString = testStringWith parseRST + +-- +-- Constants and data structure definitions +--- + +bulletListMarkers = "*+-" +underlineChars = "!\"#$&'()*+,-./:;<=>?@[\\]^_`{|}~" + +-- treat these as potentially non-text when parsing inline: +specialChars = "\\`|*_<>$:[-" + +-- +-- parsing documents +-- + +isAnonKeyBlock block = case block of + (Key [Str "_"] str) -> True + otherwise -> False + +isNotAnonKeyBlock block = not (isAnonKeyBlock block) + +isHeader1 :: Block -> Bool +isHeader1 (Header 1 _) = True +isHeader1 _ = False + +isHeader2 :: Block -> Bool +isHeader2 (Header 2 _) = True +isHeader2 _ = False + +-- | Promote all headers in a list of blocks. (Part of +-- title transformation for RST.) +promoteHeaders :: Int -> [Block] -> [Block] +promoteHeaders num ((Header level text):rest) = + (Header (level - num) text):(promoteHeaders num rest) +promoteHeaders num (other:rest) = other:(promoteHeaders num rest) +promoteHeaders num [] = [] + +-- | If list of blocks starts with a header (or a header and subheader) +-- of level that are not found elsewhere, return it as a title and +-- promote all the other headers. +titleTransform :: [Block] -- ^ list of blocks + -> ([Block], [Inline]) -- ^ modified list of blocks, title +titleTransform ((Header 1 head1):(Header 2 head2):rest) = -- title and subtitle + if (any isHeader1 rest) || (any isHeader2 rest) then + ((Header 1 head1):(Header 2 head2):rest, []) + else + ((promoteHeaders 2 rest), head1 ++ [Str ":", Space] ++ head2) +titleTransform ((Header 1 head1):rest) = -- title, no subtitle + if (any isHeader1 rest) then + ((Header 1 head1):rest, []) + else + ((promoteHeaders 1 rest), head1) +titleTransform blocks = (blocks, []) + +parseRST = do + state <- getState + input <- getInput + blocks <- parseBlocks -- first pass + let anonymousKeys = filter isAnonKeyBlock blocks + let blocks' = if (null anonymousKeys) then + blocks + else -- run parser again to fill in anonymous links... + case runParser parseBlocks (state { stateKeyBlocks = anonymousKeys }) + "RST source, second pass" input of + Left err -> error $ "\nError:\n" ++ show err + Right result -> (filter isNotAnonKeyBlock result) + let (blocks'', title) = if stateStandalone state then + titleTransform blocks' + else + (blocks', []) + state <- getState + let authors = stateAuthors state + let date = stateDate state + let title' = if (null title) then (stateTitle state) else title + return (Pandoc (Meta title' authors date) blocks'') + +-- +-- parsing blocks +-- + +parseBlocks = do + result <- manyTill block eof + return result + +block = choice [ codeBlock, rawHtmlBlock, rawLaTeXBlock, blockQuote, referenceKey, + imageBlock, unknownDirective, header, hrule, list, fieldList, lineBlock, + para, plain, blankBlock, nullBlock ] "block" + +-- +-- field list +-- + +fieldListItem = try (do + char ':' + name <- many1 alphaNum + string ": " + skipSpaces + first <- manyTill anyChar newline + rest <- many (do{ notFollowedBy (char ':'); + notFollowedBy blankline; + skipSpaces; + manyTill anyChar newline }) + return (name, (joinWithSep " " (first:rest)))) + +fieldList = try (do + items <- many1 fieldListItem + blanklines + let authors = case (lookup "Authors" items) of + Just auth -> [auth] + Nothing -> map snd (filter (\(x,y) -> x == "Author") items) + let date = case (lookup "Date" items) of + Just dat -> dat + Nothing -> "" + let title = case (lookup "Title" items) of + Just tit -> [Str tit] + Nothing -> [] + let remaining = filter (\(x,y) -> (x /= "Authors") && (x /= "Author") && (x /= "Date") && + (x /= "Title")) items + let result = map (\(x,y) -> Para [Strong [Str x], Str ":", Space, Str y]) remaining + updateState (\st -> st { stateAuthors = authors, stateDate = date, stateTitle = title }) + return (BlockQuote result)) + +-- +-- line block +-- + +lineBlockLine = try (do + string "| " + white <- many (oneOf " \t") + line <- manyTill inline newline + let line' = (if null white then [] else [Str white]) ++ line ++ [LineBreak] + return line') + +lineBlock = try (do + lines <- many1 lineBlockLine + blanklines + return $ Para (concat lines)) + +-- +-- paragraph block +-- + +para = choice [ paraBeforeCodeBlock, paraNormal ] "paragraph" + +codeBlockStart = try (do + string "::" + blankline + blankline) + +-- paragraph that ends in a :: starting a code block +paraBeforeCodeBlock = try (do + result <- many1 (do {notFollowedBy' codeBlockStart; inline}) + followedBy' (string "::") + return (Para (if (last result == Space) then + normalizeSpaces result + else + (normalizeSpaces result) ++ [Str ":"]))) + +-- regular paragraph +paraNormal = try (do + result <- many1 inline + newline + blanklines + let result' = normalizeSpaces result + return (Para result')) + +plain = do + result <- many1 inline + let result' = normalizeSpaces result + return (Plain result') + +-- +-- image block +-- + +imageBlock = try (do + string ".. image:: " + src <- manyTill anyChar newline + return (Plain [Image [Str "image"] (Src src "")])) + +-- +-- header blocks +-- + +header = choice [ doubleHeader, singleHeader ] "header" + +-- a header with lines on top and bottom +doubleHeader = try (do + c <- oneOf underlineChars + rest <- many (char c) -- the top line + let lenTop = length (c:rest) + skipSpaces + newline + txt <- many1 (do {notFollowedBy blankline; inline}) + pos <- getPosition + let len = (sourceColumn pos) - 1 + if (len > lenTop) then fail "title longer than border" else (do {return ()}) + blankline -- spaces and newline + count lenTop (char c) -- the bottom line + blanklines + -- check to see if we've had this kind of header before. + -- if so, get appropriate level. if not, add to list. + state <- getState + let headerTable = stateHeaderTable state + let (headerTable', level) = case findIndex (== DoubleHeader c) headerTable of + Just ind -> (headerTable, ind + 1) + Nothing -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1) + setState (state { stateHeaderTable = headerTable' }) + return (Header level (normalizeSpaces txt))) + +-- a header with line on the bottom only +singleHeader = try (do + notFollowedBy' whitespace + txt <- many1 (do {notFollowedBy blankline; inline}) + pos <- getPosition + let len = (sourceColumn pos) - 1 + blankline + c <- oneOf underlineChars + rest <- count (len - 1) (char c) + many (char c) + blanklines + state <- getState + let headerTable = stateHeaderTable state + let (headerTable', level) = case findIndex (== SingleHeader c) headerTable of + Just ind -> (headerTable, ind + 1) + Nothing -> (headerTable ++ [SingleHeader c], (length headerTable) + 1) + setState (state { stateHeaderTable = headerTable' }) + return (Header level (normalizeSpaces txt))) + +-- +-- hrule block +-- + +hruleWith chr = + try (do + count 4 (char chr) + skipMany (char chr) + skipSpaces + newline + blanklines + return HorizontalRule) + +hrule = choice (map hruleWith underlineChars) "hrule" + +-- +-- code blocks +-- + +-- read a line indented by a given string +indentedLine indents = try (do + string indents + result <- manyTill anyChar newline + return (result ++ "\n")) + +-- two or more indented lines, possibly separated by blank lines +-- if variable = True, then any indent will work, but it must be consistent through the block +-- if variable = False, indent should be one tab or equivalent in spaces +indentedBlock variable = try (do + state <- getState + let tabStop = stateTabStop state + indents <- if variable then + many1 (oneOf " \t") + else + oneOfStrings ["\t", (replicate tabStop ' ')] + firstline <- manyTill anyChar newline + rest <- many (choice [ indentedLine indents, + try (do {b <- blanklines; l <- indentedLine indents; return (b ++ l)})]) + option "" blanklines + return (firstline ++ "\n" ++ (concat rest))) + +codeBlock = try (do + codeBlockStart + result <- indentedBlock False -- the False means we want one tab stop indent on each line + return (CodeBlock result)) + +-- +-- raw html +-- + +rawHtmlBlock = try (do + string ".. raw:: html" + blanklines + result <- indentedBlock True + return (RawHtml result)) + +-- +-- raw latex +-- + +rawLaTeXBlock = try (do + string ".. raw:: latex" + blanklines + result <- indentedBlock True + return (Para [(TeX result)])) + +-- +-- block quotes +-- + +blockQuote = try (do + block <- indentedBlock True + -- parse the extracted block, which may contain various block elements: + state <- getState + let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState}) + "block" (block ++ "\n\n") of + Left err -> error $ "Raw block:\n" ++ show block ++ "\nError:\n" ++ show err + Right result -> result + return (BlockQuote parsed)) + +-- +-- list blocks +-- + +list = choice [ bulletList, orderedList ] "list" + +-- parses bullet list start and returns its length (inc. following whitespace) +bulletListStart = + try (do + notFollowedBy' hrule -- because hrules start out just like lists + marker <- oneOf bulletListMarkers + white <- many1 spaceChar + let len = length (marker:white) + return len) + +withPeriodSuffix parser = try (do + a <- parser + b <- char '.' + return (a ++ [b])) + +withParentheses parser = try (do + a <- char '(' + b <- parser + c <- char ')' + return ([a] ++ b ++ [c])) + +withRightParen parser = try (do + a <- parser + b <- char ')' + return (a ++ [b])) + +upcaseWord = map toUpper + +romanNumeral = do + let lowerNumerals = ["i", "ii", "iii", "iiii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx", "xxi", "xxii", "xxiii", "xxiv" ] + let upperNumerals = map upcaseWord lowerNumerals + result <- choice $ map string (lowerNumerals ++ upperNumerals) + return result + +orderedListEnumerator = choice [ many1 digit, + string "#", + count 1 letter, + romanNumeral ] + +-- parses ordered list start and returns its length (inc. following whitespace) +orderedListStart = + try (do + marker <- choice [ withPeriodSuffix orderedListEnumerator, + withParentheses orderedListEnumerator, + withRightParen orderedListEnumerator ] + white <- many1 spaceChar + let len = length (marker ++ white) + return len) + +-- parse a line of a list item +listLine markerLength = try (do + notFollowedBy blankline + indentWith markerLength + line <- manyTill anyChar newline + return (line ++ "\n")) + +-- indent by specified number of spaces (or equiv. tabs) +indentWith num = do + state <- getState + let tabStop = stateTabStop state + if (num < tabStop) then + count num (char ' ') + else + choice [ try (count num (char ' ')), + (try (do {char '\t'; count (num - tabStop) (char ' ')})) ] + +-- parse raw text for one list item, excluding start marker and continuations +rawListItem start = + try (do + markerLength <- start + firstLine <- manyTill anyChar newline + restLines <- many (listLine markerLength) + return (markerLength, (firstLine ++ "\n" ++ (concat restLines)))) + +-- continuation of a list item - indented and separated by blankline or (in compact lists) +-- endline. Note: nested lists are parsed as continuations. +listContinuation markerLength = + try (do + blanks <- many1 blankline + result <- many1 (listLine markerLength) + return (blanks ++ (concat result))) + +listItem start = + try (do + (markerLength, first) <- rawListItem start + rest <- many (listContinuation markerLength) + blanks <- choice [ try (do {b <- many blankline; followedBy' start; return b}), + many1 blankline ] -- whole list must end with blank + -- parsing with ListItemState forces markers at beginning of lines to + -- count as list item markers, even if not separated by blank space. + -- see definition of "endline" + state <- getState + let parsed = case runParser parseBlocks (state {stateParserContext = ListItemState}) + "list item" raw of + Left err -> error $ "Raw:\n" ++ raw ++ "\nError:\n" ++ show err + Right result -> result + where raw = concat (first:rest) ++ blanks + return parsed) + +orderedList = + try (do + items <- many1 (listItem orderedListStart) + let items' = compactify items + return (OrderedList items')) + +bulletList = + try (do + items <- many1 (listItem bulletListStart) + let items' = compactify items + return (BulletList items')) + +-- +-- unknown directive (e.g. comment) +-- + +unknownDirective = try (do + string ".. " + manyTill anyChar newline + many (do {string " "; + char ':'; + many1 (noneOf "\n:"); + char ':'; + many1 (noneOf "\n"); + newline}) + option "" blanklines + return Null) + +-- +-- reference key +-- + +referenceKey = choice [imageKey, anonymousKey, regularKeyQuoted, regularKey] + +imageKey = try (do + string ".. |" + ref <- manyTill inline (char '|') + skipSpaces + string "image::" + src <- manyTill anyChar newline + return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) ""))) + +anonymousKey = try (do + choice [string ".. __:", string "__"] + skipSpaces + src <- manyTill anyChar newline + state <- getState + return (Key [Str "_"] (Src (removeLeadingTrailingSpace src) ""))) + +regularKeyQuoted = try (do + string ".. _`" + ref <- manyTill inline (string "`:") + skipSpaces + src <- manyTill anyChar newline + return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) ""))) + +regularKey = try (do + string ".. _" + ref <- manyTill inline (char ':') + skipSpaces + src <- manyTill anyChar newline + return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) ""))) + + -- + -- inline + -- + +text = choice [ strong, emph, code, str, tabchar, whitespace, endline ] "text" + +inline = choice [ escapedChar, special, hyphens, text, symbol ] "inline" + +special = choice [ link, image ] "link, inline html, or image" + +hyphens = try (do + result <- many1 (char '-') + option Space endline -- don't want to treat endline after hyphen or dash as a space + return (Str result)) + +escapedChar = escaped anyChar + +symbol = do + result <- oneOf specialChars + return (Str [result]) + +-- parses inline code, between codeStart and codeEnd +code = + try (do + string "``" + result <- manyTill anyChar (string "``") + let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result + return (Code result')) + +emph = do + result <- enclosed (char '*') (char '*') inline + return (Emph (normalizeSpaces result)) + +strong = do + result <- enclosed (string "**") (string "**") inline + return (Strong (normalizeSpaces result)) + +whitespace = do + many1 spaceChar "whitespace" + return Space + +tabchar = do + tab + return (Str "\t") + +str = do + notFollowedBy' oneWordReferenceLink + result <- many1 (noneOf (specialChars ++ "\t\n ")) + return (Str result) + +-- an endline character that can be treated as a space, not a structural break +endline = + try (do + newline + notFollowedBy blankline + -- parse potential list starts at beginning of line differently if in a list: + st <- getState + if ((stateParserContext st) == ListItemState) then + notFollowedBy' (choice [orderedListStart, bulletListStart]) + else + option () pzero + return Space) + +-- +-- links +-- + +link = choice [explicitLink, referenceLink, autoLink, oneWordReferenceLink] "link" + +explicitLink = + try (do + char '`' + label <- manyTill inline (try (do {spaces; char '<'})) + src <- manyTill (noneOf ">\n ") (char '>') + skipSpaces + string "`_" + return (Link (normalizeSpaces label) (Src (removeLeadingTrailingSpace src) ""))) + +anonymousLinkEnding = + try (do + char '_' + state <- getState + let anonKeys = stateKeyBlocks state + -- if there's a list of anon key refs (from previous pass), pop one off. + -- otherwise return an anon key ref for the next pass to take care of... + case anonKeys of + (Key [Str "_"] src):rest -> + do{ setState (state { stateKeyBlocks = rest }); + return src } + otherwise -> return (Ref [Str "_"])) + +referenceLink = + try (do + char '`' + label <- manyTill inline (string "`_") + src <- option (Ref []) anonymousLinkEnding + return (Link (normalizeSpaces label) src)) + +oneWordReferenceLink = + try (do + label <- many1 alphaNum + char '_' + src <- option (Ref []) anonymousLinkEnding + notFollowedBy alphaNum -- because this_is_not a link + return (Link [Str label] src)) + +uriScheme = oneOfStrings [ "http://", "https://", "ftp://", "file://", "mailto:", + "news:", "telnet:" ] + +uri = try (do + scheme <- uriScheme + identifier <- many1 (noneOf " \t\n") + return (scheme ++ identifier)) + +autoURI = try (do + src <- uri + return (Link [Str src] (Src src ""))) + +emailChar = alphaNum <|> oneOf "-+_." + +emailAddress = try (do + firstLetter <- alphaNum + restAddr <- many emailChar + let addr = firstLetter:restAddr + char '@' + dom <- domain + return (addr ++ '@':dom)) + +domainChar = alphaNum <|> char '-' + +domain = try (do + first <- many1 domainChar + dom <- many1 (try (do{ char '.'; many1 domainChar })) + return (joinWithSep "." (first:dom))) + +autoEmail = try (do + src <- emailAddress + return (Link [Str src] (Src ("mailto:" ++ src) ""))) + +autoLink = autoURI <|> autoEmail + +-- For now, we assume that all substitution references are for images. +image = + try (do + char '|' + ref <- manyTill inline (char '|') + return (Image (normalizeSpaces ref) (Ref ref))) -- cgit v1.2.3