Diffstat (limited to 'Text/Pandoc/Readers')
-rw-r--r-- | Text/Pandoc/Readers/HTML.hs     | 496
-rw-r--r-- | Text/Pandoc/Readers/LaTeX.hs    | 652
-rw-r--r-- | Text/Pandoc/Readers/Markdown.hs | 916
-rw-r--r-- | Text/Pandoc/Readers/RST.hs      | 640
4 files changed, 2704 insertions, 0 deletions
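
All four readers added in this commit expose the same entry point, a function of type ParserState -> String -> Pandoc (readHtml, readLaTeX, readMarkdown, and readRST in the diff below). A minimal sketch of driving one of them follows; it assumes the defaultParserState value and the ParserState record fields (stateSmart, stateStrict, etc.) that Text.Pandoc.Shared provided at this point in pandoc's history, so treat the exact imports as illustrative rather than authoritative.

-- Sketch only: parse a small Markdown fragment with the reader added in this
-- commit and inspect the resulting Pandoc AST. defaultParserState is assumed
-- to be exported by Text.Pandoc.Shared in this era of the codebase.
import Text.Pandoc.Definition ( Pandoc(..), Meta(..) )
import Text.Pandoc.Shared ( ParserState(..), defaultParserState )
import Text.Pandoc.Readers.Markdown ( readMarkdown )

main :: IO ()
main = do
  let state = defaultParserState { stateSmart = True }  -- enable smart typography
      Pandoc meta blocks =
        readMarkdown state "% An example\n\nHello *world*.\n"
  print meta    -- title/author/date pulled from the '%' title block
  print blocks  -- e.g. [Para [Str "Hello",Space,Emph [Str "world"],...]]

In the real pandoc driver the same state record is tweaked per invocation (stateStrict, stateTabStop, stateColumns, stateStandalone all appear in the readers below), which is why every reader takes the full ParserState rather than individual flags.
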
diff --git a/Text/Pandoc/Readers/HTML.hs b/Text/Pandoc/Readers/HTML.hs new file mode 100644 index 000000000..70a071152 --- /dev/null +++ b/Text/Pandoc/Readers/HTML.hs @@ -0,0 +1,496 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.HTML + Copyright : Copyright (C) 2006-7 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of HTML to 'Pandoc' document. +-} +module Text.Pandoc.Readers.HTML ( + readHtml, + rawHtmlInline, + rawHtmlBlock, + anyHtmlBlockTag, + anyHtmlInlineTag, + anyHtmlTag, + anyHtmlEndTag, + htmlEndTag, + extractTagType, + htmlBlockElement + ) where + +import Text.ParserCombinators.Parsec +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.Pandoc.CharacterReferences ( characterReference, + decodeCharacterReferences ) +import Data.Maybe ( fromMaybe ) +import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf ) +import Data.Char ( toUpper, toLower, isAlphaNum ) + +-- | Convert HTML-formatted string to 'Pandoc' document. +readHtml :: ParserState -- ^ Parser state + -> String -- ^ String to parse + -> Pandoc +readHtml = readWith parseHtml + +-- +-- Constants +-- + +eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins", + "map", "area", "object", "script"] + +inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", + "br", "cite", "code", "dfn", "em", "font", "i", "img", + "input", "kbd", "label", "q", "s", "samp", "select", + "small", "span", "strike", "strong", "sub", "sup", + "textarea", "tt", "u", "var"] ++ eitherBlockOrInline + +blockHtmlTags = ["address", "blockquote", "center", "dir", "div", + "dl", "fieldset", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "hr", "isindex", "menu", "noframes", + "noscript", "ol", "p", "pre", "table", "ul", "dd", + "dt", "frameset", "li", "tbody", "td", "tfoot", + "th", "thead", "tr"] ++ eitherBlockOrInline + +-- +-- HTML utility functions +-- + +-- | Read blocks until end tag. +blocksTilEnd tag = do + blocks <- manyTill (block >>~ spaces) (htmlEndTag tag) + return $ filter (/= Null) blocks + +-- | Read inlines until end tag. +inlinesTilEnd tag = manyTill inline (htmlEndTag tag) + +-- | Parse blocks between open and close tag. +blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag + +-- | Parse inlines between open and close tag. +inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag + +-- | Extract type from a tag: e.g. 
@br@ from @\<br\>@ +extractTagType :: String -> String +extractTagType ('<':rest) = + let isSpaceOrSlash c = c `elem` "/ \n\t" in + map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest +extractTagType _ = "" + +-- | Parse any HTML tag (opening or self-closing) and return text of tag +anyHtmlTag = try $ do + char '<' + spaces + tag <- many1 alphaNum + attribs <- many htmlAttribute + spaces + ender <- option "" (string "/") + let ender' = if null ender then "" else " /" + spaces + char '>' + return $ "<" ++ tag ++ + concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" + +anyHtmlEndTag = try $ do + char '<' + spaces + char '/' + spaces + tagType <- many1 alphaNum + spaces + char '>' + return $ "</" ++ tagType ++ ">" + +htmlTag :: String -> GenParser Char st (String, [(String, String)]) +htmlTag tag = try $ do + char '<' + spaces + stringAnyCase tag + attribs <- many htmlAttribute + spaces + optional (string "/") + spaces + char '>' + return (tag, (map (\(name, content, raw) -> (name, content)) attribs)) + +-- parses a quoted html attribute value +quoted quoteChar = do + result <- between (char quoteChar) (char quoteChar) + (many (noneOf [quoteChar])) + return (result, [quoteChar]) + +htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute + +-- minimized boolean attribute +htmlMinimizedAttribute = try $ do + many1 space + name <- many1 (choice [letter, oneOf ".-_:"]) + return (name, name, name) + +htmlRegularAttribute = try $ do + many1 space + name <- many1 (choice [letter, oneOf ".-_:"]) + spaces + char '=' + spaces + (content, quoteStr) <- choice [ (quoted '\''), + (quoted '"'), + (do + a <- many (alphaNum <|> (oneOf "-._:")) + return (a,"")) ] + return (name, content, + (name ++ "=" ++ quoteStr ++ content ++ quoteStr)) + +-- | Parse an end tag of type 'tag' +htmlEndTag tag = try $ do + char '<' + spaces + char '/' + spaces + stringAnyCase tag + spaces + char '>' + return $ "</" ++ tag ++ ">" + +-- | Returns @True@ if the tag is (or can be) an inline tag. +isInline tag = (extractTagType tag) `elem` inlineHtmlTags + +-- | Returns @True@ if the tag is (or can be) a block tag. +isBlock tag = (extractTagType tag) `elem` blockHtmlTags + +anyHtmlBlockTag = try $ do + tag <- anyHtmlTag <|> anyHtmlEndTag + if isBlock tag then return tag else fail "inline tag" + +anyHtmlInlineTag = try $ do + tag <- anyHtmlTag <|> anyHtmlEndTag + if isInline tag then return tag else fail "not an inline tag" + +-- | Parses material between script tags. +-- Scripts must be treated differently, because they can contain '<>' etc. +htmlScript = try $ do + open <- string "<script" + rest <- manyTill anyChar (htmlEndTag "script") + return $ open ++ rest ++ "</script>" + +htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ] + +rawHtmlBlock = try $ do + notFollowedBy' (htmlTag "/body" <|> htmlTag "/html") + body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag + sp <- many space + state <- getState + if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null + +-- | Parses an HTML comment. +htmlComment = try $ do + string "<!--" + comment <- manyTill anyChar (try (string "-->")) + return $ "<!--" ++ comment ++ "-->" + +-- +-- parsing documents +-- + +xmlDec = try $ do + string "<?" + rest <- manyTill anyChar (char '>') + return $ "<?" ++ rest ++ ">" + +definition = try $ do + string "<!" + rest <- manyTill anyChar (char '>') + return $ "<!" 
++ rest ++ ">" + +nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >> + ((rawHtmlBlock >> return ' ') <|> anyChar) + +parseTitle = try $ do + (tag, _) <- htmlTag "title" + contents <- inlinesTilEnd tag + spaces + return contents + +-- parse header and return meta-information (for now, just title) +parseHead = try $ do + htmlTag "head" + spaces + skipMany nonTitleNonHead + contents <- option [] parseTitle + skipMany nonTitleNonHead + htmlTag "/head" + return (contents, [], "") + +skipHtmlTag tag = optional (htmlTag tag) + +-- h1 class="title" representation of title in body +bodyTitle = try $ do + (tag, attribs) <- htmlTag "h1" + cl <- case (extractAttribute "class" attribs) of + Just "title" -> return "" + otherwise -> fail "not title" + inlinesTilEnd "h1" + +parseHtml = do + sepEndBy (choice [xmlDec, definition, htmlComment]) spaces + skipHtmlTag "html" + spaces + (title, authors, date) <- option ([], [], "") parseHead + spaces + skipHtmlTag "body" + spaces + optional bodyTitle -- skip title in body, because it's represented in meta + blocks <- parseBlocks + spaces + optional (htmlEndTag "body") + spaces + optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html> + eof + return $ Pandoc (Meta title authors date) blocks + +-- +-- parsing blocks +-- + +parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null)) + +block = choice [ codeBlock + , header + , hrule + , list + , blockQuote + , para + , plain + , rawHtmlBlock ] <?> "block" + +-- +-- header blocks +-- + +header = choice (map headerLevel (enumFromTo 1 5)) <?> "header" + +headerLevel n = try $ do + let level = "h" ++ show n + (tag, attribs) <- htmlTag level + contents <- inlinesTilEnd level + return $ Header n (normalizeSpaces contents) + +-- +-- hrule block +-- + +hrule = try $ do + (tag, attribs) <- htmlTag "hr" + state <- getState + if not (null attribs) && stateParseRaw state + then unexpected "attributes in hr" -- parse as raw in this case + else return HorizontalRule + +-- +-- code blocks +-- + +-- Note: HTML tags in code blocks (e.g. for syntax highlighting) are +-- skipped, because they are not portable to output formats other than HTML. +codeBlock = try $ do + htmlTag "pre" + result <- manyTill + (many1 (satisfy (/= '<')) <|> + ((anyHtmlTag <|> anyHtmlEndTag) >> return "")) + (htmlEndTag "pre") + let result' = concat result + -- drop leading newline if any + let result'' = if "\n" `isPrefixOf` result' + then drop 1 result' + else result' + -- drop trailing newline if any + let result''' = if "\n" `isSuffixOf` result'' + then init result'' + else result'' + return $ CodeBlock $ decodeCharacterReferences result''' + +-- +-- block quotes +-- + +blockQuote = try $ htmlTag "blockquote" >> spaces >> + blocksTilEnd "blockquote" >>= (return . 
BlockQuote) + +-- +-- list blocks +-- + +list = choice [ bulletList, orderedList, definitionList ] <?> "list" + +orderedList = try $ do + (_, attribs) <- htmlTag "ol" + (start, style) <- option (1, DefaultStyle) $ + do failIfStrict + let sta = fromMaybe "1" $ + lookup "start" attribs + let sty = fromMaybe (fromMaybe "" $ + lookup "style" attribs) $ + lookup "class" attribs + let sty' = case sty of + "lower-roman" -> LowerRoman + "upper-roman" -> UpperRoman + "lower-alpha" -> LowerAlpha + "upper-alpha" -> UpperAlpha + "decimal" -> Decimal + _ -> DefaultStyle + return (read sta, sty') + spaces + items <- sepEndBy1 (blocksIn "li") spaces + htmlEndTag "ol" + return $ OrderedList (start, style, DefaultDelim) items + +bulletList = try $ do + htmlTag "ul" + spaces + items <- sepEndBy1 (blocksIn "li") spaces + htmlEndTag "ul" + return $ BulletList items + +definitionList = try $ do + failIfStrict -- def lists not part of standard markdown + tag <- htmlTag "dl" + spaces + items <- sepEndBy1 definitionListItem spaces + htmlEndTag "dl" + return $ DefinitionList items + +definitionListItem = try $ do + terms <- sepEndBy1 (inlinesIn "dt") spaces + defs <- sepEndBy1 (blocksIn "dd") spaces + let term = joinWithSep [LineBreak] terms + return (term, concat defs) + +-- +-- paragraph block +-- + +para = try $ htmlTag "p" >> inlinesTilEnd "p" >>= + return . Para . normalizeSpaces + +-- +-- plain block +-- + +plain = many1 inline >>= return . Plain . normalizeSpaces + +-- +-- inline +-- + +inline = choice [ charRef + , strong + , emph + , superscript + , subscript + , strikeout + , spanStrikeout + , code + , str + , linebreak + , whitespace + , link + , image + , rawHtmlInline + ] <?> "inline" + +code = try $ do + htmlTag "code" + result <- manyTill anyChar (htmlEndTag "code") + -- remove internal line breaks, leading and trailing space, + -- and decode character references + return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $ + joinWithSep " " $ lines result + +rawHtmlInline = do + result <- htmlScript <|> htmlComment <|> anyHtmlInlineTag + state <- getState + if stateParseRaw state then return (HtmlInline result) else return (Str "") + +betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>= + return . normalizeSpaces + +emph = (betweenTags "em" <|> betweenTags "it") >>= return . Emph + +strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong + +superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript + +subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript + +strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>= + return . Strikeout + +spanStrikeout = try $ do + failIfStrict -- strict markdown has no strikeout, so treat as raw HTML + (tag, attributes) <- htmlTag "span" + result <- case (extractAttribute "class" attributes) of + Just "strikeout" -> inlinesTilEnd "span" + _ -> fail "not a strikeout" + return $ Strikeout result + +whitespace = many1 space >> return Space + +-- hard line break +linebreak = htmlTag "br" >> optional newline >> return LineBreak + +str = many1 (noneOf "<& \t\n") >>= return . 
Str + +-- +-- links and images +-- + +-- extract contents of attribute (attribute names are case-insensitive) +extractAttribute name [] = Nothing +extractAttribute name ((attrName, contents):rest) = + let name' = map toLower name + attrName' = map toLower attrName + in if attrName' == name' + then Just (decodeCharacterReferences contents) + else extractAttribute name rest + +link = try $ do + (tag, attributes) <- htmlTag "a" + url <- case (extractAttribute "href" attributes) of + Just url -> return url + Nothing -> fail "no href" + let title = fromMaybe "" $ extractAttribute "title" attributes + label <- inlinesTilEnd "a" + return $ Link (normalizeSpaces label) (url, title) + +image = try $ do + (tag, attributes) <- htmlTag "img" + url <- case (extractAttribute "src" attributes) of + Just url -> return url + Nothing -> fail "no src" + let title = fromMaybe "" $ extractAttribute "title" attributes + let alt = fromMaybe "" (extractAttribute "alt" attributes) + return $ Image [Str alt] (url, title) + diff --git a/Text/Pandoc/Readers/LaTeX.hs b/Text/Pandoc/Readers/LaTeX.hs new file mode 100644 index 000000000..79f9fc0f7 --- /dev/null +++ b/Text/Pandoc/Readers/LaTeX.hs @@ -0,0 +1,652 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.LaTeX + Copyright : Copyright (C) 2006-7 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of LaTeX to 'Pandoc' document. +-} +module Text.Pandoc.Readers.LaTeX ( + readLaTeX, + rawLaTeXInline, + rawLaTeXEnvironment + ) where + +import Text.ParserCombinators.Parsec +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Data.Maybe ( fromMaybe ) +import Data.Char ( chr ) +import Data.List ( isPrefixOf, isSuffixOf ) + +-- | Parse LaTeX from string and return 'Pandoc' document. +readLaTeX :: ParserState -- ^ Parser state, including options for parser + -> String -- ^ String to parse + -> Pandoc +readLaTeX = readWith parseLaTeX + +-- characters with special meaning +specialChars = "\\`$%^&_~#{}\n \t|<>'\"-" + +-- +-- utility functions +-- + +-- | Returns text between brackets and its matching pair. +bracketedText openB closeB = do + result <- charsInBalanced' openB closeB + return $ [openB] ++ result ++ [closeB] + +-- | Returns an option or argument of a LaTeX command. +optOrArg = bracketedText '{' '}' <|> bracketedText '[' ']' + +-- | True if the string begins with '{'. +isArg ('{':rest) = True +isArg other = False + +-- | Returns list of options and arguments of a LaTeX command. +commandArgs = many optOrArg + +-- | Parses LaTeX command, returns (name, star, list of options or arguments). 
+command = do + char '\\' + name <- many1 letter + star <- option "" (string "*") -- some commands have starred versions + args <- commandArgs + return (name, star, args) + +begin name = try $ do + string $ "\\begin{" ++ name ++ "}" + optional commandArgs + spaces + return name + +end name = try $ do + string $ "\\end{" ++ name ++ "}" + spaces + return name + +-- | Returns a list of block elements containing the contents of an +-- environment. +environment name = try $ begin name >> spaces >> manyTill block (end name) + +anyEnvironment = try $ do + string "\\begin{" + name <- many letter + star <- option "" (string "*") -- some environments have starred variants + char '}' + optional commandArgs + spaces + contents <- manyTill block (end (name ++ star)) + return $ BlockQuote contents + +-- +-- parsing documents +-- + +-- | Process LaTeX preamble, extracting metadata. +processLaTeXPreamble = try $ manyTill + (choice [bibliographic, comment, unknownCommand, nullBlock]) + (try (string "\\begin{document}")) >> + spaces + +-- | Parse LaTeX and return 'Pandoc'. +parseLaTeX = do + optional processLaTeXPreamble -- preamble might not be present (fragment) + spaces + blocks <- parseBlocks + spaces + optional $ try (string "\\end{document}" >> many anyChar) + -- might not be present (fragment) + spaces + eof + state <- getState + let blocks' = filter (/= Null) blocks + let title' = stateTitle state + let authors' = stateAuthors state + let date' = stateDate state + return $ Pandoc (Meta title' authors' date') blocks' + +-- +-- parsing blocks +-- + +parseBlocks = spaces >> many block + +block = choice [ hrule + , codeBlock + , header + , list + , blockQuote + , mathBlock + , comment + , bibliographic + , para + , specialEnvironment + , itemBlock + , unknownEnvironment + , unknownCommand ] <?> "block" + +-- +-- header blocks +-- + +header = try $ do + char '\\' + subs <- many (try (string "sub")) + string "section" + optional (char '*') + char '{' + title <- manyTill inline (char '}') + spaces + return $ Header (length subs + 1) (normalizeSpaces title) + +-- +-- hrule block +-- + +hrule = oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n", + "\\newpage" ] >> spaces >> return HorizontalRule + +-- +-- code blocks +-- + +codeBlock = codeBlock1 <|> codeBlock2 + +codeBlock1 = try $ do + string "\\begin{verbatim}" -- don't use begin function because it + -- gobbles whitespace + optional blanklines -- we want to gobble blank lines, but not + -- leading space + contents <- manyTill anyChar (try (string "\\end{verbatim}")) + spaces + return $ CodeBlock (stripTrailingNewlines contents) + +codeBlock2 = try $ do + string "\\begin{Verbatim}" -- used by fancyvrb package + option "" blanklines + contents <- manyTill anyChar (try (string "\\end{Verbatim}")) + spaces + return $ CodeBlock (stripTrailingNewlines contents) + +-- +-- block quotes +-- + +blockQuote = (environment "quote" <|> environment "quotation") >>~ spaces >>= + return . 
BlockQuote + +-- +-- math block +-- + +mathBlock = mathBlockWith (begin "equation") (end "equation") <|> + mathBlockWith (begin "displaymath") (end "displaymath") <|> + mathBlockWith (try $ string "\\[") (try $ string "\\]") <?> + "math block" + +mathBlockWith start end = try $ do + start + spaces + result <- manyTill anyChar end + spaces + return $ BlockQuote [Para [TeX ("$" ++ result ++ "$")]] + +-- +-- list blocks +-- + +list = bulletList <|> orderedList <|> definitionList <?> "list" + +listItem = try $ do + ("item", _, args) <- command + spaces + state <- getState + let oldParserContext = stateParserContext state + updateState (\state -> state {stateParserContext = ListItemState}) + blocks <- many block + updateState (\state -> state {stateParserContext = oldParserContext}) + opt <- case args of + ([x]) | "[" `isPrefixOf` x && "]" `isSuffixOf` x -> + parseFromString (many inline) $ tail $ init x + _ -> return [] + return (opt, blocks) + +orderedList = try $ do + string "\\begin{enumerate}" + (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) $ + try $ do failIfStrict + char '[' + res <- anyOrderedListMarker + char ']' + return res + spaces + option "" $ try $ do string "\\setlength{\\itemindent}" + char '{' + manyTill anyChar (char '}') + spaces + start <- option 1 $ try $ do failIfStrict + string "\\setcounter{enum" + many1 (oneOf "iv") + string "}{" + num <- many1 digit + char '}' + spaces + return $ (read num) + 1 + items <- many listItem + end "enumerate" + spaces + return $ OrderedList (start, style, delim) $ map snd items + +bulletList = try $ do + begin "itemize" + spaces + items <- many listItem + end "itemize" + spaces + return (BulletList $ map snd items) + +definitionList = try $ do + begin "description" + spaces + items <- many listItem + end "description" + spaces + return (DefinitionList items) + +-- +-- paragraph block +-- + +para = many1 inline >>~ spaces >>= return . Para . normalizeSpaces + +-- +-- title authors date +-- + +bibliographic = choice [ maketitle, title, authors, date ] + +maketitle = try (string "\\maketitle") >> spaces >> return Null + +title = try $ do + string "\\title{" + tit <- manyTill inline (char '}') + spaces + updateState (\state -> state { stateTitle = tit }) + return Null + +authors = try $ do + string "\\author{" + authors <- manyTill anyChar (char '}') + spaces + let authors' = map removeLeadingTrailingSpace $ lines $ + substitute "\\\\" "\n" authors + updateState (\state -> state { stateAuthors = authors' }) + return Null + +date = try $ do + string "\\date{" + date' <- manyTill anyChar (char '}') + spaces + updateState (\state -> state { stateDate = date' }) + return Null + +-- +-- item block +-- for use in unknown environments that aren't being parsed as raw latex +-- + +-- this forces items to be parsed in different blocks +itemBlock = try $ do + ("item", _, args) <- command + state <- getState + if (stateParserContext state == ListItemState) + then fail "item should be handled by list block" + else if null args + then return Null + else return $ Plain [Str (stripFirstAndLast (head args))] + +-- +-- raw LaTeX +-- + +specialEnvironment = do -- these are always parsed as raw + lookAhead (choice (map (\name -> begin name) ["tabular", "figure", + "tabbing", "eqnarry", "picture", "table", "verse", "theorem"])) + rawLaTeXEnvironment + +-- | Parse any LaTeX environment and return a Para block containing +-- the whole literal environment as raw TeX. 
+rawLaTeXEnvironment :: GenParser Char st Block +rawLaTeXEnvironment = try $ do + string "\\begin{" + name <- many1 letter + star <- option "" (string "*") -- for starred variants + let name' = name ++ star + char '}' + args <- option [] commandArgs + let argStr = concat args + contents <- manyTill (choice [ (many1 (noneOf "\\")), + (do + (Para [TeX str]) <- rawLaTeXEnvironment + return str), + string "\\" ]) + (end name') + spaces + return $ Para [TeX $ "\\begin{" ++ name' ++ "}" ++ argStr ++ + concat contents ++ "\\end{" ++ name' ++ "}"] + +unknownEnvironment = try $ do + state <- getState + result <- if stateParseRaw state -- check whether we should include raw TeX + then rawLaTeXEnvironment -- if so, get whole raw environment + else anyEnvironment -- otherwise just the contents + return result + +unknownCommand = try $ do + notFollowedBy' $ choice $ map end ["itemize", "enumerate", "description", + "document"] + (name, star, args) <- command + spaces + let argStr = concat args + state <- getState + if name == "item" && (stateParserContext state) == ListItemState + then fail "should not be parsed as raw" + else string "" + if stateParseRaw state + then return $ Plain [TeX ("\\" ++ name ++ star ++ argStr)] + else return $ Plain [Str (joinWithSep " " args)] + +-- latex comment +comment = try $ char '%' >> manyTill anyChar newline >> spaces >> return Null + +-- +-- inline +-- + +inline = choice [ str + , endline + , whitespace + , quoted + , apostrophe + , spacer + , strong + , math + , ellipses + , emDash + , enDash + , hyphen + , emph + , strikeout + , superscript + , subscript + , ref + , lab + , code + , url + , link + , image + , footnote + , linebreak + , accentedChar + , specialChar + , rawLaTeXInline + , escapedChar + , unescapedChar + ] <?> "inline" + +accentedChar = normalAccentedChar <|> specialAccentedChar + +normalAccentedChar = try $ do + char '\\' + accent <- oneOf "'`^\"~" + character <- (try $ char '{' >> letter >>~ char '}') <|> letter + let table = fromMaybe [] $ lookup character accentTable + let result = case lookup accent table of + Just num -> chr num + Nothing -> '?' + return $ Str [result] + +-- an association list of letters and association list of accents +-- and decimal character numbers. 
+accentTable = + [ ('A', [('`', 192), ('\'', 193), ('^', 194), ('~', 195), ('"', 196)]), + ('E', [('`', 200), ('\'', 201), ('^', 202), ('"', 203)]), + ('I', [('`', 204), ('\'', 205), ('^', 206), ('"', 207)]), + ('N', [('~', 209)]), + ('O', [('`', 210), ('\'', 211), ('^', 212), ('~', 213), ('"', 214)]), + ('U', [('`', 217), ('\'', 218), ('^', 219), ('"', 220)]), + ('a', [('`', 224), ('\'', 225), ('^', 227), ('"', 228)]), + ('e', [('`', 232), ('\'', 233), ('^', 234), ('"', 235)]), + ('i', [('`', 236), ('\'', 237), ('^', 238), ('"', 239)]), + ('n', [('~', 241)]), + ('o', [('`', 242), ('\'', 243), ('^', 244), ('~', 245), ('"', 246)]), + ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ] + +specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig, + oslash, pound, euro, copyright, sect ] + +ccedil = try $ do + char '\\' + letter <- oneOfStrings ["cc", "cC"] + let num = if letter == "cc" then 231 else 199 + return $ Str [chr num] + +aring = try $ do + char '\\' + letter <- oneOfStrings ["aa", "AA"] + let num = if letter == "aa" then 229 else 197 + return $ Str [chr num] + +iuml = try (string "\\\"") >> oneOfStrings ["\\i", "{\\i}"] >> + return (Str [chr 239]) + +icirc = try (string "\\^") >> oneOfStrings ["\\i", "{\\i}"] >> + return (Str [chr 238]) + +szlig = try (string "\\ss") >> return (Str [chr 223]) + +oslash = try $ do + char '\\' + letter <- choice [char 'o', char 'O'] + let num = if letter == 'o' then 248 else 216 + return $ Str [chr num] + +aelig = try $ do + char '\\' + letter <- oneOfStrings ["ae", "AE"] + let num = if letter == "ae" then 230 else 198 + return $ Str [chr num] + +pound = try (string "\\pounds") >> return (Str [chr 163]) + +euro = try (string "\\euro") >> return (Str [chr 8364]) + +copyright = try (string "\\copyright") >> return (Str [chr 169]) + +sect = try (string "\\S") >> return (Str [chr 167]) + +escapedChar = do + result <- escaped (oneOf " $%&_#{}\n") + return $ if result == Str "\n" then Str " " else result + +-- ignore standalone, nonescaped special characters +unescapedChar = oneOf "`$^&_#{}|<>" >> return (Str "") + +specialChar = choice [ backslash, tilde, caret, bar, lt, gt, doubleQuote ] + +backslash = try (string "\\textbackslash") >> return (Str "\\") + +tilde = try (string "\\ensuremath{\\sim}") >> return (Str "~") + +caret = try (string "\\^{}") >> return (Str "^") + +bar = try (string "\\textbar") >> return (Str "\\") + +lt = try (string "\\textless") >> return (Str "<") + +gt = try (string "\\textgreater") >> return (Str ">") + +doubleQuote = char '"' >> return (Str "\"") + +code = code1 <|> code2 + +code1 = try $ do + string "\\verb" + marker <- anyChar + result <- manyTill anyChar (char marker) + return $ Code $ removeLeadingTrailingSpace result + +code2 = try $ do + string "\\texttt{" + result <- manyTill (noneOf "\\\n~$%^&{}") (char '}') + return $ Code result + +emph = try $ oneOfStrings [ "\\emph{", "\\textit{" ] >> + manyTill inline (char '}') >>= return . Emph + +strikeout = try $ string "\\sout{" >> manyTill inline (char '}') >>= + return . Strikeout + +superscript = try $ string "\\textsuperscript{" >> + manyTill inline (char '}') >>= return . Superscript + +-- note: \textsubscript isn't a standard latex command, but we use +-- a defined version in pandoc. +subscript = try $ string "\\textsubscript{" >> manyTill inline (char '}') >>= + return . Subscript + +apostrophe = char '\'' >> return Apostrophe + +quoted = doubleQuoted <|> singleQuoted + +singleQuoted = enclosed singleQuoteStart singleQuoteEnd inline >>= + return . 
Quoted SingleQuote . normalizeSpaces + +doubleQuoted = enclosed doubleQuoteStart doubleQuoteEnd inline >>= + return . Quoted DoubleQuote . normalizeSpaces + +singleQuoteStart = char '`' + +singleQuoteEnd = try $ char '\'' >> notFollowedBy alphaNum + +doubleQuoteStart = string "``" + +doubleQuoteEnd = try $ string "''" + +ellipses = try $ string "\\ldots" >> optional (try (string "{}")) >> + return Ellipses + +enDash = try (string "--") >> return EnDash + +emDash = try (string "---") >> return EmDash + +hyphen = char '-' >> return (Str "-") + +lab = try $ do + string "\\label{" + result <- manyTill anyChar (char '}') + return $ Str $ "(" ++ result ++ ")" + +ref = try (string "\\ref{") >> manyTill anyChar (char '}') >>= return . Str + +strong = try (string "\\textbf{") >> manyTill inline (char '}') >>= + return . Strong + +whitespace = many1 (oneOf "~ \t") >> return Space + +-- hard line break +linebreak = try (string "\\\\") >> return LineBreak + +spacer = try (string "\\,") >> return (Str "") + +str = many1 (noneOf specialChars) >>= return . Str + +-- endline internal to paragraph +endline = try $ newline >> notFollowedBy blankline >> return Space + +-- math +math = math1 <|> math2 <?> "math" + +math1 = try $ do + char '$' + result <- many (noneOf "$") + char '$' + return $ TeX ("$" ++ result ++ "$") + +math2 = try $ do + string "\\(" + result <- many (noneOf "$") + string "\\)" + return $ TeX ("$" ++ result ++ "$") + +-- +-- links and images +-- + +url = try $ do + string "\\url" + url <- charsInBalanced '{' '}' + return $ Link [Code url] (url, "") + +link = try $ do + string "\\href{" + url <- manyTill anyChar (char '}') + char '{' + label <- manyTill inline (char '}') + return $ Link (normalizeSpaces label) (url, "") + +image = try $ do + ("includegraphics", _, args) <- command + let args' = filter isArg args -- filter out options + let src = if null args' then + ("", "") + else + (stripFirstAndLast (head args'), "") + return $ Image [Str "image"] src + +footnote = try $ do + (name, _, (contents:[])) <- command + if ((name == "footnote") || (name == "thanks")) + then string "" + else fail "not a footnote or thanks command" + let contents' = stripFirstAndLast contents + -- parse the extracted block, which may contain various block elements: + rest <- getInput + setInput $ contents' + blocks <- parseBlocks + setInput rest + return $ Note blocks + +-- | Parse any LaTeX command and return it in a raw TeX inline element. +rawLaTeXInline :: GenParser Char ParserState Inline +rawLaTeXInline = try $ do + (name, star, args) <- command + state <- getState + if ((name == "begin") || (name == "end") || (name == "item")) + then fail "not an inline command" + else string "" + return $ TeX ("\\" ++ name ++ star ++ concat args) + diff --git a/Text/Pandoc/Readers/Markdown.hs b/Text/Pandoc/Readers/Markdown.hs new file mode 100644 index 000000000..ded9f2136 --- /dev/null +++ b/Text/Pandoc/Readers/Markdown.hs @@ -0,0 +1,916 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Markdown + Copyright : Copyright (C) 2006-7 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of markdown-formatted plain text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Markdown ( + readMarkdown + ) where + +import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy, findIndex ) +import Data.Ord ( comparing ) +import Data.Char ( isAlphaNum ) +import Data.Maybe ( fromMaybe ) +import Network.URI ( isURI ) +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment ) +import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag, + anyHtmlInlineTag, anyHtmlTag, + anyHtmlEndTag, htmlEndTag, extractTagType, + htmlBlockElement ) +import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) +import Text.ParserCombinators.Parsec + +-- | Read markdown from an input string and return a Pandoc document. +readMarkdown :: ParserState -> String -> Pandoc +readMarkdown state str = (readWith parseMarkdown) state (str ++ "\n\n") + +-- +-- Constants and data structure definitions +-- + +spaceChars = " \t" +bulletListMarkers = "*+-" +hruleChars = "*-_" +setextHChars = "=-" + +-- treat these as potentially non-text when parsing inline: +specialChars = "\\[]*_~`<>$!^-.&'\"\8216\8217\8220\8221" + +-- +-- auxiliary functions +-- + +indentSpaces = try $ do + state <- getState + let tabStop = stateTabStop state + try (count tabStop (char ' ')) <|> + (many (char ' ') >> string "\t") <?> "indentation" + +nonindentSpaces = do + state <- getState + let tabStop = stateTabStop state + sps <- many (char ' ') + if length sps < tabStop + then return sps + else unexpected "indented line" + +-- | Fail unless we're at beginning of a line. +failUnlessBeginningOfLine = do + pos <- getPosition + if sourceColumn pos == 1 then return () else fail "not beginning of line" + +-- | Fail unless we're in "smart typography" mode. +failUnlessSmart = do + state <- getState + if stateSmart state then return () else fail "Smart typography feature" + +-- | Parse an inline Str element with a given content. +inlineString str = try $ do + (Str res) <- inline + if res == str then return res else fail $ "unexpected Str content" + +-- | Parse a sequence of inline elements between a string +-- @opener@ and a string @closer@, including inlines +-- between balanced pairs of @opener@ and a @closer@. +inlinesInBalanced :: String -> String -> GenParser Char ParserState [Inline] +inlinesInBalanced opener closer = try $ do + string opener + result <- manyTill ( (do lookAhead (inlineString opener) + -- because it might be a link... + bal <- inlinesInBalanced opener closer + return $ [Str opener] ++ bal ++ [Str closer]) + <|> (count 1 inline)) + (try (string closer)) + return $ concat result + +-- +-- document structure +-- + +titleLine = try $ char '%' >> skipSpaces >> manyTill inline newline + +authorsLine = try $ do + char '%' + skipSpaces + authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;") + newline + return $ map (decodeCharacterReferences . 
removeLeadingTrailingSpace) authors + +dateLine = try $ do + char '%' + skipSpaces + date <- many (noneOf "\n") + newline + return $ decodeCharacterReferences $ removeTrailingSpace date + +titleBlock = try $ do + failIfStrict + title <- option [] titleLine + author <- option [] authorsLine + date <- option "" dateLine + optional blanklines + return (title, author, date) + +parseMarkdown = do + -- markdown allows raw HTML + updateState (\state -> state { stateParseRaw = True }) + startPos <- getPosition + -- go through once just to get list of reference keys + -- docMinusKeys is the raw document with blanks where the keys were... + docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= + return . concat + setInput docMinusKeys + setPosition startPos + st <- getState + -- go through again for notes unless strict... + if stateStrict st + then return () + else do docMinusNotes <- manyTill (noteBlock <|> lineClump) eof >>= + return . concat + st <- getState + let reversedNotes = stateNotes st + updateState $ \st -> st { stateNotes = reverse reversedNotes } + setInput docMinusNotes + setPosition startPos + -- now parse it for real... + (title, author, date) <- option ([],[],"") titleBlock + blocks <- parseBlocks + return $ Pandoc (Meta title author date) $ filter (/= Null) blocks + +-- +-- initial pass for references and notes +-- + +referenceKey = try $ do + startPos <- getPosition + nonindentSpaces + label <- reference + char ':' + skipSpaces + optional (char '<') + src <- many (noneOf "> \n\t") + optional (char '>') + tit <- option "" referenceTitle + blanklines + endPos <- getPosition + let newkey = (label, (removeTrailingSpace src, tit)) + st <- getState + let oldkeys = stateKeys st + updateState $ \st -> st { stateKeys = newkey : oldkeys } + -- return blanks so line count isn't affected + return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +referenceTitle = try $ do + (many1 spaceChar >> option '\n' newline) <|> newline + skipSpaces + tit <- (charsInBalanced '(' ')' >>= return . unwords . words) + <|> do delim <- char '\'' <|> char '"' + manyTill anyChar (try (char delim >> skipSpaces >> + notFollowedBy (noneOf ")\n"))) + return $ decodeCharacterReferences tit + +noteMarker = string "[^" >> manyTill (noneOf " \t\n") (char ']') + +rawLine = do + notFollowedBy blankline + notFollowedBy' noteMarker + contents <- many1 nonEndline + end <- option "" (newline >> optional indentSpaces >> return "\n") + return $ contents ++ end + +rawLines = many1 rawLine >>= return . concat + +noteBlock = try $ do + startPos <- getPosition + ref <- noteMarker + char ':' + optional blankline + optional indentSpaces + raw <- sepBy rawLines (try (blankline >> indentSpaces)) + optional blanklines + endPos <- getPosition + -- parse the extracted text, which may contain various block elements: + contents <- parseFromString parseBlocks $ (joinWithSep "\n" raw) ++ "\n\n" + let newnote = (ref, contents) + st <- getState + let oldnotes = stateNotes st + updateState $ \st -> st { stateNotes = newnote : oldnotes } + -- return blanks so line count isn't affected + return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +-- +-- parsing blocks +-- + +parseBlocks = manyTill block eof + +block = choice [ header + , table + , codeBlock + , hrule + , list + , blockQuote + , htmlBlock + , rawLaTeXEnvironment' + , para + , plain + , nullBlock ] <?> "block" + +-- +-- header blocks +-- + +header = atxHeader <|> setextHeader <?> "header" + +atxHeader = try $ do + level <- many1 (char '#') >>= return . 
length + notFollowedBy (char '.' <|> char ')') -- this would be a list + skipSpaces + text <- manyTill inline atxClosing >>= return . normalizeSpaces + return $ Header level text + +atxClosing = try $ skipMany (char '#') >> blanklines + +setextHeader = try $ do + text <- many1Till inline newline + underlineChar <- oneOf setextHChars + many (char underlineChar) + blanklines + let level = (fromMaybe 0 $ findIndex (== underlineChar) setextHChars) + 1 + return $ Header level (normalizeSpaces text) + +-- +-- hrule block +-- + +hrule = try $ do + skipSpaces + start <- oneOf hruleChars + count 2 (skipSpaces >> char start) + skipMany (skipSpaces >> char start) + newline + optional blanklines + return HorizontalRule + +-- +-- code blocks +-- + +indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n") + +codeBlock = do + contents <- many1 (indentedLine <|> + try (do b <- blanklines + l <- indentedLine + return $ b ++ l)) + optional blanklines + return $ CodeBlock $ stripTrailingNewlines $ concat contents + +-- +-- block quotes +-- + +emacsBoxQuote = try $ do + failIfStrict + string ",----" + manyTill anyChar newline + raw <- manyTill + (try (char '|' >> optional (char ' ') >> manyTill anyChar newline)) + (try (string "`----")) + blanklines + return raw + +emailBlockQuoteStart = try $ nonindentSpaces >> char '>' >>~ optional (char ' ') + +emailBlockQuote = try $ do + emailBlockQuoteStart + raw <- sepBy (many (nonEndline <|> + (try (endline >> notFollowedBy emailBlockQuoteStart >> + return '\n')))) + (try (newline >> emailBlockQuoteStart)) + newline <|> (eof >> return '\n') + optional blanklines + return raw + +blockQuote = do + raw <- emailBlockQuote <|> emacsBoxQuote + -- parse the extracted block, which may contain various block elements: + contents <- parseFromString parseBlocks $ (joinWithSep "\n" raw) ++ "\n\n" + return $ BlockQuote contents + +-- +-- list blocks +-- + +list = choice [ bulletList, orderedList, definitionList ] <?> "list" + +bulletListStart = try $ do + optional newline -- if preceded by a Plain block in a list context + nonindentSpaces + notFollowedBy' hrule -- because hrules start out just like lists + oneOf bulletListMarkers + spaceChar + skipSpaces + +anyOrderedListStart = try $ do + optional newline -- if preceded by a Plain block in a list context + nonindentSpaces + notFollowedBy $ string "p." >> spaceChar >> digit -- page number + state <- getState + if stateStrict state + then do many1 digit + char '.' + spaceChar + return (1, DefaultStyle, DefaultDelim) + else anyOrderedListMarker >>~ spaceChar + +orderedListStart style delim = try $ do + optional newline -- if preceded by a Plain block in a list context + nonindentSpaces + state <- getState + num <- if stateStrict state + then do many1 digit + char '.' 
+ return 1 + else orderedListMarker style delim + if delim == Period && (style == UpperAlpha || (style == UpperRoman && + num `elem` [1, 5, 10, 50, 100, 500, 1000])) + then char '\t' <|> (spaceChar >> spaceChar) + else spaceChar + skipSpaces + +-- parse a line of a list item (start = parser for beginning of list item) +listLine start = try $ do + notFollowedBy' start + notFollowedBy blankline + notFollowedBy' (do indentSpaces + many (spaceChar) + bulletListStart <|> (anyOrderedListStart >> return ())) + line <- manyTill anyChar newline + return $ line ++ "\n" + +-- parse raw text for one list item, excluding start marker and continuations +rawListItem start = try $ do + start + result <- many1 (listLine start) + blanks <- many blankline + return $ concat result ++ blanks + +-- continuation of a list item - indented and separated by blankline +-- or (in compact lists) endline. +-- note: nested lists are parsed as continuations +listContinuation start = try $ do + lookAhead indentSpaces + result <- many1 (listContinuationLine start) + blanks <- many blankline + return $ concat result ++ blanks + +listContinuationLine start = try $ do + notFollowedBy blankline + notFollowedBy' start + optional indentSpaces + result <- manyTill anyChar newline + return $ result ++ "\n" + +listItem start = try $ do + first <- rawListItem start + continuations <- many (listContinuation start) + -- parsing with ListItemState forces markers at beginning of lines to + -- count as list item markers, even if not separated by blank space. + -- see definition of "endline" + state <- getState + let oldContext = stateParserContext state + setState $ state {stateParserContext = ListItemState} + -- parse the extracted block, which may contain various block elements: + let raw = concat (first:continuations) + contents <- parseFromString parseBlocks raw + updateState (\st -> st {stateParserContext = oldContext}) + return contents + +orderedList = try $ do + (start, style, delim) <- lookAhead anyOrderedListStart + items <- many1 (listItem (orderedListStart style delim)) + return $ OrderedList (start, style, delim) $ compactify items + +bulletList = many1 (listItem bulletListStart) >>= + return . BulletList . 
compactify + +-- definition lists + +definitionListItem = try $ do + notFollowedBy blankline + notFollowedBy' indentSpaces + -- first, see if this has any chance of being a definition list: + lookAhead (anyLine >> char ':') + term <- manyTill inline newline + raw <- many1 defRawBlock + state <- getState + let oldContext = stateParserContext state + -- parse the extracted block, which may contain various block elements: + contents <- parseFromString parseBlocks $ concat raw + updateState (\st -> st {stateParserContext = oldContext}) + return ((normalizeSpaces term), contents) + +defRawBlock = try $ do + char ':' + state <- getState + let tabStop = stateTabStop state + try (count (tabStop - 1) (char ' ')) <|> (many (char ' ') >> string "\t") + firstline <- anyLine + rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine) + trailing <- option "" blanklines + return $ firstline ++ "\n" ++ unlines rawlines ++ trailing + +definitionList = do + failIfStrict + items <- many1 definitionListItem + let (terms, defs) = unzip items + let defs' = compactify defs + let items' = zip terms defs' + return $ DefinitionList items' + +-- +-- paragraph block +-- + +para = try $ do + result <- many1 inline + newline + blanklines <|> do st <- getState + if stateStrict st + then lookAhead (blockQuote <|> header) >> return "" + else lookAhead emacsBoxQuote >> return "" + return $ Para $ normalizeSpaces result + +plain = many1 inline >>= return . Plain . normalizeSpaces + +-- +-- raw html +-- + +htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element" + +htmlBlock = do + st <- getState + if stateStrict st + then try $ do failUnlessBeginningOfLine + first <- htmlElement + finalSpace <- many (oneOf spaceChars) + finalNewlines <- many newline + return $ RawHtml $ first ++ finalSpace ++ finalNewlines + else rawHtmlBlocks + +-- True if tag is self-closing +isSelfClosing tag = + isSuffixOf "/>" $ filter (not . (`elem` " \n\t")) tag + +strictHtmlBlock = try $ do + tag <- anyHtmlBlockTag + let tag' = extractTagType tag + if isSelfClosing tag || tag' == "hr" + then return tag + else do contents <- many (notFollowedBy' (htmlEndTag tag') >> + (htmlElement <|> (count 1 anyChar))) + end <- htmlEndTag tag' + return $ tag ++ concat contents ++ end + +rawHtmlBlocks = do + htmlBlocks <- many1 rawHtmlBlock + let combined = concatMap (\(RawHtml str) -> str) htmlBlocks + let combined' = if not (null combined) && last combined == '\n' + then init combined -- strip extra newline + else combined + return $ RawHtml combined' + +-- +-- LaTeX +-- + +rawLaTeXEnvironment' = failIfStrict >> rawLaTeXEnvironment + +-- +-- Tables +-- + +-- Parse a dashed line with optional trailing spaces; return its length +-- and the length including trailing space. +dashedLine ch = do + dashes <- many1 (char ch) + sp <- many spaceChar + return $ (length dashes, length $ dashes ++ sp) + +-- Parse a table header with dashed lines of '-' preceded by +-- one line of text. +simpleTableHeader = try $ do + rawContent <- anyLine + initSp <- nonindentSpaces + dashes <- many1 (dashedLine '-') + newline + let (lengths, lines) = unzip dashes + let indices = scanl (+) (length initSp) lines + let rawHeads = tail $ splitByIndices (init indices) rawContent + let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths + return (rawHeads, aligns, indices) + +-- Parse a table footer - dashed lines followed by blank line. +tableFooter = try $ nonindentSpaces >> many1 (dashedLine '-') >> blanklines + +-- Parse a table separator - dashed line. 
+tableSep = try $ nonindentSpaces >> many1 (dashedLine '-') >> string "\n" + +-- Parse a raw line and split it into chunks by indices. +rawTableLine indices = do + notFollowedBy' (blanklines <|> tableFooter) + line <- many1Till anyChar newline + return $ map removeLeadingTrailingSpace $ tail $ + splitByIndices (init indices) line + +-- Parse a table line and return a list of lists of blocks (columns). +tableLine indices = rawTableLine indices >>= mapM (parseFromString (many plain)) + +-- Parse a multiline table row and return a list of blocks (columns). +multilineRow indices = do + colLines <- many1 (rawTableLine indices) + optional blanklines + let cols = map unlines $ transpose colLines + mapM (parseFromString (many plain)) cols + +-- Calculate relative widths of table columns, based on indices +widthsFromIndices :: Int -- Number of columns on terminal + -> [Int] -- Indices + -> [Float] -- Fractional relative sizes of columns +widthsFromIndices _ [] = [] +widthsFromIndices numColumns indices = + let lengths = zipWith (-) indices (0:indices) + totLength = sum lengths + quotient = if totLength > numColumns + then fromIntegral totLength + else fromIntegral numColumns + fracs = map (\l -> (fromIntegral l) / quotient) lengths in + tail fracs + +-- Parses a table caption: inlines beginning with 'Table:' +-- and followed by blank lines. +tableCaption = try $ do + nonindentSpaces + string "Table:" + result <- many1 inline + blanklines + return $ normalizeSpaces result + +-- Parse a table using 'headerParser', 'lineParser', and 'footerParser'. +tableWith headerParser lineParser footerParser = try $ do + (rawHeads, aligns, indices) <- headerParser + lines <- many1Till (lineParser indices) footerParser + caption <- option [] tableCaption + heads <- mapM (parseFromString (many plain)) rawHeads + state <- getState + let numColumns = stateColumns state + let widths = widthsFromIndices numColumns indices + return $ Table caption aligns widths heads lines + +-- Parse a simple table with '---' header and one line per row. +simpleTable = tableWith simpleTableHeader tableLine blanklines + +-- Parse a multiline table: starts with row of '-' on top, then header +-- (which may be multiline), then the rows, +-- which may be multiline, separated by blank lines, and +-- ending with a footer (dashed line followed by blank line). +multilineTable = tableWith multilineTableHeader multilineRow tableFooter + +multilineTableHeader = try $ do + tableSep + rawContent <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline) + initSp <- nonindentSpaces + dashes <- many1 (dashedLine '-') + newline + let (lengths, lines) = unzip dashes + let indices = scanl (+) (length initSp) lines + let rawHeadsList = transpose $ map + (\ln -> tail $ splitByIndices (init indices) ln) + rawContent + let rawHeads = map (joinWithSep " ") rawHeadsList + let aligns = zipWith alignType rawHeadsList lengths + return ((map removeLeadingTrailingSpace rawHeads), aligns, indices) + +-- Returns an alignment type for a table, based on a list of strings +-- (the rows of the column header) and a number (the length of the +-- dashed line under the rows. +alignType :: [String] -> Int -> Alignment +alignType [] len = AlignDefault +alignType strLst len = + let str = head $ sortBy (comparing length) $ + map removeTrailingSpace strLst + leftSpace = if null str then False else (str !! 0) `elem` " \t" + rightSpace = length str < len || (str !! 
(len - 1)) `elem` " \t" + in case (leftSpace, rightSpace) of + (True, False) -> AlignRight + (False, True) -> AlignLeft + (True, True) -> AlignCenter + (False, False) -> AlignDefault + +table = failIfStrict >> (simpleTable <|> multilineTable) <?> "table" + +-- +-- inline +-- + +inline = choice [ str + , smartPunctuation + , whitespace + , endline + , code + , charRef + , strong + , emph + , note + , inlineNote + , link + , image + , math + , strikeout + , superscript + , subscript + , autoLink + , rawHtmlInline' + , rawLaTeXInline' + , escapedChar + , symbol + , ltSign ] <?> "inline" + +escapedChar = do + char '\\' + state <- getState + result <- option '\\' $ if stateStrict state + then oneOf "\\`*_{}[]()>#+-.!~" + else satisfy (not . isAlphaNum) + return $ Str [result] + +ltSign = do + st <- getState + if stateStrict st + then char '<' + else notFollowedBy' rawHtmlBlocks >> char '<' -- unless it starts html + return $ Str ['<'] + +specialCharsMinusLt = filter (/= '<') specialChars + +symbol = do + result <- oneOf specialCharsMinusLt + return $ Str [result] + +-- parses inline code, between n `s and n `s +code = try $ do + starts <- many1 (char '`') + skipSpaces + result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|> + (char '\n' >> return " ")) + (try (skipSpaces >> count (length starts) (char '`') >> + notFollowedBy (char '`'))) + return $ Code $ removeLeadingTrailingSpace $ concat result + +mathWord = many1 ((noneOf " \t\n\\$") <|> + (try (char '\\') >>~ notFollowedBy (char '$'))) + +math = try $ do + failIfStrict + char '$' + notFollowedBy space + words <- sepBy1 mathWord (many1 space) + char '$' + return $ TeX ("$" ++ (joinWithSep " " words) ++ "$") + +emph = ((enclosed (char '*') (char '*') inline) <|> + (enclosed (char '_') (char '_' >> notFollowedBy alphaNum) inline)) >>= + return . Emph . normalizeSpaces + +strong = ((enclosed (string "**") (try $ string "**") inline) <|> + (enclosed (string "__") (try $ string "__") inline)) >>= + return . Strong . normalizeSpaces + +strikeout = failIfStrict >> enclosed (string "~~") (try $ string "~~") inline >>= + return . Strikeout . normalizeSpaces + +superscript = failIfStrict >> enclosed (char '^') (char '^') + (notFollowedBy' whitespace >> inline) >>= -- may not contain Space + return . Superscript + +subscript = failIfStrict >> enclosed (char '~') (char '~') + (notFollowedBy' whitespace >> inline) >>= -- may not contain Space + return . Subscript + +smartPunctuation = failUnlessSmart >> + choice [ quoted, apostrophe, dash, ellipses ] + +apostrophe = (char '\'' <|> char '\8217') >> return Apostrophe + +quoted = doubleQuoted <|> singleQuoted + +withQuoteContext context parser = do + oldState <- getState + let oldQuoteContext = stateQuoteContext oldState + setState oldState { stateQuoteContext = context } + result <- parser + newState <- getState + setState newState { stateQuoteContext = oldQuoteContext } + return result + +singleQuoted = try $ do + singleQuoteStart + withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>= + return . Quoted SingleQuote . normalizeSpaces + +doubleQuoted = try $ do + doubleQuoteStart + withQuoteContext InDoubleQuote $ many1Till inline doubleQuoteEnd >>= + return . Quoted DoubleQuote . normalizeSpaces + +failIfInQuoteContext context = do + st <- getState + if stateQuoteContext st == context + then fail "already inside quotes" + else return () + +singleQuoteStart = do + failIfInQuoteContext InSingleQuote + char '\8216' <|> + (try $ do char '\'' + notFollowedBy (oneOf ")!],.;:-? 
\t\n") + notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >> + satisfy (not . isAlphaNum))) + -- possess/contraction + return '\'') + +singleQuoteEnd = try $ do + char '\8217' <|> char '\'' + notFollowedBy alphaNum + return '\'' + +doubleQuoteStart = do + failIfInQuoteContext InDoubleQuote + char '\8220' <|> + (try $ do char '"' + notFollowedBy (oneOf " \t\n") + return '"') + +doubleQuoteEnd = char '\8221' <|> char '"' + +ellipses = oneOfStrings ["...", " . . . ", ". . .", " . . ."] >> return Ellipses + +dash = enDash <|> emDash + +enDash = try $ char '-' >> notFollowedBy (noneOf "0123456789") >> return EnDash + +emDash = try $ skipSpaces >> oneOfStrings ["---", "--"] >> + skipSpaces >> return EmDash + +whitespace = do + sps <- many1 (oneOf spaceChars) + if length sps >= 2 + then option Space (endline >> return LineBreak) + else return Space <?> "whitespace" + +nonEndline = satisfy (/='\n') + +strChar = noneOf (specialChars ++ spaceChars ++ "\n") + +str = many1 strChar >>= return . Str + +-- an endline character that can be treated as a space, not a structural break +endline = try $ do + newline + notFollowedBy blankline + st <- getState + if stateStrict st + then do notFollowedBy emailBlockQuoteStart + notFollowedBy (char '#') -- atx header + else return () + -- parse potential list-starts differently if in a list: + if stateParserContext st == ListItemState + then notFollowedBy' (bulletListStart <|> + (anyOrderedListStart >> return ())) + else return () + return Space + +-- +-- links +-- + +-- a reference label for a link +reference = notFollowedBy' (string "[^") >> -- footnote reference + inlinesInBalanced "[" "]" >>= (return . normalizeSpaces) + +-- source for a link, with optional title +source = try $ do + char '(' + optional (char '<') + src <- many (noneOf ")> \t\n") + optional (char '>') + tit <- option "" linkTitle + skipSpaces + char ')' + return (removeTrailingSpace src, tit) + +linkTitle = try $ do + (many1 spaceChar >> option '\n' newline) <|> newline + skipSpaces + delim <- char '\'' <|> char '"' + tit <- manyTill anyChar (try (char delim >> skipSpaces >> + notFollowedBy (noneOf ")\n"))) + return $ decodeCharacterReferences tit + +link = try $ do + label <- reference + src <- source <|> referenceLink label + return $ Link label src + +-- a link like [this][ref] or [this][] or [this] +referenceLink label = do + ref <- option [] (try (optional (char ' ') >> + optional (newline >> skipSpaces) >> reference)) + let ref' = if null ref then label else ref + state <- getState + case lookupKeySrc (stateKeys state) ref' of + Nothing -> fail "no corresponding key" + Just target -> return target + +emailAddress = try $ do + name <- many1 (alphaNum <|> char '+') + char '@' + first <- many1 alphaNum + rest <- many1 (char '.' >> many1 alphaNum) + return $ "mailto:" ++ name ++ "@" ++ joinWithSep "." (first:rest) + +uri = try $ do + str <- many1 (noneOf "\n\t >") + if isURI str + then return str + else fail "not a URI" + +autoLink = try $ do + char '<' + src <- uri <|> emailAddress + char '>' + let src' = if "mailto:" `isPrefixOf` src + then drop 7 src + else src + st <- getState + return $ if stateStrict st + then Link [Str src'] (src, "") + else Link [Code src'] (src, "") + +image = try $ do + char '!' 
+ (Link label src) <- link + return $ Image label src + +note = try $ do + failIfStrict + ref <- noteMarker + state <- getState + let notes = stateNotes state + case lookup ref notes of + Nothing -> fail "note not found" + Just contents -> return $ Note contents + +inlineNote = try $ do + failIfStrict + char '^' + contents <- inlinesInBalanced "[" "]" + return $ Note [Para contents] + +rawLaTeXInline' = failIfStrict >> rawLaTeXInline + +rawHtmlInline' = do + st <- getState + result <- choice $ if stateStrict st + then [htmlBlockElement, anyHtmlTag, anyHtmlEndTag] + else [htmlBlockElement, anyHtmlInlineTag] + return $ HtmlInline result + diff --git a/Text/Pandoc/Readers/RST.hs b/Text/Pandoc/Readers/RST.hs new file mode 100644 index 000000000..1239eb688 --- /dev/null +++ b/Text/Pandoc/Readers/RST.hs @@ -0,0 +1,640 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.RST + Copyright : Copyright (C) 2006-7 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion from reStructuredText to 'Pandoc' document. +-} +module Text.Pandoc.Readers.RST ( + readRST + ) where +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.ParserCombinators.Parsec +import Data.List ( findIndex, delete ) + +-- | Parse reStructuredText string and return Pandoc document. +readRST :: ParserState -> String -> Pandoc +readRST state str = (readWith parseRST) state (str ++ "\n\n") + +-- +-- Constants and data structure definitions +--- + +bulletListMarkers = "*+-" +underlineChars = "!\"#$&'()*+,-./:;<=>?@[\\]^_`{|}~" + +-- treat these as potentially non-text when parsing inline: +specialChars = "\\`|*_<>$:[-" + +-- +-- parsing documents +-- + +isAnonKey (ref, src) = ref == [Str "_"] + +isHeader :: Int -> Block -> Bool +isHeader n (Header x _) = x == n +isHeader _ _ = False + +-- | Promote all headers in a list of blocks. (Part of +-- title transformation for RST.) +promoteHeaders :: Int -> [Block] -> [Block] +promoteHeaders num ((Header level text):rest) = + (Header (level - num) text):(promoteHeaders num rest) +promoteHeaders num (other:rest) = other:(promoteHeaders num rest) +promoteHeaders num [] = [] + +-- | If list of blocks starts with a header (or a header and subheader) +-- of level that are not found elsewhere, return it as a title and +-- promote all the other headers. 
+titleTransform :: [Block] -- ^ list of blocks + -> ([Block], [Inline]) -- ^ modified list of blocks, title +titleTransform ((Header 1 head1):(Header 2 head2):rest) = -- title subtitle + if (any (isHeader 1) rest) || (any (isHeader 2) rest) + then ((Header 1 head1):(Header 2 head2):rest, []) + else ((promoteHeaders 2 rest), head1 ++ [Str ":", Space] ++ head2) +titleTransform ((Header 1 head1):rest) = -- title, no subtitle + if (any (isHeader 1) rest) + then ((Header 1 head1):rest, []) + else ((promoteHeaders 1 rest), head1) +titleTransform blocks = (blocks, []) + +parseRST = do + startPos <- getPosition + -- go through once just to get list of reference keys + -- docMinusKeys is the raw document with blanks where the keys were... + docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= return . concat + setInput docMinusKeys + setPosition startPos + st <- getState + let reversedKeys = stateKeys st + updateState $ \st -> st { stateKeys = reverse reversedKeys } + -- now parse it for real... + blocks <- parseBlocks + let blocks' = filter (/= Null) blocks + state <- getState + let (blocks'', title) = if stateStandalone state + then titleTransform blocks' + else (blocks', []) + let authors = stateAuthors state + let date = stateDate state + let title' = if (null title) then (stateTitle state) else title + return $ Pandoc (Meta title' authors date) blocks'' + +-- +-- parsing blocks +-- + +parseBlocks = manyTill block eof + +block = choice [ codeBlock + , rawHtmlBlock + , rawLaTeXBlock + , fieldList + , blockQuote + , imageBlock + , unknownDirective + , header + , hrule + , list + , lineBlock + , para + , plain + , nullBlock ] <?> "block" + +-- +-- field list +-- + +fieldListItem indent = try $ do + string indent + char ':' + name <- many1 alphaNum + string ": " + skipSpaces + first <- manyTill anyChar newline + rest <- option "" $ try $ lookAhead (string indent >> oneOf " \t") >> + indentedBlock + return (name, joinWithSep " " (first:(lines rest))) + +fieldList = try $ do + indent <- lookAhead $ many (oneOf " \t") + items <- many1 $ fieldListItem indent + blanklines + let authors = case lookup "Authors" items of + Just auth -> [auth] + Nothing -> map snd (filter (\(x,y) -> x == "Author") items) + if null authors + then return () + else updateState $ \st -> st {stateAuthors = authors} + case (lookup "Date" items) of + Just dat -> updateState $ \st -> st {stateDate = dat} + Nothing -> return () + case (lookup "Title" items) of + Just tit -> parseFromString (many inline) tit >>= + \t -> updateState $ \st -> st {stateTitle = t} + Nothing -> return () + let remaining = filter (\(x,y) -> (x /= "Authors") && (x /= "Author") && + (x /= "Date") && (x /= "Title")) items + if null remaining + then return Null + else do terms <- mapM (return . (:[]) . Str . fst) remaining + defs <- mapM (parseFromString (many block) . 
snd) + remaining + return $ DefinitionList $ zip terms defs + +-- +-- line block +-- + +lineBlockLine = try $ do + string "| " + white <- many (oneOf " \t") + line <- manyTill inline newline + return $ (if null white then [] else [Str white]) ++ line ++ [LineBreak] + +lineBlock = try $ do + lines <- many1 lineBlockLine + blanklines + return $ Para (concat lines) + +-- +-- paragraph block +-- + +para = paraBeforeCodeBlock <|> paraNormal <?> "paragraph" + +codeBlockStart = string "::" >> blankline >> blankline + +-- paragraph that ends in a :: starting a code block +paraBeforeCodeBlock = try $ do + result <- many1 (notFollowedBy' codeBlockStart >> inline) + lookAhead (string "::") + return $ Para $ if last result == Space + then normalizeSpaces result + else (normalizeSpaces result) ++ [Str ":"] + +-- regular paragraph +paraNormal = try $ do + result <- many1 inline + newline + blanklines + return $ Para $ normalizeSpaces result + +plain = many1 inline >>= return . Plain . normalizeSpaces + +-- +-- image block +-- + +imageBlock = try $ do + string ".. image:: " + src <- manyTill anyChar newline + fields <- option [] $ do indent <- lookAhead $ many (oneOf " /t") + many1 $ fieldListItem indent + optional blanklines + case lookup "alt" fields of + Just alt -> return $ Plain [Image [Str alt] (src, alt)] + Nothing -> return $ Plain [Image [Str "image"] (src, "")] +-- +-- header blocks +-- + +header = doubleHeader <|> singleHeader <?> "header" + +-- a header with lines on top and bottom +doubleHeader = try $ do + c <- oneOf underlineChars + rest <- many (char c) -- the top line + let lenTop = length (c:rest) + skipSpaces + newline + txt <- many1 (notFollowedBy blankline >> inline) + pos <- getPosition + let len = (sourceColumn pos) - 1 + if (len > lenTop) then fail "title longer than border" else return () + blankline -- spaces and newline + count lenTop (char c) -- the bottom line + blanklines + -- check to see if we've had this kind of header before. + -- if so, get appropriate level. if not, add to list. + state <- getState + let headerTable = stateHeaderTable state + let (headerTable',level) = case findIndex (== DoubleHeader c) headerTable of + Just ind -> (headerTable, ind + 1) + Nothing -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1) + setState (state { stateHeaderTable = headerTable' }) + return $ Header level (normalizeSpaces txt) + +-- a header with line on the bottom only +singleHeader = try $ do + notFollowedBy' whitespace + txt <- many1 (do {notFollowedBy blankline; inline}) + pos <- getPosition + let len = (sourceColumn pos) - 1 + blankline + c <- oneOf underlineChars + rest <- count (len - 1) (char c) + many (char c) + blanklines + state <- getState + let headerTable = stateHeaderTable state + let (headerTable',level) = case findIndex (== SingleHeader c) headerTable of + Just ind -> (headerTable, ind + 1) + Nothing -> (headerTable ++ [SingleHeader c], (length headerTable) + 1) + setState (state { stateHeaderTable = headerTable' }) + return $ Header level (normalizeSpaces txt) + +-- +-- hrule block +-- + +hrule = try $ do + chr <- oneOf underlineChars + count 3 (char chr) + skipMany (char chr) + blankline + blanklines + return HorizontalRule + +-- +-- code blocks +-- + +-- read a line indented by a given string +indentedLine indents = try $ do + string indents + result <- manyTill anyChar newline + return $ result ++ "\n" + +-- two or more indented lines, possibly separated by blank lines. +-- any amount of indentation will work. 
+indentedBlock = do + indents <- lookAhead $ many1 (oneOf " \t") + lns <- many $ choice $ [ indentedLine indents, + try $ do b <- blanklines + l <- indentedLine indents + return (b ++ l) ] + optional blanklines + return $ concat lns + +codeBlock = try $ do + codeBlockStart + result <- indentedBlock + return $ CodeBlock $ stripTrailingNewlines result + +-- +-- raw html +-- + +rawHtmlBlock = try $ string ".. raw:: html" >> blanklines >> + indentedBlock >>= return . RawHtml + +-- +-- raw latex +-- + +rawLaTeXBlock = try $ do + string ".. raw:: latex" + blanklines + result <- indentedBlock + return $ Para [(TeX result)] + +-- +-- block quotes +-- + +blockQuote = do + raw <- indentedBlock + -- parse the extracted block, which may contain various block elements: + contents <- parseFromString parseBlocks $ raw ++ "\n\n" + return $ BlockQuote contents + +-- +-- list blocks +-- + +list = choice [ bulletList, orderedList, definitionList ] <?> "list" + +definitionListItem = try $ do + term <- many1Till inline endline + raw <- indentedBlock + -- parse the extracted block, which may contain various block elements: + contents <- parseFromString parseBlocks $ raw ++ "\n\n" + return (normalizeSpaces term, contents) + +definitionList = many1 definitionListItem >>= return . DefinitionList + +-- parses bullet list start and returns its length (inc. following whitespace) +bulletListStart = try $ do + notFollowedBy' hrule -- because hrules start out just like lists + marker <- oneOf bulletListMarkers + white <- many1 spaceChar + return $ length (marker:white) + +-- parses ordered list start and returns its length (inc following whitespace) +orderedListStart style delim = try $ do + (_, markerLen) <- withHorizDisplacement (orderedListMarker style delim) + white <- many1 spaceChar + return $ markerLen + length white + +-- parse a line of a list item +listLine markerLength = try $ do + notFollowedBy blankline + indentWith markerLength + line <- manyTill anyChar newline + return $ line ++ "\n" + +-- indent by specified number of spaces (or equiv. tabs) +indentWith num = do + state <- getState + let tabStop = stateTabStop state + if (num < tabStop) + then count num (char ' ') + else choice [ try (count num (char ' ')), + (try (char '\t' >> count (num - tabStop) (char ' '))) ] + +-- parse raw text for one list item, excluding start marker and continuations +rawListItem start = do + markerLength <- start + firstLine <- manyTill anyChar newline + restLines <- many (listLine markerLength) + return (markerLength, (firstLine ++ "\n" ++ (concat restLines))) + +-- continuation of a list item - indented and separated by blankline or +-- (in compact lists) endline. +-- Note: nested lists are parsed as continuations. +listContinuation markerLength = try $ do + blanks <- many1 blankline + result <- many1 (listLine markerLength) + return $ blanks ++ concat result + +listItem start = try $ do + (markerLength, first) <- rawListItem start + rest <- many (listContinuation markerLength) + blanks <- choice [ try (many blankline >>~ lookAhead start), + many1 blankline ] -- whole list must end with blank. + -- parsing with ListItemState forces markers at beginning of lines to + -- count as list item markers, even if not separated by blank space. 
+ -- see definition of "endline" + state <- getState + let oldContext = stateParserContext state + setState $ state {stateParserContext = ListItemState} + -- parse the extracted block, which may itself contain block elements + parsed <- parseFromString parseBlocks $ concat (first:rest) ++ blanks + updateState (\st -> st {stateParserContext = oldContext}) + return parsed + +orderedList = do + (start, style, delim) <- lookAhead (anyOrderedListMarker >>~ spaceChar) + items <- many1 (listItem (orderedListStart style delim)) + let items' = compactify items + return $ OrderedList (start, style, delim) items' + +bulletList = many1 (listItem bulletListStart) >>= + return . BulletList . compactify + +-- +-- unknown directive (e.g. comment) +-- + +unknownDirective = try $ do + string ".. " + manyTill anyChar newline + many (string " :" >> many1 (noneOf "\n:") >> char ':' >> + many1 (noneOf "\n") >> newline) + optional blanklines + return Null + +-- +-- reference key +-- + +referenceKey = do + startPos <- getPosition + key <- choice [imageKey, anonymousKey, regularKeyQuoted, regularKey] + st <- getState + let oldkeys = stateKeys st + updateState $ \st -> st { stateKeys = key : oldkeys } + optional blanklines + endPos <- getPosition + -- return enough blanks to replace key + return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +targetURI = do + skipSpaces + optional newline + contents <- many1 (try (many spaceChar >> newline >> + many1 spaceChar >> noneOf " \t\n") <|> noneOf "\n") + blanklines + return contents + +imageKey = try $ do + string ".. |" + ref <- manyTill inline (char '|') + skipSpaces + string "image::" + src <- targetURI + return (normalizeSpaces ref, (removeLeadingTrailingSpace src, "")) + +anonymousKey = try $ do + oneOfStrings [".. __:", "__"] + src <- targetURI + state <- getState + return ([Str "_"], (removeLeadingTrailingSpace src, "")) + +regularKeyQuoted = try $ do + string ".. _`" + ref <- manyTill inline (char '`') + char ':' + src <- targetURI + return (normalizeSpaces ref, (removeLeadingTrailingSpace src, "")) + +regularKey = try $ do + string ".. _" + ref <- manyTill inline (char ':') + src <- targetURI + return (normalizeSpaces ref, (removeLeadingTrailingSpace src, "")) + + -- + -- inline + -- + +inline = choice [ link + , str + , whitespace + , endline + , strong + , emph + , code + , image + , hyphens + , superscript + , subscript + , escapedChar + , symbol ] <?> "inline" + +hyphens = do + result <- many1 (char '-') + option Space endline + -- don't want to treat endline after hyphen or dash as a space + return $ Str result + +escapedChar = escaped anyChar + +symbol = do + result <- oneOf specialChars + return $ Str [result] + +-- parses inline code, between codeStart and codeEnd +code = try $ do + string "``" + result <- manyTill anyChar (try (string "``")) + return $ Code $ removeLeadingTrailingSpace $ joinWithSep " " $ lines result + +emph = enclosed (char '*') (char '*') inline >>= + return . Emph . normalizeSpaces + +strong = enclosed (string "**") (try $ string "**") inline >>= + return . Strong . normalizeSpaces + +interpreted role = try $ do + optional $ try $ string "\\ " + result <- enclosed (string $ ":" ++ role ++ ":`") (char '`') anyChar + nextChar <- lookAhead anyChar + try (string "\\ ") <|> lookAhead (count 1 $ oneOf " \t\n") <|> (eof >> return "") + return [Str result] + +superscript = interpreted "sup" >>= (return . Superscript) + +subscript = interpreted "sub" >>= (return . 
Subscript) + +whitespace = many1 spaceChar >> return Space <?> "whitespace" + +str = notFollowedBy' oneWordReference >> + many1 (noneOf (specialChars ++ "\t\n ")) >>= return . Str + +-- an endline character that can be treated as a space, not a structural break +endline = try $ do + newline + notFollowedBy blankline + -- parse potential list-starts at beginning of line differently in a list: + st <- getState + if (stateParserContext st) == ListItemState + then notFollowedBy (anyOrderedListMarker >> spaceChar) >> + notFollowedBy' bulletListStart + else return () + return Space + +-- +-- links +-- + +link = choice [explicitLink, referenceLink, autoLink] <?> "link" + +explicitLink = try $ do + char '`' + notFollowedBy (char '`') -- `` is marks start of inline code + label <- manyTill inline (try (do {spaces; char '<'})) + src <- manyTill (noneOf ">\n ") (char '>') + skipSpaces + string "`_" + return $ Link (normalizeSpaces label) (removeLeadingTrailingSpace src, "") + +reference = try $ do + char '`' + notFollowedBy (char '`') + label <- many1Till inline (char '`') + char '_' + return label + +oneWordReference = do + raw <- many1 alphaNum + char '_' + notFollowedBy alphaNum -- because this_is_not a link + return [Str raw] + +referenceLink = try $ do + label <- reference <|> oneWordReference + key <- option label (do{char '_'; return [Str "_"]}) -- anonymous link + state <- getState + let keyTable = stateKeys state + src <- case lookupKeySrc keyTable key of + Nothing -> fail "no corresponding key" + Just target -> return target + -- if anonymous link, remove first anon key so it won't be used again + let keyTable' = if (key == [Str "_"]) -- anonymous link? + then delete ([Str "_"], src) keyTable -- remove first anon key + else keyTable + setState $ state { stateKeys = keyTable' } + return $ Link (normalizeSpaces label) src + +uriScheme = oneOfStrings [ "http://", "https://", "ftp://", "file://", + "mailto:", "news:", "telnet:" ] + +uri = try $ do + scheme <- uriScheme + identifier <- many1 (noneOf " \t\n") + return $ scheme ++ identifier + +autoURI = do + src <- uri + return $ Link [Str src] (src, "") + +emailChar = alphaNum <|> oneOf "-+_." + +emailAddress = try $ do + firstLetter <- alphaNum + restAddr <- many emailChar + let addr = firstLetter:restAddr + char '@' + dom <- domain + return $ addr ++ '@':dom + +domainChar = alphaNum <|> char '-' + +domain = do + first <- many1 domainChar + dom <- many1 (try (do{ char '.'; many1 domainChar })) + return $ joinWithSep "." (first:dom) + +autoEmail = do + src <- emailAddress + return $ Link [Str src] ("mailto:" ++ src, "") + +autoLink = autoURI <|> autoEmail + +-- For now, we assume that all substitution references are for images. +image = try $ do + char '|' + ref <- manyTill inline (char '|') + state <- getState + let keyTable = stateKeys state + src <- case lookupKeySrc keyTable ref of + Nothing -> fail "no corresponding key" + Just target -> return target + return $ Image (normalizeSpaces ref) src + |
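
The snippets below are editorial sketches, not part of the patch. First, a minimal driver for the new RST reader: it assumes defaultParserState is exported from Text.Pandoc.Shared, as elsewhere in this tree, and relies on the Show instance derived in Text.Pandoc.Definition to inspect the result.

-- Sketch only: drive the RST reader added by this patch.
-- Assumed entry points: readRST (this file), defaultParserState (Text.Pandoc.Shared).
import Text.Pandoc.Definition ( Pandoc )
import Text.Pandoc.Shared ( defaultParserState )
import Text.Pandoc.Readers.RST ( readRST )

main :: IO ()
main = do
  let doc :: Pandoc
      doc = readRST defaultParserState
              "Title\n=====\n\nSome *emphasized* text and a link_.\n\n.. _link: http://example.org\n"
  print doc   -- the derived Show instance prints the parsed Pandoc AST

The reference target on the last input line exercises regularKey and referenceLink from the reader above.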
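
parseRST (like its Markdown counterpart) makes two passes: the first collects reference keys into the parser state while leaving blank lines in their place, and the second re-parses the blanked document with the key table already populated. A standalone Parsec sketch of that idea follows; the names keyLine, plainLine, and scanKeys are illustrative, not pandoc's.

-- Sketch of the two-pass key-collection strategy, under the assumptions above.
import Text.ParserCombinators.Parsec

type KeyTable = [(String, String)]

-- a ".. _ref: uri" line: record the key, leave a blank line behind
keyLine :: GenParser Char KeyTable String
keyLine = try $ do
  string ".. _"
  ref <- many1 (noneOf ":\n")
  char ':'
  skipMany (char ' ')
  src <- many1 (noneOf "\n")
  newline
  updateState ((ref, src) :)
  return "\n"

-- any other line is passed through unchanged
plainLine :: GenParser Char KeyTable String
plainLine = do
  l  <- many (noneOf "\n")
  nl <- newline
  return (l ++ [nl])

-- pass one; pass two would setInput the returned string and parse for real
scanKeys :: GenParser Char KeyTable (KeyTable, String)
scanKeys = do
  chunks <- manyTill (keyLine <|> plainLine) eof
  keys   <- getState
  return (keys, concat chunks)

main :: IO ()
main = print $ runParser scanKeys [] ""
         "see target_\n\n.. _target: http://example.org\n"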
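
Finally, doubleHeader and singleHeader assign header levels by remembering the order in which underline styles are first seen. A pure sketch of that bookkeeping; HeaderType here is a local stand-in for the type defined in Text.Pandoc.Shared.

-- Sketch of the header-level table used by the RST header parsers.
import Data.List ( findIndex )

data HeaderType = SingleHeader Char | DoubleHeader Char deriving (Eq, Show)

-- First style seen becomes level 1, the next level 2, and so on;
-- a style seen before keeps its original level.
levelFor :: HeaderType -> [HeaderType] -> ([HeaderType], Int)
levelFor h table = case findIndex (== h) table of
                     Just ind -> (table, ind + 1)
                     Nothing  -> (table ++ [h], length table + 1)

main :: IO ()
main = print $ foldl step ([], []) [DoubleHeader '=', SingleHeader '-', SingleHeader '-']
  where step (table, levels) h = let (table', lvl) = levelFor h table
                                 in  (table', levels ++ [lvl])
-- prints ([DoubleHeader '=',SingleHeader '-'],[1,2,2])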