diff options
author | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-11-28 21:01:17 +0000 |
---|---|---|
committer | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-11-28 21:01:17 +0000 |
commit | 47a4a3ab897ab748a4b7eab2ccb95cd9cb0e3864 (patch) | |
tree | ead5f33d20aa3a88c78f11bb8e598a4f741247f1 /Text/Pandoc/Readers | |
parent | 3dfe4cb708ac14369aab19383c1008b2cf4adbc4 (diff) | |
download | pandoc-47a4a3ab897ab748a4b7eab2ccb95cd9cb0e3864.tar.gz |
Removed Text directory. This is a remnant of an experiment
moving the contents of src/ to the top level, and should have
been deleted long ago.
git-svn-id: https://pandoc.googlecode.com/svn/trunk@1097 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'Text/Pandoc/Readers')
-rw-r--r-- | Text/Pandoc/Readers/HTML.hs | 496 | ||||
-rw-r--r-- | Text/Pandoc/Readers/LaTeX.hs | 651 | ||||
-rw-r--r-- | Text/Pandoc/Readers/Markdown.hs | 909 | ||||
-rw-r--r-- | Text/Pandoc/Readers/RST.hs | 640 |
4 files changed, 0 insertions, 2696 deletions
diff --git a/Text/Pandoc/Readers/HTML.hs b/Text/Pandoc/Readers/HTML.hs deleted file mode 100644 index 70a071152..000000000 --- a/Text/Pandoc/Readers/HTML.hs +++ /dev/null @@ -1,496 +0,0 @@ -{- -Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.HTML - Copyright : Copyright (C) 2006-7 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane <jgm@berkeley.edu> - Stability : alpha - Portability : portable - -Conversion of HTML to 'Pandoc' document. --} -module Text.Pandoc.Readers.HTML ( - readHtml, - rawHtmlInline, - rawHtmlBlock, - anyHtmlBlockTag, - anyHtmlInlineTag, - anyHtmlTag, - anyHtmlEndTag, - htmlEndTag, - extractTagType, - htmlBlockElement - ) where - -import Text.ParserCombinators.Parsec -import Text.Pandoc.Definition -import Text.Pandoc.Shared -import Text.Pandoc.CharacterReferences ( characterReference, - decodeCharacterReferences ) -import Data.Maybe ( fromMaybe ) -import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf ) -import Data.Char ( toUpper, toLower, isAlphaNum ) - --- | Convert HTML-formatted string to 'Pandoc' document. -readHtml :: ParserState -- ^ Parser state - -> String -- ^ String to parse - -> Pandoc -readHtml = readWith parseHtml - --- --- Constants --- - -eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins", - "map", "area", "object", "script"] - -inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", - "br", "cite", "code", "dfn", "em", "font", "i", "img", - "input", "kbd", "label", "q", "s", "samp", "select", - "small", "span", "strike", "strong", "sub", "sup", - "textarea", "tt", "u", "var"] ++ eitherBlockOrInline - -blockHtmlTags = ["address", "blockquote", "center", "dir", "div", - "dl", "fieldset", "form", "h1", "h2", "h3", "h4", - "h5", "h6", "hr", "isindex", "menu", "noframes", - "noscript", "ol", "p", "pre", "table", "ul", "dd", - "dt", "frameset", "li", "tbody", "td", "tfoot", - "th", "thead", "tr"] ++ eitherBlockOrInline - --- --- HTML utility functions --- - --- | Read blocks until end tag. -blocksTilEnd tag = do - blocks <- manyTill (block >>~ spaces) (htmlEndTag tag) - return $ filter (/= Null) blocks - --- | Read inlines until end tag. -inlinesTilEnd tag = manyTill inline (htmlEndTag tag) - --- | Parse blocks between open and close tag. -blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag - --- | Parse inlines between open and close tag. -inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag - --- | Extract type from a tag: e.g. @br@ from @\<br\>@ -extractTagType :: String -> String -extractTagType ('<':rest) = - let isSpaceOrSlash c = c `elem` "/ \n\t" in - map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest -extractTagType _ = "" - --- | Parse any HTML tag (opening or self-closing) and return text of tag -anyHtmlTag = try $ do - char '<' - spaces - tag <- many1 alphaNum - attribs <- many htmlAttribute - spaces - ender <- option "" (string "/") - let ender' = if null ender then "" else " /" - spaces - char '>' - return $ "<" ++ tag ++ - concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" - -anyHtmlEndTag = try $ do - char '<' - spaces - char '/' - spaces - tagType <- many1 alphaNum - spaces - char '>' - return $ "</" ++ tagType ++ ">" - -htmlTag :: String -> GenParser Char st (String, [(String, String)]) -htmlTag tag = try $ do - char '<' - spaces - stringAnyCase tag - attribs <- many htmlAttribute - spaces - optional (string "/") - spaces - char '>' - return (tag, (map (\(name, content, raw) -> (name, content)) attribs)) - --- parses a quoted html attribute value -quoted quoteChar = do - result <- between (char quoteChar) (char quoteChar) - (many (noneOf [quoteChar])) - return (result, [quoteChar]) - -htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute - --- minimized boolean attribute -htmlMinimizedAttribute = try $ do - many1 space - name <- many1 (choice [letter, oneOf ".-_:"]) - return (name, name, name) - -htmlRegularAttribute = try $ do - many1 space - name <- many1 (choice [letter, oneOf ".-_:"]) - spaces - char '=' - spaces - (content, quoteStr) <- choice [ (quoted '\''), - (quoted '"'), - (do - a <- many (alphaNum <|> (oneOf "-._:")) - return (a,"")) ] - return (name, content, - (name ++ "=" ++ quoteStr ++ content ++ quoteStr)) - --- | Parse an end tag of type 'tag' -htmlEndTag tag = try $ do - char '<' - spaces - char '/' - spaces - stringAnyCase tag - spaces - char '>' - return $ "</" ++ tag ++ ">" - --- | Returns @True@ if the tag is (or can be) an inline tag. -isInline tag = (extractTagType tag) `elem` inlineHtmlTags - --- | Returns @True@ if the tag is (or can be) a block tag. -isBlock tag = (extractTagType tag) `elem` blockHtmlTags - -anyHtmlBlockTag = try $ do - tag <- anyHtmlTag <|> anyHtmlEndTag - if isBlock tag then return tag else fail "inline tag" - -anyHtmlInlineTag = try $ do - tag <- anyHtmlTag <|> anyHtmlEndTag - if isInline tag then return tag else fail "not an inline tag" - --- | Parses material between script tags. --- Scripts must be treated differently, because they can contain '<>' etc. -htmlScript = try $ do - open <- string "<script" - rest <- manyTill anyChar (htmlEndTag "script") - return $ open ++ rest ++ "</script>" - -htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ] - -rawHtmlBlock = try $ do - notFollowedBy' (htmlTag "/body" <|> htmlTag "/html") - body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag - sp <- many space - state <- getState - if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null - --- | Parses an HTML comment. -htmlComment = try $ do - string "<!--" - comment <- manyTill anyChar (try (string "-->")) - return $ "<!--" ++ comment ++ "-->" - --- --- parsing documents --- - -xmlDec = try $ do - string "<?" - rest <- manyTill anyChar (char '>') - return $ "<?" ++ rest ++ ">" - -definition = try $ do - string "<!" - rest <- manyTill anyChar (char '>') - return $ "<!" ++ rest ++ ">" - -nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >> - ((rawHtmlBlock >> return ' ') <|> anyChar) - -parseTitle = try $ do - (tag, _) <- htmlTag "title" - contents <- inlinesTilEnd tag - spaces - return contents - --- parse header and return meta-information (for now, just title) -parseHead = try $ do - htmlTag "head" - spaces - skipMany nonTitleNonHead - contents <- option [] parseTitle - skipMany nonTitleNonHead - htmlTag "/head" - return (contents, [], "") - -skipHtmlTag tag = optional (htmlTag tag) - --- h1 class="title" representation of title in body -bodyTitle = try $ do - (tag, attribs) <- htmlTag "h1" - cl <- case (extractAttribute "class" attribs) of - Just "title" -> return "" - otherwise -> fail "not title" - inlinesTilEnd "h1" - -parseHtml = do - sepEndBy (choice [xmlDec, definition, htmlComment]) spaces - skipHtmlTag "html" - spaces - (title, authors, date) <- option ([], [], "") parseHead - spaces - skipHtmlTag "body" - spaces - optional bodyTitle -- skip title in body, because it's represented in meta - blocks <- parseBlocks - spaces - optional (htmlEndTag "body") - spaces - optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html> - eof - return $ Pandoc (Meta title authors date) blocks - --- --- parsing blocks --- - -parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null)) - -block = choice [ codeBlock - , header - , hrule - , list - , blockQuote - , para - , plain - , rawHtmlBlock ] <?> "block" - --- --- header blocks --- - -header = choice (map headerLevel (enumFromTo 1 5)) <?> "header" - -headerLevel n = try $ do - let level = "h" ++ show n - (tag, attribs) <- htmlTag level - contents <- inlinesTilEnd level - return $ Header n (normalizeSpaces contents) - --- --- hrule block --- - -hrule = try $ do - (tag, attribs) <- htmlTag "hr" - state <- getState - if not (null attribs) && stateParseRaw state - then unexpected "attributes in hr" -- parse as raw in this case - else return HorizontalRule - --- --- code blocks --- - --- Note: HTML tags in code blocks (e.g. for syntax highlighting) are --- skipped, because they are not portable to output formats other than HTML. -codeBlock = try $ do - htmlTag "pre" - result <- manyTill - (many1 (satisfy (/= '<')) <|> - ((anyHtmlTag <|> anyHtmlEndTag) >> return "")) - (htmlEndTag "pre") - let result' = concat result - -- drop leading newline if any - let result'' = if "\n" `isPrefixOf` result' - then drop 1 result' - else result' - -- drop trailing newline if any - let result''' = if "\n" `isSuffixOf` result'' - then init result'' - else result'' - return $ CodeBlock $ decodeCharacterReferences result''' - --- --- block quotes --- - -blockQuote = try $ htmlTag "blockquote" >> spaces >> - blocksTilEnd "blockquote" >>= (return . BlockQuote) - --- --- list blocks --- - -list = choice [ bulletList, orderedList, definitionList ] <?> "list" - -orderedList = try $ do - (_, attribs) <- htmlTag "ol" - (start, style) <- option (1, DefaultStyle) $ - do failIfStrict - let sta = fromMaybe "1" $ - lookup "start" attribs - let sty = fromMaybe (fromMaybe "" $ - lookup "style" attribs) $ - lookup "class" attribs - let sty' = case sty of - "lower-roman" -> LowerRoman - "upper-roman" -> UpperRoman - "lower-alpha" -> LowerAlpha - "upper-alpha" -> UpperAlpha - "decimal" -> Decimal - _ -> DefaultStyle - return (read sta, sty') - spaces - items <- sepEndBy1 (blocksIn "li") spaces - htmlEndTag "ol" - return $ OrderedList (start, style, DefaultDelim) items - -bulletList = try $ do - htmlTag "ul" - spaces - items <- sepEndBy1 (blocksIn "li") spaces - htmlEndTag "ul" - return $ BulletList items - -definitionList = try $ do - failIfStrict -- def lists not part of standard markdown - tag <- htmlTag "dl" - spaces - items <- sepEndBy1 definitionListItem spaces - htmlEndTag "dl" - return $ DefinitionList items - -definitionListItem = try $ do - terms <- sepEndBy1 (inlinesIn "dt") spaces - defs <- sepEndBy1 (blocksIn "dd") spaces - let term = joinWithSep [LineBreak] terms - return (term, concat defs) - --- --- paragraph block --- - -para = try $ htmlTag "p" >> inlinesTilEnd "p" >>= - return . Para . normalizeSpaces - --- --- plain block --- - -plain = many1 inline >>= return . Plain . normalizeSpaces - --- --- inline --- - -inline = choice [ charRef - , strong - , emph - , superscript - , subscript - , strikeout - , spanStrikeout - , code - , str - , linebreak - , whitespace - , link - , image - , rawHtmlInline - ] <?> "inline" - -code = try $ do - htmlTag "code" - result <- manyTill anyChar (htmlEndTag "code") - -- remove internal line breaks, leading and trailing space, - -- and decode character references - return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $ - joinWithSep " " $ lines result - -rawHtmlInline = do - result <- htmlScript <|> htmlComment <|> anyHtmlInlineTag - state <- getState - if stateParseRaw state then return (HtmlInline result) else return (Str "") - -betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>= - return . normalizeSpaces - -emph = (betweenTags "em" <|> betweenTags "it") >>= return . Emph - -strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong - -superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript - -subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript - -strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>= - return . Strikeout - -spanStrikeout = try $ do - failIfStrict -- strict markdown has no strikeout, so treat as raw HTML - (tag, attributes) <- htmlTag "span" - result <- case (extractAttribute "class" attributes) of - Just "strikeout" -> inlinesTilEnd "span" - _ -> fail "not a strikeout" - return $ Strikeout result - -whitespace = many1 space >> return Space - --- hard line break -linebreak = htmlTag "br" >> optional newline >> return LineBreak - -str = many1 (noneOf "<& \t\n") >>= return . Str - --- --- links and images --- - --- extract contents of attribute (attribute names are case-insensitive) -extractAttribute name [] = Nothing -extractAttribute name ((attrName, contents):rest) = - let name' = map toLower name - attrName' = map toLower attrName - in if attrName' == name' - then Just (decodeCharacterReferences contents) - else extractAttribute name rest - -link = try $ do - (tag, attributes) <- htmlTag "a" - url <- case (extractAttribute "href" attributes) of - Just url -> return url - Nothing -> fail "no href" - let title = fromMaybe "" $ extractAttribute "title" attributes - label <- inlinesTilEnd "a" - return $ Link (normalizeSpaces label) (url, title) - -image = try $ do - (tag, attributes) <- htmlTag "img" - url <- case (extractAttribute "src" attributes) of - Just url -> return url - Nothing -> fail "no src" - let title = fromMaybe "" $ extractAttribute "title" attributes - let alt = fromMaybe "" (extractAttribute "alt" attributes) - return $ Image [Str alt] (url, title) - diff --git a/Text/Pandoc/Readers/LaTeX.hs b/Text/Pandoc/Readers/LaTeX.hs deleted file mode 100644 index 37cc2bfe4..000000000 --- a/Text/Pandoc/Readers/LaTeX.hs +++ /dev/null @@ -1,651 +0,0 @@ -{- -Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.LaTeX - Copyright : Copyright (C) 2006-7 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane <jgm@berkeley.edu> - Stability : alpha - Portability : portable - -Conversion of LaTeX to 'Pandoc' document. --} -module Text.Pandoc.Readers.LaTeX ( - readLaTeX, - rawLaTeXInline, - rawLaTeXEnvironment - ) where - -import Text.ParserCombinators.Parsec -import Text.Pandoc.Definition -import Text.Pandoc.Shared -import Data.Maybe ( fromMaybe ) -import Data.Char ( chr ) -import Data.List ( isPrefixOf, isSuffixOf ) - --- | Parse LaTeX from string and return 'Pandoc' document. -readLaTeX :: ParserState -- ^ Parser state, including options for parser - -> String -- ^ String to parse - -> Pandoc -readLaTeX = readWith parseLaTeX - --- characters with special meaning -specialChars = "\\`$%^&_~#{}\n \t|<>'\"-" - --- --- utility functions --- - --- | Returns text between brackets and its matching pair. -bracketedText openB closeB = do - result <- charsInBalanced' openB closeB - return $ [openB] ++ result ++ [closeB] - --- | Returns an option or argument of a LaTeX command. -optOrArg = bracketedText '{' '}' <|> bracketedText '[' ']' - --- | True if the string begins with '{'. -isArg ('{':rest) = True -isArg other = False - --- | Returns list of options and arguments of a LaTeX command. -commandArgs = many optOrArg - --- | Parses LaTeX command, returns (name, star, list of options or arguments). -command = do - char '\\' - name <- many1 letter - star <- option "" (string "*") -- some commands have starred versions - args <- commandArgs - return (name, star, args) - -begin name = try $ do - string $ "\\begin{" ++ name ++ "}" - optional commandArgs - spaces - return name - -end name = try $ do - string $ "\\end{" ++ name ++ "}" - spaces - return name - --- | Returns a list of block elements containing the contents of an --- environment. -environment name = try $ begin name >> spaces >> manyTill block (end name) - -anyEnvironment = try $ do - string "\\begin{" - name <- many letter - star <- option "" (string "*") -- some environments have starred variants - char '}' - optional commandArgs - spaces - contents <- manyTill block (end (name ++ star)) - return $ BlockQuote contents - --- --- parsing documents --- - --- | Process LaTeX preamble, extracting metadata. -processLaTeXPreamble = try $ manyTill - (choice [bibliographic, comment, unknownCommand, nullBlock]) - (try (string "\\begin{document}")) >> - spaces - --- | Parse LaTeX and return 'Pandoc'. -parseLaTeX = do - optional processLaTeXPreamble -- preamble might not be present (fragment) - spaces - blocks <- parseBlocks - spaces - optional $ try (string "\\end{document}" >> many anyChar) - -- might not be present (fragment) - spaces - eof - state <- getState - let blocks' = filter (/= Null) blocks - let title' = stateTitle state - let authors' = stateAuthors state - let date' = stateDate state - return $ Pandoc (Meta title' authors' date') blocks' - --- --- parsing blocks --- - -parseBlocks = spaces >> many block - -block = choice [ hrule - , codeBlock - , header - , list - , blockQuote - , mathBlock - , comment - , bibliographic - , para - , specialEnvironment - , itemBlock - , unknownEnvironment - , unknownCommand ] <?> "block" - --- --- header blocks --- - -header = try $ do - char '\\' - subs <- many (try (string "sub")) - string "section" - optional (char '*') - char '{' - title <- manyTill inline (char '}') - spaces - return $ Header (length subs + 1) (normalizeSpaces title) - --- --- hrule block --- - -hrule = oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n", - "\\newpage" ] >> spaces >> return HorizontalRule - --- --- code blocks --- - -codeBlock = codeBlock1 <|> codeBlock2 - -codeBlock1 = try $ do - string "\\begin{verbatim}" -- don't use begin function because it - -- gobbles whitespace - optional blanklines -- we want to gobble blank lines, but not - -- leading space - contents <- manyTill anyChar (try (string "\\end{verbatim}")) - spaces - return $ CodeBlock (stripTrailingNewlines contents) - -codeBlock2 = try $ do - string "\\begin{Verbatim}" -- used by fancyvrb package - option "" blanklines - contents <- manyTill anyChar (try (string "\\end{Verbatim}")) - spaces - return $ CodeBlock (stripTrailingNewlines contents) - --- --- block quotes --- - -blockQuote = (environment "quote" <|> environment "quotation") >>~ spaces >>= - return . BlockQuote - --- --- math block --- - -mathBlock = mathBlockWith (begin "equation") (end "equation") <|> - mathBlockWith (begin "displaymath") (end "displaymath") <|> - mathBlockWith (string "\\[") (string "\\]") <?> "math block" - -mathBlockWith start end = try $ do - start - spaces - result <- manyTill anyChar end - spaces - return $ BlockQuote [Para [TeX ("$" ++ result ++ "$")]] - --- --- list blocks --- - -list = bulletList <|> orderedList <|> definitionList <?> "list" - -listItem = try $ do - ("item", _, args) <- command - spaces - state <- getState - let oldParserContext = stateParserContext state - updateState (\state -> state {stateParserContext = ListItemState}) - blocks <- many block - updateState (\state -> state {stateParserContext = oldParserContext}) - opt <- case args of - ([x]) | "[" `isPrefixOf` x && "]" `isSuffixOf` x -> - parseFromString (many inline) $ tail $ init x - _ -> return [] - return (opt, blocks) - -orderedList = try $ do - string "\\begin{enumerate}" - (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) $ - try $ do failIfStrict - char '[' - res <- anyOrderedListMarker - char ']' - return res - spaces - option "" $ try $ do string "\\setlength{\\itemindent}" - char '{' - manyTill anyChar (char '}') - spaces - start <- option 1 $ try $ do failIfStrict - string "\\setcounter{enum" - many1 (oneOf "iv") - string "}{" - num <- many1 digit - char '}' - spaces - return $ (read num) + 1 - items <- many listItem - end "enumerate" - spaces - return $ OrderedList (start, style, delim) $ map snd items - -bulletList = try $ do - begin "itemize" - spaces - items <- many listItem - end "itemize" - spaces - return (BulletList $ map snd items) - -definitionList = try $ do - begin "description" - spaces - items <- many listItem - end "description" - spaces - return (DefinitionList items) - --- --- paragraph block --- - -para = many1 inline >>~ spaces >>= return . Para . normalizeSpaces - --- --- title authors date --- - -bibliographic = choice [ maketitle, title, authors, date ] - -maketitle = try (string "\\maketitle") >> spaces >> return Null - -title = try $ do - string "\\title{" - tit <- manyTill inline (char '}') - spaces - updateState (\state -> state { stateTitle = tit }) - return Null - -authors = try $ do - string "\\author{" - authors <- manyTill anyChar (char '}') - spaces - let authors' = map removeLeadingTrailingSpace $ lines $ - substitute "\\\\" "\n" authors - updateState (\state -> state { stateAuthors = authors' }) - return Null - -date = try $ do - string "\\date{" - date' <- manyTill anyChar (char '}') - spaces - updateState (\state -> state { stateDate = date' }) - return Null - --- --- item block --- for use in unknown environments that aren't being parsed as raw latex --- - --- this forces items to be parsed in different blocks -itemBlock = try $ do - ("item", _, args) <- command - state <- getState - if (stateParserContext state == ListItemState) - then fail "item should be handled by list block" - else if null args - then return Null - else return $ Plain [Str (stripFirstAndLast (head args))] - --- --- raw LaTeX --- - -specialEnvironment = do -- these are always parsed as raw - lookAhead (choice (map (\name -> begin name) ["tabular", "figure", - "tabbing", "eqnarry", "picture", "table", "verse", "theorem"])) - rawLaTeXEnvironment - --- | Parse any LaTeX environment and return a Para block containing --- the whole literal environment as raw TeX. -rawLaTeXEnvironment :: GenParser Char st Block -rawLaTeXEnvironment = try $ do - string "\\begin{" - name <- many1 letter - star <- option "" (string "*") -- for starred variants - let name' = name ++ star - char '}' - args <- option [] commandArgs - let argStr = concat args - contents <- manyTill (choice [ (many1 (noneOf "\\")), - (do - (Para [TeX str]) <- rawLaTeXEnvironment - return str), - string "\\" ]) - (end name') - spaces - return $ Para [TeX $ "\\begin{" ++ name' ++ "}" ++ argStr ++ - concat contents ++ "\\end{" ++ name' ++ "}"] - -unknownEnvironment = try $ do - state <- getState - result <- if stateParseRaw state -- check whether we should include raw TeX - then rawLaTeXEnvironment -- if so, get whole raw environment - else anyEnvironment -- otherwise just the contents - return result - -unknownCommand = try $ do - notFollowedBy' $ choice $ map end ["itemize", "enumerate", "description", - "document"] - (name, star, args) <- command - spaces - let argStr = concat args - state <- getState - if name == "item" && (stateParserContext state) == ListItemState - then fail "should not be parsed as raw" - else string "" - if stateParseRaw state - then return $ Plain [TeX ("\\" ++ name ++ star ++ argStr)] - else return $ Plain [Str (joinWithSep " " args)] - --- latex comment -comment = try $ char '%' >> manyTill anyChar newline >> spaces >> return Null - --- --- inline --- - -inline = choice [ str - , endline - , whitespace - , quoted - , apostrophe - , spacer - , strong - , math - , ellipses - , emDash - , enDash - , hyphen - , emph - , strikeout - , superscript - , subscript - , ref - , lab - , code - , url - , link - , image - , footnote - , linebreak - , accentedChar - , specialChar - , rawLaTeXInline - , escapedChar - , unescapedChar - ] <?> "inline" - -accentedChar = normalAccentedChar <|> specialAccentedChar - -normalAccentedChar = try $ do - char '\\' - accent <- oneOf "'`^\"~" - character <- (try $ char '{' >> letter >>~ char '}') <|> letter - let table = fromMaybe [] $ lookup character accentTable - let result = case lookup accent table of - Just num -> chr num - Nothing -> '?' - return $ Str [result] - --- an association list of letters and association list of accents --- and decimal character numbers. -accentTable = - [ ('A', [('`', 192), ('\'', 193), ('^', 194), ('~', 195), ('"', 196)]), - ('E', [('`', 200), ('\'', 201), ('^', 202), ('"', 203)]), - ('I', [('`', 204), ('\'', 205), ('^', 206), ('"', 207)]), - ('N', [('~', 209)]), - ('O', [('`', 210), ('\'', 211), ('^', 212), ('~', 213), ('"', 214)]), - ('U', [('`', 217), ('\'', 218), ('^', 219), ('"', 220)]), - ('a', [('`', 224), ('\'', 225), ('^', 227), ('"', 228)]), - ('e', [('`', 232), ('\'', 233), ('^', 234), ('"', 235)]), - ('i', [('`', 236), ('\'', 237), ('^', 238), ('"', 239)]), - ('n', [('~', 241)]), - ('o', [('`', 242), ('\'', 243), ('^', 244), ('~', 245), ('"', 246)]), - ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ] - -specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig, - oslash, pound, euro, copyright, sect ] - -ccedil = try $ do - char '\\' - letter <- oneOfStrings ["cc", "cC"] - let num = if letter == "cc" then 231 else 199 - return $ Str [chr num] - -aring = try $ do - char '\\' - letter <- oneOfStrings ["aa", "AA"] - let num = if letter == "aa" then 229 else 197 - return $ Str [chr num] - -iuml = try (string "\\\"") >> oneOfStrings ["\\i", "{\\i}"] >> - return (Str [chr 239]) - -icirc = try (string "\\^") >> oneOfStrings ["\\i", "{\\i}"] >> - return (Str [chr 238]) - -szlig = try (string "\\ss") >> return (Str [chr 223]) - -oslash = try $ do - char '\\' - letter <- choice [char 'o', char 'O'] - let num = if letter == 'o' then 248 else 216 - return $ Str [chr num] - -aelig = try $ do - char '\\' - letter <- oneOfStrings ["ae", "AE"] - let num = if letter == "ae" then 230 else 198 - return $ Str [chr num] - -pound = try (string "\\pounds") >> return (Str [chr 163]) - -euro = try (string "\\euro") >> return (Str [chr 8364]) - -copyright = try (string "\\copyright") >> return (Str [chr 169]) - -sect = try (string "\\S") >> return (Str [chr 167]) - -escapedChar = do - result <- escaped (oneOf " $%&_#{}\n") - return $ if result == Str "\n" then Str " " else result - --- ignore standalone, nonescaped special characters -unescapedChar = oneOf "`$^&_#{}|<>" >> return (Str "") - -specialChar = choice [ backslash, tilde, caret, bar, lt, gt, doubleQuote ] - -backslash = try (string "\\textbackslash") >> return (Str "\\") - -tilde = try (string "\\ensuremath{\\sim}") >> return (Str "~") - -caret = try (string "\\^{}") >> return (Str "^") - -bar = try (string "\\textbar") >> return (Str "\\") - -lt = try (string "\\textless") >> return (Str "<") - -gt = try (string "\\textgreater") >> return (Str ">") - -doubleQuote = char '"' >> return (Str "\"") - -code = code1 <|> code2 - -code1 = try $ do - string "\\verb" - marker <- anyChar - result <- manyTill anyChar (char marker) - return $ Code $ removeLeadingTrailingSpace result - -code2 = try $ do - string "\\texttt{" - result <- manyTill (noneOf "\\\n~$%^&{}") (char '}') - return $ Code result - -emph = try $ oneOfStrings [ "\\emph{", "\\textit{" ] >> - manyTill inline (char '}') >>= return . Emph - -strikeout = try $ string "\\sout{" >> manyTill inline (char '}') >>= - return . Strikeout - -superscript = try $ string "\\textsuperscript{" >> - manyTill inline (char '}') >>= return . Superscript - --- note: \textsubscript isn't a standard latex command, but we use --- a defined version in pandoc. -subscript = try $ string "\\textsubscript{" >> manyTill inline (char '}') >>= - return . Subscript - -apostrophe = char '\'' >> return Apostrophe - -quoted = doubleQuoted <|> singleQuoted - -singleQuoted = enclosed singleQuoteStart singleQuoteEnd inline >>= - return . Quoted SingleQuote . normalizeSpaces - -doubleQuoted = enclosed doubleQuoteStart doubleQuoteEnd inline >>= - return . Quoted DoubleQuote . normalizeSpaces - -singleQuoteStart = char '`' - -singleQuoteEnd = try $ char '\'' >> notFollowedBy alphaNum - -doubleQuoteStart = string "``" - -doubleQuoteEnd = try $ string "''" - -ellipses = try $ string "\\ldots" >> optional (try (string "{}")) >> - return Ellipses - -enDash = try (string "--") >> return EnDash - -emDash = try (string "---") >> return EmDash - -hyphen = char '-' >> return (Str "-") - -lab = try $ do - string "\\label{" - result <- manyTill anyChar (char '}') - return $ Str $ "(" ++ result ++ ")" - -ref = try (string "\\ref{") >> manyTill anyChar (char '}') >>= return . Str - -strong = try (string "\\textbf{") >> manyTill inline (char '}') >>= - return . Strong - -whitespace = many1 (oneOf "~ \t") >> return Space - --- hard line break -linebreak = try (string "\\\\") >> return LineBreak - -spacer = try (string "\\,") >> return (Str "") - -str = many1 (noneOf specialChars) >>= return . Str - --- endline internal to paragraph -endline = try $ newline >> notFollowedBy blankline >> return Space - --- math -math = math1 <|> math2 <?> "math" - -math1 = try $ do - char '$' - result <- many (noneOf "$") - char '$' - return $ TeX ("$" ++ result ++ "$") - -math2 = try $ do - string "\\(" - result <- many (noneOf "$") - string "\\)" - return $ TeX ("$" ++ result ++ "$") - --- --- links and images --- - -url = try $ do - string "\\url" - url <- charsInBalanced '{' '}' - return $ Link [Code url] (url, "") - -link = try $ do - string "\\href{" - url <- manyTill anyChar (char '}') - char '{' - label <- manyTill inline (char '}') - return $ Link (normalizeSpaces label) (url, "") - -image = try $ do - ("includegraphics", _, args) <- command - let args' = filter isArg args -- filter out options - let src = if null args' then - ("", "") - else - (stripFirstAndLast (head args'), "") - return $ Image [Str "image"] src - -footnote = try $ do - (name, _, (contents:[])) <- command - if ((name == "footnote") || (name == "thanks")) - then string "" - else fail "not a footnote or thanks command" - let contents' = stripFirstAndLast contents - -- parse the extracted block, which may contain various block elements: - rest <- getInput - setInput $ contents' - blocks <- parseBlocks - setInput rest - return $ Note blocks - --- | Parse any LaTeX command and return it in a raw TeX inline element. -rawLaTeXInline :: GenParser Char ParserState Inline -rawLaTeXInline = try $ do - (name, star, args) <- command - state <- getState - if ((name == "begin") || (name == "end") || (name == "item")) - then fail "not an inline command" - else string "" - return $ TeX ("\\" ++ name ++ star ++ concat args) - diff --git a/Text/Pandoc/Readers/Markdown.hs b/Text/Pandoc/Readers/Markdown.hs deleted file mode 100644 index df84c0ac7..000000000 --- a/Text/Pandoc/Readers/Markdown.hs +++ /dev/null @@ -1,909 +0,0 @@ -{- -Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.Markdown - Copyright : Copyright (C) 2006-7 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane <jgm@berkeley.edu> - Stability : alpha - Portability : portable - -Conversion of markdown-formatted plain text to 'Pandoc' document. --} -module Text.Pandoc.Readers.Markdown ( - readMarkdown - ) where - -import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy ) -import Data.Ord ( comparing ) -import Data.Char ( isAlphaNum ) -import Network.URI ( isURI ) -import Text.Pandoc.Definition -import Text.Pandoc.Shared -import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment ) -import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag, - anyHtmlInlineTag, anyHtmlTag, - anyHtmlEndTag, htmlEndTag, extractTagType, - htmlBlockElement ) -import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) -import Text.ParserCombinators.Parsec - --- | Read markdown from an input string and return a Pandoc document. -readMarkdown :: ParserState -> String -> Pandoc -readMarkdown state str = (readWith parseMarkdown) state (str ++ "\n\n") - --- --- Constants and data structure definitions --- - -spaceChars = " \t" -bulletListMarkers = "*+-" -hruleChars = "*-_" -setextHChars = "=-" - --- treat these as potentially non-text when parsing inline: -specialChars = "\\[]*_~`<>$!^-.&'\"" - --- --- auxiliary functions --- - -indentSpaces = try $ do - state <- getState - let tabStop = stateTabStop state - try (count tabStop (char ' ')) <|> - (many (char ' ') >> string "\t") <?> "indentation" - -nonindentSpaces = do - state <- getState - let tabStop = stateTabStop state - sps <- many (char ' ') - if length sps < tabStop - then return sps - else unexpected "indented line" - --- | Fail unless we're at beginning of a line. -failUnlessBeginningOfLine = do - pos <- getPosition - if sourceColumn pos == 1 then return () else fail "not beginning of line" - --- | Fail unless we're in "smart typography" mode. -failUnlessSmart = do - state <- getState - if stateSmart state then return () else fail "Smart typography feature" - --- | Parse an inline Str element with a given content. -inlineString str = try $ do - (Str res) <- inline - if res == str then return res else fail $ "unexpected Str content" - --- | Parse a sequence of inline elements between a string --- @opener@ and a string @closer@, including inlines --- between balanced pairs of @opener@ and a @closer@. -inlinesInBalanced :: String -> String -> GenParser Char ParserState [Inline] -inlinesInBalanced opener closer = try $ do - string opener - result <- manyTill ( (do lookAhead (inlineString opener) - -- because it might be a link... - bal <- inlinesInBalanced opener closer - return $ [Str opener] ++ bal ++ [Str closer]) - <|> (count 1 inline)) - (try (string closer)) - return $ concat result - --- --- document structure --- - -titleLine = try $ char '%' >> skipSpaces >> manyTill inline newline - -authorsLine = try $ do - char '%' - skipSpaces - authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;") - newline - return $ map (decodeCharacterReferences . removeLeadingTrailingSpace) authors - -dateLine = try $ do - char '%' - skipSpaces - date <- many (noneOf "\n") - newline - return $ decodeCharacterReferences $ removeTrailingSpace date - -titleBlock = try $ do - failIfStrict - title <- option [] titleLine - author <- option [] authorsLine - date <- option "" dateLine - optional blanklines - return (title, author, date) - -parseMarkdown = do - -- markdown allows raw HTML - updateState (\state -> state { stateParseRaw = True }) - startPos <- getPosition - -- go through once just to get list of reference keys - -- docMinusKeys is the raw document with blanks where the keys were... - docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= - return . concat - setInput docMinusKeys - setPosition startPos - st <- getState - -- go through again for notes unless strict... - if stateStrict st - then return () - else do docMinusNotes <- manyTill (noteBlock <|> lineClump) eof >>= - return . concat - st <- getState - let reversedNotes = stateNotes st - updateState $ \st -> st { stateNotes = reverse reversedNotes } - setInput docMinusNotes - setPosition startPos - -- now parse it for real... - (title, author, date) <- option ([],[],"") titleBlock - blocks <- parseBlocks - return $ Pandoc (Meta title author date) $ filter (/= Null) blocks - --- --- initial pass for references and notes --- - -referenceKey = try $ do - startPos <- getPosition - nonindentSpaces - label <- reference - char ':' - skipSpaces - optional (char '<') - src <- many (noneOf "> \n\t") - optional (char '>') - tit <- option "" referenceTitle - blanklines - endPos <- getPosition - let newkey = (label, (removeTrailingSpace src, tit)) - st <- getState - let oldkeys = stateKeys st - updateState $ \st -> st { stateKeys = newkey : oldkeys } - -- return blanks so line count isn't affected - return $ replicate (sourceLine endPos - sourceLine startPos) '\n' - -referenceTitle = try $ do - (many1 spaceChar >> option '\n' newline) <|> newline - skipSpaces - tit <- (charsInBalanced '(' ')' >>= return . unwords . words) - <|> do delim <- char '\'' <|> char '"' - manyTill anyChar (try (char delim >> skipSpaces >> - notFollowedBy (noneOf ")\n"))) - return $ decodeCharacterReferences tit - -noteMarker = string "[^" >> manyTill (noneOf " \t\n") (char ']') - -rawLine = do - notFollowedBy blankline - notFollowedBy' noteMarker - contents <- many1 nonEndline - end <- option "" (newline >> optional indentSpaces >> return "\n") - return $ contents ++ end - -rawLines = many1 rawLine >>= return . concat - -noteBlock = try $ do - startPos <- getPosition - ref <- noteMarker - char ':' - optional blankline - optional indentSpaces - raw <- sepBy rawLines (try (blankline >> indentSpaces)) - optional blanklines - endPos <- getPosition - -- parse the extracted text, which may contain various block elements: - contents <- parseFromString parseBlocks $ (joinWithSep "\n" raw) ++ "\n\n" - let newnote = (ref, contents) - st <- getState - let oldnotes = stateNotes st - updateState $ \st -> st { stateNotes = newnote : oldnotes } - -- return blanks so line count isn't affected - return $ replicate (sourceLine endPos - sourceLine startPos) '\n' - --- --- parsing blocks --- - -parseBlocks = manyTill block eof - -block = choice [ header - , table - , codeBlock - , hrule - , list - , blockQuote - , htmlBlock - , rawLaTeXEnvironment' - , para - , plain - , nullBlock ] <?> "block" - --- --- header blocks --- - -header = atxHeader <|> setextHeader <?> "header" - -atxHeader = try $ do - level <- many1 (char '#') >>= return . length - notFollowedBy (char '.' <|> char ')') -- this would be a list - skipSpaces - text <- manyTill inline atxClosing >>= return . normalizeSpaces - return $ Header level text - -atxClosing = try $ skipMany (char '#') >> blanklines - -setextHeader = try $ do - -- first, see if this block has any chance of being a setextHeader: - lookAhead (anyLine >> oneOf setextHChars) - text <- many1Till inline newline >>= return . normalizeSpaces - level <- choice $ zipWith - (\ch lev -> try (many1 $ char ch) >> blanklines >> return lev) - setextHChars [1..(length setextHChars)] - return $ Header level text - --- --- hrule block --- - -hrule = try $ do - skipSpaces - start <- oneOf hruleChars - count 2 (skipSpaces >> char start) - skipMany (skipSpaces >> char start) - newline - optional blanklines - return HorizontalRule - --- --- code blocks --- - -indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n") - -codeBlock = do - contents <- many1 (indentedLine <|> - try (do b <- blanklines - l <- indentedLine - return $ b ++ l)) - optional blanklines - return $ CodeBlock $ stripTrailingNewlines $ concat contents - --- --- block quotes --- - -emacsBoxQuote = try $ do - failIfStrict - string ",----" - manyTill anyChar newline - raw <- manyTill - (try (char '|' >> optional (char ' ') >> manyTill anyChar newline)) - (try (string "`----")) - blanklines - return raw - -emailBlockQuoteStart = try $ nonindentSpaces >> char '>' >>~ optional (char ' ') - -emailBlockQuote = try $ do - emailBlockQuoteStart - raw <- sepBy (many (nonEndline <|> - (try (endline >> notFollowedBy emailBlockQuoteStart >> - return '\n')))) - (try (newline >> emailBlockQuoteStart)) - newline <|> (eof >> return '\n') - optional blanklines - return raw - -blockQuote = do - raw <- emailBlockQuote <|> emacsBoxQuote - -- parse the extracted block, which may contain various block elements: - contents <- parseFromString parseBlocks $ (joinWithSep "\n" raw) ++ "\n\n" - return $ BlockQuote contents - --- --- list blocks --- - -list = choice [ bulletList, orderedList, definitionList ] <?> "list" - -bulletListStart = try $ do - optional newline -- if preceded by a Plain block in a list context - nonindentSpaces - notFollowedBy' hrule -- because hrules start out just like lists - oneOf bulletListMarkers - spaceChar - skipSpaces - -anyOrderedListStart = try $ do - optional newline -- if preceded by a Plain block in a list context - nonindentSpaces - notFollowedBy $ string "p." >> spaceChar >> digit -- page number - state <- getState - if stateStrict state - then do many1 digit - char '.' - spaceChar - return (1, DefaultStyle, DefaultDelim) - else anyOrderedListMarker >>~ spaceChar - -orderedListStart style delim = try $ do - optional newline -- if preceded by a Plain block in a list context - nonindentSpaces - state <- getState - num <- if stateStrict state - then do many1 digit - char '.' - return 1 - else orderedListMarker style delim - if delim == Period && (style == UpperAlpha || (style == UpperRoman && - num `elem` [1, 5, 10, 50, 100, 500, 1000])) - then char '\t' <|> (spaceChar >> spaceChar) - else spaceChar - skipSpaces - --- parse a line of a list item (start = parser for beginning of list item) -listLine start = try $ do - notFollowedBy' start - notFollowedBy blankline - notFollowedBy' (do indentSpaces - many (spaceChar) - bulletListStart <|> (anyOrderedListStart >> return ())) - line <- manyTill anyChar newline - return $ line ++ "\n" - --- parse raw text for one list item, excluding start marker and continuations -rawListItem start = try $ do - start - result <- many1 (listLine start) - blanks <- many blankline - return $ concat result ++ blanks - --- continuation of a list item - indented and separated by blankline --- or (in compact lists) endline. --- note: nested lists are parsed as continuations -listContinuation start = try $ do - lookAhead indentSpaces - result <- many1 (listContinuationLine start) - blanks <- many blankline - return $ concat result ++ blanks - -listContinuationLine start = try $ do - notFollowedBy blankline - notFollowedBy' start - optional indentSpaces - result <- manyTill anyChar newline - return $ result ++ "\n" - -listItem start = try $ do - first <- rawListItem start - continuations <- many (listContinuation start) - -- parsing with ListItemState forces markers at beginning of lines to - -- count as list item markers, even if not separated by blank space. - -- see definition of "endline" - state <- getState - let oldContext = stateParserContext state - setState $ state {stateParserContext = ListItemState} - -- parse the extracted block, which may contain various block elements: - let raw = concat (first:continuations) - contents <- parseFromString parseBlocks raw - updateState (\st -> st {stateParserContext = oldContext}) - return contents - -orderedList = try $ do - (start, style, delim) <- lookAhead anyOrderedListStart - items <- many1 (listItem (orderedListStart style delim)) - return $ OrderedList (start, style, delim) $ compactify items - -bulletList = many1 (listItem bulletListStart) >>= - return . BulletList . compactify - --- definition lists - -definitionListItem = try $ do - notFollowedBy blankline - notFollowedBy' indentSpaces - -- first, see if this has any chance of being a definition list: - lookAhead (anyLine >> char ':') - term <- manyTill inline newline - raw <- many1 defRawBlock - state <- getState - let oldContext = stateParserContext state - -- parse the extracted block, which may contain various block elements: - contents <- parseFromString parseBlocks $ concat raw - updateState (\st -> st {stateParserContext = oldContext}) - return ((normalizeSpaces term), contents) - -defRawBlock = try $ do - char ':' - state <- getState - let tabStop = stateTabStop state - try (count (tabStop - 1) (char ' ')) <|> (many (char ' ') >> string "\t") - firstline <- anyLine - rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine) - trailing <- option "" blanklines - return $ firstline ++ "\n" ++ unlines rawlines ++ trailing - -definitionList = do - failIfStrict - items <- many1 definitionListItem - let (terms, defs) = unzip items - let defs' = compactify defs - let items' = zip terms defs' - return $ DefinitionList items' - --- --- paragraph block --- - -para = try $ do - result <- many1 inline - newline - blanklines <|> do st <- getState - if stateStrict st - then lookAhead (blockQuote <|> header) >> return "" - else lookAhead emacsBoxQuote >> return "" - return $ Para $ normalizeSpaces result - -plain = many1 inline >>= return . Plain . normalizeSpaces - --- --- raw html --- - -htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element" - -htmlBlock = do - st <- getState - if stateStrict st - then try $ do failUnlessBeginningOfLine - first <- htmlElement - finalSpace <- many (oneOf spaceChars) - finalNewlines <- many newline - return $ RawHtml $ first ++ finalSpace ++ finalNewlines - else rawHtmlBlocks - --- True if tag is self-closing -isSelfClosing tag = - isSuffixOf "/>" $ filter (not . (`elem` " \n\t")) tag - -strictHtmlBlock = try $ do - tag <- anyHtmlBlockTag - let tag' = extractTagType tag - if isSelfClosing tag || tag' == "hr" - then return tag - else do contents <- many (notFollowedBy' (htmlEndTag tag') >> - (htmlElement <|> (count 1 anyChar))) - end <- htmlEndTag tag' - return $ tag ++ concat contents ++ end - -rawHtmlBlocks = do - htmlBlocks <- many1 rawHtmlBlock - let combined = concatMap (\(RawHtml str) -> str) htmlBlocks - let combined' = if not (null combined) && last combined == '\n' - then init combined -- strip extra newline - else combined - return $ RawHtml combined' - --- --- LaTeX --- - -rawLaTeXEnvironment' = failIfStrict >> rawLaTeXEnvironment - --- --- Tables --- - --- Parse a dashed line with optional trailing spaces; return its length --- and the length including trailing space. -dashedLine ch = do - dashes <- many1 (char ch) - sp <- many spaceChar - return $ (length dashes, length $ dashes ++ sp) - --- Parse a table header with dashed lines of '-' preceded by --- one line of text. -simpleTableHeader = try $ do - rawContent <- anyLine - initSp <- nonindentSpaces - dashes <- many1 (dashedLine '-') - newline - let (lengths, lines) = unzip dashes - let indices = scanl (+) (length initSp) lines - let rawHeads = tail $ splitByIndices (init indices) rawContent - let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths - return (rawHeads, aligns, indices) - --- Parse a table footer - dashed lines followed by blank line. -tableFooter = try $ nonindentSpaces >> many1 (dashedLine '-') >> blanklines - --- Parse a table separator - dashed line. -tableSep = try $ nonindentSpaces >> many1 (dashedLine '-') >> string "\n" - --- Parse a raw line and split it into chunks by indices. -rawTableLine indices = do - notFollowedBy' (blanklines <|> tableFooter) - line <- many1Till anyChar newline - return $ map removeLeadingTrailingSpace $ tail $ - splitByIndices (init indices) line - --- Parse a table line and return a list of lists of blocks (columns). -tableLine indices = rawTableLine indices >>= mapM (parseFromString (many plain)) - --- Parse a multiline table row and return a list of blocks (columns). -multilineRow indices = do - colLines <- many1 (rawTableLine indices) - optional blanklines - let cols = map unlines $ transpose colLines - mapM (parseFromString (many plain)) cols - --- Calculate relative widths of table columns, based on indices -widthsFromIndices :: Int -- Number of columns on terminal - -> [Int] -- Indices - -> [Float] -- Fractional relative sizes of columns -widthsFromIndices _ [] = [] -widthsFromIndices numColumns indices = - let lengths = zipWith (-) indices (0:indices) - totLength = sum lengths - quotient = if totLength > numColumns - then fromIntegral totLength - else fromIntegral numColumns - fracs = map (\l -> (fromIntegral l) / quotient) lengths in - tail fracs - --- Parses a table caption: inlines beginning with 'Table:' --- and followed by blank lines. -tableCaption = try $ do - nonindentSpaces - string "Table:" - result <- many1 inline - blanklines - return $ normalizeSpaces result - --- Parse a table using 'headerParser', 'lineParser', and 'footerParser'. -tableWith headerParser lineParser footerParser = try $ do - (rawHeads, aligns, indices) <- headerParser - lines <- many1Till (lineParser indices) footerParser - caption <- option [] tableCaption - heads <- mapM (parseFromString (many plain)) rawHeads - state <- getState - let numColumns = stateColumns state - let widths = widthsFromIndices numColumns indices - return $ Table caption aligns widths heads lines - --- Parse a simple table with '---' header and one line per row. -simpleTable = tableWith simpleTableHeader tableLine blanklines - --- Parse a multiline table: starts with row of '-' on top, then header --- (which may be multiline), then the rows, --- which may be multiline, separated by blank lines, and --- ending with a footer (dashed line followed by blank line). -multilineTable = tableWith multilineTableHeader multilineRow tableFooter - -multilineTableHeader = try $ do - tableSep - rawContent <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline) - initSp <- nonindentSpaces - dashes <- many1 (dashedLine '-') - newline - let (lengths, lines) = unzip dashes - let indices = scanl (+) (length initSp) lines - let rawHeadsList = transpose $ map - (\ln -> tail $ splitByIndices (init indices) ln) - rawContent - let rawHeads = map (joinWithSep " ") rawHeadsList - let aligns = zipWith alignType rawHeadsList lengths - return ((map removeLeadingTrailingSpace rawHeads), aligns, indices) - --- Returns an alignment type for a table, based on a list of strings --- (the rows of the column header) and a number (the length of the --- dashed line under the rows. -alignType :: [String] -> Int -> Alignment -alignType [] len = AlignDefault -alignType strLst len = - let str = head $ sortBy (comparing length) $ - map removeTrailingSpace strLst - leftSpace = if null str then False else (str !! 0) `elem` " \t" - rightSpace = length str < len || (str !! (len - 1)) `elem` " \t" - in case (leftSpace, rightSpace) of - (True, False) -> AlignRight - (False, True) -> AlignLeft - (True, True) -> AlignCenter - (False, False) -> AlignDefault - -table = failIfStrict >> (simpleTable <|> multilineTable) <?> "table" - --- --- inline --- - -inline = choice [ str - , smartPunctuation - , whitespace - , endline - , code - , charRef - , strong - , emph - , note - , inlineNote - , link - , image - , math - , strikeout - , superscript - , subscript - , autoLink - , rawHtmlInline' - , rawLaTeXInline' - , escapedChar - , symbol - , ltSign ] <?> "inline" - -escapedChar = do - char '\\' - state <- getState - result <- option '\\' $ if stateStrict state - then oneOf "\\`*_{}[]()>#+-.!~" - else satisfy (not . isAlphaNum) - return $ Str [result] - -ltSign = do - st <- getState - if stateStrict st - then char '<' - else notFollowedBy' rawHtmlBlocks >> char '<' -- unless it starts html - return $ Str ['<'] - -specialCharsMinusLt = filter (/= '<') specialChars - -symbol = do - result <- oneOf specialCharsMinusLt - return $ Str [result] - --- parses inline code, between n `s and n `s -code = try $ do - starts <- many1 (char '`') - skipSpaces - result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|> - (char '\n' >> return " ")) - (try (skipSpaces >> count (length starts) (char '`') >> - notFollowedBy (char '`'))) - return $ Code $ removeLeadingTrailingSpace $ concat result - -mathWord = many1 ((noneOf " \t\n\\$") <|> - (try (char '\\') >>~ notFollowedBy (char '$'))) - -math = try $ do - failIfStrict - char '$' - notFollowedBy space - words <- sepBy1 mathWord (many1 space) - char '$' - return $ TeX ("$" ++ (joinWithSep " " words) ++ "$") - -emph = ((enclosed (char '*') (char '*') inline) <|> - (enclosed (char '_') (char '_' >> notFollowedBy alphaNum) inline)) >>= - return . Emph . normalizeSpaces - -strong = ((enclosed (string "**") (try $ string "**") inline) <|> - (enclosed (string "__") (try $ string "__") inline)) >>= - return . Strong . normalizeSpaces - -strikeout = failIfStrict >> enclosed (string "~~") (try $ string "~~") inline >>= - return . Strikeout . normalizeSpaces - -superscript = failIfStrict >> enclosed (char '^') (char '^') - (notFollowedBy' whitespace >> inline) >>= -- may not contain Space - return . Superscript - -subscript = failIfStrict >> enclosed (char '~') (char '~') - (notFollowedBy' whitespace >> inline) >>= -- may not contain Space - return . Subscript - -smartPunctuation = failUnlessSmart >> - choice [ quoted, apostrophe, dash, ellipses ] - -apostrophe = (char '\'' <|> char '\8217') >> return Apostrophe - -quoted = doubleQuoted <|> singleQuoted - -withQuoteContext context parser = do - oldState <- getState - let oldQuoteContext = stateQuoteContext oldState - setState oldState { stateQuoteContext = context } - result <- parser - newState <- getState - setState newState { stateQuoteContext = oldQuoteContext } - return result - -singleQuoted = try $ do - singleQuoteStart - withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>= - return . Quoted SingleQuote . normalizeSpaces - -doubleQuoted = try $ do - doubleQuoteStart - withQuoteContext InDoubleQuote $ many1Till inline doubleQuoteEnd >>= - return . Quoted DoubleQuote . normalizeSpaces - -failIfInQuoteContext context = do - st <- getState - if stateQuoteContext st == context - then fail "already inside quotes" - else return () - -singleQuoteStart = do - failIfInQuoteContext InSingleQuote - char '\8216' <|> - do char '\'' - notFollowedBy (oneOf ")!],.;:-? \t\n") - notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >> - satisfy (not . isAlphaNum))) -- possess/contraction - return '\'' - -singleQuoteEnd = (char '\'' <|> char '\8217') >> notFollowedBy alphaNum - -doubleQuoteStart = failIfInQuoteContext InDoubleQuote >> - (char '"' <|> char '\8220') >> - notFollowedBy (oneOf " \t\n") - -doubleQuoteEnd = char '"' <|> char '\8221' - -ellipses = oneOfStrings ["...", " . . . ", ". . .", " . . ."] >> return Ellipses - -dash = enDash <|> emDash - -enDash = try $ char '-' >> notFollowedBy (noneOf "0123456789") >> return EnDash - -emDash = try $ skipSpaces >> oneOfStrings ["---", "--"] >> - skipSpaces >> return EmDash - -whitespace = do - sps <- many1 (oneOf spaceChars) - if length sps >= 2 - then option Space (endline >> return LineBreak) - else return Space <?> "whitespace" - -nonEndline = satisfy (/='\n') - -strChar = noneOf (specialChars ++ spaceChars ++ "\n") - -str = many1 strChar >>= return . Str - --- an endline character that can be treated as a space, not a structural break -endline = try $ do - newline - notFollowedBy blankline - st <- getState - if stateStrict st - then do notFollowedBy emailBlockQuoteStart - notFollowedBy (char '#') -- atx header - else return () - -- parse potential list-starts differently if in a list: - if stateParserContext st == ListItemState - then notFollowedBy' (bulletListStart <|> - (anyOrderedListStart >> return ())) - else return () - return Space - --- --- links --- - --- a reference label for a link -reference = notFollowedBy' (string "[^") >> -- footnote reference - inlinesInBalanced "[" "]" >>= (return . normalizeSpaces) - --- source for a link, with optional title -source = try $ do - char '(' - optional (char '<') - src <- many (noneOf ")> \t\n") - optional (char '>') - tit <- option "" linkTitle - skipSpaces - char ')' - return (removeTrailingSpace src, tit) - -linkTitle = try $ do - (many1 spaceChar >> option '\n' newline) <|> newline - skipSpaces - delim <- char '\'' <|> char '"' - tit <- manyTill anyChar (try (char delim >> skipSpaces >> - notFollowedBy (noneOf ")\n"))) - return $ decodeCharacterReferences tit - -link = try $ do - label <- reference - src <- source <|> referenceLink label - return $ Link label src - --- a link like [this][ref] or [this][] or [this] -referenceLink label = do - ref <- option [] (try (optional (char ' ') >> - optional (newline >> skipSpaces) >> reference)) - let ref' = if null ref then label else ref - state <- getState - case lookupKeySrc (stateKeys state) ref' of - Nothing -> fail "no corresponding key" - Just target -> return target - -emailAddress = try $ do - name <- many1 (alphaNum <|> char '+') - char '@' - first <- many1 alphaNum - rest <- many1 (char '.' >> many1 alphaNum) - return $ "mailto:" ++ name ++ "@" ++ joinWithSep "." (first:rest) - -uri = try $ do - str <- many1 (noneOf "\n\t >") - if isURI str - then return str - else fail "not a URI" - -autoLink = try $ do - char '<' - src <- uri <|> emailAddress - char '>' - let src' = if "mailto:" `isPrefixOf` src - then drop 7 src - else src - st <- getState - return $ if stateStrict st - then Link [Str src'] (src, "") - else Link [Code src'] (src, "") - -image = try $ do - char '!' - (Link label src) <- link - return $ Image label src - -note = try $ do - failIfStrict - ref <- noteMarker - state <- getState - let notes = stateNotes state - case lookup ref notes of - Nothing -> fail "note not found" - Just contents -> return $ Note contents - -inlineNote = try $ do - failIfStrict - char '^' - contents <- inlinesInBalanced "[" "]" - return $ Note [Para contents] - -rawLaTeXInline' = failIfStrict >> rawLaTeXInline - -rawHtmlInline' = do - st <- getState - result <- choice $ if stateStrict st - then [htmlBlockElement, anyHtmlTag, anyHtmlEndTag] - else [htmlBlockElement, anyHtmlInlineTag] - return $ HtmlInline result - diff --git a/Text/Pandoc/Readers/RST.hs b/Text/Pandoc/Readers/RST.hs deleted file mode 100644 index 1239eb688..000000000 --- a/Text/Pandoc/Readers/RST.hs +++ /dev/null @@ -1,640 +0,0 @@ -{- -Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.RST - Copyright : Copyright (C) 2006-7 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane <jgm@berkeley.edu> - Stability : alpha - Portability : portable - -Conversion from reStructuredText to 'Pandoc' document. --} -module Text.Pandoc.Readers.RST ( - readRST - ) where -import Text.Pandoc.Definition -import Text.Pandoc.Shared -import Text.ParserCombinators.Parsec -import Data.List ( findIndex, delete ) - --- | Parse reStructuredText string and return Pandoc document. -readRST :: ParserState -> String -> Pandoc -readRST state str = (readWith parseRST) state (str ++ "\n\n") - --- --- Constants and data structure definitions ---- - -bulletListMarkers = "*+-" -underlineChars = "!\"#$&'()*+,-./:;<=>?@[\\]^_`{|}~" - --- treat these as potentially non-text when parsing inline: -specialChars = "\\`|*_<>$:[-" - --- --- parsing documents --- - -isAnonKey (ref, src) = ref == [Str "_"] - -isHeader :: Int -> Block -> Bool -isHeader n (Header x _) = x == n -isHeader _ _ = False - --- | Promote all headers in a list of blocks. (Part of --- title transformation for RST.) -promoteHeaders :: Int -> [Block] -> [Block] -promoteHeaders num ((Header level text):rest) = - (Header (level - num) text):(promoteHeaders num rest) -promoteHeaders num (other:rest) = other:(promoteHeaders num rest) -promoteHeaders num [] = [] - --- | If list of blocks starts with a header (or a header and subheader) --- of level that are not found elsewhere, return it as a title and --- promote all the other headers. -titleTransform :: [Block] -- ^ list of blocks - -> ([Block], [Inline]) -- ^ modified list of blocks, title -titleTransform ((Header 1 head1):(Header 2 head2):rest) = -- title subtitle - if (any (isHeader 1) rest) || (any (isHeader 2) rest) - then ((Header 1 head1):(Header 2 head2):rest, []) - else ((promoteHeaders 2 rest), head1 ++ [Str ":", Space] ++ head2) -titleTransform ((Header 1 head1):rest) = -- title, no subtitle - if (any (isHeader 1) rest) - then ((Header 1 head1):rest, []) - else ((promoteHeaders 1 rest), head1) -titleTransform blocks = (blocks, []) - -parseRST = do - startPos <- getPosition - -- go through once just to get list of reference keys - -- docMinusKeys is the raw document with blanks where the keys were... - docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= return . concat - setInput docMinusKeys - setPosition startPos - st <- getState - let reversedKeys = stateKeys st - updateState $ \st -> st { stateKeys = reverse reversedKeys } - -- now parse it for real... - blocks <- parseBlocks - let blocks' = filter (/= Null) blocks - state <- getState - let (blocks'', title) = if stateStandalone state - then titleTransform blocks' - else (blocks', []) - let authors = stateAuthors state - let date = stateDate state - let title' = if (null title) then (stateTitle state) else title - return $ Pandoc (Meta title' authors date) blocks'' - --- --- parsing blocks --- - -parseBlocks = manyTill block eof - -block = choice [ codeBlock - , rawHtmlBlock - , rawLaTeXBlock - , fieldList - , blockQuote - , imageBlock - , unknownDirective - , header - , hrule - , list - , lineBlock - , para - , plain - , nullBlock ] <?> "block" - --- --- field list --- - -fieldListItem indent = try $ do - string indent - char ':' - name <- many1 alphaNum - string ": " - skipSpaces - first <- manyTill anyChar newline - rest <- option "" $ try $ lookAhead (string indent >> oneOf " \t") >> - indentedBlock - return (name, joinWithSep " " (first:(lines rest))) - -fieldList = try $ do - indent <- lookAhead $ many (oneOf " \t") - items <- many1 $ fieldListItem indent - blanklines - let authors = case lookup "Authors" items of - Just auth -> [auth] - Nothing -> map snd (filter (\(x,y) -> x == "Author") items) - if null authors - then return () - else updateState $ \st -> st {stateAuthors = authors} - case (lookup "Date" items) of - Just dat -> updateState $ \st -> st {stateDate = dat} - Nothing -> return () - case (lookup "Title" items) of - Just tit -> parseFromString (many inline) tit >>= - \t -> updateState $ \st -> st {stateTitle = t} - Nothing -> return () - let remaining = filter (\(x,y) -> (x /= "Authors") && (x /= "Author") && - (x /= "Date") && (x /= "Title")) items - if null remaining - then return Null - else do terms <- mapM (return . (:[]) . Str . fst) remaining - defs <- mapM (parseFromString (many block) . snd) - remaining - return $ DefinitionList $ zip terms defs - --- --- line block --- - -lineBlockLine = try $ do - string "| " - white <- many (oneOf " \t") - line <- manyTill inline newline - return $ (if null white then [] else [Str white]) ++ line ++ [LineBreak] - -lineBlock = try $ do - lines <- many1 lineBlockLine - blanklines - return $ Para (concat lines) - --- --- paragraph block --- - -para = paraBeforeCodeBlock <|> paraNormal <?> "paragraph" - -codeBlockStart = string "::" >> blankline >> blankline - --- paragraph that ends in a :: starting a code block -paraBeforeCodeBlock = try $ do - result <- many1 (notFollowedBy' codeBlockStart >> inline) - lookAhead (string "::") - return $ Para $ if last result == Space - then normalizeSpaces result - else (normalizeSpaces result) ++ [Str ":"] - --- regular paragraph -paraNormal = try $ do - result <- many1 inline - newline - blanklines - return $ Para $ normalizeSpaces result - -plain = many1 inline >>= return . Plain . normalizeSpaces - --- --- image block --- - -imageBlock = try $ do - string ".. image:: " - src <- manyTill anyChar newline - fields <- option [] $ do indent <- lookAhead $ many (oneOf " /t") - many1 $ fieldListItem indent - optional blanklines - case lookup "alt" fields of - Just alt -> return $ Plain [Image [Str alt] (src, alt)] - Nothing -> return $ Plain [Image [Str "image"] (src, "")] --- --- header blocks --- - -header = doubleHeader <|> singleHeader <?> "header" - --- a header with lines on top and bottom -doubleHeader = try $ do - c <- oneOf underlineChars - rest <- many (char c) -- the top line - let lenTop = length (c:rest) - skipSpaces - newline - txt <- many1 (notFollowedBy blankline >> inline) - pos <- getPosition - let len = (sourceColumn pos) - 1 - if (len > lenTop) then fail "title longer than border" else return () - blankline -- spaces and newline - count lenTop (char c) -- the bottom line - blanklines - -- check to see if we've had this kind of header before. - -- if so, get appropriate level. if not, add to list. - state <- getState - let headerTable = stateHeaderTable state - let (headerTable',level) = case findIndex (== DoubleHeader c) headerTable of - Just ind -> (headerTable, ind + 1) - Nothing -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1) - setState (state { stateHeaderTable = headerTable' }) - return $ Header level (normalizeSpaces txt) - --- a header with line on the bottom only -singleHeader = try $ do - notFollowedBy' whitespace - txt <- many1 (do {notFollowedBy blankline; inline}) - pos <- getPosition - let len = (sourceColumn pos) - 1 - blankline - c <- oneOf underlineChars - rest <- count (len - 1) (char c) - many (char c) - blanklines - state <- getState - let headerTable = stateHeaderTable state - let (headerTable',level) = case findIndex (== SingleHeader c) headerTable of - Just ind -> (headerTable, ind + 1) - Nothing -> (headerTable ++ [SingleHeader c], (length headerTable) + 1) - setState (state { stateHeaderTable = headerTable' }) - return $ Header level (normalizeSpaces txt) - --- --- hrule block --- - -hrule = try $ do - chr <- oneOf underlineChars - count 3 (char chr) - skipMany (char chr) - blankline - blanklines - return HorizontalRule - --- --- code blocks --- - --- read a line indented by a given string -indentedLine indents = try $ do - string indents - result <- manyTill anyChar newline - return $ result ++ "\n" - --- two or more indented lines, possibly separated by blank lines. --- any amount of indentation will work. -indentedBlock = do - indents <- lookAhead $ many1 (oneOf " \t") - lns <- many $ choice $ [ indentedLine indents, - try $ do b <- blanklines - l <- indentedLine indents - return (b ++ l) ] - optional blanklines - return $ concat lns - -codeBlock = try $ do - codeBlockStart - result <- indentedBlock - return $ CodeBlock $ stripTrailingNewlines result - --- --- raw html --- - -rawHtmlBlock = try $ string ".. raw:: html" >> blanklines >> - indentedBlock >>= return . RawHtml - --- --- raw latex --- - -rawLaTeXBlock = try $ do - string ".. raw:: latex" - blanklines - result <- indentedBlock - return $ Para [(TeX result)] - --- --- block quotes --- - -blockQuote = do - raw <- indentedBlock - -- parse the extracted block, which may contain various block elements: - contents <- parseFromString parseBlocks $ raw ++ "\n\n" - return $ BlockQuote contents - --- --- list blocks --- - -list = choice [ bulletList, orderedList, definitionList ] <?> "list" - -definitionListItem = try $ do - term <- many1Till inline endline - raw <- indentedBlock - -- parse the extracted block, which may contain various block elements: - contents <- parseFromString parseBlocks $ raw ++ "\n\n" - return (normalizeSpaces term, contents) - -definitionList = many1 definitionListItem >>= return . DefinitionList - --- parses bullet list start and returns its length (inc. following whitespace) -bulletListStart = try $ do - notFollowedBy' hrule -- because hrules start out just like lists - marker <- oneOf bulletListMarkers - white <- many1 spaceChar - return $ length (marker:white) - --- parses ordered list start and returns its length (inc following whitespace) -orderedListStart style delim = try $ do - (_, markerLen) <- withHorizDisplacement (orderedListMarker style delim) - white <- many1 spaceChar - return $ markerLen + length white - --- parse a line of a list item -listLine markerLength = try $ do - notFollowedBy blankline - indentWith markerLength - line <- manyTill anyChar newline - return $ line ++ "\n" - --- indent by specified number of spaces (or equiv. tabs) -indentWith num = do - state <- getState - let tabStop = stateTabStop state - if (num < tabStop) - then count num (char ' ') - else choice [ try (count num (char ' ')), - (try (char '\t' >> count (num - tabStop) (char ' '))) ] - --- parse raw text for one list item, excluding start marker and continuations -rawListItem start = do - markerLength <- start - firstLine <- manyTill anyChar newline - restLines <- many (listLine markerLength) - return (markerLength, (firstLine ++ "\n" ++ (concat restLines))) - --- continuation of a list item - indented and separated by blankline or --- (in compact lists) endline. --- Note: nested lists are parsed as continuations. -listContinuation markerLength = try $ do - blanks <- many1 blankline - result <- many1 (listLine markerLength) - return $ blanks ++ concat result - -listItem start = try $ do - (markerLength, first) <- rawListItem start - rest <- many (listContinuation markerLength) - blanks <- choice [ try (many blankline >>~ lookAhead start), - many1 blankline ] -- whole list must end with blank. - -- parsing with ListItemState forces markers at beginning of lines to - -- count as list item markers, even if not separated by blank space. - -- see definition of "endline" - state <- getState - let oldContext = stateParserContext state - setState $ state {stateParserContext = ListItemState} - -- parse the extracted block, which may itself contain block elements - parsed <- parseFromString parseBlocks $ concat (first:rest) ++ blanks - updateState (\st -> st {stateParserContext = oldContext}) - return parsed - -orderedList = do - (start, style, delim) <- lookAhead (anyOrderedListMarker >>~ spaceChar) - items <- many1 (listItem (orderedListStart style delim)) - let items' = compactify items - return $ OrderedList (start, style, delim) items' - -bulletList = many1 (listItem bulletListStart) >>= - return . BulletList . compactify - --- --- unknown directive (e.g. comment) --- - -unknownDirective = try $ do - string ".. " - manyTill anyChar newline - many (string " :" >> many1 (noneOf "\n:") >> char ':' >> - many1 (noneOf "\n") >> newline) - optional blanklines - return Null - --- --- reference key --- - -referenceKey = do - startPos <- getPosition - key <- choice [imageKey, anonymousKey, regularKeyQuoted, regularKey] - st <- getState - let oldkeys = stateKeys st - updateState $ \st -> st { stateKeys = key : oldkeys } - optional blanklines - endPos <- getPosition - -- return enough blanks to replace key - return $ replicate (sourceLine endPos - sourceLine startPos) '\n' - -targetURI = do - skipSpaces - optional newline - contents <- many1 (try (many spaceChar >> newline >> - many1 spaceChar >> noneOf " \t\n") <|> noneOf "\n") - blanklines - return contents - -imageKey = try $ do - string ".. |" - ref <- manyTill inline (char '|') - skipSpaces - string "image::" - src <- targetURI - return (normalizeSpaces ref, (removeLeadingTrailingSpace src, "")) - -anonymousKey = try $ do - oneOfStrings [".. __:", "__"] - src <- targetURI - state <- getState - return ([Str "_"], (removeLeadingTrailingSpace src, "")) - -regularKeyQuoted = try $ do - string ".. _`" - ref <- manyTill inline (char '`') - char ':' - src <- targetURI - return (normalizeSpaces ref, (removeLeadingTrailingSpace src, "")) - -regularKey = try $ do - string ".. _" - ref <- manyTill inline (char ':') - src <- targetURI - return (normalizeSpaces ref, (removeLeadingTrailingSpace src, "")) - - -- - -- inline - -- - -inline = choice [ link - , str - , whitespace - , endline - , strong - , emph - , code - , image - , hyphens - , superscript - , subscript - , escapedChar - , symbol ] <?> "inline" - -hyphens = do - result <- many1 (char '-') - option Space endline - -- don't want to treat endline after hyphen or dash as a space - return $ Str result - -escapedChar = escaped anyChar - -symbol = do - result <- oneOf specialChars - return $ Str [result] - --- parses inline code, between codeStart and codeEnd -code = try $ do - string "``" - result <- manyTill anyChar (try (string "``")) - return $ Code $ removeLeadingTrailingSpace $ joinWithSep " " $ lines result - -emph = enclosed (char '*') (char '*') inline >>= - return . Emph . normalizeSpaces - -strong = enclosed (string "**") (try $ string "**") inline >>= - return . Strong . normalizeSpaces - -interpreted role = try $ do - optional $ try $ string "\\ " - result <- enclosed (string $ ":" ++ role ++ ":`") (char '`') anyChar - nextChar <- lookAhead anyChar - try (string "\\ ") <|> lookAhead (count 1 $ oneOf " \t\n") <|> (eof >> return "") - return [Str result] - -superscript = interpreted "sup" >>= (return . Superscript) - -subscript = interpreted "sub" >>= (return . Subscript) - -whitespace = many1 spaceChar >> return Space <?> "whitespace" - -str = notFollowedBy' oneWordReference >> - many1 (noneOf (specialChars ++ "\t\n ")) >>= return . Str - --- an endline character that can be treated as a space, not a structural break -endline = try $ do - newline - notFollowedBy blankline - -- parse potential list-starts at beginning of line differently in a list: - st <- getState - if (stateParserContext st) == ListItemState - then notFollowedBy (anyOrderedListMarker >> spaceChar) >> - notFollowedBy' bulletListStart - else return () - return Space - --- --- links --- - -link = choice [explicitLink, referenceLink, autoLink] <?> "link" - -explicitLink = try $ do - char '`' - notFollowedBy (char '`') -- `` is marks start of inline code - label <- manyTill inline (try (do {spaces; char '<'})) - src <- manyTill (noneOf ">\n ") (char '>') - skipSpaces - string "`_" - return $ Link (normalizeSpaces label) (removeLeadingTrailingSpace src, "") - -reference = try $ do - char '`' - notFollowedBy (char '`') - label <- many1Till inline (char '`') - char '_' - return label - -oneWordReference = do - raw <- many1 alphaNum - char '_' - notFollowedBy alphaNum -- because this_is_not a link - return [Str raw] - -referenceLink = try $ do - label <- reference <|> oneWordReference - key <- option label (do{char '_'; return [Str "_"]}) -- anonymous link - state <- getState - let keyTable = stateKeys state - src <- case lookupKeySrc keyTable key of - Nothing -> fail "no corresponding key" - Just target -> return target - -- if anonymous link, remove first anon key so it won't be used again - let keyTable' = if (key == [Str "_"]) -- anonymous link? - then delete ([Str "_"], src) keyTable -- remove first anon key - else keyTable - setState $ state { stateKeys = keyTable' } - return $ Link (normalizeSpaces label) src - -uriScheme = oneOfStrings [ "http://", "https://", "ftp://", "file://", - "mailto:", "news:", "telnet:" ] - -uri = try $ do - scheme <- uriScheme - identifier <- many1 (noneOf " \t\n") - return $ scheme ++ identifier - -autoURI = do - src <- uri - return $ Link [Str src] (src, "") - -emailChar = alphaNum <|> oneOf "-+_." - -emailAddress = try $ do - firstLetter <- alphaNum - restAddr <- many emailChar - let addr = firstLetter:restAddr - char '@' - dom <- domain - return $ addr ++ '@':dom - -domainChar = alphaNum <|> char '-' - -domain = do - first <- many1 domainChar - dom <- many1 (try (do{ char '.'; many1 domainChar })) - return $ joinWithSep "." (first:dom) - -autoEmail = do - src <- emailAddress - return $ Link [Str src] ("mailto:" ++ src, "") - -autoLink = autoURI <|> autoEmail - --- For now, we assume that all substitution references are for images. -image = try $ do - char '|' - ref <- manyTill inline (char '|') - state <- getState - let keyTable = stateKeys state - src <- case lookupKeySrc keyTable ref of - Nothing -> fail "no corresponding key" - Just target -> return target - return $ Image (normalizeSpaces ref) src - |