diff options
author | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2009-01-24 20:00:26 +0000 |
---|---|---|
committer | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2009-01-24 20:00:26 +0000 |
commit | 42aca57dee8d88afa5fac512aeb1198102908865 (patch) | |
tree | 1c6a98bd226f4fffde6768010715bc1d80e5d168 /Text/Pandoc/Readers | |
parent | 39e8d8486693029abfef84c45e85416f7c775280 (diff) | |
download | pandoc-42aca57dee8d88afa5fac512aeb1198102908865.tar.gz |
Moved all haskell source to src subdirectory.
git-svn-id: https://pandoc.googlecode.com/svn/trunk@1528 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'Text/Pandoc/Readers')
-rw-r--r-- | Text/Pandoc/Readers/HTML.hs | 675 | ||||
-rw-r--r-- | Text/Pandoc/Readers/LaTeX.hs | 774 | ||||
-rw-r--r-- | Text/Pandoc/Readers/Markdown.hs | 1243 | ||||
-rw-r--r-- | Text/Pandoc/Readers/RST.hs | 707 | ||||
-rw-r--r-- | Text/Pandoc/Readers/TeXMath.hs | 233 |
5 files changed, 0 insertions, 3632 deletions
diff --git a/Text/Pandoc/Readers/HTML.hs b/Text/Pandoc/Readers/HTML.hs deleted file mode 100644 index 65e512b5e..000000000 --- a/Text/Pandoc/Readers/HTML.hs +++ /dev/null @@ -1,675 +0,0 @@ -{- -Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.HTML - Copyright : Copyright (C) 2006-8 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane <jgm@berkeley.edu> - Stability : alpha - Portability : portable - -Conversion of HTML to 'Pandoc' document. --} -module Text.Pandoc.Readers.HTML ( - readHtml, - rawHtmlInline, - rawHtmlBlock, - anyHtmlBlockTag, - anyHtmlInlineTag, - anyHtmlTag, - anyHtmlEndTag, - htmlEndTag, - extractTagType, - htmlBlockElement, - unsanitaryURI - ) where - -import Text.ParserCombinators.Parsec -import Text.Pandoc.Definition -import Text.Pandoc.Shared -import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) -import Data.Maybe ( fromMaybe ) -import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf, intercalate ) -import Data.Char ( toLower, isAlphaNum ) -import Network.URI ( parseURIReference, URI (..) ) - --- | Convert HTML-formatted string to 'Pandoc' document. -readHtml :: ParserState -- ^ Parser state - -> String -- ^ String to parse - -> Pandoc -readHtml = readWith parseHtml - --- --- Constants --- - -eitherBlockOrInline :: [[Char]] -eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins", - "map", "area", "object"] - -{- -inlineHtmlTags :: [[Char]] -inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", - "br", "cite", "code", "dfn", "em", "font", "i", "img", - "input", "kbd", "label", "q", "s", "samp", "select", - "small", "span", "strike", "strong", "sub", "sup", - "textarea", "tt", "u", "var"] ++ eitherBlockOrInline --} - -blockHtmlTags :: [[Char]] -blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div", - "dl", "fieldset", "form", "h1", "h2", "h3", "h4", - "h5", "h6", "hr", "html", "isindex", "menu", "noframes", - "noscript", "ol", "p", "pre", "table", "ul", "dd", - "dt", "frameset", "li", "tbody", "td", "tfoot", - "th", "thead", "tr", "script"] ++ eitherBlockOrInline - -sanitaryTags :: [[Char]] -sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big", - "blockquote", "br", "button", "caption", "center", - "cite", "code", "col", "colgroup", "dd", "del", "dfn", - "dir", "div", "dl", "dt", "em", "fieldset", "font", - "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr", - "i", "img", "input", "ins", "kbd", "label", "legend", - "li", "map", "menu", "ol", "optgroup", "option", "p", - "pre", "q", "s", "samp", "select", "small", "span", - "strike", "strong", "sub", "sup", "table", "tbody", - "td", "textarea", "tfoot", "th", "thead", "tr", "tt", - "u", "ul", "var"] - -sanitaryAttributes :: [[Char]] -sanitaryAttributes = ["abbr", "accept", "accept-charset", - "accesskey", "action", "align", "alt", "axis", - "border", "cellpadding", "cellspacing", "char", - "charoff", "charset", "checked", "cite", "class", - "clear", "cols", "colspan", "color", "compact", - "coords", "datetime", "dir", "disabled", - "enctype", "for", "frame", "headers", "height", - "href", "hreflang", "hspace", "id", "ismap", - "label", "lang", "longdesc", "maxlength", "media", - "method", "multiple", "name", "nohref", "noshade", - "nowrap", "prompt", "readonly", "rel", "rev", - "rows", "rowspan", "rules", "scope", "selected", - "shape", "size", "span", "src", "start", - "summary", "tabindex", "target", "title", "type", - "usemap", "valign", "value", "vspace", "width"] - --- --- HTML utility functions --- - --- | Returns @True@ if sanitization is specified and the specified tag is --- not on the sanitized tag list. -unsanitaryTag :: [Char] - -> GenParser tok ParserState Bool -unsanitaryTag tag = do - st <- getState - return $ stateSanitizeHTML st && tag `notElem` sanitaryTags - --- | returns @True@ if sanitization is specified and the specified attribute --- is not on the sanitized attribute list. -unsanitaryAttribute :: ([Char], String, t) - -> GenParser tok ParserState Bool -unsanitaryAttribute (attr, val, _) = do - st <- getState - return $ stateSanitizeHTML st && - (attr `notElem` sanitaryAttributes || - (attr `elem` ["href","src"] && unsanitaryURI val)) - --- | Returns @True@ if the specified URI is potentially a security risk. -unsanitaryURI :: String -> Bool -unsanitaryURI u = - let safeURISchemes = [ "", "http:", "https:", "ftp:", "mailto:", "file:", - "telnet:", "gopher:", "aaa:", "aaas:", "acap:", "cap:", "cid:", - "crid:", "dav:", "dict:", "dns:", "fax:", "go:", "h323:", "im:", - "imap:", "ldap:", "mid:", "news:", "nfs:", "nntp:", "pop:", - "pres:", "sip:", "sips:", "snmp:", "tel:", "urn:", "wais:", - "xmpp:", "z39.50r:", "z39.50s:", "aim:", "callto:", "cvs:", - "ed2k:", "feed:", "fish:", "gg:", "irc:", "ircs:", "lastfm:", - "ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:", - "secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:", - "snews:", "webcal:", "ymsgr:"] - in case parseURIReference u of - Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes - Nothing -> True - --- | Read blocks until end tag. -blocksTilEnd :: String -> GenParser Char ParserState [Block] -blocksTilEnd tag = do - blocks <- manyTill (block >>~ spaces) (htmlEndTag tag) - return $ filter (/= Null) blocks - --- | Read inlines until end tag. -inlinesTilEnd :: String -> GenParser Char ParserState [Inline] -inlinesTilEnd tag = manyTill inline (htmlEndTag tag) - --- | Parse blocks between open and close tag. -blocksIn :: String -> GenParser Char ParserState [Block] -blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag - --- | Parse inlines between open and close tag. -inlinesIn :: String -> GenParser Char ParserState [Inline] -inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag - --- | Extract type from a tag: e.g. @br@ from @\<br\>@ -extractTagType :: String -> String -extractTagType ('<':rest) = - let isSpaceOrSlash c = c `elem` "/ \n\t" in - map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest -extractTagType _ = "" - --- | Parse any HTML tag (opening or self-closing) and return text of tag -anyHtmlTag :: GenParser Char ParserState [Char] -anyHtmlTag = try $ do - char '<' - spaces - tag <- many1 alphaNum - attribs <- many htmlAttribute - spaces - ender <- option "" (string "/") - let ender' = if null ender then "" else " /" - spaces - char '>' - let result = "<" ++ tag ++ - concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" - unsanitary <- unsanitaryTag tag - if unsanitary - then return $ "<!-- unsafe HTML removed -->" - else return result - -anyHtmlEndTag :: GenParser Char ParserState [Char] -anyHtmlEndTag = try $ do - char '<' - spaces - char '/' - spaces - tag <- many1 alphaNum - spaces - char '>' - let result = "</" ++ tag ++ ">" - unsanitary <- unsanitaryTag tag - if unsanitary - then return $ "<!-- unsafe HTML removed -->" - else return result - -htmlTag :: String -> GenParser Char ParserState (String, [(String, String)]) -htmlTag tag = try $ do - char '<' - spaces - stringAnyCase tag - attribs <- many htmlAttribute - spaces - optional (string "/") - spaces - char '>' - return (tag, (map (\(name, content, _) -> (name, content)) attribs)) - --- parses a quoted html attribute value -quoted :: Char -> GenParser Char st (String, String) -quoted quoteChar = do - result <- between (char quoteChar) (char quoteChar) - (many (noneOf [quoteChar])) - return (result, [quoteChar]) - -nullAttribute :: ([Char], [Char], [Char]) -nullAttribute = ("", "", "") - -htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char]) -htmlAttribute = do - attr <- htmlRegularAttribute <|> htmlMinimizedAttribute - unsanitary <- unsanitaryAttribute attr - if unsanitary - then return nullAttribute - else return attr - --- minimized boolean attribute -htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char]) -htmlMinimizedAttribute = try $ do - many1 space - name <- many1 (choice [letter, oneOf ".-_:"]) - return (name, name, name) - -htmlRegularAttribute :: GenParser Char st ([Char], [Char], [Char]) -htmlRegularAttribute = try $ do - many1 space - name <- many1 (choice [letter, oneOf ".-_:"]) - spaces - char '=' - spaces - (content, quoteStr) <- choice [ (quoted '\''), - (quoted '"'), - (do - a <- many (alphaNum <|> (oneOf "-._:")) - return (a,"")) ] - return (name, content, - (name ++ "=" ++ quoteStr ++ content ++ quoteStr)) - --- | Parse an end tag of type 'tag' -htmlEndTag :: [Char] -> GenParser Char st [Char] -htmlEndTag tag = try $ do - char '<' - spaces - char '/' - spaces - stringAnyCase tag - spaces - char '>' - return $ "</" ++ tag ++ ">" - -{- --- | Returns @True@ if the tag is (or can be) an inline tag. -isInline :: String -> Bool -isInline tag = (extractTagType tag) `elem` inlineHtmlTags --} - --- | Returns @True@ if the tag is (or can be) a block tag. -isBlock :: String -> Bool -isBlock tag = (extractTagType tag) `elem` blockHtmlTags - -anyHtmlBlockTag :: GenParser Char ParserState [Char] -anyHtmlBlockTag = try $ do - tag <- anyHtmlTag <|> anyHtmlEndTag - if isBlock tag then return tag else fail "not a block tag" - -anyHtmlInlineTag :: GenParser Char ParserState [Char] -anyHtmlInlineTag = try $ do - tag <- anyHtmlTag <|> anyHtmlEndTag - if not (isBlock tag) then return tag else fail "not an inline tag" - --- | Parses material between script tags. --- Scripts must be treated differently, because they can contain '<>' etc. -htmlScript :: GenParser Char ParserState [Char] -htmlScript = try $ do - open <- string "<script" - rest <- manyTill anyChar (htmlEndTag "script") - st <- getState - if stateSanitizeHTML st && not ("script" `elem` sanitaryTags) - then return "<!-- unsafe HTML removed -->" - else return $ open ++ rest ++ "</script>" - --- | Parses material between style tags. --- Style tags must be treated differently, because they can contain CSS -htmlStyle :: GenParser Char ParserState [Char] -htmlStyle = try $ do - open <- string "<style" - rest <- manyTill anyChar (htmlEndTag "style") - st <- getState - if stateSanitizeHTML st && not ("style" `elem` sanitaryTags) - then return "<!-- unsafe HTML removed -->" - else return $ open ++ rest ++ "</style>" - -htmlBlockElement :: GenParser Char ParserState [Char] -htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ] - -rawHtmlBlock :: GenParser Char ParserState Block -rawHtmlBlock = try $ do - body <- htmlBlockElement <|> rawVerbatimBlock <|> anyHtmlBlockTag - state <- getState - if stateParseRaw state then return (RawHtml body) else return Null - --- This is a block whose contents should be passed through verbatim, not interpreted. -rawVerbatimBlock :: GenParser Char ParserState [Char] -rawVerbatimBlock = try $ do - start <- anyHtmlBlockTag - let tagtype = extractTagType start - if tagtype `elem` ["pre"] - then do - contents <- many (notFollowedBy' (htmlEndTag tagtype) >> anyChar) - end <- htmlEndTag tagtype - return $ start ++ contents ++ end - else fail "Not a verbatim block" - --- We don't want to parse </body> or </html> as raw HTML, since these --- are handled in parseHtml. -rawHtmlBlock' :: GenParser Char ParserState Block -rawHtmlBlock' = do notFollowedBy' (htmlTag "/body" <|> htmlTag "/html") - rawHtmlBlock - --- | Parses an HTML comment. -htmlComment :: GenParser Char st [Char] -htmlComment = try $ do - string "<!--" - comment <- manyTill anyChar (try (string "-->")) - return $ "<!--" ++ comment ++ "-->" - --- --- parsing documents --- - -xmlDec :: GenParser Char st [Char] -xmlDec = try $ do - string "<?" - rest <- manyTill anyChar (char '>') - return $ "<?" ++ rest ++ ">" - -definition :: GenParser Char st [Char] -definition = try $ do - string "<!" - rest <- manyTill anyChar (char '>') - return $ "<!" ++ rest ++ ">" - -nonTitleNonHead :: GenParser Char ParserState Char -nonTitleNonHead = try $ do - notFollowedBy $ (htmlTag "title" >> return ' ') <|> - (htmlEndTag "head" >> return ' ') - (rawHtmlBlock >> return ' ') <|> anyChar - -parseTitle :: GenParser Char ParserState [Inline] -parseTitle = try $ do - (tag, _) <- htmlTag "title" - contents <- inlinesTilEnd tag - spaces - return contents - --- parse header and return meta-information (for now, just title) -parseHead :: GenParser Char ParserState ([Inline], [a], [Char]) -parseHead = try $ do - htmlTag "head" - spaces - skipMany nonTitleNonHead - contents <- option [] parseTitle - skipMany nonTitleNonHead - htmlEndTag "head" - return (contents, [], "") - -skipHtmlTag :: String -> GenParser Char ParserState () -skipHtmlTag tag = optional (htmlTag tag) - --- h1 class="title" representation of title in body -bodyTitle :: GenParser Char ParserState [Inline] -bodyTitle = try $ do - (_, attribs) <- htmlTag "h1" - case (extractAttribute "class" attribs) of - Just "title" -> return "" - _ -> fail "not title" - inlinesTilEnd "h1" - -parseHtml :: GenParser Char ParserState Pandoc -parseHtml = do - sepEndBy (choice [xmlDec, definition, htmlComment]) spaces - skipHtmlTag "html" - spaces - (title, authors, date) <- option ([], [], "") parseHead - spaces - skipHtmlTag "body" - spaces - optional bodyTitle -- skip title in body, because it's represented in meta - blocks <- parseBlocks - spaces - optional (htmlEndTag "body") - spaces - optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html> - eof - return $ Pandoc (Meta title authors date) blocks - --- --- parsing blocks --- - -parseBlocks :: GenParser Char ParserState [Block] -parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null)) - -block :: GenParser Char ParserState Block -block = choice [ codeBlock - , header - , hrule - , list - , blockQuote - , para - , plain - , rawHtmlBlock' - ] <?> "block" - --- --- header blocks --- - -header :: GenParser Char ParserState Block -header = choice (map headerLevel (enumFromTo 1 5)) <?> "header" - -headerLevel :: Int -> GenParser Char ParserState Block -headerLevel n = try $ do - let level = "h" ++ show n - htmlTag level - contents <- inlinesTilEnd level - return $ Header n (normalizeSpaces contents) - --- --- hrule block --- - -hrule :: GenParser Char ParserState Block -hrule = try $ do - (_, attribs) <- htmlTag "hr" - state <- getState - if not (null attribs) && stateParseRaw state - then unexpected "attributes in hr" -- parse as raw in this case - else return HorizontalRule - --- --- code blocks --- - --- Note: HTML tags in code blocks (e.g. for syntax highlighting) are --- skipped, because they are not portable to output formats other than HTML. -codeBlock :: GenParser Char ParserState Block -codeBlock = try $ do - htmlTag "pre" - result <- manyTill - (many1 (satisfy (/= '<')) <|> - ((anyHtmlTag <|> anyHtmlEndTag) >> return "")) - (htmlEndTag "pre") - let result' = concat result - -- drop leading newline if any - let result'' = if "\n" `isPrefixOf` result' - then drop 1 result' - else result' - -- drop trailing newline if any - let result''' = if "\n" `isSuffixOf` result'' - then init result'' - else result'' - return $ CodeBlock ("",[],[]) $ decodeCharacterReferences result''' - --- --- block quotes --- - -blockQuote :: GenParser Char ParserState Block -blockQuote = try $ htmlTag "blockquote" >> spaces >> - blocksTilEnd "blockquote" >>= (return . BlockQuote) - --- --- list blocks --- - -list :: GenParser Char ParserState Block -list = choice [ bulletList, orderedList, definitionList ] <?> "list" - -orderedList :: GenParser Char ParserState Block -orderedList = try $ do - (_, attribs) <- htmlTag "ol" - (start, style) <- option (1, DefaultStyle) $ - do failIfStrict - let sta = fromMaybe "1" $ - lookup "start" attribs - let sty = fromMaybe (fromMaybe "" $ - lookup "style" attribs) $ - lookup "class" attribs - let sty' = case sty of - "lower-roman" -> LowerRoman - "upper-roman" -> UpperRoman - "lower-alpha" -> LowerAlpha - "upper-alpha" -> UpperAlpha - "decimal" -> Decimal - _ -> DefaultStyle - return (read sta, sty') - spaces - items <- sepEndBy1 (blocksIn "li") spaces - htmlEndTag "ol" - return $ OrderedList (start, style, DefaultDelim) items - -bulletList :: GenParser Char ParserState Block -bulletList = try $ do - htmlTag "ul" - spaces - items <- sepEndBy1 (blocksIn "li") spaces - htmlEndTag "ul" - return $ BulletList items - -definitionList :: GenParser Char ParserState Block -definitionList = try $ do - failIfStrict -- def lists not part of standard markdown - htmlTag "dl" - spaces - items <- sepEndBy1 definitionListItem spaces - htmlEndTag "dl" - return $ DefinitionList items - -definitionListItem :: GenParser Char ParserState ([Inline], [Block]) -definitionListItem = try $ do - terms <- sepEndBy1 (inlinesIn "dt") spaces - defs <- sepEndBy1 (blocksIn "dd") spaces - let term = intercalate [LineBreak] terms - return (term, concat defs) - --- --- paragraph block --- - -para :: GenParser Char ParserState Block -para = try $ htmlTag "p" >> inlinesTilEnd "p" >>= - return . Para . normalizeSpaces - --- --- plain block --- - -plain :: GenParser Char ParserState Block -plain = many1 inline >>= return . Plain . normalizeSpaces - --- --- inline --- - -inline :: GenParser Char ParserState Inline -inline = choice [ charRef - , strong - , emph - , superscript - , subscript - , strikeout - , spanStrikeout - , code - , str - , linebreak - , whitespace - , link - , image - , rawHtmlInline - ] <?> "inline" - -code :: GenParser Char ParserState Inline -code = try $ do - htmlTag "code" - result <- manyTill anyChar (htmlEndTag "code") - -- remove internal line breaks, leading and trailing space, - -- and decode character references - return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $ - intercalate " " $ lines result - -rawHtmlInline :: GenParser Char ParserState Inline -rawHtmlInline = do - result <- htmlScript <|> htmlStyle <|> htmlComment <|> anyHtmlInlineTag - state <- getState - if stateParseRaw state then return (HtmlInline result) else return (Str "") - -betweenTags :: [Char] -> GenParser Char ParserState [Inline] -betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>= - return . normalizeSpaces - -emph :: GenParser Char ParserState Inline -emph = (betweenTags "em" <|> betweenTags "i") >>= return . Emph - -strong :: GenParser Char ParserState Inline -strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong - -superscript :: GenParser Char ParserState Inline -superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript - -subscript :: GenParser Char ParserState Inline -subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript - -strikeout :: GenParser Char ParserState Inline -strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>= - return . Strikeout - -spanStrikeout :: GenParser Char ParserState Inline -spanStrikeout = try $ do - failIfStrict -- strict markdown has no strikeout, so treat as raw HTML - (_, attributes) <- htmlTag "span" - result <- case (extractAttribute "class" attributes) of - Just "strikeout" -> inlinesTilEnd "span" - _ -> fail "not a strikeout" - return $ Strikeout result - -whitespace :: GenParser Char st Inline -whitespace = many1 space >> return Space - --- hard line break -linebreak :: GenParser Char ParserState Inline -linebreak = htmlTag "br" >> optional newline >> return LineBreak - -str :: GenParser Char st Inline -str = many1 (noneOf "<& \t\n") >>= return . Str - --- --- links and images --- - --- extract contents of attribute (attribute names are case-insensitive) -extractAttribute :: [Char] -> [([Char], String)] -> Maybe String -extractAttribute _ [] = Nothing -extractAttribute name ((attrName, contents):rest) = - let name' = map toLower name - attrName' = map toLower attrName - in if attrName' == name' - then Just (decodeCharacterReferences contents) - else extractAttribute name rest - -link :: GenParser Char ParserState Inline -link = try $ do - (_, attributes) <- htmlTag "a" - url <- case (extractAttribute "href" attributes) of - Just url -> return url - Nothing -> fail "no href" - let title = fromMaybe "" $ extractAttribute "title" attributes - lab <- inlinesTilEnd "a" - return $ Link (normalizeSpaces lab) (url, title) - -image :: GenParser Char ParserState Inline -image = try $ do - (_, attributes) <- htmlTag "img" - url <- case (extractAttribute "src" attributes) of - Just url -> return url - Nothing -> fail "no src" - let title = fromMaybe "" $ extractAttribute "title" attributes - let alt = fromMaybe "" (extractAttribute "alt" attributes) - return $ Image [Str alt] (url, title) - diff --git a/Text/Pandoc/Readers/LaTeX.hs b/Text/Pandoc/Readers/LaTeX.hs deleted file mode 100644 index 9ba5bf372..000000000 --- a/Text/Pandoc/Readers/LaTeX.hs +++ /dev/null @@ -1,774 +0,0 @@ -{- -Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.LaTeX - Copyright : Copyright (C) 2006-8 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane <jgm@berkeley.edu> - Stability : alpha - Portability : portable - -Conversion of LaTeX to 'Pandoc' document. --} -module Text.Pandoc.Readers.LaTeX ( - readLaTeX, - rawLaTeXInline, - rawLaTeXEnvironment' - ) where - -import Text.ParserCombinators.Parsec -import Text.Pandoc.Definition -import Text.Pandoc.Shared -import Data.Maybe ( fromMaybe ) -import Data.Char ( chr ) -import Data.List ( isPrefixOf, isSuffixOf ) - --- | Parse LaTeX from string and return 'Pandoc' document. -readLaTeX :: ParserState -- ^ Parser state, including options for parser - -> String -- ^ String to parse - -> Pandoc -readLaTeX = readWith parseLaTeX - --- characters with special meaning -specialChars :: [Char] -specialChars = "\\`$%^&_~#{}\n \t|<>'\"-" - --- --- utility functions --- - --- | Returns text between brackets and its matching pair. -bracketedText :: Char -> Char -> GenParser Char st [Char] -bracketedText openB closeB = do - result <- charsInBalanced' openB closeB - return $ [openB] ++ result ++ [closeB] - --- | Returns an option or argument of a LaTeX command. -optOrArg :: GenParser Char st [Char] -optOrArg = bracketedText '{' '}' <|> bracketedText '[' ']' - --- | True if the string begins with '{'. -isArg :: [Char] -> Bool -isArg ('{':_) = True -isArg _ = False - --- | Returns list of options and arguments of a LaTeX command. -commandArgs :: GenParser Char st [[Char]] -commandArgs = many optOrArg - --- | Parses LaTeX command, returns (name, star, list of options or arguments). -command :: GenParser Char st ([Char], [Char], [[Char]]) -command = do - char '\\' - name <- many1 letter - star <- option "" (string "*") -- some commands have starred versions - args <- commandArgs - return (name, star, args) - -begin :: [Char] -> GenParser Char st [Char] -begin name = try $ do - string $ "\\begin{" ++ name ++ "}" - optional commandArgs - spaces - return name - -end :: [Char] -> GenParser Char st [Char] -end name = try $ do - string $ "\\end{" ++ name ++ "}" - return name - --- | Returns a list of block elements containing the contents of an --- environment. -environment :: [Char] -> GenParser Char ParserState [Block] -environment name = try $ begin name >> spaces >> manyTill block (end name) >>~ spaces - -anyEnvironment :: GenParser Char ParserState Block -anyEnvironment = try $ do - string "\\begin{" - name <- many letter - star <- option "" (string "*") -- some environments have starred variants - char '}' - optional commandArgs - spaces - contents <- manyTill block (end (name ++ star)) - spaces - return $ BlockQuote contents - --- --- parsing documents --- - --- | Process LaTeX preamble, extracting metadata. -processLaTeXPreamble :: GenParser Char ParserState () -processLaTeXPreamble = try $ manyTill - (choice [bibliographic, comment, unknownCommand, nullBlock]) - (try (string "\\begin{document}")) >> - spaces - --- | Parse LaTeX and return 'Pandoc'. -parseLaTeX :: GenParser Char ParserState Pandoc -parseLaTeX = do - optional processLaTeXPreamble -- preamble might not be present (fragment) - spaces - blocks <- parseBlocks - spaces - optional $ try (string "\\end{document}" >> many anyChar) - -- might not be present (fragment) - spaces - eof - state <- getState - let blocks' = filter (/= Null) blocks - let title' = stateTitle state - let authors' = stateAuthors state - let date' = stateDate state - return $ Pandoc (Meta title' authors' date') blocks' - --- --- parsing blocks --- - -parseBlocks :: GenParser Char ParserState [Block] -parseBlocks = spaces >> many block - -block :: GenParser Char ParserState Block -block = choice [ hrule - , codeBlock - , header - , list - , blockQuote - , comment - , bibliographic - , para - , itemBlock - , unknownEnvironment - , ignore - , unknownCommand ] <?> "block" - --- --- header blocks --- - -header :: GenParser Char ParserState Block -header = try $ do - char '\\' - subs <- many (try (string "sub")) - string "section" - optional (char '*') - char '{' - title' <- manyTill inline (char '}') - spaces - return $ Header (length subs + 1) (normalizeSpaces title') - --- --- hrule block --- - -hrule :: GenParser Char st Block -hrule = oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n", - "\\newpage" ] >> spaces >> return HorizontalRule - --- --- code blocks --- - -codeBlock :: GenParser Char ParserState Block -codeBlock = codeBlockWith "verbatim" <|> codeBlockWith "Verbatim" <|> lhsCodeBlock --- Note: Verbatim is from fancyvrb. - -codeBlockWith :: String -> GenParser Char st Block -codeBlockWith env = try $ do - string ("\\begin{" ++ env ++ "}") -- don't use begin function because it - -- gobbles whitespace - optional blanklines -- we want to gobble blank lines, but not - -- leading space - contents <- manyTill anyChar (try (string $ "\\end{" ++ env ++ "}")) - spaces - let classes = if env == "code" then ["haskell"] else [] - return $ CodeBlock ("",classes,[]) (stripTrailingNewlines contents) - -lhsCodeBlock :: GenParser Char ParserState Block -lhsCodeBlock = do - failUnlessLHS - (CodeBlock (_,_,_) cont) <- codeBlockWith "code" - return $ CodeBlock ("", ["sourceCode","haskell"], []) cont - --- --- block quotes --- - -blockQuote :: GenParser Char ParserState Block -blockQuote = (environment "quote" <|> environment "quotation") >>~ spaces >>= - return . BlockQuote - --- --- list blocks --- - -list :: GenParser Char ParserState Block -list = bulletList <|> orderedList <|> definitionList <?> "list" - -listItem :: GenParser Char ParserState ([Inline], [Block]) -listItem = try $ do - ("item", _, args) <- command - spaces - state <- getState - let oldParserContext = stateParserContext state - updateState (\s -> s {stateParserContext = ListItemState}) - blocks <- many block - updateState (\s -> s {stateParserContext = oldParserContext}) - opt <- case args of - ([x]) | "[" `isPrefixOf` x && "]" `isSuffixOf` x -> - parseFromString (many inline) $ tail $ init x - _ -> return [] - return (opt, blocks) - -orderedList :: GenParser Char ParserState Block -orderedList = try $ do - string "\\begin{enumerate}" - (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) $ - try $ do failIfStrict - char '[' - res <- anyOrderedListMarker - char ']' - return res - spaces - option "" $ try $ do string "\\setlength{\\itemindent}" - char '{' - manyTill anyChar (char '}') - spaces - start <- option 1 $ try $ do failIfStrict - string "\\setcounter{enum" - many1 (oneOf "iv") - string "}{" - num <- many1 digit - char '}' - spaces - return $ (read num) + 1 - items <- many listItem - end "enumerate" - spaces - return $ OrderedList (start, style, delim) $ map snd items - -bulletList :: GenParser Char ParserState Block -bulletList = try $ do - begin "itemize" - spaces - items <- many listItem - end "itemize" - spaces - return (BulletList $ map snd items) - -definitionList :: GenParser Char ParserState Block -definitionList = try $ do - begin "description" - spaces - items <- many listItem - end "description" - spaces - return (DefinitionList items) - --- --- paragraph block --- - -para :: GenParser Char ParserState Block -para = do - res <- many1 inline - spaces - return $ if null (filter (`notElem` [Str "", Space]) res) - then Null - else Para $ normalizeSpaces res - --- --- title authors date --- - -bibliographic :: GenParser Char ParserState Block -bibliographic = choice [ maketitle, title, authors, date ] - -maketitle :: GenParser Char st Block -maketitle = try (string "\\maketitle") >> spaces >> return Null - -title :: GenParser Char ParserState Block -title = try $ do - string "\\title{" - tit <- manyTill inline (char '}') - spaces - updateState (\state -> state { stateTitle = tit }) - return Null - -authors :: GenParser Char ParserState Block -authors = try $ do - string "\\author{" - authors' <- manyTill anyChar (char '}') - spaces - let authors'' = map removeLeadingTrailingSpace $ lines $ - substitute "\\\\" "\n" authors' - updateState (\s -> s { stateAuthors = authors'' }) - return Null - -date :: GenParser Char ParserState Block -date = try $ do - string "\\date{" - date' <- manyTill anyChar (char '}') - spaces - updateState (\state -> state { stateDate = date' }) - return Null - --- --- item block --- for use in unknown environments that aren't being parsed as raw latex --- - --- this forces items to be parsed in different blocks -itemBlock :: GenParser Char ParserState Block -itemBlock = try $ do - ("item", _, args) <- command - state <- getState - if stateParserContext state == ListItemState - then fail "item should be handled by list block" - else if null args - then return Null - else return $ Plain [Str (stripFirstAndLast (head args))] - --- --- raw LaTeX --- - --- | Parse any LaTeX environment and return a Para block containing --- the whole literal environment as raw TeX. -rawLaTeXEnvironment :: GenParser Char st Block -rawLaTeXEnvironment = do - contents <- rawLaTeXEnvironment' - spaces - return $ Para [TeX contents] - --- | Parse any LaTeX environment and return a string containing --- the whole literal environment as raw TeX. -rawLaTeXEnvironment' :: GenParser Char st String -rawLaTeXEnvironment' = try $ do - string "\\begin{" - name <- many1 letter - star <- option "" (string "*") -- for starred variants - let name' = name ++ star - char '}' - args <- option [] commandArgs - let argStr = concat args - contents <- manyTill (choice [ (many1 (noneOf "\\")), - rawLaTeXEnvironment', - string "\\" ]) - (end name') - return $ "\\begin{" ++ name' ++ "}" ++ argStr ++ - concat contents ++ "\\end{" ++ name' ++ "}" - -unknownEnvironment :: GenParser Char ParserState Block -unknownEnvironment = try $ do - state <- getState - result <- if stateParseRaw state -- check whether we should include raw TeX - then rawLaTeXEnvironment -- if so, get whole raw environment - else anyEnvironment -- otherwise just the contents - return result - --- \ignore{} is used conventionally in literate haskell for definitions --- that are to be processed by the compiler but not printed. -ignore :: GenParser Char ParserState Block -ignore = try $ do - ("ignore", _, _) <- command - spaces - return Null - -unknownCommand :: GenParser Char ParserState Block -unknownCommand = try $ do - notFollowedBy' $ choice $ map end ["itemize", "enumerate", "description", - "document"] - state <- getState - if stateParserContext state == ListItemState - then notFollowedBy' $ string "\\item" - else return () - if stateParseRaw state - then do - (name, star, args) <- command - spaces - return $ Plain [TeX ("\\" ++ name ++ star ++ concat args)] - else do -- skip unknown command, leaving arguments to be parsed - char '\\' - letter - many (letter <|> digit) - optional (try $ string "{}") - spaces - return Null - --- latex comment -comment :: GenParser Char st Block -comment = try $ char '%' >> manyTill anyChar newline >> spaces >> return Null - --- --- inline --- - -inline :: GenParser Char ParserState Inline -inline = choice [ str - , endline - , whitespace - , quoted - , apostrophe - , spacer - , strong - , math - , ellipses - , emDash - , enDash - , hyphen - , emph - , strikeout - , superscript - , subscript - , ref - , lab - , code - , url - , link - , image - , footnote - , linebreak - , accentedChar - , specialChar - , rawLaTeXInline - , escapedChar - , unescapedChar - ] <?> "inline" - -accentedChar :: GenParser Char st Inline -accentedChar = normalAccentedChar <|> specialAccentedChar - -normalAccentedChar :: GenParser Char st Inline -normalAccentedChar = try $ do - char '\\' - accent <- oneOf "'`^\"~" - character <- (try $ char '{' >> letter >>~ char '}') <|> letter - let table = fromMaybe [] $ lookup character accentTable - let result = case lookup accent table of - Just num -> chr num - Nothing -> '?' - return $ Str [result] - --- an association list of letters and association list of accents --- and decimal character numbers. -accentTable :: [(Char, [(Char, Int)])] -accentTable = - [ ('A', [('`', 192), ('\'', 193), ('^', 194), ('~', 195), ('"', 196)]), - ('E', [('`', 200), ('\'', 201), ('^', 202), ('"', 203)]), - ('I', [('`', 204), ('\'', 205), ('^', 206), ('"', 207)]), - ('N', [('~', 209)]), - ('O', [('`', 210), ('\'', 211), ('^', 212), ('~', 213), ('"', 214)]), - ('U', [('`', 217), ('\'', 218), ('^', 219), ('"', 220)]), - ('a', [('`', 224), ('\'', 225), ('^', 227), ('"', 228)]), - ('e', [('`', 232), ('\'', 233), ('^', 234), ('"', 235)]), - ('i', [('`', 236), ('\'', 237), ('^', 238), ('"', 239)]), - ('n', [('~', 241)]), - ('o', [('`', 242), ('\'', 243), ('^', 244), ('~', 245), ('"', 246)]), - ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ] - -specialAccentedChar :: GenParser Char st Inline -specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig, - oslash, pound, euro, copyright, sect ] - -ccedil :: GenParser Char st Inline -ccedil = try $ do - char '\\' - letter' <- oneOfStrings ["cc", "cC"] - let num = if letter' == "cc" then 231 else 199 - return $ Str [chr num] - -aring :: GenParser Char st Inline -aring = try $ do - char '\\' - letter' <- oneOfStrings ["aa", "AA"] - let num = if letter' == "aa" then 229 else 197 - return $ Str [chr num] - -iuml :: GenParser Char st Inline -iuml = try (string "\\\"") >> oneOfStrings ["\\i", "{\\i}"] >> - return (Str [chr 239]) - -szlig :: GenParser Char st Inline -szlig = try (string "\\ss") >> return (Str [chr 223]) - -oslash :: GenParser Char st Inline -oslash = try $ do - char '\\' - letter' <- choice [char 'o', char 'O'] - let num = if letter' == 'o' then 248 else 216 - return $ Str [chr num] - -aelig :: GenParser Char st Inline -aelig = try $ do - char '\\' - letter' <- oneOfStrings ["ae", "AE"] - let num = if letter' == "ae" then 230 else 198 - return $ Str [chr num] - -pound :: GenParser Char st Inline -pound = try (string "\\pounds") >> return (Str [chr 163]) - -euro :: GenParser Char st Inline -euro = try (string "\\euro") >> return (Str [chr 8364]) - -copyright :: GenParser Char st Inline -copyright = try (string "\\copyright") >> return (Str [chr 169]) - -sect :: GenParser Char st Inline -sect = try (string "\\S") >> return (Str [chr 167]) - -escapedChar :: GenParser Char st Inline -escapedChar = do - result <- escaped (oneOf " $%&_#{}\n") - return $ if result == Str "\n" then Str " " else result - --- nonescaped special characters -unescapedChar :: GenParser Char st Inline -unescapedChar = oneOf "`$^&_#{}|<>" >>= return . (\c -> Str [c]) - -specialChar :: GenParser Char st Inline -specialChar = choice [ backslash, tilde, caret, bar, lt, gt, doubleQuote ] - -backslash :: GenParser Char st Inline -backslash = try (string "\\textbackslash") >> optional (try $ string "{}") >> return (Str "\\") - -tilde :: GenParser Char st Inline -tilde = try (string "\\ensuremath{\\sim}") >> return (Str "~") - -caret :: GenParser Char st Inline -caret = try (string "\\^{}") >> return (Str "^") - -bar :: GenParser Char st Inline -bar = try (string "\\textbar") >> optional (try $ string "{}") >> return (Str "\\") - -lt :: GenParser Char st Inline -lt = try (string "\\textless") >> optional (try $ string "{}") >> return (Str "<") - -gt :: GenParser Char st Inline -gt = try (string "\\textgreater") >> optional (try $ string "{}") >> return (Str ">") - -doubleQuote :: GenParser Char st Inline -doubleQuote = char '"' >> return (Str "\"") - -code :: GenParser Char ParserState Inline -code = code1 <|> code2 <|> lhsInlineCode - -code1 :: GenParser Char st Inline -code1 = try $ do - string "\\verb" - marker <- anyChar - result <- manyTill anyChar (char marker) - return $ Code $ removeLeadingTrailingSpace result - -code2 :: GenParser Char st Inline -code2 = try $ do - string "\\texttt{" - result <- manyTill (noneOf "\\\n~$%^&{}") (char '}') - return $ Code result - -lhsInlineCode :: GenParser Char ParserState Inline -lhsInlineCode = try $ do - failUnlessLHS - char '|' - result <- manyTill (noneOf "|\n") (char '|') - return $ Code result - -emph :: GenParser Char ParserState Inline -emph = try $ oneOfStrings [ "\\emph{", "\\textit{" ] >> - manyTill inline (char '}') >>= return . Emph - -strikeout :: GenParser Char ParserState Inline -strikeout = try $ string "\\sout{" >> manyTill inline (char '}') >>= - return . Strikeout - -superscript :: GenParser Char ParserState Inline -superscript = try $ string "\\textsuperscript{" >> - manyTill inline (char '}') >>= return . Superscript - --- note: \textsubscript isn't a standard latex command, but we use --- a defined version in pandoc. -subscript :: GenParser Char ParserState Inline -subscript = try $ string "\\textsubscript{" >> manyTill inline (char '}') >>= - return . Subscript - -apostrophe :: GenParser Char ParserState Inline -apostrophe = char '\'' >> return Apostrophe - -quoted :: GenParser Char ParserState Inline -quoted = doubleQuoted <|> singleQuoted - -singleQuoted :: GenParser Char ParserState Inline -singleQuoted = enclosed singleQuoteStart singleQuoteEnd inline >>= - return . Quoted SingleQuote . normalizeSpaces - -doubleQuoted :: GenParser Char ParserState Inline -doubleQuoted = enclosed doubleQuoteStart doubleQuoteEnd inline >>= - return . Quoted DoubleQuote . normalizeSpaces - -singleQuoteStart :: GenParser Char st Char -singleQuoteStart = char '`' - -singleQuoteEnd :: GenParser Char st () -singleQuoteEnd = try $ char '\'' >> notFollowedBy alphaNum - -doubleQuoteStart :: CharParser st String -doubleQuoteStart = string "``" - -doubleQuoteEnd :: CharParser st String -doubleQuoteEnd = string "\"" <|> try (string "''") - -ellipses :: GenParser Char st Inline -ellipses = try $ string "\\ldots" >> optional (try $ string "{}") >> - return Ellipses - -enDash :: GenParser Char st Inline -enDash = try (string "--") >> return EnDash - -emDash :: GenParser Char st Inline -emDash = try (string "---") >> return EmDash - -hyphen :: GenParser Char st Inline -hyphen = char '-' >> return (Str "-") - -lab :: GenParser Char st Inline -lab = try $ do - string "\\label{" - result <- manyTill anyChar (char '}') - return $ Str $ "(" ++ result ++ ")" - -ref :: GenParser Char st Inline -ref = try (string "\\ref{") >> manyTill anyChar (char '}') >>= return . Str - -strong :: GenParser Char ParserState Inline -strong = try (string "\\textbf{") >> manyTill inline (char '}') >>= - return . Strong - -whitespace :: GenParser Char st Inline -whitespace = many1 (oneOf "~ \t") >> return Space - --- hard line break -linebreak :: GenParser Char st Inline -linebreak = try (string "\\\\") >> return LineBreak - -spacer :: GenParser Char st Inline -spacer = try (string "\\,") >> return (Str "") - -str :: GenParser Char st Inline -str = many1 (noneOf specialChars) >>= return . Str - --- endline internal to paragraph -endline :: GenParser Char st Inline -endline = try $ newline >> notFollowedBy blankline >> return Space - --- math -math :: GenParser Char st Inline -math = (math3 >>= return . Math DisplayMath) - <|> (math1 >>= return . Math InlineMath) - <|> (math2 >>= return . Math InlineMath) - <|> (math4 >>= return . Math DisplayMath) - <|> (math5 >>= return . Math DisplayMath) - <|> (math6 >>= return . Math DisplayMath) - <?> "math" - -math1 :: GenParser Char st String -math1 = try $ char '$' >> manyTill anyChar (char '$') - -math2 :: GenParser Char st String -math2 = try $ string "\\(" >> manyTill anyChar (try $ string "\\)") - -math3 :: GenParser Char st String -math3 = try $ char '$' >> math1 >>~ char '$' - -math4 :: GenParser Char st String -math4 = try $ do - name <- begin "equation" <|> begin "equation*" <|> begin "displaymath" <|> begin "displaymath*" - spaces - manyTill anyChar (end name) - -math5 :: GenParser Char st String -math5 = try $ (string "\\[") >> spaces >> manyTill anyChar (try $ string "\\]") - -math6 :: GenParser Char st String -math6 = try $ do - name <- begin "eqnarray" <|> begin "eqnarray*" - spaces - res <- manyTill anyChar (end name) - return $ filter (/= '&') res -- remove eqnarray alignment codes - --- --- links and images --- - -url :: GenParser Char ParserState Inline -url = try $ do - string "\\url" - url' <- charsInBalanced '{' '}' - return $ Link [Code url'] (url', "") - -link :: GenParser Char ParserState Inline -link = try $ do - string "\\href{" - url' <- manyTill anyChar (char '}') - char '{' - label' <- manyTill inline (char '}') - return $ Link (normalizeSpaces label') (url', "") - -image :: GenParser Char ParserState Inline -image = try $ do - ("includegraphics", _, args) <- command - let args' = filter isArg args -- filter out options - let src = if null args' then - ("", "") - else - (stripFirstAndLast (head args'), "") - return $ Image [Str "image"] src - -footnote :: GenParser Char ParserState Inline -footnote = try $ do - (name, _, (contents:[])) <- command - if ((name == "footnote") || (name == "thanks")) - then string "" - else fail "not a footnote or thanks command" - let contents' = stripFirstAndLast contents - -- parse the extracted block, which may contain various block elements: - rest <- getInput - setInput $ contents' - blocks <- parseBlocks - setInput rest - return $ Note blocks - --- | Parse any LaTeX command and return it in a raw TeX inline element. -rawLaTeXInline :: GenParser Char ParserState Inline -rawLaTeXInline = try $ do - notFollowedBy' $ oneOfStrings ["\\begin", "\\end", "\\item", "\\ignore"] - state <- getState - if stateParseRaw state - then do - (name, star, args) <- command - return $ TeX ("\\" ++ name ++ star ++ concat args) - else do -- skip unknown command, leaving arguments to be parsed - char '\\' - letter - many (letter <|> digit) - optional (try $ string "{}") - return $ Str "" diff --git a/Text/Pandoc/Readers/Markdown.hs b/Text/Pandoc/Readers/Markdown.hs deleted file mode 100644 index 896f5832e..000000000 --- a/Text/Pandoc/Readers/Markdown.hs +++ /dev/null @@ -1,1243 +0,0 @@ -{-# LANGUAGE CPP #-} -{- -Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.Markdown - Copyright : Copyright (C) 2006-8 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane <jgm@berkeley.edu> - Stability : alpha - Portability : portable - -Conversion of markdown-formatted plain text to 'Pandoc' document. --} -module Text.Pandoc.Readers.Markdown ( - readMarkdown - ) where - -import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy, findIndex, intercalate ) -import Data.Ord ( comparing ) -import Data.Char ( isAlphaNum, isAlpha, isLower, isDigit, isUpper ) -import Data.Maybe -import Text.Pandoc.Definition -import Text.Pandoc.Shared -import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' ) -import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag, - anyHtmlInlineTag, anyHtmlTag, - anyHtmlEndTag, htmlEndTag, extractTagType, - htmlBlockElement, unsanitaryURI ) -import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) -import Text.ParserCombinators.Parsec -import Control.Monad (when) - --- | Read markdown from an input string and return a Pandoc document. -readMarkdown :: ParserState -> String -> Pandoc -readMarkdown state s = (readWith parseMarkdown) state (s ++ "\n\n") - --- --- Constants and data structure definitions --- - -spaceChars :: [Char] -spaceChars = " \t" - -bulletListMarkers :: [Char] -bulletListMarkers = "*+-" - -hruleChars :: [Char] -hruleChars = "*-_" - -setextHChars :: [Char] -setextHChars = "=-" - --- treat these as potentially non-text when parsing inline: -specialChars :: [Char] -specialChars = "\\[]*_~`<>$!^-.&'\"\8216\8217\8220\8221" - --- --- auxiliary functions --- - -indentSpaces :: GenParser Char ParserState [Char] -indentSpaces = try $ do - state <- getState - let tabStop = stateTabStop state - try (count tabStop (char ' ')) <|> - (many (char ' ') >> string "\t") <?> "indentation" - -nonindentSpaces :: GenParser Char ParserState [Char] -nonindentSpaces = do - state <- getState - let tabStop = stateTabStop state - sps <- many (char ' ') - if length sps < tabStop - then return sps - else unexpected "indented line" - --- | Fail unless we're at beginning of a line. -failUnlessBeginningOfLine :: GenParser tok st () -failUnlessBeginningOfLine = do - pos <- getPosition - if sourceColumn pos == 1 then return () else fail "not beginning of line" - --- | Fail unless we're in "smart typography" mode. -failUnlessSmart :: GenParser tok ParserState () -failUnlessSmart = do - state <- getState - if stateSmart state then return () else fail "Smart typography feature" - --- | Parse a sequence of inline elements between square brackets, --- including inlines between balanced pairs of square brackets. -inlinesInBalancedBrackets :: GenParser Char ParserState Inline - -> GenParser Char ParserState [Inline] -inlinesInBalancedBrackets parser = try $ do - char '[' - result <- manyTill ( (do lookAhead $ try $ do (Str res) <- parser - if res == "[" - then return () - else pzero - bal <- inlinesInBalancedBrackets parser - return $ [Str "["] ++ bal ++ [Str "]"]) - <|> (count 1 parser)) - (char ']') - return $ concat result - --- --- document structure --- - -titleLine :: GenParser Char ParserState [Inline] -titleLine = try $ char '%' >> skipSpaces >> manyTill inline newline - -authorsLine :: GenParser Char st [String] -authorsLine = try $ do - char '%' - skipSpaces - authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;") - newline - return $ map (decodeCharacterReferences . removeLeadingTrailingSpace) authors - -dateLine :: GenParser Char st String -dateLine = try $ do - char '%' - skipSpaces - date <- many (noneOf "\n") - newline - return $ decodeCharacterReferences $ removeTrailingSpace date - -titleBlock :: GenParser Char ParserState ([Inline], [String], [Char]) -titleBlock = try $ do - failIfStrict - title <- option [] titleLine - author <- option [] authorsLine - date <- option "" dateLine - optional blanklines - return (title, author, date) - -parseMarkdown :: GenParser Char ParserState Pandoc -parseMarkdown = do - -- markdown allows raw HTML - updateState (\state -> state { stateParseRaw = True }) - startPos <- getPosition - -- go through once just to get list of reference keys - -- docMinusKeys is the raw document with blanks where the keys were... - docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= - return . concat - setInput docMinusKeys - setPosition startPos - st <- getState - -- go through again for notes unless strict... - if stateStrict st - then return () - else do docMinusNotes <- manyTill (noteBlock <|> lineClump) eof >>= - return . concat - st' <- getState - let reversedNotes = stateNotes st' - updateState $ \s -> s { stateNotes = reverse reversedNotes } - setInput docMinusNotes - setPosition startPos - -- now parse it for real... - (title, author, date) <- option ([],[],"") titleBlock - blocks <- parseBlocks - return $ Pandoc (Meta title author date) $ filter (/= Null) blocks - --- --- initial pass for references and notes --- - -referenceKey :: GenParser Char ParserState [Char] -referenceKey = try $ do - startPos <- getPosition - nonindentSpaces - lab <- reference - char ':' - skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[') - let sourceURL excludes = many $ - optional (char '\\') >> (noneOf (' ':excludes) <|> (notFollowedBy' referenceTitle >> char ' ')) - src <- try (char '<' >> sourceURL ">\t\n" >>~ char '>') <|> sourceURL "\t\n" - tit <- option "" referenceTitle - blanklines - endPos <- getPosition - let newkey = (lab, (intercalate "%20" $ words $ removeTrailingSpace src, tit)) - st <- getState - let oldkeys = stateKeys st - updateState $ \s -> s { stateKeys = newkey : oldkeys } - -- return blanks so line count isn't affected - return $ replicate (sourceLine endPos - sourceLine startPos) '\n' - -referenceTitle :: GenParser Char st String -referenceTitle = try $ do - skipSpaces >> optional newline >> skipSpaces - tit <- (charsInBalanced '(' ')' >>= return . unwords . words) - <|> do delim <- char '\'' <|> char '"' - manyTill anyChar (try (char delim >> skipSpaces >> - notFollowedBy (noneOf ")\n"))) - return $ decodeCharacterReferences tit - -noteMarker :: GenParser Char st [Char] -noteMarker = string "[^" >> manyTill (noneOf " \t\n") (char ']') - -rawLine :: GenParser Char ParserState [Char] -rawLine = do - notFollowedBy blankline - notFollowedBy' noteMarker - contents <- many1 nonEndline - end <- option "" (newline >> optional indentSpaces >> return "\n") - return $ contents ++ end - -rawLines :: GenParser Char ParserState [Char] -rawLines = many1 rawLine >>= return . concat - -noteBlock :: GenParser Char ParserState [Char] -noteBlock = try $ do - startPos <- getPosition - ref <- noteMarker - char ':' - optional blankline - optional indentSpaces - raw <- sepBy rawLines (try (blankline >> indentSpaces)) - optional blanklines - endPos <- getPosition - -- parse the extracted text, which may contain various block elements: - contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n" - let newnote = (ref, contents) - st <- getState - let oldnotes = stateNotes st - updateState $ \s -> s { stateNotes = newnote : oldnotes } - -- return blanks so line count isn't affected - return $ replicate (sourceLine endPos - sourceLine startPos) '\n' - --- --- parsing blocks --- - -parseBlocks :: GenParser Char ParserState [Block] -parseBlocks = manyTill block eof - -block :: GenParser Char ParserState Block -block = do - st <- getState - choice (if stateStrict st - then [ header - , codeBlockIndented - , blockQuote - , hrule - , bulletList - , orderedList - , htmlBlock - , para - , plain - , nullBlock ] - else [ codeBlockDelimited - , header - , table - , codeBlockIndented - , lhsCodeBlock - , blockQuote - , hrule - , bulletList - , orderedList - , definitionList - , para - , rawHtmlBlocks - , plain - , nullBlock ]) <?> "block" - --- --- header blocks --- - -header :: GenParser Char ParserState Block -header = setextHeader <|> atxHeader <?> "header" - -atxHeader :: GenParser Char ParserState Block -atxHeader = try $ do - level <- many1 (char '#') >>= return . length - notFollowedBy (char '.' <|> char ')') -- this would be a list - skipSpaces - text <- manyTill inline atxClosing >>= return . normalizeSpaces - return $ Header level text - -atxClosing :: GenParser Char st [Char] -atxClosing = try $ skipMany (char '#') >> blanklines - -setextHeader :: GenParser Char ParserState Block -setextHeader = try $ do - text <- many1Till inline newline - underlineChar <- oneOf setextHChars - many (char underlineChar) - blanklines - let level = (fromMaybe 0 $ findIndex (== underlineChar) setextHChars) + 1 - return $ Header level (normalizeSpaces text) - --- --- hrule block --- - -hrule :: GenParser Char st Block -hrule = try $ do - skipSpaces - start <- oneOf hruleChars - count 2 (skipSpaces >> char start) - skipMany (oneOf spaceChars <|> char start) - newline - optional blanklines - return HorizontalRule - --- --- code blocks --- - -indentedLine :: GenParser Char ParserState [Char] -indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n") - -codeBlockDelimiter :: Maybe Int - -> GenParser Char st (Int, ([Char], [[Char]], [([Char], [Char])])) -codeBlockDelimiter len = try $ do - size <- case len of - Just l -> count l (char '~') >> many (char '~') >> return l - Nothing -> count 3 (char '~') >> many (char '~') >>= - return . (+ 3) . length - many spaceChar - attr <- option ([],[],[]) attributes - blankline - return (size, attr) - -attributes :: GenParser Char st ([Char], [[Char]], [([Char], [Char])]) -attributes = try $ do - char '{' - many spaceChar - attrs <- many (attribute >>~ many spaceChar) - char '}' - let (ids, classes, keyvals) = unzip3 attrs - let id' = if null ids then "" else head ids - return (id', concat classes, concat keyvals) - -attribute :: GenParser Char st ([Char], [[Char]], [([Char], [Char])]) -attribute = identifierAttr <|> classAttr <|> keyValAttr - -identifier :: GenParser Char st [Char] -identifier = do - first <- letter - rest <- many alphaNum - return (first:rest) - -identifierAttr :: GenParser Char st ([Char], [a], [a1]) -identifierAttr = try $ do - char '#' - result <- identifier - return (result,[],[]) - -classAttr :: GenParser Char st ([Char], [[Char]], [a]) -classAttr = try $ do - char '.' - result <- identifier - return ("",[result],[]) - -keyValAttr :: GenParser Char st ([Char], [a], [([Char], [Char])]) -keyValAttr = try $ do - key <- identifier - char '=' - char '"' - val <- manyTill (noneOf "\n") (char '"') - return ("",[],[(key,val)]) - -codeBlockDelimited :: GenParser Char st Block -codeBlockDelimited = try $ do - (size, attr) <- codeBlockDelimiter Nothing - contents <- manyTill anyLine (codeBlockDelimiter (Just size)) - blanklines - return $ CodeBlock attr $ intercalate "\n" contents - -codeBlockIndented :: GenParser Char ParserState Block -codeBlockIndented = do - contents <- many1 (indentedLine <|> - try (do b <- blanklines - l <- indentedLine - return $ b ++ l)) - optional blanklines - return $ CodeBlock ("",[],[]) $ stripTrailingNewlines $ concat contents - -lhsCodeBlock :: GenParser Char ParserState Block -lhsCodeBlock = do - failUnlessLHS - contents <- lhsCodeBlockBird <|> lhsCodeBlockLaTeX - return $ CodeBlock ("",["sourceCode","haskell"],[]) contents - -lhsCodeBlockLaTeX :: GenParser Char ParserState String -lhsCodeBlockLaTeX = try $ do - string "\\begin{code}" - manyTill spaceChar newline - contents <- many1Till anyChar (try $ string "\\end{code}") - blanklines - return $ stripTrailingNewlines contents - -lhsCodeBlockBird :: GenParser Char ParserState String -lhsCodeBlockBird = try $ do - pos <- getPosition - when (sourceColumn pos /= 1) $ fail "Not in first column" - lns <- many1 birdTrackLine - -- if (as is normal) there is always a space after >, drop it - let lns' = if all (\ln -> null ln || take 1 ln == " ") lns - then map (drop 1) lns - else lns - blanklines - return $ intercalate "\n" lns' - -birdTrackLine :: GenParser Char st [Char] -birdTrackLine = do - char '>' - manyTill anyChar newline - - --- --- block quotes --- - -emailBlockQuoteStart :: GenParser Char ParserState Char -emailBlockQuoteStart = try $ nonindentSpaces >> char '>' >>~ optional (char ' ') - -emailBlockQuote :: GenParser Char ParserState [[Char]] -emailBlockQuote = try $ do - emailBlockQuoteStart - raw <- sepBy (many (nonEndline <|> - (try (endline >> notFollowedBy emailBlockQuoteStart >> - return '\n')))) - (try (newline >> emailBlockQuoteStart)) - newline <|> (eof >> return '\n') - optional blanklines - return raw - -blockQuote :: GenParser Char ParserState Block -blockQuote = do - raw <- emailBlockQuote - -- parse the extracted block, which may contain various block elements: - contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n" - return $ BlockQuote contents - --- --- list blocks --- - -bulletListStart :: GenParser Char ParserState () -bulletListStart = try $ do - optional newline -- if preceded by a Plain block in a list context - nonindentSpaces - notFollowedBy' hrule -- because hrules start out just like lists - oneOf bulletListMarkers - spaceChar - skipSpaces - -anyOrderedListStart :: GenParser Char ParserState (Int, ListNumberStyle, ListNumberDelim) -anyOrderedListStart = try $ do - optional newline -- if preceded by a Plain block in a list context - nonindentSpaces - notFollowedBy $ string "p." >> spaceChar >> digit -- page number - state <- getState - if stateStrict state - then do many1 digit - char '.' - spaceChar - return (1, DefaultStyle, DefaultDelim) - else do (num, style, delim) <- anyOrderedListMarker - -- if it could be an abbreviated first name, insist on more than one space - if delim == Period && (style == UpperAlpha || (style == UpperRoman && - num `elem` [1, 5, 10, 50, 100, 500, 1000])) - then char '\t' <|> (char ' ' >>~ notFollowedBy (satisfy isUpper)) - else spaceChar - skipSpaces - return (num, style, delim) - -listStart :: GenParser Char ParserState () -listStart = bulletListStart <|> (anyOrderedListStart >> return ()) - --- parse a line of a list item (start = parser for beginning of list item) -listLine :: GenParser Char ParserState [Char] -listLine = try $ do - notFollowedBy' listStart - notFollowedBy blankline - notFollowedBy' (do indentSpaces - many (spaceChar) - listStart) - line <- manyTill anyChar newline - return $ line ++ "\n" - --- parse raw text for one list item, excluding start marker and continuations -rawListItem :: GenParser Char ParserState [Char] -rawListItem = try $ do - listStart - result <- many1 listLine - blanks <- many blankline - return $ concat result ++ blanks - --- continuation of a list item - indented and separated by blankline --- or (in compact lists) endline. --- note: nested lists are parsed as continuations -listContinuation :: GenParser Char ParserState [Char] -listContinuation = try $ do - lookAhead indentSpaces - result <- many1 listContinuationLine - blanks <- many blankline - return $ concat result ++ blanks - -listContinuationLine :: GenParser Char ParserState [Char] -listContinuationLine = try $ do - notFollowedBy blankline - notFollowedBy' listStart - optional indentSpaces - result <- manyTill anyChar newline - return $ result ++ "\n" - -listItem :: GenParser Char ParserState [Block] -listItem = try $ do - first <- rawListItem - continuations <- many listContinuation - -- parsing with ListItemState forces markers at beginning of lines to - -- count as list item markers, even if not separated by blank space. - -- see definition of "endline" - state <- getState - let oldContext = stateParserContext state - setState $ state {stateParserContext = ListItemState} - -- parse the extracted block, which may contain various block elements: - let raw = concat (first:continuations) - contents <- parseFromString parseBlocks raw - updateState (\st -> st {stateParserContext = oldContext}) - return contents - -orderedList :: GenParser Char ParserState Block -orderedList = try $ do - (start, style, delim) <- lookAhead anyOrderedListStart - items <- many1 listItem - return $ OrderedList (start, style, delim) $ compactify items - -bulletList :: GenParser Char ParserState Block -bulletList = try $ do - lookAhead bulletListStart - many1 listItem >>= return . BulletList . compactify - --- definition lists - -definitionListItem :: GenParser Char ParserState ([Inline], [Block]) -definitionListItem = try $ do - notFollowedBy blankline - notFollowedBy' indentSpaces - -- first, see if this has any chance of being a definition list: - lookAhead (anyLine >> char ':') - term <- manyTill inline newline - raw <- many1 defRawBlock - state <- getState - let oldContext = stateParserContext state - -- parse the extracted block, which may contain various block elements: - contents <- parseFromString parseBlocks $ concat raw - updateState (\st -> st {stateParserContext = oldContext}) - return ((normalizeSpaces term), contents) - -defRawBlock :: GenParser Char ParserState [Char] -defRawBlock = try $ do - char ':' - state <- getState - let tabStop = stateTabStop state - try (count (tabStop - 1) (char ' ')) <|> (many (char ' ') >> string "\t") - firstline <- anyLine - rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine) - trailing <- option "" blanklines - return $ firstline ++ "\n" ++ unlines rawlines ++ trailing - -definitionList :: GenParser Char ParserState Block -definitionList = do - items <- many1 definitionListItem - let (terms, defs) = unzip items - let defs' = compactify defs - let items' = zip terms defs' - return $ DefinitionList items' - --- --- paragraph block --- - -isHtmlOrBlank :: Inline -> Bool -isHtmlOrBlank (HtmlInline _) = True -isHtmlOrBlank (Space) = True -isHtmlOrBlank (LineBreak) = True -isHtmlOrBlank _ = False - -para :: GenParser Char ParserState Block -para = try $ do - result <- many1 inline - if all isHtmlOrBlank result - then fail "treat as raw HTML" - else return () - newline - blanklines <|> do st <- getState - if stateStrict st - then lookAhead (blockQuote <|> header) >> return "" - else pzero - return $ Para $ normalizeSpaces result - -plain :: GenParser Char ParserState Block -plain = many1 inline >>= return . Plain . normalizeSpaces - --- --- raw html --- - -htmlElement :: GenParser Char ParserState [Char] -htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element" - -htmlBlock :: GenParser Char ParserState Block -htmlBlock = try $ do - failUnlessBeginningOfLine - first <- htmlElement - finalSpace <- many (oneOf spaceChars) - finalNewlines <- many newline - return $ RawHtml $ first ++ finalSpace ++ finalNewlines - --- True if tag is self-closing -isSelfClosing :: [Char] -> Bool -isSelfClosing tag = - isSuffixOf "/>" $ filter (not . (`elem` " \n\t")) tag - -strictHtmlBlock :: GenParser Char ParserState [Char] -strictHtmlBlock = try $ do - tag <- anyHtmlBlockTag - let tag' = extractTagType tag - if isSelfClosing tag || tag' == "hr" - then return tag - else do contents <- many (notFollowedBy' (htmlEndTag tag') >> - (htmlElement <|> (count 1 anyChar))) - end <- htmlEndTag tag' - return $ tag ++ concat contents ++ end - -rawHtmlBlocks :: GenParser Char ParserState Block -rawHtmlBlocks = do - htmlBlocks <- many1 $ do (RawHtml blk) <- rawHtmlBlock - sps <- do sp1 <- many spaceChar - sp2 <- option "" (blankline >> return "\n") - sp3 <- many spaceChar - sp4 <- option "" blanklines - return $ sp1 ++ sp2 ++ sp3 ++ sp4 - -- note: we want raw html to be able to - -- precede a code block, when separated - -- by a blank line - return $ blk ++ sps - let combined = concat htmlBlocks - let combined' = if last combined == '\n' then init combined else combined - return $ RawHtml combined' - --- --- Tables --- - --- Parse a dashed line with optional trailing spaces; return its length --- and the length including trailing space. -dashedLine :: Char - -> GenParser Char st (Int, Int) -dashedLine ch = do - dashes <- many1 (char ch) - sp <- many spaceChar - return $ (length dashes, length $ dashes ++ sp) - --- Parse a table header with dashed lines of '-' preceded by --- one line of text. -simpleTableHeader :: GenParser Char ParserState ([[Char]], [Alignment], [Int]) -simpleTableHeader = try $ do - rawContent <- anyLine - initSp <- nonindentSpaces - dashes <- many1 (dashedLine '-') - newline - let (lengths, lines') = unzip dashes - let indices = scanl (+) (length initSp) lines' - let rawHeads = tail $ splitByIndices (init indices) rawContent - let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths - return (rawHeads, aligns, indices) - --- Parse a table footer - dashed lines followed by blank line. -tableFooter :: GenParser Char ParserState [Char] -tableFooter = try $ nonindentSpaces >> many1 (dashedLine '-') >> blanklines - --- Parse a table separator - dashed line. -tableSep :: GenParser Char ParserState String -tableSep = try $ nonindentSpaces >> many1 (dashedLine '-') >> string "\n" - --- Parse a raw line and split it into chunks by indices. -rawTableLine :: [Int] - -> GenParser Char ParserState [String] -rawTableLine indices = do - notFollowedBy' (blanklines <|> tableFooter) - line <- many1Till anyChar newline - return $ map removeLeadingTrailingSpace $ tail $ - splitByIndices (init indices) line - --- Parse a table line and return a list of lists of blocks (columns). -tableLine :: [Int] - -> GenParser Char ParserState [[Block]] -tableLine indices = rawTableLine indices >>= mapM (parseFromString (many plain)) - --- Parse a multiline table row and return a list of blocks (columns). -multilineRow :: [Int] - -> GenParser Char ParserState [[Block]] -multilineRow indices = do - colLines <- many1 (rawTableLine indices) - optional blanklines - let cols = map unlines $ transpose colLines - mapM (parseFromString (many plain)) cols - --- Calculate relative widths of table columns, based on indices -widthsFromIndices :: Int -- Number of columns on terminal - -> [Int] -- Indices - -> [Double] -- Fractional relative sizes of columns -widthsFromIndices _ [] = [] -widthsFromIndices numColumns indices = - let lengths = zipWith (-) indices (0:indices) - totLength = sum lengths - quotient = if totLength > numColumns - then fromIntegral totLength - else fromIntegral numColumns - fracs = map (\l -> (fromIntegral l) / quotient) lengths in - tail fracs - --- Parses a table caption: inlines beginning with 'Table:' --- and followed by blank lines. -tableCaption :: GenParser Char ParserState [Inline] -tableCaption = try $ do - nonindentSpaces - string "Table:" - result <- many1 inline - blanklines - return $ normalizeSpaces result - --- Parse a table using 'headerParser', 'lineParser', and 'footerParser'. -tableWith :: GenParser Char ParserState ([[Char]], [Alignment], [Int]) - -> ([Int] -> GenParser Char ParserState [[Block]]) - -> GenParser Char ParserState end - -> GenParser Char ParserState Block -tableWith headerParser lineParser footerParser = try $ do - (rawHeads, aligns, indices) <- headerParser - lines' <- many1Till (lineParser indices) footerParser - caption <- option [] tableCaption - heads <- mapM (parseFromString (many plain)) rawHeads - state <- getState - let numColumns = stateColumns state - let widths = widthsFromIndices numColumns indices - return $ Table caption aligns widths heads lines' - --- Parse a simple table with '---' header and one line per row. -simpleTable :: GenParser Char ParserState Block -simpleTable = tableWith simpleTableHeader tableLine blanklines - --- Parse a multiline table: starts with row of '-' on top, then header --- (which may be multiline), then the rows, --- which may be multiline, separated by blank lines, and --- ending with a footer (dashed line followed by blank line). -multilineTable :: GenParser Char ParserState Block -multilineTable = tableWith multilineTableHeader multilineRow tableFooter - -multilineTableHeader :: GenParser Char ParserState ([String], [Alignment], [Int]) -multilineTableHeader = try $ do - tableSep - rawContent <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline) - initSp <- nonindentSpaces - dashes <- many1 (dashedLine '-') - newline - let (lengths, lines') = unzip dashes - let indices = scanl (+) (length initSp) lines' - let rawHeadsList = transpose $ map - (\ln -> tail $ splitByIndices (init indices) ln) - rawContent - let rawHeads = map (intercalate " ") rawHeadsList - let aligns = zipWith alignType rawHeadsList lengths - return ((map removeLeadingTrailingSpace rawHeads), aligns, indices) - --- Returns an alignment type for a table, based on a list of strings --- (the rows of the column header) and a number (the length of the --- dashed line under the rows. -alignType :: [String] - -> Int - -> Alignment -alignType [] _ = AlignDefault -alignType strLst len = - let s = head $ sortBy (comparing length) $ - map removeTrailingSpace strLst - leftSpace = if null s then False else (s !! 0) `elem` " \t" - rightSpace = length s < len || (s !! (len - 1)) `elem` " \t" - in case (leftSpace, rightSpace) of - (True, False) -> AlignRight - (False, True) -> AlignLeft - (True, True) -> AlignCenter - (False, False) -> AlignDefault - -table :: GenParser Char ParserState Block -table = simpleTable <|> multilineTable <?> "table" - --- --- inline --- - -inline :: GenParser Char ParserState Inline -inline = choice inlineParsers <?> "inline" - -inlineParsers :: [GenParser Char ParserState Inline] -inlineParsers = [ abbrev - , str - , smartPunctuation - , whitespace - , endline - , code - , charRef - , strong - , emph - , note - , inlineNote - , link -#ifdef _CITEPROC - , inlineCitation -#endif - , image - , math - , strikeout - , superscript - , subscript - , autoLink - , rawHtmlInline' - , rawLaTeXInline' - , escapedChar - , symbol - , ltSign ] - -inlineNonLink :: GenParser Char ParserState Inline -inlineNonLink = (choice $ - map (\parser -> try (parser >>= failIfLink)) inlineParsers) - <?> "inline (non-link)" - -failIfLink :: Inline -> GenParser tok st Inline -failIfLink (Link _ _) = pzero -failIfLink elt = return elt - -escapedChar :: GenParser Char ParserState Inline -escapedChar = do - char '\\' - state <- getState - result <- option '\\' $ if stateStrict state - then oneOf "\\`*_{}[]()>#+-.!~" - else satisfy (not . isAlphaNum) - let result' = if result == ' ' - then '\160' -- '\ ' is a nonbreaking space - else result - return $ Str [result'] - -ltSign :: GenParser Char ParserState Inline -ltSign = do - st <- getState - if stateStrict st - then char '<' - else notFollowedBy' rawHtmlBlocks >> char '<' -- unless it starts html - return $ Str ['<'] - -specialCharsMinusLt :: [Char] -specialCharsMinusLt = filter (/= '<') specialChars - -symbol :: GenParser Char ParserState Inline -symbol = do - result <- oneOf specialCharsMinusLt - return $ Str [result] - --- parses inline code, between n `s and n `s -code :: GenParser Char ParserState Inline -code = try $ do - starts <- many1 (char '`') - skipSpaces - result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|> - (char '\n' >> return " ")) - (try (skipSpaces >> count (length starts) (char '`') >> - notFollowedBy (char '`'))) - return $ Code $ removeLeadingTrailingSpace $ concat result - -mathWord :: GenParser Char st [Char] -mathWord = many1 ((noneOf " \t\n\\$") <|> - (try (char '\\') >>~ notFollowedBy (char '$'))) - -math :: GenParser Char ParserState Inline -math = (mathDisplay >>= return . Math DisplayMath) - <|> (mathInline >>= return . Math InlineMath) - -mathDisplay :: GenParser Char ParserState String -mathDisplay = try $ do - failIfStrict - string "$$" - many1Till (noneOf "\n" <|> (newline >>~ notFollowedBy' blankline)) (try $ string "$$") - -mathInline :: GenParser Char ParserState String -mathInline = try $ do - failIfStrict - char '$' - notFollowedBy space - words' <- sepBy1 mathWord (many1 (spaceChar <|> (newline >>~ notFollowedBy' blankline))) - char '$' - notFollowedBy digit - return $ intercalate " " words' - -emph :: GenParser Char ParserState Inline -emph = ((enclosed (char '*') (notFollowedBy' strong >> char '*') inline) <|> - (enclosed (char '_') (notFollowedBy' strong >> char '_' >> - notFollowedBy alphaNum) inline)) >>= - return . Emph . normalizeSpaces - -strong :: GenParser Char ParserState Inline -strong = ((enclosed (string "**") (try $ string "**") inline) <|> - (enclosed (string "__") (try $ string "__") inline)) >>= - return . Strong . normalizeSpaces - -strikeout :: GenParser Char ParserState Inline -strikeout = failIfStrict >> enclosed (string "~~") (try $ string "~~") inline >>= - return . Strikeout . normalizeSpaces - -superscript :: GenParser Char ParserState Inline -superscript = failIfStrict >> enclosed (char '^') (char '^') - (notFollowedBy' whitespace >> inline) >>= -- may not contain Space - return . Superscript - -subscript :: GenParser Char ParserState Inline -subscript = failIfStrict >> enclosed (char '~') (char '~') - (notFollowedBy' whitespace >> inline) >>= -- may not contain Space - return . Subscript - -abbrev :: GenParser Char ParserState Inline -abbrev = failUnlessSmart >> - (assumedAbbrev <|> knownAbbrev) >>= return . Str . (++ ".\160") - --- an string of letters followed by a period that does not end a sentence --- is assumed to be an abbreviation. It is assumed that sentences don't --- start with lowercase letters or numerals. -assumedAbbrev :: GenParser Char ParserState [Char] -assumedAbbrev = try $ do - result <- many1 $ satisfy isAlpha - string ". " - lookAhead $ satisfy (\x -> isLower x || isDigit x) - return result - --- these strings are treated as abbreviations even if they are followed --- by a capital letter (such as a name). -knownAbbrev :: GenParser Char ParserState [Char] -knownAbbrev = try $ do - result <- oneOfStrings [ "Mr", "Mrs", "Ms", "Capt", "Dr", "Prof", "Gen", - "Gov", "e.g", "i.e", "Sgt", "St", "vol", "vs", - "Sen", "Rep", "Pres", "Hon", "Rev" ] - string ". " - return result - -smartPunctuation :: GenParser Char ParserState Inline -smartPunctuation = failUnlessSmart >> - choice [ quoted, apostrophe, dash, ellipses ] - -apostrophe :: GenParser Char ParserState Inline -apostrophe = (char '\'' <|> char '\8217') >> return Apostrophe - -quoted :: GenParser Char ParserState Inline -quoted = doubleQuoted <|> singleQuoted - -withQuoteContext :: QuoteContext - -> (GenParser Char ParserState Inline) - -> GenParser Char ParserState Inline -withQuoteContext context parser = do - oldState <- getState - let oldQuoteContext = stateQuoteContext oldState - setState oldState { stateQuoteContext = context } - result <- parser - newState <- getState - setState newState { stateQuoteContext = oldQuoteContext } - return result - -singleQuoted :: GenParser Char ParserState Inline -singleQuoted = try $ do - singleQuoteStart - withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>= - return . Quoted SingleQuote . normalizeSpaces - -doubleQuoted :: GenParser Char ParserState Inline -doubleQuoted = try $ do - doubleQuoteStart - withQuoteContext InDoubleQuote $ many1Till inline doubleQuoteEnd >>= - return . Quoted DoubleQuote . normalizeSpaces - -failIfInQuoteContext :: QuoteContext -> GenParser tok ParserState () -failIfInQuoteContext context = do - st <- getState - if stateQuoteContext st == context - then fail "already inside quotes" - else return () - -singleQuoteStart :: GenParser Char ParserState Char -singleQuoteStart = do - failIfInQuoteContext InSingleQuote - char '\8216' <|> - (try $ do char '\'' - notFollowedBy (oneOf ")!],.;:-? \t\n") - notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >> - satisfy (not . isAlphaNum))) - -- possess/contraction - return '\'') - -singleQuoteEnd :: GenParser Char st Char -singleQuoteEnd = try $ do - char '\8217' <|> char '\'' - notFollowedBy alphaNum - return '\'' - -doubleQuoteStart :: GenParser Char ParserState Char -doubleQuoteStart = do - failIfInQuoteContext InDoubleQuote - char '\8220' <|> - (try $ do char '"' - notFollowedBy (oneOf " \t\n") - return '"') - -doubleQuoteEnd :: GenParser Char st Char -doubleQuoteEnd = char '\8221' <|> char '"' - -ellipses :: GenParser Char st Inline -ellipses = oneOfStrings ["...", " . . . ", ". . .", " . . ."] >> return Ellipses - -dash :: GenParser Char st Inline -dash = enDash <|> emDash - -enDash :: GenParser Char st Inline -enDash = try $ char '-' >> notFollowedBy (noneOf "0123456789") >> return EnDash - -emDash :: GenParser Char st Inline -emDash = oneOfStrings ["---", "--"] >> return EmDash - -whitespace :: GenParser Char ParserState Inline -whitespace = do - sps <- many1 (oneOf spaceChars) - if length sps >= 2 - then option Space (endline >> return LineBreak) - else return Space <?> "whitespace" - -nonEndline :: GenParser Char st Char -nonEndline = satisfy (/='\n') - -strChar :: GenParser Char st Char -strChar = noneOf (specialChars ++ spaceChars ++ "\n") - -str :: GenParser Char st Inline -str = many1 strChar >>= return . Str - --- an endline character that can be treated as a space, not a structural break -endline :: GenParser Char ParserState Inline -endline = try $ do - newline - notFollowedBy blankline - st <- getState - if stateStrict st - then do notFollowedBy emailBlockQuoteStart - notFollowedBy (char '#') -- atx header - else return () - -- parse potential list-starts differently if in a list: - if stateParserContext st == ListItemState - then notFollowedBy' (bulletListStart <|> - (anyOrderedListStart >> return ())) - else return () - return Space - --- --- links --- - --- a reference label for a link -reference :: GenParser Char ParserState [Inline] -reference = do notFollowedBy' (string "[^") -- footnote reference - result <- inlinesInBalancedBrackets inlineNonLink - return $ normalizeSpaces result - --- source for a link, with optional title -source :: GenParser Char st (String, [Char]) -source = - (try $ charsInBalanced '(' ')' >>= parseFromString source') <|> - -- the following is needed for cases like: [ref](/url(a). - (enclosed (char '(') (char ')') anyChar >>= - parseFromString source') - --- auxiliary function for source -source' :: GenParser Char st (String, [Char]) -source' = do - skipSpaces - let sourceURL excludes = many $ - optional (char '\\') >> (noneOf (' ':excludes) <|> (notFollowedBy' linkTitle >> char ' ')) - src <- try (char '<' >> sourceURL ">\t\n" >>~ char '>') <|> sourceURL "\t\n" - tit <- option "" linkTitle - skipSpaces - eof - return (intercalate "%20" $ words $ removeTrailingSpace src, tit) - -linkTitle :: GenParser Char st String -linkTitle = try $ do - (many1 spaceChar >> option '\n' newline) <|> newline - skipSpaces - delim <- oneOf "'\"" - tit <- manyTill (optional (char '\\') >> anyChar) - (try (char delim >> skipSpaces >> eof)) - return $ decodeCharacterReferences tit - -link :: GenParser Char ParserState Inline -link = try $ do - lab <- reference - src <- source <|> referenceLink lab - sanitize <- getState >>= return . stateSanitizeHTML - if sanitize && unsanitaryURI (fst src) - then fail "Unsanitary URI" - else return $ Link lab src - --- a link like [this][ref] or [this][] or [this] -referenceLink :: [Inline] - -> GenParser Char ParserState (String, [Char]) -referenceLink lab = do - ref <- option [] (try (optional (char ' ') >> - optional (newline >> skipSpaces) >> reference)) - let ref' = if null ref then lab else ref - state <- getState - case lookupKeySrc (stateKeys state) ref' of - Nothing -> fail "no corresponding key" - Just target -> return target - -autoLink :: GenParser Char ParserState Inline -autoLink = try $ do - char '<' - src <- uri <|> (emailAddress >>= (return . ("mailto:" ++))) - char '>' - let src' = if "mailto:" `isPrefixOf` src - then drop 7 src - else src - st <- getState - let sanitize = stateSanitizeHTML st - if sanitize && unsanitaryURI src - then fail "Unsanitary URI" - else return $ if stateStrict st - then Link [Str src'] (src, "") - else Link [Code src'] (src, "") - -image :: GenParser Char ParserState Inline -image = try $ do - char '!' - (Link lab src) <- link - return $ Image lab src - -note :: GenParser Char ParserState Inline -note = try $ do - failIfStrict - ref <- noteMarker - state <- getState - let notes = stateNotes state - case lookup ref notes of - Nothing -> fail "note not found" - Just contents -> return $ Note contents - -inlineNote :: GenParser Char ParserState Inline -inlineNote = try $ do - failIfStrict - char '^' - contents <- inlinesInBalancedBrackets inline - return $ Note [Para contents] - -rawLaTeXInline' :: GenParser Char ParserState Inline -rawLaTeXInline' = do - failIfStrict - (rawConTeXtEnvironment' >>= return . TeX) - <|> (rawLaTeXEnvironment' >>= return . TeX) - <|> rawLaTeXInline - -rawConTeXtEnvironment' :: GenParser Char st String -rawConTeXtEnvironment' = try $ do - string "\\start" - completion <- inBrackets (letter <|> digit <|> spaceChar) - <|> (many1 letter) - contents <- manyTill (rawConTeXtEnvironment' <|> (count 1 anyChar)) - (try $ string "\\stop" >> string completion) - return $ "\\start" ++ completion ++ concat contents ++ "\\stop" ++ completion - -inBrackets :: (GenParser Char st Char) -> GenParser Char st String -inBrackets parser = do - char '[' - contents <- many parser - char ']' - return $ "[" ++ contents ++ "]" - -rawHtmlInline' :: GenParser Char ParserState Inline -rawHtmlInline' = do - st <- getState - result <- if stateStrict st - then choice [htmlBlockElement, anyHtmlTag, anyHtmlEndTag] - else anyHtmlInlineTag - return $ HtmlInline result - -#ifdef _CITEPROC -inlineCitation :: GenParser Char ParserState Inline -inlineCitation = try $ do - failIfStrict - cit <- citeMarker - let citations = readWith parseCitation defaultParserState cit - mr <- mapM chkCit citations - if catMaybes mr /= [] - then return $ Cite citations [] - else fail "no citation found" - -chkCit :: Target -> GenParser Char ParserState (Maybe Target) -chkCit t = do - st <- getState - case lookupKeySrc (stateKeys st) [Str $ fst t] of - Just _ -> fail "This is a link" - Nothing -> if elem (fst t) $ stateCitations st - then return $ Just t - else return $ Nothing - -citeMarker :: GenParser Char ParserState String -citeMarker = char '[' >> manyTill ( noneOf "\n" <|> (newline >>~ notFollowedBy blankline) ) (char ']') - -parseCitation :: GenParser Char ParserState [(String,String)] -parseCitation = try $ sepBy (parseLabel) (oneOf ";") - -parseLabel :: GenParser Char ParserState (String,String) -parseLabel = try $ do - res <- sepBy (skipSpaces >> optional newline >> skipSpaces >> many1 (noneOf "@;")) (oneOf "@") - case res of - [lab,loc] -> return (lab, loc) - [lab] -> return (lab, "" ) - _ -> return ("" , "" ) - -#endif diff --git a/Text/Pandoc/Readers/RST.hs b/Text/Pandoc/Readers/RST.hs deleted file mode 100644 index 255054c10..000000000 --- a/Text/Pandoc/Readers/RST.hs +++ /dev/null @@ -1,707 +0,0 @@ -{- -Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.RST - Copyright : Copyright (C) 2006-8 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane <jgm@berkeley.edu> - Stability : alpha - Portability : portable - -Conversion from reStructuredText to 'Pandoc' document. --} -module Text.Pandoc.Readers.RST ( - readRST - ) where -import Text.Pandoc.Definition -import Text.Pandoc.Shared -import Text.ParserCombinators.Parsec -import Control.Monad ( when ) -import Data.List ( findIndex, delete, intercalate ) - --- | Parse reStructuredText string and return Pandoc document. -readRST :: ParserState -> String -> Pandoc -readRST state s = (readWith parseRST) state (s ++ "\n\n") - --- --- Constants and data structure definitions ---- - -bulletListMarkers :: [Char] -bulletListMarkers = "*+-" - -underlineChars :: [Char] -underlineChars = "!\"#$&'()*+,-./:;<=>?@[\\]^_`{|}~" - --- treat these as potentially non-text when parsing inline: -specialChars :: [Char] -specialChars = "\\`|*_<>$:[-" - --- --- parsing documents --- - -isHeader :: Int -> Block -> Bool -isHeader n (Header x _) = x == n -isHeader _ _ = False - --- | Promote all headers in a list of blocks. (Part of --- title transformation for RST.) -promoteHeaders :: Int -> [Block] -> [Block] -promoteHeaders num ((Header level text):rest) = - (Header (level - num) text):(promoteHeaders num rest) -promoteHeaders num (other:rest) = other:(promoteHeaders num rest) -promoteHeaders _ [] = [] - --- | If list of blocks starts with a header (or a header and subheader) --- of level that are not found elsewhere, return it as a title and --- promote all the other headers. -titleTransform :: [Block] -- ^ list of blocks - -> ([Block], [Inline]) -- ^ modified list of blocks, title -titleTransform ((Header 1 head1):(Header 2 head2):rest) = -- title subtitle - if (any (isHeader 1) rest) || (any (isHeader 2) rest) - then ((Header 1 head1):(Header 2 head2):rest, []) - else ((promoteHeaders 2 rest), head1 ++ [Str ":", Space] ++ head2) -titleTransform ((Header 1 head1):rest) = -- title, no subtitle - if (any (isHeader 1) rest) - then ((Header 1 head1):rest, []) - else ((promoteHeaders 1 rest), head1) -titleTransform blocks = (blocks, []) - -parseRST :: GenParser Char ParserState Pandoc -parseRST = do - startPos <- getPosition - -- go through once just to get list of reference keys - -- docMinusKeys is the raw document with blanks where the keys were... - docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= return . concat - setInput docMinusKeys - setPosition startPos - st <- getState - let reversedKeys = stateKeys st - updateState $ \s -> s { stateKeys = reverse reversedKeys } - -- now parse it for real... - blocks <- parseBlocks - let blocks' = filter (/= Null) blocks - state <- getState - let (blocks'', title) = if stateStandalone state - then titleTransform blocks' - else (blocks', []) - let authors = stateAuthors state - let date = stateDate state - let title' = if (null title) then (stateTitle state) else title - return $ Pandoc (Meta title' authors date) blocks'' - --- --- parsing blocks --- - -parseBlocks :: GenParser Char ParserState [Block] -parseBlocks = manyTill block eof - -block :: GenParser Char ParserState Block -block = choice [ codeBlock - , rawHtmlBlock - , rawLaTeXBlock - , fieldList - , blockQuote - , imageBlock - , unknownDirective - , header - , hrule - , list - , lineBlock - , lhsCodeBlock - , para - , plain - , nullBlock ] <?> "block" - --- --- field list --- - -fieldListItem :: String -> GenParser Char st ([Char], [Char]) -fieldListItem indent = try $ do - string indent - char ':' - name <- many1 alphaNum - string ": " - skipSpaces - first <- manyTill anyChar newline - rest <- option "" $ try $ lookAhead (string indent >> oneOf " \t") >> - indentedBlock - return (name, intercalate " " (first:(lines rest))) - -fieldList :: GenParser Char ParserState Block -fieldList = try $ do - indent <- lookAhead $ many (oneOf " \t") - items <- many1 $ fieldListItem indent - blanklines - let authors = case lookup "Authors" items of - Just auth -> [auth] - Nothing -> map snd (filter (\(x,_) -> x == "Author") items) - if null authors - then return () - else updateState $ \st -> st {stateAuthors = authors} - case (lookup "Date" items) of - Just dat -> updateState $ \st -> st {stateDate = dat} - Nothing -> return () - case (lookup "Title" items) of - Just tit -> parseFromString (many inline) tit >>= - \t -> updateState $ \st -> st {stateTitle = t} - Nothing -> return () - let remaining = filter (\(x,_) -> (x /= "Authors") && (x /= "Author") && - (x /= "Date") && (x /= "Title")) items - if null remaining - then return Null - else do terms <- mapM (return . (:[]) . Str . fst) remaining - defs <- mapM (parseFromString (many block) . snd) - remaining - return $ DefinitionList $ zip terms defs - --- --- line block --- - -lineBlockLine :: GenParser Char ParserState [Inline] -lineBlockLine = try $ do - string "| " - white <- many (oneOf " \t") - line <- manyTill inline newline - return $ (if null white then [] else [Str white]) ++ line ++ [LineBreak] - -lineBlock :: GenParser Char ParserState Block -lineBlock = try $ do - lines' <- many1 lineBlockLine - blanklines - return $ Para (concat lines') - --- --- paragraph block --- - -para :: GenParser Char ParserState Block -para = paraBeforeCodeBlock <|> paraNormal <?> "paragraph" - -codeBlockStart :: GenParser Char st Char -codeBlockStart = string "::" >> blankline >> blankline - --- paragraph that ends in a :: starting a code block -paraBeforeCodeBlock :: GenParser Char ParserState Block -paraBeforeCodeBlock = try $ do - result <- many1 (notFollowedBy' codeBlockStart >> inline) - lookAhead (string "::") - return $ Para $ if last result == Space - then normalizeSpaces result - else (normalizeSpaces result) ++ [Str ":"] - --- regular paragraph -paraNormal :: GenParser Char ParserState Block -paraNormal = try $ do - result <- many1 inline - newline - blanklines - return $ Para $ normalizeSpaces result - -plain :: GenParser Char ParserState Block -plain = many1 inline >>= return . Plain . normalizeSpaces - --- --- image block --- - -imageBlock :: GenParser Char st Block -imageBlock = try $ do - string ".. image:: " - src <- manyTill anyChar newline - fields <- option [] $ do indent <- lookAhead $ many (oneOf " /t") - many1 $ fieldListItem indent - optional blanklines - case lookup "alt" fields of - Just alt -> return $ Plain [Image [Str alt] (src, alt)] - Nothing -> return $ Plain [Image [Str "image"] (src, "")] --- --- header blocks --- - -header :: GenParser Char ParserState Block -header = doubleHeader <|> singleHeader <?> "header" - --- a header with lines on top and bottom -doubleHeader :: GenParser Char ParserState Block -doubleHeader = try $ do - c <- oneOf underlineChars - rest <- many (char c) -- the top line - let lenTop = length (c:rest) - skipSpaces - newline - txt <- many1 (notFollowedBy blankline >> inline) - pos <- getPosition - let len = (sourceColumn pos) - 1 - if (len > lenTop) then fail "title longer than border" else return () - blankline -- spaces and newline - count lenTop (char c) -- the bottom line - blanklines - -- check to see if we've had this kind of header before. - -- if so, get appropriate level. if not, add to list. - state <- getState - let headerTable = stateHeaderTable state - let (headerTable',level) = case findIndex (== DoubleHeader c) headerTable of - Just ind -> (headerTable, ind + 1) - Nothing -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1) - setState (state { stateHeaderTable = headerTable' }) - return $ Header level (normalizeSpaces txt) - --- a header with line on the bottom only -singleHeader :: GenParser Char ParserState Block -singleHeader = try $ do - notFollowedBy' whitespace - txt <- many1 (do {notFollowedBy blankline; inline}) - pos <- getPosition - let len = (sourceColumn pos) - 1 - blankline - c <- oneOf underlineChars - count (len - 1) (char c) - many (char c) - blanklines - state <- getState - let headerTable = stateHeaderTable state - let (headerTable',level) = case findIndex (== SingleHeader c) headerTable of - Just ind -> (headerTable, ind + 1) - Nothing -> (headerTable ++ [SingleHeader c], (length headerTable) + 1) - setState (state { stateHeaderTable = headerTable' }) - return $ Header level (normalizeSpaces txt) - --- --- hrule block --- - -hrule :: GenParser Char st Block -hrule = try $ do - chr <- oneOf underlineChars - count 3 (char chr) - skipMany (char chr) - blankline - blanklines - return HorizontalRule - --- --- code blocks --- - --- read a line indented by a given string -indentedLine :: String -> GenParser Char st [Char] -indentedLine indents = try $ do - string indents - result <- manyTill anyChar newline - return $ result ++ "\n" - --- two or more indented lines, possibly separated by blank lines. --- any amount of indentation will work. -indentedBlock :: GenParser Char st [Char] -indentedBlock = do - indents <- lookAhead $ many1 (oneOf " \t") - lns <- many $ choice $ [ indentedLine indents, - try $ do b <- blanklines - l <- indentedLine indents - return (b ++ l) ] - optional blanklines - return $ concat lns - -codeBlock :: GenParser Char st Block -codeBlock = try $ do - codeBlockStart - result <- indentedBlock - return $ CodeBlock ("",[],[]) $ stripTrailingNewlines result - -lhsCodeBlock :: GenParser Char ParserState Block -lhsCodeBlock = try $ do - failUnlessLHS - pos <- getPosition - when (sourceColumn pos /= 1) $ fail "Not in first column" - lns <- many1 birdTrackLine - -- if (as is normal) there is always a space after >, drop it - let lns' = if all (\ln -> null ln || take 1 ln == " ") lns - then map (drop 1) lns - else lns - blanklines - return $ CodeBlock ("", ["sourceCode", "haskell"], []) $ intercalate "\n" lns' - -birdTrackLine :: GenParser Char st [Char] -birdTrackLine = do - char '>' - manyTill anyChar newline - --- --- raw html --- - -rawHtmlBlock :: GenParser Char st Block -rawHtmlBlock = try $ string ".. raw:: html" >> blanklines >> - indentedBlock >>= return . RawHtml - --- --- raw latex --- - -rawLaTeXBlock :: GenParser Char st Block -rawLaTeXBlock = try $ do - string ".. raw:: latex" - blanklines - result <- indentedBlock - return $ Para [(TeX result)] - --- --- block quotes --- - -blockQuote :: GenParser Char ParserState Block -blockQuote = do - raw <- indentedBlock - -- parse the extracted block, which may contain various block elements: - contents <- parseFromString parseBlocks $ raw ++ "\n\n" - return $ BlockQuote contents - --- --- list blocks --- - -list :: GenParser Char ParserState Block -list = choice [ bulletList, orderedList, definitionList ] <?> "list" - -definitionListItem :: GenParser Char ParserState ([Inline], [Block]) -definitionListItem = try $ do - -- avoid capturing a directive or comment - notFollowedBy (try $ char '.' >> char '.') - term <- many1Till inline endline - raw <- indentedBlock - -- parse the extracted block, which may contain various block elements: - contents <- parseFromString parseBlocks $ raw ++ "\n\n" - return (normalizeSpaces term, contents) - -definitionList :: GenParser Char ParserState Block -definitionList = many1 definitionListItem >>= return . DefinitionList - --- parses bullet list start and returns its length (inc. following whitespace) -bulletListStart :: GenParser Char st Int -bulletListStart = try $ do - notFollowedBy' hrule -- because hrules start out just like lists - marker <- oneOf bulletListMarkers - white <- many1 spaceChar - return $ length (marker:white) - --- parses ordered list start and returns its length (inc following whitespace) -orderedListStart :: ListNumberStyle - -> ListNumberDelim - -> GenParser Char st Int -orderedListStart style delim = try $ do - (_, markerLen) <- withHorizDisplacement (orderedListMarker style delim) - white <- many1 spaceChar - return $ markerLen + length white - --- parse a line of a list item -listLine :: Int -> GenParser Char ParserState [Char] -listLine markerLength = try $ do - notFollowedBy blankline - indentWith markerLength - line <- manyTill anyChar newline - return $ line ++ "\n" - --- indent by specified number of spaces (or equiv. tabs) -indentWith :: Int -> GenParser Char ParserState [Char] -indentWith num = do - state <- getState - let tabStop = stateTabStop state - if (num < tabStop) - then count num (char ' ') - else choice [ try (count num (char ' ')), - (try (char '\t' >> count (num - tabStop) (char ' '))) ] - --- parse raw text for one list item, excluding start marker and continuations -rawListItem :: GenParser Char ParserState Int - -> GenParser Char ParserState (Int, [Char]) -rawListItem start = try $ do - markerLength <- start - firstLine <- manyTill anyChar newline - restLines <- many (listLine markerLength) - return (markerLength, (firstLine ++ "\n" ++ (concat restLines))) - --- continuation of a list item - indented and separated by blankline or --- (in compact lists) endline. --- Note: nested lists are parsed as continuations. -listContinuation :: Int -> GenParser Char ParserState [Char] -listContinuation markerLength = try $ do - blanks <- many1 blankline - result <- many1 (listLine markerLength) - return $ blanks ++ concat result - -listItem :: GenParser Char ParserState Int - -> GenParser Char ParserState [Block] -listItem start = try $ do - (markerLength, first) <- rawListItem start - rest <- many (listContinuation markerLength) - blanks <- choice [ try (many blankline >>~ lookAhead start), - many1 blankline ] -- whole list must end with blank. - -- parsing with ListItemState forces markers at beginning of lines to - -- count as list item markers, even if not separated by blank space. - -- see definition of "endline" - state <- getState - let oldContext = stateParserContext state - setState $ state {stateParserContext = ListItemState} - -- parse the extracted block, which may itself contain block elements - parsed <- parseFromString parseBlocks $ concat (first:rest) ++ blanks - updateState (\st -> st {stateParserContext = oldContext}) - return parsed - -orderedList :: GenParser Char ParserState Block -orderedList = try $ do - (start, style, delim) <- lookAhead (anyOrderedListMarker >>~ spaceChar) - items <- many1 (listItem (orderedListStart style delim)) - let items' = compactify items - return $ OrderedList (start, style, delim) items' - -bulletList :: GenParser Char ParserState Block -bulletList = many1 (listItem bulletListStart) >>= - return . BulletList . compactify - --- --- unknown directive (e.g. comment) --- - -unknownDirective :: GenParser Char st Block -unknownDirective = try $ do - string ".." - notFollowedBy (noneOf " \t\n") - manyTill anyChar newline - many $ blanklines <|> (oneOf " \t" >> manyTill anyChar newline) - return Null - --- --- reference key --- - -quotedReferenceName :: GenParser Char ParserState [Inline] -quotedReferenceName = try $ do - char '`' >> notFollowedBy (char '`') -- `` means inline code! - label' <- many1Till inline (char '`') - return label' - -unquotedReferenceName :: GenParser Char ParserState [Inline] -unquotedReferenceName = try $ do - label' <- many1Till inline (lookAhead $ char ':') - return label' - -isolated :: Char -> GenParser Char st Char -isolated ch = try $ char ch >>~ notFollowedBy (char ch) - -simpleReferenceName :: GenParser Char st [Inline] -simpleReferenceName = do - raw <- many1 (alphaNum <|> isolated '-' <|> isolated '.' <|> - (try $ char '_' >>~ lookAhead alphaNum)) - return [Str raw] - -referenceName :: GenParser Char ParserState [Inline] -referenceName = quotedReferenceName <|> - (try $ simpleReferenceName >>~ lookAhead (char ':')) <|> - unquotedReferenceName - -referenceKey :: GenParser Char ParserState [Char] -referenceKey = do - startPos <- getPosition - key <- choice [imageKey, anonymousKey, regularKey] - st <- getState - let oldkeys = stateKeys st - updateState $ \s -> s { stateKeys = key : oldkeys } - optional blanklines - endPos <- getPosition - -- return enough blanks to replace key - return $ replicate (sourceLine endPos - sourceLine startPos) '\n' - -targetURI :: GenParser Char st [Char] -targetURI = do - skipSpaces - optional newline - contents <- many1 (try (many spaceChar >> newline >> - many1 spaceChar >> noneOf " \t\n") <|> noneOf "\n") - blanklines - return contents - -imageKey :: GenParser Char ParserState ([Inline], (String, [Char])) -imageKey = try $ do - string ".. |" - ref <- manyTill inline (char '|') - skipSpaces - string "image::" - src <- targetURI - return (normalizeSpaces ref, (removeLeadingTrailingSpace src, "")) - -anonymousKey :: GenParser Char st ([Inline], (String, [Char])) -anonymousKey = try $ do - oneOfStrings [".. __:", "__"] - src <- targetURI - return ([Str "_"], (removeLeadingTrailingSpace src, "")) - -regularKey :: GenParser Char ParserState ([Inline], (String, [Char])) -regularKey = try $ do - string ".. _" - ref <- referenceName - char ':' - src <- targetURI - return (normalizeSpaces ref, (removeLeadingTrailingSpace src, "")) - - -- - -- inline - -- - -inline :: GenParser Char ParserState Inline -inline = choice [ link - , str - , whitespace - , endline - , strong - , emph - , code - , image - , hyphens - , superscript - , subscript - , escapedChar - , symbol ] <?> "inline" - -hyphens :: GenParser Char ParserState Inline -hyphens = do - result <- many1 (char '-') - option Space endline - -- don't want to treat endline after hyphen or dash as a space - return $ Str result - -escapedChar :: GenParser Char st Inline -escapedChar = escaped anyChar - -symbol :: GenParser Char ParserState Inline -symbol = do - result <- oneOf specialChars - return $ Str [result] - --- parses inline code, between codeStart and codeEnd -code :: GenParser Char ParserState Inline -code = try $ do - string "``" - result <- manyTill anyChar (try (string "``")) - return $ Code $ removeLeadingTrailingSpace $ intercalate " " $ lines result - -emph :: GenParser Char ParserState Inline -emph = enclosed (char '*') (char '*') inline >>= - return . Emph . normalizeSpaces - -strong :: GenParser Char ParserState Inline -strong = enclosed (string "**") (try $ string "**") inline >>= - return . Strong . normalizeSpaces - -interpreted :: [Char] -> GenParser Char st [Inline] -interpreted role = try $ do - optional $ try $ string "\\ " - result <- enclosed (string $ ":" ++ role ++ ":`") (char '`') anyChar - try (string "\\ ") <|> lookAhead (count 1 $ oneOf " \t\n") <|> (eof >> return "") - return [Str result] - -superscript :: GenParser Char ParserState Inline -superscript = interpreted "sup" >>= (return . Superscript) - -subscript :: GenParser Char ParserState Inline -subscript = interpreted "sub" >>= (return . Subscript) - -whitespace :: GenParser Char ParserState Inline -whitespace = many1 spaceChar >> return Space <?> "whitespace" - -str :: GenParser Char ParserState Inline -str = many1 (noneOf (specialChars ++ "\t\n ")) >>= return . Str - --- an endline character that can be treated as a space, not a structural break -endline :: GenParser Char ParserState Inline -endline = try $ do - newline - notFollowedBy blankline - -- parse potential list-starts at beginning of line differently in a list: - st <- getState - if (stateParserContext st) == ListItemState - then notFollowedBy (anyOrderedListMarker >> spaceChar) >> - notFollowedBy' bulletListStart - else return () - return Space - --- --- links --- - -link :: GenParser Char ParserState Inline -link = choice [explicitLink, referenceLink, autoLink] <?> "link" - -explicitLink :: GenParser Char ParserState Inline -explicitLink = try $ do - char '`' - notFollowedBy (char '`') -- `` marks start of inline code - label' <- manyTill (notFollowedBy (char '`') >> inline) - (try (spaces >> char '<')) - src <- manyTill (noneOf ">\n ") (char '>') - skipSpaces - string "`_" - return $ Link (normalizeSpaces label') (removeLeadingTrailingSpace src, "") - -referenceLink :: GenParser Char ParserState Inline -referenceLink = try $ do - label' <- (quotedReferenceName <|> simpleReferenceName) >>~ char '_' - key <- option label' (do{char '_'; return [Str "_"]}) -- anonymous link - state <- getState - let keyTable = stateKeys state - src <- case lookupKeySrc keyTable key of - Nothing -> fail "no corresponding key" - Just target -> return target - -- if anonymous link, remove first anon key so it won't be used again - let keyTable' = if (key == [Str "_"]) -- anonymous link? - then delete ([Str "_"], src) keyTable -- remove first anon key - else keyTable - setState $ state { stateKeys = keyTable' } - return $ Link (normalizeSpaces label') src - -autoURI :: GenParser Char ParserState Inline -autoURI = do - src <- uri - return $ Link [Str src] (src, "") - -autoEmail :: GenParser Char ParserState Inline -autoEmail = do - src <- emailAddress - return $ Link [Str src] ("mailto:" ++ src, "") - -autoLink :: GenParser Char ParserState Inline -autoLink = autoURI <|> autoEmail - --- For now, we assume that all substitution references are for images. -image :: GenParser Char ParserState Inline -image = try $ do - char '|' - ref <- manyTill inline (char '|') - state <- getState - let keyTable = stateKeys state - src <- case lookupKeySrc keyTable ref of - Nothing -> fail "no corresponding key" - Just target -> return target - return $ Image (normalizeSpaces ref) src - diff --git a/Text/Pandoc/Readers/TeXMath.hs b/Text/Pandoc/Readers/TeXMath.hs deleted file mode 100644 index 04b0f3b8f..000000000 --- a/Text/Pandoc/Readers/TeXMath.hs +++ /dev/null @@ -1,233 +0,0 @@ -{- -Copyright (C) 2007 John MacFarlane <jgm@berkeley.edu> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.TeXMath - Copyright : Copyright (C) 2007 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane <jgm@berkeley.edu> - Stability : alpha - Portability : portable - -Conversion of TeX math to a list of 'Pandoc' inline elements. --} -module Text.Pandoc.Readers.TeXMath ( - readTeXMath - ) where - -import Text.ParserCombinators.Parsec -import Text.Pandoc.Definition - --- | Converts a string of raw TeX math to a list of 'Pandoc' inlines. -readTeXMath :: String -> [Inline] -readTeXMath inp = case parse teXMath ("formula: " ++ inp) inp of - Left _ -> [Str inp] -- if unparseable, just include original - Right res -> res - -teXMath :: GenParser Char st [Inline] -teXMath = manyTill mathPart eof >>= return . concat - -mathPart :: GenParser Char st [Inline] -mathPart = whitespace <|> superscript <|> subscript <|> symbol <|> - argument <|> digits <|> letters <|> misc - -whitespace :: GenParser Char st [Inline] -whitespace = many1 space >> return [] - -symbol :: GenParser Char st [Inline] -symbol = try $ do - char '\\' - res <- many1 letter - case lookup res teXsymbols of - Just m -> return [Str m] - Nothing -> return [Str $ "\\" ++ res] - -argument :: GenParser Char st [Inline] -argument = try $ do - char '{' - res <- many mathPart - char '}' - return $ if null res - then [Str " "] - else [Str "{"] ++ concat res ++ [Str "}"] - -digits :: GenParser Char st [Inline] -digits = do - res <- many1 digit - return [Str res] - -letters :: GenParser Char st [Inline] -letters = do - res <- many1 letter - return [Emph [Str res]] - -misc :: GenParser Char st [Inline] -misc = do - res <- noneOf "}" - return [Str [res]] - -scriptArg :: GenParser Char st [Inline] -scriptArg = try $ do - (try (do{char '{'; r <- many mathPart; char '}'; return $ concat r})) - <|> symbol - <|> (do{c <- (letter <|> digit); return [Str [c]]}) - -superscript :: GenParser Char st [Inline] -superscript = try $ do - char '^' - arg <- scriptArg - return [Superscript arg] - -subscript :: GenParser Char st [Inline] -subscript = try $ do - char '_' - arg <- scriptArg - return [Subscript arg] - -withThinSpace :: String -> String -withThinSpace str = "\x2009" ++ str ++ "\x2009" - -teXsymbols :: [(String, String)] -teXsymbols = - [("alpha","\x3B1") - ,("beta", "\x3B2") - ,("chi", "\x3C7") - ,("delta", "\x3B4") - ,("Delta", "\x394") - ,("epsilon", "\x3B5") - ,("varepsilon", "\x25B") - ,("eta", "\x3B7") - ,("gamma", "\x3B3") - ,("Gamma", "\x393") - ,("iota", "\x3B9") - ,("kappa", "\x3BA") - ,("lambda", "\x3BB") - ,("Lambda", "\x39B") - ,("mu", "\x3BC") - ,("nu", "\x3BD") - ,("omega", "\x3C9") - ,("Omega", "\x3A9") - ,("phi", "\x3C6") - ,("varphi", "\x3D5") - ,("Phi", "\x3A6") - ,("pi", "\x3C0") - ,("Pi", "\x3A0") - ,("psi", "\x3C8") - ,("Psi", "\x3A8") - ,("rho", "\x3C1") - ,("sigma", "\x3C3") - ,("Sigma", "\x3A3") - ,("tau", "\x3C4") - ,("theta", "\x3B8") - ,("vartheta", "\x3D1") - ,("Theta", "\x398") - ,("upsilon", "\x3C5") - ,("xi", "\x3BE") - ,("Xi", "\x39E") - ,("zeta", "\x3B6") - ,("ne", "\x2260") - ,("lt", withThinSpace "<") - ,("le", withThinSpace "\x2264") - ,("leq", withThinSpace "\x2264") - ,("ge", withThinSpace "\x2265") - ,("geq", withThinSpace "\x2265") - ,("prec", withThinSpace "\x227A") - ,("succ", withThinSpace "\x227B") - ,("preceq", withThinSpace "\x2AAF") - ,("succeq", withThinSpace "\x2AB0") - ,("in", withThinSpace "\x2208") - ,("notin", withThinSpace "\x2209") - ,("subset", withThinSpace "\x2282") - ,("supset", withThinSpace "\x2283") - ,("subseteq", withThinSpace "\x2286") - ,("supseteq", withThinSpace "\x2287") - ,("equiv", withThinSpace "\x2261") - ,("cong", withThinSpace "\x2245") - ,("approx", withThinSpace "\x2248") - ,("propto", withThinSpace "\x221D") - ,("cdot", withThinSpace "\x22C5") - ,("star", withThinSpace "\x22C6") - ,("backslash", "\\") - ,("times", withThinSpace "\x00D7") - ,("divide", withThinSpace "\x00F7") - ,("circ", withThinSpace "\x2218") - ,("oplus", withThinSpace "\x2295") - ,("otimes", withThinSpace "\x2297") - ,("odot", withThinSpace "\x2299") - ,("sum", "\x2211") - ,("prod", "\x220F") - ,("wedge", withThinSpace "\x2227") - ,("bigwedge", withThinSpace "\x22C0") - ,("vee", withThinSpace "\x2228") - ,("bigvee", withThinSpace "\x22C1") - ,("cap", withThinSpace "\x2229") - ,("bigcap", withThinSpace "\x22C2") - ,("cup", withThinSpace "\x222A") - ,("bigcup", withThinSpace "\x22C3") - ,("neg", "\x00AC") - ,("implies", withThinSpace "\x21D2") - ,("iff", withThinSpace "\x21D4") - ,("forall", "\x2200") - ,("exists", "\x2203") - ,("bot", "\x22A5") - ,("top", "\x22A4") - ,("vdash", "\x22A2") - ,("models", withThinSpace "\x22A8") - ,("uparrow", "\x2191") - ,("downarrow", "\x2193") - ,("rightarrow", withThinSpace "\x2192") - ,("to", withThinSpace "\x2192") - ,("rightarrowtail", "\x21A3") - ,("twoheadrightarrow", withThinSpace "\x21A0") - ,("twoheadrightarrowtail", withThinSpace "\x2916") - ,("mapsto", withThinSpace "\x21A6") - ,("leftarrow", withThinSpace "\x2190") - ,("leftrightarrow", withThinSpace "\x2194") - ,("Rightarrow", withThinSpace "\x21D2") - ,("Leftarrow", withThinSpace "\x21D0") - ,("Leftrightarrow", withThinSpace "\x21D4") - ,("partial", "\x2202") - ,("nabla", "\x2207") - ,("pm", "\x00B1") - ,("emptyset", "\x2205") - ,("infty", "\x221E") - ,("aleph", "\x2135") - ,("ldots", "...") - ,("therefore", "\x2234") - ,("angle", "\x2220") - ,("quad", "\x00A0\x00A0") - ,("cdots", "\x22EF") - ,("vdots", "\x22EE") - ,("ddots", "\x22F1") - ,("diamond", "\x22C4") - ,("Box", "\x25A1") - ,("lfloor", "\x230A") - ,("rfloor", "\x230B") - ,("lceiling", "\x2308") - ,("rceiling", "\x2309") - ,("langle", "\x2329") - ,("rangle", "\x232A") - ,("{", "{") - ,("}", "}") - ,("[", "[") - ,("]", "]") - ,("|", "|") - ,("||", "||") - ] - |