aboutsummaryrefslogtreecommitdiff
path: root/Text/Pandoc/Readers
diff options
context:
space:
mode:
authorfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2007-11-28 21:01:17 +0000
committerfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2007-11-28 21:01:17 +0000
commit47a4a3ab897ab748a4b7eab2ccb95cd9cb0e3864 (patch)
treeead5f33d20aa3a88c78f11bb8e598a4f741247f1 /Text/Pandoc/Readers
parent3dfe4cb708ac14369aab19383c1008b2cf4adbc4 (diff)
downloadpandoc-47a4a3ab897ab748a4b7eab2ccb95cd9cb0e3864.tar.gz
Removed Text directory. This is a remnant of an experiment
moving the contents of src/ to the top level, and should have been deleted long ago. git-svn-id: https://pandoc.googlecode.com/svn/trunk@1097 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'Text/Pandoc/Readers')
-rw-r--r--Text/Pandoc/Readers/HTML.hs496
-rw-r--r--Text/Pandoc/Readers/LaTeX.hs651
-rw-r--r--Text/Pandoc/Readers/Markdown.hs909
-rw-r--r--Text/Pandoc/Readers/RST.hs640
4 files changed, 0 insertions, 2696 deletions
diff --git a/Text/Pandoc/Readers/HTML.hs b/Text/Pandoc/Readers/HTML.hs
deleted file mode 100644
index 70a071152..000000000
--- a/Text/Pandoc/Readers/HTML.hs
+++ /dev/null
@@ -1,496 +0,0 @@
-{-
-Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
--}
-
-{- |
- Module : Text.Pandoc.Readers.HTML
- Copyright : Copyright (C) 2006-7 John MacFarlane
- License : GNU GPL, version 2 or above
-
- Maintainer : John MacFarlane <jgm@berkeley.edu>
- Stability : alpha
- Portability : portable
-
-Conversion of HTML to 'Pandoc' document.
--}
-module Text.Pandoc.Readers.HTML (
- readHtml,
- rawHtmlInline,
- rawHtmlBlock,
- anyHtmlBlockTag,
- anyHtmlInlineTag,
- anyHtmlTag,
- anyHtmlEndTag,
- htmlEndTag,
- extractTagType,
- htmlBlockElement
- ) where
-
-import Text.ParserCombinators.Parsec
-import Text.Pandoc.Definition
-import Text.Pandoc.Shared
-import Text.Pandoc.CharacterReferences ( characterReference,
- decodeCharacterReferences )
-import Data.Maybe ( fromMaybe )
-import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf )
-import Data.Char ( toUpper, toLower, isAlphaNum )
-
--- | Convert HTML-formatted string to 'Pandoc' document.
-readHtml :: ParserState -- ^ Parser state
- -> String -- ^ String to parse
- -> Pandoc
-readHtml = readWith parseHtml
-
---
--- Constants
---
-
-eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
- "map", "area", "object", "script"]
-
-inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
- "br", "cite", "code", "dfn", "em", "font", "i", "img",
- "input", "kbd", "label", "q", "s", "samp", "select",
- "small", "span", "strike", "strong", "sub", "sup",
- "textarea", "tt", "u", "var"] ++ eitherBlockOrInline
-
-blockHtmlTags = ["address", "blockquote", "center", "dir", "div",
- "dl", "fieldset", "form", "h1", "h2", "h3", "h4",
- "h5", "h6", "hr", "isindex", "menu", "noframes",
- "noscript", "ol", "p", "pre", "table", "ul", "dd",
- "dt", "frameset", "li", "tbody", "td", "tfoot",
- "th", "thead", "tr"] ++ eitherBlockOrInline
-
---
--- HTML utility functions
---
-
--- | Read blocks until end tag.
-blocksTilEnd tag = do
- blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
- return $ filter (/= Null) blocks
-
--- | Read inlines until end tag.
-inlinesTilEnd tag = manyTill inline (htmlEndTag tag)
-
--- | Parse blocks between open and close tag.
-blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag
-
--- | Parse inlines between open and close tag.
-inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag
-
--- | Extract type from a tag: e.g. @br@ from @\<br\>@
-extractTagType :: String -> String
-extractTagType ('<':rest) =
- let isSpaceOrSlash c = c `elem` "/ \n\t" in
- map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest
-extractTagType _ = ""
-
--- | Parse any HTML tag (opening or self-closing) and return text of tag
-anyHtmlTag = try $ do
- char '<'
- spaces
- tag <- many1 alphaNum
- attribs <- many htmlAttribute
- spaces
- ender <- option "" (string "/")
- let ender' = if null ender then "" else " /"
- spaces
- char '>'
- return $ "<" ++ tag ++
- concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
-
-anyHtmlEndTag = try $ do
- char '<'
- spaces
- char '/'
- spaces
- tagType <- many1 alphaNum
- spaces
- char '>'
- return $ "</" ++ tagType ++ ">"
-
-htmlTag :: String -> GenParser Char st (String, [(String, String)])
-htmlTag tag = try $ do
- char '<'
- spaces
- stringAnyCase tag
- attribs <- many htmlAttribute
- spaces
- optional (string "/")
- spaces
- char '>'
- return (tag, (map (\(name, content, raw) -> (name, content)) attribs))
-
--- parses a quoted html attribute value
-quoted quoteChar = do
- result <- between (char quoteChar) (char quoteChar)
- (many (noneOf [quoteChar]))
- return (result, [quoteChar])
-
-htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute
-
--- minimized boolean attribute
-htmlMinimizedAttribute = try $ do
- many1 space
- name <- many1 (choice [letter, oneOf ".-_:"])
- return (name, name, name)
-
-htmlRegularAttribute = try $ do
- many1 space
- name <- many1 (choice [letter, oneOf ".-_:"])
- spaces
- char '='
- spaces
- (content, quoteStr) <- choice [ (quoted '\''),
- (quoted '"'),
- (do
- a <- many (alphaNum <|> (oneOf "-._:"))
- return (a,"")) ]
- return (name, content,
- (name ++ "=" ++ quoteStr ++ content ++ quoteStr))
-
--- | Parse an end tag of type 'tag'
-htmlEndTag tag = try $ do
- char '<'
- spaces
- char '/'
- spaces
- stringAnyCase tag
- spaces
- char '>'
- return $ "</" ++ tag ++ ">"
-
--- | Returns @True@ if the tag is (or can be) an inline tag.
-isInline tag = (extractTagType tag) `elem` inlineHtmlTags
-
--- | Returns @True@ if the tag is (or can be) a block tag.
-isBlock tag = (extractTagType tag) `elem` blockHtmlTags
-
-anyHtmlBlockTag = try $ do
- tag <- anyHtmlTag <|> anyHtmlEndTag
- if isBlock tag then return tag else fail "inline tag"
-
-anyHtmlInlineTag = try $ do
- tag <- anyHtmlTag <|> anyHtmlEndTag
- if isInline tag then return tag else fail "not an inline tag"
-
--- | Parses material between script tags.
--- Scripts must be treated differently, because they can contain '<>' etc.
-htmlScript = try $ do
- open <- string "<script"
- rest <- manyTill anyChar (htmlEndTag "script")
- return $ open ++ rest ++ "</script>"
-
-htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ]
-
-rawHtmlBlock = try $ do
- notFollowedBy' (htmlTag "/body" <|> htmlTag "/html")
- body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag
- sp <- many space
- state <- getState
- if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null
-
--- | Parses an HTML comment.
-htmlComment = try $ do
- string "<!--"
- comment <- manyTill anyChar (try (string "-->"))
- return $ "<!--" ++ comment ++ "-->"
-
---
--- parsing documents
---
-
-xmlDec = try $ do
- string "<?"
- rest <- manyTill anyChar (char '>')
- return $ "<?" ++ rest ++ ">"
-
-definition = try $ do
- string "<!"
- rest <- manyTill anyChar (char '>')
- return $ "<!" ++ rest ++ ">"
-
-nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >>
- ((rawHtmlBlock >> return ' ') <|> anyChar)
-
-parseTitle = try $ do
- (tag, _) <- htmlTag "title"
- contents <- inlinesTilEnd tag
- spaces
- return contents
-
--- parse header and return meta-information (for now, just title)
-parseHead = try $ do
- htmlTag "head"
- spaces
- skipMany nonTitleNonHead
- contents <- option [] parseTitle
- skipMany nonTitleNonHead
- htmlTag "/head"
- return (contents, [], "")
-
-skipHtmlTag tag = optional (htmlTag tag)
-
--- h1 class="title" representation of title in body
-bodyTitle = try $ do
- (tag, attribs) <- htmlTag "h1"
- cl <- case (extractAttribute "class" attribs) of
- Just "title" -> return ""
- otherwise -> fail "not title"
- inlinesTilEnd "h1"
-
-parseHtml = do
- sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
- skipHtmlTag "html"
- spaces
- (title, authors, date) <- option ([], [], "") parseHead
- spaces
- skipHtmlTag "body"
- spaces
- optional bodyTitle -- skip title in body, because it's represented in meta
- blocks <- parseBlocks
- spaces
- optional (htmlEndTag "body")
- spaces
- optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html>
- eof
- return $ Pandoc (Meta title authors date) blocks
-
---
--- parsing blocks
---
-
-parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null))
-
-block = choice [ codeBlock
- , header
- , hrule
- , list
- , blockQuote
- , para
- , plain
- , rawHtmlBlock ] <?> "block"
-
---
--- header blocks
---
-
-header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
-
-headerLevel n = try $ do
- let level = "h" ++ show n
- (tag, attribs) <- htmlTag level
- contents <- inlinesTilEnd level
- return $ Header n (normalizeSpaces contents)
-
---
--- hrule block
---
-
-hrule = try $ do
- (tag, attribs) <- htmlTag "hr"
- state <- getState
- if not (null attribs) && stateParseRaw state
- then unexpected "attributes in hr" -- parse as raw in this case
- else return HorizontalRule
-
---
--- code blocks
---
-
--- Note: HTML tags in code blocks (e.g. for syntax highlighting) are
--- skipped, because they are not portable to output formats other than HTML.
-codeBlock = try $ do
- htmlTag "pre"
- result <- manyTill
- (many1 (satisfy (/= '<')) <|>
- ((anyHtmlTag <|> anyHtmlEndTag) >> return ""))
- (htmlEndTag "pre")
- let result' = concat result
- -- drop leading newline if any
- let result'' = if "\n" `isPrefixOf` result'
- then drop 1 result'
- else result'
- -- drop trailing newline if any
- let result''' = if "\n" `isSuffixOf` result''
- then init result''
- else result''
- return $ CodeBlock $ decodeCharacterReferences result'''
-
---
--- block quotes
---
-
-blockQuote = try $ htmlTag "blockquote" >> spaces >>
- blocksTilEnd "blockquote" >>= (return . BlockQuote)
-
---
--- list blocks
---
-
-list = choice [ bulletList, orderedList, definitionList ] <?> "list"
-
-orderedList = try $ do
- (_, attribs) <- htmlTag "ol"
- (start, style) <- option (1, DefaultStyle) $
- do failIfStrict
- let sta = fromMaybe "1" $
- lookup "start" attribs
- let sty = fromMaybe (fromMaybe "" $
- lookup "style" attribs) $
- lookup "class" attribs
- let sty' = case sty of
- "lower-roman" -> LowerRoman
- "upper-roman" -> UpperRoman
- "lower-alpha" -> LowerAlpha
- "upper-alpha" -> UpperAlpha
- "decimal" -> Decimal
- _ -> DefaultStyle
- return (read sta, sty')
- spaces
- items <- sepEndBy1 (blocksIn "li") spaces
- htmlEndTag "ol"
- return $ OrderedList (start, style, DefaultDelim) items
-
-bulletList = try $ do
- htmlTag "ul"
- spaces
- items <- sepEndBy1 (blocksIn "li") spaces
- htmlEndTag "ul"
- return $ BulletList items
-
-definitionList = try $ do
- failIfStrict -- def lists not part of standard markdown
- tag <- htmlTag "dl"
- spaces
- items <- sepEndBy1 definitionListItem spaces
- htmlEndTag "dl"
- return $ DefinitionList items
-
-definitionListItem = try $ do
- terms <- sepEndBy1 (inlinesIn "dt") spaces
- defs <- sepEndBy1 (blocksIn "dd") spaces
- let term = joinWithSep [LineBreak] terms
- return (term, concat defs)
-
---
--- paragraph block
---
-
-para = try $ htmlTag "p" >> inlinesTilEnd "p" >>=
- return . Para . normalizeSpaces
-
---
--- plain block
---
-
-plain = many1 inline >>= return . Plain . normalizeSpaces
-
---
--- inline
---
-
-inline = choice [ charRef
- , strong
- , emph
- , superscript
- , subscript
- , strikeout
- , spanStrikeout
- , code
- , str
- , linebreak
- , whitespace
- , link
- , image
- , rawHtmlInline
- ] <?> "inline"
-
-code = try $ do
- htmlTag "code"
- result <- manyTill anyChar (htmlEndTag "code")
- -- remove internal line breaks, leading and trailing space,
- -- and decode character references
- return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $
- joinWithSep " " $ lines result
-
-rawHtmlInline = do
- result <- htmlScript <|> htmlComment <|> anyHtmlInlineTag
- state <- getState
- if stateParseRaw state then return (HtmlInline result) else return (Str "")
-
-betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>=
- return . normalizeSpaces
-
-emph = (betweenTags "em" <|> betweenTags "it") >>= return . Emph
-
-strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong
-
-superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript
-
-subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript
-
-strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>=
- return . Strikeout
-
-spanStrikeout = try $ do
- failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
- (tag, attributes) <- htmlTag "span"
- result <- case (extractAttribute "class" attributes) of
- Just "strikeout" -> inlinesTilEnd "span"
- _ -> fail "not a strikeout"
- return $ Strikeout result
-
-whitespace = many1 space >> return Space
-
--- hard line break
-linebreak = htmlTag "br" >> optional newline >> return LineBreak
-
-str = many1 (noneOf "<& \t\n") >>= return . Str
-
---
--- links and images
---
-
--- extract contents of attribute (attribute names are case-insensitive)
-extractAttribute name [] = Nothing
-extractAttribute name ((attrName, contents):rest) =
- let name' = map toLower name
- attrName' = map toLower attrName
- in if attrName' == name'
- then Just (decodeCharacterReferences contents)
- else extractAttribute name rest
-
-link = try $ do
- (tag, attributes) <- htmlTag "a"
- url <- case (extractAttribute "href" attributes) of
- Just url -> return url
- Nothing -> fail "no href"
- let title = fromMaybe "" $ extractAttribute "title" attributes
- label <- inlinesTilEnd "a"
- return $ Link (normalizeSpaces label) (url, title)
-
-image = try $ do
- (tag, attributes) <- htmlTag "img"
- url <- case (extractAttribute "src" attributes) of
- Just url -> return url
- Nothing -> fail "no src"
- let title = fromMaybe "" $ extractAttribute "title" attributes
- let alt = fromMaybe "" (extractAttribute "alt" attributes)
- return $ Image [Str alt] (url, title)
-
diff --git a/Text/Pandoc/Readers/LaTeX.hs b/Text/Pandoc/Readers/LaTeX.hs
deleted file mode 100644
index 37cc2bfe4..000000000
--- a/Text/Pandoc/Readers/LaTeX.hs
+++ /dev/null
@@ -1,651 +0,0 @@
-{-
-Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
--}
-
-{- |
- Module : Text.Pandoc.Readers.LaTeX
- Copyright : Copyright (C) 2006-7 John MacFarlane
- License : GNU GPL, version 2 or above
-
- Maintainer : John MacFarlane <jgm@berkeley.edu>
- Stability : alpha
- Portability : portable
-
-Conversion of LaTeX to 'Pandoc' document.
--}
-module Text.Pandoc.Readers.LaTeX (
- readLaTeX,
- rawLaTeXInline,
- rawLaTeXEnvironment
- ) where
-
-import Text.ParserCombinators.Parsec
-import Text.Pandoc.Definition
-import Text.Pandoc.Shared
-import Data.Maybe ( fromMaybe )
-import Data.Char ( chr )
-import Data.List ( isPrefixOf, isSuffixOf )
-
--- | Parse LaTeX from string and return 'Pandoc' document.
-readLaTeX :: ParserState -- ^ Parser state, including options for parser
- -> String -- ^ String to parse
- -> Pandoc
-readLaTeX = readWith parseLaTeX
-
--- characters with special meaning
-specialChars = "\\`$%^&_~#{}\n \t|<>'\"-"
-
---
--- utility functions
---
-
--- | Returns text between brackets and its matching pair.
-bracketedText openB closeB = do
- result <- charsInBalanced' openB closeB
- return $ [openB] ++ result ++ [closeB]
-
--- | Returns an option or argument of a LaTeX command.
-optOrArg = bracketedText '{' '}' <|> bracketedText '[' ']'
-
--- | True if the string begins with '{'.
-isArg ('{':rest) = True
-isArg other = False
-
--- | Returns list of options and arguments of a LaTeX command.
-commandArgs = many optOrArg
-
--- | Parses LaTeX command, returns (name, star, list of options or arguments).
-command = do
- char '\\'
- name <- many1 letter
- star <- option "" (string "*") -- some commands have starred versions
- args <- commandArgs
- return (name, star, args)
-
-begin name = try $ do
- string $ "\\begin{" ++ name ++ "}"
- optional commandArgs
- spaces
- return name
-
-end name = try $ do
- string $ "\\end{" ++ name ++ "}"
- spaces
- return name
-
--- | Returns a list of block elements containing the contents of an
--- environment.
-environment name = try $ begin name >> spaces >> manyTill block (end name)
-
-anyEnvironment = try $ do
- string "\\begin{"
- name <- many letter
- star <- option "" (string "*") -- some environments have starred variants
- char '}'
- optional commandArgs
- spaces
- contents <- manyTill block (end (name ++ star))
- return $ BlockQuote contents
-
---
--- parsing documents
---
-
--- | Process LaTeX preamble, extracting metadata.
-processLaTeXPreamble = try $ manyTill
- (choice [bibliographic, comment, unknownCommand, nullBlock])
- (try (string "\\begin{document}")) >>
- spaces
-
--- | Parse LaTeX and return 'Pandoc'.
-parseLaTeX = do
- optional processLaTeXPreamble -- preamble might not be present (fragment)
- spaces
- blocks <- parseBlocks
- spaces
- optional $ try (string "\\end{document}" >> many anyChar)
- -- might not be present (fragment)
- spaces
- eof
- state <- getState
- let blocks' = filter (/= Null) blocks
- let title' = stateTitle state
- let authors' = stateAuthors state
- let date' = stateDate state
- return $ Pandoc (Meta title' authors' date') blocks'
-
---
--- parsing blocks
---
-
-parseBlocks = spaces >> many block
-
-block = choice [ hrule
- , codeBlock
- , header
- , list
- , blockQuote
- , mathBlock
- , comment
- , bibliographic
- , para
- , specialEnvironment
- , itemBlock
- , unknownEnvironment
- , unknownCommand ] <?> "block"
-
---
--- header blocks
---
-
-header = try $ do
- char '\\'
- subs <- many (try (string "sub"))
- string "section"
- optional (char '*')
- char '{'
- title <- manyTill inline (char '}')
- spaces
- return $ Header (length subs + 1) (normalizeSpaces title)
-
---
--- hrule block
---
-
-hrule = oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n",
- "\\newpage" ] >> spaces >> return HorizontalRule
-
---
--- code blocks
---
-
-codeBlock = codeBlock1 <|> codeBlock2
-
-codeBlock1 = try $ do
- string "\\begin{verbatim}" -- don't use begin function because it
- -- gobbles whitespace
- optional blanklines -- we want to gobble blank lines, but not
- -- leading space
- contents <- manyTill anyChar (try (string "\\end{verbatim}"))
- spaces
- return $ CodeBlock (stripTrailingNewlines contents)
-
-codeBlock2 = try $ do
- string "\\begin{Verbatim}" -- used by fancyvrb package
- option "" blanklines
- contents <- manyTill anyChar (try (string "\\end{Verbatim}"))
- spaces
- return $ CodeBlock (stripTrailingNewlines contents)
-
---
--- block quotes
---
-
-blockQuote = (environment "quote" <|> environment "quotation") >>~ spaces >>=
- return . BlockQuote
-
---
--- math block
---
-
-mathBlock = mathBlockWith (begin "equation") (end "equation") <|>
- mathBlockWith (begin "displaymath") (end "displaymath") <|>
- mathBlockWith (string "\\[") (string "\\]") <?> "math block"
-
-mathBlockWith start end = try $ do
- start
- spaces
- result <- manyTill anyChar end
- spaces
- return $ BlockQuote [Para [TeX ("$" ++ result ++ "$")]]
-
---
--- list blocks
---
-
-list = bulletList <|> orderedList <|> definitionList <?> "list"
-
-listItem = try $ do
- ("item", _, args) <- command
- spaces
- state <- getState
- let oldParserContext = stateParserContext state
- updateState (\state -> state {stateParserContext = ListItemState})
- blocks <- many block
- updateState (\state -> state {stateParserContext = oldParserContext})
- opt <- case args of
- ([x]) | "[" `isPrefixOf` x && "]" `isSuffixOf` x ->
- parseFromString (many inline) $ tail $ init x
- _ -> return []
- return (opt, blocks)
-
-orderedList = try $ do
- string "\\begin{enumerate}"
- (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) $
- try $ do failIfStrict
- char '['
- res <- anyOrderedListMarker
- char ']'
- return res
- spaces
- option "" $ try $ do string "\\setlength{\\itemindent}"
- char '{'
- manyTill anyChar (char '}')
- spaces
- start <- option 1 $ try $ do failIfStrict
- string "\\setcounter{enum"
- many1 (oneOf "iv")
- string "}{"
- num <- many1 digit
- char '}'
- spaces
- return $ (read num) + 1
- items <- many listItem
- end "enumerate"
- spaces
- return $ OrderedList (start, style, delim) $ map snd items
-
-bulletList = try $ do
- begin "itemize"
- spaces
- items <- many listItem
- end "itemize"
- spaces
- return (BulletList $ map snd items)
-
-definitionList = try $ do
- begin "description"
- spaces
- items <- many listItem
- end "description"
- spaces
- return (DefinitionList items)
-
---
--- paragraph block
---
-
-para = many1 inline >>~ spaces >>= return . Para . normalizeSpaces
-
---
--- title authors date
---
-
-bibliographic = choice [ maketitle, title, authors, date ]
-
-maketitle = try (string "\\maketitle") >> spaces >> return Null
-
-title = try $ do
- string "\\title{"
- tit <- manyTill inline (char '}')
- spaces
- updateState (\state -> state { stateTitle = tit })
- return Null
-
-authors = try $ do
- string "\\author{"
- authors <- manyTill anyChar (char '}')
- spaces
- let authors' = map removeLeadingTrailingSpace $ lines $
- substitute "\\\\" "\n" authors
- updateState (\state -> state { stateAuthors = authors' })
- return Null
-
-date = try $ do
- string "\\date{"
- date' <- manyTill anyChar (char '}')
- spaces
- updateState (\state -> state { stateDate = date' })
- return Null
-
---
--- item block
--- for use in unknown environments that aren't being parsed as raw latex
---
-
--- this forces items to be parsed in different blocks
-itemBlock = try $ do
- ("item", _, args) <- command
- state <- getState
- if (stateParserContext state == ListItemState)
- then fail "item should be handled by list block"
- else if null args
- then return Null
- else return $ Plain [Str (stripFirstAndLast (head args))]
-
---
--- raw LaTeX
---
-
-specialEnvironment = do -- these are always parsed as raw
- lookAhead (choice (map (\name -> begin name) ["tabular", "figure",
- "tabbing", "eqnarry", "picture", "table", "verse", "theorem"]))
- rawLaTeXEnvironment
-
--- | Parse any LaTeX environment and return a Para block containing
--- the whole literal environment as raw TeX.
-rawLaTeXEnvironment :: GenParser Char st Block
-rawLaTeXEnvironment = try $ do
- string "\\begin{"
- name <- many1 letter
- star <- option "" (string "*") -- for starred variants
- let name' = name ++ star
- char '}'
- args <- option [] commandArgs
- let argStr = concat args
- contents <- manyTill (choice [ (many1 (noneOf "\\")),
- (do
- (Para [TeX str]) <- rawLaTeXEnvironment
- return str),
- string "\\" ])
- (end name')
- spaces
- return $ Para [TeX $ "\\begin{" ++ name' ++ "}" ++ argStr ++
- concat contents ++ "\\end{" ++ name' ++ "}"]
-
-unknownEnvironment = try $ do
- state <- getState
- result <- if stateParseRaw state -- check whether we should include raw TeX
- then rawLaTeXEnvironment -- if so, get whole raw environment
- else anyEnvironment -- otherwise just the contents
- return result
-
-unknownCommand = try $ do
- notFollowedBy' $ choice $ map end ["itemize", "enumerate", "description",
- "document"]
- (name, star, args) <- command
- spaces
- let argStr = concat args
- state <- getState
- if name == "item" && (stateParserContext state) == ListItemState
- then fail "should not be parsed as raw"
- else string ""
- if stateParseRaw state
- then return $ Plain [TeX ("\\" ++ name ++ star ++ argStr)]
- else return $ Plain [Str (joinWithSep " " args)]
-
--- latex comment
-comment = try $ char '%' >> manyTill anyChar newline >> spaces >> return Null
-
---
--- inline
---
-
-inline = choice [ str
- , endline
- , whitespace
- , quoted
- , apostrophe
- , spacer
- , strong
- , math
- , ellipses
- , emDash
- , enDash
- , hyphen
- , emph
- , strikeout
- , superscript
- , subscript
- , ref
- , lab
- , code
- , url
- , link
- , image
- , footnote
- , linebreak
- , accentedChar
- , specialChar
- , rawLaTeXInline
- , escapedChar
- , unescapedChar
- ] <?> "inline"
-
-accentedChar = normalAccentedChar <|> specialAccentedChar
-
-normalAccentedChar = try $ do
- char '\\'
- accent <- oneOf "'`^\"~"
- character <- (try $ char '{' >> letter >>~ char '}') <|> letter
- let table = fromMaybe [] $ lookup character accentTable
- let result = case lookup accent table of
- Just num -> chr num
- Nothing -> '?'
- return $ Str [result]
-
--- an association list of letters and association list of accents
--- and decimal character numbers.
-accentTable =
- [ ('A', [('`', 192), ('\'', 193), ('^', 194), ('~', 195), ('"', 196)]),
- ('E', [('`', 200), ('\'', 201), ('^', 202), ('"', 203)]),
- ('I', [('`', 204), ('\'', 205), ('^', 206), ('"', 207)]),
- ('N', [('~', 209)]),
- ('O', [('`', 210), ('\'', 211), ('^', 212), ('~', 213), ('"', 214)]),
- ('U', [('`', 217), ('\'', 218), ('^', 219), ('"', 220)]),
- ('a', [('`', 224), ('\'', 225), ('^', 227), ('"', 228)]),
- ('e', [('`', 232), ('\'', 233), ('^', 234), ('"', 235)]),
- ('i', [('`', 236), ('\'', 237), ('^', 238), ('"', 239)]),
- ('n', [('~', 241)]),
- ('o', [('`', 242), ('\'', 243), ('^', 244), ('~', 245), ('"', 246)]),
- ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ]
-
-specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig,
- oslash, pound, euro, copyright, sect ]
-
-ccedil = try $ do
- char '\\'
- letter <- oneOfStrings ["cc", "cC"]
- let num = if letter == "cc" then 231 else 199
- return $ Str [chr num]
-
-aring = try $ do
- char '\\'
- letter <- oneOfStrings ["aa", "AA"]
- let num = if letter == "aa" then 229 else 197
- return $ Str [chr num]
-
-iuml = try (string "\\\"") >> oneOfStrings ["\\i", "{\\i}"] >>
- return (Str [chr 239])
-
-icirc = try (string "\\^") >> oneOfStrings ["\\i", "{\\i}"] >>
- return (Str [chr 238])
-
-szlig = try (string "\\ss") >> return (Str [chr 223])
-
-oslash = try $ do
- char '\\'
- letter <- choice [char 'o', char 'O']
- let num = if letter == 'o' then 248 else 216
- return $ Str [chr num]
-
-aelig = try $ do
- char '\\'
- letter <- oneOfStrings ["ae", "AE"]
- let num = if letter == "ae" then 230 else 198
- return $ Str [chr num]
-
-pound = try (string "\\pounds") >> return (Str [chr 163])
-
-euro = try (string "\\euro") >> return (Str [chr 8364])
-
-copyright = try (string "\\copyright") >> return (Str [chr 169])
-
-sect = try (string "\\S") >> return (Str [chr 167])
-
-escapedChar = do
- result <- escaped (oneOf " $%&_#{}\n")
- return $ if result == Str "\n" then Str " " else result
-
--- ignore standalone, nonescaped special characters
-unescapedChar = oneOf "`$^&_#{}|<>" >> return (Str "")
-
-specialChar = choice [ backslash, tilde, caret, bar, lt, gt, doubleQuote ]
-
-backslash = try (string "\\textbackslash") >> return (Str "\\")
-
-tilde = try (string "\\ensuremath{\\sim}") >> return (Str "~")
-
-caret = try (string "\\^{}") >> return (Str "^")
-
-bar = try (string "\\textbar") >> return (Str "\\")
-
-lt = try (string "\\textless") >> return (Str "<")
-
-gt = try (string "\\textgreater") >> return (Str ">")
-
-doubleQuote = char '"' >> return (Str "\"")
-
-code = code1 <|> code2
-
-code1 = try $ do
- string "\\verb"
- marker <- anyChar
- result <- manyTill anyChar (char marker)
- return $ Code $ removeLeadingTrailingSpace result
-
-code2 = try $ do
- string "\\texttt{"
- result <- manyTill (noneOf "\\\n~$%^&{}") (char '}')
- return $ Code result
-
-emph = try $ oneOfStrings [ "\\emph{", "\\textit{" ] >>
- manyTill inline (char '}') >>= return . Emph
-
-strikeout = try $ string "\\sout{" >> manyTill inline (char '}') >>=
- return . Strikeout
-
-superscript = try $ string "\\textsuperscript{" >>
- manyTill inline (char '}') >>= return . Superscript
-
--- note: \textsubscript isn't a standard latex command, but we use
--- a defined version in pandoc.
-subscript = try $ string "\\textsubscript{" >> manyTill inline (char '}') >>=
- return . Subscript
-
-apostrophe = char '\'' >> return Apostrophe
-
-quoted = doubleQuoted <|> singleQuoted
-
-singleQuoted = enclosed singleQuoteStart singleQuoteEnd inline >>=
- return . Quoted SingleQuote . normalizeSpaces
-
-doubleQuoted = enclosed doubleQuoteStart doubleQuoteEnd inline >>=
- return . Quoted DoubleQuote . normalizeSpaces
-
-singleQuoteStart = char '`'
-
-singleQuoteEnd = try $ char '\'' >> notFollowedBy alphaNum
-
-doubleQuoteStart = string "``"
-
-doubleQuoteEnd = try $ string "''"
-
-ellipses = try $ string "\\ldots" >> optional (try (string "{}")) >>
- return Ellipses
-
-enDash = try (string "--") >> return EnDash
-
-emDash = try (string "---") >> return EmDash
-
-hyphen = char '-' >> return (Str "-")
-
-lab = try $ do
- string "\\label{"
- result <- manyTill anyChar (char '}')
- return $ Str $ "(" ++ result ++ ")"
-
-ref = try (string "\\ref{") >> manyTill anyChar (char '}') >>= return . Str
-
-strong = try (string "\\textbf{") >> manyTill inline (char '}') >>=
- return . Strong
-
-whitespace = many1 (oneOf "~ \t") >> return Space
-
--- hard line break
-linebreak = try (string "\\\\") >> return LineBreak
-
-spacer = try (string "\\,") >> return (Str "")
-
-str = many1 (noneOf specialChars) >>= return . Str
-
--- endline internal to paragraph
-endline = try $ newline >> notFollowedBy blankline >> return Space
-
--- math
-math = math1 <|> math2 <?> "math"
-
-math1 = try $ do
- char '$'
- result <- many (noneOf "$")
- char '$'
- return $ TeX ("$" ++ result ++ "$")
-
-math2 = try $ do
- string "\\("
- result <- many (noneOf "$")
- string "\\)"
- return $ TeX ("$" ++ result ++ "$")
-
---
--- links and images
---
-
-url = try $ do
- string "\\url"
- url <- charsInBalanced '{' '}'
- return $ Link [Code url] (url, "")
-
-link = try $ do
- string "\\href{"
- url <- manyTill anyChar (char '}')
- char '{'
- label <- manyTill inline (char '}')
- return $ Link (normalizeSpaces label) (url, "")
-
-image = try $ do
- ("includegraphics", _, args) <- command
- let args' = filter isArg args -- filter out options
- let src = if null args' then
- ("", "")
- else
- (stripFirstAndLast (head args'), "")
- return $ Image [Str "image"] src
-
-footnote = try $ do
- (name, _, (contents:[])) <- command
- if ((name == "footnote") || (name == "thanks"))
- then string ""
- else fail "not a footnote or thanks command"
- let contents' = stripFirstAndLast contents
- -- parse the extracted block, which may contain various block elements:
- rest <- getInput
- setInput $ contents'
- blocks <- parseBlocks
- setInput rest
- return $ Note blocks
-
--- | Parse any LaTeX command and return it in a raw TeX inline element.
-rawLaTeXInline :: GenParser Char ParserState Inline
-rawLaTeXInline = try $ do
- (name, star, args) <- command
- state <- getState
- if ((name == "begin") || (name == "end") || (name == "item"))
- then fail "not an inline command"
- else string ""
- return $ TeX ("\\" ++ name ++ star ++ concat args)
-
diff --git a/Text/Pandoc/Readers/Markdown.hs b/Text/Pandoc/Readers/Markdown.hs
deleted file mode 100644
index df84c0ac7..000000000
--- a/Text/Pandoc/Readers/Markdown.hs
+++ /dev/null
@@ -1,909 +0,0 @@
-{-
-Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
--}
-
-{- |
- Module : Text.Pandoc.Readers.Markdown
- Copyright : Copyright (C) 2006-7 John MacFarlane
- License : GNU GPL, version 2 or above
-
- Maintainer : John MacFarlane <jgm@berkeley.edu>
- Stability : alpha
- Portability : portable
-
-Conversion of markdown-formatted plain text to 'Pandoc' document.
--}
-module Text.Pandoc.Readers.Markdown (
- readMarkdown
- ) where
-
-import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy )
-import Data.Ord ( comparing )
-import Data.Char ( isAlphaNum )
-import Network.URI ( isURI )
-import Text.Pandoc.Definition
-import Text.Pandoc.Shared
-import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment )
-import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag,
- anyHtmlInlineTag, anyHtmlTag,
- anyHtmlEndTag, htmlEndTag, extractTagType,
- htmlBlockElement )
-import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
-import Text.ParserCombinators.Parsec
-
--- | Read markdown from an input string and return a Pandoc document.
-readMarkdown :: ParserState -> String -> Pandoc
-readMarkdown state str = (readWith parseMarkdown) state (str ++ "\n\n")
-
---
--- Constants and data structure definitions
---
-
-spaceChars = " \t"
-bulletListMarkers = "*+-"
-hruleChars = "*-_"
-setextHChars = "=-"
-
--- treat these as potentially non-text when parsing inline:
-specialChars = "\\[]*_~`<>$!^-.&'\""
-
---
--- auxiliary functions
---
-
-indentSpaces = try $ do
- state <- getState
- let tabStop = stateTabStop state
- try (count tabStop (char ' ')) <|>
- (many (char ' ') >> string "\t") <?> "indentation"
-
-nonindentSpaces = do
- state <- getState
- let tabStop = stateTabStop state
- sps <- many (char ' ')
- if length sps < tabStop
- then return sps
- else unexpected "indented line"
-
--- | Fail unless we're at beginning of a line.
-failUnlessBeginningOfLine = do
- pos <- getPosition
- if sourceColumn pos == 1 then return () else fail "not beginning of line"
-
--- | Fail unless we're in "smart typography" mode.
-failUnlessSmart = do
- state <- getState
- if stateSmart state then return () else fail "Smart typography feature"
-
--- | Parse an inline Str element with a given content.
-inlineString str = try $ do
- (Str res) <- inline
- if res == str then return res else fail $ "unexpected Str content"
-
--- | Parse a sequence of inline elements between a string
--- @opener@ and a string @closer@, including inlines
--- between balanced pairs of @opener@ and a @closer@.
-inlinesInBalanced :: String -> String -> GenParser Char ParserState [Inline]
-inlinesInBalanced opener closer = try $ do
- string opener
- result <- manyTill ( (do lookAhead (inlineString opener)
- -- because it might be a link...
- bal <- inlinesInBalanced opener closer
- return $ [Str opener] ++ bal ++ [Str closer])
- <|> (count 1 inline))
- (try (string closer))
- return $ concat result
-
---
--- document structure
---
-
-titleLine = try $ char '%' >> skipSpaces >> manyTill inline newline
-
-authorsLine = try $ do
- char '%'
- skipSpaces
- authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;")
- newline
- return $ map (decodeCharacterReferences . removeLeadingTrailingSpace) authors
-
-dateLine = try $ do
- char '%'
- skipSpaces
- date <- many (noneOf "\n")
- newline
- return $ decodeCharacterReferences $ removeTrailingSpace date
-
-titleBlock = try $ do
- failIfStrict
- title <- option [] titleLine
- author <- option [] authorsLine
- date <- option "" dateLine
- optional blanklines
- return (title, author, date)
-
-parseMarkdown = do
- -- markdown allows raw HTML
- updateState (\state -> state { stateParseRaw = True })
- startPos <- getPosition
- -- go through once just to get list of reference keys
- -- docMinusKeys is the raw document with blanks where the keys were...
- docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>=
- return . concat
- setInput docMinusKeys
- setPosition startPos
- st <- getState
- -- go through again for notes unless strict...
- if stateStrict st
- then return ()
- else do docMinusNotes <- manyTill (noteBlock <|> lineClump) eof >>=
- return . concat
- st <- getState
- let reversedNotes = stateNotes st
- updateState $ \st -> st { stateNotes = reverse reversedNotes }
- setInput docMinusNotes
- setPosition startPos
- -- now parse it for real...
- (title, author, date) <- option ([],[],"") titleBlock
- blocks <- parseBlocks
- return $ Pandoc (Meta title author date) $ filter (/= Null) blocks
-
---
--- initial pass for references and notes
---
-
-referenceKey = try $ do
- startPos <- getPosition
- nonindentSpaces
- label <- reference
- char ':'
- skipSpaces
- optional (char '<')
- src <- many (noneOf "> \n\t")
- optional (char '>')
- tit <- option "" referenceTitle
- blanklines
- endPos <- getPosition
- let newkey = (label, (removeTrailingSpace src, tit))
- st <- getState
- let oldkeys = stateKeys st
- updateState $ \st -> st { stateKeys = newkey : oldkeys }
- -- return blanks so line count isn't affected
- return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
-
-referenceTitle = try $ do
- (many1 spaceChar >> option '\n' newline) <|> newline
- skipSpaces
- tit <- (charsInBalanced '(' ')' >>= return . unwords . words)
- <|> do delim <- char '\'' <|> char '"'
- manyTill anyChar (try (char delim >> skipSpaces >>
- notFollowedBy (noneOf ")\n")))
- return $ decodeCharacterReferences tit
-
-noteMarker = string "[^" >> manyTill (noneOf " \t\n") (char ']')
-
-rawLine = do
- notFollowedBy blankline
- notFollowedBy' noteMarker
- contents <- many1 nonEndline
- end <- option "" (newline >> optional indentSpaces >> return "\n")
- return $ contents ++ end
-
-rawLines = many1 rawLine >>= return . concat
-
-noteBlock = try $ do
- startPos <- getPosition
- ref <- noteMarker
- char ':'
- optional blankline
- optional indentSpaces
- raw <- sepBy rawLines (try (blankline >> indentSpaces))
- optional blanklines
- endPos <- getPosition
- -- parse the extracted text, which may contain various block elements:
- contents <- parseFromString parseBlocks $ (joinWithSep "\n" raw) ++ "\n\n"
- let newnote = (ref, contents)
- st <- getState
- let oldnotes = stateNotes st
- updateState $ \st -> st { stateNotes = newnote : oldnotes }
- -- return blanks so line count isn't affected
- return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
-
---
--- parsing blocks
---
-
-parseBlocks = manyTill block eof
-
-block = choice [ header
- , table
- , codeBlock
- , hrule
- , list
- , blockQuote
- , htmlBlock
- , rawLaTeXEnvironment'
- , para
- , plain
- , nullBlock ] <?> "block"
-
---
--- header blocks
---
-
-header = atxHeader <|> setextHeader <?> "header"
-
-atxHeader = try $ do
- level <- many1 (char '#') >>= return . length
- notFollowedBy (char '.' <|> char ')') -- this would be a list
- skipSpaces
- text <- manyTill inline atxClosing >>= return . normalizeSpaces
- return $ Header level text
-
-atxClosing = try $ skipMany (char '#') >> blanklines
-
-setextHeader = try $ do
- -- first, see if this block has any chance of being a setextHeader:
- lookAhead (anyLine >> oneOf setextHChars)
- text <- many1Till inline newline >>= return . normalizeSpaces
- level <- choice $ zipWith
- (\ch lev -> try (many1 $ char ch) >> blanklines >> return lev)
- setextHChars [1..(length setextHChars)]
- return $ Header level text
-
---
--- hrule block
---
-
-hrule = try $ do
- skipSpaces
- start <- oneOf hruleChars
- count 2 (skipSpaces >> char start)
- skipMany (skipSpaces >> char start)
- newline
- optional blanklines
- return HorizontalRule
-
---
--- code blocks
---
-
-indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n")
-
-codeBlock = do
- contents <- many1 (indentedLine <|>
- try (do b <- blanklines
- l <- indentedLine
- return $ b ++ l))
- optional blanklines
- return $ CodeBlock $ stripTrailingNewlines $ concat contents
-
---
--- block quotes
---
-
-emacsBoxQuote = try $ do
- failIfStrict
- string ",----"
- manyTill anyChar newline
- raw <- manyTill
- (try (char '|' >> optional (char ' ') >> manyTill anyChar newline))
- (try (string "`----"))
- blanklines
- return raw
-
-emailBlockQuoteStart = try $ nonindentSpaces >> char '>' >>~ optional (char ' ')
-
-emailBlockQuote = try $ do
- emailBlockQuoteStart
- raw <- sepBy (many (nonEndline <|>
- (try (endline >> notFollowedBy emailBlockQuoteStart >>
- return '\n'))))
- (try (newline >> emailBlockQuoteStart))
- newline <|> (eof >> return '\n')
- optional blanklines
- return raw
-
-blockQuote = do
- raw <- emailBlockQuote <|> emacsBoxQuote
- -- parse the extracted block, which may contain various block elements:
- contents <- parseFromString parseBlocks $ (joinWithSep "\n" raw) ++ "\n\n"
- return $ BlockQuote contents
-
---
--- list blocks
---
-
-list = choice [ bulletList, orderedList, definitionList ] <?> "list"
-
-bulletListStart = try $ do
- optional newline -- if preceded by a Plain block in a list context
- nonindentSpaces
- notFollowedBy' hrule -- because hrules start out just like lists
- oneOf bulletListMarkers
- spaceChar
- skipSpaces
-
-anyOrderedListStart = try $ do
- optional newline -- if preceded by a Plain block in a list context
- nonindentSpaces
- notFollowedBy $ string "p." >> spaceChar >> digit -- page number
- state <- getState
- if stateStrict state
- then do many1 digit
- char '.'
- spaceChar
- return (1, DefaultStyle, DefaultDelim)
- else anyOrderedListMarker >>~ spaceChar
-
-orderedListStart style delim = try $ do
- optional newline -- if preceded by a Plain block in a list context
- nonindentSpaces
- state <- getState
- num <- if stateStrict state
- then do many1 digit
- char '.'
- return 1
- else orderedListMarker style delim
- if delim == Period && (style == UpperAlpha || (style == UpperRoman &&
- num `elem` [1, 5, 10, 50, 100, 500, 1000]))
- then char '\t' <|> (spaceChar >> spaceChar)
- else spaceChar
- skipSpaces
-
--- parse a line of a list item (start = parser for beginning of list item)
-listLine start = try $ do
- notFollowedBy' start
- notFollowedBy blankline
- notFollowedBy' (do indentSpaces
- many (spaceChar)
- bulletListStart <|> (anyOrderedListStart >> return ()))
- line <- manyTill anyChar newline
- return $ line ++ "\n"
-
--- parse raw text for one list item, excluding start marker and continuations
-rawListItem start = try $ do
- start
- result <- many1 (listLine start)
- blanks <- many blankline
- return $ concat result ++ blanks
-
--- continuation of a list item - indented and separated by blankline
--- or (in compact lists) endline.
--- note: nested lists are parsed as continuations
-listContinuation start = try $ do
- lookAhead indentSpaces
- result <- many1 (listContinuationLine start)
- blanks <- many blankline
- return $ concat result ++ blanks
-
-listContinuationLine start = try $ do
- notFollowedBy blankline
- notFollowedBy' start
- optional indentSpaces
- result <- manyTill anyChar newline
- return $ result ++ "\n"
-
-listItem start = try $ do
- first <- rawListItem start
- continuations <- many (listContinuation start)
- -- parsing with ListItemState forces markers at beginning of lines to
- -- count as list item markers, even if not separated by blank space.
- -- see definition of "endline"
- state <- getState
- let oldContext = stateParserContext state
- setState $ state {stateParserContext = ListItemState}
- -- parse the extracted block, which may contain various block elements:
- let raw = concat (first:continuations)
- contents <- parseFromString parseBlocks raw
- updateState (\st -> st {stateParserContext = oldContext})
- return contents
-
-orderedList = try $ do
- (start, style, delim) <- lookAhead anyOrderedListStart
- items <- many1 (listItem (orderedListStart style delim))
- return $ OrderedList (start, style, delim) $ compactify items
-
-bulletList = many1 (listItem bulletListStart) >>=
- return . BulletList . compactify
-
--- definition lists
-
-definitionListItem = try $ do
- notFollowedBy blankline
- notFollowedBy' indentSpaces
- -- first, see if this has any chance of being a definition list:
- lookAhead (anyLine >> char ':')
- term <- manyTill inline newline
- raw <- many1 defRawBlock
- state <- getState
- let oldContext = stateParserContext state
- -- parse the extracted block, which may contain various block elements:
- contents <- parseFromString parseBlocks $ concat raw
- updateState (\st -> st {stateParserContext = oldContext})
- return ((normalizeSpaces term), contents)
-
-defRawBlock = try $ do
- char ':'
- state <- getState
- let tabStop = stateTabStop state
- try (count (tabStop - 1) (char ' ')) <|> (many (char ' ') >> string "\t")
- firstline <- anyLine
- rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine)
- trailing <- option "" blanklines
- return $ firstline ++ "\n" ++ unlines rawlines ++ trailing
-
-definitionList = do
- failIfStrict
- items <- many1 definitionListItem
- let (terms, defs) = unzip items
- let defs' = compactify defs
- let items' = zip terms defs'
- return $ DefinitionList items'
-
---
--- paragraph block
---
-
-para = try $ do
- result <- many1 inline
- newline
- blanklines <|> do st <- getState
- if stateStrict st
- then lookAhead (blockQuote <|> header) >> return ""
- else lookAhead emacsBoxQuote >> return ""
- return $ Para $ normalizeSpaces result
-
-plain = many1 inline >>= return . Plain . normalizeSpaces
-
---
--- raw html
---
-
-htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element"
-
-htmlBlock = do
- st <- getState
- if stateStrict st
- then try $ do failUnlessBeginningOfLine
- first <- htmlElement
- finalSpace <- many (oneOf spaceChars)
- finalNewlines <- many newline
- return $ RawHtml $ first ++ finalSpace ++ finalNewlines
- else rawHtmlBlocks
-
--- True if tag is self-closing
-isSelfClosing tag =
- isSuffixOf "/>" $ filter (not . (`elem` " \n\t")) tag
-
-strictHtmlBlock = try $ do
- tag <- anyHtmlBlockTag
- let tag' = extractTagType tag
- if isSelfClosing tag || tag' == "hr"
- then return tag
- else do contents <- many (notFollowedBy' (htmlEndTag tag') >>
- (htmlElement <|> (count 1 anyChar)))
- end <- htmlEndTag tag'
- return $ tag ++ concat contents ++ end
-
-rawHtmlBlocks = do
- htmlBlocks <- many1 rawHtmlBlock
- let combined = concatMap (\(RawHtml str) -> str) htmlBlocks
- let combined' = if not (null combined) && last combined == '\n'
- then init combined -- strip extra newline
- else combined
- return $ RawHtml combined'
-
---
--- LaTeX
---
-
-rawLaTeXEnvironment' = failIfStrict >> rawLaTeXEnvironment
-
---
--- Tables
---
-
--- Parse a dashed line with optional trailing spaces; return its length
--- and the length including trailing space.
-dashedLine ch = do
- dashes <- many1 (char ch)
- sp <- many spaceChar
- return $ (length dashes, length $ dashes ++ sp)
-
--- Parse a table header with dashed lines of '-' preceded by
--- one line of text.
-simpleTableHeader = try $ do
- rawContent <- anyLine
- initSp <- nonindentSpaces
- dashes <- many1 (dashedLine '-')
- newline
- let (lengths, lines) = unzip dashes
- let indices = scanl (+) (length initSp) lines
- let rawHeads = tail $ splitByIndices (init indices) rawContent
- let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths
- return (rawHeads, aligns, indices)
-
--- Parse a table footer - dashed lines followed by blank line.
-tableFooter = try $ nonindentSpaces >> many1 (dashedLine '-') >> blanklines
-
--- Parse a table separator - dashed line.
-tableSep = try $ nonindentSpaces >> many1 (dashedLine '-') >> string "\n"
-
--- Parse a raw line and split it into chunks by indices.
-rawTableLine indices = do
- notFollowedBy' (blanklines <|> tableFooter)
- line <- many1Till anyChar newline
- return $ map removeLeadingTrailingSpace $ tail $
- splitByIndices (init indices) line
-
--- Parse a table line and return a list of lists of blocks (columns).
-tableLine indices = rawTableLine indices >>= mapM (parseFromString (many plain))
-
--- Parse a multiline table row and return a list of blocks (columns).
-multilineRow indices = do
- colLines <- many1 (rawTableLine indices)
- optional blanklines
- let cols = map unlines $ transpose colLines
- mapM (parseFromString (many plain)) cols
-
--- Calculate relative widths of table columns, based on indices
-widthsFromIndices :: Int -- Number of columns on terminal
- -> [Int] -- Indices
- -> [Float] -- Fractional relative sizes of columns
-widthsFromIndices _ [] = []
-widthsFromIndices numColumns indices =
- let lengths = zipWith (-) indices (0:indices)
- totLength = sum lengths
- quotient = if totLength > numColumns
- then fromIntegral totLength
- else fromIntegral numColumns
- fracs = map (\l -> (fromIntegral l) / quotient) lengths in
- tail fracs
-
--- Parses a table caption: inlines beginning with 'Table:'
--- and followed by blank lines.
-tableCaption = try $ do
- nonindentSpaces
- string "Table:"
- result <- many1 inline
- blanklines
- return $ normalizeSpaces result
-
--- Parse a table using 'headerParser', 'lineParser', and 'footerParser'.
-tableWith headerParser lineParser footerParser = try $ do
- (rawHeads, aligns, indices) <- headerParser
- lines <- many1Till (lineParser indices) footerParser
- caption <- option [] tableCaption
- heads <- mapM (parseFromString (many plain)) rawHeads
- state <- getState
- let numColumns = stateColumns state
- let widths = widthsFromIndices numColumns indices
- return $ Table caption aligns widths heads lines
-
--- Parse a simple table with '---' header and one line per row.
-simpleTable = tableWith simpleTableHeader tableLine blanklines
-
--- Parse a multiline table: starts with row of '-' on top, then header
--- (which may be multiline), then the rows,
--- which may be multiline, separated by blank lines, and
--- ending with a footer (dashed line followed by blank line).
-multilineTable = tableWith multilineTableHeader multilineRow tableFooter
-
-multilineTableHeader = try $ do
- tableSep
- rawContent <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline)
- initSp <- nonindentSpaces
- dashes <- many1 (dashedLine '-')
- newline
- let (lengths, lines) = unzip dashes
- let indices = scanl (+) (length initSp) lines
- let rawHeadsList = transpose $ map
- (\ln -> tail $ splitByIndices (init indices) ln)
- rawContent
- let rawHeads = map (joinWithSep " ") rawHeadsList
- let aligns = zipWith alignType rawHeadsList lengths
- return ((map removeLeadingTrailingSpace rawHeads), aligns, indices)
-
--- Returns an alignment type for a table, based on a list of strings
--- (the rows of the column header) and a number (the length of the
--- dashed line under the rows.
-alignType :: [String] -> Int -> Alignment
-alignType [] len = AlignDefault
-alignType strLst len =
- let str = head $ sortBy (comparing length) $
- map removeTrailingSpace strLst
- leftSpace = if null str then False else (str !! 0) `elem` " \t"
- rightSpace = length str < len || (str !! (len - 1)) `elem` " \t"
- in case (leftSpace, rightSpace) of
- (True, False) -> AlignRight
- (False, True) -> AlignLeft
- (True, True) -> AlignCenter
- (False, False) -> AlignDefault
-
-table = failIfStrict >> (simpleTable <|> multilineTable) <?> "table"
-
---
--- inline
---
-
-inline = choice [ str
- , smartPunctuation
- , whitespace
- , endline
- , code
- , charRef
- , strong
- , emph
- , note
- , inlineNote
- , link
- , image
- , math
- , strikeout
- , superscript
- , subscript
- , autoLink
- , rawHtmlInline'
- , rawLaTeXInline'
- , escapedChar
- , symbol
- , ltSign ] <?> "inline"
-
-escapedChar = do
- char '\\'
- state <- getState
- result <- option '\\' $ if stateStrict state
- then oneOf "\\`*_{}[]()>#+-.!~"
- else satisfy (not . isAlphaNum)
- return $ Str [result]
-
-ltSign = do
- st <- getState
- if stateStrict st
- then char '<'
- else notFollowedBy' rawHtmlBlocks >> char '<' -- unless it starts html
- return $ Str ['<']
-
-specialCharsMinusLt = filter (/= '<') specialChars
-
-symbol = do
- result <- oneOf specialCharsMinusLt
- return $ Str [result]
-
--- parses inline code, between n `s and n `s
-code = try $ do
- starts <- many1 (char '`')
- skipSpaces
- result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|>
- (char '\n' >> return " "))
- (try (skipSpaces >> count (length starts) (char '`') >>
- notFollowedBy (char '`')))
- return $ Code $ removeLeadingTrailingSpace $ concat result
-
-mathWord = many1 ((noneOf " \t\n\\$") <|>
- (try (char '\\') >>~ notFollowedBy (char '$')))
-
-math = try $ do
- failIfStrict
- char '$'
- notFollowedBy space
- words <- sepBy1 mathWord (many1 space)
- char '$'
- return $ TeX ("$" ++ (joinWithSep " " words) ++ "$")
-
-emph = ((enclosed (char '*') (char '*') inline) <|>
- (enclosed (char '_') (char '_' >> notFollowedBy alphaNum) inline)) >>=
- return . Emph . normalizeSpaces
-
-strong = ((enclosed (string "**") (try $ string "**") inline) <|>
- (enclosed (string "__") (try $ string "__") inline)) >>=
- return . Strong . normalizeSpaces
-
-strikeout = failIfStrict >> enclosed (string "~~") (try $ string "~~") inline >>=
- return . Strikeout . normalizeSpaces
-
-superscript = failIfStrict >> enclosed (char '^') (char '^')
- (notFollowedBy' whitespace >> inline) >>= -- may not contain Space
- return . Superscript
-
-subscript = failIfStrict >> enclosed (char '~') (char '~')
- (notFollowedBy' whitespace >> inline) >>= -- may not contain Space
- return . Subscript
-
-smartPunctuation = failUnlessSmart >>
- choice [ quoted, apostrophe, dash, ellipses ]
-
-apostrophe = (char '\'' <|> char '\8217') >> return Apostrophe
-
-quoted = doubleQuoted <|> singleQuoted
-
-withQuoteContext context parser = do
- oldState <- getState
- let oldQuoteContext = stateQuoteContext oldState
- setState oldState { stateQuoteContext = context }
- result <- parser
- newState <- getState
- setState newState { stateQuoteContext = oldQuoteContext }
- return result
-
-singleQuoted = try $ do
- singleQuoteStart
- withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>=
- return . Quoted SingleQuote . normalizeSpaces
-
-doubleQuoted = try $ do
- doubleQuoteStart
- withQuoteContext InDoubleQuote $ many1Till inline doubleQuoteEnd >>=
- return . Quoted DoubleQuote . normalizeSpaces
-
-failIfInQuoteContext context = do
- st <- getState
- if stateQuoteContext st == context
- then fail "already inside quotes"
- else return ()
-
-singleQuoteStart = do
- failIfInQuoteContext InSingleQuote
- char '\8216' <|>
- do char '\''
- notFollowedBy (oneOf ")!],.;:-? \t\n")
- notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >>
- satisfy (not . isAlphaNum))) -- possess/contraction
- return '\''
-
-singleQuoteEnd = (char '\'' <|> char '\8217') >> notFollowedBy alphaNum
-
-doubleQuoteStart = failIfInQuoteContext InDoubleQuote >>
- (char '"' <|> char '\8220') >>
- notFollowedBy (oneOf " \t\n")
-
-doubleQuoteEnd = char '"' <|> char '\8221'
-
-ellipses = oneOfStrings ["...", " . . . ", ". . .", " . . ."] >> return Ellipses
-
-dash = enDash <|> emDash
-
-enDash = try $ char '-' >> notFollowedBy (noneOf "0123456789") >> return EnDash
-
-emDash = try $ skipSpaces >> oneOfStrings ["---", "--"] >>
- skipSpaces >> return EmDash
-
-whitespace = do
- sps <- many1 (oneOf spaceChars)
- if length sps >= 2
- then option Space (endline >> return LineBreak)
- else return Space <?> "whitespace"
-
-nonEndline = satisfy (/='\n')
-
-strChar = noneOf (specialChars ++ spaceChars ++ "\n")
-
-str = many1 strChar >>= return . Str
-
--- an endline character that can be treated as a space, not a structural break
-endline = try $ do
- newline
- notFollowedBy blankline
- st <- getState
- if stateStrict st
- then do notFollowedBy emailBlockQuoteStart
- notFollowedBy (char '#') -- atx header
- else return ()
- -- parse potential list-starts differently if in a list:
- if stateParserContext st == ListItemState
- then notFollowedBy' (bulletListStart <|>
- (anyOrderedListStart >> return ()))
- else return ()
- return Space
-
---
--- links
---
-
--- a reference label for a link
-reference = notFollowedBy' (string "[^") >> -- footnote reference
- inlinesInBalanced "[" "]" >>= (return . normalizeSpaces)
-
--- source for a link, with optional title
-source = try $ do
- char '('
- optional (char '<')
- src <- many (noneOf ")> \t\n")
- optional (char '>')
- tit <- option "" linkTitle
- skipSpaces
- char ')'
- return (removeTrailingSpace src, tit)
-
-linkTitle = try $ do
- (many1 spaceChar >> option '\n' newline) <|> newline
- skipSpaces
- delim <- char '\'' <|> char '"'
- tit <- manyTill anyChar (try (char delim >> skipSpaces >>
- notFollowedBy (noneOf ")\n")))
- return $ decodeCharacterReferences tit
-
-link = try $ do
- label <- reference
- src <- source <|> referenceLink label
- return $ Link label src
-
--- a link like [this][ref] or [this][] or [this]
-referenceLink label = do
- ref <- option [] (try (optional (char ' ') >>
- optional (newline >> skipSpaces) >> reference))
- let ref' = if null ref then label else ref
- state <- getState
- case lookupKeySrc (stateKeys state) ref' of
- Nothing -> fail "no corresponding key"
- Just target -> return target
-
-emailAddress = try $ do
- name <- many1 (alphaNum <|> char '+')
- char '@'
- first <- many1 alphaNum
- rest <- many1 (char '.' >> many1 alphaNum)
- return $ "mailto:" ++ name ++ "@" ++ joinWithSep "." (first:rest)
-
-uri = try $ do
- str <- many1 (noneOf "\n\t >")
- if isURI str
- then return str
- else fail "not a URI"
-
-autoLink = try $ do
- char '<'
- src <- uri <|> emailAddress
- char '>'
- let src' = if "mailto:" `isPrefixOf` src
- then drop 7 src
- else src
- st <- getState
- return $ if stateStrict st
- then Link [Str src'] (src, "")
- else Link [Code src'] (src, "")
-
-image = try $ do
- char '!'
- (Link label src) <- link
- return $ Image label src
-
-note = try $ do
- failIfStrict
- ref <- noteMarker
- state <- getState
- let notes = stateNotes state
- case lookup ref notes of
- Nothing -> fail "note not found"
- Just contents -> return $ Note contents
-
-inlineNote = try $ do
- failIfStrict
- char '^'
- contents <- inlinesInBalanced "[" "]"
- return $ Note [Para contents]
-
-rawLaTeXInline' = failIfStrict >> rawLaTeXInline
-
-rawHtmlInline' = do
- st <- getState
- result <- choice $ if stateStrict st
- then [htmlBlockElement, anyHtmlTag, anyHtmlEndTag]
- else [htmlBlockElement, anyHtmlInlineTag]
- return $ HtmlInline result
-
diff --git a/Text/Pandoc/Readers/RST.hs b/Text/Pandoc/Readers/RST.hs
deleted file mode 100644
index 1239eb688..000000000
--- a/Text/Pandoc/Readers/RST.hs
+++ /dev/null
@@ -1,640 +0,0 @@
-{-
-Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
--}
-
-{- |
- Module : Text.Pandoc.Readers.RST
- Copyright : Copyright (C) 2006-7 John MacFarlane
- License : GNU GPL, version 2 or above
-
- Maintainer : John MacFarlane <jgm@berkeley.edu>
- Stability : alpha
- Portability : portable
-
-Conversion from reStructuredText to 'Pandoc' document.
--}
-module Text.Pandoc.Readers.RST (
- readRST
- ) where
-import Text.Pandoc.Definition
-import Text.Pandoc.Shared
-import Text.ParserCombinators.Parsec
-import Data.List ( findIndex, delete )
-
--- | Parse reStructuredText string and return Pandoc document.
-readRST :: ParserState -> String -> Pandoc
-readRST state str = (readWith parseRST) state (str ++ "\n\n")
-
---
--- Constants and data structure definitions
----
-
-bulletListMarkers = "*+-"
-underlineChars = "!\"#$&'()*+,-./:;<=>?@[\\]^_`{|}~"
-
--- treat these as potentially non-text when parsing inline:
-specialChars = "\\`|*_<>$:[-"
-
---
--- parsing documents
---
-
-isAnonKey (ref, src) = ref == [Str "_"]
-
-isHeader :: Int -> Block -> Bool
-isHeader n (Header x _) = x == n
-isHeader _ _ = False
-
--- | Promote all headers in a list of blocks. (Part of
--- title transformation for RST.)
-promoteHeaders :: Int -> [Block] -> [Block]
-promoteHeaders num ((Header level text):rest) =
- (Header (level - num) text):(promoteHeaders num rest)
-promoteHeaders num (other:rest) = other:(promoteHeaders num rest)
-promoteHeaders num [] = []
-
--- | If list of blocks starts with a header (or a header and subheader)
--- of level that are not found elsewhere, return it as a title and
--- promote all the other headers.
-titleTransform :: [Block] -- ^ list of blocks
- -> ([Block], [Inline]) -- ^ modified list of blocks, title
-titleTransform ((Header 1 head1):(Header 2 head2):rest) = -- title subtitle
- if (any (isHeader 1) rest) || (any (isHeader 2) rest)
- then ((Header 1 head1):(Header 2 head2):rest, [])
- else ((promoteHeaders 2 rest), head1 ++ [Str ":", Space] ++ head2)
-titleTransform ((Header 1 head1):rest) = -- title, no subtitle
- if (any (isHeader 1) rest)
- then ((Header 1 head1):rest, [])
- else ((promoteHeaders 1 rest), head1)
-titleTransform blocks = (blocks, [])
-
-parseRST = do
- startPos <- getPosition
- -- go through once just to get list of reference keys
- -- docMinusKeys is the raw document with blanks where the keys were...
- docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= return . concat
- setInput docMinusKeys
- setPosition startPos
- st <- getState
- let reversedKeys = stateKeys st
- updateState $ \st -> st { stateKeys = reverse reversedKeys }
- -- now parse it for real...
- blocks <- parseBlocks
- let blocks' = filter (/= Null) blocks
- state <- getState
- let (blocks'', title) = if stateStandalone state
- then titleTransform blocks'
- else (blocks', [])
- let authors = stateAuthors state
- let date = stateDate state
- let title' = if (null title) then (stateTitle state) else title
- return $ Pandoc (Meta title' authors date) blocks''
-
---
--- parsing blocks
---
-
-parseBlocks = manyTill block eof
-
-block = choice [ codeBlock
- , rawHtmlBlock
- , rawLaTeXBlock
- , fieldList
- , blockQuote
- , imageBlock
- , unknownDirective
- , header
- , hrule
- , list
- , lineBlock
- , para
- , plain
- , nullBlock ] <?> "block"
-
---
--- field list
---
-
-fieldListItem indent = try $ do
- string indent
- char ':'
- name <- many1 alphaNum
- string ": "
- skipSpaces
- first <- manyTill anyChar newline
- rest <- option "" $ try $ lookAhead (string indent >> oneOf " \t") >>
- indentedBlock
- return (name, joinWithSep " " (first:(lines rest)))
-
-fieldList = try $ do
- indent <- lookAhead $ many (oneOf " \t")
- items <- many1 $ fieldListItem indent
- blanklines
- let authors = case lookup "Authors" items of
- Just auth -> [auth]
- Nothing -> map snd (filter (\(x,y) -> x == "Author") items)
- if null authors
- then return ()
- else updateState $ \st -> st {stateAuthors = authors}
- case (lookup "Date" items) of
- Just dat -> updateState $ \st -> st {stateDate = dat}
- Nothing -> return ()
- case (lookup "Title" items) of
- Just tit -> parseFromString (many inline) tit >>=
- \t -> updateState $ \st -> st {stateTitle = t}
- Nothing -> return ()
- let remaining = filter (\(x,y) -> (x /= "Authors") && (x /= "Author") &&
- (x /= "Date") && (x /= "Title")) items
- if null remaining
- then return Null
- else do terms <- mapM (return . (:[]) . Str . fst) remaining
- defs <- mapM (parseFromString (many block) . snd)
- remaining
- return $ DefinitionList $ zip terms defs
-
---
--- line block
---
-
-lineBlockLine = try $ do
- string "| "
- white <- many (oneOf " \t")
- line <- manyTill inline newline
- return $ (if null white then [] else [Str white]) ++ line ++ [LineBreak]
-
-lineBlock = try $ do
- lines <- many1 lineBlockLine
- blanklines
- return $ Para (concat lines)
-
---
--- paragraph block
---
-
-para = paraBeforeCodeBlock <|> paraNormal <?> "paragraph"
-
-codeBlockStart = string "::" >> blankline >> blankline
-
--- paragraph that ends in a :: starting a code block
-paraBeforeCodeBlock = try $ do
- result <- many1 (notFollowedBy' codeBlockStart >> inline)
- lookAhead (string "::")
- return $ Para $ if last result == Space
- then normalizeSpaces result
- else (normalizeSpaces result) ++ [Str ":"]
-
--- regular paragraph
-paraNormal = try $ do
- result <- many1 inline
- newline
- blanklines
- return $ Para $ normalizeSpaces result
-
-plain = many1 inline >>= return . Plain . normalizeSpaces
-
---
--- image block
---
-
-imageBlock = try $ do
- string ".. image:: "
- src <- manyTill anyChar newline
- fields <- option [] $ do indent <- lookAhead $ many (oneOf " /t")
- many1 $ fieldListItem indent
- optional blanklines
- case lookup "alt" fields of
- Just alt -> return $ Plain [Image [Str alt] (src, alt)]
- Nothing -> return $ Plain [Image [Str "image"] (src, "")]
---
--- header blocks
---
-
-header = doubleHeader <|> singleHeader <?> "header"
-
--- a header with lines on top and bottom
-doubleHeader = try $ do
- c <- oneOf underlineChars
- rest <- many (char c) -- the top line
- let lenTop = length (c:rest)
- skipSpaces
- newline
- txt <- many1 (notFollowedBy blankline >> inline)
- pos <- getPosition
- let len = (sourceColumn pos) - 1
- if (len > lenTop) then fail "title longer than border" else return ()
- blankline -- spaces and newline
- count lenTop (char c) -- the bottom line
- blanklines
- -- check to see if we've had this kind of header before.
- -- if so, get appropriate level. if not, add to list.
- state <- getState
- let headerTable = stateHeaderTable state
- let (headerTable',level) = case findIndex (== DoubleHeader c) headerTable of
- Just ind -> (headerTable, ind + 1)
- Nothing -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1)
- setState (state { stateHeaderTable = headerTable' })
- return $ Header level (normalizeSpaces txt)
-
--- a header with line on the bottom only
-singleHeader = try $ do
- notFollowedBy' whitespace
- txt <- many1 (do {notFollowedBy blankline; inline})
- pos <- getPosition
- let len = (sourceColumn pos) - 1
- blankline
- c <- oneOf underlineChars
- rest <- count (len - 1) (char c)
- many (char c)
- blanklines
- state <- getState
- let headerTable = stateHeaderTable state
- let (headerTable',level) = case findIndex (== SingleHeader c) headerTable of
- Just ind -> (headerTable, ind + 1)
- Nothing -> (headerTable ++ [SingleHeader c], (length headerTable) + 1)
- setState (state { stateHeaderTable = headerTable' })
- return $ Header level (normalizeSpaces txt)
-
---
--- hrule block
---
-
-hrule = try $ do
- chr <- oneOf underlineChars
- count 3 (char chr)
- skipMany (char chr)
- blankline
- blanklines
- return HorizontalRule
-
---
--- code blocks
---
-
--- read a line indented by a given string
-indentedLine indents = try $ do
- string indents
- result <- manyTill anyChar newline
- return $ result ++ "\n"
-
--- two or more indented lines, possibly separated by blank lines.
--- any amount of indentation will work.
-indentedBlock = do
- indents <- lookAhead $ many1 (oneOf " \t")
- lns <- many $ choice $ [ indentedLine indents,
- try $ do b <- blanklines
- l <- indentedLine indents
- return (b ++ l) ]
- optional blanklines
- return $ concat lns
-
-codeBlock = try $ do
- codeBlockStart
- result <- indentedBlock
- return $ CodeBlock $ stripTrailingNewlines result
-
---
--- raw html
---
-
-rawHtmlBlock = try $ string ".. raw:: html" >> blanklines >>
- indentedBlock >>= return . RawHtml
-
---
--- raw latex
---
-
-rawLaTeXBlock = try $ do
- string ".. raw:: latex"
- blanklines
- result <- indentedBlock
- return $ Para [(TeX result)]
-
---
--- block quotes
---
-
-blockQuote = do
- raw <- indentedBlock
- -- parse the extracted block, which may contain various block elements:
- contents <- parseFromString parseBlocks $ raw ++ "\n\n"
- return $ BlockQuote contents
-
---
--- list blocks
---
-
-list = choice [ bulletList, orderedList, definitionList ] <?> "list"
-
-definitionListItem = try $ do
- term <- many1Till inline endline
- raw <- indentedBlock
- -- parse the extracted block, which may contain various block elements:
- contents <- parseFromString parseBlocks $ raw ++ "\n\n"
- return (normalizeSpaces term, contents)
-
-definitionList = many1 definitionListItem >>= return . DefinitionList
-
--- parses bullet list start and returns its length (inc. following whitespace)
-bulletListStart = try $ do
- notFollowedBy' hrule -- because hrules start out just like lists
- marker <- oneOf bulletListMarkers
- white <- many1 spaceChar
- return $ length (marker:white)
-
--- parses ordered list start and returns its length (inc following whitespace)
-orderedListStart style delim = try $ do
- (_, markerLen) <- withHorizDisplacement (orderedListMarker style delim)
- white <- many1 spaceChar
- return $ markerLen + length white
-
--- parse a line of a list item
-listLine markerLength = try $ do
- notFollowedBy blankline
- indentWith markerLength
- line <- manyTill anyChar newline
- return $ line ++ "\n"
-
--- indent by specified number of spaces (or equiv. tabs)
-indentWith num = do
- state <- getState
- let tabStop = stateTabStop state
- if (num < tabStop)
- then count num (char ' ')
- else choice [ try (count num (char ' ')),
- (try (char '\t' >> count (num - tabStop) (char ' '))) ]
-
--- parse raw text for one list item, excluding start marker and continuations
-rawListItem start = do
- markerLength <- start
- firstLine <- manyTill anyChar newline
- restLines <- many (listLine markerLength)
- return (markerLength, (firstLine ++ "\n" ++ (concat restLines)))
-
--- continuation of a list item - indented and separated by blankline or
--- (in compact lists) endline.
--- Note: nested lists are parsed as continuations.
-listContinuation markerLength = try $ do
- blanks <- many1 blankline
- result <- many1 (listLine markerLength)
- return $ blanks ++ concat result
-
-listItem start = try $ do
- (markerLength, first) <- rawListItem start
- rest <- many (listContinuation markerLength)
- blanks <- choice [ try (many blankline >>~ lookAhead start),
- many1 blankline ] -- whole list must end with blank.
- -- parsing with ListItemState forces markers at beginning of lines to
- -- count as list item markers, even if not separated by blank space.
- -- see definition of "endline"
- state <- getState
- let oldContext = stateParserContext state
- setState $ state {stateParserContext = ListItemState}
- -- parse the extracted block, which may itself contain block elements
- parsed <- parseFromString parseBlocks $ concat (first:rest) ++ blanks
- updateState (\st -> st {stateParserContext = oldContext})
- return parsed
-
-orderedList = do
- (start, style, delim) <- lookAhead (anyOrderedListMarker >>~ spaceChar)
- items <- many1 (listItem (orderedListStart style delim))
- let items' = compactify items
- return $ OrderedList (start, style, delim) items'
-
-bulletList = many1 (listItem bulletListStart) >>=
- return . BulletList . compactify
-
---
--- unknown directive (e.g. comment)
---
-
-unknownDirective = try $ do
- string ".. "
- manyTill anyChar newline
- many (string " :" >> many1 (noneOf "\n:") >> char ':' >>
- many1 (noneOf "\n") >> newline)
- optional blanklines
- return Null
-
---
--- reference key
---
-
-referenceKey = do
- startPos <- getPosition
- key <- choice [imageKey, anonymousKey, regularKeyQuoted, regularKey]
- st <- getState
- let oldkeys = stateKeys st
- updateState $ \st -> st { stateKeys = key : oldkeys }
- optional blanklines
- endPos <- getPosition
- -- return enough blanks to replace key
- return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
-
-targetURI = do
- skipSpaces
- optional newline
- contents <- many1 (try (many spaceChar >> newline >>
- many1 spaceChar >> noneOf " \t\n") <|> noneOf "\n")
- blanklines
- return contents
-
-imageKey = try $ do
- string ".. |"
- ref <- manyTill inline (char '|')
- skipSpaces
- string "image::"
- src <- targetURI
- return (normalizeSpaces ref, (removeLeadingTrailingSpace src, ""))
-
-anonymousKey = try $ do
- oneOfStrings [".. __:", "__"]
- src <- targetURI
- state <- getState
- return ([Str "_"], (removeLeadingTrailingSpace src, ""))
-
-regularKeyQuoted = try $ do
- string ".. _`"
- ref <- manyTill inline (char '`')
- char ':'
- src <- targetURI
- return (normalizeSpaces ref, (removeLeadingTrailingSpace src, ""))
-
-regularKey = try $ do
- string ".. _"
- ref <- manyTill inline (char ':')
- src <- targetURI
- return (normalizeSpaces ref, (removeLeadingTrailingSpace src, ""))
-
- --
- -- inline
- --
-
-inline = choice [ link
- , str
- , whitespace
- , endline
- , strong
- , emph
- , code
- , image
- , hyphens
- , superscript
- , subscript
- , escapedChar
- , symbol ] <?> "inline"
-
-hyphens = do
- result <- many1 (char '-')
- option Space endline
- -- don't want to treat endline after hyphen or dash as a space
- return $ Str result
-
-escapedChar = escaped anyChar
-
-symbol = do
- result <- oneOf specialChars
- return $ Str [result]
-
--- parses inline code, between codeStart and codeEnd
-code = try $ do
- string "``"
- result <- manyTill anyChar (try (string "``"))
- return $ Code $ removeLeadingTrailingSpace $ joinWithSep " " $ lines result
-
-emph = enclosed (char '*') (char '*') inline >>=
- return . Emph . normalizeSpaces
-
-strong = enclosed (string "**") (try $ string "**") inline >>=
- return . Strong . normalizeSpaces
-
-interpreted role = try $ do
- optional $ try $ string "\\ "
- result <- enclosed (string $ ":" ++ role ++ ":`") (char '`') anyChar
- nextChar <- lookAhead anyChar
- try (string "\\ ") <|> lookAhead (count 1 $ oneOf " \t\n") <|> (eof >> return "")
- return [Str result]
-
-superscript = interpreted "sup" >>= (return . Superscript)
-
-subscript = interpreted "sub" >>= (return . Subscript)
-
-whitespace = many1 spaceChar >> return Space <?> "whitespace"
-
-str = notFollowedBy' oneWordReference >>
- many1 (noneOf (specialChars ++ "\t\n ")) >>= return . Str
-
--- an endline character that can be treated as a space, not a structural break
-endline = try $ do
- newline
- notFollowedBy blankline
- -- parse potential list-starts at beginning of line differently in a list:
- st <- getState
- if (stateParserContext st) == ListItemState
- then notFollowedBy (anyOrderedListMarker >> spaceChar) >>
- notFollowedBy' bulletListStart
- else return ()
- return Space
-
---
--- links
---
-
-link = choice [explicitLink, referenceLink, autoLink] <?> "link"
-
-explicitLink = try $ do
- char '`'
- notFollowedBy (char '`') -- `` is marks start of inline code
- label <- manyTill inline (try (do {spaces; char '<'}))
- src <- manyTill (noneOf ">\n ") (char '>')
- skipSpaces
- string "`_"
- return $ Link (normalizeSpaces label) (removeLeadingTrailingSpace src, "")
-
-reference = try $ do
- char '`'
- notFollowedBy (char '`')
- label <- many1Till inline (char '`')
- char '_'
- return label
-
-oneWordReference = do
- raw <- many1 alphaNum
- char '_'
- notFollowedBy alphaNum -- because this_is_not a link
- return [Str raw]
-
-referenceLink = try $ do
- label <- reference <|> oneWordReference
- key <- option label (do{char '_'; return [Str "_"]}) -- anonymous link
- state <- getState
- let keyTable = stateKeys state
- src <- case lookupKeySrc keyTable key of
- Nothing -> fail "no corresponding key"
- Just target -> return target
- -- if anonymous link, remove first anon key so it won't be used again
- let keyTable' = if (key == [Str "_"]) -- anonymous link?
- then delete ([Str "_"], src) keyTable -- remove first anon key
- else keyTable
- setState $ state { stateKeys = keyTable' }
- return $ Link (normalizeSpaces label) src
-
-uriScheme = oneOfStrings [ "http://", "https://", "ftp://", "file://",
- "mailto:", "news:", "telnet:" ]
-
-uri = try $ do
- scheme <- uriScheme
- identifier <- many1 (noneOf " \t\n")
- return $ scheme ++ identifier
-
-autoURI = do
- src <- uri
- return $ Link [Str src] (src, "")
-
-emailChar = alphaNum <|> oneOf "-+_."
-
-emailAddress = try $ do
- firstLetter <- alphaNum
- restAddr <- many emailChar
- let addr = firstLetter:restAddr
- char '@'
- dom <- domain
- return $ addr ++ '@':dom
-
-domainChar = alphaNum <|> char '-'
-
-domain = do
- first <- many1 domainChar
- dom <- many1 (try (do{ char '.'; many1 domainChar }))
- return $ joinWithSep "." (first:dom)
-
-autoEmail = do
- src <- emailAddress
- return $ Link [Str src] ("mailto:" ++ src, "")
-
-autoLink = autoURI <|> autoEmail
-
--- For now, we assume that all substitution references are for images.
-image = try $ do
- char '|'
- ref <- manyTill inline (char '|')
- state <- getState
- let keyTable = stateKeys state
- src <- case lookupKeySrc keyTable ref of
- Nothing -> fail "no corresponding key"
- Just target -> return target
- return $ Image (normalizeSpaces ref) src
-