aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/Org.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/Org.hs')
-rw-r--r--src/Text/Pandoc/Readers/Org.hs257
1 files changed, 202 insertions, 55 deletions
diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs
index 5c00a1b27..980f63504 100644
--- a/src/Text/Pandoc/Readers/Org.hs
+++ b/src/Text/Pandoc/Readers/Org.hs
@@ -1,7 +1,8 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-# LANGUAGE MultiParamTypeClasses, FlexibleContexts, FlexibleInstances #-}
{-
-Copyright (C) 2014 Albert Krewinkel <tarleb@moltkeplatz.de>
+Copyright (C) 2014-2015 Albert Krewinkel <tarleb@moltkeplatz.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -42,12 +43,13 @@ import Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF
import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline)
import Text.Pandoc.Shared (compactify', compactify'DL)
import Text.TeXMath (readTeX, writePandoc, DisplayType(..))
+import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap
import Control.Applicative ( Applicative, pure
, (<$>), (<$), (<*>), (<*), (*>) )
import Control.Arrow (first)
import Control.Monad (foldM, guard, liftM, liftM2, mplus, mzero, when)
-import Control.Monad.Reader (Reader, runReader, ask, asks)
+import Control.Monad.Reader (Reader, runReader, ask, asks, local)
import Data.Char (isAlphaNum, toLower)
import Data.Default
import Data.List (intersperse, isPrefixOf, isSuffixOf)
@@ -56,20 +58,57 @@ import Data.Maybe (fromMaybe, isJust)
import Data.Monoid (Monoid, mconcat, mempty, mappend)
import Network.HTTP (urlEncode)
+import Text.Pandoc.Error
+
-- | Parse org-mode string and return a Pandoc document.
readOrg :: ReaderOptions -- ^ Reader options
-> String -- ^ String to parse (assuming @'\n'@ line endings)
- -> Pandoc
-readOrg opts s = readWith parseOrg def{ orgStateOptions = opts } (s ++ "\n\n")
+ -> Either PandocError Pandoc
+readOrg opts s = flip runReader def $ readWithM parseOrg def{ orgStateOptions = opts } (s ++ "\n\n")
+
+data OrgParserLocal = OrgParserLocal { orgLocalQuoteContext :: QuoteContext }
-type OrgParser = Parser [Char] OrgParserState
+type OrgParser = ParserT [Char] OrgParserState (Reader OrgParserLocal)
parseOrg :: OrgParser Pandoc
parseOrg = do
blocks' <- parseBlocks
st <- getState
let meta = runF (orgStateMeta' st) st
- return $ Pandoc meta $ filter (/= Null) (B.toList $ runF blocks' st)
+ let removeUnwantedBlocks = dropCommentTrees . filter (/= Null)
+ return $ Pandoc meta $ removeUnwantedBlocks (B.toList $ runF blocks' st)
+
+-- | Drop COMMENT headers and the document tree below those headers.
+dropCommentTrees :: [Block] -> [Block]
+dropCommentTrees [] = []
+dropCommentTrees blks@(b:bs) =
+ maybe blks (flip dropUntilHeaderAboveLevel bs) $ commentHeaderLevel b
+
+-- | Return the level of a header starting a comment or :noexport: tree and
+-- Nothing otherwise.
+commentHeaderLevel :: Block -> Maybe Int
+commentHeaderLevel blk =
+ case blk of
+ (Header level _ ((Str "COMMENT"):_)) -> Just level
+ (Header level _ title) | hasNoExportTag title -> Just level
+ _ -> Nothing
+ where
+ hasNoExportTag :: [Inline] -> Bool
+ hasNoExportTag = any isNoExportTag
+
+ isNoExportTag :: Inline -> Bool
+ isNoExportTag (Span ("", ["tag"], [("data-tag-name", "noexport")]) []) = True
+ isNoExportTag _ = False
+
+-- | Drop blocks until a header on or above the given level is seen
+dropUntilHeaderAboveLevel :: Int -> [Block] -> [Block]
+dropUntilHeaderAboveLevel n = dropWhile (not . isHeaderLevelLowerEq n)
+
+isHeaderLevelLowerEq :: Int -> Block -> Bool
+isHeaderLevelLowerEq n blk =
+ case blk of
+ (Header level _ _) -> n >= level
+ _ -> False
--
-- Parser State for Org
@@ -98,6 +137,9 @@ data OrgParserState = OrgParserState
, orgStateNotes' :: OrgNoteTable
}
+instance Default OrgParserLocal where
+ def = OrgParserLocal NoQuote
+
instance HasReaderOptions OrgParserState where
extractReaderOptions = orgStateOptions
@@ -111,6 +153,10 @@ instance HasLastStrPosition OrgParserState where
getLastStrPos = orgStateLastStrPos
setLastStrPos pos st = st{ orgStateLastStrPos = Just pos }
+instance HasQuoteContext st (Reader OrgParserLocal) where
+ getQuoteContext = asks orgLocalQuoteContext
+ withQuoteContext q = local (\s -> s{orgLocalQuoteContext = q})
+
instance Default OrgParserState where
def = defaultOrgParserState
@@ -134,19 +180,6 @@ recordAnchorId :: String -> OrgParser ()
recordAnchorId i = updateState $ \s ->
s{ orgStateAnchorIds = i : (orgStateAnchorIds s) }
-addBlockAttribute :: String -> String -> OrgParser ()
-addBlockAttribute key val = updateState $ \s ->
- let attrs = orgStateBlockAttributes s
- in s{ orgStateBlockAttributes = M.insert key val attrs }
-
-lookupBlockAttribute :: String -> OrgParser (Maybe String)
-lookupBlockAttribute key =
- M.lookup key . orgStateBlockAttributes <$> getState
-
-resetBlockAttributes :: OrgParser ()
-resetBlockAttributes = updateState $ \s ->
- s{ orgStateBlockAttributes = orgStateBlockAttributes def }
-
updateLastForbiddenCharPos :: OrgParser ()
updateLastForbiddenCharPos = getPosition >>= \p ->
updateState $ \s -> s{ orgStateLastForbiddenCharPos = Just p}
@@ -274,9 +307,18 @@ block = choice [ mempty <$ blanklines
, paraOrPlain
] <?> "block"
+--
+-- Block Attributes
+--
+
+-- | Parse optional block attributes (like #+TITLE or #+NAME)
optionalAttributes :: OrgParser (F Blocks) -> OrgParser (F Blocks)
optionalAttributes parser = try $
resetBlockAttributes *> parseBlockAttributes *> parser
+ where
+ resetBlockAttributes :: OrgParser ()
+ resetBlockAttributes = updateState $ \s ->
+ s{ orgStateBlockAttributes = orgStateBlockAttributes def }
parseBlockAttributes :: OrgParser ()
parseBlockAttributes = do
@@ -301,6 +343,15 @@ lookupInlinesAttr attr = try $ do
(fmap Just . parseFromString parseInlines)
val
+addBlockAttribute :: String -> String -> OrgParser ()
+addBlockAttribute key val = updateState $ \s ->
+ let attrs = orgStateBlockAttributes s
+ in s{ orgStateBlockAttributes = M.insert key val attrs }
+
+lookupBlockAttribute :: String -> OrgParser (Maybe String)
+lookupBlockAttribute key =
+ M.lookup key . orgStateBlockAttributes <$> getState
+
--
-- Org Blocks (#+BEGIN_... / #+END_...)
@@ -356,11 +407,11 @@ exportsResults :: [(String, String)] -> Bool
exportsResults attrs = ("rundoc-exports", "results") `elem` attrs
|| ("rundoc-exports", "both") `elem` attrs
-followingResultsBlock :: OrgParser (Maybe String)
+followingResultsBlock :: OrgParser (Maybe (F Blocks))
followingResultsBlock =
optionMaybe (try $ blanklines *> stringAnyCase "#+RESULTS:"
*> blankline
- *> (unlines <$> many1 exampleLine))
+ *> block)
codeBlock :: BlockProperties -> OrgParser (F Blocks)
codeBlock blkProp = do
@@ -375,7 +426,7 @@ codeBlock blkProp = do
labelledBlck <- maybe (pure codeBlck)
(labelDiv codeBlck)
<$> lookupInlinesAttr "caption"
- let resultBlck = pure $ maybe mempty (exampleCode) resultsContent
+ let resultBlck = fromMaybe mempty resultsContent
return $ (if includeCode then labelledBlck else mempty)
<> (if includeResults then resultBlck else mempty)
where
@@ -614,8 +665,25 @@ parseFormat = try $ do
header :: OrgParser (F Blocks)
header = try $ do
level <- headerStart
- title <- inlinesTillNewline
- return $ B.header level <$> title
+ title <- manyTill inline (lookAhead headerEnd)
+ tags <- headerEnd
+ let inlns = trimInlinesF . mconcat $ title <> map tagToInlineF tags
+ return $ B.header level <$> inlns
+ where
+ tagToInlineF :: String -> F Inlines
+ tagToInlineF t = return $ B.spanWith ("", ["tag"], [("data-tag-name", t)]) mempty
+
+headerEnd :: OrgParser [String]
+headerEnd = option [] headerTags <* newline
+
+headerTags :: OrgParser [String]
+headerTags = try $
+ skipSpaces
+ *> char ':'
+ *> many1 tag
+ <* skipSpaces
+ where tag = many1 (alphaNum <|> oneOf "@%#_")
+ <* char ':'
headerStart :: OrgParser Int
headerStart = try $
@@ -828,12 +896,14 @@ list :: OrgParser (F Blocks)
list = choice [ definitionList, bulletList, orderedList ] <?> "list"
definitionList :: OrgParser (F Blocks)
-definitionList = fmap B.definitionList . fmap compactify'DL . sequence
- <$> many1 (definitionListItem bulletListStart)
+definitionList = try $ do n <- lookAhead (bulletListStart' Nothing)
+ fmap B.definitionList . fmap compactify'DL . sequence
+ <$> many1 (definitionListItem $ bulletListStart' (Just n))
bulletList :: OrgParser (F Blocks)
-bulletList = fmap B.bulletList . fmap compactify' . sequence
- <$> many1 (listItem bulletListStart)
+bulletList = try $ do n <- lookAhead (bulletListStart' Nothing)
+ fmap B.bulletList . fmap compactify' . sequence
+ <$> many1 (listItem (bulletListStart' $ Just n))
orderedList :: OrgParser (F Blocks)
orderedList = fmap B.orderedList . fmap compactify' . sequence
@@ -845,10 +915,27 @@ genericListStart listMarker = try $
(+) <$> (length <$> many spaceChar)
<*> (length <$> listMarker <* many1 spaceChar)
--- parses bullet list start and returns its length (excl. following whitespace)
+-- parses bullet list marker. maybe we know the indent level
bulletListStart :: OrgParser Int
-bulletListStart = genericListStart bulletListMarker
- where bulletListMarker = pure <$> oneOf "*-+"
+bulletListStart = bulletListStart' Nothing
+
+bulletListStart' :: Maybe Int -> OrgParser Int
+-- returns length of bulletList prefix, inclusive of marker
+bulletListStart' Nothing = do ind <- length <$> many spaceChar
+ when (ind == 0) $ notFollowedBy (char '*')
+ oneOf bullets
+ many1 spaceChar
+ return (ind + 1)
+ -- Unindented lists are legal, but they can't use '*' bullets
+ -- We return n to maintain compatibility with the generic listItem
+bulletListStart' (Just n) = do count (n-1) spaceChar
+ when (n == 1) $ notFollowedBy (char '*')
+ oneOf bullets
+ many1 spaceChar
+ return n
+
+bullets :: String
+bullets = "*+-"
orderedListStart :: OrgParser Int
orderedListStart = genericListStart orderedListMarker
@@ -918,6 +1005,7 @@ inline =
, subscript
, superscript
, inlineLaTeX
+ , smart
, symbol
] <* (guard =<< newlinesCountWithinLimits)
<?> "inline"
@@ -927,7 +1015,7 @@ parseInlines = trimInlinesF . mconcat <$> many1 inline
-- treat these as potentially non-text when parsing inline:
specialChars :: [Char]
-specialChars = "\"$'()*+-./:<=>[\\]^_{|}~"
+specialChars = "\"$'()*+-,./:<=>[\\]^_{|}~"
whitespace :: OrgParser (F Inlines)
@@ -1054,13 +1142,13 @@ linkOrImage = explicitOrImageLink
explicitOrImageLink :: OrgParser (F Inlines)
explicitOrImageLink = try $ do
char '['
- srcF <- applyCustomLinkFormat =<< linkTarget
+ srcF <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget
title <- enclosedRaw (char '[') (char ']')
title' <- parseFromString (mconcat <$> many inline) title
char ']'
return $ do
src <- srcF
- if isImageFilename src && isImageFilename title
+ if isImageFilename title
then pure $ B.link src "" $ B.image title mempty mempty
else linkToInlinesF src =<< title'
@@ -1087,6 +1175,9 @@ selfTarget = try $ char '[' *> linkTarget <* char ']'
linkTarget :: OrgParser String
linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]")
+possiblyEmptyLinkTarget :: OrgParser String
+possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]")
+
applyCustomLinkFormat :: String -> OrgParser (F String)
applyCustomLinkFormat link = do
let (linkType, rest) = break (== ':') link
@@ -1094,27 +1185,38 @@ applyCustomLinkFormat link = do
formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters
return $ maybe link ($ drop 1 rest) formatter
-
+-- TODO: might be a lot smarter/cleaner to use parsec and ADTs for this kind
+-- of parsing.
linkToInlinesF :: String -> Inlines -> F Inlines
-linkToInlinesF s@('#':_) = pure . B.link s ""
-linkToInlinesF s
- | isImageFilename s = const . pure $ B.image s "" ""
- | isUri s = pure . B.link s ""
- | isRelativeUrl s = pure . B.link s ""
-linkToInlinesF s = \title -> do
- anchorB <- (s `elem`) <$> asksF orgStateAnchorIds
- if anchorB
- then pure $ B.link ('#':s) "" title
- else pure $ B.emph title
-
-isRelativeUrl :: String -> Bool
-isRelativeUrl s = (':' `notElem` s) && ("./" `isPrefixOf` s)
+linkToInlinesF s =
+ case s of
+ "" -> pure . B.link "" ""
+ ('#':_) -> pure . B.link s ""
+ _ | isImageFilename s -> const . pure $ B.image s "" ""
+ _ | isFileLink s -> pure . B.link (dropLinkType s) ""
+ _ | isUri s -> pure . B.link s ""
+ _ | isAbsoluteFilePath s -> pure . B.link ("file://" ++ s) ""
+ _ | isRelativeFilePath s -> pure . B.link s ""
+ _ -> internalLink s
+
+isFileLink :: String -> Bool
+isFileLink s = ("file:" `isPrefixOf` s) && not ("file://" `isPrefixOf` s)
+
+dropLinkType :: String -> String
+dropLinkType = tail . snd . break (== ':')
+
+isRelativeFilePath :: String -> Bool
+isRelativeFilePath s = (("./" `isPrefixOf` s) || ("../" `isPrefixOf` s)) &&
+ (':' `notElem` s)
isUri :: String -> Bool
isUri s = let (scheme, path) = break (== ':') s
- in all (\c -> isAlphaNum c || c `elem` ".-") scheme
+ in all (\c -> isAlphaNum c || c `elem` (".-" :: String)) scheme
&& not (null path)
+isAbsoluteFilePath :: String -> Bool
+isAbsoluteFilePath = ('/' ==) . head
+
isImageFilename :: String -> Bool
isImageFilename filename =
any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions &&
@@ -1124,6 +1226,13 @@ isImageFilename filename =
imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ]
protocols = [ "file", "http", "https" ]
+internalLink :: String -> Inlines -> F Inlines
+internalLink link title = do
+ anchorB <- (link `elem`) <$> asksF orgStateAnchorIds
+ if anchorB
+ then return $ B.link ('#':link) "" title
+ else return $ B.emph title
+
-- | Parse an anchor like @<<anchor-id>>@ and return an empty span with
-- @anchor-id@ set as id. Legal anchors in org-mode are defined through
-- @org-target-regexp@, which is fairly liberal. Since no link is created if
@@ -1148,7 +1257,7 @@ solidify :: String -> String
solidify = map replaceSpecialChar
where replaceSpecialChar c
| isAlphaNum c = c
- | c `elem` "_.-:" = c
+ | c `elem` ("_.-:" :: String) = c
| otherwise = '-'
-- | Parses an inline code block and marks it as an babel block.
@@ -1203,12 +1312,16 @@ displayMath :: OrgParser (F Inlines)
displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]"
, rawMathBetween "$$" "$$"
]
+
+updatePositions :: Char
+ -> OrgParser (Char)
+updatePositions c = do
+ when (c `elem` emphasisPreChars) updateLastPreCharPos
+ when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos
+ return c
+
symbol :: OrgParser (F Inlines)
symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions)
- where updatePositions c
- | c `elem` emphasisPreChars = c <$ updateLastPreCharPos
- | c `elem` emphasisForbiddenBorderChars = c <$ updateLastForbiddenCharPos
- | otherwise = return c
emphasisBetween :: Char
-> OrgParser (F Inlines)
@@ -1387,7 +1500,8 @@ simpleSubOrSuperString = try $
inlineLaTeX :: OrgParser (F Inlines)
inlineLaTeX = try $ do
cmd <- inlineLaTeXCommand
- maybe mzero returnF $ parseAsMath cmd `mplus` parseAsInlineLaTeX cmd
+ maybe mzero returnF $
+ parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` parseAsInlineLaTeX cmd
where
parseAsMath :: String -> Maybe Inlines
parseAsMath cs = B.fromList <$> texMathToPandoc cs
@@ -1395,6 +1509,11 @@ inlineLaTeX = try $ do
parseAsInlineLaTeX :: String -> Maybe Inlines
parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs
+ parseAsMathMLSym :: String -> Maybe Inlines
+ parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs)
+ -- dropWhileEnd would be nice here, but it's not available before base 4.5
+ where clean = reverse . dropWhile (`elem` ("{}" :: String)) . reverse . drop 1
+
state :: ParserState
state = def{ stateOptions = def{ readerParseRaw = True }}
@@ -1413,3 +1532,31 @@ inlineLaTeXCommand = try $ do
count len anyChar
return cs
_ -> mzero
+
+smart :: OrgParser (F Inlines)
+smart = do
+ getOption readerSmart >>= guard
+ doubleQuoted <|> singleQuoted <|>
+ choice (map (return <$>) [orgApostrophe, dash, ellipses])
+ where orgApostrophe =
+ (char '\'' <|> char '\8217') <* updateLastPreCharPos
+ <* updateLastForbiddenCharPos
+ *> return (B.str "\x2019")
+
+singleQuoted :: OrgParser (F Inlines)
+singleQuoted = try $ do
+ singleQuoteStart
+ withQuoteContext InSingleQuote $
+ fmap B.singleQuoted . trimInlinesF . mconcat <$>
+ many1Till inline singleQuoteEnd
+
+-- doubleQuoted will handle regular double-quoted sections, as well
+-- as dialogues with an open double-quote without a close double-quote
+-- in the same paragraph.
+doubleQuoted :: OrgParser (F Inlines)
+doubleQuoted = try $ do
+ doubleQuoteStart
+ contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline)
+ (withQuoteContext InDoubleQuote $ (doubleQuoteEnd <* updateLastForbiddenCharPos) >> return
+ (fmap B.doubleQuoted . trimInlinesF $ contents))
+ <|> (return $ return (B.str "\8220") <> contents)