diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/Org.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/Org.hs | 257 |
1 files changed, 202 insertions, 55 deletions
diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index 5c00a1b27..980f63504 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -1,7 +1,8 @@ {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE MultiParamTypeClasses, FlexibleContexts, FlexibleInstances #-} {- -Copyright (C) 2014 Albert Krewinkel <tarleb@moltkeplatz.de> +Copyright (C) 2014-2015 Albert Krewinkel <tarleb@moltkeplatz.de> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -42,12 +43,13 @@ import Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline) import Text.Pandoc.Shared (compactify', compactify'DL) import Text.TeXMath (readTeX, writePandoc, DisplayType(..)) +import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap import Control.Applicative ( Applicative, pure , (<$>), (<$), (<*>), (<*), (*>) ) import Control.Arrow (first) import Control.Monad (foldM, guard, liftM, liftM2, mplus, mzero, when) -import Control.Monad.Reader (Reader, runReader, ask, asks) +import Control.Monad.Reader (Reader, runReader, ask, asks, local) import Data.Char (isAlphaNum, toLower) import Data.Default import Data.List (intersperse, isPrefixOf, isSuffixOf) @@ -56,20 +58,57 @@ import Data.Maybe (fromMaybe, isJust) import Data.Monoid (Monoid, mconcat, mempty, mappend) import Network.HTTP (urlEncode) +import Text.Pandoc.Error + -- | Parse org-mode string and return a Pandoc document. readOrg :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse (assuming @'\n'@ line endings) - -> Pandoc -readOrg opts s = readWith parseOrg def{ orgStateOptions = opts } (s ++ "\n\n") + -> Either PandocError Pandoc +readOrg opts s = flip runReader def $ readWithM parseOrg def{ orgStateOptions = opts } (s ++ "\n\n") + +data OrgParserLocal = OrgParserLocal { orgLocalQuoteContext :: QuoteContext } -type OrgParser = Parser [Char] OrgParserState +type OrgParser = ParserT [Char] OrgParserState (Reader OrgParserLocal) parseOrg :: OrgParser Pandoc parseOrg = do blocks' <- parseBlocks st <- getState let meta = runF (orgStateMeta' st) st - return $ Pandoc meta $ filter (/= Null) (B.toList $ runF blocks' st) + let removeUnwantedBlocks = dropCommentTrees . filter (/= Null) + return $ Pandoc meta $ removeUnwantedBlocks (B.toList $ runF blocks' st) + +-- | Drop COMMENT headers and the document tree below those headers. +dropCommentTrees :: [Block] -> [Block] +dropCommentTrees [] = [] +dropCommentTrees blks@(b:bs) = + maybe blks (flip dropUntilHeaderAboveLevel bs) $ commentHeaderLevel b + +-- | Return the level of a header starting a comment or :noexport: tree and +-- Nothing otherwise. +commentHeaderLevel :: Block -> Maybe Int +commentHeaderLevel blk = + case blk of + (Header level _ ((Str "COMMENT"):_)) -> Just level + (Header level _ title) | hasNoExportTag title -> Just level + _ -> Nothing + where + hasNoExportTag :: [Inline] -> Bool + hasNoExportTag = any isNoExportTag + + isNoExportTag :: Inline -> Bool + isNoExportTag (Span ("", ["tag"], [("data-tag-name", "noexport")]) []) = True + isNoExportTag _ = False + +-- | Drop blocks until a header on or above the given level is seen +dropUntilHeaderAboveLevel :: Int -> [Block] -> [Block] +dropUntilHeaderAboveLevel n = dropWhile (not . isHeaderLevelLowerEq n) + +isHeaderLevelLowerEq :: Int -> Block -> Bool +isHeaderLevelLowerEq n blk = + case blk of + (Header level _ _) -> n >= level + _ -> False -- -- Parser State for Org @@ -98,6 +137,9 @@ data OrgParserState = OrgParserState , orgStateNotes' :: OrgNoteTable } +instance Default OrgParserLocal where + def = OrgParserLocal NoQuote + instance HasReaderOptions OrgParserState where extractReaderOptions = orgStateOptions @@ -111,6 +153,10 @@ instance HasLastStrPosition OrgParserState where getLastStrPos = orgStateLastStrPos setLastStrPos pos st = st{ orgStateLastStrPos = Just pos } +instance HasQuoteContext st (Reader OrgParserLocal) where + getQuoteContext = asks orgLocalQuoteContext + withQuoteContext q = local (\s -> s{orgLocalQuoteContext = q}) + instance Default OrgParserState where def = defaultOrgParserState @@ -134,19 +180,6 @@ recordAnchorId :: String -> OrgParser () recordAnchorId i = updateState $ \s -> s{ orgStateAnchorIds = i : (orgStateAnchorIds s) } -addBlockAttribute :: String -> String -> OrgParser () -addBlockAttribute key val = updateState $ \s -> - let attrs = orgStateBlockAttributes s - in s{ orgStateBlockAttributes = M.insert key val attrs } - -lookupBlockAttribute :: String -> OrgParser (Maybe String) -lookupBlockAttribute key = - M.lookup key . orgStateBlockAttributes <$> getState - -resetBlockAttributes :: OrgParser () -resetBlockAttributes = updateState $ \s -> - s{ orgStateBlockAttributes = orgStateBlockAttributes def } - updateLastForbiddenCharPos :: OrgParser () updateLastForbiddenCharPos = getPosition >>= \p -> updateState $ \s -> s{ orgStateLastForbiddenCharPos = Just p} @@ -274,9 +307,18 @@ block = choice [ mempty <$ blanklines , paraOrPlain ] <?> "block" +-- +-- Block Attributes +-- + +-- | Parse optional block attributes (like #+TITLE or #+NAME) optionalAttributes :: OrgParser (F Blocks) -> OrgParser (F Blocks) optionalAttributes parser = try $ resetBlockAttributes *> parseBlockAttributes *> parser + where + resetBlockAttributes :: OrgParser () + resetBlockAttributes = updateState $ \s -> + s{ orgStateBlockAttributes = orgStateBlockAttributes def } parseBlockAttributes :: OrgParser () parseBlockAttributes = do @@ -301,6 +343,15 @@ lookupInlinesAttr attr = try $ do (fmap Just . parseFromString parseInlines) val +addBlockAttribute :: String -> String -> OrgParser () +addBlockAttribute key val = updateState $ \s -> + let attrs = orgStateBlockAttributes s + in s{ orgStateBlockAttributes = M.insert key val attrs } + +lookupBlockAttribute :: String -> OrgParser (Maybe String) +lookupBlockAttribute key = + M.lookup key . orgStateBlockAttributes <$> getState + -- -- Org Blocks (#+BEGIN_... / #+END_...) @@ -356,11 +407,11 @@ exportsResults :: [(String, String)] -> Bool exportsResults attrs = ("rundoc-exports", "results") `elem` attrs || ("rundoc-exports", "both") `elem` attrs -followingResultsBlock :: OrgParser (Maybe String) +followingResultsBlock :: OrgParser (Maybe (F Blocks)) followingResultsBlock = optionMaybe (try $ blanklines *> stringAnyCase "#+RESULTS:" *> blankline - *> (unlines <$> many1 exampleLine)) + *> block) codeBlock :: BlockProperties -> OrgParser (F Blocks) codeBlock blkProp = do @@ -375,7 +426,7 @@ codeBlock blkProp = do labelledBlck <- maybe (pure codeBlck) (labelDiv codeBlck) <$> lookupInlinesAttr "caption" - let resultBlck = pure $ maybe mempty (exampleCode) resultsContent + let resultBlck = fromMaybe mempty resultsContent return $ (if includeCode then labelledBlck else mempty) <> (if includeResults then resultBlck else mempty) where @@ -614,8 +665,25 @@ parseFormat = try $ do header :: OrgParser (F Blocks) header = try $ do level <- headerStart - title <- inlinesTillNewline - return $ B.header level <$> title + title <- manyTill inline (lookAhead headerEnd) + tags <- headerEnd + let inlns = trimInlinesF . mconcat $ title <> map tagToInlineF tags + return $ B.header level <$> inlns + where + tagToInlineF :: String -> F Inlines + tagToInlineF t = return $ B.spanWith ("", ["tag"], [("data-tag-name", t)]) mempty + +headerEnd :: OrgParser [String] +headerEnd = option [] headerTags <* newline + +headerTags :: OrgParser [String] +headerTags = try $ + skipSpaces + *> char ':' + *> many1 tag + <* skipSpaces + where tag = many1 (alphaNum <|> oneOf "@%#_") + <* char ':' headerStart :: OrgParser Int headerStart = try $ @@ -828,12 +896,14 @@ list :: OrgParser (F Blocks) list = choice [ definitionList, bulletList, orderedList ] <?> "list" definitionList :: OrgParser (F Blocks) -definitionList = fmap B.definitionList . fmap compactify'DL . sequence - <$> many1 (definitionListItem bulletListStart) +definitionList = try $ do n <- lookAhead (bulletListStart' Nothing) + fmap B.definitionList . fmap compactify'DL . sequence + <$> many1 (definitionListItem $ bulletListStart' (Just n)) bulletList :: OrgParser (F Blocks) -bulletList = fmap B.bulletList . fmap compactify' . sequence - <$> many1 (listItem bulletListStart) +bulletList = try $ do n <- lookAhead (bulletListStart' Nothing) + fmap B.bulletList . fmap compactify' . sequence + <$> many1 (listItem (bulletListStart' $ Just n)) orderedList :: OrgParser (F Blocks) orderedList = fmap B.orderedList . fmap compactify' . sequence @@ -845,10 +915,27 @@ genericListStart listMarker = try $ (+) <$> (length <$> many spaceChar) <*> (length <$> listMarker <* many1 spaceChar) --- parses bullet list start and returns its length (excl. following whitespace) +-- parses bullet list marker. maybe we know the indent level bulletListStart :: OrgParser Int -bulletListStart = genericListStart bulletListMarker - where bulletListMarker = pure <$> oneOf "*-+" +bulletListStart = bulletListStart' Nothing + +bulletListStart' :: Maybe Int -> OrgParser Int +-- returns length of bulletList prefix, inclusive of marker +bulletListStart' Nothing = do ind <- length <$> many spaceChar + when (ind == 0) $ notFollowedBy (char '*') + oneOf bullets + many1 spaceChar + return (ind + 1) + -- Unindented lists are legal, but they can't use '*' bullets + -- We return n to maintain compatibility with the generic listItem +bulletListStart' (Just n) = do count (n-1) spaceChar + when (n == 1) $ notFollowedBy (char '*') + oneOf bullets + many1 spaceChar + return n + +bullets :: String +bullets = "*+-" orderedListStart :: OrgParser Int orderedListStart = genericListStart orderedListMarker @@ -918,6 +1005,7 @@ inline = , subscript , superscript , inlineLaTeX + , smart , symbol ] <* (guard =<< newlinesCountWithinLimits) <?> "inline" @@ -927,7 +1015,7 @@ parseInlines = trimInlinesF . mconcat <$> many1 inline -- treat these as potentially non-text when parsing inline: specialChars :: [Char] -specialChars = "\"$'()*+-./:<=>[\\]^_{|}~" +specialChars = "\"$'()*+-,./:<=>[\\]^_{|}~" whitespace :: OrgParser (F Inlines) @@ -1054,13 +1142,13 @@ linkOrImage = explicitOrImageLink explicitOrImageLink :: OrgParser (F Inlines) explicitOrImageLink = try $ do char '[' - srcF <- applyCustomLinkFormat =<< linkTarget + srcF <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget title <- enclosedRaw (char '[') (char ']') title' <- parseFromString (mconcat <$> many inline) title char ']' return $ do src <- srcF - if isImageFilename src && isImageFilename title + if isImageFilename title then pure $ B.link src "" $ B.image title mempty mempty else linkToInlinesF src =<< title' @@ -1087,6 +1175,9 @@ selfTarget = try $ char '[' *> linkTarget <* char ']' linkTarget :: OrgParser String linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]") +possiblyEmptyLinkTarget :: OrgParser String +possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]") + applyCustomLinkFormat :: String -> OrgParser (F String) applyCustomLinkFormat link = do let (linkType, rest) = break (== ':') link @@ -1094,27 +1185,38 @@ applyCustomLinkFormat link = do formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters return $ maybe link ($ drop 1 rest) formatter - +-- TODO: might be a lot smarter/cleaner to use parsec and ADTs for this kind +-- of parsing. linkToInlinesF :: String -> Inlines -> F Inlines -linkToInlinesF s@('#':_) = pure . B.link s "" -linkToInlinesF s - | isImageFilename s = const . pure $ B.image s "" "" - | isUri s = pure . B.link s "" - | isRelativeUrl s = pure . B.link s "" -linkToInlinesF s = \title -> do - anchorB <- (s `elem`) <$> asksF orgStateAnchorIds - if anchorB - then pure $ B.link ('#':s) "" title - else pure $ B.emph title - -isRelativeUrl :: String -> Bool -isRelativeUrl s = (':' `notElem` s) && ("./" `isPrefixOf` s) +linkToInlinesF s = + case s of + "" -> pure . B.link "" "" + ('#':_) -> pure . B.link s "" + _ | isImageFilename s -> const . pure $ B.image s "" "" + _ | isFileLink s -> pure . B.link (dropLinkType s) "" + _ | isUri s -> pure . B.link s "" + _ | isAbsoluteFilePath s -> pure . B.link ("file://" ++ s) "" + _ | isRelativeFilePath s -> pure . B.link s "" + _ -> internalLink s + +isFileLink :: String -> Bool +isFileLink s = ("file:" `isPrefixOf` s) && not ("file://" `isPrefixOf` s) + +dropLinkType :: String -> String +dropLinkType = tail . snd . break (== ':') + +isRelativeFilePath :: String -> Bool +isRelativeFilePath s = (("./" `isPrefixOf` s) || ("../" `isPrefixOf` s)) && + (':' `notElem` s) isUri :: String -> Bool isUri s = let (scheme, path) = break (== ':') s - in all (\c -> isAlphaNum c || c `elem` ".-") scheme + in all (\c -> isAlphaNum c || c `elem` (".-" :: String)) scheme && not (null path) +isAbsoluteFilePath :: String -> Bool +isAbsoluteFilePath = ('/' ==) . head + isImageFilename :: String -> Bool isImageFilename filename = any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions && @@ -1124,6 +1226,13 @@ isImageFilename filename = imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ] protocols = [ "file", "http", "https" ] +internalLink :: String -> Inlines -> F Inlines +internalLink link title = do + anchorB <- (link `elem`) <$> asksF orgStateAnchorIds + if anchorB + then return $ B.link ('#':link) "" title + else return $ B.emph title + -- | Parse an anchor like @<<anchor-id>>@ and return an empty span with -- @anchor-id@ set as id. Legal anchors in org-mode are defined through -- @org-target-regexp@, which is fairly liberal. Since no link is created if @@ -1148,7 +1257,7 @@ solidify :: String -> String solidify = map replaceSpecialChar where replaceSpecialChar c | isAlphaNum c = c - | c `elem` "_.-:" = c + | c `elem` ("_.-:" :: String) = c | otherwise = '-' -- | Parses an inline code block and marks it as an babel block. @@ -1203,12 +1312,16 @@ displayMath :: OrgParser (F Inlines) displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]" , rawMathBetween "$$" "$$" ] + +updatePositions :: Char + -> OrgParser (Char) +updatePositions c = do + when (c `elem` emphasisPreChars) updateLastPreCharPos + when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos + return c + symbol :: OrgParser (F Inlines) symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions) - where updatePositions c - | c `elem` emphasisPreChars = c <$ updateLastPreCharPos - | c `elem` emphasisForbiddenBorderChars = c <$ updateLastForbiddenCharPos - | otherwise = return c emphasisBetween :: Char -> OrgParser (F Inlines) @@ -1387,7 +1500,8 @@ simpleSubOrSuperString = try $ inlineLaTeX :: OrgParser (F Inlines) inlineLaTeX = try $ do cmd <- inlineLaTeXCommand - maybe mzero returnF $ parseAsMath cmd `mplus` parseAsInlineLaTeX cmd + maybe mzero returnF $ + parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` parseAsInlineLaTeX cmd where parseAsMath :: String -> Maybe Inlines parseAsMath cs = B.fromList <$> texMathToPandoc cs @@ -1395,6 +1509,11 @@ inlineLaTeX = try $ do parseAsInlineLaTeX :: String -> Maybe Inlines parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs + parseAsMathMLSym :: String -> Maybe Inlines + parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs) + -- dropWhileEnd would be nice here, but it's not available before base 4.5 + where clean = reverse . dropWhile (`elem` ("{}" :: String)) . reverse . drop 1 + state :: ParserState state = def{ stateOptions = def{ readerParseRaw = True }} @@ -1413,3 +1532,31 @@ inlineLaTeXCommand = try $ do count len anyChar return cs _ -> mzero + +smart :: OrgParser (F Inlines) +smart = do + getOption readerSmart >>= guard + doubleQuoted <|> singleQuoted <|> + choice (map (return <$>) [orgApostrophe, dash, ellipses]) + where orgApostrophe = + (char '\'' <|> char '\8217') <* updateLastPreCharPos + <* updateLastForbiddenCharPos + *> return (B.str "\x2019") + +singleQuoted :: OrgParser (F Inlines) +singleQuoted = try $ do + singleQuoteStart + withQuoteContext InSingleQuote $ + fmap B.singleQuoted . trimInlinesF . mconcat <$> + many1Till inline singleQuoteEnd + +-- doubleQuoted will handle regular double-quoted sections, as well +-- as dialogues with an open double-quote without a close double-quote +-- in the same paragraph. +doubleQuoted :: OrgParser (F Inlines) +doubleQuoted = try $ do + doubleQuoteStart + contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline) + (withQuoteContext InDoubleQuote $ (doubleQuoteEnd <* updateLastForbiddenCharPos) >> return + (fmap B.doubleQuoted . trimInlinesF $ contents)) + <|> (return $ return (B.str "\8220") <> contents) |