diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/Org.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/Org.hs | 130 |
1 files changed, 101 insertions, 29 deletions
diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index 5c00a1b27..440b6d144 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -42,6 +42,7 @@ import Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline) import Text.Pandoc.Shared (compactify', compactify'DL) import Text.TeXMath (readTeX, writePandoc, DisplayType(..)) +import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap import Control.Applicative ( Applicative, pure , (<$>), (<$), (<*>), (<*), (*>) ) @@ -69,7 +70,32 @@ parseOrg = do blocks' <- parseBlocks st <- getState let meta = runF (orgStateMeta' st) st - return $ Pandoc meta $ filter (/= Null) (B.toList $ runF blocks' st) + let removeUnwantedBlocks = dropCommentTrees . filter (/= Null) + return $ Pandoc meta $ removeUnwantedBlocks (B.toList $ runF blocks' st) + +-- | Drop COMMENT headers and the document tree below those headers. +dropCommentTrees :: [Block] -> [Block] +dropCommentTrees [] = [] +dropCommentTrees blks@(b:bs) = + maybe blks (flip dropUntilHeaderAboveLevel bs) $ commentHeaderLevel b + +-- | Return the level of a header starting a comment tree and Nothing +-- otherwise. +commentHeaderLevel :: Block -> Maybe Int +commentHeaderLevel blk = + case blk of + (Header level _ ((Str "COMMENT"):_)) -> Just level + _ -> Nothing + +-- | Drop blocks until a header on or above the given level is seen +dropUntilHeaderAboveLevel :: Int -> [Block] -> [Block] +dropUntilHeaderAboveLevel n = dropWhile (not . isHeaderLevelLowerEq n) + +isHeaderLevelLowerEq :: Int -> Block -> Bool +isHeaderLevelLowerEq n blk = + case blk of + (Header level _ _) -> n >= level + _ -> False -- -- Parser State for Org @@ -828,12 +854,14 @@ list :: OrgParser (F Blocks) list = choice [ definitionList, bulletList, orderedList ] <?> "list" definitionList :: OrgParser (F Blocks) -definitionList = fmap B.definitionList . fmap compactify'DL . sequence - <$> many1 (definitionListItem bulletListStart) +definitionList = try $ do n <- lookAhead (bulletListStart' Nothing) + fmap B.definitionList . fmap compactify'DL . sequence + <$> many1 (definitionListItem $ bulletListStart' (Just n)) bulletList :: OrgParser (F Blocks) -bulletList = fmap B.bulletList . fmap compactify' . sequence - <$> many1 (listItem bulletListStart) +bulletList = try $ do n <- lookAhead (bulletListStart' Nothing) + fmap B.bulletList . fmap compactify' . sequence + <$> many1 (listItem (bulletListStart' $ Just n)) orderedList :: OrgParser (F Blocks) orderedList = fmap B.orderedList . fmap compactify' . sequence @@ -845,10 +873,27 @@ genericListStart listMarker = try $ (+) <$> (length <$> many spaceChar) <*> (length <$> listMarker <* many1 spaceChar) --- parses bullet list start and returns its length (excl. following whitespace) +-- parses bullet list marker. maybe we know the indent level bulletListStart :: OrgParser Int -bulletListStart = genericListStart bulletListMarker - where bulletListMarker = pure <$> oneOf "*-+" +bulletListStart = bulletListStart' Nothing + +bulletListStart' :: Maybe Int -> OrgParser Int +-- returns length of bulletList prefix, inclusive of marker +bulletListStart' Nothing = do ind <- length <$> many spaceChar + when (ind == 0) $ notFollowedBy (char '*') + oneOf bullets + many1 spaceChar + return (ind + 1) + -- Unindented lists are legal, but they can't use '*' bullets + -- We return n to maintain compatibility with the generic listItem +bulletListStart' (Just n) = do count (n-1) spaceChar + when (n == 1) $ notFollowedBy (char '*') + oneOf bullets + many1 spaceChar + return n + +bullets :: String +bullets = "*+-" orderedListStart :: OrgParser Int orderedListStart = genericListStart orderedListMarker @@ -927,7 +972,7 @@ parseInlines = trimInlinesF . mconcat <$> many1 inline -- treat these as potentially non-text when parsing inline: specialChars :: [Char] -specialChars = "\"$'()*+-./:<=>[\\]^_{|}~" +specialChars = "\"$'()*+-,./:<=>[\\]^_{|}~" whitespace :: OrgParser (F Inlines) @@ -1054,7 +1099,7 @@ linkOrImage = explicitOrImageLink explicitOrImageLink :: OrgParser (F Inlines) explicitOrImageLink = try $ do char '[' - srcF <- applyCustomLinkFormat =<< linkTarget + srcF <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget title <- enclosedRaw (char '[') (char ']') title' <- parseFromString (mconcat <$> many inline) title char ']' @@ -1087,6 +1132,9 @@ selfTarget = try $ char '[' *> linkTarget <* char ']' linkTarget :: OrgParser String linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]") +possiblyEmptyLinkTarget :: OrgParser String +possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]") + applyCustomLinkFormat :: String -> OrgParser (F String) applyCustomLinkFormat link = do let (linkType, rest) = break (== ':') link @@ -1094,27 +1142,38 @@ applyCustomLinkFormat link = do formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters return $ maybe link ($ drop 1 rest) formatter - +-- TODO: might be a lot smarter/cleaner to use parsec and ADTs for this kind +-- of parsing. linkToInlinesF :: String -> Inlines -> F Inlines -linkToInlinesF s@('#':_) = pure . B.link s "" -linkToInlinesF s - | isImageFilename s = const . pure $ B.image s "" "" - | isUri s = pure . B.link s "" - | isRelativeUrl s = pure . B.link s "" -linkToInlinesF s = \title -> do - anchorB <- (s `elem`) <$> asksF orgStateAnchorIds - if anchorB - then pure $ B.link ('#':s) "" title - else pure $ B.emph title - -isRelativeUrl :: String -> Bool -isRelativeUrl s = (':' `notElem` s) && ("./" `isPrefixOf` s) +linkToInlinesF s = + case s of + "" -> pure . B.link "" "" + ('#':_) -> pure . B.link s "" + _ | isImageFilename s -> const . pure $ B.image s "" "" + _ | isFileLink s -> pure . B.link (dropLinkType s) "" + _ | isUri s -> pure . B.link s "" + _ | isAbsoluteFilePath s -> pure . B.link ("file://" ++ s) "" + _ | isRelativeFilePath s -> pure . B.link s "" + _ -> internalLink s + +isFileLink :: String -> Bool +isFileLink s = ("file:" `isPrefixOf` s) && not ("file://" `isPrefixOf` s) + +dropLinkType :: String -> String +dropLinkType = tail . snd . break (== ':') + +isRelativeFilePath :: String -> Bool +isRelativeFilePath s = (("./" `isPrefixOf` s) || ("../" `isPrefixOf` s)) && + (':' `notElem` s) isUri :: String -> Bool isUri s = let (scheme, path) = break (== ':') s in all (\c -> isAlphaNum c || c `elem` ".-") scheme && not (null path) +isAbsoluteFilePath :: String -> Bool +isAbsoluteFilePath = ('/' ==) . head + isImageFilename :: String -> Bool isImageFilename filename = any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions && @@ -1124,6 +1183,13 @@ isImageFilename filename = imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ] protocols = [ "file", "http", "https" ] +internalLink :: String -> Inlines -> F Inlines +internalLink link title = do + anchorB <- (link `elem`) <$> asksF orgStateAnchorIds + if anchorB + then return $ B.link ('#':link) "" title + else return $ B.emph title + -- | Parse an anchor like @<<anchor-id>>@ and return an empty span with -- @anchor-id@ set as id. Legal anchors in org-mode are defined through -- @org-target-regexp@, which is fairly liberal. Since no link is created if @@ -1205,10 +1271,10 @@ displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]" ] symbol :: OrgParser (F Inlines) symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions) - where updatePositions c - | c `elem` emphasisPreChars = c <$ updateLastPreCharPos - | c `elem` emphasisForbiddenBorderChars = c <$ updateLastForbiddenCharPos - | otherwise = return c + where updatePositions c = do + when (c `elem` emphasisPreChars) updateLastPreCharPos + when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos + return c emphasisBetween :: Char -> OrgParser (F Inlines) @@ -1387,7 +1453,8 @@ simpleSubOrSuperString = try $ inlineLaTeX :: OrgParser (F Inlines) inlineLaTeX = try $ do cmd <- inlineLaTeXCommand - maybe mzero returnF $ parseAsMath cmd `mplus` parseAsInlineLaTeX cmd + maybe mzero returnF $ + parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` parseAsInlineLaTeX cmd where parseAsMath :: String -> Maybe Inlines parseAsMath cs = B.fromList <$> texMathToPandoc cs @@ -1395,6 +1462,11 @@ inlineLaTeX = try $ do parseAsInlineLaTeX :: String -> Maybe Inlines parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs + parseAsMathMLSym :: String -> Maybe Inlines + parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs) + -- dropWhileEnd would be nice here, but it's not available before base 4.5 + where clean = reverse . dropWhile (`elem` "{}") . reverse . drop 1 + state :: ParserState state = def{ stateOptions = def{ readerParseRaw = True }} |