1 files changed, 101 insertions, 29 deletions
diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs
index 5c00a1b27..440b6d144 100644
--- a/src/Text/Pandoc/Readers/Org.hs
+++ b/src/Text/Pandoc/Readers/Org.hs
@@ -42,6 +42,7 @@ import           Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF
 import           Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline)
 import           Text.Pandoc.Shared (compactify', compactify'DL)
 import           Text.TeXMath (readTeX, writePandoc, DisplayType(..))
+import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap
 
 import           Control.Applicative ( Applicative, pure
                                      , (<$>), (<$), (<*>), (<*), (*>) )
@@ -69,7 +70,32 @@ parseOrg = do
   blocks' <- parseBlocks
   st <- getState
   let meta = runF (orgStateMeta' st) st
-  return $ Pandoc meta $ filter (/= Null) (B.toList $ runF blocks' st)
+  let removeUnwantedBlocks = dropCommentTrees . filter (/= Null)
+  return $ Pandoc meta $ removeUnwantedBlocks (B.toList $ runF blocks' st)
+
+-- | Drop COMMENT headers and the document tree below those headers.
+dropCommentTrees :: [Block] -> [Block]
+dropCommentTrees [] = []
+dropCommentTrees blks@(b:bs) =
+  maybe blks (flip dropUntilHeaderAboveLevel bs) $ commentHeaderLevel b
+
+-- | Return the level of a header starting a comment tree and Nothing
+-- otherwise.
+commentHeaderLevel :: Block -> Maybe Int
+commentHeaderLevel blk =
+   case blk of
+     (Header level _ ((Str "COMMENT"):_)) -> Just level
+     _                                    -> Nothing
+
+-- | Drop blocks until a header on or above the given level is seen
+dropUntilHeaderAboveLevel :: Int -> [Block] -> [Block]
+dropUntilHeaderAboveLevel n = dropWhile (not . isHeaderLevelLowerEq n)
+
+isHeaderLevelLowerEq :: Int -> Block -> Bool
+isHeaderLevelLowerEq n blk =
+  case blk of
+    (Header level _ _) -> n >= level
+    _                  -> False
 
 --
 -- Parser State for Org
@@ -828,12 +854,14 @@ list :: OrgParser (F Blocks)
 list = choice [ definitionList, bulletList, orderedList ] <?> "list"
 
 definitionList :: OrgParser (F Blocks)
-definitionList = fmap B.definitionList . fmap compactify'DL . sequence
-                 <$> many1 (definitionListItem bulletListStart)
+definitionList = try $ do n <- lookAhead (bulletListStart' Nothing)
+                          fmap B.definitionList . fmap compactify'DL . sequence
+                            <$> many1 (definitionListItem $ bulletListStart' (Just n))
 
 bulletList :: OrgParser (F Blocks)
-bulletList = fmap B.bulletList . fmap compactify' . sequence
-             <$> many1 (listItem bulletListStart)
+bulletList = try $ do n <- lookAhead (bulletListStart' Nothing)
+                      fmap B.bulletList . fmap compactify' . sequence
+                        <$> many1 (listItem (bulletListStart' $ Just n))
 
 orderedList :: OrgParser (F Blocks)
 orderedList = fmap B.orderedList . fmap compactify' . sequence
@@ -845,10 +873,27 @@ genericListStart listMarker = try $
   (+) <$> (length <$> many spaceChar)
       <*> (length <$> listMarker <* many1 spaceChar)
 
--- parses bullet list start and returns its length (excl. following whitespace)
+-- parses bullet list marker. maybe we know the indent level
 bulletListStart :: OrgParser Int
-bulletListStart = genericListStart bulletListMarker
-  where bulletListMarker = pure <$> oneOf "*-+"
+bulletListStart = bulletListStart' Nothing
+
+bulletListStart' :: Maybe Int -> OrgParser Int
+-- returns length of bulletList prefix, inclusive of marker
+bulletListStart' Nothing  = do ind <- length <$> many spaceChar
+                               when (ind == 0) $ notFollowedBy (char '*')
+                               oneOf bullets
+                               many1 spaceChar
+                               return (ind + 1)
+ -- Unindented lists are legal, but they can't use '*' bullets
+ -- We return n to maintain compatibility with the generic listItem
+bulletListStart' (Just n) = do count (n-1) spaceChar
+                               when (n == 1) $ notFollowedBy (char '*')
+                               oneOf bullets
+                               many1 spaceChar
+                               return n
+
+bullets :: String
+bullets = "*+-"
 
 orderedListStart :: OrgParser Int
 orderedListStart = genericListStart orderedListMarker
@@ -927,7 +972,7 @@ parseInlines = trimInlinesF . mconcat <$> many1 inline
 
 -- treat these as potentially non-text when parsing inline:
 specialChars :: [Char]
-specialChars = "\"$'()*+-./:<=>[\\]^_{|}~"
+specialChars = "\"$'()*+-,./:<=>[\\]^_{|}~"
 
 
 whitespace :: OrgParser (F Inlines)
@@ -1054,7 +1099,7 @@ linkOrImage = explicitOrImageLink
 explicitOrImageLink :: OrgParser (F Inlines)
 explicitOrImageLink = try $ do
   char '['
-  srcF   <- applyCustomLinkFormat =<< linkTarget
+  srcF   <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget
   title  <- enclosedRaw (char '[') (char ']')
   title' <- parseFromString (mconcat <$> many inline) title
   char ']'
@@ -1087,6 +1132,9 @@ selfTarget = try $ char '[' *> linkTarget <* char ']'
 linkTarget :: OrgParser String
 linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]")
 
+possiblyEmptyLinkTarget :: OrgParser String
+possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]")
+
 applyCustomLinkFormat :: String -> OrgParser (F String)
 applyCustomLinkFormat link = do
   let (linkType, rest) = break (== ':') link
@@ -1094,27 +1142,38 @@ applyCustomLinkFormat link = do
     formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters
     return $ maybe link ($ drop 1 rest) formatter
 
-
+-- TODO: might be a lot smarter/cleaner to use parsec and ADTs for this kind
+-- of parsing.
 linkToInlinesF :: String -> Inlines -> F Inlines
-linkToInlinesF s@('#':_) = pure . B.link s ""
-linkToInlinesF s
-    | isImageFilename s = const . pure $ B.image s "" ""
-    | isUri s           = pure . B.link s ""
-    | isRelativeUrl s   = pure . B.link s ""
-linkToInlinesF s = \title -> do
-  anchorB <- (s `elem`) <$> asksF orgStateAnchorIds
-  if anchorB
-    then pure $ B.link ('#':s) "" title
-    else pure $ B.emph title
-
-isRelativeUrl :: String -> Bool
-isRelativeUrl s = (':' `notElem` s) && ("./" `isPrefixOf` s)
+linkToInlinesF s =
+  case s of
+    ""      -> pure . B.link "" ""
+    ('#':_) -> pure . B.link s ""
+    _ | isImageFilename s     -> const . pure $ B.image s "" ""
+    _ | isFileLink s          -> pure . B.link (dropLinkType s) ""
+    _ | isUri s               -> pure . B.link s ""
+    _ | isAbsoluteFilePath s  -> pure . B.link ("file://" ++ s) ""
+    _ | isRelativeFilePath s  -> pure . B.link s ""
+    _                         -> internalLink s
+
+isFileLink :: String -> Bool
+isFileLink s = ("file:" `isPrefixOf` s) && not ("file://" `isPrefixOf` s)
+
+dropLinkType :: String -> String
+dropLinkType = tail . snd . break (== ':')
+
+isRelativeFilePath :: String -> Bool
+isRelativeFilePath s = (("./" `isPrefixOf` s) || ("../" `isPrefixOf` s)) &&
+                       (':' `notElem` s)
 
 isUri :: String -> Bool
 isUri s = let (scheme, path) = break (== ':') s
           in all (\c -> isAlphaNum c || c `elem` ".-") scheme
              && not (null path)
 
+isAbsoluteFilePath :: String -> Bool
+isAbsoluteFilePath = ('/' ==) . head
+
 isImageFilename :: String -> Bool
 isImageFilename filename =
   any (\x -> ('.':x)  `isSuffixOf` filename) imageExtensions &&
@@ -1124,6 +1183,13 @@ isImageFilename filename =
    imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ]
    protocols = [ "file", "http", "https" ]
 
+internalLink :: String -> Inlines -> F Inlines
+internalLink link title = do
+  anchorB <- (link `elem`) <$> asksF orgStateAnchorIds
+  if anchorB
+    then return $ B.link ('#':link) "" title
+    else return $ B.emph title
+
 -- | Parse an anchor like @<<anchor-id>>@ and return an empty span with
 -- @anchor-id@ set as id.  Legal anchors in org-mode are defined through
 -- @org-target-regexp@, which is fairly liberal.  Since no link is created if
@@ -1205,10 +1271,10 @@ displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]"
                                                 ]
 symbol :: OrgParser (F Inlines)
 symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions)
- where updatePositions c
-           | c `elem` emphasisPreChars = c <$ updateLastPreCharPos
-           | c `elem` emphasisForbiddenBorderChars = c <$ updateLastForbiddenCharPos
-           | otherwise = return c
+ where updatePositions c = do
+         when (c `elem` emphasisPreChars) updateLastPreCharPos
+         when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos
+         return c
 
 emphasisBetween :: Char
                 -> OrgParser (F Inlines)
@@ -1387,7 +1453,8 @@ simpleSubOrSuperString = try $
 inlineLaTeX :: OrgParser (F Inlines)
 inlineLaTeX = try $ do
   cmd <- inlineLaTeXCommand
-  maybe mzero returnF $ parseAsMath cmd `mplus` parseAsInlineLaTeX cmd
+  maybe mzero returnF $
+     parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` parseAsInlineLaTeX cmd
  where
    parseAsMath :: String -> Maybe Inlines
    parseAsMath cs = B.fromList <$> texMathToPandoc cs
@@ -1395,6 +1462,11 @@ inlineLaTeX = try $ do
    parseAsInlineLaTeX :: String -> Maybe Inlines
    parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs
 
+   parseAsMathMLSym :: String -> Maybe Inlines
+   parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs)
+    -- dropWhileEnd would be nice here, but it's not available before base 4.5
+    where clean = reverse . dropWhile (`elem` "{}") . reverse . drop 1
+
    state :: ParserState
    state = def{ stateOptions = def{ readerParseRaw = True }}