1 files changed, 202 insertions, 55 deletions
diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs
index 5c00a1b27..980f63504 100644
--- a/src/Text/Pandoc/Readers/Org.hs
+++ b/src/Text/Pandoc/Readers/Org.hs
@@ -1,7 +1,8 @@
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-# LANGUAGE MultiParamTypeClasses, FlexibleContexts, FlexibleInstances #-}
 {-
-Copyright (C) 2014 Albert Krewinkel <tarleb@moltkeplatz.de>
+Copyright (C) 2014-2015 Albert Krewinkel <tarleb@moltkeplatz.de>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -42,12 +43,13 @@ import           Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF
 import           Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline)
 import           Text.Pandoc.Shared (compactify', compactify'DL)
 import           Text.TeXMath (readTeX, writePandoc, DisplayType(..))
+import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap
 
 import           Control.Applicative ( Applicative, pure
                                      , (<$>), (<$), (<*>), (<*), (*>) )
 import           Control.Arrow (first)
 import           Control.Monad (foldM, guard, liftM, liftM2, mplus, mzero, when)
-import           Control.Monad.Reader (Reader, runReader, ask, asks)
+import           Control.Monad.Reader (Reader, runReader, ask, asks, local)
 import           Data.Char (isAlphaNum, toLower)
 import           Data.Default
 import           Data.List (intersperse, isPrefixOf, isSuffixOf)
@@ -56,20 +58,57 @@ import           Data.Maybe (fromMaybe, isJust)
 import           Data.Monoid (Monoid, mconcat, mempty, mappend)
 import           Network.HTTP (urlEncode)
 
+import           Text.Pandoc.Error
+
 -- | Parse org-mode string and return a Pandoc document.
 readOrg :: ReaderOptions -- ^ Reader options
         -> String        -- ^ String to parse (assuming @'\n'@ line endings)
-        -> Pandoc
-readOrg opts s = readWith parseOrg def{ orgStateOptions = opts } (s ++ "\n\n")
+        -> Either PandocError Pandoc
+readOrg opts s = flip runReader def $ readWithM parseOrg def{ orgStateOptions = opts } (s ++ "\n\n")
+
+data OrgParserLocal = OrgParserLocal { orgLocalQuoteContext :: QuoteContext }
 
-type OrgParser = Parser [Char] OrgParserState
+type OrgParser = ParserT [Char] OrgParserState (Reader OrgParserLocal)
 
 parseOrg :: OrgParser Pandoc
 parseOrg = do
   blocks' <- parseBlocks
   st <- getState
   let meta = runF (orgStateMeta' st) st
-  return $ Pandoc meta $ filter (/= Null) (B.toList $ runF blocks' st)
+  let removeUnwantedBlocks = dropCommentTrees . filter (/= Null)
+  return $ Pandoc meta $ removeUnwantedBlocks (B.toList $ runF blocks' st)
+
+-- | Drop COMMENT headers and the document tree below those headers.
+dropCommentTrees :: [Block] -> [Block]
+dropCommentTrees [] = []
+dropCommentTrees blks@(b:bs) =
+  maybe blks (flip dropUntilHeaderAboveLevel bs) $ commentHeaderLevel b
+
+-- | Return the level of a header starting a comment or :noexport: tree and
+--  Nothing otherwise.
+commentHeaderLevel :: Block -> Maybe Int
+commentHeaderLevel blk =
+   case blk of
+     (Header level _ ((Str "COMMENT"):_))          -> Just level
+     (Header level _ title) | hasNoExportTag title -> Just level
+     _                                             -> Nothing
+ where
+   hasNoExportTag :: [Inline] -> Bool
+   hasNoExportTag = any isNoExportTag
+
+   isNoExportTag :: Inline -> Bool
+   isNoExportTag (Span ("", ["tag"], [("data-tag-name", "noexport")]) []) = True
+   isNoExportTag _ = False
+
+-- | Drop blocks until a header on or above the given level is seen
+dropUntilHeaderAboveLevel :: Int -> [Block] -> [Block]
+dropUntilHeaderAboveLevel n = dropWhile (not . isHeaderLevelLowerEq n)
+
+isHeaderLevelLowerEq :: Int -> Block -> Bool
+isHeaderLevelLowerEq n blk =
+  case blk of
+    (Header level _ _) -> n >= level
+    _                  -> False
 
 --
 -- Parser State for Org
@@ -98,6 +137,9 @@ data OrgParserState = OrgParserState
                       , orgStateNotes'               :: OrgNoteTable
                       }
 
+instance Default OrgParserLocal where
+  def = OrgParserLocal NoQuote
+
 instance HasReaderOptions OrgParserState where
   extractReaderOptions = orgStateOptions
 
@@ -111,6 +153,10 @@ instance HasLastStrPosition OrgParserState where
   getLastStrPos = orgStateLastStrPos
   setLastStrPos pos st = st{ orgStateLastStrPos = Just pos }
 
+instance HasQuoteContext st (Reader OrgParserLocal) where
+  getQuoteContext = asks orgLocalQuoteContext
+  withQuoteContext q = local (\s -> s{orgLocalQuoteContext = q})
+
 instance Default OrgParserState where
   def = defaultOrgParserState
 
@@ -134,19 +180,6 @@ recordAnchorId :: String -> OrgParser ()
 recordAnchorId i = updateState $ \s ->
   s{ orgStateAnchorIds = i : (orgStateAnchorIds s) }
 
-addBlockAttribute :: String -> String -> OrgParser ()
-addBlockAttribute key val = updateState $ \s ->
-  let attrs = orgStateBlockAttributes s
-  in s{ orgStateBlockAttributes = M.insert key val attrs }
-
-lookupBlockAttribute :: String -> OrgParser (Maybe String)
-lookupBlockAttribute key =
-  M.lookup key . orgStateBlockAttributes <$> getState
-
-resetBlockAttributes :: OrgParser ()
-resetBlockAttributes = updateState $ \s ->
-  s{ orgStateBlockAttributes = orgStateBlockAttributes def }
-
 updateLastForbiddenCharPos :: OrgParser ()
 updateLastForbiddenCharPos = getPosition >>= \p ->
   updateState $ \s -> s{ orgStateLastForbiddenCharPos = Just p}
@@ -274,9 +307,18 @@ block = choice [ mempty <$ blanklines
                , paraOrPlain
                ] <?> "block"
 
+--
+-- Block Attributes
+--
+
+-- | Parse optional block attributes (like #+TITLE or #+NAME)
 optionalAttributes :: OrgParser (F Blocks) -> OrgParser (F Blocks)
 optionalAttributes parser = try $
   resetBlockAttributes *> parseBlockAttributes *> parser
+ where
+   resetBlockAttributes :: OrgParser ()
+   resetBlockAttributes = updateState $ \s ->
+     s{ orgStateBlockAttributes = orgStateBlockAttributes def }
 
 parseBlockAttributes :: OrgParser ()
 parseBlockAttributes = do
@@ -301,6 +343,15 @@ lookupInlinesAttr attr = try $ do
         (fmap Just . parseFromString parseInlines)
         val
 
+addBlockAttribute :: String -> String -> OrgParser ()
+addBlockAttribute key val = updateState $ \s ->
+  let attrs = orgStateBlockAttributes s
+  in s{ orgStateBlockAttributes = M.insert key val attrs }
+
+lookupBlockAttribute :: String -> OrgParser (Maybe String)
+lookupBlockAttribute key =
+  M.lookup key . orgStateBlockAttributes <$> getState
+
 
 --
 -- Org Blocks (#+BEGIN_... / #+END_...)
@@ -356,11 +407,11 @@ exportsResults :: [(String, String)] -> Bool
 exportsResults attrs = ("rundoc-exports", "results") `elem` attrs
                        || ("rundoc-exports", "both") `elem` attrs
 
-followingResultsBlock :: OrgParser (Maybe String)
+followingResultsBlock :: OrgParser (Maybe (F Blocks))
 followingResultsBlock =
        optionMaybe (try $ blanklines *> stringAnyCase "#+RESULTS:"
                                      *> blankline
-                                     *> (unlines <$> many1 exampleLine))
+                                     *> block)
 
 codeBlock :: BlockProperties -> OrgParser (F Blocks)
 codeBlock blkProp = do
@@ -375,7 +426,7 @@ codeBlock blkProp = do
   labelledBlck      <- maybe (pure codeBlck)
                              (labelDiv codeBlck)
                              <$> lookupInlinesAttr "caption"
-  let resultBlck     = pure $ maybe mempty (exampleCode) resultsContent
+  let resultBlck     = fromMaybe mempty resultsContent
   return $ (if includeCode then labelledBlck else mempty)
            <> (if includeResults then resultBlck else mempty)
  where
@@ -614,8 +665,25 @@ parseFormat = try $ do
 header :: OrgParser (F Blocks)
 header = try $ do
   level <- headerStart
-  title <- inlinesTillNewline
-  return $ B.header level <$> title
+  title <- manyTill inline (lookAhead headerEnd)
+  tags <- headerEnd
+  let inlns = trimInlinesF . mconcat $ title <> map tagToInlineF tags
+  return $ B.header level <$> inlns
+ where
+   tagToInlineF :: String -> F Inlines
+   tagToInlineF t = return $ B.spanWith ("", ["tag"], [("data-tag-name", t)]) mempty
+
+headerEnd :: OrgParser [String]
+headerEnd = option [] headerTags <* newline
+
+headerTags :: OrgParser [String]
+headerTags = try $
+  skipSpaces
+  *> char ':'
+  *> many1 tag
+  <* skipSpaces
+ where tag = many1 (alphaNum <|> oneOf "@%#_")
+             <* char ':'
 
 headerStart :: OrgParser Int
 headerStart = try $
@@ -828,12 +896,14 @@ list :: OrgParser (F Blocks)
 list = choice [ definitionList, bulletList, orderedList ] <?> "list"
 
 definitionList :: OrgParser (F Blocks)
-definitionList = fmap B.definitionList . fmap compactify'DL . sequence
-                 <$> many1 (definitionListItem bulletListStart)
+definitionList = try $ do n <- lookAhead (bulletListStart' Nothing)
+                          fmap B.definitionList . fmap compactify'DL . sequence
+                            <$> many1 (definitionListItem $ bulletListStart' (Just n))
 
 bulletList :: OrgParser (F Blocks)
-bulletList = fmap B.bulletList . fmap compactify' . sequence
-             <$> many1 (listItem bulletListStart)
+bulletList = try $ do n <- lookAhead (bulletListStart' Nothing)
+                      fmap B.bulletList . fmap compactify' . sequence
+                        <$> many1 (listItem (bulletListStart' $ Just n))
 
 orderedList :: OrgParser (F Blocks)
 orderedList = fmap B.orderedList . fmap compactify' . sequence
@@ -845,10 +915,27 @@ genericListStart listMarker = try $
   (+) <$> (length <$> many spaceChar)
       <*> (length <$> listMarker <* many1 spaceChar)
 
--- parses bullet list start and returns its length (excl. following whitespace)
+-- parses bullet list marker. maybe we know the indent level
 bulletListStart :: OrgParser Int
-bulletListStart = genericListStart bulletListMarker
-  where bulletListMarker = pure <$> oneOf "*-+"
+bulletListStart = bulletListStart' Nothing
+
+bulletListStart' :: Maybe Int -> OrgParser Int
+-- returns length of bulletList prefix, inclusive of marker
+bulletListStart' Nothing  = do ind <- length <$> many spaceChar
+                               when (ind == 0) $ notFollowedBy (char '*')
+                               oneOf bullets
+                               many1 spaceChar
+                               return (ind + 1)
+ -- Unindented lists are legal, but they can't use '*' bullets
+ -- We return n to maintain compatibility with the generic listItem
+bulletListStart' (Just n) = do count (n-1) spaceChar
+                               when (n == 1) $ notFollowedBy (char '*')
+                               oneOf bullets
+                               many1 spaceChar
+                               return n
+
+bullets :: String
+bullets = "*+-"
 
 orderedListStart :: OrgParser Int
 orderedListStart = genericListStart orderedListMarker
@@ -918,6 +1005,7 @@ inline =
          , subscript
          , superscript
          , inlineLaTeX
+         , smart
          , symbol
          ] <* (guard =<< newlinesCountWithinLimits)
   <?> "inline"
@@ -927,7 +1015,7 @@ parseInlines = trimInlinesF . mconcat <$> many1 inline
 
 -- treat these as potentially non-text when parsing inline:
 specialChars :: [Char]
-specialChars = "\"$'()*+-./:<=>[\\]^_{|}~"
+specialChars = "\"$'()*+-,./:<=>[\\]^_{|}~"
 
 
 whitespace :: OrgParser (F Inlines)
@@ -1054,13 +1142,13 @@ linkOrImage = explicitOrImageLink
 explicitOrImageLink :: OrgParser (F Inlines)
 explicitOrImageLink = try $ do
   char '['
-  srcF   <- applyCustomLinkFormat =<< linkTarget
+  srcF   <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget
   title  <- enclosedRaw (char '[') (char ']')
   title' <- parseFromString (mconcat <$> many inline) title
   char ']'
   return $ do
     src <- srcF
-    if isImageFilename src && isImageFilename title
+    if isImageFilename title
       then pure $ B.link src "" $ B.image title mempty mempty
       else linkToInlinesF src =<< title'
 
@@ -1087,6 +1175,9 @@ selfTarget = try $ char '[' *> linkTarget <* char ']'
 linkTarget :: OrgParser String
 linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]")
 
+possiblyEmptyLinkTarget :: OrgParser String
+possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]")
+
 applyCustomLinkFormat :: String -> OrgParser (F String)
 applyCustomLinkFormat link = do
   let (linkType, rest) = break (== ':') link
@@ -1094,27 +1185,38 @@ applyCustomLinkFormat link = do
     formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters
     return $ maybe link ($ drop 1 rest) formatter
 
-
+-- TODO: might be a lot smarter/cleaner to use parsec and ADTs for this kind
+-- of parsing.
 linkToInlinesF :: String -> Inlines -> F Inlines
-linkToInlinesF s@('#':_) = pure . B.link s ""
-linkToInlinesF s
-    | isImageFilename s = const . pure $ B.image s "" ""
-    | isUri s           = pure . B.link s ""
-    | isRelativeUrl s   = pure . B.link s ""
-linkToInlinesF s = \title -> do
-  anchorB <- (s `elem`) <$> asksF orgStateAnchorIds
-  if anchorB
-    then pure $ B.link ('#':s) "" title
-    else pure $ B.emph title
-
-isRelativeUrl :: String -> Bool
-isRelativeUrl s = (':' `notElem` s) && ("./" `isPrefixOf` s)
+linkToInlinesF s =
+  case s of
+    ""      -> pure . B.link "" ""
+    ('#':_) -> pure . B.link s ""
+    _ | isImageFilename s     -> const . pure $ B.image s "" ""
+    _ | isFileLink s          -> pure . B.link (dropLinkType s) ""
+    _ | isUri s               -> pure . B.link s ""
+    _ | isAbsoluteFilePath s  -> pure . B.link ("file://" ++ s) ""
+    _ | isRelativeFilePath s  -> pure . B.link s ""
+    _                         -> internalLink s
+
+isFileLink :: String -> Bool
+isFileLink s = ("file:" `isPrefixOf` s) && not ("file://" `isPrefixOf` s)
+
+dropLinkType :: String -> String
+dropLinkType = tail . snd . break (== ':')
+
+isRelativeFilePath :: String -> Bool
+isRelativeFilePath s = (("./" `isPrefixOf` s) || ("../" `isPrefixOf` s)) &&
+                       (':' `notElem` s)
 
 isUri :: String -> Bool
 isUri s = let (scheme, path) = break (== ':') s
-          in all (\c -> isAlphaNum c || c `elem` ".-") scheme
+          in all (\c -> isAlphaNum c || c `elem` (".-" :: String)) scheme
              && not (null path)
 
+isAbsoluteFilePath :: String -> Bool
+isAbsoluteFilePath = ('/' ==) . head
+
 isImageFilename :: String -> Bool
 isImageFilename filename =
   any (\x -> ('.':x)  `isSuffixOf` filename) imageExtensions &&
@@ -1124,6 +1226,13 @@ isImageFilename filename =
    imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ]
    protocols = [ "file", "http", "https" ]
 
+internalLink :: String -> Inlines -> F Inlines
+internalLink link title = do
+  anchorB <- (link `elem`) <$> asksF orgStateAnchorIds
+  if anchorB
+    then return $ B.link ('#':link) "" title
+    else return $ B.emph title
+
 -- | Parse an anchor like @<<anchor-id>>@ and return an empty span with
 -- @anchor-id@ set as id.  Legal anchors in org-mode are defined through
 -- @org-target-regexp@, which is fairly liberal.  Since no link is created if
@@ -1148,7 +1257,7 @@ solidify :: String -> String
 solidify = map replaceSpecialChar
  where replaceSpecialChar c
            | isAlphaNum c    = c
-           | c `elem` "_.-:" = c
+           | c `elem` ("_.-:" :: String) = c
            | otherwise       = '-'
 
 -- | Parses an inline code block and marks it as an babel block.
@@ -1203,12 +1312,16 @@ displayMath :: OrgParser (F Inlines)
 displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]"
                                                 , rawMathBetween "$$"  "$$"
                                                 ]
+
+updatePositions :: Char
+                -> OrgParser (Char)
+updatePositions c = do
+  when (c `elem` emphasisPreChars) updateLastPreCharPos
+  when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos
+  return c
+
 symbol :: OrgParser (F Inlines)
 symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions)
- where updatePositions c
-           | c `elem` emphasisPreChars = c <$ updateLastPreCharPos
-           | c `elem` emphasisForbiddenBorderChars = c <$ updateLastForbiddenCharPos
-           | otherwise = return c
 
 emphasisBetween :: Char
                 -> OrgParser (F Inlines)
@@ -1387,7 +1500,8 @@ simpleSubOrSuperString = try $
 inlineLaTeX :: OrgParser (F Inlines)
 inlineLaTeX = try $ do
   cmd <- inlineLaTeXCommand
-  maybe mzero returnF $ parseAsMath cmd `mplus` parseAsInlineLaTeX cmd
+  maybe mzero returnF $
+     parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` parseAsInlineLaTeX cmd
  where
    parseAsMath :: String -> Maybe Inlines
    parseAsMath cs = B.fromList <$> texMathToPandoc cs
@@ -1395,6 +1509,11 @@ inlineLaTeX = try $ do
    parseAsInlineLaTeX :: String -> Maybe Inlines
    parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs
 
+   parseAsMathMLSym :: String -> Maybe Inlines
+   parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs)
+    -- dropWhileEnd would be nice here, but it's not available before base 4.5
+    where clean = reverse . dropWhile (`elem` ("{}" :: String)) . reverse . drop 1
+
    state :: ParserState
    state = def{ stateOptions = def{ readerParseRaw = True }}
 
@@ -1413,3 +1532,31 @@ inlineLaTeXCommand = try $ do
       count len anyChar
       return cs
     _ -> mzero
+
+smart :: OrgParser (F Inlines)
+smart = do
+  getOption readerSmart >>= guard
+  doubleQuoted <|> singleQuoted <|>
+    choice (map (return <$>) [orgApostrophe, dash, ellipses])
+  where orgApostrophe =
+          (char '\'' <|> char '\8217') <* updateLastPreCharPos
+                                       <* updateLastForbiddenCharPos
+                                       *> return (B.str "\x2019")
+
+singleQuoted :: OrgParser (F Inlines)
+singleQuoted = try $ do
+  singleQuoteStart
+  withQuoteContext InSingleQuote $
+    fmap B.singleQuoted . trimInlinesF . mconcat <$>
+      many1Till inline singleQuoteEnd
+
+-- doubleQuoted will handle regular double-quoted sections, as well
+-- as dialogues with an open double-quote without a close double-quote
+-- in the same paragraph.
+doubleQuoted :: OrgParser (F Inlines)
+doubleQuoted = try $ do
+  doubleQuoteStart
+  contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline)
+  (withQuoteContext InDoubleQuote $ (doubleQuoteEnd <* updateLastForbiddenCharPos) >> return
+       (fmap B.doubleQuoted . trimInlinesF $ contents))
+   <|> (return $ return (B.str "\8220") <> contents)