aboutsummaryrefslogtreecommitdiff
path: root/src/Text
diff options
context:
space:
mode:
authorAlbert Krewinkel <tarleb@moltkeplatz.de>2014-04-08 22:39:25 +0200
committerAlbert Krewinkel <tarleb@moltkeplatz.de>2014-04-09 15:26:06 +0200
commit030020236c85c736892a6f8e0dcefca1681e5ce0 (patch)
treed547e6f97d1a9dfa819b410ffc837f51e49700f6 /src/Text
parentbfd598e1e93d9a15c8b69c167c5d0e5dd1503017 (diff)
downloadpandoc-030020236c85c736892a6f8e0dcefca1681e5ce0.tar.gz
Org reader: Precise rules for the recognition of markup
The inline parsers have been rewritten using the org source code as a reference. This fixes a couple of bugs related to erroneous markup recognition.
Diffstat (limited to 'src/Text')
-rw-r--r--src/Text/Pandoc/Readers/Org.hs380
1 files changed, 260 insertions, 120 deletions
diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs
index 2bb6ee122..392b17bbc 100644
--- a/src/Text/Pandoc/Readers/Org.hs
+++ b/src/Text/Pandoc/Readers/Org.hs
@@ -32,11 +32,12 @@ import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>), HasMeta(..))
import Text.Pandoc.Definition
import Text.Pandoc.Options
-import Text.Pandoc.Parsing hiding (orderedListMarker, updateLastStrPos)
+import qualified Text.Pandoc.Parsing as P
+import Text.Pandoc.Parsing hiding (newline, orderedListMarker, updateLastStrPos)
import Text.Pandoc.Shared (compactify')
import Control.Applicative (pure, (<$>), (<$), (<*>), (<*), (*>), (<**>))
-import Control.Monad (guard, mzero)
+import Control.Monad (guard, when)
import Data.Char (toLower)
import Data.Default
import Data.List (foldl', isPrefixOf, isSuffixOf)
@@ -47,49 +48,100 @@ import Data.Monoid (mconcat, mempty, mappend)
readOrg :: ReaderOptions -- ^ Reader options
-> String -- ^ String to parse (assuming @'\n'@ line endings)
-> Pandoc
-readOrg opts s = (readWith parseOrg) def{ orgOptions = opts } (s ++ "\n\n")
+readOrg opts s = (readWith parseOrg) def{ orgStateOptions = opts } (s ++ "\n\n")
type OrgParser = Parser [Char] OrgParserState
+parseOrg:: OrgParser Pandoc
+parseOrg = do
+ blocks' <- B.toList <$> parseBlocks
+ st <- getState
+ let meta = orgStateMeta st
+ return $ Pandoc meta $ filter (/= Null) blocks'
+
+--
+-- Parser State for Org
+--
+
-- | Org-mode parser state
data OrgParserState = OrgParserState
- { orgOptions :: ReaderOptions
- , orgInlineCharStack :: [Char]
- , orgLastStrPos :: Maybe SourcePos
- , orgMeta :: Meta
+ { orgStateOptions :: ReaderOptions
+ , orgStateEmphasisCharStack :: [Char]
+ , orgStateEmphasisNewlines :: Maybe Int
+ , orgStateLastForbiddenCharPos :: Maybe SourcePos
+ , orgStateLastPreCharPos :: Maybe SourcePos
+ , orgStateLastStrPos :: Maybe SourcePos
+ , orgStateMeta :: Meta
} deriving (Show)
instance HasReaderOptions OrgParserState where
- extractReaderOptions = orgOptions
+ extractReaderOptions = orgStateOptions
instance HasMeta OrgParserState where
setMeta field val st =
- st{ orgMeta = setMeta field val $ orgMeta st }
+ st{ orgStateMeta = setMeta field val $ orgStateMeta st }
deleteMeta field st =
- st{ orgMeta = deleteMeta field $ orgMeta st }
+ st{ orgStateMeta = deleteMeta field $ orgStateMeta st }
instance Default OrgParserState where
def = defaultOrgParserState
defaultOrgParserState :: OrgParserState
defaultOrgParserState = OrgParserState
- { orgOptions = def
- , orgInlineCharStack = []
- , orgLastStrPos = Nothing
- , orgMeta = nullMeta
+ { orgStateOptions = def
+ , orgStateEmphasisCharStack = []
+ , orgStateEmphasisNewlines = Nothing
+ , orgStateLastForbiddenCharPos = Nothing
+ , orgStateLastPreCharPos = Nothing
+ , orgStateLastStrPos = Nothing
+ , orgStateMeta = nullMeta
}
updateLastStrPos :: OrgParser ()
updateLastStrPos = getPosition >>= \p ->
- updateState $ \s -> s{ orgLastStrPos = Just p }
+ updateState $ \s -> s{ orgStateLastStrPos = Just p }
+updateLastForbiddenCharPos :: OrgParser ()
+updateLastForbiddenCharPos = getPosition >>= \p ->
+ updateState $ \s -> s{ orgStateLastForbiddenCharPos = Just p}
-parseOrg:: OrgParser Pandoc
-parseOrg = do
- blocks' <- B.toList <$> parseBlocks
+updateLastPreCharPos :: OrgParser ()
+updateLastPreCharPos = getPosition >>= \p ->
+ updateState $ \s -> s{ orgStateLastPreCharPos = Just p}
+
+pushToInlineCharStack :: Char -> OrgParser ()
+pushToInlineCharStack c = updateState $ \st ->
+ st { orgStateEmphasisCharStack = c:(orgStateEmphasisCharStack st) }
+
+popInlineCharStack :: OrgParser ()
+popInlineCharStack = updateState $ \st ->
+ st { orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ st }
+
+surroundingEmphasisChar :: OrgParser [Char]
+surroundingEmphasisChar = take 1 . drop 1 . orgStateEmphasisCharStack <$> getState
+
+startEmphasisNewlinesCounting :: Int -> OrgParser ()
+startEmphasisNewlinesCounting maxNewlines = updateState $ \s ->
+ s { orgStateEmphasisNewlines = Just maxNewlines }
+
+decEmphasisNewlinesCount :: OrgParser ()
+decEmphasisNewlinesCount = updateState $ \s ->
+ s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s }
+
+newlinesCountWithinLimits :: OrgParser Bool
+newlinesCountWithinLimits = do
st <- getState
- let meta = orgMeta st
- return $ Pandoc meta $ filter (/= Null) blocks'
+ return $ ((< 0) <$> orgStateEmphasisNewlines st) /= Just True
+
+resetEmphasisNewlines :: OrgParser ()
+resetEmphasisNewlines = updateState $ \s ->
+ s{ orgStateEmphasisNewlines = Nothing }
+
+newline :: OrgParser Char
+newline =
+ P.newline
+ <* updateLastPreCharPos
+ <* updateLastForbiddenCharPos
--
-- parsing blocks
@@ -218,7 +270,7 @@ commentLineStart = try $ mappend <$> many spaceChar <*> string "# "
declarationLine :: OrgParser Blocks
declarationLine = try $ do
meta' <- B.setMeta <$> metaKey <*> metaValue <*> pure nullMeta
- updateState $ \st -> st { orgMeta = orgMeta st <> meta' }
+ updateState $ \st -> st { orgStateMeta = orgStateMeta st <> meta' }
return mempty
metaValue :: OrgParser MetaValue
@@ -449,22 +501,24 @@ anyLineNewline = (++ "\n") <$> anyLine
--
inline :: OrgParser Inlines
-inline = choice inlineParsers <?> "inline"
- where inlineParsers = [ whitespace
- , link
- , str
- , endline
- , emph
- , strong
- , strikeout
- , underline
- , code
- , math
- , verbatim
- , subscript
- , superscript
- , symbol
- ]
+inline =
+ choice [ whitespace
+ , link
+ , str
+ , endline
+ , emph
+ , strong
+ , strikeout
+ , underline
+ , code
+ , math
+ , verbatim
+ , subscript
+ , superscript
+ , symbol
+ ] <* (guard =<< newlinesCountWithinLimits)
+ <?> "inline"
+
-- treat these as potentially non-text when parsing inline:
specialChars :: [Char]
@@ -472,7 +526,10 @@ specialChars = "\"$'()*+-./:<=>[\\]^_{|}~"
whitespace :: OrgParser Inlines
-whitespace = B.space <$ skipMany1 spaceChar <?> "whitespace"
+whitespace = B.space <$ skipMany1 spaceChar
+ <* updateLastPreCharPos
+ <* updateLastForbiddenCharPos
+ <?> "whitespace"
str :: OrgParser Inlines
str = B.str <$> many1 (noneOf $ specialChars ++ "\n\r ")
@@ -492,6 +549,9 @@ endline = try $ do
notFollowedBy' commentLineStart
notFollowedBy' bulletListStart
notFollowedBy' orderedListStart
+ decEmphasisNewlinesCount
+ guard =<< newlinesCountWithinLimits
+ updateLastPreCharPos
return B.space
link :: OrgParser Inlines
@@ -500,42 +560,54 @@ link = explicitOrImageLink <|> selflinkOrImage <?> "link"
explicitOrImageLink :: OrgParser Inlines
explicitOrImageLink = try $ do
char '['
- src <- enclosedRaw (char '[') (char ']')
+ src <- linkTarget
title <- enclosedRaw (char '[') (char ']')
title' <- parseFromString (mconcat <$> many inline) title
char ']'
- return $ if (isImage src) && (isImage title)
+ return $ if (isImageFilename src) && (isImageFilename title)
then B.link src "" (B.image title "" "")
else B.link src "" title'
selflinkOrImage :: OrgParser Inlines
selflinkOrImage = try $ do
- src <- enclosedRaw (string "[[") (string "]]")
- return $ if isImage src
+ src <- (char '[') *> linkTarget <* char ']'
+ return $ if isImageFilename src
then B.image src "" ""
else B.link src "" (B.str src)
+linkTarget :: OrgParser String
+linkTarget = enclosed (char '[') (char ']') (noneOf "\n\r]")
+
+isImageFilename :: String -> Bool
+isImageFilename filename =
+ any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions &&
+ any (\x -> (x++":") `isPrefixOf` filename) protocols ||
+ ':' `notElem` filename
+ where
+ imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ]
+ protocols = [ "file", "http", "https" ]
+
emph :: OrgParser Inlines
-emph = B.emph <$> inlinesEnclosedBy '/'
+emph = B.emph <$> emphasisBetween '/'
strong :: OrgParser Inlines
-strong = B.strong <$> inlinesEnclosedBy '*'
+strong = B.strong <$> emphasisBetween '*'
strikeout :: OrgParser Inlines
-strikeout = B.strikeout <$> inlinesEnclosedBy '+'
+strikeout = B.strikeout <$> emphasisBetween '+'
-- There is no underline, so we use strong instead.
underline :: OrgParser Inlines
-underline = B.strong <$> inlinesEnclosedBy '_'
+underline = B.strong <$> emphasisBetween '_'
code :: OrgParser Inlines
-code = B.code <$> rawEnclosedBy '='
-
-math :: OrgParser Inlines
-math = B.math <$> rawEnclosedBy '$'
+code = B.code <$> verbatimBetween '='
verbatim :: OrgParser Inlines
-verbatim = B.rawInline "" <$> rawEnclosedBy '~'
+verbatim = B.rawInline "" <$> verbatimBetween '~'
+
+math :: OrgParser Inlines
+math = B.math <$> mathStringBetween '$'
subscript :: OrgParser Inlines
subscript = B.subscript <$> (try $ char '_' *> maybeGroupedByBraces)
@@ -550,7 +622,72 @@ maybeGroupedByBraces = try $
]
symbol :: OrgParser Inlines
-symbol = B.str . (: "") <$> oneOf specialChars
+symbol = B.str . (: "") <$> (oneOf specialChars >>= updatePositions)
+ where updatePositions c
+ | c `elem` emphasisPreChars = c <$ updateLastPreCharPos
+ | c `elem` emphasisForbiddenBorderChars = c <$ updateLastForbiddenCharPos
+ | otherwise = return c
+
+emphasisBetween :: Char
+ -> OrgParser Inlines
+emphasisBetween c = try $ do
+ startEmphasisNewlinesCounting emphasisAllowedNewlines
+ res <- enclosedInlines (emphasisStart c) (emphasisEnd c)
+ isTopLevelEmphasis <- null . orgStateEmphasisCharStack <$> getState
+ when isTopLevelEmphasis
+ resetEmphasisNewlines
+ return res
+
+verbatimBetween :: Char
+ -> OrgParser String
+verbatimBetween c = try $
+ emphasisStart c *>
+ many1TillNOrLessNewlines 1 (noneOf "\n\r") (emphasisEnd c)
+
+-- | Parses a raw string delimited by @c@ using Org's math rules
+mathStringBetween :: Char
+ -> OrgParser String
+mathStringBetween c = try $ do
+ mathStart c
+ body <- many1TillNOrLessNewlines mathAllowedNewlines
+ (noneOf (c:"\n\r"))
+ (lookAhead $ mathEnd c)
+ final <- mathEnd c
+ return $ body ++ [final]
+
+-- | Parses the start (opening character) of emphasis
+emphasisStart :: Char -> OrgParser Char
+emphasisStart c = try $ do
+ guard =<< afterEmphasisPreChar
+ guard =<< notAfterString
+ char c
+ lookAhead (noneOf emphasisForbiddenBorderChars)
+ pushToInlineCharStack c
+ return c
+
+-- | Parses the closing character of emphasis
+emphasisEnd :: Char -> OrgParser Char
+emphasisEnd c = try $ do
+ guard =<< notAfterForbiddenBorderChar
+ char c
+ eof <|> lookAhead (surroundingEmphasisChar >>= \x ->
+ oneOf (x ++ emphasisPostChars))
+ *> return ()
+ updateLastStrPos
+ popInlineCharStack
+ return c
+
+mathStart :: Char -> OrgParser Char
+mathStart c = try $ do
+ char c <* notFollowedBy' (oneOf (c:mathForbiddenBorderChars))
+
+mathEnd :: Char -> OrgParser Char
+mathEnd c = try $ do
+ res <- noneOf (c:mathForbiddenBorderChars)
+ char c
+ eof <|> (lookAhead $ oneOf mathPostChars *> pure ())
+ return res
+
enclosedInlines :: OrgParser a
-> OrgParser b
@@ -558,16 +695,6 @@ enclosedInlines :: OrgParser a
enclosedInlines start end = try $
trimInlines . mconcat <$> enclosed start end inline
--- FIXME: This is a hack
-inlinesEnclosedBy :: Char
- -> OrgParser Inlines
-inlinesEnclosedBy c = try $ do
- updateState $ \st -> st { orgInlineCharStack = c:(orgInlineCharStack st) }
- res <- enclosedInlines (atStart (char c) <* endsOnThisOrNextLine c)
- (atEnd $ char c)
- updateState $ \st -> st { orgInlineCharStack = drop 1 . orgInlineCharStack $ st }
- return res
-
enclosedRaw :: OrgParser a
-> OrgParser b
-> OrgParser String
@@ -577,63 +704,76 @@ enclosedRaw start end = try $
spanningTwoLines = try $
anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine
-rawEnclosedBy :: Char
- -> OrgParser String
-rawEnclosedBy c = enclosedRaw (atStart $ char c) (atEnd $ char c)
-
--- succeeds only if we're not right after a str (ie. in middle of word)
-atStart :: OrgParser a -> OrgParser a
-atStart p = do
- guard =<< not <$> isRightAfterString
- p
-
--- | succeeds only if we're at the end of a word
-atEnd :: OrgParser a -> OrgParser a
-atEnd p = try $ do
- p <* lookingAtEndOfWord
- where lookingAtEndOfWord =
- eof <|> const (return ()) =<< lookAhead . oneOf =<< postWordChars
-
-isRightAfterString :: OrgParser Bool
-isRightAfterString = do
+-- | Like many1Till, but parses at most @n+1@ lines. @p@ must not consume
+-- newlines.
+many1TillNOrLessNewlines :: Int
+ -> OrgParser Char
+ -> OrgParser a
+ -> OrgParser String
+many1TillNOrLessNewlines n p end = try $
+ nMoreLines (Just n) mempty >>= oneOrMore
+ where
+ nMoreLines Nothing cs = return cs
+ nMoreLines (Just 0) cs = try $ (cs ++) <$> finalLine
+ nMoreLines k cs = try $ (final k cs <|> rest k cs)
+ >>= uncurry nMoreLines
+ final _ cs = (\x -> (Nothing, cs ++ x)) <$> (try finalLine)
+ rest m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> (try $ manyTill p P.newline)
+ finalLine = try $ manyTill p end
+ minus1 k = k - 1
+ oneOrMore cs = guard (not $ null cs) *> return cs
+
+-- Org allows customization of the way it reads emphasis. We use the defaults
+-- here (see, e.g., the Emacs Lisp variable `org-emphasis-regexp-components`
+-- for details).
+
+-- | Chars allowed to occur before emphasis (spaces and newlines are ok, too)
+emphasisPreChars :: [Char]
+emphasisPreChars = "\t \"'({"
+
+-- | Chars allowed at after emphasis
+emphasisPostChars :: [Char]
+emphasisPostChars = "\t\n !\"'),-.:;?\\}"
+
+-- | Chars not allowed at the (inner) border of emphasis
+emphasisForbiddenBorderChars :: [Char]
+emphasisForbiddenBorderChars = "\t\n\r \"',"
+
+-- | The maximum number of newlines within
+emphasisAllowedNewlines :: Int
+emphasisAllowedNewlines = 1
+
+-- LaTeX-style math: see `org-latex-regexps` for details
+
+-- | Chars allowed after an inline ($...$) math statement
+mathPostChars :: [Char]
+mathPostChars = "\t\n \"',-.:;?"
+
+-- | Chars not allowed at the (inner) border of math
+mathForbiddenBorderChars :: [Char]
+mathForbiddenBorderChars = "\t\n\r ,;.$"
+
+-- | Maximum number of newlines in an inline math statement
+mathAllowedNewlines :: Int
+mathAllowedNewlines = 2
+
+-- | Whether we are right behind a char allowed before emphasis
+afterEmphasisPreChar :: OrgParser Bool
+afterEmphasisPreChar = do
pos <- getPosition
- st <- getState
- -- the position `Nothing` isn't after a String, either, hence the double
- -- negation
- return $ not $ orgLastStrPos st /= Just pos
+ lastPrePos <- orgStateLastPreCharPos <$> getState
+ return $ lastPrePos == Nothing || lastPrePos == Just pos
-postWordChars :: OrgParser [Char]
-postWordChars = do
- st <- getState
- return $ "\t\n\r !\"'),-.:?}" ++ (take 1 . drop 1 . orgInlineCharStack $ st)
-
--- FIXME: These functions are hacks and should be replaced
-endsOnThisOrNextLine :: Char
- -> OrgParser ()
-endsOnThisOrNextLine c = do
- inp <- getInput
- let doOtherwise = \rest -> endsOnThisLine rest c (const mzero)
- endsOnThisLine inp c doOtherwise
-
-endsOnThisLine :: [Char]
- -> Char
- -> ([Char] -> OrgParser ())
- -> OrgParser ()
-endsOnThisLine input c doOnOtherLines = do
- postWordChars' <- postWordChars
- case break (`elem` c:"\n") input of
- (_,'\n':rest) -> doOnOtherLines rest
- (_,_:[]) -> return ()
- (_,_:rest@(n:_)) -> if n `elem` postWordChars'
- then return ()
- else endsOnThisLine rest c doOnOtherLines
- _ -> mzero
-
-isImage :: String -> Bool
-isImage filename =
- any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions &&
- any (\x -> (x++":") `isPrefixOf` filename) protocols ||
- ':' `notElem` filename
- where
- imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ]
- protocols = [ "file", "http", "https" ]
+-- | Whether we are right after the end of a string
+notAfterString :: OrgParser Bool
+notAfterString = do
+ pos <- getPosition
+ lastStrPos <- orgStateLastStrPos <$> getState
+ return $ lastStrPos /= Just pos
+
+-- | Whether the parser is right after a forbidden border char
+notAfterForbiddenBorderChar :: OrgParser Bool
+notAfterForbiddenBorderChar = do
+ pos <- getPosition
+ lastFBCPos <- orgStateLastForbiddenCharPos <$> getState
+ return $ lastFBCPos /= Just pos