diff options
-rw-r--r-- | src/Text/Pandoc/Parsing.hs | 93 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 79 |
2 files changed, 114 insertions, 58 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index eec4a3bc9..66ebca253 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -2,6 +2,7 @@ FlexibleContexts , GeneralizedNewtypeDeriving , TypeSynonymInstances +, MultiParamTypeClasses , FlexibleInstances #-} {- Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> @@ -80,6 +81,7 @@ module Text.Pandoc.Parsing ( anyLine, HeaderType (..), ParserContext (..), QuoteContext (..), + HasQuoteContext (..), NoteTable, NoteTable', KeyTable, @@ -88,7 +90,6 @@ module Text.Pandoc.Parsing ( anyLine, toKey, registerHeader, smartPunctuation, - withQuoteContext, singleQuoteStart, singleQuoteEnd, doubleQuoteStart, @@ -106,6 +107,7 @@ module Text.Pandoc.Parsing ( anyLine, runF, askF, asksF, + token, -- * Re-exports from Text.Pandoc.Parsec Stream, runParser, @@ -160,7 +162,6 @@ module Text.Pandoc.Parsing ( anyLine, setSourceColumn, setSourceLine, newPos, - token ) where @@ -170,7 +171,7 @@ import Text.Pandoc.Builder (Blocks, Inlines, rawBlock, HasMeta(..)) import qualified Text.Pandoc.Builder as B import Text.Pandoc.XML (fromEntities) import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn) -import Text.Parsec +import Text.Parsec hiding (token) import Text.Parsec.Pos (newPos) import Data.Char ( toLower, toUpper, ord, chr, isAscii, isAlphaNum, isHexDigit, isSpace ) @@ -484,7 +485,8 @@ mathDisplayWith op cl = try $ do string op many1Till (noneOf "\n" <|> (newline <* notFollowedBy' blankline)) (try $ string cl) -mathDisplay :: Stream s m Char => ParserT s ParserState m String +mathDisplay :: (HasReaderOptions st, Stream s m Char) + => ParserT s st m String mathDisplay = (guardEnabled Ext_tex_math_dollars >> mathDisplayWith "$$" "$$") <|> (guardEnabled Ext_tex_math_single_backslash >> @@ -492,7 +494,8 @@ mathDisplay = <|> (guardEnabled Ext_tex_math_double_backslash >> mathDisplayWith "\\\\[" "\\\\]") -mathInline :: Stream s m Char => ParserT s ParserState m String +mathInline :: (HasReaderOptions st , Stream s m Char) + => ParserT s st m String mathInline = (guardEnabled Ext_tex_math_dollars >> mathInlineWith "$" "$") <|> (guardEnabled Ext_tex_math_single_backslash >> @@ -909,6 +912,21 @@ class HasReaderOptions st where -- default getOption f = (f . extractReaderOptions) <$> getState +class HasQuoteContext st m where + getQuoteContext :: (Stream s m t) => ParsecT s st m QuoteContext + withQuoteContext :: QuoteContext -> ParsecT s st m a -> ParsecT s st m a + +instance Monad m => HasQuoteContext ParserState m where + getQuoteContext = stateQuoteContext <$> getState + withQuoteContext context parser = do + oldState <- getState + let oldQuoteContext = stateQuoteContext oldState + setState oldState { stateQuoteContext = context } + result <- parser + newState <- getState + setState newState { stateQuoteContext = oldQuoteContext } + return result + instance HasReaderOptions ParserState where extractReaderOptions = stateOptions @@ -1051,9 +1069,9 @@ registerHeader (ident,classes,kvs) header' = do failUnlessSmart :: (Stream s m a, HasReaderOptions st) => ParserT s st m () failUnlessSmart = getOption readerSmart >>= guard -smartPunctuation :: Stream s m Char - => ParserT s ParserState m Inlines - -> ParserT s ParserState m Inlines +smartPunctuation :: (HasReaderOptions st, HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines smartPunctuation inlineParser = do failUnlessSmart choice [ quoted inlineParser, apostrophe, dash, ellipses ] @@ -1061,46 +1079,33 @@ smartPunctuation inlineParser = do apostrophe :: Stream s m Char => ParserT s st m Inlines apostrophe = (char '\'' <|> char '\8217') >> return (B.str "\x2019") -quoted :: Stream s m Char - => ParserT s ParserState m Inlines - -> ParserT s ParserState m Inlines +quoted :: (HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines quoted inlineParser = doubleQuoted inlineParser <|> singleQuoted inlineParser -withQuoteContext :: Stream s m t - => QuoteContext - -> ParserT s ParserState m a - -> ParserT s ParserState m a -withQuoteContext context parser = do - oldState <- getState - let oldQuoteContext = stateQuoteContext oldState - setState oldState { stateQuoteContext = context } - result <- parser - newState <- getState - setState newState { stateQuoteContext = oldQuoteContext } - return result - -singleQuoted :: Stream s m Char - => ParserT s ParserState m Inlines - -> ParserT s ParserState m Inlines +singleQuoted :: (HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines singleQuoted inlineParser = try $ do singleQuoteStart withQuoteContext InSingleQuote $ many1Till inlineParser singleQuoteEnd >>= return . B.singleQuoted . mconcat -doubleQuoted :: Stream s m Char - => ParserT s ParserState m Inlines - -> ParserT s ParserState m Inlines +doubleQuoted :: (HasQuoteContext st m, Stream s m Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines doubleQuoted inlineParser = try $ do doubleQuoteStart withQuoteContext InDoubleQuote $ manyTill inlineParser doubleQuoteEnd >>= return . B.doubleQuoted . mconcat -failIfInQuoteContext :: Stream s m t +failIfInQuoteContext :: (HasQuoteContext st m, Stream s m t) => QuoteContext - -> ParserT s ParserState m () + -> ParserT s st m () failIfInQuoteContext context = do - st <- getState - if stateQuoteContext st == context + context' <- getQuoteContext + if context' == context then fail "already inside quotes" else return () @@ -1110,8 +1115,8 @@ charOrRef cs = guard (c `elem` cs) return c) -singleQuoteStart :: Stream s m Char - => ParserT s ParserState m () +singleQuoteStart :: (HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) + => ParserT s st m () singleQuoteStart = do failIfInQuoteContext InSingleQuote -- single quote start can't be right after str @@ -1124,8 +1129,8 @@ singleQuoteEnd = try $ do charOrRef "'\8217\146" notFollowedBy alphaNum -doubleQuoteStart :: Stream s m Char - => ParserT s ParserState m () +doubleQuoteStart :: (HasQuoteContext st m, Stream s m Char) + => ParserT s st m () doubleQuoteStart = do failIfInQuoteContext InDoubleQuote try $ do charOrRef "\"\8220\147" @@ -1179,6 +1184,14 @@ citeKey = try $ do let key = firstChar:rest return (suppress_author, key) + +token :: (Stream s m t) + => (t -> String) + -> (t -> SourcePos) + -> (t -> Maybe a) + -> ParsecT s st m a +token pp pos match = tokenPrim pp (\_ t _ -> pos t) match + -- -- Macros -- @@ -1200,9 +1213,9 @@ macro = do else return $ rawBlock "latex" def' -- | Apply current macros to string. -applyMacros' :: Stream [Char] m Char +applyMacros' :: (HasReaderOptions st, HasMacros st, Stream [Char] m Char) => String - -> ParserT [Char] ParserState m String + -> ParserT [Char] st m String applyMacros' target = do apply <- getOption readerApplyMacros if apply diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 2414dfbf7..597156a5e 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses #-} {- Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> @@ -40,7 +41,7 @@ import Text.HTML.TagSoup import Text.HTML.TagSoup.Match import Text.Pandoc.Definition import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Builder (Blocks, Inlines, trimInlines) +import Text.Pandoc.Builder (HasMeta (..), Blocks, Inlines, trimInlines) import Text.Pandoc.Shared import Text.Pandoc.Options import Text.Pandoc.Parsing @@ -52,6 +53,8 @@ import Control.Applicative ( (<$>), (<$), (<*) ) import Data.Monoid import Text.Printf (printf) import Debug.Trace (trace) +import Data.Default (Default (..)) +import Control.Monad.Reader (Reader, runReader, asks, local, ask) isSpace :: Char -> Bool isSpace ' ' = True @@ -64,17 +67,26 @@ readHtml :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse (assumes @'\n'@ line endings) -> Pandoc readHtml opts inp = - case runParser parseDoc def{ stateOptions = opts } "source" tags of + case flip runReader def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } ) "source" tags of Left err' -> error $ "\nError at " ++ show err' Right result -> result where tags = canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } inp parseDoc = do blocks <- (fixPlains False) . mconcat <$> manyTill block eof - meta <- stateMeta <$> getState + meta <- stateMeta . parserState <$> getState return $ Pandoc meta (B.toList blocks) -type TagParser = Parser [Tag String] ParserState +data HTMLState = + HTMLState + { parserState :: ParserState + } + +data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext } + +type HTMLParser s = ParserT s HTMLState (Reader HTMLLocal) + +type TagParser = HTMLParser [Tag String] pBody :: TagParser Blocks pBody = pInTags "body" block @@ -115,7 +127,6 @@ block = do (take 60 $ show $ B.toList res)) (return ()) return res - pList :: TagParser Blocks pList = pBulletList <|> pOrderedList <|> pDefinitionList @@ -365,8 +376,8 @@ pSelfClosing f g = do pQ :: TagParser Inlines pQ = do - quoteContext <- stateQuoteContext `fmap` getState - let quoteType = case quoteContext of + context <- asks quoteContext + let quoteType = case context of InDoubleQuote -> SingleQuote _ -> DoubleQuote let innerQuoteContext = if quoteType == SingleQuote @@ -477,7 +488,8 @@ pTagText :: TagParser Inlines pTagText = try $ do (TagText str) <- pSatisfy isTagText st <- getState - case runParser (many pTagContents) st "text" str of + qu <- ask + case flip runReader qu $ runParserT (many pTagContents) st "text" str of Left _ -> fail $ "Could not parse `" ++ str ++ "'" Right result -> return $ mconcat result @@ -486,7 +498,9 @@ pBlank = try $ do (TagText str) <- pSatisfy isTagText guard $ all isSpace str -pTagContents :: Parser [Char] ParserState Inlines +type InlinesParser = HTMLParser String + +pTagContents :: InlinesParser Inlines pTagContents = B.displayMath <$> mathDisplay <|> B.math <$> mathInline @@ -496,12 +510,11 @@ pTagContents = <|> pSymbol <|> pBad -pStr :: Parser [Char] ParserState Inlines +pStr :: InlinesParser Inlines pStr = do result <- many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) && not (isBad c) - pos <- getPosition - updateState $ \s -> s{ stateLastStrPos = Just pos } + updateLastStrPos return $ B.str result isSpecial :: Char -> Bool @@ -516,13 +529,13 @@ isSpecial '\8220' = True isSpecial '\8221' = True isSpecial _ = False -pSymbol :: Parser [Char] ParserState Inlines +pSymbol :: InlinesParser Inlines pSymbol = satisfy isSpecial >>= return . B.str . (:[]) isBad :: Char -> Bool isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML -pBad :: Parser [Char] ParserState Inlines +pBad :: InlinesParser Inlines pBad = do c <- satisfy isBad let c' = case c of @@ -556,7 +569,7 @@ pBad = do _ -> '?' return $ B.str [c'] -pSpace :: Parser [Char] ParserState Inlines +pSpace :: InlinesParser Inlines pSpace = many1 (satisfy isSpace) >> return B.space -- @@ -672,19 +685,23 @@ _ `closes` _ = False --- parsers for use in markdown, textile readers -- | Matches a stretch of HTML in balanced tags. -htmlInBalanced :: (Tag String -> Bool) -> Parser [Char] ParserState String +htmlInBalanced :: (Monad m) + => (Tag String -> Bool) + -> ParserT String st m String htmlInBalanced f = try $ do (TagOpen t _, tag) <- htmlTag f guard $ '/' `notElem` tag -- not a self-closing tag let stopper = htmlTag (~== TagClose t) - let anytag = liftM snd $ htmlTag (const True) + let anytag = snd <$> htmlTag (const True) contents <- many $ notFollowedBy' stopper >> (htmlInBalanced f <|> anytag <|> count 1 anyChar) endtag <- liftM snd stopper return $ tag ++ concat contents ++ endtag -- | Matches a tag meeting a certain condition. -htmlTag :: (Tag String -> Bool) -> Parser [Char] st (Tag String, String) +htmlTag :: Monad m + => (Tag String -> Bool) + -> ParserT [Char] st m (Tag String, String) htmlTag f = try $ do lookAhead $ char '<' >> (oneOf "/!?" <|> letter) (next : _) <- getInput >>= return . canonicalizeTags . parseTags @@ -707,3 +724,29 @@ mkAttr attr = (attribsId, attribsClasses, attribsKV) attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr +-- Instances + +-- This signature should be more general +-- MonadReader HTMLLocal m => HasQuoteContext st m +instance HasQuoteContext st (Reader HTMLLocal) where + getQuoteContext = asks quoteContext + withQuoteContext q = local (\s -> s{quoteContext = q}) + +instance HasReaderOptions HTMLState where + extractReaderOptions = extractReaderOptions . parserState + +instance Default HTMLState where + def = HTMLState def + +instance HasMeta HTMLState where + setMeta s b st = st {parserState = setMeta s b $ parserState st} + deleteMeta s st = st {parserState = deleteMeta s $ parserState st} + +instance Default HTMLLocal where + def = HTMLLocal NoQuote + +instance HasLastStrPosition HTMLState where + setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} + getLastStrPos = getLastStrPos . parserState + + |