diff options
| -rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 331 | 
1 files changed, 194 insertions, 137 deletions
| diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 3bccf89fb..94f933c4d 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -1,5 +1,5 @@  {-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses, -ViewPatterns#-} +ViewPatterns, OverloadedStrings #-}  {-  Copyright (C) 2006-2017 John MacFarlane <jgm@berkeley.edu> @@ -34,6 +34,7 @@ module Text.Pandoc.Readers.HTML ( readHtml                                  , htmlInBalanced                                  , isInlineTag                                  , isBlockTag +                                , NamedTag(..)                                  , isTextTag                                  , isCommentTag                                  ) where @@ -43,7 +44,7 @@ import Text.HTML.TagSoup.Match  import Text.Pandoc.Definition  import qualified Text.Pandoc.Builder as B  import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..)) -import Text.Pandoc.Shared ( extractSpaces, renderTags', addMetaField +import Text.Pandoc.Shared ( extractSpaces, addMetaField                            , escapeURI, safeRead )  import Text.Pandoc.Options (ReaderOptions(readerExtensions), extensionEnabled,                                 Extension (Ext_epub_html_exts, @@ -53,13 +54,14 @@ import Text.Pandoc.Parsing hiding ((<|>))  import Text.Pandoc.Walk  import qualified Data.Map as M  import Data.Maybe ( fromMaybe, isJust) -import Data.List ( intercalate, isInfixOf, isPrefixOf ) +import Data.List ( intercalate, isPrefixOf )  import Data.Char ( isDigit, isLetter, isAlphaNum )  import Control.Monad ( guard, mzero, void, unless )  import Control.Arrow ((***))  import Control.Applicative ( (<|>) )  import Data.Monoid (First (..)) -import Data.Text (Text, unpack) +import Data.Text (Text) +import qualified Data.Text as T  import Text.TeXMath (readMathML, writeTeX)  import Data.Default (Default (..), def)  import Control.Monad.Reader (ask, asks, local, ReaderT, runReaderT, lift) @@ -80,7 +82,7 @@ readHtml :: PandocMonad m  readHtml opts inp = do    let tags = stripPrefixes . canonicalizeTags $               parseTagsOptions parseOptions{ optTagPosition = True } -             (unpack inp) +             inp        parseDoc = do          blocks <- (fixPlains False) . mconcat <$> manyTill block eof          meta <- stateMeta . parserState <$> getState @@ -130,7 +132,7 @@ setInPlain = local (\s -> s {inPlain = True})  type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m) -type TagParser m = HTMLParser m [Tag String] +type TagParser m = HTMLParser m [Tag Text]  pBody :: PandocMonad m => TagParser m Blocks  pBody = pInTags "body" block @@ -140,12 +142,12 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag    where pTitle = pInTags "title" inline >>= setTitle . trimInlines          setTitle t = mempty <$ (updateState $ B.setMeta "title" t)          pMetaTag = do -          mt <- pSatisfy (~== TagOpen "meta" []) -          let name = fromAttrib "name" mt +          mt <- pSatisfy (matchTagOpen "meta" []) +          let name = T.unpack $ fromAttrib "name" mt            if null name               then return mempty               else do -               let content = fromAttrib "content" mt +               let content = T.unpack $ fromAttrib "content" mt                 updateState $ \s ->                   let ps = parserState s in                   s{ parserState = ps{ @@ -153,9 +155,9 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag                                     (stateMeta ps) } }                 return mempty          pBaseTag = do -          bt <- pSatisfy (~== TagOpen "base" []) +          bt <- pSatisfy (matchTagOpen "base" [])            updateState $ \st -> st{ baseHref = -               parseURIReference $ fromAttrib "href" bt } +               parseURIReference $ T.unpack $ fromAttrib "href" bt }            return mempty  block :: PandocMonad m => TagParser m Blocks @@ -195,29 +197,31 @@ eSwitch :: (PandocMonad m, Monoid a)          -> TagParser m a  eSwitch constructor parser = try $ do    guardEnabled Ext_epub_html_exts -  pSatisfy (~== TagOpen "switch" []) +  pSatisfy (matchTagOpen "switch" [])    cases <- getFirst . mconcat <$>              manyTill (First <$> (eCase <* skipMany pBlank) ) -              (lookAhead $ try $ pSatisfy (~== TagOpen "default" [])) +              (lookAhead $ try $ pSatisfy (matchTagOpen "default" []))    skipMany pBlank    fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank)    skipMany pBlank -  pSatisfy (~== TagClose "switch") +  pSatisfy (matchTagClose "switch")    return $ maybe fallback constructor cases  eCase :: PandocMonad m => TagParser m (Maybe Inlines)  eCase = do    skipMany pBlank -  TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" []) +  TagOpen _ attr' <- lookAhead $ pSatisfy $ (matchTagOpen "case" []) +  let attr = toStringAttr attr'    case (flip lookup namespaces) =<< lookup "required-namespace" attr of      Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank)) -    Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case")) +    Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (matchTagClose "case"))  eFootnote :: PandocMonad m => TagParser m ()  eFootnote = try $ do    let notes = ["footnote", "rearnote"]    guardEnabled Ext_epub_html_exts -  (TagOpen tag attr) <- lookAhead $ pAnyTag +  (TagOpen tag attr') <- lookAhead $ pAnyTag +  let attr = toStringAttr attr'    guard (maybe False (flip elem notes) (lookup "type" attr))    let ident = fromMaybe "" (lookup "id" attr)    content <- pInTags tag block @@ -229,7 +233,8 @@ addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)  eNoteref :: PandocMonad m => TagParser m Inlines  eNoteref = try $ do    guardEnabled Ext_epub_html_exts -  TagOpen tag attr <- lookAhead $ pAnyTag +  TagOpen tag attr' <- lookAhead $ pAnyTag +  let attr = toStringAttr attr'    guard (maybe False (== "noteref") (lookup "type" attr))    let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr)    guard (not (null ident)) @@ -249,10 +254,10 @@ pList = pBulletList <|> pOrderedList <|> pDefinitionList  pBulletList :: PandocMonad m => TagParser m Blocks  pBulletList = try $ do -  pSatisfy (~== TagOpen "ul" []) +  pSatisfy (matchTagOpen "ul" [])    let nonItem = pSatisfy (\t ->                    not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && -                  not (t ~== TagClose "ul")) +                  not (matchTagClose "ul" t))    -- note: if they have an <ol> or <ul> not in scope of a <li>,    -- treat it as a list item, though it's not valid xhtml...    skipMany nonItem @@ -261,7 +266,8 @@ pBulletList = try $ do  pListItem :: PandocMonad m => TagParser m a -> TagParser m Blocks  pListItem nonItem = do -  TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" []) +  TagOpen _ attr' <- lookAhead $ pSatisfy (matchTagOpen "li" []) +  let attr = toStringAttr attr'    let addId ident bs = case B.toList bs of                             (Plain ils:xs) -> B.fromList (Plain                                  [Span (ident, [], []) ils] : xs) @@ -287,7 +293,8 @@ parseTypeAttr _   = DefaultStyle  pOrderedList :: PandocMonad m => TagParser m Blocks  pOrderedList = try $ do -  TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" []) +  TagOpen _ attribs' <- pSatisfy (matchTagOpen "ol" []) +  let attribs = toStringAttr attribs'    let (start, style) = (sta', sty')                         where sta = fromMaybe "1" $                                     lookup "start" attribs @@ -309,7 +316,7 @@ pOrderedList = try $ do                                        ]    let nonItem = pSatisfy (\t ->                    not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && -                  not (t ~== TagClose "ol")) +                  not (matchTagClose "ol" t))    -- note: if they have an <ol> or <ul> not in scope of a <li>,    -- treat it as a list item, though it's not valid xhtml...    skipMany nonItem @@ -318,14 +325,14 @@ pOrderedList = try $ do  pDefinitionList :: PandocMonad m => TagParser m Blocks  pDefinitionList = try $ do -  pSatisfy (~== TagOpen "dl" []) +  pSatisfy (matchTagOpen "dl" [])    items <- manyTill pDefListItem (pCloses "dl")    return $ B.definitionList items  pDefListItem :: PandocMonad m => TagParser m (Inlines, [Blocks])  pDefListItem = try $ do -  let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) && -                  not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl")) +  let nonItem = pSatisfy (\t -> not (matchTagOpen "dt" [] t) && +                  not (matchTagOpen "dd" [] t) && not (matchTagClose "dl" t))    terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline)    defs  <- many1 (try $ skipMany nonItem >> pInTags "dd" block)    skipMany nonItem @@ -348,12 +355,12 @@ fixPlains inList bs = if any isParaish bs'          plainToPara x = x          bs' = B.toList bs -pRawTag :: PandocMonad m => TagParser m String +pRawTag :: PandocMonad m => TagParser m Text  pRawTag = do    tag <- pAnyTag    let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"]    if tagOpen ignorable (const True) tag || tagClose ignorable tag -     then return [] +     then return mempty       else return $ renderTags' [tag]  pDiv :: PandocMonad m => TagParser m Blocks @@ -362,7 +369,8 @@ pDiv = try $ do    let isDivLike "div" = True        isDivLike "section" = True        isDivLike _ = False -  TagOpen tag attr <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True) +  TagOpen tag attr' <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True) +  let attr = toStringAttr attr'    contents <- pInTags tag block    let (ident, classes, kvs) = mkAttr attr    let classes' = if tag == "section" @@ -372,7 +380,7 @@ pDiv = try $ do  pRawHtmlBlock :: PandocMonad m => TagParser m Blocks  pRawHtmlBlock = do -  raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag +  raw <- T.unpack <$> (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag)    exts <- getOption readerExtensions    if extensionEnabled Ext_raw_html exts && not (null raw)       then return $ B.rawBlock "html" raw @@ -387,33 +395,35 @@ ignore raw = do      logMessage $ SkippedContent raw pos    return mempty -pHtmlBlock :: PandocMonad m => String -> TagParser m String +pHtmlBlock :: PandocMonad m => Text -> TagParser m Text  pHtmlBlock t = try $ do -  open <- pSatisfy (~== TagOpen t []) -  contents <- manyTill pAnyTag (pSatisfy (~== TagClose t)) -  return $ renderTags' $ [open] ++ contents ++ [TagClose t] +  open <- pSatisfy (matchTagOpen t []) +  contents <- manyTill pAnyTag (pSatisfy (matchTagClose t)) +  return $ renderTags' $ [open] <> contents <> [TagClose t]  -- Sets chapter context  eSection :: PandocMonad m => TagParser m Blocks  eSection = try $ do -  let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as) +  let matchChapter as = maybe False (T.isInfixOf "chapter") (lookup "type" as)    let sectTag = tagOpen (`elem` sectioningContent) matchChapter    TagOpen tag _ <- lookAhead $ pSatisfy sectTag    setInChapter (pInTags tag block) -headerLevel :: PandocMonad m => String -> TagParser m Int +headerLevel :: PandocMonad m => Text -> TagParser m Int  headerLevel tagtype = do -  let level = read (drop 1 tagtype) -  (try $ do -    guardEnabled Ext_epub_html_exts -    asks inChapter >>= guard -    return (level - 1)) -    <|> -      return level +  case safeRead (T.unpack (T.drop 1 tagtype)) of +        Just level -> +          (try $ do +            guardEnabled Ext_epub_html_exts +            asks inChapter >>= guard +            return (level - 1)) +            <|> +              return level +        Nothing -> fail "Could not retrieve header level"  eTitlePage :: PandocMonad m => TagParser m ()  eTitlePage = try $ do -  let isTitlePage as = maybe False (isInfixOf "titlepage") (lookup "type" as) +  let isTitlePage as = maybe False (T.isInfixOf "titlepage") (lookup "type" as)    let groupTag = tagOpen (\x -> x `elem` groupingContent || x == "section")                            isTitlePage    TagOpen tag _ <- lookAhead $ pSatisfy groupTag @@ -421,19 +431,21 @@ eTitlePage = try $ do  pHeader :: PandocMonad m => TagParser m Blocks  pHeader = try $ do -  TagOpen tagtype attr <- pSatisfy $ +  TagOpen tagtype attr' <- pSatisfy $                             tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])                             (const True) -  let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")] +  let attr = toStringAttr attr' +  let bodyTitle = TagOpen tagtype attr' ~== TagOpen ("h1" :: Text) +                                               [("class","title")]    level <- headerLevel tagtype    contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof)    let ident = fromMaybe "" $ lookup "id" attr    let classes = maybe [] words $ lookup "class" attr    let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"] -  attr' <- registerHeader (ident, classes, keyvals) contents +  attr'' <- registerHeader (ident, classes, keyvals) contents    return $ if bodyTitle                then mempty  -- skip a representation of the title in the body -              else B.headerWith attr' level contents +              else B.headerWith attr'' level contents  pHrule :: PandocMonad m => TagParser m Blocks  pHrule = do @@ -442,7 +454,7 @@ pHrule = do  pTable :: PandocMonad m => TagParser m Blocks  pTable = try $ do -  TagOpen _ _ <- pSatisfy (~== TagOpen "table" []) +  TagOpen _ _ <- pSatisfy (matchTagOpen "table" [])    skipMany pBlank    caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank    widths' <- (mconcat <$> many1 pColgroup) <|> many pCol @@ -456,8 +468,8 @@ pTable = try $ do                   else return head''    rowsLs <- many pTBody    rows'  <- pOptInTag "tfoot" $ many pTr -  TagClose _ <- pSatisfy (~== TagClose "table") -  let rows'' = (concat rowsLs) ++ rows' +  TagClose _ <- pSatisfy (matchTagClose "table") +  let rows'' = (concat rowsLs) <> rows'    -- fail on empty table    guard $ not $ null head' && null rows''    let isSinglePlain x = case B.toList x of @@ -468,7 +480,7 @@ pTable = try $ do    let cols = length $ if null head' then head rows'' else head'    -- add empty cells to short rows    let addEmpties r = case cols - length r of -                           n | n > 0 -> r ++ replicate n mempty +                           n | n > 0 -> r <> replicate n mempty                               | otherwise -> r    let rows = map addEmpties rows''    let aligns = replicate cols AlignDefault @@ -481,15 +493,16 @@ pTable = try $ do  pCol :: PandocMonad m => TagParser m Double  pCol = try $ do -  TagOpen _ attribs <- pSatisfy (~== TagOpen "col" []) +  TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" []) +  let attribs = toStringAttr attribs'    skipMany pBlank -  optional $ pSatisfy (~== TagClose "col") +  optional $ pSatisfy (matchTagClose "col")    skipMany pBlank    return $ case lookup "width" attribs of             Nothing -> case lookup "style" attribs of                 Just ('w':'i':'d':'t':'h':':':xs) | '%' `elem` xs ->                   fromMaybe 0.0 $ safeRead ('0':'.':filter -                   (`notElem` " \t\r\n%'\";") xs) +                   (`notElem` (" \t\r\n%'\";" :: [Char])) xs)                 _ -> 0.0             Just x | not (null x) && last x == '%' ->               fromMaybe 0.0 $ safeRead ('0':'.':init x) @@ -497,18 +510,18 @@ pCol = try $ do  pColgroup :: PandocMonad m => TagParser m [Double]  pColgroup = try $ do -  pSatisfy (~== TagOpen "colgroup" []) +  pSatisfy (matchTagOpen "colgroup" [])    skipMany pBlank    manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank -noColOrRowSpans :: Tag String -> Bool +noColOrRowSpans :: Tag Text -> Bool  noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan"    where isNullOrOne x = case fromAttrib x t of                                ""  -> True                                "1" -> True                                _   -> False -pCell :: PandocMonad m => String -> TagParser m [Blocks] +pCell :: PandocMonad m => Text -> TagParser m [Blocks]  pCell celltype = try $ do    skipMany pBlank    res <- pInTags' celltype noColOrRowSpans block @@ -534,7 +547,8 @@ pPara = do  pCodeBlock :: PandocMonad m => TagParser m Blocks  pCodeBlock = try $ do -  TagOpen _ attr <- pSatisfy (~== TagOpen "pre" []) +  TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" []) +  let attr = toStringAttr attr'    contents <- manyTill pAnyTag (pCloses "pre" <|> eof)    let rawText = concatMap tagToString contents    -- drop leading newline if any @@ -547,8 +561,8 @@ pCodeBlock = try $ do                      _        -> result'    return $ B.codeBlockWith (mkAttr attr) result -tagToString :: Tag String -> String -tagToString (TagText s) = s +tagToString :: Tag Text -> String +tagToString (TagText s) = T.unpack s  tagToString (TagOpen "br" _) = "\n"  tagToString _ = "" @@ -577,20 +591,20 @@ pLocation = do    (TagPosition r c) <- pSat isTagPosition    setPosition $ newPos "input" r c -pSat :: PandocMonad m => (Tag String -> Bool) -> TagParser m (Tag String) +pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)  pSat f = do    pos <- getPosition    token show (const pos) (\x -> if f x then Just x else Nothing) -pSatisfy :: PandocMonad m => (Tag String -> Bool) -> TagParser m (Tag String) +pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)  pSatisfy f = try $ optional pLocation >> pSat f -pAnyTag :: PandocMonad m => TagParser m (Tag String) +pAnyTag :: PandocMonad m => TagParser m (Tag Text)  pAnyTag = pSatisfy (const True)  pSelfClosing :: PandocMonad m -             => (String -> Bool) -> ([Attribute String] -> Bool) -             -> TagParser m (Tag String) +             => (Text -> Bool) -> ([Attribute Text] -> Bool) +             -> TagParser m (Tag Text)  pSelfClosing f g = do    open <- pSatisfy (tagOpen f g)    optional $ pSatisfy (tagClose f) @@ -628,7 +642,7 @@ pStrikeout = do    pInlinesInTags "s" B.strikeout <|>      pInlinesInTags "strike" B.strikeout <|>      pInlinesInTags "del" B.strikeout <|> -    try (do pSatisfy (~== TagOpen "span" [("class","strikeout")]) +    try (do pSatisfy (matchTagOpen "span" [("class","strikeout")])              contents <- mconcat <$> manyTill inline (pCloses "span")              return $ B.strikeout contents) @@ -639,17 +653,19 @@ pLineBreak = do  -- Unlike fromAttrib from tagsoup, this distinguishes  -- between a missing attribute and an attribute with empty content. -maybeFromAttrib :: String -> Tag String -> Maybe String -maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs +maybeFromAttrib :: String -> Tag Text -> Maybe String +maybeFromAttrib name (TagOpen _ attrs) = +  T.unpack <$> lookup (T.pack name) attrs  maybeFromAttrib _ _ = Nothing  pLink :: PandocMonad m => TagParser m Inlines  pLink = try $ do    tag <- pSatisfy $ tagOpenLit "a" (const True) -  let title = fromAttrib "title" tag +  let title = T.unpack $ fromAttrib "title" tag    -- take id from id attribute if present, otherwise name -  let uid = maybe (fromAttrib "name" tag) id $ maybeFromAttrib "id" tag -  let cls = words $ fromAttrib "class" tag +  let uid = maybe (T.unpack $ fromAttrib "name" tag) id $ +               maybeFromAttrib "id" tag +  let cls = words $ T.unpack $ fromAttrib "class" tag    lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a")    -- check for href; if href, then a link, otherwise a span    case maybeFromAttrib "href" tag of @@ -667,30 +683,33 @@ pImage :: PandocMonad m => TagParser m Inlines  pImage = do    tag <- pSelfClosing (=="img") (isJust . lookup "src")    mbBaseHref <- baseHref <$> getState -  let url' = fromAttrib "src" tag +  let url' = T.unpack $ fromAttrib "src" tag    let url = case (parseURIReference url', mbBaseHref) of                   (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs)                   _                   -> url' -  let title = fromAttrib "title" tag -  let alt = fromAttrib "alt" tag -  let uid = fromAttrib "id" tag -  let cls = words $ fromAttrib "class" tag +  let title = T.unpack $ fromAttrib "title" tag +  let alt = T.unpack $ fromAttrib "alt" tag +  let uid = T.unpack $ fromAttrib "id" tag +  let cls = words $ T.unpack $ fromAttrib "class" tag    let getAtt k = case fromAttrib k tag of                     "" -> [] -                   v  -> [(k, v)] +                   v  -> [(T.unpack k, T.unpack v)]    let kvs = concat $ map getAtt ["width", "height", "sizes", "srcset"]    return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt)  pCode :: PandocMonad m => TagParser m Inlines  pCode = try $ do -  (TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) +  (TagOpen open attr') <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) +  let attr = toStringAttr attr'    result <- manyTill pAnyTag (pCloses open) -  return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ innerText result +  return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ T.unpack $ +           innerText result  pSpan :: PandocMonad m => TagParser m Inlines  pSpan = try $ do    guardEnabled Ext_native_spans -  TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) +  TagOpen _ attr' <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) +  let attr = toStringAttr attr'    contents <- pInTags "span" inline    let isSmallCaps = fontVariant == "small-caps" || "smallcaps" `elem` classes                      where styleAttr   = fromMaybe "" $ lookup "style" attr @@ -708,7 +727,7 @@ pRawHtmlInline = do                     then pSatisfy (not . isBlockTag)                     else pSatisfy isInlineTag    exts <- getOption readerExtensions -  let raw = renderTags' [result] +  let raw = T.unpack $ renderTags' [result]    if extensionEnabled Ext_raw_html exts       then return $ B.rawInline "html" raw       else ignore raw @@ -716,32 +735,38 @@ pRawHtmlInline = do  mathMLToTeXMath :: String -> Either String String  mathMLToTeXMath s = writeTeX <$> readMathML s +toStringAttr :: [(Text, Text)] -> [(String, String)] +toStringAttr = map go +  where go (x,y) = (T.unpack x, T.unpack y) +  pMath :: PandocMonad m => Bool -> TagParser m Inlines  pMath inCase = try $ do -  open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True) +  open@(TagOpen _ attr') <- pSatisfy $ tagOpen (=="math") (const True)    -- we'll assume math tags are MathML unless specially marked    -- otherwise... +  let attr = toStringAttr attr'    unless inCase $      guard (maybe True (== mathMLNamespace) (lookup "xmlns" attr)) -  contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math")) -  case mathMLToTeXMath (renderTags $ [open] ++ contents ++ [TagClose "math"]) of +  contents <- manyTill pAnyTag (pSatisfy (matchTagClose "math")) +  case mathMLToTeXMath (T.unpack $ renderTags $ +          [open] <> contents <> [TagClose "math"]) of         Left _   -> return $ B.spanWith ("",["math"],attr) $ B.text $ -                             innerText contents +                             T.unpack $ innerText contents         Right [] -> return mempty         Right x  -> return $ case lookup "display" attr of                                   Just "block" -> B.displayMath x                                   _            -> B.math x -pInlinesInTags :: PandocMonad m => String -> (Inlines -> Inlines) +pInlinesInTags :: PandocMonad m => Text -> (Inlines -> Inlines)                 -> TagParser m Inlines  pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline -pInTags :: (PandocMonad m, Monoid a) => String -> TagParser m a -> TagParser m a +pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a  pInTags tagtype parser = pInTags' tagtype (const True) parser  pInTags' :: (PandocMonad m, Monoid a) -         => String -         -> (Tag String -> Bool) +         => Text +         -> (Tag Text -> Bool)           -> TagParser m a           -> TagParser m a  pInTags' tagtype tagtest parser = try $ do @@ -750,18 +775,18 @@ pInTags' tagtype tagtest parser = try $ do  -- parses p, preceeded by an optional opening tag  -- and followed by an optional closing tags -pOptInTag :: PandocMonad m => String -> TagParser m a -> TagParser m a +pOptInTag :: PandocMonad m => Text -> TagParser m a -> TagParser m a  pOptInTag tagtype p = try $ do    skipMany pBlank -  optional $ pSatisfy (~== TagOpen tagtype []) +  optional $ pSatisfy (matchTagOpen tagtype [])    skipMany pBlank    x <- p    skipMany pBlank -  optional $ pSatisfy (~== TagClose tagtype) +  optional $ pSatisfy (matchTagClose tagtype)    skipMany pBlank    return x -pCloses :: PandocMonad m => String -> TagParser m () +pCloses :: PandocMonad m => Text -> TagParser m ()  pCloses tagtype = try $ do    t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag    case t of @@ -782,15 +807,15 @@ pTagText = try $ do    parsed <- lift $ lift $              flip runReaderT qu $ runParserT (many pTagContents) st "text" str    case parsed of -       Left _        -> throwError $ PandocParseError $ "Could not parse `" ++ str ++ "'" +       Left _        -> throwError $ PandocParseError $ "Could not parse `" <> T.unpack str <> "'"         Right result  -> return $ mconcat result  pBlank :: PandocMonad m => TagParser m ()  pBlank = try $ do    (TagText str) <- pSatisfy isTagText -  guard $ all isSpace str +  guard $ T.all isSpace str -type InlinesParser m = HTMLParser m String +type InlinesParser m = HTMLParser m Text  pTagContents :: PandocMonad m => InlinesParser m Inlines  pTagContents = @@ -871,13 +896,13 @@ pSpace = many1 (satisfy isSpace) >>= \xs ->  -- Constants  -- -eitherBlockOrInline :: Set.Set String +eitherBlockOrInline :: Set.Set Text  eitherBlockOrInline = Set.fromList    ["audio", "applet", "button", "iframe", "embed",     "del", "ins", "progress", "map", "area", "noscript", "script",     "object", "svg", "video", "source"] -blockHtmlTags :: Set.Set String +blockHtmlTags :: Set.Set Text  blockHtmlTags = Set.fromList     ["?xml", "!DOCTYPE", "address", "article", "aside",      "blockquote", "body", "canvas", @@ -893,7 +918,7 @@ blockHtmlTags = Set.fromList  -- We want to allow raw docbook in markdown documents, so we  -- include docbook block tags here too. -blockDocBookTags :: Set.Set String +blockDocBookTags :: Set.Set Text  blockDocBookTags = Set.fromList     ["calloutlist", "bibliolist", "glosslist", "itemizedlist",      "orderedlist", "segmentedlist", "simplelist", @@ -908,37 +933,52 @@ blockDocBookTags = Set.fromList      "classsynopsis", "blockquote", "epigraph", "msgset",      "sidebar", "title"] -epubTags :: Set.Set String +epubTags :: Set.Set Text  epubTags = Set.fromList ["case", "switch", "default"] -blockTags :: Set.Set String +blockTags :: Set.Set Text  blockTags = Set.unions [blockHtmlTags, blockDocBookTags, epubTags] -isInlineTag :: Tag String -> Bool -isInlineTag t = tagOpen isInlineTagName (const True) t || -                tagClose isInlineTagName t || -                tagComment (const True) t -                 where isInlineTagName x = x `Set.notMember` blockTags - -isBlockTag :: Tag String -> Bool -isBlockTag t = tagOpen isBlockTagName (const True) t || -               tagClose isBlockTagName t || -               tagComment (const True) t -                 where isBlockTagName ('?':_) = True -                       isBlockTagName ('!':_) = True -                       isBlockTagName x       = x `Set.member` blockTags -                                             || x `Set.member` -                                                       eitherBlockOrInline - -isTextTag :: Tag String -> Bool +class NamedTag a where +  getTagName :: a -> Maybe Text + +instance NamedTag (Tag Text) where +  getTagName (TagOpen t _) = Just t +  getTagName (TagClose t)  = Just t +  getTagName _             = Nothing + +instance NamedTag (Tag String) where +  getTagName (TagOpen t _) = Just (T.pack t) +  getTagName (TagClose t)  = Just (T.pack t) +  getTagName _             = Nothing + +isInlineTag :: NamedTag (Tag a) => Tag a -> Bool +isInlineTag t = isInlineTagName || isCommentTag t +                 where isInlineTagName = case getTagName t of +                                              Just x -> x +                                                  `Set.notMember` blockTags +                                              Nothing -> False + +isBlockTag :: NamedTag (Tag a) => Tag a -> Bool +isBlockTag t = isBlockTagName || isTagComment t +                 where isBlockTagName = +                         case getTagName t of +                              Just x +                                | "?" `T.isPrefixOf` x -> True +                                | "!" `T.isPrefixOf` x -> True +                                | otherwise -> x `Set.member` blockTags +                                    || x `Set.member` eitherBlockOrInline +                              Nothing -> False + +isTextTag :: Tag a -> Bool  isTextTag = tagText (const True) -isCommentTag :: Tag String -> Bool +isCommentTag :: Tag a -> Bool  isCommentTag = tagComment (const True)  -- taken from HXT and extended  -- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags -closes :: String -> String -> Bool +closes :: Text -> Text -> Bool  _ `closes` "body" = False  _ `closes` "html" = False  "body" `closes` "head" = True @@ -1000,8 +1040,11 @@ htmlInBalanced f = try $ do                            let cs = ec - sc                            lscontents <- unlines <$> count ls anyLine                            cscontents <- count cs anyChar -                          (_,closetag) <- htmlTag (~== TagClose tn) -                          return (lscontents ++ cscontents ++ closetag) +                          closetag <- do +                            x <- many (satisfy (/='>')) +                            char '>' +                            return (x <> ">") +                          return (lscontents <> cscontents <> closetag)                          _ -> mzero      _ -> mzero @@ -1019,7 +1062,7 @@ htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts          go n (t:ts') = (t :) <$> go n ts'          go _ [] = mzero -hasTagWarning :: [Tag String] -> Bool +hasTagWarning :: [Tag a] -> Bool  hasTagWarning (TagWarning _:_) = True  hasTagWarning _ = False @@ -1047,47 +1090,48 @@ htmlTag f = try $ do         -- basic sanity check, since the parser is very forgiving         -- and finds tags in stuff like x<y)         guard $ isName tagname +       guard $ not $ null tagname         -- <https://example.org> should NOT be a tag either.         -- tagsoup will parse it as TagOpen "https:" [("example.org","")]         guard $ last tagname /= ':'         rendered <- manyTill anyChar (char '>') -       return (next, rendered ++ ">") +       return (next, rendered <> ">")    case next of         TagComment s           | "<!--" `isPrefixOf` inp -> do            count (length s + 4) anyChar            skipMany (satisfy (/='>'))            char '>' -          return (next, "<!--" ++ s ++ "-->") +          return (next, "<!--" <> s <> "-->")           | otherwise -> fail "bogus comment mode, HTML5 parse error"         TagOpen tagname attr -> do           guard $ all (isName . fst) attr           handleTag tagname -       TagClose tagname -> handleTag tagname +       TagClose tagname -> +         handleTag tagname         _ -> mzero  mkAttr :: [(String, String)] -> Attr  mkAttr attr = (attribsId, attribsClasses, attribsKV)    where attribsId = fromMaybe "" $ lookup "id" attr -        attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes +        attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) <> epubTypes          attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr          epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr  -- Strip namespace prefixes -stripPrefixes :: [Tag String] -> [Tag String] +stripPrefixes :: [Tag Text] -> [Tag Text]  stripPrefixes = map stripPrefix -stripPrefix :: Tag String -> Tag String +stripPrefix :: Tag Text -> Tag Text  stripPrefix (TagOpen s as) =      TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as)  stripPrefix (TagClose s) = TagClose (stripPrefix' s)  stripPrefix x = x -stripPrefix' :: String -> String +stripPrefix' :: Text -> Text  stripPrefix' s = -  case span (/= ':') s of -    (_, "") -> s -    (_, (_:ts)) -> ts +  if T.null t then s else T.drop 1 t +  where (_, t) = T.span (/= ':') s  isSpace :: Char -> Bool  isSpace ' '  = True @@ -1130,19 +1174,32 @@ instance HasLastStrPosition HTMLState where    setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}    getLastStrPos = getLastStrPos . parserState +-- For now we need a special verison here; the one in Shared has String type +renderTags' :: [Tag Text] -> Text +renderTags' = renderTagsOptions +               renderOptions{ optMinimize = matchTags ["hr", "br", "img", +                                                       "meta", "link"] +                            , optRawTag   = matchTags ["script", "style"] } +              where matchTags = \tags -> flip elem tags . T.toLower +  -- EPUB Specific  --  -- -sectioningContent :: [String] +sectioningContent :: [Text]  sectioningContent = ["article", "aside", "nav", "section"] -groupingContent :: [String] +groupingContent :: [Text]  groupingContent = ["p", "hr", "pre", "blockquote", "ol"                    , "ul", "li", "dl", "dt", "dt", "dd"                    , "figure", "figcaption", "div", "main"] +matchTagClose :: Text -> (Tag Text -> Bool) +matchTagClose t = (~== TagClose t) + +matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool) +matchTagOpen t as = (~== TagOpen t as)  {- @@ -1150,7 +1207,7 @@ types :: [(String, ([String], Int))]  types =  -- Document divisions     map (\s -> (s, (["section", "body"], 0)))      ["volume", "part", "chapter", "division"] -  ++ -- Document section and components +  <> -- Document section and components    [      ("abstract",  ([], 0))]  -} | 
