aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML.hs
diff options
context:
space:
mode:
authordespresc <christian.j.j.despres@gmail.com>2019-11-04 16:12:37 -0500
committerJohn MacFarlane <jgm@berkeley.edu>2019-11-12 16:03:45 -0800
commit90e436d49604e3fd1ef9432fb23f6d7f6245c7fd (patch)
tree4e7f0692f989643189f1fc6786050d95e239a0ea /src/Text/Pandoc/Readers/HTML.hs
parentd3966372f5049eea56213b069fc4d70d8af9144c (diff)
downloadpandoc-90e436d49604e3fd1ef9432fb23f6d7f6245c7fd.tar.gz
Switch to new pandoc-types and use Text instead of String [API change].
PR #5884. + Use pandoc-types 1.20 and texmath 0.12. + Text is now used instead of String, with a few exceptions. + In the MediaBag module, some of the types using Strings were switched to use FilePath instead (not Text). + In the Parsing module, new parsers `manyChar`, `many1Char`, `manyTillChar`, `many1TillChar`, `many1Till`, `manyUntil`, `mantyUntilChar` have been added: these are like their unsuffixed counterparts but pack some or all of their output. + `glob` in Text.Pandoc.Class still takes String since it seems to be intended as an interface to Glob, which uses strings. It seems to be used only once in the package, in the EPUB writer, so that is not hard to change.
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs231
1 files changed, 112 insertions, 119 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index e03ac6a97..1c2892d6a 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -1,5 +1,5 @@
-{-# LANGUAGE NoImplicitPrelude #-}
-{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
@@ -35,8 +35,7 @@ import Control.Monad.Reader (ReaderT, ask, asks, lift, local, runReaderT)
import Data.Char (isAlphaNum, isLetter)
import Data.Default (Default (..), def)
import Data.Foldable (for_)
-import Data.List (isPrefixOf)
-import Data.List.Split (wordsBy, splitWhen)
+import Data.List.Split (splitWhen)
import qualified Data.Map as M
import Data.Maybe (fromMaybe, isJust, isNothing)
import Data.Monoid (First (..))
@@ -62,8 +61,8 @@ import Text.Pandoc.Options (
extensionEnabled)
import Text.Pandoc.Parsing hiding ((<|>))
import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI,
- extractSpaces, htmlSpanLikeElements,
- onlySimpleTableCells, safeRead, underlineSpan)
+ extractSpaces, htmlSpanLikeElements, elemText, splitTextBy,
+ onlySimpleTableCells, safeRead, underlineSpan, tshow)
import Text.Pandoc.Walk
import Text.Parsec.Error
import Text.TeXMath (readMathML, writeTeX)
@@ -93,14 +92,14 @@ readHtml opts inp = do
"source" tags
case result of
Right doc -> return doc
- Left err -> throwError $ PandocParseError $ getError err
+ Left err -> throwError $ PandocParseError $ T.pack $ getError err
replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block]
replaceNotes bs = do
st <- getState
return $ walk (replaceNotes' (noteTable st)) bs
-replaceNotes' :: [(String, Blocks)] -> Inline -> Inline
+replaceNotes' :: [(Text, Blocks)] -> Inline -> Inline
replaceNotes' noteTbl (RawInline (Format "noteref") ref) =
maybe (Str "") (Note . B.toList) $ lookup ref noteTbl
replaceNotes' _ x = x
@@ -108,9 +107,9 @@ replaceNotes' _ x = x
data HTMLState =
HTMLState
{ parserState :: ParserState,
- noteTable :: [(String, Blocks)],
+ noteTable :: [(Text, Blocks)],
baseHref :: Maybe URI,
- identifiers :: Set.Set String,
+ identifiers :: Set.Set Text,
logMessages :: [LogMessage],
macros :: M.Map Text Macro
}
@@ -134,7 +133,7 @@ pHtml :: PandocMonad m => TagParser m Blocks
pHtml = try $ do
(TagOpen "html" attr) <- lookAhead pAny
for_ (lookup "lang" attr) $
- updateState . B.setMeta "lang" . B.text . T.unpack
+ updateState . B.setMeta "lang" . B.text
pInTags "html" block
pBody :: PandocMonad m => TagParser m Blocks
@@ -146,11 +145,11 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAny)
setTitle t = mempty <$ updateState (B.setMeta "title" t)
pMetaTag = do
mt <- pSatisfy (matchTagOpen "meta" [])
- let name = T.unpack $ fromAttrib "name" mt
- if null name
+ let name = fromAttrib "name" mt
+ if T.null name
then return mempty
else do
- let content = T.unpack $ fromAttrib "content" mt
+ let content = fromAttrib "content" mt
updateState $ \s ->
let ps = parserState s in
s{ parserState = ps{
@@ -187,13 +186,13 @@ block = do
, pFigure
, pRawHtmlBlock
]
- trace (take 60 $ show $ B.toList res)
+ trace (T.take 60 $ tshow $ B.toList res)
return res
-namespaces :: PandocMonad m => [(String, TagParser m Inlines)]
+namespaces :: PandocMonad m => [(Text, TagParser m Inlines)]
namespaces = [(mathMLNamespace, pMath True)]
-mathMLNamespace :: String
+mathMLNamespace :: Text
mathMLNamespace = "http://www.w3.org/1998/Math/MathML"
eSwitch :: (PandocMonad m, Monoid a)
@@ -233,7 +232,7 @@ eFootnote = try $ do
content <- pInTags tag block
addNote ident content
-addNote :: PandocMonad m => String -> Blocks -> TagParser m ()
+addNote :: PandocMonad m => Text -> Blocks -> TagParser m ()
addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : noteTable s})
eNoteref :: PandocMonad m => TagParser m Inlines
@@ -245,8 +244,8 @@ eNoteref = try $ do
-> (lookup "type" as <|> lookup "epub:type" as)
== Just "noteref"
_ -> False)
- ident <- case T.unpack <$> lookup "href" attr of
- Just ('#':rest) -> return rest
+ ident <- case lookup "href" attr >>= T.uncons of
+ Just ('#', rest) -> return rest
_ -> mzero
_ <- manyTill pAny (pSatisfy (\case
TagClose t -> t == tag
@@ -287,7 +286,7 @@ pListItem nonItem = do
maybe id addId (lookup "id" attr) <$>
pInTags "li" block <* skipMany nonItem
-parseListStyleType :: String -> ListNumberStyle
+parseListStyleType :: Text -> ListNumberStyle
parseListStyleType "lower-roman" = LowerRoman
parseListStyleType "upper-roman" = UpperRoman
parseListStyleType "lower-alpha" = LowerAlpha
@@ -295,7 +294,7 @@ parseListStyleType "upper-alpha" = UpperAlpha
parseListStyleType "decimal" = Decimal
parseListStyleType _ = DefaultStyle
-parseTypeAttr :: String -> ListNumberStyle
+parseTypeAttr :: Text -> ListNumberStyle
parseTypeAttr "i" = LowerRoman
parseTypeAttr "I" = UpperRoman
parseTypeAttr "a" = LowerAlpha
@@ -404,20 +403,19 @@ pDiv = try $ do
pRawHtmlBlock :: PandocMonad m => TagParser m Blocks
pRawHtmlBlock = do
- raw <- T.unpack <$>
- (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pHtmlBlock "textarea"
- <|> pRawTag)
+ raw <- (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pHtmlBlock "textarea"
+ <|> pRawTag)
exts <- getOption readerExtensions
- if extensionEnabled Ext_raw_html exts && not (null raw)
+ if extensionEnabled Ext_raw_html exts && not (T.null raw)
then return $ B.rawBlock "html" raw
else ignore raw
-ignore :: (Monoid a, PandocMonad m) => String -> TagParser m a
+ignore :: (Monoid a, PandocMonad m) => Text -> TagParser m a
ignore raw = do
pos <- getPosition
-- raw can be null for tags like <!DOCTYPE>; see paRawTag
-- in this case we don't want a warning:
- unless (null raw) $
+ unless (T.null raw) $
logMessage $ SkippedContent raw pos
return mempty
@@ -438,7 +436,7 @@ eSection = try $ do
headerLevel :: Text -> TagParser m Int
headerLevel tagtype =
- case safeRead (T.unpack (T.drop 1 tagtype)) of
+ case safeRead (T.drop 1 tagtype) of
Just level ->
-- try (do
-- guardEnabled Ext_epub_html_exts
@@ -468,7 +466,7 @@ pHeader = try $ do
level <- headerLevel tagtype
contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof)
let ident = fromMaybe "" $ lookup "id" attr
- let classes = maybe [] words $ lookup "class" attr
+ let classes = maybe [] T.words $ lookup "class" attr
let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"]
attr'' <- registerHeader (ident, classes, keyvals) contents
return $ if bodyTitle
@@ -529,14 +527,14 @@ pCol = try $ do
optional $ pSatisfy (matchTagClose "col")
skipMany pBlank
let width = case lookup "width" attribs of
- Nothing -> case lookup "style" attribs of
- Just ('w':'i':'d':'t':'h':':':xs) | '%' `elem` xs ->
- fromMaybe 0.0 $ safeRead (filter
- (`notElem` (" \t\r\n%'\";" :: [Char])) xs)
- _ -> 0.0
- Just x | not (null x) && last x == '%' ->
- fromMaybe 0.0 $ safeRead (init x)
- _ -> 0.0
+ Nothing -> case lookup "style" attribs of
+ Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
+ fromMaybe 0.0 $ safeRead (T.filter
+ (`notElem` (" \t\r\n%'\";" :: [Char])) xs)
+ _ -> 0.0
+ Just (T.unsnoc -> Just (xs, '%')) ->
+ fromMaybe 0.0 $ safeRead xs
+ _ -> 0.0
if width > 0.0
then return $ width / 100.0
else return 0.0
@@ -562,7 +560,7 @@ pCell celltype = try $ do
let extractAlign' [] = ""
extractAlign' ("text-align":x:_) = x
extractAlign' (_:xs) = extractAlign' xs
- let extractAlign = extractAlign' . wordsBy (`elem` [' ','\t',';',':'])
+ let extractAlign = extractAlign' . splitTextBy (`elemText` " \t;:")
let align = case maybeFromAttrib "align" tag `mplus`
(extractAlign <$> maybeFromAttrib "style" tag) of
Just "left" -> AlignLeft
@@ -610,7 +608,7 @@ pFigure = try $ do
let caption = fromMaybe mempty mbcap
case B.toList <$> mbimg of
Just [Image attr _ (url, tit)] ->
- return $ B.para $ B.imageWith attr url ("fig:" ++ tit) caption
+ return $ B.para $ B.imageWith attr url ("fig:" <> tit) caption
_ -> mzero
pCodeBlock :: PandocMonad m => TagParser m Blocks
@@ -618,21 +616,21 @@ pCodeBlock = try $ do
TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" [])
let attr = toStringAttr attr'
contents <- manyTill pAny (pCloses "pre" <|> eof)
- let rawText = concatMap tagToString contents
+ let rawText = T.concat $ map tagToText contents
-- drop leading newline if any
- let result' = case rawText of
- '\n':xs -> xs
- _ -> rawText
+ let result' = case T.uncons rawText of
+ Just ('\n', xs) -> xs
+ _ -> rawText
-- drop trailing newline if any
- let result = case reverse result' of
- '\n':_ -> init result'
- _ -> result'
+ let result = case T.unsnoc result' of
+ Just (result'', '\n') -> result''
+ _ -> result'
return $ B.codeBlockWith (mkAttr attr) result
-tagToString :: Tag Text -> String
-tagToString (TagText s) = T.unpack s
-tagToString (TagOpen "br" _) = "\n"
-tagToString _ = ""
+tagToText :: Tag Text -> Text
+tagToText (TagText s) = s
+tagToText (TagOpen "br" _) = "\n"
+tagToText _ = ""
inline :: PandocMonad m => TagParser m Inlines
inline = choice
@@ -667,7 +665,7 @@ pLocation = do
pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
pSat f = do
pos <- getPosition
- token show (const pos) (\x -> if f x then Just x else Nothing)
+ token tshow (const pos) (\x -> if f x then Just x else Nothing)
pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
pSatisfy f = try $ optional pLocation >> pSat f
@@ -688,10 +686,10 @@ pQ = choice $ map try [citedQuote, normalQuote]
where citedQuote = do
tag <- pSatisfy $ tagOpenLit "q" (any ((=="cite") . fst))
- url <- canonicalizeUrl $ T.unpack $ fromAttrib "cite" tag
- let uid = fromMaybe (T.unpack $ fromAttrib "name" tag) $
+ url <- canonicalizeUrl $ fromAttrib "cite" tag
+ let uid = fromMaybe (fromAttrib "name" tag) $
maybeFromAttrib "id" tag
- let cls = words $ T.unpack $ fromAttrib "class" tag
+ let cls = T.words $ fromAttrib "class" tag
makeQuote $ B.spanWith (uid, cls, [("cite", escapeURI url)])
normalQuote = do
@@ -729,7 +727,7 @@ pSpanLike =
TagOpen _ attrs <- pSatisfy $ tagOpenLit tagName (const True)
let (ids, cs, kvs) = mkAttr . toStringAttr $ attrs
content <- mconcat <$> manyTill inline (pCloses tagName <|> eof)
- return $ B.spanWith (ids, T.unpack tagName : cs, kvs) content
+ return $ B.spanWith (ids, tagName : cs, kvs) content
pSmall :: PandocMonad m => TagParser m Inlines
pSmall = pInlinesInTags "small" (B.spanWith ("",["small"],[]))
@@ -753,19 +751,18 @@ pLineBreak = do
-- Unlike fromAttrib from tagsoup, this distinguishes
-- between a missing attribute and an attribute with empty content.
-maybeFromAttrib :: String -> Tag Text -> Maybe String
-maybeFromAttrib name (TagOpen _ attrs) =
- T.unpack <$> lookup (T.pack name) attrs
+maybeFromAttrib :: Text -> Tag Text -> Maybe Text
+maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs
maybeFromAttrib _ _ = Nothing
pLink :: PandocMonad m => TagParser m Inlines
pLink = try $ do
tag <- pSatisfy $ tagOpenLit "a" (const True)
- let title = T.unpack $ fromAttrib "title" tag
+ let title = fromAttrib "title" tag
-- take id from id attribute if present, otherwise name
- let uid = fromMaybe (T.unpack $ fromAttrib "name" tag) $
+ let uid = fromMaybe (fromAttrib "name" tag) $
maybeFromAttrib "id" tag
- let cls = words $ T.unpack $ fromAttrib "class" tag
+ let cls = T.words $ fromAttrib "class" tag
lab <- mconcat <$> manyTill inline (pCloses "a")
-- check for href; if href, then a link, otherwise a span
case maybeFromAttrib "href" tag of
@@ -778,34 +775,33 @@ pLink = try $ do
pImage :: PandocMonad m => TagParser m Inlines
pImage = do
tag <- pSelfClosing (=="img") (isJust . lookup "src")
- url <- canonicalizeUrl $ T.unpack $ fromAttrib "src" tag
- let title = T.unpack $ fromAttrib "title" tag
- let alt = T.unpack $ fromAttrib "alt" tag
- let uid = T.unpack $ fromAttrib "id" tag
- let cls = words $ T.unpack $ fromAttrib "class" tag
+ url <- canonicalizeUrl $ fromAttrib "src" tag
+ let title = fromAttrib "title" tag
+ let alt = fromAttrib "alt" tag
+ let uid = fromAttrib "id" tag
+ let cls = T.words $ fromAttrib "class" tag
let getAtt k = case fromAttrib k tag of
"" -> []
- v -> [(T.unpack k, T.unpack v)]
+ v -> [(k, v)]
let kvs = concatMap getAtt ["width", "height", "sizes", "srcset"]
return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt)
-pCodeWithClass :: PandocMonad m => [(T.Text,String)] -> TagParser m Inlines
-pCodeWithClass elemToClass = try $ do
+pCodeWithClass :: PandocMonad m => [(T.Text,Text)] -> TagParser m Inlines
+pCodeWithClass elemToClass = try $ do
let tagTest = flip elem . fmap fst $ elemToClass
TagOpen open attr' <- pSatisfy $ tagOpen tagTest (const True)
result <- manyTill pAny (pCloses open)
let (ids,cs,kvs) = mkAttr . toStringAttr $ attr'
cs' = maybe cs (:cs) . lookup open $ elemToClass
return . B.codeWith (ids,cs',kvs) .
- unwords . lines . T.unpack . innerText $ result
+ T.unwords . T.lines . innerText $ result
pCode :: PandocMonad m => TagParser m Inlines
pCode = try $ do
(TagOpen open attr') <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
let attr = toStringAttr attr'
result <- manyTill pAny (pCloses open)
- return $ B.codeWith (mkAttr attr) $ unwords $ lines $ T.unpack $
- innerText result
+ return $ B.codeWith (mkAttr attr) $ T.unwords $ T.lines $ innerText result
pSpan :: PandocMonad m => TagParser m Inlines
pSpan = try $ do
@@ -817,7 +813,7 @@ pSpan = try $ do
where styleAttr = fromMaybe "" $ lookup "style" attr
fontVariant = fromMaybe "" $ pickStyleAttrProps ["font-variant"] styleAttr
classes = fromMaybe [] $
- words <$> lookup "class" attr
+ T.words <$> lookup "class" attr
let tag = if isSmallCaps then B.smallcaps else B.spanWith (mkAttr attr)
return $ tag contents
@@ -829,18 +825,17 @@ pRawHtmlInline = do
then pSatisfy (not . isBlockTag)
else pSatisfy isInlineTag
exts <- getOption readerExtensions
- let raw = T.unpack $ renderTags' [result]
+ let raw = renderTags' [result]
if extensionEnabled Ext_raw_html exts
then return $ B.rawInline "html" raw
else ignore raw
-mathMLToTeXMath :: String -> Either String String
+mathMLToTeXMath :: Text -> Either Text Text
mathMLToTeXMath s = writeTeX <$> readMathML s
-toStringAttr :: [(Text, Text)] -> [(String, String)]
+toStringAttr :: [(Text, Text)] -> [(Text, Text)]
toStringAttr = map go
- where go (x,y) = (T.unpack (fromMaybe x $ T.stripPrefix "data-" x),
- T.unpack y)
+ where go (x,y) = (fromMaybe x $ T.stripPrefix "data-" x, y)
pScriptMath :: PandocMonad m => TagParser m Inlines
pScriptMath = try $ do
@@ -849,8 +844,7 @@ pScriptMath = try $ do
Just x | "math/tex" `T.isPrefixOf` x
-> return $ "display" `T.isSuffixOf` x
_ -> mzero
- contents <- T.unpack . innerText <$>
- manyTill pAny (pSatisfy (matchTagClose "script"))
+ contents <- innerText <$> manyTill pAny (pSatisfy (matchTagClose "script"))
return $ (if isdisplay then B.displayMath else B.math) contents
pMath :: PandocMonad m => Bool -> TagParser m Inlines
@@ -862,11 +856,11 @@ pMath inCase = try $ do
unless inCase $
guard (maybe True (== mathMLNamespace) (lookup "xmlns" attr))
contents <- manyTill pAny (pSatisfy (matchTagClose "math"))
- case mathMLToTeXMath (T.unpack $ renderTags $
+ case mathMLToTeXMath (renderTags $
[open] <> contents <> [TagClose "math"]) of
Left _ -> return $ B.spanWith ("",["math"],attr) $ B.text $
- T.unpack $ innerText contents
- Right [] -> return mempty
+ innerText contents
+ Right "" -> return mempty
Right x -> return $ case lookup "display" attr of
Just "block" -> B.displayMath x
_ -> B.math x
@@ -925,7 +919,7 @@ pTagText = try $ do
parsed <- lift $ lift $
flip runReaderT qu $ runParserT (many pTagContents) st "text" str
case parsed of
- Left _ -> throwError $ PandocParseError $ "Could not parse `" <> T.unpack str <> "'"
+ Left _ -> throwError $ PandocParseError $ "Could not parse `" <> str <> "'"
Right result -> return $ mconcat result
pBlank :: PandocMonad m => TagParser m ()
@@ -954,11 +948,11 @@ pRawTeX = do
guardEnabled Ext_raw_tex
inp <- getInput
st <- getState
- res <- lift $ runParserT (withRaw rawLaTeXInline) st "chunk" (T.unpack inp)
+ res <- lift $ runParserT (withRaw rawLaTeXInline) st "chunk" inp
case res of
Left _ -> mzero
Right (contents, raw) -> do
- _ <- count (length raw) anyChar
+ _ <- count (T.length raw) anyChar
return $ B.rawInline "tex" contents
pStr :: PandocMonad m => InlinesParser m Inlines
@@ -966,7 +960,7 @@ pStr = do
result <- many1 $ satisfy $ \c ->
not (isSpace c) && not (isSpecial c) && not (isBad c)
updateLastStrPos
- return $ B.str result
+ return $ B.str $ T.pack result
isSpecial :: Char -> Bool
isSpecial '"' = True
@@ -982,7 +976,7 @@ isSpecial '\8221' = True
isSpecial _ = False
pSymbol :: PandocMonad m => InlinesParser m Inlines
-pSymbol = satisfy isSpecial >>= return . B.str . (:[])
+pSymbol = satisfy isSpecial >>= return . B.str . T.singleton
isBad :: Char -> Bool
isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
@@ -1019,7 +1013,7 @@ pBad = do
'\158' -> '\382'
'\159' -> '\376'
_ -> '?'
- return $ B.str [c']
+ return $ B.str $ T.singleton c'
pSpace :: PandocMonad m => InlinesParser m Inlines
pSpace = many1 (satisfy isSpace) >>= \xs ->
@@ -1156,8 +1150,8 @@ _ `closes` _ = False
-- | Matches a stretch of HTML in balanced tags.
htmlInBalanced :: Monad m
- => (Tag String -> Bool)
- -> ParserT String st m String
+ => (Tag Text -> Bool)
+ -> ParserT Text st m Text
htmlInBalanced f = try $ do
lookAhead (char '<')
inp <- getInput
@@ -1174,21 +1168,21 @@ htmlInBalanced f = try $ do
(TagClose _ : TagPosition er ec : _) -> do
let ls = er - sr
let cs = ec - sc
- lscontents <- unlines <$> count ls anyLine
+ lscontents <- T.unlines <$> count ls anyLine
cscontents <- count cs anyChar
closetag <- do
x <- many (satisfy (/='>'))
char '>'
return (x <> ">")
- return (lscontents <> cscontents <> closetag)
+ return $ lscontents <> T.pack cscontents <> T.pack closetag
_ -> mzero
_ -> mzero
-htmlInBalanced' :: String
- -> [Tag String]
- -> [Tag String]
+htmlInBalanced' :: Text
+ -> [Tag Text]
+ -> [Tag Text]
htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts
- where go :: Int -> [Tag String] -> Maybe [Tag String]
+ where go :: Int -> [Tag Text] -> Maybe [Tag Text]
go n (t@(TagOpen tn' _):rest) | tn' == tagname =
(t :) <$> go (n + 1) rest
go 1 (t@(TagClose tn'):_) | tn' == tagname =
@@ -1204,8 +1198,8 @@ hasTagWarning _ = False
-- | Matches a tag meeting a certain condition.
htmlTag :: (HasReaderOptions st, Monad m)
- => (Tag String -> Bool)
- -> ParserT [Char] st m (Tag String, String)
+ => (Tag Text -> Bool)
+ -> ParserT Text st m (Tag Text, Text)
htmlTag f = try $ do
lookAhead (char '<')
startpos <- getPosition
@@ -1213,7 +1207,7 @@ htmlTag f = try $ do
let ts = canonicalizeTags $ parseTagsOptions
parseOptions{ optTagWarning = False
, optTagPosition = True }
- (inp ++ " ") -- add space to ensure that
+ (inp <> " ") -- add space to ensure that
-- we get a TagPosition after the tag
(next, ln, col) <- case ts of
(TagPosition{} : next : TagPosition ln col : _)
@@ -1225,13 +1219,12 @@ htmlTag f = try $ do
-- so we exclude . even though it's a valid character
-- in XML element names
let isNameChar c = isAlphaNum c || c == ':' || c == '-' || c == '_'
- let isName s = case s of
- [] -> False
- (c:cs) -> isLetter c && all isNameChar cs
- let isPI s = case s of
- ('?':_) -> True -- processing instruction
- _ -> False
-
+ let isName s = case T.uncons s of
+ Nothing -> False
+ Just (c, cs) -> isLetter c && T.all isNameChar cs
+ let isPI s = case T.uncons s of
+ Just ('?', _) -> True -- processing instruction
+ _ -> False
let endpos = if ln == 1
then setSourceColumn startpos
(sourceColumn startpos + (col - 1))
@@ -1247,18 +1240,18 @@ htmlTag f = try $ do
-- basic sanity check, since the parser is very forgiving
-- and finds tags in stuff like x<y)
guard $ isName tagname || isPI tagname
- guard $ not $ null tagname
+ guard $ not $ T.null tagname
-- <https://example.org> should NOT be a tag either.
-- tagsoup will parse it as TagOpen "https:" [("example.org","")]
- guard $ last tagname /= ':'
+ guard $ T.last tagname /= ':'
char '<'
rendered <- manyTill anyChar endAngle
- return (next, "<" ++ rendered ++ ">")
+ return (next, T.pack $ "<" ++ rendered ++ ">")
case next of
TagComment s
- | "<!--" `isPrefixOf` inp -> do
+ | "<!--" `T.isPrefixOf` inp -> do
string "<!--"
- count (length s) anyChar
+ count (T.length s) anyChar
string "-->"
stripComments <- getOption readerStripComments
if stripComments
@@ -1272,12 +1265,12 @@ htmlTag f = try $ do
handleTag tagname
_ -> mzero
-mkAttr :: [(String, String)] -> Attr
+mkAttr :: [(Text, Text)] -> Attr
mkAttr attr = (attribsId, attribsClasses, attribsKV)
where attribsId = fromMaybe "" $ lookup "id" attr
- attribsClasses = words (fromMaybe "" $ lookup "class" attr) <> epubTypes
+ attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
- epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr
+ epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr
-- Strip namespace prefixes
stripPrefixes :: [Tag Text] -> [Tag Text]
@@ -1304,11 +1297,11 @@ isSpace _ = False
-- Utilities
-- | Adjusts a url according to the document's base URL.
-canonicalizeUrl :: PandocMonad m => String -> TagParser m String
+canonicalizeUrl :: PandocMonad m => Text -> TagParser m Text
canonicalizeUrl url = do
mbBaseHref <- baseHref <$> getState
- return $ case (parseURIReference url, mbBaseHref) of
- (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs)
+ return $ case (parseURIReference (T.unpack url), mbBaseHref) of
+ (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs)
_ -> url