Switch to new pandoc-types and use Text instead of String [API change].

PR #5884. + Use pandoc-types 1.20 and texmath 0.12. + Text is now used instead of String, with a few exceptions. + In the MediaBag module, some of the types using Strings were switched to use FilePath instead (not Text). + In the Parsing module, new parsers `manyChar`, `many1Char`, `manyTillChar`, `many1TillChar`, `many1Till`, `manyUntil`, `mantyUntilChar` have been added: these are like their unsuffixed counterparts but pack some or all of their output. + `glob` in Text.Pandoc.Class still takes String since it seems to be intended as an interface to Glob, which uses strings. It seems to be used only once in the package, in the EPUB writer, so that is not hard to change.
author: despresc <christian.j.j.despres@gmail.com> 2019-11-04 16:12:37 -0500
committer: John MacFarlane <jgm@berkeley.edu> 2019-11-12 16:03:45 -0800
commit: 90e436d49604e3fd1ef9432fb23f6d7f6245c7fd (patch)
tree: 4e7f0692f989643189f1fc6786050d95e239a0ea /src/Text/Pandoc/Readers/HTML.hs
parent: d3966372f5049eea56213b069fc4d70d8af9144c (diff)
download: pandoc-90e436d49604e3fd1ef9432fb23f6d7f6245c7fd.tar.gz
1 files changed, 112 insertions, 119 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index e03ac6a97..1c2892d6a 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -1,5 +1,5 @@
-{-# LANGUAGE NoImplicitPrelude #-}
-{-# LANGUAGE LambdaCase        #-}
+{-# LANGUAGE NoImplicitPrelude     #-}
+{-# LANGUAGE LambdaCase            #-}
 {-# LANGUAGE FlexibleContexts      #-}
 {-# LANGUAGE FlexibleInstances     #-}
 {-# LANGUAGE MultiParamTypeClasses #-}
@@ -35,8 +35,7 @@ import Control.Monad.Reader (ReaderT, ask, asks, lift, local, runReaderT)
 import Data.Char (isAlphaNum, isLetter)
 import Data.Default (Default (..), def)
 import Data.Foldable (for_)
-import Data.List (isPrefixOf)
-import Data.List.Split (wordsBy, splitWhen)
+import Data.List.Split (splitWhen)
 import qualified Data.Map as M
 import Data.Maybe (fromMaybe, isJust, isNothing)
 import Data.Monoid (First (..))
@@ -62,8 +61,8 @@ import Text.Pandoc.Options (
     extensionEnabled)
 import Text.Pandoc.Parsing hiding ((<|>))
 import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI,
-                           extractSpaces, htmlSpanLikeElements,
-                           onlySimpleTableCells, safeRead, underlineSpan)
+                           extractSpaces, htmlSpanLikeElements, elemText, splitTextBy,
+                           onlySimpleTableCells, safeRead, underlineSpan, tshow)
 import Text.Pandoc.Walk
 import Text.Parsec.Error
 import Text.TeXMath (readMathML, writeTeX)
@@ -93,14 +92,14 @@ readHtml opts inp = do
        "source" tags
   case result of
     Right doc -> return doc
-    Left  err -> throwError $ PandocParseError $ getError err
+    Left  err -> throwError $ PandocParseError $ T.pack $ getError err
 
 replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block]
 replaceNotes bs = do
   st <- getState
   return $ walk (replaceNotes' (noteTable st)) bs
 
-replaceNotes' :: [(String, Blocks)] -> Inline -> Inline
+replaceNotes' :: [(Text, Blocks)] -> Inline -> Inline
 replaceNotes' noteTbl (RawInline (Format "noteref") ref) =
   maybe (Str "") (Note . B.toList) $ lookup ref noteTbl
 replaceNotes' _ x = x
@@ -108,9 +107,9 @@ replaceNotes' _ x = x
 data HTMLState =
   HTMLState
   {  parserState :: ParserState,
-     noteTable   :: [(String, Blocks)],
+     noteTable   :: [(Text, Blocks)],
      baseHref    :: Maybe URI,
-     identifiers :: Set.Set String,
+     identifiers :: Set.Set Text,
      logMessages :: [LogMessage],
      macros      :: M.Map Text Macro
   }
@@ -134,7 +133,7 @@ pHtml :: PandocMonad m => TagParser m Blocks
 pHtml = try $ do
   (TagOpen "html" attr) <- lookAhead pAny
   for_ (lookup "lang" attr) $
-    updateState . B.setMeta "lang" . B.text . T.unpack
+    updateState . B.setMeta "lang" . B.text
   pInTags "html" block
 
 pBody :: PandocMonad m => TagParser m Blocks
@@ -146,11 +145,11 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAny)
         setTitle t = mempty <$ updateState (B.setMeta "title" t)
         pMetaTag = do
           mt <- pSatisfy (matchTagOpen "meta" [])
-          let name = T.unpack $ fromAttrib "name" mt
-          if null name
+          let name = fromAttrib "name" mt
+          if T.null name
              then return mempty
              else do
-               let content = T.unpack $ fromAttrib "content" mt
+               let content = fromAttrib "content" mt
                updateState $ \s ->
                  let ps = parserState s in
                  s{ parserState = ps{
@@ -187,13 +186,13 @@ block = do
             , pFigure
             , pRawHtmlBlock
             ]
-  trace (take 60 $ show $ B.toList res)
+  trace (T.take 60 $ tshow $ B.toList res)
   return res
 
-namespaces :: PandocMonad m => [(String, TagParser m Inlines)]
+namespaces :: PandocMonad m => [(Text, TagParser m Inlines)]
 namespaces = [(mathMLNamespace, pMath True)]
 
-mathMLNamespace :: String
+mathMLNamespace :: Text
 mathMLNamespace = "http://www.w3.org/1998/Math/MathML"
 
 eSwitch :: (PandocMonad m, Monoid a)
@@ -233,7 +232,7 @@ eFootnote = try $ do
   content <- pInTags tag block
   addNote ident content
 
-addNote :: PandocMonad m => String -> Blocks -> TagParser m ()
+addNote :: PandocMonad m => Text -> Blocks -> TagParser m ()
 addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : noteTable s})
 
 eNoteref :: PandocMonad m => TagParser m Inlines
@@ -245,8 +244,8 @@ eNoteref = try $ do
                     -> (lookup "type" as <|> lookup "epub:type" as)
                         == Just "noteref"
                  _  -> False)
-  ident <- case T.unpack <$> lookup "href" attr of
-             Just ('#':rest) -> return rest
+  ident <- case lookup "href" attr >>= T.uncons of
+             Just ('#', rest) -> return rest
              _ -> mzero
   _ <- manyTill pAny (pSatisfy (\case
                                    TagClose t -> t == tag
@@ -287,7 +286,7 @@ pListItem nonItem = do
   maybe id addId (lookup "id" attr) <$>
     pInTags "li" block <* skipMany nonItem
 
-parseListStyleType :: String -> ListNumberStyle
+parseListStyleType :: Text -> ListNumberStyle
 parseListStyleType "lower-roman" = LowerRoman
 parseListStyleType "upper-roman" = UpperRoman
 parseListStyleType "lower-alpha" = LowerAlpha
@@ -295,7 +294,7 @@ parseListStyleType "upper-alpha" = UpperAlpha
 parseListStyleType "decimal"     = Decimal
 parseListStyleType _             = DefaultStyle
 
-parseTypeAttr :: String -> ListNumberStyle
+parseTypeAttr :: Text -> ListNumberStyle
 parseTypeAttr "i" = LowerRoman
 parseTypeAttr "I" = UpperRoman
 parseTypeAttr "a" = LowerAlpha
@@ -404,20 +403,19 @@ pDiv = try $ do
 
 pRawHtmlBlock :: PandocMonad m => TagParser m Blocks
 pRawHtmlBlock = do
-  raw <- T.unpack <$>
-    (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pHtmlBlock "textarea"
-      <|> pRawTag)
+  raw <- (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pHtmlBlock "textarea"
+          <|> pRawTag)
   exts <- getOption readerExtensions
-  if extensionEnabled Ext_raw_html exts && not (null raw)
+  if extensionEnabled Ext_raw_html exts && not (T.null raw)
      then return $ B.rawBlock "html" raw
      else ignore raw
 
-ignore :: (Monoid a, PandocMonad m) => String -> TagParser m a
+ignore :: (Monoid a, PandocMonad m) => Text -> TagParser m a
 ignore raw = do
   pos <- getPosition
   -- raw can be null for tags like <!DOCTYPE>; see paRawTag
   -- in this case we don't want a warning:
-  unless (null raw) $
+  unless (T.null raw) $
     logMessage $ SkippedContent raw pos
   return mempty
 
@@ -438,7 +436,7 @@ eSection = try $ do
 
 headerLevel :: Text -> TagParser m Int
 headerLevel tagtype =
-  case safeRead (T.unpack (T.drop 1 tagtype)) of
+  case safeRead (T.drop 1 tagtype) of
         Just level ->
 --          try (do
 --            guardEnabled Ext_epub_html_exts
@@ -468,7 +466,7 @@ pHeader = try $ do
   level <- headerLevel tagtype
   contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof)
   let ident = fromMaybe "" $ lookup "id" attr
-  let classes = maybe [] words $ lookup "class" attr
+  let classes = maybe [] T.words $ lookup "class" attr
   let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"]
   attr'' <- registerHeader (ident, classes, keyvals) contents
   return $ if bodyTitle
@@ -529,14 +527,14 @@ pCol = try $ do
   optional $ pSatisfy (matchTagClose "col")
   skipMany pBlank
   let width = case lookup "width" attribs of
-           Nothing -> case lookup "style" attribs of
-               Just ('w':'i':'d':'t':'h':':':xs) | '%' `elem` xs ->
-                 fromMaybe 0.0 $ safeRead (filter
-                   (`notElem` (" \t\r\n%'\";" :: [Char])) xs)
-               _ -> 0.0
-           Just x | not (null x) && last x == '%' ->
-             fromMaybe 0.0 $ safeRead (init x)
-           _ -> 0.0
+                Nothing -> case lookup "style" attribs of
+                  Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
+                    fromMaybe 0.0 $ safeRead (T.filter
+                      (`notElem` (" \t\r\n%'\";" :: [Char])) xs)
+                  _ -> 0.0
+                Just (T.unsnoc -> Just (xs, '%')) ->
+                  fromMaybe 0.0 $ safeRead xs
+                _ -> 0.0
   if width > 0.0
     then return $ width / 100.0
     else return 0.0
@@ -562,7 +560,7 @@ pCell celltype = try $ do
   let extractAlign' []                 = ""
       extractAlign' ("text-align":x:_) = x
       extractAlign' (_:xs)             = extractAlign' xs
-  let extractAlign = extractAlign' . wordsBy (`elem` [' ','\t',';',':'])
+  let extractAlign = extractAlign' . splitTextBy (`elemText` " \t;:")
   let align = case maybeFromAttrib "align" tag `mplus`
                    (extractAlign <$> maybeFromAttrib "style" tag) of
                    Just "left"   -> AlignLeft
@@ -610,7 +608,7 @@ pFigure = try $ do
   let caption = fromMaybe mempty mbcap
   case B.toList <$> mbimg of
        Just [Image attr _ (url, tit)] ->
-         return $ B.para $ B.imageWith attr url ("fig:" ++ tit) caption
+         return $ B.para $ B.imageWith attr url ("fig:" <> tit) caption
        _ -> mzero
 
 pCodeBlock :: PandocMonad m => TagParser m Blocks
@@ -618,21 +616,21 @@ pCodeBlock = try $ do
   TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" [])
   let attr = toStringAttr attr'
   contents <- manyTill pAny (pCloses "pre" <|> eof)
-  let rawText = concatMap tagToString contents
+  let rawText = T.concat $ map tagToText contents
   -- drop leading newline if any
-  let result' = case rawText of
-                     '\n':xs -> xs
-                     _       -> rawText
+  let result' = case T.uncons rawText of
+                     Just ('\n', xs) -> xs
+                     _               -> rawText
   -- drop trailing newline if any
-  let result = case reverse result' of
-                    '\n':_ -> init result'
-                    _      -> result'
+  let result = case T.unsnoc result' of
+                    Just (result'', '\n') -> result''
+                    _                     -> result'
   return $ B.codeBlockWith (mkAttr attr) result
 
-tagToString :: Tag Text -> String
-tagToString (TagText s)      = T.unpack s
-tagToString (TagOpen "br" _) = "\n"
-tagToString _                = ""
+tagToText :: Tag Text -> Text
+tagToText (TagText s)      = s
+tagToText (TagOpen "br" _) = "\n"
+tagToText _                = ""
 
 inline :: PandocMonad m => TagParser m Inlines
 inline = choice
@@ -667,7 +665,7 @@ pLocation = do
 pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
 pSat f = do
   pos <- getPosition
-  token show (const pos) (\x -> if f x then Just x else Nothing)
+  token tshow (const pos) (\x -> if f x then Just x else Nothing)
 
 pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
 pSatisfy f = try $ optional pLocation >> pSat f
@@ -688,10 +686,10 @@ pQ = choice $ map try [citedQuote, normalQuote]
   where citedQuote = do
           tag <- pSatisfy $ tagOpenLit "q" (any ((=="cite") . fst))
 
-          url <- canonicalizeUrl $ T.unpack $ fromAttrib "cite" tag
-          let uid = fromMaybe (T.unpack $ fromAttrib "name" tag) $
+          url <- canonicalizeUrl $ fromAttrib "cite" tag
+          let uid = fromMaybe (fromAttrib "name" tag) $
                        maybeFromAttrib "id" tag
-          let cls = words $ T.unpack $ fromAttrib "class" tag
+          let cls = T.words $ fromAttrib "class" tag
 
           makeQuote $ B.spanWith (uid, cls, [("cite", escapeURI url)])
         normalQuote = do
@@ -729,7 +727,7 @@ pSpanLike =
       TagOpen _ attrs <- pSatisfy $ tagOpenLit tagName (const True)
       let (ids, cs, kvs) = mkAttr . toStringAttr $ attrs
       content <- mconcat <$> manyTill inline (pCloses tagName <|> eof)
-      return $ B.spanWith (ids, T.unpack tagName : cs, kvs) content
+      return $ B.spanWith (ids, tagName : cs, kvs) content
 
 pSmall :: PandocMonad m => TagParser m Inlines
 pSmall = pInlinesInTags "small" (B.spanWith ("",["small"],[]))
@@ -753,19 +751,18 @@ pLineBreak = do
 
 -- Unlike fromAttrib from tagsoup, this distinguishes
 -- between a missing attribute and an attribute with empty content.
-maybeFromAttrib :: String -> Tag Text -> Maybe String
-maybeFromAttrib name (TagOpen _ attrs) =
-  T.unpack <$> lookup (T.pack name) attrs
+maybeFromAttrib :: Text -> Tag Text -> Maybe Text
+maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs
 maybeFromAttrib _ _ = Nothing
 
 pLink :: PandocMonad m => TagParser m Inlines
 pLink = try $ do
   tag <- pSatisfy $ tagOpenLit "a" (const True)
-  let title = T.unpack $ fromAttrib "title" tag
+  let title = fromAttrib "title" tag
   -- take id from id attribute if present, otherwise name
-  let uid = fromMaybe (T.unpack $ fromAttrib "name" tag) $
+  let uid = fromMaybe (fromAttrib "name" tag) $
                maybeFromAttrib "id" tag
-  let cls = words $ T.unpack $ fromAttrib "class" tag
+  let cls = T.words $ fromAttrib "class" tag
   lab <- mconcat <$> manyTill inline (pCloses "a")
   -- check for href; if href, then a link, otherwise a span
   case maybeFromAttrib "href" tag of
@@ -778,34 +775,33 @@ pLink = try $ do
 pImage :: PandocMonad m => TagParser m Inlines
 pImage = do
   tag <- pSelfClosing (=="img") (isJust . lookup "src")
-  url <- canonicalizeUrl $ T.unpack $ fromAttrib "src" tag
-  let title = T.unpack $ fromAttrib "title" tag
-  let alt = T.unpack $ fromAttrib "alt" tag
-  let uid = T.unpack $ fromAttrib "id" tag
-  let cls = words $ T.unpack $ fromAttrib "class" tag
+  url <- canonicalizeUrl $ fromAttrib "src" tag
+  let title = fromAttrib "title" tag
+  let alt = fromAttrib "alt" tag
+  let uid = fromAttrib "id" tag
+  let cls = T.words $ fromAttrib "class" tag
   let getAtt k = case fromAttrib k tag of
                    "" -> []
-                   v  -> [(T.unpack k, T.unpack v)]
+                   v  -> [(k, v)]
   let kvs = concatMap getAtt ["width", "height", "sizes", "srcset"]
   return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt)
 
-pCodeWithClass :: PandocMonad m => [(T.Text,String)] -> TagParser m Inlines 
-pCodeWithClass elemToClass = try $ do 
+pCodeWithClass :: PandocMonad m => [(T.Text,Text)] -> TagParser m Inlines
+pCodeWithClass elemToClass = try $ do
   let tagTest = flip elem . fmap fst $ elemToClass
   TagOpen open attr' <- pSatisfy $ tagOpen tagTest (const True)
   result <- manyTill pAny (pCloses open)
   let (ids,cs,kvs) = mkAttr . toStringAttr $ attr'
       cs'          = maybe cs (:cs) . lookup open $ elemToClass
   return . B.codeWith (ids,cs',kvs) .
-    unwords . lines . T.unpack . innerText $ result
+    T.unwords . T.lines . innerText $ result
 
 pCode :: PandocMonad m => TagParser m Inlines
 pCode = try $ do
   (TagOpen open attr') <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
   let attr = toStringAttr attr'
   result <- manyTill pAny (pCloses open)
-  return $ B.codeWith (mkAttr attr) $ unwords $ lines $ T.unpack $
-           innerText result
+  return $ B.codeWith (mkAttr attr) $ T.unwords $ T.lines $ innerText result
 
 pSpan :: PandocMonad m => TagParser m Inlines
 pSpan = try $ do
@@ -817,7 +813,7 @@ pSpan = try $ do
                     where styleAttr   = fromMaybe "" $ lookup "style" attr
                           fontVariant = fromMaybe "" $ pickStyleAttrProps ["font-variant"] styleAttr
                           classes     = fromMaybe [] $
-                                          words <$> lookup "class" attr
+                                          T.words <$> lookup "class" attr
   let tag = if isSmallCaps then B.smallcaps else B.spanWith (mkAttr attr)
   return $ tag contents
 
@@ -829,18 +825,17 @@ pRawHtmlInline = do
                    then pSatisfy (not . isBlockTag)
                    else pSatisfy isInlineTag
   exts <- getOption readerExtensions
-  let raw = T.unpack $ renderTags' [result]
+  let raw = renderTags' [result]
   if extensionEnabled Ext_raw_html exts
      then return $ B.rawInline "html" raw
      else ignore raw
 
-mathMLToTeXMath :: String -> Either String String
+mathMLToTeXMath :: Text -> Either Text Text
 mathMLToTeXMath s = writeTeX <$> readMathML s
 
-toStringAttr :: [(Text, Text)] -> [(String, String)]
+toStringAttr :: [(Text, Text)] -> [(Text, Text)]
 toStringAttr = map go
-  where go (x,y) = (T.unpack (fromMaybe x $ T.stripPrefix "data-" x),
-                    T.unpack y)
+  where go (x,y) = (fromMaybe x $ T.stripPrefix "data-" x, y)
 
 pScriptMath :: PandocMonad m => TagParser m Inlines
 pScriptMath = try $ do
@@ -849,8 +844,7 @@ pScriptMath = try $ do
                     Just x | "math/tex" `T.isPrefixOf` x
                       -> return $ "display" `T.isSuffixOf` x
                     _ -> mzero
-  contents <- T.unpack . innerText <$>
-                manyTill pAny (pSatisfy (matchTagClose "script"))
+  contents <- innerText <$> manyTill pAny (pSatisfy (matchTagClose "script"))
   return $ (if isdisplay then B.displayMath else B.math) contents
 
 pMath :: PandocMonad m => Bool -> TagParser m Inlines
@@ -862,11 +856,11 @@ pMath inCase = try $ do
   unless inCase $
     guard (maybe True (== mathMLNamespace) (lookup "xmlns" attr))
   contents <- manyTill pAny (pSatisfy (matchTagClose "math"))
-  case mathMLToTeXMath (T.unpack $ renderTags $
+  case mathMLToTeXMath (renderTags $
           [open] <> contents <> [TagClose "math"]) of
        Left _   -> return $ B.spanWith ("",["math"],attr) $ B.text $
-                             T.unpack $ innerText contents
-       Right [] -> return mempty
+                             innerText contents
+       Right "" -> return mempty
        Right x  -> return $ case lookup "display" attr of
                                  Just "block" -> B.displayMath x
                                  _            -> B.math x
@@ -925,7 +919,7 @@ pTagText = try $ do
   parsed <- lift $ lift $
             flip runReaderT qu $ runParserT (many pTagContents) st "text" str
   case parsed of
-       Left _        -> throwError $ PandocParseError $ "Could not parse `" <> T.unpack str <> "'"
+       Left _        -> throwError $ PandocParseError $ "Could not parse `" <> str <> "'"
        Right result  -> return $ mconcat result
 
 pBlank :: PandocMonad m => TagParser m ()
@@ -954,11 +948,11 @@ pRawTeX = do
   guardEnabled Ext_raw_tex
   inp <- getInput
   st <- getState
-  res <- lift $ runParserT (withRaw rawLaTeXInline) st "chunk" (T.unpack inp)
+  res <- lift $ runParserT (withRaw rawLaTeXInline) st "chunk" inp
   case res of
        Left _                -> mzero
        Right (contents, raw) -> do
-         _ <- count (length raw) anyChar
+         _ <- count (T.length raw) anyChar
          return $ B.rawInline "tex" contents
 
 pStr :: PandocMonad m => InlinesParser m Inlines
@@ -966,7 +960,7 @@ pStr = do
   result <- many1 $ satisfy $ \c ->
                      not (isSpace c) && not (isSpecial c) && not (isBad c)
   updateLastStrPos
-  return $ B.str result
+  return $ B.str $ T.pack result
 
 isSpecial :: Char -> Bool
 isSpecial '"'     = True
@@ -982,7 +976,7 @@ isSpecial '\8221' = True
 isSpecial _       = False
 
 pSymbol :: PandocMonad m => InlinesParser m Inlines
-pSymbol = satisfy isSpecial >>= return . B.str . (:[])
+pSymbol = satisfy isSpecial >>= return . B.str . T.singleton
 
 isBad :: Char -> Bool
 isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
@@ -1019,7 +1013,7 @@ pBad = do
                 '\158' -> '\382'
                 '\159' -> '\376'
                 _      -> '?'
-  return $ B.str [c']
+  return $ B.str $ T.singleton c'
 
 pSpace :: PandocMonad m => InlinesParser m Inlines
 pSpace = many1 (satisfy isSpace) >>= \xs ->
@@ -1156,8 +1150,8 @@ _ `closes` _ = False
 
 -- | Matches a stretch of HTML in balanced tags.
 htmlInBalanced :: Monad m
-               => (Tag String -> Bool)
-               -> ParserT String st m String
+               => (Tag Text -> Bool)
+               -> ParserT Text st m Text
 htmlInBalanced f = try $ do
   lookAhead (char '<')
   inp <- getInput
@@ -1174,21 +1168,21 @@ htmlInBalanced f = try $ do
                         (TagClose _ : TagPosition er ec : _) -> do
                           let ls = er - sr
                           let cs = ec - sc
-                          lscontents <- unlines <$> count ls anyLine
+                          lscontents <- T.unlines <$> count ls anyLine
                           cscontents <- count cs anyChar
                           closetag <- do
                             x <- many (satisfy (/='>'))
                             char '>'
                             return (x <> ">")
-                          return (lscontents <> cscontents <> closetag)
+                          return $ lscontents <> T.pack cscontents <> T.pack closetag
                         _ -> mzero
     _ -> mzero
 
-htmlInBalanced' :: String
-                -> [Tag String]
-                -> [Tag String]
+htmlInBalanced' :: Text
+                -> [Tag Text]
+                -> [Tag Text]
 htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts
-  where go :: Int -> [Tag String] -> Maybe [Tag String]
+  where go :: Int -> [Tag Text] -> Maybe [Tag Text]
         go n (t@(TagOpen tn' _):rest) | tn' == tagname =
               (t :) <$> go (n + 1) rest
         go 1 (t@(TagClose tn'):_) | tn' == tagname =
@@ -1204,8 +1198,8 @@ hasTagWarning _                = False
 
 -- | Matches a tag meeting a certain condition.
 htmlTag :: (HasReaderOptions st, Monad m)
-        => (Tag String -> Bool)
-        -> ParserT [Char] st m (Tag String, String)
+        => (Tag Text -> Bool)
+        -> ParserT Text st m (Tag Text, Text)
 htmlTag f = try $ do
   lookAhead (char '<')
   startpos <- getPosition
@@ -1213,7 +1207,7 @@ htmlTag f = try $ do
   let ts = canonicalizeTags $ parseTagsOptions
                                parseOptions{ optTagWarning = False
                                            , optTagPosition = True }
-                               (inp ++ " ") -- add space to ensure that
+                               (inp <> " ") -- add space to ensure that
                                -- we get a TagPosition after the tag
   (next, ln, col) <- case ts of
                       (TagPosition{} : next : TagPosition ln col : _)
@@ -1225,13 +1219,12 @@ htmlTag f = try $ do
   -- so we exclude . even though it's a valid character
   -- in XML element names
   let isNameChar c = isAlphaNum c || c == ':' || c == '-' || c == '_'
-  let isName s = case s of
-                      []      -> False
-                      (c:cs)  -> isLetter c && all isNameChar cs
-  let isPI s = case s of
-                      ('?':_) -> True -- processing instruction
-                      _       -> False
-
+  let isName s = case T.uncons s of
+                   Nothing      -> False
+                   Just (c, cs) -> isLetter c && T.all isNameChar cs
+  let isPI s = case T.uncons s of
+                 Just ('?', _) -> True -- processing instruction
+                 _             -> False
   let endpos = if ln == 1
                   then setSourceColumn startpos
                          (sourceColumn startpos + (col - 1))
@@ -1247,18 +1240,18 @@ htmlTag f = try $ do
        -- basic sanity check, since the parser is very forgiving
        -- and finds tags in stuff like x<y)
        guard $ isName tagname || isPI tagname
-       guard $ not $ null tagname
+       guard $ not $ T.null tagname
        -- <https://example.org> should NOT be a tag either.
        -- tagsoup will parse it as TagOpen "https:" [("example.org","")]
-       guard $ last tagname /= ':'
+       guard $ T.last tagname /= ':'
        char '<'
        rendered <- manyTill anyChar endAngle
-       return (next, "<" ++ rendered ++ ">")
+       return (next, T.pack $ "<" ++ rendered ++ ">")
   case next of
        TagComment s
-         | "<!--" `isPrefixOf` inp -> do
+         | "<!--" `T.isPrefixOf` inp -> do
           string "<!--"
-          count (length s) anyChar
+          count (T.length s) anyChar
           string "-->"
           stripComments <- getOption readerStripComments
           if stripComments
@@ -1272,12 +1265,12 @@ htmlTag f = try $ do
          handleTag tagname
        _ -> mzero
 
-mkAttr :: [(String, String)] -> Attr
+mkAttr :: [(Text, Text)] -> Attr
 mkAttr attr = (attribsId, attribsClasses, attribsKV)
   where attribsId = fromMaybe "" $ lookup "id" attr
-        attribsClasses = words (fromMaybe "" $ lookup "class" attr) <> epubTypes
+        attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes
         attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
-        epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr
+        epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr
 
 -- Strip namespace prefixes
 stripPrefixes :: [Tag Text] -> [Tag Text]
@@ -1304,11 +1297,11 @@ isSpace _    = False
 -- Utilities
 
 -- | Adjusts a url according to the document's base URL.
-canonicalizeUrl :: PandocMonad m => String -> TagParser m String
+canonicalizeUrl :: PandocMonad m => Text -> TagParser m Text
 canonicalizeUrl url = do
   mbBaseHref <- baseHref <$> getState
-  return $ case (parseURIReference url, mbBaseHref) of
-                (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs)
+  return $ case (parseURIReference (T.unpack url), mbBaseHref) of
+                (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs)
                 _                   -> url
author	despresc <christian.j.j.despres@gmail.com>	2019-11-04 16:12:37 -0500
committer	John MacFarlane <jgm@berkeley.edu>	2019-11-12 16:03:45 -0800
commit	90e436d49604e3fd1ef9432fb23f6d7f6245c7fd (patch)
tree	4e7f0692f989643189f1fc6786050d95e239a0ea /src/Text/Pandoc/Readers/HTML.hs
parent	d3966372f5049eea56213b069fc4d70d8af9144c (diff)
download	pandoc-90e436d49604e3fd1ef9432fb23f6d7f6245c7fd.tar.gz