aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs96
1 files changed, 62 insertions, 34 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index a34e2fb5c..959a2d16f 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -43,7 +43,7 @@ import Text.HTML.TagSoup.Match
import Text.Pandoc.Definition
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..))
-import Text.Pandoc.Shared ( extractSpaces, renderTags'
+import Text.Pandoc.Shared ( extractSpaces, renderTags', addMetaField
, escapeURI, safeRead, mapLeft )
import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace)
, Extension (Ext_epub_html_exts,
@@ -52,9 +52,9 @@ import Text.Pandoc.Parsing hiding ((<|>))
import Text.Pandoc.Walk
import qualified Data.Map as M
import Data.Maybe ( fromMaybe, isJust)
-import Data.List ( intercalate, isInfixOf, isPrefixOf, isSuffixOf )
+import Data.List ( intercalate, isInfixOf, isPrefixOf )
import Data.Char ( isDigit )
-import Control.Monad ( liftM, guard, when, mzero, void, unless )
+import Control.Monad ( guard, when, mzero, void, unless )
import Control.Arrow ((***))
import Control.Applicative ( (<|>) )
import Data.Monoid (First (..))
@@ -63,12 +63,12 @@ import Debug.Trace (trace)
import Text.TeXMath (readMathML, writeTeX)
import Data.Default (Default (..), def)
import Control.Monad.Reader (Reader,ask, asks, local, runReader)
-import Network.URI (isURI)
+import Network.URI (URI, parseURIReference, nonStrictRelativeTo)
import Text.Pandoc.Error
import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
import Text.Pandoc.Compat.Monoid ((<>))
import Text.Parsec.Error
-
+import qualified Data.Set as Set
-- | Convert HTML-formatted string to 'Pandoc' document.
readHtml :: ReaderOptions -- ^ Reader options
@@ -77,7 +77,7 @@ readHtml :: ReaderOptions -- ^ Reader options
readHtml opts inp =
mapLeft (ParseFailure . getError) . flip runReader def $
runParserT parseDoc
- (HTMLState def{ stateOptions = opts } [] Nothing [] M.empty)
+ (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty)
"source" tags
where tags = stripPrefixes . canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True } inp
@@ -103,8 +103,8 @@ data HTMLState =
HTMLState
{ parserState :: ParserState,
noteTable :: [(String, Blocks)],
- baseHref :: Maybe String,
- identifiers :: [String],
+ baseHref :: Maybe URI,
+ identifiers :: Set.Set String,
headerMap :: M.Map Inlines String
}
@@ -137,19 +137,17 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag
then return mempty
else do
let content = fromAttrib "content" mt
- updateState $ B.setMeta name (B.text content)
+ updateState $ \s ->
+ let ps = parserState s in
+ s{ parserState = ps{
+ stateMeta = addMetaField name (B.text content)
+ (stateMeta ps) } }
return mempty
pBaseTag = do
bt <- pSatisfy (~== TagOpen "base" [])
- let baseH = fromAttrib "href" bt
- if null baseH
- then return mempty
- else do
- let baseH' = case reverse baseH of
- '/':_ -> baseH
- _ -> baseH ++ "/"
- updateState $ \st -> st{ baseHref = Just baseH' }
- return mempty
+ updateState $ \st -> st{ baseHref =
+ parseURIReference $ fromAttrib "href" bt }
+ return mempty
block :: TagParser Blocks
block = do
@@ -441,6 +439,7 @@ pTable = try $ do
-- fail on empty table
guard $ not $ null head' && null rows
let isSinglePlain x = case B.toList x of
+ [] -> True
[Plain _] -> True
_ -> False
let isSimple = all isSinglePlain $ concat (head':rows)
@@ -605,9 +604,9 @@ pLink = try $ do
tag <- pSatisfy $ tagOpenLit "a" (const True)
mbBaseHref <- baseHref <$> getState
let url' = fromAttrib "href" tag
- let url = case (isURI url', mbBaseHref) of
- (False, Just h) -> h ++ url'
- _ -> url'
+ let url = case (parseURIReference url', mbBaseHref) of
+ (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs)
+ _ -> url'
let title = fromAttrib "title" tag
let uid = fromAttrib "id" tag
let cls = words $ fromAttrib "class" tag
@@ -619,9 +618,9 @@ pImage = do
tag <- pSelfClosing (=="img") (isJust . lookup "src")
mbBaseHref <- baseHref <$> getState
let url' = fromAttrib "src" tag
- let url = case (isURI url', mbBaseHref) of
- (False, Just h) -> h ++ url'
- _ -> url'
+ let url = case (parseURIReference url', mbBaseHref) of
+ (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs)
+ _ -> url'
let title = fromAttrib "title" tag
let alt = fromAttrib "alt" tag
let uid = fromAttrib "id" tag
@@ -925,14 +924,45 @@ htmlInBalanced :: (Monad m)
=> (Tag String -> Bool)
-> ParserT String st m String
htmlInBalanced f = try $ do
- (TagOpen t _, tag) <- htmlTag f
- guard $ not $ "/>" `isSuffixOf` tag -- not a self-closing tag
- let stopper = htmlTag (~== TagClose t)
- let anytag = snd <$> htmlTag (const True)
- contents <- many $ notFollowedBy' stopper >>
- (htmlInBalanced f <|> anytag <|> count 1 anyChar)
- endtag <- liftM snd stopper
- return $ tag ++ concat contents ++ endtag
+ lookAhead (char '<')
+ inp <- getInput
+ let ts = canonicalizeTags $
+ parseTagsOptions parseOptions{ optTagWarning = True,
+ optTagPosition = True } inp
+ case ts of
+ (TagPosition sr sc : t@(TagOpen tn _) : rest) -> do
+ guard $ f t
+ guard $ not $ hasTagWarning (t : take 1 rest)
+ case htmlInBalanced' tn (t:rest) of
+ [] -> mzero
+ xs -> case reverse xs of
+ (TagClose _ : TagPosition er ec : _) -> do
+ let ls = er - sr
+ let cs = ec - sc
+ lscontents <- concat <$> count ls anyLine
+ cscontents <- count cs anyChar
+ (_,closetag) <- htmlTag (~== TagClose tn)
+ return (lscontents ++ cscontents ++ closetag)
+ _ -> mzero
+ _ -> mzero
+
+htmlInBalanced' :: String
+ -> [Tag String]
+ -> [Tag String]
+htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts
+ where go :: Int -> [Tag String] -> Maybe [Tag String]
+ go n (t@(TagOpen tn' _):rest) | tn' == tagname =
+ (t :) <$> go (n + 1) rest
+ go 1 (t@(TagClose tn'):_) | tn' == tagname =
+ return [t]
+ go n (t@(TagClose tn'):rest) | tn' == tagname =
+ (t :) <$> go (n - 1) rest
+ go n (t:ts') = (t :) <$> go n ts'
+ go _ [] = mzero
+
+hasTagWarning :: [Tag String] -> Bool
+hasTagWarning (TagWarning _:_) = True
+hasTagWarning _ = False
-- | Matches a tag meeting a certain condition.
htmlTag :: Monad m
@@ -941,8 +971,6 @@ htmlTag :: Monad m
htmlTag f = try $ do
lookAhead (char '<')
inp <- getInput
- let hasTagWarning (TagWarning _:_) = True
- hasTagWarning _ = False
let (next : rest) = canonicalizeTags $ parseTagsOptions
parseOptions{ optTagWarning = True } inp
guard $ f next