From e06810499e69b23f08fc1f8fb7e774e7b11be713 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Wed, 13 May 2015 20:39:01 -0700 Subject: HTML reader: Support base tag. We only support the href attribute, as there's no place for "target" in the Pandoc document model for links. Added HTML reader test module, with tests for this feature. Closes #1751. --- pandoc.cabal | 1 + src/Text/Pandoc/Readers/HTML.hs | 35 ++++++++++++++++++++++++++++------- tests/Tests/Readers/HTML.hs | 27 +++++++++++++++++++++++++++ tests/test-pandoc.hs | 2 ++ 4 files changed, 58 insertions(+), 7 deletions(-) create mode 100644 tests/Tests/Readers/HTML.hs diff --git a/pandoc.cabal b/pandoc.cabal index df3ddd92c..df402e360 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -481,6 +481,7 @@ Test-Suite test-pandoc Tests.Shared Tests.Walk Tests.Readers.LaTeX + Tests.Readers.HTML Tests.Readers.Markdown Tests.Readers.Org Tests.Readers.RST diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index df2d37fac..c766bb4ee 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -62,7 +62,7 @@ import Debug.Trace (trace) import Text.TeXMath (readMathML, writeTeX) import Data.Default (Default (..), def) import Control.Monad.Reader (Reader,ask, asks, local, runReader) - +import Network.URI (isURI) import Text.Pandoc.Error import Text.Parsec.Error @@ -74,7 +74,8 @@ readHtml :: ReaderOptions -- ^ Reader options -> Either PandocError Pandoc readHtml opts inp = mapLeft (ParseFailure . getError) . flip runReader def $ - runParserT parseDoc (HTMLState def{ stateOptions = opts } []) "source" tags + runParserT parseDoc (HTMLState def{ stateOptions = opts } [] Nothing) + "source" tags where tags = stripPrefixes . canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } inp parseDoc = do @@ -98,7 +99,8 @@ replaceNotes' x = return x data HTMLState = HTMLState { parserState :: ParserState, - noteTable :: [(String, Blocks)] + noteTable :: [(String, Blocks)], + baseHref :: Maybe String } data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext @@ -120,7 +122,7 @@ pBody :: TagParser Blocks pBody = pInTags "body" block pHead :: TagParser Blocks -pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag) +pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag) where pTitle = pInTags "title" inline >>= setTitle . trimInlines setTitle t = mempty <$ (updateState $ B.setMeta "title" t) pMetaTag = do @@ -132,6 +134,17 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag) let content = fromAttrib "content" mt updateState $ B.setMeta name (B.text content) return mempty + pBaseTag = do + bt <- pSatisfy (~== TagOpen "base" []) + let baseH = fromAttrib "href" bt + if null baseH + then return mempty + else do + let baseH' = case reverse baseH of + '/':_ -> baseH + _ -> baseH ++ "/" + updateState $ \st -> st{ baseHref = Just baseH' } + return mempty block :: TagParser Blocks block = do @@ -566,7 +579,11 @@ pAnchor = try $ do pRelLink :: TagParser Inlines pRelLink = try $ do tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href")) - let url = fromAttrib "href" tag + mbBaseHref <- baseHref <$> getState + let url' = fromAttrib "href" tag + let url = case (isURI url', mbBaseHref) of + (False, Just h) -> h ++ url' + _ -> url' let title = fromAttrib "title" tag let uid = fromAttrib "id" tag let spanC = case uid of @@ -578,7 +595,11 @@ pRelLink = try $ do pImage :: TagParser Inlines pImage = do tag <- pSelfClosing (=="img") (isJust . lookup "src") - let url = fromAttrib "src" tag + mbBaseHref <- baseHref <$> getState + let url' = fromAttrib "src" tag + let url = case (isURI url', mbBaseHref) of + (False, Just h) -> h ++ url' + _ -> url' let title = fromAttrib "title" tag let alt = fromAttrib "alt" tag return $ B.image (escapeURI url) title (B.text alt) @@ -945,7 +966,7 @@ instance HasReaderOptions HTMLState where extractReaderOptions = extractReaderOptions . parserState instance Default HTMLState where - def = HTMLState def [] + def = HTMLState def [] Nothing instance HasMeta HTMLState where setMeta s b st = st {parserState = setMeta s b $ parserState st} diff --git a/tests/Tests/Readers/HTML.hs b/tests/Tests/Readers/HTML.hs new file mode 100644 index 000000000..2eb87a2f3 --- /dev/null +++ b/tests/Tests/Readers/HTML.hs @@ -0,0 +1,27 @@ +{-# LANGUAGE OverloadedStrings #-} +module Tests.Readers.HTML (tests) where + +import Text.Pandoc.Definition +import Test.Framework +import Tests.Helpers +import Tests.Arbitrary() +import Text.Pandoc.Builder +import Text.Pandoc +import Text.Pandoc.Error + +html :: String -> Pandoc +html = handleError . readHtml def + +tests :: [Test] +tests = [ testGroup "base tag" + [ test html "simple" $ + "\"Stickman\"" =?> + plain (image "http://www.w3schools.com/images/stickman.gif" "" (text "Stickman")) + , test html "slash at end of base" $ + "\"Stickman\"" =?> + plain (image "http://www.w3schools.com/images/stickman.gif" "" (text "Stickman")) + , test html "absolute URL" $ + "\"Stickman\"" =?> + plain (image "http://example.com/stickman.gif" "" (text "Stickman")) + ] + ] diff --git a/tests/test-pandoc.hs b/tests/test-pandoc.hs index 805bad414..9bc26416f 100644 --- a/tests/test-pandoc.hs +++ b/tests/test-pandoc.hs @@ -8,6 +8,7 @@ import qualified Tests.Old import qualified Tests.Readers.LaTeX import qualified Tests.Readers.Markdown import qualified Tests.Readers.Org +import qualified Tests.Readers.HTML import qualified Tests.Readers.RST import qualified Tests.Readers.Docx import qualified Tests.Readers.Txt2Tags @@ -46,6 +47,7 @@ tests = [ testGroup "Old" Tests.Old.tests , testGroup "Readers" [ testGroup "LaTeX" Tests.Readers.LaTeX.tests , testGroup "Markdown" Tests.Readers.Markdown.tests + , testGroup "HTML" Tests.Readers.HTML.tests , testGroup "Org" Tests.Readers.Org.tests , testGroup "RST" Tests.Readers.RST.tests , testGroup "Docx" Tests.Readers.Docx.tests -- cgit v1.2.3