diff options
-rw-r--r-- | pandoc.cabal | 1 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 35 | ||||
-rw-r--r-- | tests/Tests/Readers/HTML.hs | 27 | ||||
-rw-r--r-- | tests/test-pandoc.hs | 2 |
4 files changed, 58 insertions, 7 deletions
diff --git a/pandoc.cabal b/pandoc.cabal index df3ddd92c..df402e360 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -481,6 +481,7 @@ Test-Suite test-pandoc Tests.Shared Tests.Walk Tests.Readers.LaTeX + Tests.Readers.HTML Tests.Readers.Markdown Tests.Readers.Org Tests.Readers.RST diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index df2d37fac..c766bb4ee 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -62,7 +62,7 @@ import Debug.Trace (trace) import Text.TeXMath (readMathML, writeTeX) import Data.Default (Default (..), def) import Control.Monad.Reader (Reader,ask, asks, local, runReader) - +import Network.URI (isURI) import Text.Pandoc.Error import Text.Parsec.Error @@ -74,7 +74,8 @@ readHtml :: ReaderOptions -- ^ Reader options -> Either PandocError Pandoc readHtml opts inp = mapLeft (ParseFailure . getError) . flip runReader def $ - runParserT parseDoc (HTMLState def{ stateOptions = opts } []) "source" tags + runParserT parseDoc (HTMLState def{ stateOptions = opts } [] Nothing) + "source" tags where tags = stripPrefixes . canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } inp parseDoc = do @@ -98,7 +99,8 @@ replaceNotes' x = return x data HTMLState = HTMLState { parserState :: ParserState, - noteTable :: [(String, Blocks)] + noteTable :: [(String, Blocks)], + baseHref :: Maybe String } data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext @@ -120,7 +122,7 @@ pBody :: TagParser Blocks pBody = pInTags "body" block pHead :: TagParser Blocks -pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag) +pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag) where pTitle = pInTags "title" inline >>= setTitle . trimInlines setTitle t = mempty <$ (updateState $ B.setMeta "title" t) pMetaTag = do @@ -132,6 +134,17 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag) let content = fromAttrib "content" mt updateState $ B.setMeta name (B.text content) return mempty + pBaseTag = do + bt <- pSatisfy (~== TagOpen "base" []) + let baseH = fromAttrib "href" bt + if null baseH + then return mempty + else do + let baseH' = case reverse baseH of + '/':_ -> baseH + _ -> baseH ++ "/" + updateState $ \st -> st{ baseHref = Just baseH' } + return mempty block :: TagParser Blocks block = do @@ -566,7 +579,11 @@ pAnchor = try $ do pRelLink :: TagParser Inlines pRelLink = try $ do tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href")) - let url = fromAttrib "href" tag + mbBaseHref <- baseHref <$> getState + let url' = fromAttrib "href" tag + let url = case (isURI url', mbBaseHref) of + (False, Just h) -> h ++ url' + _ -> url' let title = fromAttrib "title" tag let uid = fromAttrib "id" tag let spanC = case uid of @@ -578,7 +595,11 @@ pRelLink = try $ do pImage :: TagParser Inlines pImage = do tag <- pSelfClosing (=="img") (isJust . lookup "src") - let url = fromAttrib "src" tag + mbBaseHref <- baseHref <$> getState + let url' = fromAttrib "src" tag + let url = case (isURI url', mbBaseHref) of + (False, Just h) -> h ++ url' + _ -> url' let title = fromAttrib "title" tag let alt = fromAttrib "alt" tag return $ B.image (escapeURI url) title (B.text alt) @@ -945,7 +966,7 @@ instance HasReaderOptions HTMLState where extractReaderOptions = extractReaderOptions . parserState instance Default HTMLState where - def = HTMLState def [] + def = HTMLState def [] Nothing instance HasMeta HTMLState where setMeta s b st = st {parserState = setMeta s b $ parserState st} diff --git a/tests/Tests/Readers/HTML.hs b/tests/Tests/Readers/HTML.hs new file mode 100644 index 000000000..2eb87a2f3 --- /dev/null +++ b/tests/Tests/Readers/HTML.hs @@ -0,0 +1,27 @@ +{-# LANGUAGE OverloadedStrings #-} +module Tests.Readers.HTML (tests) where + +import Text.Pandoc.Definition +import Test.Framework +import Tests.Helpers +import Tests.Arbitrary() +import Text.Pandoc.Builder +import Text.Pandoc +import Text.Pandoc.Error + +html :: String -> Pandoc +html = handleError . readHtml def + +tests :: [Test] +tests = [ testGroup "base tag" + [ test html "simple" $ + "<head><base href=\"http://www.w3schools.com/images\" ></head><body><img src=\"stickman.gif\" alt=\"Stickman\"></head>" =?> + plain (image "http://www.w3schools.com/images/stickman.gif" "" (text "Stickman")) + , test html "slash at end of base" $ + "<head><base href=\"http://www.w3schools.com/images/\" ></head><body><img src=\"stickman.gif\" alt=\"Stickman\"></head>" =?> + plain (image "http://www.w3schools.com/images/stickman.gif" "" (text "Stickman")) + , test html "absolute URL" $ + "<head><base href=\"http://www.w3schools.com/images/\" ></head><body><img src=\"http://example.com/stickman.gif\" alt=\"Stickman\"></head>" =?> + plain (image "http://example.com/stickman.gif" "" (text "Stickman")) + ] + ] diff --git a/tests/test-pandoc.hs b/tests/test-pandoc.hs index 805bad414..9bc26416f 100644 --- a/tests/test-pandoc.hs +++ b/tests/test-pandoc.hs @@ -8,6 +8,7 @@ import qualified Tests.Old import qualified Tests.Readers.LaTeX import qualified Tests.Readers.Markdown import qualified Tests.Readers.Org +import qualified Tests.Readers.HTML import qualified Tests.Readers.RST import qualified Tests.Readers.Docx import qualified Tests.Readers.Txt2Tags @@ -46,6 +47,7 @@ tests = [ testGroup "Old" Tests.Old.tests , testGroup "Readers" [ testGroup "LaTeX" Tests.Readers.LaTeX.tests , testGroup "Markdown" Tests.Readers.Markdown.tests + , testGroup "HTML" Tests.Readers.HTML.tests , testGroup "Org" Tests.Readers.Org.tests , testGroup "RST" Tests.Readers.RST.tests , testGroup "Docx" Tests.Readers.Docx.tests |