aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--pandoc.cabal1
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs35
-rw-r--r--tests/Tests/Readers/HTML.hs27
-rw-r--r--tests/test-pandoc.hs2
4 files changed, 58 insertions, 7 deletions
diff --git a/pandoc.cabal b/pandoc.cabal
index df3ddd92c..df402e360 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -481,6 +481,7 @@ Test-Suite test-pandoc
Tests.Shared
Tests.Walk
Tests.Readers.LaTeX
+ Tests.Readers.HTML
Tests.Readers.Markdown
Tests.Readers.Org
Tests.Readers.RST
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index df2d37fac..c766bb4ee 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -62,7 +62,7 @@ import Debug.Trace (trace)
import Text.TeXMath (readMathML, writeTeX)
import Data.Default (Default (..), def)
import Control.Monad.Reader (Reader,ask, asks, local, runReader)
-
+import Network.URI (isURI)
import Text.Pandoc.Error
import Text.Parsec.Error
@@ -74,7 +74,8 @@ readHtml :: ReaderOptions -- ^ Reader options
-> Either PandocError Pandoc
readHtml opts inp =
mapLeft (ParseFailure . getError) . flip runReader def $
- runParserT parseDoc (HTMLState def{ stateOptions = opts } []) "source" tags
+ runParserT parseDoc (HTMLState def{ stateOptions = opts } [] Nothing)
+ "source" tags
where tags = stripPrefixes . canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True } inp
parseDoc = do
@@ -98,7 +99,8 @@ replaceNotes' x = return x
data HTMLState =
HTMLState
{ parserState :: ParserState,
- noteTable :: [(String, Blocks)]
+ noteTable :: [(String, Blocks)],
+ baseHref :: Maybe String
}
data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
@@ -120,7 +122,7 @@ pBody :: TagParser Blocks
pBody = pInTags "body" block
pHead :: TagParser Blocks
-pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag)
+pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag)
where pTitle = pInTags "title" inline >>= setTitle . trimInlines
setTitle t = mempty <$ (updateState $ B.setMeta "title" t)
pMetaTag = do
@@ -132,6 +134,17 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag)
let content = fromAttrib "content" mt
updateState $ B.setMeta name (B.text content)
return mempty
+ pBaseTag = do
+ bt <- pSatisfy (~== TagOpen "base" [])
+ let baseH = fromAttrib "href" bt
+ if null baseH
+ then return mempty
+ else do
+ let baseH' = case reverse baseH of
+ '/':_ -> baseH
+ _ -> baseH ++ "/"
+ updateState $ \st -> st{ baseHref = Just baseH' }
+ return mempty
block :: TagParser Blocks
block = do
@@ -566,7 +579,11 @@ pAnchor = try $ do
pRelLink :: TagParser Inlines
pRelLink = try $ do
tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href"))
- let url = fromAttrib "href" tag
+ mbBaseHref <- baseHref <$> getState
+ let url' = fromAttrib "href" tag
+ let url = case (isURI url', mbBaseHref) of
+ (False, Just h) -> h ++ url'
+ _ -> url'
let title = fromAttrib "title" tag
let uid = fromAttrib "id" tag
let spanC = case uid of
@@ -578,7 +595,11 @@ pRelLink = try $ do
pImage :: TagParser Inlines
pImage = do
tag <- pSelfClosing (=="img") (isJust . lookup "src")
- let url = fromAttrib "src" tag
+ mbBaseHref <- baseHref <$> getState
+ let url' = fromAttrib "src" tag
+ let url = case (isURI url', mbBaseHref) of
+ (False, Just h) -> h ++ url'
+ _ -> url'
let title = fromAttrib "title" tag
let alt = fromAttrib "alt" tag
return $ B.image (escapeURI url) title (B.text alt)
@@ -945,7 +966,7 @@ instance HasReaderOptions HTMLState where
extractReaderOptions = extractReaderOptions . parserState
instance Default HTMLState where
- def = HTMLState def []
+ def = HTMLState def [] Nothing
instance HasMeta HTMLState where
setMeta s b st = st {parserState = setMeta s b $ parserState st}
diff --git a/tests/Tests/Readers/HTML.hs b/tests/Tests/Readers/HTML.hs
new file mode 100644
index 000000000..2eb87a2f3
--- /dev/null
+++ b/tests/Tests/Readers/HTML.hs
@@ -0,0 +1,27 @@
+{-# LANGUAGE OverloadedStrings #-}
+module Tests.Readers.HTML (tests) where
+
+import Text.Pandoc.Definition
+import Test.Framework
+import Tests.Helpers
+import Tests.Arbitrary()
+import Text.Pandoc.Builder
+import Text.Pandoc
+import Text.Pandoc.Error
+
+html :: String -> Pandoc
+html = handleError . readHtml def
+
+tests :: [Test]
+tests = [ testGroup "base tag"
+ [ test html "simple" $
+ "<head><base href=\"http://www.w3schools.com/images\" ></head><body><img src=\"stickman.gif\" alt=\"Stickman\"></head>" =?>
+ plain (image "http://www.w3schools.com/images/stickman.gif" "" (text "Stickman"))
+ , test html "slash at end of base" $
+ "<head><base href=\"http://www.w3schools.com/images/\" ></head><body><img src=\"stickman.gif\" alt=\"Stickman\"></head>" =?>
+ plain (image "http://www.w3schools.com/images/stickman.gif" "" (text "Stickman"))
+ , test html "absolute URL" $
+ "<head><base href=\"http://www.w3schools.com/images/\" ></head><body><img src=\"http://example.com/stickman.gif\" alt=\"Stickman\"></head>" =?>
+ plain (image "http://example.com/stickman.gif" "" (text "Stickman"))
+ ]
+ ]
diff --git a/tests/test-pandoc.hs b/tests/test-pandoc.hs
index 805bad414..9bc26416f 100644
--- a/tests/test-pandoc.hs
+++ b/tests/test-pandoc.hs
@@ -8,6 +8,7 @@ import qualified Tests.Old
import qualified Tests.Readers.LaTeX
import qualified Tests.Readers.Markdown
import qualified Tests.Readers.Org
+import qualified Tests.Readers.HTML
import qualified Tests.Readers.RST
import qualified Tests.Readers.Docx
import qualified Tests.Readers.Txt2Tags
@@ -46,6 +47,7 @@ tests = [ testGroup "Old" Tests.Old.tests
, testGroup "Readers"
[ testGroup "LaTeX" Tests.Readers.LaTeX.tests
, testGroup "Markdown" Tests.Readers.Markdown.tests
+ , testGroup "HTML" Tests.Readers.HTML.tests
, testGroup "Org" Tests.Readers.Org.tests
, testGroup "RST" Tests.Readers.RST.tests
, testGroup "Docx" Tests.Readers.Docx.tests