aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2015-05-13 20:39:01 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2015-05-13 20:53:19 -0700
commite06810499e69b23f08fc1f8fb7e774e7b11be713 (patch)
tree917f1d47d7b0f3f6ee8987455273dce4ed7ca2c4
parent75cfa7b4624bfbf06c8a47761620bd756ddab254 (diff)
downloadpandoc-e06810499e69b23f08fc1f8fb7e774e7b11be713.tar.gz
HTML reader: Support base tag.
We only support the href attribute, as there's no place for "target" in the Pandoc document model for links. Added HTML reader test module, with tests for this feature. Closes #1751.
-rw-r--r--pandoc.cabal1
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs35
-rw-r--r--tests/Tests/Readers/HTML.hs27
-rw-r--r--tests/test-pandoc.hs2
4 files changed, 58 insertions, 7 deletions
diff --git a/pandoc.cabal b/pandoc.cabal
index df3ddd92c..df402e360 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -481,6 +481,7 @@ Test-Suite test-pandoc
Tests.Shared
Tests.Walk
Tests.Readers.LaTeX
+ Tests.Readers.HTML
Tests.Readers.Markdown
Tests.Readers.Org
Tests.Readers.RST
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index df2d37fac..c766bb4ee 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -62,7 +62,7 @@ import Debug.Trace (trace)
import Text.TeXMath (readMathML, writeTeX)
import Data.Default (Default (..), def)
import Control.Monad.Reader (Reader,ask, asks, local, runReader)
-
+import Network.URI (isURI)
import Text.Pandoc.Error
import Text.Parsec.Error
@@ -74,7 +74,8 @@ readHtml :: ReaderOptions -- ^ Reader options
-> Either PandocError Pandoc
readHtml opts inp =
mapLeft (ParseFailure . getError) . flip runReader def $
- runParserT parseDoc (HTMLState def{ stateOptions = opts } []) "source" tags
+ runParserT parseDoc (HTMLState def{ stateOptions = opts } [] Nothing)
+ "source" tags
where tags = stripPrefixes . canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True } inp
parseDoc = do
@@ -98,7 +99,8 @@ replaceNotes' x = return x
data HTMLState =
HTMLState
{ parserState :: ParserState,
- noteTable :: [(String, Blocks)]
+ noteTable :: [(String, Blocks)],
+ baseHref :: Maybe String
}
data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
@@ -120,7 +122,7 @@ pBody :: TagParser Blocks
pBody = pInTags "body" block
pHead :: TagParser Blocks
-pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag)
+pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag)
where pTitle = pInTags "title" inline >>= setTitle . trimInlines
setTitle t = mempty <$ (updateState $ B.setMeta "title" t)
pMetaTag = do
@@ -132,6 +134,17 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag)
let content = fromAttrib "content" mt
updateState $ B.setMeta name (B.text content)
return mempty
+ pBaseTag = do
+ bt <- pSatisfy (~== TagOpen "base" [])
+ let baseH = fromAttrib "href" bt
+ if null baseH
+ then return mempty
+ else do
+ let baseH' = case reverse baseH of
+ '/':_ -> baseH
+ _ -> baseH ++ "/"
+ updateState $ \st -> st{ baseHref = Just baseH' }
+ return mempty
block :: TagParser Blocks
block = do
@@ -566,7 +579,11 @@ pAnchor = try $ do
pRelLink :: TagParser Inlines
pRelLink = try $ do
tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href"))
- let url = fromAttrib "href" tag
+ mbBaseHref <- baseHref <$> getState
+ let url' = fromAttrib "href" tag
+ let url = case (isURI url', mbBaseHref) of
+ (False, Just h) -> h ++ url'
+ _ -> url'
let title = fromAttrib "title" tag
let uid = fromAttrib "id" tag
let spanC = case uid of
@@ -578,7 +595,11 @@ pRelLink = try $ do
pImage :: TagParser Inlines
pImage = do
tag <- pSelfClosing (=="img") (isJust . lookup "src")
- let url = fromAttrib "src" tag
+ mbBaseHref <- baseHref <$> getState
+ let url' = fromAttrib "src" tag
+ let url = case (isURI url', mbBaseHref) of
+ (False, Just h) -> h ++ url'
+ _ -> url'
let title = fromAttrib "title" tag
let alt = fromAttrib "alt" tag
return $ B.image (escapeURI url) title (B.text alt)
@@ -945,7 +966,7 @@ instance HasReaderOptions HTMLState where
extractReaderOptions = extractReaderOptions . parserState
instance Default HTMLState where
- def = HTMLState def []
+ def = HTMLState def [] Nothing
instance HasMeta HTMLState where
setMeta s b st = st {parserState = setMeta s b $ parserState st}
diff --git a/tests/Tests/Readers/HTML.hs b/tests/Tests/Readers/HTML.hs
new file mode 100644
index 000000000..2eb87a2f3
--- /dev/null
+++ b/tests/Tests/Readers/HTML.hs
@@ -0,0 +1,27 @@
+{-# LANGUAGE OverloadedStrings #-}
+module Tests.Readers.HTML (tests) where
+
+import Text.Pandoc.Definition
+import Test.Framework
+import Tests.Helpers
+import Tests.Arbitrary()
+import Text.Pandoc.Builder
+import Text.Pandoc
+import Text.Pandoc.Error
+
+html :: String -> Pandoc
+html = handleError . readHtml def
+
+tests :: [Test]
+tests = [ testGroup "base tag"
+ [ test html "simple" $
+ "<head><base href=\"http://www.w3schools.com/images\" ></head><body><img src=\"stickman.gif\" alt=\"Stickman\"></head>" =?>
+ plain (image "http://www.w3schools.com/images/stickman.gif" "" (text "Stickman"))
+ , test html "slash at end of base" $
+ "<head><base href=\"http://www.w3schools.com/images/\" ></head><body><img src=\"stickman.gif\" alt=\"Stickman\"></head>" =?>
+ plain (image "http://www.w3schools.com/images/stickman.gif" "" (text "Stickman"))
+ , test html "absolute URL" $
+ "<head><base href=\"http://www.w3schools.com/images/\" ></head><body><img src=\"http://example.com/stickman.gif\" alt=\"Stickman\"></head>" =?>
+ plain (image "http://example.com/stickman.gif" "" (text "Stickman"))
+ ]
+ ]
diff --git a/tests/test-pandoc.hs b/tests/test-pandoc.hs
index 805bad414..9bc26416f 100644
--- a/tests/test-pandoc.hs
+++ b/tests/test-pandoc.hs
@@ -8,6 +8,7 @@ import qualified Tests.Old
import qualified Tests.Readers.LaTeX
import qualified Tests.Readers.Markdown
import qualified Tests.Readers.Org
+import qualified Tests.Readers.HTML
import qualified Tests.Readers.RST
import qualified Tests.Readers.Docx
import qualified Tests.Readers.Txt2Tags
@@ -46,6 +47,7 @@ tests = [ testGroup "Old" Tests.Old.tests
, testGroup "Readers"
[ testGroup "LaTeX" Tests.Readers.LaTeX.tests
, testGroup "Markdown" Tests.Readers.Markdown.tests
+ , testGroup "HTML" Tests.Readers.HTML.tests
, testGroup "Org" Tests.Readers.Org.tests
, testGroup "RST" Tests.Readers.RST.tests
, testGroup "Docx" Tests.Readers.Docx.tests