aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs28
-rw-r--r--src/Text/Pandoc/Readers/HTML/Parsing.hs20
2 files changed, 24 insertions, 24 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index eb78979a3..f870a241d 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -74,7 +74,7 @@ readHtml :: PandocMonad m
-> Text -- ^ String to parse (assumes @'\n'@ line endings)
-> m Pandoc
readHtml opts inp = do
- let tags = stripPrefixes . canonicalizeTags $
+ let tags = stripPrefixes $ canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True }
(crFilter inp)
parseDoc = do
@@ -95,6 +95,15 @@ readHtml opts inp = do
Right doc -> return doc
Left err -> throwError $ PandocParseError $ T.pack $ getError err
+-- Strip namespace prefixes on tags (not attributes)
+stripPrefixes :: [Tag Text] -> [Tag Text]
+stripPrefixes = map stripPrefix
+
+stripPrefix :: Tag Text -> Tag Text
+stripPrefix (TagOpen s as) = TagOpen (T.takeWhileEnd (/=':') s) as
+stripPrefix (TagClose s) = TagClose (T.takeWhileEnd (/=':') s)
+stripPrefix x = x
+
replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block]
replaceNotes bs = do
st <- getState
@@ -114,7 +123,7 @@ setInPlain = local (\s -> s {inPlain = True})
pHtml :: PandocMonad m => TagParser m Blocks
pHtml = try $ do
(TagOpen "html" attr) <- lookAhead pAny
- for_ (lookup "lang" attr) $
+ for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $
updateState . B.setMeta "lang" . B.text
pInTags "html" block
@@ -1024,21 +1033,6 @@ htmlTag f = try $ do
handleTag tagname
_ -> mzero
--- Strip namespace prefixes
-stripPrefixes :: [Tag Text] -> [Tag Text]
-stripPrefixes = map stripPrefix
-
-stripPrefix :: Tag Text -> Tag Text
-stripPrefix (TagOpen s as) =
- TagOpen (stripPrefix' s) (map (first stripPrefix') as)
-stripPrefix (TagClose s) = TagClose (stripPrefix' s)
-stripPrefix x = x
-
-stripPrefix' :: Text -> Text
-stripPrefix' s =
- if T.null t then s else T.drop 1 t
- where (_, t) = T.span (/= ':') s
-
-- Utilities
-- | Adjusts a url according to the document's base URL.
diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs
index 2d58319da..e28ebe77b 100644
--- a/src/Text/Pandoc/Readers/HTML/Parsing.hs
+++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs
@@ -193,14 +193,20 @@ t1 `closes` t2 |
_ `closes` _ = False
toStringAttr :: [(Text, Text)] -> [(Text, Text)]
-toStringAttr = map go
+toStringAttr = foldr go []
where
- go (x,y) =
- case T.stripPrefix "data-" x of
- Just x' | x' `Set.notMember` (html5Attributes <>
- html4Attributes <> rdfaAttributes)
- -> (x',y)
- _ -> (x,y)
+ go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)]
+ -- treat xml:lang as lang
+ go ("xml:lang",y) ats = go ("lang",y) ats
+ -- prevent duplicate attributes
+ go (x,y) ats
+ | any (\(x',_) -> x == x') ats = ats
+ | otherwise =
+ case T.stripPrefix "data-" x of
+ Just x' | x' `Set.notMember` (html5Attributes <>
+ html4Attributes <> rdfaAttributes)
+ -> go (x',y) ats
+ _ -> (x,y):ats
-- Unlike fromAttrib from tagsoup, this distinguishes
-- between a missing attribute and an attribute with empty content.