aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2021-02-16 18:40:06 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2021-02-16 18:40:06 -0800
commitd7a4996b1e7e201095ce792375a0776984fa8fcd (patch)
tree2d9fe829471bd4217a3e92b72d1c713abe77f274
parent967e7f5fb990b29de48b37be1db40fb149a8cf55 (diff)
downloadpandoc-d7a4996b1e7e201095ce792375a0776984fa8fcd.tar.gz
Split up T.P.XML.Light into submodules.
-rw-r--r--pandoc.cabal3
-rw-r--r--src/Text/Pandoc/XML/Light.hs511
-rw-r--r--src/Text/Pandoc/XML/Light/Output.hs230
-rw-r--r--src/Text/Pandoc/XML/Light/Proc.hs138
-rw-r--r--src/Text/Pandoc/XML/Light/Types.hs190
5 files changed, 568 insertions, 504 deletions
diff --git a/pandoc.cabal b/pandoc.cabal
index 22aebd55e..d27520ba0 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -688,6 +688,9 @@ library
Text.Pandoc.Lua.Util,
Text.Pandoc.Lua.Walk,
Text.Pandoc.XML.Light,
+ Text.Pandoc.XML.Light.Types,
+ Text.Pandoc.XML.Light.Proc,
+ Text.Pandoc.XML.Light.Output,
Text.Pandoc.CSS,
Text.Pandoc.CSV,
Text.Pandoc.RoffChar,
diff --git a/src/Text/Pandoc/XML/Light.hs b/src/Text/Pandoc/XML/Light.hs
index 38e4df218..07113ea92 100644
--- a/src/Text/Pandoc/XML/Light.hs
+++ b/src/Text/Pandoc/XML/Light.hs
@@ -1,5 +1,4 @@
{-# LANGUAGE FlexibleInstances #-}
-{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.XML.Light
@@ -31,59 +30,9 @@ better performance and accuracy without much change in the
code that used xml-light.
-}
module Text.Pandoc.XML.Light
- ( -- * Basic types, duplicating those from xml-light but with Text
- -- instead of String
- Line
- , Content(..)
- , Element(..)
- , Attr(..)
- , CData(..)
- , CDataKind(..)
- , QName(..)
- , Node(..)
- , unode
- , unqual
- , add_attr
- , add_attrs
- -- * Conversion functions from xml-light types
- , fromXLQName
- , fromXLCData
- , fromXLElement
- , fromXLAttr
- , fromXLContent
- -- * Replacement for xml-light's Text.XML.Proc
- , strContent
- , onlyElems
- , elChildren
- , onlyText
- , findChildren
- , filterChildren
- , filterChildrenName
- , findChild
- , filterChild
- , filterChildName
- , findElement
- , filterElement
- , filterElementName
- , findElements
- , filterElements
- , filterElementsName
- , findAttr
- , lookupAttr
- , lookupAttrBy
- , findAttrBy
- -- * Replacement for xml-light's Text.XML.Output
- , ppTopElement
- , ppElement
- , ppContent
- , ppcElement
- , ppcContent
- , showTopElement
- , showElement
- , showContent
- , useShortEmptyTags
- , defaultConfigPP
- , ConfigPP(..)
+ ( module Text.Pandoc.XML.Light.Types
+ , module Text.Pandoc.XML.Light.Proc
+ , module Text.Pandoc.XML.Light.Output
-- * Replacement for xml-light's Text.XML.Input
, parseXMLElement
, parseXMLContents
@@ -92,16 +41,13 @@ module Text.Pandoc.XML.Light
import qualified Control.Exception as E
import qualified Text.XML as Conduit
import Text.XML.Unresolved (InvalidEventStream(..))
-import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
-import Data.Text.Lazy.Builder (Builder, singleton, fromText, toLazyText)
import qualified Data.Map as M
-import Data.Data (Data)
-import Data.Typeable (Typeable)
-import Data.Maybe (mapMaybe, listToMaybe)
-import Data.List(find)
-import qualified Text.XML.Light as XL
+import Data.Maybe (mapMaybe)
+import Text.Pandoc.XML.Light.Types
+import Text.Pandoc.XML.Light.Proc
+import Text.Pandoc.XML.Light.Output
-- Drop in replacement for parseXMLDoc in xml-light.
parseXMLElement :: TL.Text -> Either T.Text Element
@@ -141,446 +87,3 @@ nodeToContent (Conduit.NodeContent t) =
Just (Text (CData CDataText t Nothing))
nodeToContent _ = Nothing
-unqual :: Text -> QName
-unqual x = QName x Nothing Nothing
-
--- | Add an attribute to an element.
-add_attr :: Attr -> Element -> Element
-add_attr a e = add_attrs [a] e
-
--- | Add some attributes to an element.
-add_attrs :: [Attr] -> Element -> Element
-add_attrs as e = e { elAttribs = as ++ elAttribs e }
-
---
--- type definitions lightly modified from xml-light
---
-
--- | A line is an Integer
-type Line = Integer
-
--- | XML content
-data Content = Elem Element
- | Text CData
- | CRef Text
- deriving (Show, Typeable, Data)
-
--- | XML elements
-data Element = Element {
- elName :: QName,
- elAttribs :: [Attr],
- elContent :: [Content],
- elLine :: Maybe Line
- } deriving (Show, Typeable, Data)
-
--- | XML attributes
-data Attr = Attr {
- attrKey :: QName,
- attrVal :: Text
- } deriving (Eq, Ord, Show, Typeable, Data)
-
--- | XML CData
-data CData = CData {
- cdVerbatim :: CDataKind,
- cdData :: Text,
- cdLine :: Maybe Line
- } deriving (Show, Typeable, Data)
-
-data CDataKind
- = CDataText -- ^ Ordinary character data; pretty printer escapes &, < etc.
- | CDataVerbatim -- ^ Unescaped character data; pretty printer embeds it in <![CDATA[..
- | CDataRaw -- ^ As-is character data; pretty printer passes it along without any escaping or CDATA wrap-up.
- deriving ( Eq, Show, Typeable, Data )
-
--- | XML qualified names
-data QName = QName {
- qName :: Text,
- qURI :: Maybe Text,
- qPrefix :: Maybe Text
- } deriving (Show, Typeable, Data)
-
-
-instance Eq QName where
- q1 == q2 = compare q1 q2 == EQ
-
-instance Ord QName where
- compare q1 q2 =
- case compare (qName q1) (qName q2) of
- EQ -> case (qURI q1, qURI q2) of
- (Nothing,Nothing) -> compare (qPrefix q1) (qPrefix q2)
- (u1,u2) -> compare u1 u2
- x -> x
-
-class Node t where
- node :: QName -> t -> Element
-
-instance Node ([Attr],[Content]) where
- node n (attrs,cont) = Element { elName = n
- , elAttribs = attrs
- , elContent = cont
- , elLine = Nothing
- }
-
-instance Node [Attr] where node n as = node n (as,[]::[Content])
-instance Node Attr where node n a = node n [a]
-instance Node () where node n () = node n ([]::[Attr])
-
-instance Node [Content] where node n cs = node n ([]::[Attr],cs)
-instance Node Content where node n c = node n [c]
-instance Node ([Attr],Content) where node n (as,c) = node n (as,[c])
-instance Node (Attr,Content) where node n (a,c) = node n ([a],[c])
-
-instance Node ([Attr],[Element]) where
- node n (as,cs) = node n (as,map Elem cs)
-
-instance Node ([Attr],Element) where node n (as,c) = node n (as,[c])
-instance Node (Attr,Element) where node n (a,c) = node n ([a],c)
-instance Node [Element] where node n es = node n ([]::[Attr],es)
-instance Node Element where node n e = node n [e]
-
-instance Node ([Attr],[CData]) where
- node n (as,cs) = node n (as,map Text cs)
-
-instance Node ([Attr],CData) where node n (as,c) = node n (as,[c])
-instance Node (Attr,CData) where node n (a,c) = node n ([a],c)
-instance Node [CData] where node n es = node n ([]::[Attr],es)
-instance Node CData where node n e = node n [e]
-
-instance Node ([Attr],Text) where
- node n (as,t) = node n (as, CData { cdVerbatim = CDataText
- , cdData = t
- , cdLine = Nothing })
-
-instance Node (Attr,Text ) where node n (a,t) = node n ([a],t)
-instance Node Text where node n t = node n ([]::[Attr],t)
-
--- | Create node with unqualified name
-unode :: Node t => Text -> t -> Element
-unode = node . unqual
-
---
--- conversion from xml-light
---
-
-fromXLQName :: XL.QName -> QName
-fromXLQName qn = QName { qName = T.pack $ XL.qName qn
- , qURI = T.pack <$> XL.qURI qn
- , qPrefix = T.pack <$> XL.qPrefix qn }
-
-fromXLCData :: XL.CData -> CData
-fromXLCData cd = CData { cdVerbatim = case XL.cdVerbatim cd of
- XL.CDataText -> CDataText
- XL.CDataVerbatim -> CDataVerbatim
- XL.CDataRaw -> CDataRaw
- , cdData = T.pack $ XL.cdData cd
- , cdLine = XL.cdLine cd }
-
-fromXLElement :: XL.Element -> Element
-fromXLElement el = Element { elName = fromXLQName $ XL.elName el
- , elAttribs = map fromXLAttr $ XL.elAttribs el
- , elContent = map fromXLContent $ XL.elContent el
- , elLine = XL.elLine el }
-
-fromXLAttr :: XL.Attr -> Attr
-fromXLAttr (XL.Attr qn s) = Attr (fromXLQName qn) (T.pack s)
-
-fromXLContent :: XL.Content -> Content
-fromXLContent (XL.Elem el) = Elem $ fromXLElement el
-fromXLContent (XL.Text cd) = Text $ fromXLCData cd
-fromXLContent (XL.CRef s) = CRef (T.pack s)
-
---
--- copied from xml-light Text.XML.Proc
---
-
--- | Get the text value of an XML element. This function
--- ignores non-text elements, and concatenates all text elements.
-strContent :: Element -> Text
-strContent = mconcat . map cdData . onlyText . elContent
-
--- | Select only the elements from a list of XML content.
-onlyElems :: [Content] -> [Element]
-onlyElems xs = [ x | Elem x <- xs ]
-
--- | Select only the elements from a parent.
-elChildren :: Element -> [Element]
-elChildren e = [ x | Elem x <- elContent e ]
-
--- | Select only the text from a list of XML content.
-onlyText :: [Content] -> [CData]
-onlyText xs = [ x | Text x <- xs ]
-
--- | Find all immediate children with the given name.
-findChildren :: QName -> Element -> [Element]
-findChildren q e = filterChildren ((q ==) . elName) e
-
--- | Filter all immediate children wrt a given predicate.
-filterChildren :: (Element -> Bool) -> Element -> [Element]
-filterChildren p e = filter p (onlyElems (elContent e))
-
-
--- | Filter all immediate children wrt a given predicate over their names.
-filterChildrenName :: (QName -> Bool) -> Element -> [Element]
-filterChildrenName p e = filter (p.elName) (onlyElems (elContent e))
-
-
--- | Find an immediate child with the given name.
-findChild :: QName -> Element -> Maybe Element
-findChild q e = listToMaybe (findChildren q e)
-
--- | Find an immediate child with the given name.
-filterChild :: (Element -> Bool) -> Element -> Maybe Element
-filterChild p e = listToMaybe (filterChildren p e)
-
--- | Find an immediate child with name matching a predicate.
-filterChildName :: (QName -> Bool) -> Element -> Maybe Element
-filterChildName p e = listToMaybe (filterChildrenName p e)
-
--- | Find the left-most occurrence of an element matching given name.
-findElement :: QName -> Element -> Maybe Element
-findElement q e = listToMaybe (findElements q e)
-
--- | Filter the left-most occurrence of an element wrt. given predicate.
-filterElement :: (Element -> Bool) -> Element -> Maybe Element
-filterElement p e = listToMaybe (filterElements p e)
-
--- | Filter the left-most occurrence of an element wrt. given predicate.
-filterElementName :: (QName -> Bool) -> Element -> Maybe Element
-filterElementName p e = listToMaybe (filterElementsName p e)
-
--- | Find all non-nested occurances of an element.
--- (i.e., once we have found an element, we do not search
--- for more occurances among the element's children).
-findElements :: QName -> Element -> [Element]
-findElements qn e = filterElementsName (qn==) e
-
--- | Find all non-nested occurrences of an element wrt. given predicate.
--- (i.e., once we have found an element, we do not search
--- for more occurances among the element's children).
-filterElements :: (Element -> Bool) -> Element -> [Element]
-filterElements p e
- | p e = [e]
- | otherwise = concatMap (filterElements p) $ onlyElems $ elContent e
-
--- | Find all non-nested occurences of an element wrt a predicate over element names.
--- (i.e., once we have found an element, we do not search
--- for more occurances among the element's children).
-filterElementsName :: (QName -> Bool) -> Element -> [Element]
-filterElementsName p e = filterElements (p.elName) e
-
--- | Lookup the value of an attribute.
-findAttr :: QName -> Element -> Maybe Text
-findAttr x e = lookupAttr x (elAttribs e)
-
--- | Lookup attribute name from list.
-lookupAttr :: QName -> [Attr] -> Maybe Text
-lookupAttr x = lookupAttrBy (x ==)
-
--- | Lookup the first attribute whose name satisfies the given predicate.
-lookupAttrBy :: (QName -> Bool) -> [Attr] -> Maybe Text
-lookupAttrBy p as = attrVal `fmap` find (p . attrKey) as
-
--- | Lookup the value of the first attribute whose name
--- satisfies the given predicate.
-findAttrBy :: (QName -> Bool) -> Element -> Maybe Text
-findAttrBy p e = lookupAttrBy p (elAttribs e)
-
-
---
--- duplicates functinos from Text.XML.Output
---
-
--- | The XML 1.0 header
-xmlHeader :: Text
-xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
-
-
---------------------------------------------------------------------------------
-data ConfigPP = ConfigPP
- { shortEmptyTag :: QName -> Bool
- , prettify :: Bool
- }
-
--- | Default pretty orinting configuration.
--- * Always use abbreviate empty tags.
-defaultConfigPP :: ConfigPP
-defaultConfigPP = ConfigPP { shortEmptyTag = const True
- , prettify = False
- }
-
--- | The predicate specifies for which empty tags we should use XML's
--- abbreviated notation <TAG />. This is useful if we are working with
--- some XML-ish standards (such as certain versions of HTML) where some
--- empty tags should always be displayed in the <TAG></TAG> form.
-useShortEmptyTags :: (QName -> Bool) -> ConfigPP -> ConfigPP
-useShortEmptyTags p c = c { shortEmptyTag = p }
-
-
--- | Specify if we should use extra white-space to make document more readable.
--- WARNING: This adds additional white-space to text elements,
--- and so it may change the meaning of the document.
-useExtraWhiteSpace :: Bool -> ConfigPP -> ConfigPP
-useExtraWhiteSpace p c = c { prettify = p }
-
--- | A configuration that tries to make things pretty
--- (possibly at the cost of changing the semantics a bit
--- through adding white space.)
-prettyConfigPP :: ConfigPP
-prettyConfigPP = useExtraWhiteSpace True defaultConfigPP
-
-
---------------------------------------------------------------------------------
-
-
--- | Pretty printing renders XML documents faithfully,
--- with the exception that whitespace may be added\/removed
--- in non-verbatim character data.
-ppTopElement :: Element -> Text
-ppTopElement = ppcTopElement prettyConfigPP
-
--- | Pretty printing elements
-ppElement :: Element -> Text
-ppElement = ppcElement prettyConfigPP
-
--- | Pretty printing content
-ppContent :: Content -> Text
-ppContent = ppcContent prettyConfigPP
-
--- | Pretty printing renders XML documents faithfully,
--- with the exception that whitespace may be added\/removed
--- in non-verbatim character data.
-ppcTopElement :: ConfigPP -> Element -> Text
-ppcTopElement c e = T.unlines [xmlHeader,ppcElement c e]
-
--- | Pretty printing elements
-ppcElement :: ConfigPP -> Element -> Text
-ppcElement c = TL.toStrict . toLazyText . ppElementS c mempty
-
--- | Pretty printing content
-ppcContent :: ConfigPP -> Content -> Text
-ppcContent c = TL.toStrict . toLazyText . ppContentS c mempty
-
-ppcCData :: ConfigPP -> CData -> Text
-ppcCData c = TL.toStrict . toLazyText . ppCDataS c mempty
-
-type Indent = Builder
-
--- | Pretty printing content using ShowT
-ppContentS :: ConfigPP -> Indent -> Content -> Builder
-ppContentS c i x = case x of
- Elem e -> ppElementS c i e
- Text t -> ppCDataS c i t
- CRef r -> showCRefS r
-
-ppElementS :: ConfigPP -> Indent -> Element -> Builder
-ppElementS c i e = i <> tagStart (elName e) (elAttribs e) <>
- (case elContent e of
- [] | "?" `T.isPrefixOf` qName name -> fromText " ?>"
- | shortEmptyTag c name -> fromText " />"
- [Text t] -> singleton '>' <> ppCDataS c mempty t <> tagEnd name
- cs -> singleton '>' <> nl <>
- mconcat (map ((<> nl) . ppContentS c (sp <> i)) cs) <>
- i <> tagEnd name
- where (nl,sp) = if prettify c then ("\n"," ") else ("","")
- )
- where name = elName e
-
-ppCDataS :: ConfigPP -> Indent -> CData -> Builder
-ppCDataS c i t = i <> if cdVerbatim t /= CDataText || not (prettify c)
- then showCDataS t
- else foldr cons mempty (T.unpack (showCData t))
- where cons :: Char -> Builder -> Builder
- cons '\n' ys = singleton '\n' <> i <> ys
- cons y ys = singleton y <> ys
-
-
-
---------------------------------------------------------------------------------
-
--- | Adds the <?xml?> header.
-showTopElement :: Element -> Text
-showTopElement c = xmlHeader <> showElement c
-
-showContent :: Content -> Text
-showContent = ppcContent defaultConfigPP
-
-showElement :: Element -> Text
-showElement = ppcElement defaultConfigPP
-
-showCData :: CData -> Text
-showCData = ppcCData defaultConfigPP
-
--- Note: crefs should not contain '&', ';', etc.
-showCRefS :: Text -> Builder
-showCRefS r = singleton '&' <> fromText r <> singleton ';'
-
--- | Convert a text element to characters.
-showCDataS :: CData -> Builder
-showCDataS cd =
- case cdVerbatim cd of
- CDataText -> escStr (cdData cd)
- CDataVerbatim -> fromText "<![CDATA[" <> escCData (cdData cd) <>
- fromText "]]>"
- CDataRaw -> fromText (cdData cd)
-
---------------------------------------------------------------------------------
-escCData :: Text -> Builder
-escCData t
- | "]]>" `T.isPrefixOf` t =
- fromText "]]]]><![CDATA[>" <> fromText (T.drop 3 t)
-escCData t
- = case T.uncons t of
- Nothing -> mempty
- Just (c,t') -> singleton c <> escCData t'
-
-escChar :: Char -> Builder
-escChar c = case c of
- '<' -> fromText "&lt;"
- '>' -> fromText "&gt;"
- '&' -> fromText "&amp;"
- '"' -> fromText "&quot;"
- -- we use &#39 instead of &apos; because IE apparently has difficulties
- -- rendering &apos; in xhtml.
- -- Reported by Rohan Drape <rohan.drape@gmail.com>.
- '\'' -> fromText "&#39;"
- _ -> singleton c
-
- {- original xml-light version:
- -- NOTE: We escape '\r' explicitly because otherwise they get lost
- -- when parsed back in because of then end-of-line normalization rules.
- _ | isPrint c || c == '\n' -> singleton c
- | otherwise -> showText "&#" . showsT oc . singleton ';'
- where oc = ord c
- -}
-
-escStr :: Text -> Builder
-escStr cs = if T.any needsEscape cs
- then mconcat (map escChar (T.unpack cs))
- else fromText cs
- where
- needsEscape '<' = True
- needsEscape '>' = True
- needsEscape '&' = True
- needsEscape '"' = True
- needsEscape '\'' = True
- needsEscape _ = False
-
-tagEnd :: QName -> Builder
-tagEnd qn = fromText "</" <> showQName qn <> singleton '>'
-
-tagStart :: QName -> [Attr] -> Builder
-tagStart qn as = singleton '<' <> showQName qn <> as_str
- where as_str = if null as
- then mempty
- else mconcat (map showAttr as)
-
-showAttr :: Attr -> Builder
-showAttr (Attr qn v) = singleton ' ' <> showQName qn <>
- singleton '=' <>
- singleton '"' <> escStr v <> singleton '"'
-
-showQName :: QName -> Builder
-showQName q =
- case qPrefix q of
- Nothing -> fromText (qName q)
- Just p -> fromText p <> singleton ':' <> fromText (qName q)
diff --git a/src/Text/Pandoc/XML/Light/Output.hs b/src/Text/Pandoc/XML/Light/Output.hs
new file mode 100644
index 000000000..dc94ce914
--- /dev/null
+++ b/src/Text/Pandoc/XML/Light/Output.hs
@@ -0,0 +1,230 @@
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.XML.Light.Output
+ Copyright : Copyright (C) 2021 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+-}
+module Text.Pandoc.XML.Light.Output
+ ( -- * Replacement for xml-light's Text.XML.Output
+ ppTopElement
+ , ppElement
+ , ppContent
+ , ppcElement
+ , ppcContent
+ , showTopElement
+ , showElement
+ , showContent
+ , useShortEmptyTags
+ , defaultConfigPP
+ , ConfigPP(..)
+ ) where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
+import Data.Text.Lazy.Builder (Builder, singleton, fromText, toLazyText)
+import Text.Pandoc.XML.Light.Types
+
+--
+-- duplicates functinos from Text.XML.Output
+--
+
+-- | The XML 1.0 header
+xmlHeader :: Text
+xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
+
+
+--------------------------------------------------------------------------------
+data ConfigPP = ConfigPP
+ { shortEmptyTag :: QName -> Bool
+ , prettify :: Bool
+ }
+
+-- | Default pretty orinting configuration.
+-- * Always use abbreviate empty tags.
+defaultConfigPP :: ConfigPP
+defaultConfigPP = ConfigPP { shortEmptyTag = const True
+ , prettify = False
+ }
+
+-- | The predicate specifies for which empty tags we should use XML's
+-- abbreviated notation <TAG />. This is useful if we are working with
+-- some XML-ish standards (such as certain versions of HTML) where some
+-- empty tags should always be displayed in the <TAG></TAG> form.
+useShortEmptyTags :: (QName -> Bool) -> ConfigPP -> ConfigPP
+useShortEmptyTags p c = c { shortEmptyTag = p }
+
+
+-- | Specify if we should use extra white-space to make document more readable.
+-- WARNING: This adds additional white-space to text elements,
+-- and so it may change the meaning of the document.
+useExtraWhiteSpace :: Bool -> ConfigPP -> ConfigPP
+useExtraWhiteSpace p c = c { prettify = p }
+
+-- | A configuration that tries to make things pretty
+-- (possibly at the cost of changing the semantics a bit
+-- through adding white space.)
+prettyConfigPP :: ConfigPP
+prettyConfigPP = useExtraWhiteSpace True defaultConfigPP
+
+
+--------------------------------------------------------------------------------
+
+
+-- | Pretty printing renders XML documents faithfully,
+-- with the exception that whitespace may be added\/removed
+-- in non-verbatim character data.
+ppTopElement :: Element -> Text
+ppTopElement = ppcTopElement prettyConfigPP
+
+-- | Pretty printing elements
+ppElement :: Element -> Text
+ppElement = ppcElement prettyConfigPP
+
+-- | Pretty printing content
+ppContent :: Content -> Text
+ppContent = ppcContent prettyConfigPP
+
+-- | Pretty printing renders XML documents faithfully,
+-- with the exception that whitespace may be added\/removed
+-- in non-verbatim character data.
+ppcTopElement :: ConfigPP -> Element -> Text
+ppcTopElement c e = T.unlines [xmlHeader,ppcElement c e]
+
+-- | Pretty printing elements
+ppcElement :: ConfigPP -> Element -> Text
+ppcElement c = TL.toStrict . toLazyText . ppElementS c mempty
+
+-- | Pretty printing content
+ppcContent :: ConfigPP -> Content -> Text
+ppcContent c = TL.toStrict . toLazyText . ppContentS c mempty
+
+ppcCData :: ConfigPP -> CData -> Text
+ppcCData c = TL.toStrict . toLazyText . ppCDataS c mempty
+
+type Indent = Builder
+
+-- | Pretty printing content using ShowT
+ppContentS :: ConfigPP -> Indent -> Content -> Builder
+ppContentS c i x = case x of
+ Elem e -> ppElementS c i e
+ Text t -> ppCDataS c i t
+ CRef r -> showCRefS r
+
+ppElementS :: ConfigPP -> Indent -> Element -> Builder
+ppElementS c i e = i <> tagStart (elName e) (elAttribs e) <>
+ (case elContent e of
+ [] | "?" `T.isPrefixOf` qName name -> fromText " ?>"
+ | shortEmptyTag c name -> fromText " />"
+ [Text t] -> singleton '>' <> ppCDataS c mempty t <> tagEnd name
+ cs -> singleton '>' <> nl <>
+ mconcat (map ((<> nl) . ppContentS c (sp <> i)) cs) <>
+ i <> tagEnd name
+ where (nl,sp) = if prettify c then ("\n"," ") else ("","")
+ )
+ where name = elName e
+
+ppCDataS :: ConfigPP -> Indent -> CData -> Builder
+ppCDataS c i t = i <> if cdVerbatim t /= CDataText || not (prettify c)
+ then showCDataS t
+ else foldr cons mempty (T.unpack (showCData t))
+ where cons :: Char -> Builder -> Builder
+ cons '\n' ys = singleton '\n' <> i <> ys
+ cons y ys = singleton y <> ys
+
+
+
+--------------------------------------------------------------------------------
+
+-- | Adds the <?xml?> header.
+showTopElement :: Element -> Text
+showTopElement c = xmlHeader <> showElement c
+
+showContent :: Content -> Text
+showContent = ppcContent defaultConfigPP
+
+showElement :: Element -> Text
+showElement = ppcElement defaultConfigPP
+
+showCData :: CData -> Text
+showCData = ppcCData defaultConfigPP
+
+-- Note: crefs should not contain '&', ';', etc.
+showCRefS :: Text -> Builder
+showCRefS r = singleton '&' <> fromText r <> singleton ';'
+
+-- | Convert a text element to characters.
+showCDataS :: CData -> Builder
+showCDataS cd =
+ case cdVerbatim cd of
+ CDataText -> escStr (cdData cd)
+ CDataVerbatim -> fromText "<![CDATA[" <> escCData (cdData cd) <>
+ fromText "]]>"
+ CDataRaw -> fromText (cdData cd)
+
+--------------------------------------------------------------------------------
+escCData :: Text -> Builder
+escCData t
+ | "]]>" `T.isPrefixOf` t =
+ fromText "]]]]><![CDATA[>" <> fromText (T.drop 3 t)
+escCData t
+ = case T.uncons t of
+ Nothing -> mempty
+ Just (c,t') -> singleton c <> escCData t'
+
+escChar :: Char -> Builder
+escChar c = case c of
+ '<' -> fromText "&lt;"
+ '>' -> fromText "&gt;"
+ '&' -> fromText "&amp;"
+ '"' -> fromText "&quot;"
+ -- we use &#39 instead of &apos; because IE apparently has difficulties
+ -- rendering &apos; in xhtml.
+ -- Reported by Rohan Drape <rohan.drape@gmail.com>.
+ '\'' -> fromText "&#39;"
+ _ -> singleton c
+
+ {- original xml-light version:
+ -- NOTE: We escape '\r' explicitly because otherwise they get lost
+ -- when parsed back in because of then end-of-line normalization rules.
+ _ | isPrint c || c == '\n' -> singleton c
+ | otherwise -> showText "&#" . showsT oc . singleton ';'
+ where oc = ord c
+ -}
+
+escStr :: Text -> Builder
+escStr cs = if T.any needsEscape cs
+ then mconcat (map escChar (T.unpack cs))
+ else fromText cs
+ where
+ needsEscape '<' = True
+ needsEscape '>' = True
+ needsEscape '&' = True
+ needsEscape '"' = True
+ needsEscape '\'' = True
+ needsEscape _ = False
+
+tagEnd :: QName -> Builder
+tagEnd qn = fromText "</" <> showQName qn <> singleton '>'
+
+tagStart :: QName -> [Attr] -> Builder
+tagStart qn as = singleton '<' <> showQName qn <> as_str
+ where as_str = if null as
+ then mempty
+ else mconcat (map showAttr as)
+
+showAttr :: Attr -> Builder
+showAttr (Attr qn v) = singleton ' ' <> showQName qn <>
+ singleton '=' <>
+ singleton '"' <> escStr v <> singleton '"'
+
+showQName :: QName -> Builder
+showQName q =
+ case qPrefix q of
+ Nothing -> fromText (qName q)
+ Just p -> fromText p <> singleton ':' <> fromText (qName q)
diff --git a/src/Text/Pandoc/XML/Light/Proc.hs b/src/Text/Pandoc/XML/Light/Proc.hs
new file mode 100644
index 000000000..838d5af74
--- /dev/null
+++ b/src/Text/Pandoc/XML/Light/Proc.hs
@@ -0,0 +1,138 @@
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.XML.Light.Proc
+ Copyright : Copyright (C) 2021 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+-}
+module Text.Pandoc.XML.Light.Proc
+ (
+ -- * Replacement for xml-light's Text.XML.Proc
+ strContent
+ , onlyElems
+ , elChildren
+ , onlyText
+ , findChildren
+ , filterChildren
+ , filterChildrenName
+ , findChild
+ , filterChild
+ , filterChildName
+ , findElement
+ , filterElement
+ , filterElementName
+ , findElements
+ , filterElements
+ , filterElementsName
+ , findAttr
+ , lookupAttr
+ , lookupAttrBy
+ , findAttrBy
+ ) where
+
+import Data.Text (Text)
+import Data.Maybe (listToMaybe)
+import Data.List(find)
+import Text.Pandoc.XML.Light.Types
+
+--
+-- copied from xml-light Text.XML.Proc
+--
+
+-- | Get the text value of an XML element. This function
+-- ignores non-text elements, and concatenates all text elements.
+strContent :: Element -> Text
+strContent = mconcat . map cdData . onlyText . elContent
+
+-- | Select only the elements from a list of XML content.
+onlyElems :: [Content] -> [Element]
+onlyElems xs = [ x | Elem x <- xs ]
+
+-- | Select only the elements from a parent.
+elChildren :: Element -> [Element]
+elChildren e = [ x | Elem x <- elContent e ]
+
+-- | Select only the text from a list of XML content.
+onlyText :: [Content] -> [CData]
+onlyText xs = [ x | Text x <- xs ]
+
+-- | Find all immediate children with the given name.
+findChildren :: QName -> Element -> [Element]
+findChildren q e = filterChildren ((q ==) . elName) e
+
+-- | Filter all immediate children wrt a given predicate.
+filterChildren :: (Element -> Bool) -> Element -> [Element]
+filterChildren p e = filter p (onlyElems (elContent e))
+
+
+-- | Filter all immediate children wrt a given predicate over their names.
+filterChildrenName :: (QName -> Bool) -> Element -> [Element]
+filterChildrenName p e = filter (p.elName) (onlyElems (elContent e))
+
+
+-- | Find an immediate child with the given name.
+findChild :: QName -> Element -> Maybe Element
+findChild q e = listToMaybe (findChildren q e)
+
+-- | Find an immediate child with the given name.
+filterChild :: (Element -> Bool) -> Element -> Maybe Element
+filterChild p e = listToMaybe (filterChildren p e)
+
+-- | Find an immediate child with name matching a predicate.
+filterChildName :: (QName -> Bool) -> Element -> Maybe Element
+filterChildName p e = listToMaybe (filterChildrenName p e)
+
+-- | Find the left-most occurrence of an element matching given name.
+findElement :: QName -> Element -> Maybe Element
+findElement q e = listToMaybe (findElements q e)
+
+-- | Filter the left-most occurrence of an element wrt. given predicate.
+filterElement :: (Element -> Bool) -> Element -> Maybe Element
+filterElement p e = listToMaybe (filterElements p e)
+
+-- | Filter the left-most occurrence of an element wrt. given predicate.
+filterElementName :: (QName -> Bool) -> Element -> Maybe Element
+filterElementName p e = listToMaybe (filterElementsName p e)
+
+-- | Find all non-nested occurances of an element.
+-- (i.e., once we have found an element, we do not search
+-- for more occurances among the element's children).
+findElements :: QName -> Element -> [Element]
+findElements qn e = filterElementsName (qn==) e
+
+-- | Find all non-nested occurrences of an element wrt. given predicate.
+-- (i.e., once we have found an element, we do not search
+-- for more occurances among the element's children).
+filterElements :: (Element -> Bool) -> Element -> [Element]
+filterElements p e
+ | p e = [e]
+ | otherwise = concatMap (filterElements p) $ onlyElems $ elContent e
+
+-- | Find all non-nested occurences of an element wrt a predicate over element names.
+-- (i.e., once we have found an element, we do not search
+-- for more occurances among the element's children).
+filterElementsName :: (QName -> Bool) -> Element -> [Element]
+filterElementsName p e = filterElements (p.elName) e
+
+-- | Lookup the value of an attribute.
+findAttr :: QName -> Element -> Maybe Text
+findAttr x e = lookupAttr x (elAttribs e)
+
+-- | Lookup attribute name from list.
+lookupAttr :: QName -> [Attr] -> Maybe Text
+lookupAttr x = lookupAttrBy (x ==)
+
+-- | Lookup the first attribute whose name satisfies the given predicate.
+lookupAttrBy :: (QName -> Bool) -> [Attr] -> Maybe Text
+lookupAttrBy p as = attrVal `fmap` find (p . attrKey) as
+
+-- | Lookup the value of the first attribute whose name
+-- satisfies the given predicate.
+findAttrBy :: (QName -> Bool) -> Element -> Maybe Text
+findAttrBy p e = lookupAttrBy p (elAttribs e)
+
+
diff --git a/src/Text/Pandoc/XML/Light/Types.hs b/src/Text/Pandoc/XML/Light/Types.hs
new file mode 100644
index 000000000..f338da6ea
--- /dev/null
+++ b/src/Text/Pandoc/XML/Light/Types.hs
@@ -0,0 +1,190 @@
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE DeriveDataTypeable #-}
+{- |
+ Module : Text.Pandoc.XML.Light.Types
+ Copyright : Copyright (C) 2021 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+-}
+module Text.Pandoc.XML.Light.Types
+ ( -- * Basic types, duplicating those from xml-light but with Text
+ -- instead of String
+ Line
+ , Content(..)
+ , Element(..)
+ , Attr(..)
+ , CData(..)
+ , CDataKind(..)
+ , QName(..)
+ , Node(..)
+ , unode
+ , unqual
+ , add_attr
+ , add_attrs
+ -- * Conversion functions from xml-light types
+ , fromXLQName
+ , fromXLCData
+ , fromXLElement
+ , fromXLAttr
+ , fromXLContent
+ ) where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+import Data.Data (Data)
+import Data.Typeable (Typeable)
+import qualified Text.XML.Light as XL
+
+--
+-- type definitions lightly modified from xml-light
+--
+
+-- | A line is an Integer
+type Line = Integer
+
+-- | XML content
+data Content = Elem Element
+ | Text CData
+ | CRef Text
+ deriving (Show, Typeable, Data)
+
+-- | XML elements
+data Element = Element {
+ elName :: QName,
+ elAttribs :: [Attr],
+ elContent :: [Content],
+ elLine :: Maybe Line
+ } deriving (Show, Typeable, Data)
+
+-- | XML attributes
+data Attr = Attr {
+ attrKey :: QName,
+ attrVal :: Text
+ } deriving (Eq, Ord, Show, Typeable, Data)
+
+-- | XML CData
+data CData = CData {
+ cdVerbatim :: CDataKind,
+ cdData :: Text,
+ cdLine :: Maybe Line
+ } deriving (Show, Typeable, Data)
+
+data CDataKind
+ = CDataText -- ^ Ordinary character data; pretty printer escapes &, < etc.
+ | CDataVerbatim -- ^ Unescaped character data; pretty printer embeds it in <![CDATA[..
+ | CDataRaw -- ^ As-is character data; pretty printer passes it along without any escaping or CDATA wrap-up.
+ deriving ( Eq, Show, Typeable, Data )
+
+-- | XML qualified names
+data QName = QName {
+ qName :: Text,
+ qURI :: Maybe Text,
+ qPrefix :: Maybe Text
+ } deriving (Show, Typeable, Data)
+
+
+instance Eq QName where
+ q1 == q2 = compare q1 q2 == EQ
+
+instance Ord QName where
+ compare q1 q2 =
+ case compare (qName q1) (qName q2) of
+ EQ -> case (qURI q1, qURI q2) of
+ (Nothing,Nothing) -> compare (qPrefix q1) (qPrefix q2)
+ (u1,u2) -> compare u1 u2
+ x -> x
+
+class Node t where
+ node :: QName -> t -> Element
+
+instance Node ([Attr],[Content]) where
+ node n (attrs,cont) = Element { elName = n
+ , elAttribs = attrs
+ , elContent = cont
+ , elLine = Nothing
+ }
+
+instance Node [Attr] where node n as = node n (as,[]::[Content])
+instance Node Attr where node n a = node n [a]
+instance Node () where node n () = node n ([]::[Attr])
+
+instance Node [Content] where node n cs = node n ([]::[Attr],cs)
+instance Node Content where node n c = node n [c]
+instance Node ([Attr],Content) where node n (as,c) = node n (as,[c])
+instance Node (Attr,Content) where node n (a,c) = node n ([a],[c])
+
+instance Node ([Attr],[Element]) where
+ node n (as,cs) = node n (as,map Elem cs)
+
+instance Node ([Attr],Element) where node n (as,c) = node n (as,[c])
+instance Node (Attr,Element) where node n (a,c) = node n ([a],c)
+instance Node [Element] where node n es = node n ([]::[Attr],es)
+instance Node Element where node n e = node n [e]
+
+instance Node ([Attr],[CData]) where
+ node n (as,cs) = node n (as,map Text cs)
+
+instance Node ([Attr],CData) where node n (as,c) = node n (as,[c])
+instance Node (Attr,CData) where node n (a,c) = node n ([a],c)
+instance Node [CData] where node n es = node n ([]::[Attr],es)
+instance Node CData where node n e = node n [e]
+
+instance Node ([Attr],Text) where
+ node n (as,t) = node n (as, CData { cdVerbatim = CDataText
+ , cdData = t
+ , cdLine = Nothing })
+
+instance Node (Attr,Text ) where node n (a,t) = node n ([a],t)
+instance Node Text where node n t = node n ([]::[Attr],t)
+
+-- | Create node with unqualified name
+unode :: Node t => Text -> t -> Element
+unode = node . unqual
+
+unqual :: Text -> QName
+unqual x = QName x Nothing Nothing
+
+-- | Add an attribute to an element.
+add_attr :: Attr -> Element -> Element
+add_attr a e = add_attrs [a] e
+
+-- | Add some attributes to an element.
+add_attrs :: [Attr] -> Element -> Element
+add_attrs as e = e { elAttribs = as ++ elAttribs e }
+
+--
+-- conversion from xml-light
+--
+
+fromXLQName :: XL.QName -> QName
+fromXLQName qn = QName { qName = T.pack $ XL.qName qn
+ , qURI = T.pack <$> XL.qURI qn
+ , qPrefix = T.pack <$> XL.qPrefix qn }
+
+fromXLCData :: XL.CData -> CData
+fromXLCData cd = CData { cdVerbatim = case XL.cdVerbatim cd of
+ XL.CDataText -> CDataText
+ XL.CDataVerbatim -> CDataVerbatim
+ XL.CDataRaw -> CDataRaw
+ , cdData = T.pack $ XL.cdData cd
+ , cdLine = XL.cdLine cd }
+
+fromXLElement :: XL.Element -> Element
+fromXLElement el = Element { elName = fromXLQName $ XL.elName el
+ , elAttribs = map fromXLAttr $ XL.elAttribs el
+ , elContent = map fromXLContent $ XL.elContent el
+ , elLine = XL.elLine el }
+
+fromXLAttr :: XL.Attr -> Attr
+fromXLAttr (XL.Attr qn s) = Attr (fromXLQName qn) (T.pack s)
+
+fromXLContent :: XL.Content -> Content
+fromXLContent (XL.Elem el) = Elem $ fromXLElement el
+fromXLContent (XL.Text cd) = Text $ fromXLCData cd
+fromXLContent (XL.CRef s) = CRef (T.pack s)
+
+