Split up T.P.XML.Light into submodules.

author: John MacFarlane <jgm@berkeley.edu> 2021-02-16 18:40:06 -0800
committer: John MacFarlane <jgm@berkeley.edu> 2021-02-16 18:40:06 -0800
commit: d7a4996b1e7e201095ce792375a0776984fa8fcd (patch)
tree: 2d9fe829471bd4217a3e92b72d1c713abe77f274
parent: 967e7f5fb990b29de48b37be1db40fb149a8cf55 (diff)
download: pandoc-d7a4996b1e7e201095ce792375a0776984fa8fcd.tar.gz
5 files changed, 568 insertions, 504 deletions
diff --git a/pandoc.cabal b/pandoc.cabal
index 22aebd55e..d27520ba0 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -688,6 +688,9 @@ library
                    Text.Pandoc.Lua.Util,
                    Text.Pandoc.Lua.Walk,
                    Text.Pandoc.XML.Light,
+                   Text.Pandoc.XML.Light.Types,
+                   Text.Pandoc.XML.Light.Proc,
+                   Text.Pandoc.XML.Light.Output,
                    Text.Pandoc.CSS,
                    Text.Pandoc.CSV,
                    Text.Pandoc.RoffChar,
diff --git a/src/Text/Pandoc/XML/Light.hs b/src/Text/Pandoc/XML/Light.hs
index 38e4df218..07113ea92 100644
--- a/src/Text/Pandoc/XML/Light.hs
+++ b/src/Text/Pandoc/XML/Light.hs
@@ -1,5 +1,4 @@
 {-# LANGUAGE FlexibleInstances #-}
-{-# LANGUAGE DeriveDataTypeable #-}
 {-# LANGUAGE OverloadedStrings #-}
 {- |
    Module      : Text.Pandoc.XML.Light
@@ -31,59 +30,9 @@ better performance and accuracy without much change in the
 code that used xml-light.
 -}
 module Text.Pandoc.XML.Light
-  ( -- * Basic types, duplicating those from xml-light but with Text
-    -- instead of String
-    Line
-  , Content(..)
-  , Element(..)
-  , Attr(..)
-  , CData(..)
-  , CDataKind(..)
-  , QName(..)
-  , Node(..)
-  , unode
-  , unqual
-  , add_attr
-  , add_attrs
-    -- * Conversion functions from xml-light types
-  , fromXLQName
-  , fromXLCData
-  , fromXLElement
-  , fromXLAttr
-  , fromXLContent
-    -- * Replacement for xml-light's Text.XML.Proc
-  , strContent
-  , onlyElems
-  , elChildren
-  , onlyText
-  , findChildren
-  , filterChildren
-  , filterChildrenName
-  , findChild
-  , filterChild
-  , filterChildName
-  , findElement
-  , filterElement
-  , filterElementName
-  , findElements
-  , filterElements
-  , filterElementsName
-  , findAttr
-  , lookupAttr
-  , lookupAttrBy
-  , findAttrBy
-    -- * Replacement for xml-light's Text.XML.Output
-  , ppTopElement
-  , ppElement
-  , ppContent
-  , ppcElement
-  , ppcContent
-  , showTopElement
-  , showElement
-  , showContent
-  , useShortEmptyTags
-  , defaultConfigPP
-  , ConfigPP(..)
+  ( module Text.Pandoc.XML.Light.Types
+  , module Text.Pandoc.XML.Light.Proc
+  , module Text.Pandoc.XML.Light.Output
     -- * Replacement for xml-light's Text.XML.Input
   , parseXMLElement
   , parseXMLContents
@@ -92,16 +41,13 @@ module Text.Pandoc.XML.Light
 import qualified Control.Exception as E
 import qualified Text.XML as Conduit
 import Text.XML.Unresolved (InvalidEventStream(..))
-import Data.Text (Text)
 import qualified Data.Text as T
 import qualified Data.Text.Lazy as TL
-import Data.Text.Lazy.Builder (Builder, singleton, fromText, toLazyText)
 import qualified Data.Map as M
-import Data.Data (Data)
-import Data.Typeable (Typeable)
-import Data.Maybe (mapMaybe, listToMaybe)
-import Data.List(find)
-import qualified Text.XML.Light as XL
+import Data.Maybe (mapMaybe)
+import Text.Pandoc.XML.Light.Types
+import Text.Pandoc.XML.Light.Proc
+import Text.Pandoc.XML.Light.Output
 
 -- Drop in replacement for parseXMLDoc in xml-light.
 parseXMLElement :: TL.Text -> Either T.Text Element
@@ -141,446 +87,3 @@ nodeToContent (Conduit.NodeContent t) =
   Just (Text (CData CDataText t Nothing))
 nodeToContent _ = Nothing
 
-unqual :: Text -> QName
-unqual x = QName x Nothing Nothing
-
--- | Add an attribute to an element.
-add_attr :: Attr -> Element -> Element
-add_attr a e = add_attrs [a] e
-
--- | Add some attributes to an element.
-add_attrs :: [Attr] -> Element -> Element
-add_attrs as e = e { elAttribs = as ++ elAttribs e }
-
---
--- type definitions lightly modified from xml-light
---
-
--- | A line is an Integer
-type Line     = Integer
-
--- | XML content
-data Content  = Elem Element
-              | Text CData
-              | CRef Text
-                deriving (Show, Typeable, Data)
-
--- | XML elements
-data Element  = Element {
-                  elName      :: QName,
-                  elAttribs   :: [Attr],
-                  elContent   :: [Content],
-                  elLine      :: Maybe Line
-                } deriving (Show, Typeable, Data)
-
--- | XML attributes
-data Attr     = Attr {
-                  attrKey :: QName,
-                  attrVal :: Text
-                } deriving (Eq, Ord, Show, Typeable, Data)
-
--- | XML CData
-data CData    = CData {
-                  cdVerbatim  :: CDataKind,
-                  cdData      :: Text,
-                  cdLine      :: Maybe Line
-                } deriving (Show, Typeable, Data)
-
-data CDataKind
- = CDataText      -- ^ Ordinary character data; pretty printer escapes &, < etc.
- | CDataVerbatim  -- ^ Unescaped character data; pretty printer embeds it in <![CDATA[..
- | CDataRaw       -- ^ As-is character data; pretty printer passes it along without any escaping or CDATA wrap-up.
-   deriving ( Eq, Show, Typeable, Data )
-
--- | XML qualified names
-data QName    = QName {
-                  qName   :: Text,
-                  qURI    :: Maybe Text,
-                  qPrefix :: Maybe Text
-                } deriving (Show, Typeable, Data)
-
-
-instance Eq QName where
-  q1 == q2  = compare q1 q2 == EQ
-
-instance Ord QName where
-  compare q1 q2 =
-    case compare (qName q1) (qName q2) of
-      EQ  -> case (qURI q1, qURI q2) of
-               (Nothing,Nothing) -> compare (qPrefix q1) (qPrefix q2)
-               (u1,u2)           -> compare u1 u2
-      x   -> x
-
-class Node t where
- node :: QName -> t -> Element
-
-instance Node ([Attr],[Content]) where
-  node n (attrs,cont) = Element { elName     = n
-                                , elAttribs  = attrs
-                                , elContent  = cont
-                                , elLine     = Nothing
-                                }
-
-instance Node [Attr]             where node n as   = node n (as,[]::[Content])
-instance Node Attr               where node n a    = node n [a]
-instance Node ()                 where node n ()   = node n ([]::[Attr])
-
-instance Node [Content]          where node n cs     = node n ([]::[Attr],cs)
-instance Node Content            where node n c      = node n [c]
-instance Node ([Attr],Content)   where node n (as,c) = node n (as,[c])
-instance Node (Attr,Content)     where node n (a,c)  = node n ([a],[c])
-
-instance Node ([Attr],[Element]) where
-  node n (as,cs) = node n (as,map Elem cs)
-
-instance Node ([Attr],Element)   where node n (as,c) = node n (as,[c])
-instance Node (Attr,Element)     where node n (a,c)  = node n ([a],c)
-instance Node [Element]          where node n es     = node n ([]::[Attr],es)
-instance Node Element            where node n e      = node n [e]
-
-instance Node ([Attr],[CData])   where
-  node n (as,cs) = node n (as,map Text cs)
-
-instance Node ([Attr],CData)     where node n (as,c) = node n (as,[c])
-instance Node (Attr,CData)       where node n (a,c)  = node n ([a],c)
-instance Node [CData]            where node n es     = node n ([]::[Attr],es)
-instance Node CData              where node n e      = node n [e]
-
-instance Node ([Attr],Text)      where
-  node n (as,t) = node n (as, CData { cdVerbatim = CDataText
-                                    , cdData = t
-                                    , cdLine = Nothing })
-
-instance Node (Attr,Text )       where node n (a,t)  = node n ([a],t)
-instance Node Text               where node n t      = node n ([]::[Attr],t)
-
--- | Create node with unqualified name
-unode :: Node t => Text -> t -> Element
-unode = node . unqual
-
---
--- conversion from xml-light
---
-
-fromXLQName :: XL.QName -> QName
-fromXLQName qn = QName { qName = T.pack $ XL.qName qn
-                       , qURI = T.pack <$> XL.qURI qn
-                       , qPrefix = T.pack <$> XL.qPrefix qn }
-
-fromXLCData :: XL.CData -> CData
-fromXLCData cd = CData { cdVerbatim = case XL.cdVerbatim cd of
-                                        XL.CDataText -> CDataText
-                                        XL.CDataVerbatim -> CDataVerbatim
-                                        XL.CDataRaw -> CDataRaw
-                       , cdData = T.pack $ XL.cdData cd
-                       , cdLine = XL.cdLine cd }
-
-fromXLElement :: XL.Element -> Element
-fromXLElement el = Element { elName = fromXLQName $ XL.elName el
-                           , elAttribs = map fromXLAttr $ XL.elAttribs el
-                           , elContent = map fromXLContent $ XL.elContent el
-                           , elLine = XL.elLine el }
-
-fromXLAttr :: XL.Attr -> Attr
-fromXLAttr (XL.Attr qn s) = Attr (fromXLQName qn) (T.pack s)
-
-fromXLContent :: XL.Content -> Content
-fromXLContent (XL.Elem el) = Elem $ fromXLElement el
-fromXLContent (XL.Text cd) = Text $ fromXLCData cd
-fromXLContent (XL.CRef s)  = CRef (T.pack s)
-
---
--- copied from xml-light Text.XML.Proc
---
-
--- | Get the text value of an XML element.  This function
--- ignores non-text elements, and concatenates all text elements.
-strContent         :: Element -> Text
-strContent          = mconcat . map cdData . onlyText . elContent
-
--- | Select only the elements from a list of XML content.
-onlyElems          :: [Content] -> [Element]
-onlyElems xs        = [ x | Elem x <- xs ]
-
--- | Select only the elements from a parent.
-elChildren         :: Element -> [Element]
-elChildren e        = [ x | Elem x <- elContent e ]
-
--- | Select only the text from a list of XML content.
-onlyText           :: [Content] -> [CData]
-onlyText xs         = [ x | Text x <- xs ]
-
--- | Find all immediate children with the given name.
-findChildren       :: QName -> Element -> [Element]
-findChildren q e    = filterChildren ((q ==) . elName) e
-
--- | Filter all immediate children wrt a given predicate.
-filterChildren       :: (Element -> Bool) -> Element -> [Element]
-filterChildren p e    = filter p (onlyElems (elContent e))
-
-
--- | Filter all immediate children wrt a given predicate over their names.
-filterChildrenName      :: (QName -> Bool) -> Element -> [Element]
-filterChildrenName p e   = filter (p.elName) (onlyElems (elContent e))
-
-
--- | Find an immediate child with the given name.
-findChild          :: QName -> Element -> Maybe Element
-findChild q e       = listToMaybe (findChildren q e)
-
--- | Find an immediate child with the given name.
-filterChild          :: (Element -> Bool) -> Element -> Maybe Element
-filterChild p e       = listToMaybe (filterChildren p e)
-
--- | Find an immediate child with name matching a predicate.
-filterChildName      :: (QName -> Bool) -> Element -> Maybe Element
-filterChildName p e   = listToMaybe (filterChildrenName p e)
-
--- | Find the left-most occurrence of an element matching given name.
-findElement        :: QName -> Element -> Maybe Element
-findElement q e     = listToMaybe (findElements q e)
-
--- | Filter the left-most occurrence of an element wrt. given predicate.
-filterElement        :: (Element -> Bool) -> Element -> Maybe Element
-filterElement p e     = listToMaybe (filterElements p e)
-
--- | Filter the left-most occurrence of an element wrt. given predicate.
-filterElementName     :: (QName -> Bool) -> Element -> Maybe Element
-filterElementName p e  = listToMaybe (filterElementsName p e)
-
--- | Find all non-nested occurances of an element.
--- (i.e., once we have found an element, we do not search
--- for more occurances among the element's children).
-findElements       :: QName -> Element -> [Element]
-findElements qn e = filterElementsName (qn==) e
-
--- | Find all non-nested occurrences of an element wrt. given predicate.
--- (i.e., once we have found an element, we do not search
--- for more occurances among the element's children).
-filterElements       :: (Element -> Bool) -> Element -> [Element]
-filterElements p e
- | p e        = [e]
- | otherwise  = concatMap (filterElements p) $ onlyElems $ elContent e
-
--- | Find all non-nested occurences of an element wrt a predicate over element names.
--- (i.e., once we have found an element, we do not search
--- for more occurances among the element's children).
-filterElementsName       :: (QName -> Bool) -> Element -> [Element]
-filterElementsName p e = filterElements (p.elName) e
-
--- | Lookup the value of an attribute.
-findAttr          :: QName -> Element -> Maybe Text
-findAttr x e       = lookupAttr x (elAttribs e)
-
--- | Lookup attribute name from list.
-lookupAttr        :: QName -> [Attr] -> Maybe Text
-lookupAttr x       = lookupAttrBy (x ==)
-
--- | Lookup the first attribute whose name satisfies the given predicate.
-lookupAttrBy       :: (QName -> Bool) -> [Attr] -> Maybe Text
-lookupAttrBy p as   = attrVal `fmap` find (p . attrKey) as
-
--- | Lookup the value of the first attribute whose name
--- satisfies the given predicate.
-findAttrBy         :: (QName -> Bool) -> Element -> Maybe Text
-findAttrBy p e      = lookupAttrBy p (elAttribs e)
-
-
---
--- duplicates functinos from Text.XML.Output
---
-
--- | The XML 1.0 header
-xmlHeader :: Text
-xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
-
-
---------------------------------------------------------------------------------
-data ConfigPP = ConfigPP
-  { shortEmptyTag :: QName -> Bool
-  , prettify      :: Bool
-  }
-
--- | Default pretty orinting configuration.
---  * Always use abbreviate empty tags.
-defaultConfigPP :: ConfigPP
-defaultConfigPP = ConfigPP { shortEmptyTag = const True
-                           , prettify      = False
-                           }
-
--- | The predicate specifies for which empty tags we should use XML's
--- abbreviated notation <TAG />.  This is useful if we are working with
--- some XML-ish standards (such as certain versions of HTML) where some
--- empty tags should always be displayed in the <TAG></TAG> form.
-useShortEmptyTags :: (QName -> Bool) -> ConfigPP -> ConfigPP
-useShortEmptyTags p c = c { shortEmptyTag = p }
-
-
--- | Specify if we should use extra white-space to make document more readable.
--- WARNING: This adds additional white-space to text elements,
--- and so it may change the meaning of the document.
-useExtraWhiteSpace :: Bool -> ConfigPP -> ConfigPP
-useExtraWhiteSpace p c  = c { prettify = p }
-
--- | A configuration that tries to make things pretty
--- (possibly at the cost of changing the semantics a bit
--- through adding white space.)
-prettyConfigPP     :: ConfigPP
-prettyConfigPP      = useExtraWhiteSpace True defaultConfigPP
-
-
---------------------------------------------------------------------------------
-
-
--- | Pretty printing renders XML documents faithfully,
--- with the exception that whitespace may be added\/removed
--- in non-verbatim character data.
-ppTopElement       :: Element -> Text
-ppTopElement        = ppcTopElement prettyConfigPP
-
--- | Pretty printing elements
-ppElement          :: Element -> Text
-ppElement           = ppcElement prettyConfigPP
-
--- | Pretty printing content
-ppContent          :: Content -> Text
-ppContent           = ppcContent prettyConfigPP
-
--- | Pretty printing renders XML documents faithfully,
--- with the exception that whitespace may be added\/removed
--- in non-verbatim character data.
-ppcTopElement      :: ConfigPP -> Element -> Text
-ppcTopElement c e   = T.unlines [xmlHeader,ppcElement c e]
-
--- | Pretty printing elements
-ppcElement         :: ConfigPP -> Element -> Text
-ppcElement c        = TL.toStrict . toLazyText . ppElementS c mempty
-
--- | Pretty printing content
-ppcContent         :: ConfigPP -> Content -> Text
-ppcContent c        = TL.toStrict . toLazyText . ppContentS c mempty
-
-ppcCData           :: ConfigPP -> CData -> Text
-ppcCData c         = TL.toStrict . toLazyText . ppCDataS c mempty
-
-type Indent = Builder
-
--- | Pretty printing content using ShowT
-ppContentS         :: ConfigPP -> Indent -> Content -> Builder
-ppContentS c i x = case x of
-                     Elem e -> ppElementS c i e
-                     Text t -> ppCDataS c i t
-                     CRef r -> showCRefS r
-
-ppElementS         :: ConfigPP -> Indent -> Element -> Builder
-ppElementS c i e = i <> tagStart (elName e) (elAttribs e) <>
-  (case elContent e of
-    [] | "?" `T.isPrefixOf` qName name -> fromText " ?>"
-       | shortEmptyTag c name  -> fromText " />"
-    [Text t] -> singleton '>' <> ppCDataS c mempty t <> tagEnd name
-    cs -> singleton '>' <> nl <>
-          mconcat (map ((<> nl) . ppContentS c (sp <> i)) cs) <>
-          i <> tagEnd name
-      where (nl,sp)  = if prettify c then ("\n","  ") else ("","")
-  )
-  where name = elName e
-
-ppCDataS           :: ConfigPP -> Indent -> CData -> Builder
-ppCDataS c i t     = i <> if cdVerbatim t /= CDataText || not (prettify c)
-                             then showCDataS t
-                             else foldr cons mempty (T.unpack (showCData t))
-  where cons         :: Char -> Builder -> Builder
-        cons '\n' ys  = singleton '\n' <> i <> ys
-        cons y ys     = singleton y <> ys
-
-
-
---------------------------------------------------------------------------------
-
--- | Adds the <?xml?> header.
-showTopElement     :: Element -> Text
-showTopElement c    = xmlHeader <> showElement c
-
-showContent        :: Content -> Text
-showContent         = ppcContent defaultConfigPP
-
-showElement        :: Element -> Text
-showElement         = ppcElement defaultConfigPP
-
-showCData          :: CData -> Text
-showCData           = ppcCData defaultConfigPP
-
--- Note: crefs should not contain '&', ';', etc.
-showCRefS          :: Text -> Builder
-showCRefS r         = singleton '&' <> fromText r <> singleton ';'
-
--- | Convert a text element to characters.
-showCDataS         :: CData -> Builder
-showCDataS cd =
- case cdVerbatim cd of
-   CDataText     -> escStr (cdData cd)
-   CDataVerbatim -> fromText "<![CDATA[" <> escCData (cdData cd) <>
-                    fromText "]]>"
-   CDataRaw      -> fromText (cdData cd)
-
---------------------------------------------------------------------------------
-escCData           :: Text -> Builder
-escCData t
-  | "]]>" `T.isPrefixOf` t =
-     fromText "]]]]><![CDATA[>" <> fromText (T.drop 3 t)
-escCData t
-  = case T.uncons t of
-      Nothing     -> mempty
-      Just (c,t') -> singleton c <> escCData t'
-
-escChar            :: Char -> Builder
-escChar c = case c of
-  '<'   -> fromText "&lt;"
-  '>'   -> fromText "&gt;"
-  '&'   -> fromText "&amp;"
-  '"'   -> fromText "&quot;"
-  -- we use &#39 instead of &apos; because IE apparently has difficulties
-  -- rendering &apos; in xhtml.
-  -- Reported by Rohan Drape <rohan.drape@gmail.com>.
-  '\''  -> fromText "&#39;"
-  _     -> singleton c
-
-  {- original xml-light version:
-  -- NOTE: We escape '\r' explicitly because otherwise they get lost
-  -- when parsed back in because of then end-of-line normalization rules.
-  _ | isPrint c || c == '\n' -> singleton c
-    | otherwise -> showText "&#" . showsT oc . singleton ';'
-      where oc = ord c
-  -}
-
-escStr             :: Text -> Builder
-escStr cs          = if T.any needsEscape cs
-                        then mconcat (map escChar (T.unpack cs))
-                        else fromText cs
- where
-  needsEscape '<' = True
-  needsEscape '>' = True
-  needsEscape '&' = True
-  needsEscape '"' = True
-  needsEscape '\'' = True
-  needsEscape _ = False
-
-tagEnd             :: QName -> Builder
-tagEnd qn           = fromText "</" <> showQName qn <> singleton '>'
-
-tagStart           :: QName -> [Attr] -> Builder
-tagStart qn as      = singleton '<' <> showQName qn <> as_str
- where as_str       = if null as
-                         then mempty
-                         else mconcat (map showAttr as)
-
-showAttr           :: Attr -> Builder
-showAttr (Attr qn v) = singleton ' ' <> showQName qn <>
-                       singleton '=' <>
-                       singleton '"' <> escStr v <> singleton '"'
-
-showQName          :: QName -> Builder
-showQName q         =
-  case qPrefix q of
-    Nothing -> fromText (qName q)
-    Just p  -> fromText p <> singleton ':' <> fromText (qName q)
diff --git a/src/Text/Pandoc/XML/Light/Output.hs b/src/Text/Pandoc/XML/Light/Output.hs
new file mode 100644
index 000000000..dc94ce914
--- /dev/null
+++ b/src/Text/Pandoc/XML/Light/Output.hs
@@ -0,0 +1,230 @@
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.XML.Light.Output
+   Copyright   : Copyright (C) 2021 John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : John MacFarlane <jgm@berkeley.edu>
+   Stability   : alpha
+   Portability : portable
+-}
+module Text.Pandoc.XML.Light.Output
+  ( -- * Replacement for xml-light's Text.XML.Output
+    ppTopElement
+  , ppElement
+  , ppContent
+  , ppcElement
+  , ppcContent
+  , showTopElement
+  , showElement
+  , showContent
+  , useShortEmptyTags
+  , defaultConfigPP
+  , ConfigPP(..)
+  ) where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
+import Data.Text.Lazy.Builder (Builder, singleton, fromText, toLazyText)
+import Text.Pandoc.XML.Light.Types
+
+--
+-- duplicates functinos from Text.XML.Output
+--
+
+-- | The XML 1.0 header
+xmlHeader :: Text
+xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
+
+
+--------------------------------------------------------------------------------
+data ConfigPP = ConfigPP
+  { shortEmptyTag :: QName -> Bool
+  , prettify      :: Bool
+  }
+
+-- | Default pretty orinting configuration.
+--  * Always use abbreviate empty tags.
+defaultConfigPP :: ConfigPP
+defaultConfigPP = ConfigPP { shortEmptyTag = const True
+                           , prettify      = False
+                           }
+
+-- | The predicate specifies for which empty tags we should use XML's
+-- abbreviated notation <TAG />.  This is useful if we are working with
+-- some XML-ish standards (such as certain versions of HTML) where some
+-- empty tags should always be displayed in the <TAG></TAG> form.
+useShortEmptyTags :: (QName -> Bool) -> ConfigPP -> ConfigPP
+useShortEmptyTags p c = c { shortEmptyTag = p }
+
+
+-- | Specify if we should use extra white-space to make document more readable.
+-- WARNING: This adds additional white-space to text elements,
+-- and so it may change the meaning of the document.
+useExtraWhiteSpace :: Bool -> ConfigPP -> ConfigPP
+useExtraWhiteSpace p c  = c { prettify = p }
+
+-- | A configuration that tries to make things pretty
+-- (possibly at the cost of changing the semantics a bit
+-- through adding white space.)
+prettyConfigPP     :: ConfigPP
+prettyConfigPP      = useExtraWhiteSpace True defaultConfigPP
+
+
+--------------------------------------------------------------------------------
+
+
+-- | Pretty printing renders XML documents faithfully,
+-- with the exception that whitespace may be added\/removed
+-- in non-verbatim character data.
+ppTopElement       :: Element -> Text
+ppTopElement        = ppcTopElement prettyConfigPP
+
+-- | Pretty printing elements
+ppElement          :: Element -> Text
+ppElement           = ppcElement prettyConfigPP
+
+-- | Pretty printing content
+ppContent          :: Content -> Text
+ppContent           = ppcContent prettyConfigPP
+
+-- | Pretty printing renders XML documents faithfully,
+-- with the exception that whitespace may be added\/removed
+-- in non-verbatim character data.
+ppcTopElement      :: ConfigPP -> Element -> Text
+ppcTopElement c e   = T.unlines [xmlHeader,ppcElement c e]
+
+-- | Pretty printing elements
+ppcElement         :: ConfigPP -> Element -> Text
+ppcElement c        = TL.toStrict . toLazyText . ppElementS c mempty
+
+-- | Pretty printing content
+ppcContent         :: ConfigPP -> Content -> Text
+ppcContent c        = TL.toStrict . toLazyText . ppContentS c mempty
+
+ppcCData           :: ConfigPP -> CData -> Text
+ppcCData c         = TL.toStrict . toLazyText . ppCDataS c mempty
+
+type Indent = Builder
+
+-- | Pretty printing content using ShowT
+ppContentS         :: ConfigPP -> Indent -> Content -> Builder
+ppContentS c i x = case x of
+                     Elem e -> ppElementS c i e
+                     Text t -> ppCDataS c i t
+                     CRef r -> showCRefS r
+
+ppElementS         :: ConfigPP -> Indent -> Element -> Builder
+ppElementS c i e = i <> tagStart (elName e) (elAttribs e) <>
+  (case elContent e of
+    [] | "?" `T.isPrefixOf` qName name -> fromText " ?>"
+       | shortEmptyTag c name  -> fromText " />"
+    [Text t] -> singleton '>' <> ppCDataS c mempty t <> tagEnd name
+    cs -> singleton '>' <> nl <>
+          mconcat (map ((<> nl) . ppContentS c (sp <> i)) cs) <>
+          i <> tagEnd name
+      where (nl,sp)  = if prettify c then ("\n","  ") else ("","")
+  )
+  where name = elName e
+
+ppCDataS           :: ConfigPP -> Indent -> CData -> Builder
+ppCDataS c i t     = i <> if cdVerbatim t /= CDataText || not (prettify c)
+                             then showCDataS t
+                             else foldr cons mempty (T.unpack (showCData t))
+  where cons         :: Char -> Builder -> Builder
+        cons '\n' ys  = singleton '\n' <> i <> ys
+        cons y ys     = singleton y <> ys
+
+
+
+--------------------------------------------------------------------------------
+
+-- | Adds the <?xml?> header.
+showTopElement     :: Element -> Text
+showTopElement c    = xmlHeader <> showElement c
+
+showContent        :: Content -> Text
+showContent         = ppcContent defaultConfigPP
+
+showElement        :: Element -> Text
+showElement         = ppcElement defaultConfigPP
+
+showCData          :: CData -> Text
+showCData           = ppcCData defaultConfigPP
+
+-- Note: crefs should not contain '&', ';', etc.
+showCRefS          :: Text -> Builder
+showCRefS r         = singleton '&' <> fromText r <> singleton ';'
+
+-- | Convert a text element to characters.
+showCDataS         :: CData -> Builder
+showCDataS cd =
+ case cdVerbatim cd of
+   CDataText     -> escStr (cdData cd)
+   CDataVerbatim -> fromText "<![CDATA[" <> escCData (cdData cd) <>
+                    fromText "]]>"
+   CDataRaw      -> fromText (cdData cd)
+
+--------------------------------------------------------------------------------
+escCData           :: Text -> Builder
+escCData t
+  | "]]>" `T.isPrefixOf` t =
+     fromText "]]]]><![CDATA[>" <> fromText (T.drop 3 t)
+escCData t
+  = case T.uncons t of
+      Nothing     -> mempty
+      Just (c,t') -> singleton c <> escCData t'
+
+escChar            :: Char -> Builder
+escChar c = case c of
+  '<'   -> fromText "&lt;"
+  '>'   -> fromText "&gt;"
+  '&'   -> fromText "&amp;"
+  '"'   -> fromText "&quot;"
+  -- we use &#39 instead of &apos; because IE apparently has difficulties
+  -- rendering &apos; in xhtml.
+  -- Reported by Rohan Drape <rohan.drape@gmail.com>.
+  '\''  -> fromText "&#39;"
+  _     -> singleton c
+
+  {- original xml-light version:
+  -- NOTE: We escape '\r' explicitly because otherwise they get lost
+  -- when parsed back in because of then end-of-line normalization rules.
+  _ | isPrint c || c == '\n' -> singleton c
+    | otherwise -> showText "&#" . showsT oc . singleton ';'
+      where oc = ord c
+  -}
+
+escStr             :: Text -> Builder
+escStr cs          = if T.any needsEscape cs
+                        then mconcat (map escChar (T.unpack cs))
+                        else fromText cs
+ where
+  needsEscape '<' = True
+  needsEscape '>' = True
+  needsEscape '&' = True
+  needsEscape '"' = True
+  needsEscape '\'' = True
+  needsEscape _ = False
+
+tagEnd             :: QName -> Builder
+tagEnd qn           = fromText "</" <> showQName qn <> singleton '>'
+
+tagStart           :: QName -> [Attr] -> Builder
+tagStart qn as      = singleton '<' <> showQName qn <> as_str
+ where as_str       = if null as
+                         then mempty
+                         else mconcat (map showAttr as)
+
+showAttr           :: Attr -> Builder
+showAttr (Attr qn v) = singleton ' ' <> showQName qn <>
+                       singleton '=' <>
+                       singleton '"' <> escStr v <> singleton '"'
+
+showQName          :: QName -> Builder
+showQName q         =
+  case qPrefix q of
+    Nothing -> fromText (qName q)
+    Just p  -> fromText p <> singleton ':' <> fromText (qName q)
diff --git a/src/Text/Pandoc/XML/Light/Proc.hs b/src/Text/Pandoc/XML/Light/Proc.hs
new file mode 100644
index 000000000..838d5af74
--- /dev/null
+++ b/src/Text/Pandoc/XML/Light/Proc.hs
@@ -0,0 +1,138 @@
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.XML.Light.Proc
+   Copyright   : Copyright (C) 2021 John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : John MacFarlane <jgm@berkeley.edu>
+   Stability   : alpha
+   Portability : portable
+-}
+module Text.Pandoc.XML.Light.Proc
+  ( 
+    -- * Replacement for xml-light's Text.XML.Proc
+    strContent
+  , onlyElems
+  , elChildren
+  , onlyText
+  , findChildren
+  , filterChildren
+  , filterChildrenName
+  , findChild
+  , filterChild
+  , filterChildName
+  , findElement
+  , filterElement
+  , filterElementName
+  , findElements
+  , filterElements
+  , filterElementsName
+  , findAttr
+  , lookupAttr
+  , lookupAttrBy
+  , findAttrBy
+  ) where
+
+import Data.Text (Text)
+import Data.Maybe (listToMaybe)
+import Data.List(find)
+import Text.Pandoc.XML.Light.Types
+
+--
+-- copied from xml-light Text.XML.Proc
+--
+
+-- | Get the text value of an XML element.  This function
+-- ignores non-text elements, and concatenates all text elements.
+strContent         :: Element -> Text
+strContent          = mconcat . map cdData . onlyText . elContent
+
+-- | Select only the elements from a list of XML content.
+onlyElems          :: [Content] -> [Element]
+onlyElems xs        = [ x | Elem x <- xs ]
+
+-- | Select only the elements from a parent.
+elChildren         :: Element -> [Element]
+elChildren e        = [ x | Elem x <- elContent e ]
+
+-- | Select only the text from a list of XML content.
+onlyText           :: [Content] -> [CData]
+onlyText xs         = [ x | Text x <- xs ]
+
+-- | Find all immediate children with the given name.
+findChildren       :: QName -> Element -> [Element]
+findChildren q e    = filterChildren ((q ==) . elName) e
+
+-- | Filter all immediate children wrt a given predicate.
+filterChildren       :: (Element -> Bool) -> Element -> [Element]
+filterChildren p e    = filter p (onlyElems (elContent e))
+
+
+-- | Filter all immediate children wrt a given predicate over their names.
+filterChildrenName      :: (QName -> Bool) -> Element -> [Element]
+filterChildrenName p e   = filter (p.elName) (onlyElems (elContent e))
+
+
+-- | Find an immediate child with the given name.
+findChild          :: QName -> Element -> Maybe Element
+findChild q e       = listToMaybe (findChildren q e)
+
+-- | Find an immediate child with the given name.
+filterChild          :: (Element -> Bool) -> Element -> Maybe Element
+filterChild p e       = listToMaybe (filterChildren p e)
+
+-- | Find an immediate child with name matching a predicate.
+filterChildName      :: (QName -> Bool) -> Element -> Maybe Element
+filterChildName p e   = listToMaybe (filterChildrenName p e)
+
+-- | Find the left-most occurrence of an element matching given name.
+findElement        :: QName -> Element -> Maybe Element
+findElement q e     = listToMaybe (findElements q e)
+
+-- | Filter the left-most occurrence of an element wrt. given predicate.
+filterElement        :: (Element -> Bool) -> Element -> Maybe Element
+filterElement p e     = listToMaybe (filterElements p e)
+
+-- | Filter the left-most occurrence of an element wrt. given predicate.
+filterElementName     :: (QName -> Bool) -> Element -> Maybe Element
+filterElementName p e  = listToMaybe (filterElementsName p e)
+
+-- | Find all non-nested occurances of an element.
+-- (i.e., once we have found an element, we do not search
+-- for more occurances among the element's children).
+findElements       :: QName -> Element -> [Element]
+findElements qn e = filterElementsName (qn==) e
+
+-- | Find all non-nested occurrences of an element wrt. given predicate.
+-- (i.e., once we have found an element, we do not search
+-- for more occurances among the element's children).
+filterElements       :: (Element -> Bool) -> Element -> [Element]
+filterElements p e
+ | p e        = [e]
+ | otherwise  = concatMap (filterElements p) $ onlyElems $ elContent e
+
+-- | Find all non-nested occurences of an element wrt a predicate over element names.
+-- (i.e., once we have found an element, we do not search
+-- for more occurances among the element's children).
+filterElementsName       :: (QName -> Bool) -> Element -> [Element]
+filterElementsName p e = filterElements (p.elName) e
+
+-- | Lookup the value of an attribute.
+findAttr          :: QName -> Element -> Maybe Text
+findAttr x e       = lookupAttr x (elAttribs e)
+
+-- | Lookup attribute name from list.
+lookupAttr        :: QName -> [Attr] -> Maybe Text
+lookupAttr x       = lookupAttrBy (x ==)
+
+-- | Lookup the first attribute whose name satisfies the given predicate.
+lookupAttrBy       :: (QName -> Bool) -> [Attr] -> Maybe Text
+lookupAttrBy p as   = attrVal `fmap` find (p . attrKey) as
+
+-- | Lookup the value of the first attribute whose name
+-- satisfies the given predicate.
+findAttrBy         :: (QName -> Bool) -> Element -> Maybe Text
+findAttrBy p e      = lookupAttrBy p (elAttribs e)
+
+
diff --git a/src/Text/Pandoc/XML/Light/Types.hs b/src/Text/Pandoc/XML/Light/Types.hs
new file mode 100644
index 000000000..f338da6ea
--- /dev/null
+++ b/src/Text/Pandoc/XML/Light/Types.hs
@@ -0,0 +1,190 @@
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE DeriveDataTypeable #-}
+{- |
+   Module      : Text.Pandoc.XML.Light.Types
+   Copyright   : Copyright (C) 2021 John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : John MacFarlane <jgm@berkeley.edu>
+   Stability   : alpha
+   Portability : portable
+
+-}
+module Text.Pandoc.XML.Light.Types
+  ( -- * Basic types, duplicating those from xml-light but with Text
+    -- instead of String
+    Line
+  , Content(..)
+  , Element(..)
+  , Attr(..)
+  , CData(..)
+  , CDataKind(..)
+  , QName(..)
+  , Node(..)
+  , unode
+  , unqual
+  , add_attr
+  , add_attrs
+  -- * Conversion functions from xml-light types
+  , fromXLQName
+  , fromXLCData
+  , fromXLElement
+  , fromXLAttr
+  , fromXLContent
+  ) where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+import Data.Data (Data)
+import Data.Typeable (Typeable)
+import qualified Text.XML.Light as XL
+
+--
+-- type definitions lightly modified from xml-light
+--
+
+-- | A line is an Integer
+type Line     = Integer
+
+-- | XML content
+data Content  = Elem Element
+              | Text CData
+              | CRef Text
+                deriving (Show, Typeable, Data)
+
+-- | XML elements
+data Element  = Element {
+                  elName      :: QName,
+                  elAttribs   :: [Attr],
+                  elContent   :: [Content],
+                  elLine      :: Maybe Line
+                } deriving (Show, Typeable, Data)
+
+-- | XML attributes
+data Attr     = Attr {
+                  attrKey :: QName,
+                  attrVal :: Text
+                } deriving (Eq, Ord, Show, Typeable, Data)
+
+-- | XML CData
+data CData    = CData {
+                  cdVerbatim  :: CDataKind,
+                  cdData      :: Text,
+                  cdLine      :: Maybe Line
+                } deriving (Show, Typeable, Data)
+
+data CDataKind
+ = CDataText      -- ^ Ordinary character data; pretty printer escapes &, < etc.
+ | CDataVerbatim  -- ^ Unescaped character data; pretty printer embeds it in <![CDATA[..
+ | CDataRaw       -- ^ As-is character data; pretty printer passes it along without any escaping or CDATA wrap-up.
+   deriving ( Eq, Show, Typeable, Data )
+
+-- | XML qualified names
+data QName    = QName {
+                  qName   :: Text,
+                  qURI    :: Maybe Text,
+                  qPrefix :: Maybe Text
+                } deriving (Show, Typeable, Data)
+
+
+instance Eq QName where
+  q1 == q2  = compare q1 q2 == EQ
+
+instance Ord QName where
+  compare q1 q2 =
+    case compare (qName q1) (qName q2) of
+      EQ  -> case (qURI q1, qURI q2) of
+               (Nothing,Nothing) -> compare (qPrefix q1) (qPrefix q2)
+               (u1,u2)           -> compare u1 u2
+      x   -> x
+
+class Node t where
+ node :: QName -> t -> Element
+
+instance Node ([Attr],[Content]) where
+  node n (attrs,cont) = Element { elName     = n
+                                , elAttribs  = attrs
+                                , elContent  = cont
+                                , elLine     = Nothing
+                                }
+
+instance Node [Attr]             where node n as   = node n (as,[]::[Content])
+instance Node Attr               where node n a    = node n [a]
+instance Node ()                 where node n ()   = node n ([]::[Attr])
+
+instance Node [Content]          where node n cs     = node n ([]::[Attr],cs)
+instance Node Content            where node n c      = node n [c]
+instance Node ([Attr],Content)   where node n (as,c) = node n (as,[c])
+instance Node (Attr,Content)     where node n (a,c)  = node n ([a],[c])
+
+instance Node ([Attr],[Element]) where
+  node n (as,cs) = node n (as,map Elem cs)
+
+instance Node ([Attr],Element)   where node n (as,c) = node n (as,[c])
+instance Node (Attr,Element)     where node n (a,c)  = node n ([a],c)
+instance Node [Element]          where node n es     = node n ([]::[Attr],es)
+instance Node Element            where node n e      = node n [e]
+
+instance Node ([Attr],[CData])   where
+  node n (as,cs) = node n (as,map Text cs)
+
+instance Node ([Attr],CData)     where node n (as,c) = node n (as,[c])
+instance Node (Attr,CData)       where node n (a,c)  = node n ([a],c)
+instance Node [CData]            where node n es     = node n ([]::[Attr],es)
+instance Node CData              where node n e      = node n [e]
+
+instance Node ([Attr],Text)      where
+  node n (as,t) = node n (as, CData { cdVerbatim = CDataText
+                                    , cdData = t
+                                    , cdLine = Nothing })
+
+instance Node (Attr,Text )       where node n (a,t)  = node n ([a],t)
+instance Node Text               where node n t      = node n ([]::[Attr],t)
+
+-- | Create node with unqualified name
+unode :: Node t => Text -> t -> Element
+unode = node . unqual
+
+unqual :: Text -> QName
+unqual x = QName x Nothing Nothing
+
+-- | Add an attribute to an element.
+add_attr :: Attr -> Element -> Element
+add_attr a e = add_attrs [a] e
+
+-- | Add some attributes to an element.
+add_attrs :: [Attr] -> Element -> Element
+add_attrs as e = e { elAttribs = as ++ elAttribs e }
+
+--
+-- conversion from xml-light
+--
+
+fromXLQName :: XL.QName -> QName
+fromXLQName qn = QName { qName = T.pack $ XL.qName qn
+                       , qURI = T.pack <$> XL.qURI qn
+                       , qPrefix = T.pack <$> XL.qPrefix qn }
+
+fromXLCData :: XL.CData -> CData
+fromXLCData cd = CData { cdVerbatim = case XL.cdVerbatim cd of
+                                        XL.CDataText -> CDataText
+                                        XL.CDataVerbatim -> CDataVerbatim
+                                        XL.CDataRaw -> CDataRaw
+                       , cdData = T.pack $ XL.cdData cd
+                       , cdLine = XL.cdLine cd }
+
+fromXLElement :: XL.Element -> Element
+fromXLElement el = Element { elName = fromXLQName $ XL.elName el
+                           , elAttribs = map fromXLAttr $ XL.elAttribs el
+                           , elContent = map fromXLContent $ XL.elContent el
+                           , elLine = XL.elLine el }
+
+fromXLAttr :: XL.Attr -> Attr
+fromXLAttr (XL.Attr qn s) = Attr (fromXLQName qn) (T.pack s)
+
+fromXLContent :: XL.Content -> Content
+fromXLContent (XL.Elem el) = Elem $ fromXLElement el
+fromXLContent (XL.Text cd) = Text $ fromXLCData cd
+fromXLContent (XL.CRef s)  = CRef (T.pack s)
+
+
author	John MacFarlane <jgm@berkeley.edu>	2021-02-16 18:40:06 -0800
committer	John MacFarlane <jgm@berkeley.edu>	2021-02-16 18:40:06 -0800
commit	d7a4996b1e7e201095ce792375a0776984fa8fcd (patch)
tree	2d9fe829471bd4217a3e92b72d1c713abe77f274
parent	967e7f5fb990b29de48b37be1db40fb149a8cf55 (diff)
download	pandoc-d7a4996b1e7e201095ce792375a0776984fa8fcd.tar.gz