diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/Text/Pandoc/XML/Light.hs | 511 | ||||
-rw-r--r-- | src/Text/Pandoc/XML/Light/Output.hs | 230 | ||||
-rw-r--r-- | src/Text/Pandoc/XML/Light/Proc.hs | 138 | ||||
-rw-r--r-- | src/Text/Pandoc/XML/Light/Types.hs | 190 |
4 files changed, 565 insertions, 504 deletions
diff --git a/src/Text/Pandoc/XML/Light.hs b/src/Text/Pandoc/XML/Light.hs index 38e4df218..07113ea92 100644 --- a/src/Text/Pandoc/XML/Light.hs +++ b/src/Text/Pandoc/XML/Light.hs @@ -1,5 +1,4 @@ {-# LANGUAGE FlexibleInstances #-} -{-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.XML.Light @@ -31,59 +30,9 @@ better performance and accuracy without much change in the code that used xml-light. -} module Text.Pandoc.XML.Light - ( -- * Basic types, duplicating those from xml-light but with Text - -- instead of String - Line - , Content(..) - , Element(..) - , Attr(..) - , CData(..) - , CDataKind(..) - , QName(..) - , Node(..) - , unode - , unqual - , add_attr - , add_attrs - -- * Conversion functions from xml-light types - , fromXLQName - , fromXLCData - , fromXLElement - , fromXLAttr - , fromXLContent - -- * Replacement for xml-light's Text.XML.Proc - , strContent - , onlyElems - , elChildren - , onlyText - , findChildren - , filterChildren - , filterChildrenName - , findChild - , filterChild - , filterChildName - , findElement - , filterElement - , filterElementName - , findElements - , filterElements - , filterElementsName - , findAttr - , lookupAttr - , lookupAttrBy - , findAttrBy - -- * Replacement for xml-light's Text.XML.Output - , ppTopElement - , ppElement - , ppContent - , ppcElement - , ppcContent - , showTopElement - , showElement - , showContent - , useShortEmptyTags - , defaultConfigPP - , ConfigPP(..) + ( module Text.Pandoc.XML.Light.Types + , module Text.Pandoc.XML.Light.Proc + , module Text.Pandoc.XML.Light.Output -- * Replacement for xml-light's Text.XML.Input , parseXMLElement , parseXMLContents @@ -92,16 +41,13 @@ module Text.Pandoc.XML.Light import qualified Control.Exception as E import qualified Text.XML as Conduit import Text.XML.Unresolved (InvalidEventStream(..)) -import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Lazy as TL -import Data.Text.Lazy.Builder (Builder, singleton, fromText, toLazyText) import qualified Data.Map as M -import Data.Data (Data) -import Data.Typeable (Typeable) -import Data.Maybe (mapMaybe, listToMaybe) -import Data.List(find) -import qualified Text.XML.Light as XL +import Data.Maybe (mapMaybe) +import Text.Pandoc.XML.Light.Types +import Text.Pandoc.XML.Light.Proc +import Text.Pandoc.XML.Light.Output -- Drop in replacement for parseXMLDoc in xml-light. parseXMLElement :: TL.Text -> Either T.Text Element @@ -141,446 +87,3 @@ nodeToContent (Conduit.NodeContent t) = Just (Text (CData CDataText t Nothing)) nodeToContent _ = Nothing -unqual :: Text -> QName -unqual x = QName x Nothing Nothing - --- | Add an attribute to an element. -add_attr :: Attr -> Element -> Element -add_attr a e = add_attrs [a] e - --- | Add some attributes to an element. -add_attrs :: [Attr] -> Element -> Element -add_attrs as e = e { elAttribs = as ++ elAttribs e } - --- --- type definitions lightly modified from xml-light --- - --- | A line is an Integer -type Line = Integer - --- | XML content -data Content = Elem Element - | Text CData - | CRef Text - deriving (Show, Typeable, Data) - --- | XML elements -data Element = Element { - elName :: QName, - elAttribs :: [Attr], - elContent :: [Content], - elLine :: Maybe Line - } deriving (Show, Typeable, Data) - --- | XML attributes -data Attr = Attr { - attrKey :: QName, - attrVal :: Text - } deriving (Eq, Ord, Show, Typeable, Data) - --- | XML CData -data CData = CData { - cdVerbatim :: CDataKind, - cdData :: Text, - cdLine :: Maybe Line - } deriving (Show, Typeable, Data) - -data CDataKind - = CDataText -- ^ Ordinary character data; pretty printer escapes &, < etc. - | CDataVerbatim -- ^ Unescaped character data; pretty printer embeds it in <![CDATA[.. - | CDataRaw -- ^ As-is character data; pretty printer passes it along without any escaping or CDATA wrap-up. - deriving ( Eq, Show, Typeable, Data ) - --- | XML qualified names -data QName = QName { - qName :: Text, - qURI :: Maybe Text, - qPrefix :: Maybe Text - } deriving (Show, Typeable, Data) - - -instance Eq QName where - q1 == q2 = compare q1 q2 == EQ - -instance Ord QName where - compare q1 q2 = - case compare (qName q1) (qName q2) of - EQ -> case (qURI q1, qURI q2) of - (Nothing,Nothing) -> compare (qPrefix q1) (qPrefix q2) - (u1,u2) -> compare u1 u2 - x -> x - -class Node t where - node :: QName -> t -> Element - -instance Node ([Attr],[Content]) where - node n (attrs,cont) = Element { elName = n - , elAttribs = attrs - , elContent = cont - , elLine = Nothing - } - -instance Node [Attr] where node n as = node n (as,[]::[Content]) -instance Node Attr where node n a = node n [a] -instance Node () where node n () = node n ([]::[Attr]) - -instance Node [Content] where node n cs = node n ([]::[Attr],cs) -instance Node Content where node n c = node n [c] -instance Node ([Attr],Content) where node n (as,c) = node n (as,[c]) -instance Node (Attr,Content) where node n (a,c) = node n ([a],[c]) - -instance Node ([Attr],[Element]) where - node n (as,cs) = node n (as,map Elem cs) - -instance Node ([Attr],Element) where node n (as,c) = node n (as,[c]) -instance Node (Attr,Element) where node n (a,c) = node n ([a],c) -instance Node [Element] where node n es = node n ([]::[Attr],es) -instance Node Element where node n e = node n [e] - -instance Node ([Attr],[CData]) where - node n (as,cs) = node n (as,map Text cs) - -instance Node ([Attr],CData) where node n (as,c) = node n (as,[c]) -instance Node (Attr,CData) where node n (a,c) = node n ([a],c) -instance Node [CData] where node n es = node n ([]::[Attr],es) -instance Node CData where node n e = node n [e] - -instance Node ([Attr],Text) where - node n (as,t) = node n (as, CData { cdVerbatim = CDataText - , cdData = t - , cdLine = Nothing }) - -instance Node (Attr,Text ) where node n (a,t) = node n ([a],t) -instance Node Text where node n t = node n ([]::[Attr],t) - --- | Create node with unqualified name -unode :: Node t => Text -> t -> Element -unode = node . unqual - --- --- conversion from xml-light --- - -fromXLQName :: XL.QName -> QName -fromXLQName qn = QName { qName = T.pack $ XL.qName qn - , qURI = T.pack <$> XL.qURI qn - , qPrefix = T.pack <$> XL.qPrefix qn } - -fromXLCData :: XL.CData -> CData -fromXLCData cd = CData { cdVerbatim = case XL.cdVerbatim cd of - XL.CDataText -> CDataText - XL.CDataVerbatim -> CDataVerbatim - XL.CDataRaw -> CDataRaw - , cdData = T.pack $ XL.cdData cd - , cdLine = XL.cdLine cd } - -fromXLElement :: XL.Element -> Element -fromXLElement el = Element { elName = fromXLQName $ XL.elName el - , elAttribs = map fromXLAttr $ XL.elAttribs el - , elContent = map fromXLContent $ XL.elContent el - , elLine = XL.elLine el } - -fromXLAttr :: XL.Attr -> Attr -fromXLAttr (XL.Attr qn s) = Attr (fromXLQName qn) (T.pack s) - -fromXLContent :: XL.Content -> Content -fromXLContent (XL.Elem el) = Elem $ fromXLElement el -fromXLContent (XL.Text cd) = Text $ fromXLCData cd -fromXLContent (XL.CRef s) = CRef (T.pack s) - --- --- copied from xml-light Text.XML.Proc --- - --- | Get the text value of an XML element. This function --- ignores non-text elements, and concatenates all text elements. -strContent :: Element -> Text -strContent = mconcat . map cdData . onlyText . elContent - --- | Select only the elements from a list of XML content. -onlyElems :: [Content] -> [Element] -onlyElems xs = [ x | Elem x <- xs ] - --- | Select only the elements from a parent. -elChildren :: Element -> [Element] -elChildren e = [ x | Elem x <- elContent e ] - --- | Select only the text from a list of XML content. -onlyText :: [Content] -> [CData] -onlyText xs = [ x | Text x <- xs ] - --- | Find all immediate children with the given name. -findChildren :: QName -> Element -> [Element] -findChildren q e = filterChildren ((q ==) . elName) e - --- | Filter all immediate children wrt a given predicate. -filterChildren :: (Element -> Bool) -> Element -> [Element] -filterChildren p e = filter p (onlyElems (elContent e)) - - --- | Filter all immediate children wrt a given predicate over their names. -filterChildrenName :: (QName -> Bool) -> Element -> [Element] -filterChildrenName p e = filter (p.elName) (onlyElems (elContent e)) - - --- | Find an immediate child with the given name. -findChild :: QName -> Element -> Maybe Element -findChild q e = listToMaybe (findChildren q e) - --- | Find an immediate child with the given name. -filterChild :: (Element -> Bool) -> Element -> Maybe Element -filterChild p e = listToMaybe (filterChildren p e) - --- | Find an immediate child with name matching a predicate. -filterChildName :: (QName -> Bool) -> Element -> Maybe Element -filterChildName p e = listToMaybe (filterChildrenName p e) - --- | Find the left-most occurrence of an element matching given name. -findElement :: QName -> Element -> Maybe Element -findElement q e = listToMaybe (findElements q e) - --- | Filter the left-most occurrence of an element wrt. given predicate. -filterElement :: (Element -> Bool) -> Element -> Maybe Element -filterElement p e = listToMaybe (filterElements p e) - --- | Filter the left-most occurrence of an element wrt. given predicate. -filterElementName :: (QName -> Bool) -> Element -> Maybe Element -filterElementName p e = listToMaybe (filterElementsName p e) - --- | Find all non-nested occurances of an element. --- (i.e., once we have found an element, we do not search --- for more occurances among the element's children). -findElements :: QName -> Element -> [Element] -findElements qn e = filterElementsName (qn==) e - --- | Find all non-nested occurrences of an element wrt. given predicate. --- (i.e., once we have found an element, we do not search --- for more occurances among the element's children). -filterElements :: (Element -> Bool) -> Element -> [Element] -filterElements p e - | p e = [e] - | otherwise = concatMap (filterElements p) $ onlyElems $ elContent e - --- | Find all non-nested occurences of an element wrt a predicate over element names. --- (i.e., once we have found an element, we do not search --- for more occurances among the element's children). -filterElementsName :: (QName -> Bool) -> Element -> [Element] -filterElementsName p e = filterElements (p.elName) e - --- | Lookup the value of an attribute. -findAttr :: QName -> Element -> Maybe Text -findAttr x e = lookupAttr x (elAttribs e) - --- | Lookup attribute name from list. -lookupAttr :: QName -> [Attr] -> Maybe Text -lookupAttr x = lookupAttrBy (x ==) - --- | Lookup the first attribute whose name satisfies the given predicate. -lookupAttrBy :: (QName -> Bool) -> [Attr] -> Maybe Text -lookupAttrBy p as = attrVal `fmap` find (p . attrKey) as - --- | Lookup the value of the first attribute whose name --- satisfies the given predicate. -findAttrBy :: (QName -> Bool) -> Element -> Maybe Text -findAttrBy p e = lookupAttrBy p (elAttribs e) - - --- --- duplicates functinos from Text.XML.Output --- - --- | The XML 1.0 header -xmlHeader :: Text -xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" - - --------------------------------------------------------------------------------- -data ConfigPP = ConfigPP - { shortEmptyTag :: QName -> Bool - , prettify :: Bool - } - --- | Default pretty orinting configuration. --- * Always use abbreviate empty tags. -defaultConfigPP :: ConfigPP -defaultConfigPP = ConfigPP { shortEmptyTag = const True - , prettify = False - } - --- | The predicate specifies for which empty tags we should use XML's --- abbreviated notation <TAG />. This is useful if we are working with --- some XML-ish standards (such as certain versions of HTML) where some --- empty tags should always be displayed in the <TAG></TAG> form. -useShortEmptyTags :: (QName -> Bool) -> ConfigPP -> ConfigPP -useShortEmptyTags p c = c { shortEmptyTag = p } - - --- | Specify if we should use extra white-space to make document more readable. --- WARNING: This adds additional white-space to text elements, --- and so it may change the meaning of the document. -useExtraWhiteSpace :: Bool -> ConfigPP -> ConfigPP -useExtraWhiteSpace p c = c { prettify = p } - --- | A configuration that tries to make things pretty --- (possibly at the cost of changing the semantics a bit --- through adding white space.) -prettyConfigPP :: ConfigPP -prettyConfigPP = useExtraWhiteSpace True defaultConfigPP - - --------------------------------------------------------------------------------- - - --- | Pretty printing renders XML documents faithfully, --- with the exception that whitespace may be added\/removed --- in non-verbatim character data. -ppTopElement :: Element -> Text -ppTopElement = ppcTopElement prettyConfigPP - --- | Pretty printing elements -ppElement :: Element -> Text -ppElement = ppcElement prettyConfigPP - --- | Pretty printing content -ppContent :: Content -> Text -ppContent = ppcContent prettyConfigPP - --- | Pretty printing renders XML documents faithfully, --- with the exception that whitespace may be added\/removed --- in non-verbatim character data. -ppcTopElement :: ConfigPP -> Element -> Text -ppcTopElement c e = T.unlines [xmlHeader,ppcElement c e] - --- | Pretty printing elements -ppcElement :: ConfigPP -> Element -> Text -ppcElement c = TL.toStrict . toLazyText . ppElementS c mempty - --- | Pretty printing content -ppcContent :: ConfigPP -> Content -> Text -ppcContent c = TL.toStrict . toLazyText . ppContentS c mempty - -ppcCData :: ConfigPP -> CData -> Text -ppcCData c = TL.toStrict . toLazyText . ppCDataS c mempty - -type Indent = Builder - --- | Pretty printing content using ShowT -ppContentS :: ConfigPP -> Indent -> Content -> Builder -ppContentS c i x = case x of - Elem e -> ppElementS c i e - Text t -> ppCDataS c i t - CRef r -> showCRefS r - -ppElementS :: ConfigPP -> Indent -> Element -> Builder -ppElementS c i e = i <> tagStart (elName e) (elAttribs e) <> - (case elContent e of - [] | "?" `T.isPrefixOf` qName name -> fromText " ?>" - | shortEmptyTag c name -> fromText " />" - [Text t] -> singleton '>' <> ppCDataS c mempty t <> tagEnd name - cs -> singleton '>' <> nl <> - mconcat (map ((<> nl) . ppContentS c (sp <> i)) cs) <> - i <> tagEnd name - where (nl,sp) = if prettify c then ("\n"," ") else ("","") - ) - where name = elName e - -ppCDataS :: ConfigPP -> Indent -> CData -> Builder -ppCDataS c i t = i <> if cdVerbatim t /= CDataText || not (prettify c) - then showCDataS t - else foldr cons mempty (T.unpack (showCData t)) - where cons :: Char -> Builder -> Builder - cons '\n' ys = singleton '\n' <> i <> ys - cons y ys = singleton y <> ys - - - --------------------------------------------------------------------------------- - --- | Adds the <?xml?> header. -showTopElement :: Element -> Text -showTopElement c = xmlHeader <> showElement c - -showContent :: Content -> Text -showContent = ppcContent defaultConfigPP - -showElement :: Element -> Text -showElement = ppcElement defaultConfigPP - -showCData :: CData -> Text -showCData = ppcCData defaultConfigPP - --- Note: crefs should not contain '&', ';', etc. -showCRefS :: Text -> Builder -showCRefS r = singleton '&' <> fromText r <> singleton ';' - --- | Convert a text element to characters. -showCDataS :: CData -> Builder -showCDataS cd = - case cdVerbatim cd of - CDataText -> escStr (cdData cd) - CDataVerbatim -> fromText "<![CDATA[" <> escCData (cdData cd) <> - fromText "]]>" - CDataRaw -> fromText (cdData cd) - --------------------------------------------------------------------------------- -escCData :: Text -> Builder -escCData t - | "]]>" `T.isPrefixOf` t = - fromText "]]]]><![CDATA[>" <> fromText (T.drop 3 t) -escCData t - = case T.uncons t of - Nothing -> mempty - Just (c,t') -> singleton c <> escCData t' - -escChar :: Char -> Builder -escChar c = case c of - '<' -> fromText "<" - '>' -> fromText ">" - '&' -> fromText "&" - '"' -> fromText """ - -- we use ' instead of ' because IE apparently has difficulties - -- rendering ' in xhtml. - -- Reported by Rohan Drape <rohan.drape@gmail.com>. - '\'' -> fromText "'" - _ -> singleton c - - {- original xml-light version: - -- NOTE: We escape '\r' explicitly because otherwise they get lost - -- when parsed back in because of then end-of-line normalization rules. - _ | isPrint c || c == '\n' -> singleton c - | otherwise -> showText "&#" . showsT oc . singleton ';' - where oc = ord c - -} - -escStr :: Text -> Builder -escStr cs = if T.any needsEscape cs - then mconcat (map escChar (T.unpack cs)) - else fromText cs - where - needsEscape '<' = True - needsEscape '>' = True - needsEscape '&' = True - needsEscape '"' = True - needsEscape '\'' = True - needsEscape _ = False - -tagEnd :: QName -> Builder -tagEnd qn = fromText "</" <> showQName qn <> singleton '>' - -tagStart :: QName -> [Attr] -> Builder -tagStart qn as = singleton '<' <> showQName qn <> as_str - where as_str = if null as - then mempty - else mconcat (map showAttr as) - -showAttr :: Attr -> Builder -showAttr (Attr qn v) = singleton ' ' <> showQName qn <> - singleton '=' <> - singleton '"' <> escStr v <> singleton '"' - -showQName :: QName -> Builder -showQName q = - case qPrefix q of - Nothing -> fromText (qName q) - Just p -> fromText p <> singleton ':' <> fromText (qName q) diff --git a/src/Text/Pandoc/XML/Light/Output.hs b/src/Text/Pandoc/XML/Light/Output.hs new file mode 100644 index 000000000..dc94ce914 --- /dev/null +++ b/src/Text/Pandoc/XML/Light/Output.hs @@ -0,0 +1,230 @@ +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.XML.Light.Output + Copyright : Copyright (C) 2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable +-} +module Text.Pandoc.XML.Light.Output + ( -- * Replacement for xml-light's Text.XML.Output + ppTopElement + , ppElement + , ppContent + , ppcElement + , ppcContent + , showTopElement + , showElement + , showContent + , useShortEmptyTags + , defaultConfigPP + , ConfigPP(..) + ) where + +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.Lazy as TL +import Data.Text.Lazy.Builder (Builder, singleton, fromText, toLazyText) +import Text.Pandoc.XML.Light.Types + +-- +-- duplicates functinos from Text.XML.Output +-- + +-- | The XML 1.0 header +xmlHeader :: Text +xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + + +-------------------------------------------------------------------------------- +data ConfigPP = ConfigPP + { shortEmptyTag :: QName -> Bool + , prettify :: Bool + } + +-- | Default pretty orinting configuration. +-- * Always use abbreviate empty tags. +defaultConfigPP :: ConfigPP +defaultConfigPP = ConfigPP { shortEmptyTag = const True + , prettify = False + } + +-- | The predicate specifies for which empty tags we should use XML's +-- abbreviated notation <TAG />. This is useful if we are working with +-- some XML-ish standards (such as certain versions of HTML) where some +-- empty tags should always be displayed in the <TAG></TAG> form. +useShortEmptyTags :: (QName -> Bool) -> ConfigPP -> ConfigPP +useShortEmptyTags p c = c { shortEmptyTag = p } + + +-- | Specify if we should use extra white-space to make document more readable. +-- WARNING: This adds additional white-space to text elements, +-- and so it may change the meaning of the document. +useExtraWhiteSpace :: Bool -> ConfigPP -> ConfigPP +useExtraWhiteSpace p c = c { prettify = p } + +-- | A configuration that tries to make things pretty +-- (possibly at the cost of changing the semantics a bit +-- through adding white space.) +prettyConfigPP :: ConfigPP +prettyConfigPP = useExtraWhiteSpace True defaultConfigPP + + +-------------------------------------------------------------------------------- + + +-- | Pretty printing renders XML documents faithfully, +-- with the exception that whitespace may be added\/removed +-- in non-verbatim character data. +ppTopElement :: Element -> Text +ppTopElement = ppcTopElement prettyConfigPP + +-- | Pretty printing elements +ppElement :: Element -> Text +ppElement = ppcElement prettyConfigPP + +-- | Pretty printing content +ppContent :: Content -> Text +ppContent = ppcContent prettyConfigPP + +-- | Pretty printing renders XML documents faithfully, +-- with the exception that whitespace may be added\/removed +-- in non-verbatim character data. +ppcTopElement :: ConfigPP -> Element -> Text +ppcTopElement c e = T.unlines [xmlHeader,ppcElement c e] + +-- | Pretty printing elements +ppcElement :: ConfigPP -> Element -> Text +ppcElement c = TL.toStrict . toLazyText . ppElementS c mempty + +-- | Pretty printing content +ppcContent :: ConfigPP -> Content -> Text +ppcContent c = TL.toStrict . toLazyText . ppContentS c mempty + +ppcCData :: ConfigPP -> CData -> Text +ppcCData c = TL.toStrict . toLazyText . ppCDataS c mempty + +type Indent = Builder + +-- | Pretty printing content using ShowT +ppContentS :: ConfigPP -> Indent -> Content -> Builder +ppContentS c i x = case x of + Elem e -> ppElementS c i e + Text t -> ppCDataS c i t + CRef r -> showCRefS r + +ppElementS :: ConfigPP -> Indent -> Element -> Builder +ppElementS c i e = i <> tagStart (elName e) (elAttribs e) <> + (case elContent e of + [] | "?" `T.isPrefixOf` qName name -> fromText " ?>" + | shortEmptyTag c name -> fromText " />" + [Text t] -> singleton '>' <> ppCDataS c mempty t <> tagEnd name + cs -> singleton '>' <> nl <> + mconcat (map ((<> nl) . ppContentS c (sp <> i)) cs) <> + i <> tagEnd name + where (nl,sp) = if prettify c then ("\n"," ") else ("","") + ) + where name = elName e + +ppCDataS :: ConfigPP -> Indent -> CData -> Builder +ppCDataS c i t = i <> if cdVerbatim t /= CDataText || not (prettify c) + then showCDataS t + else foldr cons mempty (T.unpack (showCData t)) + where cons :: Char -> Builder -> Builder + cons '\n' ys = singleton '\n' <> i <> ys + cons y ys = singleton y <> ys + + + +-------------------------------------------------------------------------------- + +-- | Adds the <?xml?> header. +showTopElement :: Element -> Text +showTopElement c = xmlHeader <> showElement c + +showContent :: Content -> Text +showContent = ppcContent defaultConfigPP + +showElement :: Element -> Text +showElement = ppcElement defaultConfigPP + +showCData :: CData -> Text +showCData = ppcCData defaultConfigPP + +-- Note: crefs should not contain '&', ';', etc. +showCRefS :: Text -> Builder +showCRefS r = singleton '&' <> fromText r <> singleton ';' + +-- | Convert a text element to characters. +showCDataS :: CData -> Builder +showCDataS cd = + case cdVerbatim cd of + CDataText -> escStr (cdData cd) + CDataVerbatim -> fromText "<![CDATA[" <> escCData (cdData cd) <> + fromText "]]>" + CDataRaw -> fromText (cdData cd) + +-------------------------------------------------------------------------------- +escCData :: Text -> Builder +escCData t + | "]]>" `T.isPrefixOf` t = + fromText "]]]]><![CDATA[>" <> fromText (T.drop 3 t) +escCData t + = case T.uncons t of + Nothing -> mempty + Just (c,t') -> singleton c <> escCData t' + +escChar :: Char -> Builder +escChar c = case c of + '<' -> fromText "<" + '>' -> fromText ">" + '&' -> fromText "&" + '"' -> fromText """ + -- we use ' instead of ' because IE apparently has difficulties + -- rendering ' in xhtml. + -- Reported by Rohan Drape <rohan.drape@gmail.com>. + '\'' -> fromText "'" + _ -> singleton c + + {- original xml-light version: + -- NOTE: We escape '\r' explicitly because otherwise they get lost + -- when parsed back in because of then end-of-line normalization rules. + _ | isPrint c || c == '\n' -> singleton c + | otherwise -> showText "&#" . showsT oc . singleton ';' + where oc = ord c + -} + +escStr :: Text -> Builder +escStr cs = if T.any needsEscape cs + then mconcat (map escChar (T.unpack cs)) + else fromText cs + where + needsEscape '<' = True + needsEscape '>' = True + needsEscape '&' = True + needsEscape '"' = True + needsEscape '\'' = True + needsEscape _ = False + +tagEnd :: QName -> Builder +tagEnd qn = fromText "</" <> showQName qn <> singleton '>' + +tagStart :: QName -> [Attr] -> Builder +tagStart qn as = singleton '<' <> showQName qn <> as_str + where as_str = if null as + then mempty + else mconcat (map showAttr as) + +showAttr :: Attr -> Builder +showAttr (Attr qn v) = singleton ' ' <> showQName qn <> + singleton '=' <> + singleton '"' <> escStr v <> singleton '"' + +showQName :: QName -> Builder +showQName q = + case qPrefix q of + Nothing -> fromText (qName q) + Just p -> fromText p <> singleton ':' <> fromText (qName q) diff --git a/src/Text/Pandoc/XML/Light/Proc.hs b/src/Text/Pandoc/XML/Light/Proc.hs new file mode 100644 index 000000000..838d5af74 --- /dev/null +++ b/src/Text/Pandoc/XML/Light/Proc.hs @@ -0,0 +1,138 @@ +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.XML.Light.Proc + Copyright : Copyright (C) 2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable +-} +module Text.Pandoc.XML.Light.Proc + ( + -- * Replacement for xml-light's Text.XML.Proc + strContent + , onlyElems + , elChildren + , onlyText + , findChildren + , filterChildren + , filterChildrenName + , findChild + , filterChild + , filterChildName + , findElement + , filterElement + , filterElementName + , findElements + , filterElements + , filterElementsName + , findAttr + , lookupAttr + , lookupAttrBy + , findAttrBy + ) where + +import Data.Text (Text) +import Data.Maybe (listToMaybe) +import Data.List(find) +import Text.Pandoc.XML.Light.Types + +-- +-- copied from xml-light Text.XML.Proc +-- + +-- | Get the text value of an XML element. This function +-- ignores non-text elements, and concatenates all text elements. +strContent :: Element -> Text +strContent = mconcat . map cdData . onlyText . elContent + +-- | Select only the elements from a list of XML content. +onlyElems :: [Content] -> [Element] +onlyElems xs = [ x | Elem x <- xs ] + +-- | Select only the elements from a parent. +elChildren :: Element -> [Element] +elChildren e = [ x | Elem x <- elContent e ] + +-- | Select only the text from a list of XML content. +onlyText :: [Content] -> [CData] +onlyText xs = [ x | Text x <- xs ] + +-- | Find all immediate children with the given name. +findChildren :: QName -> Element -> [Element] +findChildren q e = filterChildren ((q ==) . elName) e + +-- | Filter all immediate children wrt a given predicate. +filterChildren :: (Element -> Bool) -> Element -> [Element] +filterChildren p e = filter p (onlyElems (elContent e)) + + +-- | Filter all immediate children wrt a given predicate over their names. +filterChildrenName :: (QName -> Bool) -> Element -> [Element] +filterChildrenName p e = filter (p.elName) (onlyElems (elContent e)) + + +-- | Find an immediate child with the given name. +findChild :: QName -> Element -> Maybe Element +findChild q e = listToMaybe (findChildren q e) + +-- | Find an immediate child with the given name. +filterChild :: (Element -> Bool) -> Element -> Maybe Element +filterChild p e = listToMaybe (filterChildren p e) + +-- | Find an immediate child with name matching a predicate. +filterChildName :: (QName -> Bool) -> Element -> Maybe Element +filterChildName p e = listToMaybe (filterChildrenName p e) + +-- | Find the left-most occurrence of an element matching given name. +findElement :: QName -> Element -> Maybe Element +findElement q e = listToMaybe (findElements q e) + +-- | Filter the left-most occurrence of an element wrt. given predicate. +filterElement :: (Element -> Bool) -> Element -> Maybe Element +filterElement p e = listToMaybe (filterElements p e) + +-- | Filter the left-most occurrence of an element wrt. given predicate. +filterElementName :: (QName -> Bool) -> Element -> Maybe Element +filterElementName p e = listToMaybe (filterElementsName p e) + +-- | Find all non-nested occurances of an element. +-- (i.e., once we have found an element, we do not search +-- for more occurances among the element's children). +findElements :: QName -> Element -> [Element] +findElements qn e = filterElementsName (qn==) e + +-- | Find all non-nested occurrences of an element wrt. given predicate. +-- (i.e., once we have found an element, we do not search +-- for more occurances among the element's children). +filterElements :: (Element -> Bool) -> Element -> [Element] +filterElements p e + | p e = [e] + | otherwise = concatMap (filterElements p) $ onlyElems $ elContent e + +-- | Find all non-nested occurences of an element wrt a predicate over element names. +-- (i.e., once we have found an element, we do not search +-- for more occurances among the element's children). +filterElementsName :: (QName -> Bool) -> Element -> [Element] +filterElementsName p e = filterElements (p.elName) e + +-- | Lookup the value of an attribute. +findAttr :: QName -> Element -> Maybe Text +findAttr x e = lookupAttr x (elAttribs e) + +-- | Lookup attribute name from list. +lookupAttr :: QName -> [Attr] -> Maybe Text +lookupAttr x = lookupAttrBy (x ==) + +-- | Lookup the first attribute whose name satisfies the given predicate. +lookupAttrBy :: (QName -> Bool) -> [Attr] -> Maybe Text +lookupAttrBy p as = attrVal `fmap` find (p . attrKey) as + +-- | Lookup the value of the first attribute whose name +-- satisfies the given predicate. +findAttrBy :: (QName -> Bool) -> Element -> Maybe Text +findAttrBy p e = lookupAttrBy p (elAttribs e) + + diff --git a/src/Text/Pandoc/XML/Light/Types.hs b/src/Text/Pandoc/XML/Light/Types.hs new file mode 100644 index 000000000..f338da6ea --- /dev/null +++ b/src/Text/Pandoc/XML/Light/Types.hs @@ -0,0 +1,190 @@ +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE DeriveDataTypeable #-} +{- | + Module : Text.Pandoc.XML.Light.Types + Copyright : Copyright (C) 2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +-} +module Text.Pandoc.XML.Light.Types + ( -- * Basic types, duplicating those from xml-light but with Text + -- instead of String + Line + , Content(..) + , Element(..) + , Attr(..) + , CData(..) + , CDataKind(..) + , QName(..) + , Node(..) + , unode + , unqual + , add_attr + , add_attrs + -- * Conversion functions from xml-light types + , fromXLQName + , fromXLCData + , fromXLElement + , fromXLAttr + , fromXLContent + ) where + +import Data.Text (Text) +import qualified Data.Text as T +import Data.Data (Data) +import Data.Typeable (Typeable) +import qualified Text.XML.Light as XL + +-- +-- type definitions lightly modified from xml-light +-- + +-- | A line is an Integer +type Line = Integer + +-- | XML content +data Content = Elem Element + | Text CData + | CRef Text + deriving (Show, Typeable, Data) + +-- | XML elements +data Element = Element { + elName :: QName, + elAttribs :: [Attr], + elContent :: [Content], + elLine :: Maybe Line + } deriving (Show, Typeable, Data) + +-- | XML attributes +data Attr = Attr { + attrKey :: QName, + attrVal :: Text + } deriving (Eq, Ord, Show, Typeable, Data) + +-- | XML CData +data CData = CData { + cdVerbatim :: CDataKind, + cdData :: Text, + cdLine :: Maybe Line + } deriving (Show, Typeable, Data) + +data CDataKind + = CDataText -- ^ Ordinary character data; pretty printer escapes &, < etc. + | CDataVerbatim -- ^ Unescaped character data; pretty printer embeds it in <![CDATA[.. + | CDataRaw -- ^ As-is character data; pretty printer passes it along without any escaping or CDATA wrap-up. + deriving ( Eq, Show, Typeable, Data ) + +-- | XML qualified names +data QName = QName { + qName :: Text, + qURI :: Maybe Text, + qPrefix :: Maybe Text + } deriving (Show, Typeable, Data) + + +instance Eq QName where + q1 == q2 = compare q1 q2 == EQ + +instance Ord QName where + compare q1 q2 = + case compare (qName q1) (qName q2) of + EQ -> case (qURI q1, qURI q2) of + (Nothing,Nothing) -> compare (qPrefix q1) (qPrefix q2) + (u1,u2) -> compare u1 u2 + x -> x + +class Node t where + node :: QName -> t -> Element + +instance Node ([Attr],[Content]) where + node n (attrs,cont) = Element { elName = n + , elAttribs = attrs + , elContent = cont + , elLine = Nothing + } + +instance Node [Attr] where node n as = node n (as,[]::[Content]) +instance Node Attr where node n a = node n [a] +instance Node () where node n () = node n ([]::[Attr]) + +instance Node [Content] where node n cs = node n ([]::[Attr],cs) +instance Node Content where node n c = node n [c] +instance Node ([Attr],Content) where node n (as,c) = node n (as,[c]) +instance Node (Attr,Content) where node n (a,c) = node n ([a],[c]) + +instance Node ([Attr],[Element]) where + node n (as,cs) = node n (as,map Elem cs) + +instance Node ([Attr],Element) where node n (as,c) = node n (as,[c]) +instance Node (Attr,Element) where node n (a,c) = node n ([a],c) +instance Node [Element] where node n es = node n ([]::[Attr],es) +instance Node Element where node n e = node n [e] + +instance Node ([Attr],[CData]) where + node n (as,cs) = node n (as,map Text cs) + +instance Node ([Attr],CData) where node n (as,c) = node n (as,[c]) +instance Node (Attr,CData) where node n (a,c) = node n ([a],c) +instance Node [CData] where node n es = node n ([]::[Attr],es) +instance Node CData where node n e = node n [e] + +instance Node ([Attr],Text) where + node n (as,t) = node n (as, CData { cdVerbatim = CDataText + , cdData = t + , cdLine = Nothing }) + +instance Node (Attr,Text ) where node n (a,t) = node n ([a],t) +instance Node Text where node n t = node n ([]::[Attr],t) + +-- | Create node with unqualified name +unode :: Node t => Text -> t -> Element +unode = node . unqual + +unqual :: Text -> QName +unqual x = QName x Nothing Nothing + +-- | Add an attribute to an element. +add_attr :: Attr -> Element -> Element +add_attr a e = add_attrs [a] e + +-- | Add some attributes to an element. +add_attrs :: [Attr] -> Element -> Element +add_attrs as e = e { elAttribs = as ++ elAttribs e } + +-- +-- conversion from xml-light +-- + +fromXLQName :: XL.QName -> QName +fromXLQName qn = QName { qName = T.pack $ XL.qName qn + , qURI = T.pack <$> XL.qURI qn + , qPrefix = T.pack <$> XL.qPrefix qn } + +fromXLCData :: XL.CData -> CData +fromXLCData cd = CData { cdVerbatim = case XL.cdVerbatim cd of + XL.CDataText -> CDataText + XL.CDataVerbatim -> CDataVerbatim + XL.CDataRaw -> CDataRaw + , cdData = T.pack $ XL.cdData cd + , cdLine = XL.cdLine cd } + +fromXLElement :: XL.Element -> Element +fromXLElement el = Element { elName = fromXLQName $ XL.elName el + , elAttribs = map fromXLAttr $ XL.elAttribs el + , elContent = map fromXLContent $ XL.elContent el + , elLine = XL.elLine el } + +fromXLAttr :: XL.Attr -> Attr +fromXLAttr (XL.Attr qn s) = Attr (fromXLQName qn) (T.pack s) + +fromXLContent :: XL.Content -> Content +fromXLContent (XL.Elem el) = Elem $ fromXLElement el +fromXLContent (XL.Text cd) = Text $ fromXLCData cd +fromXLContent (XL.CRef s) = CRef (T.pack s) + + |