From 74d53f4347623631c17be557d3682dd807214263 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 17 Mar 2013 17:43:51 -0700 Subject: Added Text.Pandoc.Readers.OPML, exporting readOPML. The _note attribute is supported. This is unofficial, but used e.g. in OmniOutliner and supported by multimarkdown. We treat the contents as markdown blocks under a section header. Added to documentation and tests. --- src/Text/Pandoc/Readers/OPML.hs | 95 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 src/Text/Pandoc/Readers/OPML.hs (limited to 'src/Text/Pandoc') diff --git a/src/Text/Pandoc/Readers/OPML.hs b/src/Text/Pandoc/Readers/OPML.hs new file mode 100644 index 000000000..53b599349 --- /dev/null +++ b/src/Text/Pandoc/Readers/OPML.hs @@ -0,0 +1,95 @@ +module Text.Pandoc.Readers.OPML ( readOPML ) where +import Data.Char (toUpper) +import Text.Pandoc.Options +import Text.Pandoc.Definition +import Text.Pandoc.Builder +import Text.Pandoc.Readers.HTML (readHtml) +import Text.Pandoc.Readers.Markdown (readMarkdown) +import Text.XML.Light +import Text.HTML.TagSoup.Entity (lookupEntity) +import Data.Generics +import Data.Monoid +import Control.Monad.State +import Control.Applicative ((<$>), (<$)) + +type OPML = State OPMLState + +data OPMLState = OPMLState{ + opmlSectionLevel :: Int + , opmlDocTitle :: Inlines + , opmlDocAuthors :: [Inlines] + , opmlDocDate :: Inlines + } deriving Show + +readOPML :: ReaderOptions -> String -> Pandoc +readOPML _ inp = setTitle (opmlDocTitle st') + $ setAuthors (opmlDocAuthors st') + $ setDate (opmlDocDate st') + $ doc $ mconcat bs + where (bs, st') = runState (mapM parseBlock $ normalizeTree $ parseXML inp) + OPMLState{ opmlSectionLevel = 0 + , opmlDocTitle = mempty + , opmlDocAuthors = [] + , opmlDocDate = mempty + } + +-- normalize input, consolidating adjacent Text and CRef elements +normalizeTree :: [Content] -> [Content] +normalizeTree = everywhere (mkT go) + where go :: [Content] -> [Content] + go (Text (CData CDataRaw _ _):xs) = xs + go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = + Text (CData CDataText (s1 ++ s2) z):xs + go (Text (CData CDataText s1 z):CRef r:xs) = + Text (CData CDataText (s1 ++ convertEntity r) z):xs + go (CRef r:Text (CData CDataText s1 z):xs) = + Text (CData CDataText (convertEntity r ++ s1) z):xs + go (CRef r1:CRef r2:xs) = + Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs + go xs = xs + +convertEntity :: String -> String +convertEntity e = maybe (map toUpper e) (:[]) (lookupEntity e) + +-- convenience function to get an attribute value, defaulting to "" +attrValue :: String -> Element -> String +attrValue attr elt = + case lookupAttrBy (\x -> qName x == attr) (elAttribs elt) of + Just z -> z + Nothing -> "" + +asHtml :: String -> Inlines +asHtml s = case readHtml def s of + Pandoc _ [Plain ils] -> fromList ils + _ -> mempty + +asMarkdown :: String -> Blocks +asMarkdown s = fromList bs + where Pandoc _ bs = readMarkdown def s + +getBlocks :: Element -> OPML Blocks +getBlocks e = mconcat <$> (mapM parseBlock $ elContent e) + +parseBlock :: Content -> OPML Blocks +parseBlock (Elem e) = + case qName (elName e) of + "ownerName" -> mempty <$ modify (\st -> + st{opmlDocAuthors = [text $ strContent e]}) + "dateModified" -> mempty <$ modify (\st -> + st{opmlDocDate = text $ strContent e}) + "title" -> mempty <$ modify (\st -> + st{opmlDocTitle = text $ strContent e}) + "outline" -> gets opmlSectionLevel >>= sect . (+1) + "?xml" -> return mempty + _ -> getBlocks e + where sect n = do let headerText = asHtml $ attrValue "text" e + let noteBlocks = asMarkdown $ attrValue "_note" e + modify $ \st -> st{ opmlSectionLevel = n } + bs <- getBlocks e + modify $ \st -> st{ opmlSectionLevel = n - 1 } + let headerText' = case attrValue "type" e of + "link" -> link + (attrValue "url" e) "" headerText + _ -> headerText + return $ header n headerText' <> noteBlocks <> bs +parseBlock _ = return mempty -- cgit v1.2.3