From 967e7f5fb990b29de48b37be1db40fb149a8cf55 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 14 Feb 2021 22:29:21 -0800 Subject: Rename Text.Pandoc.XMLParser -> Text.Pandoc.XML.Light... ..and add new definitions isomorphic to xml-light's, but with Text instead of String. This allows us to keep most of the code in existing readers that use xml-light, but avoid lots of unnecessary allocation. We also add versions of the functions from xml-light's Text.XML.Light.Output and Text.XML.Light.Proc that operate on our modified XML types, and functions that convert xml-light types to our types (since some of our dependencies, like texmath, use xml-light). Update golden tests for docx and pptx. OOXML test: Use `showContent` instead of `ppContent` in `displayDiff`. Docx: Do a manual traversal to unwrap sdt and smartTag. This is faster, and needed to pass the tests. Benchmarks: A = prior to 8ca191604dcd13af27c11d2da225da646ebce6fc (Feb 8) B = as of 8ca191604dcd13af27c11d2da225da646ebce6fc (Feb 8) C = this commit | Reader | A | B | C | | ------- | ----- | ------ | ----- | | docbook | 18 ms | 12 ms | 10 ms | | opml | 65 ms | 62 ms | 35 ms | | jats | 15 ms | 11 ms | 9 ms | | docx | 72 ms | 69 ms | 44 ms | | odt | 78 ms | 41 ms | 28 ms | | epub | 64 ms | 61 ms | 56 ms | | fb2 | 14 ms | 5 ms | 4 ms | --- src/Text/Pandoc/Readers/DocBook.hs | 76 +++++----- src/Text/Pandoc/Readers/Docx/Parse.hs | 163 ++++++++++----------- src/Text/Pandoc/Readers/Docx/Parse/Styles.hs | 31 ++-- src/Text/Pandoc/Readers/Docx/Util.hs | 27 ++-- src/Text/Pandoc/Readers/EPUB.hs | 65 ++++---- src/Text/Pandoc/Readers/FB2.hs | 93 ++++++------ src/Text/Pandoc/Readers/JATS.hs | 58 ++++---- src/Text/Pandoc/Readers/OPML.hs | 29 ++-- src/Text/Pandoc/Readers/Odt.hs | 5 +- src/Text/Pandoc/Readers/Odt/ContentReader.hs | 13 +- src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs | 3 +- src/Text/Pandoc/Readers/Odt/Generic/Utils.hs | 33 ++--- .../Pandoc/Readers/Odt/Generic/XMLConverter.hs | 23 ++- src/Text/Pandoc/Readers/Odt/Namespaces.hs | 11 +- src/Text/Pandoc/Readers/Odt/StyleReader.hs | 23 +-- 15 files changed, 309 insertions(+), 344 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs index ad0108843..e201b54fe 100644 --- a/src/Text/Pandoc/Readers/DocBook.hs +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -31,8 +31,7 @@ import Text.Pandoc.Options import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces) import Text.TeXMath (readMathML, writeTeX) -import Text.XML.Light -import Text.Pandoc.XMLParser (parseXMLContents) +import Text.Pandoc.XML.Light {- @@ -578,26 +577,27 @@ normalizeTree = everywhere (mkT go) where go :: [Content] -> [Content] go (Text (CData CDataRaw _ _):xs) = xs go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = - Text (CData CDataText (s1 ++ s2) z):xs + Text (CData CDataText (s1 <> s2) z):xs go (Text (CData CDataText s1 z):CRef r:xs) = - Text (CData CDataText (s1 ++ convertEntity r) z):xs + Text (CData CDataText (s1 <> convertEntity r) z):xs go (CRef r:Text (CData CDataText s1 z):xs) = - Text (CData CDataText (convertEntity r ++ s1) z):xs + Text (CData CDataText (convertEntity r <> s1) z):xs go (CRef r1:CRef r2:xs) = - Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs + Text (CData CDataText (convertEntity r1 <> + convertEntity r2) Nothing):xs go xs = xs -convertEntity :: String -> String -convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) +convertEntity :: Text -> Text +convertEntity e = maybe (T.map toUpper e) T.pack (lookupEntity $ T.unpack e) -- convenience function to get an attribute value, defaulting to "" -attrValue :: String -> Element -> Text +attrValue :: Text -> Element -> Text attrValue attr elt = - maybe "" T.pack (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) + fromMaybe "" (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) -- convenience function named :: Text -> Element -> Bool -named s e = qName (elName e) == T.unpack s +named s e = qName (elName e) == s -- @@ -634,7 +634,7 @@ isBlockElement :: Content -> Bool isBlockElement (Elem e) = qName (elName e) `elem` blockTags isBlockElement _ = False -blockTags :: [String] +blockTags :: [Text] blockTags = [ "abstract" , "ackno" @@ -721,7 +721,7 @@ blockTags = , "variablelist" ] ++ admonitionTags -admonitionTags :: [String] +admonitionTags :: [Text] admonitionTags = ["important","caution","note","tip","warning"] -- Trim leading and trailing newline characters @@ -779,10 +779,10 @@ getBlocks e = mconcat <$> parseBlock :: PandocMonad m => Content -> DB m Blocks parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE -parseBlock (Text (CData _ s _)) = if all isSpace s +parseBlock (Text (CData _ s _)) = if T.all isSpace s then return mempty - else return $ plain $ trimInlines $ text $ T.pack s -parseBlock (CRef x) = return $ plain $ str $ T.toUpper $ T.pack x + else return $ plain $ trimInlines $ text s +parseBlock (CRef x) = return $ plain $ str $ T.toUpper x parseBlock (Elem e) = case qName (elName e) of "toc" -> skip -- skip TOC, since in pandoc it's autogenerated @@ -837,7 +837,7 @@ parseBlock (Elem e) = "refsect2" -> sect 2 "refsect3" -> sect 3 "refsection" -> gets dbSectionLevel >>= sect . (+1) - l | l `elem` admonitionTags -> parseAdmonition $ T.pack l + l | l `elem` admonitionTags -> parseAdmonition l "area" -> skip "areaset" -> skip "areaspec" -> skip @@ -899,7 +899,7 @@ parseBlock (Elem e) = "subtitle" -> return mempty -- handled in parent element _ -> skip >> getBlocks e where skip = do - let qn = T.pack $ qName $ elName e + let qn = qName $ elName e let name = if "pi-" `T.isPrefixOf` qn then " qn <> "?>" else qn @@ -911,7 +911,7 @@ parseBlock (Elem e) = "" -> [] x -> [x] return $ codeBlockWith (attrValue "id" e, classes', []) - $ trimNl $ T.pack $ strContentRecursive e + $ trimNl $ strContentRecursive e parseBlockquote = do attrib <- case filterChild (named "attribution") e of Nothing -> return mempty @@ -965,7 +965,7 @@ parseBlock (Elem e) = w <- findAttr (unqual "colwidth") c n <- safeRead $ "0" <> T.filter (\x -> (x >= '0' && x <= '9') - || x == '.') (T.pack w) + || x == '.') w if n > 0 then Just n else Nothing let numrows = case bodyrows of [] -> 0 @@ -1048,12 +1048,12 @@ parseMixed container conts = do x <- parseMixed container rs return $ p <> b <> x -parseRow :: PandocMonad m => [String] -> Element -> DB m [Cell] +parseRow :: PandocMonad m => [Text] -> Element -> DB m [Cell] parseRow cn = do let isEntry x = named "entry" x || named "td" x || named "th" x mapM (parseEntry cn) . filterChildren isEntry -parseEntry :: PandocMonad m => [String] -> Element -> DB m Cell +parseEntry :: PandocMonad m => [Text] -> Element -> DB m Cell parseEntry cn el = do let colDistance sa ea = do let iStrt = elemIndex sa cn @@ -1075,7 +1075,7 @@ getInlines :: PandocMonad m => Element -> DB m Inlines getInlines e' = trimInlines . mconcat <$> mapM parseInline (elContent e') -strContentRecursive :: Element -> String +strContentRecursive :: Element -> Text strContentRecursive = strContent . (\e' -> e'{ elContent = map elementToStr $ elContent e' }) @@ -1084,9 +1084,9 @@ elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing elementToStr x = x parseInline :: PandocMonad m => Content -> DB m Inlines -parseInline (Text (CData _ s _)) = return $ text $ T.pack s +parseInline (Text (CData _ s _)) = return $ text s parseInline (CRef ref) = - return $ text $ maybe (T.toUpper $ T.pack ref) T.pack $ lookupEntity ref + return $ text $ maybe (T.toUpper ref) T.pack $ lookupEntity (T.unpack ref) parseInline (Elem e) = case qName (elName e) of "anchor" -> do @@ -1138,7 +1138,7 @@ parseInline (Elem e) = "userinput" -> codeWithLang "systemitem" -> codeWithLang "varargs" -> return $ code "(...)" - "keycap" -> return (str $ T.pack $ strContent e) + "keycap" -> return (str $ strContent e) "keycombo" -> keycombo <$> mapM parseInline (elContent e) "menuchoice" -> menuchoice <$> @@ -1150,17 +1150,17 @@ parseInline (Elem e) = let title = case attrValue "endterm" e of "" -> maybe "???" xrefTitleByElem (findElementById linkend content) - endterm -> maybe "???" (T.pack . strContent) + endterm -> maybe "???" strContent (findElementById endterm content) return $ link ("#" <> linkend) "" (text title) - "email" -> return $ link ("mailto:" <> T.pack (strContent e)) "" - $ str $ T.pack $ strContent e - "uri" -> return $ link (T.pack $ strContent e) "" $ str $ T.pack $ strContent e + "email" -> return $ link ("mailto:" <> strContent e) "" + $ str $ strContent e + "uri" -> return $ link (strContent e) "" $ str $ strContent e "ulink" -> innerInlines (link (attrValue "url" e) "") "link" -> do ils <- innerInlines id let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of - Just h -> T.pack h + Just h -> h _ -> "#" <> attrValue "linkend" e let ils' = if ils == mempty then str href else ils let attr = (attrValue "id" e, T.words $ attrValue "role" e, []) @@ -1180,7 +1180,7 @@ parseInline (Elem e) = "pi-asciidoc-br" -> return linebreak _ -> skip >> innerInlines id where skip = do - let qn = T.pack $ qName $ elName e + let qn = qName $ elName e let name = if "pi-" `T.isPrefixOf` qn then " qn <> "?>" else qn @@ -1193,7 +1193,7 @@ parseInline (Elem e) = let classes' = case attrValue "language" e of "" -> [] l -> [l] - return $ codeWith (attrValue "id" e,classes',[]) $ T.pack $ strContentRecursive e + return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e simpleList = mconcat . intersperse (str "," <> space) <$> mapM getInlines (filterChildren (named "member") e) segmentedList = do @@ -1234,10 +1234,10 @@ parseInline (Elem e) = "sect5" -> descendantContent "title" el "cmdsynopsis" -> descendantContent "command" el "funcsynopsis" -> descendantContent "function" el - _ -> T.pack $ qName (elName el) ++ "_title" + _ -> qName (elName el) <> "_title" where xrefLabel = attrValue "xreflabel" el - descendantContent name = maybe "???" (T.pack . strContent) + descendantContent name = maybe "???" strContent . filterElementName (\n -> qName n == name) -- | Extract a math equation from an element @@ -1258,7 +1258,7 @@ equation e constructor = mathMLEquations :: [Text] mathMLEquations = map writeTeX $ rights $ readMath (\x -> qName (elName x) == "math" && qPrefix (elName x) == Just "mml") - (readMathML . T.pack . showElement) + (readMathML . showElement) latexEquations :: [Text] latexEquations = readMath (\x -> qName (elName x) == "mathphrase") @@ -1272,8 +1272,8 @@ equation e constructor = -- | Get the actual text stored in a CData block. 'showContent' -- returns the text still surrounded by the [[CDATA]] tags. showVerbatimCData :: Content -> Text -showVerbatimCData (Text (CData _ d _)) = T.pack d -showVerbatimCData c = T.pack $ showContent c +showVerbatimCData (Text (CData _ d _)) = d +showVerbatimCData c = showContent c -- | Set the prefix of a name to 'Nothing' diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 056dab6c2..c76f3c171 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -63,6 +63,7 @@ import Data.Char (chr, ord, readLitChar) import Data.List import qualified Data.Map as M import qualified Data.Text as T +import Data.Text (Text) import Data.Maybe import System.FilePath import Text.Pandoc.Readers.Docx.Util @@ -72,9 +73,7 @@ import qualified Text.Pandoc.UTF8 as UTF8 import Text.TeXMath (Exp) import Text.TeXMath.Readers.OMML (readOMML) import Text.TeXMath.Unicode.Fonts (Font (..), getUnicode, textToFont) -import Text.XML.Light -import qualified Text.XML.Light.Cursor as XMLC -import Text.Pandoc.XMLParser (parseXMLElement) +import Text.Pandoc.XML.Light data ReaderEnv = ReaderEnv { envNotes :: Notes , envComments :: Comments @@ -128,37 +127,23 @@ mapD f xs = in concatMapM handler xs -unwrap :: NameSpaces -> Content -> [Content] -unwrap ns (Elem element) +unwrapElement :: NameSpaces -> Element -> [Element] +unwrapElement ns element | isElem ns "w" "sdt" element , Just sdtContent <- findChildByName ns "w" "sdtContent" element - = concatMap (unwrap ns . Elem) (elChildren sdtContent) + = concatMap (unwrapElement ns) (elChildren sdtContent) | isElem ns "w" "smartTag" element - = concatMap (unwrap ns . Elem) (elChildren element) -unwrap _ content = [content] + = concatMap (unwrapElement ns) (elChildren element) + | otherwise + = [element{ elContent = concatMap (unwrapContent ns) (elContent element) }] -unwrapChild :: NameSpaces -> Content -> Content -unwrapChild ns (Elem element) = - Elem $ element { elContent = concatMap (unwrap ns) (elContent element) } -unwrapChild _ content = content +unwrapContent :: NameSpaces -> Content -> [Content] +unwrapContent ns (Elem element) = map Elem $ unwrapElement ns element +unwrapContent _ content = [content] -walkDocument' :: NameSpaces -> XMLC.Cursor -> XMLC.Cursor -walkDocument' ns cur = - let modifiedCur = XMLC.modifyContent (unwrapChild ns) cur - in - case XMLC.nextDF modifiedCur of - Just cur' -> walkDocument' ns cur' - Nothing -> XMLC.root modifiedCur - -walkDocument :: NameSpaces -> Element -> Maybe Element +walkDocument :: NameSpaces -> Element -> Element walkDocument ns element = - let cur = XMLC.fromContent (Elem element) - cur' = walkDocument' ns cur - in - case XMLC.toTree cur' of - Elem element' -> Just element' - _ -> Nothing - + element{ elContent = concatMap (unwrapContent ns) (elContent element) } newtype Docx = Docx Document deriving Show @@ -361,9 +346,9 @@ getDocumentXmlPath zf = do fp <- findAttr (QName "Target" Nothing Nothing) rel -- sometimes there will be a leading slash, which windows seems to -- have trouble with. - return $ case fp of + return $ case T.unpack fp of '/' : fp' -> fp' - _ -> fp + fp' -> fp' archiveToDocument :: Archive -> D Document archiveToDocument zf = do @@ -372,7 +357,7 @@ archiveToDocument zf = do docElem <- maybeToD $ parseXMLFromEntry entry let namespaces = elemToNameSpaces docElem bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem - let bodyElem' = fromMaybe bodyElem (walkDocument namespaces bodyElem) + let bodyElem' = walkDocument namespaces bodyElem body <- elemToBody namespaces bodyElem' return $ Document namespaces body @@ -414,8 +399,8 @@ archiveToNotes zf = fn_namespaces = maybe [] elemToNameSpaces fnElem en_namespaces = maybe [] elemToNameSpaces enElem ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces - fn = fnElem >>= walkDocument ns >>= elemToNotes ns "footnote" - en = enElem >>= walkDocument ns >>= elemToNotes ns "endnote" + fn = fnElem >>= elemToNotes ns "footnote" . walkDocument ns + en = enElem >>= elemToNotes ns "endnote" . walkDocument ns in Notes ns fn en @@ -424,7 +409,8 @@ archiveToComments zf = let cmtsElem = findEntryByPath "word/comments.xml" zf >>= parseXMLFromEntry cmts_namespaces = maybe [] elemToNameSpaces cmtsElem - cmts = elemToComments cmts_namespaces <$> (cmtsElem >>= walkDocument cmts_namespaces) + cmts = elemToComments cmts_namespaces . walkDocument cmts_namespaces <$> + cmtsElem in case cmts of Just c -> Comments cmts_namespaces c @@ -443,8 +429,8 @@ filePathToRelType path docXmlPath = relElemToRelationship :: DocumentLocation -> Element -> Maybe Relationship relElemToRelationship relType element | qName (elName element) == "Relationship" = do - relId <- findAttrText (QName "Id" Nothing Nothing) element - target <- findAttrText (QName "Target" Nothing Nothing) element + relId <- findAttr (QName "Id" Nothing Nothing) element + target <- findAttr (QName "Target" Nothing Nothing) element return $ Relationship relType relId target relElemToRelationship _ _ = Nothing @@ -485,10 +471,10 @@ lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do loElemToLevelOverride :: NameSpaces -> Element -> Maybe LevelOverride loElemToLevelOverride ns element | isElem ns "w" "lvlOverride" element = do - ilvl <- findAttrTextByName ns "w" "ilvl" element + ilvl <- findAttrByName ns "w" "ilvl" element let startOverride = findChildByName ns "w" "startOverride" element >>= findAttrByName ns "w" "val" - >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) + >>= stringToInteger lvl = findChildByName ns "w" "lvl" element >>= levelElemToLevel ns return $ LevelOverride ilvl startOverride lvl @@ -497,9 +483,9 @@ loElemToLevelOverride _ _ = Nothing numElemToNum :: NameSpaces -> Element -> Maybe Numb numElemToNum ns element | isElem ns "w" "num" element = do - numId <- findAttrTextByName ns "w" "numId" element + numId <- findAttrByName ns "w" "numId" element absNumId <- findChildByName ns "w" "abstractNumId" element - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" let lvlOverrides = mapMaybe (loElemToLevelOverride ns) (findChildrenByName ns "w" "lvlOverride" element) @@ -509,7 +495,7 @@ numElemToNum _ _ = Nothing absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb absNumElemToAbsNum ns element | isElem ns "w" "abstractNum" element = do - absNumId <- findAttrTextByName ns "w" "abstractNumId" element + absNumId <- findAttrByName ns "w" "abstractNumId" element let levelElems = findChildrenByName ns "w" "lvl" element levels = mapMaybe (levelElemToLevel ns) levelElems return $ AbstractNumb absNumId levels @@ -518,14 +504,14 @@ absNumElemToAbsNum _ _ = Nothing levelElemToLevel :: NameSpaces -> Element -> Maybe Level levelElemToLevel ns element | isElem ns "w" "lvl" element = do - ilvl <- findAttrTextByName ns "w" "ilvl" element + ilvl <- findAttrByName ns "w" "ilvl" element fmt <- findChildByName ns "w" "numFmt" element - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" txt <- findChildByName ns "w" "lvlText" element - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" let start = findChildByName ns "w" "start" element >>= findAttrByName ns "w" "val" - >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) + >>= stringToInteger return (Level ilvl fmt txt start) levelElemToLevel _ _ = Nothing @@ -546,11 +532,11 @@ archiveToNumbering :: Archive -> Numbering archiveToNumbering archive = fromMaybe (Numbering [] [] []) (archiveToNumbering' archive) -elemToNotes :: NameSpaces -> String -> Element -> Maybe (M.Map T.Text Element) +elemToNotes :: NameSpaces -> Text -> Element -> Maybe (M.Map T.Text Element) elemToNotes ns notetype element | isElem ns "w" (notetype <> "s") element = let pairs = mapMaybe - (\e -> findAttrTextByName ns "w" "id" e >>= + (\e -> findAttrByName ns "w" "id" e >>= (\a -> Just (a, e))) (findChildrenByName ns "w" notetype element) in @@ -562,7 +548,7 @@ elemToComments :: NameSpaces -> Element -> M.Map T.Text Element elemToComments ns element | isElem ns "w" "comments" element = let pairs = mapMaybe - (\e -> findAttrTextByName ns "w" "id" e >>= + (\e -> findAttrByName ns "w" "id" e >>= (\a -> Just (a, e))) (findChildrenByName ns "w" "comment" element) in @@ -622,12 +608,12 @@ elemToParIndentation ns element | isElem ns "w" "ind" element = stringToInteger , hangingParIndent = findAttrByName ns "w" "hanging" element >>= - stringToInteger} + stringToInteger } elemToParIndentation _ _ = Nothing -testBitMask :: String -> Int -> Bool +testBitMask :: Text -> Int -> Bool testBitMask bitMaskS n = - case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of + case (reads ("0x" ++ T.unpack bitMaskS) :: [(Int, String)]) of [] -> False ((n', _) : _) -> (n' .|. n) /= 0 @@ -642,7 +628,7 @@ elemToBodyPart ns element | isElem ns "w" "p" element , (c:_) <- findChildrenByName ns "m" "oMathPara" element = do - expsLst <- eitherToD $ readOMML $ T.pack $ showElement c + expsLst <- eitherToD $ readOMML $ showElement c return $ OMathPara expsLst elemToBodyPart ns element | isElem ns "w" "p" element @@ -666,7 +652,7 @@ elemToBodyPart ns element | isElem ns "w" "tbl" element = do let caption' = findChildByName ns "w" "tblPr" element >>= findChildByName ns "w" "tblCaption" - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" caption = fromMaybe "" caption' grid' = case findChildByName ns "w" "tblGrid" element of Just g -> elemToTblGrid ns g @@ -705,8 +691,8 @@ getTitleAndAlt :: NameSpaces -> Element -> (T.Text, T.Text) getTitleAndAlt ns element = let mbDocPr = findChildByName ns "wp" "inline" element >>= findChildByName ns "wp" "docPr" - title = fromMaybe "" (mbDocPr >>= findAttrTextByName ns "" "title") - alt = fromMaybe "" (mbDocPr >>= findAttrTextByName ns "" "descr") + title = fromMaybe "" (mbDocPr >>= findAttrByName ns "" "title") + alt = fromMaybe "" (mbDocPr >>= findAttrByName ns "" "descr") in (title, alt) elemToParPart :: NameSpaces -> Element -> D ParPart @@ -718,7 +704,7 @@ elemToParPart ns element = let (title, alt) = getTitleAndAlt ns drawingElem a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem - >>= findAttrTextByName ns "r" "embed" + >>= findAttrByName ns "r" "embed" in case drawing of Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem) @@ -728,7 +714,7 @@ elemToParPart ns element | isElem ns "w" "r" element , Just _ <- findChildByName ns "w" "pict" element = let drawing = findElement (elemName ns "v" "imagedata") element - >>= findAttrTextByName ns "r" "id" + >>= findAttrByName ns "r" "id" in case drawing of -- Todo: check out title and attr for deprecated format. @@ -797,7 +783,7 @@ elemToParPart ns element fldCharState <- gets stateFldCharState case fldCharState of FldCharOpen -> do - info <- eitherToD $ parseFieldInfo $ T.pack $ strContent instrText + info <- eitherToD $ parseFieldInfo $ strContent instrText modify $ \st -> st{stateFldCharState = FldCharFieldInfo info} return NullParPart _ -> return NullParPart @@ -818,48 +804,48 @@ elemToParPart ns element return $ ChangedRuns change runs elemToParPart ns element | isElem ns "w" "bookmarkStart" element - , Just bmId <- findAttrTextByName ns "w" "id" element - , Just bmName <- findAttrTextByName ns "w" "name" element = + , Just bmId <- findAttrByName ns "w" "id" element + , Just bmName <- findAttrByName ns "w" "name" element = return $ BookMark bmId bmName elemToParPart ns element | isElem ns "w" "hyperlink" element - , Just relId <- findAttrTextByName ns "r" "id" element = do + , Just relId <- findAttrByName ns "r" "id" element = do location <- asks envLocation runs <- mapD (elemToRun ns) (elChildren element) rels <- asks envRelationships case lookupRelationship location relId rels of Just target -> - case findAttrTextByName ns "w" "anchor" element of + case findAttrByName ns "w" "anchor" element of Just anchor -> return $ ExternalHyperLink (target <> "#" <> anchor) runs Nothing -> return $ ExternalHyperLink target runs Nothing -> return $ ExternalHyperLink "" runs elemToParPart ns element | isElem ns "w" "hyperlink" element - , Just anchor <- findAttrTextByName ns "w" "anchor" element = do + , Just anchor <- findAttrByName ns "w" "anchor" element = do runs <- mapD (elemToRun ns) (elChildren element) return $ InternalHyperLink anchor runs elemToParPart ns element | isElem ns "w" "commentRangeStart" element - , Just cmtId <- findAttrTextByName ns "w" "id" element = do + , Just cmtId <- findAttrByName ns "w" "id" element = do (Comments _ commentMap) <- asks envComments case M.lookup cmtId commentMap of Just cmtElem -> elemToCommentStart ns cmtElem Nothing -> throwError WrongElem elemToParPart ns element | isElem ns "w" "commentRangeEnd" element - , Just cmtId <- findAttrTextByName ns "w" "id" element = + , Just cmtId <- findAttrByName ns "w" "id" element = return $ CommentEnd cmtId elemToParPart ns element | isElem ns "m" "oMath" element = - fmap PlainOMath (eitherToD $ readOMML $ T.pack $ showElement element) + fmap PlainOMath (eitherToD $ readOMML $ showElement element) elemToParPart _ _ = throwError WrongElem elemToCommentStart :: NameSpaces -> Element -> D ParPart elemToCommentStart ns element | isElem ns "w" "comment" element - , Just cmtId <- findAttrTextByName ns "w" "id" element - , Just cmtAuthor <- findAttrTextByName ns "w" "author" element - , cmtDate <- findAttrTextByName ns "w" "date" element = do + , Just cmtId <- findAttrByName ns "w" "id" element + , Just cmtAuthor <- findAttrByName ns "w" "author" element + , cmtDate <- findAttrByName ns "w" "date" element = do bps <- mapD (elemToBodyPart ns) (elChildren element) return $ CommentStart cmtId cmtAuthor cmtDate bps elemToCommentStart _ _ = throwError WrongElem @@ -878,7 +864,7 @@ elemToExtent drawingElem = where wp_ns = "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" getDim at = findElement (QName "extent" (Just wp_ns) (Just "wp")) drawingElem - >>= findAttr (QName at Nothing Nothing) >>= safeRead . T.pack + >>= findAttr (QName at Nothing Nothing) >>= safeRead childElemToRun :: NameSpaces -> Element -> D Run @@ -889,7 +875,7 @@ childElemToRun ns element = let (title, alt) = getTitleAndAlt ns element a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem - >>= findAttrText (QName "embed" (lookup "r" ns) (Just "r")) + >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) in case drawing of Just s -> expandDrawingId s >>= @@ -902,7 +888,7 @@ childElemToRun ns element = return InlineChart childElemToRun ns element | isElem ns "w" "footnoteReference" element - , Just fnId <- findAttrTextByName ns "w" "id" element = do + , Just fnId <- findAttrByName ns "w" "id" element = do notes <- asks envNotes case lookupFootnote fnId notes of Just e -> do bps <- local (\r -> r {envLocation=InFootnote}) $ mapD (elemToBodyPart ns) (elChildren e) @@ -910,7 +896,7 @@ childElemToRun ns element Nothing -> return $ Footnote [] childElemToRun ns element | isElem ns "w" "endnoteReference" element - , Just enId <- findAttrTextByName ns "w" "id" element = do + , Just enId <- findAttrByName ns "w" "id" element = do notes <- asks envNotes case lookupEndnote enId notes of Just e -> do bps <- local (\r -> r {envLocation=InEndnote}) $ mapD (elemToBodyPart ns) (elChildren e) @@ -963,15 +949,15 @@ getParStyleField _ _ = Nothing getTrackedChange :: NameSpaces -> Element -> Maybe TrackedChange getTrackedChange ns element | isElem ns "w" "ins" element || isElem ns "w" "moveTo" element - , Just cId <- findAttrTextByName ns "w" "id" element - , Just cAuthor <- findAttrTextByName ns "w" "author" element - , mcDate <- findAttrTextByName ns "w" "date" element = + , Just cId <- findAttrByName ns "w" "id" element + , Just cAuthor <- findAttrByName ns "w" "author" element + , mcDate <- findAttrByName ns "w" "date" element = Just $ TrackedChange Insertion (ChangeInfo cId cAuthor mcDate) getTrackedChange ns element | isElem ns "w" "del" element || isElem ns "w" "moveFrom" element - , Just cId <- findAttrTextByName ns "w" "id" element - , Just cAuthor <- findAttrTextByName ns "w" "author" element - , mcDate <- findAttrTextByName ns "w" "date" element = + , Just cId <- findAttrByName ns "w" "id" element + , Just cAuthor <- findAttrByName ns "w" "author" element + , mcDate <- findAttrByName ns "w" "date" element = Just $ TrackedChange Deletion (ChangeInfo cId cAuthor mcDate) getTrackedChange _ _ = Nothing @@ -980,7 +966,7 @@ elemToParagraphStyle ns element sty | Just pPr <- findChildByName ns "w" "pPr" element = let style = mapMaybe - (fmap ParaStyleId . findAttrTextByName ns "w" "val") + (fmap ParaStyleId . findAttrByName ns "w" "val") (findChildrenByName ns "w" "pStyle" pPr) in ParagraphStyle {pStyle = mapMaybe (`M.lookup` sty) style @@ -1012,7 +998,7 @@ elemToRunStyleD ns element charStyles <- asks envCharStyles let parentSty = findChildByName ns "w" "rStyle" rPr >>= - findAttrTextByName ns "w" "val" >>= + findAttrByName ns "w" "val" >>= flip M.lookup charStyles . CharStyleId return $ elemToRunStyle ns element parentSty elemToRunStyleD _ _ = return defaultRunStyle @@ -1022,7 +1008,7 @@ elemToRunElem ns element | isElem ns "w" "t" element || isElem ns "w" "delText" element || isElem ns "m" "t" element = do - let str = T.pack $ strContent element + let str = strContent element font <- asks envFont case font of Nothing -> return $ TextRun str @@ -1044,14 +1030,14 @@ getSymChar :: NameSpaces -> Element -> RunElem getSymChar ns element | Just s <- lowerFromPrivate <$> getCodepoint , Just font <- getFont = - case readLitChar ("\\x" ++ s) of + case readLitChar ("\\x" ++ T.unpack s) of [(char, _)] -> TextRun . maybe "" T.singleton $ getUnicode font char _ -> TextRun "" where getCodepoint = findAttrByName ns "w" "char" element - getFont = textToFont . T.pack =<< findAttrByName ns "w" "font" element - lowerFromPrivate ('F':xs) = '0':xs - lowerFromPrivate xs = xs + getFont = textToFont =<< findAttrByName ns "w" "font" element + lowerFromPrivate t | "F" `T.isPrefixOf` t = "0" <> T.drop 1 t + | otherwise = t getSymChar _ _ = TextRun "" elemToRunElems :: NameSpaces -> Element -> D [RunElem] @@ -1061,8 +1047,9 @@ elemToRunElems ns element let qualName = elemName ns "w" let font = do fontElem <- findElement (qualName "rFonts") element - textToFont . T.pack =<< - foldr ((<|>) . (flip findAttr fontElem . qualName)) Nothing ["ascii", "hAnsi"] + textToFont =<< + foldr ((<|>) . (flip findAttr fontElem . qualName)) + Nothing ["ascii", "hAnsi"] local (setFont font) (mapD (elemToRunElem ns) (elChildren element)) elemToRunElems _ _ = throwError WrongElem diff --git a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs index edade8654..0d7271d6a 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs @@ -48,12 +48,13 @@ import Data.Function (on) import Data.String (IsString(..)) import qualified Data.Map as M import qualified Data.Text as T +import qualified Data.Text.Read +import Data.Text (Text) import Data.Maybe import Data.Coerce import Text.Pandoc.Readers.Docx.Util import qualified Text.Pandoc.UTF8 as UTF8 -import Text.XML.Light -import Text.Pandoc.XMLParser (parseXMLElement) +import Text.Pandoc.XML.Light newtype CharStyleId = CharStyleId T.Text deriving (Show, Eq, Ord, IsString, FromStyleId) @@ -109,7 +110,7 @@ data RunStyle = RunStyle { isBold :: Maybe Bool , isRTL :: Maybe Bool , isForceCTL :: Maybe Bool , rVertAlign :: Maybe VertAlign - , rUnderline :: Maybe String + , rUnderline :: Maybe Text , rParentStyle :: Maybe CharStyle } deriving Show @@ -159,7 +160,7 @@ isBasedOnStyle ns element parentStyle , Just styleType <- findAttrByName ns "w" "type" element , styleType == cStyleType parentStyle , Just basedOnVal <- findChildByName ns "w" "basedOn" element >>= - findAttrTextByName ns "w" "val" + findAttrByName ns "w" "val" , Just ps <- parentStyle = basedOnVal == fromStyleId (getStyleId ps) | isElem ns "w" "style" element , Just styleType <- findAttrByName ns "w" "type" element @@ -169,7 +170,7 @@ isBasedOnStyle ns element parentStyle | otherwise = False class HasStyleId a => ElemToStyle a where - cStyleType :: Maybe a -> String + cStyleType :: Maybe a -> Text elemToStyle :: NameSpaces -> Element -> Maybe a -> Maybe a class FromStyleId (StyleId a) => HasStyleId a where @@ -226,8 +227,10 @@ buildBasedOnList ns element rootStyle = stys -> stys ++ concatMap (buildBasedOnList ns element . Just) stys -stringToInteger :: String -> Maybe Integer -stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) +stringToInteger :: Text -> Maybe Integer +stringToInteger s = case Data.Text.Read.decimal s of + Right (x,_) -> Just x + Left _ -> Nothing checkOnOff :: NameSpaces -> Element -> QName -> Maybe Bool checkOnOff ns rPr tag @@ -247,7 +250,7 @@ checkOnOff _ _ _ = Nothing elemToCharStyle :: NameSpaces -> Element -> Maybe CharStyle -> Maybe CharStyle elemToCharStyle ns element parentStyle - = CharStyle <$> (CharStyleId <$> findAttrTextByName ns "w" "styleId" element) + = CharStyle <$> (CharStyleId <$> findAttrByName ns "w" "styleId" element) <*> getElementStyleName ns element <*> Just (elemToRunStyle ns element parentStyle) @@ -281,7 +284,7 @@ elemToRunStyle _ _ _ = defaultRunStyle getHeaderLevel :: NameSpaces -> Element -> Maybe (ParaStyleName, Int) getHeaderLevel ns element | Just styleName <- getElementStyleName ns element - , Just n <- stringToInteger . T.unpack =<< + , Just n <- stringToInteger =<< (T.stripPrefix "heading " . T.toLower $ fromStyleName styleName) , n > 0 = Just (styleName, fromInteger n) @@ -289,8 +292,8 @@ getHeaderLevel _ _ = Nothing getElementStyleName :: Coercible T.Text a => NameSpaces -> Element -> Maybe a getElementStyleName ns el = coerce <$> - ((findChildByName ns "w" "name" el >>= findAttrTextByName ns "w" "val") - <|> findAttrTextByName ns "w" "styleId" el) + ((findChildByName ns "w" "name" el >>= findAttrByName ns "w" "val") + <|> findAttrByName ns "w" "styleId" el) getNumInfo :: NameSpaces -> Element -> Maybe (T.Text, T.Text) getNumInfo ns element = do @@ -298,15 +301,15 @@ getNumInfo ns element = do findChildByName ns "w" "numPr" lvl = fromMaybe "0" (numPr >>= findChildByName ns "w" "ilvl" >>= - findAttrTextByName ns "w" "val") + findAttrByName ns "w" "val") numId <- numPr >>= findChildByName ns "w" "numId" >>= - findAttrTextByName ns "w" "val" + findAttrByName ns "w" "val" return (numId, lvl) elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> Maybe ParStyle elemToParStyleData ns element parentStyle - | Just styleId <- findAttrTextByName ns "w" "styleId" element + | Just styleId <- findAttrByName ns "w" "styleId" element , Just styleName <- getElementStyleName ns element = Just $ ParStyle { diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs index f9c9a8e26..21df03d9e 100644 --- a/src/Text/Pandoc/Readers/Docx/Util.hs +++ b/src/Text/Pandoc/Readers/Docx/Util.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Docx.StyleMaps Copyright : © 2014-2020 Jesse Rosenthal , @@ -18,51 +19,45 @@ module Text.Pandoc.Readers.Docx.Util ( , elemToNameSpaces , findChildByName , findChildrenByName - , findAttrText , findAttrByName - , findAttrTextByName ) where import Data.Maybe (mapMaybe) import qualified Data.Text as T -import Text.XML.Light +import Data.Text (Text) +import Text.Pandoc.XML.Light -type NameSpaces = [(String, String)] +type NameSpaces = [(Text, Text)] elemToNameSpaces :: Element -> NameSpaces elemToNameSpaces = mapMaybe attrToNSPair . elAttribs -attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair :: Attr -> Maybe (Text, Text) attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) attrToNSPair _ = Nothing -elemName :: NameSpaces -> String -> String -> QName +elemName :: NameSpaces -> Text -> Text -> QName elemName ns prefix name = - QName name (lookup prefix ns) (if null prefix then Nothing else Just prefix) + QName name (lookup prefix ns) (if T.null prefix then Nothing else Just prefix) -isElem :: NameSpaces -> String -> String -> Element -> Bool +isElem :: NameSpaces -> Text -> Text -> Element -> Bool isElem ns prefix name element = let ns' = ns ++ elemToNameSpaces element in qName (elName element) == name && qURI (elName element) == lookup prefix ns' -findChildByName :: NameSpaces -> String -> String -> Element -> Maybe Element +findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element findChildByName ns pref name el = let ns' = ns ++ elemToNameSpaces el in findChild (elemName ns' pref name) el -findChildrenByName :: NameSpaces -> String -> String -> Element -> [Element] +findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element] findChildrenByName ns pref name el = let ns' = ns ++ elemToNameSpaces el in findChildren (elemName ns' pref name) el -findAttrText :: QName -> Element -> Maybe T.Text -findAttrText x = fmap T.pack . findAttr x - -findAttrByName :: NameSpaces -> String -> String -> Element -> Maybe String +findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text findAttrByName ns pref name el = let ns' = ns ++ elemToNameSpaces el in findAttr (elemName ns' pref name) el -findAttrTextByName :: NameSpaces -> String -> String -> Element -> Maybe T.Text -findAttrTextByName a b c = fmap T.pack . findAttrByName a b c diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs index 369c4f0c9..eb8d2405d 100644 --- a/src/Text/Pandoc/Readers/EPUB.hs +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -23,8 +23,8 @@ import Control.DeepSeq (NFData, deepseq) import Control.Monad (guard, liftM, liftM2, mplus) import Control.Monad.Except (throwError) import qualified Data.ByteString.Lazy as BL (ByteString) -import Data.List (isInfixOf) import qualified Data.Text as T +import Data.Text (Text) import qualified Data.Map as M (Map, elems, fromList, lookup) import Data.Maybe (mapMaybe) import qualified Data.Text.Lazy as TL @@ -40,13 +40,12 @@ import Text.Pandoc.Extensions (Extension (Ext_raw_html), enableExtension) import Text.Pandoc.MIME (MimeType) import Text.Pandoc.Options (ReaderOptions (..)) import Text.Pandoc.Readers.HTML (readHtml) -import Text.Pandoc.Shared (addMetaField, collapseFilePath, escapeURI) +import Text.Pandoc.Shared (addMetaField, collapseFilePath, escapeURI, tshow) import qualified Text.Pandoc.UTF8 as UTF8 (toTextLazy) import Text.Pandoc.Walk (query, walk) -import Text.XML.Light -import Text.Pandoc.XMLParser (parseXMLElement) +import Text.Pandoc.XML.Light -type Items = M.Map String (FilePath, MimeType) +type Items = M.Map Text (FilePath, MimeType) readEPUB :: PandocMonad m => ReaderOptions -> BL.ByteString -> m Pandoc readEPUB opts bytes = case toArchiveOrFail bytes of @@ -126,26 +125,27 @@ imageToPandoc s = B.doc . B.para $ B.image (T.pack s) "" mempty imageMimes :: [MimeType] imageMimes = ["image/gif", "image/jpeg", "image/png"] -type CoverId = String +type CoverId = Text type CoverImage = FilePath -parseManifest :: (PandocMonad m) => Element -> Maybe CoverId -> m (Maybe CoverImage, Items) +parseManifest :: (PandocMonad m) + => Element -> Maybe CoverId -> m (Maybe CoverImage, Items) parseManifest content coverId = do manifest <- findElementE (dfName "manifest") content let items = findChildren (dfName "item") manifest r <- mapM parseItem items let cover = findAttr (emptyName "href") =<< filterChild findCover manifest - return (cover `mplus` coverId, M.fromList r) + return (T.unpack <$> (cover `mplus` coverId), M.fromList r) where - findCover e = maybe False (isInfixOf "cover-image") + findCover e = maybe False (T.isInfixOf "cover-image") (findAttr (emptyName "properties") e) || Just True == liftM2 (==) coverId (findAttr (emptyName "id") e) parseItem e = do uid <- findAttrE (emptyName "id") e href <- findAttrE (emptyName "href") e mime <- findAttrE (emptyName "media-type") e - return (uid, (href, T.pack mime)) + return (uid, (T.unpack href, mime)) parseSpine :: PandocMonad m => Items -> Element -> m [(FilePath, MimeType)] parseSpine is e = do @@ -173,11 +173,11 @@ parseMeta content = do -- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem parseMetaItem :: Element -> Meta -> Meta parseMetaItem e@(stripNamespace . elName -> field) meta = - addMetaField (renameMeta field) (B.str $ T.pack $ strContent e) meta + addMetaField (renameMeta field) (B.str $ strContent e) meta -renameMeta :: String -> T.Text +renameMeta :: Text -> Text renameMeta "creator" = "author" -renameMeta s = T.pack s +renameMeta s = s getManifest :: PandocMonad m => Archive -> m (String, Element) getManifest archive = do @@ -187,7 +187,7 @@ getManifest archive = do ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces) as <- fmap (map attrToPair . elAttribs) (findElementE (QName "rootfile" (Just ns) Nothing) docElem) - manifestFile <- mkE "Root not found" (lookup "full-path" as) + manifestFile <- T.unpack <$> mkE "Root not found" (lookup "full-path" as) let rootdir = dropFileName manifestFile --mime <- lookup "media-type" as manifest <- findEntryByPathE manifestFile archive @@ -201,7 +201,8 @@ fixInternalReferences pathToFile = . walk (fixBlockIRs filename) . walk (fixInlineIRs filename) where - (root, T.unpack . escapeURI . T.pack -> filename) = splitFileName pathToFile + (root, T.unpack . escapeURI . T.pack -> filename) = + splitFileName pathToFile fixInlineIRs :: String -> Inline -> Inline fixInlineIRs s (Span as v) = @@ -214,7 +215,7 @@ fixInlineIRs s (Link as is t) = Link (fixAttrs s as) is t fixInlineIRs _ v = v -prependHash :: [T.Text] -> Inline -> Inline +prependHash :: [Text] -> Inline -> Inline prependHash ps l@(Link attr is (url, tit)) | or [s `T.isPrefixOf` url | s <- ps] = Link attr is ("#" <> url, tit) @@ -231,16 +232,17 @@ fixBlockIRs s (CodeBlock as code) = fixBlockIRs _ b = b fixAttrs :: FilePath -> B.Attr -> B.Attr -fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . T.null) cs, removeEPUBAttrs kvs) +fixAttrs s (ident, cs, kvs) = + (addHash s ident, filter (not . T.null) cs, removeEPUBAttrs kvs) -addHash :: String -> T.Text -> T.Text +addHash :: FilePath -> Text -> Text addHash _ "" = "" addHash s ident = T.pack (takeFileName s) <> "#" <> ident -removeEPUBAttrs :: [(T.Text, T.Text)] -> [(T.Text, T.Text)] +removeEPUBAttrs :: [(Text, Text)] -> [(Text, Text)] removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs -isEPUBAttr :: (T.Text, a) -> Bool +isEPUBAttr :: (Text, a) -> Bool isEPUBAttr (k, _) = "epub:" `T.isPrefixOf` k -- Library @@ -257,33 +259,33 @@ uncurry3 f (a, b, c) = f a b c -- Utility -stripNamespace :: QName -> String +stripNamespace :: QName -> Text stripNamespace (QName v _ _) = v -attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair :: Attr -> Maybe (Text, Text) attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val) attrToNSPair _ = Nothing -attrToPair :: Attr -> (String, String) +attrToPair :: Attr -> (Text, Text) attrToPair (Attr (QName name _ _) val) = (name, val) -defaultNameSpace :: Maybe String +defaultNameSpace :: Maybe Text defaultNameSpace = Just "http://www.idpf.org/2007/opf" -dfName :: String -> QName +dfName :: Text -> QName dfName s = QName s defaultNameSpace Nothing -emptyName :: String -> QName +emptyName :: Text -> QName emptyName s = QName s Nothing Nothing -- Convert Maybe interface to Either -findAttrE :: PandocMonad m => QName -> Element -> m String +findAttrE :: PandocMonad m => QName -> Element -> m Text findAttrE q e = mkE "findAttr" $ findAttr q e findEntryByPathE :: PandocMonad m => FilePath -> Archive -> m Entry findEntryByPathE (normalise . unEscapeString -> path) a = - mkE ("No entry on path: " ++ path) $ findEntryByPath path a + mkE ("No entry on path: " <> T.pack path) $ findEntryByPath path a parseXMLDocE :: PandocMonad m => Entry -> m Element parseXMLDocE entry = @@ -293,7 +295,8 @@ parseXMLDocE entry = fp = T.pack $ eRelativePath entry findElementE :: PandocMonad m => QName -> Element -> m Element -findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x +findElementE e x = + mkE ("Unable to find element: " <> tshow e) $ findElement e x -mkE :: PandocMonad m => String -> Maybe a -> m a -mkE s = maybe (throwError . PandocParseError $ T.pack s) return +mkE :: PandocMonad m => Text -> Maybe a -> m a +mkE s = maybe (throwError . PandocParseError $ s) return diff --git a/src/Text/Pandoc/Readers/FB2.hs b/src/Text/Pandoc/Readers/FB2.hs index b804eab4f..66e390bd7 100644 --- a/src/Text/Pandoc/Readers/FB2.hs +++ b/src/Text/Pandoc/Readers/FB2.hs @@ -25,7 +25,6 @@ TODO: module Text.Pandoc.Readers.FB2 ( readFB2 ) where import Control.Monad.Except (throwError) import Control.Monad.State.Strict -import Data.ByteString.Lazy.Char8 ( pack ) import Data.ByteString.Base64.Lazy import Data.Functor import Data.List (intersperse) @@ -42,8 +41,8 @@ import Text.Pandoc.Error import Text.Pandoc.Logging import Text.Pandoc.Options import Text.Pandoc.Shared (crFilter) -import Text.XML.Light -import Text.Pandoc.XMLParser (parseXMLElement) +import Text.Pandoc.XML.Light +import qualified Text.Pandoc.UTF8 as UTF8 type FB2 m = StateT FB2State m @@ -85,12 +84,12 @@ removeHash t = case T.uncons t of Just ('#', xs) -> xs _ -> t -convertEntity :: String -> Text -convertEntity e = maybe (T.toUpper $ T.pack e) T.pack $ lookupEntity e +convertEntity :: Text -> Text +convertEntity e = maybe (T.toUpper e) T.pack $ lookupEntity (T.unpack e) parseInline :: PandocMonad m => Content -> FB2 m Inlines parseInline (Elem e) = - case T.pack $ qName $ elName e of + case qName $ elName e of "strong" -> strong <$> parseStyleType e "emphasis" -> emph <$> parseStyleType e "style" -> parseNamedStyle e @@ -98,12 +97,12 @@ parseInline (Elem e) = "strikethrough" -> strikeout <$> parseStyleType e "sub" -> subscript <$> parseStyleType e "sup" -> superscript <$> parseStyleType e - "code" -> pure $ code $ T.pack $ strContent e + "code" -> pure $ code $ strContent e "image" -> parseInlineImageElement e name -> do report $ IgnoredElement name pure mempty -parseInline (Text x) = pure $ text $ T.pack $ cdData x +parseInline (Text x) = pure $ text $ cdData x parseInline (CRef r) = pure $ str $ convertEntity r parseSubtitle :: PandocMonad m => Element -> FB2 m Blocks @@ -113,7 +112,7 @@ parseSubtitle e = headerWith ("", ["unnumbered"], []) <$> gets fb2SectionLevel < parseRootElement :: PandocMonad m => Element -> FB2 m Blocks parseRootElement e = - case T.pack $ qName $ elName e of + case qName $ elName e of "FictionBook" -> do -- Parse notes before parsing the rest of the content. case filterChild isNotesBody e of @@ -146,7 +145,7 @@ parseNote e = Just sectionId -> do content <- mconcat <$> mapM parseSectionChild (dropTitle $ elChildren e) oldNotes <- gets fb2Notes - modify $ \s -> s { fb2Notes = M.insert ("#" <> T.pack sectionId) content oldNotes } + modify $ \s -> s { fb2Notes = M.insert ("#" <> sectionId) content oldNotes } pure () where isTitle x = qName (elName x) == "title" @@ -158,7 +157,7 @@ parseNote e = -- | Parse a child of @\@ element. parseFictionBookChild :: PandocMonad m => Element -> FB2 m Blocks parseFictionBookChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "stylesheet" -> pure mempty -- stylesheet is ignored "description" -> mempty <$ mapM_ parseDescriptionChild (elChildren e) "body" -> if isNotesBody e @@ -170,7 +169,7 @@ parseFictionBookChild e = -- | Parse a child of @\@ element. parseDescriptionChild :: PandocMonad m => Element -> FB2 m () parseDescriptionChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "title-info" -> mapM_ parseTitleInfoChild (elChildren e) "src-title-info" -> pure () -- ignore "document-info" -> pure () @@ -184,7 +183,7 @@ parseDescriptionChild e = -- | Parse a child of @\@ element. parseBodyChild :: PandocMonad m => Element -> FB2 m Blocks parseBodyChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "image" -> parseImageElement e "title" -> header <$> gets fb2SectionLevel <*> parseTitleType (elContent e) "epigraph" -> parseEpigraph e @@ -198,7 +197,10 @@ parseBinaryElement e = (Nothing, _) -> report $ IgnoredElement "binary without id attribute" (Just _, Nothing) -> report $ IgnoredElement "binary without content-type attribute" - (Just filename, contentType) -> insertMedia filename (T.pack <$> contentType) (decodeLenient (pack (strContent e))) + (Just filename, contentType) -> + insertMedia (T.unpack filename) contentType + (decodeLenient + (UTF8.fromTextLazy . TL.fromStrict . strContent $ e)) -- * Type parsers @@ -208,13 +210,13 @@ parseAuthor e = T.unwords . catMaybes <$> mapM parseAuthorChild (elChildren e) parseAuthorChild :: PandocMonad m => Element -> FB2 m (Maybe Text) parseAuthorChild e = - case T.pack $ qName $ elName e of - "first-name" -> pure $ Just $ T.pack $ strContent e - "middle-name" -> pure $ Just $ T.pack $ strContent e - "last-name" -> pure $ Just $ T.pack $ strContent e - "nickname" -> pure $ Just $ T.pack $ strContent e - "home-page" -> pure $ Just $ T.pack $ strContent e - "email" -> pure $ Just $ T.pack $ strContent e + case qName $ elName e of + "first-name" -> pure $ Just $ strContent e + "middle-name" -> pure $ Just $ strContent e + "last-name" -> pure $ Just $ strContent e + "nickname" -> pure $ Just $ strContent e + "home-page" -> pure $ Just $ strContent e + "email" -> pure $ Just $ strContent e name -> do report $ IgnoredElement $ name <> " in author" pure Nothing @@ -238,13 +240,13 @@ parseTitleContent _ = pure Nothing parseImageElement :: PandocMonad m => Element -> FB2 m Blocks parseImageElement e = case href of - Just src -> pure $ para $ imageWith (imgId, [], []) (removeHash $ T.pack src) title alt + Just src -> pure $ para $ imageWith (imgId, [], []) (removeHash src) title alt Nothing -> do report $ IgnoredElement " image without href" pure mempty - where alt = maybe mempty (str . T.pack) $ findAttr (unqual "alt") e - title = maybe "" T.pack $ findAttr (unqual "title") e - imgId = maybe "" T.pack $ findAttr (unqual "id") e + where alt = maybe mempty str $ findAttr (unqual "alt") e + title = fromMaybe "" $ findAttr (unqual "title") e + imgId = fromMaybe "" $ findAttr (unqual "id") e href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e -- | Parse @pType@ @@ -258,7 +260,7 @@ parseCite e = blockQuote . mconcat <$> mapM parseCiteChild (elChildren e) -- | Parse @citeType@ child parseCiteChild :: PandocMonad m => Element -> FB2 m Blocks parseCiteChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "p" -> para <$> parsePType e "poem" -> parsePoem e "empty-line" -> pure horizontalRule @@ -273,13 +275,13 @@ parsePoem e = mconcat <$> mapM parsePoemChild (elChildren e) parsePoemChild :: PandocMonad m => Element -> FB2 m Blocks parsePoemChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "title" -> parseTitle e "subtitle" -> parseSubtitle e "epigraph" -> parseEpigraph e "stanza" -> parseStanza e "text-author" -> para <$> parsePType e - "date" -> pure $ para $ text $ T.pack $ strContent e + "date" -> pure $ para $ text $ strContent e name -> report (UnexpectedXmlElement name "poem") $> mempty parseStanza :: PandocMonad m => Element -> FB2 m Blocks @@ -292,7 +294,7 @@ joinLineBlocks [] = [] parseStanzaChild :: PandocMonad m => Element -> FB2 m Blocks parseStanzaChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "title" -> parseTitle e "subtitle" -> parseSubtitle e "v" -> lineBlock . (:[]) <$> parsePType e @@ -302,11 +304,11 @@ parseStanzaChild e = parseEpigraph :: PandocMonad m => Element -> FB2 m Blocks parseEpigraph e = divWith (divId, ["epigraph"], []) . mconcat <$> mapM parseEpigraphChild (elChildren e) - where divId = maybe "" T.pack $ findAttr (unqual "id") e + where divId = fromMaybe "" $ findAttr (unqual "id") e parseEpigraphChild :: PandocMonad m => Element -> FB2 m Blocks parseEpigraphChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "p" -> para <$> parsePType e "poem" -> parsePoem e "cite" -> parseCite e @@ -320,7 +322,7 @@ parseAnnotation e = mconcat <$> mapM parseAnnotationChild (elChildren e) parseAnnotationChild :: PandocMonad m => Element -> FB2 m Blocks parseAnnotationChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "p" -> para <$> parsePType e "poem" -> parsePoem e "cite" -> parseCite e @@ -334,14 +336,14 @@ parseSection :: PandocMonad m => Element -> FB2 m Blocks parseSection e = do n <- gets fb2SectionLevel modify $ \st -> st{ fb2SectionLevel = n + 1 } - let sectionId = maybe "" T.pack $ findAttr (unqual "id") e + let sectionId = fromMaybe "" $ findAttr (unqual "id") e bs <- divWith (sectionId, ["section"], []) . mconcat <$> mapM parseSectionChild (elChildren e) modify $ \st -> st{ fb2SectionLevel = n } pure bs parseSectionChild :: PandocMonad m => Element -> FB2 m Blocks parseSectionChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "title" -> parseBodyChild e "epigraph" -> parseEpigraph e "image" -> parseImageElement e @@ -363,16 +365,16 @@ parseStyleType e = mconcat <$> mapM parseInline (elContent e) parseNamedStyle :: PandocMonad m => Element -> FB2 m Inlines parseNamedStyle e = do content <- mconcat <$> mapM parseNamedStyleChild (elContent e) - let lang = maybeToList $ ("lang",) . T.pack <$> findAttr (QName "lang" Nothing (Just "xml")) e + let lang = maybeToList $ ("lang",) <$> findAttr (QName "lang" Nothing (Just "xml")) e case findAttr (unqual "name") e of - Just name -> pure $ spanWith ("", [T.pack name], lang) content + Just name -> pure $ spanWith ("", [name], lang) content Nothing -> do report $ IgnoredElement "link without required name" pure mempty parseNamedStyleChild :: PandocMonad m => Content -> FB2 m Inlines parseNamedStyleChild (Elem e) = - case T.pack $ qName (elName e) of + case qName (elName e) of "strong" -> strong <$> parseStyleType e "emphasis" -> emph <$> parseStyleType e "style" -> parseNamedStyle e @@ -380,7 +382,7 @@ parseNamedStyleChild (Elem e) = "strikethrough" -> strikeout <$> parseStyleType e "sub" -> subscript <$> parseStyleType e "sup" -> superscript <$> parseStyleType e - "code" -> pure $ code $ T.pack $ strContent e + "code" -> pure $ code $ strContent e "image" -> parseInlineImageElement e name -> do report $ IgnoredElement $ name <> " in style" @@ -392,7 +394,7 @@ parseLinkType :: PandocMonad m => Element -> FB2 m Inlines parseLinkType e = do content <- mconcat <$> mapM parseStyleLinkType (elContent e) notes <- gets fb2Notes - case T.pack <$> findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of + case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of Just href -> case findAttr (QName "type" Nothing Nothing) e of Just "note" -> case M.lookup href notes of Nothing -> pure $ link href "" content @@ -419,15 +421,14 @@ parseTable _ = pure mempty -- TODO: tables are not supported yet -- | Parse @title-infoType@ parseTitleInfoChild :: PandocMonad m => Element -> FB2 m () parseTitleInfoChild e = - case T.pack $ qName (elName e) of + case qName (elName e) of "genre" -> pure () "author" -> parseAuthor e >>= \author -> modify (\st -> st {fb2Authors = author:fb2Authors st}) - "book-title" -> modify (setMeta "title" (text $ T.pack $ strContent e)) + "book-title" -> modify (setMeta "title" (text $ strContent e)) "annotation" -> parseAnnotation e >>= modify . setMeta "abstract" "keywords" -> modify (setMeta "keywords" (map (MetaString . trim) $ T.splitOn "," - $ T.pack $ strContent e)) - "date" -> modify (setMeta "date" (text $ T.pack $ strContent e)) + "date" -> modify (setMeta "date" (text $ strContent e)) "coverpage" -> parseCoverPage e "lang" -> pure () "src-lang" -> pure () @@ -441,7 +442,7 @@ parseCoverPage e = Just img -> case href of Just src -> modify (setMeta "cover-image" (MetaString $ removeHash src)) Nothing -> pure () - where href = T.pack <$> findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) img + where href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) img Nothing -> pure () -- | Parse @inlineImageType@ element @@ -454,5 +455,5 @@ parseInlineImageElement e = Nothing -> do report $ IgnoredElement "inline image without href" pure mempty - where alt = maybe mempty (str . T.pack) $ findAttr (unqual "alt") e - href = T.pack <$> findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e + where alt = maybe mempty str $ findAttr (unqual "alt") e + href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs index dfd343b7a..5353f2001 100644 --- a/src/Text/Pandoc/Readers/JATS.hs +++ b/src/Text/Pandoc/Readers/JATS.hs @@ -16,7 +16,7 @@ module Text.Pandoc.Readers.JATS ( readJATS ) where import Control.Monad.State.Strict import Control.Monad.Except (throwError) import Text.Pandoc.Error (PandocError(..)) -import Data.Char (isDigit, isSpace, toUpper) +import Data.Char (isDigit, isSpace) import Data.Default import Data.Generics import Data.List (foldl', intersperse) @@ -31,8 +31,7 @@ import Text.Pandoc.Class.PandocMonad (PandocMonad) import Text.Pandoc.Options import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces) import Text.TeXMath (readMathML, writeTeX) -import Text.XML.Light -import Text.Pandoc.XMLParser (parseXMLContents) +import Text.Pandoc.XML.Light import qualified Data.Set as S (fromList, member) import Data.Set ((\\)) @@ -67,29 +66,29 @@ normalizeTree = everywhere (mkT go) where go :: [Content] -> [Content] go (Text (CData CDataRaw _ _):xs) = xs go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = - Text (CData CDataText (s1 ++ s2) z):xs + Text (CData CDataText (s1 <> s2) z):xs go (Text (CData CDataText s1 z):CRef r:xs) = - Text (CData CDataText (s1 ++ convertEntity r) z):xs + Text (CData CDataText (s1 <> convertEntity r) z):xs go (CRef r:Text (CData CDataText s1 z):xs) = - Text (CData CDataText (convertEntity r ++ s1) z):xs + Text (CData CDataText (convertEntity r <> s1) z):xs go (CRef r1:CRef r2:xs) = - Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs + Text (CData CDataText (convertEntity r1 <> convertEntity r2) Nothing):xs go xs = xs -convertEntity :: String -> String -convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) +convertEntity :: Text -> Text +convertEntity e = maybe (T.toUpper e) T.pack (lookupEntity $ T.unpack e) -- convenience function to get an attribute value, defaulting to "" -attrValue :: String -> Element -> Text +attrValue :: Text -> Element -> Text attrValue attr = fromMaybe "" . maybeAttrValue attr -maybeAttrValue :: String -> Element -> Maybe Text +maybeAttrValue :: Text -> Element -> Maybe Text maybeAttrValue attr elt = - T.pack <$> lookupAttrBy (\x -> qName x == attr) (elAttribs elt) + lookupAttrBy (\x -> qName x == attr) (elAttribs elt) -- convenience function -named :: String -> Element -> Bool +named :: Text -> Element -> Bool named s e = qName (elName e) == s -- @@ -155,10 +154,10 @@ getBlocks e = mconcat <$> parseBlock :: PandocMonad m => Content -> JATS m Blocks parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE -parseBlock (Text (CData _ s _)) = if all isSpace s +parseBlock (Text (CData _ s _)) = if T.all isSpace s then return mempty - else return $ plain $ trimInlines $ text $ T.pack s -parseBlock (CRef x) = return $ plain $ str $ T.toUpper $ T.pack x + else return $ plain $ trimInlines $ text s +parseBlock (CRef x) = return $ plain $ str $ T.toUpper x parseBlock (Elem e) = case qName (elName e) of "p" -> parseMixed para (elContent e) @@ -207,7 +206,7 @@ parseBlock (Elem e) = "" -> [] x -> [x] return $ codeBlockWith (attrValue "id" e, classes', []) - $ trimNl $ textContentRecursive e + $ trimNl $ strContentRecursive e parseBlockquote = do attrib <- case filterChild (named "attribution") e of Nothing -> return mempty @@ -271,7 +270,7 @@ parseBlock (Elem e) = Just "center" -> AlignCenter _ -> AlignDefault let toWidth c = do - w <- findAttrText (unqual "colwidth") c + w <- findAttr (unqual "colwidth") c n <- safeRead $ "0" <> T.filter (\x -> isDigit x || x == '.') w if n > 0 then Just n else Nothing let numrows = foldl' max 0 $ map length bodyrows @@ -442,16 +441,10 @@ parseRef e = do Nothing -> return $ Map.insert "id" (toMetaValue refId) mempty -- TODO handle mixed-citation -findAttrText :: QName -> Element -> Maybe Text -findAttrText x = fmap T.pack . findAttr x - textContent :: Element -> Text -textContent = T.pack . strContent - -textContentRecursive :: Element -> Text -textContentRecursive = T.pack . strContentRecursive +textContent = strContent -strContentRecursive :: Element -> String +strContentRecursive :: Element -> Text strContentRecursive = strContent . (\e' -> e'{ elContent = map elementToStr $ elContent e' }) @@ -460,9 +453,8 @@ elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing elementToStr x = x parseInline :: PandocMonad m => Content -> JATS m Inlines -parseInline (Text (CData _ s _)) = return $ text $ T.pack s -parseInline (CRef ref) = - return . text . maybe (T.toUpper $ T.pack ref) T.pack $ lookupEntity ref +parseInline (Text (CData _ s _)) = return $ text s +parseInline (CRef ref) = return . text . convertEntity $ ref parseInline (Elem e) = case qName (elName e) of "italic" -> innerInlines emph @@ -507,9 +499,9 @@ parseInline (Elem e) = else linkWith attr ("#" <> rid) "" ils "ext-link" -> do ils <- innerInlines id - let title = fromMaybe "" $ findAttrText (QName "title" (Just "http://www.w3.org/1999/xlink") Nothing) e + let title = fromMaybe "" $ findAttr (QName "title" (Just "http://www.w3.org/1999/xlink") Nothing) e let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of - Just h -> T.pack h + Just h -> h _ -> "#" <> attrValue "rid" e let ils' = if ils == mempty then str href else ils let attr = (attrValue "id" e, [], []) @@ -529,7 +521,7 @@ parseInline (Elem e) = where innerInlines f = extractSpaces f . mconcat <$> mapM parseInline (elContent e) mathML x = - case readMathML . T.pack . showElement $ everywhere (mkT removePrefix) x of + case readMathML . showElement $ everywhere (mkT removePrefix) x of Left _ -> mempty Right m -> writeTeX m formula constructor = do @@ -547,4 +539,4 @@ parseInline (Elem e) = let classes' = case attrValue "language" e of "" -> [] l -> [l] - return $ codeWith (attrValue "id" e,classes',[]) $ textContentRecursive e + return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e diff --git a/src/Text/Pandoc/Readers/OPML.hs b/src/Text/Pandoc/Readers/OPML.hs index bdadc4dd9..184d5a63f 100644 --- a/src/Text/Pandoc/Readers/OPML.hs +++ b/src/Text/Pandoc/Readers/OPML.hs @@ -13,7 +13,6 @@ Conversion of OPML to 'Pandoc' document. module Text.Pandoc.Readers.OPML ( readOPML ) where import Control.Monad.State.Strict -import Data.Char (toUpper) import Data.Default import Data.Generics import Data.Maybe (fromMaybe) @@ -28,8 +27,7 @@ import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.Readers.HTML (readHtml) import Text.Pandoc.Readers.Markdown (readMarkdown) import Text.Pandoc.Shared (crFilter, blocksToInlines') -import Text.XML.Light -import Text.Pandoc.XMLParser (parseXMLContents) +import Text.Pandoc.XML.Light import Control.Monad.Except (throwError) type OPML m = StateT OPMLState m @@ -69,25 +67,22 @@ normalizeTree = everywhere (mkT go) where go :: [Content] -> [Content] go (Text (CData CDataRaw _ _):xs) = xs go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = - Text (CData CDataText (s1 ++ s2) z):xs + Text (CData CDataText (s1 <> s2) z):xs go (Text (CData CDataText s1 z):CRef r:xs) = - Text (CData CDataText (s1 ++ convertEntity r) z):xs + Text (CData CDataText (s1 <> convertEntity r) z):xs go (CRef r:Text (CData CDataText s1 z):xs) = - Text (CData CDataText (convertEntity r ++ s1) z):xs + Text (CData CDataText (convertEntity r <> s1) z):xs go (CRef r1:CRef r2:xs) = - Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs + Text (CData CDataText (convertEntity r1 <> convertEntity r2) Nothing):xs go xs = xs -convertEntity :: String -> String -convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) +convertEntity :: Text -> Text +convertEntity e = maybe (T.toUpper e) T.pack (lookupEntity (T.unpack e)) -- convenience function to get an attribute value, defaulting to "" -attrValue :: String -> Element -> Text +attrValue :: Text -> Element -> Text attrValue attr elt = - maybe "" T.pack (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) - -textContent :: Element -> Text -textContent = T.pack . strContent + fromMaybe "" (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) -- exceptT :: PandocMonad m => Either PandocError a -> OPML m a -- exceptT = either throwError return @@ -111,11 +106,11 @@ parseBlock :: PandocMonad m => Content -> OPML m Blocks parseBlock (Elem e) = case qName (elName e) of "ownerName" -> mempty <$ modify (\st -> - st{opmlDocAuthors = [text $ textContent e]}) + st{opmlDocAuthors = [text $ strContent e]}) "dateModified" -> mempty <$ modify (\st -> - st{opmlDocDate = text $ textContent e}) + st{opmlDocDate = text $ strContent e}) "title" -> mempty <$ modify (\st -> - st{opmlDocTitle = text $ textContent e}) + st{opmlDocTitle = text $ strContent e}) "outline" -> gets opmlSectionLevel >>= sect . (+1) "?xml" -> return mempty _ -> getBlocks e diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs index 85308deb1..c274b6fd4 100644 --- a/src/Text/Pandoc/Readers/Odt.hs +++ b/src/Text/Pandoc/Readers/Odt.hs @@ -14,8 +14,7 @@ Entry point to the odt reader. module Text.Pandoc.Readers.Odt ( readOdt ) where import Codec.Archive.Zip -import qualified Text.XML.Light as XML -import Text.Pandoc.XMLParser (parseXMLElement) +import Text.Pandoc.XML.Light import qualified Data.ByteString.Lazy as B @@ -91,7 +90,7 @@ archiveToOdt archive = do -- -entryToXmlElem :: Entry -> Either PandocError XML.Element +entryToXmlElem :: Entry -> Either PandocError Element entryToXmlElem entry = case parseXMLElement . UTF8.toTextLazy . fromEntry $ entry of Right x -> Right x diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs index 43c44e7e9..df90880fa 100644 --- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs +++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs @@ -29,14 +29,14 @@ import Control.Monad ((<=<)) import qualified Data.ByteString.Lazy as B import Data.Foldable (fold) -import Data.List (find, stripPrefix) +import Data.List (find) import qualified Data.Map as M import qualified Data.Text as T import Data.Maybe import Data.Semigroup (First(..), Option(..)) import Text.TeXMath (readMathML, writeTeX) -import qualified Text.XML.Light as XML +import qualified Text.Pandoc.XML.Light as XML import Text.Pandoc.Builder hiding (underline) import Text.Pandoc.MediaBag (MediaBag, insertMedia) @@ -557,7 +557,7 @@ read_plain_text = fst ^&&& read_plain_text' >>% recover >>?% mappend -- extractText :: XML.Content -> Fallible T.Text - extractText (XML.Text cData) = succeedWith (T.pack $ XML.cdData cData) + extractText (XML.Text cData) = succeedWith (XML.cdData cData) extractText _ = failEmpty read_text_seq :: InlineMatcher @@ -777,14 +777,14 @@ read_frame_img = "" -> returnV mempty -< () src' -> do let exts = extensionsFromList [Ext_auto_identifiers] - resource <- lookupResource -< src' + resource <- lookupResource -< T.unpack src' _ <- updateMediaWithResource -< resource w <- findAttrText' NsSVG "width" -< () h <- findAttrText' NsSVG "height" -< () titleNodes <- matchChildContent' [ read_frame_title ] -< () alt <- matchChildContent [] read_plain_text -< () arr (firstMatch . uncurry4 imageWith) -< - (image_attributes w h, T.pack src', inlineListToIdentifier exts (toList titleNodes), alt) + (image_attributes w h, src', inlineListToIdentifier exts (toList titleNodes), alt) read_frame_title :: InlineMatcher read_frame_title = matchingElement NsSVG "title" (matchChildContent [] read_plain_text) @@ -804,7 +804,8 @@ read_frame_mathml = case fold src of "" -> returnV mempty -< () src' -> do - let path = fromMaybe src' (stripPrefix "./" src') ++ "/content.xml" + let path = T.unpack $ + fromMaybe src' (T.stripPrefix "./" src') <> "/content.xml" (_, mathml) <- lookupResource -< path case readMathML (UTF8.toText $ B.toStrict mathml) of Left _ -> returnV mempty -< () diff --git a/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs b/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs index 77174c793..78a7fc0b2 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs @@ -14,9 +14,10 @@ typesafe Haskell namespace identifiers and unsafe "real world" namespaces. module Text.Pandoc.Readers.Odt.Generic.Namespaces where import qualified Data.Map as M +import Data.Text (Text) -- -type NameSpaceIRI = String +type NameSpaceIRI = Text -- type NameSpaceIRIs nsID = M.Map nsID NameSpaceIRI diff --git a/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs b/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs index 6dc56a0d9..edefe3c70 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs @@ -20,7 +20,6 @@ module Text.Pandoc.Readers.Odt.Generic.Utils , reverseComposition , tryToRead , Lookupable(..) -, readLookupables , readLookupable , readPercent , findBy @@ -30,11 +29,11 @@ module Text.Pandoc.Readers.Odt.Generic.Utils import Control.Category (Category, (<<<), (>>>)) import qualified Control.Category as Cat (id) -import Control.Monad (msum) - +import Data.Char (isSpace) import qualified Data.Foldable as F (Foldable, foldr) import Data.Maybe - +import Data.Text (Text) +import qualified Data.Text as T -- | Equivalent to -- > foldr (.) id @@ -76,8 +75,8 @@ swing = flip.(.flip id) -- (nobody wants that) while the latter returns "to much" for simple purposes. -- This function instead applies 'reads' and returns the first match (if any) -- in a 'Maybe'. -tryToRead :: (Read r) => String -> Maybe r -tryToRead = reads >>> listToMaybe >>> fmap fst +tryToRead :: (Read r) => Text -> Maybe r +tryToRead = (reads . T.unpack) >>> listToMaybe >>> fmap fst -- | A version of 'reads' that requires a '%' sign after the number readPercent :: ReadS Int @@ -88,26 +87,12 @@ readPercent s = [ (i,s') | (i , r ) <- reads s -- | Data that can be looked up. -- This is mostly a utility to read data with kind *. class Lookupable a where - lookupTable :: [(String, a)] - --- | The idea is to use this function as if there was a declaration like --- --- > instance (Lookupable a) => (Read a) where --- > readsPrec _ = readLookupables --- . --- But including this code in this form would need UndecideableInstances. --- That is a bad idea. Luckily 'readLookupable' (without the s at the end) --- can be used directly in almost any case. -readLookupables :: (Lookupable a) => String -> [(a,String)] -readLookupables s = [ (a,rest) | (word,rest) <- lex s, - a <- maybeToList (lookup word lookupTable) - ] + lookupTable :: [(Text, a)] -- | Very similar to a simple 'lookup' in the 'lookupTable', but with a lexer. -readLookupable :: (Lookupable a) => String -> Maybe a -readLookupable s = msum - $ map ((`lookup` lookupTable).fst) - $ lex s +readLookupable :: (Lookupable a) => Text -> Maybe a +readLookupable s = + lookup (T.takeWhile (not . isSpace) $ T.dropWhile isSpace s) lookupTable uncurry3 :: (a->b->c -> z) -> (a,b,c ) -> z uncurry4 :: (a->b->c->d -> z) -> (a,b,c,d ) -> z diff --git a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs index 00c636a0d..0d921e23b 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE LambdaCase #-} @@ -60,11 +61,11 @@ import Control.Arrow import Data.Bool ( bool ) import Data.Either ( rights ) import qualified Data.Map as M -import qualified Data.Text as T +import Data.Text (Text) import Data.Default import Data.Maybe -import qualified Text.XML.Light as XML +import qualified Text.Pandoc.XML.Light as XML import Text.Pandoc.Readers.Odt.Arrows.State import Text.Pandoc.Readers.Odt.Arrows.Utils @@ -78,13 +79,13 @@ import Text.Pandoc.Readers.Odt.Generic.Fallible -------------------------------------------------------------------------------- -- -type ElementName = String -type AttributeName = String -type AttributeValue = String -type TextAttributeValue = T.Text +type ElementName = Text +type AttributeName = Text +type AttributeValue = Text +type TextAttributeValue = Text -- -type NameSpacePrefix = String +type NameSpacePrefix = Text -- type NameSpacePrefixes nsID = M.Map nsID NameSpacePrefix @@ -461,7 +462,7 @@ lookupDefaultingAttr :: (NameSpaceID nsID, Lookupable a, Default a) lookupDefaultingAttr nsID attrName = lookupAttrWithDefault nsID attrName def --- | Return value as a (Maybe String) +-- | Return value as a (Maybe Text) findAttr' :: (NameSpaceID nsID) => nsID -> AttributeName -> XMLConverter nsID extraState x (Maybe AttributeValue) @@ -477,7 +478,6 @@ findAttrText' nsID attrName = qualifyName nsID attrName &&& getCurrentElement >>% XML.findAttr - >>^ fmap T.pack -- | Return value as string or fail findAttr :: (NameSpaceID nsID) @@ -492,7 +492,6 @@ findAttrText :: (NameSpaceID nsID) -> FallibleXMLConverter nsID extraState x TextAttributeValue findAttrText nsID attrName = findAttr' nsID attrName - >>^ fmap T.pack >>> maybeToChoice -- | Return value as string or return provided default value @@ -511,7 +510,7 @@ findAttrTextWithDefault :: (NameSpaceID nsID) -> XMLConverter nsID extraState x TextAttributeValue findAttrTextWithDefault nsID attrName deflt = findAttr' nsID attrName - >>^ maybe deflt T.pack + >>^ fromMaybe deflt -- | Read and return value or fail readAttr :: (NameSpaceID nsID, Read attrValue) @@ -748,7 +747,7 @@ matchContent lookups fallback -- Internals -------------------------------------------------------------------------------- -stringToBool' :: String -> Maybe Bool +stringToBool' :: Text -> Maybe Bool stringToBool' val | val `elem` trueValues = Just True | val `elem` falseValues = Just False | otherwise = Nothing diff --git a/src/Text/Pandoc/Readers/Odt/Namespaces.hs b/src/Text/Pandoc/Readers/Odt/Namespaces.hs index 3a24a1162..70741c28d 100644 --- a/src/Text/Pandoc/Readers/Odt/Namespaces.hs +++ b/src/Text/Pandoc/Readers/Odt/Namespaces.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Reader.Odt.Namespaces Copyright : Copyright (C) 2015 Martin Linnemann @@ -13,10 +14,10 @@ Namespaces used in odt files. module Text.Pandoc.Readers.Odt.Namespaces ( Namespace (..) ) where -import Data.List (isPrefixOf) import qualified Data.Map as M (empty, insert) import Data.Maybe (fromMaybe, listToMaybe) - +import Data.Text (Text) +import qualified Data.Text as T import Text.Pandoc.Readers.Odt.Generic.Namespaces @@ -30,7 +31,7 @@ instance NameSpaceID Namespace where findID :: NameSpaceIRI -> Maybe Namespace -findID iri = listToMaybe [nsID | (iri',nsID) <- nsIDs, iri' `isPrefixOf` iri] +findID iri = listToMaybe [nsID | (iri',nsID) <- nsIDs, iri' `T.isPrefixOf` iri] nsIDmap :: NameSpaceIRIs Namespace nsIDmap = foldr (uncurry $ flip M.insert) M.empty nsIDs @@ -54,12 +55,12 @@ data Namespace = -- Open Document core -- Core XML (basically only for the 'id'-attribute) | NsXML -- Fallback - | NsOther String + | NsOther Text deriving ( Eq, Ord, Show ) -- | Not the actual iri's, but large prefixes of them - this way there are -- less versioning problems and the like. -nsIDs :: [(String,Namespace)] +nsIDs :: [(Text, Namespace)] nsIDs = [ ("urn:oasis:names:tc:opendocument:xmlns:animation" , NsAnim ), ("urn:oasis:names:tc:opendocument:xmlns:chart" , NsChart ), diff --git a/src/Text/Pandoc/Readers/Odt/StyleReader.hs b/src/Text/Pandoc/Readers/Odt/StyleReader.hs index 46a777df1..5e10f896c 100644 --- a/src/Text/Pandoc/Readers/Odt/StyleReader.hs +++ b/src/Text/Pandoc/Readers/Odt/StyleReader.hs @@ -2,6 +2,7 @@ {-# LANGUAGE Arrows #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE TupleSections #-} +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Odt.StyleReader Copyright : Copyright (C) 2015 Martin Linnemann @@ -46,11 +47,13 @@ import qualified Data.Foldable as F import Data.List (unfoldr) import qualified Data.Map as M import Data.Maybe +import Data.Text (Text) +import qualified Data.Text as T import qualified Data.Set as S -import qualified Text.XML.Light as XML +import qualified Text.Pandoc.XML.Light as XML -import Text.Pandoc.Shared (safeRead) +import Text.Pandoc.Shared (safeRead, tshow) import Text.Pandoc.Readers.Odt.Arrows.Utils @@ -90,7 +93,7 @@ instance Default FontPitch where -- -- Thus, we want -type FontFaceName = String +type FontFaceName = Text type FontPitches = M.Map FontFaceName FontPitch @@ -151,7 +154,7 @@ findPitch = ( lookupAttr NsStyle "font-pitch" -- Definitions of main data -------------------------------------------------------------------------------- -type StyleName = String +type StyleName = Text -- | There are two types of styles: named styles with a style family and an -- optional style parent, and default styles for each style family, @@ -355,8 +358,8 @@ getListLevelStyle level ListStyle{..} = -- \^ simpler, but in general less efficient data ListLevelStyle = ListLevelStyle { listLevelType :: ListLevelType - , listItemPrefix :: Maybe String - , listItemSuffix :: Maybe String + , listItemPrefix :: Maybe Text + , listItemSuffix :: Maybe Text , listItemFormat :: ListItemNumberFormat , listItemStart :: Int } @@ -366,9 +369,9 @@ instance Show ListLevelStyle where show ListLevelStyle{..} = " listItemPrefix) ++ show listItemFormat - ++ maybeToString listItemSuffix + ++ maybeToString (T.unpack <$> listItemSuffix) ++ ">" where maybeToString = fromMaybe "" @@ -471,7 +474,7 @@ readTextProperties = ) where isFontEmphasised = [("normal",False),("italic",True),("oblique",True)] isFontBold = ("normal",False):("bold",True) - :map ((,True).show) ([100,200..900]::[Int]) + :map ((,True) . tshow) ([100,200..900]::[Int]) readUnderlineMode :: StyleReaderSafe _x (Maybe UnderlineMode) readUnderlineMode = readLineMode "text-underline-mode" @@ -481,7 +484,7 @@ readStrikeThroughMode :: StyleReaderSafe _x (Maybe UnderlineMode) readStrikeThroughMode = readLineMode "text-line-through-mode" "text-line-through-style" -readLineMode :: String -> String -> StyleReaderSafe _x (Maybe UnderlineMode) +readLineMode :: Text -> Text -> StyleReaderSafe _x (Maybe UnderlineMode) readLineMode modeAttr styleAttr = proc x -> do isUL <- searchAttr NsStyle styleAttr False isLinePresent -< x mode <- lookupAttr' NsStyle modeAttr -< x -- cgit v1.2.3