From d4748038d76dd430f7c760e73487172a3264c5de Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Tue, 12 Aug 2014 22:04:07 -0400 Subject: Docx Reader: Fix font style parsing. Before we just checked for the existence of a tag. Now, we make sure to check for its on/off value. --- src/Text/Pandoc/Readers/Docx/Parse.hs | 39 ++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 12 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 175bf2784..32b0f8d93 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -652,30 +652,45 @@ elemToParagraphStyle ns element } elemToParagraphStyle _ _ = defaultParagraphStyle +checkOnOff :: NameSpaces -> Element -> QName -> Bool +checkOnOff ns rPr tag + | Just t <- findChild tag rPr + , Just val <- findAttr (elemName ns "w" "val") t = + case val of + "true" -> True + "false" -> False + "on" -> True + "off" -> False + "1" -> True + "0" -> False + _ -> False + | Just _ <- findChild tag rPr = True +checkOnOff _ _ _ = False + elemToRunStyle :: NameSpaces -> Element -> RunStyle elemToRunStyle ns element | Just rPr <- findChild (elemName ns "w" "rPr") element = RunStyle { - isBold = isJust $ findChild (QName "b" (lookup "w" ns) (Just "w")) rPr - , isItalic = isJust $ findChild (QName "i" (lookup "w" ns) (Just "w")) rPr - , isSmallCaps = isJust $ findChild (QName "smallCaps" (lookup "w" ns) (Just "w")) rPr - , isStrike = isJust $ findChild (QName "strike" (lookup "w" ns) (Just "w")) rPr + isBold = checkOnOff ns rPr (elemName ns "w" "b") + , isItalic = checkOnOff ns rPr (elemName ns "w" "i") + , isSmallCaps = checkOnOff ns rPr (elemName ns "w" "smallCaps") + , isStrike = checkOnOff ns rPr (elemName ns "w" "strike") , isSuperScript = (Just "superscript" == - (findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>= - findAttr (QName "val" (lookup "w" ns) (Just "w")))) + (findChild (elemName ns "w" 
"vertAlign") rPr >>= + findAttr (elemName ns "w" "val"))) , isSubScript = (Just "subscript" == - (findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>= - findAttr (QName "val" (lookup "w" ns) (Just "w")))) + (findChild (elemName ns "w" "vertAlign") rPr >>= + findAttr (elemName ns "w" "val"))) , rUnderline = - findChild (QName "u" (lookup "w" ns) (Just "w")) rPr >>= - findAttr (QName "val" (lookup "w" ns) (Just "w")) + findChild (elemName ns "w" "u") rPr >>= + findAttr (elemName ns "w" "val") , rStyle = - findChild (QName "rStyle" (lookup "w" ns) (Just "w")) rPr >>= - findAttr (QName "val" (lookup "w" ns) (Just "w")) + findChild (elemName ns "w" "rStyle") rPr >>= + findAttr (elemName ns "w" "val") } elemToRunStyle _ _ = defaultRunStyle -- cgit v1.2.3 From aae71ad595f78f6cb7dd1cc5cb0aaef0d3aaf5f1 Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Tue, 12 Aug 2014 22:08:30 -0400 Subject: Docx reader: Add "BlockQuotation" to divs list. --- src/Text/Pandoc/Readers/Docx.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 9943ebeb8..bcfa4082e 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -272,7 +272,7 @@ codeSpans :: [String] codeSpans = ["VerbatimChar"] blockQuoteDivs :: [String] -blockQuoteDivs = ["Quote", "BlockQuote"] +blockQuoteDivs = ["Quote", "BlockQuote", "BlockQuotation"] codeDivs :: [String] codeDivs = ["SourceCode"] -- cgit v1.2.3 From 194ed8885236d8446b34f44ecf16d4fa9e5c5cbe Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Tue, 12 Aug 2014 22:15:09 -0400 Subject: Docx reader: accept explicit "Italic" and "Bold" rStyles. Note that "Italic" can be on, and, from the last commit, `<w:i>` can be present, but be turned off. In that case, the turned-off tag takes precedence. So, we have to distinguish between something being off and something not being there. 
Hence, isItalic, isBold, isStrike, and isSmallCaps have become Maybes. --- src/Text/Pandoc/Readers/Docx.hs | 23 ++++++++++++++++++----- src/Text/Pandoc/Readers/Docx/Parse.hs | 26 +++++++++++++------------- 2 files changed, 31 insertions(+), 18 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index bcfa4082e..2c436f76f 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -202,17 +202,30 @@ runStyleToContainers rPr = [Container $ (\ils -> Code ("", [], []) (concatMap ilToCode ils))] spanClassToContainers s | s `elem` spansToKeep = [Container $ Span ("", [s], [])] - spanClassToContainers _ = [] + spanClassToContainers _ = [] classContainers = case rStyle rPr of Nothing -> [] Just s -> spanClassToContainers s + resolveFmt :: Bool -> Maybe Bool -> Bool + resolveFmt _ (Just True) = True + resolveFmt _ (Just False) = False + resolveFmt bool Nothing = bool + formatters = map Container $ mapMaybe id - [ if isBold rPr then (Just Strong) else Nothing - , if isItalic rPr then (Just Emph) else Nothing - , if isSmallCaps rPr then (Just SmallCaps) else Nothing - , if isStrike rPr then (Just Strikeout) else Nothing + [ if resolveFmt (rStyle rPr == Just "Bold") (isBold rPr) + then (Just Strong) + else Nothing + , if resolveFmt (rStyle rPr == Just "Italic") (isItalic rPr) + then (Just Emph) + else Nothing + , if resolveFmt False (isSmallCaps rPr) + then (Just SmallCaps) + else Nothing + , if resolveFmt False (isStrike rPr) + then (Just Strikeout) + else Nothing , if isSuperScript rPr then (Just Superscript) else Nothing , if isSubScript rPr then (Just Subscript) else Nothing , rUnderline rPr >>= diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 32b0f8d93..939fcde27 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -196,10 +196,10 @@ data Run = Run RunStyle [RunElem] data RunElem = TextRun String 
| LnBrk | Tab deriving Show -data RunStyle = RunStyle { isBold :: Bool - , isItalic :: Bool - , isSmallCaps :: Bool - , isStrike :: Bool +data RunStyle = RunStyle { isBold :: Maybe Bool + , isItalic :: Maybe Bool + , isSmallCaps :: Maybe Bool + , isStrike :: Maybe Bool , isSuperScript :: Bool , isSubScript :: Bool , rUnderline :: Maybe String @@ -207,10 +207,10 @@ data RunStyle = RunStyle { isBold :: Bool deriving Show defaultRunStyle :: RunStyle -defaultRunStyle = RunStyle { isBold = False - , isItalic = False - , isSmallCaps = False - , isStrike = False +defaultRunStyle = RunStyle { isBold = Nothing + , isItalic = Nothing + , isSmallCaps = Nothing + , isStrike = Nothing , isSuperScript = False , isSubScript = False , rUnderline = Nothing @@ -652,20 +652,20 @@ elemToParagraphStyle ns element } elemToParagraphStyle _ _ = defaultParagraphStyle -checkOnOff :: NameSpaces -> Element -> QName -> Bool +checkOnOff :: NameSpaces -> Element -> QName -> Maybe Bool checkOnOff ns rPr tag | Just t <- findChild tag rPr , Just val <- findAttr (elemName ns "w" "val") t = - case val of - "true" -> True + Just $ case val of + "true" -> True "false" -> False "on" -> True "off" -> False "1" -> True "0" -> False _ -> False - | Just _ <- findChild tag rPr = True -checkOnOff _ _ _ = False + | Just _ <- findChild tag rPr = Just True +checkOnOff _ _ _ = Nothing elemToRunStyle :: NameSpaces -> Element -> RunStyle -- cgit v1.2.3 From 85579052b5bb7196b62e9d9cd70e164498e49f6c Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Tue, 12 Aug 2014 23:33:03 -0400 Subject: Docx reader: Check for null-id'd anchors too. Otherwise they get left dangling in the document. 
--- src/Text/Pandoc/Readers/Docx.hs | 1 - 1 file changed, 1 deletion(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 2c436f76f..346d54bbe 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -403,7 +403,6 @@ parPartToInlines (PlainOMath exps) = do isAnchorSpan :: Inline -> Bool isAnchorSpan (Span (ident, classes, kvs) ils) = - (not . null) ident && classes == ["anchor"] && null kvs && null ils -- cgit v1.2.3 From 378a795eaae7176426080c4164a66b33d511f87f Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Tue, 12 Aug 2014 23:34:45 -0400 Subject: Docx: More robust handling of multiple bookmarks in header. --- src/Text/Pandoc/Readers/Docx.hs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 346d54bbe..28f49251c 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -402,7 +402,7 @@ parPartToInlines (PlainOMath exps) = do isAnchorSpan :: Inline -> Bool -isAnchorSpan (Span (ident, classes, kvs) ils) = +isAnchorSpan (Span (_, classes, kvs) ils) = classes == ["anchor"] && null kvs && null ils @@ -415,14 +415,16 @@ makeHeaderAnchor :: Block -> DocxContext Block -- If there is an anchor already there (an anchor span in the header, -- to be exact), we rename and associate the new id with the old one. 
makeHeaderAnchor (Header n (_, classes, kvs) ils) - | (x : xs) <- filter isAnchorSpan ils - , (Span (ident, _, _) _) <- x - , notElem ident dummyAnchors = + | xs <- filter isAnchorSpan ils + , idents <- filter (\i -> notElem i dummyAnchors) $ + map (\(Span (ident, _, _) _) -> ident) xs + , not $ null idents = do hdrIDMap <- gets docxAnchorMap let newIdent = uniqueIdent ils (M.elems hdrIDMap) - modify $ \s -> s {docxAnchorMap = M.insert ident newIdent hdrIDMap} - return $ Header n (newIdent, classes, kvs) (ils \\ (x:xs)) + newMap = M.fromList $ map (\i -> (i, newIdent)) idents + modify $ \s -> s {docxAnchorMap = M.union newMap hdrIDMap} + return $ Header n (newIdent, classes, kvs) (ils \\ xs) -- Otherwise we just give it a name, and register that name (associate -- it with itself.) makeHeaderAnchor (Header n (_, classes, kvs) ils) = -- cgit v1.2.3 From dca55630e641905d5447b9468d0953227f9e704a Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Tue, 12 Aug 2014 23:40:51 -0400 Subject: Docx Reader: Trim line breaks from the beginning and end of Section Headers. We might also want to do this elsewhere (for pars, for example). 
--- data/templates | 2 +- src/Text/Pandoc/Readers/Docx.hs | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/data/templates b/data/templates index 095196e8d..3befef257 160000 --- a/data/templates +++ b/data/templates @@ -1 +1 @@ -Subproject commit 095196e8d6e873ee36846ca120bf5dfd39e30a8c +Subproject commit 3befef257ce461ae68760004df938f3ca8397b31 diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 28f49251c..dcc2122bd 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -455,6 +455,13 @@ isHeaderContainer :: Container Block -> Bool isHeaderContainer (Container f) | Header _ _ _ <- f [] = True isHeaderContainer _ = False +trimLineBreaks :: [Inline] -> [Inline] +trimLineBreaks [] = [] +trimLineBreaks (LineBreak : ils) = trimLineBreaks ils +trimLineBreaks ils + | (LineBreak : ils') <- reverse ils = trimLineBreaks (reverse ils') +trimLineBreaks ils = ils + bodyPartToBlocks :: BodyPart -> DocxContext [Block] bodyPartToBlocks (Paragraph pPr parparts) | any isBlockCodeContainer (parStyleToContainers pPr) = @@ -467,8 +474,9 @@ bodyPartToBlocks (Paragraph pPr parparts) [CodeBlock ("", [], []) (concatMap parPartToString parparts)] bodyPartToBlocks (Paragraph pPr parparts) | any isHeaderContainer (parStyleToContainers pPr) = do - ils <- normalizeSpaces <$> local (\s -> s{docxInHeaderBlock = True}) - (parPartsToInlines parparts) + ils <- (trimLineBreaks . 
normalizeSpaces) <$> + local (\s -> s{docxInHeaderBlock = True}) + (parPartsToInlines parparts) let (Container hdrFun) = head $ filter isHeaderContainer (parStyleToContainers pPr) Header n attr _ = hdrFun [] hdr <- makeHeaderAnchor $ Header n attr ils -- cgit v1.2.3 From a1320a76f9dad0e23118e67335206c87608e9f8f Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Wed, 13 Aug 2014 00:08:01 -0400 Subject: Docx: Reducible forgot about smallcaps --- src/Text/Pandoc/Readers/Docx/Reducible.hs | 2 ++ 1 file changed, 2 insertions(+) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx/Reducible.hs b/src/Text/Pandoc/Readers/Docx/Reducible.hs index 39a93d988..80a0cee17 100644 --- a/src/Text/Pandoc/Readers/Docx/Reducible.hs +++ b/src/Text/Pandoc/Readers/Docx/Reducible.hs @@ -138,6 +138,7 @@ instance Reducible Inline where container (Emph _) = Container Emph container (Strong _) = Container Strong + container (SmallCaps _) = Container SmallCaps container (Strikeout _) = Container Strikeout container (Subscript _) = Container Subscript container (Superscript _) = Container Superscript @@ -147,6 +148,7 @@ instance Reducible Inline where container _ = NullContainer innards (Emph ils) = ils + innards (SmallCaps ils) = ils innards (Strong ils) = ils innards (Strikeout ils) = ils innards (Subscript ils) = ils -- cgit v1.2.3