From 840108a9c11850089a27a3b5458f8561ab1b6a2e Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Sun, 27 Jul 2014 15:11:18 -0400 Subject: Docx reader: Make metavalues out of styled paragraphs. This will make paragraphs styled with `Author`, `Title`, `Subtitle`, `Date`, and `Abstract` into pandoc metavalues, rather than text. The implementation only takes those elements from the beginning of the document (ignoring empty paragraphs). Multiple paragraphs in the `Author` style will be made into a metaList, one paragraph per item. Hard linebreaks (shift-return) in the paragraph will be maintained, and can be used for institution, email, etc. --- src/Text/Pandoc/Readers/Docx.hs | 84 ++++++++++++++++++++++++---- tests/Tests/Readers/Docx.hs | 15 +++++ tests/docx.metadata.docx | Bin 0 -> 40487 bytes tests/docx.metadata.native | 2 + tests/docx.metadata_after_normal.docx | Bin 0 -> 57273 bytes tests/docx.metadata_after_normal.native | 7 +++ tests/docx.metadata_author_linebreak.docx | Bin 0 -> 40481 bytes tests/docx.metadata_author_linebreak.native | 2 + 8 files changed, 99 insertions(+), 11 deletions(-) create mode 100644 tests/docx.metadata.docx create mode 100644 tests/docx.metadata.native create mode 100644 tests/docx.metadata_after_normal.docx create mode 100644 tests/docx.metadata_after_normal.native create mode 100644 tests/docx.metadata_author_linebreak.docx create mode 100644 tests/docx.metadata_author_linebreak.native diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 196a3cec5..48a23dd3c 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -87,7 +87,7 @@ import Text.Pandoc.Readers.Docx.Reducible import Text.Pandoc.Readers.Docx.TexChar import Text.Pandoc.Shared import Data.Maybe (mapMaybe, fromMaybe) -import Data.List (delete, isPrefixOf, (\\), intercalate) +import Data.List (delete, isPrefixOf, (\\), intercalate, intersect) import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as B import Data.ByteString.Base64 (encode) @@ -101,7 +101,8 @@ readDocx :: ReaderOptions -> Pandoc readDocx opts bytes = case archiveToDocx (toArchive bytes) of - Right docx -> Pandoc nullMeta (docxToBlocks opts docx) + Right docx -> Pandoc meta blks where + (meta, blks) = (docxToMetaAndBlocks opts docx) Left _ -> error $ "couldn't parse docx file" data DState = DState { docxAnchorMap :: M.Map String String @@ -134,6 +135,65 @@ spansToKeep = [] divsToKeep :: [String] divsToKeep = ["list-item", "Definition", "DefinitionTerm"] +metaStyles :: M.Map String String +metaStyles = M.fromList [ ("Title", "title") + , ("Subtitle", "subtitle") + , ("Author", "author") + , ("Date", "date") + , ("Abstract", "abstract")] + +sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart]) +sepBodyParts = span (\bp -> (isMetaPar bp || isEmptyPar bp)) + +isMetaPar :: BodyPart -> Bool +isMetaPar (Paragraph pPr _) = + not $ null $ intersect (pStyle pPr) (M.keys metaStyles) +isMetaPar _ = False + +isEmptyPar :: BodyPart -> Bool +isEmptyPar (Paragraph _ parParts) = + all isEmptyParPart parParts + where + isEmptyParPart (PlainRun (Run _ runElems)) = all isEmptyElem runElems + isEmptyParPart _ = False + isEmptyElem (TextRun s) = trim s == "" + isEmptyElem _ = True +isEmptyPar _ = False + +bodyPartsToMeta' :: [BodyPart] -> DocxContext (M.Map String MetaValue) +bodyPartsToMeta' [] = return M.empty +bodyPartsToMeta' (bp : bps) + | (Paragraph pPr parParts) <- bp + , (c : _)<- intersect (pStyle pPr) (M.keys metaStyles) + , (Just metaField) <- M.lookup c metaStyles = do + inlines <- parPartsToInlines parParts + remaining <- bodyPartsToMeta' bps + let + f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils'] + f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks ((Para ils) : blks) + f m (MetaList mv) = MetaList (m : mv) + f m n = MetaList [m, n] + return $ M.insertWith f metaField (MetaInlines inlines) remaining +bodyPartsToMeta' (_ : bps) = bodyPartsToMeta' bps + +bodyPartsToMeta :: [BodyPart] -> DocxContext Meta +bodyPartsToMeta bps = do + mp <- bodyPartsToMeta' bps + let mp' = + case M.lookup "author" mp of + Just mv -> M.insert "author" (fixAuthors mv) mp + Nothing -> mp + return $ Meta mp' + +fixAuthors :: MetaValue -> MetaValue +fixAuthors (MetaBlocks blks) = + MetaList $ map g $ filter f blks + where f (Para _) = True + f _ = False + g (Para ils) = MetaInlines ils + g _ = MetaInlines [] +fixAuthors mv = mv + runStyleToContainers :: RunStyle -> [Container Inline] runStyleToContainers rPr = let spanClassToContainers :: String -> [Container Inline] @@ -615,24 +675,26 @@ rewriteLink l@(Link ils ('#':target, title)) = do Nothing -> l rewriteLink il = return il - -bodyToBlocks :: Body -> DocxContext [Block] -bodyToBlocks (Body bps) = do - blks <- concatMapM bodyPartToBlocks bps >>= +bodyToMetaAndBlocks :: Body -> DocxContext (Meta, [Block]) +bodyToMetaAndBlocks (Body bps) = do + let (metabps, blkbps) = sepBodyParts bps + meta <- bodyPartsToMeta metabps + blks <- concatMapM bodyPartToBlocks blkbps >>= walkM rewriteLink return $ - blocksToDefinitions $ - blocksToBullets $ blks + (meta, + blocksToDefinitions $ + blocksToBullets $ blks) -docxToBlocks :: ReaderOptions -> Docx -> [Block] -docxToBlocks opts d@(Docx (Document _ body)) = +docxToMetaAndBlocks :: ReaderOptions -> Docx -> (Meta, [Block]) +docxToMetaAndBlocks opts d@(Docx (Document _ body)) = let dState = DState { docxAnchorMap = M.empty , docxInHeaderBlock = False , docxInTexSubscript = False} dEnv = DEnv { docxOptions = opts , docxDocument = d} in - evalDocxContext (bodyToBlocks body) dEnv dState + evalDocxContext (bodyToMetaAndBlocks body) dEnv dState ilToCode :: Inline -> String ilToCode (Str s) = s diff --git a/tests/Tests/Readers/Docx.hs b/tests/Tests/Readers/Docx.hs index 7b737f95a..8ad9e08ba 100644 --- a/tests/Tests/Readers/Docx.hs +++ b/tests/Tests/Readers/Docx.hs @@ -164,5 +164,20 @@ tests = [ testGroup "inlines" "docx.track_changes_deletion.docx" "docx.track_changes_deletion_all.native" ] + , testGroup "metadata" + [ testCompareWithOpts def{readerStandalone=True} + "metadata fields" + "docx.metadata.docx" + "docx.metadata.native" + , testCompareWithOpts def{readerStandalone=True} + "linebreak between authors" + "docx.metadata_author_linebreak.docx" + "docx.metadata_author_linebreak.native" + , testCompareWithOpts def{readerStandalone=True} + "stop recording metadata with normal text" + "docx.metadata_after_normal.docx" + "docx.metadata_after_normal.native" + ] + ] diff --git a/tests/docx.metadata.docx b/tests/docx.metadata.docx new file mode 100644 index 000000000..34182a87e Binary files /dev/null and b/tests/docx.metadata.docx differ diff --git a/tests/docx.metadata.native b/tests/docx.metadata.native new file mode 100644 index 000000000..ed7ba63cf --- /dev/null +++ b/tests/docx.metadata.native @@ -0,0 +1,2 @@ +Pandoc (Meta {unMeta = fromList [("abstract",MetaInlines [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]),("author",MetaList [MetaInlines [Str "Mary",Space,Str "Ann",Space,Str "Evans"],MetaInlines [Str "Aurore",Space,Str "Dupin"]]),("date",MetaInlines [Str "July",Space,Str "28,",Space,Str "2014"]),("title",MetaInlines [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"])]}) +[Para [Str "And",Space,Str "now",Space,Str "this",Space,Str "is",Space,Str "normal",Space,Str "text."]] diff --git a/tests/docx.metadata_after_normal.docx b/tests/docx.metadata_after_normal.docx new file mode 100644 index 000000000..16b8d583c Binary files /dev/null and b/tests/docx.metadata_after_normal.docx differ diff --git a/tests/docx.metadata_after_normal.native b/tests/docx.metadata_after_normal.native new file mode 100644 index 000000000..f0e31f8da --- /dev/null +++ b/tests/docx.metadata_after_normal.native @@ -0,0 +1,7 @@ +Pandoc (Meta {unMeta = fromList [("abstract",MetaInlines [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]),("author",MetaList [MetaInlines [Str "Mary",Space,Str "Ann",Space,Str "Evans"],MetaInlines [Str "Aurore",Space,Str "Dupin"]]),("date",MetaInlines [Str "July",Space,Str "28,",Space,Str "2014"]),("title",MetaInlines [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"])]}) +[Para [Str "And",Space,Str "now",Space,Str "this",Space,Str "is",Space,Str "normal",Space,Str "text."] +,Para [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"] +,Para [Str "Mary",Space,Str "Ann",Space,Str "Evans"] +,Para [Str "Aurore",Space,Str "Dupin"] +,Para [Str "July",Space,Str "28,",Space,Str "2014"] +,Para [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]] diff --git a/tests/docx.metadata_author_linebreak.docx b/tests/docx.metadata_author_linebreak.docx new file mode 100644 index 000000000..94f0e0753 Binary files /dev/null and b/tests/docx.metadata_author_linebreak.docx differ diff --git a/tests/docx.metadata_author_linebreak.native b/tests/docx.metadata_author_linebreak.native new file mode 100644 index 000000000..ed7ba63cf --- /dev/null +++ b/tests/docx.metadata_author_linebreak.native @@ -0,0 +1,2 @@ +Pandoc (Meta {unMeta = fromList [("abstract",MetaInlines [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]),("author",MetaList [MetaInlines [Str "Mary",Space,Str "Ann",Space,Str "Evans"],MetaInlines [Str "Aurore",Space,Str "Dupin"]]),("date",MetaInlines [Str "July",Space,Str "28,",Space,Str "2014"]),("title",MetaInlines [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"])]}) +[Para [Str "And",Space,Str "now",Space,Str "this",Space,Str "is",Space,Str "normal",Space,Str "text."]] -- cgit v1.2.3 From 54708da371e767cd42598ea8f7fbd7d45c57421b Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Tue, 29 Jul 2014 09:26:18 -0400 Subject: Add and update docx tests in pandoc.cabal. --- pandoc.cabal | 12 ++++++++++++ tests/Tests/Readers/Docx.hs | 4 ---- tests/docx.metadata.docx | Bin 40487 -> 39538 bytes tests/docx.metadata_after_normal.docx | Bin 57273 -> 56276 bytes tests/docx.metadata_author_linebreak.docx | Bin 40481 -> 0 bytes tests/docx.metadata_author_linebreak.native | 2 -- 6 files changed, 12 insertions(+), 6 deletions(-) delete mode 100644 tests/docx.metadata_author_linebreak.docx delete mode 100644 tests/docx.metadata_author_linebreak.native diff --git a/pandoc.cabal b/pandoc.cabal index 6597b27ed..9249723ff 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -172,13 +172,25 @@ Extra-Source-Files: tests/fb2.test.jpg, tests/docx.already_auto_ident.docx, tests/docx.block_quotes.docx, + tests/docx.codeblock.docx, + tests/docx.deep_normalize.docx, + tests/docx.definition_list.docx, + tests/docx.hanging_indent.docx, tests/docx.headers.docx, tests/docx.image.docx, + tests/docx.inline_code.docx, tests/docx.inline_formatting.docx, tests/docx.links.docx, tests/docx.lists.docx, + tests/docx.metadata.docx, + tests/docx.metadata_after_normal.docx, + tests/docx.normalize.docx, tests/docx.notes.docx, tests/docx.tables.docx, + tests/docx.tabs.docx, + tests/docx.track_changes_deletion.docx, + tests/docx.track_changes_insertion.docx, + tests/docx.trailing_spaces_in_formatting.docx, tests/docx.unicode.docx, tests/*.native, tests/txt2tags.t2t diff --git a/tests/Tests/Readers/Docx.hs b/tests/Tests/Readers/Docx.hs index 8ad9e08ba..b1a966969 100644 --- a/tests/Tests/Readers/Docx.hs +++ b/tests/Tests/Readers/Docx.hs @@ -169,10 +169,6 @@ tests = [ testGroup "inlines" "metadata fields" "docx.metadata.docx" "docx.metadata.native" - , testCompareWithOpts def{readerStandalone=True} - "linebreak between authors" - "docx.metadata_author_linebreak.docx" - "docx.metadata_author_linebreak.native" , testCompareWithOpts def{readerStandalone=True} "stop recording metadata with normal text" "docx.metadata_after_normal.docx" diff --git a/tests/docx.metadata.docx b/tests/docx.metadata.docx index 34182a87e..ccf50b475 100644 Binary files a/tests/docx.metadata.docx and b/tests/docx.metadata.docx differ diff --git a/tests/docx.metadata_after_normal.docx b/tests/docx.metadata_after_normal.docx index 16b8d583c..b94a016cb 100644 Binary files a/tests/docx.metadata_after_normal.docx and b/tests/docx.metadata_after_normal.docx differ diff --git a/tests/docx.metadata_author_linebreak.docx b/tests/docx.metadata_author_linebreak.docx deleted file mode 100644 index 94f0e0753..000000000 Binary files a/tests/docx.metadata_author_linebreak.docx and /dev/null differ diff --git a/tests/docx.metadata_author_linebreak.native b/tests/docx.metadata_author_linebreak.native deleted file mode 100644 index ed7ba63cf..000000000 --- a/tests/docx.metadata_author_linebreak.native +++ /dev/null @@ -1,2 +0,0 @@ -Pandoc (Meta {unMeta = fromList [("abstract",MetaInlines [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]),("author",MetaList [MetaInlines [Str "Mary",Space,Str "Ann",Space,Str "Evans"],MetaInlines [Str "Aurore",Space,Str "Dupin"]]),("date",MetaInlines [Str "July",Space,Str "28,",Space,Str "2014"]),("title",MetaInlines [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"])]}) -[Para [Str "And",Space,Str "now",Space,Str "this",Space,Str "is",Space,Str "normal",Space,Str "text."]] -- cgit v1.2.3