diff options
| -rw-r--r-- | pandoc.cabal | 12 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 84 | ||||
| -rw-r--r-- | tests/Tests/Readers/Docx.hs | 11 | ||||
| -rw-r--r-- | tests/docx.metadata.docx | bin | 0 -> 39538 bytes | |||
| -rw-r--r-- | tests/docx.metadata.native | 2 | ||||
| -rw-r--r-- | tests/docx.metadata_after_normal.docx | bin | 0 -> 56276 bytes | |||
| -rw-r--r-- | tests/docx.metadata_after_normal.native | 7 | 
7 files changed, 105 insertions, 11 deletions
| diff --git a/pandoc.cabal b/pandoc.cabal index 6597b27ed..9249723ff 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -172,13 +172,25 @@ Extra-Source-Files:                   tests/fb2.test.jpg,                   tests/docx.already_auto_ident.docx,                   tests/docx.block_quotes.docx, +                 tests/docx.codeblock.docx, +                 tests/docx.deep_normalize.docx, +                 tests/docx.definition_list.docx, +                 tests/docx.hanging_indent.docx,                   tests/docx.headers.docx,                   tests/docx.image.docx, +                 tests/docx.inline_code.docx,                   tests/docx.inline_formatting.docx,                   tests/docx.links.docx,                   tests/docx.lists.docx, +                 tests/docx.metadata.docx, +                 tests/docx.metadata_after_normal.docx, +                 tests/docx.normalize.docx,                   tests/docx.notes.docx,                   tests/docx.tables.docx, +                 tests/docx.tabs.docx, +                 tests/docx.track_changes_deletion.docx, +                 tests/docx.track_changes_insertion.docx, +                 tests/docx.trailing_spaces_in_formatting.docx,                   tests/docx.unicode.docx,                   tests/*.native,                   tests/txt2tags.t2t diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 196a3cec5..48a23dd3c 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -87,7 +87,7 @@ import Text.Pandoc.Readers.Docx.Reducible  import Text.Pandoc.Readers.Docx.TexChar  import Text.Pandoc.Shared  import Data.Maybe (mapMaybe, fromMaybe) -import Data.List (delete, isPrefixOf, (\\), intercalate) +import Data.List (delete, isPrefixOf, (\\), intercalate, intersect)  import qualified Data.ByteString as BS  import qualified Data.ByteString.Lazy as B  import Data.ByteString.Base64 (encode) @@ -101,7 +101,8 @@ readDocx :: ReaderOptions           -> Pandoc  readDocx opts bytes =    case archiveToDocx (toArchive bytes) of -    Right docx -> Pandoc nullMeta (docxToBlocks opts docx) +    Right docx -> Pandoc meta blks where +      (meta, blks) = (docxToMetaAndBlocks opts docx)      Left _   -> error $ "couldn't parse docx file"  data DState = DState { docxAnchorMap :: M.Map String String @@ -134,6 +135,65 @@ spansToKeep = []  divsToKeep :: [String]  divsToKeep = ["list-item", "Definition", "DefinitionTerm"] +metaStyles :: M.Map String String +metaStyles = M.fromList [ ("Title", "title") +                        , ("Subtitle", "subtitle") +                        , ("Author", "author") +                        , ("Date", "date") +                        , ("Abstract", "abstract")] + +sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart]) +sepBodyParts = span (\bp -> (isMetaPar bp || isEmptyPar bp)) + +isMetaPar :: BodyPart -> Bool +isMetaPar (Paragraph pPr _) = +  not $ null $ intersect (pStyle pPr) (M.keys metaStyles) +isMetaPar _ = False + +isEmptyPar :: BodyPart -> Bool +isEmptyPar (Paragraph _ parParts) = +  all isEmptyParPart parParts +  where +    isEmptyParPart (PlainRun (Run _ runElems)) = all isEmptyElem runElems +    isEmptyParPart _ = False +    isEmptyElem (TextRun s) = trim s == "" +    isEmptyElem _           = True +isEmptyPar _ = False +   +bodyPartsToMeta' :: [BodyPart] -> DocxContext (M.Map String MetaValue) +bodyPartsToMeta' [] = return M.empty +bodyPartsToMeta' (bp : bps) +  | (Paragraph pPr parParts) <- bp +  , (c : _)<- intersect (pStyle pPr) (M.keys metaStyles) +  , (Just metaField) <- M.lookup c metaStyles = do +    inlines <- parPartsToInlines parParts +    remaining <- bodyPartsToMeta' bps +    let   +      f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils'] +      f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks ((Para ils) : blks) +      f m (MetaList mv) = MetaList (m : mv) +      f m n             = MetaList [m, n] +    return $ M.insertWith f metaField (MetaInlines inlines) remaining +bodyPartsToMeta' (_ : bps) = bodyPartsToMeta' bps + +bodyPartsToMeta :: [BodyPart] -> DocxContext Meta +bodyPartsToMeta bps = do +  mp <- bodyPartsToMeta' bps +  let mp' = +        case M.lookup "author" mp of +          Just mv -> M.insert "author" (fixAuthors mv) mp +          Nothing -> mp +  return $ Meta mp' + +fixAuthors :: MetaValue -> MetaValue +fixAuthors (MetaBlocks blks) = +  MetaList $ map g $ filter f blks +    where f (Para _) = True +          f _          = False +          g (Para ils) = MetaInlines ils +          g _          = MetaInlines [] +fixAuthors mv = mv +  runStyleToContainers :: RunStyle -> [Container Inline]  runStyleToContainers rPr =    let spanClassToContainers :: String -> [Container Inline] @@ -615,24 +675,26 @@ rewriteLink l@(Link ils ('#':target, title)) = do      Nothing        -> l  rewriteLink il = return il - -bodyToBlocks :: Body -> DocxContext [Block] -bodyToBlocks (Body bps) = do -  blks <- concatMapM bodyPartToBlocks bps >>= +bodyToMetaAndBlocks :: Body -> DocxContext (Meta, [Block]) +bodyToMetaAndBlocks (Body bps) = do +  let (metabps, blkbps) = sepBodyParts bps +  meta <- bodyPartsToMeta metabps +  blks <- concatMapM bodyPartToBlocks blkbps >>=            walkM rewriteLink    return $ -    blocksToDefinitions $ -    blocksToBullets $ blks +    (meta, +     blocksToDefinitions $ +     blocksToBullets $ blks) -docxToBlocks :: ReaderOptions -> Docx -> [Block] -docxToBlocks opts d@(Docx (Document _ body)) = +docxToMetaAndBlocks :: ReaderOptions -> Docx -> (Meta, [Block]) +docxToMetaAndBlocks opts d@(Docx (Document _ body)) =    let dState = DState { docxAnchorMap = M.empty                        , docxInHeaderBlock = False                        , docxInTexSubscript = False}        dEnv   = DEnv { docxOptions  = opts                      , docxDocument = d}    in -   evalDocxContext (bodyToBlocks body) dEnv dState +   evalDocxContext (bodyToMetaAndBlocks body) dEnv dState  ilToCode :: Inline -> String  ilToCode (Str s) = s diff --git a/tests/Tests/Readers/Docx.hs b/tests/Tests/Readers/Docx.hs index 7b737f95a..b1a966969 100644 --- a/tests/Tests/Readers/Docx.hs +++ b/tests/Tests/Readers/Docx.hs @@ -164,5 +164,16 @@ tests = [ testGroup "inlines"              "docx.track_changes_deletion.docx"              "docx.track_changes_deletion_all.native"            ] +        , testGroup "metadata" +          [ testCompareWithOpts def{readerStandalone=True} +            "metadata fields" +            "docx.metadata.docx" +            "docx.metadata.native" +          , testCompareWithOpts def{readerStandalone=True} +            "stop recording metadata with normal text" +            "docx.metadata_after_normal.docx" +            "docx.metadata_after_normal.native" +          ] +          ] diff --git a/tests/docx.metadata.docx b/tests/docx.metadata.docxBinary files differ new file mode 100644 index 000000000..ccf50b475 --- /dev/null +++ b/tests/docx.metadata.docx diff --git a/tests/docx.metadata.native b/tests/docx.metadata.native new file mode 100644 index 000000000..ed7ba63cf --- /dev/null +++ b/tests/docx.metadata.native @@ -0,0 +1,2 @@ +Pandoc (Meta {unMeta = fromList [("abstract",MetaInlines [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]),("author",MetaList [MetaInlines [Str "Mary",Space,Str "Ann",Space,Str "Evans"],MetaInlines [Str "Aurore",Space,Str "Dupin"]]),("date",MetaInlines [Str "July",Space,Str "28,",Space,Str "2014"]),("title",MetaInlines [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"])]}) +[Para [Str "And",Space,Str "now",Space,Str "this",Space,Str "is",Space,Str "normal",Space,Str "text."]] diff --git a/tests/docx.metadata_after_normal.docx b/tests/docx.metadata_after_normal.docxBinary files differ new file mode 100644 index 000000000..b94a016cb --- /dev/null +++ b/tests/docx.metadata_after_normal.docx diff --git a/tests/docx.metadata_after_normal.native b/tests/docx.metadata_after_normal.native new file mode 100644 index 000000000..f0e31f8da --- /dev/null +++ b/tests/docx.metadata_after_normal.native @@ -0,0 +1,7 @@ +Pandoc (Meta {unMeta = fromList [("abstract",MetaInlines [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]),("author",MetaList [MetaInlines [Str "Mary",Space,Str "Ann",Space,Str "Evans"],MetaInlines [Str "Aurore",Space,Str "Dupin"]]),("date",MetaInlines [Str "July",Space,Str "28,",Space,Str "2014"]),("title",MetaInlines [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"])]}) +[Para [Str "And",Space,Str "now",Space,Str "this",Space,Str "is",Space,Str "normal",Space,Str "text."] +,Para [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"] +,Para [Str "Mary",Space,Str "Ann",Space,Str "Evans"] +,Para [Str "Aurore",Space,Str "Dupin"] +,Para [Str "July",Space,Str "28,",Space,Str "2014"] +,Para [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]] | 
