aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2014-07-29 11:15:34 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2014-07-29 11:15:34 -0700
commit8d4eebaff42cfe474042a7dafaa86950640b9e82 (patch)
treee1c58d8ac6442a3eb4cfa6015757226e860a48b9
parent4ddb8433a8326a872fd430d9aae7e2ba8ad7b051 (diff)
parent54708da371e767cd42598ea8f7fbd7d45c57421b (diff)
downloadpandoc-8d4eebaff42cfe474042a7dafaa86950640b9e82.tar.gz
Merge pull request #1463 from jkr/metadata
Make metadata out of styled pars
-rw-r--r--pandoc.cabal12
-rw-r--r--src/Text/Pandoc/Readers/Docx.hs84
-rw-r--r--tests/Tests/Readers/Docx.hs11
-rw-r--r--tests/docx.metadata.docxbin0 -> 39538 bytes
-rw-r--r--tests/docx.metadata.native2
-rw-r--r--tests/docx.metadata_after_normal.docxbin0 -> 56276 bytes
-rw-r--r--tests/docx.metadata_after_normal.native7
7 files changed, 105 insertions, 11 deletions
diff --git a/pandoc.cabal b/pandoc.cabal
index 6597b27ed..9249723ff 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -172,13 +172,25 @@ Extra-Source-Files:
tests/fb2.test.jpg,
tests/docx.already_auto_ident.docx,
tests/docx.block_quotes.docx,
+ tests/docx.codeblock.docx,
+ tests/docx.deep_normalize.docx,
+ tests/docx.definition_list.docx,
+ tests/docx.hanging_indent.docx,
tests/docx.headers.docx,
tests/docx.image.docx,
+ tests/docx.inline_code.docx,
tests/docx.inline_formatting.docx,
tests/docx.links.docx,
tests/docx.lists.docx,
+ tests/docx.metadata.docx,
+ tests/docx.metadata_after_normal.docx,
+ tests/docx.normalize.docx,
tests/docx.notes.docx,
tests/docx.tables.docx,
+ tests/docx.tabs.docx,
+ tests/docx.track_changes_deletion.docx,
+ tests/docx.track_changes_insertion.docx,
+ tests/docx.trailing_spaces_in_formatting.docx,
tests/docx.unicode.docx,
tests/*.native,
tests/txt2tags.t2t
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index 196a3cec5..48a23dd3c 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -87,7 +87,7 @@ import Text.Pandoc.Readers.Docx.Reducible
import Text.Pandoc.Readers.Docx.TexChar
import Text.Pandoc.Shared
import Data.Maybe (mapMaybe, fromMaybe)
-import Data.List (delete, isPrefixOf, (\\), intercalate)
+import Data.List (delete, isPrefixOf, (\\), intercalate, intersect)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as B
import Data.ByteString.Base64 (encode)
@@ -101,7 +101,8 @@ readDocx :: ReaderOptions
-> Pandoc
readDocx opts bytes =
case archiveToDocx (toArchive bytes) of
- Right docx -> Pandoc nullMeta (docxToBlocks opts docx)
+ Right docx -> Pandoc meta blks where
+ (meta, blks) = (docxToMetaAndBlocks opts docx)
Left _ -> error $ "couldn't parse docx file"
data DState = DState { docxAnchorMap :: M.Map String String
@@ -134,6 +135,65 @@ spansToKeep = []
divsToKeep :: [String]
divsToKeep = ["list-item", "Definition", "DefinitionTerm"]
+metaStyles :: M.Map String String
+metaStyles = M.fromList [ ("Title", "title")
+ , ("Subtitle", "subtitle")
+ , ("Author", "author")
+ , ("Date", "date")
+ , ("Abstract", "abstract")]
+
+sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart])
+sepBodyParts = span (\bp -> (isMetaPar bp || isEmptyPar bp))
+
+isMetaPar :: BodyPart -> Bool
+isMetaPar (Paragraph pPr _) =
+ not $ null $ intersect (pStyle pPr) (M.keys metaStyles)
+isMetaPar _ = False
+
+isEmptyPar :: BodyPart -> Bool
+isEmptyPar (Paragraph _ parParts) =
+ all isEmptyParPart parParts
+ where
+ isEmptyParPart (PlainRun (Run _ runElems)) = all isEmptyElem runElems
+ isEmptyParPart _ = False
+ isEmptyElem (TextRun s) = trim s == ""
+ isEmptyElem _ = True
+isEmptyPar _ = False
+
+bodyPartsToMeta' :: [BodyPart] -> DocxContext (M.Map String MetaValue)
+bodyPartsToMeta' [] = return M.empty
+bodyPartsToMeta' (bp : bps)
+ | (Paragraph pPr parParts) <- bp
+ , (c : _)<- intersect (pStyle pPr) (M.keys metaStyles)
+ , (Just metaField) <- M.lookup c metaStyles = do
+ inlines <- parPartsToInlines parParts
+ remaining <- bodyPartsToMeta' bps
+ let
+ f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils']
+ f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks ((Para ils) : blks)
+ f m (MetaList mv) = MetaList (m : mv)
+ f m n = MetaList [m, n]
+ return $ M.insertWith f metaField (MetaInlines inlines) remaining
+bodyPartsToMeta' (_ : bps) = bodyPartsToMeta' bps
+
+bodyPartsToMeta :: [BodyPart] -> DocxContext Meta
+bodyPartsToMeta bps = do
+ mp <- bodyPartsToMeta' bps
+ let mp' =
+ case M.lookup "author" mp of
+ Just mv -> M.insert "author" (fixAuthors mv) mp
+ Nothing -> mp
+ return $ Meta mp'
+
+fixAuthors :: MetaValue -> MetaValue
+fixAuthors (MetaBlocks blks) =
+ MetaList $ map g $ filter f blks
+ where f (Para _) = True
+ f _ = False
+ g (Para ils) = MetaInlines ils
+ g _ = MetaInlines []
+fixAuthors mv = mv
+
runStyleToContainers :: RunStyle -> [Container Inline]
runStyleToContainers rPr =
let spanClassToContainers :: String -> [Container Inline]
@@ -615,24 +675,26 @@ rewriteLink l@(Link ils ('#':target, title)) = do
Nothing -> l
rewriteLink il = return il
-
-bodyToBlocks :: Body -> DocxContext [Block]
-bodyToBlocks (Body bps) = do
- blks <- concatMapM bodyPartToBlocks bps >>=
+bodyToMetaAndBlocks :: Body -> DocxContext (Meta, [Block])
+bodyToMetaAndBlocks (Body bps) = do
+ let (metabps, blkbps) = sepBodyParts bps
+ meta <- bodyPartsToMeta metabps
+ blks <- concatMapM bodyPartToBlocks blkbps >>=
walkM rewriteLink
return $
- blocksToDefinitions $
- blocksToBullets $ blks
+ (meta,
+ blocksToDefinitions $
+ blocksToBullets $ blks)
-docxToBlocks :: ReaderOptions -> Docx -> [Block]
-docxToBlocks opts d@(Docx (Document _ body)) =
+docxToMetaAndBlocks :: ReaderOptions -> Docx -> (Meta, [Block])
+docxToMetaAndBlocks opts d@(Docx (Document _ body)) =
let dState = DState { docxAnchorMap = M.empty
, docxInHeaderBlock = False
, docxInTexSubscript = False}
dEnv = DEnv { docxOptions = opts
, docxDocument = d}
in
- evalDocxContext (bodyToBlocks body) dEnv dState
+ evalDocxContext (bodyToMetaAndBlocks body) dEnv dState
ilToCode :: Inline -> String
ilToCode (Str s) = s
diff --git a/tests/Tests/Readers/Docx.hs b/tests/Tests/Readers/Docx.hs
index 7b737f95a..b1a966969 100644
--- a/tests/Tests/Readers/Docx.hs
+++ b/tests/Tests/Readers/Docx.hs
@@ -164,5 +164,16 @@ tests = [ testGroup "inlines"
"docx.track_changes_deletion.docx"
"docx.track_changes_deletion_all.native"
]
+ , testGroup "metadata"
+ [ testCompareWithOpts def{readerStandalone=True}
+ "metadata fields"
+ "docx.metadata.docx"
+ "docx.metadata.native"
+ , testCompareWithOpts def{readerStandalone=True}
+ "stop recording metadata with normal text"
+ "docx.metadata_after_normal.docx"
+ "docx.metadata_after_normal.native"
+ ]
+
]
diff --git a/tests/docx.metadata.docx b/tests/docx.metadata.docx
new file mode 100644
index 000000000..ccf50b475
--- /dev/null
+++ b/tests/docx.metadata.docx
Binary files differ
diff --git a/tests/docx.metadata.native b/tests/docx.metadata.native
new file mode 100644
index 000000000..ed7ba63cf
--- /dev/null
+++ b/tests/docx.metadata.native
@@ -0,0 +1,2 @@
+Pandoc (Meta {unMeta = fromList [("abstract",MetaInlines [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]),("author",MetaList [MetaInlines [Str "Mary",Space,Str "Ann",Space,Str "Evans"],MetaInlines [Str "Aurore",Space,Str "Dupin"]]),("date",MetaInlines [Str "July",Space,Str "28,",Space,Str "2014"]),("title",MetaInlines [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"])]})
+[Para [Str "And",Space,Str "now",Space,Str "this",Space,Str "is",Space,Str "normal",Space,Str "text."]]
diff --git a/tests/docx.metadata_after_normal.docx b/tests/docx.metadata_after_normal.docx
new file mode 100644
index 000000000..b94a016cb
--- /dev/null
+++ b/tests/docx.metadata_after_normal.docx
Binary files differ
diff --git a/tests/docx.metadata_after_normal.native b/tests/docx.metadata_after_normal.native
new file mode 100644
index 000000000..f0e31f8da
--- /dev/null
+++ b/tests/docx.metadata_after_normal.native
@@ -0,0 +1,7 @@
+Pandoc (Meta {unMeta = fromList [("abstract",MetaInlines [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]),("author",MetaList [MetaInlines [Str "Mary",Space,Str "Ann",Space,Str "Evans"],MetaInlines [Str "Aurore",Space,Str "Dupin"]]),("date",MetaInlines [Str "July",Space,Str "28,",Space,Str "2014"]),("title",MetaInlines [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"])]})
+[Para [Str "And",Space,Str "now",Space,Str "this",Space,Str "is",Space,Str "normal",Space,Str "text."]
+,Para [Str "This",Space,Str "Is",Space,Str "the",Space,Str "Title"]
+,Para [Str "Mary",Space,Str "Ann",Space,Str "Evans"]
+,Para [Str "Aurore",Space,Str "Dupin"]
+,Para [Str "July",Space,Str "28,",Space,Str "2014"]
+,Para [Str "This",Space,Str "is",Space,Str "a",Space,Str "test",Space,Str "of",Space,Str "how",Space,Str "this",Space,Str "all",Space,Str "works.",Space,Str "I\8217ve",Space,Str "skipped",Space,Str "lines",Space,Str "here,",Space,Str "which",Space,Str "pandoc",Space,Str "doesn\8217t",Space,Str "do,",Space,Str "but",Space,Str "which",Space,Str "shouldn\8217t",Space,Str "make",Space,Str "a",Space,Str "difference."]]