aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Milan Bracke <mbracke@antidot.net> 2021-06-11 09:26:09 +0200
committer John MacFarlane <jgm@berkeley.edu> 2021-10-18 19:15:40 -0700
commit 6acc82c5d2885c596c52e6c35bed8fe08f535066 (patch)
tree ad9a434851bc05154cc4680fd779aa8f9367b0e2
parent 193f6bfebaa43d0d6749d10a4e7ca78a0d31361d (diff)
download pandoc-6acc82c5d2885c596c52e6c35bed8fe08f535066.tar.gz
Docx parser: implement PAGEREF fields
These fields, often used in tables of contents, can be a hyperlink.
-rw-r--r-- src/Text/Pandoc/Readers/Docx.hs 1
-rw-r--r-- src/Text/Pandoc/Readers/Docx/Fields.hs 25
-rw-r--r-- test/Tests/Readers/Docx.hs 4
-rw-r--r-- test/docx/0_level_headers.native 6
-rw-r--r-- test/docx/golden/nested_anchors_in_header.docx bin 10126 -> 10163 bytes
-rw-r--r-- test/docx/nested_anchors_in_header.native 8
-rw-r--r-- test/docx/pageref.docx bin 0 -> 14431 bytes
-rw-r--r-- test/docx/pageref.native 4
8 files changed, 41 insertions, 7 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index 462e3c679..5c8f20c18 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -448,6 +448,7 @@ parPartToInlines' (PlainOMath exps) =
parPartToInlines' (Field info children) =
case info of
HyperlinkField url -> parPartToInlines' $ ExternalHyperLink url children
+ PagerefField fieldAnchor True -> parPartToInlines' $ InternalHyperLink fieldAnchor children
_ -> smushInlines <$> mapM parPartToInlines' children
parPartToInlines' NullParPart = return mempty
diff --git a/src/Text/Pandoc/Readers/Docx/Fields.hs b/src/Text/Pandoc/Readers/Docx/Fields.hs
index 442bc3466..5f090b6be 100644
--- a/src/Text/Pandoc/Readers/Docx/Fields.hs
+++ b/src/Text/Pandoc/Readers/Docx/Fields.hs
@@ -21,8 +21,11 @@ import Text.Parsec
import Text.Parsec.Text (Parser)
type URL = T.Text
+type Anchor = T.Text
data FieldInfo = HyperlinkField URL
+ -- The boolean indicates whether the field is a hyperlink.
+ | PagerefField Anchor Bool
| UnknownField
deriving (Show)
@@ -33,6 +36,8 @@ fieldInfo :: Parser FieldInfo
fieldInfo =
try (HyperlinkField <$> hyperlink)
<|>
+ try ((uncurry PagerefField) <$> pageref)
+ <|>
return UnknownField
escapedQuote :: Parser T.Text
@@ -72,3 +77,23 @@ hyperlink = do
("\\l", s) : _ -> farg <> "#" <> s
_ -> farg
return url
+
+-- See §17.16.5.45
+pagerefSwitch :: Parser (T.Text, T.Text)
+pagerefSwitch = do
+ sw <- string "\\h"
+ spaces
+ farg <- fieldArgument
+ return (T.pack sw, farg)
+
+pageref :: Parser (Anchor, Bool)
+pageref = do
+ many space
+ string "PAGEREF"
+ spaces
+ farg <- fieldArgument
+ switches <- spaces *> many pagerefSwitch
+ let isLink = case switches of
+ ("\\h", _) : _ -> True
+ _ -> False
+ return (farg, isLink)
diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs
index af6023836..ea4094c82 100644
--- a/test/Tests/Readers/Docx.hs
+++ b/test/Tests/Readers/Docx.hs
@@ -152,6 +152,10 @@ tests = [ testGroup "document"
"docx/nested_instrText.docx"
"docx/nested_instrText.native"
, testCompare
+ "pageref hyperlinks in <w:instrText> tag"
+ "docx/pageref.docx"
+ "docx/pageref.native"
+ , testCompare
"inline image"
"docx/image.docx"
"docx/image_no_embed.native"
diff --git a/test/docx/0_level_headers.native b/test/docx/0_level_headers.native
index ed589b029..7080063f9 100644
--- a/test/docx/0_level_headers.native
+++ b/test/docx/0_level_headers.native
@@ -39,9 +39,9 @@
[])
,Para [Str "CONTENTS"]
,Para [Strong [Str "Section",Space,Str "Page"]]
-,Para [Str "FIGURES",Space,Str "iv"]
-,Para [Str "TABLES",Space,Str "v"]
-,Para [Str "SECTION",Space,Str "1",Space,Str "Introduction",Space,Str "2"]
+,Para [Str "FIGURES",Space,Link ("",[],[]) [Str "iv"] ("#figures","")]
+,Para [Str "TABLES",Space,Link ("",[],[]) [Str "v"] ("#tables","")]
+,Para [Str "SECTION",Space,Str "1",Space,Str "Introduction",Space,Link ("",[],[]) [Str "2"] ("#introduction","")]
,Header 1 ("figures",["Heading-0"],[]) [Str "FIGURES"]
,Para [Strong [Str "Figure",Space,Str "Page"]]
,Para [Strong [Str "No",Space,Str "table",Space,Str "of",Space,Str "figures",Space,Str "entries",Space,Str "found."]]
diff --git a/test/docx/golden/nested_anchors_in_header.docx b/test/docx/golden/nested_anchors_in_header.docx
index 88dd21abd..9d89070d9 100644
--- a/test/docx/golden/nested_anchors_in_header.docx
+++ b/test/docx/golden/nested_anchors_in_header.docx
Binary files differ
diff --git a/test/docx/nested_anchors_in_header.native b/test/docx/nested_anchors_in_header.native
index 314b31663..9cc256d28 100644
--- a/test/docx/nested_anchors_in_header.native
+++ b/test/docx/nested_anchors_in_header.native
@@ -1,8 +1,8 @@
[Header 1 ("\1086\1075\1083\1072\1074\1083\1077\1085\1080\1077",["TOC-Heading"],[]) [Str "\1054\1075\1083\1072\1074\1083\1077\1085\1080\1077"]
-,Para [Link ("",[],[]) [Str "Short",Space,Str "instructions",Space,Str "1"] ("#short-instructions","")]
-,Para [Link ("",[],[]) [Str "Some",Space,Str "instructions",Space,Str "1"] ("#some-instructions","")]
-,Para [Link ("",[],[]) [Str "Remote",Space,Str "folder",Space,Str "or",Space,Str "longlonglonglonglong",Space,Str "file",Space,Str "with",Space,Str "manymanymanymany",Space,Str "letters",Space,Str "inside",Space,Str "opening",Space,Str "2"] ("#remote-folder-or-longlonglonglonglong-file-with-manymanymanymany-letters-inside-opening","")]
-,Para [Link ("",[],[]) [Str "Remote",Space,Str "folder",Space,Str "or",Space,Str "longlonglonglonglong",Space,Str "file",Space,Str "with",Space,Str "manymanymanymany",Space,Str "letters",Space,Str "inside",Space,Str "closing",Space,Str "2"] ("#remote-folder-or-longlonglonglonglong-file-with-manymanymanymany-letters-inside-closing","")]
+,Para [Link ("",[],[]) [Str "Short",Space,Str "instructions",Space,Link ("",[],[]) [Str "1"] ("#short-instructions","")] ("#short-instructions","")]
+,Para [Link ("",[],[]) [Str "Some",Space,Str "instructions",Space,Link ("",[],[]) [Str "1"] ("#some-instructions","")] ("#some-instructions","")]
+,Para [Link ("",[],[]) [Str "Remote",Space,Str "folder",Space,Str "or",Space,Str "longlonglonglonglong",Space,Str "file",Space,Str "with",Space,Str "manymanymanymany",Space,Str "letters",Space,Str "inside",Space,Str "opening",Space,Link ("",[],[]) [Str "2"] ("#remote-folder-or-longlonglonglonglong-file-with-manymanymanymany-letters-inside-opening","")] ("#remote-folder-or-longlonglonglonglong-file-with-manymanymanymany-letters-inside-opening","")]
+,Para [Link ("",[],[]) [Str "Remote",Space,Str "folder",Space,Str "or",Space,Str "longlonglonglonglong",Space,Str "file",Space,Str "with",Space,Str "manymanymanymany",Space,Str "letters",Space,Str "inside",Space,Str "closing",Space,Link ("",[],[]) [Str "2"] ("#remote-folder-or-longlonglonglonglong-file-with-manymanymanymany-letters-inside-closing","")] ("#remote-folder-or-longlonglonglonglong-file-with-manymanymanymany-letters-inside-closing","")]
,Header 1 ("short-instructions",[],[]) [Str "Short",Space,Str "instructions"]
,Para [Link ("",[],[]) [Str "Open",Space,Str "remote",Space,Str "folder"] ("#remote-folder-or-longlonglonglonglong-file-with-manymanymanymany-letters-inside-opening","")]
,Para [Str "Do",Space,Str "staff"]
diff --git a/test/docx/pageref.docx b/test/docx/pageref.docx
new file mode 100644
index 000000000..0a9159cab
--- /dev/null
+++ b/test/docx/pageref.docx
Binary files differ
diff --git a/test/docx/pageref.native b/test/docx/pageref.native
new file mode 100644
index 000000000..6c683de67
--- /dev/null
+++ b/test/docx/pageref.native
@@ -0,0 +1,4 @@
+[Para [Str "Title",Space,Link ("",[],[]) [Str "2"] ("#title","")]
+,Para [Str "Title2",Space,Link ("",[],[]) [Str "2"] ("#title2","")]
+,Header 1 ("title", [],[]) [Str "Title"]
+,Header 1 ("title2",[],[]) [Str "Title2"]]