3 files changed, 186 insertions, 16 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Fields.hs b/src/Text/Pandoc/Readers/Docx/Fields.hs
new file mode 100644
index 000000000..6eeb55d2f
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Docx/Fields.hs
@@ -0,0 +1,89 @@
+{-
+Copyright (C) 2014-2018 Jesse Rosenthal <jrosenthal@jhu.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+-}
+
+{- |
+   Module      : Text.Pandoc.Readers.Docx.Fields
+   Copyright   : Copyright (C) 2014-2018 Jesse Rosenthal
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Jesse Rosenthal <jrosenthal@jhu.edu>
+   Stability   : alpha
+   Portability : portable
+
+For parsing Field definitions in instText tags, as described in
+ECMA-376-1:2016, §17.16.5 -}
+
+module Text.Pandoc.Readers.Docx.Fields ( FieldInfo(..)
+                                       , parseFieldInfo
+                                       ) where
+
+import Text.Parsec
+import Text.Parsec.String (Parser)
+
+type URL = String
+
+data FieldInfo = HyperlinkField URL
+               | UnknownField
+               deriving (Show)
+
+parseFieldInfo :: String -> Either ParseError FieldInfo
+parseFieldInfo = parse fieldInfo ""
+
+fieldInfo :: Parser FieldInfo
+fieldInfo =
+  try (HyperlinkField <$> hyperlink)
+  <|>
+  return UnknownField
+
+escapedQuote :: Parser String
+escapedQuote = string "\\\""
+
+inQuotes :: Parser String
+inQuotes =
+  (try escapedQuote) <|> (anyChar >>= (\c -> return [c]))
+
+quotedString :: Parser String
+quotedString = do
+  char '"'
+  concat <$> manyTill inQuotes (try (char '"'))
+
+unquotedString :: Parser String
+unquotedString = manyTill anyChar (try $ lookAhead space *> return () <|> eof)
+
+fieldArgument :: Parser String
+fieldArgument = quotedString <|> unquotedString
+
+-- there are other switches, but this is the only one I've seen in the wild so far, so it's the first one I'll implement. See §17.16.5.25
+hyperlinkSwitch :: Parser (String, String)
+hyperlinkSwitch = do
+  sw <- string "\\l"
+  spaces
+  farg <- fieldArgument
+  return (sw, farg)
+
+hyperlink :: Parser URL
+hyperlink = do
+  many space
+  string "HYPERLINK"
+  spaces
+  farg <- fieldArgument
+  switches <- spaces *> many hyperlinkSwitch
+  let url = case switches of
+              ("\\l", s) : _ -> farg ++ ('#': s)
+              _              -> farg
+  return url
diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs
index fa4870fff..c0f05094a 100644
--- a/src/Text/Pandoc/Readers/Docx/Lists.hs
+++ b/src/Text/Pandoc/Readers/Docx/Lists.hs
@@ -44,14 +44,14 @@ isListItem (Div (_, classes, _) _) | "list-item" `elem` classes = True
 isListItem _                       = False
 
 getLevel :: Block -> Maybe Integer
-getLevel (Div (_, _, kvs) _) =  fmap read $ lookup "level" kvs
+getLevel (Div (_, _, kvs) _) =  read <$> lookup "level" kvs
 getLevel _                   = Nothing
 
 getLevelN :: Block -> Integer
 getLevelN b = fromMaybe (-1) (getLevel b)
 
 getNumId :: Block -> Maybe Integer
-getNumId (Div (_, _, kvs) _) =  fmap read $ lookup "num-id" kvs
+getNumId (Div (_, _, kvs) _) =  read <$> lookup "num-id" kvs
 getNumId _                   = Nothing
 
 getNumIdN :: Block -> Integer
@@ -140,8 +140,8 @@ flatToBullets' num xs@(b : elems)
         (children, remaining) =
           span
           (\b' ->
-            (getLevelN b') > bLevel ||
-             ((getLevelN b') == bLevel && (getNumIdN b') == bNumId))
+            getLevelN b' > bLevel ||
+             (getLevelN b' == bLevel && getNumIdN b' == bNumId))
           xs
     in
      case getListType b of
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index b79b39369..c123a0018 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -54,6 +54,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
                                       , TrackedChange(..)
                                       , ChangeType(..)
                                       , ChangeInfo(..)
+                                      , FieldInfo(..)
                                       , archiveToDocx
                                       , archiveToDocxWithWarnings
                                       ) where
@@ -70,6 +71,7 @@ import qualified Data.Map as M
 import Data.Maybe
 import System.FilePath
 import Text.Pandoc.Readers.Docx.Util
+import Text.Pandoc.Readers.Docx.Fields
 import Text.Pandoc.Shared (filteredFilesFromArchive, safeRead)
 import qualified Text.Pandoc.UTF8 as UTF8
 import Text.TeXMath (Exp)
@@ -90,10 +92,19 @@ data ReaderEnv = ReaderEnv { envNotes         :: Notes
                            }
                deriving Show
 
-data ReaderState = ReaderState { stateWarnings :: [String] }
+data ReaderState = ReaderState { stateWarnings :: [String]
+                               , stateFldCharState :: FldCharState
+                               }
                  deriving Show
 
-data DocxError = DocxError | WrongElem
+data FldCharState = FldCharOpen
+                  | FldCharFieldInfo FieldInfo
+                  | FldCharContent FieldInfo [Run]
+                  | FldCharClosed
+                  deriving (Show)
+
+data DocxError = DocxError
+               | WrongElem
                deriving Show
 
 type D = ExceptT DocxError (ReaderT ReaderEnv (State ReaderState))
@@ -265,6 +276,9 @@ data ParPart = PlainRun Run
              | Chart                                              -- placeholder for now
              | PlainOMath [Exp]
              | SmartTag [Run]
+             | Field FieldInfo [Run]
+             | NullParPart      -- when we need to return nothing, but
+                                -- not because of an error.
              deriving Show
 
 data Run = Run RunStyle [RunElem]
@@ -328,7 +342,9 @@ archiveToDocxWithWarnings archive = do
       (styles, parstyles) = archiveToStyles archive
       rEnv =
         ReaderEnv notes comments numbering rels media Nothing styles parstyles InDocument
-      rState = ReaderState { stateWarnings = [] }
+      rState = ReaderState { stateWarnings = []
+                           , stateFldCharState = FldCharClosed
+                           }
       (eitherDoc, st) = runD (archiveToDocument archive) rEnv rState
   case eitherDoc of
     Right doc -> Right (Docx doc, stateWarnings st)
@@ -342,9 +358,7 @@ archiveToDocument zf = do
   docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
   let namespaces = elemToNameSpaces docElem
   bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem
-  let bodyElem' = case walkDocument namespaces bodyElem of
-        Just e -> e
-        Nothing -> bodyElem
+  let bodyElem' = fromMaybe bodyElem (walkDocument namespaces bodyElem)
   body <- elemToBody namespaces bodyElem'
   return $ Document namespaces body
 
@@ -587,7 +601,7 @@ elemToTblLook ns element | isElem ns "w" "tblLook" element =
             Just bitMask -> testBitMask bitMask 0x020
             Nothing      -> False
   in
-   return $ TblLook{firstRowFormatting = firstRowFmt}
+   return TblLook{firstRowFormatting = firstRowFmt}
 elemToTblLook _ _ = throwError WrongElem
 
 elemToRow :: NameSpaces -> Element -> D Row
@@ -607,7 +621,7 @@ elemToCell _ _ = throwError WrongElem
 
 elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation
 elemToParIndentation ns element | isElem ns "w" "ind" element =
-  Just $ ParIndentation {
+ Just ParIndentation {
     leftParIndent =
        findAttrByName ns "w" "left" element >>=
        stringToInteger
@@ -736,9 +750,77 @@ elemToParPart ns element
   , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart"
   , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) drawingElem
   = return Chart
+{-
+The next one is a bit complicated. fldChar fields work by first
+having a <w:fldChar fldCharType="begin"> in a run, then a run with
+<w:instrText>, then a <w:fldChar fldCharType="separate"> run, then the
+content runs, and finally a <w:fldChar fldCharType="end"> run. For
+example (omissions and my comments in brackets):
+
+      <w:r>
+        [...]
+        <w:fldChar w:fldCharType="begin"/>
+      </w:r>
+      <w:r>
+        [...]
+        <w:instrText xml:space="preserve"> HYPERLINK [hyperlink url] </w:instrText>
+      </w:r>
+      <w:r>
+        [...]
+        <w:fldChar w:fldCharType="separate"/>
+      </w:r>
+      <w:r w:rsidRPr=[...]>
+        [...]
+        <w:t>Foundations of Analysis, 2nd Edition</w:t>
+      </w:r>
+      <w:r>
+        [...]
+        <w:fldChar w:fldCharType="end"/>
+      </w:r>
+
+So we do this in a number of steps. If we encounter the fldchar begin
+tag, we start open a fldchar state variable (see state above). We add
+the instrtext to it as FieldInfo. Then we close that and start adding
+the runs when we get to separate. Then when we get to end, we produce
+the Field type with approriate FieldInfo and Runs.
+-}
 elemToParPart ns element
-  | isElem ns "w" "r" element =
-    elemToRun ns element >>= (\r -> return $ PlainRun r)
+  | isElem ns "w" "r" element
+  , Just fldChar <- findChildByName ns "w" "fldChar" element
+  , Just fldCharType <- findAttrByName ns "w" "fldCharType" fldChar = do
+      fldCharState <- gets stateFldCharState
+      case fldCharState of
+        FldCharClosed | fldCharType == "begin" -> do
+          modify $ \st -> st {stateFldCharState = FldCharOpen}
+          return NullParPart
+        FldCharFieldInfo info | fldCharType == "separate" -> do
+          modify $ \st -> st {stateFldCharState = FldCharContent info []}
+          return NullParPart
+        FldCharContent info runs | fldCharType == "end" -> do
+          modify $ \st -> st {stateFldCharState = FldCharClosed}
+          return $ Field info $ reverse runs
+        _ -> throwError WrongElem
+elemToParPart ns element
+  | isElem ns "w" "r" element
+  , Just instrText <- findChildByName ns "w" "instrText" element = do
+      fldCharState <- gets stateFldCharState
+      case fldCharState of
+        FldCharOpen -> do
+          info <- eitherToD $ parseFieldInfo $ strContent instrText
+          modify $ \st -> st{stateFldCharState = FldCharFieldInfo info}
+          return NullParPart
+        _ -> return NullParPart
+elemToParPart ns element
+  | isElem ns "w" "r" element = do
+    run <- elemToRun ns element
+    -- we check to see if we have an open FldChar in state that we're
+    -- recording.
+    fldCharState <- gets stateFldCharState
+    case fldCharState of
+      FldCharContent info runs -> do
+        modify $ \st -> st{stateFldCharState = FldCharContent info (run : runs)}
+        return NullParPart
+      _ -> return $ PlainRun run
 elemToParPart ns element
   | Just change <- getTrackedChange ns element = do
       runs <- mapD (elemToRun ns) (elChildren element)
@@ -1089,8 +1171,7 @@ elemToRunElems ns element
        let font = do
                     fontElem <- findElement (qualName "rFonts") element
                     stringToFont =<<
-                      foldr (<|>) Nothing (
-                        map (flip findAttr fontElem . qualName) ["ascii", "hAnsi"])
+                       foldr ((<|>) . (flip findAttr fontElem . qualName)) Nothing ["ascii", "hAnsi"]
        local (setFont font) (mapD (elemToRunElem ns) (elChildren element))
 elemToRunElems _ _ = throwError WrongElem