From ae8c0cdba82c9cfb847025c3f5ca410407b0fb96 Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Tue, 16 Jan 2018 12:50:37 -0500 Subject: Docx reader: Parse instrText info in fldChar tags. We introduce a new module, Text.Pandoc.Readers.Docx.Fields which contains a simple parsec parser. At the moment, only simple hyperlink fields are accepted, but that can be extended in the future. --- src/Text/Pandoc/Readers/Docx/Fields.hs | 89 ++++++++++++++++++++++++++++++++++ src/Text/Pandoc/Readers/Docx/Parse.hs | 18 +++++-- 2 files changed, 102 insertions(+), 5 deletions(-) create mode 100644 src/Text/Pandoc/Readers/Docx/Fields.hs (limited to 'src/Text/Pandoc/Readers/Docx') diff --git a/src/Text/Pandoc/Readers/Docx/Fields.hs b/src/Text/Pandoc/Readers/Docx/Fields.hs new file mode 100644 index 000000000..69758b431 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Fields.hs @@ -0,0 +1,89 @@ +{- +Copyright (C) 2014-2018 Jesse Rosenthal + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Fields + Copyright : Copyright (C) 2014-2018 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal + Stability : alpha + Portability : portable + +For parsing Field definitions in instText tags, as described in +ECMA-376-1:2016, §17.16.5 -} + +module Text.Pandoc.Readers.Docx.Fields ( FieldInfo(..) + , parseFieldInfo + ) where + +import Text.Parsec +import Text.Parsec.String (Parser) + +type URL = String + +data FieldInfo = HyperlinkField URL + | UnknownField + deriving (Show) + +parseFieldInfo :: String -> Either ParseError FieldInfo +parseFieldInfo = parse fieldInfo "" + +fieldInfo :: Parser FieldInfo +fieldInfo = + (try $ HyperlinkField <$> hyperlink) + <|> + return UnknownField + +escapedQuote :: Parser String +escapedQuote = string "\\\"" + +inQuotes :: Parser String +inQuotes = do + (try escapedQuote) <|> (anyChar >>= (\c -> return [c])) + +quotedString :: Parser String +quotedString = do + char '"' + concat <$> manyTill inQuotes (try (char '"')) + +unquotedString :: Parser String +unquotedString = manyTill anyChar (try (space)) + +fieldArgument :: Parser String +fieldArgument = quotedString <|> unquotedString + +-- there are other switches, but this is the only one I've seen in the wild so far, so it's the first one I'll implement. See §17.16.5.25 +hyperlinkSwitch :: Parser (String, String) +hyperlinkSwitch = do + sw <- string "\\l" + spaces + farg <- fieldArgument + return (sw, farg) + +hyperlink :: Parser URL +hyperlink = do + many space + string "HYPERLINK" + spaces + farg <- fieldArgument + switches <- (spaces *> many hyperlinkSwitch) + let url = case switches of + ("\\l", s) : _ -> farg ++ ('#': s) + _ -> farg + return url diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index b3a0fee8e..5f648666f 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -54,6 +54,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , TrackedChange(..) , ChangeType(..) , ChangeInfo(..) + , FieldInfo(..) , archiveToDocx , archiveToDocxWithWarnings ) where @@ -70,6 +71,7 @@ import qualified Data.Map as M import Data.Maybe import System.FilePath import Text.Pandoc.Readers.Docx.Util +import Text.Pandoc.Readers.Docx.Fields import Text.Pandoc.Shared (filteredFilesFromArchive, safeRead) import qualified Text.Pandoc.UTF8 as UTF8 import Text.TeXMath (Exp) @@ -286,10 +288,6 @@ data Run = Run RunStyle [RunElem] | InlineChart -- placeholder deriving Show -data FieldInfo = HyperlinkField URL - | UnknownField - deriving (Show) - data RunElem = TextRun String | LnBrk | Tab | SoftHyphen | NoBreakHyphen deriving Show @@ -802,8 +800,18 @@ elemToParPart ns element return NullParPart FldCharContent info runs | fldCharType == "end" -> do modify $ \st -> st {stateFldCharState = FldCharClosed} - return $ Field info runs + return $ Field info $ reverse runs _ -> throwError WrongElem +elemToParPart ns element + | isElem ns "w" "r" element + , Just instrText <- findChildByName ns "w" "instrText" element = do + fldCharState <- gets stateFldCharState + case fldCharState of + FldCharOpen -> do + info <- eitherToD $ parseFieldInfo $ strContent instrText + modify $ \st -> st{stateFldCharState = FldCharFieldInfo info} + return NullParPart + _ -> return NullParPart elemToParPart ns element | isElem ns "w" "r" element = do run <- elemToRun ns element -- cgit v1.2.3