diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/Docx')
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx/Fields.hs | 89 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx/Lists.hs | 8 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 105 |
3 files changed, 186 insertions, 16 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Fields.hs b/src/Text/Pandoc/Readers/Docx/Fields.hs new file mode 100644 index 000000000..6eeb55d2f --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Fields.hs @@ -0,0 +1,89 @@ +{- +Copyright (C) 2014-2018 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Fields + Copyright : Copyright (C) 2014-2018 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +For parsing Field definitions in instText tags, as described in +ECMA-376-1:2016, §17.16.5 -} + +module Text.Pandoc.Readers.Docx.Fields ( FieldInfo(..) + , parseFieldInfo + ) where + +import Text.Parsec +import Text.Parsec.String (Parser) + +type URL = String + +data FieldInfo = HyperlinkField URL + | UnknownField + deriving (Show) + +parseFieldInfo :: String -> Either ParseError FieldInfo +parseFieldInfo = parse fieldInfo "" + +fieldInfo :: Parser FieldInfo +fieldInfo = + try (HyperlinkField <$> hyperlink) + <|> + return UnknownField + +escapedQuote :: Parser String +escapedQuote = string "\\\"" + +inQuotes :: Parser String +inQuotes = + (try escapedQuote) <|> (anyChar >>= (\c -> return [c])) + +quotedString :: Parser String +quotedString = do + char '"' + concat <$> manyTill inQuotes (try (char '"')) + +unquotedString :: Parser String +unquotedString = manyTill anyChar (try $ lookAhead space *> return () <|> eof) + +fieldArgument :: Parser String +fieldArgument = quotedString <|> unquotedString + +-- there are other switches, but this is the only one I've seen in the wild so far, so it's the first one I'll implement. See §17.16.5.25 +hyperlinkSwitch :: Parser (String, String) +hyperlinkSwitch = do + sw <- string "\\l" + spaces + farg <- fieldArgument + return (sw, farg) + +hyperlink :: Parser URL +hyperlink = do + many space + string "HYPERLINK" + spaces + farg <- fieldArgument + switches <- spaces *> many hyperlinkSwitch + let url = case switches of + ("\\l", s) : _ -> farg ++ ('#': s) + _ -> farg + return url diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs index fa4870fff..c0f05094a 100644 --- a/src/Text/Pandoc/Readers/Docx/Lists.hs +++ b/src/Text/Pandoc/Readers/Docx/Lists.hs @@ -44,14 +44,14 @@ isListItem (Div (_, classes, _) _) | "list-item" `elem` classes = True isListItem _ = False getLevel :: Block -> Maybe Integer -getLevel (Div (_, _, kvs) _) = fmap read $ lookup "level" kvs +getLevel (Div (_, _, kvs) _) = read <$> lookup "level" kvs getLevel _ = Nothing getLevelN :: Block -> Integer getLevelN b = fromMaybe (-1) (getLevel b) getNumId :: Block -> Maybe Integer -getNumId (Div (_, _, kvs) _) = fmap read $ lookup "num-id" kvs +getNumId (Div (_, _, kvs) _) = read <$> lookup "num-id" kvs getNumId _ = Nothing getNumIdN :: Block -> Integer @@ -140,8 +140,8 @@ flatToBullets' num xs@(b : elems) (children, remaining) = span (\b' -> - (getLevelN b') > bLevel || - ((getLevelN b') == bLevel && (getNumIdN b') == bNumId)) + getLevelN b' > bLevel || + (getLevelN b' == bLevel && getNumIdN b' == bNumId)) xs in case getListType b of diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index b79b39369..c123a0018 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -54,6 +54,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , TrackedChange(..) , ChangeType(..) , ChangeInfo(..) + , FieldInfo(..) , archiveToDocx , archiveToDocxWithWarnings ) where @@ -70,6 +71,7 @@ import qualified Data.Map as M import Data.Maybe import System.FilePath import Text.Pandoc.Readers.Docx.Util +import Text.Pandoc.Readers.Docx.Fields import Text.Pandoc.Shared (filteredFilesFromArchive, safeRead) import qualified Text.Pandoc.UTF8 as UTF8 import Text.TeXMath (Exp) @@ -90,10 +92,19 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes } deriving Show -data ReaderState = ReaderState { stateWarnings :: [String] } +data ReaderState = ReaderState { stateWarnings :: [String] + , stateFldCharState :: FldCharState + } deriving Show -data DocxError = DocxError | WrongElem +data FldCharState = FldCharOpen + | FldCharFieldInfo FieldInfo + | FldCharContent FieldInfo [Run] + | FldCharClosed + deriving (Show) + +data DocxError = DocxError + | WrongElem deriving Show type D = ExceptT DocxError (ReaderT ReaderEnv (State ReaderState)) @@ -265,6 +276,9 @@ data ParPart = PlainRun Run | Chart -- placeholder for now | PlainOMath [Exp] | SmartTag [Run] + | Field FieldInfo [Run] + | NullParPart -- when we need to return nothing, but + -- not because of an error. deriving Show data Run = Run RunStyle [RunElem] @@ -328,7 +342,9 @@ archiveToDocxWithWarnings archive = do (styles, parstyles) = archiveToStyles archive rEnv = ReaderEnv notes comments numbering rels media Nothing styles parstyles InDocument - rState = ReaderState { stateWarnings = [] } + rState = ReaderState { stateWarnings = [] + , stateFldCharState = FldCharClosed + } (eitherDoc, st) = runD (archiveToDocument archive) rEnv rState case eitherDoc of Right doc -> Right (Docx doc, stateWarnings st) @@ -342,9 +358,7 @@ archiveToDocument zf = do docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry let namespaces = elemToNameSpaces docElem bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem - let bodyElem' = case walkDocument namespaces bodyElem of - Just e -> e - Nothing -> bodyElem + let bodyElem' = fromMaybe bodyElem (walkDocument namespaces bodyElem) body <- elemToBody namespaces bodyElem' return $ Document namespaces body @@ -587,7 +601,7 @@ elemToTblLook ns element | isElem ns "w" "tblLook" element = Just bitMask -> testBitMask bitMask 0x020 Nothing -> False in - return $ TblLook{firstRowFormatting = firstRowFmt} + return TblLook{firstRowFormatting = firstRowFmt} elemToTblLook _ _ = throwError WrongElem elemToRow :: NameSpaces -> Element -> D Row @@ -607,7 +621,7 @@ elemToCell _ _ = throwError WrongElem elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation elemToParIndentation ns element | isElem ns "w" "ind" element = - Just $ ParIndentation { + Just ParIndentation { leftParIndent = findAttrByName ns "w" "left" element >>= stringToInteger @@ -736,9 +750,77 @@ elemToParPart ns element , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart" , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) drawingElem = return Chart +{- +The next one is a bit complicated. fldChar fields work by first +having a <w:fldChar fldCharType="begin"> in a run, then a run with +<w:instrText>, then a <w:fldChar fldCharType="separate"> run, then the +content runs, and finally a <w:fldChar fldCharType="end"> run. For +example (omissions and my comments in brackets): + + <w:r> + [...] + <w:fldChar w:fldCharType="begin"/> + </w:r> + <w:r> + [...] + <w:instrText xml:space="preserve"> HYPERLINK [hyperlink url] </w:instrText> + </w:r> + <w:r> + [...] + <w:fldChar w:fldCharType="separate"/> + </w:r> + <w:r w:rsidRPr=[...]> + [...] + <w:t>Foundations of Analysis, 2nd Edition</w:t> + </w:r> + <w:r> + [...] + <w:fldChar w:fldCharType="end"/> + </w:r> + +So we do this in a number of steps. If we encounter the fldchar begin +tag, we start open a fldchar state variable (see state above). We add +the instrtext to it as FieldInfo. Then we close that and start adding +the runs when we get to separate. Then when we get to end, we produce +the Field type with approriate FieldInfo and Runs. +-} elemToParPart ns element - | isElem ns "w" "r" element = - elemToRun ns element >>= (\r -> return $ PlainRun r) + | isElem ns "w" "r" element + , Just fldChar <- findChildByName ns "w" "fldChar" element + , Just fldCharType <- findAttrByName ns "w" "fldCharType" fldChar = do + fldCharState <- gets stateFldCharState + case fldCharState of + FldCharClosed | fldCharType == "begin" -> do + modify $ \st -> st {stateFldCharState = FldCharOpen} + return NullParPart + FldCharFieldInfo info | fldCharType == "separate" -> do + modify $ \st -> st {stateFldCharState = FldCharContent info []} + return NullParPart + FldCharContent info runs | fldCharType == "end" -> do + modify $ \st -> st {stateFldCharState = FldCharClosed} + return $ Field info $ reverse runs + _ -> throwError WrongElem +elemToParPart ns element + | isElem ns "w" "r" element + , Just instrText <- findChildByName ns "w" "instrText" element = do + fldCharState <- gets stateFldCharState + case fldCharState of + FldCharOpen -> do + info <- eitherToD $ parseFieldInfo $ strContent instrText + modify $ \st -> st{stateFldCharState = FldCharFieldInfo info} + return NullParPart + _ -> return NullParPart +elemToParPart ns element + | isElem ns "w" "r" element = do + run <- elemToRun ns element + -- we check to see if we have an open FldChar in state that we're + -- recording. + fldCharState <- gets stateFldCharState + case fldCharState of + FldCharContent info runs -> do + modify $ \st -> st{stateFldCharState = FldCharContent info (run : runs)} + return NullParPart + _ -> return $ PlainRun run elemToParPart ns element | Just change <- getTrackedChange ns element = do runs <- mapD (elemToRun ns) (elChildren element) @@ -1089,8 +1171,7 @@ elemToRunElems ns element let font = do fontElem <- findElement (qualName "rFonts") element stringToFont =<< - foldr (<|>) Nothing ( - map (flip findAttr fontElem . qualName) ["ascii", "hAnsi"]) + foldr ((<|>) . (flip findAttr fontElem . qualName)) Nothing ["ascii", "hAnsi"] local (setFont font) (mapD (elemToRunElem ns) (elChildren element)) elemToRunElems _ _ = throwError WrongElem |
