diff options
Diffstat (limited to 'src/Text/Pandoc/Readers')
28 files changed, 2351 insertions, 631 deletions
diff --git a/src/Text/Pandoc/Readers/Custom.hs b/src/Text/Pandoc/Readers/Custom.hs new file mode 100644 index 000000000..9252a9e45 --- /dev/null +++ b/src/Text/Pandoc/Readers/Custom.hs @@ -0,0 +1,83 @@ +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Custom + Copyright : Copyright (C) 2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Supports custom parsers written in Lua which produce a Pandoc AST. +-} +module Text.Pandoc.Readers.Custom ( readCustom ) where +import Control.Exception +import Control.Monad (when) +import HsLua as Lua hiding (Operation (Div), render) +import Control.Monad.IO.Class (MonadIO) +import Text.Pandoc.Definition +import Text.Pandoc.Class (PandocMonad, report) +import Text.Pandoc.Logging +import Text.Pandoc.Lua (Global (..), runLua, setGlobals) +import Text.Pandoc.Lua.PandocLua +import Text.Pandoc.Lua.Marshal.Pandoc (peekPandoc) +import Text.Pandoc.Lua.Util (dofileWithTraceback, callWithTraceback, + pcallWithTraceback) +import Text.Pandoc.Options +import Text.Pandoc.Sources (ToSources(..), sourcesToText) +import qualified Data.Text as T + +-- | Convert custom markup to Pandoc. +readCustom :: (PandocMonad m, MonadIO m, ToSources s) + => FilePath -> ReaderOptions -> s -> m Pandoc +readCustom luaFile opts srcs = do + let globals = [ PANDOC_SCRIPT_FILE luaFile ] + res <- runLua $ do + setGlobals globals + stat <- dofileWithTraceback luaFile + -- check for error in lua script (later we'll change the return type + -- to handle this more gracefully): + when (stat /= Lua.OK) + Lua.throwErrorAsException + parseCustom + case res of + Left msg -> throw msg + Right doc -> return doc + where + parseCustom = do + let input = toSources srcs + getglobal "Reader" + push input + push opts + pcallWithTraceback 2 1 >>= \case + OK -> forcePeek $ peekPandoc top + ErrRun -> do + -- Caught a runtime error. Check if parsing might work if we + -- pass a string instead of a Sources list, then retry. + runPeek (peekText top) >>= \case + Failure {} -> + -- not a string error object. Bail! + throwErrorAsException + Success errmsg -> do + if "string expected, got pandoc Sources" `T.isInfixOf` errmsg + then do + pop 1 + _ <- unPandocLua $ do + report $ Deprecated "old Reader function signature" $ + T.unlines + [ "Reader functions should accept a sources list; " + , "functions expecting `string` input are deprecated. " + , "Use `tostring` to convert the first argument to a " + , "string." + ] + getglobal "Reader" + push $ sourcesToText input -- push sources as string + push opts + callWithTraceback 2 1 + forcePeek $ peekPandoc top + else + -- nothing we can do here + throwErrorAsException + _ -> -- not a runtime error, we won't be able to recover from that + throwErrorAsException diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs index c49b82ccf..be90eb23e 100644 --- a/src/Text/Pandoc/Readers/DocBook.hs +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -19,7 +19,7 @@ import Data.Foldable (asum) import Data.Generics import Data.List (intersperse,elemIndex) import Data.List.NonEmpty (nonEmpty) -import Data.Maybe (fromMaybe,mapMaybe) +import Data.Maybe (catMaybes,fromMaybe,mapMaybe,maybeToList) import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Lazy as TL @@ -316,7 +316,7 @@ List of all DocBook tags, with [x] indicating implemented, [ ] postcode - A postal code in an address [x] preface - Introductory matter preceding the first chapter of a book [ ] prefaceinfo - Meta-information for a Preface -[ ] primary - The primary word or phrase under which an index term should be +[x] primary - The primary word or phrase under which an index term should be sorted [ ] primaryie - A primary term in an index entry, not in the text [ ] printhistory - The printing history of a document @@ -385,7 +385,7 @@ List of all DocBook tags, with [x] indicating implemented, [o] screeninfo - Information about how a screen shot was produced [ ] screenshot - A representation of what the user sees or might see on a computer screen -[ ] secondary - A secondary word or phrase in an index term +[x] secondary - A secondary word or phrase in an index term [ ] secondaryie - A secondary term in an index entry, rather than in the text [x] sect1 - A top-level section of document [x] sect1info - Meta-information for a Sect1 @@ -461,7 +461,7 @@ List of all DocBook tags, with [x] indicating implemented, [x] td - A table entry in an HTML table [x] term - The word or phrase being defined or described in a variable list [ ] termdef - An inline term definition -[ ] tertiary - A tertiary word or phrase in an index term +[x] tertiary - A tertiary word or phrase in an index term [ ] tertiaryie - A tertiary term in an index entry, rather than in the text [ ] textdata - Pointer to external text data [ ] textobject - A wrapper for a text description of an object and its @@ -829,7 +829,7 @@ parseBlock (Elem e) = "section" -> gets dbSectionLevel >>= sect . (+1) "simplesect" -> gets dbSectionLevel >>= - sectWith (attrValue "id" e,["unnumbered"],[]) . (+1) + sectWith(attrValue "id" e) ["unnumbered"] [] . (+1) "refsect1" -> sect 1 "refsect2" -> sect 2 "refsect3" -> sect 3 @@ -907,6 +907,7 @@ parseBlock (Elem e) = let classes' = case attrValue "language" e of "" -> [] x -> [x] + ++ ["numberLines" | attrValue "linenumbering" e == "numbered"] return $ codeBlockWith (attrValue "id" e, classes', []) $ trimNl $ strContentRecursive e parseBlockquote = do @@ -993,8 +994,8 @@ parseBlock (Elem e) = (TableHead nullAttr $ toHeaderRow headrows) [TableBody nullAttr 0 [] $ map toRow bodyrows] (TableFoot nullAttr []) - sect n = sectWith (attrValue "id" e,[],[]) n - sectWith attr n = do + sect n = sectWith(attrValue "id" e) [] [] n + sectWith elId classes attrs n = do isbook <- gets dbBook let n' = if isbook || n == 0 then n + 1 else n headerText <- case filterChild (named "title") e `mplus` @@ -1005,7 +1006,14 @@ parseBlock (Elem e) = modify $ \st -> st{ dbSectionLevel = n } b <- getBlocks e modify $ \st -> st{ dbSectionLevel = n - 1 } - return $ headerWith attr n' headerText <> b + return $ headerWith (elId, classes, maybeToList titleabbrevElAsAttr++attrs) n' headerText <> b + titleabbrevElAsAttr = do + txt <- case filterChild (named "titleabbrev") e `mplus` + (filterChild (named "info") e >>= + filterChild (named "titleabbrev")) of + Just t -> Just ("titleabbrev", strContentRecursive t) + Nothing -> Nothing + return txt lineItems = mapM getInlines $ filterChildren (named "line") e -- | Admonitions are parsed into a div. Following other Docbook tools that output HTML, -- we parse the optional title as a div with the @title@ class, and give the @@ -1079,6 +1087,17 @@ elementToStr :: Content -> Content elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing elementToStr x = x +childElTextAsAttr :: Text -> Element -> Maybe (Text, Text) +childElTextAsAttr n e = case findChild q e of + Nothing -> Nothing + Just childEl -> Just (n, strContentRecursive childEl) + where q = QName n (Just "http://docbook.org/ns/docbook") Nothing + +attrValueAsOptionalAttr :: Text -> Element -> Maybe (Text, Text) +attrValueAsOptionalAttr n e = case attrValue n e of + "" -> Nothing + _ -> Just (n, attrValue n e) + parseInline :: PandocMonad m => Content -> DB m Inlines parseInline (Text (CData _ s _)) = return $ text s parseInline (CRef ref) = @@ -1093,6 +1112,28 @@ parseInline (Elem e) = if ident /= "" || classes /= [] then innerInlines (spanWith (ident,classes,[])) else innerInlines id + "indexterm" -> do + let ident = attrValue "id" e + let classes = T.words $ attrValue "role" e + let attrs = + -- In DocBook, <primary>, <secondary>, <tertiary>, <see>, and <seealso> + -- have mixed content models. However, because we're representing these + -- elements in Pandoc's AST as attributes of a phrase, we flatten all + -- the descendant content of these elements. + [ childElTextAsAttr "primary" e + , childElTextAsAttr "secondary" e + , childElTextAsAttr "tertiary" e + , childElTextAsAttr "see" e + , childElTextAsAttr "seealso" e + , attrValueAsOptionalAttr "significance" e + , attrValueAsOptionalAttr "startref" e + , attrValueAsOptionalAttr "scope" e + , attrValueAsOptionalAttr "class" e + -- We don't do anything with the "pagenum" attribute, because these only + -- occur within literal <index> sections, which is not supported by Pandoc, + -- because Pandoc has no concept of pages. + ] + return $ spanWith (ident, ("indexterm" : classes), (catMaybes attrs)) mempty "equation" -> equation e displayMath "informalequation" -> equation e displayMath "inlineequation" -> equation e math diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index c06adf7e3..5c8f20c18 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -246,8 +246,8 @@ runToText _ = "" parPartToText :: ParPart -> T.Text parPartToText (PlainRun run) = runToText run -parPartToText (InternalHyperLink _ runs) = T.concat $ map runToText runs -parPartToText (ExternalHyperLink _ runs) = T.concat $ map runToText runs +parPartToText (InternalHyperLink _ children) = T.concat $ map parPartToText children +parPartToText (ExternalHyperLink _ children) = T.concat $ map parPartToText children parPartToText _ = "" blacklistedCharStyles :: [CharStyleName] @@ -322,6 +322,7 @@ runToInlines (InlineDrawing fp title alt bs ext) = do (lift . lift) $ P.insertMedia fp Nothing bs return $ imageWith (extentToAttr ext) (T.pack fp) title $ text alt runToInlines InlineChart = return $ spanWith ("", ["chart"], []) $ text "[CHART]" +runToInlines InlineDiagram = return $ spanWith ("", ["diagram"], []) $ text "[DIAGRAM]" extentToAttr :: Extent -> Attr extentToAttr (Just (w, h)) = @@ -434,18 +435,21 @@ parPartToInlines' (Drawing fp title alt bs ext) = do return $ imageWith (extentToAttr ext) (T.pack fp) title $ text alt parPartToInlines' Chart = return $ spanWith ("", ["chart"], []) $ text "[CHART]" -parPartToInlines' (InternalHyperLink anchor runs) = do - ils <- smushInlines <$> mapM runToInlines runs +parPartToInlines' Diagram = + return $ spanWith ("", ["diagram"], []) $ text "[DIAGRAM]" +parPartToInlines' (InternalHyperLink anchor children) = do + ils <- smushInlines <$> mapM parPartToInlines' children return $ link ("#" <> anchor) "" ils -parPartToInlines' (ExternalHyperLink target runs) = do - ils <- smushInlines <$> mapM runToInlines runs +parPartToInlines' (ExternalHyperLink target children) = do + ils <- smushInlines <$> mapM parPartToInlines' children return $ link target "" ils parPartToInlines' (PlainOMath exps) = return $ math $ writeTeX exps -parPartToInlines' (Field info runs) = +parPartToInlines' (Field info children) = case info of - HyperlinkField url -> parPartToInlines' $ ExternalHyperLink url runs - UnknownField -> smushInlines <$> mapM runToInlines runs + HyperlinkField url -> parPartToInlines' $ ExternalHyperLink url children + PagerefField fieldAnchor True -> parPartToInlines' $ InternalHyperLink fieldAnchor children + _ -> smushInlines <$> mapM parPartToInlines' children parPartToInlines' NullParPart = return mempty isAnchorSpan :: Inline -> Bool @@ -532,34 +536,36 @@ trimSps (Many ils) = Many $ Seq.dropWhileL isSp $Seq.dropWhileR isSp ils extraAttr :: (Eq (StyleName a), HasStyleName a) => a -> Attr extraAttr s = ("", [], [("custom-style", fromStyleName $ getStyleName s)]) -parStyleToTransform :: PandocMonad m => ParagraphStyle -> DocxContext m (Blocks -> Blocks) -parStyleToTransform pPr = case pStyle pPr of - c@(getStyleName -> styleName):cs - | styleName `elem` divsToKeep -> do - let pPr' = pPr { pStyle = cs } - transform <- parStyleToTransform pPr' - return $ divWith ("", [normalizeToClassName styleName], []) . transform - | styleName `elem` listParagraphStyles -> do - let pPr' = pPr { pStyle = cs, indentation = Nothing} - transform <- parStyleToTransform pPr' - return $ divWith ("", [normalizeToClassName styleName], []) . transform - | otherwise -> do - let pPr' = pPr { pStyle = cs } - transform <- parStyleToTransform pPr' - styles <- asks (isEnabled Ext_styles . docxOptions) - return $ - (if styles then divWith (extraAttr c) else id) - . (if isBlockQuote c then blockQuote else id) - . transform - [] - | Just left <- indentation pPr >>= leftParIndent -> do - let pPr' = pPr { indentation = Nothing } - hang = fromMaybe 0 $ indentation pPr >>= hangingParIndent - transform <- parStyleToTransform pPr' - return $ if (left - hang) > 0 - then blockQuote . transform - else transform - | otherwise -> return id +paragraphStyleToTransform :: PandocMonad m => ParagraphStyle -> DocxContext m (Blocks -> Blocks) +paragraphStyleToTransform pPr = + let stylenames = map getStyleName (pStyle pPr) + transform = if (`elem` listParagraphStyles) `any` stylenames || relativeIndent pPr <= 0 + then id + else blockQuote + in do + extStylesEnabled <- asks (isEnabled Ext_styles . docxOptions) + return $ foldr (\parStyle transform' -> + (parStyleToTransform extStylesEnabled parStyle) . transform' + ) transform (pStyle pPr) + +parStyleToTransform :: Bool -> ParStyle -> Blocks -> Blocks +parStyleToTransform extStylesEnabled parStyle@(getStyleName -> styleName) + | (styleName `elem` divsToKeep) || (styleName `elem` listParagraphStyles) = + divWith ("", [normalizeToClassName styleName], []) + | otherwise = + (if extStylesEnabled then divWith (extraAttr parStyle) else id) + . (if isBlockQuote parStyle then blockQuote else id) + +-- The relative indent is the indentation minus the indentation of the parent style. +-- This tells us whether this paragraph in particular was indented more and thus +-- should be considered a block quote. +relativeIndent :: ParagraphStyle -> Integer +relativeIndent pPr = + let pStyleLeft = fromMaybe 0 $ pStyleIndentation pPr >>= leftParIndent + pStyleHang = fromMaybe 0 $ pStyleIndentation pPr >>= hangingParIndent + left = fromMaybe pStyleLeft $ indentation pPr >>= leftParIndent + hang = fromMaybe pStyleHang $ indentation pPr >>= hangingParIndent + in (left - hang) - (pStyleLeft - pStyleHang) normalizeToClassName :: (FromStyleName a) => a -> T.Text normalizeToClassName = T.map go . fromStyleName @@ -578,7 +584,7 @@ bodyPartToBlocks (Paragraph pPr parparts) local (\s -> s{ docxInBidi = True }) (bodyPartToBlocks (Paragraph pPr' parparts)) | isCodeDiv pPr = do - transform <- parStyleToTransform pPr + transform <- paragraphStyleToTransform pPr return $ transform $ codeBlock $ @@ -605,7 +611,7 @@ bodyPartToBlocks (Paragraph pPr parparts) else prevParaIls <> space) <> ils' handleInsertion = do modify $ \s -> s {docxPrevPara = mempty} - transform <- parStyleToTransform pPr' + transform <- paragraphStyleToTransform pPr' return $ transform $ paraOrPlain ils'' opts <- asks docxOptions case (pChange pPr', readerTrackChanges opts) of @@ -620,7 +626,7 @@ bodyPartToBlocks (Paragraph pPr parparts) , AllChanges) -> do let attr = ("", ["paragraph-insertion"], addAuthorAndDate cAuthor cDate) insertMark = spanWith attr mempty - transform <- parStyleToTransform pPr' + transform <- paragraphStyleToTransform pPr' return $ transform $ paraOrPlain $ ils'' <> insertMark (Just (TrackedChange Deletion _), AcceptChanges) -> do @@ -632,7 +638,7 @@ bodyPartToBlocks (Paragraph pPr parparts) , AllChanges) -> do let attr = ("", ["paragraph-deletion"], addAuthorAndDate cAuthor cDate) insertMark = spanWith attr mempty - transform <- parStyleToTransform pPr' + transform <- paragraphStyleToTransform pPr' return $ transform $ paraOrPlain $ ils'' <> insertMark _ -> handleInsertion diff --git a/src/Text/Pandoc/Readers/Docx/Fields.hs b/src/Text/Pandoc/Readers/Docx/Fields.hs index 442bc3466..5f090b6be 100644 --- a/src/Text/Pandoc/Readers/Docx/Fields.hs +++ b/src/Text/Pandoc/Readers/Docx/Fields.hs @@ -21,8 +21,11 @@ import Text.Parsec import Text.Parsec.Text (Parser) type URL = T.Text +type Anchor = T.Text data FieldInfo = HyperlinkField URL + -- The boolean indicates whether the field is a hyperlink. + | PagerefField Anchor Bool | UnknownField deriving (Show) @@ -33,6 +36,8 @@ fieldInfo :: Parser FieldInfo fieldInfo = try (HyperlinkField <$> hyperlink) <|> + try ((uncurry PagerefField) <$> pageref) + <|> return UnknownField escapedQuote :: Parser T.Text @@ -72,3 +77,23 @@ hyperlink = do ("\\l", s) : _ -> farg <> "#" <> s _ -> farg return url + +-- See ยง17.16.5.45 +pagerefSwitch :: Parser (T.Text, T.Text) +pagerefSwitch = do + sw <- string "\\h" + spaces + farg <- fieldArgument + return (T.pack sw, farg) + +pageref :: Parser (Anchor, Bool) +pageref = do + many space + string "PAGEREF" + spaces + farg <- fieldArgument + switches <- spaces *> many pagerefSwitch + let isLink = case switches of + ("\\h", _) : _ -> True + _ -> False + return (farg, isLink) diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index dbb16a821..87a3aebef 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -50,6 +50,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , archiveToDocxWithWarnings , getStyleNames , pHeading + , pStyleIndentation , constructBogusParStyleData , leftBiasedMergeRunStyle , rowsToRowspans @@ -92,14 +93,13 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes deriving Show data ReaderState = ReaderState { stateWarnings :: [T.Text] - , stateFldCharState :: FldCharState + , stateFldCharState :: [FldCharState] } deriving Show data FldCharState = FldCharOpen | FldCharFieldInfo FieldInfo - | FldCharContent FieldInfo [Run] - | FldCharClosed + | FldCharContent FieldInfo [ParPart] deriving (Show) data DocxError = DocxError @@ -194,11 +194,6 @@ data Notes = Notes NameSpaces data Comments = Comments NameSpaces (M.Map T.Text Element) deriving Show -data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer - , rightParIndent :: Maybe Integer - , hangingParIndent :: Maybe Integer} - deriving Show - data ChangeType = Insertion | Deletion deriving Show @@ -318,12 +313,13 @@ data ParPart = PlainRun Run | CommentStart CommentId Author (Maybe CommentDate) [BodyPart] | CommentEnd CommentId | BookMark BookMarkId Anchor - | InternalHyperLink Anchor [Run] - | ExternalHyperLink URL [Run] + | InternalHyperLink Anchor [ParPart] + | ExternalHyperLink URL [ParPart] | Drawing FilePath T.Text T.Text B.ByteString Extent -- title, alt | Chart -- placeholder for now + | Diagram -- placeholder for now | PlainOMath [Exp] - | Field FieldInfo [Run] + | Field FieldInfo [ParPart] | NullParPart -- when we need to return nothing, but -- not because of an error. deriving Show @@ -333,6 +329,7 @@ data Run = Run RunStyle [RunElem] | Endnote [BodyPart] | InlineDrawing FilePath T.Text T.Text B.ByteString Extent -- title, alt | InlineChart -- placeholder + | InlineDiagram -- placeholder deriving Show data RunElem = TextRun T.Text | LnBrk | Tab | SoftHyphen | NoBreakHyphen @@ -375,7 +372,7 @@ archiveToDocxWithWarnings archive = do , envDocXmlPath = docXmlPath } rState = ReaderState { stateWarnings = [] - , stateFldCharState = FldCharClosed + , stateFldCharState = [] } (eitherDoc, st) = runD (archiveToDocument archive) rEnv rState case eitherDoc of @@ -437,6 +434,7 @@ getStyleNames = fmap getStyleName constructBogusParStyleData :: ParaStyleName -> ParStyle constructBogusParStyleData stName = ParStyle { headingLev = Nothing + , indent = Nothing , numInfo = Nothing , psParentStyle = Nothing , pStyleName = stName @@ -507,9 +505,7 @@ archiveToRelationships archive docXmlPath = filePathIsMedia :: FilePath -> Bool filePathIsMedia fp = - let (dir, _) = splitFileName fp - in - (dir == "word/media/") + "media" `elem` splitDirectories (takeDirectory fp) lookupLevel :: T.Text -> T.Text -> Numbering -> Maybe Level lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do @@ -673,20 +669,6 @@ elemToCell ns element | isElem ns "w" "tc" element = return $ Cell (fromMaybe 1 gridSpan) vMerge cellContents elemToCell _ _ = throwError WrongElem -elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation -elemToParIndentation ns element | isElem ns "w" "ind" element = - Just ParIndentation { - leftParIndent = - findAttrByName ns "w" "left" element >>= - stringToInteger - , rightParIndent = - findAttrByName ns "w" "right" element >>= - stringToInteger - , hangingParIndent = - findAttrByName ns "w" "hanging" element >>= - stringToInteger } -elemToParIndentation _ _ = Nothing - testBitMask :: Text -> Int -> Bool testBitMask bitMaskS n = case (reads ("0x" ++ T.unpack bitMaskS) :: [(Int, String)]) of @@ -699,6 +681,9 @@ pHeading = getParStyleField headingLev . pStyle pNumInfo :: ParagraphStyle -> Maybe (T.Text, T.Text) pNumInfo = getParStyleField numInfo . pStyle +pStyleIndentation :: ParagraphStyle -> Maybe ParIndentation +pStyleIndentation style = (getParStyleField indent . pStyle) style + elemToBodyPart :: NameSpaces -> Element -> D BodyPart elemToBodyPart ns element | isElem ns "w" "p" element @@ -715,28 +700,31 @@ elemToBodyPart ns element elemToBodyPart ns element | isElem ns "w" "p" element = do parstyle <- elemToParagraphStyle ns element <$> asks envParStyles - parparts <- mapD (elemToParPart ns) (elChildren element) + parparts' <- mapD (elemToParPart ns) (elChildren element) + fldCharState <- gets stateFldCharState + modify $ \st -> st {stateFldCharState = emptyFldCharContents fldCharState} -- Word uses list enumeration for numbered headings, so we only -- want to infer a list from the styles if it is NOT a heading. - case pHeading parstyle of - Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do - levelInfo <- lookupLevel numId lvl <$> asks envNumbering - return $ ListItem parstyle numId lvl levelInfo parparts - _ -> let - hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle) - - hasSimpleTableField = fromMaybe False $ do - fldSimple <- findChildByName ns "w" "fldSimple" element - instr <- findAttrByName ns "w" "instr" fldSimple - pure ("Table" `elem` T.words instr) - - hasComplexTableField = fromMaybe False $ do - instrText <- findElementByName ns "w" "instrText" element - pure ("Table" `elem` T.words (strContent instrText)) - - in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField) - then return $ TblCaption parstyle parparts - else return $ Paragraph parstyle parparts + let parparts = parparts' ++ (openFldCharsToParParts fldCharState) in + case pHeading parstyle of + Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do + levelInfo <- lookupLevel numId lvl <$> asks envNumbering + return $ ListItem parstyle numId lvl levelInfo parparts + _ -> let + hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle) + + hasSimpleTableField = fromMaybe False $ do + fldSimple <- findChildByName ns "w" "fldSimple" element + instr <- findAttrByName ns "w" "instr" fldSimple + pure ("Table" `elem` T.words instr) + + hasComplexTableField = fromMaybe False $ do + instrText <- findElementByName ns "w" "instrText" element + pure ("Table" `elem` T.words (strContent instrText)) + + in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField) + then return $ TblCaption parstyle parparts + else return $ Paragraph parstyle parparts elemToBodyPart ns element | isElem ns "w" "tbl" element = do @@ -768,14 +756,30 @@ lookupRelationship docLocation relid rels = where pairs = map (\(Relationship loc relid' target) -> ((loc, relid'), target)) rels +openFldCharsToParParts :: [FldCharState] -> [ParPart] +openFldCharsToParParts [] = [] +openFldCharsToParParts (FldCharContent info children : ancestors) = case openFldCharsToParParts ancestors of + Field parentInfo siblings : _ -> [Field parentInfo $ siblings ++ [Field info $ reverse children]] + _ -> [Field info $ reverse children] +openFldCharsToParParts (_ : ancestors) = openFldCharsToParParts ancestors + +emptyFldCharContents :: [FldCharState] -> [FldCharState] +emptyFldCharContents = map + (\x -> case x of + FldCharContent info _ -> FldCharContent info [] + _ -> x) + expandDrawingId :: T.Text -> D (FilePath, B.ByteString) expandDrawingId s = do location <- asks envLocation target <- asks (fmap T.unpack . lookupRelationship location s . envRelationships) case target of Just filepath -> do - bytes <- asks (lookup ("word/" ++ filepath) . envMedia) - case bytes of + media <- asks envMedia + let filepath' = case filepath of + ('/':rest) -> rest + _ -> "word/" ++ filepath + case lookup filepath' media of Just bs -> return (filepath, bs) Nothing -> throwError DocxError Nothing -> throwError DocxError @@ -789,44 +793,6 @@ getTitleAndAlt ns element = in (title, alt) elemToParPart :: NameSpaces -> Element -> D ParPart -elemToParPart ns element - | isElem ns "w" "r" element - , Just drawingElem <- findChildByName ns "w" "drawing" element - , pic_ns <- "http://schemas.openxmlformats.org/drawingml/2006/picture" - , Just picElem <- findElement (QName "pic" (Just pic_ns) (Just "pic")) drawingElem - = let (title, alt) = getTitleAndAlt ns drawingElem - a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" - drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem - >>= findAttrByName ns "r" "embed" - in - case drawing of - Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem) - Nothing -> throwError WrongElem --- The two cases below are an attempt to deal with images in deprecated vml format. --- Todo: check out title and attr for deprecated format. -elemToParPart ns element - | isElem ns "w" "r" element - , Just _ <- findChildByName ns "w" "pict" element = - let drawing = findElement (elemName ns "v" "imagedata") element - >>= findAttrByName ns "r" "id" - in - case drawing of - Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) - Nothing -> throwError WrongElem -elemToParPart ns element - | isElem ns "w" "r" element - , Just objectElem <- findChildByName ns "w" "object" element - , Just shapeElem <- findChildByName ns "v" "shape" objectElem - , Just imagedataElem <- findChildByName ns "v" "imagedata" shapeElem - , Just drawingId <- findAttrByName ns "r" "id" imagedataElem - = expandDrawingId drawingId >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) --- Chart -elemToParPart ns element - | isElem ns "w" "r" element - , Just drawingElem <- findChildByName ns "w" "drawing" element - , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart" - , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) drawingElem - = return Chart {- The next one is a bit complicated. fldChar fields work by first having a <w:fldChar fldCharType="begin"> in a run, then a run with @@ -858,8 +824,13 @@ example (omissions and my comments in brackets): So we do this in a number of steps. If we encounter the fldchar begin tag, we start open a fldchar state variable (see state above). We add the instrtext to it as FieldInfo. Then we close that and start adding -the runs when we get to separate. Then when we get to end, we produce -the Field type with appropriate FieldInfo and Runs. +the children when we get to separate. Then when we get to end, we produce +the Field type with appropriate FieldInfo and ParParts. + +Since there can be nested fields, the fldchar state needs to be a stack, +so we can have multiple fldchars open at the same time. When a fldchar is +closed, we either add the resulting field to its parent or we return it if +there is no parent. -} elemToParPart ns element | isElem ns "w" "r" element @@ -867,78 +838,142 @@ elemToParPart ns element , Just fldCharType <- findAttrByName ns "w" "fldCharType" fldChar = do fldCharState <- gets stateFldCharState case fldCharState of - FldCharClosed | fldCharType == "begin" -> do - modify $ \st -> st {stateFldCharState = FldCharOpen} + _ | fldCharType == "begin" -> do + modify $ \st -> st {stateFldCharState = FldCharOpen : fldCharState} + return NullParPart + FldCharFieldInfo info : ancestors | fldCharType == "separate" -> do + modify $ \st -> st {stateFldCharState = FldCharContent info [] : ancestors} return NullParPart - FldCharFieldInfo info | fldCharType == "separate" -> do - modify $ \st -> st {stateFldCharState = FldCharContent info []} + -- Some fields have no content, since Pandoc doesn't understand any of those fields, we can just close it. + FldCharFieldInfo _ : ancestors | fldCharType == "end" -> do + modify $ \st -> st {stateFldCharState = ancestors} return NullParPart - FldCharContent info runs | fldCharType == "end" -> do - modify $ \st -> st {stateFldCharState = FldCharClosed} - return $ Field info $ reverse runs + [FldCharContent info children] | fldCharType == "end" -> do + modify $ \st -> st {stateFldCharState = []} + return $ Field info $ reverse children + FldCharContent info children : FldCharContent parentInfo siblings : ancestors | fldCharType == "end" -> + let parent = FldCharContent parentInfo $ (Field info (reverse children)) : siblings in do + modify $ \st -> st {stateFldCharState = parent : ancestors} + return NullParPart _ -> throwError WrongElem elemToParPart ns element | isElem ns "w" "r" element , Just instrText <- findChildByName ns "w" "instrText" element = do fldCharState <- gets stateFldCharState case fldCharState of - FldCharOpen -> do + FldCharOpen : ancestors -> do info <- eitherToD $ parseFieldInfo $ strContent instrText - modify $ \st -> st{stateFldCharState = FldCharFieldInfo info} + modify $ \st -> st {stateFldCharState = FldCharFieldInfo info : ancestors} return NullParPart _ -> return NullParPart -elemToParPart ns element +{- +There is an open fldchar, so we calculate the element and add it to the +children. For this we need to first change the fldchar state to an empty +stack to avoid descendants of children simply being added to the state instead +of to their direct parent element. This would happen in the case of a +w:hyperlink element for example. +-} +elemToParPart ns element = do + fldCharState <- gets stateFldCharState + case fldCharState of + FldCharContent info children : ancestors -> do + modify $ \st -> st {stateFldCharState = []} + parPart <- elemToParPart' ns element `catchError` \_ -> return NullParPart + modify $ \st -> st{stateFldCharState = FldCharContent info (parPart : children) : ancestors} + return NullParPart + _ -> elemToParPart' ns element + +elemToParPart' :: NameSpaces -> Element -> D ParPart +elemToParPart' ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChildByName ns "w" "drawing" element + , pic_ns <- "http://schemas.openxmlformats.org/drawingml/2006/picture" + , Just picElem <- findElement (QName "pic" (Just pic_ns) (Just "pic")) drawingElem + = let (title, alt) = getTitleAndAlt ns drawingElem + a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" + drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem + >>= findAttrByName ns "r" "embed" + in + case drawing of + Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem) + Nothing -> throwError WrongElem +-- The two cases below are an attempt to deal with images in deprecated vml format. +-- Todo: check out title and attr for deprecated format. +elemToParPart' ns element + | isElem ns "w" "r" element + , Just _ <- findChildByName ns "w" "pict" element = + let drawing = findElement (elemName ns "v" "imagedata") element + >>= findAttrByName ns "r" "id" + in + case drawing of + Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) + Nothing -> throwError WrongElem +elemToParPart' ns element + | isElem ns "w" "r" element + , Just objectElem <- findChildByName ns "w" "object" element + , Just shapeElem <- findChildByName ns "v" "shape" objectElem + , Just imagedataElem <- findChildByName ns "v" "imagedata" shapeElem + , Just drawingId <- findAttrByName ns "r" "id" imagedataElem + = expandDrawingId drawingId >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) +-- Diagram +elemToParPart' ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChildByName ns "w" "drawing" element + , d_ns <- "http://schemas.openxmlformats.org/drawingml/2006/diagram" + , Just _ <- findElement (QName "relIds" (Just d_ns) (Just "dgm")) drawingElem + = return Diagram +-- Chart +elemToParPart' ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChildByName ns "w" "drawing" element + , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart" + , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) drawingElem + = return Chart +elemToParPart' ns element | isElem ns "w" "r" element = do run <- elemToRun ns element - -- we check to see if we have an open FldChar in state that we're - -- recording. - fldCharState <- gets stateFldCharState - case fldCharState of - FldCharContent info runs -> do - modify $ \st -> st{stateFldCharState = FldCharContent info (run : runs)} - return NullParPart - _ -> return $ PlainRun run -elemToParPart ns element + return $ PlainRun run +elemToParPart' ns element | Just change <- getTrackedChange ns element = do runs <- mapD (elemToRun ns) (elChildren element) return $ ChangedRuns change runs -elemToParPart ns element +elemToParPart' ns element | isElem ns "w" "bookmarkStart" element , Just bmId <- findAttrByName ns "w" "id" element , Just bmName <- findAttrByName ns "w" "name" element = return $ BookMark bmId bmName -elemToParPart ns element +elemToParPart' ns element | isElem ns "w" "hyperlink" element , Just relId <- findAttrByName ns "r" "id" element = do location <- asks envLocation - runs <- mapD (elemToRun ns) (elChildren element) + children <- mapD (elemToParPart ns) (elChildren element) rels <- asks envRelationships case lookupRelationship location relId rels of Just target -> case findAttrByName ns "w" "anchor" element of - Just anchor -> return $ ExternalHyperLink (target <> "#" <> anchor) runs - Nothing -> return $ ExternalHyperLink target runs - Nothing -> return $ ExternalHyperLink "" runs -elemToParPart ns element + Just anchor -> return $ ExternalHyperLink (target <> "#" <> anchor) children + Nothing -> return $ ExternalHyperLink target children + Nothing -> return $ ExternalHyperLink "" children +elemToParPart' ns element | isElem ns "w" "hyperlink" element , Just anchor <- findAttrByName ns "w" "anchor" element = do - runs <- mapD (elemToRun ns) (elChildren element) - return $ InternalHyperLink anchor runs -elemToParPart ns element + children <- mapD (elemToParPart ns) (elChildren element) + return $ InternalHyperLink anchor children +elemToParPart' ns element | isElem ns "w" "commentRangeStart" element , Just cmtId <- findAttrByName ns "w" "id" element = do (Comments _ commentMap) <- asks envComments case M.lookup cmtId commentMap of Just cmtElem -> elemToCommentStart ns cmtElem Nothing -> throwError WrongElem -elemToParPart ns element +elemToParPart' ns element | isElem ns "w" "commentRangeEnd" element , Just cmtId <- findAttrByName ns "w" "id" element = return $ CommentEnd cmtId -elemToParPart ns element +elemToParPart' ns element | isElem ns "m" "oMath" element = fmap PlainOMath (eitherToD $ readOMML $ showElement element) -elemToParPart _ _ = throwError WrongElem +elemToParPart' _ _ = throwError WrongElem elemToCommentStart :: NameSpaces -> Element -> D ParPart elemToCommentStart ns element @@ -987,6 +1022,11 @@ childElemToRun ns element , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) element = return InlineChart childElemToRun ns element + | isElem ns "w" "drawing" element + , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/diagram" + , Just _ <- findElement (QName "relIds" (Just c_ns) (Just "dgm")) element + = return InlineDiagram +childElemToRun ns element | isElem ns "w" "footnoteReference" element , Just fnId <- findAttrByName ns "w" "id" element = do notes <- asks envNotes @@ -1071,8 +1111,7 @@ elemToParagraphStyle ns element sty in ParagraphStyle {pStyle = mapMaybe (`M.lookup` sty) style , indentation = - findChildByName ns "w" "ind" pPr >>= - elemToParIndentation ns + getIndentation ns element , dropCap = case findChildByName ns "w" "framePr" pPr >>= diff --git a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs index 0d7271d6a..df942579a 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs @@ -21,6 +21,7 @@ module Text.Pandoc.Readers.Docx.Parse.Styles ( , CharStyle , ParaStyleId(..) , ParStyle(..) + , ParIndentation(..) , RunStyle(..) , HasStyleName , StyleName @@ -37,6 +38,7 @@ module Text.Pandoc.Readers.Docx.Parse.Styles ( , fromStyleName , fromStyleId , stringToInteger + , getIndentation , getNumInfo , elemToRunStyle , defaultRunStyle @@ -115,7 +117,13 @@ data RunStyle = RunStyle { isBold :: Maybe Bool } deriving Show +data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer + , rightParIndent :: Maybe Integer + , hangingParIndent :: Maybe Integer} + deriving Show + data ParStyle = ParStyle { headingLev :: Maybe (ParaStyleName, Int) + , indent :: Maybe ParIndentation , numInfo :: Maybe (T.Text, T.Text) , psParentStyle :: Maybe ParStyle , pStyleName :: ParaStyleName @@ -290,6 +298,22 @@ getHeaderLevel ns element , n > 0 = Just (styleName, fromInteger n) getHeaderLevel _ _ = Nothing +getIndentation :: NameSpaces -> Element -> Maybe ParIndentation +getIndentation ns el = do + indElement <- findChildByName ns "w" "pPr" el >>= + findChildByName ns "w" "ind" + return $ ParIndentation + { + leftParIndent = findAttrByName ns "w" "left" indElement <|> + findAttrByName ns "w" "start" indElement >>= + stringToInteger + , rightParIndent = findAttrByName ns "w" "right" indElement <|> + findAttrByName ns "w" "end" indElement >>= + stringToInteger + , hangingParIndent = findAttrByName ns "w" "hanging" indElement >>= + stringToInteger + } + getElementStyleName :: Coercible T.Text a => NameSpaces -> Element -> Maybe a getElementStyleName ns el = coerce <$> ((findChildByName ns "w" "name" el >>= findAttrByName ns "w" "val") @@ -314,6 +338,7 @@ elemToParStyleData ns element parentStyle = Just $ ParStyle { headingLev = getHeaderLevel ns element + , indent = getIndentation ns element , numInfo = getNumInfo ns element , psParentStyle = parentStyle , pStyleName = styleName diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index fdf4f28e0..8aa2646b2 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -551,7 +551,7 @@ pFigure = try $ do let caption = fromMaybe mempty mbcap case B.toList <$> mbimg of Just [Image attr _ (url, tit)] -> - return $ B.para $ B.imageWith attr url ("fig:" <> tit) caption + return $ B.simpleFigureWith attr caption url tit _ -> mzero pCodeBlock :: PandocMonad m => TagParser m Blocks @@ -643,7 +643,7 @@ pQ = do case lookup "cite" attrs of Just url -> do let uid = fromMaybe mempty $ - lookup "name" attrs <> lookup "id" attrs + lookup "name" attrs <|> lookup "id" attrs let cls = maybe [] T.words $ lookup "class" attrs url' <- canonicalizeUrl url makeQuote $ B.spanWith (uid, cls, [("cite", escapeURI url')]) @@ -705,20 +705,18 @@ pLineBreak = do pLink :: PandocMonad m => TagParser m Inlines pLink = try $ do - tag <- pSatisfy $ tagOpenLit "a" (const True) + tag@(TagOpen _ attr') <- pSatisfy $ tagOpenLit "a" (const True) let title = fromAttrib "title" tag - -- take id from id attribute if present, otherwise name - let uid = fromMaybe (fromAttrib "name" tag) $ - maybeFromAttrib "id" tag - let cls = T.words $ fromAttrib "class" tag + let attr = toAttr $ filter (\(k,_) -> k /= "title" && k /= "href") attr' lab <- mconcat <$> manyTill inline (pCloses "a") -- check for href; if href, then a link, otherwise a span case maybeFromAttrib "href" tag of Nothing -> - return $ extractSpaces (B.spanWith (uid, cls, [])) lab + return $ extractSpaces (B.spanWith attr) lab Just url' -> do url <- canonicalizeUrl url' - return $ extractSpaces (B.linkWith (uid, cls, []) (escapeURI url) title) lab + return $ extractSpaces + (B.linkWith attr (escapeURI url) title) lab pImage :: PandocMonad m => TagParser m Inlines pImage = do diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs index bd8d7c96c..a8cdf1de2 100644 --- a/src/Text/Pandoc/Readers/HTML/Parsing.hs +++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs @@ -30,11 +30,11 @@ module Text.Pandoc.Readers.HTML.Parsing ) where -import Control.Monad (guard, void, mzero) +import Control.Monad (void, mzero, mplus) import Data.Maybe (fromMaybe) import Data.Text (Text) import Text.HTML.TagSoup - ( Attribute, Tag (..), isTagText, isTagPosition, isTagOpen, isTagClose, (~==) ) + ( Attribute, Tag (..), isTagPosition, isTagOpen, isTagClose, (~==) ) import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) import Text.Pandoc.Definition (Attr) import Text.Pandoc.Parsing @@ -118,9 +118,11 @@ pCloses tagtype = try $ do _ -> mzero pBlank :: PandocMonad m => TagParser m () -pBlank = try $ do - (TagText str) <- pSatisfy isTagText - guard $ T.all isSpace str +pBlank = void $ pSatisfy isBlank + where + isBlank (TagText t) = T.all isSpace t + isBlank (TagComment _) = True + isBlank _ = False pLocation :: PandocMonad m => TagParser m () pLocation = do @@ -218,9 +220,10 @@ maybeFromAttrib _ _ = Nothing mkAttr :: [(Text, Text)] -> Attr mkAttr attr = (attribsId, attribsClasses, attribsKV) - where attribsId = fromMaybe "" $ lookup "id" attr + where attribsId = fromMaybe "" $ lookup "id" attr `mplus` lookup "name" attr attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes - attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr + attribsKV = filter (\(k,_) -> k /= "class" && k /= "id" && k /= "name") + attr epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr toAttr :: [(Text, Text)] -> Attr diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs index 6e62e12f5..b23a2abc8 100644 --- a/src/Text/Pandoc/Readers/HTML/Table.hs +++ b/src/Text/Pandoc/Readers/HTML/Table.hs @@ -16,7 +16,7 @@ HTML table parser. module Text.Pandoc.Readers.HTML.Table (pTable) where import Control.Applicative ((<|>)) -import Data.Maybe (fromMaybe) +import Data.Maybe (fromMaybe, isJust) import Data.Either (lefts, rights) import Data.List.NonEmpty (nonEmpty) import Data.Text (Text) @@ -27,12 +27,13 @@ import Text.Pandoc.Definition import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) import Text.Pandoc.Parsing ( eof, lookAhead, many, many1, manyTill, option, optional - , optionMaybe, skipMany, try) + , optionMaybe, skipMany, try ) import Text.Pandoc.Readers.HTML.Parsing import Text.Pandoc.Readers.HTML.Types (TagParser) import Text.Pandoc.Shared (onlySimpleTableCells, safeRead) import qualified Data.Text as T import qualified Text.Pandoc.Builder as B +import Control.Monad (guard) -- | Parses a @<col>@ element, returning the column's width. -- An Either value is used: Left i means a "relative length" with @@ -183,11 +184,13 @@ pTableBody :: PandocMonad m -> TagParser m TableBody pTableBody block = try $ do skipMany pBlank - attribs <- option [] $ getAttribs <$> pSatisfy (matchTagOpen "tbody" []) - <* skipMany pBlank + mbattribs <- option Nothing $ Just . getAttribs <$> + pSatisfy (matchTagOpen "tbody" []) <* skipMany pBlank bodyheads <- many (pHeaderRow block) - (rowheads, rows) <- unzip <$> many1 (pRow block <* skipMany pBlank) + (rowheads, rows) <- unzip <$> many (pRow block <* skipMany pBlank) optional $ pSatisfy (matchTagClose "tbody") + guard $ isJust mbattribs || not (null bodyheads && null rows) + let attribs = fromMaybe [] mbattribs return $ TableBody (toAttr attribs) (foldr max 0 rowheads) bodyheads rows where getAttribs (TagOpen _ attribs) = attribs diff --git a/src/Text/Pandoc/Readers/Ipynb.hs b/src/Text/Pandoc/Readers/Ipynb.hs index cd1093109..8e742a888 100644 --- a/src/Text/Pandoc/Readers/Ipynb.hs +++ b/src/Text/Pandoc/Readers/Ipynb.hs @@ -19,6 +19,7 @@ import Data.Char (isDigit) import Data.Maybe (fromMaybe) import Data.Digest.Pure.SHA (sha1, showDigest) import Text.Pandoc.Options +import Control.Applicative ((<|>)) import qualified Data.Scientific as Scientific import qualified Text.Pandoc.Builder as B import Text.Pandoc.Logging @@ -76,7 +77,10 @@ cellToBlocks opts lang c = do let Source ts = cellSource c let source = mconcat ts let kvs = jsonMetaToPairs (cellMetadata c) - let attachments = maybe mempty M.toList $ cellAttachments c + let attachments = case cellAttachments c of + Nothing -> mempty + Just (MimeAttachments m) -> M.toList m + let ident = fromMaybe mempty $ cellId c mapM_ addAttachment attachments case cellType c of Ipynb.Markdown -> do @@ -85,29 +89,34 @@ cellToBlocks opts lang c = do else do Pandoc _ bs <- walk fixImage <$> readMarkdown opts source return bs - return $ B.divWith ("",["cell","markdown"],kvs) + return $ B.divWith (ident,["cell","markdown"],kvs) $ B.fromList bs Ipynb.Heading lev -> do Pandoc _ bs <- readMarkdown opts (T.replicate lev "#" <> " " <> source) - return $ B.divWith ("",["cell","markdown"],kvs) + return $ B.divWith (ident,["cell","markdown"],kvs) $ B.fromList bs Ipynb.Raw -> do -- we use ipynb to indicate no format given (a wildcard in nbformat) - let format = fromMaybe "ipynb" $ lookup "format" kvs + let format = fromMaybe "ipynb" $ lookup "raw_mimetype" kvs <|> lookup "format" kvs let format' = case format of - "text/html" -> "html" - "text/latex" -> "latex" - "application/pdf" -> "latex" - "text/markdown" -> "markdown" - "text/x-rsrt" -> "rst" - _ -> format - return $ B.divWith ("",["cell","raw"],kvs) $ B.rawBlock format' source + "text/html" -> "html" + "slides" -> "html" + "text/latex" -> "latex" + "application/pdf" -> "latex" + "pdf" -> "latex" + "text/markdown" -> "markdown" + "text/x-rst" -> "rst" + "text/restructuredtext" -> "rst" + "text/asciidoc" -> "asciidoc" + _ -> format + return $ B.divWith (ident,["cell","raw"],kvs) + $ B.rawBlock format' source Ipynb.Code{ codeOutputs = outputs, codeExecutionCount = ec } -> do outputBlocks <- mconcat <$> mapM outputToBlock outputs let kvs' = maybe kvs (\x -> ("execution_count", tshow x):kvs) ec - return $ B.divWith ("",["cell","code"],kvs') $ + return $ B.divWith (ident,["cell","code"],kvs') $ B.codeBlockWith ("",[lang],[]) source <> outputBlocks @@ -156,7 +165,7 @@ outputToBlock Err{ errName = ename, -- the output format. handleData :: PandocMonad m => JSONMeta -> MimeBundle -> m B.Blocks -handleData metadata (MimeBundle mb) = +handleData (JSONMeta metadata) (MimeBundle mb) = mconcat <$> mapM dataBlock (M.toList mb) where @@ -192,6 +201,9 @@ handleData metadata (MimeBundle mb) = dataBlock ("text/latex", TextualData t) = return $ B.rawBlock "latex" t + dataBlock ("text/markdown", TextualData t) + = return $ B.rawBlock "markdown" t + dataBlock ("text/plain", TextualData t) = return $ B.codeBlock t @@ -201,7 +213,7 @@ handleData metadata (MimeBundle mb) = dataBlock _ = return mempty jsonMetaToMeta :: JSONMeta -> M.Map Text MetaValue -jsonMetaToMeta = M.map valueToMetaValue +jsonMetaToMeta (JSONMeta m) = M.map valueToMetaValue m where valueToMetaValue :: Value -> MetaValue valueToMetaValue x@Object{} = @@ -220,11 +232,11 @@ jsonMetaToMeta = M.map valueToMetaValue valueToMetaValue Aeson.Null = MetaString "" jsonMetaToPairs :: JSONMeta -> [(Text, Text)] -jsonMetaToPairs = M.toList . M.map +jsonMetaToPairs (JSONMeta m) = M.toList . M.map (\case String t | not (T.all isDigit t) , t /= "true" , t /= "false" -> t - x -> T.pack $ UTF8.toStringLazy $ Aeson.encode x) + x -> T.pack $ UTF8.toStringLazy $ Aeson.encode x) $ m diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs index 9cdbf1611..37e0d13bc 100644 --- a/src/Text/Pandoc/Readers/JATS.hs +++ b/src/Text/Pandoc/Readers/JATS.hs @@ -35,6 +35,7 @@ import Text.Pandoc.XML.Light import qualified Data.Set as S (fromList, member) import Data.Set ((\\)) import Text.Pandoc.Sources (ToSources(..), sourcesToText) +import qualified Data.Foldable as DF type JATS m = StateT JATSState m @@ -226,9 +227,19 @@ parseBlock (Elem e) = mapM getInlines (filterChildren (const True) t) Nothing -> return mempty - img <- getGraphic (Just (capt, attrValue "id" e)) g - return $ para img + + let figAttributes = DF.toList $ + ("alt", ) . strContent <$> + filterChild (named "alt-text") e + + return $ simpleFigureWith + (attrValue "id" e, [], figAttributes) + capt + (attrValue "href" g) + (attrValue "title" g) + _ -> divWith (attrValue "id" e, ["fig"], []) <$> getBlocks e + parseTable = do let isCaption x = named "title" x || named "caption" x capt <- case filterChild isCaption e of diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 27c018e73..20a2db76b 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -390,8 +390,8 @@ inlineCommands = M.unions unescapeURL . removeDoubleQuotes $ untokenize src) -- hyperref - , ("url", (\url -> link url "" (str url)) . unescapeURL . untokenize <$> - bracedUrl) + , ("url", (\url -> linkWith ("",["uri"],[]) url "" (str url)) + . unescapeURL . untokenize <$> bracedUrl) , ("nolinkurl", code . unescapeURL . untokenize <$> bracedUrl) , ("href", do url <- bracedUrl sp @@ -893,7 +893,7 @@ blockCommands = M.fromList addMeta "bibliography" . splitBibs . untokenize)) , ("addbibresource", mempty <$ (skipopts *> braced >>= addMeta "bibliography" . splitBibs . untokenize)) - , ("endinput", mempty <$ skipMany anyTok) + , ("endinput", mempty <$ skipSameFileToks) -- includes , ("lstinputlisting", inputListing) , ("inputminted", inputMinted) @@ -924,6 +924,10 @@ blockCommands = M.fromList , ("epigraph", epigraph) ] +skipSameFileToks :: PandocMonad m => LP m () +skipSameFileToks = do + pos <- getPosition + skipMany $ infile (sourceName pos) environments :: PandocMonad m => M.Map Text (LP m Blocks) environments = M.union (tableEnvironments blocks inline) $ @@ -970,6 +974,7 @@ environments = M.union (tableEnvironments blocks inline) $ , ("toggletrue", braced >>= setToggle True) , ("togglefalse", braced >>= setToggle False) , ("iftoggle", try $ ifToggle >> block) + , ("CSLReferences", braced >> braced >> env "CSLReferences" blocks) ] filecontents :: PandocMonad m => LP m Blocks @@ -1109,24 +1114,28 @@ figure = try $ do addImageCaption :: PandocMonad m => Blocks -> LP m Blocks addImageCaption = walkM go - where go (Image attr@(_, cls, kvs) alt (src,tit)) + where go p@(Para [Image attr@(_, cls, kvs) _ (src, tit)]) | not ("fig:" `T.isPrefixOf` tit) = do st <- getState - let (alt', tit') = case sCaption st of - Just ils -> (toList ils, "fig:" <> tit) - Nothing -> (alt, tit) - attr' = case sLastLabel st of - Just lab -> (lab, cls, kvs) - Nothing -> attr - case attr' of - ("", _, _) -> return () - (ident, _, _) -> do - num <- getNextNumber sLastFigureNum - setState - st{ sLastFigureNum = num - , sLabels = M.insert ident - [Str (renderDottedNum num)] (sLabels st) } - return $ Image attr' alt' (src, tit') + case sCaption st of + Nothing -> return p + Just figureCaption -> do + let mblabel = sLastLabel st + let attr' = case mblabel of + Just lab -> (lab, cls, kvs) + Nothing -> attr + case attr' of + ("", _, _) -> return () + (ident, _, _) -> do + num <- getNextNumber sLastFigureNum + setState + st{ sLastFigureNum = num + , sLabels = M.insert ident + [Str (renderDottedNum num)] (sLabels st) } + + return $ SimpleFigure attr' + (maybe id removeLabel mblabel (B.toList figureCaption)) + (src, tit) go x = return x coloredBlock :: PandocMonad m => Text -> LP m Blocks diff --git a/src/Text/Pandoc/Readers/LaTeX/Inline.hs b/src/Text/Pandoc/Readers/LaTeX/Inline.hs index 7b8bca4af..5938096fd 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Inline.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Inline.hs @@ -35,7 +35,7 @@ import Text.Pandoc.Readers.LaTeX.Parsing import Text.Pandoc.Extensions (extensionEnabled, Extension(..)) import Text.Pandoc.Parsing (getOption, updateState, getState, notFollowedBy, manyTill, getInput, setInput, incSourceColumn, - option, many1, try) + option, many1) import Data.Char (isDigit) import Text.Pandoc.Highlighting (fromListingsLanguage,) import Data.Maybe (maybeToList, fromMaybe) @@ -56,8 +56,7 @@ dolabel = do let refstr = untokenize v updateState $ \st -> st{ sLastLabel = Just refstr } - return $ spanWith (refstr,[],[("label", refstr)]) - $ inBrackets $ str $ untokenize v + return $ spanWith (refstr,[],[("label", refstr)]) mempty doref :: PandocMonad m => Text -> LP m Inlines doref cls = do @@ -160,8 +159,8 @@ romanNumeralArg = spaces *> (parser <|> inBraces) accentWith :: PandocMonad m => LP m Inlines -> Char -> Maybe Char -> LP m Inlines -accentWith tok combiningAccent fallBack = try $ do - ils <- tok +accentWith tok combiningAccent fallBack = do + ils <- option mempty tok case toList ils of (Str (T.uncons -> Just (x, xs)) : ys) -> return $ fromList $ -- try to normalize to the combined character: @@ -339,6 +338,7 @@ refCommands = M.fromList , ("cref", rawInlineOr "cref" $ doref "ref") -- from cleveref.sty , ("vref", rawInlineOr "vref" $ doref "ref+page") -- from varioref.sty , ("eqref", rawInlineOr "eqref" $ doref "eqref") -- from amsmath.sty + , ("autoref", rawInlineOr "autoref" $ doref "autoref") -- from hyperref.sty ] acronymCommands :: PandocMonad m => M.Map Text (LP m Inlines) diff --git a/src/Text/Pandoc/Readers/LaTeX/Macro.hs b/src/Text/Pandoc/Readers/LaTeX/Macro.hs index 5495a8e74..d40277eb5 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Macro.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Macro.hs @@ -15,6 +15,8 @@ import Control.Applicative ((<|>), optional) import qualified Data.Map as M import Data.Text (Text) import qualified Data.Text as T +import qualified Data.List.NonEmpty as NonEmpty +import Data.List.NonEmpty (NonEmpty(..)) macroDef :: (PandocMonad m, Monoid a) => (Text -> a) -> LP m a macroDef constructor = do @@ -23,51 +25,91 @@ macroDef constructor = do guardDisabled Ext_latex_macros) <|> return mempty where commandDef = do - nameMacroPairs <- newcommand <|> letmacro <|> defmacro <|> newif + nameMacroPairs <- newcommand <|> + checkGlobal (letmacro <|> edefmacro <|> defmacro <|> newif) guardDisabled Ext_latex_macros <|> - mapM_ (\(name, macro') -> - updateState (\s -> s{ sMacros = M.insert name macro' - (sMacros s) })) nameMacroPairs + mapM_ insertMacro nameMacroPairs environmentDef = do mbenv <- newenvironment case mbenv of Nothing -> return () Just (name, macro1, macro2) -> guardDisabled Ext_latex_macros <|> - do updateState $ \s -> s{ sMacros = - M.insert name macro1 (sMacros s) } - updateState $ \s -> s{ sMacros = - M.insert ("end" <> name) macro2 (sMacros s) } + do insertMacro (name, macro1) + insertMacro ("end" <> name, macro2) -- @\newenvironment{envname}[n-args][default]{begin}{end}@ -- is equivalent to -- @\newcommand{\envname}[n-args][default]{begin}@ -- @\newcommand{\endenvname}@ +insertMacro :: PandocMonad m => (Text, Macro) -> LP m () +insertMacro (name, macro'@(Macro GlobalScope _ _ _ _)) = + updateState $ \s -> + s{ sMacros = NonEmpty.map (M.insert name macro') (sMacros s) } +insertMacro (name, macro'@(Macro GroupScope _ _ _ _)) = + updateState $ \s -> + s{ sMacros = M.insert name macro' (NonEmpty.head (sMacros s)) :| + NonEmpty.tail (sMacros s) } + +lookupMacro :: PandocMonad m => Text -> LP m Macro +lookupMacro name = do + macros :| _ <- sMacros <$> getState + case M.lookup name macros of + Just m -> return m + Nothing -> fail "Macro not found" + letmacro :: PandocMonad m => LP m [(Text, Macro)] letmacro = do controlSeq "let" - (name, contents) <- withVerbatimMode $ do + withVerbatimMode $ do Tok _ (CtrlSeq name) _ <- anyControlSeq optional $ symbol '=' spaces -- we first parse in verbatim mode, and then expand macros, -- because we don't want \let\foo\bar to turn into -- \let\foo hello if we have previously \def\bar{hello} + target <- anyControlSeq <|> singleChar + case target of + (Tok _ (CtrlSeq name') _) -> + (do m <- lookupMacro name' + pure [(name, m)]) + <|> pure [(name, + Macro GroupScope ExpandWhenDefined [] Nothing [target])] + _ -> pure [(name, Macro GroupScope ExpandWhenDefined [] Nothing [target])] + +checkGlobal :: PandocMonad m => LP m [(Text, Macro)] -> LP m [(Text, Macro)] +checkGlobal p = + (controlSeq "global" *> + (map (\(n, Macro _ expand arg optarg contents) -> + (n, Macro GlobalScope expand arg optarg contents)) <$> p)) + <|> p + +edefmacro :: PandocMonad m => LP m [(Text, Macro)] +edefmacro = do + scope <- (GroupScope <$ controlSeq "edef") + <|> (GlobalScope <$ controlSeq "xdef") + (name, contents) <- withVerbatimMode $ do + Tok _ (CtrlSeq name) _ <- anyControlSeq + -- we first parse in verbatim mode, and then expand macros, + -- because we don't want \let\foo\bar to turn into + -- \let\foo hello if we have previously \def\bar{hello} contents <- bracedOrToken return (name, contents) - contents' <- doMacros' 0 contents - return [(name, Macro ExpandWhenDefined [] Nothing contents')] + -- expand macros + contents' <- parseFromToks (many anyTok) contents + return [(name, Macro scope ExpandWhenDefined [] Nothing contents')] defmacro :: PandocMonad m => LP m [(Text, Macro)] defmacro = do -- we use withVerbatimMode, because macros are to be expanded -- at point of use, not point of definition - controlSeq "def" + scope <- (GroupScope <$ controlSeq "def") + <|> (GlobalScope <$ controlSeq "gdef") withVerbatimMode $ do Tok _ (CtrlSeq name) _ <- anyControlSeq argspecs <- many (argspecArg <|> argspecPattern) contents <- bracedOrToken - return [(name, Macro ExpandWhenUsed argspecs Nothing contents)] + return [(name, Macro scope ExpandWhenUsed argspecs Nothing contents)] -- \newif\iffoo' defines: -- \iffoo to be \iffalse @@ -82,16 +124,16 @@ newif = do -- \def\footrue{\def\iffoo\iftrue} -- \def\foofalse{\def\iffoo\iffalse} let base = T.drop 2 name - return [ (name, Macro ExpandWhenUsed [] Nothing + return [ (name, Macro GroupScope ExpandWhenUsed [] Nothing [Tok pos (CtrlSeq "iffalse") "\\iffalse"]) , (base <> "true", - Macro ExpandWhenUsed [] Nothing + Macro GroupScope ExpandWhenUsed [] Nothing [ Tok pos (CtrlSeq "def") "\\def" , Tok pos (CtrlSeq name) ("\\" <> name) , Tok pos (CtrlSeq "iftrue") "\\iftrue" ]) , (base <> "false", - Macro ExpandWhenUsed [] Nothing + Macro GroupScope ExpandWhenUsed [] Nothing [ Tok pos (CtrlSeq "def") "\\def" , Tok pos (CtrlSeq name) ("\\" <> name) , Tok pos (CtrlSeq "iffalse") "\\iffalse" @@ -138,14 +180,13 @@ newcommand = do : (contents' ++ [ Tok pos Symbol "}", Tok pos Symbol "}" ]) _ -> contents' - macros <- sMacros <$> getState - case M.lookup name macros of - Just macro - | mtype == "newcommand" -> do - report $ MacroAlreadyDefined txt pos - return [(name, macro)] - | mtype == "providecommand" -> return [(name, macro)] - _ -> return [(name, Macro ExpandWhenUsed argspecs optarg contents)] + let macro = Macro GroupScope ExpandWhenUsed argspecs optarg contents + (do lookupMacro name + case mtype of + "providecommand" -> return [] + "renewcommand" -> return [(name, macro)] + _ -> [] <$ report (MacroAlreadyDefined txt pos)) + <|> pure [(name, macro)] newenvironment :: PandocMonad m => LP m (Maybe (Text, Macro, Macro)) newenvironment = do @@ -164,17 +205,23 @@ newenvironment = do let argspecs = map (\i -> ArgNum i) [1..numargs] startcontents <- spaces >> bracedOrToken endcontents <- spaces >> bracedOrToken - macros <- sMacros <$> getState - case M.lookup name macros of - Just _ - | mtype == "newenvironment" -> do - report $ MacroAlreadyDefined name pos - return Nothing - | mtype == "provideenvironment" -> - return Nothing - _ -> return $ Just (name, - Macro ExpandWhenUsed argspecs optarg startcontents, - Macro ExpandWhenUsed [] Nothing endcontents) + -- we need the environment to be in a group so macros defined + -- inside behave correctly: + let bg = Tok pos (CtrlSeq "bgroup") "\\bgroup " + let eg = Tok pos (CtrlSeq "egroup") "\\egroup " + let result = (name, + Macro GroupScope ExpandWhenUsed argspecs optarg + (bg:startcontents), + Macro GroupScope ExpandWhenUsed [] Nothing + (endcontents ++ [eg])) + (do lookupMacro name + case mtype of + "provideenvironment" -> return Nothing + "renewenvironment" -> return (Just result) + _ -> do + report $ MacroAlreadyDefined name pos + return Nothing) + <|> return (Just result) bracketedNum :: PandocMonad m => LP m Int bracketedNum = do diff --git a/src/Text/Pandoc/Readers/LaTeX/Math.hs b/src/Text/Pandoc/Readers/LaTeX/Math.hs index 5b49a0376..01edce7ed 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Math.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Math.hs @@ -142,14 +142,15 @@ newtheorem inline = do theoremEnvironment :: PandocMonad m => LP m Blocks -> LP m Inlines -> Text -> LP m Blocks theoremEnvironment blocks opt name = do + resetCaption tmap <- sTheoremMap <$> getState case M.lookup name tmap of Nothing -> mzero Just tspec -> do optTitle <- option mempty $ (\x -> space <> "(" <> x <> ")") <$> opt - mblabel <- option Nothing $ Just . untokenize <$> - try (spaces >> controlSeq "label" >> spaces >> braced) bs <- env name blocks + mblabel <- sLastLabel <$> getState + number <- if theoremNumber tspec then do @@ -169,9 +170,7 @@ theoremEnvironment blocks opt name = do Just ident -> updateState $ \s -> s{ sLabels = M.insert ident - (B.toList $ - theoremName tspec <> "\160" <> - str (renderDottedNum num)) (sLabels s) } + (B.toList $ str (renderDottedNum num)) (sLabels s) } Nothing -> return () return $ space <> B.text (renderDottedNum num) else return mempty @@ -181,13 +180,14 @@ theoremEnvironment blocks opt name = do RemarkStyle -> B.emph let title = titleEmph (theoremName tspec <> number) <> optTitle <> "." <> space - return $ divWith (fromMaybe "" mblabel, [name], []) $ addTitle title + return $ divWith (fromMaybe "" mblabel, [name], []) + $ addTitle title + $ maybe id removeLabel mblabel $ case theoremStyle tspec of PlainStyle -> walk italicize bs _ -> bs - proof :: PandocMonad m => LP m Blocks -> LP m Inlines -> LP m Blocks proof blocks opt = do title <- option (B.text "Proof") opt diff --git a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs index 9dac4d6ef..9eb4a0cbc 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs @@ -45,6 +45,7 @@ module Text.Pandoc.Readers.LaTeX.Parsing , isNewlineTok , isWordTok , isArgTok + , infile , spaces , spaces1 , tokTypeIn @@ -89,6 +90,7 @@ module Text.Pandoc.Readers.LaTeX.Parsing , resetCaption , env , addMeta + , removeLabel ) where import Control.Applicative (many, (<|>)) @@ -102,6 +104,9 @@ import qualified Data.IntMap as IntMap import qualified Data.Map as M import qualified Data.Set as Set import Data.Text (Text) +import Data.Maybe (fromMaybe) +import Data.List.NonEmpty (NonEmpty(..)) +import qualified Data.List.NonEmpty as NonEmpty import qualified Data.Text as T import Text.Pandoc.Builder import Text.Pandoc.Class.PandocMonad (PandocMonad, report) @@ -115,6 +120,7 @@ import Text.Pandoc.Readers.LaTeX.Types (ExpansionPoint (..), Macro (..), ArgSpec (..), Tok (..), TokType (..)) import Text.Pandoc.Shared import Text.Parsec.Pos +import Text.Pandoc.Walk newtype DottedNum = DottedNum [Int] deriving (Show, Eq) @@ -146,7 +152,7 @@ data TheoremSpec = data LaTeXState = LaTeXState{ sOptions :: ReaderOptions , sMeta :: Meta , sQuoteContext :: QuoteContext - , sMacros :: M.Map Text Macro + , sMacros :: NonEmpty (M.Map Text Macro) , sContainers :: [Text] , sLogMessages :: [LogMessage] , sIdentifiers :: Set.Set Text @@ -173,7 +179,7 @@ defaultLaTeXState :: LaTeXState defaultLaTeXState = LaTeXState{ sOptions = def , sMeta = nullMeta , sQuoteContext = NoQuote - , sMacros = M.empty + , sMacros = M.empty :| [] , sContainers = [] , sLogMessages = [] , sIdentifiers = Set.empty @@ -220,8 +226,9 @@ instance HasIncludeFiles LaTeXState where dropLatestIncludeFile s = s { sContainers = drop 1 $ sContainers s } instance HasMacros LaTeXState where - extractMacros st = sMacros st - updateMacros f st = st{ sMacros = f (sMacros st) } + extractMacros st = NonEmpty.head $ sMacros st + updateMacros f st = st{ sMacros = f (NonEmpty.head (sMacros st)) + :| NonEmpty.tail (sMacros st) } instance HasReaderOptions LaTeXState where extractReaderOptions = sOptions @@ -254,7 +261,7 @@ rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s, Show a) rawLaTeXParser toks retokenize parser valParser = do pstate <- getState let lstate = def{ sOptions = extractReaderOptions pstate } - let lstate' = lstate { sMacros = extractMacros pstate } + let lstate' = lstate { sMacros = extractMacros pstate :| [] } let setStartPos = case toks of Tok pos _ _ : _ -> setPosition pos _ -> return () @@ -267,14 +274,14 @@ rawLaTeXParser toks retokenize parser valParser = do Right (endpos, toks') -> do res <- lift $ runParserT (do when retokenize $ do -- retokenize, applying macros - ts <- many (satisfyTok (const True)) + ts <- many anyTok setInput ts rawparser) lstate' "chunk" toks' case res of Left _ -> mzero Right ((val, raw), st) -> do - updateState (updateMacros (sMacros st <>)) + updateState (updateMacros ((NonEmpty.head (sMacros st)) <>)) let skipTilPos stopPos = do anyChar pos <- getPosition @@ -296,10 +303,10 @@ rawLaTeXParser toks retokenize parser valParser = do applyMacros :: (PandocMonad m, HasMacros s, HasReaderOptions s) => Text -> ParserT Sources s m Text applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|> - do let retokenize = untokenize <$> many (satisfyTok (const True)) + do let retokenize = untokenize <$> many anyTok pstate <- getState let lstate = def{ sOptions = extractReaderOptions pstate - , sMacros = extractMacros pstate } + , sMacros = extractMacros pstate :| [] } res <- runParserT retokenize lstate "math" (tokenize "math" s) case res of Left e -> Prelude.fail (show e) @@ -552,10 +559,10 @@ doMacros' n inp = handleMacros n' spos name ts = do when (n' > 20) -- detect macro expansion loops $ throwError $ PandocMacroLoop name - macros <- sMacros <$> getState + (macros :| _ ) <- sMacros <$> getState case M.lookup name macros of Nothing -> trySpecialMacro name ts - Just (Macro expansionPoint argspecs optarg newtoks) -> do + Just (Macro _scope expansionPoint argspecs optarg newtoks) -> do let getargs' = do args <- (case expansionPoint of @@ -642,6 +649,9 @@ isArgTok :: Tok -> Bool isArgTok (Tok _ (Arg _) _) = True isArgTok _ = False +infile :: PandocMonad m => SourceName -> LP m Tok +infile reference = satisfyTok (\(Tok source _ _) -> (sourceName source) == reference) + spaces :: PandocMonad m => LP m () spaces = skipMany (satisfyTok (tokTypeIn [Comment, Spaces, Newline])) @@ -745,10 +755,22 @@ primEscape = do bgroup :: PandocMonad m => LP m Tok bgroup = try $ do optional sp - symbol '{' <|> controlSeq "bgroup" <|> controlSeq "begingroup" + t <- symbol '{' <|> controlSeq "bgroup" <|> controlSeq "begingroup" + -- Add a copy of the macro table to the top of the macro stack, + -- private for this group. We inherit all the macros defined in + -- the parent group. + updateState $ \s -> s{ sMacros = NonEmpty.cons (NonEmpty.head (sMacros s)) + (sMacros s) } + return t + egroup :: PandocMonad m => LP m Tok -egroup = symbol '}' <|> controlSeq "egroup" <|> controlSeq "endgroup" +egroup = do + t <- symbol '}' <|> controlSeq "egroup" <|> controlSeq "endgroup" + -- remove the group's macro table from the stack + updateState $ \s -> s{ sMacros = fromMaybe (sMacros s) $ + NonEmpty.nonEmpty (NonEmpty.tail (sMacros s)) } + return t grouped :: (PandocMonad m, Monoid a) => LP m a -> LP m a grouped parser = try $ do @@ -921,6 +943,9 @@ getRawCommand name txt = do void $ count 4 braced "def" -> void $ manyTill anyTok braced + "vadjust" -> + void (manyTill anyTok braced) <|> + void (satisfyTok isPreTok) -- see #7531 _ | isFontSizeCommand name -> return () | otherwise -> do skipopts @@ -928,6 +953,10 @@ getRawCommand name txt = do void $ many braced return $ txt <> untokenize rawargs +isPreTok :: Tok -> Bool +isPreTok (Tok _ Word "pre") = True +isPreTok _ = False + isDigitTok :: Tok -> Bool isDigitTok (Tok _ Word t) = T.all isDigit t isDigitTok _ = False @@ -1017,7 +1046,16 @@ resetCaption = updateState $ \st -> st{ sCaption = Nothing , sLastLabel = Nothing } env :: PandocMonad m => Text -> LP m a -> LP m a -env name p = p <* end_ name +env name p = do + -- environments are groups as far as macros are concerned, + -- so we need a local copy of the macro table (see above, bgroup, egroup): + updateState $ \s -> s{ sMacros = NonEmpty.cons (NonEmpty.head (sMacros s)) + (sMacros s) } + result <- p + updateState $ \s -> s{ sMacros = fromMaybe (sMacros s) $ + NonEmpty.nonEmpty (NonEmpty.tail (sMacros s)) } + end_ name + return result tokWith :: PandocMonad m => LP m Inlines -> LP m Inlines tokWith inlineParser = try $ spaces >> @@ -1031,3 +1069,16 @@ tokWith inlineParser = try $ spaces >> addMeta :: PandocMonad m => ToMetaValue a => Text -> a -> LP m () addMeta field val = updateState $ \st -> st{ sMeta = addMetaField field val $ sMeta st } + +-- remove label spans to avoid duplicated identifier +removeLabel :: Walkable [Inline] a => Text -> a -> a +removeLabel lbl = walk go + where + go (Span (_,_,kvs) _ : rest) + | Just lbl' <- lookup "label" kvs + , lbl' == lbl = go (dropWhile isSpaceOrSoftBreak rest) + go (x:xs) = x : go xs + go [] = [] + isSpaceOrSoftBreak Space = True + isSpaceOrSoftBreak SoftBreak = True + isSpaceOrSoftBreak _ = False diff --git a/src/Text/Pandoc/Readers/LaTeX/SIunitx.hs b/src/Text/Pandoc/Readers/LaTeX/SIunitx.hs index b8bf0ce7f..e4738a763 100644 --- a/src/Text/Pandoc/Readers/LaTeX/SIunitx.hs +++ b/src/Text/Pandoc/Readers/LaTeX/SIunitx.hs @@ -23,11 +23,15 @@ siunitxCommands :: PandocMonad m => LP m Inlines -> M.Map Text (LP m Inlines) siunitxCommands tok = M.fromList [ ("si", dosi tok) + , ("unit", dosi tok) -- v3 version of si , ("SI", doSI tok) + , ("qty", doSI tok) -- v3 version of SI , ("SIrange", doSIrange True tok) + , ("qtyrange", doSIrange True tok) -- v3 version of SIrange + , ("SIlist", doSIlist tok) + , ("qtylist", doSIlist tok) -- v3 version of SIlist , ("numrange", doSIrange False tok) , ("numlist", doSInumlist) - , ("SIlist", doSIlist tok) , ("num", doSInum) , ("ang", doSIang) ] diff --git a/src/Text/Pandoc/Readers/LaTeX/Table.hs b/src/Text/Pandoc/Readers/LaTeX/Table.hs index f56728fe1..7d5c4f265 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Table.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Table.hs @@ -368,7 +368,9 @@ addTableCaption = walkM go ((_,classes,kvs), Just ident) -> (ident,classes,kvs) _ -> attr - return $ addAttrDiv attr' $ Table nullAttr capt spec th tb tf + return $ addAttrDiv attr' + $ maybe id removeLabel mblabel + $ Table nullAttr capt spec th tb tf go x = return x -- TODO: For now we add a Div to contain table attributes, since diff --git a/src/Text/Pandoc/Readers/LaTeX/Types.hs b/src/Text/Pandoc/Readers/LaTeX/Types.hs index c20b72bc5..a4eae56db 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Types.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Types.hs @@ -15,6 +15,7 @@ module Text.Pandoc.Readers.LaTeX.Types ( Tok(..) , Macro(..) , ArgSpec(..) , ExpansionPoint(..) + , MacroScope(..) , SourcePos ) where @@ -43,7 +44,10 @@ tokToText (Tok _ _ t) = t data ExpansionPoint = ExpandWhenDefined | ExpandWhenUsed deriving (Eq, Ord, Show) -data Macro = Macro ExpansionPoint [ArgSpec] (Maybe [Tok]) [Tok] +data MacroScope = GlobalScope | GroupScope + deriving (Eq, Ord, Show) + +data Macro = Macro MacroScope ExpansionPoint [ArgSpec] (Maybe [Tok]) [Tok] deriving Show data ArgSpec = ArgNum Int | Pattern [Tok] diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 2dc7ddf52..b5017a433 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -1,4 +1,5 @@ {-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ViewPatterns #-} @@ -21,14 +22,14 @@ module Text.Pandoc.Readers.Markdown ( import Control.Monad import Control.Monad.Except (throwError) import Data.Char (isAlphaNum, isPunctuation, isSpace) +import Text.DocLayout (realLength) import Data.List (transpose, elemIndex, sortOn, foldl') -import Data.List.NonEmpty (NonEmpty(..)) import qualified Data.Map as M import Data.Maybe import qualified Data.Set as Set import Data.Text (Text) import qualified Data.Text as T -import qualified Data.ByteString.Lazy as BL +import qualified Data.ByteString as BS import System.FilePath (addExtension, takeExtension, takeDirectory) import qualified System.FilePath.Windows as Windows import qualified System.FilePath.Posix as Posix @@ -39,6 +40,7 @@ import Text.Pandoc.Class.PandocMonad (PandocMonad (..), report) import Text.Pandoc.Definition as Pandoc import Text.Pandoc.Emoji (emojiToInline) import Text.Pandoc.Error +import Safe.Foldable (maximumBounded) import Text.Pandoc.Logging import Text.Pandoc.Options import Text.Pandoc.Walk (walk) @@ -72,14 +74,12 @@ readMarkdown opts s = do yamlToMeta :: PandocMonad m => ReaderOptions -> Maybe FilePath - -> BL.ByteString + -> BS.ByteString -> m Meta yamlToMeta opts mbfp bstr = do let parser = do oldPos <- getPosition - case mbfp of - Nothing -> return () - Just fp -> setPosition $ initialPos fp + setPosition $ initialPos (fromMaybe "" mbfp) meta <- yamlBsToMeta (fmap B.toMetaValue <$> parseBlocks) bstr setPosition oldPos return $ runF meta defaultParserState @@ -95,7 +95,7 @@ yamlToRefs :: PandocMonad m => (Text -> Bool) -> ReaderOptions -> Maybe FilePath - -> BL.ByteString + -> BS.ByteString -> m [MetaValue] yamlToRefs idpred opts mbfp bstr = do let parser = do @@ -198,6 +198,7 @@ inlinesInBalancedBrackets = go openBrackets = (() <$ (escapedChar <|> code <|> + math <|> rawHtmlInline <|> rawLaTeXInline') >> go openBrackets) <|> @@ -326,6 +327,7 @@ referenceKey :: PandocMonad m => MarkdownParser m (F Blocks) referenceKey = try $ do pos <- getPosition skipNonindentSpaces + notFollowedBy (void cite) (_,raw) <- reference char ':' skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[') @@ -829,7 +831,7 @@ listLineCommon :: PandocMonad m => MarkdownParser m Text listLineCommon = T.concat <$> manyTill ( many1Char (satisfy $ \c -> c `notElem` ['\n', '<', '`']) <|> fmap snd (withRaw code) - <|> fmap snd (htmlTag isCommentTag) + <|> fmap (renderTags . (:[]) . fst) (htmlTag isCommentTag) <|> countChar 1 anyChar ) newline @@ -1013,19 +1015,18 @@ normalDefinitionList = do para :: PandocMonad m => MarkdownParser m (F Blocks) para = try $ do exts <- getOption readerExtensions - let implicitFigures x - | extensionEnabled Ext_implicit_figures exts = do - x' <- x - case B.toList x' of - [Image attr alt (src,tit)] - | not (null alt) -> - -- the fig: at beginning of title indicates a figure - return $ B.singleton - $ Image attr alt (src, "fig:" <> tit) - _ -> return x' - | otherwise = x - result <- implicitFigures . trimInlinesF <$> inlines1 - option (B.plain <$> result) + + result <- trimInlinesF <$> inlines1 + let figureOr constr inlns = + case B.toList inlns of + [Image attr figCaption (src, tit)] + | extensionEnabled Ext_implicit_figures exts + , not (null figCaption) -> do + B.simpleFigureWith attr (B.fromList figCaption) src tit + + _ -> constr inlns + + option (figureOr B.plain <$> result) $ try $ do newline (mempty <$ blanklines) @@ -1047,7 +1048,7 @@ para = try $ do if divLevel > 0 then lookAhead divFenceEnd else mzero - return $ B.para <$> result + return $ figureOr B.para <$> result plain :: PandocMonad m => MarkdownParser m (F Blocks) plain = fmap B.plain . trimInlinesF <$> inlines1 @@ -1124,7 +1125,12 @@ rawHtmlBlocks = do let selfClosing = "/>" `T.isSuffixOf` raw -- we don't want '<td> text' to be a code block: skipMany spaceChar - indentlevel <- (blankline >> length <$> many (char ' ')) <|> return 0 + tabStop <- getOption readerTabStop + indentlevel <- option 0 $ + do blankline + sum <$> many ( (1 <$ char ' ') + <|> + (tabStop <$ char '\t') ) -- try to find closing tag -- we set stateInHtmlBlock so that closing tags that can be either block or -- inline will not be parsed as inline tags @@ -1355,26 +1361,30 @@ pipeTable = try $ do nonindentSpaces lookAhead nonspaceChar (heads,(aligns, seplengths)) <- (,) <$> pipeTableRow <*> pipeBreak - let heads' = take (length aligns) <$> heads + let cellContents = parseFromString' pipeTableCell . trim + let numcols = length aligns + let heads' = take numcols heads lines' <- many pipeTableRow - let lines'' = map (take (length aligns) <$>) lines' - let maxlength = maximum $ - fmap (\x -> T.length . stringify $ runF x def) (heads' :| lines'') - numColumns <- getOption readerColumns - let widths = if maxlength > numColumns + let lines'' = map (take numcols) lines' + let lineWidths = map (sum . map realLength) (heads' : lines'') + columns <- getOption readerColumns + -- add numcols + 1 for the pipes themselves + let widths = if maximumBounded (sum seplengths : lineWidths) + (numcols + 1) > columns then map (\len -> fromIntegral len / fromIntegral (sum seplengths)) seplengths else replicate (length aligns) 0.0 - return (aligns, widths, toHeaderRow <$> heads', map toRow <$> sequence lines'') + (headCells :: F [Blocks]) <- sequence <$> mapM cellContents heads' + (rows :: F [[Blocks]]) <- sequence <$> mapM (fmap sequence . mapM cellContents) lines'' + return (aligns, widths, toHeaderRow <$> headCells, map toRow <$> rows) sepPipe :: PandocMonad m => MarkdownParser m () sepPipe = try $ do char '|' <|> char '+' notFollowedBy blankline --- parse a row, also returning probable alignments for org-table cells -pipeTableRow :: PandocMonad m => MarkdownParser m (F [Blocks]) +-- parse a row, returning raw cell contents +pipeTableRow :: PandocMonad m => MarkdownParser m [Text] pipeTableRow = try $ do scanForPipe skipMany spaceChar @@ -1382,13 +1392,11 @@ pipeTableRow = try $ do -- split into cells let chunk = void (code <|> math <|> rawHtmlInline <|> escapedChar <|> rawLaTeXInline') <|> void (noneOf "|\n\r") - let cellContents = withRaw (many chunk) >>= - parseFromString' pipeTableCell . trim . snd - cells <- cellContents `sepEndBy1` char '|' + cells <- (snd <$> withRaw (many chunk)) `sepEndBy1` char '|' -- surrounding pipes needed for a one-column table: guard $ not (length cells == 1 && not openPipe) blankline - return $ sequence cells + return cells pipeTableCell :: PandocMonad m => MarkdownParser m (F Blocks) pipeTableCell = @@ -1692,21 +1700,29 @@ strikeout = fmap B.strikeout <$> superscript :: PandocMonad m => MarkdownParser m (F Inlines) superscript = do - guardEnabled Ext_superscript fmap B.superscript <$> try (do char '^' - mconcat <$> many1Till (do notFollowedBy spaceChar - notFollowedBy newline - inline) (char '^')) + mconcat <$> (try regularSuperscript <|> try mmdShortSuperscript)) + where regularSuperscript = many1Till (do guardEnabled Ext_superscript + notFollowedBy spaceChar + notFollowedBy newline + inline) (char '^') + mmdShortSuperscript = do guardEnabled Ext_short_subsuperscripts + result <- T.pack <$> many1 alphaNum + return $ return $ return $ B.str result subscript :: PandocMonad m => MarkdownParser m (F Inlines) subscript = do - guardEnabled Ext_subscript fmap B.subscript <$> try (do char '~' - mconcat <$> many1Till (do notFollowedBy spaceChar - notFollowedBy newline - inline) (char '~')) + mconcat <$> (try regularSubscript <|> mmdShortSubscript)) + where regularSubscript = many1Till (do guardEnabled Ext_subscript + notFollowedBy spaceChar + notFollowedBy newline + inline) (char '~') + mmdShortSubscript = do guardEnabled Ext_short_subsuperscripts + result <- T.pack <$> many1 alphaNum + return $ return $ return $ B.str result whitespace :: PandocMonad m => MarkdownParser m (F Inlines) whitespace = spaceChar >> return <$> (lb <|> regsp) <?> "whitespace" @@ -1768,7 +1784,6 @@ endline = try $ do reference :: PandocMonad m => MarkdownParser m (F Inlines, Text) reference = do guardDisabled Ext_footnotes <|> notFollowedBy' (string "[^") - guardDisabled Ext_citations <|> notFollowedBy' (string "[@") withRaw $ trimInlinesF <$> inlinesInBalancedBrackets parenthesizedChars :: PandocMonad m => MarkdownParser m Text @@ -2187,6 +2202,7 @@ normalCite = try $ do citations <- citeList spnl char ']' + notFollowedBy (oneOf "{([") -- not a link or a bracketed span return citations suffix :: PandocMonad m => MarkdownParser m (F Inlines) @@ -2200,7 +2216,7 @@ suffix = try $ do prefix :: PandocMonad m => MarkdownParser m (F Inlines) prefix = trimInlinesF . mconcat <$> - manyTill inline (char ']' + manyTill (notFollowedBy (char ';') >> inline) (char ']' <|> lookAhead (try $ do optional (try (char ';' >> spnl)) citeKey True diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs index 825e4a2eb..9348a8053 100644 --- a/src/Text/Pandoc/Readers/MediaWiki.hs +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -201,7 +201,12 @@ para = do contents <- trimInlines . mconcat <$> many1 inline if F.all (==Space) contents then return mempty - else return $ B.para contents + else case B.toList contents of + -- For the MediaWiki format all images are considered figures + [Image attr figureCaption (src, title)] -> + return $ B.simpleFigureWith + attr (B.fromList figureCaption) src title + _ -> return $ B.para contents table :: PandocMonad m => MWParser m Blocks table = do @@ -631,7 +636,7 @@ image = try $ do let attr = ("", [], kvs) caption <- (B.str fname <$ sym "]]") <|> try (char '|' *> (mconcat <$> manyTill inline (sym "]]"))) - return $ B.imageWith attr fname ("fig:" <> stringify caption) caption + return $ B.imageWith attr fname (stringify caption) caption imageOption :: PandocMonad m => MWParser m Text imageOption = try $ char '|' *> opt diff --git a/src/Text/Pandoc/Readers/Metadata.hs b/src/Text/Pandoc/Readers/Metadata.hs index cbc523b25..7991dca5c 100644 --- a/src/Text/Pandoc/Readers/Metadata.hs +++ b/src/Text/Pandoc/Readers/Metadata.hs @@ -17,102 +17,62 @@ module Text.Pandoc.Readers.Metadata ( yamlMetaBlock, yamlMap ) where -import Control.Monad + import Control.Monad.Except (throwError) -import qualified Data.ByteString.Lazy as BL +import qualified Data.ByteString as B import qualified Data.Map as M -import Data.Maybe import Data.Text (Text) import qualified Data.Text as T -import qualified Data.YAML as YAML -import qualified Data.YAML.Event as YE +import qualified Data.Yaml as Yaml +import Data.Aeson (Value(..), Object, Result(..), fromJSON, (.:?), withObject) +import Data.Aeson.Types (parse) +import Text.Pandoc.Shared (tshow) import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) -import Text.Pandoc.Definition +import Text.Pandoc.Definition hiding (Null) import Text.Pandoc.Error -import Text.Pandoc.Parsing hiding (tableWith) -import Text.Pandoc.Shared -import qualified Data.Text.Lazy as TL +import Text.Pandoc.Parsing hiding (tableWith, parse) + + import qualified Text.Pandoc.UTF8 as UTF8 yamlBsToMeta :: (PandocMonad m, HasLastStrPosition st) => ParserT Sources st m (Future st MetaValue) - -> BL.ByteString + -> B.ByteString -> ParserT Sources st m (Future st Meta) yamlBsToMeta pMetaValue bstr = do - case YAML.decodeNode' YAML.failsafeSchemaResolver False False bstr of - Right (YAML.Doc (YAML.Mapping _ _ o):_) - -> fmap Meta <$> yamlMap pMetaValue o + case Yaml.decodeAllEither' bstr of + Right (Object o:_) -> fmap Meta <$> yamlMap pMetaValue o Right [] -> return . return $ mempty - Right [YAML.Doc (YAML.Scalar _ YAML.SNull)] - -> return . return $ mempty - -- the following is what we get from a comment: - Right [YAML.Doc (YAML.Scalar _ (YAML.SUnknown _ ""))] - -> return . return $ mempty + Right [Null] -> return . return $ mempty Right _ -> Prelude.fail "expected YAML object" - Left (yamlpos, err') - -> do pos <- getPosition - setPosition $ incSourceLine - (setSourceColumn pos (YE.posColumn yamlpos)) - (YE.posLine yamlpos - 1) - Prelude.fail err' - -fakePos :: YAML.Pos -fakePos = YAML.Pos (-1) (-1) 1 0 - -lookupYAML :: Text - -> YAML.Node YE.Pos - -> Maybe (YAML.Node YE.Pos) -lookupYAML t (YAML.Mapping _ _ m) = - M.lookup (YAML.Scalar fakePos (YAML.SUnknown YE.untagged t)) m - `mplus` - M.lookup (YAML.Scalar fakePos (YAML.SStr t)) m -lookupYAML _ _ = Nothing + Left err' -> do + throwError $ PandocParseError + $ T.pack $ Yaml.prettyPrintParseException err' -- Returns filtered list of references. yamlBsToRefs :: (PandocMonad m, HasLastStrPosition st) => ParserT Sources st m (Future st MetaValue) -> (Text -> Bool) -- ^ Filter for id - -> BL.ByteString + -> B.ByteString -> ParserT Sources st m (Future st [MetaValue]) yamlBsToRefs pMetaValue idpred bstr = - case YAML.decodeNode' YAML.failsafeSchemaResolver False False bstr of - Right (YAML.Doc o@YAML.Mapping{}:_) - -> case lookupYAML "references" o of - Just (YAML.Sequence _ _ ns) -> do - let g n = case lookupYAML "id" n of - Just n' -> - case nodeToKey n' of - Nothing -> False - Just t -> idpred t || - case lookupYAML "other-ids" n of - Just (YAML.Sequence _ _ ns') -> - let ts' = mapMaybe nodeToKey ns' - in any idpred ts' - _ -> False - Nothing -> False - sequence <$> - mapM (yamlToMetaValue pMetaValue) (filter g ns) - Just _ -> - Prelude.fail "expecting sequence in 'references' field" - Nothing -> - Prelude.fail "expecting 'references' field" - - Right [] -> return . return $ mempty - Right [YAML.Doc (YAML.Scalar _ YAML.SNull)] - -> return . return $ mempty - Right _ -> Prelude.fail "expecting YAML object" - Left (yamlpos, err') - -> do pos <- getPosition - setPosition $ incSourceLine - (setSourceColumn pos (YE.posColumn yamlpos)) - (YE.posLine yamlpos - 1) - Prelude.fail err' - - -nodeToKey :: YAML.Node YE.Pos -> Maybe Text -nodeToKey (YAML.Scalar _ (YAML.SStr t)) = Just t -nodeToKey (YAML.Scalar _ (YAML.SUnknown _ t)) = Just t -nodeToKey _ = Nothing + case Yaml.decodeAllEither' bstr of + Right (Object m : _) -> do + let isSelected (String t) = idpred t + isSelected _ = False + let hasSelectedId (Object o) = + case parse (withObject "ref" (.:? "id")) (Object o) of + Success (Just id') -> isSelected id' + _ -> False + hasSelectedId _ = False + case parse (withObject "metadata" (.:? "references")) (Object m) of + Success (Just refs) -> sequence <$> + mapM (yamlToMetaValue pMetaValue) (filter hasSelectedId refs) + _ -> return $ return [] + Right _ -> return . return $ [] + Left err' -> do + throwError $ PandocParseError + $ T.pack $ Yaml.prettyPrintParseException err' normalizeMetaValue :: (PandocMonad m, HasLastStrPosition st) => ParserT Sources st m (Future st MetaValue) @@ -133,47 +93,36 @@ normalizeMetaValue pMetaValue x = isSpaceChar '\t' = True isSpaceChar _ = False -checkBoolean :: Text -> Maybe Bool -checkBoolean t - | t == T.pack "true" || t == T.pack "True" || t == T.pack "TRUE" = Just True - | t == T.pack "false" || t == T.pack "False" || t == T.pack "FALSE" = Just False - | otherwise = Nothing - yamlToMetaValue :: (PandocMonad m, HasLastStrPosition st) => ParserT Sources st m (Future st MetaValue) - -> YAML.Node YE.Pos + -> Value -> ParserT Sources st m (Future st MetaValue) -yamlToMetaValue pMetaValue (YAML.Scalar _ x) = - case x of - YAML.SStr t -> normalizeMetaValue pMetaValue t - YAML.SBool b -> return $ return $ MetaBool b - YAML.SFloat d -> return $ return $ MetaString $ tshow d - YAML.SInt i -> return $ return $ MetaString $ tshow i - YAML.SUnknown _ t -> - case checkBoolean t of - Just b -> return $ return $ MetaBool b - Nothing -> normalizeMetaValue pMetaValue t - YAML.SNull -> return $ return $ MetaString "" - -yamlToMetaValue pMetaValue (YAML.Sequence _ _ xs) = - fmap MetaList . sequence - <$> mapM (yamlToMetaValue pMetaValue) xs -yamlToMetaValue pMetaValue (YAML.Mapping _ _ o) = - fmap MetaMap <$> yamlMap pMetaValue o -yamlToMetaValue _ _ = return $ return $ MetaString "" +yamlToMetaValue pMetaValue v = + case v of + String t -> normalizeMetaValue pMetaValue t + Bool b -> return $ return $ MetaBool b + Number d -> normalizeMetaValue pMetaValue $ + case fromJSON v of + Success (x :: Int) -> tshow x + _ -> tshow d + Null -> return $ return $ MetaString "" + Array{} -> do + case fromJSON v of + Error err' -> throwError $ PandocParseError $ T.pack err' + Success xs -> fmap MetaList . sequence <$> + mapM (yamlToMetaValue pMetaValue) xs + Object o -> fmap MetaMap <$> yamlMap pMetaValue o yamlMap :: (PandocMonad m, HasLastStrPosition st) => ParserT Sources st m (Future st MetaValue) - -> M.Map (YAML.Node YE.Pos) (YAML.Node YE.Pos) + -> Object -> ParserT Sources st m (Future st (M.Map Text MetaValue)) yamlMap pMetaValue o = do - kvs <- forM (M.toList o) $ \(key, v) -> do - k <- maybe (throwError $ PandocParseError - "Non-string key in YAML mapping") - return $ nodeToKey key - return (k, v) - let kvs' = filter (not . ignorable . fst) kvs - fmap M.fromList . sequence <$> mapM toMeta kvs' + case fromJSON (Object o) of + Error err' -> throwError $ PandocParseError $ T.pack err' + Success (m' :: M.Map Text Value) -> do + let kvs = filter (not . ignorable . fst) $ M.toList m' + fmap M.fromList . sequence <$> mapM toMeta kvs where ignorable t = "_" `T.isSuffixOf` t toMeta (k, v) = do @@ -194,7 +143,7 @@ yamlMetaBlock parser = try $ do -- by including --- and ..., we allow yaml blocks with just comments: let rawYaml = T.unlines ("---" : (rawYamlLines ++ ["..."])) optional blanklines - yamlBsToMeta parser $ UTF8.fromTextLazy $ TL.fromStrict rawYaml + yamlBsToMeta parser $ UTF8.fromText rawYaml stopLine :: Monad m => ParserT Sources st m () stopLine = try $ (string "---" <|> string "...") >> blankline >> return () diff --git a/src/Text/Pandoc/Readers/Org/Blocks.hs b/src/Text/Pandoc/Readers/Org/Blocks.hs index f18d2f9a7..9a689b0e8 100644 --- a/src/Text/Pandoc/Readers/Org/Blocks.hs +++ b/src/Text/Pandoc/Readers/Org/Blocks.hs @@ -474,15 +474,16 @@ figure = try $ do figCaption = fromMaybe mempty $ blockAttrCaption figAttrs figKeyVals = blockAttrKeyValues figAttrs attr = (figLabel, mempty, figKeyVals) - figTitle = (if isFigure then withFigPrefix else id) figName - in - B.para . B.imageWith attr imgSrc figTitle <$> figCaption - - withFigPrefix :: Text -> Text - withFigPrefix cs = - if "fig:" `T.isPrefixOf` cs - then cs - else "fig:" <> cs + in if isFigure + then (\c -> + B.simpleFigureWith + attr c imgSrc (unstackFig figName)) <$> figCaption + else B.para . B.imageWith attr imgSrc figName <$> figCaption + unstackFig :: Text -> Text + unstackFig figName = + if "fig:" `T.isPrefixOf` figName + then T.drop 4 figName + else figName -- | Succeeds if looking at the end of the current paragraph endOfParagraph :: Monad m => OrgParser m () @@ -889,7 +890,10 @@ listItem parseIndentedMarker = try . withContext ListItemState $ do firstLine <- anyLineNewline blank <- option "" ("\n" <$ blankline) rest <- T.concat <$> many (listContinuation markerLength) - contents <- parseFromString blocks $ firstLine <> blank <> rest + contents <- parseFromString (do initial <- paraOrPlain <|> pure mempty + subsequent <- blocks + return $ initial <> subsequent) + (firstLine <> blank <> rest) return (maybe id (prependInlines . checkboxToInlines) box <$> contents) -- | Prepend inlines to blocks, adding them to the first paragraph or diff --git a/src/Text/Pandoc/Readers/Org/DocumentTree.hs b/src/Text/Pandoc/Readers/Org/DocumentTree.hs index 2dcbecb1d..1c4f253cc 100644 --- a/src/Text/Pandoc/Readers/Org/DocumentTree.hs +++ b/src/Text/Pandoc/Readers/Org/DocumentTree.hs @@ -41,6 +41,7 @@ documentTree :: PandocMonad m -> OrgParser m (F Inlines) -> OrgParser m (F Headline) documentTree blocks inline = do + properties <- option mempty propertiesDrawer initialBlocks <- blocks headlines <- sequence <$> manyTill (headline blocks inline 1) eof title <- fmap docTitle . orgStateMeta <$> getState @@ -54,7 +55,7 @@ documentTree blocks inline = do , headlineText = B.fromList title' , headlineTags = mempty , headlinePlanning = emptyPlanning - , headlineProperties = mempty + , headlineProperties = properties , headlineContents = initialBlocks' , headlineChildren = headlines' } @@ -163,8 +164,15 @@ unprunedHeadlineToBlocks hdln st = in if not usingSelectedTags || any (`Set.member` orgStateSelectTags st) (headlineTags rootNode') then do headlineBlocks <- headlineToBlocks rootNode' + -- add metadata from root node :PROPERTIES: + updateState $ \s -> + s{ orgStateMeta = foldr + (\(PropertyKey k, PropertyValue v) m -> + B.setMeta k v <$> m) + (orgStateMeta s) + (headlineProperties rootNode') } -- ignore first headline, it's the document's title - return . drop 1 . B.toList $ headlineBlocks + return $ drop 1 $ B.toList headlineBlocks else do headlineBlocks <- mconcat <$> mapM headlineToBlocks (headlineChildren rootNode') return . B.toList $ headlineBlocks diff --git a/src/Text/Pandoc/Readers/Org/Inlines.hs b/src/Text/Pandoc/Readers/Org/Inlines.hs index 6862dd71e..617f98a10 100644 --- a/src/Text/Pandoc/Readers/Org/Inlines.hs +++ b/src/Text/Pandoc/Readers/Org/Inlines.hs @@ -31,11 +31,10 @@ import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline) import Text.TeXMath (DisplayType (..), readTeX, writePandoc) import Text.Pandoc.Sources (ToSources(..)) import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap - -import Control.Monad (guard, mplus, mzero, unless, void, when) +import Safe (lastMay) +import Control.Monad (guard, mplus, mzero, unless, when, void) import Control.Monad.Trans (lift) import Data.Char (isAlphaNum, isSpace) -import Data.List (intersperse) import qualified Data.Map as M import Data.Text (Text) import qualified Data.Text as T @@ -148,31 +147,177 @@ endline = try $ do -- Citations -- --- The state of citations is a bit confusing due to the lack of an official --- syntax and multiple syntaxes coexisting. The pandocOrgCite syntax was the --- first to be implemented here and is almost identical to Markdown's citation --- syntax. The org-ref package is in wide use to handle citations, but the --- syntax is a bit limiting and not quite as simple to write. The --- semi-official Org-mode citation syntax is based on John MacFarlane's Pandoc --- sytax and Org-oriented enhancements contributed by Richard Lawrence and --- others. It's dubbed Berkeley syntax due the place of activity of its main --- contributors. All this should be consolidated once an official Org-mode --- citation syntax has emerged. +-- We first try to parse official org-cite citations, then fall +-- back to org-ref citations (which are still in wide use). + +-- | A citation in org-cite style +orgCite :: PandocMonad m => OrgParser m (F [Citation]) +orgCite = try $ do + string "[cite" + (sty, _variants) <- citeStyle + char ':' + spnl + globalPref <- option mempty (try (citePrefix <* char ';')) + items <- citeItems + globalSuff <- option mempty (try (char ';' *> citeSuffix)) + spnl + char ']' + return $ adjustCiteStyle sty . + addPrefixToFirstItem globalPref . + addSuffixToLastItem globalSuff $ items + +adjustCiteStyle :: CiteStyle -> (F [Citation]) -> (F [Citation]) +adjustCiteStyle sty cs = do + cs' <- cs + case cs' of + [] -> return [] + (d:ds) -- TODO needs refinement + -> case sty of + TextStyle -> return $ d{ citationMode = AuthorInText + , citationSuffix = dropWhile (== Space) + (citationSuffix d)} : ds + NoAuthorStyle -> return $ d{ citationMode = SuppressAuthor } : ds + _ -> return (d:ds) + +addPrefixToFirstItem :: (F Inlines) -> (F [Citation]) -> (F [Citation]) +addPrefixToFirstItem aff cs = do + cs' <- cs + aff' <- aff + case cs' of + [] -> return [] + (d:ds) -> return (d{ citationPrefix = + B.toList aff' <> citationPrefix d }:ds) + +addSuffixToLastItem :: (F Inlines) -> (F [Citation]) -> (F [Citation]) +addSuffixToLastItem aff cs = do + cs' <- cs + aff' <- aff + case lastMay cs' of + Nothing -> return cs' + Just d -> + return (init cs' ++ [d{ citationSuffix = + citationSuffix d <> B.toList aff' }]) + +citeItems :: PandocMonad m => OrgParser m (F [Citation]) +citeItems = sequence <$> citeItem `sepBy1` (char ';') + +citeItem :: PandocMonad m => OrgParser m (F Citation) +citeItem = do + pref <- citePrefix + itemKey <- orgCiteKey + suff <- citeSuffix + return $ do + pre' <- pref + suf' <- suff + return Citation + { citationId = itemKey + , citationPrefix = B.toList pre' + , citationSuffix = B.toList suf' + , citationMode = NormalCitation + , citationNoteNum = 0 + , citationHash = 0 + } + +orgCiteKey :: PandocMonad m => OrgParser m Text +orgCiteKey = do + char '@' + T.pack <$> many1 (satisfy orgCiteKeyChar) + +orgCiteKeyChar :: Char -> Bool +orgCiteKeyChar c = + isAlphaNum c || c `elem` ['.',':','?','!','`','\'','/','*','@','+','|', + '(',')','{','}','<','>','&','_','^','$','#', + '%','~','-'] + +rawAffix :: PandocMonad m => Bool -> OrgParser m Text +rawAffix isPrefix = snd <$> withRaw + (many + (affixChar + <|> + try (void (char '[' >> rawAffix isPrefix >> char ']')))) + where + affixChar = void $ satisfy $ \c -> + not (c == '^' || c == ';' || c == '[' || c == ']') && + (not isPrefix || c /= '@') + +citePrefix :: PandocMonad m => OrgParser m (F Inlines) +citePrefix = + rawAffix True >>= parseFromString (trimInlinesF . mconcat <$> many inline) + +citeSuffix :: PandocMonad m => OrgParser m (F Inlines) +citeSuffix = + rawAffix False >>= parseFromString parseSuffix + where + parseSuffix = do + hasSpace <- option False + (True <$ try (spaceChar >> skipSpaces >> lookAhead nonspaceChar)) + ils <- trimInlinesF . mconcat <$> many inline + return $ if hasSpace + then (B.space <>) <$> ils + else ils + +citeStyle :: PandocMonad m => OrgParser m (CiteStyle, [CiteVariant]) +citeStyle = option (DefStyle, []) $ do + sty <- option DefStyle $ try $ char '/' *> orgCiteStyle + variants <- option [] $ try $ char '/' *> orgCiteVariants + return (sty, variants) + +orgCiteStyle :: PandocMonad m => OrgParser m CiteStyle +orgCiteStyle = choice $ map try + [ NoAuthorStyle <$ string "noauthor" + , NoAuthorStyle <$ string "na" + , LocatorsStyle <$ string "locators" + , LocatorsStyle <$ char 'l' + , NociteStyle <$ string "nocite" + , NociteStyle <$ char 'n' + , TextStyle <$ string "text" + , TextStyle <$ char 't' + ] + +orgCiteVariants :: PandocMonad m => OrgParser m [CiteVariant] +orgCiteVariants = + (fullnameVariant `sepBy1` (char '-')) <|> (many1 onecharVariant) + where + fullnameVariant = choice $ map try + [ Bare <$ string "bare" + , Caps <$ string "caps" + , Full <$ string "full" + ] + onecharVariant = choice + [ Bare <$ char 'b' + , Caps <$ char 'c' + , Full <$ char 'f' + ] + +data CiteStyle = + NoAuthorStyle + | LocatorsStyle + | NociteStyle + | TextStyle + | DefStyle + deriving Show + +data CiteVariant = + Caps + | Bare + | Full + deriving Show + + +spnl :: PandocMonad m => OrgParser m () +spnl = + skipSpaces *> optional (newline *> notFollowedBy blankline *> skipSpaces) cite :: PandocMonad m => OrgParser m (F Inlines) -cite = try $ berkeleyCite <|> do +cite = do guardEnabled Ext_citations - (cs, raw) <- withRaw $ choice - [ pandocOrgCite + (cs, raw) <- withRaw $ try $ choice + [ orgCite , orgRefCite - , berkeleyTextualCite ] return $ flip B.cite (B.text raw) <$> cs --- | A citation in Pandoc Org-mode style (@[prefix \@citekey suffix]@). -pandocOrgCite :: PandocMonad m => OrgParser m (F [Citation]) -pandocOrgCite = try $ - char '[' *> skipSpaces *> citeList <* skipSpaces <* char ']' +-- org-ref orgRefCite :: PandocMonad m => OrgParser m (F [Citation]) orgRefCite = try $ choice @@ -201,100 +346,6 @@ normalOrgRefCite = try $ do , citationHash = 0 } --- | Read an Berkeley-style Org-mode citation. Berkeley citation style was --- develop and adjusted to Org-mode style by John MacFarlane and Richard --- Lawrence, respectively, both philosophers at UC Berkeley. -berkeleyCite :: PandocMonad m => OrgParser m (F Inlines) -berkeleyCite = try $ do - bcl <- berkeleyCitationList - return $ do - parens <- berkeleyCiteParens <$> bcl - prefix <- berkeleyCiteCommonPrefix <$> bcl - suffix <- berkeleyCiteCommonSuffix <$> bcl - citationList <- berkeleyCiteCitations <$> bcl - return $ - if parens - then toCite - . maybe id (alterFirst . prependPrefix) prefix - . maybe id (alterLast . appendSuffix) suffix - $ citationList - else maybe mempty (<> " ") prefix - <> toListOfCites (map toInTextMode citationList) - <> maybe mempty (", " <>) suffix - where - toCite :: [Citation] -> Inlines - toCite cs = B.cite cs mempty - - toListOfCites :: [Citation] -> Inlines - toListOfCites = mconcat . intersperse ", " . map (\c -> B.cite [c] mempty) - - toInTextMode :: Citation -> Citation - toInTextMode c = c { citationMode = AuthorInText } - - alterFirst, alterLast :: (a -> a) -> [a] -> [a] - alterFirst _ [] = [] - alterFirst f (c:cs) = f c : cs - alterLast f = reverse . alterFirst f . reverse - - prependPrefix, appendSuffix :: Inlines -> Citation -> Citation - prependPrefix pre c = c { citationPrefix = B.toList pre <> citationPrefix c } - appendSuffix suf c = c { citationSuffix = citationSuffix c <> B.toList suf } - -data BerkeleyCitationList = BerkeleyCitationList - { berkeleyCiteParens :: Bool - , berkeleyCiteCommonPrefix :: Maybe Inlines - , berkeleyCiteCommonSuffix :: Maybe Inlines - , berkeleyCiteCitations :: [Citation] - } -berkeleyCitationList :: PandocMonad m => OrgParser m (F BerkeleyCitationList) -berkeleyCitationList = try $ do - char '[' - parens <- choice [ False <$ berkeleyBareTag, True <$ berkeleyParensTag ] - char ':' - skipSpaces - commonPrefix <- optionMaybe (try $ citationListPart <* char ';') - citations <- citeList - commonSuffix <- optionMaybe (try citationListPart) - char ']' - return (BerkeleyCitationList parens - <$> sequence commonPrefix - <*> sequence commonSuffix - <*> citations) - where - citationListPart :: PandocMonad m => OrgParser m (F Inlines) - citationListPart = fmap (trimInlinesF . mconcat) . try . many1 $ do - notFollowedBy' $ citeKey False - notFollowedBy (oneOf ";]") - inline - -berkeleyBareTag :: PandocMonad m => OrgParser m () -berkeleyBareTag = try $ void berkeleyBareTag' - -berkeleyParensTag :: PandocMonad m => OrgParser m () -berkeleyParensTag = try . void $ enclosedByPair1 '(' ')' berkeleyBareTag' - -berkeleyBareTag' :: PandocMonad m => OrgParser m () -berkeleyBareTag' = try $ void (string "cite") - -berkeleyTextualCite :: PandocMonad m => OrgParser m (F [Citation]) -berkeleyTextualCite = try $ do - (suppressAuthor, key) <- citeKey False - returnF . return $ Citation - { citationId = key - , citationPrefix = mempty - , citationSuffix = mempty - , citationMode = if suppressAuthor then SuppressAuthor else AuthorInText - , citationNoteNum = 0 - , citationHash = 0 - } - --- The following is what a Berkeley-style bracketed textual citation parser --- would look like. However, as these citations are a subset of Pandoc's Org --- citation style, this isn't used. --- berkeleyBracketedTextualCite :: PandocMonad m => OrgParser m (F [Citation]) --- berkeleyBracketedTextualCite = try . (fmap head) $ --- enclosedByPair1 '[' ']' berkeleyTextualCite - -- | Read a link-like org-ref style citation. The citation includes pre and -- post text. However, multiple citations are not possible due to limitations -- in the syntax. @@ -345,39 +396,6 @@ orgRefCiteMode = , ("citeyear", SuppressAuthor) ] -citeList :: PandocMonad m => OrgParser m (F [Citation]) -citeList = sequence <$> sepEndBy1 citation (try $ char ';' *> skipSpaces) - -citation :: PandocMonad m => OrgParser m (F Citation) -citation = try $ do - pref <- prefix - (suppress_author, key) <- citeKey False - suff <- suffix - return $ do - x <- pref - y <- suff - return Citation - { citationId = key - , citationPrefix = B.toList x - , citationSuffix = B.toList y - , citationMode = if suppress_author - then SuppressAuthor - else NormalCitation - , citationNoteNum = 0 - , citationHash = 0 - } - where - prefix = trimInlinesF . mconcat <$> - manyTill inline (char ']' <|> (']' <$ lookAhead (citeKey False))) - suffix = try $ do - hasSpace <- option False (notFollowedBy nonspaceChar >> return True) - skipSpaces - rest <- trimInlinesF . mconcat <$> - many (notFollowedBy (oneOf ";]") *> inline) - return $ if hasSpace - then (B.space <>) <$> rest - else rest - footnote :: PandocMonad m => OrgParser m (F Inlines) footnote = try $ do note <- inlineNote <|> referencedNote diff --git a/src/Text/Pandoc/Readers/Org/Meta.hs b/src/Text/Pandoc/Readers/Org/Meta.hs index a1b21046a..ccb6744e7 100644 --- a/src/Text/Pandoc/Readers/Org/Meta.hs +++ b/src/Text/Pandoc/Readers/Org/Meta.hs @@ -27,13 +27,13 @@ import qualified Text.Pandoc.Builder as B import Text.Pandoc.Class.PandocMonad (PandocMonad) import Text.Pandoc.Definition import Text.Pandoc.Shared (blocksToInlines, safeRead) +import Text.Pandoc.Network.HTTP (urlEncode) import Control.Monad (mzero, void) import Data.List (intercalate, intersperse) import Data.Map (Map) import Data.Maybe (fromMaybe) import Data.Text (Text) -import Network.HTTP (urlEncode) import qualified Data.Map as Map import qualified Data.Set as Set import qualified Data.Text as T @@ -188,7 +188,7 @@ parseFormat = try $ replacePlain <|> replaceUrl <|> justAppend -- inefficient replacePlain = try $ (\x -> T.concat . flip intersperse x) <$> sequence [tillSpecifier 's', rest] - replaceUrl = try $ (\x -> T.concat . flip intersperse x . T.pack . urlEncode . T.unpack) + replaceUrl = try $ (\x -> T.concat . flip intersperse x . urlEncode) <$> sequence [tillSpecifier 'h', rest] justAppend = try $ (<>) <$> rest diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index 3990f0cb5..88471eb0a 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -466,14 +466,11 @@ includeDirective top fields body = do let classes = maybe [] T.words (lookup "class" fields) let ident = maybe "" trimr $ lookup "name" fields let parser = - case lookup "code" fields of + case lookup "code" fields `mplus` lookup "literal" fields of Just lang -> (codeblock ident classes fields (trimr lang) False . sourcesToText) <$> getInput - Nothing -> - case lookup "literal" fields of - Just _ -> B.rawBlock "rst" . sourcesToText <$> getInput - Nothing -> parseBlocks + Nothing -> parseBlocks let isLiteral = isJust (lookup "code" fields `mplus` lookup "literal" fields) let selectLines = (case trim <$> lookup "end-before" fields of @@ -728,8 +725,8 @@ directive' = do "figure" -> do (caption, legend) <- parseFromString' extractCaption body' let src = escapeURI $ trim top - return $ B.para (B.imageWith (imgAttr "figclass") src "fig:" - caption) <> legend + return $ B.simpleFigureWith + (imgAttr "figclass") caption src "" <> legend "image" -> do let src = escapeURI $ trim top let alt = B.str $ maybe "image" trim $ lookup "alt" fields @@ -922,14 +919,22 @@ addNewRole roleText fields = do (baseRole, baseFmt, baseAttr) = getBaseRole (parentRole, Nothing, nullAttr) customRoles fmt = if parentRole == "raw" then lookup "format" fields else baseFmt - annotate :: [Text] -> [Text] - annotate = maybe id (:) $ - if baseRole == "code" - then lookup "language" fields - else Nothing - attr = let (ident, classes, keyValues) = baseAttr - -- nub in case role name & language class are the same - in (ident, nub . (role :) . annotate $ classes, keyValues) + + updateClasses :: [Text] -> [Text] + updateClasses oldClasses = let + + codeLanguageClass = if baseRole == "code" + then maybeToList (lookup "language" fields) + else [] + + -- if no ":class:" field is given, the default is the role name + classFieldClasses = maybe [role] T.words (lookup "class" fields) + + -- nub in case role name & language class are the same + in nub (classFieldClasses ++ codeLanguageClass ++ oldClasses) + + attr = let (ident, baseClasses, keyValues) = baseAttr + in (ident, updateClasses baseClasses, keyValues) -- warn about syntax we ignore forM_ fields $ \(key, _) -> case key of @@ -1158,10 +1163,11 @@ referenceNames = do let rn = try $ do string ".. _" ref <- quotedReferenceName - <|> manyChar ( noneOf ":\n" + <|> manyChar ( noneOf "\\:\n" <|> try (char '\n' <* string " " <* notFollowedBy blankline) + <|> try (char '\\' *> char ':') <|> try (char ':' <* lookAhead alphaNum) ) char ':' diff --git a/src/Text/Pandoc/Readers/RTF.hs b/src/Text/Pandoc/Readers/RTF.hs new file mode 100644 index 000000000..3938681f4 --- /dev/null +++ b/src/Text/Pandoc/Readers/RTF.hs @@ -0,0 +1,1351 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.RTF + Copyright : Copyright (C) 2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane (<jgm@berkeley.edu>) + Stability : alpha + Portability : portable + +Conversion of RTF documents 'Pandoc' document. +We target version 1.5 of the RTF spec. +-} +module Text.Pandoc.Readers.RTF (readRTF) where + +import qualified Data.IntMap as IntMap +import qualified Data.Sequence as Seq +import Control.Monad +import Control.Monad.Except (throwError) +import Data.List (find, foldl') +import Data.Word (Word8, Word16) +import Data.Default +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.Read as TR +import Text.Pandoc.Builder (Blocks, Inlines) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class.PandocMonad (PandocMonad (..), insertMedia) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Parsing +import Text.Pandoc.Shared (safeRead, tshow) +import Data.Char (isAlphaNum, chr, isAscii, isLetter, isSpace, ord) +import qualified Data.ByteString.Lazy as BL +import Data.Digest.Pure.SHA (sha1, showDigest) +import Data.Maybe (mapMaybe, fromMaybe) +import Safe (lastMay, initSafe, headDef) +-- import Debug.Trace + +-- TODO: +-- [ ] more complex table features +-- + +-- | Read RTF from an input string and return a Pandoc document. +readRTF :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc +readRTF opts s = do + let sources = toSources s + parsed <- readWithM parseRTF def{ sOptions = opts } sources + case parsed of + Left e -> throwError e + Right d -> return d + +data CharSet = ANSI | Mac | Pc | Pca + deriving (Show, Eq) + +-- first index is the list (or override) id, second is the list level +type ListTable = IntMap.IntMap ListLevelTable +type ListLevelTable = IntMap.IntMap ListType + +data RTFState = RTFState { sOptions :: ReaderOptions + , sCharSet :: CharSet + , sGroupStack :: [Properties] + , sListStack :: [List] + , sCurrentCell :: Blocks + , sTableRows :: [TableRow] -- reverse order + , sTextContent :: [(Properties, Text)] + , sMetadata :: [(Text, Inlines)] + , sFontTable :: FontTable + , sStylesheet :: Stylesheet + , sListTable :: ListTable + , sListOverrideTable :: ListTable + , sEatChars :: Int + } deriving (Show) + +instance Default RTFState where + def = RTFState { sOptions = def + , sCharSet = ANSI + , sGroupStack = [] + , sListStack = [] + , sCurrentCell = mempty + , sTableRows = [] + , sTextContent = [] + , sMetadata = [] + , sFontTable = mempty + , sStylesheet = mempty + , sListTable = mempty + , sListOverrideTable = mempty + , sEatChars = 0 + } + +type FontTable = IntMap.IntMap FontFamily + +data FontFamily = + Roman | Swiss | Modern | Script | Decor | Tech | Bidi + deriving (Show, Eq) + +data StyleType = ParagraphStyle | SectionStyle | CharStyle | TableStyle + deriving (Show, Eq) + +data Style = + Style { styleNum :: Int + , styleType :: StyleType + , styleBasedOn :: Maybe Int + , styleName :: Text + , styleFormatting :: [Tok] + } deriving (Show, Eq) + +type Stylesheet = IntMap.IntMap Style + +data PictType = + Emfblip | Pngblip | Jpegblip + deriving (Show, Eq) + +data Pict = + Pict { picType :: Maybe PictType + , picWidth :: Maybe Int + , picHeight :: Maybe Int + , picWidthGoal :: Maybe Int + , picHeightGoal :: Maybe Int + , picBinary :: Bool + , picData :: Text + , picName :: Text + , picBytes :: BL.ByteString + } deriving (Show, Eq) + +instance Default Pict where + def = Pict { picType = Nothing + , picWidth = Nothing + , picHeight = Nothing + , picWidthGoal = Nothing + , picHeightGoal = Nothing + , picBinary = False + , picData = mempty + , picName = mempty + , picBytes = mempty } + +data Properties = + Properties + { gBold :: Bool + , gItalic :: Bool + , gCaps :: Bool + , gDeleted :: Bool + , gSub :: Bool + , gSuper :: Bool + , gSmallCaps :: Bool + , gUnderline :: Bool + , gHyperlink :: Maybe Text + , gAnchor :: Maybe Text + , gImage :: Maybe Pict + , gFontFamily :: Maybe FontFamily + , gHidden :: Bool + , gUC :: Int -- number of ansi chars to skip after unicode char + , gFootnote :: Maybe Blocks + , gOutlineLevel :: Maybe ListLevel + , gListOverride :: Maybe Override + , gListLevel :: Maybe Int + , gInTable :: Bool + } deriving (Show, Eq) + +instance Default Properties where + def = Properties { gBold = False + , gItalic = False + , gCaps = False + , gDeleted = False + , gSub = False + , gSuper = False + , gSmallCaps = False + , gUnderline = False + , gHyperlink = Nothing + , gAnchor = Nothing + , gImage = Nothing + , gFontFamily = Nothing + , gHidden = False + , gUC = 1 + , gFootnote = Nothing + , gOutlineLevel = Nothing + , gListOverride = Nothing + , gListLevel = Nothing + , gInTable = False + } + +type RTFParser m = ParserT Sources RTFState m + +data ListType = Bullet | Ordered ListAttributes + deriving (Show, Eq) + +type Override = Int + +type ListLevel = Int + +data List = + List Override ListLevel ListType [Blocks] -- items in reverse order + deriving (Show, Eq) + +newtype TableRow = TableRow [Blocks] -- cells in reverse order + deriving (Show, Eq) + +parseRTF :: PandocMonad m => RTFParser m Pandoc +parseRTF = do + skipMany nl + toks <- many tok + -- return $! traceShowId toks + bs <- (case toks of + -- if we start with {\rtf1...}, parse that and ignore + -- what follows (which in certain cases can be non-RTF content) + rtftok@(Tok _ (Grouped (Tok _ (ControlWord "rtf" (Just 1)) : _))) : _ + -> foldM processTok mempty [rtftok] + _ -> foldM processTok mempty toks) + >>= emitBlocks + unclosed <- closeContainers + let doc = B.doc $ bs <> unclosed + kvs <- sMetadata <$> getState + pure $ foldr (uncurry B.setMeta) doc kvs + +data Tok = Tok SourcePos TokContents + deriving (Show, Eq) + +data TokContents = + ControlWord Text (Maybe Int) + | ControlSymbol Char + | UnformattedText Text + | BinData BL.ByteString + | HexVal Word8 + | Grouped [Tok] + deriving (Show, Eq) + +tok :: PandocMonad m => RTFParser m Tok +tok = do + pos <- getPosition + Tok pos <$> ((controlThing <|> unformattedText <|> grouped) <* skipMany nl) + where + controlThing = do + char '\\' *> + ( binData + <|> (ControlWord <$> letterSequence <*> (parameter <* optional delimChar)) + <|> (HexVal <$> hexVal) + <|> (ControlSymbol <$> anyChar) ) + binData = try $ do + string "bin" <* notFollowedBy letter + n <- fromMaybe 0 <$> parameter + spaces + -- NOTE: We assume here that if the document contains binary + -- data, it will not be valid UTF-8 and hence it will have been + -- read as latin1, so we can recover the data in the following + -- way. This is probably not completely reliable, but I don't + -- know if we can do better without making this reader take + -- a ByteString input. + dat <- BL.pack . map (fromIntegral . ord) <$> count n anyChar + return $ BinData dat + parameter = do + hyph <- string "-" <|> pure "" + rest <- many digit + let pstr = T.pack $ hyph <> rest + return $ safeRead pstr + hexVal = do + char '\'' + x <- hexDigit + y <- hexDigit + return $ hexToWord (T.pack [x,y]) + letterSequence = T.pack <$> many1 (satisfy (\c -> isAscii c && isLetter c)) + unformattedText = + UnformattedText . T.pack . mconcat <$> + many1 ( many1 (satisfy (not . isSpecial)) + <|> ("" <$ nl)) + grouped = Grouped <$> (char '{' *> skipMany nl *> manyTill tok (char '}')) + +nl :: PandocMonad m => RTFParser m () +nl = void (char '\n' <|> char '\r') + +isSpecial :: Char -> Bool +isSpecial '{' = True +isSpecial '}' = True +isSpecial '\\' = True +isSpecial '\n' = True +isSpecial _ = False + +delimChar :: PandocMonad m => RTFParser m Char +delimChar = satisfy (\c -> not (isAlphaNum c || isSpecial c)) + +modifyGroup :: PandocMonad m + => (Properties -> Properties) + -> RTFParser m () +modifyGroup f = + updateState $ \st -> + st{ sGroupStack = + case sGroupStack st of + [] -> [] + (x:xs) -> f x : xs } + +addFormatting :: (Properties, Text) -> Inlines +addFormatting (_, "\n") = B.linebreak +addFormatting (props, _) | gHidden props = mempty +addFormatting (props, _) | Just bs <- gFootnote props = B.note bs +addFormatting (props, txt) = + (if gBold props then B.strong else id) . + (if gItalic props then B.emph else id) . + (if gDeleted props then B.strikeout else id) . + (if gSub props then B.subscript else id) . + (if gSuper props then B.superscript else id) . + (if gSmallCaps props then B.smallcaps else id) . + (if gUnderline props then B.underline else id) . + (case gHyperlink props of + Nothing -> id + Just linkdest -> B.link linkdest mempty) . + (case gAnchor props of + Nothing -> id + Just ident -> B.spanWith (ident,[],[])) . + (case gFontFamily props of + Just Modern -> B.code + _ -> case gImage props of + Just pict -> + let attr = ("",[], + (case picWidthGoal pict of + Nothing -> [] + Just w -> [("width", tshow (fromIntegral w / 1440 + :: Double) + <> "in")]) ++ + (case picHeightGoal pict of + Nothing -> [] + Just h -> [("height", tshow (fromIntegral h / 1440 + :: Double) + <> "in")])) + in B.imageWith attr (picName pict) "" . B.text + Nothing -> B.text) . + (if gCaps props then T.toUpper else id) + $ txt + +addText :: PandocMonad m => Text -> RTFParser m () +addText t = do + gs <- sGroupStack <$> getState + let props = case gs of + (x:_) -> x + _ -> def + updateState (\s -> s{ sTextContent = (props, t) : sTextContent s }) + +inGroup :: PandocMonad m => RTFParser m a -> RTFParser m a +inGroup p = do + updateState $ \st -> + st{ sGroupStack = + case sGroupStack st of + [] -> [def] + (x:xs) -> (x:x:xs) } -- inherit current group's properties + result <- p + updateState $ \st -> + st{ sGroupStack = + case sGroupStack st of + [] -> [] -- should not happen + (_:xs) -> xs } + return result + +getStyleFormatting :: PandocMonad m => Int -> RTFParser m [Tok] +getStyleFormatting stynum = do + stylesheet <- sStylesheet <$> getState + case IntMap.lookup stynum stylesheet of + Nothing -> return [] + Just sty -> + case styleBasedOn sty of + Just i -> (<> styleFormatting sty) <$> getStyleFormatting i + Nothing -> return $ styleFormatting sty + +isMetadataField :: Text -> Bool +isMetadataField "title" = True +isMetadataField "subject" = True +isMetadataField "author" = True +isMetadataField "manager" = True +isMetadataField "company" = True +isMetadataField "operator" = True +isMetadataField "category" = True +isMetadataField "keywords" = True +isMetadataField "comment" = True +isMetadataField "doccomm" = True +isMetadataField "hlinkbase" = True +isMetadataField "generator" = True +isMetadataField _ = False + +isHeaderFooter :: Text -> Bool +isHeaderFooter "header" = True +isHeaderFooter "headerl" = True +isHeaderFooter "headerr" = True +isHeaderFooter "headerf" = True +isHeaderFooter "footer" = True +isHeaderFooter "footerl" = True +isHeaderFooter "footerr" = True +isHeaderFooter "footerf" = True +isHeaderFooter _ = False + +boolParam :: Maybe Int -> Bool +boolParam (Just 0) = False +boolParam _ = True + +isUnderline :: Text -> Bool +isUnderline "ul" = True +isUnderline "uld" = True +isUnderline "uldash" = True +isUnderline "uldashd" = True +isUnderline "uldashdd" = True +isUnderline "uldb" = True +isUnderline "ulth" = True +isUnderline "ulthd" = True +isUnderline "ulthdash" = True +isUnderline "ulw" = True +isUnderline "ulwave" = True +isUnderline _ = False + +processTok :: PandocMonad m => Blocks -> Tok -> RTFParser m Blocks +processTok bs (Tok pos tok') = do + setPosition pos + case tok' of + HexVal{} -> return () + UnformattedText{} -> return () + _ -> updateState $ \s -> s{ sEatChars = 0 } + case tok' of + Grouped (Tok _ (ControlSymbol '*') : toks) -> + bs <$ (do oldTextContent <- sTextContent <$> getState + processTok mempty (Tok pos (Grouped toks)) + updateState $ \st -> st{ sTextContent = oldTextContent }) + Grouped (Tok _ (ControlWord "fonttbl" _) : toks) -> inGroup $ do + updateState $ \s -> s{ sFontTable = processFontTable toks } + pure bs + Grouped (Tok _ (ControlWord "field" _) : toks) -> + inGroup $ handleField bs toks + Grouped (Tok _ (ControlWord "pict" _) : toks) -> + bs <$ inGroup (handlePict toks) + Grouped (Tok _ (ControlWord "stylesheet" _) : toks) -> + bs <$ inGroup (handleStylesheet toks) + Grouped (Tok _ (ControlWord "listtext" _) : _) -> do + -- eject any previous list items...sometimes TextEdit + -- doesn't put in a \par + emitBlocks bs + Grouped (Tok _ (ControlWord "pgdsc" _) : _) -> pure bs + Grouped (Tok _ (ControlWord "colortbl" _) : _) -> pure bs + Grouped (Tok _ (ControlWord "listtable" _) : toks) -> + bs <$ inGroup (handleListTable toks) + Grouped (Tok _ (ControlWord "listoverridetable" _) : toks) -> + bs <$ inGroup (handleListOverrideTable toks) + Grouped (Tok _ (ControlWord "wgrffmtfilter" _) : _) -> pure bs + Grouped (Tok _ (ControlWord "themedata" _) : _) -> pure bs + Grouped (Tok _ (ControlWord "colorschememapping" _) : _) -> pure bs + Grouped (Tok _ (ControlWord "datastore" _) : _) -> pure bs + Grouped (Tok _ (ControlWord "latentstyles" _) : _) -> pure bs + Grouped (Tok _ (ControlWord "pntxta" _) : _) -> pure bs -- TODO + Grouped (Tok _ (ControlWord "pntxtb" _) : _) -> pure bs -- TODO + Grouped (Tok _ (ControlWord "xmlnstbl" _) : _) -> pure bs + Grouped (Tok _ (ControlWord "filetbl" _) : _) -> pure bs + Grouped (Tok _ (ControlWord "expandedcolortbl" _) : _) -> pure bs + Grouped (Tok _ (ControlWord "listtables" _) : _) -> pure bs + Grouped (Tok _ (ControlWord "revtbl" _) : _) -> pure bs + Grouped (Tok _ (ControlWord "bkmkstart" _) + : Tok _ (UnformattedText t) : _) -> do + -- TODO ideally we'd put the span around bkmkstart/end, but this + -- is good for now: + modifyGroup (\g -> g{ gAnchor = Just $ T.strip t }) + pure bs + Grouped (Tok _ (ControlWord "bkmkend" _) : _) -> do + modifyGroup (\g -> g{ gAnchor = Nothing }) + pure bs + Grouped (Tok _ (ControlWord f _) : _) | isHeaderFooter f -> pure bs + Grouped (Tok _ (ControlWord "footnote" _) : toks) -> do + noteBs <- inGroup $ processDestinationToks toks + modifyGroup (\g -> g{ gFootnote = Just noteBs }) + addText "*" + modifyGroup (\g -> g{ gFootnote = Nothing }) + return bs + Grouped (Tok _ (ControlWord "info" _) : toks) -> + bs <$ inGroup (processDestinationToks toks) + Grouped (Tok _ (ControlWord f _) : toks) | isMetadataField f -> inGroup $ do + foldM_ processTok mempty toks + annotatedToks <- reverse . sTextContent <$> getState + updateState $ \s -> s{ sTextContent = [] } + let ils = B.trimInlines . mconcat $ map addFormatting annotatedToks + updateState $ \s -> s{ sMetadata = (f, ils) : sMetadata s } + pure bs + Grouped toks -> inGroup (foldM processTok bs toks) + UnformattedText t -> bs <$ do + -- return $! traceShowId $! (pos, t) + eatChars <- sEatChars <$> getState + case eatChars of + 0 -> addText t + n | n < T.length t -> do + updateState $ \s -> s{ sEatChars = 0 } + addText (T.drop n t) + | otherwise -> do + updateState $ \s -> s{ sEatChars = n - T.length t } + HexVal n -> bs <$ do + eatChars <- sEatChars <$> getState + if eatChars == 0 + then do + charset <- sCharSet <$> getState + case charset of + ANSI -> addText (T.singleton $ ansiToChar n) + Mac -> addText (T.singleton $ macToChar n) + Pc -> addText (T.singleton $ pcToChar n) + Pca -> addText (T.singleton $ pcaToChar n) + else updateState $ \s -> s{ sEatChars = eatChars - 1 } + ControlWord "ansi" _ -> bs <$ + updateState (\s -> s{ sCharSet = ANSI }) + ControlWord "mac" _ -> bs <$ + updateState (\s -> s{ sCharSet = Mac }) + ControlWord "pc" _ -> bs <$ + updateState (\s -> s{ sCharSet = Pc }) + ControlWord "pca" _ -> bs <$ + updateState (\s -> s{ sCharSet = Pca }) + ControlWord "outlinelevel" mbp -> bs <$ + modifyGroup (\g -> g{ gOutlineLevel = mbp }) + ControlWord "ls" mbp -> bs <$ + modifyGroup (\g -> g{ gListOverride = mbp }) + ControlWord "ilvl" mbp -> bs <$ + modifyGroup (\g -> g{ gListLevel = mbp }) + ControlSymbol '\\' -> bs <$ addText "\\" + ControlSymbol '{' -> bs <$ addText "{" + ControlSymbol '}' -> bs <$ addText "}" + ControlSymbol '~' -> bs <$ addText "\x00a0" + ControlSymbol '-' -> bs <$ addText "\x00ad" + ControlSymbol '_' -> bs <$ addText "\x2011" + ControlWord "trowd" _ -> bs <$ do -- add new row + updateState $ \s -> s{ sTableRows = TableRow [] : sTableRows s + , sCurrentCell = mempty } + ControlWord "cell" _ -> bs <$ do + new <- emitBlocks mempty + curCell <- (<> new) . sCurrentCell <$> getState + updateState $ \s -> s{ sTableRows = + case sTableRows s of + TableRow cs : rs -> + TableRow (curCell : cs) : rs + [] -> [TableRow [curCell]] -- shouldn't happen + , sCurrentCell = mempty } + ControlWord "intbl" _ -> bs <$ modifyGroup (\g -> g{ gInTable = True }) + ControlWord "plain" _ -> bs <$ modifyGroup (const def) + ControlWord "lquote" _ -> bs <$ addText "\x2018" + ControlWord "rquote" _ -> bs <$ addText "\x2019" + ControlWord "ldblquote" _ -> bs <$ addText "\x201C" + ControlWord "rdblquote" _ -> bs <$ addText "\x201D" + ControlWord "emdash" _ -> bs <$ addText "\x2014" + ControlWord "emspace" _ -> bs <$ addText "\x2003" + ControlWord "enspace" _ -> bs <$ addText "\x2002" + ControlWord "endash" _ -> bs <$ addText "\x2013" + ControlWord "bullet" _ -> bs <$ addText "\x2022" + ControlWord "tab" _ -> bs <$ addText "\t" + ControlWord "line" _ -> bs <$ addText "\n" + ControlSymbol '\n' -> bs <$ addText "\n" + ControlSymbol '\r' -> bs <$ addText "\n" + ControlWord "uc" (Just i) -> bs <$ modifyGroup (\g -> g{ gUC = i }) + ControlWord "cs" (Just n) -> do + getStyleFormatting n >>= foldM processTok bs + ControlWord "s" (Just n) -> do + getStyleFormatting n >>= foldM processTok bs + ControlWord "ds" (Just n) -> do + getStyleFormatting n >>= foldM processTok bs + ControlWord "f" (Just i) -> bs <$ do + fontTable <- sFontTable <$> getState + modifyGroup (\g -> g{ gFontFamily = IntMap.lookup i fontTable }) + ControlWord "u" (Just i) -> bs <$ do + st <- getState + let curgroup = case sGroupStack st of + [] -> def + (x:_) -> x + updateState $ \s -> s{ sEatChars = gUC curgroup } + -- "RTF control words generally accept signed 16-bit numbers as + -- arguments. For this reason, Unicode values greater than 32767 + -- must be expressed as negative numbers." + let codepoint :: Word16 + codepoint = fromIntegral i + addText (T.singleton (chr $ fromIntegral codepoint)) + ControlWord "caps" mbp -> bs <$ + modifyGroup (\g -> g{ gCaps = boolParam mbp }) + ControlWord "deleted" mbp -> bs <$ + modifyGroup (\g -> g{ gDeleted = boolParam mbp }) + ControlWord "b" mbp -> bs <$ + modifyGroup (\g -> g{ gBold = boolParam mbp }) + ControlWord "i" mbp -> bs <$ + modifyGroup (\g -> g{ gItalic = boolParam mbp }) + ControlWord "sub" mbp -> bs <$ + modifyGroup (\g -> g{ gSub = boolParam mbp }) + ControlWord "super" mbp -> bs <$ + modifyGroup (\g -> g{ gSuper = boolParam mbp }) + ControlWord "up" mbp -> bs <$ + modifyGroup (\g -> g{ gSuper = boolParam mbp }) + ControlWord "strike" mbp -> bs <$ + modifyGroup (\g -> g{ gDeleted = boolParam mbp }) + ControlWord "strikedl" mbp -> bs <$ + modifyGroup (\g -> g{ gDeleted = boolParam mbp }) + ControlWord "striked" mbp -> bs <$ + modifyGroup (\g -> g{ gDeleted = boolParam mbp }) + ControlWord "scaps" mbp -> bs <$ + modifyGroup (\g -> g{ gSmallCaps = boolParam mbp }) + ControlWord "v" mbp -> bs <$ + modifyGroup (\g -> g{ gHidden = boolParam mbp }) + ControlWord x mbp | isUnderline x -> bs <$ + modifyGroup (\g -> g{ gUnderline = boolParam mbp }) + ControlWord "ulnone" _ -> bs <$ + modifyGroup (\g -> g{ gUnderline = False }) + ControlWord "pard" _ -> bs <$ do + modifyGroup (const def) + getStyleFormatting 0 >>= foldM processTok bs + ControlWord "par" _ -> emitBlocks bs + _ -> pure bs + +processDestinationToks :: PandocMonad m => [Tok] -> RTFParser m Blocks +processDestinationToks toks = do + textContent <- sTextContent <$> getState + liststack <- sListStack <$> getState + updateState $ \s -> s{ sTextContent = mempty + , sListStack = [] } + result <- inGroup $ + foldM processTok mempty toks >>= emitBlocks + unclosed <- closeContainers + updateState $ \s -> s{ sTextContent = textContent + , sListStack = liststack } + return $ result <> unclosed + +-- close lists >= level +closeLists :: PandocMonad m => Int -> RTFParser m Blocks +closeLists lvl = do + lists <- sListStack <$> getState + case lists of + (List _ lvl' lt items : rest) | lvl' >= lvl -> do + let newlist = (case lt of + Bullet -> B.bulletList + Ordered listAttr -> B.orderedListWith listAttr) + (reverse items) + updateState $ \s -> s{ sListStack = rest } + case rest of + [] -> do + updateState $ \s -> s{ sListStack = rest } + pure newlist + (List lo lvl'' lt' [] : rest') -> do -- should not happen + updateState $ \s -> s{ sListStack = + List lo lvl'' lt' [newlist] : rest' } + closeLists lvl + (List lo lvl'' lt' (i:is) : rest') -> do + updateState $ \s -> s{ sListStack = + List lo lvl'' lt' (i <> newlist : is) : rest' } + closeLists lvl + _ -> pure mempty + +closeTable :: PandocMonad m => RTFParser m Blocks +closeTable = do + rawrows <- sTableRows <$> getState + if null rawrows + then return mempty + else do + let getCells (TableRow cs) = reverse cs + let rows = map getCells . reverse $ rawrows + updateState $ \s -> s{ sCurrentCell = mempty + , sTableRows = [] } + return $ B.simpleTable [] rows + +closeContainers :: PandocMonad m => RTFParser m Blocks +closeContainers = do + tbl <- closeTable + lists <- closeLists 0 + return $ tbl <> lists + +trimFinalLineBreak :: Inlines -> Inlines +trimFinalLineBreak ils = + case Seq.viewr (B.unMany ils) of + rest Seq.:> LineBreak -> B.Many rest + _ -> ils + +emitBlocks :: PandocMonad m => Blocks -> RTFParser m Blocks +emitBlocks bs = do + annotatedToks <- reverse . sTextContent <$> getState + updateState $ \s -> s{ sTextContent = [] } + let justCode = def{ gFontFamily = Just Modern } + let prop = case annotatedToks of + [] -> def + ((p,_):_) -> p + tbl <- if gInTable prop || null annotatedToks + then pure mempty + else closeTable + new <- + case annotatedToks of + [] -> pure mempty + _ | Just lst <- gListOverride prop + -> do + let level = fromMaybe 0 $ gListLevel prop + listOverrideTable <- sListOverrideTable <$> getState + let listType = fromMaybe Bullet $ + IntMap.lookup lst listOverrideTable >>= IntMap.lookup level + lists <- sListStack <$> getState + -- get para contents of list item + let newbs = B.para . B.trimInlines . trimFinalLineBreak . mconcat $ + map addFormatting annotatedToks + case lists of + (List lo parentlevel _lt items : cs) + | lo == lst + , parentlevel == level + -- add another item to existing list + -> do updateState $ \s -> + s{ sListStack = + List lo level listType (newbs:items) : cs } + pure mempty + | lo /= lst || level < parentlevel + -- close parent list and add new list + -> do new <- closeLists level -- close open lists > level + updateState $ \s -> + s{ sListStack = List lst level listType [newbs] : + sListStack s } + pure new + _ -> do -- add new list (level > parentlevel) + updateState $ \s -> + s{ sListStack = List lst level listType [newbs] : + sListStack s } + pure mempty + | Just lvl <- gOutlineLevel prop + -> do + lists <- closeLists 0 + pure $ lists <> + B.header (lvl + 1) + (B.trimInlines . mconcat $ map addFormatting + $ removeCommonFormatting + annotatedToks) + | all ((== justCode) . fst) annotatedToks + -> do + lists <- closeLists 0 + pure $ lists <> + B.codeBlock (mconcat $ map snd annotatedToks) + | all (T.all isSpace . snd) annotatedToks + -> closeLists 0 + | otherwise -> do + lists <- closeLists 0 + pure $ lists <> + B.para (B.trimInlines . trimFinalLineBreak . mconcat + $ map addFormatting annotatedToks) + if gInTable prop + then do + updateState $ \s -> s{ sCurrentCell = sCurrentCell s <> new } + pure bs + else do + pure $ bs <> tbl <> new + +-- Headers often have a style applied. We usually want to remove +-- this, because headers will have their own styling in the target +-- format. +removeCommonFormatting :: [(Properties, Text)] -> [(Properties, Text)] +removeCommonFormatting = + (\ts -> + if all (gBold . fst) ts + then map (\(p,t) -> (p{ gBold = False }, t)) ts + else ts) . + (\ts -> + if all (gItalic . fst) ts + then map (\(p,t) -> (p{ gItalic = False }, t)) ts + else ts) + + +-- {\field{\*\fldinst{HYPERLINK "http://pandoc.org"}}{\fldrslt foo}} +handleField :: PandocMonad m => Blocks -> [Tok] -> RTFParser m Blocks +handleField bs + (Tok _ + (Grouped + (Tok _ (ControlSymbol '*') + :Tok _ (ControlWord "fldinst" Nothing) + :Tok _ (Grouped (Tok _ (UnformattedText insttext):rest)) + :_)) + :linktoks) + | Just linkdest <- getHyperlink insttext + = do let linkdest' = case rest of + (Tok _ (ControlSymbol '\\') + : Tok _ (UnformattedText t) + : _) | Just bkmrk <- T.stripPrefix "l" t + -> "#" <> unquote bkmrk + _ -> linkdest + modifyGroup $ \g -> g{ gHyperlink = Just linkdest' } + result <- foldM processTok bs linktoks + modifyGroup $ \g -> g{ gHyperlink = Nothing } + return result +handleField bs _ = pure bs + +unquote :: Text -> Text +unquote = T.dropWhile (=='"') . T.dropWhileEnd (=='"') . T.strip + +handleListTable :: PandocMonad m => [Tok] -> RTFParser m () +handleListTable toks = do + mapM_ handleList toks + +handleList :: PandocMonad m => Tok -> RTFParser m () +handleList (Tok _ (Grouped (Tok _ (ControlWord "list" _) : toks))) = do + let listid = headDef 0 [n | Tok _ (ControlWord "listid" (Just n)) <- toks] + let levels = [ts | Tok _ (Grouped (Tok _ (ControlWord "listlevel" _) : ts)) + <- toks] + tbl <- foldM handleListLevel mempty (zip [0..] levels) + updateState $ \s -> s{ sListTable = IntMap.insert listid tbl $ sListTable s } +handleList _ = return () + +handleListLevel :: PandocMonad m + => ListLevelTable + -> (Int, [Tok]) + -> RTFParser m ListLevelTable +handleListLevel levelTable (lvl, toks) = do + let start = headDef 1 + [n | Tok _ (ControlWord "levelstartat" (Just n)) <- toks] + let mbNumberStyle = + case [n | Tok _ (ControlWord "levelnfc" (Just n)) <- toks] of + [] -> Nothing + (0:_) -> Just Decimal + (1:_) -> Just UpperRoman + (2:_) -> Just LowerRoman + (3:_) -> Just UpperAlpha + (4:_) -> Just LowerAlpha + (23:_) -> Nothing + (255:_) -> Nothing + _ -> Just DefaultStyle + let listType = case mbNumberStyle of + Nothing -> Bullet + Just numStyle -> Ordered (start,numStyle,Period) + return $ IntMap.insert lvl listType levelTable + +handleListOverrideTable :: PandocMonad m => [Tok] -> RTFParser m () +handleListOverrideTable toks = mapM_ handleListOverride toks + +handleListOverride :: PandocMonad m => Tok -> RTFParser m () +handleListOverride + (Tok _ (Grouped (Tok _ (ControlWord "listoverride" _) : toks))) = do + let listid = headDef 0 [n | Tok _ (ControlWord "listid" (Just n)) <- toks] + let lsn = headDef 0 [n | Tok _ (ControlWord "ls" (Just n)) <- toks] + -- TODO override stuff, esp. start num -- for now we just handle indirection + listTable <- sListTable <$> getState + case IntMap.lookup listid listTable of + Nothing -> return () + Just tbl -> updateState $ \s -> + s{ sListOverrideTable = IntMap.insert lsn tbl $ + sListOverrideTable s } +handleListOverride _ = return () + +handleStylesheet :: PandocMonad m => [Tok] -> RTFParser m () +handleStylesheet toks = do + let styles = mapMaybe parseStyle toks + updateState $ \s -> s{ sStylesheet = IntMap.fromList + $ zip (map styleNum styles) styles } + +parseStyle :: Tok -> Maybe Style +parseStyle (Tok _ (Grouped toks)) = do + let (styType, styNum, rest) = + case toks of + Tok _ (ControlWord "s" (Just n)) : ts -> (ParagraphStyle, n, ts) + Tok _ (ControlWord "ds" (Just n)) : ts -> (SectionStyle, n, ts) + Tok _ (ControlWord "cs" (Just n)) : ts -> (CharStyle, n, ts) + Tok _ (ControlWord "ts" (Just n)) : ts -> (TableStyle, n, ts) + _ -> (ParagraphStyle, 0, toks) + let styName = case lastMay rest of + Just (Tok _ (UnformattedText t)) -> T.dropWhileEnd (==';') t + _ -> mempty + let isBasedOn (Tok _ (ControlWord "sbasedon" (Just _))) = True + isBasedOn _ = False + let styBasedOn = case find isBasedOn toks of + Just (Tok _ (ControlWord "sbasedon" (Just i))) -> Just i + _ -> Nothing + let isStyleControl (Tok _ (ControlWord x _)) = + x `elem` ["cs", "s", "ds", "additive", "sbasedon", "snext", + "sautoupd", "shidden", "keycode", "alt", "shift", + "ctrl", "fn"] + isStyleControl _ = False + let styFormatting = filter (not . isStyleControl) (initSafe rest) + return $ Style{ styleNum = styNum + , styleType = styType + , styleBasedOn = styBasedOn + , styleName = styName + , styleFormatting = styFormatting + } +parseStyle _ = Nothing + +hexToWord :: Text -> Word8 +hexToWord t = case TR.hexadecimal t of + Left _ -> 0 + Right (x,_) -> x + + +handlePict :: PandocMonad m => [Tok] -> RTFParser m () +handlePict toks = do + let pict = foldl' getPictData def toks + let altText = "image" + let bytes = + if picBinary pict + then picBytes pict + else BL.pack $ map hexToWord $ T.chunksOf 2 $ picData pict + let (mimetype, ext) = + case picType pict of + Just Emfblip -> (Just "image/x-emf", ".emf") + Just Pngblip -> (Just "image/png", ".png") + Just Jpegblip -> (Just "image/jpeg", ".jpg") + Nothing -> (Nothing, "") + case mimetype of + Just mt -> do + let pictname = showDigest (sha1 bytes) <> ext + insertMedia pictname (Just mt) bytes + modifyGroup $ \g -> g{ gImage = Just pict{ picName = T.pack pictname, + picBytes = bytes } } + addText altText + modifyGroup $ \g -> g{ gImage = Nothing } + _ -> return () + where + getPictData :: Pict -> Tok -> Pict + getPictData pict (Tok _ tok') = + case tok' of + ControlWord "emfblip" _-> pict{ picType = Just Emfblip } + ControlWord "pngblip" _-> pict{ picType = Just Pngblip } + ControlWord "jpegblip" _-> pict{ picType = Just Jpegblip } + ControlWord "picw" (Just w) -> pict{ picWidth = Just w } + ControlWord "pich" (Just h) -> pict{ picHeight = Just h } + ControlWord "picwgoal" (Just w) -> pict{ picWidthGoal = Just w } + ControlWord "pichgoal" (Just h) -> pict{ picHeightGoal = Just h } + BinData d | not (BL.null d) + -> pict{ picBinary = True, picBytes = picBytes pict <> d } + UnformattedText t -> pict{ picData = t } + _ -> pict + + +getHyperlink :: Text -> Maybe Text +getHyperlink t = + case T.stripPrefix "HYPERLINK" (T.strip t) of + Nothing -> Nothing + Just rest -> Just $ unquote rest + +processFontTable :: [Tok] -> FontTable +processFontTable = snd . foldl' go (0, mempty) + where + go (fontnum, tbl) (Tok _ tok') = + case tok' of + (ControlWord "f" (Just i)) -> (i, tbl) + (ControlWord "fnil" _) -> (fontnum, tbl) + (ControlWord "froman" _) -> (fontnum, IntMap.insert fontnum Roman tbl) + (ControlWord "fswiss" _) -> (fontnum, IntMap.insert fontnum Swiss tbl) + (ControlWord "fmodern" _) -> (fontnum, IntMap.insert fontnum Modern tbl) + (ControlWord "fscript" _) -> (fontnum, IntMap.insert fontnum Script tbl) + (ControlWord "fdecor" _) -> (fontnum, IntMap.insert fontnum Decor tbl) + (ControlWord "ftech" _) -> (fontnum, IntMap.insert fontnum Tech tbl) + (ControlWord "fbidi" _) -> (fontnum, IntMap.insert fontnum Bidi tbl) + (Grouped ts) -> foldl' go (fontnum, tbl) ts + _ -> (fontnum, tbl) + + +ansiToChar :: Word8 -> Char +ansiToChar i = chr $ + case i of + 128 -> 8364 + 130 -> 8218 + 131 -> 402 + 132 -> 8222 + 133 -> 8230 + 134 -> 8224 + 135 -> 8225 + 136 -> 710 + 137 -> 8240 + 138 -> 352 + 139 -> 8249 + 140 -> 338 + 142 -> 381 + 145 -> 8216 + 146 -> 8217 + 147 -> 8220 + 148 -> 8221 + 149 -> 8226 + 150 -> 8211 + 151 -> 8212 + 152 -> 732 + 153 -> 8482 + 154 -> 353 + 155 -> 8250 + 156 -> 339 + 158 -> 382 + 159 -> 376 + 173 -> 0xAD + _ -> fromIntegral i + +macToChar :: Word8 -> Char +macToChar i = chr $ + case i of + 0x80 -> 0xC4 + 0x81 -> 0xC5 + 0x82 -> 0xC7 + 0x83 -> 0xC9 + 0x84 -> 0xD1 + 0x85 -> 0xD6 + 0x86 -> 0xDC + 0x87 -> 0xE1 + 0x88 -> 0xE0 + 0x89 -> 0xE2 + 0x8A -> 0xE4 + 0x8B -> 0xE3 + 0x8C -> 0xE5 + 0x8D -> 0xE7 + 0x8E -> 0xE9 + 0x8F -> 0xE8 + 0x90 -> 0xEA + 0x91 -> 0xEB + 0x92 -> 0xED + 0x93 -> 0xEC + 0x94 -> 0xEE + 0x95 -> 0xEF + 0x96 -> 0xF1 + 0x97 -> 0xF3 + 0x98 -> 0xF2 + 0x99 -> 0xF4 + 0x9A -> 0xF6 + 0x9B -> 0xF5 + 0x9C -> 0xFA + 0x9D -> 0xF9 + 0x9E -> 0xFB + 0x9F -> 0xFC + 0xA0 -> 0xDD + 0xA1 -> 0xB0 + 0xA2 -> 0xA2 + 0xA3 -> 0xA3 + 0xA4 -> 0xA7 + 0xA5 -> 0xD7 + 0xA6 -> 0xB6 + 0xA7 -> 0xDF + 0xA8 -> 0xAE + 0xA9 -> 0xA9 + 0xAA -> 0xB2 + 0xAB -> 0xB4 + 0xAC -> 0xA8 + 0xAD -> 0xB3 + 0xAE -> 0xC6 + 0xAF -> 0xD8 + 0xB0 -> 0xB9 + 0xB1 -> 0xB1 + 0xB2 -> 0xBC + 0xB3 -> 0xBD + 0xB4 -> 0xA5 + 0xB5 -> 0xB5 + 0xBA -> 0xBE + 0xBB -> 0xAA + 0xBC -> 0xBA + 0xBE -> 0xE6 + 0xBF -> 0xF8 + 0xC0 -> 0xBF + 0xC1 -> 0xA1 + 0xC2 -> 0xAC + 0xC3 -> 0x0141 + 0xC4 -> 0x0192 + 0xC5 -> 0x02CB + 0xC7 -> 0xAB + 0xC8 -> 0xBB + 0xC9 -> 0xA6 + 0xCA -> 0xA0 + 0xCB -> 0xC0 + 0xCC -> 0xC3 + 0xCD -> 0xD5 + 0xCE -> 0x0152 + 0xCF -> 0x0153 + 0xD0 -> 0xAD + 0xD4 -> 0x0142 + 0xD6 -> 0xF7 + 0xD8 -> 0xFF + 0xD9 -> 0x0178 + 0xDB -> 0xA4 + 0xDC -> 0xD0 + 0xDD -> 0xF0 + 0xDE -> 0xDE + 0xDF -> 0xFE + 0xE0 -> 0xFD + 0xE1 -> 0xB7 + 0xE5 -> 0xC2 + 0xE6 -> 0xCA + 0xE7 -> 0xC1 + 0xE8 -> 0xCB + 0xE9 -> 0xC8 + 0xEA -> 0xCD + 0xEB -> 0xCE + 0xEC -> 0xCF + 0xED -> 0xCC + 0xEE -> 0xD3 + 0xEF -> 0xD4 + 0xF1 -> 0xD2 + 0xF2 -> 0xDA + 0xF3 -> 0xDB + 0xF4 -> 0xD9 + 0xF5 -> 0x0131 + 0xF6 -> 0x02C6 + 0xF7 -> 0x02DC + 0xF8 -> 0xAF + 0xF9 -> 0x02D8 + 0xFA -> 0x02D9 + 0xFB -> 0x02DA + 0xFC -> 0xB8 + 0xFD -> 0x02DD + 0xFE -> 0x02DB + 0xFF -> 0x02C7 + _ -> fromIntegral i + +pcToChar :: Word8 -> Char +pcToChar i = chr $ + case i of + 0x80 -> 0xc7 + 0x81 -> 0xfc + 0x82 -> 0xe9 + 0x83 -> 0xe2 + 0x84 -> 0xe4 + 0x85 -> 0xe0 + 0x86 -> 0xe5 + 0x87 -> 0xe7 + 0x88 -> 0xea + 0x89 -> 0xeb + 0x8a -> 0xe8 + 0x8b -> 0xef + 0x8c -> 0xee + 0x8d -> 0xec + 0x8e -> 0xc4 + 0x8f -> 0xc5 + 0x90 -> 0xc9 + 0x91 -> 0xe6 + 0x92 -> 0xc6 + 0x93 -> 0xf4 + 0x94 -> 0xf6 + 0x95 -> 0xf2 + 0x96 -> 0xfb + 0x97 -> 0xf9 + 0x98 -> 0xff + 0x99 -> 0xd6 + 0x9a -> 0xdc + 0x9b -> 0xa2 + 0x9c -> 0xa3 + 0x9d -> 0xa5 + 0x9e -> 0x20a7 + 0x9f -> 0x0192 + 0xa0 -> 0xe1 + 0xa1 -> 0xed + 0xa2 -> 0xf3 + 0xa3 -> 0xfa + 0xa4 -> 0xf1 + 0xa5 -> 0xd1 + 0xa6 -> 0xaa + 0xa7 -> 0xba + 0xa8 -> 0xbf + 0xa9 -> 0x2310 + 0xaa -> 0xac + 0xab -> 0xbd + 0xac -> 0xbc + 0xad -> 0xa1 + 0xae -> 0xab + 0xaf -> 0xbb + 0xb0 -> 0x2591 + 0xb1 -> 0x2592 + 0xb2 -> 0x2593 + 0xb3 -> 0x2502 + 0xb4 -> 0x2524 + 0xb5 -> 0x2561 + 0xb6 -> 0x2562 + 0xb7 -> 0x2556 + 0xb8 -> 0x2555 + 0xb9 -> 0x2563 + 0xba -> 0x2551 + 0xbb -> 0x2557 + 0xbc -> 0x255d + 0xbd -> 0x255c + 0xbe -> 0x255b + 0xbf -> 0x2510 + 0xc0 -> 0x2514 + 0xc1 -> 0x2534 + 0xc2 -> 0x252c + 0xc3 -> 0x251c + 0xc4 -> 0x2500 + 0xc5 -> 0x253c + 0xc6 -> 0x255e + 0xc7 -> 0x255f + 0xc8 -> 0x255a + 0xc9 -> 0x2554 + 0xca -> 0x2569 + 0xcb -> 0x2566 + 0xcc -> 0x2560 + 0xcd -> 0x2550 + 0xce -> 0x256c + 0xcf -> 0x2567 + 0xd0 -> 0x2568 + 0xd1 -> 0x2564 + 0xd2 -> 0x2565 + 0xd3 -> 0x2559 + 0xd4 -> 0x2558 + 0xd5 -> 0x2552 + 0xd6 -> 0x2553 + 0xd7 -> 0x256b + 0xd8 -> 0x256a + 0xd9 -> 0x2518 + 0xda -> 0x250c + 0xdb -> 0x2588 + 0xdc -> 0x2584 + 0xdd -> 0x258c + 0xde -> 0x2590 + 0xdf -> 0x2580 + 0xe0 -> 0x03b1 + 0xe1 -> 0xdf + 0xe2 -> 0x0393 + 0xe3 -> 0x03c0 + 0xe4 -> 0x03a3 + 0xe5 -> 0x03c3 + 0xe6 -> 0xb5 + 0xe7 -> 0x03c4 + 0xe8 -> 0x03a6 + 0xe9 -> 0x0398 + 0xea -> 0x03a9 + 0xeb -> 0x03b4 + 0xec -> 0x221e + 0xed -> 0x03c6 + 0xee -> 0x03b5 + 0xef -> 0x2229 + 0xf0 -> 0x2261 + 0xf1 -> 0xb1 + 0xf2 -> 0x2265 + 0xf3 -> 0x2264 + 0xf4 -> 0x2320 + 0xf5 -> 0x2321 + 0xf6 -> 0xf7 + 0xf7 -> 0x2248 + 0xf8 -> 0xb0 + 0xf9 -> 0x2219 + 0xfa -> 0xb7 + 0xfb -> 0x221a + 0xfc -> 0x207f + 0xfd -> 0xb2 + 0xfe -> 0x25a0 + 0xff -> 0xa0 + _ -> fromIntegral i + +pcaToChar :: Word8 -> Char +pcaToChar i = chr $ + case i of + 0x80 -> 0x00c7 + 0x81 -> 0x00fc + 0x82 -> 0x00e9 + 0x83 -> 0x00e2 + 0x84 -> 0x00e4 + 0x85 -> 0x00e0 + 0x86 -> 0x00e5 + 0x87 -> 0x00e7 + 0x88 -> 0x00ea + 0x89 -> 0x00eb + 0x8a -> 0x00e8 + 0x8b -> 0x00ef + 0x8c -> 0x00ee + 0x8d -> 0x00ec + 0x8e -> 0x00c4 + 0x8f -> 0x00c5 + 0x90 -> 0x00c9 + 0x91 -> 0x00e6 + 0x92 -> 0x00c6 + 0x93 -> 0x00f4 + 0x94 -> 0x00f6 + 0x95 -> 0x00f2 + 0x96 -> 0x00fb + 0x97 -> 0x00f9 + 0x98 -> 0x00ff + 0x99 -> 0x00d6 + 0x9a -> 0x00dc + 0x9b -> 0x00f8 + 0x9c -> 0x00a3 + 0x9d -> 0x00d8 + 0x9e -> 0x00d7 + 0x9f -> 0x0192 + 0xa0 -> 0x00e1 + 0xa1 -> 0x00ed + 0xa2 -> 0x00f3 + 0xa3 -> 0x00fa + 0xa4 -> 0x00f1 + 0xa5 -> 0x00d1 + 0xa6 -> 0x00aa + 0xa7 -> 0x00ba + 0xa8 -> 0x00bf + 0xa9 -> 0x00ae + 0xaa -> 0x00ac + 0xab -> 0x00bd + 0xac -> 0x00bc + 0xad -> 0x00a1 + 0xae -> 0x00ab + 0xaf -> 0x00bb + 0xb0 -> 0x2591 + 0xb1 -> 0x2592 + 0xb2 -> 0x2593 + 0xb3 -> 0x2502 + 0xb4 -> 0x2524 + 0xb5 -> 0x00c1 + 0xb6 -> 0x00c2 + 0xb7 -> 0x00c0 + 0xb8 -> 0x00a9 + 0xb9 -> 0x2563 + 0xba -> 0x2551 + 0xbb -> 0x2557 + 0xbc -> 0x255d + 0xbd -> 0x00a2 + 0xbe -> 0x00a5 + 0xbf -> 0x2510 + 0xc0 -> 0x2514 + 0xc1 -> 0x2534 + 0xc2 -> 0x252c + 0xc3 -> 0x251c + 0xc4 -> 0x2500 + 0xc5 -> 0x253c + 0xc6 -> 0x00e3 + 0xc7 -> 0x00c3 + 0xc8 -> 0x255a + 0xc9 -> 0x2554 + 0xca -> 0x2569 + 0xcb -> 0x2566 + 0xcc -> 0x2560 + 0xcd -> 0x2550 + 0xce -> 0x256c + 0xcf -> 0x00a4 + 0xd0 -> 0x00f0 + 0xd1 -> 0x00d0 + 0xd2 -> 0x00ca + 0xd3 -> 0x00cb + 0xd4 -> 0x00c8 + 0xd5 -> 0x0131 + 0xd6 -> 0x00cd + 0xd7 -> 0x00ce + 0xd8 -> 0x00cf + 0xd9 -> 0x2518 + 0xda -> 0x250c + 0xdb -> 0x2588 + 0xdc -> 0x2584 + 0xdd -> 0x00a6 + 0xde -> 0x00cc + 0xdf -> 0x2580 + 0xe0 -> 0x00d3 + 0xe1 -> 0x00df + 0xe2 -> 0x00d4 + 0xe3 -> 0x00d2 + 0xe4 -> 0x00f5 + 0xe5 -> 0x00d5 + 0xe6 -> 0x00b5 + 0xe7 -> 0x00fe + 0xe8 -> 0x00de + 0xe9 -> 0x00da + 0xea -> 0x00db + 0xeb -> 0x00d9 + 0xec -> 0x00fd + 0xed -> 0x00dd + 0xee -> 0x00af + 0xef -> 0x00b4 + 0xf0 -> 0x00ad + 0xf1 -> 0x00b1 + 0xf2 -> 0x2017 + 0xf3 -> 0x00be + 0xf4 -> 0x00b6 + 0xf5 -> 0x00a7 + 0xf6 -> 0x00f7 + 0xf7 -> 0x00b8 + 0xf8 -> 0x00b0 + 0xf9 -> 0x00a8 + 0xfa -> 0x00b7 + 0xfb -> 0x00b9 + 0xfc -> 0x00b3 + 0xfd -> 0x00b2 + 0xfe -> 0x25a0 + 0xff -> 0x00a0 + _ -> fromIntegral i |