diff options
Diffstat (limited to 'src/Text')
28 files changed, 886 insertions, 429 deletions
diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs index 3f46648a2..d59ee7846 100644 --- a/src/Text/Pandoc.hs +++ b/src/Text/Pandoc.hs @@ -115,6 +115,7 @@ module Text.Pandoc , writeHaddock , writeCommonMark , writeCustom + , writeTEI -- * Rendering templates and default templates , module Text.Pandoc.Templates -- * Miscellaneous @@ -169,6 +170,7 @@ import Text.Pandoc.Writers.AsciiDoc import Text.Pandoc.Writers.Haddock import Text.Pandoc.Writers.CommonMark import Text.Pandoc.Writers.Custom +import Text.Pandoc.Writers.TEI import Text.Pandoc.Templates import Text.Pandoc.Options import Text.Pandoc.Shared (safeRead, warn, mapLeft, pandocVersion) @@ -221,6 +223,14 @@ mkStringReaderWithWarnings r = StringReader $ \o s -> mkBSReader :: (ReaderOptions -> BL.ByteString -> Either PandocError (Pandoc, MediaBag)) -> Reader mkBSReader r = ByteStringReader (\o s -> return $ r o s) +mkBSReaderWithWarnings :: (ReaderOptions -> BL.ByteString -> Either PandocError (Pandoc, MediaBag, [String])) -> Reader +mkBSReaderWithWarnings r = ByteStringReader $ \o s -> + case r o s of + Left err -> return $ Left err + Right (doc, mediaBag, warnings) -> do + mapM_ warn warnings + return $ Right (doc, mediaBag) + -- | Association list of formats and readers. readers :: [(String, Reader)] readers = [ ("native" , StringReader $ \_ s -> return $ readNative s) @@ -241,7 +251,7 @@ readers = [ ("native" , StringReader $ \_ s -> return $ readNative s) ,("latex" , mkStringReader readLaTeX) ,("haddock" , mkStringReader readHaddock) ,("twiki" , mkStringReader readTWiki) - ,("docx" , mkBSReader readDocx) + ,("docx" , mkBSReaderWithWarnings readDocxWithWarnings) ,("odt" , mkBSReader readOdt) ,("t2t" , mkStringReader readTxt2TagsNoMacros) ,("epub" , mkBSReader readEPUB) @@ -304,6 +314,7 @@ writers = [ ,("asciidoc" , PureStringWriter writeAsciiDoc) ,("haddock" , PureStringWriter writeHaddock) ,("commonmark" , PureStringWriter writeCommonMark) + ,("tei" , PureStringWriter writeTEI) ] getDefaultExtensions :: String -> Set Extension diff --git a/src/Text/Pandoc/Highlighting.hs b/src/Text/Pandoc/Highlighting.hs index d7a14c129..ecfef1832 100644 --- a/src/Text/Pandoc/Highlighting.hs +++ b/src/Text/Pandoc/Highlighting.hs @@ -70,7 +70,8 @@ highlight formatter (_, classes, keyvals) rawCode = startNumber = firstNum, numberLines = any (`elem` ["number","numberLines", "number-lines"]) classes } - lcclasses = map (map toLower) classes + lcclasses = map (map toLower) + (classes ++ concatMap languagesByExtension classes) in case find (`elem` lcLanguages) lcclasses of Nothing | numberLines fmtOpts -> Just diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs index 7dd47cd59..b5736c63d 100644 --- a/src/Text/Pandoc/Options.hs +++ b/src/Text/Pandoc/Options.hs @@ -202,7 +202,6 @@ githubMarkdownExtensions :: Set Extension githubMarkdownExtensions = Set.fromList [ Ext_pipe_tables , Ext_raw_html - , Ext_tex_math_single_backslash , Ext_fenced_code_blocks , Ext_auto_identifiers , Ext_ascii_identifiers @@ -265,6 +264,7 @@ data ReaderOptions = ReaderOptions{ , readerDefaultImageExtension :: String -- ^ Default extension for images , readerTrace :: Bool -- ^ Print debugging info , readerTrackChanges :: TrackChanges + , readerFileScope :: Bool -- ^ Parse before combining } deriving (Show, Read, Data, Typeable, Generic) instance Default ReaderOptions @@ -281,6 +281,7 @@ instance Default ReaderOptions , readerDefaultImageExtension = "" , readerTrace = False , readerTrackChanges = AcceptChanges + , readerFileScope = False } -- diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 16fe75ed5..325231846 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -915,7 +915,7 @@ data ParserState = ParserState stateMeta' :: F Meta, -- ^ Document metadata stateHeaderTable :: [HeaderType], -- ^ Ordered list of header types used stateHeaders :: M.Map Inlines String, -- ^ List of headers and ids (used for implicit ref links) - stateIdentifiers :: [String], -- ^ List of header identifiers used + stateIdentifiers :: Set.Set String, -- ^ Header identifiers used stateNextExample :: Int, -- ^ Number of next example stateExamples :: M.Map String Int, -- ^ Map from example labels to numbers stateHasChapters :: Bool, -- ^ True if \chapter encountered @@ -973,8 +973,8 @@ instance HasHeaderMap ParserState where updateHeaderMap f st = st{ stateHeaders = f $ stateHeaders st } class HasIdentifierList st where - extractIdentifierList :: st -> [String] - updateIdentifierList :: ([String] -> [String]) -> st -> st + extractIdentifierList :: st -> Set.Set String + updateIdentifierList :: (Set.Set String -> Set.Set String) -> st -> st instance HasIdentifierList ParserState where extractIdentifierList = stateIdentifiers @@ -1013,7 +1013,7 @@ defaultParserState = stateMeta' = return nullMeta, stateHeaderTable = [], stateHeaders = M.empty, - stateIdentifiers = [], + stateIdentifiers = Set.empty, stateNextExample = 1, stateExamples = M.empty, stateHasChapters = False, @@ -1092,8 +1092,8 @@ registerHeader (ident,classes,kvs) header' = do let id'' = if Ext_ascii_identifiers `Set.member` exts then catMaybes $ map toAsciiChar id' else id' - updateState $ updateIdentifierList $ - if id' == id'' then (id' :) else ([id', id''] ++) + updateState $ updateIdentifierList $ Set.insert id' + updateState $ updateIdentifierList $ Set.insert id'' updateState $ updateHeaderMap $ insert' header' id' return (id'',classes,kvs) else do diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 44f67ce75..604bc20de 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -50,8 +50,7 @@ implemented, [-] means partially implemented): * Inlines - [X] Str - - [X] Emph (From italics. `underline` currently read as span. In - future, it might optionally be emph as well) + - [X] Emph (italics and underline both read as Emph) - [X] Strong - [X] Strikeout - [X] Superscript @@ -62,16 +61,16 @@ implemented, [-] means partially implemented): - [X] Code (styled with `VerbatimChar`) - [X] Space - [X] LineBreak (these are invisible in Word: entered with Shift-Return) - - [ ] Math + - [X] Math - [X] Link (links to an arbitrary bookmark create a span with the target as id and "anchor" class) - - [-] Image (Links to path in archive. Future option for - data-encoded URI likely.) + - [X] Image - [X] Note (Footnotes and Endnotes are silently combined.) -} module Text.Pandoc.Readers.Docx - ( readDocx + ( readDocxWithWarnings + , readDocx ) where import Codec.Archive.Zip @@ -81,7 +80,7 @@ import Text.Pandoc.Builder import Text.Pandoc.Walk import Text.Pandoc.Readers.Docx.Parse import Text.Pandoc.Readers.Docx.Lists -import Text.Pandoc.Readers.Docx.Reducible +import Text.Pandoc.Readers.Docx.Combine import Text.Pandoc.Shared import Text.Pandoc.MediaBag (insertMedia, MediaBag) import Data.List (delete, (\\), intersect) @@ -89,6 +88,7 @@ import Text.TeXMath (writeTeX) import Data.Default (Default) import qualified Data.ByteString.Lazy as B import qualified Data.Map as M +import qualified Data.Set as Set import Control.Monad.Reader import Control.Monad.State import Data.Sequence (ViewL(..), viewl) @@ -97,14 +97,22 @@ import qualified Data.Sequence as Seq (null) import Text.Pandoc.Error import Text.Pandoc.Compat.Except +readDocxWithWarnings :: ReaderOptions + -> B.ByteString + -> Either PandocError (Pandoc, MediaBag, [String]) +readDocxWithWarnings opts bytes = + case archiveToDocxWithWarnings (toArchive bytes) of + Right (docx, warnings) -> do + (meta, blks, mediaBag) <- docxToOutput opts docx + return (Pandoc meta blks, mediaBag, warnings) + Left _ -> Left (ParseFailure "couldn't parse docx file") + readDocx :: ReaderOptions -> B.ByteString -> Either PandocError (Pandoc, MediaBag) -readDocx opts bytes = - case archiveToDocx (toArchive bytes) of - Right docx -> (\(meta, blks, mediaBag) -> (Pandoc meta blks, mediaBag)) - <$> (docxToOutput opts docx) - Left _ -> Left (ParseFailure "couldn't parse docx file") +readDocx opts bytes = do + (pandoc, mediaBag, _) <- readDocxWithWarnings opts bytes + return (pandoc, mediaBag) data DState = DState { docxAnchorMap :: M.Map String String , docxMediaBag :: MediaBag @@ -166,7 +174,7 @@ bodyPartsToMeta' (bp : bps) | (Paragraph pPr parParts) <- bp , (c : _)<- intersect (pStyle pPr) (M.keys metaStyles) , (Just metaField) <- M.lookup c metaStyles = do - inlines <- concatReduce <$> mapM parPartToInlines parParts + inlines <- smushInlines <$> mapM parPartToInlines parParts remaining <- bodyPartsToMeta' bps let f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils'] @@ -290,13 +298,13 @@ runToInlines (Run rs runElems) Just SubScrpt -> subscript codeString _ -> codeString | otherwise = do - let ils = concatReduce (map runElemToInlines runElems) + let ils = smushInlines (map runElemToInlines runElems) return $ (runStyleToTransform $ resolveDependentRunStyle rs) ils runToInlines (Footnote bps) = do - blksList <- concatReduce <$> (mapM bodyPartToBlocks bps) + blksList <- smushBlocks <$> (mapM bodyPartToBlocks bps) return $ note blksList runToInlines (Endnote bps) = do - blksList <- concatReduce <$> (mapM bodyPartToBlocks bps) + blksList <- smushBlocks <$> (mapM bodyPartToBlocks bps) return $ note blksList runToInlines (InlineDrawing fp bs ext) = do mediaBag <- gets docxMediaBag @@ -315,19 +323,19 @@ parPartToInlines (PlainRun r) = runToInlines r parPartToInlines (Insertion _ author date runs) = do opts <- asks docxOptions case readerTrackChanges opts of - AcceptChanges -> concatReduce <$> mapM runToInlines runs + AcceptChanges -> smushInlines <$> mapM runToInlines runs RejectChanges -> return mempty AllChanges -> do - ils <- concatReduce <$> mapM runToInlines runs + ils <- smushInlines <$> mapM runToInlines runs let attr = ("", ["insertion"], [("author", author), ("date", date)]) return $ spanWith attr ils parPartToInlines (Deletion _ author date runs) = do opts <- asks docxOptions case readerTrackChanges opts of AcceptChanges -> return mempty - RejectChanges -> concatReduce <$> mapM runToInlines runs + RejectChanges -> smushInlines <$> mapM runToInlines runs AllChanges -> do - ils <- concatReduce <$> mapM runToInlines runs + ils <- smushInlines <$> mapM runToInlines runs let attr = ("", ["deletion"], [("author", author), ("date", date)]) return $ spanWith attr ils parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = @@ -350,7 +358,7 @@ parPartToInlines (BookMark _ anchor) = -- avoid an extra pass. let newAnchor = if not inHdrBool && anchor `elem` (M.elems anchorMap) - then uniqueIdent [Str anchor] (M.elems anchorMap) + then uniqueIdent [Str anchor] (Set.fromList $ M.elems anchorMap) else anchor unless inHdrBool (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap}) @@ -360,10 +368,10 @@ parPartToInlines (Drawing fp bs ext) = do modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag } return $ imageWith (extentToAttr ext) fp "" "" parPartToInlines (InternalHyperLink anchor runs) = do - ils <- concatReduce <$> mapM runToInlines runs + ils <- smushInlines <$> mapM runToInlines runs return $ link ('#' : anchor) "" ils parPartToInlines (ExternalHyperLink target runs) = do - ils <- concatReduce <$> mapM runToInlines runs + ils <- smushInlines <$> mapM runToInlines runs return $ link target "" ils parPartToInlines (PlainOMath exps) = do return $ math $ writeTeX exps @@ -393,7 +401,7 @@ makeHeaderAnchor' (Header n (_, classes, kvs) ils) | (c:cs) <- filter isAnchorSpan ils , (Span (ident, ["anchor"], _) _) <- c = do hdrIDMap <- gets docxAnchorMap - let newIdent = uniqueIdent ils (M.elems hdrIDMap) + let newIdent = uniqueIdent ils (Set.fromList $ M.elems hdrIDMap) modify $ \s -> s {docxAnchorMap = M.insert ident newIdent hdrIDMap} return $ Header n (newIdent, classes, kvs) (ils \\ (c:cs)) -- Otherwise we just give it a name, and register that name (associate @@ -401,7 +409,7 @@ makeHeaderAnchor' (Header n (_, classes, kvs) ils) makeHeaderAnchor' (Header n (_, classes, kvs) ils) = do hdrIDMap <- gets docxAnchorMap - let newIdent = uniqueIdent ils (M.elems hdrIDMap) + let newIdent = uniqueIdent ils (Set.fromList $ M.elems hdrIDMap) modify $ \s -> s {docxAnchorMap = M.insert newIdent newIdent hdrIDMap} return $ Header n (newIdent, classes, kvs) ils makeHeaderAnchor' blk = return blk @@ -416,7 +424,7 @@ singleParaToPlain blks = blks cellToBlocks :: Cell -> DocxContext Blocks cellToBlocks (Cell bps) = do - blks <- concatReduce <$> mapM bodyPartToBlocks bps + blks <- smushBlocks <$> mapM bodyPartToBlocks bps return $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks rowToBlocksList :: Row -> DocxContext [Blocks] @@ -478,11 +486,11 @@ bodyPartToBlocks (Paragraph pPr parparts) $ concatMap parPartToString parparts | Just (style, n) <- pHeading pPr = do ils <- local (\s-> s{docxInHeaderBlock=True}) $ - (concatReduce <$> mapM parPartToInlines parparts) + (smushInlines <$> mapM parPartToInlines parparts) makeHeaderAnchor $ headerWith ("", delete style (pStyle pPr), []) n ils | otherwise = do - ils <- concatReduce <$> mapM parPartToInlines parparts >>= + ils <- smushInlines <$> mapM parPartToInlines parparts >>= (return . fromList . trimLineBreaks . normalizeSpaces . toList) dropIls <- gets docxDropCap let ils' = dropIls <> ils @@ -560,7 +568,7 @@ bodyToOutput :: Body -> DocxContext (Meta, [Block], MediaBag) bodyToOutput (Body bps) = do let (metabps, blkbps) = sepBodyParts bps meta <- bodyPartsToMeta metabps - blks <- concatReduce <$> mapM bodyPartToBlocks blkbps + blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks mediaBag <- gets docxMediaBag return $ (meta, diff --git a/src/Text/Pandoc/Readers/Docx/Combine.hs b/src/Text/Pandoc/Readers/Docx/Combine.hs new file mode 100644 index 000000000..39e0df825 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Combine.hs @@ -0,0 +1,154 @@ +{-# LANGUAGE TypeSynonymInstances, FlexibleInstances, + PatternGuards #-} + +module Text.Pandoc.Readers.Docx.Combine ( smushInlines + , smushBlocks + ) + where + +import Text.Pandoc.Builder +import Data.List +import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr, (><), (|>)) +import qualified Data.Sequence as Seq (null) + +data Modifier a = Modifier (a -> a) + | AttrModifier (Attr -> a -> a) Attr + | NullModifier + +spaceOutInlinesL :: Inlines -> (Inlines, Inlines) +spaceOutInlinesL ms = (l, stackInlines fs (m' <> r)) + where (l, m, r) = spaceOutInlines ms + (fs, m') = unstackInlines m + +spaceOutInlinesR :: Inlines -> (Inlines, Inlines) +spaceOutInlinesR ms = (stackInlines fs (l <> m'), r) + where (l, m, r) = spaceOutInlines ms + (fs, m') = unstackInlines m + +spaceOutInlines :: Inlines -> (Inlines, Inlines, Inlines) +spaceOutInlines ils = + let (fs, ils') = unstackInlines ils + contents = unMany ils' + left = case viewl contents of + (Space :< _) -> space + _ -> mempty + right = case viewr contents of + (_ :> Space) -> space + _ -> mempty in + (left, (stackInlines fs $ trimInlines . Many $ contents), right) + +stackInlines :: [Modifier Inlines] -> Inlines -> Inlines +stackInlines [] ms = ms +stackInlines (NullModifier : fs) ms = stackInlines fs ms +stackInlines ((Modifier f) : fs) ms = + if isEmpty ms + then stackInlines fs ms + else f $ stackInlines fs ms +stackInlines ((AttrModifier f attr) : fs) ms = f attr $ stackInlines fs ms + +unstackInlines :: Inlines -> ([Modifier Inlines], Inlines) +unstackInlines ms = case ilModifier ms of + NullModifier -> ([], ms) + _ -> (f : fs, ms') where + f = ilModifier ms + (fs, ms') = unstackInlines $ ilInnards ms + +ilModifier :: Inlines -> Modifier Inlines +ilModifier ils = case viewl (unMany ils) of + (x :< xs) | Seq.null xs -> case x of + (Emph _) -> Modifier emph + (Strong _) -> Modifier strong + (SmallCaps _) -> Modifier smallcaps + (Strikeout _) -> Modifier strikeout + (Superscript _) -> Modifier superscript + (Subscript _) -> Modifier subscript + (Link attr _ tgt) -> Modifier $ linkWith attr (fst tgt) (snd tgt) + (Span attr _) -> AttrModifier spanWith attr + _ -> NullModifier + _ -> NullModifier + +ilInnards :: Inlines -> Inlines +ilInnards ils = case viewl (unMany ils) of + (x :< xs) | Seq.null xs -> case x of + (Emph lst) -> fromList lst + (Strong lst) -> fromList lst + (SmallCaps lst) -> fromList lst + (Strikeout lst) -> fromList lst + (Superscript lst) -> fromList lst + (Subscript lst) -> fromList lst + (Link _ lst _) -> fromList lst + (Span _ lst) -> fromList lst + _ -> ils + _ -> ils + +inlinesL :: Inlines -> (Inlines, Inlines) +inlinesL ils = case viewl $ unMany ils of + (s :< sq) -> (singleton s, Many sq) + _ -> (mempty, ils) + +inlinesR :: Inlines -> (Inlines, Inlines) +inlinesR ils = case viewr $ unMany ils of + (sq :> s) -> (Many sq, singleton s) + _ -> (ils, mempty) + +combineInlines :: Inlines -> Inlines -> Inlines +combineInlines x y = + let (xs', x') = inlinesR x + (y', ys') = inlinesL y + in + xs' <> (combineSingletonInlines x' y') <> ys' + +combineSingletonInlines :: Inlines -> Inlines -> Inlines +combineSingletonInlines x y = + let (xfs, xs) = unstackInlines x + (yfs, ys) = unstackInlines y + shared = xfs `intersect` yfs + x_remaining = xfs \\ shared + y_remaining = yfs \\ shared + x_rem_attr = filter isAttrModifier x_remaining + y_rem_attr = filter isAttrModifier y_remaining + in + case null shared of + True | isEmpty xs && isEmpty ys -> + stackInlines (x_rem_attr ++ y_rem_attr) mempty + | isEmpty xs -> + let (sp, y') = spaceOutInlinesL y in + (stackInlines x_rem_attr mempty) <> sp <> y' + | isEmpty ys -> + let (x', sp) = spaceOutInlinesR x in + x' <> sp <> (stackInlines y_rem_attr mempty) + | otherwise -> + let (x', xsp) = spaceOutInlinesR x + (ysp, y') = spaceOutInlinesL y + in + x' <> xsp <> ysp <> y' + False -> stackInlines shared $ + combineInlines + (stackInlines x_remaining xs) + (stackInlines y_remaining ys) + +combineBlocks :: Blocks -> Blocks -> Blocks +combineBlocks bs cs + | bs' :> (BlockQuote bs'') <- viewr (unMany bs) + , (BlockQuote cs'') :< cs' <- viewl (unMany cs) = + Many $ (bs' |> (BlockQuote (bs'' <> cs''))) >< cs' +combineBlocks bs cs = bs <> cs + +instance (Monoid a, Eq a) => Eq (Modifier a) where + (Modifier f) == (Modifier g) = (f mempty == g mempty) + (AttrModifier f attr) == (AttrModifier g attr') = (f attr mempty == g attr' mempty) + (NullModifier) == (NullModifier) = True + _ == _ = False + +isEmpty :: (Monoid a, Eq a) => a -> Bool +isEmpty x = x == mempty + +isAttrModifier :: Modifier a -> Bool +isAttrModifier (AttrModifier _ _) = True +isAttrModifier _ = False + +smushInlines :: [Inlines] -> Inlines +smushInlines xs = foldl combineInlines mempty xs + +smushBlocks :: [Blocks] -> Blocks +smushBlocks xs = foldl combineBlocks mempty xs diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index eec8b12c9..364483929 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -50,6 +50,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , Row(..) , Cell(..) , archiveToDocx + , archiveToDocxWithWarnings ) where import Codec.Archive.Zip import Text.XML.Light @@ -60,6 +61,7 @@ import Data.Bits ((.|.)) import qualified Data.ByteString.Lazy as B import qualified Text.Pandoc.UTF8 as UTF8 import Control.Monad.Reader +import Control.Monad.State import Control.Applicative ((<|>)) import qualified Data.Map as M import Text.Pandoc.Compat.Except @@ -81,16 +83,20 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes } deriving Show +data ReaderState = ReaderState { stateWarnings :: [String] } + deriving Show + + data DocxError = DocxError | WrongElem deriving Show instance Error DocxError where noMsg = WrongElem -type D = ExceptT DocxError (Reader ReaderEnv) +type D = ExceptT DocxError (ReaderT ReaderEnv (State ReaderState)) -runD :: D a -> ReaderEnv -> Either DocxError a -runD dx re = runReader (runExceptT dx) re +runD :: D a -> ReaderEnv -> ReaderState -> (Either DocxError a, ReaderState) +runD dx re rs = runState (runReaderT (runExceptT dx) re) rs maybeToD :: Maybe a -> D a maybeToD (Just a) = return a @@ -257,7 +263,10 @@ type Author = String type ChangeDate = String archiveToDocx :: Archive -> Either DocxError Docx -archiveToDocx archive = do +archiveToDocx archive = fst <$> archiveToDocxWithWarnings archive + +archiveToDocxWithWarnings :: Archive -> Either DocxError (Docx, [String]) +archiveToDocxWithWarnings archive = do let notes = archiveToNotes archive numbering = archiveToNumbering archive rels = archiveToRelationships archive @@ -265,8 +274,12 @@ archiveToDocx archive = do (styles, parstyles) = archiveToStyles archive rEnv = ReaderEnv notes numbering rels media Nothing styles parstyles InDocument - doc <- runD (archiveToDocument archive) rEnv - return $ Docx doc + rState = ReaderState { stateWarnings = [] } + (eitherDoc, st) = runD (archiveToDocument archive) rEnv rState + case eitherDoc of + Right doc -> Right (Docx doc, stateWarnings st) + Left e -> Left e + archiveToDocument :: Archive -> D Document @@ -576,12 +589,14 @@ elemToBodyPart ns element sty <- asks envParStyles let parstyle = elemToParagraphStyle ns element sty parparts <- mapD (elemToParPart ns) (elChildren element) - case pNumInfo parstyle of - Just (numId, lvl) -> do - num <- asks envNumbering - let levelInfo = lookupLevel numId lvl num - return $ ListItem parstyle numId lvl levelInfo parparts - Nothing -> return $ Paragraph parstyle parparts + -- Word uses list enumeration for numbered headings, so we only + -- want to infer a list from the styles if it is NOT a heading. + case pHeading parstyle of + Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do + num <- asks envNumbering + let levelInfo = lookupLevel numId lvl num + return $ ListItem parstyle numId lvl levelInfo parparts + _ -> return $ Paragraph parstyle parparts elemToBodyPart ns element | isElem ns "w" "tbl" element = do let caption' = findChild (elemName ns "w" "tblPr") element @@ -702,36 +717,58 @@ elemToExtent drawingElem = getDim at = findElement (QName "extent" (Just wp_ns) (Just "wp")) drawingElem >>= findAttr (QName at Nothing Nothing) >>= safeRead -elemToRun :: NameSpaces -> Element -> D Run -elemToRun ns element - | isElem ns "w" "r" element - , Just drawingElem <- findChild (elemName ns "w" "drawing") element = + +childElemToRun :: NameSpaces -> Element -> D Run +childElemToRun ns element + | isElem ns "w" "drawing" element = let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" - drawing = findElement (QName "blip" (Just a_ns) (Just "a")) drawingElem + drawing = findElement (QName "blip" (Just a_ns) (Just "a")) element >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) in case drawing of Just s -> expandDrawingId s >>= - (\(fp, bs) -> return $ InlineDrawing fp bs $ elemToExtent drawingElem) + (\(fp, bs) -> return $ InlineDrawing fp bs $ elemToExtent element) Nothing -> throwError WrongElem -elemToRun ns element - | isElem ns "w" "r" element - , Just ref <- findChild (elemName ns "w" "footnoteReference") element - , Just fnId <- findAttr (elemName ns "w" "id") ref = do +childElemToRun ns element + | isElem ns "w" "footnoteReference" element + , Just fnId <- findAttr (elemName ns "w" "id") element = do notes <- asks envNotes case lookupFootnote fnId notes of Just e -> do bps <- local (\r -> r {envLocation=InFootnote}) $ mapD (elemToBodyPart ns) (elChildren e) return $ Footnote bps Nothing -> return $ Footnote [] -elemToRun ns element - | isElem ns "w" "r" element - , Just ref <- findChild (elemName ns "w" "endnoteReference") element - , Just enId <- findAttr (elemName ns "w" "id") ref = do +childElemToRun ns element + | isElem ns "w" "endnoteReference" element + , Just enId <- findAttr (elemName ns "w" "id") element = do notes <- asks envNotes case lookupEndnote enId notes of Just e -> do bps <- local (\r -> r {envLocation=InEndnote}) $ mapD (elemToBodyPart ns) (elChildren e) return $ Endnote bps Nothing -> return $ Endnote [] +childElemToRun _ _ = throwError WrongElem + +elemToRun :: NameSpaces -> Element -> D Run +elemToRun ns element + | isElem ns "w" "r" element + , Just altCont <- findChild (elemName ns "mc" "AlternateContent") element = + do let choices = findChildren (elemName ns "mc" "Choice") altCont + choiceChildren = map head $ filter (not . null) $ map elChildren choices + outputs <- mapD (childElemToRun ns) choiceChildren + case outputs of + r : _ -> return r + [] -> throwError WrongElem +elemToRun ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChild (elemName ns "w" "drawing") element = + childElemToRun ns drawingElem +elemToRun ns element + | isElem ns "w" "r" element + , Just ref <- findChild (elemName ns "w" "footnoteReference") element = + childElemToRun ns ref +elemToRun ns element + | isElem ns "w" "r" element + , Just ref <- findChild (elemName ns "w" "endnoteReference") element = + childElemToRun ns ref elemToRun ns element | isElem ns "w" "r" element = do runElems <- elemToRunElems ns element @@ -940,3 +977,4 @@ elemToRunElems _ _ = throwError WrongElem setFont :: Maybe Font -> ReaderEnv -> ReaderEnv setFont f s = s{envFont = f} + diff --git a/src/Text/Pandoc/Readers/Docx/Reducible.hs b/src/Text/Pandoc/Readers/Docx/Reducible.hs deleted file mode 100644 index c93b40119..000000000 --- a/src/Text/Pandoc/Readers/Docx/Reducible.hs +++ /dev/null @@ -1,181 +0,0 @@ -{-# LANGUAGE TypeSynonymInstances, FlexibleInstances, - PatternGuards #-} - -module Text.Pandoc.Readers.Docx.Reducible ( concatReduce - , (<+>) - ) - where - - -import Text.Pandoc.Builder -import Data.List -import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr) -import qualified Data.Sequence as Seq (null) - -data Modifier a = Modifier (a -> a) - | AttrModifier (Attr -> a -> a) Attr - | NullModifier - -class (Eq a) => Modifiable a where - modifier :: a -> Modifier a - innards :: a -> a - getL :: a -> (a, a) - getR :: a -> (a, a) - spaceOut :: a -> (a, a, a) - -spaceOutL :: (Monoid a, Modifiable a) => a -> (a, a) -spaceOutL ms = (l, stack fs (m' <> r)) - where (l, m, r) = spaceOut ms - (fs, m') = unstack m - -spaceOutR :: (Monoid a, Modifiable a) => a -> (a, a) -spaceOutR ms = (stack fs (l <> m'), r) - where (l, m, r) = spaceOut ms - (fs, m') = unstack m - -instance (Monoid a, Show a) => Show (Modifier a) where - show (Modifier f) = show $ f mempty - show (AttrModifier f attr) = show $ f attr mempty - show (NullModifier) = "NullModifier" - -instance (Monoid a, Eq a) => Eq (Modifier a) where - (Modifier f) == (Modifier g) = (f mempty == g mempty) - (AttrModifier f attr) == (AttrModifier g attr') = (f attr mempty == g attr' mempty) - (NullModifier) == (NullModifier) = True - _ == _ = False - -instance Modifiable Inlines where - modifier ils = case viewl (unMany ils) of - (x :< xs) | Seq.null xs -> case x of - (Emph _) -> Modifier emph - (Strong _) -> Modifier strong - (SmallCaps _) -> Modifier smallcaps - (Strikeout _) -> Modifier strikeout - (Superscript _) -> Modifier superscript - (Subscript _) -> Modifier subscript - (Span attr _) -> AttrModifier spanWith attr - _ -> NullModifier - _ -> NullModifier - - innards ils = case viewl (unMany ils) of - (x :< xs) | Seq.null xs -> case x of - (Emph lst) -> fromList lst - (Strong lst) -> fromList lst - (SmallCaps lst) -> fromList lst - (Strikeout lst) -> fromList lst - (Superscript lst) -> fromList lst - (Subscript lst) -> fromList lst - (Span _ lst) -> fromList lst - _ -> ils - _ -> ils - - getL ils = case viewl $ unMany ils of - (s :< sq) -> (singleton s, Many sq) - _ -> (mempty, ils) - - getR ils = case viewr $ unMany ils of - (sq :> s) -> (Many sq, singleton s) - _ -> (ils, mempty) - - spaceOut ils = - let (fs, ils') = unstack ils - contents = unMany ils' - left = case viewl contents of - (Space :< _) -> space - _ -> mempty - right = case viewr contents of - (_ :> Space) -> space - _ -> mempty in - (left, (stack fs $ trimInlines .Many $ contents), right) - -instance Modifiable Blocks where - modifier blks = case viewl (unMany blks) of - (x :< xs) | Seq.null xs -> case x of - (BlockQuote _) -> Modifier blockQuote - -- (Div attr _) -> AttrModifier divWith attr - _ -> NullModifier - _ -> NullModifier - - innards blks = case viewl (unMany blks) of - (x :< xs) | Seq.null xs -> case x of - (BlockQuote lst) -> fromList lst - -- (Div attr lst) -> fromList lst - _ -> blks - _ -> blks - - spaceOut blks = (mempty, blks, mempty) - - getL ils = case viewl $ unMany ils of - (s :< sq) -> (singleton s, Many sq) - _ -> (mempty, ils) - - getR ils = case viewr $ unMany ils of - (sq :> s) -> (Many sq, singleton s) - _ -> (ils, mempty) - - -unstack :: (Modifiable a) => a -> ([Modifier a], a) -unstack ms = case modifier ms of - NullModifier -> ([], ms) - _ -> (f : fs, ms') where - f = modifier ms - (fs, ms') = unstack $ innards ms - -stack :: (Monoid a, Modifiable a) => [Modifier a] -> a -> a -stack [] ms = ms -stack (NullModifier : fs) ms = stack fs ms -stack ((Modifier f) : fs) ms = - if isEmpty ms - then stack fs ms - else f $ stack fs ms -stack ((AttrModifier f attr) : fs) ms = f attr $ stack fs ms - -isEmpty :: (Monoid a, Eq a) => a -> Bool -isEmpty x = x == mempty - - -combine :: (Monoid a, Modifiable a, Eq a) => a -> a -> a -combine x y = - let (xs', x') = getR x - (y', ys') = getL y - in - xs' <> (combineSingleton x' y') <> ys' - -isAttrModifier :: Modifier a -> Bool -isAttrModifier (AttrModifier _ _) = True -isAttrModifier _ = False - -combineSingleton :: (Monoid a, Modifiable a, Eq a) => a -> a -> a -combineSingleton x y = - let (xfs, xs) = unstack x - (yfs, ys) = unstack y - shared = xfs `intersect` yfs - x_remaining = xfs \\ shared - y_remaining = yfs \\ shared - x_rem_attr = filter isAttrModifier x_remaining - y_rem_attr = filter isAttrModifier y_remaining - in - case null shared of - True | isEmpty xs && isEmpty ys -> - stack (x_rem_attr ++ y_rem_attr) mempty - | isEmpty xs -> - let (sp, y') = spaceOutL y in - (stack x_rem_attr mempty) <> sp <> y' - | isEmpty ys -> - let (x', sp) = spaceOutR x in - x' <> sp <> (stack y_rem_attr mempty) - | otherwise -> - let (x', xsp) = spaceOutR x - (ysp, y') = spaceOutL y - in - x' <> xsp <> ysp <> y' - False -> stack shared $ - combine - (stack x_remaining xs) - (stack y_remaining ys) - -(<+>) :: (Monoid a, Modifiable a, Eq a) => a -> a -> a -x <+> y = combine x y - -concatReduce :: (Monoid a, Modifiable a) => [a] -> a -concatReduce xs = foldl combine mempty xs diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs index 79aa540f6..07d282708 100644 --- a/src/Text/Pandoc/Readers/EPUB.hs +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -30,7 +30,7 @@ import Control.Monad (guard, liftM, when) import Data.List (isPrefixOf, isInfixOf) import Data.Maybe (mapMaybe, fromMaybe) import qualified Data.Map as M (Map, lookup, fromList, elems) -import Control.DeepSeq.Generics (deepseq, NFData) +import Control.DeepSeq (deepseq, NFData) import Debug.Trace (trace) diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index a34e2fb5c..959a2d16f 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -43,7 +43,7 @@ import Text.HTML.TagSoup.Match import Text.Pandoc.Definition import qualified Text.Pandoc.Builder as B import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..)) -import Text.Pandoc.Shared ( extractSpaces, renderTags' +import Text.Pandoc.Shared ( extractSpaces, renderTags', addMetaField , escapeURI, safeRead, mapLeft ) import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace) , Extension (Ext_epub_html_exts, @@ -52,9 +52,9 @@ import Text.Pandoc.Parsing hiding ((<|>)) import Text.Pandoc.Walk import qualified Data.Map as M import Data.Maybe ( fromMaybe, isJust) -import Data.List ( intercalate, isInfixOf, isPrefixOf, isSuffixOf ) +import Data.List ( intercalate, isInfixOf, isPrefixOf ) import Data.Char ( isDigit ) -import Control.Monad ( liftM, guard, when, mzero, void, unless ) +import Control.Monad ( guard, when, mzero, void, unless ) import Control.Arrow ((***)) import Control.Applicative ( (<|>) ) import Data.Monoid (First (..)) @@ -63,12 +63,12 @@ import Debug.Trace (trace) import Text.TeXMath (readMathML, writeTeX) import Data.Default (Default (..), def) import Control.Monad.Reader (Reader,ask, asks, local, runReader) -import Network.URI (isURI) +import Network.URI (URI, parseURIReference, nonStrictRelativeTo) import Text.Pandoc.Error import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps) import Text.Pandoc.Compat.Monoid ((<>)) import Text.Parsec.Error - +import qualified Data.Set as Set -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ReaderOptions -- ^ Reader options @@ -77,7 +77,7 @@ readHtml :: ReaderOptions -- ^ Reader options readHtml opts inp = mapLeft (ParseFailure . getError) . flip runReader def $ runParserT parseDoc - (HTMLState def{ stateOptions = opts } [] Nothing [] M.empty) + (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty) "source" tags where tags = stripPrefixes . canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } inp @@ -103,8 +103,8 @@ data HTMLState = HTMLState { parserState :: ParserState, noteTable :: [(String, Blocks)], - baseHref :: Maybe String, - identifiers :: [String], + baseHref :: Maybe URI, + identifiers :: Set.Set String, headerMap :: M.Map Inlines String } @@ -137,19 +137,17 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag then return mempty else do let content = fromAttrib "content" mt - updateState $ B.setMeta name (B.text content) + updateState $ \s -> + let ps = parserState s in + s{ parserState = ps{ + stateMeta = addMetaField name (B.text content) + (stateMeta ps) } } return mempty pBaseTag = do bt <- pSatisfy (~== TagOpen "base" []) - let baseH = fromAttrib "href" bt - if null baseH - then return mempty - else do - let baseH' = case reverse baseH of - '/':_ -> baseH - _ -> baseH ++ "/" - updateState $ \st -> st{ baseHref = Just baseH' } - return mempty + updateState $ \st -> st{ baseHref = + parseURIReference $ fromAttrib "href" bt } + return mempty block :: TagParser Blocks block = do @@ -441,6 +439,7 @@ pTable = try $ do -- fail on empty table guard $ not $ null head' && null rows let isSinglePlain x = case B.toList x of + [] -> True [Plain _] -> True _ -> False let isSimple = all isSinglePlain $ concat (head':rows) @@ -605,9 +604,9 @@ pLink = try $ do tag <- pSatisfy $ tagOpenLit "a" (const True) mbBaseHref <- baseHref <$> getState let url' = fromAttrib "href" tag - let url = case (isURI url', mbBaseHref) of - (False, Just h) -> h ++ url' - _ -> url' + let url = case (parseURIReference url', mbBaseHref) of + (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs) + _ -> url' let title = fromAttrib "title" tag let uid = fromAttrib "id" tag let cls = words $ fromAttrib "class" tag @@ -619,9 +618,9 @@ pImage = do tag <- pSelfClosing (=="img") (isJust . lookup "src") mbBaseHref <- baseHref <$> getState let url' = fromAttrib "src" tag - let url = case (isURI url', mbBaseHref) of - (False, Just h) -> h ++ url' - _ -> url' + let url = case (parseURIReference url', mbBaseHref) of + (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs) + _ -> url' let title = fromAttrib "title" tag let alt = fromAttrib "alt" tag let uid = fromAttrib "id" tag @@ -925,14 +924,45 @@ htmlInBalanced :: (Monad m) => (Tag String -> Bool) -> ParserT String st m String htmlInBalanced f = try $ do - (TagOpen t _, tag) <- htmlTag f - guard $ not $ "/>" `isSuffixOf` tag -- not a self-closing tag - let stopper = htmlTag (~== TagClose t) - let anytag = snd <$> htmlTag (const True) - contents <- many $ notFollowedBy' stopper >> - (htmlInBalanced f <|> anytag <|> count 1 anyChar) - endtag <- liftM snd stopper - return $ tag ++ concat contents ++ endtag + lookAhead (char '<') + inp <- getInput + let ts = canonicalizeTags $ + parseTagsOptions parseOptions{ optTagWarning = True, + optTagPosition = True } inp + case ts of + (TagPosition sr sc : t@(TagOpen tn _) : rest) -> do + guard $ f t + guard $ not $ hasTagWarning (t : take 1 rest) + case htmlInBalanced' tn (t:rest) of + [] -> mzero + xs -> case reverse xs of + (TagClose _ : TagPosition er ec : _) -> do + let ls = er - sr + let cs = ec - sc + lscontents <- concat <$> count ls anyLine + cscontents <- count cs anyChar + (_,closetag) <- htmlTag (~== TagClose tn) + return (lscontents ++ cscontents ++ closetag) + _ -> mzero + _ -> mzero + +htmlInBalanced' :: String + -> [Tag String] + -> [Tag String] +htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts + where go :: Int -> [Tag String] -> Maybe [Tag String] + go n (t@(TagOpen tn' _):rest) | tn' == tagname = + (t :) <$> go (n + 1) rest + go 1 (t@(TagClose tn'):_) | tn' == tagname = + return [t] + go n (t@(TagClose tn'):rest) | tn' == tagname = + (t :) <$> go (n - 1) rest + go n (t:ts') = (t :) <$> go n ts' + go _ [] = mzero + +hasTagWarning :: [Tag String] -> Bool +hasTagWarning (TagWarning _:_) = True +hasTagWarning _ = False -- | Matches a tag meeting a certain condition. htmlTag :: Monad m @@ -941,8 +971,6 @@ htmlTag :: Monad m htmlTag f = try $ do lookAhead (char '<') inp <- getInput - let hasTagWarning (TagWarning _:_) = True - hasTagWarning _ = False let (next : rest) = canonicalizeTags $ parseTagsOptions parseOptions{ optTagWarning = True } inp guard $ f next diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index d2e8d9d17..2be55c9da 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -166,10 +166,18 @@ mathInline :: LP String -> LP Inlines mathInline p = math <$> (try p >>= applyMacros') mathChars :: LP String -mathChars = (concat <$>) $ - many $ - many1 (satisfy (\c -> c /= '$' && c /='\\')) - <|> (\c -> ['\\',c]) <$> try (char '\\' *> anyChar) +mathChars = + concat <$> many (escapedChar + <|> (snd <$> withRaw braced) + <|> many1 (satisfy isOrdChar)) + where escapedChar = try $ do char '\\' + c <- anyChar + return ['\\',c] + isOrdChar '$' = False + isOrdChar '{' = False + isOrdChar '}' = False + isOrdChar '\\' = False + isOrdChar _ = True quoted' :: (Inlines -> Inlines) -> LP String -> LP () -> LP Inlines quoted' f starter ender = do @@ -179,10 +187,11 @@ quoted' f starter ender = do then do ils <- many (notFollowedBy ender >> inline) (ender >> return (f (mconcat ils))) <|> - lit (case startchs of - "``" -> "“" - "`" -> "‘" - _ -> startchs) + (<> mconcat ils) <$> + lit (case startchs of + "``" -> "“" + "`" -> "‘" + _ -> startchs) else lit startchs doubleQuote :: LP Inlines @@ -421,7 +430,8 @@ inlineCommand = try $ do else if parseRaw then return $ rawInline "latex" rawcommand else return mempty - lookupListDefault mzero [name',name] inlineCommands + (lookupListDefault mzero [name',name] inlineCommands <* + optional (try (string "{}"))) <|> raw unlessParseRaw :: LP () @@ -434,6 +444,7 @@ isBlockCommand s = s `M.member` blockCommands inlineEnvironments :: M.Map String (LP Inlines) inlineEnvironments = M.fromList [ ("displaymath", mathEnv id Nothing "displaymath") + , ("math", math <$> verbEnv "math") , ("equation", mathEnv id Nothing "equation") , ("equation*", mathEnv id Nothing "equation*") , ("gather", mathEnv id (Just "gathered") "gather") diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 77c3a1016..b5d175453 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -36,7 +36,7 @@ import Data.List ( transpose, sortBy, findIndex, intersperse, intercalate ) import qualified Data.Map as M import Data.Scientific (coefficient, base10Exponent) import Data.Ord ( comparing ) -import Data.Char ( isSpace, isAlphaNum, toLower ) +import Data.Char ( isSpace, isAlphaNum, toLower, isPunctuation ) import Data.Maybe import Text.Pandoc.Definition import Text.Pandoc.Emoji (emojis) @@ -61,7 +61,6 @@ import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, isInlineTag, isBlockT import Control.Monad import System.FilePath (takeExtension, addExtension) import Text.HTML.TagSoup -import Text.HTML.TagSoup.Match (tagOpen) import qualified Data.Set as Set import Text.Printf (printf) import Debug.Trace (trace) @@ -1052,12 +1051,11 @@ strictHtmlBlock :: MarkdownParser String strictHtmlBlock = htmlInBalanced (not . isInlineTag) rawVerbatimBlock :: MarkdownParser String -rawVerbatimBlock = try $ do - (TagOpen tag _, open) <- htmlTag (tagOpen (flip elem - ["pre", "style", "script"]) - (const True)) - contents <- manyTill anyChar (htmlTag (~== TagClose tag)) - return $ open ++ contents ++ renderTags' [TagClose tag] +rawVerbatimBlock = htmlInBalanced isVerbTag + where isVerbTag (TagOpen "pre" _) = True + isVerbTag (TagOpen "style" _) = True + isVerbTag (TagOpen "script" _) = True + isVerbTag _ = False rawTeXBlock :: MarkdownParser (F Blocks) rawTeXBlock = do @@ -1356,16 +1354,18 @@ pipeTable = try $ do nonindentSpaces lookAhead nonspaceChar (heads,(aligns, seplengths)) <- (,) <$> pipeTableRow <*> pipeBreak + let heads' = take (length aligns) <$> heads lines' <- many pipeTableRow + let lines'' = map (take (length aligns) <$>) lines' let maxlength = maximum $ - map (\x -> length . stringify $ runF x def) (heads : lines') + map (\x -> length . stringify $ runF x def) (heads' : lines'') numColumns <- getOption readerColumns let widths = if maxlength > numColumns then map (\len -> fromIntegral (len + 1) / fromIntegral numColumns) seplengths else replicate (length aligns) 0.0 - return $ (aligns, widths, heads, sequence lines') + return $ (aligns, widths, heads', sequence lines'') sepPipe :: MarkdownParser () sepPipe = try $ do @@ -1374,25 +1374,27 @@ sepPipe = try $ do -- parse a row, also returning probable alignments for org-table cells pipeTableRow :: MarkdownParser (F [Blocks]) -pipeTableRow = do +pipeTableRow = try $ do + scanForPipe skipMany spaceChar openPipe <- (True <$ char '|') <|> return False - let cell = mconcat <$> - many (notFollowedBy (blankline <|> char '|') >> inline) - first <- cell - rest <- many $ sepPipe *> cell + -- split into cells + let chunk = void (code <|> rawHtmlInline <|> escapedChar <|> rawLaTeXInline') + <|> void (noneOf "|\n\r") + let cellContents = ((trim . snd) <$> withRaw (many chunk)) >>= + parseFromString pipeTableCell + cells <- cellContents `sepEndBy1` (char '|') -- surrounding pipes needed for a one-column table: - guard $ not (null rest && not openPipe) - optional (char '|') + guard $ not (length cells == 1 && not openPipe) blankline - let cells = sequence (first:rest) - return $ do - cells' <- cells - return $ map - (\ils -> - case trimInlines ils of - ils' | B.isNull ils' -> mempty - | otherwise -> B.plain $ ils') cells' + return $ sequence cells + +pipeTableCell :: MarkdownParser (F Blocks) +pipeTableCell = do + result <- many inline + if null result + then return mempty + else return $ B.plain . mconcat <$> sequence result pipeTableHeaderPart :: Parser [Char] st (Alignment, Int) pipeTableHeaderPart = try $ do @@ -1554,7 +1556,7 @@ math :: MarkdownParser (F Inlines) math = (return . B.displayMath <$> (mathDisplay >>= applyMacros')) <|> (return . B.math <$> (mathInline >>= applyMacros')) <+?> ((getOption readerSmart >>= guard) *> (return <$> apostrophe) - <* notFollowedBy space) + <* notFollowedBy (space <|> satisfy isPunctuation)) -- Parses material enclosed in *s, **s, _s, or __s. -- Designed to avoid backtracking. diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs index d29ec50e7..950497992 100644 --- a/src/Text/Pandoc/Readers/MediaWiki.hs +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -52,6 +52,7 @@ import Text.HTML.TagSoup import Data.Sequence (viewl, ViewL(..), (<|)) import qualified Data.Foldable as F import qualified Data.Map as M +import qualified Data.Set as Set import Data.Char (isDigit, isSpace) import Data.Maybe (fromMaybe) import Text.Printf (printf) @@ -69,7 +70,7 @@ readMediaWiki opts s = , mwNextLinkNumber = 1 , mwCategoryLinks = [] , mwHeaderMap = M.empty - , mwIdentifierList = [] + , mwIdentifierList = Set.empty } (s ++ "\n") @@ -78,7 +79,7 @@ data MWState = MWState { mwOptions :: ReaderOptions , mwNextLinkNumber :: Int , mwCategoryLinks :: [Inlines] , mwHeaderMap :: M.Map Inlines String - , mwIdentifierList :: [String] + , mwIdentifierList :: Set.Set String } type MWParser = Parser [Char] MWState diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs index 1f1c57646..8c475eefc 100644 --- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs +++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs @@ -61,6 +61,7 @@ import Text.Pandoc.Readers.Odt.Generic.XMLConverter import Text.Pandoc.Readers.Odt.Generic.Fallible import Text.Pandoc.Readers.Odt.Generic.Utils +import qualified Data.Set as Set -------------------------------------------------------------------------------- -- State @@ -221,7 +222,7 @@ getPrettyAnchor = proc (baseIdent, uglyAnchor) -> do getHeaderAnchor :: OdtReaderSafe Inlines Anchor getHeaderAnchor = proc title -> do state <- getExtraState -< () - let anchor = uniqueIdent (toList title) (usedAnchors state) + let anchor = uniqueIdent (toList title) (Set.fromList $ usedAnchors state) modifyExtraState (putPrettyAnchor anchor anchor) -<< anchor diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index c7906618c..7dd611be3 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -2,7 +2,7 @@ {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE MultiParamTypeClasses, FlexibleContexts, FlexibleInstances #-} {- -Copyright (C) 2014-2015 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> +Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -21,7 +21,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Org - Copyright : Copyright (C) 2014-2015 Albert Krewinkel + Copyright : Copyright (C) 2014-2016 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -53,6 +53,7 @@ import Data.Char (isAlphaNum, toLower) import Data.Default import Data.List (intersperse, isPrefixOf, isSuffixOf) import qualified Data.Map as M +import qualified Data.Set as Set import Data.Maybe (fromMaybe, isJust) import Network.HTTP (urlEncode) @@ -144,7 +145,7 @@ data OrgParserState = OrgParserState , orgStateMeta' :: F Meta , orgStateNotes' :: OrgNoteTable , orgStateParserContext :: ParserContext - , orgStateIdentifiers :: [String] + , orgStateIdentifiers :: Set.Set String , orgStateHeaderMap :: M.Map Inlines String } @@ -186,7 +187,7 @@ defaultOrgParserState = OrgParserState , orgStateMeta' = return nullMeta , orgStateNotes' = [] , orgStateParserContext = NullState - , orgStateIdentifiers = [] + , orgStateIdentifiers = Set.empty , orgStateHeaderMap = M.empty } @@ -628,7 +629,7 @@ figure = try $ do maybeNam <- lookupBlockAttribute "name" guard $ isJust maybeCap || isJust maybeNam return ( fromMaybe mempty maybeCap - , maybe mempty withFigPrefix maybeNam ) + , withFigPrefix $ fromMaybe mempty maybeNam ) withFigPrefix cs = if "fig:" `isPrefixOf` cs then cs @@ -1238,37 +1239,37 @@ applyCustomLinkFormat link = do formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters return $ maybe link ($ drop 1 rest) formatter --- TODO: might be a lot smarter/cleaner to use parsec and ADTs for this kind --- of parsing. +-- | Take a link and return a function which produces new inlines when given +-- description inlines. linkToInlinesF :: String -> Inlines -> F Inlines -linkToInlinesF s = +linkToInlinesF linkStr = + case linkStr of + "" -> pure . B.link mempty "" -- wiki link (empty by convention) + ('#':_) -> pure . B.link linkStr "" -- document-local fraction + _ -> case cleanLinkString linkStr of + (Just cleanedLink) -> if isImageFilename cleanedLink + then const . pure $ B.image cleanedLink "" "" + else pure . B.link cleanedLink "" + Nothing -> internalLink linkStr -- other internal link + +-- | Cleanup and canonicalize a string describing a link. Return @Nothing@ if +-- the string does not appear to be a link. +cleanLinkString :: String -> Maybe String +cleanLinkString s = case s of - "" -> pure . B.link "" "" - ('#':_) -> pure . B.link s "" - _ | isImageFilename s -> const . pure $ B.image s "" "" - _ | isFileLink s -> pure . B.link (dropLinkType s) "" - _ | isUri s -> pure . B.link s "" - _ | isAbsoluteFilePath s -> pure . B.link ("file://" ++ s) "" - _ | isRelativeFilePath s -> pure . B.link s "" - _ -> internalLink s - -isFileLink :: String -> Bool -isFileLink s = ("file:" `isPrefixOf` s) && not ("file://" `isPrefixOf` s) - -dropLinkType :: String -> String -dropLinkType = tail . snd . break (== ':') - -isRelativeFilePath :: String -> Bool -isRelativeFilePath s = (("./" `isPrefixOf` s) || ("../" `isPrefixOf` s)) && - (':' `notElem` s) - -isUri :: String -> Bool -isUri s = let (scheme, path) = break (== ':') s - in all (\c -> isAlphaNum c || c `elem` (".-" :: String)) scheme - && not (null path) - -isAbsoluteFilePath :: String -> Bool -isAbsoluteFilePath = ('/' ==) . head + '/':_ -> Just $ "file://" ++ s -- absolute path + '.':'/':_ -> Just s -- relative path + '.':'.':'/':_ -> Just s -- relative path + -- Relative path or URL (file schema) + 'f':'i':'l':'e':':':s' -> Just $ if ("//" `isPrefixOf` s') then s else s' + _ | isUrl s -> Just s -- URL + _ -> Nothing + where + isUrl :: String -> Bool + isUrl cs = + let (scheme, path) = break (== ':') cs + in all (\c -> isAlphaNum c || c `elem` (".-"::String)) scheme + && not (null path) isImageFilename :: String -> Bool isImageFilename filename = diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index dd1d289a3..6f64540f8 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -583,7 +583,18 @@ code2 = do -- | Html / CSS attributes attributes :: Parser [Char] ParserState Attr -attributes = (foldl (flip ($)) ("",[],[])) `fmap` many attribute +attributes = (foldl (flip ($)) ("",[],[])) <$> + try (do special <- option id specialAttribute + attrs <- many attribute + return (special : attrs)) + +specialAttribute :: Parser [Char] ParserState (Attr -> Attr) +specialAttribute = do + alignStr <- ("center" <$ char '=') <|> + ("justify" <$ try (string "<>")) <|> + ("right" <$ char '>') <|> + ("left" <$ char '<') + return $ addStyle ("text-align:" ++ alignStr) attribute :: Parser [Char] ParserState (Attr -> Attr) attribute = classIdAttr <|> styleAttr <|> langAttr @@ -602,7 +613,13 @@ classIdAttr = try $ do -- (class class #id) styleAttr :: Parser [Char] ParserState (Attr -> Attr) styleAttr = do style <- try $ enclosed (char '{') (char '}') anyChar' - return $ \(id',classes,keyvals) -> (id',classes,("style",style):keyvals) + return $ addStyle style + +addStyle :: String -> Attr -> Attr +addStyle style (id',classes,keyvals) = + (id',classes,keyvals') + where keyvals' = ("style", style') : [(k,v) | (k,v) <- keyvals, k /= "style"] + style' = style ++ ";" ++ concat [v | ("style",v) <- keyvals] langAttr :: Parser [Char] ParserState (Attr -> Attr) langAttr = do diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index aa07c81e1..075d76847 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -706,14 +706,14 @@ headerLtEq _ _ = False -- | Generate a unique identifier from a list of inlines. -- Second argument is a list of already used identifiers. -uniqueIdent :: [Inline] -> [String] -> String +uniqueIdent :: [Inline] -> Set.Set String -> String uniqueIdent title' usedIdents = let baseIdent = case inlineListToIdentifier title' of "" -> "section" x -> x numIdent n = baseIdent ++ "-" ++ show n - in if baseIdent `elem` usedIdents - then case find (\x -> numIdent x `notElem` usedIdents) ([1..60000] :: [Int]) of + in if baseIdent `Set.member` usedIdents + then case find (\x -> not $ numIdent x `Set.member` usedIdents) ([1..60000] :: [Int]) of Just x -> numIdent x Nothing -> baseIdent -- if we have more than 60,000, allow repeats else baseIdent @@ -892,8 +892,10 @@ readDataFileUTF8 userDir fname = parseURIReference' :: String -> Maybe URI parseURIReference' s = case parseURIReference s of - Just u | length (uriScheme u) > 2 -> Just u - _ -> Nothing + Just u + | length (uriScheme u) > 2 -> Just u + | null (uriScheme u) -> Just u -- protocol-relative + _ -> Nothing -- | Fetch an image or other item from the local filesystem or the net. -- Returns raw content and maybe mime type. diff --git a/src/Text/Pandoc/Writers/ConTeXt.hs b/src/Text/Pandoc/Writers/ConTeXt.hs index 498e2d10f..8d54d62bd 100644 --- a/src/Text/Pandoc/Writers/ConTeXt.hs +++ b/src/Text/Pandoc/Writers/ConTeXt.hs @@ -279,7 +279,17 @@ blockListToConTeXt lst = liftM vcat $ mapM blockToConTeXt lst -- | Convert list of inline elements to ConTeXt. inlineListToConTeXt :: [Inline] -- ^ Inlines to convert -> State WriterState Doc -inlineListToConTeXt lst = liftM hcat $ mapM inlineToConTeXt lst +inlineListToConTeXt lst = liftM hcat $ mapM inlineToConTeXt $ addStruts lst + -- We add a \strut after a line break that precedes a space, + -- or the space gets swallowed + where addStruts (LineBreak : s : xs) | isSpacey s = + LineBreak : RawInline (Format "context") "\\strut " : s : + addStruts xs + addStruts (x:xs) = x : addStruts xs + addStruts [] = [] + isSpacey Space = True + isSpacey (Str ('\160':_)) = True + isSpacey _ = False -- | Convert inline element to ConTeXt inlineToConTeXt :: Inline -- ^ Inline to convert diff --git a/src/Text/Pandoc/Writers/Custom.hs b/src/Text/Pandoc/Writers/Custom.hs index 9671fc05b..d69eaaa64 100644 --- a/src/Text/Pandoc/Writers/Custom.hs +++ b/src/Text/Pandoc/Writers/Custom.hs @@ -222,8 +222,8 @@ blockToCustom _ Null = return "" blockToCustom lua (Plain inlines) = callfunc lua "Plain" inlines -blockToCustom lua (Para [Image _ txt (src,tit)]) = - callfunc lua "CaptionedImage" src tit txt +blockToCustom lua (Para [Image attr txt (src,tit)]) = + callfunc lua "CaptionedImage" src tit txt (attrToMap attr) blockToCustom lua (Para inlines) = callfunc lua "Para" inlines diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs index 827d32620..a841e1b66 100644 --- a/src/Text/Pandoc/Writers/Docx.hs +++ b/src/Text/Pandoc/Writers/Docx.hs @@ -34,6 +34,7 @@ import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy.Char8 as BL8 import qualified Data.Map as M +import qualified Data.Set as Set import qualified Text.Pandoc.UTF8 as UTF8 import Codec.Archive.Zip import Data.Time.Clock.POSIX @@ -95,7 +96,7 @@ data WriterState = WriterState{ stTextProperties :: [Element] , stParaProperties :: [Element] , stFootnotes :: [Element] - , stSectionIds :: [String] + , stSectionIds :: Set.Set String , stExternalLinks :: M.Map String String , stImages :: M.Map FilePath (String, String, Maybe MimeType, Element, B.ByteString) , stListLevel :: Int @@ -117,7 +118,7 @@ defaultWriterState = WriterState{ stTextProperties = [] , stParaProperties = [] , stFootnotes = defaultFootnotes - , stSectionIds = [] + , stSectionIds = Set.empty , stExternalLinks = M.empty , stImages = M.empty , stListLevel = -1 @@ -742,7 +743,7 @@ blockToOpenXML opts (Header lev (ident,_,_) lst) = do let bookmarkName = if null ident then uniqueIdent lst usedIdents else ident - modify $ \s -> s{ stSectionIds = bookmarkName : stSectionIds s } + modify $ \s -> s{ stSectionIds = Set.insert bookmarkName $ stSectionIds s } id' <- getUniqueId let bookmarkStart = mknode "w:bookmarkStart" [("w:id", id') ,("w:name",bookmarkName)] () @@ -1102,7 +1103,7 @@ inlineToOpenXML opts (Link _ txt (src,_)) = do M.insert src i extlinks } return i return [ mknode "w:hyperlink" [("r:id",id')] contents ] -inlineToOpenXML opts (Image attr alt (src, tit)) = do +inlineToOpenXML opts (Image attr alt (src, _)) = do -- first, check to see if we've already done this image pageWidth <- gets stPrintWidth imgs <- gets stImages @@ -1153,7 +1154,7 @@ inlineToOpenXML opts (Image attr alt (src, tit)) = do mknode "wp:inline" [] [ mknode "wp:extent" [("cx",show xemu),("cy",show yemu)] () , mknode "wp:effectExtent" [("b","0"),("l","0"),("r","0"),("t","0")] () - , mknode "wp:docPr" [("descr",tit),("id","1"),("name","Picture")] () + , mknode "wp:docPr" [("descr",stringify alt),("id","1"),("name","Picture")] () , graphic ] let imgext = case mt >>= extensionFromMimeType of Just x -> '.':x diff --git a/src/Text/Pandoc/Writers/DokuWiki.hs b/src/Text/Pandoc/Writers/DokuWiki.hs index f1088b158..56e2b9027 100644 --- a/src/Text/Pandoc/Writers/DokuWiki.hs +++ b/src/Text/Pandoc/Writers/DokuWiki.hs @@ -452,8 +452,11 @@ inlineToDokuWiki _ (Code _ str) = inlineToDokuWiki _ (Str str) = return $ escapeString str -inlineToDokuWiki _ (Math _ str) = return $ "$" ++ str ++ "$" +inlineToDokuWiki _ (Math mathType str) = return $ delim ++ str ++ delim -- note: str should NOT be escaped + where delim = case mathType of + DisplayMath -> "$$" + InlineMath -> "$" inlineToDokuWiki _ (RawInline f str) | f == Format "dokuwiki" = return str diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index 64f94f41f..804dbb926 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -31,6 +31,7 @@ Conversion of 'Pandoc' documents to EPUB. module Text.Pandoc.Writers.EPUB ( writeEPUB ) where import Data.IORef ( IORef, newIORef, readIORef, modifyIORef ) import qualified Data.Map as M +import qualified Data.Set as Set import Data.Maybe ( fromMaybe, catMaybes ) import Data.List ( isPrefixOf, isInfixOf, intercalate ) import System.Environment ( getEnv ) @@ -916,13 +917,13 @@ showChapter = printf "ch%03d.xhtml" -- Add identifiers to any headers without them. addIdentifiers :: [Block] -> [Block] -addIdentifiers bs = evalState (mapM go bs) [] +addIdentifiers bs = evalState (mapM go bs) Set.empty where go (Header n (ident,classes,kvs) ils) = do ids <- get let ident' = if null ident then uniqueIdent ils ids else ident - put $ ident' : ids + modify $ Set.insert ident' return $ Header n (ident',classes,kvs) ils go x = return x diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index 73a8906c3..c5b6a6db2 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -140,40 +140,38 @@ pandocToHtml opts (Pandoc meta blocks) = do st <- get let notes = reverse (stNotes st) let thebody = blocks' >> footnoteSection opts notes - let math = if stMath st - then case writerHTMLMathMethod opts of - LaTeXMathML (Just url) -> - H.script ! A.src (toValue url) - ! A.type_ "text/javascript" - $ mempty - MathML (Just url) -> - H.script ! A.src (toValue url) - ! A.type_ "text/javascript" - $ mempty - MathJax url -> - H.script ! A.src (toValue url) - ! A.type_ "text/javascript" - $ case writerSlideVariant opts of - SlideousSlides -> - preEscapedString - "MathJax.Hub.Queue([\"Typeset\",MathJax.Hub]);" - _ -> mempty - JsMath (Just url) -> - H.script ! A.src (toValue url) - ! A.type_ "text/javascript" - $ mempty - KaTeX js css -> - (H.script ! A.src (toValue js) $ mempty) <> - (H.link ! A.rel "stylesheet" ! A.href (toValue css)) <> - (H.script ! A.type_ "text/javascript" $ toHtml renderKaTeX) - _ -> case lookup "mathml-script" (writerVariables opts) of - Just s | not (writerHtml5 opts) -> - H.script ! A.type_ "text/javascript" - $ preEscapedString - ("/*<![CDATA[*/\n" ++ s ++ "/*]]>*/\n") - | otherwise -> mempty - Nothing -> mempty - else mempty + let math = case writerHTMLMathMethod opts of + LaTeXMathML (Just url) -> + H.script ! A.src (toValue url) + ! A.type_ "text/javascript" + $ mempty + MathML (Just url) -> + H.script ! A.src (toValue url) + ! A.type_ "text/javascript" + $ mempty + MathJax url -> + H.script ! A.src (toValue url) + ! A.type_ "text/javascript" + $ case writerSlideVariant opts of + SlideousSlides -> + preEscapedString + "MathJax.Hub.Queue([\"Typeset\",MathJax.Hub]);" + _ -> mempty + JsMath (Just url) -> + H.script ! A.src (toValue url) + ! A.type_ "text/javascript" + $ mempty + KaTeX js css -> + (H.script ! A.src (toValue js) $ mempty) <> + (H.link ! A.rel "stylesheet" ! A.href (toValue css)) <> + (H.script ! A.type_ "text/javascript" $ toHtml renderKaTeX) + _ -> case lookup "mathml-script" (writerVariables opts) of + Just s | not (writerHtml5 opts) -> + H.script ! A.type_ "text/javascript" + $ preEscapedString + ("/*<![CDATA[*/\n" ++ s ++ "/*]]>*/\n") + | otherwise -> mempty + Nothing -> mempty let context = (if stHighlighting st then defField "highlighting-css" (styleToCss $ writerHighlightStyle opts) @@ -647,7 +645,7 @@ alignmentToString alignment = case alignment of AlignLeft -> "left" AlignRight -> "right" AlignCenter -> "center" - AlignDefault -> "left" + AlignDefault -> "" tableItemToHtml :: WriterOptions -> (Html -> Html) @@ -660,7 +658,10 @@ tableItemToHtml opts tag' align' item = do let attribs = if writerHtml5 opts then A.style (toValue $ "text-align: " ++ alignStr ++ ";") else A.align (toValue alignStr) - return $ (tag' ! attribs $ contents) >> nl opts + let tag'' = if null alignStr + then tag' + else tag' ! attribs + return $ (tag'' $ contents) >> nl opts toListItems :: WriterOptions -> [Html] -> [Html] toListItems opts items = map (toListItem opts) items ++ [nl opts] diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs index 7b2911bcf..0f47132b3 100644 --- a/src/Text/Pandoc/Writers/LaTeX.hs +++ b/src/Text/Pandoc/Writers/LaTeX.hs @@ -113,12 +113,7 @@ pandocToLaTeX options (Pandoc meta blocks) = do (fmap (render colwidth) . inlineListToLaTeX) meta let bookClasses = ["memoir","book","report","scrreprt","scrbook"] - let documentClass = case P.parse (do P.skipMany (P.satisfy (/='\\')) - P.string "\\documentclass" - P.skipMany (P.satisfy (/='{')) - P.char '{' - P.manyTill P.letter (P.char '}')) "template" - template of + let documentClass = case P.parse pDocumentClass "template" template of Right r -> r Left _ -> "" case lookup "documentclass" (writerVariables options) `mplus` @@ -577,26 +572,29 @@ blockToLaTeX (Header level (id',classes,_) lst) = do blockToLaTeX (Table caption aligns widths heads rows) = do headers <- if all null heads then return empty - else ($$ "\\midrule\n") `fmap` - (tableRowToLaTeX True aligns widths) heads + else do + contents <- (tableRowToLaTeX True aligns widths) heads + return ("\\toprule" $$ contents $$ "\\midrule") let endhead = if all null heads then empty else text "\\endhead" + let endfirsthead = if all null heads + then empty + else text "\\endfirsthead" captionText <- inlineListToLaTeX caption let capt = if isEmpty captionText then empty - else text "\\caption" <> braces captionText - <> "\\tabularnewline\n\\toprule\n" - <> headers - <> "\\endfirsthead" + else text "\\caption" <> braces captionText <> "\\tabularnewline" + $$ headers + $$ endfirsthead rows' <- mapM (tableRowToLaTeX False aligns widths) rows let colDescriptors = text $ concat $ map toColDescriptor aligns modify $ \s -> s{ stTable = True } - return $ "\\begin{longtable}[c]" <> + return $ "\\begin{longtable}[]" <> braces ("@{}" <> colDescriptors <> "@{}") -- the @{} removes extra space at beginning and end $$ capt - $$ "\\toprule" + $$ (if all null heads then "\\toprule" else empty) $$ headers $$ endhead $$ vcat rows' @@ -1265,3 +1263,24 @@ commonFromBcp47 x = fromIso $ head x deNote :: Inline -> Inline deNote (Note _) = RawInline (Format "latex") "" deNote x = x + +pDocumentOptions :: P.Parsec String () [String] +pDocumentOptions = do + P.char '[' + opts <- P.sepBy + (P.many $ P.spaces *> P.noneOf (" ,]" :: String) <* P.spaces) + (P.char ',') + P.char ']' + return opts + +pDocumentClass :: P.Parsec String () String +pDocumentClass = + do P.skipMany (P.satisfy (/='\\')) + P.string "\\documentclass" + classOptions <- pDocumentOptions <|> return [] + if ("article" :: String) `elem` classOptions + then return "article" + else do P.skipMany (P.satisfy (/='{')) + P.char '{' + P.manyTill P.letter (P.char '}') + diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs index 5a92f3cdf..ce993093c 100644 --- a/src/Text/Pandoc/Writers/Markdown.hs +++ b/src/Text/Pandoc/Writers/Markdown.hs @@ -53,6 +53,7 @@ import Data.Yaml (Value(Object,String,Array,Bool,Number)) import qualified Data.HashMap.Strict as H import qualified Data.Vector as V import qualified Data.Text as T +import qualified Data.Set as Set type Notes = [[Block]] type Ref = ([Inline], Target, Attr) @@ -61,11 +62,11 @@ data WriterState = WriterState { stNotes :: Notes , stRefs :: Refs , stRefShortcutable :: Bool , stInList :: Bool - , stIds :: [String] + , stIds :: Set.Set String , stPlain :: Bool } instance Default WriterState where def = WriterState{ stNotes = [], stRefs = [], stRefShortcutable = True, - stInList = False, stIds = [], stPlain = False } + stInList = False, stIds = Set.empty, stPlain = False } -- | Convert Pandoc to Markdown. writeMarkdown :: WriterOptions -> Pandoc -> String @@ -116,7 +117,7 @@ plainTitleBlock tit auths dat = dat <> cr yamlMetadataBlock :: Value -> Doc -yamlMetadataBlock v = "---" $$ (jsonToYaml v) $$ "..." +yamlMetadataBlock v = "---" $$ (jsonToYaml v) $$ "---" jsonToYaml :: Value -> Doc jsonToYaml (Object hashmap) = @@ -364,7 +365,7 @@ blockToMarkdown opts (Header level attr inlines) = do -- so we know whether to print an explicit identifier ids <- gets stIds let autoId = uniqueIdent inlines ids - modify $ \st -> st{ stIds = autoId : ids } + modify $ \st -> st{ stIds = Set.insert autoId ids } let attr' = case attr of ("",[],[]) -> empty (id',[],[]) | isEnabled Ext_auto_identifiers opts diff --git a/src/Text/Pandoc/Writers/Org.hs b/src/Text/Pandoc/Writers/Org.hs index d843d2efd..20086ed19 100644 --- a/src/Text/Pandoc/Writers/Org.hs +++ b/src/Text/Pandoc/Writers/Org.hs @@ -102,6 +102,10 @@ escapeString = escapeStringUsing $ , ('\x2026',"...") ] ++ backslashEscapes "^_" +isRawFormat :: Format -> Bool +isRawFormat f = + f == Format "latex" || f == Format "tex" || f == Format "org" + -- | Convert Pandoc block element to Org. blockToOrg :: Block -- ^ Block element -> State WriterState Doc @@ -129,7 +133,7 @@ blockToOrg (Para inlines) = do blockToOrg (RawBlock "html" str) = return $ blankline $$ "#+BEGIN_HTML" $$ nest 2 (text str) $$ "#+END_HTML" $$ blankline -blockToOrg (RawBlock f str) | f `elem` ["org", "latex", "tex"] = +blockToOrg (RawBlock f str) | isRawFormat f = return $ text str blockToOrg (RawBlock _ _) = return empty blockToOrg HorizontalRule = return $ blankline $$ "--------------" $$ blankline @@ -271,7 +275,8 @@ inlineToOrg (Math t str) = do return $ if t == InlineMath then "$" <> text str <> "$" else "$$" <> text str <> "$$" -inlineToOrg (RawInline f str) | f == "tex" || f == "latex" = return $ text str +inlineToOrg (RawInline f str) | isRawFormat f = + return $ text str inlineToOrg (RawInline _ _) = return empty inlineToOrg (LineBreak) = return (text "\\\\" <> cr) inlineToOrg Space = return space diff --git a/src/Text/Pandoc/Writers/TEI.hs b/src/Text/Pandoc/Writers/TEI.hs new file mode 100644 index 000000000..b9e683ab9 --- /dev/null +++ b/src/Text/Pandoc/Writers/TEI.hs @@ -0,0 +1,320 @@ +{-# LANGUAGE OverloadedStrings, PatternGuards #-} +{- +Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.Docbook + Copyright : Copyright (C) 2006-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to Docbook XML. +-} +module Text.Pandoc.Writers.TEI (writeTEI) where +import Text.Pandoc.Definition +import Text.Pandoc.XML +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Options +import Text.Pandoc.Templates (renderTemplate') +import Data.List ( stripPrefix, isPrefixOf, isSuffixOf ) +import Data.Char ( toLower ) +import Text.Pandoc.Highlighting ( languages, languagesByExtension ) +import Text.Pandoc.Pretty +import Text.Pandoc.ImageSize +import qualified Text.Pandoc.Builder as B + +-- | Convert list of authors to a docbook <author> section +authorToTEI :: WriterOptions -> [Inline] -> B.Inlines +authorToTEI opts name' = + let name = render Nothing $ inlinesToTEI opts name' + colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + in B.rawInline "tei" $ render colwidth $ + inTagsSimple "author" (text $ escapeStringForXML name) + +-- | Convert Pandoc document to string in Docbook format. +writeTEI :: WriterOptions -> Pandoc -> String +writeTEI opts (Pandoc meta blocks) = + let elements = hierarchicalize blocks + colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + render' = render colwidth + opts' = if "/book>" `isSuffixOf` + (trimr $ writerTemplate opts) + then opts{ writerChapters = True } + else opts + startLvl = if writerChapters opts' then 0 else 1 + auths' = map (authorToTEI opts) $ docAuthors meta + meta' = B.setMeta "author" auths' meta + Just metadata = metaToJSON opts + (Just . render colwidth . (vcat . + (map (elementToTEI opts' startLvl)) . hierarchicalize)) + (Just . render colwidth . inlinesToTEI opts') + meta' + main = render' $ vcat (map (elementToTEI opts' startLvl) elements) + context = defField "body" main + $ defField "mathml" (case writerHTMLMathMethod opts of + MathML _ -> True + _ -> False) + $ metadata + in if writerStandalone opts + then renderTemplate' (writerTemplate opts) context + else main + +-- | Convert an Element to TEI. +elementToTEI :: WriterOptions -> Int -> Element -> Doc +elementToTEI opts _ (Blk block) = blockToTEI opts block +elementToTEI opts lvl (Sec _ _num (id',_,_) title elements) = + -- TEI doesn't allow sections with no content, so insert some if needed + let elements' = if null elements + then [Blk (Para [])] + else elements + divType = case lvl of + n | n == 0 -> "chapter" + | n >= 1 && n <= 5 -> "level" ++ show n + | otherwise -> "section" + in inTags True "div" [("type", divType) | not (null id')] $ +-- ("id", writerIdentifierPrefix opts ++ id') | not (null id')] $ + inTagsSimple "head" (inlinesToTEI opts title) $$ + vcat (map (elementToTEI opts (lvl + 1)) elements') + +-- | Convert a list of Pandoc blocks to TEI. +blocksToTEI :: WriterOptions -> [Block] -> Doc +blocksToTEI opts = vcat . map (blockToTEI opts) + +-- | Auxiliary function to convert Plain block to Para. +plainToPara :: Block -> Block +plainToPara (Plain x) = Para x +plainToPara x = x + +-- | Convert a list of pairs of terms and definitions into a TEI +-- list with labels and items. +deflistItemsToTEI :: WriterOptions -> [([Inline],[[Block]])] -> Doc +deflistItemsToTEI opts items = + vcat $ map (\(term, defs) -> deflistItemToTEI opts term defs) items + +-- | Convert a term and a list of blocks into a TEI varlistentry. +deflistItemToTEI :: WriterOptions -> [Inline] -> [[Block]] -> Doc +deflistItemToTEI opts term defs = + let def' = concatMap (map plainToPara) defs + in inTagsIndented "label" (inlinesToTEI opts term) $$ + inTagsIndented "item" (blocksToTEI opts def') + +-- | Convert a list of lists of blocks to a list of TEI list items. +listItemsToTEI :: WriterOptions -> [[Block]] -> Doc +listItemsToTEI opts items = vcat $ map (listItemToTEI opts) items + +-- | Convert a list of blocks into a TEI list item. +listItemToTEI :: WriterOptions -> [Block] -> Doc +listItemToTEI opts item = + inTagsIndented "item" $ blocksToTEI opts $ map plainToPara item + +imageToTEI :: WriterOptions -> Attr -> String -> Doc +imageToTEI _ attr src = selfClosingTag "graphic" $ + ("url", src) : idAndRole attr ++ dims + where + dims = go Width "width" ++ go Height "depth" + go dir dstr = case (dimension dir attr) of + Just a -> [(dstr, show a)] + Nothing -> [] + +-- | Convert a Pandoc block element to TEI. +blockToTEI :: WriterOptions -> Block -> Doc +blockToTEI _ Null = empty +-- Add ids to paragraphs in divs with ids - this is needed for +-- pandoc-citeproc to get link anchors in bibliographies: +blockToTEI opts (Div (ident,_,_) [Para lst]) = + let attribs = [("id", ident) | not (null ident)] in + inTags False "p" attribs $ inlinesToTEI opts lst +blockToTEI opts (Div _ bs) = blocksToTEI opts $ map plainToPara bs +blockToTEI _ (Header _ _ _) = empty -- should not occur after hierarchicalize +-- For TEI simple, text must be within containing block element, so +-- we use plainToPara to ensure that Plain text ends up contained by +-- something. +blockToTEI opts (Plain lst) = blockToTEI opts $ Para lst +-- title beginning with fig: indicates that the image is a figure +--blockToTEI opts (Para [Image attr txt (src,'f':'i':'g':':':_)]) = +-- let alt = inlinesToTEI opts txt +-- capt = if null txt +-- then empty +-- else inTagsSimple "title" alt +-- in inTagsIndented "figure" $ +-- capt $$ +-- (inTagsIndented "mediaobject" $ +-- (inTagsIndented "imageobject" +-- (imageToTEI opts attr src)) $$ +-- inTagsSimple "textobject" (inTagsSimple "phrase" alt)) +blockToTEI opts (Para lst) = + inTags False "p" [] $ inlinesToTEI opts lst +blockToTEI opts (BlockQuote blocks) = + inTagsIndented "quote" $ blocksToTEI opts blocks +blockToTEI _ (CodeBlock (_,classes,_) str) = + text ("<ab type='codeblock " ++ lang ++ "'>") <> cr <> + flush (text (escapeStringForXML str) <> cr <> text "</ab>") + where lang = if null langs + then "" + else escapeStringForXML (head langs) + isLang l = map toLower l `elem` map (map toLower) languages + langsFrom s = if isLang s + then [s] + else languagesByExtension . map toLower $ s + langs = concatMap langsFrom classes +blockToTEI opts (BulletList lst) = + let attribs = [("type", "unordered")] + in inTags True "list" attribs $ listItemsToTEI opts lst +blockToTEI _ (OrderedList _ []) = empty +blockToTEI opts (OrderedList (start, numstyle, _) (first:rest)) = + let attribs = case numstyle of + DefaultStyle -> [] + Decimal -> [("type", "ordered:arabic")] + Example -> [("type", "ordered:arabic")] + UpperAlpha -> [("type", "ordered:upperalpha")] + LowerAlpha -> [("type", "ordered:loweralpha")] + UpperRoman -> [("type", "ordered:upperroman")] + LowerRoman -> [("type", "ordered:lowerroman")] + items = if start == 1 + then listItemsToTEI opts (first:rest) + else (inTags True "item" [("n",show start)] + (blocksToTEI opts $ map plainToPara first)) $$ + listItemsToTEI opts rest + in inTags True "list" attribs items +blockToTEI opts (DefinitionList lst) = + let attribs = [("type", "definition")] + in inTags True "list" attribs $ deflistItemsToTEI opts lst +blockToTEI _ (RawBlock f str) + | f == "tei" = text str -- raw TEI block (should such a thing exist). +-- | f == "html" = text str -- allow html for backwards compatibility + | otherwise = empty +blockToTEI _ HorizontalRule = + selfClosingTag "milestone" [("unit","undefined"), ("type","separator"),("rendition","line")] + +-- | TEI Tables +-- TEI Simple's tables are composed of cells and rows; other +-- table info in the AST is here lossily discard. +blockToTEI opts (Table _ _ _ headers rows) = + let + headers' = tableHeadersToTEI opts headers +-- headers' = if all null headers +-- then return empty +-- else tableRowToTEI opts headers + in + inTags True "table" [] $ + vcat $ [headers'] <> map (tableRowToTEI opts) rows + +tableRowToTEI :: WriterOptions + -> [[Block]] + -> Doc +tableRowToTEI opts cols = + inTagsIndented "row" $ vcat $ map (tableItemToTEI opts) cols + +tableHeadersToTEI :: WriterOptions + -> [[Block]] + -> Doc +tableHeadersToTEI opts cols = + inTags True "row" [("role","label")] $ vcat $ map (tableItemToTEI opts) cols + +tableItemToTEI :: WriterOptions + -> [Block] + -> Doc +tableItemToTEI opts item = + inTags False "cell" [] $ vcat $ map (blockToTEI opts) item + +-- | Convert a list of inline elements to TEI. +inlinesToTEI :: WriterOptions -> [Inline] -> Doc +inlinesToTEI opts lst = hcat $ map (inlineToTEI opts) lst + +-- | Convert an inline element to TEI. +inlineToTEI :: WriterOptions -> Inline -> Doc +inlineToTEI _ (Str str) = text $ escapeStringForXML str +inlineToTEI opts (Emph lst) = + inTags False "hi" [("rendition","simple:italic")] $ inlinesToTEI opts lst +inlineToTEI opts (Strong lst) = + inTags False "hi" [("rendition", "simple:bold")] $ inlinesToTEI opts lst +inlineToTEI opts (Strikeout lst) = + inTags False "hi" [("rendition", "simple:strikethrough")] $ + inlinesToTEI opts lst +inlineToTEI opts (Superscript lst) = + inTags False "hi" [("rendition", "simple:superscript")] $ inlinesToTEI opts lst +inlineToTEI opts (Subscript lst) = + inTags False "hi" [("rendition", "simple:subscript")] $ inlinesToTEI opts lst +inlineToTEI opts (SmallCaps lst) = + inTags False "hi" [("rendition", "simple:smallcaps")] $ + inlinesToTEI opts lst +inlineToTEI opts (Quoted _ lst) = + inTagsSimple "quote" $ inlinesToTEI opts lst +inlineToTEI opts (Cite _ lst) = + inlinesToTEI opts lst +inlineToTEI opts (Span _ ils) = + inlinesToTEI opts ils +inlineToTEI _ (Code _ str) = + inTags False "seg" [("type","code")] $ text (escapeStringForXML str) +-- Distinguish display from inline math by wrapping the former in a "figure." +inlineToTEI _ (Math t str) = + case t of + InlineMath -> inTags False "formula" [("notation","TeX")] $ + text (str) + DisplayMath -> inTags True "figure" [("type","math")] $ + inTags False "formula" [("notation","TeX")] $ text (str) + +inlineToTEI _ (RawInline f x) | f == "tei" = text x + | otherwise = empty +inlineToTEI _ LineBreak = selfClosingTag "lb" [] +inlineToTEI _ Space = space +-- because we use \n for LineBreak, we can't do soft breaks: +inlineToTEI _ SoftBreak = space +inlineToTEI opts (Link attr txt (src, _)) + | Just email <- stripPrefix "mailto:" src = + let emailLink = text $ + escapeStringForXML $ email + in case txt of + [Str s] | escapeURI s == email -> emailLink + _ -> inlinesToTEI opts txt <+> + char '(' <> emailLink <> char ')' + | otherwise = + (if isPrefixOf "#" src + then inTags False "ref" $ ("target", drop 1 src) : idAndRole attr + else inTags False "ref" $ ("target", src) : idAndRole attr ) $ + inlinesToTEI opts txt +inlineToTEI opts (Image attr description (src, tit)) = + let titleDoc = if null tit + then empty + else inTags False "figDesc" [] (text $ escapeStringForXML tit) + imageDesc = if null description + then empty + else inTags False "head" [] (inlinesToTEI opts description) + in inTagsIndented "figure" $ imageDesc $$ + imageToTEI opts attr src $$ titleDoc +inlineToTEI opts (Note contents) = + inTagsIndented "note" $ blocksToTEI opts contents + +idAndRole :: Attr -> [(String, String)] +idAndRole (id',cls,_) = ident ++ role + where + ident = if null id' + then [] + else [("id", id')] + role = if null cls + then [] + else [("role", unwords cls)] + diff --git a/src/Text/Pandoc/Writers/Texinfo.hs b/src/Text/Pandoc/Writers/Texinfo.hs index 1aefaa678..8420704dc 100644 --- a/src/Text/Pandoc/Writers/Texinfo.hs +++ b/src/Text/Pandoc/Writers/Texinfo.hs @@ -43,13 +43,14 @@ import Text.Pandoc.Pretty import Text.Pandoc.ImageSize import Network.URI ( isURI, unEscapeString ) import System.FilePath +import qualified Data.Set as Set data WriterState = WriterState { stStrikeout :: Bool -- document contains strikeout , stSuperscript :: Bool -- document contains superscript , stSubscript :: Bool -- document contains subscript , stEscapeComma :: Bool -- in a context where we need @comma - , stIdentifiers :: [String] -- header ids used already + , stIdentifiers :: Set.Set String -- header ids used already , stOptions :: WriterOptions -- writer options } @@ -64,7 +65,7 @@ writeTexinfo options document = evalState (pandocToTexinfo options $ wrapTop document) $ WriterState { stStrikeout = False, stSuperscript = False, stEscapeComma = False, stSubscript = False, - stIdentifiers = [], stOptions = options} + stIdentifiers = Set.empty, stOptions = options} -- | Add a "Top" node around the document, needed by Texinfo. wrapTop :: Pandoc -> Pandoc @@ -215,7 +216,7 @@ blockToTexinfo (Header level _ lst) = do txt <- inlineListToTexinfo lst idsUsed <- gets stIdentifiers let id' = uniqueIdent lst idsUsed - modify $ \st -> st{ stIdentifiers = id' : idsUsed } + modify $ \st -> st{ stIdentifiers = Set.insert id' idsUsed } return $ if (level > 0) && (level <= 4) then blankline <> text "@node " <> node $$ text (seccmd level) <> txt $$ |