aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/DocBook.hs7
-rw-r--r--src/Text/Pandoc/Readers/Docx.hs805
-rw-r--r--src/Text/Pandoc/Readers/Docx/Fonts.hs238
-rw-r--r--src/Text/Pandoc/Readers/Docx/Lists.hs35
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse.hs981
-rw-r--r--src/Text/Pandoc/Readers/Docx/Reducible.hs183
-rw-r--r--src/Text/Pandoc/Readers/EPUB.hs285
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs341
-rw-r--r--src/Text/Pandoc/Readers/Haddock.hs40
-rw-r--r--src/Text/Pandoc/Readers/LaTeX.hs5
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs236
-rw-r--r--src/Text/Pandoc/Readers/MediaWiki.hs5
-rw-r--r--src/Text/Pandoc/Readers/Org.hs51
-rw-r--r--src/Text/Pandoc/Readers/RST.hs13
-rw-r--r--src/Text/Pandoc/Readers/TeXMath.hs20
-rw-r--r--src/Text/Pandoc/Readers/Textile.hs24
-rw-r--r--src/Text/Pandoc/Readers/Txt2Tags.hs548
17 files changed, 2743 insertions, 1074 deletions
diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs
index cf1d5132e..1e119e729 100644
--- a/src/Text/Pandoc/Readers/DocBook.hs
+++ b/src/Text/Pandoc/Readers/DocBook.hs
@@ -769,7 +769,12 @@ parseBlock (Elem e) =
"" -> []
x -> [x]
return $ codeBlockWith (attrValue "id" e, classes', [])
- $ trimNl $ strContent e
+ $ trimNl $ strContentRecursive e
+ strContentRecursive = strContent . (\e' -> e'{ elContent =
+ map elementToStr $ elContent e' })
+ elementToStr :: Content -> Content
+ elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing
+ elementToStr x = x
parseBlockquote = do
attrib <- case filterChild (named "attribution") e of
Nothing -> return mempty
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index 09c2330fb..085ee01fc 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -1,3 +1,5 @@
+{-# LANGUAGE PatternGuards #-}
+
{-
Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
@@ -76,48 +78,188 @@ import Codec.Archive.Zip
import Text.Pandoc.Definition
import Text.Pandoc.Options
import Text.Pandoc.Builder (text, toList)
-import Text.Pandoc.Generic (bottomUp)
-import Text.Pandoc.MIME (getMimeType)
-import Text.Pandoc.UTF8 (toString)
+import Text.Pandoc.Walk
import Text.Pandoc.Readers.Docx.Parse
import Text.Pandoc.Readers.Docx.Lists
-import Data.Maybe (mapMaybe, isJust, fromJust)
-import Data.List (delete, isPrefixOf, (\\), intersect)
-import qualified Data.ByteString as BS
+import Text.Pandoc.Readers.Docx.Reducible
+import Text.Pandoc.Shared
+import Text.Pandoc.MediaBag (insertMedia, MediaBag)
+import Data.Maybe (mapMaybe)
+import Data.List (delete, stripPrefix, (\\), intersect)
+import Data.Monoid
+import Text.TeXMath (writeTeX)
+import Data.Default (Default)
import qualified Data.ByteString.Lazy as B
-import Data.ByteString.Base64 (encode)
-import System.FilePath (combine)
+import qualified Data.Map as M
+import Control.Monad.Reader
+import Control.Monad.State
+import Control.Applicative ((<$>))
readDocx :: ReaderOptions
-> B.ByteString
- -> Pandoc
+ -> (Pandoc, MediaBag)
readDocx opts bytes =
case archiveToDocx (toArchive bytes) of
- Just docx -> Pandoc nullMeta (docxToBlocks opts docx)
- Nothing -> error $ "couldn't parse docx file"
-
-runStyleToSpanAttr :: RunStyle -> (String, [String], [(String, String)])
-runStyleToSpanAttr rPr = ("",
- mapMaybe id [
- if isBold rPr then (Just "strong") else Nothing,
- if isItalic rPr then (Just "emph") else Nothing,
- if isSmallCaps rPr then (Just "smallcaps") else Nothing,
- if isStrike rPr then (Just "strike") else Nothing,
- if isSuperScript rPr then (Just "superscript") else Nothing,
- if isSubScript rPr then (Just "subscript") else Nothing,
- rStyle rPr],
- case underline rPr of
- Just fmt -> [("underline", fmt)]
- _ -> []
- )
-
-parStyleToDivAttr :: ParagraphStyle -> (String, [String], [(String, String)])
-parStyleToDivAttr pPr = ("",
- pStyle pPr,
- case indent pPr of
- Just n -> [("indent", (show n))]
- Nothing -> []
- )
+ Right docx -> (Pandoc meta blks, mediaBag) where
+ (meta, blks, mediaBag) = (docxToOutput opts docx)
+ Left _ -> error $ "couldn't parse docx file"
+
+data DState = DState { docxAnchorMap :: M.Map String String
+ , docxMediaBag :: MediaBag }
+
+instance Default DState where
+ def = DState { docxAnchorMap = M.empty
+ , docxMediaBag = mempty }
+
+data DEnv = DEnv { docxOptions :: ReaderOptions
+ , docxInHeaderBlock :: Bool }
+
+instance Default DEnv where
+ def = DEnv def False
+
+type DocxContext = ReaderT DEnv (State DState)
+
+evalDocxContext :: DocxContext a -> DEnv -> DState -> a
+evalDocxContext ctx env st = evalState (runReaderT ctx env) st
+
+concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b]
+concatMapM f xs = liftM concat (mapM f xs)
+
+-- This is empty, but we put it in for future-proofing.
+spansToKeep :: [String]
+spansToKeep = []
+
+divsToKeep :: [String]
+divsToKeep = ["list-item", "Definition", "DefinitionTerm"]
+
+metaStyles :: M.Map String String
+metaStyles = M.fromList [ ("Title", "title")
+ , ("Subtitle", "subtitle")
+ , ("Author", "author")
+ , ("Date", "date")
+ , ("Abstract", "abstract")]
+
+sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart])
+sepBodyParts = span (\bp -> (isMetaPar bp || isEmptyPar bp))
+
+isMetaPar :: BodyPart -> Bool
+isMetaPar (Paragraph pPr _) =
+ not $ null $ intersect (pStyle pPr) (M.keys metaStyles)
+isMetaPar _ = False
+
+isEmptyPar :: BodyPart -> Bool
+isEmptyPar (Paragraph _ parParts) =
+ all isEmptyParPart parParts
+ where
+ isEmptyParPart (PlainRun (Run _ runElems)) = all isEmptyElem runElems
+ isEmptyParPart _ = False
+ isEmptyElem (TextRun s) = trim s == ""
+ isEmptyElem _ = True
+isEmptyPar _ = False
+
+bodyPartsToMeta' :: [BodyPart] -> DocxContext (M.Map String MetaValue)
+bodyPartsToMeta' [] = return M.empty
+bodyPartsToMeta' (bp : bps)
+ | (Paragraph pPr parParts) <- bp
+ , (c : _)<- intersect (pStyle pPr) (M.keys metaStyles)
+ , (Just metaField) <- M.lookup c metaStyles = do
+ inlines <- parPartsToInlines parParts
+ remaining <- bodyPartsToMeta' bps
+ let
+ f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils']
+ f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks ((Para ils) : blks)
+ f m (MetaList mv) = MetaList (m : mv)
+ f m n = MetaList [m, n]
+ return $ M.insertWith f metaField (MetaInlines inlines) remaining
+bodyPartsToMeta' (_ : bps) = bodyPartsToMeta' bps
+
+bodyPartsToMeta :: [BodyPart] -> DocxContext Meta
+bodyPartsToMeta bps = do
+ mp <- bodyPartsToMeta' bps
+ let mp' =
+ case M.lookup "author" mp of
+ Just mv -> M.insert "author" (fixAuthors mv) mp
+ Nothing -> mp
+ return $ Meta mp'
+
+fixAuthors :: MetaValue -> MetaValue
+fixAuthors (MetaBlocks blks) =
+ MetaList $ map g $ filter f blks
+ where f (Para _) = True
+ f _ = False
+ g (Para ils) = MetaInlines ils
+ g _ = MetaInlines []
+fixAuthors mv = mv
+
+runStyleToContainers :: RunStyle -> [Container Inline]
+runStyleToContainers rPr =
+ let spanClassToContainers :: String -> [Container Inline]
+ spanClassToContainers s | s `elem` codeSpans =
+ [Container $ (\ils -> Code ("", [], []) (concatMap ilToCode ils))]
+ spanClassToContainers s | s `elem` spansToKeep =
+ [Container $ Span ("", [s], [])]
+ spanClassToContainers _ = []
+
+ classContainers = case rStyle rPr of
+ Nothing -> []
+ Just s -> spanClassToContainers s
+
+ formatters = map Container $ mapMaybe id
+ [ if isBold rPr then (Just Strong) else Nothing
+ , if isItalic rPr then (Just Emph) else Nothing
+ , if isSmallCaps rPr then (Just SmallCaps) else Nothing
+ , if isStrike rPr then (Just Strikeout) else Nothing
+ , if isSuperScript rPr then (Just Superscript) else Nothing
+ , if isSubScript rPr then (Just Subscript) else Nothing
+ , rUnderline rPr >>=
+ (\f -> if f == "single" then (Just Emph) else Nothing)
+ ]
+ in
+ classContainers ++ formatters
+
+parStyleToContainers :: ParagraphStyle -> [Container Block]
+parStyleToContainers pPr | (c:cs) <- pStyle pPr, Just n <- isHeaderClass c =
+ [Container $ \_ -> Header n ("", delete ("Heading" ++ show n) cs, []) []]
+parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` divsToKeep =
+ let pPr' = pPr { pStyle = cs }
+ in
+ (Container $ Div ("", [c], [])) : (parStyleToContainers pPr')
+parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` codeDivs =
+ -- This is a bit of a cludge. We make the codeblock from the raw
+ -- parparts in bodyPartToBlocks. But we need something to match against.
+ let pPr' = pPr { pStyle = cs }
+ in
+ (Container $ \_ -> CodeBlock ("", [], []) "") : (parStyleToContainers pPr')
+parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` listParagraphDivs =
+ let pPr' = pPr { pStyle = cs, indentation = Nothing}
+ in
+ (Container $ Div ("", [c], [])) : (parStyleToContainers pPr')
+
+parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` blockQuoteDivs =
+ let pPr' = pPr { pStyle = cs \\ blockQuoteDivs }
+ in
+ (Container BlockQuote) : (parStyleToContainers pPr')
+parStyleToContainers pPr | (_:cs) <- pStyle pPr =
+ let pPr' = pPr { pStyle = cs}
+ in
+ parStyleToContainers pPr'
+parStyleToContainers pPr | null (pStyle pPr),
+ Just left <- indentation pPr >>= leftParIndent,
+ Just hang <- indentation pPr >>= hangingParIndent =
+ let pPr' = pPr { indentation = Nothing }
+ in
+ case (left - hang) > 0 of
+ True -> (Container BlockQuote) : (parStyleToContainers pPr')
+ False -> parStyleToContainers pPr'
+parStyleToContainers pPr | null (pStyle pPr),
+ Just left <- indentation pPr >>= leftParIndent =
+ let pPr' = pPr { indentation = Nothing }
+ in
+ case left > 0 of
+ True -> (Container BlockQuote) : (parStyleToContainers pPr')
+ False -> parStyleToContainers pPr'
+parStyleToContainers _ = []
+
strToInlines :: String -> [Inline]
strToInlines = toList . text
@@ -144,115 +286,103 @@ runElemToString (Tab) = ['\t']
runElemsToString :: [RunElem] -> String
runElemsToString = concatMap runElemToString
---- We use this instead of the more general
---- Text.Pandoc.Shared.normalize for reasons of efficiency. For
---- whatever reason, `normalize` makes a run take almost twice as
---- long. (It does more, but this does what we need)
-inlineNormalize :: [Inline] -> [Inline]
-inlineNormalize [] = []
-inlineNormalize (Str "" : ils) = inlineNormalize ils
-inlineNormalize ((Str s) : (Str s') : l) =
- inlineNormalize (Str (s++s') : l)
-inlineNormalize ((Emph ils) : (Emph ils') : l) =
- inlineNormalize $ (Emph $ inlineNormalize (ils ++ ils')) : l
-inlineNormalize ((Emph ils) : l) =
- Emph (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Strong ils) : (Strong ils') : l) =
- inlineNormalize $ (Strong $ inlineNormalize (ils ++ ils')) : l
-inlineNormalize ((Strong ils) : l) =
- Strong (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Strikeout ils) : (Strikeout ils') : l) =
- inlineNormalize $ (Strikeout $ inlineNormalize (ils ++ ils')) : l
-inlineNormalize ((Strikeout ils) : l) =
- Strikeout (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Superscript ils) : (Superscript ils') : l) =
- inlineNormalize $ (Superscript $ inlineNormalize (ils ++ ils')) : l
-inlineNormalize ((Superscript ils) : l) =
- Superscript (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Subscript ils) : (Subscript ils') : l) =
- inlineNormalize $ (Subscript $ inlineNormalize (ils ++ ils')) : l
-inlineNormalize ((Subscript ils) : l) =
- Subscript (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Space : Space : l)) =
- inlineNormalize $ (Space : l)
-inlineNormalize ((Quoted qt ils) : l) =
- Quoted qt (inlineNormalize ils) : inlineNormalize l
-inlineNormalize ((Cite cits ils) : l) =
- let
- f :: Citation -> Citation
- f (Citation s pref suff mode num hash) =
- Citation s (inlineNormalize pref) (inlineNormalize suff) mode num hash
- in
- Cite (map f cits) (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Link ils s) : l) =
- Link (inlineNormalize ils) s : (inlineNormalize l)
-inlineNormalize ((Image ils s) : l) =
- Image (inlineNormalize ils) s : (inlineNormalize l)
-inlineNormalize ((Note blks) : l) =
- Note (map blockNormalize blks) : (inlineNormalize l)
-inlineNormalize ((Span attr ils) : l) =
- Span attr (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize (il : l) = il : (inlineNormalize l)
-
-stripSpaces :: [Inline] -> [Inline]
-stripSpaces ils =
- reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) ils
-
-blockNormalize :: Block -> Block
-blockNormalize (Plain ils) = Plain $ stripSpaces $ inlineNormalize ils
-blockNormalize (Para ils) = Para $ stripSpaces $ inlineNormalize ils
-blockNormalize (Header n attr ils) =
- Header n attr $ stripSpaces $ inlineNormalize ils
-blockNormalize (Table ils align width hdr cells) =
- Table (stripSpaces $ inlineNormalize ils) align width hdr cells
-blockNormalize (DefinitionList pairs) =
- DefinitionList $ map (\(ils, blklsts) -> (stripSpaces (inlineNormalize ils), (map (map blockNormalize) blklsts))) pairs
-blockNormalize (BlockQuote blks) = BlockQuote (map blockNormalize blks)
-blockNormalize (OrderedList attr blkslst) =
- OrderedList attr $ map (\blks -> map blockNormalize blks) blkslst
-blockNormalize (BulletList blkslst) =
- BulletList $ map (\blks -> map blockNormalize blks) blkslst
-blockNormalize (Div attr blks) = Div attr (map blockNormalize blks)
-blockNormalize blk = blk
-
-runToInlines :: ReaderOptions -> Docx -> Run -> [Inline]
-runToInlines _ _ (Run rs runElems)
- | isJust (rStyle rs) && (fromJust (rStyle rs)) `elem` codeSpans =
- case runStyleToSpanAttr rs == ("", [], []) of
- True -> [Str (runElemsToString runElems)]
- False -> [Span (runStyleToSpanAttr rs) [Str (runElemsToString runElems)]]
- | otherwise = case runStyleToSpanAttr rs == ("", [], []) of
- True -> concatMap runElemToInlines runElems
- False -> [Span (runStyleToSpanAttr rs) (concatMap runElemToInlines runElems)]
-runToInlines opts docx@(Docx _ notes _ _ _ ) (Footnote fnId) =
- case (getFootNote fnId notes) of
- Just bodyParts ->
- [Note [Div ("", ["footnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]]
- Nothing ->
- [Note [Div ("", ["footnote"], []) []]]
-runToInlines opts docx@(Docx _ notes _ _ _) (Endnote fnId) =
- case (getEndNote fnId notes) of
- Just bodyParts ->
- [Note [Div ("", ["endnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]]
- Nothing ->
- [Note [Div ("", ["endnote"], []) []]]
-
-parPartToInlines :: ReaderOptions -> Docx -> ParPart -> [Inline]
-parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r
-parPartToInlines _ _ (BookMark _ anchor) =
- [Span (anchor, ["anchor"], []) []]
-parPartToInlines _ (Docx _ _ _ rels _) (Drawing relid) =
- case lookupRelationship relid rels of
- Just target -> [Image [] (combine "word" target, "")]
- Nothing -> [Image [] ("", "")]
-parPartToInlines opts docx (InternalHyperLink anchor runs) =
- [Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")]
-parPartToInlines opts docx@(Docx _ _ _ rels _) (ExternalHyperLink relid runs) =
- case lookupRelationship relid rels of
- Just target ->
- [Link (concatMap (runToInlines opts docx) runs) (target, "")]
- Nothing ->
- [Link (concatMap (runToInlines opts docx) runs) ("", "")]
+runToString :: Run -> String
+runToString (Run _ runElems) = runElemsToString runElems
+runToString _ = ""
+
+parPartToString :: ParPart -> String
+parPartToString (PlainRun run) = runToString run
+parPartToString (InternalHyperLink _ runs) = concatMap runToString runs
+parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs
+parPartToString _ = ""
+
+
+inlineCodeContainer :: Container Inline -> Bool
+inlineCodeContainer (Container f) = case f [] of
+ Code _ "" -> True
+ _ -> False
+inlineCodeContainer _ = False
+
+
+runToInlines :: Run -> DocxContext [Inline]
+runToInlines (Run rs runElems)
+ | any inlineCodeContainer (runStyleToContainers rs) =
+ return $
+ rebuild (runStyleToContainers rs) $ [Str $ runElemsToString runElems]
+ | otherwise =
+ return $
+ rebuild (runStyleToContainers rs) (concatMap runElemToInlines runElems)
+runToInlines (Footnote bps) =
+ concatMapM bodyPartToBlocks bps >>= (\blks -> return [Note blks])
+runToInlines (Endnote bps) =
+ concatMapM bodyPartToBlocks bps >>= (\blks -> return [Note blks])
+runToInlines (InlineDrawing fp bs) = do
+ mediaBag <- gets docxMediaBag
+ modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag }
+ return [Image [] (fp, "")]
+
+
+
+
+parPartToInlines :: ParPart -> DocxContext [Inline]
+parPartToInlines (PlainRun r) = runToInlines r
+parPartToInlines (Insertion _ author date runs) = do
+ opts <- asks docxOptions
+ case readerTrackChanges opts of
+ AcceptChanges -> concatMapM runToInlines runs >>= return
+ RejectChanges -> return []
+ AllChanges -> do
+ ils <- (concatMapM runToInlines runs)
+ return [Span
+ ("", ["insertion"], [("author", author), ("date", date)])
+ ils]
+parPartToInlines (Deletion _ author date runs) = do
+ opts <- asks docxOptions
+ case readerTrackChanges opts of
+ AcceptChanges -> return []
+ RejectChanges -> concatMapM runToInlines runs >>= return
+ AllChanges -> do
+ ils <- concatMapM runToInlines runs
+ return [Span
+ ("", ["deletion"], [("author", author), ("date", date)])
+ ils]
+parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = return []
+parPartToInlines (BookMark _ anchor) =
+ -- We record these, so we can make sure not to overwrite
+ -- user-defined anchor links with header auto ids.
+ do
+ -- get whether we're in a header.
+ inHdrBool <- asks docxInHeaderBlock
+ -- Get the anchor map.
+ anchorMap <- gets docxAnchorMap
+ -- We don't want to rewrite if we're in a header, since we'll take
+ -- care of that later, when we make the header anchor. If the
+ -- bookmark were already in uniqueIdent form, this would lead to a
+ -- duplication. Otherwise, we check to see if the id is already in
+ -- there. Rewrite if necessary. This will have the possible effect
+ -- of rewriting user-defined anchor links. However, since these
+ -- are not defined in pandoc, it seems like a necessary evil to
+ -- avoid an extra pass.
+ let newAnchor =
+ if not inHdrBool && anchor `elem` (M.elems anchorMap)
+ then uniqueIdent [Str anchor] (M.elems anchorMap)
+ else anchor
+ unless inHdrBool
+ (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap})
+ return [Span (newAnchor, ["anchor"], []) []]
+parPartToInlines (Drawing fp bs) = do
+ mediaBag <- gets docxMediaBag
+ modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag }
+ return [Image [] (fp, "")]
+parPartToInlines (InternalHyperLink anchor runs) = do
+ ils <- concatMapM runToInlines runs
+ return [Link ils ('#' : anchor, "")]
+parPartToInlines (ExternalHyperLink target runs) = do
+ ils <- concatMapM runToInlines runs
+ return [Link ils (target, "")]
+parPartToInlines (PlainOMath exps) = do
+ return [Math InlineMath (writeTeX exps)]
+
isAnchorSpan :: Inline -> Bool
isAnchorSpan (Span (ident, classes, kvs) ils) =
@@ -265,74 +395,106 @@ isAnchorSpan _ = False
dummyAnchors :: [String]
dummyAnchors = ["_GoBack"]
-makeHeaderAnchors :: Block -> Block
-makeHeaderAnchors h@(Header n (_, classes, kvs) ils) =
- case filter isAnchorSpan ils of
- [] -> h
- (x@(Span (ident, _, _) _) : xs) ->
- case ident `elem` dummyAnchors of
- True -> h
- False -> Header n (ident, classes, kvs) (ils \\ (x:xs))
- _ -> h
-makeHeaderAnchors blk = blk
-
-
-parPartsToInlines :: ReaderOptions -> Docx -> [ParPart] -> [Inline]
-parPartsToInlines opts docx parparts =
- --
- -- We're going to skip data-uri's for now. It should be an option,
- -- not mandatory.
- --
- (if False -- TODO depend on option
- then bottomUp (makeImagesSelfContained docx)
- else id) $
- bottomUp spanTrim $
- bottomUp spanCorrect $
- bottomUp spanReduce $
- concatMap (parPartToInlines opts docx) parparts
-
-cellToBlocks :: ReaderOptions -> Docx -> Cell -> [Block]
-cellToBlocks opts docx (Cell bps) = map (bodyPartToBlock opts docx) bps
-
-rowToBlocksList :: ReaderOptions -> Docx -> Row -> [[Block]]
-rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells
-
-bodyPartToBlock :: ReaderOptions -> Docx -> BodyPart -> Block
-bodyPartToBlock opts docx (Paragraph pPr parparts) =
- Div (parStyleToDivAttr pPr) [Para (parPartsToInlines opts docx parparts)]
-bodyPartToBlock opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parparts) =
+makeHeaderAnchor :: Block -> DocxContext Block
+-- If there is an anchor already there (an anchor span in the header,
+-- to be exact), we rename and associate the new id with the old one.
+makeHeaderAnchor (Header n (_, classes, kvs) ils)
+ | (x : xs) <- filter isAnchorSpan ils
+ , (Span (ident, _, _) _) <- x
+ , notElem ident dummyAnchors =
+ do
+ hdrIDMap <- gets docxAnchorMap
+ let newIdent = uniqueIdent ils (M.elems hdrIDMap)
+ modify $ \s -> s {docxAnchorMap = M.insert ident newIdent hdrIDMap}
+ return $ Header n (newIdent, classes, kvs) (ils \\ (x:xs))
+-- Otherwise we just give it a name, and register that name (associate
+-- it with itself.)
+makeHeaderAnchor (Header n (_, classes, kvs) ils) =
+ do
+ hdrIDMap <- gets docxAnchorMap
+ let newIdent = uniqueIdent ils (M.elems hdrIDMap)
+ modify $ \s -> s {docxAnchorMap = M.insert newIdent newIdent hdrIDMap}
+ return $ Header n (newIdent, classes, kvs) ils
+makeHeaderAnchor blk = return blk
+
+
+parPartsToInlines :: [ParPart] -> DocxContext [Inline]
+parPartsToInlines parparts = do
+ ils <- concatMapM parPartToInlines parparts
+ return $ reduceList $ ils
+
+cellToBlocks :: Cell -> DocxContext [Block]
+cellToBlocks (Cell bps) = concatMapM bodyPartToBlocks bps
+
+rowToBlocksList :: Row -> DocxContext [[Block]]
+rowToBlocksList (Row cells) = mapM cellToBlocks cells
+
+isBlockCodeContainer :: Container Block -> Bool
+isBlockCodeContainer (Container f) | CodeBlock _ _ <- f [] = True
+isBlockCodeContainer _ = False
+
+isHeaderContainer :: Container Block -> Bool
+isHeaderContainer (Container f) | Header _ _ _ <- f [] = True
+isHeaderContainer _ = False
+
+bodyPartToBlocks :: BodyPart -> DocxContext [Block]
+bodyPartToBlocks (Paragraph pPr parparts)
+ | any isBlockCodeContainer (parStyleToContainers pPr) =
+ let
+ otherConts = filter (not . isBlockCodeContainer) (parStyleToContainers pPr)
+ in
+ return $
+ rebuild
+ otherConts
+ [CodeBlock ("", [], []) (concatMap parPartToString parparts)]
+bodyPartToBlocks (Paragraph pPr parparts)
+ | any isHeaderContainer (parStyleToContainers pPr) = do
+ ils <- normalizeSpaces <$> local (\s -> s{docxInHeaderBlock = True})
+ (parPartsToInlines parparts)
+ let (Container hdrFun) = head $ filter isHeaderContainer (parStyleToContainers pPr)
+ Header n attr _ = hdrFun []
+ hdr <- makeHeaderAnchor $ Header n attr ils
+ return [hdr]
+bodyPartToBlocks (Paragraph pPr parparts) = do
+ ils <- parPartsToInlines parparts >>= (return . normalizeSpaces)
+ case ils of
+ [] -> return []
+ _ -> do
+ return $
+ rebuild
+ (parStyleToContainers pPr)
+ [Para ils]
+bodyPartToBlocks (ListItem pPr numId lvl levelInfo parparts) = do
let
- kvs = case lookupLevel numId lvl numbering of
- Just (_, fmt, txt, Just start) -> [ ("level", lvl)
- , ("num-id", numId)
- , ("format", fmt)
- , ("text", txt)
- , ("start", (show start))
- ]
-
- Just (_, fmt, txt, Nothing) -> [ ("level", lvl)
- , ("num-id", numId)
- , ("format", fmt)
- , ("text", txt)
- ]
- Nothing -> []
- in
- Div
- ("", ["list-item"], kvs)
- [bodyPartToBlock opts docx (Paragraph pPr parparts)]
-bodyPartToBlock _ _ (Tbl _ _ _ []) =
- Para []
-bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) =
+ kvs = case levelInfo of
+ (_, fmt, txt, Just start) -> [ ("level", lvl)
+ , ("num-id", numId)
+ , ("format", fmt)
+ , ("text", txt)
+ , ("start", (show start))
+ ]
+
+ (_, fmt, txt, Nothing) -> [ ("level", lvl)
+ , ("num-id", numId)
+ , ("format", fmt)
+ , ("text", txt)
+ ]
+ blks <- bodyPartToBlocks (Paragraph pPr parparts)
+ return $ [Div ("", ["list-item"], kvs) blks]
+bodyPartToBlocks (Tbl _ _ _ []) =
+ return [Para []]
+bodyPartToBlocks (Tbl cap _ look (r:rs)) = do
let caption = strToInlines cap
(hdr, rows) = case firstRowFormatting look of
True -> (Just r, rs)
False -> (Nothing, r:rs)
- hdrCells = case hdr of
- Just r' -> rowToBlocksList opts docx r'
- Nothing -> []
- cells = map (rowToBlocksList opts docx) rows
+ hdrCells <- case hdr of
+ Just r' -> rowToBlocksList r'
+ Nothing -> return []
- size = case null hdrCells of
+ cells <- mapM rowToBlocksList rows
+
+ let size = case null hdrCells of
True -> length $ head cells
False -> length $ hdrCells
--
@@ -341,208 +503,49 @@ bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) =
-- moment. Width information is in the TblGrid field of the Tbl,
-- so should be possible. Alignment might be more difficult,
-- since there doesn't seem to be a column entity in docx.
- alignments = take size (repeat AlignDefault)
- widths = take size (repeat 0) :: [Double]
- in
- Table caption alignments widths hdrCells cells
-
-makeImagesSelfContained :: Docx -> Inline -> Inline
-makeImagesSelfContained (Docx _ _ _ _ media) i@(Image alt (uri, title)) =
- case lookup uri media of
- Just bs -> case getMimeType uri of
- Just mime -> let data_uri =
- "data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs)
- in
- Image alt (data_uri, title)
- Nothing -> i
- Nothing -> i
-makeImagesSelfContained _ inline = inline
-
-bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block]
-bodyToBlocks opts docx (Body bps) =
- bottomUp removeEmptyPars $
- map blockNormalize $
- bottomUp spanRemove $
- bottomUp divRemove $
- map (makeHeaderAnchors) $
- bottomUp divCorrect $
- bottomUp divReduce $
- bottomUp divCorrectPreReduce $
- bottomUp blocksToDefinitions $
- blocksToBullets $
- map (bodyPartToBlock opts docx) bps
-
-docxToBlocks :: ReaderOptions -> Docx -> [Block]
-docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = bodyToBlocks opts d body
-
-spanReduce :: [Inline] -> [Inline]
-spanReduce [] = []
-spanReduce ((Span (id1, classes1, kvs1) ils1) : ils)
- | (id1, classes1, kvs1) == ("", [], []) = ils1 ++ (spanReduce ils)
-spanReduce (s1@(Span (id1, classes1, kvs1) ils1) :
- s2@(Span (id2, classes2, kvs2) ils2) :
- ils) =
- let classes' = classes1 `intersect` classes2
- kvs' = kvs1 `intersect` kvs2
- classes1' = classes1 \\ classes'
- kvs1' = kvs1 \\ kvs'
- classes2' = classes2 \\ classes'
- kvs2' = kvs2 \\ kvs'
- in
- case null classes' && null kvs' of
- True -> s1 : (spanReduce (s2 : ils))
- False -> let attr' = ("", classes', kvs')
- attr1' = (id1, classes1', kvs1')
- attr2' = (id2, classes2', kvs2')
- in
- spanReduce (Span attr' [(Span attr1' ils1), (Span attr2' ils2)] :
- ils)
-spanReduce (il:ils) = il : (spanReduce ils)
+ alignments = replicate size AlignDefault
+ widths = replicate size 0 :: [Double]
+
+ return [Table caption alignments widths hdrCells cells]
+bodyPartToBlocks (OMathPara e) = do
+ return [Para [Math DisplayMath (writeTeX e)]]
+
+
+-- replace targets with generated anchors.
+rewriteLink :: Inline -> DocxContext Inline
+rewriteLink l@(Link ils ('#':target, title)) = do
+ anchorMap <- gets docxAnchorMap
+ return $ case M.lookup target anchorMap of
+ Just newTarget -> (Link ils ('#':newTarget, title))
+ Nothing -> l
+rewriteLink il = return il
+
+bodyToOutput :: Body -> DocxContext (Meta, [Block], MediaBag)
+bodyToOutput (Body bps) = do
+ let (metabps, blkbps) = sepBodyParts bps
+ meta <- bodyPartsToMeta metabps
+ blks <- concatMapM bodyPartToBlocks blkbps >>=
+ walkM rewriteLink
+ mediaBag <- gets docxMediaBag
+ return $ (meta,
+ blocksToDefinitions $ blocksToBullets $ blks,
+ mediaBag)
+
+docxToOutput :: ReaderOptions -> Docx -> (Meta, [Block], MediaBag)
+docxToOutput opts (Docx (Document _ body)) =
+ let dEnv = def { docxOptions = opts} in
+ evalDocxContext (bodyToOutput body) dEnv def
+
ilToCode :: Inline -> String
ilToCode (Str s) = s
-ilToCode _ = ""
-
-spanRemove' :: Inline -> [Inline]
-spanRemove' s@(Span (ident, classes, _) [])
- -- "_GoBack" is automatically inserted. We don't want to keep it.
- | classes == ["anchor"] && not (ident `elem` dummyAnchors) = [s]
-spanRemove' (Span (_, _, kvs) ils) =
- case lookup "underline" kvs of
- Just val -> [Span ("", [], [("underline", val)]) ils]
- Nothing -> ils
-spanRemove' il = [il]
-
-spanRemove :: [Inline] -> [Inline]
-spanRemove = concatMap spanRemove'
-
-spanTrim' :: Inline -> [Inline]
-spanTrim' il@(Span _ []) = [il]
-spanTrim' il@(Span attr (il':[]))
- | il' == Space = [Span attr [], Space]
- | otherwise = [il]
-spanTrim' (Span attr ils)
- | head ils == Space && last ils == Space =
- [Space, Span attr (init $ tail ils), Space]
- | head ils == Space = [Space, Span attr (tail ils)]
- | last ils == Space = [Span attr (init ils), Space]
-spanTrim' il = [il]
-
-spanTrim :: [Inline] -> [Inline]
-spanTrim = concatMap spanTrim'
-
-spanCorrect' :: Inline -> [Inline]
-spanCorrect' (Span ("", [], []) ils) = ils
-spanCorrect' (Span (ident, classes, kvs) ils)
- | "emph" `elem` classes =
- [Emph $ spanCorrect' $ Span (ident, (delete "emph" classes), kvs) ils]
- | "strong" `elem` classes =
- [Strong $ spanCorrect' $ Span (ident, (delete "strong" classes), kvs) ils]
- | "smallcaps" `elem` classes =
- [SmallCaps $ spanCorrect' $ Span (ident, (delete "smallcaps" classes), kvs) ils]
- | "strike" `elem` classes =
- [Strikeout $ spanCorrect' $ Span (ident, (delete "strike" classes), kvs) ils]
- | "superscript" `elem` classes =
- [Superscript $ spanCorrect' $ Span (ident, (delete "superscript" classes), kvs) ils]
- | "subscript" `elem` classes =
- [Subscript $ spanCorrect' $ Span (ident, (delete "subscript" classes), kvs) ils]
- | (not . null) (codeSpans `intersect` classes) =
- [Code (ident, (classes \\ codeSpans), kvs) (init $ unlines $ map ilToCode ils)]
- | otherwise =
- [Span (ident, classes, kvs) ils]
-spanCorrect' il = [il]
-
-spanCorrect :: [Inline] -> [Inline]
-spanCorrect = concatMap spanCorrect'
-
-removeEmptyPars :: [Block] -> [Block]
-removeEmptyPars blks = filter (\b -> b /= (Para [])) blks
-
-divReduce :: [Block] -> [Block]
-divReduce [] = []
-divReduce ((Div (id1, classes1, kvs1) blks1) : blks)
- | (id1, classes1, kvs1) == ("", [], []) = blks1 ++ (divReduce blks)
-divReduce (d1@(Div (id1, classes1, kvs1) blks1) :
- d2@(Div (id2, classes2, kvs2) blks2) :
- blks) =
- let classes' = classes1 `intersect` classes2
- kvs' = kvs1 `intersect` kvs2
- classes1' = classes1 \\ classes'
- kvs1' = kvs1 \\ kvs'
- classes2' = classes2 \\ classes'
- kvs2' = kvs2 \\ kvs'
- in
- case null classes' && null kvs' of
- True -> d1 : (divReduce (d2 : blks))
- False -> let attr' = ("", classes', kvs')
- attr1' = (id1, classes1', kvs1')
- attr2' = (id2, classes2', kvs2')
- in
- divReduce (Div attr' [(Div attr1' blks1), (Div attr2' blks2)] :
- blks)
-divReduce (blk:blks) = blk : (divReduce blks)
+ilToCode Space = " "
+ilToCode _ = ""
isHeaderClass :: String -> Maybe Int
-isHeaderClass s | "Heading" `isPrefixOf` s =
- case reads (drop (length "Heading") s) :: [(Int, String)] of
+isHeaderClass s | Just s' <- stripPrefix "Heading" s =
+ case reads s' :: [(Int, String)] of
[] -> Nothing
((n, "") : []) -> Just n
_ -> Nothing
isHeaderClass _ = Nothing
-
-findHeaderClass :: [String] -> Maybe Int
-findHeaderClass ss = case mapMaybe id $ map isHeaderClass ss of
- [] -> Nothing
- n : _ -> Just n
-
-blksToInlines :: [Block] -> [Inline]
-blksToInlines (Para ils : _) = ils
-blksToInlines (Plain ils : _) = ils
-blksToInlines _ = []
-
-divCorrectPreReduce' :: Block -> [Block]
-divCorrectPreReduce' (Div (ident, classes, kvs) blks)
- | isJust $ findHeaderClass classes =
- let n = fromJust $ findHeaderClass classes
- in
- [Header n (ident, delete ("Heading" ++ (show n)) classes, kvs) (blksToInlines blks)]
- | otherwise = [Div (ident, classes, kvs) blks]
-divCorrectPreReduce' blk = [blk]
-
-divCorrectPreReduce :: [Block] -> [Block]
-divCorrectPreReduce = concatMap divCorrectPreReduce'
-
-blkToCode :: Block -> String
-blkToCode (Para []) = ""
-blkToCode (Para ((Code _ s):ils)) = s ++ (blkToCode (Para ils))
-blkToCode (Para ((Span (_, classes, _) ils'): ils))
- | (not . null) (codeSpans `intersect` classes) =
- (init $ unlines $ map ilToCode ils') ++ (blkToCode (Para ils))
-blkToCode _ = ""
-
-divRemove' :: Block -> [Block]
-divRemove' (Div (_, _, kvs) blks) =
- case lookup "indent" kvs of
- Just val -> [Div ("", [], [("indent", val)]) blks]
- Nothing -> blks
-divRemove' blk = [blk]
-
-divRemove :: [Block] -> [Block]
-divRemove = concatMap divRemove'
-
-divCorrect' :: Block -> [Block]
-divCorrect' b@(Div (ident, classes, kvs) blks)
- | (not . null) (blockQuoteDivs `intersect` classes) =
- [BlockQuote [Div (ident, classes \\ blockQuoteDivs, kvs) blks]]
- | (not . null) (codeDivs `intersect` classes) =
- [CodeBlock (ident, (classes \\ codeDivs), kvs) (init $ unlines $ map blkToCode blks)]
- | otherwise =
- case lookup "indent" kvs of
- Just "0" -> [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]
- Just _ ->
- [BlockQuote [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]]
- Nothing -> [b]
-divCorrect' blk = [blk]
-
-divCorrect :: [Block] -> [Block]
-divCorrect = concatMap divCorrect'
diff --git a/src/Text/Pandoc/Readers/Docx/Fonts.hs b/src/Text/Pandoc/Readers/Docx/Fonts.hs
new file mode 100644
index 000000000..b44c71412
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Docx/Fonts.hs
@@ -0,0 +1,238 @@
+{-
+Copyright (C) 2014 Matthew Pickering <matthewtpickering@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Docx.Fonts
+ Copyright : Copyright (C) 2014 Matthew Pickering
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Matthew Pickering <matthewtpickering@gmail.com>
+ Stability : alpha
+ Portability : portable
+
+Utilities to convert between font codepoints and unicode characters.
+-}
+module Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) where
+
+
+-- | Enumeration of recognised fonts
+data Font = Symbol -- ^ <http://en.wikipedia.org/wiki/Symbol_(typeface) Adobe Symbol>
+ deriving (Show, Eq)
+
+-- | Given a font and codepoint, returns the corresponding unicode
+-- character
+getUnicode :: Font -> Char -> Maybe Char
+getUnicode Symbol c = lookup c symbol
+
+-- Generated from lib/fonts/symbol.txt
+symbol :: [(Char, Char)]
+symbol =
+ [ (' ',' ')
+ , (' ','\160')
+ , ('!','!')
+ , ('"','\8704')
+ , ('#','#')
+ , ('$','\8707')
+ , ('%','%')
+ , ('&','&')
+ , ('\'','\8715')
+ , ('(','(')
+ , (')',')')
+ , ('*','\8727')
+ , ('+','+')
+ , (',',',')
+ , ('-','\8722')
+ , ('.','.')
+ , ('/','/')
+ , ('0','0')
+ , ('1','1')
+ , ('2','2')
+ , ('3','3')
+ , ('4','4')
+ , ('5','5')
+ , ('6','6')
+ , ('7','7')
+ , ('8','8')
+ , ('9','9')
+ , (':',':')
+ , (';',';')
+ , ('<','<')
+ , ('=','=')
+ , ('>','>')
+ , ('?','?')
+ , ('@','\8773')
+ , ('A','\913')
+ , ('B','\914')
+ , ('C','\935')
+ , ('D','\916')
+ , ('D','\8710')
+ , ('E','\917')
+ , ('F','\934')
+ , ('G','\915')
+ , ('H','\919')
+ , ('I','\921')
+ , ('J','\977')
+ , ('K','\922')
+ , ('L','\923')
+ , ('M','\924')
+ , ('N','\925')
+ , ('O','\927')
+ , ('P','\928')
+ , ('Q','\920')
+ , ('R','\929')
+ , ('S','\931')
+ , ('T','\932')
+ , ('U','\933')
+ , ('V','\962')
+ , ('W','\937')
+ , ('W','\8486')
+ , ('X','\926')
+ , ('Y','\936')
+ , ('Z','\918')
+ , ('[','[')
+ , ('\\','\8756')
+ , (']',']')
+ , ('^','\8869')
+ , ('_','_')
+ , ('`','\63717')
+ , ('a','\945')
+ , ('b','\946')
+ , ('c','\967')
+ , ('d','\948')
+ , ('e','\949')
+ , ('f','\966')
+ , ('g','\947')
+ , ('h','\951')
+ , ('i','\953')
+ , ('j','\981')
+ , ('k','\954')
+ , ('l','\955')
+ , ('m','\181')
+ , ('m','\956')
+ , ('n','\957')
+ , ('o','\959')
+ , ('p','\960')
+ , ('q','\952')
+ , ('r','\961')
+ , ('s','\963')
+ , ('t','\964')
+ , ('u','\965')
+ , ('v','\982')
+ , ('w','\969')
+ , ('x','\958')
+ , ('y','\968')
+ , ('z','\950')
+ , ('{','{')
+ , ('|','|')
+ , ('}','}')
+ , ('~','\8764')
+ , ('\160','\8364')
+ , ('\161','\978')
+ , ('\162','\8242')
+ , ('\163','\8804')
+ , ('\164','\8260')
+ , ('\164','\8725')
+ , ('\165','\8734')
+ , ('\166','\402')
+ , ('\167','\9827')
+ , ('\168','\9830')
+ , ('\169','\9829')
+ , ('\170','\9824')
+ , ('\171','\8596')
+ , ('\172','\8592')
+ , ('\173','\8593')
+ , ('\174','\8594')
+ , ('\175','\8595')
+ , ('\176','\176')
+ , ('\177','\177')
+ , ('\178','\8243')
+ , ('\179','\8805')
+ , ('\180','\215')
+ , ('\181','\8733')
+ , ('\182','\8706')
+ , ('\183','\8226')
+ , ('\184','\247')
+ , ('\185','\8800')
+ , ('\186','\8801')
+ , ('\187','\8776')
+ , ('\188','\8230')
+ , ('\189','\63718')
+ , ('\190','\63719')
+ , ('\191','\8629')
+ , ('\192','\8501')
+ , ('\193','\8465')
+ , ('\194','\8476')
+ , ('\195','\8472')
+ , ('\196','\8855')
+ , ('\197','\8853')
+ , ('\198','\8709')
+ , ('\199','\8745')
+ , ('\200','\8746')
+ , ('\201','\8835')
+ , ('\202','\8839')
+ , ('\203','\8836')
+ , ('\204','\8834')
+ , ('\205','\8838')
+ , ('\206','\8712')
+ , ('\207','\8713')
+ , ('\208','\8736')
+ , ('\209','\8711')
+ , ('\210','\63194')
+ , ('\211','\63193')
+ , ('\212','\63195')
+ , ('\213','\8719')
+ , ('\214','\8730')
+ , ('\215','\8901')
+ , ('\216','\172')
+ , ('\217','\8743')
+ , ('\218','\8744')
+ , ('\219','\8660')
+ , ('\220','\8656')
+ , ('\221','\8657')
+ , ('\222','\8658')
+ , ('\223','\8659')
+ , ('\224','\9674')
+ , ('\225','\9001')
+ , ('\226','\63720')
+ , ('\227','\63721')
+ , ('\228','\63722')
+ , ('\229','\8721')
+ , ('\230','\63723')
+ , ('\231','\63724')
+ , ('\232','\63725')
+ , ('\233','\63726')
+ , ('\234','\63727')
+ , ('\235','\63728')
+ , ('\236','\63729')
+ , ('\237','\63730')
+ , ('\238','\63731')
+ , ('\239','\63732')
+ , ('\241','\9002')
+ , ('\242','\8747')
+ , ('\243','\8992')
+ , ('\244','\63733')
+ , ('\245','\8993')
+ , ('\246','\63734')
+ , ('\247','\63735')
+ , ('\248','\63736')
+ , ('\249','\63737')
+ , ('\250','\63738')
+ , ('\251','\63739')
+ , ('\252','\63740')
+ , ('\253','\63741')
+ , ('\254','\63742')]
diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs
index 68559d98b..ea195c14a 100644
--- a/src/Text/Pandoc/Readers/Docx/Lists.hs
+++ b/src/Text/Pandoc/Readers/Docx/Lists.hs
@@ -29,9 +29,12 @@ Functions for converting flat docx paragraphs into nested lists.
-}
module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets
- , blocksToDefinitions) where
+ , blocksToDefinitions
+ , listParagraphDivs
+ ) where
import Text.Pandoc.JSON
+import Text.Pandoc.Generic (bottomUp)
import Text.Pandoc.Shared (trim)
import Control.Monad
import Data.List
@@ -118,7 +121,7 @@ handleListParagraphs (
in
handleListParagraphs ((Div attr1 (blks1 ++ [newDiv2])) : blks)
handleListParagraphs (blk:blks) = blk : (handleListParagraphs blks)
-
+
separateBlocks' :: Block -> [[Block]] -> [[Block]]
separateBlocks' blk ([] : []) = [[blk]]
separateBlocks' b@(BulletList _) acc = (init acc) ++ [(last acc) ++ [b]]
@@ -136,7 +139,7 @@ flatToBullets' :: Integer -> [Block] -> [Block]
flatToBullets' _ [] = []
flatToBullets' num xs@(b : elems)
| getLevelN b == num = b : (flatToBullets' num elems)
- | otherwise =
+ | otherwise =
let bNumId = getNumIdN b
bLevel = getLevelN b
(children, remaining) =
@@ -159,10 +162,9 @@ flatToBullets elems = flatToBullets' (-1) elems
blocksToBullets :: [Block] -> [Block]
blocksToBullets blks =
- -- bottomUp removeListItemDivs $
+ bottomUp removeListDivs $
flatToBullets $ (handleListParagraphs blks)
-
plainParaInlines :: Block -> [Inline]
plainParaInlines (Plain ils) = ils
plainParaInlines (Para ils) = ils
@@ -199,10 +201,27 @@ blocksToDefinitions' [] acc (b:blks) =
blocksToDefinitions' defAcc acc (b:blks) =
blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks
+removeListDivs' :: Block -> [Block]
+removeListDivs' (Div (ident, classes, kvs) blks)
+ | "list-item" `elem` classes =
+ case delete "list-item" classes of
+ [] -> blks
+ classes' -> [Div (ident, classes', kvs) $ blks]
+removeListDivs' (Div (ident, classes, kvs) blks)
+ | not $ null $ listParagraphDivs `intersect` classes =
+ case classes \\ listParagraphDivs of
+ [] -> blks
+ classes' -> [Div (ident, classes', kvs) blks]
+removeListDivs' blk = [blk]
+
+removeListDivs :: [Block] -> [Block]
+removeListDivs = concatMap removeListDivs'
+
+
blocksToDefinitions :: [Block] -> [Block]
blocksToDefinitions = blocksToDefinitions' [] []
-
-
-
+
+
+
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index 18200bcf9..1abd4bc6b 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -1,3 +1,5 @@
+{-# LANGUAGE PatternGuards, ViewPatterns #-}
+
{-
Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
@@ -8,49 +10,46 @@ the Free Software Foundation; either version 2 of the License, or
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
- Module : Text.Pandoc.Readers.Docx.Parse
- Copyright : Copyright (C) 2014 Jesse Rosenthal
- License : GNU GPL, version 2 or above
+ Module : Text.Pandoc.Readers.Docx.Parse
+ Copyright : Copyright (C) 2014 Jesse Rosenthal
+ License : GNU GPL, version 2 or above
- Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
- Stability : alpha
- Portability : portable
+ Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
+ Stability : alpha
+ Portability : portable
Conversion of docx archive into Docx haskell type
-}
+module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
+ , Document(..)
+ , Body(..)
+ , BodyPart(..)
+ , TblLook(..)
+ , ParPart(..)
+ , Run(..)
+ , RunElem(..)
+ , Notes
+ , Numbering
+ , Relationship
+ , Media
+ , RunStyle(..)
+ , ParIndentation(..)
+ , ParagraphStyle(..)
+ , Row(..)
+ , Cell(..)
+ , archiveToDocx
+ ) where
-module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
- , Document(..)
- , Body(..)
- , BodyPart(..)
- , TblLook(..)
- , ParPart(..)
- , Run(..)
- , RunElem(..)
- , Notes
- , Numbering
- , Relationship
- , Media
- , RunStyle(..)
- , ParagraphStyle(..)
- , Row(..)
- , Cell(..)
- , getFootNote
- , getEndNote
- , lookupLevel
- , lookupRelationship
- , archiveToDocx
- ) where
import Codec.Archive.Zip
import Text.XML.Light
import Data.Maybe
@@ -59,39 +58,244 @@ import System.FilePath
import Data.Bits ((.|.))
import qualified Data.ByteString.Lazy as B
import qualified Text.Pandoc.UTF8 as UTF8
+import Control.Monad.Reader
+import Control.Applicative ((<$>), (<|>))
+import qualified Data.Map as M
+import Text.Pandoc.Compat.Except
+import Text.TeXMath.Readers.OMML (readOMML)
+import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..))
+import Text.TeXMath (Exp)
+import Data.Char (readLitChar, ord, chr)
+
+data ReaderEnv = ReaderEnv { envNotes :: Notes
+ , envNumbering :: Numbering
+ , envRelationships :: [Relationship]
+ , envMedia :: Media
+ , envFont :: Maybe Font
+ }
+ deriving Show
+
+data DocxError = DocxError | WrongElem
+ deriving Show
+
+instance Error DocxError where
+ noMsg = WrongElem
+
+type D = ExceptT DocxError (Reader ReaderEnv)
+
+runD :: D a -> ReaderEnv -> Either DocxError a
+runD dx re = runReader (runExceptT dx ) re
+
+maybeToD :: Maybe a -> D a
+maybeToD (Just a) = return a
+maybeToD Nothing = throwError DocxError
+
+eitherToD :: Either a b -> D b
+eitherToD (Right b) = return b
+eitherToD (Left _) = throwError DocxError
+
+concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b]
+concatMapM f xs = liftM concat (mapM f xs)
+
+
+-- This is similar to `mapMaybe`: it maps a function returning the D
+-- monad over a list, and only keeps the non-erroring return values.
+mapD :: (a -> D b) -> [a] -> D [b]
+mapD f xs =
+ let handler x = (f x >>= (\y-> return [y])) `catchError` (\_ -> return [])
+ in
+ concatMapM handler xs
-attrToNSPair :: Attr -> Maybe (String, String)
-attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val)
-attrToNSPair _ = Nothing
+type NameSpaces = [(String, String)]
+data Docx = Docx Document
+ deriving Show
-type NameSpaces = [(String, String)]
+data Document = Document NameSpaces Body
+ deriving Show
-data Docx = Docx Document Notes Numbering [Relationship] Media
+data Body = Body [BodyPart]
deriving Show
-archiveToDocx :: Archive -> Maybe Docx
+type Media = [(FilePath, B.ByteString)]
+
+data Numbering = Numbering NameSpaces [Numb] [AbstractNumb]
+ deriving Show
+
+data Numb = Numb String String -- right now, only a key to an abstract num
+ deriving Show
+
+data AbstractNumb = AbstractNumb String [Level]
+ deriving Show
+
+-- (ilvl, format, string, start)
+type Level = (String, String, String, Maybe Integer)
+
+data Relationship = Relationship (RelId, Target)
+ deriving Show
+
+data Notes = Notes NameSpaces
+ (Maybe (M.Map String Element))
+ (Maybe (M.Map String Element))
+ deriving Show
+
+data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer
+ , rightParIndent :: Maybe Integer
+ , hangingParIndent :: Maybe Integer}
+ deriving Show
+
+data ParagraphStyle = ParagraphStyle { pStyle :: [String]
+ , indentation :: Maybe ParIndentation
+ }
+ deriving Show
+
+defaultParagraphStyle :: ParagraphStyle
+defaultParagraphStyle = ParagraphStyle { pStyle = []
+ , indentation = Nothing
+ }
+
+
+data BodyPart = Paragraph ParagraphStyle [ParPart]
+ | ListItem ParagraphStyle String String Level [ParPart]
+ | Tbl String TblGrid TblLook [Row]
+ | OMathPara [Exp]
+ deriving Show
+
+type TblGrid = [Integer]
+
+data TblLook = TblLook {firstRowFormatting::Bool}
+ deriving Show
+
+defaultTblLook :: TblLook
+defaultTblLook = TblLook{firstRowFormatting = False}
+
+data Row = Row [Cell]
+ deriving Show
+
+data Cell = Cell [BodyPart]
+ deriving Show
+
+data ParPart = PlainRun Run
+ | Insertion ChangeId Author ChangeDate [Run]
+ | Deletion ChangeId Author ChangeDate [Run]
+ | BookMark BookMarkId Anchor
+ | InternalHyperLink Anchor [Run]
+ | ExternalHyperLink URL [Run]
+ | Drawing FilePath B.ByteString
+ | PlainOMath [Exp]
+ deriving Show
+
+data Run = Run RunStyle [RunElem]
+ | Footnote [BodyPart]
+ | Endnote [BodyPart]
+ | InlineDrawing FilePath B.ByteString
+ deriving Show
+
+data RunElem = TextRun String | LnBrk | Tab
+ deriving Show
+
+data RunStyle = RunStyle { isBold :: Bool
+ , isItalic :: Bool
+ , isSmallCaps :: Bool
+ , isStrike :: Bool
+ , isSuperScript :: Bool
+ , isSubScript :: Bool
+ , rUnderline :: Maybe String
+ , rStyle :: Maybe String }
+ deriving Show
+
+defaultRunStyle :: RunStyle
+defaultRunStyle = RunStyle { isBold = False
+ , isItalic = False
+ , isSmallCaps = False
+ , isStrike = False
+ , isSuperScript = False
+ , isSubScript = False
+ , rUnderline = Nothing
+ , rStyle = Nothing
+ }
+
+
+type Target = String
+type Anchor = String
+type URL = String
+type BookMarkId = String
+type RelId = String
+type ChangeId = String
+type Author = String
+type ChangeDate = String
+
+attrToNSPair :: Attr -> Maybe (String, String)
+attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val)
+attrToNSPair _ = Nothing
+
+archiveToDocx :: Archive -> Either DocxError Docx
archiveToDocx archive = do
- let notes = archiveToNotes archive
- rels = archiveToRelationships archive
- media = archiveToMedia archive
- doc <- archiveToDocument archive
- numbering <- archiveToNumbering archive
- return $ Docx doc notes numbering rels media
-
-data Document = Document NameSpaces Body
- deriving Show
+ let notes = archiveToNotes archive
+ numbering = archiveToNumbering archive
+ rels = archiveToRelationships archive
+ media = archiveToMedia archive
+ rEnv = ReaderEnv notes numbering rels media Nothing
+ doc <- runD (archiveToDocument archive) rEnv
+ return $ Docx doc
+
-archiveToDocument :: Archive -> Maybe Document
+archiveToDocument :: Archive -> D Document
archiveToDocument zf = do
- entry <- findEntryByPath "word/document.xml" zf
- docElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
- let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
- bodyElem <- findChild (QName "body" (lookup "w" namespaces) Nothing) docElem
+ entry <- maybeToD $ findEntryByPath "word/document.xml" zf
+ docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
+ let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
+ bodyElem <- maybeToD $ findChild (elemName namespaces "w" "body") docElem
body <- elemToBody namespaces bodyElem
return $ Document namespaces body
-type Media = [(FilePath, B.ByteString)]
+elemToBody :: NameSpaces -> Element -> D Body
+elemToBody ns element | isElem ns "w" "body" element =
+ mapD (elemToBodyPart ns) (elChildren element) >>=
+ (\bps -> return $ Body bps)
+elemToBody _ _ = throwError WrongElem
+
+archiveToNotes :: Archive -> Notes
+archiveToNotes zf =
+ let fnElem = findEntryByPath "word/footnotes.xml" zf
+ >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
+ enElem = findEntryByPath "word/endnotes.xml" zf
+ >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
+ fn_namespaces = case fnElem of
+ Just e -> mapMaybe attrToNSPair (elAttribs e)
+ Nothing -> []
+ en_namespaces = case enElem of
+ Just e -> mapMaybe attrToNSPair (elAttribs e)
+ Nothing -> []
+ ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces
+ fn = fnElem >>= (elemToNotes ns "footnote")
+ en = enElem >>= (elemToNotes ns "endnote")
+ in
+ Notes ns fn en
+
+filePathIsRel :: FilePath -> Bool
+filePathIsRel fp =
+ let (dir, name) = splitFileName fp
+ in
+ (dir == "word/_rels/") && ((takeExtension name) == ".rels")
+
+relElemToRelationship :: Element -> Maybe Relationship
+relElemToRelationship element | qName (elName element) == "Relationship" =
+ do
+ relId <- findAttr (QName "Id" Nothing Nothing) element
+ target <- findAttr (QName "Target" Nothing Nothing) element
+ return $ Relationship (relId, target)
+relElemToRelationship _ = Nothing
+
+
+archiveToRelationships :: Archive -> [Relationship]
+archiveToRelationships archive =
+ let relPaths = filter filePathIsRel (filesInArchive archive)
+ entries = mapMaybe (\f -> findEntryByPath f archive) relPaths
+ relElems = mapMaybe (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries
+ rels = mapMaybe relElemToRelationship $ concatMap elChildren relElems
+ in
+ rels
filePathIsMedia :: FilePath -> Bool
filePathIsMedia fp =
@@ -109,18 +313,6 @@ archiveToMedia :: Archive -> Media
archiveToMedia zf =
mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf))
-data Numbering = Numbering NameSpaces [Numb] [AbstractNumb]
- deriving Show
-
-data Numb = Numb String String -- right now, only a key to an abstract num
- deriving Show
-
-data AbstractNumb = AbstractNumb String [Level]
- deriving Show
-
--- (ilvl, format, string, start)
-type Level = (String, String, String, Maybe Integer)
-
lookupLevel :: String -> String -> Numbering -> Maybe Level
lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do
absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs
@@ -148,7 +340,7 @@ absNumElemToAbsNum ns element |
let levelElems = findChildren
(QName "lvl" (lookup "w" ns) (Just "w"))
element
- levels = mapMaybe id $ map (levelElemToLevel ns) levelElems
+ levels = mapMaybe (levelElemToLevel ns) levelElems
return $ AbstractNumb absNumId levels
absNumElemToAbsNum _ _ = Nothing
@@ -167,8 +359,8 @@ levelElemToLevel ns element |
return (ilvl, fmt, txt, start)
levelElemToLevel _ _ = Nothing
-archiveToNumbering :: Archive -> Maybe Numbering
-archiveToNumbering zf =
+archiveToNumbering' :: Archive -> Maybe Numbering
+archiveToNumbering' zf = do
case findEntryByPath "word/numbering.xml" zf of
Nothing -> Just $ Numbering [] [] []
Just entry -> do
@@ -180,321 +372,281 @@ archiveToNumbering zf =
absNumElems = findChildren
(QName "abstractNum" (lookup "w" namespaces) (Just "w"))
numberingElem
- nums = mapMaybe id $ map (numElemToNum namespaces) numElems
- absNums = mapMaybe id $ map (absNumElemToAbsNum namespaces) absNumElems
+ nums = mapMaybe (numElemToNum namespaces) numElems
+ absNums = mapMaybe (absNumElemToAbsNum namespaces) absNumElems
return $ Numbering namespaces nums absNums
-data Notes = Notes NameSpaces (Maybe [(String, [BodyPart])]) (Maybe [(String, [BodyPart])])
- deriving Show
-
-noteElemToNote :: NameSpaces -> Element -> Maybe (String, [BodyPart])
-noteElemToNote ns element
- | qName (elName element) `elem` ["endnote", "footnote"] &&
- qURI (elName element) == (lookup "w" ns) =
- do
- noteId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element
- let bps = map fromJust
- $ filter isJust
- $ map (elemToBodyPart ns)
- $ filterChildrenName (isParOrTbl ns) element
- return $ (noteId, bps)
-noteElemToNote _ _ = Nothing
-
-getFootNote :: String -> Notes -> Maybe [BodyPart]
-getFootNote s (Notes _ fns _) = fns >>= (lookup s)
-
-getEndNote :: String -> Notes -> Maybe [BodyPart]
-getEndNote s (Notes _ _ ens) = ens >>= (lookup s)
+archiveToNumbering :: Archive -> Numbering
+archiveToNumbering archive =
+ fromMaybe (Numbering [] [] []) (archiveToNumbering' archive)
-elemToNotes :: NameSpaces -> String -> Element -> Maybe [(String, [BodyPart])]
+elemToNotes :: NameSpaces -> String -> Element -> Maybe (M.Map String Element)
elemToNotes ns notetype element
- | qName (elName element) == (notetype ++ "s") &&
- qURI (elName element) == (lookup "w" ns) =
- Just $ map fromJust
- $ filter isJust
- $ map (noteElemToNote ns)
- $ findChildren (QName notetype (lookup "w" ns) (Just "w")) element
+ | isElem ns "w" (notetype ++ "s") element =
+ let pairs = mapMaybe
+ (\e -> findAttr (elemName ns "w" "id") e >>=
+ (\a -> Just (a, e)))
+ (findChildren (elemName ns "w" notetype) element)
+ in
+ Just $ M.fromList $ pairs
elemToNotes _ _ _ = Nothing
-archiveToNotes :: Archive -> Notes
-archiveToNotes zf =
- let fnElem = findEntryByPath "word/footnotes.xml" zf
- >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
- enElem = findEntryByPath "word/endnotes.xml" zf
- >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
- fn_namespaces = case fnElem of
- Just e -> mapMaybe attrToNSPair (elAttribs e)
- Nothing -> []
- en_namespaces = case enElem of
- Just e -> mapMaybe attrToNSPair (elAttribs e)
- Nothing -> []
- ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces
- fn = fnElem >>= (elemToNotes ns "footnote")
- en = enElem >>= (elemToNotes ns "endnote")
- in
- Notes ns fn en
+---------------------------------------------
+---------------------------------------------
+elemName :: NameSpaces -> String -> String -> QName
+elemName ns prefix name = (QName name (lookup prefix ns) (Just prefix))
-data Relationship = Relationship (RelId, Target)
- deriving Show
+isElem :: NameSpaces -> String -> String -> Element -> Bool
+isElem ns prefix name element =
+ qName (elName element) == name &&
+ qURI (elName element) == (lookup prefix ns)
-lookupRelationship :: RelId -> [Relationship] -> Maybe Target
-lookupRelationship relid rels =
- lookup relid (map (\(Relationship pair) -> pair) rels)
-filePathIsRel :: FilePath -> Bool
-filePathIsRel fp =
- let (dir, name) = splitFileName fp
+elemToTblGrid :: NameSpaces -> Element -> D TblGrid
+elemToTblGrid ns element | isElem ns "w" "tblGrid" element =
+ let cols = findChildren (elemName ns "w" "gridCol") element
in
- (dir == "word/_rels/") && ((takeExtension name) == ".rels")
-
-relElemToRelationship :: Element -> Maybe Relationship
-relElemToRelationship element | qName (elName element) == "Relationship" =
- do
- relId <- findAttr (QName "Id" Nothing Nothing) element
- target <- findAttr (QName "Target" Nothing Nothing) element
- return $ Relationship (relId, target)
-relElemToRelationship _ = Nothing
-
-
-archiveToRelationships :: Archive -> [Relationship]
-archiveToRelationships archive =
- let relPaths = filter filePathIsRel (filesInArchive archive)
- entries = map fromJust $ filter isJust $ map (\f -> findEntryByPath f archive) relPaths
- relElems = map fromJust $ filter isJust $ map (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries
- rels = map fromJust $ filter isJust $ map relElemToRelationship $ concatMap elChildren relElems
+ mapD (\e -> maybeToD (findAttr (elemName ns "w" "val") e >>= stringToInteger))
+ cols
+elemToTblGrid _ _ = throwError WrongElem
+
+elemToTblLook :: NameSpaces -> Element -> D TblLook
+elemToTblLook ns element | isElem ns "w" "tblLook" element =
+ let firstRow = findAttr (elemName ns "w" "firstRow") element
+ val = findAttr (elemName ns "w" "val") element
+ firstRowFmt =
+ case firstRow of
+ Just "1" -> True
+ Just _ -> False
+ Nothing -> case val of
+ Just bitMask -> testBitMask bitMask 0x020
+ Nothing -> False
in
- rels
-
-data Body = Body [BodyPart]
- deriving Show
+ return $ TblLook{firstRowFormatting = firstRowFmt}
+elemToTblLook _ _ = throwError WrongElem
-isParOrTbl :: NameSpaces -> QName -> Bool
-isParOrTbl ns q = qName q `elem` ["p", "tbl"] &&
- qURI q == (lookup "w" ns)
+elemToRow :: NameSpaces -> Element -> D Row
+elemToRow ns element | isElem ns "w" "tr" element =
+ do
+ let cellElems = findChildren (elemName ns "w" "tc") element
+ cells <- mapD (elemToCell ns) cellElems
+ return $ Row cells
+elemToRow _ _ = throwError WrongElem
-elemToBody :: NameSpaces -> Element -> Maybe Body
-elemToBody ns element | qName (elName element) == "body" && qURI (elName element) == (lookup "w" ns) =
- Just $ Body
- $ map fromJust
- $ filter isJust
- $ map (elemToBodyPart ns) $ filterChildrenName (isParOrTbl ns) element
-elemToBody _ _ = Nothing
+elemToCell :: NameSpaces -> Element -> D Cell
+elemToCell ns element | isElem ns "w" "tc" element =
+ do
+ cellContents <- mapD (elemToBodyPart ns) (elChildren element)
+ return $ Cell cellContents
+elemToCell _ _ = throwError WrongElem
+
+elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation
+elemToParIndentation ns element | isElem ns "w" "ind" element =
+ Just $ ParIndentation {
+ leftParIndent =
+ findAttr (QName "left" (lookup "w" ns) (Just "w")) element >>=
+ stringToInteger
+ , rightParIndent =
+ findAttr (QName "right" (lookup "w" ns) (Just "w")) element >>=
+ stringToInteger
+ , hangingParIndent =
+ findAttr (QName "hanging" (lookup "w" ns) (Just "w")) element >>=
+ stringToInteger}
+elemToParIndentation _ _ = Nothing
-isRunOrLinkOrBookmark :: NameSpaces -> QName -> Bool
-isRunOrLinkOrBookmark ns q = qName q `elem` ["r", "hyperlink", "bookmarkStart"] &&
- qURI q == (lookup "w" ns)
elemToNumInfo :: NameSpaces -> Element -> Maybe (String, String)
-elemToNumInfo ns element
- | qName (elName element) == "p" &&
- qURI (elName element) == (lookup "w" ns) =
- do
- pPr <- findChild (QName "pPr" (lookup "w" ns) (Just "w")) element
- numPr <- findChild (QName "numPr" (lookup "w" ns) (Just "w")) pPr
- lvl <- findChild (QName "ilvl" (lookup "w" ns) (Just "w")) numPr >>=
- findAttr (QName "val" (lookup "w" ns) (Just "w"))
- numId <- findChild (QName "numId" (lookup "w" ns) (Just "w")) numPr >>=
- findAttr (QName "val" (lookup "w" ns) (Just "w"))
- return (numId, lvl)
+elemToNumInfo ns element | isElem ns "w" "p" element = do
+ let pPr = findChild (elemName ns "w" "pPr") element
+ numPr = pPr >>= findChild (elemName ns "w" "numPr")
+ lvl <- numPr >>=
+ findChild (elemName ns "w" "ilvl") >>=
+ findAttr (elemName ns "w" "val")
+ numId <- numPr >>=
+ findChild (elemName ns "w" "numId") >>=
+ findAttr (elemName ns "w" "val")
+ return (numId, lvl)
elemToNumInfo _ _ = Nothing
--- isBookMarkTag :: NameSpaces -> QName -> Bool
--- isBookMarkTag ns q = qName q `elem` ["bookmarkStart", "bookmarkEnd"] &&
--- qURI q == (lookup "w" ns)
-
--- parChildrenToBookmark :: NameSpaces -> [Element] -> BookMark
--- parChildrenToBookmark ns (bms : bme : _)
--- | qName (elName bms) == "bookmarkStart" &&
--- qURI (elName bms) == (lookup "w" ns) &&
--- qName (elName bme) == "bookmarkEnd" &&
--- qURI (elName bme) == (lookup "w" ns) = do
--- bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) bms
--- bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) bms
--- return $ (bmId, bmName)
--- parChildrenToBookmark _ _ = Nothing
-
-elemToBodyPart :: NameSpaces -> Element -> Maybe BodyPart
-elemToBodyPart ns element
- | qName (elName element) == "p" &&
- qURI (elName element) == (lookup "w" ns) =
- let parstyle = elemToParagraphStyle ns element
- parparts = mapMaybe id
- $ map (elemToParPart ns)
- $ filterChildrenName (isRunOrLinkOrBookmark ns) element
- in
- case elemToNumInfo ns element of
- Just (numId, lvl) -> Just $ ListItem parstyle numId lvl parparts
- Nothing -> Just $ Paragraph parstyle parparts
- | qName (elName element) == "tbl" &&
- qURI (elName element) == (lookup "w" ns) =
- let
- caption = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element
- >>= findChild (QName "tblCaption" (lookup "w" ns) (Just "w"))
- >>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
- grid = case
- findChild (QName "tblGrid" (lookup "w" ns) (Just "w")) element
- of
- Just g -> elemToTblGrid ns g
- Nothing -> []
- tblLook = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element
- >>= findChild (QName "tblLook" (lookup "w" ns) (Just "w"))
- >>= elemToTblLook ns
- in
- Just $ Tbl
- (fromMaybe "" caption)
- grid
- (fromMaybe defaultTblLook tblLook)
- (mapMaybe (elemToRow ns) (elChildren element))
- | otherwise = Nothing
-
-elemToTblLook :: NameSpaces -> Element -> Maybe TblLook
-elemToTblLook ns element
- | qName (elName element) == "tblLook" &&
- qURI (elName element) == (lookup "w" ns) =
- let firstRow = findAttr (QName "firstRow" (lookup "w" ns) (Just "w")) element
- val = findAttr (QName "val" (lookup "w" ns) (Just "w")) element
- firstRowFmt =
- case firstRow of
- Just "1" -> True
- Just _ -> False
- Nothing -> case val of
- Just bitMask -> testBitMask bitMask 0x020
- Nothing -> False
- in
- Just $ TblLook{firstRowFormatting = firstRowFmt}
-elemToTblLook _ _ = Nothing
-
testBitMask :: String -> Int -> Bool
testBitMask bitMaskS n =
case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of
[] -> False
((n', _) : _) -> ((n' .|. n) /= 0)
-data ParagraphStyle = ParagraphStyle { pStyle :: [String]
- , indent :: Maybe Integer
- }
- deriving Show
-
-defaultParagraphStyle :: ParagraphStyle
-defaultParagraphStyle = ParagraphStyle { pStyle = []
- , indent = Nothing
- }
-
-elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle
-elemToParagraphStyle ns element =
- case findChild (QName "pPr" (lookup "w" ns) (Just "w")) element of
- Just pPr ->
- ParagraphStyle
- {pStyle =
- mapMaybe id $
- map
- (findAttr (QName "val" (lookup "w" ns) (Just "w")))
- (findChildren (QName "pStyle" (lookup "w" ns) (Just "w")) pPr)
- , indent =
- findChild (QName "ind" (lookup "w" ns) (Just "w")) pPr >>=
- findAttr (QName "left" (lookup "w" ns) (Just "w")) >>=
- stringToInteger
- }
- Nothing -> defaultParagraphStyle
-
-
-data BodyPart = Paragraph ParagraphStyle [ParPart]
- | ListItem ParagraphStyle String String [ParPart]
- | Tbl String TblGrid TblLook [Row]
-
- deriving Show
-
-type TblGrid = [Integer]
-
-data TblLook = TblLook {firstRowFormatting::Bool}
- deriving Show
-
-defaultTblLook :: TblLook
-defaultTblLook = TblLook{firstRowFormatting = False}
-
stringToInteger :: String -> Maybe Integer
stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)])
-elemToTblGrid :: NameSpaces -> Element -> TblGrid
-elemToTblGrid ns element
- | qName (elName element) == "tblGrid" &&
- qURI (elName element) == (lookup "w" ns) =
- let
- cols = findChildren (QName "gridCol" (lookup "w" ns) (Just "w")) element
- in
- mapMaybe (\e ->
- findAttr (QName "val" (lookup "w" ns) (Just ("w"))) e
- >>= stringToInteger
- )
- cols
-elemToTblGrid _ _ = []
-
-data Row = Row [Cell]
- deriving Show
-
-
-elemToRow :: NameSpaces -> Element -> Maybe Row
-elemToRow ns element
- | qName (elName element) == "tr" &&
- qURI (elName element) == (lookup "w" ns) =
- let
- cells = findChildren (QName "tc" (lookup "w" ns) (Just "w")) element
- in
- Just $ Row (mapMaybe (elemToCell ns) cells)
-elemToRow _ _ = Nothing
+elemToBodyPart :: NameSpaces -> Element -> D BodyPart
+elemToBodyPart ns element
+ | isElem ns "w" "p" element
+ , (c:_) <- findChildren (elemName ns "m" "oMathPara") element =
+ do
+ expsLst <- eitherToD $ readOMML $ showElement c
+ return $ OMathPara expsLst
+elemToBodyPart ns element
+ | isElem ns "w" "p" element
+ , Just (numId, lvl) <- elemToNumInfo ns element = do
+ let parstyle = elemToParagraphStyle ns element
+ parparts <- mapD (elemToParPart ns) (elChildren element)
+ num <- asks envNumbering
+ case lookupLevel numId lvl num of
+ Just levelInfo -> return $ ListItem parstyle numId lvl levelInfo parparts
+ Nothing -> throwError WrongElem
+elemToBodyPart ns element
+ | isElem ns "w" "p" element = do
+ let parstyle = elemToParagraphStyle ns element
+ parparts <- mapD (elemToParPart ns) (elChildren element)
+ return $ Paragraph parstyle parparts
+elemToBodyPart ns element
+ | isElem ns "w" "tbl" element = do
+ let caption' = findChild (elemName ns "w" "tblPr") element
+ >>= findChild (elemName ns "w" "tblCaption")
+ >>= findAttr (elemName ns "w" "val")
+ caption = (fromMaybe "" caption')
+ grid' = case findChild (elemName ns "w" "tblGrid") element of
+ Just g -> elemToTblGrid ns g
+ Nothing -> return []
+ tblLook' = case findChild (elemName ns "w" "tblPr") element >>=
+ findChild (elemName ns "w" "tblLook")
+ of
+ Just l -> elemToTblLook ns l
+ Nothing -> return defaultTblLook
+
+ grid <- grid'
+ tblLook <- tblLook'
+ rows <- mapD (elemToRow ns) (elChildren element)
+ return $ Tbl caption grid tblLook rows
+elemToBodyPart _ _ = throwError WrongElem
-data Cell = Cell [BodyPart]
- deriving Show
+lookupRelationship :: RelId -> [Relationship] -> Maybe Target
+lookupRelationship relid rels =
+ lookup relid (map (\(Relationship pair) -> pair) rels)
-elemToCell :: NameSpaces -> Element -> Maybe Cell
-elemToCell ns element
- | qName (elName element) == "tc" &&
- qURI (elName element) == (lookup "w" ns) =
- Just $ Cell (mapMaybe (elemToBodyPart ns) (elChildren element))
-elemToCell _ _ = Nothing
+expandDrawingId :: String -> D (FilePath, B.ByteString)
+expandDrawingId s = do
+ target <- asks (lookupRelationship s . envRelationships)
+ case target of
+ Just filepath -> do
+ bytes <- asks (lookup (combine "word" filepath) . envMedia)
+ case bytes of
+ Just bs -> return (filepath, bs)
+ Nothing -> throwError DocxError
+ Nothing -> throwError DocxError
+
+elemToParPart :: NameSpaces -> Element -> D ParPart
+elemToParPart ns element
+ | isElem ns "w" "r" element
+ , Just _ <- findChild (elemName ns "w" "drawing") element =
+ let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
+ drawing = findElement (QName "blip" (Just a_ns) (Just "a")) element
+ >>= findAttr (QName "embed" (lookup "r" ns) (Just "r"))
+ in
+ case drawing of
+ Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp bs)
+ Nothing -> throwError WrongElem
+elemToParPart ns element
+ | isElem ns "w" "r" element =
+ elemToRun ns element >>= (\r -> return $ PlainRun r)
+elemToParPart ns element
+ | isElem ns "w" "ins" element
+ , Just cId <- findAttr (elemName ns "w" "id") element
+ , Just cAuthor <- findAttr (elemName ns "w" "author") element
+ , Just cDate <- findAttr (elemName ns "w" "date") element = do
+ runs <- mapD (elemToRun ns) (elChildren element)
+ return $ Insertion cId cAuthor cDate runs
+elemToParPart ns element
+ | isElem ns "w" "del" element
+ , Just cId <- findAttr (elemName ns "w" "id") element
+ , Just cAuthor <- findAttr (elemName ns "w" "author") element
+ , Just cDate <- findAttr (elemName ns "w" "date") element = do
+ runs <- mapD (elemToRun ns) (elChildren element)
+ return $ Deletion cId cAuthor cDate runs
+elemToParPart ns element
+ | isElem ns "w" "bookmarkStart" element
+ , Just bmId <- findAttr (elemName ns "w" "id") element
+ , Just bmName <- findAttr (elemName ns "w" "name") element =
+ return $ BookMark bmId bmName
+elemToParPart ns element
+ | isElem ns "w" "hyperlink" element
+ , Just anchor <- findAttr (elemName ns "w" "anchor") element = do
+ runs <- mapD (elemToRun ns) (elChildren element)
+ return $ InternalHyperLink anchor runs
+elemToParPart ns element
+ | isElem ns "w" "hyperlink" element
+ , Just relId <- findAttr (elemName ns "r" "id") element = do
+ runs <- mapD (elemToRun ns) (elChildren element)
+ rels <- asks envRelationships
+ return $ case lookupRelationship relId rels of
+ Just target -> ExternalHyperLink target runs
+ Nothing -> ExternalHyperLink "" runs
+elemToParPart ns element
+ | isElem ns "m" "oMath" element =
+ (eitherToD $ readOMML $ showElement element) >>= (return . PlainOMath)
+elemToParPart _ _ = throwError WrongElem
-data ParPart = PlainRun Run
- | BookMark BookMarkId Anchor
- | InternalHyperLink Anchor [Run]
- | ExternalHyperLink RelId [Run]
- | Drawing String
- deriving Show
+lookupFootnote :: String -> Notes -> Maybe Element
+lookupFootnote s (Notes _ fns _) = fns >>= (M.lookup s)
-data Run = Run RunStyle [RunElem]
- | Footnote String
- | Endnote String
- deriving Show
+lookupEndnote :: String -> Notes -> Maybe Element
+lookupEndnote s (Notes _ _ ens) = ens >>= (M.lookup s)
-data RunElem = TextRun String | LnBrk | Tab
- deriving Show
+elemToRun :: NameSpaces -> Element -> D Run
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just drawingElem <- findChild (elemName ns "w" "drawing") element =
+ let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
+ drawing = findElement (QName "blip" (Just a_ns) (Just "a")) drawingElem
+ >>= findAttr (QName "embed" (lookup "r" ns) (Just "r"))
+ in
+ case drawing of
+ Just s -> expandDrawingId s >>=
+ (\(fp, bs) -> return $ InlineDrawing fp bs)
+ Nothing -> throwError WrongElem
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just ref <- findChild (elemName ns "w" "footnoteReference") element
+ , Just fnId <- findAttr (elemName ns "w" "id") ref = do
+ notes <- asks envNotes
+ case lookupFootnote fnId notes of
+ Just e -> do bps <- mapD (elemToBodyPart ns) (elChildren e)
+ return $ Footnote bps
+ Nothing -> return $ Footnote []
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just ref <- findChild (elemName ns "w" "endnoteReference") element
+ , Just enId <- findAttr (elemName ns "w" "id") ref = do
+ notes <- asks envNotes
+ case lookupEndnote enId notes of
+ Just e -> do bps <- mapD (elemToBodyPart ns) (elChildren e)
+ return $ Endnote bps
+ Nothing -> return $ Endnote []
+elemToRun ns element
+ | isElem ns "w" "r" element = do
+ runElems <- elemToRunElems ns element
+ return $ Run (elemToRunStyle ns element) runElems
+elemToRun _ _ = throwError WrongElem
-data RunStyle = RunStyle { isBold :: Bool
- , isItalic :: Bool
- , isSmallCaps :: Bool
- , isStrike :: Bool
- , isSuperScript :: Bool
- , isSubScript :: Bool
- , underline :: Maybe String
- , rStyle :: Maybe String }
- deriving Show
+elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle
+elemToParagraphStyle ns element
+ | Just pPr <- findChild (elemName ns "w" "pPr") element =
+ ParagraphStyle
+ {pStyle =
+ mapMaybe
+ (findAttr (elemName ns "w" "val"))
+ (findChildren (elemName ns "w" "pStyle") pPr)
+ , indentation =
+ findChild (elemName ns "w" "ind") pPr >>=
+ elemToParIndentation ns
+ }
+elemToParagraphStyle _ _ = defaultParagraphStyle
-defaultRunStyle :: RunStyle
-defaultRunStyle = RunStyle { isBold = False
- , isItalic = False
- , isSmallCaps = False
- , isStrike = False
- , isSuperScript = False
- , isSubScript = False
- , underline = Nothing
- , rStyle = Nothing
- }
elemToRunStyle :: NameSpaces -> Element -> RunStyle
-elemToRunStyle ns element =
- case findChild (QName "rPr" (lookup "w" ns) (Just "w")) element of
- Just rPr ->
- RunStyle
+elemToRunStyle ns element
+ | Just rPr <- findChild (elemName ns "w" "rPr") element =
+ RunStyle
{
isBold = isJust $ findChild (QName "b" (lookup "w" ns) (Just "w")) rPr
, isItalic = isJust $ findChild (QName "i" (lookup "w" ns) (Just "w")) rPr
@@ -508,100 +660,65 @@ elemToRunStyle ns element =
(Just "subscript" ==
(findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>=
findAttr (QName "val" (lookup "w" ns) (Just "w"))))
- , underline =
+ , rUnderline =
findChild (QName "u" (lookup "w" ns) (Just "w")) rPr >>=
findAttr (QName "val" (lookup "w" ns) (Just "w"))
, rStyle =
findChild (QName "rStyle" (lookup "w" ns) (Just "w")) rPr >>=
findAttr (QName "val" (lookup "w" ns) (Just "w"))
}
- Nothing -> defaultRunStyle
+elemToRunStyle _ _ = defaultRunStyle
-elemToRun :: NameSpaces -> Element -> Maybe Run
-elemToRun ns element
- | qName (elName element) == "r" &&
- qURI (elName element) == (lookup "w" ns) =
- case
- findChild (QName "footnoteReference" (lookup "w" ns) (Just "w")) element >>=
- findAttr (QName "id" (lookup "w" ns) (Just "w"))
- of
- Just s -> Just $ Footnote s
- Nothing ->
- case
- findChild (QName "endnoteReference" (lookup "w" ns) (Just "w")) element >>=
- findAttr (QName "id" (lookup "w" ns) (Just "w"))
- of
- Just s -> Just $ Endnote s
- Nothing -> Just $
- Run (elemToRunStyle ns element)
- (elemToRunElems ns element)
-elemToRun _ _ = Nothing
-
-elemToRunElem :: NameSpaces -> Element -> Maybe RunElem
+elemToRunElem :: NameSpaces -> Element -> D RunElem
elemToRunElem ns element
- | qName (elName element) == "t" &&
- qURI (elName element) == (lookup "w" ns) =
- Just $ TextRun (strContent element)
- | qName (elName element) == "br" &&
- qURI (elName element) == (lookup "w" ns) =
- Just $ LnBrk
- | qName (elName element) == "tab" &&
- qURI (elName element) == (lookup "w" ns) =
- Just $ Tab
- | otherwise = Nothing
-
-
-elemToRunElems :: NameSpaces -> Element -> [RunElem]
+ | isElem ns "w" "t" element
+ || isElem ns "w" "delText" element
+ || isElem ns "m" "t" element = do
+ let str = strContent element
+ font <- asks envFont
+ case font of
+ Nothing -> return $ TextRun str
+ Just f -> return . TextRun $
+ map (\x -> fromMaybe x . getUnicode f . lowerFromPrivate $ x) str
+ | isElem ns "w" "br" element = return LnBrk
+ | isElem ns "w" "tab" element = return Tab
+ | isElem ns "w" "sym" element = return (getSymChar ns element)
+ | otherwise = throwError WrongElem
+ where
+ lowerFromPrivate (ord -> c)
+ | c >= ord '\xF000' = chr $ c - ord '\xF000'
+ | otherwise = chr c
+
+-- The char attribute is a hex string
+getSymChar :: NameSpaces -> Element -> RunElem
+getSymChar ns element
+ | Just s <- lowerFromPrivate <$> getCodepoint
+ , Just font <- getFont =
+ let [(char, _)] = readLitChar ("\\x" ++ s) in
+ TextRun . maybe "" (:[]) $ getUnicode font char
+ where
+ getCodepoint = findAttr (elemName ns "w" "char") element
+ getFont = stringToFont =<< findAttr (elemName ns "w" "font") element
+ lowerFromPrivate ('F':xs) = '0':xs
+ lowerFromPrivate xs = xs
+getSymChar _ _ = TextRun ""
+
+stringToFont :: String -> Maybe Font
+stringToFont "Symbol" = Just Symbol
+stringToFont _ = Nothing
+
+elemToRunElems :: NameSpaces -> Element -> D [RunElem]
elemToRunElems ns element
- | qName (elName element) == "r" &&
- qURI (elName element) == (lookup "w" ns) =
- mapMaybe (elemToRunElem ns) (elChildren element)
- | otherwise = []
-
-elemToDrawing :: NameSpaces -> Element -> Maybe ParPart
-elemToDrawing ns element
- | qName (elName element) == "drawing" &&
- qURI (elName element) == (lookup "w" ns) =
- let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
- in
- findElement (QName "blip" (Just a_ns) (Just "a")) element
- >>= findAttr (QName "embed" (lookup "r" ns) (Just "r"))
- >>= (\s -> Just $ Drawing s)
-elemToDrawing _ _ = Nothing
-
-
-elemToParPart :: NameSpaces -> Element -> Maybe ParPart
-elemToParPart ns element
- | qName (elName element) == "r" &&
- qURI (elName element) == (lookup "w" ns) =
- case findChild (QName "drawing" (lookup "w" ns) (Just "w")) element of
- Just drawingElem -> elemToDrawing ns drawingElem
- Nothing -> do
- r <- elemToRun ns element
- return $ PlainRun r
-elemToParPart ns element
- | qName (elName element) == "bookmarkStart" &&
- qURI (elName element) == (lookup "w" ns) = do
- bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element
- bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) element
- return $ BookMark bmId bmName
-elemToParPart ns element
- | qName (elName element) == "hyperlink" &&
- qURI (elName element) == (lookup "w" ns) =
- let runs = map fromJust $ filter isJust $ map (elemToRun ns)
- $ findChildren (QName "r" (lookup "w" ns) (Just "w")) element
- in
- case findAttr (QName "anchor" (lookup "w" ns) (Just "w")) element of
- Just anchor ->
- Just $ InternalHyperLink anchor runs
- Nothing ->
- case findAttr (QName "id" (lookup "r" ns) (Just "r")) element of
- Just relId -> Just $ ExternalHyperLink relId runs
- Nothing -> Nothing
-elemToParPart _ _ = Nothing
-
-type Target = String
-type Anchor = String
-type BookMarkId = String
-type RelId = String
-
+ | isElem ns "w" "r" element
+ || isElem ns "m" "r" element = do
+ let qualName = elemName ns "w"
+ let font = do
+ fontElem <- findElement (qualName "rFonts") element
+ stringToFont =<<
+ (foldr (<|>) Nothing $
+ map (flip findAttr fontElem . qualName) ["ascii", "hAnsi"])
+ local (setFont font) (mapD (elemToRunElem ns) (elChildren element))
+elemToRunElems _ _ = throwError WrongElem
+
+setFont :: Maybe Font -> ReaderEnv -> ReaderEnv
+setFont f s = s{envFont = f}
diff --git a/src/Text/Pandoc/Readers/Docx/Reducible.hs b/src/Text/Pandoc/Readers/Docx/Reducible.hs
new file mode 100644
index 000000000..2dbef4131
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Docx/Reducible.hs
@@ -0,0 +1,183 @@
+{-# LANGUAGE OverloadedStrings, PatternGuards #-}
+
+{-
+Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Docx.Reducible
+ Copyright : Copyright (C) 2014 Jesse Rosenthal
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
+ Stability : alpha
+ Portability : portable
+
+Typeclass for combining adjacent blocks and inlines correctly.
+-}
+
+
+module Text.Pandoc.Readers.Docx.Reducible ((<++>),
+ (<+++>),
+ Reducible,
+ Container(..),
+ container,
+ innards,
+ reduceList,
+ reduceListB,
+ rebuild)
+ where
+
+import Text.Pandoc.Builder
+import Data.List ((\\), intersect)
+
+data Container a = Container ([a] -> a) | NullContainer
+
+instance (Eq a) => Eq (Container a) where
+ (Container x) == (Container y) = ((x []) == (y []))
+ NullContainer == NullContainer = True
+ _ == _ = False
+
+instance (Show a) => Show (Container a) where
+ show (Container x) = "Container {" ++
+ (reverse $ drop 3 $ reverse $ show $ x []) ++
+ "}"
+ show (NullContainer) = "NullContainer"
+
+class Reducible a where
+ (<++>) :: a -> a -> [a]
+ container :: a -> Container a
+ innards :: a -> [a]
+ isSpace :: a -> Bool
+
+(<+++>) :: (Reducible a) => Many a -> Many a -> Many a
+mr <+++> ms = fromList $ reduceList $ toList mr ++ toList ms
+
+reduceListB :: (Reducible a) => Many a -> Many a
+reduceListB = fromList . reduceList . toList
+
+reduceList' :: (Reducible a) => [a] -> [a] -> [a]
+reduceList' acc [] = acc
+reduceList' [] (x:xs) = reduceList' [x] xs
+reduceList' as (x:xs) = reduceList' (init as ++ (last as <++> x) ) xs
+
+reduceList :: (Reducible a) => [a] -> [a]
+reduceList = reduceList' []
+
+combineReducibles :: (Reducible a, Eq a) => a -> a -> [a]
+combineReducibles r s =
+ let (conts, rs) = topLevelContainers r
+ (conts', ss) = topLevelContainers s
+ shared = conts `intersect` conts'
+ remaining = conts \\ shared
+ remaining' = conts' \\ shared
+ in
+ case null shared of
+ True | (x : xs) <- reverse rs
+ , isSpace x ->
+ rebuild conts (reverse xs) ++ [x, s]
+ | (x : xs) <- ss
+ , isSpace x ->
+ [r, x] ++ rebuild conts' (xs)
+ True -> [r,s]
+ False -> rebuild
+ shared $
+ reduceList $
+ (rebuild remaining rs) ++ (rebuild remaining' ss)
+
+instance Reducible Inline where
+ s1@(Span (id1, classes1, kvs1) ils1) <++> s2@(Span (id2, classes2, kvs2) ils2) =
+ let classes' = classes1 `intersect` classes2
+ kvs' = kvs1 `intersect` kvs2
+ classes1' = classes1 \\ classes'
+ kvs1' = kvs1 \\ kvs'
+ classes2' = classes2 \\ classes'
+ kvs2' = kvs2 \\ kvs'
+ in
+ case null classes' && null kvs' of
+ True -> [s1,s2]
+ False -> let attr' = ("", classes', kvs')
+ attr1' = (id1, classes1', kvs1')
+ attr2' = (id2, classes2', kvs2')
+ s1' = case null classes1' && null kvs1' of
+ True -> ils1
+ False -> [Span attr1' ils1]
+ s2' = case null classes2' && null kvs2' of
+ True -> ils2
+ False -> [Span attr2' ils2]
+ in
+ [Span attr' $ reduceList $ s1' ++ s2']
+
+ (Str x) <++> (Str y) = [Str (x++y)]
+ il <++> il' = combineReducibles il il'
+
+ container (Emph _) = Container Emph
+ container (Strong _) = Container Strong
+ container (Strikeout _) = Container Strikeout
+ container (Subscript _) = Container Subscript
+ container (Superscript _) = Container Superscript
+ container (Quoted qt _) = Container $ Quoted qt
+ container (Cite cs _) = Container $ Cite cs
+ container (Span attr _) = Container $ Span attr
+ container _ = NullContainer
+
+ innards (Emph ils) = ils
+ innards (Strong ils) = ils
+ innards (Strikeout ils) = ils
+ innards (Subscript ils) = ils
+ innards (Superscript ils) = ils
+ innards (Quoted _ ils) = ils
+ innards (Cite _ ils) = ils
+ innards (Span _ ils) = ils
+ innards _ = []
+
+ isSpace Space = True
+ isSpace _ = False
+
+instance Reducible Block where
+ (Div (ident, classes, kvs) blks) <++> blk | "list-item" `elem` classes =
+ [Div (ident, classes, kvs) (reduceList blks), blk]
+
+ blk <++> blk' = combineReducibles blk blk'
+
+ container (BlockQuote _) = Container BlockQuote
+ container (Div attr _) = Container $ Div attr
+ container _ = NullContainer
+
+ innards (BlockQuote bs) = bs
+ innards (Div _ bs) = bs
+ innards _ = []
+
+ isSpace _ = False
+
+
+topLevelContainers' :: (Reducible a) => [a] -> ([Container a], [a])
+topLevelContainers' (r : []) = case container r of
+ NullContainer -> ([], [r])
+ _ ->
+ let (conts, inns) = topLevelContainers' (innards r)
+ in
+ ((container r) : conts, inns)
+topLevelContainers' rs = ([], rs)
+
+topLevelContainers :: (Reducible a) => a -> ([Container a], [a])
+topLevelContainers il = topLevelContainers' [il]
+
+rebuild :: [Container a] -> [a] -> [a]
+rebuild [] xs = xs
+rebuild ((Container f) : cs) xs = rebuild cs $ [f xs]
+rebuild (NullContainer : cs) xs = rebuild cs $ xs
diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs
new file mode 100644
index 000000000..f900c0adc
--- /dev/null
+++ b/src/Text/Pandoc/Readers/EPUB.hs
@@ -0,0 +1,285 @@
+{-# LANGUAGE
+ ViewPatterns
+ , StandaloneDeriving
+ , TupleSections
+ , FlexibleContexts #-}
+
+module Text.Pandoc.Readers.EPUB
+ (readEPUB)
+ where
+
+import Text.XML.Light
+import Text.Pandoc.Definition hiding (Attr)
+import Text.Pandoc.Walk (walk, query)
+import Text.Pandoc.Generic(bottomUp)
+import Text.Pandoc.Readers.HTML (readHtml)
+import Text.Pandoc.Options ( ReaderOptions(..), readerTrace)
+import Text.Pandoc.Shared (escapeURI, collapseFilePath, addMetaField)
+import Text.Pandoc.MediaBag (MediaBag, insertMedia)
+import Text.Pandoc.Compat.Except (MonadError, throwError, runExcept, Except)
+import qualified Text.Pandoc.Builder as B
+import Codec.Archive.Zip ( Archive (..), toArchive, fromEntry
+ , findEntryByPath, Entry)
+import qualified Data.ByteString.Lazy as BL (ByteString)
+import System.FilePath ( takeFileName, (</>), dropFileName, normalise
+ , dropFileName
+ , splitFileName )
+import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy)
+import Control.Applicative ((<$>))
+import Control.Monad (guard, liftM, when)
+import Data.Monoid (mempty, (<>))
+import Data.List (isPrefixOf, isInfixOf)
+import Data.Maybe (mapMaybe, fromMaybe)
+import qualified Data.Map as M (Map, lookup, fromList, elems)
+import Control.DeepSeq.Generics (deepseq, NFData)
+
+import Debug.Trace (trace)
+
+type MIME = String
+
+type Items = M.Map String (FilePath, MIME)
+
+readEPUB :: ReaderOptions -> BL.ByteString -> (Pandoc, MediaBag)
+readEPUB opts bytes = runEPUB (archiveToEPUB opts $ toArchive bytes)
+
+runEPUB :: Except String a -> a
+runEPUB = either error id . runExcept
+
+-- Note that internal reference are aggresively normalised so that all ids
+-- are of the form "filename#id"
+--
+archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag)
+archiveToEPUB os archive = do
+ -- root is path to folder with manifest file in
+ (root, content) <- getManifest archive
+ meta <- parseMeta content
+ (cover, items) <- parseManifest content
+ -- No need to collapse here as the image path is from the manifest file
+ let coverDoc = fromMaybe mempty (imageToPandoc <$> cover)
+ spine <- parseSpine items content
+ let escapedSpine = map (escapeURI . takeFileName . fst) spine
+ Pandoc _ bs <-
+ foldM' (\a b -> ((a <>) . bottomUp (prependHash escapedSpine))
+ `liftM` parseSpineElem root b) mempty spine
+ let ast = coverDoc <> (Pandoc meta bs)
+ let mediaBag = fetchImages (M.elems items) root archive ast
+ return $ (ast, mediaBag)
+ where
+ os' = os {readerParseRaw = True}
+ parseSpineElem :: MonadError String m => FilePath -> (FilePath, MIME) -> m Pandoc
+ parseSpineElem (normalise -> r) (normalise -> path, mime) = do
+ when (readerTrace os) (traceM path)
+ doc <- mimeToReader mime r path
+ let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty
+ return $ docSpan <> doc
+ mimeToReader :: MonadError String m => MIME -> FilePath -> FilePath -> m Pandoc
+ mimeToReader "application/xhtml+xml" (normalise -> root) (normalise -> path) = do
+ fname <- findEntryByPathE (root </> path) archive
+ return $ fixInternalReferences path .
+ readHtml os' .
+ UTF8.toStringLazy $
+ fromEntry fname
+ mimeToReader s _ path
+ | s `elem` imageMimes = return $ imageToPandoc path
+ | otherwise = return $ mempty
+
+-- paths should be absolute when this function is called
+-- renameImages should do this
+fetchImages :: [(FilePath, MIME)]
+ -> FilePath -- ^ Root
+ -> Archive
+ -> Pandoc
+ -> MediaBag
+fetchImages mimes root arc (query iq -> links) =
+ foldr (uncurry3 insertMedia) mempty
+ (mapMaybe getEntry links)
+ where
+ getEntry link =
+ let abslink = root </> link in
+ (link , lookup link mimes, ) . fromEntry
+ <$> findEntryByPath abslink arc
+
+iq :: Inline -> [FilePath]
+iq (Image _ (url, _)) = [url]
+iq _ = []
+
+-- Remove relative paths
+renameImages :: FilePath -> Inline -> Inline
+renameImages root (Image a (url, b)) = Image a (collapseFilePath (root </> url), b)
+renameImages _ x = x
+
+imageToPandoc :: FilePath -> Pandoc
+imageToPandoc s = B.doc . B.para $ B.image s "" mempty
+
+imageMimes :: [String]
+imageMimes = ["image/gif", "image/jpeg", "image/png"]
+
+type CoverImage = FilePath
+
+parseManifest :: (MonadError String m) => Element -> m (Maybe CoverImage, Items)
+parseManifest content = do
+ manifest <- findElementE (dfName "manifest") content
+ let items = findChildren (dfName "item") manifest
+ r <- mapM parseItem items
+ let cover = findAttr (emptyName "href") =<< filterChild findCover manifest
+ return (cover, (M.fromList r))
+ where
+ findCover e = maybe False (isInfixOf "cover-image")
+ (findAttr (emptyName "properties") e)
+ parseItem e = do
+ uid <- findAttrE (emptyName "id") e
+ href <- findAttrE (emptyName "href") e
+ mime <- findAttrE (emptyName "media-type") e
+ return (uid, (href, mime))
+
+parseSpine :: MonadError String m => Items -> Element -> m [(FilePath, MIME)]
+parseSpine is e = do
+ spine <- findElementE (dfName "spine") e
+ let itemRefs = findChildren (dfName "itemref") spine
+ mapM (mkE "parseSpine" . (flip M.lookup is)) $ mapMaybe parseItemRef itemRefs
+ where
+ parseItemRef ref = do
+ let linear = maybe True (== "yes") (findAttr (emptyName "linear") ref)
+ guard linear
+ findAttr (emptyName "idref") ref
+
+parseMeta :: MonadError String m => Element -> m Meta
+parseMeta content = do
+ meta <- findElementE (dfName "metadata") content
+ let dcspace (QName _ (Just "http://purl.org/dc/elements/1.1/") (Just "dc")) = True
+ dcspace _ = False
+ let dcs = filterChildrenName dcspace meta
+ let r = foldr parseMetaItem nullMeta dcs
+ return r
+
+-- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem
+parseMetaItem :: Element -> Meta -> Meta
+parseMetaItem e@(stripNamespace . elName -> field) meta =
+ addMetaField (renameMeta field) (B.str $ strContent e) meta
+
+renameMeta :: String -> String
+renameMeta "creator" = "author"
+renameMeta s = s
+
+getManifest :: MonadError String m => Archive -> m (String, Element)
+getManifest archive = do
+ metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive
+ docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry
+ let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
+ ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces)
+ as <- liftM ((map attrToPair) . elAttribs)
+ (findElementE (QName "rootfile" (Just ns) Nothing) docElem)
+ manifestFile <- mkE "Root not found" (lookup "full-path" as)
+ let rootdir = dropFileName manifestFile
+ --mime <- lookup "media-type" as
+ manifest <- findEntryByPathE manifestFile archive
+ liftM ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest)
+
+-- Fixup
+
+fixInternalReferences :: FilePath -> Pandoc -> Pandoc
+fixInternalReferences pathToFile =
+ (walk $ renameImages root)
+ . (walk normalisePath)
+ . (walk $ fixBlockIRs filename)
+ . (walk $ fixInlineIRs filename)
+ where
+ (root, escapeURI -> filename) = splitFileName pathToFile
+
+fixInlineIRs :: String -> Inline -> Inline
+fixInlineIRs s (Span as v) =
+ Span (fixAttrs s as) v
+fixInlineIRs s (Code as code) =
+ Code (fixAttrs s as) code
+fixInlineIRs s (Link t ('#':url, tit)) =
+ Link t (addHash s url, tit)
+fixInlineIRs _ v = v
+
+normalisePath :: Inline -> Inline
+normalisePath (Link t (url, tit)) =
+ let (path, uid) = span (/= '#') url in
+ Link t (takeFileName path ++ uid, tit)
+normalisePath s = s
+
+prependHash :: [String] -> Inline -> Inline
+prependHash ps l@(Link is (url, tit))
+ | or [s `isPrefixOf` url | s <- ps] =
+ Link is ('#':url, tit)
+ | otherwise = l
+prependHash _ i = i
+
+fixBlockIRs :: String -> Block -> Block
+fixBlockIRs s (Div as b) =
+ Div (fixAttrs s as) b
+fixBlockIRs s (Header i as b) =
+ Header i (fixAttrs s as) b
+fixBlockIRs s (CodeBlock as code) =
+ CodeBlock (fixAttrs s as) code
+fixBlockIRs _ b = b
+
+fixAttrs :: FilePath -> B.Attr -> B.Attr
+fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . null) cs, removeEPUBAttrs kvs)
+
+addHash :: String -> String -> String
+addHash _ "" = ""
+addHash s ident = s ++ "#" ++ ident
+
+removeEPUBAttrs :: [(String, String)] -> [(String, String)]
+removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs
+
+isEPUBAttr :: (String, String) -> Bool
+isEPUBAttr (k, _) = "epub:" `isPrefixOf` k
+
+-- Library
+
+-- Strict version of foldM
+foldM' :: (Monad m, NFData a) => (a -> b -> m a) -> a -> [b] -> m a
+foldM' _ z [] = return z
+foldM' f z (x:xs) = do
+ z' <- f z x
+ z' `deepseq` foldM' f z' xs
+
+uncurry3 :: (a -> b -> c -> d) -> (a, b, c) -> d
+uncurry3 f (a, b, c) = f a b c
+
+traceM :: Monad m => String -> m ()
+traceM = flip trace (return ())
+
+-- Utility
+
+stripNamespace :: QName -> String
+stripNamespace (QName v _ _) = v
+
+attrToNSPair :: Attr -> Maybe (String, String)
+attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val)
+attrToNSPair _ = Nothing
+
+attrToPair :: Attr -> (String, String)
+attrToPair (Attr (QName name _ _) val) = (name, val)
+
+defaultNameSpace :: Maybe String
+defaultNameSpace = Just "http://www.idpf.org/2007/opf"
+
+dfName :: String -> QName
+dfName s = QName s defaultNameSpace Nothing
+
+emptyName :: String -> QName
+emptyName s = QName s Nothing Nothing
+
+-- Convert Maybe interface to Either
+
+findAttrE :: MonadError String m => QName -> Element -> m String
+findAttrE q e = mkE "findAttr" $ findAttr q e
+
+findEntryByPathE :: MonadError String m => FilePath -> Archive -> m Entry
+findEntryByPathE (normalise -> path) a =
+ mkE ("No entry on path: " ++ path) $ findEntryByPath path a
+
+parseXMLDocE :: MonadError String m => String -> m Element
+parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc
+
+findElementE :: MonadError String m => QName -> Element -> m Element
+findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x
+
+mkE :: MonadError String m => String -> Maybe a -> m a
+mkE s = maybe (throwError s) return
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 552e8a251..1789b865f 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses #-}
{-
Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu>
@@ -40,41 +41,69 @@ import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match
import Text.Pandoc.Definition
import qualified Text.Pandoc.Builder as B
-import Text.Pandoc.Builder (Blocks, Inlines, trimInlines)
-import Text.Pandoc.Shared
-import Text.Pandoc.Options
-import Text.Pandoc.Parsing
-import Data.Maybe ( fromMaybe, isJust )
-import Data.List ( intercalate )
+import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..))
+import Text.Pandoc.Shared ( extractSpaces, renderTags'
+ , escapeURI, safeRead )
+import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace)
+ , Extension (Ext_epub_html_exts,
+ Ext_native_divs, Ext_native_spans))
+import Text.Pandoc.Parsing hiding ((<|>))
+import Text.Pandoc.Walk
+import Data.Maybe ( fromMaybe, isJust)
+import Data.List ( intercalate, isInfixOf )
import Data.Char ( isDigit )
-import Control.Monad ( liftM, guard, when, mzero )
-import Control.Applicative ( (<$>), (<$), (<*) )
-import Data.Monoid
+import Control.Monad ( liftM, guard, when, mzero, void, unless )
+import Control.Arrow ((***))
+import Control.Applicative ( (<$>), (<$), (<*), (*>), (<|>))
+import Data.Monoid (mconcat, Monoid, mempty, (<>), First (..))
import Text.Printf (printf)
import Debug.Trace (trace)
+import Text.TeXMath (readMathML, writeTeX)
+import Data.Default (Default (..), def)
+import Control.Monad.Reader (Reader,ask, asks, local, runReader)
-isSpace :: Char -> Bool
-isSpace ' ' = True
-isSpace '\t' = True
-isSpace '\n' = True
-isSpace _ = False
-- | Convert HTML-formatted string to 'Pandoc' document.
readHtml :: ReaderOptions -- ^ Reader options
-> String -- ^ String to parse (assumes @'\n'@ line endings)
-> Pandoc
readHtml opts inp =
- case runParser parseDoc def{ stateOptions = opts } "source" tags of
+ case flip runReader def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } []) "source" tags of
Left err' -> error $ "\nError at " ++ show err'
Right result -> result
- where tags = canonicalizeTags $
+ where tags = stripPrefixes . canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True } inp
parseDoc = do
blocks <- (fixPlains False) . mconcat <$> manyTill block eof
- meta <- stateMeta <$> getState
- return $ Pandoc meta (B.toList blocks)
+ meta <- stateMeta . parserState <$> getState
+ bs' <- replaceNotes (B.toList blocks)
+ return $ Pandoc meta bs'
+
+replaceNotes :: [Block] -> TagParser [Block]
+replaceNotes = walkM replaceNotes'
+
+replaceNotes' :: Inline -> TagParser Inline
+replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes
+ where
+ getNotes = noteTable <$> getState
+replaceNotes' x = return x
-type TagParser = Parser [Tag String] ParserState
+data HTMLState =
+ HTMLState
+ { parserState :: ParserState,
+ noteTable :: [(String, Blocks)]
+ }
+
+data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
+ , inChapter :: Bool -- ^ Set if in chapter section
+ }
+
+setInChapter :: HTMLParser s a -> HTMLParser s a
+setInChapter = local (\s -> s {inChapter = True})
+
+type HTMLParser s = ParserT s HTMLState (Reader HTMLLocal)
+
+type TagParser = HTMLParser [Tag String]
pBody :: TagParser Blocks
pBody = pInTags "body" block
@@ -98,7 +127,11 @@ block = do
tr <- getOption readerTrace
pos <- getPosition
res <- choice
- [ pPara
+ [ eSection
+ , eSwitch B.para block
+ , mempty <$ eFootnote
+ , mempty <$ eTOC
+ , pPara
, pHeader
, pBlockQuote
, pCodeBlock
@@ -115,6 +148,63 @@ block = do
(take 60 $ show $ B.toList res)) (return ())
return res
+namespaces :: [(String, TagParser Inlines)]
+namespaces = [(mathMLNamespace, pMath True)]
+
+mathMLNamespace :: String
+mathMLNamespace = "http://www.w3.org/1998/Math/MathML"
+
+eSwitch :: Monoid a => (Inlines -> a) -> TagParser a -> TagParser a
+eSwitch constructor parser = try $ do
+ guardEnabled Ext_epub_html_exts
+ pSatisfy (~== TagOpen "switch" [])
+ cases <- getFirst . mconcat <$>
+ manyTill (First <$> (eCase <* skipMany pBlank) )
+ (lookAhead $ try $ pSatisfy (~== TagOpen "default" []))
+ skipMany pBlank
+ fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank)
+ skipMany pBlank
+ pSatisfy (~== TagClose "switch")
+ return $ maybe fallback constructor cases
+
+eCase :: TagParser (Maybe Inlines)
+eCase = do
+ skipMany pBlank
+ TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" [])
+ case (flip lookup namespaces) =<< lookup "required-namespace" attr of
+ Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank))
+ Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case"))
+
+eFootnote :: TagParser ()
+eFootnote = try $ do
+ let notes = ["footnote", "rearnote"]
+ guardEnabled Ext_epub_html_exts
+ (TagOpen tag attr) <- lookAhead $ pAnyTag
+ guard (maybe False (flip elem notes) (lookup "type" attr))
+ let ident = fromMaybe "" (lookup "id" attr)
+ content <- pInTags tag block
+ addNote ident content
+
+addNote :: String -> Blocks -> TagParser ()
+addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)})
+
+eNoteref :: TagParser Inlines
+eNoteref = try $ do
+ guardEnabled Ext_epub_html_exts
+ TagOpen tag attr <- lookAhead $ pAnyTag
+ guard (maybe False (== "noteref") (lookup "type" attr))
+ let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr)
+ guard (not (null ident))
+ pInTags tag block
+ return $ B.rawInline "noteref" ident
+
+-- Strip TOC if there is one, better to generate again
+eTOC :: TagParser ()
+eTOC = try $ do
+ guardEnabled Ext_epub_html_exts
+ (TagOpen tag attr) <- lookAhead $ pAnyTag
+ guard (maybe False (== "toc") (lookup "type" attr))
+ void (pInTags tag block)
pList :: TagParser Blocks
pList = pBulletList <|> pOrderedList <|> pDefinitionList
@@ -128,9 +218,15 @@ pBulletList = try $ do
-- note: if they have an <ol> or <ul> not in scope of a <li>,
-- treat it as a list item, though it's not valid xhtml...
skipMany nonItem
- items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul")
+ items <- manyTill (pListItem nonItem) (pCloses "ul")
return $ B.bulletList $ map (fixPlains True) items
+pListItem :: TagParser a -> TagParser Blocks
+pListItem nonItem = do
+ TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" [])
+ let liDiv = maybe mempty (\x -> B.divWith (x, [], []) mempty) (lookup "id" attr)
+ (liDiv <>) <$> pInTags "li" block <* skipMany nonItem
+
pOrderedList :: TagParser Blocks
pOrderedList = try $ do
TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" [])
@@ -156,7 +252,7 @@ pOrderedList = try $ do
-- note: if they have an <ol> or <ul> not in scope of a <li>,
-- treat it as a list item, though it's not valid xhtml...
skipMany nonItem
- items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol")
+ items <- manyTill (pListItem nonItem) (pCloses "ol")
return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items
pDefinitionList :: TagParser Blocks
@@ -194,14 +290,14 @@ fixPlains inList bs = if any isParaish bs'
pRawTag :: TagParser String
pRawTag = do
tag <- pAnyTag
- let ignorable x = x `elem` ["html","head","body"]
+ let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"]
if tagOpen ignorable (const True) tag || tagClose ignorable tag
then return []
else return $ renderTags' [tag]
pDiv :: TagParser Blocks
pDiv = try $ do
- getOption readerParseRaw >>= guard
+ guardEnabled Ext_native_divs
TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="div") (const True)
contents <- pInTags "div" block
return $ B.divWith (mkAttr attr) contents
@@ -220,13 +316,35 @@ pHtmlBlock t = try $ do
contents <- manyTill pAnyTag (pSatisfy (~== TagClose t))
return $ renderTags' $ [open] ++ contents ++ [TagClose t]
+-- Sets chapter context
+eSection :: TagParser Blocks
+eSection = try $ do
+ let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as)
+ let sectTag = tagOpen (`elem` sectioningContent) matchChapter
+ TagOpen tag _ <- lookAhead $ pSatisfy sectTag
+ setInChapter (pInTags tag block)
+
+headerLevel :: String -> TagParser Int
+headerLevel tagtype = do
+ let level = read (drop 1 tagtype)
+ (try $ do
+ guardEnabled Ext_epub_html_exts
+ asks inChapter >>= guard
+ return (level - 1))
+ <|>
+ return level
+
+
+
+
+
pHeader :: TagParser Blocks
pHeader = try $ do
TagOpen tagtype attr <- pSatisfy $
tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])
(const True)
let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")]
- let level = read (drop 1 tagtype)
+ level <- headerLevel tagtype
contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof)
let ident = fromMaybe "" $ lookup "id" attr
let classes = maybe [] words $ lookup "class" attr
@@ -244,7 +362,7 @@ pTable :: TagParser Blocks
pTable = try $ do
TagOpen _ _ <- pSatisfy (~== TagOpen "table" [])
skipMany pBlank
- caption <- option mempty $ pInTags "caption" inline >>~ skipMany pBlank
+ caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
-- TODO actually read these and take width information from them
widths' <- pColgroup <|> many pCol
head' <- option [] $ pOptInTag "thead" $ pInTags "tr" (pCell "th")
@@ -326,7 +444,9 @@ pCodeBlock = try $ do
inline :: TagParser Inlines
inline = choice
- [ pTagText
+ [ eNoteref
+ , eSwitch id inline
+ , pTagText
, pQ
, pEmph
, pStrong
@@ -338,6 +458,7 @@ inline = choice
, pImage
, pCode
, pSpan
+ , pMath False
, pRawHtmlInline
]
@@ -366,8 +487,8 @@ pSelfClosing f g = do
pQ :: TagParser Inlines
pQ = do
- quoteContext <- stateQuoteContext `fmap` getState
- let quoteType = case quoteContext of
+ context <- asks quoteContext
+ let quoteType = case context of
InDoubleQuote -> SingleQuote
_ -> DoubleQuote
let innerQuoteContext = if quoteType == SingleQuote
@@ -406,12 +527,24 @@ pLineBreak = do
return B.linebreak
pLink :: TagParser Inlines
-pLink = try $ do
+pLink = pRelLink <|> pAnchor
+
+pAnchor :: TagParser Inlines
+pAnchor = try $ do
+ tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "id"))
+ return $ B.spanWith (fromAttrib "id" tag , [], []) mempty
+
+pRelLink :: TagParser Inlines
+pRelLink = try $ do
tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href"))
let url = fromAttrib "href" tag
let title = fromAttrib "title" tag
+ let uid = fromAttrib "id" tag
+ let spanC = case uid of
+ [] -> id
+ s -> B.spanWith (s, [], [])
lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a")
- return $ B.link (escapeURI url) title lab
+ return $ spanC $ B.link (escapeURI url) title lab
pImage :: TagParser Inlines
pImage = do
@@ -429,7 +562,7 @@ pCode = try $ do
pSpan :: TagParser Inlines
pSpan = try $ do
- getOption readerParseRaw >>= guard
+ guardEnabled Ext_native_spans
TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True)
contents <- pInTags "span" inline
return $ B.spanWith (mkAttr attr) contents
@@ -442,6 +575,22 @@ pRawHtmlInline = do
then return $ B.rawInline "html" $ renderTags' [result]
else return mempty
+mathMLToTeXMath :: String -> Either String String
+mathMLToTeXMath s = writeTeX <$> readMathML s
+
+pMath :: Bool -> TagParser Inlines
+pMath inCase = try $ do
+ open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True)
+ unless (inCase) (guard (maybe False (== mathMLNamespace) (lookup "xmlns" attr)))
+ contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math"))
+ let math = mathMLToTeXMath $
+ (renderTags $ [open] ++ contents ++ [TagClose "math"])
+ let constructor =
+ maybe B.math (\x -> if (x == "inline") then B.math else B.displayMath)
+ (lookup "display" attr)
+ return $ either (const mempty)
+ (\x -> if null x then mempty else constructor x) math
+
pInlinesInTags :: String -> (Inlines -> Inlines)
-> TagParser Inlines
pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline
@@ -479,7 +628,8 @@ pTagText :: TagParser Inlines
pTagText = try $ do
(TagText str) <- pSatisfy isTagText
st <- getState
- case runParser (many pTagContents) st "text" str of
+ qu <- ask
+ case flip runReader qu $ runParserT (many pTagContents) st "text" str of
Left _ -> fail $ "Could not parse `" ++ str ++ "'"
Right result -> return $ mconcat result
@@ -488,7 +638,9 @@ pBlank = try $ do
(TagText str) <- pSatisfy isTagText
guard $ all isSpace str
-pTagContents :: Parser [Char] ParserState Inlines
+type InlinesParser = HTMLParser String
+
+pTagContents :: InlinesParser Inlines
pTagContents =
B.displayMath <$> mathDisplay
<|> B.math <$> mathInline
@@ -498,12 +650,11 @@ pTagContents =
<|> pSymbol
<|> pBad
-pStr :: Parser [Char] ParserState Inlines
+pStr :: InlinesParser Inlines
pStr = do
result <- many1 $ satisfy $ \c ->
not (isSpace c) && not (isSpecial c) && not (isBad c)
- pos <- getPosition
- updateState $ \s -> s{ stateLastStrPos = Just pos }
+ updateLastStrPos
return $ B.str result
isSpecial :: Char -> Bool
@@ -518,13 +669,13 @@ isSpecial '\8220' = True
isSpecial '\8221' = True
isSpecial _ = False
-pSymbol :: Parser [Char] ParserState Inlines
+pSymbol :: InlinesParser Inlines
pSymbol = satisfy isSpecial >>= return . B.str . (:[])
isBad :: Char -> Bool
isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
-pBad :: Parser [Char] ParserState Inlines
+pBad :: InlinesParser Inlines
pBad = do
c <- satisfy isBad
let c' = case c of
@@ -558,7 +709,7 @@ pBad = do
_ -> '?'
return $ B.str [c']
-pSpace :: Parser [Char] ParserState Inlines
+pSpace :: InlinesParser Inlines
pSpace = many1 (satisfy isSpace) >> return B.space
--
@@ -566,8 +717,10 @@ pSpace = many1 (satisfy isSpace) >> return B.space
--
eitherBlockOrInline :: [String]
-eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
- "map", "area", "object"]
+eitherBlockOrInline = ["audio", "applet", "button", "iframe",
+ "del", "ins",
+ "progress", "map", "area", "noscript", "script",
+ "object", "svg", "video", "source"]
{-
inlineHtmlTags :: [[Char]]
@@ -579,15 +732,17 @@ inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
-}
blockHtmlTags :: [String]
-blockHtmlTags = ["address", "article", "aside", "blockquote", "body", "button", "canvas",
+blockHtmlTags = ["?xml", "!DOCTYPE", "address", "article", "aside",
+ "blockquote", "body", "button", "canvas",
"caption", "center", "col", "colgroup", "dd", "dir", "div",
- "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer",
- "form", "h1", "h2", "h3", "h4",
- "h5", "h6", "head", "header", "hgroup", "hr", "html", "isindex", "map", "menu",
- "noframes", "noscript", "object", "ol", "output", "p", "pre", "progress",
- "section", "table", "tbody", "textarea", "thead", "tfoot", "ul", "dd",
+ "dl", "dt", "embed", "fieldset", "figcaption", "figure",
+ "footer", "form", "h1", "h2", "h3", "h4",
+ "h5", "h6", "head", "header", "hgroup", "hr", "html",
+ "isindex", "menu", "noframes", "ol", "output", "p", "pre",
+ "section", "table", "tbody", "textarea",
+ "thead", "tfoot", "ul", "dd",
"dt", "frameset", "li", "tbody", "td", "tfoot",
- "th", "thead", "tr", "script", "style", "svg", "video"]
+ "th", "thead", "tr", "script", "style"]
-- We want to allow raw docbook in markdown documents, so we
-- include docbook block tags here too.
@@ -605,8 +760,11 @@ blockDocBookTags = ["calloutlist", "bibliolist", "glosslist", "itemizedlist",
"classsynopsis", "blockquote", "epigraph", "msgset",
"sidebar", "title"]
+epubTags :: [String]
+epubTags = ["case", "switch", "default"]
+
blockTags :: [String]
-blockTags = blockHtmlTags ++ blockDocBookTags
+blockTags = blockHtmlTags ++ blockDocBookTags ++ epubTags
isInlineTag :: Tag String -> Bool
isInlineTag t = tagOpen isInlineTagName (const True) t ||
@@ -670,19 +828,23 @@ _ `closes` _ = False
--- parsers for use in markdown, textile readers
-- | Matches a stretch of HTML in balanced tags.
-htmlInBalanced :: (Tag String -> Bool) -> Parser [Char] ParserState String
+htmlInBalanced :: (Monad m)
+ => (Tag String -> Bool)
+ -> ParserT String st m String
htmlInBalanced f = try $ do
(TagOpen t _, tag) <- htmlTag f
guard $ '/' `notElem` tag -- not a self-closing tag
let stopper = htmlTag (~== TagClose t)
- let anytag = liftM snd $ htmlTag (const True)
+ let anytag = snd <$> htmlTag (const True)
contents <- many $ notFollowedBy' stopper >>
(htmlInBalanced f <|> anytag <|> count 1 anyChar)
endtag <- liftM snd stopper
return $ tag ++ concat contents ++ endtag
-- | Matches a tag meeting a certain condition.
-htmlTag :: (Tag String -> Bool) -> Parser [Char] st (Tag String, String)
+htmlTag :: Monad m
+ => (Tag String -> Bool)
+ -> ParserT [Char] st m (Tag String, String)
htmlTag f = try $ do
lookAhead $ char '<' >> (oneOf "/!?" <|> letter)
(next : _) <- getInput >>= return . canonicalizeTags . parseTags
@@ -701,7 +863,78 @@ htmlTag f = try $ do
mkAttr :: [(String, String)] -> Attr
mkAttr attr = (attribsId, attribsClasses, attribsKV)
where attribsId = fromMaybe "" $ lookup "id" attr
- attribsClasses = words $ fromMaybe "" $ lookup "class" attr
+ attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
+ epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr
+
+-- Strip namespace prefixes
+stripPrefixes :: [Tag String] -> [Tag String]
+stripPrefixes = map stripPrefix
+
+stripPrefix :: Tag String -> Tag String
+stripPrefix (TagOpen s as) =
+ TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as)
+stripPrefix (TagClose s) = TagClose (stripPrefix' s)
+stripPrefix x = x
+
+stripPrefix' :: String -> String
+stripPrefix' s =
+ case span (/= ':') s of
+ (_, "") -> s
+ (_, (_:ts)) -> ts
+
+isSpace :: Char -> Bool
+isSpace ' ' = True
+isSpace '\t' = True
+isSpace '\n' = True
+isSpace '\r' = True
+isSpace _ = False
+
+-- Instances
+-- This signature should be more general
+-- MonadReader HTMLLocal m => HasQuoteContext st m
+instance HasQuoteContext st (Reader HTMLLocal) where
+ getQuoteContext = asks quoteContext
+ withQuoteContext q = local (\s -> s{quoteContext = q})
+instance HasReaderOptions HTMLState where
+ extractReaderOptions = extractReaderOptions . parserState
+
+instance Default HTMLState where
+ def = HTMLState def []
+
+instance HasMeta HTMLState where
+ setMeta s b st = st {parserState = setMeta s b $ parserState st}
+ deleteMeta s st = st {parserState = deleteMeta s $ parserState st}
+
+instance Default HTMLLocal where
+ def = HTMLLocal NoQuote False
+
+instance HasLastStrPosition HTMLState where
+ setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
+ getLastStrPos = getLastStrPos . parserState
+
+
+-- EPUB Specific
+--
+--
+sectioningContent :: [String]
+sectioningContent = ["article", "aside", "nav", "section"]
+
+{-
+groupingContent :: [String]
+groupingContent = ["p", "hr", "pre", "blockquote", "ol"
+ , "ul", "li", "dl", "dt", "dt", "dd"
+ , "figure", "figcaption", "div", "main"]
+
+
+
+types :: [(String, ([String], Int))]
+types = -- Document divisions
+ map (\s -> (s, (["section", "body"], 0)))
+ ["volume", "part", "chapter", "division"]
+ ++ -- Document section and components
+ [
+ ("abstract", ([], 0))]
+-}
diff --git a/src/Text/Pandoc/Readers/Haddock.hs b/src/Text/Pandoc/Readers/Haddock.hs
index a3dfb7c3c..4b46c869d 100644
--- a/src/Text/Pandoc/Readers/Haddock.hs
+++ b/src/Text/Pandoc/Readers/Haddock.hs
@@ -43,11 +43,8 @@ docHToBlocks d' =
(docHToInlines False $ headerTitle h)
DocAppend d1 d2 -> mappend (docHToBlocks d1) (docHToBlocks d2)
DocString _ -> inlineFallback
- DocParagraph (DocHeader h) -> docHToBlocks (DocHeader h)
DocParagraph (DocAName h) -> B.plain $ docHToInlines False $ DocAName h
- DocParagraph x -> let (ils, rest) = getInlines x
- in (B.para $ docHToInlines False ils)
- <> docHToBlocks rest
+ DocParagraph x -> B.para $ docHToInlines False x
DocIdentifier _ -> inlineFallback
DocIdentifierUnchecked _ -> inlineFallback
DocModule s -> B.plain $ docHToInlines False $ DocModule s
@@ -115,40 +112,6 @@ docHToInlines isCode d' =
DocProperty _ -> mempty
DocExamples _ -> mempty
-getInlines :: DocH String Identifier -> (DocH String Identifier, DocH String Identifier)
-getInlines (DocAppend x y) = if isInline x
- then let (a, b) = getInlines y
- in (DocAppend x a, b)
- else (DocEmpty, DocAppend x y)
-getInlines x = if isInline x
- then (x, DocEmpty)
- else (DocEmpty, x)
-
-isInline :: DocH String Identifier -> Bool
-isInline d' =
- case d' of
- DocEmpty -> True
- DocAppend d1 _ -> isInline d1
- DocString _ -> True
- DocParagraph _ -> False
- DocIdentifier _ -> True
- DocIdentifierUnchecked _ -> True
- DocModule _ -> True
- DocWarning _ -> True
- DocEmphasis _ -> True
- DocMonospaced _ -> True
- DocBold _ -> True
- DocHeader _ -> False
- DocUnorderedList _ -> False
- DocOrderedList _ -> False
- DocDefList _ -> False
- DocCodeBlock _ -> False
- DocHyperlink _ -> True
- DocPic _ -> True
- DocAName _ -> True
- DocProperty _ -> False
- DocExamples _ -> False
-
-- | Create an 'Example', stripping superfluous characters as appropriate
makeExample :: String -> String -> [String] -> Blocks
makeExample prompt expression result =
@@ -173,4 +136,3 @@ makeExample prompt expression result =
substituteBlankLine "<BLANKLINE>" = ""
substituteBlankLine line = line
coder = B.codeWith ([], ["result"], [])
-
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
index 97bfaa455..9f51e9a8f 100644
--- a/src/Text/Pandoc/Readers/LaTeX.hs
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -41,7 +41,6 @@ import Text.Pandoc.Shared
import Text.Pandoc.Options
import Text.Pandoc.Parsing hiding ((<|>), many, optional, space,
mathDisplay, mathInline)
-import Text.Parsec.Prim (ParsecT, runParserT)
import qualified Text.Pandoc.UTF8 as UTF8
import Data.Char ( chr, ord )
import Control.Monad.Trans (lift)
@@ -104,7 +103,7 @@ dimenarg = try $ do
sp :: LP ()
sp = skipMany1 $ satisfy (\c -> c == ' ' || c == '\t')
- <|> (try $ newline >>~ lookAhead anyChar >>~ notFollowedBy blankline)
+ <|> (try $ newline <* lookAhead anyChar <* notFollowedBy blankline)
isLowerHex :: Char -> Bool
isLowerHex x = x >= '0' && x <= '9' || x >= 'a' && x <= 'f'
@@ -808,7 +807,7 @@ rawEnv name = do
----
-type IncludeParser = ParsecT [Char] [String] IO String
+type IncludeParser = ParserT [Char] [String] IO String
-- | Replace "include" commands with file contents.
handleIncludes :: String -> IO String
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 6c710c8ff..861f81b23 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -353,8 +353,7 @@ referenceKey = try $ do
notFollowedBy' referenceTitle
notFollowedBy' (() <$ reference)
many1 $ notFollowedBy space >> litChar
- let betweenAngles = try $ char '<' >>
- manyTill (escapedChar' <|> litChar) (char '>')
+ let betweenAngles = try $ char '<' >> manyTill litChar (char '>')
src <- try betweenAngles <|> sourceURL
tit <- option "" referenceTitle
-- currently we just ignore MMD-style link/image attributes
@@ -571,7 +570,7 @@ attributes :: MarkdownParser Attr
attributes = try $ do
char '{'
spnl
- attrs <- many (attribute >>~ spnl)
+ attrs <- many (attribute <* spnl)
char '}'
return $ foldl (\x f -> f x) nullAttr attrs
@@ -688,7 +687,7 @@ birdTrackLine c = try $ do
--
emailBlockQuoteStart :: MarkdownParser Char
-emailBlockQuoteStart = try $ skipNonindentSpaces >> char '>' >>~ optional (char ' ')
+emailBlockQuoteStart = try $ skipNonindentSpaces >> char '>' <* optional (char ' ')
emailBlockQuote :: MarkdownParser [String]
emailBlockQuote = try $ do
@@ -752,7 +751,7 @@ listLine = try $ do
notFollowedBy' (do indentSpaces
many spaceChar
listStart)
- notFollowedBy' $ htmlTag (~== TagClose "div")
+ notFollowedByHtmlCloser
optional (() <$ indentSpaces)
chunks <- manyTill
( many1 (satisfy $ \c -> c /= '\n' && c /= '<')
@@ -781,11 +780,18 @@ listContinuation = try $ do
blanks <- many blankline
return $ concat result ++ blanks
+notFollowedByHtmlCloser :: MarkdownParser ()
+notFollowedByHtmlCloser = do
+ inHtmlBlock <- stateInHtmlBlock <$> getState
+ case inHtmlBlock of
+ Just t -> notFollowedBy' $ htmlTag (~== TagClose t)
+ Nothing -> return ()
+
listContinuationLine :: MarkdownParser String
listContinuationLine = try $ do
notFollowedBy blankline
notFollowedBy' listStart
- notFollowedBy' $ htmlTag (~== TagClose "div")
+ notFollowedByHtmlCloser
optional indentSpaces
result <- anyLine
return $ result ++ "\n"
@@ -840,38 +846,53 @@ defListMarker = do
else mzero
return ()
-definitionListItem :: MarkdownParser (F (Inlines, [Blocks]))
-definitionListItem = try $ do
- -- first, see if this has any chance of being a definition list:
- lookAhead (anyLine >> optional blankline >> defListMarker)
- term <- trimInlinesF . mconcat <$> manyTill inline newline
- optional blankline
- raw <- many1 defRawBlock
- state <- getState
- let oldContext = stateParserContext state
- -- parse the extracted block, which may contain various block elements:
+definitionListItem :: Bool -> MarkdownParser (F (Inlines, [Blocks]))
+definitionListItem compact = try $ do
+ rawLine' <- anyLine
+ raw <- many1 $ defRawBlock compact
+ term <- parseFromString (trimInlinesF . mconcat <$> many inline) rawLine'
contents <- mapM (parseFromString parseBlocks) raw
- updateState (\st -> st {stateParserContext = oldContext})
+ optional blanklines
return $ liftM2 (,) term (sequence contents)
-defRawBlock :: MarkdownParser String
-defRawBlock = try $ do
+defRawBlock :: Bool -> MarkdownParser String
+defRawBlock compact = try $ do
+ hasBlank <- option False $ blankline >> return True
defListMarker
firstline <- anyLine
- rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine)
- trailing <- option "" blanklines
- cont <- liftM concat $ many $ do
- lns <- many1 $ notFollowedBy blankline >> indentSpaces >> anyLine
- trl <- option "" blanklines
- return $ unlines lns ++ trl
- return $ firstline ++ "\n" ++ unlines rawlines ++ trailing ++ cont
+ let dline = try
+ ( do notFollowedBy blankline
+ if compact -- laziness not compatible with compact
+ then () <$ indentSpaces
+ else (() <$ indentSpaces)
+ <|> notFollowedBy defListMarker
+ anyLine )
+ rawlines <- many dline
+ cont <- liftM concat $ many $ try $ do
+ trailing <- option "" blanklines
+ ln <- indentSpaces >> notFollowedBy blankline >> anyLine
+ lns <- many dline
+ return $ trailing ++ unlines (ln:lns)
+ return $ trimr (firstline ++ "\n" ++ unlines rawlines ++ cont) ++
+ if hasBlank || not (null cont) then "\n\n" else ""
definitionList :: MarkdownParser (F Blocks)
-definitionList = do
- guardEnabled Ext_definition_lists
- items <- fmap sequence $ many1 definitionListItem
+definitionList = try $ do
+ lookAhead (anyLine >> optional blankline >> defListMarker)
+ compactDefinitionList <|> normalDefinitionList
+
+compactDefinitionList :: MarkdownParser (F Blocks)
+compactDefinitionList = do
+ guardEnabled Ext_compact_definition_lists
+ items <- fmap sequence $ many1 $ definitionListItem True
return $ B.definitionList <$> fmap compactify'DL items
+normalDefinitionList :: MarkdownParser (F Blocks)
+normalDefinitionList = do
+ guardEnabled Ext_definition_lists
+ items <- fmap sequence $ many1 $ definitionListItem False
+ return $ B.definitionList <$> items
+
--
-- paragraph block
--
@@ -914,16 +935,34 @@ htmlElement = rawVerbatimBlock
htmlBlock :: MarkdownParser (F Blocks)
htmlBlock = do
guardEnabled Ext_raw_html
- res <- (guardEnabled Ext_markdown_in_html_blocks >> rawHtmlBlocks)
- <|> htmlBlock'
- return $ return $ B.rawBlock "html" res
-
-htmlBlock' :: MarkdownParser String
+ try (do
+ (TagOpen t attrs) <- lookAhead $ fst <$> htmlTag isBlockTag
+ (guard (t `elem` ["pre","style","script"]) >>
+ (return . B.rawBlock "html") <$> rawVerbatimBlock)
+ <|> (do guardEnabled Ext_markdown_attribute
+ oldMarkdownAttribute <- stateMarkdownAttribute <$> getState
+ markdownAttribute <-
+ case lookup "markdown" attrs of
+ Just "0" -> False <$ updateState (\st -> st{
+ stateMarkdownAttribute = False })
+ Just _ -> True <$ updateState (\st -> st{
+ stateMarkdownAttribute = True })
+ Nothing -> return oldMarkdownAttribute
+ res <- if markdownAttribute
+ then rawHtmlBlocks
+ else htmlBlock'
+ updateState $ \st -> st{ stateMarkdownAttribute =
+ oldMarkdownAttribute }
+ return res)
+ <|> (guardEnabled Ext_markdown_in_html_blocks >> rawHtmlBlocks))
+ <|> htmlBlock'
+
+htmlBlock' :: MarkdownParser (F Blocks)
htmlBlock' = try $ do
first <- htmlElement
- finalSpace <- many spaceChar
- finalNewlines <- many newline
- return $ first ++ finalSpace ++ finalNewlines
+ skipMany spaceChar
+ optional blanklines
+ return $ return $ B.rawBlock "html" first
strictHtmlBlock :: MarkdownParser String
strictHtmlBlock = htmlInBalanced (not . isInlineTag)
@@ -934,48 +973,36 @@ rawVerbatimBlock = try $ do
["pre", "style", "script"])
(const True))
contents <- manyTill anyChar (htmlTag (~== TagClose tag))
- return $ open ++ contents ++ renderTags [TagClose tag]
+ return $ open ++ contents ++ renderTags' [TagClose tag]
rawTeXBlock :: MarkdownParser (F Blocks)
rawTeXBlock = do
guardEnabled Ext_raw_tex
- result <- (B.rawBlock "latex" <$> rawLaTeXBlock)
- <|> (B.rawBlock "context" <$> rawConTeXtEnvironment)
+ result <- (B.rawBlock "latex" . concat <$>
+ rawLaTeXBlock `sepEndBy1` blankline)
+ <|> (B.rawBlock "context" . concat <$>
+ rawConTeXtEnvironment `sepEndBy1` blankline)
spaces
return $ return result
-rawHtmlBlocks :: MarkdownParser String
+rawHtmlBlocks :: MarkdownParser (F Blocks)
rawHtmlBlocks = do
- htmlBlocks <- many1 $ try $ do
- s <- rawVerbatimBlock <|> try (
- do (t,raw) <- htmlTag isBlockTag
- guard $ t ~/= TagOpen "div" [] &&
- t ~/= TagClose "div"
- exts <- getOption readerExtensions
- -- if open tag, need markdown="1" if
- -- markdown_attributes extension is set
- case t of
- TagOpen _ as
- | Ext_markdown_attribute `Set.member`
- exts ->
- if "markdown" `notElem`
- map fst as
- then mzero
- else return $
- stripMarkdownAttribute raw
- | otherwise -> return raw
- _ -> return raw )
- sps <- do sp1 <- many spaceChar
- sp2 <- option "" (blankline >> return "\n")
- sp3 <- many spaceChar
- sp4 <- option "" blanklines
- return $ sp1 ++ sp2 ++ sp3 ++ sp4
- -- note: we want raw html to be able to
- -- precede a code block, when separated
- -- by a blank line
- return $ s ++ sps
- let combined = concat htmlBlocks
- return $ if last combined == '\n' then init combined else combined
+ (TagOpen tagtype _, raw) <- htmlTag isBlockTag
+ -- try to find closing tag
+ -- we set stateInHtmlBlock so that closing tags that can be either block or
+ -- inline will not be parsed as inline tags
+ oldInHtmlBlock <- stateInHtmlBlock <$> getState
+ updateState $ \st -> st{ stateInHtmlBlock = Just tagtype }
+ let closer = htmlTag (\x -> x ~== TagClose tagtype)
+ contents <- mconcat <$> many (notFollowedBy' closer >> block)
+ result <-
+ (closer >>= \(_, rawcloser) -> return (
+ return (B.rawBlock "html" $ stripMarkdownAttribute raw) <>
+ contents <>
+ return (B.rawBlock "html" rawcloser)))
+ <|> return (return (B.rawBlock "html" raw) <> contents)
+ updateState $ \st -> st{ stateInHtmlBlock = oldInHtmlBlock }
+ return result
-- remove markdown="1" attribute
stripMarkdownAttribute :: String -> String
@@ -1163,7 +1190,7 @@ gridPart ch = do
return (length dashes, length dashes + 1)
gridDashedLines :: Char -> Parser [Char] st [(Int,Int)]
-gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) >>~ blankline
+gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) <* blankline
removeFinalBar :: String -> String
removeFinalBar =
@@ -1388,8 +1415,7 @@ escapedChar = do
ltSign :: MarkdownParser (F Inlines)
ltSign = do
guardDisabled Ext_raw_html
- <|> guardDisabled Ext_markdown_in_html_blocks
- <|> (notFollowedBy' (htmlTag isBlockTag) >> return ())
+ <|> (notFollowedByHtmlCloser >> notFollowedBy' (htmlTag isBlockTag))
char '<'
return $ return $ B.str "<"
@@ -1434,52 +1460,60 @@ math = (return . B.displayMath <$> (mathDisplay >>= applyMacros'))
enclosure :: Char
-> MarkdownParser (F Inlines)
enclosure c = do
+ -- we can't start an enclosure with _ if after a string and
+ -- the intraword_underscores extension is enabled:
+ guardDisabled Ext_intraword_underscores
+ <|> guard (c == '*')
+ <|> (guard =<< notAfterString)
cs <- many1 (char c)
(return (B.str cs) <>) <$> whitespace
- <|> case length cs of
+ <|> do
+ case length cs of
3 -> three c
2 -> two c mempty
1 -> one c mempty
_ -> return (return $ B.str cs)
+ender :: Char -> Int -> MarkdownParser ()
+ender c n = try $ do
+ count n (char c)
+ guard (c == '*')
+ <|> guardDisabled Ext_intraword_underscores
+ <|> notFollowedBy alphaNum
+
-- Parse inlines til you hit one c or a sequence of two cs.
-- If one c, emit emph and then parse two.
-- If two cs, emit strong and then parse one.
-- Otherwise, emit ccc then the results.
three :: Char -> MarkdownParser (F Inlines)
three c = do
- contents <- mconcat <$> many (notFollowedBy (char c) >> inline)
- (try (string [c,c,c]) >> return ((B.strong . B.emph) <$> contents))
- <|> (try (string [c,c]) >> one c (B.strong <$> contents))
- <|> (char c >> two c (B.emph <$> contents))
+ contents <- mconcat <$> many (notFollowedBy (ender c 1) >> inline)
+ (ender c 3 >> return ((B.strong . B.emph) <$> contents))
+ <|> (ender c 2 >> one c (B.strong <$> contents))
+ <|> (ender c 1 >> two c (B.emph <$> contents))
<|> return (return (B.str [c,c,c]) <> contents)
-- Parse inlines til you hit two c's, and emit strong.
-- If you never do hit two cs, emit ** plus inlines parsed.
two :: Char -> F Inlines -> MarkdownParser (F Inlines)
two c prefix' = do
- let ender = try $ string [c,c]
- contents <- mconcat <$> many (try $ notFollowedBy ender >> inline)
- (ender >> return (B.strong <$> (prefix' <> contents)))
+ contents <- mconcat <$> many (try $ notFollowedBy (ender c 2) >> inline)
+ (ender c 2 >> return (B.strong <$> (prefix' <> contents)))
<|> return (return (B.str [c,c]) <> (prefix' <> contents))
-- Parse inlines til you hit a c, and emit emph.
-- If you never hit a c, emit * plus inlines parsed.
one :: Char -> F Inlines -> MarkdownParser (F Inlines)
one c prefix' = do
- contents <- mconcat <$> many ( (notFollowedBy (char c) >> inline)
+ contents <- mconcat <$> many ( (notFollowedBy (ender c 1) >> inline)
<|> try (string [c,c] >>
- notFollowedBy (char c) >>
+ notFollowedBy (ender c 1) >>
two c mempty) )
- (char c >> return (B.emph <$> (prefix' <> contents)))
+ (ender c 1 >> return (B.emph <$> (prefix' <> contents)))
<|> return (return (B.str [c]) <> (prefix' <> contents))
strongOrEmph :: MarkdownParser (F Inlines)
-strongOrEmph = enclosure '*' <|> (checkIntraword >> enclosure '_')
- where checkIntraword = do
- exts <- getOption readerExtensions
- when (Ext_intraword_underscores `Set.member` exts) $ do
- guard =<< notAfterString
+strongOrEmph = enclosure '*' <|> enclosure '_'
-- | Parses a list of inlines between start and end delimiters.
inlinesBetween :: (Show b)
@@ -1489,7 +1523,7 @@ inlinesBetween :: (Show b)
inlinesBetween start end =
(trimInlinesF . mconcat) <$> try (start >> many1Till inner end)
where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline)
- innerSpace = try $ whitespace >>~ notFollowedBy' end
+ innerSpace = try $ whitespace <* notFollowedBy' end
strikeout :: MarkdownParser (F Inlines)
strikeout = fmap B.strikeout <$>
@@ -1730,7 +1764,7 @@ inBrackets parser = do
spanHtml :: MarkdownParser (F Inlines)
spanHtml = try $ do
- guardEnabled Ext_markdown_in_html_blocks
+ guardEnabled Ext_native_spans
(TagOpen _ attrs, _) <- htmlTag (~== TagOpen "span" [])
contents <- mconcat <$> manyTill inline (htmlTag (~== TagClose "span"))
let ident = fromMaybe "" $ lookup "id" attrs
@@ -1745,14 +1779,19 @@ spanHtml = try $ do
divHtml :: MarkdownParser (F Blocks)
divHtml = try $ do
- guardEnabled Ext_markdown_in_html_blocks
+ guardEnabled Ext_native_divs
(TagOpen _ attrs, rawtag) <- htmlTag (~== TagOpen "div" [])
+ -- we set stateInHtmlBlock so that closing tags that can be either block or
+ -- inline will not be parsed as inline tags
+ oldInHtmlBlock <- stateInHtmlBlock <$> getState
+ updateState $ \st -> st{ stateInHtmlBlock = Just "div" }
bls <- option "" (blankline >> option "" blanklines)
contents <- mconcat <$>
many (notFollowedBy' (htmlTag (~== TagClose "div")) >> block)
closed <- option False (True <$ htmlTag (~== TagClose "div"))
if closed
then do
+ updateState $ \st -> st{ stateInHtmlBlock = oldInHtmlBlock }
let ident = fromMaybe "" $ lookup "id" attrs
let classes = maybe [] words $ lookup "class" attrs
let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"]
@@ -1763,10 +1802,17 @@ divHtml = try $ do
rawHtmlInline :: MarkdownParser (F Inlines)
rawHtmlInline = do
guardEnabled Ext_raw_html
+ inHtmlBlock <- stateInHtmlBlock <$> getState
+ let isCloseBlockTag t = case inHtmlBlock of
+ Just t' -> t ~== TagClose t'
+ Nothing -> False
mdInHtml <- option False $
- guardEnabled Ext_markdown_in_html_blocks >> return True
+ ( guardEnabled Ext_markdown_in_html_blocks
+ <|> guardEnabled Ext_markdown_attribute
+ ) >> return True
(_,result) <- htmlTag $ if mdInHtml
- then isInlineTag
+ then (\x -> isInlineTag x &&
+ not (isCloseBlockTag x))
else not . isTextTag
return $ return $ B.rawInline "html" result
diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs
index f1dcce8f7..e43b8a86c 100644
--- a/src/Text/Pandoc/Readers/MediaWiki.hs
+++ b/src/Text/Pandoc/Readers/MediaWiki.hs
@@ -569,7 +569,8 @@ endline = () <$ try (newline <*
imageIdentifiers :: [MWParser ()]
imageIdentifiers = [sym (identifier ++ ":") | identifier <- identifiers]
- where identifiers = ["File", "Image", "Archivo", "Datei", "Fichier"]
+ where identifiers = ["File", "Image", "Archivo", "Datei", "Fichier",
+ "Bild"]
image :: MWParser Inlines
image = try $ do
@@ -634,7 +635,7 @@ inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines
inlinesBetween start end =
(trimInlines . mconcat) <$> try (start >> many1Till inner end)
where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline)
- innerSpace = try $ whitespace >>~ notFollowedBy' end
+ innerSpace = try $ whitespace <* notFollowedBy' end
emph :: MWParser Inlines
emph = B.emph <$> nested (inlinesBetween start end)
diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs
index 7a35e2ca0..e1c29d1e8 100644
--- a/src/Text/Pandoc/Readers/Org.hs
+++ b/src/Text/Pandoc/Readers/Org.hs
@@ -41,10 +41,10 @@ import Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF
)
import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline)
import Text.Pandoc.Shared (compactify', compactify'DL)
-import Text.TeXMath (texMathToPandoc, DisplayType(..))
+import Text.TeXMath (readTeX, writePandoc, DisplayType(..))
import Control.Applicative ( Applicative, pure
- , (<$>), (<$), (<*>), (<*), (*>), (<**>) )
+ , (<$>), (<$), (<*>), (<*), (*>) )
import Control.Arrow (first)
import Control.Monad (foldM, guard, liftM, liftM2, mplus, mzero, when)
import Control.Monad.Reader (Reader, runReader, ask, asks)
@@ -274,7 +274,7 @@ optionalAttributes parser = try $
parseBlockAttributes :: OrgParser ()
parseBlockAttributes = do
attrs <- many attribute
- () <$ mapM (uncurry parseAndAddAttribute) attrs
+ mapM_ (uncurry parseAndAddAttribute) attrs
where
attribute :: OrgParser (String, String)
attribute = try $ do
@@ -341,14 +341,36 @@ verseBlock blkProp = try $ do
fmap B.para . mconcat . intersperse (pure B.linebreak)
<$> mapM (parseFromString parseInlines) (lines content)
+exportsCode :: [(String, String)] -> Bool
+exportsCode attrs = not (("rundoc-exports", "none") `elem` attrs
+ || ("rundoc-exports", "results") `elem` attrs)
+
+exportsResults :: [(String, String)] -> Bool
+exportsResults attrs = ("rundoc-exports", "results") `elem` attrs
+ || ("rundoc-exports", "both") `elem` attrs
+
+followingResultsBlock :: OrgParser (Maybe String)
+followingResultsBlock =
+ optionMaybe (try $ blanklines *> stringAnyCase "#+RESULTS:"
+ *> blankline
+ *> (unlines <$> many1 exampleLine))
+
codeBlock :: BlockProperties -> OrgParser (F Blocks)
codeBlock blkProp = do
skipSpaces
- (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders)
- id' <- fromMaybe "" <$> lookupBlockAttribute "name"
- content <- rawBlockContent blkProp
- let codeBlck = B.codeBlockWith ( id', classes, kv ) content
- maybe (pure codeBlck) (labelDiv codeBlck) <$> lookupInlinesAttr "caption"
+ (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders)
+ id' <- fromMaybe "" <$> lookupBlockAttribute "name"
+ content <- rawBlockContent blkProp
+ resultsContent <- followingResultsBlock
+ let includeCode = exportsCode kv
+ let includeResults = exportsResults kv
+ let codeBlck = B.codeBlockWith ( id', classes, kv ) content
+ labelledBlck <- maybe (pure codeBlck)
+ (labelDiv codeBlck)
+ <$> lookupInlinesAttr "caption"
+ let resultBlck = pure $ maybe mempty (exampleCode) resultsContent
+ return $ (if includeCode then labelledBlck else mempty)
+ <> (if includeResults then resultBlck else mempty)
where
labelDiv blk value =
B.divWith nullAttr <$> (mappend <$> labelledBlock value
@@ -780,8 +802,12 @@ noteBlock = try $ do
-- Paragraphs or Plain text
paraOrPlain :: OrgParser (F Blocks)
-paraOrPlain = try $
- parseInlines <**> (fmap <$> option B.plain (try $ newline *> pure B.para))
+paraOrPlain = try $ do
+ ils <- parseInlines
+ nl <- option False (newline >> return True)
+ try (guard nl >> notFollowedBy (orderedListStart <|> bulletListStart) >>
+ return (B.para <$> ils))
+ <|> (return (B.plain <$> ils))
inlinesTillNewline :: OrgParser (F Inlines)
inlinesTillNewline = trimInlinesF . mconcat <$> manyTill inline newline
@@ -1357,7 +1383,7 @@ inlineLaTeX = try $ do
maybe mzero returnF $ parseAsMath cmd `mplus` parseAsInlineLaTeX cmd
where
parseAsMath :: String -> Maybe Inlines
- parseAsMath cs = maybeRight $ B.fromList <$> texMathToPandoc DisplayInline cs
+ parseAsMath cs = B.fromList <$> texMathToPandoc cs
parseAsInlineLaTeX :: String -> Maybe Inlines
parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs
@@ -1365,6 +1391,9 @@ inlineLaTeX = try $ do
state :: ParserState
state = def{ stateOptions = def{ readerParseRaw = True }}
+ texMathToPandoc inp = (maybeRight $ readTeX inp) >>=
+ writePandoc DisplayInline
+
maybeRight :: Either a b -> Maybe b
maybeRight = either (const Nothing) Just
diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs
index fa8438e70..e5eccb116 100644
--- a/src/Text/Pandoc/Readers/RST.hs
+++ b/src/Text/Pandoc/Readers/RST.hs
@@ -47,7 +47,7 @@ import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>))
import qualified Text.Pandoc.Builder as B
import Data.Monoid (mconcat, mempty)
import Data.Sequence (viewr, ViewR(..))
-import Data.Char (toLower)
+import Data.Char (toLower, isHexDigit)
-- | Parse reStructuredText string and return Pandoc document.
readRST :: ReaderOptions -- ^ Reader options
@@ -460,7 +460,7 @@ listItem :: RSTParser Int
listItem start = try $ do
(markerLength, first) <- rawListItem start
rest <- many (listContinuation markerLength)
- blanks <- choice [ try (many blankline >>~ lookAhead start),
+ blanks <- choice [ try (many blankline <* lookAhead start),
many1 blankline ] -- whole list must end with blank.
-- parsing with ListItemState forces markers at beginning of lines to
-- count as list item markers, even if not separated by blank space.
@@ -480,7 +480,7 @@ listItem start = try $ do
orderedList :: RSTParser Blocks
orderedList = try $ do
- (start, style, delim) <- lookAhead (anyOrderedListMarker >>~ spaceChar)
+ (start, style, delim) <- lookAhead (anyOrderedListMarker <* spaceChar)
items <- many1 (listItem (orderedListStart style delim))
let items' = compactify' items
return $ B.orderedListWith (start, style, delim) items'
@@ -656,9 +656,6 @@ extractUnicodeChar s = maybe Nothing (\c -> Just (c,rest)) mbc
where (ds,rest) = span isHexDigit s
mbc = safeRead ('\'':'\\':'x':ds ++ "'")
-isHexDigit :: Char -> Bool
-isHexDigit c = c `elem` "0123456789ABCDEFabcdef"
-
extractCaption :: RSTParser (Inlines, Blocks)
extractCaption = do
capt <- trimInlines . mconcat <$> many inline
@@ -747,7 +744,7 @@ simpleReferenceName = do
referenceName :: RSTParser Inlines
referenceName = quotedReferenceName <|>
- (try $ simpleReferenceName >>~ lookAhead (char ':')) <|>
+ (try $ simpleReferenceName <* lookAhead (char ':')) <|>
unquotedReferenceName
referenceKey :: RSTParser [Char]
@@ -1076,7 +1073,7 @@ explicitLink = try $ do
referenceLink :: RSTParser Inlines
referenceLink = try $ do
- (label',ref) <- withRaw (quotedReferenceName <|> simpleReferenceName) >>~
+ (label',ref) <- withRaw (quotedReferenceName <|> simpleReferenceName) <*
char '_'
state <- getState
let keyTable = stateKeys state
diff --git a/src/Text/Pandoc/Readers/TeXMath.hs b/src/Text/Pandoc/Readers/TeXMath.hs
index f03eae044..3fee3051e 100644
--- a/src/Text/Pandoc/Readers/TeXMath.hs
+++ b/src/Text/Pandoc/Readers/TeXMath.hs
@@ -27,7 +27,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Conversion of TeX math to a list of 'Pandoc' inline elements.
-}
-module Text.Pandoc.Readers.TeXMath ( readTeXMath, readTeXMath' ) where
+module Text.Pandoc.Readers.TeXMath ( texMathToInlines ) where
import Text.Pandoc.Definition
import Text.TeXMath
@@ -35,22 +35,14 @@ import Text.TeXMath
-- | Converts a raw TeX math formula to a list of 'Pandoc' inlines.
-- Defaults to raw formula between @$@ or @$$@ characters if entire formula
-- can't be converted.
-readTeXMath' :: MathType
+texMathToInlines :: MathType
-> String -- ^ String to parse (assumes @'\n'@ line endings)
-> [Inline]
-readTeXMath' mt inp = case texMathToPandoc dt inp of
- Left _ -> [Str (delim ++ inp ++ delim)]
- Right res -> res
+texMathToInlines mt inp =
+ case writePandoc dt `fmap` readTeX inp of
+ Right (Just ils) -> ils
+ _ -> [Str (delim ++ inp ++ delim)]
where (dt, delim) = case mt of
DisplayMath -> (DisplayBlock, "$$")
InlineMath -> (DisplayInline, "$")
-{-# DEPRECATED readTeXMath "Use readTeXMath' from Text.Pandoc.JSON instead" #-}
--- | Converts a raw TeX math formula to a list of 'Pandoc' inlines.
--- Defaults to raw formula between @$@ characters if entire formula
--- can't be converted. (This is provided for backwards compatibility;
--- it is better to use @readTeXMath'@, which properly distinguishes
--- between display and inline math.)
-readTeXMath :: String -- ^ String to parse (assumes @'\n'@ line endings)
- -> [Inline]
-readTeXMath = readTeXMath' InlineMath
diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs
index 6d839ec1d..cd34da942 100644
--- a/src/Text/Pandoc/Readers/Textile.hs
+++ b/src/Text/Pandoc/Readers/Textile.hs
@@ -265,8 +265,20 @@ definitionList :: Parser [Char] ParserState Blocks
definitionList = try $ B.definitionList <$> many1 definitionListItem
-- | List start character.
-listStart :: Parser [Char] st Char
-listStart = oneOf "*#-"
+listStart :: Parser [Char] ParserState ()
+listStart = genericListStart '*'
+ <|> () <$ genericListStart '#'
+ <|> () <$ definitionListStart
+
+genericListStart :: Char -> Parser [Char] st ()
+genericListStart c = () <$ try (many1 (char c) >> whitespace)
+
+definitionListStart :: Parser [Char] ParserState Inlines
+definitionListStart = try $ do
+ char '-'
+ whitespace
+ trimInlines . mconcat <$>
+ many1Till inline (try (string ":=")) <* optional whitespace
listInline :: Parser [Char] ParserState Inlines
listInline = try (notFollowedBy newline >> inline)
@@ -278,8 +290,7 @@ listInline = try (notFollowedBy newline >> inline)
-- break.
definitionListItem :: Parser [Char] ParserState (Inlines, [Blocks])
definitionListItem = try $ do
- string "- "
- term <- mconcat <$> many1Till inline (try (whitespace >> string ":="))
+ term <- definitionListStart
def' <- multilineDef <|> inlineDef
return (term, def')
where inlineDef :: Parser [Char] ParserState [Blocks]
@@ -488,7 +499,7 @@ str = do
return $ B.str fullStr
-- | Some number of space chars
-whitespace :: Parser [Char] ParserState Inlines
+whitespace :: Parser [Char] st Inlines
whitespace = many1 spaceChar >> return B.space <?> "whitespace"
-- | In Textile, an isolated endline character is a line break
@@ -616,7 +627,8 @@ simpleInline border construct = try $ do
attr <- attributes
body <- trimInlines . mconcat <$>
withQuoteContext InSingleQuote
- (manyTill inline (try border <* notFollowedBy alphaNum))
+ (manyTill (notFollowedBy newline >> inline)
+ (try border <* notFollowedBy alphaNum))
return $ construct $
if attr == nullAttr
then body
diff --git a/src/Text/Pandoc/Readers/Txt2Tags.hs b/src/Text/Pandoc/Readers/Txt2Tags.hs
new file mode 100644
index 000000000..3a51b9d84
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Txt2Tags.hs
@@ -0,0 +1,548 @@
+{-# LANGUAGE ViewPatterns #-}
+{-
+Copyright (C) 2014 Matthew Pickering <matthewtpickering@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Txt2Tags
+ Copyright : Copyright (C) 2014 Matthew Pickering
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Matthew Pickering <matthewtpickering@gmail.com>
+
+Conversion of txt2tags formatted plain text to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.Txt2Tags ( readTxt2Tags
+ , getT2TMeta
+ , T2TMeta (..)
+ , readTxt2TagsNoMacros)
+ where
+
+import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Builder ( Inlines, Blocks, (<>)
+ , trimInlines )
+import Text.Pandoc.Definition
+import Text.Pandoc.Options
+import Text.Pandoc.Shared (escapeURI,compactify', compactify'DL)
+import Text.Pandoc.Parsing hiding (space, spaces, uri, macro)
+import Control.Applicative ((<$>), (<$), (<*>), (<*), (*>))
+import Data.Char (toLower)
+import Data.List (transpose, intersperse, intercalate)
+import Data.Maybe (fromMaybe)
+import Data.Monoid (Monoid, mconcat, mempty, mappend)
+--import Network.URI (isURI) -- Not sure whether to use this function
+import Control.Monad (void, guard, when)
+import Data.Default
+import Control.Monad.Reader (Reader, runReader, asks)
+
+import Data.Time.LocalTime (getZonedTime)
+import Text.Pandoc.Compat.Directory(getModificationTime)
+import Data.Time.Format (formatTime)
+import System.Locale (defaultTimeLocale)
+import System.IO.Error (catchIOError)
+
+type T2T = ParserT String ParserState (Reader T2TMeta)
+
+-- | An object for the T2T macros meta information
+-- the contents of each field is simply substituted verbatim into the file
+data T2TMeta = T2TMeta {
+ date :: String -- ^ Current date
+ , mtime :: String -- ^ Last modification time of infile
+ , infile :: FilePath -- ^ Input file
+ , outfile :: FilePath -- ^ Output file
+ } deriving Show
+
+instance Default T2TMeta where
+ def = T2TMeta "" "" "" ""
+
+-- | Get the meta information required by Txt2Tags macros
+getT2TMeta :: [FilePath] -> FilePath -> IO T2TMeta
+getT2TMeta inps out = do
+ curDate <- formatTime defaultTimeLocale "%F" <$> getZonedTime
+ let getModTime = fmap (formatTime defaultTimeLocale "%F") .
+ getModificationTime
+ curMtime <- catchIOError
+ (maximum <$> mapM getModTime inps)
+ (const (return ""))
+ return $ T2TMeta curDate curMtime (intercalate ", " inps) out
+
+-- | Read Txt2Tags from an input string returning a Pandoc document
+readTxt2Tags :: T2TMeta -> ReaderOptions -> String -> Pandoc
+readTxt2Tags t opts s = flip runReader t $ readWithM parseT2T (def {stateOptions = opts}) (s ++ "\n\n")
+
+-- | Read Txt2Tags (ignoring all macros) from an input string returning
+-- a Pandoc document
+readTxt2TagsNoMacros :: ReaderOptions -> String -> Pandoc
+readTxt2TagsNoMacros = readTxt2Tags def
+
+parseT2T :: T2T Pandoc
+parseT2T = do
+ _ <- (Nothing <$ try blankline) <|> (Just <$> (count 3 anyLine))
+ config <- manyTill setting (notFollowedBy setting)
+ -- TODO: Handle settings better
+ let settings = foldr (\(k,v) -> B.setMeta k (MetaString v)) nullMeta config
+ updateState (\s -> s {stateMeta = settings})
+ body <- mconcat <$> manyTill block eof
+ return $ Pandoc mempty (B.toList body)
+
+type Keyword = String
+type Value = String
+
+setting :: T2T (Keyword, Value)
+setting = do
+ string "%!"
+ keyword <- ignoreSpacesCap (many1 alphaNum)
+ char ':'
+ value <- ignoreSpacesCap (manyTill anyChar (newline))
+ return (keyword, value)
+
+-- Blocks
+
+parseBlocks :: T2T Blocks
+parseBlocks = mconcat <$> manyTill block eof
+
+block :: T2T Blocks
+block = do
+ choice
+ [ mempty <$ blanklines
+ , quote
+ , hrule -- hrule must go above title
+ , title
+ , commentBlock
+ , verbatim
+ , rawBlock
+ , taggedBlock
+ , list
+ , table
+ , para
+ ]
+
+title :: T2T Blocks
+title = try $ balancedTitle '+' <|> balancedTitle '='
+
+balancedTitle :: Char -> T2T Blocks
+balancedTitle c = try $ do
+ spaces
+ level <- length <$> many1 (char c)
+ guard (level <= 5) -- Max header level 5
+ heading <- manyTill (noneOf "\n\r") (count level (char c))
+ label <- optionMaybe (enclosed (char '[') (char ']') (alphaNum <|> oneOf "_-"))
+ many spaceChar *> newline
+ let attr = maybe nullAttr (\x -> (x, [], [])) label
+ return $ B.headerWith attr level (trimInlines $ B.text heading)
+
+para :: T2T Blocks
+para = try $ do
+ ils <- parseInlines
+ nl <- option False (True <$ newline)
+ option (B.plain ils) (guard nl >> notFollowedBy listStart >> return (B.para ils))
+ where
+ listStart = try bulletListStart <|> orderedListStart
+
+commentBlock :: T2T Blocks
+commentBlock = try (blockMarkupArea (anyLine) (const mempty) "%%%") <|> comment
+
+-- Seperator and Strong line treated the same
+hrule :: T2T Blocks
+hrule = try $ do
+ spaces
+ line <- many1 (oneOf "=-_")
+ guard (length line >= 20)
+ B.horizontalRule <$ blankline
+
+quote :: T2T Blocks
+quote = try $ do
+ lookAhead tab
+ rawQuote <- many1 (tab *> optional spaces *> anyLine)
+ contents <- parseFromString parseBlocks (intercalate "\n" rawQuote ++ "\n\n")
+ return $ B.blockQuote contents
+
+commentLine :: T2T Inlines
+commentLine = comment
+
+-- List Parsing code from Org Reader
+
+list :: T2T Blocks
+list = choice [bulletList, orderedList, definitionList]
+
+bulletList :: T2T Blocks
+bulletList = B.bulletList . compactify'
+ <$> many1 (listItem bulletListStart parseBlocks)
+
+orderedList :: T2T Blocks
+orderedList = B.orderedList . compactify'
+ <$> many1 (listItem orderedListStart parseBlocks)
+
+definitionList :: T2T Blocks
+definitionList = try $ do
+ B.definitionList . compactify'DL <$>
+ many1 (listItem definitionListStart definitionListEnd)
+
+definitionListEnd :: T2T (Inlines, [Blocks])
+definitionListEnd = (,) <$> (mconcat <$> manyTill inline newline) <*> ((:[]) <$> parseBlocks)
+
+genericListStart :: T2T Char
+ -> T2T Int
+genericListStart listMarker = try $
+ (2+) <$> (length <$> many spaceChar
+ <* listMarker <* space <* notFollowedBy space)
+
+-- parses bullet list \start and returns its length (excl. following whitespace)
+bulletListStart :: T2T Int
+bulletListStart = genericListStart (char '-')
+
+orderedListStart :: T2T Int
+orderedListStart = genericListStart (char '+' )
+
+definitionListStart :: T2T Int
+definitionListStart = genericListStart (char ':')
+
+-- parse raw text for one list item, excluding start marker and continuations
+listItem :: T2T Int
+ -> T2T a
+ -> T2T a
+listItem start end = try $ do
+ markerLength <- try start
+ firstLine <- anyLineNewline
+ blank <- option "" ("\n" <$ blankline)
+ rest <- concat <$> many (listContinuation markerLength)
+ parseFromString end $ firstLine ++ blank ++ rest
+
+-- continuation of a list item - indented and separated by blankline or endline.
+-- Note: nested lists are parsed as continuations.
+listContinuation :: Int
+ -> T2T String
+listContinuation markerLength = try $
+ notFollowedBy' (blankline >> blankline)
+ *> (mappend <$> (concat <$> many1 listLine)
+ <*> many blankline)
+ where listLine = try $ indentWith markerLength *> anyLineNewline
+
+anyLineNewline :: T2T String
+anyLineNewline = (++ "\n") <$> anyLine
+
+indentWith :: Int -> T2T String
+indentWith n = count n space
+
+-- Table
+
+table :: T2T Blocks
+table = try $ do
+ header <- fmap snd <$> option mempty (try headerRow)
+ rows <- many1 (many commentLine *> tableRow)
+ let columns = transpose rows
+ let ncolumns = length columns
+ let aligns = map (foldr1 findAlign) (map (map fst) columns)
+ let rows' = map (map snd) rows
+ let size = maximum (map length rows')
+ let rowsPadded = map (pad size) rows'
+ let headerPadded = if (not (null header)) then pad size header else mempty
+ return $ B.table mempty
+ (zip aligns (replicate ncolumns 0.0))
+ headerPadded rowsPadded
+
+pad :: (Show a, Monoid a) => Int -> [a] -> [a]
+pad n xs = xs ++ (replicate (n - length xs) mempty)
+
+
+findAlign :: Alignment -> Alignment -> Alignment
+findAlign x y
+ | x == y = x
+ | otherwise = AlignDefault
+
+headerRow :: T2T [(Alignment, Blocks)]
+headerRow = genericRow (string "||")
+
+tableRow :: T2T [(Alignment, Blocks)]
+tableRow = genericRow (char '|')
+
+genericRow :: T2T a -> T2T [(Alignment, Blocks)]
+genericRow start = try $ do
+ spaces *> start
+ manyTill tableCell newline <?> "genericRow"
+
+
+tableCell :: T2T (Alignment, Blocks)
+tableCell = try $ do
+ leftSpaces <- length <$> lookAhead (many1 space) -- Case of empty cell means we must lookAhead
+ content <- (manyTill inline (try $ lookAhead (cellEnd)))
+ rightSpaces <- length <$> many space
+ let align =
+ case compare leftSpaces rightSpaces of
+ LT -> AlignLeft
+ EQ -> AlignCenter
+ GT -> AlignRight
+ endOfCell
+ return $ (align, B.plain (B.trimInlines $ mconcat content))
+ where
+ cellEnd = (void newline <|> (many1 space *> endOfCell))
+
+endOfCell :: T2T ()
+endOfCell = try (skipMany1 $ char '|') <|> ( () <$ lookAhead newline)
+
+-- Raw area
+
+verbatim :: T2T Blocks
+verbatim = genericBlock anyLineNewline B.codeBlock "```"
+
+rawBlock :: T2T Blocks
+rawBlock = genericBlock anyLineNewline (B.para . B.str) "\"\"\""
+
+taggedBlock :: T2T Blocks
+taggedBlock = do
+ target <- getTarget
+ genericBlock anyLineNewline (B.rawBlock target) "'''"
+
+-- Generic
+
+genericBlock :: Monoid a => T2T a -> (a -> Blocks) -> String -> T2T Blocks
+genericBlock p f s = blockMarkupArea p f s <|> blockMarkupLine p f s
+
+blockMarkupArea :: Monoid a => (T2T a) -> (a -> Blocks) -> String -> T2T Blocks
+blockMarkupArea p f s = try $ (do
+ string s *> blankline
+ f . mconcat <$> (manyTill p (eof <|> void (string s *> blankline))))
+
+blockMarkupLine :: T2T a -> (a -> Blocks) -> String -> T2T Blocks
+blockMarkupLine p f s = try (f <$> (string s *> space *> p))
+
+-- Can be in either block or inline position
+comment :: Monoid a => T2T a
+comment = try $ do
+ atStart
+ notFollowedBy macro
+ mempty <$ (char '%' *> anyLine)
+
+-- Inline
+
+parseInlines :: T2T Inlines
+parseInlines = trimInlines . mconcat <$> many1 inline
+
+inline :: T2T Inlines
+inline = do
+ choice
+ [ endline
+ , macro
+ , commentLine
+ , whitespace
+ , url
+ , link
+ , image
+ , bold
+ , underline
+ , code
+ , raw
+ , tagged
+ , strike
+ , italic
+ , code
+ , str
+ , symbol
+ ]
+
+bold :: T2T Inlines
+bold = inlineMarkup inline B.strong '*' (B.str)
+
+underline :: T2T Inlines
+underline = inlineMarkup inline B.emph '_' (B.str)
+
+strike :: T2T Inlines
+strike = inlineMarkup inline B.strikeout '-' (B.str)
+
+italic :: T2T Inlines
+italic = inlineMarkup inline B.emph '/' (B.str)
+
+code :: T2T Inlines
+code = inlineMarkup ((:[]) <$> anyChar) B.code '`' id
+
+raw :: T2T Inlines
+raw = inlineMarkup ((:[]) <$> anyChar) B.text '"' id
+
+tagged :: T2T Inlines
+tagged = do
+ target <- getTarget
+ inlineMarkup ((:[]) <$> anyChar) (B.rawInline target) '\'' id
+
+-- Parser for markup indicated by a double character.
+-- Inline markup is greedy and glued
+-- Greedy meaning ***a*** = Bold [Str "*a*"]
+-- Glued meaning that markup must be tight to content
+-- Markup can't pass newlines
+inlineMarkup :: Monoid a
+ => (T2T a) -- Content parser
+ -> (a -> Inlines) -- Constructor
+ -> Char -- Fence
+ -> (String -> a) -- Special Case to handle ******
+ -> T2T Inlines
+inlineMarkup p f c special = try $ do
+ start <- many1 (char c)
+ let l = length start
+ guard (l >= 2)
+ when (l == 2) (void $ notFollowedBy space)
+ -- We must make sure that there is no space before the start of the
+ -- closing tags
+ body <- optionMaybe (try $ manyTill (noneOf "\n\r") $
+ (try $ lookAhead (noneOf " " >> string [c,c] )))
+ case body of
+ Just middle -> do
+ lastChar <- anyChar
+ end <- many1 (char c)
+ let parser inp = parseFromString (mconcat <$> many p) inp
+ let start' = special (drop 2 start)
+ body' <- parser (middle ++ [lastChar])
+ let end' = special (drop 2 end)
+ return $ f (start' <> body' <> end')
+ Nothing -> do -- Either bad or case such as *****
+ guard (l >= 5)
+ let body' = (replicate (l - 4) c)
+ return $ f (special body')
+
+link :: T2T Inlines
+link = try imageLink <|> titleLink
+
+-- Link with title
+titleLink :: T2T Inlines
+titleLink = try $ do
+ char '['
+ notFollowedBy space
+ tokens <- sepBy1 (many $ noneOf " ]") space
+ guard (length tokens >= 2)
+ char ']'
+ let link' = last tokens
+ guard (length link' > 0)
+ let tit = concat (intersperse " " (init tokens))
+ return $ B.link link' "" (B.text tit)
+
+-- Link with image
+imageLink :: T2T Inlines
+imageLink = try $ do
+ char '['
+ body <- image
+ many1 space
+ l <- manyTill (noneOf "\n\r ") (char ']')
+ return (B.link l "" body)
+
+macro :: T2T Inlines
+macro = try $ do
+ name <- string "%%" *> oneOfStringsCI (map fst commands)
+ optional (try $ enclosed (char '(') (char ')') anyChar)
+ lookAhead (spaceChar <|> oneOf specialChars <|> newline)
+ maybe (return mempty) (\f -> B.str <$> asks f) (lookup name commands)
+ where
+ commands = [ ("date", date), ("mtime", mtime)
+ , ("infile", infile), ("outfile", outfile)]
+
+-- raw URLs in text are automatically linked
+url :: T2T Inlines
+url = try $ do
+ (rawUrl, escapedUrl) <- (try uri <|> emailAddress)
+ return $ B.link rawUrl "" (B.str escapedUrl)
+
+uri :: T2T (String, String)
+uri = try $ do
+ address <- t2tURI
+ return (address, escapeURI address)
+
+-- The definition of a URI in the T2T source differs from the
+-- actual definition. This is a transcription of the definition in
+-- the source of v2.6
+--isT2TURI :: String -> Bool
+--isT2TURI (parse t2tURI "" -> Right _) = True
+--isT2TURI _ = False
+
+t2tURI :: T2T String
+t2tURI = do
+ start <- try ((++) <$> proto <*> urlLogin) <|> guess
+ domain <- many1 chars
+ sep <- many (char '/')
+ form' <- option mempty ((:) <$> char '?' <*> many1 form)
+ anchor' <- option mempty ((:) <$> char '#' <*> many anchor)
+ return (start ++ domain ++ sep ++ form' ++ anchor')
+ where
+ protos = ["http", "https", "ftp", "telnet", "gopher", "wais"]
+ proto = (++) <$> oneOfStrings protos <*> string "://"
+ guess = (++) <$> (((++) <$> stringAnyCase "www" <*> option mempty ((:[]) <$> oneOf "23"))
+ <|> stringAnyCase "ftp") <*> ((:[]) <$> char '.')
+ login = alphaNum <|> oneOf "_.-"
+ pass = many (noneOf " @")
+ chars = alphaNum <|> oneOf "%._/~:,=$@&+-"
+ anchor = alphaNum <|> oneOf "%._0"
+ form = chars <|> oneOf ";*"
+ urlLogin = option mempty $ try ((\x y z -> x ++ y ++ [z]) <$> many1 login <*> option mempty ((:) <$> char ':' <*> pass) <*> char '@')
+
+
+image :: T2T Inlines
+image = try $ do
+ -- List taken from txt2tags source
+ let extensions = [".jpg", ".jpeg", ".gif", ".png", ".eps", ".bmp"]
+ char '['
+ path <- manyTill (noneOf "\n\t\r ") (try $ lookAhead (oneOfStrings extensions))
+ ext <- oneOfStrings extensions
+ char ']'
+ return $ B.image (path ++ ext) "" mempty
+
+-- Characters used in markup
+specialChars :: String
+specialChars = "%*-_/|:+"
+
+tab :: T2T Char
+tab = char '\t'
+
+space :: T2T Char
+space = char ' '
+
+spaces :: T2T String
+spaces = many space
+
+endline :: T2T Inlines
+endline = try $ do
+ newline
+ notFollowedBy blankline
+ notFollowedBy hrule
+ notFollowedBy title
+ notFollowedBy verbatim
+ notFollowedBy rawBlock
+ notFollowedBy taggedBlock
+ notFollowedBy quote
+ notFollowedBy list
+ notFollowedBy table
+ return $ B.space
+
+str :: T2T Inlines
+str = try $ do
+ B.str <$> many1 (noneOf $ specialChars ++ "\n\r ")
+
+whitespace :: T2T Inlines
+whitespace = try $ B.space <$ spaceChar
+
+symbol :: T2T Inlines
+symbol = B.str . (:[]) <$> oneOf specialChars
+
+-- Utility
+
+getTarget :: T2T String
+getTarget = do
+ mv <- lookupMeta "target" . stateMeta <$> getState
+ let MetaString target = fromMaybe (MetaString "html") mv
+ return target
+
+atStart :: T2T ()
+atStart = (sourceColumn <$> getPosition) >>= guard . (== 1)
+
+ignoreSpacesCap :: T2T String -> T2T String
+ignoreSpacesCap p = map toLower <$> (spaces *> p <* spaces)
+