diff options
Diffstat (limited to 'src/Text/Pandoc/Readers')
| -rw-r--r-- | src/Text/Pandoc/Readers/DocBook.hs | 7 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 805 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx/Fonts.hs | 238 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx/Lists.hs | 35 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 981 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx/Reducible.hs | 183 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/EPUB.hs | 285 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 341 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Haddock.hs | 40 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/LaTeX.hs | 5 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 236 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/MediaWiki.hs | 5 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Org.hs | 51 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/RST.hs | 13 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/TeXMath.hs | 20 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Textile.hs | 24 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Txt2Tags.hs | 548 |
17 files changed, 2743 insertions, 1074 deletions
diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs index cf1d5132e..1e119e729 100644 --- a/src/Text/Pandoc/Readers/DocBook.hs +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -769,7 +769,12 @@ parseBlock (Elem e) = "" -> [] x -> [x] return $ codeBlockWith (attrValue "id" e, classes', []) - $ trimNl $ strContent e + $ trimNl $ strContentRecursive e + strContentRecursive = strContent . (\e' -> e'{ elContent = + map elementToStr $ elContent e' }) + elementToStr :: Content -> Content + elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing + elementToStr x = x parseBlockquote = do attrib <- case filterChild (named "attribution") e of Nothing -> return mempty diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 09c2330fb..085ee01fc 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -1,3 +1,5 @@ +{-# LANGUAGE PatternGuards #-} + {- Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> @@ -76,48 +78,188 @@ import Codec.Archive.Zip import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Builder (text, toList) -import Text.Pandoc.Generic (bottomUp) -import Text.Pandoc.MIME (getMimeType) -import Text.Pandoc.UTF8 (toString) +import Text.Pandoc.Walk import Text.Pandoc.Readers.Docx.Parse import Text.Pandoc.Readers.Docx.Lists -import Data.Maybe (mapMaybe, isJust, fromJust) -import Data.List (delete, isPrefixOf, (\\), intersect) -import qualified Data.ByteString as BS +import Text.Pandoc.Readers.Docx.Reducible +import Text.Pandoc.Shared +import Text.Pandoc.MediaBag (insertMedia, MediaBag) +import Data.Maybe (mapMaybe) +import Data.List (delete, stripPrefix, (\\), intersect) +import Data.Monoid +import Text.TeXMath (writeTeX) +import Data.Default (Default) import qualified Data.ByteString.Lazy as B -import Data.ByteString.Base64 (encode) -import System.FilePath (combine) +import qualified Data.Map as M +import Control.Monad.Reader +import Control.Monad.State +import Control.Applicative ((<$>)) readDocx :: ReaderOptions -> B.ByteString - -> Pandoc + -> (Pandoc, MediaBag) readDocx opts bytes = case archiveToDocx (toArchive bytes) of - Just docx -> Pandoc nullMeta (docxToBlocks opts docx) - Nothing -> error $ "couldn't parse docx file" - -runStyleToSpanAttr :: RunStyle -> (String, [String], [(String, String)]) -runStyleToSpanAttr rPr = ("", - mapMaybe id [ - if isBold rPr then (Just "strong") else Nothing, - if isItalic rPr then (Just "emph") else Nothing, - if isSmallCaps rPr then (Just "smallcaps") else Nothing, - if isStrike rPr then (Just "strike") else Nothing, - if isSuperScript rPr then (Just "superscript") else Nothing, - if isSubScript rPr then (Just "subscript") else Nothing, - rStyle rPr], - case underline rPr of - Just fmt -> [("underline", fmt)] - _ -> [] - ) - -parStyleToDivAttr :: ParagraphStyle -> (String, [String], [(String, String)]) -parStyleToDivAttr pPr = ("", - pStyle pPr, - case indent pPr of - Just n -> [("indent", (show n))] - Nothing -> [] - ) + Right docx -> (Pandoc meta blks, mediaBag) where + (meta, blks, mediaBag) = (docxToOutput opts docx) + Left _ -> error $ "couldn't parse docx file" + +data DState = DState { docxAnchorMap :: M.Map String String + , docxMediaBag :: MediaBag } + +instance Default DState where + def = DState { docxAnchorMap = M.empty + , docxMediaBag = mempty } + +data DEnv = DEnv { docxOptions :: ReaderOptions + , docxInHeaderBlock :: Bool } + +instance Default DEnv where + def = DEnv def False + +type DocxContext = ReaderT DEnv (State DState) + +evalDocxContext :: DocxContext a -> DEnv -> DState -> a +evalDocxContext ctx env st = evalState (runReaderT ctx env) st + +concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b] +concatMapM f xs = liftM concat (mapM f xs) + +-- This is empty, but we put it in for future-proofing. +spansToKeep :: [String] +spansToKeep = [] + +divsToKeep :: [String] +divsToKeep = ["list-item", "Definition", "DefinitionTerm"] + +metaStyles :: M.Map String String +metaStyles = M.fromList [ ("Title", "title") + , ("Subtitle", "subtitle") + , ("Author", "author") + , ("Date", "date") + , ("Abstract", "abstract")] + +sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart]) +sepBodyParts = span (\bp -> (isMetaPar bp || isEmptyPar bp)) + +isMetaPar :: BodyPart -> Bool +isMetaPar (Paragraph pPr _) = + not $ null $ intersect (pStyle pPr) (M.keys metaStyles) +isMetaPar _ = False + +isEmptyPar :: BodyPart -> Bool +isEmptyPar (Paragraph _ parParts) = + all isEmptyParPart parParts + where + isEmptyParPart (PlainRun (Run _ runElems)) = all isEmptyElem runElems + isEmptyParPart _ = False + isEmptyElem (TextRun s) = trim s == "" + isEmptyElem _ = True +isEmptyPar _ = False + +bodyPartsToMeta' :: [BodyPart] -> DocxContext (M.Map String MetaValue) +bodyPartsToMeta' [] = return M.empty +bodyPartsToMeta' (bp : bps) + | (Paragraph pPr parParts) <- bp + , (c : _)<- intersect (pStyle pPr) (M.keys metaStyles) + , (Just metaField) <- M.lookup c metaStyles = do + inlines <- parPartsToInlines parParts + remaining <- bodyPartsToMeta' bps + let + f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils'] + f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks ((Para ils) : blks) + f m (MetaList mv) = MetaList (m : mv) + f m n = MetaList [m, n] + return $ M.insertWith f metaField (MetaInlines inlines) remaining +bodyPartsToMeta' (_ : bps) = bodyPartsToMeta' bps + +bodyPartsToMeta :: [BodyPart] -> DocxContext Meta +bodyPartsToMeta bps = do + mp <- bodyPartsToMeta' bps + let mp' = + case M.lookup "author" mp of + Just mv -> M.insert "author" (fixAuthors mv) mp + Nothing -> mp + return $ Meta mp' + +fixAuthors :: MetaValue -> MetaValue +fixAuthors (MetaBlocks blks) = + MetaList $ map g $ filter f blks + where f (Para _) = True + f _ = False + g (Para ils) = MetaInlines ils + g _ = MetaInlines [] +fixAuthors mv = mv + +runStyleToContainers :: RunStyle -> [Container Inline] +runStyleToContainers rPr = + let spanClassToContainers :: String -> [Container Inline] + spanClassToContainers s | s `elem` codeSpans = + [Container $ (\ils -> Code ("", [], []) (concatMap ilToCode ils))] + spanClassToContainers s | s `elem` spansToKeep = + [Container $ Span ("", [s], [])] + spanClassToContainers _ = [] + + classContainers = case rStyle rPr of + Nothing -> [] + Just s -> spanClassToContainers s + + formatters = map Container $ mapMaybe id + [ if isBold rPr then (Just Strong) else Nothing + , if isItalic rPr then (Just Emph) else Nothing + , if isSmallCaps rPr then (Just SmallCaps) else Nothing + , if isStrike rPr then (Just Strikeout) else Nothing + , if isSuperScript rPr then (Just Superscript) else Nothing + , if isSubScript rPr then (Just Subscript) else Nothing + , rUnderline rPr >>= + (\f -> if f == "single" then (Just Emph) else Nothing) + ] + in + classContainers ++ formatters + +parStyleToContainers :: ParagraphStyle -> [Container Block] +parStyleToContainers pPr | (c:cs) <- pStyle pPr, Just n <- isHeaderClass c = + [Container $ \_ -> Header n ("", delete ("Heading" ++ show n) cs, []) []] +parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` divsToKeep = + let pPr' = pPr { pStyle = cs } + in + (Container $ Div ("", [c], [])) : (parStyleToContainers pPr') +parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` codeDivs = + -- This is a bit of a cludge. We make the codeblock from the raw + -- parparts in bodyPartToBlocks. But we need something to match against. + let pPr' = pPr { pStyle = cs } + in + (Container $ \_ -> CodeBlock ("", [], []) "") : (parStyleToContainers pPr') +parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` listParagraphDivs = + let pPr' = pPr { pStyle = cs, indentation = Nothing} + in + (Container $ Div ("", [c], [])) : (parStyleToContainers pPr') + +parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` blockQuoteDivs = + let pPr' = pPr { pStyle = cs \\ blockQuoteDivs } + in + (Container BlockQuote) : (parStyleToContainers pPr') +parStyleToContainers pPr | (_:cs) <- pStyle pPr = + let pPr' = pPr { pStyle = cs} + in + parStyleToContainers pPr' +parStyleToContainers pPr | null (pStyle pPr), + Just left <- indentation pPr >>= leftParIndent, + Just hang <- indentation pPr >>= hangingParIndent = + let pPr' = pPr { indentation = Nothing } + in + case (left - hang) > 0 of + True -> (Container BlockQuote) : (parStyleToContainers pPr') + False -> parStyleToContainers pPr' +parStyleToContainers pPr | null (pStyle pPr), + Just left <- indentation pPr >>= leftParIndent = + let pPr' = pPr { indentation = Nothing } + in + case left > 0 of + True -> (Container BlockQuote) : (parStyleToContainers pPr') + False -> parStyleToContainers pPr' +parStyleToContainers _ = [] + strToInlines :: String -> [Inline] strToInlines = toList . text @@ -144,115 +286,103 @@ runElemToString (Tab) = ['\t'] runElemsToString :: [RunElem] -> String runElemsToString = concatMap runElemToString ---- We use this instead of the more general ---- Text.Pandoc.Shared.normalize for reasons of efficiency. For ---- whatever reason, `normalize` makes a run take almost twice as ---- long. (It does more, but this does what we need) -inlineNormalize :: [Inline] -> [Inline] -inlineNormalize [] = [] -inlineNormalize (Str "" : ils) = inlineNormalize ils -inlineNormalize ((Str s) : (Str s') : l) = - inlineNormalize (Str (s++s') : l) -inlineNormalize ((Emph ils) : (Emph ils') : l) = - inlineNormalize $ (Emph $ inlineNormalize (ils ++ ils')) : l -inlineNormalize ((Emph ils) : l) = - Emph (inlineNormalize ils) : (inlineNormalize l) -inlineNormalize ((Strong ils) : (Strong ils') : l) = - inlineNormalize $ (Strong $ inlineNormalize (ils ++ ils')) : l -inlineNormalize ((Strong ils) : l) = - Strong (inlineNormalize ils) : (inlineNormalize l) -inlineNormalize ((Strikeout ils) : (Strikeout ils') : l) = - inlineNormalize $ (Strikeout $ inlineNormalize (ils ++ ils')) : l -inlineNormalize ((Strikeout ils) : l) = - Strikeout (inlineNormalize ils) : (inlineNormalize l) -inlineNormalize ((Superscript ils) : (Superscript ils') : l) = - inlineNormalize $ (Superscript $ inlineNormalize (ils ++ ils')) : l -inlineNormalize ((Superscript ils) : l) = - Superscript (inlineNormalize ils) : (inlineNormalize l) -inlineNormalize ((Subscript ils) : (Subscript ils') : l) = - inlineNormalize $ (Subscript $ inlineNormalize (ils ++ ils')) : l -inlineNormalize ((Subscript ils) : l) = - Subscript (inlineNormalize ils) : (inlineNormalize l) -inlineNormalize ((Space : Space : l)) = - inlineNormalize $ (Space : l) -inlineNormalize ((Quoted qt ils) : l) = - Quoted qt (inlineNormalize ils) : inlineNormalize l -inlineNormalize ((Cite cits ils) : l) = - let - f :: Citation -> Citation - f (Citation s pref suff mode num hash) = - Citation s (inlineNormalize pref) (inlineNormalize suff) mode num hash - in - Cite (map f cits) (inlineNormalize ils) : (inlineNormalize l) -inlineNormalize ((Link ils s) : l) = - Link (inlineNormalize ils) s : (inlineNormalize l) -inlineNormalize ((Image ils s) : l) = - Image (inlineNormalize ils) s : (inlineNormalize l) -inlineNormalize ((Note blks) : l) = - Note (map blockNormalize blks) : (inlineNormalize l) -inlineNormalize ((Span attr ils) : l) = - Span attr (inlineNormalize ils) : (inlineNormalize l) -inlineNormalize (il : l) = il : (inlineNormalize l) - -stripSpaces :: [Inline] -> [Inline] -stripSpaces ils = - reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) ils - -blockNormalize :: Block -> Block -blockNormalize (Plain ils) = Plain $ stripSpaces $ inlineNormalize ils -blockNormalize (Para ils) = Para $ stripSpaces $ inlineNormalize ils -blockNormalize (Header n attr ils) = - Header n attr $ stripSpaces $ inlineNormalize ils -blockNormalize (Table ils align width hdr cells) = - Table (stripSpaces $ inlineNormalize ils) align width hdr cells -blockNormalize (DefinitionList pairs) = - DefinitionList $ map (\(ils, blklsts) -> (stripSpaces (inlineNormalize ils), (map (map blockNormalize) blklsts))) pairs -blockNormalize (BlockQuote blks) = BlockQuote (map blockNormalize blks) -blockNormalize (OrderedList attr blkslst) = - OrderedList attr $ map (\blks -> map blockNormalize blks) blkslst -blockNormalize (BulletList blkslst) = - BulletList $ map (\blks -> map blockNormalize blks) blkslst -blockNormalize (Div attr blks) = Div attr (map blockNormalize blks) -blockNormalize blk = blk - -runToInlines :: ReaderOptions -> Docx -> Run -> [Inline] -runToInlines _ _ (Run rs runElems) - | isJust (rStyle rs) && (fromJust (rStyle rs)) `elem` codeSpans = - case runStyleToSpanAttr rs == ("", [], []) of - True -> [Str (runElemsToString runElems)] - False -> [Span (runStyleToSpanAttr rs) [Str (runElemsToString runElems)]] - | otherwise = case runStyleToSpanAttr rs == ("", [], []) of - True -> concatMap runElemToInlines runElems - False -> [Span (runStyleToSpanAttr rs) (concatMap runElemToInlines runElems)] -runToInlines opts docx@(Docx _ notes _ _ _ ) (Footnote fnId) = - case (getFootNote fnId notes) of - Just bodyParts -> - [Note [Div ("", ["footnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]] - Nothing -> - [Note [Div ("", ["footnote"], []) []]] -runToInlines opts docx@(Docx _ notes _ _ _) (Endnote fnId) = - case (getEndNote fnId notes) of - Just bodyParts -> - [Note [Div ("", ["endnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]] - Nothing -> - [Note [Div ("", ["endnote"], []) []]] - -parPartToInlines :: ReaderOptions -> Docx -> ParPart -> [Inline] -parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r -parPartToInlines _ _ (BookMark _ anchor) = - [Span (anchor, ["anchor"], []) []] -parPartToInlines _ (Docx _ _ _ rels _) (Drawing relid) = - case lookupRelationship relid rels of - Just target -> [Image [] (combine "word" target, "")] - Nothing -> [Image [] ("", "")] -parPartToInlines opts docx (InternalHyperLink anchor runs) = - [Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")] -parPartToInlines opts docx@(Docx _ _ _ rels _) (ExternalHyperLink relid runs) = - case lookupRelationship relid rels of - Just target -> - [Link (concatMap (runToInlines opts docx) runs) (target, "")] - Nothing -> - [Link (concatMap (runToInlines opts docx) runs) ("", "")] +runToString :: Run -> String +runToString (Run _ runElems) = runElemsToString runElems +runToString _ = "" + +parPartToString :: ParPart -> String +parPartToString (PlainRun run) = runToString run +parPartToString (InternalHyperLink _ runs) = concatMap runToString runs +parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs +parPartToString _ = "" + + +inlineCodeContainer :: Container Inline -> Bool +inlineCodeContainer (Container f) = case f [] of + Code _ "" -> True + _ -> False +inlineCodeContainer _ = False + + +runToInlines :: Run -> DocxContext [Inline] +runToInlines (Run rs runElems) + | any inlineCodeContainer (runStyleToContainers rs) = + return $ + rebuild (runStyleToContainers rs) $ [Str $ runElemsToString runElems] + | otherwise = + return $ + rebuild (runStyleToContainers rs) (concatMap runElemToInlines runElems) +runToInlines (Footnote bps) = + concatMapM bodyPartToBlocks bps >>= (\blks -> return [Note blks]) +runToInlines (Endnote bps) = + concatMapM bodyPartToBlocks bps >>= (\blks -> return [Note blks]) +runToInlines (InlineDrawing fp bs) = do + mediaBag <- gets docxMediaBag + modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag } + return [Image [] (fp, "")] + + + + +parPartToInlines :: ParPart -> DocxContext [Inline] +parPartToInlines (PlainRun r) = runToInlines r +parPartToInlines (Insertion _ author date runs) = do + opts <- asks docxOptions + case readerTrackChanges opts of + AcceptChanges -> concatMapM runToInlines runs >>= return + RejectChanges -> return [] + AllChanges -> do + ils <- (concatMapM runToInlines runs) + return [Span + ("", ["insertion"], [("author", author), ("date", date)]) + ils] +parPartToInlines (Deletion _ author date runs) = do + opts <- asks docxOptions + case readerTrackChanges opts of + AcceptChanges -> return [] + RejectChanges -> concatMapM runToInlines runs >>= return + AllChanges -> do + ils <- concatMapM runToInlines runs + return [Span + ("", ["deletion"], [("author", author), ("date", date)]) + ils] +parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = return [] +parPartToInlines (BookMark _ anchor) = + -- We record these, so we can make sure not to overwrite + -- user-defined anchor links with header auto ids. + do + -- get whether we're in a header. + inHdrBool <- asks docxInHeaderBlock + -- Get the anchor map. + anchorMap <- gets docxAnchorMap + -- We don't want to rewrite if we're in a header, since we'll take + -- care of that later, when we make the header anchor. If the + -- bookmark were already in uniqueIdent form, this would lead to a + -- duplication. Otherwise, we check to see if the id is already in + -- there. Rewrite if necessary. This will have the possible effect + -- of rewriting user-defined anchor links. However, since these + -- are not defined in pandoc, it seems like a necessary evil to + -- avoid an extra pass. + let newAnchor = + if not inHdrBool && anchor `elem` (M.elems anchorMap) + then uniqueIdent [Str anchor] (M.elems anchorMap) + else anchor + unless inHdrBool + (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap}) + return [Span (newAnchor, ["anchor"], []) []] +parPartToInlines (Drawing fp bs) = do + mediaBag <- gets docxMediaBag + modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag } + return [Image [] (fp, "")] +parPartToInlines (InternalHyperLink anchor runs) = do + ils <- concatMapM runToInlines runs + return [Link ils ('#' : anchor, "")] +parPartToInlines (ExternalHyperLink target runs) = do + ils <- concatMapM runToInlines runs + return [Link ils (target, "")] +parPartToInlines (PlainOMath exps) = do + return [Math InlineMath (writeTeX exps)] + isAnchorSpan :: Inline -> Bool isAnchorSpan (Span (ident, classes, kvs) ils) = @@ -265,74 +395,106 @@ isAnchorSpan _ = False dummyAnchors :: [String] dummyAnchors = ["_GoBack"] -makeHeaderAnchors :: Block -> Block -makeHeaderAnchors h@(Header n (_, classes, kvs) ils) = - case filter isAnchorSpan ils of - [] -> h - (x@(Span (ident, _, _) _) : xs) -> - case ident `elem` dummyAnchors of - True -> h - False -> Header n (ident, classes, kvs) (ils \\ (x:xs)) - _ -> h -makeHeaderAnchors blk = blk - - -parPartsToInlines :: ReaderOptions -> Docx -> [ParPart] -> [Inline] -parPartsToInlines opts docx parparts = - -- - -- We're going to skip data-uri's for now. It should be an option, - -- not mandatory. - -- - (if False -- TODO depend on option - then bottomUp (makeImagesSelfContained docx) - else id) $ - bottomUp spanTrim $ - bottomUp spanCorrect $ - bottomUp spanReduce $ - concatMap (parPartToInlines opts docx) parparts - -cellToBlocks :: ReaderOptions -> Docx -> Cell -> [Block] -cellToBlocks opts docx (Cell bps) = map (bodyPartToBlock opts docx) bps - -rowToBlocksList :: ReaderOptions -> Docx -> Row -> [[Block]] -rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells - -bodyPartToBlock :: ReaderOptions -> Docx -> BodyPart -> Block -bodyPartToBlock opts docx (Paragraph pPr parparts) = - Div (parStyleToDivAttr pPr) [Para (parPartsToInlines opts docx parparts)] -bodyPartToBlock opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parparts) = +makeHeaderAnchor :: Block -> DocxContext Block +-- If there is an anchor already there (an anchor span in the header, +-- to be exact), we rename and associate the new id with the old one. +makeHeaderAnchor (Header n (_, classes, kvs) ils) + | (x : xs) <- filter isAnchorSpan ils + , (Span (ident, _, _) _) <- x + , notElem ident dummyAnchors = + do + hdrIDMap <- gets docxAnchorMap + let newIdent = uniqueIdent ils (M.elems hdrIDMap) + modify $ \s -> s {docxAnchorMap = M.insert ident newIdent hdrIDMap} + return $ Header n (newIdent, classes, kvs) (ils \\ (x:xs)) +-- Otherwise we just give it a name, and register that name (associate +-- it with itself.) +makeHeaderAnchor (Header n (_, classes, kvs) ils) = + do + hdrIDMap <- gets docxAnchorMap + let newIdent = uniqueIdent ils (M.elems hdrIDMap) + modify $ \s -> s {docxAnchorMap = M.insert newIdent newIdent hdrIDMap} + return $ Header n (newIdent, classes, kvs) ils +makeHeaderAnchor blk = return blk + + +parPartsToInlines :: [ParPart] -> DocxContext [Inline] +parPartsToInlines parparts = do + ils <- concatMapM parPartToInlines parparts + return $ reduceList $ ils + +cellToBlocks :: Cell -> DocxContext [Block] +cellToBlocks (Cell bps) = concatMapM bodyPartToBlocks bps + +rowToBlocksList :: Row -> DocxContext [[Block]] +rowToBlocksList (Row cells) = mapM cellToBlocks cells + +isBlockCodeContainer :: Container Block -> Bool +isBlockCodeContainer (Container f) | CodeBlock _ _ <- f [] = True +isBlockCodeContainer _ = False + +isHeaderContainer :: Container Block -> Bool +isHeaderContainer (Container f) | Header _ _ _ <- f [] = True +isHeaderContainer _ = False + +bodyPartToBlocks :: BodyPart -> DocxContext [Block] +bodyPartToBlocks (Paragraph pPr parparts) + | any isBlockCodeContainer (parStyleToContainers pPr) = + let + otherConts = filter (not . isBlockCodeContainer) (parStyleToContainers pPr) + in + return $ + rebuild + otherConts + [CodeBlock ("", [], []) (concatMap parPartToString parparts)] +bodyPartToBlocks (Paragraph pPr parparts) + | any isHeaderContainer (parStyleToContainers pPr) = do + ils <- normalizeSpaces <$> local (\s -> s{docxInHeaderBlock = True}) + (parPartsToInlines parparts) + let (Container hdrFun) = head $ filter isHeaderContainer (parStyleToContainers pPr) + Header n attr _ = hdrFun [] + hdr <- makeHeaderAnchor $ Header n attr ils + return [hdr] +bodyPartToBlocks (Paragraph pPr parparts) = do + ils <- parPartsToInlines parparts >>= (return . normalizeSpaces) + case ils of + [] -> return [] + _ -> do + return $ + rebuild + (parStyleToContainers pPr) + [Para ils] +bodyPartToBlocks (ListItem pPr numId lvl levelInfo parparts) = do let - kvs = case lookupLevel numId lvl numbering of - Just (_, fmt, txt, Just start) -> [ ("level", lvl) - , ("num-id", numId) - , ("format", fmt) - , ("text", txt) - , ("start", (show start)) - ] - - Just (_, fmt, txt, Nothing) -> [ ("level", lvl) - , ("num-id", numId) - , ("format", fmt) - , ("text", txt) - ] - Nothing -> [] - in - Div - ("", ["list-item"], kvs) - [bodyPartToBlock opts docx (Paragraph pPr parparts)] -bodyPartToBlock _ _ (Tbl _ _ _ []) = - Para [] -bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) = + kvs = case levelInfo of + (_, fmt, txt, Just start) -> [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + , ("start", (show start)) + ] + + (_, fmt, txt, Nothing) -> [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + ] + blks <- bodyPartToBlocks (Paragraph pPr parparts) + return $ [Div ("", ["list-item"], kvs) blks] +bodyPartToBlocks (Tbl _ _ _ []) = + return [Para []] +bodyPartToBlocks (Tbl cap _ look (r:rs)) = do let caption = strToInlines cap (hdr, rows) = case firstRowFormatting look of True -> (Just r, rs) False -> (Nothing, r:rs) - hdrCells = case hdr of - Just r' -> rowToBlocksList opts docx r' - Nothing -> [] - cells = map (rowToBlocksList opts docx) rows + hdrCells <- case hdr of + Just r' -> rowToBlocksList r' + Nothing -> return [] - size = case null hdrCells of + cells <- mapM rowToBlocksList rows + + let size = case null hdrCells of True -> length $ head cells False -> length $ hdrCells -- @@ -341,208 +503,49 @@ bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) = -- moment. Width information is in the TblGrid field of the Tbl, -- so should be possible. Alignment might be more difficult, -- since there doesn't seem to be a column entity in docx. - alignments = take size (repeat AlignDefault) - widths = take size (repeat 0) :: [Double] - in - Table caption alignments widths hdrCells cells - -makeImagesSelfContained :: Docx -> Inline -> Inline -makeImagesSelfContained (Docx _ _ _ _ media) i@(Image alt (uri, title)) = - case lookup uri media of - Just bs -> case getMimeType uri of - Just mime -> let data_uri = - "data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs) - in - Image alt (data_uri, title) - Nothing -> i - Nothing -> i -makeImagesSelfContained _ inline = inline - -bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block] -bodyToBlocks opts docx (Body bps) = - bottomUp removeEmptyPars $ - map blockNormalize $ - bottomUp spanRemove $ - bottomUp divRemove $ - map (makeHeaderAnchors) $ - bottomUp divCorrect $ - bottomUp divReduce $ - bottomUp divCorrectPreReduce $ - bottomUp blocksToDefinitions $ - blocksToBullets $ - map (bodyPartToBlock opts docx) bps - -docxToBlocks :: ReaderOptions -> Docx -> [Block] -docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = bodyToBlocks opts d body - -spanReduce :: [Inline] -> [Inline] -spanReduce [] = [] -spanReduce ((Span (id1, classes1, kvs1) ils1) : ils) - | (id1, classes1, kvs1) == ("", [], []) = ils1 ++ (spanReduce ils) -spanReduce (s1@(Span (id1, classes1, kvs1) ils1) : - s2@(Span (id2, classes2, kvs2) ils2) : - ils) = - let classes' = classes1 `intersect` classes2 - kvs' = kvs1 `intersect` kvs2 - classes1' = classes1 \\ classes' - kvs1' = kvs1 \\ kvs' - classes2' = classes2 \\ classes' - kvs2' = kvs2 \\ kvs' - in - case null classes' && null kvs' of - True -> s1 : (spanReduce (s2 : ils)) - False -> let attr' = ("", classes', kvs') - attr1' = (id1, classes1', kvs1') - attr2' = (id2, classes2', kvs2') - in - spanReduce (Span attr' [(Span attr1' ils1), (Span attr2' ils2)] : - ils) -spanReduce (il:ils) = il : (spanReduce ils) + alignments = replicate size AlignDefault + widths = replicate size 0 :: [Double] + + return [Table caption alignments widths hdrCells cells] +bodyPartToBlocks (OMathPara e) = do + return [Para [Math DisplayMath (writeTeX e)]] + + +-- replace targets with generated anchors. +rewriteLink :: Inline -> DocxContext Inline +rewriteLink l@(Link ils ('#':target, title)) = do + anchorMap <- gets docxAnchorMap + return $ case M.lookup target anchorMap of + Just newTarget -> (Link ils ('#':newTarget, title)) + Nothing -> l +rewriteLink il = return il + +bodyToOutput :: Body -> DocxContext (Meta, [Block], MediaBag) +bodyToOutput (Body bps) = do + let (metabps, blkbps) = sepBodyParts bps + meta <- bodyPartsToMeta metabps + blks <- concatMapM bodyPartToBlocks blkbps >>= + walkM rewriteLink + mediaBag <- gets docxMediaBag + return $ (meta, + blocksToDefinitions $ blocksToBullets $ blks, + mediaBag) + +docxToOutput :: ReaderOptions -> Docx -> (Meta, [Block], MediaBag) +docxToOutput opts (Docx (Document _ body)) = + let dEnv = def { docxOptions = opts} in + evalDocxContext (bodyToOutput body) dEnv def + ilToCode :: Inline -> String ilToCode (Str s) = s -ilToCode _ = "" - -spanRemove' :: Inline -> [Inline] -spanRemove' s@(Span (ident, classes, _) []) - -- "_GoBack" is automatically inserted. We don't want to keep it. - | classes == ["anchor"] && not (ident `elem` dummyAnchors) = [s] -spanRemove' (Span (_, _, kvs) ils) = - case lookup "underline" kvs of - Just val -> [Span ("", [], [("underline", val)]) ils] - Nothing -> ils -spanRemove' il = [il] - -spanRemove :: [Inline] -> [Inline] -spanRemove = concatMap spanRemove' - -spanTrim' :: Inline -> [Inline] -spanTrim' il@(Span _ []) = [il] -spanTrim' il@(Span attr (il':[])) - | il' == Space = [Span attr [], Space] - | otherwise = [il] -spanTrim' (Span attr ils) - | head ils == Space && last ils == Space = - [Space, Span attr (init $ tail ils), Space] - | head ils == Space = [Space, Span attr (tail ils)] - | last ils == Space = [Span attr (init ils), Space] -spanTrim' il = [il] - -spanTrim :: [Inline] -> [Inline] -spanTrim = concatMap spanTrim' - -spanCorrect' :: Inline -> [Inline] -spanCorrect' (Span ("", [], []) ils) = ils -spanCorrect' (Span (ident, classes, kvs) ils) - | "emph" `elem` classes = - [Emph $ spanCorrect' $ Span (ident, (delete "emph" classes), kvs) ils] - | "strong" `elem` classes = - [Strong $ spanCorrect' $ Span (ident, (delete "strong" classes), kvs) ils] - | "smallcaps" `elem` classes = - [SmallCaps $ spanCorrect' $ Span (ident, (delete "smallcaps" classes), kvs) ils] - | "strike" `elem` classes = - [Strikeout $ spanCorrect' $ Span (ident, (delete "strike" classes), kvs) ils] - | "superscript" `elem` classes = - [Superscript $ spanCorrect' $ Span (ident, (delete "superscript" classes), kvs) ils] - | "subscript" `elem` classes = - [Subscript $ spanCorrect' $ Span (ident, (delete "subscript" classes), kvs) ils] - | (not . null) (codeSpans `intersect` classes) = - [Code (ident, (classes \\ codeSpans), kvs) (init $ unlines $ map ilToCode ils)] - | otherwise = - [Span (ident, classes, kvs) ils] -spanCorrect' il = [il] - -spanCorrect :: [Inline] -> [Inline] -spanCorrect = concatMap spanCorrect' - -removeEmptyPars :: [Block] -> [Block] -removeEmptyPars blks = filter (\b -> b /= (Para [])) blks - -divReduce :: [Block] -> [Block] -divReduce [] = [] -divReduce ((Div (id1, classes1, kvs1) blks1) : blks) - | (id1, classes1, kvs1) == ("", [], []) = blks1 ++ (divReduce blks) -divReduce (d1@(Div (id1, classes1, kvs1) blks1) : - d2@(Div (id2, classes2, kvs2) blks2) : - blks) = - let classes' = classes1 `intersect` classes2 - kvs' = kvs1 `intersect` kvs2 - classes1' = classes1 \\ classes' - kvs1' = kvs1 \\ kvs' - classes2' = classes2 \\ classes' - kvs2' = kvs2 \\ kvs' - in - case null classes' && null kvs' of - True -> d1 : (divReduce (d2 : blks)) - False -> let attr' = ("", classes', kvs') - attr1' = (id1, classes1', kvs1') - attr2' = (id2, classes2', kvs2') - in - divReduce (Div attr' [(Div attr1' blks1), (Div attr2' blks2)] : - blks) -divReduce (blk:blks) = blk : (divReduce blks) +ilToCode Space = " " +ilToCode _ = "" isHeaderClass :: String -> Maybe Int -isHeaderClass s | "Heading" `isPrefixOf` s = - case reads (drop (length "Heading") s) :: [(Int, String)] of +isHeaderClass s | Just s' <- stripPrefix "Heading" s = + case reads s' :: [(Int, String)] of [] -> Nothing ((n, "") : []) -> Just n _ -> Nothing isHeaderClass _ = Nothing - -findHeaderClass :: [String] -> Maybe Int -findHeaderClass ss = case mapMaybe id $ map isHeaderClass ss of - [] -> Nothing - n : _ -> Just n - -blksToInlines :: [Block] -> [Inline] -blksToInlines (Para ils : _) = ils -blksToInlines (Plain ils : _) = ils -blksToInlines _ = [] - -divCorrectPreReduce' :: Block -> [Block] -divCorrectPreReduce' (Div (ident, classes, kvs) blks) - | isJust $ findHeaderClass classes = - let n = fromJust $ findHeaderClass classes - in - [Header n (ident, delete ("Heading" ++ (show n)) classes, kvs) (blksToInlines blks)] - | otherwise = [Div (ident, classes, kvs) blks] -divCorrectPreReduce' blk = [blk] - -divCorrectPreReduce :: [Block] -> [Block] -divCorrectPreReduce = concatMap divCorrectPreReduce' - -blkToCode :: Block -> String -blkToCode (Para []) = "" -blkToCode (Para ((Code _ s):ils)) = s ++ (blkToCode (Para ils)) -blkToCode (Para ((Span (_, classes, _) ils'): ils)) - | (not . null) (codeSpans `intersect` classes) = - (init $ unlines $ map ilToCode ils') ++ (blkToCode (Para ils)) -blkToCode _ = "" - -divRemove' :: Block -> [Block] -divRemove' (Div (_, _, kvs) blks) = - case lookup "indent" kvs of - Just val -> [Div ("", [], [("indent", val)]) blks] - Nothing -> blks -divRemove' blk = [blk] - -divRemove :: [Block] -> [Block] -divRemove = concatMap divRemove' - -divCorrect' :: Block -> [Block] -divCorrect' b@(Div (ident, classes, kvs) blks) - | (not . null) (blockQuoteDivs `intersect` classes) = - [BlockQuote [Div (ident, classes \\ blockQuoteDivs, kvs) blks]] - | (not . null) (codeDivs `intersect` classes) = - [CodeBlock (ident, (classes \\ codeDivs), kvs) (init $ unlines $ map blkToCode blks)] - | otherwise = - case lookup "indent" kvs of - Just "0" -> [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks] - Just _ -> - [BlockQuote [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]] - Nothing -> [b] -divCorrect' blk = [blk] - -divCorrect :: [Block] -> [Block] -divCorrect = concatMap divCorrect' diff --git a/src/Text/Pandoc/Readers/Docx/Fonts.hs b/src/Text/Pandoc/Readers/Docx/Fonts.hs new file mode 100644 index 000000000..b44c71412 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Fonts.hs @@ -0,0 +1,238 @@ +{- +Copyright (C) 2014 Matthew Pickering <matthewtpickering@gmail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Fonts + Copyright : Copyright (C) 2014 Matthew Pickering + License : GNU GPL, version 2 or above + + Maintainer : Matthew Pickering <matthewtpickering@gmail.com> + Stability : alpha + Portability : portable + +Utilities to convert between font codepoints and unicode characters. +-} +module Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) where + + +-- | Enumeration of recognised fonts +data Font = Symbol -- ^ <http://en.wikipedia.org/wiki/Symbol_(typeface) Adobe Symbol> + deriving (Show, Eq) + +-- | Given a font and codepoint, returns the corresponding unicode +-- character +getUnicode :: Font -> Char -> Maybe Char +getUnicode Symbol c = lookup c symbol + +-- Generated from lib/fonts/symbol.txt +symbol :: [(Char, Char)] +symbol = + [ (' ',' ') + , (' ','\160') + , ('!','!') + , ('"','\8704') + , ('#','#') + , ('$','\8707') + , ('%','%') + , ('&','&') + , ('\'','\8715') + , ('(','(') + , (')',')') + , ('*','\8727') + , ('+','+') + , (',',',') + , ('-','\8722') + , ('.','.') + , ('/','/') + , ('0','0') + , ('1','1') + , ('2','2') + , ('3','3') + , ('4','4') + , ('5','5') + , ('6','6') + , ('7','7') + , ('8','8') + , ('9','9') + , (':',':') + , (';',';') + , ('<','<') + , ('=','=') + , ('>','>') + , ('?','?') + , ('@','\8773') + , ('A','\913') + , ('B','\914') + , ('C','\935') + , ('D','\916') + , ('D','\8710') + , ('E','\917') + , ('F','\934') + , ('G','\915') + , ('H','\919') + , ('I','\921') + , ('J','\977') + , ('K','\922') + , ('L','\923') + , ('M','\924') + , ('N','\925') + , ('O','\927') + , ('P','\928') + , ('Q','\920') + , ('R','\929') + , ('S','\931') + , ('T','\932') + , ('U','\933') + , ('V','\962') + , ('W','\937') + , ('W','\8486') + , ('X','\926') + , ('Y','\936') + , ('Z','\918') + , ('[','[') + , ('\\','\8756') + , (']',']') + , ('^','\8869') + , ('_','_') + , ('`','\63717') + , ('a','\945') + , ('b','\946') + , ('c','\967') + , ('d','\948') + , ('e','\949') + , ('f','\966') + , ('g','\947') + , ('h','\951') + , ('i','\953') + , ('j','\981') + , ('k','\954') + , ('l','\955') + , ('m','\181') + , ('m','\956') + , ('n','\957') + , ('o','\959') + , ('p','\960') + , ('q','\952') + , ('r','\961') + , ('s','\963') + , ('t','\964') + , ('u','\965') + , ('v','\982') + , ('w','\969') + , ('x','\958') + , ('y','\968') + , ('z','\950') + , ('{','{') + , ('|','|') + , ('}','}') + , ('~','\8764') + , ('\160','\8364') + , ('\161','\978') + , ('\162','\8242') + , ('\163','\8804') + , ('\164','\8260') + , ('\164','\8725') + , ('\165','\8734') + , ('\166','\402') + , ('\167','\9827') + , ('\168','\9830') + , ('\169','\9829') + , ('\170','\9824') + , ('\171','\8596') + , ('\172','\8592') + , ('\173','\8593') + , ('\174','\8594') + , ('\175','\8595') + , ('\176','\176') + , ('\177','\177') + , ('\178','\8243') + , ('\179','\8805') + , ('\180','\215') + , ('\181','\8733') + , ('\182','\8706') + , ('\183','\8226') + , ('\184','\247') + , ('\185','\8800') + , ('\186','\8801') + , ('\187','\8776') + , ('\188','\8230') + , ('\189','\63718') + , ('\190','\63719') + , ('\191','\8629') + , ('\192','\8501') + , ('\193','\8465') + , ('\194','\8476') + , ('\195','\8472') + , ('\196','\8855') + , ('\197','\8853') + , ('\198','\8709') + , ('\199','\8745') + , ('\200','\8746') + , ('\201','\8835') + , ('\202','\8839') + , ('\203','\8836') + , ('\204','\8834') + , ('\205','\8838') + , ('\206','\8712') + , ('\207','\8713') + , ('\208','\8736') + , ('\209','\8711') + , ('\210','\63194') + , ('\211','\63193') + , ('\212','\63195') + , ('\213','\8719') + , ('\214','\8730') + , ('\215','\8901') + , ('\216','\172') + , ('\217','\8743') + , ('\218','\8744') + , ('\219','\8660') + , ('\220','\8656') + , ('\221','\8657') + , ('\222','\8658') + , ('\223','\8659') + , ('\224','\9674') + , ('\225','\9001') + , ('\226','\63720') + , ('\227','\63721') + , ('\228','\63722') + , ('\229','\8721') + , ('\230','\63723') + , ('\231','\63724') + , ('\232','\63725') + , ('\233','\63726') + , ('\234','\63727') + , ('\235','\63728') + , ('\236','\63729') + , ('\237','\63730') + , ('\238','\63731') + , ('\239','\63732') + , ('\241','\9002') + , ('\242','\8747') + , ('\243','\8992') + , ('\244','\63733') + , ('\245','\8993') + , ('\246','\63734') + , ('\247','\63735') + , ('\248','\63736') + , ('\249','\63737') + , ('\250','\63738') + , ('\251','\63739') + , ('\252','\63740') + , ('\253','\63741') + , ('\254','\63742')] diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs index 68559d98b..ea195c14a 100644 --- a/src/Text/Pandoc/Readers/Docx/Lists.hs +++ b/src/Text/Pandoc/Readers/Docx/Lists.hs @@ -29,9 +29,12 @@ Functions for converting flat docx paragraphs into nested lists. -} module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets - , blocksToDefinitions) where + , blocksToDefinitions + , listParagraphDivs + ) where import Text.Pandoc.JSON +import Text.Pandoc.Generic (bottomUp) import Text.Pandoc.Shared (trim) import Control.Monad import Data.List @@ -118,7 +121,7 @@ handleListParagraphs ( in handleListParagraphs ((Div attr1 (blks1 ++ [newDiv2])) : blks) handleListParagraphs (blk:blks) = blk : (handleListParagraphs blks) - + separateBlocks' :: Block -> [[Block]] -> [[Block]] separateBlocks' blk ([] : []) = [[blk]] separateBlocks' b@(BulletList _) acc = (init acc) ++ [(last acc) ++ [b]] @@ -136,7 +139,7 @@ flatToBullets' :: Integer -> [Block] -> [Block] flatToBullets' _ [] = [] flatToBullets' num xs@(b : elems) | getLevelN b == num = b : (flatToBullets' num elems) - | otherwise = + | otherwise = let bNumId = getNumIdN b bLevel = getLevelN b (children, remaining) = @@ -159,10 +162,9 @@ flatToBullets elems = flatToBullets' (-1) elems blocksToBullets :: [Block] -> [Block] blocksToBullets blks = - -- bottomUp removeListItemDivs $ + bottomUp removeListDivs $ flatToBullets $ (handleListParagraphs blks) - plainParaInlines :: Block -> [Inline] plainParaInlines (Plain ils) = ils plainParaInlines (Para ils) = ils @@ -199,10 +201,27 @@ blocksToDefinitions' [] acc (b:blks) = blocksToDefinitions' defAcc acc (b:blks) = blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks +removeListDivs' :: Block -> [Block] +removeListDivs' (Div (ident, classes, kvs) blks) + | "list-item" `elem` classes = + case delete "list-item" classes of + [] -> blks + classes' -> [Div (ident, classes', kvs) $ blks] +removeListDivs' (Div (ident, classes, kvs) blks) + | not $ null $ listParagraphDivs `intersect` classes = + case classes \\ listParagraphDivs of + [] -> blks + classes' -> [Div (ident, classes', kvs) blks] +removeListDivs' blk = [blk] + +removeListDivs :: [Block] -> [Block] +removeListDivs = concatMap removeListDivs' + + blocksToDefinitions :: [Block] -> [Block] blocksToDefinitions = blocksToDefinitions' [] [] - - - + + + diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 18200bcf9..1abd4bc6b 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -1,3 +1,5 @@ +{-# LANGUAGE PatternGuards, ViewPatterns #-} + {- Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> @@ -8,49 +10,46 @@ the Free Software Foundation; either version 2 of the License, or This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | - Module : Text.Pandoc.Readers.Docx.Parse - Copyright : Copyright (C) 2014 Jesse Rosenthal - License : GNU GPL, version 2 or above + Module : Text.Pandoc.Readers.Docx.Parse + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above - Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> - Stability : alpha - Portability : portable + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable Conversion of docx archive into Docx haskell type -} +module Text.Pandoc.Readers.Docx.Parse ( Docx(..) + , Document(..) + , Body(..) + , BodyPart(..) + , TblLook(..) + , ParPart(..) + , Run(..) + , RunElem(..) + , Notes + , Numbering + , Relationship + , Media + , RunStyle(..) + , ParIndentation(..) + , ParagraphStyle(..) + , Row(..) + , Cell(..) + , archiveToDocx + ) where -module Text.Pandoc.Readers.Docx.Parse ( Docx(..) - , Document(..) - , Body(..) - , BodyPart(..) - , TblLook(..) - , ParPart(..) - , Run(..) - , RunElem(..) - , Notes - , Numbering - , Relationship - , Media - , RunStyle(..) - , ParagraphStyle(..) - , Row(..) - , Cell(..) - , getFootNote - , getEndNote - , lookupLevel - , lookupRelationship - , archiveToDocx - ) where import Codec.Archive.Zip import Text.XML.Light import Data.Maybe @@ -59,39 +58,244 @@ import System.FilePath import Data.Bits ((.|.)) import qualified Data.ByteString.Lazy as B import qualified Text.Pandoc.UTF8 as UTF8 +import Control.Monad.Reader +import Control.Applicative ((<$>), (<|>)) +import qualified Data.Map as M +import Text.Pandoc.Compat.Except +import Text.TeXMath.Readers.OMML (readOMML) +import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) +import Text.TeXMath (Exp) +import Data.Char (readLitChar, ord, chr) + +data ReaderEnv = ReaderEnv { envNotes :: Notes + , envNumbering :: Numbering + , envRelationships :: [Relationship] + , envMedia :: Media + , envFont :: Maybe Font + } + deriving Show + +data DocxError = DocxError | WrongElem + deriving Show + +instance Error DocxError where + noMsg = WrongElem + +type D = ExceptT DocxError (Reader ReaderEnv) + +runD :: D a -> ReaderEnv -> Either DocxError a +runD dx re = runReader (runExceptT dx ) re + +maybeToD :: Maybe a -> D a +maybeToD (Just a) = return a +maybeToD Nothing = throwError DocxError + +eitherToD :: Either a b -> D b +eitherToD (Right b) = return b +eitherToD (Left _) = throwError DocxError + +concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b] +concatMapM f xs = liftM concat (mapM f xs) + + +-- This is similar to `mapMaybe`: it maps a function returning the D +-- monad over a list, and only keeps the non-erroring return values. +mapD :: (a -> D b) -> [a] -> D [b] +mapD f xs = + let handler x = (f x >>= (\y-> return [y])) `catchError` (\_ -> return []) + in + concatMapM handler xs -attrToNSPair :: Attr -> Maybe (String, String) -attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) -attrToNSPair _ = Nothing +type NameSpaces = [(String, String)] +data Docx = Docx Document + deriving Show -type NameSpaces = [(String, String)] +data Document = Document NameSpaces Body + deriving Show -data Docx = Docx Document Notes Numbering [Relationship] Media +data Body = Body [BodyPart] deriving Show -archiveToDocx :: Archive -> Maybe Docx +type Media = [(FilePath, B.ByteString)] + +data Numbering = Numbering NameSpaces [Numb] [AbstractNumb] + deriving Show + +data Numb = Numb String String -- right now, only a key to an abstract num + deriving Show + +data AbstractNumb = AbstractNumb String [Level] + deriving Show + +-- (ilvl, format, string, start) +type Level = (String, String, String, Maybe Integer) + +data Relationship = Relationship (RelId, Target) + deriving Show + +data Notes = Notes NameSpaces + (Maybe (M.Map String Element)) + (Maybe (M.Map String Element)) + deriving Show + +data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer + , rightParIndent :: Maybe Integer + , hangingParIndent :: Maybe Integer} + deriving Show + +data ParagraphStyle = ParagraphStyle { pStyle :: [String] + , indentation :: Maybe ParIndentation + } + deriving Show + +defaultParagraphStyle :: ParagraphStyle +defaultParagraphStyle = ParagraphStyle { pStyle = [] + , indentation = Nothing + } + + +data BodyPart = Paragraph ParagraphStyle [ParPart] + | ListItem ParagraphStyle String String Level [ParPart] + | Tbl String TblGrid TblLook [Row] + | OMathPara [Exp] + deriving Show + +type TblGrid = [Integer] + +data TblLook = TblLook {firstRowFormatting::Bool} + deriving Show + +defaultTblLook :: TblLook +defaultTblLook = TblLook{firstRowFormatting = False} + +data Row = Row [Cell] + deriving Show + +data Cell = Cell [BodyPart] + deriving Show + +data ParPart = PlainRun Run + | Insertion ChangeId Author ChangeDate [Run] + | Deletion ChangeId Author ChangeDate [Run] + | BookMark BookMarkId Anchor + | InternalHyperLink Anchor [Run] + | ExternalHyperLink URL [Run] + | Drawing FilePath B.ByteString + | PlainOMath [Exp] + deriving Show + +data Run = Run RunStyle [RunElem] + | Footnote [BodyPart] + | Endnote [BodyPart] + | InlineDrawing FilePath B.ByteString + deriving Show + +data RunElem = TextRun String | LnBrk | Tab + deriving Show + +data RunStyle = RunStyle { isBold :: Bool + , isItalic :: Bool + , isSmallCaps :: Bool + , isStrike :: Bool + , isSuperScript :: Bool + , isSubScript :: Bool + , rUnderline :: Maybe String + , rStyle :: Maybe String } + deriving Show + +defaultRunStyle :: RunStyle +defaultRunStyle = RunStyle { isBold = False + , isItalic = False + , isSmallCaps = False + , isStrike = False + , isSuperScript = False + , isSubScript = False + , rUnderline = Nothing + , rStyle = Nothing + } + + +type Target = String +type Anchor = String +type URL = String +type BookMarkId = String +type RelId = String +type ChangeId = String +type Author = String +type ChangeDate = String + +attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) +attrToNSPair _ = Nothing + +archiveToDocx :: Archive -> Either DocxError Docx archiveToDocx archive = do - let notes = archiveToNotes archive - rels = archiveToRelationships archive - media = archiveToMedia archive - doc <- archiveToDocument archive - numbering <- archiveToNumbering archive - return $ Docx doc notes numbering rels media - -data Document = Document NameSpaces Body - deriving Show + let notes = archiveToNotes archive + numbering = archiveToNumbering archive + rels = archiveToRelationships archive + media = archiveToMedia archive + rEnv = ReaderEnv notes numbering rels media Nothing + doc <- runD (archiveToDocument archive) rEnv + return $ Docx doc + -archiveToDocument :: Archive -> Maybe Document +archiveToDocument :: Archive -> D Document archiveToDocument zf = do - entry <- findEntryByPath "word/document.xml" zf - docElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry - let namespaces = mapMaybe attrToNSPair (elAttribs docElem) - bodyElem <- findChild (QName "body" (lookup "w" namespaces) Nothing) docElem + entry <- maybeToD $ findEntryByPath "word/document.xml" zf + docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + let namespaces = mapMaybe attrToNSPair (elAttribs docElem) + bodyElem <- maybeToD $ findChild (elemName namespaces "w" "body") docElem body <- elemToBody namespaces bodyElem return $ Document namespaces body -type Media = [(FilePath, B.ByteString)] +elemToBody :: NameSpaces -> Element -> D Body +elemToBody ns element | isElem ns "w" "body" element = + mapD (elemToBodyPart ns) (elChildren element) >>= + (\bps -> return $ Body bps) +elemToBody _ _ = throwError WrongElem + +archiveToNotes :: Archive -> Notes +archiveToNotes zf = + let fnElem = findEntryByPath "word/footnotes.xml" zf + >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + enElem = findEntryByPath "word/endnotes.xml" zf + >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + fn_namespaces = case fnElem of + Just e -> mapMaybe attrToNSPair (elAttribs e) + Nothing -> [] + en_namespaces = case enElem of + Just e -> mapMaybe attrToNSPair (elAttribs e) + Nothing -> [] + ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces + fn = fnElem >>= (elemToNotes ns "footnote") + en = enElem >>= (elemToNotes ns "endnote") + in + Notes ns fn en + +filePathIsRel :: FilePath -> Bool +filePathIsRel fp = + let (dir, name) = splitFileName fp + in + (dir == "word/_rels/") && ((takeExtension name) == ".rels") + +relElemToRelationship :: Element -> Maybe Relationship +relElemToRelationship element | qName (elName element) == "Relationship" = + do + relId <- findAttr (QName "Id" Nothing Nothing) element + target <- findAttr (QName "Target" Nothing Nothing) element + return $ Relationship (relId, target) +relElemToRelationship _ = Nothing + + +archiveToRelationships :: Archive -> [Relationship] +archiveToRelationships archive = + let relPaths = filter filePathIsRel (filesInArchive archive) + entries = mapMaybe (\f -> findEntryByPath f archive) relPaths + relElems = mapMaybe (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries + rels = mapMaybe relElemToRelationship $ concatMap elChildren relElems + in + rels filePathIsMedia :: FilePath -> Bool filePathIsMedia fp = @@ -109,18 +313,6 @@ archiveToMedia :: Archive -> Media archiveToMedia zf = mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf)) -data Numbering = Numbering NameSpaces [Numb] [AbstractNumb] - deriving Show - -data Numb = Numb String String -- right now, only a key to an abstract num - deriving Show - -data AbstractNumb = AbstractNumb String [Level] - deriving Show - --- (ilvl, format, string, start) -type Level = (String, String, String, Maybe Integer) - lookupLevel :: String -> String -> Numbering -> Maybe Level lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs @@ -148,7 +340,7 @@ absNumElemToAbsNum ns element | let levelElems = findChildren (QName "lvl" (lookup "w" ns) (Just "w")) element - levels = mapMaybe id $ map (levelElemToLevel ns) levelElems + levels = mapMaybe (levelElemToLevel ns) levelElems return $ AbstractNumb absNumId levels absNumElemToAbsNum _ _ = Nothing @@ -167,8 +359,8 @@ levelElemToLevel ns element | return (ilvl, fmt, txt, start) levelElemToLevel _ _ = Nothing -archiveToNumbering :: Archive -> Maybe Numbering -archiveToNumbering zf = +archiveToNumbering' :: Archive -> Maybe Numbering +archiveToNumbering' zf = do case findEntryByPath "word/numbering.xml" zf of Nothing -> Just $ Numbering [] [] [] Just entry -> do @@ -180,321 +372,281 @@ archiveToNumbering zf = absNumElems = findChildren (QName "abstractNum" (lookup "w" namespaces) (Just "w")) numberingElem - nums = mapMaybe id $ map (numElemToNum namespaces) numElems - absNums = mapMaybe id $ map (absNumElemToAbsNum namespaces) absNumElems + nums = mapMaybe (numElemToNum namespaces) numElems + absNums = mapMaybe (absNumElemToAbsNum namespaces) absNumElems return $ Numbering namespaces nums absNums -data Notes = Notes NameSpaces (Maybe [(String, [BodyPart])]) (Maybe [(String, [BodyPart])]) - deriving Show - -noteElemToNote :: NameSpaces -> Element -> Maybe (String, [BodyPart]) -noteElemToNote ns element - | qName (elName element) `elem` ["endnote", "footnote"] && - qURI (elName element) == (lookup "w" ns) = - do - noteId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element - let bps = map fromJust - $ filter isJust - $ map (elemToBodyPart ns) - $ filterChildrenName (isParOrTbl ns) element - return $ (noteId, bps) -noteElemToNote _ _ = Nothing - -getFootNote :: String -> Notes -> Maybe [BodyPart] -getFootNote s (Notes _ fns _) = fns >>= (lookup s) - -getEndNote :: String -> Notes -> Maybe [BodyPart] -getEndNote s (Notes _ _ ens) = ens >>= (lookup s) +archiveToNumbering :: Archive -> Numbering +archiveToNumbering archive = + fromMaybe (Numbering [] [] []) (archiveToNumbering' archive) -elemToNotes :: NameSpaces -> String -> Element -> Maybe [(String, [BodyPart])] +elemToNotes :: NameSpaces -> String -> Element -> Maybe (M.Map String Element) elemToNotes ns notetype element - | qName (elName element) == (notetype ++ "s") && - qURI (elName element) == (lookup "w" ns) = - Just $ map fromJust - $ filter isJust - $ map (noteElemToNote ns) - $ findChildren (QName notetype (lookup "w" ns) (Just "w")) element + | isElem ns "w" (notetype ++ "s") element = + let pairs = mapMaybe + (\e -> findAttr (elemName ns "w" "id") e >>= + (\a -> Just (a, e))) + (findChildren (elemName ns "w" notetype) element) + in + Just $ M.fromList $ pairs elemToNotes _ _ _ = Nothing -archiveToNotes :: Archive -> Notes -archiveToNotes zf = - let fnElem = findEntryByPath "word/footnotes.xml" zf - >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) - enElem = findEntryByPath "word/endnotes.xml" zf - >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) - fn_namespaces = case fnElem of - Just e -> mapMaybe attrToNSPair (elAttribs e) - Nothing -> [] - en_namespaces = case enElem of - Just e -> mapMaybe attrToNSPair (elAttribs e) - Nothing -> [] - ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces - fn = fnElem >>= (elemToNotes ns "footnote") - en = enElem >>= (elemToNotes ns "endnote") - in - Notes ns fn en +--------------------------------------------- +--------------------------------------------- +elemName :: NameSpaces -> String -> String -> QName +elemName ns prefix name = (QName name (lookup prefix ns) (Just prefix)) -data Relationship = Relationship (RelId, Target) - deriving Show +isElem :: NameSpaces -> String -> String -> Element -> Bool +isElem ns prefix name element = + qName (elName element) == name && + qURI (elName element) == (lookup prefix ns) -lookupRelationship :: RelId -> [Relationship] -> Maybe Target -lookupRelationship relid rels = - lookup relid (map (\(Relationship pair) -> pair) rels) -filePathIsRel :: FilePath -> Bool -filePathIsRel fp = - let (dir, name) = splitFileName fp +elemToTblGrid :: NameSpaces -> Element -> D TblGrid +elemToTblGrid ns element | isElem ns "w" "tblGrid" element = + let cols = findChildren (elemName ns "w" "gridCol") element in - (dir == "word/_rels/") && ((takeExtension name) == ".rels") - -relElemToRelationship :: Element -> Maybe Relationship -relElemToRelationship element | qName (elName element) == "Relationship" = - do - relId <- findAttr (QName "Id" Nothing Nothing) element - target <- findAttr (QName "Target" Nothing Nothing) element - return $ Relationship (relId, target) -relElemToRelationship _ = Nothing - - -archiveToRelationships :: Archive -> [Relationship] -archiveToRelationships archive = - let relPaths = filter filePathIsRel (filesInArchive archive) - entries = map fromJust $ filter isJust $ map (\f -> findEntryByPath f archive) relPaths - relElems = map fromJust $ filter isJust $ map (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries - rels = map fromJust $ filter isJust $ map relElemToRelationship $ concatMap elChildren relElems + mapD (\e -> maybeToD (findAttr (elemName ns "w" "val") e >>= stringToInteger)) + cols +elemToTblGrid _ _ = throwError WrongElem + +elemToTblLook :: NameSpaces -> Element -> D TblLook +elemToTblLook ns element | isElem ns "w" "tblLook" element = + let firstRow = findAttr (elemName ns "w" "firstRow") element + val = findAttr (elemName ns "w" "val") element + firstRowFmt = + case firstRow of + Just "1" -> True + Just _ -> False + Nothing -> case val of + Just bitMask -> testBitMask bitMask 0x020 + Nothing -> False in - rels - -data Body = Body [BodyPart] - deriving Show + return $ TblLook{firstRowFormatting = firstRowFmt} +elemToTblLook _ _ = throwError WrongElem -isParOrTbl :: NameSpaces -> QName -> Bool -isParOrTbl ns q = qName q `elem` ["p", "tbl"] && - qURI q == (lookup "w" ns) +elemToRow :: NameSpaces -> Element -> D Row +elemToRow ns element | isElem ns "w" "tr" element = + do + let cellElems = findChildren (elemName ns "w" "tc") element + cells <- mapD (elemToCell ns) cellElems + return $ Row cells +elemToRow _ _ = throwError WrongElem -elemToBody :: NameSpaces -> Element -> Maybe Body -elemToBody ns element | qName (elName element) == "body" && qURI (elName element) == (lookup "w" ns) = - Just $ Body - $ map fromJust - $ filter isJust - $ map (elemToBodyPart ns) $ filterChildrenName (isParOrTbl ns) element -elemToBody _ _ = Nothing +elemToCell :: NameSpaces -> Element -> D Cell +elemToCell ns element | isElem ns "w" "tc" element = + do + cellContents <- mapD (elemToBodyPart ns) (elChildren element) + return $ Cell cellContents +elemToCell _ _ = throwError WrongElem + +elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation +elemToParIndentation ns element | isElem ns "w" "ind" element = + Just $ ParIndentation { + leftParIndent = + findAttr (QName "left" (lookup "w" ns) (Just "w")) element >>= + stringToInteger + , rightParIndent = + findAttr (QName "right" (lookup "w" ns) (Just "w")) element >>= + stringToInteger + , hangingParIndent = + findAttr (QName "hanging" (lookup "w" ns) (Just "w")) element >>= + stringToInteger} +elemToParIndentation _ _ = Nothing -isRunOrLinkOrBookmark :: NameSpaces -> QName -> Bool -isRunOrLinkOrBookmark ns q = qName q `elem` ["r", "hyperlink", "bookmarkStart"] && - qURI q == (lookup "w" ns) elemToNumInfo :: NameSpaces -> Element -> Maybe (String, String) -elemToNumInfo ns element - | qName (elName element) == "p" && - qURI (elName element) == (lookup "w" ns) = - do - pPr <- findChild (QName "pPr" (lookup "w" ns) (Just "w")) element - numPr <- findChild (QName "numPr" (lookup "w" ns) (Just "w")) pPr - lvl <- findChild (QName "ilvl" (lookup "w" ns) (Just "w")) numPr >>= - findAttr (QName "val" (lookup "w" ns) (Just "w")) - numId <- findChild (QName "numId" (lookup "w" ns) (Just "w")) numPr >>= - findAttr (QName "val" (lookup "w" ns) (Just "w")) - return (numId, lvl) +elemToNumInfo ns element | isElem ns "w" "p" element = do + let pPr = findChild (elemName ns "w" "pPr") element + numPr = pPr >>= findChild (elemName ns "w" "numPr") + lvl <- numPr >>= + findChild (elemName ns "w" "ilvl") >>= + findAttr (elemName ns "w" "val") + numId <- numPr >>= + findChild (elemName ns "w" "numId") >>= + findAttr (elemName ns "w" "val") + return (numId, lvl) elemToNumInfo _ _ = Nothing --- isBookMarkTag :: NameSpaces -> QName -> Bool --- isBookMarkTag ns q = qName q `elem` ["bookmarkStart", "bookmarkEnd"] && --- qURI q == (lookup "w" ns) - --- parChildrenToBookmark :: NameSpaces -> [Element] -> BookMark --- parChildrenToBookmark ns (bms : bme : _) --- | qName (elName bms) == "bookmarkStart" && --- qURI (elName bms) == (lookup "w" ns) && --- qName (elName bme) == "bookmarkEnd" && --- qURI (elName bme) == (lookup "w" ns) = do --- bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) bms --- bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) bms --- return $ (bmId, bmName) --- parChildrenToBookmark _ _ = Nothing - -elemToBodyPart :: NameSpaces -> Element -> Maybe BodyPart -elemToBodyPart ns element - | qName (elName element) == "p" && - qURI (elName element) == (lookup "w" ns) = - let parstyle = elemToParagraphStyle ns element - parparts = mapMaybe id - $ map (elemToParPart ns) - $ filterChildrenName (isRunOrLinkOrBookmark ns) element - in - case elemToNumInfo ns element of - Just (numId, lvl) -> Just $ ListItem parstyle numId lvl parparts - Nothing -> Just $ Paragraph parstyle parparts - | qName (elName element) == "tbl" && - qURI (elName element) == (lookup "w" ns) = - let - caption = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element - >>= findChild (QName "tblCaption" (lookup "w" ns) (Just "w")) - >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) - grid = case - findChild (QName "tblGrid" (lookup "w" ns) (Just "w")) element - of - Just g -> elemToTblGrid ns g - Nothing -> [] - tblLook = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element - >>= findChild (QName "tblLook" (lookup "w" ns) (Just "w")) - >>= elemToTblLook ns - in - Just $ Tbl - (fromMaybe "" caption) - grid - (fromMaybe defaultTblLook tblLook) - (mapMaybe (elemToRow ns) (elChildren element)) - | otherwise = Nothing - -elemToTblLook :: NameSpaces -> Element -> Maybe TblLook -elemToTblLook ns element - | qName (elName element) == "tblLook" && - qURI (elName element) == (lookup "w" ns) = - let firstRow = findAttr (QName "firstRow" (lookup "w" ns) (Just "w")) element - val = findAttr (QName "val" (lookup "w" ns) (Just "w")) element - firstRowFmt = - case firstRow of - Just "1" -> True - Just _ -> False - Nothing -> case val of - Just bitMask -> testBitMask bitMask 0x020 - Nothing -> False - in - Just $ TblLook{firstRowFormatting = firstRowFmt} -elemToTblLook _ _ = Nothing - testBitMask :: String -> Int -> Bool testBitMask bitMaskS n = case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of [] -> False ((n', _) : _) -> ((n' .|. n) /= 0) -data ParagraphStyle = ParagraphStyle { pStyle :: [String] - , indent :: Maybe Integer - } - deriving Show - -defaultParagraphStyle :: ParagraphStyle -defaultParagraphStyle = ParagraphStyle { pStyle = [] - , indent = Nothing - } - -elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle -elemToParagraphStyle ns element = - case findChild (QName "pPr" (lookup "w" ns) (Just "w")) element of - Just pPr -> - ParagraphStyle - {pStyle = - mapMaybe id $ - map - (findAttr (QName "val" (lookup "w" ns) (Just "w"))) - (findChildren (QName "pStyle" (lookup "w" ns) (Just "w")) pPr) - , indent = - findChild (QName "ind" (lookup "w" ns) (Just "w")) pPr >>= - findAttr (QName "left" (lookup "w" ns) (Just "w")) >>= - stringToInteger - } - Nothing -> defaultParagraphStyle - - -data BodyPart = Paragraph ParagraphStyle [ParPart] - | ListItem ParagraphStyle String String [ParPart] - | Tbl String TblGrid TblLook [Row] - - deriving Show - -type TblGrid = [Integer] - -data TblLook = TblLook {firstRowFormatting::Bool} - deriving Show - -defaultTblLook :: TblLook -defaultTblLook = TblLook{firstRowFormatting = False} - stringToInteger :: String -> Maybe Integer stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) -elemToTblGrid :: NameSpaces -> Element -> TblGrid -elemToTblGrid ns element - | qName (elName element) == "tblGrid" && - qURI (elName element) == (lookup "w" ns) = - let - cols = findChildren (QName "gridCol" (lookup "w" ns) (Just "w")) element - in - mapMaybe (\e -> - findAttr (QName "val" (lookup "w" ns) (Just ("w"))) e - >>= stringToInteger - ) - cols -elemToTblGrid _ _ = [] - -data Row = Row [Cell] - deriving Show - - -elemToRow :: NameSpaces -> Element -> Maybe Row -elemToRow ns element - | qName (elName element) == "tr" && - qURI (elName element) == (lookup "w" ns) = - let - cells = findChildren (QName "tc" (lookup "w" ns) (Just "w")) element - in - Just $ Row (mapMaybe (elemToCell ns) cells) -elemToRow _ _ = Nothing +elemToBodyPart :: NameSpaces -> Element -> D BodyPart +elemToBodyPart ns element + | isElem ns "w" "p" element + , (c:_) <- findChildren (elemName ns "m" "oMathPara") element = + do + expsLst <- eitherToD $ readOMML $ showElement c + return $ OMathPara expsLst +elemToBodyPart ns element + | isElem ns "w" "p" element + , Just (numId, lvl) <- elemToNumInfo ns element = do + let parstyle = elemToParagraphStyle ns element + parparts <- mapD (elemToParPart ns) (elChildren element) + num <- asks envNumbering + case lookupLevel numId lvl num of + Just levelInfo -> return $ ListItem parstyle numId lvl levelInfo parparts + Nothing -> throwError WrongElem +elemToBodyPart ns element + | isElem ns "w" "p" element = do + let parstyle = elemToParagraphStyle ns element + parparts <- mapD (elemToParPart ns) (elChildren element) + return $ Paragraph parstyle parparts +elemToBodyPart ns element + | isElem ns "w" "tbl" element = do + let caption' = findChild (elemName ns "w" "tblPr") element + >>= findChild (elemName ns "w" "tblCaption") + >>= findAttr (elemName ns "w" "val") + caption = (fromMaybe "" caption') + grid' = case findChild (elemName ns "w" "tblGrid") element of + Just g -> elemToTblGrid ns g + Nothing -> return [] + tblLook' = case findChild (elemName ns "w" "tblPr") element >>= + findChild (elemName ns "w" "tblLook") + of + Just l -> elemToTblLook ns l + Nothing -> return defaultTblLook + + grid <- grid' + tblLook <- tblLook' + rows <- mapD (elemToRow ns) (elChildren element) + return $ Tbl caption grid tblLook rows +elemToBodyPart _ _ = throwError WrongElem -data Cell = Cell [BodyPart] - deriving Show +lookupRelationship :: RelId -> [Relationship] -> Maybe Target +lookupRelationship relid rels = + lookup relid (map (\(Relationship pair) -> pair) rels) -elemToCell :: NameSpaces -> Element -> Maybe Cell -elemToCell ns element - | qName (elName element) == "tc" && - qURI (elName element) == (lookup "w" ns) = - Just $ Cell (mapMaybe (elemToBodyPart ns) (elChildren element)) -elemToCell _ _ = Nothing +expandDrawingId :: String -> D (FilePath, B.ByteString) +expandDrawingId s = do + target <- asks (lookupRelationship s . envRelationships) + case target of + Just filepath -> do + bytes <- asks (lookup (combine "word" filepath) . envMedia) + case bytes of + Just bs -> return (filepath, bs) + Nothing -> throwError DocxError + Nothing -> throwError DocxError + +elemToParPart :: NameSpaces -> Element -> D ParPart +elemToParPart ns element + | isElem ns "w" "r" element + , Just _ <- findChild (elemName ns "w" "drawing") element = + let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" + drawing = findElement (QName "blip" (Just a_ns) (Just "a")) element + >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) + in + case drawing of + Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp bs) + Nothing -> throwError WrongElem +elemToParPart ns element + | isElem ns "w" "r" element = + elemToRun ns element >>= (\r -> return $ PlainRun r) +elemToParPart ns element + | isElem ns "w" "ins" element + , Just cId <- findAttr (elemName ns "w" "id") element + , Just cAuthor <- findAttr (elemName ns "w" "author") element + , Just cDate <- findAttr (elemName ns "w" "date") element = do + runs <- mapD (elemToRun ns) (elChildren element) + return $ Insertion cId cAuthor cDate runs +elemToParPart ns element + | isElem ns "w" "del" element + , Just cId <- findAttr (elemName ns "w" "id") element + , Just cAuthor <- findAttr (elemName ns "w" "author") element + , Just cDate <- findAttr (elemName ns "w" "date") element = do + runs <- mapD (elemToRun ns) (elChildren element) + return $ Deletion cId cAuthor cDate runs +elemToParPart ns element + | isElem ns "w" "bookmarkStart" element + , Just bmId <- findAttr (elemName ns "w" "id") element + , Just bmName <- findAttr (elemName ns "w" "name") element = + return $ BookMark bmId bmName +elemToParPart ns element + | isElem ns "w" "hyperlink" element + , Just anchor <- findAttr (elemName ns "w" "anchor") element = do + runs <- mapD (elemToRun ns) (elChildren element) + return $ InternalHyperLink anchor runs +elemToParPart ns element + | isElem ns "w" "hyperlink" element + , Just relId <- findAttr (elemName ns "r" "id") element = do + runs <- mapD (elemToRun ns) (elChildren element) + rels <- asks envRelationships + return $ case lookupRelationship relId rels of + Just target -> ExternalHyperLink target runs + Nothing -> ExternalHyperLink "" runs +elemToParPart ns element + | isElem ns "m" "oMath" element = + (eitherToD $ readOMML $ showElement element) >>= (return . PlainOMath) +elemToParPart _ _ = throwError WrongElem -data ParPart = PlainRun Run - | BookMark BookMarkId Anchor - | InternalHyperLink Anchor [Run] - | ExternalHyperLink RelId [Run] - | Drawing String - deriving Show +lookupFootnote :: String -> Notes -> Maybe Element +lookupFootnote s (Notes _ fns _) = fns >>= (M.lookup s) -data Run = Run RunStyle [RunElem] - | Footnote String - | Endnote String - deriving Show +lookupEndnote :: String -> Notes -> Maybe Element +lookupEndnote s (Notes _ _ ens) = ens >>= (M.lookup s) -data RunElem = TextRun String | LnBrk | Tab - deriving Show +elemToRun :: NameSpaces -> Element -> D Run +elemToRun ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChild (elemName ns "w" "drawing") element = + let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" + drawing = findElement (QName "blip" (Just a_ns) (Just "a")) drawingElem + >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) + in + case drawing of + Just s -> expandDrawingId s >>= + (\(fp, bs) -> return $ InlineDrawing fp bs) + Nothing -> throwError WrongElem +elemToRun ns element + | isElem ns "w" "r" element + , Just ref <- findChild (elemName ns "w" "footnoteReference") element + , Just fnId <- findAttr (elemName ns "w" "id") ref = do + notes <- asks envNotes + case lookupFootnote fnId notes of + Just e -> do bps <- mapD (elemToBodyPart ns) (elChildren e) + return $ Footnote bps + Nothing -> return $ Footnote [] +elemToRun ns element + | isElem ns "w" "r" element + , Just ref <- findChild (elemName ns "w" "endnoteReference") element + , Just enId <- findAttr (elemName ns "w" "id") ref = do + notes <- asks envNotes + case lookupEndnote enId notes of + Just e -> do bps <- mapD (elemToBodyPart ns) (elChildren e) + return $ Endnote bps + Nothing -> return $ Endnote [] +elemToRun ns element + | isElem ns "w" "r" element = do + runElems <- elemToRunElems ns element + return $ Run (elemToRunStyle ns element) runElems +elemToRun _ _ = throwError WrongElem -data RunStyle = RunStyle { isBold :: Bool - , isItalic :: Bool - , isSmallCaps :: Bool - , isStrike :: Bool - , isSuperScript :: Bool - , isSubScript :: Bool - , underline :: Maybe String - , rStyle :: Maybe String } - deriving Show +elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle +elemToParagraphStyle ns element + | Just pPr <- findChild (elemName ns "w" "pPr") element = + ParagraphStyle + {pStyle = + mapMaybe + (findAttr (elemName ns "w" "val")) + (findChildren (elemName ns "w" "pStyle") pPr) + , indentation = + findChild (elemName ns "w" "ind") pPr >>= + elemToParIndentation ns + } +elemToParagraphStyle _ _ = defaultParagraphStyle -defaultRunStyle :: RunStyle -defaultRunStyle = RunStyle { isBold = False - , isItalic = False - , isSmallCaps = False - , isStrike = False - , isSuperScript = False - , isSubScript = False - , underline = Nothing - , rStyle = Nothing - } elemToRunStyle :: NameSpaces -> Element -> RunStyle -elemToRunStyle ns element = - case findChild (QName "rPr" (lookup "w" ns) (Just "w")) element of - Just rPr -> - RunStyle +elemToRunStyle ns element + | Just rPr <- findChild (elemName ns "w" "rPr") element = + RunStyle { isBold = isJust $ findChild (QName "b" (lookup "w" ns) (Just "w")) rPr , isItalic = isJust $ findChild (QName "i" (lookup "w" ns) (Just "w")) rPr @@ -508,100 +660,65 @@ elemToRunStyle ns element = (Just "subscript" == (findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>= findAttr (QName "val" (lookup "w" ns) (Just "w")))) - , underline = + , rUnderline = findChild (QName "u" (lookup "w" ns) (Just "w")) rPr >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) , rStyle = findChild (QName "rStyle" (lookup "w" ns) (Just "w")) rPr >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) } - Nothing -> defaultRunStyle +elemToRunStyle _ _ = defaultRunStyle -elemToRun :: NameSpaces -> Element -> Maybe Run -elemToRun ns element - | qName (elName element) == "r" && - qURI (elName element) == (lookup "w" ns) = - case - findChild (QName "footnoteReference" (lookup "w" ns) (Just "w")) element >>= - findAttr (QName "id" (lookup "w" ns) (Just "w")) - of - Just s -> Just $ Footnote s - Nothing -> - case - findChild (QName "endnoteReference" (lookup "w" ns) (Just "w")) element >>= - findAttr (QName "id" (lookup "w" ns) (Just "w")) - of - Just s -> Just $ Endnote s - Nothing -> Just $ - Run (elemToRunStyle ns element) - (elemToRunElems ns element) -elemToRun _ _ = Nothing - -elemToRunElem :: NameSpaces -> Element -> Maybe RunElem +elemToRunElem :: NameSpaces -> Element -> D RunElem elemToRunElem ns element - | qName (elName element) == "t" && - qURI (elName element) == (lookup "w" ns) = - Just $ TextRun (strContent element) - | qName (elName element) == "br" && - qURI (elName element) == (lookup "w" ns) = - Just $ LnBrk - | qName (elName element) == "tab" && - qURI (elName element) == (lookup "w" ns) = - Just $ Tab - | otherwise = Nothing - - -elemToRunElems :: NameSpaces -> Element -> [RunElem] + | isElem ns "w" "t" element + || isElem ns "w" "delText" element + || isElem ns "m" "t" element = do + let str = strContent element + font <- asks envFont + case font of + Nothing -> return $ TextRun str + Just f -> return . TextRun $ + map (\x -> fromMaybe x . getUnicode f . lowerFromPrivate $ x) str + | isElem ns "w" "br" element = return LnBrk + | isElem ns "w" "tab" element = return Tab + | isElem ns "w" "sym" element = return (getSymChar ns element) + | otherwise = throwError WrongElem + where + lowerFromPrivate (ord -> c) + | c >= ord '\xF000' = chr $ c - ord '\xF000' + | otherwise = chr c + +-- The char attribute is a hex string +getSymChar :: NameSpaces -> Element -> RunElem +getSymChar ns element + | Just s <- lowerFromPrivate <$> getCodepoint + , Just font <- getFont = + let [(char, _)] = readLitChar ("\\x" ++ s) in + TextRun . maybe "" (:[]) $ getUnicode font char + where + getCodepoint = findAttr (elemName ns "w" "char") element + getFont = stringToFont =<< findAttr (elemName ns "w" "font") element + lowerFromPrivate ('F':xs) = '0':xs + lowerFromPrivate xs = xs +getSymChar _ _ = TextRun "" + +stringToFont :: String -> Maybe Font +stringToFont "Symbol" = Just Symbol +stringToFont _ = Nothing + +elemToRunElems :: NameSpaces -> Element -> D [RunElem] elemToRunElems ns element - | qName (elName element) == "r" && - qURI (elName element) == (lookup "w" ns) = - mapMaybe (elemToRunElem ns) (elChildren element) - | otherwise = [] - -elemToDrawing :: NameSpaces -> Element -> Maybe ParPart -elemToDrawing ns element - | qName (elName element) == "drawing" && - qURI (elName element) == (lookup "w" ns) = - let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" - in - findElement (QName "blip" (Just a_ns) (Just "a")) element - >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) - >>= (\s -> Just $ Drawing s) -elemToDrawing _ _ = Nothing - - -elemToParPart :: NameSpaces -> Element -> Maybe ParPart -elemToParPart ns element - | qName (elName element) == "r" && - qURI (elName element) == (lookup "w" ns) = - case findChild (QName "drawing" (lookup "w" ns) (Just "w")) element of - Just drawingElem -> elemToDrawing ns drawingElem - Nothing -> do - r <- elemToRun ns element - return $ PlainRun r -elemToParPart ns element - | qName (elName element) == "bookmarkStart" && - qURI (elName element) == (lookup "w" ns) = do - bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element - bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) element - return $ BookMark bmId bmName -elemToParPart ns element - | qName (elName element) == "hyperlink" && - qURI (elName element) == (lookup "w" ns) = - let runs = map fromJust $ filter isJust $ map (elemToRun ns) - $ findChildren (QName "r" (lookup "w" ns) (Just "w")) element - in - case findAttr (QName "anchor" (lookup "w" ns) (Just "w")) element of - Just anchor -> - Just $ InternalHyperLink anchor runs - Nothing -> - case findAttr (QName "id" (lookup "r" ns) (Just "r")) element of - Just relId -> Just $ ExternalHyperLink relId runs - Nothing -> Nothing -elemToParPart _ _ = Nothing - -type Target = String -type Anchor = String -type BookMarkId = String -type RelId = String - + | isElem ns "w" "r" element + || isElem ns "m" "r" element = do + let qualName = elemName ns "w" + let font = do + fontElem <- findElement (qualName "rFonts") element + stringToFont =<< + (foldr (<|>) Nothing $ + map (flip findAttr fontElem . qualName) ["ascii", "hAnsi"]) + local (setFont font) (mapD (elemToRunElem ns) (elChildren element)) +elemToRunElems _ _ = throwError WrongElem + +setFont :: Maybe Font -> ReaderEnv -> ReaderEnv +setFont f s = s{envFont = f} diff --git a/src/Text/Pandoc/Readers/Docx/Reducible.hs b/src/Text/Pandoc/Readers/Docx/Reducible.hs new file mode 100644 index 000000000..2dbef4131 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Reducible.hs @@ -0,0 +1,183 @@ +{-# LANGUAGE OverloadedStrings, PatternGuards #-} + +{- +Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Reducible + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Typeclass for combining adjacent blocks and inlines correctly. +-} + + +module Text.Pandoc.Readers.Docx.Reducible ((<++>), + (<+++>), + Reducible, + Container(..), + container, + innards, + reduceList, + reduceListB, + rebuild) + where + +import Text.Pandoc.Builder +import Data.List ((\\), intersect) + +data Container a = Container ([a] -> a) | NullContainer + +instance (Eq a) => Eq (Container a) where + (Container x) == (Container y) = ((x []) == (y [])) + NullContainer == NullContainer = True + _ == _ = False + +instance (Show a) => Show (Container a) where + show (Container x) = "Container {" ++ + (reverse $ drop 3 $ reverse $ show $ x []) ++ + "}" + show (NullContainer) = "NullContainer" + +class Reducible a where + (<++>) :: a -> a -> [a] + container :: a -> Container a + innards :: a -> [a] + isSpace :: a -> Bool + +(<+++>) :: (Reducible a) => Many a -> Many a -> Many a +mr <+++> ms = fromList $ reduceList $ toList mr ++ toList ms + +reduceListB :: (Reducible a) => Many a -> Many a +reduceListB = fromList . reduceList . toList + +reduceList' :: (Reducible a) => [a] -> [a] -> [a] +reduceList' acc [] = acc +reduceList' [] (x:xs) = reduceList' [x] xs +reduceList' as (x:xs) = reduceList' (init as ++ (last as <++> x) ) xs + +reduceList :: (Reducible a) => [a] -> [a] +reduceList = reduceList' [] + +combineReducibles :: (Reducible a, Eq a) => a -> a -> [a] +combineReducibles r s = + let (conts, rs) = topLevelContainers r + (conts', ss) = topLevelContainers s + shared = conts `intersect` conts' + remaining = conts \\ shared + remaining' = conts' \\ shared + in + case null shared of + True | (x : xs) <- reverse rs + , isSpace x -> + rebuild conts (reverse xs) ++ [x, s] + | (x : xs) <- ss + , isSpace x -> + [r, x] ++ rebuild conts' (xs) + True -> [r,s] + False -> rebuild + shared $ + reduceList $ + (rebuild remaining rs) ++ (rebuild remaining' ss) + +instance Reducible Inline where + s1@(Span (id1, classes1, kvs1) ils1) <++> s2@(Span (id2, classes2, kvs2) ils2) = + let classes' = classes1 `intersect` classes2 + kvs' = kvs1 `intersect` kvs2 + classes1' = classes1 \\ classes' + kvs1' = kvs1 \\ kvs' + classes2' = classes2 \\ classes' + kvs2' = kvs2 \\ kvs' + in + case null classes' && null kvs' of + True -> [s1,s2] + False -> let attr' = ("", classes', kvs') + attr1' = (id1, classes1', kvs1') + attr2' = (id2, classes2', kvs2') + s1' = case null classes1' && null kvs1' of + True -> ils1 + False -> [Span attr1' ils1] + s2' = case null classes2' && null kvs2' of + True -> ils2 + False -> [Span attr2' ils2] + in + [Span attr' $ reduceList $ s1' ++ s2'] + + (Str x) <++> (Str y) = [Str (x++y)] + il <++> il' = combineReducibles il il' + + container (Emph _) = Container Emph + container (Strong _) = Container Strong + container (Strikeout _) = Container Strikeout + container (Subscript _) = Container Subscript + container (Superscript _) = Container Superscript + container (Quoted qt _) = Container $ Quoted qt + container (Cite cs _) = Container $ Cite cs + container (Span attr _) = Container $ Span attr + container _ = NullContainer + + innards (Emph ils) = ils + innards (Strong ils) = ils + innards (Strikeout ils) = ils + innards (Subscript ils) = ils + innards (Superscript ils) = ils + innards (Quoted _ ils) = ils + innards (Cite _ ils) = ils + innards (Span _ ils) = ils + innards _ = [] + + isSpace Space = True + isSpace _ = False + +instance Reducible Block where + (Div (ident, classes, kvs) blks) <++> blk | "list-item" `elem` classes = + [Div (ident, classes, kvs) (reduceList blks), blk] + + blk <++> blk' = combineReducibles blk blk' + + container (BlockQuote _) = Container BlockQuote + container (Div attr _) = Container $ Div attr + container _ = NullContainer + + innards (BlockQuote bs) = bs + innards (Div _ bs) = bs + innards _ = [] + + isSpace _ = False + + +topLevelContainers' :: (Reducible a) => [a] -> ([Container a], [a]) +topLevelContainers' (r : []) = case container r of + NullContainer -> ([], [r]) + _ -> + let (conts, inns) = topLevelContainers' (innards r) + in + ((container r) : conts, inns) +topLevelContainers' rs = ([], rs) + +topLevelContainers :: (Reducible a) => a -> ([Container a], [a]) +topLevelContainers il = topLevelContainers' [il] + +rebuild :: [Container a] -> [a] -> [a] +rebuild [] xs = xs +rebuild ((Container f) : cs) xs = rebuild cs $ [f xs] +rebuild (NullContainer : cs) xs = rebuild cs $ xs diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs new file mode 100644 index 000000000..f900c0adc --- /dev/null +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -0,0 +1,285 @@ +{-# LANGUAGE + ViewPatterns + , StandaloneDeriving + , TupleSections + , FlexibleContexts #-} + +module Text.Pandoc.Readers.EPUB + (readEPUB) + where + +import Text.XML.Light +import Text.Pandoc.Definition hiding (Attr) +import Text.Pandoc.Walk (walk, query) +import Text.Pandoc.Generic(bottomUp) +import Text.Pandoc.Readers.HTML (readHtml) +import Text.Pandoc.Options ( ReaderOptions(..), readerTrace) +import Text.Pandoc.Shared (escapeURI, collapseFilePath, addMetaField) +import Text.Pandoc.MediaBag (MediaBag, insertMedia) +import Text.Pandoc.Compat.Except (MonadError, throwError, runExcept, Except) +import qualified Text.Pandoc.Builder as B +import Codec.Archive.Zip ( Archive (..), toArchive, fromEntry + , findEntryByPath, Entry) +import qualified Data.ByteString.Lazy as BL (ByteString) +import System.FilePath ( takeFileName, (</>), dropFileName, normalise + , dropFileName + , splitFileName ) +import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy) +import Control.Applicative ((<$>)) +import Control.Monad (guard, liftM, when) +import Data.Monoid (mempty, (<>)) +import Data.List (isPrefixOf, isInfixOf) +import Data.Maybe (mapMaybe, fromMaybe) +import qualified Data.Map as M (Map, lookup, fromList, elems) +import Control.DeepSeq.Generics (deepseq, NFData) + +import Debug.Trace (trace) + +type MIME = String + +type Items = M.Map String (FilePath, MIME) + +readEPUB :: ReaderOptions -> BL.ByteString -> (Pandoc, MediaBag) +readEPUB opts bytes = runEPUB (archiveToEPUB opts $ toArchive bytes) + +runEPUB :: Except String a -> a +runEPUB = either error id . runExcept + +-- Note that internal reference are aggresively normalised so that all ids +-- are of the form "filename#id" +-- +archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag) +archiveToEPUB os archive = do + -- root is path to folder with manifest file in + (root, content) <- getManifest archive + meta <- parseMeta content + (cover, items) <- parseManifest content + -- No need to collapse here as the image path is from the manifest file + let coverDoc = fromMaybe mempty (imageToPandoc <$> cover) + spine <- parseSpine items content + let escapedSpine = map (escapeURI . takeFileName . fst) spine + Pandoc _ bs <- + foldM' (\a b -> ((a <>) . bottomUp (prependHash escapedSpine)) + `liftM` parseSpineElem root b) mempty spine + let ast = coverDoc <> (Pandoc meta bs) + let mediaBag = fetchImages (M.elems items) root archive ast + return $ (ast, mediaBag) + where + os' = os {readerParseRaw = True} + parseSpineElem :: MonadError String m => FilePath -> (FilePath, MIME) -> m Pandoc + parseSpineElem (normalise -> r) (normalise -> path, mime) = do + when (readerTrace os) (traceM path) + doc <- mimeToReader mime r path + let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty + return $ docSpan <> doc + mimeToReader :: MonadError String m => MIME -> FilePath -> FilePath -> m Pandoc + mimeToReader "application/xhtml+xml" (normalise -> root) (normalise -> path) = do + fname <- findEntryByPathE (root </> path) archive + return $ fixInternalReferences path . + readHtml os' . + UTF8.toStringLazy $ + fromEntry fname + mimeToReader s _ path + | s `elem` imageMimes = return $ imageToPandoc path + | otherwise = return $ mempty + +-- paths should be absolute when this function is called +-- renameImages should do this +fetchImages :: [(FilePath, MIME)] + -> FilePath -- ^ Root + -> Archive + -> Pandoc + -> MediaBag +fetchImages mimes root arc (query iq -> links) = + foldr (uncurry3 insertMedia) mempty + (mapMaybe getEntry links) + where + getEntry link = + let abslink = root </> link in + (link , lookup link mimes, ) . fromEntry + <$> findEntryByPath abslink arc + +iq :: Inline -> [FilePath] +iq (Image _ (url, _)) = [url] +iq _ = [] + +-- Remove relative paths +renameImages :: FilePath -> Inline -> Inline +renameImages root (Image a (url, b)) = Image a (collapseFilePath (root </> url), b) +renameImages _ x = x + +imageToPandoc :: FilePath -> Pandoc +imageToPandoc s = B.doc . B.para $ B.image s "" mempty + +imageMimes :: [String] +imageMimes = ["image/gif", "image/jpeg", "image/png"] + +type CoverImage = FilePath + +parseManifest :: (MonadError String m) => Element -> m (Maybe CoverImage, Items) +parseManifest content = do + manifest <- findElementE (dfName "manifest") content + let items = findChildren (dfName "item") manifest + r <- mapM parseItem items + let cover = findAttr (emptyName "href") =<< filterChild findCover manifest + return (cover, (M.fromList r)) + where + findCover e = maybe False (isInfixOf "cover-image") + (findAttr (emptyName "properties") e) + parseItem e = do + uid <- findAttrE (emptyName "id") e + href <- findAttrE (emptyName "href") e + mime <- findAttrE (emptyName "media-type") e + return (uid, (href, mime)) + +parseSpine :: MonadError String m => Items -> Element -> m [(FilePath, MIME)] +parseSpine is e = do + spine <- findElementE (dfName "spine") e + let itemRefs = findChildren (dfName "itemref") spine + mapM (mkE "parseSpine" . (flip M.lookup is)) $ mapMaybe parseItemRef itemRefs + where + parseItemRef ref = do + let linear = maybe True (== "yes") (findAttr (emptyName "linear") ref) + guard linear + findAttr (emptyName "idref") ref + +parseMeta :: MonadError String m => Element -> m Meta +parseMeta content = do + meta <- findElementE (dfName "metadata") content + let dcspace (QName _ (Just "http://purl.org/dc/elements/1.1/") (Just "dc")) = True + dcspace _ = False + let dcs = filterChildrenName dcspace meta + let r = foldr parseMetaItem nullMeta dcs + return r + +-- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem +parseMetaItem :: Element -> Meta -> Meta +parseMetaItem e@(stripNamespace . elName -> field) meta = + addMetaField (renameMeta field) (B.str $ strContent e) meta + +renameMeta :: String -> String +renameMeta "creator" = "author" +renameMeta s = s + +getManifest :: MonadError String m => Archive -> m (String, Element) +getManifest archive = do + metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive + docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry + let namespaces = mapMaybe attrToNSPair (elAttribs docElem) + ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces) + as <- liftM ((map attrToPair) . elAttribs) + (findElementE (QName "rootfile" (Just ns) Nothing) docElem) + manifestFile <- mkE "Root not found" (lookup "full-path" as) + let rootdir = dropFileName manifestFile + --mime <- lookup "media-type" as + manifest <- findEntryByPathE manifestFile archive + liftM ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest) + +-- Fixup + +fixInternalReferences :: FilePath -> Pandoc -> Pandoc +fixInternalReferences pathToFile = + (walk $ renameImages root) + . (walk normalisePath) + . (walk $ fixBlockIRs filename) + . (walk $ fixInlineIRs filename) + where + (root, escapeURI -> filename) = splitFileName pathToFile + +fixInlineIRs :: String -> Inline -> Inline +fixInlineIRs s (Span as v) = + Span (fixAttrs s as) v +fixInlineIRs s (Code as code) = + Code (fixAttrs s as) code +fixInlineIRs s (Link t ('#':url, tit)) = + Link t (addHash s url, tit) +fixInlineIRs _ v = v + +normalisePath :: Inline -> Inline +normalisePath (Link t (url, tit)) = + let (path, uid) = span (/= '#') url in + Link t (takeFileName path ++ uid, tit) +normalisePath s = s + +prependHash :: [String] -> Inline -> Inline +prependHash ps l@(Link is (url, tit)) + | or [s `isPrefixOf` url | s <- ps] = + Link is ('#':url, tit) + | otherwise = l +prependHash _ i = i + +fixBlockIRs :: String -> Block -> Block +fixBlockIRs s (Div as b) = + Div (fixAttrs s as) b +fixBlockIRs s (Header i as b) = + Header i (fixAttrs s as) b +fixBlockIRs s (CodeBlock as code) = + CodeBlock (fixAttrs s as) code +fixBlockIRs _ b = b + +fixAttrs :: FilePath -> B.Attr -> B.Attr +fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . null) cs, removeEPUBAttrs kvs) + +addHash :: String -> String -> String +addHash _ "" = "" +addHash s ident = s ++ "#" ++ ident + +removeEPUBAttrs :: [(String, String)] -> [(String, String)] +removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs + +isEPUBAttr :: (String, String) -> Bool +isEPUBAttr (k, _) = "epub:" `isPrefixOf` k + +-- Library + +-- Strict version of foldM +foldM' :: (Monad m, NFData a) => (a -> b -> m a) -> a -> [b] -> m a +foldM' _ z [] = return z +foldM' f z (x:xs) = do + z' <- f z x + z' `deepseq` foldM' f z' xs + +uncurry3 :: (a -> b -> c -> d) -> (a, b, c) -> d +uncurry3 f (a, b, c) = f a b c + +traceM :: Monad m => String -> m () +traceM = flip trace (return ()) + +-- Utility + +stripNamespace :: QName -> String +stripNamespace (QName v _ _) = v + +attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val) +attrToNSPair _ = Nothing + +attrToPair :: Attr -> (String, String) +attrToPair (Attr (QName name _ _) val) = (name, val) + +defaultNameSpace :: Maybe String +defaultNameSpace = Just "http://www.idpf.org/2007/opf" + +dfName :: String -> QName +dfName s = QName s defaultNameSpace Nothing + +emptyName :: String -> QName +emptyName s = QName s Nothing Nothing + +-- Convert Maybe interface to Either + +findAttrE :: MonadError String m => QName -> Element -> m String +findAttrE q e = mkE "findAttr" $ findAttr q e + +findEntryByPathE :: MonadError String m => FilePath -> Archive -> m Entry +findEntryByPathE (normalise -> path) a = + mkE ("No entry on path: " ++ path) $ findEntryByPath path a + +parseXMLDocE :: MonadError String m => String -> m Element +parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc + +findElementE :: MonadError String m => QName -> Element -> m Element +findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x + +mkE :: MonadError String m => String -> Maybe a -> m a +mkE s = maybe (throwError s) return diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 552e8a251..1789b865f 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses #-} {- Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> @@ -40,41 +41,69 @@ import Text.HTML.TagSoup import Text.HTML.TagSoup.Match import Text.Pandoc.Definition import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Builder (Blocks, Inlines, trimInlines) -import Text.Pandoc.Shared -import Text.Pandoc.Options -import Text.Pandoc.Parsing -import Data.Maybe ( fromMaybe, isJust ) -import Data.List ( intercalate ) +import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..)) +import Text.Pandoc.Shared ( extractSpaces, renderTags' + , escapeURI, safeRead ) +import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace) + , Extension (Ext_epub_html_exts, + Ext_native_divs, Ext_native_spans)) +import Text.Pandoc.Parsing hiding ((<|>)) +import Text.Pandoc.Walk +import Data.Maybe ( fromMaybe, isJust) +import Data.List ( intercalate, isInfixOf ) import Data.Char ( isDigit ) -import Control.Monad ( liftM, guard, when, mzero ) -import Control.Applicative ( (<$>), (<$), (<*) ) -import Data.Monoid +import Control.Monad ( liftM, guard, when, mzero, void, unless ) +import Control.Arrow ((***)) +import Control.Applicative ( (<$>), (<$), (<*), (*>), (<|>)) +import Data.Monoid (mconcat, Monoid, mempty, (<>), First (..)) import Text.Printf (printf) import Debug.Trace (trace) +import Text.TeXMath (readMathML, writeTeX) +import Data.Default (Default (..), def) +import Control.Monad.Reader (Reader,ask, asks, local, runReader) -isSpace :: Char -> Bool -isSpace ' ' = True -isSpace '\t' = True -isSpace '\n' = True -isSpace _ = False -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse (assumes @'\n'@ line endings) -> Pandoc readHtml opts inp = - case runParser parseDoc def{ stateOptions = opts } "source" tags of + case flip runReader def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } []) "source" tags of Left err' -> error $ "\nError at " ++ show err' Right result -> result - where tags = canonicalizeTags $ + where tags = stripPrefixes . canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } inp parseDoc = do blocks <- (fixPlains False) . mconcat <$> manyTill block eof - meta <- stateMeta <$> getState - return $ Pandoc meta (B.toList blocks) + meta <- stateMeta . parserState <$> getState + bs' <- replaceNotes (B.toList blocks) + return $ Pandoc meta bs' + +replaceNotes :: [Block] -> TagParser [Block] +replaceNotes = walkM replaceNotes' + +replaceNotes' :: Inline -> TagParser Inline +replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes + where + getNotes = noteTable <$> getState +replaceNotes' x = return x -type TagParser = Parser [Tag String] ParserState +data HTMLState = + HTMLState + { parserState :: ParserState, + noteTable :: [(String, Blocks)] + } + +data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext + , inChapter :: Bool -- ^ Set if in chapter section + } + +setInChapter :: HTMLParser s a -> HTMLParser s a +setInChapter = local (\s -> s {inChapter = True}) + +type HTMLParser s = ParserT s HTMLState (Reader HTMLLocal) + +type TagParser = HTMLParser [Tag String] pBody :: TagParser Blocks pBody = pInTags "body" block @@ -98,7 +127,11 @@ block = do tr <- getOption readerTrace pos <- getPosition res <- choice - [ pPara + [ eSection + , eSwitch B.para block + , mempty <$ eFootnote + , mempty <$ eTOC + , pPara , pHeader , pBlockQuote , pCodeBlock @@ -115,6 +148,63 @@ block = do (take 60 $ show $ B.toList res)) (return ()) return res +namespaces :: [(String, TagParser Inlines)] +namespaces = [(mathMLNamespace, pMath True)] + +mathMLNamespace :: String +mathMLNamespace = "http://www.w3.org/1998/Math/MathML" + +eSwitch :: Monoid a => (Inlines -> a) -> TagParser a -> TagParser a +eSwitch constructor parser = try $ do + guardEnabled Ext_epub_html_exts + pSatisfy (~== TagOpen "switch" []) + cases <- getFirst . mconcat <$> + manyTill (First <$> (eCase <* skipMany pBlank) ) + (lookAhead $ try $ pSatisfy (~== TagOpen "default" [])) + skipMany pBlank + fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank) + skipMany pBlank + pSatisfy (~== TagClose "switch") + return $ maybe fallback constructor cases + +eCase :: TagParser (Maybe Inlines) +eCase = do + skipMany pBlank + TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" []) + case (flip lookup namespaces) =<< lookup "required-namespace" attr of + Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank)) + Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case")) + +eFootnote :: TagParser () +eFootnote = try $ do + let notes = ["footnote", "rearnote"] + guardEnabled Ext_epub_html_exts + (TagOpen tag attr) <- lookAhead $ pAnyTag + guard (maybe False (flip elem notes) (lookup "type" attr)) + let ident = fromMaybe "" (lookup "id" attr) + content <- pInTags tag block + addNote ident content + +addNote :: String -> Blocks -> TagParser () +addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)}) + +eNoteref :: TagParser Inlines +eNoteref = try $ do + guardEnabled Ext_epub_html_exts + TagOpen tag attr <- lookAhead $ pAnyTag + guard (maybe False (== "noteref") (lookup "type" attr)) + let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr) + guard (not (null ident)) + pInTags tag block + return $ B.rawInline "noteref" ident + +-- Strip TOC if there is one, better to generate again +eTOC :: TagParser () +eTOC = try $ do + guardEnabled Ext_epub_html_exts + (TagOpen tag attr) <- lookAhead $ pAnyTag + guard (maybe False (== "toc") (lookup "type" attr)) + void (pInTags tag block) pList :: TagParser Blocks pList = pBulletList <|> pOrderedList <|> pDefinitionList @@ -128,9 +218,15 @@ pBulletList = try $ do -- note: if they have an <ol> or <ul> not in scope of a <li>, -- treat it as a list item, though it's not valid xhtml... skipMany nonItem - items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul") + items <- manyTill (pListItem nonItem) (pCloses "ul") return $ B.bulletList $ map (fixPlains True) items +pListItem :: TagParser a -> TagParser Blocks +pListItem nonItem = do + TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" []) + let liDiv = maybe mempty (\x -> B.divWith (x, [], []) mempty) (lookup "id" attr) + (liDiv <>) <$> pInTags "li" block <* skipMany nonItem + pOrderedList :: TagParser Blocks pOrderedList = try $ do TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" []) @@ -156,7 +252,7 @@ pOrderedList = try $ do -- note: if they have an <ol> or <ul> not in scope of a <li>, -- treat it as a list item, though it's not valid xhtml... skipMany nonItem - items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol") + items <- manyTill (pListItem nonItem) (pCloses "ol") return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items pDefinitionList :: TagParser Blocks @@ -194,14 +290,14 @@ fixPlains inList bs = if any isParaish bs' pRawTag :: TagParser String pRawTag = do tag <- pAnyTag - let ignorable x = x `elem` ["html","head","body"] + let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"] if tagOpen ignorable (const True) tag || tagClose ignorable tag then return [] else return $ renderTags' [tag] pDiv :: TagParser Blocks pDiv = try $ do - getOption readerParseRaw >>= guard + guardEnabled Ext_native_divs TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="div") (const True) contents <- pInTags "div" block return $ B.divWith (mkAttr attr) contents @@ -220,13 +316,35 @@ pHtmlBlock t = try $ do contents <- manyTill pAnyTag (pSatisfy (~== TagClose t)) return $ renderTags' $ [open] ++ contents ++ [TagClose t] +-- Sets chapter context +eSection :: TagParser Blocks +eSection = try $ do + let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as) + let sectTag = tagOpen (`elem` sectioningContent) matchChapter + TagOpen tag _ <- lookAhead $ pSatisfy sectTag + setInChapter (pInTags tag block) + +headerLevel :: String -> TagParser Int +headerLevel tagtype = do + let level = read (drop 1 tagtype) + (try $ do + guardEnabled Ext_epub_html_exts + asks inChapter >>= guard + return (level - 1)) + <|> + return level + + + + + pHeader :: TagParser Blocks pHeader = try $ do TagOpen tagtype attr <- pSatisfy $ tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) (const True) let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")] - let level = read (drop 1 tagtype) + level <- headerLevel tagtype contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) let ident = fromMaybe "" $ lookup "id" attr let classes = maybe [] words $ lookup "class" attr @@ -244,7 +362,7 @@ pTable :: TagParser Blocks pTable = try $ do TagOpen _ _ <- pSatisfy (~== TagOpen "table" []) skipMany pBlank - caption <- option mempty $ pInTags "caption" inline >>~ skipMany pBlank + caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank -- TODO actually read these and take width information from them widths' <- pColgroup <|> many pCol head' <- option [] $ pOptInTag "thead" $ pInTags "tr" (pCell "th") @@ -326,7 +444,9 @@ pCodeBlock = try $ do inline :: TagParser Inlines inline = choice - [ pTagText + [ eNoteref + , eSwitch id inline + , pTagText , pQ , pEmph , pStrong @@ -338,6 +458,7 @@ inline = choice , pImage , pCode , pSpan + , pMath False , pRawHtmlInline ] @@ -366,8 +487,8 @@ pSelfClosing f g = do pQ :: TagParser Inlines pQ = do - quoteContext <- stateQuoteContext `fmap` getState - let quoteType = case quoteContext of + context <- asks quoteContext + let quoteType = case context of InDoubleQuote -> SingleQuote _ -> DoubleQuote let innerQuoteContext = if quoteType == SingleQuote @@ -406,12 +527,24 @@ pLineBreak = do return B.linebreak pLink :: TagParser Inlines -pLink = try $ do +pLink = pRelLink <|> pAnchor + +pAnchor :: TagParser Inlines +pAnchor = try $ do + tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "id")) + return $ B.spanWith (fromAttrib "id" tag , [], []) mempty + +pRelLink :: TagParser Inlines +pRelLink = try $ do tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href")) let url = fromAttrib "href" tag let title = fromAttrib "title" tag + let uid = fromAttrib "id" tag + let spanC = case uid of + [] -> id + s -> B.spanWith (s, [], []) lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a") - return $ B.link (escapeURI url) title lab + return $ spanC $ B.link (escapeURI url) title lab pImage :: TagParser Inlines pImage = do @@ -429,7 +562,7 @@ pCode = try $ do pSpan :: TagParser Inlines pSpan = try $ do - getOption readerParseRaw >>= guard + guardEnabled Ext_native_spans TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) contents <- pInTags "span" inline return $ B.spanWith (mkAttr attr) contents @@ -442,6 +575,22 @@ pRawHtmlInline = do then return $ B.rawInline "html" $ renderTags' [result] else return mempty +mathMLToTeXMath :: String -> Either String String +mathMLToTeXMath s = writeTeX <$> readMathML s + +pMath :: Bool -> TagParser Inlines +pMath inCase = try $ do + open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True) + unless (inCase) (guard (maybe False (== mathMLNamespace) (lookup "xmlns" attr))) + contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math")) + let math = mathMLToTeXMath $ + (renderTags $ [open] ++ contents ++ [TagClose "math"]) + let constructor = + maybe B.math (\x -> if (x == "inline") then B.math else B.displayMath) + (lookup "display" attr) + return $ either (const mempty) + (\x -> if null x then mempty else constructor x) math + pInlinesInTags :: String -> (Inlines -> Inlines) -> TagParser Inlines pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline @@ -479,7 +628,8 @@ pTagText :: TagParser Inlines pTagText = try $ do (TagText str) <- pSatisfy isTagText st <- getState - case runParser (many pTagContents) st "text" str of + qu <- ask + case flip runReader qu $ runParserT (many pTagContents) st "text" str of Left _ -> fail $ "Could not parse `" ++ str ++ "'" Right result -> return $ mconcat result @@ -488,7 +638,9 @@ pBlank = try $ do (TagText str) <- pSatisfy isTagText guard $ all isSpace str -pTagContents :: Parser [Char] ParserState Inlines +type InlinesParser = HTMLParser String + +pTagContents :: InlinesParser Inlines pTagContents = B.displayMath <$> mathDisplay <|> B.math <$> mathInline @@ -498,12 +650,11 @@ pTagContents = <|> pSymbol <|> pBad -pStr :: Parser [Char] ParserState Inlines +pStr :: InlinesParser Inlines pStr = do result <- many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) && not (isBad c) - pos <- getPosition - updateState $ \s -> s{ stateLastStrPos = Just pos } + updateLastStrPos return $ B.str result isSpecial :: Char -> Bool @@ -518,13 +669,13 @@ isSpecial '\8220' = True isSpecial '\8221' = True isSpecial _ = False -pSymbol :: Parser [Char] ParserState Inlines +pSymbol :: InlinesParser Inlines pSymbol = satisfy isSpecial >>= return . B.str . (:[]) isBad :: Char -> Bool isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML -pBad :: Parser [Char] ParserState Inlines +pBad :: InlinesParser Inlines pBad = do c <- satisfy isBad let c' = case c of @@ -558,7 +709,7 @@ pBad = do _ -> '?' return $ B.str [c'] -pSpace :: Parser [Char] ParserState Inlines +pSpace :: InlinesParser Inlines pSpace = many1 (satisfy isSpace) >> return B.space -- @@ -566,8 +717,10 @@ pSpace = many1 (satisfy isSpace) >> return B.space -- eitherBlockOrInline :: [String] -eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins", - "map", "area", "object"] +eitherBlockOrInline = ["audio", "applet", "button", "iframe", + "del", "ins", + "progress", "map", "area", "noscript", "script", + "object", "svg", "video", "source"] {- inlineHtmlTags :: [[Char]] @@ -579,15 +732,17 @@ inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", -} blockHtmlTags :: [String] -blockHtmlTags = ["address", "article", "aside", "blockquote", "body", "button", "canvas", +blockHtmlTags = ["?xml", "!DOCTYPE", "address", "article", "aside", + "blockquote", "body", "button", "canvas", "caption", "center", "col", "colgroup", "dd", "dir", "div", - "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", - "form", "h1", "h2", "h3", "h4", - "h5", "h6", "head", "header", "hgroup", "hr", "html", "isindex", "map", "menu", - "noframes", "noscript", "object", "ol", "output", "p", "pre", "progress", - "section", "table", "tbody", "textarea", "thead", "tfoot", "ul", "dd", + "dl", "dt", "embed", "fieldset", "figcaption", "figure", + "footer", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "head", "header", "hgroup", "hr", "html", + "isindex", "menu", "noframes", "ol", "output", "p", "pre", + "section", "table", "tbody", "textarea", + "thead", "tfoot", "ul", "dd", "dt", "frameset", "li", "tbody", "td", "tfoot", - "th", "thead", "tr", "script", "style", "svg", "video"] + "th", "thead", "tr", "script", "style"] -- We want to allow raw docbook in markdown documents, so we -- include docbook block tags here too. @@ -605,8 +760,11 @@ blockDocBookTags = ["calloutlist", "bibliolist", "glosslist", "itemizedlist", "classsynopsis", "blockquote", "epigraph", "msgset", "sidebar", "title"] +epubTags :: [String] +epubTags = ["case", "switch", "default"] + blockTags :: [String] -blockTags = blockHtmlTags ++ blockDocBookTags +blockTags = blockHtmlTags ++ blockDocBookTags ++ epubTags isInlineTag :: Tag String -> Bool isInlineTag t = tagOpen isInlineTagName (const True) t || @@ -670,19 +828,23 @@ _ `closes` _ = False --- parsers for use in markdown, textile readers -- | Matches a stretch of HTML in balanced tags. -htmlInBalanced :: (Tag String -> Bool) -> Parser [Char] ParserState String +htmlInBalanced :: (Monad m) + => (Tag String -> Bool) + -> ParserT String st m String htmlInBalanced f = try $ do (TagOpen t _, tag) <- htmlTag f guard $ '/' `notElem` tag -- not a self-closing tag let stopper = htmlTag (~== TagClose t) - let anytag = liftM snd $ htmlTag (const True) + let anytag = snd <$> htmlTag (const True) contents <- many $ notFollowedBy' stopper >> (htmlInBalanced f <|> anytag <|> count 1 anyChar) endtag <- liftM snd stopper return $ tag ++ concat contents ++ endtag -- | Matches a tag meeting a certain condition. -htmlTag :: (Tag String -> Bool) -> Parser [Char] st (Tag String, String) +htmlTag :: Monad m + => (Tag String -> Bool) + -> ParserT [Char] st m (Tag String, String) htmlTag f = try $ do lookAhead $ char '<' >> (oneOf "/!?" <|> letter) (next : _) <- getInput >>= return . canonicalizeTags . parseTags @@ -701,7 +863,78 @@ htmlTag f = try $ do mkAttr :: [(String, String)] -> Attr mkAttr attr = (attribsId, attribsClasses, attribsKV) where attribsId = fromMaybe "" $ lookup "id" attr - attribsClasses = words $ fromMaybe "" $ lookup "class" attr + attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr + epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr + +-- Strip namespace prefixes +stripPrefixes :: [Tag String] -> [Tag String] +stripPrefixes = map stripPrefix + +stripPrefix :: Tag String -> Tag String +stripPrefix (TagOpen s as) = + TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as) +stripPrefix (TagClose s) = TagClose (stripPrefix' s) +stripPrefix x = x + +stripPrefix' :: String -> String +stripPrefix' s = + case span (/= ':') s of + (_, "") -> s + (_, (_:ts)) -> ts + +isSpace :: Char -> Bool +isSpace ' ' = True +isSpace '\t' = True +isSpace '\n' = True +isSpace '\r' = True +isSpace _ = False + +-- Instances +-- This signature should be more general +-- MonadReader HTMLLocal m => HasQuoteContext st m +instance HasQuoteContext st (Reader HTMLLocal) where + getQuoteContext = asks quoteContext + withQuoteContext q = local (\s -> s{quoteContext = q}) +instance HasReaderOptions HTMLState where + extractReaderOptions = extractReaderOptions . parserState + +instance Default HTMLState where + def = HTMLState def [] + +instance HasMeta HTMLState where + setMeta s b st = st {parserState = setMeta s b $ parserState st} + deleteMeta s st = st {parserState = deleteMeta s $ parserState st} + +instance Default HTMLLocal where + def = HTMLLocal NoQuote False + +instance HasLastStrPosition HTMLState where + setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} + getLastStrPos = getLastStrPos . parserState + + +-- EPUB Specific +-- +-- +sectioningContent :: [String] +sectioningContent = ["article", "aside", "nav", "section"] + +{- +groupingContent :: [String] +groupingContent = ["p", "hr", "pre", "blockquote", "ol" + , "ul", "li", "dl", "dt", "dt", "dd" + , "figure", "figcaption", "div", "main"] + + + +types :: [(String, ([String], Int))] +types = -- Document divisions + map (\s -> (s, (["section", "body"], 0))) + ["volume", "part", "chapter", "division"] + ++ -- Document section and components + [ + ("abstract", ([], 0))] +-} diff --git a/src/Text/Pandoc/Readers/Haddock.hs b/src/Text/Pandoc/Readers/Haddock.hs index a3dfb7c3c..4b46c869d 100644 --- a/src/Text/Pandoc/Readers/Haddock.hs +++ b/src/Text/Pandoc/Readers/Haddock.hs @@ -43,11 +43,8 @@ docHToBlocks d' = (docHToInlines False $ headerTitle h) DocAppend d1 d2 -> mappend (docHToBlocks d1) (docHToBlocks d2) DocString _ -> inlineFallback - DocParagraph (DocHeader h) -> docHToBlocks (DocHeader h) DocParagraph (DocAName h) -> B.plain $ docHToInlines False $ DocAName h - DocParagraph x -> let (ils, rest) = getInlines x - in (B.para $ docHToInlines False ils) - <> docHToBlocks rest + DocParagraph x -> B.para $ docHToInlines False x DocIdentifier _ -> inlineFallback DocIdentifierUnchecked _ -> inlineFallback DocModule s -> B.plain $ docHToInlines False $ DocModule s @@ -115,40 +112,6 @@ docHToInlines isCode d' = DocProperty _ -> mempty DocExamples _ -> mempty -getInlines :: DocH String Identifier -> (DocH String Identifier, DocH String Identifier) -getInlines (DocAppend x y) = if isInline x - then let (a, b) = getInlines y - in (DocAppend x a, b) - else (DocEmpty, DocAppend x y) -getInlines x = if isInline x - then (x, DocEmpty) - else (DocEmpty, x) - -isInline :: DocH String Identifier -> Bool -isInline d' = - case d' of - DocEmpty -> True - DocAppend d1 _ -> isInline d1 - DocString _ -> True - DocParagraph _ -> False - DocIdentifier _ -> True - DocIdentifierUnchecked _ -> True - DocModule _ -> True - DocWarning _ -> True - DocEmphasis _ -> True - DocMonospaced _ -> True - DocBold _ -> True - DocHeader _ -> False - DocUnorderedList _ -> False - DocOrderedList _ -> False - DocDefList _ -> False - DocCodeBlock _ -> False - DocHyperlink _ -> True - DocPic _ -> True - DocAName _ -> True - DocProperty _ -> False - DocExamples _ -> False - -- | Create an 'Example', stripping superfluous characters as appropriate makeExample :: String -> String -> [String] -> Blocks makeExample prompt expression result = @@ -173,4 +136,3 @@ makeExample prompt expression result = substituteBlankLine "<BLANKLINE>" = "" substituteBlankLine line = line coder = B.codeWith ([], ["result"], []) - diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 97bfaa455..9f51e9a8f 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -41,7 +41,6 @@ import Text.Pandoc.Shared import Text.Pandoc.Options import Text.Pandoc.Parsing hiding ((<|>), many, optional, space, mathDisplay, mathInline) -import Text.Parsec.Prim (ParsecT, runParserT) import qualified Text.Pandoc.UTF8 as UTF8 import Data.Char ( chr, ord ) import Control.Monad.Trans (lift) @@ -104,7 +103,7 @@ dimenarg = try $ do sp :: LP () sp = skipMany1 $ satisfy (\c -> c == ' ' || c == '\t') - <|> (try $ newline >>~ lookAhead anyChar >>~ notFollowedBy blankline) + <|> (try $ newline <* lookAhead anyChar <* notFollowedBy blankline) isLowerHex :: Char -> Bool isLowerHex x = x >= '0' && x <= '9' || x >= 'a' && x <= 'f' @@ -808,7 +807,7 @@ rawEnv name = do ---- -type IncludeParser = ParsecT [Char] [String] IO String +type IncludeParser = ParserT [Char] [String] IO String -- | Replace "include" commands with file contents. handleIncludes :: String -> IO String diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 6c710c8ff..861f81b23 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -353,8 +353,7 @@ referenceKey = try $ do notFollowedBy' referenceTitle notFollowedBy' (() <$ reference) many1 $ notFollowedBy space >> litChar - let betweenAngles = try $ char '<' >> - manyTill (escapedChar' <|> litChar) (char '>') + let betweenAngles = try $ char '<' >> manyTill litChar (char '>') src <- try betweenAngles <|> sourceURL tit <- option "" referenceTitle -- currently we just ignore MMD-style link/image attributes @@ -571,7 +570,7 @@ attributes :: MarkdownParser Attr attributes = try $ do char '{' spnl - attrs <- many (attribute >>~ spnl) + attrs <- many (attribute <* spnl) char '}' return $ foldl (\x f -> f x) nullAttr attrs @@ -688,7 +687,7 @@ birdTrackLine c = try $ do -- emailBlockQuoteStart :: MarkdownParser Char -emailBlockQuoteStart = try $ skipNonindentSpaces >> char '>' >>~ optional (char ' ') +emailBlockQuoteStart = try $ skipNonindentSpaces >> char '>' <* optional (char ' ') emailBlockQuote :: MarkdownParser [String] emailBlockQuote = try $ do @@ -752,7 +751,7 @@ listLine = try $ do notFollowedBy' (do indentSpaces many spaceChar listStart) - notFollowedBy' $ htmlTag (~== TagClose "div") + notFollowedByHtmlCloser optional (() <$ indentSpaces) chunks <- manyTill ( many1 (satisfy $ \c -> c /= '\n' && c /= '<') @@ -781,11 +780,18 @@ listContinuation = try $ do blanks <- many blankline return $ concat result ++ blanks +notFollowedByHtmlCloser :: MarkdownParser () +notFollowedByHtmlCloser = do + inHtmlBlock <- stateInHtmlBlock <$> getState + case inHtmlBlock of + Just t -> notFollowedBy' $ htmlTag (~== TagClose t) + Nothing -> return () + listContinuationLine :: MarkdownParser String listContinuationLine = try $ do notFollowedBy blankline notFollowedBy' listStart - notFollowedBy' $ htmlTag (~== TagClose "div") + notFollowedByHtmlCloser optional indentSpaces result <- anyLine return $ result ++ "\n" @@ -840,38 +846,53 @@ defListMarker = do else mzero return () -definitionListItem :: MarkdownParser (F (Inlines, [Blocks])) -definitionListItem = try $ do - -- first, see if this has any chance of being a definition list: - lookAhead (anyLine >> optional blankline >> defListMarker) - term <- trimInlinesF . mconcat <$> manyTill inline newline - optional blankline - raw <- many1 defRawBlock - state <- getState - let oldContext = stateParserContext state - -- parse the extracted block, which may contain various block elements: +definitionListItem :: Bool -> MarkdownParser (F (Inlines, [Blocks])) +definitionListItem compact = try $ do + rawLine' <- anyLine + raw <- many1 $ defRawBlock compact + term <- parseFromString (trimInlinesF . mconcat <$> many inline) rawLine' contents <- mapM (parseFromString parseBlocks) raw - updateState (\st -> st {stateParserContext = oldContext}) + optional blanklines return $ liftM2 (,) term (sequence contents) -defRawBlock :: MarkdownParser String -defRawBlock = try $ do +defRawBlock :: Bool -> MarkdownParser String +defRawBlock compact = try $ do + hasBlank <- option False $ blankline >> return True defListMarker firstline <- anyLine - rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine) - trailing <- option "" blanklines - cont <- liftM concat $ many $ do - lns <- many1 $ notFollowedBy blankline >> indentSpaces >> anyLine - trl <- option "" blanklines - return $ unlines lns ++ trl - return $ firstline ++ "\n" ++ unlines rawlines ++ trailing ++ cont + let dline = try + ( do notFollowedBy blankline + if compact -- laziness not compatible with compact + then () <$ indentSpaces + else (() <$ indentSpaces) + <|> notFollowedBy defListMarker + anyLine ) + rawlines <- many dline + cont <- liftM concat $ many $ try $ do + trailing <- option "" blanklines + ln <- indentSpaces >> notFollowedBy blankline >> anyLine + lns <- many dline + return $ trailing ++ unlines (ln:lns) + return $ trimr (firstline ++ "\n" ++ unlines rawlines ++ cont) ++ + if hasBlank || not (null cont) then "\n\n" else "" definitionList :: MarkdownParser (F Blocks) -definitionList = do - guardEnabled Ext_definition_lists - items <- fmap sequence $ many1 definitionListItem +definitionList = try $ do + lookAhead (anyLine >> optional blankline >> defListMarker) + compactDefinitionList <|> normalDefinitionList + +compactDefinitionList :: MarkdownParser (F Blocks) +compactDefinitionList = do + guardEnabled Ext_compact_definition_lists + items <- fmap sequence $ many1 $ definitionListItem True return $ B.definitionList <$> fmap compactify'DL items +normalDefinitionList :: MarkdownParser (F Blocks) +normalDefinitionList = do + guardEnabled Ext_definition_lists + items <- fmap sequence $ many1 $ definitionListItem False + return $ B.definitionList <$> items + -- -- paragraph block -- @@ -914,16 +935,34 @@ htmlElement = rawVerbatimBlock htmlBlock :: MarkdownParser (F Blocks) htmlBlock = do guardEnabled Ext_raw_html - res <- (guardEnabled Ext_markdown_in_html_blocks >> rawHtmlBlocks) - <|> htmlBlock' - return $ return $ B.rawBlock "html" res - -htmlBlock' :: MarkdownParser String + try (do + (TagOpen t attrs) <- lookAhead $ fst <$> htmlTag isBlockTag + (guard (t `elem` ["pre","style","script"]) >> + (return . B.rawBlock "html") <$> rawVerbatimBlock) + <|> (do guardEnabled Ext_markdown_attribute + oldMarkdownAttribute <- stateMarkdownAttribute <$> getState + markdownAttribute <- + case lookup "markdown" attrs of + Just "0" -> False <$ updateState (\st -> st{ + stateMarkdownAttribute = False }) + Just _ -> True <$ updateState (\st -> st{ + stateMarkdownAttribute = True }) + Nothing -> return oldMarkdownAttribute + res <- if markdownAttribute + then rawHtmlBlocks + else htmlBlock' + updateState $ \st -> st{ stateMarkdownAttribute = + oldMarkdownAttribute } + return res) + <|> (guardEnabled Ext_markdown_in_html_blocks >> rawHtmlBlocks)) + <|> htmlBlock' + +htmlBlock' :: MarkdownParser (F Blocks) htmlBlock' = try $ do first <- htmlElement - finalSpace <- many spaceChar - finalNewlines <- many newline - return $ first ++ finalSpace ++ finalNewlines + skipMany spaceChar + optional blanklines + return $ return $ B.rawBlock "html" first strictHtmlBlock :: MarkdownParser String strictHtmlBlock = htmlInBalanced (not . isInlineTag) @@ -934,48 +973,36 @@ rawVerbatimBlock = try $ do ["pre", "style", "script"]) (const True)) contents <- manyTill anyChar (htmlTag (~== TagClose tag)) - return $ open ++ contents ++ renderTags [TagClose tag] + return $ open ++ contents ++ renderTags' [TagClose tag] rawTeXBlock :: MarkdownParser (F Blocks) rawTeXBlock = do guardEnabled Ext_raw_tex - result <- (B.rawBlock "latex" <$> rawLaTeXBlock) - <|> (B.rawBlock "context" <$> rawConTeXtEnvironment) + result <- (B.rawBlock "latex" . concat <$> + rawLaTeXBlock `sepEndBy1` blankline) + <|> (B.rawBlock "context" . concat <$> + rawConTeXtEnvironment `sepEndBy1` blankline) spaces return $ return result -rawHtmlBlocks :: MarkdownParser String +rawHtmlBlocks :: MarkdownParser (F Blocks) rawHtmlBlocks = do - htmlBlocks <- many1 $ try $ do - s <- rawVerbatimBlock <|> try ( - do (t,raw) <- htmlTag isBlockTag - guard $ t ~/= TagOpen "div" [] && - t ~/= TagClose "div" - exts <- getOption readerExtensions - -- if open tag, need markdown="1" if - -- markdown_attributes extension is set - case t of - TagOpen _ as - | Ext_markdown_attribute `Set.member` - exts -> - if "markdown" `notElem` - map fst as - then mzero - else return $ - stripMarkdownAttribute raw - | otherwise -> return raw - _ -> return raw ) - sps <- do sp1 <- many spaceChar - sp2 <- option "" (blankline >> return "\n") - sp3 <- many spaceChar - sp4 <- option "" blanklines - return $ sp1 ++ sp2 ++ sp3 ++ sp4 - -- note: we want raw html to be able to - -- precede a code block, when separated - -- by a blank line - return $ s ++ sps - let combined = concat htmlBlocks - return $ if last combined == '\n' then init combined else combined + (TagOpen tagtype _, raw) <- htmlTag isBlockTag + -- try to find closing tag + -- we set stateInHtmlBlock so that closing tags that can be either block or + -- inline will not be parsed as inline tags + oldInHtmlBlock <- stateInHtmlBlock <$> getState + updateState $ \st -> st{ stateInHtmlBlock = Just tagtype } + let closer = htmlTag (\x -> x ~== TagClose tagtype) + contents <- mconcat <$> many (notFollowedBy' closer >> block) + result <- + (closer >>= \(_, rawcloser) -> return ( + return (B.rawBlock "html" $ stripMarkdownAttribute raw) <> + contents <> + return (B.rawBlock "html" rawcloser))) + <|> return (return (B.rawBlock "html" raw) <> contents) + updateState $ \st -> st{ stateInHtmlBlock = oldInHtmlBlock } + return result -- remove markdown="1" attribute stripMarkdownAttribute :: String -> String @@ -1163,7 +1190,7 @@ gridPart ch = do return (length dashes, length dashes + 1) gridDashedLines :: Char -> Parser [Char] st [(Int,Int)] -gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) >>~ blankline +gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) <* blankline removeFinalBar :: String -> String removeFinalBar = @@ -1388,8 +1415,7 @@ escapedChar = do ltSign :: MarkdownParser (F Inlines) ltSign = do guardDisabled Ext_raw_html - <|> guardDisabled Ext_markdown_in_html_blocks - <|> (notFollowedBy' (htmlTag isBlockTag) >> return ()) + <|> (notFollowedByHtmlCloser >> notFollowedBy' (htmlTag isBlockTag)) char '<' return $ return $ B.str "<" @@ -1434,52 +1460,60 @@ math = (return . B.displayMath <$> (mathDisplay >>= applyMacros')) enclosure :: Char -> MarkdownParser (F Inlines) enclosure c = do + -- we can't start an enclosure with _ if after a string and + -- the intraword_underscores extension is enabled: + guardDisabled Ext_intraword_underscores + <|> guard (c == '*') + <|> (guard =<< notAfterString) cs <- many1 (char c) (return (B.str cs) <>) <$> whitespace - <|> case length cs of + <|> do + case length cs of 3 -> three c 2 -> two c mempty 1 -> one c mempty _ -> return (return $ B.str cs) +ender :: Char -> Int -> MarkdownParser () +ender c n = try $ do + count n (char c) + guard (c == '*') + <|> guardDisabled Ext_intraword_underscores + <|> notFollowedBy alphaNum + -- Parse inlines til you hit one c or a sequence of two cs. -- If one c, emit emph and then parse two. -- If two cs, emit strong and then parse one. -- Otherwise, emit ccc then the results. three :: Char -> MarkdownParser (F Inlines) three c = do - contents <- mconcat <$> many (notFollowedBy (char c) >> inline) - (try (string [c,c,c]) >> return ((B.strong . B.emph) <$> contents)) - <|> (try (string [c,c]) >> one c (B.strong <$> contents)) - <|> (char c >> two c (B.emph <$> contents)) + contents <- mconcat <$> many (notFollowedBy (ender c 1) >> inline) + (ender c 3 >> return ((B.strong . B.emph) <$> contents)) + <|> (ender c 2 >> one c (B.strong <$> contents)) + <|> (ender c 1 >> two c (B.emph <$> contents)) <|> return (return (B.str [c,c,c]) <> contents) -- Parse inlines til you hit two c's, and emit strong. -- If you never do hit two cs, emit ** plus inlines parsed. two :: Char -> F Inlines -> MarkdownParser (F Inlines) two c prefix' = do - let ender = try $ string [c,c] - contents <- mconcat <$> many (try $ notFollowedBy ender >> inline) - (ender >> return (B.strong <$> (prefix' <> contents))) + contents <- mconcat <$> many (try $ notFollowedBy (ender c 2) >> inline) + (ender c 2 >> return (B.strong <$> (prefix' <> contents))) <|> return (return (B.str [c,c]) <> (prefix' <> contents)) -- Parse inlines til you hit a c, and emit emph. -- If you never hit a c, emit * plus inlines parsed. one :: Char -> F Inlines -> MarkdownParser (F Inlines) one c prefix' = do - contents <- mconcat <$> many ( (notFollowedBy (char c) >> inline) + contents <- mconcat <$> many ( (notFollowedBy (ender c 1) >> inline) <|> try (string [c,c] >> - notFollowedBy (char c) >> + notFollowedBy (ender c 1) >> two c mempty) ) - (char c >> return (B.emph <$> (prefix' <> contents))) + (ender c 1 >> return (B.emph <$> (prefix' <> contents))) <|> return (return (B.str [c]) <> (prefix' <> contents)) strongOrEmph :: MarkdownParser (F Inlines) -strongOrEmph = enclosure '*' <|> (checkIntraword >> enclosure '_') - where checkIntraword = do - exts <- getOption readerExtensions - when (Ext_intraword_underscores `Set.member` exts) $ do - guard =<< notAfterString +strongOrEmph = enclosure '*' <|> enclosure '_' -- | Parses a list of inlines between start and end delimiters. inlinesBetween :: (Show b) @@ -1489,7 +1523,7 @@ inlinesBetween :: (Show b) inlinesBetween start end = (trimInlinesF . mconcat) <$> try (start >> many1Till inner end) where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) - innerSpace = try $ whitespace >>~ notFollowedBy' end + innerSpace = try $ whitespace <* notFollowedBy' end strikeout :: MarkdownParser (F Inlines) strikeout = fmap B.strikeout <$> @@ -1730,7 +1764,7 @@ inBrackets parser = do spanHtml :: MarkdownParser (F Inlines) spanHtml = try $ do - guardEnabled Ext_markdown_in_html_blocks + guardEnabled Ext_native_spans (TagOpen _ attrs, _) <- htmlTag (~== TagOpen "span" []) contents <- mconcat <$> manyTill inline (htmlTag (~== TagClose "span")) let ident = fromMaybe "" $ lookup "id" attrs @@ -1745,14 +1779,19 @@ spanHtml = try $ do divHtml :: MarkdownParser (F Blocks) divHtml = try $ do - guardEnabled Ext_markdown_in_html_blocks + guardEnabled Ext_native_divs (TagOpen _ attrs, rawtag) <- htmlTag (~== TagOpen "div" []) + -- we set stateInHtmlBlock so that closing tags that can be either block or + -- inline will not be parsed as inline tags + oldInHtmlBlock <- stateInHtmlBlock <$> getState + updateState $ \st -> st{ stateInHtmlBlock = Just "div" } bls <- option "" (blankline >> option "" blanklines) contents <- mconcat <$> many (notFollowedBy' (htmlTag (~== TagClose "div")) >> block) closed <- option False (True <$ htmlTag (~== TagClose "div")) if closed then do + updateState $ \st -> st{ stateInHtmlBlock = oldInHtmlBlock } let ident = fromMaybe "" $ lookup "id" attrs let classes = maybe [] words $ lookup "class" attrs let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] @@ -1763,10 +1802,17 @@ divHtml = try $ do rawHtmlInline :: MarkdownParser (F Inlines) rawHtmlInline = do guardEnabled Ext_raw_html + inHtmlBlock <- stateInHtmlBlock <$> getState + let isCloseBlockTag t = case inHtmlBlock of + Just t' -> t ~== TagClose t' + Nothing -> False mdInHtml <- option False $ - guardEnabled Ext_markdown_in_html_blocks >> return True + ( guardEnabled Ext_markdown_in_html_blocks + <|> guardEnabled Ext_markdown_attribute + ) >> return True (_,result) <- htmlTag $ if mdInHtml - then isInlineTag + then (\x -> isInlineTag x && + not (isCloseBlockTag x)) else not . isTextTag return $ return $ B.rawInline "html" result diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs index f1dcce8f7..e43b8a86c 100644 --- a/src/Text/Pandoc/Readers/MediaWiki.hs +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -569,7 +569,8 @@ endline = () <$ try (newline <* imageIdentifiers :: [MWParser ()] imageIdentifiers = [sym (identifier ++ ":") | identifier <- identifiers] - where identifiers = ["File", "Image", "Archivo", "Datei", "Fichier"] + where identifiers = ["File", "Image", "Archivo", "Datei", "Fichier", + "Bild"] image :: MWParser Inlines image = try $ do @@ -634,7 +635,7 @@ inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines inlinesBetween start end = (trimInlines . mconcat) <$> try (start >> many1Till inner end) where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) - innerSpace = try $ whitespace >>~ notFollowedBy' end + innerSpace = try $ whitespace <* notFollowedBy' end emph :: MWParser Inlines emph = B.emph <$> nested (inlinesBetween start end) diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index 7a35e2ca0..e1c29d1e8 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -41,10 +41,10 @@ import Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF ) import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline) import Text.Pandoc.Shared (compactify', compactify'DL) -import Text.TeXMath (texMathToPandoc, DisplayType(..)) +import Text.TeXMath (readTeX, writePandoc, DisplayType(..)) import Control.Applicative ( Applicative, pure - , (<$>), (<$), (<*>), (<*), (*>), (<**>) ) + , (<$>), (<$), (<*>), (<*), (*>) ) import Control.Arrow (first) import Control.Monad (foldM, guard, liftM, liftM2, mplus, mzero, when) import Control.Monad.Reader (Reader, runReader, ask, asks) @@ -274,7 +274,7 @@ optionalAttributes parser = try $ parseBlockAttributes :: OrgParser () parseBlockAttributes = do attrs <- many attribute - () <$ mapM (uncurry parseAndAddAttribute) attrs + mapM_ (uncurry parseAndAddAttribute) attrs where attribute :: OrgParser (String, String) attribute = try $ do @@ -341,14 +341,36 @@ verseBlock blkProp = try $ do fmap B.para . mconcat . intersperse (pure B.linebreak) <$> mapM (parseFromString parseInlines) (lines content) +exportsCode :: [(String, String)] -> Bool +exportsCode attrs = not (("rundoc-exports", "none") `elem` attrs + || ("rundoc-exports", "results") `elem` attrs) + +exportsResults :: [(String, String)] -> Bool +exportsResults attrs = ("rundoc-exports", "results") `elem` attrs + || ("rundoc-exports", "both") `elem` attrs + +followingResultsBlock :: OrgParser (Maybe String) +followingResultsBlock = + optionMaybe (try $ blanklines *> stringAnyCase "#+RESULTS:" + *> blankline + *> (unlines <$> many1 exampleLine)) + codeBlock :: BlockProperties -> OrgParser (F Blocks) codeBlock blkProp = do skipSpaces - (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) - id' <- fromMaybe "" <$> lookupBlockAttribute "name" - content <- rawBlockContent blkProp - let codeBlck = B.codeBlockWith ( id', classes, kv ) content - maybe (pure codeBlck) (labelDiv codeBlck) <$> lookupInlinesAttr "caption" + (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) + id' <- fromMaybe "" <$> lookupBlockAttribute "name" + content <- rawBlockContent blkProp + resultsContent <- followingResultsBlock + let includeCode = exportsCode kv + let includeResults = exportsResults kv + let codeBlck = B.codeBlockWith ( id', classes, kv ) content + labelledBlck <- maybe (pure codeBlck) + (labelDiv codeBlck) + <$> lookupInlinesAttr "caption" + let resultBlck = pure $ maybe mempty (exampleCode) resultsContent + return $ (if includeCode then labelledBlck else mempty) + <> (if includeResults then resultBlck else mempty) where labelDiv blk value = B.divWith nullAttr <$> (mappend <$> labelledBlock value @@ -780,8 +802,12 @@ noteBlock = try $ do -- Paragraphs or Plain text paraOrPlain :: OrgParser (F Blocks) -paraOrPlain = try $ - parseInlines <**> (fmap <$> option B.plain (try $ newline *> pure B.para)) +paraOrPlain = try $ do + ils <- parseInlines + nl <- option False (newline >> return True) + try (guard nl >> notFollowedBy (orderedListStart <|> bulletListStart) >> + return (B.para <$> ils)) + <|> (return (B.plain <$> ils)) inlinesTillNewline :: OrgParser (F Inlines) inlinesTillNewline = trimInlinesF . mconcat <$> manyTill inline newline @@ -1357,7 +1383,7 @@ inlineLaTeX = try $ do maybe mzero returnF $ parseAsMath cmd `mplus` parseAsInlineLaTeX cmd where parseAsMath :: String -> Maybe Inlines - parseAsMath cs = maybeRight $ B.fromList <$> texMathToPandoc DisplayInline cs + parseAsMath cs = B.fromList <$> texMathToPandoc cs parseAsInlineLaTeX :: String -> Maybe Inlines parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs @@ -1365,6 +1391,9 @@ inlineLaTeX = try $ do state :: ParserState state = def{ stateOptions = def{ readerParseRaw = True }} + texMathToPandoc inp = (maybeRight $ readTeX inp) >>= + writePandoc DisplayInline + maybeRight :: Either a b -> Maybe b maybeRight = either (const Nothing) Just diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index fa8438e70..e5eccb116 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -47,7 +47,7 @@ import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>)) import qualified Text.Pandoc.Builder as B import Data.Monoid (mconcat, mempty) import Data.Sequence (viewr, ViewR(..)) -import Data.Char (toLower) +import Data.Char (toLower, isHexDigit) -- | Parse reStructuredText string and return Pandoc document. readRST :: ReaderOptions -- ^ Reader options @@ -460,7 +460,7 @@ listItem :: RSTParser Int listItem start = try $ do (markerLength, first) <- rawListItem start rest <- many (listContinuation markerLength) - blanks <- choice [ try (many blankline >>~ lookAhead start), + blanks <- choice [ try (many blankline <* lookAhead start), many1 blankline ] -- whole list must end with blank. -- parsing with ListItemState forces markers at beginning of lines to -- count as list item markers, even if not separated by blank space. @@ -480,7 +480,7 @@ listItem start = try $ do orderedList :: RSTParser Blocks orderedList = try $ do - (start, style, delim) <- lookAhead (anyOrderedListMarker >>~ spaceChar) + (start, style, delim) <- lookAhead (anyOrderedListMarker <* spaceChar) items <- many1 (listItem (orderedListStart style delim)) let items' = compactify' items return $ B.orderedListWith (start, style, delim) items' @@ -656,9 +656,6 @@ extractUnicodeChar s = maybe Nothing (\c -> Just (c,rest)) mbc where (ds,rest) = span isHexDigit s mbc = safeRead ('\'':'\\':'x':ds ++ "'") -isHexDigit :: Char -> Bool -isHexDigit c = c `elem` "0123456789ABCDEFabcdef" - extractCaption :: RSTParser (Inlines, Blocks) extractCaption = do capt <- trimInlines . mconcat <$> many inline @@ -747,7 +744,7 @@ simpleReferenceName = do referenceName :: RSTParser Inlines referenceName = quotedReferenceName <|> - (try $ simpleReferenceName >>~ lookAhead (char ':')) <|> + (try $ simpleReferenceName <* lookAhead (char ':')) <|> unquotedReferenceName referenceKey :: RSTParser [Char] @@ -1076,7 +1073,7 @@ explicitLink = try $ do referenceLink :: RSTParser Inlines referenceLink = try $ do - (label',ref) <- withRaw (quotedReferenceName <|> simpleReferenceName) >>~ + (label',ref) <- withRaw (quotedReferenceName <|> simpleReferenceName) <* char '_' state <- getState let keyTable = stateKeys state diff --git a/src/Text/Pandoc/Readers/TeXMath.hs b/src/Text/Pandoc/Readers/TeXMath.hs index f03eae044..3fee3051e 100644 --- a/src/Text/Pandoc/Readers/TeXMath.hs +++ b/src/Text/Pandoc/Readers/TeXMath.hs @@ -27,7 +27,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of TeX math to a list of 'Pandoc' inline elements. -} -module Text.Pandoc.Readers.TeXMath ( readTeXMath, readTeXMath' ) where +module Text.Pandoc.Readers.TeXMath ( texMathToInlines ) where import Text.Pandoc.Definition import Text.TeXMath @@ -35,22 +35,14 @@ import Text.TeXMath -- | Converts a raw TeX math formula to a list of 'Pandoc' inlines. -- Defaults to raw formula between @$@ or @$$@ characters if entire formula -- can't be converted. -readTeXMath' :: MathType +texMathToInlines :: MathType -> String -- ^ String to parse (assumes @'\n'@ line endings) -> [Inline] -readTeXMath' mt inp = case texMathToPandoc dt inp of - Left _ -> [Str (delim ++ inp ++ delim)] - Right res -> res +texMathToInlines mt inp = + case writePandoc dt `fmap` readTeX inp of + Right (Just ils) -> ils + _ -> [Str (delim ++ inp ++ delim)] where (dt, delim) = case mt of DisplayMath -> (DisplayBlock, "$$") InlineMath -> (DisplayInline, "$") -{-# DEPRECATED readTeXMath "Use readTeXMath' from Text.Pandoc.JSON instead" #-} --- | Converts a raw TeX math formula to a list of 'Pandoc' inlines. --- Defaults to raw formula between @$@ characters if entire formula --- can't be converted. (This is provided for backwards compatibility; --- it is better to use @readTeXMath'@, which properly distinguishes --- between display and inline math.) -readTeXMath :: String -- ^ String to parse (assumes @'\n'@ line endings) - -> [Inline] -readTeXMath = readTeXMath' InlineMath diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index 6d839ec1d..cd34da942 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -265,8 +265,20 @@ definitionList :: Parser [Char] ParserState Blocks definitionList = try $ B.definitionList <$> many1 definitionListItem -- | List start character. -listStart :: Parser [Char] st Char -listStart = oneOf "*#-" +listStart :: Parser [Char] ParserState () +listStart = genericListStart '*' + <|> () <$ genericListStart '#' + <|> () <$ definitionListStart + +genericListStart :: Char -> Parser [Char] st () +genericListStart c = () <$ try (many1 (char c) >> whitespace) + +definitionListStart :: Parser [Char] ParserState Inlines +definitionListStart = try $ do + char '-' + whitespace + trimInlines . mconcat <$> + many1Till inline (try (string ":=")) <* optional whitespace listInline :: Parser [Char] ParserState Inlines listInline = try (notFollowedBy newline >> inline) @@ -278,8 +290,7 @@ listInline = try (notFollowedBy newline >> inline) -- break. definitionListItem :: Parser [Char] ParserState (Inlines, [Blocks]) definitionListItem = try $ do - string "- " - term <- mconcat <$> many1Till inline (try (whitespace >> string ":=")) + term <- definitionListStart def' <- multilineDef <|> inlineDef return (term, def') where inlineDef :: Parser [Char] ParserState [Blocks] @@ -488,7 +499,7 @@ str = do return $ B.str fullStr -- | Some number of space chars -whitespace :: Parser [Char] ParserState Inlines +whitespace :: Parser [Char] st Inlines whitespace = many1 spaceChar >> return B.space <?> "whitespace" -- | In Textile, an isolated endline character is a line break @@ -616,7 +627,8 @@ simpleInline border construct = try $ do attr <- attributes body <- trimInlines . mconcat <$> withQuoteContext InSingleQuote - (manyTill inline (try border <* notFollowedBy alphaNum)) + (manyTill (notFollowedBy newline >> inline) + (try border <* notFollowedBy alphaNum)) return $ construct $ if attr == nullAttr then body diff --git a/src/Text/Pandoc/Readers/Txt2Tags.hs b/src/Text/Pandoc/Readers/Txt2Tags.hs new file mode 100644 index 000000000..3a51b9d84 --- /dev/null +++ b/src/Text/Pandoc/Readers/Txt2Tags.hs @@ -0,0 +1,548 @@ +{-# LANGUAGE ViewPatterns #-} +{- +Copyright (C) 2014 Matthew Pickering <matthewtpickering@gmail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Txt2Tags + Copyright : Copyright (C) 2014 Matthew Pickering + License : GNU GPL, version 2 or above + + Maintainer : Matthew Pickering <matthewtpickering@gmail.com> + +Conversion of txt2tags formatted plain text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Txt2Tags ( readTxt2Tags + , getT2TMeta + , T2TMeta (..) + , readTxt2TagsNoMacros) + where + +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder ( Inlines, Blocks, (<>) + , trimInlines ) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Shared (escapeURI,compactify', compactify'DL) +import Text.Pandoc.Parsing hiding (space, spaces, uri, macro) +import Control.Applicative ((<$>), (<$), (<*>), (<*), (*>)) +import Data.Char (toLower) +import Data.List (transpose, intersperse, intercalate) +import Data.Maybe (fromMaybe) +import Data.Monoid (Monoid, mconcat, mempty, mappend) +--import Network.URI (isURI) -- Not sure whether to use this function +import Control.Monad (void, guard, when) +import Data.Default +import Control.Monad.Reader (Reader, runReader, asks) + +import Data.Time.LocalTime (getZonedTime) +import Text.Pandoc.Compat.Directory(getModificationTime) +import Data.Time.Format (formatTime) +import System.Locale (defaultTimeLocale) +import System.IO.Error (catchIOError) + +type T2T = ParserT String ParserState (Reader T2TMeta) + +-- | An object for the T2T macros meta information +-- the contents of each field is simply substituted verbatim into the file +data T2TMeta = T2TMeta { + date :: String -- ^ Current date + , mtime :: String -- ^ Last modification time of infile + , infile :: FilePath -- ^ Input file + , outfile :: FilePath -- ^ Output file + } deriving Show + +instance Default T2TMeta where + def = T2TMeta "" "" "" "" + +-- | Get the meta information required by Txt2Tags macros +getT2TMeta :: [FilePath] -> FilePath -> IO T2TMeta +getT2TMeta inps out = do + curDate <- formatTime defaultTimeLocale "%F" <$> getZonedTime + let getModTime = fmap (formatTime defaultTimeLocale "%F") . + getModificationTime + curMtime <- catchIOError + (maximum <$> mapM getModTime inps) + (const (return "")) + return $ T2TMeta curDate curMtime (intercalate ", " inps) out + +-- | Read Txt2Tags from an input string returning a Pandoc document +readTxt2Tags :: T2TMeta -> ReaderOptions -> String -> Pandoc +readTxt2Tags t opts s = flip runReader t $ readWithM parseT2T (def {stateOptions = opts}) (s ++ "\n\n") + +-- | Read Txt2Tags (ignoring all macros) from an input string returning +-- a Pandoc document +readTxt2TagsNoMacros :: ReaderOptions -> String -> Pandoc +readTxt2TagsNoMacros = readTxt2Tags def + +parseT2T :: T2T Pandoc +parseT2T = do + _ <- (Nothing <$ try blankline) <|> (Just <$> (count 3 anyLine)) + config <- manyTill setting (notFollowedBy setting) + -- TODO: Handle settings better + let settings = foldr (\(k,v) -> B.setMeta k (MetaString v)) nullMeta config + updateState (\s -> s {stateMeta = settings}) + body <- mconcat <$> manyTill block eof + return $ Pandoc mempty (B.toList body) + +type Keyword = String +type Value = String + +setting :: T2T (Keyword, Value) +setting = do + string "%!" + keyword <- ignoreSpacesCap (many1 alphaNum) + char ':' + value <- ignoreSpacesCap (manyTill anyChar (newline)) + return (keyword, value) + +-- Blocks + +parseBlocks :: T2T Blocks +parseBlocks = mconcat <$> manyTill block eof + +block :: T2T Blocks +block = do + choice + [ mempty <$ blanklines + , quote + , hrule -- hrule must go above title + , title + , commentBlock + , verbatim + , rawBlock + , taggedBlock + , list + , table + , para + ] + +title :: T2T Blocks +title = try $ balancedTitle '+' <|> balancedTitle '=' + +balancedTitle :: Char -> T2T Blocks +balancedTitle c = try $ do + spaces + level <- length <$> many1 (char c) + guard (level <= 5) -- Max header level 5 + heading <- manyTill (noneOf "\n\r") (count level (char c)) + label <- optionMaybe (enclosed (char '[') (char ']') (alphaNum <|> oneOf "_-")) + many spaceChar *> newline + let attr = maybe nullAttr (\x -> (x, [], [])) label + return $ B.headerWith attr level (trimInlines $ B.text heading) + +para :: T2T Blocks +para = try $ do + ils <- parseInlines + nl <- option False (True <$ newline) + option (B.plain ils) (guard nl >> notFollowedBy listStart >> return (B.para ils)) + where + listStart = try bulletListStart <|> orderedListStart + +commentBlock :: T2T Blocks +commentBlock = try (blockMarkupArea (anyLine) (const mempty) "%%%") <|> comment + +-- Seperator and Strong line treated the same +hrule :: T2T Blocks +hrule = try $ do + spaces + line <- many1 (oneOf "=-_") + guard (length line >= 20) + B.horizontalRule <$ blankline + +quote :: T2T Blocks +quote = try $ do + lookAhead tab + rawQuote <- many1 (tab *> optional spaces *> anyLine) + contents <- parseFromString parseBlocks (intercalate "\n" rawQuote ++ "\n\n") + return $ B.blockQuote contents + +commentLine :: T2T Inlines +commentLine = comment + +-- List Parsing code from Org Reader + +list :: T2T Blocks +list = choice [bulletList, orderedList, definitionList] + +bulletList :: T2T Blocks +bulletList = B.bulletList . compactify' + <$> many1 (listItem bulletListStart parseBlocks) + +orderedList :: T2T Blocks +orderedList = B.orderedList . compactify' + <$> many1 (listItem orderedListStart parseBlocks) + +definitionList :: T2T Blocks +definitionList = try $ do + B.definitionList . compactify'DL <$> + many1 (listItem definitionListStart definitionListEnd) + +definitionListEnd :: T2T (Inlines, [Blocks]) +definitionListEnd = (,) <$> (mconcat <$> manyTill inline newline) <*> ((:[]) <$> parseBlocks) + +genericListStart :: T2T Char + -> T2T Int +genericListStart listMarker = try $ + (2+) <$> (length <$> many spaceChar + <* listMarker <* space <* notFollowedBy space) + +-- parses bullet list \start and returns its length (excl. following whitespace) +bulletListStart :: T2T Int +bulletListStart = genericListStart (char '-') + +orderedListStart :: T2T Int +orderedListStart = genericListStart (char '+' ) + +definitionListStart :: T2T Int +definitionListStart = genericListStart (char ':') + +-- parse raw text for one list item, excluding start marker and continuations +listItem :: T2T Int + -> T2T a + -> T2T a +listItem start end = try $ do + markerLength <- try start + firstLine <- anyLineNewline + blank <- option "" ("\n" <$ blankline) + rest <- concat <$> many (listContinuation markerLength) + parseFromString end $ firstLine ++ blank ++ rest + +-- continuation of a list item - indented and separated by blankline or endline. +-- Note: nested lists are parsed as continuations. +listContinuation :: Int + -> T2T String +listContinuation markerLength = try $ + notFollowedBy' (blankline >> blankline) + *> (mappend <$> (concat <$> many1 listLine) + <*> many blankline) + where listLine = try $ indentWith markerLength *> anyLineNewline + +anyLineNewline :: T2T String +anyLineNewline = (++ "\n") <$> anyLine + +indentWith :: Int -> T2T String +indentWith n = count n space + +-- Table + +table :: T2T Blocks +table = try $ do + header <- fmap snd <$> option mempty (try headerRow) + rows <- many1 (many commentLine *> tableRow) + let columns = transpose rows + let ncolumns = length columns + let aligns = map (foldr1 findAlign) (map (map fst) columns) + let rows' = map (map snd) rows + let size = maximum (map length rows') + let rowsPadded = map (pad size) rows' + let headerPadded = if (not (null header)) then pad size header else mempty + return $ B.table mempty + (zip aligns (replicate ncolumns 0.0)) + headerPadded rowsPadded + +pad :: (Show a, Monoid a) => Int -> [a] -> [a] +pad n xs = xs ++ (replicate (n - length xs) mempty) + + +findAlign :: Alignment -> Alignment -> Alignment +findAlign x y + | x == y = x + | otherwise = AlignDefault + +headerRow :: T2T [(Alignment, Blocks)] +headerRow = genericRow (string "||") + +tableRow :: T2T [(Alignment, Blocks)] +tableRow = genericRow (char '|') + +genericRow :: T2T a -> T2T [(Alignment, Blocks)] +genericRow start = try $ do + spaces *> start + manyTill tableCell newline <?> "genericRow" + + +tableCell :: T2T (Alignment, Blocks) +tableCell = try $ do + leftSpaces <- length <$> lookAhead (many1 space) -- Case of empty cell means we must lookAhead + content <- (manyTill inline (try $ lookAhead (cellEnd))) + rightSpaces <- length <$> many space + let align = + case compare leftSpaces rightSpaces of + LT -> AlignLeft + EQ -> AlignCenter + GT -> AlignRight + endOfCell + return $ (align, B.plain (B.trimInlines $ mconcat content)) + where + cellEnd = (void newline <|> (many1 space *> endOfCell)) + +endOfCell :: T2T () +endOfCell = try (skipMany1 $ char '|') <|> ( () <$ lookAhead newline) + +-- Raw area + +verbatim :: T2T Blocks +verbatim = genericBlock anyLineNewline B.codeBlock "```" + +rawBlock :: T2T Blocks +rawBlock = genericBlock anyLineNewline (B.para . B.str) "\"\"\"" + +taggedBlock :: T2T Blocks +taggedBlock = do + target <- getTarget + genericBlock anyLineNewline (B.rawBlock target) "'''" + +-- Generic + +genericBlock :: Monoid a => T2T a -> (a -> Blocks) -> String -> T2T Blocks +genericBlock p f s = blockMarkupArea p f s <|> blockMarkupLine p f s + +blockMarkupArea :: Monoid a => (T2T a) -> (a -> Blocks) -> String -> T2T Blocks +blockMarkupArea p f s = try $ (do + string s *> blankline + f . mconcat <$> (manyTill p (eof <|> void (string s *> blankline)))) + +blockMarkupLine :: T2T a -> (a -> Blocks) -> String -> T2T Blocks +blockMarkupLine p f s = try (f <$> (string s *> space *> p)) + +-- Can be in either block or inline position +comment :: Monoid a => T2T a +comment = try $ do + atStart + notFollowedBy macro + mempty <$ (char '%' *> anyLine) + +-- Inline + +parseInlines :: T2T Inlines +parseInlines = trimInlines . mconcat <$> many1 inline + +inline :: T2T Inlines +inline = do + choice + [ endline + , macro + , commentLine + , whitespace + , url + , link + , image + , bold + , underline + , code + , raw + , tagged + , strike + , italic + , code + , str + , symbol + ] + +bold :: T2T Inlines +bold = inlineMarkup inline B.strong '*' (B.str) + +underline :: T2T Inlines +underline = inlineMarkup inline B.emph '_' (B.str) + +strike :: T2T Inlines +strike = inlineMarkup inline B.strikeout '-' (B.str) + +italic :: T2T Inlines +italic = inlineMarkup inline B.emph '/' (B.str) + +code :: T2T Inlines +code = inlineMarkup ((:[]) <$> anyChar) B.code '`' id + +raw :: T2T Inlines +raw = inlineMarkup ((:[]) <$> anyChar) B.text '"' id + +tagged :: T2T Inlines +tagged = do + target <- getTarget + inlineMarkup ((:[]) <$> anyChar) (B.rawInline target) '\'' id + +-- Parser for markup indicated by a double character. +-- Inline markup is greedy and glued +-- Greedy meaning ***a*** = Bold [Str "*a*"] +-- Glued meaning that markup must be tight to content +-- Markup can't pass newlines +inlineMarkup :: Monoid a + => (T2T a) -- Content parser + -> (a -> Inlines) -- Constructor + -> Char -- Fence + -> (String -> a) -- Special Case to handle ****** + -> T2T Inlines +inlineMarkup p f c special = try $ do + start <- many1 (char c) + let l = length start + guard (l >= 2) + when (l == 2) (void $ notFollowedBy space) + -- We must make sure that there is no space before the start of the + -- closing tags + body <- optionMaybe (try $ manyTill (noneOf "\n\r") $ + (try $ lookAhead (noneOf " " >> string [c,c] ))) + case body of + Just middle -> do + lastChar <- anyChar + end <- many1 (char c) + let parser inp = parseFromString (mconcat <$> many p) inp + let start' = special (drop 2 start) + body' <- parser (middle ++ [lastChar]) + let end' = special (drop 2 end) + return $ f (start' <> body' <> end') + Nothing -> do -- Either bad or case such as ***** + guard (l >= 5) + let body' = (replicate (l - 4) c) + return $ f (special body') + +link :: T2T Inlines +link = try imageLink <|> titleLink + +-- Link with title +titleLink :: T2T Inlines +titleLink = try $ do + char '[' + notFollowedBy space + tokens <- sepBy1 (many $ noneOf " ]") space + guard (length tokens >= 2) + char ']' + let link' = last tokens + guard (length link' > 0) + let tit = concat (intersperse " " (init tokens)) + return $ B.link link' "" (B.text tit) + +-- Link with image +imageLink :: T2T Inlines +imageLink = try $ do + char '[' + body <- image + many1 space + l <- manyTill (noneOf "\n\r ") (char ']') + return (B.link l "" body) + +macro :: T2T Inlines +macro = try $ do + name <- string "%%" *> oneOfStringsCI (map fst commands) + optional (try $ enclosed (char '(') (char ')') anyChar) + lookAhead (spaceChar <|> oneOf specialChars <|> newline) + maybe (return mempty) (\f -> B.str <$> asks f) (lookup name commands) + where + commands = [ ("date", date), ("mtime", mtime) + , ("infile", infile), ("outfile", outfile)] + +-- raw URLs in text are automatically linked +url :: T2T Inlines +url = try $ do + (rawUrl, escapedUrl) <- (try uri <|> emailAddress) + return $ B.link rawUrl "" (B.str escapedUrl) + +uri :: T2T (String, String) +uri = try $ do + address <- t2tURI + return (address, escapeURI address) + +-- The definition of a URI in the T2T source differs from the +-- actual definition. This is a transcription of the definition in +-- the source of v2.6 +--isT2TURI :: String -> Bool +--isT2TURI (parse t2tURI "" -> Right _) = True +--isT2TURI _ = False + +t2tURI :: T2T String +t2tURI = do + start <- try ((++) <$> proto <*> urlLogin) <|> guess + domain <- many1 chars + sep <- many (char '/') + form' <- option mempty ((:) <$> char '?' <*> many1 form) + anchor' <- option mempty ((:) <$> char '#' <*> many anchor) + return (start ++ domain ++ sep ++ form' ++ anchor') + where + protos = ["http", "https", "ftp", "telnet", "gopher", "wais"] + proto = (++) <$> oneOfStrings protos <*> string "://" + guess = (++) <$> (((++) <$> stringAnyCase "www" <*> option mempty ((:[]) <$> oneOf "23")) + <|> stringAnyCase "ftp") <*> ((:[]) <$> char '.') + login = alphaNum <|> oneOf "_.-" + pass = many (noneOf " @") + chars = alphaNum <|> oneOf "%._/~:,=$@&+-" + anchor = alphaNum <|> oneOf "%._0" + form = chars <|> oneOf ";*" + urlLogin = option mempty $ try ((\x y z -> x ++ y ++ [z]) <$> many1 login <*> option mempty ((:) <$> char ':' <*> pass) <*> char '@') + + +image :: T2T Inlines +image = try $ do + -- List taken from txt2tags source + let extensions = [".jpg", ".jpeg", ".gif", ".png", ".eps", ".bmp"] + char '[' + path <- manyTill (noneOf "\n\t\r ") (try $ lookAhead (oneOfStrings extensions)) + ext <- oneOfStrings extensions + char ']' + return $ B.image (path ++ ext) "" mempty + +-- Characters used in markup +specialChars :: String +specialChars = "%*-_/|:+" + +tab :: T2T Char +tab = char '\t' + +space :: T2T Char +space = char ' ' + +spaces :: T2T String +spaces = many space + +endline :: T2T Inlines +endline = try $ do + newline + notFollowedBy blankline + notFollowedBy hrule + notFollowedBy title + notFollowedBy verbatim + notFollowedBy rawBlock + notFollowedBy taggedBlock + notFollowedBy quote + notFollowedBy list + notFollowedBy table + return $ B.space + +str :: T2T Inlines +str = try $ do + B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") + +whitespace :: T2T Inlines +whitespace = try $ B.space <$ spaceChar + +symbol :: T2T Inlines +symbol = B.str . (:[]) <$> oneOf specialChars + +-- Utility + +getTarget :: T2T String +getTarget = do + mv <- lookupMeta "target" . stateMeta <$> getState + let MetaString target = fromMaybe (MetaString "html") mv + return target + +atStart :: T2T () +atStart = (sourceColumn <$> getPosition) >>= guard . (== 1) + +ignoreSpacesCap :: T2T String -> T2T String +ignoreSpacesCap p = map toLower <$> (spaces *> p <* spaces) + |
