diff options
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 78 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 114 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 18 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Org.hs | 2 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Txt2Tags.hs | 55 |
5 files changed, 196 insertions, 71 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index a1c16a03a..8ebe59569 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -95,6 +95,7 @@ import Control.Monad.Reader import Control.Monad.State import Control.Applicative ((<$>)) import Data.Sequence (ViewL(..), viewl) +import qualified Data.Sequence as Seq (null) readDocx :: ReaderOptions -> B.ByteString @@ -196,12 +197,6 @@ fixAuthors mv = mv codeStyles :: [String] codeStyles = ["VerbatimChar"] -strongStyles :: [String] -strongStyles = ["Strong", "Bold"] - -emphStyles :: [String] -emphStyles = ["Emphasis", "Italic"] - blockQuoteDivs :: [String] blockQuoteDivs = ["Quote", "BlockQuote", "BlockQuotation"] @@ -228,27 +223,44 @@ parPartToString (InternalHyperLink _ runs) = concatMap runToString runs parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs parPartToString _ = "" +blacklistedCharStyles :: [String] +blacklistedCharStyles = ["Hyperlink"] + +resolveDependentRunStyle :: RunStyle -> RunStyle +resolveDependentRunStyle rPr + | Just (s, _) <- rStyle rPr, s `elem` blacklistedCharStyles = + rPr + | Just (_, cs) <- rStyle rPr = + let rPr' = resolveDependentRunStyle cs + in + RunStyle { isBold = case isBold rPr of + Just bool -> Just bool + Nothing -> isBold rPr' + , isItalic = case isItalic rPr of + Just bool -> Just bool + Nothing -> isItalic rPr' + , isSmallCaps = case isSmallCaps rPr of + Just bool -> Just bool + Nothing -> isSmallCaps rPr' + , isStrike = case isStrike rPr of + Just bool -> Just bool + Nothing -> isStrike rPr' + , rVertAlign = case rVertAlign rPr of + Just valign -> Just valign + Nothing -> rVertAlign rPr' + , rUnderline = case rUnderline rPr of + Just ulstyle -> Just ulstyle + Nothing -> rUnderline rPr' + , rStyle = rStyle rPr } + | otherwise = rPr + runStyleToTransform :: RunStyle -> (Inlines -> Inlines) runStyleToTransform rPr - | Just s <- rStyle rPr + | Just (s, _) <- rStyle rPr , s `elem` spansToKeep = let rPr' = rPr{rStyle = Nothing} in (spanWith ("", [s], [])) . (runStyleToTransform rPr') - | Just s <- rStyle rPr - , s `elem` emphStyles = - let rPr' = rPr{rStyle = Nothing, isItalic = Nothing} - in - case isItalic rPr of - Just False -> runStyleToTransform rPr' - _ -> emph . (runStyleToTransform rPr') - | Just s <- rStyle rPr - , s `elem` strongStyles = - let rPr' = rPr{rStyle = Nothing, isBold = Nothing} - in - case isBold rPr of - Just False -> runStyleToTransform rPr' - _ -> strong . (runStyleToTransform rPr') | Just True <- isItalic rPr = emph . (runStyleToTransform rPr {isItalic = Nothing}) | Just True <- isBold rPr = @@ -257,22 +269,22 @@ runStyleToTransform rPr smallcaps . (runStyleToTransform rPr {isSmallCaps = Nothing}) | Just True <- isStrike rPr = strikeout . (runStyleToTransform rPr {isStrike = Nothing}) - | isSuperScript rPr = - superscript . (runStyleToTransform rPr {isSuperScript = False}) - | isSubScript rPr = - subscript . (runStyleToTransform rPr {isSubScript = False}) + | Just SupScrpt <- rVertAlign rPr = + superscript . (runStyleToTransform rPr {rVertAlign = Nothing}) + | Just SubScrpt <- rVertAlign rPr = + subscript . (runStyleToTransform rPr {rVertAlign = Nothing}) | Just "single" <- rUnderline rPr = emph . (runStyleToTransform rPr {rUnderline = Nothing}) | otherwise = id runToInlines :: Run -> DocxContext Inlines runToInlines (Run rs runElems) - | Just s <- rStyle rs + | Just (s, _) <- rStyle rs , s `elem` codeStyles = return $ code $ concatMap runElemToString runElems | otherwise = do let ils = concatReduce (map runElemToInlines runElems) - return $ (runStyleToTransform rs) ils + return $ (runStyleToTransform $ resolveDependentRunStyle rs) ils runToInlines (Footnote bps) = do blksList <- concatReduce <$> (mapM bodyPartToBlocks bps) return $ note blksList @@ -380,11 +392,21 @@ makeHeaderAnchor' (Header n (_, classes, kvs) ils) = return $ Header n (newIdent, classes, kvs) ils makeHeaderAnchor' blk = return blk +-- Rewrite a standalone paragraph block as a plain +singleParaToPlain :: Blocks -> Blocks +singleParaToPlain blks + | (Para (ils) :< seeq) <- viewl $ unMany blks + , Seq.null seeq = + singleton $ Plain ils +singleParaToPlain blks = blks + cellToBlocks :: Cell -> DocxContext Blocks cellToBlocks (Cell bps) = concatReduce <$> mapM bodyPartToBlocks bps rowToBlocksList :: Row -> DocxContext [Blocks] -rowToBlocksList (Row cells) = mapM cellToBlocks cells +rowToBlocksList (Row cells) = do + blksList <- mapM cellToBlocks cells + return $ map singleParaToPlain blksList trimLineBreaks :: [Inline] -> [Inline] trimLineBreaks [] = [] diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 939fcde27..e7a6c3ffb 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -43,13 +43,13 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , Relationship , Media , RunStyle(..) + , VertAlign(..) , ParIndentation(..) , ParagraphStyle(..) , Row(..) , Cell(..) , archiveToDocx ) where - import Codec.Archive.Zip import Text.XML.Light import Data.Maybe @@ -72,6 +72,7 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes , envRelationships :: [Relationship] , envMedia :: Media , envFont :: Maybe Font + , envCharStyles :: CharStyleMap } deriving Show @@ -119,6 +120,10 @@ data Body = Body [BodyPart] type Media = [(FilePath, B.ByteString)] +type CharStyle = (String, RunStyle) + +type CharStyleMap = M.Map String RunStyle + data Numbering = Numbering NameSpaces [Numb] [AbstractNumb] deriving Show @@ -196,14 +201,16 @@ data Run = Run RunStyle [RunElem] data RunElem = TextRun String | LnBrk | Tab deriving Show +data VertAlign = BaseLn | SupScrpt | SubScrpt + deriving Show + data RunStyle = RunStyle { isBold :: Maybe Bool , isItalic :: Maybe Bool , isSmallCaps :: Maybe Bool , isStrike :: Maybe Bool - , isSuperScript :: Bool - , isSubScript :: Bool + , rVertAlign :: Maybe VertAlign , rUnderline :: Maybe String - , rStyle :: Maybe String } + , rStyle :: Maybe CharStyle} deriving Show defaultRunStyle :: RunStyle @@ -211,11 +218,9 @@ defaultRunStyle = RunStyle { isBold = Nothing , isItalic = Nothing , isSmallCaps = Nothing , isStrike = Nothing - , isSuperScript = False - , isSubScript = False + , rVertAlign = Nothing , rUnderline = Nothing - , rStyle = Nothing - } + , rStyle = Nothing} type Target = String @@ -237,7 +242,8 @@ archiveToDocx archive = do numbering = archiveToNumbering archive rels = archiveToRelationships archive media = archiveToMedia archive - rEnv = ReaderEnv notes numbering rels media Nothing + styles = archiveToStyles archive + rEnv = ReaderEnv notes numbering rels media Nothing styles doc <- runD (archiveToDocument archive) rEnv return $ Docx doc @@ -257,6 +263,53 @@ elemToBody ns element | isElem ns "w" "body" element = (\bps -> return $ Body bps) elemToBody _ _ = throwError WrongElem +archiveToStyles :: Archive -> CharStyleMap +archiveToStyles zf = + let stylesElem = findEntryByPath "word/styles.xml" zf >>= + (parseXMLDoc . UTF8.toStringLazy . fromEntry) + in + case stylesElem of + Nothing -> M.empty + Just styElem -> + let namespaces = mapMaybe attrToNSPair (elAttribs styElem) + in + M.fromList $ buildBasedOnList namespaces styElem Nothing + +isBasedOnStyle :: NameSpaces -> Element -> Maybe CharStyle -> Bool +isBasedOnStyle ns element parentStyle + | isElem ns "w" "style" element + , Just "character" <- findAttr (elemName ns "w" "type") element + , Just basedOnVal <- findChild (elemName ns "w" "basedOn") element >>= + findAttr (elemName ns "w" "val") + , Just (parentId, _) <- parentStyle = (basedOnVal == parentId) + | isElem ns "w" "style" element + , Just "character" <- findAttr (elemName ns "w" "type") element + , Nothing <- findChild (elemName ns "w" "basedOn") element + , Nothing <- parentStyle = True + | otherwise = False + +elemToCharStyle :: NameSpaces -> Element -> Maybe CharStyle -> Maybe CharStyle +elemToCharStyle ns element parentStyle + | isElem ns "w" "style" element + , Just "character" <- findAttr (elemName ns "w" "type") element + , Just styleId <- findAttr (elemName ns "w" "styleId") element = + Just (styleId, elemToRunStyle ns element parentStyle) + | otherwise = Nothing + +getStyleChildren :: NameSpaces -> Element -> Maybe CharStyle -> [CharStyle] +getStyleChildren ns element parentStyle + | isElem ns "w" "styles" element = + mapMaybe (\e -> elemToCharStyle ns e parentStyle) $ + filterChildren (\e' -> isBasedOnStyle ns e' parentStyle) element + | otherwise = [] + +buildBasedOnList :: NameSpaces -> Element -> Maybe CharStyle -> [CharStyle] +buildBasedOnList ns element rootStyle = + case (getStyleChildren ns element rootStyle) of + [] -> [] + stys -> stys ++ + (concatMap (\s -> buildBasedOnList ns element (Just s)) stys) + archiveToNotes :: Archive -> Notes archiveToNotes zf = let fnElem = findEntryByPath "word/footnotes.xml" zf @@ -627,7 +680,8 @@ elemToRun ns element elemToRun ns element | isElem ns "w" "r" element = do runElems <- elemToRunElems ns element - return $ Run (elemToRunStyle ns element) runElems + runStyle <- elemToRunStyleD ns element + return $ Run runStyle runElems elemToRun _ _ = throwError WrongElem elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle @@ -667,9 +721,22 @@ checkOnOff ns rPr tag | Just _ <- findChild tag rPr = Just True checkOnOff _ _ _ = Nothing - -elemToRunStyle :: NameSpaces -> Element -> RunStyle -elemToRunStyle ns element +elemToRunStyleD :: NameSpaces -> Element -> D RunStyle +elemToRunStyleD ns element + | Just rPr <- findChild (elemName ns "w" "rPr") element = do + charStyles <- asks envCharStyles + let parentSty = case + findChild (elemName ns "w" "rStyle") rPr >>= + findAttr (elemName ns "w" "val") + of + Just styName | Just style <- M.lookup styName charStyles -> + Just (styName, style) + _ -> Nothing + return $ elemToRunStyle ns element parentSty +elemToRunStyleD _ _ = return defaultRunStyle + +elemToRunStyle :: NameSpaces -> Element -> Maybe CharStyle -> RunStyle +elemToRunStyle ns element parentStyle | Just rPr <- findChild (elemName ns "w" "rPr") element = RunStyle { @@ -677,22 +744,19 @@ elemToRunStyle ns element , isItalic = checkOnOff ns rPr (elemName ns "w" "i") , isSmallCaps = checkOnOff ns rPr (elemName ns "w" "smallCaps") , isStrike = checkOnOff ns rPr (elemName ns "w" "strike") - , isSuperScript = - (Just "superscript" == - (findChild (elemName ns "w" "vertAlign") rPr >>= - findAttr (elemName ns "w" "val"))) - , isSubScript = - (Just "subscript" == - (findChild (elemName ns "w" "vertAlign") rPr >>= - findAttr (elemName ns "w" "val"))) + , rVertAlign = + findChild (elemName ns "w" "vertAlign") rPr >>= + findAttr (elemName ns "w" "val") >>= + \v -> Just $ case v of + "superscript" -> SupScrpt + "subscript" -> SubScrpt + _ -> BaseLn , rUnderline = findChild (elemName ns "w" "u") rPr >>= findAttr (elemName ns "w" "val") - , rStyle = - findChild (elemName ns "w" "rStyle") rPr >>= - findAttr (elemName ns "w" "val") + , rStyle = parentStyle } -elemToRunStyle _ _ = defaultRunStyle +elemToRunStyle _ _ _ = defaultRunStyle elemToRunElem :: NameSpaces -> Element -> D RunElem elemToRunElem ns element diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index bd60a74fa..4ea5f41d5 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -91,16 +91,20 @@ replaceNotes' x = return x data HTMLState = HTMLState { parserState :: ParserState, - noteTable :: [(String, Blocks)] + noteTable :: [(String, Blocks)] } data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext , inChapter :: Bool -- ^ Set if in chapter section + , inPlain :: Bool -- ^ Set if in pPlain } setInChapter :: HTMLParser s a -> HTMLParser s a setInChapter = local (\s -> s {inChapter = True}) +setInPlain :: HTMLParser s a -> HTMLParser s a +setInPlain = local (\s -> s {inPlain = True}) + type HTMLParser s = ParserT s HTMLState (Reader HTMLLocal) type TagParser = HTMLParser [Tag String] @@ -141,8 +145,8 @@ block = do , pTable , pHead , pBody - , pPlain , pDiv + , pPlain , pRawHtmlBlock ] when tr $ trace (printf "line %d: %s" (sourceLine pos) @@ -422,7 +426,7 @@ pBlockQuote = do pPlain :: TagParser Blocks pPlain = do - contents <- trimInlines . mconcat <$> many1 inline + contents <- setInPlain $ trimInlines . mconcat <$> many1 inline if B.isNull contents then return mempty else return $ B.plain contents @@ -579,7 +583,11 @@ pSpan = try $ do pRawHtmlInline :: TagParser Inlines pRawHtmlInline = do - result <- pSatisfy (tagComment (const True)) <|> pSatisfy isInlineTag + inplain <- asks inPlain + result <- pSatisfy (tagComment (const True)) + <|> if inplain + then pSatisfy (not . isBlockTag) + else pSatisfy isInlineTag parseRaw <- getOption readerParseRaw if parseRaw then return $ B.rawInline "html" $ renderTags' [result] @@ -919,7 +927,7 @@ instance HasMeta HTMLState where deleteMeta s st = st {parserState = deleteMeta s $ parserState st} instance Default HTMLLocal where - def = HTMLLocal NoQuote False + def = HTMLLocal NoQuote False False instance HasLastStrPosition HTMLState where setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index e1c29d1e8..62421d2fb 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -483,7 +483,7 @@ exampleCode :: String -> Blocks exampleCode = B.codeBlockWith ("", ["example"], []) exampleLine :: OrgParser String -exampleLine = try $ string ": " *> anyLine +exampleLine = try $ skipSpaces *> string ": " *> anyLine -- Drawers for properties or a logbook drawer :: OrgParser (F Blocks) diff --git a/src/Text/Pandoc/Readers/Txt2Tags.hs b/src/Text/Pandoc/Readers/Txt2Tags.hs index 3a51b9d84..6f8c19ac7 100644 --- a/src/Text/Pandoc/Readers/Txt2Tags.hs +++ b/src/Text/Pandoc/Readers/Txt2Tags.hs @@ -73,11 +73,13 @@ instance Default T2TMeta where getT2TMeta :: [FilePath] -> FilePath -> IO T2TMeta getT2TMeta inps out = do curDate <- formatTime defaultTimeLocale "%F" <$> getZonedTime - let getModTime = fmap (formatTime defaultTimeLocale "%F") . + let getModTime = fmap (formatTime defaultTimeLocale "%T") . getModificationTime - curMtime <- catchIOError - (maximum <$> mapM getModTime inps) - (const (return "")) + curMtime <- case inps of + [] -> formatTime defaultTimeLocale "%T" <$> getZonedTime + _ -> catchIOError + (maximum <$> mapM getModTime inps) + (const (return "")) return $ T2TMeta curDate curMtime (intercalate ", " inps) out -- | Read Txt2Tags from an input string returning a Pandoc document @@ -91,13 +93,42 @@ readTxt2TagsNoMacros = readTxt2Tags def parseT2T :: T2T Pandoc parseT2T = do - _ <- (Nothing <$ try blankline) <|> (Just <$> (count 3 anyLine)) + -- Parse header if standalone flag is set + standalone <- getOption readerStandalone + when standalone parseHeader + body <- mconcat <$> manyTill block eof + meta' <- stateMeta <$> getState + return $ Pandoc meta' (B.toList body) + +parseHeader :: T2T () +parseHeader = do + () <$ try blankline <|> header + meta <- stateMeta <$> getState + optional blanklines config <- manyTill setting (notFollowedBy setting) -- TODO: Handle settings better - let settings = foldr (\(k,v) -> B.setMeta k (MetaString v)) nullMeta config - updateState (\s -> s {stateMeta = settings}) - body <- mconcat <$> manyTill block eof - return $ Pandoc mempty (B.toList body) + let settings = foldr (\(k,v) -> B.setMeta k (MetaString v)) meta config + updateState (\s -> s {stateMeta = settings}) <* optional blanklines + +header :: T2T () +header = titleline >> authorline >> dateline + +headerline :: B.ToMetaValue a => String -> T2T a -> T2T () +headerline field p = (() <$ try blankline) + <|> (p >>= updateState . B.setMeta field) + +titleline :: T2T () +titleline = + headerline "title" (trimInlines . mconcat <$> manyTill inline newline) + +authorline :: T2T () +authorline = + headerline "author" (sepBy author (char ';') <* newline) + where + author = trimInlines . mconcat <$> many (notFollowedBy (char ';' <|> newline) >> inline) + +dateline :: T2T () +dateline = headerline "date" (trimInlines . mconcat <$> manyTill inline newline) type Keyword = String type Value = String @@ -242,7 +273,7 @@ indentWith n = count n space table :: T2T Blocks table = try $ do - header <- fmap snd <$> option mempty (try headerRow) + tableHeader <- fmap snd <$> option mempty (try headerRow) rows <- many1 (many commentLine *> tableRow) let columns = transpose rows let ncolumns = length columns @@ -250,7 +281,7 @@ table = try $ do let rows' = map (map snd) rows let size = maximum (map length rows') let rowsPadded = map (pad size) rows' - let headerPadded = if (not (null header)) then pad size header else mempty + let headerPadded = if (not (null tableHeader)) then pad size tableHeader else mempty return $ B.table mempty (zip aligns (replicate ncolumns 0.0)) headerPadded rowsPadded @@ -497,7 +528,7 @@ image = try $ do -- Characters used in markup specialChars :: String -specialChars = "%*-_/|:+" +specialChars = "%*-_/|:+;" tab :: T2T Char tab = char '\t' |