aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/Docx.hs78
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse.hs114
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs18
-rw-r--r--src/Text/Pandoc/Readers/Org.hs2
-rw-r--r--src/Text/Pandoc/Readers/Txt2Tags.hs55
5 files changed, 196 insertions, 71 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index a1c16a03a..8ebe59569 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -95,6 +95,7 @@ import Control.Monad.Reader
import Control.Monad.State
import Control.Applicative ((<$>))
import Data.Sequence (ViewL(..), viewl)
+import qualified Data.Sequence as Seq (null)
readDocx :: ReaderOptions
-> B.ByteString
@@ -196,12 +197,6 @@ fixAuthors mv = mv
codeStyles :: [String]
codeStyles = ["VerbatimChar"]
-strongStyles :: [String]
-strongStyles = ["Strong", "Bold"]
-
-emphStyles :: [String]
-emphStyles = ["Emphasis", "Italic"]
-
blockQuoteDivs :: [String]
blockQuoteDivs = ["Quote", "BlockQuote", "BlockQuotation"]
@@ -228,27 +223,44 @@ parPartToString (InternalHyperLink _ runs) = concatMap runToString runs
parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs
parPartToString _ = ""
+blacklistedCharStyles :: [String]
+blacklistedCharStyles = ["Hyperlink"]
+
+resolveDependentRunStyle :: RunStyle -> RunStyle
+resolveDependentRunStyle rPr
+ | Just (s, _) <- rStyle rPr, s `elem` blacklistedCharStyles =
+ rPr
+ | Just (_, cs) <- rStyle rPr =
+ let rPr' = resolveDependentRunStyle cs
+ in
+ RunStyle { isBold = case isBold rPr of
+ Just bool -> Just bool
+ Nothing -> isBold rPr'
+ , isItalic = case isItalic rPr of
+ Just bool -> Just bool
+ Nothing -> isItalic rPr'
+ , isSmallCaps = case isSmallCaps rPr of
+ Just bool -> Just bool
+ Nothing -> isSmallCaps rPr'
+ , isStrike = case isStrike rPr of
+ Just bool -> Just bool
+ Nothing -> isStrike rPr'
+ , rVertAlign = case rVertAlign rPr of
+ Just valign -> Just valign
+ Nothing -> rVertAlign rPr'
+ , rUnderline = case rUnderline rPr of
+ Just ulstyle -> Just ulstyle
+ Nothing -> rUnderline rPr'
+ , rStyle = rStyle rPr }
+ | otherwise = rPr
+
runStyleToTransform :: RunStyle -> (Inlines -> Inlines)
runStyleToTransform rPr
- | Just s <- rStyle rPr
+ | Just (s, _) <- rStyle rPr
, s `elem` spansToKeep =
let rPr' = rPr{rStyle = Nothing}
in
(spanWith ("", [s], [])) . (runStyleToTransform rPr')
- | Just s <- rStyle rPr
- , s `elem` emphStyles =
- let rPr' = rPr{rStyle = Nothing, isItalic = Nothing}
- in
- case isItalic rPr of
- Just False -> runStyleToTransform rPr'
- _ -> emph . (runStyleToTransform rPr')
- | Just s <- rStyle rPr
- , s `elem` strongStyles =
- let rPr' = rPr{rStyle = Nothing, isBold = Nothing}
- in
- case isBold rPr of
- Just False -> runStyleToTransform rPr'
- _ -> strong . (runStyleToTransform rPr')
| Just True <- isItalic rPr =
emph . (runStyleToTransform rPr {isItalic = Nothing})
| Just True <- isBold rPr =
@@ -257,22 +269,22 @@ runStyleToTransform rPr
smallcaps . (runStyleToTransform rPr {isSmallCaps = Nothing})
| Just True <- isStrike rPr =
strikeout . (runStyleToTransform rPr {isStrike = Nothing})
- | isSuperScript rPr =
- superscript . (runStyleToTransform rPr {isSuperScript = False})
- | isSubScript rPr =
- subscript . (runStyleToTransform rPr {isSubScript = False})
+ | Just SupScrpt <- rVertAlign rPr =
+ superscript . (runStyleToTransform rPr {rVertAlign = Nothing})
+ | Just SubScrpt <- rVertAlign rPr =
+ subscript . (runStyleToTransform rPr {rVertAlign = Nothing})
| Just "single" <- rUnderline rPr =
emph . (runStyleToTransform rPr {rUnderline = Nothing})
| otherwise = id
runToInlines :: Run -> DocxContext Inlines
runToInlines (Run rs runElems)
- | Just s <- rStyle rs
+ | Just (s, _) <- rStyle rs
, s `elem` codeStyles =
return $ code $ concatMap runElemToString runElems
| otherwise = do
let ils = concatReduce (map runElemToInlines runElems)
- return $ (runStyleToTransform rs) ils
+ return $ (runStyleToTransform $ resolveDependentRunStyle rs) ils
runToInlines (Footnote bps) = do
blksList <- concatReduce <$> (mapM bodyPartToBlocks bps)
return $ note blksList
@@ -380,11 +392,21 @@ makeHeaderAnchor' (Header n (_, classes, kvs) ils) =
return $ Header n (newIdent, classes, kvs) ils
makeHeaderAnchor' blk = return blk
+-- Rewrite a standalone paragraph block as a plain
+singleParaToPlain :: Blocks -> Blocks
+singleParaToPlain blks
+ | (Para (ils) :< seeq) <- viewl $ unMany blks
+ , Seq.null seeq =
+ singleton $ Plain ils
+singleParaToPlain blks = blks
+
cellToBlocks :: Cell -> DocxContext Blocks
cellToBlocks (Cell bps) = concatReduce <$> mapM bodyPartToBlocks bps
rowToBlocksList :: Row -> DocxContext [Blocks]
-rowToBlocksList (Row cells) = mapM cellToBlocks cells
+rowToBlocksList (Row cells) = do
+ blksList <- mapM cellToBlocks cells
+ return $ map singleParaToPlain blksList
trimLineBreaks :: [Inline] -> [Inline]
trimLineBreaks [] = []
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index 939fcde27..e7a6c3ffb 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -43,13 +43,13 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
, Relationship
, Media
, RunStyle(..)
+ , VertAlign(..)
, ParIndentation(..)
, ParagraphStyle(..)
, Row(..)
, Cell(..)
, archiveToDocx
) where
-
import Codec.Archive.Zip
import Text.XML.Light
import Data.Maybe
@@ -72,6 +72,7 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes
, envRelationships :: [Relationship]
, envMedia :: Media
, envFont :: Maybe Font
+ , envCharStyles :: CharStyleMap
}
deriving Show
@@ -119,6 +120,10 @@ data Body = Body [BodyPart]
type Media = [(FilePath, B.ByteString)]
+type CharStyle = (String, RunStyle)
+
+type CharStyleMap = M.Map String RunStyle
+
data Numbering = Numbering NameSpaces [Numb] [AbstractNumb]
deriving Show
@@ -196,14 +201,16 @@ data Run = Run RunStyle [RunElem]
data RunElem = TextRun String | LnBrk | Tab
deriving Show
+data VertAlign = BaseLn | SupScrpt | SubScrpt
+ deriving Show
+
data RunStyle = RunStyle { isBold :: Maybe Bool
, isItalic :: Maybe Bool
, isSmallCaps :: Maybe Bool
, isStrike :: Maybe Bool
- , isSuperScript :: Bool
- , isSubScript :: Bool
+ , rVertAlign :: Maybe VertAlign
, rUnderline :: Maybe String
- , rStyle :: Maybe String }
+ , rStyle :: Maybe CharStyle}
deriving Show
defaultRunStyle :: RunStyle
@@ -211,11 +218,9 @@ defaultRunStyle = RunStyle { isBold = Nothing
, isItalic = Nothing
, isSmallCaps = Nothing
, isStrike = Nothing
- , isSuperScript = False
- , isSubScript = False
+ , rVertAlign = Nothing
, rUnderline = Nothing
- , rStyle = Nothing
- }
+ , rStyle = Nothing}
type Target = String
@@ -237,7 +242,8 @@ archiveToDocx archive = do
numbering = archiveToNumbering archive
rels = archiveToRelationships archive
media = archiveToMedia archive
- rEnv = ReaderEnv notes numbering rels media Nothing
+ styles = archiveToStyles archive
+ rEnv = ReaderEnv notes numbering rels media Nothing styles
doc <- runD (archiveToDocument archive) rEnv
return $ Docx doc
@@ -257,6 +263,53 @@ elemToBody ns element | isElem ns "w" "body" element =
(\bps -> return $ Body bps)
elemToBody _ _ = throwError WrongElem
+archiveToStyles :: Archive -> CharStyleMap
+archiveToStyles zf =
+ let stylesElem = findEntryByPath "word/styles.xml" zf >>=
+ (parseXMLDoc . UTF8.toStringLazy . fromEntry)
+ in
+ case stylesElem of
+ Nothing -> M.empty
+ Just styElem ->
+ let namespaces = mapMaybe attrToNSPair (elAttribs styElem)
+ in
+ M.fromList $ buildBasedOnList namespaces styElem Nothing
+
+isBasedOnStyle :: NameSpaces -> Element -> Maybe CharStyle -> Bool
+isBasedOnStyle ns element parentStyle
+ | isElem ns "w" "style" element
+ , Just "character" <- findAttr (elemName ns "w" "type") element
+ , Just basedOnVal <- findChild (elemName ns "w" "basedOn") element >>=
+ findAttr (elemName ns "w" "val")
+ , Just (parentId, _) <- parentStyle = (basedOnVal == parentId)
+ | isElem ns "w" "style" element
+ , Just "character" <- findAttr (elemName ns "w" "type") element
+ , Nothing <- findChild (elemName ns "w" "basedOn") element
+ , Nothing <- parentStyle = True
+ | otherwise = False
+
+elemToCharStyle :: NameSpaces -> Element -> Maybe CharStyle -> Maybe CharStyle
+elemToCharStyle ns element parentStyle
+ | isElem ns "w" "style" element
+ , Just "character" <- findAttr (elemName ns "w" "type") element
+ , Just styleId <- findAttr (elemName ns "w" "styleId") element =
+ Just (styleId, elemToRunStyle ns element parentStyle)
+ | otherwise = Nothing
+
+getStyleChildren :: NameSpaces -> Element -> Maybe CharStyle -> [CharStyle]
+getStyleChildren ns element parentStyle
+ | isElem ns "w" "styles" element =
+ mapMaybe (\e -> elemToCharStyle ns e parentStyle) $
+ filterChildren (\e' -> isBasedOnStyle ns e' parentStyle) element
+ | otherwise = []
+
+buildBasedOnList :: NameSpaces -> Element -> Maybe CharStyle -> [CharStyle]
+buildBasedOnList ns element rootStyle =
+ case (getStyleChildren ns element rootStyle) of
+ [] -> []
+ stys -> stys ++
+ (concatMap (\s -> buildBasedOnList ns element (Just s)) stys)
+
archiveToNotes :: Archive -> Notes
archiveToNotes zf =
let fnElem = findEntryByPath "word/footnotes.xml" zf
@@ -627,7 +680,8 @@ elemToRun ns element
elemToRun ns element
| isElem ns "w" "r" element = do
runElems <- elemToRunElems ns element
- return $ Run (elemToRunStyle ns element) runElems
+ runStyle <- elemToRunStyleD ns element
+ return $ Run runStyle runElems
elemToRun _ _ = throwError WrongElem
elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle
@@ -667,9 +721,22 @@ checkOnOff ns rPr tag
| Just _ <- findChild tag rPr = Just True
checkOnOff _ _ _ = Nothing
-
-elemToRunStyle :: NameSpaces -> Element -> RunStyle
-elemToRunStyle ns element
+elemToRunStyleD :: NameSpaces -> Element -> D RunStyle
+elemToRunStyleD ns element
+ | Just rPr <- findChild (elemName ns "w" "rPr") element = do
+ charStyles <- asks envCharStyles
+ let parentSty = case
+ findChild (elemName ns "w" "rStyle") rPr >>=
+ findAttr (elemName ns "w" "val")
+ of
+ Just styName | Just style <- M.lookup styName charStyles ->
+ Just (styName, style)
+ _ -> Nothing
+ return $ elemToRunStyle ns element parentSty
+elemToRunStyleD _ _ = return defaultRunStyle
+
+elemToRunStyle :: NameSpaces -> Element -> Maybe CharStyle -> RunStyle
+elemToRunStyle ns element parentStyle
| Just rPr <- findChild (elemName ns "w" "rPr") element =
RunStyle
{
@@ -677,22 +744,19 @@ elemToRunStyle ns element
, isItalic = checkOnOff ns rPr (elemName ns "w" "i")
, isSmallCaps = checkOnOff ns rPr (elemName ns "w" "smallCaps")
, isStrike = checkOnOff ns rPr (elemName ns "w" "strike")
- , isSuperScript =
- (Just "superscript" ==
- (findChild (elemName ns "w" "vertAlign") rPr >>=
- findAttr (elemName ns "w" "val")))
- , isSubScript =
- (Just "subscript" ==
- (findChild (elemName ns "w" "vertAlign") rPr >>=
- findAttr (elemName ns "w" "val")))
+ , rVertAlign =
+ findChild (elemName ns "w" "vertAlign") rPr >>=
+ findAttr (elemName ns "w" "val") >>=
+ \v -> Just $ case v of
+ "superscript" -> SupScrpt
+ "subscript" -> SubScrpt
+ _ -> BaseLn
, rUnderline =
findChild (elemName ns "w" "u") rPr >>=
findAttr (elemName ns "w" "val")
- , rStyle =
- findChild (elemName ns "w" "rStyle") rPr >>=
- findAttr (elemName ns "w" "val")
+ , rStyle = parentStyle
}
-elemToRunStyle _ _ = defaultRunStyle
+elemToRunStyle _ _ _ = defaultRunStyle
elemToRunElem :: NameSpaces -> Element -> D RunElem
elemToRunElem ns element
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index bd60a74fa..4ea5f41d5 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -91,16 +91,20 @@ replaceNotes' x = return x
data HTMLState =
HTMLState
{ parserState :: ParserState,
- noteTable :: [(String, Blocks)]
+ noteTable :: [(String, Blocks)]
}
data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
, inChapter :: Bool -- ^ Set if in chapter section
+ , inPlain :: Bool -- ^ Set if in pPlain
}
setInChapter :: HTMLParser s a -> HTMLParser s a
setInChapter = local (\s -> s {inChapter = True})
+setInPlain :: HTMLParser s a -> HTMLParser s a
+setInPlain = local (\s -> s {inPlain = True})
+
type HTMLParser s = ParserT s HTMLState (Reader HTMLLocal)
type TagParser = HTMLParser [Tag String]
@@ -141,8 +145,8 @@ block = do
, pTable
, pHead
, pBody
- , pPlain
, pDiv
+ , pPlain
, pRawHtmlBlock
]
when tr $ trace (printf "line %d: %s" (sourceLine pos)
@@ -422,7 +426,7 @@ pBlockQuote = do
pPlain :: TagParser Blocks
pPlain = do
- contents <- trimInlines . mconcat <$> many1 inline
+ contents <- setInPlain $ trimInlines . mconcat <$> many1 inline
if B.isNull contents
then return mempty
else return $ B.plain contents
@@ -579,7 +583,11 @@ pSpan = try $ do
pRawHtmlInline :: TagParser Inlines
pRawHtmlInline = do
- result <- pSatisfy (tagComment (const True)) <|> pSatisfy isInlineTag
+ inplain <- asks inPlain
+ result <- pSatisfy (tagComment (const True))
+ <|> if inplain
+ then pSatisfy (not . isBlockTag)
+ else pSatisfy isInlineTag
parseRaw <- getOption readerParseRaw
if parseRaw
then return $ B.rawInline "html" $ renderTags' [result]
@@ -919,7 +927,7 @@ instance HasMeta HTMLState where
deleteMeta s st = st {parserState = deleteMeta s $ parserState st}
instance Default HTMLLocal where
- def = HTMLLocal NoQuote False
+ def = HTMLLocal NoQuote False False
instance HasLastStrPosition HTMLState where
setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs
index e1c29d1e8..62421d2fb 100644
--- a/src/Text/Pandoc/Readers/Org.hs
+++ b/src/Text/Pandoc/Readers/Org.hs
@@ -483,7 +483,7 @@ exampleCode :: String -> Blocks
exampleCode = B.codeBlockWith ("", ["example"], [])
exampleLine :: OrgParser String
-exampleLine = try $ string ": " *> anyLine
+exampleLine = try $ skipSpaces *> string ": " *> anyLine
-- Drawers for properties or a logbook
drawer :: OrgParser (F Blocks)
diff --git a/src/Text/Pandoc/Readers/Txt2Tags.hs b/src/Text/Pandoc/Readers/Txt2Tags.hs
index 3a51b9d84..6f8c19ac7 100644
--- a/src/Text/Pandoc/Readers/Txt2Tags.hs
+++ b/src/Text/Pandoc/Readers/Txt2Tags.hs
@@ -73,11 +73,13 @@ instance Default T2TMeta where
getT2TMeta :: [FilePath] -> FilePath -> IO T2TMeta
getT2TMeta inps out = do
curDate <- formatTime defaultTimeLocale "%F" <$> getZonedTime
- let getModTime = fmap (formatTime defaultTimeLocale "%F") .
+ let getModTime = fmap (formatTime defaultTimeLocale "%T") .
getModificationTime
- curMtime <- catchIOError
- (maximum <$> mapM getModTime inps)
- (const (return ""))
+ curMtime <- case inps of
+ [] -> formatTime defaultTimeLocale "%T" <$> getZonedTime
+ _ -> catchIOError
+ (maximum <$> mapM getModTime inps)
+ (const (return ""))
return $ T2TMeta curDate curMtime (intercalate ", " inps) out
-- | Read Txt2Tags from an input string returning a Pandoc document
@@ -91,13 +93,42 @@ readTxt2TagsNoMacros = readTxt2Tags def
parseT2T :: T2T Pandoc
parseT2T = do
- _ <- (Nothing <$ try blankline) <|> (Just <$> (count 3 anyLine))
+ -- Parse header if standalone flag is set
+ standalone <- getOption readerStandalone
+ when standalone parseHeader
+ body <- mconcat <$> manyTill block eof
+ meta' <- stateMeta <$> getState
+ return $ Pandoc meta' (B.toList body)
+
+parseHeader :: T2T ()
+parseHeader = do
+ () <$ try blankline <|> header
+ meta <- stateMeta <$> getState
+ optional blanklines
config <- manyTill setting (notFollowedBy setting)
-- TODO: Handle settings better
- let settings = foldr (\(k,v) -> B.setMeta k (MetaString v)) nullMeta config
- updateState (\s -> s {stateMeta = settings})
- body <- mconcat <$> manyTill block eof
- return $ Pandoc mempty (B.toList body)
+ let settings = foldr (\(k,v) -> B.setMeta k (MetaString v)) meta config
+ updateState (\s -> s {stateMeta = settings}) <* optional blanklines
+
+header :: T2T ()
+header = titleline >> authorline >> dateline
+
+headerline :: B.ToMetaValue a => String -> T2T a -> T2T ()
+headerline field p = (() <$ try blankline)
+ <|> (p >>= updateState . B.setMeta field)
+
+titleline :: T2T ()
+titleline =
+ headerline "title" (trimInlines . mconcat <$> manyTill inline newline)
+
+authorline :: T2T ()
+authorline =
+ headerline "author" (sepBy author (char ';') <* newline)
+ where
+ author = trimInlines . mconcat <$> many (notFollowedBy (char ';' <|> newline) >> inline)
+
+dateline :: T2T ()
+dateline = headerline "date" (trimInlines . mconcat <$> manyTill inline newline)
type Keyword = String
type Value = String
@@ -242,7 +273,7 @@ indentWith n = count n space
table :: T2T Blocks
table = try $ do
- header <- fmap snd <$> option mempty (try headerRow)
+ tableHeader <- fmap snd <$> option mempty (try headerRow)
rows <- many1 (many commentLine *> tableRow)
let columns = transpose rows
let ncolumns = length columns
@@ -250,7 +281,7 @@ table = try $ do
let rows' = map (map snd) rows
let size = maximum (map length rows')
let rowsPadded = map (pad size) rows'
- let headerPadded = if (not (null header)) then pad size header else mempty
+ let headerPadded = if (not (null tableHeader)) then pad size tableHeader else mempty
return $ B.table mempty
(zip aligns (replicate ncolumns 0.0))
headerPadded rowsPadded
@@ -497,7 +528,7 @@ image = try $ do
-- Characters used in markup
specialChars :: String
-specialChars = "%*-_/|:+"
+specialChars = "%*-_/|:+;"
tab :: T2T Char
tab = char '\t'