aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/Docx.hs30
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse.hs94
-rw-r--r--src/Text/Pandoc/Readers/EPUB.hs6
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs55
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs80
-rw-r--r--src/Text/Pandoc/Readers/MediaWiki.hs2
-rw-r--r--src/Text/Pandoc/Readers/Odt.hs4
-rw-r--r--src/Text/Pandoc/Readers/Org.hs161
8 files changed, 256 insertions, 176 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index eb71d8dd8..9c7c3b264 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -50,8 +50,7 @@ implemented, [-] means partially implemented):
* Inlines
- [X] Str
- - [X] Emph (From italics. `underline` currently read as span. In
- future, it might optionally be emph as well)
+ - [X] Emph (italics and underline both read as Emph)
- [X] Strong
- [X] Strikeout
- [X] Superscript
@@ -62,16 +61,16 @@ implemented, [-] means partially implemented):
- [X] Code (styled with `VerbatimChar`)
- [X] Space
- [X] LineBreak (these are invisible in Word: entered with Shift-Return)
- - [ ] Math
+ - [X] Math
- [X] Link (links to an arbitrary bookmark create a span with the target as
id and "anchor" class)
- - [-] Image (Links to path in archive. Future option for
- data-encoded URI likely.)
+ - [X] Image
- [X] Note (Footnotes and Endnotes are silently combined.)
-}
module Text.Pandoc.Readers.Docx
- ( readDocx
+ ( readDocxWithWarnings
+ , readDocx
) where
import Codec.Archive.Zip
@@ -98,14 +97,23 @@ import qualified Data.Sequence as Seq (null)
import Text.Pandoc.Error
import Text.Pandoc.Compat.Except
+readDocxWithWarnings :: ReaderOptions
+ -> B.ByteString
+ -> Either PandocError (Pandoc, MediaBag, [String])
+readDocxWithWarnings opts bytes
+ | Right archive <- toArchiveOrFail bytes
+ , Right (docx, warnings) <- archiveToDocxWithWarnings archive = do
+ (meta, blks, mediaBag) <- docxToOutput opts docx
+ return (Pandoc meta blks, mediaBag, warnings)
+readDocxWithWarnings _ _ =
+ Left (ParseFailure "couldn't parse docx file")
+
readDocx :: ReaderOptions
-> B.ByteString
-> Either PandocError (Pandoc, MediaBag)
-readDocx opts bytes =
- case archiveToDocx (toArchive bytes) of
- Right docx -> (\(meta, blks, mediaBag) -> (Pandoc meta blks, mediaBag))
- <$> (docxToOutput opts docx)
- Left _ -> Left (ParseFailure "couldn't parse docx file")
+readDocx opts bytes = do
+ (pandoc, mediaBag, _) <- readDocxWithWarnings opts bytes
+ return (pandoc, mediaBag)
data DState = DState { docxAnchorMap :: M.Map String String
, docxMediaBag :: MediaBag
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index eec8b12c9..7265ef8dd 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -50,6 +50,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
, Row(..)
, Cell(..)
, archiveToDocx
+ , archiveToDocxWithWarnings
) where
import Codec.Archive.Zip
import Text.XML.Light
@@ -60,6 +61,7 @@ import Data.Bits ((.|.))
import qualified Data.ByteString.Lazy as B
import qualified Text.Pandoc.UTF8 as UTF8
import Control.Monad.Reader
+import Control.Monad.State
import Control.Applicative ((<|>))
import qualified Data.Map as M
import Text.Pandoc.Compat.Except
@@ -81,16 +83,20 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes
}
deriving Show
+data ReaderState = ReaderState { stateWarnings :: [String] }
+ deriving Show
+
+
data DocxError = DocxError | WrongElem
deriving Show
instance Error DocxError where
noMsg = WrongElem
-type D = ExceptT DocxError (Reader ReaderEnv)
+type D = ExceptT DocxError (ReaderT ReaderEnv (State ReaderState))
-runD :: D a -> ReaderEnv -> Either DocxError a
-runD dx re = runReader (runExceptT dx) re
+runD :: D a -> ReaderEnv -> ReaderState -> (Either DocxError a, ReaderState)
+runD dx re rs = runState (runReaderT (runExceptT dx) re) rs
maybeToD :: Maybe a -> D a
maybeToD (Just a) = return a
@@ -257,7 +263,10 @@ type Author = String
type ChangeDate = String
archiveToDocx :: Archive -> Either DocxError Docx
-archiveToDocx archive = do
+archiveToDocx archive = fst <$> archiveToDocxWithWarnings archive
+
+archiveToDocxWithWarnings :: Archive -> Either DocxError (Docx, [String])
+archiveToDocxWithWarnings archive = do
let notes = archiveToNotes archive
numbering = archiveToNumbering archive
rels = archiveToRelationships archive
@@ -265,8 +274,12 @@ archiveToDocx archive = do
(styles, parstyles) = archiveToStyles archive
rEnv =
ReaderEnv notes numbering rels media Nothing styles parstyles InDocument
- doc <- runD (archiveToDocument archive) rEnv
- return $ Docx doc
+ rState = ReaderState { stateWarnings = [] }
+ (eitherDoc, st) = runD (archiveToDocument archive) rEnv rState
+ case eitherDoc of
+ Right doc -> Right (Docx doc, stateWarnings st)
+ Left e -> Left e
+
archiveToDocument :: Archive -> D Document
@@ -576,12 +589,14 @@ elemToBodyPart ns element
sty <- asks envParStyles
let parstyle = elemToParagraphStyle ns element sty
parparts <- mapD (elemToParPart ns) (elChildren element)
- case pNumInfo parstyle of
- Just (numId, lvl) -> do
- num <- asks envNumbering
- let levelInfo = lookupLevel numId lvl num
- return $ ListItem parstyle numId lvl levelInfo parparts
- Nothing -> return $ Paragraph parstyle parparts
+ -- Word uses list enumeration for numbered headings, so we only
+ -- want to infer a list from the styles if it is NOT a heading.
+ case pHeading parstyle of
+ Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do
+ num <- asks envNumbering
+ let levelInfo = lookupLevel numId lvl num
+ return $ ListItem parstyle numId lvl levelInfo parparts
+ _ -> return $ Paragraph parstyle parparts
elemToBodyPart ns element
| isElem ns "w" "tbl" element = do
let caption' = findChild (elemName ns "w" "tblPr") element
@@ -646,14 +661,14 @@ elemToParPart ns element
| isElem ns "w" "r" element =
elemToRun ns element >>= (\r -> return $ PlainRun r)
elemToParPart ns element
- | isElem ns "w" "ins" element
+ | isElem ns "w" "ins" element || isElem ns "w" "moveTo" element
, Just cId <- findAttr (elemName ns "w" "id") element
, Just cAuthor <- findAttr (elemName ns "w" "author") element
, Just cDate <- findAttr (elemName ns "w" "date") element = do
runs <- mapD (elemToRun ns) (elChildren element)
return $ Insertion cId cAuthor cDate runs
elemToParPart ns element
- | isElem ns "w" "del" element
+ | isElem ns "w" "del" element || isElem ns "w" "moveFrom" element
, Just cId <- findAttr (elemName ns "w" "id") element
, Just cAuthor <- findAttr (elemName ns "w" "author") element
, Just cDate <- findAttr (elemName ns "w" "date") element = do
@@ -702,36 +717,58 @@ elemToExtent drawingElem =
getDim at = findElement (QName "extent" (Just wp_ns) (Just "wp")) drawingElem
>>= findAttr (QName at Nothing Nothing) >>= safeRead
-elemToRun :: NameSpaces -> Element -> D Run
-elemToRun ns element
- | isElem ns "w" "r" element
- , Just drawingElem <- findChild (elemName ns "w" "drawing") element =
+
+childElemToRun :: NameSpaces -> Element -> D Run
+childElemToRun ns element
+ | isElem ns "w" "drawing" element =
let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
- drawing = findElement (QName "blip" (Just a_ns) (Just "a")) drawingElem
+ drawing = findElement (QName "blip" (Just a_ns) (Just "a")) element
>>= findAttr (QName "embed" (lookup "r" ns) (Just "r"))
in
case drawing of
Just s -> expandDrawingId s >>=
- (\(fp, bs) -> return $ InlineDrawing fp bs $ elemToExtent drawingElem)
+ (\(fp, bs) -> return $ InlineDrawing fp bs $ elemToExtent element)
Nothing -> throwError WrongElem
-elemToRun ns element
- | isElem ns "w" "r" element
- , Just ref <- findChild (elemName ns "w" "footnoteReference") element
- , Just fnId <- findAttr (elemName ns "w" "id") ref = do
+childElemToRun ns element
+ | isElem ns "w" "footnoteReference" element
+ , Just fnId <- findAttr (elemName ns "w" "id") element = do
notes <- asks envNotes
case lookupFootnote fnId notes of
Just e -> do bps <- local (\r -> r {envLocation=InFootnote}) $ mapD (elemToBodyPart ns) (elChildren e)
return $ Footnote bps
Nothing -> return $ Footnote []
-elemToRun ns element
- | isElem ns "w" "r" element
- , Just ref <- findChild (elemName ns "w" "endnoteReference") element
- , Just enId <- findAttr (elemName ns "w" "id") ref = do
+childElemToRun ns element
+ | isElem ns "w" "endnoteReference" element
+ , Just enId <- findAttr (elemName ns "w" "id") element = do
notes <- asks envNotes
case lookupEndnote enId notes of
Just e -> do bps <- local (\r -> r {envLocation=InEndnote}) $ mapD (elemToBodyPart ns) (elChildren e)
return $ Endnote bps
Nothing -> return $ Endnote []
+childElemToRun _ _ = throwError WrongElem
+
+elemToRun :: NameSpaces -> Element -> D Run
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just altCont <- findChild (elemName ns "mc" "AlternateContent") element =
+ do let choices = findChildren (elemName ns "mc" "Choice") altCont
+ choiceChildren = map head $ filter (not . null) $ map elChildren choices
+ outputs <- mapD (childElemToRun ns) choiceChildren
+ case outputs of
+ r : _ -> return r
+ [] -> throwError WrongElem
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just drawingElem <- findChild (elemName ns "w" "drawing") element =
+ childElemToRun ns drawingElem
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just ref <- findChild (elemName ns "w" "footnoteReference") element =
+ childElemToRun ns ref
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just ref <- findChild (elemName ns "w" "endnoteReference") element =
+ childElemToRun ns ref
elemToRun ns element
| isElem ns "w" "r" element = do
runElems <- elemToRunElems ns element
@@ -940,3 +977,4 @@ elemToRunElems _ _ = throwError WrongElem
setFont :: Maybe Font -> ReaderEnv -> ReaderEnv
setFont f s = s{envFont = f}
+
diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs
index 07d282708..144ba9ca2 100644
--- a/src/Text/Pandoc/Readers/EPUB.hs
+++ b/src/Text/Pandoc/Readers/EPUB.hs
@@ -19,7 +19,7 @@ import Text.Pandoc.Compat.Except (MonadError, throwError, runExcept, Except)
import Text.Pandoc.Compat.Monoid ((<>))
import Text.Pandoc.MIME (MimeType)
import qualified Text.Pandoc.Builder as B
-import Codec.Archive.Zip ( Archive (..), toArchive, fromEntry
+import Codec.Archive.Zip ( Archive (..), toArchiveOrFail, fromEntry
, findEntryByPath, Entry)
import qualified Data.ByteString.Lazy as BL (ByteString)
import System.FilePath ( takeFileName, (</>), dropFileName, normalise
@@ -39,7 +39,9 @@ import Text.Pandoc.Error
type Items = M.Map String (FilePath, MimeType)
readEPUB :: ReaderOptions -> BL.ByteString -> Either PandocError (Pandoc, MediaBag)
-readEPUB opts bytes = runEPUB (archiveToEPUB opts $ toArchive bytes)
+readEPUB opts bytes = case toArchiveOrFail bytes of
+ Right archive -> runEPUB $ archiveToEPUB opts $ archive
+ Left _ -> Left $ ParseFailure "Couldn't extract ePub file"
runEPUB :: Except PandocError a -> Either PandocError a
runEPUB = runExcept
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 69df13aac..8ee5da543 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -63,7 +63,7 @@ import Debug.Trace (trace)
import Text.TeXMath (readMathML, writeTeX)
import Data.Default (Default (..), def)
import Control.Monad.Reader (Reader,ask, asks, local, runReader)
-import Network.URI (isURI)
+import Network.URI (URI, parseURIReference, nonStrictRelativeTo)
import Text.Pandoc.Error
import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
import Text.Pandoc.Compat.Monoid ((<>))
@@ -103,7 +103,7 @@ data HTMLState =
HTMLState
{ parserState :: ParserState,
noteTable :: [(String, Blocks)],
- baseHref :: Maybe String,
+ baseHref :: Maybe URI,
identifiers :: Set.Set String,
headerMap :: M.Map Inlines String
}
@@ -145,15 +145,9 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag
return mempty
pBaseTag = do
bt <- pSatisfy (~== TagOpen "base" [])
- let baseH = fromAttrib "href" bt
- if null baseH
- then return mempty
- else do
- let baseH' = case reverse baseH of
- '/':_ -> baseH
- _ -> baseH ++ "/"
- updateState $ \st -> st{ baseHref = Just baseH' }
- return mempty
+ updateState $ \st -> st{ baseHref =
+ parseURIReference $ fromAttrib "href" bt }
+ return mempty
block :: TagParser Blocks
block = do
@@ -610,9 +604,9 @@ pLink = try $ do
tag <- pSatisfy $ tagOpenLit "a" (const True)
mbBaseHref <- baseHref <$> getState
let url' = fromAttrib "href" tag
- let url = case (isURI url', mbBaseHref) of
- (False, Just h) -> h ++ url'
- _ -> url'
+ let url = case (parseURIReference url', mbBaseHref) of
+ (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs)
+ _ -> url'
let title = fromAttrib "title" tag
let uid = fromAttrib "id" tag
let cls = words $ fromAttrib "class" tag
@@ -624,9 +618,9 @@ pImage = do
tag <- pSelfClosing (=="img") (isJust . lookup "src")
mbBaseHref <- baseHref <$> getState
let url' = fromAttrib "src" tag
- let url = case (isURI url', mbBaseHref) of
- (False, Just h) -> h ++ url'
- _ -> url'
+ let url = case (parseURIReference url', mbBaseHref) of
+ (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs)
+ _ -> url'
let title = fromAttrib "title" tag
let alt = fromAttrib "alt" tag
let uid = fromAttrib "id" tag
@@ -945,7 +939,7 @@ htmlInBalanced f = try $ do
(TagClose _ : TagPosition er ec : _) -> do
let ls = er - sr
let cs = ec - sc
- lscontents <- concat <$> count ls anyLine
+ lscontents <- unlines <$> count ls anyLine
cscontents <- count cs anyChar
(_,closetag) <- htmlTag (~== TagClose tn)
return (lscontents ++ cscontents ++ closetag)
@@ -977,11 +971,20 @@ htmlTag :: Monad m
htmlTag f = try $ do
lookAhead (char '<')
inp <- getInput
- let (next : rest) = canonicalizeTags $ parseTagsOptions
- parseOptions{ optTagWarning = True } inp
+ let (next : _) = canonicalizeTags $ parseTagsOptions
+ parseOptions{ optTagWarning = False } inp
guard $ f next
+ let handleTag tagname = do
+ -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66>
+ -- should NOT be parsed as an HTML tag, see #2277
+ guard $ not ('.' `elem` tagname)
+ -- <https://example.org> should NOT be a tag either.
+ -- tagsoup will parse it as TagOpen "https:" [("example.org","")]
+ guard $ not (null tagname)
+ guard $ last tagname /= ':'
+ rendered <- manyTill anyChar (char '>')
+ return (next, rendered ++ ">")
case next of
- TagWarning _ -> fail "encountered TagWarning"
TagComment s
| "<!--" `isPrefixOf` inp -> do
count (length s + 4) anyChar
@@ -989,13 +992,9 @@ htmlTag f = try $ do
char '>'
return (next, "<!--" ++ s ++ "-->")
| otherwise -> fail "bogus comment mode, HTML5 parse error"
- _ -> do
- -- we get a TagWarning on things like
- -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66>
- -- which should NOT be parsed as an HTML tag, see #2277
- guard $ not $ hasTagWarning rest
- rendered <- manyTill anyChar (char '>')
- return (next, rendered ++ ">")
+ TagOpen tagname _attr -> handleTag tagname
+ TagClose tagname -> handleTag tagname
+ _ -> mzero
mkAttr :: [(String, String)] -> Attr
mkAttr attr = (attribsId, attribsClasses, attribsKV)
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 587726084..e43714526 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -122,9 +122,6 @@ inList = do
ctx <- stateParserContext <$> getState
guard (ctx == ListItemState)
-isNull :: F Inlines -> Bool
-isNull ils = B.isNull $ runF ils def
-
spnl :: Parser [Char] st ()
spnl = try $ do
skipSpaces
@@ -188,31 +185,38 @@ charsInBalancedBrackets openBrackets =
-- document structure
--
-titleLine :: MarkdownParser (F Inlines)
-titleLine = try $ do
+rawTitleBlockLine :: MarkdownParser String
+rawTitleBlockLine = do
char '%'
skipSpaces
- res <- many $ (notFollowedBy newline >> inline)
- <|> try (endline >> whitespace)
- newline
+ first <- anyLine
+ rest <- many $ try $ do spaceChar
+ notFollowedBy blankline
+ skipSpaces
+ anyLine
+ return $ trim $ unlines (first:rest)
+
+titleLine :: MarkdownParser (F Inlines)
+titleLine = try $ do
+ raw <- rawTitleBlockLine
+ res <- parseFromString (many inline) raw
return $ trimInlinesF $ mconcat res
authorsLine :: MarkdownParser (F [Inlines])
authorsLine = try $ do
- char '%'
- skipSpaces
- authors <- sepEndBy (many (notFollowedBy (satisfy $ \c ->
- c == ';' || c == '\n') >> inline))
- (char ';' <|>
- try (newline >> notFollowedBy blankline >> spaceChar))
- newline
- return $ sequence $ filter (not . isNull) $ map (trimInlinesF . mconcat) authors
+ raw <- rawTitleBlockLine
+ let sep = (char ';' <* spaces) <|> newline
+ let pAuthors = sepEndBy
+ (trimInlinesF . mconcat <$> many
+ (try $ notFollowedBy sep >> inline))
+ sep
+ sequence <$> parseFromString pAuthors raw
dateLine :: MarkdownParser (F Inlines)
dateLine = try $ do
- char '%'
- skipSpaces
- trimInlinesF . mconcat <$> manyTill inline newline
+ raw <- rawTitleBlockLine
+ res <- parseFromString (many inline) raw
+ return $ trimInlinesF $ mconcat res
titleBlock :: MarkdownParser ()
titleBlock = pandocTitleBlock <|> mmdTitleBlock
@@ -1354,16 +1358,18 @@ pipeTable = try $ do
nonindentSpaces
lookAhead nonspaceChar
(heads,(aligns, seplengths)) <- (,) <$> pipeTableRow <*> pipeBreak
+ let heads' = take (length aligns) <$> heads
lines' <- many pipeTableRow
+ let lines'' = map (take (length aligns) <$>) lines'
let maxlength = maximum $
- map (\x -> length . stringify $ runF x def) (heads : lines')
+ map (\x -> length . stringify $ runF x def) (heads' : lines'')
numColumns <- getOption readerColumns
let widths = if maxlength > numColumns
then map (\len ->
fromIntegral (len + 1) / fromIntegral numColumns)
seplengths
else replicate (length aligns) 0.0
- return $ (aligns, widths, heads, sequence lines')
+ return $ (aligns, widths, heads', sequence lines'')
sepPipe :: MarkdownParser ()
sepPipe = try $ do
@@ -1372,25 +1378,27 @@ sepPipe = try $ do
-- parse a row, also returning probable alignments for org-table cells
pipeTableRow :: MarkdownParser (F [Blocks])
-pipeTableRow = do
+pipeTableRow = try $ do
+ scanForPipe
skipMany spaceChar
openPipe <- (True <$ char '|') <|> return False
- let cell = mconcat <$>
- many (notFollowedBy (blankline <|> char '|') >> inline)
- first <- cell
- rest <- many $ sepPipe *> cell
+ -- split into cells
+ let chunk = void (code <|> rawHtmlInline <|> escapedChar <|> rawLaTeXInline')
+ <|> void (noneOf "|\n\r")
+ let cellContents = ((trim . snd) <$> withRaw (many chunk)) >>=
+ parseFromString pipeTableCell
+ cells <- cellContents `sepEndBy1` (char '|')
-- surrounding pipes needed for a one-column table:
- guard $ not (null rest && not openPipe)
- optional (char '|')
+ guard $ not (length cells == 1 && not openPipe)
blankline
- let cells = sequence (first:rest)
- return $ do
- cells' <- cells
- return $ map
- (\ils ->
- case trimInlines ils of
- ils' | B.isNull ils' -> mempty
- | otherwise -> B.plain $ ils') cells'
+ return $ sequence cells
+
+pipeTableCell :: MarkdownParser (F Blocks)
+pipeTableCell = do
+ result <- many inline
+ if null result
+ then return mempty
+ else return $ B.plain . mconcat <$> sequence result
pipeTableHeaderPart :: Parser [Char] st (Alignment, Int)
pipeTableHeaderPart = try $ do
diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs
index 950497992..d3cee08e2 100644
--- a/src/Text/Pandoc/Readers/MediaWiki.hs
+++ b/src/Text/Pandoc/Readers/MediaWiki.hs
@@ -225,7 +225,7 @@ table = do
Nothing -> 1.0
caption <- option mempty tableCaption
optional rowsep
- hasheader <- option False $ True <$ (lookAhead (char '!'))
+ hasheader <- option False $ True <$ (lookAhead (skipSpaces *> char '!'))
(cellspecs',hdr) <- unzip <$> tableRow
let widths = map ((tableWidth *) . snd) cellspecs'
let restwidth = tableWidth - sum widths
diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs
index a925c1d84..68e89263c 100644
--- a/src/Text/Pandoc/Readers/Odt.hs
+++ b/src/Text/Pandoc/Readers/Odt.hs
@@ -59,7 +59,9 @@ readOdt _ bytes = case bytesToOdt bytes of
--
bytesToOdt :: B.ByteString -> Either PandocError Pandoc
-bytesToOdt bytes = archiveToOdt $ toArchive bytes
+bytesToOdt bytes = case toArchiveOrFail bytes of
+ Right archive -> archiveToOdt archive
+ Left _ -> Left $ ParseFailure "Couldn't parse odt file."
--
archiveToOdt :: Archive -> Either PandocError Pandoc
diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs
index 7dd611be3..5a50a8f34 100644
--- a/src/Text/Pandoc/Readers/Org.hs
+++ b/src/Text/Pandoc/Readers/Org.hs
@@ -35,6 +35,7 @@ import Text.Pandoc.Builder ( Inlines, Blocks, HasMeta(..),
trimInlines )
import Text.Pandoc.Definition
import Text.Pandoc.Compat.Monoid ((<>))
+import Text.Pandoc.Error
import Text.Pandoc.Options
import qualified Text.Pandoc.Parsing as P
import Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF
@@ -49,7 +50,7 @@ import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap
import Control.Arrow (first)
import Control.Monad (foldM, guard, liftM, liftM2, mplus, mzero, when)
import Control.Monad.Reader (Reader, runReader, ask, asks, local)
-import Data.Char (isAlphaNum, toLower)
+import Data.Char (isAlphaNum, isSpace, toLower)
import Data.Default
import Data.List (intersperse, isPrefixOf, isSuffixOf)
import qualified Data.Map as M
@@ -57,8 +58,6 @@ import qualified Data.Set as Set
import Data.Maybe (fromMaybe, isJust)
import Network.HTTP (urlEncode)
-import Text.Pandoc.Error
-
-- | Parse org-mode string and return a Pandoc document.
readOrg :: ReaderOptions -- ^ Reader options
-> String -- ^ String to parse (assuming @'\n'@ line endings)
@@ -391,6 +390,9 @@ lookupBlockAttribute key =
type BlockProperties = (Int, String) -- (Indentation, Block-Type)
+updateIndent :: BlockProperties -> Int -> BlockProperties
+updateIndent (_, blkType) indent = (indent, blkType)
+
orgBlock :: OrgParser (F Blocks)
orgBlock = try $ do
blockProp@(_, blkType) <- blockHeaderStart
@@ -407,11 +409,23 @@ orgBlock = try $ do
_ -> withParsed (fmap $ divWithClass blkType)
blockHeaderStart :: OrgParser (Int, String)
-blockHeaderStart = try $ (,) <$> indent <*> blockType
+blockHeaderStart = try $ (,) <$> indentation <*> blockType
where
- indent = length <$> many spaceChar
blockType = map toLower <$> (stringAnyCase "#+begin_" *> orgArgWord)
+indentation :: OrgParser Int
+indentation = try $ do
+ tabStop <- getOption readerTabStop
+ s <- many spaceChar
+ return $ spaceLength tabStop s
+
+spaceLength :: Int -> String -> Int
+spaceLength tabStop s = (sum . map charLen) s
+ where
+ charLen ' ' = 1
+ charLen '\t' = tabStop
+ charLen _ = 0
+
withRaw' :: (String -> F Blocks) -> BlockProperties -> OrgParser (F Blocks)
withRaw' f blockProp = (ignHeaders *> (f <$> rawBlockContent blockProp))
@@ -450,7 +464,8 @@ codeBlock blkProp = do
skipSpaces
(classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders)
id' <- fromMaybe "" <$> lookupBlockAttribute "name"
- content <- rawBlockContent blkProp
+ leadingIndent <- lookAhead indentation
+ content <- rawBlockContent (updateIndent blkProp leadingIndent)
resultsContent <- followingResultsBlock
let includeCode = exportsCode kv
let includeResults = exportsResults kv
@@ -472,7 +487,7 @@ rawBlockContent (indent, blockType) = try $
unlines . map commaEscaped <$> manyTill indentedLine blockEnder
where
indentedLine = try $ ("" <$ blankline) <|> (indentWith indent *> anyLine)
- blockEnder = try $ indentWith indent *> stringAnyCase ("#+end_" <> blockType)
+ blockEnder = try $ skipSpaces *> stringAnyCase ("#+end_" <> blockType)
parsedBlockContent :: BlockProperties -> OrgParser (F Blocks)
parsedBlockContent blkProps = try $ do
@@ -758,9 +773,13 @@ data OrgTableRow = OrgContentRow (F [Blocks])
| OrgAlignRow [Alignment]
| OrgHlineRow
+-- OrgTable is strongly related to the pandoc table ADT. Using the same
+-- (i.e. pandoc-global) ADT would mean that the reader would break if the
+-- global structure was to be changed, which would be bad. The final table
+-- should be generated using a builder function. Column widths aren't
+-- implemented yet, so they are not tracked here.
data OrgTable = OrgTable
- { orgTableColumns :: Int
- , orgTableAlignments :: [Alignment]
+ { orgTableAlignments :: [Alignment]
, orgTableHeader :: [Blocks]
, orgTableRows :: [[Blocks]]
}
@@ -776,7 +795,7 @@ table = try $ do
orgToPandocTable :: OrgTable
-> Inlines
-> Blocks
-orgToPandocTable (OrgTable _ aligns heads lns) caption =
+orgToPandocTable (OrgTable aligns heads lns) caption =
B.table caption (zip aligns $ repeat 0) heads lns
tableStart :: OrgParser Char
@@ -787,18 +806,19 @@ tableRows = try $ many (tableAlignRow <|> tableHline <|> tableContentRow)
tableContentRow :: OrgParser OrgTableRow
tableContentRow = try $
- OrgContentRow . sequence <$> (tableStart *> manyTill tableContentCell newline)
+ OrgContentRow . sequence <$> (tableStart *> many1Till tableContentCell newline)
tableContentCell :: OrgParser (F Blocks)
tableContentCell = try $
- fmap B.plain . trimInlinesF . mconcat <$> many1Till inline endOfCell
-
-endOfCell :: OrgParser Char
-endOfCell = try $ char '|' <|> lookAhead newline
+ fmap B.plain . trimInlinesF . mconcat <$> manyTill inline endOfCell
tableAlignRow :: OrgParser OrgTableRow
-tableAlignRow = try $
- OrgAlignRow <$> (tableStart *> manyTill tableAlignCell newline)
+tableAlignRow = try $ do
+ tableStart
+ cells <- many1Till tableAlignCell newline
+ -- Empty rows are regular (i.e. content) rows, not alignment rows.
+ guard $ any (/= AlignDefault) cells
+ return $ OrgAlignRow cells
tableAlignCell :: OrgParser Alignment
tableAlignCell =
@@ -813,65 +833,61 @@ tableAlignCell =
where emptyCell = try $ skipSpaces *> endOfCell
tableAlignFromChar :: OrgParser Alignment
-tableAlignFromChar = try $ choice [ char 'l' *> return AlignLeft
- , char 'c' *> return AlignCenter
- , char 'r' *> return AlignRight
- ]
+tableAlignFromChar = try $
+ choice [ char 'l' *> return AlignLeft
+ , char 'c' *> return AlignCenter
+ , char 'r' *> return AlignRight
+ ]
tableHline :: OrgParser OrgTableRow
tableHline = try $
OrgHlineRow <$ (tableStart *> char '-' *> anyLine)
+endOfCell :: OrgParser Char
+endOfCell = try $ char '|' <|> lookAhead newline
+
rowsToTable :: [OrgTableRow]
-> F OrgTable
-rowsToTable = foldM (flip rowToContent) zeroTable
- where zeroTable = OrgTable 0 mempty mempty mempty
-
-normalizeTable :: OrgTable
- -> OrgTable
-normalizeTable (OrgTable cols aligns heads lns) =
- let aligns' = fillColumns aligns AlignDefault
- heads' = if heads == mempty
- then mempty
- else fillColumns heads (B.plain mempty)
- lns' = map (`fillColumns` B.plain mempty) lns
- fillColumns base padding = take cols $ base ++ repeat padding
- in OrgTable cols aligns' heads' lns'
+rowsToTable = foldM rowToContent emptyTable
+ where emptyTable = OrgTable mempty mempty mempty
+normalizeTable :: OrgTable -> OrgTable
+normalizeTable (OrgTable aligns heads rows) = OrgTable aligns' heads rows
+ where
+ refRow = if heads /= mempty
+ then heads
+ else if rows == mempty then mempty else head rows
+ cols = length refRow
+ fillColumns base padding = take cols $ base ++ repeat padding
+ aligns' = fillColumns aligns AlignDefault
-- One or more horizontal rules after the first content line mark the previous
-- line as a header. All other horizontal lines are discarded.
-rowToContent :: OrgTableRow
- -> OrgTable
- -> F OrgTable
-rowToContent OrgHlineRow t = maybeBodyToHeader t
-rowToContent (OrgAlignRow as) t = setLongestRow as =<< setAligns as t
-rowToContent (OrgContentRow rf) t = do
- rs <- rf
- setLongestRow rs =<< appendToBody rs t
-
-setLongestRow :: [a]
- -> OrgTable
- -> F OrgTable
-setLongestRow rs t =
- return t{ orgTableColumns = max (length rs) (orgTableColumns t) }
-
-maybeBodyToHeader :: OrgTable
- -> F OrgTable
-maybeBodyToHeader t = case t of
- OrgTable{ orgTableHeader = [], orgTableRows = b:[] } ->
- return t{ orgTableHeader = b , orgTableRows = [] }
- _ -> return t
-
-appendToBody :: [Blocks]
- -> OrgTable
+rowToContent :: OrgTable
+ -> OrgTableRow
-> F OrgTable
-appendToBody r t = return t{ orgTableRows = orgTableRows t ++ [r] }
+rowToContent orgTable row =
+ case row of
+ OrgHlineRow -> return singleRowPromotedToHeader
+ OrgAlignRow as -> return . setAligns $ as
+ OrgContentRow cs -> appendToBody cs
+ where
+ singleRowPromotedToHeader :: OrgTable
+ singleRowPromotedToHeader = case orgTable of
+ OrgTable{ orgTableHeader = [], orgTableRows = b:[] } ->
+ orgTable{ orgTableHeader = b , orgTableRows = [] }
+ _ -> orgTable
-setAligns :: [Alignment]
- -> OrgTable
- -> F OrgTable
-setAligns aligns t = return $ t{ orgTableAlignments = aligns }
+ setAligns :: [Alignment] -> OrgTable
+ setAligns aligns = orgTable{ orgTableAlignments = aligns }
+
+ appendToBody :: F [Blocks] -> F OrgTable
+ appendToBody frow = do
+ newRow <- frow
+ let oldRows = orgTableRows orgTable
+ -- NOTE: This is an inefficient O(n) operation. This should be changed
+ -- if performance ever becomes a problem.
+ return orgTable{ orgTableRows = oldRows ++ [newRow] }
--
@@ -1565,14 +1581,14 @@ inlineLaTeX = try $ do
parseAsMathMLSym :: String -> Maybe Inlines
parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs)
- -- dropWhileEnd would be nice here, but it's not available before base 4.5
- where clean = reverse . dropWhile (`elem` ("{}" :: String)) . reverse . drop 1
+ -- drop initial backslash and any trailing "{}"
+ where clean = dropWhileEnd (`elem` ("{}" :: String)) . drop 1
state :: ParserState
state = def{ stateOptions = def{ readerParseRaw = True }}
- texMathToPandoc inp = (maybeRight $ readTeX inp) >>=
- writePandoc DisplayInline
+ texMathToPandoc :: String -> Maybe [Inline]
+ texMathToPandoc cs = (maybeRight $ readTeX cs) >>= writePandoc DisplayInline
maybeRight :: Either a b -> Maybe b
maybeRight = either (const Nothing) Just
@@ -1582,11 +1598,18 @@ inlineLaTeXCommand = try $ do
rest <- getInput
case runParser rawLaTeXInline def "source" rest of
Right (RawInline _ cs) -> do
- let len = length cs
+ -- drop any trailing whitespace, those are not be part of the command as
+ -- far as org mode is concerned.
+ let cmdNoSpc = dropWhileEnd isSpace cs
+ let len = length cmdNoSpc
count len anyChar
- return cs
+ return cmdNoSpc
_ -> mzero
+-- Taken from Data.OldList.
+dropWhileEnd :: (a -> Bool) -> [a] -> [a]
+dropWhileEnd p = foldr (\x xs -> if p x && null xs then [] else x : xs) []
+
smart :: OrgParser (F Inlines)
smart = do
getOption readerSmart >>= guard