diff options
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 30 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 94 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/EPUB.hs | 6 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 55 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 80 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/MediaWiki.hs | 2 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Odt.hs | 4 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Org.hs | 161 |
8 files changed, 256 insertions, 176 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index eb71d8dd8..9c7c3b264 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -50,8 +50,7 @@ implemented, [-] means partially implemented): * Inlines - [X] Str - - [X] Emph (From italics. `underline` currently read as span. In - future, it might optionally be emph as well) + - [X] Emph (italics and underline both read as Emph) - [X] Strong - [X] Strikeout - [X] Superscript @@ -62,16 +61,16 @@ implemented, [-] means partially implemented): - [X] Code (styled with `VerbatimChar`) - [X] Space - [X] LineBreak (these are invisible in Word: entered with Shift-Return) - - [ ] Math + - [X] Math - [X] Link (links to an arbitrary bookmark create a span with the target as id and "anchor" class) - - [-] Image (Links to path in archive. Future option for - data-encoded URI likely.) + - [X] Image - [X] Note (Footnotes and Endnotes are silently combined.) -} module Text.Pandoc.Readers.Docx - ( readDocx + ( readDocxWithWarnings + , readDocx ) where import Codec.Archive.Zip @@ -98,14 +97,23 @@ import qualified Data.Sequence as Seq (null) import Text.Pandoc.Error import Text.Pandoc.Compat.Except +readDocxWithWarnings :: ReaderOptions + -> B.ByteString + -> Either PandocError (Pandoc, MediaBag, [String]) +readDocxWithWarnings opts bytes + | Right archive <- toArchiveOrFail bytes + , Right (docx, warnings) <- archiveToDocxWithWarnings archive = do + (meta, blks, mediaBag) <- docxToOutput opts docx + return (Pandoc meta blks, mediaBag, warnings) +readDocxWithWarnings _ _ = + Left (ParseFailure "couldn't parse docx file") + readDocx :: ReaderOptions -> B.ByteString -> Either PandocError (Pandoc, MediaBag) -readDocx opts bytes = - case archiveToDocx (toArchive bytes) of - Right docx -> (\(meta, blks, mediaBag) -> (Pandoc meta blks, mediaBag)) - <$> (docxToOutput opts docx) - Left _ -> Left (ParseFailure "couldn't parse docx file") +readDocx opts bytes = do + (pandoc, mediaBag, _) <- readDocxWithWarnings opts bytes + return (pandoc, mediaBag) data DState = DState { docxAnchorMap :: M.Map String String , docxMediaBag :: MediaBag diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index eec8b12c9..7265ef8dd 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -50,6 +50,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , Row(..) , Cell(..) , archiveToDocx + , archiveToDocxWithWarnings ) where import Codec.Archive.Zip import Text.XML.Light @@ -60,6 +61,7 @@ import Data.Bits ((.|.)) import qualified Data.ByteString.Lazy as B import qualified Text.Pandoc.UTF8 as UTF8 import Control.Monad.Reader +import Control.Monad.State import Control.Applicative ((<|>)) import qualified Data.Map as M import Text.Pandoc.Compat.Except @@ -81,16 +83,20 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes } deriving Show +data ReaderState = ReaderState { stateWarnings :: [String] } + deriving Show + + data DocxError = DocxError | WrongElem deriving Show instance Error DocxError where noMsg = WrongElem -type D = ExceptT DocxError (Reader ReaderEnv) +type D = ExceptT DocxError (ReaderT ReaderEnv (State ReaderState)) -runD :: D a -> ReaderEnv -> Either DocxError a -runD dx re = runReader (runExceptT dx) re +runD :: D a -> ReaderEnv -> ReaderState -> (Either DocxError a, ReaderState) +runD dx re rs = runState (runReaderT (runExceptT dx) re) rs maybeToD :: Maybe a -> D a maybeToD (Just a) = return a @@ -257,7 +263,10 @@ type Author = String type ChangeDate = String archiveToDocx :: Archive -> Either DocxError Docx -archiveToDocx archive = do +archiveToDocx archive = fst <$> archiveToDocxWithWarnings archive + +archiveToDocxWithWarnings :: Archive -> Either DocxError (Docx, [String]) +archiveToDocxWithWarnings archive = do let notes = archiveToNotes archive numbering = archiveToNumbering archive rels = archiveToRelationships archive @@ -265,8 +274,12 @@ archiveToDocx archive = do (styles, parstyles) = archiveToStyles archive rEnv = ReaderEnv notes numbering rels media Nothing styles parstyles InDocument - doc <- runD (archiveToDocument archive) rEnv - return $ Docx doc + rState = ReaderState { stateWarnings = [] } + (eitherDoc, st) = runD (archiveToDocument archive) rEnv rState + case eitherDoc of + Right doc -> Right (Docx doc, stateWarnings st) + Left e -> Left e + archiveToDocument :: Archive -> D Document @@ -576,12 +589,14 @@ elemToBodyPart ns element sty <- asks envParStyles let parstyle = elemToParagraphStyle ns element sty parparts <- mapD (elemToParPart ns) (elChildren element) - case pNumInfo parstyle of - Just (numId, lvl) -> do - num <- asks envNumbering - let levelInfo = lookupLevel numId lvl num - return $ ListItem parstyle numId lvl levelInfo parparts - Nothing -> return $ Paragraph parstyle parparts + -- Word uses list enumeration for numbered headings, so we only + -- want to infer a list from the styles if it is NOT a heading. + case pHeading parstyle of + Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do + num <- asks envNumbering + let levelInfo = lookupLevel numId lvl num + return $ ListItem parstyle numId lvl levelInfo parparts + _ -> return $ Paragraph parstyle parparts elemToBodyPart ns element | isElem ns "w" "tbl" element = do let caption' = findChild (elemName ns "w" "tblPr") element @@ -646,14 +661,14 @@ elemToParPart ns element | isElem ns "w" "r" element = elemToRun ns element >>= (\r -> return $ PlainRun r) elemToParPart ns element - | isElem ns "w" "ins" element + | isElem ns "w" "ins" element || isElem ns "w" "moveTo" element , Just cId <- findAttr (elemName ns "w" "id") element , Just cAuthor <- findAttr (elemName ns "w" "author") element , Just cDate <- findAttr (elemName ns "w" "date") element = do runs <- mapD (elemToRun ns) (elChildren element) return $ Insertion cId cAuthor cDate runs elemToParPart ns element - | isElem ns "w" "del" element + | isElem ns "w" "del" element || isElem ns "w" "moveFrom" element , Just cId <- findAttr (elemName ns "w" "id") element , Just cAuthor <- findAttr (elemName ns "w" "author") element , Just cDate <- findAttr (elemName ns "w" "date") element = do @@ -702,36 +717,58 @@ elemToExtent drawingElem = getDim at = findElement (QName "extent" (Just wp_ns) (Just "wp")) drawingElem >>= findAttr (QName at Nothing Nothing) >>= safeRead -elemToRun :: NameSpaces -> Element -> D Run -elemToRun ns element - | isElem ns "w" "r" element - , Just drawingElem <- findChild (elemName ns "w" "drawing") element = + +childElemToRun :: NameSpaces -> Element -> D Run +childElemToRun ns element + | isElem ns "w" "drawing" element = let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" - drawing = findElement (QName "blip" (Just a_ns) (Just "a")) drawingElem + drawing = findElement (QName "blip" (Just a_ns) (Just "a")) element >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) in case drawing of Just s -> expandDrawingId s >>= - (\(fp, bs) -> return $ InlineDrawing fp bs $ elemToExtent drawingElem) + (\(fp, bs) -> return $ InlineDrawing fp bs $ elemToExtent element) Nothing -> throwError WrongElem -elemToRun ns element - | isElem ns "w" "r" element - , Just ref <- findChild (elemName ns "w" "footnoteReference") element - , Just fnId <- findAttr (elemName ns "w" "id") ref = do +childElemToRun ns element + | isElem ns "w" "footnoteReference" element + , Just fnId <- findAttr (elemName ns "w" "id") element = do notes <- asks envNotes case lookupFootnote fnId notes of Just e -> do bps <- local (\r -> r {envLocation=InFootnote}) $ mapD (elemToBodyPart ns) (elChildren e) return $ Footnote bps Nothing -> return $ Footnote [] -elemToRun ns element - | isElem ns "w" "r" element - , Just ref <- findChild (elemName ns "w" "endnoteReference") element - , Just enId <- findAttr (elemName ns "w" "id") ref = do +childElemToRun ns element + | isElem ns "w" "endnoteReference" element + , Just enId <- findAttr (elemName ns "w" "id") element = do notes <- asks envNotes case lookupEndnote enId notes of Just e -> do bps <- local (\r -> r {envLocation=InEndnote}) $ mapD (elemToBodyPart ns) (elChildren e) return $ Endnote bps Nothing -> return $ Endnote [] +childElemToRun _ _ = throwError WrongElem + +elemToRun :: NameSpaces -> Element -> D Run +elemToRun ns element + | isElem ns "w" "r" element + , Just altCont <- findChild (elemName ns "mc" "AlternateContent") element = + do let choices = findChildren (elemName ns "mc" "Choice") altCont + choiceChildren = map head $ filter (not . null) $ map elChildren choices + outputs <- mapD (childElemToRun ns) choiceChildren + case outputs of + r : _ -> return r + [] -> throwError WrongElem +elemToRun ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChild (elemName ns "w" "drawing") element = + childElemToRun ns drawingElem +elemToRun ns element + | isElem ns "w" "r" element + , Just ref <- findChild (elemName ns "w" "footnoteReference") element = + childElemToRun ns ref +elemToRun ns element + | isElem ns "w" "r" element + , Just ref <- findChild (elemName ns "w" "endnoteReference") element = + childElemToRun ns ref elemToRun ns element | isElem ns "w" "r" element = do runElems <- elemToRunElems ns element @@ -940,3 +977,4 @@ elemToRunElems _ _ = throwError WrongElem setFont :: Maybe Font -> ReaderEnv -> ReaderEnv setFont f s = s{envFont = f} + diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs index 07d282708..144ba9ca2 100644 --- a/src/Text/Pandoc/Readers/EPUB.hs +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -19,7 +19,7 @@ import Text.Pandoc.Compat.Except (MonadError, throwError, runExcept, Except) import Text.Pandoc.Compat.Monoid ((<>)) import Text.Pandoc.MIME (MimeType) import qualified Text.Pandoc.Builder as B -import Codec.Archive.Zip ( Archive (..), toArchive, fromEntry +import Codec.Archive.Zip ( Archive (..), toArchiveOrFail, fromEntry , findEntryByPath, Entry) import qualified Data.ByteString.Lazy as BL (ByteString) import System.FilePath ( takeFileName, (</>), dropFileName, normalise @@ -39,7 +39,9 @@ import Text.Pandoc.Error type Items = M.Map String (FilePath, MimeType) readEPUB :: ReaderOptions -> BL.ByteString -> Either PandocError (Pandoc, MediaBag) -readEPUB opts bytes = runEPUB (archiveToEPUB opts $ toArchive bytes) +readEPUB opts bytes = case toArchiveOrFail bytes of + Right archive -> runEPUB $ archiveToEPUB opts $ archive + Left _ -> Left $ ParseFailure "Couldn't extract ePub file" runEPUB :: Except PandocError a -> Either PandocError a runEPUB = runExcept diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 69df13aac..8ee5da543 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -63,7 +63,7 @@ import Debug.Trace (trace) import Text.TeXMath (readMathML, writeTeX) import Data.Default (Default (..), def) import Control.Monad.Reader (Reader,ask, asks, local, runReader) -import Network.URI (isURI) +import Network.URI (URI, parseURIReference, nonStrictRelativeTo) import Text.Pandoc.Error import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps) import Text.Pandoc.Compat.Monoid ((<>)) @@ -103,7 +103,7 @@ data HTMLState = HTMLState { parserState :: ParserState, noteTable :: [(String, Blocks)], - baseHref :: Maybe String, + baseHref :: Maybe URI, identifiers :: Set.Set String, headerMap :: M.Map Inlines String } @@ -145,15 +145,9 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag return mempty pBaseTag = do bt <- pSatisfy (~== TagOpen "base" []) - let baseH = fromAttrib "href" bt - if null baseH - then return mempty - else do - let baseH' = case reverse baseH of - '/':_ -> baseH - _ -> baseH ++ "/" - updateState $ \st -> st{ baseHref = Just baseH' } - return mempty + updateState $ \st -> st{ baseHref = + parseURIReference $ fromAttrib "href" bt } + return mempty block :: TagParser Blocks block = do @@ -610,9 +604,9 @@ pLink = try $ do tag <- pSatisfy $ tagOpenLit "a" (const True) mbBaseHref <- baseHref <$> getState let url' = fromAttrib "href" tag - let url = case (isURI url', mbBaseHref) of - (False, Just h) -> h ++ url' - _ -> url' + let url = case (parseURIReference url', mbBaseHref) of + (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs) + _ -> url' let title = fromAttrib "title" tag let uid = fromAttrib "id" tag let cls = words $ fromAttrib "class" tag @@ -624,9 +618,9 @@ pImage = do tag <- pSelfClosing (=="img") (isJust . lookup "src") mbBaseHref <- baseHref <$> getState let url' = fromAttrib "src" tag - let url = case (isURI url', mbBaseHref) of - (False, Just h) -> h ++ url' - _ -> url' + let url = case (parseURIReference url', mbBaseHref) of + (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs) + _ -> url' let title = fromAttrib "title" tag let alt = fromAttrib "alt" tag let uid = fromAttrib "id" tag @@ -945,7 +939,7 @@ htmlInBalanced f = try $ do (TagClose _ : TagPosition er ec : _) -> do let ls = er - sr let cs = ec - sc - lscontents <- concat <$> count ls anyLine + lscontents <- unlines <$> count ls anyLine cscontents <- count cs anyChar (_,closetag) <- htmlTag (~== TagClose tn) return (lscontents ++ cscontents ++ closetag) @@ -977,11 +971,20 @@ htmlTag :: Monad m htmlTag f = try $ do lookAhead (char '<') inp <- getInput - let (next : rest) = canonicalizeTags $ parseTagsOptions - parseOptions{ optTagWarning = True } inp + let (next : _) = canonicalizeTags $ parseTagsOptions + parseOptions{ optTagWarning = False } inp guard $ f next + let handleTag tagname = do + -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66> + -- should NOT be parsed as an HTML tag, see #2277 + guard $ not ('.' `elem` tagname) + -- <https://example.org> should NOT be a tag either. + -- tagsoup will parse it as TagOpen "https:" [("example.org","")] + guard $ not (null tagname) + guard $ last tagname /= ':' + rendered <- manyTill anyChar (char '>') + return (next, rendered ++ ">") case next of - TagWarning _ -> fail "encountered TagWarning" TagComment s | "<!--" `isPrefixOf` inp -> do count (length s + 4) anyChar @@ -989,13 +992,9 @@ htmlTag f = try $ do char '>' return (next, "<!--" ++ s ++ "-->") | otherwise -> fail "bogus comment mode, HTML5 parse error" - _ -> do - -- we get a TagWarning on things like - -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66> - -- which should NOT be parsed as an HTML tag, see #2277 - guard $ not $ hasTagWarning rest - rendered <- manyTill anyChar (char '>') - return (next, rendered ++ ">") + TagOpen tagname _attr -> handleTag tagname + TagClose tagname -> handleTag tagname + _ -> mzero mkAttr :: [(String, String)] -> Attr mkAttr attr = (attribsId, attribsClasses, attribsKV) diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 587726084..e43714526 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -122,9 +122,6 @@ inList = do ctx <- stateParserContext <$> getState guard (ctx == ListItemState) -isNull :: F Inlines -> Bool -isNull ils = B.isNull $ runF ils def - spnl :: Parser [Char] st () spnl = try $ do skipSpaces @@ -188,31 +185,38 @@ charsInBalancedBrackets openBrackets = -- document structure -- -titleLine :: MarkdownParser (F Inlines) -titleLine = try $ do +rawTitleBlockLine :: MarkdownParser String +rawTitleBlockLine = do char '%' skipSpaces - res <- many $ (notFollowedBy newline >> inline) - <|> try (endline >> whitespace) - newline + first <- anyLine + rest <- many $ try $ do spaceChar + notFollowedBy blankline + skipSpaces + anyLine + return $ trim $ unlines (first:rest) + +titleLine :: MarkdownParser (F Inlines) +titleLine = try $ do + raw <- rawTitleBlockLine + res <- parseFromString (many inline) raw return $ trimInlinesF $ mconcat res authorsLine :: MarkdownParser (F [Inlines]) authorsLine = try $ do - char '%' - skipSpaces - authors <- sepEndBy (many (notFollowedBy (satisfy $ \c -> - c == ';' || c == '\n') >> inline)) - (char ';' <|> - try (newline >> notFollowedBy blankline >> spaceChar)) - newline - return $ sequence $ filter (not . isNull) $ map (trimInlinesF . mconcat) authors + raw <- rawTitleBlockLine + let sep = (char ';' <* spaces) <|> newline + let pAuthors = sepEndBy + (trimInlinesF . mconcat <$> many + (try $ notFollowedBy sep >> inline)) + sep + sequence <$> parseFromString pAuthors raw dateLine :: MarkdownParser (F Inlines) dateLine = try $ do - char '%' - skipSpaces - trimInlinesF . mconcat <$> manyTill inline newline + raw <- rawTitleBlockLine + res <- parseFromString (many inline) raw + return $ trimInlinesF $ mconcat res titleBlock :: MarkdownParser () titleBlock = pandocTitleBlock <|> mmdTitleBlock @@ -1354,16 +1358,18 @@ pipeTable = try $ do nonindentSpaces lookAhead nonspaceChar (heads,(aligns, seplengths)) <- (,) <$> pipeTableRow <*> pipeBreak + let heads' = take (length aligns) <$> heads lines' <- many pipeTableRow + let lines'' = map (take (length aligns) <$>) lines' let maxlength = maximum $ - map (\x -> length . stringify $ runF x def) (heads : lines') + map (\x -> length . stringify $ runF x def) (heads' : lines'') numColumns <- getOption readerColumns let widths = if maxlength > numColumns then map (\len -> fromIntegral (len + 1) / fromIntegral numColumns) seplengths else replicate (length aligns) 0.0 - return $ (aligns, widths, heads, sequence lines') + return $ (aligns, widths, heads', sequence lines'') sepPipe :: MarkdownParser () sepPipe = try $ do @@ -1372,25 +1378,27 @@ sepPipe = try $ do -- parse a row, also returning probable alignments for org-table cells pipeTableRow :: MarkdownParser (F [Blocks]) -pipeTableRow = do +pipeTableRow = try $ do + scanForPipe skipMany spaceChar openPipe <- (True <$ char '|') <|> return False - let cell = mconcat <$> - many (notFollowedBy (blankline <|> char '|') >> inline) - first <- cell - rest <- many $ sepPipe *> cell + -- split into cells + let chunk = void (code <|> rawHtmlInline <|> escapedChar <|> rawLaTeXInline') + <|> void (noneOf "|\n\r") + let cellContents = ((trim . snd) <$> withRaw (many chunk)) >>= + parseFromString pipeTableCell + cells <- cellContents `sepEndBy1` (char '|') -- surrounding pipes needed for a one-column table: - guard $ not (null rest && not openPipe) - optional (char '|') + guard $ not (length cells == 1 && not openPipe) blankline - let cells = sequence (first:rest) - return $ do - cells' <- cells - return $ map - (\ils -> - case trimInlines ils of - ils' | B.isNull ils' -> mempty - | otherwise -> B.plain $ ils') cells' + return $ sequence cells + +pipeTableCell :: MarkdownParser (F Blocks) +pipeTableCell = do + result <- many inline + if null result + then return mempty + else return $ B.plain . mconcat <$> sequence result pipeTableHeaderPart :: Parser [Char] st (Alignment, Int) pipeTableHeaderPart = try $ do diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs index 950497992..d3cee08e2 100644 --- a/src/Text/Pandoc/Readers/MediaWiki.hs +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -225,7 +225,7 @@ table = do Nothing -> 1.0 caption <- option mempty tableCaption optional rowsep - hasheader <- option False $ True <$ (lookAhead (char '!')) + hasheader <- option False $ True <$ (lookAhead (skipSpaces *> char '!')) (cellspecs',hdr) <- unzip <$> tableRow let widths = map ((tableWidth *) . snd) cellspecs' let restwidth = tableWidth - sum widths diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs index a925c1d84..68e89263c 100644 --- a/src/Text/Pandoc/Readers/Odt.hs +++ b/src/Text/Pandoc/Readers/Odt.hs @@ -59,7 +59,9 @@ readOdt _ bytes = case bytesToOdt bytes of -- bytesToOdt :: B.ByteString -> Either PandocError Pandoc -bytesToOdt bytes = archiveToOdt $ toArchive bytes +bytesToOdt bytes = case toArchiveOrFail bytes of + Right archive -> archiveToOdt archive + Left _ -> Left $ ParseFailure "Couldn't parse odt file." -- archiveToOdt :: Archive -> Either PandocError Pandoc diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index 7dd611be3..5a50a8f34 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -35,6 +35,7 @@ import Text.Pandoc.Builder ( Inlines, Blocks, HasMeta(..), trimInlines ) import Text.Pandoc.Definition import Text.Pandoc.Compat.Monoid ((<>)) +import Text.Pandoc.Error import Text.Pandoc.Options import qualified Text.Pandoc.Parsing as P import Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF @@ -49,7 +50,7 @@ import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap import Control.Arrow (first) import Control.Monad (foldM, guard, liftM, liftM2, mplus, mzero, when) import Control.Monad.Reader (Reader, runReader, ask, asks, local) -import Data.Char (isAlphaNum, toLower) +import Data.Char (isAlphaNum, isSpace, toLower) import Data.Default import Data.List (intersperse, isPrefixOf, isSuffixOf) import qualified Data.Map as M @@ -57,8 +58,6 @@ import qualified Data.Set as Set import Data.Maybe (fromMaybe, isJust) import Network.HTTP (urlEncode) -import Text.Pandoc.Error - -- | Parse org-mode string and return a Pandoc document. readOrg :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse (assuming @'\n'@ line endings) @@ -391,6 +390,9 @@ lookupBlockAttribute key = type BlockProperties = (Int, String) -- (Indentation, Block-Type) +updateIndent :: BlockProperties -> Int -> BlockProperties +updateIndent (_, blkType) indent = (indent, blkType) + orgBlock :: OrgParser (F Blocks) orgBlock = try $ do blockProp@(_, blkType) <- blockHeaderStart @@ -407,11 +409,23 @@ orgBlock = try $ do _ -> withParsed (fmap $ divWithClass blkType) blockHeaderStart :: OrgParser (Int, String) -blockHeaderStart = try $ (,) <$> indent <*> blockType +blockHeaderStart = try $ (,) <$> indentation <*> blockType where - indent = length <$> many spaceChar blockType = map toLower <$> (stringAnyCase "#+begin_" *> orgArgWord) +indentation :: OrgParser Int +indentation = try $ do + tabStop <- getOption readerTabStop + s <- many spaceChar + return $ spaceLength tabStop s + +spaceLength :: Int -> String -> Int +spaceLength tabStop s = (sum . map charLen) s + where + charLen ' ' = 1 + charLen '\t' = tabStop + charLen _ = 0 + withRaw' :: (String -> F Blocks) -> BlockProperties -> OrgParser (F Blocks) withRaw' f blockProp = (ignHeaders *> (f <$> rawBlockContent blockProp)) @@ -450,7 +464,8 @@ codeBlock blkProp = do skipSpaces (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) id' <- fromMaybe "" <$> lookupBlockAttribute "name" - content <- rawBlockContent blkProp + leadingIndent <- lookAhead indentation + content <- rawBlockContent (updateIndent blkProp leadingIndent) resultsContent <- followingResultsBlock let includeCode = exportsCode kv let includeResults = exportsResults kv @@ -472,7 +487,7 @@ rawBlockContent (indent, blockType) = try $ unlines . map commaEscaped <$> manyTill indentedLine blockEnder where indentedLine = try $ ("" <$ blankline) <|> (indentWith indent *> anyLine) - blockEnder = try $ indentWith indent *> stringAnyCase ("#+end_" <> blockType) + blockEnder = try $ skipSpaces *> stringAnyCase ("#+end_" <> blockType) parsedBlockContent :: BlockProperties -> OrgParser (F Blocks) parsedBlockContent blkProps = try $ do @@ -758,9 +773,13 @@ data OrgTableRow = OrgContentRow (F [Blocks]) | OrgAlignRow [Alignment] | OrgHlineRow +-- OrgTable is strongly related to the pandoc table ADT. Using the same +-- (i.e. pandoc-global) ADT would mean that the reader would break if the +-- global structure was to be changed, which would be bad. The final table +-- should be generated using a builder function. Column widths aren't +-- implemented yet, so they are not tracked here. data OrgTable = OrgTable - { orgTableColumns :: Int - , orgTableAlignments :: [Alignment] + { orgTableAlignments :: [Alignment] , orgTableHeader :: [Blocks] , orgTableRows :: [[Blocks]] } @@ -776,7 +795,7 @@ table = try $ do orgToPandocTable :: OrgTable -> Inlines -> Blocks -orgToPandocTable (OrgTable _ aligns heads lns) caption = +orgToPandocTable (OrgTable aligns heads lns) caption = B.table caption (zip aligns $ repeat 0) heads lns tableStart :: OrgParser Char @@ -787,18 +806,19 @@ tableRows = try $ many (tableAlignRow <|> tableHline <|> tableContentRow) tableContentRow :: OrgParser OrgTableRow tableContentRow = try $ - OrgContentRow . sequence <$> (tableStart *> manyTill tableContentCell newline) + OrgContentRow . sequence <$> (tableStart *> many1Till tableContentCell newline) tableContentCell :: OrgParser (F Blocks) tableContentCell = try $ - fmap B.plain . trimInlinesF . mconcat <$> many1Till inline endOfCell - -endOfCell :: OrgParser Char -endOfCell = try $ char '|' <|> lookAhead newline + fmap B.plain . trimInlinesF . mconcat <$> manyTill inline endOfCell tableAlignRow :: OrgParser OrgTableRow -tableAlignRow = try $ - OrgAlignRow <$> (tableStart *> manyTill tableAlignCell newline) +tableAlignRow = try $ do + tableStart + cells <- many1Till tableAlignCell newline + -- Empty rows are regular (i.e. content) rows, not alignment rows. + guard $ any (/= AlignDefault) cells + return $ OrgAlignRow cells tableAlignCell :: OrgParser Alignment tableAlignCell = @@ -813,65 +833,61 @@ tableAlignCell = where emptyCell = try $ skipSpaces *> endOfCell tableAlignFromChar :: OrgParser Alignment -tableAlignFromChar = try $ choice [ char 'l' *> return AlignLeft - , char 'c' *> return AlignCenter - , char 'r' *> return AlignRight - ] +tableAlignFromChar = try $ + choice [ char 'l' *> return AlignLeft + , char 'c' *> return AlignCenter + , char 'r' *> return AlignRight + ] tableHline :: OrgParser OrgTableRow tableHline = try $ OrgHlineRow <$ (tableStart *> char '-' *> anyLine) +endOfCell :: OrgParser Char +endOfCell = try $ char '|' <|> lookAhead newline + rowsToTable :: [OrgTableRow] -> F OrgTable -rowsToTable = foldM (flip rowToContent) zeroTable - where zeroTable = OrgTable 0 mempty mempty mempty - -normalizeTable :: OrgTable - -> OrgTable -normalizeTable (OrgTable cols aligns heads lns) = - let aligns' = fillColumns aligns AlignDefault - heads' = if heads == mempty - then mempty - else fillColumns heads (B.plain mempty) - lns' = map (`fillColumns` B.plain mempty) lns - fillColumns base padding = take cols $ base ++ repeat padding - in OrgTable cols aligns' heads' lns' +rowsToTable = foldM rowToContent emptyTable + where emptyTable = OrgTable mempty mempty mempty +normalizeTable :: OrgTable -> OrgTable +normalizeTable (OrgTable aligns heads rows) = OrgTable aligns' heads rows + where + refRow = if heads /= mempty + then heads + else if rows == mempty then mempty else head rows + cols = length refRow + fillColumns base padding = take cols $ base ++ repeat padding + aligns' = fillColumns aligns AlignDefault -- One or more horizontal rules after the first content line mark the previous -- line as a header. All other horizontal lines are discarded. -rowToContent :: OrgTableRow - -> OrgTable - -> F OrgTable -rowToContent OrgHlineRow t = maybeBodyToHeader t -rowToContent (OrgAlignRow as) t = setLongestRow as =<< setAligns as t -rowToContent (OrgContentRow rf) t = do - rs <- rf - setLongestRow rs =<< appendToBody rs t - -setLongestRow :: [a] - -> OrgTable - -> F OrgTable -setLongestRow rs t = - return t{ orgTableColumns = max (length rs) (orgTableColumns t) } - -maybeBodyToHeader :: OrgTable - -> F OrgTable -maybeBodyToHeader t = case t of - OrgTable{ orgTableHeader = [], orgTableRows = b:[] } -> - return t{ orgTableHeader = b , orgTableRows = [] } - _ -> return t - -appendToBody :: [Blocks] - -> OrgTable +rowToContent :: OrgTable + -> OrgTableRow -> F OrgTable -appendToBody r t = return t{ orgTableRows = orgTableRows t ++ [r] } +rowToContent orgTable row = + case row of + OrgHlineRow -> return singleRowPromotedToHeader + OrgAlignRow as -> return . setAligns $ as + OrgContentRow cs -> appendToBody cs + where + singleRowPromotedToHeader :: OrgTable + singleRowPromotedToHeader = case orgTable of + OrgTable{ orgTableHeader = [], orgTableRows = b:[] } -> + orgTable{ orgTableHeader = b , orgTableRows = [] } + _ -> orgTable -setAligns :: [Alignment] - -> OrgTable - -> F OrgTable -setAligns aligns t = return $ t{ orgTableAlignments = aligns } + setAligns :: [Alignment] -> OrgTable + setAligns aligns = orgTable{ orgTableAlignments = aligns } + + appendToBody :: F [Blocks] -> F OrgTable + appendToBody frow = do + newRow <- frow + let oldRows = orgTableRows orgTable + -- NOTE: This is an inefficient O(n) operation. This should be changed + -- if performance ever becomes a problem. + return orgTable{ orgTableRows = oldRows ++ [newRow] } -- @@ -1565,14 +1581,14 @@ inlineLaTeX = try $ do parseAsMathMLSym :: String -> Maybe Inlines parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs) - -- dropWhileEnd would be nice here, but it's not available before base 4.5 - where clean = reverse . dropWhile (`elem` ("{}" :: String)) . reverse . drop 1 + -- drop initial backslash and any trailing "{}" + where clean = dropWhileEnd (`elem` ("{}" :: String)) . drop 1 state :: ParserState state = def{ stateOptions = def{ readerParseRaw = True }} - texMathToPandoc inp = (maybeRight $ readTeX inp) >>= - writePandoc DisplayInline + texMathToPandoc :: String -> Maybe [Inline] + texMathToPandoc cs = (maybeRight $ readTeX cs) >>= writePandoc DisplayInline maybeRight :: Either a b -> Maybe b maybeRight = either (const Nothing) Just @@ -1582,11 +1598,18 @@ inlineLaTeXCommand = try $ do rest <- getInput case runParser rawLaTeXInline def "source" rest of Right (RawInline _ cs) -> do - let len = length cs + -- drop any trailing whitespace, those are not be part of the command as + -- far as org mode is concerned. + let cmdNoSpc = dropWhileEnd isSpace cs + let len = length cmdNoSpc count len anyChar - return cs + return cmdNoSpc _ -> mzero +-- Taken from Data.OldList. +dropWhileEnd :: (a -> Bool) -> [a] -> [a] +dropWhileEnd p = foldr (\x xs -> if p x && null xs then [] else x : xs) [] + smart :: OrgParser (F Inlines) smart = do getOption readerSmart >>= guard |