From a340c7249f8e19d36ee4a68663b4c97e0893292b Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Sun, 22 May 2016 16:33:31 +0200 Subject: Org reader: extract parsing function to module The Org-mode reader uses many functions defined in the `Text.Pandoc.Parsing` utility module. Some of the functions are overwritten with versions adapted to Org-mode idiosyncrasies. These special functions, as well as the normal Pandoc versions, are combined in a single module to increase the ease of use. This leads to decoupling of Org-mode and Pandoc and hence to slightly cleaner code. The downside is code-bloat due to repeated import/export statements. --- src/Text/Pandoc/Readers/Org.hs | 86 ++------------ src/Text/Pandoc/Readers/Org/ParserState.hs | 7 +- src/Text/Pandoc/Readers/Org/Parsing.hs | 182 +++++++++++++++++++++++++++++ 3 files changed, 197 insertions(+), 78 deletions(-) create mode 100644 src/Text/Pandoc/Readers/Org/Parsing.hs (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index 621e7107f..fd811c078 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -29,27 +29,23 @@ Conversion of org-mode formatted plain text to 'Pandoc' document. -} module Text.Pandoc.Readers.Org ( readOrg ) where +import Text.Pandoc.Readers.Org.ParserState +import Text.Pandoc.Readers.Org.Parsing + import qualified Text.Pandoc.Builder as B import Text.Pandoc.Builder ( Inlines, Blocks ) import Text.Pandoc.Definition import Text.Pandoc.Compat.Monoid ((<>)) import Text.Pandoc.Error import Text.Pandoc.Options -import qualified Text.Pandoc.Parsing as P -import Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF - , anyLine, blanklines, newline - , orderedListMarker - , parseFromString - ) import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline) -import Text.Pandoc.Readers.Org.ParserState import Text.Pandoc.Shared (compactify', compactify'DL) import Text.TeXMath (readTeX, writePandoc, DisplayType(..)) import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap import Control.Arrow (first) import Control.Monad (foldM, guard, mplus, mzero, when) -import Control.Monad.Reader ( Reader, runReader ) +import Control.Monad.Reader ( runReader ) import Data.Char (isAlphaNum, isSpace, toLower, toUpper) import Data.List ( foldl', intersperse, isPrefixOf, isSuffixOf ) import qualified Data.Map as M @@ -63,9 +59,6 @@ readOrg :: ReaderOptions -- ^ Reader options -> Either PandocError Pandoc readOrg opts s = flip runReader def $ readWithM parseOrg def{ orgStateOptions = opts } (s ++ "\n\n") --- | The parser used to read org files. -type OrgParser = ParserT [Char] OrgParserState (Reader OrgParserLocal) - -- -- Functions acting on the parser state -- @@ -73,14 +66,6 @@ recordAnchorId :: String -> OrgParser () recordAnchorId i = updateState $ \s -> s{ orgStateAnchorIds = i : (orgStateAnchorIds s) } -updateLastForbiddenCharPos :: OrgParser () -updateLastForbiddenCharPos = getPosition >>= \p -> - updateState $ \s -> s{ orgStateLastForbiddenCharPos = Just p} - -updateLastPreCharPos :: OrgParser () -updateLastPreCharPos = getPosition >>= \p -> - updateState $ \s -> s{ orgStateLastPreCharPos = Just p} - pushToInlineCharStack :: Char -> OrgParser () pushToInlineCharStack c = updateState $ \s -> s{ orgStateEmphasisCharStack = c:orgStateEmphasisCharStack s } @@ -258,57 +243,6 @@ isHeaderLevelLowerEq n blk = _ -> False --- --- Adaptions and specializations of parsing utilities --- - --- The version Text.Pandoc.Parsing cannot be used, as we need additional parts --- of the state saved and restored. -parseFromString :: OrgParser a -> String -> OrgParser a -parseFromString parser str' = do - oldLastPreCharPos <- orgStateLastPreCharPos <$> getState - updateState $ \s -> s{ orgStateLastPreCharPos = Nothing } - result <- P.parseFromString parser str' - updateState $ \s -> s{ orgStateLastPreCharPos = oldLastPreCharPos } - return result - --- | Like @Text.Parsec.Char.newline@, but causes additional state changes. -newline :: OrgParser Char -newline = - P.newline - <* updateLastPreCharPos - <* updateLastForbiddenCharPos - --- | Like @Text.Parsec.Char.blanklines@, but causes additional state changes. -blanklines :: OrgParser [Char] -blanklines = - P.blanklines - <* updateLastPreCharPos - <* updateLastForbiddenCharPos - -anyLine :: OrgParser String -anyLine = - P.anyLine - <* updateLastPreCharPos - <* updateLastForbiddenCharPos - --- | Succeeds when we're in list context. -inList :: OrgParser () -inList = do - ctx <- orgStateParserContext <$> getState - guard (ctx == ListItemState) - --- | Parse in different context -withContext :: ParserContext -- ^ New parser context - -> OrgParser a -- ^ Parser to run in that context - -> OrgParser a -withContext context parser = do - oldContext <- orgStateParserContext <$> getState - updateState $ \s -> s{ orgStateParserContext = context } - result <- parser - updateState $ \s -> s{ orgStateParserContext = oldContext } - return result - -- -- parsing blocks -- @@ -398,7 +332,7 @@ keyValues = try $ endOfValue :: OrgParser () endOfValue = lookAhead $ (() <$ try (many1 spaceChar <* key)) - <|> () <$ P.newline + <|> () <$ newline -- @@ -675,7 +609,7 @@ propertiesDrawer = try $ do key = try $ skipSpaces *> char ':' *> many1Till nonspaceChar (char ':') value :: OrgParser String - value = try $ skipSpaces *> manyTill anyChar (try $ skipSpaces *> P.newline) + value = try $ skipSpaces *> manyTill anyChar (try $ skipSpaces *> newline) keyValuesToAttr :: [(String, String)] -> Attr keyValuesToAttr kvs = @@ -696,7 +630,7 @@ keyValuesToAttr kvs = figure :: OrgParser (F Blocks) figure = try $ do figAttrs <- blockAttributes - src <- skipSpaces *> selfTarget <* skipSpaces <* P.newline + src <- skipSpaces *> selfTarget <* skipSpaces <* newline guard . not . isNothing . blockAttrCaption $ figAttrs guard (isImageFilename src) let figName = fromMaybe mempty $ blockAttrName figAttrs @@ -785,7 +719,7 @@ parseFormat = try $ do header :: OrgParser (F Blocks) header = try $ do level <- headerStart - title <- manyTill inline (lookAhead $ optional headerTags <* P.newline) + title <- manyTill inline (lookAhead $ optional headerTags <* newline) tags <- option [] headerTags newline propAttr <- option nullAttr (keyValuesToAttr <$> propertiesDrawer) @@ -1083,7 +1017,7 @@ definitionListItem parseMarkerGetLength = try $ do return $ (,) <$> term' <*> fmap (:[]) contents' where definitionMarker = - spaceChar *> string "::" <* (spaceChar <|> lookAhead P.newline) + spaceChar *> string "::" <* (spaceChar <|> lookAhead newline) -- parse raw text for one list item, excluding start marker and continuations @@ -1559,7 +1493,7 @@ many1TillNOrLessNewlines n p end = try $ nMoreLines k cs = try $ (final k cs <|> rest k cs) >>= uncurry nMoreLines final _ cs = (\x -> (Nothing, cs ++ x)) <$> try finalLine - rest m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> try (manyTill p P.newline) + rest m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> try (manyTill p newline) finalLine = try $ manyTill p end minus1 k = k - 1 oneOrMore cs = guard (not $ null cs) *> return cs diff --git a/src/Text/Pandoc/Readers/Org/ParserState.hs b/src/Text/Pandoc/Readers/Org/ParserState.hs index 6a902cd46..e648a883e 100644 --- a/src/Text/Pandoc/Readers/Org/ParserState.hs +++ b/src/Text/Pandoc/Readers/Org/ParserState.hs @@ -29,9 +29,11 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Define the Org-mode parser state. -} module Text.Pandoc.Readers.Org.ParserState - ( OrgParserState(..) - , OrgParserLocal(..) + ( OrgParserState (..) + , OrgParserLocal (..) , OrgNoteRecord + , HasReaderOptions (..) + , HasQuoteContext (..) , F(..) , askF , asksF @@ -184,6 +186,7 @@ modifyExportSettings :: ExportSettingSetter a -> a -> OrgParserState -> OrgParse modifyExportSettings setter val state = state { orgStateExportSettings = setter val . orgStateExportSettings $ state } + -- -- Parser state reader -- diff --git a/src/Text/Pandoc/Readers/Org/Parsing.hs b/src/Text/Pandoc/Readers/Org/Parsing.hs new file mode 100644 index 000000000..efe2ae25f --- /dev/null +++ b/src/Text/Pandoc/Readers/Org/Parsing.hs @@ -0,0 +1,182 @@ +{- +Copyright (C) 2014-2016 Albert Krewinkel + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org.Options + Copyright : Copyright (C) 2014-2016 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel + +Org-mode parsing utilities. + +Most functions are simply re-exports from @Text.Pandoc.Parsing@, some +functions are adapted to Org-mode specific functionality. +-} +module Text.Pandoc.Readers.Org.Parsing + ( OrgParser + , anyLine + , blanklines + , newline + , parseFromString + , inList + , withContext + , updateLastForbiddenCharPos + , updateLastPreCharPos + -- * Re-exports from Text.Pandoc.Parser + , ParserContext (..) + , many1Till + , notFollowedBy' + , spaceChar + , nonspaceChar + , skipSpaces + , blankline + , enclosed + , stringAnyCase + , charsInBalanced + , uri + , withRaw + , readWithM + , guardEnabled + , updateLastStrPos + , notAfterString + , ParserState (..) + , registerHeader + , QuoteContext (..) + , singleQuoteStart + , singleQuoteEnd + , doubleQuoteStart + , doubleQuoteEnd + , dash + , ellipses + , citeKey + -- * Re-exports from Text.Pandoc.Parsec + , runParser + , getInput + , char + , letter + , digit + , alphaNum + , skipMany1 + , spaces + , anyChar + , string + , count + , eof + , noneOf + , oneOf + , lookAhead + , notFollowedBy + , many + , many1 + , manyTill + , (<|>) + , () + , choice + , try + , sepBy + , sepBy1 + , option + , optional + , optionMaybe + , getState + , updateState + , SourcePos + , getPosition + ) where + +import Text.Pandoc.Readers.Org.ParserState + +import qualified Text.Pandoc.Parsing as P +import Text.Pandoc.Parsing hiding ( anyLine, blanklines, newline + , parseFromString ) + +import Control.Monad ( guard ) +import Control.Monad.Reader ( Reader ) + +-- | The parser used to read org files. +type OrgParser = ParserT [Char] OrgParserState (Reader OrgParserLocal) + +-- +-- Adaptions and specializations of parsing utilities +-- + +-- | Parse any line of text +anyLine :: OrgParser String +anyLine = + P.anyLine + <* updateLastPreCharPos + <* updateLastForbiddenCharPos + +-- The version Text.Pandoc.Parsing cannot be used, as we need additional parts +-- of the state saved and restored. +parseFromString :: OrgParser a -> String -> OrgParser a +parseFromString parser str' = do + oldLastPreCharPos <- orgStateLastPreCharPos <$> getState + updateState $ \s -> s{ orgStateLastPreCharPos = Nothing } + result <- P.parseFromString parser str' + updateState $ \s -> s{ orgStateLastPreCharPos = oldLastPreCharPos } + return result + +-- | Like @Text.Parsec.Char.newline@, but causes additional state changes. +newline :: OrgParser Char +newline = + P.newline + <* updateLastPreCharPos + <* updateLastForbiddenCharPos + +-- | Like @Text.Parsec.Char.blanklines@, but causes additional state changes. +blanklines :: OrgParser [Char] +blanklines = + P.blanklines + <* updateLastPreCharPos + <* updateLastForbiddenCharPos + +-- | Succeeds when we're in list context. +inList :: OrgParser () +inList = do + ctx <- orgStateParserContext <$> getState + guard (ctx == ListItemState) + +-- | Parse in different context +withContext :: ParserContext -- ^ New parser context + -> OrgParser a -- ^ Parser to run in that context + -> OrgParser a +withContext context parser = do + oldContext <- orgStateParserContext <$> getState + updateState $ \s -> s{ orgStateParserContext = context } + result <- parser + updateState $ \s -> s{ orgStateParserContext = oldContext } + return result + +-- +-- Parser state update functions +-- + +-- | Set the current position as the last position at which a forbidden char +-- was found (i.e. a character which is not allowed at the inner border of +-- markup). +updateLastForbiddenCharPos :: OrgParser () +updateLastForbiddenCharPos = getPosition >>= \p -> + updateState $ \s -> s{ orgStateLastForbiddenCharPos = Just p} + +-- | Set the current parser position as the position at which a character was +-- seen which allows inline markup to follow. +updateLastPreCharPos :: OrgParser () +updateLastPreCharPos = getPosition >>= \p -> + updateState $ \s -> s{ orgStateLastPreCharPos = Just p} -- cgit v1.2.3 From 39e8b4276e6d88d5cbb943d04c866dde9bf6473c Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Sun, 22 May 2016 16:52:06 +0200 Subject: Org reader: extract inline parser to module Inline parsing code is moved to a separate module. Parsers for block starts are extracted as well, as those are used in the `endline` parser. This is part of the Org-mode reader cleanup effort. --- pandoc.cabal | 2 + src/Text/Pandoc/Readers/Org.hs | 797 ++--------------------------- src/Text/Pandoc/Readers/Org/BlockStarts.hs | 112 ++++ src/Text/Pandoc/Readers/Org/Inlines.hs | 715 ++++++++++++++++++++++++++ src/Text/Pandoc/Readers/Org/Parsing.hs | 19 + 5 files changed, 889 insertions(+), 756 deletions(-) create mode 100644 src/Text/Pandoc/Readers/Org/BlockStarts.hs create mode 100644 src/Text/Pandoc/Readers/Org/Inlines.hs (limited to 'src/Text') diff --git a/pandoc.cabal b/pandoc.cabal index 61b5043ba..7286bd890 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -393,6 +393,8 @@ Library Text.Pandoc.Readers.Odt.Generic.XMLConverter, Text.Pandoc.Readers.Odt.Arrows.State, Text.Pandoc.Readers.Odt.Arrows.Utils, + Text.Pandoc.Readers.Org.BlockStarts, + Text.Pandoc.Readers.Org.Inlines, Text.Pandoc.Readers.Org.ParserState, Text.Pandoc.Readers.Org.Parsing, Text.Pandoc.Writers.Shared, diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index fd811c078..605d2220e 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -1,5 +1,4 @@ {-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE OverloadedStrings #-} {- Copyright (C) 2014-2016 Albert Krewinkel @@ -29,6 +28,8 @@ Conversion of org-mode formatted plain text to 'Pandoc' document. -} module Text.Pandoc.Readers.Org ( readOrg ) where +import Text.Pandoc.Readers.Org.BlockStarts +import Text.Pandoc.Readers.Org.Inlines import Text.Pandoc.Readers.Org.ParserState import Text.Pandoc.Readers.Org.Parsing @@ -38,19 +39,16 @@ import Text.Pandoc.Definition import Text.Pandoc.Compat.Monoid ((<>)) import Text.Pandoc.Error import Text.Pandoc.Options -import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline) -import Text.Pandoc.Shared (compactify', compactify'DL) -import Text.TeXMath (readTeX, writePandoc, DisplayType(..)) -import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap +import Text.Pandoc.Shared ( compactify', compactify'DL ) -import Control.Arrow (first) -import Control.Monad (foldM, guard, mplus, mzero, when) +import Control.Arrow ( first ) +import Control.Monad ( foldM, guard, mzero ) import Control.Monad.Reader ( runReader ) -import Data.Char (isAlphaNum, isSpace, toLower, toUpper) -import Data.List ( foldl', intersperse, isPrefixOf, isSuffixOf ) +import Data.Char ( toLower, toUpper) +import Data.List ( foldl', intersperse, isPrefixOf ) import qualified Data.Map as M import Data.Maybe ( fromMaybe, isNothing ) -import Network.HTTP (urlEncode) +import Network.HTTP ( urlEncode ) -- | Parse org-mode string and return a Pandoc document. @@ -59,54 +57,6 @@ readOrg :: ReaderOptions -- ^ Reader options -> Either PandocError Pandoc readOrg opts s = flip runReader def $ readWithM parseOrg def{ orgStateOptions = opts } (s ++ "\n\n") --- --- Functions acting on the parser state --- -recordAnchorId :: String -> OrgParser () -recordAnchorId i = updateState $ \s -> - s{ orgStateAnchorIds = i : (orgStateAnchorIds s) } - -pushToInlineCharStack :: Char -> OrgParser () -pushToInlineCharStack c = updateState $ \s -> - s{ orgStateEmphasisCharStack = c:orgStateEmphasisCharStack s } - -popInlineCharStack :: OrgParser () -popInlineCharStack = updateState $ \s -> - s{ orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ s } - -surroundingEmphasisChar :: OrgParser [Char] -surroundingEmphasisChar = - take 1 . drop 1 . orgStateEmphasisCharStack <$> getState - -startEmphasisNewlinesCounting :: Int -> OrgParser () -startEmphasisNewlinesCounting maxNewlines = updateState $ \s -> - s{ orgStateEmphasisNewlines = Just maxNewlines } - -decEmphasisNewlinesCount :: OrgParser () -decEmphasisNewlinesCount = updateState $ \s -> - s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s } - -newlinesCountWithinLimits :: OrgParser Bool -newlinesCountWithinLimits = do - st <- getState - return $ ((< 0) <$> orgStateEmphasisNewlines st) /= Just True - -resetEmphasisNewlines :: OrgParser () -resetEmphasisNewlines = updateState $ \s -> - s{ orgStateEmphasisNewlines = Nothing } - -addLinkFormat :: String - -> (String -> String) - -> OrgParser () -addLinkFormat key formatter = updateState $ \s -> - let fs = orgStateLinkFormatters s - in s{ orgStateLinkFormatters = M.insert key formatter fs } - -addToNotesTable :: OrgNoteRecord -> OrgParser () -addToNotesTable note = do - oldnotes <- orgStateNotes' <$> getState - updateState $ \s -> s{ orgStateNotes' = note:oldnotes } - -- -- Export Settings -- @@ -259,7 +209,7 @@ block = choice [ mempty <$ blanklines , genericDrawer , specialLine , header - , return <$> hline + , horizontalRule , list , latexFragment , noteBlock @@ -457,9 +407,6 @@ indentWith num = do type SwitchOption = (Char, Maybe String) -orgArgWord :: OrgParser String -orgArgWord = many1 orgArgWordChar - -- | Parse code block arguments -- TODO: We currently don't handle switches. codeHeaderArgs :: OrgParser ([String], [(String, String)]) @@ -474,7 +421,10 @@ codeHeaderArgs = try $ do , map toRundocAttrib (("language", language) : parameters) ) else ([ pandocLang ], parameters) - where hasRundocParameters = not . null + where + hasRundocParameters = not . null + toRundocAttrib = first ("rundoc-" ++) + switch :: OrgParser SwitchOption switch = try $ simpleSwitch <|> lineNumbersSwitch @@ -508,17 +458,6 @@ blockOption = try $ do paramValue <- option "yes" orgParamValue return (argKey, paramValue) -inlineBlockOption :: OrgParser (String, String) -inlineBlockOption = try $ do - argKey <- orgArgKey - paramValue <- option "yes" orgInlineParamValue - return (argKey, paramValue) - -orgArgKey :: OrgParser String -orgArgKey = try $ - skipSpaces *> char ':' - *> many1 orgArgWordChar - orgParamValue :: OrgParser String orgParamValue = try $ skipSpaces @@ -526,19 +465,6 @@ orgParamValue = try $ *> many1 (noneOf "\t\n\r ") <* skipSpaces -orgInlineParamValue :: OrgParser String -orgInlineParamValue = try $ - skipSpaces - *> notFollowedBy (char ':') - *> many1 (noneOf "\t\n\r ]") - <* skipSpaces - -orgArgWordChar :: OrgParser Char -orgArgWordChar = alphaNum <|> oneOf "-_" - -toRundocAttrib :: (String, String) -> (String, String) -toRundocAttrib = first ("rundoc-" ++) - commaEscaped :: String -> String commaEscaped (',':cs@('*':_)) = cs commaEscaped (',':cs@('#':'+':_)) = cs @@ -552,7 +478,10 @@ exampleCode :: String -> Blocks exampleCode = B.codeBlockWith ("", ["example"], []) exampleLine :: OrgParser String -exampleLine = try $ skipSpaces *> string ": " *> anyLine +exampleLine = try $ exampleLineStart *> anyLine + +horizontalRule :: OrgParser (F Blocks) +horizontalRule = return B.horizontalRule <$ try hline -- @@ -582,11 +511,6 @@ genericDrawer = try $ do drawerDiv :: String -> F Blocks -> F Blocks drawerDiv drawerName = fmap $ B.divWith (mempty, [drawerName, "drawer"], mempty) -drawerStart :: OrgParser String -drawerStart = try $ - skipSpaces *> drawerName <* skipSpaces <* newline - where drawerName = char ':' *> manyTill nonspaceChar (char ':') - drawerLine :: OrgParser String drawerLine = anyLine @@ -639,31 +563,38 @@ figure = try $ do let attr = (mempty, mempty, figKeyVals) return $ (B.para . B.imageWith attr src (withFigPrefix figName) <$> figCaption) where + withFigPrefix :: String -> String withFigPrefix cs = if "fig:" `isPrefixOf` cs then cs else "fig:" ++ cs + selfTarget :: OrgParser String + selfTarget = try $ char '[' *> linkTarget <* char ']' + + -- -- Comments, Options and Metadata -- + +addLinkFormat :: String + -> (String -> String) + -> OrgParser () +addLinkFormat key formatter = updateState $ \s -> + let fs = orgStateLinkFormatters s + in s{ orgStateLinkFormatters = M.insert key formatter fs } + specialLine :: OrgParser (F Blocks) specialLine = fmap return . try $ metaLine <|> commentLine -metaLine :: OrgParser Blocks -metaLine = mempty <$ metaLineStart <* (optionLine <|> declarationLine) - -- The order, in which blocks are tried, makes sure that we're not looking at -- the beginning of a block, so we don't need to check for it -metaLineStart :: OrgParser () -metaLineStart = try $ skipSpaces <* string "#+" +metaLine :: OrgParser Blocks +metaLine = mempty <$ metaLineStart <* (optionLine <|> declarationLine) commentLine :: OrgParser Blocks commentLine = commentLineStart *> anyLine *> pure mempty -commentLineStart :: OrgParser () -commentLineStart = try $ skipSpaces <* string "# " - declarationLine :: OrgParser () declarationLine = try $ do key <- metaKey @@ -741,23 +672,6 @@ header = try $ do *> many1 tag <* skipSpaces -headerStart :: OrgParser Int -headerStart = try $ - (length <$> many1 (char '*')) <* many1 (char ' ') <* updateLastPreCharPos - - --- Don't use (or need) the reader wrapper here, we want hline to be --- @show@able. Otherwise we can't use it with @notFollowedBy'@. - --- | Horizontal Line (five -- dashes or more) -hline :: OrgParser Blocks -hline = try $ do - skipSpaces - string "-----" - many (char '-') - skipSpaces - newline - return B.horizontalRule -- -- Tables @@ -793,9 +707,6 @@ orgToPandocTable :: OrgTable orgToPandocTable (OrgTable aligns heads lns) caption = B.table caption (zip aligns $ repeat 0) heads lns -tableStart :: OrgParser Char -tableStart = try $ skipSpaces *> char '|' - tableRows :: OrgParser [OrgTableRow] tableRows = try $ many (tableAlignRow <|> tableHline <|> tableContentRow) @@ -899,25 +810,12 @@ latexFragment = try $ do , "\\end{", e, "}\n" ] -latexEnvStart :: OrgParser String -latexEnvStart = try $ do - skipSpaces *> string "\\begin{" - *> latexEnvName - <* string "}" - <* blankline - latexEnd :: String -> OrgParser () latexEnd envName = try $ () <$ skipSpaces <* string ("\\end{" ++ envName ++ "}") <* blankline --- | Parses a LaTeX environment name. -latexEnvName :: OrgParser String -latexEnvName = try $ do - mappend <$> many1 alphaNum - <*> option "" (string "*") - -- -- Footnote defintions @@ -942,7 +840,7 @@ paraOrPlain = try $ do -- is directly followed by a list item, in which case the block is read as -- plain text. try (guard nl - *> notFollowedBy (inList *> (orderedListStart <|> bulletListStart)) + *> notFollowedBy (inList *> (() <$ orderedListStart <|> bulletListStart)) *> return (B.para <$> ils)) <|> (return (B.plain <$> ils)) @@ -971,38 +869,21 @@ orderedList :: OrgParser (F Blocks) orderedList = fmap B.orderedList . fmap compactify' . sequence <$> many1 (listItem orderedListStart) -genericListStart :: OrgParser String - -> OrgParser Int -genericListStart listMarker = try $ - (+) <$> (length <$> many spaceChar) - <*> (length <$> listMarker <* many1 spaceChar) - --- parses bullet list marker. maybe we know the indent level -bulletListStart :: OrgParser Int -bulletListStart = bulletListStart' Nothing - bulletListStart' :: Maybe Int -> OrgParser Int -- returns length of bulletList prefix, inclusive of marker bulletListStart' Nothing = do ind <- length <$> many spaceChar - when (ind == 0) $ notFollowedBy (char '*') - oneOf bullets - many1 spaceChar + oneOf (bullets $ ind == 0) + skipSpaces1 return (ind + 1) - -- Unindented lists are legal, but they can't use '*' bullets - -- We return n to maintain compatibility with the generic listItem bulletListStart' (Just n) = do count (n-1) spaceChar - when (n == 1) $ notFollowedBy (char '*') - oneOf bullets + oneOf (bullets $ n == 1) many1 spaceChar return n -bullets :: String -bullets = "*+-" - -orderedListStart :: OrgParser Int -orderedListStart = genericListStart orderedListMarker - -- Ordered list markers allowed in org-mode - where orderedListMarker = mappend <$> many1 digit <*> (pure <$> oneOf ".)") +-- Unindented lists are legal, but they can't use '*' bullets. +-- We return n to maintain compatibility with the generic listItem. +bullets :: Bool -> String +bullets unindented = if unindented then "+-" else "*+-" definitionListItem :: OrgParser Int -> OrgParser (F (Inlines, [Blocks])) @@ -1040,602 +921,6 @@ listContinuation markerLength = try $ <*> many blankline) where listLine = try $ indentWith markerLength *> anyLineNewline +-- | Parse any line, include the final newline in the output. anyLineNewline :: OrgParser String anyLineNewline = (++ "\n") <$> anyLine - - --- --- inline --- - -inline :: OrgParser (F Inlines) -inline = - choice [ whitespace - , linebreak - , cite - , footnote - , linkOrImage - , anchor - , inlineCodeBlock - , str - , endline - , emph - , strong - , strikeout - , underline - , code - , math - , displayMath - , verbatim - , subscript - , superscript - , inlineLaTeX - , smart - , symbol - ] <* (guard =<< newlinesCountWithinLimits) - "inline" - -parseInlines :: OrgParser (F Inlines) -parseInlines = trimInlinesF . mconcat <$> many1 inline - --- treat these as potentially non-text when parsing inline: -specialChars :: [Char] -specialChars = "\"$'()*+-,./:<=>[\\]^_{|}~" - - -whitespace :: OrgParser (F Inlines) -whitespace = pure B.space <$ skipMany1 spaceChar - <* updateLastPreCharPos - <* updateLastForbiddenCharPos - "whitespace" - -linebreak :: OrgParser (F Inlines) -linebreak = try $ pure B.linebreak <$ string "\\\\" <* skipSpaces <* newline - -str :: OrgParser (F Inlines) -str = return . B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") - <* updateLastStrPos - --- | An endline character that can be treated as a space, not a structural --- break. This should reflect the values of the Emacs variable --- @org-element-pagaraph-separate@. -endline :: OrgParser (F Inlines) -endline = try $ do - newline - notFollowedBy blankline - notFollowedBy' exampleLine - notFollowedBy' hline - notFollowedBy' noteMarker - notFollowedBy' tableStart - notFollowedBy' drawerStart - notFollowedBy' headerStart - notFollowedBy' metaLineStart - notFollowedBy' latexEnvStart - notFollowedBy' commentLineStart - notFollowedBy' bulletListStart - notFollowedBy' orderedListStart - decEmphasisNewlinesCount - guard =<< newlinesCountWithinLimits - updateLastPreCharPos - return . return $ B.softbreak - -cite :: OrgParser (F Inlines) -cite = try $ do - guardEnabled Ext_citations - (cs, raw) <- withRaw normalCite - return $ (flip B.cite (B.text raw)) <$> cs - -normalCite :: OrgParser (F [Citation]) -normalCite = try $ char '[' - *> skipSpaces - *> citeList - <* skipSpaces - <* char ']' - -citeList :: OrgParser (F [Citation]) -citeList = sequence <$> sepBy1 citation (try $ char ';' *> skipSpaces) - -citation :: OrgParser (F Citation) -citation = try $ do - pref <- prefix - (suppress_author, key) <- citeKey - suff <- suffix - return $ do - x <- pref - y <- suff - return $ Citation{ citationId = key - , citationPrefix = B.toList x - , citationSuffix = B.toList y - , citationMode = if suppress_author - then SuppressAuthor - else NormalCitation - , citationNoteNum = 0 - , citationHash = 0 - } - where - prefix = trimInlinesF . mconcat <$> - manyTill inline (char ']' <|> (']' <$ lookAhead citeKey)) - suffix = try $ do - hasSpace <- option False (notFollowedBy nonspaceChar >> return True) - skipSpaces - rest <- trimInlinesF . mconcat <$> - many (notFollowedBy (oneOf ";]") *> inline) - return $ if hasSpace - then (B.space <>) <$> rest - else rest - -footnote :: OrgParser (F Inlines) -footnote = try $ inlineNote <|> referencedNote - -inlineNote :: OrgParser (F Inlines) -inlineNote = try $ do - string "[fn:" - ref <- many alphaNum - char ':' - note <- fmap B.para . trimInlinesF . mconcat <$> many1Till inline (char ']') - when (not $ null ref) $ - addToNotesTable ("fn:" ++ ref, note) - return $ B.note <$> note - -referencedNote :: OrgParser (F Inlines) -referencedNote = try $ do - ref <- noteMarker - return $ do - notes <- asksF orgStateNotes' - case lookup ref notes of - Nothing -> return $ B.str $ "[" ++ ref ++ "]" - Just contents -> do - st <- askF - let contents' = runF contents st{ orgStateNotes' = [] } - return $ B.note contents' - -noteMarker :: OrgParser String -noteMarker = try $ do - char '[' - choice [ many1Till digit (char ']') - , (++) <$> string "fn:" - <*> many1Till (noneOf "\n\r\t ") (char ']') - ] - -linkOrImage :: OrgParser (F Inlines) -linkOrImage = explicitOrImageLink - <|> selflinkOrImage - <|> angleLink - <|> plainLink - "link or image" - -explicitOrImageLink :: OrgParser (F Inlines) -explicitOrImageLink = try $ do - char '[' - srcF <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget - title <- enclosedRaw (char '[') (char ']') - title' <- parseFromString (mconcat <$> many inline) title - char ']' - return $ do - src <- srcF - if isImageFilename title - then pure $ B.link src "" $ B.image title mempty mempty - else linkToInlinesF src =<< title' - -selflinkOrImage :: OrgParser (F Inlines) -selflinkOrImage = try $ do - src <- char '[' *> linkTarget <* char ']' - return $ linkToInlinesF src (B.str src) - -plainLink :: OrgParser (F Inlines) -plainLink = try $ do - (orig, src) <- uri - returnF $ B.link src "" (B.str orig) - -angleLink :: OrgParser (F Inlines) -angleLink = try $ do - char '<' - link <- plainLink - char '>' - return link - -selfTarget :: OrgParser String -selfTarget = try $ char '[' *> linkTarget <* char ']' - -linkTarget :: OrgParser String -linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]") - -possiblyEmptyLinkTarget :: OrgParser String -possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]") - -applyCustomLinkFormat :: String -> OrgParser (F String) -applyCustomLinkFormat link = do - let (linkType, rest) = break (== ':') link - return $ do - formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters - return $ maybe link ($ drop 1 rest) formatter - --- | Take a link and return a function which produces new inlines when given --- description inlines. -linkToInlinesF :: String -> Inlines -> F Inlines -linkToInlinesF linkStr = - case linkStr of - "" -> pure . B.link mempty "" -- wiki link (empty by convention) - ('#':_) -> pure . B.link linkStr "" -- document-local fraction - _ -> case cleanLinkString linkStr of - (Just cleanedLink) -> if isImageFilename cleanedLink - then const . pure $ B.image cleanedLink "" "" - else pure . B.link cleanedLink "" - Nothing -> internalLink linkStr -- other internal link - --- | Cleanup and canonicalize a string describing a link. Return @Nothing@ if --- the string does not appear to be a link. -cleanLinkString :: String -> Maybe String -cleanLinkString s = - case s of - '/':_ -> Just $ "file://" ++ s -- absolute path - '.':'/':_ -> Just s -- relative path - '.':'.':'/':_ -> Just s -- relative path - -- Relative path or URL (file schema) - 'f':'i':'l':'e':':':s' -> Just $ if ("//" `isPrefixOf` s') then s else s' - _ | isUrl s -> Just s -- URL - _ -> Nothing - where - isUrl :: String -> Bool - isUrl cs = - let (scheme, path) = break (== ':') cs - in all (\c -> isAlphaNum c || c `elem` (".-"::String)) scheme - && not (null path) - -isImageFilename :: String -> Bool -isImageFilename filename = - any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions && - (any (\x -> (x++":") `isPrefixOf` filename) protocols || - ':' `notElem` filename) - where - imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ] - protocols = [ "file", "http", "https" ] - -internalLink :: String -> Inlines -> F Inlines -internalLink link title = do - anchorB <- (link `elem`) <$> asksF orgStateAnchorIds - if anchorB - then return $ B.link ('#':link) "" title - else return $ B.emph title - --- | Parse an anchor like @<>@ and return an empty span with --- @anchor-id@ set as id. Legal anchors in org-mode are defined through --- @org-target-regexp@, which is fairly liberal. Since no link is created if --- @anchor-id@ contains spaces, we are more restrictive in what is accepted as --- an anchor. - -anchor :: OrgParser (F Inlines) -anchor = try $ do - anchorId <- parseAnchor - recordAnchorId anchorId - returnF $ B.spanWith (solidify anchorId, [], []) mempty - where - parseAnchor = string "<<" - *> many1 (noneOf "\t\n\r<>\"' ") - <* string ">>" - <* skipSpaces - --- | Replace every char but [a-zA-Z0-9_.-:] with a hypen '-'. This mirrors --- the org function @org-export-solidify-link-text@. - -solidify :: String -> String -solidify = map replaceSpecialChar - where replaceSpecialChar c - | isAlphaNum c = c - | c `elem` ("_.-:" :: String) = c - | otherwise = '-' - --- | Parses an inline code block and marks it as an babel block. -inlineCodeBlock :: OrgParser (F Inlines) -inlineCodeBlock = try $ do - string "src_" - lang <- many1 orgArgWordChar - opts <- option [] $ enclosedByPair '[' ']' inlineBlockOption - inlineCode <- enclosedByPair '{' '}' (noneOf "\n\r") - let attrClasses = [translateLang lang, rundocBlockClass] - let attrKeyVal = map toRundocAttrib (("language", lang) : opts) - returnF $ B.codeWith ("", attrClasses, attrKeyVal) inlineCode - -enclosedByPair :: Char -- ^ opening char - -> Char -- ^ closing char - -> OrgParser a -- ^ parser - -> OrgParser [a] -enclosedByPair s e p = char s *> many1Till p (char e) - -emph :: OrgParser (F Inlines) -emph = fmap B.emph <$> emphasisBetween '/' - -strong :: OrgParser (F Inlines) -strong = fmap B.strong <$> emphasisBetween '*' - -strikeout :: OrgParser (F Inlines) -strikeout = fmap B.strikeout <$> emphasisBetween '+' - --- There is no underline, so we use strong instead. -underline :: OrgParser (F Inlines) -underline = fmap B.strong <$> emphasisBetween '_' - -verbatim :: OrgParser (F Inlines) -verbatim = return . B.code <$> verbatimBetween '=' - -code :: OrgParser (F Inlines) -code = return . B.code <$> verbatimBetween '~' - -subscript :: OrgParser (F Inlines) -subscript = fmap B.subscript <$> try (char '_' *> subOrSuperExpr) - -superscript :: OrgParser (F Inlines) -superscript = fmap B.superscript <$> try (char '^' *> subOrSuperExpr) - -math :: OrgParser (F Inlines) -math = return . B.math <$> choice [ math1CharBetween '$' - , mathStringBetween '$' - , rawMathBetween "\\(" "\\)" - ] - -displayMath :: OrgParser (F Inlines) -displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]" - , rawMathBetween "$$" "$$" - ] - -updatePositions :: Char - -> OrgParser (Char) -updatePositions c = do - when (c `elem` emphasisPreChars) updateLastPreCharPos - when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos - return c - -symbol :: OrgParser (F Inlines) -symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions) - -emphasisBetween :: Char - -> OrgParser (F Inlines) -emphasisBetween c = try $ do - startEmphasisNewlinesCounting emphasisAllowedNewlines - res <- enclosedInlines (emphasisStart c) (emphasisEnd c) - isTopLevelEmphasis <- null . orgStateEmphasisCharStack <$> getState - when isTopLevelEmphasis - resetEmphasisNewlines - return res - -verbatimBetween :: Char - -> OrgParser String -verbatimBetween c = try $ - emphasisStart c *> - many1TillNOrLessNewlines 1 (noneOf "\n\r") (emphasisEnd c) - --- | Parses a raw string delimited by @c@ using Org's math rules -mathStringBetween :: Char - -> OrgParser String -mathStringBetween c = try $ do - mathStart c - body <- many1TillNOrLessNewlines mathAllowedNewlines - (noneOf (c:"\n\r")) - (lookAhead $ mathEnd c) - final <- mathEnd c - return $ body ++ [final] - --- | Parse a single character between @c@ using math rules -math1CharBetween :: Char - -> OrgParser String -math1CharBetween c = try $ do - char c - res <- noneOf $ c:mathForbiddenBorderChars - char c - eof <|> () <$ lookAhead (oneOf mathPostChars) - return [res] - -rawMathBetween :: String - -> String - -> OrgParser String -rawMathBetween s e = try $ string s *> manyTill anyChar (try $ string e) - --- | Parses the start (opening character) of emphasis -emphasisStart :: Char -> OrgParser Char -emphasisStart c = try $ do - guard =<< afterEmphasisPreChar - guard =<< notAfterString - char c - lookAhead (noneOf emphasisForbiddenBorderChars) - pushToInlineCharStack c - return c - --- | Parses the closing character of emphasis -emphasisEnd :: Char -> OrgParser Char -emphasisEnd c = try $ do - guard =<< notAfterForbiddenBorderChar - char c - eof <|> () <$ lookAhead acceptablePostChars - updateLastStrPos - popInlineCharStack - return c - where acceptablePostChars = - surroundingEmphasisChar >>= \x -> oneOf (x ++ emphasisPostChars) - -mathStart :: Char -> OrgParser Char -mathStart c = try $ - char c <* notFollowedBy' (oneOf (c:mathForbiddenBorderChars)) - -mathEnd :: Char -> OrgParser Char -mathEnd c = try $ do - res <- noneOf (c:mathForbiddenBorderChars) - char c - eof <|> () <$ lookAhead (oneOf mathPostChars) - return res - - -enclosedInlines :: OrgParser a - -> OrgParser b - -> OrgParser (F Inlines) -enclosedInlines start end = try $ - trimInlinesF . mconcat <$> enclosed start end inline - -enclosedRaw :: OrgParser a - -> OrgParser b - -> OrgParser String -enclosedRaw start end = try $ - start *> (onSingleLine <|> spanningTwoLines) - where onSingleLine = try $ many1Till (noneOf "\n\r") end - spanningTwoLines = try $ - anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine - --- | Like many1Till, but parses at most @n+1@ lines. @p@ must not consume --- newlines. -many1TillNOrLessNewlines :: Int - -> OrgParser Char - -> OrgParser a - -> OrgParser String -many1TillNOrLessNewlines n p end = try $ - nMoreLines (Just n) mempty >>= oneOrMore - where - nMoreLines Nothing cs = return cs - nMoreLines (Just 0) cs = try $ (cs ++) <$> finalLine - nMoreLines k cs = try $ (final k cs <|> rest k cs) - >>= uncurry nMoreLines - final _ cs = (\x -> (Nothing, cs ++ x)) <$> try finalLine - rest m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> try (manyTill p newline) - finalLine = try $ manyTill p end - minus1 k = k - 1 - oneOrMore cs = guard (not $ null cs) *> return cs - --- Org allows customization of the way it reads emphasis. We use the defaults --- here (see, e.g., the Emacs Lisp variable `org-emphasis-regexp-components` --- for details). - --- | Chars allowed to occur before emphasis (spaces and newlines are ok, too) -emphasisPreChars :: [Char] -emphasisPreChars = "\t \"'({" - --- | Chars allowed at after emphasis -emphasisPostChars :: [Char] -emphasisPostChars = "\t\n !\"'),-.:;?\\}" - --- | Chars not allowed at the (inner) border of emphasis -emphasisForbiddenBorderChars :: [Char] -emphasisForbiddenBorderChars = "\t\n\r \"'," - --- | The maximum number of newlines within -emphasisAllowedNewlines :: Int -emphasisAllowedNewlines = 1 - --- LaTeX-style math: see `org-latex-regexps` for details - --- | Chars allowed after an inline ($...$) math statement -mathPostChars :: [Char] -mathPostChars = "\t\n \"'),-.:;?" - --- | Chars not allowed at the (inner) border of math -mathForbiddenBorderChars :: [Char] -mathForbiddenBorderChars = "\t\n\r ,;.$" - --- | Maximum number of newlines in an inline math statement -mathAllowedNewlines :: Int -mathAllowedNewlines = 2 - --- | Whether we are right behind a char allowed before emphasis -afterEmphasisPreChar :: OrgParser Bool -afterEmphasisPreChar = do - pos <- getPosition - lastPrePos <- orgStateLastPreCharPos <$> getState - return . fromMaybe True $ (== pos) <$> lastPrePos - --- | Whether the parser is right after a forbidden border char -notAfterForbiddenBorderChar :: OrgParser Bool -notAfterForbiddenBorderChar = do - pos <- getPosition - lastFBCPos <- orgStateLastForbiddenCharPos <$> getState - return $ lastFBCPos /= Just pos - --- | Read a sub- or superscript expression -subOrSuperExpr :: OrgParser (F Inlines) -subOrSuperExpr = try $ - choice [ id <$> charsInBalanced '{' '}' (noneOf "\n\r") - , enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r") - , simpleSubOrSuperString - ] >>= parseFromString (mconcat <$> many inline) - where enclosing (left, right) s = left : s ++ [right] - -simpleSubOrSuperString :: OrgParser String -simpleSubOrSuperString = try $ do - state <- getState - guard . exportSubSuperscripts . orgStateExportSettings $ state - choice [ string "*" - , mappend <$> option [] ((:[]) <$> oneOf "+-") - <*> many1 alphaNum - ] - -inlineLaTeX :: OrgParser (F Inlines) -inlineLaTeX = try $ do - cmd <- inlineLaTeXCommand - maybe mzero returnF $ - parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` parseAsInlineLaTeX cmd - where - parseAsMath :: String -> Maybe Inlines - parseAsMath cs = B.fromList <$> texMathToPandoc cs - - parseAsInlineLaTeX :: String -> Maybe Inlines - parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs - - parseAsMathMLSym :: String -> Maybe Inlines - parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs) - -- drop initial backslash and any trailing "{}" - where clean = dropWhileEnd (`elem` ("{}" :: String)) . drop 1 - - state :: ParserState - state = def{ stateOptions = def{ readerParseRaw = True }} - - texMathToPandoc :: String -> Maybe [Inline] - texMathToPandoc cs = (maybeRight $ readTeX cs) >>= writePandoc DisplayInline - -maybeRight :: Either a b -> Maybe b -maybeRight = either (const Nothing) Just - -inlineLaTeXCommand :: OrgParser String -inlineLaTeXCommand = try $ do - rest <- getInput - case runParser rawLaTeXInline def "source" rest of - Right (RawInline _ cs) -> do - -- drop any trailing whitespace, those are not be part of the command as - -- far as org mode is concerned. - let cmdNoSpc = dropWhileEnd isSpace cs - let len = length cmdNoSpc - count len anyChar - return cmdNoSpc - _ -> mzero - --- Taken from Data.OldList. -dropWhileEnd :: (a -> Bool) -> [a] -> [a] -dropWhileEnd p = foldr (\x xs -> if p x && null xs then [] else x : xs) [] - -smart :: OrgParser (F Inlines) -smart = do - getOption readerSmart >>= guard - doubleQuoted <|> singleQuoted <|> - choice (map (return <$>) [orgApostrophe, orgDash, orgEllipses]) - where - orgDash = dash <* updatePositions '-' - orgEllipses = ellipses <* updatePositions '.' - orgApostrophe = - (char '\'' <|> char '\8217') <* updateLastPreCharPos - <* updateLastForbiddenCharPos - *> return (B.str "\x2019") - -singleQuoted :: OrgParser (F Inlines) -singleQuoted = try $ do - singleQuoteStart - updatePositions '\'' - withQuoteContext InSingleQuote $ - fmap B.singleQuoted . trimInlinesF . mconcat <$> - many1Till inline (singleQuoteEnd <* updatePositions '\'') - --- doubleQuoted will handle regular double-quoted sections, as well --- as dialogues with an open double-quote without a close double-quote --- in the same paragraph. -doubleQuoted :: OrgParser (F Inlines) -doubleQuoted = try $ do - doubleQuoteStart - updatePositions '"' - contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline) - (withQuoteContext InDoubleQuote $ (doubleQuoteEnd <* updateLastForbiddenCharPos) >> return - (fmap B.doubleQuoted . trimInlinesF $ contents)) - <|> (return $ return (B.str "\8220") <> contents) diff --git a/src/Text/Pandoc/Readers/Org/BlockStarts.hs b/src/Text/Pandoc/Readers/Org/BlockStarts.hs new file mode 100644 index 000000000..e4dc31342 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org/BlockStarts.hs @@ -0,0 +1,112 @@ +{- +Copyright (C) 2014-2016 Albert Krewinkel + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org.Options + Copyright : Copyright (C) 2014-2016 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel + +Parsers for Org-mode inline elements. +-} +module Text.Pandoc.Readers.Org.BlockStarts + ( exampleLineStart + , hline + , noteMarker + , tableStart + , drawerStart + , headerStart + , metaLineStart + , latexEnvStart + , commentLineStart + , bulletListStart + , orderedListStart + ) where + +import Text.Pandoc.Readers.Org.Parsing + +-- | Horizontal Line (five -- dashes or more) +hline :: OrgParser () +hline = try $ do + skipSpaces + string "-----" + many (char '-') + skipSpaces + newline + return () + +-- | Read the start of a header line, return the header level +headerStart :: OrgParser Int +headerStart = try $ + (length <$> many1 (char '*')) <* many1 (char ' ') <* updateLastPreCharPos + +tableStart :: OrgParser Char +tableStart = try $ skipSpaces *> char '|' + +latexEnvStart :: OrgParser String +latexEnvStart = try $ do + skipSpaces *> string "\\begin{" + *> latexEnvName + <* string "}" + <* blankline + where + latexEnvName :: OrgParser String + latexEnvName = try $ mappend <$> many1 alphaNum <*> option "" (string "*") + + +-- | Parses bullet list marker. +bulletListStart :: OrgParser () +bulletListStart = try $ + choice + [ () <$ skipSpaces <* oneOf "+-" <* skipSpaces1 + , () <$ skipSpaces1 <* char '*' <* skipSpaces1 + ] + +genericListStart :: OrgParser String + -> OrgParser Int +genericListStart listMarker = try $ + (+) <$> (length <$> many spaceChar) + <*> (length <$> listMarker <* many1 spaceChar) + +orderedListStart :: OrgParser Int +orderedListStart = genericListStart orderedListMarker + -- Ordered list markers allowed in org-mode + where orderedListMarker = mappend <$> many1 digit <*> (pure <$> oneOf ".)") + +drawerStart :: OrgParser String +drawerStart = try $ + skipSpaces *> drawerName <* skipSpaces <* newline + where drawerName = char ':' *> manyTill nonspaceChar (char ':') + +metaLineStart :: OrgParser () +metaLineStart = try $ skipSpaces <* string "#+" + +commentLineStart :: OrgParser () +commentLineStart = try $ skipSpaces <* string "# " + +exampleLineStart :: OrgParser () +exampleLineStart = () <$ try (skipSpaces *> string ": ") + +noteMarker :: OrgParser String +noteMarker = try $ do + char '[' + choice [ many1Till digit (char ']') + , (++) <$> string "fn:" + <*> many1Till (noneOf "\n\r\t ") (char ']') + ] diff --git a/src/Text/Pandoc/Readers/Org/Inlines.hs b/src/Text/Pandoc/Readers/Org/Inlines.hs new file mode 100644 index 000000000..0c3840979 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org/Inlines.hs @@ -0,0 +1,715 @@ +{-# LANGUAGE OverloadedStrings #-} +{- +Copyright (C) 2014-2016 Albert Krewinkel + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org.Options + Copyright : Copyright (C) 2014-2016 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel + +Parsers for Org-mode inline elements. +-} +module Text.Pandoc.Readers.Org.Inlines + ( inline + , addToNotesTable + , parseInlines + , isImageFilename + , linkTarget + ) where + +import Text.Pandoc.Readers.Org.BlockStarts +import Text.Pandoc.Readers.Org.ParserState +import Text.Pandoc.Readers.Org.Parsing + +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder ( Inlines ) +import Text.Pandoc.Definition +import Text.Pandoc.Compat.Monoid ( (<>) ) +import Text.Pandoc.Options +import Text.Pandoc.Readers.LaTeX ( inlineCommand, rawLaTeXInline ) +import Text.TeXMath ( readTeX, writePandoc, DisplayType(..) ) +import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap + +import Control.Arrow ( first ) +import Control.Monad ( guard, mplus, mzero, when ) +import Data.Char ( isAlphaNum, isSpace ) +import Data.List ( isPrefixOf, isSuffixOf ) +import Data.Maybe ( fromMaybe ) +import qualified Data.Map as M + +-- | Prefix used for Rundoc classes and arguments. +rundocPrefix :: String +rundocPrefix = "rundoc-" + +-- | The class-name used to mark rundoc blocks. +rundocBlockClass :: String +rundocBlockClass = rundocPrefix ++ "block" + +toRundocAttrib :: (String, String) -> (String, String) +toRundocAttrib = first ("rundoc-" ++) + +translateLang :: String -> String +translateLang "C" = "c" +translateLang "C++" = "cpp" +translateLang "emacs-lisp" = "commonlisp" -- emacs lisp is not supported +translateLang "js" = "javascript" +translateLang "lisp" = "commonlisp" +translateLang "R" = "r" +translateLang "sh" = "bash" +translateLang "sqlite" = "sql" +translateLang cs = cs + +-- +-- Functions acting on the parser state +-- +recordAnchorId :: String -> OrgParser () +recordAnchorId i = updateState $ \s -> + s{ orgStateAnchorIds = i : (orgStateAnchorIds s) } + +pushToInlineCharStack :: Char -> OrgParser () +pushToInlineCharStack c = updateState $ \s -> + s{ orgStateEmphasisCharStack = c:orgStateEmphasisCharStack s } + +popInlineCharStack :: OrgParser () +popInlineCharStack = updateState $ \s -> + s{ orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ s } + +surroundingEmphasisChar :: OrgParser [Char] +surroundingEmphasisChar = + take 1 . drop 1 . orgStateEmphasisCharStack <$> getState + +startEmphasisNewlinesCounting :: Int -> OrgParser () +startEmphasisNewlinesCounting maxNewlines = updateState $ \s -> + s{ orgStateEmphasisNewlines = Just maxNewlines } + +decEmphasisNewlinesCount :: OrgParser () +decEmphasisNewlinesCount = updateState $ \s -> + s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s } + +newlinesCountWithinLimits :: OrgParser Bool +newlinesCountWithinLimits = do + st <- getState + return $ ((< 0) <$> orgStateEmphasisNewlines st) /= Just True + +resetEmphasisNewlines :: OrgParser () +resetEmphasisNewlines = updateState $ \s -> + s{ orgStateEmphasisNewlines = Nothing } + +addToNotesTable :: OrgNoteRecord -> OrgParser () +addToNotesTable note = do + oldnotes <- orgStateNotes' <$> getState + updateState $ \s -> s{ orgStateNotes' = note:oldnotes } + +-- | Parse a single Org-mode inline element +inline :: OrgParser (F Inlines) +inline = + choice [ whitespace + , linebreak + , cite + , footnote + , linkOrImage + , anchor + , inlineCodeBlock + , str + , endline + , emph + , strong + , strikeout + , underline + , code + , math + , displayMath + , verbatim + , subscript + , superscript + , inlineLaTeX + , smart + , symbol + ] <* (guard =<< newlinesCountWithinLimits) + "inline" + +parseInlines :: OrgParser (F Inlines) +parseInlines = trimInlinesF . mconcat <$> many1 inline + +-- treat these as potentially non-text when parsing inline: +specialChars :: [Char] +specialChars = "\"$'()*+-,./:<=>[\\]^_{|}~" + + +whitespace :: OrgParser (F Inlines) +whitespace = pure B.space <$ skipMany1 spaceChar + <* updateLastPreCharPos + <* updateLastForbiddenCharPos + "whitespace" + +linebreak :: OrgParser (F Inlines) +linebreak = try $ pure B.linebreak <$ string "\\\\" <* skipSpaces <* newline + +str :: OrgParser (F Inlines) +str = return . B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") + <* updateLastStrPos + +-- | An endline character that can be treated as a space, not a structural +-- break. This should reflect the values of the Emacs variable +-- @org-element-pagaraph-separate@. +endline :: OrgParser (F Inlines) +endline = try $ do + newline + notFollowedBy blankline + notFollowedBy' exampleLineStart + notFollowedBy' hline + notFollowedBy' noteMarker + notFollowedBy' tableStart + notFollowedBy' drawerStart + notFollowedBy' headerStart + notFollowedBy' metaLineStart + notFollowedBy' latexEnvStart + notFollowedBy' commentLineStart + notFollowedBy' bulletListStart + notFollowedBy' orderedListStart + decEmphasisNewlinesCount + guard =<< newlinesCountWithinLimits + updateLastPreCharPos + return . return $ B.softbreak + +cite :: OrgParser (F Inlines) +cite = try $ do + guardEnabled Ext_citations + (cs, raw) <- withRaw normalCite + return $ (flip B.cite (B.text raw)) <$> cs + +normalCite :: OrgParser (F [Citation]) +normalCite = try $ char '[' + *> skipSpaces + *> citeList + <* skipSpaces + <* char ']' + +citeList :: OrgParser (F [Citation]) +citeList = sequence <$> sepBy1 citation (try $ char ';' *> skipSpaces) + +citation :: OrgParser (F Citation) +citation = try $ do + pref <- prefix + (suppress_author, key) <- citeKey + suff <- suffix + return $ do + x <- pref + y <- suff + return $ Citation{ citationId = key + , citationPrefix = B.toList x + , citationSuffix = B.toList y + , citationMode = if suppress_author + then SuppressAuthor + else NormalCitation + , citationNoteNum = 0 + , citationHash = 0 + } + where + prefix = trimInlinesF . mconcat <$> + manyTill inline (char ']' <|> (']' <$ lookAhead citeKey)) + suffix = try $ do + hasSpace <- option False (notFollowedBy nonspaceChar >> return True) + skipSpaces + rest <- trimInlinesF . mconcat <$> + many (notFollowedBy (oneOf ";]") *> inline) + return $ if hasSpace + then (B.space <>) <$> rest + else rest + +footnote :: OrgParser (F Inlines) +footnote = try $ inlineNote <|> referencedNote + +inlineNote :: OrgParser (F Inlines) +inlineNote = try $ do + string "[fn:" + ref <- many alphaNum + char ':' + note <- fmap B.para . trimInlinesF . mconcat <$> many1Till inline (char ']') + when (not $ null ref) $ + addToNotesTable ("fn:" ++ ref, note) + return $ B.note <$> note + +referencedNote :: OrgParser (F Inlines) +referencedNote = try $ do + ref <- noteMarker + return $ do + notes <- asksF orgStateNotes' + case lookup ref notes of + Nothing -> return $ B.str $ "[" ++ ref ++ "]" + Just contents -> do + st <- askF + let contents' = runF contents st{ orgStateNotes' = [] } + return $ B.note contents' + +linkOrImage :: OrgParser (F Inlines) +linkOrImage = explicitOrImageLink + <|> selflinkOrImage + <|> angleLink + <|> plainLink + "link or image" + +explicitOrImageLink :: OrgParser (F Inlines) +explicitOrImageLink = try $ do + char '[' + srcF <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget + title <- enclosedRaw (char '[') (char ']') + title' <- parseFromString (mconcat <$> many inline) title + char ']' + return $ do + src <- srcF + if isImageFilename title + then pure $ B.link src "" $ B.image title mempty mempty + else linkToInlinesF src =<< title' + +selflinkOrImage :: OrgParser (F Inlines) +selflinkOrImage = try $ do + src <- char '[' *> linkTarget <* char ']' + return $ linkToInlinesF src (B.str src) + +plainLink :: OrgParser (F Inlines) +plainLink = try $ do + (orig, src) <- uri + returnF $ B.link src "" (B.str orig) + +angleLink :: OrgParser (F Inlines) +angleLink = try $ do + char '<' + link <- plainLink + char '>' + return link + +linkTarget :: OrgParser String +linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]") + +possiblyEmptyLinkTarget :: OrgParser String +possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]") + +applyCustomLinkFormat :: String -> OrgParser (F String) +applyCustomLinkFormat link = do + let (linkType, rest) = break (== ':') link + return $ do + formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters + return $ maybe link ($ drop 1 rest) formatter + +-- | Take a link and return a function which produces new inlines when given +-- description inlines. +linkToInlinesF :: String -> Inlines -> F Inlines +linkToInlinesF linkStr = + case linkStr of + "" -> pure . B.link mempty "" -- wiki link (empty by convention) + ('#':_) -> pure . B.link linkStr "" -- document-local fraction + _ -> case cleanLinkString linkStr of + (Just cleanedLink) -> if isImageFilename cleanedLink + then const . pure $ B.image cleanedLink "" "" + else pure . B.link cleanedLink "" + Nothing -> internalLink linkStr -- other internal link + +-- | Cleanup and canonicalize a string describing a link. Return @Nothing@ if +-- the string does not appear to be a link. +cleanLinkString :: String -> Maybe String +cleanLinkString s = + case s of + '/':_ -> Just $ "file://" ++ s -- absolute path + '.':'/':_ -> Just s -- relative path + '.':'.':'/':_ -> Just s -- relative path + -- Relative path or URL (file schema) + 'f':'i':'l':'e':':':s' -> Just $ if ("//" `isPrefixOf` s') then s else s' + _ | isUrl s -> Just s -- URL + _ -> Nothing + where + isUrl :: String -> Bool + isUrl cs = + let (scheme, path) = break (== ':') cs + in all (\c -> isAlphaNum c || c `elem` (".-"::String)) scheme + && not (null path) + +isImageFilename :: String -> Bool +isImageFilename filename = + any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions && + (any (\x -> (x++":") `isPrefixOf` filename) protocols || + ':' `notElem` filename) + where + imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ] + protocols = [ "file", "http", "https" ] + +internalLink :: String -> Inlines -> F Inlines +internalLink link title = do + anchorB <- (link `elem`) <$> asksF orgStateAnchorIds + if anchorB + then return $ B.link ('#':link) "" title + else return $ B.emph title + +-- | Parse an anchor like @<>@ and return an empty span with +-- @anchor-id@ set as id. Legal anchors in org-mode are defined through +-- @org-target-regexp@, which is fairly liberal. Since no link is created if +-- @anchor-id@ contains spaces, we are more restrictive in what is accepted as +-- an anchor. + +anchor :: OrgParser (F Inlines) +anchor = try $ do + anchorId <- parseAnchor + recordAnchorId anchorId + returnF $ B.spanWith (solidify anchorId, [], []) mempty + where + parseAnchor = string "<<" + *> many1 (noneOf "\t\n\r<>\"' ") + <* string ">>" + <* skipSpaces + +-- | Replace every char but [a-zA-Z0-9_.-:] with a hypen '-'. This mirrors +-- the org function @org-export-solidify-link-text@. + +solidify :: String -> String +solidify = map replaceSpecialChar + where replaceSpecialChar c + | isAlphaNum c = c + | c `elem` ("_.-:" :: String) = c + | otherwise = '-' + +-- | Parses an inline code block and marks it as an babel block. +inlineCodeBlock :: OrgParser (F Inlines) +inlineCodeBlock = try $ do + string "src_" + lang <- many1 orgArgWordChar + opts <- option [] $ enclosedByPair '[' ']' inlineBlockOption + inlineCode <- enclosedByPair '{' '}' (noneOf "\n\r") + let attrClasses = [translateLang lang, rundocBlockClass] + let attrKeyVal = map toRundocAttrib (("language", lang) : opts) + returnF $ B.codeWith ("", attrClasses, attrKeyVal) inlineCode + where + inlineBlockOption :: OrgParser (String, String) + inlineBlockOption = try $ do + argKey <- orgArgKey + paramValue <- option "yes" orgInlineParamValue + return (argKey, paramValue) + + orgInlineParamValue :: OrgParser String + orgInlineParamValue = try $ + skipSpaces + *> notFollowedBy (char ':') + *> many1 (noneOf "\t\n\r ]") + <* skipSpaces + + + +enclosedByPair :: Char -- ^ opening char + -> Char -- ^ closing char + -> OrgParser a -- ^ parser + -> OrgParser [a] +enclosedByPair s e p = char s *> many1Till p (char e) + +emph :: OrgParser (F Inlines) +emph = fmap B.emph <$> emphasisBetween '/' + +strong :: OrgParser (F Inlines) +strong = fmap B.strong <$> emphasisBetween '*' + +strikeout :: OrgParser (F Inlines) +strikeout = fmap B.strikeout <$> emphasisBetween '+' + +-- There is no underline, so we use strong instead. +underline :: OrgParser (F Inlines) +underline = fmap B.strong <$> emphasisBetween '_' + +verbatim :: OrgParser (F Inlines) +verbatim = return . B.code <$> verbatimBetween '=' + +code :: OrgParser (F Inlines) +code = return . B.code <$> verbatimBetween '~' + +subscript :: OrgParser (F Inlines) +subscript = fmap B.subscript <$> try (char '_' *> subOrSuperExpr) + +superscript :: OrgParser (F Inlines) +superscript = fmap B.superscript <$> try (char '^' *> subOrSuperExpr) + +math :: OrgParser (F Inlines) +math = return . B.math <$> choice [ math1CharBetween '$' + , mathStringBetween '$' + , rawMathBetween "\\(" "\\)" + ] + +displayMath :: OrgParser (F Inlines) +displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]" + , rawMathBetween "$$" "$$" + ] + +updatePositions :: Char + -> OrgParser (Char) +updatePositions c = do + when (c `elem` emphasisPreChars) updateLastPreCharPos + when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos + return c + +symbol :: OrgParser (F Inlines) +symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions) + +emphasisBetween :: Char + -> OrgParser (F Inlines) +emphasisBetween c = try $ do + startEmphasisNewlinesCounting emphasisAllowedNewlines + res <- enclosedInlines (emphasisStart c) (emphasisEnd c) + isTopLevelEmphasis <- null . orgStateEmphasisCharStack <$> getState + when isTopLevelEmphasis + resetEmphasisNewlines + return res + +verbatimBetween :: Char + -> OrgParser String +verbatimBetween c = try $ + emphasisStart c *> + many1TillNOrLessNewlines 1 (noneOf "\n\r") (emphasisEnd c) + +-- | Parses a raw string delimited by @c@ using Org's math rules +mathStringBetween :: Char + -> OrgParser String +mathStringBetween c = try $ do + mathStart c + body <- many1TillNOrLessNewlines mathAllowedNewlines + (noneOf (c:"\n\r")) + (lookAhead $ mathEnd c) + final <- mathEnd c + return $ body ++ [final] + +-- | Parse a single character between @c@ using math rules +math1CharBetween :: Char + -> OrgParser String +math1CharBetween c = try $ do + char c + res <- noneOf $ c:mathForbiddenBorderChars + char c + eof <|> () <$ lookAhead (oneOf mathPostChars) + return [res] + +rawMathBetween :: String + -> String + -> OrgParser String +rawMathBetween s e = try $ string s *> manyTill anyChar (try $ string e) + +-- | Parses the start (opening character) of emphasis +emphasisStart :: Char -> OrgParser Char +emphasisStart c = try $ do + guard =<< afterEmphasisPreChar + guard =<< notAfterString + char c + lookAhead (noneOf emphasisForbiddenBorderChars) + pushToInlineCharStack c + return c + +-- | Parses the closing character of emphasis +emphasisEnd :: Char -> OrgParser Char +emphasisEnd c = try $ do + guard =<< notAfterForbiddenBorderChar + char c + eof <|> () <$ lookAhead acceptablePostChars + updateLastStrPos + popInlineCharStack + return c + where acceptablePostChars = + surroundingEmphasisChar >>= \x -> oneOf (x ++ emphasisPostChars) + +mathStart :: Char -> OrgParser Char +mathStart c = try $ + char c <* notFollowedBy' (oneOf (c:mathForbiddenBorderChars)) + +mathEnd :: Char -> OrgParser Char +mathEnd c = try $ do + res <- noneOf (c:mathForbiddenBorderChars) + char c + eof <|> () <$ lookAhead (oneOf mathPostChars) + return res + + +enclosedInlines :: OrgParser a + -> OrgParser b + -> OrgParser (F Inlines) +enclosedInlines start end = try $ + trimInlinesF . mconcat <$> enclosed start end inline + +enclosedRaw :: OrgParser a + -> OrgParser b + -> OrgParser String +enclosedRaw start end = try $ + start *> (onSingleLine <|> spanningTwoLines) + where onSingleLine = try $ many1Till (noneOf "\n\r") end + spanningTwoLines = try $ + anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine + +-- | Like many1Till, but parses at most @n+1@ lines. @p@ must not consume +-- newlines. +many1TillNOrLessNewlines :: Int + -> OrgParser Char + -> OrgParser a + -> OrgParser String +many1TillNOrLessNewlines n p end = try $ + nMoreLines (Just n) mempty >>= oneOrMore + where + nMoreLines Nothing cs = return cs + nMoreLines (Just 0) cs = try $ (cs ++) <$> finalLine + nMoreLines k cs = try $ (final k cs <|> rest k cs) + >>= uncurry nMoreLines + final _ cs = (\x -> (Nothing, cs ++ x)) <$> try finalLine + rest m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> try (manyTill p newline) + finalLine = try $ manyTill p end + minus1 k = k - 1 + oneOrMore cs = guard (not $ null cs) *> return cs + +-- Org allows customization of the way it reads emphasis. We use the defaults +-- here (see, e.g., the Emacs Lisp variable `org-emphasis-regexp-components` +-- for details). + +-- | Chars allowed to occur before emphasis (spaces and newlines are ok, too) +emphasisPreChars :: [Char] +emphasisPreChars = "\t \"'({" + +-- | Chars allowed at after emphasis +emphasisPostChars :: [Char] +emphasisPostChars = "\t\n !\"'),-.:;?\\}" + +-- | Chars not allowed at the (inner) border of emphasis +emphasisForbiddenBorderChars :: [Char] +emphasisForbiddenBorderChars = "\t\n\r \"'," + +-- | The maximum number of newlines within +emphasisAllowedNewlines :: Int +emphasisAllowedNewlines = 1 + +-- LaTeX-style math: see `org-latex-regexps` for details + +-- | Chars allowed after an inline ($...$) math statement +mathPostChars :: [Char] +mathPostChars = "\t\n \"'),-.:;?" + +-- | Chars not allowed at the (inner) border of math +mathForbiddenBorderChars :: [Char] +mathForbiddenBorderChars = "\t\n\r ,;.$" + +-- | Maximum number of newlines in an inline math statement +mathAllowedNewlines :: Int +mathAllowedNewlines = 2 + +-- | Whether we are right behind a char allowed before emphasis +afterEmphasisPreChar :: OrgParser Bool +afterEmphasisPreChar = do + pos <- getPosition + lastPrePos <- orgStateLastPreCharPos <$> getState + return . fromMaybe True $ (== pos) <$> lastPrePos + +-- | Whether the parser is right after a forbidden border char +notAfterForbiddenBorderChar :: OrgParser Bool +notAfterForbiddenBorderChar = do + pos <- getPosition + lastFBCPos <- orgStateLastForbiddenCharPos <$> getState + return $ lastFBCPos /= Just pos + +-- | Read a sub- or superscript expression +subOrSuperExpr :: OrgParser (F Inlines) +subOrSuperExpr = try $ + choice [ id <$> charsInBalanced '{' '}' (noneOf "\n\r") + , enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r") + , simpleSubOrSuperString + ] >>= parseFromString (mconcat <$> many inline) + where enclosing (left, right) s = left : s ++ [right] + +simpleSubOrSuperString :: OrgParser String +simpleSubOrSuperString = try $ do + state <- getState + guard . exportSubSuperscripts . orgStateExportSettings $ state + choice [ string "*" + , mappend <$> option [] ((:[]) <$> oneOf "+-") + <*> many1 alphaNum + ] + +inlineLaTeX :: OrgParser (F Inlines) +inlineLaTeX = try $ do + cmd <- inlineLaTeXCommand + maybe mzero returnF $ + parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` parseAsInlineLaTeX cmd + where + parseAsMath :: String -> Maybe Inlines + parseAsMath cs = B.fromList <$> texMathToPandoc cs + + parseAsInlineLaTeX :: String -> Maybe Inlines + parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs + + parseAsMathMLSym :: String -> Maybe Inlines + parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs) + -- drop initial backslash and any trailing "{}" + where clean = dropWhileEnd (`elem` ("{}" :: String)) . drop 1 + + state :: ParserState + state = def{ stateOptions = def{ readerParseRaw = True }} + + texMathToPandoc :: String -> Maybe [Inline] + texMathToPandoc cs = (maybeRight $ readTeX cs) >>= writePandoc DisplayInline + +maybeRight :: Either a b -> Maybe b +maybeRight = either (const Nothing) Just + +inlineLaTeXCommand :: OrgParser String +inlineLaTeXCommand = try $ do + rest <- getInput + case runParser rawLaTeXInline def "source" rest of + Right (RawInline _ cs) -> do + -- drop any trailing whitespace, those are not be part of the command as + -- far as org mode is concerned. + let cmdNoSpc = dropWhileEnd isSpace cs + let len = length cmdNoSpc + count len anyChar + return cmdNoSpc + _ -> mzero + +-- Taken from Data.OldList. +dropWhileEnd :: (a -> Bool) -> [a] -> [a] +dropWhileEnd p = foldr (\x xs -> if p x && null xs then [] else x : xs) [] + +smart :: OrgParser (F Inlines) +smart = do + getOption readerSmart >>= guard + doubleQuoted <|> singleQuoted <|> + choice (map (return <$>) [orgApostrophe, orgDash, orgEllipses]) + where + orgDash = dash <* updatePositions '-' + orgEllipses = ellipses <* updatePositions '.' + orgApostrophe = + (char '\'' <|> char '\8217') <* updateLastPreCharPos + <* updateLastForbiddenCharPos + *> return (B.str "\x2019") + +singleQuoted :: OrgParser (F Inlines) +singleQuoted = try $ do + singleQuoteStart + updatePositions '\'' + withQuoteContext InSingleQuote $ + fmap B.singleQuoted . trimInlinesF . mconcat <$> + many1Till inline (singleQuoteEnd <* updatePositions '\'') + +-- doubleQuoted will handle regular double-quoted sections, as well +-- as dialogues with an open double-quote without a close double-quote +-- in the same paragraph. +doubleQuoted :: OrgParser (F Inlines) +doubleQuoted = try $ do + doubleQuoteStart + updatePositions '"' + contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline) + (withQuoteContext InDoubleQuote $ (doubleQuoteEnd <* updateLastForbiddenCharPos) >> return + (fmap B.doubleQuoted . trimInlinesF $ contents)) + <|> (return $ return (B.str "\8220") <> contents) diff --git a/src/Text/Pandoc/Readers/Org/Parsing.hs b/src/Text/Pandoc/Readers/Org/Parsing.hs index efe2ae25f..9a1420645 100644 --- a/src/Text/Pandoc/Readers/Org/Parsing.hs +++ b/src/Text/Pandoc/Readers/Org/Parsing.hs @@ -34,10 +34,14 @@ module Text.Pandoc.Readers.Org.Parsing , blanklines , newline , parseFromString + , skipSpaces1 , inList , withContext , updateLastForbiddenCharPos , updateLastPreCharPos + , orgArgKey + , orgArgWord + , orgArgWordChar -- * Re-exports from Text.Pandoc.Parser , ParserContext (..) , many1Till @@ -133,6 +137,10 @@ parseFromString parser str' = do updateState $ \s -> s{ orgStateLastPreCharPos = oldLastPreCharPos } return result +-- | Skip one or more tab or space characters. +skipSpaces1 :: OrgParser () +skipSpaces1 = skipMany1 spaceChar + -- | Like @Text.Parsec.Char.newline@, but causes additional state changes. newline :: OrgParser Char newline = @@ -180,3 +188,14 @@ updateLastForbiddenCharPos = getPosition >>= \p -> updateLastPreCharPos :: OrgParser () updateLastPreCharPos = getPosition >>= \p -> updateState $ \s -> s{ orgStateLastPreCharPos = Just p} + +orgArgKey :: OrgParser String +orgArgKey = try $ + skipSpaces *> char ':' + *> many1 orgArgWordChar + +orgArgWord :: OrgParser String +orgArgWord = many1 orgArgWordChar + +orgArgWordChar :: OrgParser Char +orgArgWordChar = alphaNum <|> oneOf "-_" -- cgit v1.2.3 From eea6d6568f99eda689b93210a22692c7f79b4bbf Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Wed, 25 May 2016 22:50:51 +0200 Subject: Org reader: extract blocks parser to module Block parsing code is moved to a separate module. This is part of the Org-mode reader cleanup effort. --- pandoc.cabal | 1 + src/Text/Pandoc/Readers/Org.hs | 853 +------------------------------- src/Text/Pandoc/Readers/Org/Blocks.hs | 891 ++++++++++++++++++++++++++++++++++ 3 files changed, 901 insertions(+), 844 deletions(-) create mode 100644 src/Text/Pandoc/Readers/Org/Blocks.hs (limited to 'src/Text') diff --git a/pandoc.cabal b/pandoc.cabal index 7286bd890..7d7250a21 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -394,6 +394,7 @@ Library Text.Pandoc.Readers.Odt.Arrows.State, Text.Pandoc.Readers.Odt.Arrows.Utils, Text.Pandoc.Readers.Org.BlockStarts, + Text.Pandoc.Readers.Org.Blocks, Text.Pandoc.Readers.Org.Inlines, Text.Pandoc.Readers.Org.ParserState, Text.Pandoc.Readers.Org.Parsing, diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index 605d2220e..1042b5a21 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -1,4 +1,3 @@ -{-# LANGUAGE FlexibleContexts #-} {- Copyright (C) 2014-2016 Albert Krewinkel @@ -28,27 +27,15 @@ Conversion of org-mode formatted plain text to 'Pandoc' document. -} module Text.Pandoc.Readers.Org ( readOrg ) where -import Text.Pandoc.Readers.Org.BlockStarts -import Text.Pandoc.Readers.Org.Inlines -import Text.Pandoc.Readers.Org.ParserState -import Text.Pandoc.Readers.Org.Parsing +import Text.Pandoc.Readers.Org.Blocks ( blockList, meta ) +import Text.Pandoc.Readers.Org.Parsing ( OrgParser, readWithM ) +import Text.Pandoc.Readers.Org.ParserState ( OrgParserState (..) ) -import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Builder ( Inlines, Blocks ) import Text.Pandoc.Definition -import Text.Pandoc.Compat.Monoid ((<>)) import Text.Pandoc.Error import Text.Pandoc.Options -import Text.Pandoc.Shared ( compactify', compactify'DL ) -import Control.Arrow ( first ) -import Control.Monad ( foldM, guard, mzero ) import Control.Monad.Reader ( runReader ) -import Data.Char ( toLower, toUpper) -import Data.List ( foldl', intersperse, isPrefixOf ) -import qualified Data.Map as M -import Data.Maybe ( fromMaybe, isNothing ) -import Network.HTTP ( urlEncode ) -- | Parse org-mode string and return a Pandoc document. @@ -57,106 +44,17 @@ readOrg :: ReaderOptions -- ^ Reader options -> Either PandocError Pandoc readOrg opts s = flip runReader def $ readWithM parseOrg def{ orgStateOptions = opts } (s ++ "\n\n") --- --- Export Settings --- -exportSetting :: OrgParser () -exportSetting = choice - [ booleanSetting "^" setExportSubSuperscripts - , ignoredSetting "'" - , ignoredSetting "*" - , ignoredSetting "-" - , ignoredSetting ":" - , ignoredSetting "<" - , ignoredSetting "\\n" - , ignoredSetting "arch" - , ignoredSetting "author" - , ignoredSetting "c" - , ignoredSetting "creator" - , complementableListSetting "d" setExportDrawers - , ignoredSetting "date" - , ignoredSetting "e" - , ignoredSetting "email" - , ignoredSetting "f" - , ignoredSetting "H" - , ignoredSetting "inline" - , ignoredSetting "num" - , ignoredSetting "p" - , ignoredSetting "pri" - , ignoredSetting "prop" - , ignoredSetting "stat" - , ignoredSetting "tags" - , ignoredSetting "tasks" - , ignoredSetting "tex" - , ignoredSetting "timestamp" - , ignoredSetting "title" - , ignoredSetting "toc" - , ignoredSetting "todo" - , ignoredSetting "|" - ] "export setting" - -booleanSetting :: String -> ExportSettingSetter Bool -> OrgParser () -booleanSetting settingIdentifier setter = try $ do - string settingIdentifier - char ':' - value <- elispBoolean - updateState $ modifyExportSettings setter value - --- | Read an elisp boolean. Only NIL is treated as false, non-NIL values are --- interpreted as true. -elispBoolean :: OrgParser Bool -elispBoolean = try $ do - value <- many1 nonspaceChar - return $ case map toLower value of - "nil" -> False - "{}" -> False - "()" -> False - _ -> True - --- | A list or a complement list (i.e. a list starting with `not`). -complementableListSetting :: String - -> ExportSettingSetter (Either [String] [String]) - -> OrgParser () -complementableListSetting settingIdentifier setter = try $ do - _ <- string settingIdentifier <* char ':' - value <- choice [ Left <$> complementStringList - , Right <$> stringList - , (\b -> if b then Left [] else Right []) <$> elispBoolean - ] - updateState $ modifyExportSettings setter value - where - -- Read a plain list of strings. - stringList :: OrgParser [String] - stringList = try $ - char '(' - *> sepBy elispString spaces - <* char ')' - - -- Read an emacs lisp list specifying a complement set. - complementStringList :: OrgParser [String] - complementStringList = try $ - string "(not " - *> sepBy elispString spaces - <* char ')' - - elispString :: OrgParser String - elispString = try $ - char '"' - *> manyTill alphaNum (char '"') - -ignoredSetting :: String -> OrgParser () -ignoredSetting s = try (() <$ string s <* char ':' <* many1 nonspaceChar) - -- -- Parser -- parseOrg :: OrgParser Pandoc parseOrg = do - blocks' <- parseBlocks - st <- getState - let meta = runF (orgStateMeta' st) st - let removeUnwantedBlocks = dropCommentTrees . filter (/= Null) - return $ Pandoc meta $ removeUnwantedBlocks (B.toList $ runF blocks' st) + blocks' <- blockList + meta' <- meta + return . Pandoc meta' $ removeUnwantedBlocks blocks' + where + removeUnwantedBlocks :: [Block] -> [Block] + removeUnwantedBlocks = dropCommentTrees . filter (/= Null) -- | Drop COMMENT headers and the document tree below those headers. dropCommentTrees :: [Block] -> [Block] @@ -191,736 +89,3 @@ isHeaderLevelLowerEq n blk = case blk of (Header level _ _) -> n >= level _ -> False - - --- --- parsing blocks --- - -parseBlocks :: OrgParser (F Blocks) -parseBlocks = mconcat <$> manyTill block eof - -block :: OrgParser (F Blocks) -block = choice [ mempty <$ blanklines - , table - , orgBlock - , figure - , example - , genericDrawer - , specialLine - , header - , horizontalRule - , list - , latexFragment - , noteBlock - , paraOrPlain - ] "block" - - --- --- Block Attributes --- - --- | Attributes that may be added to figures (like a name or caption). -data BlockAttributes = BlockAttributes - { blockAttrName :: Maybe String - , blockAttrCaption :: Maybe (F Inlines) - , blockAttrKeyValues :: [(String, String)] - } - -stringyMetaAttribute :: (String -> Bool) -> OrgParser (String, String) -stringyMetaAttribute attrCheck = try $ do - metaLineStart - attrName <- map toUpper <$> many1Till nonspaceChar (char ':') - guard $ attrCheck attrName - skipSpaces - attrValue <- anyLine - return (attrName, attrValue) - -blockAttributes :: OrgParser BlockAttributes -blockAttributes = try $ do - kv <- many (stringyMetaAttribute attrCheck) - let caption = foldl' (appendValues "CAPTION") Nothing kv - let kvAttrs = foldl' (appendValues "ATTR_HTML") Nothing kv - let name = lookup "NAME" kv - caption' <- maybe (return Nothing) - (fmap Just . parseFromString parseInlines) - caption - kvAttrs' <- parseFromString keyValues . (++ "\n") $ fromMaybe mempty kvAttrs - return $ BlockAttributes - { blockAttrName = name - , blockAttrCaption = caption' - , blockAttrKeyValues = kvAttrs' - } - where - attrCheck :: String -> Bool - attrCheck attr = - case attr of - "NAME" -> True - "CAPTION" -> True - "ATTR_HTML" -> True - _ -> False - - appendValues :: String -> Maybe String -> (String, String) -> Maybe String - appendValues attrName accValue (key, value) = - if key /= attrName - then accValue - else case accValue of - Just acc -> Just $ acc ++ ' ':value - Nothing -> Just value - -keyValues :: OrgParser [(String, String)] -keyValues = try $ - manyTill ((,) <$> key <*> value) newline - where - key :: OrgParser String - key = try $ skipSpaces *> char ':' *> many1 nonspaceChar - - value :: OrgParser String - value = skipSpaces *> manyTill anyChar endOfValue - - endOfValue :: OrgParser () - endOfValue = - lookAhead $ (() <$ try (many1 spaceChar <* key)) - <|> () <$ newline - - --- --- Org Blocks (#+BEGIN_... / #+END_...) --- - -type BlockProperties = (Int, String) -- (Indentation, Block-Type) - -updateIndent :: BlockProperties -> Int -> BlockProperties -updateIndent (_, blkType) indent = (indent, blkType) - -orgBlock :: OrgParser (F Blocks) -orgBlock = try $ do - blockAttrs <- blockAttributes - blockProp@(_, blkType) <- blockHeaderStart - ($ blockProp) $ - case blkType of - "comment" -> withRaw' (const mempty) - "html" -> withRaw' (return . (B.rawBlock blkType)) - "latex" -> withRaw' (return . (B.rawBlock blkType)) - "ascii" -> withRaw' (return . (B.rawBlock blkType)) - "example" -> withRaw' (return . exampleCode) - "quote" -> withParsed (fmap B.blockQuote) - "verse" -> verseBlock - "src" -> codeBlock blockAttrs - _ -> withParsed (fmap $ divWithClass blkType) - -blockHeaderStart :: OrgParser (Int, String) -blockHeaderStart = try $ (,) <$> indentation <*> blockType - where - blockType = map toLower <$> (stringAnyCase "#+begin_" *> orgArgWord) - -indentation :: OrgParser Int -indentation = try $ do - tabStop <- getOption readerTabStop - s <- many spaceChar - return $ spaceLength tabStop s - -spaceLength :: Int -> String -> Int -spaceLength tabStop s = (sum . map charLen) s - where - charLen ' ' = 1 - charLen '\t' = tabStop - charLen _ = 0 - -withRaw' :: (String -> F Blocks) -> BlockProperties -> OrgParser (F Blocks) -withRaw' f blockProp = (ignHeaders *> (f <$> rawBlockContent blockProp)) - -withParsed :: (F Blocks -> F Blocks) -> BlockProperties -> OrgParser (F Blocks) -withParsed f blockProp = (ignHeaders *> (f <$> parsedBlockContent blockProp)) - -ignHeaders :: OrgParser () -ignHeaders = (() <$ newline) <|> (() <$ anyLine) - -divWithClass :: String -> Blocks -> Blocks -divWithClass cls = B.divWith ("", [cls], []) - -verseBlock :: BlockProperties -> OrgParser (F Blocks) -verseBlock blkProp = try $ do - ignHeaders - content <- rawBlockContent blkProp - fmap B.para . mconcat . intersperse (pure B.linebreak) - <$> mapM (parseFromString parseInlines) (map (++ "\n") . lines $ content) - -exportsCode :: [(String, String)] -> Bool -exportsCode attrs = not (("rundoc-exports", "none") `elem` attrs - || ("rundoc-exports", "results") `elem` attrs) - -exportsResults :: [(String, String)] -> Bool -exportsResults attrs = ("rundoc-exports", "results") `elem` attrs - || ("rundoc-exports", "both") `elem` attrs - -followingResultsBlock :: OrgParser (Maybe (F Blocks)) -followingResultsBlock = - optionMaybe (try $ blanklines *> stringAnyCase "#+RESULTS:" - *> blankline - *> block) - -codeBlock :: BlockAttributes -> BlockProperties -> OrgParser (F Blocks) -codeBlock blockAttrs blkProp = do - skipSpaces - (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) - leadingIndent <- lookAhead indentation - content <- rawBlockContent (updateIndent blkProp leadingIndent) - resultsContent <- followingResultsBlock - let id' = fromMaybe mempty $ blockAttrName blockAttrs - let includeCode = exportsCode kv - let includeResults = exportsResults kv - let codeBlck = B.codeBlockWith ( id', classes, kv ) content - let labelledBlck = maybe (pure codeBlck) - (labelDiv codeBlck) - (blockAttrCaption blockAttrs) - let resultBlck = fromMaybe mempty resultsContent - return $ (if includeCode then labelledBlck else mempty) - <> (if includeResults then resultBlck else mempty) - where - labelDiv blk value = - B.divWith nullAttr <$> (mappend <$> labelledBlock value - <*> pure blk) - labelledBlock = fmap (B.plain . B.spanWith ("", ["label"], [])) - -rawBlockContent :: BlockProperties -> OrgParser String -rawBlockContent (indent, blockType) = try $ - unlines . map commaEscaped <$> manyTill indentedLine blockEnder - where - indentedLine = try $ ("" <$ blankline) <|> (indentWith indent *> anyLine) - blockEnder = try $ skipSpaces *> stringAnyCase ("#+end_" <> blockType) - -parsedBlockContent :: BlockProperties -> OrgParser (F Blocks) -parsedBlockContent blkProps = try $ do - raw <- rawBlockContent blkProps - parseFromString parseBlocks (raw ++ "\n") - --- indent by specified number of spaces (or equiv. tabs) -indentWith :: Int -> OrgParser String -indentWith num = do - tabStop <- getOption readerTabStop - if num < tabStop - then count num (char ' ') - else choice [ try (count num (char ' ')) - , try (char '\t' >> count (num - tabStop) (char ' ')) ] - -type SwitchOption = (Char, Maybe String) - --- | Parse code block arguments --- TODO: We currently don't handle switches. -codeHeaderArgs :: OrgParser ([String], [(String, String)]) -codeHeaderArgs = try $ do - language <- skipSpaces *> orgArgWord - _ <- skipSpaces *> (try $ switch `sepBy` (many1 spaceChar)) - parameters <- manyTill blockOption newline - let pandocLang = translateLang language - return $ - if hasRundocParameters parameters - then ( [ pandocLang, rundocBlockClass ] - , map toRundocAttrib (("language", language) : parameters) - ) - else ([ pandocLang ], parameters) - where - hasRundocParameters = not . null - toRundocAttrib = first ("rundoc-" ++) - - -switch :: OrgParser SwitchOption -switch = try $ simpleSwitch <|> lineNumbersSwitch - where - simpleSwitch = (\c -> (c, Nothing)) <$> (oneOf "-+" *> letter) - lineNumbersSwitch = (\ls -> ('l', Just ls)) <$> - (string "-l \"" *> many1Till nonspaceChar (char '"')) - -translateLang :: String -> String -translateLang "C" = "c" -translateLang "C++" = "cpp" -translateLang "emacs-lisp" = "commonlisp" -- emacs lisp is not supported -translateLang "js" = "javascript" -translateLang "lisp" = "commonlisp" -translateLang "R" = "r" -translateLang "sh" = "bash" -translateLang "sqlite" = "sql" -translateLang cs = cs - --- | Prefix used for Rundoc classes and arguments. -rundocPrefix :: String -rundocPrefix = "rundoc-" - --- | The class-name used to mark rundoc blocks. -rundocBlockClass :: String -rundocBlockClass = rundocPrefix ++ "block" - -blockOption :: OrgParser (String, String) -blockOption = try $ do - argKey <- orgArgKey - paramValue <- option "yes" orgParamValue - return (argKey, paramValue) - -orgParamValue :: OrgParser String -orgParamValue = try $ - skipSpaces - *> notFollowedBy (char ':' ) - *> many1 (noneOf "\t\n\r ") - <* skipSpaces - -commaEscaped :: String -> String -commaEscaped (',':cs@('*':_)) = cs -commaEscaped (',':cs@('#':'+':_)) = cs -commaEscaped cs = cs - -example :: OrgParser (F Blocks) -example = try $ do - return . return . exampleCode =<< unlines <$> many1 exampleLine - -exampleCode :: String -> Blocks -exampleCode = B.codeBlockWith ("", ["example"], []) - -exampleLine :: OrgParser String -exampleLine = try $ exampleLineStart *> anyLine - -horizontalRule :: OrgParser (F Blocks) -horizontalRule = return B.horizontalRule <$ try hline - - --- --- Drawers --- - --- | A generic drawer which has no special meaning for org-mode. --- Whether or not this drawer is included in the output depends on the drawers --- export setting. -genericDrawer :: OrgParser (F Blocks) -genericDrawer = try $ do - name <- map toUpper <$> drawerStart - content <- manyTill drawerLine (try drawerEnd) - state <- getState - -- Include drawer if it is explicitly included in or not explicitly excluded - -- from the list of drawers that should be exported. PROPERTIES drawers are - -- never exported. - case (exportDrawers . orgStateExportSettings $ state) of - _ | name == "PROPERTIES" -> return mempty - Left names | name `elem` names -> return mempty - Right names | name `notElem` names -> return mempty - _ -> drawerDiv name <$> parseLines content - where - parseLines :: [String] -> OrgParser (F Blocks) - parseLines = parseFromString parseBlocks . (++ "\n") . unlines - - drawerDiv :: String -> F Blocks -> F Blocks - drawerDiv drawerName = fmap $ B.divWith (mempty, [drawerName, "drawer"], mempty) - -drawerLine :: OrgParser String -drawerLine = anyLine - -drawerEnd :: OrgParser String -drawerEnd = try $ - skipSpaces *> stringAnyCase ":END:" <* skipSpaces <* newline - --- | Read a :PROPERTIES: drawer and return the key/value pairs contained --- within. -propertiesDrawer :: OrgParser [(String, String)] -propertiesDrawer = try $ do - drawerType <- drawerStart - guard $ map toUpper drawerType == "PROPERTIES" - manyTill property (try drawerEnd) - where - property :: OrgParser (String, String) - property = try $ (,) <$> key <*> value - - key :: OrgParser String - key = try $ skipSpaces *> char ':' *> many1Till nonspaceChar (char ':') - - value :: OrgParser String - value = try $ skipSpaces *> manyTill anyChar (try $ skipSpaces *> newline) - -keyValuesToAttr :: [(String, String)] -> Attr -keyValuesToAttr kvs = - let - lowerKvs = map (\(k, v) -> (map toLower k, v)) kvs - id' = fromMaybe mempty . lookup "custom_id" $ lowerKvs - cls = fromMaybe mempty . lookup "class" $ lowerKvs - kvs' = filter (flip notElem ["custom_id", "class"] . fst) lowerKvs - in - (id', words cls, kvs') - - --- --- Figures --- - --- | Figures (Image on a line by itself, preceded by name and/or caption) -figure :: OrgParser (F Blocks) -figure = try $ do - figAttrs <- blockAttributes - src <- skipSpaces *> selfTarget <* skipSpaces <* newline - guard . not . isNothing . blockAttrCaption $ figAttrs - guard (isImageFilename src) - let figName = fromMaybe mempty $ blockAttrName figAttrs - let figCaption = fromMaybe mempty $ blockAttrCaption figAttrs - let figKeyVals = blockAttrKeyValues figAttrs - let attr = (mempty, mempty, figKeyVals) - return $ (B.para . B.imageWith attr src (withFigPrefix figName) <$> figCaption) - where - withFigPrefix :: String -> String - withFigPrefix cs = - if "fig:" `isPrefixOf` cs - then cs - else "fig:" ++ cs - - selfTarget :: OrgParser String - selfTarget = try $ char '[' *> linkTarget <* char ']' - - --- --- Comments, Options and Metadata --- - -addLinkFormat :: String - -> (String -> String) - -> OrgParser () -addLinkFormat key formatter = updateState $ \s -> - let fs = orgStateLinkFormatters s - in s{ orgStateLinkFormatters = M.insert key formatter fs } - -specialLine :: OrgParser (F Blocks) -specialLine = fmap return . try $ metaLine <|> commentLine - --- The order, in which blocks are tried, makes sure that we're not looking at --- the beginning of a block, so we don't need to check for it -metaLine :: OrgParser Blocks -metaLine = mempty <$ metaLineStart <* (optionLine <|> declarationLine) - -commentLine :: OrgParser Blocks -commentLine = commentLineStart *> anyLine *> pure mempty - -declarationLine :: OrgParser () -declarationLine = try $ do - key <- metaKey - inlinesF <- metaInlines - updateState $ \st -> - let meta' = B.setMeta <$> pure key <*> inlinesF <*> pure nullMeta - in st { orgStateMeta' = orgStateMeta' st <> meta' } - return () - -metaInlines :: OrgParser (F MetaValue) -metaInlines = fmap (MetaInlines . B.toList) <$> inlinesTillNewline - -metaKey :: OrgParser String -metaKey = map toLower <$> many1 (noneOf ": \n\r") - <* char ':' - <* skipSpaces - -optionLine :: OrgParser () -optionLine = try $ do - key <- metaKey - case key of - "link" -> parseLinkFormat >>= uncurry addLinkFormat - "options" -> () <$ sepBy spaces exportSetting - _ -> mzero - -parseLinkFormat :: OrgParser ((String, String -> String)) -parseLinkFormat = try $ do - linkType <- (:) <$> letter <*> many (alphaNum <|> oneOf "-_") <* skipSpaces - linkSubst <- parseFormat - return (linkType, linkSubst) - --- | An ad-hoc, single-argument-only implementation of a printf-style format --- parser. -parseFormat :: OrgParser (String -> String) -parseFormat = try $ do - replacePlain <|> replaceUrl <|> justAppend - where - -- inefficient, but who cares - replacePlain = try $ (\x -> concat . flip intersperse x) - <$> sequence [tillSpecifier 's', rest] - replaceUrl = try $ (\x -> concat . flip intersperse x . urlEncode) - <$> sequence [tillSpecifier 'h', rest] - justAppend = try $ (++) <$> rest - - rest = manyTill anyChar (eof <|> () <$ oneOf "\n\r") - tillSpecifier c = manyTill (noneOf "\n\r") (try $ string ('%':c:"")) - --- --- Headers --- - --- | Headers -header :: OrgParser (F Blocks) -header = try $ do - level <- headerStart - title <- manyTill inline (lookAhead $ optional headerTags <* newline) - tags <- option [] headerTags - newline - propAttr <- option nullAttr (keyValuesToAttr <$> propertiesDrawer) - inlines <- runF (tagTitle title tags) <$> getState - attr <- registerHeader propAttr inlines - return $ pure (B.headerWith attr level inlines) - where - tagTitle :: [F Inlines] -> [String] -> F Inlines - tagTitle title tags = trimInlinesF . mconcat $ title <> map tagToInlineF tags - - tagToInlineF :: String -> F Inlines - tagToInlineF t = return $ B.spanWith ("", ["tag"], [("data-tag-name", t)]) mempty - - headerTags :: OrgParser [String] - headerTags = try $ - let tag = many1 (alphaNum <|> oneOf "@%#_") <* char ':' - in skipSpaces - *> char ':' - *> many1 tag - <* skipSpaces - - --- --- Tables --- - -data OrgTableRow = OrgContentRow (F [Blocks]) - | OrgAlignRow [Alignment] - | OrgHlineRow - --- OrgTable is strongly related to the pandoc table ADT. Using the same --- (i.e. pandoc-global) ADT would mean that the reader would break if the --- global structure was to be changed, which would be bad. The final table --- should be generated using a builder function. Column widths aren't --- implemented yet, so they are not tracked here. -data OrgTable = OrgTable - { orgTableAlignments :: [Alignment] - , orgTableHeader :: [Blocks] - , orgTableRows :: [[Blocks]] - } - -table :: OrgParser (F Blocks) -table = try $ do - blockAttrs <- blockAttributes - lookAhead tableStart - do - rows <- tableRows - let caption = fromMaybe (return mempty) $ blockAttrCaption blockAttrs - return $ (<$> caption) . orgToPandocTable . normalizeTable =<< rowsToTable rows - -orgToPandocTable :: OrgTable - -> Inlines - -> Blocks -orgToPandocTable (OrgTable aligns heads lns) caption = - B.table caption (zip aligns $ repeat 0) heads lns - -tableRows :: OrgParser [OrgTableRow] -tableRows = try $ many (tableAlignRow <|> tableHline <|> tableContentRow) - -tableContentRow :: OrgParser OrgTableRow -tableContentRow = try $ - OrgContentRow . sequence <$> (tableStart *> many1Till tableContentCell newline) - -tableContentCell :: OrgParser (F Blocks) -tableContentCell = try $ - fmap B.plain . trimInlinesF . mconcat <$> manyTill inline endOfCell - -tableAlignRow :: OrgParser OrgTableRow -tableAlignRow = try $ do - tableStart - cells <- many1Till tableAlignCell newline - -- Empty rows are regular (i.e. content) rows, not alignment rows. - guard $ any (/= AlignDefault) cells - return $ OrgAlignRow cells - -tableAlignCell :: OrgParser Alignment -tableAlignCell = - choice [ try $ emptyCell *> return AlignDefault - , try $ skipSpaces - *> char '<' - *> tableAlignFromChar - <* many digit - <* char '>' - <* emptyCell - ] "alignment info" - where emptyCell = try $ skipSpaces *> endOfCell - -tableAlignFromChar :: OrgParser Alignment -tableAlignFromChar = try $ - choice [ char 'l' *> return AlignLeft - , char 'c' *> return AlignCenter - , char 'r' *> return AlignRight - ] - -tableHline :: OrgParser OrgTableRow -tableHline = try $ - OrgHlineRow <$ (tableStart *> char '-' *> anyLine) - -endOfCell :: OrgParser Char -endOfCell = try $ char '|' <|> lookAhead newline - -rowsToTable :: [OrgTableRow] - -> F OrgTable -rowsToTable = foldM rowToContent emptyTable - where emptyTable = OrgTable mempty mempty mempty - -normalizeTable :: OrgTable -> OrgTable -normalizeTable (OrgTable aligns heads rows) = OrgTable aligns' heads rows - where - refRow = if heads /= mempty - then heads - else if rows == mempty then mempty else head rows - cols = length refRow - fillColumns base padding = take cols $ base ++ repeat padding - aligns' = fillColumns aligns AlignDefault - --- One or more horizontal rules after the first content line mark the previous --- line as a header. All other horizontal lines are discarded. -rowToContent :: OrgTable - -> OrgTableRow - -> F OrgTable -rowToContent orgTable row = - case row of - OrgHlineRow -> return singleRowPromotedToHeader - OrgAlignRow as -> return . setAligns $ as - OrgContentRow cs -> appendToBody cs - where - singleRowPromotedToHeader :: OrgTable - singleRowPromotedToHeader = case orgTable of - OrgTable{ orgTableHeader = [], orgTableRows = b:[] } -> - orgTable{ orgTableHeader = b , orgTableRows = [] } - _ -> orgTable - - setAligns :: [Alignment] -> OrgTable - setAligns aligns = orgTable{ orgTableAlignments = aligns } - - appendToBody :: F [Blocks] -> F OrgTable - appendToBody frow = do - newRow <- frow - let oldRows = orgTableRows orgTable - -- NOTE: This is an inefficient O(n) operation. This should be changed - -- if performance ever becomes a problem. - return orgTable{ orgTableRows = oldRows ++ [newRow] } - - --- --- LaTeX fragments --- -latexFragment :: OrgParser (F Blocks) -latexFragment = try $ do - envName <- latexEnvStart - content <- mconcat <$> manyTill anyLineNewline (latexEnd envName) - return . return $ B.rawBlock "latex" (content `inLatexEnv` envName) - where - c `inLatexEnv` e = mconcat [ "\\begin{", e, "}\n" - , c - , "\\end{", e, "}\n" - ] - -latexEnd :: String -> OrgParser () -latexEnd envName = try $ - () <$ skipSpaces - <* string ("\\end{" ++ envName ++ "}") - <* blankline - - --- --- Footnote defintions --- -noteBlock :: OrgParser (F Blocks) -noteBlock = try $ do - ref <- noteMarker <* skipSpaces - content <- mconcat <$> blocksTillHeaderOrNote - addToNotesTable (ref, content) - return mempty - where - blocksTillHeaderOrNote = - many1Till block (eof <|> () <$ lookAhead noteMarker - <|> () <$ lookAhead headerStart) - --- Paragraphs or Plain text -paraOrPlain :: OrgParser (F Blocks) -paraOrPlain = try $ do - ils <- parseInlines - nl <- option False (newline *> return True) - -- Read block as paragraph, except if we are in a list context and the block - -- is directly followed by a list item, in which case the block is read as - -- plain text. - try (guard nl - *> notFollowedBy (inList *> (() <$ orderedListStart <|> bulletListStart)) - *> return (B.para <$> ils)) - <|> (return (B.plain <$> ils)) - -inlinesTillNewline :: OrgParser (F Inlines) -inlinesTillNewline = trimInlinesF . mconcat <$> manyTill inline newline - - --- --- list blocks --- - -list :: OrgParser (F Blocks) -list = choice [ definitionList, bulletList, orderedList ] "list" - -definitionList :: OrgParser (F Blocks) -definitionList = try $ do n <- lookAhead (bulletListStart' Nothing) - fmap B.definitionList . fmap compactify'DL . sequence - <$> many1 (definitionListItem $ bulletListStart' (Just n)) - -bulletList :: OrgParser (F Blocks) -bulletList = try $ do n <- lookAhead (bulletListStart' Nothing) - fmap B.bulletList . fmap compactify' . sequence - <$> many1 (listItem (bulletListStart' $ Just n)) - -orderedList :: OrgParser (F Blocks) -orderedList = fmap B.orderedList . fmap compactify' . sequence - <$> many1 (listItem orderedListStart) - -bulletListStart' :: Maybe Int -> OrgParser Int --- returns length of bulletList prefix, inclusive of marker -bulletListStart' Nothing = do ind <- length <$> many spaceChar - oneOf (bullets $ ind == 0) - skipSpaces1 - return (ind + 1) -bulletListStart' (Just n) = do count (n-1) spaceChar - oneOf (bullets $ n == 1) - many1 spaceChar - return n - --- Unindented lists are legal, but they can't use '*' bullets. --- We return n to maintain compatibility with the generic listItem. -bullets :: Bool -> String -bullets unindented = if unindented then "+-" else "*+-" - -definitionListItem :: OrgParser Int - -> OrgParser (F (Inlines, [Blocks])) -definitionListItem parseMarkerGetLength = try $ do - markerLength <- parseMarkerGetLength - term <- manyTill (noneOf "\n\r") (try definitionMarker) - line1 <- anyLineNewline - blank <- option "" ("\n" <$ blankline) - cont <- concat <$> many (listContinuation markerLength) - term' <- parseFromString parseInlines term - contents' <- parseFromString parseBlocks $ line1 ++ blank ++ cont - return $ (,) <$> term' <*> fmap (:[]) contents' - where - definitionMarker = - spaceChar *> string "::" <* (spaceChar <|> lookAhead newline) - - --- parse raw text for one list item, excluding start marker and continuations -listItem :: OrgParser Int - -> OrgParser (F Blocks) -listItem start = try . withContext ListItemState $ do - markerLength <- try start - firstLine <- anyLineNewline - blank <- option "" ("\n" <$ blankline) - rest <- concat <$> many (listContinuation markerLength) - parseFromString parseBlocks $ firstLine ++ blank ++ rest - --- continuation of a list item - indented and separated by blankline or endline. --- Note: nested lists are parsed as continuations. -listContinuation :: Int - -> OrgParser String -listContinuation markerLength = try $ - notFollowedBy' blankline - *> (mappend <$> (concat <$> many1 listLine) - <*> many blankline) - where listLine = try $ indentWith markerLength *> anyLineNewline - --- | Parse any line, include the final newline in the output. -anyLineNewline :: OrgParser String -anyLineNewline = (++ "\n") <$> anyLine diff --git a/src/Text/Pandoc/Readers/Org/Blocks.hs b/src/Text/Pandoc/Readers/Org/Blocks.hs new file mode 100644 index 000000000..b374acfe2 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org/Blocks.hs @@ -0,0 +1,891 @@ +{-# LANGUAGE FlexibleContexts #-} +{- +Copyright (C) 2014-2016 Albert Krewinkel + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org.Options + Copyright : Copyright (C) 2014-2016 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel + +Parsers for Org-mode block elements. +-} +module Text.Pandoc.Readers.Org.Blocks + ( blockList + , meta + ) where + +import Text.Pandoc.Readers.Org.BlockStarts +import Text.Pandoc.Readers.Org.Inlines +import Text.Pandoc.Readers.Org.ParserState +import Text.Pandoc.Readers.Org.Parsing + +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder ( Inlines, Blocks ) +import Text.Pandoc.Definition +import Text.Pandoc.Compat.Monoid ((<>)) +import Text.Pandoc.Options +import Text.Pandoc.Shared ( compactify', compactify'DL ) + +import Control.Arrow ( first ) +import Control.Monad ( foldM, guard, mzero ) +import Data.Char ( toLower, toUpper) +import Data.List ( foldl', intersperse, isPrefixOf ) +import qualified Data.Map as M +import Data.Maybe ( fromMaybe, isNothing ) +import Network.HTTP ( urlEncode ) + + +-- +-- parsing blocks +-- + +-- | Get a list of blocks. +blockList :: OrgParser [Block] +blockList = do + blocks' <- blocks + st <- getState + return . B.toList $ runF blocks' st + +-- | Get the meta information safed in the state. +meta :: OrgParser Meta +meta = do + st <- getState + return $ runF (orgStateMeta' st) st + +blocks :: OrgParser (F Blocks) +blocks = mconcat <$> manyTill block eof + +block :: OrgParser (F Blocks) +block = choice [ mempty <$ blanklines + , table + , orgBlock + , figure + , example + , genericDrawer + , specialLine + , header + , horizontalRule + , list + , latexFragment + , noteBlock + , paraOrPlain + ] "block" + + +-- +-- Block Attributes +-- + +-- | Attributes that may be added to figures (like a name or caption). +data BlockAttributes = BlockAttributes + { blockAttrName :: Maybe String + , blockAttrCaption :: Maybe (F Inlines) + , blockAttrKeyValues :: [(String, String)] + } + +stringyMetaAttribute :: (String -> Bool) -> OrgParser (String, String) +stringyMetaAttribute attrCheck = try $ do + metaLineStart + attrName <- map toUpper <$> many1Till nonspaceChar (char ':') + guard $ attrCheck attrName + skipSpaces + attrValue <- anyLine + return (attrName, attrValue) + +blockAttributes :: OrgParser BlockAttributes +blockAttributes = try $ do + kv <- many (stringyMetaAttribute attrCheck) + let caption = foldl' (appendValues "CAPTION") Nothing kv + let kvAttrs = foldl' (appendValues "ATTR_HTML") Nothing kv + let name = lookup "NAME" kv + caption' <- maybe (return Nothing) + (fmap Just . parseFromString parseInlines) + caption + kvAttrs' <- parseFromString keyValues . (++ "\n") $ fromMaybe mempty kvAttrs + return $ BlockAttributes + { blockAttrName = name + , blockAttrCaption = caption' + , blockAttrKeyValues = kvAttrs' + } + where + attrCheck :: String -> Bool + attrCheck attr = + case attr of + "NAME" -> True + "CAPTION" -> True + "ATTR_HTML" -> True + _ -> False + + appendValues :: String -> Maybe String -> (String, String) -> Maybe String + appendValues attrName accValue (key, value) = + if key /= attrName + then accValue + else case accValue of + Just acc -> Just $ acc ++ ' ':value + Nothing -> Just value + +keyValues :: OrgParser [(String, String)] +keyValues = try $ + manyTill ((,) <$> key <*> value) newline + where + key :: OrgParser String + key = try $ skipSpaces *> char ':' *> many1 nonspaceChar + + value :: OrgParser String + value = skipSpaces *> manyTill anyChar endOfValue + + endOfValue :: OrgParser () + endOfValue = + lookAhead $ (() <$ try (many1 spaceChar <* key)) + <|> () <$ newline + + +-- +-- Org Blocks (#+BEGIN_... / #+END_...) +-- + +type BlockProperties = (Int, String) -- (Indentation, Block-Type) + +updateIndent :: BlockProperties -> Int -> BlockProperties +updateIndent (_, blkType) indent = (indent, blkType) + +orgBlock :: OrgParser (F Blocks) +orgBlock = try $ do + blockAttrs <- blockAttributes + blockProp@(_, blkType) <- blockHeaderStart + ($ blockProp) $ + case blkType of + "comment" -> withRaw' (const mempty) + "html" -> withRaw' (return . (B.rawBlock blkType)) + "latex" -> withRaw' (return . (B.rawBlock blkType)) + "ascii" -> withRaw' (return . (B.rawBlock blkType)) + "example" -> withRaw' (return . exampleCode) + "quote" -> withParsed (fmap B.blockQuote) + "verse" -> verseBlock + "src" -> codeBlock blockAttrs + _ -> withParsed (fmap $ divWithClass blkType) + +blockHeaderStart :: OrgParser (Int, String) +blockHeaderStart = try $ (,) <$> indentation <*> blockType + where + blockType = map toLower <$> (stringAnyCase "#+begin_" *> orgArgWord) + +indentation :: OrgParser Int +indentation = try $ do + tabStop <- getOption readerTabStop + s <- many spaceChar + return $ spaceLength tabStop s + +spaceLength :: Int -> String -> Int +spaceLength tabStop s = (sum . map charLen) s + where + charLen ' ' = 1 + charLen '\t' = tabStop + charLen _ = 0 + +withRaw' :: (String -> F Blocks) -> BlockProperties -> OrgParser (F Blocks) +withRaw' f blockProp = (ignHeaders *> (f <$> rawBlockContent blockProp)) + +withParsed :: (F Blocks -> F Blocks) -> BlockProperties -> OrgParser (F Blocks) +withParsed f blockProp = (ignHeaders *> (f <$> parsedBlockContent blockProp)) + +ignHeaders :: OrgParser () +ignHeaders = (() <$ newline) <|> (() <$ anyLine) + +divWithClass :: String -> Blocks -> Blocks +divWithClass cls = B.divWith ("", [cls], []) + +verseBlock :: BlockProperties -> OrgParser (F Blocks) +verseBlock blkProp = try $ do + ignHeaders + content <- rawBlockContent blkProp + fmap B.para . mconcat . intersperse (pure B.linebreak) + <$> mapM (parseFromString parseInlines) (map (++ "\n") . lines $ content) + +exportsCode :: [(String, String)] -> Bool +exportsCode attrs = not (("rundoc-exports", "none") `elem` attrs + || ("rundoc-exports", "results") `elem` attrs) + +exportsResults :: [(String, String)] -> Bool +exportsResults attrs = ("rundoc-exports", "results") `elem` attrs + || ("rundoc-exports", "both") `elem` attrs + +followingResultsBlock :: OrgParser (Maybe (F Blocks)) +followingResultsBlock = + optionMaybe (try $ blanklines *> stringAnyCase "#+RESULTS:" + *> blankline + *> block) + +codeBlock :: BlockAttributes -> BlockProperties -> OrgParser (F Blocks) +codeBlock blockAttrs blkProp = do + skipSpaces + (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) + leadingIndent <- lookAhead indentation + content <- rawBlockContent (updateIndent blkProp leadingIndent) + resultsContent <- followingResultsBlock + let id' = fromMaybe mempty $ blockAttrName blockAttrs + let includeCode = exportsCode kv + let includeResults = exportsResults kv + let codeBlck = B.codeBlockWith ( id', classes, kv ) content + let labelledBlck = maybe (pure codeBlck) + (labelDiv codeBlck) + (blockAttrCaption blockAttrs) + let resultBlck = fromMaybe mempty resultsContent + return $ (if includeCode then labelledBlck else mempty) + <> (if includeResults then resultBlck else mempty) + where + labelDiv blk value = + B.divWith nullAttr <$> (mappend <$> labelledBlock value + <*> pure blk) + labelledBlock = fmap (B.plain . B.spanWith ("", ["label"], [])) + +rawBlockContent :: BlockProperties -> OrgParser String +rawBlockContent (indent, blockType) = try $ + unlines . map commaEscaped <$> manyTill indentedLine blockEnder + where + indentedLine = try $ ("" <$ blankline) <|> (indentWith indent *> anyLine) + blockEnder = try $ skipSpaces *> stringAnyCase ("#+end_" <> blockType) + +parsedBlockContent :: BlockProperties -> OrgParser (F Blocks) +parsedBlockContent blkProps = try $ do + raw <- rawBlockContent blkProps + parseFromString blocks (raw ++ "\n") + +-- indent by specified number of spaces (or equiv. tabs) +indentWith :: Int -> OrgParser String +indentWith num = do + tabStop <- getOption readerTabStop + if num < tabStop + then count num (char ' ') + else choice [ try (count num (char ' ')) + , try (char '\t' >> count (num - tabStop) (char ' ')) ] + +type SwitchOption = (Char, Maybe String) + +-- | Parse code block arguments +-- TODO: We currently don't handle switches. +codeHeaderArgs :: OrgParser ([String], [(String, String)]) +codeHeaderArgs = try $ do + language <- skipSpaces *> orgArgWord + _ <- skipSpaces *> (try $ switch `sepBy` (many1 spaceChar)) + parameters <- manyTill blockOption newline + let pandocLang = translateLang language + return $ + if hasRundocParameters parameters + then ( [ pandocLang, rundocBlockClass ] + , map toRundocAttrib (("language", language) : parameters) + ) + else ([ pandocLang ], parameters) + where + hasRundocParameters = not . null + toRundocAttrib = first ("rundoc-" ++) + + +switch :: OrgParser SwitchOption +switch = try $ simpleSwitch <|> lineNumbersSwitch + where + simpleSwitch = (\c -> (c, Nothing)) <$> (oneOf "-+" *> letter) + lineNumbersSwitch = (\ls -> ('l', Just ls)) <$> + (string "-l \"" *> many1Till nonspaceChar (char '"')) + +translateLang :: String -> String +translateLang "C" = "c" +translateLang "C++" = "cpp" +translateLang "emacs-lisp" = "commonlisp" -- emacs lisp is not supported +translateLang "js" = "javascript" +translateLang "lisp" = "commonlisp" +translateLang "R" = "r" +translateLang "sh" = "bash" +translateLang "sqlite" = "sql" +translateLang cs = cs + +-- | Prefix used for Rundoc classes and arguments. +rundocPrefix :: String +rundocPrefix = "rundoc-" + +-- | The class-name used to mark rundoc blocks. +rundocBlockClass :: String +rundocBlockClass = rundocPrefix ++ "block" + +blockOption :: OrgParser (String, String) +blockOption = try $ do + argKey <- orgArgKey + paramValue <- option "yes" orgParamValue + return (argKey, paramValue) + +orgParamValue :: OrgParser String +orgParamValue = try $ + skipSpaces + *> notFollowedBy (char ':' ) + *> many1 (noneOf "\t\n\r ") + <* skipSpaces + +commaEscaped :: String -> String +commaEscaped (',':cs@('*':_)) = cs +commaEscaped (',':cs@('#':'+':_)) = cs +commaEscaped cs = cs + +example :: OrgParser (F Blocks) +example = try $ do + return . return . exampleCode =<< unlines <$> many1 exampleLine + +exampleCode :: String -> Blocks +exampleCode = B.codeBlockWith ("", ["example"], []) + +exampleLine :: OrgParser String +exampleLine = try $ exampleLineStart *> anyLine + +horizontalRule :: OrgParser (F Blocks) +horizontalRule = return B.horizontalRule <$ try hline + + +-- +-- Drawers +-- + +-- | A generic drawer which has no special meaning for org-mode. +-- Whether or not this drawer is included in the output depends on the drawers +-- export setting. +genericDrawer :: OrgParser (F Blocks) +genericDrawer = try $ do + name <- map toUpper <$> drawerStart + content <- manyTill drawerLine (try drawerEnd) + state <- getState + -- Include drawer if it is explicitly included in or not explicitly excluded + -- from the list of drawers that should be exported. PROPERTIES drawers are + -- never exported. + case (exportDrawers . orgStateExportSettings $ state) of + _ | name == "PROPERTIES" -> return mempty + Left names | name `elem` names -> return mempty + Right names | name `notElem` names -> return mempty + _ -> drawerDiv name <$> parseLines content + where + parseLines :: [String] -> OrgParser (F Blocks) + parseLines = parseFromString blocks . (++ "\n") . unlines + + drawerDiv :: String -> F Blocks -> F Blocks + drawerDiv drawerName = fmap $ B.divWith (mempty, [drawerName, "drawer"], mempty) + +drawerLine :: OrgParser String +drawerLine = anyLine + +drawerEnd :: OrgParser String +drawerEnd = try $ + skipSpaces *> stringAnyCase ":END:" <* skipSpaces <* newline + +-- | Read a :PROPERTIES: drawer and return the key/value pairs contained +-- within. +propertiesDrawer :: OrgParser [(String, String)] +propertiesDrawer = try $ do + drawerType <- drawerStart + guard $ map toUpper drawerType == "PROPERTIES" + manyTill property (try drawerEnd) + where + property :: OrgParser (String, String) + property = try $ (,) <$> key <*> value + + key :: OrgParser String + key = try $ skipSpaces *> char ':' *> many1Till nonspaceChar (char ':') + + value :: OrgParser String + value = try $ skipSpaces *> manyTill anyChar (try $ skipSpaces *> newline) + +keyValuesToAttr :: [(String, String)] -> Attr +keyValuesToAttr kvs = + let + lowerKvs = map (\(k, v) -> (map toLower k, v)) kvs + id' = fromMaybe mempty . lookup "custom_id" $ lowerKvs + cls = fromMaybe mempty . lookup "class" $ lowerKvs + kvs' = filter (flip notElem ["custom_id", "class"] . fst) lowerKvs + in + (id', words cls, kvs') + + +-- +-- Figures +-- + +-- | Figures (Image on a line by itself, preceded by name and/or caption) +figure :: OrgParser (F Blocks) +figure = try $ do + figAttrs <- blockAttributes + src <- skipSpaces *> selfTarget <* skipSpaces <* newline + guard . not . isNothing . blockAttrCaption $ figAttrs + guard (isImageFilename src) + let figName = fromMaybe mempty $ blockAttrName figAttrs + let figCaption = fromMaybe mempty $ blockAttrCaption figAttrs + let figKeyVals = blockAttrKeyValues figAttrs + let attr = (mempty, mempty, figKeyVals) + return $ (B.para . B.imageWith attr src (withFigPrefix figName) <$> figCaption) + where + withFigPrefix :: String -> String + withFigPrefix cs = + if "fig:" `isPrefixOf` cs + then cs + else "fig:" ++ cs + + selfTarget :: OrgParser String + selfTarget = try $ char '[' *> linkTarget <* char ']' + + +-- +-- Comments, Options and Metadata +-- + +addLinkFormat :: String + -> (String -> String) + -> OrgParser () +addLinkFormat key formatter = updateState $ \s -> + let fs = orgStateLinkFormatters s + in s{ orgStateLinkFormatters = M.insert key formatter fs } + +specialLine :: OrgParser (F Blocks) +specialLine = fmap return . try $ metaLine <|> commentLine + +-- The order, in which blocks are tried, makes sure that we're not looking at +-- the beginning of a block, so we don't need to check for it +metaLine :: OrgParser Blocks +metaLine = mempty <$ metaLineStart <* (optionLine <|> declarationLine) + +commentLine :: OrgParser Blocks +commentLine = commentLineStart *> anyLine *> pure mempty + +declarationLine :: OrgParser () +declarationLine = try $ do + key <- metaKey + inlinesF <- metaInlines + updateState $ \st -> + let meta' = B.setMeta <$> pure key <*> inlinesF <*> pure nullMeta + in st { orgStateMeta' = orgStateMeta' st <> meta' } + return () + +metaInlines :: OrgParser (F MetaValue) +metaInlines = fmap (MetaInlines . B.toList) <$> inlinesTillNewline + +metaKey :: OrgParser String +metaKey = map toLower <$> many1 (noneOf ": \n\r") + <* char ':' + <* skipSpaces + +optionLine :: OrgParser () +optionLine = try $ do + key <- metaKey + case key of + "link" -> parseLinkFormat >>= uncurry addLinkFormat + "options" -> () <$ sepBy spaces exportSetting + _ -> mzero + +-- +-- Export Settings +-- + +-- | Read and process org-mode specific export options. +exportSetting :: OrgParser () +exportSetting = choice + [ booleanSetting "^" setExportSubSuperscripts + , ignoredSetting "'" + , ignoredSetting "*" + , ignoredSetting "-" + , ignoredSetting ":" + , ignoredSetting "<" + , ignoredSetting "\\n" + , ignoredSetting "arch" + , ignoredSetting "author" + , ignoredSetting "c" + , ignoredSetting "creator" + , complementableListSetting "d" setExportDrawers + , ignoredSetting "date" + , ignoredSetting "e" + , ignoredSetting "email" + , ignoredSetting "f" + , ignoredSetting "H" + , ignoredSetting "inline" + , ignoredSetting "num" + , ignoredSetting "p" + , ignoredSetting "pri" + , ignoredSetting "prop" + , ignoredSetting "stat" + , ignoredSetting "tags" + , ignoredSetting "tasks" + , ignoredSetting "tex" + , ignoredSetting "timestamp" + , ignoredSetting "title" + , ignoredSetting "toc" + , ignoredSetting "todo" + , ignoredSetting "|" + ] "export setting" + +booleanSetting :: String -> ExportSettingSetter Bool -> OrgParser () +booleanSetting settingIdentifier setter = try $ do + string settingIdentifier + char ':' + value <- elispBoolean + updateState $ modifyExportSettings setter value + +-- | Read an elisp boolean. Only NIL is treated as false, non-NIL values are +-- interpreted as true. +elispBoolean :: OrgParser Bool +elispBoolean = try $ do + value <- many1 nonspaceChar + return $ case map toLower value of + "nil" -> False + "{}" -> False + "()" -> False + _ -> True + +-- | A list or a complement list (i.e. a list starting with `not`). +complementableListSetting :: String + -> ExportSettingSetter (Either [String] [String]) + -> OrgParser () +complementableListSetting settingIdentifier setter = try $ do + _ <- string settingIdentifier <* char ':' + value <- choice [ Left <$> complementStringList + , Right <$> stringList + , (\b -> if b then Left [] else Right []) <$> elispBoolean + ] + updateState $ modifyExportSettings setter value + where + -- Read a plain list of strings. + stringList :: OrgParser [String] + stringList = try $ + char '(' + *> sepBy elispString spaces + <* char ')' + + -- Read an emacs lisp list specifying a complement set. + complementStringList :: OrgParser [String] + complementStringList = try $ + string "(not " + *> sepBy elispString spaces + <* char ')' + + elispString :: OrgParser String + elispString = try $ + char '"' + *> manyTill alphaNum (char '"') + +ignoredSetting :: String -> OrgParser () +ignoredSetting s = try (() <$ string s <* char ':' <* many1 nonspaceChar) + + +parseLinkFormat :: OrgParser ((String, String -> String)) +parseLinkFormat = try $ do + linkType <- (:) <$> letter <*> many (alphaNum <|> oneOf "-_") <* skipSpaces + linkSubst <- parseFormat + return (linkType, linkSubst) + +-- | An ad-hoc, single-argument-only implementation of a printf-style format +-- parser. +parseFormat :: OrgParser (String -> String) +parseFormat = try $ do + replacePlain <|> replaceUrl <|> justAppend + where + -- inefficient, but who cares + replacePlain = try $ (\x -> concat . flip intersperse x) + <$> sequence [tillSpecifier 's', rest] + replaceUrl = try $ (\x -> concat . flip intersperse x . urlEncode) + <$> sequence [tillSpecifier 'h', rest] + justAppend = try $ (++) <$> rest + + rest = manyTill anyChar (eof <|> () <$ oneOf "\n\r") + tillSpecifier c = manyTill (noneOf "\n\r") (try $ string ('%':c:"")) + +-- +-- Headers +-- + +-- | Headers +header :: OrgParser (F Blocks) +header = try $ do + level <- headerStart + title <- manyTill inline (lookAhead $ optional headerTags <* newline) + tags <- option [] headerTags + newline + propAttr <- option nullAttr (keyValuesToAttr <$> propertiesDrawer) + inlines <- runF (tagTitle title tags) <$> getState + attr <- registerHeader propAttr inlines + return $ pure (B.headerWith attr level inlines) + where + tagTitle :: [F Inlines] -> [String] -> F Inlines + tagTitle title tags = trimInlinesF . mconcat $ title <> map tagToInlineF tags + + tagToInlineF :: String -> F Inlines + tagToInlineF t = return $ B.spanWith ("", ["tag"], [("data-tag-name", t)]) mempty + + headerTags :: OrgParser [String] + headerTags = try $ + let tag = many1 (alphaNum <|> oneOf "@%#_") <* char ':' + in skipSpaces + *> char ':' + *> many1 tag + <* skipSpaces + + +-- +-- Tables +-- + +data OrgTableRow = OrgContentRow (F [Blocks]) + | OrgAlignRow [Alignment] + | OrgHlineRow + +-- OrgTable is strongly related to the pandoc table ADT. Using the same +-- (i.e. pandoc-global) ADT would mean that the reader would break if the +-- global structure was to be changed, which would be bad. The final table +-- should be generated using a builder function. Column widths aren't +-- implemented yet, so they are not tracked here. +data OrgTable = OrgTable + { orgTableAlignments :: [Alignment] + , orgTableHeader :: [Blocks] + , orgTableRows :: [[Blocks]] + } + +table :: OrgParser (F Blocks) +table = try $ do + blockAttrs <- blockAttributes + lookAhead tableStart + do + rows <- tableRows + let caption = fromMaybe (return mempty) $ blockAttrCaption blockAttrs + return $ (<$> caption) . orgToPandocTable . normalizeTable =<< rowsToTable rows + +orgToPandocTable :: OrgTable + -> Inlines + -> Blocks +orgToPandocTable (OrgTable aligns heads lns) caption = + B.table caption (zip aligns $ repeat 0) heads lns + +tableRows :: OrgParser [OrgTableRow] +tableRows = try $ many (tableAlignRow <|> tableHline <|> tableContentRow) + +tableContentRow :: OrgParser OrgTableRow +tableContentRow = try $ + OrgContentRow . sequence <$> (tableStart *> many1Till tableContentCell newline) + +tableContentCell :: OrgParser (F Blocks) +tableContentCell = try $ + fmap B.plain . trimInlinesF . mconcat <$> manyTill inline endOfCell + +tableAlignRow :: OrgParser OrgTableRow +tableAlignRow = try $ do + tableStart + cells <- many1Till tableAlignCell newline + -- Empty rows are regular (i.e. content) rows, not alignment rows. + guard $ any (/= AlignDefault) cells + return $ OrgAlignRow cells + +tableAlignCell :: OrgParser Alignment +tableAlignCell = + choice [ try $ emptyCell *> return AlignDefault + , try $ skipSpaces + *> char '<' + *> tableAlignFromChar + <* many digit + <* char '>' + <* emptyCell + ] "alignment info" + where emptyCell = try $ skipSpaces *> endOfCell + +tableAlignFromChar :: OrgParser Alignment +tableAlignFromChar = try $ + choice [ char 'l' *> return AlignLeft + , char 'c' *> return AlignCenter + , char 'r' *> return AlignRight + ] + +tableHline :: OrgParser OrgTableRow +tableHline = try $ + OrgHlineRow <$ (tableStart *> char '-' *> anyLine) + +endOfCell :: OrgParser Char +endOfCell = try $ char '|' <|> lookAhead newline + +rowsToTable :: [OrgTableRow] + -> F OrgTable +rowsToTable = foldM rowToContent emptyTable + where emptyTable = OrgTable mempty mempty mempty + +normalizeTable :: OrgTable -> OrgTable +normalizeTable (OrgTable aligns heads rows) = OrgTable aligns' heads rows + where + refRow = if heads /= mempty + then heads + else if rows == mempty then mempty else head rows + cols = length refRow + fillColumns base padding = take cols $ base ++ repeat padding + aligns' = fillColumns aligns AlignDefault + +-- One or more horizontal rules after the first content line mark the previous +-- line as a header. All other horizontal lines are discarded. +rowToContent :: OrgTable + -> OrgTableRow + -> F OrgTable +rowToContent orgTable row = + case row of + OrgHlineRow -> return singleRowPromotedToHeader + OrgAlignRow as -> return . setAligns $ as + OrgContentRow cs -> appendToBody cs + where + singleRowPromotedToHeader :: OrgTable + singleRowPromotedToHeader = case orgTable of + OrgTable{ orgTableHeader = [], orgTableRows = b:[] } -> + orgTable{ orgTableHeader = b , orgTableRows = [] } + _ -> orgTable + + setAligns :: [Alignment] -> OrgTable + setAligns aligns = orgTable{ orgTableAlignments = aligns } + + appendToBody :: F [Blocks] -> F OrgTable + appendToBody frow = do + newRow <- frow + let oldRows = orgTableRows orgTable + -- NOTE: This is an inefficient O(n) operation. This should be changed + -- if performance ever becomes a problem. + return orgTable{ orgTableRows = oldRows ++ [newRow] } + + +-- +-- LaTeX fragments +-- +latexFragment :: OrgParser (F Blocks) +latexFragment = try $ do + envName <- latexEnvStart + content <- mconcat <$> manyTill anyLineNewline (latexEnd envName) + return . return $ B.rawBlock "latex" (content `inLatexEnv` envName) + where + c `inLatexEnv` e = mconcat [ "\\begin{", e, "}\n" + , c + , "\\end{", e, "}\n" + ] + +latexEnd :: String -> OrgParser () +latexEnd envName = try $ + () <$ skipSpaces + <* string ("\\end{" ++ envName ++ "}") + <* blankline + + +-- +-- Footnote defintions +-- +noteBlock :: OrgParser (F Blocks) +noteBlock = try $ do + ref <- noteMarker <* skipSpaces + content <- mconcat <$> blocksTillHeaderOrNote + addToNotesTable (ref, content) + return mempty + where + blocksTillHeaderOrNote = + many1Till block (eof <|> () <$ lookAhead noteMarker + <|> () <$ lookAhead headerStart) + +-- Paragraphs or Plain text +paraOrPlain :: OrgParser (F Blocks) +paraOrPlain = try $ do + ils <- parseInlines + nl <- option False (newline *> return True) + -- Read block as paragraph, except if we are in a list context and the block + -- is directly followed by a list item, in which case the block is read as + -- plain text. + try (guard nl + *> notFollowedBy (inList *> (() <$ orderedListStart <|> bulletListStart)) + *> return (B.para <$> ils)) + <|> (return (B.plain <$> ils)) + +inlinesTillNewline :: OrgParser (F Inlines) +inlinesTillNewline = trimInlinesF . mconcat <$> manyTill inline newline + + +-- +-- list blocks +-- + +list :: OrgParser (F Blocks) +list = choice [ definitionList, bulletList, orderedList ] "list" + +definitionList :: OrgParser (F Blocks) +definitionList = try $ do n <- lookAhead (bulletListStart' Nothing) + fmap B.definitionList . fmap compactify'DL . sequence + <$> many1 (definitionListItem $ bulletListStart' (Just n)) + +bulletList :: OrgParser (F Blocks) +bulletList = try $ do n <- lookAhead (bulletListStart' Nothing) + fmap B.bulletList . fmap compactify' . sequence + <$> many1 (listItem (bulletListStart' $ Just n)) + +orderedList :: OrgParser (F Blocks) +orderedList = fmap B.orderedList . fmap compactify' . sequence + <$> many1 (listItem orderedListStart) + +bulletListStart' :: Maybe Int -> OrgParser Int +-- returns length of bulletList prefix, inclusive of marker +bulletListStart' Nothing = do ind <- length <$> many spaceChar + oneOf (bullets $ ind == 0) + skipSpaces1 + return (ind + 1) +bulletListStart' (Just n) = do count (n-1) spaceChar + oneOf (bullets $ n == 1) + many1 spaceChar + return n + +-- Unindented lists are legal, but they can't use '*' bullets. +-- We return n to maintain compatibility with the generic listItem. +bullets :: Bool -> String +bullets unindented = if unindented then "+-" else "*+-" + +definitionListItem :: OrgParser Int + -> OrgParser (F (Inlines, [Blocks])) +definitionListItem parseMarkerGetLength = try $ do + markerLength <- parseMarkerGetLength + term <- manyTill (noneOf "\n\r") (try definitionMarker) + line1 <- anyLineNewline + blank <- option "" ("\n" <$ blankline) + cont <- concat <$> many (listContinuation markerLength) + term' <- parseFromString parseInlines term + contents' <- parseFromString blocks $ line1 ++ blank ++ cont + return $ (,) <$> term' <*> fmap (:[]) contents' + where + definitionMarker = + spaceChar *> string "::" <* (spaceChar <|> lookAhead newline) + + +-- parse raw text for one list item, excluding start marker and continuations +listItem :: OrgParser Int + -> OrgParser (F Blocks) +listItem start = try . withContext ListItemState $ do + markerLength <- try start + firstLine <- anyLineNewline + blank <- option "" ("\n" <$ blankline) + rest <- concat <$> many (listContinuation markerLength) + parseFromString blocks $ firstLine ++ blank ++ rest + +-- continuation of a list item - indented and separated by blankline or endline. +-- Note: nested lists are parsed as continuations. +listContinuation :: Int + -> OrgParser String +listContinuation markerLength = try $ + notFollowedBy' blankline + *> (mappend <$> (concat <$> many1 listLine) + <*> many blankline) + where listLine = try $ indentWith markerLength *> anyLineNewline + +-- | Parse any line, include the final newline in the output. +anyLineNewline :: OrgParser String +anyLineNewline = (++ "\n") <$> anyLine -- cgit v1.2.3