diff options
author | Albert Krewinkel <albert@zeitkraut.de> | 2016-05-11 12:26:54 +0200 |
---|---|---|
committer | Albert Krewinkel <albert@zeitkraut.de> | 2016-05-11 19:13:42 +0200 |
commit | 7a0729ea093cbf78188f9ef4e5b7c0f9a6b82c9b (patch) | |
tree | bf80e5250817256f28731e5bc39a8b38593d4f89 /src | |
parent | fd9ec835ec1447fb6040ee63a1fa8ee13b19d555 (diff) | |
download | pandoc-7a0729ea093cbf78188f9ef4e5b7c0f9a6b82c9b.tar.gz |
Org reader: move parser state into separate module
The org reader code has become large and confusing. Extracting smaller
parts into submodules should help to clean things up.
Diffstat (limited to 'src')
-rw-r--r-- | src/Text/Pandoc/Readers/Org.hs | 215 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Org/ParserState.hs | 175 |
2 files changed, 232 insertions, 158 deletions
diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index 5a50a8f34..610397d58 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -1,6 +1,5 @@ +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE GeneralizedNewtypeDeriving #-} -{-# LANGUAGE MultiParamTypeClasses, FlexibleContexts, FlexibleInstances #-} {- Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -31,8 +30,7 @@ Conversion of org-mode formatted plain text to 'Pandoc' document. module Text.Pandoc.Readers.Org ( readOrg ) where import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Builder ( Inlines, Blocks, HasMeta(..), - trimInlines ) +import Text.Pandoc.Builder ( Inlines, Blocks ) import Text.Pandoc.Definition import Text.Pandoc.Compat.Monoid ((<>)) import Text.Pandoc.Error @@ -43,153 +41,33 @@ import Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF , parseFromString, blanklines ) import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline) +import Text.Pandoc.Readers.Org.ParserState import Text.Pandoc.Shared (compactify', compactify'DL) import Text.TeXMath (readTeX, writePandoc, DisplayType(..)) import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap import Control.Arrow (first) -import Control.Monad (foldM, guard, liftM, liftM2, mplus, mzero, when) -import Control.Monad.Reader (Reader, runReader, ask, asks, local) +import Control.Monad (foldM, guard, mplus, mzero, when) +import Control.Monad.Reader ( Reader, runReader ) import Data.Char (isAlphaNum, isSpace, toLower) -import Data.Default import Data.List (intersperse, isPrefixOf, isSuffixOf) import qualified Data.Map as M -import qualified Data.Set as Set import Data.Maybe (fromMaybe, isJust) import Network.HTTP (urlEncode) + -- | Parse org-mode string and return a Pandoc document. readOrg :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse (assuming @'\n'@ line endings) -> Either PandocError Pandoc readOrg opts s = flip runReader def $ readWithM parseOrg def{ orgStateOptions = opts } (s ++ "\n\n") -data OrgParserLocal = OrgParserLocal { orgLocalQuoteContext :: QuoteContext } - +-- | The parser used to read org files. type OrgParser = ParserT [Char] OrgParserState (Reader OrgParserLocal) -instance HasIdentifierList OrgParserState where - extractIdentifierList = orgStateIdentifiers - updateIdentifierList f s = s{ orgStateIdentifiers = f (orgStateIdentifiers s) } - -instance HasHeaderMap OrgParserState where - extractHeaderMap = orgStateHeaderMap - updateHeaderMap f s = s{ orgStateHeaderMap = f (orgStateHeaderMap s) } - -parseOrg :: OrgParser Pandoc -parseOrg = do - blocks' <- parseBlocks - st <- getState - let meta = runF (orgStateMeta' st) st - let removeUnwantedBlocks = dropCommentTrees . filter (/= Null) - return $ Pandoc meta $ removeUnwantedBlocks (B.toList $ runF blocks' st) - --- | Drop COMMENT headers and the document tree below those headers. -dropCommentTrees :: [Block] -> [Block] -dropCommentTrees [] = [] -dropCommentTrees (b:bs) = - maybe (b:dropCommentTrees bs) - (dropCommentTrees . flip dropUntilHeaderAboveLevel bs) - (commentHeaderLevel b) - --- | Return the level of a header starting a comment or :noexport: tree and --- Nothing otherwise. -commentHeaderLevel :: Block -> Maybe Int -commentHeaderLevel blk = - case blk of - (Header level _ ((Str "COMMENT"):_)) -> Just level - (Header level _ title) | hasNoExportTag title -> Just level - _ -> Nothing - where - hasNoExportTag :: [Inline] -> Bool - hasNoExportTag = any isNoExportTag - - isNoExportTag :: Inline -> Bool - isNoExportTag (Span ("", ["tag"], [("data-tag-name", "noexport")]) []) = True - isNoExportTag _ = False - --- | Drop blocks until a header on or above the given level is seen -dropUntilHeaderAboveLevel :: Int -> [Block] -> [Block] -dropUntilHeaderAboveLevel n = dropWhile (not . isHeaderLevelLowerEq n) - -isHeaderLevelLowerEq :: Int -> Block -> Bool -isHeaderLevelLowerEq n blk = - case blk of - (Header level _ _) -> n >= level - _ -> False - -- --- Parser State for Org +-- Functions acting on the parser state -- - -type OrgNoteRecord = (String, F Blocks) -type OrgNoteTable = [OrgNoteRecord] - -type OrgBlockAttributes = M.Map String String - -type OrgLinkFormatters = M.Map String (String -> String) - --- | Org-mode parser state -data OrgParserState = OrgParserState - { orgStateOptions :: ReaderOptions - , orgStateAnchorIds :: [String] - , orgStateBlockAttributes :: OrgBlockAttributes - , orgStateEmphasisCharStack :: [Char] - , orgStateEmphasisNewlines :: Maybe Int - , orgStateLastForbiddenCharPos :: Maybe SourcePos - , orgStateLastPreCharPos :: Maybe SourcePos - , orgStateLastStrPos :: Maybe SourcePos - , orgStateLinkFormatters :: OrgLinkFormatters - , orgStateMeta :: Meta - , orgStateMeta' :: F Meta - , orgStateNotes' :: OrgNoteTable - , orgStateParserContext :: ParserContext - , orgStateIdentifiers :: Set.Set String - , orgStateHeaderMap :: M.Map Inlines String - } - -instance Default OrgParserLocal where - def = OrgParserLocal NoQuote - -instance HasReaderOptions OrgParserState where - extractReaderOptions = orgStateOptions - -instance HasMeta OrgParserState where - setMeta field val st = - st{ orgStateMeta = setMeta field val $ orgStateMeta st } - deleteMeta field st = - st{ orgStateMeta = deleteMeta field $ orgStateMeta st } - -instance HasLastStrPosition OrgParserState where - getLastStrPos = orgStateLastStrPos - setLastStrPos pos st = st{ orgStateLastStrPos = Just pos } - -instance HasQuoteContext st (Reader OrgParserLocal) where - getQuoteContext = asks orgLocalQuoteContext - withQuoteContext q = local (\s -> s{orgLocalQuoteContext = q}) - -instance Default OrgParserState where - def = defaultOrgParserState - -defaultOrgParserState :: OrgParserState -defaultOrgParserState = OrgParserState - { orgStateOptions = def - , orgStateAnchorIds = [] - , orgStateBlockAttributes = M.empty - , orgStateEmphasisCharStack = [] - , orgStateEmphasisNewlines = Nothing - , orgStateLastForbiddenCharPos = Nothing - , orgStateLastPreCharPos = Nothing - , orgStateLastStrPos = Nothing - , orgStateLinkFormatters = M.empty - , orgStateMeta = nullMeta - , orgStateMeta' = return nullMeta - , orgStateNotes' = [] - , orgStateParserContext = NullState - , orgStateIdentifiers = Set.empty - , orgStateHeaderMap = M.empty - } - recordAnchorId :: String -> OrgParser () recordAnchorId i = updateState $ \s -> s{ orgStateAnchorIds = i : (orgStateAnchorIds s) } @@ -243,44 +121,65 @@ addToNotesTable note = do oldnotes <- orgStateNotes' <$> getState updateState $ \s -> s{ orgStateNotes' = note:oldnotes } --- The version Text.Pandoc.Parsing cannot be used, as we need additional parts --- of the state saved and restored. -parseFromString :: OrgParser a -> String -> OrgParser a -parseFromString parser str' = do - oldLastPreCharPos <- orgStateLastPreCharPos <$> getState - updateState $ \s -> s{ orgStateLastPreCharPos = Nothing } - result <- P.parseFromString parser str' - updateState $ \s -> s{ orgStateLastPreCharPos = oldLastPreCharPos } - return result - - -- --- Adaptions and specializations of parsing utilities +-- Parser -- +parseOrg :: OrgParser Pandoc +parseOrg = do + blocks' <- parseBlocks + st <- getState + let meta = runF (orgStateMeta' st) st + let removeUnwantedBlocks = dropCommentTrees . filter (/= Null) + return $ Pandoc meta $ removeUnwantedBlocks (B.toList $ runF blocks' st) -newtype F a = F { unF :: Reader OrgParserState a - } deriving (Monad, Applicative, Functor) +-- | Drop COMMENT headers and the document tree below those headers. +dropCommentTrees :: [Block] -> [Block] +dropCommentTrees [] = [] +dropCommentTrees (b:bs) = + maybe (b:dropCommentTrees bs) + (dropCommentTrees . flip dropUntilHeaderAboveLevel bs) + (commentHeaderLevel b) -runF :: F a -> OrgParserState -> a -runF = runReader . unF +-- | Return the level of a header starting a comment or :noexport: tree and +-- Nothing otherwise. +commentHeaderLevel :: Block -> Maybe Int +commentHeaderLevel blk = + case blk of + (Header level _ ((Str "COMMENT"):_)) -> Just level + (Header level _ title) | hasNoExportTag title -> Just level + _ -> Nothing + where + hasNoExportTag :: [Inline] -> Bool + hasNoExportTag = any isNoExportTag -askF :: F OrgParserState -askF = F ask + isNoExportTag :: Inline -> Bool + isNoExportTag (Span ("", ["tag"], [("data-tag-name", "noexport")]) []) = True + isNoExportTag _ = False -asksF :: (OrgParserState -> a) -> F a -asksF f = F $ asks f +-- | Drop blocks until a header on or above the given level is seen +dropUntilHeaderAboveLevel :: Int -> [Block] -> [Block] +dropUntilHeaderAboveLevel n = dropWhile (not . isHeaderLevelLowerEq n) -instance Monoid a => Monoid (F a) where - mempty = return mempty - mappend = liftM2 mappend - mconcat = fmap mconcat . sequence +isHeaderLevelLowerEq :: Int -> Block -> Bool +isHeaderLevelLowerEq n blk = + case blk of + (Header level _ _) -> n >= level + _ -> False -trimInlinesF :: F Inlines -> F Inlines -trimInlinesF = liftM trimInlines -returnF :: a -> OrgParser (F a) -returnF = return . return +-- +-- Adaptions and specializations of parsing utilities +-- +-- The version Text.Pandoc.Parsing cannot be used, as we need additional parts +-- of the state saved and restored. +parseFromString :: OrgParser a -> String -> OrgParser a +parseFromString parser str' = do + oldLastPreCharPos <- orgStateLastPreCharPos <$> getState + updateState $ \s -> s{ orgStateLastPreCharPos = Nothing } + result <- P.parseFromString parser str' + updateState $ \s -> s{ orgStateLastPreCharPos = oldLastPreCharPos } + return result -- | Like @Text.Parsec.Char.newline@, but causes additional state changes. newline :: OrgParser Char diff --git a/src/Text/Pandoc/Readers/Org/ParserState.hs b/src/Text/Pandoc/Readers/Org/ParserState.hs new file mode 100644 index 000000000..680c469f3 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org/ParserState.hs @@ -0,0 +1,175 @@ +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{- +Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org.Options + Copyright : Copyright (C) 2014-2016 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +Define the Org-mode parser state. +-} +module Text.Pandoc.Readers.Org.ParserState + ( OrgParserState(..) + , OrgParserLocal(..) + , OrgNoteRecord + , F(..) + , askF + , asksF + , trimInlinesF + , runF + , returnF + ) where + +import Control.Monad (liftM, liftM2) +import Control.Monad.Reader (Reader, runReader, ask, asks, local) + +import Data.Default (Default(..)) +import qualified Data.Map as M +import qualified Data.Set as Set + +import Text.Pandoc.Builder ( Inlines, Blocks, HasMeta(..), + trimInlines ) +import Text.Pandoc.Definition ( Meta(..), nullMeta ) +import Text.Pandoc.Options ( ReaderOptions(..) ) +import Text.Pandoc.Parsing ( HasHeaderMap(..) + , HasIdentifierList(..) + , HasLastStrPosition(..) + , HasQuoteContext(..) + , HasReaderOptions(..) + , ParserContext(..) + , QuoteContext(..) + , SourcePos ) + +-- | An inline note / footnote containing the note key and its (inline) value. +type OrgNoteRecord = (String, F Blocks) +-- | Table of footnotes +type OrgNoteTable = [OrgNoteRecord] +-- | Map of org block attributes (e.g. LABEL, CAPTION, NAME, etc) +type OrgBlockAttributes = M.Map String String +-- | Map of functions for link transformations. The map key is refers to the +-- link-type, the corresponding function transforms the given link string. +type OrgLinkFormatters = M.Map String (String -> String) + +-- | Org-mode parser state +data OrgParserState = OrgParserState + { orgStateOptions :: ReaderOptions + , orgStateAnchorIds :: [String] + , orgStateBlockAttributes :: OrgBlockAttributes + , orgStateEmphasisCharStack :: [Char] + , orgStateEmphasisNewlines :: Maybe Int + , orgStateLastForbiddenCharPos :: Maybe SourcePos + , orgStateLastPreCharPos :: Maybe SourcePos + , orgStateLastStrPos :: Maybe SourcePos + , orgStateLinkFormatters :: OrgLinkFormatters + , orgStateMeta :: Meta + , orgStateMeta' :: F Meta + , orgStateNotes' :: OrgNoteTable + , orgStateParserContext :: ParserContext + , orgStateIdentifiers :: Set.Set String + , orgStateHeaderMap :: M.Map Inlines String + } + +data OrgParserLocal = OrgParserLocal { orgLocalQuoteContext :: QuoteContext } + +instance Default OrgParserLocal where + def = OrgParserLocal NoQuote + +instance HasReaderOptions OrgParserState where + extractReaderOptions = orgStateOptions + +instance HasMeta OrgParserState where + setMeta field val st = + st{ orgStateMeta = setMeta field val $ orgStateMeta st } + deleteMeta field st = + st{ orgStateMeta = deleteMeta field $ orgStateMeta st } + +instance HasLastStrPosition OrgParserState where + getLastStrPos = orgStateLastStrPos + setLastStrPos pos st = st{ orgStateLastStrPos = Just pos } + +instance HasQuoteContext st (Reader OrgParserLocal) where + getQuoteContext = asks orgLocalQuoteContext + withQuoteContext q = local (\s -> s{orgLocalQuoteContext = q}) + +instance HasIdentifierList OrgParserState where + extractIdentifierList = orgStateIdentifiers + updateIdentifierList f s = s{ orgStateIdentifiers = f (orgStateIdentifiers s) } + +instance HasHeaderMap OrgParserState where + extractHeaderMap = orgStateHeaderMap + updateHeaderMap f s = s{ orgStateHeaderMap = f (orgStateHeaderMap s) } + + +instance Default OrgParserState where + def = defaultOrgParserState + +defaultOrgParserState :: OrgParserState +defaultOrgParserState = OrgParserState + { orgStateOptions = def + , orgStateAnchorIds = [] + , orgStateBlockAttributes = M.empty + , orgStateEmphasisCharStack = [] + , orgStateEmphasisNewlines = Nothing + , orgStateLastForbiddenCharPos = Nothing + , orgStateLastPreCharPos = Nothing + , orgStateLastStrPos = Nothing + , orgStateLinkFormatters = M.empty + , orgStateMeta = nullMeta + , orgStateMeta' = return nullMeta + , orgStateNotes' = [] + , orgStateParserContext = NullState + , orgStateIdentifiers = Set.empty + , orgStateHeaderMap = M.empty + } + + +-- +-- Parser state reader +-- + +-- | Reader monad wrapping the parser state. This is used to delay evaluation +-- until all relevant information has been parsed and made available in the +-- parser state. See also the newtype of the same name in +-- Text.Pandoc.Parsing. +newtype F a = F { unF :: Reader OrgParserState a + } deriving (Functor, Applicative, Monad) + +instance Monoid a => Monoid (F a) where + mempty = return mempty + mappend = liftM2 mappend + mconcat = fmap mconcat . sequence + +runF :: F a -> OrgParserState -> a +runF = runReader . unF + +askF :: F OrgParserState +askF = F ask + +asksF :: (OrgParserState -> a) -> F a +asksF f = F $ asks f + +trimInlinesF :: F Inlines -> F Inlines +trimInlinesF = liftM trimInlines + +returnF :: Monad m => a -> m (F a) +returnF = return . return |