aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Parsing.hs
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2012-07-27 21:04:02 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2012-08-01 21:45:40 -0700
commitfadc7b0d873cb021b69d06bd37313be84afeecca (patch)
treeabcc413a98cbd70c20592ca696cf5e3a11850a0b /src/Text/Pandoc/Parsing.hs
parent973c7ecacf68e39ca51bb8633a032ff2fd9eda07 (diff)
downloadpandoc-fadc7b0d873cb021b69d06bd37313be84afeecca.tar.gz
Major rewrite of markdown reader.
* Use Builder's Inlines/Blocks instead of lists. * Return values in the reader monad, which are then run (at the end of parsing) against the final parser state. This allows links, notes, and example numbers to be resolved without a second parser pass. * An effect of using Builder is that everything is normalized automatically. * New exports from Text.Pandoc.Parsing: widthsFromIndices, NoteTable', KeyTable', Key', toKey', withQuoteContext, singleQuoteStart, singleQuoteEnd, doubleQuoteStart, doubleQuoteEnd, ellipses, apostrophe, dash * Updated opendocument tests. * Don't derive Show for ParserState. * Benchmarks: markdown reader takes 82% of the time it took before. Markdown writer takes 92% of the time (here the speedup is probably due to the fact that everything is normalized by default).
Diffstat (limited to 'src/Text/Pandoc/Parsing.hs')
-rw-r--r--src/Text/Pandoc/Parsing.hs57
1 files changed, 43 insertions, 14 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index 5ad6af891..eb52aab02 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -56,6 +56,7 @@ module Text.Pandoc.Parsing ( (>>~),
orderedListMarker,
charRef,
tableWith,
+ widthsFromIndices,
gridTableWith,
readWith,
testStringWith,
@@ -68,12 +69,24 @@ module Text.Pandoc.Parsing ( (>>~),
ParserContext (..),
QuoteContext (..),
NoteTable,
+ NoteTable',
KeyTable,
Key,
toKey,
fromKey,
lookupKeySrc,
+ KeyTable',
+ Key',
+ toKey',
smartPunctuation,
+ withQuoteContext,
+ singleQuoteStart,
+ singleQuoteEnd,
+ doubleQuoteStart,
+ doubleQuoteEnd,
+ ellipses,
+ apostrophe,
+ dash,
macro,
applyMacros',
Parser,
@@ -133,19 +146,20 @@ where
import Text.Pandoc.Definition
import Text.Pandoc.Options
import Text.Pandoc.Generic
+import Text.Pandoc.Builder (Blocks)
import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn)
import Text.Parsec
import Text.Parsec.Pos (newPos)
import Data.Char ( toLower, toUpper, ord, isAscii, isAlphaNum, isDigit, isPunctuation )
import Data.List ( intercalate, transpose )
import Network.URI ( parseURI, URI (..), isAllowedInURI )
-import Control.Monad ( join, liftM, guard, mzero )
import Text.Pandoc.Shared
import qualified Data.Map as M
import Text.TeXMath.Macros (applyMacros, Macro, parseMacroDefinitions)
import Text.HTML.TagSoup.Entity ( lookupEntity )
import Data.Default
import qualified Data.Set as Set
+import Control.Monad.Reader
type Parser t s = Parsec t s
@@ -579,11 +593,12 @@ widthsFromIndices numColumns' indices =
-- (which may be grid), then the rows,
-- which may be grid, separated by blank lines, and
-- ending with a footer (dashed line followed by blank line).
-gridTableWith :: Parser [Char] ParserState Block -- ^ Block parser
+gridTableWith :: Parser [Char] ParserState [Block] -- ^ Block list parser
-> Bool -- ^ Headerless table
-> Parser [Char] ParserState Block
-gridTableWith block headless =
- tableWith (gridTableHeader headless block) (gridTableRow block) (gridTableSep '-') gridTableFooter
+gridTableWith blocks headless =
+ tableWith (gridTableHeader headless blocks) (gridTableRow blocks)
+ (gridTableSep '-') gridTableFooter
gridTableSplitLine :: [Int] -> String -> [String]
gridTableSplitLine indices line = map removeFinalBar $ tail $
@@ -608,9 +623,9 @@ gridTableSep ch = try $ gridDashedLines ch >> return '\n'
-- | Parse header for a grid table.
gridTableHeader :: Bool -- ^ Headerless table
- -> Parser [Char] ParserState Block
+ -> Parser [Char] ParserState [Block]
-> Parser [Char] ParserState ([[Block]], [Alignment], [Int])
-gridTableHeader headless block = try $ do
+gridTableHeader headless blocks = try $ do
optional blanklines
dashes <- gridDashedLines '-'
rawContent <- if headless
@@ -629,7 +644,7 @@ gridTableHeader headless block = try $ do
then replicate (length dashes) ""
else map (intercalate " ") $ transpose
$ map (gridTableSplitLine indices) rawContent
- heads <- mapM (parseFromString $ many block) $
+ heads <- mapM (parseFromString blocks) $
map removeLeadingTrailingSpace rawHeads
return (heads, aligns, indices)
@@ -640,14 +655,14 @@ gridTableRawLine indices = do
return (gridTableSplitLine indices line)
-- | Parse row of grid table.
-gridTableRow :: Parser [Char] ParserState Block
+gridTableRow :: Parser [Char] ParserState [Block]
-> [Int]
-> Parser [Char] ParserState [[Block]]
-gridTableRow block indices = do
+gridTableRow blocks indices = do
colLines <- many1 (gridTableRawLine indices)
let cols = map ((++ "\n") . unlines . removeOneLeadingSpace) $
transpose colLines
- mapM (liftM compactifyCell . parseFromString (many block)) cols
+ mapM (liftM compactifyCell . parseFromString blocks) cols
removeOneLeadingSpace :: [String] -> [String]
removeOneLeadingSpace xs =
@@ -688,10 +703,13 @@ data ParserState = ParserState
{ stateOptions :: ReaderOptions, -- ^ User options
stateParserContext :: ParserContext, -- ^ Inside list?
stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment?
+ stateAllowLinks :: Bool, -- ^ Allow parsing of links
stateMaxNestingLevel :: Int, -- ^ Max # of nested Strong/Emph
stateLastStrPos :: Maybe SourcePos, -- ^ Position after last str parsed
stateKeys :: KeyTable, -- ^ List of reference keys
- stateNotes :: NoteTable, -- ^ List of notes
+ stateKeys' :: KeyTable', -- ^ List of reference keys (with fallbacks)
+ stateNotes :: NoteTable, -- ^ List of notes (raw bodies)
+ stateNotes' :: NoteTable', -- ^ List of notes (parsed bodies)
stateTitle :: [Inline], -- ^ Title of document
stateAuthors :: [[Inline]], -- ^ Authors of document
stateDate :: [Inline], -- ^ Date of document
@@ -702,7 +720,6 @@ data ParserState = ParserState
stateMacros :: [Macro], -- ^ List of macros defined so far
stateRstDefaultRole :: String -- ^ Current rST default interpreted text role
}
- deriving Show
instance Default ParserState where
def = defaultParserState
@@ -712,10 +729,13 @@ defaultParserState =
ParserState { stateOptions = def,
stateParserContext = NullState,
stateQuoteContext = NoQuote,
+ stateAllowLinks = True,
stateMaxNestingLevel = 6,
stateLastStrPos = Nothing,
stateKeys = M.empty,
+ stateKeys' = M.empty,
stateNotes = [],
+ stateNotes' = [],
stateTitle = [],
stateAuthors = [],
stateDate = [],
@@ -755,6 +775,8 @@ data QuoteContext
type NoteTable = [(String, String)]
+type NoteTable' = [(String, Reader ParserState Blocks)] -- used in markdown reader
+
newtype Key = Key [Inline] deriving (Show, Read, Eq, Ord)
toKey :: [Inline] -> Key
@@ -772,6 +794,13 @@ fromKey (Key xs) = xs
type KeyTable = M.Map Key Target
+newtype Key' = Key' String deriving (Show, Read, Eq, Ord)
+
+toKey' :: String -> Key'
+toKey' = Key' . map toLower . unwords . words
+
+type KeyTable' = M.Map Key' Target
+
-- | Look up key in key table and return target object.
lookupKeySrc :: KeyTable -- ^ Key table
-> Key -- ^ Key
@@ -798,8 +827,8 @@ quoted :: Parser [Char] ParserState Inline
quoted inlineParser = doubleQuoted inlineParser <|> singleQuoted inlineParser
withQuoteContext :: QuoteContext
- -> (Parser [Char] ParserState Inline)
- -> Parser [Char] ParserState Inline
+ -> Parser [Char] ParserState a
+ -> Parser [Char] ParserState a
withQuoteContext context parser = do
oldState <- getState
let oldQuoteContext = stateQuoteContext oldState