From 20170c328f12fc0214c2c50d14c8f7d03ee05e70 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Fri, 22 Jan 2016 10:16:47 -0800 Subject: Changed type of Shared.uniqueIdent argument from [String] to Set String. This avoids performance problems in documents with many identically named headers. Closes #2671. --- src/Text/Pandoc/Parsing.hs | 12 ++++++------ src/Text/Pandoc/Readers/Docx.hs | 7 ++++--- src/Text/Pandoc/Readers/HTML.hs | 6 +++--- src/Text/Pandoc/Readers/MediaWiki.hs | 5 +++-- src/Text/Pandoc/Readers/Odt/ContentReader.hs | 3 ++- src/Text/Pandoc/Readers/Org.hs | 5 +++-- src/Text/Pandoc/Shared.hs | 6 +++--- src/Text/Pandoc/Writers/Docx.hs | 7 ++++--- src/Text/Pandoc/Writers/EPUB.hs | 5 +++-- src/Text/Pandoc/Writers/Markdown.hs | 7 ++++--- src/Text/Pandoc/Writers/Texinfo.hs | 7 ++++--- 11 files changed, 39 insertions(+), 31 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 16fe75ed5..325231846 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -915,7 +915,7 @@ data ParserState = ParserState stateMeta' :: F Meta, -- ^ Document metadata stateHeaderTable :: [HeaderType], -- ^ Ordered list of header types used stateHeaders :: M.Map Inlines String, -- ^ List of headers and ids (used for implicit ref links) - stateIdentifiers :: [String], -- ^ List of header identifiers used + stateIdentifiers :: Set.Set String, -- ^ Header identifiers used stateNextExample :: Int, -- ^ Number of next example stateExamples :: M.Map String Int, -- ^ Map from example labels to numbers stateHasChapters :: Bool, -- ^ True if \chapter encountered @@ -973,8 +973,8 @@ instance HasHeaderMap ParserState where updateHeaderMap f st = st{ stateHeaders = f $ stateHeaders st } class HasIdentifierList st where - extractIdentifierList :: st -> [String] - updateIdentifierList :: ([String] -> [String]) -> st -> st + extractIdentifierList :: st -> Set.Set String + updateIdentifierList :: (Set.Set String -> Set.Set 
String) -> st -> st instance HasIdentifierList ParserState where extractIdentifierList = stateIdentifiers @@ -1013,7 +1013,7 @@ defaultParserState = stateMeta' = return nullMeta, stateHeaderTable = [], stateHeaders = M.empty, - stateIdentifiers = [], + stateIdentifiers = Set.empty, stateNextExample = 1, stateExamples = M.empty, stateHasChapters = False, @@ -1092,8 +1092,8 @@ registerHeader (ident,classes,kvs) header' = do let id'' = if Ext_ascii_identifiers `Set.member` exts then catMaybes $ map toAsciiChar id' else id' - updateState $ updateIdentifierList $ - if id' == id'' then (id' :) else ([id', id''] ++) + updateState $ updateIdentifierList $ Set.insert id' + updateState $ updateIdentifierList $ Set.insert id'' updateState $ updateHeaderMap $ insert' header' id' return (id'',classes,kvs) else do diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 44f67ce75..1b3269136 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -89,6 +89,7 @@ import Text.TeXMath (writeTeX) import Data.Default (Default) import qualified Data.ByteString.Lazy as B import qualified Data.Map as M +import qualified Data.Set as Set import Control.Monad.Reader import Control.Monad.State import Data.Sequence (ViewL(..), viewl) @@ -350,7 +351,7 @@ parPartToInlines (BookMark _ anchor) = -- avoid an extra pass. 
let newAnchor = if not inHdrBool && anchor `elem` (M.elems anchorMap) - then uniqueIdent [Str anchor] (M.elems anchorMap) + then uniqueIdent [Str anchor] (Set.fromList $ M.elems anchorMap) else anchor unless inHdrBool (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap}) @@ -393,7 +394,7 @@ makeHeaderAnchor' (Header n (_, classes, kvs) ils) | (c:cs) <- filter isAnchorSpan ils , (Span (ident, ["anchor"], _) _) <- c = do hdrIDMap <- gets docxAnchorMap - let newIdent = uniqueIdent ils (M.elems hdrIDMap) + let newIdent = uniqueIdent ils (Set.fromList $ M.elems hdrIDMap) modify $ \s -> s {docxAnchorMap = M.insert ident newIdent hdrIDMap} return $ Header n (newIdent, classes, kvs) (ils \\ (c:cs)) -- Otherwise we just give it a name, and register that name (associate @@ -401,7 +402,7 @@ makeHeaderAnchor' (Header n (_, classes, kvs) ils) makeHeaderAnchor' (Header n (_, classes, kvs) ils) = do hdrIDMap <- gets docxAnchorMap - let newIdent = uniqueIdent ils (M.elems hdrIDMap) + let newIdent = uniqueIdent ils (Set.fromList $ M.elems hdrIDMap) modify $ \s -> s {docxAnchorMap = M.insert newIdent newIdent hdrIDMap} return $ Header n (newIdent, classes, kvs) ils makeHeaderAnchor' blk = return blk diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index a34e2fb5c..c3be1f544 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -68,7 +68,7 @@ import Text.Pandoc.Error import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps) import Text.Pandoc.Compat.Monoid ((<>)) import Text.Parsec.Error - +import qualified Data.Set as Set -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ReaderOptions -- ^ Reader options @@ -77,7 +77,7 @@ readHtml :: ReaderOptions -- ^ Reader options readHtml opts inp = mapLeft (ParseFailure . getError) . 
flip runReader def $ runParserT parseDoc - (HTMLState def{ stateOptions = opts } [] Nothing [] M.empty) + (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty) "source" tags where tags = stripPrefixes . canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } inp @@ -104,7 +104,7 @@ data HTMLState = { parserState :: ParserState, noteTable :: [(String, Blocks)], baseHref :: Maybe String, - identifiers :: [String], + identifiers :: Set.Set String, headerMap :: M.Map Inlines String } diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs index d29ec50e7..950497992 100644 --- a/src/Text/Pandoc/Readers/MediaWiki.hs +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -52,6 +52,7 @@ import Text.HTML.TagSoup import Data.Sequence (viewl, ViewL(..), (<|)) import qualified Data.Foldable as F import qualified Data.Map as M +import qualified Data.Set as Set import Data.Char (isDigit, isSpace) import Data.Maybe (fromMaybe) import Text.Printf (printf) @@ -69,7 +70,7 @@ readMediaWiki opts s = , mwNextLinkNumber = 1 , mwCategoryLinks = [] , mwHeaderMap = M.empty - , mwIdentifierList = [] + , mwIdentifierList = Set.empty } (s ++ "\n") @@ -78,7 +79,7 @@ data MWState = MWState { mwOptions :: ReaderOptions , mwNextLinkNumber :: Int , mwCategoryLinks :: [Inlines] , mwHeaderMap :: M.Map Inlines String - , mwIdentifierList :: [String] + , mwIdentifierList :: Set.Set String } type MWParser = Parser [Char] MWState diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs index 1f1c57646..8c475eefc 100644 --- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs +++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs @@ -61,6 +61,7 @@ import Text.Pandoc.Readers.Odt.Generic.XMLConverter import Text.Pandoc.Readers.Odt.Generic.Fallible import Text.Pandoc.Readers.Odt.Generic.Utils +import qualified Data.Set as Set -------------------------------------------------------------------------------- -- State 
@@ -221,7 +222,7 @@ getPrettyAnchor = proc (baseIdent, uglyAnchor) -> do getHeaderAnchor :: OdtReaderSafe Inlines Anchor getHeaderAnchor = proc title -> do state <- getExtraState -< () - let anchor = uniqueIdent (toList title) (usedAnchors state) + let anchor = uniqueIdent (toList title) (Set.fromList $ usedAnchors state) modifyExtraState (putPrettyAnchor anchor anchor) -<< anchor diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index c7906618c..d82541638 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -53,6 +53,7 @@ import Data.Char (isAlphaNum, toLower) import Data.Default import Data.List (intersperse, isPrefixOf, isSuffixOf) import qualified Data.Map as M +import qualified Data.Set as Set import Data.Maybe (fromMaybe, isJust) import Network.HTTP (urlEncode) @@ -144,7 +145,7 @@ data OrgParserState = OrgParserState , orgStateMeta' :: F Meta , orgStateNotes' :: OrgNoteTable , orgStateParserContext :: ParserContext - , orgStateIdentifiers :: [String] + , orgStateIdentifiers :: Set.Set String , orgStateHeaderMap :: M.Map Inlines String } @@ -186,7 +187,7 @@ defaultOrgParserState = OrgParserState , orgStateMeta' = return nullMeta , orgStateNotes' = [] , orgStateParserContext = NullState - , orgStateIdentifiers = [] + , orgStateIdentifiers = Set.empty , orgStateHeaderMap = M.empty } diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index aa07c81e1..b5efcf172 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -706,14 +706,14 @@ headerLtEq _ _ = False -- | Generate a unique identifier from a list of inlines. -- Second argument is a set of already used identifiers. 
-uniqueIdent :: [Inline] -> [String] -> String +uniqueIdent :: [Inline] -> Set.Set String -> String uniqueIdent title' usedIdents = let baseIdent = case inlineListToIdentifier title' of "" -> "section" x -> x numIdent n = baseIdent ++ "-" ++ show n - in if baseIdent `elem` usedIdents - then case find (\x -> numIdent x `notElem` usedIdents) ([1..60000] :: [Int]) of + in if baseIdent `Set.member` usedIdents + then case find (\x -> not $ numIdent x `Set.member` usedIdents) ([1..60000] :: [Int]) of Just x -> numIdent x Nothing -> baseIdent -- if we have more than 60,000, allow repeats else baseIdent diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs index 827d32620..150e19043 100644 --- a/src/Text/Pandoc/Writers/Docx.hs +++ b/src/Text/Pandoc/Writers/Docx.hs @@ -34,6 +34,7 @@ import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy.Char8 as BL8 import qualified Data.Map as M +import qualified Data.Set as Set import qualified Text.Pandoc.UTF8 as UTF8 import Codec.Archive.Zip import Data.Time.Clock.POSIX @@ -95,7 +96,7 @@ data WriterState = WriterState{ stTextProperties :: [Element] , stParaProperties :: [Element] , stFootnotes :: [Element] - , stSectionIds :: [String] + , stSectionIds :: Set.Set String , stExternalLinks :: M.Map String String , stImages :: M.Map FilePath (String, String, Maybe MimeType, Element, B.ByteString) , stListLevel :: Int @@ -117,7 +118,7 @@ defaultWriterState = WriterState{ stTextProperties = [] , stParaProperties = [] , stFootnotes = defaultFootnotes - , stSectionIds = [] + , stSectionIds = Set.empty , stExternalLinks = M.empty , stImages = M.empty , stListLevel = -1 @@ -742,7 +743,7 @@ blockToOpenXML opts (Header lev (ident,_,_) lst) = do let bookmarkName = if null ident then uniqueIdent lst usedIdents else ident - modify $ \s -> s{ stSectionIds = bookmarkName : stSectionIds s } + modify $ \s -> s{ stSectionIds = Set.insert bookmarkName $ stSectionIds s 
} id' <- getUniqueId let bookmarkStart = mknode "w:bookmarkStart" [("w:id", id') ,("w:name",bookmarkName)] () diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index 64f94f41f..804dbb926 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -31,6 +31,7 @@ Conversion of 'Pandoc' documents to EPUB. module Text.Pandoc.Writers.EPUB ( writeEPUB ) where import Data.IORef ( IORef, newIORef, readIORef, modifyIORef ) import qualified Data.Map as M +import qualified Data.Set as Set import Data.Maybe ( fromMaybe, catMaybes ) import Data.List ( isPrefixOf, isInfixOf, intercalate ) import System.Environment ( getEnv ) @@ -916,13 +917,13 @@ showChapter = printf "ch%03d.xhtml" -- Add identifiers to any headers without them. addIdentifiers :: [Block] -> [Block] -addIdentifiers bs = evalState (mapM go bs) [] +addIdentifiers bs = evalState (mapM go bs) Set.empty where go (Header n (ident,classes,kvs) ils) = do ids <- get let ident' = if null ident then uniqueIdent ils ids else ident - put $ ident' : ids + modify $ Set.insert ident' return $ Header n (ident',classes,kvs) ils go x = return x diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs index 5a92f3cdf..4c4675524 100644 --- a/src/Text/Pandoc/Writers/Markdown.hs +++ b/src/Text/Pandoc/Writers/Markdown.hs @@ -53,6 +53,7 @@ import Data.Yaml (Value(Object,String,Array,Bool,Number)) import qualified Data.HashMap.Strict as H import qualified Data.Vector as V import qualified Data.Text as T +import qualified Data.Set as Set type Notes = [[Block]] type Ref = ([Inline], Target, Attr) @@ -61,11 +62,11 @@ data WriterState = WriterState { stNotes :: Notes , stRefs :: Refs , stRefShortcutable :: Bool , stInList :: Bool - , stIds :: [String] + , stIds :: Set.Set String , stPlain :: Bool } instance Default WriterState where def = WriterState{ stNotes = [], stRefs = [], stRefShortcutable = True, - stInList = False, stIds = [], stPlain = False } + 
stInList = False, stIds = Set.empty, stPlain = False } -- | Convert Pandoc to Markdown. writeMarkdown :: WriterOptions -> Pandoc -> String @@ -364,7 +365,7 @@ blockToMarkdown opts (Header level attr inlines) = do -- so we know whether to print an explicit identifier ids <- gets stIds let autoId = uniqueIdent inlines ids - modify $ \st -> st{ stIds = autoId : ids } + modify $ \st -> st{ stIds = Set.insert autoId ids } let attr' = case attr of ("",[],[]) -> empty (id',[],[]) | isEnabled Ext_auto_identifiers opts diff --git a/src/Text/Pandoc/Writers/Texinfo.hs b/src/Text/Pandoc/Writers/Texinfo.hs index 1aefaa678..8420704dc 100644 --- a/src/Text/Pandoc/Writers/Texinfo.hs +++ b/src/Text/Pandoc/Writers/Texinfo.hs @@ -43,13 +43,14 @@ import Text.Pandoc.Pretty import Text.Pandoc.ImageSize import Network.URI ( isURI, unEscapeString ) import System.FilePath +import qualified Data.Set as Set data WriterState = WriterState { stStrikeout :: Bool -- document contains strikeout , stSuperscript :: Bool -- document contains superscript , stSubscript :: Bool -- document contains subscript , stEscapeComma :: Bool -- in a context where we need @comma - , stIdentifiers :: [String] -- header ids used already + , stIdentifiers :: Set.Set String -- header ids used already , stOptions :: WriterOptions -- writer options } @@ -64,7 +65,7 @@ writeTexinfo options document = evalState (pandocToTexinfo options $ wrapTop document) $ WriterState { stStrikeout = False, stSuperscript = False, stEscapeComma = False, stSubscript = False, - stIdentifiers = [], stOptions = options} + stIdentifiers = Set.empty, stOptions = options} -- | Add a "Top" node around the document, needed by Texinfo. 
wrapTop :: Pandoc -> Pandoc @@ -215,7 +216,7 @@ blockToTexinfo (Header level _ lst) = do txt <- inlineListToTexinfo lst idsUsed <- gets stIdentifiers let id' = uniqueIdent lst idsUsed - modify $ \st -> st{ stIdentifiers = id' : idsUsed } + modify $ \st -> st{ stIdentifiers = Set.insert id' idsUsed } return $ if (level > 0) && (level <= 4) then blankline <> text "@node " <> node $$ text (seccmd level) <> txt $$ -- cgit v1.2.3