diff options
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 19 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/LaTeX.hs | 23 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 217 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/RST.hs | 130 |
4 files changed, 172 insertions, 217 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 96244e58f..803fc91c5 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -45,7 +45,7 @@ import Text.ParserCombinators.Pandoc import Text.Pandoc.Definition import Text.Pandoc.Shared import Text.Pandoc.Entities ( characterEntity, decodeEntities ) -import Maybe ( fromMaybe ) +import Data.Maybe ( fromMaybe ) import Data.List ( intersect, takeWhile, dropWhile ) import Data.Char ( toUpper, toLower, isAlphaNum ) @@ -267,9 +267,7 @@ parseHtml = do option "" (htmlEndTag "html") many anyChar -- ignore anything after </html> eof - state <- getState - let keyBlocks = stateKeyBlocks state - return (Pandoc (Meta title authors date) (blocks ++ (reverse keyBlocks))) + return (Pandoc (Meta title authors date) blocks) -- -- parsing blocks @@ -456,11 +454,7 @@ link = try $ do Nothing -> fail "no href" let title = fromMaybe "" (extractAttribute "title" attributes) label <- inlinesTilEnd "a" - state <- getState - ref <- if stateInlineLinks state - then return (Src url title) - else generateReference url title - return $ Link (normalizeSpaces label) ref + return $ Link (normalizeSpaces label) (url, title) image = try $ do (tag, attributes) <- htmlTag "img" @@ -469,8 +463,5 @@ image = try $ do Nothing -> fail "no src" let title = fromMaybe "" (extractAttribute "title" attributes) let alt = fromMaybe "" (extractAttribute "alt" attributes) - state <- getState - ref <- if stateInlineLinks state - then return (Src url title) - else generateReference url title - return $ Image [Str alt] ref + return $ Image [Str alt] (url, title) + diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 33c4a75ee..b0062ceff 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -37,8 +37,8 @@ import Text.ParserCombinators.Parsec import Text.ParserCombinators.Pandoc import Text.Pandoc.Definition import Text.Pandoc.Shared -import Maybe ( fromMaybe ) -import Char ( chr ) +import Data.Maybe ( fromMaybe ) +import Data.Char ( chr ) -- | Parse LaTeX from string and return 'Pandoc' document. readLaTeX :: ParserState -- ^ Parser state, including options for parser @@ -135,14 +135,11 @@ parseLaTeX = do spaces eof state <- getState - let keyBlocks = stateKeyBlocks state - let noteBlocks = stateNoteBlocks state let blocks' = filter (/= Null) blocks let title' = stateTitle state let authors' = stateAuthors state let date' = stateDate state - return (Pandoc (Meta title' authors' date') - (blocks' ++ (reverse noteBlocks) ++ (reverse keyBlocks))) + return (Pandoc (Meta title' authors' date') blocks') -- -- parsing blocks @@ -618,15 +615,15 @@ link = try (do url <- manyTill anyChar (char '}') char '{' label <- manyTill inline (char '}') - return (Link (normalizeSpaces label) (Src url ""))) + return (Link (normalizeSpaces label) (url, ""))) image = try (do ("includegraphics", _, args) <- command let args' = filter isArg args -- filter out options let src = if null args' then - Src "" "" + ("", "") else - Src (stripFirstAndLast (head args')) "" + (stripFirstAndLast (head args'), "") return (Image [Str "image"] src)) footnote = try (do @@ -640,13 +637,7 @@ footnote = try (do setInput $ contents' blocks <- parseBlocks setInput rest - state <- getState - let notes = stateNoteBlocks state - let nextRef = case notes of - [] -> "1" - (Note ref body):rest -> (show ((read ref) + 1)) - setState (state { stateNoteBlocks = (Note nextRef blocks):notes }) - return (NoteRef nextRef)) + return (Note blocks)) -- | Parse any LaTeX command and return it in a raw TeX inline element. rawLaTeXInline :: GenParser Char ParserState Inline diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index a2e84e8c2..353dd45dd 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -31,7 +31,7 @@ module Text.Pandoc.Readers.Markdown ( readMarkdown ) where -import Data.List ( findIndex, sortBy, transpose, isSuffixOf, intersect ) +import Data.List ( findIndex, sortBy, transpose, isSuffixOf, intersect, lookup ) import Data.Char ( isAlphaNum ) import Text.ParserCombinators.Pandoc import Text.Pandoc.Definition @@ -160,28 +160,72 @@ titleBlock = try (do option "" blanklines return (title, author, date)) --- | Returns the number assigned to a Note block -numberOfNote :: Block -> Int -numberOfNote (Note ref _) = (read ref) -numberOfNote _ = 0 - parseMarkdown = do - updateState (\state -> state { stateParseRaw = True }) - -- need to parse raw HTML, since markdown allows it + updateState (\state -> state { stateParseRaw = True }) -- parse raw HTML: markdown allows it (title, author, date) <- option ([],[],"") titleBlock -- go through once just to get list of reference keys - keysUsed <- lookAhead $ (do {manyTill (referenceKey <|> (do{anyLine; return Null})) eof; - newState <- getState; - return $ stateKeysUsed newState}) - updateState (\st -> st { stateKeysUsed = keysUsed }) + refs <- manyTill (noteBlock <|> referenceKey <|> (do l <- lineClump + return (LineClump l))) eof + let keys = map (\(KeyBlock label target) -> (label, target)) $ + filter isKeyBlock refs + let notes = map (\(NoteBlock label blocks) -> (label, blocks)) $ + filter isNoteBlock refs + let rawlines = map (\(LineClump ln) -> ln) $ filter isLineClump refs + setInput $ concat rawlines -- with note blocks and keys stripped out + updateState (\state -> state { stateKeys = keys, stateNotes = notes }) blocks <- parseBlocks -- go through again, for real let blocks' = filter (/= Null) blocks - state <- getState - let keys = reverse $ stateKeyBlocks state - let notes = reverse $ stateNoteBlocks state - let sortedNotes = sortBy (\x y -> compare (numberOfNote x) - (numberOfNote y)) notes - return (Pandoc (Meta title author date) (blocks' ++ sortedNotes ++ keys)) + return (Pandoc (Meta title author date) blocks') + +-- +-- initial pass for references +-- + +referenceKey = try $ do + nonindentSpaces + label <- reference + char labelSep + skipSpaces + option ' ' (char autoLinkStart) + src <- many (noneOf (titleOpeners ++ [autoLinkEnd] ++ endLineChars)) + option ' ' (char autoLinkEnd) + tit <- option "" title + blanklines + return $ KeyBlock label (removeTrailingSpace src, tit) + +noteMarker = try (do + char labelStart + char noteStart + manyTill (noneOf " \t\n") (char labelEnd)) + +rawLine = try (do + notFollowedBy' blankline + notFollowedBy' noteMarker + contents <- many1 nonEndline + end <- option "" (do + newline + option "" (try indentSpaces) + return "\n") + return (contents ++ end)) + +rawLines = do + lines <- many1 rawLine + return (concat lines) + +noteBlock = try $ do + failIfStrict + ref <- noteMarker + char ':' + option ' ' (try blankline) + option "" (try indentSpaces) + raw <- sepBy rawLines (try (do {blankline; indentSpaces})) + option "" blanklines + -- parse the extracted text, which may contain various block elements: + rest <- getInput + setInput $ (joinWithSep "\n" raw) ++ "\n\n" + contents <- parseBlocks + setInput rest + return (NoteBlock ref contents) -- -- parsing blocks @@ -189,9 +233,17 @@ parseMarkdown = do parseBlocks = manyTill block eof -block = choice [ header, table, codeBlock, note, referenceKey, hrule, list, - blockQuote, htmlBlock, rawLaTeXEnvironment', para, - plain, nullBlock ] <?> "block" +block = choice [ header + , table + , codeBlock + , hrule + , list + , blockQuote + , htmlBlock + , rawLaTeXEnvironment' + , para + , plain + , nullBlock ] <?> "block" -- -- header blocks @@ -262,45 +314,6 @@ codeBlock = do return (CodeBlock (stripTrailingNewlines result)) -- --- note block --- - -rawLine = try (do - notFollowedBy' blankline - notFollowedBy' noteMarker - contents <- many1 nonEndline - end <- option "" (do - newline - option "" (try indentSpaces) - return "\n") - return (contents ++ end)) - -rawLines = do - lines <- many1 rawLine - return (concat lines) - -note = try (do - failIfStrict - ref <- noteMarker - char ':' - skipSpaces - skipEndline - raw <- sepBy rawLines (try (do {blankline; indentSpaces})) - option "" blanklines - -- parse the extracted text, which may contain various block elements: - rest <- getInput - setInput $ (joinWithSep "\n" raw) ++ "\n\n" - contents <- parseBlocks - setInput rest - state <- getState - let identifiers = stateNoteIdentifiers state - case (findIndex (== ref) identifiers) of - Just n -> updateState (\s -> s {stateNoteBlocks = - (Note (show (n+1)) contents):(stateNoteBlocks s)}) - Nothing -> updateState id - return Null) - --- -- block quotes -- @@ -535,25 +548,6 @@ rawHtmlBlocks = try (do else combined return (RawHtml combined')) --- --- reference key --- - -referenceKey = try (do - nonindentSpaces - label <- reference - char labelSep - skipSpaces - option ' ' (char autoLinkStart) - src <- many (noneOf (titleOpeners ++ [autoLinkEnd] ++ endLineChars)) - option ' ' (char autoLinkEnd) - tit <- option "" title - blanklines - state <- getState - let keysUsed = stateKeysUsed state - setState state { stateKeysUsed = (label:keysUsed) } - return $ Key label (Src (removeTrailingSpace src) tit)) - -- -- LaTeX -- @@ -713,7 +707,7 @@ table = do inline = choice [ rawLaTeXInline' , escapedChar , entity - , noteRef + , note , inlineNote , link , referenceLink @@ -933,7 +927,7 @@ reference = try $ do return (normalizeSpaces label) -- source for a link, with optional title -source = try (do +source = try $ do char srcStart option ' ' (char autoLinkStart) src <- many (noneOf ([srcEnd, autoLinkEnd] ++ titleOpeners)) @@ -941,7 +935,7 @@ source = try (do tit <- option "" title skipSpaces char srcEnd - return (Src (removeTrailingSpace src) tit)) + return (removeTrailingSpace src, tit) titleWith startChar endChar = try (do skipSpaces @@ -965,30 +959,18 @@ explicitLink = try (do src <- source return (Link label src)) -referenceLink = choice [referenceLinkDouble, referenceLinkSingle] - --- a link like [this][ref] -referenceLinkDouble = try (do +-- a link like [this][ref] or [this][] or [this] +referenceLink = try $ do label <- reference - skipSpaces - option ' ' newline - skipSpaces - ref <- reference + ref <- option [] (try (do skipSpaces + option ' ' newline + skipSpaces + reference)) let ref' = if null ref then label else ref state <- getState - if ref' `elem` (stateKeysUsed state) - then return () - else fail "no corresponding key" - return (Link label (Ref ref'))) - --- a link like [this] -referenceLinkSingle = try (do - label <- reference - state <- getState - if label `elem` (stateKeysUsed state) - then return () - else fail "no corresponding key" - return (Link label (Ref label))) + case lookupKeySrc (stateKeys state) ref' of + Nothing -> fail "no corresponding key" + Just target -> return (Link label target) autoLink = autoLinkEmail <|> autoLinkRegular @@ -999,7 +981,7 @@ autoLinkEmail = try $ do domain <- sepBy1 (many1 (noneOf "/:.@<> \t\n")) (char '.') let src = name ++ "@" ++ (joinWithSep "." domain) char autoLinkEnd - return $ Link [Str src] (Src ("mailto:" ++ src) "") + return $ Link [Str src] (("mailto:" ++ src), "") -- a link <http://like.this.com> autoLinkRegular = try $ do @@ -1007,39 +989,28 @@ autoLinkRegular = try $ do prot <- oneOfStrings ["http:", "ftp:", "mailto:"] rest <- many1Till (noneOf " \t\n<>") (char autoLinkEnd) let src = prot ++ rest - return $ Link [Str src] (Src src "") + return $ Link [Str src] (src, "") image = try (do char imageStart (Link label src) <- link return (Image label src)) -noteMarker = try (do - char labelStart - char noteStart - manyTill (noneOf " \t\n") (char labelEnd)) - -noteRef = try (do +note = try $ do failIfStrict ref <- noteMarker state <- getState - let identifiers = (stateNoteIdentifiers state) ++ [ref] - setState state {stateNoteIdentifiers = identifiers} - return (NoteRef (show (length identifiers)))) + let notes = stateNotes state + case lookup ref notes of + Nothing -> fail "note not found" + Just contents -> return (Note contents) -inlineNote = try (do +inlineNote = try $ do failIfStrict char noteStart char labelStart contents <- manyTill inline (char labelEnd) - state <- getState - let identifiers = stateNoteIdentifiers state - let ref = show $ (length identifiers) + 1 - let noteBlocks = stateNoteBlocks state - setState state {stateNoteIdentifiers = (identifiers ++ [ref]), - stateNoteBlocks = - (Note ref [Para contents]):noteBlocks} - return (NoteRef ref)) + return (Note [Para contents]) rawLaTeXInline' = do failIfStrict diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index a3de0a2ea..d2143af38 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -37,8 +37,8 @@ import Text.Pandoc.Readers.HTML ( anyHtmlBlockTag, anyHtmlInlineTag ) import Text.Regex ( matchRegex, mkRegex ) import Text.ParserCombinators.Parsec import Data.Maybe ( fromMaybe ) -import List ( findIndex ) -import Char ( toUpper ) +import Data.List ( findIndex, delete ) +import Data.Char ( toUpper ) -- | Parse reStructuredText string and return Pandoc document. readRST :: ParserState -> String -> Pandoc @@ -62,11 +62,7 @@ specialChars = "\\`|*_<>$:[-" -- parsing documents -- -isAnonKeyBlock block = case block of - (Key [Str "_"] str) -> True - otherwise -> False - -isNotAnonKeyBlock block = not (isAnonKeyBlock block) +isAnonKey (ref, src) = (ref == [Str "_"]) isHeader1 :: Block -> Bool isHeader1 (Header 1 _) = True @@ -101,20 +97,22 @@ titleTransform blocks = (blocks, []) parseRST = do -- first pass: get anonymous keys - keyBlocks <- lookAhead $ manyTill (anonymousKey <|> (do{anyLine; return Null})) eof - let anonymousKeys = filter (/= Null) keyBlocks - -- run parser again to fill in anonymous links... - updateState (\st -> st { stateKeyBlocks = anonymousKeys }) - state <- getState + refs <- manyTill (referenceKey <|> (do l <- lineClump + return (LineClump l))) eof + let keys = map (\(KeyBlock label target) -> (label, target)) $ + filter isKeyBlock refs + let rawlines = map (\(LineClump ln) -> ln) $ filter isLineClump refs + setInput $ concat rawlines -- with keys stripped out + updateState (\state -> state { stateKeys = keys }) blocks <- parseBlocks - let blocks' = filter isNotAnonKeyBlock blocks + let blocks' = filter (/= Null) blocks + state <- getState let (blocks'', title) = if stateStandalone state then titleTransform blocks' else (blocks', []) - state' <- getState - let authors = stateAuthors state' - let date = stateDate state' - let title' = if (null title) then (stateTitle state') else title + let authors = stateAuthors state + let date = stateDate state + let title' = if (null title) then (stateTitle state) else title return (Pandoc (Meta title' authors date) blocks'') -- @@ -124,7 +122,7 @@ parseRST = do parseBlocks = manyTill block eof block = choice [ codeBlock, rawHtmlBlock, rawLaTeXBlock, blockQuote, - referenceKey, imageBlock, unknownDirective, header, + imageBlock, unknownDirective, header, hrule, list, fieldList, lineBlock, para, plain, nullBlock ] <?> "block" @@ -221,7 +219,7 @@ plain = do imageBlock = try (do string ".. image:: " src <- manyTill anyChar newline - return (Plain [Image [Str "image"] (Src src "")])) + return (Plain [Image [Str "image"] (src, "")])) -- -- header blocks @@ -492,43 +490,43 @@ unknownDirective = try (do -- reference key -- -referenceKey = choice [imageKey, anonymousKey, regularKeyQuoted, regularKey] +referenceKey = do + result <- choice [imageKey, anonymousKey, regularKeyQuoted, regularKey] + option "" blanklines + return result -imageKey = try (do +imageKey = try $ do string ".. |" ref <- manyTill inline (char '|') skipSpaces string "image::" src <- manyTill anyChar newline - return (Key (normalizeSpaces ref) - (Src (removeLeadingTrailingSpace src) ""))) + return $ KeyBlock (normalizeSpaces ref) (removeLeadingTrailingSpace src, "") -anonymousKey = try (do +anonymousKey = try $ do oneOfStrings [".. __:", "__"] skipSpaces option ' ' newline src <- manyTill anyChar newline state <- getState - return (Key [Str "_"] (Src (removeLeadingTrailingSpace src) ""))) + return $ KeyBlock [Str "_"] (removeLeadingTrailingSpace src, "") -regularKeyQuoted = try (do +regularKeyQuoted = try $ do string ".. _`" ref <- manyTill inline (char '`') char ':' skipSpaces option ' ' newline src <- manyTill anyChar newline - return (Key (normalizeSpaces ref) - (Src (removeLeadingTrailingSpace src) ""))) + return $ KeyBlock (normalizeSpaces ref) (removeLeadingTrailingSpace src, "") -regularKey = try (do +regularKey = try $ do string ".. _" ref <- manyTill inline (char ':') skipSpaces option ' ' newline src <- manyTill anyChar newline - return (Key (normalizeSpaces ref) - (Src (removeLeadingTrailingSpace src) ""))) + return $ KeyBlock (normalizeSpaces ref) (removeLeadingTrailingSpace src, "") -- -- inline @@ -577,7 +575,7 @@ tabchar = do return (Str "\t") str = do - notFollowedBy' oneWordReferenceLink + notFollowedBy' oneWordReference result <- many1 (noneOf (specialChars ++ "\t\n ")) return (Str result) @@ -596,46 +594,44 @@ endline = try (do -- links -- -link = choice [explicitLink, referenceLink, autoLink, - oneWordReferenceLink] <?> "link" +link = choice [explicitLink, referenceLink, autoLink] <?> "link" -explicitLink = try (do +explicitLink = try $ do char '`' notFollowedBy (char '`') -- `` is marks start of inline code label <- manyTill inline (try (do {spaces; char '<'})) src <- manyTill (noneOf ">\n ") (char '>') skipSpaces string "`_" - return (Link (normalizeSpaces label) - (Src (removeLeadingTrailingSpace src) ""))) + return $ Link (normalizeSpaces label) (removeLeadingTrailingSpace src, "") -anonymousLinkEnding = try (do - char '_' - state <- getState - let anonKeys = stateKeyBlocks state - -- if there's a list of anon key refs (from previous pass), pop one off. - -- otherwise return an anon key ref for the next pass to take care of... - case anonKeys of - (Key [Str "_"] src):rest -> - do - setState (state { stateKeyBlocks = rest }) - return src - otherwise -> return (Ref [Str "_"])) - -referenceLink = try (do +reference = try $ do char '`' notFollowedBy (char '`') - label <- manyTill inline (char '`') + label <- many1Till inline (char '`') char '_' - src <- option (Ref []) anonymousLinkEnding - return (Link (normalizeSpaces label) src)) + return label -oneWordReferenceLink = try (do - label <- many1 alphaNum +oneWordReference = do + raw <- many1 alphaNum char '_' - src <- option (Ref []) anonymousLinkEnding notFollowedBy alphaNum -- because this_is_not a link - return (Link [Str label] src)) + return [Str raw] + +referenceLink = try $ do + label <- reference <|> oneWordReference + key <- option label (do{char '_'; return [Str "_"]}) -- anonymous link + state <- getState + let keyTable = stateKeys state + src <- case lookupKeySrc keyTable key of + Nothing -> fail "no corresponding key" + Just target -> return target + -- if anonymous link, remove first anon key so it won't be used again + let keyTable' = if (key == [Str "_"]) -- anonymous link? + then delete ([Str "_"], src) keyTable -- remove first anon key + else keyTable + setState $ state { stateKeys = keyTable' } + return $ Link (normalizeSpaces label) src uriScheme = oneOfStrings [ "http://", "https://", "ftp://", "file://", "mailto:", "news:", "telnet:" ] @@ -645,9 +641,9 @@ uri = try (do identifier <- many1 (noneOf " \t\n") return (scheme ++ identifier)) -autoURI = try (do +autoURI = try $ do src <- uri - return (Link [Str src] (Src src ""))) + return $ Link [Str src] (src, "") emailChar = alphaNum <|> oneOf "-+_." @@ -666,14 +662,20 @@ domain = try (do dom <- many1 (try (do{ char '.'; many1 domainChar })) return (joinWithSep "." (first:dom))) -autoEmail = try (do +autoEmail = try $ do src <- emailAddress - return (Link [Str src] (Src ("mailto:" ++ src) ""))) + return $ Link [Str src] ("mailto:" ++ src, "") autoLink = autoURI <|> autoEmail -- For now, we assume that all substitution references are for images. -image = try (do +image = try $ do char '|' ref <- manyTill inline (char '|') - return (Image (normalizeSpaces ref) (Ref ref))) + state <- getState + let keyTable = stateKeys state + src <- case lookupKeySrc keyTable ref of + Nothing -> fail "no corresponding key" + Just target -> return target + return (Image (normalizeSpaces ref) src) + |