diff options
author | John MacFarlane <jgm@berkeley.edu> | 2010-12-22 20:25:15 -0800 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2010-12-30 13:55:40 -0800 |
commit | 904050fa36715e18522d80432a2666fcbaacd105 (patch) | |
tree | 4745876e797d400539dd80309d31c330a013e969 /src/Text/Pandoc/Readers/Textile.hs | |
parent | 220fe5fab89ce84fcb98f0430c4126281ca8362d (diff) | |
download | pandoc-904050fa36715e18522d80432a2666fcbaacd105.tar.gz |
New HTML reader using tagsoup as a lexer.
* The new reader is faster and more accurate.
* API changes for Text.Pandoc.Readers.HTML:
- removed rawHtmlBlock, anyHtmlBlockTag, anyHtmlInlineTag,
anyHtmlTag, anyHtmlEndTag, htmlEndTag, extractTagType,
htmlBlockElement, htmlComment
- added htmlTag, htmlInBalanced, isInlineTag, isBlockTag, isTextTag
* tagsoup is a new dependency.
* Text.Pandoc.Parsing: Generalized type on readWith.
* Benchmark.hs: Added length calculation to force full evaluation.
* Updated HTML reader tests.
* Updated markdown and textile readers to use the functions from
the HTML reader.
* Note: The markdown reader now correctly handles some cases it did not
before. For example:
<hr/>
is reproduced without adding a space.
<script>
a = '<b>';
</script>
is parsed correctly.
Diffstat (limited to 'src/Text/Pandoc/Readers/Textile.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/Textile.hs | 30 |
1 files changed, 15 insertions, 15 deletions
diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index 52a9e12c8..8362c542c 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -58,10 +58,9 @@ module Text.Pandoc.Readers.Textile ( readTextile) where import Text.Pandoc.Definition import Text.Pandoc.Shared import Text.Pandoc.Parsing -import Text.Pandoc.Readers.HTML ( htmlTag, htmlEndTag, -- find code blocks - rawHtmlBlock, rawHtmlInline ) --- import Text.Pandoc.Readers.Markdown (smartPunctuation) +import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag ) import Text.ParserCombinators.Parsec +import Text.HTML.TagSoup.Match import Data.Char ( digitToInt, isLetter ) import Control.Monad ( guard, liftM ) @@ -127,7 +126,7 @@ blockParsers = [ codeBlock , blockQuote , hrule , anyList - , rawHtmlBlock' + , rawHtmlBlock , maybeExplicitBlock "table" table , maybeExplicitBlock "p" para , nullBlock ] @@ -139,8 +138,8 @@ block = choice blockParsers <?> "block" -- | Code Blocks in Textile are between <pre> and </pre> codeBlock :: GenParser Char ParserState Block codeBlock = try $ do - htmlTag False "pre" - result' <- manyTill anyChar (try $ htmlEndTag "pre" >> blockBreak) + htmlTag (tagOpen (=="pre") null) + result' <- manyTill anyChar (try $ htmlTag (tagClose (=="pre")) >> blockBreak) -- drop leading newline if any let result'' = case result' of '\n':xs -> xs @@ -261,21 +260,19 @@ definitionListItem = try $ do -- this ++ "\n\n" does not look very good ds <- parseFromString parseBlocks (s ++ "\n\n") return [ds] - - + -- | This terminates a block such as a paragraph. Because of raw html -- blocks support, we have to lookAhead for a rawHtmlBlock. blockBreak :: GenParser Char ParserState () -blockBreak = try $ choice - [newline >> blanklines >> return (), - lookAhead rawHtmlBlock' >> return ()] +blockBreak = try (newline >> blanklines >> return ()) <|> + (lookAhead rawHtmlBlock >> return ()) -- | A raw Html Block, optionally followed by blanklines -rawHtmlBlock' :: GenParser Char ParserState Block -rawHtmlBlock' = try $ do - b <- rawHtmlBlock +rawHtmlBlock :: GenParser Char ParserState Block +rawHtmlBlock = try $ do + (_,b) <- htmlTag isBlockTag optional blanklines - return b + return $ RawHtml b -- | In textile, paragraphs are separated by blank lines. para :: GenParser Char ParserState Block @@ -450,6 +447,9 @@ endline = try $ do newline >> notFollowedBy blankline return LineBreak +rawHtmlInline :: GenParser Char ParserState Inline +rawHtmlInline = liftM (HtmlInline . snd) $ htmlTag isInlineTag + -- | Textile standard link syntax is label:"target" link :: GenParser Char ParserState Inline link = try $ do |