New HTML reader using tagsoup as a lexer.

* The new reader is faster and more accurate. * API changes for Text.Pandoc.Readers.HTML: - removed rawHtmlBlock, anyHtmlBlockTag, anyHtmlInlineTag, anyHtmlTag, anyHtmlEndTag, htmlEndTag, extractTagType, htmlBlockElement, htmlComment - added htmlTag, htmlInBalanced, isInlineTag, isBlockTag, isTextTag * tagsoup is a new dependency. * Text.Pandoc.Parsing: Generalized type on readWith. * Benchmark.hs: Added length calculation to force full evaluation. * Updated HTML reader tests. * Updated markdown and textile readers to use the functions from the HTML reader. * Note: The markdown reader now correctly handles some cases it did not before. For example: <hr/> is reproduced without adding a space. <script> a = '<b>'; </script> is parsed correctly.
author: John MacFarlane <jgm@berkeley.edu> 2010-12-22 20:25:15 -0800
committer: John MacFarlane <jgm@berkeley.edu> 2010-12-30 13:55:40 -0800
commit: 904050fa36715e18522d80432a2666fcbaacd105 (patch)
tree: 4745876e797d400539dd80309d31c330a013e969 /src/Text/Pandoc/Readers/Textile.hs
parent: 220fe5fab89ce84fcb98f0430c4126281ca8362d (diff)
download: pandoc-904050fa36715e18522d80432a2666fcbaacd105.tar.gz
1 files changed, 15 insertions, 15 deletions
diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs
index 52a9e12c8..8362c542c 100644
--- a/src/Text/Pandoc/Readers/Textile.hs
+++ b/src/Text/Pandoc/Readers/Textile.hs
@@ -58,10 +58,9 @@ module Text.Pandoc.Readers.Textile ( readTextile) where
 import Text.Pandoc.Definition
 import Text.Pandoc.Shared 
 import Text.Pandoc.Parsing
-import Text.Pandoc.Readers.HTML ( htmlTag, htmlEndTag, -- find code blocks
-                                  rawHtmlBlock, rawHtmlInline )
--- import Text.Pandoc.Readers.Markdown (smartPunctuation)
+import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag )
 import Text.ParserCombinators.Parsec
+import Text.HTML.TagSoup.Match
 import Data.Char ( digitToInt, isLetter )
 import Control.Monad ( guard, liftM )
 
@@ -127,7 +126,7 @@ blockParsers = [ codeBlock
                , blockQuote
                , hrule
                , anyList
-               , rawHtmlBlock'
+               , rawHtmlBlock
                , maybeExplicitBlock "table" table
                , maybeExplicitBlock "p" para
                , nullBlock ]
@@ -139,8 +138,8 @@ block = choice blockParsers <?> "block"
 -- | Code Blocks in Textile are between <pre> and </pre>
 codeBlock :: GenParser Char ParserState Block
 codeBlock = try $ do
-  htmlTag False "pre"
-  result' <- manyTill anyChar (try $ htmlEndTag "pre" >> blockBreak)
+  htmlTag (tagOpen (=="pre") null)
+  result' <- manyTill anyChar (try $ htmlTag (tagClose (=="pre")) >> blockBreak)
   -- drop leading newline if any
   let result'' = case result' of
                       '\n':xs -> xs
@@ -261,21 +260,19 @@ definitionListItem = try $ do
           -- this ++ "\n\n" does not look very good
           ds <- parseFromString parseBlocks (s ++ "\n\n")
           return [ds]
-  
-  
+
 -- | This terminates a block such as a paragraph. Because of raw html
 -- blocks support, we have to lookAhead for a rawHtmlBlock.
 blockBreak :: GenParser Char ParserState ()
-blockBreak = try $ choice 
-             [newline >> blanklines >> return (),
-              lookAhead rawHtmlBlock' >> return ()]
+blockBreak = try (newline >> blanklines >> return ()) <|>
+              (lookAhead rawHtmlBlock >> return ())
 
 -- | A raw Html Block, optionally followed by blanklines
-rawHtmlBlock' :: GenParser Char ParserState Block
-rawHtmlBlock' = try $ do
-  b <- rawHtmlBlock
+rawHtmlBlock :: GenParser Char ParserState Block
+rawHtmlBlock = try $ do
+  (_,b) <- htmlTag isBlockTag
   optional blanklines
-  return b
+  return $ RawHtml b
 
 -- | In textile, paragraphs are separated by blank lines.
 para :: GenParser Char ParserState Block
@@ -450,6 +447,9 @@ endline = try $ do
   newline >> notFollowedBy blankline
   return LineBreak
 
+rawHtmlInline :: GenParser Char ParserState Inline
+rawHtmlInline = liftM (HtmlInline . snd) $ htmlTag isInlineTag
+
 -- | Textile standard link syntax is label:"target"
 link :: GenParser Char ParserState Inline
 link = try $ do
author	John MacFarlane <jgm@berkeley.edu>	2010-12-22 20:25:15 -0800
committer	John MacFarlane <jgm@berkeley.edu>	2010-12-30 13:55:40 -0800
commit	904050fa36715e18522d80432a2666fcbaacd105 (patch)
tree	4745876e797d400539dd80309d31c330a013e969 /src/Text/Pandoc/Readers/Textile.hs
parent	220fe5fab89ce84fcb98f0430c4126281ca8362d (diff)
download	pandoc-904050fa36715e18522d80432a2666fcbaacd105.tar.gz