From 904050fa36715e18522d80432a2666fcbaacd105 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Wed, 22 Dec 2010 20:25:15 -0800 Subject: New HTML reader using tagsoup as a lexer. * The new reader is faster and more accurate. * API changes for Text.Pandoc.Readers.HTML: - removed rawHtmlBlock, anyHtmlBlockTag, anyHtmlInlineTag, anyHtmlTag, anyHtmlEndTag, htmlEndTag, extractTagType, htmlBlockElement, htmlComment - added htmlTag, htmlInBalanced, isInlineTag, isBlockTag, isTextTag * tagsoup is a new dependency. * Text.Pandoc.Parsing: Generalized type on readWith. * Benchmark.hs: Added length calculation to force full evaluation. * Updated HTML reader tests. * Updated markdown and textile readers to use the functions from the HTML reader. * Note: The markdown reader now correctly handles some cases it did not before. For example:
is reproduced without adding a space. is parsed correctly. --- src/Text/Pandoc/Parsing.hs | 6 +- src/Text/Pandoc/Readers/HTML.hs | 961 ++++++++++++++---------------------- src/Text/Pandoc/Readers/Markdown.hs | 55 +-- src/Text/Pandoc/Readers/Textile.hs | 30 +- 4 files changed, 424 insertions(+), 628 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index d8cd7cd7c..3035a2319 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -287,7 +287,7 @@ nullBlock :: GenParser Char st Block nullBlock = anyChar >> return Null -- | Fail if reader is in strict markdown syntax mode. -failIfStrict :: GenParser Char ParserState () +failIfStrict :: GenParser a ParserState () failIfStrict = do state <- getState if stateStrict state then fail "strict mode" else return () @@ -567,9 +567,9 @@ gridTableFooter = blanklines --- -- | Parse a string with a given parser and state. -readWith :: GenParser Char ParserState a -- ^ parser +readWith :: GenParser t ParserState a -- ^ parser -> ParserState -- ^ initial state - -> String -- ^ input string + -> [t] -- ^ input -> a readWith parser state input = case runParser parser state "source" input of diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index c25a73418..16379a82c 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -27,36 +27,355 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of HTML to 'Pandoc' document. -} -module Text.Pandoc.Readers.HTML ( - readHtml, - rawHtmlInline, - rawHtmlBlock, - htmlTag, - anyHtmlBlockTag, - anyHtmlInlineTag, - anyHtmlTag, - anyHtmlEndTag, - htmlEndTag, - extractTagType, - htmlBlockElement, - htmlComment, +module Text.Pandoc.Readers.HTML ( readHtml + , htmlTag + , htmlInBalanced + , isInlineTag + , isBlockTag + , isTextTag + , isCommentTag ) where import Text.ParserCombinators.Parsec +import Text.ParserCombinators.Parsec.Pos +import Text.HTML.TagSoup +import Text.HTML.TagSoup.Match import Text.Pandoc.Definition +import Text.Pandoc.Builder (text, toList) import Text.Pandoc.Shared import Text.Pandoc.Parsing -import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) -import Data.Maybe ( fromMaybe ) -import Data.List ( isPrefixOf, isSuffixOf, intercalate ) -import Data.Char ( toLower, isAlphaNum ) -import Control.Monad ( liftM, when ) +import Data.Maybe ( fromMaybe, isJust ) +import Data.List ( intercalate ) +import Data.Char ( isSpace, isDigit ) +import Control.Monad ( liftM, guard ) -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ParserState -- ^ Parser state -> String -- ^ String to parse (assumes @'\n'@ line endings) -> Pandoc -readHtml = readWith parseHtml +readHtml st inp = Pandoc meta blocks + where blocks = readWith parseBody st body + tags = canonicalizeTags $ + parseTagsOptions parseOptions{ optTagPosition = True } inp + hasHeader = any (~== TagOpen "head" []) tags + (meta, rest) = if hasHeader + then parseHeader tags + else (Meta [] [] [], tags) + body = filter (\t -> not $ + tagOpen (`elem` ["html","head","body"]) (const True) t || + tagClose (`elem` ["html","head","body"]) t) rest + +type TagParser = GenParser (Tag String) ParserState + +parseHeader :: [Tag String] -> (Meta, [Tag String]) +parseHeader tags = (Meta{docTitle = tit'', docAuthors = [], docDate = []}, rest) + where (tit,r) = break (~== TagClose "title") $ drop 1 $ + dropWhile (\t -> not $ t ~== TagOpen "title" []) tags + tit' = concatMap fromTagText $ filter isTagText tit + tit'' = normalizeSpaces $ toList $ text tit' + rest = drop 1 $ dropWhile (\t -> not $ t ~== TagClose "head") r + +parseBody :: TagParser [Block] +parseBody = liftM concat $ manyTill block eof + +block :: TagParser [Block] +block = optional pLocation >> + choice [ + pPara + , pHeader + , pBlockQuote + , pCodeBlock + , pList + , pHrule + , pPlain + , pRawHtmlBlock + ] + +renderTags' :: [Tag String] -> String +renderTags' = renderTagsOptions + renderOptions{ optMinimize = (`elem` ["hr","br","img"]) } + +pList :: TagParser [Block] +pList = pBulletList <|> pOrderedList <|> pDefinitionList + +pBulletList :: TagParser [Block] +pBulletList = try $ do + pSatisfy (~== TagOpen "ul" []) + let nonItem = pSatisfy (\t -> + not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && + not (t ~== TagClose "ul")) + -- note: if they have an
    or