From 47a4a3ab897ab748a4b7eab2ccb95cd9cb0e3864 Mon Sep 17 00:00:00 2001 From: fiddlosopher Date: Wed, 28 Nov 2007 21:01:17 +0000 Subject: Removed Text directory. This is a remnant of an experiment moving the contents of src/ to the top level, and should have been deleted long ago. git-svn-id: https://pandoc.googlecode.com/svn/trunk@1097 788f1e2b-df1e-0410-8736-df70ead52e1b --- Text/Pandoc/Readers/HTML.hs | 496 -------------------------------------------- 1 file changed, 496 deletions(-) delete mode 100644 Text/Pandoc/Readers/HTML.hs (limited to 'Text/Pandoc/Readers/HTML.hs') diff --git a/Text/Pandoc/Readers/HTML.hs b/Text/Pandoc/Readers/HTML.hs deleted file mode 100644 index 70a071152..000000000 --- a/Text/Pandoc/Readers/HTML.hs +++ /dev/null @@ -1,496 +0,0 @@ -{- -Copyright (C) 2006-7 John MacFarlane - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.HTML - Copyright : Copyright (C) 2006-7 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane - Stability : alpha - Portability : portable - -Conversion of HTML to 'Pandoc' document. --} -module Text.Pandoc.Readers.HTML ( - readHtml, - rawHtmlInline, - rawHtmlBlock, - anyHtmlBlockTag, - anyHtmlInlineTag, - anyHtmlTag, - anyHtmlEndTag, - htmlEndTag, - extractTagType, - htmlBlockElement - ) where - -import Text.ParserCombinators.Parsec -import Text.Pandoc.Definition -import Text.Pandoc.Shared -import Text.Pandoc.CharacterReferences ( characterReference, - decodeCharacterReferences ) -import Data.Maybe ( fromMaybe ) -import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf ) -import Data.Char ( toUpper, toLower, isAlphaNum ) - --- | Convert HTML-formatted string to 'Pandoc' document. -readHtml :: ParserState -- ^ Parser state - -> String -- ^ String to parse - -> Pandoc -readHtml = readWith parseHtml - --- --- Constants --- - -eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins", - "map", "area", "object", "script"] - -inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", - "br", "cite", "code", "dfn", "em", "font", "i", "img", - "input", "kbd", "label", "q", "s", "samp", "select", - "small", "span", "strike", "strong", "sub", "sup", - "textarea", "tt", "u", "var"] ++ eitherBlockOrInline - -blockHtmlTags = ["address", "blockquote", "center", "dir", "div", - "dl", "fieldset", "form", "h1", "h2", "h3", "h4", - "h5", "h6", "hr", "isindex", "menu", "noframes", - "noscript", "ol", "p", "pre", "table", "ul", "dd", - "dt", "frameset", "li", "tbody", "td", "tfoot", - "th", "thead", "tr"] ++ eitherBlockOrInline - --- --- HTML utility functions --- - --- | Read blocks until end tag. -blocksTilEnd tag = do - blocks <- manyTill (block >>~ spaces) (htmlEndTag tag) - return $ filter (/= Null) blocks - --- | Read inlines until end tag. -inlinesTilEnd tag = manyTill inline (htmlEndTag tag) - --- | Parse blocks between open and close tag. -blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag - --- | Parse inlines between open and close tag. -inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag - --- | Extract type from a tag: e.g. @br@ from @\@ -extractTagType :: String -> String -extractTagType ('<':rest) = - let isSpaceOrSlash c = c `elem` "/ \n\t" in - map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest -extractTagType _ = "" - --- | Parse any HTML tag (opening or self-closing) and return text of tag -anyHtmlTag = try $ do - char '<' - spaces - tag <- many1 alphaNum - attribs <- many htmlAttribute - spaces - ender <- option "" (string "/") - let ender' = if null ender then "" else " /" - spaces - char '>' - return $ "<" ++ tag ++ - concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" - -anyHtmlEndTag = try $ do - char '<' - spaces - char '/' - spaces - tagType <- many1 alphaNum - spaces - char '>' - return $ "" - -htmlTag :: String -> GenParser Char st (String, [(String, String)]) -htmlTag tag = try $ do - char '<' - spaces - stringAnyCase tag - attribs <- many htmlAttribute - spaces - optional (string "/") - spaces - char '>' - return (tag, (map (\(name, content, raw) -> (name, content)) attribs)) - --- parses a quoted html attribute value -quoted quoteChar = do - result <- between (char quoteChar) (char quoteChar) - (many (noneOf [quoteChar])) - return (result, [quoteChar]) - -htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute - --- minimized boolean attribute -htmlMinimizedAttribute = try $ do - many1 space - name <- many1 (choice [letter, oneOf ".-_:"]) - return (name, name, name) - -htmlRegularAttribute = try $ do - many1 space - name <- many1 (choice [letter, oneOf ".-_:"]) - spaces - char '=' - spaces - (content, quoteStr) <- choice [ (quoted '\''), - (quoted '"'), - (do - a <- many (alphaNum <|> (oneOf "-._:")) - return (a,"")) ] - return (name, content, - (name ++ "=" ++ quoteStr ++ content ++ quoteStr)) - --- | Parse an end tag of type 'tag' -htmlEndTag tag = try $ do - char '<' - spaces - char '/' - spaces - stringAnyCase tag - spaces - char '>' - return $ "" - --- | Returns @True@ if the tag is (or can be) an inline tag. -isInline tag = (extractTagType tag) `elem` inlineHtmlTags - --- | Returns @True@ if the tag is (or can be) a block tag. -isBlock tag = (extractTagType tag) `elem` blockHtmlTags - -anyHtmlBlockTag = try $ do - tag <- anyHtmlTag <|> anyHtmlEndTag - if isBlock tag then return tag else fail "inline tag" - -anyHtmlInlineTag = try $ do - tag <- anyHtmlTag <|> anyHtmlEndTag - if isInline tag then return tag else fail "not an inline tag" - --- | Parses material between script tags. --- Scripts must be treated differently, because they can contain '<>' etc. -htmlScript = try $ do - open <- string "" - -htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ] - -rawHtmlBlock = try $ do - notFollowedBy' (htmlTag "/body" <|> htmlTag "/html") - body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag - sp <- many space - state <- getState - if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null - --- | Parses an HTML comment. -htmlComment = try $ do - string "")) - return $ "" - --- --- parsing documents --- - -xmlDec = try $ do - string "') - return $ "" - -definition = try $ do - string "') - return $ "" - -nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >> - ((rawHtmlBlock >> return ' ') <|> anyChar) - -parseTitle = try $ do - (tag, _) <- htmlTag "title" - contents <- inlinesTilEnd tag - spaces - return contents - --- parse header and return meta-information (for now, just title) -parseHead = try $ do - htmlTag "head" - spaces - skipMany nonTitleNonHead - contents <- option [] parseTitle - skipMany nonTitleNonHead - htmlTag "/head" - return (contents, [], "") - -skipHtmlTag tag = optional (htmlTag tag) - --- h1 class="title" representation of title in body -bodyTitle = try $ do - (tag, attribs) <- htmlTag "h1" - cl <- case (extractAttribute "class" attribs) of - Just "title" -> return "" - otherwise -> fail "not title" - inlinesTilEnd "h1" - -parseHtml = do - sepEndBy (choice [xmlDec, definition, htmlComment]) spaces - skipHtmlTag "html" - spaces - (title, authors, date) <- option ([], [], "") parseHead - spaces - skipHtmlTag "body" - spaces - optional bodyTitle -- skip title in body, because it's represented in meta - blocks <- parseBlocks - spaces - optional (htmlEndTag "body") - spaces - optional (htmlEndTag "html" >> many anyChar) -- ignore anything after - eof - return $ Pandoc (Meta title authors date) blocks - --- --- parsing blocks --- - -parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null)) - -block = choice [ codeBlock - , header - , hrule - , list - , blockQuote - , para - , plain - , rawHtmlBlock ] "block" - --- --- header blocks --- - -header = choice (map headerLevel (enumFromTo 1 5)) "header" - -headerLevel n = try $ do - let level = "h" ++ show n - (tag, attribs) <- htmlTag level - contents <- inlinesTilEnd level - return $ Header n (normalizeSpaces contents) - --- --- hrule block --- - -hrule = try $ do - (tag, attribs) <- htmlTag "hr" - state <- getState - if not (null attribs) && stateParseRaw state - then unexpected "attributes in hr" -- parse as raw in this case - else return HorizontalRule - --- --- code blocks --- - --- Note: HTML tags in code blocks (e.g. for syntax highlighting) are --- skipped, because they are not portable to output formats other than HTML. -codeBlock = try $ do - htmlTag "pre" - result <- manyTill - (many1 (satisfy (/= '<')) <|> - ((anyHtmlTag <|> anyHtmlEndTag) >> return "")) - (htmlEndTag "pre") - let result' = concat result - -- drop leading newline if any - let result'' = if "\n" `isPrefixOf` result' - then drop 1 result' - else result' - -- drop trailing newline if any - let result''' = if "\n" `isSuffixOf` result'' - then init result'' - else result'' - return $ CodeBlock $ decodeCharacterReferences result''' - --- --- block quotes --- - -blockQuote = try $ htmlTag "blockquote" >> spaces >> - blocksTilEnd "blockquote" >>= (return . BlockQuote) - --- --- list blocks --- - -list = choice [ bulletList, orderedList, definitionList ] "list" - -orderedList = try $ do - (_, attribs) <- htmlTag "ol" - (start, style) <- option (1, DefaultStyle) $ - do failIfStrict - let sta = fromMaybe "1" $ - lookup "start" attribs - let sty = fromMaybe (fromMaybe "" $ - lookup "style" attribs) $ - lookup "class" attribs - let sty' = case sty of - "lower-roman" -> LowerRoman - "upper-roman" -> UpperRoman - "lower-alpha" -> LowerAlpha - "upper-alpha" -> UpperAlpha - "decimal" -> Decimal - _ -> DefaultStyle - return (read sta, sty') - spaces - items <- sepEndBy1 (blocksIn "li") spaces - htmlEndTag "ol" - return $ OrderedList (start, style, DefaultDelim) items - -bulletList = try $ do - htmlTag "ul" - spaces - items <- sepEndBy1 (blocksIn "li") spaces - htmlEndTag "ul" - return $ BulletList items - -definitionList = try $ do - failIfStrict -- def lists not part of standard markdown - tag <- htmlTag "dl" - spaces - items <- sepEndBy1 definitionListItem spaces - htmlEndTag "dl" - return $ DefinitionList items - -definitionListItem = try $ do - terms <- sepEndBy1 (inlinesIn "dt") spaces - defs <- sepEndBy1 (blocksIn "dd") spaces - let term = joinWithSep [LineBreak] terms - return (term, concat defs) - --- --- paragraph block --- - -para = try $ htmlTag "p" >> inlinesTilEnd "p" >>= - return . Para . normalizeSpaces - --- --- plain block --- - -plain = many1 inline >>= return . Plain . normalizeSpaces - --- --- inline --- - -inline = choice [ charRef - , strong - , emph - , superscript - , subscript - , strikeout - , spanStrikeout - , code - , str - , linebreak - , whitespace - , link - , image - , rawHtmlInline - ] "inline" - -code = try $ do - htmlTag "code" - result <- manyTill anyChar (htmlEndTag "code") - -- remove internal line breaks, leading and trailing space, - -- and decode character references - return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $ - joinWithSep " " $ lines result - -rawHtmlInline = do - result <- htmlScript <|> htmlComment <|> anyHtmlInlineTag - state <- getState - if stateParseRaw state then return (HtmlInline result) else return (Str "") - -betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>= - return . normalizeSpaces - -emph = (betweenTags "em" <|> betweenTags "it") >>= return . Emph - -strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong - -superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript - -subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript - -strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>= - return . Strikeout - -spanStrikeout = try $ do - failIfStrict -- strict markdown has no strikeout, so treat as raw HTML - (tag, attributes) <- htmlTag "span" - result <- case (extractAttribute "class" attributes) of - Just "strikeout" -> inlinesTilEnd "span" - _ -> fail "not a strikeout" - return $ Strikeout result - -whitespace = many1 space >> return Space - --- hard line break -linebreak = htmlTag "br" >> optional newline >> return LineBreak - -str = many1 (noneOf "<& \t\n") >>= return . Str - --- --- links and images --- - --- extract contents of attribute (attribute names are case-insensitive) -extractAttribute name [] = Nothing -extractAttribute name ((attrName, contents):rest) = - let name' = map toLower name - attrName' = map toLower attrName - in if attrName' == name' - then Just (decodeCharacterReferences contents) - else extractAttribute name rest - -link = try $ do - (tag, attributes) <- htmlTag "a" - url <- case (extractAttribute "href" attributes) of - Just url -> return url - Nothing -> fail "no href" - let title = fromMaybe "" $ extractAttribute "title" attributes - label <- inlinesTilEnd "a" - return $ Link (normalizeSpaces label) (url, title) - -image = try $ do - (tag, attributes) <- htmlTag "img" - url <- case (extractAttribute "src" attributes) of - Just url -> return url - Nothing -> fail "no src" - let title = fromMaybe "" $ extractAttribute "title" attributes - let alt = fromMaybe "" (extractAttribute "alt" attributes) - return $ Image [Str alt] (url, title) - -- cgit v1.2.3