Moved everything from src into the top-level directory.

git-svn-id: https://pandoc.googlecode.com/svn/trunk@1104 788f1e2b-df1e-0410-8736-df70ead52e1b
author: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2007-11-29 02:02:34 +0000
committer: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2007-11-29 02:02:34 +0000
commit: 9a67a486c2dc98d14d9687ceb4b01befa09114df (patch)
tree: 42d2f11f5e39a2850934b1ee6e36003df5802993 /Text/Pandoc/Readers/HTML.hs
parent: 47a4a3ab897ab748a4b7eab2ccb95cd9cb0e3864 (diff)
download: pandoc-9a67a486c2dc98d14d9687ceb4b01befa09114df.tar.gz
1 files changed, 496 insertions, 0 deletions
diff --git a/Text/Pandoc/Readers/HTML.hs b/Text/Pandoc/Readers/HTML.hs
new file mode 100644
index 000000000..70a071152
--- /dev/null
+++ b/Text/Pandoc/Readers/HTML.hs
@@ -0,0 +1,496 @@
+{-
+Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+-}
+
+{- |
+   Module      : Text.Pandoc.Readers.HTML
+   Copyright   : Copyright (C) 2006-7 John MacFarlane
+   License     : GNU GPL, version 2 or above 
+
+   Maintainer  : John MacFarlane <jgm@berkeley.edu>
+   Stability   : alpha 
+   Portability : portable
+
+Conversion of HTML to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.HTML ( 
+                                 readHtml, 
+                                 rawHtmlInline, 
+                                 rawHtmlBlock, 
+                                 anyHtmlBlockTag, 
+                                 anyHtmlInlineTag,  
+                                 anyHtmlTag,
+                                 anyHtmlEndTag,
+                                 htmlEndTag,
+                                 extractTagType,
+                                 htmlBlockElement 
+                                ) where
+
+import Text.ParserCombinators.Parsec
+import Text.Pandoc.Definition
+import Text.Pandoc.Shared 
+import Text.Pandoc.CharacterReferences ( characterReference, 
+                                         decodeCharacterReferences )
+import Data.Maybe ( fromMaybe )
+import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf )
+import Data.Char ( toUpper, toLower, isAlphaNum )
+
+-- | Convert HTML-formatted string to 'Pandoc' document.
+readHtml :: ParserState   -- ^ Parser state
+         -> String        -- ^ String to parse
+         -> Pandoc
+readHtml = readWith parseHtml
+
+--
+-- Constants
+--
+
+eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
+                  "map", "area", "object", "script"]
+
+inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
+                  "br", "cite", "code", "dfn", "em", "font", "i", "img",
+                  "input", "kbd", "label", "q", "s", "samp", "select",
+                  "small", "span", "strike", "strong", "sub", "sup",
+                  "textarea", "tt", "u", "var"] ++ eitherBlockOrInline
+
+blockHtmlTags = ["address", "blockquote", "center", "dir", "div",
+                 "dl", "fieldset", "form", "h1", "h2", "h3", "h4",
+                 "h5", "h6", "hr", "isindex", "menu", "noframes",
+                 "noscript", "ol", "p", "pre", "table", "ul", "dd",
+                 "dt", "frameset", "li", "tbody", "td", "tfoot",
+                 "th", "thead", "tr"] ++ eitherBlockOrInline
+
+--
+-- HTML utility functions
+--
+
+-- | Read blocks until end tag.
+blocksTilEnd tag = do
+  blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
+  return $ filter (/= Null) blocks
+
+-- | Read inlines until end tag.
+inlinesTilEnd tag = manyTill inline (htmlEndTag tag)
+
+-- | Parse blocks between open and close tag.
+blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag
+
+-- | Parse inlines between open and close tag.
+inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag
+
+-- | Extract type from a tag:  e.g. @br@ from @\<br\>@
+extractTagType :: String -> String
+extractTagType ('<':rest) = 
+  let isSpaceOrSlash c = c `elem` "/ \n\t" in
+  map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest
+extractTagType _ = ""
+
+-- | Parse any HTML tag (opening or self-closing) and return text of tag
+anyHtmlTag = try $ do
+  char '<'
+  spaces
+  tag <- many1 alphaNum
+  attribs <- many htmlAttribute
+  spaces
+  ender <- option "" (string "/")
+  let ender' = if null ender then "" else " /"
+  spaces
+  char '>'
+  return $ "<" ++ tag ++ 
+           concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
+
+anyHtmlEndTag = try $ do
+  char '<'   
+  spaces
+  char '/'
+  spaces
+  tagType <- many1 alphaNum
+  spaces
+  char '>'
+  return $ "</" ++ tagType ++ ">"
+
+htmlTag :: String -> GenParser Char st (String, [(String, String)])
+htmlTag tag = try $ do
+  char '<'
+  spaces
+  stringAnyCase tag
+  attribs <- many htmlAttribute
+  spaces
+  optional (string "/")
+  spaces
+  char '>'
+  return (tag, (map (\(name, content, raw) -> (name, content)) attribs))
+
+-- parses a quoted html attribute value
+quoted quoteChar = do
+  result <- between (char quoteChar) (char quoteChar) 
+                    (many (noneOf [quoteChar]))
+  return (result, [quoteChar])
+
+htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute
+
+-- minimized boolean attribute
+htmlMinimizedAttribute = try $ do
+  many1 space
+  name <- many1 (choice [letter, oneOf ".-_:"])
+  return (name, name, name)
+
+htmlRegularAttribute = try $ do
+  many1 space
+  name <- many1 (choice [letter, oneOf ".-_:"])
+  spaces
+  char '='
+  spaces
+  (content, quoteStr) <- choice [ (quoted '\''), 
+                                  (quoted '"'), 
+                                  (do
+                                     a <- many (alphaNum <|> (oneOf "-._:"))
+                                     return (a,"")) ]
+  return (name, content,
+          (name ++ "=" ++ quoteStr ++ content ++ quoteStr))
+
+-- | Parse an end tag of type 'tag'
+htmlEndTag tag = try $ do
+  char '<'   
+  spaces
+  char '/'
+  spaces
+  stringAnyCase tag
+  spaces
+  char '>'
+  return $ "</" ++ tag ++ ">"
+
+-- | Returns @True@ if the tag is (or can be) an inline tag.
+isInline tag = (extractTagType tag) `elem` inlineHtmlTags
+
+-- | Returns @True@ if the tag is (or can be) a block tag.
+isBlock tag = (extractTagType tag) `elem` blockHtmlTags 
+
+anyHtmlBlockTag = try $ do
+  tag <- anyHtmlTag <|> anyHtmlEndTag
+  if isBlock tag then return tag else fail "inline tag"
+
+anyHtmlInlineTag = try $ do
+  tag <- anyHtmlTag <|> anyHtmlEndTag
+  if isInline tag then return tag else fail "not an inline tag"
+
+-- | Parses material between script tags.
+-- Scripts must be treated differently, because they can contain '<>' etc.
+htmlScript = try $ do
+  open <- string "<script"
+  rest <- manyTill anyChar (htmlEndTag "script")
+  return $ open ++ rest ++ "</script>"
+
+htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ]
+
+rawHtmlBlock = try $ do
+  notFollowedBy' (htmlTag "/body" <|> htmlTag "/html")
+  body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag
+  sp <- many space
+  state <- getState
+  if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null
+
+-- | Parses an HTML comment.
+htmlComment = try $ do
+  string "<!--"
+  comment <- manyTill anyChar (try (string "-->"))
+  return $ "<!--" ++ comment ++ "-->"
+
+--
+-- parsing documents
+--
+
+xmlDec = try $ do
+  string "<?"
+  rest <- manyTill anyChar (char '>')
+  return $ "<?" ++ rest ++ ">"
+
+definition = try $ do
+  string "<!"
+  rest <- manyTill anyChar (char '>')
+  return $ "<!" ++ rest ++ ">"
+
+nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >>
+  ((rawHtmlBlock >> return ' ') <|> anyChar)
+
+parseTitle = try $ do
+  (tag, _) <- htmlTag "title"
+  contents <- inlinesTilEnd tag
+  spaces
+  return contents
+
+-- parse header and return meta-information (for now, just title)
+parseHead = try $ do
+  htmlTag "head"
+  spaces
+  skipMany nonTitleNonHead
+  contents <- option [] parseTitle
+  skipMany nonTitleNonHead
+  htmlTag "/head"
+  return (contents, [], "")
+
+skipHtmlTag tag = optional (htmlTag tag)
+
+-- h1 class="title" representation of title in body
+bodyTitle = try $ do
+  (tag, attribs) <- htmlTag "h1"  
+  cl <- case (extractAttribute "class" attribs) of
+          Just "title" -> return ""
+          otherwise    -> fail "not title"
+  inlinesTilEnd "h1"
+
+parseHtml = do
+  sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
+  skipHtmlTag "html"
+  spaces
+  (title, authors, date) <- option ([], [], "") parseHead 
+  spaces
+  skipHtmlTag "body"
+  spaces
+  optional bodyTitle  -- skip title in body, because it's represented in meta
+  blocks <- parseBlocks
+  spaces
+  optional (htmlEndTag "body")
+  spaces
+  optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html>
+  eof
+  return $ Pandoc (Meta title authors date) blocks
+
+--
+-- parsing blocks
+--
+
+parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null))
+
+block = choice [ codeBlock
+               , header
+               , hrule
+               , list
+               , blockQuote
+               , para
+               , plain
+               , rawHtmlBlock ] <?> "block"
+
+--
+-- header blocks
+--
+
+header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
+
+headerLevel n = try $ do
+    let level = "h" ++ show n
+    (tag, attribs) <- htmlTag level
+    contents <- inlinesTilEnd level
+    return $ Header n (normalizeSpaces contents)
+
+--
+-- hrule block
+--
+
+hrule = try  $ do
+  (tag, attribs) <- htmlTag "hr"
+  state <- getState
+  if not (null attribs) && stateParseRaw state
+     then unexpected "attributes in hr" -- parse as raw in this case
+     else return HorizontalRule
+
+--
+-- code blocks
+--
+
+-- Note:  HTML tags in code blocks (e.g. for syntax highlighting) are 
+-- skipped, because they are not portable to output formats other than HTML.
+codeBlock = try $ do
+    htmlTag "pre" 
+    result <- manyTill 
+              (many1 (satisfy (/= '<')) <|> 
+               ((anyHtmlTag <|> anyHtmlEndTag) >> return ""))
+              (htmlEndTag "pre")
+    let result' = concat result
+    -- drop leading newline if any
+    let result'' = if "\n" `isPrefixOf` result'
+                      then drop 1 result'
+                      else result'
+    -- drop trailing newline if any
+    let result''' = if "\n" `isSuffixOf` result''
+                       then init result''
+                       else result''
+    return $ CodeBlock $ decodeCharacterReferences result'''
+
+--
+-- block quotes
+--
+
+blockQuote = try $ htmlTag "blockquote" >> spaces >> 
+                   blocksTilEnd "blockquote" >>= (return . BlockQuote)
+
+--
+-- list blocks
+--
+
+list = choice [ bulletList, orderedList, definitionList ] <?> "list"
+
+orderedList = try $ do
+  (_, attribs) <- htmlTag "ol"
+  (start, style) <- option (1, DefaultStyle) $
+                           do failIfStrict
+                              let sta = fromMaybe "1" $ 
+                                        lookup "start" attribs
+                              let sty = fromMaybe (fromMaybe "" $
+                                        lookup "style" attribs) $
+                                        lookup "class" attribs
+                              let sty' = case sty of
+                                          "lower-roman"  -> LowerRoman
+                                          "upper-roman"  -> UpperRoman
+                                          "lower-alpha"  -> LowerAlpha
+                                          "upper-alpha"  -> UpperAlpha
+                                          "decimal"      -> Decimal
+                                          _              -> DefaultStyle
+                              return (read sta, sty')
+  spaces
+  items <- sepEndBy1 (blocksIn "li") spaces
+  htmlEndTag "ol"
+  return $ OrderedList (start, style, DefaultDelim) items
+
+bulletList = try $ do
+  htmlTag "ul"
+  spaces
+  items <- sepEndBy1 (blocksIn "li") spaces
+  htmlEndTag "ul"
+  return $ BulletList items
+
+definitionList = try $ do
+  failIfStrict  -- def lists not part of standard markdown
+  tag <- htmlTag "dl"
+  spaces
+  items <- sepEndBy1 definitionListItem spaces
+  htmlEndTag "dl"
+  return $ DefinitionList items
+
+definitionListItem = try $ do
+  terms <- sepEndBy1 (inlinesIn "dt") spaces
+  defs <- sepEndBy1 (blocksIn "dd") spaces
+  let term = joinWithSep [LineBreak] terms
+  return (term, concat defs)
+
+--
+-- paragraph block
+--
+
+para = try $ htmlTag "p" >> inlinesTilEnd "p" >>= 
+             return . Para . normalizeSpaces
+
+-- 
+-- plain block
+--
+
+plain = many1 inline >>= return . Plain . normalizeSpaces
+
+-- 
+-- inline
+--
+
+inline = choice [ charRef
+                , strong
+                , emph
+                , superscript
+                , subscript
+                , strikeout
+                , spanStrikeout
+                , code
+                , str
+                , linebreak
+                , whitespace
+                , link
+                , image
+                , rawHtmlInline
+                ] <?> "inline"
+
+code = try $ do 
+  htmlTag "code"
+  result <- manyTill anyChar (htmlEndTag "code")
+  -- remove internal line breaks, leading and trailing space,
+  -- and decode character references
+  return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $ 
+                  joinWithSep " " $ lines result 
+
+rawHtmlInline = do
+  result <- htmlScript <|> htmlComment <|> anyHtmlInlineTag
+  state <- getState
+  if stateParseRaw state then return (HtmlInline result) else return (Str "")
+
+betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>= 
+                        return . normalizeSpaces
+
+emph = (betweenTags "em" <|> betweenTags "it") >>= return . Emph
+
+strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong
+
+superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript
+
+subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript
+
+strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>=
+            return . Strikeout
+
+spanStrikeout = try $ do
+  failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
+  (tag, attributes) <- htmlTag "span" 
+  result <- case (extractAttribute "class" attributes) of
+              Just "strikeout" -> inlinesTilEnd "span"
+              _                -> fail "not a strikeout"
+  return $ Strikeout result
+
+whitespace = many1 space >> return Space
+
+-- hard line break
+linebreak = htmlTag "br" >> optional newline >> return LineBreak
+
+str = many1 (noneOf "<& \t\n") >>= return . Str
+
+--
+-- links and images
+--
+
+-- extract contents of attribute (attribute names are case-insensitive)
+extractAttribute name [] = Nothing
+extractAttribute name ((attrName, contents):rest) = 
+  let name'     = map toLower name 
+      attrName' = map toLower attrName
+  in  if attrName' == name'
+         then Just (decodeCharacterReferences contents)
+         else extractAttribute name rest
+
+link = try $ do
+  (tag, attributes) <- htmlTag "a"  
+  url <- case (extractAttribute "href" attributes) of
+           Just url -> return url
+           Nothing  -> fail "no href"
+  let title = fromMaybe "" $ extractAttribute "title" attributes
+  label <- inlinesTilEnd "a"
+  return $ Link (normalizeSpaces label) (url, title)
+
+image = try $ do
+  (tag, attributes) <- htmlTag "img" 
+  url <- case (extractAttribute "src" attributes) of
+           Just url -> return url
+           Nothing  -> fail "no src"
+  let title = fromMaybe "" $ extractAttribute "title" attributes
+  let alt = fromMaybe "" (extractAttribute "alt" attributes)
+  return $ Image [Str alt] (url, title)
+
author	fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>	2007-11-29 02:02:34 +0000
committer	fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>	2007-11-29 02:02:34 +0000
commit	9a67a486c2dc98d14d9687ceb4b01befa09114df (patch)
tree	42d2f11f5e39a2850934b1ee6e36003df5802993 /Text/Pandoc/Readers/HTML.hs
parent	47a4a3ab897ab748a4b7eab2ccb95cd9cb0e3864 (diff)
download	pandoc-9a67a486c2dc98d14d9687ceb4b01befa09114df.tar.gz