{- Copyright (C) 2006-2010 John MacFarlane This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | Module : Text.Pandoc.Readers.HTML Copyright : Copyright (C) 2006-2010 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane Stability : alpha Portability : portable Conversion of HTML to 'Pandoc' document. -} module Text.Pandoc.Readers.HTML ( readHtml , htmlTag , htmlInBalanced , isInlineTag , isBlockTag , isTextTag , isCommentTag ) where import Text.HTML.TagSoup import Text.HTML.TagSoup.Match import Text.Pandoc.Definition import Text.Pandoc.Builder (text, toList) import Text.Pandoc.Shared import Text.Pandoc.Options import Text.Pandoc.Parsing import Data.Maybe ( fromMaybe, isJust ) import Data.List ( intercalate ) import Data.Char ( isDigit ) import Control.Monad ( liftM, guard, when, mzero ) isSpace :: Char -> Bool isSpace ' ' = True isSpace '\t' = True isSpace '\n' = True isSpace _ = False -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse (assumes @'\n'@ line endings) -> Pandoc readHtml opts inp = Pandoc meta blocks where blocks = readWith parseBody def{ stateOptions = opts } rest tags = canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } inp hasHeader = any (~== TagOpen "head" []) tags (meta, rest) = if hasHeader then parseHeader tags else (Meta [] [] [], tags) type TagParser = Parser [Tag String] ParserState -- TODO - fix this - not every header has a title tag parseHeader :: [Tag String] -> (Meta, [Tag String]) parseHeader tags = (Meta{docTitle = tit'', docAuthors = [], docDate = []}, rest) where (tit,_) = break (~== TagClose "title") $ drop 1 $ dropWhile (\t -> not $ t ~== TagOpen "title" []) tags tit' = concatMap fromTagText $ filter isTagText tit tit'' = normalizeSpaces $ toList $ text tit' rest = drop 1 $ dropWhile (\t -> not $ t ~== TagClose "head" || t ~== TagOpen "body" []) tags parseBody :: TagParser [Block] parseBody = liftM (fixPlains False . concat) $ manyTill block eof block :: TagParser [Block] block = choice [ pPara , pHeader , pBlockQuote , pCodeBlock , pList , pHrule , pSimpleTable , pPlain , pRawHtmlBlock ] pList :: TagParser [Block] pList = pBulletList <|> pOrderedList <|> pDefinitionList pBulletList :: TagParser [Block] pBulletList = try $ do pSatisfy (~== TagOpen "ul" []) let nonItem = pSatisfy (\t -> not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && not (t ~== TagClose "ul")) -- note: if they have an

, -- treat it as a list item, though it's not valid xhtml... skipMany nonItem items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul") return [BulletList $ map (fixPlains True) items] pOrderedList :: TagParser [Block] pOrderedList = try $ do TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" []) let (start, style) = (sta', sty') where sta = fromMaybe "1" $ lookup "start" attribs sta' = if all isDigit sta then read sta else 1 sty = fromMaybe (fromMaybe "" $ lookup "style" attribs) $ lookup "class" attribs sty' = case sty of "lower-roman" -> LowerRoman "upper-roman" -> UpperRoman "lower-alpha" -> LowerAlpha "upper-alpha" -> UpperAlpha "decimal" -> Decimal _ -> DefaultStyle let nonItem = pSatisfy (\t -> not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && not (t ~== TagClose "ol")) -- note: if they have an