aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs43
1 files changed, 20 insertions, 23 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 32ce46fba..f6657a4d1 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -39,7 +39,7 @@ module Text.Pandoc.Readers.HTML ( readHtml
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match
import Text.Pandoc.Definition
-import Text.Pandoc.Builder (text, toList)
+import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Shared
import Text.Pandoc.Options
import Text.Pandoc.Parsing
@@ -47,6 +47,7 @@ import Data.Maybe ( fromMaybe, isJust )
import Data.List ( intercalate )
import Data.Char ( isDigit )
import Control.Monad ( liftM, guard, when, mzero )
+import Control.Applicative ( (<$>), (<$) )
isSpace :: Char -> Bool
isSpace ' ' = True
@@ -58,32 +59,26 @@ isSpace _ = False
readHtml :: ReaderOptions -- ^ Reader options
-> String -- ^ String to parse (assumes @'\n'@ line endings)
-> Pandoc
-readHtml opts inp = Pandoc meta blocks
- where blocks = case runParser parseBody def{ stateOptions = opts }
- "source" rest of
- Left err' -> error $ "\nError at " ++ show err'
- Right result -> result
- tags = canonicalizeTags $
+readHtml opts inp =
+ case runParser parseDoc def{ stateOptions = opts } "source" tags of
+ Left err' -> error $ "\nError at " ++ show err'
+ Right result -> result
+ where tags = canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True } inp
- hasHeader = any (~== TagOpen "head" []) tags
- (meta, rest) = if hasHeader
- then parseHeader tags
- else (Meta [] [] [], tags)
+ parseDoc = do
+ blocks <- (fixPlains False . concat) <$> manyTill block eof
+ meta <- stateMeta <$> getState
+ return $ Pandoc meta blocks
type TagParser = Parser [Tag String] ParserState
--- TODO - fix this - not every header has a title tag
-parseHeader :: [Tag String] -> (Meta, [Tag String])
-parseHeader tags = (Meta{docTitle = tit'', docAuthors = [], docDate = []}, rest)
- where (tit,_) = break (~== TagClose "title") $ drop 1 $
- dropWhile (\t -> not $ t ~== TagOpen "title" []) tags
- tit' = concatMap fromTagText $ filter isTagText tit
- tit'' = normalizeSpaces $ toList $ text tit'
- rest = drop 1 $ dropWhile (\t -> not $ t ~== TagClose "head" ||
- t ~== TagOpen "body" []) tags
+pBody :: TagParser [Block]
+pBody = pInTags "body" block
-parseBody :: TagParser [Block]
-parseBody = liftM (fixPlains False . concat) $ manyTill block eof
+pHead :: TagParser [Block]
+pHead = pInTags "head" $ pTitle <|> ([] <$ pAnyTag)
+ where pTitle = pInTags "title" inline >>= setTitle . normalizeSpaces
+ setTitle t = [] <$ (updateState $ B.setMeta "title" (B.fromList t))
block :: TagParser [Block]
block = choice
@@ -94,6 +89,8 @@ block = choice
, pList
, pHrule
, pSimpleTable
+ , pHead
+ , pBody
, pPlain
, pRawHtmlBlock
]
@@ -366,7 +363,7 @@ pImage = do
let url = fromAttrib "src" tag
let title = fromAttrib "title" tag
let alt = fromAttrib "alt" tag
- return [Image (toList $ text alt) (escapeURI url, title)]
+ return [Image (B.toList $ B.text alt) (escapeURI url, title)]
pCode :: TagParser [Inline]
pCode = try $ do