about | summary | refs | log | tree | commit | diff
path: root/src/Text/Pandoc/Readers/HTML.hs
diff options
context:
space:
mode:
author John MacFarlane <fiddlosopher@gmail.com> 2013-05-10 22:53:35 -0700
committer John MacFarlane <fiddlosopher@gmail.com> 2013-06-24 20:29:41 -0700
commit f869f7e08dad315945d52be3fcacf6ff0c05c5c1 (patch)
tree 4c426ebf5a30b51499859f9d41a890534b6a18a6 /src/Text/Pandoc/Readers/HTML.hs
parent e32a8f5981969bb6d0a11bd945188c35817e4d96 (diff)
download pandoc-f869f7e08dad315945d52be3fcacf6ff0c05c5c1.tar.gz
Use new flexible metadata type.
* Depend on pandoc 1.12. * Added yaml dependency. * `Text.Pandoc.XML`: Removed `stripTags`. (API change.) * `Text.Pandoc.Shared`: Added `metaToJSON`. This will be used in writers to create a JSON object for use in the templates from the pandoc metadata. * Revised readers and writers to use the new Meta type. * `Text.Pandoc.Options`: Added `Ext_yaml_title_block`. * Markdown reader: Added support for YAML metadata block. Note that it must come at the beginning of the document. * `Text.Pandoc.Parsing.ParserState`: Replace `stateTitle`, `stateAuthors`, `stateDate` with `stateMeta`. * RST reader: Improved metadata. Treat initial field list as metadata when standalone is specified. Previously ALL fields "title", "author", "date" in field lists were treated as metadata, even if not at the beginning. Use `subtitle` metadata field for subtitle. * `Text.Pandoc.Templates`: Export `renderTemplate'` that takes a string instead of a compiled template. * OPML template: Use 'for' loop for authors. * Org template: '#+TITLE:' is inserted before the title. Previously the writer did this.
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r-- src/Text/Pandoc/Readers/HTML.hs 43
1 files changed, 20 insertions, 23 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 32ce46fba..f6657a4d1 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -39,7 +39,7 @@ module Text.Pandoc.Readers.HTML ( readHtml
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match
import Text.Pandoc.Definition
-import Text.Pandoc.Builder (text, toList)
+import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Shared
import Text.Pandoc.Options
import Text.Pandoc.Parsing
@@ -47,6 +47,7 @@ import Data.Maybe ( fromMaybe, isJust )
import Data.List ( intercalate )
import Data.Char ( isDigit )
import Control.Monad ( liftM, guard, when, mzero )
+import Control.Applicative ( (<$>), (<$) )
isSpace :: Char -> Bool
isSpace ' ' = True
@@ -58,32 +59,26 @@ isSpace _ = False
readHtml :: ReaderOptions -- ^ Reader options
-> String -- ^ String to parse (assumes @'\n'@ line endings)
-> Pandoc
-readHtml opts inp = Pandoc meta blocks
- where blocks = case runParser parseBody def{ stateOptions = opts }
- "source" rest of
- Left err' -> error $ "\nError at " ++ show err'
- Right result -> result
- tags = canonicalizeTags $
+readHtml opts inp =
+ case runParser parseDoc def{ stateOptions = opts } "source" tags of
+ Left err' -> error $ "\nError at " ++ show err'
+ Right result -> result
+ where tags = canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True } inp
- hasHeader = any (~== TagOpen "head" []) tags
- (meta, rest) = if hasHeader
- then parseHeader tags
- else (Meta [] [] [], tags)
+ parseDoc = do
+ blocks <- (fixPlains False . concat) <$> manyTill block eof
+ meta <- stateMeta <$> getState
+ return $ Pandoc meta blocks
type TagParser = Parser [Tag String] ParserState
--- TODO - fix this - not every header has a title tag
-parseHeader :: [Tag String] -> (Meta, [Tag String])
-parseHeader tags = (Meta{docTitle = tit'', docAuthors = [], docDate = []}, rest)
- where (tit,_) = break (~== TagClose "title") $ drop 1 $
- dropWhile (\t -> not $ t ~== TagOpen "title" []) tags
- tit' = concatMap fromTagText $ filter isTagText tit
- tit'' = normalizeSpaces $ toList $ text tit'
- rest = drop 1 $ dropWhile (\t -> not $ t ~== TagClose "head" ||
- t ~== TagOpen "body" []) tags
+pBody :: TagParser [Block]
+pBody = pInTags "body" block
-parseBody :: TagParser [Block]
-parseBody = liftM (fixPlains False . concat) $ manyTill block eof
+pHead :: TagParser [Block]
+pHead = pInTags "head" $ pTitle <|> ([] <$ pAnyTag)
+ where pTitle = pInTags "title" inline >>= setTitle . normalizeSpaces
+ setTitle t = [] <$ (updateState $ B.setMeta "title" (B.fromList t))
block :: TagParser [Block]
block = choice
@@ -94,6 +89,8 @@ block = choice
, pList
, pHrule
, pSimpleTable
+ , pHead
+ , pBody
, pPlain
, pRawHtmlBlock
]
@@ -366,7 +363,7 @@ pImage = do
let url = fromAttrib "src" tag
let title = fromAttrib "title" tag
let alt = fromAttrib "alt" tag
- return [Image (toList $ text alt) (escapeURI url, title)]
+ return [Image (B.toList $ B.text alt) (escapeURI url, title)]
pCode :: TagParser [Inline]
pCode = try $ do