Use new flexible metadata type.

* Depend on pandoc 1.12. * Added yaml dependency. * `Text.Pandoc.XML`: Removed `stripTags`. (API change.) * `Text.Pandoc.Shared`: Added `metaToJSON`. This will be used in writers to create a JSON object for use in the templates from the pandoc metadata. * Revised readers and writers to use the new Meta type. * `Text.Pandoc.Options`: Added `Ext_yaml_title_block`. * Markdown reader: Added support for YAML metadata block. Note that it must come at the beginning of the document. * `Text.Pandoc.Parsing.ParserState`: Replaced `stateTitle`, `stateAuthors`, `stateDate` with `stateMeta`. * RST reader: Improved metadata. Treat initial field list as metadata when standalone is specified. Previously ALL fields "title", "author", "date" in field lists were treated as metadata, even if not at the beginning. Use `subtitle` metadata field for subtitle. * `Text.Pandoc.Templates`: Export `renderTemplate'` that takes a string instead of a compiled template. * OPML template: Use 'for' loop for authors. * Org template: '#+TITLE:' is inserted before the title. Previously the writer did this.
author: John MacFarlane <fiddlosopher@gmail.com> 2013-05-10 22:53:35 -0700
committer: John MacFarlane <fiddlosopher@gmail.com> 2013-06-24 20:29:41 -0700
commit: f869f7e08dad315945d52be3fcacf6ff0c05c5c1 (patch)
tree: 4c426ebf5a30b51499859f9d41a890534b6a18a6 /src/Text/Pandoc/Readers/HTML.hs
parent: e32a8f5981969bb6d0a11bd945188c35817e4d96 (diff)
download: pandoc-f869f7e08dad315945d52be3fcacf6ff0c05c5c1.tar.gz
1 files changed, 20 insertions, 23 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 32ce46fba..f6657a4d1 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -39,7 +39,7 @@ module Text.Pandoc.Readers.HTML ( readHtml
 import Text.HTML.TagSoup
 import Text.HTML.TagSoup.Match
 import Text.Pandoc.Definition
-import Text.Pandoc.Builder (text, toList)
+import qualified Text.Pandoc.Builder as B
 import Text.Pandoc.Shared
 import Text.Pandoc.Options
 import Text.Pandoc.Parsing
@@ -47,6 +47,7 @@ import Data.Maybe ( fromMaybe, isJust )
 import Data.List ( intercalate )
 import Data.Char ( isDigit )
 import Control.Monad ( liftM, guard, when, mzero )
+import Control.Applicative ( (<$>), (<$) )
 
 isSpace :: Char -> Bool
 isSpace ' '  = True
@@ -58,32 +59,26 @@ isSpace _    = False
 readHtml :: ReaderOptions -- ^ Reader options
          -> String        -- ^ String to parse (assumes @'\n'@ line endings)
          -> Pandoc
-readHtml opts inp = Pandoc meta blocks
-  where blocks  = case runParser parseBody def{ stateOptions = opts }
-                    "source" rest of
-                      Left err'    -> error $ "\nError at " ++ show  err'
-                      Right result -> result
-        tags    = canonicalizeTags $
+readHtml opts inp =
+  case runParser parseDoc def{ stateOptions = opts } "source" tags of
+          Left err'    -> error $ "\nError at " ++ show  err'
+          Right result -> result
+    where tags = canonicalizeTags $
                    parseTagsOptions parseOptions{ optTagPosition = True } inp
-        hasHeader = any (~== TagOpen "head" []) tags
-        (meta, rest) = if hasHeader
-                          then parseHeader tags
-                          else (Meta [] [] [], tags)
+          parseDoc = do
+             blocks <- (fixPlains False . concat) <$> manyTill block eof
+             meta <- stateMeta <$> getState
+             return $ Pandoc meta blocks
 
 type TagParser = Parser [Tag String] ParserState
 
--- TODO - fix this - not every header has a title tag
-parseHeader :: [Tag String] -> (Meta, [Tag String])
-parseHeader tags = (Meta{docTitle = tit'', docAuthors = [], docDate = []}, rest)
-  where (tit,_) = break (~== TagClose "title") $ drop 1 $
-                   dropWhile (\t -> not $ t ~== TagOpen "title" []) tags
-        tit' = concatMap fromTagText $ filter isTagText tit
-        tit'' = normalizeSpaces $ toList $ text tit'
-        rest  = drop 1 $ dropWhile (\t -> not $ t ~== TagClose "head" ||
-                                                t ~== TagOpen "body" []) tags
+pBody :: TagParser [Block]
+pBody = pInTags "body" block
 
-parseBody :: TagParser [Block]
-parseBody = liftM (fixPlains False . concat) $ manyTill block eof
+pHead :: TagParser [Block]
+pHead = pInTags "head" $ pTitle <|> ([] <$ pAnyTag)
+  where pTitle = pInTags "title" inline >>= setTitle . normalizeSpaces
+        setTitle t = [] <$ (updateState $ B.setMeta "title" (B.fromList t))
 
 block :: TagParser [Block]
 block = choice
@@ -94,6 +89,8 @@ block = choice
             , pList
             , pHrule
             , pSimpleTable
+            , pHead
+            , pBody
             , pPlain
             , pRawHtmlBlock
             ]
@@ -366,7 +363,7 @@ pImage = do
   let url = fromAttrib "src" tag
   let title = fromAttrib "title" tag
   let alt = fromAttrib "alt" tag
-  return [Image (toList $ text alt) (escapeURI url, title)]
+  return [Image (B.toList $ B.text alt) (escapeURI url, title)]
 
 pCode :: TagParser [Inline]
 pCode = try $ do
author	John MacFarlane <fiddlosopher@gmail.com>	2013-05-10 22:53:35 -0700
committer	John MacFarlane <fiddlosopher@gmail.com>	2013-06-24 20:29:41 -0700
commit	f869f7e08dad315945d52be3fcacf6ff0c05c5c1 (patch)
tree	4c426ebf5a30b51499859f9d41a890534b6a18a6 /src/Text/Pandoc/Readers/HTML.hs
parent	e32a8f5981969bb6d0a11bd945188c35817e4d96 (diff)
download	pandoc-f869f7e08dad315945d52be3fcacf6ff0c05c5c1.tar.gz