4 files changed, 244 insertions, 118 deletions
diff --git a/src/Text/Pandoc/CSS.hs b/src/Text/Pandoc/CSS.hs
index 80251850b..d98c85147 100644
--- a/src/Text/Pandoc/CSS.hs
+++ b/src/Text/Pandoc/CSS.hs
@@ -11,41 +11,46 @@ Portability : portable
 
 Tools for working with CSS.
 -}
-module Text.Pandoc.CSS ( pickStyleAttrProps
-                       , pickStylesToKVs
-                       )
+module Text.Pandoc.CSS
+  ( cssAttributes
+  , pickStyleAttrProps
+  , pickStylesToKVs
+  )
 where
 
-import qualified Data.Text as T
 import Data.Maybe (mapMaybe, listToMaybe)
+import Data.Text (Text, pack)
 import Text.Pandoc.Shared (trim)
 import Text.Parsec
 import Text.Parsec.Text
 
-ruleParser :: Parser (T.Text, T.Text)
+ruleParser :: Parser (Text, Text)
 ruleParser = do
     p <- many1 (noneOf ":")  <* char ':'
     v <- many1 (noneOf ":;") <* optional (char ';') <* spaces
-    return (trim $ T.pack p, trim $ T.pack v)
+    return (trim $ pack p, trim $ pack v)
 
-styleAttrParser :: Parser [(T.Text, T.Text)]
+styleAttrParser :: Parser [(Text, Text)]
 styleAttrParser = many1 ruleParser
 
-eitherToMaybe :: Either a b -> Maybe b
-eitherToMaybe (Right x) = Just x
-eitherToMaybe _         = Nothing
+-- | Parses a style string, returning the CSS attributes.
+-- Returns an empty list on failure.
+cssAttributes :: Text -> [(Text, Text)]
+cssAttributes styleString =
+  -- Use Data.Either.fromRight once GHC 8.0 is no longer supported
+  case parse styleAttrParser "" styleString of
+    Left _  -> []
+    Right x -> x
 
 -- | takes a list of keys/properties and a CSS string and
 -- returns the corresponding key-value-pairs.
-pickStylesToKVs :: [T.Text] -> T.Text -> [(T.Text, T.Text)]
+pickStylesToKVs :: [Text] -> Text -> [(Text, Text)]
 pickStylesToKVs props styleAttr =
-  case parse styleAttrParser "" styleAttr of
-    Left _       -> []
-    Right styles -> filter (\s -> fst s `elem` props) styles
+  filter (\s -> fst s `elem` props) $ cssAttributes styleAttr
 
 -- | takes a list of key/property synonyms and a CSS string and maybe
 -- returns the value of the first match (in order of the supplied list)
-pickStyleAttrProps :: [T.Text] -> T.Text -> Maybe T.Text
+pickStyleAttrProps :: [Text] -> Text -> Maybe Text
 pickStyleAttrProps lookupProps styleAttr = do
-    styles <- eitherToMaybe $ parse styleAttrParser "" styleAttr
+    styles <- either (const Nothing) Just $ parse styleAttrParser "" styleAttr
     listToMaybe $ mapMaybe (`lookup` styles) lookupProps
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index fa996d2f0..eb78979a3 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -26,7 +26,7 @@ module Text.Pandoc.Readers.HTML ( readHtml
 
 import Control.Applicative ((<|>))
 import Control.Arrow (first)
-import Control.Monad (guard, mplus, msum, mzero, unless, void)
+import Control.Monad (guard, msum, mzero, unless, void)
 import Control.Monad.Except (throwError)
 import Control.Monad.Reader (ask, asks, lift, local, runReaderT)
 import Data.ByteString.Base64 (encode)
@@ -50,7 +50,7 @@ import Text.Pandoc.CSS (pickStyleAttrProps)
 import qualified Text.Pandoc.UTF8 as UTF8
 import Text.Pandoc.Definition
 import Text.Pandoc.Readers.HTML.Parsing
-import Text.Pandoc.Readers.HTML.Table (pTable')
+import Text.Pandoc.Readers.HTML.Table (pTable)
 import Text.Pandoc.Readers.HTML.TagCategories
 import Text.Pandoc.Readers.HTML.Types
 import Text.Pandoc.Readers.LaTeX (rawLaTeXInline)
@@ -63,8 +63,7 @@ import Text.Pandoc.Options (
     extensionEnabled)
 import Text.Pandoc.Parsing hiding ((<|>))
 import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI,
-                           extractSpaces, htmlSpanLikeElements, elemText, splitTextBy,
-                           safeRead, tshow)
+                           extractSpaces, htmlSpanLikeElements, safeRead, tshow)
 import Text.Pandoc.Walk
 import Text.Parsec.Error
 import Text.TeXMath (readMathML, writeTeX)
@@ -159,7 +158,7 @@ block = do
             , pCodeBlock
             , pList
             , pHrule
-            , pTable
+            , pTable block
             , pHtml
             , pHead
             , pBody
@@ -464,31 +463,6 @@ pHrule = do
   pSelfClosing (=="hr") (const True)
   return B.horizontalRule
 
-pTable :: PandocMonad m => TagParser m Blocks
-pTable = pTable' block pCell
-
-pCell :: PandocMonad m => Text -> TagParser m [Cell]
-pCell celltype = try $ do
-  skipMany pBlank
-  tag <- lookAhead $ pSatisfy (\t -> t ~== TagOpen celltype [])
-  let extractAlign' []                 = ""
-      extractAlign' ("text-align":x:_) = x
-      extractAlign' (_:xs)             = extractAlign' xs
-  let extractAlign = extractAlign' . splitTextBy (`elemText` " \t;:")
-  let align = case maybeFromAttrib "align" tag `mplus`
-                   (extractAlign <$> maybeFromAttrib "style" tag) of
-                   Just "left"   -> AlignLeft
-                   Just "right"  -> AlignRight
-                   Just "center" -> AlignCenter
-                   _             -> AlignDefault
-  let rowspan = RowSpan . fromMaybe 1 $
-                safeRead =<< maybeFromAttrib "rowspan" tag
-  let colspan = ColSpan . fromMaybe 1 $
-                safeRead =<< maybeFromAttrib "colspan" tag
-  res <- pInTags celltype block
-  skipMany pBlank
-  return [B.cell align rowspan colspan res]
-
 pBlockQuote :: PandocMonad m => TagParser m Blocks
 pBlockQuote = do
   contents <- pInTags "blockquote" block
@@ -653,12 +627,6 @@ pLineBreak = do
   pSelfClosing (=="br") (const True)
   return B.linebreak
 
--- Unlike fromAttrib from tagsoup, this distinguishes
--- between a missing attribute and an attribute with empty content.
-maybeFromAttrib :: Text -> Tag Text -> Maybe Text
-maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs
-maybeFromAttrib _ _ = Nothing
-
 pLink :: PandocMonad m => TagParser m Inlines
 pLink = try $ do
   tag <- pSatisfy $ tagOpenLit "a" (const True)
diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs
index 8788a933e..2d58319da 100644
--- a/src/Text/Pandoc/Readers/HTML/Parsing.hs
+++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE LambdaCase            #-}
 {-# LANGUAGE OverloadedStrings     #-}
 {- |
    Module      : Text.Pandoc.Readers.HTML.Parsing
@@ -15,6 +16,7 @@ module Text.Pandoc.Readers.HTML.Parsing
   , pInTags
   , pInTags'
   , pInTag
+  , pInTagWithAttribs
   , pAny
   , pCloses
   , pSatisfy
@@ -22,6 +24,7 @@ module Text.Pandoc.Readers.HTML.Parsing
   , matchTagClose
   , matchTagOpen
   , isSpace
+  , maybeFromAttrib
   , toAttr
   , toStringAttr
   )
@@ -31,11 +34,11 @@ import Control.Monad (guard, void, mzero)
 import Data.Maybe (fromMaybe)
 import Data.Text (Text)
 import Text.HTML.TagSoup
-  ( Tag (..), (~==), isTagText, isTagPosition, isTagOpen, isTagClose )
+  ( Attribute, Tag (..), isTagText, isTagPosition, isTagOpen, isTagClose, (~==) )
 import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
 import Text.Pandoc.Definition (Attr)
 import Text.Pandoc.Parsing
-  ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional
+  ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, option, optional
   , skipMany, setPosition, token, try)
 import Text.Pandoc.Readers.HTML.TagCategories
 import Text.Pandoc.Readers.HTML.Types
@@ -60,25 +63,41 @@ pInTags' :: (PandocMonad m, Monoid a)
          -> TagParser m a
          -> TagParser m a
 pInTags' tagtype tagtest parser = try $ do
-  pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t)
+  pSatisfy $ \t -> matchTagOpen tagtype [] t && tagtest t
   mconcat <$> manyTill parser (pCloses tagtype <|> eof)
 
--- parses p, preceded by an opening tag (optional if tagsOptional)
--- and followed by a closing tag (optional if tagsOptional)
-pInTag :: PandocMonad m => TagOmission -> Text -> TagParser m a -> TagParser m a
-pInTag tagOmission tagtype p = try $ do
-  skipMany pBlank
+pInTag :: PandocMonad m
+       => TagOmission    -- ^ Whether some tags can be omitted
+       -> Text           -- ^ @tagtype@ Tag name
+       -> TagParser m a  -- ^ @p@ Content parser
+       -> TagParser m a
+pInTag tagOmission tagtype = fmap snd . pInTagWithAttribs tagOmission tagtype
+
+-- | Returns the contents of a tag together with its attributes; parses
+-- @p@, preceded by an opening tag (optional if TagsOmittable) and
+-- followed by a closing tag (optional unless TagsRequired).
+pInTagWithAttribs :: PandocMonad m
+                  => TagOmission    -- ^ Whether some tags can be omitted
+                  -> Text           -- ^ @tagtype@ Tag name
+                  -> TagParser m a  -- ^ @p@ Content parser
+                  -> TagParser m ([Attribute Text], a)
+pInTagWithAttribs tagOmission tagtype p = try $ do
   let openingOptional = tagOmission == TagsOmittable
   let closingOptional = tagOmission /= TagsRequired
-  (if openingOptional then optional else void) $
-    pSatisfy (matchTagOpen tagtype [])
+  skipMany pBlank
+  attribs <- (if openingOptional then option [] else id)
+             (getAttribs <$> pSatisfy (matchTagOpen tagtype []))
   skipMany pBlank
   x <- p
   skipMany pBlank
   (if closingOptional then optional else void) $
     pSatisfy (matchTagClose tagtype)
   skipMany pBlank
-  return x
+  return (attribs, x)
+  where
+    getAttribs = \case
+      TagOpen _ attribs -> attribs
+      _                 -> []
 
 pCloses :: PandocMonad m => Text -> TagParser m ()
 pCloses tagtype = try $ do
@@ -183,6 +202,12 @@ toStringAttr = map go
          -> (x',y)
        _ -> (x,y)
 
+-- Unlike fromAttrib from tagsoup, this distinguishes
+-- between a missing attribute and an attribute with empty content.
+maybeFromAttrib :: Text -> Tag Text -> Maybe Text
+maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs
+maybeFromAttrib _ _ = Nothing
+
 mkAttr :: [(Text, Text)] -> Attr
 mkAttr attr = (attribsId, attribsClasses, attribsKV)
   where attribsId = fromMaybe "" $ lookup "id" attr
diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs
index e6d0a9097..e40d90221 100644
--- a/src/Text/Pandoc/Readers/HTML/Table.hs
+++ b/src/Text/Pandoc/Readers/HTML/Table.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE ViewPatterns #-}
 {- |
@@ -12,17 +13,19 @@
 
 HTML table parser.
 -}
-module Text.Pandoc.Readers.HTML.Table (pTable') where
+module Text.Pandoc.Readers.HTML.Table (pTable) where
 
-import Control.Monad (guard)
+import Control.Applicative ((<|>))
 import Data.Maybe (fromMaybe)
 import Data.Text (Text)
 import Text.HTML.TagSoup
 import Text.Pandoc.Builder (Blocks)
+import Text.Pandoc.CSS (cssAttributes)
 import Text.Pandoc.Definition
 import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
 import Text.Pandoc.Parsing
-  ( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try)
+  ( eof, lookAhead, many, many1, manyTill, option, optional
+  , optionMaybe, skipMany, try)
 import Text.Pandoc.Readers.HTML.Parsing
 import Text.Pandoc.Readers.HTML.Types (TagParser)
 import Text.Pandoc.Shared (onlySimpleTableCells, safeRead)
@@ -57,58 +60,183 @@ pColgroup = try $ do
   skipMany pBlank
   manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
 
--- | Parses a simple HTML table
-pTable' :: PandocMonad m
-        => TagParser m Blocks           -- ^ Caption parser
-        -> (Text -> TagParser m [Cell]) -- ^ Table cell parser
-        -> TagParser m Blocks
-pTable' block pCell = try $ do
-  TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" [])
-  let attribs = toAttr attribs'
+pCell :: PandocMonad m
+      => TagParser m Blocks
+      -> Text
+      -> TagParser m [Cell]
+pCell block celltype = try $ do
+  skipMany pBlank
+  TagOpen _ attribs <- lookAhead $ pSatisfy (matchTagOpen celltype [])
+  let cssAttribs = maybe [] cssAttributes $ lookup "style" attribs
+  let align = case lookup "align" attribs <|>
+                   lookup "text-align" cssAttribs of
+                Just "left"   -> AlignLeft
+                Just "right"  -> AlignRight
+                Just "center" -> AlignCenter
+                _             -> AlignDefault
+  let rowspan = RowSpan . fromMaybe 1 $
+                safeRead =<< lookup "rowspan" attribs
+  let colspan = ColSpan . fromMaybe 1 $
+                safeRead =<< lookup "colspan" attribs
+  res <- pInTags celltype block
+  skipMany pBlank
+  let handledAttribs = ["align", "colspan", "rowspan", "text-align"]
+      attribs' = foldr go [] attribs
+      go kv@(k, _) acc = case k of
+        "style" -> case filter ((/= "text-align") . fst) cssAttribs of
+                     [] -> acc
+                     cs -> ("style", toStyleString cs) : acc
+        -- drop attrib if it's already handled
+        _ | k `elem` handledAttribs -> acc
+        _ -> kv : acc
+  return [B.cellWith (toAttr attribs') align rowspan colspan res]
+
+-- | Create a style attribute string from a list of CSS attributes
+toStyleString :: [(Text, Text)] -> Text
+toStyleString = T.intercalate "; " . map (\(k, v) -> k <> ": " <> v)
+
+data RowType
+  = HeaderCells
+  | AllCells
+
+-- | Parses a table row
+pRow :: PandocMonad m
+     => TagParser m Blocks
+     -> RowType
+     -> TagParser m [B.Row]
+pRow block rowType = try $ do
+  skipMany pBlank
+  case rowType of
+    HeaderCells -> do
+      maybeCells <- optionMaybe (pInTags "tr" (pCell block "th"))
+      return $ case maybeCells of
+        Nothing    -> []
+        Just cells -> [Row nullAttr cells]
+    AllCells    -> do
+      cells <- pInTags "tr" (pCell block "td" <|> pCell block "th")
+      return [Row nullAttr cells]
+
+-- | Parses a table head
+pTableHead :: PandocMonad m
+           => TagParser m Blocks
+           -> TagParser m TableHead
+pTableHead block = try $ do
+  skipMany pBlank
+  (attribs, rows) <-  pInTagWithAttribs ClosingTagOptional "thead"
+                                        (option [] $ pRow block AllCells)
+                  <|> pInTagWithAttribs TagsOmittable "thead"
+                                        (pRow block HeaderCells)
+  let cells = concatMap (\(Row _ cs) -> cs) rows
+  if null cells
+    then TableHead nullAttr <$>
+         pInTag TagsOmittable "tbody" (pRow block HeaderCells)
+    else return $ TableHead (toAttr attribs) [Row nullAttr cells]
+
+-- | Parses a table foot
+pTableFoot :: PandocMonad m
+           => TagParser m Blocks
+           -> TagParser m TableFoot
+pTableFoot block = try $ do
+  skipMany pBlank
+  TagOpen _ attribs <- pSatisfy (matchTagOpen "tfoot" []) <* skipMany pBlank
+  rows <- mconcat <$> many (pRow block AllCells <* skipMany pBlank)
+  optional $ pSatisfy (matchTagClose "tfoot")
+  return $ TableFoot (toAttr attribs) rows
+
+-- | Parses a table body
+pTableBody :: PandocMonad m
+           => TagParser m Blocks
+           -> TagParser m TableBody
+pTableBody block = do
   skipMany pBlank
-  caption <- option mempty $ pInTags "caption" block <* skipMany pBlank
-  widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
-  let pTh = option [] $ pInTags "tr" (pCell "th")
-      pTr = try $ skipMany pBlank
-               *> pInTags "tr" (pCell "td" <|> pCell "th")
-      pTBody = pInTag TagsOmittable "tbody" $ many1 pTr
-  head'' <- pInTag ClosingTagOptional "thead" (option [] pTr)
-        <|> pInTag TagsOmittable "thead" pTh
-  head'  <- pInTag TagsOmittable "tbody"
-               (if null head'' then pTh else return head'')
-  topfoot <- option [] $ pInTag TagsRequired "tfoot" $ many pTr
-  rowsLs <- many pTBody
-  bottomfoot <- option [] $ pInTag ClosingTagOptional "tfoot" $ many pTr
+  (attribs, rows) <- pInTagWithAttribs TagsOmittable "tbody"
+                     (mconcat <$> many1 (pRow block AllCells))
+  return $ TableBody (toAttr attribs) 0 [] rows
+
+
+-- | Parses a simple HTML table
+pTable :: PandocMonad m
+       => TagParser m Blocks -- ^ Caption and cell contents parser
+       -> TagParser m Blocks
+pTable block = try $ do
+  TagOpen _ attribs <- pSatisfy (matchTagOpen "table" [])  <* skipMany pBlank
+  caption <- option mempty $ pInTags "caption" block       <* skipMany pBlank
+  widths  <- ((mconcat <$> many1 pColgroup) <|> many pCol) <* skipMany pBlank
+  thead   <- pTableHead block               <* skipMany pBlank
+  topfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank
+  tbodies <- many (pTableBody block)        <* skipMany pBlank
+  botfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank
   TagClose _ <- pSatisfy (matchTagClose "table")
-  let rows = concat rowsLs <> topfoot <> bottomfoot
-      rows''' = map (map cellContents) rows
+  let tfoot = fromMaybe (TableFoot nullAttr []) $ topfoot <|> botfoot
+  case normalize widths thead tbodies tfoot of
+    Left err -> fail err
+    Right (colspecs, thead', tbodies', tfoot') -> return $
+      B.tableWith (toAttr attribs)
+                  (B.simpleCaption caption)
+                  colspecs
+                  thead'
+                  tbodies'
+                  tfoot'
+data TableType
+  = SimpleTable
+  | NormalTable
+
+tableType :: [[Cell]] -> TableType
+tableType cells =
+  if onlySimpleTableCells $ map (map cellContents) cells
+  then SimpleTable
+  else NormalTable
+  where
+    cellContents :: Cell -> [Block]
+    cellContents (Cell _ _ _ _ bs) = bs
+
+normalize :: [ColWidth] -> TableHead -> [TableBody] -> TableFoot
+          -> Either String ([ColSpec], TableHead, [TableBody], TableFoot)
+normalize widths head' bodies foot = do
+  let rows = headRows head' <> concatMap bodyRows bodies <> footRows foot
+  let rowLength = length . rowCells
+  let ncols = maximum (map rowLength rows)
+  let tblType = tableType (map rowCells rows)
   -- fail on empty table
-  guard $ not $ null head' && null rows'''
-  let isSimple = onlySimpleTableCells $
-                 map cellContents head' : rows'''
-  let cols = if null head'
-                then maximum (map length rows''')
-                else length head'
-  let aligns = case rows of
-                 (cs:_) -> take cols $
-                           concatMap cellAligns cs ++ repeat AlignDefault
-                 _      -> replicate cols AlignDefault
-  let widths = if null widths'
-                  then if isSimple
-                       then replicate cols ColWidthDefault
-                       else replicate cols (ColWidth (1.0 / fromIntegral cols))
-                  else widths'
-  let toRow = Row nullAttr
-      toHeaderRow l = [toRow l | not (null l)]
-  return $ B.tableWith attribs
-                   (B.simpleCaption caption)
-                   (zip aligns widths)
-                   (TableHead nullAttr $ toHeaderRow head')
-                   [TableBody nullAttr 0 [] $ map toRow rows]
-                   (TableFoot nullAttr [])
-
-cellContents :: Cell -> [Block]
-cellContents (Cell _ _ _ _ bs) = bs
-
-cellAligns :: Cell -> [Alignment]
-cellAligns (Cell _ align _ (ColSpan cs) _) = replicate cs align
+  if null rows
+    then Left "empty table"
+    else Right
+         ( zip (calculateAlignments ncols bodies)
+               (normalizeColWidths ncols tblType widths)
+         , head'
+         , bodies
+         , foot
+         )
+
+normalizeColWidths :: Int -> TableType -> [ColWidth] -> [ColWidth]
+normalizeColWidths ncols tblType = \case
+  [] -> case tblType of
+          SimpleTable -> replicate ncols ColWidthDefault
+          NormalTable -> replicate ncols (ColWidth $ 1 / fromIntegral ncols)
+  widths -> widths
+
+calculateAlignments :: Int -> [TableBody] -> [Alignment]
+calculateAlignments cols tbodies =
+  case cells of
+    cs:_ -> take cols $ concatMap cellAligns cs ++ repeat AlignDefault
+    _    -> replicate cols AlignDefault
+  where
+    cells :: [[Cell]]
+    cells = concatMap bodyRowCells tbodies
+    cellAligns :: Cell -> [Alignment]
+    cellAligns (Cell _ align _ (ColSpan cs) _) = replicate cs align
+
+bodyRowCells :: TableBody -> [[Cell]]
+bodyRowCells = map rowCells . bodyRows
+
+headRows :: TableHead -> [B.Row]
+headRows (TableHead _ rows) = rows
+
+bodyRows :: TableBody -> [B.Row]
+bodyRows (TableBody _ _ headerRows bodyRows') = headerRows <> bodyRows'
+
+footRows :: TableFoot -> [B.Row]
+footRows (TableFoot _ rows) = rows
+
+rowCells :: B.Row -> [Cell]
+rowCells (Row _ cells) = cells