aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML/Table.hs
blob: b23a2abc846131e316e6a2ac258b032b7754db06 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
{-# LANGUAGE LambdaCase        #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ViewPatterns      #-}
{- |
   Module      : Text.Pandoc.Readers.HTML.Table
   Copyright   : © 2006-2021 John MacFarlane,
                   2020-2021 Albert Krewinkel
   License     : GNU GPL, version 2 or above

   Maintainer  : Albert Krewinkel <albert@zeitkraut.de>
   Stability   : alpha
   Portability : portable

HTML table parser.
-}
module Text.Pandoc.Readers.HTML.Table (pTable) where

import Control.Applicative ((<|>))
import Data.Maybe (fromMaybe, isJust)
import Data.Either (lefts, rights)
import Data.List.NonEmpty (nonEmpty)
import Data.Text (Text)
import Text.HTML.TagSoup
import Text.Pandoc.Builder (Blocks)
import Text.Pandoc.CSS (cssAttributes)
import Text.Pandoc.Definition
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
import Text.Pandoc.Parsing
  ( eof, lookAhead, many, many1, manyTill, option, optional
  , optionMaybe, skipMany, try )
import Text.Pandoc.Readers.HTML.Parsing
import Text.Pandoc.Readers.HTML.Types (TagParser)
import Text.Pandoc.Shared (onlySimpleTableCells, safeRead)
import qualified Data.Text as T
import qualified Text.Pandoc.Builder as B
import Control.Monad (guard)

-- | Parses a @<col>@ element, returning the column's width.
-- An Either value is used:  Left i means a "relative length" with
-- integral value i (see https://www.w3.org/TR/html4/types.html#h-6.6);
-- Right w means a regular width.  Defaults to @'Right ColWidthDefault'@
-- if the width is not set or cannot be determined.
pCol :: PandocMonad m => TagParser m (Either Int ColWidth)
pCol = try $ do
  TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
  let attribs = toStringAttr attribs'
  skipMany pBlank
  optional $ pSatisfy (matchTagClose "col")
  skipMany pBlank
  return $ case lookup "width" attribs of
                Nothing -> case lookup "style" attribs of
                  Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
                    maybe (Right ColWidthDefault) (Right . ColWidth . (/ 100.0))
                      $ safeRead (T.filter
                                   (`notElem` (" \t\r\n%'\";" :: [Char])) xs)
                  _ -> Right ColWidthDefault
                Just (T.unsnoc -> Just (xs, '*')) ->
                  maybe (Left 1) Left $ safeRead xs
                Just (T.unsnoc -> Just (xs, '%')) ->
                  maybe (Right ColWidthDefault)
                        (Right . ColWidth . (/ 100.0)) $ safeRead xs
                _ -> Right ColWidthDefault

pColgroup :: PandocMonad m => TagParser m [Either Int ColWidth]
pColgroup = try $ do
  pSatisfy (matchTagOpen "colgroup" [])
  skipMany pBlank
  manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank

resolveRelativeLengths :: [Either Int ColWidth] -> [ColWidth]
resolveRelativeLengths ws =
  let remaining = 1 - sum (map getColWidth $ rights ws)
      relatives = sum $ lefts ws
      relUnit = remaining / fromIntegral relatives
      toColWidth (Right x) = x
      toColWidth (Left i) = ColWidth (fromIntegral i * relUnit)
  in  map toColWidth ws

getColWidth :: ColWidth -> Double
getColWidth ColWidthDefault = 0
getColWidth (ColWidth w) = w

data CellType
  = HeaderCell
  | BodyCell
  deriving Eq

pCell :: PandocMonad m
      => TagParser m Blocks
      -> CellType
      -> TagParser m (CellType, Cell)
pCell block celltype = try $ do
  let celltype' = case celltype of
        HeaderCell -> "th"
        BodyCell   -> "td"
  skipMany pBlank
  TagOpen _ attribs <- lookAhead $ pSatisfy (matchTagOpen celltype' [])
  let cssAttribs = maybe [] cssAttributes $ lookup "style" attribs
  let align = case lookup "align" attribs <|>
                   lookup "text-align" cssAttribs of
                Just "left"   -> AlignLeft
                Just "right"  -> AlignRight
                Just "center" -> AlignCenter
                _             -> AlignDefault
  let rowspan = RowSpan . fromMaybe 1 $
                safeRead =<< lookup "rowspan" attribs
  let colspan = ColSpan . fromMaybe 1 $
                safeRead =<< lookup "colspan" attribs
  res <- pInTags celltype' block
  skipMany pBlank
  let handledAttribs = ["align", "colspan", "rowspan", "text-align"]
      attribs' = foldr go [] attribs
      go kv@(k, _) acc = case k of
        "style" -> case filter ((/= "text-align") . fst) cssAttribs of
                     [] -> acc
                     cs -> ("style", toStyleString cs) : acc
        -- drop attrib if it's already handled
        _ | k `elem` handledAttribs -> acc
        _ -> kv : acc
  return (celltype, B.cellWith (toAttr attribs') align rowspan colspan res)

-- | Create a style attribute string from a list of CSS attributes
toStyleString :: [(Text, Text)] -> Text
toStyleString = T.intercalate "; " . map (\(k, v) -> k <> ": " <> v)

-- | Parses a normal table row; returns the row together with the number
-- of header cells at the beginning of the row.
pRow :: PandocMonad m
     => TagParser m Blocks
     -> TagParser m (RowHeadColumns, B.Row)
pRow block = try $ do
  skipMany pBlank
  TagOpen _ attribs <- pSatisfy (matchTagOpen "tr" []) <* skipMany pBlank
  cells <- many (pCell block BodyCell <|> pCell block HeaderCell)
  TagClose _ <- pSatisfy (matchTagClose "tr")
  return ( RowHeadColumns $ length (takeWhile ((== HeaderCell) . fst) cells)
         , Row (toAttr attribs) $ map snd cells
         )

-- | Parses a header row, i.e., a row which containing nothing but
-- @<th>@ elements.
pHeaderRow :: PandocMonad m
           => TagParser m Blocks
           -> TagParser m B.Row
pHeaderRow block = try $ do
  skipMany pBlank
  let pThs = map snd <$> many (pCell block HeaderCell)
  let mkRow (attribs, cells) = Row (toAttr attribs) cells
  mkRow <$> pInTagWithAttribs TagsRequired "tr" pThs

-- | Parses a table head. If there is no @thead@ element, this looks for
-- a row of @<th>@-only elements as the first line of the table.
pTableHead :: PandocMonad m
           => TagParser m Blocks
           -> TagParser m TableHead
pTableHead block = try $ do
  skipMany pBlank
  let pRows = many (pRow block)
  let pThead = pInTagWithAttribs ClosingTagOptional "thead" pRows
  optionMaybe pThead >>= \case
    Just (attribs, rows) ->
      return $ TableHead (toAttr attribs) $ map snd rows
    Nothing -> mkTableHead <$> optionMaybe (pHeaderRow block)
               where
                 mkTableHead = TableHead nullAttr . \case
                   -- Use row as header only if it's non-empty
                   Just row@(Row _ (_:_)) -> [row]
                   _                      -> []

-- | Parses a table foot
pTableFoot :: PandocMonad m
           => TagParser m Blocks
           -> TagParser m TableFoot
pTableFoot block = try $ do
  skipMany pBlank
  TagOpen _ attribs <- pSatisfy (matchTagOpen "tfoot" []) <* skipMany pBlank
  rows <- many (fmap snd $ pRow block <* skipMany pBlank)
  optional $ pSatisfy (matchTagClose "tfoot")
  return $ TableFoot (toAttr attribs) rows

-- | Parses a table body
pTableBody :: PandocMonad m
           => TagParser m Blocks
           -> TagParser m TableBody
pTableBody block = try $ do
  skipMany pBlank
  mbattribs <- option Nothing $ Just . getAttribs <$>
                 pSatisfy (matchTagOpen "tbody" []) <* skipMany pBlank
  bodyheads <- many (pHeaderRow block)
  (rowheads, rows) <- unzip <$> many (pRow block <* skipMany pBlank)
  optional $ pSatisfy (matchTagClose "tbody")
  guard $ isJust mbattribs || not (null bodyheads && null rows)
  let attribs = fromMaybe [] mbattribs
  return $ TableBody (toAttr attribs) (foldr max 0 rowheads) bodyheads rows
  where
    getAttribs (TagOpen _ attribs) = attribs
    getAttribs _ = []

-- | Parses a simple HTML table
pTable :: PandocMonad m
       => TagParser m Blocks -- ^ Caption and cell contents parser
       -> TagParser m Blocks
pTable block = try $ do
  TagOpen _ attribs <- pSatisfy (matchTagOpen "table" [])  <* skipMany pBlank
  caption <- option mempty $ pInTags "caption" block       <* skipMany pBlank
  widths <- resolveRelativeLengths <$>
              ((mconcat <$> many1 pColgroup) <|> many pCol) <* skipMany pBlank
  thead   <- pTableHead block               <* skipMany pBlank
  topfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank
  tbodies <- many (pTableBody block)        <* skipMany pBlank
  botfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank
  TagClose _ <- pSatisfy (matchTagClose "table")
  let tfoot = fromMaybe (TableFoot nullAttr []) $ topfoot <|> botfoot
  case normalize widths thead tbodies tfoot of
    Left err -> fail err
    Right (colspecs, thead', tbodies', tfoot') -> return $
      B.tableWith (toAttr attribs)
                  (B.simpleCaption caption)
                  colspecs
                  thead'
                  tbodies'
                  tfoot'
data TableType
  = SimpleTable
  | NormalTable

tableType :: [[Cell]] -> TableType
tableType cells =
  if onlySimpleTableCells $ map (map cellContents) cells
  then SimpleTable
  else NormalTable
  where
    cellContents :: Cell -> [Block]
    cellContents (Cell _ _ _ _ bs) = bs

normalize :: [ColWidth] -> TableHead -> [TableBody] -> TableFoot
          -> Either String ([ColSpec], TableHead, [TableBody], TableFoot)
normalize widths head' bodies foot = do
  let rows = headRows head' <> concatMap bodyRows bodies <> footRows foot
  let cellWidth (Cell _ _ _ (ColSpan cs) _) = cs
  let rowLength = foldr (\cell acc -> cellWidth cell + acc) 0 . rowCells
  let ncols = maybe 0 maximum $ nonEmpty $ map rowLength rows
  let tblType = tableType (map rowCells rows)
  -- fail on empty table
  if null rows
    then Left "empty table"
    else Right
         ( zip (calculateAlignments ncols bodies)
               (normalizeColWidths ncols tblType widths)
         , head'
         , bodies
         , foot
         )

normalizeColWidths :: Int -> TableType -> [ColWidth] -> [ColWidth]
normalizeColWidths ncols tblType = \case
  [] -> case tblType of
          SimpleTable -> replicate ncols ColWidthDefault
          NormalTable -> replicate ncols (ColWidth $ 1 / fromIntegral ncols)
  widths -> widths

calculateAlignments :: Int -> [TableBody] -> [Alignment]
calculateAlignments cols tbodies =
  case cells of
    cs:_ -> take cols $ concatMap cellAligns cs ++ repeat AlignDefault
    _    -> replicate cols AlignDefault
  where
    cells :: [[Cell]]
    cells = concatMap bodyRowCells tbodies
    cellAligns :: Cell -> [Alignment]
    cellAligns (Cell _ align _ (ColSpan cs) _) = replicate cs align

bodyRowCells :: TableBody -> [[Cell]]
bodyRowCells = map rowCells . bodyRows

headRows :: TableHead -> [B.Row]
headRows (TableHead _ rows) = rows

bodyRows :: TableBody -> [B.Row]
bodyRows (TableBody _ _ headerRows bodyRows') = headerRows <> bodyRows'

footRows :: TableFoot -> [B.Row]
footRows (TableFoot _ rows) = rows

rowCells :: B.Row -> [Cell]
rowCells (Row _ cells) = cells