src/Text/Pandoc/Readers/LaTeX/Table.hs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
module Text.Pandoc.Readers.LaTeX.Table
  ( tableEnvironments )
where

import Data.Functor (($>))
import Text.Pandoc.Class
import Text.Pandoc.Readers.LaTeX.Parsing
import Text.Pandoc.Readers.LaTeX.Types
import Text.Pandoc.Builder as B
import qualified Data.Map as M
import Data.Text (Text)
import Data.Maybe (fromMaybe)
import qualified Data.Text as T
import Control.Applicative ((<|>), optional, many)
import Control.Monad (when, void)
import Text.Pandoc.Shared (safeRead, trim)
import Text.Pandoc.Logging (LogMessage(SkippedContent))
import Text.Pandoc.Walk (walkM)
import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline,
                            optional, space, spaces, withRaw, (<|>))

tableEnvironments :: PandocMonad m
                  => LP m Blocks
                  -> LP m Inlines
                  -> M.Map Text (LP m Blocks)
tableEnvironments blocks inline =
  M.fromList
  [ ("longtable",  env "longtable" $
          resetCaption *>
            simpTable blocks inline "longtable" False >>= addTableCaption)
  , ("table",  env "table" $
          skipopts *> resetCaption *> blocks >>= addTableCaption)
  , ("tabular*", env "tabular*" $ simpTable blocks inline "tabular*" True)
  , ("tabularx", env "tabularx" $ simpTable blocks inline "tabularx" True)
  , ("tabular", env "tabular"  $ simpTable blocks inline "tabular" False)
  ]

hline :: PandocMonad m => LP m ()
hline = try $ do
  spaces
  controlSeq "hline" <|>
    (controlSeq "cline" <* braced) <|>
    -- booktabs rules:
    controlSeq "toprule" <|>
    controlSeq "bottomrule" <|>
    controlSeq "midrule" <|>
    controlSeq "endhead" <|>
    controlSeq "endfirsthead"
  spaces
  optional rawopt
  return ()

lbreak :: PandocMonad m => LP m Tok
lbreak = (controlSeq "\\" <|> controlSeq "tabularnewline")
         <* skipopts <* spaces

amp :: PandocMonad m => LP m Tok
amp = symbol '&'

-- Split a Word into individual Symbols (for parseAligns)
splitWordTok :: PandocMonad m => LP m ()
splitWordTok = do
  inp <- getInput
  case inp of
       (Tok spos Word t : rest) ->
         setInput $ map (Tok spos Symbol . T.singleton) (T.unpack t) <> rest
       _ -> return ()

parseAligns :: PandocMonad m => LP m [(Alignment, ColWidth, ([Tok], [Tok]))]
parseAligns = try $ do
  let maybeBar = skipMany
        (try $ sp *> (() <$ symbol '|' <|> () <$ (symbol '@' >> braced)))
  let cAlign = AlignCenter <$ symbol 'c'
  let lAlign = AlignLeft <$ symbol 'l'
  let rAlign = AlignRight <$ symbol 'r'
  let parAlign = AlignLeft <$ symbol 'p'
  -- aligns from tabularx
  let xAlign = AlignLeft <$ symbol 'X'
  let mAlign = AlignLeft <$ symbol 'm'
  let bAlign = AlignLeft <$ symbol 'b'
  let alignChar = splitWordTok *> (  cAlign <|> lAlign <|> rAlign <|> parAlign
                                 <|> xAlign <|> mAlign <|> bAlign )
  let alignPrefix = symbol '>' >> braced
  let alignSuffix = symbol '<' >> braced
  let colWidth = try $ do
        symbol '{'
        ds <- trim . untokenize <$> manyTill anyTok (controlSeq "linewidth")
        spaces
        symbol '}'
        return $ safeRead ds
  let alignSpec = do
        pref <- option [] alignPrefix
        spaces
        al <- alignChar
        width <- colWidth <|> option Nothing (do s <- untokenize <$> braced
                                                 pos <- getPosition
                                                 report $ SkippedContent s pos
                                                 return Nothing)
        spaces
        suff <- option [] alignSuffix
        return (al, width, (pref, suff))
  let starAlign = do -- '*{2}{r}' == 'rr', we just expand like a macro
        symbol '*'
        spaces
        ds <- trim . untokenize <$> braced
        spaces
        spec <- braced
        case safeRead ds of
             Just n  ->
               getInput >>= setInput . (mconcat (replicate n spec) ++)
             Nothing -> Prelude.fail $ "Could not parse " <> T.unpack ds <> " as number"
  bgroup
  spaces
  maybeBar
  aligns' <- many $ try $ spaces >> optional starAlign >>
                            (alignSpec <* maybeBar)
  spaces
  egroup
  spaces
  return $ map toSpec aligns'
  where
    toColWidth (Just w) | w > 0 = ColWidth w
    toColWidth _                = ColWidthDefault
    toSpec (x, y, z) = (x, toColWidth y, z)

-- N.B. this parser returns a Row that may have erroneous empty cells
-- in it. See the note above fixTableHead for details.
parseTableRow :: PandocMonad m
              => LP m Blocks -- ^ block parser
              -> LP m Inlines -- ^ inline parser
              -> Text   -- ^ table environment name
              -> [([Tok], [Tok])] -- ^ pref/suffixes
              -> LP m Row
parseTableRow blocks inline envname prefsufs = do
  notFollowedBy (spaces *> end_ envname)
  -- contexts that can contain & that is not colsep:
  let canContainAmp (Tok _ (CtrlSeq "begin") _) = True
      canContainAmp (Tok _ (CtrlSeq "verb") _)  = True
      canContainAmp (Tok _ (CtrlSeq "Verb") _)  = True
      canContainAmp _       = False
  -- add prefixes and suffixes in token stream:
  let celltoks (pref, suff) = do
        prefpos <- getPosition
        contents <- mconcat <$>
            many ( snd <$> withRaw
                     ((lookAhead (controlSeq "parbox") >>
                       void blocks) -- #5711
                      <|>
                      (lookAhead (satisfyTok canContainAmp) >> void inline)
                      <|>
                      (lookAhead (symbol '$') >> void inline))
                  <|>
                   (do notFollowedBy
                         (() <$ amp <|> () <$ lbreak <|> end_ envname)
                       count 1 anyTok) )

        suffpos <- getPosition
        option [] (count 1 amp)
        return $ map (setpos prefpos) pref ++ contents ++ map (setpos suffpos) suff
  rawcells <- mapM celltoks prefsufs
  cells <- mapM (parseFromToks (parseTableCell blocks)) rawcells
  spaces
  return $ Row nullAttr cells

parseTableCell :: PandocMonad m => LP m Blocks -> LP m Cell
parseTableCell blocks = do
  spaces
  updateState $ \st -> st{ sInTableCell = True }
  cell' <-   multicolumnCell blocks
         <|> multirowCell blocks
         <|> parseSimpleCell
         <|> parseEmptyCell
  updateState $ \st -> st{ sInTableCell = False }
  spaces
  return cell'
  where
    -- The parsing of empty cells is important in LaTeX, especially when dealing
    -- with multirow/multicolumn. See #6603.
    parseEmptyCell = spaces $> emptyCell
    parseSimpleCell = simpleCell <$> (plainify <$> blocks)


cellAlignment :: PandocMonad m => LP m Alignment
cellAlignment = skipMany (symbol '|') *> alignment <* skipMany (symbol '|')
  where
    alignment = do
      c <- untoken <$> singleChar
      return $ case c of
        "l" -> AlignLeft
        "r" -> AlignRight
        "c" -> AlignCenter
        "*" -> AlignDefault
        _   -> AlignDefault

plainify :: Blocks -> Blocks
plainify bs = case toList bs of
                [Para ils] -> plain (fromList ils)
                _          -> bs

multirowCell :: PandocMonad m => LP m Blocks -> LP m Cell
multirowCell blocks = controlSeq "multirow" >> do
  -- Full prototype for \multirow macro is:
  --     \multirow[vpos]{nrows}[bigstruts]{width}[vmove]{text}
  -- However, everything except `nrows` and `text` make
  -- sense in the context of the Pandoc AST
  _ <- optional $ symbol '[' *> cellAlignment <* symbol ']'   -- vertical position
  nrows <- fmap (fromMaybe 1 . safeRead . untokenize) braced
  _ <- optional $ symbol '[' *> manyTill anyTok (symbol ']')  -- bigstrut-related
  _ <- symbol '{' *> manyTill anyTok (symbol '}')             -- Cell width
  _ <- optional $ symbol '[' *> manyTill anyTok (symbol ']')  -- Length used for fine-tuning
  content <- symbol '{' *> (plainify <$> blocks) <* symbol '}'
  return $ cell AlignDefault (RowSpan nrows) (ColSpan 1) content

multicolumnCell :: PandocMonad m => LP m Blocks -> LP m Cell
multicolumnCell blocks = controlSeq "multicolumn" >> do
  span' <- fmap (fromMaybe 1 . safeRead . untokenize) braced
  alignment <- symbol '{' *> cellAlignment <* symbol '}'

  let singleCell = do
        content <- plainify <$> blocks
        return $ cell alignment (RowSpan 1) (ColSpan span') content

  -- Two possible contents: either a \multirow cell, or content.
  -- E.g. \multicol{1}{c}{\multirow{2}{1em}{content}}
  -- Note that a \multirow cell can be nested in a \multicolumn,
  -- but not the other way around. See #6603
  let nestedCell = do
        (Cell _ _ (RowSpan rs) _ bs) <- multirowCell blocks
        return $ cell
                  alignment
                  (RowSpan rs)
                  (ColSpan span')
                  (fromList bs)

  symbol '{' *> (nestedCell <|> singleCell) <* symbol '}'

-- LaTeX tables are stored with empty cells underneath multirow cells
-- denoting the grid spaces taken up by them. More specifically, if a
-- cell spans m rows, then it will overwrite all the cells in the
-- columns it spans for (m-1) rows underneath it, requiring padding
-- cells in these places. These padding cells need to be removed for
-- proper table reading. See #6603.
--
-- These fixTable functions do not otherwise fix up malformed
-- input tables: that is left to the table builder.
fixTableHead :: TableHead -> TableHead
fixTableHead (TableHead attr rows) = TableHead attr rows'
  where
    rows' = fixTableRows rows

fixTableBody :: TableBody -> TableBody
fixTableBody (TableBody attr rhc th tb)
  = TableBody attr rhc th' tb'
  where
    th' = fixTableRows th
    tb' = fixTableRows tb

fixTableRows :: [Row] -> [Row]
fixTableRows = fixTableRows' $ repeat Nothing
  where
    fixTableRows' oldHang (Row attr cells : rs)
      = let (newHang, cells') = fixTableRow oldHang cells
            rs'               = fixTableRows' newHang rs
        in Row attr cells' : rs'
    fixTableRows' _ [] = []

-- The overhang is represented as Just (relative cell dimensions) or
-- Nothing for an empty grid space.
fixTableRow :: [Maybe (ColSpan, RowSpan)] -> [Cell] -> ([Maybe (ColSpan, RowSpan)], [Cell])
fixTableRow oldHang cells
  -- If there's overhang, drop cells until their total width meets the
  -- width of the occupied grid spaces (or we run out)
  | (n, prefHang, restHang) <- splitHang oldHang
  , n > 0
  = let cells' = dropToWidth getCellW n cells
        (restHang', cells'') = fixTableRow restHang cells'
    in (prefHang restHang', cells'')
  -- Otherwise record the overhang of a pending cell and fix the rest
  -- of the row
  | c@(Cell _ _ h w _):cells' <- cells
  = let h' = max 1 h
        w' = max 1 w
        oldHang' = dropToWidth getHangW w' oldHang
        (newHang, cells'') = fixTableRow oldHang' cells'
    in (toHang w' h' <> newHang, c : cells'')
  | otherwise
  = (oldHang, [])
  where
    getCellW (Cell _ _ _ w _) = w
    getHangW = maybe 1 fst
    getCS (ColSpan n) = n

    toHang c r
      | r > 1     = [Just (c, r)]
      | otherwise = replicate (getCS c) Nothing

    -- Take the prefix of the overhang list representing filled grid
    -- spaces. Also return the remainder and the length of this prefix.
    splitHang = splitHang' 0 id

    splitHang' !n l (Just (c, r):xs)
      = splitHang' (n + c) (l . (toHang c (r-1) ++)) xs
    splitHang' n l xs = (n, l, xs)

    -- Drop list items until the total width of the dropped items
    -- exceeds the passed width.
    dropToWidth _     n l | n < 1 = l
    dropToWidth wproj n (c:cs)    = dropToWidth wproj (n - wproj c) cs
    dropToWidth _     _ []        = []

simpTable :: PandocMonad m
          => LP m Blocks
          -> LP m Inlines
          -> Text
          -> Bool
          -> LP m Blocks
simpTable blocks inline envname hasWidthParameter = try $ do
  when hasWidthParameter $ () <$ tokWith inline
  skipopts
  colspecs <- parseAligns
  let (aligns, widths, prefsufs) = unzip3 colspecs
  optional $ controlSeq "caption" *> setCaption inline
  spaces
  optional label
  spaces
  optional lbreak
  spaces
  skipMany hline
  spaces
  header' <- option [] . try . fmap (:[]) $
             parseTableRow blocks inline envname prefsufs <*
               lbreak <* many1 hline
  spaces
  rows <- sepEndBy (parseTableRow blocks inline envname prefsufs)
                    (lbreak <* optional (skipMany hline))
  spaces
  optional $ controlSeq "caption" *> setCaption inline
  spaces
  optional label
  spaces
  optional lbreak
  spaces
  lookAhead $ controlSeq "end" -- make sure we're at end
  let th  = fixTableHead $ TableHead nullAttr header'
  let tbs = [fixTableBody $ TableBody nullAttr 0 [] rows]
  let tf  = TableFoot nullAttr []
  return $ table emptyCaption (zip aligns widths) th tbs tf

addTableCaption :: PandocMonad m => Blocks -> LP m Blocks
addTableCaption = walkM go
  where go (Table attr c spec th tb tf) = do
          st <- getState
          let mblabel = sLastLabel st
          capt <- case (sCaption st, mblabel) of
                   (Just ils, Nothing)  -> return $ caption Nothing (plain ils)
                   (Just ils, Just lab) -> do
                     num <- getNextNumber sLastTableNum
                     setState
                       st{ sLastTableNum = num
                         , sLabels = M.insert lab
                                    [Str (renderDottedNum num)]
                                    (sLabels st) }
                     return $ caption Nothing (plain ils) -- add number??
                   (Nothing, _)  -> return c
          let attr' = case (attr, mblabel) of
                        ((_,classes,kvs), Just ident) ->
                           (ident,classes,kvs)
                        _ -> attr
          return $ addAttrDiv attr'
                 $ maybe id removeLabel mblabel
                 $ Table nullAttr capt spec th tb tf
        go x = return x

-- TODO: For now we add a Div to contain table attributes, since
-- most writers don't do anything yet with attributes on Table.
-- This can be removed when that changes.
addAttrDiv :: Attr -> Block -> Block
addAttrDiv ("",[],[]) b = b
addAttrDiv attr b       = Div attr [b]