diff options
author | John MacFarlane <jgm@berkeley.edu> | 2021-02-27 21:40:56 -0800 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2021-02-27 21:40:56 -0800 |
commit | 08231f5cdd16e31d38d9d6bf59bc5ca12638b438 (patch) | |
tree | 5b67f617d33a5aaad2398eac2d944659a7a0edee /src/Text/Pandoc/Readers/LaTeX | |
parent | 925815bb33b462e1a4c19a8e2c617d403dec0ce7 (diff) | |
download | pandoc-08231f5cdd16e31d38d9d6bf59bc5ca12638b438.tar.gz |
Factor out T.P.Readers.LaTeX.Table.
Diffstat (limited to 'src/Text/Pandoc/Readers/LaTeX')
-rw-r--r-- | src/Text/Pandoc/Readers/LaTeX/Parsing.hs | 33 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/LaTeX/Table.hs | 373 |
2 files changed, 406 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs index c2e10570d..4a9fa03ad 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs @@ -54,6 +54,7 @@ module Text.Pandoc.Readers.LaTeX.Parsing , comment , anyTok , singleChar + , tokWith , specialChars , endline , blankline @@ -80,6 +81,10 @@ module Text.Pandoc.Readers.LaTeX.Parsing , rawopt , overlaySpecification , getNextNumber + , label + , setCaption + , resetCaption + , env ) where import Control.Applicative (many, (<|>)) @@ -914,3 +919,31 @@ getNextNumber getCurrentNum = do Just n -> [n, 1] Nothing -> [1] +label :: PandocMonad m => LP m () +label = do + controlSeq "label" + t <- braced + updateState $ \st -> st{ sLastLabel = Just $ untokenize t } + +setCaption :: PandocMonad m => LP m Inlines -> LP m () +setCaption inline = try $ do + skipopts + ils <- tokWith inline + optional $ try $ spaces *> label + updateState $ \st -> st{ sCaption = Just ils } + +resetCaption :: PandocMonad m => LP m () +resetCaption = updateState $ \st -> st{ sCaption = Nothing + , sLastLabel = Nothing } + +env :: PandocMonad m => Text -> LP m a -> LP m a +env name p = p <* end_ name + +tokWith :: PandocMonad m => LP m Inlines -> LP m Inlines +tokWith inlineParser = try $ spaces >> + grouped inlineParser + <|> (lookAhead anyControlSeq >> inlineParser) + <|> singleChar' + where singleChar' = do + Tok _ _ t <- singleChar + return $ str t diff --git a/src/Text/Pandoc/Readers/LaTeX/Table.hs b/src/Text/Pandoc/Readers/LaTeX/Table.hs new file mode 100644 index 000000000..2ea9caf58 --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX/Table.hs @@ -0,0 +1,373 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE OverloadedStrings #-} +module Text.Pandoc.Readers.LaTeX.Table + ( tableEnvironments ) +where + +import Data.Functor (($>)) +import Text.Pandoc.Class +import Text.Pandoc.Readers.LaTeX.Parsing +import Text.Pandoc.Readers.LaTeX.Types +import Text.Pandoc.Builder as B +import qualified Data.Map as M +import Data.Text (Text) +import Data.Maybe (fromMaybe) +import qualified Data.Text as T +import Control.Applicative ((<|>), optional, many) +import Control.Monad (when, void) +import Text.Pandoc.Shared (safeRead, trim) +import Text.Pandoc.Logging (LogMessage(SkippedContent)) +import Text.Pandoc.Walk (walkM) +import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline, + optional, space, spaces, withRaw, (<|>)) + +tableEnvironments :: PandocMonad m + => LP m Blocks + -> LP m Inlines + -> M.Map Text (LP m Blocks) +tableEnvironments blocks inline = + M.fromList + [ ("longtable", env "longtable" $ + resetCaption *> + simpTable blocks inline "longtable" False >>= addTableCaption) + , ("table", env "table" $ + skipopts *> resetCaption *> blocks >>= addTableCaption) + , ("tabular*", env "tabular*" $ simpTable blocks inline "tabular*" True) + , ("tabularx", env "tabularx" $ simpTable blocks inline "tabularx" True) + , ("tabular", env "tabular" $ simpTable blocks inline "tabular" False) + ] + +hline :: PandocMonad m => LP m () +hline = try $ do + spaces + controlSeq "hline" <|> + -- booktabs rules: + controlSeq "toprule" <|> + controlSeq "bottomrule" <|> + controlSeq "midrule" <|> + controlSeq "endhead" <|> + controlSeq "endfirsthead" + spaces + optional rawopt + return () + +lbreak :: PandocMonad m => LP m Tok +lbreak = (controlSeq "\\" <|> controlSeq "tabularnewline") + <* skipopts <* spaces + +amp :: PandocMonad m => LP m Tok +amp = symbol '&' + +-- Split a Word into individual Symbols (for parseAligns) +splitWordTok :: PandocMonad m => LP m () +splitWordTok = do + inp <- getInput + case inp of + (Tok spos Word t : rest) -> + setInput $ map (Tok spos Symbol . T.singleton) (T.unpack t) <> rest + _ -> return () + +parseAligns :: PandocMonad m => LP m [(Alignment, ColWidth, ([Tok], [Tok]))] +parseAligns = try $ do + let maybeBar = skipMany + (try $ sp *> (() <$ symbol '|' <|> () <$ (symbol '@' >> braced))) + let cAlign = AlignCenter <$ symbol 'c' + let lAlign = AlignLeft <$ symbol 'l' + let rAlign = AlignRight <$ symbol 'r' + let parAlign = AlignLeft <$ symbol 'p' + -- aligns from tabularx + let xAlign = AlignLeft <$ symbol 'X' + let mAlign = AlignLeft <$ symbol 'm' + let bAlign = AlignLeft <$ symbol 'b' + let alignChar = splitWordTok *> ( cAlign <|> lAlign <|> rAlign <|> parAlign + <|> xAlign <|> mAlign <|> bAlign ) + let alignPrefix = symbol '>' >> braced + let alignSuffix = symbol '<' >> braced + let colWidth = try $ do + symbol '{' + ds <- trim . untokenize <$> manyTill anyTok (controlSeq "linewidth") + spaces + symbol '}' + return $ safeRead ds + let alignSpec = do + pref <- option [] alignPrefix + spaces + al <- alignChar + width <- colWidth <|> option Nothing (do s <- untokenize <$> braced + pos <- getPosition + report $ SkippedContent s pos + return Nothing) + spaces + suff <- option [] alignSuffix + return (al, width, (pref, suff)) + let starAlign = do -- '*{2}{r}' == 'rr', we just expand like a macro + symbol '*' + spaces + ds <- trim . untokenize <$> braced + spaces + spec <- braced + case safeRead ds of + Just n -> + getInput >>= setInput . (mconcat (replicate n spec) ++) + Nothing -> Prelude.fail $ "Could not parse " <> T.unpack ds <> " as number" + bgroup + spaces + maybeBar + aligns' <- many $ try $ spaces >> optional starAlign >> + (alignSpec <* maybeBar) + spaces + egroup + spaces + return $ map toSpec aligns' + where + toColWidth (Just w) | w > 0 = ColWidth w + toColWidth _ = ColWidthDefault + toSpec (x, y, z) = (x, toColWidth y, z) + +-- N.B. this parser returns a Row that may have erroneous empty cells +-- in it. See the note above fixTableHead for details. +parseTableRow :: PandocMonad m + => LP m Blocks -- ^ block parser + -> LP m Inlines -- ^ inline parser + -> Text -- ^ table environment name + -> [([Tok], [Tok])] -- ^ pref/suffixes + -> LP m Row +parseTableRow blocks inline envname prefsufs = do + notFollowedBy (spaces *> end_ envname) + -- add prefixes and suffixes in token stream: + let celltoks (pref, suff) = do + prefpos <- getPosition + contents <- mconcat <$> + many ( snd <$> withRaw + ((lookAhead (controlSeq "parbox") >> + void blocks) -- #5711 + <|> + (lookAhead (controlSeq "begin") >> void inline) + <|> + (lookAhead (symbol '$') >> void inline)) + <|> + (do notFollowedBy + (() <$ amp <|> () <$ lbreak <|> end_ envname) + count 1 anyTok) ) + + suffpos <- getPosition + option [] (count 1 amp) + return $ map (setpos prefpos) pref ++ contents ++ map (setpos suffpos) suff + rawcells <- mapM celltoks prefsufs + cells <- mapM (parseFromToks (parseTableCell blocks)) rawcells + spaces + return $ Row nullAttr cells + +parseTableCell :: PandocMonad m => LP m Blocks -> LP m Cell +parseTableCell blocks = do + spaces + updateState $ \st -> st{ sInTableCell = True } + cell' <- multicolumnCell blocks + <|> multirowCell blocks + <|> parseSimpleCell + <|> parseEmptyCell + updateState $ \st -> st{ sInTableCell = False } + spaces + return cell' + where + -- The parsing of empty cells is important in LaTeX, especially when dealing + -- with multirow/multicolumn. See #6603. + parseEmptyCell = spaces $> emptyCell + parseSimpleCell = simpleCell <$> (plainify <$> blocks) + + +cellAlignment :: PandocMonad m => LP m Alignment +cellAlignment = skipMany (symbol '|') *> alignment <* skipMany (symbol '|') + where + alignment = do + c <- untoken <$> singleChar + return $ case c of + "l" -> AlignLeft + "r" -> AlignRight + "c" -> AlignCenter + "*" -> AlignDefault + _ -> AlignDefault + +plainify :: Blocks -> Blocks +plainify bs = case toList bs of + [Para ils] -> plain (fromList ils) + _ -> bs + +multirowCell :: PandocMonad m => LP m Blocks -> LP m Cell +multirowCell blocks = controlSeq "multirow" >> do + -- Full prototype for \multirow macro is: + -- \multirow[vpos]{nrows}[bigstruts]{width}[vmove]{text} + -- However, everything except `nrows` and `text` make + -- sense in the context of the Pandoc AST + _ <- optional $ symbol '[' *> cellAlignment <* symbol ']' -- vertical position + nrows <- fmap (fromMaybe 1 . safeRead . untokenize) braced + _ <- optional $ symbol '[' *> manyTill anyTok (symbol ']') -- bigstrut-related + _ <- symbol '{' *> manyTill anyTok (symbol '}') -- Cell width + _ <- optional $ symbol '[' *> manyTill anyTok (symbol ']') -- Length used for fine-tuning + content <- symbol '{' *> (plainify <$> blocks) <* symbol '}' + return $ cell AlignDefault (RowSpan nrows) (ColSpan 1) content + +multicolumnCell :: PandocMonad m => LP m Blocks -> LP m Cell +multicolumnCell blocks = controlSeq "multicolumn" >> do + span' <- fmap (fromMaybe 1 . safeRead . untokenize) braced + alignment <- symbol '{' *> cellAlignment <* symbol '}' + + let singleCell = do + content <- plainify <$> blocks + return $ cell alignment (RowSpan 1) (ColSpan span') content + + -- Two possible contents: either a \multirow cell, or content. + -- E.g. \multicol{1}{c}{\multirow{2}{1em}{content}} + -- Note that a \multirow cell can be nested in a \multicolumn, + -- but not the other way around. See #6603 + let nestedCell = do + (Cell _ _ (RowSpan rs) _ bs) <- multirowCell blocks + return $ cell + alignment + (RowSpan rs) + (ColSpan span') + (fromList bs) + + symbol '{' *> (nestedCell <|> singleCell) <* symbol '}' + +-- LaTeX tables are stored with empty cells underneath multirow cells +-- denoting the grid spaces taken up by them. More specifically, if a +-- cell spans m rows, then it will overwrite all the cells in the +-- columns it spans for (m-1) rows underneath it, requiring padding +-- cells in these places. These padding cells need to be removed for +-- proper table reading. See #6603. +-- +-- These fixTable functions do not otherwise fix up malformed +-- input tables: that is left to the table builder. +fixTableHead :: TableHead -> TableHead +fixTableHead (TableHead attr rows) = TableHead attr rows' + where + rows' = fixTableRows rows + +fixTableBody :: TableBody -> TableBody +fixTableBody (TableBody attr rhc th tb) + = TableBody attr rhc th' tb' + where + th' = fixTableRows th + tb' = fixTableRows tb + +fixTableRows :: [Row] -> [Row] +fixTableRows = fixTableRows' $ repeat Nothing + where + fixTableRows' oldHang (Row attr cells : rs) + = let (newHang, cells') = fixTableRow oldHang cells + rs' = fixTableRows' newHang rs + in Row attr cells' : rs' + fixTableRows' _ [] = [] + +-- The overhang is represented as Just (relative cell dimensions) or +-- Nothing for an empty grid space. +fixTableRow :: [Maybe (ColSpan, RowSpan)] -> [Cell] -> ([Maybe (ColSpan, RowSpan)], [Cell]) +fixTableRow oldHang cells + -- If there's overhang, drop cells until their total width meets the + -- width of the occupied grid spaces (or we run out) + | (n, prefHang, restHang) <- splitHang oldHang + , n > 0 + = let cells' = dropToWidth getCellW n cells + (restHang', cells'') = fixTableRow restHang cells' + in (prefHang restHang', cells'') + -- Otherwise record the overhang of a pending cell and fix the rest + -- of the row + | c@(Cell _ _ h w _):cells' <- cells + = let h' = max 1 h + w' = max 1 w + oldHang' = dropToWidth getHangW w' oldHang + (newHang, cells'') = fixTableRow oldHang' cells' + in (toHang w' h' <> newHang, c : cells'') + | otherwise + = (oldHang, []) + where + getCellW (Cell _ _ _ w _) = w + getHangW = maybe 1 fst + getCS (ColSpan n) = n + + toHang c r + | r > 1 = [Just (c, r)] + | otherwise = replicate (getCS c) Nothing + + -- Take the prefix of the overhang list representing filled grid + -- spaces. Also return the remainder and the length of this prefix. + splitHang = splitHang' 0 id + + splitHang' !n l (Just (c, r):xs) + = splitHang' (n + c) (l . (toHang c (r-1) ++)) xs + splitHang' n l xs = (n, l, xs) + + -- Drop list items until the total width of the dropped items + -- exceeds the passed width. + dropToWidth _ n l | n < 1 = l + dropToWidth wproj n (c:cs) = dropToWidth wproj (n - wproj c) cs + dropToWidth _ _ [] = [] + +simpTable :: PandocMonad m + => LP m Blocks + -> LP m Inlines + -> Text + -> Bool + -> LP m Blocks +simpTable blocks inline envname hasWidthParameter = try $ do + when hasWidthParameter $ () <$ tokWith inline + skipopts + colspecs <- parseAligns + let (aligns, widths, prefsufs) = unzip3 colspecs + optional $ controlSeq "caption" *> setCaption inline + spaces + optional label + spaces + optional lbreak + spaces + skipMany hline + spaces + header' <- option [] . try . fmap (:[]) $ + parseTableRow blocks inline envname prefsufs <* + lbreak <* many1 hline + spaces + rows <- sepEndBy (parseTableRow blocks inline envname prefsufs) + (lbreak <* optional (skipMany hline)) + spaces + optional $ controlSeq "caption" *> setCaption inline + spaces + optional label + spaces + optional lbreak + spaces + lookAhead $ controlSeq "end" -- make sure we're at end + let th = fixTableHead $ TableHead nullAttr header' + let tbs = [fixTableBody $ TableBody nullAttr 0 [] rows] + let tf = TableFoot nullAttr [] + return $ table emptyCaption (zip aligns widths) th tbs tf + +addTableCaption :: PandocMonad m => Blocks -> LP m Blocks +addTableCaption = walkM go + where go (Table attr c spec th tb tf) = do + st <- getState + let mblabel = sLastLabel st + capt <- case (sCaption st, mblabel) of + (Just ils, Nothing) -> return $ caption Nothing (plain ils) + (Just ils, Just lab) -> do + num <- getNextNumber sLastTableNum + setState + st{ sLastTableNum = num + , sLabels = M.insert lab + [Str (renderDottedNum num)] + (sLabels st) } + return $ caption Nothing (plain ils) -- add number?? + (Nothing, _) -> return c + let attr' = case (attr, mblabel) of + ((_,classes,kvs), Just ident) -> + (ident,classes,kvs) + _ -> attr + return $ addAttrDiv attr' $ Table nullAttr capt spec th tb tf + go x = return x + +-- TODO: For now we add a Div to contain table attributes, since +-- most writers don't do anything yet with attributes on Table. +-- This can be removed when that changes. +addAttrDiv :: Attr -> Block -> Block +addAttrDiv ("",[],[]) b = b +addAttrDiv attr b = Div attr [b] |