aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2018-10-25 11:55:44 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2018-10-25 12:01:35 -0700
commit31759731e754de9b37131229f65a5f8787b53a5d (patch)
treef42678680107a4d1b874a1c18eec996a3be5a8e8
parent07fc8501726563d32b57fa5740e90dec17f8f4a8 (diff)
downloadpandoc-31759731e754de9b37131229f65a5f8787b53a5d.tar.gz
Implemented groff table lexing.
We don't yet actually parse the tables in man, but most of the hard work is done. Also: Export lexGroff from T.P.Readers.Groff, instead of lower-level definitions. Rename things in T.P.Readers.Groff as `*Groff` rather than `*Man`.
-rw-r--r--src/Text/Pandoc/Readers/Groff.hs191
-rw-r--r--src/Text/Pandoc/Readers/Man.hs51
2 files changed, 162 insertions, 80 deletions
diff --git a/src/Text/Pandoc/Readers/Groff.hs b/src/Text/Pandoc/Readers/Groff.hs
index 4b92bd85a..e48f53432 100644
--- a/src/Text/Pandoc/Readers/Groff.hs
+++ b/src/Text/Pandoc/Readers/Groff.hs
@@ -37,18 +37,16 @@ module Text.Pandoc.Readers.Groff
, defaultFontSpec
, LinePart(..)
, Arg
- , ManToken(..)
- , ManTokens(..)
- , singleTok
- , RoffState(..)
- , ManLexer
- , manToken
+ , GroffToken(..)
+ , GroffTokens(..)
, linePartsToString
+ , lexGroff
)
where
import Prelude
import Control.Monad (void, mzero, guard, when)
+import Control.Monad.Except (throwError)
import Text.Pandoc.Class
(getResourcePath, readFileFromDirs, PandocMonad(..), report)
import Data.Char (isHexDigit, chr, ord, isAscii, isAlphaNum, isSpace)
@@ -56,7 +54,6 @@ import Data.Default (Default)
import qualified Data.Map as M
import Data.List (intercalate, isSuffixOf)
import qualified Data.Text as T
-import Text.Pandoc.Builder as B
import Text.Pandoc.Logging (LogMessage(..))
import Text.Pandoc.Options
import Text.Pandoc.Parsing
@@ -92,21 +89,22 @@ data LinePart = RoffStr String
type Arg = [LinePart]
-- TODO parse tables (see man tbl)
-data ManToken = MLine [LinePart]
+data GroffToken = MLine [LinePart]
| MEmptyLine
| MMacro MacroKind [Arg] SourcePos
- | MTable [Alignment] ManTokens [ManTokens] [[ManTokens]]
+ | MTable [[String]] [[GroffTokens]] SourcePos
deriving Show
-newtype ManTokens = ManTokens { unManTokens :: Seq.Seq ManToken }
+newtype GroffTokens = GroffTokens { unGroffTokens :: Seq.Seq GroffToken }
deriving (Show, Semigroup, Monoid)
-singleTok :: ManToken -> ManTokens
-singleTok t = ManTokens (Seq.singleton t)
+singleTok :: GroffToken -> GroffTokens
+singleTok t = GroffTokens (Seq.singleton t)
-data RoffState = RoffState { customMacros :: M.Map String ManTokens
+data RoffState = RoffState { customMacros :: M.Map String GroffTokens
, prevFont :: FontSpec
, currentFont :: FontSpec
+ , tableTabChar :: Char
} deriving Show
instance Default RoffState where
@@ -120,12 +118,13 @@ instance Default RoffState where
, ("R", "\x00AE") ]
, prevFont = defaultFontSpec
, currentFont = defaultFontSpec
+ , tableTabChar = '\t'
}
-type ManLexer m = ParserT [Char] RoffState m
+type GroffLexer m = ParserT [Char] RoffState m
--
--- Lexer: String -> ManToken
+-- Lexer: String -> GroffToken
--
eofline :: Stream s m Char => ParsecT s u m ()
@@ -142,7 +141,7 @@ combiningAccentsMap :: M.Map String Char
combiningAccentsMap =
M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
-escape :: PandocMonad m => ManLexer m [LinePart]
+escape :: PandocMonad m => GroffLexer m [LinePart]
escape = do
char '\\'
c <- anyChar
@@ -224,14 +223,14 @@ escape = do
Nothing -> mzero
Just c -> return c
- escUnknown :: PandocMonad m => String -> ManLexer m [LinePart]
+ escUnknown :: PandocMonad m => String -> GroffLexer m [LinePart]
escUnknown s = do
pos <- getPosition
report $ SkippedContent ("Unknown escape sequence " ++ s) pos
return [RoffStr "\xFFFD"]
-- \s-1 \s0
-escFontSize :: PandocMonad m => ManLexer m [LinePart]
+escFontSize :: PandocMonad m => GroffLexer m [LinePart]
escFontSize = do
let sign = option "" $ count 1 (oneOf "+-")
let toFontSize xs =
@@ -253,7 +252,7 @@ escFontSize = do
toFontSize (s ++ ds)
]
-escFont :: PandocMonad m => ManLexer m [LinePart]
+escFont :: PandocMonad m => GroffLexer m [LinePart]
escFont = do
font <- choice
[ char 'S' >> return defaultFontSpec
@@ -267,7 +266,7 @@ escFont = do
, currentFont = font }
return [Font font]
-lettersFont :: PandocMonad m => ManLexer m FontSpec
+lettersFont :: PandocMonad m => GroffLexer m FontSpec
lettersFont = try $ do
char '['
fs <- many letterFontKind
@@ -277,7 +276,7 @@ lettersFont = try $ do
then prevFont <$> getState
else return $ foldr ($) defaultFontSpec fs
-letterFontKind :: PandocMonad m => ManLexer m (FontSpec -> FontSpec)
+letterFontKind :: PandocMonad m => GroffLexer m (FontSpec -> FontSpec)
letterFontKind = choice [
oneOf ['B','b'] >> return (\fs -> fs{ fontBold = True })
, oneOf ['I','i'] >> return (\fs -> fs { fontItalic = True })
@@ -288,7 +287,7 @@ letterFontKind = choice [
-- separate function from lexMacro since real man files sometimes do not
-- follow the rules
-lexComment :: PandocMonad m => ManLexer m ManTokens
+lexComment :: PandocMonad m => GroffLexer m GroffTokens
lexComment = do
try $ string ".\\\""
many Parsec.space
@@ -296,18 +295,18 @@ lexComment = do
char '\n'
return mempty
-lexMacro :: PandocMonad m => ManLexer m ManTokens
+lexMacro :: PandocMonad m => GroffLexer m GroffTokens
lexMacro = do
pos <- getPosition
char '.' <|> char '\''
- many spacetab
+ skipMany spacetab
macroName <- many (satisfy (not . isSpace))
case macroName of
"nop" -> return mempty
"ie" -> lexConditional
"if" -> lexConditional
"el" -> skipConditional
- "TS" -> lexTable
+ "TS" -> lexTable pos
_ -> do
args <- lexArgs
@@ -323,30 +322,100 @@ lexMacro = do
"so" -> lexIncludeFile args
_ -> resolveMacro macroName args pos
--- | TODO placeholder
-lexTable :: PandocMonad m => ManLexer m ManTokens
-lexTable = do
- pos <- getPosition
- manyTill anyLine (try (string ".TE" >> many spacetab >> eofline))
- report $ SkippedContent "table" pos
- return mempty
-
+lexTable :: PandocMonad m => SourcePos -> GroffLexer m GroffTokens
+lexTable pos = do
+ spaces
+ optional tableOptions
+ spaces
+ aligns <- tableFormatSpec
+ spaces
+ rows <- manyTill tableRow (try (string ".TE" >> skipMany spacetab >> eofline))
+ return $ singleTok $ MTable aligns rows pos
+
+tableCell :: PandocMonad m => GroffLexer m GroffTokens
+tableCell = (enclosedCell <|> simpleCell) >>= lexGroff . T.pack
+ where
+ enclosedCell = do
+ try (string "T{")
+ manyTill anyChar (try (string "T}"))
+ simpleCell = do
+ tabChar <- tableTabChar <$> getState
+ many1 (notFollowedBy (char tabChar <|> newline) >> anyChar)
+
+tableRow :: PandocMonad m => GroffLexer m [GroffTokens]
+tableRow = do
+ tabChar <- tableTabChar <$> getState
+ c <- tableCell
+ cs <- many $ try (char tabChar >> tableCell)
+ skipMany spacetab
+ eofline
+ return (c:cs)
+
+tableOptions :: PandocMonad m => GroffLexer m ()
+tableOptions = try $ do
+ opts <- many1 tableOption <* spaces <* char ';'
+ case lookup "tab" opts of
+ Just (c:_) -> modifyState $ \st -> st{ tableTabChar = c }
+ _ -> modifyState $ \st -> st{ tableTabChar = '\t' }
+ return ()
+
+tableOption :: PandocMonad m => GroffLexer m (String, String)
+tableOption = do
+ k <- many1 letter
+ v <- option "" $ do
+ char '('
+ manyTill anyChar (char ')')
+ spaces
+ optional (char ',')
+ return (k,v)
+
+tableFormatSpec :: PandocMonad m => GroffLexer m [[String]]
+tableFormatSpec = do
+ speclines <- tableFormatSpecLine `sepBy1` (newline <|> char ',')
+ char '.'
+ return speclines
+
+tableFormatSpecLine :: PandocMonad m => GroffLexer m [String]
+tableFormatSpecLine = do
+ as <- many1 $ skipMany spacetab >> tableColFormat
+ skipMany spacetab
+ return as
+
+tableColFormat :: PandocMonad m => GroffLexer m String
+tableColFormat = do
+ pipePrefix <- option "" $ try $ string "|" <* notFollowedBy spacetab
+ c <- oneOf ['a','A','c','C','l','L','n','N','r','R','s','S','^','_','-',
+ '=','|']
+ numsuffix <- option "" $ many1 digit
+ suffixes <- many $ do
+ x <- oneOf ['b','B','d','D','e','E','f','F','i','I','m','M',
+ 'p','P','t','T','u','U','v','V','w','W','x','X', 'z','Z']
+ num <- if x == 'w'
+ then many1 digit <|>
+ do char '('
+ xs <- manyTill anyChar (char ')')
+ return ("(" ++ xs ++ ")")
+ else return ""
+ return $ x : num
+ pipeSuffix <- option "" $ string "|"
+ return $ pipePrefix ++ (c : numsuffix ++ concat suffixes ++ pipeSuffix)
+
-- We don't fully handle the conditional. But we do
-- include everything under '.ie n', which occurs commonly
-- in man pages. We always skip the '.el' part.
-lexConditional :: PandocMonad m => ManLexer m ManTokens
+lexConditional :: PandocMonad m => GroffLexer m GroffTokens
lexConditional = do
skipMany spacetab
lexNCond <|> skipConditional
-- n means nroff mode
-lexNCond :: PandocMonad m => ManLexer m ManTokens
+lexNCond :: PandocMonad m => GroffLexer m GroffTokens
lexNCond = do
char '\n'
many1 spacetab
lexGroup <|> manToken
-lexGroup :: PandocMonad m => ManLexer m ManTokens
+lexGroup :: PandocMonad m => GroffLexer m GroffTokens
lexGroup = do
groupstart
mconcat <$> manyTill manToken groupend
@@ -354,14 +423,14 @@ lexGroup = do
groupstart = try $ string "\\{\\" >> newline
groupend = try $ string "\\}" >> eofline
-skipConditional :: PandocMonad m => ManLexer m ManTokens
+skipConditional :: PandocMonad m => GroffLexer m GroffTokens
skipConditional = do
rest <- anyLine
when ("\\{\\" `isSuffixOf` rest) $
void $ manyTill anyChar (try (string "\\}"))
return mempty
-lexIncludeFile :: PandocMonad m => [Arg] -> ManLexer m ManTokens
+lexIncludeFile :: PandocMonad m => [Arg] -> GroffLexer m GroffTokens
lexIncludeFile args = do
pos <- getPosition
case args of
@@ -376,7 +445,7 @@ lexIncludeFile args = do
[] -> return mempty
resolveMacro :: PandocMonad m
- => String -> [Arg] -> SourcePos -> ManLexer m ManTokens
+ => String -> [Arg] -> SourcePos -> GroffLexer m GroffTokens
resolveMacro macroName args pos = do
macros <- customMacros <$> getState
case M.lookup macroName macros of
@@ -390,9 +459,9 @@ resolveMacro macroName args pos = do
let fillMacroArg (MLine lineparts) =
MLine (foldr fillLP [] lineparts)
fillMacroArg x = x
- return $ ManTokens . fmap fillMacroArg . unManTokens $ ts
+ return $ GroffTokens . fmap fillMacroArg . unGroffTokens $ ts
-lexStringDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens
+lexStringDef :: PandocMonad m => [Arg] -> GroffLexer m GroffTokens
lexStringDef args = do -- string definition
case args of
[] -> fail "No argument to .ds"
@@ -403,7 +472,7 @@ lexStringDef args = do -- string definition
st{ customMacros = M.insert stringName ts (customMacros st) }
return mempty
-lexMacroDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens
+lexMacroDef :: PandocMonad m => [Arg] -> GroffLexer m GroffTokens
lexMacroDef args = do -- macro definition
(macroName, stopMacro) <-
case args of
@@ -413,7 +482,7 @@ lexMacroDef args = do -- macro definition
[] -> fail "No argument to .de"
let stop = try $ do
char '.' <|> char '\''
- many spacetab
+ skipMany spacetab
string stopMacro
_ <- lexArgs
return ()
@@ -422,7 +491,7 @@ lexMacroDef args = do -- macro definition
st{ customMacros = M.insert macroName ts (customMacros st) }
return mempty
-lexArgs :: PandocMonad m => ManLexer m [Arg]
+lexArgs :: PandocMonad m => GroffLexer m [Arg]
lexArgs = do
args <- many $ try oneArg
skipMany spacetab
@@ -431,20 +500,20 @@ lexArgs = do
where
- oneArg :: PandocMonad m => ManLexer m [LinePart]
+ oneArg :: PandocMonad m => GroffLexer m [LinePart]
oneArg = do
skipMany $ try $ string "\\\n" -- continuation line
try quotedArg <|> plainArg
-- try, because there are some erroneous files, e.g. linux/bpf.2
- plainArg :: PandocMonad m => ManLexer m [LinePart]
+ plainArg :: PandocMonad m => GroffLexer m [LinePart]
plainArg = do
skipMany spacetab
mconcat <$> many1 (macroArg <|> escape <|> regularText <|> unescapedQuote)
where
unescapedQuote = char '"' >> return [RoffStr "\""]
- quotedArg :: PandocMonad m => ManLexer m [LinePart]
+ quotedArg :: PandocMonad m => GroffLexer m [LinePart]
quotedArg = do
skipMany spacetab
char '"'
@@ -459,7 +528,7 @@ lexArgs = do
char '"'
return [RoffStr "\""]
-escStar :: PandocMonad m => ManLexer m [LinePart]
+escStar :: PandocMonad m => GroffLexer m [LinePart]
escStar = try $ do
pos <- getPosition
c <- anyChar
@@ -478,14 +547,14 @@ escStar = try $ do
-- strings and macros share namespace
resolveString stringname pos = do
- ManTokens ts <- resolveMacro stringname [] pos
+ GroffTokens ts <- resolveMacro stringname [] pos
case Foldable.toList ts of
[MLine xs] -> return xs
_ -> do
report $ SkippedContent ("unknown string " ++ stringname) pos
return mempty
-lexLine :: PandocMonad m => ManLexer m ManTokens
+lexLine :: PandocMonad m => GroffLexer m GroffTokens
lexLine = do
lnparts <- mconcat <$> many1 linePart
eofline
@@ -496,35 +565,35 @@ lexLine = do
go (RoffStr "" : xs) = go xs
go xs = return $ singleTok $ MLine xs
-linePart :: PandocMonad m => ManLexer m [LinePart]
+linePart :: PandocMonad m => GroffLexer m [LinePart]
linePart = macroArg <|> escape <|>
regularText <|> quoteChar <|> spaceTabChar
-macroArg :: PandocMonad m => ManLexer m [LinePart]
+macroArg :: PandocMonad m => GroffLexer m [LinePart]
macroArg = try $ do
string "\\\\$"
x <- digit
return [MacroArg $ ord x - ord '0']
-regularText :: PandocMonad m => ManLexer m [LinePart]
+regularText :: PandocMonad m => GroffLexer m [LinePart]
regularText = do
s <- many1 $ noneOf "\n\r\t \\\""
return [RoffStr s]
-quoteChar :: PandocMonad m => ManLexer m [LinePart]
+quoteChar :: PandocMonad m => GroffLexer m [LinePart]
quoteChar = do
char '"'
return [RoffStr "\""]
-spaceTabChar :: PandocMonad m => ManLexer m [LinePart]
+spaceTabChar :: PandocMonad m => GroffLexer m [LinePart]
spaceTabChar = do
c <- spacetab
return [RoffStr [c]]
-lexEmptyLine :: PandocMonad m => ManLexer m ManTokens
+lexEmptyLine :: PandocMonad m => GroffLexer m GroffTokens
lexEmptyLine = char '\n' >> return (singleTok MEmptyLine)
-manToken :: PandocMonad m => ManLexer m ManTokens
+manToken :: PandocMonad m => GroffLexer m GroffTokens
manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine
linePartsToString :: [LinePart] -> String
@@ -532,3 +601,11 @@ linePartsToString = mconcat . map go
where
go (RoffStr s) = s
go _ = mempty
+
+-- | Tokenize a string as a sequence of groff tokens.
+lexGroff :: PandocMonad m => T.Text -> m GroffTokens
+lexGroff txt = do
+ eithertokens <- readWithM (mconcat <$> many manToken) def (T.unpack txt)
+ case eithertokens of
+ Left e -> throwError e
+ Right tokenz -> return tokenz
diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs
index 8dda237ba..dc4ba3f52 100644
--- a/src/Text/Pandoc/Readers/Man.hs
+++ b/src/Text/Pandoc/Readers/Man.hs
@@ -61,28 +61,24 @@ instance Default ManState where
def = ManState { readerOptions = def
, metadata = nullMeta }
-type ManParser m = ParserT [ManToken] ManState m
+type ManParser m = ParserT [GroffToken] ManState m
-- | Read man (troff) from an input string and return a Pandoc document.
readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc
readMan opts txt = do
- eithertokens <- readWithM
- (Foldable.toList . unManTokens . mconcat <$> many manToken)
- def (T.unpack $ crFilter txt)
- case eithertokens of
- Left e -> throwError e
- Right tokenz -> do
- let state = def {readerOptions = opts} :: ManState
- eitherdoc <- readWithMTokens parseMan state tokenz
- either throwError return eitherdoc
+ tokenz <- lexGroff (crFilter txt)
+ let state = def {readerOptions = opts} :: ManState
+ eitherdoc <- readWithMTokens parseMan state
+ (Foldable.toList . unGroffTokens $ tokenz)
+ either throwError return eitherdoc
where
readWithMTokens :: PandocMonad m
- => ParserT [ManToken] ManState m a -- ^ parser
+ => ParserT [GroffToken] ManState m a -- ^ parser
-> ManState -- ^ initial state
- -> [ManToken] -- ^ input
+ -> [GroffToken] -- ^ input
-> m (Either PandocError a)
readWithMTokens parser state input =
let leftF = PandocParsecError . intercalate "\n" $ show <$> input
@@ -109,19 +105,28 @@ parseBlock = choice [ parseList
, parsePara
, parseCodeBlock
, parseHeader
+ , parseTable
, skipUnkownMacro
]
+parseTable :: PandocMonad m => ManParser m Blocks
+parseTable = do
+ let isMTable (MTable{}) = True
+ isMTable _ = False
+ MTable _aligns _rows pos <- msatisfy isMTable
+ report $ SkippedContent "TABLE" pos
+ return $ B.para (B.text "TABLE")
+
parseNewParagraph :: PandocMonad m => ManParser m Blocks
parseNewParagraph = do
mmacro "P" <|> mmacro "PP" <|> mmacro "LP" <|> memptyLine
return mempty
--
--- Parser: [ManToken] -> Pandoc
+-- Parser: [GroffToken] -> Pandoc
--
-msatisfy :: Monad m => (ManToken -> Bool) -> ParserT [ManToken] st m ManToken
+msatisfy :: Monad m => (GroffToken -> Bool) -> ParserT [GroffToken] st m GroffToken
msatisfy predic = tokenPrim show nextPos testTok
where
testTok t = if predic t then Just t else Nothing
@@ -130,32 +135,32 @@ msatisfy predic = tokenPrim show nextPos testTok
(setSourceColumn
(setSourceLine pos $ sourceLine pos + 1) 1) ""
-mtoken :: PandocMonad m => ManParser m ManToken
+mtoken :: PandocMonad m => ManParser m GroffToken
mtoken = msatisfy (const True)
-mline :: PandocMonad m => ManParser m ManToken
+mline :: PandocMonad m => ManParser m GroffToken
mline = msatisfy isMLine where
isMLine (MLine _) = True
isMLine _ = False
-memptyLine :: PandocMonad m => ManParser m ManToken
+memptyLine :: PandocMonad m => ManParser m GroffToken
memptyLine = msatisfy isMEmptyLine where
isMEmptyLine MEmptyLine = True
isMEmptyLine _ = False
-mmacro :: PandocMonad m => MacroKind -> ManParser m ManToken
+mmacro :: PandocMonad m => MacroKind -> ManParser m GroffToken
mmacro mk = msatisfy isMMacro where
isMMacro (MMacro mk' _ _) | mk == mk' = True
| otherwise = False
isMMacro _ = False
-mmacroAny :: PandocMonad m => ManParser m ManToken
+mmacroAny :: PandocMonad m => ManParser m GroffToken
mmacroAny = msatisfy isMMacro where
isMMacro (MMacro{}) = True
isMMacro _ = False
--
--- ManToken -> Block functions
+-- GroffToken -> Block functions
--
parseTitle :: PandocMonad m => ManParser m Blocks
@@ -284,12 +289,12 @@ lineInl = do
(MLine fragments) <- mline
return $ linePartsToInlines fragments
-bareIP :: PandocMonad m => ManParser m ManToken
+bareIP :: PandocMonad m => ManParser m GroffToken
bareIP = msatisfy isBareIP where
isBareIP (MMacro "IP" [] _) = True
isBareIP _ = False
-endmacro :: PandocMonad m => String -> ManParser m ManToken
+endmacro :: PandocMonad m => String -> ManParser m GroffToken
endmacro name = mmacro name <|> lookAhead newBlockMacro
where
newBlockMacro = msatisfy isNewBlockMacro
@@ -307,7 +312,7 @@ parseCodeBlock = try $ do
where
- extractText :: ManToken -> Maybe String
+ extractText :: GroffToken -> Maybe String
extractText (MLine ss)
| not (null ss)
, all isFontToken ss = Nothing