aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/Roff.hs
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2018-10-26 21:28:38 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2018-10-26 21:29:33 -0700
commite0f985bb2139f142223f8d21e28a3a6bf4605cb7 (patch)
tree44b06d5d5483cd1c98dfc3ff55ae866bb40df59d /src/Text/Pandoc/Readers/Roff.hs
parent52df18f476b7eb7935c2c01f566fa1adee4a8621 (diff)
downloadpandoc-e0f985bb2139f142223f8d21e28a3a6bf4605cb7.tar.gz
Rename Groff -> Roff.
Module T.P.Readers.Groff -> T.P.Readers.Roff Module T.P.Writers.Groff -> T.P.Writers.Roff Module T.P.GroffChar -> T.P.RoffChar GroffTokens -> RoffTokens GroffToken -> RoffToken.
Diffstat (limited to 'src/Text/Pandoc/Readers/Roff.hs')
-rw-r--r--src/Text/Pandoc/Readers/Roff.hs650
1 files changed, 650 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/Roff.hs b/src/Text/Pandoc/Readers/Roff.hs
new file mode 100644
index 000000000..9d71c98b3
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Roff.hs
@@ -0,0 +1,650 @@
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-
+ Copyright (C) 2018 Yan Pashkovsky <yanp.bugz@gmail.com>
+ and John MacFarlane
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Roff
+ Copyright : Copyright (C) 2018 Yan Pashkovsky and John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Yan Pashkovsky <yanp.bugz@gmail.com>
+ Stability : WIP
+ Portability : portable
+
+Tokenizer for groff formats (man, ms).
+-}
+module Text.Pandoc.Readers.Roff
+ ( MacroKind
+ , FontSpec(..)
+ , defaultFontSpec
+ , LinePart(..)
+ , Arg
+ , TableOption
+ , CellFormat(..)
+ , TableRow
+ , RoffToken(..)
+ , RoffTokens(..)
+ , linePartsToString
+ , lexRoff
+ )
+where
+
+import Prelude
+import Safe (lastDef)
+import Control.Monad (void, mzero, guard, when)
+import Control.Monad.Except (throwError)
+import Text.Pandoc.Class
+ (getResourcePath, readFileFromDirs, PandocMonad(..), report)
+import Data.Char (isHexDigit, chr, ord, isAscii, isAlphaNum, isSpace)
+import Data.Default (Default)
+import qualified Data.Map as M
+import Data.List (intercalate, isSuffixOf)
+import qualified Data.Text as T
+import Text.Pandoc.Logging (LogMessage(..))
+import Text.Pandoc.Options
+import Text.Pandoc.Parsing
+import Text.Pandoc.Shared (safeRead)
+import Text.Parsec hiding (tokenPrim)
+import qualified Text.Parsec as Parsec
+import Text.Pandoc.RoffChar (characterCodes, combiningAccents)
+import qualified Data.Sequence as Seq
+import qualified Data.Foldable as Foldable
+import qualified Data.Text.Normalize as Normalize
+
+-- import Debug.Trace (traceShowId)
+
+--
+-- Data Types
+--
+data FontSpec = FontSpec{ fontBold :: Bool
+ , fontItalic :: Bool
+ , fontMonospace :: Bool
+ } deriving (Show, Eq, Ord)
+
+defaultFontSpec :: FontSpec
+defaultFontSpec = FontSpec False False False
+
+type MacroKind = String
+
+data LinePart = RoffStr String
+ | Font FontSpec
+ | FontSize Int
+ | MacroArg Int
+ deriving Show
+
+type Arg = [LinePart]
+
+type TableOption = (String, String)
+
+data CellFormat =
+ CellFormat
+ { columnType :: Char
+ , pipePrefix :: Bool
+ , pipeSuffix :: Bool
+ , columnSuffixes :: [String]
+ } deriving (Show, Eq, Ord)
+
+type TableRow = ([CellFormat], [RoffTokens])
+
+-- TODO parse tables (see man tbl)
+data RoffToken = MLine [LinePart]
+ | MEmptyLine
+ | MMacro MacroKind [Arg] SourcePos
+ | MTable [TableOption] [TableRow] SourcePos
+ deriving Show
+
+newtype RoffTokens = RoffTokens { unRoffTokens :: Seq.Seq RoffToken }
+ deriving (Show, Semigroup, Monoid)
+
+singleTok :: RoffToken -> RoffTokens
+singleTok t = RoffTokens (Seq.singleton t)
+
+data RoffState = RoffState { customMacros :: M.Map String RoffTokens
+ , prevFont :: FontSpec
+ , currentFont :: FontSpec
+ , tableTabChar :: Char
+ } deriving Show
+
+instance Default RoffState where
+ def = RoffState { customMacros = M.fromList
+ $ map (\(n, s) ->
+ (n, singleTok
+ (MLine [RoffStr s])))
+ [ ("Tm", "\x2122")
+ , ("lq", "\x201C")
+ , ("rq", "\x201D")
+ , ("R", "\x00AE") ]
+ , prevFont = defaultFontSpec
+ , currentFont = defaultFontSpec
+ , tableTabChar = '\t'
+ }
+
+type RoffLexer m = ParserT [Char] RoffState m
+
+--
+-- Lexer: String -> RoffToken
+--
+
+eofline :: Stream s m Char => ParsecT s u m ()
+eofline = void newline <|> eof
+
+spacetab :: Stream s m Char => ParsecT s u m Char
+spacetab = char ' ' <|> char '\t'
+
+characterCodeMap :: M.Map String Char
+characterCodeMap =
+ M.fromList $ map (\(x,y) -> (y,x)) characterCodes
+
+combiningAccentsMap :: M.Map String Char
+combiningAccentsMap =
+ M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
+
+escape :: PandocMonad m => RoffLexer m [LinePart]
+escape = do
+ char '\\'
+ c <- anyChar
+ case c of
+ 'f' -> escFont
+ 's' -> escFontSize
+ '*' -> escStar
+ '(' -> twoCharGlyph
+ '[' -> bracketedGlyph
+ '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
+ '#' -> mempty <$ manyTill anyChar newline
+ '%' -> return mempty
+ '{' -> return mempty
+ '}' -> return mempty
+ '&' -> return mempty
+ '\n' -> return mempty
+ ':' -> return mempty
+ '0' -> return mempty
+ 'c' -> return mempty
+ '-' -> return [RoffStr "-"]
+ '_' -> return [RoffStr "_"]
+ ' ' -> return [RoffStr " "]
+ '\\' -> return [RoffStr "\\"]
+ 't' -> return [RoffStr "\t"]
+ 'e' -> return [RoffStr "\\"]
+ '`' -> return [RoffStr "`"]
+ '^' -> return [RoffStr " "]
+ '|' -> return [RoffStr " "]
+ '\'' -> return [RoffStr "`"]
+ '.' -> return [RoffStr "`"]
+ '~' -> return [RoffStr "\160"] -- nonbreaking space
+ _ -> escUnknown ['\\',c]
+
+ where
+
+ twoCharGlyph = do
+ cs <- count 2 anyChar
+ case M.lookup cs characterCodeMap of
+ Just c -> return [RoffStr [c]]
+ Nothing -> escUnknown ('\\':'(':cs)
+
+ bracketedGlyph = unicodeGlyph <|> charGlyph
+
+ charGlyph = do
+ cs <- manyTill (noneOf ['[',']','\n']) (char ']')
+ (case words cs of
+ [] -> mzero
+ [s] -> case M.lookup s characterCodeMap of
+ Nothing -> mzero
+ Just c -> return [RoffStr [c]]
+ (s:ss) -> do
+ basechar <- case M.lookup cs characterCodeMap of
+ Nothing ->
+ case s of
+ [ch] | isAscii ch && isAlphaNum ch ->
+ return ch
+ _ -> mzero
+ Just c -> return c
+ let addAccents [] xs = return $ T.unpack .
+ Normalize.normalize Normalize.NFC .
+ T.pack $ reverse xs
+ addAccents (a:as) xs =
+ case M.lookup a combiningAccentsMap of
+ Just x -> addAccents as (x:xs)
+ Nothing -> mzero
+ addAccents ss [basechar] >>= \xs -> return [RoffStr xs])
+ <|> escUnknown ("\\[" ++ cs ++ "]")
+
+ unicodeGlyph = try $ do
+ xs <- ucharCode `sepBy1` (char '_') <* char ']'
+ return [RoffStr xs]
+
+ ucharCode = try $ do
+ char 'u'
+ cs <- many1 (satisfy isHexDigit)
+ let lcs = length cs
+ guard $ lcs >= 4 && lcs <= 6
+ case chr <$> safeRead ('0':'x':cs) of
+ Nothing -> mzero
+ Just c -> return c
+
+ escUnknown :: PandocMonad m => String -> RoffLexer m [LinePart]
+ escUnknown s = do
+ pos <- getPosition
+ report $ SkippedContent ("Unknown escape sequence " ++ s) pos
+ return [RoffStr "\xFFFD"]
+
+-- \s-1 \s0
+escFontSize :: PandocMonad m => RoffLexer m [LinePart]
+escFontSize = do
+ let sign = option "" $ ("-" <$ char '-' <|> "" <$ char '+')
+ let toFontSize xs =
+ case safeRead xs of
+ Nothing -> mzero
+ Just n -> return [FontSize n]
+ choice
+ [ do char '('
+ s <- sign
+ ds <- count 2 digit
+ toFontSize (s ++ ds)
+ , do char '['
+ s <- sign
+ ds <- many1 digit
+ char ']'
+ toFontSize (s ++ ds)
+ , do s <- sign
+ ds <- count 1 digit
+ toFontSize (s ++ ds)
+ ]
+
+escFont :: PandocMonad m => RoffLexer m [LinePart]
+escFont = do
+ font <- choice
+ [ char 'S' >> return defaultFontSpec
+ , digit >> return defaultFontSpec
+ , char '(' >> anyChar >> anyChar >> return defaultFontSpec
+ , digit >> return defaultFontSpec
+ , ($ defaultFontSpec) <$> letterFontKind
+ , lettersFont
+ ]
+ modifyState $ \st -> st{ prevFont = currentFont st
+ , currentFont = font }
+ return [Font font]
+
+lettersFont :: PandocMonad m => RoffLexer m FontSpec
+lettersFont = try $ do
+ char '['
+ fs <- many letterFontKind
+ skipMany letter
+ char ']'
+ if null fs
+ then prevFont <$> getState
+ else return $ foldr ($) defaultFontSpec fs
+
+letterFontKind :: PandocMonad m => RoffLexer m (FontSpec -> FontSpec)
+letterFontKind = choice [
+ oneOf ['B','b'] >> return (\fs -> fs{ fontBold = True })
+ , oneOf ['I','i'] >> return (\fs -> fs { fontItalic = True })
+ , oneOf ['C','c'] >> return (\fs -> fs { fontMonospace = True })
+ , oneOf ['P','p','R','r'] >> return id
+ ]
+
+
+-- separate function from lexMacro since real man files sometimes do not
+-- follow the rules
+lexComment :: PandocMonad m => RoffLexer m RoffTokens
+lexComment = do
+ try $ string ".\\\""
+ many Parsec.space
+ skipMany $ noneOf "\n"
+ char '\n'
+ return mempty
+
+lexMacro :: PandocMonad m => RoffLexer m RoffTokens
+lexMacro = do
+ pos <- getPosition
+ char '.' <|> char '\''
+ skipMany spacetab
+ macroName <- many (satisfy (not . isSpace))
+ case macroName of
+ "nop" -> return mempty
+ "ie" -> lexConditional
+ "if" -> lexConditional
+ "el" -> skipConditional
+ "TS" -> lexTable pos
+
+ _ -> do
+ args <- lexArgs
+ case macroName of
+ "" -> return mempty
+ "\\\"" -> return mempty
+ "\\#" -> return mempty
+ "de" -> lexMacroDef args
+ "de1" -> lexMacroDef args
+ "ds" -> lexStringDef args
+ "ds1" -> lexStringDef args
+ "sp" -> return $ singleTok MEmptyLine
+ "so" -> lexIncludeFile args
+ _ -> resolveMacro macroName args pos
+
+lexTable :: PandocMonad m => SourcePos -> RoffLexer m RoffTokens
+lexTable pos = do
+ skipMany lexComment
+ spaces
+ opts <- option [] tableOptions
+ case lookup "tab" opts of
+ Just (c:_) -> modifyState $ \st -> st{ tableTabChar = c }
+ _ -> modifyState $ \st -> st{ tableTabChar = '\t' }
+ spaces
+ skipMany lexComment
+ spaces
+ rows <- lexTableRows
+ morerows <- many $ try $ do
+ string ".T&"
+ skipMany spacetab
+ newline
+ lexTableRows
+ string ".TE"
+ skipMany spacetab
+ eofline
+ return $ singleTok $ MTable opts (rows ++ concat morerows) pos
+
+lexTableRows :: PandocMonad m => RoffLexer m [TableRow]
+lexTableRows = do
+ aligns <- tableFormatSpec
+ spaces
+ skipMany lexComment
+ spaces
+ rows <- many (notFollowedBy (try (string ".TE") <|> try (string ".T&")) >>
+ tableRow)
+ return $ zip aligns rows
+
+tableCell :: PandocMonad m => RoffLexer m RoffTokens
+tableCell = (enclosedCell <|> simpleCell) >>= lexRoff . T.pack
+ where
+ enclosedCell = do
+ try (string "T{")
+ manyTill anyChar (try (string "T}"))
+ simpleCell = do
+ tabChar <- tableTabChar <$> getState
+ many (notFollowedBy (char tabChar <|> newline) >> anyChar)
+
+tableRow :: PandocMonad m => RoffLexer m [RoffTokens]
+tableRow = do
+ tabChar <- tableTabChar <$> getState
+ c <- tableCell
+ cs <- many $ try (char tabChar >> tableCell)
+ skipMany spacetab
+ eofline
+ skipMany lexComment
+ return (c:cs)
+
+tableOptions :: PandocMonad m => RoffLexer m [TableOption]
+tableOptions = try $ many1 tableOption <* spaces <* char ';'
+
+tableOption :: PandocMonad m => RoffLexer m TableOption
+tableOption = do
+ k <- many1 letter
+ v <- option "" $ do
+ char '('
+ manyTill anyChar (char ')')
+ skipMany spacetab
+ optional (char ',')
+ skipMany spacetab
+ return (k,v)
+
+tableFormatSpec :: PandocMonad m => RoffLexer m [[CellFormat]]
+tableFormatSpec = do
+ speclines <- tableFormatSpecLine `sepBy1` (newline <|> char ',')
+ skipMany spacetab
+ char '.'
+ return $ speclines ++ repeat (lastDef [] speclines) -- last line is default
+
+tableFormatSpecLine :: PandocMonad m => RoffLexer m [CellFormat]
+tableFormatSpecLine =
+ many1 $ try $ skipMany spacetab >> tableColFormat
+
+tableColFormat :: PandocMonad m => RoffLexer m CellFormat
+tableColFormat = do
+ pipePrefix' <- option False
+ $ True <$ (try $ string "|" <* notFollowedBy spacetab)
+ c <- oneOf ['a','A','c','C','l','L','n','N','r','R','s','S','^','_','-',
+ '=','|']
+ numsuffixes <- option [] $ (:[]) <$> many1 digit
+ suffixes <- many $ do
+ x <- oneOf ['b','B','d','D','e','E','f','F','i','I','m','M',
+ 'p','P','t','T','u','U','v','V','w','W','x','X', 'z','Z']
+ num <- if x == 'w'
+ then many1 digit <|>
+ do char '('
+ xs <- manyTill anyChar (char ')')
+ return ("(" ++ xs ++ ")")
+ else return ""
+ return $ x : num
+ pipeSuffix' <- option False $ True <$ string "|"
+ return $ CellFormat
+ { columnType = c
+ , pipePrefix = pipePrefix'
+ , pipeSuffix = pipeSuffix'
+ , columnSuffixes = numsuffixes ++ suffixes }
+
+-- We don't fully handle the conditional. But we do
+-- include everything under '.ie n', which occurs commonly
+-- in man pages. We always skip the '.el' part.
+lexConditional :: PandocMonad m => RoffLexer m RoffTokens
+lexConditional = do
+ skipMany spacetab
+ lexNCond <|> skipConditional
+
+-- n means nroff mode
+lexNCond :: PandocMonad m => RoffLexer m RoffTokens
+lexNCond = do
+ char '\n'
+ many1 spacetab
+ lexGroup <|> manToken
+
+lexGroup :: PandocMonad m => RoffLexer m RoffTokens
+lexGroup = do
+ groupstart
+ mconcat <$> manyTill manToken groupend
+ where
+ groupstart = try $ string "\\{\\" >> newline
+ groupend = try $ string "\\}" >> eofline
+
+skipConditional :: PandocMonad m => RoffLexer m RoffTokens
+skipConditional = do
+ rest <- anyLine
+ when ("\\{\\" `isSuffixOf` rest) $
+ void $ manyTill anyChar (try (string "\\}"))
+ return mempty
+
+lexIncludeFile :: PandocMonad m => [Arg] -> RoffLexer m RoffTokens
+lexIncludeFile args = do
+ pos <- getPosition
+ case args of
+ (f:_) -> do
+ let fp = linePartsToString f
+ dirs <- getResourcePath
+ result <- readFileFromDirs dirs fp
+ case result of
+ Nothing -> report $ CouldNotLoadIncludeFile fp pos
+ Just s -> getInput >>= setInput . (s ++)
+ return mempty
+ [] -> return mempty
+
+resolveMacro :: PandocMonad m
+ => String -> [Arg] -> SourcePos -> RoffLexer m RoffTokens
+resolveMacro macroName args pos = do
+ macros <- customMacros <$> getState
+ case M.lookup macroName macros of
+ Nothing -> return $ singleTok $ MMacro macroName args pos
+ Just ts -> do
+ let fillLP (MacroArg i) zs =
+ case drop (i - 1) args of
+ [] -> zs
+ (ys:_) -> ys ++ zs
+ fillLP z zs = z : zs
+ let fillMacroArg (MLine lineparts) =
+ MLine (foldr fillLP [] lineparts)
+ fillMacroArg x = x
+ return $ RoffTokens . fmap fillMacroArg . unRoffTokens $ ts
+
+lexStringDef :: PandocMonad m => [Arg] -> RoffLexer m RoffTokens
+lexStringDef args = do -- string definition
+ case args of
+ [] -> fail "No argument to .ds"
+ (x:ys) -> do
+ let ts = singleTok $ MLine (intercalate [RoffStr " " ] ys)
+ let stringName = linePartsToString x
+ modifyState $ \st ->
+ st{ customMacros = M.insert stringName ts (customMacros st) }
+ return mempty
+
+lexMacroDef :: PandocMonad m => [Arg] -> RoffLexer m RoffTokens
+lexMacroDef args = do -- macro definition
+ (macroName, stopMacro) <-
+ case args of
+ (x : y : _) -> return (linePartsToString x, linePartsToString y)
+ -- optional second arg
+ (x:_) -> return (linePartsToString x, ".")
+ [] -> fail "No argument to .de"
+ let stop = try $ do
+ char '.' <|> char '\''
+ skipMany spacetab
+ string stopMacro
+ _ <- lexArgs
+ return ()
+ ts <- mconcat <$> manyTill manToken stop
+ modifyState $ \st ->
+ st{ customMacros = M.insert macroName ts (customMacros st) }
+ return mempty
+
+lexArgs :: PandocMonad m => RoffLexer m [Arg]
+lexArgs = do
+ args <- many $ try oneArg
+ skipMany spacetab
+ eofline
+ return args
+
+ where
+
+ oneArg :: PandocMonad m => RoffLexer m [LinePart]
+ oneArg = do
+ skipMany $ try $ string "\\\n" -- continuation line
+ try quotedArg <|> plainArg
+ -- try, because there are some erroneous files, e.g. linux/bpf.2
+
+ plainArg :: PandocMonad m => RoffLexer m [LinePart]
+ plainArg = do
+ skipMany spacetab
+ mconcat <$> many1 (macroArg <|> escape <|> regularText <|> unescapedQuote)
+ where
+ unescapedQuote = char '"' >> return [RoffStr "\""]
+
+ quotedArg :: PandocMonad m => RoffLexer m [LinePart]
+ quotedArg = do
+ skipMany spacetab
+ char '"'
+ xs <- mconcat <$>
+ many (macroArg <|> escape <|> regularText
+ <|> spaceTabChar <|> escapedQuote)
+ char '"'
+ return xs
+ where
+ escapedQuote = try $ do
+ char '"'
+ char '"'
+ return [RoffStr "\""]
+
+escStar :: PandocMonad m => RoffLexer m [LinePart]
+escStar = try $ do
+ pos <- getPosition
+ c <- anyChar
+ case c of
+ '(' -> do
+ cs <- count 2 anyChar
+ resolveString cs pos
+ '[' -> do
+ cs <- many (noneOf "\t\n\r ]")
+ char ']'
+ resolveString cs pos
+ 'S' -> return mempty -- switch back to default font size
+ _ -> resolveString [c] pos
+
+ where
+
+ -- strings and macros share namespace
+ resolveString stringname pos = do
+ RoffTokens ts <- resolveMacro stringname [] pos
+ case Foldable.toList ts of
+ [MLine xs] -> return xs
+ _ -> do
+ report $ SkippedContent ("unknown string " ++ stringname) pos
+ return mempty
+
+lexLine :: PandocMonad m => RoffLexer m RoffTokens
+lexLine = do
+ lnparts <- mconcat <$> many1 linePart
+ eofline
+ go lnparts
+ where -- return empty line if we only have empty strings;
+ -- this can happen if the line just contains \f[C], for example.
+ go [] = return mempty
+ go (RoffStr "" : xs) = go xs
+ go xs = return $ singleTok $ MLine xs
+
+linePart :: PandocMonad m => RoffLexer m [LinePart]
+linePart = macroArg <|> escape <|>
+ regularText <|> quoteChar <|> spaceTabChar
+
+macroArg :: PandocMonad m => RoffLexer m [LinePart]
+macroArg = try $ do
+ string "\\\\$"
+ x <- digit
+ return [MacroArg $ ord x - ord '0']
+
+regularText :: PandocMonad m => RoffLexer m [LinePart]
+regularText = do
+ s <- many1 $ noneOf "\n\r\t \\\""
+ return [RoffStr s]
+
+quoteChar :: PandocMonad m => RoffLexer m [LinePart]
+quoteChar = do
+ char '"'
+ return [RoffStr "\""]
+
+spaceTabChar :: PandocMonad m => RoffLexer m [LinePart]
+spaceTabChar = do
+ c <- spacetab
+ return [RoffStr [c]]
+
+lexEmptyLine :: PandocMonad m => RoffLexer m RoffTokens
+lexEmptyLine = char '\n' >> return (singleTok MEmptyLine)
+
+manToken :: PandocMonad m => RoffLexer m RoffTokens
+manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine
+
+linePartsToString :: [LinePart] -> String
+linePartsToString = mconcat . map go
+ where
+ go (RoffStr s) = s
+ go _ = mempty
+
+-- | Tokenize a string as a sequence of groff tokens.
+lexRoff :: PandocMonad m => T.Text -> m RoffTokens
+lexRoff txt = do
+ eithertokens <- readWithM (mconcat <$> many manToken) def (T.unpack txt)
+ case eithertokens of
+ Left e -> throwError e
+ Right tokenz -> return tokenz