aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/Man.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/Man.hs')
-rw-r--r--src/Text/Pandoc/Readers/Man.hs471
1 files changed, 6 insertions, 465 deletions
diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs
index f3013be5f..50ec0c019 100644
--- a/src/Text/Pandoc/Readers/Man.hs
+++ b/src/Text/Pandoc/Readers/Man.hs
@@ -18,8 +18,6 @@ You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-
-}
{- |
@@ -36,76 +34,25 @@ Conversion of man to 'Pandoc' document.
module Text.Pandoc.Readers.Man (readMan) where
import Prelude
-import Control.Monad (liftM, void, mzero, guard, when)
-import Control.Monad.Except (throwError)
-import Text.Pandoc.Class
- (getResourcePath, readFileFromDirs, PandocMonad(..), report)
-import Data.Char (isHexDigit, chr, ord, isAscii, isAlphaNum, isSpace)
import Data.Default (Default)
+import Control.Monad (liftM, mzero, guard)
+import Control.Monad.Except (throwError)
+import Text.Pandoc.Class (PandocMonad(..), report)
import Data.Maybe (catMaybes)
-import qualified Data.Map as M
-import Data.Set (Set)
import qualified Data.Set as S
-import Data.List (intersperse, intercalate, isSuffixOf)
+import Data.List (intersperse, intercalate)
import qualified Data.Text as T
import Text.Pandoc.Builder as B
import Text.Pandoc.Error (PandocError (PandocParsecError))
import Text.Pandoc.Logging (LogMessage(..))
import Text.Pandoc.Options
import Text.Pandoc.Parsing
-import Text.Pandoc.Shared (crFilter, safeRead)
+import Text.Pandoc.Shared (crFilter)
+import Text.Pandoc.Readers.Groff -- TODO explicit imports
import Text.Parsec hiding (tokenPrim)
import qualified Text.Parsec as Parsec
import Text.Parsec.Pos (updatePosString)
-import Text.Pandoc.GroffChar (characterCodes, combiningAccents)
-import qualified Data.Sequence as Seq
import qualified Data.Foldable as Foldable
-import qualified Data.Text.Normalize as Normalize
-
--- import Debug.Trace (traceShowId)
-
---
--- Data Types
---
-data FontKind = Bold | Italic | Monospace | Regular deriving (Show, Eq, Ord)
-
-type MacroKind = String
-
-type Font = Set FontKind
-
-data LinePart = RoffStr (String, Font)
- | MacroArg Int
- deriving Show
-
-type Arg = [LinePart]
-
--- TODO parse tables (see man tbl)
-data ManToken = MLine [LinePart]
- | MEmptyLine
- | MMacro MacroKind [Arg] SourcePos
- | MTable [Alignment] ManTokens [ManTokens] [[ManTokens]]
- deriving Show
-
-newtype ManTokens = ManTokens { unManTokens :: Seq.Seq ManToken }
- deriving (Show, Semigroup, Monoid)
-
-singleTok :: ManToken -> ManTokens
-singleTok t = ManTokens (Seq.singleton t)
-
-data RoffState = RoffState { fontKind :: Font
- , customMacros :: M.Map String ManTokens
- } deriving Show
-
-instance Default RoffState where
- def = RoffState { customMacros = M.fromList
- $ map (\(n, s) ->
- (n, singleTok
- (MLine [RoffStr (s, mempty)])))
- [ ("Tm", "\x2122")
- , ("lq", "\x201C")
- , ("rq", "\x201D")
- , ("R", "\x00AE") ]
- , fontKind = S.singleton Regular }
data ManState = ManState { readerOptions :: ReaderOptions
, metadata :: Meta
@@ -115,7 +62,6 @@ instance Default ManState where
def = ManState { readerOptions = def
, metadata = nullMeta }
-type ManLexer m = ParserT [Char] RoffState m
type ManParser m = ParserT [ManToken] ManState m
@@ -147,405 +93,6 @@ readMan opts txt = do
mapLeft f (Left x) = Left $ f x
mapLeft _ (Right r) = Right r
---
--- Lexer: String -> ManToken
---
-
-eofline :: Stream s m Char => ParsecT s u m ()
-eofline = void newline <|> eof
-
-spacetab :: Stream s m Char => ParsecT s u m Char
-spacetab = char ' ' <|> char '\t'
-
-characterCodeMap :: M.Map String Char
-characterCodeMap =
- M.fromList $ map (\(x,y) -> (y,x)) characterCodes
-
-combiningAccentsMap :: M.Map String Char
-combiningAccentsMap =
- M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
-
-escapeLexer :: PandocMonad m => ManLexer m String
-escapeLexer = try $ do
- char '\\'
- c <- noneOf ['*','$'] -- see escStar, macroArg
- case c of
- '(' -> twoCharGlyph
- '[' -> bracketedGlyph
- 'f' -> escFont
- 's' -> escFontSize
- '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
- '#' -> mempty <$ manyTill anyChar newline
- '%' -> return mempty
- '{' -> return mempty
- '}' -> return mempty
- '&' -> return mempty
- '\n' -> return mempty
- ':' -> return mempty
- '0' -> return mempty
- 'c' -> return mempty
- '-' -> return "-"
- '_' -> return "_"
- ' ' -> return " "
- '\\' -> return "\\"
- 't' -> return "\t"
- 'e' -> return "\\"
- '`' -> return "`"
- '^' -> return " "
- '|' -> return " "
- '\'' -> return "`"
- '.' -> return "`"
- '~' -> return "\160" -- nonbreaking space
- _ -> escUnknown ['\\',c] "\xFFFD"
-
- where
-
- twoCharGlyph = do
- cs <- count 2 anyChar
- case M.lookup cs characterCodeMap of
- Just c -> return [c]
- Nothing -> escUnknown ('\\':'(':cs) "\xFFFD"
-
- bracketedGlyph = unicodeGlyph <|> charGlyph
-
- charGlyph = do
- cs <- manyTill (noneOf ['[',']','\n']) (char ']')
- (case words cs of
- [] -> mzero
- [s] -> case M.lookup s characterCodeMap of
- Nothing -> mzero
- Just c -> return [c]
- (s:ss) -> do
- basechar <- case M.lookup cs characterCodeMap of
- Nothing ->
- case s of
- [ch] | isAscii ch && isAlphaNum ch ->
- return ch
- _ -> mzero
- Just c -> return c
- let addAccents [] xs = return $ T.unpack .
- Normalize.normalize Normalize.NFC .
- T.pack $ reverse xs
- addAccents (a:as) xs =
- case M.lookup a combiningAccentsMap of
- Just x -> addAccents as (x:xs)
- Nothing -> mzero
- addAccents ss [basechar])
- <|> escUnknown ("\\[" ++ cs ++ "]") "\xFFFD"
-
- unicodeGlyph = try $ ucharCode `sepBy1` (char '_') <* char ']'
-
- ucharCode = try $ do
- char 'u'
- cs <- many1 (satisfy isHexDigit)
- let lcs = length cs
- guard $ lcs >= 4 && lcs <= 6
- case chr <$> safeRead ('0':'x':cs) of
- Nothing -> mzero
- Just c -> return c
-
- -- \s-1 \s0 -- we ignore these
- escFontSize :: PandocMonad m => ManLexer m String
- escFontSize = do
- pos <- getPosition
- pm <- option "" $ count 1 (oneOf "+-")
- ds <- many1 digit
- report $ SkippedContent ("\\s" ++ pm ++ ds) pos
- return mempty
-
- escFont :: PandocMonad m => ManLexer m String
- escFont = do
- font <- choice
- [ S.singleton <$> letterFontKind
- , char '(' >> anyChar >> anyChar >> return (S.singleton Regular)
- , char 'S' >> return (S.singleton Regular)
- , try lettersFont
- , digit >> return (S.singleton Regular)
- ]
- modifyState (\r -> r {fontKind = font})
- return mempty
-
- lettersFont :: PandocMonad m => ManLexer m Font
- lettersFont = do
- char '['
- fs <- many letterFontKind
- skipMany letter
- char ']'
- return $ S.fromList fs
-
- letterFontKind :: PandocMonad m => ManLexer m FontKind
- letterFontKind = choice [
- oneOf ['B','b'] >> return Bold
- , oneOf ['I','i'] >> return Italic
- , oneOf ['C','c'] >> return Monospace
- , oneOf ['P','p','R','r'] >> return Regular
- ]
-
- escUnknown :: PandocMonad m => String -> a -> ManLexer m a
- escUnknown s x = do
- pos <- getPosition
- report $ SkippedContent ("Unknown escape sequence " ++ s) pos
- return x
-
-currentFont :: PandocMonad m => ManLexer m Font
-currentFont = fontKind <$> getState
-
--- separate function from lexMacro since real man files sometimes do not follow the rules
-lexComment :: PandocMonad m => ManLexer m ManTokens
-lexComment = do
- try $ string ".\\\""
- many Parsec.space
- skipMany $ noneOf "\n"
- char '\n'
- return mempty
-
-lexMacro :: PandocMonad m => ManLexer m ManTokens
-lexMacro = do
- pos <- getPosition
- char '.' <|> char '\''
- many spacetab
- macroName <- many (satisfy (not . isSpace))
- case macroName of
- "nop" -> return mempty
- "ie" -> lexConditional
- "if" -> lexConditional
- "el" -> skipConditional
- "TS" -> lexTable
-
- _ -> do
- args <- lexArgs
- case macroName of
- "" -> return mempty
- "\\\"" -> return mempty
- "\\#" -> return mempty
- "de" -> lexMacroDef args
- "de1" -> lexMacroDef args
- "ds" -> lexStringDef args
- "ds1" -> lexStringDef args
- "sp" -> return $ singleTok MEmptyLine
- "so" -> lexIncludeFile args
- _ -> resolveMacro macroName args pos
-
--- | TODO placeholder
-lexTable :: PandocMonad m => ManLexer m ManTokens
-lexTable = do
- pos <- getPosition
- manyTill anyLine (try (string ".TE" >> many spacetab >> eofline))
- report $ SkippedContent "table" pos
- return mempty
-
--- We don't fully handle the conditional. But we do
--- include everything under '.ie n', which occurs commonly
--- in man pages. We always skip the '.el' part.
-lexConditional :: PandocMonad m => ManLexer m ManTokens
-lexConditional = do
- skipMany spacetab
- parseNCond <|> skipConditional
-
--- n means nroff mode
-parseNCond :: PandocMonad m => ManLexer m ManTokens
-parseNCond = do
- char '\n'
- many1 spacetab
- lexGroup <|> manToken
-
-lexGroup :: PandocMonad m => ManLexer m ManTokens
-lexGroup = do
- groupstart
- mconcat <$> manyTill manToken groupend
- where
- groupstart = try $ string "\\{\\" >> newline
- groupend = try $ string "\\}" >> eofline
-
-skipConditional :: PandocMonad m => ManLexer m ManTokens
-skipConditional = do
- rest <- anyLine
- when ("\\{\\" `isSuffixOf` rest) $
- void $ manyTill anyChar (try (string "\\}"))
- return mempty
-
-lexIncludeFile :: PandocMonad m => [Arg] -> ManLexer m ManTokens
-lexIncludeFile args = do
- pos <- getPosition
- case args of
- (f:_) -> do
- let fp = linePartsToString f
- dirs <- getResourcePath
- result <- readFileFromDirs dirs fp
- case result of
- Nothing -> report $ CouldNotLoadIncludeFile fp pos
- Just s -> getInput >>= setInput . (s ++)
- return mempty
- [] -> return mempty
-
-resolveMacro :: PandocMonad m
- => String -> [Arg] -> SourcePos -> ManLexer m ManTokens
-resolveMacro macroName args pos = do
- macros <- customMacros <$> getState
- case M.lookup macroName macros of
- Nothing -> return $ singleTok $ MMacro macroName args pos
- Just ts -> do
- let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs
- fillLP (MacroArg i) zs =
- case drop (i - 1) args of
- [] -> zs
- (ys:_) -> ys ++ zs
- let fillMacroArg (MLine lineparts) =
- MLine (foldr fillLP [] lineparts)
- fillMacroArg x = x
- return $ ManTokens . fmap fillMacroArg . unManTokens $ ts
-
-lexStringDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens
-lexStringDef args = do -- string definition
- case args of
- [] -> fail "No argument to .ds"
- (x:ys) -> do
- let ts = singleTok $ MLine (intercalate [RoffStr (" ", mempty)] ys)
- let stringName = linePartsToString x
- modifyState $ \st ->
- st{ customMacros = M.insert stringName ts (customMacros st) }
- return mempty
-
-lexMacroDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens
-lexMacroDef args = do -- macro definition
- (macroName, stopMacro) <-
- case args of
- (x : y : _) -> return (linePartsToString x, linePartsToString y)
- -- optional second arg
- (x:_) -> return (linePartsToString x, ".")
- [] -> fail "No argument to .de"
- let stop = try $ do
- char '.' <|> char '\''
- many spacetab
- string stopMacro
- _ <- lexArgs
- return ()
- ts <- mconcat <$> manyTill manToken stop
- modifyState $ \st ->
- st{ customMacros = M.insert macroName ts (customMacros st) }
- return mempty
-
-lexArgs :: PandocMonad m => ManLexer m [Arg]
-lexArgs = do
- args <- many $ try oneArg
- skipMany spacetab
- eofline
- return args
-
- where
-
- oneArg :: PandocMonad m => ManLexer m [LinePart]
- oneArg = do
- skipMany $ try $ string "\\\n" -- continuation line
- try quotedArg <|> plainArg
- -- try, because there are some erroneous files, e.g. linux/bpf.2
-
- plainArg :: PandocMonad m => ManLexer m [LinePart]
- plainArg = do
- skipMany spacetab
- mconcat <$> many1
- (macroArg <|> esc <|> regularText <|> unescapedQuote <|> escStar)
- where
- unescapedQuote = do
- char '"'
- fonts <- currentFont
- return [RoffStr ("\"", fonts)]
-
-
- quotedArg :: PandocMonad m => ManLexer m [LinePart]
- quotedArg = do
- skipMany spacetab
- char '"'
- xs <- mconcat <$>
- many (macroArg <|> esc <|> escStar <|> regularText
- <|> spaceTabChar <|> escapedQuote)
- char '"'
- return xs
- where
- escapedQuote = try $ do
- char '"'
- char '"'
- fonts <- currentFont
- return [RoffStr ("\"", fonts)]
-
-escStar :: PandocMonad m => ManLexer m [LinePart]
-escStar = try $ do
- pos <- getPosition
- char '\\'
- char '*'
- c <- anyChar
- case c of
- '(' -> do
- cs <- count 2 anyChar
- resolveString cs pos
- '[' -> do
- cs <- many (noneOf "\t\n\r ]")
- char ']'
- resolveString cs pos
- 'S' -> return mempty -- switch back to default font size
- _ -> resolveString [c] pos
-
- where
-
- -- strings and macros share namespace
- resolveString stringname pos = do
- ManTokens ts <- resolveMacro stringname [] pos
- case Foldable.toList ts of
- [MLine xs] -> return xs
- _ -> do
- report $ SkippedContent ("unknown string " ++ stringname) pos
- return mempty
-
-lexLine :: PandocMonad m => ManLexer m ManTokens
-lexLine = do
- lnparts <- mconcat <$> many1 linePart
- eofline
- go lnparts
- where -- return empty line if we only have empty strings;
- -- this can happen if the line just contains \f[C], for example.
- go [] = return mempty
- go (RoffStr ("",_):xs) = go xs
- go xs = return $ singleTok $ MLine xs
-
-linePart :: PandocMonad m => ManLexer m [LinePart]
-linePart = macroArg <|> esc <|> escStar <|>
- regularText <|> quoteChar <|> spaceTabChar
-
-macroArg :: PandocMonad m => ManLexer m [LinePart]
-macroArg = try $ do
- string "\\\\$"
- x <- digit
- return [MacroArg $ ord x - ord '0']
-
-esc :: PandocMonad m => ManLexer m [LinePart]
-esc = do
- s <- escapeLexer
- font <- currentFont
- return [RoffStr (s, font)]
-
-regularText :: PandocMonad m => ManLexer m [LinePart]
-regularText = do
- s <- many1 $ noneOf "\n\r\t \\\""
- font <- currentFont
- return [RoffStr (s, font)]
-
-quoteChar :: PandocMonad m => ManLexer m [LinePart]
-quoteChar = do
- char '"'
- font <- currentFont
- return [RoffStr ("\"", font)]
-
-spaceTabChar :: PandocMonad m => ManLexer m [LinePart]
-spaceTabChar = do
- c <- spacetab
- font <- currentFont
- return [RoffStr ([c], font)]
-
-lexEmptyLine :: PandocMonad m => ManLexer m ManTokens
-lexEmptyLine = char '\n' >> return (singleTok MEmptyLine)
-
-manToken :: PandocMonad m => ManLexer m ManTokens
-manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine
parseMan :: PandocMonad m => ManParser m Pandoc
parseMan = do
@@ -656,12 +203,6 @@ linePartsToInlines = go
isItalic (RoffStr (_,f)) = Italic `S.member` f
isItalic _ = False
-linePartsToString :: [LinePart] -> String
-linePartsToString = mconcat . map go
- where
- go (RoffStr (s, _)) = s
- go _ = mempty
-
parsePara :: PandocMonad m => ManParser m Blocks
parsePara = para . trimInlines <$> parseInlines