aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2018-10-24 17:38:08 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2018-10-24 17:38:08 -0700
commit6c71100fcf0abc609dda323a76c78b0838234044 (patch)
tree74ba52ca6f3a0fc79200b4a8c4f69a6c30756609 /src/Text/Pandoc/Readers
parent123381765c195d361d44b636a7d01cf8688066c0 (diff)
downloadpandoc-6c71100fcf0abc609dda323a76c78b0838234044.tar.gz
Added Text.Pandoc.Readers.Groff.
This is an internal module that exports a tokenizer for groff formats. Closes #4998.
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/Groff.hs524
-rw-r--r--src/Text/Pandoc/Readers/Man.hs471
2 files changed, 530 insertions, 465 deletions
diff --git a/src/Text/Pandoc/Readers/Groff.hs b/src/Text/Pandoc/Readers/Groff.hs
new file mode 100644
index 000000000..90ae8561a
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Groff.hs
@@ -0,0 +1,524 @@
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-
+ Copyright (C) 2018 Yan Pashkovsky <yanp.bugz@gmail.com>
+ and John MacFarlane
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Groff
+ Copyright : Copyright (C) 2018 Yan Pashkovsky and John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Yan Pashkovsky <yanp.bugz@gmail.com>
+ Stability : WIP
+ Portability : portable
+
+Tokenizer for groff formats (man, ms).
+-}
+module Text.Pandoc.Readers.Groff
+ ( FontKind(..)
+ , Font
+ , MacroKind
+ , LinePart(..)
+ , Arg
+ , ManToken(..)
+ , ManTokens(..)
+ , singleTok
+ , RoffState(..)
+ , ManLexer
+ , manToken
+ , linePartsToString
+ )
+where
+
+import Prelude
+import Control.Monad (void, mzero, guard, when)
+import Text.Pandoc.Class
+ (getResourcePath, readFileFromDirs, PandocMonad(..), report)
+import Data.Char (isHexDigit, chr, ord, isAscii, isAlphaNum, isSpace)
+import Data.Default (Default)
+import qualified Data.Map as M
+import Data.Set (Set)
+import qualified Data.Set as S
+import Data.List (intercalate, isSuffixOf)
+import qualified Data.Text as T
+import Text.Pandoc.Builder as B
+import Text.Pandoc.Logging (LogMessage(..))
+import Text.Pandoc.Options
+import Text.Pandoc.Parsing
+import Text.Pandoc.Shared (safeRead)
+import Text.Parsec hiding (tokenPrim)
+import qualified Text.Parsec as Parsec
+import Text.Pandoc.GroffChar (characterCodes, combiningAccents)
+import qualified Data.Sequence as Seq
+import qualified Data.Foldable as Foldable
+import qualified Data.Text.Normalize as Normalize
+
+-- import Debug.Trace (traceShowId)
+
+--
+-- Data Types
+--
+data FontKind = Bold | Italic | Monospace | Regular deriving (Show, Eq, Ord)
+
+type MacroKind = String
+
+type Font = Set FontKind
+
+data LinePart = RoffStr (String, Font)
+ | MacroArg Int
+ deriving Show
+
+type Arg = [LinePart]
+
+-- TODO parse tables (see man tbl)
+data ManToken = MLine [LinePart]
+ | MEmptyLine
+ | MMacro MacroKind [Arg] SourcePos
+ | MTable [Alignment] ManTokens [ManTokens] [[ManTokens]]
+ deriving Show
+
+newtype ManTokens = ManTokens { unManTokens :: Seq.Seq ManToken }
+ deriving (Show, Semigroup, Monoid)
+
+singleTok :: ManToken -> ManTokens
+singleTok t = ManTokens (Seq.singleton t)
+
+data RoffState = RoffState { fontKind :: Font
+ , customMacros :: M.Map String ManTokens
+ } deriving Show
+
+instance Default RoffState where
+ def = RoffState { customMacros = M.fromList
+ $ map (\(n, s) ->
+ (n, singleTok
+ (MLine [RoffStr (s, mempty)])))
+ [ ("Tm", "\x2122")
+ , ("lq", "\x201C")
+ , ("rq", "\x201D")
+ , ("R", "\x00AE") ]
+ , fontKind = S.singleton Regular }
+
+type ManLexer m = ParserT [Char] RoffState m
+
+--
+-- Lexer: String -> ManToken
+--
+
+eofline :: Stream s m Char => ParsecT s u m ()
+eofline = void newline <|> eof
+
+spacetab :: Stream s m Char => ParsecT s u m Char
+spacetab = char ' ' <|> char '\t'
+
+characterCodeMap :: M.Map String Char
+characterCodeMap =
+ M.fromList $ map (\(x,y) -> (y,x)) characterCodes
+
+combiningAccentsMap :: M.Map String Char
+combiningAccentsMap =
+ M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
+
+escapeLexer :: PandocMonad m => ManLexer m String
+escapeLexer = try $ do
+ char '\\'
+ c <- noneOf ['*','$'] -- see escStar, macroArg
+ case c of
+ '(' -> twoCharGlyph
+ '[' -> bracketedGlyph
+ 'f' -> escFont
+ 's' -> escFontSize
+ '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
+ '#' -> mempty <$ manyTill anyChar newline
+ '%' -> return mempty
+ '{' -> return mempty
+ '}' -> return mempty
+ '&' -> return mempty
+ '\n' -> return mempty
+ ':' -> return mempty
+ '0' -> return mempty
+ 'c' -> return mempty
+ '-' -> return "-"
+ '_' -> return "_"
+ ' ' -> return " "
+ '\\' -> return "\\"
+ 't' -> return "\t"
+ 'e' -> return "\\"
+ '`' -> return "`"
+ '^' -> return " "
+ '|' -> return " "
+ '\'' -> return "`"
+ '.' -> return "`"
+ '~' -> return "\160" -- nonbreaking space
+ _ -> escUnknown ['\\',c] "\xFFFD"
+
+ where
+
+ twoCharGlyph = do
+ cs <- count 2 anyChar
+ case M.lookup cs characterCodeMap of
+ Just c -> return [c]
+ Nothing -> escUnknown ('\\':'(':cs) "\xFFFD"
+
+ bracketedGlyph = unicodeGlyph <|> charGlyph
+
+ charGlyph = do
+ cs <- manyTill (noneOf ['[',']','\n']) (char ']')
+ (case words cs of
+ [] -> mzero
+ [s] -> case M.lookup s characterCodeMap of
+ Nothing -> mzero
+ Just c -> return [c]
+ (s:ss) -> do
+ basechar <- case M.lookup cs characterCodeMap of
+ Nothing ->
+ case s of
+ [ch] | isAscii ch && isAlphaNum ch ->
+ return ch
+ _ -> mzero
+ Just c -> return c
+ let addAccents [] xs = return $ T.unpack .
+ Normalize.normalize Normalize.NFC .
+ T.pack $ reverse xs
+ addAccents (a:as) xs =
+ case M.lookup a combiningAccentsMap of
+ Just x -> addAccents as (x:xs)
+ Nothing -> mzero
+ addAccents ss [basechar])
+ <|> escUnknown ("\\[" ++ cs ++ "]") "\xFFFD"
+
+ unicodeGlyph = try $ ucharCode `sepBy1` (char '_') <* char ']'
+
+ ucharCode = try $ do
+ char 'u'
+ cs <- many1 (satisfy isHexDigit)
+ let lcs = length cs
+ guard $ lcs >= 4 && lcs <= 6
+ case chr <$> safeRead ('0':'x':cs) of
+ Nothing -> mzero
+ Just c -> return c
+
+ -- \s-1 \s0 -- we ignore these
+ escFontSize :: PandocMonad m => ManLexer m String
+ escFontSize = do
+ pos <- getPosition
+ pm <- option "" $ count 1 (oneOf "+-")
+ ds <- many1 digit
+ report $ SkippedContent ("\\s" ++ pm ++ ds) pos
+ return mempty
+
+ escFont :: PandocMonad m => ManLexer m String
+ escFont = do
+ font <- choice
+ [ S.singleton <$> letterFontKind
+ , char '(' >> anyChar >> anyChar >> return (S.singleton Regular)
+ , char 'S' >> return (S.singleton Regular)
+ , try lettersFont
+ , digit >> return (S.singleton Regular)
+ ]
+ modifyState (\r -> r {fontKind = font})
+ return mempty
+
+ lettersFont :: PandocMonad m => ManLexer m Font
+ lettersFont = do
+ char '['
+ fs <- many letterFontKind
+ skipMany letter
+ char ']'
+ return $ S.fromList fs
+
+ letterFontKind :: PandocMonad m => ManLexer m FontKind
+ letterFontKind = choice [
+ oneOf ['B','b'] >> return Bold
+ , oneOf ['I','i'] >> return Italic
+ , oneOf ['C','c'] >> return Monospace
+ , oneOf ['P','p','R','r'] >> return Regular
+ ]
+
+ escUnknown :: PandocMonad m => String -> a -> ManLexer m a
+ escUnknown s x = do
+ pos <- getPosition
+ report $ SkippedContent ("Unknown escape sequence " ++ s) pos
+ return x
+
+currentFont :: PandocMonad m => ManLexer m Font
+currentFont = fontKind <$> getState
+
+-- separate function from lexMacro since real man files sometimes do not follow the rules
+lexComment :: PandocMonad m => ManLexer m ManTokens
+lexComment = do
+ try $ string ".\\\""
+ many Parsec.space
+ skipMany $ noneOf "\n"
+ char '\n'
+ return mempty
+
+lexMacro :: PandocMonad m => ManLexer m ManTokens
+lexMacro = do
+ pos <- getPosition
+ char '.' <|> char '\''
+ many spacetab
+ macroName <- many (satisfy (not . isSpace))
+ case macroName of
+ "nop" -> return mempty
+ "ie" -> lexConditional
+ "if" -> lexConditional
+ "el" -> skipConditional
+ "TS" -> lexTable
+
+ _ -> do
+ args <- lexArgs
+ case macroName of
+ "" -> return mempty
+ "\\\"" -> return mempty
+ "\\#" -> return mempty
+ "de" -> lexMacroDef args
+ "de1" -> lexMacroDef args
+ "ds" -> lexStringDef args
+ "ds1" -> lexStringDef args
+ "sp" -> return $ singleTok MEmptyLine
+ "so" -> lexIncludeFile args
+ _ -> resolveMacro macroName args pos
+
+-- | TODO placeholder
+lexTable :: PandocMonad m => ManLexer m ManTokens
+lexTable = do
+ pos <- getPosition
+ manyTill anyLine (try (string ".TE" >> many spacetab >> eofline))
+ report $ SkippedContent "table" pos
+ return mempty
+
+-- We don't fully handle the conditional. But we do
+-- include everything under '.ie n', which occurs commonly
+-- in man pages. We always skip the '.el' part.
+lexConditional :: PandocMonad m => ManLexer m ManTokens
+lexConditional = do
+ skipMany spacetab
+ parseNCond <|> skipConditional
+
+-- n means nroff mode
+parseNCond :: PandocMonad m => ManLexer m ManTokens
+parseNCond = do
+ char '\n'
+ many1 spacetab
+ lexGroup <|> manToken
+
+lexGroup :: PandocMonad m => ManLexer m ManTokens
+lexGroup = do
+ groupstart
+ mconcat <$> manyTill manToken groupend
+ where
+ groupstart = try $ string "\\{\\" >> newline
+ groupend = try $ string "\\}" >> eofline
+
+skipConditional :: PandocMonad m => ManLexer m ManTokens
+skipConditional = do
+ rest <- anyLine
+ when ("\\{\\" `isSuffixOf` rest) $
+ void $ manyTill anyChar (try (string "\\}"))
+ return mempty
+
+lexIncludeFile :: PandocMonad m => [Arg] -> ManLexer m ManTokens
+lexIncludeFile args = do
+ pos <- getPosition
+ case args of
+ (f:_) -> do
+ let fp = linePartsToString f
+ dirs <- getResourcePath
+ result <- readFileFromDirs dirs fp
+ case result of
+ Nothing -> report $ CouldNotLoadIncludeFile fp pos
+ Just s -> getInput >>= setInput . (s ++)
+ return mempty
+ [] -> return mempty
+
+resolveMacro :: PandocMonad m
+ => String -> [Arg] -> SourcePos -> ManLexer m ManTokens
+resolveMacro macroName args pos = do
+ macros <- customMacros <$> getState
+ case M.lookup macroName macros of
+ Nothing -> return $ singleTok $ MMacro macroName args pos
+ Just ts -> do
+ let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs
+ fillLP (MacroArg i) zs =
+ case drop (i - 1) args of
+ [] -> zs
+ (ys:_) -> ys ++ zs
+ let fillMacroArg (MLine lineparts) =
+ MLine (foldr fillLP [] lineparts)
+ fillMacroArg x = x
+ return $ ManTokens . fmap fillMacroArg . unManTokens $ ts
+
+lexStringDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens
+lexStringDef args = do -- string definition
+ case args of
+ [] -> fail "No argument to .ds"
+ (x:ys) -> do
+ let ts = singleTok $ MLine (intercalate [RoffStr (" ", mempty)] ys)
+ let stringName = linePartsToString x
+ modifyState $ \st ->
+ st{ customMacros = M.insert stringName ts (customMacros st) }
+ return mempty
+
+lexMacroDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens
+lexMacroDef args = do -- macro definition
+ (macroName, stopMacro) <-
+ case args of
+ (x : y : _) -> return (linePartsToString x, linePartsToString y)
+ -- optional second arg
+ (x:_) -> return (linePartsToString x, ".")
+ [] -> fail "No argument to .de"
+ let stop = try $ do
+ char '.' <|> char '\''
+ many spacetab
+ string stopMacro
+ _ <- lexArgs
+ return ()
+ ts <- mconcat <$> manyTill manToken stop
+ modifyState $ \st ->
+ st{ customMacros = M.insert macroName ts (customMacros st) }
+ return mempty
+
+lexArgs :: PandocMonad m => ManLexer m [Arg]
+lexArgs = do
+ args <- many $ try oneArg
+ skipMany spacetab
+ eofline
+ return args
+
+ where
+
+ oneArg :: PandocMonad m => ManLexer m [LinePart]
+ oneArg = do
+ skipMany $ try $ string "\\\n" -- continuation line
+ try quotedArg <|> plainArg
+ -- try, because there are some erroneous files, e.g. linux/bpf.2
+
+ plainArg :: PandocMonad m => ManLexer m [LinePart]
+ plainArg = do
+ skipMany spacetab
+ mconcat <$> many1
+ (macroArg <|> esc <|> regularText <|> unescapedQuote <|> escStar)
+ where
+ unescapedQuote = do
+ char '"'
+ fonts <- currentFont
+ return [RoffStr ("\"", fonts)]
+
+
+ quotedArg :: PandocMonad m => ManLexer m [LinePart]
+ quotedArg = do
+ skipMany spacetab
+ char '"'
+ xs <- mconcat <$>
+ many (macroArg <|> esc <|> escStar <|> regularText
+ <|> spaceTabChar <|> escapedQuote)
+ char '"'
+ return xs
+ where
+ escapedQuote = try $ do
+ char '"'
+ char '"'
+ fonts <- currentFont
+ return [RoffStr ("\"", fonts)]
+
+escStar :: PandocMonad m => ManLexer m [LinePart]
+escStar = try $ do
+ pos <- getPosition
+ char '\\'
+ char '*'
+ c <- anyChar
+ case c of
+ '(' -> do
+ cs <- count 2 anyChar
+ resolveString cs pos
+ '[' -> do
+ cs <- many (noneOf "\t\n\r ]")
+ char ']'
+ resolveString cs pos
+ 'S' -> return mempty -- switch back to default font size
+ _ -> resolveString [c] pos
+
+ where
+
+ -- strings and macros share namespace
+ resolveString stringname pos = do
+ ManTokens ts <- resolveMacro stringname [] pos
+ case Foldable.toList ts of
+ [MLine xs] -> return xs
+ _ -> do
+ report $ SkippedContent ("unknown string " ++ stringname) pos
+ return mempty
+
+lexLine :: PandocMonad m => ManLexer m ManTokens
+lexLine = do
+ lnparts <- mconcat <$> many1 linePart
+ eofline
+ go lnparts
+ where -- return empty line if we only have empty strings;
+ -- this can happen if the line just contains \f[C], for example.
+ go [] = return mempty
+ go (RoffStr ("",_):xs) = go xs
+ go xs = return $ singleTok $ MLine xs
+
+linePart :: PandocMonad m => ManLexer m [LinePart]
+linePart = macroArg <|> esc <|> escStar <|>
+ regularText <|> quoteChar <|> spaceTabChar
+
+macroArg :: PandocMonad m => ManLexer m [LinePart]
+macroArg = try $ do
+ string "\\\\$"
+ x <- digit
+ return [MacroArg $ ord x - ord '0']
+
+esc :: PandocMonad m => ManLexer m [LinePart]
+esc = do
+ s <- escapeLexer
+ font <- currentFont
+ return [RoffStr (s, font)]
+
+regularText :: PandocMonad m => ManLexer m [LinePart]
+regularText = do
+ s <- many1 $ noneOf "\n\r\t \\\""
+ font <- currentFont
+ return [RoffStr (s, font)]
+
+quoteChar :: PandocMonad m => ManLexer m [LinePart]
+quoteChar = do
+ char '"'
+ font <- currentFont
+ return [RoffStr ("\"", font)]
+
+spaceTabChar :: PandocMonad m => ManLexer m [LinePart]
+spaceTabChar = do
+ c <- spacetab
+ font <- currentFont
+ return [RoffStr ([c], font)]
+
+lexEmptyLine :: PandocMonad m => ManLexer m ManTokens
+lexEmptyLine = char '\n' >> return (singleTok MEmptyLine)
+
+manToken :: PandocMonad m => ManLexer m ManTokens
+manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine
+
+linePartsToString :: [LinePart] -> String
+linePartsToString = mconcat . map go
+ where
+ go (RoffStr (s, _)) = s
+ go _ = mempty
diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs
index f3013be5f..50ec0c019 100644
--- a/src/Text/Pandoc/Readers/Man.hs
+++ b/src/Text/Pandoc/Readers/Man.hs
@@ -18,8 +18,6 @@ You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-
-}
{- |
@@ -36,76 +34,25 @@ Conversion of man to 'Pandoc' document.
module Text.Pandoc.Readers.Man (readMan) where
import Prelude
-import Control.Monad (liftM, void, mzero, guard, when)
-import Control.Monad.Except (throwError)
-import Text.Pandoc.Class
- (getResourcePath, readFileFromDirs, PandocMonad(..), report)
-import Data.Char (isHexDigit, chr, ord, isAscii, isAlphaNum, isSpace)
import Data.Default (Default)
+import Control.Monad (liftM, mzero, guard)
+import Control.Monad.Except (throwError)
+import Text.Pandoc.Class (PandocMonad(..), report)
import Data.Maybe (catMaybes)
-import qualified Data.Map as M
-import Data.Set (Set)
import qualified Data.Set as S
-import Data.List (intersperse, intercalate, isSuffixOf)
+import Data.List (intersperse, intercalate)
import qualified Data.Text as T
import Text.Pandoc.Builder as B
import Text.Pandoc.Error (PandocError (PandocParsecError))
import Text.Pandoc.Logging (LogMessage(..))
import Text.Pandoc.Options
import Text.Pandoc.Parsing
-import Text.Pandoc.Shared (crFilter, safeRead)
+import Text.Pandoc.Shared (crFilter)
+import Text.Pandoc.Readers.Groff -- TODO explicit imports
import Text.Parsec hiding (tokenPrim)
import qualified Text.Parsec as Parsec
import Text.Parsec.Pos (updatePosString)
-import Text.Pandoc.GroffChar (characterCodes, combiningAccents)
-import qualified Data.Sequence as Seq
import qualified Data.Foldable as Foldable
-import qualified Data.Text.Normalize as Normalize
-
--- import Debug.Trace (traceShowId)
-
---
--- Data Types
---
-data FontKind = Bold | Italic | Monospace | Regular deriving (Show, Eq, Ord)
-
-type MacroKind = String
-
-type Font = Set FontKind
-
-data LinePart = RoffStr (String, Font)
- | MacroArg Int
- deriving Show
-
-type Arg = [LinePart]
-
--- TODO parse tables (see man tbl)
-data ManToken = MLine [LinePart]
- | MEmptyLine
- | MMacro MacroKind [Arg] SourcePos
- | MTable [Alignment] ManTokens [ManTokens] [[ManTokens]]
- deriving Show
-
-newtype ManTokens = ManTokens { unManTokens :: Seq.Seq ManToken }
- deriving (Show, Semigroup, Monoid)
-
-singleTok :: ManToken -> ManTokens
-singleTok t = ManTokens (Seq.singleton t)
-
-data RoffState = RoffState { fontKind :: Font
- , customMacros :: M.Map String ManTokens
- } deriving Show
-
-instance Default RoffState where
- def = RoffState { customMacros = M.fromList
- $ map (\(n, s) ->
- (n, singleTok
- (MLine [RoffStr (s, mempty)])))
- [ ("Tm", "\x2122")
- , ("lq", "\x201C")
- , ("rq", "\x201D")
- , ("R", "\x00AE") ]
- , fontKind = S.singleton Regular }
data ManState = ManState { readerOptions :: ReaderOptions
, metadata :: Meta
@@ -115,7 +62,6 @@ instance Default ManState where
def = ManState { readerOptions = def
, metadata = nullMeta }
-type ManLexer m = ParserT [Char] RoffState m
type ManParser m = ParserT [ManToken] ManState m
@@ -147,405 +93,6 @@ readMan opts txt = do
mapLeft f (Left x) = Left $ f x
mapLeft _ (Right r) = Right r
---
--- Lexer: String -> ManToken
---
-
-eofline :: Stream s m Char => ParsecT s u m ()
-eofline = void newline <|> eof
-
-spacetab :: Stream s m Char => ParsecT s u m Char
-spacetab = char ' ' <|> char '\t'
-
-characterCodeMap :: M.Map String Char
-characterCodeMap =
- M.fromList $ map (\(x,y) -> (y,x)) characterCodes
-
-combiningAccentsMap :: M.Map String Char
-combiningAccentsMap =
- M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
-
-escapeLexer :: PandocMonad m => ManLexer m String
-escapeLexer = try $ do
- char '\\'
- c <- noneOf ['*','$'] -- see escStar, macroArg
- case c of
- '(' -> twoCharGlyph
- '[' -> bracketedGlyph
- 'f' -> escFont
- 's' -> escFontSize
- '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
- '#' -> mempty <$ manyTill anyChar newline
- '%' -> return mempty
- '{' -> return mempty
- '}' -> return mempty
- '&' -> return mempty
- '\n' -> return mempty
- ':' -> return mempty
- '0' -> return mempty
- 'c' -> return mempty
- '-' -> return "-"
- '_' -> return "_"
- ' ' -> return " "
- '\\' -> return "\\"
- 't' -> return "\t"
- 'e' -> return "\\"
- '`' -> return "`"
- '^' -> return " "
- '|' -> return " "
- '\'' -> return "`"
- '.' -> return "`"
- '~' -> return "\160" -- nonbreaking space
- _ -> escUnknown ['\\',c] "\xFFFD"
-
- where
-
- twoCharGlyph = do
- cs <- count 2 anyChar
- case M.lookup cs characterCodeMap of
- Just c -> return [c]
- Nothing -> escUnknown ('\\':'(':cs) "\xFFFD"
-
- bracketedGlyph = unicodeGlyph <|> charGlyph
-
- charGlyph = do
- cs <- manyTill (noneOf ['[',']','\n']) (char ']')
- (case words cs of
- [] -> mzero
- [s] -> case M.lookup s characterCodeMap of
- Nothing -> mzero
- Just c -> return [c]
- (s:ss) -> do
- basechar <- case M.lookup cs characterCodeMap of
- Nothing ->
- case s of
- [ch] | isAscii ch && isAlphaNum ch ->
- return ch
- _ -> mzero
- Just c -> return c
- let addAccents [] xs = return $ T.unpack .
- Normalize.normalize Normalize.NFC .
- T.pack $ reverse xs
- addAccents (a:as) xs =
- case M.lookup a combiningAccentsMap of
- Just x -> addAccents as (x:xs)
- Nothing -> mzero
- addAccents ss [basechar])
- <|> escUnknown ("\\[" ++ cs ++ "]") "\xFFFD"
-
- unicodeGlyph = try $ ucharCode `sepBy1` (char '_') <* char ']'
-
- ucharCode = try $ do
- char 'u'
- cs <- many1 (satisfy isHexDigit)
- let lcs = length cs
- guard $ lcs >= 4 && lcs <= 6
- case chr <$> safeRead ('0':'x':cs) of
- Nothing -> mzero
- Just c -> return c
-
- -- \s-1 \s0 -- we ignore these
- escFontSize :: PandocMonad m => ManLexer m String
- escFontSize = do
- pos <- getPosition
- pm <- option "" $ count 1 (oneOf "+-")
- ds <- many1 digit
- report $ SkippedContent ("\\s" ++ pm ++ ds) pos
- return mempty
-
- escFont :: PandocMonad m => ManLexer m String
- escFont = do
- font <- choice
- [ S.singleton <$> letterFontKind
- , char '(' >> anyChar >> anyChar >> return (S.singleton Regular)
- , char 'S' >> return (S.singleton Regular)
- , try lettersFont
- , digit >> return (S.singleton Regular)
- ]
- modifyState (\r -> r {fontKind = font})
- return mempty
-
- lettersFont :: PandocMonad m => ManLexer m Font
- lettersFont = do
- char '['
- fs <- many letterFontKind
- skipMany letter
- char ']'
- return $ S.fromList fs
-
- letterFontKind :: PandocMonad m => ManLexer m FontKind
- letterFontKind = choice [
- oneOf ['B','b'] >> return Bold
- , oneOf ['I','i'] >> return Italic
- , oneOf ['C','c'] >> return Monospace
- , oneOf ['P','p','R','r'] >> return Regular
- ]
-
- escUnknown :: PandocMonad m => String -> a -> ManLexer m a
- escUnknown s x = do
- pos <- getPosition
- report $ SkippedContent ("Unknown escape sequence " ++ s) pos
- return x
-
-currentFont :: PandocMonad m => ManLexer m Font
-currentFont = fontKind <$> getState
-
--- separate function from lexMacro since real man files sometimes do not follow the rules
-lexComment :: PandocMonad m => ManLexer m ManTokens
-lexComment = do
- try $ string ".\\\""
- many Parsec.space
- skipMany $ noneOf "\n"
- char '\n'
- return mempty
-
-lexMacro :: PandocMonad m => ManLexer m ManTokens
-lexMacro = do
- pos <- getPosition
- char '.' <|> char '\''
- many spacetab
- macroName <- many (satisfy (not . isSpace))
- case macroName of
- "nop" -> return mempty
- "ie" -> lexConditional
- "if" -> lexConditional
- "el" -> skipConditional
- "TS" -> lexTable
-
- _ -> do
- args <- lexArgs
- case macroName of
- "" -> return mempty
- "\\\"" -> return mempty
- "\\#" -> return mempty
- "de" -> lexMacroDef args
- "de1" -> lexMacroDef args
- "ds" -> lexStringDef args
- "ds1" -> lexStringDef args
- "sp" -> return $ singleTok MEmptyLine
- "so" -> lexIncludeFile args
- _ -> resolveMacro macroName args pos
-
--- | TODO placeholder
-lexTable :: PandocMonad m => ManLexer m ManTokens
-lexTable = do
- pos <- getPosition
- manyTill anyLine (try (string ".TE" >> many spacetab >> eofline))
- report $ SkippedContent "table" pos
- return mempty
-
--- We don't fully handle the conditional. But we do
--- include everything under '.ie n', which occurs commonly
--- in man pages. We always skip the '.el' part.
-lexConditional :: PandocMonad m => ManLexer m ManTokens
-lexConditional = do
- skipMany spacetab
- parseNCond <|> skipConditional
-
--- n means nroff mode
-parseNCond :: PandocMonad m => ManLexer m ManTokens
-parseNCond = do
- char '\n'
- many1 spacetab
- lexGroup <|> manToken
-
-lexGroup :: PandocMonad m => ManLexer m ManTokens
-lexGroup = do
- groupstart
- mconcat <$> manyTill manToken groupend
- where
- groupstart = try $ string "\\{\\" >> newline
- groupend = try $ string "\\}" >> eofline
-
-skipConditional :: PandocMonad m => ManLexer m ManTokens
-skipConditional = do
- rest <- anyLine
- when ("\\{\\" `isSuffixOf` rest) $
- void $ manyTill anyChar (try (string "\\}"))
- return mempty
-
-lexIncludeFile :: PandocMonad m => [Arg] -> ManLexer m ManTokens
-lexIncludeFile args = do
- pos <- getPosition
- case args of
- (f:_) -> do
- let fp = linePartsToString f
- dirs <- getResourcePath
- result <- readFileFromDirs dirs fp
- case result of
- Nothing -> report $ CouldNotLoadIncludeFile fp pos
- Just s -> getInput >>= setInput . (s ++)
- return mempty
- [] -> return mempty
-
-resolveMacro :: PandocMonad m
- => String -> [Arg] -> SourcePos -> ManLexer m ManTokens
-resolveMacro macroName args pos = do
- macros <- customMacros <$> getState
- case M.lookup macroName macros of
- Nothing -> return $ singleTok $ MMacro macroName args pos
- Just ts -> do
- let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs
- fillLP (MacroArg i) zs =
- case drop (i - 1) args of
- [] -> zs
- (ys:_) -> ys ++ zs
- let fillMacroArg (MLine lineparts) =
- MLine (foldr fillLP [] lineparts)
- fillMacroArg x = x
- return $ ManTokens . fmap fillMacroArg . unManTokens $ ts
-
-lexStringDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens
-lexStringDef args = do -- string definition
- case args of
- [] -> fail "No argument to .ds"
- (x:ys) -> do
- let ts = singleTok $ MLine (intercalate [RoffStr (" ", mempty)] ys)
- let stringName = linePartsToString x
- modifyState $ \st ->
- st{ customMacros = M.insert stringName ts (customMacros st) }
- return mempty
-
-lexMacroDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens
-lexMacroDef args = do -- macro definition
- (macroName, stopMacro) <-
- case args of
- (x : y : _) -> return (linePartsToString x, linePartsToString y)
- -- optional second arg
- (x:_) -> return (linePartsToString x, ".")
- [] -> fail "No argument to .de"
- let stop = try $ do
- char '.' <|> char '\''
- many spacetab
- string stopMacro
- _ <- lexArgs
- return ()
- ts <- mconcat <$> manyTill manToken stop
- modifyState $ \st ->
- st{ customMacros = M.insert macroName ts (customMacros st) }
- return mempty
-
-lexArgs :: PandocMonad m => ManLexer m [Arg]
-lexArgs = do
- args <- many $ try oneArg
- skipMany spacetab
- eofline
- return args
-
- where
-
- oneArg :: PandocMonad m => ManLexer m [LinePart]
- oneArg = do
- skipMany $ try $ string "\\\n" -- continuation line
- try quotedArg <|> plainArg
- -- try, because there are some erroneous files, e.g. linux/bpf.2
-
- plainArg :: PandocMonad m => ManLexer m [LinePart]
- plainArg = do
- skipMany spacetab
- mconcat <$> many1
- (macroArg <|> esc <|> regularText <|> unescapedQuote <|> escStar)
- where
- unescapedQuote = do
- char '"'
- fonts <- currentFont
- return [RoffStr ("\"", fonts)]
-
-
- quotedArg :: PandocMonad m => ManLexer m [LinePart]
- quotedArg = do
- skipMany spacetab
- char '"'
- xs <- mconcat <$>
- many (macroArg <|> esc <|> escStar <|> regularText
- <|> spaceTabChar <|> escapedQuote)
- char '"'
- return xs
- where
- escapedQuote = try $ do
- char '"'
- char '"'
- fonts <- currentFont
- return [RoffStr ("\"", fonts)]
-
-escStar :: PandocMonad m => ManLexer m [LinePart]
-escStar = try $ do
- pos <- getPosition
- char '\\'
- char '*'
- c <- anyChar
- case c of
- '(' -> do
- cs <- count 2 anyChar
- resolveString cs pos
- '[' -> do
- cs <- many (noneOf "\t\n\r ]")
- char ']'
- resolveString cs pos
- 'S' -> return mempty -- switch back to default font size
- _ -> resolveString [c] pos
-
- where
-
- -- strings and macros share namespace
- resolveString stringname pos = do
- ManTokens ts <- resolveMacro stringname [] pos
- case Foldable.toList ts of
- [MLine xs] -> return xs
- _ -> do
- report $ SkippedContent ("unknown string " ++ stringname) pos
- return mempty
-
-lexLine :: PandocMonad m => ManLexer m ManTokens
-lexLine = do
- lnparts <- mconcat <$> many1 linePart
- eofline
- go lnparts
- where -- return empty line if we only have empty strings;
- -- this can happen if the line just contains \f[C], for example.
- go [] = return mempty
- go (RoffStr ("",_):xs) = go xs
- go xs = return $ singleTok $ MLine xs
-
-linePart :: PandocMonad m => ManLexer m [LinePart]
-linePart = macroArg <|> esc <|> escStar <|>
- regularText <|> quoteChar <|> spaceTabChar
-
-macroArg :: PandocMonad m => ManLexer m [LinePart]
-macroArg = try $ do
- string "\\\\$"
- x <- digit
- return [MacroArg $ ord x - ord '0']
-
-esc :: PandocMonad m => ManLexer m [LinePart]
-esc = do
- s <- escapeLexer
- font <- currentFont
- return [RoffStr (s, font)]
-
-regularText :: PandocMonad m => ManLexer m [LinePart]
-regularText = do
- s <- many1 $ noneOf "\n\r\t \\\""
- font <- currentFont
- return [RoffStr (s, font)]
-
-quoteChar :: PandocMonad m => ManLexer m [LinePart]
-quoteChar = do
- char '"'
- font <- currentFont
- return [RoffStr ("\"", font)]
-
-spaceTabChar :: PandocMonad m => ManLexer m [LinePart]
-spaceTabChar = do
- c <- spacetab
- font <- currentFont
- return [RoffStr ([c], font)]
-
-lexEmptyLine :: PandocMonad m => ManLexer m ManTokens
-lexEmptyLine = char '\n' >> return (singleTok MEmptyLine)
-
-manToken :: PandocMonad m => ManLexer m ManTokens
-manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine
parseMan :: PandocMonad m => ManParser m Pandoc
parseMan = do
@@ -656,12 +203,6 @@ linePartsToInlines = go
isItalic (RoffStr (_,f)) = Italic `S.member` f
isItalic _ = False
-linePartsToString :: [LinePart] -> String
-linePartsToString = mconcat . map go
- where
- go (RoffStr (s, _)) = s
- go _ = mempty
-
parsePara :: PandocMonad m => ManParser m Blocks
parsePara = para . trimInlines <$> parseInlines