diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/Man.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/Man.hs | 471 |
1 files changed, 6 insertions, 465 deletions
diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index f3013be5f..50ec0c019 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -18,8 +18,6 @@ You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - -} {- | @@ -36,76 +34,25 @@ Conversion of man to 'Pandoc' document. module Text.Pandoc.Readers.Man (readMan) where import Prelude -import Control.Monad (liftM, void, mzero, guard, when) -import Control.Monad.Except (throwError) -import Text.Pandoc.Class - (getResourcePath, readFileFromDirs, PandocMonad(..), report) -import Data.Char (isHexDigit, chr, ord, isAscii, isAlphaNum, isSpace) import Data.Default (Default) +import Control.Monad (liftM, mzero, guard) +import Control.Monad.Except (throwError) +import Text.Pandoc.Class (PandocMonad(..), report) import Data.Maybe (catMaybes) -import qualified Data.Map as M -import Data.Set (Set) import qualified Data.Set as S -import Data.List (intersperse, intercalate, isSuffixOf) +import Data.List (intersperse, intercalate) import qualified Data.Text as T import Text.Pandoc.Builder as B import Text.Pandoc.Error (PandocError (PandocParsecError)) import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Options import Text.Pandoc.Parsing -import Text.Pandoc.Shared (crFilter, safeRead) +import Text.Pandoc.Shared (crFilter) +import Text.Pandoc.Readers.Groff -- TODO explicit imports import Text.Parsec hiding (tokenPrim) import qualified Text.Parsec as Parsec import Text.Parsec.Pos (updatePosString) -import Text.Pandoc.GroffChar (characterCodes, combiningAccents) -import qualified Data.Sequence as Seq import qualified Data.Foldable as Foldable -import qualified Data.Text.Normalize as Normalize - --- import Debug.Trace (traceShowId) - --- --- Data Types --- -data FontKind = Bold | Italic | Monospace | Regular deriving (Show, Eq, Ord) - -type MacroKind = String - -type Font = Set FontKind - -data LinePart = RoffStr (String, Font) - | MacroArg Int - deriving Show - -type Arg = [LinePart] - --- TODO parse tables (see man tbl) -data ManToken = MLine [LinePart] - | MEmptyLine - | MMacro MacroKind [Arg] SourcePos - | MTable [Alignment] ManTokens [ManTokens] [[ManTokens]] - deriving Show - -newtype ManTokens = ManTokens { unManTokens :: Seq.Seq ManToken } - deriving (Show, Semigroup, Monoid) - -singleTok :: ManToken -> ManTokens -singleTok t = ManTokens (Seq.singleton t) - -data RoffState = RoffState { fontKind :: Font - , customMacros :: M.Map String ManTokens - } deriving Show - -instance Default RoffState where - def = RoffState { customMacros = M.fromList - $ map (\(n, s) -> - (n, singleTok - (MLine [RoffStr (s, mempty)]))) - [ ("Tm", "\x2122") - , ("lq", "\x201C") - , ("rq", "\x201D") - , ("R", "\x00AE") ] - , fontKind = S.singleton Regular } data ManState = ManState { readerOptions :: ReaderOptions , metadata :: Meta @@ -115,7 +62,6 @@ instance Default ManState where def = ManState { readerOptions = def , metadata = nullMeta } -type ManLexer m = ParserT [Char] RoffState m type ManParser m = ParserT [ManToken] ManState m @@ -147,405 +93,6 @@ readMan opts txt = do mapLeft f (Left x) = Left $ f x mapLeft _ (Right r) = Right r --- --- Lexer: String -> ManToken --- - -eofline :: Stream s m Char => ParsecT s u m () -eofline = void newline <|> eof - -spacetab :: Stream s m Char => ParsecT s u m Char -spacetab = char ' ' <|> char '\t' - -characterCodeMap :: M.Map String Char -characterCodeMap = - M.fromList $ map (\(x,y) -> (y,x)) characterCodes - -combiningAccentsMap :: M.Map String Char -combiningAccentsMap = - M.fromList $ map (\(x,y) -> (y,x)) combiningAccents - -escapeLexer :: PandocMonad m => ManLexer m String -escapeLexer = try $ do - char '\\' - c <- noneOf ['*','$'] -- see escStar, macroArg - case c of - '(' -> twoCharGlyph - '[' -> bracketedGlyph - 'f' -> escFont - 's' -> escFontSize - '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment - '#' -> mempty <$ manyTill anyChar newline - '%' -> return mempty - '{' -> return mempty - '}' -> return mempty - '&' -> return mempty - '\n' -> return mempty - ':' -> return mempty - '0' -> return mempty - 'c' -> return mempty - '-' -> return "-" - '_' -> return "_" - ' ' -> return " " - '\\' -> return "\\" - 't' -> return "\t" - 'e' -> return "\\" - '`' -> return "`" - '^' -> return " " - '|' -> return " " - '\'' -> return "`" - '.' -> return "`" - '~' -> return "\160" -- nonbreaking space - _ -> escUnknown ['\\',c] "\xFFFD" - - where - - twoCharGlyph = do - cs <- count 2 anyChar - case M.lookup cs characterCodeMap of - Just c -> return [c] - Nothing -> escUnknown ('\\':'(':cs) "\xFFFD" - - bracketedGlyph = unicodeGlyph <|> charGlyph - - charGlyph = do - cs <- manyTill (noneOf ['[',']','\n']) (char ']') - (case words cs of - [] -> mzero - [s] -> case M.lookup s characterCodeMap of - Nothing -> mzero - Just c -> return [c] - (s:ss) -> do - basechar <- case M.lookup cs characterCodeMap of - Nothing -> - case s of - [ch] | isAscii ch && isAlphaNum ch -> - return ch - _ -> mzero - Just c -> return c - let addAccents [] xs = return $ T.unpack . - Normalize.normalize Normalize.NFC . - T.pack $ reverse xs - addAccents (a:as) xs = - case M.lookup a combiningAccentsMap of - Just x -> addAccents as (x:xs) - Nothing -> mzero - addAccents ss [basechar]) - <|> escUnknown ("\\[" ++ cs ++ "]") "\xFFFD" - - unicodeGlyph = try $ ucharCode `sepBy1` (char '_') <* char ']' - - ucharCode = try $ do - char 'u' - cs <- many1 (satisfy isHexDigit) - let lcs = length cs - guard $ lcs >= 4 && lcs <= 6 - case chr <$> safeRead ('0':'x':cs) of - Nothing -> mzero - Just c -> return c - - -- \s-1 \s0 -- we ignore these - escFontSize :: PandocMonad m => ManLexer m String - escFontSize = do - pos <- getPosition - pm <- option "" $ count 1 (oneOf "+-") - ds <- many1 digit - report $ SkippedContent ("\\s" ++ pm ++ ds) pos - return mempty - - escFont :: PandocMonad m => ManLexer m String - escFont = do - font <- choice - [ S.singleton <$> letterFontKind - , char '(' >> anyChar >> anyChar >> return (S.singleton Regular) - , char 'S' >> return (S.singleton Regular) - , try lettersFont - , digit >> return (S.singleton Regular) - ] - modifyState (\r -> r {fontKind = font}) - return mempty - - lettersFont :: PandocMonad m => ManLexer m Font - lettersFont = do - char '[' - fs <- many letterFontKind - skipMany letter - char ']' - return $ S.fromList fs - - letterFontKind :: PandocMonad m => ManLexer m FontKind - letterFontKind = choice [ - oneOf ['B','b'] >> return Bold - , oneOf ['I','i'] >> return Italic - , oneOf ['C','c'] >> return Monospace - , oneOf ['P','p','R','r'] >> return Regular - ] - - escUnknown :: PandocMonad m => String -> a -> ManLexer m a - escUnknown s x = do - pos <- getPosition - report $ SkippedContent ("Unknown escape sequence " ++ s) pos - return x - -currentFont :: PandocMonad m => ManLexer m Font -currentFont = fontKind <$> getState - --- separate function from lexMacro since real man files sometimes do not follow the rules -lexComment :: PandocMonad m => ManLexer m ManTokens -lexComment = do - try $ string ".\\\"" - many Parsec.space - skipMany $ noneOf "\n" - char '\n' - return mempty - -lexMacro :: PandocMonad m => ManLexer m ManTokens -lexMacro = do - pos <- getPosition - char '.' <|> char '\'' - many spacetab - macroName <- many (satisfy (not . isSpace)) - case macroName of - "nop" -> return mempty - "ie" -> lexConditional - "if" -> lexConditional - "el" -> skipConditional - "TS" -> lexTable - - _ -> do - args <- lexArgs - case macroName of - "" -> return mempty - "\\\"" -> return mempty - "\\#" -> return mempty - "de" -> lexMacroDef args - "de1" -> lexMacroDef args - "ds" -> lexStringDef args - "ds1" -> lexStringDef args - "sp" -> return $ singleTok MEmptyLine - "so" -> lexIncludeFile args - _ -> resolveMacro macroName args pos - --- | TODO placeholder -lexTable :: PandocMonad m => ManLexer m ManTokens -lexTable = do - pos <- getPosition - manyTill anyLine (try (string ".TE" >> many spacetab >> eofline)) - report $ SkippedContent "table" pos - return mempty - --- We don't fully handle the conditional. But we do --- include everything under '.ie n', which occurs commonly --- in man pages. We always skip the '.el' part. -lexConditional :: PandocMonad m => ManLexer m ManTokens -lexConditional = do - skipMany spacetab - parseNCond <|> skipConditional - --- n means nroff mode -parseNCond :: PandocMonad m => ManLexer m ManTokens -parseNCond = do - char '\n' - many1 spacetab - lexGroup <|> manToken - -lexGroup :: PandocMonad m => ManLexer m ManTokens -lexGroup = do - groupstart - mconcat <$> manyTill manToken groupend - where - groupstart = try $ string "\\{\\" >> newline - groupend = try $ string "\\}" >> eofline - -skipConditional :: PandocMonad m => ManLexer m ManTokens -skipConditional = do - rest <- anyLine - when ("\\{\\" `isSuffixOf` rest) $ - void $ manyTill anyChar (try (string "\\}")) - return mempty - -lexIncludeFile :: PandocMonad m => [Arg] -> ManLexer m ManTokens -lexIncludeFile args = do - pos <- getPosition - case args of - (f:_) -> do - let fp = linePartsToString f - dirs <- getResourcePath - result <- readFileFromDirs dirs fp - case result of - Nothing -> report $ CouldNotLoadIncludeFile fp pos - Just s -> getInput >>= setInput . (s ++) - return mempty - [] -> return mempty - -resolveMacro :: PandocMonad m - => String -> [Arg] -> SourcePos -> ManLexer m ManTokens -resolveMacro macroName args pos = do - macros <- customMacros <$> getState - case M.lookup macroName macros of - Nothing -> return $ singleTok $ MMacro macroName args pos - Just ts -> do - let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs - fillLP (MacroArg i) zs = - case drop (i - 1) args of - [] -> zs - (ys:_) -> ys ++ zs - let fillMacroArg (MLine lineparts) = - MLine (foldr fillLP [] lineparts) - fillMacroArg x = x - return $ ManTokens . fmap fillMacroArg . unManTokens $ ts - -lexStringDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens -lexStringDef args = do -- string definition - case args of - [] -> fail "No argument to .ds" - (x:ys) -> do - let ts = singleTok $ MLine (intercalate [RoffStr (" ", mempty)] ys) - let stringName = linePartsToString x - modifyState $ \st -> - st{ customMacros = M.insert stringName ts (customMacros st) } - return mempty - -lexMacroDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens -lexMacroDef args = do -- macro definition - (macroName, stopMacro) <- - case args of - (x : y : _) -> return (linePartsToString x, linePartsToString y) - -- optional second arg - (x:_) -> return (linePartsToString x, ".") - [] -> fail "No argument to .de" - let stop = try $ do - char '.' <|> char '\'' - many spacetab - string stopMacro - _ <- lexArgs - return () - ts <- mconcat <$> manyTill manToken stop - modifyState $ \st -> - st{ customMacros = M.insert macroName ts (customMacros st) } - return mempty - -lexArgs :: PandocMonad m => ManLexer m [Arg] -lexArgs = do - args <- many $ try oneArg - skipMany spacetab - eofline - return args - - where - - oneArg :: PandocMonad m => ManLexer m [LinePart] - oneArg = do - skipMany $ try $ string "\\\n" -- continuation line - try quotedArg <|> plainArg - -- try, because there are some erroneous files, e.g. linux/bpf.2 - - plainArg :: PandocMonad m => ManLexer m [LinePart] - plainArg = do - skipMany spacetab - mconcat <$> many1 - (macroArg <|> esc <|> regularText <|> unescapedQuote <|> escStar) - where - unescapedQuote = do - char '"' - fonts <- currentFont - return [RoffStr ("\"", fonts)] - - - quotedArg :: PandocMonad m => ManLexer m [LinePart] - quotedArg = do - skipMany spacetab - char '"' - xs <- mconcat <$> - many (macroArg <|> esc <|> escStar <|> regularText - <|> spaceTabChar <|> escapedQuote) - char '"' - return xs - where - escapedQuote = try $ do - char '"' - char '"' - fonts <- currentFont - return [RoffStr ("\"", fonts)] - -escStar :: PandocMonad m => ManLexer m [LinePart] -escStar = try $ do - pos <- getPosition - char '\\' - char '*' - c <- anyChar - case c of - '(' -> do - cs <- count 2 anyChar - resolveString cs pos - '[' -> do - cs <- many (noneOf "\t\n\r ]") - char ']' - resolveString cs pos - 'S' -> return mempty -- switch back to default font size - _ -> resolveString [c] pos - - where - - -- strings and macros share namespace - resolveString stringname pos = do - ManTokens ts <- resolveMacro stringname [] pos - case Foldable.toList ts of - [MLine xs] -> return xs - _ -> do - report $ SkippedContent ("unknown string " ++ stringname) pos - return mempty - -lexLine :: PandocMonad m => ManLexer m ManTokens -lexLine = do - lnparts <- mconcat <$> many1 linePart - eofline - go lnparts - where -- return empty line if we only have empty strings; - -- this can happen if the line just contains \f[C], for example. - go [] = return mempty - go (RoffStr ("",_):xs) = go xs - go xs = return $ singleTok $ MLine xs - -linePart :: PandocMonad m => ManLexer m [LinePart] -linePart = macroArg <|> esc <|> escStar <|> - regularText <|> quoteChar <|> spaceTabChar - -macroArg :: PandocMonad m => ManLexer m [LinePart] -macroArg = try $ do - string "\\\\$" - x <- digit - return [MacroArg $ ord x - ord '0'] - -esc :: PandocMonad m => ManLexer m [LinePart] -esc = do - s <- escapeLexer - font <- currentFont - return [RoffStr (s, font)] - -regularText :: PandocMonad m => ManLexer m [LinePart] -regularText = do - s <- many1 $ noneOf "\n\r\t \\\"" - font <- currentFont - return [RoffStr (s, font)] - -quoteChar :: PandocMonad m => ManLexer m [LinePart] -quoteChar = do - char '"' - font <- currentFont - return [RoffStr ("\"", font)] - -spaceTabChar :: PandocMonad m => ManLexer m [LinePart] -spaceTabChar = do - c <- spacetab - font <- currentFont - return [RoffStr ([c], font)] - -lexEmptyLine :: PandocMonad m => ManLexer m ManTokens -lexEmptyLine = char '\n' >> return (singleTok MEmptyLine) - -manToken :: PandocMonad m => ManLexer m ManTokens -manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine parseMan :: PandocMonad m => ManParser m Pandoc parseMan = do @@ -656,12 +203,6 @@ linePartsToInlines = go isItalic (RoffStr (_,f)) = Italic `S.member` f isItalic _ = False -linePartsToString :: [LinePart] -> String -linePartsToString = mconcat . map go - where - go (RoffStr (s, _)) = s - go _ = mempty - parsePara :: PandocMonad m => ManParser m Blocks parsePara = para . trimInlines <$> parseInlines |