aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2018-10-16 18:39:20 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2018-10-16 18:39:20 -0700
commit5c42101ee973131196b07661adc95670f2ed6693 (patch)
tree52ae302f981115a1855891d51857907704202ec7 /src
parent63b3886bfe3c5b50d171355573ab59750e009d03 (diff)
parent2c4e8941a7ccad2b325026fa27b9959ce959edef (diff)
downloadpandoc-5c42101ee973131196b07661adc95670f2ed6693.tar.gz
Merge branch 'groff_reader' of https://github.com/Yanpas/pandoc into Yanpas-groff_reader
Diffstat (limited to 'src')
-rw-r--r--src/Text/Pandoc/Readers.hs2
-rw-r--r--src/Text/Pandoc/Readers/Man.hs559
2 files changed, 561 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index 7b7f92b35..76492b0aa 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -105,6 +105,7 @@ import Text.Pandoc.Readers.TikiWiki
import Text.Pandoc.Readers.TWiki
import Text.Pandoc.Readers.Txt2Tags
import Text.Pandoc.Readers.Vimwiki
+import Text.Pandoc.Readers.Man
import Text.Pandoc.Shared (mapLeft)
import qualified Text.Pandoc.UTF8 as UTF8
import Text.Parsec.Error
@@ -145,6 +146,7 @@ readers = [ ("native" , TextReader readNative)
,("t2t" , TextReader readTxt2Tags)
,("epub" , ByteStringReader readEPUB)
,("muse" , TextReader readMuse)
+ ,("man" , TextReader readMan)
,("fb2" , TextReader readFB2)
]
diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs
new file mode 100644
index 000000000..1ffdd1f91
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Man.hs
@@ -0,0 +1,559 @@
+{-# LANGUAGE FlexibleContexts #-}
+{-
+ Copyright (C) 2018 Yan Pashkovsky <yanp.bugz@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Man
+ Copyright : Copyright (C) 2018 Yan Pashkovsky
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Yan Pashkovsky <yanp.bugz@gmail.com>
+ Stability : WIP
+ Portability : portable
+
+Conversion of man to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.Man (readMan) where
+
+import Prelude
+import Control.Monad (liftM, void)
+import Control.Monad.Except (throwError)
+import Data.Char (isDigit, isUpper, isLower)
+import Data.Default (Default)
+import Data.Map (insert)
+import Data.Set (Set, singleton)
+import qualified Data.Set as S (fromList, toList)
+import Data.Maybe (catMaybes, fromMaybe, isNothing)
+import Data.List (intersperse, intercalate)
+import qualified Data.Text as T
+
+import Text.Pandoc.Class (PandocMonad(..))
+import Text.Pandoc.Builder as B hiding (singleton)
+import Text.Pandoc.Error (PandocError (PandocParsecError))
+import Text.Pandoc.Logging (LogMessage(..))
+import Text.Pandoc.Options
+import Text.Pandoc.Parsing
+import Text.Pandoc.Shared (crFilter)
+import Text.Parsec hiding (tokenPrim, space)
+import qualified Text.Parsec as Parsec
+import Text.Parsec.Pos (updatePosString)
+
+--
+-- Data Types
+--
+data FontKind = Bold | Italic | Monospace | Regular deriving (Show, Eq, Ord)
+
+data MacroKind = KTitle
+ | KCodeBlStart
+ | KCodeBlEnd
+ | KTab
+ | KTabEnd
+ | KSubTab
+ deriving (Show, Eq)
+
+type Font = Set FontKind
+
+type RoffStr = (String, Font)
+
+-- TODO parse tables (see man tbl)
+data ManToken = MStr RoffStr
+ | MLine [RoffStr]
+ | MMaybeLink String
+ | MEmptyLine
+ | MHeader Int [RoffStr]
+ | MMacro MacroKind [RoffStr]
+ | MUnknownMacro String [RoffStr]
+ | MComment String
+ deriving Show
+
+data EscapeThing = EFont Font
+ | EChar Char
+ | ENothing
+ deriving Show
+
+data RoffState = RoffState { fontKind :: Font
+ } deriving Show
+
+instance Default RoffState where
+ def = RoffState {fontKind = singleton Regular}
+
+type ManLexer m = ParserT [Char] RoffState m
+type ManParser m = ParserT [ManToken] ParserState m
+
+---- debug functions
+{-
+import Text.Pandoc.Class (runIOorExplode)
+
+printPandoc :: Pandoc -> [Char]
+printPandoc (Pandoc m content) =
+ let ttl = "Pandoc: " ++ (show $ unMeta m)
+ cnt = intercalate "\n" $ map show content
+ in ttl ++ "\n" ++ cnt
+
+testStr :: String -> IO ()
+testStr str = do
+ pand <- runIOorExplode $ readMan def (T.pack str)
+ putStrLn $ printPandoc pand
+
+
+testFile :: FilePath -> IO ()
+testFile fname = do
+ cont <- readFile fname
+ testStr cont
+-}
+----
+
+
+-- | Read man (troff) from an input string and return a Pandoc document.
+readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc
+readMan opts txt = do
+ eithertokens <- readWithM lexMan def (T.unpack $ crFilter txt)
+ case eithertokens of
+ Left e -> throwError e
+ Right tokenz -> do
+ let state = def {stateOptions = opts} :: ParserState
+ eitherdoc <- readWithMTokens parseMan state tokenz
+ either throwError return eitherdoc
+
+ where
+
+ readWithMTokens :: PandocMonad m
+ => ParserT [ManToken] ParserState m a -- ^ parser
+ -> ParserState -- ^ initial state
+ -> [ManToken] -- ^ input
+ -> m (Either PandocError a)
+ readWithMTokens parser state input =
+ let leftF = PandocParsecError . (intercalate "\n") $ show <$> input
+ in mapLeft leftF `liftM` runParserT parser state "source" input
+
+ mapLeft :: (a -> c) -> Either a b -> Either c b
+ mapLeft f (Left x) = Left $ f x
+ mapLeft _ (Right r) = Right r
+
+--
+-- String -> ManToken function
+--
+
+lexMan :: PandocMonad m => ManLexer m [ManToken]
+lexMan = many (lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine)
+
+parseMan :: PandocMonad m => ManParser m Pandoc
+parseMan = do
+ let parsers = [ try parseList, parseTitle, parsePara, parseSkippedContent
+ , try parseCodeBlock, parseHeader, parseSkipMacro]
+ bs <- many $ choice parsers
+ let (Pandoc _ blocks) = doc $ mconcat bs
+ meta <- stateMeta <$> getState
+ return $ Pandoc meta blocks
+
+eofline :: Stream s m Char => ParsecT s u m ()
+eofline = void newline <|> eof
+
+spacetab :: Stream s m Char => ParsecT s u m Char
+spacetab = char ' ' <|> char '\t'
+
+-- TODO add other sequences from man (7) groff
+escapeLexer :: PandocMonad m => ManLexer m EscapeThing
+escapeLexer = do
+ char '\\'
+ choice [escChar, escFont, escUnknown]
+ where
+
+ escChar :: PandocMonad m => ManLexer m EscapeThing
+ escChar =
+ let skipSeqs = ["%", "{", "}", "&", "\n", ":", "\"", "0", "c"]
+ subsSeqs = [ ("-", '-'), (" ", ' '), ("\\", '\\'), ("[lq]", '“'), ("[rq]", '”')
+ , ("[em]", '—'), ("[en]", '–'), ("*(lq", '«'), ("*(rq", '»')
+ , ("t", '\t'), ("e", '\\'), ("`", '`'), ("^", ' '), ("|", ' ')
+ , ("'", '`') ]
+ substitute :: PandocMonad m => (String,Char) -> ManLexer m EscapeThing
+ substitute (from,to) = try $ string from >> return (EChar to)
+ skip :: PandocMonad m => String -> ManLexer m EscapeThing
+ skip seq' = try $ string seq' >> return ENothing
+ in choice $ (substitute <$> subsSeqs) ++
+ (skip <$> skipSeqs) ++
+ [ char '(' >> anyChar >> return ENothing
+ , char '[' >> many alphaNum >> char ']' >> return ENothing
+ ]
+
+ escFont :: PandocMonad m => ManLexer m EscapeThing
+ escFont = do
+ char 'f'
+ font <- choice [ singleton <$> letterFontKind
+ , char '(' >> anyChar >> anyChar >> return (singleton Regular)
+ , try lettersFont
+ , digit >> return (singleton Regular)
+ ]
+ modifyState (\r -> r {fontKind = font})
+ return $ EFont font
+
+ where
+
+ lettersFont :: PandocMonad m => ManLexer m Font
+ lettersFont = do
+ char '['
+ fs <- many letterFontKind
+ many letter
+ char ']'
+ return $ S.fromList fs
+
+ letterFontKind :: PandocMonad m => ManLexer m FontKind
+ letterFontKind = choice [
+ char 'B' >> return Bold
+ , char 'I' >> return Italic
+ , char 'C' >> return Monospace
+ , (char 'P' <|> char 'R') >> return Regular
+ ]
+
+ escUnknown :: PandocMonad m => ManLexer m EscapeThing
+ escUnknown = do
+ c <- anyChar
+ pos <- getPosition
+ logOutput $ SkippedContent ("Unknown escape sequence \\" ++ [c]) pos
+ return ENothing
+
+currentFont :: PandocMonad m => ManLexer m Font
+currentFont = fontKind <$> getState
+
+-- separate function from lexMacro since real man files sometimes do not follow the rules
+lexComment :: PandocMonad m => ManLexer m ManToken
+lexComment = do
+ try $ string ".\\\""
+ many Parsec.space
+ body <- many $ noneOf "\n"
+ char '\n'
+ return $ MComment body
+
+lexMacro :: PandocMonad m => ManLexer m ManToken
+lexMacro = do
+ char '.' <|> char '\''
+ many spacetab
+ macroName <- many1 (letter <|> oneOf ['\\', '"', '&'])
+ args <- lexArgs
+ let joinedArgs = unwords $ fst <$> args
+ knownMacro mkind = MMacro mkind args
+
+ tok = case macroName of
+ x | x `elem` ["\\\"", "\\#"] -> MComment joinedArgs
+ "TH" -> knownMacro KTitle
+ "IP" -> knownMacro KTab
+ "TP" -> knownMacro KTab
+ "RE" -> knownMacro KTabEnd
+ "RS" -> knownMacro KSubTab
+ "nf" -> knownMacro KCodeBlStart
+ "fi" -> knownMacro KCodeBlEnd
+ "B" -> MStr (joinedArgs, singleton Bold)
+ "BR" -> MMaybeLink joinedArgs
+ x | x `elem` ["BI", "IB"] -> MStr (joinedArgs, S.fromList [Italic, Bold])
+ x | x `elem` ["I", "IR", "RI"] -> MStr (joinedArgs, singleton Italic)
+ "SH" -> MHeader 2 args
+ "SS" -> MHeader 3 args
+ x | x `elem` [ "P", "PP", "LP", "sp"] -> MEmptyLine
+ _ -> MUnknownMacro macroName args
+ return tok
+
+ where
+
+ -- TODO better would be [[RoffStr]], since one arg may have different fonts
+ lexArgs :: PandocMonad m => ManLexer m [RoffStr]
+ lexArgs = do
+ args <- many $ try oneArg
+ many spacetab
+ eofline
+ return args
+
+ where
+
+ oneArg :: PandocMonad m => ManLexer m RoffStr
+ oneArg = do
+ many1 spacetab
+ many $ try $ string "\\\n"
+ try quotedArg <|> plainArg -- try, because there are some erroneous files, e.g. linux/bpf.2
+
+ plainArg :: PandocMonad m => ManLexer m RoffStr
+ plainArg = do
+ indents <- many spacetab
+ arg <- many1 $ escChar <|> (Just <$> noneOf " \t\n")
+ f <- currentFont
+ return (indents ++ catMaybes arg, f)
+
+ quotedArg :: PandocMonad m => ManLexer m RoffStr
+ quotedArg = do
+ char '"'
+ val <- many quotedChar
+ char '"'
+ val2 <- many $ escChar <|> (Just <$> noneOf " \t\n")
+ f <- currentFont
+ return (catMaybes $ val ++ val2, f)
+
+ quotedChar :: PandocMonad m => ManLexer m (Maybe Char)
+ quotedChar = escChar <|> (Just <$> noneOf "\"\n") <|> (Just <$> try (string "\"\"" >> return '"'))
+
+ escChar :: PandocMonad m => ManLexer m (Maybe Char)
+ escChar = do
+ ec <- escapeLexer
+ case ec of
+ (EChar c) -> return $ Just c
+ _ -> return Nothing
+
+lexLine :: PandocMonad m => ManLexer m ManToken
+lexLine = do
+ lnparts <- many1 (esc <|> linePart)
+ eofline
+ return $ MLine $ catMaybes lnparts
+ where
+
+ esc :: PandocMonad m => ManLexer m (Maybe (String, Font))
+ esc = do
+ someesc <- escapeLexer
+ font <- currentFont
+ let rv = case someesc of
+ EChar c -> Just ([c], font)
+ _ -> Nothing
+ return rv
+
+ linePart :: PandocMonad m => ManLexer m (Maybe (String, Font))
+ linePart = do
+ lnpart <- many1 $ noneOf "\n\\"
+ font <- currentFont
+ return $ Just (lnpart, font)
+
+
+lexEmptyLine :: PandocMonad m => ManLexer m ManToken
+lexEmptyLine = char '\n' >> return MEmptyLine
+
+--
+-- ManToken parsec functions
+--
+
+msatisfy :: (Show t, Stream s m t) => (t -> Bool) -> ParserT s st m t
+msatisfy predic = tokenPrim show nextPos testTok
+ where
+ testTok t = if predic t then Just t else Nothing
+ nextPos pos _x _xs = updatePosString (setSourceColumn (setSourceLine pos $ sourceLine pos + 1) 1) ("")
+
+mstr :: PandocMonad m => ManParser m ManToken
+mstr = msatisfy isMStr where
+ isMStr (MStr _) = True
+ isMStr _ = False
+
+mline :: PandocMonad m => ManParser m ManToken
+mline = msatisfy isMLine where
+ isMLine (MLine _) = True
+ isMLine _ = False
+
+mmaybeLink :: PandocMonad m => ManParser m ManToken
+mmaybeLink = msatisfy isMMaybeLink where
+ isMMaybeLink (MMaybeLink _) = True
+ isMMaybeLink _ = False
+
+memplyLine :: PandocMonad m => ManParser m ManToken
+memplyLine = msatisfy isMEmptyLine where
+ isMEmptyLine MEmptyLine = True
+ isMEmptyLine _ = False
+
+mheader :: PandocMonad m => ManParser m ManToken
+mheader = msatisfy isMHeader where
+ isMHeader (MHeader _ _) = True
+ isMHeader _ = False
+
+mmacro :: PandocMonad m => MacroKind -> ManParser m ManToken
+mmacro mk = msatisfy isMMacro where
+ isMMacro (MMacro mk' _) | mk == mk' = True
+ | otherwise = False
+ isMMacro _ = False
+
+mmacroAny :: PandocMonad m => ManParser m ManToken
+mmacroAny = msatisfy isMMacro where
+ isMMacro (MMacro _ _) = True
+ isMMacro _ = False
+
+munknownMacro :: PandocMonad m => ManParser m ManToken
+munknownMacro = msatisfy isMUnknownMacro where
+ isMUnknownMacro (MUnknownMacro _ _) = True
+ isMUnknownMacro _ = False
+
+mcomment :: PandocMonad m => ManParser m ManToken
+mcomment = msatisfy isMComment where
+ isMComment (MComment _) = True
+ isMComment _ = False
+
+--
+-- ManToken -> Block functions
+--
+
+parseTitle :: PandocMonad m => ManParser m Blocks
+parseTitle = do
+ (MMacro _ args) <- mmacro KTitle
+ if null args
+ then return mempty
+ else do
+ let mantitle = fst $ head args
+ modifyState (changeTitle mantitle)
+ return $ header 1 $ str mantitle
+ where
+ changeTitle title pst =
+ let meta = stateMeta pst
+ metaUp = Meta $ insert "title" (MetaString title) (unMeta meta)
+ in
+ pst {stateMeta = metaUp}
+
+parseSkippedContent :: PandocMonad m => ManParser m Blocks
+parseSkippedContent = do
+ tok <- munknownMacro <|> mcomment <|> memplyLine
+ onToken tok
+ return mempty
+
+ where
+
+ onToken :: PandocMonad m => ManToken -> ManParser m ()
+ onToken (MUnknownMacro mname _) = do
+ pos <- getPosition
+ logMessage $ SkippedContent ("Unknown macro: " ++ mname) pos
+ onToken _ = return ()
+
+strToInlines :: RoffStr -> Inlines
+strToInlines (s, fonts) = inner $ S.toList fonts where
+ inner :: [FontKind] -> Inlines
+ inner [] = str s
+ inner (Bold:fs) = strong $ inner fs
+ inner (Italic:fs) = emph $ inner fs
+
+ -- Monospace goes after Bold and Italic in ordered set
+ inner (Monospace:_) = code s
+ inner (Regular:fs) = inner fs
+
+parsePara :: PandocMonad m => ManParser m Blocks
+parsePara = para <$> parseInlines
+
+parseInlines :: PandocMonad m => ManParser m Inlines
+parseInlines = do
+ inls <- many1 (strInl <|> lineInl <|> linkInl <|> comment)
+ let withspaces = intersperse B.space inls
+ return $ mconcat withspaces
+
+ where
+
+ strInl :: PandocMonad m => ManParser m Inlines
+ strInl = do
+ (MStr rstr) <- mstr
+ return $ strToInlines rstr
+
+ lineInl :: PandocMonad m => ManParser m Inlines
+ lineInl = do
+ (MLine fragments) <- mline
+ return $ mconcat $ strToInlines <$> fragments
+
+ linkInl :: PandocMonad m => ManParser m Inlines
+ linkInl = do
+ (MMaybeLink txt) <- mmaybeLink
+ let inls = case runParser linkParser () "" txt of
+ Right lnk -> lnk
+ Left _ -> strong $ str txt
+ return inls
+
+ where
+
+ -- assuming man pages are generated from Linux-like repository
+ linkParser :: Parsec String () Inlines
+ linkParser = do
+ mpage <- many1 (alphaNum <|> char '_')
+ spacetab
+ char '('
+ mansect <- digit
+ char ')'
+ other <- many anyChar
+ let manurl pagename section = "../"++section++"/"++pagename++"."++section
+ lnkInls = link (manurl mpage [mansect]) mpage (strong $ str mpage)
+ return $ lnkInls <> strong (str (" ("++[mansect] ++ ")") <> str other)
+
+ comment :: PandocMonad m => ManParser m Inlines
+ comment = mcomment >> return mempty
+
+
+parseCodeBlock :: PandocMonad m => ManParser m Blocks
+parseCodeBlock = do
+ mmacro KCodeBlStart
+ toks <- many (mstr <|> mline <|> mmaybeLink <|> memplyLine <|> munknownMacro <|> mcomment)
+ mmacro KCodeBlEnd
+ return $ codeBlock (intercalate "\n" . catMaybes $ extractText <$> toks)
+
+ where
+
+ extractText :: ManToken -> Maybe String
+ extractText (MStr (s, _)) = Just s
+ extractText (MLine ss) = Just . concat $ map fst ss -- TODO maybe unwords?
+ extractText (MMaybeLink s) = Just s
+ extractText MEmptyLine = Just "" -- string are intercalated with '\n', this prevents double '\n'
+ extractText _ = Nothing
+
+parseHeader :: PandocMonad m => ManParser m Blocks
+parseHeader = do
+ (MHeader lvl ss) <- mheader
+ return $ header lvl (mconcat $ intersperse B.space $ strToInlines <$> ss)
+
+type ListBuilder = [Blocks] -> Blocks
+
+parseList :: PandocMonad m => ManParser m Blocks
+parseList = do
+ xx <- many1 paras
+ let bls = map snd xx
+ let bldr = fst $ head xx
+ return $ bldr bls
+
+ where
+
+ macroIPInl :: [RoffStr] -> Inlines
+ macroIPInl (x:_:[]) = strToInlines x <> B.space
+ macroIPInl _ = mempty
+
+ listKind :: [RoffStr] -> Maybe ListBuilder
+ listKind (((c:_), _):_:[]) =
+ let params style = orderedListWith (1, style, DefaultDelim)
+ in case c of
+ _ | isDigit c -> Just $ params Decimal
+ _ | isUpper c -> Just $ params UpperAlpha
+ _ | isLower c -> Just $ params LowerAlpha
+ _ -> Nothing
+
+ listKind _ = Nothing
+
+ paras :: PandocMonad m => ManParser m (ListBuilder, Blocks)
+ paras = do
+ (MMacro _ args) <- mmacro KTab
+ let lbuilderOpt = listKind args
+ lbuilder = fromMaybe bulletList lbuilderOpt
+ macroinl = macroIPInl args
+ inls <- parseInlines
+ let parainls = if isNothing lbuilderOpt then macroinl <> inls else inls
+ subls <- mconcat <$> many sublist
+ return $ (lbuilder, plain parainls <> subls)
+
+ sublist :: PandocMonad m => ManParser m Blocks
+ sublist = do
+ mmacro KSubTab
+ bl <- parseList
+ mmacro KTabEnd
+ return bl
+
+-- In case of weird man file it will be parsed succesfully
+parseSkipMacro :: PandocMonad m => ManParser m Blocks
+parseSkipMacro = mmacroAny >> mempty