diff options
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r-- | src/Text/Pandoc/Readers/Man.hs | 528 |
1 files changed, 528 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs new file mode 100644 index 000000000..30076102b --- /dev/null +++ b/src/Text/Pandoc/Readers/Man.hs @@ -0,0 +1,528 @@ +{-# LANGUAGE FlexibleContexts #-} +{- + Copyright (C) 2018 Yan Pashkovsky <yanp.bugz@gmail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + + +-} + +{- | + Module : Text.Pandoc.Readers.Man + Copyright : Copyright (C) 2018 Yan Pashkovsky + License : GNU GPL, version 2 or above + + Maintainer : Yan Pashkovsky <yanp.bugz@gmail.com> + Stability : WIP + Portability : portable + +Conversion of man to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Man (readMan, testFile) where + +import Prelude +import Control.Monad (liftM) +import Control.Monad.Except (throwError) +import Data.Char (isDigit, isUpper, isLower) +import Data.Default (Default) +import Data.Map (insert) +import Data.Maybe (catMaybes, fromMaybe, isNothing) +import Data.List (intersperse, intercalate) +import qualified Data.Text as T + +import Text.Pandoc.Class (PandocMonad(..), runIOorExplode) +import Text.Pandoc.Definition +import Text.Pandoc.Error (PandocError (PandocParsecError)) +import Text.Pandoc.Logging (LogMessage(..)) +import Text.Pandoc.Options +import Text.Pandoc.Parsing +import Text.Pandoc.Shared (crFilter) +import Text.Parsec hiding (tokenPrim) +import Text.Parsec.Char () +import Text.Parsec.Pos (updatePosString) + +-- +-- Data Types +-- + +data FontKind = Regular | Italic | Bold | ItalicBold deriving Show + +data MacroKind = KTitle + | KCodeBlStart + | KCodeBlEnd + | KTab + | KTabEnd + | KSubTab + deriving (Show, Eq) + +type RoffStr = (String, FontKind) + +data ManToken = MStr RoffStr + | MLine [RoffStr] + | MMaybeLink String + | MEmptyLine + | MHeader Int [RoffStr] + | MMacro MacroKind [RoffStr] + | MUnknownMacro String [RoffStr] + | MComment String + deriving Show + +data EscapeThing = EFont FontKind + | EChar Char + | ENothing + deriving Show + +data RoffState = RoffState { fontKind :: FontKind + } deriving Show + +instance Default RoffState where + def = RoffState {fontKind = Regular} + +type ManLexer m = ParserT [Char] RoffState m +type ManParser m = ParserT [ManToken] ParserState m + +---- +-- testStrr :: [Char] -> Either PandocError Pandoc +-- testStrr s = runPure $ readMan def (T.pack s) + +printPandoc :: Pandoc -> [Char] +printPandoc (Pandoc m content) = + let ttl = "Pandoc: " ++ (show $ unMeta m) + cnt = intercalate "\n" $ map show content + in ttl ++ "\n" ++ cnt + +-- strrepr :: Either PandocError Pandoc -> [Char] +-- strrepr obj = case obj of +-- Right x -> printPandoc x +-- Left y -> show y + +testFile :: FilePath -> IO () +testFile fname = do + cont <- readFile fname + pand <- runIOorExplode $ readMan def (T.pack cont) + putStrLn $ printPandoc pand +---- + + +-- | Read man (troff) from an input string and return a Pandoc document. +readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc +readMan opts txt = do + eithertokens <- readWithM lexMan def (T.unpack $ crFilter txt) + case eithertokens of + Right tokenz -> do + let state = def {stateOptions = opts} :: ParserState + eitherdoc <- readWithMTokens parseMan state tokenz + case eitherdoc of + Right doc -> return doc + Left e -> throwError e + Left e -> throwError e + + where + + readWithMTokens :: PandocMonad m + => ParserT [ManToken] ParserState m a -- ^ parser + -> ParserState -- ^ initial state + -> [ManToken] -- ^ input + -> m (Either PandocError a) + readWithMTokens parser state input = + mapLeft (PandocParsecError . (intercalate "\n") $ show <$> input) `liftM` runParserT parser state "source" input + + mapLeft :: (a -> c) -> Either a b -> Either c b + mapLeft f (Left x) = Left $ f x + mapLeft _ (Right r) = Right r + +-- +-- String -> ManToken function +-- + +lexMan :: PandocMonad m => ManLexer m [ManToken] +lexMan = many (lexMacro <|> lexLine <|> lexEmptyLine) + +parseMan :: PandocMonad m => ManParser m Pandoc +parseMan = do + let parsers = [ try parseList, parseTitle, parsePara, parseSkippedContent + , parseCodeBlock, parseHeader, parseSkipMacro] + blocks <- many $ choice parsers + parserst <- getState + return $ Pandoc (stateMeta parserst) (filter (not . isNull) blocks) + + where + + isNull Null = True + isNull _ = False + +eofline :: Stream s m Char => ParsecT s u m () +eofline = (newline >> return ()) <|> eof + +spacetab :: Stream s m Char => ParsecT s u m Char +spacetab = char ' ' <|> char '\t' + +-- TODO handle more cases +escapeLexer :: PandocMonad m => ManLexer m EscapeThing +escapeLexer = do + char '\\' + choice [escChar, escFont] + where + + escChar :: PandocMonad m => ManLexer m EscapeThing + escChar = + let skipSeqs = ["%", "{", "}", "&"] + subsSeqs = [ ("-", '-'), (" ", ' '), ("\\", '\\'), ("[lq]", '“'), ("[rq]", '”') + , ("[em]", '—'), ("[en]", '–') ] + substitute :: PandocMonad m => (String,Char) -> ManLexer m EscapeThing + substitute (from,to) = try $ string from >> return (EChar to) + skip :: PandocMonad m => String -> ManLexer m EscapeThing + skip seq' = try $ string seq' >> return ENothing + in choice $ (substitute <$> subsSeqs) ++ + (skip <$> skipSeqs) ++ + [ char '(' >> anyChar >> return ENothing + , char '[' >> many alphaNum >> char ']' >> return ENothing + ] + + escFont :: PandocMonad m => ManLexer m EscapeThing + escFont = do + char 'f' + font <- choice [ letterFont + , char '(' >> anyChar >> anyChar >> return Regular + , try (char '[' >> letterFont >>= \f -> char ']' >> return f) + , try $ string "[BI]" >> return ItalicBold + , char '[' >> many letter >> char ']' >> return Regular + ] + modifyState (\r -> r {fontKind = font}) + return $ EFont font + + where + + letterFont :: PandocMonad m => ManLexer m FontKind + letterFont = choice [ + char 'B' >> return Bold + , char 'I' >> return Italic + , (char 'P' <|> char 'R') >> return Regular + ] + +currentFont :: PandocMonad m => ManLexer m FontKind +currentFont = do + RoffState {fontKind = fk} <- getState + return fk + +lexMacro :: PandocMonad m => ManLexer m ManToken +lexMacro = do + char '.' <|> char '\'' + many spacetab + macroName <- many1 (letter <|> oneOf ['\\', '"']) + args <- lexArgs + let joinedArgs = unwords $ fst <$> args + knownMacro mkind = MMacro mkind args + + tok = case macroName of + x | x `elem` ["\\\"", "\\#"] -> MComment joinedArgs + "TH" -> knownMacro KTitle + "IP" -> knownMacro KTab + "TP" -> knownMacro KTab + "RE" -> knownMacro KTabEnd + "RS" -> knownMacro KSubTab + "nf" -> knownMacro KCodeBlStart + "fi" -> knownMacro KCodeBlEnd + "B" -> MStr (joinedArgs,Bold) + "BR" -> MMaybeLink joinedArgs + x | x `elem` ["BI", "IB"] -> MStr (joinedArgs, ItalicBold) + x | x `elem` ["I", "IR", "RI"] -> MStr (joinedArgs, Italic) + "SH" -> MHeader 2 args + "SS" -> MHeader 3 args + x | x `elem` [ "P", "PP", "LP", "sp"] -> MEmptyLine + _ -> MUnknownMacro macroName args + return tok + + where + + -- TODO rework args + lexArgs :: PandocMonad m => ManLexer m [RoffStr] + lexArgs = do + args <- many oneArg + eofline + return args + + where + + oneArg :: PandocMonad m => ManLexer m RoffStr + oneArg = do + many1 spacetab + quotedArg <|> plainArg + + plainArg :: PandocMonad m => ManLexer m RoffStr + plainArg = do + arg <- many1 $ escChar <|> (Just <$> noneOf " \t\n") + f <- currentFont + return (catMaybes arg, f) + + quotedArg :: PandocMonad m => ManLexer m RoffStr + quotedArg = do + char '"' + val <- catMaybes <$> many quotedChar + char '"' + f <- currentFont + return (val, f) + + quotedChar :: PandocMonad m => ManLexer m (Maybe Char) + quotedChar = escChar <|> (Just <$> noneOf "\"\n") <|> (Just <$> try (string "\"\"" >> return '"')) + + escChar :: PandocMonad m => ManLexer m (Maybe Char) + escChar = do + ec <- escapeLexer + case ec of + (EChar c) -> return $ Just c + _ -> return Nothing + +lexLine :: PandocMonad m => ManLexer m ManToken +lexLine = do + lnparts <- many1 (esc <|> linePart) + eofline + return $ MLine $ catMaybes lnparts + where + + esc :: PandocMonad m => ManLexer m (Maybe (String, FontKind)) + esc = do + someesc <- escapeLexer + font <- currentFont + let rv = case someesc of + EChar c -> Just ([c], font) + _ -> Nothing + return rv + + linePart :: PandocMonad m => ManLexer m (Maybe (String, FontKind)) + linePart = do + lnpart <- many1 $ noneOf "\n\\" + font <- currentFont + return $ Just (lnpart, font) + + +lexEmptyLine :: PandocMonad m => ManLexer m ManToken +lexEmptyLine = char '\n' >> return MEmptyLine + +-- +-- ManToken parsec functions +-- + +msatisfy :: (Show t, Stream s m t) => (t -> Bool) -> ParserT s st m t +msatisfy predic = tokenPrim show nextPos testTok + where + testTok t = if predic t then Just t else Nothing + nextPos pos _x _xs = updatePosString (setSourceColumn (setSourceLine pos $ sourceLine pos + 1) 1) ("") + +mstr :: PandocMonad m => ManParser m ManToken +mstr = msatisfy isMStr where + isMStr (MStr _) = True + isMStr _ = False + +mline :: PandocMonad m => ManParser m ManToken +mline = msatisfy isMLine where + isMLine (MLine _) = True + isMLine _ = False + +mmaybeLink :: PandocMonad m => ManParser m ManToken +mmaybeLink = msatisfy isMMaybeLink where + isMMaybeLink (MMaybeLink _) = True + isMMaybeLink _ = False + +memplyLine :: PandocMonad m => ManParser m ManToken +memplyLine = msatisfy isMEmptyLine where + isMEmptyLine MEmptyLine = True + isMEmptyLine _ = False + +mheader :: PandocMonad m => ManParser m ManToken +mheader = msatisfy isMHeader where + isMHeader (MHeader _ _) = True + isMHeader _ = False + +mmacro :: PandocMonad m => MacroKind -> ManParser m ManToken +mmacro mk = msatisfy isMMacro where + isMMacro (MMacro mk' _) | mk == mk' = True + | otherwise = False + isMMacro _ = False + +mmacroAny :: PandocMonad m => ManParser m ManToken +mmacroAny = msatisfy isMMacro where + isMMacro (MMacro _ _) = True + isMMacro _ = False + +munknownMacro :: PandocMonad m => ManParser m ManToken +munknownMacro = msatisfy isMUnknownMacro where + isMUnknownMacro (MUnknownMacro _ _) = True + isMUnknownMacro _ = False + +mcomment :: PandocMonad m => ManParser m ManToken +mcomment = msatisfy isMComment where + isMComment (MComment _) = True + isMComment _ = False + +-- +-- ManToken -> Block functions +-- + +parseTitle :: PandocMonad m => ManParser m Block +parseTitle = do + (MMacro _ args) <- mmacro KTitle + if null args + then return Null + else do + let mantitle = fst $ head args + modifyState (changeTitle mantitle) + return $ Header 1 nullAttr [Str mantitle] + where + changeTitle title pst = + let meta = stateMeta pst + metaUp = Meta $ insert "title" (MetaString title) (unMeta meta) + in + pst {stateMeta = metaUp} + +parseSkippedContent :: PandocMonad m => ManParser m Block +parseSkippedContent = do + tok <- munknownMacro <|> mcomment <|> memplyLine + onToken tok + return Null + + where + + onToken :: PandocMonad m => ManToken -> ManParser m () + onToken (MUnknownMacro mname _) = do + pos <- getPosition + logMessage $ SkippedContent ("Unknown macro: " ++ mname) pos + onToken _ = return () + +strToInline :: RoffStr -> Inline +strToInline (s, Regular) = Str s +strToInline (s, Italic) = Emph [Str s] +strToInline (s, Bold) = Strong [Str s] +strToInline (s, ItalicBold) = Strong [Emph [Str s]] + +parsePara :: PandocMonad m => ManParser m Block +parsePara = Para <$> parseInlines + +parseInlines :: PandocMonad m => ManParser m [Inline] +parseInlines = do + inls <- many1 (strInl <|> lineInl <|> linkInl <|> comment) + let withspaces = intersperse [Space] inls + return $ concat withspaces + + where + + strInl :: PandocMonad m => ManParser m [Inline] + strInl = do + (MStr rstr) <- mstr + return [strToInline rstr] + + lineInl :: PandocMonad m => ManParser m [Inline] + lineInl = do + (MLine fragments) <- mline + return $ strToInline <$> fragments + + linkInl :: PandocMonad m => ManParser m [Inline] + linkInl = do + (MMaybeLink txt) <- mmaybeLink + let inls = case runParser linkParser () "" txt of + Right lnk -> lnk + Left _ -> [Strong [Str txt]] + return inls + + where + + -- assuming man pages are generated from Linux-like repository + linkParser :: Parsec String () [Inline] + linkParser = do + mpage <- many1 (alphaNum <|> char '_') + spacetab + char '(' + mansect <- digit + char ')' + other <- many anyChar + let manurl pagename section = "../"++section++"/"++pagename++"."++section + return $ [ Link nullAttr [Strong [Str mpage]] (manurl mpage [mansect], mpage) + , Strong [Str $ " ("++[mansect] ++ ")" + , Str other] + ] + + comment :: PandocMonad m => ManParser m [Inline] + comment = mcomment >> return [] + + +parseCodeBlock :: PandocMonad m => ManParser m Block +parseCodeBlock = do + mmacro KCodeBlStart + toks <- many (mstr <|> mline <|> mmaybeLink <|> memplyLine <|> munknownMacro <|> mcomment) + mmacro KCodeBlEnd + return $ CodeBlock nullAttr (intercalate "\n" . catMaybes $ extractText <$> toks) + + where + + extractText :: ManToken -> Maybe String + extractText (MStr (s, _)) = Just s + extractText (MLine ss) = Just . concat $ map fst ss -- TODO maybe unwords? + extractText (MMaybeLink s) = Just s + extractText MEmptyLine = Just "" -- string are intercalated with '\n', this prevents double '\n' + extractText _ = Nothing + +parseHeader :: PandocMonad m => ManParser m Block +parseHeader = do + (MHeader lvl ss) <- mheader + return $ Header lvl nullAttr $ intersperse Space $ strToInline <$> ss + +type ListBuilder = [[Block]] -> Block + +parseList :: PandocMonad m => ManParser m Block +parseList = do + xx <- many1 paras + let bls = map snd xx + let bldr = fst $ head xx + return $ bldr bls + + where + + macroIPInl :: [RoffStr] -> [Inline] + macroIPInl (x:_:[]) = [strToInline x, Space] + macroIPInl _ = [] + + listKind :: [RoffStr] -> Maybe ListBuilder + listKind (((c:_), _):_:[]) = + let params style = OrderedList (1, style, DefaultDelim) + in case c of + _ | isDigit c -> Just $ params Decimal + _ | isUpper c -> Just $ params UpperAlpha + _ | isLower c -> Just $ params LowerAlpha + _ -> Nothing + + listKind _ = Nothing + + paras :: PandocMonad m => ManParser m (ListBuilder, [Block]) + paras = do + (MMacro _ args) <- mmacro KTab + let lbuilderOpt = listKind args + lbuilder = fromMaybe BulletList lbuilderOpt + macroinl = macroIPInl args + inls <- parseInlines + let parainls = if isNothing lbuilderOpt then macroinl ++ inls else inls + subls <- many sublist + return $ (lbuilder, (Plain parainls) : subls) + + sublist :: PandocMonad m => ManParser m Block + sublist = do + mmacro KSubTab + bl <- parseList + mmacro KTabEnd + return bl + +-- In case of weird man file it will be parsed succesfully +parseSkipMacro :: PandocMonad m => ManParser m Block +parseSkipMacro = mmacroAny >> return Null |