{-# LANGUAGE FlexibleContexts #-} {- Copyright (C) 2018 Yan Pashkovsky and John MacFarlane This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | Module : Text.Pandoc.Readers.Man Copyright : Copyright (C) 2018 Yan Pashkovsky and John MacFarlane License : GNU GPL, version 2 or above Maintainer : Yan Pashkovsky Stability : WIP Portability : portable Conversion of man to 'Pandoc' document. -} module Text.Pandoc.Readers.Man (readMan) where import Prelude import Data.Char (toLower) import Data.Default (Default) import Control.Monad (liftM, mzero, guard, void) import Control.Monad.Trans (lift) import Control.Monad.Except (throwError) import Text.Pandoc.Class (PandocMonad(..), report) import Data.Maybe (catMaybes, isJust) import Data.List (intersperse, intercalate) import qualified Data.Text as T import Text.Pandoc.Builder as B import Text.Pandoc.Error (PandocError (PandocParsecError)) import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Options import Text.Pandoc.Parsing import Text.Pandoc.Walk (query) import Text.Pandoc.Shared (crFilter) import Text.Pandoc.Readers.Roff -- TODO explicit imports import Text.Parsec hiding (tokenPrim) import qualified Text.Parsec as Parsec import Text.Parsec.Pos (updatePosString, initialPos) import qualified Data.Foldable as Foldable data ManState = ManState { readerOptions :: ReaderOptions , metadata :: Meta , tableCellsPlain :: Bool } deriving Show instance Default ManState where def = ManState { readerOptions = def , metadata = nullMeta , tableCellsPlain = True } type ManParser m = ParserT [RoffToken] ManState m -- | Read man (troff) from an input string and return a Pandoc document. readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc readMan opts txt = do tokenz <- lexRoff (initialPos "input") (crFilter txt) let state = def {readerOptions = opts} :: ManState eitherdoc <- readWithMTokens parseMan state (Foldable.toList . unRoffTokens $ tokenz) either throwError return eitherdoc readWithMTokens :: PandocMonad m => ParserT [RoffToken] ManState m a -- ^ parser -> ManState -- ^ initial state -> [RoffToken] -- ^ input -> m (Either PandocError a) readWithMTokens parser state input = let leftF = PandocParsecError . intercalate "\n" $ show <$> input in mapLeft leftF `liftM` runParserT parser state "source" input mapLeft :: (a -> c) -> Either a b -> Either c b mapLeft f (Left x) = Left $ f x mapLeft _ (Right r) = Right r parseMan :: PandocMonad m => ManParser m Pandoc parseMan = do bs <- many parseBlock <* eof meta <- metadata <$> getState let (Pandoc _ blocks) = doc $ mconcat bs return $ Pandoc meta blocks parseBlock :: PandocMonad m => ManParser m Blocks parseBlock = choice [ parseList , parseDefinitionList , parseHeader , parseTable , parseTitle , parseCodeBlock , parseBlockQuote , parseNewParagraph , parsePara , skipUnkownMacro ] parseTable :: PandocMonad m => ManParser m Blocks parseTable = do modifyState $ \st -> st { tableCellsPlain = True } let isMTable (MTable{}) = True isMTable _ = False MTable _opts rows pos <- msatisfy isMTable case rows of ((as,_):_) -> try (do let as' = map (columnTypeToAlignment . columnType) as guard $ all isJust as' let alignments = catMaybes as' let (headerRow', bodyRows') = case rows of (h:x:bs) | isHrule x -> (h, bs) _ -> (([],[]), rows) headerRow <- mapM parseTableCell $ snd headerRow' bodyRows <- mapM (mapM parseTableCell . snd) bodyRows' isPlainTable <- tableCellsPlain <$> getState let widths = if isPlainTable then repeat 0.0 else repeat ((1.0 / fromIntegral (length alignments)) :: Double) return $ B.table mempty (zip alignments widths) headerRow bodyRows) <|> fallback pos [] -> fallback pos where parseTableCell ts = do st <- getState let ts' = Foldable.toList $ unRoffTokens ts let plaintcell = try $ do skipMany memptyLine plain . trimInlines <$> (parseInlines <* eof) let blockstcell = try $ do skipMany memptyLine mconcat <$> many parseBlock <* eof res <- if null ts' then return $ Right mempty else lift $ readWithMTokens plaintcell st ts' case res of Left _ -> do res' <- lift $ readWithMTokens blockstcell st ts' case res' of Left _ -> fail "Could not parse table cell" Right x -> do modifyState $ \s -> s{ tableCellsPlain = False } return x Right x -> return x isHrule :: TableRow -> Bool isHrule ([cellfmt], _) = columnType cellfmt `elem` ['_','-','='] isHrule (_, [RoffTokens ss]) = case Foldable.toList ss of [MLine [RoffStr [c]]] -> c `elem` ['_','-','='] _ -> False isHrule _ = False fallback pos = do report $ SkippedContent "TABLE" pos return $ B.para (B.text "TABLE") columnTypeToAlignment :: Char -> Maybe Alignment columnTypeToAlignment c = case toLower c of 'a' -> Just AlignLeft 'c' -> Just AlignCenter 'l' -> Just AlignLeft 'n' -> Just AlignRight 'r' -> Just AlignRight _ -> Nothing parseNewParagraph :: PandocMonad m => ManParser m Blocks parseNewParagraph = do mmacro "P" <|> mmacro "PP" <|> mmacro "LP" <|> memptyLine return mempty -- -- Parser: [RoffToken] -> Pandoc -- msatisfy :: Monad m => (RoffToken -> Bool) -> ParserT [RoffToken] st m RoffToken msatisfy predic = tokenPrim show nextPos testTok where testTok t = if predic t then Just t else Nothing nextPos _pos _x (MMacro _ _ pos':_) = pos' nextPos pos _x _xs = updatePosString (setSourceColumn (setSourceLine pos $ sourceLine pos + 1) 1) "" mtoken :: PandocMonad m => ManParser m RoffToken mtoken = msatisfy (const True) mline :: PandocMonad m => ManParser m RoffToken mline = msatisfy isMLine where isMLine (MLine _) = True isMLine _ = False memptyLine :: PandocMonad m => ManParser m RoffToken memptyLine = msatisfy isMEmptyLine where isMEmptyLine MEmptyLine = True isMEmptyLine _ = False mmacro :: PandocMonad m => MacroKind -> ManParser m RoffToken mmacro mk = msatisfy isMMacro where isMMacro (MMacro mk' _ _) | mk == mk' = True | otherwise = False isMMacro _ = False mmacroAny :: PandocMonad m => ManParser m RoffToken mmacroAny = msatisfy isMMacro where isMMacro MMacro{} = True isMMacro _ = False -- -- RoffToken -> Block functions -- parseTitle :: PandocMonad m => ManParser m Blocks parseTitle = do (MMacro _ args _) <- mmacro "TH" let adjustMeta = case args of (x:y:z:_) -> setMeta "title" (linePartsToInlines x) . setMeta "section" (linePartsToInlines y) . setMeta "date" (linePartsToInlines z) [x,y] -> setMeta "title" (linePartsToInlines x) . setMeta "section" (linePartsToInlines y) [x] -> setMeta "title" (linePartsToInlines x) [] -> id modifyState $ \st -> st{ metadata = adjustMeta $ metadata st } return mempty linePartsToInlines :: [LinePart] -> Inlines linePartsToInlines = go False where go :: Bool -> [LinePart] -> Inlines go _ [] = mempty go mono (MacroArg _:xs) = go mono xs -- shouldn't happen go mono (RoffStr s : RoffStr t : xs) = go mono (RoffStr (s <> t):xs) go mono (RoffStr s : xs) | mono = code s <> go mono xs | otherwise = text s <> go mono xs go mono (Font fs: xs) = if litals > 0 && litals >= lbolds && litals >= lmonos then emph (go mono (Font fs{ fontItalic = False } : map (adjustFontSpec (\s -> s{ fontItalic = False })) itals)) <> go mono italsrest else if lbolds > 0 && lbolds >= lmonos then strong (go mono (Font fs{ fontBold = False } : map (adjustFontSpec (\s -> s{ fontBold = False })) bolds)) <> go mono boldsrest else if lmonos > 0 then go True (Font fs{ fontMonospace = False } : map (adjustFontSpec (\s -> s { fontMonospace = False })) monos) <> go mono monosrest else go mono xs where adjustFontSpec f (Font fspec) = Font (f fspec) adjustFontSpec _ x = x withFont f (Font fspec) = f fspec withFont _ _ = False litals = length itals lbolds = length bolds lmonos = length monos (itals, italsrest) = if fontItalic fs then break (withFont (not . fontItalic)) xs else ([], xs) (bolds, boldsrest) = if fontBold fs then break (withFont (not . fontBold)) xs else ([], xs) (monos, monosrest) = if fontMonospace fs then break (withFont (not . fontMonospace)) xs else ([], xs) go mono (FontSize _fs : xs) = go mono xs parsePara :: PandocMonad m => ManParser m Blocks parsePara = para . trimInlines <$> parseInlines parseInlines :: PandocMonad m => ManParser m Inlines parseInlines = mconcat . intersperse B.space <$> many1 parseInline parseInline :: PandocMonad m => ManParser m Inlines parseInline = try $ do tok <- mtoken case tok of MLine lparts -> return $ linePartsToInlines lparts MMacro mname args pos -> handleInlineMacro mname args pos _ -> mzero handleInlineMacro :: PandocMonad m => String -> [Arg] -> SourcePos -> ManParser m Inlines handleInlineMacro mname args _pos = do case mname of "UR" -> parseLink args "MT" -> parseEmailLink args "B" -> parseBold args "I" -> parseItalic args "br" -> return linebreak "BI" -> parseAlternatingFonts [strong, emph] args "IB" -> parseAlternatingFonts [emph, strong] args "IR" -> parseAlternatingFonts [emph, id] args "RI" -> parseAlternatingFonts [id, emph] args "BR" -> parseAlternatingFonts [strong, id] args "RB" -> parseAlternatingFonts [id, strong] args "SY" -> return $ strong $ mconcat $ intersperse B.space $ map linePartsToInlines args "YS" -> return mempty "OP" -> case args of (x:ys) -> return $ B.space <> str "[" <> B.space <> mconcat (strong (linePartsToInlines x) : map ((B.space <>) . linePartsToInlines) ys) <> B.space <> str "]" [] -> return mempty _ -> mzero parseBold :: PandocMonad m => [Arg] -> ManParser m Inlines parseBold [] = do MLine lparts <- mline return $ strong $ linePartsToInlines lparts parseBold args = return $ strong $ mconcat $ intersperse B.space $ map linePartsToInlines args parseItalic :: PandocMonad m => [Arg] -> ManParser m Inlines parseItalic [] = do MLine lparts <- mline return $ emph $ linePartsToInlines lparts parseItalic args = return $ emph $ mconcat $ intersperse B.space $ map linePartsToInlines args parseAlternatingFonts :: PandocMonad m => [Inlines -> Inlines] -> [Arg] -> ManParser m Inlines parseAlternatingFonts constructors args = return $ mconcat $ zipWith (\f arg -> f (linePartsToInlines arg)) (cycle constructors) args lineInl :: PandocMonad m => ManParser m Inlines lineInl = do (MLine fragments) <- mline return $ linePartsToInlines fragments bareIP :: PandocMonad m => ManParser m RoffToken bareIP = msatisfy isBareIP where isBareIP (MMacro "IP" [] _) = True isBareIP _ = False endmacro :: PandocMonad m => String -> ManParser m () endmacro name = void (mmacro name) <|> lookAhead (void newBlockMacro) <|> lookAhead eof where newBlockMacro = msatisfy isNewBlockMacro isNewBlockMacro (MMacro "SH" _ _) = True isNewBlockMacro (MMacro "SS" _ _) = True isNewBlockMacro _ = False parseCodeBlock :: PandocMonad m => ManParser m Blocks parseCodeBlock = try $ do optional bareIP optional (mmacro "in") -- some people indent their code toks <- (mmacro "nf" *> manyTill codeline (endmacro "fi")) <|> (mmacro "EX" *> manyTill codeline (endmacro "EE")) optional (mmacro "in") return $ codeBlock (intercalate "\n" $ catMaybes toks) where codeline = do tok <- mtoken case tok of MMacro "PP" _ _ -> return $ Just "" -- .PP sometimes used for blank line MMacro mname args pos -> do (Just . query getText <$> handleInlineMacro mname args pos) <|> do report $ SkippedContent ('.':mname) pos return Nothing MTable _ _ pos -> do report $ SkippedContent "TABLE" pos return $ Just "TABLE" MEmptyLine -> return $ Just "" MLine ss | not (null ss) , all isFontToken ss -> return Nothing | otherwise -> return $ Just $ linePartsToString ss isFontToken FontSize{} = True isFontToken Font{} = True isFontToken _ = False getText :: Inline -> String getText (Str s) = s getText Space = " " getText (Code _ s) = s getText SoftBreak = "\n" getText LineBreak = "\n" getText _ = "" parseHeader :: PandocMonad m => ManParser m Blocks parseHeader = do MMacro name args _ <- mmacro "SH" <|> mmacro "SS" contents <- if null args then lineInl else return $ mconcat $ intersperse B.space $ map linePartsToInlines args let lvl = if name == "SH" then 1 else 2 return $ header lvl contents parseBlockQuote :: PandocMonad m => ManParser m Blocks parseBlockQuote = blockQuote <$> continuation data ListType = Ordered ListAttributes | Bullet listTypeMatches :: Maybe ListType -> ListType -> Bool listTypeMatches Nothing _ = True listTypeMatches (Just Bullet) Bullet = True listTypeMatches (Just (Ordered (_,x,y))) (Ordered (_,x',y')) = x == x' && y == y' listTypeMatches (Just _) _ = False listItem :: PandocMonad m => Maybe ListType -> ManParser m (ListType, Blocks) listItem mbListType = try $ do (MMacro _ args _) <- mmacro "IP" case args of (arg1 : _) -> do let cs = linePartsToString arg1 let cs' = if not ('.' `elem` cs || ')' `elem` cs) then cs ++ "." else cs let lt = case Parsec.runParser anyOrderedListMarker defaultParserState "list marker" cs' of Right (start, listtype, listdelim) | cs == cs' -> Ordered (start, listtype, listdelim) | otherwise -> Ordered (start, listtype, DefaultDelim) Left _ -> Bullet guard $ listTypeMatches mbListType lt inls <- option mempty parseInlines continuations <- mconcat <$> many continuation return (lt, para inls <> continuations) [] -> mzero parseList :: PandocMonad m => ManParser m Blocks parseList = try $ do (lt, x) <- listItem Nothing xs <- map snd <$> many (listItem (Just lt)) return $ case lt of Bullet -> bulletList (x:xs) Ordered lattr -> orderedListWith lattr (x:xs) continuation :: PandocMonad m => ManParser m Blocks continuation = mconcat <$> (mmacro "RS" *> manyTill parseBlock (endmacro "RE")) <|> mconcat <$> many1 ( try (bareIP *> parsePara) <|> try (bareIP *> parseCodeBlock) ) definitionListItem :: PandocMonad m => ManParser m (Inlines, [Blocks]) definitionListItem = try $ do mmacro "TP" -- args specify indent level, can ignore term <- parseInline moreterms <- many $ try $ do mmacro "TQ" parseInline inls <- option mempty parseInlines continuations <- mconcat <$> many continuation return ( mconcat (intersperse B.linebreak (term:moreterms)) , [para inls <> continuations]) parseDefinitionList :: PandocMonad m => ManParser m Blocks parseDefinitionList = definitionList <$> many1 definitionListItem parseLink :: PandocMonad m => [Arg] -> ManParser m Inlines parseLink args = do contents <- mconcat <$> many lineInl MMacro _ endargs _ <- mmacro "UE" let url = case args of [] -> "" (x:_) -> linePartsToString x return $ link url "" contents <> case endargs of [] -> mempty (x:_) -> linePartsToInlines x parseEmailLink :: PandocMonad m => [Arg] -> ManParser m Inlines parseEmailLink args = do contents <- mconcat <$> many lineInl MMacro _ endargs _ <- mmacro "ME" let url = case args of [] -> "" (x:_) -> "mailto:" ++ linePartsToString x return $ link url "" contents <> case endargs of [] -> mempty (x:_) -> linePartsToInlines x skipUnkownMacro :: PandocMonad m => ManParser m Blocks skipUnkownMacro = do tok <- mmacroAny case tok of MMacro mkind _ pos -> do report $ SkippedContent ('.':mkind) pos return mempty _ -> fail "the impossible happened"