From e4726518afe2c4802c351f6f785032ff4e7e6a35 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Wed, 24 Oct 2018 22:04:15 -0700 Subject: T.P.Readers.Groff: improve LinePart. Separate font change and font size change tokens. With this change, emphasis no longer works. This needs to be implemented in the parser, not the lexer. --- src/Text/Pandoc/Readers/Groff.hs | 217 +++++++++++++++++++-------------------- src/Text/Pandoc/Readers/Man.hs | 43 +++----- 2 files changed, 121 insertions(+), 139 deletions(-) diff --git a/src/Text/Pandoc/Readers/Groff.hs b/src/Text/Pandoc/Readers/Groff.hs index 90ae8561a..94cc96b24 100644 --- a/src/Text/Pandoc/Readers/Groff.hs +++ b/src/Text/Pandoc/Readers/Groff.hs @@ -33,7 +33,6 @@ Tokenizer for groff formats (man, ms). -} module Text.Pandoc.Readers.Groff ( FontKind(..) - , Font , MacroKind , LinePart(..) , Arg @@ -54,8 +53,6 @@ import Text.Pandoc.Class import Data.Char (isHexDigit, chr, ord, isAscii, isAlphaNum, isSpace) import Data.Default (Default) import qualified Data.Map as M -import Data.Set (Set) -import qualified Data.Set as S import Data.List (intercalate, isSuffixOf) import qualified Data.Text as T import Text.Pandoc.Builder as B @@ -79,9 +76,9 @@ data FontKind = Bold | Italic | Monospace | Regular deriving (Show, Eq, Ord) type MacroKind = String -type Font = Set FontKind - -data LinePart = RoffStr (String, Font) +data LinePart = RoffStr String + | Font [FontKind] + | FontSize Int | MacroArg Int deriving Show @@ -100,20 +97,19 @@ newtype ManTokens = ManTokens { unManTokens :: Seq.Seq ManToken } singleTok :: ManToken -> ManTokens singleTok t = ManTokens (Seq.singleton t) -data RoffState = RoffState { fontKind :: Font - , customMacros :: M.Map String ManTokens +data RoffState = RoffState { customMacros :: M.Map String ManTokens } deriving Show instance Default RoffState where def = RoffState { customMacros = M.fromList $ map (\(n, s) -> (n, singleTok - (MLine [RoffStr (s, mempty)]))) + (MLine [RoffStr s]))) [ ("Tm", "\x2122") , ("lq", "\x201C") , ("rq", "\x201D") , ("R", "\x00AE") ] - , fontKind = S.singleton Regular } + } type ManLexer m = ParserT [Char] RoffState m @@ -135,15 +131,16 @@ combiningAccentsMap :: M.Map String Char combiningAccentsMap = M.fromList $ map (\(x,y) -> (y,x)) combiningAccents -escapeLexer :: PandocMonad m => ManLexer m String -escapeLexer = try $ do +escape :: PandocMonad m => ManLexer m [LinePart] +escape = do char '\\' - c <- noneOf ['*','$'] -- see escStar, macroArg + c <- anyChar case c of - '(' -> twoCharGlyph - '[' -> bracketedGlyph 'f' -> escFont 's' -> escFontSize + '*' -> escStar + '(' -> twoCharGlyph + '[' -> bracketedGlyph '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment '#' -> mempty <$ manyTill anyChar newline '%' -> return mempty @@ -154,27 +151,27 @@ escapeLexer = try $ do ':' -> return mempty '0' -> return mempty 'c' -> return mempty - '-' -> return "-" - '_' -> return "_" - ' ' -> return " " - '\\' -> return "\\" - 't' -> return "\t" - 'e' -> return "\\" - '`' -> return "`" - '^' -> return " " - '|' -> return " " - '\'' -> return "`" - '.' -> return "`" - '~' -> return "\160" -- nonbreaking space - _ -> escUnknown ['\\',c] "\xFFFD" + '-' -> return [RoffStr "-"] + '_' -> return [RoffStr "_"] + ' ' -> return [RoffStr " "] + '\\' -> return [RoffStr "\\"] + 't' -> return [RoffStr "\t"] + 'e' -> return [RoffStr "\\"] + '`' -> return [RoffStr "`"] + '^' -> return [RoffStr " "] + '|' -> return [RoffStr " "] + '\'' -> return [RoffStr "`"] + '.' -> return [RoffStr "`"] + '~' -> return [RoffStr "\160"] -- nonbreaking space + _ -> escUnknown ['\\',c] where twoCharGlyph = do cs <- count 2 anyChar case M.lookup cs characterCodeMap of - Just c -> return [c] - Nothing -> escUnknown ('\\':'(':cs) "\xFFFD" + Just c -> return [RoffStr [c]] + Nothing -> escUnknown ('\\':'(':cs) bracketedGlyph = unicodeGlyph <|> charGlyph @@ -184,7 +181,7 @@ escapeLexer = try $ do [] -> mzero [s] -> case M.lookup s characterCodeMap of Nothing -> mzero - Just c -> return [c] + Just c -> return [RoffStr [c]] (s:ss) -> do basechar <- case M.lookup cs characterCodeMap of Nothing -> @@ -200,10 +197,12 @@ escapeLexer = try $ do case M.lookup a combiningAccentsMap of Just x -> addAccents as (x:xs) Nothing -> mzero - addAccents ss [basechar]) - <|> escUnknown ("\\[" ++ cs ++ "]") "\xFFFD" + addAccents ss [basechar] >>= \xs -> return [RoffStr xs]) + <|> escUnknown ("\\[" ++ cs ++ "]") - unicodeGlyph = try $ ucharCode `sepBy1` (char '_') <* char ']' + unicodeGlyph = try $ do + xs <- ucharCode `sepBy1` (char '_') <* char ']' + return [RoffStr xs] ucharCode = try $ do char 'u' @@ -214,53 +213,66 @@ escapeLexer = try $ do Nothing -> mzero Just c -> return c - -- \s-1 \s0 -- we ignore these - escFontSize :: PandocMonad m => ManLexer m String - escFontSize = do - pos <- getPosition - pm <- option "" $ count 1 (oneOf "+-") - ds <- many1 digit - report $ SkippedContent ("\\s" ++ pm ++ ds) pos - return mempty - - escFont :: PandocMonad m => ManLexer m String - escFont = do - font <- choice - [ S.singleton <$> letterFontKind - , char '(' >> anyChar >> anyChar >> return (S.singleton Regular) - , char 'S' >> return (S.singleton Regular) - , try lettersFont - , digit >> return (S.singleton Regular) - ] - modifyState (\r -> r {fontKind = font}) - return mempty - - lettersFont :: PandocMonad m => ManLexer m Font - lettersFont = do - char '[' - fs <- many letterFontKind - skipMany letter - char ']' - return $ S.fromList fs - - letterFontKind :: PandocMonad m => ManLexer m FontKind - letterFontKind = choice [ - oneOf ['B','b'] >> return Bold - , oneOf ['I','i'] >> return Italic - , oneOf ['C','c'] >> return Monospace - , oneOf ['P','p','R','r'] >> return Regular - ] - - escUnknown :: PandocMonad m => String -> a -> ManLexer m a - escUnknown s x = do + escUnknown :: PandocMonad m => String -> ManLexer m [LinePart] + escUnknown s = do pos <- getPosition report $ SkippedContent ("Unknown escape sequence " ++ s) pos - return x - -currentFont :: PandocMonad m => ManLexer m Font -currentFont = fontKind <$> getState + return [RoffStr "\xFFFD"] + +-- \s-1 \s0 +escFontSize :: PandocMonad m => ManLexer m [LinePart] +escFontSize = do + let sign = option "" $ count 1 (oneOf "+-") + let toFontSize xs = + case safeRead xs of + Nothing -> mzero + Just n -> return [FontSize n] + choice + [ do char '(' + s <- sign + ds <- count 2 digit + toFontSize (s ++ ds) + , do char '[' + s <- sign + ds <- many1 digit + char ']' + toFontSize (s ++ ds) + , do s <- sign + ds <- count 1 digit + toFontSize (s ++ ds) + ] --- separate function from lexMacro since real man files sometimes do not follow the rules +escFont :: PandocMonad m => ManLexer m [LinePart] +escFont = do + font <- choice + [ char 'S' >> return [Regular] + , digit >> return [Regular] + , (:[]) <$> letterFontKind + , char '(' >> anyChar >> anyChar >> return [Regular] + , lettersFont + , digit >> return [Regular] + ] + return [Font font] + +lettersFont :: PandocMonad m => ManLexer m [FontKind] +lettersFont = try $ do + char '[' + fs <- many letterFontKind + skipMany letter + char ']' + return fs + +letterFontKind :: PandocMonad m => ManLexer m FontKind +letterFontKind = choice [ + oneOf ['B','b'] >> return Bold + , oneOf ['I','i'] >> return Italic + , oneOf ['C','c'] >> return Monospace + , oneOf ['P','p','R','r'] >> return Regular + ] + + +-- separate function from lexMacro since real man files sometimes do not +-- follow the rules lexComment :: PandocMonad m => ManLexer m ManTokens lexComment = do try $ string ".\\\"" @@ -310,11 +322,11 @@ lexTable = do lexConditional :: PandocMonad m => ManLexer m ManTokens lexConditional = do skipMany spacetab - parseNCond <|> skipConditional + lexNCond <|> skipConditional -- n means nroff mode -parseNCond :: PandocMonad m => ManLexer m ManTokens -parseNCond = do +lexNCond :: PandocMonad m => ManLexer m ManTokens +lexNCond = do char '\n' many1 spacetab lexGroup <|> manToken @@ -355,11 +367,11 @@ resolveMacro macroName args pos = do case M.lookup macroName macros of Nothing -> return $ singleTok $ MMacro macroName args pos Just ts -> do - let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs - fillLP (MacroArg i) zs = + let fillLP (MacroArg i) zs = case drop (i - 1) args of [] -> zs (ys:_) -> ys ++ zs + fillLP z zs = z : zs let fillMacroArg (MLine lineparts) = MLine (foldr fillLP [] lineparts) fillMacroArg x = x @@ -370,7 +382,7 @@ lexStringDef args = do -- string definition case args of [] -> fail "No argument to .ds" (x:ys) -> do - let ts = singleTok $ MLine (intercalate [RoffStr (" ", mempty)] ys) + let ts = singleTok $ MLine (intercalate [RoffStr " " ] ys) let stringName = linePartsToString x modifyState $ \st -> st{ customMacros = M.insert stringName ts (customMacros st) } @@ -413,21 +425,16 @@ lexArgs = do plainArg :: PandocMonad m => ManLexer m [LinePart] plainArg = do skipMany spacetab - mconcat <$> many1 - (macroArg <|> esc <|> regularText <|> unescapedQuote <|> escStar) + mconcat <$> many1 (macroArg <|> escape <|> regularText <|> unescapedQuote) where - unescapedQuote = do - char '"' - fonts <- currentFont - return [RoffStr ("\"", fonts)] - + unescapedQuote = char '"' >> return [RoffStr "\""] quotedArg :: PandocMonad m => ManLexer m [LinePart] quotedArg = do skipMany spacetab char '"' xs <- mconcat <$> - many (macroArg <|> esc <|> escStar <|> regularText + many (macroArg <|> escape <|> regularText <|> spaceTabChar <|> escapedQuote) char '"' return xs @@ -435,14 +442,11 @@ lexArgs = do escapedQuote = try $ do char '"' char '"' - fonts <- currentFont - return [RoffStr ("\"", fonts)] + return [RoffStr "\""] escStar :: PandocMonad m => ManLexer m [LinePart] escStar = try $ do pos <- getPosition - char '\\' - char '*' c <- anyChar case c of '(' -> do @@ -474,11 +478,11 @@ lexLine = do where -- return empty line if we only have empty strings; -- this can happen if the line just contains \f[C], for example. go [] = return mempty - go (RoffStr ("",_):xs) = go xs + go (RoffStr "" : xs) = go xs go xs = return $ singleTok $ MLine xs linePart :: PandocMonad m => ManLexer m [LinePart] -linePart = macroArg <|> esc <|> escStar <|> +linePart = macroArg <|> escape <|> regularText <|> quoteChar <|> spaceTabChar macroArg :: PandocMonad m => ManLexer m [LinePart] @@ -487,29 +491,20 @@ macroArg = try $ do x <- digit return [MacroArg $ ord x - ord '0'] -esc :: PandocMonad m => ManLexer m [LinePart] -esc = do - s <- escapeLexer - font <- currentFont - return [RoffStr (s, font)] - regularText :: PandocMonad m => ManLexer m [LinePart] regularText = do s <- many1 $ noneOf "\n\r\t \\\"" - font <- currentFont - return [RoffStr (s, font)] + return [RoffStr s] quoteChar :: PandocMonad m => ManLexer m [LinePart] quoteChar = do char '"' - font <- currentFont - return [RoffStr ("\"", font)] + return [RoffStr "\""] spaceTabChar :: PandocMonad m => ManLexer m [LinePart] spaceTabChar = do c <- spacetab - font <- currentFont - return [RoffStr ([c], font)] + return [RoffStr [c]] lexEmptyLine :: PandocMonad m => ManLexer m ManTokens lexEmptyLine = char '\n' >> return (singleTok MEmptyLine) @@ -520,5 +515,5 @@ manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine linePartsToString :: [LinePart] -> String linePartsToString = mconcat . map go where - go (RoffStr (s, _)) = s + go (RoffStr s) = s go _ = mempty diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 50ec0c019..ee7051213 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -39,7 +39,6 @@ import Control.Monad (liftM, mzero, guard) import Control.Monad.Except (throwError) import Text.Pandoc.Class (PandocMonad(..), report) import Data.Maybe (catMaybes) -import qualified Data.Set as S import Data.List (intersperse, intercalate) import qualified Data.Text as T import Text.Pandoc.Builder as B @@ -174,34 +173,16 @@ parseTitle = do return mempty linePartsToInlines :: [LinePart] -> Inlines -linePartsToInlines = go +linePartsToInlines = go [] where - go [] = mempty - go (MacroArg _:xs) = go xs -- shouldn't happen - go xs@(RoffStr{} : _) = - if lb > 0 && lb >= li - then strong (go (removeFont Bold bolds)) <> go (drop lb xs) - else if li > 0 - then emph (go (removeFont Italic italics)) <> go (drop li xs) - else text (linePartsToString regulars) <> go (drop lr xs) - - where - (lb, li, lr) = (length bolds, length italics, length regulars) - - removeFont font = map (removeFont' font) - removeFont' font (RoffStr (s,f)) = RoffStr (s, S.delete font f) - removeFont' _ x = x - - bolds = takeWhile isBold xs - italics = takeWhile isItalic xs - regulars = takeWhile (\x -> not (isBold x || isItalic x)) xs - - isBold (RoffStr (_,f)) = Bold `S.member` f - isBold _ = False - - isItalic (RoffStr (_,f)) = Italic `S.member` f - isItalic _ = False + go :: [[FontKind]] -> [LinePart] -> Inlines + go _ [] = mempty + go fs (MacroArg _:xs) = go fs xs -- shouldn't happen + go fs (RoffStr s : xs) = text s <> go fs xs + go (_:fs) (Font [] : xs) = go fs xs -- return to previous font + go fs (Font _newfonts : xs) = go fs xs + go fonts (FontSize _fs : xs) = go fonts xs parsePara :: PandocMonad m => ManParser m Blocks parsePara = para . trimInlines <$> parseInlines @@ -289,7 +270,13 @@ parseCodeBlock = try $ do where extractText :: ManToken -> Maybe String - extractText (MLine ss) = Just $ linePartsToString ss + extractText (MLine ss) + | not (null ss) + , all isFontToken ss = Nothing + | otherwise = Just $ linePartsToString ss + where isFontToken (FontSize{}) = True + isFontToken (Font{}) = True + isFontToken _ = False extractText MEmptyLine = Just "" -- string are intercalated with '\n', this prevents double '\n' extractText _ = Nothing -- cgit v1.2.3