aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2018-10-24 22:04:15 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2018-10-24 22:04:51 -0700
commite4726518afe2c4802c351f6f785032ff4e7e6a35 (patch)
treebd8836c415bfcea857e7ee3c6dd715d77afcdcd0
parent6c71100fcf0abc609dda323a76c78b0838234044 (diff)
downloadpandoc-e4726518afe2c4802c351f6f785032ff4e7e6a35.tar.gz
T.P.Readers.Groff: improve LinePart.
Separate font change and font size change tokens. With this change, emphasis no longer works. This needs to be implemented in the parser, not the lexer.
-rw-r--r--src/Text/Pandoc/Readers/Groff.hs217
-rw-r--r--src/Text/Pandoc/Readers/Man.hs43
2 files changed, 121 insertions, 139 deletions
diff --git a/src/Text/Pandoc/Readers/Groff.hs b/src/Text/Pandoc/Readers/Groff.hs
index 90ae8561a..94cc96b24 100644
--- a/src/Text/Pandoc/Readers/Groff.hs
+++ b/src/Text/Pandoc/Readers/Groff.hs
@@ -33,7 +33,6 @@ Tokenizer for groff formats (man, ms).
-}
module Text.Pandoc.Readers.Groff
( FontKind(..)
- , Font
, MacroKind
, LinePart(..)
, Arg
@@ -54,8 +53,6 @@ import Text.Pandoc.Class
import Data.Char (isHexDigit, chr, ord, isAscii, isAlphaNum, isSpace)
import Data.Default (Default)
import qualified Data.Map as M
-import Data.Set (Set)
-import qualified Data.Set as S
import Data.List (intercalate, isSuffixOf)
import qualified Data.Text as T
import Text.Pandoc.Builder as B
@@ -79,9 +76,9 @@ data FontKind = Bold | Italic | Monospace | Regular deriving (Show, Eq, Ord)
type MacroKind = String
-type Font = Set FontKind
-
-data LinePart = RoffStr (String, Font)
+data LinePart = RoffStr String
+ | Font [FontKind]
+ | FontSize Int
| MacroArg Int
deriving Show
@@ -100,20 +97,19 @@ newtype ManTokens = ManTokens { unManTokens :: Seq.Seq ManToken }
singleTok :: ManToken -> ManTokens
singleTok t = ManTokens (Seq.singleton t)
-data RoffState = RoffState { fontKind :: Font
- , customMacros :: M.Map String ManTokens
+data RoffState = RoffState { customMacros :: M.Map String ManTokens
} deriving Show
instance Default RoffState where
def = RoffState { customMacros = M.fromList
$ map (\(n, s) ->
(n, singleTok
- (MLine [RoffStr (s, mempty)])))
+ (MLine [RoffStr s])))
[ ("Tm", "\x2122")
, ("lq", "\x201C")
, ("rq", "\x201D")
, ("R", "\x00AE") ]
- , fontKind = S.singleton Regular }
+ }
type ManLexer m = ParserT [Char] RoffState m
@@ -135,15 +131,16 @@ combiningAccentsMap :: M.Map String Char
combiningAccentsMap =
M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
-escapeLexer :: PandocMonad m => ManLexer m String
-escapeLexer = try $ do
+escape :: PandocMonad m => ManLexer m [LinePart]
+escape = do
char '\\'
- c <- noneOf ['*','$'] -- see escStar, macroArg
+ c <- anyChar
case c of
- '(' -> twoCharGlyph
- '[' -> bracketedGlyph
'f' -> escFont
's' -> escFontSize
+ '*' -> escStar
+ '(' -> twoCharGlyph
+ '[' -> bracketedGlyph
'"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
'#' -> mempty <$ manyTill anyChar newline
'%' -> return mempty
@@ -154,27 +151,27 @@ escapeLexer = try $ do
':' -> return mempty
'0' -> return mempty
'c' -> return mempty
- '-' -> return "-"
- '_' -> return "_"
- ' ' -> return " "
- '\\' -> return "\\"
- 't' -> return "\t"
- 'e' -> return "\\"
- '`' -> return "`"
- '^' -> return " "
- '|' -> return " "
- '\'' -> return "`"
- '.' -> return "`"
- '~' -> return "\160" -- nonbreaking space
- _ -> escUnknown ['\\',c] "\xFFFD"
+ '-' -> return [RoffStr "-"]
+ '_' -> return [RoffStr "_"]
+ ' ' -> return [RoffStr " "]
+ '\\' -> return [RoffStr "\\"]
+ 't' -> return [RoffStr "\t"]
+ 'e' -> return [RoffStr "\\"]
+ '`' -> return [RoffStr "`"]
+ '^' -> return [RoffStr " "]
+ '|' -> return [RoffStr " "]
+ '\'' -> return [RoffStr "`"]
+ '.' -> return [RoffStr "`"]
+ '~' -> return [RoffStr "\160"] -- nonbreaking space
+ _ -> escUnknown ['\\',c]
where
twoCharGlyph = do
cs <- count 2 anyChar
case M.lookup cs characterCodeMap of
- Just c -> return [c]
- Nothing -> escUnknown ('\\':'(':cs) "\xFFFD"
+ Just c -> return [RoffStr [c]]
+ Nothing -> escUnknown ('\\':'(':cs)
bracketedGlyph = unicodeGlyph <|> charGlyph
@@ -184,7 +181,7 @@ escapeLexer = try $ do
[] -> mzero
[s] -> case M.lookup s characterCodeMap of
Nothing -> mzero
- Just c -> return [c]
+ Just c -> return [RoffStr [c]]
(s:ss) -> do
basechar <- case M.lookup cs characterCodeMap of
Nothing ->
@@ -200,10 +197,12 @@ escapeLexer = try $ do
case M.lookup a combiningAccentsMap of
Just x -> addAccents as (x:xs)
Nothing -> mzero
- addAccents ss [basechar])
- <|> escUnknown ("\\[" ++ cs ++ "]") "\xFFFD"
+ addAccents ss [basechar] >>= \xs -> return [RoffStr xs])
+ <|> escUnknown ("\\[" ++ cs ++ "]")
- unicodeGlyph = try $ ucharCode `sepBy1` (char '_') <* char ']'
+ unicodeGlyph = try $ do
+ xs <- ucharCode `sepBy1` (char '_') <* char ']'
+ return [RoffStr xs]
ucharCode = try $ do
char 'u'
@@ -214,53 +213,66 @@ escapeLexer = try $ do
Nothing -> mzero
Just c -> return c
- -- \s-1 \s0 -- we ignore these
- escFontSize :: PandocMonad m => ManLexer m String
- escFontSize = do
- pos <- getPosition
- pm <- option "" $ count 1 (oneOf "+-")
- ds <- many1 digit
- report $ SkippedContent ("\\s" ++ pm ++ ds) pos
- return mempty
-
- escFont :: PandocMonad m => ManLexer m String
- escFont = do
- font <- choice
- [ S.singleton <$> letterFontKind
- , char '(' >> anyChar >> anyChar >> return (S.singleton Regular)
- , char 'S' >> return (S.singleton Regular)
- , try lettersFont
- , digit >> return (S.singleton Regular)
- ]
- modifyState (\r -> r {fontKind = font})
- return mempty
-
- lettersFont :: PandocMonad m => ManLexer m Font
- lettersFont = do
- char '['
- fs <- many letterFontKind
- skipMany letter
- char ']'
- return $ S.fromList fs
-
- letterFontKind :: PandocMonad m => ManLexer m FontKind
- letterFontKind = choice [
- oneOf ['B','b'] >> return Bold
- , oneOf ['I','i'] >> return Italic
- , oneOf ['C','c'] >> return Monospace
- , oneOf ['P','p','R','r'] >> return Regular
- ]
-
- escUnknown :: PandocMonad m => String -> a -> ManLexer m a
- escUnknown s x = do
+ escUnknown :: PandocMonad m => String -> ManLexer m [LinePart]
+ escUnknown s = do
pos <- getPosition
report $ SkippedContent ("Unknown escape sequence " ++ s) pos
- return x
-
-currentFont :: PandocMonad m => ManLexer m Font
-currentFont = fontKind <$> getState
+ return [RoffStr "\xFFFD"]
+
+-- \s-1 \s0
+escFontSize :: PandocMonad m => ManLexer m [LinePart]
+escFontSize = do
+ let sign = option "" $ count 1 (oneOf "+-")
+ let toFontSize xs =
+ case safeRead xs of
+ Nothing -> mzero
+ Just n -> return [FontSize n]
+ choice
+ [ do char '('
+ s <- sign
+ ds <- count 2 digit
+ toFontSize (s ++ ds)
+ , do char '['
+ s <- sign
+ ds <- many1 digit
+ char ']'
+ toFontSize (s ++ ds)
+ , do s <- sign
+ ds <- count 1 digit
+ toFontSize (s ++ ds)
+ ]
--- separate function from lexMacro since real man files sometimes do not follow the rules
+escFont :: PandocMonad m => ManLexer m [LinePart]
+escFont = do
+ font <- choice
+ [ char 'S' >> return [Regular]
+ , digit >> return [Regular]
+ , (:[]) <$> letterFontKind
+ , char '(' >> anyChar >> anyChar >> return [Regular]
+ , lettersFont
+ , digit >> return [Regular]
+ ]
+ return [Font font]
+
+lettersFont :: PandocMonad m => ManLexer m [FontKind]
+lettersFont = try $ do
+ char '['
+ fs <- many letterFontKind
+ skipMany letter
+ char ']'
+ return fs
+
+letterFontKind :: PandocMonad m => ManLexer m FontKind
+letterFontKind = choice [
+ oneOf ['B','b'] >> return Bold
+ , oneOf ['I','i'] >> return Italic
+ , oneOf ['C','c'] >> return Monospace
+ , oneOf ['P','p','R','r'] >> return Regular
+ ]
+
+
+-- separate function from lexMacro since real man files sometimes do not
+-- follow the rules
lexComment :: PandocMonad m => ManLexer m ManTokens
lexComment = do
try $ string ".\\\""
@@ -310,11 +322,11 @@ lexTable = do
lexConditional :: PandocMonad m => ManLexer m ManTokens
lexConditional = do
skipMany spacetab
- parseNCond <|> skipConditional
+ lexNCond <|> skipConditional
-- n means nroff mode
-parseNCond :: PandocMonad m => ManLexer m ManTokens
-parseNCond = do
+lexNCond :: PandocMonad m => ManLexer m ManTokens
+lexNCond = do
char '\n'
many1 spacetab
lexGroup <|> manToken
@@ -355,11 +367,11 @@ resolveMacro macroName args pos = do
case M.lookup macroName macros of
Nothing -> return $ singleTok $ MMacro macroName args pos
Just ts -> do
- let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs
- fillLP (MacroArg i) zs =
+ let fillLP (MacroArg i) zs =
case drop (i - 1) args of
[] -> zs
(ys:_) -> ys ++ zs
+ fillLP z zs = z : zs
let fillMacroArg (MLine lineparts) =
MLine (foldr fillLP [] lineparts)
fillMacroArg x = x
@@ -370,7 +382,7 @@ lexStringDef args = do -- string definition
case args of
[] -> fail "No argument to .ds"
(x:ys) -> do
- let ts = singleTok $ MLine (intercalate [RoffStr (" ", mempty)] ys)
+ let ts = singleTok $ MLine (intercalate [RoffStr " " ] ys)
let stringName = linePartsToString x
modifyState $ \st ->
st{ customMacros = M.insert stringName ts (customMacros st) }
@@ -413,21 +425,16 @@ lexArgs = do
plainArg :: PandocMonad m => ManLexer m [LinePart]
plainArg = do
skipMany spacetab
- mconcat <$> many1
- (macroArg <|> esc <|> regularText <|> unescapedQuote <|> escStar)
+ mconcat <$> many1 (macroArg <|> escape <|> regularText <|> unescapedQuote)
where
- unescapedQuote = do
- char '"'
- fonts <- currentFont
- return [RoffStr ("\"", fonts)]
-
+ unescapedQuote = char '"' >> return [RoffStr "\""]
quotedArg :: PandocMonad m => ManLexer m [LinePart]
quotedArg = do
skipMany spacetab
char '"'
xs <- mconcat <$>
- many (macroArg <|> esc <|> escStar <|> regularText
+ many (macroArg <|> escape <|> regularText
<|> spaceTabChar <|> escapedQuote)
char '"'
return xs
@@ -435,14 +442,11 @@ lexArgs = do
escapedQuote = try $ do
char '"'
char '"'
- fonts <- currentFont
- return [RoffStr ("\"", fonts)]
+ return [RoffStr "\""]
escStar :: PandocMonad m => ManLexer m [LinePart]
escStar = try $ do
pos <- getPosition
- char '\\'
- char '*'
c <- anyChar
case c of
'(' -> do
@@ -474,11 +478,11 @@ lexLine = do
where -- return empty line if we only have empty strings;
-- this can happen if the line just contains \f[C], for example.
go [] = return mempty
- go (RoffStr ("",_):xs) = go xs
+ go (RoffStr "" : xs) = go xs
go xs = return $ singleTok $ MLine xs
linePart :: PandocMonad m => ManLexer m [LinePart]
-linePart = macroArg <|> esc <|> escStar <|>
+linePart = macroArg <|> escape <|>
regularText <|> quoteChar <|> spaceTabChar
macroArg :: PandocMonad m => ManLexer m [LinePart]
@@ -487,29 +491,20 @@ macroArg = try $ do
x <- digit
return [MacroArg $ ord x - ord '0']
-esc :: PandocMonad m => ManLexer m [LinePart]
-esc = do
- s <- escapeLexer
- font <- currentFont
- return [RoffStr (s, font)]
-
regularText :: PandocMonad m => ManLexer m [LinePart]
regularText = do
s <- many1 $ noneOf "\n\r\t \\\""
- font <- currentFont
- return [RoffStr (s, font)]
+ return [RoffStr s]
quoteChar :: PandocMonad m => ManLexer m [LinePart]
quoteChar = do
char '"'
- font <- currentFont
- return [RoffStr ("\"", font)]
+ return [RoffStr "\""]
spaceTabChar :: PandocMonad m => ManLexer m [LinePart]
spaceTabChar = do
c <- spacetab
- font <- currentFont
- return [RoffStr ([c], font)]
+ return [RoffStr [c]]
lexEmptyLine :: PandocMonad m => ManLexer m ManTokens
lexEmptyLine = char '\n' >> return (singleTok MEmptyLine)
@@ -520,5 +515,5 @@ manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine
linePartsToString :: [LinePart] -> String
linePartsToString = mconcat . map go
where
- go (RoffStr (s, _)) = s
+ go (RoffStr s) = s
go _ = mempty
diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs
index 50ec0c019..ee7051213 100644
--- a/src/Text/Pandoc/Readers/Man.hs
+++ b/src/Text/Pandoc/Readers/Man.hs
@@ -39,7 +39,6 @@ import Control.Monad (liftM, mzero, guard)
import Control.Monad.Except (throwError)
import Text.Pandoc.Class (PandocMonad(..), report)
import Data.Maybe (catMaybes)
-import qualified Data.Set as S
import Data.List (intersperse, intercalate)
import qualified Data.Text as T
import Text.Pandoc.Builder as B
@@ -174,34 +173,16 @@ parseTitle = do
return mempty
linePartsToInlines :: [LinePart] -> Inlines
-linePartsToInlines = go
+linePartsToInlines = go []
where
- go [] = mempty
- go (MacroArg _:xs) = go xs -- shouldn't happen
- go xs@(RoffStr{} : _) =
- if lb > 0 && lb >= li
- then strong (go (removeFont Bold bolds)) <> go (drop lb xs)
- else if li > 0
- then emph (go (removeFont Italic italics)) <> go (drop li xs)
- else text (linePartsToString regulars) <> go (drop lr xs)
-
- where
- (lb, li, lr) = (length bolds, length italics, length regulars)
-
- removeFont font = map (removeFont' font)
- removeFont' font (RoffStr (s,f)) = RoffStr (s, S.delete font f)
- removeFont' _ x = x
-
- bolds = takeWhile isBold xs
- italics = takeWhile isItalic xs
- regulars = takeWhile (\x -> not (isBold x || isItalic x)) xs
-
- isBold (RoffStr (_,f)) = Bold `S.member` f
- isBold _ = False
-
- isItalic (RoffStr (_,f)) = Italic `S.member` f
- isItalic _ = False
+ go :: [[FontKind]] -> [LinePart] -> Inlines
+ go _ [] = mempty
+ go fs (MacroArg _:xs) = go fs xs -- shouldn't happen
+ go fs (RoffStr s : xs) = text s <> go fs xs
+ go (_:fs) (Font [] : xs) = go fs xs -- return to previous font
+ go fs (Font _newfonts : xs) = go fs xs
+ go fonts (FontSize _fs : xs) = go fonts xs
parsePara :: PandocMonad m => ManParser m Blocks
parsePara = para . trimInlines <$> parseInlines
@@ -289,7 +270,13 @@ parseCodeBlock = try $ do
where
extractText :: ManToken -> Maybe String
- extractText (MLine ss) = Just $ linePartsToString ss
+ extractText (MLine ss)
+ | not (null ss)
+ , all isFontToken ss = Nothing
+ | otherwise = Just $ linePartsToString ss
+ where isFontToken (FontSize{}) = True
+ isFontToken (Font{}) = True
+ isFontToken _ = False
extractText MEmptyLine = Just ""
-- string are intercalated with '\n', this prevents double '\n'
extractText _ = Nothing