aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2018-10-24 22:04:15 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2018-10-24 22:04:51 -0700
commite4726518afe2c4802c351f6f785032ff4e7e6a35 (patch)
treebd8836c415bfcea857e7ee3c6dd715d77afcdcd0 /src/Text/Pandoc/Readers
parent6c71100fcf0abc609dda323a76c78b0838234044 (diff)
downloadpandoc-e4726518afe2c4802c351f6f785032ff4e7e6a35.tar.gz
T.P.Readers.Groff: improve LinePart.
Separate font change and font size change tokens. With this change, emphasis no longer works. This needs to be implemented in the parser, not the lexer.
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/Groff.hs217
-rw-r--r--src/Text/Pandoc/Readers/Man.hs43
2 files changed, 121 insertions, 139 deletions
diff --git a/src/Text/Pandoc/Readers/Groff.hs b/src/Text/Pandoc/Readers/Groff.hs
index 90ae8561a..94cc96b24 100644
--- a/src/Text/Pandoc/Readers/Groff.hs
+++ b/src/Text/Pandoc/Readers/Groff.hs
@@ -33,7 +33,6 @@ Tokenizer for groff formats (man, ms).
-}
module Text.Pandoc.Readers.Groff
( FontKind(..)
- , Font
, MacroKind
, LinePart(..)
, Arg
@@ -54,8 +53,6 @@ import Text.Pandoc.Class
import Data.Char (isHexDigit, chr, ord, isAscii, isAlphaNum, isSpace)
import Data.Default (Default)
import qualified Data.Map as M
-import Data.Set (Set)
-import qualified Data.Set as S
import Data.List (intercalate, isSuffixOf)
import qualified Data.Text as T
import Text.Pandoc.Builder as B
@@ -79,9 +76,9 @@ data FontKind = Bold | Italic | Monospace | Regular deriving (Show, Eq, Ord)
type MacroKind = String
-type Font = Set FontKind
-
-data LinePart = RoffStr (String, Font)
+data LinePart = RoffStr String
+ | Font [FontKind]
+ | FontSize Int
| MacroArg Int
deriving Show
@@ -100,20 +97,19 @@ newtype ManTokens = ManTokens { unManTokens :: Seq.Seq ManToken }
singleTok :: ManToken -> ManTokens
singleTok t = ManTokens (Seq.singleton t)
-data RoffState = RoffState { fontKind :: Font
- , customMacros :: M.Map String ManTokens
+data RoffState = RoffState { customMacros :: M.Map String ManTokens
} deriving Show
instance Default RoffState where
def = RoffState { customMacros = M.fromList
$ map (\(n, s) ->
(n, singleTok
- (MLine [RoffStr (s, mempty)])))
+ (MLine [RoffStr s])))
[ ("Tm", "\x2122")
, ("lq", "\x201C")
, ("rq", "\x201D")
, ("R", "\x00AE") ]
- , fontKind = S.singleton Regular }
+ }
type ManLexer m = ParserT [Char] RoffState m
@@ -135,15 +131,16 @@ combiningAccentsMap :: M.Map String Char
combiningAccentsMap =
M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
-escapeLexer :: PandocMonad m => ManLexer m String
-escapeLexer = try $ do
+escape :: PandocMonad m => ManLexer m [LinePart]
+escape = do
char '\\'
- c <- noneOf ['*','$'] -- see escStar, macroArg
+ c <- anyChar
case c of
- '(' -> twoCharGlyph
- '[' -> bracketedGlyph
'f' -> escFont
's' -> escFontSize
+ '*' -> escStar
+ '(' -> twoCharGlyph
+ '[' -> bracketedGlyph
'"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
'#' -> mempty <$ manyTill anyChar newline
'%' -> return mempty
@@ -154,27 +151,27 @@ escapeLexer = try $ do
':' -> return mempty
'0' -> return mempty
'c' -> return mempty
- '-' -> return "-"
- '_' -> return "_"
- ' ' -> return " "
- '\\' -> return "\\"
- 't' -> return "\t"
- 'e' -> return "\\"
- '`' -> return "`"
- '^' -> return " "
- '|' -> return " "
- '\'' -> return "`"
- '.' -> return "`"
- '~' -> return "\160" -- nonbreaking space
- _ -> escUnknown ['\\',c] "\xFFFD"
+ '-' -> return [RoffStr "-"]
+ '_' -> return [RoffStr "_"]
+ ' ' -> return [RoffStr " "]
+ '\\' -> return [RoffStr "\\"]
+ 't' -> return [RoffStr "\t"]
+ 'e' -> return [RoffStr "\\"]
+ '`' -> return [RoffStr "`"]
+ '^' -> return [RoffStr " "]
+ '|' -> return [RoffStr " "]
+ '\'' -> return [RoffStr "`"]
+ '.' -> return [RoffStr "`"]
+ '~' -> return [RoffStr "\160"] -- nonbreaking space
+ _ -> escUnknown ['\\',c]
where
twoCharGlyph = do
cs <- count 2 anyChar
case M.lookup cs characterCodeMap of
- Just c -> return [c]
- Nothing -> escUnknown ('\\':'(':cs) "\xFFFD"
+ Just c -> return [RoffStr [c]]
+ Nothing -> escUnknown ('\\':'(':cs)
bracketedGlyph = unicodeGlyph <|> charGlyph
@@ -184,7 +181,7 @@ escapeLexer = try $ do
[] -> mzero
[s] -> case M.lookup s characterCodeMap of
Nothing -> mzero
- Just c -> return [c]
+ Just c -> return [RoffStr [c]]
(s:ss) -> do
basechar <- case M.lookup cs characterCodeMap of
Nothing ->
@@ -200,10 +197,12 @@ escapeLexer = try $ do
case M.lookup a combiningAccentsMap of
Just x -> addAccents as (x:xs)
Nothing -> mzero
- addAccents ss [basechar])
- <|> escUnknown ("\\[" ++ cs ++ "]") "\xFFFD"
+ addAccents ss [basechar] >>= \xs -> return [RoffStr xs])
+ <|> escUnknown ("\\[" ++ cs ++ "]")
- unicodeGlyph = try $ ucharCode `sepBy1` (char '_') <* char ']'
+ unicodeGlyph = try $ do
+ xs <- ucharCode `sepBy1` (char '_') <* char ']'
+ return [RoffStr xs]
ucharCode = try $ do
char 'u'
@@ -214,53 +213,66 @@ escapeLexer = try $ do
Nothing -> mzero
Just c -> return c
- -- \s-1 \s0 -- we ignore these
- escFontSize :: PandocMonad m => ManLexer m String
- escFontSize = do
- pos <- getPosition
- pm <- option "" $ count 1 (oneOf "+-")
- ds <- many1 digit
- report $ SkippedContent ("\\s" ++ pm ++ ds) pos
- return mempty
-
- escFont :: PandocMonad m => ManLexer m String
- escFont = do
- font <- choice
- [ S.singleton <$> letterFontKind
- , char '(' >> anyChar >> anyChar >> return (S.singleton Regular)
- , char 'S' >> return (S.singleton Regular)
- , try lettersFont
- , digit >> return (S.singleton Regular)
- ]
- modifyState (\r -> r {fontKind = font})
- return mempty
-
- lettersFont :: PandocMonad m => ManLexer m Font
- lettersFont = do
- char '['
- fs <- many letterFontKind
- skipMany letter
- char ']'
- return $ S.fromList fs
-
- letterFontKind :: PandocMonad m => ManLexer m FontKind
- letterFontKind = choice [
- oneOf ['B','b'] >> return Bold
- , oneOf ['I','i'] >> return Italic
- , oneOf ['C','c'] >> return Monospace
- , oneOf ['P','p','R','r'] >> return Regular
- ]
-
- escUnknown :: PandocMonad m => String -> a -> ManLexer m a
- escUnknown s x = do
+ escUnknown :: PandocMonad m => String -> ManLexer m [LinePart]
+ escUnknown s = do
pos <- getPosition
report $ SkippedContent ("Unknown escape sequence " ++ s) pos
- return x
-
-currentFont :: PandocMonad m => ManLexer m Font
-currentFont = fontKind <$> getState
+ return [RoffStr "\xFFFD"]
+
+-- \s-1 \s0
+escFontSize :: PandocMonad m => ManLexer m [LinePart]
+escFontSize = do
+ let sign = option "" $ count 1 (oneOf "+-")
+ let toFontSize xs =
+ case safeRead xs of
+ Nothing -> mzero
+ Just n -> return [FontSize n]
+ choice
+ [ do char '('
+ s <- sign
+ ds <- count 2 digit
+ toFontSize (s ++ ds)
+ , do char '['
+ s <- sign
+ ds <- many1 digit
+ char ']'
+ toFontSize (s ++ ds)
+ , do s <- sign
+ ds <- count 1 digit
+ toFontSize (s ++ ds)
+ ]
--- separate function from lexMacro since real man files sometimes do not follow the rules
+escFont :: PandocMonad m => ManLexer m [LinePart]
+escFont = do
+ font <- choice
+ [ char 'S' >> return [Regular]
+ , digit >> return [Regular]
+ , (:[]) <$> letterFontKind
+ , char '(' >> anyChar >> anyChar >> return [Regular]
+ , lettersFont
+ , digit >> return [Regular]
+ ]
+ return [Font font]
+
+lettersFont :: PandocMonad m => ManLexer m [FontKind]
+lettersFont = try $ do
+ char '['
+ fs <- many letterFontKind
+ skipMany letter
+ char ']'
+ return fs
+
+letterFontKind :: PandocMonad m => ManLexer m FontKind
+letterFontKind = choice [
+ oneOf ['B','b'] >> return Bold
+ , oneOf ['I','i'] >> return Italic
+ , oneOf ['C','c'] >> return Monospace
+ , oneOf ['P','p','R','r'] >> return Regular
+ ]
+
+
+-- separate function from lexMacro since real man files sometimes do not
+-- follow the rules
lexComment :: PandocMonad m => ManLexer m ManTokens
lexComment = do
try $ string ".\\\""
@@ -310,11 +322,11 @@ lexTable = do
lexConditional :: PandocMonad m => ManLexer m ManTokens
lexConditional = do
skipMany spacetab
- parseNCond <|> skipConditional
+ lexNCond <|> skipConditional
-- n means nroff mode
-parseNCond :: PandocMonad m => ManLexer m ManTokens
-parseNCond = do
+lexNCond :: PandocMonad m => ManLexer m ManTokens
+lexNCond = do
char '\n'
many1 spacetab
lexGroup <|> manToken
@@ -355,11 +367,11 @@ resolveMacro macroName args pos = do
case M.lookup macroName macros of
Nothing -> return $ singleTok $ MMacro macroName args pos
Just ts -> do
- let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs
- fillLP (MacroArg i) zs =
+ let fillLP (MacroArg i) zs =
case drop (i - 1) args of
[] -> zs
(ys:_) -> ys ++ zs
+ fillLP z zs = z : zs
let fillMacroArg (MLine lineparts) =
MLine (foldr fillLP [] lineparts)
fillMacroArg x = x
@@ -370,7 +382,7 @@ lexStringDef args = do -- string definition
case args of
[] -> fail "No argument to .ds"
(x:ys) -> do
- let ts = singleTok $ MLine (intercalate [RoffStr (" ", mempty)] ys)
+ let ts = singleTok $ MLine (intercalate [RoffStr " " ] ys)
let stringName = linePartsToString x
modifyState $ \st ->
st{ customMacros = M.insert stringName ts (customMacros st) }
@@ -413,21 +425,16 @@ lexArgs = do
plainArg :: PandocMonad m => ManLexer m [LinePart]
plainArg = do
skipMany spacetab
- mconcat <$> many1
- (macroArg <|> esc <|> regularText <|> unescapedQuote <|> escStar)
+ mconcat <$> many1 (macroArg <|> escape <|> regularText <|> unescapedQuote)
where
- unescapedQuote = do
- char '"'
- fonts <- currentFont
- return [RoffStr ("\"", fonts)]
-
+ unescapedQuote = char '"' >> return [RoffStr "\""]
quotedArg :: PandocMonad m => ManLexer m [LinePart]
quotedArg = do
skipMany spacetab
char '"'
xs <- mconcat <$>
- many (macroArg <|> esc <|> escStar <|> regularText
+ many (macroArg <|> escape <|> regularText
<|> spaceTabChar <|> escapedQuote)
char '"'
return xs
@@ -435,14 +442,11 @@ lexArgs = do
escapedQuote = try $ do
char '"'
char '"'
- fonts <- currentFont
- return [RoffStr ("\"", fonts)]
+ return [RoffStr "\""]
escStar :: PandocMonad m => ManLexer m [LinePart]
escStar = try $ do
pos <- getPosition
- char '\\'
- char '*'
c <- anyChar
case c of
'(' -> do
@@ -474,11 +478,11 @@ lexLine = do
where -- return empty line if we only have empty strings;
-- this can happen if the line just contains \f[C], for example.
go [] = return mempty
- go (RoffStr ("",_):xs) = go xs
+ go (RoffStr "" : xs) = go xs
go xs = return $ singleTok $ MLine xs
linePart :: PandocMonad m => ManLexer m [LinePart]
-linePart = macroArg <|> esc <|> escStar <|>
+linePart = macroArg <|> escape <|>
regularText <|> quoteChar <|> spaceTabChar
macroArg :: PandocMonad m => ManLexer m [LinePart]
@@ -487,29 +491,20 @@ macroArg = try $ do
x <- digit
return [MacroArg $ ord x - ord '0']
-esc :: PandocMonad m => ManLexer m [LinePart]
-esc = do
- s <- escapeLexer
- font <- currentFont
- return [RoffStr (s, font)]
-
regularText :: PandocMonad m => ManLexer m [LinePart]
regularText = do
s <- many1 $ noneOf "\n\r\t \\\""
- font <- currentFont
- return [RoffStr (s, font)]
+ return [RoffStr s]
quoteChar :: PandocMonad m => ManLexer m [LinePart]
quoteChar = do
char '"'
- font <- currentFont
- return [RoffStr ("\"", font)]
+ return [RoffStr "\""]
spaceTabChar :: PandocMonad m => ManLexer m [LinePart]
spaceTabChar = do
c <- spacetab
- font <- currentFont
- return [RoffStr ([c], font)]
+ return [RoffStr [c]]
lexEmptyLine :: PandocMonad m => ManLexer m ManTokens
lexEmptyLine = char '\n' >> return (singleTok MEmptyLine)
@@ -520,5 +515,5 @@ manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine
linePartsToString :: [LinePart] -> String
linePartsToString = mconcat . map go
where
- go (RoffStr (s, _)) = s
+ go (RoffStr s) = s
go _ = mempty
diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs
index 50ec0c019..ee7051213 100644
--- a/src/Text/Pandoc/Readers/Man.hs
+++ b/src/Text/Pandoc/Readers/Man.hs
@@ -39,7 +39,6 @@ import Control.Monad (liftM, mzero, guard)
import Control.Monad.Except (throwError)
import Text.Pandoc.Class (PandocMonad(..), report)
import Data.Maybe (catMaybes)
-import qualified Data.Set as S
import Data.List (intersperse, intercalate)
import qualified Data.Text as T
import Text.Pandoc.Builder as B
@@ -174,34 +173,16 @@ parseTitle = do
return mempty
linePartsToInlines :: [LinePart] -> Inlines
-linePartsToInlines = go
+linePartsToInlines = go []
where
- go [] = mempty
- go (MacroArg _:xs) = go xs -- shouldn't happen
- go xs@(RoffStr{} : _) =
- if lb > 0 && lb >= li
- then strong (go (removeFont Bold bolds)) <> go (drop lb xs)
- else if li > 0
- then emph (go (removeFont Italic italics)) <> go (drop li xs)
- else text (linePartsToString regulars) <> go (drop lr xs)
-
- where
- (lb, li, lr) = (length bolds, length italics, length regulars)
-
- removeFont font = map (removeFont' font)
- removeFont' font (RoffStr (s,f)) = RoffStr (s, S.delete font f)
- removeFont' _ x = x
-
- bolds = takeWhile isBold xs
- italics = takeWhile isItalic xs
- regulars = takeWhile (\x -> not (isBold x || isItalic x)) xs
-
- isBold (RoffStr (_,f)) = Bold `S.member` f
- isBold _ = False
-
- isItalic (RoffStr (_,f)) = Italic `S.member` f
- isItalic _ = False
+ go :: [[FontKind]] -> [LinePart] -> Inlines
+ go _ [] = mempty
+ go fs (MacroArg _:xs) = go fs xs -- shouldn't happen
+ go fs (RoffStr s : xs) = text s <> go fs xs
+ go (_:fs) (Font [] : xs) = go fs xs -- return to previous font
+ go fs (Font _newfonts : xs) = go fs xs
+ go fonts (FontSize _fs : xs) = go fonts xs
parsePara :: PandocMonad m => ManParser m Blocks
parsePara = para . trimInlines <$> parseInlines
@@ -289,7 +270,13 @@ parseCodeBlock = try $ do
where
extractText :: ManToken -> Maybe String
- extractText (MLine ss) = Just $ linePartsToString ss
+ extractText (MLine ss)
+ | not (null ss)
+ , all isFontToken ss = Nothing
+ | otherwise = Just $ linePartsToString ss
+ where isFontToken (FontSize{}) = True
+ isFontToken (Font{}) = True
+ isFontToken _ = False
extractText MEmptyLine = Just ""
-- string are intercalated with '\n', this prevents double '\n'
extractText _ = Nothing