diff options
author | John MacFarlane <jgm@berkeley.edu> | 2018-10-18 23:50:29 -0700 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2018-10-18 23:54:13 -0700 |
commit | e15b5b360fdc892274acdae1631b27e966a6d397 (patch) | |
tree | 553af849caa46a418efa85f0fac75d5c22580089 /src/Text/Pandoc | |
parent | 7d5b25d6d65306b1f2eb77417a9010951d1e17a8 (diff) | |
download | pandoc-e15b5b360fdc892274acdae1631b27e966a6d397.tar.gz |
Man reader: improved parsing of groff escapes.
We now handle all the named escapes, plus combining accents
and unicode escapes.
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r-- | src/Text/Pandoc/Readers/Man.hs | 196 |
1 files changed, 116 insertions, 80 deletions
diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 1ffdd1f91..876c876b7 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -34,27 +34,27 @@ Conversion of man to 'Pandoc' document. module Text.Pandoc.Readers.Man (readMan) where import Prelude -import Control.Monad (liftM, void) +import Control.Monad (liftM, void, mzero) import Control.Monad.Except (throwError) -import Data.Char (isDigit, isUpper, isLower) +import Data.Char (isHexDigit, chr) import Data.Default (Default) -import Data.Map (insert) +import Data.Maybe (catMaybes) +import qualified Data.Map as M import Data.Set (Set, singleton) import qualified Data.Set as S (fromList, toList) -import Data.Maybe (catMaybes, fromMaybe, isNothing) import Data.List (intersperse, intercalate) import qualified Data.Text as T - -import Text.Pandoc.Class (PandocMonad(..)) +import Text.Pandoc.Class (PandocMonad(..), report) import Text.Pandoc.Builder as B hiding (singleton) import Text.Pandoc.Error (PandocError (PandocParsecError)) import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Options import Text.Pandoc.Parsing -import Text.Pandoc.Shared (crFilter) +import Text.Pandoc.Shared (crFilter, safeRead) import Text.Parsec hiding (tokenPrim, space) import qualified Text.Parsec as Parsec import Text.Parsec.Pos (updatePosString) +import Text.Pandoc.GroffChar (characterCodes, combiningAccents) -- -- Data Types @@ -84,11 +84,6 @@ data ManToken = MStr RoffStr | MComment String deriving Show -data EscapeThing = EFont Font - | EChar Char - | ENothing - deriving Show - data RoffState = RoffState { fontKind :: Font } deriving Show @@ -113,7 +108,7 @@ testStr str = do pand <- runIOorExplode $ readMan def (T.pack str) putStrLn $ printPandoc pand - + testFile :: FilePath -> IO () testFile fname = do cont <- readFile fname @@ -170,31 +165,81 @@ eofline = void newline <|> eof spacetab :: Stream s m Char => ParsecT s u m Char spacetab = char ' ' <|> char '\t' --- TODO add other sequences from man (7) groff -escapeLexer :: PandocMonad m => ManLexer m EscapeThing +characterCodeMap :: M.Map String Char +characterCodeMap = + M.fromList $ map (\(x,y) -> (y,x)) $ characterCodes ++ combiningAccents + +escapeLexer :: PandocMonad m => ManLexer m String escapeLexer = do char '\\' - choice [escChar, escFont, escUnknown] + twoCharGlyph <|> bracketedGlyph <|> escFont <|> escStar <|> escSingle where - escChar :: PandocMonad m => ManLexer m EscapeThing - escChar = - let skipSeqs = ["%", "{", "}", "&", "\n", ":", "\"", "0", "c"] - subsSeqs = [ ("-", '-'), (" ", ' '), ("\\", '\\'), ("[lq]", '“'), ("[rq]", '”') - , ("[em]", '—'), ("[en]", '–'), ("*(lq", '«'), ("*(rq", '»') - , ("t", '\t'), ("e", '\\'), ("`", '`'), ("^", ' '), ("|", ' ') - , ("'", '`') ] - substitute :: PandocMonad m => (String,Char) -> ManLexer m EscapeThing - substitute (from,to) = try $ string from >> return (EChar to) - skip :: PandocMonad m => String -> ManLexer m EscapeThing - skip seq' = try $ string seq' >> return ENothing - in choice $ (substitute <$> subsSeqs) ++ - (skip <$> skipSeqs) ++ - [ char '(' >> anyChar >> return ENothing - , char '[' >> many alphaNum >> char ']' >> return ENothing - ] - - escFont :: PandocMonad m => ManLexer m EscapeThing + twoCharGlyph = do + char '(' + cs <- count 2 anyChar + case M.lookup cs characterCodeMap of + Just c -> return [c] + Nothing -> escUnknown ('(':cs) + + bracketedGlyph = + char '[' *> + ( ucharCode `sepBy1` (char '_') + <|> charCode `sepBy1` (many1 Parsec.space) + ) <* char ']' + + ucharCode = do + char 'u' + cs <- many1 (satisfy isHexDigit) + case chr <$> safeRead ('0':'x':cs) of + Nothing -> mzero + Just c -> return c + + charCode = do + cs <- many1 (noneOf ['[',']',' ','\t','\n']) + case M.lookup cs characterCodeMap of + Nothing -> mzero + Just c -> return c + + escStar = do + char '*' + choice + [ ("\xae" <$ char 'R') + , ("" <$ char 'S') -- switch back to default font size + , ("\x201c" <$ try (string "(lq")) + , ("\x201d" <$ try (string "(rq")) + , ("" <$ try (string "(HF" >> + modifyState (\r -> r {fontKind = singleton Bold}))) + , ("\x2122" <$ try (string "(Tm")) + ] + + escSingle = do + c <- anyChar + case c of + '"' -> mempty <$ manyTill anyChar newline -- line comment + '#' -> mempty <$ (manyTill anyChar newline >> optional newline) + '%' -> return mempty + '{' -> return mempty + '}' -> return mempty + '&' -> return mempty + '\n' -> return mempty + ':' -> return mempty + '0' -> return mempty + 'c' -> return mempty + '-' -> return "-" + '_' -> return "_" + ' ' -> return " " + '\\' -> return "\\" + 't' -> return "\t" + 'e' -> return "\\" + '`' -> return "`" + '^' -> return " " + '|' -> return " " + '\'' -> return "`" + '.' -> return "`" + _ -> escUnknown [c] + + escFont :: PandocMonad m => ManLexer m String escFont = do char 'f' font <- choice [ singleton <$> letterFontKind @@ -203,32 +248,29 @@ escapeLexer = do , digit >> return (singleton Regular) ] modifyState (\r -> r {fontKind = font}) - return $ EFont font - - where - - lettersFont :: PandocMonad m => ManLexer m Font - lettersFont = do - char '[' - fs <- many letterFontKind - many letter - char ']' - return $ S.fromList fs - - letterFontKind :: PandocMonad m => ManLexer m FontKind - letterFontKind = choice [ - char 'B' >> return Bold - , char 'I' >> return Italic - , char 'C' >> return Monospace - , (char 'P' <|> char 'R') >> return Regular - ] - - escUnknown :: PandocMonad m => ManLexer m EscapeThing - escUnknown = do - c <- anyChar + return mempty + + lettersFont :: PandocMonad m => ManLexer m Font + lettersFont = do + char '[' + fs <- many letterFontKind + many letter + char ']' + return $ S.fromList fs + + letterFontKind :: PandocMonad m => ManLexer m FontKind + letterFontKind = choice [ + char 'B' >> return Bold + , char 'I' >> return Italic + , char 'C' >> return Monospace + , (char 'P' <|> char 'R') >> return Regular + ] + + escUnknown :: PandocMonad m => String -> ManLexer m String + escUnknown s = do pos <- getPosition - logOutput $ SkippedContent ("Unknown escape sequence \\" ++ [c]) pos - return ENothing + report $ SkippedContent ("Unknown escape sequence " ++ s) pos + return mempty currentFont :: PandocMonad m => ManLexer m Font currentFont = fontKind <$> getState @@ -291,28 +333,23 @@ lexMacro = do plainArg :: PandocMonad m => ManLexer m RoffStr plainArg = do indents <- many spacetab - arg <- many1 $ escChar <|> (Just <$> noneOf " \t\n") + arg <- many1 $ escapeLexer <|> many1 (noneOf " \t\n\\") f <- currentFont - return (indents ++ catMaybes arg, f) + return (indents ++ mconcat arg, f) quotedArg :: PandocMonad m => ManLexer m RoffStr quotedArg = do char '"' - val <- many quotedChar + val <- mconcat <$> many quotedChar char '"' - val2 <- many $ escChar <|> (Just <$> noneOf " \t\n") + val2 <- mconcat <$> many (escapeLexer <|> many1 (noneOf " \t\n")) f <- currentFont - return (catMaybes $ val ++ val2, f) - - quotedChar :: PandocMonad m => ManLexer m (Maybe Char) - quotedChar = escChar <|> (Just <$> noneOf "\"\n") <|> (Just <$> try (string "\"\"" >> return '"')) + return (val ++ val2, f) - escChar :: PandocMonad m => ManLexer m (Maybe Char) - escChar = do - ec <- escapeLexer - case ec of - (EChar c) -> return $ Just c - _ -> return Nothing + quotedChar :: PandocMonad m => ManLexer m String + quotedChar = escapeLexer + <|> many1 (noneOf "\"\n\\") + <|> try (string "\"\"" >> return "\"") lexLine :: PandocMonad m => ManLexer m ManToken lexLine = do @@ -325,10 +362,9 @@ lexLine = do esc = do someesc <- escapeLexer font <- currentFont - let rv = case someesc of - EChar c -> Just ([c], font) - _ -> Nothing - return rv + return $ if null someesc + then Nothing + else Just (someesc, font) linePart :: PandocMonad m => ManLexer m (Maybe (String, Font)) linePart = do @@ -336,7 +372,7 @@ lexLine = do font <- currentFont return $ Just (lnpart, font) - + lexEmptyLine :: PandocMonad m => ManLexer m ManToken lexEmptyLine = char '\n' >> return MEmptyLine @@ -412,10 +448,10 @@ parseTitle = do where changeTitle title pst = let meta = stateMeta pst - metaUp = Meta $ insert "title" (MetaString title) (unMeta meta) + metaUp = Meta $ M.insert "title" (MetaString title) (unMeta meta) in pst {stateMeta = metaUp} - + parseSkippedContent :: PandocMonad m => ManParser m Blocks parseSkippedContent = do tok <- munknownMacro <|> mcomment <|> memplyLine |