From fd3676a568589f07ad0707c07b2a9f87df6e2f6c Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Sun, 25 Feb 2018 03:34:17 +0300 Subject: initial --- src/Text/Pandoc/Readers/Man.hs | 118 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 src/Text/Pandoc/Readers/Man.hs (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs new file mode 100644 index 000000000..d1ff3fc47 --- /dev/null +++ b/src/Text/Pandoc/Readers/Man.hs @@ -0,0 +1,118 @@ +{- + Copyright (C) 2018 Yan Pashkovsky + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Man + Copyright : Copyright (C) 2018 Yan Pashkovsky + License : GNU GPL, version 2 or above + + Maintainer : Yan Pashkovsky + Stability : WIP + Portability : portable + +Conversion of man to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Man where + +import Control.Monad.Except (liftM2, throwError, guard) +import Text.Pandoc.Class (PandocMonad(..)) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (enclosed) +import Text.Pandoc.Shared (crFilter) +import Text.Parsec +import Text.Parsec.Char +import Data.Text (Text) +import Data.Map (empty) +import qualified Data.Text as T + + +-- | Read man (troff) from an input string and return a Pandoc document. +readMan :: PandocMonad m + => ReaderOptions + -> Text + -> m Pandoc +readMan opts s = do + parsed <- readWithM parseMan def{ stateOptions = opts } (T.unpack s) + case parsed of + Right result -> return result + Left e -> throwError e + +type ManParser m = ParserT [Char] ParserState m + +comment :: PandocMonad m => ManParser m String +comment = do + string ".\\\" " + many anyChar + +data Macro = Macro { macroName :: String + , macroArgs :: [String] + } + +parseMacro :: PandocMonad m => ManParser m Block +parseMacro = do + m <- macro + return $ Plain (map Str $ (macroName m : macroArgs m)) + +macro :: PandocMonad m => ManParser m Macro +macro = do + char '.' <|> char '\'' + many space + name <- many1 letter + --args <- many parseArg + return $ Macro { macroName = name, macroArgs = [] } + + where + + parseArg :: PandocMonad m => ManParser m String + parseArg = do + many1 space + plainArg + + quotedArg :: PandocMonad m => ManParser m String + quotedArg = do + char '"' + val <- many1 quotedChar + char '"' + return val + + plainArg :: PandocMonad m => ManParser m String + plainArg = do + many1 $ noneOf " \t" + + quotedChar :: PandocMonad m => ManParser m Char + quotedChar = do + noneOf "\"" + <|> try (string "\"\"" >> return '"') + +parseLine :: PandocMonad m => ManParser m Block +parseLine = do + str <- many anyChar + return $ Plain [Str str] + +parseBlock :: PandocMonad m => ManParser m Block +parseBlock = do + choice [ parseMacro + , parseLine + ] + +parseMan :: PandocMonad m => ManParser m Pandoc +parseMan = do + blocks <- parseBlock `sepBy` newline + + return $ Pandoc Meta{unMeta = empty} blocks \ No newline at end of file -- cgit v1.2.3 From c1617565fc2a5984e2e44d4da9adf7f8a26b3160 Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Wed, 9 May 2018 03:24:45 +0300 Subject: basic manfile parsing --- src/Text/Pandoc/Readers/Man.hs | 241 ++++++++++++++++++++++++++++++----------- test/Tests/Readers/Man.hs | 23 +++- test/test-pandoc.hs | 2 +- 3 files changed, 198 insertions(+), 68 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index d1ff3fc47..166b7c7a7 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -14,6 +14,9 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + + -} {- | @@ -29,90 +32,202 @@ Conversion of man to 'Pandoc' document. -} module Text.Pandoc.Readers.Man where -import Control.Monad.Except (liftM2, throwError, guard) -import Text.Pandoc.Class (PandocMonad(..)) +import Control.Monad.Except (throwError) +import Data.Default (Default) +import Data.Map (insert) +import Data.Maybe (isJust) +import Data.List (intersperse, intercalate) +import qualified Data.Text as T + +import Text.Pandoc.Class (PandocMonad(..), runPure) import Text.Pandoc.Definition +import Text.Pandoc.Error (PandocError) +import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (enclosed) import Text.Pandoc.Shared (crFilter) import Text.Parsec -import Text.Parsec.Char -import Data.Text (Text) -import Data.Map (empty) -import qualified Data.Text as T +import Text.Parsec.Char () + +data FontKind = Regular | Italic | Bold | ItalicBold deriving Show + +data RoffState = RoffState { inCodeBlock :: Bool + , fontKind :: FontKind + } deriving Show + +instance Default RoffState where + def = RoffState {inCodeBlock = False, fontKind = Regular} + +data ManState = ManState {pState :: ParserState, rState :: RoffState} + +instance HasLogMessages ManState where + addLogMessage lm mst = mst {pState = addLogMessage lm (pState mst)} + getLogMessages mst = getLogMessages $ pState mst + +modifyRoffState :: PandocMonad m => (RoffState -> RoffState) -> ParsecT a ManState m () +modifyRoffState f = do + mst <- getState + setState mst { rState = f $ rState mst } + +type ManParser m = ParserT [Char] ManState m + +testStrr :: [Char] -> SourceName -> Either PandocError (Either ParseError Pandoc) +testStrr s srcnm = runPure (runParserT parseMan (ManState {pState=def, rState=def}) srcnm s) + +printPandoc :: Pandoc -> [Char] +printPandoc (Pandoc m content) = + let ttl = "Pandoc: " ++ (show $ unMeta m) + cnt = intercalate "\n" $ map show content + in ttl ++ "\n" ++ cnt + +strrepr :: (Show a2, Show a1) => Either a2 (Either a1 Pandoc) -> [Char] +strrepr obj = case obj of + Right x -> case x of + Right x' -> printPandoc x' + Left y' -> show y' + Left y -> show y + +testFile :: FilePath -> IO () +testFile fname = do + cont <- readFile fname + putStrLn . strrepr $ testStrr cont fname -- | Read man (troff) from an input string and return a Pandoc document. -readMan :: PandocMonad m - => ReaderOptions - -> Text - -> m Pandoc -readMan opts s = do - parsed <- readWithM parseMan def{ stateOptions = opts } (T.unpack s) +readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc +readMan opts txt = do + let state = ManState { pState = def{ stateOptions = opts }, rState = def} + parsed <- readWithM parseMan state (T.unpack $ crFilter txt) case parsed of Right result -> return result Left e -> throwError e -type ManParser m = ParserT [Char] ParserState m - -comment :: PandocMonad m => ManParser m String -comment = do - string ".\\\" " - many anyChar - -data Macro = Macro { macroName :: String - , macroArgs :: [String] - } - parseMacro :: PandocMonad m => ManParser m Block parseMacro = do - m <- macro - return $ Plain (map Str $ (macroName m : macroArgs m)) - -macro :: PandocMonad m => ManParser m Macro -macro = do char '.' <|> char '\'' many space - name <- many1 letter - --args <- many parseArg - return $ Macro { macroName = name, macroArgs = [] } - + macroName <- many1 (letter <|> oneOf ['\\', '"']) + args <- parseArgs + let joinedArgs = concat $ intersperse " " args + case macroName of + "\\\"" -> return Null -- comment + "TH" -> macroTitle (if null args then "" else head args) + "nf" -> macroCodeBlock True >> return Null + "fi" -> macroCodeBlock False >> return Null + "B" -> return $ Plain [Strong [Str joinedArgs]] + "BR" -> return $ Plain [Strong [Str joinedArgs]] + "BI" -> return $ Plain [Strong [Emph [Str joinedArgs]]] + "I" -> return $ Plain [Emph [Str joinedArgs]] + "SH" -> return $ Header 2 nullAttr [Str joinedArgs] + "sp" -> return $ Plain [LineBreak] + _ -> unkownMacro macroName args + where - parseArg :: PandocMonad m => ManParser m String - parseArg = do - many1 space - plainArg - - quotedArg :: PandocMonad m => ManParser m String - quotedArg = do - char '"' - val <- many1 quotedChar - char '"' - return val - - plainArg :: PandocMonad m => ManParser m String - plainArg = do - many1 $ noneOf " \t" - - quotedChar :: PandocMonad m => ManParser m Char - quotedChar = do - noneOf "\"" - <|> try (string "\"\"" >> return '"') + macroTitle :: PandocMonad m => String -> ManParser m Block + macroTitle mantitle = do + modifyState (changeTitle mantitle) + if null mantitle + then return Null + else return $ Header 1 nullAttr [Str mantitle] + where + changeTitle title mst @ ManState{ pState = pst} = + let meta = stateMeta pst + metaUp = Meta $ insert "title" (MetaString title) (unMeta meta) + in + mst { pState = pst {stateMeta = metaUp} } + + + macroCodeBlock :: PandocMonad m => Bool -> ManParser m () + macroCodeBlock insideCB = modifyRoffState (\rst -> rst{inCodeBlock = insideCB}) >> return () + + unkownMacro :: PandocMonad m => String -> [String] -> ManParser m Block + unkownMacro mname args = do + pos <- getPosition + logMessage $ SkippedContent ("Unknown macro: " ++ mname) pos + return $ Plain $ Str <$> args + + parseArgs :: PandocMonad m => ManParser m [String] + parseArgs = do + eolOpt <- optionMaybe $ char '\n' + if isJust eolOpt + then return [] + else do + many1 space + arg <- try quotedArg <|> plainArg + otherargs <- parseArgs + return $ arg : otherargs + + where + + plainArg :: PandocMonad m => ManParser m String + plainArg = many1 $ noneOf " \t\n" + + quotedArg :: PandocMonad m => ManParser m String + quotedArg = do + char '"' + val <- many1 quotedChar + char '"' + return val + + quotedChar :: PandocMonad m => ManParser m Char + quotedChar = noneOf "\"\n" <|> try (string "\"\"" >> return '"') + +roffInline :: RoffState -> String -> (Maybe Inline) +roffInline rst str + | null str = Nothing + | inCodeBlock rst = Just $ Code nullAttr str + | otherwise = Just $ case fontKind rst of + Regular -> Str str + Italic -> Emph [Str str] + _ -> Strong [Str str] parseLine :: PandocMonad m => ManParser m Block parseLine = do - str <- many anyChar - return $ Plain [Str str] - -parseBlock :: PandocMonad m => ManParser m Block -parseBlock = do - choice [ parseMacro - , parseLine - ] + parts <- parseLineParts + newline + return $ if null parts + then Plain [LineBreak] + else Plain parts + where + parseLineParts :: PandocMonad m => ManParser m [Inline] + parseLineParts = do + lnpart <- many $ noneOf "\n\\" + ManState {rState = roffSt} <- getState + let inl = roffInline roffSt lnpart + others <- backSlash <|> return [] + return $ case inl of + Just x -> x:others + Nothing -> others + + backSlash :: PandocMonad m => ManParser m [Inline] + backSlash = do + char '\\' + esc <- choice [ char 'f' >> fEscape + , char '-' >> return (Just '-') + , Just <$> noneOf "\n" + ] + ManState {rState = roffSt} <- getState + case esc of + Just c -> case roffInline roffSt [c] of + Just inl -> do + oth <- parseLineParts + return $ inl : oth + Nothing -> parseLineParts + Nothing -> parseLineParts + where + + fEscape :: PandocMonad m => ManParser m (Maybe Char) + fEscape = choice [ char 'B' >> modifyRoffState (\rst -> rst {fontKind = Bold}) + , char 'I' >> modifyRoffState (\rst -> rst {fontKind = Italic}) + , char 'P' >> modifyRoffState (\rst -> rst {fontKind = Regular}) + ] + >> return Nothing + + parseMan :: PandocMonad m => ManParser m Pandoc parseMan = do - blocks <- parseBlock `sepBy` newline - - return $ Pandoc Meta{unMeta = empty} blocks \ No newline at end of file + blocks <- many (parseMacro <|> parseLine) + parserst <- pState <$> getState + return $ Pandoc (stateMeta parserst) blocks diff --git a/test/Tests/Readers/Man.hs b/test/Tests/Readers/Man.hs index 5dc91544b..007935be1 100644 --- a/test/Tests/Readers/Man.hs +++ b/test/Tests/Readers/Man.hs @@ -2,15 +2,30 @@ module Tests.Readers.Man (tests) where import Data.Text (Text) -import qualified Data.Text as T import Test.Tasty import Tests.Helpers import Text.Pandoc import Text.Pandoc.Arbitrary () import Text.Pandoc.Builder +import Text.Pandoc.Readers.Man -creole :: Text -> Pandoc -creole = purely $ readCreole def{ readerStandalone = True } +man :: Text -> Pandoc +man = purely $ readMan def + +infix 4 =: +(=:) :: ToString c + => String -> (Text, c) -> TestTree +(=:) = test man tests :: [TestTree] -tests = [] \ No newline at end of file +tests = [ + -- .SH "HEllo bbb" "aaa"" as" + testGroup "Macros" [ + "Bold" =: + ".B foo\n" + =?> strong "foo" + , "Italic" =: + ".I foo\n" + =?> emph "foo" + ] + ] diff --git a/test/test-pandoc.hs b/test/test-pandoc.hs index 1ea3591b2..9d4632f35 100644 --- a/test/test-pandoc.hs +++ b/test/test-pandoc.hs @@ -74,7 +74,7 @@ tests = testGroup "pandoc tests" [ Tests.Command.tests , testGroup "EPUB" Tests.Readers.EPUB.tests , testGroup "Muse" Tests.Readers.Muse.tests , testGroup "Creole" Tests.Readers.Creole.tests - , testGroup "Man" Tests.Readers + , testGroup "Man" Tests.Readers.Man.tests ] , testGroup "Lua filters" Tests.Lua.tests ] -- cgit v1.2.3 From 83902ffdb225b6b95e9a812c8daf08aa1e785df7 Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Wed, 9 May 2018 17:12:59 +0300 Subject: links, paragraphs, codeblocks --- src/Text/Pandoc/Readers/Man.hs | 99 ++++++++++++++++++++++++++++-------------- 1 file changed, 66 insertions(+), 33 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 166b7c7a7..0b9990899 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -35,7 +35,7 @@ module Text.Pandoc.Readers.Man where import Control.Monad.Except (throwError) import Data.Default (Default) import Data.Map (insert) -import Data.Maybe (isJust) +import Data.Maybe (isJust, maybeToList) import Data.List (intersperse, intercalate) import qualified Data.Text as T @@ -109,18 +109,26 @@ parseMacro = do macroName <- many1 (letter <|> oneOf ['\\', '"']) args <- parseArgs let joinedArgs = concat $ intersperse " " args + let toBold = return $ Plain [Strong [Str joinedArgs]] + let toBoldItalic = return $ Plain [Strong [Emph [Str joinedArgs]]] + let toItalic = return $ Plain [Emph [Str joinedArgs]] case macroName of "\\\"" -> return Null -- comment - "TH" -> macroTitle (if null args then "" else head args) + "TH" -> macroTitle (if null args then "" else head args) -- man-title + "TP" -> return Null -- tab-indented paragraph + "PP" -> return Null -- end of tab-indented paragraphs "nf" -> macroCodeBlock True >> return Null "fi" -> macroCodeBlock False >> return Null - "B" -> return $ Plain [Strong [Str joinedArgs]] - "BR" -> return $ Plain [Strong [Str joinedArgs]] - "BI" -> return $ Plain [Strong [Emph [Str joinedArgs]]] - "I" -> return $ Plain [Emph [Str joinedArgs]] + "B" -> toBold + "BR" -> return $ linkToMan joinedArgs + "BI" -> toBoldItalic + "IB" -> toBoldItalic + "I" -> toItalic + "IR" -> toItalic + "RI" -> toItalic "SH" -> return $ Header 2 nullAttr [Str joinedArgs] "sp" -> return $ Plain [LineBreak] - _ -> unkownMacro macroName args + _ -> unkownMacro macroName where @@ -131,21 +139,37 @@ parseMacro = do then return Null else return $ Header 1 nullAttr [Str mantitle] where - changeTitle title mst @ ManState{ pState = pst} = + changeTitle title mst@ManState{ pState = pst} = let meta = stateMeta pst metaUp = Meta $ insert "title" (MetaString title) (unMeta meta) in mst { pState = pst {stateMeta = metaUp} } - macroCodeBlock :: PandocMonad m => Bool -> ManParser m () macroCodeBlock insideCB = modifyRoffState (\rst -> rst{inCodeBlock = insideCB}) >> return () + + linkToMan :: String -> Block + linkToMan txt = case runParser linkParser () "" txt of + Right lnk -> Plain [lnk] + Left _ -> Plain [Emph [Str txt]] + where + linkParser :: Parsec String () Inline + linkParser = do + mpage <- many1 alphaNum + space + char '(' + mansect <- digit + char ')' + -- assuming man pages are generated from Linux-like repository + let manurl pagename section = "../"++section++"/"++pagename++"."++section + return $ Link nullAttr [Str txt] (manurl mpage [mansect], mpage) + - unkownMacro :: PandocMonad m => String -> [String] -> ManParser m Block - unkownMacro mname args = do + unkownMacro :: PandocMonad m => String -> ManParser m Block + unkownMacro mname = do pos <- getPosition logMessage $ SkippedContent ("Unknown macro: " ++ mname) pos - return $ Plain $ Str <$> args + return Null parseArgs :: PandocMonad m => ManParser m [String] parseArgs = do @@ -173,61 +197,70 @@ parseMacro = do quotedChar :: PandocMonad m => ManParser m Char quotedChar = noneOf "\"\n" <|> try (string "\"\"" >> return '"') -roffInline :: RoffState -> String -> (Maybe Inline) +roffInline :: RoffState -> String -> [Inline] roffInline rst str - | null str = Nothing - | inCodeBlock rst = Just $ Code nullAttr str - | otherwise = Just $ case fontKind rst of - Regular -> Str str - Italic -> Emph [Str str] - _ -> Strong [Str str] + | null str = [] + | inCodeBlock rst = [Code nullAttr str, LineBreak] + | otherwise = case fontKind rst of + Regular -> [Str str] + Italic -> [Emph [Str str]] + Bold -> [Strong [Str str]] + ItalicBold -> [Emph [Strong [Str str]]] parseLine :: PandocMonad m => ManParser m Block parseLine = do parts <- parseLineParts newline return $ if null parts - then Plain [LineBreak] + then Null else Plain parts where parseLineParts :: PandocMonad m => ManParser m [Inline] parseLineParts = do lnpart <- many $ noneOf "\n\\" ManState {rState = roffSt} <- getState - let inl = roffInline roffSt lnpart + let inls = roffInline roffSt lnpart others <- backSlash <|> return [] - return $ case inl of - Just x -> x:others - Nothing -> others + return $ inls ++ others backSlash :: PandocMonad m => ManParser m [Inline] backSlash = do char '\\' esc <- choice [ char 'f' >> fEscape , char '-' >> return (Just '-') + , char '%' >> return Nothing , Just <$> noneOf "\n" ] ManState {rState = roffSt} <- getState case esc of - Just c -> case roffInline roffSt [c] of - Just inl -> do - oth <- parseLineParts - return $ inl : oth - Nothing -> parseLineParts + Just c -> let inls = roffInline roffSt [c] + in parseLineParts >>= (\oth -> return $ inls ++ oth) Nothing -> parseLineParts where fEscape :: PandocMonad m => ManParser m (Maybe Char) fEscape = choice [ char 'B' >> modifyRoffState (\rst -> rst {fontKind = Bold}) , char 'I' >> modifyRoffState (\rst -> rst {fontKind = Italic}) - , char 'P' >> modifyRoffState (\rst -> rst {fontKind = Regular}) + , (char 'P' <|> anyChar) >> modifyRoffState (\rst -> rst {fontKind = Regular}) ] >> return Nothing - - + +createParas :: [Block] -> [Block] +createParas bs = inner bs [] where + inner :: [Block] -> [Inline] -> [Block] + inner [] inls = maybeToList $ inlinesToPara inls + inner (Plain einls : oth) inls = inner oth (inls ++ einls) + inner (block : oth) inls = case inlinesToPara inls of + Just par -> par : block : inner oth [] + Nothing -> block : inner oth [] + + inlinesToPara :: [Inline] -> Maybe Block + inlinesToPara [] = Nothing + inlinesToPara inls = Just $ Para (intersperse (Str " ") inls) + parseMan :: PandocMonad m => ManParser m Pandoc parseMan = do - blocks <- many (parseMacro <|> parseLine) + blocks <- createParas <$> many (parseMacro <|> parseLine) parserst <- pState <$> getState return $ Pandoc (stateMeta parserst) blocks -- cgit v1.2.3 From 34f9ac9dbf8615e5dc8a8f803385d929bfa585c1 Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Wed, 9 May 2018 19:25:24 +0300 Subject: codeblock handling --- src/Text/Pandoc/Readers/Man.hs | 82 +++++++++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 21 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 0b9990899..23ac3aeff 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -35,7 +35,7 @@ module Text.Pandoc.Readers.Man where import Control.Monad.Except (throwError) import Data.Default (Default) import Data.Map (insert) -import Data.Maybe (isJust, maybeToList) +import Data.Maybe (isJust, fromMaybe) import Data.List (intersperse, intercalate) import qualified Data.Text as T @@ -109,9 +109,13 @@ parseMacro = do macroName <- many1 (letter <|> oneOf ['\\', '"']) args <- parseArgs let joinedArgs = concat $ intersperse " " args - let toBold = return $ Plain [Strong [Str joinedArgs]] - let toBoldItalic = return $ Plain [Strong [Emph [Str joinedArgs]]] - let toItalic = return $ Plain [Emph [Str joinedArgs]] + ManState { rState = rst } <- getState + let toTextF transf = if inCodeBlock rst then [Code nullAttr joinedArgs] else transf [Str joinedArgs] + let toText = return . Plain . toTextF + let toBold = toText (\s -> [Strong s]) + let toItalic = toText (\s -> [Emph s]) + let toBoldItalic = toText (\s -> [Strong [Emph s]]) + case macroName of "\\\"" -> return Null -- comment "TH" -> macroTitle (if null args then "" else head args) -- man-title @@ -120,14 +124,14 @@ parseMacro = do "nf" -> macroCodeBlock True >> return Null "fi" -> macroCodeBlock False >> return Null "B" -> toBold - "BR" -> return $ linkToMan joinedArgs + "BR" -> return $ macroBR joinedArgs (inCodeBlock rst) "BI" -> toBoldItalic "IB" -> toBoldItalic "I" -> toItalic "IR" -> toItalic "RI" -> toItalic "SH" -> return $ Header 2 nullAttr [Str joinedArgs] - "sp" -> return $ Plain [LineBreak] + "sp" -> return $ if inCodeBlock rst then Null else Plain [LineBreak] _ -> unkownMacro macroName where @@ -148,10 +152,14 @@ parseMacro = do macroCodeBlock :: PandocMonad m => Bool -> ManParser m () macroCodeBlock insideCB = modifyRoffState (\rst -> rst{inCodeBlock = insideCB}) >> return () - linkToMan :: String -> Block + macroBR :: String -> Bool -> Block + macroBR txt inCode | inCode = Plain [Code nullAttr txt] + | otherwise = fromMaybe (Plain [Strong [Str txt]]) (linkToMan txt) + + linkToMan :: String -> Maybe Block linkToMan txt = case runParser linkParser () "" txt of - Right lnk -> Plain [lnk] - Left _ -> Plain [Emph [Str txt]] + Right lnk -> Just $ Plain [lnk] + Left _ -> Nothing where linkParser :: Parsec String () Inline linkParser = do @@ -199,8 +207,8 @@ parseMacro = do roffInline :: RoffState -> String -> [Inline] roffInline rst str - | null str = [] - | inCodeBlock rst = [Code nullAttr str, LineBreak] + | null str && (not $ inCodeBlock rst) = [] + | inCodeBlock rst = [Code nullAttr str] | otherwise = case fontKind rst of Regular -> [Str str] Italic -> [Emph [Str str]] @@ -245,19 +253,51 @@ parseLine = do ] >> return Nothing +finds :: (a -> Bool) -> [a] -> ([a], [a]) +finds predic els = let matched = finds' els + in (matched, drop (length matched) els) where + finds' [] = [] + finds' (e:es) | predic e = e : finds' es + | otherwise = [] + +-- | return (matched, notmatched, others) +findsBoth :: (a -> Bool) -> [a] -> ([a], [a], [a]) +findsBoth predic els = + let (matched, els') = finds predic els + (notmatched, els'') = finds (not . predic) els' + in (matched, notmatched, els'') + createParas :: [Block] -> [Block] createParas bs = inner bs [] where inner :: [Block] -> [Inline] -> [Block] - inner [] inls = maybeToList $ inlinesToPara inls - inner (Plain einls : oth) inls = inner oth (inls ++ einls) - inner (block : oth) inls = case inlinesToPara inls of - Just par -> par : block : inner oth [] - Nothing -> block : inner oth [] - - inlinesToPara :: [Inline] -> Maybe Block - inlinesToPara [] = Nothing - inlinesToPara inls = Just $ Para (intersperse (Str " ") inls) - + inner [] inls = plainInlinesToPara inls + inner (Plain einls : oth) inls = inner oth (inls ++ joinCode einls) + inner (block : oth) inls = (plainInlinesToPara inls ++ [block]) ++ inner oth [] + + joinCode :: [Inline] -> [Inline] + joinCode inls = + let (codes, notcodes) = finds isCode inls + codeStr (Code _ s) = s + codeStr _ = "" + joined = Code nullAttr (concat $ codeStr <$> codes) + in if null codes + then notcodes + else joined : notcodes + + plainInlinesToPara :: [Inline] -> [Block] + plainInlinesToPara [] = [] + plainInlinesToPara inls = + let (cds, ncds, oth) = findsBoth isCode inls + codeToStr (Code _ s) = s + codeToStr _ = "" + cbs = if null cds + then [] + else [CodeBlock nullAttr (intercalate "\n" $ codeToStr <$> cds)] + paras = [Para (intersperse (Str " ") ncds)] + in cbs ++ paras ++ plainInlinesToPara oth + + isCode (Code _ _) = True + isCode _ = False parseMan :: PandocMonad m => ManParser m Pandoc parseMan = do -- cgit v1.2.3 From 8e9973b9f761262b6871206f741ac3f2a25aa6bb Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Wed, 9 May 2018 19:32:50 +0300 Subject: remove debug code --- src/Text/Pandoc/Readers/Man.hs | 47 +++++++++++------------------------------- 1 file changed, 12 insertions(+), 35 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 23ac3aeff..70de68d1f 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -30,7 +30,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of man to 'Pandoc' document. -} -module Text.Pandoc.Readers.Man where +module Text.Pandoc.Readers.Man (readMan) where import Control.Monad.Except (throwError) import Data.Default (Default) @@ -39,12 +39,11 @@ import Data.Maybe (isJust, fromMaybe) import Data.List (intersperse, intercalate) import qualified Data.Text as T -import Text.Pandoc.Class (PandocMonad(..), runPure) +import Text.Pandoc.Class (PandocMonad(..)) import Text.Pandoc.Definition -import Text.Pandoc.Error (PandocError) import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Options -import Text.Pandoc.Parsing hiding (enclosed) +import Text.Pandoc.Parsing import Text.Pandoc.Shared (crFilter) import Text.Parsec import Text.Parsec.Char () @@ -71,37 +70,6 @@ modifyRoffState f = do type ManParser m = ParserT [Char] ManState m -testStrr :: [Char] -> SourceName -> Either PandocError (Either ParseError Pandoc) -testStrr s srcnm = runPure (runParserT parseMan (ManState {pState=def, rState=def}) srcnm s) - -printPandoc :: Pandoc -> [Char] -printPandoc (Pandoc m content) = - let ttl = "Pandoc: " ++ (show $ unMeta m) - cnt = intercalate "\n" $ map show content - in ttl ++ "\n" ++ cnt - -strrepr :: (Show a2, Show a1) => Either a2 (Either a1 Pandoc) -> [Char] -strrepr obj = case obj of - Right x -> case x of - Right x' -> printPandoc x' - Left y' -> show y' - Left y -> show y - -testFile :: FilePath -> IO () -testFile fname = do - cont <- readFile fname - putStrLn . strrepr $ testStrr cont fname - - --- | Read man (troff) from an input string and return a Pandoc document. -readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc -readMan opts txt = do - let state = ManState { pState = def{ stateOptions = opts }, rState = def} - parsed <- readWithM parseMan state (T.unpack $ crFilter txt) - case parsed of - Right result -> return result - Left e -> throwError e - parseMacro :: PandocMonad m => ManParser m Block parseMacro = do char '.' <|> char '\'' @@ -304,3 +272,12 @@ parseMan = do blocks <- createParas <$> many (parseMacro <|> parseLine) parserst <- pState <$> getState return $ Pandoc (stateMeta parserst) blocks + +-- | Read man (troff) from an input string and return a Pandoc document. +readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc +readMan opts txt = do + let state = ManState { pState = def{ stateOptions = opts }, rState = def} + parsed <- readWithM parseMan state (T.unpack $ crFilter txt) + case parsed of + Right result -> return result + Left e -> throwError e -- cgit v1.2.3 From ad19166bc308a2428bd040851a2a97c76e8873f9 Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Wed, 9 May 2018 20:40:37 +0300 Subject: fix build and tests --- src/Text/Pandoc/Readers/Man.hs | 1 + test/Tests/Readers/Man.hs | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 70de68d1f..fe66bb61c 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -32,6 +32,7 @@ Conversion of man to 'Pandoc' document. -} module Text.Pandoc.Readers.Man (readMan) where +import Prelude import Control.Monad.Except (throwError) import Data.Default (Default) import Data.Map (insert) diff --git a/test/Tests/Readers/Man.hs b/test/Tests/Readers/Man.hs index 007935be1..a9fe324d1 100644 --- a/test/Tests/Readers/Man.hs +++ b/test/Tests/Readers/Man.hs @@ -1,6 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} module Tests.Readers.Man (tests) where +import Prelude import Data.Text (Text) import Test.Tasty import Tests.Helpers @@ -23,9 +24,9 @@ tests = [ testGroup "Macros" [ "Bold" =: ".B foo\n" - =?> strong "foo" + =?> (para $ strong "foo") , "Italic" =: ".I foo\n" - =?> emph "foo" + =?> (para $ emph "foo") ] ] -- cgit v1.2.3 From 6f793b5a63618966a7611da325ecc90bd30da8a2 Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Sat, 19 May 2018 23:09:14 +0300 Subject: tokenisation --- src/Text/Pandoc/Readers/Man.hs | 306 +++++++++++++++++++++++------------------ 1 file changed, 175 insertions(+), 131 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index fe66bb61c..dfe1bcdc1 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -35,6 +35,7 @@ module Text.Pandoc.Readers.Man (readMan) where import Prelude import Control.Monad.Except (throwError) import Data.Default (Default) +import Data.Functor.Identity (Identity) import Data.Map (insert) import Data.Maybe (isJust, fromMaybe) import Data.List (intersperse, intercalate) @@ -46,11 +47,38 @@ import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Options import Text.Pandoc.Parsing import Text.Pandoc.Shared (crFilter) -import Text.Parsec +import Text.Parsec hiding (tokenPrim) import Text.Parsec.Char () +import Text.Parsec.Pos (updatePosString) + +-- +-- Data Types +-- data FontKind = Regular | Italic | Bold | ItalicBold deriving Show +data MacroKind = KTitle + | KCodeBlStart + | KCodeBlEnd + | KTab + | KTabEnd + deriving Show + +data ManToken = MStr String FontKind + | MLine [(String, FontKind)] + | MLink String Target + | MEmptyLine + | MHeader Integer String + | MMacro MacroKind [String] + | MUnknownMacro String [String] + | MComment String + deriving Show + +data EscapeThing = EFont FontKind + | EChar Char + | ENothing + deriving Show + data RoffState = RoffState { inCodeBlock :: Bool , fontKind :: FontKind } deriving Show @@ -60,48 +88,60 @@ instance Default RoffState where data ManState = ManState {pState :: ParserState, rState :: RoffState} +type ManParser m = ParserT [Char] ManState m +type ManCompiler m = ParserT [ManToken] ManState m + instance HasLogMessages ManState where addLogMessage lm mst = mst {pState = addLogMessage lm (pState mst)} getLogMessages mst = getLogMessages $ pState mst +-- | Read man (troff) from an input string and return a Pandoc document. +readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc +readMan opts txt = do + let state = ManState { pState = def{ stateOptions = opts }, rState = def} + parsed <- readWithM parseMan state (T.unpack $ crFilter txt) + case parsed of + Right result -> return result + Left e -> throwError e + +-- +-- String -> ManToken function +-- + +parseMan :: PandocMonad m => ManParser m Pandoc +parseMan = do + tokens <- many (parseMacro <|> parseLine <|> parseEmptyLine) + let blocks = [] + parserst <- pState <$> getState + return $ Pandoc (stateMeta parserst) blocks + modifyRoffState :: PandocMonad m => (RoffState -> RoffState) -> ParsecT a ManState m () modifyRoffState f = do mst <- getState setState mst { rState = f $ rState mst } -type ManParser m = ParserT [Char] ManState m - -parseMacro :: PandocMonad m => ManParser m Block +parseMacro :: PandocMonad m => ManParser m ManToken parseMacro = do char '.' <|> char '\'' many space macroName <- many1 (letter <|> oneOf ['\\', '"']) args <- parseArgs let joinedArgs = concat $ intersperse " " args - ManState { rState = rst } <- getState - let toTextF transf = if inCodeBlock rst then [Code nullAttr joinedArgs] else transf [Str joinedArgs] - let toText = return . Plain . toTextF - let toBold = toText (\s -> [Strong s]) - let toItalic = toText (\s -> [Emph s]) - let toBoldItalic = toText (\s -> [Strong [Emph s]]) - - case macroName of - "\\\"" -> return Null -- comment - "TH" -> macroTitle (if null args then "" else head args) -- man-title - "TP" -> return Null -- tab-indented paragraph - "PP" -> return Null -- end of tab-indented paragraphs - "nf" -> macroCodeBlock True >> return Null - "fi" -> macroCodeBlock False >> return Null - "B" -> toBold - "BR" -> return $ macroBR joinedArgs (inCodeBlock rst) - "BI" -> toBoldItalic - "IB" -> toBoldItalic - "I" -> toItalic - "IR" -> toItalic - "RI" -> toItalic - "SH" -> return $ Header 2 nullAttr [Str joinedArgs] - "sp" -> return $ if inCodeBlock rst then Null else Plain [LineBreak] - _ -> unkownMacro macroName + + let tok = case macroName of + x | x `elem` ["\\\"", "\\#"] -> MComment joinedArgs + "TH" -> MMacro KTitle args + "TP" -> MMacro KTab [] + "PP" -> MMacro KTabEnd [] + "nf" -> MMacro KCodeBlStart [] + "fi" -> MMacro KCodeBlEnd [] + x | x `elem` ["B", "BR"] -> MStr joinedArgs Bold -- "BR" is often used as a link to another man + x | x `elem` ["BI", "IB"] -> MStr joinedArgs ItalicBold + x | x `elem` ["I", "IR", "RI"] -> MStr joinedArgs Italic + "SH" -> MHeader 2 joinedArgs + "sp" -> MEmptyLine + _ -> MUnknownMacro macroName args + return tok where @@ -174,111 +214,115 @@ parseMacro = do quotedChar :: PandocMonad m => ManParser m Char quotedChar = noneOf "\"\n" <|> try (string "\"\"" >> return '"') -roffInline :: RoffState -> String -> [Inline] -roffInline rst str - | null str && (not $ inCodeBlock rst) = [] - | inCodeBlock rst = [Code nullAttr str] - | otherwise = case fontKind rst of - Regular -> [Str str] - Italic -> [Emph [Str str]] - Bold -> [Strong [Str str]] - ItalicBold -> [Emph [Strong [Str str]]] - -parseLine :: PandocMonad m => ManParser m Block +escapeParser :: PandocMonad m => ManParser m EscapeThing +escapeParser = do + char '\\' + choice [escChar, escFont] + where + + escChar :: PandocMonad m => ManParser m EscapeThing + escChar = choice [ char '-' >> return (EChar '-') + , oneOf ['%', '{', '}'] >> return ENothing + ] + + escFont :: PandocMonad m => ManParser m EscapeThing + escFont = do + char 'f' + font <- choice [ char 'B' >> return Bold + , char 'I' >> return Italic + , (char 'P' <|> anyChar) >> return Regular + , char '(' >> anyChar >> anyChar >> return Regular + , string "[]" >> return Regular + , char '[' >> many1 letter >> char ']' >> return Regular + ] + modifyRoffState (\r -> RoffState {fontKind = font}) + return $ EFont font + +parseLine :: PandocMonad m => ManParser m ManToken parseLine = do - parts <- parseLineParts - newline - return $ if null parts - then Null - else Plain parts + lnparts <- many1 (esc <|> linePart) + return $ MLine lnparts where - parseLineParts :: PandocMonad m => ManParser m [Inline] - parseLineParts = do - lnpart <- many $ noneOf "\n\\" - ManState {rState = roffSt} <- getState - let inls = roffInline roffSt lnpart - others <- backSlash <|> return [] - return $ inls ++ others + + esc :: PandocMonad m => ManParser m (String, FontKind) + esc = do + someesc <- escapeParser + font <- currentFont + let rv = case someesc of + EChar c -> ([c], font) + _ -> ("", font) + return rv + + linePart :: PandocMonad m => ManParser m (String, FontKind) + linePart = do + lnpart <- many1 $ noneOf "\n\\" + font <- currentFont + return (lnpart, font) + + currentFont :: PandocMonad m => ManParser m FontKind + currentFont = do + RoffState {fontKind = fk} <- rState <$> getState + return fk + - backSlash :: PandocMonad m => ManParser m [Inline] - backSlash = do - char '\\' - esc <- choice [ char 'f' >> fEscape - , char '-' >> return (Just '-') - , char '%' >> return Nothing - , Just <$> noneOf "\n" - ] - ManState {rState = roffSt} <- getState - case esc of - Just c -> let inls = roffInline roffSt [c] - in parseLineParts >>= (\oth -> return $ inls ++ oth) - Nothing -> parseLineParts - where - - fEscape :: PandocMonad m => ManParser m (Maybe Char) - fEscape = choice [ char 'B' >> modifyRoffState (\rst -> rst {fontKind = Bold}) - , char 'I' >> modifyRoffState (\rst -> rst {fontKind = Italic}) - , (char 'P' <|> anyChar) >> modifyRoffState (\rst -> rst {fontKind = Regular}) - ] - >> return Nothing - -finds :: (a -> Bool) -> [a] -> ([a], [a]) -finds predic els = let matched = finds' els - in (matched, drop (length matched) els) where - finds' [] = [] - finds' (e:es) | predic e = e : finds' es - | otherwise = [] - --- | return (matched, notmatched, others) -findsBoth :: (a -> Bool) -> [a] -> ([a], [a], [a]) -findsBoth predic els = - let (matched, els') = finds predic els - (notmatched, els'') = finds (not . predic) els' - in (matched, notmatched, els'') - -createParas :: [Block] -> [Block] -createParas bs = inner bs [] where - inner :: [Block] -> [Inline] -> [Block] - inner [] inls = plainInlinesToPara inls - inner (Plain einls : oth) inls = inner oth (inls ++ joinCode einls) - inner (block : oth) inls = (plainInlinesToPara inls ++ [block]) ++ inner oth [] - - joinCode :: [Inline] -> [Inline] - joinCode inls = - let (codes, notcodes) = finds isCode inls - codeStr (Code _ s) = s - codeStr _ = "" - joined = Code nullAttr (concat $ codeStr <$> codes) - in if null codes - then notcodes - else joined : notcodes - - plainInlinesToPara :: [Inline] -> [Block] - plainInlinesToPara [] = [] - plainInlinesToPara inls = - let (cds, ncds, oth) = findsBoth isCode inls - codeToStr (Code _ s) = s - codeToStr _ = "" - cbs = if null cds - then [] - else [CodeBlock nullAttr (intercalate "\n" $ codeToStr <$> cds)] - paras = [Para (intersperse (Str " ") ncds)] - in cbs ++ paras ++ plainInlinesToPara oth - - isCode (Code _ _) = True - isCode _ = False +parseEmptyLine :: PandocMonad m => ManParser m ManToken +parseEmptyLine = char '\n' >> return MEmptyLine -parseMan :: PandocMonad m => ManParser m Pandoc -parseMan = do - blocks <- createParas <$> many (parseMacro <|> parseLine) - parserst <- pState <$> getState - return $ Pandoc (stateMeta parserst) blocks +-- +-- ManToken parsec functions +-- + +msatisfy :: (Show t, Stream s m t) => (t -> Bool) -> ParserT s st m t +msatisfy pred = tokenPrim show nextPos testTok + where + posFromTok (pos,t) = pos + testTok t = if pred t then Just t else Nothing + nextPos pos x xs = updatePosString pos (show x) + +mstr :: PandocMonad m => ManCompiler m ManToken +mstr = msatisfy isMStr where + isMStr (MStr _ _) = True + isMStr _ = False + +mline :: PandocMonad m => ManCompiler m ManToken +mline = msatisfy isMLine where + isMLine (MLine _) = True + isMLine _ = False + +mlink :: PandocMonad m => ManCompiler m ManToken +mlink = msatisfy isMLink where + isMLink (MLink _ _) = True + isMLink _ = False + +memplyLine :: PandocMonad m => ManCompiler m ManToken +memplyLine = msatisfy isMEmptyLine where + isMEmptyLine MEmptyLine = True + isMEmptyLine _ = False + +mheader :: PandocMonad m => ManCompiler m ManToken +mheader = msatisfy isMHeader where + isMHeader (MHeader _ _) = True + isMHeader _ = False + +mmacro :: PandocMonad m => ManCompiler m ManToken +mmacro = msatisfy isMMacro where + isMMacro (MMacro _ _) = True + isMMacro _ = False + +munknownMacro :: PandocMonad m => ManCompiler m ManToken +munknownMacro = msatisfy isMUnknownMacro where + isMUnknownMacro (MUnknownMacro _ _) = True + isMUnknownMacro _ = False + +mcomment :: PandocMonad m => ManCompiler m ManToken +mcomment = msatisfy isMComment where + isMComment (MComment _) = True + isMComment _ = False + +-- +-- ManToken -> Block functions +-- + +compileHeader :: PandocMonad m => ManCompiler m Block +compileHeader = undefined --do --- | Read man (troff) from an input string and return a Pandoc document. -readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc -readMan opts txt = do - let state = ManState { pState = def{ stateOptions = opts }, rState = def} - parsed <- readWithM parseMan state (T.unpack $ crFilter txt) - case parsed of - Right result -> return result - Left e -> throwError e -- cgit v1.2.3 From 533d4505075b15235a5ae0d56cc0467a67b124bc Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Sun, 20 May 2018 01:51:53 +0300 Subject: compiling paragraphs --- src/Text/Pandoc/Readers/Man.hs | 179 ++++++++++++++++++++++++++++------------- 1 file changed, 124 insertions(+), 55 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index dfe1bcdc1..d7be9aee3 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -30,19 +30,20 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of man to 'Pandoc' document. -} -module Text.Pandoc.Readers.Man (readMan) where +module Text.Pandoc.Readers.Man (readMan, testFile) where import Prelude +import Control.Monad (liftM) import Control.Monad.Except (throwError) import Data.Default (Default) -import Data.Functor.Identity (Identity) import Data.Map (insert) -import Data.Maybe (isJust, fromMaybe) +import Data.Maybe (isJust, catMaybes) import Data.List (intersperse, intercalate) import qualified Data.Text as T -import Text.Pandoc.Class (PandocMonad(..)) +import Text.Pandoc.Class (PandocMonad(..), runPure, runIOorExplode) import Text.Pandoc.Definition +import Text.Pandoc.Error (PandocError (PandocParsecError)) import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Options import Text.Pandoc.Parsing @@ -62,7 +63,7 @@ data MacroKind = KTitle | KCodeBlEnd | KTab | KTabEnd - deriving Show + deriving (Show, Eq) data ManToken = MStr String FontKind | MLine [(String, FontKind)] @@ -95,23 +96,67 @@ instance HasLogMessages ManState where addLogMessage lm mst = mst {pState = addLogMessage lm (pState mst)} getLogMessages mst = getLogMessages $ pState mst +---- +testStrr :: [Char] -> Either PandocError Pandoc +testStrr s = runPure $ readMan def (T.pack s) + +printPandoc :: Pandoc -> [Char] +printPandoc (Pandoc m content) = + let ttl = "Pandoc: " ++ (show $ unMeta m) + cnt = intercalate "\n" $ map show content + in ttl ++ "\n" ++ cnt + +strrepr :: Either PandocError Pandoc -> [Char] +strrepr obj = case obj of + Right x -> printPandoc x + Left y -> show y + +testFile :: FilePath -> IO () +testFile fname = do + cont <- readFile fname + pand <- runIOorExplode $ readMan def (T.pack cont) + putStrLn $ printPandoc pand +---- + + -- | Read man (troff) from an input string and return a Pandoc document. readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc readMan opts txt = do let state = ManState { pState = def{ stateOptions = opts }, rState = def} - parsed <- readWithM parseMan state (T.unpack $ crFilter txt) - case parsed of - Right result -> return result + eithertokens <- readWithM parseMan state (T.unpack $ crFilter txt) + case eithertokens of + Right tokenz -> do + eitherdoc <- readWithMTokens compileMan state tokenz + case eitherdoc of + Right doc -> return doc + Left e -> throwError e Left e -> throwError e + where + + readWithMTokens :: PandocMonad m + => ParserT [ManToken] ManState m a -- ^ parser + -> ManState -- ^ initial state + -> [ManToken] -- ^ input + -> m (Either PandocError a) + readWithMTokens parser state input = + mapLeft (PandocParsecError . concat $ show <$> input) `liftM` runParserT parser state "source" input + + mapLeft :: (a -> c) -> Either a b -> Either c b + mapLeft f (Left x) = Left $ f x + mapLeft _ (Right r) = Right r + -- -- String -> ManToken function -- -parseMan :: PandocMonad m => ManParser m Pandoc -parseMan = do - tokens <- many (parseMacro <|> parseLine <|> parseEmptyLine) - let blocks = [] +parseMan :: PandocMonad m => ManParser m [ManToken] +parseMan = many (parseMacro <|> parseLine <|> parseEmptyLine) + +compileMan :: PandocMonad m => ManCompiler m Pandoc +compileMan = do + let compilers = [compileTitle, compilePara, compileSkippedContent] + blocks <- many $ choice compilers parserst <- pState <$> getState return $ Pandoc (stateMeta parserst) blocks @@ -145,26 +190,6 @@ parseMacro = do where - macroTitle :: PandocMonad m => String -> ManParser m Block - macroTitle mantitle = do - modifyState (changeTitle mantitle) - if null mantitle - then return Null - else return $ Header 1 nullAttr [Str mantitle] - where - changeTitle title mst@ManState{ pState = pst} = - let meta = stateMeta pst - metaUp = Meta $ insert "title" (MetaString title) (unMeta meta) - in - mst { pState = pst {stateMeta = metaUp} } - - macroCodeBlock :: PandocMonad m => Bool -> ManParser m () - macroCodeBlock insideCB = modifyRoffState (\rst -> rst{inCodeBlock = insideCB}) >> return () - - macroBR :: String -> Bool -> Block - macroBR txt inCode | inCode = Plain [Code nullAttr txt] - | otherwise = fromMaybe (Plain [Strong [Str txt]]) (linkToMan txt) - linkToMan :: String -> Maybe Block linkToMan txt = case runParser linkParser () "" txt of Right lnk -> Just $ Plain [lnk] @@ -180,13 +205,6 @@ parseMacro = do -- assuming man pages are generated from Linux-like repository let manurl pagename section = "../"++section++"/"++pagename++"."++section return $ Link nullAttr [Str txt] (manurl mpage [mansect], mpage) - - - unkownMacro :: PandocMonad m => String -> ManParser m Block - unkownMacro mname = do - pos <- getPosition - logMessage $ SkippedContent ("Unknown macro: " ++ mname) pos - return Null parseArgs :: PandocMonad m => ManParser m [String] parseArgs = do @@ -235,29 +253,30 @@ escapeParser = do , string "[]" >> return Regular , char '[' >> many1 letter >> char ']' >> return Regular ] - modifyRoffState (\r -> RoffState {fontKind = font}) + modifyRoffState (\r -> r {fontKind = font}) return $ EFont font parseLine :: PandocMonad m => ManParser m ManToken parseLine = do lnparts <- many1 (esc <|> linePart) - return $ MLine lnparts + newline + return $ MLine $ catMaybes lnparts where - esc :: PandocMonad m => ManParser m (String, FontKind) + esc :: PandocMonad m => ManParser m (Maybe (String, FontKind)) esc = do someesc <- escapeParser font <- currentFont let rv = case someesc of - EChar c -> ([c], font) - _ -> ("", font) + EChar c -> Just ([c], font) + _ -> Nothing return rv - linePart :: PandocMonad m => ManParser m (String, FontKind) + linePart :: PandocMonad m => ManParser m (Maybe (String, FontKind)) linePart = do lnpart <- many1 $ noneOf "\n\\" font <- currentFont - return (lnpart, font) + return $ Just (lnpart, font) currentFont :: PandocMonad m => ManParser m FontKind currentFont = do @@ -273,11 +292,10 @@ parseEmptyLine = char '\n' >> return MEmptyLine -- msatisfy :: (Show t, Stream s m t) => (t -> Bool) -> ParserT s st m t -msatisfy pred = tokenPrim show nextPos testTok +msatisfy predic = tokenPrim show nextPos testTok where - posFromTok (pos,t) = pos - testTok t = if pred t then Just t else Nothing - nextPos pos x xs = updatePosString pos (show x) + testTok t = if predic t then Just t else Nothing + nextPos pos x _xs = updatePosString pos (show x) mstr :: PandocMonad m => ManCompiler m ManToken mstr = msatisfy isMStr where @@ -304,9 +322,10 @@ mheader = msatisfy isMHeader where isMHeader (MHeader _ _) = True isMHeader _ = False -mmacro :: PandocMonad m => ManCompiler m ManToken -mmacro = msatisfy isMMacro where - isMMacro (MMacro _ _) = True +mmacro :: PandocMonad m => MacroKind -> ManCompiler m ManToken +mmacro mk = msatisfy isMMacro where + isMMacro (MMacro mk' _) | mk == mk' = True + | otherwise = False isMMacro _ = False munknownMacro :: PandocMonad m => ManCompiler m ManToken @@ -323,6 +342,56 @@ mcomment = msatisfy isMComment where -- ManToken -> Block functions -- -compileHeader :: PandocMonad m => ManCompiler m Block -compileHeader = undefined --do +compileTitle :: PandocMonad m => ManCompiler m Block +compileTitle = do + (MMacro _ args) <- mmacro KTitle + if null args + then return Null + else do + let mantitle = head args + modifyState (changeTitle mantitle) + return $ Header 1 nullAttr [Str mantitle] + where + changeTitle title mst@ManState{ pState = pst} = + let meta = stateMeta pst + metaUp = Meta $ insert "title" (MetaString title) (unMeta meta) + in + mst { pState = pst {stateMeta = metaUp} } + +compileSkippedContent :: PandocMonad m => ManCompiler m Block +compileSkippedContent = do + tok <- munknownMacro <|> mcomment <|> memplyLine + onToken tok + return Null + + where + + onToken :: PandocMonad m => ManToken -> ManCompiler m () + onToken (MUnknownMacro mname _) = do + pos <- getPosition + logMessage $ SkippedContent ("Unknown macro: " ++ mname) pos + onToken _ = return () + +strToInline :: String -> FontKind -> Inline +strToInline s Regular = Str s +strToInline s Italic = Emph [Str s] +strToInline s Bold = Strong [Str s] +strToInline s ItalicBold = Strong [Emph [Str s]] + +compilePara :: PandocMonad m => ManCompiler m Block +compilePara = do + inls <- many1 (strInl <|> lineInl) + let withspaces = intersperse [Str " "] inls + return $ Para (concat withspaces) + + where + + strInl :: PandocMonad m => ManCompiler m [Inline] + strInl = do + (MStr str fk) <- mstr + return [strToInline str fk] + lineInl :: PandocMonad m => ManCompiler m [Inline] + lineInl = do + (MLine fragments) <- mline + return $ fmap (\(s,f) -> strToInline s f) fragments -- cgit v1.2.3 From d8c51ad788bb896fbff45c7da9285ebe93d4865d Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Sun, 20 May 2018 02:54:24 +0300 Subject: states, code block compiling --- src/Text/Pandoc/Readers/Man.hs | 97 +++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 35 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index d7be9aee3..b23daa6b3 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -41,7 +41,7 @@ import Data.Maybe (isJust, catMaybes) import Data.List (intersperse, intercalate) import qualified Data.Text as T -import Text.Pandoc.Class (PandocMonad(..), runPure, runIOorExplode) +import Text.Pandoc.Class (PandocMonad(..), runIOorExplode) import Text.Pandoc.Definition import Text.Pandoc.Error (PandocError (PandocParsecError)) import Text.Pandoc.Logging (LogMessage(..)) @@ -69,7 +69,7 @@ data ManToken = MStr String FontKind | MLine [(String, FontKind)] | MLink String Target | MEmptyLine - | MHeader Integer String + | MHeader Int String | MMacro MacroKind [String] | MUnknownMacro String [String] | MComment String @@ -80,25 +80,18 @@ data EscapeThing = EFont FontKind | ENothing deriving Show -data RoffState = RoffState { inCodeBlock :: Bool - , fontKind :: FontKind +data RoffState = RoffState { fontKind :: FontKind } deriving Show instance Default RoffState where - def = RoffState {inCodeBlock = False, fontKind = Regular} + def = RoffState {fontKind = Regular} -data ManState = ManState {pState :: ParserState, rState :: RoffState} - -type ManParser m = ParserT [Char] ManState m -type ManCompiler m = ParserT [ManToken] ManState m - -instance HasLogMessages ManState where - addLogMessage lm mst = mst {pState = addLogMessage lm (pState mst)} - getLogMessages mst = getLogMessages $ pState mst +type ManParser m = ParserT [Char] RoffState m +type ManCompiler m = ParserT [ManToken] ParserState m ---- -testStrr :: [Char] -> Either PandocError Pandoc -testStrr s = runPure $ readMan def (T.pack s) +-- testStrr :: [Char] -> Either PandocError Pandoc +-- testStrr s = runPure $ readMan def (T.pack s) printPandoc :: Pandoc -> [Char] printPandoc (Pandoc m content) = @@ -106,10 +99,10 @@ printPandoc (Pandoc m content) = cnt = intercalate "\n" $ map show content in ttl ++ "\n" ++ cnt -strrepr :: Either PandocError Pandoc -> [Char] -strrepr obj = case obj of - Right x -> printPandoc x - Left y -> show y +-- strrepr :: Either PandocError Pandoc -> [Char] +-- strrepr obj = case obj of +-- Right x -> printPandoc x +-- Left y -> show y testFile :: FilePath -> IO () testFile fname = do @@ -122,10 +115,10 @@ testFile fname = do -- | Read man (troff) from an input string and return a Pandoc document. readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc readMan opts txt = do - let state = ManState { pState = def{ stateOptions = opts }, rState = def} - eithertokens <- readWithM parseMan state (T.unpack $ crFilter txt) + eithertokens <- readWithM parseMan def (T.unpack $ crFilter txt) case eithertokens of Right tokenz -> do + let state = def {stateOptions = opts} :: ParserState eitherdoc <- readWithMTokens compileMan state tokenz case eitherdoc of Right doc -> return doc @@ -135,8 +128,8 @@ readMan opts txt = do where readWithMTokens :: PandocMonad m - => ParserT [ManToken] ManState m a -- ^ parser - -> ManState -- ^ initial state + => ParserT [ManToken] ParserState m a -- ^ parser + -> ParserState -- ^ initial state -> [ManToken] -- ^ input -> m (Either PandocError a) readWithMTokens parser state input = @@ -155,15 +148,16 @@ parseMan = many (parseMacro <|> parseLine <|> parseEmptyLine) compileMan :: PandocMonad m => ManCompiler m Pandoc compileMan = do - let compilers = [compileTitle, compilePara, compileSkippedContent] + let compilers = [compileTitle, compilePara, compileSkippedContent + , compileCodeBlock, compileHeader, compileSkipMacro] blocks <- many $ choice compilers - parserst <- pState <$> getState - return $ Pandoc (stateMeta parserst) blocks + parserst <- getState + return $ Pandoc (stateMeta parserst) (filter (not . isNull) blocks) + + where -modifyRoffState :: PandocMonad m => (RoffState -> RoffState) -> ParsecT a ManState m () -modifyRoffState f = do - mst <- getState - setState mst { rState = f $ rState mst } + isNull Null = True + isNull _ = False parseMacro :: PandocMonad m => ManParser m ManToken parseMacro = do @@ -253,7 +247,7 @@ escapeParser = do , string "[]" >> return Regular , char '[' >> many1 letter >> char ']' >> return Regular ] - modifyRoffState (\r -> r {fontKind = font}) + modifyState (\r -> r {fontKind = font}) return $ EFont font parseLine :: PandocMonad m => ManParser m ManToken @@ -280,7 +274,7 @@ parseLine = do currentFont :: PandocMonad m => ManParser m FontKind currentFont = do - RoffState {fontKind = fk} <- rState <$> getState + RoffState {fontKind = fk} <- getState return fk @@ -328,6 +322,11 @@ mmacro mk = msatisfy isMMacro where | otherwise = False isMMacro _ = False +mmacroAny :: PandocMonad m => ManCompiler m ManToken +mmacroAny = msatisfy isMMacro where + isMMacro (MMacro _ _) = True + isMMacro _ = False + munknownMacro :: PandocMonad m => ManCompiler m ManToken munknownMacro = msatisfy isMUnknownMacro where isMUnknownMacro (MUnknownMacro _ _) = True @@ -352,11 +351,11 @@ compileTitle = do modifyState (changeTitle mantitle) return $ Header 1 nullAttr [Str mantitle] where - changeTitle title mst@ManState{ pState = pst} = + changeTitle title pst = let meta = stateMeta pst metaUp = Meta $ insert "title" (MetaString title) (unMeta meta) in - mst { pState = pst {stateMeta = metaUp} } + pst {stateMeta = metaUp} compileSkippedContent :: PandocMonad m => ManCompiler m Block compileSkippedContent = do @@ -380,7 +379,7 @@ strToInline s ItalicBold = Strong [Emph [Str s]] compilePara :: PandocMonad m => ManCompiler m Block compilePara = do - inls <- many1 (strInl <|> lineInl) + inls <- many1 (strInl <|> lineInl <|> comment) let withspaces = intersperse [Str " "] inls return $ Para (concat withspaces) @@ -395,3 +394,31 @@ compilePara = do lineInl = do (MLine fragments) <- mline return $ fmap (\(s,f) -> strToInline s f) fragments + + comment :: PandocMonad m => ManCompiler m [Inline] + comment = mcomment >> return [] + + +compileCodeBlock :: PandocMonad m => ManCompiler m Block +compileCodeBlock = do + mmacro KCodeBlStart + toks <- many (mstr <|> mline <|> mlink <|> memplyLine <|> munknownMacro <|> mcomment) + mmacro KCodeBlEnd + return $ CodeBlock nullAttr (intercalate "\n" . catMaybes $ extractText <$> toks) + + where + + extractText :: ManToken -> Maybe String + extractText (MStr s _) = Just s + extractText (MLine ss) = Just . intercalate " " $ map fst ss + extractText (MLink s _) = Just s + extractText MEmptyLine = Just "" -- string are intercalated with '\n', this prevents double '\n' + extractText _ = Nothing + +compileHeader :: PandocMonad m => ManCompiler m Block +compileHeader = do + (MHeader lvl s) <- mheader + return $ Header lvl nullAttr [Str s] + +compileSkipMacro :: PandocMonad m => ManCompiler m Block +compileSkipMacro = mmacroAny >> return Null -- cgit v1.2.3 From 9e3eba64fd20c753d039471403c70e69a169ea4d Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Sun, 20 May 2018 13:13:06 +0300 Subject: rename compile to parse, parse to lex --- src/Text/Pandoc/Readers/Man.hs | 114 ++++++++++++++++++++--------------------- 1 file changed, 57 insertions(+), 57 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index b23daa6b3..91e0c6a1c 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -86,8 +86,8 @@ data RoffState = RoffState { fontKind :: FontKind instance Default RoffState where def = RoffState {fontKind = Regular} -type ManParser m = ParserT [Char] RoffState m -type ManCompiler m = ParserT [ManToken] ParserState m +type ManLexer m = ParserT [Char] RoffState m +type ManParser m = ParserT [ManToken] ParserState m ---- -- testStrr :: [Char] -> Either PandocError Pandoc @@ -115,11 +115,11 @@ testFile fname = do -- | Read man (troff) from an input string and return a Pandoc document. readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc readMan opts txt = do - eithertokens <- readWithM parseMan def (T.unpack $ crFilter txt) + eithertokens <- readWithM lexMan def (T.unpack $ crFilter txt) case eithertokens of Right tokenz -> do let state = def {stateOptions = opts} :: ParserState - eitherdoc <- readWithMTokens compileMan state tokenz + eitherdoc <- readWithMTokens parseMan state tokenz case eitherdoc of Right doc -> return doc Left e -> throwError e @@ -143,14 +143,14 @@ readMan opts txt = do -- String -> ManToken function -- -parseMan :: PandocMonad m => ManParser m [ManToken] -parseMan = many (parseMacro <|> parseLine <|> parseEmptyLine) +lexMan :: PandocMonad m => ManLexer m [ManToken] +lexMan = many (lexMacro <|> lexLine <|> lexEmptyLine) -compileMan :: PandocMonad m => ManCompiler m Pandoc -compileMan = do - let compilers = [compileTitle, compilePara, compileSkippedContent - , compileCodeBlock, compileHeader, compileSkipMacro] - blocks <- many $ choice compilers +parseMan :: PandocMonad m => ManParser m Pandoc +parseMan = do + let parsers = [parseTitle, parsePara, parseSkippedContent + , parseCodeBlock, parseHeader, parseSkipMacro] + blocks <- many $ choice parsers parserst <- getState return $ Pandoc (stateMeta parserst) (filter (not . isNull) blocks) @@ -159,12 +159,12 @@ compileMan = do isNull Null = True isNull _ = False -parseMacro :: PandocMonad m => ManParser m ManToken -parseMacro = do +lexMacro :: PandocMonad m => ManLexer m ManToken +lexMacro = do char '.' <|> char '\'' many space macroName <- many1 (letter <|> oneOf ['\\', '"']) - args <- parseArgs + args <- lexArgs let joinedArgs = concat $ intersperse " " args let tok = case macroName of @@ -200,44 +200,44 @@ parseMacro = do let manurl pagename section = "../"++section++"/"++pagename++"."++section return $ Link nullAttr [Str txt] (manurl mpage [mansect], mpage) - parseArgs :: PandocMonad m => ManParser m [String] - parseArgs = do + lexArgs :: PandocMonad m => ManLexer m [String] + lexArgs = do eolOpt <- optionMaybe $ char '\n' if isJust eolOpt then return [] else do many1 space arg <- try quotedArg <|> plainArg - otherargs <- parseArgs + otherargs <- lexArgs return $ arg : otherargs where - plainArg :: PandocMonad m => ManParser m String + plainArg :: PandocMonad m => ManLexer m String plainArg = many1 $ noneOf " \t\n" - quotedArg :: PandocMonad m => ManParser m String + quotedArg :: PandocMonad m => ManLexer m String quotedArg = do char '"' val <- many1 quotedChar char '"' return val - quotedChar :: PandocMonad m => ManParser m Char + quotedChar :: PandocMonad m => ManLexer m Char quotedChar = noneOf "\"\n" <|> try (string "\"\"" >> return '"') -escapeParser :: PandocMonad m => ManParser m EscapeThing -escapeParser = do +escapeLexer :: PandocMonad m => ManLexer m EscapeThing +escapeLexer = do char '\\' choice [escChar, escFont] where - escChar :: PandocMonad m => ManParser m EscapeThing + escChar :: PandocMonad m => ManLexer m EscapeThing escChar = choice [ char '-' >> return (EChar '-') , oneOf ['%', '{', '}'] >> return ENothing ] - escFont :: PandocMonad m => ManParser m EscapeThing + escFont :: PandocMonad m => ManLexer m EscapeThing escFont = do char 'f' font <- choice [ char 'B' >> return Bold @@ -250,36 +250,36 @@ escapeParser = do modifyState (\r -> r {fontKind = font}) return $ EFont font -parseLine :: PandocMonad m => ManParser m ManToken -parseLine = do +lexLine :: PandocMonad m => ManLexer m ManToken +lexLine = do lnparts <- many1 (esc <|> linePart) newline return $ MLine $ catMaybes lnparts where - esc :: PandocMonad m => ManParser m (Maybe (String, FontKind)) + esc :: PandocMonad m => ManLexer m (Maybe (String, FontKind)) esc = do - someesc <- escapeParser + someesc <- escapeLexer font <- currentFont let rv = case someesc of EChar c -> Just ([c], font) _ -> Nothing return rv - linePart :: PandocMonad m => ManParser m (Maybe (String, FontKind)) + linePart :: PandocMonad m => ManLexer m (Maybe (String, FontKind)) linePart = do lnpart <- many1 $ noneOf "\n\\" font <- currentFont return $ Just (lnpart, font) - currentFont :: PandocMonad m => ManParser m FontKind + currentFont :: PandocMonad m => ManLexer m FontKind currentFont = do RoffState {fontKind = fk} <- getState return fk -parseEmptyLine :: PandocMonad m => ManParser m ManToken -parseEmptyLine = char '\n' >> return MEmptyLine +lexEmptyLine :: PandocMonad m => ManLexer m ManToken +lexEmptyLine = char '\n' >> return MEmptyLine -- -- ManToken parsec functions @@ -291,48 +291,48 @@ msatisfy predic = tokenPrim show nextPos testTok testTok t = if predic t then Just t else Nothing nextPos pos x _xs = updatePosString pos (show x) -mstr :: PandocMonad m => ManCompiler m ManToken +mstr :: PandocMonad m => ManParser m ManToken mstr = msatisfy isMStr where isMStr (MStr _ _) = True isMStr _ = False -mline :: PandocMonad m => ManCompiler m ManToken +mline :: PandocMonad m => ManParser m ManToken mline = msatisfy isMLine where isMLine (MLine _) = True isMLine _ = False -mlink :: PandocMonad m => ManCompiler m ManToken +mlink :: PandocMonad m => ManParser m ManToken mlink = msatisfy isMLink where isMLink (MLink _ _) = True isMLink _ = False -memplyLine :: PandocMonad m => ManCompiler m ManToken +memplyLine :: PandocMonad m => ManParser m ManToken memplyLine = msatisfy isMEmptyLine where isMEmptyLine MEmptyLine = True isMEmptyLine _ = False -mheader :: PandocMonad m => ManCompiler m ManToken +mheader :: PandocMonad m => ManParser m ManToken mheader = msatisfy isMHeader where isMHeader (MHeader _ _) = True isMHeader _ = False -mmacro :: PandocMonad m => MacroKind -> ManCompiler m ManToken +mmacro :: PandocMonad m => MacroKind -> ManParser m ManToken mmacro mk = msatisfy isMMacro where isMMacro (MMacro mk' _) | mk == mk' = True | otherwise = False isMMacro _ = False -mmacroAny :: PandocMonad m => ManCompiler m ManToken +mmacroAny :: PandocMonad m => ManParser m ManToken mmacroAny = msatisfy isMMacro where isMMacro (MMacro _ _) = True isMMacro _ = False -munknownMacro :: PandocMonad m => ManCompiler m ManToken +munknownMacro :: PandocMonad m => ManParser m ManToken munknownMacro = msatisfy isMUnknownMacro where isMUnknownMacro (MUnknownMacro _ _) = True isMUnknownMacro _ = False -mcomment :: PandocMonad m => ManCompiler m ManToken +mcomment :: PandocMonad m => ManParser m ManToken mcomment = msatisfy isMComment where isMComment (MComment _) = True isMComment _ = False @@ -341,8 +341,8 @@ mcomment = msatisfy isMComment where -- ManToken -> Block functions -- -compileTitle :: PandocMonad m => ManCompiler m Block -compileTitle = do +parseTitle :: PandocMonad m => ManParser m Block +parseTitle = do (MMacro _ args) <- mmacro KTitle if null args then return Null @@ -357,15 +357,15 @@ compileTitle = do in pst {stateMeta = metaUp} -compileSkippedContent :: PandocMonad m => ManCompiler m Block -compileSkippedContent = do +parseSkippedContent :: PandocMonad m => ManParser m Block +parseSkippedContent = do tok <- munknownMacro <|> mcomment <|> memplyLine onToken tok return Null where - onToken :: PandocMonad m => ManToken -> ManCompiler m () + onToken :: PandocMonad m => ManToken -> ManParser m () onToken (MUnknownMacro mname _) = do pos <- getPosition logMessage $ SkippedContent ("Unknown macro: " ++ mname) pos @@ -377,30 +377,30 @@ strToInline s Italic = Emph [Str s] strToInline s Bold = Strong [Str s] strToInline s ItalicBold = Strong [Emph [Str s]] -compilePara :: PandocMonad m => ManCompiler m Block -compilePara = do +parsePara :: PandocMonad m => ManParser m Block +parsePara = do inls <- many1 (strInl <|> lineInl <|> comment) let withspaces = intersperse [Str " "] inls return $ Para (concat withspaces) where - strInl :: PandocMonad m => ManCompiler m [Inline] + strInl :: PandocMonad m => ManParser m [Inline] strInl = do (MStr str fk) <- mstr return [strToInline str fk] - lineInl :: PandocMonad m => ManCompiler m [Inline] + lineInl :: PandocMonad m => ManParser m [Inline] lineInl = do (MLine fragments) <- mline return $ fmap (\(s,f) -> strToInline s f) fragments - comment :: PandocMonad m => ManCompiler m [Inline] + comment :: PandocMonad m => ManParser m [Inline] comment = mcomment >> return [] -compileCodeBlock :: PandocMonad m => ManCompiler m Block -compileCodeBlock = do +parseCodeBlock :: PandocMonad m => ManParser m Block +parseCodeBlock = do mmacro KCodeBlStart toks <- many (mstr <|> mline <|> mlink <|> memplyLine <|> munknownMacro <|> mcomment) mmacro KCodeBlEnd @@ -415,10 +415,10 @@ compileCodeBlock = do extractText MEmptyLine = Just "" -- string are intercalated with '\n', this prevents double '\n' extractText _ = Nothing -compileHeader :: PandocMonad m => ManCompiler m Block -compileHeader = do +parseHeader :: PandocMonad m => ManParser m Block +parseHeader = do (MHeader lvl s) <- mheader return $ Header lvl nullAttr [Str s] -compileSkipMacro :: PandocMonad m => ManCompiler m Block -compileSkipMacro = mmacroAny >> return Null +parseSkipMacro :: PandocMonad m => ManParser m Block +parseSkipMacro = mmacroAny >> return Null -- cgit v1.2.3 From a00323cbbe8946208f06e9b752d26cb4cae8b9a9 Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Sun, 20 May 2018 15:37:15 +0300 Subject: links, specialchars --- src/Text/Pandoc/Readers/Man.hs | 104 +++++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 39 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 91e0c6a1c..2f0674ff1 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -67,7 +67,7 @@ data MacroKind = KTitle data ManToken = MStr String FontKind | MLine [(String, FontKind)] - | MLink String Target + | MMaybeLink String | MEmptyLine | MHeader Int String | MMacro MacroKind [String] @@ -166,40 +166,26 @@ lexMacro = do macroName <- many1 (letter <|> oneOf ['\\', '"']) args <- lexArgs let joinedArgs = concat $ intersperse " " args + let knownMacro mkind = MMacro mkind args let tok = case macroName of x | x `elem` ["\\\"", "\\#"] -> MComment joinedArgs - "TH" -> MMacro KTitle args - "TP" -> MMacro KTab [] - "PP" -> MMacro KTabEnd [] - "nf" -> MMacro KCodeBlStart [] - "fi" -> MMacro KCodeBlEnd [] - x | x `elem` ["B", "BR"] -> MStr joinedArgs Bold -- "BR" is often used as a link to another man + "TH" -> knownMacro KTitle + "TP" -> knownMacro KTab + "RE" -> knownMacro KTabEnd + "nf" -> knownMacro KCodeBlStart + "fi" -> knownMacro KCodeBlEnd + "B" -> MStr joinedArgs Bold + "BR" -> MMaybeLink joinedArgs x | x `elem` ["BI", "IB"] -> MStr joinedArgs ItalicBold x | x `elem` ["I", "IR", "RI"] -> MStr joinedArgs Italic "SH" -> MHeader 2 joinedArgs - "sp" -> MEmptyLine + x | x `elem` [ "P", "PP", "LP", "sp"] -> MEmptyLine _ -> MUnknownMacro macroName args return tok where - linkToMan :: String -> Maybe Block - linkToMan txt = case runParser linkParser () "" txt of - Right lnk -> Just $ Plain [lnk] - Left _ -> Nothing - where - linkParser :: Parsec String () Inline - linkParser = do - mpage <- many1 alphaNum - space - char '(' - mansect <- digit - char ')' - -- assuming man pages are generated from Linux-like repository - let manurl pagename section = "../"++section++"/"++pagename++"."++section - return $ Link nullAttr [Str txt] (manurl mpage [mansect], mpage) - lexArgs :: PandocMonad m => ManLexer m [String] lexArgs = do eolOpt <- optionMaybe $ char '\n' @@ -226,6 +212,7 @@ lexMacro = do quotedChar :: PandocMonad m => ManLexer m Char quotedChar = noneOf "\"\n" <|> try (string "\"\"" >> return '"') +-- TODO handle more cases escapeLexer :: PandocMonad m => ManLexer m EscapeThing escapeLexer = do char '\\' @@ -233,23 +220,38 @@ escapeLexer = do where escChar :: PandocMonad m => ManLexer m EscapeThing - escChar = choice [ char '-' >> return (EChar '-') - , oneOf ['%', '{', '}'] >> return ENothing + escChar = + let skipChars = ['%', '{', '}', '&'] + subsChars = [ ("-", '-'), (" ", ' '), ("\\", '\\'), ("[lq]", '“'), ("[rq]", '”') + , ("[em]", '—'), ("[en]", '–') ] + substitute :: PandocMonad m => (String,Char) -> ManLexer m EscapeThing + substitute (from,to) = try $ string from >> return (EChar to) + others = [ oneOf skipChars >> return ENothing + , char '[' >> many alphaNum >> char ']' >> return ENothing ] + in choice $ (substitute <$> subsChars) ++ others escFont :: PandocMonad m => ManLexer m EscapeThing escFont = do char 'f' - font <- choice [ char 'B' >> return Bold - , char 'I' >> return Italic - , (char 'P' <|> anyChar) >> return Regular + font <- choice [ letterFont , char '(' >> anyChar >> anyChar >> return Regular - , string "[]" >> return Regular - , char '[' >> many1 letter >> char ']' >> return Regular + , try (char '[' >> letterFont >>= \f -> char ']' >> return f) + , try $ string "[BI]" >> return ItalicBold + , char '[' >> many letter >> char ']' >> return Regular ] modifyState (\r -> r {fontKind = font}) return $ EFont font + where + + letterFont :: PandocMonad m => ManLexer m FontKind + letterFont = choice [ + char 'B' >> return Bold + , char 'I' >> return Italic + , char 'P' >> return Regular + ] + lexLine :: PandocMonad m => ManLexer m ManToken lexLine = do lnparts <- many1 (esc <|> linePart) @@ -301,10 +303,10 @@ mline = msatisfy isMLine where isMLine (MLine _) = True isMLine _ = False -mlink :: PandocMonad m => ManParser m ManToken -mlink = msatisfy isMLink where - isMLink (MLink _ _) = True - isMLink _ = False +mmaybeLink :: PandocMonad m => ManParser m ManToken +mmaybeLink = msatisfy isMMaybeLink where + isMMaybeLink (MMaybeLink _) = True + isMMaybeLink _ = False memplyLine :: PandocMonad m => ManParser m ManToken memplyLine = msatisfy isMEmptyLine where @@ -379,7 +381,7 @@ strToInline s ItalicBold = Strong [Emph [Str s]] parsePara :: PandocMonad m => ManParser m Block parsePara = do - inls <- many1 (strInl <|> lineInl <|> comment) + inls <- many1 (strInl <|> lineInl <|> linkInl <|> comment) let withspaces = intersperse [Str " "] inls return $ Para (concat withspaces) @@ -395,6 +397,29 @@ parsePara = do (MLine fragments) <- mline return $ fmap (\(s,f) -> strToInline s f) fragments + linkInl :: PandocMonad m => ManParser m [Inline] + linkInl = do + (MMaybeLink txt) <- mmaybeLink + let inls = case runParser linkParser () "" txt of + Right lnk -> lnk + Left _ -> [Strong [Str txt]] + return inls + + where + + -- assuming man pages are generated from Linux-like repository + linkParser :: Parsec String () [Inline] + linkParser = do + mpage <- many1 alphaNum + space + char '(' + mansect <- digit + char ')' + other <- many anyChar + let manurl pagename section = "../"++section++"/"++pagename++"."++section + return $ [ Link nullAttr [Strong [Str mpage]] (manurl mpage [mansect], mpage) + , Strong [Str $ " ("++[mansect] ++ ")", Str other]] + comment :: PandocMonad m => ManParser m [Inline] comment = mcomment >> return [] @@ -402,7 +427,7 @@ parsePara = do parseCodeBlock :: PandocMonad m => ManParser m Block parseCodeBlock = do mmacro KCodeBlStart - toks <- many (mstr <|> mline <|> mlink <|> memplyLine <|> munknownMacro <|> mcomment) + toks <- many (mstr <|> mline <|> mmaybeLink <|> memplyLine <|> munknownMacro <|> mcomment) mmacro KCodeBlEnd return $ CodeBlock nullAttr (intercalate "\n" . catMaybes $ extractText <$> toks) @@ -410,8 +435,8 @@ parseCodeBlock = do extractText :: ManToken -> Maybe String extractText (MStr s _) = Just s - extractText (MLine ss) = Just . intercalate " " $ map fst ss - extractText (MLink s _) = Just s + extractText (MLine ss) = Just . concat $ map fst ss -- TODO maybe unwords? + extractText (MMaybeLink s) = Just s extractText MEmptyLine = Just "" -- string are intercalated with '\n', this prevents double '\n' extractText _ = Nothing @@ -420,5 +445,6 @@ parseHeader = do (MHeader lvl s) <- mheader return $ Header lvl nullAttr [Str s] +-- In case of weird man file it will be parsed succesfully parseSkipMacro :: PandocMonad m => ManParser m Block parseSkipMacro = mmacroAny >> return Null -- cgit v1.2.3 From 7f7e1c21e2c1f1f86e6ef088bdc70461a693c20e Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Sun, 20 May 2018 16:47:58 +0300 Subject: lists --- src/Text/Pandoc/Readers/Man.hs | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 2f0674ff1..d70eedb86 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -133,7 +133,7 @@ readMan opts txt = do -> [ManToken] -- ^ input -> m (Either PandocError a) readWithMTokens parser state input = - mapLeft (PandocParsecError . concat $ show <$> input) `liftM` runParserT parser state "source" input + mapLeft (PandocParsecError . (intercalate "\n") $ show <$> input) `liftM` runParserT parser state "source" input mapLeft :: (a -> c) -> Either a b -> Either c b mapLeft f (Left x) = Left $ f x @@ -148,8 +148,8 @@ lexMan = many (lexMacro <|> lexLine <|> lexEmptyLine) parseMan :: PandocMonad m => ManParser m Pandoc parseMan = do - let parsers = [parseTitle, parsePara, parseSkippedContent - , parseCodeBlock, parseHeader, parseSkipMacro] + let parsers = [ parseBulletList, parseTitle, parsePara, parseSkippedContent + , parseCodeBlock, parseHeader, parseSkipMacro] blocks <- many $ choice parsers parserst <- getState return $ Pandoc (stateMeta parserst) (filter (not . isNull) blocks) @@ -159,6 +159,7 @@ parseMan = do isNull Null = True isNull _ = False +-- TODO escape characters in arguments lexMacro :: PandocMonad m => ManLexer m ManToken lexMacro = do char '.' <|> char '\'' @@ -180,6 +181,7 @@ lexMacro = do x | x `elem` ["BI", "IB"] -> MStr joinedArgs ItalicBold x | x `elem` ["I", "IR", "RI"] -> MStr joinedArgs Italic "SH" -> MHeader 2 joinedArgs + "SS" -> MHeader 3 joinedArgs x | x `elem` [ "P", "PP", "LP", "sp"] -> MEmptyLine _ -> MUnknownMacro macroName args return tok @@ -410,7 +412,7 @@ parsePara = do -- assuming man pages are generated from Linux-like repository linkParser :: Parsec String () [Inline] linkParser = do - mpage <- many1 alphaNum + mpage <- many1 (alphaNum <|> char '_') space char '(' mansect <- digit @@ -445,6 +447,20 @@ parseHeader = do (MHeader lvl s) <- mheader return $ Header lvl nullAttr [Str s] +parseBulletList :: PandocMonad m => ManParser m Block +parseBulletList = do + bls <- many1 block + return $ BulletList $ map (:[]) bls + + where + + block :: PandocMonad m => ManParser m Block + block = do + mmacro KTab + pars <- parsePara + many $ mmacro KTabEnd + return pars + -- In case of weird man file it will be parsed succesfully parseSkipMacro :: PandocMonad m => ManParser m Block parseSkipMacro = mmacroAny >> return Null -- cgit v1.2.3 From 1ce067fc2a4f70963c2c879eac74807bb4fa9e7c Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Wed, 23 May 2018 00:20:30 +0300 Subject: tests, parsing fixes --- src/Text/Pandoc/Readers/Man.hs | 35 +++++++++++++++++++---------------- test/Tests/Readers/Man.hs | 18 +++++++++++++++--- 2 files changed, 34 insertions(+), 19 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index d70eedb86..7b752373f 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -37,7 +37,7 @@ import Control.Monad (liftM) import Control.Monad.Except (throwError) import Data.Default (Default) import Data.Map (insert) -import Data.Maybe (isJust, catMaybes) +import Data.Maybe (catMaybes) import Data.List (intersperse, intercalate) import qualified Data.Text as T @@ -159,6 +159,9 @@ parseMan = do isNull Null = True isNull _ = False +eofline :: PandocMonad m => ManLexer m () +eofline = (newline >> return ()) <|> eof + -- TODO escape characters in arguments lexMacro :: PandocMonad m => ManLexer m ManToken lexMacro = do @@ -166,7 +169,7 @@ lexMacro = do many space macroName <- many1 (letter <|> oneOf ['\\', '"']) args <- lexArgs - let joinedArgs = concat $ intersperse " " args + let joinedArgs = unwords args let knownMacro mkind = MMacro mkind args let tok = case macroName of @@ -190,17 +193,17 @@ lexMacro = do lexArgs :: PandocMonad m => ManLexer m [String] lexArgs = do - eolOpt <- optionMaybe $ char '\n' - if isJust eolOpt - then return [] - else do - many1 space - arg <- try quotedArg <|> plainArg - otherargs <- lexArgs - return $ arg : otherargs + args <- many oneArg + eofline + return args where + oneArg :: PandocMonad m => ManLexer m String + oneArg = do + many1 $ char ' ' + try quotedArg <|> plainArg + plainArg :: PandocMonad m => ManLexer m String plainArg = many1 $ noneOf " \t\n" @@ -257,7 +260,7 @@ escapeLexer = do lexLine :: PandocMonad m => ManLexer m ManToken lexLine = do lnparts <- many1 (esc <|> linePart) - newline + eofline return $ MLine $ catMaybes lnparts where @@ -384,7 +387,7 @@ strToInline s ItalicBold = Strong [Emph [Str s]] parsePara :: PandocMonad m => ManParser m Block parsePara = do inls <- many1 (strInl <|> lineInl <|> linkInl <|> comment) - let withspaces = intersperse [Str " "] inls + let withspaces = intersperse [Space] inls return $ Para (concat withspaces) where @@ -420,7 +423,9 @@ parsePara = do other <- many anyChar let manurl pagename section = "../"++section++"/"++pagename++"."++section return $ [ Link nullAttr [Strong [Str mpage]] (manurl mpage [mansect], mpage) - , Strong [Str $ " ("++[mansect] ++ ")", Str other]] + , Strong [Str $ " ("++[mansect] ++ ")" + , Str other] + ] comment :: PandocMonad m => ManParser m [Inline] comment = mcomment >> return [] @@ -448,9 +453,7 @@ parseHeader = do return $ Header lvl nullAttr [Str s] parseBulletList :: PandocMonad m => ManParser m Block -parseBulletList = do - bls <- many1 block - return $ BulletList $ map (:[]) bls +parseBulletList = BulletList . map (: []) <$> many1 block where diff --git a/test/Tests/Readers/Man.hs b/test/Tests/Readers/Man.hs index a9fe324d1..4807095a5 100644 --- a/test/Tests/Readers/Man.hs +++ b/test/Tests/Readers/Man.hs @@ -23,10 +23,22 @@ tests = [ -- .SH "HEllo bbb" "aaa"" as" testGroup "Macros" [ "Bold" =: - ".B foo\n" + ".B foo" =?> (para $ strong "foo") , "Italic" =: - ".I foo\n" - =?> (para $ emph "foo") + ".I bar\n" + =?> (para $ emph "bar") + , "BoldItalic" =: + ".BI foo bar" + =?> (para $ strong $ emph $ str "foo bar") + , "H1" =: + ".SH The header\n" + =?> header 2 (str "The header") + , "H2" =: + ".SS The header 2" + =?> header 3 (str "The header 2") + , "Macro args" =: + ".B \"single arg with \"\"Q\"\"\"" + =?> (para $ strong $ str "single arg with \"Q\"") ] ] -- cgit v1.2.3 From 1d7c71189a5760d6674b02c13d0db6dae8eff10d Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Sat, 26 May 2018 20:12:41 +0300 Subject: escape chars in macros --- src/Text/Pandoc/Readers/Man.hs | 115 +++++++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 49 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 7b752373f..8977c9df4 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE FlexibleContexts #-} {- Copyright (C) 2018 Yan Pashkovsky @@ -65,6 +66,9 @@ data MacroKind = KTitle | KTabEnd deriving (Show, Eq) +-- TODO header strings +-- TODO remove MStr +-- TODO filter skipped content data ManToken = MStr String FontKind | MLine [(String, FontKind)] | MMaybeLink String @@ -159,14 +163,59 @@ parseMan = do isNull Null = True isNull _ = False -eofline :: PandocMonad m => ManLexer m () +eofline :: Stream s m Char => ParsecT s u m () eofline = (newline >> return ()) <|> eof --- TODO escape characters in arguments +spacetab :: Stream s m Char => ParsecT s u m Char +spacetab = char ' ' <|> char '\t' + +-- TODO handle more cases +escapeLexer :: PandocMonad m => ManLexer m EscapeThing +escapeLexer = do + char '\\' + choice [escChar, escFont] + where + + escChar :: PandocMonad m => ManLexer m EscapeThing + escChar = + let skipSeqs = ["%", "{", "}", "&"] + subsSeqs = [ ("-", '-'), (" ", ' '), ("\\", '\\'), ("[lq]", '“'), ("[rq]", '”') + , ("[em]", '—'), ("[en]", '–') ] + substitute :: PandocMonad m => (String,Char) -> ManLexer m EscapeThing + substitute (from,to) = try $ string from >> return (EChar to) + skip :: PandocMonad m => String -> ManLexer m EscapeThing + skip seq' = try $ string seq' >> return ENothing + in choice $ (substitute <$> subsSeqs) ++ + (skip <$> skipSeqs) ++ + [ char '(' >> anyChar >> return ENothing + , char '[' >> many alphaNum >> char ']' >> return ENothing + ] + + escFont :: PandocMonad m => ManLexer m EscapeThing + escFont = do + char 'f' + font <- choice [ letterFont + , char '(' >> anyChar >> anyChar >> return Regular + , try (char '[' >> letterFont >>= \f -> char ']' >> return f) + , try $ string "[BI]" >> return ItalicBold + , char '[' >> many letter >> char ']' >> return Regular + ] + modifyState (\r -> r {fontKind = font}) + return $ EFont font + + where + + letterFont :: PandocMonad m => ManLexer m FontKind + letterFont = choice [ + char 'B' >> return Bold + , char 'I' >> return Italic + , char 'P' >> return Regular + ] + lexMacro :: PandocMonad m => ManLexer m ManToken lexMacro = do char '.' <|> char '\'' - many space + many spacetab macroName <- many1 (letter <|> oneOf ['\\', '"']) args <- lexArgs let joinedArgs = unwords args @@ -175,6 +224,7 @@ lexMacro = do let tok = case macroName of x | x `elem` ["\\\"", "\\#"] -> MComment joinedArgs "TH" -> knownMacro KTitle + "IP" -> knownMacro KTab "TP" -> knownMacro KTab "RE" -> knownMacro KTabEnd "nf" -> knownMacro KCodeBlStart @@ -201,61 +251,28 @@ lexMacro = do oneArg :: PandocMonad m => ManLexer m String oneArg = do - many1 $ char ' ' - try quotedArg <|> plainArg + many1 spacetab + quotedArg <|> plainArg plainArg :: PandocMonad m => ManLexer m String - plainArg = many1 $ noneOf " \t\n" + plainArg = fmap catMaybes . many1 $ escChar <|> (Just <$> noneOf " \t\n") quotedArg :: PandocMonad m => ManLexer m String quotedArg = do char '"' - val <- many1 quotedChar + val <- catMaybes <$> many quotedChar char '"' return val - quotedChar :: PandocMonad m => ManLexer m Char - quotedChar = noneOf "\"\n" <|> try (string "\"\"" >> return '"') - --- TODO handle more cases -escapeLexer :: PandocMonad m => ManLexer m EscapeThing -escapeLexer = do - char '\\' - choice [escChar, escFont] - where - - escChar :: PandocMonad m => ManLexer m EscapeThing - escChar = - let skipChars = ['%', '{', '}', '&'] - subsChars = [ ("-", '-'), (" ", ' '), ("\\", '\\'), ("[lq]", '“'), ("[rq]", '”') - , ("[em]", '—'), ("[en]", '–') ] - substitute :: PandocMonad m => (String,Char) -> ManLexer m EscapeThing - substitute (from,to) = try $ string from >> return (EChar to) - others = [ oneOf skipChars >> return ENothing - , char '[' >> many alphaNum >> char ']' >> return ENothing - ] - in choice $ (substitute <$> subsChars) ++ others - - escFont :: PandocMonad m => ManLexer m EscapeThing - escFont = do - char 'f' - font <- choice [ letterFont - , char '(' >> anyChar >> anyChar >> return Regular - , try (char '[' >> letterFont >>= \f -> char ']' >> return f) - , try $ string "[BI]" >> return ItalicBold - , char '[' >> many letter >> char ']' >> return Regular - ] - modifyState (\r -> r {fontKind = font}) - return $ EFont font - - where + quotedChar :: PandocMonad m => ManLexer m (Maybe Char) + quotedChar = escChar <|> (Just <$> noneOf "\"\n") <|> (Just <$> try (string "\"\"" >> return '"')) - letterFont :: PandocMonad m => ManLexer m FontKind - letterFont = choice [ - char 'B' >> return Bold - , char 'I' >> return Italic - , char 'P' >> return Regular - ] + escChar :: PandocMonad m => ManLexer m (Maybe Char) + escChar = do + ec <- escapeLexer + case ec of + (EChar c) -> return $ Just c + _ -> return Nothing lexLine :: PandocMonad m => ManLexer m ManToken lexLine = do @@ -416,7 +433,7 @@ parsePara = do linkParser :: Parsec String () [Inline] linkParser = do mpage <- many1 (alphaNum <|> char '_') - space + spacetab char '(' mansect <- digit char ')' -- cgit v1.2.3 From 9030c5ae46368e56ecaf3c579c3b04ca2d1edaff Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Sat, 26 May 2018 23:29:36 +0300 Subject: nested lists --- src/Text/Pandoc/Readers/Man.hs | 136 +++++++++++++++++++++++++---------------- 1 file changed, 84 insertions(+), 52 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 8977c9df4..9797d2811 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -36,6 +36,7 @@ module Text.Pandoc.Readers.Man (readMan, testFile) where import Prelude import Control.Monad (liftM) import Control.Monad.Except (throwError) +import Data.Char (isDigit, isUpper, isLower) import Data.Default (Default) import Data.Map (insert) import Data.Maybe (catMaybes) @@ -64,18 +65,18 @@ data MacroKind = KTitle | KCodeBlEnd | KTab | KTabEnd + | KSubTab deriving (Show, Eq) --- TODO header strings --- TODO remove MStr --- TODO filter skipped content -data ManToken = MStr String FontKind - | MLine [(String, FontKind)] +type RoffStr = (String, FontKind) + +data ManToken = MStr RoffStr + | MLine [RoffStr] | MMaybeLink String | MEmptyLine - | MHeader Int String - | MMacro MacroKind [String] - | MUnknownMacro String [String] + | MHeader Int [RoffStr] + | MMacro MacroKind [RoffStr] + | MUnknownMacro String [RoffStr] | MComment String deriving Show @@ -152,7 +153,7 @@ lexMan = many (lexMacro <|> lexLine <|> lexEmptyLine) parseMan :: PandocMonad m => ManParser m Pandoc parseMan = do - let parsers = [ parseBulletList, parseTitle, parsePara, parseSkippedContent + let parsers = [ try parseBulletList, parseTitle, parsePara, parseSkippedContent , parseCodeBlock, parseHeader, parseSkipMacro] blocks <- many $ choice parsers parserst <- getState @@ -212,13 +213,18 @@ escapeLexer = do , char 'P' >> return Regular ] +currentFont :: PandocMonad m => ManLexer m FontKind +currentFont = do + RoffState {fontKind = fk} <- getState + return fk + lexMacro :: PandocMonad m => ManLexer m ManToken lexMacro = do char '.' <|> char '\'' many spacetab macroName <- many1 (letter <|> oneOf ['\\', '"']) args <- lexArgs - let joinedArgs = unwords args + let joinedArgs = unwords $ fst <$> args let knownMacro mkind = MMacro mkind args let tok = case macroName of @@ -227,21 +233,22 @@ lexMacro = do "IP" -> knownMacro KTab "TP" -> knownMacro KTab "RE" -> knownMacro KTabEnd + "RS" -> knownMacro KSubTab "nf" -> knownMacro KCodeBlStart "fi" -> knownMacro KCodeBlEnd - "B" -> MStr joinedArgs Bold + "B" -> MStr (joinedArgs,Bold) "BR" -> MMaybeLink joinedArgs - x | x `elem` ["BI", "IB"] -> MStr joinedArgs ItalicBold - x | x `elem` ["I", "IR", "RI"] -> MStr joinedArgs Italic - "SH" -> MHeader 2 joinedArgs - "SS" -> MHeader 3 joinedArgs + x | x `elem` ["BI", "IB"] -> MStr (joinedArgs, ItalicBold) + x | x `elem` ["I", "IR", "RI"] -> MStr (joinedArgs, Italic) + "SH" -> MHeader 2 args + "SS" -> MHeader 3 args x | x `elem` [ "P", "PP", "LP", "sp"] -> MEmptyLine _ -> MUnknownMacro macroName args return tok where - lexArgs :: PandocMonad m => ManLexer m [String] + lexArgs :: PandocMonad m => ManLexer m [RoffStr] lexArgs = do args <- many oneArg eofline @@ -249,20 +256,24 @@ lexMacro = do where - oneArg :: PandocMonad m => ManLexer m String + oneArg :: PandocMonad m => ManLexer m RoffStr oneArg = do many1 spacetab quotedArg <|> plainArg - plainArg :: PandocMonad m => ManLexer m String - plainArg = fmap catMaybes . many1 $ escChar <|> (Just <$> noneOf " \t\n") + plainArg :: PandocMonad m => ManLexer m RoffStr + plainArg = do + arg <- many1 $ escChar <|> (Just <$> noneOf " \t\n") + f <- currentFont + return (catMaybes arg, f) - quotedArg :: PandocMonad m => ManLexer m String + quotedArg :: PandocMonad m => ManLexer m RoffStr quotedArg = do char '"' val <- catMaybes <$> many quotedChar char '"' - return val + f <- currentFont + return (val, f) quotedChar :: PandocMonad m => ManLexer m (Maybe Char) quotedChar = escChar <|> (Just <$> noneOf "\"\n") <|> (Just <$> try (string "\"\"" >> return '"')) @@ -295,11 +306,6 @@ lexLine = do lnpart <- many1 $ noneOf "\n\\" font <- currentFont return $ Just (lnpart, font) - - currentFont :: PandocMonad m => ManLexer m FontKind - currentFont = do - RoffState {fontKind = fk} <- getState - return fk lexEmptyLine :: PandocMonad m => ManLexer m ManToken @@ -313,11 +319,11 @@ msatisfy :: (Show t, Stream s m t) => (t -> Bool) -> ParserT s st m t msatisfy predic = tokenPrim show nextPos testTok where testTok t = if predic t then Just t else Nothing - nextPos pos x _xs = updatePosString pos (show x) + nextPos pos x _xs = updatePosString (setSourceLine pos $ sourceLine pos + (if predic x then 1 else 0)) (show x) mstr :: PandocMonad m => ManParser m ManToken mstr = msatisfy isMStr where - isMStr (MStr _ _) = True + isMStr (MStr _) = True isMStr _ = False mline :: PandocMonad m => ManParser m ManToken @@ -371,7 +377,7 @@ parseTitle = do if null args then return Null else do - let mantitle = head args + let mantitle = fst $ head args modifyState (changeTitle mantitle) return $ Header 1 nullAttr [Str mantitle] where @@ -395,29 +401,32 @@ parseSkippedContent = do logMessage $ SkippedContent ("Unknown macro: " ++ mname) pos onToken _ = return () -strToInline :: String -> FontKind -> Inline -strToInline s Regular = Str s -strToInline s Italic = Emph [Str s] -strToInline s Bold = Strong [Str s] -strToInline s ItalicBold = Strong [Emph [Str s]] +strToInline :: RoffStr -> Inline +strToInline (s, Regular) = Str s +strToInline (s, Italic) = Emph [Str s] +strToInline (s, Bold) = Strong [Str s] +strToInline (s, ItalicBold) = Strong [Emph [Str s]] parsePara :: PandocMonad m => ManParser m Block -parsePara = do +parsePara = Para <$> parseInlines + +parseInlines :: PandocMonad m => ManParser m [Inline] +parseInlines = do inls <- many1 (strInl <|> lineInl <|> linkInl <|> comment) let withspaces = intersperse [Space] inls - return $ Para (concat withspaces) + return $ concat withspaces where strInl :: PandocMonad m => ManParser m [Inline] strInl = do - (MStr str fk) <- mstr - return [strToInline str fk] + (MStr rstr) <- mstr + return [strToInline rstr] lineInl :: PandocMonad m => ManParser m [Inline] lineInl = do (MLine fragments) <- mline - return $ fmap (\(s,f) -> strToInline s f) fragments + return $ strToInline <$> fragments linkInl :: PandocMonad m => ManParser m [Inline] linkInl = do @@ -458,7 +467,7 @@ parseCodeBlock = do where extractText :: ManToken -> Maybe String - extractText (MStr s _) = Just s + extractText (MStr (s, _)) = Just s extractText (MLine ss) = Just . concat $ map fst ss -- TODO maybe unwords? extractText (MMaybeLink s) = Just s extractText MEmptyLine = Just "" -- string are intercalated with '\n', this prevents double '\n' @@ -466,20 +475,43 @@ parseCodeBlock = do parseHeader :: PandocMonad m => ManParser m Block parseHeader = do - (MHeader lvl s) <- mheader - return $ Header lvl nullAttr [Str s] + (MHeader lvl ss) <- mheader + return $ Header lvl nullAttr $ intersperse Space $ strToInline <$> ss parseBulletList :: PandocMonad m => ManParser m Block -parseBulletList = BulletList . map (: []) <$> many1 block - - where - - block :: PandocMonad m => ManParser m Block - block = do - mmacro KTab - pars <- parsePara - many $ mmacro KTabEnd - return pars +parseBulletList = BulletList <$> many1 paras where + + macroIPInl :: [RoffStr] -> [Inline] + macroIPInl (x:_:[]) = [strToInline x, Space] + macroIPInl _ = [] + + listKind :: [RoffStr] -> Maybe ([[Block]] -> Block) + listKind (((c:_), _):_:[]) = + let params style = OrderedList (1, style, DefaultDelim) + in Just $ case c of + _ | isDigit c -> params Decimal + _ | isUpper c -> params UpperAlpha + _ | isLower c -> params LowerAlpha + _ -> BulletList + + listKind _ = Nothing + + paras :: PandocMonad m => ManParser m [Block] + paras = do + (MMacro _ args) <- mmacro KTab + let lk = listKind args + inls <- parseInlines + let macroinl = macroIPInl args + let para = Plain $ macroinl ++ inls + subls <- many sublist + return $ para : subls + + sublist :: PandocMonad m => ManParser m Block + sublist = do + mmacro KSubTab + bl <- parseBulletList + mmacro KTabEnd + return bl -- In case of weird man file it will be parsed succesfully parseSkipMacro :: PandocMonad m => ManParser m Block -- cgit v1.2.3 From c2ae72aa6cee5aebb85228b5cc6fe6a620cf42f7 Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Sun, 27 May 2018 14:09:34 +0300 Subject: custom ordered lists --- src/Text/Pandoc/Readers/Man.hs | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 9797d2811..adac1aca8 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -39,7 +39,7 @@ import Control.Monad.Except (throwError) import Data.Char (isDigit, isUpper, isLower) import Data.Default (Default) import Data.Map (insert) -import Data.Maybe (catMaybes) +import Data.Maybe (catMaybes, fromMaybe, isNothing) import Data.List (intersperse, intercalate) import qualified Data.Text as T @@ -153,7 +153,7 @@ lexMan = many (lexMacro <|> lexLine <|> lexEmptyLine) parseMan :: PandocMonad m => ManParser m Pandoc parseMan = do - let parsers = [ try parseBulletList, parseTitle, parsePara, parseSkippedContent + let parsers = [ try parseList, parseTitle, parsePara, parseSkippedContent , parseCodeBlock, parseHeader, parseSkipMacro] blocks <- many $ choice parsers parserst <- getState @@ -210,7 +210,7 @@ escapeLexer = do letterFont = choice [ char 'B' >> return Bold , char 'I' >> return Italic - , char 'P' >> return Regular + , (char 'P' <|> char 'R') >> return Regular ] currentFont :: PandocMonad m => ManLexer m FontKind @@ -248,6 +248,7 @@ lexMacro = do where + -- TODO rework args lexArgs :: PandocMonad m => ManLexer m [RoffStr] lexArgs = do args <- many oneArg @@ -478,38 +479,47 @@ parseHeader = do (MHeader lvl ss) <- mheader return $ Header lvl nullAttr $ intersperse Space $ strToInline <$> ss -parseBulletList :: PandocMonad m => ManParser m Block -parseBulletList = BulletList <$> many1 paras where +type ListBuilder = [[Block]] -> Block + +parseList :: PandocMonad m => ManParser m Block +parseList = do + xx <- many1 paras + let bls = map snd xx + let bldr = fst $ head xx + return $ bldr bls + + where macroIPInl :: [RoffStr] -> [Inline] macroIPInl (x:_:[]) = [strToInline x, Space] macroIPInl _ = [] - listKind :: [RoffStr] -> Maybe ([[Block]] -> Block) + listKind :: [RoffStr] -> Maybe ListBuilder listKind (((c:_), _):_:[]) = let params style = OrderedList (1, style, DefaultDelim) - in Just $ case c of - _ | isDigit c -> params Decimal - _ | isUpper c -> params UpperAlpha - _ | isLower c -> params LowerAlpha - _ -> BulletList + in case c of + _ | isDigit c -> Just $ params Decimal + _ | isUpper c -> Just $ params UpperAlpha + _ | isLower c -> Just $ params LowerAlpha + _ -> Nothing listKind _ = Nothing - paras :: PandocMonad m => ManParser m [Block] + paras :: PandocMonad m => ManParser m (ListBuilder, [Block]) paras = do (MMacro _ args) <- mmacro KTab - let lk = listKind args + let lbuilderOpt = listKind args + let lbuilder = fromMaybe BulletList lbuilderOpt inls <- parseInlines let macroinl = macroIPInl args - let para = Plain $ macroinl ++ inls + let parainls = if isNothing lbuilderOpt then macroinl ++ inls else inls subls <- many sublist - return $ para : subls + return $ (lbuilder, (Plain parainls) : subls) sublist :: PandocMonad m => ManParser m Block sublist = do mmacro KSubTab - bl <- parseBulletList + bl <- parseList mmacro KTabEnd return bl -- cgit v1.2.3 From 4f3dd3b1af7217214287ab886147c5e33a54774d Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Sun, 7 Oct 2018 17:53:14 +0300 Subject: position calculations --- src/Text/Pandoc/Readers/Man.hs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index adac1aca8..30076102b 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -225,9 +225,9 @@ lexMacro = do macroName <- many1 (letter <|> oneOf ['\\', '"']) args <- lexArgs let joinedArgs = unwords $ fst <$> args - let knownMacro mkind = MMacro mkind args + knownMacro mkind = MMacro mkind args - let tok = case macroName of + tok = case macroName of x | x `elem` ["\\\"", "\\#"] -> MComment joinedArgs "TH" -> knownMacro KTitle "IP" -> knownMacro KTab @@ -320,7 +320,7 @@ msatisfy :: (Show t, Stream s m t) => (t -> Bool) -> ParserT s st m t msatisfy predic = tokenPrim show nextPos testTok where testTok t = if predic t then Just t else Nothing - nextPos pos x _xs = updatePosString (setSourceLine pos $ sourceLine pos + (if predic x then 1 else 0)) (show x) + nextPos pos _x _xs = updatePosString (setSourceColumn (setSourceLine pos $ sourceLine pos + 1) 1) ("") mstr :: PandocMonad m => ManParser m ManToken mstr = msatisfy isMStr where @@ -509,9 +509,9 @@ parseList = do paras = do (MMacro _ args) <- mmacro KTab let lbuilderOpt = listKind args - let lbuilder = fromMaybe BulletList lbuilderOpt + lbuilder = fromMaybe BulletList lbuilderOpt + macroinl = macroIPInl args inls <- parseInlines - let macroinl = macroIPInl args let parainls = if isNothing lbuilderOpt then macroinl ++ inls else inls subls <- many sublist return $ (lbuilder, (Plain parainls) : subls) -- cgit v1.2.3 From c7aa7a83ddbfb220d1613dabb27dc1e72eeb7385 Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Sun, 7 Oct 2018 19:54:16 +0300 Subject: test suite and more secapes --- src/Text/Pandoc/Readers/Man.hs | 5 +++-- test/grofftest.sh | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 test/grofftest.sh (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 30076102b..0f84a01b3 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -179,9 +179,10 @@ escapeLexer = do escChar :: PandocMonad m => ManLexer m EscapeThing escChar = - let skipSeqs = ["%", "{", "}", "&"] + let skipSeqs = ["%", "{", "}", "&", "\n", ":", "\"", "0", "c"] subsSeqs = [ ("-", '-'), (" ", ' '), ("\\", '\\'), ("[lq]", '“'), ("[rq]", '”') - , ("[em]", '—'), ("[en]", '–') ] + , ("[em]", '—'), ("[en]", '–'), ("*(lq", '«'), ("*(rq", '»') + , ("t", '\t'), ("e", '\\') ] substitute :: PandocMonad m => (String,Char) -> ManLexer m EscapeThing substitute (from,to) = try $ string from >> return (EChar to) skip :: PandocMonad m => String -> ManLexer m EscapeThing diff --git a/test/grofftest.sh b/test/grofftest.sh new file mode 100644 index 000000000..2c559d21a --- /dev/null +++ b/test/grofftest.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# iterates over specified directory, containing "\w+\.\d"-like files, +# executes pandoc voer them and prints stderr on nonzero return code + +if [ $# -ne 2 ]; then + echo "Not enough arguments" + exit 1 +fi + +PANDOC=$1 +DIR=$2 + +$PANDOC --version > /dev/null || { echo "pandoc executable error" >&2 ; exit 1 ; } + +ls $2 | egrep "^.+\.[0-9]$" | while read f ; do + FILE="$DIR/$f" + $PANDOC -f man -t native < $FILE > /dev/null + if [ $? -ne 0 ]; then + echo "Failed to convert $FILE" + fi +done -- cgit v1.2.3 From 753a4d376df9faecce20d2b60a1d56b9ee0c9357 Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Sun, 7 Oct 2018 22:10:29 +0300 Subject: Successful parsing of all Linux mans, except zic.8 --- src/Text/Pandoc/Readers/Man.hs | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 0f84a01b3..9802216c6 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -149,12 +149,12 @@ readMan opts txt = do -- lexMan :: PandocMonad m => ManLexer m [ManToken] -lexMan = many (lexMacro <|> lexLine <|> lexEmptyLine) +lexMan = many (lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine) parseMan :: PandocMonad m => ManParser m Pandoc parseMan = do let parsers = [ try parseList, parseTitle, parsePara, parseSkippedContent - , parseCodeBlock, parseHeader, parseSkipMacro] + , try parseCodeBlock, parseHeader, parseSkipMacro] blocks <- many $ choice parsers parserst <- getState return $ Pandoc (stateMeta parserst) (filter (not . isNull) blocks) @@ -174,7 +174,7 @@ spacetab = char ' ' <|> char '\t' escapeLexer :: PandocMonad m => ManLexer m EscapeThing escapeLexer = do char '\\' - choice [escChar, escFont] + choice [escChar, escFont, escUnknown] where escChar :: PandocMonad m => ManLexer m EscapeThing @@ -182,7 +182,7 @@ escapeLexer = do let skipSeqs = ["%", "{", "}", "&", "\n", ":", "\"", "0", "c"] subsSeqs = [ ("-", '-'), (" ", ' '), ("\\", '\\'), ("[lq]", '“'), ("[rq]", '”') , ("[em]", '—'), ("[en]", '–'), ("*(lq", '«'), ("*(rq", '»') - , ("t", '\t'), ("e", '\\') ] + , ("t", '\t'), ("e", '\\'), ("`", '`') ] substitute :: PandocMonad m => (String,Char) -> ManLexer m EscapeThing substitute (from,to) = try $ string from >> return (EChar to) skip :: PandocMonad m => String -> ManLexer m EscapeThing @@ -201,6 +201,7 @@ escapeLexer = do , try (char '[' >> letterFont >>= \f -> char ']' >> return f) , try $ string "[BI]" >> return ItalicBold , char '[' >> many letter >> char ']' >> return Regular + , digit >> return Regular ] modifyState (\r -> r {fontKind = font}) return $ EFont font @@ -214,11 +215,27 @@ escapeLexer = do , (char 'P' <|> char 'R') >> return Regular ] + escUnknown :: PandocMonad m => ManLexer m EscapeThing + escUnknown = do + c <- anyChar + pos <- getPosition + logOutput $ SkippedContent ("Unknown escape seq \\" ++ [c]) pos + return ENothing + currentFont :: PandocMonad m => ManLexer m FontKind currentFont = do RoffState {fontKind = fk} <- getState return fk +-- separate function from lexMacro since real man files sometimes do not follow the rules +lexComment :: PandocMonad m => ManLexer m ManToken +lexComment = do + try $ string ".\\\"" + many space + body <- many $ noneOf "\n" + char '\n' + return $ MComment body + lexMacro :: PandocMonad m => ManLexer m ManToken lexMacro = do char '.' <|> char '\'' @@ -249,7 +266,7 @@ lexMacro = do where - -- TODO rework args + -- TODO better would be [[RoffStr]], since one arg may have different fonts lexArgs :: PandocMonad m => ManLexer m [RoffStr] lexArgs = do args <- many oneArg @@ -261,21 +278,24 @@ lexMacro = do oneArg :: PandocMonad m => ManLexer m RoffStr oneArg = do many1 spacetab - quotedArg <|> plainArg + many $ try $ string "\\\n" + try quotedArg <|> plainArg -- try, because there are some erroneous files, e.g. linux/bpf.2 plainArg :: PandocMonad m => ManLexer m RoffStr plainArg = do + indents <- many spacetab arg <- many1 $ escChar <|> (Just <$> noneOf " \t\n") f <- currentFont - return (catMaybes arg, f) + return (indents ++ catMaybes arg, f) quotedArg :: PandocMonad m => ManLexer m RoffStr quotedArg = do char '"' - val <- catMaybes <$> many quotedChar + val <- many quotedChar char '"' + val2 <- many $ escChar <|> (Just <$> noneOf " \t\n") f <- currentFont - return (val, f) + return (catMaybes $ val ++ val2, f) quotedChar :: PandocMonad m => ManLexer m (Maybe Char) quotedChar = escChar <|> (Just <$> noneOf "\"\n") <|> (Just <$> try (string "\"\"" >> return '"')) -- cgit v1.2.3 From 07b4d7b297dfc83f47aa1d708b7405a5e4b3cc4f Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Sun, 7 Oct 2018 22:41:11 +0300 Subject: posix man files parsed successfully --- src/Text/Pandoc/Readers/Man.hs | 10 ++++++---- test/grofftest.sh | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 9802216c6..aea53a375 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -182,7 +182,8 @@ escapeLexer = do let skipSeqs = ["%", "{", "}", "&", "\n", ":", "\"", "0", "c"] subsSeqs = [ ("-", '-'), (" ", ' '), ("\\", '\\'), ("[lq]", '“'), ("[rq]", '”') , ("[em]", '—'), ("[en]", '–'), ("*(lq", '«'), ("*(rq", '»') - , ("t", '\t'), ("e", '\\'), ("`", '`') ] + , ("t", '\t'), ("e", '\\'), ("`", '`'), ("^", ' '), ("|", ' ') + , ("'", '`') ] substitute :: PandocMonad m => (String,Char) -> ManLexer m EscapeThing substitute (from,to) = try $ string from >> return (EChar to) skip :: PandocMonad m => String -> ManLexer m EscapeThing @@ -219,7 +220,7 @@ escapeLexer = do escUnknown = do c <- anyChar pos <- getPosition - logOutput $ SkippedContent ("Unknown escape seq \\" ++ [c]) pos + logOutput $ SkippedContent ("Unknown escape sequence \\" ++ [c]) pos return ENothing currentFont :: PandocMonad m => ManLexer m FontKind @@ -240,7 +241,7 @@ lexMacro :: PandocMonad m => ManLexer m ManToken lexMacro = do char '.' <|> char '\'' many spacetab - macroName <- many1 (letter <|> oneOf ['\\', '"']) + macroName <- many1 (letter <|> oneOf ['\\', '"', '&']) args <- lexArgs let joinedArgs = unwords $ fst <$> args knownMacro mkind = MMacro mkind args @@ -269,7 +270,8 @@ lexMacro = do -- TODO better would be [[RoffStr]], since one arg may have different fonts lexArgs :: PandocMonad m => ManLexer m [RoffStr] lexArgs = do - args <- many oneArg + args <- many $ try oneArg + many spacetab eofline return args diff --git a/test/grofftest.sh b/test/grofftest.sh index 2c559d21a..ca1aa71d9 100644 --- a/test/grofftest.sh +++ b/test/grofftest.sh @@ -13,9 +13,9 @@ DIR=$2 $PANDOC --version > /dev/null || { echo "pandoc executable error" >&2 ; exit 1 ; } -ls $2 | egrep "^.+\.[0-9]$" | while read f ; do +ls $2 | egrep "^.+\.[0-9].?$" | while read f ; do FILE="$DIR/$f" - $PANDOC -f man -t native < $FILE > /dev/null + $PANDOC -f man -t native < $FILE 2>&1 > /dev/null if [ $? -ne 0 ]; then echo "Failed to convert $FILE" fi -- cgit v1.2.3 From 3fed62611ea394b088c49c1de680e74978bb9f82 Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Sun, 14 Oct 2018 00:57:15 +0300 Subject: tests, commented debug functions --- src/Text/Pandoc/Readers/Man.hs | 23 +++++++------ test/Tests/Readers/Man.hs | 77 +++++++++++++++++++++++++++++++----------- 2 files changed, 70 insertions(+), 30 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index aea53a375..24f8316ef 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -31,7 +31,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of man to 'Pandoc' document. -} -module Text.Pandoc.Readers.Man (readMan, testFile) where +module Text.Pandoc.Readers.Man (readMan) where --testFile import Prelude import Control.Monad (liftM) @@ -43,7 +43,7 @@ import Data.Maybe (catMaybes, fromMaybe, isNothing) import Data.List (intersperse, intercalate) import qualified Data.Text as T -import Text.Pandoc.Class (PandocMonad(..), runIOorExplode) +import Text.Pandoc.Class (PandocMonad(..)) import Text.Pandoc.Definition import Text.Pandoc.Error (PandocError (PandocParsecError)) import Text.Pandoc.Logging (LogMessage(..)) @@ -94,9 +94,9 @@ instance Default RoffState where type ManLexer m = ParserT [Char] RoffState m type ManParser m = ParserT [ManToken] ParserState m ----- --- testStrr :: [Char] -> Either PandocError Pandoc --- testStrr s = runPure $ readMan def (T.pack s) +---- debug functions +{- +import Text.Pandoc.Class (runIOorExplode) printPandoc :: Pandoc -> [Char] printPandoc (Pandoc m content) = @@ -104,16 +104,17 @@ printPandoc (Pandoc m content) = cnt = intercalate "\n" $ map show content in ttl ++ "\n" ++ cnt --- strrepr :: Either PandocError Pandoc -> [Char] --- strrepr obj = case obj of --- Right x -> printPandoc x --- Left y -> show y +testStr :: String -> IO () +testStr str = do + pand <- runIOorExplode $ readMan def (T.pack str) + putStrLn $ printPandoc pand + testFile :: FilePath -> IO () testFile fname = do cont <- readFile fname - pand <- runIOorExplode $ readMan def (T.pack cont) - putStrLn $ printPandoc pand + testStr cont +-} ---- diff --git a/test/Tests/Readers/Man.hs b/test/Tests/Readers/Man.hs index 4807095a5..6226099d2 100644 --- a/test/Tests/Readers/Man.hs +++ b/test/Tests/Readers/Man.hs @@ -22,23 +22,62 @@ tests :: [TestTree] tests = [ -- .SH "HEllo bbb" "aaa"" as" testGroup "Macros" [ - "Bold" =: - ".B foo" - =?> (para $ strong "foo") - , "Italic" =: - ".I bar\n" - =?> (para $ emph "bar") - , "BoldItalic" =: - ".BI foo bar" - =?> (para $ strong $ emph $ str "foo bar") - , "H1" =: - ".SH The header\n" - =?> header 2 (str "The header") - , "H2" =: - ".SS The header 2" - =?> header 3 (str "The header 2") - , "Macro args" =: - ".B \"single arg with \"\"Q\"\"\"" - =?> (para $ strong $ str "single arg with \"Q\"") - ] + "Bold" =: + ".B foo" + =?> (para $ strong "foo") + , "Italic" =: + ".I bar\n" + =?> (para $ emph "bar") + , "BoldItalic" =: + ".BI foo bar" + =?> (para $ strong $ emph $ str "foo bar") + , "H1" =: + ".SH The header\n" + =?> header 2 (str "The" <> space <> str "header") + , "H2" =: + ".SS \"The header 2\"" + =?> header 3 (str "The header 2") + , "Macro args" =: + ".B \"single arg with \"\"Q\"\"\"" + =?> (para $ strong $ str "single arg with \"Q\"") + , "comment" =: + ".\\\"bla\naaa" + =?> (para $ space <> str "aaa") + , "link" =: + ".BR aa (1)" + =?> (para $ fromList [Link nullAttr [Strong [Str "aa"]] ("../1/aa.1","aa"), Strong [Str " (1)",Str ""]]) + ], + testGroup "Escapes" [ + "fonts" =: + "aa\\fIbb\\fRcc" + =?> (para $ str "aa" <> (emph $ str "bb") <> str "cc") + , "skip" =: + "a\\%\\{\\}\\\n\\:b\\0" + =?> (para $ fromList $ map Str ["a", "b"]) + , "replace" =: + "\\-\\ \\\\\\[lq]\\[rq]\\[em]\\[en]\\*(lq\\*(rq" + =?> (para $ fromList $ map Str ["-", " ", "\\", "“", "”", "—", "–", "«", "»"]) + , "replace2" =: + "\\t\\e\\`\\^\\|\\'" + =?> (para $ fromList $ map Str ["\t", "\\", "`", " ", " ", "`"]) + ], + testGroup "Lists" [ + "bullet" =: + ".IP\nfirst\n.IP\nsecond" + =?> bulletList [plain $ str "first", plain $ str "second"] + , "odrered" =: + ".IP 1 a\nfirst\n.IP 2 a\nsecond" + =?> orderedListWith (1,Decimal,DefaultDelim) [plain $ str "first", plain $ str "second"] + , "upper" =: + ".IP A a\nfirst\n.IP B a\nsecond" + =?> orderedListWith (1,UpperAlpha,DefaultDelim) [plain $ str "first", plain $ str "second"] + , "nested" =: + ".IP\nfirst\n.RS\n.IP\n1a\n.IP\n1b\n.RE" + =?> fromList [BulletList [[Plain [Str "first"],BulletList [[Plain [Str "1a"]],[Plain [Str "1b"]]]]]] + ], + testGroup "CodeBlocks" [ + "cb1"=: + ".nf\naa\n\tbb\n.fi" + =?> codeBlock "aa\n\tbb" + ] ] -- cgit v1.2.3 From 2ca50e95b75312b16ca831287e654fb40732afcc Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Mon, 15 Oct 2018 23:35:27 +0300 Subject: style issues --- src/Text/Pandoc/Readers/Man.hs | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 24f8316ef..ea5657b56 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -31,10 +31,10 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of man to 'Pandoc' document. -} -module Text.Pandoc.Readers.Man (readMan) where --testFile +module Text.Pandoc.Readers.Man (readMan) where import Prelude -import Control.Monad (liftM) +import Control.Monad (liftM, void) import Control.Monad.Except (throwError) import Data.Char (isDigit, isUpper, isLower) import Data.Default (Default) @@ -123,13 +123,11 @@ readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc readMan opts txt = do eithertokens <- readWithM lexMan def (T.unpack $ crFilter txt) case eithertokens of + Left e -> throwError e Right tokenz -> do let state = def {stateOptions = opts} :: ParserState eitherdoc <- readWithMTokens parseMan state tokenz - case eitherdoc of - Right doc -> return doc - Left e -> throwError e - Left e -> throwError e + either throwError return eitherdoc where @@ -139,7 +137,8 @@ readMan opts txt = do -> [ManToken] -- ^ input -> m (Either PandocError a) readWithMTokens parser state input = - mapLeft (PandocParsecError . (intercalate "\n") $ show <$> input) `liftM` runParserT parser state "source" input + let leftF = PandocParsecError . (intercalate "\n") $ show <$> input + in mapLeft leftF `liftM` runParserT parser state "source" input mapLeft :: (a -> c) -> Either a b -> Either c b mapLeft f (Left x) = Left $ f x @@ -166,12 +165,12 @@ parseMan = do isNull _ = False eofline :: Stream s m Char => ParsecT s u m () -eofline = (newline >> return ()) <|> eof +eofline = void newline <|> eof spacetab :: Stream s m Char => ParsecT s u m Char spacetab = char ' ' <|> char '\t' --- TODO handle more cases +-- TODO add other sequences from man (7) groff escapeLexer :: PandocMonad m => ManLexer m EscapeThing escapeLexer = do char '\\' @@ -225,9 +224,7 @@ escapeLexer = do return ENothing currentFont :: PandocMonad m => ManLexer m FontKind -currentFont = do - RoffState {fontKind = fk} <- getState - return fk +currentFont = fontKind <$> getState -- separate function from lexMacro since real man files sometimes do not follow the rules lexComment :: PandocMonad m => ManLexer m ManToken -- cgit v1.2.3 From 1684e918b286fad89425d2d5f45b7d51b5f4ddf8 Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Tue, 16 Oct 2018 01:53:04 +0300 Subject: font as a set of styles, mono font support --- src/Text/Pandoc/Readers/Man.hs | 63 ++++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 24 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index ea5657b56..280acb9c4 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -38,7 +38,9 @@ import Control.Monad (liftM, void) import Control.Monad.Except (throwError) import Data.Char (isDigit, isUpper, isLower) import Data.Default (Default) +import Data.Functor (($>)) import Data.Map (insert) +import Data.Set (Set, singleton, fromList, toList) import Data.Maybe (catMaybes, fromMaybe, isNothing) import Data.List (intersperse, intercalate) import qualified Data.Text as T @@ -57,8 +59,7 @@ import Text.Parsec.Pos (updatePosString) -- -- Data Types -- - -data FontKind = Regular | Italic | Bold | ItalicBold deriving Show +data FontKind = Bold | Italic | Monospace | Regular deriving (Show, Eq, Ord) data MacroKind = KTitle | KCodeBlStart @@ -68,7 +69,9 @@ data MacroKind = KTitle | KSubTab deriving (Show, Eq) -type RoffStr = (String, FontKind) +type Font = Set FontKind + +type RoffStr = (String, Font) data ManToken = MStr RoffStr | MLine [RoffStr] @@ -80,16 +83,16 @@ data ManToken = MStr RoffStr | MComment String deriving Show -data EscapeThing = EFont FontKind +data EscapeThing = EFont Font | EChar Char | ENothing deriving Show -data RoffState = RoffState { fontKind :: FontKind +data RoffState = RoffState { fontKind :: Font } deriving Show instance Default RoffState where - def = RoffState {fontKind = Regular} + def = RoffState {fontKind = singleton Regular} type ManLexer m = ParserT [Char] RoffState m type ManParser m = ParserT [ManToken] ParserState m @@ -197,22 +200,29 @@ escapeLexer = do escFont :: PandocMonad m => ManLexer m EscapeThing escFont = do char 'f' - font <- choice [ letterFont - , char '(' >> anyChar >> anyChar >> return Regular - , try (char '[' >> letterFont >>= \f -> char ']' >> return f) - , try $ string "[BI]" >> return ItalicBold - , char '[' >> many letter >> char ']' >> return Regular - , digit >> return Regular + font <- choice [ singleton <$> letterFontKind + , char '(' >> anyChar >> anyChar >> return (singleton Regular) + , try lettersFont + , digit >> return (singleton Regular) ] modifyState (\r -> r {fontKind = font}) return $ EFont font where - letterFont :: PandocMonad m => ManLexer m FontKind - letterFont = choice [ + lettersFont :: PandocMonad m => ManLexer m Font + lettersFont = do + char '[' + fs <- many letterFontKind + many letter + char ']' + return $ fromList fs + + letterFontKind :: PandocMonad m => ManLexer m FontKind + letterFontKind = choice [ char 'B' >> return Bold , char 'I' >> return Italic + , char 'C' >> return Monospace , (char 'P' <|> char 'R') >> return Regular ] @@ -223,7 +233,7 @@ escapeLexer = do logOutput $ SkippedContent ("Unknown escape sequence \\" ++ [c]) pos return ENothing -currentFont :: PandocMonad m => ManLexer m FontKind +currentFont :: PandocMonad m => ManLexer m Font currentFont = fontKind <$> getState -- separate function from lexMacro since real man files sometimes do not follow the rules @@ -253,10 +263,10 @@ lexMacro = do "RS" -> knownMacro KSubTab "nf" -> knownMacro KCodeBlStart "fi" -> knownMacro KCodeBlEnd - "B" -> MStr (joinedArgs,Bold) + "B" -> MStr (joinedArgs, singleton Bold) "BR" -> MMaybeLink joinedArgs - x | x `elem` ["BI", "IB"] -> MStr (joinedArgs, ItalicBold) - x | x `elem` ["I", "IR", "RI"] -> MStr (joinedArgs, Italic) + x | x `elem` ["BI", "IB"] -> MStr (joinedArgs, fromList [Italic, Bold]) + x | x `elem` ["I", "IR", "RI"] -> MStr (joinedArgs, singleton Italic) "SH" -> MHeader 2 args "SS" -> MHeader 3 args x | x `elem` [ "P", "PP", "LP", "sp"] -> MEmptyLine @@ -314,7 +324,7 @@ lexLine = do return $ MLine $ catMaybes lnparts where - esc :: PandocMonad m => ManLexer m (Maybe (String, FontKind)) + esc :: PandocMonad m => ManLexer m (Maybe (String, Font)) esc = do someesc <- escapeLexer font <- currentFont @@ -323,7 +333,7 @@ lexLine = do _ -> Nothing return rv - linePart :: PandocMonad m => ManLexer m (Maybe (String, FontKind)) + linePart :: PandocMonad m => ManLexer m (Maybe (String, Font)) linePart = do lnpart <- many1 $ noneOf "\n\\" font <- currentFont @@ -424,10 +434,15 @@ parseSkippedContent = do onToken _ = return () strToInline :: RoffStr -> Inline -strToInline (s, Regular) = Str s -strToInline (s, Italic) = Emph [Str s] -strToInline (s, Bold) = Strong [Str s] -strToInline (s, ItalicBold) = Strong [Emph [Str s]] +strToInline (s, fonts) = inner $ toList fonts where + inner :: [FontKind] -> Inline + inner [] = Str s + inner (Bold:fs) = Strong [inner fs] + inner (Italic:fs) = Emph [inner fs] + + -- Monospace goes after Bold and Italic in ordered set + inner (Monospace:_) = Code nullAttr s + inner (Regular:fs) = inner fs parsePara :: PandocMonad m => ManParser m Block parsePara = Para <$> parseInlines -- cgit v1.2.3 From ce27bf9a02d98a6a5412a493577fa9d2f3cfd1fe Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Tue, 16 Oct 2018 03:12:06 +0300 Subject: builders --- src/Text/Pandoc/Readers/Man.hs | 122 ++++++++++++++++++++--------------------- test/Tests/Readers/Man.hs | 8 +-- 2 files changed, 62 insertions(+), 68 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 280acb9c4..d04718fc7 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -38,22 +38,22 @@ import Control.Monad (liftM, void) import Control.Monad.Except (throwError) import Data.Char (isDigit, isUpper, isLower) import Data.Default (Default) -import Data.Functor (($>)) import Data.Map (insert) -import Data.Set (Set, singleton, fromList, toList) +import Data.Set (Set, singleton) +import qualified Data.Set as S (fromList, toList) import Data.Maybe (catMaybes, fromMaybe, isNothing) import Data.List (intersperse, intercalate) import qualified Data.Text as T import Text.Pandoc.Class (PandocMonad(..)) -import Text.Pandoc.Definition +import Text.Pandoc.Builder as B hiding (singleton) import Text.Pandoc.Error (PandocError (PandocParsecError)) import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Options import Text.Pandoc.Parsing import Text.Pandoc.Shared (crFilter) -import Text.Parsec hiding (tokenPrim) -import Text.Parsec.Char () +import Text.Parsec hiding (tokenPrim, space) +import qualified Text.Parsec as Parsec import Text.Parsec.Pos (updatePosString) -- @@ -158,14 +158,10 @@ parseMan :: PandocMonad m => ManParser m Pandoc parseMan = do let parsers = [ try parseList, parseTitle, parsePara, parseSkippedContent , try parseCodeBlock, parseHeader, parseSkipMacro] - blocks <- many $ choice parsers - parserst <- getState - return $ Pandoc (stateMeta parserst) (filter (not . isNull) blocks) - - where - - isNull Null = True - isNull _ = False + bs <- many $ choice parsers + let (Pandoc _ blocks) = doc $ mconcat bs + meta <- stateMeta <$> getState + return $ Pandoc meta blocks eofline :: Stream s m Char => ParsecT s u m () eofline = void newline <|> eof @@ -216,7 +212,7 @@ escapeLexer = do fs <- many letterFontKind many letter char ']' - return $ fromList fs + return $ S.fromList fs letterFontKind :: PandocMonad m => ManLexer m FontKind letterFontKind = choice [ @@ -240,7 +236,7 @@ currentFont = fontKind <$> getState lexComment :: PandocMonad m => ManLexer m ManToken lexComment = do try $ string ".\\\"" - many space + many Parsec.space body <- many $ noneOf "\n" char '\n' return $ MComment body @@ -265,7 +261,7 @@ lexMacro = do "fi" -> knownMacro KCodeBlEnd "B" -> MStr (joinedArgs, singleton Bold) "BR" -> MMaybeLink joinedArgs - x | x `elem` ["BI", "IB"] -> MStr (joinedArgs, fromList [Italic, Bold]) + x | x `elem` ["BI", "IB"] -> MStr (joinedArgs, S.fromList [Italic, Bold]) x | x `elem` ["I", "IR", "RI"] -> MStr (joinedArgs, singleton Italic) "SH" -> MHeader 2 args "SS" -> MHeader 3 args @@ -403,15 +399,15 @@ mcomment = msatisfy isMComment where -- ManToken -> Block functions -- -parseTitle :: PandocMonad m => ManParser m Block +parseTitle :: PandocMonad m => ManParser m Blocks parseTitle = do (MMacro _ args) <- mmacro KTitle if null args - then return Null + then return mempty else do let mantitle = fst $ head args modifyState (changeTitle mantitle) - return $ Header 1 nullAttr [Str mantitle] + return $ header 1 $ str mantitle where changeTitle title pst = let meta = stateMeta pst @@ -419,11 +415,11 @@ parseTitle = do in pst {stateMeta = metaUp} -parseSkippedContent :: PandocMonad m => ManParser m Block +parseSkippedContent :: PandocMonad m => ManParser m Blocks parseSkippedContent = do tok <- munknownMacro <|> mcomment <|> memplyLine onToken tok - return Null + return mempty where @@ -433,50 +429,50 @@ parseSkippedContent = do logMessage $ SkippedContent ("Unknown macro: " ++ mname) pos onToken _ = return () -strToInline :: RoffStr -> Inline -strToInline (s, fonts) = inner $ toList fonts where - inner :: [FontKind] -> Inline - inner [] = Str s - inner (Bold:fs) = Strong [inner fs] - inner (Italic:fs) = Emph [inner fs] +strToInlines :: RoffStr -> Inlines +strToInlines (s, fonts) = inner $ S.toList fonts where + inner :: [FontKind] -> Inlines + inner [] = str s + inner (Bold:fs) = strong $ inner fs + inner (Italic:fs) = emph $ inner fs -- Monospace goes after Bold and Italic in ordered set - inner (Monospace:_) = Code nullAttr s + inner (Monospace:_) = code s inner (Regular:fs) = inner fs -parsePara :: PandocMonad m => ManParser m Block -parsePara = Para <$> parseInlines +parsePara :: PandocMonad m => ManParser m Blocks +parsePara = para <$> parseInlines -parseInlines :: PandocMonad m => ManParser m [Inline] +parseInlines :: PandocMonad m => ManParser m Inlines parseInlines = do inls <- many1 (strInl <|> lineInl <|> linkInl <|> comment) - let withspaces = intersperse [Space] inls - return $ concat withspaces + let withspaces = intersperse B.space inls + return $ mconcat withspaces where - strInl :: PandocMonad m => ManParser m [Inline] + strInl :: PandocMonad m => ManParser m Inlines strInl = do (MStr rstr) <- mstr - return [strToInline rstr] + return $ strToInlines rstr - lineInl :: PandocMonad m => ManParser m [Inline] + lineInl :: PandocMonad m => ManParser m Inlines lineInl = do (MLine fragments) <- mline - return $ strToInline <$> fragments + return $ mconcat $ strToInlines <$> fragments - linkInl :: PandocMonad m => ManParser m [Inline] + linkInl :: PandocMonad m => ManParser m Inlines linkInl = do (MMaybeLink txt) <- mmaybeLink let inls = case runParser linkParser () "" txt of Right lnk -> lnk - Left _ -> [Strong [Str txt]] + Left _ -> strong $ str txt return inls where -- assuming man pages are generated from Linux-like repository - linkParser :: Parsec String () [Inline] + linkParser :: Parsec String () Inlines linkParser = do mpage <- many1 (alphaNum <|> char '_') spacetab @@ -485,21 +481,19 @@ parseInlines = do char ')' other <- many anyChar let manurl pagename section = "../"++section++"/"++pagename++"."++section - return $ [ Link nullAttr [Strong [Str mpage]] (manurl mpage [mansect], mpage) - , Strong [Str $ " ("++[mansect] ++ ")" - , Str other] - ] + lnkInls = link (manurl mpage [mansect]) mpage (strong $ str mpage) + return $ lnkInls <> strong (str (" ("++[mansect] ++ ")") <> str other) - comment :: PandocMonad m => ManParser m [Inline] - comment = mcomment >> return [] + comment :: PandocMonad m => ManParser m Inlines + comment = mcomment >> return mempty -parseCodeBlock :: PandocMonad m => ManParser m Block +parseCodeBlock :: PandocMonad m => ManParser m Blocks parseCodeBlock = do mmacro KCodeBlStart toks <- many (mstr <|> mline <|> mmaybeLink <|> memplyLine <|> munknownMacro <|> mcomment) mmacro KCodeBlEnd - return $ CodeBlock nullAttr (intercalate "\n" . catMaybes $ extractText <$> toks) + return $ codeBlock (intercalate "\n" . catMaybes $ extractText <$> toks) where @@ -510,14 +504,14 @@ parseCodeBlock = do extractText MEmptyLine = Just "" -- string are intercalated with '\n', this prevents double '\n' extractText _ = Nothing -parseHeader :: PandocMonad m => ManParser m Block +parseHeader :: PandocMonad m => ManParser m Blocks parseHeader = do (MHeader lvl ss) <- mheader - return $ Header lvl nullAttr $ intersperse Space $ strToInline <$> ss + return $ header lvl (mconcat $ intersperse B.space $ strToInlines <$> ss) -type ListBuilder = [[Block]] -> Block +type ListBuilder = [Blocks] -> Blocks -parseList :: PandocMonad m => ManParser m Block +parseList :: PandocMonad m => ManParser m Blocks parseList = do xx <- many1 paras let bls = map snd xx @@ -526,13 +520,13 @@ parseList = do where - macroIPInl :: [RoffStr] -> [Inline] - macroIPInl (x:_:[]) = [strToInline x, Space] - macroIPInl _ = [] + macroIPInl :: [RoffStr] -> Inlines + macroIPInl (x:_:[]) = strToInlines x <> B.space + macroIPInl _ = mempty listKind :: [RoffStr] -> Maybe ListBuilder listKind (((c:_), _):_:[]) = - let params style = OrderedList (1, style, DefaultDelim) + let params style = orderedListWith (1, style, DefaultDelim) in case c of _ | isDigit c -> Just $ params Decimal _ | isUpper c -> Just $ params UpperAlpha @@ -541,18 +535,18 @@ parseList = do listKind _ = Nothing - paras :: PandocMonad m => ManParser m (ListBuilder, [Block]) + paras :: PandocMonad m => ManParser m (ListBuilder, Blocks) paras = do (MMacro _ args) <- mmacro KTab let lbuilderOpt = listKind args - lbuilder = fromMaybe BulletList lbuilderOpt + lbuilder = fromMaybe bulletList lbuilderOpt macroinl = macroIPInl args inls <- parseInlines - let parainls = if isNothing lbuilderOpt then macroinl ++ inls else inls - subls <- many sublist - return $ (lbuilder, (Plain parainls) : subls) + let parainls = if isNothing lbuilderOpt then macroinl <> inls else inls + subls <- mconcat <$> many sublist + return $ (lbuilder, plain parainls <> subls) - sublist :: PandocMonad m => ManParser m Block + sublist :: PandocMonad m => ManParser m Blocks sublist = do mmacro KSubTab bl <- parseList @@ -560,5 +554,5 @@ parseList = do return bl -- In case of weird man file it will be parsed succesfully -parseSkipMacro :: PandocMonad m => ManParser m Block -parseSkipMacro = mmacroAny >> return Null +parseSkipMacro :: PandocMonad m => ManParser m Blocks +parseSkipMacro = mmacroAny >> mempty diff --git a/test/Tests/Readers/Man.hs b/test/Tests/Readers/Man.hs index 6226099d2..4d8e13fb1 100644 --- a/test/Tests/Readers/Man.hs +++ b/test/Tests/Readers/Man.hs @@ -45,7 +45,7 @@ tests = [ =?> (para $ space <> str "aaa") , "link" =: ".BR aa (1)" - =?> (para $ fromList [Link nullAttr [Strong [Str "aa"]] ("../1/aa.1","aa"), Strong [Str " (1)",Str ""]]) + =?> (para $ link "../1/aa.1" "aa" (strong $ str "aa") <> (strong $ str " (1)")) ], testGroup "Escapes" [ "fonts" =: @@ -53,13 +53,13 @@ tests = [ =?> (para $ str "aa" <> (emph $ str "bb") <> str "cc") , "skip" =: "a\\%\\{\\}\\\n\\:b\\0" - =?> (para $ fromList $ map Str ["a", "b"]) + =?> (para $ str "ab") , "replace" =: "\\-\\ \\\\\\[lq]\\[rq]\\[em]\\[en]\\*(lq\\*(rq" - =?> (para $ fromList $ map Str ["-", " ", "\\", "“", "”", "—", "–", "«", "»"]) + =?> (para $ str "- \\“”—–«»") , "replace2" =: "\\t\\e\\`\\^\\|\\'" - =?> (para $ fromList $ map Str ["\t", "\\", "`", " ", " ", "`"]) + =?> (para $ str "\t\\` `") ], testGroup "Lists" [ "bullet" =: -- cgit v1.2.3 From 7741cdbf04f93875e6cd2051a6778c2ca35b5e40 Mon Sep 17 00:00:00 2001 From: Yan Pas Date: Wed, 17 Oct 2018 00:21:52 +0300 Subject: added old-style test --- src/Text/Pandoc/Readers/Man.hs | 1 + test/Tests/Old.hs | 4 + test/Tests/Readers/Man.hs | 2 +- test/man-reader.man | 189 +++++++++++++++++++++++++++++++++++++++++ test/man-reader.native | 94 ++++++++++++++++++++ 5 files changed, 289 insertions(+), 1 deletion(-) create mode 100644 test/man-reader.man create mode 100644 test/man-reader.native (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index d04718fc7..1ffdd1f91 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -73,6 +73,7 @@ type Font = Set FontKind type RoffStr = (String, Font) +-- TODO parse tables (see man tbl) data ManToken = MStr RoffStr | MLine [RoffStr] | MMaybeLink String diff --git a/test/Tests/Old.hs b/test/Tests/Old.hs index b426ffd07..842e0f656 100644 --- a/test/Tests/Old.hs +++ b/test/Tests/Old.hs @@ -171,6 +171,10 @@ tests = [ testGroup "markdown" , test "tables" ["-f", "native", "-t", "../data/sample.lua"] "tables.native" "tables.custom" ] + , testGroup "man" + [ test "reader" ["-r", "man", "-w", "native", "-s"] + "man-reader.man" "man-reader.native" + ] ] -- makes sure file is fully closed after reading diff --git a/test/Tests/Readers/Man.hs b/test/Tests/Readers/Man.hs index 4d8e13fb1..9dbfbab4d 100644 --- a/test/Tests/Readers/Man.hs +++ b/test/Tests/Readers/Man.hs @@ -73,7 +73,7 @@ tests = [ =?> orderedListWith (1,UpperAlpha,DefaultDelim) [plain $ str "first", plain $ str "second"] , "nested" =: ".IP\nfirst\n.RS\n.IP\n1a\n.IP\n1b\n.RE" - =?> fromList [BulletList [[Plain [Str "first"],BulletList [[Plain [Str "1a"]],[Plain [Str "1b"]]]]]] + =?> bulletList [(plain $ str "first") <> (bulletList [plain $ str "1a", plain $ str "1b"])] ], testGroup "CodeBlocks" [ "cb1"=: diff --git a/test/man-reader.man b/test/man-reader.man new file mode 100644 index 000000000..4f3395051 --- /dev/null +++ b/test/man-reader.man @@ -0,0 +1,189 @@ +.TH "Pandoc Man tests" "" "Oct 17, 2018" "" "" +.PP +This is a set of tests for pandoc. +.PP + * * * * * +.SH Headers +.SH Level 1 +.SS Level 2 + + * * * * * +.SH Paragraphs +.PP +Here's a regular paragraph. +.PP +Another paragraph +In Markdown 1.0.0 and earlier. +Version 8. +This line turns into a list item. +Because a hard\-wrapped line in the middle of a paragraph looked like a list +item. +.PP +There should be a hard line break +.PD 0 +.P +.PD +here. +.PP + * * * * * +.SH Block Quotes +Code in a block quote: +.IP +.nf +\f[C] +sub\ status\ { +\ \ \ \ print\ "working"; +} +\f[] +.fi +.PP +A list: +.IP "1." 3 +item one +.IP "2." 3 +item two +.PP +.SH Code Blocks +.PP +Code: +.IP +.nf +\f[C] +\-\-\-\-\ (should\ be\ four\ hyphens) + +sub\ status\ { +\ \ \ \ print\ "working"; +} + +\f[] +.fi +.PP +And: +.IP +.nf +\f[C] +\tthis\ code\ line is\ indented\ by\ one\ tab + +These\ should\ not\ be\ escaped:\ \ \\$\ \\\\\ \\>\ \\[\ \\{ +\f[] +.fi +.PP + * * * * * +.SH Lists +.SS Unordered +.PP +Asterisks: +.IP \[bu] 2 +asterisk 1 +.IP \[bu] 2 +asterisk 2 +.IP \[bu] 2 +asterisk 3 +.PP +.SS Ordered +.IP "1." 3 +First +.IP "2." 3 +Second +.IP "3." 3 +Third +.PP +.SS Nested +.IP \[bu] 2 +Tab +.RS 2 +.IP \[bu] 2 +Tab +.RS 2 +.IP \[bu] 2 +Tab +.RE +.RE +.PP +Here's another: +.IP "1." 3 +First +.IP "2." 3 +Second: +.RS 4 +.IP \[bu] 2 +Fee +.IP \[bu] 2 +Fie +.IP \[bu] 2 +Foe +.RE +.IP "3." 3 +Third +.PP +Same thing: +.IP "1." 3 +First +.IP "2." 3 +Second: +.RS 4 +.IP \[bu] 2 +Fee +.IP \[bu] 2 +Fie +.IP \[bu] 2 +Foe +.RE +.IP "3." 3 +Third +.SS different styles: +.IP "A." 3 +Upper Alpha +.RS 4 +.IP "I." 3 +Upper Roman. +.RS 4 +.IP "(6)" 4 +Decimal start with 6 +.RS 4 +.IP "c)" 3 +Lower alpha with paren +.RE +.RE +.RE +.PP + * * * * * +.SH Special Characters +AT&T has an ampersand in their name. +.PP +4 < 5. +.PP +6 > 5. +.PP +Backslash: \\ +.PP +Backtick: ` +.PP +Asterisk: * +.PP +Underscore: _ +.PP +Left brace: { +.PP +Right brace: } +.PP +Left bracket: [ +.PP +Right bracket: ] +.PP +Left paren: ( +.PP +Right paren: ) +.PP +Greater\-than: > +.PP +Hash: # +.PP +Period: . +.PP +Bang: ! +.PP +Plus: + +.PP +Minus: \- +.PP diff --git a/test/man-reader.native b/test/man-reader.native new file mode 100644 index 000000000..1fa010bd6 --- /dev/null +++ b/test/man-reader.native @@ -0,0 +1,94 @@ +Pandoc (Meta {unMeta = fromList [("title",MetaString "Pandoc Man tests")]}) +[Header 1 ("",[],[]) [Str "Pandoc Man tests"] +,Para [Str "This is a set of tests for pandoc."] +,Para [Str " * * * * *"] +,Header 2 ("",[],[]) [Str "Headers"] +,Header 2 ("",[],[]) [Str "Level",Space,Str "1"] +,Header 3 ("",[],[]) [Str "Level",Space,Str "2"] +,Para [Str " * * * * *"] +,Header 2 ("",[],[]) [Str "Paragraphs"] +,Para [Str "Here's a regular paragraph."] +,Para [Str "Another paragraph",Space,Str "In Markdown 1.0.0 and earlier.",Space,Str "Version 8.",Space,Str "This line turns into a list item.",Space,Str "Because a hard-wrapped line in the middle of a paragraph looked like a list",Space,Str "item."] +,Para [Str "There should be a hard line break"] +,Para [Str "here."] +,Para [Str " * * * * *"] +,Header 2 ("",[],[]) [Str "Block",Space,Str "Quotes"] +,Para [Str "Code in a block quote:"] +,CodeBlock ("",[],[]) "\nsub status {\n print \"working\";\n}\n" +,Para [Str "A list:"] +,OrderedList (1,Decimal,DefaultDelim) + [[Plain [Str "item one"]] + ,[Plain [Str "item two"]]] +,Header 2 ("",[],[]) [Str "Code",Space,Str "Blocks"] +,Para [Str "Code:"] +,CodeBlock ("",[],[]) "\n---- (should be four hyphens)\n\nsub status {\n print \"working\";\n}\n\n" +,Para [Str "And:"] +,CodeBlock ("",[],[]) "\n\tthis code line is indented by one tab\n\nThese should not be escaped: \\$ \\\\ \\> \\[ \\{\n" +,Para [Str " * * * * *"] +,Header 2 ("",[],[]) [Str "Lists"] +,Header 3 ("",[],[]) [Str "Unordered"] +,Para [Str "Asterisks:"] +,BulletList + [[Plain [Str "",Space,Str "asterisk 1"]] + ,[Plain [Str "",Space,Str "asterisk 2"]] + ,[Plain [Str "",Space,Str "asterisk 3"]]] +,Header 3 ("",[],[]) [Str "Ordered"] +,OrderedList (1,Decimal,DefaultDelim) + [[Plain [Str "First"]] + ,[Plain [Str "Second"]] + ,[Plain [Str "Third"]]] +,Header 3 ("",[],[]) [Str "Nested"] +,BulletList + [[Plain [Str "",Space,Str "Tab"] + ,BulletList + [[Plain [Str "",Space,Str "Tab"] + ,BulletList + [[Plain [Str "",Space,Str "Tab"]]]]]]] +,Para [Str "Here's another:"] +,OrderedList (1,Decimal,DefaultDelim) + [[Plain [Str "First"]] + ,[Plain [Str "Second:"] + ,BulletList + [[Plain [Str "",Space,Str "Fee"]] + ,[Plain [Str "",Space,Str "Fie"]] + ,[Plain [Str "",Space,Str "Foe"]]]] + ,[Plain [Str "Third"]]] +,Para [Str "Same thing:"] +,OrderedList (1,Decimal,DefaultDelim) + [[Plain [Str "First"]] + ,[Plain [Str "Second:"] + ,BulletList + [[Plain [Str "",Space,Str "Fee"]] + ,[Plain [Str "",Space,Str "Fie"]] + ,[Plain [Str "",Space,Str "Foe"]]]] + ,[Plain [Str "Third"]]] +,Header 3 ("",[],[]) [Str "different",Space,Str "styles:"] +,OrderedList (1,UpperAlpha,DefaultDelim) + [[Plain [Str "Upper Alpha"] + ,OrderedList (1,UpperAlpha,DefaultDelim) + [[Plain [Str "Upper Roman."] + ,BulletList + [[Plain [Str "(6)",Space,Str "Decimal start with 6"] + ,OrderedList (1,LowerAlpha,DefaultDelim) + [[Plain [Str "Lower alpha with paren"]]]]]]]]] +,Para [Str " * * * * *"] +,Header 2 ("",[],[]) [Str "Special",Space,Str "Characters"] +,Para [Str "AT&T has an ampersand in their name."] +,Para [Str "4 < 5."] +,Para [Str "6 > 5."] +,Para [Str "Backslash: \\"] +,Para [Str "Backtick: `"] +,Para [Str "Asterisk: *"] +,Para [Str "Underscore: _"] +,Para [Str "Left brace: {"] +,Para [Str "Right brace: }"] +,Para [Str "Left bracket: ["] +,Para [Str "Right bracket: ]"] +,Para [Str "Left paren: ("] +,Para [Str "Right paren: )"] +,Para [Str "Greater-than: >"] +,Para [Str "Hash: #"] +,Para [Str "Period: ."] +,Para [Str "Bang: !"] +,Para [Str "Plus: +"] +,Para [Str "Minus: -"]] -- cgit v1.2.3