diff options
Diffstat (limited to 'src/Text/Pandoc')
31 files changed, 2693 insertions, 848 deletions
diff --git a/src/Text/Pandoc/Biblio.hs b/src/Text/Pandoc/Biblio.hs index c8e87b2a0..13569a4d9 100644 --- a/src/Text/Pandoc/Biblio.hs +++ b/src/Text/Pandoc/Biblio.hs @@ -31,14 +31,14 @@ module Text.Pandoc.Biblio ( processBiblio ) where import Data.List import Data.Unique -import Data.Char ( isDigit ) +import Data.Char ( isDigit, isPunctuation ) import qualified Data.Map as M import Text.CSL hiding ( Cite(..), Citation(..) ) import qualified Text.CSL as CSL ( Cite(..) ) import Text.Pandoc.Definition import Text.Pandoc.Generic import Text.Pandoc.Shared (stringify) -import Text.ParserCombinators.Parsec +import Text.Parsec import Control.Monad -- | Process a 'Pandoc' document by adding citations formatted @@ -53,42 +53,66 @@ processBiblio cslfile abrfile r p Just f -> readJsonAbbrevFile f Nothing -> return [] p' <- bottomUpM setHash p - let (nts,grps) = if styleClass csl == "note" - then let cits = queryWith getCite p' - ncits = map (queryWith getCite) $ queryWith getNote p' - needNt = cits \\ concat ncits - in (,) needNt $ getNoteCitations needNt p' - else (,) [] $ queryWith getCitation p' + let grps = queryWith getCitation p' style = csl { styleAbbrevs = abbrevs } result = citeproc procOpts style r (setNearNote style $ map (map toCslCite) grps) cits_map = M.fromList $ zip grps (citations result) biblioList = map (renderPandoc' style) (bibliography result) - Pandoc m b = bottomUp (procInlines $ processCite style cits_map) p' - return . generateNotes nts . Pandoc m $ b ++ biblioList + Pandoc m b = bottomUp (processCite style cits_map) p' + b' = bottomUp mvPunct $ deNote b + return $ Pandoc m $ b' ++ biblioList -- | Substitute 'Cite' elements with formatted citations. -processCite :: Style -> M.Map [Citation] [FormattedOutput] -> [Inline] -> [Inline] -processCite s cs (Cite t _ : rest) = +processCite :: Style -> M.Map [Citation] [FormattedOutput] -> Inline -> Inline +processCite s cs (Cite t _) = case M.lookup t cs of - Just (x:xs) -> - if isTextualCitation t - then renderPandoc s [x] ++ - if null xs - then processCite s cs rest - else [Space, Cite t (renderPandoc s xs)] - ++ processCite s cs rest - else Cite t (renderPandoc s (x:xs)) : processCite s cs rest - _ -> Str ("Error processing " ++ show t) : processCite s cs rest -processCite s cs (x:xs) = x : processCite s cs xs -processCite _ _ [] = [] - -procInlines :: ([Inline] -> [Inline]) -> Block -> Block -procInlines f b - | Plain inls <- b = Plain $ f inls - | Para inls <- b = Para $ f inls - | Header i inls <- b = Header i $ f inls - | otherwise = b + Just (x:xs) + | isTextualCitation t && not (null xs) -> + let xs' = renderPandoc s xs + in if styleClass s == "note" + then Cite t (renderPandoc s [x] ++ [Note [Para xs']]) + else Cite t (renderPandoc s [x] ++ [Space | not (startWithPunct xs')] ++ xs') + | otherwise -> if styleClass s == "note" + then Cite t [Note [Para $ renderPandoc s (x:xs)]] + else Cite t (renderPandoc s (x:xs)) + _ -> Strong [Str "???"] -- TODO raise error instead? +processCite _ _ x = x + +isNote :: Inline -> Bool +isNote (Note _) = True +isNote (Cite _ [Note _]) = True +isNote _ = False + +mvPunct :: [Inline] -> [Inline] +mvPunct (Space : Space : xs) = Space : xs +mvPunct (Space : x : ys) | isNote x, startWithPunct ys = + Str (headInline ys) : x : tailFirstInlineStr ys +mvPunct (Space : x : ys) | isNote x = x : ys +mvPunct xs = xs + +sanitize :: [Inline] -> [Inline] +sanitize xs | endWithPunct xs = toCapital' xs + | otherwise = toCapital' (xs ++ [Str "."]) + +-- NOTE: toCapital' works around a bug in toCapital from citeproc-hs 0.3.4. +-- When citeproc-hs is fixed, we can return to using toCapital in sanitize. +toCapital' :: [Inline] -> [Inline] +toCapital' [] = [] +toCapital' xs = case toCapital xs of + [] -> xs + ys -> ys + +deNote :: [Block] -> [Block] +deNote = topDown go + where go (Note [Para xs]) = Note $ bottomUp go' [Para $ sanitize xs] + go (Note xs) = Note $ bottomUp go' xs + go x = x + go' (Note [Para xs]:ys) = + if startWithPunct ys && endWithPunct xs + then initInline xs ++ ys + else xs ++ ys + go' xs = xs isTextualCitation :: [Citation] -> Bool isTextualCitation (c:_) = citationMode c == AuthorInText @@ -100,77 +124,29 @@ getCitation :: Inline -> [[Citation]] getCitation i | Cite t _ <- i = [t] | otherwise = [] -getNote :: Inline -> [Inline] -getNote i | Note _ <- i = [i] - | otherwise = [] - -getCite :: Inline -> [Inline] -getCite i | Cite _ _ <- i = [i] - | otherwise = [] - -getNoteCitations :: [Inline] -> Pandoc -> [[Citation]] -getNoteCitations needNote - = let mvCite i = if i `elem` needNote then Note [Para [i]] else i - setNote = bottomUp mvCite - getCits = concat . flip (zipWith $ setCiteNoteNum) [1..] . - map (queryWith getCite) . queryWith getNote . setNote - in queryWith getCitation . getCits - setHash :: Citation -> IO Citation setHash (Citation i p s cm nn _) = hashUnique `fmap` newUnique >>= return . Citation i p s cm nn -generateNotes :: [Inline] -> Pandoc -> Pandoc -generateNotes needNote = bottomUp (mvCiteInNote needNote) - -mvCiteInNote :: [Inline] -> Block -> Block -mvCiteInNote is = procInlines mvCite - where - mvCite :: [Inline] -> [Inline] - mvCite inls - | x:i:xs <- inls, startWithPunct xs - , x == Space, i `elem_` is = switch i xs ++ mvCite (tailFirstInlineStr xs) - | x:i:xs <- inls - , x == Space, i `elem_` is = mvInNote i : mvCite xs - | i:xs <- inls, i `elem_` is - , startWithPunct xs = switch i xs ++ mvCite (tailFirstInlineStr xs) - | i:xs <- inls, Note _ <- i = checkNt i : mvCite xs - | i:xs <- inls = i : mvCite xs - | otherwise = [] - elem_ x xs = case x of Cite cs _ -> (Cite cs []) `elem` xs; _ -> False - switch i xs = Str (headInline xs) : mvInNote i : [] - mvInNote i - | Cite t o <- i = Note [Para [Cite t $ sanitize o]] - | otherwise = Note [Para [i ]] - sanitize i - | endWithPunct i = toCapital i - | otherwise = toCapital (i ++ [Str "."]) - - checkPt i - | Cite c o : xs <- i , endWithPunct o, startWithPunct xs - = Cite c (initInline o) : checkPt xs - | x:xs <- i = x : checkPt xs - | otherwise = [] - checkNt = bottomUp $ procInlines checkPt - -setCiteNoteNum :: [Inline] -> Int -> [Inline] -setCiteNoteNum ((Cite cs o):xs) n = Cite (setCitationNoteNum n cs) o : setCiteNoteNum xs n -setCiteNoteNum _ _ = [] - -setCitationNoteNum :: Int -> [Citation] -> [Citation] -setCitationNoteNum i = map $ \c -> c { citationNoteNum = i} - toCslCite :: Citation -> CSL.Cite toCslCite c = let (l, s) = locatorWords $ citationSuffix c (la,lo) = parseLocator l + s' = case (l,s,citationMode c) of + -- treat a bare locator as if it begins with comma + -- so @item1 [blah] is like [@item1, blah] + ("",(x:_),AuthorInText) | not (isPunct x) + -> [Str ",",Space] ++ s + _ -> s + isPunct (Str (x:_)) = isPunctuation x + isPunct _ = False citMode = case citationMode c of AuthorInText -> (True, False) SuppressAuthor -> (False,True ) NormalCitation -> (False,False) in emptyCite { CSL.citeId = citationId c , CSL.citePrefix = PandocText $ citationPrefix c - , CSL.citeSuffix = PandocText $ s + , CSL.citeSuffix = PandocText s' , CSL.citeLabel = la , CSL.citeLocator = lo , CSL.citeNoteNumber = show $ citationNoteNum c @@ -189,7 +165,7 @@ locatorWords inp = breakup (x : xs) = x : breakup xs splitup = groupBy (\x y -> x /= '\160' && y /= '\160') -pLocatorWords :: GenParser Inline st (String, [Inline]) +pLocatorWords :: Parsec [Inline] st (String, [Inline]) pLocatorWords = do l <- pLocator s <- getInput -- rest is suffix @@ -197,16 +173,16 @@ pLocatorWords = do then return (init l, Str "," : s) else return (l, s) -pMatch :: (Inline -> Bool) -> GenParser Inline st Inline +pMatch :: (Inline -> Bool) -> Parsec [Inline] st Inline pMatch condition = try $ do t <- anyToken guard $ condition t return t -pSpace :: GenParser Inline st Inline +pSpace :: Parsec [Inline] st Inline pSpace = pMatch (\t -> t == Space || t == Str "\160") -pLocator :: GenParser Inline st String +pLocator :: Parsec [Inline] st String pLocator = try $ do optional $ pMatch (== Str ",") optional pSpace @@ -214,7 +190,7 @@ pLocator = try $ do gs <- many1 pWordWithDigits return $ stringify f ++ (' ' : unwords gs) -pWordWithDigits :: GenParser Inline st String +pWordWithDigits :: Parsec [Inline] st String pWordWithDigits = try $ do pSpace r <- many1 (notFollowedBy pSpace >> anyToken) diff --git a/src/Text/Pandoc/Highlighting.hs b/src/Text/Pandoc/Highlighting.hs index 4fb799cf1..080acebee 100644 --- a/src/Text/Pandoc/Highlighting.hs +++ b/src/Text/Pandoc/Highlighting.hs @@ -39,6 +39,7 @@ module Text.Pandoc.Highlighting ( languages , styleToCss , pygments , espresso + , zenburn , tango , kate , monochrome diff --git a/src/Text/Pandoc/MIME.hs b/src/Text/Pandoc/MIME.hs index d3df2f2e1..f9749cece 100644 --- a/src/Text/Pandoc/MIME.hs +++ b/src/Text/Pandoc/MIME.hs @@ -448,6 +448,7 @@ getMimeType f = M.lookup (map toLower $ drop 1 $ takeExtension f) mimeTypes ,("wax","audio/x-ms-wax") ,("wbmp","image/vnd.wap.wbmp") ,("wbxml","application/vnd.wap.wbxml") + ,("webm","video/webm") ,("wk","application/x-123") ,("wm","video/x-ms-wm") ,("wma","audio/x-ms-wma") diff --git a/src/Text/Pandoc/PDF.hs b/src/Text/Pandoc/PDF.hs index 59cce2e45..4f3f38a14 100644 --- a/src/Text/Pandoc/PDF.hs +++ b/src/Text/Pandoc/PDF.hs @@ -57,8 +57,8 @@ tex2pdf' :: FilePath -- ^ temp directory for output -> IO (Either ByteString ByteString) tex2pdf' tmpDir program source = do let numruns = if "\\tableofcontents" `isInfixOf` source - then 3 - else 1 + then 3 -- to get page numbers + else 2 -- 1 run won't give you PDF bookmarks (exit, log', mbPdf) <- runTeXProgram program numruns tmpDir source let msg = "Error producing PDF from TeX source." case (exit, mbPdf) of diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 883eaf65b..61c47b730 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -52,6 +52,7 @@ module Text.Pandoc.Parsing ( (>>~), failUnlessLHS, escaped, characterReference, + updateLastStrPos, anyOrderedListMarker, orderedListMarker, charRef, @@ -73,21 +74,75 @@ module Text.Pandoc.Parsing ( (>>~), lookupKeySrc, smartPunctuation, macro, - applyMacros' ) + applyMacros', + -- * Re-exports from Text.Pandoc.Parsec + Parser, + runParser, + parse, + anyToken, + getInput, + setInput, + unexpected, + char, + letter, + digit, + alphaNum, + skipMany, + skipMany1, + spaces, + space, + anyChar, + satisfy, + newline, + string, + count, + eof, + noneOf, + oneOf, + lookAhead, + notFollowedBy, + many, + many1, + manyTill, + (<|>), + (<?>), + choice, + try, + sepBy1, + sepBy, + sepEndBy, + endBy1, + option, + optional, + optionMaybe, + getState, + setState, + updateState, + getPosition, + setPosition, + sourceColumn, + sourceLine, + newPos, + token + ) where import Text.Pandoc.Definition import Text.Pandoc.Generic import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn) -import Text.ParserCombinators.Parsec +import Text.Parsec +import Text.Parsec.Pos (newPos) import Data.Char ( toLower, toUpper, ord, isAscii, isAlphaNum, isDigit, isPunctuation ) import Data.List ( intercalate, transpose ) import Network.URI ( parseURI, URI (..), isAllowedInURI ) -import Control.Monad ( join, liftM, guard ) +import Control.Monad ( join, liftM, guard, mzero ) import Text.Pandoc.Shared import qualified Data.Map as M import Text.TeXMath.Macros (applyMacros, Macro, parseMacroDefinitions) import Text.HTML.TagSoup.Entity ( lookupEntity ) +import Data.Default + +type Parser t s = Parsec t s -- | Like >>, but returns the operation on the left. -- (Suggested by Tillmann Rendel on Haskell-cafe list.) @@ -95,13 +150,13 @@ import Text.HTML.TagSoup.Entity ( lookupEntity ) a >>~ b = a >>= \x -> b >> return x -- | Parse any line of text -anyLine :: GenParser Char st [Char] +anyLine :: Parsec [Char] st [Char] anyLine = manyTill anyChar newline -- | Like @manyTill@, but reads at least one item. -many1Till :: GenParser tok st a - -> GenParser tok st end - -> GenParser tok st [a] +many1Till :: Parsec [tok] st a + -> Parsec [tok] st end + -> Parsec [tok] st [a] many1Till p end = do first <- p rest <- manyTill p end @@ -110,7 +165,7 @@ many1Till p end = do -- | A more general form of @notFollowedBy@. This one allows any -- type of parser to be specified, and succeeds only if that parser fails. -- It does not consume any input. -notFollowedBy' :: Show b => GenParser a st b -> GenParser a st () +notFollowedBy' :: Show b => Parsec [a] st b -> Parsec [a] st () notFollowedBy' p = try $ join $ do a <- try p return (unexpected (show a)) <|> @@ -118,39 +173,39 @@ notFollowedBy' p = try $ join $ do a <- try p -- (This version due to Andrew Pimlott on the Haskell mailing list.) -- | Parses one of a list of strings (tried in order). -oneOfStrings :: [String] -> GenParser Char st String +oneOfStrings :: [String] -> Parsec [Char] st String oneOfStrings listOfStrings = choice $ map (try . string) listOfStrings -- | Parses a space or tab. -spaceChar :: CharParser st Char +spaceChar :: Parsec [Char] st Char spaceChar = satisfy $ \c -> c == ' ' || c == '\t' -- | Parses a nonspace, nonnewline character. -nonspaceChar :: CharParser st Char +nonspaceChar :: Parsec [Char] st Char nonspaceChar = satisfy $ \x -> x /= '\t' && x /= '\n' && x /= ' ' && x /= '\r' -- | Skips zero or more spaces or tabs. -skipSpaces :: GenParser Char st () +skipSpaces :: Parsec [Char] st () skipSpaces = skipMany spaceChar -- | Skips zero or more spaces or tabs, then reads a newline. -blankline :: GenParser Char st Char +blankline :: Parsec [Char] st Char blankline = try $ skipSpaces >> newline -- | Parses one or more blank lines and returns a string of newlines. -blanklines :: GenParser Char st [Char] +blanklines :: Parsec [Char] st [Char] blanklines = many1 blankline -- | Parses material enclosed between start and end parsers. -enclosed :: GenParser Char st t -- ^ start parser - -> GenParser Char st end -- ^ end parser - -> GenParser Char st a -- ^ content parser (to be used repeatedly) - -> GenParser Char st [a] +enclosed :: Parsec [Char] st t -- ^ start parser + -> Parsec [Char] st end -- ^ end parser + -> Parsec [Char] st a -- ^ content parser (to be used repeatedly) + -> Parsec [Char] st [a] enclosed start end parser = try $ start >> notFollowedBy space >> many1Till parser end -- | Parse string, case insensitive. -stringAnyCase :: [Char] -> CharParser st String +stringAnyCase :: [Char] -> Parsec [Char] st String stringAnyCase [] = string "" stringAnyCase (x:xs) = do firstChar <- char (toUpper x) <|> char (toLower x) @@ -158,7 +213,7 @@ stringAnyCase (x:xs) = do return (firstChar:rest) -- | Parse contents of 'str' using 'parser' and return result. -parseFromString :: GenParser tok st a -> [tok] -> GenParser tok st a +parseFromString :: Parsec [tok] st a -> [tok] -> Parsec [tok] st a parseFromString parser str = do oldPos <- getPosition oldInput <- getInput @@ -169,7 +224,7 @@ parseFromString parser str = do return result -- | Parse raw line block up to and including blank lines. -lineClump :: GenParser Char st String +lineClump :: Parsec [Char] st String lineClump = blanklines <|> (many1 (notFollowedBy blankline >> anyLine) >>= return . unlines) @@ -178,8 +233,8 @@ lineClump = blanklines -- pairs of open and close, which must be different. For example, -- @charsInBalanced '(' ')' anyChar@ will parse "(hello (there))" -- and return "hello (there)". -charsInBalanced :: Char -> Char -> GenParser Char st Char - -> GenParser Char st String +charsInBalanced :: Char -> Char -> Parsec [Char] st Char + -> Parsec [Char] st String charsInBalanced open close parser = try $ do char open let isDelim c = c == open || c == close @@ -204,7 +259,7 @@ uppercaseRomanDigits = map toUpper lowercaseRomanDigits -- | Parses a roman numeral (uppercase or lowercase), returns number. romanNumeral :: Bool -- ^ Uppercase if true - -> GenParser Char st Int + -> Parsec [Char] st Int romanNumeral upperCase = do let romanDigits = if upperCase then uppercaseRomanDigits @@ -234,14 +289,14 @@ romanNumeral upperCase = do -- Parsers for email addresses and URIs -emailChar :: GenParser Char st Char +emailChar :: Parsec [Char] st Char emailChar = alphaNum <|> satisfy (\c -> c == '-' || c == '+' || c == '_' || c == '.') -domainChar :: GenParser Char st Char +domainChar :: Parsec [Char] st Char domainChar = alphaNum <|> char '-' -domain :: GenParser Char st [Char] +domain :: Parsec [Char] st [Char] domain = do first <- many1 domainChar dom <- many1 $ try (char '.' >> many1 domainChar ) @@ -249,7 +304,7 @@ domain = do -- | Parses an email address; returns original and corresponding -- escaped mailto: URI. -emailAddress :: GenParser Char st (String, String) +emailAddress :: Parsec [Char] st (String, String) emailAddress = try $ do firstLetter <- alphaNum restAddr <- many emailChar @@ -260,7 +315,7 @@ emailAddress = try $ do return (full, escapeURI $ "mailto:" ++ full) -- | Parses a URI. Returns pair of original and URI-escaped version. -uri :: GenParser Char st (String, String) +uri :: Parsec [Char] st (String, String) uri = try $ do let protocols = [ "http:", "https:", "ftp:", "file:", "mailto:", "news:", "telnet:" ] @@ -294,8 +349,8 @@ uri = try $ do -- displacement (the difference between the source column at the end -- and the source column at the beginning). Vertical displacement -- (source row) is ignored. -withHorizDisplacement :: GenParser Char st a -- ^ Parser to apply - -> GenParser Char st (a, Int) -- ^ (result, displacement) +withHorizDisplacement :: Parsec [Char] st a -- ^ Parser to apply + -> Parsec [Char] st (a, Int) -- ^ (result, displacement) withHorizDisplacement parser = do pos1 <- getPosition result <- parser @@ -304,7 +359,7 @@ withHorizDisplacement parser = do -- | Applies a parser and returns the raw string that was parsed, -- along with the value produced by the parser. -withRaw :: GenParser Char st a -> GenParser Char st (a, [Char]) +withRaw :: Parsec [Char] st a -> Parsec [Char] st (a, [Char]) withRaw parser = do pos1 <- getPosition inp <- getInput @@ -321,26 +376,26 @@ withRaw parser = do -- | Parses a character and returns 'Null' (so that the parser can move on -- if it gets stuck). -nullBlock :: GenParser Char st Block +nullBlock :: Parsec [Char] st Block nullBlock = anyChar >> return Null -- | Fail if reader is in strict markdown syntax mode. -failIfStrict :: GenParser a ParserState () +failIfStrict :: Parsec [a] ParserState () failIfStrict = do state <- getState if stateStrict state then fail "strict mode" else return () -- | Fail unless we're in literate haskell mode. -failUnlessLHS :: GenParser tok ParserState () +failUnlessLHS :: Parsec [tok] ParserState () failUnlessLHS = getState >>= guard . stateLiterateHaskell -- | Parses backslash, then applies character parser. -escaped :: GenParser Char st Char -- ^ Parser for character to escape - -> GenParser Char st Char +escaped :: Parsec [Char] st Char -- ^ Parser for character to escape + -> Parsec [Char] st Char escaped parser = try $ char '\\' >> parser -- | Parse character entity. -characterReference :: GenParser Char st Char +characterReference :: Parsec [Char] st Char characterReference = try $ do char '&' ent <- many1Till nonspaceChar (char ';') @@ -349,19 +404,19 @@ characterReference = try $ do Nothing -> fail "entity not found" -- | Parses an uppercase roman numeral and returns (UpperRoman, number). -upperRoman :: GenParser Char st (ListNumberStyle, Int) +upperRoman :: Parsec [Char] st (ListNumberStyle, Int) upperRoman = do num <- romanNumeral True return (UpperRoman, num) -- | Parses a lowercase roman numeral and returns (LowerRoman, number). -lowerRoman :: GenParser Char st (ListNumberStyle, Int) +lowerRoman :: Parsec [Char] st (ListNumberStyle, Int) lowerRoman = do num <- romanNumeral False return (LowerRoman, num) -- | Parses a decimal numeral and returns (Decimal, number). -decimal :: GenParser Char st (ListNumberStyle, Int) +decimal :: Parsec [Char] st (ListNumberStyle, Int) decimal = do num <- many1 digit return (Decimal, read num) @@ -370,7 +425,7 @@ decimal = do -- returns (DefaultStyle, [next example number]). The next -- example number is incremented in parser state, and the label -- (if present) is added to the label table. -exampleNum :: GenParser Char ParserState (ListNumberStyle, Int) +exampleNum :: Parsec [Char] ParserState (ListNumberStyle, Int) exampleNum = do char '@' lab <- many (alphaNum <|> satisfy (\c -> c == '_' || c == '-')) @@ -384,38 +439,38 @@ exampleNum = do return (Example, num) -- | Parses a '#' returns (DefaultStyle, 1). -defaultNum :: GenParser Char st (ListNumberStyle, Int) +defaultNum :: Parsec [Char] st (ListNumberStyle, Int) defaultNum = do char '#' return (DefaultStyle, 1) -- | Parses a lowercase letter and returns (LowerAlpha, number). -lowerAlpha :: GenParser Char st (ListNumberStyle, Int) +lowerAlpha :: Parsec [Char] st (ListNumberStyle, Int) lowerAlpha = do ch <- oneOf ['a'..'z'] return (LowerAlpha, ord ch - ord 'a' + 1) -- | Parses an uppercase letter and returns (UpperAlpha, number). -upperAlpha :: GenParser Char st (ListNumberStyle, Int) +upperAlpha :: Parsec [Char] st (ListNumberStyle, Int) upperAlpha = do ch <- oneOf ['A'..'Z'] return (UpperAlpha, ord ch - ord 'A' + 1) -- | Parses a roman numeral i or I -romanOne :: GenParser Char st (ListNumberStyle, Int) +romanOne :: Parsec [Char] st (ListNumberStyle, Int) romanOne = (char 'i' >> return (LowerRoman, 1)) <|> (char 'I' >> return (UpperRoman, 1)) -- | Parses an ordered list marker and returns list attributes. -anyOrderedListMarker :: GenParser Char ParserState ListAttributes +anyOrderedListMarker :: Parsec [Char] ParserState ListAttributes anyOrderedListMarker = choice $ [delimParser numParser | delimParser <- [inPeriod, inOneParen, inTwoParens], numParser <- [decimal, exampleNum, defaultNum, romanOne, lowerAlpha, lowerRoman, upperAlpha, upperRoman]] -- | Parses a list number (num) followed by a period, returns list attributes. -inPeriod :: GenParser Char st (ListNumberStyle, Int) - -> GenParser Char st ListAttributes +inPeriod :: Parsec [Char] st (ListNumberStyle, Int) + -> Parsec [Char] st ListAttributes inPeriod num = try $ do (style, start) <- num char '.' @@ -425,16 +480,16 @@ inPeriod num = try $ do return (start, style, delim) -- | Parses a list number (num) followed by a paren, returns list attributes. -inOneParen :: GenParser Char st (ListNumberStyle, Int) - -> GenParser Char st ListAttributes +inOneParen :: Parsec [Char] st (ListNumberStyle, Int) + -> Parsec [Char] st ListAttributes inOneParen num = try $ do (style, start) <- num char ')' return (start, style, OneParen) -- | Parses a list number (num) enclosed in parens, returns list attributes. -inTwoParens :: GenParser Char st (ListNumberStyle, Int) - -> GenParser Char st ListAttributes +inTwoParens :: Parsec [Char] st (ListNumberStyle, Int) + -> Parsec [Char] st ListAttributes inTwoParens num = try $ do char '(' (style, start) <- num @@ -445,7 +500,7 @@ inTwoParens num = try $ do -- returns number. orderedListMarker :: ListNumberStyle -> ListNumberDelim - -> GenParser Char ParserState Int + -> Parsec [Char] ParserState Int orderedListMarker style delim = do let num = defaultNum <|> -- # can continue any kind of list case style of @@ -465,19 +520,19 @@ orderedListMarker style delim = do return start -- | Parses a character reference and returns a Str element. -charRef :: GenParser Char st Inline +charRef :: Parsec [Char] st Inline charRef = do c <- characterReference return $ Str [c] -- | Parse a table using 'headerParser', 'rowParser', -- 'lineParser', and 'footerParser'. -tableWith :: GenParser Char ParserState ([[Block]], [Alignment], [Int]) - -> ([Int] -> GenParser Char ParserState [[Block]]) - -> GenParser Char ParserState sep - -> GenParser Char ParserState end - -> GenParser Char ParserState [Inline] - -> GenParser Char ParserState Block +tableWith :: Parsec [Char] ParserState ([[Block]], [Alignment], [Int]) + -> ([Int] -> Parsec [Char] ParserState [[Block]]) + -> Parsec [Char] ParserState sep + -> Parsec [Char] ParserState end + -> Parsec [Char] ParserState [Inline] + -> Parsec [Char] ParserState Block tableWith headerParser rowParser lineParser footerParser captionParser = try $ do caption' <- option [] captionParser (heads, aligns, indices) <- headerParser @@ -615,10 +670,10 @@ extraTableFooter = blanklines -- (which may be grid), then the rows, -- which may be grid, separated by blank lines, and -- ending with a footer (dashed line followed by blank line). -gridTableWith :: GenParser Char ParserState Block -- ^ Block parser - -> GenParser Char ParserState [Inline] -- ^ Caption parser +gridTableWith :: Parsec [Char] ParserState Block -- ^ Block parser + -> Parsec [Char] ParserState [Inline] -- ^ Caption parser -> Bool -- ^ Headerless table - -> GenParser Char ParserState Block + -> Parsec [Char] ParserState Block gridTableWith block tableCaption headless = tableWith (gridTableHeader headless block) (gridTableRow block) (gridTableSep '-') gridTableFooter tableCaption @@ -626,13 +681,13 @@ gridTableSplitLine :: [Int] -> String -> [String] gridTableSplitLine indices line = map removeFinalBar $ tail $ splitStringByIndices (init indices) $ removeTrailingSpace line -gridPart :: Char -> GenParser Char st (Int, Int) +gridPart :: Char -> Parsec [Char] st (Int, Int) gridPart ch = do dashes <- many1 (char ch) char '+' return (length dashes, length dashes + 1) -gridDashedLines :: Char -> GenParser Char st [(Int,Int)] +gridDashedLines :: Char -> Parsec [Char] st [(Int,Int)] gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) >>~ blankline removeFinalBar :: String -> String @@ -640,13 +695,13 @@ removeFinalBar = reverse . dropWhile (`elem` " \t") . dropWhile (=='|') . reverse -- | Separator between rows of grid table. -gridTableSep :: Char -> GenParser Char ParserState Char +gridTableSep :: Char -> Parsec [Char] ParserState Char gridTableSep ch = try $ gridDashedLines ch >> return '\n' -- | Parse header for a grid table. gridTableHeader :: Bool -- ^ Headerless table - -> GenParser Char ParserState Block - -> GenParser Char ParserState ([[Block]], [Alignment], [Int]) + -> Parsec [Char] ParserState Block + -> Parsec [Char] ParserState ([[Block]], [Alignment], [Int]) gridTableHeader headless block = try $ do optional blanklines dashes <- gridDashedLines '-' @@ -670,16 +725,16 @@ gridTableHeader headless block = try $ do map removeLeadingTrailingSpace rawHeads return (heads, aligns, indices) -gridTableRawLine :: [Int] -> GenParser Char ParserState [String] +gridTableRawLine :: [Int] -> Parsec [Char] ParserState [String] gridTableRawLine indices = do char '|' line <- many1Till anyChar newline return (gridTableSplitLine indices line) -- | Parse row of grid table. -gridTableRow :: GenParser Char ParserState Block +gridTableRow :: Parsec [Char] ParserState Block -> [Int] - -> GenParser Char ParserState [[Block]] + -> Parsec [Char] ParserState [[Block]] gridTableRow block indices = do colLines <- many1 (gridTableRawLine indices) let cols = map ((++ "\n") . unlines . removeOneLeadingSpace) $ @@ -698,13 +753,13 @@ compactifyCell :: [Block] -> [Block] compactifyCell bs = head $ compactify [bs] -- | Parse footer for a grid table. -gridTableFooter :: GenParser Char ParserState [Char] +gridTableFooter :: Parsec [Char] ParserState [Char] gridTableFooter = blanklines --- -- | Parse a string with a given parser and state. -readWith :: GenParser t ParserState a -- ^ parser +readWith :: Parsec [t] ParserState a -- ^ parser -> ParserState -- ^ initial state -> [t] -- ^ input -> a @@ -714,7 +769,7 @@ readWith parser state input = Right result -> result -- | Parse a string with @parser@ (for testing). -testStringWith :: (Show a) => GenParser Char ParserState a +testStringWith :: (Show a) => Parsec [Char] ParserState a -> String -> IO () testStringWith parser str = UTF8.putStrLn $ show $ @@ -748,10 +803,14 @@ data ParserState = ParserState stateExamples :: M.Map String Int, -- ^ Map from example labels to numbers stateHasChapters :: Bool, -- ^ True if \chapter encountered stateApplyMacros :: Bool, -- ^ Apply LaTeX macros? - stateMacros :: [Macro] -- ^ List of macros defined so far + stateMacros :: [Macro], -- ^ List of macros defined so far + stateRstDefaultRole :: String -- ^ Current rST default interpreted text role } deriving Show +instance Default ParserState where + def = defaultParserState + defaultParserState :: ParserState defaultParserState = ParserState { stateParseRaw = False, @@ -778,7 +837,8 @@ defaultParserState = stateExamples = M.empty, stateHasChapters = False, stateApplyMacros = True, - stateMacros = []} + stateMacros = [], + stateRstDefaultRole = "title-reference"} data HeaderType = SingleHeader Char -- ^ Single line of characters underneath @@ -824,25 +884,25 @@ lookupKeySrc table key = case M.lookup key table of Just src -> Just src -- | Fail unless we're in "smart typography" mode. -failUnlessSmart :: GenParser tok ParserState () +failUnlessSmart :: Parsec [tok] ParserState () failUnlessSmart = getState >>= guard . stateSmart -smartPunctuation :: GenParser Char ParserState Inline - -> GenParser Char ParserState Inline +smartPunctuation :: Parsec [Char] ParserState Inline + -> Parsec [Char] ParserState Inline smartPunctuation inlineParser = do failUnlessSmart choice [ quoted inlineParser, apostrophe, dash, ellipses ] -apostrophe :: GenParser Char ParserState Inline +apostrophe :: Parsec [Char] ParserState Inline apostrophe = (char '\'' <|> char '\8217') >> return (Str "\x2019") -quoted :: GenParser Char ParserState Inline - -> GenParser Char ParserState Inline +quoted :: Parsec [Char] ParserState Inline + -> Parsec [Char] ParserState Inline quoted inlineParser = doubleQuoted inlineParser <|> singleQuoted inlineParser withQuoteContext :: QuoteContext - -> (GenParser Char ParserState Inline) - -> GenParser Char ParserState Inline + -> (Parsec [Char] ParserState Inline) + -> Parsec [Char] ParserState Inline withQuoteContext context parser = do oldState <- getState let oldQuoteContext = stateQuoteContext oldState @@ -852,35 +912,39 @@ withQuoteContext context parser = do setState newState { stateQuoteContext = oldQuoteContext } return result -singleQuoted :: GenParser Char ParserState Inline - -> GenParser Char ParserState Inline +singleQuoted :: Parsec [Char] ParserState Inline + -> Parsec [Char] ParserState Inline singleQuoted inlineParser = try $ do singleQuoteStart withQuoteContext InSingleQuote $ many1Till inlineParser singleQuoteEnd >>= return . Quoted SingleQuote . normalizeSpaces -doubleQuoted :: GenParser Char ParserState Inline - -> GenParser Char ParserState Inline +doubleQuoted :: Parsec [Char] ParserState Inline + -> Parsec [Char] ParserState Inline doubleQuoted inlineParser = try $ do doubleQuoteStart withQuoteContext InDoubleQuote $ do contents <- manyTill inlineParser doubleQuoteEnd return . Quoted DoubleQuote . normalizeSpaces $ contents -failIfInQuoteContext :: QuoteContext -> GenParser tok ParserState () +failIfInQuoteContext :: QuoteContext -> Parsec [tok] ParserState () failIfInQuoteContext context = do st <- getState if stateQuoteContext st == context then fail "already inside quotes" else return () -charOrRef :: [Char] -> GenParser Char st Char +charOrRef :: [Char] -> Parsec [Char] st Char charOrRef cs = oneOf cs <|> try (do c <- characterReference guard (c `elem` cs) return c) -singleQuoteStart :: GenParser Char ParserState () +updateLastStrPos :: Parsec [Char] ParserState () +updateLastStrPos = getPosition >>= \p -> + updateState $ \s -> s{ stateLastStrPos = Just p } + +singleQuoteStart :: Parsec [Char] ParserState () singleQuoteStart = do failIfInQuoteContext InSingleQuote pos <- getPosition @@ -895,28 +959,28 @@ singleQuoteStart = do -- possess/contraction return () -singleQuoteEnd :: GenParser Char st () +singleQuoteEnd :: Parsec [Char] st () singleQuoteEnd = try $ do charOrRef "'\8217\146" notFollowedBy alphaNum -doubleQuoteStart :: GenParser Char ParserState () +doubleQuoteStart :: Parsec [Char] ParserState () doubleQuoteStart = do failIfInQuoteContext InDoubleQuote try $ do charOrRef "\"\8220\147" notFollowedBy (satisfy (\c -> c == ' ' || c == '\t' || c == '\n')) -doubleQuoteEnd :: GenParser Char st () +doubleQuoteEnd :: Parsec [Char] st () doubleQuoteEnd = do charOrRef "\"\8221\148" return () -ellipses :: GenParser Char st Inline +ellipses :: Parsec [Char] st Inline ellipses = do try (charOrRef "\8230\133") <|> try (string "..." >> return '…') return (Str "\8230") -dash :: GenParser Char ParserState Inline +dash :: Parsec [Char] ParserState Inline dash = do oldDashes <- stateOldDashes `fmap` getState if oldDashes @@ -924,28 +988,28 @@ dash = do else Str `fmap` (hyphenDash <|> emDash <|> enDash) -- Two hyphens = en-dash, three = em-dash -hyphenDash :: GenParser Char st String +hyphenDash :: Parsec [Char] st String hyphenDash = do try $ string "--" option "\8211" (char '-' >> return "\8212") -emDash :: GenParser Char st String +emDash :: Parsec [Char] st String emDash = do try (charOrRef "\8212\151") return "\8212" -enDash :: GenParser Char st String +enDash :: Parsec [Char] st String enDash = do try (charOrRef "\8212\151") return "\8211" -enDashOld :: GenParser Char st Inline +enDashOld :: Parsec [Char] st Inline enDashOld = do try (charOrRef "\8211\150") <|> try (char '-' >> lookAhead (satisfy isDigit) >> return '–') return (Str "\8211") -emDashOld :: GenParser Char st Inline +emDashOld :: Parsec [Char] st Inline emDashOld = do try (charOrRef "\8212\151") <|> (try $ string "--" >> optional (char '-') >> return '-') return (Str "\8212") @@ -955,19 +1019,22 @@ emDashOld = do -- -- | Parse a \newcommand or \renewcommand macro definition. -macro :: GenParser Char ParserState Block +macro :: Parsec [Char] ParserState Block macro = do - getState >>= guard . stateApplyMacros + apply <- stateApplyMacros `fmap` getState inp <- getInput case parseMacroDefinitions inp of - ([], _) -> pzero - (ms, rest) -> do count (length inp - length rest) anyChar - updateState $ \st -> - st { stateMacros = ms ++ stateMacros st } - return Null + ([], _) -> mzero + (ms, rest) -> do def' <- count (length inp - length rest) anyChar + if apply + then do + updateState $ \st -> + st { stateMacros = ms ++ stateMacros st } + return Null + else return $ RawBlock "latex" def' -- | Apply current macros to string. -applyMacros' :: String -> GenParser Char ParserState String +applyMacros' :: String -> Parsec [Char] ParserState String applyMacros' target = do apply <- liftM stateApplyMacros getState if apply diff --git a/src/Text/Pandoc/Pretty.hs b/src/Text/Pandoc/Pretty.hs index bf78b2594..0372dbe5d 100644 --- a/src/Text/Pandoc/Pretty.hs +++ b/src/Text/Pandoc/Pretty.hs @@ -510,7 +510,9 @@ charWidth c = | c >= '\xFE10' && c <= '\xFE19' -> 2 | c >= '\xFE20' && c <= '\xFE26' -> 1 | c >= '\xFE30' && c <= '\xFE6B' -> 2 - | c >= '\xFE70' && c <= '\x16A38' -> 1 + | c >= '\xFE70' && c <= '\xFEFF' -> 1 + | c >= '\xFF01' && c <= '\xFF60' -> 2 + | c >= '\xFF61' && c <= '\x16A38' -> 1 | c >= '\x1B000' && c <= '\x1B001' -> 2 | c >= '\x1D000' && c <= '\x1F1FF' -> 1 | c >= '\x1F200' && c <= '\x1F251' -> 2 diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs new file mode 100644 index 000000000..62f7c61a0 --- /dev/null +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -0,0 +1,904 @@ +module Text.Pandoc.Readers.DocBook ( readDocBook ) where +import Data.Char (toUpper, isDigit) +import Text.Pandoc.Parsing (ParserState(..)) +import Text.Pandoc.Definition +import Text.Pandoc.Builder +import Text.XML.Light +import Text.HTML.TagSoup.Entity (lookupEntity) +import Data.Generics +import Data.Monoid +import Data.Char (isSpace) +import Control.Monad.State +import Control.Applicative ((<$>)) +import Data.List (intersperse) + +{- + +List of all DocBook tags, with [x] indicating implemented, +[o] meaning intentionally left unimplemented (pass through): + +[o] abbrev - An abbreviation, especially one followed by a period +[x] abstract - A summary +[o] accel - A graphical user interface (GUI) keyboard shortcut +[x] ackno - Acknowledgements in an Article +[o] acronym - An often pronounceable word made from the initial +[o] action - A response to a user event +[o] address - A real-world address, generally a postal address +[ ] affiliation - The institutional affiliation of an individual +[ ] alt - Text representation for a graphical element +[o] anchor - A spot in the document +[x] answer - An answer to a question posed in a QandASet +[x] appendix - An appendix in a Book or Article +[x] appendixinfo - Meta-information for an Appendix +[o] application - The name of a software program +[x] area - A region defined for a Callout in a graphic or code example +[x] areaset - A set of related areas in a graphic or code example +[x] areaspec - A collection of regions in a graphic or code example +[ ] arg - An argument in a CmdSynopsis +[x] article - An article +[x] articleinfo - Meta-information for an Article +[ ] artpagenums - The page numbers of an article as published +[x] attribution - The source of a block quote or epigraph +[ ] audiodata - Pointer to external audio data +[ ] audioobject - A wrapper for audio data and its associated meta-information +[x] author - The name of an individual author +[ ] authorblurb - A short description or note about an author +[ ] authorgroup - Wrapper for author information when a document has + multiple authors or collabarators +[x] authorinitials - The initials or other short identifier for an author +[o] beginpage - The location of a page break in a print version of the document +[ ] bibliocoverage - The spatial or temporal coverage of a document +[x] bibliodiv - A section of a Bibliography +[x] biblioentry - An entry in a Bibliography +[x] bibliography - A bibliography +[ ] bibliographyinfo - Meta-information for a Bibliography +[ ] biblioid - An identifier for a document +[o] bibliolist - A wrapper for a set of bibliography entries +[ ] bibliomisc - Untyped bibliographic information +[x] bibliomixed - An entry in a Bibliography +[ ] bibliomset - A cooked container for related bibliographic information +[ ] biblioref - A cross reference to a bibliographic entry +[ ] bibliorelation - The relationship of a document to another +[ ] biblioset - A raw container for related bibliographic information +[ ] bibliosource - The source of a document +[ ] blockinfo - Meta-information for a block element +[x] blockquote - A quotation set off from the main text +[x] book - A book +[x] bookinfo - Meta-information for a Book +[x] bridgehead - A free-floating heading +[ ] callout - A “called out” description of a marked Area +[ ] calloutlist - A list of Callouts +[x] caption - A caption +[x] caution - A note of caution +[x] chapter - A chapter, as of a book +[x] chapterinfo - Meta-information for a Chapter +[ ] citation - An inline bibliographic reference to another published work +[ ] citebiblioid - A citation of a bibliographic identifier +[ ] citerefentry - A citation to a reference page +[ ] citetitle - The title of a cited work +[ ] city - The name of a city in an address +[ ] classname - The name of a class, in the object-oriented programming sense +[ ] classsynopsis - The syntax summary for a class definition +[ ] classsynopsisinfo - Information supplementing the contents of + a ClassSynopsis +[ ] cmdsynopsis - A syntax summary for a software command +[ ] co - The location of a callout embedded in text +[x] code - An inline code fragment +[x] col - Specifications for a column in an HTML table +[x] colgroup - A group of columns in an HTML table +[ ] collab - Identifies a collaborator +[ ] collabname - The name of a collaborator +[ ] colophon - Text at the back of a book describing facts about its production +[x] colspec - Specifications for a column in a table +[x] command - The name of an executable program or other software command +[x] computeroutput - Data, generally text, displayed or presented by a computer +[ ] confdates - The dates of a conference for which a document was written +[ ] confgroup - A wrapper for document meta-information about a conference +[ ] confnum - An identifier, frequently numerical, associated with a conference for which a document was written +[ ] confsponsor - The sponsor of a conference for which a document was written +[ ] conftitle - The title of a conference for which a document was written +[x] constant - A programming or system constant +[ ] constraint - A constraint in an EBNF production +[ ] constraintdef - The definition of a constraint in an EBNF production +[ ] constructorsynopsis - A syntax summary for a constructor +[ ] contractnum - The contract number of a document +[ ] contractsponsor - The sponsor of a contract +[ ] contrib - A summary of the contributions made to a document by a + credited source +[ ] copyright - Copyright information about a document +[ ] coref - A cross reference to a co +[ ] corpauthor - A corporate author, as opposed to an individual +[ ] corpcredit - A corporation or organization credited in a document +[ ] corpname - The name of a corporation +[ ] country - The name of a country +[ ] database - The name of a database, or part of a database +[x] date - The date of publication or revision of a document +[ ] dedication - A wrapper for the dedication section of a book +[ ] destructorsynopsis - A syntax summary for a destructor +[ ] edition - The name or number of an edition of a document +[ ] editor - The name of the editor of a document +[x] email - An email address +[x] emphasis - Emphasized text +[x] entry - A cell in a table +[ ] entrytbl - A subtable appearing in place of an Entry in a table +[ ] envar - A software environment variable +[x] epigraph - A short inscription at the beginning of a document or component + note: also handle embedded attribution tag +[ ] equation - A displayed mathematical equation +[ ] errorcode - An error code +[ ] errorname - An error name +[ ] errortext - An error message. +[ ] errortype - The classification of an error message +[ ] example - A formal example, with a title +[ ] exceptionname - The name of an exception +[ ] fax - A fax number +[ ] fieldsynopsis - The name of a field in a class definition +[ ] figure - A formal figure, generally an illustration, with a title +[x] filename - The name of a file +[ ] firstname - The first name of a person +[ ] firstterm - The first occurrence of a term +[x] footnote - A footnote +[ ] footnoteref - A cross reference to a footnote (a footnote mark) +[x] foreignphrase - A word or phrase in a language other than the primary + language of the document +[x] formalpara - A paragraph with a title +[ ] funcdef - A function (subroutine) name and its return type +[ ] funcparams - Parameters for a function referenced through a function + pointer in a synopsis +[ ] funcprototype - The prototype of a function +[ ] funcsynopsis - The syntax summary for a function definition +[ ] funcsynopsisinfo - Information supplementing the FuncDefs of a FuncSynopsis +[x] function - The name of a function or subroutine, as in a + programming language +[x] glossary - A glossary +[x] glossaryinfo - Meta-information for a Glossary +[x] glossdef - A definition in a GlossEntry +[x] glossdiv - A division in a Glossary +[x] glossentry - An entry in a Glossary or GlossList +[x] glosslist - A wrapper for a set of GlossEntrys +[x] glosssee - A cross-reference from one GlossEntry to another +[x] glossseealso - A cross-reference from one GlossEntry to another +[x] glossterm - A glossary term +[ ] graphic - A displayed graphical object (not an inline) +[ ] graphicco - A graphic that contains callout areas +[ ] group - A group of elements in a CmdSynopsis +[ ] guibutton - The text on a button in a GUI +[ ] guiicon - Graphic and/or text appearing as a icon in a GUI +[ ] guilabel - The text of a label in a GUI +[ ] guimenu - The name of a menu in a GUI +[ ] guimenuitem - The name of a terminal menu item in a GUI +[ ] guisubmenu - The name of a submenu in a GUI +[ ] hardware - A physical part of a computer system +[ ] highlights - A summary of the main points of the discussed component +[ ] holder - The name of the individual or organization that holds a copyright +[o] honorific - The title of a person +[ ] html:form - An HTML form +[ ] imagedata - Pointer to external image data +[ ] imageobject - A wrapper for image data and its associated meta-information +[ ] imageobjectco - A wrapper for an image object with callouts +[x] important - An admonition set off from the text +[x] index - An index +[x] indexdiv - A division in an index +[x] indexentry - An entry in an index +[x] indexinfo - Meta-information for an Index +[x] indexterm - A wrapper for terms to be indexed +[x] info - A wrapper for information about a component or other block. (DocBook v5) +[ ] informalequation - A displayed mathematical equation without a title +[ ] informalexample - A displayed example without a title +[ ] informalfigure - A untitled figure +[ ] informaltable - A table without a title +[ ] initializer - The initializer for a FieldSynopsis +[ ] inlineequation - A mathematical equation or expression occurring inline +[ ] inlinegraphic - An object containing or pointing to graphical data + that will be rendered inline +[x] inlinemediaobject - An inline media object (video, audio, image, and so on) +[ ] interface - An element of a GUI +[ ] interfacename - The name of an interface +[ ] invpartnumber - An inventory part number +[ ] isbn - The International Standard Book Number of a document +[ ] issn - The International Standard Serial Number of a periodical +[ ] issuenum - The number of an issue of a journal +[x] itemizedlist - A list in which each entry is marked with a bullet or + other dingbat +[ ] itermset - A set of index terms in the meta-information of a document +[ ] jobtitle - The title of an individual in an organization +[ ] keycap - The text printed on a key on a keyboard +[ ] keycode - The internal, frequently numeric, identifier for a key + on a keyboard +[ ] keycombo - A combination of input actions +[ ] keysym - The symbolic name of a key on a keyboard +[ ] keyword - One of a set of keywords describing the content of a document +[ ] keywordset - A set of keywords describing the content of a document +[ ] label - A label on a Question or Answer +[ ] legalnotice - A statement of legal obligations or requirements +[ ] lhs - The left-hand side of an EBNF production +[ ] lineage - The portion of a person's name indicating a relationship to + ancestors +[ ] lineannotation - A comment on a line in a verbatim listing +[x] link - A hypertext link +[x] listitem - A wrapper for the elements of a list item +[x] literal - Inline text that is some literal value +[x] literallayout - A block of text in which line breaks and white space are + to be reproduced faithfully +[ ] lot - A list of the titles of formal objects (as tables or figures) in + a document +[ ] lotentry - An entry in a list of titles +[ ] manvolnum - A reference volume number +[x] markup - A string of formatting markup in text that is to be + represented literally +[ ] mathphrase - A mathematical phrase, an expression that can be represented + with ordinary text and a small amount of markup +[ ] medialabel - A name that identifies the physical medium on which some + information resides +[x] mediaobject - A displayed media object (video, audio, image, etc.) +[ ] mediaobjectco - A media object that contains callouts +[x] member - An element of a simple list +[ ] menuchoice - A selection or series of selections from a menu +[ ] methodname - The name of a method +[ ] methodparam - Parameters to a method +[ ] methodsynopsis - A syntax summary for a method +[ ] mml:math - A MathML equation +[ ] modespec - Application-specific information necessary for the + completion of an OLink +[ ] modifier - Modifiers in a synopsis +[ ] mousebutton - The conventional name of a mouse button +[ ] msg - A message in a message set +[ ] msgaud - The audience to which a message in a message set is relevant +[ ] msgentry - A wrapper for an entry in a message set +[ ] msgexplan - Explanatory material relating to a message in a message set +[ ] msginfo - Information about a message in a message set +[ ] msglevel - The level of importance or severity of a message in a message set +[ ] msgmain - The primary component of a message in a message set +[ ] msgorig - The origin of a message in a message set +[ ] msgrel - A related component of a message in a message set +[ ] msgset - A detailed set of messages, usually error messages +[ ] msgsub - A subcomponent of a message in a message set +[ ] msgtext - The actual text of a message component in a message set +[ ] nonterminal - A non-terminal in an EBNF production +[x] note - A message set off from the text +[ ] objectinfo - Meta-information for an object +[ ] olink - A link that addresses its target indirectly, through an entity +[ ] ooclass - A class in an object-oriented programming language +[ ] ooexception - An exception in an object-oriented programming language +[ ] oointerface - An interface in an object-oriented programming language +[x] option - An option for a software command +[x] optional - Optional information +[x] orderedlist - A list in which each entry is marked with a sequentially + incremented label +[ ] orgdiv - A division of an organization +[ ] orgname - The name of an organization other than a corporation +[ ] otheraddr - Uncategorized information in address +[ ] othercredit - A person or entity, other than an author or editor, + credited in a document +[ ] othername - A component of a persons name that is not a first name, + surname, or lineage +[ ] package - A package +[ ] pagenums - The numbers of the pages in a book, for use in a bibliographic + entry +[x] para - A paragraph +[ ] paramdef - Information about a function parameter in a programming language +[x] parameter - A value or a symbolic reference to a value +[ ] part - A division in a book +[ ] partinfo - Meta-information for a Part +[ ] partintro - An introduction to the contents of a part +[ ] personblurb - A short description or note about a person +[ ] personname - The personal name of an individual +[ ] phone - A telephone number +[ ] phrase - A span of text +[ ] pob - A post office box in an address +[ ] postcode - A postal code in an address +[x] preface - Introductory matter preceding the first chapter of a book +[ ] prefaceinfo - Meta-information for a Preface +[ ] primary - The primary word or phrase under which an index term should be + sorted +[ ] primaryie - A primary term in an index entry, not in the text +[ ] printhistory - The printing history of a document +[ ] procedure - A list of operations to be performed in a well-defined sequence +[ ] production - A production in a set of EBNF productions +[ ] productionrecap - A cross-reference to an EBNF production +[ ] productionset - A set of EBNF productions +[ ] productname - The formal name of a product +[ ] productnumber - A number assigned to a product +[x] programlisting - A literal listing of all or part of a program +[ ] programlistingco - A program listing with associated areas used in callouts +[x] prompt - A character or string indicating the start of an input field in + a computer display +[ ] property - A unit of data associated with some part of a computer system +[ ] pubdate - The date of publication of a document +[ ] publisher - The publisher of a document +[ ] publishername - The name of the publisher of a document +[ ] pubsnumber - A number assigned to a publication other than an ISBN or ISSN + or inventory part number +[x] qandadiv - A titled division in a QandASet +[o] qandaentry - A question/answer set within a QandASet +[o] qandaset - A question-and-answer set +[x] question - A question in a QandASet +[x] quote - An inline quotation +[ ] refclass - The scope or other indication of applicability of a + reference entry +[ ] refdescriptor - A description of the topic of a reference page +[ ] refentry - A reference page (originally a UNIX man-style reference page) +[ ] refentryinfo - Meta-information for a Refentry +[ ] refentrytitle - The title of a reference page +[ ] reference - A collection of reference entries +[ ] referenceinfo - Meta-information for a Reference +[ ] refmeta - Meta-information for a reference entry +[ ] refmiscinfo - Meta-information for a reference entry other than the title + and volume number +[ ] refname - The name of (one of) the subject(s) of a reference page +[ ] refnamediv - The name, purpose, and classification of a reference page +[ ] refpurpose - A short (one sentence) synopsis of the topic of a reference + page +[x] refsect1 - A major subsection of a reference entry +[x] refsect1info - Meta-information for a RefSect1 +[x] refsect2 - A subsection of a RefSect1 +[x] refsect2info - Meta-information for a RefSect2 +[x] refsect3 - A subsection of a RefSect2 +[x] refsect3info - Meta-information for a RefSect3 +[x] refsection - A recursive section in a refentry +[x] refsectioninfo - Meta-information for a refsection +[ ] refsynopsisdiv - A syntactic synopsis of the subject of the reference page +[ ] refsynopsisdivinfo - Meta-information for a RefSynopsisDiv +[ ] releaseinfo - Information about a particular release of a document +[ ] remark - A remark (or comment) intended for presentation in a draft + manuscript +[ ] replaceable - Content that may or must be replaced by the user +[ ] returnvalue - The value returned by a function +[ ] revdescription - A extended description of a revision to a document +[ ] revhistory - A history of the revisions to a document +[ ] revision - An entry describing a single revision in the history of the + revisions to a document +[ ] revnumber - A document revision number +[ ] revremark - A description of a revision to a document +[ ] rhs - The right-hand side of an EBNF production +[x] row - A row in a table +[ ] sbr - An explicit line break in a command synopsis +[x] screen - Text that a user sees or might see on a computer screen +[o] screenco - A screen with associated areas used in callouts +[o] screeninfo - Information about how a screen shot was produced +[ ] screenshot - A representation of what the user sees or might see on a + computer screen +[ ] secondary - A secondary word or phrase in an index term +[ ] secondaryie - A secondary term in an index entry, rather than in the text +[x] sect1 - A top-level section of document +[x] sect1info - Meta-information for a Sect1 +[x] sect2 - A subsection within a Sect1 +[x] sect2info - Meta-information for a Sect2 +[x] sect3 - A subsection within a Sect2 +[x] sect3info - Meta-information for a Sect3 +[x] sect4 - A subsection within a Sect3 +[x] sect4info - Meta-information for a Sect4 +[x] sect5 - A subsection within a Sect4 +[x] sect5info - Meta-information for a Sect5 +[x] section - A recursive section +[x] sectioninfo - Meta-information for a recursive section +[x] see - Part of an index term directing the reader instead to another entry + in the index +[x] seealso - Part of an index term directing the reader also to another entry + in the index +[ ] seealsoie - A See also entry in an index, rather than in the text +[ ] seeie - A See entry in an index, rather than in the text +[x] seg - An element of a list item in a segmented list +[x] seglistitem - A list item in a segmented list +[x] segmentedlist - A segmented list, a list of sets of elements +[x] segtitle - The title of an element of a list item in a segmented list +[ ] seriesvolnums - Numbers of the volumes in a series of books +[ ] set - A collection of books +[ ] setindex - An index to a set of books +[ ] setindexinfo - Meta-information for a SetIndex +[ ] setinfo - Meta-information for a Set +[ ] sgmltag - A component of SGML markup +[ ] shortaffil - A brief description of an affiliation +[ ] shortcut - A key combination for an action that is also accessible through + a menu +[ ] sidebar - A portion of a document that is isolated from the main + narrative flow +[ ] sidebarinfo - Meta-information for a Sidebar +[x] simpara - A paragraph that contains only text and inline markup, no block + elements +[x] simplelist - An undecorated list of single words or short phrases +[ ] simplemsgentry - A wrapper for a simpler entry in a message set +[ ] simplesect - A section of a document with no subdivisions +[ ] spanspec - Formatting information for a spanned column in a table +[ ] state - A state or province in an address +[ ] step - A unit of action in a procedure +[ ] stepalternatives - Alternative steps in a procedure +[ ] street - A street address in an address +[ ] structfield - A field in a structure (in the programming language sense) +[ ] structname - The name of a structure (in the programming language sense) +[ ] subject - One of a group of terms describing the subject matter of a + document +[ ] subjectset - A set of terms describing the subject matter of a document +[ ] subjectterm - A term in a group of terms describing the subject matter of + a document +[x] subscript - A subscript (as in H2O, the molecular formula for water) +[ ] substeps - A wrapper for steps that occur within steps in a procedure +[x] subtitle - The subtitle of a document +[x] superscript - A superscript (as in x2, the mathematical notation for x + multiplied by itself) +[ ] surname - A family name; in western cultures the last name +[ ] svg:svg - An SVG graphic +[x] symbol - A name that is replaced by a value before processing +[ ] synopfragment - A portion of a CmdSynopsis broken out from the main body + of the synopsis +[ ] synopfragmentref - A reference to a fragment of a command synopsis +[ ] synopsis - A general-purpose element for representing the syntax of + commands or functions +[ ] systemitem - A system-related item or term +[ ] table - A formal table in a document +[ ] task - A task to be completed +[ ] taskprerequisites - The prerequisites for a task +[ ] taskrelated - Information related to a task +[ ] tasksummary - A summary of a task +[x] tbody - A wrapper for the rows of a table or informal table +[x] td - A table entry in an HTML table +[x] term - The word or phrase being defined or described in a variable list +[ ] termdef - An inline term definition +[ ] tertiary - A tertiary word or phrase in an index term +[ ] tertiaryie - A tertiary term in an index entry, rather than in the text +[ ] textdata - Pointer to external text data +[ ] textobject - A wrapper for a text description of an object and its + associated meta-information +[ ] tfoot - A table footer consisting of one or more rows +[x] tgroup - A wrapper for the main content of a table, or part of a table +[x] th - A table header entry in an HTML table +[x] thead - A table header consisting of one or more rows +[x] tip - A suggestion to the user, set off from the text +[x] title - The text of the title of a section of a document or of a formal + block-level element +[x] titleabbrev - The abbreviation of a Title +[x] toc - A table of contents +[x] tocback - An entry in a table of contents for a back matter component +[x] tocchap - An entry in a table of contents for a component in the body of + a document +[x] tocentry - A component title in a table of contents +[x] tocfront - An entry in a table of contents for a front matter component +[x] toclevel1 - A top-level entry within a table of contents entry for a + chapter-like component +[x] toclevel2 - A second-level entry within a table of contents entry for a + chapter-like component +[x] toclevel3 - A third-level entry within a table of contents entry for a + chapter-like component +[x] toclevel4 - A fourth-level entry within a table of contents entry for a + chapter-like component +[x] toclevel5 - A fifth-level entry within a table of contents entry for a + chapter-like component +[x] tocpart - An entry in a table of contents for a part of a book +[ ] token - A unit of information +[x] tr - A row in an HTML table +[ ] trademark - A trademark +[ ] type - The classification of a value +[x] ulink - A link that addresses its target by means of a URL + (Uniform Resource Locator) +[x] uri - A Uniform Resource Identifier +[x] userinput - Data entered by the user +[x] varargs - An empty element in a function synopsis indicating a variable + number of arguments +[x] variablelist - A list in which each entry is composed of a set of one or + more terms and an associated description +[x] varlistentry - A wrapper for a set of terms and the associated description + in a variable list +[x] varname - The name of a variable +[ ] videodata - Pointer to external video data +[ ] videoobject - A wrapper for video data and its associated meta-information +[ ] void - An empty element in a function synopsis indicating that the + function in question takes no arguments +[ ] volumenum - The volume number of a document in a set (as of books in a set + or articles in a journal) +[x] warning - An admonition set off from the text +[x] wordasword - A word meant specifically as a word and not representing + anything else +[ ] xref - A cross reference to another part of the document +[ ] year - The year of publication of a document + +-} + +type DB = State DBState + +data DBState = DBState{ dbSectionLevel :: Int + , dbQuoteType :: QuoteType + , dbDocTitle :: Inlines + , dbDocAuthors :: [Inlines] + , dbDocDate :: Inlines + , dbBook :: Bool + } deriving Show + +readDocBook :: ParserState -> String -> Pandoc +readDocBook _ inp = setTitle (dbDocTitle st') + $ setAuthors (dbDocAuthors st') + $ setDate (dbDocDate st') + $ doc $ mconcat bs + where (bs, st') = runState (mapM parseBlock $ normalizeTree $ parseXML inp) + DBState{ dbSectionLevel = 0 + , dbQuoteType = DoubleQuote + , dbDocTitle = mempty + , dbDocAuthors = [] + , dbDocDate = mempty + , dbBook = False + } + +-- normalize input, consolidating adjacent Text and CRef elements +normalizeTree :: [Content] -> [Content] +normalizeTree = everywhere (mkT go) + where go :: [Content] -> [Content] + go (Text (CData CDataRaw _ _):xs) = xs + go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = + Text (CData CDataText (s1 ++ s2) z):xs + go (Text (CData CDataText s1 z):CRef r:xs) = + Text (CData CDataText (s1 ++ convertEntity r) z):xs + go (CRef r:Text (CData CDataText s1 z):xs) = + Text (CData CDataText (convertEntity r ++ s1) z):xs + go (CRef r1:CRef r2:xs) = + Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs + go xs = xs + +convertEntity :: String -> String +convertEntity e = maybe (map toUpper e) (:[]) (lookupEntity e) + +-- convenience function to get an attribute value, defaulting to "" +attrValue :: String -> Element -> String +attrValue attr elt = + case lookupAttrBy (\x -> qName x == attr) (elAttribs elt) of + Just z -> z + Nothing -> "" + +-- convenience function +named :: String -> Element -> Bool +named s e = qName (elName e) == s + +isBlockElement :: Content -> Bool +isBlockElement (Elem e) = qName (elName e) `elem` blocktags + where blocktags = ["toc","index","para","formalpara","simpara", + "ackno","epigraph","blockquote","bibliography","bibliodiv", + "biblioentry","glossee","glosseealso","glossary", + "glossdiv","glosslist","chapter","appendix","preface", + "bridgehead","sect1","sect2","sect3","sect4","sect5","section", + "refsect1","refsect2","refsect3","refsection", + "important","caution","note","tip","warning","qandadiv", + "question","answer","abstract","itemizedlist","orderedlist", + "variablelist","article","book","table","informaltable", + "screen","programlisting","example"] +isBlockElement _ = False + +-- Trim leading and trailing newline characters +trimNl :: String -> String +trimNl = reverse . go . reverse . go + where go ('\n':xs) = xs + go xs = xs + +-- meld text into beginning of first paragraph of Blocks. +-- assumes Blocks start with a Para; if not, does nothing. +addToStart :: Inlines -> Blocks -> Blocks +addToStart toadd bs = + case toList bs of + (Para xs : rest) -> para (toadd <> fromList xs) <> fromList rest + _ -> bs + +-- function that is used by both mediaobject (in parseBlock) +-- and inlinemediaobject (in parseInline) +getImage :: Element -> DB Inlines +getImage e = do + imageUrl <- case filterChild (named "imageobject") e of + Nothing -> return mempty + Just z -> case filterChild (named "imagedata") z of + Nothing -> return mempty + Just i -> return $ attrValue "fileref" i + caption <- case filterChild + (\x -> named "caption" x || named "textobject" x) e of + Nothing -> return mempty + Just z -> mconcat <$> (mapM parseInline $ elContent z) + return $ image imageUrl "" caption + +parseBlock :: Content -> DB Blocks +parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE +parseBlock (Text (CData _ s _)) = if all isSpace s + then return mempty + else return $ plain $ trimInlines $ text s +parseBlock (CRef x) = return $ plain $ str $ map toUpper x +parseBlock (Elem e) = + case qName (elName e) of + "toc" -> return mempty -- skip TOC, since in pandoc it's autogenerated + "index" -> return mempty -- skip index, since page numbers meaningless + "para" -> parseMixed para (elContent e) + "formalpara" -> do + tit <- case filterChild (named "title") e of + Just t -> (<> str "." <> linebreak) <$> emph + <$> getInlines t + Nothing -> return mempty + addToStart tit <$> parseMixed para (elContent e) + "simpara" -> parseMixed para (elContent e) + "ackno" -> parseMixed para (elContent e) + "epigraph" -> parseBlockquote + "blockquote" -> parseBlockquote + "attribution" -> return mempty + "titleabbrev" -> return mempty + "authorinitials" -> return mempty + "title" -> return mempty -- handled by getTitle or sect + "bibliography" -> sect 0 + "bibliodiv" -> sect 1 + "biblioentry" -> parseMixed para (elContent e) + "bibliomixed" -> parseMixed para (elContent e) + "glosssee" -> para . (\ils -> text "See " <> ils <> str ".") + <$> getInlines e + "glossseealso" -> para . (\ils -> text "See also " <> ils <> str ".") + <$> getInlines e + "glossary" -> sect 0 + "glossdiv" -> definitionList <$> + mapM parseGlossEntry (filterChildren (named "glossentry") e) + "glosslist" -> definitionList <$> + mapM parseGlossEntry (filterChildren (named "glossentry") e) + "chapter" -> sect 0 + "appendix" -> sect 0 + "preface" -> sect 0 + "bridgehead" -> para . strong <$> getInlines e + "sect1" -> sect 1 + "sect2" -> sect 2 + "sect3" -> sect 3 + "sect4" -> sect 4 + "sect5" -> sect 5 + "section" -> gets dbSectionLevel >>= sect . (+1) + "refsect1" -> sect 1 + "refsect2" -> sect 2 + "refsect3" -> sect 3 + "refsection" -> gets dbSectionLevel >>= sect . (+1) + "important" -> blockQuote . (para (strong $ str "Important") <>) + <$> getBlocks e + "caution" -> blockQuote . (para (strong $ str "Caution") <>) + <$> getBlocks e + "note" -> blockQuote . (para (strong $ str "Note") <>) + <$> getBlocks e + "tip" -> blockQuote . (para (strong $ str "Tip") <>) + <$> getBlocks e + "warning" -> blockQuote . (para (strong $ str "Warning") <>) + <$> getBlocks e + "area" -> return mempty + "areaset" -> return mempty + "areaspec" -> return mempty + "qandadiv" -> gets dbSectionLevel >>= sect . (+1) + "question" -> addToStart (strong (str "Q:") <> str " ") <$> getBlocks e + "answer" -> addToStart (strong (str "A:") <> str " ") <$> getBlocks e + "abstract" -> blockQuote <$> getBlocks e + "itemizedlist" -> bulletList <$> listitems + "orderedlist" -> do + let listStyle = case attrValue "numeration" e of + "arabic" -> Decimal + "loweralpha" -> LowerAlpha + "upperalpha" -> UpperAlpha + "lowerroman" -> LowerRoman + "upperroman" -> UpperRoman + _ -> Decimal + let start = case attrValue "override" <$> + filterElement (named "listitem") e of + Just x@(_:_) | all isDigit x -> read x + _ -> 1 + orderedListWith (start,listStyle,DefaultDelim) + <$> listitems + "variablelist" -> definitionList <$> deflistitems + "mediaobject" -> para <$> (getImage e) + "caption" -> return mempty + "info" -> getTitle >> getAuthors >> getDate >> return mempty + "articleinfo" -> getTitle >> getAuthors >> getDate >> return mempty + "sectioninfo" -> return mempty -- keywords & other metadata + "refsectioninfo" -> return mempty -- keywords & other metadata + "refsect1info" -> return mempty -- keywords & other metadata + "refsect2info" -> return mempty -- keywords & other metadata + "refsect3info" -> return mempty -- keywords & other metadata + "sect1info" -> return mempty -- keywords & other metadata + "sect2info" -> return mempty -- keywords & other metadata + "sect3info" -> return mempty -- keywords & other metadata + "sect4info" -> return mempty -- keywords & other metadata + "sect5info" -> return mempty -- keywords & other metadata + "chapterinfo" -> return mempty -- keywords & other metadata + "glossaryinfo" -> return mempty -- keywords & other metadata + "appendixinfo" -> return mempty -- keywords & other metadata + "bookinfo" -> getTitle >> getAuthors >> getDate >> return mempty + "article" -> modify (\st -> st{ dbBook = False }) >> + getTitle >> getBlocks e + "book" -> modify (\st -> st{ dbBook = True }) >> getTitle >> getBlocks e + "table" -> parseTable + "informaltable" -> parseTable + "literallayout" -> codeBlockWithLang + "screen" -> codeBlockWithLang + "programlisting" -> codeBlockWithLang + "?xml" -> return mempty + _ -> getBlocks e + where getBlocks e' = mconcat <$> (mapM parseBlock $ elContent e') + parseMixed container conts = do + let (ils,rest) = break isBlockElement conts + ils' <- (trimInlines . mconcat) <$> mapM parseInline ils + let p = if ils' == mempty then mempty else container ils' + case rest of + [] -> return p + (r:rs) -> do + b <- parseBlock r + x <- parseMixed container rs + return $ p <> b <> x + codeBlockWithLang = do + let classes' = case attrValue "language" e of + "" -> [] + x -> [x] + return $ codeBlockWith (attrValue "id" e, classes', []) + $ trimNl $ strContent e + parseBlockquote = do + attrib <- case filterChild (named "attribution") e of + Nothing -> return mempty + Just z -> (para . (str "— " <>) . mconcat) + <$> (mapM parseInline $ elContent z) + contents <- getBlocks e + return $ blockQuote (contents <> attrib) + listitems = mapM getBlocks $ filterChildren (named "listitem") e + deflistitems = mapM parseVarListEntry $ filterChildren + (named "varlistentry") e + parseVarListEntry e' = do + let terms = filterChildren (named "term") e' + let items = filterChildren (named "listitem") e' + terms' <- mapM getInlines terms + items' <- mapM getBlocks items + return (mconcat $ intersperse (str "; ") terms', items') + parseGlossEntry e' = do + let terms = filterChildren (named "glossterm") e' + let items = filterChildren (named "glossdef") e' + terms' <- mapM getInlines terms + items' <- mapM getBlocks items + return (mconcat $ intersperse (str "; ") terms', items') + getTitle = case filterChild (named "title") e of + Just t -> do + tit <- getInlines t + subtit <- case filterChild (named "subtitle") e of + Just s -> (text ": " <>) <$> + getInlines s + Nothing -> return mempty + modify $ \st -> st{dbDocTitle = tit <> subtit} + Nothing -> return () + getAuthors = do + auths <- mapM getInlines + $ filterChildren (named "author") e + modify $ \st -> st{dbDocAuthors = auths} + getDate = case filterChild (named "date") e of + Just t -> do + dat <- getInlines t + modify $ \st -> st{dbDocDate = dat} + Nothing -> return () + parseTable = do + let isCaption x = named "title" x || named "caption" x + caption <- case filterChild isCaption e of + Just t -> getInlines t + Nothing -> return mempty + let e' = maybe e id $ filterChild (named "tgroup") e + let isColspec x = named "colspec" x || named "col" x + let colspecs = case filterChild (named "colgroup") e' of + Just c -> filterChildren isColspec c + _ -> filterChildren isColspec e' + let isRow x = named "row" x || named "tr" x + headrows <- case filterChild (named "thead") e' of + Just h -> case filterChild isRow h of + Just x -> parseRow x + Nothing -> return [] + Nothing -> return [] + bodyrows <- case filterChild (named "tbody") e' of + Just b -> mapM parseRow + $ filterChildren isRow b + Nothing -> mapM parseRow + $ filterChildren isRow e' + let toAlignment c = case findAttr (unqual "align") c of + Just "left" -> AlignLeft + Just "right" -> AlignRight + Just "center" -> AlignCenter + _ -> AlignDefault + let toWidth c = case findAttr (unqual "colwidth") c of + Just w -> read $ filter (\x -> + (x >= '0' && x <= '9') + || x == '.') w + Nothing -> 0 :: Double + let numrows = maximum $ map length bodyrows + let aligns = case colspecs of + [] -> replicate numrows AlignDefault + cs -> map toAlignment cs + let widths = case colspecs of + [] -> replicate numrows 0 + cs -> let ws = map toWidth cs + tot = sum ws + in if all (> 0) ws + then map (/ tot) ws + else replicate numrows 0 + let headrows' = if null headrows + then replicate numrows mempty + else headrows + return $ table caption (zip aligns widths) + headrows' bodyrows + isEntry x = named "entry" x || named "td" x || named "th" x + parseRow = mapM (parseMixed plain . elContent) . filterChildren isEntry + sect n = do isbook <- gets dbBook + let n' = if isbook || n == 0 then n + 1 else n + headerText <- case filterChild (named "title") e of + Just t -> getInlines t + Nothing -> return mempty + modify $ \st -> st{ dbSectionLevel = n } + b <- getBlocks e + modify $ \st -> st{ dbSectionLevel = n - 1 } + return $ header n' headerText <> b + +getInlines :: Element -> DB Inlines +getInlines e' = (trimInlines . mconcat) <$> (mapM parseInline $ elContent e') + +parseInline :: Content -> DB Inlines +parseInline (Text (CData _ s _)) = return $ text s +parseInline (CRef ref) = + return $ maybe (text $ map toUpper ref) (text . (:[])) $ lookupEntity ref +parseInline (Elem e) = + case qName (elName e) of + "subscript" -> subscript <$> innerInlines + "superscript" -> superscript <$> innerInlines + "inlinemediaobject" -> getImage e + "quote" -> do + qt <- gets dbQuoteType + let qt' = if qt == SingleQuote then DoubleQuote else SingleQuote + modify $ \st -> st{ dbQuoteType = qt' } + contents <- innerInlines + modify $ \st -> st{ dbQuoteType = qt } + return $ if qt == SingleQuote + then singleQuoted contents + else doubleQuoted contents + "simplelist" -> simpleList + "segmentedlist" -> segmentedList + "code" -> codeWithLang + "filename" -> codeWithLang + "literal" -> codeWithLang + "computeroutput" -> codeWithLang + "prompt" -> codeWithLang + "parameter" -> codeWithLang + "option" -> codeWithLang + "optional" -> do x <- getInlines e + return $ str "[" <> x <> str "]" + "markup" -> codeWithLang + "wordasword" -> emph <$> innerInlines + "command" -> codeWithLang + "varname" -> codeWithLang + "function" -> codeWithLang + "type" -> codeWithLang + "symbol" -> codeWithLang + "constant" -> codeWithLang + "userinput" -> codeWithLang + "varargs" -> return $ code "(...)" + "xref" -> return $ str "?" -- so at least you know something is there + "email" -> return $ link ("mailto:" ++ strContent e) "" + $ code $ strContent e + "uri" -> return $ link (strContent e) "" $ code $ strContent e + "ulink" -> link (attrValue "url" e) "" <$> innerInlines + "link" -> do + ils <- innerInlines + let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of + Just h -> h + _ -> ('#' : attrValue "linkend" e) + let ils' = if ils == mempty then code href else ils + return $ link href "" ils' + "foreignphrase" -> emph <$> innerInlines + "emphasis" -> case attrValue "role" e of + "bold" -> strong <$> innerInlines + "strong" -> strong <$> innerInlines + "strikethrough" -> strikeout <$> innerInlines + _ -> emph <$> innerInlines + "footnote" -> (note . mconcat) <$> (mapM parseBlock $ elContent e) + "title" -> return mempty + _ -> innerInlines + where innerInlines = (trimInlines . mconcat) <$> + (mapM parseInline $ elContent e) + codeWithLang = do + let classes' = case attrValue "language" e of + "" -> [] + l -> [l] + return $ codeWith (attrValue "id" e,classes',[]) $ strContent e + simpleList = (mconcat . intersperse (str "," <> space)) <$> mapM getInlines + (filterChildren (named "member") e) + segmentedList = do + tit <- maybe (return mempty) getInlines $ filterChild (named "title") e + segtits <- mapM getInlines $ filterChildren (named "segtitle") e + segitems <- mapM (mapM getInlines . filterChildren (named "seg")) + $ filterChildren (named "seglistitem") e + let toSeg = mconcat . zipWith (\x y -> strong (x <> str ":") <> space <> + y <> linebreak) segtits + let segs = mconcat $ map toSeg segitems + let tit' = if tit == mempty + then mempty + else strong tit <> linebreak + return $ linebreak <> tit' <> segs diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 0c017b2e4..d76524e14 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -36,8 +36,6 @@ module Text.Pandoc.Readers.HTML ( readHtml , isCommentTag ) where -import Text.ParserCombinators.Parsec -import Text.ParserCombinators.Parsec.Pos import Text.HTML.TagSoup import Text.HTML.TagSoup.Match import Text.Pandoc.Definition @@ -46,8 +44,14 @@ import Text.Pandoc.Shared import Text.Pandoc.Parsing import Data.Maybe ( fromMaybe, isJust ) import Data.List ( intercalate ) -import Data.Char ( isSpace, isDigit, toLower ) -import Control.Monad ( liftM, guard, when ) +import Data.Char ( isDigit, toLower ) +import Control.Monad ( liftM, guard, when, mzero ) + +isSpace :: Char -> Bool +isSpace ' ' = True +isSpace '\t' = True +isSpace '\n' = True +isSpace _ = False -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ParserState -- ^ Parser state @@ -62,7 +66,7 @@ readHtml st inp = Pandoc meta blocks then parseHeader tags else (Meta [] [] [], tags) -type TagParser = GenParser (Tag String) ParserState +type TagParser = Parser [Tag String] ParserState -- TODO - fix this - not every header has a title tag parseHeader :: [Tag String] -> (Meta, [Tag String]) @@ -222,6 +226,8 @@ pSimpleTable :: TagParser [Block] pSimpleTable = try $ do TagOpen _ _ <- pSatisfy (~== TagOpen "table" []) skipMany pBlank + caption <- option [] $ pInTags "caption" inline >>~ skipMany pBlank + skipMany $ pInTags "col" block >> skipMany pBlank head' <- option [] $ pOptInTag "thead" $ pInTags "tr" (pCell "th") skipMany pBlank rows <- pOptInTag "tbody" @@ -231,7 +237,7 @@ pSimpleTable = try $ do let cols = maximum $ map length rows let aligns = replicate cols AlignLeft let widths = replicate cols 0 - return [Table [] aligns widths head' rows] + return [Table caption aligns widths head' rows] pCell :: String -> TagParser [TableCell] pCell celltype = try $ do @@ -409,7 +415,7 @@ pCloses tagtype = try $ do (TagClose "ul") | tagtype == "li" -> return () (TagClose "ol") | tagtype == "li" -> return () (TagClose "dl") | tagtype == "li" -> return () - _ -> pzero + _ -> mzero pTagText :: TagParser [Inline] pTagText = try $ do @@ -424,11 +430,11 @@ pBlank = try $ do (TagText str) <- pSatisfy isTagText guard $ all isSpace str -pTagContents :: GenParser Char ParserState Inline +pTagContents :: Parser [Char] ParserState Inline pTagContents = pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad -pStr :: GenParser Char ParserState Inline +pStr :: Parser [Char] ParserState Inline pStr = do result <- many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) && not (isBad c) @@ -447,13 +453,13 @@ isSpecial '\8220' = True isSpecial '\8221' = True isSpecial _ = False -pSymbol :: GenParser Char ParserState Inline +pSymbol :: Parser [Char] ParserState Inline pSymbol = satisfy isSpecial >>= return . Str . (:[]) isBad :: Char -> Bool isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML -pBad :: GenParser Char ParserState Inline +pBad :: Parser [Char] ParserState Inline pBad = do c <- satisfy isBad let c' = case c of @@ -487,7 +493,7 @@ pBad = do _ -> '?' return $ Str [c'] -pSpace :: GenParser Char ParserState Inline +pSpace :: Parser [Char] ParserState Inline pSpace = many1 (satisfy isSpace) >> return Space -- @@ -585,7 +591,7 @@ _ `closes` _ = False --- parsers for use in markdown, textile readers -- | Matches a stretch of HTML in balanced tags. -htmlInBalanced :: (Tag String -> Bool) -> GenParser Char ParserState String +htmlInBalanced :: (Tag String -> Bool) -> Parser [Char] ParserState String htmlInBalanced f = try $ do (TagOpen t _, tag) <- htmlTag f guard $ '/' `notElem` tag -- not a self-closing tag @@ -598,7 +604,7 @@ htmlInBalanced f = try $ do return $ tag ++ concat contents ++ endtag -- | Matches a tag meeting a certain condition. -htmlTag :: (Tag String -> Bool) -> GenParser Char ParserState (Tag String, String) +htmlTag :: (Tag String -> Bool) -> Parser [Char] ParserState (Tag String, String) htmlTag f = try $ do lookAhead (char '<') (next : _) <- getInput >>= return . canonicalizeTags . parseTags diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 279f90318..351e1fef5 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -33,10 +33,9 @@ module Text.Pandoc.Readers.LaTeX ( readLaTeX, handleIncludes ) where -import Text.ParserCombinators.Parsec hiding ((<|>), space, many, optional) import Text.Pandoc.Definition import Text.Pandoc.Shared -import Text.Pandoc.Parsing +import Text.Pandoc.Parsing hiding ((<|>), many, optional, space) import qualified Text.Pandoc.UTF8 as UTF8 import Data.Char ( chr, ord ) import Control.Monad @@ -64,7 +63,7 @@ parseLaTeX = do let date' = stateDate st return $ Pandoc (Meta title' authors' date') $ toList bs -type LP = GenParser Char ParserState +type LP = Parser [Char] ParserState anyControlSeq :: LP String anyControlSeq = do @@ -82,9 +81,16 @@ controlSeq name = try $ do case name of "" -> mzero [c] | not (isLetter c) -> string [c] - cs -> string cs <* optional sp + cs -> string cs <* notFollowedBy letter <* optional sp return name +dimenarg :: LP String +dimenarg = try $ do + ch <- option "" $ string "=" + num <- many1 digit + dim <- oneOfStrings ["pt","pc","in","bp","cm","mm","dd","cc","sp"] + return $ ch ++ num ++ dim + sp :: LP () sp = skipMany1 $ satisfy (\c -> c == ' ' || c == '\t') <|> (try $ newline >>~ lookAhead anyChar >>~ notFollowedBy blankline) @@ -112,18 +118,28 @@ comment = do newline return () +bgroup :: LP () +bgroup = () <$ char '{' + <|> () <$ controlSeq "bgroup" + <|> () <$ controlSeq "begingroup" + +egroup :: LP () +egroup = () <$ char '}' + <|> () <$ controlSeq "egroup" + <|> () <$ controlSeq "endgroup" + grouped :: Monoid a => LP a -> LP a -grouped parser = try $ char '{' *> (mconcat <$> manyTill parser (char '}')) +grouped parser = try $ bgroup *> (mconcat <$> manyTill parser egroup) braced :: LP String -braced = char '{' *> (concat <$> manyTill +braced = bgroup *> (concat <$> manyTill ( many1 (satisfy (\c -> c /= '\\' && c /= '}' && c /= '{')) <|> try (string "\\}") <|> try (string "\\{") <|> try (string "\\\\") <|> ((\x -> "{" ++ x ++ "}") <$> braced) <|> count 1 anyChar - ) (char '}')) + ) egroup) bracketed :: Monoid a => LP a -> LP a bracketed parser = try $ char '[' *> (mconcat <$> manyTill parser (char ']')) @@ -181,7 +197,7 @@ inlines = mconcat <$> many (notFollowedBy (char '}') *> inline) block :: LP Blocks block = (mempty <$ comment) - <|> (mempty <$ ((spaceChar <|> blankline) *> spaces)) + <|> (mempty <$ ((spaceChar <|> newline) *> spaces)) <|> environment <|> mempty <$ macro -- TODO improve macros, make them work everywhere <|> blockCommand @@ -251,6 +267,7 @@ blockCommands = M.fromList $ , ("end", mzero) , ("item", skipopts *> loose_item) , ("documentclass", skipopts *> braced *> preamble) + , ("centerline", (para . trimInlines) <$> (skipopts *> tok)) ] ++ map ignoreBlocks -- these commands will be ignored unless --parse-raw is specified, -- in which case they will appear as raw latex blocks @@ -281,7 +298,9 @@ authors :: LP () authors = try $ do char '{' let oneAuthor = mconcat <$> - many1 (notFollowedBy' (controlSeq "and") >> inline) + many1 (notFollowedBy' (controlSeq "and") >> + (inline <|> mempty <$ blockCommand)) + -- skip e.g. \vspace{10pt} auths <- sepBy oneAuthor (controlSeq "and") char '}' updateState (\s -> s { stateAuthors = map (normalizeSpaces . toList) auths }) @@ -304,16 +323,19 @@ inlineCommand = try $ do parseRaw <- stateParseRaw `fmap` getState star <- option "" (string "*") let name' = name ++ star + let rawargs = withRaw (skipopts *> option "" dimenarg + *> many braced) >>= applyMacros' . snd + let raw = if parseRaw + then (rawInline "latex" . (('\\':name') ++)) <$> rawargs + else mempty <$> rawargs case M.lookup name' inlineCommands of - Just p -> p + Just p -> p <|> raw Nothing -> case M.lookup name inlineCommands of - Just p -> p - Nothing - | parseRaw -> - (rawInline "latex" . (('\\':name') ++)) <$> - (withRaw (skipopts *> many braced) - >>= applyMacros' . snd) - | otherwise -> return mempty + Just p -> p <|> raw + Nothing -> raw + +unlessParseRaw :: LP () +unlessParseRaw = getState >>= guard . not . stateParseRaw isBlockCommand :: String -> Bool isBlockCommand s = maybe False (const True) $ M.lookup s blockCommands @@ -333,8 +355,8 @@ inlineCommands = M.fromList $ , ("dots", lit "…") , ("mdots", lit "…") , ("sim", lit "~") - , ("label", inBrackets <$> tok) - , ("ref", inBrackets <$> tok) + , ("label", unlessParseRaw >> (inBrackets <$> tok)) + , ("ref", unlessParseRaw >> (inBrackets <$> tok)) , ("(", mathInline $ manyTill anyChar (try $ string "\\)")) , ("[", mathDisplay $ manyTill anyChar (try $ string "\\]")) , ("ensuremath", mathInline $ braced) @@ -358,8 +380,6 @@ inlineCommands = M.fromList $ , ("scshape", smallcaps <$> inlines) , ("bfseries", strong <$> inlines) , ("/", pure mempty) -- italic correction - , ("cc", lit "ç") - , ("cC", lit "Ç") , ("aa", lit "å") , ("AA", lit "Å") , ("ss", lit "ß") @@ -374,11 +394,12 @@ inlineCommands = M.fromList $ , ("copyright", lit "©") , ("`", option (str "`") $ try $ tok >>= accent grave) , ("'", option (str "'") $ try $ tok >>= accent acute) - , ("^", option (str "^") $ try $ tok >>= accent hat) - , ("~", option (str "~") $ try $ tok >>= accent circ) + , ("^", option (str "^") $ try $ tok >>= accent circ) + , ("~", option (str "~") $ try $ tok >>= accent tilde) , ("\"", option (str "\"") $ try $ tok >>= accent umlaut) , (".", option (str ".") $ try $ tok >>= accent dot) , ("=", option (str "=") $ try $ tok >>= accent macron) + , ("c", option (str "c") $ try $ tok >>= accent cedilla) , ("i", lit "i") , ("\\", linebreak <$ (optional (bracketed inline) *> optional sp)) , (",", pure mempty) @@ -502,33 +523,66 @@ acute 'E' = 'É' acute 'I' = 'Í' acute 'O' = 'Ó' acute 'U' = 'Ú' +acute 'Y' = 'Ý' acute 'a' = 'á' acute 'e' = 'é' acute 'i' = 'í' acute 'o' = 'ó' acute 'u' = 'ú' +acute 'y' = 'ý' +acute 'C' = 'Ć' +acute 'c' = 'ć' +acute 'L' = 'Ĺ' +acute 'l' = 'ĺ' +acute 'N' = 'Ń' +acute 'n' = 'ń' +acute 'R' = 'Ŕ' +acute 'r' = 'ŕ' +acute 'S' = 'Ś' +acute 's' = 'ś' +acute 'Z' = 'Ź' +acute 'z' = 'ź' acute c = c -hat :: Char -> Char -hat 'A' = 'Â' -hat 'E' = 'Ê' -hat 'I' = 'Î' -hat 'O' = 'Ô' -hat 'U' = 'Û' -hat 'a' = 'ã' -hat 'e' = 'ê' -hat 'i' = 'î' -hat 'o' = 'ô' -hat 'u' = 'û' -hat c = c - circ :: Char -> Char -circ 'A' = 'Ã' -circ 'O' = 'Õ' -circ 'o' = 'õ' -circ 'N' = 'Ñ' -circ 'n' = 'ñ' -circ c = c +circ 'A' = 'Â' +circ 'E' = 'Ê' +circ 'I' = 'Î' +circ 'O' = 'Ô' +circ 'U' = 'Û' +circ 'a' = 'â' +circ 'e' = 'ê' +circ 'i' = 'î' +circ 'o' = 'ô' +circ 'u' = 'û' +circ 'C' = 'Ĉ' +circ 'c' = 'ĉ' +circ 'G' = 'Ĝ' +circ 'g' = 'ĝ' +circ 'H' = 'Ĥ' +circ 'h' = 'ĥ' +circ 'J' = 'Ĵ' +circ 'j' = 'ĵ' +circ 'S' = 'Ŝ' +circ 's' = 'ŝ' +circ 'W' = 'Ŵ' +circ 'w' = 'ŵ' +circ 'Y' = 'Ŷ' +circ 'y' = 'ŷ' +circ c = c + +tilde :: Char -> Char +tilde 'A' = 'Ã' +tilde 'a' = 'ã' +tilde 'O' = 'Õ' +tilde 'o' = 'õ' +tilde 'I' = 'Ĩ' +tilde 'i' = 'ĩ' +tilde 'U' = 'Ũ' +tilde 'u' = 'ũ' +tilde 'N' = 'Ñ' +tilde 'n' = 'ñ' +tilde c = c umlaut :: Char -> Char umlaut 'A' = 'Ä' @@ -568,6 +622,13 @@ macron 'o' = 'ō' macron 'u' = 'ū' macron c = c +cedilla :: Char -> Char +cedilla 'c' = 'ç' +cedilla 'C' = 'Ç' +cedilla 's' = 'ş' +cedilla 'S' = 'Ş' +cedilla c = c + tok :: LP Inlines tok = try $ grouped inline <|> inlineCommand <|> str <$> (count 1 $ inlineChar) @@ -646,15 +707,15 @@ verbatimEnv = do controlSeq "begin" name <- braced guard $ name == "verbatim" || name == "Verbatim" || - name == "lstlisting" + name == "lstlisting" || name == "minted" verbEnv name rest <- getInput return (r,rest) -rawLaTeXBlock :: GenParser Char ParserState String +rawLaTeXBlock :: Parser [Char] ParserState String rawLaTeXBlock = snd <$> withRaw (environment <|> blockCommand) -rawLaTeXInline :: GenParser Char ParserState Inline +rawLaTeXInline :: Parser [Char] ParserState Inline rawLaTeXInline = do (res, raw) <- withRaw inlineCommand if res == mempty @@ -678,7 +739,9 @@ environments = M.fromList verbEnv "code")) , ("verbatim", codeBlock <$> (verbEnv "verbatim")) , ("Verbatim", codeBlock <$> (verbEnv "Verbatim")) - , ("lstlisting", codeBlock <$> (verbEnv "listlisting")) + , ("lstlisting", codeBlock <$> (verbEnv "lstlisting")) + , ("minted", liftA2 (\l c -> codeBlockWith ("",[l],[]) c) + (grouped (many1 $ satisfy (/= '}'))) (verbEnv "minted")) , ("displaymath", mathEnv Nothing "displaymath") , ("equation", mathEnv Nothing "equation") , ("equation*", mathEnv Nothing "equation*") @@ -878,9 +941,9 @@ parseAligns :: LP [Alignment] parseAligns = try $ do char '{' optional $ char '|' - let cAlign = char 'c' >> return AlignCenter - let lAlign = char 'l' >> return AlignLeft - let rAlign = char 'r' >> return AlignRight + let cAlign = AlignCenter <$ char 'c' + let lAlign = AlignLeft <$ char 'l' + let rAlign = AlignRight <$ char 'r' let alignChar = optional sp *> (cAlign <|> lAlign <|> rAlign) aligns' <- sepEndBy alignChar (optional $ char '|') spaces @@ -891,16 +954,20 @@ parseAligns = try $ do hline :: LP () hline = () <$ (try $ spaces >> controlSeq "hline") +lbreak :: LP () +lbreak = () <$ (try $ spaces *> controlSeq "\\") + +amp :: LP () +amp = () <$ (try $ spaces *> char '&') + parseTableRow :: Int -- ^ number of columns -> LP [Blocks] parseTableRow cols = try $ do - let amp = try $ spaces *> string "&" - let tableCellInline = notFollowedBy (amp <|> controlSeq "\\") >> inline - cells' <- sepBy ((plain . trimInlines . mconcat) <$> many tableCellInline) amp + let tableCellInline = notFollowedBy (amp <|> lbreak) >> inline + let tableCell = (plain . trimInlines . mconcat) <$> many tableCellInline + cells' <- sepBy tableCell amp guard $ length cells' == cols spaces - optional $ controlSeq "\\" - spaces return cells' simpTable :: LP Blocks @@ -909,8 +976,8 @@ simpTable = try $ do aligns <- parseAligns let cols = length aligns optional hline - header' <- option [] $ try (parseTableRow cols <* hline) - rows <- many (parseTableRow cols <* optional hline) + header' <- option [] $ try (parseTableRow cols <* lbreak <* hline) + rows <- sepEndBy (parseTableRow cols) (lbreak <* optional hline) spaces let header'' = if null header' then replicate cols mempty diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 65c80956a..34a6cf7ce 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -43,7 +43,6 @@ import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock ) import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, isInlineTag, isBlockTag, isTextTag, isCommentTag ) import Text.Pandoc.XML ( fromEntities ) -import Text.ParserCombinators.Parsec import Control.Monad (when, liftM, guard, mzero) import Text.HTML.TagSoup import Text.HTML.TagSoup.Match (tagOpen) @@ -83,14 +82,14 @@ isBlank _ = False -- auxiliary functions -- -indentSpaces :: GenParser Char ParserState [Char] +indentSpaces :: Parser [Char] ParserState [Char] indentSpaces = try $ do state <- getState let tabStop = stateTabStop state count tabStop (char ' ') <|> string "\t" <?> "indentation" -nonindentSpaces :: GenParser Char ParserState [Char] +nonindentSpaces :: Parser [Char] ParserState [Char] nonindentSpaces = do state <- getState let tabStop = stateTabStop state @@ -99,30 +98,30 @@ nonindentSpaces = do then return sps else unexpected "indented line" -skipNonindentSpaces :: GenParser Char ParserState () +skipNonindentSpaces :: Parser [Char] ParserState () skipNonindentSpaces = do state <- getState atMostSpaces (stateTabStop state - 1) -atMostSpaces :: Int -> GenParser Char ParserState () +atMostSpaces :: Int -> Parser [Char] ParserState () atMostSpaces 0 = notFollowedBy (char ' ') atMostSpaces n = (char ' ' >> atMostSpaces (n-1)) <|> return () -litChar :: GenParser Char ParserState Char +litChar :: Parser [Char] ParserState Char litChar = escapedChar' <|> noneOf "\n" <|> (newline >> notFollowedBy blankline >> return ' ') -- | Fail unless we're at beginning of a line. -failUnlessBeginningOfLine :: GenParser tok st () +failUnlessBeginningOfLine :: Parser [tok] st () failUnlessBeginningOfLine = do pos <- getPosition if sourceColumn pos == 1 then return () else fail "not beginning of line" -- | Parse a sequence of inline elements between square brackets, -- including inlines between balanced pairs of square brackets. -inlinesInBalancedBrackets :: GenParser Char ParserState Inline - -> GenParser Char ParserState [Inline] +inlinesInBalancedBrackets :: Parser [Char] ParserState Inline + -> Parser [Char] ParserState [Inline] inlinesInBalancedBrackets parser = try $ do char '[' result <- manyTill ( (do lookAhead $ try $ do (Str res) <- parser @@ -137,7 +136,7 @@ inlinesInBalancedBrackets parser = try $ do -- document structure -- -titleLine :: GenParser Char ParserState [Inline] +titleLine :: Parser [Char] ParserState [Inline] titleLine = try $ do char '%' skipSpaces @@ -146,7 +145,7 @@ titleLine = try $ do newline return $ normalizeSpaces res -authorsLine :: GenParser Char ParserState [[Inline]] +authorsLine :: Parser [Char] ParserState [[Inline]] authorsLine = try $ do char '%' skipSpaces @@ -157,14 +156,14 @@ authorsLine = try $ do newline return $ filter (not . null) $ map normalizeSpaces authors -dateLine :: GenParser Char ParserState [Inline] +dateLine :: Parser [Char] ParserState [Inline] dateLine = try $ do char '%' skipSpaces date <- manyTill inline newline return $ normalizeSpaces date -titleBlock :: GenParser Char ParserState ([Inline], [[Inline]], [Inline]) +titleBlock :: Parser [Char] ParserState ([Inline], [[Inline]], [Inline]) titleBlock = try $ do failIfStrict title <- option [] titleLine @@ -173,7 +172,7 @@ titleBlock = try $ do optional blanklines return (title, author, date) -parseMarkdown :: GenParser Char ParserState Pandoc +parseMarkdown :: Parser [Char] ParserState Pandoc parseMarkdown = do -- markdown allows raw HTML updateState (\state -> state { stateParseRaw = True }) @@ -182,7 +181,8 @@ parseMarkdown = do -- docMinusKeys is the raw document with blanks where the keys/notes were... st <- getState let firstPassParser = referenceKey - <|> (if stateStrict st then pzero else noteBlock) + <|> (if stateStrict st then mzero else noteBlock) + <|> liftM snd (withRaw codeBlockDelimited) <|> lineClump docMinusKeys <- liftM concat $ manyTill firstPassParser eof setInput docMinusKeys @@ -210,7 +210,7 @@ parseMarkdown = do -- initial pass for references and notes -- -referenceKey :: GenParser Char ParserState [Char] +referenceKey :: Parser [Char] ParserState [Char] referenceKey = try $ do startPos <- getPosition skipNonindentSpaces @@ -237,7 +237,7 @@ referenceKey = try $ do -- return blanks so line count isn't affected return $ replicate (sourceLine endPos - sourceLine startPos) '\n' -referenceTitle :: GenParser Char ParserState String +referenceTitle :: Parser [Char] ParserState String referenceTitle = try $ do skipSpaces >> optional newline >> skipSpaces tit <- (charsInBalanced '(' ')' litChar >>= return . unwords . words) @@ -246,23 +246,23 @@ referenceTitle = try $ do notFollowedBy (noneOf ")\n"))) return $ fromEntities tit -noteMarker :: GenParser Char ParserState [Char] +noteMarker :: Parser [Char] ParserState [Char] noteMarker = string "[^" >> many1Till (satisfy $ not . isBlank) (char ']') -rawLine :: GenParser Char ParserState [Char] +rawLine :: Parser [Char] ParserState [Char] rawLine = try $ do notFollowedBy blankline notFollowedBy' $ try $ skipNonindentSpaces >> noteMarker optional indentSpaces anyLine -rawLines :: GenParser Char ParserState [Char] +rawLines :: Parser [Char] ParserState [Char] rawLines = do first <- anyLine rest <- many rawLine return $ unlines (first:rest) -noteBlock :: GenParser Char ParserState [Char] +noteBlock :: Parser [Char] ParserState [Char] noteBlock = try $ do startPos <- getPosition skipNonindentSpaces @@ -286,10 +286,10 @@ noteBlock = try $ do -- parsing blocks -- -parseBlocks :: GenParser Char ParserState [Block] +parseBlocks :: Parser [Char] ParserState [Block] parseBlocks = manyTill block eof -block :: GenParser Char ParserState Block +block :: Parser [Char] ParserState Block block = do st <- getState choice (if stateStrict st @@ -324,10 +324,10 @@ block = do -- header blocks -- -header :: GenParser Char ParserState Block +header :: Parser [Char] ParserState Block header = setextHeader <|> atxHeader <?> "header" -atxHeader :: GenParser Char ParserState Block +atxHeader :: Parser [Char] ParserState Block atxHeader = try $ do level <- many1 (char '#') >>= return . length notFollowedBy (char '.' <|> char ')') -- this would be a list @@ -335,10 +335,10 @@ atxHeader = try $ do text <- manyTill inline atxClosing >>= return . normalizeSpaces return $ Header level text -atxClosing :: GenParser Char st [Char] +atxClosing :: Parser [Char] st [Char] atxClosing = try $ skipMany (char '#') >> blanklines -setextHeader :: GenParser Char ParserState Block +setextHeader :: Parser [Char] ParserState Block setextHeader = try $ do -- This lookahead prevents us from wasting time parsing Inlines -- unless necessary -- it gives a significant performance boost. @@ -354,7 +354,7 @@ setextHeader = try $ do -- hrule block -- -hrule :: GenParser Char st Block +hrule :: Parser [Char] st Block hrule = try $ do skipSpaces start <- satisfy isHruleChar @@ -368,12 +368,12 @@ hrule = try $ do -- code blocks -- -indentedLine :: GenParser Char ParserState [Char] +indentedLine :: Parser [Char] ParserState [Char] indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n") blockDelimiter :: (Char -> Bool) -> Maybe Int - -> GenParser Char st (Int, (String, [String], [(String, String)]), Char) + -> Parser [Char] st (Int, (String, [String], [(String, String)]), Char) blockDelimiter f len = try $ do c <- lookAhead (satisfy f) size <- case len of @@ -387,7 +387,7 @@ blockDelimiter f len = try $ do blankline return (size, attr, c) -attributes :: GenParser Char st ([Char], [[Char]], [([Char], [Char])]) +attributes :: Parser [Char] st ([Char], [[Char]], [([Char], [Char])]) attributes = try $ do char '{' spnl @@ -399,28 +399,28 @@ attributes = try $ do | otherwise = firstNonNull xs return (firstNonNull $ reverse ids, concat classes, concat keyvals) -attribute :: GenParser Char st ([Char], [[Char]], [([Char], [Char])]) +attribute :: Parser [Char] st ([Char], [[Char]], [([Char], [Char])]) attribute = identifierAttr <|> classAttr <|> keyValAttr -identifier :: GenParser Char st [Char] +identifier :: Parser [Char] st [Char] identifier = do first <- letter rest <- many $ alphaNum <|> oneOf "-_:." return (first:rest) -identifierAttr :: GenParser Char st ([Char], [a], [a1]) +identifierAttr :: Parser [Char] st ([Char], [a], [a1]) identifierAttr = try $ do char '#' result <- identifier return (result,[],[]) -classAttr :: GenParser Char st ([Char], [[Char]], [a]) +classAttr :: Parser [Char] st ([Char], [[Char]], [a]) classAttr = try $ do char '.' result <- identifier return ("",[result],[]) -keyValAttr :: GenParser Char st ([Char], [a], [([Char], [Char])]) +keyValAttr :: Parser [Char] st ([Char], [a], [([Char], [Char])]) keyValAttr = try $ do key <- identifier char '=' @@ -429,14 +429,14 @@ keyValAttr = try $ do <|> many nonspaceChar return ("",[],[(key,val)]) -codeBlockDelimited :: GenParser Char st Block +codeBlockDelimited :: Parser [Char] st Block codeBlockDelimited = try $ do (size, attr, c) <- blockDelimiter (\c -> c == '~' || c == '`') Nothing contents <- manyTill anyLine (blockDelimiter (== c) (Just size)) blanklines return $ CodeBlock attr $ intercalate "\n" contents -codeBlockIndented :: GenParser Char ParserState Block +codeBlockIndented :: Parser [Char] ParserState Block codeBlockIndented = do contents <- many1 (indentedLine <|> try (do b <- blanklines @@ -447,7 +447,7 @@ codeBlockIndented = do return $ CodeBlock ("", stateIndentedCodeClasses st, []) $ stripTrailingNewlines $ concat contents -lhsCodeBlock :: GenParser Char ParserState Block +lhsCodeBlock :: Parser [Char] ParserState Block lhsCodeBlock = do failUnlessLHS liftM (CodeBlock ("",["sourceCode","literate","haskell"],[])) @@ -455,7 +455,7 @@ lhsCodeBlock = do <|> liftM (CodeBlock ("",["sourceCode","haskell"],[])) lhsCodeBlockInverseBird -lhsCodeBlockLaTeX :: GenParser Char ParserState String +lhsCodeBlockLaTeX :: Parser [Char] ParserState String lhsCodeBlockLaTeX = try $ do string "\\begin{code}" manyTill spaceChar newline @@ -463,13 +463,13 @@ lhsCodeBlockLaTeX = try $ do blanklines return $ stripTrailingNewlines contents -lhsCodeBlockBird :: GenParser Char ParserState String +lhsCodeBlockBird :: Parser [Char] ParserState String lhsCodeBlockBird = lhsCodeBlockBirdWith '>' -lhsCodeBlockInverseBird :: GenParser Char ParserState String +lhsCodeBlockInverseBird :: Parser [Char] ParserState String lhsCodeBlockInverseBird = lhsCodeBlockBirdWith '<' -lhsCodeBlockBirdWith :: Char -> GenParser Char ParserState String +lhsCodeBlockBirdWith :: Char -> Parser [Char] ParserState String lhsCodeBlockBirdWith c = try $ do pos <- getPosition when (sourceColumn pos /= 1) $ fail "Not in first column" @@ -481,7 +481,7 @@ lhsCodeBlockBirdWith c = try $ do blanklines return $ intercalate "\n" lns' -birdTrackLine :: Char -> GenParser Char st [Char] +birdTrackLine :: Char -> Parser [Char] st [Char] birdTrackLine c = try $ do char c -- allow html tags on left margin: @@ -493,10 +493,10 @@ birdTrackLine c = try $ do -- block quotes -- -emailBlockQuoteStart :: GenParser Char ParserState Char +emailBlockQuoteStart :: Parser [Char] ParserState Char emailBlockQuoteStart = try $ skipNonindentSpaces >> char '>' >>~ optional (char ' ') -emailBlockQuote :: GenParser Char ParserState [[Char]] +emailBlockQuote :: Parser [Char] ParserState [[Char]] emailBlockQuote = try $ do emailBlockQuoteStart raw <- sepBy (many (nonEndline <|> @@ -507,7 +507,7 @@ emailBlockQuote = try $ do optional blanklines return raw -blockQuote :: GenParser Char ParserState Block +blockQuote :: Parser [Char] ParserState Block blockQuote = do raw <- emailBlockQuote -- parse the extracted block, which may contain various block elements: @@ -518,7 +518,7 @@ blockQuote = do -- list blocks -- -bulletListStart :: GenParser Char ParserState () +bulletListStart :: Parser [Char] ParserState () bulletListStart = try $ do optional newline -- if preceded by a Plain block in a list context skipNonindentSpaces @@ -527,7 +527,7 @@ bulletListStart = try $ do spaceChar skipSpaces -anyOrderedListStart :: GenParser Char ParserState (Int, ListNumberStyle, ListNumberDelim) +anyOrderedListStart :: Parser [Char] ParserState (Int, ListNumberStyle, ListNumberDelim) anyOrderedListStart = try $ do optional newline -- if preceded by a Plain block in a list context skipNonindentSpaces @@ -547,13 +547,12 @@ anyOrderedListStart = try $ do skipSpaces return (num, style, delim) -listStart :: GenParser Char ParserState () +listStart :: Parser [Char] ParserState () listStart = bulletListStart <|> (anyOrderedListStart >> return ()) -- parse a line of a list item (start = parser for beginning of list item) -listLine :: GenParser Char ParserState [Char] +listLine :: Parser [Char] ParserState [Char] listLine = try $ do - notFollowedBy' listStart notFollowedBy blankline notFollowedBy' (do indentSpaces many (spaceChar) @@ -562,24 +561,26 @@ listLine = try $ do return $ concat chunks ++ "\n" -- parse raw text for one list item, excluding start marker and continuations -rawListItem :: GenParser Char ParserState a -> GenParser Char ParserState [Char] +rawListItem :: Parser [Char] ParserState a + -> Parser [Char] ParserState [Char] rawListItem start = try $ do start - result <- many1 listLine + first <- listLine + rest <- many (notFollowedBy listStart >> listLine) blanks <- many blankline - return $ concat result ++ blanks + return $ concat (first:rest) ++ blanks -- continuation of a list item - indented and separated by blankline -- or (in compact lists) endline. -- note: nested lists are parsed as continuations -listContinuation :: GenParser Char ParserState [Char] +listContinuation :: Parser [Char] ParserState [Char] listContinuation = try $ do lookAhead indentSpaces result <- many1 listContinuationLine blanks <- many blankline return $ concat result ++ blanks -listContinuationLine :: GenParser Char ParserState [Char] +listContinuationLine :: Parser [Char] ParserState [Char] listContinuationLine = try $ do notFollowedBy blankline notFollowedBy' listStart @@ -587,8 +588,9 @@ listContinuationLine = try $ do result <- manyTill anyChar newline return $ result ++ "\n" -listItem :: GenParser Char ParserState a -> GenParser Char ParserState [Block] -listItem start = try $ do +listItem :: Parser [Char] ParserState a + -> Parser [Char] ParserState [Block] +listItem start = try $ do first <- rawListItem start continuations <- many listContinuation -- parsing with ListItemState forces markers at beginning of lines to @@ -603,7 +605,7 @@ listItem start = try $ do updateState (\st -> st {stateParserContext = oldContext}) return contents -orderedList :: GenParser Char ParserState Block +orderedList :: Parser [Char] ParserState Block orderedList = try $ do (start, style, delim) <- lookAhead anyOrderedListStart items <- many1 $ listItem $ try $ @@ -612,13 +614,13 @@ orderedList = try $ do orderedListMarker style delim return $ OrderedList (start, style, delim) $ compactify items -bulletList :: GenParser Char ParserState Block +bulletList :: Parser [Char] ParserState Block bulletList = many1 (listItem bulletListStart) >>= return . BulletList . compactify -- definition lists -defListMarker :: GenParser Char ParserState () +defListMarker :: Parser [Char] ParserState () defListMarker = do sps <- nonindentSpaces char ':' <|> char '~' @@ -627,10 +629,10 @@ defListMarker = do let remaining = tabStop - (length sps + 1) if remaining > 0 then count remaining (char ' ') <|> string "\t" - else pzero + else mzero return () -definitionListItem :: GenParser Char ParserState ([Inline], [[Block]]) +definitionListItem :: Parser [Char] ParserState ([Inline], [[Block]]) definitionListItem = try $ do -- first, see if this has any chance of being a definition list: lookAhead (anyLine >> optional blankline >> defListMarker) @@ -644,7 +646,7 @@ definitionListItem = try $ do updateState (\st -> st {stateParserContext = oldContext}) return ((normalizeSpaces term), contents) -defRawBlock :: GenParser Char ParserState [Char] +defRawBlock :: Parser [Char] ParserState [Char] defRawBlock = try $ do defListMarker firstline <- anyLine @@ -656,7 +658,7 @@ defRawBlock = try $ do return $ unlines lns ++ trl return $ firstline ++ "\n" ++ unlines rawlines ++ trailing ++ cont -definitionList :: GenParser Char ParserState Block +definitionList :: Parser [Char] ParserState Block definitionList = do items <- many1 definitionListItem -- "compactify" the definition list: @@ -685,7 +687,7 @@ isHtmlOrBlank (Space) = True isHtmlOrBlank (LineBreak) = True isHtmlOrBlank _ = False -para :: GenParser Char ParserState Block +para :: Parser [Char] ParserState Block para = try $ do result <- liftM normalizeSpaces $ many1 inline guard $ not . all isHtmlOrBlank $ result @@ -696,17 +698,17 @@ para = try $ do lookAhead (blockQuote <|> header) >> return "") return $ Para result -plain :: GenParser Char ParserState Block +plain :: Parser [Char] ParserState Block plain = many1 inline >>~ spaces >>= return . Plain . normalizeSpaces -- -- raw html -- -htmlElement :: GenParser Char ParserState [Char] +htmlElement :: Parser [Char] ParserState [Char] htmlElement = strictHtmlBlock <|> liftM snd (htmlTag isBlockTag) -htmlBlock :: GenParser Char ParserState Block +htmlBlock :: Parser [Char] ParserState Block htmlBlock = try $ do failUnlessBeginningOfLine first <- htmlElement @@ -714,12 +716,12 @@ htmlBlock = try $ do finalNewlines <- many newline return $ RawBlock "html" $ first ++ finalSpace ++ finalNewlines -strictHtmlBlock :: GenParser Char ParserState [Char] +strictHtmlBlock :: Parser [Char] ParserState [Char] strictHtmlBlock = do failUnlessBeginningOfLine htmlInBalanced (not . isInlineTag) -rawVerbatimBlock :: GenParser Char ParserState String +rawVerbatimBlock :: Parser [Char] ParserState String rawVerbatimBlock = try $ do (TagOpen tag _, open) <- htmlTag (tagOpen (\t -> t == "pre" || t == "style" || t == "script") @@ -727,7 +729,7 @@ rawVerbatimBlock = try $ do contents <- manyTill anyChar (htmlTag (~== TagClose tag)) return $ open ++ contents ++ renderTags [TagClose tag] -rawTeXBlock :: GenParser Char ParserState Block +rawTeXBlock :: Parser [Char] ParserState Block rawTeXBlock = do failIfStrict result <- liftM (RawBlock "latex") rawLaTeXBlock @@ -735,7 +737,7 @@ rawTeXBlock = do spaces return result -rawHtmlBlocks :: GenParser Char ParserState Block +rawHtmlBlocks :: Parser [Char] ParserState Block rawHtmlBlocks = do htmlBlocks <- many1 $ do blk <- rawVerbatimBlock <|> liftM snd (htmlTag isBlockTag) @@ -759,7 +761,7 @@ rawHtmlBlocks = do -- Parse a dashed line with optional trailing spaces; return its length -- and the length including trailing space. dashedLine :: Char - -> GenParser Char st (Int, Int) + -> Parser [Char] st (Int, Int) dashedLine ch = do dashes <- many1 (char ch) sp <- many spaceChar @@ -768,7 +770,7 @@ dashedLine ch = do -- Parse a table header with dashed lines of '-' preceded by -- one (or zero) line of text. simpleTableHeader :: Bool -- ^ Headerless table - -> GenParser Char ParserState ([[Block]], [Alignment], [Int]) + -> Parser [Char] ParserState ([[Block]], [Alignment], [Int]) simpleTableHeader headless = try $ do rawContent <- if headless then return "" @@ -792,16 +794,16 @@ simpleTableHeader headless = try $ do return (heads, aligns, indices) -- Parse a table footer - dashed lines followed by blank line. -tableFooter :: GenParser Char ParserState [Char] +tableFooter :: Parser [Char] ParserState [Char] tableFooter = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> blanklines -- Parse a table separator - dashed line. -tableSep :: GenParser Char ParserState Char +tableSep :: Parser [Char] ParserState Char tableSep = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> char '\n' -- Parse a raw line and split it into chunks by indices. rawTableLine :: [Int] - -> GenParser Char ParserState [String] + -> Parser [Char] ParserState [String] rawTableLine indices = do notFollowedBy' (blanklines <|> tableFooter) line <- many1Till anyChar newline @@ -810,12 +812,12 @@ rawTableLine indices = do -- Parse a table line and return a list of lists of blocks (columns). tableLine :: [Int] - -> GenParser Char ParserState [[Block]] + -> Parser [Char] ParserState [[Block]] tableLine indices = rawTableLine indices >>= mapM (parseFromString (many plain)) -- Parse a multiline table row and return a list of blocks (columns). multilineRow :: [Int] - -> GenParser Char ParserState [[Block]] + -> Parser [Char] ParserState [[Block]] multilineRow indices = do colLines <- many1 (rawTableLine indices) let cols = map unlines $ transpose colLines @@ -823,7 +825,7 @@ multilineRow indices = do -- Parses a table caption: inlines beginning with 'Table:' -- and followed by blank lines. -tableCaption :: GenParser Char ParserState [Inline] +tableCaption :: Parser [Char] ParserState [Inline] tableCaption = try $ do skipNonindentSpaces string ":" <|> string "Table:" @@ -833,7 +835,7 @@ tableCaption = try $ do -- Parse a simple table with '---' header and one line per row. simpleTable :: Bool -- ^ Headerless table - -> GenParser Char ParserState Block + -> Parser [Char] ParserState Block simpleTable headless = do Table c a _w h l <- tableWith (simpleTableHeader headless) tableLine (return ()) @@ -847,12 +849,12 @@ simpleTable headless = do -- which may be multiline, separated by blank lines, and -- ending with a footer (dashed line followed by blank line). multilineTable :: Bool -- ^ Headerless table - -> GenParser Char ParserState Block + -> Parser [Char] ParserState Block multilineTable headless = tableWith (multilineTableHeader headless) multilineRow blanklines tableFooter tableCaption multilineTableHeader :: Bool -- ^ Headerless table - -> GenParser Char ParserState ([[Block]], [Alignment], [Int]) + -> Parser [Char] ParserState ([[Block]], [Alignment], [Int]) multilineTableHeader headless = try $ do if headless then return '\n' @@ -904,10 +906,10 @@ extraTable :: Bool -- ^ Headerless table extraTable = extraTableWith block tableCaption gridTable :: Bool -- ^ Headerless table - -> GenParser Char ParserState Block + -> Parser [Char] ParserState Block gridTable = gridTableWith block tableCaption -table :: GenParser Char ParserState Block +table :: Parser [Char] ParserState Block table = multilineTable False <|> simpleTable True <|> simpleTable False <|> multilineTable True <|> extraTable False <|> extraTable True <|> @@ -917,10 +919,10 @@ table = multilineTable False <|> simpleTable True <|> -- inline -- -inline :: GenParser Char ParserState Inline +inline :: Parser [Char] ParserState Inline inline = choice inlineParsers <?> "inline" -inlineParsers :: [GenParser Char ParserState Inline] +inlineParsers :: [Parser [Char] ParserState Inline] inlineParsers = [ whitespace , str , endline @@ -947,7 +949,7 @@ inlineParsers = [ whitespace , symbol , ltSign ] -escapedChar' :: GenParser Char ParserState Char +escapedChar' :: Parser [Char] ParserState Char escapedChar' = try $ do char '\\' state <- getState @@ -955,7 +957,7 @@ escapedChar' = try $ do then oneOf "\\`*_{}[]()>#+-.!~" else satisfy (not . isAlphaNum) -escapedChar :: GenParser Char ParserState Inline +escapedChar :: Parser [Char] ParserState Inline escapedChar = do result <- escapedChar' return $ case result of @@ -963,7 +965,7 @@ escapedChar = do '\n' -> LineBreak -- "\[newline]" is a linebreak _ -> Str [result] -ltSign :: GenParser Char ParserState Inline +ltSign :: Parser [Char] ParserState Inline ltSign = do st <- getState if stateStrict st @@ -971,7 +973,7 @@ ltSign = do else notFollowedBy' rawHtmlBlocks >> char '<' -- unless it starts html return $ Str ['<'] -exampleRef :: GenParser Char ParserState Inline +exampleRef :: Parser [Char] ParserState Inline exampleRef = try $ do char '@' lab <- many1 (alphaNum <|> oneOf "-_") @@ -979,7 +981,7 @@ exampleRef = try $ do -- later. See the end of parseMarkdown. return $ Str $ '@' : lab -symbol :: GenParser Char ParserState Inline +symbol :: Parser [Char] ParserState Inline symbol = do result <- noneOf "<\\\n\t " <|> try (do lookAhead $ char '\\' @@ -988,7 +990,7 @@ symbol = do return $ Str [result] -- parses inline code, between n `s and n `s -code :: GenParser Char ParserState Inline +code :: Parser [Char] ParserState Inline code = try $ do starts <- many1 (char '`') skipSpaces @@ -999,26 +1001,26 @@ code = try $ do attr <- option ([],[],[]) (try $ optional whitespace >> attributes) return $ Code attr $ removeLeadingTrailingSpace $ concat result -mathWord :: GenParser Char st [Char] +mathWord :: Parser [Char] st [Char] mathWord = liftM concat $ many1 mathChunk -mathChunk :: GenParser Char st [Char] +mathChunk :: Parser [Char] st [Char] mathChunk = do char '\\' c <- anyChar return ['\\',c] <|> many1 (satisfy $ \c -> not (isBlank c || c == '\\' || c == '$')) -math :: GenParser Char ParserState Inline +math :: Parser [Char] ParserState Inline math = (mathDisplay >>= applyMacros' >>= return . Math DisplayMath) <|> (mathInline >>= applyMacros' >>= return . Math InlineMath) -mathDisplay :: GenParser Char ParserState String +mathDisplay :: Parser [Char] ParserState String mathDisplay = try $ do failIfStrict string "$$" many1Till (noneOf "\n" <|> (newline >>~ notFollowedBy' blankline)) (try $ string "$$") -mathInline :: GenParser Char ParserState String +mathInline :: Parser [Char] ParserState String mathInline = try $ do failIfStrict char '$' @@ -1028,20 +1030,20 @@ mathInline = try $ do notFollowedBy digit return $ intercalate " " words' --- to avoid performance problems, treat 4 or more _ or * in a row as a literal --- rather than attempting to parse for emph/strong -fours :: GenParser Char st Inline +-- to avoid performance problems, treat 4 or more _ or * or ~ or ^ in a row +-- as a literal rather than attempting to parse for emph/strong/strikeout/super/sub +fours :: Parser [Char] st Inline fours = try $ do - x <- char '*' <|> char '_' + x <- char '*' <|> char '_' <|> char '~' <|> char '^' count 2 $ satisfy (==x) rest <- many1 (satisfy (==x)) return $ Str (x:x:x:rest) -- | Parses a list of inlines between start and end delimiters. inlinesBetween :: (Show b) - => GenParser Char ParserState a - -> GenParser Char ParserState b - -> GenParser Char ParserState [Inline] + => Parser [Char] ParserState a + -> Parser [Char] ParserState b + -> Parser [Char] ParserState [Inline] inlinesBetween start end = normalizeSpaces `liftM` try (start >> many1Till inner end) where inner = innerSpace <|> (notFollowedBy' whitespace >> inline) @@ -1049,8 +1051,8 @@ inlinesBetween start end = -- This is used to prevent exponential blowups for things like: -- a**a*a**a*a**a*a**a*a**a*a**a*a** -nested :: GenParser Char ParserState a - -> GenParser Char ParserState a +nested :: Parser [Char] ParserState a + -> Parser [Char] ParserState a nested p = do nestlevel <- stateMaxNestingLevel `fmap` getState guard $ nestlevel > 0 @@ -1059,7 +1061,7 @@ nested p = do updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } return res -emph :: GenParser Char ParserState Inline +emph :: Parser [Char] ParserState Inline emph = Emph `fmap` nested (inlinesBetween starStart starEnd <|> inlinesBetween ulStart ulEnd) where starStart = char '*' >> lookAhead nonspaceChar @@ -1067,7 +1069,7 @@ emph = Emph `fmap` nested ulStart = char '_' >> lookAhead nonspaceChar ulEnd = notFollowedBy' strong >> char '_' -strong :: GenParser Char ParserState Inline +strong :: Parser [Char] ParserState Inline strong = Strong `liftM` nested (inlinesBetween starStart starEnd <|> inlinesBetween ulStart ulEnd) where starStart = string "**" >> lookAhead nonspaceChar @@ -1075,32 +1077,32 @@ strong = Strong `liftM` nested ulStart = string "__" >> lookAhead nonspaceChar ulEnd = try $ string "__" -strikeout :: GenParser Char ParserState Inline +strikeout :: Parser [Char] ParserState Inline strikeout = Strikeout `liftM` (failIfStrict >> inlinesBetween strikeStart strikeEnd) where strikeStart = string "~~" >> lookAhead nonspaceChar >> notFollowedBy (char '~') strikeEnd = try $ string "~~" -superscript :: GenParser Char ParserState Inline +superscript :: Parser [Char] ParserState Inline superscript = failIfStrict >> enclosed (char '^') (char '^') (notFollowedBy spaceChar >> inline) >>= -- may not contain Space return . Superscript -subscript :: GenParser Char ParserState Inline +subscript :: Parser [Char] ParserState Inline subscript = failIfStrict >> enclosed (char '~') (char '~') (notFollowedBy spaceChar >> inline) >>= -- may not contain Space return . Subscript -whitespace :: GenParser Char ParserState Inline +whitespace :: Parser [Char] ParserState Inline whitespace = spaceChar >> ( (spaceChar >> skipMany spaceChar >> option Space (endline >> return LineBreak)) <|> (skipMany spaceChar >> return Space) ) <?> "whitespace" -nonEndline :: GenParser Char st Char +nonEndline :: Parser [Char] st Char nonEndline = satisfy (/='\n') -str :: GenParser Char ParserState Inline +str :: Parser [Char] ParserState Inline str = do smart <- stateSmart `fmap` getState a <- alphaNum @@ -1133,12 +1135,12 @@ likelyAbbrev x = "Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.", "vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.", "Rev.", "Ph.D.", "M.D.", "M.A.", "p.", "pp.", - "ch.", "sec." ] + "ch.", "sec.", "cf.", "cp."] abbrPairs = map (break (=='.')) abbrevs in map snd $ filter (\(y,_) -> y == x) abbrPairs -- an endline character that can be treated as a space, not a structural break -endline :: GenParser Char ParserState Inline +endline :: Parser [Char] ParserState Inline endline = try $ do newline notFollowedBy blankline @@ -1157,20 +1159,20 @@ endline = try $ do -- -- a reference label for a link -reference :: GenParser Char ParserState [Inline] +reference :: Parser [Char] ParserState [Inline] reference = do notFollowedBy' (string "[^") -- footnote reference result <- inlinesInBalancedBrackets inline return $ normalizeSpaces result -- source for a link, with optional title -source :: GenParser Char ParserState (String, [Char]) +source :: Parser [Char] ParserState (String, [Char]) source = (try $ charsInBalanced '(' ')' litChar >>= parseFromString source') <|> -- the following is needed for cases like: [ref](/url(a). (enclosed (char '(') (char ')') litChar >>= parseFromString source') -- auxiliary function for source -source' :: GenParser Char ParserState (String, [Char]) +source' :: Parser [Char] ParserState (String, [Char]) source' = do skipSpaces let nl = char '\n' >>~ notFollowedBy blankline @@ -1188,7 +1190,7 @@ source' = do eof return (escapeURI $ removeTrailingSpace src, tit) -linkTitle :: GenParser Char ParserState String +linkTitle :: Parser [Char] ParserState String linkTitle = try $ do (many1 spaceChar >> option '\n' newline) <|> newline skipSpaces @@ -1196,7 +1198,7 @@ linkTitle = try $ do tit <- manyTill litChar (try (char delim >> skipSpaces >> eof)) return $ fromEntities tit -link :: GenParser Char ParserState Inline +link :: Parser [Char] ParserState Inline link = try $ do lab <- reference (src, tit) <- source <|> referenceLink lab @@ -1209,7 +1211,7 @@ delinkify = bottomUp $ concatMap go -- a link like [this][ref] or [this][] or [this] referenceLink :: [Inline] - -> GenParser Char ParserState (String, [Char]) + -> Parser [Char] ParserState (String, [Char]) referenceLink lab = do ref <- option [] (try (optional (char ' ') >> optional (newline >> skipSpaces) >> reference)) @@ -1219,7 +1221,7 @@ referenceLink lab = do Nothing -> fail "no corresponding key" Just target -> return target -autoLink :: GenParser Char ParserState Inline +autoLink :: Parser [Char] ParserState Inline autoLink = try $ do char '<' (orig, src) <- uri <|> emailAddress @@ -1229,14 +1231,14 @@ autoLink = try $ do then Link [Str orig] (src, "") else Link [Code ("",["url"],[]) orig] (src, "") -image :: GenParser Char ParserState Inline +image :: Parser [Char] ParserState Inline image = try $ do char '!' lab <- reference (src, tit) <- source <|> referenceLink lab return $ Image lab (src,tit) -note :: GenParser Char ParserState Inline +note :: Parser [Char] ParserState Inline note = try $ do failIfStrict ref <- noteMarker @@ -1253,21 +1255,21 @@ note = try $ do updateState $ \st -> st{ stateNotes = notes } return $ Note contents -inlineNote :: GenParser Char ParserState Inline +inlineNote :: Parser [Char] ParserState Inline inlineNote = try $ do failIfStrict char '^' contents <- inlinesInBalancedBrackets inline return $ Note [Para contents] -rawLaTeXInline' :: GenParser Char ParserState Inline +rawLaTeXInline' :: Parser [Char] ParserState Inline rawLaTeXInline' = try $ do failIfStrict lookAhead $ char '\\' >> notFollowedBy' (string "start") -- context env RawInline _ s <- rawLaTeXInline return $ RawInline "tex" s -- "tex" because it might be context or latex -rawConTeXtEnvironment :: GenParser Char st String +rawConTeXtEnvironment :: Parser [Char] st String rawConTeXtEnvironment = try $ do string "\\start" completion <- inBrackets (letter <|> digit <|> spaceChar) @@ -1276,14 +1278,14 @@ rawConTeXtEnvironment = try $ do (try $ string "\\stop" >> string completion) return $ "\\start" ++ completion ++ concat contents ++ "\\stop" ++ completion -inBrackets :: (GenParser Char st Char) -> GenParser Char st String +inBrackets :: (Parser [Char] st Char) -> Parser [Char] st String inBrackets parser = do char '[' contents <- many parser char ']' return $ "[" ++ contents ++ "]" -rawHtmlInline :: GenParser Char ParserState Inline +rawHtmlInline :: Parser [Char] ParserState Inline rawHtmlInline = do st <- getState (_,result) <- if stateStrict st @@ -1293,20 +1295,20 @@ rawHtmlInline = do -- Citations -cite :: GenParser Char ParserState Inline +cite :: Parser [Char] ParserState Inline cite = do failIfStrict citations <- textualCite <|> normalCite return $ Cite citations [] -spnl :: GenParser Char st () +spnl :: Parser [Char] st () spnl = try $ do skipSpaces optional newline skipSpaces notFollowedBy (char '\n') -textualCite :: GenParser Char ParserState [Citation] +textualCite :: Parser [Char] ParserState [Citation] textualCite = try $ do (_, key) <- citeKey let first = Citation{ citationId = key @@ -1321,7 +1323,7 @@ textualCite = try $ do then option [first] $ bareloc first else return $ first : rest -bareloc :: Citation -> GenParser Char ParserState [Citation] +bareloc :: Citation -> Parser [Char] ParserState [Citation] bareloc c = try $ do spnl char '[' @@ -1331,7 +1333,7 @@ bareloc c = try $ do char ']' return $ c{ citationSuffix = suff } : rest -normalCite :: GenParser Char ParserState [Citation] +normalCite :: Parser [Char] ParserState [Citation] normalCite = try $ do char '[' spnl @@ -1340,7 +1342,7 @@ normalCite = try $ do char ']' return citations -citeKey :: GenParser Char ParserState (Bool, String) +citeKey :: Parser [Char] ParserState (Bool, String) citeKey = try $ do suppress_author <- option False (char '-' >> return True) char '@' @@ -1352,7 +1354,7 @@ citeKey = try $ do guard $ key `elem` stateCitations st return (suppress_author, key) -suffix :: GenParser Char ParserState [Inline] +suffix :: Parser [Char] ParserState [Inline] suffix = try $ do hasSpace <- option False (notFollowedBy nonspaceChar >> return True) spnl @@ -1361,14 +1363,14 @@ suffix = try $ do then Space : rest else rest -prefix :: GenParser Char ParserState [Inline] +prefix :: Parser [Char] ParserState [Inline] prefix = liftM normalizeSpaces $ manyTill inline (char ']' <|> liftM (const ']') (lookAhead citeKey)) -citeList :: GenParser Char ParserState [Citation] +citeList :: Parser [Char] ParserState [Citation] citeList = sepBy1 citation (try $ char ';' >> spnl) -citation :: GenParser Char ParserState Citation +citation :: Parser [Char] ParserState Citation citation = try $ do pref <- prefix (suppress_author, key) <- citeKey diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index 456b23ce8..1806866ce 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -33,8 +33,7 @@ module Text.Pandoc.Readers.RST ( import Text.Pandoc.Definition import Text.Pandoc.Shared import Text.Pandoc.Parsing -import Text.ParserCombinators.Parsec -import Control.Monad ( when, liftM ) +import Control.Monad ( when, liftM, guard, mzero ) import Data.List ( findIndex, intercalate, transpose, sort, deleteFirstsBy ) import qualified Data.Map as M import Text.Printf ( printf ) @@ -58,7 +57,7 @@ underlineChars = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" -- treat these as potentially non-text when parsing inline: specialChars :: [Char] -specialChars = "\\`|*_<>$:[]()-.\"'\8216\8217\8220\8221" +specialChars = "\\`|*_<>$:/[]{}()-.\"'\8216\8217\8220\8221" -- -- parsing documents @@ -89,7 +88,7 @@ titleTransform ((Header 1 head1):rest) | (promoteHeaders 1 rest, head1) titleTransform blocks = (blocks, []) -parseRST :: GenParser Char ParserState Pandoc +parseRST :: Parser [Char] ParserState Pandoc parseRST = do optional blanklines -- skip blank lines at beginning of file startPos <- getPosition @@ -118,17 +117,19 @@ parseRST = do -- parsing blocks -- -parseBlocks :: GenParser Char ParserState [Block] +parseBlocks :: Parser [Char] ParserState [Block] parseBlocks = manyTill block eof -block :: GenParser Char ParserState Block +block :: Parser [Char] ParserState Block block = choice [ codeBlock , rawBlock , blockQuote , fieldList , imageBlock + , figureBlock , customCodeBlock , mathBlock + , defaultRoleBlock , unknownDirective , header , hrule @@ -144,7 +145,7 @@ block = choice [ codeBlock -- field list -- -rawFieldListItem :: String -> GenParser Char ParserState (String, String) +rawFieldListItem :: String -> Parser [Char] ParserState (String, String) rawFieldListItem indent = try $ do string indent char ':' @@ -158,7 +159,7 @@ rawFieldListItem indent = try $ do return (name, raw) fieldListItem :: String - -> GenParser Char ParserState (Maybe ([Inline], [[Block]])) + -> Parser [Char] ParserState (Maybe ([Inline], [[Block]])) fieldListItem indent = try $ do (name, raw) <- rawFieldListItem indent let term = [Str name] @@ -185,7 +186,7 @@ extractContents [Plain auth] = auth extractContents [Para auth] = auth extractContents _ = [] -fieldList :: GenParser Char ParserState Block +fieldList :: Parser [Char] ParserState Block fieldList = try $ do indent <- lookAhead $ many spaceChar items <- many1 $ fieldListItem indent @@ -197,7 +198,7 @@ fieldList = try $ do -- line block -- -lineBlockLine :: GenParser Char ParserState [Inline] +lineBlockLine :: Parser [Char] ParserState [Inline] lineBlockLine = try $ do char '|' char ' ' <|> lookAhead (char '\n') @@ -208,7 +209,7 @@ lineBlockLine = try $ do then normalizeSpaces line else Str white : normalizeSpaces line -lineBlock :: GenParser Char ParserState Block +lineBlock :: Parser [Char] ParserState Block lineBlock = try $ do lines' <- many1 lineBlockLine blanklines @@ -218,14 +219,14 @@ lineBlock = try $ do -- paragraph block -- -para :: GenParser Char ParserState Block +para :: Parser [Char] ParserState Block para = paraBeforeCodeBlock <|> paraNormal <?> "paragraph" -codeBlockStart :: GenParser Char st Char +codeBlockStart :: Parser [Char] st Char codeBlockStart = string "::" >> blankline >> blankline -- paragraph that ends in a :: starting a code block -paraBeforeCodeBlock :: GenParser Char ParserState Block +paraBeforeCodeBlock :: Parser [Char] ParserState Block paraBeforeCodeBlock = try $ do result <- many1 (notFollowedBy' codeBlockStart >> inline) lookAhead (string "::") @@ -234,21 +235,21 @@ paraBeforeCodeBlock = try $ do else (normalizeSpaces result) ++ [Str ":"] -- regular paragraph -paraNormal :: GenParser Char ParserState Block +paraNormal :: Parser [Char] ParserState Block paraNormal = try $ do result <- many1 inline newline blanklines return $ Para $ normalizeSpaces result -plain :: GenParser Char ParserState Block +plain :: Parser [Char] ParserState Block plain = many1 inline >>= return . Plain . normalizeSpaces -- -- image block -- -imageBlock :: GenParser Char ParserState Block +imageBlock :: Parser [Char] ParserState Block imageBlock = try $ do string ".. image:: " src <- manyTill anyChar newline @@ -263,11 +264,11 @@ imageBlock = try $ do -- header blocks -- -header :: GenParser Char ParserState Block +header :: Parser [Char] ParserState Block header = doubleHeader <|> singleHeader <?> "header" -- a header with lines on top and bottom -doubleHeader :: GenParser Char ParserState Block +doubleHeader :: Parser [Char] ParserState Block doubleHeader = try $ do c <- oneOf underlineChars rest <- many (char c) -- the top line @@ -292,7 +293,7 @@ doubleHeader = try $ do return $ Header level (normalizeSpaces txt) -- a header with line on the bottom only -singleHeader :: GenParser Char ParserState Block +singleHeader :: Parser [Char] ParserState Block singleHeader = try $ do notFollowedBy' whitespace txt <- many1 (do {notFollowedBy blankline; inline}) @@ -315,7 +316,7 @@ singleHeader = try $ do -- hrule block -- -hrule :: GenParser Char st Block +hrule :: Parser [Char] st Block hrule = try $ do chr <- oneOf underlineChars count 3 (char chr) @@ -329,14 +330,14 @@ hrule = try $ do -- -- read a line indented by a given string -indentedLine :: String -> GenParser Char st [Char] +indentedLine :: String -> Parser [Char] st [Char] indentedLine indents = try $ do string indents manyTill anyChar newline -- one or more indented lines, possibly separated by blank lines. -- any amount of indentation will work. -indentedBlock :: GenParser Char st [Char] +indentedBlock :: Parser [Char] st [Char] indentedBlock = try $ do indents <- lookAhead $ many1 spaceChar lns <- many1 $ try $ do b <- option "" blanklines @@ -345,7 +346,7 @@ indentedBlock = try $ do optional blanklines return $ unlines lns -codeBlock :: GenParser Char st Block +codeBlock :: Parser [Char] st Block codeBlock = try $ do codeBlockStart result <- indentedBlock @@ -353,7 +354,7 @@ codeBlock = try $ do -- | The 'code-block' directive (from Sphinx) that allows a language to be -- specified. -customCodeBlock :: GenParser Char st Block +customCodeBlock :: Parser [Char] st Block customCodeBlock = try $ do string ".. code-block:: " language <- manyTill anyChar newline @@ -361,19 +362,33 @@ customCodeBlock = try $ do result <- indentedBlock return $ CodeBlock ("", ["sourceCode", language], []) $ stripTrailingNewlines result + +figureBlock :: Parser [Char] ParserState Block +figureBlock = try $ do + string ".. figure::" + src <- removeLeadingTrailingSpace `fmap` manyTill anyChar newline + body <- indentedBlock + caption <- parseFromString extractCaption body + return $ Para [Image caption (src,"")] + +extractCaption :: Parser [Char] ParserState [Inline] +extractCaption = try $ do + manyTill anyLine blanklines + many inline + -- | The 'math' directive (from Sphinx) for display math. -mathBlock :: GenParser Char st Block +mathBlock :: Parser [Char] st Block mathBlock = try $ do string ".. math::" mathBlockMultiline <|> mathBlockOneLine -mathBlockOneLine :: GenParser Char st Block +mathBlockOneLine :: Parser [Char] st Block mathBlockOneLine = try $ do result <- manyTill anyChar newline blanklines return $ Para [Math DisplayMath $ removeLeadingTrailingSpace result] -mathBlockMultiline :: GenParser Char st Block +mathBlockMultiline :: Parser [Char] st Block mathBlockMultiline = try $ do blanklines result <- indentedBlock @@ -388,7 +403,7 @@ mathBlockMultiline = try $ do $ filter (not . null) $ splitBy null lns' return $ Para $ map (Math DisplayMath) eqs -lhsCodeBlock :: GenParser Char ParserState Block +lhsCodeBlock :: Parser [Char] ParserState Block lhsCodeBlock = try $ do failUnlessLHS optional codeBlockStart @@ -402,7 +417,7 @@ lhsCodeBlock = try $ do blanklines return $ CodeBlock ("", ["sourceCode", "literate", "haskell"], []) $ intercalate "\n" lns' -birdTrackLine :: GenParser Char st [Char] +birdTrackLine :: Parser [Char] st [Char] birdTrackLine = do char '>' manyTill anyChar newline @@ -411,7 +426,7 @@ birdTrackLine = do -- raw html/latex/etc -- -rawBlock :: GenParser Char st Block +rawBlock :: Parser [Char] st Block rawBlock = try $ do string ".. raw:: " lang <- many1 (letter <|> digit) @@ -423,7 +438,7 @@ rawBlock = try $ do -- block quotes -- -blockQuote :: GenParser Char ParserState Block +blockQuote :: Parser [Char] ParserState Block blockQuote = do raw <- indentedBlock -- parse the extracted block, which may contain various block elements: @@ -434,10 +449,10 @@ blockQuote = do -- list blocks -- -list :: GenParser Char ParserState Block +list :: Parser [Char] ParserState Block list = choice [ bulletList, orderedList, definitionList ] <?> "list" -definitionListItem :: GenParser Char ParserState ([Inline], [[Block]]) +definitionListItem :: Parser [Char] ParserState ([Inline], [[Block]]) definitionListItem = try $ do -- avoid capturing a directive or comment notFollowedBy (try $ char '.' >> char '.') @@ -447,11 +462,11 @@ definitionListItem = try $ do contents <- parseFromString parseBlocks $ raw ++ "\n" return (normalizeSpaces term, [contents]) -definitionList :: GenParser Char ParserState Block +definitionList :: Parser [Char] ParserState Block definitionList = many1 definitionListItem >>= return . DefinitionList -- parses bullet list start and returns its length (inc. following whitespace) -bulletListStart :: GenParser Char st Int +bulletListStart :: Parser [Char] st Int bulletListStart = try $ do notFollowedBy' hrule -- because hrules start out just like lists marker <- oneOf bulletListMarkers @@ -461,14 +476,14 @@ bulletListStart = try $ do -- parses ordered list start and returns its length (inc following whitespace) orderedListStart :: ListNumberStyle -> ListNumberDelim - -> GenParser Char ParserState Int + -> Parser [Char] ParserState Int orderedListStart style delim = try $ do (_, markerLen) <- withHorizDisplacement (orderedListMarker style delim) white <- many1 spaceChar return $ markerLen + length white -- parse a line of a list item -listLine :: Int -> GenParser Char ParserState [Char] +listLine :: Int -> Parser [Char] ParserState [Char] listLine markerLength = try $ do notFollowedBy blankline indentWith markerLength @@ -476,7 +491,7 @@ listLine markerLength = try $ do return $ line ++ "\n" -- indent by specified number of spaces (or equiv. tabs) -indentWith :: Int -> GenParser Char ParserState [Char] +indentWith :: Int -> Parser [Char] ParserState [Char] indentWith num = do state <- getState let tabStop = stateTabStop state @@ -486,8 +501,8 @@ indentWith num = do (try (char '\t' >> count (num - tabStop) (char ' '))) ] -- parse raw text for one list item, excluding start marker and continuations -rawListItem :: GenParser Char ParserState Int - -> GenParser Char ParserState (Int, [Char]) +rawListItem :: Parser [Char] ParserState Int + -> Parser [Char] ParserState (Int, [Char]) rawListItem start = try $ do markerLength <- start firstLine <- manyTill anyChar newline @@ -497,14 +512,14 @@ rawListItem start = try $ do -- continuation of a list item - indented and separated by blankline or -- (in compact lists) endline. -- Note: nested lists are parsed as continuations. -listContinuation :: Int -> GenParser Char ParserState [Char] +listContinuation :: Int -> Parser [Char] ParserState [Char] listContinuation markerLength = try $ do blanks <- many1 blankline result <- many1 (listLine markerLength) return $ blanks ++ concat result -listItem :: GenParser Char ParserState Int - -> GenParser Char ParserState [Block] +listItem :: Parser [Char] ParserState Int + -> Parser [Char] ParserState [Block] listItem start = try $ do (markerLength, first) <- rawListItem start rest <- many (listContinuation markerLength) @@ -521,22 +536,40 @@ listItem start = try $ do updateState (\st -> st {stateParserContext = oldContext}) return parsed -orderedList :: GenParser Char ParserState Block +orderedList :: Parser [Char] ParserState Block orderedList = try $ do (start, style, delim) <- lookAhead (anyOrderedListMarker >>~ spaceChar) items <- many1 (listItem (orderedListStart style delim)) let items' = compactify items return $ OrderedList (start, style, delim) items' -bulletList :: GenParser Char ParserState Block +bulletList :: Parser [Char] ParserState Block bulletList = many1 (listItem bulletListStart) >>= return . BulletList . compactify -- +-- default-role block +-- + +defaultRoleBlock :: Parser [Char] ParserState Block +defaultRoleBlock = try $ do + string ".. default-role::" + -- doesn't enforce any restrictions on the role name; embedded spaces shouldn't be allowed, for one + role <- manyTill anyChar newline >>= return . removeLeadingTrailingSpace + updateState $ \s -> s { stateRstDefaultRole = + if null role + then stateRstDefaultRole defaultParserState + else role + } + -- skip body of the directive if it exists + many $ blanklines <|> (spaceChar >> manyTill anyChar newline) + return Null + +-- -- unknown directive (e.g. comment) -- -unknownDirective :: GenParser Char st Block +unknownDirective :: Parser [Char] st Block unknownDirective = try $ do string ".." notFollowedBy (noneOf " \t\n") @@ -548,7 +581,7 @@ unknownDirective = try $ do --- note block --- -noteBlock :: GenParser Char ParserState [Char] +noteBlock :: Parser [Char] ParserState [Char] noteBlock = try $ do startPos <- getPosition string ".." @@ -567,7 +600,7 @@ noteBlock = try $ do -- return blanks so line count isn't affected return $ replicate (sourceLine endPos - sourceLine startPos) '\n' -noteMarker :: GenParser Char ParserState [Char] +noteMarker :: Parser [Char] ParserState [Char] noteMarker = do char '[' res <- many1 digit @@ -580,13 +613,13 @@ noteMarker = do -- reference key -- -quotedReferenceName :: GenParser Char ParserState [Inline] +quotedReferenceName :: Parser [Char] ParserState [Inline] quotedReferenceName = try $ do char '`' >> notFollowedBy (char '`') -- `` means inline code! label' <- many1Till inline (char '`') return label' -unquotedReferenceName :: GenParser Char ParserState [Inline] +unquotedReferenceName :: Parser [Char] ParserState [Inline] unquotedReferenceName = try $ do label' <- many1Till inline (lookAhead $ char ':') return label' @@ -595,24 +628,24 @@ unquotedReferenceName = try $ do -- plus isolated (no two adjacent) internal hyphens, underscores, -- periods, colons and plus signs; no whitespace or other characters -- are allowed. -simpleReferenceName' :: GenParser Char st String +simpleReferenceName' :: Parser [Char] st String simpleReferenceName' = do x <- alphaNum xs <- many $ alphaNum <|> (try $ oneOf "-_:+." >> lookAhead alphaNum) return (x:xs) -simpleReferenceName :: GenParser Char st [Inline] +simpleReferenceName :: Parser [Char] st [Inline] simpleReferenceName = do raw <- simpleReferenceName' return [Str raw] -referenceName :: GenParser Char ParserState [Inline] +referenceName :: Parser [Char] ParserState [Inline] referenceName = quotedReferenceName <|> (try $ simpleReferenceName >>~ lookAhead (char ':')) <|> unquotedReferenceName -referenceKey :: GenParser Char ParserState [Char] +referenceKey :: Parser [Char] ParserState [Char] referenceKey = do startPos <- getPosition (key, target) <- choice [imageKey, anonymousKey, regularKey] @@ -624,7 +657,7 @@ referenceKey = do -- return enough blanks to replace key return $ replicate (sourceLine endPos - sourceLine startPos) '\n' -targetURI :: GenParser Char st [Char] +targetURI :: Parser [Char] st [Char] targetURI = do skipSpaces optional newline @@ -633,7 +666,7 @@ targetURI = do blanklines return $ escapeURI $ removeLeadingTrailingSpace $ contents -imageKey :: GenParser Char ParserState (Key, Target) +imageKey :: Parser [Char] ParserState (Key, Target) imageKey = try $ do string ".. |" ref <- manyTill inline (char '|') @@ -642,14 +675,14 @@ imageKey = try $ do src <- targetURI return (toKey (normalizeSpaces ref), (src, "")) -anonymousKey :: GenParser Char st (Key, Target) +anonymousKey :: Parser [Char] st (Key, Target) anonymousKey = try $ do oneOfStrings [".. __:", "__"] src <- targetURI pos <- getPosition return (toKey [Str $ "_" ++ printf "%09d" (sourceLine pos)], (src, "")) -regularKey :: GenParser Char ParserState (Key, Target) +regularKey :: Parser [Char] ParserState (Key, Target) regularKey = try $ do string ".. _" ref <- referenceName @@ -674,31 +707,31 @@ regularKey = try $ do -- Grid tables TODO: -- - column spans -dashedLine :: Char -> GenParser Char st (Int, Int) +dashedLine :: Char -> Parser [Char] st (Int, Int) dashedLine ch = do dashes <- many1 (char ch) sp <- many (char ' ') return (length dashes, length $ dashes ++ sp) -simpleDashedLines :: Char -> GenParser Char st [(Int,Int)] +simpleDashedLines :: Char -> Parser [Char] st [(Int,Int)] simpleDashedLines ch = try $ many1 (dashedLine ch) -- Parse a table row separator -simpleTableSep :: Char -> GenParser Char ParserState Char +simpleTableSep :: Char -> Parser [Char] ParserState Char simpleTableSep ch = try $ simpleDashedLines ch >> newline -- Parse a table footer -simpleTableFooter :: GenParser Char ParserState [Char] +simpleTableFooter :: Parser [Char] ParserState [Char] simpleTableFooter = try $ simpleTableSep '=' >> blanklines -- Parse a raw line and split it into chunks by indices. -simpleTableRawLine :: [Int] -> GenParser Char ParserState [String] +simpleTableRawLine :: [Int] -> Parser [Char] ParserState [String] simpleTableRawLine indices = do line <- many1Till anyChar newline return (simpleTableSplitLine indices line) -- Parse a table row and return a list of blocks (columns). -simpleTableRow :: [Int] -> GenParser Char ParserState [[Block]] +simpleTableRow :: [Int] -> Parser [Char] ParserState [[Block]] simpleTableRow indices = do notFollowedBy' simpleTableFooter firstLine <- simpleTableRawLine indices @@ -712,7 +745,7 @@ simpleTableSplitLine indices line = $ tail $ splitByIndices (init indices) line simpleTableHeader :: Bool -- ^ Headerless table - -> GenParser Char ParserState ([[Block]], [Alignment], [Int]) + -> Parser [Char] ParserState ([[Block]], [Alignment], [Int]) simpleTableHeader headless = try $ do optional blanklines rawContent <- if headless @@ -732,7 +765,7 @@ simpleTableHeader headless = try $ do -- Parse a simple table. simpleTable :: Bool -- ^ Headerless table - -> GenParser Char ParserState Block + -> Parser [Char] ParserState Block simpleTable headless = do Table c a _w h l <- tableWith (simpleTableHeader headless) simpleTableRow sep simpleTableFooter (return []) -- Simple tables get 0s for relative column widths (i.e., use default) @@ -741,10 +774,10 @@ simpleTable headless = do sep = return () -- optional (simpleTableSep '-') gridTable :: Bool -- ^ Headerless table - -> GenParser Char ParserState Block + -> Parser [Char] ParserState Block gridTable = gridTableWith block (return []) -table :: GenParser Char ParserState Block +table :: Parser [Char] ParserState Block table = gridTable False <|> simpleTable False <|> gridTable True <|> simpleTable True <?> "table" @@ -753,7 +786,7 @@ table = gridTable False <|> simpleTable False <|> -- inline -- -inline :: GenParser Char ParserState Inline +inline :: Parser [Char] ParserState Inline inline = choice [ whitespace , link , str @@ -771,66 +804,90 @@ inline = choice [ whitespace , escapedChar , symbol ] <?> "inline" -hyphens :: GenParser Char ParserState Inline +hyphens :: Parser [Char] ParserState Inline hyphens = do result <- many1 (char '-') option Space endline -- don't want to treat endline after hyphen or dash as a space return $ Str result -escapedChar :: GenParser Char st Inline +escapedChar :: Parser [Char] st Inline escapedChar = do c <- escaped anyChar - return $ Str [c] + return $ if c == ' ' -- '\ ' is null in RST + then Str "" + else Str [c] -symbol :: GenParser Char ParserState Inline +symbol :: Parser [Char] ParserState Inline symbol = do result <- oneOf specialChars return $ Str [result] -- parses inline code, between codeStart and codeEnd -code :: GenParser Char ParserState Inline +code :: Parser [Char] ParserState Inline code = try $ do string "``" result <- manyTill anyChar (try (string "``")) return $ Code nullAttr $ removeLeadingTrailingSpace $ intercalate " " $ lines result -emph :: GenParser Char ParserState Inline -emph = enclosed (char '*') (char '*') inline >>= +-- succeeds only if we're not right after a str (ie. in middle of word) +atStart :: Parser [Char] ParserState a -> Parser [Char] ParserState a +atStart p = do + pos <- getPosition + st <- getState + -- single quote start can't be right after str + guard $ stateLastStrPos st /= Just pos + p + +emph :: Parser [Char] ParserState Inline +emph = enclosed (atStart $ char '*') (char '*') inline >>= return . Emph . normalizeSpaces -strong :: GenParser Char ParserState Inline -strong = enclosed (string "**") (try $ string "**") inline >>= +strong :: Parser [Char] ParserState Inline +strong = enclosed (atStart $ string "**") (try $ string "**") inline >>= return . Strong . normalizeSpaces -interpreted :: [Char] -> GenParser Char st [Char] +-- Parses inline interpreted text which is required to have the given role. +-- This decision is based on the role marker (if present), +-- and the current default interpreted text role. +interpreted :: [Char] -> Parser [Char] ParserState [Char] interpreted role = try $ do - optional $ try $ string "\\ " - result <- enclosed (string $ ":" ++ role ++ ":`") (char '`') anyChar - try (string "\\ ") <|> lookAhead (count 1 $ oneOf " \t\n") <|> (eof >> return "") - return result - -superscript :: GenParser Char ParserState Inline + state <- getState + if role == stateRstDefaultRole state + then try markedInterpretedText <|> unmarkedInterpretedText + else markedInterpretedText + where + markedInterpretedText = try (roleMarker >> unmarkedInterpretedText) + <|> (unmarkedInterpretedText >>= (\txt -> roleMarker >> return txt)) + roleMarker = string $ ":" ++ role ++ ":" + -- Note, this doesn't precisely implement the complex rule in + -- http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules + -- but it should be good enough for most purposes + unmarkedInterpretedText = do + result <- enclosed (atStart $ char '`') (char '`') anyChar + return result + +superscript :: Parser [Char] ParserState Inline superscript = interpreted "sup" >>= \x -> return (Superscript [Str x]) -subscript :: GenParser Char ParserState Inline +subscript :: Parser [Char] ParserState Inline subscript = interpreted "sub" >>= \x -> return (Subscript [Str x]) -math :: GenParser Char ParserState Inline +math :: Parser [Char] ParserState Inline math = interpreted "math" >>= \x -> return (Math InlineMath x) -whitespace :: GenParser Char ParserState Inline +whitespace :: Parser [Char] ParserState Inline whitespace = many1 spaceChar >> return Space <?> "whitespace" -str :: GenParser Char ParserState Inline +str :: Parser [Char] ParserState Inline str = do - result <- many1 (noneOf (specialChars ++ "\t\n ")) - pos <- getPosition - updateState $ \s -> s{ stateLastStrPos = Just pos } + let strChar = noneOf ("\t\n " ++ specialChars) + result <- many1 strChar + updateLastStrPos return $ Str result -- an endline character that can be treated as a space, not a structural break -endline :: GenParser Char ParserState Inline +endline :: Parser [Char] ParserState Inline endline = try $ do newline notFollowedBy blankline @@ -846,10 +903,10 @@ endline = try $ do -- links -- -link :: GenParser Char ParserState Inline +link :: Parser [Char] ParserState Inline link = choice [explicitLink, referenceLink, autoLink] <?> "link" -explicitLink :: GenParser Char ParserState Inline +explicitLink :: Parser [Char] ParserState Inline explicitLink = try $ do char '`' notFollowedBy (char '`') -- `` marks start of inline code @@ -861,7 +918,7 @@ explicitLink = try $ do return $ Link (normalizeSpaces label') (escapeURI $ removeLeadingTrailingSpace src, "") -referenceLink :: GenParser Char ParserState Inline +referenceLink :: Parser [Char] ParserState Inline referenceLink = try $ do label' <- (quotedReferenceName <|> simpleReferenceName) >>~ char '_' state <- getState @@ -873,7 +930,7 @@ referenceLink = try $ do do char '_' let anonKeys = sort $ filter isAnonKey $ M.keys keyTable if null anonKeys - then pzero + then mzero else return (head anonKeys) (src,tit) <- case lookupKeySrc keyTable key of Nothing -> fail "no corresponding key" @@ -882,21 +939,21 @@ referenceLink = try $ do when (isAnonKey key) $ updateState $ \s -> s{ stateKeys = M.delete key keyTable } return $ Link (normalizeSpaces label') (src, tit) -autoURI :: GenParser Char ParserState Inline +autoURI :: Parser [Char] ParserState Inline autoURI = do (orig, src) <- uri return $ Link [Str orig] (src, "") -autoEmail :: GenParser Char ParserState Inline +autoEmail :: Parser [Char] ParserState Inline autoEmail = do (orig, src) <- emailAddress return $ Link [Str orig] (src, "") -autoLink :: GenParser Char ParserState Inline +autoLink :: Parser [Char] ParserState Inline autoLink = autoURI <|> autoEmail -- For now, we assume that all substitution references are for images. -image :: GenParser Char ParserState Inline +image :: Parser [Char] ParserState Inline image = try $ do char '|' ref <- manyTill inline (char '|') @@ -907,7 +964,7 @@ image = try $ do Just target -> return target return $ Image (normalizeSpaces ref) (src, tit) -note :: GenParser Char ParserState Inline +note :: Parser [Char] ParserState Inline note = try $ do ref <- noteMarker char '_' diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index 3b5954368..71ba26c8c 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Textile - Copyright : Copyright (C) 2010-2011 Paul Rivier and John MacFarlane + Copyright : Copyright (C) 2010-2012 Paul Rivier and John MacFarlane License : GNU GPL, version 2 or above Maintainer : Paul Rivier <paul*rivier#demotera*com> @@ -59,10 +59,11 @@ import Text.Pandoc.Definition import Text.Pandoc.Shared import Text.Pandoc.Parsing import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag ) -import Text.ParserCombinators.Parsec +import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock ) import Text.HTML.TagSoup.Match -import Data.Char ( digitToInt, isLetter ) +import Data.Char ( digitToInt, isUpper ) import Control.Monad ( guard, liftM ) +import Control.Applicative ((<$>), (*>), (<*)) -- | Parse a Textile text and return a Pandoc document. readTextile :: ParserState -- ^ Parser state, including options for parser @@ -72,16 +73,8 @@ readTextile state s = (readWith parseTextile) state{ stateOldDashes = True } (s ++ "\n\n") --- --- Constants and data structure definitions --- - --- | Special chars border strings parsing -specialChars :: [Char] -specialChars = "\\[]<>*#_@~-+^&,.;:!?|\"'%()" - -- | Generate a Pandoc ADT from a textile document -parseTextile :: GenParser Char ParserState Pandoc +parseTextile :: Parser [Char] ParserState Pandoc parseTextile = do -- textile allows raw HTML and does smart punctuation by default updateState (\state -> state { stateParseRaw = True, stateSmart = True }) @@ -99,10 +92,10 @@ parseTextile = do blocks <- parseBlocks return $ Pandoc (Meta [] [] []) blocks -- FIXME -noteMarker :: GenParser Char ParserState [Char] +noteMarker :: Parser [Char] ParserState [Char] noteMarker = skipMany spaceChar >> string "fn" >> manyTill digit (char '.') -noteBlock :: GenParser Char ParserState [Char] +noteBlock :: Parser [Char] ParserState [Char] noteBlock = try $ do startPos <- getPosition ref <- noteMarker @@ -117,36 +110,37 @@ noteBlock = try $ do return $ replicate (sourceLine endPos - sourceLine startPos) '\n' -- | Parse document blocks -parseBlocks :: GenParser Char ParserState [Block] +parseBlocks :: Parser [Char] ParserState [Block] parseBlocks = manyTill block eof -- | Block parsers list tried in definition order -blockParsers :: [GenParser Char ParserState Block] +blockParsers :: [Parser [Char] ParserState Block] blockParsers = [ codeBlock , header , blockQuote , hrule , anyList , rawHtmlBlock + , rawLaTeXBlock' , maybeExplicitBlock "table" table , maybeExplicitBlock "p" para , nullBlock ] -- | Any block in the order of definition of blockParsers -block :: GenParser Char ParserState Block +block :: Parser [Char] ParserState Block block = choice blockParsers <?> "block" -codeBlock :: GenParser Char ParserState Block +codeBlock :: Parser [Char] ParserState Block codeBlock = codeBlockBc <|> codeBlockPre -codeBlockBc :: GenParser Char ParserState Block +codeBlockBc :: Parser [Char] ParserState Block codeBlockBc = try $ do string "bc. " contents <- manyTill anyLine blanklines return $ CodeBlock ("",[],[]) $ unlines contents -- | Code Blocks in Textile are between <pre> and </pre> -codeBlockPre :: GenParser Char ParserState Block +codeBlockPre :: Parser [Char] ParserState Block codeBlockPre = try $ do htmlTag (tagOpen (=="pre") null) result' <- manyTill anyChar (try $ htmlTag (tagClose (=="pre")) >> blockBreak) @@ -161,28 +155,23 @@ codeBlockPre = try $ do return $ CodeBlock ("",[],[]) result''' -- | Header of the form "hN. content" with N in 1..6 -header :: GenParser Char ParserState Block +header :: Parser [Char] ParserState Block header = try $ do char 'h' - level <- oneOf "123456" >>= return . digitToInt - optional attributes - char '.' - whitespace - name <- manyTill inline blockBreak - return $ Header level (normalizeSpaces name) + level <- digitToInt <$> oneOf "123456" + optional attributes >> char '.' >> whitespace + name <- normalizeSpaces <$> manyTill inline blockBreak + return $ Header level name -- | Blockquote of the form "bq. content" -blockQuote :: GenParser Char ParserState Block +blockQuote :: Parser [Char] ParserState Block blockQuote = try $ do - string "bq" - optional attributes - char '.' - whitespace - para >>= return . BlockQuote . (:[]) + string "bq" >> optional attributes >> char '.' >> whitespace + BlockQuote . singleton <$> para -- Horizontal rule -hrule :: GenParser Char st Block +hrule :: Parser [Char] st Block hrule = try $ do skipSpaces start <- oneOf "-*" @@ -197,73 +186,62 @@ hrule = try $ do -- | Can be a bullet list or an ordered list. This implementation is -- strict in the nesting, sublist must start at exactly "parent depth -- plus one" -anyList :: GenParser Char ParserState Block -anyList = try $ do - l <- anyListAtDepth 1 - blanklines - return l +anyList :: Parser [Char] ParserState Block +anyList = try $ ( (anyListAtDepth 1) <* blanklines ) -- | This allow one type of list to be nested into an other type, -- provided correct nesting -anyListAtDepth :: Int -> GenParser Char ParserState Block +anyListAtDepth :: Int -> Parser [Char] ParserState Block anyListAtDepth depth = choice [ bulletListAtDepth depth, orderedListAtDepth depth, definitionList ] -- | Bullet List of given depth, depth being the number of leading '*' -bulletListAtDepth :: Int -> GenParser Char ParserState Block -bulletListAtDepth depth = try $ do - items <- many1 (bulletListItemAtDepth depth) - return (BulletList items) +bulletListAtDepth :: Int -> Parser [Char] ParserState Block +bulletListAtDepth depth = try $ BulletList <$> many1 (bulletListItemAtDepth depth) -- | Bullet List Item of given depth, depth being the number of -- leading '*' -bulletListItemAtDepth :: Int -> GenParser Char ParserState [Block] -bulletListItemAtDepth depth = try $ do - count depth (char '*') - optional attributes - whitespace - p <- inlines >>= return . Plain - sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[])) - return (p:sublist) +bulletListItemAtDepth :: Int -> Parser [Char] ParserState [Block] +bulletListItemAtDepth = genericListItemAtDepth '*' -- | Ordered List of given depth, depth being the number of --- leading '#' -orderedListAtDepth :: Int -> GenParser Char ParserState Block +-- leading '#' +orderedListAtDepth :: Int -> Parser [Char] ParserState Block orderedListAtDepth depth = try $ do items <- many1 (orderedListItemAtDepth depth) return (OrderedList (1, DefaultStyle, DefaultDelim) items) -- | Ordered List Item of given depth, depth being the number of --- leading '#' -orderedListItemAtDepth :: Int -> GenParser Char ParserState [Block] -orderedListItemAtDepth depth = try $ do - count depth (char '#') - optional attributes - whitespace - p <- inlines >>= return . Plain - sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[])) - return (p:sublist) +-- leading '#' +orderedListItemAtDepth :: Int -> Parser [Char] ParserState [Block] +orderedListItemAtDepth = genericListItemAtDepth '#' + +-- | Common implementation of list items +genericListItemAtDepth :: Char -> Int -> Parser [Char] ParserState [Block] +genericListItemAtDepth c depth = try $ do + count depth (char c) >> optional attributes >> whitespace + p <- inlines + sublist <- option [] (singleton <$> anyListAtDepth (depth + 1)) + return ((Plain p):sublist) -- | A definition list is a set of consecutive definition items -definitionList :: GenParser Char ParserState Block -definitionList = try $ do - items <- many1 definitionListItem - return $ DefinitionList items +definitionList :: Parser [Char] ParserState Block +definitionList = try $ DefinitionList <$> many1 definitionListItem -- | A definition list item in textile begins with '- ', followed by -- the term defined, then spaces and ":=". The definition follows, on -- the same single line, or spaned on multiple line, after a line -- break. -definitionListItem :: GenParser Char ParserState ([Inline], [[Block]]) +definitionListItem :: Parser [Char] ParserState ([Inline], [[Block]]) definitionListItem = try $ do string "- " term <- many1Till inline (try (whitespace >> string ":=")) def <- inlineDef <|> multilineDef return (term, def) - where inlineDef :: GenParser Char ParserState [[Block]] + where inlineDef :: Parser [Char] ParserState [[Block]] inlineDef = liftM (\d -> [[Plain d]]) $ try (whitespace >> inlines) - multilineDef :: GenParser Char ParserState [[Block]] + multilineDef :: Parser [Char] ParserState [[Block]] multilineDef = try $ do optional whitespace >> newline s <- many1Till anyChar (try (string "=:" >> newline)) @@ -273,59 +251,57 @@ definitionListItem = try $ do -- | This terminates a block such as a paragraph. Because of raw html -- blocks support, we have to lookAhead for a rawHtmlBlock. -blockBreak :: GenParser Char ParserState () +blockBreak :: Parser [Char] ParserState () blockBreak = try (newline >> blanklines >> return ()) <|> (lookAhead rawHtmlBlock >> return ()) +-- raw content + -- | A raw Html Block, optionally followed by blanklines -rawHtmlBlock :: GenParser Char ParserState Block +rawHtmlBlock :: Parser [Char] ParserState Block rawHtmlBlock = try $ do (_,b) <- htmlTag isBlockTag optional blanklines return $ RawBlock "html" b +-- | Raw block of LaTeX content +rawLaTeXBlock' :: Parser [Char] ParserState Block +rawLaTeXBlock' = do + failIfStrict + RawBlock "latex" <$> (rawLaTeXBlock <* spaces) + + -- | In textile, paragraphs are separated by blank lines. -para :: GenParser Char ParserState Block -para = try $ do - content <- manyTill inline blockBreak - return $ Para $ normalizeSpaces content +para :: Parser [Char] ParserState Block +para = try $ Para . normalizeSpaces <$> manyTill inline blockBreak -- Tables -- | A table cell spans until a pipe | -tableCell :: GenParser Char ParserState TableCell +tableCell :: Parser [Char] ParserState TableCell tableCell = do c <- many1 (noneOf "|\n") content <- parseFromString (many1 inline) c return $ [ Plain $ normalizeSpaces content ] -- | A table row is made of many table cells -tableRow :: GenParser Char ParserState [TableCell] -tableRow = try $ do - char '|' - cells <- endBy1 tableCell (char '|') - newline - return cells +tableRow :: Parser [Char] ParserState [TableCell] +tableRow = try $ ( char '|' *> (endBy1 tableCell (char '|')) <* newline) -- | Many table rows -tableRows :: GenParser Char ParserState [[TableCell]] +tableRows :: Parser [Char] ParserState [[TableCell]] tableRows = many1 tableRow -- | Table headers are made of cells separated by a tag "|_." -tableHeaders :: GenParser Char ParserState [TableCell] -tableHeaders = try $ do - let separator = (try $ string "|_.") - separator - headers <- sepBy1 tableCell separator - char '|' - newline - return headers +tableHeaders :: Parser [Char] ParserState [TableCell] +tableHeaders = let separator = (try $ string "|_.") in + try $ ( separator *> (sepBy1 tableCell separator) <* char '|' <* newline ) -- | A table with an optional header. Current implementation can -- handle tables with and without header, but will parse cells -- alignment attributes as content. -table :: GenParser Char ParserState Block +table :: Parser [Char] ParserState Block table = try $ do headers <- option [] tableHeaders rows <- tableRows @@ -341,8 +317,8 @@ table = try $ do -- | Blocks like 'p' and 'table' do not need explicit block tag. -- However, they can be used to set HTML/CSS attributes when needed. maybeExplicitBlock :: String -- ^ block tag name - -> GenParser Char ParserState Block -- ^ implicit block - -> GenParser Char ParserState Block + -> Parser [Char] ParserState Block -- ^ implicit block + -> Parser [Char] ParserState Block maybeExplicitBlock name blk = try $ do optional $ try $ string name >> optional attributes >> char '.' >> ((try whitespace) <|> endline) @@ -356,31 +332,27 @@ maybeExplicitBlock name blk = try $ do -- | Any inline element -inline :: GenParser Char ParserState Inline +inline :: Parser [Char] ParserState Inline inline = choice inlineParsers <?> "inline" -- | List of consecutive inlines before a newline -inlines :: GenParser Char ParserState [Inline] +inlines :: Parser [Char] ParserState [Inline] inlines = manyTill inline newline -- | Inline parsers tried in order -inlineParsers :: [GenParser Char ParserState Inline] +inlineParsers :: [Parser [Char] ParserState Inline] inlineParsers = [ autoLink , str , whitespace , endline , code + , escapedInline , htmlSpan , rawHtmlInline + , rawLaTeXInline' , note - , simpleInline (string "??") (Cite []) - , simpleInline (string "**") Strong - , simpleInline (string "__") Emph - , simpleInline (char '*') Strong - , simpleInline (char '_') Emph - , simpleInline (char '-') Strikeout - , simpleInline (char '^') Superscript - , simpleInline (char '~') Subscript + , try $ (char '[' *> inlineMarkup <* char ']') + , inlineMarkup , link , image , mark @@ -388,97 +360,140 @@ inlineParsers = [ autoLink , symbol ] +-- | Inline markups +inlineMarkup :: Parser [Char] ParserState Inline +inlineMarkup = choice [ simpleInline (string "??") (Cite []) + , simpleInline (string "**") Strong + , simpleInline (string "__") Emph + , simpleInline (char '*') Strong + , simpleInline (char '_') Emph + , simpleInline (char '+') Emph -- approximates underline + , simpleInline (char '-') Strikeout + , simpleInline (char '^') Superscript + , simpleInline (char '~') Subscript + ] + -- | Trademark, registered, copyright -mark :: GenParser Char st Inline +mark :: Parser [Char] st Inline mark = try $ char '(' >> (try tm <|> try reg <|> copy) -reg :: GenParser Char st Inline +reg :: Parser [Char] st Inline reg = do oneOf "Rr" char ')' return $ Str "\174" -tm :: GenParser Char st Inline +tm :: Parser [Char] st Inline tm = do oneOf "Tt" oneOf "Mm" char ')' return $ Str "\8482" -copy :: GenParser Char st Inline +copy :: Parser [Char] st Inline copy = do oneOf "Cc" char ')' return $ Str "\169" -note :: GenParser Char ParserState Inline +note :: Parser [Char] ParserState Inline note = try $ do - char '[' - ref <- many1 digit - char ']' - state <- getState - let notes = stateNotes state + ref <- (char '[' *> many1 digit <* char ']') + notes <- stateNotes <$> getState case lookup ref notes of Nothing -> fail "note not found" Just raw -> liftM Note $ parseFromString parseBlocks raw +-- | Special chars +markupChars :: [Char] +markupChars = "\\[]*#_@~-+^|%=" + +-- | Break strings on following chars. Space tab and newline break for +-- inlines breaking. Open paren breaks for mark. Quote, dash and dot +-- break for smart punctuation. Punctuation breaks for regular +-- punctuation. Double quote breaks for named links. > and < break +-- for inline html. +stringBreakers :: [Char] +stringBreakers = " \t\n('-.,:!?;\"<>" + +wordBoundaries :: [Char] +wordBoundaries = markupChars ++ stringBreakers + +-- | Parse a hyphened sequence of words +hyphenedWords :: Parser [Char] ParserState String +hyphenedWords = try $ do + hd <- noneOf wordBoundaries + tl <- many ( (noneOf wordBoundaries) <|> + try (oneOf markupChars <* lookAhead (noneOf wordBoundaries) ) ) + let wd = hd:tl + option wd $ try $ + (\r -> concat [wd, "-", r]) <$> (char '-' *> hyphenedWords) + -- | Any string -str :: GenParser Char ParserState Inline +str :: Parser [Char] ParserState Inline str = do - xs <- many1 (noneOf (specialChars ++ "\t\n ")) - optional $ try $ do - lookAhead (char '(') - notFollowedBy' mark - getInput >>= setInput . (' ':) -- add space before acronym explanation - -- parse a following hyphen if followed by a letter - -- (this prevents unwanted interpretation as starting a strikeout section) - result <- option xs $ try $ do - char '-' - next <- lookAhead letter - guard $ isLetter (last xs) || isLetter next - return $ xs ++ "-" - pos <- getPosition - updateState $ \s -> s{ stateLastStrPos = Just pos } - return $ Str result + baseStr <- hyphenedWords + -- RedCloth compliance : if parsed word is uppercase and immediatly + -- followed by parens, parens content is unconditionally word acronym + fullStr <- option baseStr $ try $ do + guard $ all isUpper baseStr + acro <- enclosed (char '(') (char ')') anyChar + return $ concat [baseStr, " (", acro, ")"] + updateLastStrPos + return $ Str fullStr -- | Textile allows HTML span infos, we discard them -htmlSpan :: GenParser Char ParserState Inline -htmlSpan = try $ do - char '%' - _ <- attributes - content <- manyTill anyChar (char '%') - return $ Str content +htmlSpan :: Parser [Char] ParserState Inline +htmlSpan = try $ Str <$> ( char '%' *> attributes *> manyTill anyChar (char '%') ) -- | Some number of space chars -whitespace :: GenParser Char ParserState Inline +whitespace :: Parser [Char] ParserState Inline whitespace = many1 spaceChar >> return Space <?> "whitespace" -- | In Textile, an isolated endline character is a line break -endline :: GenParser Char ParserState Inline +endline :: Parser [Char] ParserState Inline endline = try $ do newline >> notFollowedBy blankline return LineBreak -rawHtmlInline :: GenParser Char ParserState Inline -rawHtmlInline = liftM (RawInline "html" . snd) - $ htmlTag isInlineTag +rawHtmlInline :: Parser [Char] ParserState Inline +rawHtmlInline = RawInline "html" . snd <$> htmlTag isInlineTag + +-- | Raw LaTeX Inline +rawLaTeXInline' :: Parser [Char] ParserState Inline +rawLaTeXInline' = try $ do + failIfStrict + rawLaTeXInline + +-- | Textile standard link syntax is "label":target. But we +-- can also have ["label":target]. +link :: Parser [Char] ParserState Inline +link = linkB <|> linkNoB + +linkNoB :: Parser [Char] ParserState Inline +linkNoB = try $ do + name <- surrounded (char '"') inline + char ':' + let stopChars = "!.,;:" + url <- manyTill nonspaceChar (lookAhead $ space <|> try (oneOf stopChars >> (space <|> newline))) + return $ Link name (url, "") --- | Textile standard link syntax is "label":target -link :: GenParser Char ParserState Inline -link = try $ do +linkB :: Parser [Char] ParserState Inline +linkB = try $ do + char '[' name <- surrounded (char '"') inline char ':' - url <- manyTill (anyChar) (lookAhead $ (space <|> try (oneOf ".;,:" >> (space <|> newline)))) + url <- manyTill nonspaceChar (char ']') return $ Link name (url, "") -- | Detect plain links to http or email. -autoLink :: GenParser Char ParserState Inline +autoLink :: Parser [Char] ParserState Inline autoLink = do (orig, src) <- (try uri <|> try emailAddress) return $ Link [Str orig] (src, "") -- | image embedding -image :: GenParser Char ParserState Inline +image :: Parser [Char] ParserState Inline image = try $ do char '!' >> notFollowedBy space src <- manyTill anyChar (lookAhead $ oneOf "!(") @@ -486,41 +501,53 @@ image = try $ do char '!' return $ Image [Str alt] (src, alt) --- | Any special symbol defined in specialChars -symbol :: GenParser Char ParserState Inline -symbol = do - result <- oneOf specialChars - return $ Str [result] +escapedInline :: Parser [Char] ParserState Inline +escapedInline = escapedEqs <|> escapedTag + +escapedEqs :: Parser [Char] ParserState Inline +escapedEqs = Str <$> (try $ string "==" *> manyTill anyChar (try $ string "==")) + +-- | literal text escaped btw <notextile> tags +escapedTag :: Parser [Char] ParserState Inline +escapedTag = Str <$> + (try $ string "<notextile>" *> manyTill anyChar (try $ string "</notextile>")) + +-- | Any special symbol defined in wordBoundaries +symbol :: Parser [Char] ParserState Inline +symbol = Str . singleton <$> oneOf wordBoundaries -- | Inline code -code :: GenParser Char ParserState Inline +code :: Parser [Char] ParserState Inline code = code1 <|> code2 -code1 :: GenParser Char ParserState Inline -code1 = surrounded (char '@') anyChar >>= return . Code nullAttr +code1 :: Parser [Char] ParserState Inline +code1 = Code nullAttr <$> surrounded (char '@') anyChar -code2 :: GenParser Char ParserState Inline +code2 :: Parser [Char] ParserState Inline code2 = do htmlTag (tagOpen (=="tt") null) - result' <- manyTill anyChar (try $ htmlTag $ tagClose (=="tt")) - return $ Code nullAttr result' + Code nullAttr <$> manyTill anyChar (try $ htmlTag $ tagClose (=="tt")) -- | Html / CSS attributes -attributes :: GenParser Char ParserState String +attributes :: Parser [Char] ParserState String attributes = choice [ enclosed (char '(') (char ')') anyChar, enclosed (char '{') (char '}') anyChar, enclosed (char '[') (char ']') anyChar] -- | Parses material surrounded by a parser. -surrounded :: GenParser Char st t -- ^ surrounding parser - -> GenParser Char st a -- ^ content parser (to be used repeatedly) - -> GenParser Char st [a] -surrounded border = enclosed border border +surrounded :: Parser [Char] st t -- ^ surrounding parser + -> Parser [Char] st a -- ^ content parser (to be used repeatedly) + -> Parser [Char] st [a] +surrounded border = enclosed border (try border) -- | Inlines are most of the time of the same form -simpleInline :: GenParser Char ParserState t -- ^ surrounding parser +simpleInline :: Parser [Char] ParserState t -- ^ surrounding parser -> ([Inline] -> Inline) -- ^ Inline constructor - -> GenParser Char ParserState Inline -- ^ content parser (to be used repeatedly) + -> Parser [Char] ParserState Inline -- ^ content parser (to be used repeatedly) simpleInline border construct = surrounded border (inlineWithAttribute) >>= return . construct . normalizeSpaces where inlineWithAttribute = (try $ optional attributes) >> inline + +-- | Create a singleton list +singleton :: a -> [a] +singleton x = [x] diff --git a/src/Text/Pandoc/SelfContained.hs b/src/Text/Pandoc/SelfContained.hs index 9332a3fa0..a80ab0c63 100644 --- a/src/Text/Pandoc/SelfContained.hs +++ b/src/Text/Pandoc/SelfContained.hs @@ -121,8 +121,12 @@ cssURLs userdata d orig = let url = toString $ case B.take 1 u of "\"" -> B.takeWhile (/='"') $ B.drop 1 u + "'" -> B.takeWhile (/='\'') $ B.drop 1 u _ -> u - (raw, mime) <- getRaw userdata "" (d </> url) + let url' = if isAbsoluteURI url + then url + else d </> url + (raw, mime) <- getRaw userdata "" url' rest <- cssURLs userdata d v let enc = "data:" `B.append` fromString mime `B.append` ";base64," `B.append` (encode raw) diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index cd5b19164..6c8904010 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -72,7 +72,7 @@ module Text.Pandoc.Shared ( readDataFile, -- * Error handling err, - warn, + warn ) where import Text.Pandoc.Definition @@ -94,6 +94,7 @@ import Text.Pandoc.Highlighting (Style, pygments) import Text.Pandoc.Pretty (charWidth) import System.Locale (defaultTimeLocale) import Data.Time +import Data.Default import System.IO (stderr) -- @@ -482,6 +483,7 @@ data ObfuscationMethod = NoObfuscation -- | Varieties of HTML slide shows. data HTMLSlideVariant = S5Slides | SlidySlides + | SlideousSlides | DZSlides | NoSlides deriving (Show, Read, Eq) @@ -494,7 +496,7 @@ data WriterOptions = WriterOptions , writerEPUBMetadata :: String -- ^ Metadata to include in EPUB , writerTabStop :: Int -- ^ Tabstop for conversion btw spaces and tabs , writerTableOfContents :: Bool -- ^ Include table of contents - , writerSlideVariant :: HTMLSlideVariant -- ^ Are we writing S5 or Slidy? + , writerSlideVariant :: HTMLSlideVariant -- ^ Are we writing S5, Slidy or Slideous? , writerIncremental :: Bool -- ^ True if lists should be incremental , writerXeTeX :: Bool -- ^ Create latex suitable for use by xetex , writerHTMLMathMethod :: HTMLMathMethod -- ^ How to print math in HTML @@ -520,8 +522,12 @@ data WriterOptions = WriterOptions , writerHighlight :: Bool -- ^ Highlight source code , writerHighlightStyle :: Style -- ^ Style to use for highlighting , writerSetextHeaders :: Bool -- ^ Use setext headers for levels 1-2 in markdown + , writerTeXLigatures :: Bool -- ^ Use tex ligatures quotes, dashes in latex } deriving Show +instance Default WriterOptions where + def = defaultWriterOptions + {-# DEPRECATED writerXeTeX "writerXeTeX no longer does anything" #-} -- | Default writer options. defaultWriterOptions :: WriterOptions @@ -558,6 +564,7 @@ defaultWriterOptions = , writerHighlight = False , writerHighlightStyle = pygments , writerSetextHeaders = True + , writerTeXLigatures = True } -- diff --git a/src/Text/Pandoc/Slides.hs b/src/Text/Pandoc/Slides.hs index 1df556d38..fe9b60720 100644 --- a/src/Text/Pandoc/Slides.hs +++ b/src/Text/Pandoc/Slides.hs @@ -26,7 +26,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Portability : portable Utility functions for splitting documents into slides for slide -show formats (dzslides, s5, slidy, beamer). +show formats (dzslides, s5, slidy, slideous, beamer). -} module Text.Pandoc.Slides ( getSlideLevel, prepSlides ) where import Text.Pandoc.Definition @@ -49,9 +49,10 @@ prepSlides :: Int -> [Block] -> [Block] prepSlides slideLevel = ensureStartWithH . splitHrule where splitHrule (HorizontalRule : Header n xs : ys) | n == slideLevel = Header slideLevel xs : splitHrule ys - splitHrule (HorizontalRule : xs) = Header slideLevel [] : splitHrule xs + splitHrule (HorizontalRule : xs) = Header slideLevel [Str "\0"] : + splitHrule xs splitHrule (x : xs) = x : splitHrule xs splitHrule [] = [] ensureStartWithH bs@(Header n _:_) | n <= slideLevel = bs - ensureStartWithH bs = Header slideLevel [] : bs + ensureStartWithH bs = Header slideLevel [Str "\0"] : bs diff --git a/src/Text/Pandoc/Templates.hs b/src/Text/Pandoc/Templates.hs index 336efe453..bd4cdcd86 100644 --- a/src/Text/Pandoc/Templates.hs +++ b/src/Text/Pandoc/Templates.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE TypeSynonymInstances, FlexibleInstances #-} +{-# LANGUAGE TypeSynonymInstances, FlexibleInstances, CPP #-} {- Copyright (C) 2009-2010 John MacFarlane <jgm@berkeley.edu> @@ -68,11 +68,16 @@ module Text.Pandoc.Templates ( renderTemplate , TemplateTarget , getDefaultTemplate ) where -import Text.ParserCombinators.Parsec -import Control.Monad (liftM, when, forM) +import Text.Parsec +import Control.Monad (liftM, when, forM, mzero) import System.FilePath import Data.List (intercalate, intersperse) +#if MIN_VERSION_blaze_html(0,5,0) +import Text.Blaze.Html (Html) +import Text.Blaze.Internal (preEscapedString) +#else import Text.Blaze (preEscapedString, Html) +#endif import Data.ByteString.Lazy.UTF8 (ByteString, fromString) import Text.Pandoc.Shared (readDataFile) import qualified Control.Exception.Extensible as E (try, IOException) @@ -93,7 +98,7 @@ getDefaultTemplate user writer = do data TemplateState = TemplateState Int [(String,String)] -adjustPosition :: String -> GenParser Char TemplateState String +adjustPosition :: String -> Parsec [Char] TemplateState String adjustPosition str = do let lastline = takeWhile (/= '\n') $ reverse str updateState $ \(TemplateState pos x) -> @@ -127,21 +132,21 @@ renderTemplate vals templ = reservedWords :: [String] reservedWords = ["else","endif","for","endfor","sep"] -parseTemplate :: GenParser Char TemplateState [String] +parseTemplate :: Parsec [Char] TemplateState [String] parseTemplate = many $ (plaintext <|> escapedDollar <|> conditional <|> for <|> variable) >>= adjustPosition -plaintext :: GenParser Char TemplateState String +plaintext :: Parsec [Char] TemplateState String plaintext = many1 $ noneOf "$" -escapedDollar :: GenParser Char TemplateState String +escapedDollar :: Parsec [Char] TemplateState String escapedDollar = try $ string "$$" >> return "$" -skipEndline :: GenParser Char st () +skipEndline :: Parsec [Char] st () skipEndline = try $ skipMany (oneOf " \t") >> newline >> return () -conditional :: GenParser Char TemplateState String +conditional :: Parsec [Char] TemplateState String conditional = try $ do TemplateState pos vars <- getState string "$if(" @@ -165,7 +170,7 @@ conditional = try $ do then ifContents else elseContents -for :: GenParser Char TemplateState String +for :: Parsec [Char] TemplateState String for = try $ do TemplateState pos vars <- getState string "$for(" @@ -188,16 +193,16 @@ for = try $ do setState $ TemplateState pos vars return $ concat $ intersperse sep contents -ident :: GenParser Char TemplateState String +ident :: Parsec [Char] TemplateState String ident = do first <- letter rest <- many (alphaNum <|> oneOf "_-") let id' = first : rest if id' `elem` reservedWords - then pzero + then mzero else return id' -variable :: GenParser Char TemplateState String +variable :: Parsec [Char] TemplateState String variable = try $ do char '$' id' <- ident diff --git a/src/Text/Pandoc/UTF8.hs b/src/Text/Pandoc/UTF8.hs index 4af155882..e2959eae7 100644 --- a/src/Text/Pandoc/UTF8.hs +++ b/src/Text/Pandoc/UTF8.hs @@ -39,21 +39,25 @@ module Text.Pandoc.UTF8 ( readFile where +#if MIN_VERSION_base(4,4,0) +#else +import Codec.Binary.UTF8.String (encodeString) +#endif + #if MIN_VERSION_base(4,2,0) import System.IO hiding (readFile, writeFile, getContents, putStr, putStrLn, hPutStr, hPutStrLn, hGetContents) import Prelude hiding (readFile, writeFile, getContents, putStr, putStrLn ) -import Codec.Binary.UTF8.String (encodeString) import qualified System.IO as IO readFile :: FilePath -> IO String readFile f = do - h <- openFile (encodeString f) ReadMode + h <- openFile (encodePath f) ReadMode hGetContents h writeFile :: FilePath -> String -> IO () -writeFile f s = withFile (encodeString f) WriteMode $ \h -> hPutStr h s +writeFile f s = withFile (encodePath f) WriteMode $ \h -> hPutStr h s getContents :: IO String getContents = hGetContents stdin @@ -76,7 +80,6 @@ hGetContents h = hSetEncoding h utf8_bom >> IO.hGetContents h #else import qualified Data.ByteString as B -import Codec.Binary.UTF8.String (encodeString) import Data.ByteString.UTF8 (toString, fromString) import Prelude hiding (readFile, writeFile, getContents, putStr, putStrLn) import System.IO (Handle) @@ -91,10 +94,10 @@ stripBOM s | bom `B.isPrefixOf` s = B.drop 3 s stripBOM s = s readFile :: FilePath -> IO String -readFile = liftM (toString . stripBOM) . B.readFile . encodeString +readFile = liftM (toString . stripBOM) . B.readFile . encodePath writeFile :: FilePath -> String -> IO () -writeFile f = B.writeFile (encodeString f) . fromString +writeFile f = B.writeFile (encodePath f) . fromString getContents :: IO String getContents = liftM (toString . stripBOM) B.getContents @@ -115,3 +118,10 @@ hPutStrLn :: Handle -> String -> IO () hPutStrLn h s = hPutStr h (s ++ "\n") #endif + +encodePath :: FilePath -> FilePath +#if MIN_VERSION_base(4,4,0) +encodePath = id +#else +encodePath = encodeString +#endif diff --git a/src/Text/Pandoc/Writers/AsciiDoc.hs b/src/Text/Pandoc/Writers/AsciiDoc.hs index 1913eb92b..1ccfab6e6 100644 --- a/src/Text/Pandoc/Writers/AsciiDoc.hs +++ b/src/Text/Pandoc/Writers/AsciiDoc.hs @@ -40,8 +40,7 @@ module Text.Pandoc.Writers.AsciiDoc (writeAsciiDoc) where import Text.Pandoc.Definition import Text.Pandoc.Templates (renderTemplate) import Text.Pandoc.Shared -import Text.Pandoc.Parsing hiding (blankline) -import Text.ParserCombinators.Parsec ( runParser, GenParser ) +import Text.Pandoc.Parsing hiding (blankline, space) import Data.List ( isPrefixOf, intersperse, intercalate ) import Text.Pandoc.Pretty import Control.Monad.State @@ -93,7 +92,7 @@ escapeString = escapeStringUsing escs where escs = backslashEscapes "{" -- | Ordered list start parser for use in Para below. -olMarker :: GenParser Char ParserState Char +olMarker :: Parser [Char] ParserState Char olMarker = do (start, style', delim) <- anyOrderedListMarker if delim == Period && (style' == UpperAlpha || (style' == UpperRoman && diff --git a/src/Text/Pandoc/Writers/ConTeXt.hs b/src/Text/Pandoc/Writers/ConTeXt.hs index dfdf7a140..964320eb2 100644 --- a/src/Text/Pandoc/Writers/ConTeXt.hs +++ b/src/Text/Pandoc/Writers/ConTeXt.hs @@ -77,6 +77,8 @@ pandocToConTeXt options (Pandoc (Meta title authors date) blocks) = do , ("title", titletext) , ("date", datetext) ] ++ [ ("number-sections", "yes") | writerNumberSections options ] ++ + [ ("mainlang", maybe "" (reverse . takeWhile (/=',') . reverse) + (lookup "lang" $ writerVariables options)) ] ++ [ ("author", a) | a <- authorstext ] return $ if writerStandalone options then renderTemplate context $ writerTemplate options @@ -84,34 +86,30 @@ pandocToConTeXt options (Pandoc (Meta title authors date) blocks) = do -- escape things as needed for ConTeXt -escapeCharForConTeXt :: Char -> String -escapeCharForConTeXt ch = +escapeCharForConTeXt :: WriterOptions -> Char -> String +escapeCharForConTeXt opts ch = + let ligatures = writerTeXLigatures opts in case ch of - '{' -> "\\letteropenbrace{}" - '}' -> "\\letterclosebrace{}" + '{' -> "\\{" + '}' -> "\\}" '\\' -> "\\letterbackslash{}" '$' -> "\\$" '|' -> "\\letterbar{}" - '^' -> "\\letterhat{}" - '%' -> "\\%" + '%' -> "\\letterpercent{}" '~' -> "\\lettertilde{}" - '&' -> "\\&" '#' -> "\\#" - '<' -> "\\letterless{}" - '>' -> "\\lettermore{}" '[' -> "{[}" ']' -> "{]}" - '_' -> "\\letterunderscore{}" '\160' -> "~" - '\x2014' -> "---" - '\x2013' -> "--" - '\x2019' -> "'" + '\x2014' | ligatures -> "---" + '\x2013' | ligatures -> "--" + '\x2019' | ligatures -> "'" '\x2026' -> "\\ldots{}" x -> [x] -- | Escape string for ConTeXt -stringToConTeXt :: String -> String -stringToConTeXt = concatMap escapeCharForConTeXt +stringToConTeXt :: WriterOptions -> String -> String +stringToConTeXt opts = concatMap (escapeCharForConTeXt opts) -- | Convert Elements to ConTeXt elementToConTeXt :: WriterOptions -> Element -> State WriterState Doc @@ -252,8 +250,9 @@ inlineToConTeXt (SmallCaps lst) = do return $ braces $ "\\sc " <> contents inlineToConTeXt (Code _ str) | not ('{' `elem` str || '}' `elem` str) = return $ "\\type" <> braces (text str) -inlineToConTeXt (Code _ str) = - return $ "\\mono" <> braces (text $ stringToConTeXt str) +inlineToConTeXt (Code _ str) = do + opts <- gets stOptions + return $ "\\mono" <> braces (text $ stringToConTeXt opts str) inlineToConTeXt (Quoted SingleQuote lst) = do contents <- inlineListToConTeXt lst return $ "\\quote" <> braces contents @@ -261,11 +260,13 @@ inlineToConTeXt (Quoted DoubleQuote lst) = do contents <- inlineListToConTeXt lst return $ "\\quotation" <> braces contents inlineToConTeXt (Cite _ lst) = inlineListToConTeXt lst -inlineToConTeXt (Str str) = return $ text $ stringToConTeXt str +inlineToConTeXt (Str str) = do + opts <- gets stOptions + return $ text $ stringToConTeXt opts str inlineToConTeXt (Math InlineMath str) = return $ char '$' <> text str <> char '$' inlineToConTeXt (Math DisplayMath str) = - return $ text "\\startformula " <> text str <> text " \\stopformula" + return $ text "\\startformula " <> text str <> text " \\stopformula" <> space inlineToConTeXt (RawInline "context" str) = return $ text str inlineToConTeXt (RawInline "tex" str) = return $ text str inlineToConTeXt (RawInline _ _) = return empty @@ -296,7 +297,7 @@ inlineToConTeXt (Link txt (src, _)) = do label <- inlineListToConTeXt txt return $ "\\useURL" <> brackets (text ref) - <> brackets (text $ escapeStringUsing [('#',"\\#")] src) + <> brackets (text $ escapeStringUsing [('#',"\\#"),('%',"\\%")] src) <> brackets empty <> brackets label <> "\\from" diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs index a2995b705..396e7a482 100644 --- a/src/Text/Pandoc/Writers/Docx.hs +++ b/src/Text/Pandoc/Writers/Docx.hs @@ -146,7 +146,8 @@ writeDocx mbRefDocx opts doc@(Pandoc (Meta tit auths date) _) = do let styledoc = case findEntryByPath stylepath refArchive >>= parseXMLDoc . toString . fromEntry of Just d -> d - Nothing -> error $ stylepath ++ "missing in reference docx" + Nothing -> error $ "Unable to parse " ++ stylepath ++ + " from reference.docx" let styledoc' = styledoc{ elContent = elContent styledoc ++ map Elem newstyles } let styleEntry = toEntry stylepath epochtime $ fromString $ showTopElement' styledoc' -- construct word/numbering.xml @@ -261,13 +262,13 @@ mkLvl marker lvl = ,show n) step = 720 hang = 480 - bulletFor 0 = "\8226" - bulletFor 1 = "\9702" - bulletFor 2 = "\8227" - bulletFor 3 = "\8259" - bulletFor 4 = "\8226" - bulletFor 5 = "\9702" - bulletFor _ = "\8227" + bulletFor 0 = "\x2022" -- filled circle + bulletFor 1 = "\x2013" -- en dash + bulletFor 2 = "\x2022" -- hyphen bullet + bulletFor 3 = "\x2013" + bulletFor 4 = "\x2022" + bulletFor 5 = "\x2013" + bulletFor _ = "\x2022" styleFor UpperAlpha _ = "upperLetter" styleFor LowerAlpha _ = "lowerLetter" styleFor UpperRoman _ = "upperRoman" @@ -488,7 +489,10 @@ getParaProps :: WS [Element] getParaProps = do props <- gets stParaProperties listLevel <- gets stListLevel - numid <- getNumId + listMarker <- gets stListMarker + numid <- case listMarker of + NoMarker -> return 1 + _ -> getNumId let listPr = if listLevel >= 0 then [ mknode "w:numPr" [] [ mknode "w:numId" [("w:val",show numid)] () diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index 67048348e..b423f136f 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -48,6 +48,8 @@ import Text.Pandoc.Writers.Markdown ( writePlain ) import Data.Char ( toLower ) import Network.URI ( unEscapeString ) import Text.Pandoc.MIME (getMimeType) +import Prelude hiding (catch) +import Control.Exception (catch, SomeException) -- | Produce an EPUB file from a Pandoc document. writeEPUB :: Maybe String -- ^ EPUB stylesheet specified at command line @@ -126,8 +128,9 @@ writeEPUB mbStylesheet fonts opts doc@(Pandoc meta _) = do let chapterEntries = zipWith chapterToEntry [1..] chapters -- contents.opf - localeLang <- catch (liftM (takeWhile (/='.')) $ getEnv "LANG") - (\_ -> return "en-US") + localeLang <- catch (liftM (map (\c -> if c == '_' then '-' else c) . + takeWhile (/='.')) $ getEnv "LANG") + (\e -> let _ = (e :: SomeException) in return "en-US") let lang = case lookup "lang" (writerVariables opts') of Just x -> x Nothing -> localeLang diff --git a/src/Text/Pandoc/Writers/FB2.hs b/src/Text/Pandoc/Writers/FB2.hs new file mode 100644 index 000000000..0fbfb3968 --- /dev/null +++ b/src/Text/Pandoc/Writers/FB2.hs @@ -0,0 +1,616 @@ +{- +Copyright (c) 2011-2012, Sergey Astanin +All rights reserved. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | Conversion of 'Pandoc' documents to FB2 (FictionBook2) format. + +FictionBook is an XML-based e-book format. For more information see: +<http://www.fictionbook.org/index.php/Eng:XML_Schema_Fictionbook_2.1> + +-} +module Text.Pandoc.Writers.FB2 (writeFB2) where + +import Control.Monad.State (StateT, evalStateT, get, modify) +import Control.Monad.State (liftM, liftM2, liftIO) +import Data.ByteString.Base64 (encode) +import Data.Char (toUpper, toLower, isSpace, isAscii, isControl) +import Data.List (intersperse, intercalate, isPrefixOf) +import Data.Either (lefts, rights) +import Network.Browser (browse, request, setAllowRedirects, setOutHandler) +import Network.HTTP (catchIO_, getRequest, getHeaders, getResponseBody) +import Network.HTTP (lookupHeader, HeaderName(..), urlEncode) +import Network.URI (isURI, unEscapeString) +import System.FilePath (takeExtension) +import Text.XML.Light +import qualified Control.Exception as E +import qualified Data.ByteString as B +import qualified Text.XML.Light as X +import qualified Text.XML.Light.Cursor as XC + +import Text.Pandoc.Definition +import Text.Pandoc.Shared (WriterOptions(..), HTMLMathMethod(..)) +import Text.Pandoc.Shared (orderedListMarkers, defaultWriterOptions) +import Text.Pandoc.Generic (bottomUp) + +-- | Data to be written at the end of the document: +-- (foot)notes, URLs, references, images. +data FbRenderState = FbRenderState + { footnotes :: [ (Int, String, [Content]) ] -- ^ #, ID, text + , imagesToFetch :: [ (String, String) ] -- ^ filename, URL or path + , parentListMarker :: String -- ^ list marker of the parent ordered list + , parentBulletLevel :: Int -- ^ nesting level of the unordered list + , writerOptions :: WriterOptions + } deriving (Show) + +-- | FictionBook building monad. +type FBM = StateT FbRenderState IO + +newFB :: FbRenderState +newFB = FbRenderState { footnotes = [], imagesToFetch = [] + , parentListMarker = "", parentBulletLevel = 0 + , writerOptions = defaultWriterOptions } + +data ImageMode = NormalImage | InlineImage deriving (Eq) +instance Show ImageMode where + show NormalImage = "imageType" + show InlineImage = "inlineImageType" + +-- | Produce an FB2 document from a 'Pandoc' document. +writeFB2 :: WriterOptions -- ^ conversion options + -> Pandoc -- ^ document to convert + -> IO String -- ^ FictionBook2 document (not encoded yet) +writeFB2 opts (Pandoc meta blocks) = flip evalStateT newFB $ do + modify (\s -> s { writerOptions = opts { writerStandalone = True } }) + desc <- description meta + fp <- frontpage meta + secs <- renderSections 1 blocks + let body = el "body" $ fp ++ secs + notes <- renderFootnotes + (imgs,missing) <- liftM imagesToFetch get >>= \s -> liftIO (fetchImages s) + let body' = replaceImagesWithAlt missing body + let fb2_xml = el "FictionBook" (fb2_attrs, [desc, body'] ++ notes ++ imgs) + return $ xml_head ++ (showContent fb2_xml) + where + xml_head = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + fb2_attrs = + let xmlns = "http://www.gribuser.ru/xml/fictionbook/2.0" + xlink = "http://www.w3.org/1999/xlink" + in [ uattr "xmlns" xmlns + , attr ("xmlns", "l") xlink ] + -- + frontpage :: Meta -> FBM [Content] + frontpage meta' = do + t <- cMapM toXml . docTitle $ meta' + return $ + [ el "title" (el "p" t) + , el "annotation" (map (el "p" . cMap plain) + (docAuthors meta' ++ [docDate meta'])) + ] + description :: Meta -> FBM Content + description meta' = do + bt <- booktitle meta' + let as = authors meta' + dd <- docdate meta' + return $ el "description" + [ el "title-info" (bt ++ as ++ dd) + , el "document-info" [ el "program-used" "pandoc" ] -- FIXME: +version + ] + booktitle :: Meta -> FBM [Content] + booktitle meta' = do + t <- cMapM toXml . docTitle $ meta' + return $ if null t + then [] + else [ el "book-title" t ] + authors :: Meta -> [Content] + authors meta' = cMap author (docAuthors meta') + author :: [Inline] -> [Content] + author ss = + let ws = words . cMap plain $ ss + email = (el "email") `fmap` (take 1 $ filter ('@' `elem`) ws) + ws' = filter ('@' `notElem`) ws + names = case ws' of + (nickname:[]) -> [ el "nickname" nickname ] + (fname:lname:[]) -> [ el "first-name" fname + , el "last-name" lname ] + (fname:rest) -> [ el "first-name" fname + , el "middle-name" (concat . init $ rest) + , el "last-name" (last rest) ] + ([]) -> [] + in list $ el "author" (names ++ email) + docdate :: Meta -> FBM [Content] + docdate meta' = do + let ss = docDate meta' + d <- cMapM toXml ss + return $ if null d + then [] + else [el "date" d] + +-- | Divide the stream of blocks into sections and convert to XML +-- representation. +renderSections :: Int -> [Block] -> FBM [Content] +renderSections level blocks = do + let secs = splitSections level blocks + mapM (renderSection level) secs + +renderSection :: Int -> ([Inline], [Block]) -> FBM Content +renderSection level (ttl, body) = do + title <- if null ttl + then return [] + else return . list . el "title" . formatTitle $ ttl + content <- if (hasSubsections body) + then renderSections (level + 1) body + else cMapM blockToXml body + return $ el "section" (title ++ content) + where + hasSubsections = any isHeader + isHeader (Header _ _) = True + isHeader _ = False + +-- | Only <p> and <empty-line> are allowed within <title> in FB2. +formatTitle :: [Inline] -> [Content] +formatTitle inlines = + let lns = split isLineBreak inlines + lns' = map (el "p" . cMap plain) lns + in intersperse (el "empty-line" ()) lns' + +split :: (a -> Bool) -> [a] -> [[a]] +split _ [] = [] +split cond xs = let (b,a) = break cond xs + in (b:split cond (drop 1 a)) + +isLineBreak :: Inline -> Bool +isLineBreak LineBreak = True +isLineBreak _ = False + +-- | Divide the stream of block elements into sections: [(title, blocks)]. +splitSections :: Int -> [Block] -> [([Inline], [Block])] +splitSections level blocks = reverse $ revSplit (reverse blocks) + where + revSplit [] = [] + revSplit rblocks = + let (lastsec, before) = break sameLevel rblocks + (header, prevblocks) = + case before of + ((Header n title):prevblocks') -> + if n == level + then (title, prevblocks') + else ([], before) + _ -> ([], before) + in (header, reverse lastsec) : revSplit prevblocks + sameLevel (Header n _) = n == level + sameLevel _ = False + +-- | Make another FictionBook body with footnotes. +renderFootnotes :: FBM [Content] +renderFootnotes = do + fns <- footnotes `liftM` get + if null fns + then return [] -- no footnotes + else return . list $ + el "body" ([uattr "name" "notes"], map renderFN (reverse fns)) + where + renderFN (n, idstr, cs) = + let fn_texts = (el "title" (el "p" (show n))) : cs + in el "section" ([uattr "id" idstr], fn_texts) + +-- | Fetch images and encode them for the FictionBook XML. +-- Return image data and a list of hrefs of the missing images. +fetchImages :: [(String,String)] -> IO ([Content],[String]) +fetchImages links = do + imgs <- mapM (uncurry fetchImage) links + return $ (rights imgs, lefts imgs) + +-- | Fetch image data from disk or from network and make a <binary> XML section. +-- Return either (Left hrefOfMissingImage) or (Right xmlContent). +fetchImage :: String -> String -> IO (Either String Content) +fetchImage href link = do + mbimg <- + case (isURI link, readDataURI link) of + (True, Just (mime,_,True,base64)) -> + let mime' = map toLower mime + in if mime' == "image/png" || mime' == "image/jpeg" + then return (Just (mime',base64)) + else return Nothing + (True, Just _) -> return Nothing -- not base64-encoded + (True, Nothing) -> fetchURL link + (False, _) -> do + d <- nothingOnError $ B.readFile (unEscapeString link) + let t = case map toLower (takeExtension link) of + ".png" -> Just "image/png" + ".jpg" -> Just "image/jpeg" + ".jpeg" -> Just "image/jpeg" + ".jpe" -> Just "image/jpeg" + _ -> Nothing -- only PNG and JPEG are supported in FB2 + return $ liftM2 (,) t (liftM (toStr . encode) d) + case mbimg of + Just (imgtype, imgdata) -> do + return . Right $ el "binary" + ( [uattr "id" href + , uattr "content-type" imgtype] + , txt imgdata ) + _ -> return (Left ('#':href)) + where + nothingOnError :: (IO B.ByteString) -> (IO (Maybe B.ByteString)) + nothingOnError action = liftM Just action `E.catch` omnihandler + omnihandler :: E.SomeException -> IO (Maybe B.ByteString) + omnihandler _ = return Nothing + +-- | Extract mime type and encoded data from the Data URI. +readDataURI :: String -- ^ URI + -> Maybe (String,String,Bool,String) + -- ^ Maybe (mime,charset,isBase64,data) +readDataURI uri = + let prefix = "data:" + in if not (prefix `isPrefixOf` uri) + then Nothing + else + let rest = drop (length prefix) uri + meta = takeWhile (/= ',') rest -- without trailing ',' + uridata = drop (length meta + 1) rest + parts = split (== ';') meta + (mime,cs,enc)=foldr upd ("text/plain","US-ASCII",False) parts + in Just (mime,cs,enc,uridata) + where + upd str m@(mime,cs,enc) + | isMimeType str = (str,cs,enc) + | "charset=" `isPrefixOf` str = (mime,drop (length "charset=") str,enc) + | str == "base64" = (mime,cs,True) + | otherwise = m + +-- Without parameters like ;charset=...; see RFC 2045, 5.1 +isMimeType :: String -> Bool +isMimeType s = + case split (=='/') s of + [mtype,msubtype] -> + ((map toLower mtype) `elem` types + || "x-" `isPrefixOf` (map toLower mtype)) + && all valid mtype + && all valid msubtype + _ -> False + where + types = ["text","image","audio","video","application","message","multipart"] + valid c = isAscii c && not (isControl c) && not (isSpace c) && + c `notElem` "()<>@,;:\\\"/[]?=" + +-- | Fetch URL, return its Content-Type and binary data on success. +fetchURL :: String -> IO (Maybe (String, String)) +fetchURL url = do + flip catchIO_ (return Nothing) $ do + r <- browse $ do + setOutHandler (const (return ())) + setAllowRedirects True + liftM snd . request . getRequest $ url + let content_type = lookupHeader HdrContentType (getHeaders r) + content <- liftM (Just . toStr . encode . toBS) . getResponseBody $ Right r + return $ liftM2 (,) content_type content + where + +toBS :: String -> B.ByteString +toBS = B.pack . map (toEnum . fromEnum) + +toStr :: B.ByteString -> String +toStr = map (toEnum . fromEnum) . B.unpack + +footnoteID :: Int -> String +footnoteID i = "n" ++ (show i) + +linkID :: Int -> String +linkID i = "l" ++ (show i) + +-- | Convert a block-level Pandoc's element to FictionBook XML representation. +blockToXml :: Block -> FBM [Content] +blockToXml (Plain ss) = cMapM toXml ss -- FIXME: can lead to malformed FB2 +blockToXml (Para [Math DisplayMath formula]) = insertMath NormalImage formula +blockToXml (Para [img@(Image _ _)]) = insertImage NormalImage img +blockToXml (Para ss) = liftM (list . el "p") $ cMapM toXml ss +blockToXml (CodeBlock _ s) = return . spaceBeforeAfter . + map (el "p" . el "code") . lines $ s +blockToXml (RawBlock _ s) = return . spaceBeforeAfter . + map (el "p" . el "code") . lines $ s +blockToXml (BlockQuote bs) = liftM (list . el "cite") $ cMapM blockToXml bs +blockToXml (OrderedList a bss) = do + state <- get + let pmrk = parentListMarker state + let markers = map ((pmrk ++ " ") ++) $ orderedListMarkers a + let mkitem mrk bs = do + modify (\s -> s { parentListMarker = mrk }) + itemtext <- cMapM blockToXml . paraToPlain $ bs + modify (\s -> s { parentListMarker = pmrk }) -- old parent marker + return . el "p" $ [ txt mrk, txt " " ] ++ itemtext + mapM (uncurry mkitem) (zip markers bss) +blockToXml (BulletList bss) = do + state <- get + let level = parentBulletLevel state + let pmrk = parentListMarker state + let prefix = replicate (length pmrk) ' ' + let bullets = ["\x2022", "\x25e6", "*", "\x2043", "\x2023"] + let mrk = prefix ++ bullets !! (level `mod` (length bullets)) + let mkitem bs = do + modify (\s -> s { parentBulletLevel = (level+1) }) + itemtext <- cMapM blockToXml . paraToPlain $ bs + modify (\s -> s { parentBulletLevel = level }) -- restore bullet level + return $ el "p" $ [ txt (mrk ++ " ") ] ++ itemtext + mapM mkitem bss +blockToXml (DefinitionList defs) = + cMapM mkdef defs + where + mkdef (term, bss) = do + def <- cMapM (cMapM blockToXml . sep . paraToPlain . map indent) bss + t <- wrap "strong" term + return [ el "p" t, el "p" def ] + sep blocks = + if all needsBreak blocks then + blocks ++ [Plain [LineBreak]] + else + blocks + needsBreak (Para _) = False + needsBreak (Plain ins) = LineBreak `notElem` ins + needsBreak _ = True +blockToXml (Header _ _) = -- should never happen, see renderSections + error "unexpected header in section text" +blockToXml HorizontalRule = return + [ el "empty-line" () + , el "p" (txt (replicate 10 '—')) + , el "empty-line" () ] +blockToXml (Table caption aligns _ headers rows) = do + hd <- mkrow "th" headers aligns + bd <- mapM (\r -> mkrow "td" r aligns) rows + c <- return . el "emphasis" =<< cMapM toXml caption + return [el "table" (hd : bd), el "p" c] + where + mkrow :: String -> [TableCell] -> [Alignment] -> FBM Content + mkrow tag cells aligns' = + (el "tr") `liftM` (mapM (mkcell tag) (zip cells aligns')) + -- + mkcell :: String -> (TableCell, Alignment) -> FBM Content + mkcell tag (cell, align) = do + cblocks <- cMapM blockToXml cell + return $ el tag ([align_attr align], cblocks) + -- + align_attr a = Attr (QName "align" Nothing Nothing) (align_str a) + align_str AlignLeft = "left" + align_str AlignCenter = "center" + align_str AlignRight = "right" + align_str AlignDefault = "left" +blockToXml Null = return [] + +-- Replace paragraphs with plain text and line break. +-- Necessary to simulate multi-paragraph lists in FB2. +paraToPlain :: [Block] -> [Block] +paraToPlain [] = [] +paraToPlain (Para inlines : rest) = + let p = (Plain (inlines ++ [LineBreak])) + in p : paraToPlain rest +paraToPlain (p:rest) = p : paraToPlain rest + +-- Simulate increased indentation level. Will not really work +-- for multi-line paragraphs. +indent :: Block -> Block +indent = indentBlock + where + -- indentation space + spacer :: String + spacer = replicate 4 ' ' + -- + indentBlock (Plain ins) = Plain ((Str spacer):ins) + indentBlock (Para ins) = Para ((Str spacer):ins) + indentBlock (CodeBlock a s) = + let s' = unlines . map (spacer++) . lines $ s + in CodeBlock a s' + indentBlock (BlockQuote bs) = BlockQuote (map indent bs) + indentBlock (Header l ins) = Header l (indentLines ins) + indentBlock everythingElse = everythingElse + -- indent every (explicit) line + indentLines :: [Inline] -> [Inline] + indentLines ins = let lns = split isLineBreak ins :: [[Inline]] + in intercalate [LineBreak] $ map ((Str spacer):) lns + +-- | Convert a Pandoc's Inline element to FictionBook XML representation. +toXml :: Inline -> FBM [Content] +toXml (Str s) = return [txt s] +toXml (Emph ss) = list `liftM` wrap "emphasis" ss +toXml (Strong ss) = list `liftM` wrap "strong" ss +toXml (Strikeout ss) = list `liftM` wrap "strikethrough" ss +toXml (Superscript ss) = list `liftM` wrap "sup" ss +toXml (Subscript ss) = list `liftM` wrap "sub" ss +toXml (SmallCaps ss) = cMapM toXml $ bottomUp (map toUpper) ss +toXml (Quoted SingleQuote ss) = do -- FIXME: should be language-specific + inner <- cMapM toXml ss + return $ [txt "‘"] ++ inner ++ [txt "’"] +toXml (Quoted DoubleQuote ss) = do + inner <- cMapM toXml ss + return $ [txt "“"] ++ inner ++ [txt "”"] +toXml (Cite _ ss) = cMapM toXml ss -- FIXME: support citation styles +toXml (Code _ s) = return [el "code" s] +toXml Space = return [txt " "] +toXml LineBreak = return [el "empty-line" ()] +toXml (Math _ formula) = insertMath InlineImage formula +toXml (RawInline _ _) = return [] -- raw TeX and raw HTML are suppressed +toXml (Link text (url,ttl)) = do + fns <- footnotes `liftM` get + let n = 1 + length fns + let ln_id = linkID n + let ln_ref = list . el "sup" . txt $ "[" ++ show n ++ "]" + ln_text <- cMapM toXml text + let ln_desc = + let ttl' = dropWhile isSpace ttl + in if null ttl' + then list . el "p" $ el "code" url + else list . el "p" $ [ txt (ttl' ++ ": "), el "code" url ] + modify (\s -> s { footnotes = (n, ln_id, ln_desc) : fns }) + return $ ln_text ++ + [ el "a" + ( [ attr ("l","href") ('#':ln_id) + , uattr "type" "note" ] + , ln_ref) ] +toXml img@(Image _ _) = insertImage InlineImage img +toXml (Note bs) = do + fns <- footnotes `liftM` get + let n = 1 + length fns + let fn_id = footnoteID n + fn_desc <- cMapM blockToXml bs + modify (\s -> s { footnotes = (n, fn_id, fn_desc) : fns }) + let fn_ref = el "sup" . txt $ "[" ++ show n ++ "]" + return . list $ el "a" ( [ attr ("l","href") ('#':fn_id) + , uattr "type" "note" ] + , fn_ref ) + +insertMath :: ImageMode -> String -> FBM [Content] +insertMath immode formula = do + htmlMath <- return . writerHTMLMathMethod . writerOptions =<< get + case htmlMath of + WebTeX url -> do + let alt = [Code nullAttr formula] + let imgurl = url ++ urlEncode formula + let img = Image alt (imgurl, "") + insertImage immode img + _ -> return [el "code" formula] + +insertImage :: ImageMode -> Inline -> FBM [Content] +insertImage immode (Image alt (url,ttl)) = do + images <- imagesToFetch `liftM` get + let n = 1 + length images + let fname = "image" ++ show n + modify (\s -> s { imagesToFetch = (fname, url) : images }) + let ttlattr = case (immode, null ttl) of + (NormalImage, False) -> [ uattr "title" ttl ] + _ -> [] + return . list $ + el "image" $ + [ attr ("l","href") ('#':fname) + , attr ("l","type") (show immode) + , uattr "alt" (cMap plain alt) ] + ++ ttlattr +insertImage _ _ = error "unexpected inline instead of image" + +replaceImagesWithAlt :: [String] -> Content -> Content +replaceImagesWithAlt missingHrefs body = + let cur = XC.fromContent body + cur' = replaceAll cur + in XC.toTree . XC.root $ cur' + where + -- + replaceAll :: XC.Cursor -> XC.Cursor + replaceAll c = + let n = XC.current c + c' = if isImage n && isMissing n + then XC.modifyContent replaceNode c + else c + in case XC.nextDF c' of + (Just cnext) -> replaceAll cnext + Nothing -> c' -- end of document + -- + isImage :: Content -> Bool + isImage (Elem e) = (elName e) == (uname "image") + isImage _ = False + -- + isMissing (Elem img@(Element _ _ _ _)) = + let imgAttrs = elAttribs img + badAttrs = map (attr ("l","href")) missingHrefs + in any (`elem` imgAttrs) badAttrs + isMissing _ = False + -- + replaceNode :: Content -> Content + replaceNode n@(Elem img@(Element _ _ _ _)) = + let attrs = elAttribs img + alt = getAttrVal attrs (uname "alt") + imtype = getAttrVal attrs (qname "l" "type") + in case (alt, imtype) of + (Just alt', Just imtype') -> + if imtype' == show NormalImage + then el "p" alt' + else txt alt' + (Just alt', Nothing) -> txt alt' -- no type attribute + _ -> n -- don't replace if alt text is not found + replaceNode n = n + -- + getAttrVal :: [X.Attr] -> QName -> Maybe String + getAttrVal attrs name = + case filter ((name ==) . attrKey) attrs of + (a:_) -> Just (attrVal a) + _ -> Nothing + + +-- | Wrap all inlines with an XML tag (given its unqualified name). +wrap :: String -> [Inline] -> FBM Content +wrap tagname inlines = el tagname `liftM` cMapM toXml inlines + +-- " Create a singleton list. +list :: a -> [a] +list = (:[]) + +-- | Convert an 'Inline' to plaintext. +plain :: Inline -> String +plain (Str s) = s +plain (Emph ss) = concat (map plain ss) +plain (Strong ss) = concat (map plain ss) +plain (Strikeout ss) = concat (map plain ss) +plain (Superscript ss) = concat (map plain ss) +plain (Subscript ss) = concat (map plain ss) +plain (SmallCaps ss) = concat (map plain ss) +plain (Quoted _ ss) = concat (map plain ss) +plain (Cite _ ss) = concat (map plain ss) -- FIXME +plain (Code _ s) = s +plain Space = " " +plain LineBreak = "\n" +plain (Math _ s) = s +plain (RawInline _ s) = s +plain (Link text (url,_)) = concat (map plain text ++ [" <", url, ">"]) +plain (Image alt _) = concat (map plain alt) +plain (Note _) = "" -- FIXME + +-- | Create an XML element. +el :: (Node t) + => String -- ^ unqualified element name + -> t -- ^ node contents + -> Content -- ^ XML content +el name cs = Elem $ unode name cs + +-- | Put empty lines around content +spaceBeforeAfter :: [Content] -> [Content] +spaceBeforeAfter cs = + let emptyline = el "empty-line" () + in [emptyline] ++ cs ++ [emptyline] + +-- | Create a plain-text XML content. +txt :: String -> Content +txt s = Text $ CData CDataText s Nothing + +-- | Create an XML attribute with an unqualified name. +uattr :: String -> String -> Text.XML.Light.Attr +uattr name val = Attr (uname name) val + +-- | Create an XML attribute with a qualified name from given namespace. +attr :: (String, String) -> String -> Text.XML.Light.Attr +attr (ns, name) val = Attr (qname ns name) val + +-- | Unqualified name +uname :: String -> QName +uname name = QName name Nothing Nothing + +-- | Qualified name +qname :: String -> String -> QName +qname ns name = QName name Nothing (Just ns) + +-- | Abbreviation for 'concatMap'. +cMap :: (a -> [b]) -> [a] -> [b] +cMap = concatMap + +-- | Monadic equivalent of 'concatMap'. +cMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b] +cMapM f xs = concat `liftM` mapM f xs
\ No newline at end of file diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index 9dd29f183..cafb6ca74 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE OverloadedStrings, CPP #-} {-# OPTIONS_GHC -fno-warn-deprecations #-} {- Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> @@ -46,7 +46,12 @@ import Data.List ( isPrefixOf, intersperse ) import Data.String ( fromString ) import Data.Maybe ( catMaybes ) import Control.Monad.State +#if MIN_VERSION_blaze_html(0,5,0) +import Text.Blaze.Html hiding(contents) +import Text.Blaze.Internal(preEscapedString) +#else import Text.Blaze +#endif import qualified Text.Blaze.Html5 as H5 import qualified Text.Blaze.XHtml1.Transitional as H import qualified Text.Blaze.XHtml1.Transitional.Attributes as A @@ -59,12 +64,14 @@ import Data.Monoid (mempty, mconcat) data WriterState = WriterState { stNotes :: [Html] -- ^ List of notes , stMath :: Bool -- ^ Math is used in document + , stQuotes :: Bool -- ^ <q> tag is used , stHighlighting :: Bool -- ^ Syntax highlighting is used , stSecNum :: [Int] -- ^ Number of current section } defaultWriterState :: WriterState -defaultWriterState = WriterState {stNotes= [], stMath = False, stHighlighting = False, stSecNum = []} +defaultWriterState = WriterState {stNotes= [], stMath = False, stQuotes = False, + stHighlighting = False, stSecNum = []} -- Helpers to render HTML with the appropriate function. @@ -156,7 +163,8 @@ pandocToHtml opts (Pandoc (Meta title' authors' date') blocks) = do let newvars = [("highlighting-css", styleToCss $ writerHighlightStyle opts) | stHighlighting st] ++ - [("math", renderHtml math) | stMath st] + [("math", renderHtml math) | stMath st] ++ + [("quotes", "yes") | stQuotes st] return (tit, auths, authsMeta, date, toc, thebody, newvars) -- | Prepare author for meta tag, converting notes into @@ -191,6 +199,7 @@ inTemplate opts tit auths authsMeta date toc body' newvars = , ("date", date') , ("idprefix", writerIdentifierPrefix opts) , ("slidy-url", "http://www.w3.org/Talks/Tools/Slidy2") + , ("slideous-url", "slideous") , ("s5-url", "s5/default") ] ++ [ ("html5","true") | writerHtml5 opts ] ++ (case toc of @@ -253,7 +262,9 @@ elementToHtml slideLevel opts (Sec level num id' title' elements) = do -- always use level 1 for slide titles let level' = if slide then 1 else level let titleSlide = slide && level < slideLevel - header' <- blockToHtml opts (Header level' title') + header' <- if title' == [Str "\0"] -- marker for hrule + then return mempty + else blockToHtml opts (Header level' title') let isSec (Sec _ _ _ _ _) = True isSec (Blk _) = False innerContents <- mapM (elementToHtml slideLevel opts) @@ -261,9 +272,8 @@ elementToHtml slideLevel opts (Sec level num id' title' elements) = do -- title slides have no content of their own then filter isSec elements else elements - let header'' = if (writerStrictMarkdown opts || - writerSectionDivs opts || - writerSlideVariant opts == S5Slides) + let header'' = if (writerStrictMarkdown opts || writerSectionDivs opts || + writerSlideVariant opts == S5Slides || slide) then header' else header' ! prefixedId opts id' let inNl x = mconcat $ nl opts : intersperse (nl opts) x ++ [nl opts] @@ -581,8 +591,12 @@ inlineToHtml opts inline = strToHtml "’") DoubleQuote -> (strToHtml "“", strToHtml "”") - in do contents <- inlineListToHtml opts lst - return $ leftQuote >> contents >> rightQuote + in if writerHtml5 opts + then do + modify $ \st -> st{ stQuotes = True } + H.q `fmap` inlineListToHtml opts lst + else (\x -> leftQuote >> x >> rightQuote) + `fmap` inlineListToHtml opts lst (Math t str) -> modify (\st -> st {stMath = True}) >> (case writerHTMLMathMethod opts of LaTeXMathML _ -> @@ -624,7 +638,7 @@ inlineToHtml opts inline = Left _ -> inlineListToHtml opts (readTeXMath str) >>= return . (H.span ! A.class_ "math") - MathJax _ -> return $ toHtml $ + MathJax _ -> return $ H.span ! A.class_ "math" $ toHtml $ case t of InlineMath -> "\\(" ++ str ++ "\\)" DisplayMath -> "\\[" ++ str ++ "\\]" diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs index e99b20c60..7beee2d42 100644 --- a/src/Text/Pandoc/Writers/LaTeX.hs +++ b/src/Text/Pandoc/Writers/LaTeX.hs @@ -56,7 +56,6 @@ data WriterState = , stEnumerate :: Bool -- true if document needs fancy enumerated lists , stTable :: Bool -- true if document has a table , stStrikeout :: Bool -- true if document has strikeout - , stSubscript :: Bool -- true if document has subscript , stUrl :: Bool -- true if document has visible URL link , stGraphics :: Bool -- true if document contains images , stLHS :: Bool -- true if document has literate haskell code @@ -65,6 +64,7 @@ data WriterState = , stHighlighting :: Bool -- true if document has highlighted code , stIncremental :: Bool -- true if beamer lists should be displayed bit by bit , stInternalLinks :: [String] -- list of internal link targets + , stUsesEuro :: Bool -- true if euro symbol used } -- | Convert Pandoc to LaTeX. @@ -74,12 +74,12 @@ writeLaTeX options document = WriterState { stInNote = False, stInTable = False, stTableNotes = [], stOLLevel = 1, stOptions = options, stVerbInNote = False, stEnumerate = False, - stTable = False, stStrikeout = False, stSubscript = False, + stTable = False, stStrikeout = False, stUrl = False, stGraphics = False, stLHS = False, stBook = writerChapters options, stCsquotes = False, stHighlighting = False, stIncremental = writerIncremental options, - stInternalLinks = [] } + stInternalLinks = [], stUsesEuro = False } pandocToLaTeX :: WriterOptions -> Pandoc -> State WriterState String pandocToLaTeX options (Pandoc (Meta title authors date) blocks) = do @@ -117,7 +117,7 @@ pandocToLaTeX options (Pandoc (Meta title authors date) blocks) = do else return blocks' body <- mapM (elementToLaTeX options) $ hierarchicalize blocks'' biblioTitle <- liftM (render colwidth) $ inlineListToLaTeX lastHeader - let main = render colwidth $ vcat body + let main = render colwidth $ vsep body st <- get let biblioFiles = intercalate "," $ map dropExtension $ writerBiblioFiles options citecontext = case writerCiteMethod options of @@ -134,6 +134,8 @@ pandocToLaTeX options (Pandoc (Meta title authors date) blocks) = do [ ("toc", if writerTableOfContents options then "yes" else "") , ("body", main) , ("title", titletext) + , ("title-meta", stringify title) + , ("author-meta", intercalate "; " $ map stringify authors) , ("date", dateText) , ("documentclass", if writerBeamer options then "beamer" @@ -145,14 +147,16 @@ pandocToLaTeX options (Pandoc (Meta title authors date) blocks) = do [ ("fancy-enums", "yes") | stEnumerate st ] ++ [ ("tables", "yes") | stTable st ] ++ [ ("strikeout", "yes") | stStrikeout st ] ++ - [ ("subscript", "yes") | stSubscript st ] ++ [ ("url", "yes") | stUrl st ] ++ [ ("numbersections", "yes") | writerNumberSections options ] ++ [ ("lhs", "yes") | stLHS st ] ++ [ ("graphics", "yes") | stGraphics st ] ++ [ ("book-class", "yes") | stBook st] ++ + [ ("euro", "yes") | stUsesEuro st] ++ [ ("listings", "yes") | writerListings options || stLHS st ] ++ [ ("beamer", "yes") | writerBeamer options ] ++ + [ ("mainlang", maybe "" (reverse . takeWhile (/=',') . reverse) + (lookup "lang" $ writerVariables options)) ] ++ [ ("highlighting-macros", styleToLaTeX $ writerHighlightStyle options ) | stHighlighting st ] ++ citecontext @@ -166,13 +170,20 @@ elementToLaTeX _ (Blk block) = blockToLaTeX block elementToLaTeX opts (Sec level _ id' title' elements) = do header' <- sectionHeader id' level title' innerContents <- mapM (elementToLaTeX opts) elements - return $ vcat (header' : innerContents) + return $ vsep (header' : innerContents) -- escape things as needed for LaTeX -stringToLaTeX :: Bool -> String -> String -stringToLaTeX _ [] = "" -stringToLaTeX isUrl (x:xs) = - case x of +stringToLaTeX :: Bool -> String -> State WriterState String +stringToLaTeX _ [] = return "" +stringToLaTeX isUrl (x:xs) = do + opts <- gets stOptions + rest <- stringToLaTeX isUrl xs + let ligatures = writerTeXLigatures opts + when (x == '€') $ + modify $ \st -> st{ stUsesEuro = True } + return $ + case x of + '€' -> "\\euro{}" ++ rest '{' -> "\\{" ++ rest '}' -> "\\}" ++ rest '$' -> "\\$" ++ rest @@ -183,25 +194,23 @@ stringToLaTeX isUrl (x:xs) = '-' -> case xs of -- prevent adjacent hyphens from forming ligatures ('-':_) -> "-{}" ++ rest _ -> '-' : rest - '~' | not isUrl -> "\\ensuremath{\\sim}" + '~' | not isUrl -> "\\textasciitilde{}" ++ rest '^' -> "\\^{}" ++ rest '\\' -> "\\textbackslash{}" ++ rest - '€' -> "\\euro{}" ++ rest '|' -> "\\textbar{}" ++ rest '<' -> "\\textless{}" ++ rest '>' -> "\\textgreater{}" ++ rest '[' -> "{[}" ++ rest -- to avoid interpretation as ']' -> "{]}" ++ rest -- optional arguments '\160' -> "~" ++ rest - '\x2018' -> "`" ++ rest - '\x2019' -> "'" ++ rest - '\x201C' -> "``" ++ rest - '\x201D' -> "''" ++ rest '\x2026' -> "\\ldots{}" ++ rest - '\x2014' -> "---" ++ rest - '\x2013' -> "--" ++ rest + '\x2018' | ligatures -> "`" ++ rest + '\x2019' | ligatures -> "'" ++ rest + '\x201C' | ligatures -> "``" ++ rest + '\x201D' | ligatures -> "''" ++ rest + '\x2014' | ligatures -> "---" ++ rest + '\x2013' | ligatures -> "--" ++ rest _ -> x : rest - where rest = stringToLaTeX isUrl xs -- | Puts contents into LaTeX command. inCmd :: String -> Doc -> Doc @@ -234,8 +243,11 @@ elementToBeamer slideLevel (Sec lvl _num _ident tit elts) let fragile = if not $ null $ queryWith hasCodeBlock elts ++ queryWith hasCode elts then "[fragile]" else "" - let slideStart = Para $ RawInline "latex" ("\\begin{frame}" ++ fragile ++ - "\\frametitle{") : tit ++ [RawInline "latex" "}"] + let slideStart = Para $ RawInline "latex" ("\\begin{frame}" ++ fragile) : + if tit == [Str "\0"] -- marker for hrule + then [] + else (RawInline "latex" "\\frametitle{") : tit ++ + [RawInline "latex" "}"] let slideEnd = RawBlock "latex" "\\end{frame}" -- now carve up slide into blocks if there are sections inside bs <- concat `fmap` mapM (elementToBeamer slideLevel) elts @@ -256,16 +268,16 @@ blockToLaTeX (Para [Image txt (src,tit)]) = do capt <- inlineListToLaTeX txt img <- inlineToLaTeX (Image txt (src,tit)) return $ "\\begin{figure}[htbp]" $$ "\\centering" $$ img $$ - ("\\caption{" <> capt <> char '}') $$ "\\end{figure}" $$ blankline + ("\\caption{" <> capt <> char '}') $$ "\\end{figure}" blockToLaTeX (Para lst) = do result <- inlineListToLaTeX lst - return $ result <> blankline + return result blockToLaTeX (BlockQuote lst) = do beamer <- writerBeamer `fmap` gets stOptions case lst of [b] | beamer && isListBlock b -> do oldIncremental <- gets stIncremental - modify $ \s -> s{ stIncremental = True } + modify $ \s -> s{ stIncremental = not oldIncremental } result <- blockToLaTeX b modify $ \s -> s{ stIncremental = oldIncremental } return result @@ -290,7 +302,7 @@ blockToLaTeX (CodeBlock (_,classes,keyvalAttr) str) = do return "Verbatim" else return "verbatim" return $ flush (text ("\\begin{" ++ env ++ "}") $$ text str $$ - text ("\\end{" ++ env ++ "}")) $$ cr -- final cr because of notes + text ("\\end{" ++ env ++ "}")) <> cr listingsCodeBlock = do st <- get let params = if writerListings (stOptions st) @@ -325,13 +337,14 @@ blockToLaTeX (CodeBlock (_,classes,keyvalAttr) str) = do Nothing -> rawCodeBlock Just h -> modify (\st -> st{ stHighlighting = True }) >> return (flush $ text h) -blockToLaTeX (RawBlock "latex" x) = return $ text x <> blankline +blockToLaTeX (RawBlock "latex" x) = return $ text x blockToLaTeX (RawBlock _ _) = return empty blockToLaTeX (BulletList lst) = do incremental <- gets stIncremental let inc = if incremental then "[<+->]" else "" items <- mapM listItemToLaTeX lst - return $ text ("\\begin{itemize}" ++ inc) $$ vcat items $$ "\\end{itemize}" + return $ text ("\\begin{itemize}" ++ inc) $$ vcat items $$ + "\\end{itemize}" blockToLaTeX (OrderedList (start, numstyle, numdelim) lst) = do st <- get let inc = if stIncremental st then "[<+->]" else "" @@ -357,9 +370,10 @@ blockToLaTeX (DefinitionList lst) = do incremental <- gets stIncremental let inc = if incremental then "[<+->]" else "" items <- mapM defListItemToLaTeX lst - return $ text ("\\begin{description}" ++ inc) $$ vcat items $$ "\\end{description}" + return $ text ("\\begin{description}" ++ inc) $$ vcat items $$ + "\\end{description}" blockToLaTeX HorizontalRule = return $ - "\\begin{center}\\rule{3in}{0.4pt}\\end{center}" $$ blankline + "\\begin{center}\\rule{3in}{0.4pt}\\end{center}" blockToLaTeX (Header level lst) = sectionHeader "" level lst blockToLaTeX (Table caption aligns widths heads rows) = do modify $ \s -> s{ stInTable = True, stTableNotes = [] } @@ -370,7 +384,7 @@ blockToLaTeX (Table caption aligns widths heads rows) = do captionText <- inlineListToLaTeX caption let capt = if isEmpty captionText then empty - else text "caption = " <> captionText <> "," <> space + else text "caption = {" <> captionText <> "}," <> space rows' <- mapM (tableRowToLaTeX False aligns widths) rows let rows'' = intersperse ("\\\\\\noalign{\\medskip}") rows' tableNotes <- liftM (reverse . stTableNotes) get @@ -385,7 +399,7 @@ blockToLaTeX (Table caption aligns widths heads rows) = do $$ braces (text "% rows" $$ "\\FL" $$ vcat (headers : rows'') $$ "\\LL" <> cr) modify $ \s -> s{ stTable = True, stInTable = False, stTableNotes = [] } - return $ tableBody $$ blankline + return $ tableBody toColDescriptor :: Alignment -> String toColDescriptor align = @@ -396,7 +410,7 @@ toColDescriptor align = AlignDefault -> "l" blockListToLaTeX :: [Block] -> State WriterState Doc -blockListToLaTeX lst = mapM blockToLaTeX lst >>= return . vcat +blockListToLaTeX lst = vsep `fmap` mapM blockToLaTeX lst tableRowToLaTeX :: Bool -> [Alignment] @@ -457,7 +471,6 @@ sectionHeader ref level lst = do <> braces (lab <> text "\\label" <> braces (text ref)) else lab) - $$ blankline let headerWith x y = refLabel $ text x <> y return $ case level' of 0 -> if writerBeamer opts @@ -468,7 +481,7 @@ sectionHeader ref level lst = do 3 -> headerWith "\\subsubsection" stuffing 4 -> headerWith "\\paragraph" stuffing 5 -> headerWith "\\subparagraph" stuffing - _ -> txt $$ blankline + _ -> txt -- | Convert list of inline elements to LaTeX. @@ -494,11 +507,7 @@ inlineToLaTeX (Strikeout lst) = do inlineToLaTeX (Superscript lst) = inlineListToLaTeX lst >>= return . inCmd "textsuperscript" inlineToLaTeX (Subscript lst) = do - modify $ \s -> s{ stSubscript = True } - contents <- inlineListToLaTeX lst - -- oddly, latex includes \textsuperscript but not \textsubscript - -- so we have to define it (using a different name so as not to conflict with memoir class): - return $ inCmd "textsubscr" contents + inlineListToLaTeX lst >>= return . inCmd "textsubscript" inlineToLaTeX (SmallCaps lst) = inlineListToLaTeX lst >>= return . inCmd "textsc" inlineToLaTeX (Cite cits lst) = do @@ -525,24 +534,12 @@ inlineToLaTeX (Code (_,classes,_) str) = do Nothing -> rawCode Just h -> modify (\st -> st{ stHighlighting = True }) >> return (text h) - rawCode = return - $ text $ "\\texttt{" ++ stringToLaTeX False str ++ "}" -inlineToLaTeX (Quoted SingleQuote lst) = do - contents <- inlineListToLaTeX lst - csquotes <- liftM stCsquotes get - if csquotes - then return $ "\\enquote" <> braces contents - else do - let s1 = if (not (null lst)) && (isQuoted (head lst)) - then "\\," - else empty - let s2 = if (not (null lst)) && (isQuoted (last lst)) - then "\\," - else empty - return $ char '`' <> s1 <> contents <> s2 <> char '\'' -inlineToLaTeX (Quoted DoubleQuote lst) = do + rawCode = liftM (text . (\s -> "\\texttt{" ++ s ++ "}")) + $ stringToLaTeX False str +inlineToLaTeX (Quoted qt lst) = do contents <- inlineListToLaTeX lst csquotes <- liftM stCsquotes get + opts <- gets stOptions if csquotes then return $ "\\enquote" <> braces contents else do @@ -552,8 +549,17 @@ inlineToLaTeX (Quoted DoubleQuote lst) = do let s2 = if (not (null lst)) && (isQuoted (last lst)) then "\\," else empty - return $ "``" <> s1 <> contents <> s2 <> "''" -inlineToLaTeX (Str str) = return $ text $ stringToLaTeX False str + let inner = s1 <> contents <> s2 + return $ case qt of + DoubleQuote -> + if writerTeXLigatures opts + then text "``" <> inner <> text "''" + else char '\x201C' <> inner <> char '\x201D' + SingleQuote -> + if writerTeXLigatures opts + then char '`' <> inner <> char '\'' + else char '\x2018' <> inner <> char '\x2019' +inlineToLaTeX (Str str) = liftM text $ stringToLaTeX False str inlineToLaTeX (Math InlineMath str) = return $ char '$' <> text str <> char '$' inlineToLaTeX (Math DisplayMath str) = return $ "\\[" <> text str <> "\\]" inlineToLaTeX (RawInline "latex" str) = return $ text str @@ -561,13 +567,18 @@ inlineToLaTeX (RawInline "tex" str) = return $ text str inlineToLaTeX (RawInline _ _) = return empty inlineToLaTeX (LineBreak) = return "\\\\" inlineToLaTeX Space = return space +inlineToLaTeX (Link txt ('#':ident, _)) = do + contents <- inlineListToLaTeX txt + ident' <- stringToLaTeX False ident + return $ text "\\hyperref" <> brackets (text ident') <> braces contents inlineToLaTeX (Link txt (src, _)) = case txt of [Code _ x] | x == src -> -- autolink do modify $ \s -> s{ stUrl = True } return $ text $ "\\url{" ++ x ++ "}" _ -> do contents <- inlineListToLaTeX txt - return $ text ("\\href{" ++ stringToLaTeX True src ++ "}{") <> + src' <- stringToLaTeX True src + return $ text ("\\href{" ++ src' ++ "}{") <> contents <> char '}' inlineToLaTeX (Image _ (source, _)) = do modify $ \s -> s{ stGraphics = True } @@ -580,13 +591,16 @@ inlineToLaTeX (Note contents) = do contents' <- blockListToLaTeX contents modify (\s -> s {stInNote = False}) inTable <- liftM stInTable get + let optnl = case reverse contents of + (CodeBlock _ _ : _) -> cr + _ -> empty if inTable then do curnotes <- liftM stTableNotes get let marker = cycle ['a'..'z'] !! length curnotes modify $ \s -> s{ stTableNotes = (marker, contents') : curnotes } return $ "\\tmark" <> brackets (char marker) <> space - else return $ "\\footnote" <> braces (nest 2 contents') + else return $ "\\footnote" <> braces (nest 2 contents' <> optnl) -- note: a \n before } needed when note ends with a Verbatim environment citationsToNatbib :: [Citation] -> State WriterState Doc diff --git a/src/Text/Pandoc/Writers/Man.hs b/src/Text/Pandoc/Writers/Man.hs index d3735efa7..c481e6c87 100644 --- a/src/Text/Pandoc/Writers/Man.hs +++ b/src/Text/Pandoc/Writers/Man.hs @@ -112,7 +112,11 @@ escapeString = escapeStringUsing manEscapes -- | Escape a literal (code) section for Man. escapeCode :: String -> String -escapeCode = escapeStringUsing (manEscapes ++ backslashEscapes "\t ") +escapeCode = concat . intersperse "\n" . map escapeLine . lines where + escapeLine codeline = + case escapeStringUsing (manEscapes ++ backslashEscapes "\t ") codeline of + a@('.':_) -> "\\&" ++ a + b -> b -- We split inline lists into sentences, and print one sentence per -- line. groff/troff treats the line-ending period differently. @@ -122,15 +126,18 @@ escapeCode = escapeStringUsing (manEscapes ++ backslashEscapes "\t ") breakSentence :: [Inline] -> ([Inline], [Inline]) breakSentence [] = ([],[]) breakSentence xs = - let isSentenceEndInline (Str ".") = True - isSentenceEndInline (Str "?") = True + let isSentenceEndInline (Str ys@(_:_)) | last ys == '.' = True + isSentenceEndInline (Str ys@(_:_)) | last ys == '?' = True + isSentenceEndInline (LineBreak) = True isSentenceEndInline _ = False (as, bs) = break isSentenceEndInline xs in case bs of [] -> (as, []) [c] -> (as ++ [c], []) (c:Space:cs) -> (as ++ [c], cs) - (Str ".":Str ")":cs) -> (as ++ [Str ".", Str ")"], cs) + (Str ".":Str (')':ys):cs) -> (as ++ [Str ".", Str (')':ys)], cs) + (x@(Str ('.':')':_)):cs) -> (as ++ [x], cs) + (LineBreak:x@(Str ('.':_)):cs) -> (as ++[LineBreak], x:cs) (c:cs) -> (as ++ [c] ++ ds, es) where (ds, es) = breakSentence cs @@ -279,7 +286,7 @@ blockListToMan opts blocks = inlineListToMan :: WriterOptions -> [Inline] -> State WriterState Doc -- if list starts with ., insert a zero-width character \& so it -- won't be interpreted as markup if it falls at the beginning of a line. -inlineListToMan opts lst@(Str "." : _) = mapM (inlineToMan opts) lst >>= +inlineListToMan opts lst@(Str ('.':_) : _) = mapM (inlineToMan opts) lst >>= (return . (text "\\&" <>) . hcat) inlineListToMan opts lst = mapM (inlineToMan opts) lst >>= (return . hcat) diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs index 7ce939395..32b28a770 100644 --- a/src/Text/Pandoc/Writers/Markdown.hs +++ b/src/Text/Pandoc/Writers/Markdown.hs @@ -35,8 +35,7 @@ import Text.Pandoc.Definition import Text.Pandoc.Generic import Text.Pandoc.Templates (renderTemplate) import Text.Pandoc.Shared -import Text.Pandoc.Parsing hiding (blankline) -import Text.ParserCombinators.Parsec ( runParser, GenParser ) +import Text.Pandoc.Parsing hiding (blankline, char, space) import Data.List ( group, isPrefixOf, find, intersperse, transpose ) import Text.Pandoc.Pretty import Control.Monad.State @@ -152,7 +151,7 @@ noteToMarkdown opts num blocks = do -- | Escape special characters for Markdown. escapeString :: String -> String escapeString = escapeStringUsing markdownEscapes - where markdownEscapes = backslashEscapes "\\`*_>#~^" + where markdownEscapes = backslashEscapes "\\`*_$<>#~^" -- | Construct table of contents from list of header blocks. tableOfContents :: WriterOptions -> [Block] -> Doc @@ -188,7 +187,7 @@ attrsToMarkdown attribs = braces $ hsep [attribId, attribClasses, attribKeys] <> "=\"" <> text v <> "\"") ks -- | Ordered list start parser for use in Para below. -olMarker :: GenParser Char ParserState Char +olMarker :: Parser [Char] ParserState Char olMarker = do (start, style', delim) <- anyOrderedListMarker if delim == Period && (style' == UpperAlpha || (style' == UpperRoman && @@ -218,7 +217,7 @@ blockToMarkdown opts (Para inlines) = do let esc = if (not (writerStrictMarkdown opts)) && not (stPlain st) && beginsWithOrderedListMarker (render Nothing contents) - then text "\\" + then text "\x200B" -- zero-width space, a hack else empty return $ esc <> contents <> blankline blockToMarkdown _ (RawBlock f str) @@ -254,7 +253,7 @@ blockToMarkdown opts (CodeBlock attribs str) = return $ if writerStrictMarkdown opts || attribs == nullAttr then nest (writerTabStop opts) (text str) <> blankline else -- use delimited code block - flush (tildes <> space <> attrs <> cr <> text str <> + (tildes <> space <> attrs <> cr <> text str <> cr <> tildes) <> blankline where tildes = text "~~~~" attrs = attrsToMarkdown attribs @@ -355,13 +354,13 @@ definitionListItemToMarkdown opts (label, defs) = do labelText <- inlineListToMarkdown opts label let tabStop = writerTabStop opts st <- get - let leader = if stPlain st then " " else " ~" + let leader = if stPlain st then " " else ": " let sps = case writerTabStop opts - 3 of n | n > 0 -> text $ replicate n ' ' _ -> text " " defs' <- mapM (mapM (blockToMarkdown opts)) defs let contents = vcat $ map (\d -> hang tabStop (leader <> sps) $ vcat d <> cr) defs' - return $ labelText <> cr <> contents <> cr + return $ nowrap labelText <> cr <> contents <> cr -- | Convert list of Pandoc block elements to markdown. blockListToMarkdown :: WriterOptions -- ^ Options @@ -516,9 +515,9 @@ inlineToMarkdown opts (Link txt (src, tit)) = do else "[" <> linktext <> "](" <> text src <> linktitle <> ")" inlineToMarkdown opts (Image alternate (source, tit)) = do - let txt = if (null alternate) || (alternate == [Str ""]) || - (alternate == [Str source]) -- to prevent autolinks - then [Str "image"] + let txt = if null alternate || alternate == [Str source] + -- to prevent autolinks + then [Str ""] else alternate linkPart <- inlineToMarkdown opts (Link txt (source, tit)) return $ "!" <> linkPart diff --git a/src/Text/Pandoc/Writers/MediaWiki.hs b/src/Text/Pandoc/Writers/MediaWiki.hs index f31a2c2d1..b32c5327d 100644 --- a/src/Text/Pandoc/Writers/MediaWiki.hs +++ b/src/Text/Pandoc/Writers/MediaWiki.hs @@ -149,6 +149,7 @@ blockToMediaWiki opts (Table capt aligns widths headers rows') = do blockToMediaWiki opts x@(BulletList items) = do oldUseTags <- get >>= return . stUseTags + listLevel <- get >>= return . stListLevel let useTags = oldUseTags || not (isSimpleList x) if useTags then do @@ -160,10 +161,11 @@ blockToMediaWiki opts x@(BulletList items) = do modify $ \s -> s { stListLevel = stListLevel s ++ "*" } contents <- mapM (listItemToMediaWiki opts) items modify $ \s -> s { stListLevel = init (stListLevel s) } - return $ vcat contents ++ "\n" + return $ vcat contents ++ if null listLevel then "\n" else "" blockToMediaWiki opts x@(OrderedList attribs items) = do oldUseTags <- get >>= return . stUseTags + listLevel <- get >>= return . stListLevel let useTags = oldUseTags || not (isSimpleList x) if useTags then do @@ -175,10 +177,11 @@ blockToMediaWiki opts x@(OrderedList attribs items) = do modify $ \s -> s { stListLevel = stListLevel s ++ "#" } contents <- mapM (listItemToMediaWiki opts) items modify $ \s -> s { stListLevel = init (stListLevel s) } - return $ vcat contents ++ "\n" + return $ vcat contents ++ if null listLevel then "\n" else "" blockToMediaWiki opts x@(DefinitionList items) = do oldUseTags <- get >>= return . stUseTags + listLevel <- get >>= return . stListLevel let useTags = oldUseTags || not (isSimpleList x) if useTags then do @@ -190,7 +193,7 @@ blockToMediaWiki opts x@(DefinitionList items) = do modify $ \s -> s { stListLevel = stListLevel s ++ ";" } contents <- mapM (definitionListItemToMediaWiki opts) items modify $ \s -> s { stListLevel = init (stListLevel s) } - return $ vcat contents ++ "\n" + return $ vcat contents ++ if null listLevel then "\n" else "" -- Auxiliary functions for lists: diff --git a/src/Text/Pandoc/Writers/Org.hs b/src/Text/Pandoc/Writers/Org.hs index 4c77ba7c6..7eb943a22 100644 --- a/src/Text/Pandoc/Writers/Org.hs +++ b/src/Text/Pandoc/Writers/Org.hs @@ -139,9 +139,9 @@ blockToOrg (CodeBlock (_,classes,_) str) = do "ledger", "lisp", "matlab", "mscgen", "ocaml", "octave", "oz", "perl", "plantuml", "python", "R", "ruby", "sass", "scheme", "screen", "sh", "sql", "sqlite"] - let (beg, end) = if null at - then ("#+BEGIN_EXAMPLE", "#+END_EXAMPLE") - else ("#+BEGIN_SRC" ++ head at, "#+END_SRC") + let (beg, end) = case at of + [] -> ("#+BEGIN_EXAMPLE", "#+END_EXAMPLE") + (x:_) -> ("#+BEGIN_SRC " ++ x, "#+END_SRC") return $ text beg $$ nest tabstop (text str) $$ text end $$ blankline blockToOrg (BlockQuote blocks) = do contents <- blockListToOrg blocks diff --git a/src/Text/Pandoc/Writers/RST.hs b/src/Text/Pandoc/Writers/RST.hs index d6e5b5c9e..d98079940 100644 --- a/src/Text/Pandoc/Writers/RST.hs +++ b/src/Text/Pandoc/Writers/RST.hs @@ -38,6 +38,7 @@ import Data.List ( isPrefixOf, intersperse, transpose ) import Text.Pandoc.Pretty import Control.Monad.State import Control.Applicative ( (<$>) ) +import Data.Char (isSpace) type Refs = [([Inline], Target)] @@ -96,7 +97,7 @@ keyToRST (label, (src, _)) = do let label'' = if ':' `elem` (render Nothing label') then char '`' <> label' <> char '`' else label' - return $ ".. _" <> label'' <> ": " <> text src + return $ nowrap $ ".. _" <> label'' <> ": " <> text src -- | Return RST representation of notes. notesToRST :: [[Block]] -> State WriterState Doc @@ -253,7 +254,52 @@ blockListToRST blocks = mapM blockToRST blocks >>= return . vcat -- | Convert list of Pandoc inline elements to RST. inlineListToRST :: [Inline] -> State WriterState Doc -inlineListToRST lst = mapM inlineToRST lst >>= return . hcat +inlineListToRST lst = mapM inlineToRST (insertBS lst) >>= return . hcat + where insertBS :: [Inline] -> [Inline] -- insert '\ ' where needed + insertBS (x:y:z:zs) + | isComplex y && surroundComplex x z = + x : y : RawInline "rst" "\\ " : insertBS (z:zs) + insertBS (x:y:zs) + | isComplex x && not (okAfterComplex y) = + x : RawInline "rst" "\\ " : insertBS (y : zs) + | isComplex y && not (okBeforeComplex x) = + x : RawInline "rst" "\\ " : insertBS (y : zs) + | otherwise = + x : insertBS (y : zs) + insertBS (x:ys) = x : insertBS ys + insertBS [] = [] + surroundComplex :: Inline -> Inline -> Bool + surroundComplex (Str s@(_:_)) (Str s'@(_:_)) = + case (last s, head s') of + ('\'','\'') -> True + ('"','"') -> True + ('<','>') -> True + ('[',']') -> True + ('{','}') -> True + _ -> False + surroundComplex _ _ = False + okAfterComplex :: Inline -> Bool + okAfterComplex Space = True + okAfterComplex LineBreak = True + okAfterComplex (Str (c:_)) = isSpace c || c `elem` "-.,:;!?\\/'\")]}>–—" + okAfterComplex _ = False + okBeforeComplex :: Inline -> Bool + okBeforeComplex Space = True + okBeforeComplex LineBreak = True + okBeforeComplex (Str (c:_)) = isSpace c || c `elem` "-:/'\"<([{–—" + okBeforeComplex _ = False + isComplex :: Inline -> Bool + isComplex (Emph _) = True + isComplex (Strong _) = True + isComplex (SmallCaps _) = True + isComplex (Strikeout _) = True + isComplex (Superscript _) = True + isComplex (Subscript _) = True + isComplex (Link _ _) = True + isComplex (Image _ _) = True + isComplex (Code _ _) = True + isComplex (Math _ _) = True + isComplex _ = False -- | Convert Pandoc inline element to RST. inlineToRST :: Inline -> State WriterState Doc @@ -268,10 +314,10 @@ inlineToRST (Strikeout lst) = do return $ "[STRIKEOUT:" <> contents <> "]" inlineToRST (Superscript lst) = do contents <- inlineListToRST lst - return $ "\\ :sup:`" <> contents <> "`\\ " + return $ ":sup:`" <> contents <> "`" inlineToRST (Subscript lst) = do contents <- inlineListToRST lst - return $ "\\ :sub:`" <> contents <> "`\\ " + return $ ":sub:`" <> contents <> "`" inlineToRST (SmallCaps lst) = inlineListToRST lst inlineToRST (Quoted SingleQuote lst) = do contents <- inlineListToRST lst @@ -286,11 +332,12 @@ inlineToRST (Str str) = return $ text $ escapeString str inlineToRST (Math t str) = do modify $ \st -> st{ stHasMath = True } return $ if t == InlineMath - then ":math:`" <> text str <> "`" <> beforeNonBlank "\\ " + then ":math:`" <> text str <> "`" else if '\n' `elem` str then blankline $$ ".. math::" $$ blankline $$ nest 3 (text str) $$ blankline else blankline $$ (".. math:: " <> text str) $$ blankline +inlineToRST (RawInline "rst" x) = return $ text x inlineToRST (RawInline _ _) = return empty inlineToRST (LineBreak) = return cr -- there's no line break in RST inlineToRST Space = return space diff --git a/src/Text/Pandoc/Writers/Texinfo.hs b/src/Text/Pandoc/Writers/Texinfo.hs index 563ad7044..6bb782899 100644 --- a/src/Text/Pandoc/Writers/Texinfo.hs +++ b/src/Text/Pandoc/Writers/Texinfo.hs @@ -344,7 +344,8 @@ inlineListToTexinfo lst = mapM inlineToTexinfo lst >>= return . hcat -- | Convert list of inline elements to Texinfo acceptable for a node name. inlineListForNode :: [Inline] -- ^ Inlines to convert -> State WriterState Doc -inlineListForNode = return . text . filter (not . disallowedInNode) . stringify +inlineListForNode = return . text . stringToTexinfo . + filter (not . disallowedInNode) . stringify -- periods, commas, colons, and parentheses are disallowed in node names disallowedInNode :: Char -> Bool @@ -415,7 +416,7 @@ inlineToTexinfo (Image alternate (source, _)) = do text (ext ++ "}") where ext = drop 1 $ takeExtension source' - base = takeBaseName source' + base = dropExtension source' source' = if isAbsoluteURI source then source else unEscapeString source diff --git a/src/Text/Pandoc/XML.hs b/src/Text/Pandoc/XML.hs index 7a1c8bdd8..31279c3bb 100644 --- a/src/Text/Pandoc/XML.hs +++ b/src/Text/Pandoc/XML.hs @@ -38,7 +38,7 @@ module Text.Pandoc.XML ( stripTags, fromEntities ) where import Text.Pandoc.Pretty -import Data.Char (ord, isAscii) +import Data.Char (ord, isAscii, isSpace) import Text.HTML.TagSoup.Entity (lookupEntity) -- | Remove everything between <...> @@ -106,8 +106,8 @@ fromEntities :: String -> String fromEntities ('&':xs) = case lookupEntity ent of Just c -> c : fromEntities rest - Nothing -> '&' : fromEntities rest - where (ent, rest) = case break (==';') xs of + Nothing -> '&' : fromEntities xs + where (ent, rest) = case break (\c -> isSpace c || c == ';') xs of (zs,';':ys) -> (zs,ys) _ -> ("",xs) fromEntities (x:xs) = x : fromEntities xs |