From 75485c2f112cdc2e1f95f871d01cc356510166ae Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 29 Jan 2012 23:54:00 -0800 Subject: Complete rewrite of LaTeX reader. * The new reader is more robust, accurate, and extensible. It is still quite incomplete, but it should be easier now to add features. * Text.Pandoc.Parsing: Added withRaw combinator. * Markdown reader: do escapedChar before raw latex inline. Otherwise we capture commands like \{. * Fixed latex citation tests for new citeproc. * Handle \include{} commands in latex. This is done in pandoc.hs, not the (pure) latex reader. But the reader exports the needed function, handleIncludes. * Moved err and warn from pandoc.hs to Shared. * Fixed tests - raw tex should sometimes have trailing space. * Updated lhs-test for highlighting-kate changes. --- src/Text/Pandoc/Readers/LaTeX.hs | 1697 +++++++++++++++-------------------- src/Text/Pandoc/Readers/Markdown.hs | 21 +- 2 files changed, 737 insertions(+), 981 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 3d9689168..9eb9eb2f9 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2006-2010 John MacFarlane +Copyright (C) 2006-2012 John MacFarlane This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,8 +18,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.LaTeX - Copyright : Copyright (C) 2006-2010 John MacFarlane - License : GNU GPL, version 2 or above + Copyright : Copyright (C) 2006-2012 John MacFarlane + License : GNU GPL, version 2 or above Maintainer : John MacFarlane Stability : alpha @@ -27,20 +27,24 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of LaTeX to 'Pandoc' document. -} -module Text.Pandoc.Readers.LaTeX ( - readLaTeX, - rawLaTeXInline, - rawLaTeXEnvironment' +module Text.Pandoc.Readers.LaTeX ( readLaTeX, + rawLaTeXInline, + rawLaTeXBlock, + handleIncludes ) where -import Text.ParserCombinators.Parsec +import Text.ParserCombinators.Parsec hiding ((<|>), space, many, optional) import Text.Pandoc.Definition import Text.Pandoc.Shared import Text.Pandoc.Parsing -import Data.Maybe ( fromMaybe ) -import Data.Char ( chr, toUpper ) -import Data.List ( intercalate, isPrefixOf, isSuffixOf ) +import Data.Char ( chr, ord ) import Control.Monad +import Text.Pandoc.Builder +import Data.Char (isLetter) +import Control.Applicative +import Data.Monoid +import System.FilePath (replaceExtension) +import qualified Data.Map as M -- | Parse LaTeX from string and return 'Pandoc' document. readLaTeX :: ParserState -- ^ Parser state, including options for parser @@ -48,1002 +52,757 @@ readLaTeX :: ParserState -- ^ Parser state, including options for parser -> Pandoc readLaTeX = readWith parseLaTeX --- characters with special meaning -specialChars :: [Char] -specialChars = "\\`$%^&_~#{}[]\n \t|<>'\"-" - --- --- utility functions --- - --- | Returns text between brackets and its matching pair. -bracketedText :: Char -> Char -> GenParser Char st [Char] -bracketedText openB closeB = do - result <- charsInBalanced openB closeB anyChar - return $ [openB] ++ result ++ [closeB] - --- | Returns an option or argument of a LaTeX command. -optOrArg :: GenParser Char st [Char] -optOrArg = try $ spaces >> (bracketedText '{' '}' <|> bracketedText '[' ']') - --- | True if the string begins with '{'. -isArg :: [Char] -> Bool -isArg ('{':_) = True -isArg _ = False +parseLaTeX :: LP Pandoc +parseLaTeX = do + bs <- blocks + eof + st <- getState + let title' = stateTitle st + let authors' = stateAuthors st + let date' = stateDate st + return $ Pandoc (Meta title' authors' date') $ toList bs --- | Returns list of options and arguments of a LaTeX command. -commandArgs :: GenParser Char st [[Char]] -commandArgs = many optOrArg +type LP = GenParser Char ParserState --- | Parses LaTeX command, returns (name, star, list of options or arguments). -command :: GenParser Char st ([Char], [Char], [[Char]]) -command = do +anyControlSeq :: LP String +anyControlSeq = do char '\\' - name <- many1 letter - star <- option "" (string "*") -- some commands have starred versions - args <- commandArgs - return (name, star, args) - -begin :: [Char] -> GenParser Char st [Char] -begin name = try $ do - string "\\begin" - spaces - char '{' - string name - char '}' - optional commandArgs - spaces + next <- option '\n' anyChar + name <- case next of + '\n' -> return "" + c | isLetter c -> (c:) <$> (many letter <* optional sp) + | otherwise -> return [c] return name -end :: [Char] -> GenParser Char st [Char] -end name = try $ do - string "\\end" - spaces - char '{' - string name - char '}' +controlSeq :: String -> LP String +controlSeq name = try $ do + char '\\' + case name of + "" -> mzero + [c] | not (isLetter c) -> string [c] + cs -> string cs <* optional sp return name --- | Returns a list of block elements containing the contents of an --- environment. -environment :: [Char] -> GenParser Char ParserState [Block] -environment name = try $ begin name >> spaces >> manyTill block (end name) >>~ spaces - -anyEnvironment :: GenParser Char ParserState Block -anyEnvironment = try $ do - string "\\begin" - spaces +sp :: LP () +sp = skipMany1 $ satisfy (\c -> c == ' ' || c == '\t') + <|> (try $ newline >>~ lookAhead anyChar >>~ notFollowedBy blankline) + +isLowerHex :: Char -> Bool +isLowerHex x = x >= '0' && x <= '9' || x >= 'a' && x <= 'f' + +tildeEscape :: LP Char +tildeEscape = try $ do + string "^^" + c <- satisfy (\x -> x >= '\0' && x <= '\128') + d <- if isLowerHex c + then option "" $ count 1 (satisfy isLowerHex) + else return "" + if null d + then case ord c of + x | x >= 64 && x <= 127 -> return $ chr (x - 64) + | otherwise -> return $ chr (x + 64) + else return $ chr $ read ('0':'x':c:d) + +comment :: LP () +comment = do + char '%' + skipMany (satisfy (/='\n')) + newline + return () + +grouped :: Monoid a => LP a -> LP a +grouped parser = try $ char '{' *> (mconcat <$> manyTill parser (char '}')) + +braced :: LP String +braced = char '{' *> (concat <$> manyTill + ( many1 (satisfy (\c -> c /= '\\' && c /= '}' && c /= '{')) + <|> try (string "\\}") + <|> try (string "\\{") + <|> ((\x -> "{" ++ x ++ "}") <$> braced) + <|> count 1 anyChar + ) (char '}')) + +bracketed :: Monoid a => LP a -> LP a +bracketed parser = try $ char '[' *> (mconcat <$> manyTill parser (char ']')) + +trim :: String -> String +trim = removeLeadingTrailingSpace + +mathDisplay :: LP String -> LP Inlines +mathDisplay p = displayMath <$> (try p >>= applyMacros' . trim) + +mathInline :: LP String -> LP Inlines +mathInline p = math <$> (try p >>= applyMacros') + +double_quote :: LP Inlines +double_quote = (doubleQuoted . mconcat) <$> + (try $ string "``" *> manyTill inline (try $ string "''")) + +single_quote :: LP Inlines +single_quote = (singleQuoted . mconcat) <$> + (try $ char '`' *> manyTill inline (try $ char '\'' >> notFollowedBy letter)) + +inline :: LP Inlines +inline = (mempty <$ comment) + <|> (space <$ sp) + <|> inlineText + <|> inlineCommand + <|> grouped inline + <|> (char '-' *> option (str "-") + ((char '-') *> option (str "–") (str "—" <$ char '-'))) + <|> double_quote + <|> single_quote + <|> (str "’" <$ char '\'') + <|> (str "\160" <$ char '~') + <|> (mathDisplay $ string "$$" *> manyTill anyChar (try $ string "$$")) + <|> (mathInline $ char '$' *> manyTill anyChar (char '$')) + <|> (superscript <$> (char '^' *> tok)) + <|> (subscript <$> (char '_' *> tok)) + <|> (failUnlessLHS *> char '|' *> doLHSverb) + <|> (str <$> count 1 tildeEscape) + <|> (str <$> string "]") + <|> (str <$> count 1 (satisfy (\c -> c /= '\\' && c /='\n' && c /='}' && c /='{'))) -- eat random leftover characters + +inlines :: LP Inlines +inlines = mconcat <$> many (notFollowedBy (char '}') *> inline) + +block :: LP Blocks +block = (mempty <$ comment) + <|> (mempty <$ ((spaceChar <|> blankline) *> spaces)) + <|> environment + <|> mempty <$ macro -- TODO improve macros, make them work everywhere + <|> blockCommand + <|> grouped block + <|> paragraph + + +blocks :: LP Blocks +blocks = mconcat <$> many block + +blockCommand :: LP Blocks +blockCommand = try $ do + name <- anyControlSeq + star <- option "" (string "*" <* optional sp) + let name' = name ++ star + case M.lookup name' blockCommands of + Just p -> p + Nothing -> case M.lookup name blockCommands of + Just p -> p + Nothing -> mzero + +inBrackets :: Inlines -> Inlines +inBrackets x = (str "[") <> x <> (str "]") + +blockCommands :: M.Map String (LP Blocks) +blockCommands = M.fromList + [ ("par", pure mempty) + , ("title", mempty <$ (tok >>= addTitle)) + , ("subtitle", mempty <$ (tok >>= addSubtitle)) + , ("author", mempty <$ authors) + , ("date", mempty <$ (tok >>= addDate)) + , ("maketitle", pure mempty) + -- \ignore{} is used conventionally in literate haskell for definitions + -- that are to be processed by the compiler but not printed. + , ("ignore", mempty <$ tok) + , ("hyperdef", mempty <$ (tok *> tok)) + , ("chapter", updateState (\s -> s{ stateHasChapters = True }) *> section 0) + , ("section", section 1) + , ("subsection", section 2) + , ("subsubsection", section 3) + , ("paragraph", section 4) + , ("subparagraph", section 5) + , ("opening", (para . trimInlines) <$> tok) + , ("closing", (para . trimInlines) <$> tok) + , ("rule", optional opt *> tok *> tok *> pure horizontalRule) + , ("begin", mzero) -- these are here so they won't be interpreted as inline + , ("end", mzero) + , ("item", loose_item) + , ("documentclass", optional opt *> braced *> preamble) + -- should be parsed by macro, but we need this + -- here so these aren't parsed as inline + , ("newcommand", mempty <$ (tok *> optional opt *> tok)) + , ("renewcommand", mempty <$ (tok *> optional opt *> tok)) + , ("newenvironment", mempty <$ (tok *> tok *> tok)) + , ("renewenvironment", mempty <$ (tok *> tok *> tok)) + , ("special", pure mempty) + , ("pdfannot", pure mempty) + , ("pdfstringdef", pure mempty) + , ("index", pure mempty) + , ("bibliography", pure mempty) + ] + +addTitle :: Inlines -> LP () +addTitle tit = updateState (\s -> s{ stateTitle = toList tit }) + +addSubtitle :: Inlines -> LP () +addSubtitle tit = updateState (\s -> s{ stateTitle = stateTitle s ++ + toList (str ":" <> linebreak <> tit) }) + +authors :: LP () +authors = try $ do char '{' - name <- many letter - star <- option "" (string "*") -- some environments have starred variants - char '}' - optional commandArgs - spaces - contents <- manyTill block (end (name ++ star)) - spaces - return $ BlockQuote contents - --- --- parsing documents --- + let oneAuthor = mconcat <$> many1 (notFollowedBy' (controlSeq "and") >> inline) + auths <- sepBy oneAuthor (controlSeq "and") + updateState (\s -> s { stateAuthors = map (normalizeSpaces . toList) auths }) + +addDate :: Inlines -> LP () +addDate dat = updateState (\s -> s{ stateDate = toList dat }) + +section :: Int -> LP Blocks +section lvl = do + hasChapters <- stateHasChapters `fmap` getState + let lvl' = if hasChapters then lvl + 1 else lvl + optional sp + optional opt + contents <- grouped inline + return $ header lvl' contents + +inlineCommand :: LP Inlines +inlineCommand = try $ do + name <- anyControlSeq + guard $ not $ isBlockCommand name + parseRaw <- stateParseRaw `fmap` getState + star <- option "" (string "*") + let name' = name ++ star + case M.lookup name' inlineCommands of + Just p -> p + Nothing -> case M.lookup name inlineCommands of + Just p -> p + Nothing + | parseRaw -> + (rawInline "latex" . (('\\':name') ++)) <$> + (withRaw (optional opt *> many braced) + >>= applyMacros' . snd) + | otherwise -> return mempty + +isBlockCommand :: String -> Bool +isBlockCommand s = maybe False (const True) $ M.lookup s blockCommands + +inlineCommands :: M.Map String (LP Inlines) +inlineCommands = M.fromList + [ ("emph", emph <$> tok) + , ("textit", emph <$> tok) + , ("textsc", smallcaps <$> tok) + , ("sout", strikeout <$> tok) + , ("textsuperscript", superscript <$> tok) + , ("textsubscript", subscript <$> tok) + , ("textbackslash", lit "\\") + , ("backslash", lit "\\") + , ("textbf", strong <$> tok) + , ("ldots", lit "…") + , ("dots", lit "…") + , ("mdots", lit "…") + , ("sim", lit "~") + , ("label", inBrackets <$> tok) + , ("ref", inBrackets <$> tok) + , ("(", mathInline $ manyTill anyChar (try $ string "\\)")) + , ("[", mathDisplay $ manyTill anyChar (try $ string "\\]")) + , ("ensuremath", mathInline $ braced) + , ("$", lit "$") + , ("%", lit "%") + , ("&", lit "&") + , ("#", lit "#") + , ("_", lit "_") + , ("{", lit "{") + , ("}", lit "}") + -- old TeX commands + , ("em", emph <$> inlines) + , ("it", emph <$> inlines) + , ("sl", emph <$> inlines) + , ("bf", strong <$> inlines) + , ("rm", inlines) + , ("itshape", emph <$> inlines) + , ("slshape", emph <$> inlines) + , ("scshape", smallcaps <$> inlines) + , ("bfseries", strong <$> inlines) + , ("/", pure mempty) -- italic correction + , ("cc", lit "ç") + , ("cC", lit "Ç") + , ("aa", lit "å") + , ("AA", lit "Å") + , ("ss", lit "ß") + , ("o", lit "ø") + , ("O", lit "Ø") + , ("L", lit "Ł") + , ("l", lit "ł") + , ("ae", lit "æ") + , ("AE", lit "Æ") + , ("pounds", lit "£") + , ("euro", lit "€") + , ("copyright", lit "©") + , ("sect", lit "§") + , ("`", option (str "`") $ try $ tok >>= accent grave) + , ("'", option (str "'") $ try $ tok >>= accent acute) + , ("^", option (str "^") $ try $ tok >>= accent hat) + , ("~", option (str "~") $ try $ tok >>= accent circ) + , ("\"", option (str "\"") $ try $ tok >>= accent umlaut) + , ("i", lit "i") + , ("\\", linebreak <$ optional (bracketed inline *> optional sp)) + , (",", pure mempty) + , ("@", pure mempty) + , (" ", lit "\160") + , ("bar", lit "|") + , ("textless", lit "<") + , ("textgreater", lit ">") + , ("thanks", (note . mconcat) <$> (char '{' *> manyTill block (char '}'))) + , ("footnote", (note . mconcat) <$> (char '{' *> manyTill block (char '}'))) + , ("verb", doverb) + , ("lstinline", doverb) + , ("texttt", (code . stringify . toList) <$> tok) + , ("url", (unescapeURL <$> braced) >>= \url -> + pure (link url "" (codeWith ("",["url"],[]) url))) + , ("href", (unescapeURL <$> braced <* optional sp) >>= \url -> + tok >>= \lab -> + pure (link url "" lab)) + , ("includegraphics", optional opt *> (unescapeURL <$> braced) >>= + (\src -> pure (image src "" (str "image")))) + , ("cite", citation NormalCitation False) + , ("citep", citation NormalCitation False) + , ("citep*", citation NormalCitation False) + , ("citeal", citation NormalCitation False) + , ("citealp", citation NormalCitation False) + , ("citealp*", citation NormalCitation False) + , ("autocite", citation NormalCitation False) + , ("footcite", citation NormalCitation False) + , ("parencite", citation NormalCitation False) + , ("supercite", citation NormalCitation False) + , ("footcitetext", citation NormalCitation False) + , ("citeyearpar", citation SuppressAuthor False) + , ("citeyear", citation SuppressAuthor False) + , ("autocite*", citation SuppressAuthor False) + , ("cite*", citation SuppressAuthor False) + , ("parencite*", citation SuppressAuthor False) + , ("textcite", citation AuthorInText False) + , ("citet", citation AuthorInText False) + , ("citet*", citation AuthorInText False) + , ("citealt", citation AuthorInText False) + , ("citealt*", citation AuthorInText False) + , ("textcites", citation AuthorInText True) + , ("cites", citation NormalCitation True) + , ("autocites", citation NormalCitation True) + , ("footcites", citation NormalCitation True) + , ("parencites", citation NormalCitation True) + , ("supercites", citation NormalCitation True) + , ("footcitetexts", citation NormalCitation True) + , ("Autocite", citation NormalCitation False) + , ("Footcite", citation NormalCitation False) + , ("Parencite", citation NormalCitation False) + , ("Supercite", citation NormalCitation False) + , ("Footcitetext", citation NormalCitation False) + , ("Citeyearpar", citation SuppressAuthor False) + , ("Citeyear", citation SuppressAuthor False) + , ("Autocite*", citation SuppressAuthor False) + , ("Cite*", citation SuppressAuthor False) + , ("Parencite*", citation SuppressAuthor False) + , ("Textcite", citation AuthorInText False) + , ("Textcites", citation AuthorInText True) + , ("Cites", citation NormalCitation True) + , ("Autocites", citation NormalCitation True) + , ("Footcites", citation NormalCitation True) + , ("Parencites", citation NormalCitation True) + , ("Supercites", citation NormalCitation True) + , ("Footcitetexts", citation NormalCitation True) + , ("citetext", complexNatbibCitation NormalCitation) + , ("citeauthor", (try (tok *> optional sp *> controlSeq "citetext") *> + complexNatbibCitation AuthorInText) + <|> citation AuthorInText False) + ] + +unescapeURL :: String -> String +unescapeURL ('\\':x:xs) | isEscapable x = x:unescapeURL xs + where isEscapable '%' = True + isEscapable '#' = True + isEscapable _ = False +unescapeURL (x:xs) = x:unescapeURL xs +unescapeURL [] = "" + +doverb :: LP Inlines +doverb = do + marker <- anyChar + code <$> manyTill (satisfy (/='\n')) (char marker) + +doLHSverb :: LP Inlines +doLHSverb = codeWith ("",["haskell"],[]) <$> manyTill (satisfy (/='\n')) (char '|') + +lit :: String -> LP Inlines +lit = pure . str + +accent :: (Char -> Char) -> Inlines -> LP Inlines +accent f ils = + case toList ils of + (Str (x:xs) : ys) -> return $ fromList $ (Str (f x : xs) : ys) + [] -> mzero + _ -> return ils + +grave :: Char -> Char +grave 'A' = 'À' +grave 'E' = 'È' +grave 'I' = 'Ì' +grave 'O' = 'Ò' +grave 'U' = 'Ù' +grave 'a' = 'à' +grave 'e' = 'è' +grave 'i' = 'ì' +grave 'o' = 'ò' +grave 'u' = 'ù' +grave c = c + +acute :: Char -> Char +acute 'A' = 'Á' +acute 'E' = 'É' +acute 'I' = 'Í' +acute 'O' = 'Ó' +acute 'U' = 'Ú' +acute 'a' = 'á' +acute 'e' = 'é' +acute 'i' = 'í' +acute 'o' = 'ó' +acute 'u' = 'ú' +acute c = c + +hat :: Char -> Char +hat 'A' = 'Â' +hat 'E' = 'Ê' +hat 'I' = 'Î' +hat 'O' = 'Ô' +hat 'U' = 'Û' +hat 'a' = 'ã' +hat 'e' = 'ê' +hat 'i' = 'î' +hat 'o' = 'ô' +hat 'u' = 'û' +hat c = c + +circ :: Char -> Char +circ 'A' = 'Ã' +circ 'O' = 'Õ' +circ 'o' = 'õ' +circ 'N' = 'Ñ' +circ 'n' = 'ñ' +circ c = c + +umlaut :: Char -> Char +umlaut 'A' = 'Ä' +umlaut 'E' = 'Ë' +umlaut 'I' = 'Ï' +umlaut 'O' = 'Ö' +umlaut 'U' = 'Ü' +umlaut 'a' = 'ä' +umlaut 'e' = 'ë' +umlaut 'i' = 'ï' +umlaut 'o' = 'ö' +umlaut 'u' = 'ü' +umlaut c = c + +tok :: LP Inlines +tok = try $ grouped inline <|> inlineCommand <|> str <$> (count 1 $ inlineChar) + +opt :: LP Inlines +opt = bracketed inline <* optional sp + +inlineText :: LP Inlines +inlineText = str <$> many1 inlineChar + +inlineChar :: LP Char +inlineChar = satisfy $ \c -> + not (c == '\\' || c == '$' || c == '%' || c == '^' || c == '_' || + c == '&' || c == '~' || c == '#' || c == '{' || c == '}' || + c == '^' || c == '\'' || c == '`' || c == '-' || c == ']' || + c == ' ' || c == '\t' || c == '\n' ) + +environment :: LP Blocks +environment = do + controlSeq "begin" + name <- braced + parseRaw <- stateParseRaw `fmap` getState + let addBegin x = "\\begin{" ++ name ++ "}" ++ x + case M.lookup name environments of + Just p -> p + Nothing -> if parseRaw + then (rawBlock "latex" . addBegin) <$> + (withRaw (env name blocks) >>= applyMacros' . snd) + else env name blocks + +-- | Replace "include" commands with file contents. +handleIncludes :: String -> IO String +handleIncludes [] = return [] +handleIncludes ('\\':xs) = + case runParser include defaultParserState "input" ('\\':xs) of + Right (f, rest) -> do ys <- catch (readFile (replaceExtension f ".tex")) + (\e -> warn + ("could not open included file `" ++ + f ++ "': " ++ show e) >> return "") + (ys ++) `fmap` handleIncludes rest + _ -> case runParser verbatimEnv defaultParserState "input" ('\\':xs) of + Right (r, rest) -> (r ++) `fmap` handleIncludes rest + _ -> ('\\':) `fmap` handleIncludes xs +handleIncludes (x:xs) = (x:) `fmap` handleIncludes xs + +include :: LP (FilePath, String) +include = do + controlSeq "include" + f <- braced + rest <- getInput + return (f, rest) + +verbatimEnv :: LP (String, String) +verbatimEnv = do + (_,r) <- withRaw $ do + controlSeq "begin" + name <- braced + guard $ name == "verbatim" || name == "Verbatim" || + name == "lstlisting" + verbEnv name + rest <- getInput + return (r,rest) --- | Process LaTeX preamble, extracting metadata. -processLaTeXPreamble :: GenParser Char ParserState () -processLaTeXPreamble = do - try $ string "\\documentclass" - skipMany $ bibliographic <|> macro <|> commentBlock <|> skipChar +-- | Parse any LaTeX environment and return a string containing +-- the whole literal environment as raw TeX. +rawLaTeXBlock :: GenParser Char ParserState String +rawLaTeXBlock = + (rawLaTeXEnvironment <|> (snd <$> withRaw blockCommand)) >>= applyMacros' --- | Parse LaTeX and return 'Pandoc'. -parseLaTeX :: GenParser Char ParserState Pandoc -parseLaTeX = do - spaces - skipMany $ comment >> spaces - blocks <- try (processLaTeXPreamble >> environment "document") - <|> (many block >>~ (spaces >> eof)) - state <- getState - let blocks' = filter (/= Null) blocks - let title' = stateTitle state - let authors' = stateAuthors state - let date' = stateDate state - return $ Pandoc (Meta title' authors' date') blocks' - --- --- parsing blocks --- - -parseBlocks :: GenParser Char ParserState [Block] -parseBlocks = spaces >> many block - -block :: GenParser Char ParserState Block -block = choice [ hrule - , codeBlock - , header - , list - , blockQuote - , simpleTable - , commentBlock - , macro - , bibliographic - , para - , itemBlock - , unknownEnvironment - , ignore - , unknownCommand - ] "block" - --- --- header blocks --- - -header :: GenParser Char ParserState Block -header = section <|> chapter - -chapter :: GenParser Char ParserState Block -chapter = try $ do - string "\\chapter" - result <- headerWithLevel 1 - updateState $ \s -> s{ stateHasChapters = True } - return result - -section :: GenParser Char ParserState Block -section = try $ do - char '\\' - subs <- many (try (string "sub")) - base <- try (string "section" >> return 1) <|> (string "paragraph" >> return 4) - st <- getState - let lev = if stateHasChapters st - then length subs + base + 1 - else length subs + base - headerWithLevel lev +rawLaTeXEnvironment :: GenParser Char ParserState String +rawLaTeXEnvironment = try $ do + controlSeq "begin" + name <- braced + let addBegin x = "\\begin{" ++ name ++ "}" ++ x + addBegin <$> (withRaw (env name blocks) >>= applyMacros' . snd) -headerWithLevel :: Int -> GenParser Char ParserState Block -headerWithLevel lev = try $ do - spaces - optional (char '*') - spaces - optional $ bracketedText '[' ']' -- alt title +rawLaTeXInline :: GenParser Char ParserState Inline +rawLaTeXInline = do + (res, raw) <- withRaw inlineCommand + if res == mempty + then return (Str "") + else RawInline "latex" <$> (applyMacros' raw) + +environments :: M.Map String (LP Blocks) +environments = M.fromList + [ ("document", env "document" blocks) + , ("letter", env "letter" blocks) + , ("center", env "center" blocks) + , ("tabular", env "tabular" simpTable) + , ("quote", blockQuote <$> env "quote" blocks) + , ("quotation", blockQuote <$> env "quotation" blocks) + , ("itemize", bulletList <$> listenv "itemize" (many item)) + , ("description", definitionList <$> listenv "description" (many descItem)) + , ("enumerate", ordered_list) + , ("code", failUnlessLHS *> + (codeBlockWith ("",["sourceCode","literate","haskell"],[]) <$> verbEnv "code")) + , ("verbatim", codeBlock <$> (verbEnv "verbatim")) + , ("Verbatim", codeBlock <$> (verbEnv "Verbatim")) + , ("lstlisting", codeBlock <$> (verbEnv "listlisting")) + , ("displaymath", mathEnv Nothing "displaymath") + , ("equation", mathEnv Nothing "equation") + , ("equation*", mathEnv Nothing "equation*") + , ("gather", mathEnv (Just "gathered") "gather") + , ("gather*", mathEnv (Just "gathered") "gather*") + , ("multiline", mathEnv (Just "gathered") "multiline") + , ("multiline*", mathEnv (Just "gathered") "multiline*") + , ("eqnarray", mathEnv (Just "aligned*") "eqnarray") + , ("eqnarray*", mathEnv (Just "aligned*") "eqnarray*") + , ("align", mathEnv (Just "aligned*") "align") + , ("align*", mathEnv (Just "aligned*") "align*") + , ("alignat", mathEnv (Just "aligned*") "alignat") + , ("alignat*", mathEnv (Just "aligned*") "alignat*") + ] + +item :: LP Blocks +item = blocks *> controlSeq "item" *> optional opt *> blocks + +loose_item :: LP Blocks +loose_item = do + ctx <- stateParserContext `fmap` getState + if ctx == ListItemState + then mzero + else return mempty + +descItem :: LP (Inlines, [Blocks]) +descItem = do + blocks -- skip blocks before item + controlSeq "item" + optional sp + ils <- opt + bs <- blocks + return (ils, [bs]) + +env :: String -> LP a -> LP a +env name p = p <* (controlSeq "end" *> braced >>= guard . (== name)) + +listenv :: String -> LP a -> LP a +listenv name p = try $ do + oldCtx <- stateParserContext `fmap` getState + updateState $ \st -> st{ stateParserContext = ListItemState } + res <- env name p + updateState $ \st -> st{ stateParserContext = oldCtx } + return res + +mathEnv :: Maybe String -> String -> LP Blocks +mathEnv innerEnv name = para <$> mathDisplay (inner <$> verbEnv name) + where inner x = case innerEnv of + Nothing -> x + Just y -> "\\begin{" ++ y ++ "}\n" ++ x ++ + "\\end{" ++ y ++ "}" + +verbEnv :: String -> LP String +verbEnv name = do + optional opt + optional blankline + let endEnv = try $ controlSeq "end" *> braced >>= guard . (== name) + res <- manyTill anyChar endEnv + return $ stripTrailingNewlines res + +ordered_list :: LP Blocks +ordered_list = do + optional sp + (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) $ + try $ char '[' *> anyOrderedListMarker <* char ']' spaces - char '{' - title' <- manyTill inline (char '}') + optional $ try $ controlSeq "setlength" *> grouped (controlSeq "itemindent") *> braced spaces - return $ Header lev (normalizeSpaces title') + start <- option 1 $ try $ do controlSeq "setcounter" + grouped (string "enum" *> many1 (oneOf "iv")) + optional sp + num <- grouped (many1 digit) + spaces + return $ (read num + 1 :: Int) + bs <- listenv "enumerate" (many item) + return $ orderedListWith (start, style, delim) bs + +paragraph :: LP Blocks +paragraph = do + x <- mconcat <$> many1 inline + if x == mempty + then return mempty + else return $ para $ trimInlines x + +preamble :: LP Blocks +preamble = mempty <$> manyTill preambleBlock beginDoc + where beginDoc = lookAhead $ controlSeq "begin" *> string "{document}" + preambleBlock = (mempty <$ comment) + <|> (mempty <$ sp) + <|> (mempty <$ blanklines) + <|> (mempty <$ macro) + <|> blockCommand + <|> (mempty <$ anyControlSeq) + <|> (mempty <$ braced) + <|> (mempty <$ anyChar) + +------- + +-- citations + +addPrefix :: Inlines -> [Citation] -> [Citation] +addPrefix p (k:ks) = k {citationPrefix = toList p ++ citationPrefix k} : ks +addPrefix _ _ = [] --- --- hrule block --- +addSuffix :: Inlines -> [Citation] -> [Citation] +addSuffix s ks@(_:_) = + let k = last ks + in init ks ++ [k {citationSuffix = citationSuffix k ++ toList s}] +addSuffix _ _ = [] + +simpleCiteArgs :: LP [Citation] +simpleCiteArgs = try $ do + first <- optionMaybe opt + second <- optionMaybe opt + char '{' + keys <- manyTill citationLabel (char '}') + let (pre, suf) = case (first , second ) of + (Just s , Nothing) -> (mempty, s ) + (Just s , Just t ) -> (s , t ) + _ -> (mempty, mempty) + conv k = Citation { citationId = k + , citationPrefix = [] + , citationSuffix = [] + , citationMode = NormalCitation + , citationHash = 0 + , citationNoteNum = 0 + } + return $ addPrefix pre $ addSuffix suf $ map conv keys -hrule :: GenParser Char st Block -hrule = oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n", - "\\newpage" ] >> spaces >> return HorizontalRule +citationLabel :: LP String +citationLabel = trim <$> + (many1 (satisfy $ \c -> c /=',' && c /='}') <* optional (char ',') <* optional sp) --- tables +cites :: CitationMode -> Bool -> LP [Citation] +cites mode multi = try $ do + cits <- if multi + then many1 simpleCiteArgs + else count 1 simpleCiteArgs + let (c:cs) = concat cits + return $ case mode of + AuthorInText -> c {citationMode = mode} : cs + _ -> map (\a -> a {citationMode = mode}) (c:cs) -simpleTable :: GenParser Char ParserState Block -simpleTable = try $ do - string "\\begin" - spaces - string "{tabular}" - spaces - aligns <- parseAligns - let cols = length aligns - optional hline - header' <- option [] $ parseTableHeader cols - rows <- many (parseTableRow cols >>~ optional hline) - spaces - end "tabular" - spaces - let header'' = if null header' - then replicate cols [] - else header' - return $ Table [] aligns (replicate cols 0) header'' rows +citation :: CitationMode -> Bool -> LP Inlines +citation mode multi = (flip cite mempty) <$> cites mode multi + +complexNatbibCitation :: CitationMode -> LP Inlines +complexNatbibCitation mode = try $ do + let ils = (trimInlines . mconcat) <$> + many (notFollowedBy (oneOf "\\};") >> inline) + let parseOne = try $ do + skipSpaces + pref <- ils + cit' <- inline -- expect a citation + let citlist = toList cit' + cits' <- case citlist of + [Cite cs _] -> return cs + _ -> mzero + suff <- ils + skipSpaces + optional $ char ';' + return $ addPrefix pref $ addSuffix suff $ cits' + (c:cits) <- grouped parseOne + return $ cite (c{ citationMode = mode }:cits) mempty -hline :: GenParser Char st () -hline = try $ spaces >> string "\\hline" >> return () +-- tables -parseAligns :: GenParser Char ParserState [Alignment] +parseAligns :: LP [Alignment] parseAligns = try $ do char '{' optional $ char '|' let cAlign = char 'c' >> return AlignCenter let lAlign = char 'l' >> return AlignLeft let rAlign = char 'r' >> return AlignRight - let alignChar = cAlign <|> lAlign <|> rAlign + let alignChar = optional sp *> (cAlign <|> lAlign <|> rAlign) aligns' <- sepEndBy alignChar (optional $ char '|') + spaces char '}' spaces return aligns' -parseTableHeader :: Int -- ^ number of columns - -> GenParser Char ParserState [TableCell] -parseTableHeader cols = try $ do - cells' <- parseTableRow cols - hline - return cells' +hline :: LP () +hline = () <$ (try $ spaces >> controlSeq "hline") parseTableRow :: Int -- ^ number of columns - -> GenParser Char ParserState [TableCell] + -> LP [Blocks] parseTableRow cols = try $ do - let tableCellInline = notFollowedBy (char '&' <|> - (try $ char '\\' >> char '\\')) >> inline - cells' <- sepBy (spaces >> liftM ((:[]) . Plain . normalizeSpaces) - (many tableCellInline)) (char '&') + let amp = try $ spaces *> string "&" + let tableCellInline = notFollowedBy (amp <|> controlSeq "\\") >> inline + cells' <- sepBy (spaces *> ((plain . trimInlines . mconcat) <$> + many tableCellInline)) amp guard $ length cells' == cols spaces - (try $ string "\\\\" >> spaces) <|> - (lookAhead (end "tabular") >> return ()) + try $ controlSeq "\\" <|> lookAhead (try $ controlSeq "end" >> string "{tabular}") return cells' --- --- code blocks --- - -codeBlock :: GenParser Char ParserState Block -codeBlock = codeBlockWith "verbatim" <|> codeBlockWith "Verbatim" <|> codeBlockWith "lstlisting" <|> lhsCodeBlock --- Note: Verbatim is from fancyvrb. - -codeBlockWith :: String -> GenParser Char st Block -codeBlockWith env = try $ do - string "\\begin" - spaces -- don't use begin function because it - string $ "{" ++ env ++ "}" -- gobbles whitespace; we want to gobble - optional blanklines -- blank lines, but not leading space - contents <- manyTill anyChar (try (string $ "\\end{" ++ env ++ "}")) - spaces - let classes = if env == "code" then ["haskell"] else [] - return $ CodeBlock ("",classes,[]) (stripTrailingNewlines contents) - -lhsCodeBlock :: GenParser Char ParserState Block -lhsCodeBlock = do - failUnlessLHS - (CodeBlock (_,_,_) cont) <- codeBlockWith "code" - return $ CodeBlock ("", ["sourceCode","literate","haskell"], []) cont - --- --- block quotes --- - -blockQuote :: GenParser Char ParserState Block -blockQuote = (environment "quote" <|> environment "quotation") >>~ spaces >>= - return . BlockQuote - --- --- list blocks --- - -list :: GenParser Char ParserState Block -list = bulletList <|> orderedList <|> definitionList "list" - -listItem :: GenParser Char ParserState ([Inline], [Block]) -listItem = try $ do - ("item", _, args) <- command - spaces - state <- getState - let oldParserContext = stateParserContext state - updateState (\s -> s {stateParserContext = ListItemState}) - blocks <- many block - updateState (\s -> s {stateParserContext = oldParserContext}) - opt <- case args of - ([x]) | "[" `isPrefixOf` x && "]" `isSuffixOf` x -> - parseFromString (many inline) $ tail $ init x - _ -> return [] - return (opt, blocks) - -orderedList :: GenParser Char ParserState Block -orderedList = try $ do - string "\\begin" - spaces - string "{enumerate}" - spaces - (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) $ - try $ do failIfStrict - char '[' - res <- anyOrderedListMarker - char ']' - return res - spaces - option "" $ try $ do string "\\setlength{\\itemindent}" - char '{' - manyTill anyChar (char '}') - spaces - start <- option 1 $ try $ do failIfStrict - string "\\setcounter{enum" - many1 (oneOf "iv") - string "}{" - num <- many1 digit - char '}' - spaces - return $ (read num) + 1 - items <- many listItem - end "enumerate" - spaces - return $ OrderedList (start, style, delim) $ map snd items - -bulletList :: GenParser Char ParserState Block -bulletList = try $ do - begin "itemize" - items <- many listItem - end "itemize" - spaces - return (BulletList $ map snd items) - -definitionList :: GenParser Char ParserState Block -definitionList = try $ do - begin "description" - items <- many listItem - end "description" - spaces - return $ DefinitionList $ map (\(t,d) -> (t,[d])) items - --- --- paragraph block --- - -para :: GenParser Char ParserState Block -para = do - res <- many1 inline - spaces - return $ if null (filter (`notElem` [Str "", Space]) res) - then Null - else Para $ normalizeSpaces res - --- --- title authors date --- - -bibliographic :: GenParser Char ParserState Block -bibliographic = choice [ maketitle, title, subtitle, authors, date ] - -maketitle :: GenParser Char st Block -maketitle = try (string "\\maketitle") >> spaces >> return Null - -title :: GenParser Char ParserState Block -title = try $ do - string "\\title{" - tit <- manyTill inline (char '}') - spaces - updateState (\state -> state { stateTitle = tit }) - return Null - -subtitle :: GenParser Char ParserState Block -subtitle = try $ do - string "\\subtitle{" - tit <- manyTill inline (char '}') - spaces - updateState (\state -> state { stateTitle = stateTitle state ++ - Str ":" : LineBreak : tit }) - return Null - -authors :: GenParser Char ParserState Block -authors = try $ do - string "\\author{" - let andsep = try $ string "\\and" >> notFollowedBy letter >> - spaces >> return '&' - raw <- sepBy (many $ notFollowedBy (char '}' <|> andsep) >> inline) andsep - let authors' = map normalizeSpaces raw - char '}' - spaces - updateState (\s -> s { stateAuthors = authors' }) - return Null - -date :: GenParser Char ParserState Block -date = try $ do - string "\\date{" - date' <- manyTill inline (char '}') - spaces - updateState (\state -> state { stateDate = normalizeSpaces date' }) - return Null - --- --- item block --- for use in unknown environments that aren't being parsed as raw latex --- - --- this forces items to be parsed in different blocks -itemBlock :: GenParser Char ParserState Block -itemBlock = try $ do - ("item", _, args) <- command - state <- getState - if stateParserContext state == ListItemState - then fail "item should be handled by list block" - else if null args - then return Null - else return $ Plain [Str (stripFirstAndLast (head args))] - --- --- raw LaTeX --- - --- | Parse any LaTeX environment and return a Para block containing --- the whole literal environment as raw TeX. -rawLaTeXEnvironment :: GenParser Char st Block -rawLaTeXEnvironment = do - contents <- rawLaTeXEnvironment' - spaces - return $ RawBlock "latex" contents - --- | Parse any LaTeX environment and return a string containing --- the whole literal environment as raw TeX. -rawLaTeXEnvironment' :: GenParser Char st String -rawLaTeXEnvironment' = try $ do - string "\\begin" - spaces - char '{' - name <- many1 letter - star <- option "" (string "*") -- for starred variants - let name' = name ++ star - char '}' - args <- option [] commandArgs - let argStr = concat args - contents <- manyTill (choice [ (many1 (noneOf "\\")), - rawLaTeXEnvironment', - string "\\" ]) - (end name') - return $ "\\begin{" ++ name' ++ "}" ++ argStr ++ - concat contents ++ "\\end{" ++ name' ++ "}" - -unknownEnvironment :: GenParser Char ParserState Block -unknownEnvironment = try $ do - state <- getState - result <- if stateParseRaw state -- check whether we should include raw TeX - then rawLaTeXEnvironment -- if so, get whole raw environment - else anyEnvironment -- otherwise just the contents - return result - --- \ignore{} is used conventionally in literate haskell for definitions --- that are to be processed by the compiler but not printed. -ignore :: GenParser Char ParserState Block -ignore = try $ do - ("ignore", _, _) <- command - spaces - return Null - -demacro :: (String, String, [String]) -> GenParser Char ParserState Inline -demacro (n,st,args) = try $ do - let raw = "\\" ++ n ++ st ++ concat args - s' <- applyMacros' raw - if raw == s' - then return $ RawInline "latex" raw - else do - inp <- getInput - setInput $ s' ++ inp - return $ Str "" - -unknownCommand :: GenParser Char ParserState Block -unknownCommand = try $ do - spaces - notFollowedBy' $ oneOfStrings ["\\begin","\\end","\\item"] >> - notFollowedBy letter - state <- getState - when (stateParserContext state == ListItemState) $ - notFollowedBy' (string "\\item") - if stateParseRaw state - then command >>= demacro >>= return . Plain . (:[]) - else do - (name, _, args) <- command - spaces - unless (name `elem` commandsToIgnore) $ do - -- put arguments back in input to be parsed - inp <- getInput - setInput $ intercalate " " args ++ inp - return Null - -commandsToIgnore :: [String] -commandsToIgnore = ["special","pdfannot","pdfstringdef", "index","bibliography"] - -skipChar :: GenParser Char ParserState Block -skipChar = do - satisfy (/='\\') <|> - (notFollowedBy' (try $ - string "\\begin" >> spaces >> string "{document}") >> - anyChar) - spaces - return Null - -commentBlock :: GenParser Char st Block -commentBlock = many1 (comment >> spaces) >> return Null - --- --- inline --- - -inline :: GenParser Char ParserState Inline -inline = choice [ str - , endline - , whitespace - , quoted - , apostrophe - , strong - , math - , ellipses - , emDash - , enDash - , hyphen - , emph - , strikeout - , superscript - , subscript - , code - , url - , link - , image - , footnote - , linebreak - , accentedChar - , nonbreakingSpace - , cite - , specialChar - , ensureMath - , rawLaTeXInline' - , escapedChar - , emptyGroup - , unescapedChar - , comment - ] "inline" - - --- latex comment -comment :: GenParser Char st Inline -comment = try $ char '%' >> manyTill anyChar newline >> spaces >> return (Str "") - -accentedChar :: GenParser Char st Inline -accentedChar = normalAccentedChar <|> specialAccentedChar - -normalAccentedChar :: GenParser Char st Inline -normalAccentedChar = try $ do - char '\\' - accent <- oneOf "'`^\"~" - character <- (try $ char '{' >> letter >>~ char '}') <|> letter - let table = fromMaybe [] $ lookup character accentTable - let result = case lookup accent table of - Just num -> chr num - Nothing -> '?' - return $ Str [result] - --- an association list of letters and association list of accents --- and decimal character numbers. -accentTable :: [(Char, [(Char, Int)])] -accentTable = - [ ('A', [('`', 192), ('\'', 193), ('^', 194), ('~', 195), ('"', 196)]), - ('E', [('`', 200), ('\'', 201), ('^', 202), ('"', 203)]), - ('I', [('`', 204), ('\'', 205), ('^', 206), ('"', 207)]), - ('N', [('~', 209)]), - ('O', [('`', 210), ('\'', 211), ('^', 212), ('~', 213), ('"', 214)]), - ('U', [('`', 217), ('\'', 218), ('^', 219), ('"', 220)]), - ('a', [('`', 224), ('\'', 225), ('^', 227), ('"', 228)]), - ('e', [('`', 232), ('\'', 233), ('^', 234), ('"', 235)]), - ('i', [('`', 236), ('\'', 237), ('^', 238), ('"', 239)]), - ('n', [('~', 241)]), - ('o', [('`', 242), ('\'', 243), ('^', 244), ('~', 245), ('"', 246)]), - ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ] - -specialAccentedChar :: GenParser Char st Inline -specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig, lslash, - oslash, pound, euro, copyright, sect ] - -ccedil :: GenParser Char st Inline -ccedil = try $ do - char '\\' - letter' <- oneOfStrings ["cc", "cC"] - notFollowedBy letter - let num = if letter' == "cc" then 231 else 199 - return $ Str [chr num] - -aring :: GenParser Char st Inline -aring = try $ do - char '\\' - letter' <- oneOfStrings ["aa", "AA"] - notFollowedBy letter - let num = if letter' == "aa" then 229 else 197 - return $ Str [chr num] - -iuml :: GenParser Char st Inline -iuml = try (string "\\\"") >> oneOfStrings ["\\i", "{\\i}"] >> - return (Str [chr 239]) - -szlig :: GenParser Char st Inline -szlig = try (string "\\ss") >> notFollowedBy letter >> return (Str [chr 223]) - -oslash :: GenParser Char st Inline -oslash = try $ do - char '\\' - letter' <- choice [char 'o', char 'O'] - notFollowedBy letter - let num = if letter' == 'o' then 248 else 216 - return $ Str [chr num] - -lslash :: GenParser Char st Inline -lslash = try $ do - cmd <- oneOfStrings ["{\\L}","{\\l}"] - <|> (oneOfStrings ["\\L ","\\l "] >>~ notFollowedBy letter) - return $ if 'l' `elem` cmd - then Str "\x142" - else Str "\x141" - -aelig :: GenParser Char st Inline -aelig = try $ do - char '\\' - letter' <- oneOfStrings ["ae", "AE"] - notFollowedBy letter - let num = if letter' == "ae" then 230 else 198 - return $ Str [chr num] - -pound :: GenParser Char st Inline -pound = try (string "\\pounds" >> notFollowedBy letter) >> return (Str [chr 163]) - -euro :: GenParser Char st Inline -euro = try (string "\\euro" >> notFollowedBy letter) >> return (Str [chr 8364]) - -copyright :: GenParser Char st Inline -copyright = try (string "\\copyright" >> notFollowedBy letter) >> return (Str [chr 169]) - -sect :: GenParser Char st Inline -sect = try (string "\\S" >> notFollowedBy letter) >> return (Str [chr 167]) - -escapedChar :: GenParser Char st Inline -escapedChar = do - result <- escaped (oneOf specialChars) - return $ if result == '\n' then Str " " else Str [result] +parseTableHeader :: Int -- ^ number of columns + -> LP [Blocks] +parseTableHeader cols = try $ parseTableRow cols <* hline -emptyGroup :: GenParser Char st Inline -emptyGroup = try $ do - char '{' +simpTable :: LP Blocks +simpTable = try $ do spaces - char '}' - return $ Str "" - --- nonescaped special characters -unescapedChar :: GenParser Char st Inline -unescapedChar = oneOf "`$^&_#{}[]|<>" >>= return . (\c -> Str [c]) - -specialChar :: GenParser Char st Inline -specialChar = choice [ spacer, interwordSpace, sentenceEnd, - backslash, tilde, caret, - bar, lt, gt, doubleQuote ] - -spacer :: GenParser Char st Inline -spacer = try (string "\\,") >> return (Str "") - -sentenceEnd :: GenParser Char st Inline -sentenceEnd = try (string "\\@") >> return (Str "") - -interwordSpace :: GenParser Char st Inline -interwordSpace = try (string "\\ ") >> return (Str "\160") - -backslash :: GenParser Char st Inline -backslash = try (string "\\textbackslash") >> optional (try $ string "{}") >> return (Str "\\") - -tilde :: GenParser Char st Inline -tilde = try (string "\\ensuremath{\\sim}") >> return (Str "~") - -caret :: GenParser Char st Inline -caret = try (string "\\^{}") >> return (Str "^") - -bar :: GenParser Char st Inline -bar = try (string "\\textbar") >> optional (try $ string "{}") >> return (Str "\\") - -lt :: GenParser Char st Inline -lt = try (string "\\textless") >> optional (try $ string "{}") >> return (Str "<") - -gt :: GenParser Char st Inline -gt = try (string "\\textgreater") >> optional (try $ string "{}") >> return (Str ">") - -doubleQuote :: GenParser Char st Inline -doubleQuote = char '"' >> return (Str "\"") - -code :: GenParser Char ParserState Inline -code = code1 <|> code2 <|> code3 <|> lhsInlineCode - -code1 :: GenParser Char st Inline -code1 = try $ do - string "\\verb" - marker <- anyChar - result <- manyTill anyChar (char marker) - return $ Code nullAttr $ removeLeadingTrailingSpace result - -code2 :: GenParser Char st Inline -code2 = try $ do - string "\\texttt{" - result <- manyTill (noneOf "\\\n~$%^&{}") (char '}') - return $ Code nullAttr result - -code3 :: GenParser Char st Inline -code3 = try $ do - string "\\lstinline" - marker <- anyChar - result <- manyTill anyChar (char marker) - return $ Code nullAttr $ removeLeadingTrailingSpace result - -lhsInlineCode :: GenParser Char ParserState Inline -lhsInlineCode = try $ do - failUnlessLHS - char '|' - result <- manyTill (noneOf "|\n") (char '|') - return $ Code ("",["haskell"],[]) result - -emph :: GenParser Char ParserState Inline -emph = try $ oneOfStrings [ "\\emph{", "\\textit{" ] >> - manyTill inline (char '}') >>= return . Emph - -strikeout :: GenParser Char ParserState Inline -strikeout = try $ string "\\sout{" >> manyTill inline (char '}') >>= - return . Strikeout - -superscript :: GenParser Char ParserState Inline -superscript = try $ string "\\textsuperscript{" >> - manyTill inline (char '}') >>= return . Superscript - --- note: \textsubscript isn't a standard latex command, but we use --- a defined version in pandoc. -subscript :: GenParser Char ParserState Inline -subscript = try $ string "\\textsubscript{" >> manyTill inline (char '}') >>= - return . Subscript - -apostrophe :: GenParser Char ParserState Inline -apostrophe = char '\'' >> return (Str "\x2019") - -quoted :: GenParser Char ParserState Inline -quoted = doubleQuoted <|> singleQuoted - -singleQuoted :: GenParser Char ParserState Inline -singleQuoted = enclosed singleQuoteStart singleQuoteEnd inline >>= - return . Quoted SingleQuote . normalizeSpaces - -doubleQuoted :: GenParser Char ParserState Inline -doubleQuoted = enclosed doubleQuoteStart doubleQuoteEnd inline >>= - return . Quoted DoubleQuote . normalizeSpaces - -singleQuoteStart :: GenParser Char st Char -singleQuoteStart = char '`' - -singleQuoteEnd :: GenParser Char st () -singleQuoteEnd = try $ char '\'' >> notFollowedBy alphaNum - -doubleQuoteStart :: CharParser st String -doubleQuoteStart = string "``" - -doubleQuoteEnd :: CharParser st String -doubleQuoteEnd = try $ string "''" - -ellipses :: GenParser Char st Inline -ellipses = try $ do - char '\\' - optional $ char 'l' - string "dots" - optional $ try $ string "{}" - return (Str "…") - -enDash :: GenParser Char st Inline -enDash = try (string "--") >> return (Str "-") - -emDash :: GenParser Char st Inline -emDash = try (string "---") >> return (Str "—") - -hyphen :: GenParser Char st Inline -hyphen = char '-' >> return (Str "-") - -strong :: GenParser Char ParserState Inline -strong = try (string "\\textbf{") >> manyTill inline (char '}') >>= - return . Strong - -whitespace :: GenParser Char st Inline -whitespace = many1 (oneOf " \t") >> return Space - -nonbreakingSpace :: GenParser Char st Inline -nonbreakingSpace = char '~' >> return (Str "\160") - --- hard line break -linebreak :: GenParser Char st Inline -linebreak = try $ do - string "\\\\" - optional $ bracketedText '[' ']' -- e.g. \\[10pt] + aligns <- parseAligns + let cols = length aligns + optional hline + header' <- option [] $ parseTableHeader cols + rows <- many (parseTableRow cols <* optional hline) spaces - return LineBreak - -str :: GenParser Char st Inline -str = many1 (noneOf specialChars) >>= return . Str - --- endline internal to paragraph -endline :: GenParser Char st Inline -endline = try $ newline >> notFollowedBy blankline >> return Space - --- math -math :: GenParser Char ParserState Inline -math = (math3 >>= applyMacros' >>= return . Math DisplayMath) - <|> (math1 >>= applyMacros' >>= return . Math InlineMath) - <|> (math2 >>= applyMacros' >>= return . Math InlineMath) - <|> (math4 >>= applyMacros' >>= return . Math DisplayMath) - <|> (math5 >>= applyMacros' >>= return . Math DisplayMath) - <|> (math6 >>= applyMacros' >>= return . Math DisplayMath) - "math" - -math1 :: GenParser Char st String -math1 = try $ char '$' >> manyTill anyChar (char '$') - -math2 :: GenParser Char st String -math2 = try $ string "\\(" >> manyTill anyChar (try $ string "\\)") - -math3 :: GenParser Char st String -math3 = try $ char '$' >> math1 >>~ char '$' - -math4 :: GenParser Char st String -math4 = try $ do - name <- begin "displaymath" <|> begin "equation" <|> begin "equation*" <|> - begin "gather" <|> begin "gather*" <|> begin "gathered" <|> - begin "multline" <|> begin "multline*" - manyTill anyChar (end name) - -math5 :: GenParser Char st String -math5 = try $ (string "\\[") >> spaces >> manyTill anyChar (try $ string "\\]") - -math6 :: GenParser Char st String -math6 = try $ do - name <- begin "eqnarray" <|> begin "eqnarray*" <|> begin "align" <|> - begin "align*" <|> begin "alignat" <|> begin "alignat*" <|> - begin "split" <|> begin "aligned" <|> begin "alignedat" - res <- manyTill anyChar (end name) - return $ filter (/= '&') res -- remove alignment codes - -ensureMath :: GenParser Char st Inline -ensureMath = try $ do - (n, _, args) <- command - guard $ n == "ensuremath" && not (null args) - return $ Math InlineMath $ tail $ init $ head args - --- --- links and images --- - -url :: GenParser Char ParserState Inline -url = try $ do - string "\\url" - url' <- charsInBalanced '{' '}' anyChar - return $ Link [Code ("",["url"],[]) url'] (escapeURI url', "") - -link :: GenParser Char ParserState Inline -link = try $ do - string "\\href{" - url' <- manyTill anyChar (char '}') - char '{' - label' <- manyTill inline (char '}') - return $ Link (normalizeSpaces label') (escapeURI url', "") - -image :: GenParser Char ParserState Inline -image = try $ do - ("includegraphics", _, args) <- command - let args' = filter isArg args -- filter out options - let (src,tit) = case args' of - [] -> ("", "") - (x:_) -> (stripFirstAndLast x, "") - return $ Image [Str "image"] (escapeURI src, tit) - -footnote :: GenParser Char ParserState Inline -footnote = try $ do - (name, _, (contents:[])) <- command - if ((name == "footnote") || (name == "thanks")) - then string "" - else fail "not a footnote or thanks command" - let contents' = stripFirstAndLast contents - -- parse the extracted block, which may contain various block elements: - rest <- getInput - setInput $ contents' - blocks <- parseBlocks - setInput rest - return $ Note blocks - --- | citations -cite :: GenParser Char ParserState Inline -cite = simpleCite <|> complexNatbibCites - -simpleCiteArgs :: GenParser Char ParserState [Citation] -simpleCiteArgs = try $ do - first <- optionMaybe $ (char '[') >> manyTill inline (char ']') - second <- optionMaybe $ (char '[') >> manyTill inline (char ']') - char '{' - keys <- many1Till citationLabel (char '}') - let (pre, suf) = case (first , second ) of - (Just s , Nothing) -> ([], s ) - (Just s , Just t ) -> (s , t ) - _ -> ([], []) - conv k = Citation { citationId = k - , citationPrefix = [] - , citationSuffix = [] - , citationMode = NormalCitation - , citationHash = 0 - , citationNoteNum = 0 - } - return $ addPrefix pre $ addSuffix suf $ map conv keys - - -simpleCite :: GenParser Char ParserState Inline -simpleCite = try $ do - char '\\' - let biblatex = [a ++ "cite" | a <- ["auto", "foot", "paren", "super", ""]] - ++ ["footcitetext"] - normal = ["cite" ++ a ++ b | a <- ["al", ""], b <- ["p", "p*", ""]] - ++ biblatex - supress = ["citeyearpar", "citeyear", "autocite*", "cite*", "parencite*"] - intext = ["textcite"] ++ ["cite" ++ a ++ b | a <- ["al", ""], b <- ["t", "t*"]] - mintext = ["textcites"] - mnormal = map (++ "s") biblatex - cmdend = notFollowedBy (letter <|> char '*') - capit [] = [] - capit (x:xs) = toUpper x : xs - addUpper xs = xs ++ map capit xs - toparser l t = try $ oneOfStrings (addUpper l) >> cmdend >> return t - (mode, multi) <- toparser normal (NormalCitation, False) - <|> toparser supress (SuppressAuthor, False) - <|> toparser intext (AuthorInText , False) - <|> toparser mnormal (NormalCitation, True ) - <|> toparser mintext (AuthorInText , True ) - cits <- if multi then - many1 simpleCiteArgs - else - simpleCiteArgs >>= \c -> return [c] - let (c:cs) = concat cits - cits' = case mode of - AuthorInText -> c {citationMode = mode} : cs - _ -> map (\a -> a {citationMode = mode}) (c:cs) - return $ Cite cits' [] - -complexNatbibCites :: GenParser Char ParserState Inline -complexNatbibCites = complexNatbibTextual <|> complexNatbibParenthetical - -complexNatbibTextual :: GenParser Char ParserState Inline -complexNatbibTextual = try $ do - string "\\citeauthor{" - manyTill (noneOf "}") (char '}') - skipSpaces - Cite (c:cs) _ <- complexNatbibParenthetical - return $ Cite (c {citationMode = AuthorInText} : cs) [] - - -complexNatbibParenthetical :: GenParser Char ParserState Inline -complexNatbibParenthetical = try $ do - string "\\citetext{" - cits <- many1Till parseOne (char '}') - return $ Cite (concat cits) [] - where - parseOne = do - skipSpaces - pref <- many (notFollowedBy (oneOf "\\}") >> inline) - (Cite cites _) <- simpleCite - suff <- many (notFollowedBy (oneOf "\\};") >> inline) - skipSpaces - optional $ char ';' - return $ addPrefix pref $ addSuffix suff $ cites - -addPrefix :: [Inline] -> [Citation] -> [Citation] -addPrefix p (k:ks) = k {citationPrefix = p ++ citationPrefix k} : ks -addPrefix _ _ = [] - -addSuffix :: [Inline] -> [Citation] -> [Citation] -addSuffix s ks@(_:_) = let k = last ks - in init ks ++ [k {citationSuffix = citationSuffix k ++ s}] -addSuffix _ _ = [] - -citationLabel :: GenParser Char ParserState String -citationLabel = do - res <- many1 $ noneOf ",}" - optional $ char ',' - return $ removeLeadingTrailingSpace res - --- | Parse any LaTeX inline command and return it in a raw TeX inline element. -rawLaTeXInline' :: GenParser Char ParserState Inline -rawLaTeXInline' = do - notFollowedBy' $ oneOfStrings ["\\begin", "\\end", "\\item", "\\ignore", - "\\section"] - rawLaTeXInline + let header'' = if null header' + then replicate cols mempty + else header' + return $ table mempty (zip aligns (repeat 0)) header'' rows --- | Parse any LaTeX command and return it in a raw TeX inline element. -rawLaTeXInline :: GenParser Char ParserState Inline -rawLaTeXInline = try $ do - state <- getState - if stateParseRaw state - then command >>= demacro - else do - (name,st,args) <- command - x <- demacro (name,st,args) - unless (x == Str "" || name `elem` commandsToIgnore) $ do - inp <- getInput - setInput $ intercalate " " args ++ inp - return $ Str "" diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index c78727715..8da0f7c16 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -39,7 +39,7 @@ import Text.Pandoc.Definition import Text.Pandoc.Generic import Text.Pandoc.Shared import Text.Pandoc.Parsing -import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' ) +import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock ) import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, isInlineTag, isBlockTag, isTextTag, isCommentTag ) import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) @@ -729,8 +729,8 @@ rawVerbatimBlock = try $ do rawTeXBlock :: GenParser Char ParserState Block rawTeXBlock = do failIfStrict - result <- liftM (RawBlock "latex") rawLaTeXEnvironment' - <|> liftM (RawBlock "context") rawConTeXtEnvironment' + result <- liftM (RawBlock "latex") rawLaTeXBlock + <|> liftM (RawBlock "context") rawConTeXtEnvironment spaces return result @@ -933,8 +933,8 @@ inlineParsers = [ whitespace , inlineNote -- after superscript because of ^[link](/foo)^ , autoLink , rawHtmlInline - , rawLaTeXInline' , escapedChar + , rawLaTeXInline' , exampleRef , smartPunctuation inline , charRef @@ -977,8 +977,7 @@ symbol :: GenParser Char ParserState Inline symbol = do result <- noneOf "<\\\n\t " <|> try (do lookAhead $ char '\\' - notFollowedBy' $ rawLaTeXEnvironment' - <|> rawConTeXtEnvironment' + notFollowedBy' rawTeXBlock char '\\') return $ Str [result] @@ -1246,18 +1245,16 @@ inlineNote = try $ do rawLaTeXInline' :: GenParser Char ParserState Inline rawLaTeXInline' = try $ do failIfStrict - lookAhead $ char '\\' - notFollowedBy' $ rawLaTeXEnvironment' - <|> rawConTeXtEnvironment' + lookAhead $ char '\\' >> notFollowedBy' (string "start") -- context env RawInline _ s <- rawLaTeXInline return $ RawInline "tex" s -- "tex" because it might be context or latex -rawConTeXtEnvironment' :: GenParser Char st String -rawConTeXtEnvironment' = try $ do +rawConTeXtEnvironment :: GenParser Char st String +rawConTeXtEnvironment = try $ do string "\\start" completion <- inBrackets (letter <|> digit <|> spaceChar) <|> (many1 letter) - contents <- manyTill (rawConTeXtEnvironment' <|> (count 1 anyChar)) + contents <- manyTill (rawConTeXtEnvironment <|> (count 1 anyChar)) (try $ string "\\stop" >> string completion) return $ "\\start" ++ completion ++ concat contents ++ "\\stop" ++ completion -- cgit v1.2.3