aboutsummaryrefslogtreecommitdiff
path: root/Text/Pandoc/Readers/LaTeX.hs
diff options
context:
space:
mode:
Diffstat (limited to 'Text/Pandoc/Readers/LaTeX.hs')
-rw-r--r--Text/Pandoc/Readers/LaTeX.hs651
1 files changed, 651 insertions, 0 deletions
diff --git a/Text/Pandoc/Readers/LaTeX.hs b/Text/Pandoc/Readers/LaTeX.hs
new file mode 100644
index 000000000..37cc2bfe4
--- /dev/null
+++ b/Text/Pandoc/Readers/LaTeX.hs
@@ -0,0 +1,651 @@
+{-
+Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.LaTeX
+ Copyright : Copyright (C) 2006-7 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Conversion of LaTeX to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.LaTeX (
+ readLaTeX,
+ rawLaTeXInline,
+ rawLaTeXEnvironment
+ ) where
+
+import Text.ParserCombinators.Parsec
+import Text.Pandoc.Definition
+import Text.Pandoc.Shared
+import Data.Maybe ( fromMaybe )
+import Data.Char ( chr )
+import Data.List ( isPrefixOf, isSuffixOf )
+
+-- | Parse LaTeX from string and return 'Pandoc' document.
+readLaTeX :: ParserState -- ^ Parser state, including options for parser
+ -> String -- ^ String to parse
+ -> Pandoc
+readLaTeX = readWith parseLaTeX
+
+-- characters with special meaning
+specialChars = "\\`$%^&_~#{}\n \t|<>'\"-"
+
+--
+-- utility functions
+--
+
+-- | Returns text between brackets and its matching pair.
+bracketedText openB closeB = do
+ result <- charsInBalanced' openB closeB
+ return $ [openB] ++ result ++ [closeB]
+
+-- | Returns an option or argument of a LaTeX command.
+optOrArg = bracketedText '{' '}' <|> bracketedText '[' ']'
+
+-- | True if the string begins with '{'.
+isArg ('{':rest) = True
+isArg other = False
+
+-- | Returns list of options and arguments of a LaTeX command.
+commandArgs = many optOrArg
+
+-- | Parses LaTeX command, returns (name, star, list of options or arguments).
+command = do
+ char '\\'
+ name <- many1 letter
+ star <- option "" (string "*") -- some commands have starred versions
+ args <- commandArgs
+ return (name, star, args)
+
+begin name = try $ do
+ string $ "\\begin{" ++ name ++ "}"
+ optional commandArgs
+ spaces
+ return name
+
+end name = try $ do
+ string $ "\\end{" ++ name ++ "}"
+ spaces
+ return name
+
+-- | Returns a list of block elements containing the contents of an
+-- environment.
+environment name = try $ begin name >> spaces >> manyTill block (end name)
+
+anyEnvironment = try $ do
+ string "\\begin{"
+ name <- many letter
+ star <- option "" (string "*") -- some environments have starred variants
+ char '}'
+ optional commandArgs
+ spaces
+ contents <- manyTill block (end (name ++ star))
+ return $ BlockQuote contents
+
+--
+-- parsing documents
+--
+
+-- | Process LaTeX preamble, extracting metadata.
+processLaTeXPreamble = try $ manyTill
+ (choice [bibliographic, comment, unknownCommand, nullBlock])
+ (try (string "\\begin{document}")) >>
+ spaces
+
+-- | Parse LaTeX and return 'Pandoc'.
+parseLaTeX = do
+ optional processLaTeXPreamble -- preamble might not be present (fragment)
+ spaces
+ blocks <- parseBlocks
+ spaces
+ optional $ try (string "\\end{document}" >> many anyChar)
+ -- might not be present (fragment)
+ spaces
+ eof
+ state <- getState
+ let blocks' = filter (/= Null) blocks
+ let title' = stateTitle state
+ let authors' = stateAuthors state
+ let date' = stateDate state
+ return $ Pandoc (Meta title' authors' date') blocks'
+
+--
+-- parsing blocks
+--
+
+parseBlocks = spaces >> many block
+
+block = choice [ hrule
+ , codeBlock
+ , header
+ , list
+ , blockQuote
+ , mathBlock
+ , comment
+ , bibliographic
+ , para
+ , specialEnvironment
+ , itemBlock
+ , unknownEnvironment
+ , unknownCommand ] <?> "block"
+
+--
+-- header blocks
+--
+
+header = try $ do
+ char '\\'
+ subs <- many (try (string "sub"))
+ string "section"
+ optional (char '*')
+ char '{'
+ title <- manyTill inline (char '}')
+ spaces
+ return $ Header (length subs + 1) (normalizeSpaces title)
+
+--
+-- hrule block
+--
+
+hrule = oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n",
+ "\\newpage" ] >> spaces >> return HorizontalRule
+
+--
+-- code blocks
+--
+
+codeBlock = codeBlock1 <|> codeBlock2
+
+codeBlock1 = try $ do
+ string "\\begin{verbatim}" -- don't use begin function because it
+ -- gobbles whitespace
+ optional blanklines -- we want to gobble blank lines, but not
+ -- leading space
+ contents <- manyTill anyChar (try (string "\\end{verbatim}"))
+ spaces
+ return $ CodeBlock (stripTrailingNewlines contents)
+
+codeBlock2 = try $ do
+ string "\\begin{Verbatim}" -- used by fancyvrb package
+ option "" blanklines
+ contents <- manyTill anyChar (try (string "\\end{Verbatim}"))
+ spaces
+ return $ CodeBlock (stripTrailingNewlines contents)
+
+--
+-- block quotes
+--
+
+blockQuote = (environment "quote" <|> environment "quotation") >>~ spaces >>=
+ return . BlockQuote
+
+--
+-- math block
+--
+
+mathBlock = mathBlockWith (begin "equation") (end "equation") <|>
+ mathBlockWith (begin "displaymath") (end "displaymath") <|>
+ mathBlockWith (string "\\[") (string "\\]") <?> "math block"
+
+mathBlockWith start end = try $ do
+ start
+ spaces
+ result <- manyTill anyChar end
+ spaces
+ return $ BlockQuote [Para [TeX ("$" ++ result ++ "$")]]
+
+--
+-- list blocks
+--
+
+list = bulletList <|> orderedList <|> definitionList <?> "list"
+
+listItem = try $ do
+ ("item", _, args) <- command
+ spaces
+ state <- getState
+ let oldParserContext = stateParserContext state
+ updateState (\state -> state {stateParserContext = ListItemState})
+ blocks <- many block
+ updateState (\state -> state {stateParserContext = oldParserContext})
+ opt <- case args of
+ ([x]) | "[" `isPrefixOf` x && "]" `isSuffixOf` x ->
+ parseFromString (many inline) $ tail $ init x
+ _ -> return []
+ return (opt, blocks)
+
+orderedList = try $ do
+ string "\\begin{enumerate}"
+ (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) $
+ try $ do failIfStrict
+ char '['
+ res <- anyOrderedListMarker
+ char ']'
+ return res
+ spaces
+ option "" $ try $ do string "\\setlength{\\itemindent}"
+ char '{'
+ manyTill anyChar (char '}')
+ spaces
+ start <- option 1 $ try $ do failIfStrict
+ string "\\setcounter{enum"
+ many1 (oneOf "iv")
+ string "}{"
+ num <- many1 digit
+ char '}'
+ spaces
+ return $ (read num) + 1
+ items <- many listItem
+ end "enumerate"
+ spaces
+ return $ OrderedList (start, style, delim) $ map snd items
+
+bulletList = try $ do
+ begin "itemize"
+ spaces
+ items <- many listItem
+ end "itemize"
+ spaces
+ return (BulletList $ map snd items)
+
+definitionList = try $ do
+ begin "description"
+ spaces
+ items <- many listItem
+ end "description"
+ spaces
+ return (DefinitionList items)
+
+--
+-- paragraph block
+--
+
+para = many1 inline >>~ spaces >>= return . Para . normalizeSpaces
+
+--
+-- title authors date
+--
+
+bibliographic = choice [ maketitle, title, authors, date ]
+
+maketitle = try (string "\\maketitle") >> spaces >> return Null
+
+title = try $ do
+ string "\\title{"
+ tit <- manyTill inline (char '}')
+ spaces
+ updateState (\state -> state { stateTitle = tit })
+ return Null
+
+authors = try $ do
+ string "\\author{"
+ authors <- manyTill anyChar (char '}')
+ spaces
+ let authors' = map removeLeadingTrailingSpace $ lines $
+ substitute "\\\\" "\n" authors
+ updateState (\state -> state { stateAuthors = authors' })
+ return Null
+
+date = try $ do
+ string "\\date{"
+ date' <- manyTill anyChar (char '}')
+ spaces
+ updateState (\state -> state { stateDate = date' })
+ return Null
+
+--
+-- item block
+-- for use in unknown environments that aren't being parsed as raw latex
+--
+
+-- this forces items to be parsed in different blocks
+itemBlock = try $ do
+ ("item", _, args) <- command
+ state <- getState
+ if (stateParserContext state == ListItemState)
+ then fail "item should be handled by list block"
+ else if null args
+ then return Null
+ else return $ Plain [Str (stripFirstAndLast (head args))]
+
+--
+-- raw LaTeX
+--
+
+specialEnvironment = do -- these are always parsed as raw
+ lookAhead (choice (map (\name -> begin name) ["tabular", "figure",
+ "tabbing", "eqnarry", "picture", "table", "verse", "theorem"]))
+ rawLaTeXEnvironment
+
+-- | Parse any LaTeX environment and return a Para block containing
+-- the whole literal environment as raw TeX.
+rawLaTeXEnvironment :: GenParser Char st Block
+rawLaTeXEnvironment = try $ do
+ string "\\begin{"
+ name <- many1 letter
+ star <- option "" (string "*") -- for starred variants
+ let name' = name ++ star
+ char '}'
+ args <- option [] commandArgs
+ let argStr = concat args
+ contents <- manyTill (choice [ (many1 (noneOf "\\")),
+ (do
+ (Para [TeX str]) <- rawLaTeXEnvironment
+ return str),
+ string "\\" ])
+ (end name')
+ spaces
+ return $ Para [TeX $ "\\begin{" ++ name' ++ "}" ++ argStr ++
+ concat contents ++ "\\end{" ++ name' ++ "}"]
+
+unknownEnvironment = try $ do
+ state <- getState
+ result <- if stateParseRaw state -- check whether we should include raw TeX
+ then rawLaTeXEnvironment -- if so, get whole raw environment
+ else anyEnvironment -- otherwise just the contents
+ return result
+
+unknownCommand = try $ do
+ notFollowedBy' $ choice $ map end ["itemize", "enumerate", "description",
+ "document"]
+ (name, star, args) <- command
+ spaces
+ let argStr = concat args
+ state <- getState
+ if name == "item" && (stateParserContext state) == ListItemState
+ then fail "should not be parsed as raw"
+ else string ""
+ if stateParseRaw state
+ then return $ Plain [TeX ("\\" ++ name ++ star ++ argStr)]
+ else return $ Plain [Str (joinWithSep " " args)]
+
+-- latex comment
+comment = try $ char '%' >> manyTill anyChar newline >> spaces >> return Null
+
+--
+-- inline
+--
+
+inline = choice [ str
+ , endline
+ , whitespace
+ , quoted
+ , apostrophe
+ , spacer
+ , strong
+ , math
+ , ellipses
+ , emDash
+ , enDash
+ , hyphen
+ , emph
+ , strikeout
+ , superscript
+ , subscript
+ , ref
+ , lab
+ , code
+ , url
+ , link
+ , image
+ , footnote
+ , linebreak
+ , accentedChar
+ , specialChar
+ , rawLaTeXInline
+ , escapedChar
+ , unescapedChar
+ ] <?> "inline"
+
+accentedChar = normalAccentedChar <|> specialAccentedChar
+
+normalAccentedChar = try $ do
+ char '\\'
+ accent <- oneOf "'`^\"~"
+ character <- (try $ char '{' >> letter >>~ char '}') <|> letter
+ let table = fromMaybe [] $ lookup character accentTable
+ let result = case lookup accent table of
+ Just num -> chr num
+ Nothing -> '?'
+ return $ Str [result]
+
+-- an association list of letters and association list of accents
+-- and decimal character numbers.
+accentTable =
+ [ ('A', [('`', 192), ('\'', 193), ('^', 194), ('~', 195), ('"', 196)]),
+ ('E', [('`', 200), ('\'', 201), ('^', 202), ('"', 203)]),
+ ('I', [('`', 204), ('\'', 205), ('^', 206), ('"', 207)]),
+ ('N', [('~', 209)]),
+ ('O', [('`', 210), ('\'', 211), ('^', 212), ('~', 213), ('"', 214)]),
+ ('U', [('`', 217), ('\'', 218), ('^', 219), ('"', 220)]),
+ ('a', [('`', 224), ('\'', 225), ('^', 227), ('"', 228)]),
+ ('e', [('`', 232), ('\'', 233), ('^', 234), ('"', 235)]),
+ ('i', [('`', 236), ('\'', 237), ('^', 238), ('"', 239)]),
+ ('n', [('~', 241)]),
+ ('o', [('`', 242), ('\'', 243), ('^', 244), ('~', 245), ('"', 246)]),
+ ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ]
+
+specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig,
+ oslash, pound, euro, copyright, sect ]
+
+ccedil = try $ do
+ char '\\'
+ letter <- oneOfStrings ["cc", "cC"]
+ let num = if letter == "cc" then 231 else 199
+ return $ Str [chr num]
+
+aring = try $ do
+ char '\\'
+ letter <- oneOfStrings ["aa", "AA"]
+ let num = if letter == "aa" then 229 else 197
+ return $ Str [chr num]
+
+iuml = try (string "\\\"") >> oneOfStrings ["\\i", "{\\i}"] >>
+ return (Str [chr 239])
+
+icirc = try (string "\\^") >> oneOfStrings ["\\i", "{\\i}"] >>
+ return (Str [chr 238])
+
+szlig = try (string "\\ss") >> return (Str [chr 223])
+
+oslash = try $ do
+ char '\\'
+ letter <- choice [char 'o', char 'O']
+ let num = if letter == 'o' then 248 else 216
+ return $ Str [chr num]
+
+aelig = try $ do
+ char '\\'
+ letter <- oneOfStrings ["ae", "AE"]
+ let num = if letter == "ae" then 230 else 198
+ return $ Str [chr num]
+
+pound = try (string "\\pounds") >> return (Str [chr 163])
+
+euro = try (string "\\euro") >> return (Str [chr 8364])
+
+copyright = try (string "\\copyright") >> return (Str [chr 169])
+
+sect = try (string "\\S") >> return (Str [chr 167])
+
+escapedChar = do
+ result <- escaped (oneOf " $%&_#{}\n")
+ return $ if result == Str "\n" then Str " " else result
+
+-- ignore standalone, nonescaped special characters
+unescapedChar = oneOf "`$^&_#{}|<>" >> return (Str "")
+
+specialChar = choice [ backslash, tilde, caret, bar, lt, gt, doubleQuote ]
+
+backslash = try (string "\\textbackslash") >> return (Str "\\")
+
+tilde = try (string "\\ensuremath{\\sim}") >> return (Str "~")
+
+caret = try (string "\\^{}") >> return (Str "^")
+
+bar = try (string "\\textbar") >> return (Str "\\")
+
+lt = try (string "\\textless") >> return (Str "<")
+
+gt = try (string "\\textgreater") >> return (Str ">")
+
+doubleQuote = char '"' >> return (Str "\"")
+
+code = code1 <|> code2
+
+code1 = try $ do
+ string "\\verb"
+ marker <- anyChar
+ result <- manyTill anyChar (char marker)
+ return $ Code $ removeLeadingTrailingSpace result
+
+code2 = try $ do
+ string "\\texttt{"
+ result <- manyTill (noneOf "\\\n~$%^&{}") (char '}')
+ return $ Code result
+
+emph = try $ oneOfStrings [ "\\emph{", "\\textit{" ] >>
+ manyTill inline (char '}') >>= return . Emph
+
+strikeout = try $ string "\\sout{" >> manyTill inline (char '}') >>=
+ return . Strikeout
+
+superscript = try $ string "\\textsuperscript{" >>
+ manyTill inline (char '}') >>= return . Superscript
+
+-- note: \textsubscript isn't a standard latex command, but we use
+-- a defined version in pandoc.
+subscript = try $ string "\\textsubscript{" >> manyTill inline (char '}') >>=
+ return . Subscript
+
+apostrophe = char '\'' >> return Apostrophe
+
+quoted = doubleQuoted <|> singleQuoted
+
+singleQuoted = enclosed singleQuoteStart singleQuoteEnd inline >>=
+ return . Quoted SingleQuote . normalizeSpaces
+
+doubleQuoted = enclosed doubleQuoteStart doubleQuoteEnd inline >>=
+ return . Quoted DoubleQuote . normalizeSpaces
+
+singleQuoteStart = char '`'
+
+singleQuoteEnd = try $ char '\'' >> notFollowedBy alphaNum
+
+doubleQuoteStart = string "``"
+
+doubleQuoteEnd = try $ string "''"
+
+ellipses = try $ string "\\ldots" >> optional (try (string "{}")) >>
+ return Ellipses
+
+enDash = try (string "--") >> return EnDash
+
+emDash = try (string "---") >> return EmDash
+
+hyphen = char '-' >> return (Str "-")
+
+lab = try $ do
+ string "\\label{"
+ result <- manyTill anyChar (char '}')
+ return $ Str $ "(" ++ result ++ ")"
+
+ref = try (string "\\ref{") >> manyTill anyChar (char '}') >>= return . Str
+
+strong = try (string "\\textbf{") >> manyTill inline (char '}') >>=
+ return . Strong
+
+whitespace = many1 (oneOf "~ \t") >> return Space
+
+-- hard line break
+linebreak = try (string "\\\\") >> return LineBreak
+
+spacer = try (string "\\,") >> return (Str "")
+
+str = many1 (noneOf specialChars) >>= return . Str
+
+-- endline internal to paragraph
+endline = try $ newline >> notFollowedBy blankline >> return Space
+
+-- math
+math = math1 <|> math2 <?> "math"
+
+math1 = try $ do
+ char '$'
+ result <- many (noneOf "$")
+ char '$'
+ return $ TeX ("$" ++ result ++ "$")
+
+math2 = try $ do
+ string "\\("
+ result <- many (noneOf "$")
+ string "\\)"
+ return $ TeX ("$" ++ result ++ "$")
+
+--
+-- links and images
+--
+
+url = try $ do
+ string "\\url"
+ url <- charsInBalanced '{' '}'
+ return $ Link [Code url] (url, "")
+
+link = try $ do
+ string "\\href{"
+ url <- manyTill anyChar (char '}')
+ char '{'
+ label <- manyTill inline (char '}')
+ return $ Link (normalizeSpaces label) (url, "")
+
+image = try $ do
+ ("includegraphics", _, args) <- command
+ let args' = filter isArg args -- filter out options
+ let src = if null args' then
+ ("", "")
+ else
+ (stripFirstAndLast (head args'), "")
+ return $ Image [Str "image"] src
+
+footnote = try $ do
+ (name, _, (contents:[])) <- command
+ if ((name == "footnote") || (name == "thanks"))
+ then string ""
+ else fail "not a footnote or thanks command"
+ let contents' = stripFirstAndLast contents
+ -- parse the extracted block, which may contain various block elements:
+ rest <- getInput
+ setInput $ contents'
+ blocks <- parseBlocks
+ setInput rest
+ return $ Note blocks
+
+-- | Parse any LaTeX command and return it in a raw TeX inline element.
+rawLaTeXInline :: GenParser Char ParserState Inline
+rawLaTeXInline = try $ do
+ (name, star, args) <- command
+ state <- getState
+ if ((name == "begin") || (name == "end") || (name == "item"))
+ then fail "not an inline command"
+ else string ""
+ return $ TeX ("\\" ++ name ++ star ++ concat args)
+