diff options
Diffstat (limited to 'src/Text/Pandoc')
46 files changed, 4577 insertions, 1897 deletions
diff --git a/src/Text/Pandoc/Asciify.hs b/src/Text/Pandoc/Asciify.hs index 8a5ccec5c..66490d5c6 100644 --- a/src/Text/Pandoc/Asciify.hs +++ b/src/Text/Pandoc/Asciify.hs @@ -17,7 +17,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | - Module : Text.Pandoc.SelfContained + Module : Text.Pandoc.Asciify Copyright : Copyright (C) 2013-2014 John MacFarlane License : GNU GPL, version 2 or above diff --git a/src/Text/Pandoc/Compat/Directory.hs b/src/Text/Pandoc/Compat/Directory.hs new file mode 100644 index 000000000..61dd5c525 --- /dev/null +++ b/src/Text/Pandoc/Compat/Directory.hs @@ -0,0 +1,21 @@ +{-# LANGUAGE CPP #-} +module Text.Pandoc.Compat.Directory ( getModificationTime ) + where + +#if MIN_VERSION_directory(1,2,0) +import System.Directory + + +#else +import qualified System.Directory as S +import Data.Time.Clock (UTCTime) +import Data.Time.Clock.POSIX +import System.Time + +getModificationTime :: FilePath -> IO UTCTime +getModificationTime fp = convert `fmap` S.getModificationTime fp + where + convert (TOD x _) = posixSecondsToUTCTime (realToFrac x) + +#endif + diff --git a/src/Text/Pandoc/Compat/Except.hs b/src/Text/Pandoc/Compat/Except.hs new file mode 100644 index 000000000..9ce7c0d36 --- /dev/null +++ b/src/Text/Pandoc/Compat/Except.hs @@ -0,0 +1,37 @@ +{-# LANGUAGE CPP #-} +module Text.Pandoc.Compat.Except ( ExceptT + , Except + , Error(..) + , runExceptT + , runExcept + , MonadError + , throwError + , catchError ) + where + +#if MIN_VERSION_mtl(2,2,1) +import Control.Monad.Except + +class Error a where + noMsg :: a + strMsg :: String -> a + + noMsg = strMsg "" + strMsg _ = noMsg + +#else +import Control.Monad.Error +import Control.Monad.Identity (Identity, runIdentity) + +type ExceptT = ErrorT + +type Except s a = ErrorT s Identity a + +runExceptT :: ExceptT e m a -> m (Either e a) +runExceptT = runErrorT + +runExcept :: ExceptT e Identity a -> Either e a +runExcept = runIdentity . runExceptT +#endif + + diff --git a/src/Text/Pandoc/MediaBag.hs b/src/Text/Pandoc/MediaBag.hs new file mode 100644 index 000000000..b19804b5f --- /dev/null +++ b/src/Text/Pandoc/MediaBag.hs @@ -0,0 +1,107 @@ +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{- +Copyright (C) 2014 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.MediaBag + Copyright : Copyright (C) 2014 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Definition of a MediaBag object to hold binary resources, and an +interface for interacting with it. +-} +module Text.Pandoc.MediaBag ( + MediaBag, + lookupMedia, + insertMedia, + mediaDirectory, + extractMediaBag + ) where +import System.FilePath +import System.Directory (createDirectoryIfMissing) +import qualified Data.Map as M +import qualified Data.ByteString.Lazy as BL +import Data.Monoid (Monoid) +import Control.Monad (when, MonadPlus(..)) +import Text.Pandoc.MIME (getMimeType) +import qualified Text.Pandoc.UTF8 as UTF8 +import Data.Maybe (fromMaybe) +import System.IO (stderr) + +-- | A container for a collection of binary resources, with names and +-- mime types. Note that a 'MediaBag' is a Monoid, so 'mempty' +-- can be used for an empty 'MediaBag', and '<>' can be used to append +-- two 'MediaBag's. +newtype MediaBag = MediaBag (M.Map String (String, BL.ByteString)) + deriving (Monoid) + +instance Show MediaBag where + show bag = "MediaBag " ++ show (mediaDirectory bag) + +-- | Insert a media item into a 'MediaBag', replacing any existing +-- value with the same name. +insertMedia :: FilePath -- ^ relative path and canonical name of resource + -> Maybe String -- ^ mime type (Nothing = determine from extension) + -> BL.ByteString -- ^ contents of resource + -> MediaBag + -> MediaBag +insertMedia fp mbMime contents (MediaBag mediamap) = + MediaBag (M.insert fp (mime, contents) mediamap) + where mime = fromMaybe "application/octet-stream" (mbMime `mplus` fallback) + fallback = case takeExtension fp of + ".gz" -> getMimeType $ dropExtension fp + _ -> getMimeType fp + +-- | Lookup a media item in a 'MediaBag', returning mime type and contents. +lookupMedia :: FilePath + -> MediaBag + -> Maybe (String, BL.ByteString) +lookupMedia fp (MediaBag mediamap) = M.lookup fp mediamap + +-- | Get a list of the file paths stored in a 'MediaBag', with +-- their corresponding mime types and the lengths in bytes of the contents. +mediaDirectory :: MediaBag -> [(String, String, Int)] +mediaDirectory (MediaBag mediamap) = + M.foldWithKey (\fp (mime,contents) -> + ((fp, mime, fromIntegral $ BL.length contents):)) [] mediamap + +-- | Extract contents of MediaBag to a given directory. Print informational +-- messages if 'verbose' is true. +extractMediaBag :: Bool + -> FilePath + -> MediaBag + -> IO () +extractMediaBag verbose dir (MediaBag mediamap) = do + sequence_ $ M.foldWithKey + (\fp (_ ,contents) -> + ((writeMedia verbose dir (fp, contents)):)) [] mediamap + +writeMedia :: Bool -> FilePath -> (FilePath, BL.ByteString) -> IO () +writeMedia verbose dir (subpath, bs) = do + -- we join and split to convert a/b/c to a\b\c on Windows; + -- in zip containers all paths use / + let fullpath = dir </> normalise subpath + createDirectoryIfMissing True $ takeDirectory fullpath + when verbose $ UTF8.hPutStrLn stderr $ "pandoc: extracting " ++ fullpath + BL.writeFile fullpath bs + + diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs index 611a6bb06..84ccbbdc9 100644 --- a/src/Text/Pandoc/Options.hs +++ b/src/Text/Pandoc/Options.hs @@ -41,6 +41,7 @@ module Text.Pandoc.Options ( Extension(..) , HTMLSlideVariant (..) , EPUBVersion (..) , WriterOptions (..) + , TrackChanges (..) , def , isEnabled ) where @@ -48,6 +49,8 @@ import Data.Set (Set) import qualified Data.Set as Set import Data.Default import Text.Pandoc.Highlighting (Style, pygments) +import Text.Pandoc.MediaBag (MediaBag) +import Data.Monoid -- | Individually selectable syntax extensions. data Extension = @@ -74,6 +77,8 @@ data Extension = | Ext_backtick_code_blocks -- ^ Github style ``` code blocks | Ext_inline_code_attributes -- ^ Allow attributes on inline code | Ext_markdown_in_html_blocks -- ^ Interpret as markdown inside HTML blocks + | Ext_native_divs -- ^ Use Div blocks for contents of <div> tags + | Ext_native_spans -- ^ Use Span inlines for contents of <span> | Ext_markdown_attribute -- ^ Interpret text inside HTML as markdown -- iff container has attribute 'markdown' | Ext_escaped_line_breaks -- ^ Treat a backslash at EOL as linebreak @@ -83,6 +88,8 @@ data Extension = | Ext_lists_without_preceding_blankline -- ^ Allow lists without preceding blank | Ext_startnum -- ^ Make start number of ordered list significant | Ext_definition_lists -- ^ Definition lists as in pandoc, mmd, php + | Ext_compact_definition_lists -- ^ Definition lists without + -- space between items, and disallow laziness | Ext_example_lists -- ^ Markdown-style numbered examples | Ext_all_symbols_escapable -- ^ Make all non-alphanumerics escapable | Ext_intraword_underscores -- ^ Treat underscore inside word as literal @@ -101,6 +108,7 @@ data Extension = | Ext_mmd_header_identifiers -- ^ Multimarkdown style header identifiers [myid] | Ext_implicit_header_references -- ^ Implicit reference links for headers | Ext_line_blocks -- ^ RST style line blocks + | Ext_epub_html_exts -- ^ Recognise the EPUB extended version of HTML deriving (Show, Read, Enum, Eq, Ord, Bounded) pandocExtensions :: Set Extension @@ -125,6 +133,8 @@ pandocExtensions = Set.fromList , Ext_backtick_code_blocks , Ext_inline_code_attributes , Ext_markdown_in_html_blocks + , Ext_native_divs + , Ext_native_spans , Ext_escaped_line_breaks , Ext_fancy_lists , Ext_startnum @@ -162,7 +172,6 @@ githubMarkdownExtensions = Set.fromList , Ext_raw_html , Ext_tex_math_single_backslash , Ext_fenced_code_blocks - , Ext_fenced_code_attributes , Ext_auto_identifiers , Ext_ascii_identifiers , Ext_backtick_code_blocks @@ -198,7 +207,6 @@ strictExtensions = Set.fromList data ReaderOptions = ReaderOptions{ readerExtensions :: Set Extension -- ^ Syntax extensions , readerSmart :: Bool -- ^ Smart punctuation - , readerStrict :: Bool -- ^ FOR TRANSITION ONLY , readerStandalone :: Bool -- ^ Standalone document with header , readerParseRaw :: Bool -- ^ Parse raw HTML, LaTeX , readerColumns :: Int -- ^ Number of columns in terminal @@ -211,13 +219,13 @@ data ReaderOptions = ReaderOptions{ -- indented code blocks , readerDefaultImageExtension :: String -- ^ Default extension for images , readerTrace :: Bool -- ^ Print debugging info + , readerTrackChanges :: TrackChanges } deriving (Show, Read) instance Default ReaderOptions where def = ReaderOptions{ readerExtensions = pandocExtensions , readerSmart = False - , readerStrict = False , readerStandalone = False , readerParseRaw = False , readerColumns = 80 @@ -227,6 +235,7 @@ instance Default ReaderOptions , readerIndentedCodeClasses = [] , readerDefaultImageExtension = "" , readerTrace = False + , readerTrackChanges = AcceptChanges } -- @@ -264,6 +273,12 @@ data HTMLSlideVariant = S5Slides | NoSlides deriving (Show, Read, Eq) +-- | Options for accepting or rejecting MS Word track-changes. +data TrackChanges = AcceptChanges + | RejectChanges + | AllChanges + deriving (Show, Read, Eq) + -- | Options for writers data WriterOptions = WriterOptions { writerStandalone :: Bool -- ^ Include header and footer @@ -305,7 +320,8 @@ data WriterOptions = WriterOptions , writerEpubChapterLevel :: Int -- ^ Header level for chapters (separate files) , writerTOCDepth :: Int -- ^ Number of levels to include in TOC , writerReferenceODT :: Maybe FilePath -- ^ Path to reference ODT if specified - , writerReferenceDocx :: Maybe FilePath -- ^ Ptah to reference DOCX if specified + , writerReferenceDocx :: Maybe FilePath -- ^ Path to reference DOCX if specified + , writerMediaBag :: MediaBag -- ^ Media collected by docx or epub reader } deriving Show instance Default WriterOptions where @@ -348,6 +364,7 @@ instance Default WriterOptions where , writerTOCDepth = 3 , writerReferenceODT = Nothing , writerReferenceDocx = Nothing + , writerMediaBag = mempty } -- | Returns True if the given extension is enabled. diff --git a/src/Text/Pandoc/PDF.hs b/src/Text/Pandoc/PDF.hs index bd55c565f..35554637a 100644 --- a/src/Text/Pandoc/PDF.hs +++ b/src/Text/Pandoc/PDF.hs @@ -30,7 +30,6 @@ Conversion of LaTeX documents to PDF. -} module Text.Pandoc.PDF ( makePDF ) where -import System.IO.Temp import Data.ByteString.Lazy (ByteString) import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Char8 as BC @@ -46,7 +45,7 @@ import Data.Maybe (fromMaybe) import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.Definition import Text.Pandoc.Walk (walkM) -import Text.Pandoc.Shared (fetchItem, warn) +import Text.Pandoc.Shared (fetchItem', warn, withTempDir) import Text.Pandoc.Options (WriterOptions(..)) import Text.Pandoc.MIME (extensionFromMimeType) import Text.Pandoc.Process (pipeProcess) @@ -55,14 +54,6 @@ import qualified Data.ByteString.Lazy as BL import Data.List (intercalate) #endif -withTempDir :: String -> (FilePath -> IO a) -> IO a -withTempDir = -#ifdef _WINDOWS - withTempDirectory "." -#else - withSystemTempDirectory -#endif - #ifdef _WINDOWS changePathSeparators :: FilePath -> FilePath changePathSeparators = intercalate "/" . splitDirectories @@ -74,26 +65,26 @@ makePDF :: String -- ^ pdf creator (pdflatex, lualatex, xelatex) -> Pandoc -- ^ document -> IO (Either ByteString ByteString) makePDF program writer opts doc = withTempDir "tex2pdf." $ \tmpdir -> do - doc' <- handleImages (writerSourceURL opts) tmpdir doc + doc' <- handleImages opts tmpdir doc let source = writer opts doc' tex2pdf' tmpdir program source -handleImages :: Maybe String -- ^ source base URL +handleImages :: WriterOptions -> FilePath -- ^ temp dir to store images -> Pandoc -- ^ document -> IO Pandoc -handleImages baseURL tmpdir = walkM (handleImage' baseURL tmpdir) +handleImages opts tmpdir = walkM (handleImage' opts tmpdir) -handleImage' :: Maybe String +handleImage' :: WriterOptions -> FilePath -> Inline -> IO Inline -handleImage' baseURL tmpdir (Image ils (src,tit)) = do +handleImage' opts tmpdir (Image ils (src,tit)) = do exists <- doesFileExist src if exists then return $ Image ils (src,tit) else do - res <- fetchItem baseURL src + res <- fetchItem' (writerMediaBag opts) (writerSourceURL opts) src case res of Right (contents, Just mime) -> do let ext = fromMaybe (takeExtension src) $ diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 8bc042e28..d1fba1e21 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -1,5 +1,9 @@ -{-# LANGUAGE GeneralizedNewtypeDeriving, TypeSynonymInstances, - FlexibleInstances#-} +{-# LANGUAGE + FlexibleContexts +, GeneralizedNewtypeDeriving +, TypeSynonymInstances +, MultiParamTypeClasses +, FlexibleInstances #-} {- Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> @@ -29,8 +33,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA A utility library with parsers used in pandoc readers. -} -module Text.Pandoc.Parsing ( (>>~), - anyLine, +module Text.Pandoc.Parsing ( anyLine, many1Till, notFollowedBy', oneOfStrings, @@ -62,6 +65,7 @@ module Text.Pandoc.Parsing ( (>>~), widthsFromIndices, gridTableWith, readWith, + readWithM, testStringWith, guardEnabled, guardDisabled, @@ -77,6 +81,7 @@ module Text.Pandoc.Parsing ( (>>~), HeaderType (..), ParserContext (..), QuoteContext (..), + HasQuoteContext (..), NoteTable, NoteTable', KeyTable, @@ -85,7 +90,6 @@ module Text.Pandoc.Parsing ( (>>~), toKey, registerHeader, smartPunctuation, - withQuoteContext, singleQuoteStart, singleQuoteEnd, doubleQuoteStart, @@ -98,12 +102,16 @@ module Text.Pandoc.Parsing ( (>>~), macro, applyMacros', Parser, + ParserT, F(..), runF, askF, asksF, + token, -- * Re-exports from Text.Pandoc.Parsec + Stream, runParser, + runParserT, parse, anyToken, getInput, @@ -154,7 +162,6 @@ module Text.Pandoc.Parsing ( (>>~), setSourceColumn, setSourceLine, newPos, - token ) where @@ -164,25 +171,29 @@ import Text.Pandoc.Builder (Blocks, Inlines, rawBlock, HasMeta(..)) import qualified Text.Pandoc.Builder as B import Text.Pandoc.XML (fromEntities) import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn) -import Text.Parsec +import Text.Parsec hiding (token) import Text.Parsec.Pos (newPos) -import Data.Char ( toLower, toUpper, ord, chr, isAscii, isAlphaNum, isDigit, +import Data.Char ( toLower, toUpper, ord, chr, isAscii, isAlphaNum, isHexDigit, isSpace ) import Data.List ( intercalate, transpose ) import Text.Pandoc.Shared import qualified Data.Map as M -import Text.TeXMath.Macros (applyMacros, Macro, parseMacroDefinitions) +import Text.TeXMath.Readers.TeX.Macros (applyMacros, Macro, + parseMacroDefinitions) import Text.Pandoc.Compat.TagSoupEntity ( lookupEntity ) import Text.Pandoc.Asciify (toAsciiChar) import Data.Default import qualified Data.Set as Set import Control.Monad.Reader -import Control.Applicative ((*>), (<*), (<$), liftA2, Applicative) +import Control.Monad.Identity +import Control.Applicative ((<$>), (<*>), (*>), (<*), (<$), Applicative) import Data.Monoid import Data.Maybe (catMaybes) type Parser t s = Parsec t s +type ParserT = ParsecT + newtype F a = F { unF :: Reader ParserState a } deriving (Monad, Applicative, Functor) runF :: F a -> ParserState -> a @@ -199,13 +210,8 @@ instance Monoid a => Monoid (F a) where mappend = liftM2 mappend mconcat = liftM mconcat . sequence --- | Like >>, but returns the operation on the left. --- (Suggested by Tillmann Rendel on Haskell-cafe list.) -(>>~) :: (Monad m) => m a -> m b -> m a -a >>~ b = a >>= \x -> b >> return x - -- | Parse any line of text -anyLine :: Parser [Char] st [Char] +anyLine :: Stream [Char] m Char => ParserT [Char] st m [Char] anyLine = do -- This is much faster than: -- manyTill anyChar newline @@ -221,9 +227,10 @@ anyLine = do _ -> mzero -- | Like @manyTill@, but reads at least one item. -many1Till :: Parser [tok] st a - -> Parser [tok] st end - -> Parser [tok] st [a] +many1Till :: Stream s m t + => ParserT s st m a + -> ParserT s st m end + -> ParserT s st m [a] many1Till p end = do first <- p rest <- manyTill p end @@ -232,21 +239,21 @@ many1Till p end = do -- | A more general form of @notFollowedBy@. This one allows any -- type of parser to be specified, and succeeds only if that parser fails. -- It does not consume any input. -notFollowedBy' :: Show b => Parser [a] st b -> Parser [a] st () +notFollowedBy' :: (Show b, Stream s m a) => ParserT s st m b -> ParserT s st m () notFollowedBy' p = try $ join $ do a <- try p return (unexpected (show a)) <|> return (return ()) -- (This version due to Andrew Pimlott on the Haskell mailing list.) -oneOfStrings' :: (Char -> Char -> Bool) -> [String] -> Parser [Char] st String +oneOfStrings' :: Stream s m Char => (Char -> Char -> Bool) -> [String] -> ParserT s st m String oneOfStrings' _ [] = fail "no strings" oneOfStrings' matches strs = try $ do c <- anyChar let strs' = [xs | (x:xs) <- strs, x `matches` c] case strs' of [] -> fail "not found" - _ -> (c:) `fmap` oneOfStrings' matches strs' + _ -> (c:) <$> oneOfStrings' matches strs' <|> if "" `elem` strs' then return [c] else fail "not found" @@ -254,11 +261,11 @@ oneOfStrings' matches strs = try $ do -- | Parses one of a list of strings. If the list contains -- two strings one of which is a prefix of the other, the longer -- string will be matched if possible. -oneOfStrings :: [String] -> Parser [Char] st String +oneOfStrings :: Stream s m Char => [String] -> ParserT s st m String oneOfStrings = oneOfStrings' (==) -- | Parses one of a list of strings (tried in order), case insensitive. -oneOfStringsCI :: [String] -> Parser [Char] st String +oneOfStringsCI :: Stream s m Char => [String] -> ParserT s st m String oneOfStringsCI = oneOfStrings' ciMatch where ciMatch x y = toLower' x == toLower' y -- this optimizes toLower by checking common ASCII case @@ -269,35 +276,35 @@ oneOfStringsCI = oneOfStrings' ciMatch | otherwise = toLower c -- | Parses a space or tab. -spaceChar :: Parser [Char] st Char +spaceChar :: Stream s m Char => ParserT s st m Char spaceChar = satisfy $ \c -> c == ' ' || c == '\t' -- | Parses a nonspace, nonnewline character. -nonspaceChar :: Parser [Char] st Char +nonspaceChar :: Stream s m Char => ParserT s st m Char nonspaceChar = satisfy $ flip notElem ['\t', '\n', ' ', '\r'] -- | Skips zero or more spaces or tabs. -skipSpaces :: Parser [Char] st () +skipSpaces :: Stream s m Char => ParserT s st m () skipSpaces = skipMany spaceChar -- | Skips zero or more spaces or tabs, then reads a newline. -blankline :: Parser [Char] st Char +blankline :: Stream s m Char => ParserT s st m Char blankline = try $ skipSpaces >> newline -- | Parses one or more blank lines and returns a string of newlines. -blanklines :: Parser [Char] st [Char] +blanklines :: Stream s m Char => ParserT s st m [Char] blanklines = many1 blankline -- | Parses material enclosed between start and end parsers. -enclosed :: Parser [Char] st t -- ^ start parser - -> Parser [Char] st end -- ^ end parser - -> Parser [Char] st a -- ^ content parser (to be used repeatedly) - -> Parser [Char] st [a] +enclosed :: Stream s m Char => ParserT s st m t -- ^ start parser + -> ParserT s st m end -- ^ end parser + -> ParserT s st m a -- ^ content parser (to be used repeatedly) + -> ParserT s st m [a] enclosed start end parser = try $ start >> notFollowedBy space >> many1Till parser end -- | Parse string, case insensitive. -stringAnyCase :: [Char] -> Parser [Char] st String +stringAnyCase :: Stream s m Char => [Char] -> ParserT s st m String stringAnyCase [] = string "" stringAnyCase (x:xs) = do firstChar <- char (toUpper x) <|> char (toLower x) @@ -305,7 +312,7 @@ stringAnyCase (x:xs) = do return (firstChar:rest) -- | Parse contents of 'str' using 'parser' and return result. -parseFromString :: Parser [tok] st a -> [tok] -> Parser [tok] st a +parseFromString :: Stream s m t => ParserT s st m a -> s -> ParserT s st m a parseFromString parser str = do oldPos <- getPosition oldInput <- getInput @@ -316,7 +323,7 @@ parseFromString parser str = do return result -- | Parse raw line block up to and including blank lines. -lineClump :: Parser [Char] st String +lineClump :: Stream [Char] m Char => ParserT [Char] st m String lineClump = blanklines <|> (many1 (notFollowedBy blankline >> anyLine) >>= return . unlines) @@ -325,8 +332,8 @@ lineClump = blanklines -- pairs of open and close, which must be different. For example, -- @charsInBalanced '(' ')' anyChar@ will parse "(hello (there))" -- and return "hello (there)". -charsInBalanced :: Char -> Char -> Parser [Char] st Char - -> Parser [Char] st String +charsInBalanced :: Stream s m Char => Char -> Char -> ParserT s st m Char + -> ParserT s st m String charsInBalanced open close parser = try $ do char open let isDelim c = c == open || c == close @@ -350,8 +357,8 @@ uppercaseRomanDigits :: [Char] uppercaseRomanDigits = map toUpper lowercaseRomanDigits -- | Parses a roman numeral (uppercase or lowercase), returns number. -romanNumeral :: Bool -- ^ Uppercase if true - -> Parser [Char] st Int +romanNumeral :: Stream s m Char => Bool -- ^ Uppercase if true + -> ParserT s st m Int romanNumeral upperCase = do let romanDigits = if upperCase then uppercaseRomanDigits @@ -383,12 +390,12 @@ romanNumeral upperCase = do -- | Parses an email address; returns original and corresponding -- escaped mailto: URI. -emailAddress :: Parser [Char] st (String, String) -emailAddress = try $ liftA2 toResult mailbox (char '@' *> domain) +emailAddress :: Stream s m Char => ParserT s st m (String, String) +emailAddress = try $ toResult <$> mailbox <*> (char '@' *> domain) where toResult mbox dom = let full = fromEntities $ mbox ++ '@':dom in (full, escapeURI $ "mailto:" ++ full) - mailbox = intercalate "." `fmap` (emailWord `sepby1` dot) - domain = intercalate "." `fmap` (subdomain `sepby1` dot) + mailbox = intercalate "." <$> (emailWord `sepby1` dot) + domain = intercalate "." <$> (subdomain `sepby1` dot) dot = char '.' subdomain = many1 $ alphaNum <|> innerPunct innerPunct = try (satisfy (\c -> isEmailPunct c || c == '@') <* @@ -398,11 +405,11 @@ emailAddress = try $ liftA2 toResult mailbox (char '@' *> domain) isEmailPunct c = c `elem` "!\"#$%&'*+-/=?^_{|}~;" -- note: sepBy1 from parsec consumes input when sep -- succeeds and p fails, so we use this variant here. - sepby1 p sep = liftA2 (:) p (many (try $ sep >> p)) + sepby1 p sep = (:) <$> p <*> (many (try $ sep >> p)) -- Schemes from http://www.iana.org/assignments/uri-schemes.html plus --- the unofficial schemes coap, doi, javascript. +-- the unofficial schemes coap, doi, javascript, isbn, pmid schemes :: [String] schemes = ["coap","doi","javascript","aaa","aaas","about","acap","cap","cid", "crid","data","dav","dict","dns","file","ftp","geo","go","gopher", @@ -424,13 +431,13 @@ schemes = ["coap","doi","javascript","aaa","aaas","about","acap","cap","cid", "rtmp","secondlife","sftp","sgn","skype","smb","soldat","spotify", "ssh","steam","svn","teamspeak","things","udp","unreal","ut2004", "ventrilo","view-source","webcal","wtai","wyciwyg","xfire","xri", - "ymsgr"] + "ymsgr", "isbn", "pmid"] -uriScheme :: Parser [Char] st String +uriScheme :: Stream s m Char => ParserT s st m String uriScheme = oneOfStringsCI schemes -- | Parses a URI. Returns pair of original and URI-escaped version. -uri :: Parser [Char] st (String, String) +uri :: Stream [Char] m Char => ParserT [Char] st m (String, String) uri = try $ do scheme <- uriScheme char ':' @@ -451,7 +458,7 @@ uri = try $ do <|> entity <|> (try $ punct >> lookAhead (void (satisfy isWordChar) <|> percentEscaped)) - str <- snd `fmap` withRaw (skipMany1 ( () <$ + str <- snd <$> withRaw (skipMany1 ( () <$ (enclosed (char '(') (char ')') uriChunk <|> enclosed (char '{') (char '}') uriChunk <|> enclosed (char '[') (char ']') uriChunk) @@ -460,7 +467,7 @@ uri = try $ do let uri' = scheme ++ ":" ++ fromEntities str' return (uri', escapeURI uri') -mathInlineWith :: String -> String -> Parser [Char] st String +mathInlineWith :: Stream s m Char => String -> String -> ParserT s st m String mathInlineWith op cl = try $ do string op notFollowedBy space @@ -474,12 +481,13 @@ mathInlineWith op cl = try $ do notFollowedBy digit -- to prevent capture of $5 return $ concat words' -mathDisplayWith :: String -> String -> Parser [Char] st String +mathDisplayWith :: Stream s m Char => String -> String -> ParserT s st m String mathDisplayWith op cl = try $ do string op - many1Till (noneOf "\n" <|> (newline >>~ notFollowedBy' blankline)) (try $ string cl) + many1Till (noneOf "\n" <|> (newline <* notFollowedBy' blankline)) (try $ string cl) -mathDisplay :: Parser [Char] ParserState String +mathDisplay :: (HasReaderOptions st, Stream s m Char) + => ParserT s st m String mathDisplay = (guardEnabled Ext_tex_math_dollars >> mathDisplayWith "$$" "$$") <|> (guardEnabled Ext_tex_math_single_backslash >> @@ -487,7 +495,8 @@ mathDisplay = <|> (guardEnabled Ext_tex_math_double_backslash >> mathDisplayWith "\\\\[" "\\\\]") -mathInline :: Parser [Char] ParserState String +mathInline :: (HasReaderOptions st , Stream s m Char) + => ParserT s st m String mathInline = (guardEnabled Ext_tex_math_dollars >> mathInlineWith "$" "$") <|> (guardEnabled Ext_tex_math_single_backslash >> @@ -499,8 +508,9 @@ mathInline = -- displacement (the difference between the source column at the end -- and the source column at the beginning). Vertical displacement -- (source row) is ignored. -withHorizDisplacement :: Parser [Char] st a -- ^ Parser to apply - -> Parser [Char] st (a, Int) -- ^ (result, displacement) +withHorizDisplacement :: Stream s m Char + => ParserT s st m a -- ^ Parser to apply + -> ParserT s st m (a, Int) -- ^ (result, displacement) withHorizDisplacement parser = do pos1 <- getPosition result <- parser @@ -509,7 +519,7 @@ withHorizDisplacement parser = do -- | Applies a parser and returns the raw string that was parsed, -- along with the value produced by the parser. -withRaw :: Monad m => ParsecT [Char] st m a -> ParsecT [Char] st m (a, [Char]) +withRaw :: Stream [Char] m Char => ParsecT [Char] st m a -> ParsecT [Char] st m (a, [Char]) withRaw parser = do pos1 <- getPosition inp <- getInput @@ -525,12 +535,13 @@ withRaw parser = do return (result, raw) -- | Parses backslash, then applies character parser. -escaped :: Parser [Char] st Char -- ^ Parser for character to escape - -> Parser [Char] st Char +escaped :: Stream s m Char + => ParserT s st m Char -- ^ Parser for character to escape + -> ParserT s st m Char escaped parser = try $ char '\\' >> parser -- | Parse character entity. -characterReference :: Parser [Char] st Char +characterReference :: Stream s m Char => ParserT s st m Char characterReference = try $ do char '&' ent <- many1Till nonspaceChar (char ';') @@ -539,19 +550,19 @@ characterReference = try $ do Nothing -> fail "entity not found" -- | Parses an uppercase roman numeral and returns (UpperRoman, number). -upperRoman :: Parser [Char] st (ListNumberStyle, Int) +upperRoman :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) upperRoman = do num <- romanNumeral True return (UpperRoman, num) -- | Parses a lowercase roman numeral and returns (LowerRoman, number). -lowerRoman :: Parser [Char] st (ListNumberStyle, Int) +lowerRoman :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) lowerRoman = do num <- romanNumeral False return (LowerRoman, num) -- | Parses a decimal numeral and returns (Decimal, number). -decimal :: Parser [Char] st (ListNumberStyle, Int) +decimal :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) decimal = do num <- many1 digit return (Decimal, read num) @@ -560,7 +571,8 @@ decimal = do -- returns (DefaultStyle, [next example number]). The next -- example number is incremented in parser state, and the label -- (if present) is added to the label table. -exampleNum :: Parser [Char] ParserState (ListNumberStyle, Int) +exampleNum :: Stream s m Char + => ParserT s ParserState m (ListNumberStyle, Int) exampleNum = do char '@' lab <- many (alphaNum <|> satisfy (\c -> c == '_' || c == '-')) @@ -574,38 +586,39 @@ exampleNum = do return (Example, num) -- | Parses a '#' returns (DefaultStyle, 1). -defaultNum :: Parser [Char] st (ListNumberStyle, Int) +defaultNum :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) defaultNum = do char '#' return (DefaultStyle, 1) -- | Parses a lowercase letter and returns (LowerAlpha, number). -lowerAlpha :: Parser [Char] st (ListNumberStyle, Int) +lowerAlpha :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) lowerAlpha = do ch <- oneOf ['a'..'z'] return (LowerAlpha, ord ch - ord 'a' + 1) -- | Parses an uppercase letter and returns (UpperAlpha, number). -upperAlpha :: Parser [Char] st (ListNumberStyle, Int) +upperAlpha :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) upperAlpha = do ch <- oneOf ['A'..'Z'] return (UpperAlpha, ord ch - ord 'A' + 1) -- | Parses a roman numeral i or I -romanOne :: Parser [Char] st (ListNumberStyle, Int) +romanOne :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) romanOne = (char 'i' >> return (LowerRoman, 1)) <|> (char 'I' >> return (UpperRoman, 1)) -- | Parses an ordered list marker and returns list attributes. -anyOrderedListMarker :: Parser [Char] ParserState ListAttributes +anyOrderedListMarker :: Stream s m Char => ParserT s ParserState m ListAttributes anyOrderedListMarker = choice $ [delimParser numParser | delimParser <- [inPeriod, inOneParen, inTwoParens], numParser <- [decimal, exampleNum, defaultNum, romanOne, lowerAlpha, lowerRoman, upperAlpha, upperRoman]] -- | Parses a list number (num) followed by a period, returns list attributes. -inPeriod :: Parser [Char] st (ListNumberStyle, Int) - -> Parser [Char] st ListAttributes +inPeriod :: Stream s m Char + => ParserT s st m (ListNumberStyle, Int) + -> ParserT s st m ListAttributes inPeriod num = try $ do (style, start) <- num char '.' @@ -615,16 +628,18 @@ inPeriod num = try $ do return (start, style, delim) -- | Parses a list number (num) followed by a paren, returns list attributes. -inOneParen :: Parser [Char] st (ListNumberStyle, Int) - -> Parser [Char] st ListAttributes +inOneParen :: Stream s m Char + => ParserT s st m (ListNumberStyle, Int) + -> ParserT s st m ListAttributes inOneParen num = try $ do (style, start) <- num char ')' return (start, style, OneParen) -- | Parses a list number (num) enclosed in parens, returns list attributes. -inTwoParens :: Parser [Char] st (ListNumberStyle, Int) - -> Parser [Char] st ListAttributes +inTwoParens :: Stream s m Char + => ParserT s st m (ListNumberStyle, Int) + -> ParserT s st m ListAttributes inTwoParens num = try $ do char '(' (style, start) <- num @@ -633,9 +648,10 @@ inTwoParens num = try $ do -- | Parses an ordered list marker with a given style and delimiter, -- returns number. -orderedListMarker :: ListNumberStyle +orderedListMarker :: Stream s m Char + => ListNumberStyle -> ListNumberDelim - -> Parser [Char] ParserState Int + -> ParserT s ParserState m Int orderedListMarker style delim = do let num = defaultNum <|> -- # can continue any kind of list case style of @@ -655,12 +671,12 @@ orderedListMarker style delim = do return start -- | Parses a character reference and returns a Str element. -charRef :: Parser [Char] st Inline +charRef :: Stream s m Char => ParserT s st m Inline charRef = do c <- characterReference return $ Str [c] -lineBlockLine :: Parser [Char] st String +lineBlockLine :: Stream [Char] m Char => ParserT [Char] st m String lineBlockLine = try $ do char '|' char ' ' @@ -671,7 +687,7 @@ lineBlockLine = try $ do return $ white ++ unwords (line : continuations) -- | Parses an RST-style line block and returns a list of strings. -lineBlockLines :: Parser [Char] st [String] +lineBlockLines :: Stream [Char] m Char => ParserT [Char] st m [String] lineBlockLines = try $ do lines' <- many1 lineBlockLine skipMany1 $ blankline <|> try (char '|' >> blankline) @@ -679,11 +695,12 @@ lineBlockLines = try $ do -- | Parse a table using 'headerParser', 'rowParser', -- 'lineParser', and 'footerParser'. -tableWith :: Parser [Char] ParserState ([[Block]], [Alignment], [Int]) - -> ([Int] -> Parser [Char] ParserState [[Block]]) - -> Parser [Char] ParserState sep - -> Parser [Char] ParserState end - -> Parser [Char] ParserState Block +tableWith :: Stream s m Char + => ParserT s ParserState m ([[Block]], [Alignment], [Int]) + -> ([Int] -> ParserT s ParserState m [[Block]]) + -> ParserT s ParserState m sep + -> ParserT s ParserState m end + -> ParserT s ParserState m Block tableWith headerParser rowParser lineParser footerParser = try $ do (heads, aligns, indices) <- headerParser lines' <- rowParser indices `sepEndBy1` lineParser @@ -725,9 +742,10 @@ widthsFromIndices numColumns' indices = -- (which may be grid), then the rows, -- which may be grid, separated by blank lines, and -- ending with a footer (dashed line followed by blank line). -gridTableWith :: Parser [Char] ParserState [Block] -- ^ Block list parser +gridTableWith :: Stream [Char] m Char + => ParserT [Char] ParserState m [Block] -- ^ Block list parser -> Bool -- ^ Headerless table - -> Parser [Char] ParserState Block + -> ParserT [Char] ParserState m Block gridTableWith blocks headless = tableWith (gridTableHeader headless blocks) (gridTableRow blocks) (gridTableSep '-') gridTableFooter @@ -736,27 +754,28 @@ gridTableSplitLine :: [Int] -> String -> [String] gridTableSplitLine indices line = map removeFinalBar $ tail $ splitStringByIndices (init indices) $ trimr line -gridPart :: Char -> Parser [Char] st (Int, Int) +gridPart :: Stream s m Char => Char -> ParserT s st m (Int, Int) gridPart ch = do dashes <- many1 (char ch) char '+' return (length dashes, length dashes + 1) -gridDashedLines :: Char -> Parser [Char] st [(Int,Int)] -gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) >>~ blankline +gridDashedLines :: Stream s m Char => Char -> ParserT s st m [(Int,Int)] +gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) <* blankline removeFinalBar :: String -> String removeFinalBar = reverse . dropWhile (`elem` " \t") . dropWhile (=='|') . reverse -- | Separator between rows of grid table. -gridTableSep :: Char -> Parser [Char] ParserState Char +gridTableSep :: Stream s m Char => Char -> ParserT s ParserState m Char gridTableSep ch = try $ gridDashedLines ch >> return '\n' -- | Parse header for a grid table. -gridTableHeader :: Bool -- ^ Headerless table - -> Parser [Char] ParserState [Block] - -> Parser [Char] ParserState ([[Block]], [Alignment], [Int]) +gridTableHeader :: Stream [Char] m Char + => Bool -- ^ Headerless table + -> ParserT [Char] ParserState m [Block] + -> ParserT [Char] ParserState m ([[Block]], [Alignment], [Int]) gridTableHeader headless blocks = try $ do optional blanklines dashes <- gridDashedLines '-' @@ -779,16 +798,17 @@ gridTableHeader headless blocks = try $ do heads <- mapM (parseFromString blocks) $ map trim rawHeads return (heads, aligns, indices) -gridTableRawLine :: [Int] -> Parser [Char] ParserState [String] +gridTableRawLine :: Stream s m Char => [Int] -> ParserT s ParserState m [String] gridTableRawLine indices = do char '|' line <- many1Till anyChar newline return (gridTableSplitLine indices line) -- | Parse row of grid table. -gridTableRow :: Parser [Char] ParserState [Block] +gridTableRow :: Stream [Char] m Char + => ParserT [Char] ParserState m [Block] -> [Int] - -> Parser [Char] ParserState [[Block]] + -> ParserT [Char] ParserState m [[Block]] gridTableRow blocks indices = do colLines <- many1 (gridTableRawLine indices) let cols = map ((++ "\n") . unlines . removeOneLeadingSpace) $ @@ -807,19 +827,21 @@ compactifyCell :: [Block] -> [Block] compactifyCell bs = head $ compactify [bs] -- | Parse footer for a grid table. -gridTableFooter :: Parser [Char] ParserState [Char] +gridTableFooter :: Stream s m Char => ParserT s ParserState m [Char] gridTableFooter = blanklines --- --- | Parse a string with a given parser and state. -readWith :: Parser [Char] st a -- ^ parser - -> st -- ^ initial state - -> [Char] -- ^ input - -> a -readWith parser state input = - case runParser parser state "source" input of - Left err' -> +-- | Removes the ParsecT layer from the monad transformer stack +readWithM :: (Monad m, Functor m) + => ParserT [Char] st m a -- ^ parser + -> st -- ^ initial state + -> String -- ^ input + -> m a +readWithM parser state input = + handleError <$> (runParserT parser state "source" input) + where + handleError (Left err') = let errPos = errorPos err' errLine = sourceLine errPos errColumn = sourceColumn errPos @@ -827,11 +849,19 @@ readWith parser state input = in error $ "\nError at " ++ show err' ++ "\n" ++ theline ++ "\n" ++ replicate (errColumn - 1) ' ' ++ "^" - Right result -> result + handleError (Right result) = result + +-- | Parse a string with a given parser and state +readWith :: Parser [Char] st a + -> st + -> String + -> a +readWith p t inp = runIdentity $ readWithM p t inp -- | Parse a string with @parser@ (for testing). -testStringWith :: (Show a) => Parser [Char] ParserState a - -> String +testStringWith :: (Show a, Stream [Char] Identity Char) + => ParserT [Char] ParserState Identity a + -> [Char] -> IO () testStringWith parser str = UTF8.putStrLn $ show $ readWith parser defaultParserState str @@ -863,6 +893,8 @@ data ParserState = ParserState -- roles), 3) Source language annotation for code (could be used to -- annotate role classes too). stateCaption :: Maybe Inlines, -- ^ Caption in current environment + stateInHtmlBlock :: Maybe String, -- ^ Tag type of HTML block being parsed + stateMarkdownAttribute :: Bool, -- ^ True if in markdown=1 context stateWarnings :: [String] -- ^ Warnings generated by the parser } @@ -877,9 +909,24 @@ instance HasMeta ParserState where class HasReaderOptions st where extractReaderOptions :: st -> ReaderOptions - getOption :: (ReaderOptions -> b) -> Parser s st b + getOption :: (Stream s m t) => (ReaderOptions -> b) -> ParserT s st m b -- default - getOption f = (f . extractReaderOptions) `fmap` getState + getOption f = (f . extractReaderOptions) <$> getState + +class HasQuoteContext st m where + getQuoteContext :: (Stream s m t) => ParsecT s st m QuoteContext + withQuoteContext :: QuoteContext -> ParsecT s st m a -> ParsecT s st m a + +instance Monad m => HasQuoteContext ParserState m where + getQuoteContext = stateQuoteContext <$> getState + withQuoteContext context parser = do + oldState <- getState + let oldQuoteContext = stateQuoteContext oldState + setState oldState { stateQuoteContext = context } + result <- parser + newState <- getState + setState newState { stateQuoteContext = oldQuoteContext } + return result instance HasReaderOptions ParserState where extractReaderOptions = stateOptions @@ -941,22 +988,24 @@ defaultParserState = stateRstDefaultRole = "title-reference", stateRstCustomRoles = M.empty, stateCaption = Nothing, + stateInHtmlBlock = Nothing, + stateMarkdownAttribute = False, stateWarnings = []} -- | Succeed only if the extension is enabled. -guardEnabled :: HasReaderOptions st => Extension -> Parser s st () +guardEnabled :: (Stream s m a, HasReaderOptions st) => Extension -> ParserT s st m () guardEnabled ext = getOption readerExtensions >>= guard . Set.member ext -- | Succeed only if the extension is disabled. -guardDisabled :: HasReaderOptions st => Extension -> Parser s st () +guardDisabled :: (Stream s m a, HasReaderOptions st) => Extension -> ParserT s st m () guardDisabled ext = getOption readerExtensions >>= guard . not . Set.member ext -- | Update the position on which the last string ended. -updateLastStrPos :: HasLastStrPosition st => Parser s st () +updateLastStrPos :: (Stream s m a, HasLastStrPosition st) => ParserT s st m () updateLastStrPos = getPosition >>= updateState . setLastStrPos -- | Whether we are right after the end of a string. -notAfterString :: HasLastStrPosition st => Parser s st Bool +notAfterString :: (Stream s m a, HasLastStrPosition st) => ParserT s st m Bool notAfterString = do pos <- getPosition st <- getState @@ -996,10 +1045,10 @@ type SubstTable = M.Map Key Inlines -- and the auto_identifers extension is set, generate a new -- unique identifier, and update the list of identifiers -- in state. -registerHeader :: (HasReaderOptions st, HasHeaderMap st, HasIdentifierList st) - => Attr -> Inlines -> Parser s st Attr +registerHeader :: (Stream s m a, HasReaderOptions st, HasHeaderMap st, HasIdentifierList st) + => Attr -> Inlines -> ParserT s st m Attr registerHeader (ident,classes,kvs) header' = do - ids <- extractIdentifierList `fmap` getState + ids <- extractIdentifierList <$> getState exts <- getOption readerExtensions let insert' = M.insertWith (\_new old -> old) if null ident && Ext_auto_identifiers `Set.member` exts @@ -1018,136 +1067,113 @@ registerHeader (ident,classes,kvs) header' = do return (ident,classes,kvs) -- | Fail unless we're in "smart typography" mode. -failUnlessSmart :: HasReaderOptions st => Parser s st () +failUnlessSmart :: (Stream s m a, HasReaderOptions st) => ParserT s st m () failUnlessSmart = getOption readerSmart >>= guard -smartPunctuation :: Parser [Char] ParserState Inlines - -> Parser [Char] ParserState Inlines +smartPunctuation :: (HasReaderOptions st, HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines smartPunctuation inlineParser = do failUnlessSmart choice [ quoted inlineParser, apostrophe, dash, ellipses ] -apostrophe :: Parser [Char] ParserState Inlines +apostrophe :: Stream s m Char => ParserT s st m Inlines apostrophe = (char '\'' <|> char '\8217') >> return (B.str "\x2019") -quoted :: Parser [Char] ParserState Inlines - -> Parser [Char] ParserState Inlines +quoted :: (HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines quoted inlineParser = doubleQuoted inlineParser <|> singleQuoted inlineParser -withQuoteContext :: QuoteContext - -> Parser [tok] ParserState a - -> Parser [tok] ParserState a -withQuoteContext context parser = do - oldState <- getState - let oldQuoteContext = stateQuoteContext oldState - setState oldState { stateQuoteContext = context } - result <- parser - newState <- getState - setState newState { stateQuoteContext = oldQuoteContext } - return result - -singleQuoted :: Parser [Char] ParserState Inlines - -> Parser [Char] ParserState Inlines +singleQuoted :: (HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines singleQuoted inlineParser = try $ do singleQuoteStart withQuoteContext InSingleQuote $ many1Till inlineParser singleQuoteEnd >>= return . B.singleQuoted . mconcat -doubleQuoted :: Parser [Char] ParserState Inlines - -> Parser [Char] ParserState Inlines +doubleQuoted :: (HasQuoteContext st m, Stream s m Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines doubleQuoted inlineParser = try $ do doubleQuoteStart withQuoteContext InDoubleQuote $ manyTill inlineParser doubleQuoteEnd >>= return . B.doubleQuoted . mconcat -failIfInQuoteContext :: QuoteContext -> Parser [tok] ParserState () +failIfInQuoteContext :: (HasQuoteContext st m, Stream s m t) + => QuoteContext + -> ParserT s st m () failIfInQuoteContext context = do - st <- getState - if stateQuoteContext st == context + context' <- getQuoteContext + if context' == context then fail "already inside quotes" else return () -charOrRef :: [Char] -> Parser [Char] st Char +charOrRef :: Stream s m Char => String -> ParserT s st m Char charOrRef cs = oneOf cs <|> try (do c <- characterReference guard (c `elem` cs) return c) -singleQuoteStart :: Parser [Char] ParserState () +singleQuoteStart :: (HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) + => ParserT s st m () singleQuoteStart = do failIfInQuoteContext InSingleQuote -- single quote start can't be right after str guard =<< notAfterString () <$ charOrRef "'\8216\145" -singleQuoteEnd :: Parser [Char] st () +singleQuoteEnd :: Stream s m Char + => ParserT s st m () singleQuoteEnd = try $ do charOrRef "'\8217\146" notFollowedBy alphaNum -doubleQuoteStart :: Parser [Char] ParserState () +doubleQuoteStart :: (HasQuoteContext st m, Stream s m Char) + => ParserT s st m () doubleQuoteStart = do failIfInQuoteContext InDoubleQuote try $ do charOrRef "\"\8220\147" notFollowedBy . satisfy $ flip elem [' ', '\t', '\n'] -doubleQuoteEnd :: Parser [Char] st () -doubleQuoteEnd = do - charOrRef "\"\8221\148" - return () +doubleQuoteEnd :: Stream s m Char + => ParserT s st m () +doubleQuoteEnd = void (charOrRef "\"\8221\148") -ellipses :: Parser [Char] st Inlines -ellipses = do - try (charOrRef "\8230\133") <|> try (string "..." >> return '…') - return (B.str "\8230") +ellipses :: Stream s m Char + => ParserT s st m Inlines +ellipses = try (string "..." >> return (B.str "\8230")) -dash :: Parser [Char] ParserState Inlines -dash = do +dash :: (HasReaderOptions st, Stream s m Char) + => ParserT s st m Inlines +dash = try $ do oldDashes <- getOption readerOldDashes if oldDashes - then emDashOld <|> enDashOld - else B.str `fmap` (hyphenDash <|> emDash <|> enDash) - --- Two hyphens = en-dash, three = em-dash -hyphenDash :: Parser [Char] st String -hyphenDash = do - try $ string "--" - option "\8211" (char '-' >> return "\8212") - -emDash :: Parser [Char] st String -emDash = do - try (charOrRef "\8212\151") - return "\8212" - -enDash :: Parser [Char] st String -enDash = do - try (charOrRef "\8212\151") - return "\8211" - -enDashOld :: Parser [Char] st Inlines -enDashOld = do - try (charOrRef "\8211\150") <|> - try (char '-' >> lookAhead (satisfy isDigit) >> return '–') - return (B.str "\8211") - -emDashOld :: Parser [Char] st Inlines -emDashOld = do - try (charOrRef "\8212\151") <|> (try $ string "--" >> optional (char '-') >> return '-') - return (B.str "\8212") + then do + char '-' + (char '-' >> return (B.str "\8212")) + <|> (lookAhead digit >> return (B.str "\8211")) + else do + string "--" + (char '-' >> return (B.str "\8212")) + <|> return (B.str "\8211") -- This is used to prevent exponential blowups for things like: -- a**a*a**a*a**a*a**a*a**a*a**a*a** -nested :: Parser s ParserState a - -> Parser s ParserState a +nested :: Stream s m a + => ParserT s ParserState m a + -> ParserT s ParserState m a nested p = do - nestlevel <- stateMaxNestingLevel `fmap` getState + nestlevel <- stateMaxNestingLevel <$> getState guard $ nestlevel > 0 updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 } res <- p updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } return res -citeKey :: HasLastStrPosition st => Parser [Char] st (Bool, String) +citeKey :: (Stream s m Char, HasLastStrPosition st) + => ParserT s st m (Bool, String) citeKey = try $ do guard =<< notAfterString suppress_author <- option False (char '-' *> return True) @@ -1159,12 +1185,21 @@ citeKey = try $ do let key = firstChar:rest return (suppress_author, key) + +token :: (Stream s m t) + => (t -> String) + -> (t -> SourcePos) + -> (t -> Maybe a) + -> ParsecT s st m a +token pp pos match = tokenPrim pp (\_ t _ -> pos t) match + -- -- Macros -- -- | Parse a \newcommand or \renewcommand macro definition. -macro :: (HasMacros st, HasReaderOptions st) => Parser [Char] st Blocks +macro :: (Stream [Char] m Char, HasMacros st, HasReaderOptions st) + => ParserT [Char] st m Blocks macro = do apply <- getOption readerApplyMacros inp <- getInput @@ -1179,10 +1214,12 @@ macro = do else return $ rawBlock "latex" def' -- | Apply current macros to string. -applyMacros' :: String -> Parser [Char] ParserState String +applyMacros' :: (HasReaderOptions st, HasMacros st, Stream [Char] m Char) + => String + -> ParserT [Char] st m String applyMacros' target = do apply <- getOption readerApplyMacros if apply - then do macros <- extractMacros `fmap` getState + then do macros <- extractMacros <$> getState return $ applyMacros macros target else return target diff --git a/src/Text/Pandoc/Pretty.hs b/src/Text/Pandoc/Pretty.hs index d25ba725f..1e72c2040 100644 --- a/src/Text/Pandoc/Pretty.hs +++ b/src/Text/Pandoc/Pretty.hs @@ -35,6 +35,7 @@ module Text.Pandoc.Pretty ( , render , cr , blankline + , blanklines , space , text , char @@ -100,7 +101,7 @@ data D = Text Int String | BreakingSpace | CarriageReturn | NewLine - | BlankLine + | BlankLines Int -- number of blank lines deriving (Show) newtype Doc = Doc { unDoc :: Seq D } @@ -113,7 +114,7 @@ isBlank :: D -> Bool isBlank BreakingSpace = True isBlank CarriageReturn = True isBlank NewLine = True -isBlank BlankLine = True +isBlank (BlankLines _) = True isBlank (Text _ (c:_)) = isSpace c isBlank _ = False @@ -190,7 +191,7 @@ vsep = foldr ($+$) empty nestle :: Doc -> Doc nestle (Doc d) = Doc $ go d where go x = case viewl x of - (BlankLine :< rest) -> go rest + (BlankLines _ :< rest) -> go rest (NewLine :< rest) -> go rest _ -> x @@ -203,7 +204,7 @@ chomp d = Doc (fromList dl') go (BreakingSpace : xs) = go xs go (CarriageReturn : xs) = go xs go (NewLine : xs) = go xs - go (BlankLine : xs) = go xs + go (BlankLines _ : xs) = go xs go (Prefixed s d' : xs) = Prefixed s (chomp d') : xs go xs = xs @@ -216,9 +217,10 @@ outp off s | off < 0 = do -- offset < 0 means newline characters let pref = reverse $ dropWhile isSpace $ reverse rawpref modify $ \st -> st{ output = fromString pref : output st , column = column st + realLength pref } + let numnewlines = length $ takeWhile (=='\n') $ reverse s modify $ \st -> st { output = fromString s : output st , column = 0 - , newlines = newlines st + 1 } + , newlines = newlines st + numnewlines } outp off s = do -- offset >= 0 (0 might be combining char) st' <- get let pref = prefix st' @@ -277,15 +279,11 @@ renderList (BeforeNonBlank d : xs) = | otherwise -> renderDoc d >> renderList xs [] -> renderList xs -renderList (BlankLine : xs) = do +renderList (BlankLines num : xs) = do st <- get case output st of - _ | newlines st > 1 || null xs -> return () - _ | column st == 0 -> do - outp (-1) "\n" - _ -> do - outp (-1) "\n" - outp (-1) "\n" + _ | newlines st > num || null xs -> return () + | otherwise -> replicateM_ (1 + num - newlines st) (outp (-1) "\n") renderList xs renderList (CarriageReturn : xs) = do @@ -302,7 +300,7 @@ renderList (NewLine : xs) = do renderList (BreakingSpace : CarriageReturn : xs) = renderList (CarriageReturn:xs) renderList (BreakingSpace : NewLine : xs) = renderList (NewLine:xs) -renderList (BreakingSpace : BlankLine : xs) = renderList (BlankLine:xs) +renderList (BreakingSpace : BlankLines n : xs) = renderList (BlankLines n:xs) renderList (BreakingSpace : BreakingSpace : xs) = renderList (BreakingSpace:xs) renderList (BreakingSpace : xs) = do let isText (Text _ _) = True @@ -383,9 +381,13 @@ cr = Doc $ singleton CarriageReturn -- | Inserts a blank line unless one exists already. -- (@blankline <> blankline@ has the same effect as @blankline@. --- If you want multiple blank lines, use @text "\\n\\n"@. blankline :: Doc -blankline = Doc $ singleton BlankLine +blankline = Doc $ singleton (BlankLines 1) + +-- | Inserts a blank lines unless they exists already. +-- (@blanklines m <> blanklines n@ has the same effect as @blankline (max m n)@. +blanklines :: Int -> Doc +blanklines n = Doc $ singleton (BlankLines n) -- | Uses the specified string as a prefix for every line of -- the inside document (except the first, if not at the beginning diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs index cf1d5132e..1e119e729 100644 --- a/src/Text/Pandoc/Readers/DocBook.hs +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -769,7 +769,12 @@ parseBlock (Elem e) = "" -> [] x -> [x] return $ codeBlockWith (attrValue "id" e, classes', []) - $ trimNl $ strContent e + $ trimNl $ strContentRecursive e + strContentRecursive = strContent . (\e' -> e'{ elContent = + map elementToStr $ elContent e' }) + elementToStr :: Content -> Content + elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing + elementToStr x = x parseBlockquote = do attrib <- case filterChild (named "attribution") e of Nothing -> return mempty diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 09c2330fb..085ee01fc 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -1,3 +1,5 @@ +{-# LANGUAGE PatternGuards #-} + {- Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> @@ -76,48 +78,188 @@ import Codec.Archive.Zip import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Builder (text, toList) -import Text.Pandoc.Generic (bottomUp) -import Text.Pandoc.MIME (getMimeType) -import Text.Pandoc.UTF8 (toString) +import Text.Pandoc.Walk import Text.Pandoc.Readers.Docx.Parse import Text.Pandoc.Readers.Docx.Lists -import Data.Maybe (mapMaybe, isJust, fromJust) -import Data.List (delete, isPrefixOf, (\\), intersect) -import qualified Data.ByteString as BS +import Text.Pandoc.Readers.Docx.Reducible +import Text.Pandoc.Shared +import Text.Pandoc.MediaBag (insertMedia, MediaBag) +import Data.Maybe (mapMaybe) +import Data.List (delete, stripPrefix, (\\), intersect) +import Data.Monoid +import Text.TeXMath (writeTeX) +import Data.Default (Default) import qualified Data.ByteString.Lazy as B -import Data.ByteString.Base64 (encode) -import System.FilePath (combine) +import qualified Data.Map as M +import Control.Monad.Reader +import Control.Monad.State +import Control.Applicative ((<$>)) readDocx :: ReaderOptions -> B.ByteString - -> Pandoc + -> (Pandoc, MediaBag) readDocx opts bytes = case archiveToDocx (toArchive bytes) of - Just docx -> Pandoc nullMeta (docxToBlocks opts docx) - Nothing -> error $ "couldn't parse docx file" - -runStyleToSpanAttr :: RunStyle -> (String, [String], [(String, String)]) -runStyleToSpanAttr rPr = ("", - mapMaybe id [ - if isBold rPr then (Just "strong") else Nothing, - if isItalic rPr then (Just "emph") else Nothing, - if isSmallCaps rPr then (Just "smallcaps") else Nothing, - if isStrike rPr then (Just "strike") else Nothing, - if isSuperScript rPr then (Just "superscript") else Nothing, - if isSubScript rPr then (Just "subscript") else Nothing, - rStyle rPr], - case underline rPr of - Just fmt -> [("underline", fmt)] - _ -> [] - ) - -parStyleToDivAttr :: ParagraphStyle -> (String, [String], [(String, String)]) -parStyleToDivAttr pPr = ("", - pStyle pPr, - case indent pPr of - Just n -> [("indent", (show n))] - Nothing -> [] - ) + Right docx -> (Pandoc meta blks, mediaBag) where + (meta, blks, mediaBag) = (docxToOutput opts docx) + Left _ -> error $ "couldn't parse docx file" + +data DState = DState { docxAnchorMap :: M.Map String String + , docxMediaBag :: MediaBag } + +instance Default DState where + def = DState { docxAnchorMap = M.empty + , docxMediaBag = mempty } + +data DEnv = DEnv { docxOptions :: ReaderOptions + , docxInHeaderBlock :: Bool } + +instance Default DEnv where + def = DEnv def False + +type DocxContext = ReaderT DEnv (State DState) + +evalDocxContext :: DocxContext a -> DEnv -> DState -> a +evalDocxContext ctx env st = evalState (runReaderT ctx env) st + +concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b] +concatMapM f xs = liftM concat (mapM f xs) + +-- This is empty, but we put it in for future-proofing. +spansToKeep :: [String] +spansToKeep = [] + +divsToKeep :: [String] +divsToKeep = ["list-item", "Definition", "DefinitionTerm"] + +metaStyles :: M.Map String String +metaStyles = M.fromList [ ("Title", "title") + , ("Subtitle", "subtitle") + , ("Author", "author") + , ("Date", "date") + , ("Abstract", "abstract")] + +sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart]) +sepBodyParts = span (\bp -> (isMetaPar bp || isEmptyPar bp)) + +isMetaPar :: BodyPart -> Bool +isMetaPar (Paragraph pPr _) = + not $ null $ intersect (pStyle pPr) (M.keys metaStyles) +isMetaPar _ = False + +isEmptyPar :: BodyPart -> Bool +isEmptyPar (Paragraph _ parParts) = + all isEmptyParPart parParts + where + isEmptyParPart (PlainRun (Run _ runElems)) = all isEmptyElem runElems + isEmptyParPart _ = False + isEmptyElem (TextRun s) = trim s == "" + isEmptyElem _ = True +isEmptyPar _ = False + +bodyPartsToMeta' :: [BodyPart] -> DocxContext (M.Map String MetaValue) +bodyPartsToMeta' [] = return M.empty +bodyPartsToMeta' (bp : bps) + | (Paragraph pPr parParts) <- bp + , (c : _)<- intersect (pStyle pPr) (M.keys metaStyles) + , (Just metaField) <- M.lookup c metaStyles = do + inlines <- parPartsToInlines parParts + remaining <- bodyPartsToMeta' bps + let + f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils'] + f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks ((Para ils) : blks) + f m (MetaList mv) = MetaList (m : mv) + f m n = MetaList [m, n] + return $ M.insertWith f metaField (MetaInlines inlines) remaining +bodyPartsToMeta' (_ : bps) = bodyPartsToMeta' bps + +bodyPartsToMeta :: [BodyPart] -> DocxContext Meta +bodyPartsToMeta bps = do + mp <- bodyPartsToMeta' bps + let mp' = + case M.lookup "author" mp of + Just mv -> M.insert "author" (fixAuthors mv) mp + Nothing -> mp + return $ Meta mp' + +fixAuthors :: MetaValue -> MetaValue +fixAuthors (MetaBlocks blks) = + MetaList $ map g $ filter f blks + where f (Para _) = True + f _ = False + g (Para ils) = MetaInlines ils + g _ = MetaInlines [] +fixAuthors mv = mv + +runStyleToContainers :: RunStyle -> [Container Inline] +runStyleToContainers rPr = + let spanClassToContainers :: String -> [Container Inline] + spanClassToContainers s | s `elem` codeSpans = + [Container $ (\ils -> Code ("", [], []) (concatMap ilToCode ils))] + spanClassToContainers s | s `elem` spansToKeep = + [Container $ Span ("", [s], [])] + spanClassToContainers _ = [] + + classContainers = case rStyle rPr of + Nothing -> [] + Just s -> spanClassToContainers s + + formatters = map Container $ mapMaybe id + [ if isBold rPr then (Just Strong) else Nothing + , if isItalic rPr then (Just Emph) else Nothing + , if isSmallCaps rPr then (Just SmallCaps) else Nothing + , if isStrike rPr then (Just Strikeout) else Nothing + , if isSuperScript rPr then (Just Superscript) else Nothing + , if isSubScript rPr then (Just Subscript) else Nothing + , rUnderline rPr >>= + (\f -> if f == "single" then (Just Emph) else Nothing) + ] + in + classContainers ++ formatters + +parStyleToContainers :: ParagraphStyle -> [Container Block] +parStyleToContainers pPr | (c:cs) <- pStyle pPr, Just n <- isHeaderClass c = + [Container $ \_ -> Header n ("", delete ("Heading" ++ show n) cs, []) []] +parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` divsToKeep = + let pPr' = pPr { pStyle = cs } + in + (Container $ Div ("", [c], [])) : (parStyleToContainers pPr') +parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` codeDivs = + -- This is a bit of a cludge. We make the codeblock from the raw + -- parparts in bodyPartToBlocks. But we need something to match against. + let pPr' = pPr { pStyle = cs } + in + (Container $ \_ -> CodeBlock ("", [], []) "") : (parStyleToContainers pPr') +parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` listParagraphDivs = + let pPr' = pPr { pStyle = cs, indentation = Nothing} + in + (Container $ Div ("", [c], [])) : (parStyleToContainers pPr') + +parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` blockQuoteDivs = + let pPr' = pPr { pStyle = cs \\ blockQuoteDivs } + in + (Container BlockQuote) : (parStyleToContainers pPr') +parStyleToContainers pPr | (_:cs) <- pStyle pPr = + let pPr' = pPr { pStyle = cs} + in + parStyleToContainers pPr' +parStyleToContainers pPr | null (pStyle pPr), + Just left <- indentation pPr >>= leftParIndent, + Just hang <- indentation pPr >>= hangingParIndent = + let pPr' = pPr { indentation = Nothing } + in + case (left - hang) > 0 of + True -> (Container BlockQuote) : (parStyleToContainers pPr') + False -> parStyleToContainers pPr' +parStyleToContainers pPr | null (pStyle pPr), + Just left <- indentation pPr >>= leftParIndent = + let pPr' = pPr { indentation = Nothing } + in + case left > 0 of + True -> (Container BlockQuote) : (parStyleToContainers pPr') + False -> parStyleToContainers pPr' +parStyleToContainers _ = [] + strToInlines :: String -> [Inline] strToInlines = toList . text @@ -144,115 +286,103 @@ runElemToString (Tab) = ['\t'] runElemsToString :: [RunElem] -> String runElemsToString = concatMap runElemToString ---- We use this instead of the more general ---- Text.Pandoc.Shared.normalize for reasons of efficiency. For ---- whatever reason, `normalize` makes a run take almost twice as ---- long. (It does more, but this does what we need) -inlineNormalize :: [Inline] -> [Inline] -inlineNormalize [] = [] -inlineNormalize (Str "" : ils) = inlineNormalize ils -inlineNormalize ((Str s) : (Str s') : l) = - inlineNormalize (Str (s++s') : l) -inlineNormalize ((Emph ils) : (Emph ils') : l) = - inlineNormalize $ (Emph $ inlineNormalize (ils ++ ils')) : l -inlineNormalize ((Emph ils) : l) = - Emph (inlineNormalize ils) : (inlineNormalize l) -inlineNormalize ((Strong ils) : (Strong ils') : l) = - inlineNormalize $ (Strong $ inlineNormalize (ils ++ ils')) : l -inlineNormalize ((Strong ils) : l) = - Strong (inlineNormalize ils) : (inlineNormalize l) -inlineNormalize ((Strikeout ils) : (Strikeout ils') : l) = - inlineNormalize $ (Strikeout $ inlineNormalize (ils ++ ils')) : l -inlineNormalize ((Strikeout ils) : l) = - Strikeout (inlineNormalize ils) : (inlineNormalize l) -inlineNormalize ((Superscript ils) : (Superscript ils') : l) = - inlineNormalize $ (Superscript $ inlineNormalize (ils ++ ils')) : l -inlineNormalize ((Superscript ils) : l) = - Superscript (inlineNormalize ils) : (inlineNormalize l) -inlineNormalize ((Subscript ils) : (Subscript ils') : l) = - inlineNormalize $ (Subscript $ inlineNormalize (ils ++ ils')) : l -inlineNormalize ((Subscript ils) : l) = - Subscript (inlineNormalize ils) : (inlineNormalize l) -inlineNormalize ((Space : Space : l)) = - inlineNormalize $ (Space : l) -inlineNormalize ((Quoted qt ils) : l) = - Quoted qt (inlineNormalize ils) : inlineNormalize l -inlineNormalize ((Cite cits ils) : l) = - let - f :: Citation -> Citation - f (Citation s pref suff mode num hash) = - Citation s (inlineNormalize pref) (inlineNormalize suff) mode num hash - in - Cite (map f cits) (inlineNormalize ils) : (inlineNormalize l) -inlineNormalize ((Link ils s) : l) = - Link (inlineNormalize ils) s : (inlineNormalize l) -inlineNormalize ((Image ils s) : l) = - Image (inlineNormalize ils) s : (inlineNormalize l) -inlineNormalize ((Note blks) : l) = - Note (map blockNormalize blks) : (inlineNormalize l) -inlineNormalize ((Span attr ils) : l) = - Span attr (inlineNormalize ils) : (inlineNormalize l) -inlineNormalize (il : l) = il : (inlineNormalize l) - -stripSpaces :: [Inline] -> [Inline] -stripSpaces ils = - reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) ils - -blockNormalize :: Block -> Block -blockNormalize (Plain ils) = Plain $ stripSpaces $ inlineNormalize ils -blockNormalize (Para ils) = Para $ stripSpaces $ inlineNormalize ils -blockNormalize (Header n attr ils) = - Header n attr $ stripSpaces $ inlineNormalize ils -blockNormalize (Table ils align width hdr cells) = - Table (stripSpaces $ inlineNormalize ils) align width hdr cells -blockNormalize (DefinitionList pairs) = - DefinitionList $ map (\(ils, blklsts) -> (stripSpaces (inlineNormalize ils), (map (map blockNormalize) blklsts))) pairs -blockNormalize (BlockQuote blks) = BlockQuote (map blockNormalize blks) -blockNormalize (OrderedList attr blkslst) = - OrderedList attr $ map (\blks -> map blockNormalize blks) blkslst -blockNormalize (BulletList blkslst) = - BulletList $ map (\blks -> map blockNormalize blks) blkslst -blockNormalize (Div attr blks) = Div attr (map blockNormalize blks) -blockNormalize blk = blk - -runToInlines :: ReaderOptions -> Docx -> Run -> [Inline] -runToInlines _ _ (Run rs runElems) - | isJust (rStyle rs) && (fromJust (rStyle rs)) `elem` codeSpans = - case runStyleToSpanAttr rs == ("", [], []) of - True -> [Str (runElemsToString runElems)] - False -> [Span (runStyleToSpanAttr rs) [Str (runElemsToString runElems)]] - | otherwise = case runStyleToSpanAttr rs == ("", [], []) of - True -> concatMap runElemToInlines runElems - False -> [Span (runStyleToSpanAttr rs) (concatMap runElemToInlines runElems)] -runToInlines opts docx@(Docx _ notes _ _ _ ) (Footnote fnId) = - case (getFootNote fnId notes) of - Just bodyParts -> - [Note [Div ("", ["footnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]] - Nothing -> - [Note [Div ("", ["footnote"], []) []]] -runToInlines opts docx@(Docx _ notes _ _ _) (Endnote fnId) = - case (getEndNote fnId notes) of - Just bodyParts -> - [Note [Div ("", ["endnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]] - Nothing -> - [Note [Div ("", ["endnote"], []) []]] - -parPartToInlines :: ReaderOptions -> Docx -> ParPart -> [Inline] -parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r -parPartToInlines _ _ (BookMark _ anchor) = - [Span (anchor, ["anchor"], []) []] -parPartToInlines _ (Docx _ _ _ rels _) (Drawing relid) = - case lookupRelationship relid rels of - Just target -> [Image [] (combine "word" target, "")] - Nothing -> [Image [] ("", "")] -parPartToInlines opts docx (InternalHyperLink anchor runs) = - [Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")] -parPartToInlines opts docx@(Docx _ _ _ rels _) (ExternalHyperLink relid runs) = - case lookupRelationship relid rels of - Just target -> - [Link (concatMap (runToInlines opts docx) runs) (target, "")] - Nothing -> - [Link (concatMap (runToInlines opts docx) runs) ("", "")] +runToString :: Run -> String +runToString (Run _ runElems) = runElemsToString runElems +runToString _ = "" + +parPartToString :: ParPart -> String +parPartToString (PlainRun run) = runToString run +parPartToString (InternalHyperLink _ runs) = concatMap runToString runs +parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs +parPartToString _ = "" + + +inlineCodeContainer :: Container Inline -> Bool +inlineCodeContainer (Container f) = case f [] of + Code _ "" -> True + _ -> False +inlineCodeContainer _ = False + + +runToInlines :: Run -> DocxContext [Inline] +runToInlines (Run rs runElems) + | any inlineCodeContainer (runStyleToContainers rs) = + return $ + rebuild (runStyleToContainers rs) $ [Str $ runElemsToString runElems] + | otherwise = + return $ + rebuild (runStyleToContainers rs) (concatMap runElemToInlines runElems) +runToInlines (Footnote bps) = + concatMapM bodyPartToBlocks bps >>= (\blks -> return [Note blks]) +runToInlines (Endnote bps) = + concatMapM bodyPartToBlocks bps >>= (\blks -> return [Note blks]) +runToInlines (InlineDrawing fp bs) = do + mediaBag <- gets docxMediaBag + modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag } + return [Image [] (fp, "")] + + + + +parPartToInlines :: ParPart -> DocxContext [Inline] +parPartToInlines (PlainRun r) = runToInlines r +parPartToInlines (Insertion _ author date runs) = do + opts <- asks docxOptions + case readerTrackChanges opts of + AcceptChanges -> concatMapM runToInlines runs >>= return + RejectChanges -> return [] + AllChanges -> do + ils <- (concatMapM runToInlines runs) + return [Span + ("", ["insertion"], [("author", author), ("date", date)]) + ils] +parPartToInlines (Deletion _ author date runs) = do + opts <- asks docxOptions + case readerTrackChanges opts of + AcceptChanges -> return [] + RejectChanges -> concatMapM runToInlines runs >>= return + AllChanges -> do + ils <- concatMapM runToInlines runs + return [Span + ("", ["deletion"], [("author", author), ("date", date)]) + ils] +parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = return [] +parPartToInlines (BookMark _ anchor) = + -- We record these, so we can make sure not to overwrite + -- user-defined anchor links with header auto ids. + do + -- get whether we're in a header. + inHdrBool <- asks docxInHeaderBlock + -- Get the anchor map. + anchorMap <- gets docxAnchorMap + -- We don't want to rewrite if we're in a header, since we'll take + -- care of that later, when we make the header anchor. If the + -- bookmark were already in uniqueIdent form, this would lead to a + -- duplication. Otherwise, we check to see if the id is already in + -- there. Rewrite if necessary. This will have the possible effect + -- of rewriting user-defined anchor links. However, since these + -- are not defined in pandoc, it seems like a necessary evil to + -- avoid an extra pass. + let newAnchor = + if not inHdrBool && anchor `elem` (M.elems anchorMap) + then uniqueIdent [Str anchor] (M.elems anchorMap) + else anchor + unless inHdrBool + (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap}) + return [Span (newAnchor, ["anchor"], []) []] +parPartToInlines (Drawing fp bs) = do + mediaBag <- gets docxMediaBag + modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag } + return [Image [] (fp, "")] +parPartToInlines (InternalHyperLink anchor runs) = do + ils <- concatMapM runToInlines runs + return [Link ils ('#' : anchor, "")] +parPartToInlines (ExternalHyperLink target runs) = do + ils <- concatMapM runToInlines runs + return [Link ils (target, "")] +parPartToInlines (PlainOMath exps) = do + return [Math InlineMath (writeTeX exps)] + isAnchorSpan :: Inline -> Bool isAnchorSpan (Span (ident, classes, kvs) ils) = @@ -265,74 +395,106 @@ isAnchorSpan _ = False dummyAnchors :: [String] dummyAnchors = ["_GoBack"] -makeHeaderAnchors :: Block -> Block -makeHeaderAnchors h@(Header n (_, classes, kvs) ils) = - case filter isAnchorSpan ils of - [] -> h - (x@(Span (ident, _, _) _) : xs) -> - case ident `elem` dummyAnchors of - True -> h - False -> Header n (ident, classes, kvs) (ils \\ (x:xs)) - _ -> h -makeHeaderAnchors blk = blk - - -parPartsToInlines :: ReaderOptions -> Docx -> [ParPart] -> [Inline] -parPartsToInlines opts docx parparts = - -- - -- We're going to skip data-uri's for now. It should be an option, - -- not mandatory. - -- - (if False -- TODO depend on option - then bottomUp (makeImagesSelfContained docx) - else id) $ - bottomUp spanTrim $ - bottomUp spanCorrect $ - bottomUp spanReduce $ - concatMap (parPartToInlines opts docx) parparts - -cellToBlocks :: ReaderOptions -> Docx -> Cell -> [Block] -cellToBlocks opts docx (Cell bps) = map (bodyPartToBlock opts docx) bps - -rowToBlocksList :: ReaderOptions -> Docx -> Row -> [[Block]] -rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells - -bodyPartToBlock :: ReaderOptions -> Docx -> BodyPart -> Block -bodyPartToBlock opts docx (Paragraph pPr parparts) = - Div (parStyleToDivAttr pPr) [Para (parPartsToInlines opts docx parparts)] -bodyPartToBlock opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parparts) = +makeHeaderAnchor :: Block -> DocxContext Block +-- If there is an anchor already there (an anchor span in the header, +-- to be exact), we rename and associate the new id with the old one. +makeHeaderAnchor (Header n (_, classes, kvs) ils) + | (x : xs) <- filter isAnchorSpan ils + , (Span (ident, _, _) _) <- x + , notElem ident dummyAnchors = + do + hdrIDMap <- gets docxAnchorMap + let newIdent = uniqueIdent ils (M.elems hdrIDMap) + modify $ \s -> s {docxAnchorMap = M.insert ident newIdent hdrIDMap} + return $ Header n (newIdent, classes, kvs) (ils \\ (x:xs)) +-- Otherwise we just give it a name, and register that name (associate +-- it with itself.) +makeHeaderAnchor (Header n (_, classes, kvs) ils) = + do + hdrIDMap <- gets docxAnchorMap + let newIdent = uniqueIdent ils (M.elems hdrIDMap) + modify $ \s -> s {docxAnchorMap = M.insert newIdent newIdent hdrIDMap} + return $ Header n (newIdent, classes, kvs) ils +makeHeaderAnchor blk = return blk + + +parPartsToInlines :: [ParPart] -> DocxContext [Inline] +parPartsToInlines parparts = do + ils <- concatMapM parPartToInlines parparts + return $ reduceList $ ils + +cellToBlocks :: Cell -> DocxContext [Block] +cellToBlocks (Cell bps) = concatMapM bodyPartToBlocks bps + +rowToBlocksList :: Row -> DocxContext [[Block]] +rowToBlocksList (Row cells) = mapM cellToBlocks cells + +isBlockCodeContainer :: Container Block -> Bool +isBlockCodeContainer (Container f) | CodeBlock _ _ <- f [] = True +isBlockCodeContainer _ = False + +isHeaderContainer :: Container Block -> Bool +isHeaderContainer (Container f) | Header _ _ _ <- f [] = True +isHeaderContainer _ = False + +bodyPartToBlocks :: BodyPart -> DocxContext [Block] +bodyPartToBlocks (Paragraph pPr parparts) + | any isBlockCodeContainer (parStyleToContainers pPr) = + let + otherConts = filter (not . isBlockCodeContainer) (parStyleToContainers pPr) + in + return $ + rebuild + otherConts + [CodeBlock ("", [], []) (concatMap parPartToString parparts)] +bodyPartToBlocks (Paragraph pPr parparts) + | any isHeaderContainer (parStyleToContainers pPr) = do + ils <- normalizeSpaces <$> local (\s -> s{docxInHeaderBlock = True}) + (parPartsToInlines parparts) + let (Container hdrFun) = head $ filter isHeaderContainer (parStyleToContainers pPr) + Header n attr _ = hdrFun [] + hdr <- makeHeaderAnchor $ Header n attr ils + return [hdr] +bodyPartToBlocks (Paragraph pPr parparts) = do + ils <- parPartsToInlines parparts >>= (return . normalizeSpaces) + case ils of + [] -> return [] + _ -> do + return $ + rebuild + (parStyleToContainers pPr) + [Para ils] +bodyPartToBlocks (ListItem pPr numId lvl levelInfo parparts) = do let - kvs = case lookupLevel numId lvl numbering of - Just (_, fmt, txt, Just start) -> [ ("level", lvl) - , ("num-id", numId) - , ("format", fmt) - , ("text", txt) - , ("start", (show start)) - ] - - Just (_, fmt, txt, Nothing) -> [ ("level", lvl) - , ("num-id", numId) - , ("format", fmt) - , ("text", txt) - ] - Nothing -> [] - in - Div - ("", ["list-item"], kvs) - [bodyPartToBlock opts docx (Paragraph pPr parparts)] -bodyPartToBlock _ _ (Tbl _ _ _ []) = - Para [] -bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) = + kvs = case levelInfo of + (_, fmt, txt, Just start) -> [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + , ("start", (show start)) + ] + + (_, fmt, txt, Nothing) -> [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + ] + blks <- bodyPartToBlocks (Paragraph pPr parparts) + return $ [Div ("", ["list-item"], kvs) blks] +bodyPartToBlocks (Tbl _ _ _ []) = + return [Para []] +bodyPartToBlocks (Tbl cap _ look (r:rs)) = do let caption = strToInlines cap (hdr, rows) = case firstRowFormatting look of True -> (Just r, rs) False -> (Nothing, r:rs) - hdrCells = case hdr of - Just r' -> rowToBlocksList opts docx r' - Nothing -> [] - cells = map (rowToBlocksList opts docx) rows + hdrCells <- case hdr of + Just r' -> rowToBlocksList r' + Nothing -> return [] - size = case null hdrCells of + cells <- mapM rowToBlocksList rows + + let size = case null hdrCells of True -> length $ head cells False -> length $ hdrCells -- @@ -341,208 +503,49 @@ bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) = -- moment. Width information is in the TblGrid field of the Tbl, -- so should be possible. Alignment might be more difficult, -- since there doesn't seem to be a column entity in docx. - alignments = take size (repeat AlignDefault) - widths = take size (repeat 0) :: [Double] - in - Table caption alignments widths hdrCells cells - -makeImagesSelfContained :: Docx -> Inline -> Inline -makeImagesSelfContained (Docx _ _ _ _ media) i@(Image alt (uri, title)) = - case lookup uri media of - Just bs -> case getMimeType uri of - Just mime -> let data_uri = - "data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs) - in - Image alt (data_uri, title) - Nothing -> i - Nothing -> i -makeImagesSelfContained _ inline = inline - -bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block] -bodyToBlocks opts docx (Body bps) = - bottomUp removeEmptyPars $ - map blockNormalize $ - bottomUp spanRemove $ - bottomUp divRemove $ - map (makeHeaderAnchors) $ - bottomUp divCorrect $ - bottomUp divReduce $ - bottomUp divCorrectPreReduce $ - bottomUp blocksToDefinitions $ - blocksToBullets $ - map (bodyPartToBlock opts docx) bps - -docxToBlocks :: ReaderOptions -> Docx -> [Block] -docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = bodyToBlocks opts d body - -spanReduce :: [Inline] -> [Inline] -spanReduce [] = [] -spanReduce ((Span (id1, classes1, kvs1) ils1) : ils) - | (id1, classes1, kvs1) == ("", [], []) = ils1 ++ (spanReduce ils) -spanReduce (s1@(Span (id1, classes1, kvs1) ils1) : - s2@(Span (id2, classes2, kvs2) ils2) : - ils) = - let classes' = classes1 `intersect` classes2 - kvs' = kvs1 `intersect` kvs2 - classes1' = classes1 \\ classes' - kvs1' = kvs1 \\ kvs' - classes2' = classes2 \\ classes' - kvs2' = kvs2 \\ kvs' - in - case null classes' && null kvs' of - True -> s1 : (spanReduce (s2 : ils)) - False -> let attr' = ("", classes', kvs') - attr1' = (id1, classes1', kvs1') - attr2' = (id2, classes2', kvs2') - in - spanReduce (Span attr' [(Span attr1' ils1), (Span attr2' ils2)] : - ils) -spanReduce (il:ils) = il : (spanReduce ils) + alignments = replicate size AlignDefault + widths = replicate size 0 :: [Double] + + return [Table caption alignments widths hdrCells cells] +bodyPartToBlocks (OMathPara e) = do + return [Para [Math DisplayMath (writeTeX e)]] + + +-- replace targets with generated anchors. +rewriteLink :: Inline -> DocxContext Inline +rewriteLink l@(Link ils ('#':target, title)) = do + anchorMap <- gets docxAnchorMap + return $ case M.lookup target anchorMap of + Just newTarget -> (Link ils ('#':newTarget, title)) + Nothing -> l +rewriteLink il = return il + +bodyToOutput :: Body -> DocxContext (Meta, [Block], MediaBag) +bodyToOutput (Body bps) = do + let (metabps, blkbps) = sepBodyParts bps + meta <- bodyPartsToMeta metabps + blks <- concatMapM bodyPartToBlocks blkbps >>= + walkM rewriteLink + mediaBag <- gets docxMediaBag + return $ (meta, + blocksToDefinitions $ blocksToBullets $ blks, + mediaBag) + +docxToOutput :: ReaderOptions -> Docx -> (Meta, [Block], MediaBag) +docxToOutput opts (Docx (Document _ body)) = + let dEnv = def { docxOptions = opts} in + evalDocxContext (bodyToOutput body) dEnv def + ilToCode :: Inline -> String ilToCode (Str s) = s -ilToCode _ = "" - -spanRemove' :: Inline -> [Inline] -spanRemove' s@(Span (ident, classes, _) []) - -- "_GoBack" is automatically inserted. We don't want to keep it. - | classes == ["anchor"] && not (ident `elem` dummyAnchors) = [s] -spanRemove' (Span (_, _, kvs) ils) = - case lookup "underline" kvs of - Just val -> [Span ("", [], [("underline", val)]) ils] - Nothing -> ils -spanRemove' il = [il] - -spanRemove :: [Inline] -> [Inline] -spanRemove = concatMap spanRemove' - -spanTrim' :: Inline -> [Inline] -spanTrim' il@(Span _ []) = [il] -spanTrim' il@(Span attr (il':[])) - | il' == Space = [Span attr [], Space] - | otherwise = [il] -spanTrim' (Span attr ils) - | head ils == Space && last ils == Space = - [Space, Span attr (init $ tail ils), Space] - | head ils == Space = [Space, Span attr (tail ils)] - | last ils == Space = [Span attr (init ils), Space] -spanTrim' il = [il] - -spanTrim :: [Inline] -> [Inline] -spanTrim = concatMap spanTrim' - -spanCorrect' :: Inline -> [Inline] -spanCorrect' (Span ("", [], []) ils) = ils -spanCorrect' (Span (ident, classes, kvs) ils) - | "emph" `elem` classes = - [Emph $ spanCorrect' $ Span (ident, (delete "emph" classes), kvs) ils] - | "strong" `elem` classes = - [Strong $ spanCorrect' $ Span (ident, (delete "strong" classes), kvs) ils] - | "smallcaps" `elem` classes = - [SmallCaps $ spanCorrect' $ Span (ident, (delete "smallcaps" classes), kvs) ils] - | "strike" `elem` classes = - [Strikeout $ spanCorrect' $ Span (ident, (delete "strike" classes), kvs) ils] - | "superscript" `elem` classes = - [Superscript $ spanCorrect' $ Span (ident, (delete "superscript" classes), kvs) ils] - | "subscript" `elem` classes = - [Subscript $ spanCorrect' $ Span (ident, (delete "subscript" classes), kvs) ils] - | (not . null) (codeSpans `intersect` classes) = - [Code (ident, (classes \\ codeSpans), kvs) (init $ unlines $ map ilToCode ils)] - | otherwise = - [Span (ident, classes, kvs) ils] -spanCorrect' il = [il] - -spanCorrect :: [Inline] -> [Inline] -spanCorrect = concatMap spanCorrect' - -removeEmptyPars :: [Block] -> [Block] -removeEmptyPars blks = filter (\b -> b /= (Para [])) blks - -divReduce :: [Block] -> [Block] -divReduce [] = [] -divReduce ((Div (id1, classes1, kvs1) blks1) : blks) - | (id1, classes1, kvs1) == ("", [], []) = blks1 ++ (divReduce blks) -divReduce (d1@(Div (id1, classes1, kvs1) blks1) : - d2@(Div (id2, classes2, kvs2) blks2) : - blks) = - let classes' = classes1 `intersect` classes2 - kvs' = kvs1 `intersect` kvs2 - classes1' = classes1 \\ classes' - kvs1' = kvs1 \\ kvs' - classes2' = classes2 \\ classes' - kvs2' = kvs2 \\ kvs' - in - case null classes' && null kvs' of - True -> d1 : (divReduce (d2 : blks)) - False -> let attr' = ("", classes', kvs') - attr1' = (id1, classes1', kvs1') - attr2' = (id2, classes2', kvs2') - in - divReduce (Div attr' [(Div attr1' blks1), (Div attr2' blks2)] : - blks) -divReduce (blk:blks) = blk : (divReduce blks) +ilToCode Space = " " +ilToCode _ = "" isHeaderClass :: String -> Maybe Int -isHeaderClass s | "Heading" `isPrefixOf` s = - case reads (drop (length "Heading") s) :: [(Int, String)] of +isHeaderClass s | Just s' <- stripPrefix "Heading" s = + case reads s' :: [(Int, String)] of [] -> Nothing ((n, "") : []) -> Just n _ -> Nothing isHeaderClass _ = Nothing - -findHeaderClass :: [String] -> Maybe Int -findHeaderClass ss = case mapMaybe id $ map isHeaderClass ss of - [] -> Nothing - n : _ -> Just n - -blksToInlines :: [Block] -> [Inline] -blksToInlines (Para ils : _) = ils -blksToInlines (Plain ils : _) = ils -blksToInlines _ = [] - -divCorrectPreReduce' :: Block -> [Block] -divCorrectPreReduce' (Div (ident, classes, kvs) blks) - | isJust $ findHeaderClass classes = - let n = fromJust $ findHeaderClass classes - in - [Header n (ident, delete ("Heading" ++ (show n)) classes, kvs) (blksToInlines blks)] - | otherwise = [Div (ident, classes, kvs) blks] -divCorrectPreReduce' blk = [blk] - -divCorrectPreReduce :: [Block] -> [Block] -divCorrectPreReduce = concatMap divCorrectPreReduce' - -blkToCode :: Block -> String -blkToCode (Para []) = "" -blkToCode (Para ((Code _ s):ils)) = s ++ (blkToCode (Para ils)) -blkToCode (Para ((Span (_, classes, _) ils'): ils)) - | (not . null) (codeSpans `intersect` classes) = - (init $ unlines $ map ilToCode ils') ++ (blkToCode (Para ils)) -blkToCode _ = "" - -divRemove' :: Block -> [Block] -divRemove' (Div (_, _, kvs) blks) = - case lookup "indent" kvs of - Just val -> [Div ("", [], [("indent", val)]) blks] - Nothing -> blks -divRemove' blk = [blk] - -divRemove :: [Block] -> [Block] -divRemove = concatMap divRemove' - -divCorrect' :: Block -> [Block] -divCorrect' b@(Div (ident, classes, kvs) blks) - | (not . null) (blockQuoteDivs `intersect` classes) = - [BlockQuote [Div (ident, classes \\ blockQuoteDivs, kvs) blks]] - | (not . null) (codeDivs `intersect` classes) = - [CodeBlock (ident, (classes \\ codeDivs), kvs) (init $ unlines $ map blkToCode blks)] - | otherwise = - case lookup "indent" kvs of - Just "0" -> [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks] - Just _ -> - [BlockQuote [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]] - Nothing -> [b] -divCorrect' blk = [blk] - -divCorrect :: [Block] -> [Block] -divCorrect = concatMap divCorrect' diff --git a/src/Text/Pandoc/Readers/Docx/Fonts.hs b/src/Text/Pandoc/Readers/Docx/Fonts.hs new file mode 100644 index 000000000..b44c71412 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Fonts.hs @@ -0,0 +1,238 @@ +{- +Copyright (C) 2014 Matthew Pickering <matthewtpickering@gmail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Fonts + Copyright : Copyright (C) 2014 Matthew Pickering + License : GNU GPL, version 2 or above + + Maintainer : Matthew Pickering <matthewtpickering@gmail.com> + Stability : alpha + Portability : portable + +Utilities to convert between font codepoints and unicode characters. +-} +module Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) where + + +-- | Enumeration of recognised fonts +data Font = Symbol -- ^ <http://en.wikipedia.org/wiki/Symbol_(typeface) Adobe Symbol> + deriving (Show, Eq) + +-- | Given a font and codepoint, returns the corresponding unicode +-- character +getUnicode :: Font -> Char -> Maybe Char +getUnicode Symbol c = lookup c symbol + +-- Generated from lib/fonts/symbol.txt +symbol :: [(Char, Char)] +symbol = + [ (' ',' ') + , (' ','\160') + , ('!','!') + , ('"','\8704') + , ('#','#') + , ('$','\8707') + , ('%','%') + , ('&','&') + , ('\'','\8715') + , ('(','(') + , (')',')') + , ('*','\8727') + , ('+','+') + , (',',',') + , ('-','\8722') + , ('.','.') + , ('/','/') + , ('0','0') + , ('1','1') + , ('2','2') + , ('3','3') + , ('4','4') + , ('5','5') + , ('6','6') + , ('7','7') + , ('8','8') + , ('9','9') + , (':',':') + , (';',';') + , ('<','<') + , ('=','=') + , ('>','>') + , ('?','?') + , ('@','\8773') + , ('A','\913') + , ('B','\914') + , ('C','\935') + , ('D','\916') + , ('D','\8710') + , ('E','\917') + , ('F','\934') + , ('G','\915') + , ('H','\919') + , ('I','\921') + , ('J','\977') + , ('K','\922') + , ('L','\923') + , ('M','\924') + , ('N','\925') + , ('O','\927') + , ('P','\928') + , ('Q','\920') + , ('R','\929') + , ('S','\931') + , ('T','\932') + , ('U','\933') + , ('V','\962') + , ('W','\937') + , ('W','\8486') + , ('X','\926') + , ('Y','\936') + , ('Z','\918') + , ('[','[') + , ('\\','\8756') + , (']',']') + , ('^','\8869') + , ('_','_') + , ('`','\63717') + , ('a','\945') + , ('b','\946') + , ('c','\967') + , ('d','\948') + , ('e','\949') + , ('f','\966') + , ('g','\947') + , ('h','\951') + , ('i','\953') + , ('j','\981') + , ('k','\954') + , ('l','\955') + , ('m','\181') + , ('m','\956') + , ('n','\957') + , ('o','\959') + , ('p','\960') + , ('q','\952') + , ('r','\961') + , ('s','\963') + , ('t','\964') + , ('u','\965') + , ('v','\982') + , ('w','\969') + , ('x','\958') + , ('y','\968') + , ('z','\950') + , ('{','{') + , ('|','|') + , ('}','}') + , ('~','\8764') + , ('\160','\8364') + , ('\161','\978') + , ('\162','\8242') + , ('\163','\8804') + , ('\164','\8260') + , ('\164','\8725') + , ('\165','\8734') + , ('\166','\402') + , ('\167','\9827') + , ('\168','\9830') + , ('\169','\9829') + , ('\170','\9824') + , ('\171','\8596') + , ('\172','\8592') + , ('\173','\8593') + , ('\174','\8594') + , ('\175','\8595') + , ('\176','\176') + , ('\177','\177') + , ('\178','\8243') + , ('\179','\8805') + , ('\180','\215') + , ('\181','\8733') + , ('\182','\8706') + , ('\183','\8226') + , ('\184','\247') + , ('\185','\8800') + , ('\186','\8801') + , ('\187','\8776') + , ('\188','\8230') + , ('\189','\63718') + , ('\190','\63719') + , ('\191','\8629') + , ('\192','\8501') + , ('\193','\8465') + , ('\194','\8476') + , ('\195','\8472') + , ('\196','\8855') + , ('\197','\8853') + , ('\198','\8709') + , ('\199','\8745') + , ('\200','\8746') + , ('\201','\8835') + , ('\202','\8839') + , ('\203','\8836') + , ('\204','\8834') + , ('\205','\8838') + , ('\206','\8712') + , ('\207','\8713') + , ('\208','\8736') + , ('\209','\8711') + , ('\210','\63194') + , ('\211','\63193') + , ('\212','\63195') + , ('\213','\8719') + , ('\214','\8730') + , ('\215','\8901') + , ('\216','\172') + , ('\217','\8743') + , ('\218','\8744') + , ('\219','\8660') + , ('\220','\8656') + , ('\221','\8657') + , ('\222','\8658') + , ('\223','\8659') + , ('\224','\9674') + , ('\225','\9001') + , ('\226','\63720') + , ('\227','\63721') + , ('\228','\63722') + , ('\229','\8721') + , ('\230','\63723') + , ('\231','\63724') + , ('\232','\63725') + , ('\233','\63726') + , ('\234','\63727') + , ('\235','\63728') + , ('\236','\63729') + , ('\237','\63730') + , ('\238','\63731') + , ('\239','\63732') + , ('\241','\9002') + , ('\242','\8747') + , ('\243','\8992') + , ('\244','\63733') + , ('\245','\8993') + , ('\246','\63734') + , ('\247','\63735') + , ('\248','\63736') + , ('\249','\63737') + , ('\250','\63738') + , ('\251','\63739') + , ('\252','\63740') + , ('\253','\63741') + , ('\254','\63742')] diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs index 68559d98b..ea195c14a 100644 --- a/src/Text/Pandoc/Readers/Docx/Lists.hs +++ b/src/Text/Pandoc/Readers/Docx/Lists.hs @@ -29,9 +29,12 @@ Functions for converting flat docx paragraphs into nested lists. -} module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets - , blocksToDefinitions) where + , blocksToDefinitions + , listParagraphDivs + ) where import Text.Pandoc.JSON +import Text.Pandoc.Generic (bottomUp) import Text.Pandoc.Shared (trim) import Control.Monad import Data.List @@ -118,7 +121,7 @@ handleListParagraphs ( in handleListParagraphs ((Div attr1 (blks1 ++ [newDiv2])) : blks) handleListParagraphs (blk:blks) = blk : (handleListParagraphs blks) - + separateBlocks' :: Block -> [[Block]] -> [[Block]] separateBlocks' blk ([] : []) = [[blk]] separateBlocks' b@(BulletList _) acc = (init acc) ++ [(last acc) ++ [b]] @@ -136,7 +139,7 @@ flatToBullets' :: Integer -> [Block] -> [Block] flatToBullets' _ [] = [] flatToBullets' num xs@(b : elems) | getLevelN b == num = b : (flatToBullets' num elems) - | otherwise = + | otherwise = let bNumId = getNumIdN b bLevel = getLevelN b (children, remaining) = @@ -159,10 +162,9 @@ flatToBullets elems = flatToBullets' (-1) elems blocksToBullets :: [Block] -> [Block] blocksToBullets blks = - -- bottomUp removeListItemDivs $ + bottomUp removeListDivs $ flatToBullets $ (handleListParagraphs blks) - plainParaInlines :: Block -> [Inline] plainParaInlines (Plain ils) = ils plainParaInlines (Para ils) = ils @@ -199,10 +201,27 @@ blocksToDefinitions' [] acc (b:blks) = blocksToDefinitions' defAcc acc (b:blks) = blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks +removeListDivs' :: Block -> [Block] +removeListDivs' (Div (ident, classes, kvs) blks) + | "list-item" `elem` classes = + case delete "list-item" classes of + [] -> blks + classes' -> [Div (ident, classes', kvs) $ blks] +removeListDivs' (Div (ident, classes, kvs) blks) + | not $ null $ listParagraphDivs `intersect` classes = + case classes \\ listParagraphDivs of + [] -> blks + classes' -> [Div (ident, classes', kvs) blks] +removeListDivs' blk = [blk] + +removeListDivs :: [Block] -> [Block] +removeListDivs = concatMap removeListDivs' + + blocksToDefinitions :: [Block] -> [Block] blocksToDefinitions = blocksToDefinitions' [] [] - - - + + + diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 18200bcf9..1abd4bc6b 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -1,3 +1,5 @@ +{-# LANGUAGE PatternGuards, ViewPatterns #-} + {- Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> @@ -8,49 +10,46 @@ the Free Software Foundation; either version 2 of the License, or This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | - Module : Text.Pandoc.Readers.Docx.Parse - Copyright : Copyright (C) 2014 Jesse Rosenthal - License : GNU GPL, version 2 or above + Module : Text.Pandoc.Readers.Docx.Parse + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above - Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> - Stability : alpha - Portability : portable + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable Conversion of docx archive into Docx haskell type -} +module Text.Pandoc.Readers.Docx.Parse ( Docx(..) + , Document(..) + , Body(..) + , BodyPart(..) + , TblLook(..) + , ParPart(..) + , Run(..) + , RunElem(..) + , Notes + , Numbering + , Relationship + , Media + , RunStyle(..) + , ParIndentation(..) + , ParagraphStyle(..) + , Row(..) + , Cell(..) + , archiveToDocx + ) where -module Text.Pandoc.Readers.Docx.Parse ( Docx(..) - , Document(..) - , Body(..) - , BodyPart(..) - , TblLook(..) - , ParPart(..) - , Run(..) - , RunElem(..) - , Notes - , Numbering - , Relationship - , Media - , RunStyle(..) - , ParagraphStyle(..) - , Row(..) - , Cell(..) - , getFootNote - , getEndNote - , lookupLevel - , lookupRelationship - , archiveToDocx - ) where import Codec.Archive.Zip import Text.XML.Light import Data.Maybe @@ -59,39 +58,244 @@ import System.FilePath import Data.Bits ((.|.)) import qualified Data.ByteString.Lazy as B import qualified Text.Pandoc.UTF8 as UTF8 +import Control.Monad.Reader +import Control.Applicative ((<$>), (<|>)) +import qualified Data.Map as M +import Text.Pandoc.Compat.Except +import Text.TeXMath.Readers.OMML (readOMML) +import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) +import Text.TeXMath (Exp) +import Data.Char (readLitChar, ord, chr) + +data ReaderEnv = ReaderEnv { envNotes :: Notes + , envNumbering :: Numbering + , envRelationships :: [Relationship] + , envMedia :: Media + , envFont :: Maybe Font + } + deriving Show + +data DocxError = DocxError | WrongElem + deriving Show + +instance Error DocxError where + noMsg = WrongElem + +type D = ExceptT DocxError (Reader ReaderEnv) + +runD :: D a -> ReaderEnv -> Either DocxError a +runD dx re = runReader (runExceptT dx ) re + +maybeToD :: Maybe a -> D a +maybeToD (Just a) = return a +maybeToD Nothing = throwError DocxError + +eitherToD :: Either a b -> D b +eitherToD (Right b) = return b +eitherToD (Left _) = throwError DocxError + +concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b] +concatMapM f xs = liftM concat (mapM f xs) + + +-- This is similar to `mapMaybe`: it maps a function returning the D +-- monad over a list, and only keeps the non-erroring return values. +mapD :: (a -> D b) -> [a] -> D [b] +mapD f xs = + let handler x = (f x >>= (\y-> return [y])) `catchError` (\_ -> return []) + in + concatMapM handler xs -attrToNSPair :: Attr -> Maybe (String, String) -attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) -attrToNSPair _ = Nothing +type NameSpaces = [(String, String)] +data Docx = Docx Document + deriving Show -type NameSpaces = [(String, String)] +data Document = Document NameSpaces Body + deriving Show -data Docx = Docx Document Notes Numbering [Relationship] Media +data Body = Body [BodyPart] deriving Show -archiveToDocx :: Archive -> Maybe Docx +type Media = [(FilePath, B.ByteString)] + +data Numbering = Numbering NameSpaces [Numb] [AbstractNumb] + deriving Show + +data Numb = Numb String String -- right now, only a key to an abstract num + deriving Show + +data AbstractNumb = AbstractNumb String [Level] + deriving Show + +-- (ilvl, format, string, start) +type Level = (String, String, String, Maybe Integer) + +data Relationship = Relationship (RelId, Target) + deriving Show + +data Notes = Notes NameSpaces + (Maybe (M.Map String Element)) + (Maybe (M.Map String Element)) + deriving Show + +data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer + , rightParIndent :: Maybe Integer + , hangingParIndent :: Maybe Integer} + deriving Show + +data ParagraphStyle = ParagraphStyle { pStyle :: [String] + , indentation :: Maybe ParIndentation + } + deriving Show + +defaultParagraphStyle :: ParagraphStyle +defaultParagraphStyle = ParagraphStyle { pStyle = [] + , indentation = Nothing + } + + +data BodyPart = Paragraph ParagraphStyle [ParPart] + | ListItem ParagraphStyle String String Level [ParPart] + | Tbl String TblGrid TblLook [Row] + | OMathPara [Exp] + deriving Show + +type TblGrid = [Integer] + +data TblLook = TblLook {firstRowFormatting::Bool} + deriving Show + +defaultTblLook :: TblLook +defaultTblLook = TblLook{firstRowFormatting = False} + +data Row = Row [Cell] + deriving Show + +data Cell = Cell [BodyPart] + deriving Show + +data ParPart = PlainRun Run + | Insertion ChangeId Author ChangeDate [Run] + | Deletion ChangeId Author ChangeDate [Run] + | BookMark BookMarkId Anchor + | InternalHyperLink Anchor [Run] + | ExternalHyperLink URL [Run] + | Drawing FilePath B.ByteString + | PlainOMath [Exp] + deriving Show + +data Run = Run RunStyle [RunElem] + | Footnote [BodyPart] + | Endnote [BodyPart] + | InlineDrawing FilePath B.ByteString + deriving Show + +data RunElem = TextRun String | LnBrk | Tab + deriving Show + +data RunStyle = RunStyle { isBold :: Bool + , isItalic :: Bool + , isSmallCaps :: Bool + , isStrike :: Bool + , isSuperScript :: Bool + , isSubScript :: Bool + , rUnderline :: Maybe String + , rStyle :: Maybe String } + deriving Show + +defaultRunStyle :: RunStyle +defaultRunStyle = RunStyle { isBold = False + , isItalic = False + , isSmallCaps = False + , isStrike = False + , isSuperScript = False + , isSubScript = False + , rUnderline = Nothing + , rStyle = Nothing + } + + +type Target = String +type Anchor = String +type URL = String +type BookMarkId = String +type RelId = String +type ChangeId = String +type Author = String +type ChangeDate = String + +attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) +attrToNSPair _ = Nothing + +archiveToDocx :: Archive -> Either DocxError Docx archiveToDocx archive = do - let notes = archiveToNotes archive - rels = archiveToRelationships archive - media = archiveToMedia archive - doc <- archiveToDocument archive - numbering <- archiveToNumbering archive - return $ Docx doc notes numbering rels media - -data Document = Document NameSpaces Body - deriving Show + let notes = archiveToNotes archive + numbering = archiveToNumbering archive + rels = archiveToRelationships archive + media = archiveToMedia archive + rEnv = ReaderEnv notes numbering rels media Nothing + doc <- runD (archiveToDocument archive) rEnv + return $ Docx doc + -archiveToDocument :: Archive -> Maybe Document +archiveToDocument :: Archive -> D Document archiveToDocument zf = do - entry <- findEntryByPath "word/document.xml" zf - docElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry - let namespaces = mapMaybe attrToNSPair (elAttribs docElem) - bodyElem <- findChild (QName "body" (lookup "w" namespaces) Nothing) docElem + entry <- maybeToD $ findEntryByPath "word/document.xml" zf + docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + let namespaces = mapMaybe attrToNSPair (elAttribs docElem) + bodyElem <- maybeToD $ findChild (elemName namespaces "w" "body") docElem body <- elemToBody namespaces bodyElem return $ Document namespaces body -type Media = [(FilePath, B.ByteString)] +elemToBody :: NameSpaces -> Element -> D Body +elemToBody ns element | isElem ns "w" "body" element = + mapD (elemToBodyPart ns) (elChildren element) >>= + (\bps -> return $ Body bps) +elemToBody _ _ = throwError WrongElem + +archiveToNotes :: Archive -> Notes +archiveToNotes zf = + let fnElem = findEntryByPath "word/footnotes.xml" zf + >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + enElem = findEntryByPath "word/endnotes.xml" zf + >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + fn_namespaces = case fnElem of + Just e -> mapMaybe attrToNSPair (elAttribs e) + Nothing -> [] + en_namespaces = case enElem of + Just e -> mapMaybe attrToNSPair (elAttribs e) + Nothing -> [] + ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces + fn = fnElem >>= (elemToNotes ns "footnote") + en = enElem >>= (elemToNotes ns "endnote") + in + Notes ns fn en + +filePathIsRel :: FilePath -> Bool +filePathIsRel fp = + let (dir, name) = splitFileName fp + in + (dir == "word/_rels/") && ((takeExtension name) == ".rels") + +relElemToRelationship :: Element -> Maybe Relationship +relElemToRelationship element | qName (elName element) == "Relationship" = + do + relId <- findAttr (QName "Id" Nothing Nothing) element + target <- findAttr (QName "Target" Nothing Nothing) element + return $ Relationship (relId, target) +relElemToRelationship _ = Nothing + + +archiveToRelationships :: Archive -> [Relationship] +archiveToRelationships archive = + let relPaths = filter filePathIsRel (filesInArchive archive) + entries = mapMaybe (\f -> findEntryByPath f archive) relPaths + relElems = mapMaybe (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries + rels = mapMaybe relElemToRelationship $ concatMap elChildren relElems + in + rels filePathIsMedia :: FilePath -> Bool filePathIsMedia fp = @@ -109,18 +313,6 @@ archiveToMedia :: Archive -> Media archiveToMedia zf = mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf)) -data Numbering = Numbering NameSpaces [Numb] [AbstractNumb] - deriving Show - -data Numb = Numb String String -- right now, only a key to an abstract num - deriving Show - -data AbstractNumb = AbstractNumb String [Level] - deriving Show - --- (ilvl, format, string, start) -type Level = (String, String, String, Maybe Integer) - lookupLevel :: String -> String -> Numbering -> Maybe Level lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs @@ -148,7 +340,7 @@ absNumElemToAbsNum ns element | let levelElems = findChildren (QName "lvl" (lookup "w" ns) (Just "w")) element - levels = mapMaybe id $ map (levelElemToLevel ns) levelElems + levels = mapMaybe (levelElemToLevel ns) levelElems return $ AbstractNumb absNumId levels absNumElemToAbsNum _ _ = Nothing @@ -167,8 +359,8 @@ levelElemToLevel ns element | return (ilvl, fmt, txt, start) levelElemToLevel _ _ = Nothing -archiveToNumbering :: Archive -> Maybe Numbering -archiveToNumbering zf = +archiveToNumbering' :: Archive -> Maybe Numbering +archiveToNumbering' zf = do case findEntryByPath "word/numbering.xml" zf of Nothing -> Just $ Numbering [] [] [] Just entry -> do @@ -180,321 +372,281 @@ archiveToNumbering zf = absNumElems = findChildren (QName "abstractNum" (lookup "w" namespaces) (Just "w")) numberingElem - nums = mapMaybe id $ map (numElemToNum namespaces) numElems - absNums = mapMaybe id $ map (absNumElemToAbsNum namespaces) absNumElems + nums = mapMaybe (numElemToNum namespaces) numElems + absNums = mapMaybe (absNumElemToAbsNum namespaces) absNumElems return $ Numbering namespaces nums absNums -data Notes = Notes NameSpaces (Maybe [(String, [BodyPart])]) (Maybe [(String, [BodyPart])]) - deriving Show - -noteElemToNote :: NameSpaces -> Element -> Maybe (String, [BodyPart]) -noteElemToNote ns element - | qName (elName element) `elem` ["endnote", "footnote"] && - qURI (elName element) == (lookup "w" ns) = - do - noteId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element - let bps = map fromJust - $ filter isJust - $ map (elemToBodyPart ns) - $ filterChildrenName (isParOrTbl ns) element - return $ (noteId, bps) -noteElemToNote _ _ = Nothing - -getFootNote :: String -> Notes -> Maybe [BodyPart] -getFootNote s (Notes _ fns _) = fns >>= (lookup s) - -getEndNote :: String -> Notes -> Maybe [BodyPart] -getEndNote s (Notes _ _ ens) = ens >>= (lookup s) +archiveToNumbering :: Archive -> Numbering +archiveToNumbering archive = + fromMaybe (Numbering [] [] []) (archiveToNumbering' archive) -elemToNotes :: NameSpaces -> String -> Element -> Maybe [(String, [BodyPart])] +elemToNotes :: NameSpaces -> String -> Element -> Maybe (M.Map String Element) elemToNotes ns notetype element - | qName (elName element) == (notetype ++ "s") && - qURI (elName element) == (lookup "w" ns) = - Just $ map fromJust - $ filter isJust - $ map (noteElemToNote ns) - $ findChildren (QName notetype (lookup "w" ns) (Just "w")) element + | isElem ns "w" (notetype ++ "s") element = + let pairs = mapMaybe + (\e -> findAttr (elemName ns "w" "id") e >>= + (\a -> Just (a, e))) + (findChildren (elemName ns "w" notetype) element) + in + Just $ M.fromList $ pairs elemToNotes _ _ _ = Nothing -archiveToNotes :: Archive -> Notes -archiveToNotes zf = - let fnElem = findEntryByPath "word/footnotes.xml" zf - >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) - enElem = findEntryByPath "word/endnotes.xml" zf - >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) - fn_namespaces = case fnElem of - Just e -> mapMaybe attrToNSPair (elAttribs e) - Nothing -> [] - en_namespaces = case enElem of - Just e -> mapMaybe attrToNSPair (elAttribs e) - Nothing -> [] - ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces - fn = fnElem >>= (elemToNotes ns "footnote") - en = enElem >>= (elemToNotes ns "endnote") - in - Notes ns fn en +--------------------------------------------- +--------------------------------------------- +elemName :: NameSpaces -> String -> String -> QName +elemName ns prefix name = (QName name (lookup prefix ns) (Just prefix)) -data Relationship = Relationship (RelId, Target) - deriving Show +isElem :: NameSpaces -> String -> String -> Element -> Bool +isElem ns prefix name element = + qName (elName element) == name && + qURI (elName element) == (lookup prefix ns) -lookupRelationship :: RelId -> [Relationship] -> Maybe Target -lookupRelationship relid rels = - lookup relid (map (\(Relationship pair) -> pair) rels) -filePathIsRel :: FilePath -> Bool -filePathIsRel fp = - let (dir, name) = splitFileName fp +elemToTblGrid :: NameSpaces -> Element -> D TblGrid +elemToTblGrid ns element | isElem ns "w" "tblGrid" element = + let cols = findChildren (elemName ns "w" "gridCol") element in - (dir == "word/_rels/") && ((takeExtension name) == ".rels") - -relElemToRelationship :: Element -> Maybe Relationship -relElemToRelationship element | qName (elName element) == "Relationship" = - do - relId <- findAttr (QName "Id" Nothing Nothing) element - target <- findAttr (QName "Target" Nothing Nothing) element - return $ Relationship (relId, target) -relElemToRelationship _ = Nothing - - -archiveToRelationships :: Archive -> [Relationship] -archiveToRelationships archive = - let relPaths = filter filePathIsRel (filesInArchive archive) - entries = map fromJust $ filter isJust $ map (\f -> findEntryByPath f archive) relPaths - relElems = map fromJust $ filter isJust $ map (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries - rels = map fromJust $ filter isJust $ map relElemToRelationship $ concatMap elChildren relElems + mapD (\e -> maybeToD (findAttr (elemName ns "w" "val") e >>= stringToInteger)) + cols +elemToTblGrid _ _ = throwError WrongElem + +elemToTblLook :: NameSpaces -> Element -> D TblLook +elemToTblLook ns element | isElem ns "w" "tblLook" element = + let firstRow = findAttr (elemName ns "w" "firstRow") element + val = findAttr (elemName ns "w" "val") element + firstRowFmt = + case firstRow of + Just "1" -> True + Just _ -> False + Nothing -> case val of + Just bitMask -> testBitMask bitMask 0x020 + Nothing -> False in - rels - -data Body = Body [BodyPart] - deriving Show + return $ TblLook{firstRowFormatting = firstRowFmt} +elemToTblLook _ _ = throwError WrongElem -isParOrTbl :: NameSpaces -> QName -> Bool -isParOrTbl ns q = qName q `elem` ["p", "tbl"] && - qURI q == (lookup "w" ns) +elemToRow :: NameSpaces -> Element -> D Row +elemToRow ns element | isElem ns "w" "tr" element = + do + let cellElems = findChildren (elemName ns "w" "tc") element + cells <- mapD (elemToCell ns) cellElems + return $ Row cells +elemToRow _ _ = throwError WrongElem -elemToBody :: NameSpaces -> Element -> Maybe Body -elemToBody ns element | qName (elName element) == "body" && qURI (elName element) == (lookup "w" ns) = - Just $ Body - $ map fromJust - $ filter isJust - $ map (elemToBodyPart ns) $ filterChildrenName (isParOrTbl ns) element -elemToBody _ _ = Nothing +elemToCell :: NameSpaces -> Element -> D Cell +elemToCell ns element | isElem ns "w" "tc" element = + do + cellContents <- mapD (elemToBodyPart ns) (elChildren element) + return $ Cell cellContents +elemToCell _ _ = throwError WrongElem + +elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation +elemToParIndentation ns element | isElem ns "w" "ind" element = + Just $ ParIndentation { + leftParIndent = + findAttr (QName "left" (lookup "w" ns) (Just "w")) element >>= + stringToInteger + , rightParIndent = + findAttr (QName "right" (lookup "w" ns) (Just "w")) element >>= + stringToInteger + , hangingParIndent = + findAttr (QName "hanging" (lookup "w" ns) (Just "w")) element >>= + stringToInteger} +elemToParIndentation _ _ = Nothing -isRunOrLinkOrBookmark :: NameSpaces -> QName -> Bool -isRunOrLinkOrBookmark ns q = qName q `elem` ["r", "hyperlink", "bookmarkStart"] && - qURI q == (lookup "w" ns) elemToNumInfo :: NameSpaces -> Element -> Maybe (String, String) -elemToNumInfo ns element - | qName (elName element) == "p" && - qURI (elName element) == (lookup "w" ns) = - do - pPr <- findChild (QName "pPr" (lookup "w" ns) (Just "w")) element - numPr <- findChild (QName "numPr" (lookup "w" ns) (Just "w")) pPr - lvl <- findChild (QName "ilvl" (lookup "w" ns) (Just "w")) numPr >>= - findAttr (QName "val" (lookup "w" ns) (Just "w")) - numId <- findChild (QName "numId" (lookup "w" ns) (Just "w")) numPr >>= - findAttr (QName "val" (lookup "w" ns) (Just "w")) - return (numId, lvl) +elemToNumInfo ns element | isElem ns "w" "p" element = do + let pPr = findChild (elemName ns "w" "pPr") element + numPr = pPr >>= findChild (elemName ns "w" "numPr") + lvl <- numPr >>= + findChild (elemName ns "w" "ilvl") >>= + findAttr (elemName ns "w" "val") + numId <- numPr >>= + findChild (elemName ns "w" "numId") >>= + findAttr (elemName ns "w" "val") + return (numId, lvl) elemToNumInfo _ _ = Nothing --- isBookMarkTag :: NameSpaces -> QName -> Bool --- isBookMarkTag ns q = qName q `elem` ["bookmarkStart", "bookmarkEnd"] && --- qURI q == (lookup "w" ns) - --- parChildrenToBookmark :: NameSpaces -> [Element] -> BookMark --- parChildrenToBookmark ns (bms : bme : _) --- | qName (elName bms) == "bookmarkStart" && --- qURI (elName bms) == (lookup "w" ns) && --- qName (elName bme) == "bookmarkEnd" && --- qURI (elName bme) == (lookup "w" ns) = do --- bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) bms --- bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) bms --- return $ (bmId, bmName) --- parChildrenToBookmark _ _ = Nothing - -elemToBodyPart :: NameSpaces -> Element -> Maybe BodyPart -elemToBodyPart ns element - | qName (elName element) == "p" && - qURI (elName element) == (lookup "w" ns) = - let parstyle = elemToParagraphStyle ns element - parparts = mapMaybe id - $ map (elemToParPart ns) - $ filterChildrenName (isRunOrLinkOrBookmark ns) element - in - case elemToNumInfo ns element of - Just (numId, lvl) -> Just $ ListItem parstyle numId lvl parparts - Nothing -> Just $ Paragraph parstyle parparts - | qName (elName element) == "tbl" && - qURI (elName element) == (lookup "w" ns) = - let - caption = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element - >>= findChild (QName "tblCaption" (lookup "w" ns) (Just "w")) - >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) - grid = case - findChild (QName "tblGrid" (lookup "w" ns) (Just "w")) element - of - Just g -> elemToTblGrid ns g - Nothing -> [] - tblLook = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element - >>= findChild (QName "tblLook" (lookup "w" ns) (Just "w")) - >>= elemToTblLook ns - in - Just $ Tbl - (fromMaybe "" caption) - grid - (fromMaybe defaultTblLook tblLook) - (mapMaybe (elemToRow ns) (elChildren element)) - | otherwise = Nothing - -elemToTblLook :: NameSpaces -> Element -> Maybe TblLook -elemToTblLook ns element - | qName (elName element) == "tblLook" && - qURI (elName element) == (lookup "w" ns) = - let firstRow = findAttr (QName "firstRow" (lookup "w" ns) (Just "w")) element - val = findAttr (QName "val" (lookup "w" ns) (Just "w")) element - firstRowFmt = - case firstRow of - Just "1" -> True - Just _ -> False - Nothing -> case val of - Just bitMask -> testBitMask bitMask 0x020 - Nothing -> False - in - Just $ TblLook{firstRowFormatting = firstRowFmt} -elemToTblLook _ _ = Nothing - testBitMask :: String -> Int -> Bool testBitMask bitMaskS n = case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of [] -> False ((n', _) : _) -> ((n' .|. n) /= 0) -data ParagraphStyle = ParagraphStyle { pStyle :: [String] - , indent :: Maybe Integer - } - deriving Show - -defaultParagraphStyle :: ParagraphStyle -defaultParagraphStyle = ParagraphStyle { pStyle = [] - , indent = Nothing - } - -elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle -elemToParagraphStyle ns element = - case findChild (QName "pPr" (lookup "w" ns) (Just "w")) element of - Just pPr -> - ParagraphStyle - {pStyle = - mapMaybe id $ - map - (findAttr (QName "val" (lookup "w" ns) (Just "w"))) - (findChildren (QName "pStyle" (lookup "w" ns) (Just "w")) pPr) - , indent = - findChild (QName "ind" (lookup "w" ns) (Just "w")) pPr >>= - findAttr (QName "left" (lookup "w" ns) (Just "w")) >>= - stringToInteger - } - Nothing -> defaultParagraphStyle - - -data BodyPart = Paragraph ParagraphStyle [ParPart] - | ListItem ParagraphStyle String String [ParPart] - | Tbl String TblGrid TblLook [Row] - - deriving Show - -type TblGrid = [Integer] - -data TblLook = TblLook {firstRowFormatting::Bool} - deriving Show - -defaultTblLook :: TblLook -defaultTblLook = TblLook{firstRowFormatting = False} - stringToInteger :: String -> Maybe Integer stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) -elemToTblGrid :: NameSpaces -> Element -> TblGrid -elemToTblGrid ns element - | qName (elName element) == "tblGrid" && - qURI (elName element) == (lookup "w" ns) = - let - cols = findChildren (QName "gridCol" (lookup "w" ns) (Just "w")) element - in - mapMaybe (\e -> - findAttr (QName "val" (lookup "w" ns) (Just ("w"))) e - >>= stringToInteger - ) - cols -elemToTblGrid _ _ = [] - -data Row = Row [Cell] - deriving Show - - -elemToRow :: NameSpaces -> Element -> Maybe Row -elemToRow ns element - | qName (elName element) == "tr" && - qURI (elName element) == (lookup "w" ns) = - let - cells = findChildren (QName "tc" (lookup "w" ns) (Just "w")) element - in - Just $ Row (mapMaybe (elemToCell ns) cells) -elemToRow _ _ = Nothing +elemToBodyPart :: NameSpaces -> Element -> D BodyPart +elemToBodyPart ns element + | isElem ns "w" "p" element + , (c:_) <- findChildren (elemName ns "m" "oMathPara") element = + do + expsLst <- eitherToD $ readOMML $ showElement c + return $ OMathPara expsLst +elemToBodyPart ns element + | isElem ns "w" "p" element + , Just (numId, lvl) <- elemToNumInfo ns element = do + let parstyle = elemToParagraphStyle ns element + parparts <- mapD (elemToParPart ns) (elChildren element) + num <- asks envNumbering + case lookupLevel numId lvl num of + Just levelInfo -> return $ ListItem parstyle numId lvl levelInfo parparts + Nothing -> throwError WrongElem +elemToBodyPart ns element + | isElem ns "w" "p" element = do + let parstyle = elemToParagraphStyle ns element + parparts <- mapD (elemToParPart ns) (elChildren element) + return $ Paragraph parstyle parparts +elemToBodyPart ns element + | isElem ns "w" "tbl" element = do + let caption' = findChild (elemName ns "w" "tblPr") element + >>= findChild (elemName ns "w" "tblCaption") + >>= findAttr (elemName ns "w" "val") + caption = (fromMaybe "" caption') + grid' = case findChild (elemName ns "w" "tblGrid") element of + Just g -> elemToTblGrid ns g + Nothing -> return [] + tblLook' = case findChild (elemName ns "w" "tblPr") element >>= + findChild (elemName ns "w" "tblLook") + of + Just l -> elemToTblLook ns l + Nothing -> return defaultTblLook + + grid <- grid' + tblLook <- tblLook' + rows <- mapD (elemToRow ns) (elChildren element) + return $ Tbl caption grid tblLook rows +elemToBodyPart _ _ = throwError WrongElem -data Cell = Cell [BodyPart] - deriving Show +lookupRelationship :: RelId -> [Relationship] -> Maybe Target +lookupRelationship relid rels = + lookup relid (map (\(Relationship pair) -> pair) rels) -elemToCell :: NameSpaces -> Element -> Maybe Cell -elemToCell ns element - | qName (elName element) == "tc" && - qURI (elName element) == (lookup "w" ns) = - Just $ Cell (mapMaybe (elemToBodyPart ns) (elChildren element)) -elemToCell _ _ = Nothing +expandDrawingId :: String -> D (FilePath, B.ByteString) +expandDrawingId s = do + target <- asks (lookupRelationship s . envRelationships) + case target of + Just filepath -> do + bytes <- asks (lookup (combine "word" filepath) . envMedia) + case bytes of + Just bs -> return (filepath, bs) + Nothing -> throwError DocxError + Nothing -> throwError DocxError + +elemToParPart :: NameSpaces -> Element -> D ParPart +elemToParPart ns element + | isElem ns "w" "r" element + , Just _ <- findChild (elemName ns "w" "drawing") element = + let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" + drawing = findElement (QName "blip" (Just a_ns) (Just "a")) element + >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) + in + case drawing of + Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp bs) + Nothing -> throwError WrongElem +elemToParPart ns element + | isElem ns "w" "r" element = + elemToRun ns element >>= (\r -> return $ PlainRun r) +elemToParPart ns element + | isElem ns "w" "ins" element + , Just cId <- findAttr (elemName ns "w" "id") element + , Just cAuthor <- findAttr (elemName ns "w" "author") element + , Just cDate <- findAttr (elemName ns "w" "date") element = do + runs <- mapD (elemToRun ns) (elChildren element) + return $ Insertion cId cAuthor cDate runs +elemToParPart ns element + | isElem ns "w" "del" element + , Just cId <- findAttr (elemName ns "w" "id") element + , Just cAuthor <- findAttr (elemName ns "w" "author") element + , Just cDate <- findAttr (elemName ns "w" "date") element = do + runs <- mapD (elemToRun ns) (elChildren element) + return $ Deletion cId cAuthor cDate runs +elemToParPart ns element + | isElem ns "w" "bookmarkStart" element + , Just bmId <- findAttr (elemName ns "w" "id") element + , Just bmName <- findAttr (elemName ns "w" "name") element = + return $ BookMark bmId bmName +elemToParPart ns element + | isElem ns "w" "hyperlink" element + , Just anchor <- findAttr (elemName ns "w" "anchor") element = do + runs <- mapD (elemToRun ns) (elChildren element) + return $ InternalHyperLink anchor runs +elemToParPart ns element + | isElem ns "w" "hyperlink" element + , Just relId <- findAttr (elemName ns "r" "id") element = do + runs <- mapD (elemToRun ns) (elChildren element) + rels <- asks envRelationships + return $ case lookupRelationship relId rels of + Just target -> ExternalHyperLink target runs + Nothing -> ExternalHyperLink "" runs +elemToParPart ns element + | isElem ns "m" "oMath" element = + (eitherToD $ readOMML $ showElement element) >>= (return . PlainOMath) +elemToParPart _ _ = throwError WrongElem -data ParPart = PlainRun Run - | BookMark BookMarkId Anchor - | InternalHyperLink Anchor [Run] - | ExternalHyperLink RelId [Run] - | Drawing String - deriving Show +lookupFootnote :: String -> Notes -> Maybe Element +lookupFootnote s (Notes _ fns _) = fns >>= (M.lookup s) -data Run = Run RunStyle [RunElem] - | Footnote String - | Endnote String - deriving Show +lookupEndnote :: String -> Notes -> Maybe Element +lookupEndnote s (Notes _ _ ens) = ens >>= (M.lookup s) -data RunElem = TextRun String | LnBrk | Tab - deriving Show +elemToRun :: NameSpaces -> Element -> D Run +elemToRun ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChild (elemName ns "w" "drawing") element = + let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" + drawing = findElement (QName "blip" (Just a_ns) (Just "a")) drawingElem + >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) + in + case drawing of + Just s -> expandDrawingId s >>= + (\(fp, bs) -> return $ InlineDrawing fp bs) + Nothing -> throwError WrongElem +elemToRun ns element + | isElem ns "w" "r" element + , Just ref <- findChild (elemName ns "w" "footnoteReference") element + , Just fnId <- findAttr (elemName ns "w" "id") ref = do + notes <- asks envNotes + case lookupFootnote fnId notes of + Just e -> do bps <- mapD (elemToBodyPart ns) (elChildren e) + return $ Footnote bps + Nothing -> return $ Footnote [] +elemToRun ns element + | isElem ns "w" "r" element + , Just ref <- findChild (elemName ns "w" "endnoteReference") element + , Just enId <- findAttr (elemName ns "w" "id") ref = do + notes <- asks envNotes + case lookupEndnote enId notes of + Just e -> do bps <- mapD (elemToBodyPart ns) (elChildren e) + return $ Endnote bps + Nothing -> return $ Endnote [] +elemToRun ns element + | isElem ns "w" "r" element = do + runElems <- elemToRunElems ns element + return $ Run (elemToRunStyle ns element) runElems +elemToRun _ _ = throwError WrongElem -data RunStyle = RunStyle { isBold :: Bool - , isItalic :: Bool - , isSmallCaps :: Bool - , isStrike :: Bool - , isSuperScript :: Bool - , isSubScript :: Bool - , underline :: Maybe String - , rStyle :: Maybe String } - deriving Show +elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle +elemToParagraphStyle ns element + | Just pPr <- findChild (elemName ns "w" "pPr") element = + ParagraphStyle + {pStyle = + mapMaybe + (findAttr (elemName ns "w" "val")) + (findChildren (elemName ns "w" "pStyle") pPr) + , indentation = + findChild (elemName ns "w" "ind") pPr >>= + elemToParIndentation ns + } +elemToParagraphStyle _ _ = defaultParagraphStyle -defaultRunStyle :: RunStyle -defaultRunStyle = RunStyle { isBold = False - , isItalic = False - , isSmallCaps = False - , isStrike = False - , isSuperScript = False - , isSubScript = False - , underline = Nothing - , rStyle = Nothing - } elemToRunStyle :: NameSpaces -> Element -> RunStyle -elemToRunStyle ns element = - case findChild (QName "rPr" (lookup "w" ns) (Just "w")) element of - Just rPr -> - RunStyle +elemToRunStyle ns element + | Just rPr <- findChild (elemName ns "w" "rPr") element = + RunStyle { isBold = isJust $ findChild (QName "b" (lookup "w" ns) (Just "w")) rPr , isItalic = isJust $ findChild (QName "i" (lookup "w" ns) (Just "w")) rPr @@ -508,100 +660,65 @@ elemToRunStyle ns element = (Just "subscript" == (findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>= findAttr (QName "val" (lookup "w" ns) (Just "w")))) - , underline = + , rUnderline = findChild (QName "u" (lookup "w" ns) (Just "w")) rPr >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) , rStyle = findChild (QName "rStyle" (lookup "w" ns) (Just "w")) rPr >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) } - Nothing -> defaultRunStyle +elemToRunStyle _ _ = defaultRunStyle -elemToRun :: NameSpaces -> Element -> Maybe Run -elemToRun ns element - | qName (elName element) == "r" && - qURI (elName element) == (lookup "w" ns) = - case - findChild (QName "footnoteReference" (lookup "w" ns) (Just "w")) element >>= - findAttr (QName "id" (lookup "w" ns) (Just "w")) - of - Just s -> Just $ Footnote s - Nothing -> - case - findChild (QName "endnoteReference" (lookup "w" ns) (Just "w")) element >>= - findAttr (QName "id" (lookup "w" ns) (Just "w")) - of - Just s -> Just $ Endnote s - Nothing -> Just $ - Run (elemToRunStyle ns element) - (elemToRunElems ns element) -elemToRun _ _ = Nothing - -elemToRunElem :: NameSpaces -> Element -> Maybe RunElem +elemToRunElem :: NameSpaces -> Element -> D RunElem elemToRunElem ns element - | qName (elName element) == "t" && - qURI (elName element) == (lookup "w" ns) = - Just $ TextRun (strContent element) - | qName (elName element) == "br" && - qURI (elName element) == (lookup "w" ns) = - Just $ LnBrk - | qName (elName element) == "tab" && - qURI (elName element) == (lookup "w" ns) = - Just $ Tab - | otherwise = Nothing - - -elemToRunElems :: NameSpaces -> Element -> [RunElem] + | isElem ns "w" "t" element + || isElem ns "w" "delText" element + || isElem ns "m" "t" element = do + let str = strContent element + font <- asks envFont + case font of + Nothing -> return $ TextRun str + Just f -> return . TextRun $ + map (\x -> fromMaybe x . getUnicode f . lowerFromPrivate $ x) str + | isElem ns "w" "br" element = return LnBrk + | isElem ns "w" "tab" element = return Tab + | isElem ns "w" "sym" element = return (getSymChar ns element) + | otherwise = throwError WrongElem + where + lowerFromPrivate (ord -> c) + | c >= ord '\xF000' = chr $ c - ord '\xF000' + | otherwise = chr c + +-- The char attribute is a hex string +getSymChar :: NameSpaces -> Element -> RunElem +getSymChar ns element + | Just s <- lowerFromPrivate <$> getCodepoint + , Just font <- getFont = + let [(char, _)] = readLitChar ("\\x" ++ s) in + TextRun . maybe "" (:[]) $ getUnicode font char + where + getCodepoint = findAttr (elemName ns "w" "char") element + getFont = stringToFont =<< findAttr (elemName ns "w" "font") element + lowerFromPrivate ('F':xs) = '0':xs + lowerFromPrivate xs = xs +getSymChar _ _ = TextRun "" + +stringToFont :: String -> Maybe Font +stringToFont "Symbol" = Just Symbol +stringToFont _ = Nothing + +elemToRunElems :: NameSpaces -> Element -> D [RunElem] elemToRunElems ns element - | qName (elName element) == "r" && - qURI (elName element) == (lookup "w" ns) = - mapMaybe (elemToRunElem ns) (elChildren element) - | otherwise = [] - -elemToDrawing :: NameSpaces -> Element -> Maybe ParPart -elemToDrawing ns element - | qName (elName element) == "drawing" && - qURI (elName element) == (lookup "w" ns) = - let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" - in - findElement (QName "blip" (Just a_ns) (Just "a")) element - >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) - >>= (\s -> Just $ Drawing s) -elemToDrawing _ _ = Nothing - - -elemToParPart :: NameSpaces -> Element -> Maybe ParPart -elemToParPart ns element - | qName (elName element) == "r" && - qURI (elName element) == (lookup "w" ns) = - case findChild (QName "drawing" (lookup "w" ns) (Just "w")) element of - Just drawingElem -> elemToDrawing ns drawingElem - Nothing -> do - r <- elemToRun ns element - return $ PlainRun r -elemToParPart ns element - | qName (elName element) == "bookmarkStart" && - qURI (elName element) == (lookup "w" ns) = do - bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element - bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) element - return $ BookMark bmId bmName -elemToParPart ns element - | qName (elName element) == "hyperlink" && - qURI (elName element) == (lookup "w" ns) = - let runs = map fromJust $ filter isJust $ map (elemToRun ns) - $ findChildren (QName "r" (lookup "w" ns) (Just "w")) element - in - case findAttr (QName "anchor" (lookup "w" ns) (Just "w")) element of - Just anchor -> - Just $ InternalHyperLink anchor runs - Nothing -> - case findAttr (QName "id" (lookup "r" ns) (Just "r")) element of - Just relId -> Just $ ExternalHyperLink relId runs - Nothing -> Nothing -elemToParPart _ _ = Nothing - -type Target = String -type Anchor = String -type BookMarkId = String -type RelId = String - + | isElem ns "w" "r" element + || isElem ns "m" "r" element = do + let qualName = elemName ns "w" + let font = do + fontElem <- findElement (qualName "rFonts") element + stringToFont =<< + (foldr (<|>) Nothing $ + map (flip findAttr fontElem . qualName) ["ascii", "hAnsi"]) + local (setFont font) (mapD (elemToRunElem ns) (elChildren element)) +elemToRunElems _ _ = throwError WrongElem + +setFont :: Maybe Font -> ReaderEnv -> ReaderEnv +setFont f s = s{envFont = f} diff --git a/src/Text/Pandoc/Readers/Docx/Reducible.hs b/src/Text/Pandoc/Readers/Docx/Reducible.hs new file mode 100644 index 000000000..2dbef4131 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Reducible.hs @@ -0,0 +1,183 @@ +{-# LANGUAGE OverloadedStrings, PatternGuards #-} + +{- +Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Reducible + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Typeclass for combining adjacent blocks and inlines correctly. +-} + + +module Text.Pandoc.Readers.Docx.Reducible ((<++>), + (<+++>), + Reducible, + Container(..), + container, + innards, + reduceList, + reduceListB, + rebuild) + where + +import Text.Pandoc.Builder +import Data.List ((\\), intersect) + +data Container a = Container ([a] -> a) | NullContainer + +instance (Eq a) => Eq (Container a) where + (Container x) == (Container y) = ((x []) == (y [])) + NullContainer == NullContainer = True + _ == _ = False + +instance (Show a) => Show (Container a) where + show (Container x) = "Container {" ++ + (reverse $ drop 3 $ reverse $ show $ x []) ++ + "}" + show (NullContainer) = "NullContainer" + +class Reducible a where + (<++>) :: a -> a -> [a] + container :: a -> Container a + innards :: a -> [a] + isSpace :: a -> Bool + +(<+++>) :: (Reducible a) => Many a -> Many a -> Many a +mr <+++> ms = fromList $ reduceList $ toList mr ++ toList ms + +reduceListB :: (Reducible a) => Many a -> Many a +reduceListB = fromList . reduceList . toList + +reduceList' :: (Reducible a) => [a] -> [a] -> [a] +reduceList' acc [] = acc +reduceList' [] (x:xs) = reduceList' [x] xs +reduceList' as (x:xs) = reduceList' (init as ++ (last as <++> x) ) xs + +reduceList :: (Reducible a) => [a] -> [a] +reduceList = reduceList' [] + +combineReducibles :: (Reducible a, Eq a) => a -> a -> [a] +combineReducibles r s = + let (conts, rs) = topLevelContainers r + (conts', ss) = topLevelContainers s + shared = conts `intersect` conts' + remaining = conts \\ shared + remaining' = conts' \\ shared + in + case null shared of + True | (x : xs) <- reverse rs + , isSpace x -> + rebuild conts (reverse xs) ++ [x, s] + | (x : xs) <- ss + , isSpace x -> + [r, x] ++ rebuild conts' (xs) + True -> [r,s] + False -> rebuild + shared $ + reduceList $ + (rebuild remaining rs) ++ (rebuild remaining' ss) + +instance Reducible Inline where + s1@(Span (id1, classes1, kvs1) ils1) <++> s2@(Span (id2, classes2, kvs2) ils2) = + let classes' = classes1 `intersect` classes2 + kvs' = kvs1 `intersect` kvs2 + classes1' = classes1 \\ classes' + kvs1' = kvs1 \\ kvs' + classes2' = classes2 \\ classes' + kvs2' = kvs2 \\ kvs' + in + case null classes' && null kvs' of + True -> [s1,s2] + False -> let attr' = ("", classes', kvs') + attr1' = (id1, classes1', kvs1') + attr2' = (id2, classes2', kvs2') + s1' = case null classes1' && null kvs1' of + True -> ils1 + False -> [Span attr1' ils1] + s2' = case null classes2' && null kvs2' of + True -> ils2 + False -> [Span attr2' ils2] + in + [Span attr' $ reduceList $ s1' ++ s2'] + + (Str x) <++> (Str y) = [Str (x++y)] + il <++> il' = combineReducibles il il' + + container (Emph _) = Container Emph + container (Strong _) = Container Strong + container (Strikeout _) = Container Strikeout + container (Subscript _) = Container Subscript + container (Superscript _) = Container Superscript + container (Quoted qt _) = Container $ Quoted qt + container (Cite cs _) = Container $ Cite cs + container (Span attr _) = Container $ Span attr + container _ = NullContainer + + innards (Emph ils) = ils + innards (Strong ils) = ils + innards (Strikeout ils) = ils + innards (Subscript ils) = ils + innards (Superscript ils) = ils + innards (Quoted _ ils) = ils + innards (Cite _ ils) = ils + innards (Span _ ils) = ils + innards _ = [] + + isSpace Space = True + isSpace _ = False + +instance Reducible Block where + (Div (ident, classes, kvs) blks) <++> blk | "list-item" `elem` classes = + [Div (ident, classes, kvs) (reduceList blks), blk] + + blk <++> blk' = combineReducibles blk blk' + + container (BlockQuote _) = Container BlockQuote + container (Div attr _) = Container $ Div attr + container _ = NullContainer + + innards (BlockQuote bs) = bs + innards (Div _ bs) = bs + innards _ = [] + + isSpace _ = False + + +topLevelContainers' :: (Reducible a) => [a] -> ([Container a], [a]) +topLevelContainers' (r : []) = case container r of + NullContainer -> ([], [r]) + _ -> + let (conts, inns) = topLevelContainers' (innards r) + in + ((container r) : conts, inns) +topLevelContainers' rs = ([], rs) + +topLevelContainers :: (Reducible a) => a -> ([Container a], [a]) +topLevelContainers il = topLevelContainers' [il] + +rebuild :: [Container a] -> [a] -> [a] +rebuild [] xs = xs +rebuild ((Container f) : cs) xs = rebuild cs $ [f xs] +rebuild (NullContainer : cs) xs = rebuild cs $ xs diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs new file mode 100644 index 000000000..f900c0adc --- /dev/null +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -0,0 +1,285 @@ +{-# LANGUAGE + ViewPatterns + , StandaloneDeriving + , TupleSections + , FlexibleContexts #-} + +module Text.Pandoc.Readers.EPUB + (readEPUB) + where + +import Text.XML.Light +import Text.Pandoc.Definition hiding (Attr) +import Text.Pandoc.Walk (walk, query) +import Text.Pandoc.Generic(bottomUp) +import Text.Pandoc.Readers.HTML (readHtml) +import Text.Pandoc.Options ( ReaderOptions(..), readerTrace) +import Text.Pandoc.Shared (escapeURI, collapseFilePath, addMetaField) +import Text.Pandoc.MediaBag (MediaBag, insertMedia) +import Text.Pandoc.Compat.Except (MonadError, throwError, runExcept, Except) +import qualified Text.Pandoc.Builder as B +import Codec.Archive.Zip ( Archive (..), toArchive, fromEntry + , findEntryByPath, Entry) +import qualified Data.ByteString.Lazy as BL (ByteString) +import System.FilePath ( takeFileName, (</>), dropFileName, normalise + , dropFileName + , splitFileName ) +import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy) +import Control.Applicative ((<$>)) +import Control.Monad (guard, liftM, when) +import Data.Monoid (mempty, (<>)) +import Data.List (isPrefixOf, isInfixOf) +import Data.Maybe (mapMaybe, fromMaybe) +import qualified Data.Map as M (Map, lookup, fromList, elems) +import Control.DeepSeq.Generics (deepseq, NFData) + +import Debug.Trace (trace) + +type MIME = String + +type Items = M.Map String (FilePath, MIME) + +readEPUB :: ReaderOptions -> BL.ByteString -> (Pandoc, MediaBag) +readEPUB opts bytes = runEPUB (archiveToEPUB opts $ toArchive bytes) + +runEPUB :: Except String a -> a +runEPUB = either error id . runExcept + +-- Note that internal reference are aggresively normalised so that all ids +-- are of the form "filename#id" +-- +archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag) +archiveToEPUB os archive = do + -- root is path to folder with manifest file in + (root, content) <- getManifest archive + meta <- parseMeta content + (cover, items) <- parseManifest content + -- No need to collapse here as the image path is from the manifest file + let coverDoc = fromMaybe mempty (imageToPandoc <$> cover) + spine <- parseSpine items content + let escapedSpine = map (escapeURI . takeFileName . fst) spine + Pandoc _ bs <- + foldM' (\a b -> ((a <>) . bottomUp (prependHash escapedSpine)) + `liftM` parseSpineElem root b) mempty spine + let ast = coverDoc <> (Pandoc meta bs) + let mediaBag = fetchImages (M.elems items) root archive ast + return $ (ast, mediaBag) + where + os' = os {readerParseRaw = True} + parseSpineElem :: MonadError String m => FilePath -> (FilePath, MIME) -> m Pandoc + parseSpineElem (normalise -> r) (normalise -> path, mime) = do + when (readerTrace os) (traceM path) + doc <- mimeToReader mime r path + let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty + return $ docSpan <> doc + mimeToReader :: MonadError String m => MIME -> FilePath -> FilePath -> m Pandoc + mimeToReader "application/xhtml+xml" (normalise -> root) (normalise -> path) = do + fname <- findEntryByPathE (root </> path) archive + return $ fixInternalReferences path . + readHtml os' . + UTF8.toStringLazy $ + fromEntry fname + mimeToReader s _ path + | s `elem` imageMimes = return $ imageToPandoc path + | otherwise = return $ mempty + +-- paths should be absolute when this function is called +-- renameImages should do this +fetchImages :: [(FilePath, MIME)] + -> FilePath -- ^ Root + -> Archive + -> Pandoc + -> MediaBag +fetchImages mimes root arc (query iq -> links) = + foldr (uncurry3 insertMedia) mempty + (mapMaybe getEntry links) + where + getEntry link = + let abslink = root </> link in + (link , lookup link mimes, ) . fromEntry + <$> findEntryByPath abslink arc + +iq :: Inline -> [FilePath] +iq (Image _ (url, _)) = [url] +iq _ = [] + +-- Remove relative paths +renameImages :: FilePath -> Inline -> Inline +renameImages root (Image a (url, b)) = Image a (collapseFilePath (root </> url), b) +renameImages _ x = x + +imageToPandoc :: FilePath -> Pandoc +imageToPandoc s = B.doc . B.para $ B.image s "" mempty + +imageMimes :: [String] +imageMimes = ["image/gif", "image/jpeg", "image/png"] + +type CoverImage = FilePath + +parseManifest :: (MonadError String m) => Element -> m (Maybe CoverImage, Items) +parseManifest content = do + manifest <- findElementE (dfName "manifest") content + let items = findChildren (dfName "item") manifest + r <- mapM parseItem items + let cover = findAttr (emptyName "href") =<< filterChild findCover manifest + return (cover, (M.fromList r)) + where + findCover e = maybe False (isInfixOf "cover-image") + (findAttr (emptyName "properties") e) + parseItem e = do + uid <- findAttrE (emptyName "id") e + href <- findAttrE (emptyName "href") e + mime <- findAttrE (emptyName "media-type") e + return (uid, (href, mime)) + +parseSpine :: MonadError String m => Items -> Element -> m [(FilePath, MIME)] +parseSpine is e = do + spine <- findElementE (dfName "spine") e + let itemRefs = findChildren (dfName "itemref") spine + mapM (mkE "parseSpine" . (flip M.lookup is)) $ mapMaybe parseItemRef itemRefs + where + parseItemRef ref = do + let linear = maybe True (== "yes") (findAttr (emptyName "linear") ref) + guard linear + findAttr (emptyName "idref") ref + +parseMeta :: MonadError String m => Element -> m Meta +parseMeta content = do + meta <- findElementE (dfName "metadata") content + let dcspace (QName _ (Just "http://purl.org/dc/elements/1.1/") (Just "dc")) = True + dcspace _ = False + let dcs = filterChildrenName dcspace meta + let r = foldr parseMetaItem nullMeta dcs + return r + +-- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem +parseMetaItem :: Element -> Meta -> Meta +parseMetaItem e@(stripNamespace . elName -> field) meta = + addMetaField (renameMeta field) (B.str $ strContent e) meta + +renameMeta :: String -> String +renameMeta "creator" = "author" +renameMeta s = s + +getManifest :: MonadError String m => Archive -> m (String, Element) +getManifest archive = do + metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive + docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry + let namespaces = mapMaybe attrToNSPair (elAttribs docElem) + ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces) + as <- liftM ((map attrToPair) . elAttribs) + (findElementE (QName "rootfile" (Just ns) Nothing) docElem) + manifestFile <- mkE "Root not found" (lookup "full-path" as) + let rootdir = dropFileName manifestFile + --mime <- lookup "media-type" as + manifest <- findEntryByPathE manifestFile archive + liftM ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest) + +-- Fixup + +fixInternalReferences :: FilePath -> Pandoc -> Pandoc +fixInternalReferences pathToFile = + (walk $ renameImages root) + . (walk normalisePath) + . (walk $ fixBlockIRs filename) + . (walk $ fixInlineIRs filename) + where + (root, escapeURI -> filename) = splitFileName pathToFile + +fixInlineIRs :: String -> Inline -> Inline +fixInlineIRs s (Span as v) = + Span (fixAttrs s as) v +fixInlineIRs s (Code as code) = + Code (fixAttrs s as) code +fixInlineIRs s (Link t ('#':url, tit)) = + Link t (addHash s url, tit) +fixInlineIRs _ v = v + +normalisePath :: Inline -> Inline +normalisePath (Link t (url, tit)) = + let (path, uid) = span (/= '#') url in + Link t (takeFileName path ++ uid, tit) +normalisePath s = s + +prependHash :: [String] -> Inline -> Inline +prependHash ps l@(Link is (url, tit)) + | or [s `isPrefixOf` url | s <- ps] = + Link is ('#':url, tit) + | otherwise = l +prependHash _ i = i + +fixBlockIRs :: String -> Block -> Block +fixBlockIRs s (Div as b) = + Div (fixAttrs s as) b +fixBlockIRs s (Header i as b) = + Header i (fixAttrs s as) b +fixBlockIRs s (CodeBlock as code) = + CodeBlock (fixAttrs s as) code +fixBlockIRs _ b = b + +fixAttrs :: FilePath -> B.Attr -> B.Attr +fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . null) cs, removeEPUBAttrs kvs) + +addHash :: String -> String -> String +addHash _ "" = "" +addHash s ident = s ++ "#" ++ ident + +removeEPUBAttrs :: [(String, String)] -> [(String, String)] +removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs + +isEPUBAttr :: (String, String) -> Bool +isEPUBAttr (k, _) = "epub:" `isPrefixOf` k + +-- Library + +-- Strict version of foldM +foldM' :: (Monad m, NFData a) => (a -> b -> m a) -> a -> [b] -> m a +foldM' _ z [] = return z +foldM' f z (x:xs) = do + z' <- f z x + z' `deepseq` foldM' f z' xs + +uncurry3 :: (a -> b -> c -> d) -> (a, b, c) -> d +uncurry3 f (a, b, c) = f a b c + +traceM :: Monad m => String -> m () +traceM = flip trace (return ()) + +-- Utility + +stripNamespace :: QName -> String +stripNamespace (QName v _ _) = v + +attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val) +attrToNSPair _ = Nothing + +attrToPair :: Attr -> (String, String) +attrToPair (Attr (QName name _ _) val) = (name, val) + +defaultNameSpace :: Maybe String +defaultNameSpace = Just "http://www.idpf.org/2007/opf" + +dfName :: String -> QName +dfName s = QName s defaultNameSpace Nothing + +emptyName :: String -> QName +emptyName s = QName s Nothing Nothing + +-- Convert Maybe interface to Either + +findAttrE :: MonadError String m => QName -> Element -> m String +findAttrE q e = mkE "findAttr" $ findAttr q e + +findEntryByPathE :: MonadError String m => FilePath -> Archive -> m Entry +findEntryByPathE (normalise -> path) a = + mkE ("No entry on path: " ++ path) $ findEntryByPath path a + +parseXMLDocE :: MonadError String m => String -> m Element +parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc + +findElementE :: MonadError String m => QName -> Element -> m Element +findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x + +mkE :: MonadError String m => String -> Maybe a -> m a +mkE s = maybe (throwError s) return diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 552e8a251..1789b865f 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses #-} {- Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> @@ -40,41 +41,69 @@ import Text.HTML.TagSoup import Text.HTML.TagSoup.Match import Text.Pandoc.Definition import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Builder (Blocks, Inlines, trimInlines) -import Text.Pandoc.Shared -import Text.Pandoc.Options -import Text.Pandoc.Parsing -import Data.Maybe ( fromMaybe, isJust ) -import Data.List ( intercalate ) +import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..)) +import Text.Pandoc.Shared ( extractSpaces, renderTags' + , escapeURI, safeRead ) +import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace) + , Extension (Ext_epub_html_exts, + Ext_native_divs, Ext_native_spans)) +import Text.Pandoc.Parsing hiding ((<|>)) +import Text.Pandoc.Walk +import Data.Maybe ( fromMaybe, isJust) +import Data.List ( intercalate, isInfixOf ) import Data.Char ( isDigit ) -import Control.Monad ( liftM, guard, when, mzero ) -import Control.Applicative ( (<$>), (<$), (<*) ) -import Data.Monoid +import Control.Monad ( liftM, guard, when, mzero, void, unless ) +import Control.Arrow ((***)) +import Control.Applicative ( (<$>), (<$), (<*), (*>), (<|>)) +import Data.Monoid (mconcat, Monoid, mempty, (<>), First (..)) import Text.Printf (printf) import Debug.Trace (trace) +import Text.TeXMath (readMathML, writeTeX) +import Data.Default (Default (..), def) +import Control.Monad.Reader (Reader,ask, asks, local, runReader) -isSpace :: Char -> Bool -isSpace ' ' = True -isSpace '\t' = True -isSpace '\n' = True -isSpace _ = False -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse (assumes @'\n'@ line endings) -> Pandoc readHtml opts inp = - case runParser parseDoc def{ stateOptions = opts } "source" tags of + case flip runReader def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } []) "source" tags of Left err' -> error $ "\nError at " ++ show err' Right result -> result - where tags = canonicalizeTags $ + where tags = stripPrefixes . canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } inp parseDoc = do blocks <- (fixPlains False) . mconcat <$> manyTill block eof - meta <- stateMeta <$> getState - return $ Pandoc meta (B.toList blocks) + meta <- stateMeta . parserState <$> getState + bs' <- replaceNotes (B.toList blocks) + return $ Pandoc meta bs' + +replaceNotes :: [Block] -> TagParser [Block] +replaceNotes = walkM replaceNotes' + +replaceNotes' :: Inline -> TagParser Inline +replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes + where + getNotes = noteTable <$> getState +replaceNotes' x = return x -type TagParser = Parser [Tag String] ParserState +data HTMLState = + HTMLState + { parserState :: ParserState, + noteTable :: [(String, Blocks)] + } + +data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext + , inChapter :: Bool -- ^ Set if in chapter section + } + +setInChapter :: HTMLParser s a -> HTMLParser s a +setInChapter = local (\s -> s {inChapter = True}) + +type HTMLParser s = ParserT s HTMLState (Reader HTMLLocal) + +type TagParser = HTMLParser [Tag String] pBody :: TagParser Blocks pBody = pInTags "body" block @@ -98,7 +127,11 @@ block = do tr <- getOption readerTrace pos <- getPosition res <- choice - [ pPara + [ eSection + , eSwitch B.para block + , mempty <$ eFootnote + , mempty <$ eTOC + , pPara , pHeader , pBlockQuote , pCodeBlock @@ -115,6 +148,63 @@ block = do (take 60 $ show $ B.toList res)) (return ()) return res +namespaces :: [(String, TagParser Inlines)] +namespaces = [(mathMLNamespace, pMath True)] + +mathMLNamespace :: String +mathMLNamespace = "http://www.w3.org/1998/Math/MathML" + +eSwitch :: Monoid a => (Inlines -> a) -> TagParser a -> TagParser a +eSwitch constructor parser = try $ do + guardEnabled Ext_epub_html_exts + pSatisfy (~== TagOpen "switch" []) + cases <- getFirst . mconcat <$> + manyTill (First <$> (eCase <* skipMany pBlank) ) + (lookAhead $ try $ pSatisfy (~== TagOpen "default" [])) + skipMany pBlank + fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank) + skipMany pBlank + pSatisfy (~== TagClose "switch") + return $ maybe fallback constructor cases + +eCase :: TagParser (Maybe Inlines) +eCase = do + skipMany pBlank + TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" []) + case (flip lookup namespaces) =<< lookup "required-namespace" attr of + Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank)) + Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case")) + +eFootnote :: TagParser () +eFootnote = try $ do + let notes = ["footnote", "rearnote"] + guardEnabled Ext_epub_html_exts + (TagOpen tag attr) <- lookAhead $ pAnyTag + guard (maybe False (flip elem notes) (lookup "type" attr)) + let ident = fromMaybe "" (lookup "id" attr) + content <- pInTags tag block + addNote ident content + +addNote :: String -> Blocks -> TagParser () +addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)}) + +eNoteref :: TagParser Inlines +eNoteref = try $ do + guardEnabled Ext_epub_html_exts + TagOpen tag attr <- lookAhead $ pAnyTag + guard (maybe False (== "noteref") (lookup "type" attr)) + let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr) + guard (not (null ident)) + pInTags tag block + return $ B.rawInline "noteref" ident + +-- Strip TOC if there is one, better to generate again +eTOC :: TagParser () +eTOC = try $ do + guardEnabled Ext_epub_html_exts + (TagOpen tag attr) <- lookAhead $ pAnyTag + guard (maybe False (== "toc") (lookup "type" attr)) + void (pInTags tag block) pList :: TagParser Blocks pList = pBulletList <|> pOrderedList <|> pDefinitionList @@ -128,9 +218,15 @@ pBulletList = try $ do -- note: if they have an <ol> or <ul> not in scope of a <li>, -- treat it as a list item, though it's not valid xhtml... skipMany nonItem - items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul") + items <- manyTill (pListItem nonItem) (pCloses "ul") return $ B.bulletList $ map (fixPlains True) items +pListItem :: TagParser a -> TagParser Blocks +pListItem nonItem = do + TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" []) + let liDiv = maybe mempty (\x -> B.divWith (x, [], []) mempty) (lookup "id" attr) + (liDiv <>) <$> pInTags "li" block <* skipMany nonItem + pOrderedList :: TagParser Blocks pOrderedList = try $ do TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" []) @@ -156,7 +252,7 @@ pOrderedList = try $ do -- note: if they have an <ol> or <ul> not in scope of a <li>, -- treat it as a list item, though it's not valid xhtml... skipMany nonItem - items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol") + items <- manyTill (pListItem nonItem) (pCloses "ol") return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items pDefinitionList :: TagParser Blocks @@ -194,14 +290,14 @@ fixPlains inList bs = if any isParaish bs' pRawTag :: TagParser String pRawTag = do tag <- pAnyTag - let ignorable x = x `elem` ["html","head","body"] + let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"] if tagOpen ignorable (const True) tag || tagClose ignorable tag then return [] else return $ renderTags' [tag] pDiv :: TagParser Blocks pDiv = try $ do - getOption readerParseRaw >>= guard + guardEnabled Ext_native_divs TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="div") (const True) contents <- pInTags "div" block return $ B.divWith (mkAttr attr) contents @@ -220,13 +316,35 @@ pHtmlBlock t = try $ do contents <- manyTill pAnyTag (pSatisfy (~== TagClose t)) return $ renderTags' $ [open] ++ contents ++ [TagClose t] +-- Sets chapter context +eSection :: TagParser Blocks +eSection = try $ do + let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as) + let sectTag = tagOpen (`elem` sectioningContent) matchChapter + TagOpen tag _ <- lookAhead $ pSatisfy sectTag + setInChapter (pInTags tag block) + +headerLevel :: String -> TagParser Int +headerLevel tagtype = do + let level = read (drop 1 tagtype) + (try $ do + guardEnabled Ext_epub_html_exts + asks inChapter >>= guard + return (level - 1)) + <|> + return level + + + + + pHeader :: TagParser Blocks pHeader = try $ do TagOpen tagtype attr <- pSatisfy $ tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) (const True) let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")] - let level = read (drop 1 tagtype) + level <- headerLevel tagtype contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) let ident = fromMaybe "" $ lookup "id" attr let classes = maybe [] words $ lookup "class" attr @@ -244,7 +362,7 @@ pTable :: TagParser Blocks pTable = try $ do TagOpen _ _ <- pSatisfy (~== TagOpen "table" []) skipMany pBlank - caption <- option mempty $ pInTags "caption" inline >>~ skipMany pBlank + caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank -- TODO actually read these and take width information from them widths' <- pColgroup <|> many pCol head' <- option [] $ pOptInTag "thead" $ pInTags "tr" (pCell "th") @@ -326,7 +444,9 @@ pCodeBlock = try $ do inline :: TagParser Inlines inline = choice - [ pTagText + [ eNoteref + , eSwitch id inline + , pTagText , pQ , pEmph , pStrong @@ -338,6 +458,7 @@ inline = choice , pImage , pCode , pSpan + , pMath False , pRawHtmlInline ] @@ -366,8 +487,8 @@ pSelfClosing f g = do pQ :: TagParser Inlines pQ = do - quoteContext <- stateQuoteContext `fmap` getState - let quoteType = case quoteContext of + context <- asks quoteContext + let quoteType = case context of InDoubleQuote -> SingleQuote _ -> DoubleQuote let innerQuoteContext = if quoteType == SingleQuote @@ -406,12 +527,24 @@ pLineBreak = do return B.linebreak pLink :: TagParser Inlines -pLink = try $ do +pLink = pRelLink <|> pAnchor + +pAnchor :: TagParser Inlines +pAnchor = try $ do + tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "id")) + return $ B.spanWith (fromAttrib "id" tag , [], []) mempty + +pRelLink :: TagParser Inlines +pRelLink = try $ do tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href")) let url = fromAttrib "href" tag let title = fromAttrib "title" tag + let uid = fromAttrib "id" tag + let spanC = case uid of + [] -> id + s -> B.spanWith (s, [], []) lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a") - return $ B.link (escapeURI url) title lab + return $ spanC $ B.link (escapeURI url) title lab pImage :: TagParser Inlines pImage = do @@ -429,7 +562,7 @@ pCode = try $ do pSpan :: TagParser Inlines pSpan = try $ do - getOption readerParseRaw >>= guard + guardEnabled Ext_native_spans TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) contents <- pInTags "span" inline return $ B.spanWith (mkAttr attr) contents @@ -442,6 +575,22 @@ pRawHtmlInline = do then return $ B.rawInline "html" $ renderTags' [result] else return mempty +mathMLToTeXMath :: String -> Either String String +mathMLToTeXMath s = writeTeX <$> readMathML s + +pMath :: Bool -> TagParser Inlines +pMath inCase = try $ do + open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True) + unless (inCase) (guard (maybe False (== mathMLNamespace) (lookup "xmlns" attr))) + contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math")) + let math = mathMLToTeXMath $ + (renderTags $ [open] ++ contents ++ [TagClose "math"]) + let constructor = + maybe B.math (\x -> if (x == "inline") then B.math else B.displayMath) + (lookup "display" attr) + return $ either (const mempty) + (\x -> if null x then mempty else constructor x) math + pInlinesInTags :: String -> (Inlines -> Inlines) -> TagParser Inlines pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline @@ -479,7 +628,8 @@ pTagText :: TagParser Inlines pTagText = try $ do (TagText str) <- pSatisfy isTagText st <- getState - case runParser (many pTagContents) st "text" str of + qu <- ask + case flip runReader qu $ runParserT (many pTagContents) st "text" str of Left _ -> fail $ "Could not parse `" ++ str ++ "'" Right result -> return $ mconcat result @@ -488,7 +638,9 @@ pBlank = try $ do (TagText str) <- pSatisfy isTagText guard $ all isSpace str -pTagContents :: Parser [Char] ParserState Inlines +type InlinesParser = HTMLParser String + +pTagContents :: InlinesParser Inlines pTagContents = B.displayMath <$> mathDisplay <|> B.math <$> mathInline @@ -498,12 +650,11 @@ pTagContents = <|> pSymbol <|> pBad -pStr :: Parser [Char] ParserState Inlines +pStr :: InlinesParser Inlines pStr = do result <- many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) && not (isBad c) - pos <- getPosition - updateState $ \s -> s{ stateLastStrPos = Just pos } + updateLastStrPos return $ B.str result isSpecial :: Char -> Bool @@ -518,13 +669,13 @@ isSpecial '\8220' = True isSpecial '\8221' = True isSpecial _ = False -pSymbol :: Parser [Char] ParserState Inlines +pSymbol :: InlinesParser Inlines pSymbol = satisfy isSpecial >>= return . B.str . (:[]) isBad :: Char -> Bool isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML -pBad :: Parser [Char] ParserState Inlines +pBad :: InlinesParser Inlines pBad = do c <- satisfy isBad let c' = case c of @@ -558,7 +709,7 @@ pBad = do _ -> '?' return $ B.str [c'] -pSpace :: Parser [Char] ParserState Inlines +pSpace :: InlinesParser Inlines pSpace = many1 (satisfy isSpace) >> return B.space -- @@ -566,8 +717,10 @@ pSpace = many1 (satisfy isSpace) >> return B.space -- eitherBlockOrInline :: [String] -eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins", - "map", "area", "object"] +eitherBlockOrInline = ["audio", "applet", "button", "iframe", + "del", "ins", + "progress", "map", "area", "noscript", "script", + "object", "svg", "video", "source"] {- inlineHtmlTags :: [[Char]] @@ -579,15 +732,17 @@ inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", -} blockHtmlTags :: [String] -blockHtmlTags = ["address", "article", "aside", "blockquote", "body", "button", "canvas", +blockHtmlTags = ["?xml", "!DOCTYPE", "address", "article", "aside", + "blockquote", "body", "button", "canvas", "caption", "center", "col", "colgroup", "dd", "dir", "div", - "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", - "form", "h1", "h2", "h3", "h4", - "h5", "h6", "head", "header", "hgroup", "hr", "html", "isindex", "map", "menu", - "noframes", "noscript", "object", "ol", "output", "p", "pre", "progress", - "section", "table", "tbody", "textarea", "thead", "tfoot", "ul", "dd", + "dl", "dt", "embed", "fieldset", "figcaption", "figure", + "footer", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "head", "header", "hgroup", "hr", "html", + "isindex", "menu", "noframes", "ol", "output", "p", "pre", + "section", "table", "tbody", "textarea", + "thead", "tfoot", "ul", "dd", "dt", "frameset", "li", "tbody", "td", "tfoot", - "th", "thead", "tr", "script", "style", "svg", "video"] + "th", "thead", "tr", "script", "style"] -- We want to allow raw docbook in markdown documents, so we -- include docbook block tags here too. @@ -605,8 +760,11 @@ blockDocBookTags = ["calloutlist", "bibliolist", "glosslist", "itemizedlist", "classsynopsis", "blockquote", "epigraph", "msgset", "sidebar", "title"] +epubTags :: [String] +epubTags = ["case", "switch", "default"] + blockTags :: [String] -blockTags = blockHtmlTags ++ blockDocBookTags +blockTags = blockHtmlTags ++ blockDocBookTags ++ epubTags isInlineTag :: Tag String -> Bool isInlineTag t = tagOpen isInlineTagName (const True) t || @@ -670,19 +828,23 @@ _ `closes` _ = False --- parsers for use in markdown, textile readers -- | Matches a stretch of HTML in balanced tags. -htmlInBalanced :: (Tag String -> Bool) -> Parser [Char] ParserState String +htmlInBalanced :: (Monad m) + => (Tag String -> Bool) + -> ParserT String st m String htmlInBalanced f = try $ do (TagOpen t _, tag) <- htmlTag f guard $ '/' `notElem` tag -- not a self-closing tag let stopper = htmlTag (~== TagClose t) - let anytag = liftM snd $ htmlTag (const True) + let anytag = snd <$> htmlTag (const True) contents <- many $ notFollowedBy' stopper >> (htmlInBalanced f <|> anytag <|> count 1 anyChar) endtag <- liftM snd stopper return $ tag ++ concat contents ++ endtag -- | Matches a tag meeting a certain condition. -htmlTag :: (Tag String -> Bool) -> Parser [Char] st (Tag String, String) +htmlTag :: Monad m + => (Tag String -> Bool) + -> ParserT [Char] st m (Tag String, String) htmlTag f = try $ do lookAhead $ char '<' >> (oneOf "/!?" <|> letter) (next : _) <- getInput >>= return . canonicalizeTags . parseTags @@ -701,7 +863,78 @@ htmlTag f = try $ do mkAttr :: [(String, String)] -> Attr mkAttr attr = (attribsId, attribsClasses, attribsKV) where attribsId = fromMaybe "" $ lookup "id" attr - attribsClasses = words $ fromMaybe "" $ lookup "class" attr + attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr + epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr + +-- Strip namespace prefixes +stripPrefixes :: [Tag String] -> [Tag String] +stripPrefixes = map stripPrefix + +stripPrefix :: Tag String -> Tag String +stripPrefix (TagOpen s as) = + TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as) +stripPrefix (TagClose s) = TagClose (stripPrefix' s) +stripPrefix x = x + +stripPrefix' :: String -> String +stripPrefix' s = + case span (/= ':') s of + (_, "") -> s + (_, (_:ts)) -> ts + +isSpace :: Char -> Bool +isSpace ' ' = True +isSpace '\t' = True +isSpace '\n' = True +isSpace '\r' = True +isSpace _ = False + +-- Instances +-- This signature should be more general +-- MonadReader HTMLLocal m => HasQuoteContext st m +instance HasQuoteContext st (Reader HTMLLocal) where + getQuoteContext = asks quoteContext + withQuoteContext q = local (\s -> s{quoteContext = q}) +instance HasReaderOptions HTMLState where + extractReaderOptions = extractReaderOptions . parserState + +instance Default HTMLState where + def = HTMLState def [] + +instance HasMeta HTMLState where + setMeta s b st = st {parserState = setMeta s b $ parserState st} + deleteMeta s st = st {parserState = deleteMeta s $ parserState st} + +instance Default HTMLLocal where + def = HTMLLocal NoQuote False + +instance HasLastStrPosition HTMLState where + setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} + getLastStrPos = getLastStrPos . parserState + + +-- EPUB Specific +-- +-- +sectioningContent :: [String] +sectioningContent = ["article", "aside", "nav", "section"] + +{- +groupingContent :: [String] +groupingContent = ["p", "hr", "pre", "blockquote", "ol" + , "ul", "li", "dl", "dt", "dt", "dd" + , "figure", "figcaption", "div", "main"] + + + +types :: [(String, ([String], Int))] +types = -- Document divisions + map (\s -> (s, (["section", "body"], 0))) + ["volume", "part", "chapter", "division"] + ++ -- Document section and components + [ + ("abstract", ([], 0))] +-} diff --git a/src/Text/Pandoc/Readers/Haddock.hs b/src/Text/Pandoc/Readers/Haddock.hs index a3dfb7c3c..4b46c869d 100644 --- a/src/Text/Pandoc/Readers/Haddock.hs +++ b/src/Text/Pandoc/Readers/Haddock.hs @@ -43,11 +43,8 @@ docHToBlocks d' = (docHToInlines False $ headerTitle h) DocAppend d1 d2 -> mappend (docHToBlocks d1) (docHToBlocks d2) DocString _ -> inlineFallback - DocParagraph (DocHeader h) -> docHToBlocks (DocHeader h) DocParagraph (DocAName h) -> B.plain $ docHToInlines False $ DocAName h - DocParagraph x -> let (ils, rest) = getInlines x - in (B.para $ docHToInlines False ils) - <> docHToBlocks rest + DocParagraph x -> B.para $ docHToInlines False x DocIdentifier _ -> inlineFallback DocIdentifierUnchecked _ -> inlineFallback DocModule s -> B.plain $ docHToInlines False $ DocModule s @@ -115,40 +112,6 @@ docHToInlines isCode d' = DocProperty _ -> mempty DocExamples _ -> mempty -getInlines :: DocH String Identifier -> (DocH String Identifier, DocH String Identifier) -getInlines (DocAppend x y) = if isInline x - then let (a, b) = getInlines y - in (DocAppend x a, b) - else (DocEmpty, DocAppend x y) -getInlines x = if isInline x - then (x, DocEmpty) - else (DocEmpty, x) - -isInline :: DocH String Identifier -> Bool -isInline d' = - case d' of - DocEmpty -> True - DocAppend d1 _ -> isInline d1 - DocString _ -> True - DocParagraph _ -> False - DocIdentifier _ -> True - DocIdentifierUnchecked _ -> True - DocModule _ -> True - DocWarning _ -> True - DocEmphasis _ -> True - DocMonospaced _ -> True - DocBold _ -> True - DocHeader _ -> False - DocUnorderedList _ -> False - DocOrderedList _ -> False - DocDefList _ -> False - DocCodeBlock _ -> False - DocHyperlink _ -> True - DocPic _ -> True - DocAName _ -> True - DocProperty _ -> False - DocExamples _ -> False - -- | Create an 'Example', stripping superfluous characters as appropriate makeExample :: String -> String -> [String] -> Blocks makeExample prompt expression result = @@ -173,4 +136,3 @@ makeExample prompt expression result = substituteBlankLine "<BLANKLINE>" = "" substituteBlankLine line = line coder = B.codeWith ([], ["result"], []) - diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 97bfaa455..9f51e9a8f 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -41,7 +41,6 @@ import Text.Pandoc.Shared import Text.Pandoc.Options import Text.Pandoc.Parsing hiding ((<|>), many, optional, space, mathDisplay, mathInline) -import Text.Parsec.Prim (ParsecT, runParserT) import qualified Text.Pandoc.UTF8 as UTF8 import Data.Char ( chr, ord ) import Control.Monad.Trans (lift) @@ -104,7 +103,7 @@ dimenarg = try $ do sp :: LP () sp = skipMany1 $ satisfy (\c -> c == ' ' || c == '\t') - <|> (try $ newline >>~ lookAhead anyChar >>~ notFollowedBy blankline) + <|> (try $ newline <* lookAhead anyChar <* notFollowedBy blankline) isLowerHex :: Char -> Bool isLowerHex x = x >= '0' && x <= '9' || x >= 'a' && x <= 'f' @@ -808,7 +807,7 @@ rawEnv name = do ---- -type IncludeParser = ParsecT [Char] [String] IO String +type IncludeParser = ParserT [Char] [String] IO String -- | Replace "include" commands with file contents. handleIncludes :: String -> IO String diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 6c710c8ff..861f81b23 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -353,8 +353,7 @@ referenceKey = try $ do notFollowedBy' referenceTitle notFollowedBy' (() <$ reference) many1 $ notFollowedBy space >> litChar - let betweenAngles = try $ char '<' >> - manyTill (escapedChar' <|> litChar) (char '>') + let betweenAngles = try $ char '<' >> manyTill litChar (char '>') src <- try betweenAngles <|> sourceURL tit <- option "" referenceTitle -- currently we just ignore MMD-style link/image attributes @@ -571,7 +570,7 @@ attributes :: MarkdownParser Attr attributes = try $ do char '{' spnl - attrs <- many (attribute >>~ spnl) + attrs <- many (attribute <* spnl) char '}' return $ foldl (\x f -> f x) nullAttr attrs @@ -688,7 +687,7 @@ birdTrackLine c = try $ do -- emailBlockQuoteStart :: MarkdownParser Char -emailBlockQuoteStart = try $ skipNonindentSpaces >> char '>' >>~ optional (char ' ') +emailBlockQuoteStart = try $ skipNonindentSpaces >> char '>' <* optional (char ' ') emailBlockQuote :: MarkdownParser [String] emailBlockQuote = try $ do @@ -752,7 +751,7 @@ listLine = try $ do notFollowedBy' (do indentSpaces many spaceChar listStart) - notFollowedBy' $ htmlTag (~== TagClose "div") + notFollowedByHtmlCloser optional (() <$ indentSpaces) chunks <- manyTill ( many1 (satisfy $ \c -> c /= '\n' && c /= '<') @@ -781,11 +780,18 @@ listContinuation = try $ do blanks <- many blankline return $ concat result ++ blanks +notFollowedByHtmlCloser :: MarkdownParser () +notFollowedByHtmlCloser = do + inHtmlBlock <- stateInHtmlBlock <$> getState + case inHtmlBlock of + Just t -> notFollowedBy' $ htmlTag (~== TagClose t) + Nothing -> return () + listContinuationLine :: MarkdownParser String listContinuationLine = try $ do notFollowedBy blankline notFollowedBy' listStart - notFollowedBy' $ htmlTag (~== TagClose "div") + notFollowedByHtmlCloser optional indentSpaces result <- anyLine return $ result ++ "\n" @@ -840,38 +846,53 @@ defListMarker = do else mzero return () -definitionListItem :: MarkdownParser (F (Inlines, [Blocks])) -definitionListItem = try $ do - -- first, see if this has any chance of being a definition list: - lookAhead (anyLine >> optional blankline >> defListMarker) - term <- trimInlinesF . mconcat <$> manyTill inline newline - optional blankline - raw <- many1 defRawBlock - state <- getState - let oldContext = stateParserContext state - -- parse the extracted block, which may contain various block elements: +definitionListItem :: Bool -> MarkdownParser (F (Inlines, [Blocks])) +definitionListItem compact = try $ do + rawLine' <- anyLine + raw <- many1 $ defRawBlock compact + term <- parseFromString (trimInlinesF . mconcat <$> many inline) rawLine' contents <- mapM (parseFromString parseBlocks) raw - updateState (\st -> st {stateParserContext = oldContext}) + optional blanklines return $ liftM2 (,) term (sequence contents) -defRawBlock :: MarkdownParser String -defRawBlock = try $ do +defRawBlock :: Bool -> MarkdownParser String +defRawBlock compact = try $ do + hasBlank <- option False $ blankline >> return True defListMarker firstline <- anyLine - rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine) - trailing <- option "" blanklines - cont <- liftM concat $ many $ do - lns <- many1 $ notFollowedBy blankline >> indentSpaces >> anyLine - trl <- option "" blanklines - return $ unlines lns ++ trl - return $ firstline ++ "\n" ++ unlines rawlines ++ trailing ++ cont + let dline = try + ( do notFollowedBy blankline + if compact -- laziness not compatible with compact + then () <$ indentSpaces + else (() <$ indentSpaces) + <|> notFollowedBy defListMarker + anyLine ) + rawlines <- many dline + cont <- liftM concat $ many $ try $ do + trailing <- option "" blanklines + ln <- indentSpaces >> notFollowedBy blankline >> anyLine + lns <- many dline + return $ trailing ++ unlines (ln:lns) + return $ trimr (firstline ++ "\n" ++ unlines rawlines ++ cont) ++ + if hasBlank || not (null cont) then "\n\n" else "" definitionList :: MarkdownParser (F Blocks) -definitionList = do - guardEnabled Ext_definition_lists - items <- fmap sequence $ many1 definitionListItem +definitionList = try $ do + lookAhead (anyLine >> optional blankline >> defListMarker) + compactDefinitionList <|> normalDefinitionList + +compactDefinitionList :: MarkdownParser (F Blocks) +compactDefinitionList = do + guardEnabled Ext_compact_definition_lists + items <- fmap sequence $ many1 $ definitionListItem True return $ B.definitionList <$> fmap compactify'DL items +normalDefinitionList :: MarkdownParser (F Blocks) +normalDefinitionList = do + guardEnabled Ext_definition_lists + items <- fmap sequence $ many1 $ definitionListItem False + return $ B.definitionList <$> items + -- -- paragraph block -- @@ -914,16 +935,34 @@ htmlElement = rawVerbatimBlock htmlBlock :: MarkdownParser (F Blocks) htmlBlock = do guardEnabled Ext_raw_html - res <- (guardEnabled Ext_markdown_in_html_blocks >> rawHtmlBlocks) - <|> htmlBlock' - return $ return $ B.rawBlock "html" res - -htmlBlock' :: MarkdownParser String + try (do + (TagOpen t attrs) <- lookAhead $ fst <$> htmlTag isBlockTag + (guard (t `elem` ["pre","style","script"]) >> + (return . B.rawBlock "html") <$> rawVerbatimBlock) + <|> (do guardEnabled Ext_markdown_attribute + oldMarkdownAttribute <- stateMarkdownAttribute <$> getState + markdownAttribute <- + case lookup "markdown" attrs of + Just "0" -> False <$ updateState (\st -> st{ + stateMarkdownAttribute = False }) + Just _ -> True <$ updateState (\st -> st{ + stateMarkdownAttribute = True }) + Nothing -> return oldMarkdownAttribute + res <- if markdownAttribute + then rawHtmlBlocks + else htmlBlock' + updateState $ \st -> st{ stateMarkdownAttribute = + oldMarkdownAttribute } + return res) + <|> (guardEnabled Ext_markdown_in_html_blocks >> rawHtmlBlocks)) + <|> htmlBlock' + +htmlBlock' :: MarkdownParser (F Blocks) htmlBlock' = try $ do first <- htmlElement - finalSpace <- many spaceChar - finalNewlines <- many newline - return $ first ++ finalSpace ++ finalNewlines + skipMany spaceChar + optional blanklines + return $ return $ B.rawBlock "html" first strictHtmlBlock :: MarkdownParser String strictHtmlBlock = htmlInBalanced (not . isInlineTag) @@ -934,48 +973,36 @@ rawVerbatimBlock = try $ do ["pre", "style", "script"]) (const True)) contents <- manyTill anyChar (htmlTag (~== TagClose tag)) - return $ open ++ contents ++ renderTags [TagClose tag] + return $ open ++ contents ++ renderTags' [TagClose tag] rawTeXBlock :: MarkdownParser (F Blocks) rawTeXBlock = do guardEnabled Ext_raw_tex - result <- (B.rawBlock "latex" <$> rawLaTeXBlock) - <|> (B.rawBlock "context" <$> rawConTeXtEnvironment) + result <- (B.rawBlock "latex" . concat <$> + rawLaTeXBlock `sepEndBy1` blankline) + <|> (B.rawBlock "context" . concat <$> + rawConTeXtEnvironment `sepEndBy1` blankline) spaces return $ return result -rawHtmlBlocks :: MarkdownParser String +rawHtmlBlocks :: MarkdownParser (F Blocks) rawHtmlBlocks = do - htmlBlocks <- many1 $ try $ do - s <- rawVerbatimBlock <|> try ( - do (t,raw) <- htmlTag isBlockTag - guard $ t ~/= TagOpen "div" [] && - t ~/= TagClose "div" - exts <- getOption readerExtensions - -- if open tag, need markdown="1" if - -- markdown_attributes extension is set - case t of - TagOpen _ as - | Ext_markdown_attribute `Set.member` - exts -> - if "markdown" `notElem` - map fst as - then mzero - else return $ - stripMarkdownAttribute raw - | otherwise -> return raw - _ -> return raw ) - sps <- do sp1 <- many spaceChar - sp2 <- option "" (blankline >> return "\n") - sp3 <- many spaceChar - sp4 <- option "" blanklines - return $ sp1 ++ sp2 ++ sp3 ++ sp4 - -- note: we want raw html to be able to - -- precede a code block, when separated - -- by a blank line - return $ s ++ sps - let combined = concat htmlBlocks - return $ if last combined == '\n' then init combined else combined + (TagOpen tagtype _, raw) <- htmlTag isBlockTag + -- try to find closing tag + -- we set stateInHtmlBlock so that closing tags that can be either block or + -- inline will not be parsed as inline tags + oldInHtmlBlock <- stateInHtmlBlock <$> getState + updateState $ \st -> st{ stateInHtmlBlock = Just tagtype } + let closer = htmlTag (\x -> x ~== TagClose tagtype) + contents <- mconcat <$> many (notFollowedBy' closer >> block) + result <- + (closer >>= \(_, rawcloser) -> return ( + return (B.rawBlock "html" $ stripMarkdownAttribute raw) <> + contents <> + return (B.rawBlock "html" rawcloser))) + <|> return (return (B.rawBlock "html" raw) <> contents) + updateState $ \st -> st{ stateInHtmlBlock = oldInHtmlBlock } + return result -- remove markdown="1" attribute stripMarkdownAttribute :: String -> String @@ -1163,7 +1190,7 @@ gridPart ch = do return (length dashes, length dashes + 1) gridDashedLines :: Char -> Parser [Char] st [(Int,Int)] -gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) >>~ blankline +gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) <* blankline removeFinalBar :: String -> String removeFinalBar = @@ -1388,8 +1415,7 @@ escapedChar = do ltSign :: MarkdownParser (F Inlines) ltSign = do guardDisabled Ext_raw_html - <|> guardDisabled Ext_markdown_in_html_blocks - <|> (notFollowedBy' (htmlTag isBlockTag) >> return ()) + <|> (notFollowedByHtmlCloser >> notFollowedBy' (htmlTag isBlockTag)) char '<' return $ return $ B.str "<" @@ -1434,52 +1460,60 @@ math = (return . B.displayMath <$> (mathDisplay >>= applyMacros')) enclosure :: Char -> MarkdownParser (F Inlines) enclosure c = do + -- we can't start an enclosure with _ if after a string and + -- the intraword_underscores extension is enabled: + guardDisabled Ext_intraword_underscores + <|> guard (c == '*') + <|> (guard =<< notAfterString) cs <- many1 (char c) (return (B.str cs) <>) <$> whitespace - <|> case length cs of + <|> do + case length cs of 3 -> three c 2 -> two c mempty 1 -> one c mempty _ -> return (return $ B.str cs) +ender :: Char -> Int -> MarkdownParser () +ender c n = try $ do + count n (char c) + guard (c == '*') + <|> guardDisabled Ext_intraword_underscores + <|> notFollowedBy alphaNum + -- Parse inlines til you hit one c or a sequence of two cs. -- If one c, emit emph and then parse two. -- If two cs, emit strong and then parse one. -- Otherwise, emit ccc then the results. three :: Char -> MarkdownParser (F Inlines) three c = do - contents <- mconcat <$> many (notFollowedBy (char c) >> inline) - (try (string [c,c,c]) >> return ((B.strong . B.emph) <$> contents)) - <|> (try (string [c,c]) >> one c (B.strong <$> contents)) - <|> (char c >> two c (B.emph <$> contents)) + contents <- mconcat <$> many (notFollowedBy (ender c 1) >> inline) + (ender c 3 >> return ((B.strong . B.emph) <$> contents)) + <|> (ender c 2 >> one c (B.strong <$> contents)) + <|> (ender c 1 >> two c (B.emph <$> contents)) <|> return (return (B.str [c,c,c]) <> contents) -- Parse inlines til you hit two c's, and emit strong. -- If you never do hit two cs, emit ** plus inlines parsed. two :: Char -> F Inlines -> MarkdownParser (F Inlines) two c prefix' = do - let ender = try $ string [c,c] - contents <- mconcat <$> many (try $ notFollowedBy ender >> inline) - (ender >> return (B.strong <$> (prefix' <> contents))) + contents <- mconcat <$> many (try $ notFollowedBy (ender c 2) >> inline) + (ender c 2 >> return (B.strong <$> (prefix' <> contents))) <|> return (return (B.str [c,c]) <> (prefix' <> contents)) -- Parse inlines til you hit a c, and emit emph. -- If you never hit a c, emit * plus inlines parsed. one :: Char -> F Inlines -> MarkdownParser (F Inlines) one c prefix' = do - contents <- mconcat <$> many ( (notFollowedBy (char c) >> inline) + contents <- mconcat <$> many ( (notFollowedBy (ender c 1) >> inline) <|> try (string [c,c] >> - notFollowedBy (char c) >> + notFollowedBy (ender c 1) >> two c mempty) ) - (char c >> return (B.emph <$> (prefix' <> contents))) + (ender c 1 >> return (B.emph <$> (prefix' <> contents))) <|> return (return (B.str [c]) <> (prefix' <> contents)) strongOrEmph :: MarkdownParser (F Inlines) -strongOrEmph = enclosure '*' <|> (checkIntraword >> enclosure '_') - where checkIntraword = do - exts <- getOption readerExtensions - when (Ext_intraword_underscores `Set.member` exts) $ do - guard =<< notAfterString +strongOrEmph = enclosure '*' <|> enclosure '_' -- | Parses a list of inlines between start and end delimiters. inlinesBetween :: (Show b) @@ -1489,7 +1523,7 @@ inlinesBetween :: (Show b) inlinesBetween start end = (trimInlinesF . mconcat) <$> try (start >> many1Till inner end) where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) - innerSpace = try $ whitespace >>~ notFollowedBy' end + innerSpace = try $ whitespace <* notFollowedBy' end strikeout :: MarkdownParser (F Inlines) strikeout = fmap B.strikeout <$> @@ -1730,7 +1764,7 @@ inBrackets parser = do spanHtml :: MarkdownParser (F Inlines) spanHtml = try $ do - guardEnabled Ext_markdown_in_html_blocks + guardEnabled Ext_native_spans (TagOpen _ attrs, _) <- htmlTag (~== TagOpen "span" []) contents <- mconcat <$> manyTill inline (htmlTag (~== TagClose "span")) let ident = fromMaybe "" $ lookup "id" attrs @@ -1745,14 +1779,19 @@ spanHtml = try $ do divHtml :: MarkdownParser (F Blocks) divHtml = try $ do - guardEnabled Ext_markdown_in_html_blocks + guardEnabled Ext_native_divs (TagOpen _ attrs, rawtag) <- htmlTag (~== TagOpen "div" []) + -- we set stateInHtmlBlock so that closing tags that can be either block or + -- inline will not be parsed as inline tags + oldInHtmlBlock <- stateInHtmlBlock <$> getState + updateState $ \st -> st{ stateInHtmlBlock = Just "div" } bls <- option "" (blankline >> option "" blanklines) contents <- mconcat <$> many (notFollowedBy' (htmlTag (~== TagClose "div")) >> block) closed <- option False (True <$ htmlTag (~== TagClose "div")) if closed then do + updateState $ \st -> st{ stateInHtmlBlock = oldInHtmlBlock } let ident = fromMaybe "" $ lookup "id" attrs let classes = maybe [] words $ lookup "class" attrs let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] @@ -1763,10 +1802,17 @@ divHtml = try $ do rawHtmlInline :: MarkdownParser (F Inlines) rawHtmlInline = do guardEnabled Ext_raw_html + inHtmlBlock <- stateInHtmlBlock <$> getState + let isCloseBlockTag t = case inHtmlBlock of + Just t' -> t ~== TagClose t' + Nothing -> False mdInHtml <- option False $ - guardEnabled Ext_markdown_in_html_blocks >> return True + ( guardEnabled Ext_markdown_in_html_blocks + <|> guardEnabled Ext_markdown_attribute + ) >> return True (_,result) <- htmlTag $ if mdInHtml - then isInlineTag + then (\x -> isInlineTag x && + not (isCloseBlockTag x)) else not . isTextTag return $ return $ B.rawInline "html" result diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs index f1dcce8f7..e43b8a86c 100644 --- a/src/Text/Pandoc/Readers/MediaWiki.hs +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -569,7 +569,8 @@ endline = () <$ try (newline <* imageIdentifiers :: [MWParser ()] imageIdentifiers = [sym (identifier ++ ":") | identifier <- identifiers] - where identifiers = ["File", "Image", "Archivo", "Datei", "Fichier"] + where identifiers = ["File", "Image", "Archivo", "Datei", "Fichier", + "Bild"] image :: MWParser Inlines image = try $ do @@ -634,7 +635,7 @@ inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines inlinesBetween start end = (trimInlines . mconcat) <$> try (start >> many1Till inner end) where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) - innerSpace = try $ whitespace >>~ notFollowedBy' end + innerSpace = try $ whitespace <* notFollowedBy' end emph :: MWParser Inlines emph = B.emph <$> nested (inlinesBetween start end) diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index 7a35e2ca0..e1c29d1e8 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -41,10 +41,10 @@ import Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF ) import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline) import Text.Pandoc.Shared (compactify', compactify'DL) -import Text.TeXMath (texMathToPandoc, DisplayType(..)) +import Text.TeXMath (readTeX, writePandoc, DisplayType(..)) import Control.Applicative ( Applicative, pure - , (<$>), (<$), (<*>), (<*), (*>), (<**>) ) + , (<$>), (<$), (<*>), (<*), (*>) ) import Control.Arrow (first) import Control.Monad (foldM, guard, liftM, liftM2, mplus, mzero, when) import Control.Monad.Reader (Reader, runReader, ask, asks) @@ -274,7 +274,7 @@ optionalAttributes parser = try $ parseBlockAttributes :: OrgParser () parseBlockAttributes = do attrs <- many attribute - () <$ mapM (uncurry parseAndAddAttribute) attrs + mapM_ (uncurry parseAndAddAttribute) attrs where attribute :: OrgParser (String, String) attribute = try $ do @@ -341,14 +341,36 @@ verseBlock blkProp = try $ do fmap B.para . mconcat . intersperse (pure B.linebreak) <$> mapM (parseFromString parseInlines) (lines content) +exportsCode :: [(String, String)] -> Bool +exportsCode attrs = not (("rundoc-exports", "none") `elem` attrs + || ("rundoc-exports", "results") `elem` attrs) + +exportsResults :: [(String, String)] -> Bool +exportsResults attrs = ("rundoc-exports", "results") `elem` attrs + || ("rundoc-exports", "both") `elem` attrs + +followingResultsBlock :: OrgParser (Maybe String) +followingResultsBlock = + optionMaybe (try $ blanklines *> stringAnyCase "#+RESULTS:" + *> blankline + *> (unlines <$> many1 exampleLine)) + codeBlock :: BlockProperties -> OrgParser (F Blocks) codeBlock blkProp = do skipSpaces - (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) - id' <- fromMaybe "" <$> lookupBlockAttribute "name" - content <- rawBlockContent blkProp - let codeBlck = B.codeBlockWith ( id', classes, kv ) content - maybe (pure codeBlck) (labelDiv codeBlck) <$> lookupInlinesAttr "caption" + (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) + id' <- fromMaybe "" <$> lookupBlockAttribute "name" + content <- rawBlockContent blkProp + resultsContent <- followingResultsBlock + let includeCode = exportsCode kv + let includeResults = exportsResults kv + let codeBlck = B.codeBlockWith ( id', classes, kv ) content + labelledBlck <- maybe (pure codeBlck) + (labelDiv codeBlck) + <$> lookupInlinesAttr "caption" + let resultBlck = pure $ maybe mempty (exampleCode) resultsContent + return $ (if includeCode then labelledBlck else mempty) + <> (if includeResults then resultBlck else mempty) where labelDiv blk value = B.divWith nullAttr <$> (mappend <$> labelledBlock value @@ -780,8 +802,12 @@ noteBlock = try $ do -- Paragraphs or Plain text paraOrPlain :: OrgParser (F Blocks) -paraOrPlain = try $ - parseInlines <**> (fmap <$> option B.plain (try $ newline *> pure B.para)) +paraOrPlain = try $ do + ils <- parseInlines + nl <- option False (newline >> return True) + try (guard nl >> notFollowedBy (orderedListStart <|> bulletListStart) >> + return (B.para <$> ils)) + <|> (return (B.plain <$> ils)) inlinesTillNewline :: OrgParser (F Inlines) inlinesTillNewline = trimInlinesF . mconcat <$> manyTill inline newline @@ -1357,7 +1383,7 @@ inlineLaTeX = try $ do maybe mzero returnF $ parseAsMath cmd `mplus` parseAsInlineLaTeX cmd where parseAsMath :: String -> Maybe Inlines - parseAsMath cs = maybeRight $ B.fromList <$> texMathToPandoc DisplayInline cs + parseAsMath cs = B.fromList <$> texMathToPandoc cs parseAsInlineLaTeX :: String -> Maybe Inlines parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs @@ -1365,6 +1391,9 @@ inlineLaTeX = try $ do state :: ParserState state = def{ stateOptions = def{ readerParseRaw = True }} + texMathToPandoc inp = (maybeRight $ readTeX inp) >>= + writePandoc DisplayInline + maybeRight :: Either a b -> Maybe b maybeRight = either (const Nothing) Just diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index fa8438e70..e5eccb116 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -47,7 +47,7 @@ import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>)) import qualified Text.Pandoc.Builder as B import Data.Monoid (mconcat, mempty) import Data.Sequence (viewr, ViewR(..)) -import Data.Char (toLower) +import Data.Char (toLower, isHexDigit) -- | Parse reStructuredText string and return Pandoc document. readRST :: ReaderOptions -- ^ Reader options @@ -460,7 +460,7 @@ listItem :: RSTParser Int listItem start = try $ do (markerLength, first) <- rawListItem start rest <- many (listContinuation markerLength) - blanks <- choice [ try (many blankline >>~ lookAhead start), + blanks <- choice [ try (many blankline <* lookAhead start), many1 blankline ] -- whole list must end with blank. -- parsing with ListItemState forces markers at beginning of lines to -- count as list item markers, even if not separated by blank space. @@ -480,7 +480,7 @@ listItem start = try $ do orderedList :: RSTParser Blocks orderedList = try $ do - (start, style, delim) <- lookAhead (anyOrderedListMarker >>~ spaceChar) + (start, style, delim) <- lookAhead (anyOrderedListMarker <* spaceChar) items <- many1 (listItem (orderedListStart style delim)) let items' = compactify' items return $ B.orderedListWith (start, style, delim) items' @@ -656,9 +656,6 @@ extractUnicodeChar s = maybe Nothing (\c -> Just (c,rest)) mbc where (ds,rest) = span isHexDigit s mbc = safeRead ('\'':'\\':'x':ds ++ "'") -isHexDigit :: Char -> Bool -isHexDigit c = c `elem` "0123456789ABCDEFabcdef" - extractCaption :: RSTParser (Inlines, Blocks) extractCaption = do capt <- trimInlines . mconcat <$> many inline @@ -747,7 +744,7 @@ simpleReferenceName = do referenceName :: RSTParser Inlines referenceName = quotedReferenceName <|> - (try $ simpleReferenceName >>~ lookAhead (char ':')) <|> + (try $ simpleReferenceName <* lookAhead (char ':')) <|> unquotedReferenceName referenceKey :: RSTParser [Char] @@ -1076,7 +1073,7 @@ explicitLink = try $ do referenceLink :: RSTParser Inlines referenceLink = try $ do - (label',ref) <- withRaw (quotedReferenceName <|> simpleReferenceName) >>~ + (label',ref) <- withRaw (quotedReferenceName <|> simpleReferenceName) <* char '_' state <- getState let keyTable = stateKeys state diff --git a/src/Text/Pandoc/Readers/TeXMath.hs b/src/Text/Pandoc/Readers/TeXMath.hs index f03eae044..3fee3051e 100644 --- a/src/Text/Pandoc/Readers/TeXMath.hs +++ b/src/Text/Pandoc/Readers/TeXMath.hs @@ -27,7 +27,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of TeX math to a list of 'Pandoc' inline elements. -} -module Text.Pandoc.Readers.TeXMath ( readTeXMath, readTeXMath' ) where +module Text.Pandoc.Readers.TeXMath ( texMathToInlines ) where import Text.Pandoc.Definition import Text.TeXMath @@ -35,22 +35,14 @@ import Text.TeXMath -- | Converts a raw TeX math formula to a list of 'Pandoc' inlines. -- Defaults to raw formula between @$@ or @$$@ characters if entire formula -- can't be converted. -readTeXMath' :: MathType +texMathToInlines :: MathType -> String -- ^ String to parse (assumes @'\n'@ line endings) -> [Inline] -readTeXMath' mt inp = case texMathToPandoc dt inp of - Left _ -> [Str (delim ++ inp ++ delim)] - Right res -> res +texMathToInlines mt inp = + case writePandoc dt `fmap` readTeX inp of + Right (Just ils) -> ils + _ -> [Str (delim ++ inp ++ delim)] where (dt, delim) = case mt of DisplayMath -> (DisplayBlock, "$$") InlineMath -> (DisplayInline, "$") -{-# DEPRECATED readTeXMath "Use readTeXMath' from Text.Pandoc.JSON instead" #-} --- | Converts a raw TeX math formula to a list of 'Pandoc' inlines. --- Defaults to raw formula between @$@ characters if entire formula --- can't be converted. (This is provided for backwards compatibility; --- it is better to use @readTeXMath'@, which properly distinguishes --- between display and inline math.) -readTeXMath :: String -- ^ String to parse (assumes @'\n'@ line endings) - -> [Inline] -readTeXMath = readTeXMath' InlineMath diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index 6d839ec1d..cd34da942 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -265,8 +265,20 @@ definitionList :: Parser [Char] ParserState Blocks definitionList = try $ B.definitionList <$> many1 definitionListItem -- | List start character. -listStart :: Parser [Char] st Char -listStart = oneOf "*#-" +listStart :: Parser [Char] ParserState () +listStart = genericListStart '*' + <|> () <$ genericListStart '#' + <|> () <$ definitionListStart + +genericListStart :: Char -> Parser [Char] st () +genericListStart c = () <$ try (many1 (char c) >> whitespace) + +definitionListStart :: Parser [Char] ParserState Inlines +definitionListStart = try $ do + char '-' + whitespace + trimInlines . mconcat <$> + many1Till inline (try (string ":=")) <* optional whitespace listInline :: Parser [Char] ParserState Inlines listInline = try (notFollowedBy newline >> inline) @@ -278,8 +290,7 @@ listInline = try (notFollowedBy newline >> inline) -- break. definitionListItem :: Parser [Char] ParserState (Inlines, [Blocks]) definitionListItem = try $ do - string "- " - term <- mconcat <$> many1Till inline (try (whitespace >> string ":=")) + term <- definitionListStart def' <- multilineDef <|> inlineDef return (term, def') where inlineDef :: Parser [Char] ParserState [Blocks] @@ -488,7 +499,7 @@ str = do return $ B.str fullStr -- | Some number of space chars -whitespace :: Parser [Char] ParserState Inlines +whitespace :: Parser [Char] st Inlines whitespace = many1 spaceChar >> return B.space <?> "whitespace" -- | In Textile, an isolated endline character is a line break @@ -616,7 +627,8 @@ simpleInline border construct = try $ do attr <- attributes body <- trimInlines . mconcat <$> withQuoteContext InSingleQuote - (manyTill inline (try border <* notFollowedBy alphaNum)) + (manyTill (notFollowedBy newline >> inline) + (try border <* notFollowedBy alphaNum)) return $ construct $ if attr == nullAttr then body diff --git a/src/Text/Pandoc/Readers/Txt2Tags.hs b/src/Text/Pandoc/Readers/Txt2Tags.hs new file mode 100644 index 000000000..3a51b9d84 --- /dev/null +++ b/src/Text/Pandoc/Readers/Txt2Tags.hs @@ -0,0 +1,548 @@ +{-# LANGUAGE ViewPatterns #-} +{- +Copyright (C) 2014 Matthew Pickering <matthewtpickering@gmail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Txt2Tags + Copyright : Copyright (C) 2014 Matthew Pickering + License : GNU GPL, version 2 or above + + Maintainer : Matthew Pickering <matthewtpickering@gmail.com> + +Conversion of txt2tags formatted plain text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Txt2Tags ( readTxt2Tags + , getT2TMeta + , T2TMeta (..) + , readTxt2TagsNoMacros) + where + +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder ( Inlines, Blocks, (<>) + , trimInlines ) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Shared (escapeURI,compactify', compactify'DL) +import Text.Pandoc.Parsing hiding (space, spaces, uri, macro) +import Control.Applicative ((<$>), (<$), (<*>), (<*), (*>)) +import Data.Char (toLower) +import Data.List (transpose, intersperse, intercalate) +import Data.Maybe (fromMaybe) +import Data.Monoid (Monoid, mconcat, mempty, mappend) +--import Network.URI (isURI) -- Not sure whether to use this function +import Control.Monad (void, guard, when) +import Data.Default +import Control.Monad.Reader (Reader, runReader, asks) + +import Data.Time.LocalTime (getZonedTime) +import Text.Pandoc.Compat.Directory(getModificationTime) +import Data.Time.Format (formatTime) +import System.Locale (defaultTimeLocale) +import System.IO.Error (catchIOError) + +type T2T = ParserT String ParserState (Reader T2TMeta) + +-- | An object for the T2T macros meta information +-- the contents of each field is simply substituted verbatim into the file +data T2TMeta = T2TMeta { + date :: String -- ^ Current date + , mtime :: String -- ^ Last modification time of infile + , infile :: FilePath -- ^ Input file + , outfile :: FilePath -- ^ Output file + } deriving Show + +instance Default T2TMeta where + def = T2TMeta "" "" "" "" + +-- | Get the meta information required by Txt2Tags macros +getT2TMeta :: [FilePath] -> FilePath -> IO T2TMeta +getT2TMeta inps out = do + curDate <- formatTime defaultTimeLocale "%F" <$> getZonedTime + let getModTime = fmap (formatTime defaultTimeLocale "%F") . + getModificationTime + curMtime <- catchIOError + (maximum <$> mapM getModTime inps) + (const (return "")) + return $ T2TMeta curDate curMtime (intercalate ", " inps) out + +-- | Read Txt2Tags from an input string returning a Pandoc document +readTxt2Tags :: T2TMeta -> ReaderOptions -> String -> Pandoc +readTxt2Tags t opts s = flip runReader t $ readWithM parseT2T (def {stateOptions = opts}) (s ++ "\n\n") + +-- | Read Txt2Tags (ignoring all macros) from an input string returning +-- a Pandoc document +readTxt2TagsNoMacros :: ReaderOptions -> String -> Pandoc +readTxt2TagsNoMacros = readTxt2Tags def + +parseT2T :: T2T Pandoc +parseT2T = do + _ <- (Nothing <$ try blankline) <|> (Just <$> (count 3 anyLine)) + config <- manyTill setting (notFollowedBy setting) + -- TODO: Handle settings better + let settings = foldr (\(k,v) -> B.setMeta k (MetaString v)) nullMeta config + updateState (\s -> s {stateMeta = settings}) + body <- mconcat <$> manyTill block eof + return $ Pandoc mempty (B.toList body) + +type Keyword = String +type Value = String + +setting :: T2T (Keyword, Value) +setting = do + string "%!" + keyword <- ignoreSpacesCap (many1 alphaNum) + char ':' + value <- ignoreSpacesCap (manyTill anyChar (newline)) + return (keyword, value) + +-- Blocks + +parseBlocks :: T2T Blocks +parseBlocks = mconcat <$> manyTill block eof + +block :: T2T Blocks +block = do + choice + [ mempty <$ blanklines + , quote + , hrule -- hrule must go above title + , title + , commentBlock + , verbatim + , rawBlock + , taggedBlock + , list + , table + , para + ] + +title :: T2T Blocks +title = try $ balancedTitle '+' <|> balancedTitle '=' + +balancedTitle :: Char -> T2T Blocks +balancedTitle c = try $ do + spaces + level <- length <$> many1 (char c) + guard (level <= 5) -- Max header level 5 + heading <- manyTill (noneOf "\n\r") (count level (char c)) + label <- optionMaybe (enclosed (char '[') (char ']') (alphaNum <|> oneOf "_-")) + many spaceChar *> newline + let attr = maybe nullAttr (\x -> (x, [], [])) label + return $ B.headerWith attr level (trimInlines $ B.text heading) + +para :: T2T Blocks +para = try $ do + ils <- parseInlines + nl <- option False (True <$ newline) + option (B.plain ils) (guard nl >> notFollowedBy listStart >> return (B.para ils)) + where + listStart = try bulletListStart <|> orderedListStart + +commentBlock :: T2T Blocks +commentBlock = try (blockMarkupArea (anyLine) (const mempty) "%%%") <|> comment + +-- Seperator and Strong line treated the same +hrule :: T2T Blocks +hrule = try $ do + spaces + line <- many1 (oneOf "=-_") + guard (length line >= 20) + B.horizontalRule <$ blankline + +quote :: T2T Blocks +quote = try $ do + lookAhead tab + rawQuote <- many1 (tab *> optional spaces *> anyLine) + contents <- parseFromString parseBlocks (intercalate "\n" rawQuote ++ "\n\n") + return $ B.blockQuote contents + +commentLine :: T2T Inlines +commentLine = comment + +-- List Parsing code from Org Reader + +list :: T2T Blocks +list = choice [bulletList, orderedList, definitionList] + +bulletList :: T2T Blocks +bulletList = B.bulletList . compactify' + <$> many1 (listItem bulletListStart parseBlocks) + +orderedList :: T2T Blocks +orderedList = B.orderedList . compactify' + <$> many1 (listItem orderedListStart parseBlocks) + +definitionList :: T2T Blocks +definitionList = try $ do + B.definitionList . compactify'DL <$> + many1 (listItem definitionListStart definitionListEnd) + +definitionListEnd :: T2T (Inlines, [Blocks]) +definitionListEnd = (,) <$> (mconcat <$> manyTill inline newline) <*> ((:[]) <$> parseBlocks) + +genericListStart :: T2T Char + -> T2T Int +genericListStart listMarker = try $ + (2+) <$> (length <$> many spaceChar + <* listMarker <* space <* notFollowedBy space) + +-- parses bullet list \start and returns its length (excl. following whitespace) +bulletListStart :: T2T Int +bulletListStart = genericListStart (char '-') + +orderedListStart :: T2T Int +orderedListStart = genericListStart (char '+' ) + +definitionListStart :: T2T Int +definitionListStart = genericListStart (char ':') + +-- parse raw text for one list item, excluding start marker and continuations +listItem :: T2T Int + -> T2T a + -> T2T a +listItem start end = try $ do + markerLength <- try start + firstLine <- anyLineNewline + blank <- option "" ("\n" <$ blankline) + rest <- concat <$> many (listContinuation markerLength) + parseFromString end $ firstLine ++ blank ++ rest + +-- continuation of a list item - indented and separated by blankline or endline. +-- Note: nested lists are parsed as continuations. +listContinuation :: Int + -> T2T String +listContinuation markerLength = try $ + notFollowedBy' (blankline >> blankline) + *> (mappend <$> (concat <$> many1 listLine) + <*> many blankline) + where listLine = try $ indentWith markerLength *> anyLineNewline + +anyLineNewline :: T2T String +anyLineNewline = (++ "\n") <$> anyLine + +indentWith :: Int -> T2T String +indentWith n = count n space + +-- Table + +table :: T2T Blocks +table = try $ do + header <- fmap snd <$> option mempty (try headerRow) + rows <- many1 (many commentLine *> tableRow) + let columns = transpose rows + let ncolumns = length columns + let aligns = map (foldr1 findAlign) (map (map fst) columns) + let rows' = map (map snd) rows + let size = maximum (map length rows') + let rowsPadded = map (pad size) rows' + let headerPadded = if (not (null header)) then pad size header else mempty + return $ B.table mempty + (zip aligns (replicate ncolumns 0.0)) + headerPadded rowsPadded + +pad :: (Show a, Monoid a) => Int -> [a] -> [a] +pad n xs = xs ++ (replicate (n - length xs) mempty) + + +findAlign :: Alignment -> Alignment -> Alignment +findAlign x y + | x == y = x + | otherwise = AlignDefault + +headerRow :: T2T [(Alignment, Blocks)] +headerRow = genericRow (string "||") + +tableRow :: T2T [(Alignment, Blocks)] +tableRow = genericRow (char '|') + +genericRow :: T2T a -> T2T [(Alignment, Blocks)] +genericRow start = try $ do + spaces *> start + manyTill tableCell newline <?> "genericRow" + + +tableCell :: T2T (Alignment, Blocks) +tableCell = try $ do + leftSpaces <- length <$> lookAhead (many1 space) -- Case of empty cell means we must lookAhead + content <- (manyTill inline (try $ lookAhead (cellEnd))) + rightSpaces <- length <$> many space + let align = + case compare leftSpaces rightSpaces of + LT -> AlignLeft + EQ -> AlignCenter + GT -> AlignRight + endOfCell + return $ (align, B.plain (B.trimInlines $ mconcat content)) + where + cellEnd = (void newline <|> (many1 space *> endOfCell)) + +endOfCell :: T2T () +endOfCell = try (skipMany1 $ char '|') <|> ( () <$ lookAhead newline) + +-- Raw area + +verbatim :: T2T Blocks +verbatim = genericBlock anyLineNewline B.codeBlock "```" + +rawBlock :: T2T Blocks +rawBlock = genericBlock anyLineNewline (B.para . B.str) "\"\"\"" + +taggedBlock :: T2T Blocks +taggedBlock = do + target <- getTarget + genericBlock anyLineNewline (B.rawBlock target) "'''" + +-- Generic + +genericBlock :: Monoid a => T2T a -> (a -> Blocks) -> String -> T2T Blocks +genericBlock p f s = blockMarkupArea p f s <|> blockMarkupLine p f s + +blockMarkupArea :: Monoid a => (T2T a) -> (a -> Blocks) -> String -> T2T Blocks +blockMarkupArea p f s = try $ (do + string s *> blankline + f . mconcat <$> (manyTill p (eof <|> void (string s *> blankline)))) + +blockMarkupLine :: T2T a -> (a -> Blocks) -> String -> T2T Blocks +blockMarkupLine p f s = try (f <$> (string s *> space *> p)) + +-- Can be in either block or inline position +comment :: Monoid a => T2T a +comment = try $ do + atStart + notFollowedBy macro + mempty <$ (char '%' *> anyLine) + +-- Inline + +parseInlines :: T2T Inlines +parseInlines = trimInlines . mconcat <$> many1 inline + +inline :: T2T Inlines +inline = do + choice + [ endline + , macro + , commentLine + , whitespace + , url + , link + , image + , bold + , underline + , code + , raw + , tagged + , strike + , italic + , code + , str + , symbol + ] + +bold :: T2T Inlines +bold = inlineMarkup inline B.strong '*' (B.str) + +underline :: T2T Inlines +underline = inlineMarkup inline B.emph '_' (B.str) + +strike :: T2T Inlines +strike = inlineMarkup inline B.strikeout '-' (B.str) + +italic :: T2T Inlines +italic = inlineMarkup inline B.emph '/' (B.str) + +code :: T2T Inlines +code = inlineMarkup ((:[]) <$> anyChar) B.code '`' id + +raw :: T2T Inlines +raw = inlineMarkup ((:[]) <$> anyChar) B.text '"' id + +tagged :: T2T Inlines +tagged = do + target <- getTarget + inlineMarkup ((:[]) <$> anyChar) (B.rawInline target) '\'' id + +-- Parser for markup indicated by a double character. +-- Inline markup is greedy and glued +-- Greedy meaning ***a*** = Bold [Str "*a*"] +-- Glued meaning that markup must be tight to content +-- Markup can't pass newlines +inlineMarkup :: Monoid a + => (T2T a) -- Content parser + -> (a -> Inlines) -- Constructor + -> Char -- Fence + -> (String -> a) -- Special Case to handle ****** + -> T2T Inlines +inlineMarkup p f c special = try $ do + start <- many1 (char c) + let l = length start + guard (l >= 2) + when (l == 2) (void $ notFollowedBy space) + -- We must make sure that there is no space before the start of the + -- closing tags + body <- optionMaybe (try $ manyTill (noneOf "\n\r") $ + (try $ lookAhead (noneOf " " >> string [c,c] ))) + case body of + Just middle -> do + lastChar <- anyChar + end <- many1 (char c) + let parser inp = parseFromString (mconcat <$> many p) inp + let start' = special (drop 2 start) + body' <- parser (middle ++ [lastChar]) + let end' = special (drop 2 end) + return $ f (start' <> body' <> end') + Nothing -> do -- Either bad or case such as ***** + guard (l >= 5) + let body' = (replicate (l - 4) c) + return $ f (special body') + +link :: T2T Inlines +link = try imageLink <|> titleLink + +-- Link with title +titleLink :: T2T Inlines +titleLink = try $ do + char '[' + notFollowedBy space + tokens <- sepBy1 (many $ noneOf " ]") space + guard (length tokens >= 2) + char ']' + let link' = last tokens + guard (length link' > 0) + let tit = concat (intersperse " " (init tokens)) + return $ B.link link' "" (B.text tit) + +-- Link with image +imageLink :: T2T Inlines +imageLink = try $ do + char '[' + body <- image + many1 space + l <- manyTill (noneOf "\n\r ") (char ']') + return (B.link l "" body) + +macro :: T2T Inlines +macro = try $ do + name <- string "%%" *> oneOfStringsCI (map fst commands) + optional (try $ enclosed (char '(') (char ')') anyChar) + lookAhead (spaceChar <|> oneOf specialChars <|> newline) + maybe (return mempty) (\f -> B.str <$> asks f) (lookup name commands) + where + commands = [ ("date", date), ("mtime", mtime) + , ("infile", infile), ("outfile", outfile)] + +-- raw URLs in text are automatically linked +url :: T2T Inlines +url = try $ do + (rawUrl, escapedUrl) <- (try uri <|> emailAddress) + return $ B.link rawUrl "" (B.str escapedUrl) + +uri :: T2T (String, String) +uri = try $ do + address <- t2tURI + return (address, escapeURI address) + +-- The definition of a URI in the T2T source differs from the +-- actual definition. This is a transcription of the definition in +-- the source of v2.6 +--isT2TURI :: String -> Bool +--isT2TURI (parse t2tURI "" -> Right _) = True +--isT2TURI _ = False + +t2tURI :: T2T String +t2tURI = do + start <- try ((++) <$> proto <*> urlLogin) <|> guess + domain <- many1 chars + sep <- many (char '/') + form' <- option mempty ((:) <$> char '?' <*> many1 form) + anchor' <- option mempty ((:) <$> char '#' <*> many anchor) + return (start ++ domain ++ sep ++ form' ++ anchor') + where + protos = ["http", "https", "ftp", "telnet", "gopher", "wais"] + proto = (++) <$> oneOfStrings protos <*> string "://" + guess = (++) <$> (((++) <$> stringAnyCase "www" <*> option mempty ((:[]) <$> oneOf "23")) + <|> stringAnyCase "ftp") <*> ((:[]) <$> char '.') + login = alphaNum <|> oneOf "_.-" + pass = many (noneOf " @") + chars = alphaNum <|> oneOf "%._/~:,=$@&+-" + anchor = alphaNum <|> oneOf "%._0" + form = chars <|> oneOf ";*" + urlLogin = option mempty $ try ((\x y z -> x ++ y ++ [z]) <$> many1 login <*> option mempty ((:) <$> char ':' <*> pass) <*> char '@') + + +image :: T2T Inlines +image = try $ do + -- List taken from txt2tags source + let extensions = [".jpg", ".jpeg", ".gif", ".png", ".eps", ".bmp"] + char '[' + path <- manyTill (noneOf "\n\t\r ") (try $ lookAhead (oneOfStrings extensions)) + ext <- oneOfStrings extensions + char ']' + return $ B.image (path ++ ext) "" mempty + +-- Characters used in markup +specialChars :: String +specialChars = "%*-_/|:+" + +tab :: T2T Char +tab = char '\t' + +space :: T2T Char +space = char ' ' + +spaces :: T2T String +spaces = many space + +endline :: T2T Inlines +endline = try $ do + newline + notFollowedBy blankline + notFollowedBy hrule + notFollowedBy title + notFollowedBy verbatim + notFollowedBy rawBlock + notFollowedBy taggedBlock + notFollowedBy quote + notFollowedBy list + notFollowedBy table + return $ B.space + +str :: T2T Inlines +str = try $ do + B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") + +whitespace :: T2T Inlines +whitespace = try $ B.space <$ spaceChar + +symbol :: T2T Inlines +symbol = B.str . (:[]) <$> oneOf specialChars + +-- Utility + +getTarget :: T2T String +getTarget = do + mv <- lookupMeta "target" . stateMeta <$> getState + let MetaString target = fromMaybe (MetaString "html") mv + return target + +atStart :: T2T () +atStart = (sourceColumn <$> getPosition) >>= guard . (== 1) + +ignoreSpacesCap :: T2T String -> T2T String +ignoreSpacesCap p = map toLower <$> (spaces *> p <* spaces) + diff --git a/src/Text/Pandoc/SelfContained.hs b/src/Text/Pandoc/SelfContained.hs index 2a2f56281..1a4e037cf 100644 --- a/src/Text/Pandoc/SelfContained.hs +++ b/src/Text/Pandoc/SelfContained.hs @@ -32,53 +32,54 @@ the HTML using data URIs. -} module Text.Pandoc.SelfContained ( makeSelfContained ) where import Text.HTML.TagSoup -import Network.URI (isURI, escapeURIString) +import Network.URI (isURI, escapeURIString, URI(..), parseURI) import Data.ByteString.Base64 import qualified Data.ByteString.Char8 as B import Data.ByteString (ByteString) -import System.FilePath (takeExtension, dropExtension, takeDirectory, (</>)) +import System.FilePath (takeExtension, takeDirectory, (</>)) import Data.Char (toLower, isAscii, isAlphaNum) import Codec.Compression.GZip as Gzip import qualified Data.ByteString.Lazy as L -import Text.Pandoc.Shared (renderTags', openURL, readDataFile, err) +import Text.Pandoc.Shared (renderTags', err, fetchItem') +import Text.Pandoc.MediaBag (MediaBag) import Text.Pandoc.UTF8 (toString, fromString) -import Text.Pandoc.MIME (getMimeType) -import System.Directory (doesFileExist) +import Text.Pandoc.Options (WriterOptions(..)) isOk :: Char -> Bool isOk c = isAscii c && isAlphaNum c -convertTag :: Maybe FilePath -> Tag String -> IO (Tag String) -convertTag userdata t@(TagOpen tagname as) +convertTag :: MediaBag -> Maybe String -> Tag String -> IO (Tag String) +convertTag media sourceURL t@(TagOpen tagname as) | tagname `elem` ["img", "embed", "video", "input", "audio", "source"] = do as' <- mapM processAttribute as return $ TagOpen tagname as' where processAttribute (x,y) = if x == "src" || x == "href" || x == "poster" then do - (raw, mime) <- getRaw userdata (fromAttrib "type" t) y + (raw, mime) <- getRaw media sourceURL (fromAttrib "type" t) y let enc = "data:" ++ mime ++ ";base64," ++ toString (encode raw) return (x, enc) else return (x,y) -convertTag userdata t@(TagOpen "script" as) = +convertTag media sourceURL t@(TagOpen "script" as) = case fromAttrib "src" t of [] -> return t src -> do - (raw, mime) <- getRaw userdata (fromAttrib "type" t) src + (raw, mime) <- getRaw media sourceURL (fromAttrib "type" t) src let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw) return $ TagOpen "script" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"]) -convertTag userdata t@(TagOpen "link" as) = +convertTag media sourceURL t@(TagOpen "link" as) = case fromAttrib "href" t of [] -> return t src -> do - (raw, mime) <- getRaw userdata (fromAttrib "type" t) src + (raw, mime) <- getRaw media sourceURL (fromAttrib "type" t) src let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw) return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"]) -convertTag _ t = return t +convertTag _ _ t = return t -- NOTE: This is really crude, it doesn't respect CSS comments. -cssURLs :: Maybe FilePath -> FilePath -> ByteString -> IO ByteString -cssURLs userdata d orig = +cssURLs :: MediaBag -> Maybe String -> FilePath -> ByteString + -> IO ByteString +cssURLs media sourceURL d orig = case B.breakSubstring "url(" orig of (x,y) | B.null y -> return orig | otherwise -> do @@ -91,33 +92,21 @@ cssURLs userdata d orig = let url' = if isURI url then url else d </> url - (raw, mime) <- getRaw userdata "" url' - rest <- cssURLs userdata d v + (raw, mime) <- getRaw media sourceURL "" url' + rest <- cssURLs media sourceURL d v let enc = "data:" `B.append` fromString mime `B.append` ";base64," `B.append` (encode raw) return $ x `B.append` "url(" `B.append` enc `B.append` rest -getItem :: Maybe FilePath -> String -> IO (ByteString, Maybe String) -getItem userdata f = - if isURI f - then openURL f >>= either handleErr return - else do - -- strip off trailing query or fragment part, if relative URL. - -- this is needed for things like cmunrm.eot?#iefix, - -- which is used to get old versions of IE to work with web fonts. - let f' = takeWhile (\c -> c /= '?' && c /= '#') f - let mime = case takeExtension f' of - ".gz" -> getMimeType $ dropExtension f' - x -> getMimeType x - exists <- doesFileExist f' - cont <- if exists then B.readFile f' else readDataFile userdata f' - return (cont, mime) - where handleErr e = err 61 $ "Failed to retrieve " ++ f ++ "\n" ++ show e - -getRaw :: Maybe FilePath -> String -> String -> IO (ByteString, String) -getRaw userdata mimetype src = do +getRaw :: MediaBag -> Maybe String -> String -> String + -> IO (ByteString, String) +getRaw media sourceURL mimetype src = do let ext = map toLower $ takeExtension src - (raw, respMime) <- getItem userdata src + fetchResult <- fetchItem' media sourceURL src + (raw, respMime) <- case fetchResult of + Left msg -> err 67 $ "Could not fetch " ++ src ++ + "\n" ++ show msg + Right x -> return x let raw' = if ext == ".gz" then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks $ [raw] @@ -127,21 +116,22 @@ getRaw userdata mimetype src = do $ "Could not determine mime type for `" ++ src ++ "'" (x, Nothing) -> x (_, Just x ) -> x + let cssSourceURL = case parseURI src of + Just u + | uriScheme u `elem` ["http:","https:"] -> + Just $ show u{ uriPath = "", + uriQuery = "", + uriFragment = "" } + _ -> Nothing result <- if mime == "text/css" - then cssURLs userdata (takeDirectory src) raw' + then cssURLs media cssSourceURL (takeDirectory src) raw' else return raw' return (result, mime) -- | Convert HTML into self-contained HTML, incorporating images, --- scripts, and CSS using data: URIs. Items specified using absolute --- URLs will be downloaded; those specified using relative URLs will --- be sought first relative to the working directory, then relative --- to the user data directory (if the first parameter is 'Just' --- a directory), and finally relative to pandoc's default data --- directory. -makeSelfContained :: Maybe FilePath -> String -> IO String -makeSelfContained userdata inp = do +-- scripts, and CSS using data: URIs. +makeSelfContained :: WriterOptions -> String -> IO String +makeSelfContained opts inp = do let tags = parseTags inp - out' <- mapM (convertTag userdata) tags + out' <- mapM (convertTag (writerMediaBag opts) (writerSourceURL opts)) tags return $ renderTags' out' - diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 5b0d9b6b4..a91ca9115 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -1,5 +1,5 @@ {-# LANGUAGE DeriveDataTypeable, CPP, MultiParamTypeClasses, - FlexibleContexts, ScopedTypeVariables #-} + FlexibleContexts, ScopedTypeVariables, PatternGuards #-} {- Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> @@ -55,7 +55,11 @@ module Text.Pandoc.Shared ( normalizeSpaces, extractSpaces, normalize, + normalizeInlines, + normalizeBlocks, + removeFormatting, stringify, + capitalize, compactify, compactify', compactify'DL, @@ -74,17 +78,21 @@ module Text.Pandoc.Shared ( readDataFile, readDataFileUTF8, fetchItem, + fetchItem', openURL, + collapseFilePath, -- * Error handling err, warn, -- * Safe read - safeRead + safeRead, + -- * Temp directory + withTempDir ) where import Text.Pandoc.Definition import Text.Pandoc.Walk -import Text.Pandoc.Generic +import Text.Pandoc.MediaBag (MediaBag, lookupMedia) import Text.Pandoc.Builder (Inlines, Blocks, ToMetaValue(..)) import qualified Text.Pandoc.Builder as B import qualified Text.Pandoc.UTF8 as UTF8 @@ -92,14 +100,15 @@ import System.Environment (getProgName) import System.Exit (exitWith, ExitCode(..)) import Data.Char ( toLower, isLower, isUpper, isAlpha, isLetter, isDigit, isSpace ) -import Data.List ( find, isPrefixOf, intercalate ) +import Data.List ( find, stripPrefix, intercalate ) import qualified Data.Map as M import Network.URI ( escapeURIString, isURI, nonStrictRelativeTo, - unEscapeString, parseURIReference ) + unEscapeString, parseURIReference, isAllowedInURI ) import qualified Data.Set as Set import System.Directory +import System.FilePath (joinPath, splitDirectories) import Text.Pandoc.MIME (getMimeType) -import System.FilePath ( (</>), takeExtension, dropExtension ) +import System.FilePath ( (</>), takeExtension, dropExtension) import Data.Generics (Typeable, Data) import qualified Control.Monad.State as S import qualified Control.Exception as E @@ -108,6 +117,7 @@ import Text.Pandoc.Pretty (charWidth) import System.Locale (defaultTimeLocale) import Data.Time import System.IO (stderr) +import System.IO.Temp import Text.HTML.TagSoup (renderTagsOptions, RenderOptions(..), Tag(..), renderOptions) import qualified Data.ByteString as BS @@ -115,6 +125,7 @@ import qualified Data.ByteString.Char8 as B8 import Text.Pandoc.Compat.Monoid import Data.ByteString.Base64 (decodeLenient) import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr) +import qualified Data.Text as T (toUpper, pack, unpack) #ifdef EMBED_DATA_FILES import Text.Pandoc.Data (dataFiles) @@ -174,9 +185,9 @@ substitute :: (Eq a) => [a] -> [a] -> [a] -> [a] substitute _ _ [] = [] substitute [] _ xs = xs substitute target replacement lst@(x:xs) = - if target `isPrefixOf` lst - then replacement ++ substitute target replacement (drop (length target) lst) - else x : substitute target replacement xs + case stripPrefix target lst of + Just lst' -> replacement ++ substitute target replacement lst' + Nothing -> x : substitute target replacement xs ordNub :: (Ord a) => [a] -> [a] ordNub l = go Set.empty l @@ -334,10 +345,10 @@ isSpaceOrEmpty (Str "") = True isSpaceOrEmpty _ = False -- | Extract the leading and trailing spaces from inside an inline element --- and place them outside the element. +-- and place them outside the element. extractSpaces :: (Inlines -> Inlines) -> Inlines -> Inlines -extractSpaces f is = +extractSpaces f is = let contents = B.unMany is left = case viewl contents of (Space :< _) -> B.space @@ -350,72 +361,160 @@ extractSpaces f is = -- | Normalize @Pandoc@ document, consolidating doubled 'Space's, -- combining adjacent 'Str's and 'Emph's, remove 'Null's and -- empty elements, etc. -normalize :: (Eq a, Data a) => a -> a -normalize = topDown removeEmptyBlocks . - topDown consolidateInlines . - bottomUp (removeEmptyInlines . removeTrailingInlineSpaces) - -removeEmptyBlocks :: [Block] -> [Block] -removeEmptyBlocks (Null : xs) = removeEmptyBlocks xs -removeEmptyBlocks (BulletList [] : xs) = removeEmptyBlocks xs -removeEmptyBlocks (OrderedList _ [] : xs) = removeEmptyBlocks xs -removeEmptyBlocks (DefinitionList [] : xs) = removeEmptyBlocks xs -removeEmptyBlocks (RawBlock _ [] : xs) = removeEmptyBlocks xs -removeEmptyBlocks (x:xs) = x : removeEmptyBlocks xs -removeEmptyBlocks [] = [] - -removeEmptyInlines :: [Inline] -> [Inline] -removeEmptyInlines (Emph [] : zs) = removeEmptyInlines zs -removeEmptyInlines (Strong [] : zs) = removeEmptyInlines zs -removeEmptyInlines (Subscript [] : zs) = removeEmptyInlines zs -removeEmptyInlines (Superscript [] : zs) = removeEmptyInlines zs -removeEmptyInlines (SmallCaps [] : zs) = removeEmptyInlines zs -removeEmptyInlines (Strikeout [] : zs) = removeEmptyInlines zs -removeEmptyInlines (RawInline _ [] : zs) = removeEmptyInlines zs -removeEmptyInlines (Code _ [] : zs) = removeEmptyInlines zs -removeEmptyInlines (Str "" : zs) = removeEmptyInlines zs -removeEmptyInlines (x : xs) = x : removeEmptyInlines xs -removeEmptyInlines [] = [] - -removeTrailingInlineSpaces :: [Inline] -> [Inline] -removeTrailingInlineSpaces = reverse . removeLeadingInlineSpaces . reverse - -removeLeadingInlineSpaces :: [Inline] -> [Inline] -removeLeadingInlineSpaces = dropWhile isSpaceOrEmpty - -consolidateInlines :: [Inline] -> [Inline] -consolidateInlines (Str x : ys) = +normalize :: Pandoc -> Pandoc +normalize (Pandoc (Meta meta) blocks) = + Pandoc (Meta $ M.map go meta) (normalizeBlocks blocks) + where go (MetaInlines xs) = MetaInlines $ normalizeInlines xs + go (MetaBlocks xs) = MetaBlocks $ normalizeBlocks xs + go (MetaList ms) = MetaList $ map go ms + go (MetaMap m) = MetaMap $ M.map go m + go x = x + +normalizeBlocks :: [Block] -> [Block] +normalizeBlocks (Null : xs) = normalizeBlocks xs +normalizeBlocks (Div attr bs : xs) = + Div attr (normalizeBlocks bs) : normalizeBlocks xs +normalizeBlocks (BlockQuote bs : xs) = + case normalizeBlocks bs of + [] -> normalizeBlocks xs + bs' -> BlockQuote bs' : normalizeBlocks xs +normalizeBlocks (BulletList [] : xs) = normalizeBlocks xs +normalizeBlocks (BulletList items : xs) = + BulletList (map normalizeBlocks items) : normalizeBlocks xs +normalizeBlocks (OrderedList _ [] : xs) = normalizeBlocks xs +normalizeBlocks (OrderedList attr items : xs) = + OrderedList attr (map normalizeBlocks items) : normalizeBlocks xs +normalizeBlocks (DefinitionList [] : xs) = normalizeBlocks xs +normalizeBlocks (DefinitionList items : xs) = + DefinitionList (map go items) : normalizeBlocks xs + where go (ils, bs) = (normalizeInlines ils, map normalizeBlocks bs) +normalizeBlocks (RawBlock _ "" : xs) = normalizeBlocks xs +normalizeBlocks (RawBlock f x : xs) = + case normalizeBlocks xs of + (RawBlock f' x' : rest) | f' == f -> + RawBlock f (x ++ ('\n':x')) : rest + rest -> RawBlock f x : rest +normalizeBlocks (Para ils : xs) = + case normalizeInlines ils of + [] -> normalizeBlocks xs + ils' -> Para ils' : normalizeBlocks xs +normalizeBlocks (Plain ils : xs) = + case normalizeInlines ils of + [] -> normalizeBlocks xs + ils' -> Plain ils' : normalizeBlocks xs +normalizeBlocks (Header lev attr ils : xs) = + Header lev attr (normalizeInlines ils) : normalizeBlocks xs +normalizeBlocks (Table capt aligns widths hdrs rows : xs) = + Table (normalizeInlines capt) aligns widths + (map normalizeBlocks hdrs) (map (map normalizeBlocks) rows) + : normalizeBlocks xs +normalizeBlocks (x:xs) = x : normalizeBlocks xs +normalizeBlocks [] = [] + +normalizeInlines :: [Inline] -> [Inline] +normalizeInlines (Str x : ys) = case concat (x : map fromStr strs) of - "" -> consolidateInlines rest - n -> Str n : consolidateInlines rest + "" -> rest + n -> Str n : rest where - (strs, rest) = span isStr ys + (strs, rest) = span isStr $ normalizeInlines ys isStr (Str _) = True isStr _ = False fromStr (Str z) = z - fromStr _ = error "consolidateInlines - fromStr - not a Str" -consolidateInlines (Space : ys) = Space : rest + fromStr _ = error "normalizeInlines - fromStr - not a Str" +normalizeInlines (Space : ys) = + if null rest + then [] + else Space : rest where isSp Space = True isSp _ = False - rest = consolidateInlines $ dropWhile isSp ys -consolidateInlines (Emph xs : Emph ys : zs) = consolidateInlines $ - Emph (xs ++ ys) : zs -consolidateInlines (Strong xs : Strong ys : zs) = consolidateInlines $ - Strong (xs ++ ys) : zs -consolidateInlines (Subscript xs : Subscript ys : zs) = consolidateInlines $ - Subscript (xs ++ ys) : zs -consolidateInlines (Superscript xs : Superscript ys : zs) = consolidateInlines $ - Superscript (xs ++ ys) : zs -consolidateInlines (SmallCaps xs : SmallCaps ys : zs) = consolidateInlines $ - SmallCaps (xs ++ ys) : zs -consolidateInlines (Strikeout xs : Strikeout ys : zs) = consolidateInlines $ - Strikeout (xs ++ ys) : zs -consolidateInlines (RawInline f x : RawInline f' y : zs) | f == f' = - consolidateInlines $ RawInline f (x ++ y) : zs -consolidateInlines (Code a1 x : Code a2 y : zs) | a1 == a2 = - consolidateInlines $ Code a1 (x ++ y) : zs -consolidateInlines (x : xs) = x : consolidateInlines xs -consolidateInlines [] = [] + rest = dropWhile isSp $ normalizeInlines ys +normalizeInlines (Emph xs : zs) = + case normalizeInlines zs of + (Emph ys : rest) -> normalizeInlines $ + Emph (normalizeInlines $ xs ++ ys) : rest + rest -> case normalizeInlines xs of + [] -> rest + xs' -> Emph xs' : rest +normalizeInlines (Strong xs : zs) = + case normalizeInlines zs of + (Strong ys : rest) -> normalizeInlines $ + Strong (normalizeInlines $ xs ++ ys) : rest + rest -> case normalizeInlines xs of + [] -> rest + xs' -> Strong xs' : rest +normalizeInlines (Subscript xs : zs) = + case normalizeInlines zs of + (Subscript ys : rest) -> normalizeInlines $ + Subscript (normalizeInlines $ xs ++ ys) : rest + rest -> case normalizeInlines xs of + [] -> rest + xs' -> Subscript xs' : rest +normalizeInlines (Superscript xs : zs) = + case normalizeInlines zs of + (Superscript ys : rest) -> normalizeInlines $ + Superscript (normalizeInlines $ xs ++ ys) : rest + rest -> case normalizeInlines xs of + [] -> rest + xs' -> Superscript xs' : rest +normalizeInlines (SmallCaps xs : zs) = + case normalizeInlines zs of + (SmallCaps ys : rest) -> normalizeInlines $ + SmallCaps (normalizeInlines $ xs ++ ys) : rest + rest -> case normalizeInlines xs of + [] -> rest + xs' -> SmallCaps xs' : rest +normalizeInlines (Strikeout xs : zs) = + case normalizeInlines zs of + (Strikeout ys : rest) -> normalizeInlines $ + Strikeout (normalizeInlines $ xs ++ ys) : rest + rest -> case normalizeInlines xs of + [] -> rest + xs' -> Strikeout xs' : rest +normalizeInlines (RawInline _ [] : ys) = normalizeInlines ys +normalizeInlines (RawInline f xs : zs) = + case normalizeInlines zs of + (RawInline f' ys : rest) | f == f' -> normalizeInlines $ + RawInline f (xs ++ ys) : rest + rest -> RawInline f xs : rest +normalizeInlines (Code _ "" : ys) = normalizeInlines ys +normalizeInlines (Code attr xs : zs) = + case normalizeInlines zs of + (Code attr' ys : rest) | attr == attr' -> normalizeInlines $ + Code attr (xs ++ ys) : rest + rest -> Code attr xs : rest +-- allow empty spans, they may carry identifiers etc. +-- normalizeInlines (Span _ [] : ys) = normalizeInlines ys +normalizeInlines (Span attr xs : zs) = + case normalizeInlines zs of + (Span attr' ys : rest) | attr == attr' -> normalizeInlines $ + Span attr (normalizeInlines $ xs ++ ys) : rest + rest -> Span attr (normalizeInlines xs) : rest +normalizeInlines (Note bs : ys) = Note (normalizeBlocks bs) : + normalizeInlines ys +normalizeInlines (Quoted qt ils : ys) = + Quoted qt (normalizeInlines ils) : normalizeInlines ys +normalizeInlines (Link ils t : ys) = + Link (normalizeInlines ils) t : normalizeInlines ys +normalizeInlines (Image ils t : ys) = + Image (normalizeInlines ils) t : normalizeInlines ys +normalizeInlines (Cite cs ils : ys) = + Cite cs (normalizeInlines ils) : normalizeInlines ys +normalizeInlines (x : xs) = x : normalizeInlines xs +normalizeInlines [] = [] + +-- | Extract inlines, removing formatting. +removeFormatting :: Walkable Inline a => a -> [Inline] +removeFormatting = query go . walk deNote + where go :: Inline -> [Inline] + go (Str xs) = [Str xs] + go Space = [Space] + go (Code _ x) = [Str x] + go (Math _ x) = [Str x] + go LineBreak = [Space] + go _ = [] + deNote (Note _) = Str "" + deNote x = x -- | Convert pandoc structure to a string with formatting removed. -- Footnotes are skipped (since we don't want their contents in link @@ -432,6 +531,17 @@ stringify = query go . walk deNote deNote (Note _) = Str "" deNote x = x +-- | Bring all regular text in a pandoc structure to uppercase. +-- +-- This function correctly handles cases where a lowercase character doesn't +-- match to a single uppercase character – e.g. “Straße” would be converted +-- to “STRASSE”, not “STRAßE”. +capitalize :: Walkable Inline a => a -> a +capitalize = walk go + where go :: Inline -> Inline + go (Str s) = Str (T.unpack $ T.toUpper $ T.pack s) + go x = x + -- | Change final list item from @Para@ to @Plain@ if the list contains -- no other @Para@ blocks. compactify :: [[Block]] -- ^ List of list items (each a list of blocks) @@ -463,20 +573,22 @@ compactify' items = _ -> items _ -> items --- | Like @compactify'@, but akts on items of definition lists. +-- | Like @compactify'@, but acts on items of definition lists. compactify'DL :: [(Inlines, [Blocks])] -> [(Inlines, [Blocks])] compactify'DL items = let defs = concatMap snd items - defBlocks = reverse $ concatMap B.toList defs - in case defBlocks of - (Para x:_) -> if not $ any isPara (drop 1 defBlocks) - then let (t,ds) = last items - lastDef = B.toList $ last ds - ds' = init ds ++ - [B.fromList $ init lastDef ++ [Plain x]] - in init items ++ [(t, ds')] - else items - _ -> items + in case reverse (concatMap B.toList defs) of + (Para x:xs) + | not (any isPara xs) -> + let (t,ds) = last items + lastDef = B.toList $ last ds + ds' = init ds ++ + if null lastDef + then [B.fromList lastDef] + else [B.fromList $ init lastDef ++ [Plain x]] + in init items ++ [(t, ds')] + | otherwise -> items + _ -> items isPara :: Block -> Bool isPara (Para _) = True @@ -669,28 +781,38 @@ readDataFileUTF8 userDir fname = -- Returns raw content and maybe mime type. fetchItem :: Maybe String -> String -> IO (Either E.SomeException (BS.ByteString, Maybe String)) -fetchItem sourceURL s - | isURI s = openURL s - | otherwise = - case sourceURL >>= parseURIReference of - Just u -> case parseURIReference s of - Just s' -> openURL $ show $ - s' `nonStrictRelativeTo` u - Nothing -> openURL $ show u ++ "/" ++ s - Nothing -> E.try readLocalFile +fetchItem sourceURL s = + case (sourceURL >>= parseURIReference . ensureEscaped, ensureEscaped s) of + (_, s') | isURI s' -> openURL s' + (Just u, s') -> -- try fetching from relative path at source + case parseURIReference s' of + Just u' -> openURL $ show $ u' `nonStrictRelativeTo` u + Nothing -> openURL s' -- will throw error + (Nothing, _) -> E.try readLocalFile -- get from local file system where readLocalFile = do - let mime = case takeExtension s of - ".gz" -> getMimeType $ dropExtension s - x -> getMimeType x - cont <- BS.readFile s + cont <- BS.readFile fp return (cont, mime) + dropFragmentAndQuery = takeWhile (\c -> c /= '?' && c /= '#') + fp = unEscapeString $ dropFragmentAndQuery s + mime = case takeExtension fp of + ".gz" -> getMimeType $ dropExtension fp + x -> getMimeType x + ensureEscaped = escapeURIString isAllowedInURI + +-- | Like 'fetchItem', but also looks for items in a 'MediaBag'. +fetchItem' :: MediaBag -> Maybe String -> String + -> IO (Either E.SomeException (BS.ByteString, Maybe String)) +fetchItem' media sourceURL s = do + case lookupMedia s media of + Nothing -> fetchItem sourceURL s + Just (mime, bs) -> return $ Right (BS.concat $ toChunks bs, Just mime) -- | Read from a URL and return raw data and maybe mime type. openURL :: String -> IO (Either E.SomeException (BS.ByteString, Maybe String)) openURL u - | "data:" `isPrefixOf` u = - let mime = takeWhile (/=',') $ drop 5 u - contents = B8.pack $ unEscapeString $ drop 1 $ dropWhile (/=',') u + | Just u' <- stripPrefix "data:" u = + let mime = takeWhile (/=',') u' + contents = B8.pack $ unEscapeString $ drop 1 $ dropWhile (/=',') u' in return $ Right (decodeLenient contents, Just mime) #ifdef HTTP_CLIENT | otherwise = withSocketsDo $ E.try $ do @@ -734,6 +856,29 @@ warn msg = do name <- getProgName UTF8.hPutStrLn stderr $ name ++ ": " ++ msg +-- | Remove intermediate "." and ".." directories from a path. +-- +-- @ +-- collapseFilePath "./foo" == "foo" +-- collapseFilePath "/bar/../baz" == "/baz" +-- collapseFilePath "/../baz" == "/../baz" +-- collapseFilePath "parent/foo/baz/../bar" == "parent/foo/bar" +-- collapseFilePath "parent/foo/baz/../../bar" == "parent/bar" +-- collapseFilePath "parent/foo/.." == "parent" +-- collapseFilePath "/parent/foo/../../bar" == "/bar" +-- @ +collapseFilePath :: FilePath -> FilePath +collapseFilePath = joinPath . reverse . foldl go [] . splitDirectories + where + go rs "." = rs + go r@(p:rs) ".." = case p of + ".." -> ("..":r) + "/" -> ("..":r) + _ -> rs + go _ "/" = ["/"] + go rs x = x:rs + + -- -- Safe read -- @@ -743,3 +888,15 @@ safeRead s = case reads s of (d,x):_ | all isSpace x -> return d _ -> fail $ "Could not read `" ++ s ++ "'" + +-- +-- Temp directory +-- + +withTempDir :: String -> (FilePath -> IO a) -> IO a +withTempDir = +#ifdef _WINDOWS + withTempDirectory "." +#else + withSystemTempDirectory +#endif diff --git a/src/Text/Pandoc/Writers/AsciiDoc.hs b/src/Text/Pandoc/Writers/AsciiDoc.hs index 19112d8f5..e5b8c5167 100644 --- a/src/Text/Pandoc/Writers/AsciiDoc.hs +++ b/src/Text/Pandoc/Writers/AsciiDoc.hs @@ -43,16 +43,19 @@ import Text.Pandoc.Shared import Text.Pandoc.Writers.Shared import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (blankline, space) -import Data.List ( isPrefixOf, intersperse, intercalate ) +import Data.Maybe (fromMaybe) +import Data.List ( stripPrefix, intersperse, intercalate ) import Text.Pandoc.Pretty import Control.Monad.State import qualified Data.Map as M import Data.Aeson (Value(String), fromJSON, toJSON, Result(..)) import qualified Data.Text as T +import Control.Applicative ((<*), (*>)) data WriterState = WriterState { defListMarker :: String , orderedListLevel :: Int , bulletListLevel :: Int + , intraword :: Bool } -- | Convert Pandoc to AsciiDoc. @@ -62,6 +65,7 @@ writeAsciiDoc opts document = defListMarker = "::" , orderedListLevel = 1 , bulletListLevel = 1 + , intraword = False } -- | Return asciidoc representation of document. @@ -123,7 +127,7 @@ blockToAsciiDoc _ Null = return empty blockToAsciiDoc opts (Plain inlines) = do contents <- inlineListToAsciiDoc opts inlines return $ contents <> cr -blockToAsciiDoc opts (Para [Image alt (src,'f':'i':'g':':':tit)]) = +blockToAsciiDoc opts (Para [Image alt (src,'f':'i':'g':':':tit)]) = do blockToAsciiDoc opts (Para [Image alt (src,tit)]) blockToAsciiDoc opts (Para inlines) = do contents <- inlineListToAsciiDoc opts inlines @@ -142,10 +146,10 @@ blockToAsciiDoc opts (Header level (ident,_,_) inlines) = do let len = offset contents -- ident seem to be empty most of the time and asciidoc will generate them automatically -- so lets make them not show up when null - let identifier = if (null ident) then empty else ("[[" <> text ident <> "]]") + let identifier = if (null ident) then empty else ("[[" <> text ident <> "]]") let setext = writerSetextHeaders opts - return $ - (if setext + return $ + (if setext then identifier $$ contents $$ (case level of @@ -155,7 +159,7 @@ blockToAsciiDoc opts (Header level (ident,_,_) inlines) = do 4 -> text $ replicate len '+' _ -> empty) <> blankline else - identifier $$ text (replicate level '=') <> space <> contents <> blankline) + identifier $$ text (replicate level '=') <> space <> contents <> blankline) blockToAsciiDoc _ (CodeBlock (_,classes,_) str) = return $ flush (attrs <> dashes <> space <> attrs <> cr <> text str <> cr <> dashes) <> blankline @@ -317,17 +321,51 @@ blockListToAsciiDoc opts blocks = cat `fmap` mapM (blockToAsciiDoc opts) blocks -- | Convert list of Pandoc inline elements to asciidoc. inlineListToAsciiDoc :: WriterOptions -> [Inline] -> State WriterState Doc -inlineListToAsciiDoc opts lst = - mapM (inlineToAsciiDoc opts) lst >>= return . cat +inlineListToAsciiDoc opts lst = do + oldIntraword <- gets intraword + setIntraword False + result <- go lst + setIntraword oldIntraword + return result + where go [] = return empty + go (y:x:xs) + | not (isSpacy y) = do + y' <- if isSpacy x + then inlineToAsciiDoc opts y + else withIntraword $ inlineToAsciiDoc opts y + x' <- withIntraword $ inlineToAsciiDoc opts x + xs' <- go xs + return (y' <> x' <> xs') + | x /= Space && x /= LineBreak = do + y' <- withIntraword $ inlineToAsciiDoc opts y + xs' <- go (x:xs) + return (y' <> xs') + go (x:xs) = do + x' <- inlineToAsciiDoc opts x + xs' <- go xs + return (x' <> xs') + isSpacy Space = True + isSpacy LineBreak = True + isSpacy _ = False + +setIntraword :: Bool -> State WriterState () +setIntraword b = modify $ \st -> st{ intraword = b } + +withIntraword :: State WriterState a -> State WriterState a +withIntraword p = setIntraword True *> p <* setIntraword False -- | Convert Pandoc inline element to asciidoc. inlineToAsciiDoc :: WriterOptions -> Inline -> State WriterState Doc inlineToAsciiDoc opts (Emph lst) = do contents <- inlineListToAsciiDoc opts lst - return $ "_" <> contents <> "_" + isIntraword <- gets intraword + let marker = if isIntraword then "__" else "_" + return $ marker <> contents <> marker inlineToAsciiDoc opts (Strong lst) = do contents <- inlineListToAsciiDoc opts lst - return $ "*" <> contents <> "*" + isIntraword <- gets intraword + let marker = if isIntraword then "**" else "*" + return $ marker <> contents <> marker inlineToAsciiDoc opts (Strikeout lst) = do contents <- inlineListToAsciiDoc opts lst return $ "[line-through]*" <> contents <> "*" @@ -338,12 +376,10 @@ inlineToAsciiDoc opts (Subscript lst) = do contents <- inlineListToAsciiDoc opts lst return $ "~" <> contents <> "~" inlineToAsciiDoc opts (SmallCaps lst) = inlineListToAsciiDoc opts lst -inlineToAsciiDoc opts (Quoted SingleQuote lst) = do - contents <- inlineListToAsciiDoc opts lst - return $ "`" <> contents <> "'" -inlineToAsciiDoc opts (Quoted DoubleQuote lst) = do - contents <- inlineListToAsciiDoc opts lst - return $ "``" <> contents <> "''" +inlineToAsciiDoc opts (Quoted SingleQuote lst) = + inlineListToAsciiDoc opts (Str "`" : lst ++ [Str "'"]) +inlineToAsciiDoc opts (Quoted DoubleQuote lst) = + inlineListToAsciiDoc opts (Str "``" : lst ++ [Str "''"]) inlineToAsciiDoc _ (Code _ str) = return $ text "`" <> text (escapeStringUsing (backslashEscapes "`") str) <> "`" inlineToAsciiDoc _ (Str str) = return $ text $ escapeString str @@ -366,7 +402,7 @@ inlineToAsciiDoc opts (Link txt (src, _tit)) = do let prefix = if isRelative then text "link:" else empty - let srcSuffix = if isPrefixOf "mailto:" src then drop 7 src else src + let srcSuffix = fromMaybe src (stripPrefix "mailto:" src) let useAuto = case txt of [Str s] | escapeURI s == srcSuffix -> True _ -> False diff --git a/src/Text/Pandoc/Writers/Custom.hs b/src/Text/Pandoc/Writers/Custom.hs index 88f590c43..914d61850 100644 --- a/src/Text/Pandoc/Writers/Custom.hs +++ b/src/Text/Pandoc/Writers/Custom.hs @@ -1,4 +1,5 @@ -{-# LANGUAGE OverlappingInstances, FlexibleInstances, OverloadedStrings #-} +{-# LANGUAGE OverlappingInstances, FlexibleInstances, OverloadedStrings, + ScopedTypeVariables #-} {-# OPTIONS_GHC -fno-warn-orphans #-} {- Copyright (C) 2012-2014 John MacFarlane <jgm@berkeley.edu> @@ -35,12 +36,14 @@ import Text.Pandoc.Options import Data.List ( intersperse ) import Data.Char ( toLower ) import Scripting.Lua (LuaState, StackValue, callfunc) +import Text.Pandoc.Writers.Shared import qualified Scripting.Lua as Lua import Text.Pandoc.UTF8 (fromString, toString) import Data.ByteString (ByteString) import qualified Data.ByteString.Char8 as C8 import Data.Monoid import qualified Data.Map as M +import Text.Pandoc.Templates attrToMap :: Attr -> M.Map ByteString ByteString attrToMap (id',classes,keyvals) = M.fromList @@ -128,9 +131,23 @@ instance StackValue MetaValue where valuetype (MetaInlines _) = Lua.TSTRING valuetype (MetaBlocks _) = Lua.TSTRING +instance StackValue Citation where + push lua cit = do + Lua.createtable lua 6 0 + let addValue ((k :: String), v) = Lua.push lua k >> Lua.push lua v >> + Lua.rawset lua (-3) + addValue ("citationId", citationId cit) + addValue ("citationPrefix", citationPrefix cit) + addValue ("citationSuffix", citationSuffix cit) + addValue ("citationMode", show (citationMode cit)) + addValue ("citationNoteNum", citationNoteNum cit) + addValue ("citationHash", citationHash cit) + peek = undefined + valuetype _ = Lua.TTABLE + -- | Convert Pandoc to custom markup. writeCustom :: FilePath -> WriterOptions -> Pandoc -> IO String -writeCustom luaFile opts doc = do +writeCustom luaFile opts doc@(Pandoc meta _) = do luaScript <- C8.unpack `fmap` C8.readFile luaFile lua <- Lua.newstate Lua.openlibs lua @@ -138,8 +155,17 @@ writeCustom luaFile opts doc = do Lua.call lua 0 0 -- TODO - call hierarchicalize, so we have that info rendered <- docToCustom lua opts doc + context <- metaToJSON opts + (fmap toString . blockListToCustom lua) + (fmap toString . inlineListToCustom lua) + meta Lua.close lua - return $ toString rendered + let body = toString rendered + if writerStandalone opts + then do + let context' = setField "body" body context + return $ renderTemplate' (writerTemplate opts) context' + else return body docToCustom :: LuaState -> WriterOptions -> Pandoc -> IO ByteString docToCustom lua opts (Pandoc (Meta metamap) blocks) = do @@ -225,7 +251,7 @@ inlineToCustom lua (Quoted SingleQuote lst) = callfunc lua "SingleQuoted" lst inlineToCustom lua (Quoted DoubleQuote lst) = callfunc lua "DoubleQuoted" lst -inlineToCustom lua (Cite _ lst) = callfunc lua "Cite" lst +inlineToCustom lua (Cite cs lst) = callfunc lua "Cite" lst cs inlineToCustom lua (Code attr str) = callfunc lua "Code" (fromString str) (attrToMap attr) diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs index ba6a92a08..b10317506 100644 --- a/src/Text/Pandoc/Writers/Docbook.hs +++ b/src/Text/Pandoc/Writers/Docbook.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE OverloadedStrings, PatternGuards #-} {- Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> @@ -37,8 +37,9 @@ import Text.Pandoc.Writers.Shared import Text.Pandoc.Options import Text.Pandoc.Templates (renderTemplate') import Text.Pandoc.Readers.TeXMath -import Data.List ( isPrefixOf, intercalate, isSuffixOf ) +import Data.List ( stripPrefix, isPrefixOf, intercalate, isSuffixOf ) import Data.Char ( toLower ) +import Control.Applicative ((<$>)) import Data.Monoid ( Any(..) ) import Text.Pandoc.Highlighting ( languages, languagesByExtension ) import Text.Pandoc.Pretty @@ -293,14 +294,14 @@ inlineToDocbook _ (Code _ str) = inTagsSimple "literal" $ text (escapeStringForXML str) inlineToDocbook opts (Math t str) | isMathML (writerHTMLMathMethod opts) = - case texMathToMathML dt str of - Right r -> inTagsSimple tagtype - $ text $ Xml.ppcElement conf - $ fixNS - $ removeAttr r - Left _ -> inlinesToDocbook opts - $ readTeXMath' t str - | otherwise = inlinesToDocbook opts $ readTeXMath' t str + case writeMathML dt <$> readTeX str of + Right r -> inTagsSimple tagtype + $ text $ Xml.ppcElement conf + $ fixNS + $ removeAttr r + Left _ -> inlinesToDocbook opts + $ texMathToInlines t str + | otherwise = inlinesToDocbook opts $ texMathToInlines t str where (dt, tagtype) = case t of InlineMath -> (DisplayInline,"inlineequation") DisplayMath -> (DisplayBlock,"informalequation") @@ -312,19 +313,19 @@ inlineToDocbook _ (RawInline f x) | f == "html" || f == "docbook" = text x | otherwise = empty inlineToDocbook _ LineBreak = text "\n" inlineToDocbook _ Space = space -inlineToDocbook opts (Link txt (src, _)) = - if isPrefixOf "mailto:" src - then let src' = drop 7 src - emailLink = inTagsSimple "email" $ text $ - escapeStringForXML $ src' - in case txt of - [Str s] | escapeURI s == src' -> emailLink - _ -> inlinesToDocbook opts txt <+> - char '(' <> emailLink <> char ')' - else (if isPrefixOf "#" src - then inTags False "link" [("linkend", drop 1 src)] - else inTags False "ulink" [("url", src)]) $ - inlinesToDocbook opts txt +inlineToDocbook opts (Link txt (src, _)) + | Just email <- stripPrefix "mailto:" src = + let emailLink = inTagsSimple "email" $ text $ + escapeStringForXML $ email + in case txt of + [Str s] | escapeURI s == email -> emailLink + _ -> inlinesToDocbook opts txt <+> + char '(' <> emailLink <> char ')' + | otherwise = + (if isPrefixOf "#" src + then inTags False "link" [("linkend", drop 1 src)] + else inTags False "ulink" [("url", src)]) $ + inlinesToDocbook opts txt inlineToDocbook _ (Image _ (src, tit)) = let titleDoc = if null tit then empty diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs index 77ee51519..38031b7dc 100644 --- a/src/Text/Pandoc/Writers/Docx.hs +++ b/src/Text/Pandoc/Writers/Docx.hs @@ -58,7 +58,7 @@ import System.Random (randomRIO) import Text.Printf (printf) import qualified Control.Exception as E import Text.Pandoc.MIME (getMimeType, extensionFromMimeType) -import Control.Applicative ((<|>)) +import Control.Applicative ((<|>), (<$>)) import Data.Maybe (mapMaybe) data ListMarker = NoMarker @@ -481,17 +481,30 @@ writeOpenXML opts (Pandoc meta blocks) = do _ -> [] let auths = docAuthors meta let dat = docDate meta + let abstract' = case lookupMeta "abstract" meta of + Just (MetaBlocks bs) -> bs + Just (MetaInlines ils) -> [Plain ils] + _ -> [] + let subtitle' = case lookupMeta "subtitle" meta of + Just (MetaBlocks [Plain xs]) -> xs + Just (MetaBlocks [Para xs]) -> xs + Just (MetaInlines xs) -> xs + _ -> [] title <- withParaProp (pStyle "Title") $ blocksToOpenXML opts [Para tit | not (null tit)] - authors <- withParaProp (pStyle "Authors") $ blocksToOpenXML opts - [Para (intercalate [LineBreak] auths) | not (null auths)] + subtitle <- withParaProp (pStyle "Subtitle") $ blocksToOpenXML opts [Para subtitle' | not (null subtitle')] + authors <- withParaProp (pStyle "Author") $ blocksToOpenXML opts $ + map Para auths date <- withParaProp (pStyle "Date") $ blocksToOpenXML opts [Para dat | not (null dat)] + abstract <- if null abstract' + then return [] + else withParaProp (pStyle "Abstract") $ blocksToOpenXML opts abstract' let convertSpace (Str x : Space : Str y : xs) = Str (x ++ " " ++ y) : xs convertSpace (Str x : Str y : xs) = Str (x ++ y) : xs convertSpace xs = xs let blocks' = bottomUp convertSpace $ blocks doc' <- blocksToOpenXML opts blocks' notes' <- reverse `fmap` gets stFootnotes - let meta' = title ++ authors ++ date + let meta' = title ++ subtitle ++ authors ++ date ++ abstract return (meta' ++ doc', notes') -- | Convert a list of Pandoc blocks to OpenXML. @@ -514,8 +527,11 @@ blockToOpenXML :: WriterOptions -> Block -> WS [Element] blockToOpenXML _ Null = return [] blockToOpenXML opts (Div _ bs) = blocksToOpenXML opts bs blockToOpenXML opts (Header lev (ident,_,_) lst) = do - contents <- withParaProp (pStyle $ "Heading" ++ show lev) $ - blockToOpenXML opts (Para lst) + + paraProps <- withParaProp (pStyle $ "Heading" ++ show lev) $ + getParaProps False + contents <- inlinesToOpenXML opts lst + usedIdents <- gets stSectionIds let bookmarkName = if null ident then uniqueIdent lst usedIdents @@ -525,7 +541,7 @@ blockToOpenXML opts (Header lev (ident,_,_) lst) = do let bookmarkStart = mknode "w:bookmarkStart" [("w:id", id') ,("w:name",bookmarkName)] () let bookmarkEnd = mknode "w:bookmarkEnd" [("w:id", id')] () - return $ [bookmarkStart] ++ contents ++ [bookmarkEnd] + return [mknode "w:p" [] (paraProps ++ [bookmarkStart, bookmarkEnd] ++ contents)] blockToOpenXML opts (Plain lst) = withParaProp (pStyle "Compact") $ blockToOpenXML opts (Para lst) -- title beginning with fig: indicates that the image is a figure @@ -751,9 +767,9 @@ inlineToOpenXML opts (Math mathType str) = do let displayType = if mathType == DisplayMath then DisplayBlock else DisplayInline - case texMathToOMML displayType str of + case writeOMML displayType <$> readTeX str of Right r -> return [r] - Left _ -> inlinesToOpenXML opts (readTeXMath' mathType str) + Left _ -> inlinesToOpenXML opts (texMathToInlines mathType str) inlineToOpenXML opts (Cite _ lst) = inlinesToOpenXML opts lst inlineToOpenXML opts (Code attrs str) = withTextProp (rStyle "VerbatimChar") @@ -814,7 +830,8 @@ inlineToOpenXML opts (Image alt (src, tit)) = do case M.lookup src imgs of Just (_,_,_,elt,_) -> return [elt] Nothing -> do - res <- liftIO $ fetchItem (writerSourceURL opts) src + res <- liftIO $ + fetchItem' (writerMediaBag opts) (writerSourceURL opts) src case res of Left (_ :: E.SomeException) -> do liftIO $ warn $ "Could not find image `" ++ src ++ "', skipping..." diff --git a/src/Text/Pandoc/Writers/DokuWiki.hs b/src/Text/Pandoc/Writers/DokuWiki.hs new file mode 100644 index 000000000..26f9b5f62 --- /dev/null +++ b/src/Text/Pandoc/Writers/DokuWiki.hs @@ -0,0 +1,475 @@ +{- +Copyright (C) 2008-2014 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.DokuWiki + Copyright : Copyright (C) 2008-2014 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : Clare Macrae <clare.macrae@googlemail.com> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to DokuWiki markup. + +DokuWiki: <https://www.dokuwiki.org/dokuwiki> +-} + +{- + [ ] Implement nested blockquotes (currently only ever does one level) + [ ] Implement alignment of text in tables + [ ] Implement comments + [ ] Work through the Dokuwiki spec, and check I've not missed anything out + [ ] Remove dud/duplicate code +-} + +module Text.Pandoc.Writers.DokuWiki ( writeDokuWiki ) where +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Templates (renderTemplate') +import Data.List ( intersect, intercalate, isPrefixOf ) +import Network.URI ( isURI ) +import Control.Monad.State + +data WriterState = WriterState { + stNotes :: Bool -- True if there are notes + , stIndent :: String -- Indent after the marker at the beginning of list items + , stUseTags :: Bool -- True if we should use HTML tags because we're in a complex list + } + +-- | Convert Pandoc to DokuWiki. +writeDokuWiki :: WriterOptions -> Pandoc -> String +writeDokuWiki opts document = + evalState (pandocToDokuWiki opts $ normalize document) + (WriterState { stNotes = False, stIndent = "", stUseTags = False }) + +-- | Return DokuWiki representation of document. +pandocToDokuWiki :: WriterOptions -> Pandoc -> State WriterState String +pandocToDokuWiki opts (Pandoc meta blocks) = do + metadata <- metaToJSON opts + (fmap trimr . blockListToDokuWiki opts) + (inlineListToDokuWiki opts) + meta + body <- blockListToDokuWiki opts blocks + notesExist <- get >>= return . stNotes + let notes = if notesExist + then "" -- TODO Was "\n<references />" Check whether I can really remove this: + -- if it is definitely to do with footnotes, can remove this whole bit + else "" + let main = body ++ notes + let context = defField "body" main + $ defField "toc" (writerTableOfContents opts) + $ metadata + if writerStandalone opts + then return $ renderTemplate' (writerTemplate opts) context + else return main + +-- | Escape special characters for DokuWiki. +escapeString :: String -> String +escapeString = substitute "__" "%%__%%" . + substitute "**" "%%**%%" . + substitute "//" "%%//%%" + +-- | Convert Pandoc block element to DokuWiki. +blockToDokuWiki :: WriterOptions -- ^ Options + -> Block -- ^ Block element + -> State WriterState String + +blockToDokuWiki _ Null = return "" + +blockToDokuWiki opts (Div _attrs bs) = do + contents <- blockListToDokuWiki opts bs + return $ contents ++ "\n" + +blockToDokuWiki opts (Plain inlines) = + inlineListToDokuWiki opts inlines + +-- title beginning with fig: indicates that the image is a figure +-- dokuwiki doesn't support captions - so combine together alt and caption into alt +blockToDokuWiki opts (Para [Image txt (src,'f':'i':'g':':':tit)]) = do + capt <- if null txt + then return "" + else (" " ++) `fmap` inlineListToDokuWiki opts txt + let opt = if null txt + then "" + else "|" ++ if null tit then capt else tit ++ capt + return $ "{{:" ++ src ++ opt ++ "}}\n" + +blockToDokuWiki opts (Para inlines) = do + indent <- gets stIndent + useTags <- gets stUseTags + contents <- inlineListToDokuWiki opts inlines + return $ if useTags + then "<HTML><p></HTML>" ++ contents ++ "<HTML></p></HTML>" + else contents ++ if null indent then "\n" else "" + +blockToDokuWiki _ (RawBlock f str) + | f == Format "dokuwiki" = return str + -- See https://www.dokuwiki.org/wiki:syntax + -- use uppercase HTML tag for block-level content: + | f == Format "html" = return $ "<HTML>\n" ++ str ++ "\n</HTML>" + | otherwise = return "" + +blockToDokuWiki _ HorizontalRule = return "\n----\n" + +blockToDokuWiki opts (Header level _ inlines) = do + -- emphasis, links etc. not allowed in headers, apparently, + -- so we remove formatting: + contents <- inlineListToDokuWiki opts $ removeFormatting inlines + let eqs = replicate ( 7 - level ) '=' + return $ eqs ++ " " ++ contents ++ " " ++ eqs ++ "\n" + +blockToDokuWiki _ (CodeBlock (_,classes,_) str) = do + let at = classes `intersect` ["actionscript", "ada", "apache", "applescript", "asm", "asp", + "autoit", "bash", "blitzbasic", "bnf", "c", "c_mac", "caddcl", "cadlisp", "cfdg", "cfm", + "cpp", "cpp-qt", "csharp", "css", "d", "delphi", "diff", "div", "dos", "eiffel", "fortran", + "freebasic", "gml", "groovy", "html4strict", "idl", "ini", "inno", "io", "java", "java5", + "javascript", "latex", "lisp", "lua", "matlab", "mirc", "mpasm", "mysql", "nsis", "objc", + "ocaml", "ocaml-brief", "oobas", "oracle8", "pascal", "perl", "php", "php-brief", "plsql", + "python", "qbasic", "rails", "reg", "robots", "ruby", "sas", "scheme", "sdlbasic", + "smalltalk", "smarty", "sql", "tcl", "", "thinbasic", "tsql", "vb", "vbnet", "vhdl", + "visualfoxpro", "winbatch", "xml", "xpp", "z80"] + let (beg, end) = if null at + then ("<code" ++ if null classes then ">" else " class=\"" ++ unwords classes ++ "\">", "</code>") + else ("<source lang=\"" ++ head at ++ "\">", "</source>") + return $ beg ++ str ++ end + +blockToDokuWiki opts (BlockQuote blocks) = do + contents <- blockListToDokuWiki opts blocks + if isSimpleBlockQuote blocks + then return $ "> " ++ contents + else return $ "<HTML><blockquote>\n" ++ contents ++ "</blockquote></HTML>" + +blockToDokuWiki opts (Table capt aligns _ headers rows') = do + let alignStrings = map alignmentToString aligns + captionDoc <- if null capt + then return "" + else do + c <- inlineListToDokuWiki opts capt + return $ "" ++ c ++ "\n" + head' <- if all null headers + then return "" + else do + hs <- tableHeaderToDokuWiki opts alignStrings 0 headers + return $ hs ++ "\n" + body' <- zipWithM (tableRowToDokuWiki opts alignStrings) [1..] rows' + return $ captionDoc ++ head' ++ + unlines body' + +blockToDokuWiki opts x@(BulletList items) = do + oldUseTags <- get >>= return . stUseTags + indent <- get >>= return . stIndent + let useTags = oldUseTags || not (isSimpleList x) + if useTags + then do + modify $ \s -> s { stUseTags = True } + contents <- mapM (listItemToDokuWiki opts) items + modify $ \s -> s { stUseTags = oldUseTags } + return $ "<HTML><ul></HTML>\n" ++ vcat contents ++ "<HTML></ul></HTML>\n" + else do + modify $ \s -> s { stIndent = stIndent s ++ " " } + contents <- mapM (listItemToDokuWiki opts) items + modify $ \s -> s { stIndent = indent } + return $ vcat contents ++ if null indent then "\n" else "" + +blockToDokuWiki opts x@(OrderedList attribs items) = do + oldUseTags <- get >>= return . stUseTags + indent <- get >>= return . stIndent + let useTags = oldUseTags || not (isSimpleList x) + if useTags + then do + modify $ \s -> s { stUseTags = True } + contents <- mapM (orderedListItemToDokuWiki opts) items + modify $ \s -> s { stUseTags = oldUseTags } + return $ "<HTML><ol" ++ listAttribsToString attribs ++ "></HTML>\n" ++ vcat contents ++ "<HTML></ol></HTML>\n" + else do + modify $ \s -> s { stIndent = stIndent s ++ " " } + contents <- mapM (orderedListItemToDokuWiki opts) items + modify $ \s -> s { stIndent = indent } + return $ vcat contents ++ if null indent then "\n" else "" + +-- TODO Need to decide how to make definition lists work on dokuwiki - I don't think there +-- is a specific representation of them. +-- TODO This creates double '; ; ' if there is a bullet or ordered list inside a definition list +blockToDokuWiki opts x@(DefinitionList items) = do + oldUseTags <- get >>= return . stUseTags + indent <- get >>= return . stIndent + let useTags = oldUseTags || not (isSimpleList x) + if useTags + then do + modify $ \s -> s { stUseTags = True } + contents <- mapM (definitionListItemToDokuWiki opts) items + modify $ \s -> s { stUseTags = oldUseTags } + return $ "<HTML><dl></HTML>\n" ++ vcat contents ++ "<HTML></dl></HTML>\n" + else do + modify $ \s -> s { stIndent = stIndent s ++ " " } + contents <- mapM (definitionListItemToDokuWiki opts) items + modify $ \s -> s { stIndent = indent } + return $ vcat contents ++ if null indent then "\n" else "" + +-- Auxiliary functions for lists: + +-- | Convert ordered list attributes to HTML attribute string +listAttribsToString :: ListAttributes -> String +listAttribsToString (startnum, numstyle, _) = + let numstyle' = camelCaseToHyphenated $ show numstyle + in (if startnum /= 1 + then " start=\"" ++ show startnum ++ "\"" + else "") ++ + (if numstyle /= DefaultStyle + then " style=\"list-style-type: " ++ numstyle' ++ ";\"" + else "") + +-- | Convert bullet list item (list of blocks) to DokuWiki. +listItemToDokuWiki :: WriterOptions -> [Block] -> State WriterState String +listItemToDokuWiki opts items = do + contents <- blockListToDokuWiki opts items + useTags <- get >>= return . stUseTags + if useTags + then return $ "<HTML><li></HTML>" ++ contents ++ "<HTML></li></HTML>" + else do + indent <- get >>= return . stIndent + return $ indent ++ "* " ++ contents + +-- | Convert ordered list item (list of blocks) to DokuWiki. +-- | TODO Emiminate dreadful duplication of text from listItemToDokuWiki +orderedListItemToDokuWiki :: WriterOptions -> [Block] -> State WriterState String +orderedListItemToDokuWiki opts items = do + contents <- blockListToDokuWiki opts items + useTags <- get >>= return . stUseTags + if useTags + then return $ "<HTML><li></HTML>" ++ contents ++ "<HTML></li></HTML>" + else do + indent <- get >>= return . stIndent + return $ indent ++ "- " ++ contents + +-- | Convert definition list item (label, list of blocks) to DokuWiki. +definitionListItemToDokuWiki :: WriterOptions + -> ([Inline],[[Block]]) + -> State WriterState String +definitionListItemToDokuWiki opts (label, items) = do + labelText <- inlineListToDokuWiki opts label + contents <- mapM (blockListToDokuWiki opts) items + useTags <- get >>= return . stUseTags + if useTags + then return $ "<HTML><dt></HTML>" ++ labelText ++ "<HTML></dt></HTML>\n" ++ + (intercalate "\n" $ map (\d -> "<HTML><dd></HTML>" ++ d ++ "<HTML></dd></HTML>") contents) + else do + indent <- get >>= return . stIndent + return $ indent ++ "* **" ++ labelText ++ "** " ++ concat contents + +-- | True if the list can be handled by simple wiki markup, False if HTML tags will be needed. +isSimpleList :: Block -> Bool +isSimpleList x = + case x of + BulletList items -> all isSimpleListItem items + OrderedList (num, sty, _) items -> all isSimpleListItem items && + num == 1 && sty `elem` [DefaultStyle, Decimal] + DefinitionList items -> all isSimpleListItem $ concatMap snd items + _ -> False + +-- | True if list item can be handled with the simple wiki syntax. False if +-- HTML tags will be needed. +isSimpleListItem :: [Block] -> Bool +isSimpleListItem [] = True +isSimpleListItem [x] = + case x of + Plain _ -> True + Para _ -> True + BulletList _ -> isSimpleList x + OrderedList _ _ -> isSimpleList x + DefinitionList _ -> isSimpleList x + _ -> False +isSimpleListItem [x, y] | isPlainOrPara x = + case y of + BulletList _ -> isSimpleList y + OrderedList _ _ -> isSimpleList y + DefinitionList _ -> isSimpleList y + _ -> False +isSimpleListItem _ = False + +isPlainOrPara :: Block -> Bool +isPlainOrPara (Plain _) = True +isPlainOrPara (Para _) = True +isPlainOrPara _ = False + +isSimpleBlockQuote :: [Block] -> Bool +isSimpleBlockQuote [BlockQuote bs] = isSimpleBlockQuote bs +isSimpleBlockQuote [b] = isPlainOrPara b +isSimpleBlockQuote _ = False + +-- | Concatenates strings with line breaks between them. +vcat :: [String] -> String +vcat = intercalate "\n" + +-- Auxiliary functions for tables: + +-- TODO Eliminate copy-and-pasted code in tableHeaderToDokuWiki and tableRowToDokuWiki +tableHeaderToDokuWiki :: WriterOptions + -> [String] + -> Int + -> [[Block]] + -> State WriterState String +tableHeaderToDokuWiki opts alignStrings rownum cols' = do + let celltype = if rownum == 0 then "" else "" + cols'' <- sequence $ zipWith + (\alignment item -> tableItemToDokuWiki opts celltype alignment item) + alignStrings cols' + return $ "^ " ++ "" ++ joinHeaders cols'' ++ " ^" + +tableRowToDokuWiki :: WriterOptions + -> [String] + -> Int + -> [[Block]] + -> State WriterState String +tableRowToDokuWiki opts alignStrings rownum cols' = do + let celltype = if rownum == 0 then "" else "" + cols'' <- sequence $ zipWith + (\alignment item -> tableItemToDokuWiki opts celltype alignment item) + alignStrings cols' + return $ "| " ++ "" ++ joinColumns cols'' ++ " |" + +alignmentToString :: Alignment -> [Char] +alignmentToString alignment = case alignment of + AlignLeft -> "" + AlignRight -> "" + AlignCenter -> "" + AlignDefault -> "" + +tableItemToDokuWiki :: WriterOptions + -> String + -> String + -> [Block] + -> State WriterState String +-- TODO Fix celltype and align' defined but not used +tableItemToDokuWiki opts _celltype _align' item = do + let mkcell x = "" ++ x ++ "" + contents <- blockListToDokuWiki opts item + return $ mkcell contents + +-- | Concatenates columns together. +joinColumns :: [String] -> String +joinColumns = intercalate " | " + +-- | Concatenates headers together. +joinHeaders :: [String] -> String +joinHeaders = intercalate " ^ " + +-- | Convert list of Pandoc block elements to DokuWiki. +blockListToDokuWiki :: WriterOptions -- ^ Options + -> [Block] -- ^ List of block elements + -> State WriterState String +blockListToDokuWiki opts blocks = + mapM (blockToDokuWiki opts) blocks >>= return . vcat + +-- | Convert list of Pandoc inline elements to DokuWiki. +inlineListToDokuWiki :: WriterOptions -> [Inline] -> State WriterState String +inlineListToDokuWiki opts lst = mapM (inlineToDokuWiki opts) lst >>= return . concat + +-- | Convert Pandoc inline element to DokuWiki. +inlineToDokuWiki :: WriterOptions -> Inline -> State WriterState String + +inlineToDokuWiki opts (Span _attrs ils) = do + contents <- inlineListToDokuWiki opts ils + return contents + +inlineToDokuWiki opts (Emph lst) = do + contents <- inlineListToDokuWiki opts lst + return $ "//" ++ contents ++ "//" + +inlineToDokuWiki opts (Strong lst) = do + contents <- inlineListToDokuWiki opts lst + return $ "**" ++ contents ++ "**" + +inlineToDokuWiki opts (Strikeout lst) = do + contents <- inlineListToDokuWiki opts lst + return $ "<del>" ++ contents ++ "</del>" + +inlineToDokuWiki opts (Superscript lst) = do + contents <- inlineListToDokuWiki opts lst + return $ "<sup>" ++ contents ++ "</sup>" + +inlineToDokuWiki opts (Subscript lst) = do + contents <- inlineListToDokuWiki opts lst + return $ "<sub>" ++ contents ++ "</sub>" + +inlineToDokuWiki opts (SmallCaps lst) = inlineListToDokuWiki opts lst + +inlineToDokuWiki opts (Quoted SingleQuote lst) = do + contents <- inlineListToDokuWiki opts lst + return $ "\8216" ++ contents ++ "\8217" + +inlineToDokuWiki opts (Quoted DoubleQuote lst) = do + contents <- inlineListToDokuWiki opts lst + return $ "\8220" ++ contents ++ "\8221" + +inlineToDokuWiki opts (Cite _ lst) = inlineListToDokuWiki opts lst + +inlineToDokuWiki _ (Code _ str) = + -- In dokuwiki, text surrounded by '' is really just a font statement, i.e. <tt>, + -- and so other formatting can be present inside. + -- However, in pandoc, and markdown, inlined code doesn't contain formatting. + -- So I have opted for using %% to disable all formatting inside inline code blocks. + -- This gives the best results when converting from other formats to dokuwiki, even if + -- the resultand code is a little ugly, for short strings that don't contain formatting + -- characters. + -- It does mean that if pandoc could ever read dokuwiki, and so round-trip the format, + -- any formatting inside inlined code blocks would be lost, or presented incorrectly. + return $ "''%%" ++ str ++ "%%''" + +inlineToDokuWiki _ (Str str) = return $ escapeString str + +inlineToDokuWiki _ (Math _ str) = return $ "<math>" ++ str ++ "</math>" + -- note: str should NOT be escaped + +inlineToDokuWiki _ (RawInline f str) + | f == Format "dokuwiki" = return str + | f == Format "html" = return $ "<html>" ++ str ++ "</html>" + | otherwise = return "" + +inlineToDokuWiki _ (LineBreak) = return "\\\\ " + +inlineToDokuWiki _ Space = return " " + +inlineToDokuWiki opts (Link txt (src, _)) = do + label <- inlineListToDokuWiki opts txt + case txt of + [Str s] | "mailto:" `isPrefixOf` src -> return $ "<" ++ s ++ ">" + | escapeURI s == src -> return src + _ -> if isURI src + then return $ "[[" ++ src ++ "|" ++ label ++ "]]" + else return $ "[[" ++ src' ++ "|" ++ label ++ "]]" + where src' = case src of + '/':xs -> xs -- with leading / it's a + _ -> src -- link to a help page +inlineToDokuWiki opts (Image alt (source, tit)) = do + alt' <- inlineListToDokuWiki opts alt + let txt = if (null tit) + then if null alt + then "" + else "|" ++ alt' + else "|" ++ tit + return $ "{{:" ++ source ++ txt ++ "}}" + +inlineToDokuWiki opts (Note contents) = do + contents' <- blockListToDokuWiki opts contents + modify (\s -> s { stNotes = True }) + return $ "((" ++ contents' ++ "))" + -- note - may not work for notes with multiple blocks diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index b6687c330..2aab7701f 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE PatternGuards, CPP, ScopedTypeVariables #-} +{-# LANGUAGE PatternGuards, CPP, ScopedTypeVariables, ViewPatterns #-} {- Copyright (C) 2010-2014 John MacFarlane <jgm@berkeley.edu> @@ -29,10 +29,10 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of 'Pandoc' documents to EPUB. -} module Text.Pandoc.Writers.EPUB ( writeEPUB ) where -import Data.IORef +import Data.IORef ( IORef, newIORef, readIORef, modifyIORef ) import qualified Data.Map as M import Data.Maybe ( fromMaybe ) -import Data.List ( isInfixOf, intercalate ) +import Data.List ( isPrefixOf, isInfixOf, intercalate ) import System.Environment ( getEnv ) import Text.Printf (printf) import System.FilePath ( (</>), takeExtension, takeFileName ) @@ -40,28 +40,35 @@ import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Char8 as B8 import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.SelfContained ( makeSelfContained ) -import Codec.Archive.Zip +import Codec.Archive.Zip ( emptyArchive, addEntryToArchive, eRelativePath, fromEntry , Entry, toEntry, fromArchive) import Control.Applicative ((<$>)) -import Data.Time.Clock.POSIX -import Data.Time -import System.Locale -import Text.Pandoc.Shared hiding ( Element ) -import qualified Text.Pandoc.Shared as Shared +import Data.Time.Clock.POSIX ( getPOSIXTime ) +import Data.Time (getCurrentTime,UTCTime, formatTime) +import System.Locale ( defaultTimeLocale ) +import Text.Pandoc.Shared ( trimr, renderTags', safeRead, uniqueIdent, trim + , normalizeDate, readDataFile, stringify, warn + , hierarchicalize, fetchItem' ) +import qualified Text.Pandoc.Shared as S (Element(..)) import Text.Pandoc.Builder (fromList, setMeta) -import Text.Pandoc.Options +import Text.Pandoc.Options ( WriterOptions(..) + , HTMLMathMethod(..) + , EPUBVersion(..) + , ObfuscationMethod(NoObfuscation) ) import Text.Pandoc.Definition -import Text.Pandoc.Walk -import Control.Monad.State -import Text.XML.Light hiding (ppTopElement) -import Text.Pandoc.UUID -import Text.Pandoc.Writers.HTML -import Text.Pandoc.Writers.Markdown ( writePlain ) +import Text.Pandoc.Walk (walk, walkM) +import Control.Monad.State (modify, get, execState, State, put, evalState) +import Control.Monad (foldM, when, mplus, liftM) +import Text.XML.Light ( unode, Element(..), unqual, Attr(..), add_attrs + , strContent, lookupAttr, Node(..), QName(..), parseXML + , onlyElems, node, ppElement) +import Text.Pandoc.UUID (getRandomUUID) +import Text.Pandoc.Writers.HTML (writeHtmlString, writeHtml) import Data.Char ( toLower, isDigit, isAlphaNum ) import Network.URI ( unEscapeString ) import Text.Pandoc.MIME (getMimeType) import qualified Control.Exception as E import Text.Blaze.Html.Renderer.Utf8 (renderHtml) -import Text.HTML.TagSoup +import Text.HTML.TagSoup (Tag(TagOpen), fromAttrib, parseTags) -- A Chapter includes a list of blocks and maybe a section -- number offset. Note, some chapters are unnumbered. The section @@ -87,6 +94,7 @@ data EPUBMetadata = EPUBMetadata{ , epubRights :: Maybe String , epubCoverImage :: Maybe String , epubStylesheet :: Maybe Stylesheet + , epubPageDirection :: ProgressionDirection } deriving Show data Stylesheet = StylesheetPath FilePath @@ -115,6 +123,8 @@ data Title = Title{ , titleType :: Maybe String } deriving Show +data ProgressionDirection = LTR | RTL | Default deriving Show + dcName :: String -> QName dcName n = QName n Nothing (Just "dc") @@ -124,20 +134,15 @@ dcNode = node . dcName opfName :: String -> QName opfName n = QName n Nothing (Just "opf") -plainify :: [Inline] -> String -plainify t = - trimr $ writePlain def{ writerStandalone = False } - $ Pandoc nullMeta [Plain $ walk removeNote t] - -removeNote :: Inline -> Inline -removeNote (Note _) = Str "" -removeNote x = x - toId :: FilePath -> String toId = map (\x -> if isAlphaNum x || x == '-' || x == '_' then x else '_') . takeFileName +removeNote :: Inline -> Inline +removeNote (Note _) = Str "" +removeNote x = x + getEPUBMetadata :: WriterOptions -> Meta -> IO EPUBMetadata getEPUBMetadata opts meta = do let md = metadataFromMeta opts meta @@ -172,7 +177,7 @@ getEPUBMetadata opts meta = do if any (\c -> creatorRole c == Just "aut") $ epubCreator m then return m else do - let authors' = map plainify $ docAuthors meta + let authors' = map stringify $ docAuthors meta let toAuthor name = Creator{ creatorText = name , creatorRole = Just "aut" , creatorFileAs = Nothing } @@ -221,8 +226,8 @@ addMetadataFromXML _ md = md metaValueToString :: MetaValue -> String metaValueToString (MetaString s) = s -metaValueToString (MetaInlines ils) = plainify ils -metaValueToString (MetaBlocks bs) = plainify $ query (:[]) bs +metaValueToString (MetaInlines ils) = stringify ils +metaValueToString (MetaBlocks bs) = stringify bs metaValueToString (MetaBool b) = show b metaValueToString _ = "" @@ -294,6 +299,7 @@ metadataFromMeta opts meta = EPUBMetadata{ , epubRights = rights , epubCoverImage = coverImage , epubStylesheet = stylesheet + , epubPageDirection = pageDirection } where identifiers = getIdentifier meta titles = getTitle meta @@ -316,6 +322,14 @@ metadataFromMeta opts meta = EPUBMetadata{ stylesheet = (StylesheetContents <$> writerEpubStylesheet opts) `mplus` ((StylesheetPath . metaValueToString) <$> lookupMeta "stylesheet" meta) + pageDirection = maybe Default stringToPageDirection + (lookupMeta "page-progression-direction" meta) + stringToPageDirection (metaValueToString -> s) = + case s of + "ltr" -> LTR + "rtl" -> RTL + _ -> Default + -- | Produce an EPUB file from a Pandoc document. writeEPUB :: WriterOptions -- ^ Writer options @@ -339,7 +353,7 @@ writeEPUB opts doc@(Pandoc meta _) = do if epub3 then MathML Nothing else writerHTMLMathMethod opts - , writerWrapText = False } + , writerWrapText = True } metadata <- getEPUBMetadata opts' meta -- cover page @@ -366,7 +380,8 @@ writeEPUB opts doc@(Pandoc meta _) = do walkM (transformBlock opts' mediaRef) pics <- readIORef mediaRef let readPicEntry entries (oldsrc, newsrc) = do - res <- fetchItem (writerSourceURL opts') oldsrc + res <- fetchItem' (writerMediaBag opts') + (writerSourceURL opts') oldsrc case res of Left _ -> do warn $ "Could not find media `" ++ oldsrc ++ "', skipping..." @@ -379,6 +394,12 @@ writeEPUB opts doc@(Pandoc meta _) = do let mkFontEntry f = mkEntry (takeFileName f) `fmap` B.readFile f fontEntries <- mapM mkFontEntry $ writerEpubFonts opts' + -- set page progression direction + let progressionDirection = case epubPageDirection metadata of + LTR -> "ltr" + RTL -> "rtl" + Default -> "default" + -- body pages -- add level 1 header to beginning if none there @@ -468,7 +489,7 @@ writeEPUB opts doc@(Pandoc meta _) = do [] -> case epubTitle metadata of [] -> "UNTITLED" (x:_) -> titleText x - x -> plainify x + x -> stringify x let uuid = case epubIdentifier metadata of (x:_) -> identifierText x -- use first identifier as UUID [] -> error "epubIdentifier is null" -- shouldn't happen @@ -498,7 +519,8 @@ writeEPUB opts doc@(Pandoc meta _) = do (pictureNode x)]) ++ map pictureNode picEntries ++ map fontNode fontEntries - , unode "spine" ! [("toc","ncx")] $ + , unode "spine" ! [("toc","ncx") + ,("page-progression-direction", progressionDirection)] $ case epubCoverImage metadata of Nothing -> [] Just _ -> [ unode "itemref" ! @@ -529,25 +551,25 @@ writeEPUB opts doc@(Pandoc meta _) = do let tocLevel = writerTOCDepth opts let navPointNode :: (Int -> String -> String -> [Element] -> Element) - -> Shared.Element -> State Int Element - navPointNode formatter (Sec _ nums (ident,_,_) ils children) = do + -> S.Element -> State Int Element + navPointNode formatter (S.Sec _ nums (ident,_,_) ils children) = do n <- get modify (+1) let showNums :: [Int] -> String showNums = intercalate "." . map show - let tit' = plainify ils + let tit' = stringify ils let tit = if writerNumberSections opts && not (null nums) then showNums nums ++ " " ++ tit' else tit' let src = case lookup ident reftable of Just x -> x Nothing -> error (ident ++ " not found in reftable") - let isSec (Sec lev _ _ _ _) = lev <= tocLevel + let isSec (S.Sec lev _ _ _ _) = lev <= tocLevel isSec _ = False let subsecs = filter isSec children subs <- mapM (navPointNode formatter) subsecs return $ formatter n tit src subs - navPointNode _ (Blk _) = error "navPointNode encountered Blk" + navPointNode _ (S.Blk _) = error "navPointNode encountered Blk" let navMapFormatter :: Int -> String -> String -> [Element] -> Element navMapFormatter n tit src subs = unode "navPoint" ! @@ -558,7 +580,7 @@ writeEPUB opts doc@(Pandoc meta _) = do ] ++ subs let tpNode = unode "navPoint" ! [("id", "navPoint-0")] $ - [ unode "navLabel" $ unode "text" (plainify $ docTitle meta) + [ unode "navLabel" $ unode "text" (stringify $ docTitle meta) , unode "content" ! [("src","title_page.xhtml")] $ () ] let tocData = UTF8.fromStringLazy $ ppTopElement $ @@ -587,7 +609,7 @@ writeEPUB opts doc@(Pandoc meta _) = do navXhtmlFormatter n tit src subs = unode "li" ! [("id", "toc-li-" ++ show n)] $ (unode "a" ! [("href",src)] - $ (unode "span" tit)) + $ tit) : case subs of [] -> [] (_:_) -> [unode "ol" ! [("class","toc")] $ subs] @@ -751,7 +773,7 @@ transformTag :: WriterOptions -> Tag String -> IO (Tag String) transformTag opts mediaRef tag@(TagOpen name attr) - | name == "video" || name == "source" || name == "img" = do + | name `elem` ["video", "source", "img", "audio"] = do let src = fromAttrib "src" tag let poster = fromAttrib "poster" tag let oldsrc = maybe src (</> src) $ writerSourceURL opts @@ -784,7 +806,7 @@ transformBlock opts mediaRef (RawBlock fmt raw) | fmt == Format "html" = do let tags = parseTags raw tags' <- mapM (transformTag opts mediaRef) tags - return $ RawBlock fmt (renderTags tags') + return $ RawBlock fmt (renderTags' tags') transformBlock _ _ b = return b transformInline :: WriterOptions @@ -798,8 +820,13 @@ transformInline opts mediaRef (Image lab (src,tit)) = do return $ Image lab (newsrc, tit) transformInline opts _ (x@(Math _ _)) | WebTeX _ <- writerHTMLMathMethod opts = do - raw <- makeSelfContained Nothing $ writeHtmlInline opts x + raw <- makeSelfContained opts $ writeHtmlInline opts x return $ RawInline (Format "html") raw +transformInline opts mediaRef (RawInline fmt raw) + | fmt == Format "html" = do + let tags = parseTags raw + tags' <- mapM (transformTag opts mediaRef) tags + return $ RawInline fmt (renderTags' tags') transformInline _ _ x = return x writeHtmlInline :: WriterOptions -> Inline -> String @@ -825,11 +852,11 @@ ppTopElement = ("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" ++) . unEntity . unEntity (x:xs) = x : unEntity xs mediaTypeOf :: FilePath -> Maybe String -mediaTypeOf x = case getMimeType x of - Just y@('i':'m':'a':'g':'e':_) -> Just y - Just y@('v':'i':'d':'e':'o':_) -> Just y - Just y@('a':'u':'d':'i':'o':_) -> Just y - _ -> Nothing +mediaTypeOf x = + let mediaPrefixes = ["image", "video", "audio"] in + case getMimeType x of + Just y | any (`isPrefixOf` y) mediaPrefixes -> Just y + _ -> Nothing data IdentState = IdentState{ chapterNumber :: Int, diff --git a/src/Text/Pandoc/Writers/FB2.hs b/src/Text/Pandoc/Writers/FB2.hs index 803617f95..233b8b32b 100644 --- a/src/Text/Pandoc/Writers/FB2.hs +++ b/src/Text/Pandoc/Writers/FB2.hs @@ -1,3 +1,5 @@ +{-# LANGUAGE PatternGuards #-} + {- Copyright (c) 2011-2012, Sergey Astanin All rights reserved. @@ -28,8 +30,8 @@ module Text.Pandoc.Writers.FB2 (writeFB2) where import Control.Monad.State (StateT, evalStateT, get, modify) import Control.Monad.State (liftM, liftM2, liftIO) import Data.ByteString.Base64 (encode) -import Data.Char (toUpper, toLower, isSpace, isAscii, isControl) -import Data.List (intersperse, intercalate, isPrefixOf) +import Data.Char (toLower, isSpace, isAscii, isControl) +import Data.List (intersperse, intercalate, isPrefixOf, stripPrefix) import Data.Either (lefts, rights) import Network.Browser (browse, request, setAllowRedirects, setOutHandler) import Network.HTTP (catchIO_, getRequest, getHeaders, getResponseBody) @@ -44,8 +46,7 @@ import qualified Text.XML.Light.Cursor as XC import Text.Pandoc.Definition import Text.Pandoc.Options (WriterOptions(..), HTMLMathMethod(..), def) -import Text.Pandoc.Shared (orderedListMarkers, isHeaderBlock) -import Text.Pandoc.Walk +import Text.Pandoc.Shared (orderedListMarkers, isHeaderBlock, capitalize) -- | Data to be written at the end of the document: -- (foot)notes, URLs, references, images. @@ -253,22 +254,21 @@ readDataURI :: String -- ^ URI -> Maybe (String,String,Bool,String) -- ^ Maybe (mime,charset,isBase64,data) readDataURI uri = - let prefix = "data:" - in if not (prefix `isPrefixOf` uri) - then Nothing - else - let rest = drop (length prefix) uri - meta = takeWhile (/= ',') rest -- without trailing ',' - uridata = drop (length meta + 1) rest - parts = split (== ';') meta - (mime,cs,enc)=foldr upd ("text/plain","US-ASCII",False) parts - in Just (mime,cs,enc,uridata) + case stripPrefix "data:" uri of + Nothing -> Nothing + Just rest -> + let meta = takeWhile (/= ',') rest -- without trailing ',' + uridata = drop (length meta + 1) rest + parts = split (== ';') meta + (mime,cs,enc)=foldr upd ("text/plain","US-ASCII",False) parts + in Just (mime,cs,enc,uridata) + where upd str m@(mime,cs,enc) - | isMimeType str = (str,cs,enc) - | "charset=" `isPrefixOf` str = (mime,drop (length "charset=") str,enc) - | str == "base64" = (mime,cs,True) - | otherwise = m + | isMimeType str = (str,cs,enc) + | Just str' <- stripPrefix "charset=" str = (mime,str',enc) + | str == "base64" = (mime,cs,True) + | otherwise = m -- Without parameters like ;charset=...; see RFC 2045, 5.1 isMimeType :: String -> Bool @@ -296,7 +296,6 @@ fetchURL url = do let content_type = lookupHeader HdrContentType (getHeaders r) content <- liftM (Just . toStr . encode . toBS) . getResponseBody $ Right r return $ liftM2 (,) content_type content - where toBS :: String -> B.ByteString toBS = B.pack . map (toEnum . fromEnum) @@ -421,10 +420,6 @@ indent = indentBlock indentLines ins = let lns = split isLineBreak ins :: [[Inline]] in intercalate [LineBreak] $ map ((Str spacer):) lns -capitalize :: Inline -> Inline -capitalize (Str xs) = Str $ map toUpper xs -capitalize x = x - -- | Convert a Pandoc's Inline element to FictionBook XML representation. toXml :: Inline -> FBM [Content] toXml (Str s) = return [txt s] @@ -434,7 +429,7 @@ toXml (Strong ss) = list `liftM` wrap "strong" ss toXml (Strikeout ss) = list `liftM` wrap "strikethrough" ss toXml (Superscript ss) = list `liftM` wrap "sup" ss toXml (Subscript ss) = list `liftM` wrap "sub" ss -toXml (SmallCaps ss) = cMapM toXml $ walk capitalize ss +toXml (SmallCaps ss) = cMapM toXml $ capitalize ss toXml (Quoted SingleQuote ss) = do -- FIXME: should be language-specific inner <- cMapM toXml ss return $ [txt "‘"] ++ inner ++ [txt "’"] diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index 9a26cf2ac..36ce2ba21 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -63,6 +63,7 @@ import Text.XML.Light.Output import System.FilePath (takeExtension) import Data.Monoid import Data.Aeson (Value) +import Control.Applicative ((<$>)) data WriterState = WriterState { stNotes :: [Html] -- ^ List of notes @@ -235,6 +236,9 @@ showSecNum = concat . intersperse "." . map show -- | Converts an Element to a list item for a table of contents, -- retrieving the appropriate identifier from state. elementToListItem :: WriterOptions -> Element -> State WriterState (Maybe Html) +-- Don't include the empty headers created in slide shows +-- shows when an hrule is used to separate slides without a new title: +elementToListItem _ (Sec _ _ _ [Str "\0"] _) = return Nothing elementToListItem opts (Sec lev num (id',classes,_) headerText subsecs) | lev <= writerTOCDepth opts = do let num' = zipWith (+) num (writerNumberOffset opts ++ repeat 0) @@ -429,9 +433,11 @@ blockToHtml opts (Div attr@(_,classes,_) bs) = do let contents' = nl opts >> contents >> nl opts return $ if "notes" `elem` classes - then case writerSlideVariant opts of - RevealJsSlides -> addAttrs opts attr $ H5.aside $ contents' - NoSlides -> addAttrs opts attr $ H.div $ contents' + then let opts' = opts{ writerIncremental = False } in + -- we don't want incremental output inside speaker notes + case writerSlideVariant opts of + RevealJsSlides -> addAttrs opts' attr $ H5.aside $ contents' + NoSlides -> addAttrs opts' attr $ H.div $ contents' _ -> mempty else addAttrs opts attr $ H.div $ contents' blockToHtml _ (RawBlock f str) @@ -698,18 +704,18 @@ inlineToHtml opts inline = else DisplayBlock let conf = useShortEmptyTags (const False) defaultConfigPP - case texMathToMathML dt str of - Right r -> return $ preEscapedString $ - ppcElement conf r - Left _ -> inlineListToHtml opts - (readTeXMath' t str) >>= return . - (H.span ! A.class_ "math") + case writeMathML dt <$> readTeX str of + Right r -> return $ preEscapedString $ + ppcElement conf r + Left _ -> inlineListToHtml opts + (texMathToInlines t str) >>= + return . (H.span ! A.class_ "math") MathJax _ -> return $ H.span ! A.class_ "math" $ toHtml $ case t of InlineMath -> "\\(" ++ str ++ "\\)" DisplayMath -> "\\[" ++ str ++ "\\]" PlainMath -> do - x <- inlineListToHtml opts (readTeXMath' t str) + x <- inlineListToHtml opts (texMathToInlines t str) let m = H.span ! A.class_ "math" $ x let brtag = if writerHtml5 opts then H5.br else H.br return $ case t of @@ -723,10 +729,6 @@ inlineToHtml opts inline = _ -> return mempty | f == Format "html" -> return $ preEscapedString str | otherwise -> return mempty - (Link [Str str] (s,_)) | "mailto:" `isPrefixOf` s && - s == escapeURI ("mailto" ++ str) -> - -- autolink - return $ obfuscateLink opts str s (Link txt (s,_)) | "mailto:" `isPrefixOf` s -> do linkText <- inlineListToHtml opts txt return $ obfuscateLink opts (renderHtml linkText) s diff --git a/src/Text/Pandoc/Writers/Haddock.hs b/src/Text/Pandoc/Writers/Haddock.hs index 1c82839d0..14f398da9 100644 --- a/src/Text/Pandoc/Writers/Haddock.hs +++ b/src/Text/Pandoc/Writers/Haddock.hs @@ -39,7 +39,7 @@ import Text.Pandoc.Options import Data.List ( intersperse, transpose ) import Text.Pandoc.Pretty import Control.Monad.State -import Text.Pandoc.Readers.TeXMath (readTeXMath') +import Text.Pandoc.Readers.TeXMath (texMathToInlines) import Network.URI (isURI) import Data.Default @@ -319,7 +319,7 @@ inlineToHaddock opts (Math mt str) = do let adjust x = case mt of DisplayMath -> cr <> x <> cr InlineMath -> x - adjust `fmap` (inlineListToHaddock opts $ readTeXMath' mt str) + adjust `fmap` (inlineListToHaddock opts $ texMathToInlines mt str) inlineToHaddock _ (RawInline f str) | f == "haddock" = return $ text str | otherwise = return empty diff --git a/src/Text/Pandoc/Writers/ICML.hs b/src/Text/Pandoc/Writers/ICML.hs index 19d486b25..ae20efd4b 100644 --- a/src/Text/Pandoc/Writers/ICML.hs +++ b/src/Text/Pandoc/Writers/ICML.hs @@ -42,7 +42,7 @@ type WS a = State WriterState a defaultWriterState :: WriterState defaultWriterState = WriterState{ - blockStyles = Set.empty + blockStyles = Set.empty , inlineStyles = Set.empty , links = [] , listDepth = 1 @@ -267,7 +267,7 @@ hyperlinksToDoc (x:xs) = hyp x $$ hyperlinksToDoc xs $ inTags False "BorderColor" [("type","enumeration")] (text "Black") $$ (inTags False "Destination" [("type","object")] $ text $ "HyperlinkURLDestination/"++(escapeStringForXML url)) - + -- | Convert a list of Pandoc blocks to ICML. blocksToICML :: WriterOptions -> Style -> [Block] -> WS Doc @@ -352,7 +352,7 @@ listItemsToICML opts listType style attribs (first:rest) = do -- | Convert a list of blocks to ICML list items. listItemToICML :: WriterOptions -> Style -> Bool-> Maybe ListAttributes -> [Block] -> WS Doc listItemToICML opts style isFirst attribs item = - let makeNumbStart (Just (beginsWith, numbStl, _)) = + let makeNumbStart (Just (beginsWith, numbStl, _)) = let doN DefaultStyle = [] doN LowerRoman = [lowerRomanName] doN UpperRoman = [upperRomanName] @@ -467,7 +467,7 @@ parStyle opts style lst = -- | Wrap a Doc in an ICML Character Style. charStyle :: Style -> Doc -> WS Doc -charStyle style content = +charStyle style content = let (stlStr, attrs) = styleToStrAttr style doc = inTags True "CharacterStyleRange" attrs $ inTagsSimple "Content" $ flush content in do diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs index f2f7438c4..d140932a7 100644 --- a/src/Text/Pandoc/Writers/LaTeX.hs +++ b/src/Text/Pandoc/Writers/LaTeX.hs @@ -1,4 +1,5 @@ -{-# LANGUAGE OverloadedStrings, ScopedTypeVariables #-} +{-# LANGUAGE OverloadedStrings, ScopedTypeVariables, + PatternGuards #-} {- Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> @@ -37,7 +38,7 @@ import Text.Pandoc.Options import Text.Pandoc.Templates import Text.Printf ( printf ) import Network.URI ( isURI, unEscapeString ) -import Data.List ( (\\), isSuffixOf, isInfixOf, +import Data.List ( (\\), isSuffixOf, isInfixOf, stripPrefix, isPrefixOf, intercalate, intersperse ) import Data.Char ( toLower, isPunctuation, isAscii, isLetter, isDigit, ord ) import Data.Maybe ( fromMaybe ) @@ -220,6 +221,7 @@ stringToLaTeX ctx (x:xs) = do '>' -> "\\textgreater{}" ++ rest '[' -> "{[}" ++ rest -- to avoid interpretation as ']' -> "{]}" ++ rest -- optional arguments + '\'' | ctx == CodeString -> "\\textquotesingle{}" ++ rest '\160' -> "~" ++ rest '\x2026' -> "\\ldots{}" ++ rest '\x2018' | ligatures -> "`" ++ rest @@ -470,19 +472,18 @@ blockToLaTeX (Table caption aligns widths heads rows) = do captionText <- inlineListToLaTeX caption let capt = if isEmpty captionText then empty - else text "\\addlinespace" - $$ text "\\caption" <> braces captionText + else text "\\caption" <> braces captionText <> "\\\\" rows' <- mapM (tableRowToLaTeX False aligns widths) rows let colDescriptors = text $ concat $ map toColDescriptor aligns modify $ \s -> s{ stTable = True } return $ "\\begin{longtable}[c]" <> braces ("@{}" <> colDescriptors <> "@{}") -- the @{} removes extra space at beginning and end + $$ capt $$ "\\toprule\\addlinespace" $$ headers $$ vcat rows' $$ "\\bottomrule" - $$ capt $$ "\\end{longtable}" toColDescriptor :: Alignment -> String @@ -742,7 +743,7 @@ inlineToLaTeX (Quoted qt lst) = do else char '\x2018' <> inner <> char '\x2019' inlineToLaTeX (Str str) = liftM text $ stringToLaTeX TextString str inlineToLaTeX (Math InlineMath str) = - return $ char '$' <> text str <> char '$' + return $ "\\(" <> text str <> "\\)" inlineToLaTeX (Math DisplayMath str) = return $ "\\[" <> text str <> "\\]" inlineToLaTeX (RawInline f str) @@ -757,10 +758,17 @@ inlineToLaTeX (Link txt ('#':ident, _)) = do return $ text "\\hyperref" <> brackets (text lab) <> braces contents inlineToLaTeX (Link txt (src, _)) = case txt of - [Str x] | x == src -> -- autolink + [Str x] | escapeURI x == src -> -- autolink do modify $ \s -> s{ stUrl = True } - src' <- stringToLaTeX URLString x + src' <- stringToLaTeX URLString src return $ text $ "\\url{" ++ src' ++ "}" + [Str x] | Just rest <- stripPrefix "mailto:" src, + escapeURI x == rest -> -- email autolink + do modify $ \s -> s{ stUrl = True } + src' <- stringToLaTeX URLString src + contents <- inlineListToLaTeX txt + return $ "\\href" <> braces (text src') <> + braces ("\\nolinkurl" <> braces contents) _ -> do contents <- inlineListToLaTeX txt src' <- stringToLaTeX URLString src return $ text ("\\href{" ++ src' ++ "}{") <> diff --git a/src/Text/Pandoc/Writers/Man.hs b/src/Text/Pandoc/Writers/Man.hs index 41eb3e5be..6b2c4c200 100644 --- a/src/Text/Pandoc/Writers/Man.hs +++ b/src/Text/Pandoc/Writers/Man.hs @@ -36,7 +36,8 @@ import Text.Pandoc.Writers.Shared import Text.Pandoc.Options import Text.Pandoc.Readers.TeXMath import Text.Printf ( printf ) -import Data.List ( isPrefixOf, intersperse, intercalate ) +import Data.List ( stripPrefix, intersperse, intercalate ) +import Data.Maybe (fromMaybe) import Text.Pandoc.Pretty import Text.Pandoc.Builder (deleteMeta) import Control.Monad.State @@ -331,9 +332,9 @@ inlineToMan _ (Code _ str) = return $ text $ "\\f[C]" ++ escapeCode str ++ "\\f[]" inlineToMan _ (Str str) = return $ text $ escapeString str inlineToMan opts (Math InlineMath str) = - inlineListToMan opts $ readTeXMath' InlineMath str + inlineListToMan opts $ texMathToInlines InlineMath str inlineToMan opts (Math DisplayMath str) = do - contents <- inlineListToMan opts $ readTeXMath' DisplayMath str + contents <- inlineListToMan opts $ texMathToInlines DisplayMath str return $ cr <> text ".RS" $$ contents $$ text ".RE" inlineToMan _ (RawInline f str) | f == Format "man" = return $ text str @@ -343,7 +344,7 @@ inlineToMan _ (LineBreak) = return $ inlineToMan _ Space = return space inlineToMan opts (Link txt (src, _)) = do linktext <- inlineListToMan opts txt - let srcSuffix = if isPrefixOf "mailto:" src then drop 7 src else src + let srcSuffix = fromMaybe src (stripPrefix "mailto:" src) return $ case txt of [Str s] | escapeURI s == srcSuffix -> diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs index a67271a5d..95d4db29b 100644 --- a/src/Text/Pandoc/Writers/Markdown.hs +++ b/src/Text/Pandoc/Writers/Markdown.hs @@ -37,16 +37,17 @@ import Text.Pandoc.Templates (renderTemplate') import Text.Pandoc.Shared import Text.Pandoc.Writers.Shared import Text.Pandoc.Options -import Text.Pandoc.Parsing hiding (blankline, char, space) -import Data.List ( group, isPrefixOf, find, intersperse, transpose, sortBy ) +import Text.Pandoc.Parsing hiding (blankline, blanklines, char, space) +import Data.Maybe (fromMaybe) +import Data.List ( group, stripPrefix, find, intersperse, transpose, sortBy ) import Data.Char ( isSpace, isPunctuation ) import Data.Ord ( comparing ) import Text.Pandoc.Pretty import Control.Monad.State import qualified Data.Set as Set import Text.Pandoc.Writers.HTML (writeHtmlString) -import Text.Pandoc.Readers.TeXMath (readTeXMath') -import Text.HTML.TagSoup (renderTags, parseTags, isTagText, Tag(..)) +import Text.Pandoc.Readers.TeXMath (texMathToInlines) +import Text.HTML.TagSoup (parseTags, isTagText, Tag(..)) import Network.URI (isURI) import Data.Default import Data.Yaml (Value(Object,String,Array,Bool,Number)) @@ -77,26 +78,15 @@ writePlain :: WriterOptions -> Pandoc -> String writePlain opts document = evalState (pandocToMarkdown opts{ writerExtensions = Set.delete Ext_escaped_line_breaks $ + Set.delete Ext_pipe_tables $ + Set.delete Ext_raw_html $ + Set.delete Ext_markdown_in_html_blocks $ + Set.delete Ext_raw_tex $ + Set.delete Ext_footnotes $ + Set.delete Ext_tex_math_dollars $ + Set.delete Ext_citations $ writerExtensions opts } - document') def{ stPlain = True } - where document' = plainify document - -plainify :: Pandoc -> Pandoc -plainify = walk go - where go :: Inline -> Inline - go (Emph xs) = SmallCaps xs - go (Strong xs) = SmallCaps xs - go (Strikeout xs) = SmallCaps xs - go (Superscript xs) = SmallCaps xs - go (Subscript xs) = SmallCaps xs - go (SmallCaps xs) = SmallCaps xs - go (Code _ s) = Str s - go (Math _ s) = Str s - go (RawInline _ _) = Str "" - go (Link xs _) = SmallCaps xs - go (Image xs _) = SmallCaps $ [Str "["] ++ xs ++ [Str "]"] - go (Cite _ cits) = SmallCaps cits - go x = x + document) def{ stPlain = True } pandocTitleBlock :: Doc -> [Doc] -> Doc -> Doc pandocTitleBlock tit auths dat = @@ -187,7 +177,7 @@ pandocToMarkdown opts (Pandoc meta blocks) = do then tableOfContents opts headerBlocks else empty -- Strip off final 'references' header if markdown citations enabled - let blocks' = if not isPlain && isEnabled Ext_citations opts + let blocks' = if isEnabled Ext_citations opts then case reverse blocks of (Div (_,["references"],_) _):xs -> reverse xs _ -> blocks @@ -308,12 +298,12 @@ blockToMarkdown :: WriterOptions -- ^ Options -> State WriterState Doc blockToMarkdown _ Null = return empty blockToMarkdown opts (Div attrs ils) = do - isPlain <- gets stPlain contents <- blockListToMarkdown opts ils - return $ if isPlain || not (isEnabled Ext_markdown_in_html_blocks opts) - then contents <> blankline - else tagWithAttrs "div" attrs <> blankline <> + return $ if isEnabled Ext_raw_html opts && + isEnabled Ext_markdown_in_html_blocks opts + then tagWithAttrs "div" attrs <> blankline <> contents <> blankline <> "</div>" <> blankline + else contents <> blankline blockToMarkdown opts (Plain inlines) = do contents <- inlineListToMarkdown opts inlines -- escape if para starts with ordered list marker @@ -337,21 +327,22 @@ blockToMarkdown opts (Para inlines) = (<> blankline) `fmap` blockToMarkdown opts (Plain inlines) blockToMarkdown opts (RawBlock f str) | f == "html" = do - st <- get - if stPlain st - then return empty - else return $ if isEnabled Ext_markdown_attribute opts + plain <- gets stPlain + return $ if plain + then empty + else if isEnabled Ext_markdown_attribute opts then text (addMarkdownAttribute str) <> text "\n" else text str <> text "\n" | f `elem` ["latex", "tex", "markdown"] = do - st <- get - if stPlain st - then return empty - else return $ text str <> text "\n" + plain <- gets stPlain + return $ if plain + then empty + else text str <> text "\n" blockToMarkdown _ (RawBlock _ _) = return empty -blockToMarkdown _ HorizontalRule = - return $ blankline <> text "* * * * *" <> blankline +blockToMarkdown opts HorizontalRule = do + return $ blankline <> text (replicate (writerColumns opts) '-') <> blankline blockToMarkdown opts (Header level attr inlines) = do + plain <- gets stPlain -- we calculate the id that would be used by auto_identifiers -- so we know whether to print an explicit identifier ids <- gets stIds @@ -366,19 +357,23 @@ blockToMarkdown opts (Header level attr inlines) = do _ | isEnabled Ext_header_attributes opts -> space <> attrsToMarkdown attr | otherwise -> empty - contents <- inlineListToMarkdown opts inlines - st <- get + contents <- inlineListToMarkdown opts $ + if level == 1 && plain + then capitalize inlines + else inlines let setext = writerSetextHeaders opts return $ nowrap $ case level of - 1 | setext -> + 1 | plain -> blanklines 3 <> contents <> blanklines 2 + | setext -> contents <> attr' <> cr <> text (replicate (offset contents) '=') <> blankline - 2 | setext -> + 2 | plain -> blanklines 2 <> contents <> blankline + | setext -> contents <> attr' <> cr <> text (replicate (offset contents) '-') <> blankline -- ghc interprets '#' characters in column 1 as linenum specifiers. - _ | stPlain st || isEnabled Ext_literate_haskell opts -> + _ | plain || isEnabled Ext_literate_haskell opts -> contents <> blankline _ -> text (replicate level '#') <> space <> contents <> attr' <> blankline blockToMarkdown opts (CodeBlock (_,classes,_) str) @@ -405,17 +400,15 @@ blockToMarkdown opts (CodeBlock attribs str) = return $ attrs = if isEnabled Ext_fenced_code_attributes opts then nowrap $ " " <> attrsToMarkdown attribs else case attribs of - (_,[cls],_) -> " " <> text cls - _ -> empty + (_,(cls:_),_) -> " " <> text cls + _ -> empty blockToMarkdown opts (BlockQuote blocks) = do - st <- get + plain <- gets stPlain -- if we're writing literate haskell, put a space before the bird tracks -- so they won't be interpreted as lhs... let leader = if isEnabled Ext_literate_haskell opts then " > " - else if stPlain st - then " " - else "> " + else if plain then " " else "> " contents <- blockListToMarkdown opts blocks return $ (prefixed leader contents) <> blankline blockToMarkdown opts t@(Table caption aligns widths headers rows) = do @@ -471,7 +464,7 @@ addMarkdownAttribute :: String -> String addMarkdownAttribute s = case span isTagText $ reverse $ parseTags s of (xs,(TagOpen t attrs:rest)) -> - renderTags $ reverse rest ++ (TagOpen t attrs' : reverse xs) + renderTags' $ reverse rest ++ (TagOpen t attrs' : reverse xs) where attrs' = ("markdown","1"):[(x,y) | (x,y) <- attrs, x /= "markdown"] _ -> s @@ -609,8 +602,19 @@ definitionListItemToMarkdown opts (label, defs) = do let sps = case writerTabStop opts - 3 of n | n > 0 -> text $ replicate n ' ' _ -> text " " - let contents = vcat $ map (\d -> hang tabStop (leader <> sps) $ vcat d <> cr) defs' - return $ nowrap labelText <> cr <> contents <> cr + if isEnabled Ext_compact_definition_lists opts + then do + let contents = vcat $ map (\d -> hang tabStop (leader <> sps) + $ vcat d <> cr) defs' + return $ nowrap labelText <> cr <> contents <> cr + else do + let contents = vcat $ map (\d -> hang tabStop (leader <> sps) + $ vcat d <> cr) defs' + let isTight = case defs of + ((Plain _ : _): _) -> True + _ -> False + return $ blankline <> nowrap labelText <> + (if isTight then cr else blankline) <> contents <> blankline else do return $ nowrap labelText <> text " " <> cr <> vsep (map vsep defs') <> blankline @@ -625,15 +629,21 @@ blockListToMarkdown opts blocks = -- code block will be treated as a list continuation paragraph where fixBlocks (b : CodeBlock attr x : rest) | (not (isEnabled Ext_fenced_code_blocks opts) || attr == nullAttr) - && isListBlock b = - b : RawBlock "html" "<!-- -->\n" : CodeBlock attr x : - fixBlocks rest + && isListBlock b = b : commentSep : CodeBlock attr x : + fixBlocks rest + fixBlocks (b1@(BulletList _) : b2@(BulletList _) : bs) = + b1 : commentSep : fixBlocks (b2:bs) + fixBlocks (b1@(OrderedList _ _) : b2@(OrderedList _ _) : bs) = + b1 : commentSep : fixBlocks (b2:bs) + fixBlocks (b1@(DefinitionList _) : b2@(DefinitionList _) : bs) = + b1 : commentSep : fixBlocks (b2:bs) fixBlocks (x : xs) = x : fixBlocks xs fixBlocks [] = [] isListBlock (BulletList _) = True isListBlock (OrderedList _ _) = True isListBlock (DefinitionList _) = True isListBlock _ = False + commentSep = RawBlock "html" "<!-- -->\n" -- | Get reference for target; if none exists, create unique one and return. -- Prefer label if possible; otherwise, generate a unique key. @@ -657,7 +667,11 @@ getReference label (src, tit) = do -- | Convert list of Pandoc inline elements to markdown. inlineListToMarkdown :: WriterOptions -> [Inline] -> State WriterState Doc inlineListToMarkdown opts lst = - mapM (inlineToMarkdown opts) lst >>= return . cat + mapM (inlineToMarkdown opts) (avoidBadWraps lst) >>= return . cat + where avoidBadWraps [] = [] + avoidBadWraps (Space:Str (c:cs):xs) + | c `elem` "-*+>" = Str (' ':c:cs) : avoidBadWraps xs + avoidBadWraps (x:xs) = x : avoidBadWraps xs escapeSpaces :: Inline -> Inline escapeSpaces (Str s) = Str $ substitute " " "\\ " s @@ -667,52 +681,67 @@ escapeSpaces x = x -- | Convert Pandoc inline element to markdown. inlineToMarkdown :: WriterOptions -> Inline -> State WriterState Doc inlineToMarkdown opts (Span attrs ils) = do - st <- get contents <- inlineListToMarkdown opts ils - return $ if stPlain st - then contents - else tagWithAttrs "span" attrs <> contents <> text "</span>" + return $ if isEnabled Ext_raw_html opts + then tagWithAttrs "span" attrs <> contents <> text "</span>" + else contents inlineToMarkdown opts (Emph lst) = do + plain <- gets stPlain contents <- inlineListToMarkdown opts lst - return $ "*" <> contents <> "*" + return $ if plain + then "_" <> contents <> "_" + else "*" <> contents <> "*" inlineToMarkdown opts (Strong lst) = do - contents <- inlineListToMarkdown opts lst - return $ "**" <> contents <> "**" + plain <- gets stPlain + if plain + then inlineListToMarkdown opts $ capitalize lst + else do + contents <- inlineListToMarkdown opts lst + return $ "**" <> contents <> "**" inlineToMarkdown opts (Strikeout lst) = do contents <- inlineListToMarkdown opts lst return $ if isEnabled Ext_strikeout opts then "~~" <> contents <> "~~" else "<s>" <> contents <> "</s>" inlineToMarkdown opts (Superscript lst) = do - let lst' = walk escapeSpaces lst - contents <- inlineListToMarkdown opts lst' + contents <- inlineListToMarkdown opts $ walk escapeSpaces lst return $ if isEnabled Ext_superscript opts then "^" <> contents <> "^" else "<sup>" <> contents <> "</sup>" inlineToMarkdown opts (Subscript lst) = do - let lst' = walk escapeSpaces lst - contents <- inlineListToMarkdown opts lst' + contents <- inlineListToMarkdown opts $ walk escapeSpaces lst return $ if isEnabled Ext_subscript opts then "~" <> contents <> "~" else "<sub>" <> contents <> "</sub>" -inlineToMarkdown opts (SmallCaps lst) = inlineListToMarkdown opts lst +inlineToMarkdown opts (SmallCaps lst) = do + plain <- gets stPlain + if plain + then inlineListToMarkdown opts $ capitalize lst + else do + contents <- inlineListToMarkdown opts lst + return $ tagWithAttrs "span" + ("",[],[("style","font-variant:small-caps;")]) + <> contents <> text "</span>" inlineToMarkdown opts (Quoted SingleQuote lst) = do contents <- inlineListToMarkdown opts lst return $ "‘" <> contents <> "’" inlineToMarkdown opts (Quoted DoubleQuote lst) = do contents <- inlineListToMarkdown opts lst return $ "“" <> contents <> "”" -inlineToMarkdown opts (Code attr str) = +inlineToMarkdown opts (Code attr str) = do let tickGroups = filter (\s -> '`' `elem` s) $ group str - longest = if null tickGroups + let longest = if null tickGroups then 0 else maximum $ map length tickGroups - marker = replicate (longest + 1) '`' - spacer = if (longest == 0) then "" else " " - attrs = if isEnabled Ext_inline_code_attributes opts && attr /= nullAttr + let marker = replicate (longest + 1) '`' + let spacer = if (longest == 0) then "" else " " + let attrs = if isEnabled Ext_inline_code_attributes opts && attr /= nullAttr then attrsToMarkdown attr else empty - in return $ text (marker ++ spacer ++ str ++ spacer ++ marker) <> attrs + plain <- gets stPlain + if plain + then return $ text str + else return $ text (marker ++ spacer ++ str ++ spacer ++ marker) <> attrs inlineToMarkdown _ (Str str) = do st <- get if stPlain st @@ -725,7 +754,11 @@ inlineToMarkdown opts (Math InlineMath str) return $ "\\(" <> text str <> "\\)" | isEnabled Ext_tex_math_double_backslash opts = return $ "\\\\(" <> text str <> "\\\\)" - | otherwise = inlineListToMarkdown opts $ readTeXMath' InlineMath str + | otherwise = do + plain <- gets stPlain + inlineListToMarkdown opts $ + (if plain then makeMathPlainer else id) $ + texMathToInlines InlineMath str inlineToMarkdown opts (Math DisplayMath str) | isEnabled Ext_tex_math_dollars opts = return $ "$$" <> text str <> "$$" @@ -734,16 +767,23 @@ inlineToMarkdown opts (Math DisplayMath str) | isEnabled Ext_tex_math_double_backslash opts = return $ "\\\\[" <> text str <> "\\\\]" | otherwise = (\x -> cr <> x <> cr) `fmap` - inlineListToMarkdown opts (readTeXMath' DisplayMath str) -inlineToMarkdown opts (RawInline f str) - | f == "html" || f == "markdown" || - (isEnabled Ext_raw_tex opts && (f == "latex" || f == "tex")) = - return $ text str -inlineToMarkdown _ (RawInline _ _) = return empty -inlineToMarkdown opts (LineBreak) - | isEnabled Ext_hard_line_breaks opts = return cr - | isEnabled Ext_escaped_line_breaks opts = return $ "\\" <> cr - | otherwise = return $ " " <> cr + inlineListToMarkdown opts (texMathToInlines DisplayMath str) +inlineToMarkdown opts (RawInline f str) = do + plain <- gets stPlain + if not plain && + ( f == "markdown" || + (isEnabled Ext_raw_tex opts && (f == "latex" || f == "tex")) || + (isEnabled Ext_raw_html opts && f == "html") ) + then return $ text str + else return empty +inlineToMarkdown opts (LineBreak) = do + plain <- gets stPlain + if plain || isEnabled Ext_hard_line_breaks opts + then return cr + else return $ + if isEnabled Ext_escaped_line_breaks opts + then "\\" <> cr + else " " <> cr inlineToMarkdown _ Space = return space inlineToMarkdown opts (Cite [] lst) = inlineListToMarkdown opts lst inlineToMarkdown opts (Cite (c:cs) lst) @@ -776,11 +816,12 @@ inlineToMarkdown opts (Cite (c:cs) lst) modekey SuppressAuthor = "-" modekey _ = "" inlineToMarkdown opts (Link txt (src, tit)) = do + plain <- gets stPlain linktext <- inlineListToMarkdown opts txt let linktitle = if null tit then empty else text $ " \"" ++ tit ++ "\"" - let srcSuffix = if isPrefixOf "mailto:" src then drop 7 src else src + let srcSuffix = fromMaybe src (stripPrefix "mailto:" src) let useAuto = isURI src && case txt of [Str s] | escapeURI s == srcSuffix -> True @@ -789,22 +830,29 @@ inlineToMarkdown opts (Link txt (src, tit)) = do ref <- if useRefLinks then getReference txt (src, tit) else return [] reftext <- inlineListToMarkdown opts ref return $ if useAuto - then "<" <> text srcSuffix <> ">" + then if plain + then text srcSuffix + else "<" <> text srcSuffix <> ">" else if useRefLinks then let first = "[" <> linktext <> "]" second = if txt == ref then "[]" else "[" <> reftext <> "]" in first <> second - else "[" <> linktext <> "](" <> - text src <> linktitle <> ")" + else if plain + then linktext + else "[" <> linktext <> "](" <> + text src <> linktitle <> ")" inlineToMarkdown opts (Image alternate (source, tit)) = do + plain <- gets stPlain let txt = if null alternate || alternate == [Str source] -- to prevent autolinks then [Str ""] else alternate linkPart <- inlineToMarkdown opts (Link txt (source, tit)) - return $ "!" <> linkPart + return $ if plain + then "[" <> linkPart <> "]" + else "!" <> linkPart inlineToMarkdown opts (Note contents) = do modify (\st -> st{ stNotes = contents : stNotes st }) st <- get @@ -812,3 +860,9 @@ inlineToMarkdown opts (Note contents) = do if isEnabled Ext_footnotes opts then return $ "[^" <> ref <> "]" else return $ "[" <> ref <> "]" + +makeMathPlainer :: [Inline] -> [Inline] +makeMathPlainer = walk go + where + go (Emph xs) = Span nullAttr xs + go x = x diff --git a/src/Text/Pandoc/Writers/MediaWiki.hs b/src/Text/Pandoc/Writers/MediaWiki.hs index 3b987ba2b..3f392a5d0 100644 --- a/src/Text/Pandoc/Writers/MediaWiki.hs +++ b/src/Text/Pandoc/Writers/MediaWiki.hs @@ -37,92 +37,99 @@ import Text.Pandoc.Writers.Shared import Text.Pandoc.Pretty (render) import Text.Pandoc.Templates (renderTemplate') import Text.Pandoc.XML ( escapeStringForXML ) -import Data.List ( intersect, intercalate, intersperse ) +import Data.List ( intersect, intercalate ) import Network.URI ( isURI ) +import Control.Monad.Reader import Control.Monad.State data WriterState = WriterState { stNotes :: Bool -- True if there are notes - , stListLevel :: [Char] -- String at beginning of list items, e.g. "**" - , stUseTags :: Bool -- True if we should use HTML tags because we're in a complex list } +data WriterReader = WriterReader { + options :: WriterOptions -- Writer options + , listLevel :: String -- String at beginning of list items, e.g. "**" + , useTags :: Bool -- True if we should use HTML tags because we're in a complex list + } + +type MediaWikiWriter = ReaderT WriterReader (State WriterState) + -- | Convert Pandoc to MediaWiki. writeMediaWiki :: WriterOptions -> Pandoc -> String writeMediaWiki opts document = - evalState (pandocToMediaWiki opts document) - WriterState { stNotes = False, stListLevel = [], stUseTags = False } + let initialState = WriterState { stNotes = False } + env = WriterReader { options = opts, listLevel = [], useTags = False } + in evalState (runReaderT (pandocToMediaWiki document) env) initialState -- | Return MediaWiki representation of document. -pandocToMediaWiki :: WriterOptions -> Pandoc -> State WriterState String -pandocToMediaWiki opts (Pandoc meta blocks) = do +pandocToMediaWiki :: Pandoc -> MediaWikiWriter String +pandocToMediaWiki (Pandoc meta blocks) = do + opts <- asks options metadata <- metaToJSON opts - (fmap trimr . blockListToMediaWiki opts) - (inlineListToMediaWiki opts) + (fmap trimr . blockListToMediaWiki) + inlineListToMediaWiki meta - body <- blockListToMediaWiki opts blocks - notesExist <- get >>= return . stNotes + body <- blockListToMediaWiki blocks + notesExist <- gets stNotes let notes = if notesExist then "\n<references />" else "" let main = body ++ notes let context = defField "body" main - $ defField "toc" (writerTableOfContents opts) - $ metadata - if writerStandalone opts - then return $ renderTemplate' (writerTemplate opts) context - else return main + $ defField "toc" (writerTableOfContents opts) metadata + return $ if writerStandalone opts + then renderTemplate' (writerTemplate opts) context + else main -- | Escape special characters for MediaWiki. escapeString :: String -> String escapeString = escapeStringForXML -- | Convert Pandoc block element to MediaWiki. -blockToMediaWiki :: WriterOptions -- ^ Options - -> Block -- ^ Block element - -> State WriterState String +blockToMediaWiki :: Block -- ^ Block element + -> MediaWikiWriter String -blockToMediaWiki _ Null = return "" +blockToMediaWiki Null = return "" -blockToMediaWiki opts (Div attrs bs) = do - contents <- blockListToMediaWiki opts bs +blockToMediaWiki (Div attrs bs) = do + contents <- blockListToMediaWiki bs return $ render Nothing (tagWithAttrs "div" attrs) ++ "\n\n" ++ contents ++ "\n\n" ++ "</div>" -blockToMediaWiki opts (Plain inlines) = - inlineListToMediaWiki opts inlines +blockToMediaWiki (Plain inlines) = + inlineListToMediaWiki inlines -- title beginning with fig: indicates that the image is a figure -blockToMediaWiki opts (Para [Image txt (src,'f':'i':'g':':':tit)]) = do +blockToMediaWiki (Para [Image txt (src,'f':'i':'g':':':tit)]) = do capt <- if null txt then return "" - else ("|caption " ++) `fmap` inlineListToMediaWiki opts txt + else ("|caption " ++) `fmap` inlineListToMediaWiki txt let opt = if null txt then "" else "|alt=" ++ if null tit then capt else tit ++ capt return $ "[[Image:" ++ src ++ "|frame|none" ++ opt ++ "]]\n" -blockToMediaWiki opts (Para inlines) = do - useTags <- get >>= return . stUseTags - listLevel <- get >>= return . stListLevel - contents <- inlineListToMediaWiki opts inlines - return $ if useTags +blockToMediaWiki (Para inlines) = do + tags <- asks useTags + lev <- asks listLevel + contents <- inlineListToMediaWiki inlines + return $ if tags then "<p>" ++ contents ++ "</p>" - else contents ++ if null listLevel then "\n" else "" + else contents ++ if null lev then "\n" else "" -blockToMediaWiki _ (RawBlock f str) +blockToMediaWiki (RawBlock f str) | f == Format "mediawiki" = return str | f == Format "html" = return str | otherwise = return "" -blockToMediaWiki _ HorizontalRule = return "\n-----\n" +blockToMediaWiki HorizontalRule = return "\n-----\n" -blockToMediaWiki opts (Header level _ inlines) = do - contents <- inlineListToMediaWiki opts inlines +blockToMediaWiki (Header level _ inlines) = do + contents <- inlineListToMediaWiki inlines let eqs = replicate level '=' return $ eqs ++ " " ++ contents ++ " " ++ eqs ++ "\n" -blockToMediaWiki _ (CodeBlock (_,classes,_) str) = do +blockToMediaWiki (CodeBlock (_,classes,_) str) = do let at = classes `intersect` ["actionscript", "ada", "apache", "applescript", "asm", "asp", "autoit", "bash", "blitzbasic", "bnf", "c", "c_mac", "caddcl", "cadlisp", "cfdg", "cfm", "cpp", "cpp-qt", "csharp", "css", "d", "delphi", "diff", "div", "dos", "eiffel", "fortran", @@ -132,75 +139,64 @@ blockToMediaWiki _ (CodeBlock (_,classes,_) str) = do "python", "qbasic", "rails", "reg", "robots", "ruby", "sas", "scheme", "sdlbasic", "smalltalk", "smarty", "sql", "tcl", "", "thinbasic", "tsql", "vb", "vbnet", "vhdl", "visualfoxpro", "winbatch", "xml", "xpp", "z80"] - let (beg, end) = if null at - then ("<pre" ++ if null classes then ">" else " class=\"" ++ unwords classes ++ "\">", "</pre>") - else ("<source lang=\"" ++ head at ++ "\">", "</source>") - return $ beg ++ escapeString str ++ end - -blockToMediaWiki opts (BlockQuote blocks) = do - contents <- blockListToMediaWiki opts blocks + return $ + if null at + then "<pre" ++ (if null classes + then ">" + else " class=\"" ++ unwords classes ++ "\">") ++ + escapeString str ++ "</pre>" + else "<source lang=\"" ++ head at ++ "\">" ++ str ++ "</source>" + -- note: no escape! + +blockToMediaWiki (BlockQuote blocks) = do + contents <- blockListToMediaWiki blocks return $ "<blockquote>" ++ contents ++ "</blockquote>" -blockToMediaWiki opts (Table capt aligns widths headers rows') = do +blockToMediaWiki (Table capt aligns widths headers rows') = do caption <- if null capt then return "" else do - c <- inlineListToMediaWiki opts capt + c <- inlineListToMediaWiki capt return $ "|+ " ++ trimr c ++ "\n" let headless = all null headers let allrows = if headless then rows' else headers:rows' - tableBody <- (concat . intersperse "|-\n") `fmap` - mapM (tableRowToMediaWiki opts headless aligns widths) + tableBody <- intercalate "|-\n" `fmap` + mapM (tableRowToMediaWiki headless aligns widths) (zip [1..] allrows) return $ "{|\n" ++ caption ++ tableBody ++ "|}\n" -blockToMediaWiki opts x@(BulletList items) = do - oldUseTags <- get >>= return . stUseTags - listLevel <- get >>= return . stListLevel - let useTags = oldUseTags || not (isSimpleList x) - if useTags +blockToMediaWiki x@(BulletList items) = do + tags <- fmap (|| not (isSimpleList x)) $ asks useTags + if tags then do - modify $ \s -> s { stUseTags = True } - contents <- mapM (listItemToMediaWiki opts) items - modify $ \s -> s { stUseTags = oldUseTags } + contents <- local (\ s -> s { useTags = True }) $ mapM listItemToMediaWiki items return $ "<ul>\n" ++ vcat contents ++ "</ul>\n" else do - modify $ \s -> s { stListLevel = stListLevel s ++ "*" } - contents <- mapM (listItemToMediaWiki opts) items - modify $ \s -> s { stListLevel = init (stListLevel s) } - return $ vcat contents ++ if null listLevel then "\n" else "" - -blockToMediaWiki opts x@(OrderedList attribs items) = do - oldUseTags <- get >>= return . stUseTags - listLevel <- get >>= return . stListLevel - let useTags = oldUseTags || not (isSimpleList x) - if useTags + lev <- asks listLevel + contents <- local (\s -> s { listLevel = listLevel s ++ "*" }) $ mapM listItemToMediaWiki items + return $ vcat contents ++ if null lev then "\n" else "" + +blockToMediaWiki x@(OrderedList attribs items) = do + tags <- fmap (|| not (isSimpleList x)) $ asks useTags + if tags then do - modify $ \s -> s { stUseTags = True } - contents <- mapM (listItemToMediaWiki opts) items - modify $ \s -> s { stUseTags = oldUseTags } + contents <- local (\s -> s { useTags = True }) $ mapM listItemToMediaWiki items return $ "<ol" ++ listAttribsToString attribs ++ ">\n" ++ vcat contents ++ "</ol>\n" else do - modify $ \s -> s { stListLevel = stListLevel s ++ "#" } - contents <- mapM (listItemToMediaWiki opts) items - modify $ \s -> s { stListLevel = init (stListLevel s) } - return $ vcat contents ++ if null listLevel then "\n" else "" - -blockToMediaWiki opts x@(DefinitionList items) = do - oldUseTags <- get >>= return . stUseTags - listLevel <- get >>= return . stListLevel - let useTags = oldUseTags || not (isSimpleList x) - if useTags + lev <- asks listLevel + contents <- local (\s -> s { listLevel = listLevel s ++ "#" }) $ mapM listItemToMediaWiki items + return $ vcat contents ++ if null lev then "\n" else "" + +blockToMediaWiki x@(DefinitionList items) = do + tags <- fmap (|| not (isSimpleList x)) $ asks useTags + if tags then do - modify $ \s -> s { stUseTags = True } - contents <- mapM (definitionListItemToMediaWiki opts) items - modify $ \s -> s { stUseTags = oldUseTags } + contents <- local (\s -> s { useTags = True }) $ mapM definitionListItemToMediaWiki items return $ "<dl>\n" ++ vcat contents ++ "</dl>\n" else do - modify $ \s -> s { stListLevel = stListLevel s ++ ";" } - contents <- mapM (definitionListItemToMediaWiki opts) items - modify $ \s -> s { stListLevel = init (stListLevel s) } - return $ vcat contents ++ if null listLevel then "\n" else "" + lev <- asks listLevel + contents <- local (\s -> s { listLevel = listLevel s ++ ";" }) $ mapM definitionListItemToMediaWiki items + return $ vcat contents ++ if null lev then "\n" else "" -- Auxiliary functions for lists: @@ -216,31 +212,30 @@ listAttribsToString (startnum, numstyle, _) = else "") -- | Convert bullet or ordered list item (list of blocks) to MediaWiki. -listItemToMediaWiki :: WriterOptions -> [Block] -> State WriterState String -listItemToMediaWiki opts items = do - contents <- blockListToMediaWiki opts items - useTags <- get >>= return . stUseTags - if useTags +listItemToMediaWiki :: [Block] -> MediaWikiWriter String +listItemToMediaWiki items = do + contents <- blockListToMediaWiki items + tags <- asks useTags + if tags then return $ "<li>" ++ contents ++ "</li>" else do - marker <- get >>= return . stListLevel + marker <- asks listLevel return $ marker ++ " " ++ contents -- | Convert definition list item (label, list of blocks) to MediaWiki. -definitionListItemToMediaWiki :: WriterOptions - -> ([Inline],[[Block]]) - -> State WriterState String -definitionListItemToMediaWiki opts (label, items) = do - labelText <- inlineListToMediaWiki opts label - contents <- mapM (blockListToMediaWiki opts) items - useTags <- get >>= return . stUseTags - if useTags +definitionListItemToMediaWiki :: ([Inline],[[Block]]) + -> MediaWikiWriter String +definitionListItemToMediaWiki (label, items) = do + labelText <- inlineListToMediaWiki label + contents <- mapM blockListToMediaWiki items + tags <- asks useTags + if tags then return $ "<dt>" ++ labelText ++ "</dt>\n" ++ - (intercalate "\n" $ map (\d -> "<dd>" ++ d ++ "</dd>") contents) + intercalate "\n" (map (\d -> "<dd>" ++ d ++ "</dd>") contents) else do - marker <- get >>= return . stListLevel + marker <- asks listLevel return $ marker ++ " " ++ labelText ++ "\n" ++ - (intercalate "\n" $ map (\d -> init marker ++ ": " ++ d) contents) + intercalate "\n" (map (\d -> init marker ++ ": " ++ d) contents) -- | True if the list can be handled by simple wiki markup, False if HTML tags will be needed. isSimpleList :: Block -> Bool @@ -283,25 +278,22 @@ vcat = intercalate "\n" -- Auxiliary functions for tables: -tableRowToMediaWiki :: WriterOptions - -> Bool +tableRowToMediaWiki :: Bool -> [Alignment] -> [Double] -> (Int, [[Block]]) - -> State WriterState String -tableRowToMediaWiki opts headless alignments widths (rownum, cells) = do - cells' <- mapM (\cellData -> - tableCellToMediaWiki opts headless rownum cellData) + -> MediaWikiWriter String +tableRowToMediaWiki headless alignments widths (rownum, cells) = do + cells' <- mapM (tableCellToMediaWiki headless rownum) $ zip3 alignments widths cells return $ unlines cells' -tableCellToMediaWiki :: WriterOptions - -> Bool +tableCellToMediaWiki :: Bool -> Int -> (Alignment, Double, [Block]) - -> State WriterState String -tableCellToMediaWiki opts headless rownum (alignment, width, bs) = do - contents <- blockListToMediaWiki opts bs + -> MediaWikiWriter String +tableCellToMediaWiki headless rownum (alignment, width, bs) = do + contents <- blockListToMediaWiki bs let marker = if rownum == 1 && not headless then "!" else "|" let percent w = show (truncate (100*w) :: Integer) ++ "%" let attrs = ["align=" ++ show (alignmentToString alignment) | @@ -313,7 +305,7 @@ tableCellToMediaWiki opts headless rownum (alignment, width, bs) = do else unwords attrs ++ "|" return $ marker ++ attr ++ trimr contents -alignmentToString :: Alignment -> [Char] +alignmentToString :: Alignment -> String alignmentToString alignment = case alignment of AlignLeft -> "left" AlignRight -> "right" @@ -321,94 +313,94 @@ alignmentToString alignment = case alignment of AlignDefault -> "left" -- | Convert list of Pandoc block elements to MediaWiki. -blockListToMediaWiki :: WriterOptions -- ^ Options - -> [Block] -- ^ List of block elements - -> State WriterState String -blockListToMediaWiki opts blocks = - mapM (blockToMediaWiki opts) blocks >>= return . vcat +blockListToMediaWiki :: [Block] -- ^ List of block elements + -> MediaWikiWriter String +blockListToMediaWiki blocks = + fmap vcat $ mapM blockToMediaWiki blocks -- | Convert list of Pandoc inline elements to MediaWiki. -inlineListToMediaWiki :: WriterOptions -> [Inline] -> State WriterState String -inlineListToMediaWiki opts lst = - mapM (inlineToMediaWiki opts) lst >>= return . concat +inlineListToMediaWiki :: [Inline] -> MediaWikiWriter String +inlineListToMediaWiki lst = + fmap concat $ mapM inlineToMediaWiki lst -- | Convert Pandoc inline element to MediaWiki. -inlineToMediaWiki :: WriterOptions -> Inline -> State WriterState String +inlineToMediaWiki :: Inline -> MediaWikiWriter String -inlineToMediaWiki opts (Span attrs ils) = do - contents <- inlineListToMediaWiki opts ils +inlineToMediaWiki (Span attrs ils) = do + contents <- inlineListToMediaWiki ils return $ render Nothing (tagWithAttrs "span" attrs) ++ contents ++ "</span>" -inlineToMediaWiki opts (Emph lst) = do - contents <- inlineListToMediaWiki opts lst +inlineToMediaWiki (Emph lst) = do + contents <- inlineListToMediaWiki lst return $ "''" ++ contents ++ "''" -inlineToMediaWiki opts (Strong lst) = do - contents <- inlineListToMediaWiki opts lst +inlineToMediaWiki (Strong lst) = do + contents <- inlineListToMediaWiki lst return $ "'''" ++ contents ++ "'''" -inlineToMediaWiki opts (Strikeout lst) = do - contents <- inlineListToMediaWiki opts lst +inlineToMediaWiki (Strikeout lst) = do + contents <- inlineListToMediaWiki lst return $ "<s>" ++ contents ++ "</s>" -inlineToMediaWiki opts (Superscript lst) = do - contents <- inlineListToMediaWiki opts lst +inlineToMediaWiki (Superscript lst) = do + contents <- inlineListToMediaWiki lst return $ "<sup>" ++ contents ++ "</sup>" -inlineToMediaWiki opts (Subscript lst) = do - contents <- inlineListToMediaWiki opts lst +inlineToMediaWiki (Subscript lst) = do + contents <- inlineListToMediaWiki lst return $ "<sub>" ++ contents ++ "</sub>" -inlineToMediaWiki opts (SmallCaps lst) = inlineListToMediaWiki opts lst +inlineToMediaWiki (SmallCaps lst) = inlineListToMediaWiki lst -inlineToMediaWiki opts (Quoted SingleQuote lst) = do - contents <- inlineListToMediaWiki opts lst +inlineToMediaWiki (Quoted SingleQuote lst) = do + contents <- inlineListToMediaWiki lst return $ "\8216" ++ contents ++ "\8217" -inlineToMediaWiki opts (Quoted DoubleQuote lst) = do - contents <- inlineListToMediaWiki opts lst +inlineToMediaWiki (Quoted DoubleQuote lst) = do + contents <- inlineListToMediaWiki lst return $ "\8220" ++ contents ++ "\8221" -inlineToMediaWiki opts (Cite _ lst) = inlineListToMediaWiki opts lst +inlineToMediaWiki (Cite _ lst) = inlineListToMediaWiki lst -inlineToMediaWiki _ (Code _ str) = - return $ "<code>" ++ (escapeString str) ++ "</code>" +inlineToMediaWiki (Code _ str) = + return $ "<code>" ++ escapeString str ++ "</code>" -inlineToMediaWiki _ (Str str) = return $ escapeString str +inlineToMediaWiki (Str str) = return $ escapeString str -inlineToMediaWiki _ (Math _ str) = return $ "<math>" ++ str ++ "</math>" - -- note: str should NOT be escaped +inlineToMediaWiki (Math _ str) = return $ "<math>" ++ str ++ "</math>" + -- note: str should NOT be escaped -inlineToMediaWiki _ (RawInline f str) +inlineToMediaWiki (RawInline f str) | f == Format "mediawiki" = return str | f == Format "html" = return str | otherwise = return "" -inlineToMediaWiki _ (LineBreak) = return "<br />" +inlineToMediaWiki (LineBreak) = return "<br />" -inlineToMediaWiki _ Space = return " " +inlineToMediaWiki Space = return " " -inlineToMediaWiki opts (Link txt (src, _)) = do - label <- inlineListToMediaWiki opts txt +inlineToMediaWiki (Link txt (src, _)) = do + label <- inlineListToMediaWiki txt case txt of [Str s] | escapeURI s == src -> return src - _ -> if isURI src - then return $ "[" ++ src ++ " " ++ label ++ "]" - else return $ "[[" ++ src' ++ "|" ++ label ++ "]]" + _ -> return $ if isURI src + then "[" ++ src ++ " " ++ label ++ "]" + else "[[" ++ src' ++ "|" ++ label ++ "]]" where src' = case src of '/':xs -> xs -- with leading / it's a _ -> src -- link to a help page -inlineToMediaWiki opts (Image alt (source, tit)) = do - alt' <- inlineListToMediaWiki opts alt - let txt = if (null tit) + +inlineToMediaWiki (Image alt (source, tit)) = do + alt' <- inlineListToMediaWiki alt + let txt = if null tit then if null alt then "" - else "|" ++ alt' - else "|" ++ tit + else '|' : alt' + else '|' : tit return $ "[[Image:" ++ source ++ txt ++ "]]" -inlineToMediaWiki opts (Note contents) = do - contents' <- blockListToMediaWiki opts contents +inlineToMediaWiki (Note contents) = do + contents' <- blockListToMediaWiki contents modify (\s -> s { stNotes = True }) return $ "<ref>" ++ contents' ++ "</ref>" -- note - may not work for notes with multiple blocks diff --git a/src/Text/Pandoc/Writers/ODT.hs b/src/Text/Pandoc/Writers/ODT.hs index 15f7c8be8..feaa0167c 100644 --- a/src/Text/Pandoc/Writers/ODT.hs +++ b/src/Text/Pandoc/Writers/ODT.hs @@ -37,8 +37,9 @@ import Text.TeXMath import qualified Data.ByteString.Lazy as B import Text.Pandoc.UTF8 ( fromStringLazy ) import Codec.Archive.Zip +import Control.Applicative ((<$>)) import Text.Pandoc.Options ( WriterOptions(..) ) -import Text.Pandoc.Shared ( stringify, readDataFile, fetchItem, warn ) +import Text.Pandoc.Shared ( stringify, readDataFile, fetchItem', warn ) import Text.Pandoc.ImageSize ( imageSize, sizeInPoints ) import Text.Pandoc.MIME ( getMimeType ) import Text.Pandoc.Definition @@ -131,7 +132,7 @@ writeODT opts doc@(Pandoc meta _) = do transformPicMath :: WriterOptions -> IORef [Entry] -> Inline -> IO Inline transformPicMath opts entriesRef (Image lab (src,_)) = do - res <- fetchItem (writerSourceURL opts) src + res <- fetchItem' (writerMediaBag opts) (writerSourceURL opts) src case res of Left (_ :: E.SomeException) -> do warn $ "Could not find image `" ++ src ++ "', skipping..." @@ -150,7 +151,7 @@ transformPicMath opts entriesRef (Image lab (src,_)) = do transformPicMath _ entriesRef (Math t math) = do entries <- readIORef entriesRef let dt = if t == InlineMath then DisplayInline else DisplayBlock - case texMathToMathML dt math of + case writeMathML dt <$> readTeX math of Left _ -> return $ Math t math Right r -> do let conf = useShortEmptyTags (const False) defaultConfigPP diff --git a/src/Text/Pandoc/Writers/OpenDocument.hs b/src/Text/Pandoc/Writers/OpenDocument.hs index b6da2694c..773d142f4 100644 --- a/src/Text/Pandoc/Writers/OpenDocument.hs +++ b/src/Text/Pandoc/Writers/OpenDocument.hs @@ -380,7 +380,7 @@ inlineToOpenDocument o ils | SmallCaps l <- ils = withTextStyle SmallC $ inlinesToOpenDocument o l | Quoted t l <- ils = inQuotes t <$> inlinesToOpenDocument o l | Code _ s <- ils = withTextStyle Pre $ inTextStyle $ preformatted s - | Math t s <- ils = inlinesToOpenDocument o (readTeXMath' t s) + | Math t s <- ils = inlinesToOpenDocument o (texMathToInlines t s) | Cite _ l <- ils = inlinesToOpenDocument o l | RawInline f s <- ils = if f == Format "opendocument" then return $ text s @@ -504,7 +504,7 @@ paraStyle parent attrs = do tight = if t then [ ("fo:margin-top" , "0in" ) , ("fo:margin-bottom" , "0in" )] else [] - indent = if (i /= 0 || b) + indent = if (i /= 0 || b) then [ ("fo:margin-left" , indentVal) , ("fo:margin-right" , "0in" ) , ("fo:text-indent" , "0in" ) @@ -534,7 +534,7 @@ paraTableStyles t s (a:xs) [ ("fo:text-align", x) , ("style:justify-single-word", "false")] -data TextStyle = Italic | Bold | Strike | Sub | Sup | SmallC | Pre +data TextStyle = Italic | Bold | Strike | Sub | Sup | SmallC | Pre deriving ( Eq,Ord ) textStyleAttr :: TextStyle -> [(String,String)] diff --git a/src/Text/Pandoc/Writers/Org.hs b/src/Text/Pandoc/Writers/Org.hs index 87046537c..414883b29 100644 --- a/src/Text/Pandoc/Writers/Org.hs +++ b/src/Text/Pandoc/Writers/Org.hs @@ -238,6 +238,8 @@ inlineListToOrg lst = mapM inlineToOrg lst >>= return . hcat -- | Convert Pandoc inline element to Org. inlineToOrg :: Inline -> State WriterState Doc +inlineToOrg (Span (uid, [], []) []) = + return $ "<<" <> text uid <> ">>" inlineToOrg (Span _ lst) = inlineListToOrg lst inlineToOrg (Emph lst) = do diff --git a/src/Text/Pandoc/Writers/RST.hs b/src/Text/Pandoc/Writers/RST.hs index 31c97349b..57ebfc360 100644 --- a/src/Text/Pandoc/Writers/RST.hs +++ b/src/Text/Pandoc/Writers/RST.hs @@ -37,7 +37,8 @@ import Text.Pandoc.Shared import Text.Pandoc.Writers.Shared import Text.Pandoc.Templates (renderTemplate') import Text.Pandoc.Builder (deleteMeta) -import Data.List ( isPrefixOf, intersperse, transpose ) +import Data.Maybe (fromMaybe) +import Data.List ( isPrefixOf, stripPrefix, intersperse, transpose ) import Network.URI (isURI) import Text.Pandoc.Pretty import Control.Monad.State @@ -174,7 +175,7 @@ blockToRST (Para [Image txt (src,'f':'i':'g':':':tit)]) = do let alt = ":alt: " <> if null tit then capt else text tit return $ hang 3 ".. " $ fig $$ alt $+$ capt $$ blankline blockToRST (Para inlines) - | LineBreak `elem` inlines = do -- use line block if LineBreaks + | LineBreak `elem` inlines = do -- use line block if LineBreaks lns <- mapM inlineListToRST $ splitBy (==LineBreak) inlines return $ (vcat $ map (text "| " <>) lns) <> blankline | otherwise = do @@ -401,7 +402,7 @@ inlineToRST (Link [Str str] (src, _)) if "mailto:" `isPrefixOf` src then src == escapeURI ("mailto:" ++ str) else src == escapeURI str = do - let srcSuffix = if isPrefixOf "mailto:" src then drop 7 src else src + let srcSuffix = fromMaybe src (stripPrefix "mailto:" src) return $ text srcSuffix inlineToRST (Link [Image alt (imgsrc,imgtit)] (src, _tit)) = do label <- registerImage alt (imgsrc,imgtit) (Just src) diff --git a/src/Text/Pandoc/Writers/RTF.hs b/src/Text/Pandoc/Writers/RTF.hs index e0428aaa8..43405ce3c 100644 --- a/src/Text/Pandoc/Writers/RTF.hs +++ b/src/Text/Pandoc/Writers/RTF.hs @@ -36,50 +36,64 @@ import Text.Pandoc.Readers.TeXMath import Text.Pandoc.Templates (renderTemplate') import Text.Pandoc.Walk import Data.List ( isSuffixOf, intercalate ) -import Data.Char ( ord, chr, isDigit, toLower ) -import System.FilePath ( takeExtension ) +import Data.Char ( ord, chr, isDigit ) import qualified Data.ByteString as B +import qualified Data.Map as M import Text.Printf ( printf ) -import Network.URI ( isURI, unEscapeString ) -import qualified Control.Exception as E +import Text.Pandoc.ImageSize --- | Convert Image inlines into a raw RTF embedded image, read from a file. +-- | Convert Image inlines into a raw RTF embedded image, read from a file, +-- or a MediaBag, or the internet. -- If file not found or filetype not jpeg or png, leave the inline unchanged. -rtfEmbedImage :: Inline -> IO Inline -rtfEmbedImage x@(Image _ (src,_)) = do - let ext = map toLower (takeExtension src) - if ext `elem` [".jpg",".jpeg",".png"] && not (isURI src) - then do - let src' = unEscapeString src - imgdata <- E.catch (B.readFile src') - (\e -> let _ = (e :: E.SomeException) in return B.empty) - let bytes = map (printf "%02x") $ B.unpack imgdata - let filetype = case ext of - ".jpg" -> "\\jpegblip" - ".jpeg" -> "\\jpegblip" - ".png" -> "\\pngblip" - _ -> error "Unknown file type" - let raw = "{\\pict" ++ filetype ++ " " ++ concat bytes ++ "}" - return $ if B.null imgdata - then x - else RawInline (Format "rtf") raw - else return x -rtfEmbedImage x = return x +rtfEmbedImage :: WriterOptions -> Inline -> IO Inline +rtfEmbedImage opts x@(Image _ (src,_)) = do + result <- fetchItem' (writerMediaBag opts) (writerSourceURL opts) src + case result of + Right (imgdata, Just mime) + | mime == "image/jpeg" || mime == "image/png" -> do + let bytes = map (printf "%02x") $ B.unpack imgdata + let filetype = case mime of + "image/jpeg" -> "\\jpegblip" + "image/png" -> "\\pngblip" + _ -> error "Unknown file type" + let sizeSpec = case imageSize imgdata of + Nothing -> "" + Just sz -> "\\picw" ++ show xpx ++ + "\\pich" ++ show ypx ++ + "\\picwgoal" ++ show (xpt * 20) + ++ "\\pichgoal" ++ show (ypt * 20) + -- twip = 1/1440in = 1/20pt + where (xpx, ypx) = sizeInPixels sz + (xpt, ypt) = sizeInPoints sz + let raw = "{\\pict" ++ filetype ++ sizeSpec ++ " " ++ + concat bytes ++ "}" + return $ if B.null imgdata + then x + else RawInline (Format "rtf") raw + _ -> return x +rtfEmbedImage _ x = return x -- | Convert Pandoc to a string in rich text format, with -- images embedded as encoded binary data. writeRTFWithEmbeddedImages :: WriterOptions -> Pandoc -> IO String writeRTFWithEmbeddedImages options doc = - writeRTF options `fmap` walkM rtfEmbedImage doc + writeRTF options `fmap` walkM (rtfEmbedImage options) doc -- | Convert Pandoc to a string in rich text format. writeRTF :: WriterOptions -> Pandoc -> String -writeRTF options (Pandoc meta blocks) = +writeRTF options (Pandoc meta@(Meta metamap) blocks) = let spacer = not $ all null $ docTitle meta : docDate meta : docAuthors meta + toPlain (MetaBlocks [Para ils]) = MetaInlines ils + toPlain x = x + -- adjust title, author, date so we don't get para inside para + meta' = Meta $ M.adjust toPlain "title" + . M.adjust toPlain "author" + . M.adjust toPlain "date" + $ metamap Just metadata = metaToJSON options (Just . concatMap (blockToRTF 0 AlignDefault)) (Just . inlineListToRTF) - meta + meta' body = concatMap (blockToRTF 0 AlignDefault) blocks isTOCHeader (Header lev _ _) = lev <= writerTOCDepth options isTOCHeader _ = False @@ -324,7 +338,7 @@ inlineToRTF (Quoted DoubleQuote lst) = "\\u8220\"" ++ (inlineListToRTF lst) ++ "\\u8221\"" inlineToRTF (Code _ str) = "{\\f1 " ++ (codeStringToRTF str) ++ "}" inlineToRTF (Str str) = stringToRTF str -inlineToRTF (Math t str) = inlineListToRTF $ readTeXMath' t str +inlineToRTF (Math t str) = inlineListToRTF $ texMathToInlines t str inlineToRTF (Cite _ lst) = inlineListToRTF lst inlineToRTF (RawInline f str) | f == Format "rtf" = str |