diff options
author | Clare Macrae <github@cfmacrae.fastmail.co.uk> | 2014-06-29 19:22:31 +0100 |
---|---|---|
committer | Clare Macrae <github@cfmacrae.fastmail.co.uk> | 2014-06-29 19:22:31 +0100 |
commit | 717e16660d1ee83f690b35d0aa9b60c8ac9d6b61 (patch) | |
tree | aa850d4ee99fa0b14da9ba0396ba6aa67e2037e3 /src/Text/Pandoc | |
parent | fccfc8429cf4d002df37977f03508c9aae457416 (diff) | |
parent | ce69021e42d7bf50deccba2a52ed4717f6ddac10 (diff) | |
download | pandoc-717e16660d1ee83f690b35d0aa9b60c8ac9d6b61.tar.gz |
Merge remote-tracking branch 'jgm/master' into dokuwiki
Diffstat (limited to 'src/Text/Pandoc')
61 files changed, 8159 insertions, 2533 deletions
diff --git a/src/Text/Pandoc/Asciify.hs b/src/Text/Pandoc/Asciify.hs index 1c177da90..8a5ccec5c 100644 --- a/src/Text/Pandoc/Asciify.hs +++ b/src/Text/Pandoc/Asciify.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2013 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2013-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.SelfContained - Copyright : Copyright (C) 2013 John MacFarlane + Copyright : Copyright (C) 2013-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Biblio.hs b/src/Text/Pandoc/Biblio.hs deleted file mode 100644 index 4dd82dd08..000000000 --- a/src/Text/Pandoc/Biblio.hs +++ /dev/null @@ -1,187 +0,0 @@ -{-# LANGUAGE PatternGuards #-} -{- -Copyright (C) 2008 Andrea Rossato <andrea.rossato@ing.unitn.it> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Biblio - Copyright : Copyright (C) 2008-2010 Andrea Rossato - License : GNU GPL, version 2 or above - - Maintainer : Andrea Rossato <andrea.rossato@unitn.it> - Stability : alpha - Portability : portable --} - -module Text.Pandoc.Biblio ( processBiblio ) where - -import Data.List -import Data.Char ( isDigit, isPunctuation ) -import qualified Data.Map as M -import Text.CSL hiding ( Cite(..), Citation(..) ) -import qualified Text.CSL as CSL ( Cite(..) ) -import Text.Pandoc.Definition -import Text.Pandoc.Generic -import Text.Pandoc.Shared (stringify) -import Text.Parsec hiding (State) -import Control.Monad -import Control.Monad.State - --- | Process a 'Pandoc' document by adding citations formatted --- according to a CSL style, using 'citeproc' from citeproc-hs. -processBiblio :: Maybe Style -> [Reference] -> Pandoc -> Pandoc -processBiblio Nothing _ p = p -processBiblio _ [] p = p -processBiblio (Just style) r p = - let p' = evalState (bottomUpM setHash p) 1 - grps = queryWith getCitation p' - result = citeproc procOpts style r (setNearNote style $ - map (map toCslCite) grps) - cits_map = M.fromList $ zip grps (citations result) - biblioList = map (renderPandoc' style) (bibliography result) - Pandoc m b = bottomUp mvPunct . deNote . bottomUp (processCite style cits_map) $ p' - in Pandoc m $ b ++ biblioList - --- | Substitute 'Cite' elements with formatted citations. -processCite :: Style -> M.Map [Citation] [FormattedOutput] -> Inline -> Inline -processCite s cs (Cite t _) = - case M.lookup t cs of - Just (x:xs) - | isTextualCitation t && not (null xs) -> - let xs' = renderPandoc s xs - in if styleClass s == "note" - then Cite t (renderPandoc s [x] ++ [Note [Para xs']]) - else Cite t (renderPandoc s [x] ++ [Space | not (startWithPunct xs')] ++ xs') - | otherwise -> if styleClass s == "note" - then Cite t [Note [Para $ renderPandoc s (x:xs)]] - else Cite t (renderPandoc s (x:xs)) - _ -> Strong [Str "???"] -- TODO raise error instead? -processCite _ _ x = x - -isNote :: Inline -> Bool -isNote (Note _) = True -isNote (Cite _ [Note _]) = True -isNote _ = False - -mvPunct :: [Inline] -> [Inline] -mvPunct (Space : Space : xs) = Space : xs -mvPunct (Space : x : ys) | isNote x, startWithPunct ys = - Str (headInline ys) : x : tailFirstInlineStr ys -mvPunct (Space : x : ys) | isNote x = x : ys -mvPunct xs = xs - -sanitize :: [Inline] -> [Inline] -sanitize xs | endWithPunct xs = toCapital xs - | otherwise = toCapital (xs ++ [Str "."]) - -deNote :: Pandoc -> Pandoc -deNote = topDown go - where go (Note [Para xs]) = Note $ bottomUp go' [Para $ sanitize xs] - go (Note xs) = Note $ bottomUp go' xs - go x = x - go' (Note [Para xs]:ys) = - if startWithPunct ys && endWithPunct xs - then initInline xs ++ ys - else xs ++ ys - go' xs = xs - -isTextualCitation :: [Citation] -> Bool -isTextualCitation (c:_) = citationMode c == AuthorInText -isTextualCitation _ = False - --- | Retrieve all citations from a 'Pandoc' docuument. To be used with --- 'queryWith'. -getCitation :: Inline -> [[Citation]] -getCitation i | Cite t _ <- i = [t] - | otherwise = [] - -setHash :: Citation -> State Int Citation -setHash c = do - ident <- get - put $ ident + 1 - return c{ citationHash = ident } - -toCslCite :: Citation -> CSL.Cite -toCslCite c - = let (l, s) = locatorWords $ citationSuffix c - (la,lo) = parseLocator l - s' = case (l,s,citationMode c) of - -- treat a bare locator as if it begins with comma - -- so @item1 [blah] is like [@item1, blah] - ("",(x:_),AuthorInText) | not (isPunct x) - -> [Str ",",Space] ++ s - _ -> s - isPunct (Str (x:_)) = isPunctuation x - isPunct _ = False - citMode = case citationMode c of - AuthorInText -> (True, False) - SuppressAuthor -> (False,True ) - NormalCitation -> (False,False) - in emptyCite { CSL.citeId = citationId c - , CSL.citePrefix = PandocText $ citationPrefix c - , CSL.citeSuffix = PandocText s' - , CSL.citeLabel = la - , CSL.citeLocator = lo - , CSL.citeNoteNumber = show $ citationNoteNum c - , CSL.authorInText = fst citMode - , CSL.suppressAuthor = snd citMode - , CSL.citeHash = citationHash c - } - -locatorWords :: [Inline] -> (String, [Inline]) -locatorWords inp = - case parse pLocatorWords "suffix" $ breakup inp of - Right r -> r - Left _ -> ("",inp) - where breakup [] = [] - breakup (Str x : xs) = map Str (splitup x) ++ breakup xs - breakup (x : xs) = x : breakup xs - splitup = groupBy (\x y -> x /= '\160' && y /= '\160') - -pLocatorWords :: Parsec [Inline] st (String, [Inline]) -pLocatorWords = do - l <- pLocator - s <- getInput -- rest is suffix - if length l > 0 && last l == ',' - then return (init l, Str "," : s) - else return (l, s) - -pMatch :: (Inline -> Bool) -> Parsec [Inline] st Inline -pMatch condition = try $ do - t <- anyToken - guard $ condition t - return t - -pSpace :: Parsec [Inline] st Inline -pSpace = pMatch (\t -> t == Space || t == Str "\160") - -pLocator :: Parsec [Inline] st String -pLocator = try $ do - optional $ pMatch (== Str ",") - optional pSpace - f <- many1 (notFollowedBy pSpace >> anyToken) - gs <- many1 pWordWithDigits - return $ stringify f ++ (' ' : unwords gs) - -pWordWithDigits :: Parsec [Inline] st String -pWordWithDigits = try $ do - pSpace - r <- many1 (notFollowedBy pSpace >> anyToken) - let s = stringify r - guard $ any isDigit s - return s - diff --git a/src/Text/Pandoc/Compat/Monoid.hs b/src/Text/Pandoc/Compat/Monoid.hs new file mode 100644 index 000000000..cb7ea2527 --- /dev/null +++ b/src/Text/Pandoc/Compat/Monoid.hs @@ -0,0 +1,20 @@ +{-# LANGUAGE CPP #-} +module Text.Pandoc.Compat.Monoid ( Monoid(..) + , (<>) + ) where + +#if MIN_VERSION_base(4,5,0) +import Data.Monoid ((<>), Monoid(..)) +#else +import Data.Monoid (mappend, Monoid(..)) +#endif + +#if MIN_VERSION_base(4,5,0) +#else +infixr 6 <> + +-- | An infix synonym for 'mappend'. +(<>) :: Monoid m => m -> m -> m +(<>) = mappend +{-# INLINE (<>) #-} +#endif diff --git a/src/Text/Pandoc/Compat/TagSoupEntity.hs b/src/Text/Pandoc/Compat/TagSoupEntity.hs new file mode 100644 index 000000000..80985aef9 --- /dev/null +++ b/src/Text/Pandoc/Compat/TagSoupEntity.hs @@ -0,0 +1,15 @@ +{-# LANGUAGE CPP #-} +module Text.Pandoc.Compat.TagSoupEntity (lookupEntity + ) where + +import qualified Text.HTML.TagSoup.Entity as TE + +lookupEntity :: String -> Maybe Char +#if MIN_VERSION_tagsoup(0,13,0) +lookupEntity = str2chr . TE.lookupEntity + where str2chr :: Maybe String -> Maybe Char + str2chr (Just [c]) = Just c + str2chr _ = Nothing +#else +lookupEntity = TE.lookupEntity +#endif diff --git a/src/Text/Pandoc/Highlighting.hs b/src/Text/Pandoc/Highlighting.hs index 11d608db6..7f975d4c6 100644 --- a/src/Text/Pandoc/Highlighting.hs +++ b/src/Text/Pandoc/Highlighting.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2008 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2008-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Highlighting - Copyright : Copyright (C) 2008 John MacFarlane + Copyright : Copyright (C) 2008-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -74,7 +74,12 @@ highlight formatter (_, classes, keyvals) rawCode = ["number","numberLines", "number-lines"]) classes } lcclasses = map (map toLower) classes in case find (`elem` lcLanguages) lcclasses of - Nothing -> Nothing + Nothing + | numberLines fmtOpts -> Just + $ formatter fmtOpts{ codeClasses = [], + containerClasses = classes } + $ map (\ln -> [(NormalTok, ln)]) $ lines rawCode + | otherwise -> Nothing Just language -> Just $ formatter fmtOpts{ codeClasses = [language], containerClasses = classes } diff --git a/src/Text/Pandoc/ImageSize.hs b/src/Text/Pandoc/ImageSize.hs index 273a1a428..68b34dcf3 100644 --- a/src/Text/Pandoc/ImageSize.hs +++ b/src/Text/Pandoc/ImageSize.hs @@ -1,6 +1,6 @@ {-# LANGUAGE OverloadedStrings, ScopedTypeVariables #-} {- - Copyright (C) 2011 John MacFarlane <jgm@berkeley.edu> + Copyright (C) 2011-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ {- | Module : Text.Pandoc.ImageSize -Copyright : Copyright (C) 2011 John MacFarlane +Copyright : Copyright (C) 2011-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -32,13 +32,19 @@ module Text.Pandoc.ImageSize ( ImageType(..), imageType, imageSize, sizeInPixels, sizeInPoints ) where import Data.ByteString (ByteString, unpack) import qualified Data.ByteString.Char8 as B +import qualified Data.ByteString.Lazy as BL +import Control.Applicative import Control.Monad import Data.Bits +import Data.Binary +import Data.Binary.Get +import Text.Pandoc.Shared (safeRead) +import qualified Data.Map as M -- quick and dirty functions to get image sizes -- algorithms borrowed from wwwis.pl -data ImageType = Png | Gif | Jpeg | Pdf deriving Show +data ImageType = Png | Gif | Jpeg | Pdf | Eps deriving Show data ImageSize = ImageSize{ pxX :: Integer @@ -52,8 +58,12 @@ imageType :: ByteString -> Maybe ImageType imageType img = case B.take 4 img of "\x89\x50\x4e\x47" -> return Png "\x47\x49\x46\x38" -> return Gif - "\xff\xd8\xff\xe0" -> return Jpeg + "\xff\xd8\xff\xe0" -> return Jpeg -- JFIF + "\xff\xd8\xff\xe1" -> return Jpeg -- Exif "%PDF" -> return Pdf + "%!PS" + | (B.take 4 $ B.drop 1 $ B.dropWhile (/=' ') img) == "EPSF" + -> return Eps _ -> fail "Unknown image type" imageSize :: ByteString -> Maybe ImageSize @@ -63,14 +73,35 @@ imageSize img = do Png -> pngSize img Gif -> gifSize img Jpeg -> jpegSize img + Eps -> epsSize img Pdf -> Nothing -- TODO +defaultSize :: (Integer, Integer) +defaultSize = (72, 72) + sizeInPixels :: ImageSize -> (Integer, Integer) sizeInPixels s = (pxX s, pxY s) sizeInPoints :: ImageSize -> (Integer, Integer) sizeInPoints s = (pxX s * 72 `div` dpiX s, pxY s * 72 `div` dpiY s) +epsSize :: ByteString -> Maybe ImageSize +epsSize img = do + let ls = takeWhile ("%" `B.isPrefixOf`) $ B.lines img + let ls' = dropWhile (not . ("%%BoundingBox:" `B.isPrefixOf`)) ls + case ls' of + [] -> mzero + (x:_) -> case B.words x of + (_:_:_:ux:uy:[]) -> do + ux' <- safeRead $ B.unpack ux + uy' <- safeRead $ B.unpack uy + return ImageSize{ + pxX = ux' + , pxY = uy' + , dpiX = 72 + , dpiY = 72 } + _ -> mzero + pngSize :: ByteString -> Maybe ImageSize pngSize img = do let (h, rest) = B.splitAt 8 img @@ -117,8 +148,14 @@ gifSize img = do jpegSize :: ByteString -> Maybe ImageSize jpegSize img = do let (hdr, rest) = B.splitAt 4 img - guard $ hdr == "\xff\xd8\xff\xe0" guard $ B.length rest >= 14 + case hdr of + "\xff\xd8\xff\xe0" -> jfifSize rest + "\xff\xd8\xff\xe1" -> exifSize $ B.takeWhile (/= '\xff') rest + _ -> mzero + +jfifSize :: ByteString -> Maybe ImageSize +jfifSize rest = do let [dpiDensity,dpix1,dpix2,dpiy1,dpiy2] = map fromIntegral $ unpack $ B.take 5 $ B.drop 9 $ rest let factor = case dpiDensity of @@ -127,11 +164,11 @@ jpegSize img = do _ -> const 72 let dpix = factor (shift dpix1 8 + dpix2) let dpiy = factor (shift dpiy1 8 + dpiy2) - (w,h) <- findJpegSize rest + (w,h) <- findJfifSize rest return $ ImageSize { pxX = w, pxY = h, dpiX = dpix, dpiY = dpiy } -findJpegSize :: ByteString -> Maybe (Integer,Integer) -findJpegSize bs = do +findJfifSize :: ByteString -> Maybe (Integer,Integer) +findJfifSize bs = do let bs' = B.dropWhile (=='\xff') $ B.dropWhile (/='\xff') bs case B.uncons bs' of Just (c,bs'') | c >= '\xc0' && c <= '\xc3' -> do @@ -143,8 +180,220 @@ findJpegSize bs = do [c1,c2] -> do let len = shift c1 8 + c2 -- skip variables - findJpegSize $ B.drop len bs'' + findJfifSize $ B.drop len bs'' _ -> fail "JPEG parse error" Nothing -> fail "Did not find length record" +exifSize :: ByteString -> Maybe ImageSize +exifSize bs = runGet (Just <$> exifHeader bl) bl + where bl = BL.fromChunks [bs] +-- NOTE: It would be nicer to do +-- runGet ((Just <$> exifHeader) <|> return Nothing) +-- which would prevent pandoc from raising an error when an exif header can't +-- be parsed. But we only get an Alternative instance for Get in binary 0.6, +-- and binary 0.5 ships with ghc 7.6. + +exifHeader :: BL.ByteString -> Get ImageSize +exifHeader hdr = do + _app1DataSize <- getWord16be + exifHdr <- getWord32be + unless (exifHdr == 0x45786966) $ fail "Did not find exif header" + zeros <- getWord16be + unless (zeros == 0) $ fail "Expected zeros after exif header" + -- beginning of tiff header -- we read whole thing to use + -- in getting data from offsets: + let tiffHeader = BL.drop 8 hdr + byteAlign <- getWord16be + let bigEndian = byteAlign == 0x4d4d + let (getWord16, getWord32, getWord64) = + if bigEndian + then (getWord16be, getWord32be, getWord64be) + else (getWord16le, getWord32le, getWord64le) + let getRational = do + num <- getWord32 + den <- getWord32 + return $ fromIntegral num / fromIntegral den + tagmark <- getWord16 + unless (tagmark == 0x002a) $ fail "Failed alignment sanity check" + ifdOffset <- getWord32 + skip (fromIntegral ifdOffset - 8) -- skip to IDF + numentries <- getWord16 + let ifdEntry = do + tag <- getWord16 >>= \t -> + maybe (return UnknownTagType) return + (M.lookup t tagTypeTable) + dataFormat <- getWord16 + numComponents <- getWord32 + (fmt, bytesPerComponent) <- + case dataFormat of + 1 -> return (UnsignedByte . runGet getWord8, 1) + 2 -> return (AsciiString, 1) + 3 -> return (UnsignedShort . runGet getWord16, 2) + 4 -> return (UnsignedLong . runGet getWord32, 4) + 5 -> return (UnsignedRational . runGet getRational, 8) + 6 -> return (SignedByte . runGet getWord8, 1) + 7 -> return (Undefined . runGet getWord8, 1) + 8 -> return (SignedShort . runGet getWord16, 2) + 9 -> return (SignedLong . runGet getWord32, 4) + 10 -> return (SignedRational . runGet getRational, 8) + 11 -> return (SingleFloat . runGet getWord32 {- TODO -}, 4) + 12 -> return (DoubleFloat . runGet getWord64 {- TODO -}, 8) + _ -> fail $ "Unknown data format " ++ show dataFormat + let totalBytes = fromIntegral $ numComponents * bytesPerComponent + payload <- if totalBytes <= 4 -- data is right here + then fmt <$> + (getLazyByteString (fromIntegral totalBytes) <* + skip (4 - totalBytes)) + else do -- get data from offset + offs <- getWord32 + return $ fmt $ BL.take (fromIntegral totalBytes) $ + BL.drop (fromIntegral offs) tiffHeader + return (tag, payload) + entries <- sequence $ replicate (fromIntegral numentries) ifdEntry + subentries <- case lookup ExifOffset entries of + Just (UnsignedLong offset) -> do + pos <- bytesRead + skip (fromIntegral offset - (fromIntegral pos - 8)) + numsubentries <- getWord16 + sequence $ + replicate (fromIntegral numsubentries) ifdEntry + _ -> return [] + let allentries = entries ++ subentries + (width, height) <- case (lookup ExifImageWidth allentries, + lookup ExifImageHeight allentries) of + (Just (UnsignedLong w), Just (UnsignedLong h)) -> + return (fromIntegral w, fromIntegral h) + _ -> return defaultSize + -- we return a default width and height when + -- the exif header doesn't contain these + let resfactor = case lookup ResolutionUnit allentries of + Just (UnsignedShort 1) -> (100 / 254) + _ -> 1 + let xres = maybe 72 (\(UnsignedRational x) -> floor $ x * resfactor) + $ lookup XResolution allentries + let yres = maybe 72 (\(UnsignedRational x) -> floor $ x * resfactor) + $ lookup YResolution allentries + return $ ImageSize{ + pxX = width + , pxY = height + , dpiX = xres + , dpiY = yres } + +data DataFormat = UnsignedByte Word8 + | AsciiString BL.ByteString + | UnsignedShort Word16 + | UnsignedLong Word32 + | UnsignedRational Rational + | SignedByte Word8 + | Undefined Word8 + | SignedShort Word16 + | SignedLong Word32 + | SignedRational Rational + | SingleFloat Word32 + | DoubleFloat Word64 + deriving (Show) + +data TagType = ImageDescription + | Make + | Model + | Orientation + | XResolution + | YResolution + | ResolutionUnit + | Software + | DateTime + | WhitePoint + | PrimaryChromaticities + | YCbCrCoefficients + | YCbCrPositioning + | ReferenceBlackWhite + | Copyright + | ExifOffset + | ExposureTime + | FNumber + | ExposureProgram + | ISOSpeedRatings + | ExifVersion + | DateTimeOriginal + | DateTimeDigitized + | ComponentConfiguration + | CompressedBitsPerPixel + | ShutterSpeedValue + | ApertureValue + | BrightnessValue + | ExposureBiasValue + | MaxApertureValue + | SubjectDistance + | MeteringMode + | LightSource + | Flash + | FocalLength + | MakerNote + | UserComment + | FlashPixVersion + | ColorSpace + | ExifImageWidth + | ExifImageHeight + | RelatedSoundFile + | ExifInteroperabilityOffset + | FocalPlaneXResolution + | FocalPlaneYResolution + | FocalPlaneResolutionUnit + | SensingMethod + | FileSource + | SceneType + | UnknownTagType + deriving (Show, Eq, Ord) +tagTypeTable :: M.Map Word16 TagType +tagTypeTable = M.fromList + [ (0x010e, ImageDescription) + , (0x010f, Make) + , (0x0110, Model) + , (0x0112, Orientation) + , (0x011a, XResolution) + , (0x011b, YResolution) + , (0x0128, ResolutionUnit) + , (0x0131, Software) + , (0x0132, DateTime) + , (0x013e, WhitePoint) + , (0x013f, PrimaryChromaticities) + , (0x0211, YCbCrCoefficients) + , (0x0213, YCbCrPositioning) + , (0x0214, ReferenceBlackWhite) + , (0x8298, Copyright) + , (0x8769, ExifOffset) + , (0x829a, ExposureTime) + , (0x829d, FNumber) + , (0x8822, ExposureProgram) + , (0x8827, ISOSpeedRatings) + , (0x9000, ExifVersion) + , (0x9003, DateTimeOriginal) + , (0x9004, DateTimeDigitized) + , (0x9101, ComponentConfiguration) + , (0x9102, CompressedBitsPerPixel) + , (0x9201, ShutterSpeedValue) + , (0x9202, ApertureValue) + , (0x9203, BrightnessValue) + , (0x9204, ExposureBiasValue) + , (0x9205, MaxApertureValue) + , (0x9206, SubjectDistance) + , (0x9207, MeteringMode) + , (0x9208, LightSource) + , (0x9209, Flash) + , (0x920a, FocalLength) + , (0x927c, MakerNote) + , (0x9286, UserComment) + , (0xa000, FlashPixVersion) + , (0xa001, ColorSpace) + , (0xa002, ExifImageWidth) + , (0xa003, ExifImageHeight) + , (0xa004, RelatedSoundFile) + , (0xa005, ExifInteroperabilityOffset) + , (0xa20e, FocalPlaneXResolution) + , (0xa20f, FocalPlaneYResolution) + , (0xa210, FocalPlaneResolutionUnit) + , (0xa217, SensingMethod) + , (0xa300, FileSource) + , (0xa301, SceneType) + ] diff --git a/src/Text/Pandoc/MIME.hs b/src/Text/Pandoc/MIME.hs index eb54bd48d..6e6284b25 100644 --- a/src/Text/Pandoc/MIME.hs +++ b/src/Text/Pandoc/MIME.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2011 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2011-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.MIME - Copyright : Copyright (C) 2011 John MacFarlane + Copyright : Copyright (C) 2011-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -27,7 +27,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Mime type lookup for ODT writer. -} -module Text.Pandoc.MIME ( getMimeType ) +module Text.Pandoc.MIME ( getMimeType, extensionFromMimeType ) where import System.FilePath import Data.Char ( toLower ) @@ -37,7 +37,15 @@ import qualified Data.Map as M getMimeType :: FilePath -> Maybe String getMimeType "layout-cache" = Just "application/binary" -- in ODT getMimeType f = M.lookup (map toLower $ drop 1 $ takeExtension f) mimeTypes - where mimeTypes = M.fromList -- List borrowed from happstack-server. + where mimeTypes = M.fromList mimeTypesList + +extensionFromMimeType :: String -> Maybe String +extensionFromMimeType mimetype = M.lookup (takeWhile (/=';') mimetype) reverseMimeTypes + -- note: we just look up the basic mime type, dropping the content-encoding etc. + where reverseMimeTypes = M.fromList $ map (\(k,v) -> (v,k)) mimeTypesList + +mimeTypesList :: [(String, String)] +mimeTypesList = -- List borrowed from happstack-server. [("gz","application/x-gzip") ,("cabal","application/x-cabal") ,("%","application/x-trash") @@ -139,6 +147,7 @@ getMimeType f = M.lookup (map toLower $ drop 1 $ takeExtension f) mimeTypes ,("dxr","application/x-director") ,("emb","chemical/x-embl-dl-nucleotide") ,("embl","chemical/x-embl-dl-nucleotide") + ,("emf","image/x-emf") ,("eml","message/rfc822") ,("ent","chemical/x-ncbi-asn1-ascii") ,("eot","application/vnd.ms-fontobject") @@ -212,6 +221,7 @@ getMimeType f = M.lookup (map toLower $ drop 1 $ takeExtension f) mimeTypes ,("jnlp","application/x-java-jnlp-file") ,("jpe","image/jpeg") ,("jpeg","image/jpeg") + ,("jfif","image/jpeg") ,("jpg","image/jpeg") ,("js","application/x-javascript") ,("kar","audio/midi") @@ -236,6 +246,7 @@ getMimeType f = M.lookup (map toLower $ drop 1 $ takeExtension f) mimeTypes ,("lzx","application/x-lzx") ,("m3u","audio/mpegurl") ,("m4a","audio/mpeg") + ,("m4v","video/x-m4v") ,("maker","application/x-maker") ,("man","application/x-troff-man") ,("mcif","chemical/x-mmcif") @@ -456,6 +467,7 @@ getMimeType f = M.lookup (map toLower $ drop 1 $ takeExtension f) mimeTypes ,("wm","video/x-ms-wm") ,("wma","audio/x-ms-wma") ,("wmd","application/x-ms-wmd") + ,("wmf","image/x-wmf") ,("wml","text/vnd.wap.wml") ,("wmlc","application/vnd.wap.wmlc") ,("wmls","text/vnd.wap.wmlscript") diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs index c9a5e27da..b7a3a4b7b 100644 --- a/src/Text/Pandoc/Options.hs +++ b/src/Text/Pandoc/Options.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2012 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2012-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Options - Copyright : Copyright (C) 2012 John MacFarlane + Copyright : Copyright (C) 2012-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -41,6 +41,7 @@ module Text.Pandoc.Options ( Extension(..) , HTMLSlideVariant (..) , EPUBVersion (..) , WriterOptions (..) + , TrackChanges (..) , def , isEnabled ) where @@ -48,7 +49,6 @@ import Data.Set (Set) import qualified Data.Set as Set import Data.Default import Text.Pandoc.Highlighting (Style, pygments) -import qualified Text.CSL as CSL -- | Individually selectable syntax extensions. data Extension = @@ -81,6 +81,7 @@ data Extension = | Ext_link_attributes -- ^ MMD style reference link attributes | Ext_autolink_bare_uris -- ^ Make all absolute URIs into links | Ext_fancy_lists -- ^ Enable fancy list numbers and delimiters + | Ext_lists_without_preceding_blankline -- ^ Allow lists without preceding blank | Ext_startnum -- ^ Make start number of ordered list significant | Ext_definition_lists -- ^ Definition lists as in pandoc, mmd, php | Ext_example_lists -- ^ Markdown-style numbered examples @@ -92,6 +93,7 @@ data Extension = | Ext_superscript -- ^ Superscript using ^this^ syntax | Ext_subscript -- ^ Subscript using ~this~ syntax | Ext_hard_line_breaks -- ^ All newlines become hard line breaks + | Ext_ignore_line_breaks -- ^ Newlines in paragraphs are ignored | Ext_literate_haskell -- ^ Enable literate Haskell conventions | Ext_abbreviations -- ^ PHP markdown extra abbreviation definitions | Ext_auto_identifiers -- ^ Automatic identifiers for headers @@ -169,6 +171,7 @@ githubMarkdownExtensions = Set.fromList , Ext_intraword_underscores , Ext_strikeout , Ext_hard_line_breaks + , Ext_lists_without_preceding_blankline ] multimarkdownExtensions :: Set Extension @@ -204,12 +207,12 @@ data ReaderOptions = ReaderOptions{ , readerOldDashes :: Bool -- ^ Use pandoc <= 1.8.2.1 behavior -- in parsing dashes; -- is em-dash; -- - before numerial is en-dash - , readerReferences :: [CSL.Reference] -- ^ Bibliographic references - , readerCitationStyle :: Maybe CSL.Style -- ^ Citation style , readerApplyMacros :: Bool -- ^ Apply macros to TeX math , readerIndentedCodeClasses :: [String] -- ^ Default classes for -- indented code blocks , readerDefaultImageExtension :: String -- ^ Default extension for images + , readerTrace :: Bool -- ^ Print debugging info + , readerTrackChanges :: TrackChanges } deriving (Show, Read) instance Default ReaderOptions @@ -222,11 +225,11 @@ instance Default ReaderOptions , readerColumns = 80 , readerTabStop = 4 , readerOldDashes = False - , readerReferences = [] - , readerCitationStyle = Nothing , readerApplyMacros = True , readerIndentedCodeClasses = [] , readerDefaultImageExtension = "" + , readerTrace = False + , readerTrackChanges = AcceptChanges } -- @@ -264,6 +267,12 @@ data HTMLSlideVariant = S5Slides | NoSlides deriving (Show, Read, Eq) +-- | Options for accepting or rejecting MS Word track-changes. +data TrackChanges = AcceptChanges + | RejectChanges + | AllChanges + deriving (Show, Read, Eq) + -- | Options for writers data WriterOptions = WriterOptions { writerStandalone :: Bool -- ^ Include header and footer @@ -285,10 +294,9 @@ data WriterOptions = WriterOptions , writerEmailObfuscation :: ObfuscationMethod -- ^ How to obfuscate emails , writerIdentifierPrefix :: String -- ^ Prefix for section & note ids in HTML -- and for footnote marks in markdown - , writerSourceDirectory :: FilePath -- ^ Directory path of 1st source file + , writerSourceURL :: Maybe String -- ^ Absolute URL + directory of 1st source file , writerUserDataDir :: Maybe FilePath -- ^ Path of user data directory , writerCiteMethod :: CiteMethod -- ^ How to print cites - , writerBiblioFiles :: [FilePath] -- ^ Biblio files to use for citations , writerHtml5 :: Bool -- ^ Produce HTML5 , writerHtmlQTags :: Bool -- ^ Use @<q>@ tags for quotes in HTML , writerBeamer :: Bool -- ^ Produce beamer LaTeX slide show @@ -328,10 +336,9 @@ instance Default WriterOptions where , writerColumns = 72 , writerEmailObfuscation = JavascriptObfuscation , writerIdentifierPrefix = "" - , writerSourceDirectory = "." + , writerSourceURL = Nothing , writerUserDataDir = Nothing , writerCiteMethod = Citeproc - , writerBiblioFiles = [] , writerHtml5 = False , writerHtmlQTags = False , writerBeamer = False diff --git a/src/Text/Pandoc/PDF.hs b/src/Text/Pandoc/PDF.hs index 3227fd0bd..bd55c565f 100644 --- a/src/Text/Pandoc/PDF.hs +++ b/src/Text/Pandoc/PDF.hs @@ -1,6 +1,6 @@ {-# LANGUAGE OverloadedStrings, CPP #-} {- -Copyright (C) 2012 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2012-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.PDF - Copyright : Copyright (C) 2012 John MacFarlane + Copyright : Copyright (C) 2012-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -28,22 +28,32 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of LaTeX documents to PDF. -} -module Text.Pandoc.PDF ( tex2pdf ) where +module Text.Pandoc.PDF ( makePDF ) where import System.IO.Temp import Data.ByteString.Lazy (ByteString) import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Char8 as BC +import qualified Data.ByteString as BS import System.Exit (ExitCode (..)) import System.FilePath import System.Directory -import System.Process -import Control.Exception (evaluate) -import System.IO (hClose) -import Control.Concurrent (putMVar, takeMVar, newEmptyMVar, forkIO) -import Text.Pandoc.UTF8 as UTF8 +import Data.Digest.Pure.SHA (showDigest, sha1) +import System.Environment import Control.Monad (unless) import Data.List (isInfixOf) +import Data.Maybe (fromMaybe) +import qualified Text.Pandoc.UTF8 as UTF8 +import Text.Pandoc.Definition +import Text.Pandoc.Walk (walkM) +import Text.Pandoc.Shared (fetchItem, warn) +import Text.Pandoc.Options (WriterOptions(..)) +import Text.Pandoc.MIME (extensionFromMimeType) +import Text.Pandoc.Process (pipeProcess) +import qualified Data.ByteString.Lazy as BL +#ifdef _WINDOWS +import Data.List (intercalate) +#endif withTempDir :: String -> (FilePath -> IO a) -> IO a withTempDir = @@ -53,12 +63,50 @@ withTempDir = withSystemTempDirectory #endif -tex2pdf :: String -- ^ tex program (pdflatex, lualatex, xelatex) - -> String -- ^ latex source +#ifdef _WINDOWS +changePathSeparators :: FilePath -> FilePath +changePathSeparators = intercalate "/" . splitDirectories +#endif + +makePDF :: String -- ^ pdf creator (pdflatex, lualatex, xelatex) + -> (WriterOptions -> Pandoc -> String) -- ^ writer + -> WriterOptions -- ^ options + -> Pandoc -- ^ document -> IO (Either ByteString ByteString) -tex2pdf program source = withTempDir "tex2pdf." $ \tmpdir -> +makePDF program writer opts doc = withTempDir "tex2pdf." $ \tmpdir -> do + doc' <- handleImages (writerSourceURL opts) tmpdir doc + let source = writer opts doc' tex2pdf' tmpdir program source +handleImages :: Maybe String -- ^ source base URL + -> FilePath -- ^ temp dir to store images + -> Pandoc -- ^ document + -> IO Pandoc +handleImages baseURL tmpdir = walkM (handleImage' baseURL tmpdir) + +handleImage' :: Maybe String + -> FilePath + -> Inline + -> IO Inline +handleImage' baseURL tmpdir (Image ils (src,tit)) = do + exists <- doesFileExist src + if exists + then return $ Image ils (src,tit) + else do + res <- fetchItem baseURL src + case res of + Right (contents, Just mime) -> do + let ext = fromMaybe (takeExtension src) $ + extensionFromMimeType mime + let basename = showDigest $ sha1 $ BL.fromChunks [contents] + let fname = tmpdir </> basename <.> ext + BS.writeFile fname contents + return $ Image ils (fname,tit) + _ -> do + warn $ "Could not find image `" ++ src ++ "', skipping..." + return $ Image ils (src,tit) +handleImage' _ _ x = return x + tex2pdf' :: FilePath -- ^ temp directory for output -> String -- ^ tex program -> String -- ^ tex source @@ -68,11 +116,16 @@ tex2pdf' tmpDir program source = do then 3 -- to get page numbers else 2 -- 1 run won't give you PDF bookmarks (exit, log', mbPdf) <- runTeXProgram program numruns tmpDir source - let msg = "Error producing PDF from TeX source." case (exit, mbPdf) of - (ExitFailure _, _) -> return $ Left $ - msg <> "\n" <> extractMsg log' - (ExitSuccess, Nothing) -> return $ Left msg + (ExitFailure _, _) -> do + let logmsg = extractMsg log' + let extramsg = + case logmsg of + x | "! Package inputenc Error" `BC.isPrefixOf` x -> + "\nTry running pandoc with --latex-engine=xelatex." + _ -> "" + return $ Left $ logmsg <> extramsg + (ExitSuccess, Nothing) -> return $ Left "" (ExitSuccess, Just pdf) -> return $ Right pdf (<>) :: ByteString -> ByteString -> ByteString @@ -100,45 +153,33 @@ runTeXProgram program runsLeft tmpDir source = do let file = tmpDir </> "input.tex" exists <- doesFileExist file unless exists $ UTF8.writeFile file source +#ifdef _WINDOWS + -- note: we want / even on Windows, for TexLive + let tmpDir' = changePathSeparators tmpDir + let file' = changePathSeparators file +#else + let tmpDir' = tmpDir + let file' = file +#endif let programArgs = ["-halt-on-error", "-interaction", "nonstopmode", - "-output-directory", tmpDir, file] - (exit, out, err) <- readCommand program programArgs + "-output-directory", tmpDir', file'] + env' <- getEnvironment + let sep = searchPathSeparator:[] + let texinputs = maybe (tmpDir' ++ sep) ((tmpDir' ++ sep) ++) + $ lookup "TEXINPUTS" env' + let env'' = ("TEXINPUTS", texinputs) : + [(k,v) | (k,v) <- env', k /= "TEXINPUTS"] + (exit, out, err) <- pipeProcess (Just env'') program programArgs BL.empty if runsLeft > 1 then runTeXProgram program (runsLeft - 1) tmpDir source else do let pdfFile = replaceDirectory (replaceExtension file ".pdf") tmpDir pdfExists <- doesFileExist pdfFile pdf <- if pdfExists - then Just `fmap` B.readFile pdfFile + -- We read PDF as a strict bytestring to make sure that the + -- temp directory is removed on Windows. + -- See https://github.com/jgm/pandoc/issues/1192. + then (Just . B.fromChunks . (:[])) `fmap` BS.readFile pdfFile else return Nothing return (exit, out <> err, pdf) --- utility functions - --- Run a command and return exitcode, contents of stdout, and --- contents of stderr. (Based on --- 'readProcessWithExitCode' from 'System.Process'.) -readCommand :: FilePath -- ^ command to run - -> [String] -- ^ any arguments - -> IO (ExitCode,ByteString,ByteString) -- ^ exit, stdout, stderr -readCommand cmd args = do - (Just inh, Just outh, Just errh, pid) <- - createProcess (proc cmd args){ std_in = CreatePipe, - std_out = CreatePipe, - std_err = CreatePipe } - outMVar <- newEmptyMVar - -- fork off a thread to start consuming stdout - out <- B.hGetContents outh - _ <- forkIO $ evaluate (B.length out) >> putMVar outMVar () - -- fork off a thread to start consuming stderr - err <- B.hGetContents errh - _ <- forkIO $ evaluate (B.length err) >> putMVar outMVar () - -- now write and flush any input - hClose inh -- done with stdin - -- wait on the output - takeMVar outMVar - takeMVar outMVar - hClose outh - -- wait on the process - ex <- waitForProcess pid - return (ex, out, err) diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 0913d8c6c..8bc042e28 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -1,6 +1,7 @@ -{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE GeneralizedNewtypeDeriving, TypeSynonymInstances, + FlexibleInstances#-} {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +20,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Parsing - Copyright : Copyright (C) 2006-2010 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -47,11 +48,12 @@ module Text.Pandoc.Parsing ( (>>~), romanNumeral, emailAddress, uri, + mathInline, + mathDisplay, withHorizDisplacement, withRaw, escaped, characterReference, - updateLastStrPos, anyOrderedListMarker, orderedListMarker, charRef, @@ -61,10 +63,16 @@ module Text.Pandoc.Parsing ( (>>~), gridTableWith, readWith, testStringWith, - getOption, guardEnabled, guardDisabled, + updateLastStrPos, + notAfterString, ParserState (..), + HasReaderOptions (..), + HasHeaderMap (..), + HasIdentifierList (..), + HasMacros (..), + HasLastStrPosition (..), defaultParserState, HeaderType (..), ParserContext (..), @@ -75,6 +83,7 @@ module Text.Pandoc.Parsing ( (>>~), SubstTable, Key (..), toKey, + registerHeader, smartPunctuation, withQuoteContext, singleQuoteStart, @@ -85,6 +94,7 @@ module Text.Pandoc.Parsing ( (>>~), apostrophe, dash, nested, + citeKey, macro, applyMacros', Parser, @@ -151,6 +161,7 @@ where import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Builder (Blocks, Inlines, rawBlock, HasMeta(..)) +import qualified Text.Pandoc.Builder as B import Text.Pandoc.XML (fromEntities) import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn) import Text.Parsec @@ -161,16 +172,18 @@ import Data.List ( intercalate, transpose ) import Text.Pandoc.Shared import qualified Data.Map as M import Text.TeXMath.Macros (applyMacros, Macro, parseMacroDefinitions) -import Text.HTML.TagSoup.Entity ( lookupEntity ) +import Text.Pandoc.Compat.TagSoupEntity ( lookupEntity ) +import Text.Pandoc.Asciify (toAsciiChar) import Data.Default import qualified Data.Set as Set import Control.Monad.Reader -import Control.Applicative ((*>), (<*), (<$), liftA2) +import Control.Applicative ((*>), (<*), (<$), liftA2, Applicative) import Data.Monoid +import Data.Maybe (catMaybes) type Parser t s = Parsec t s -newtype F a = F { unF :: Reader ParserState a } deriving (Monad, Functor) +newtype F a = F { unF :: Reader ParserState a } deriving (Monad, Applicative, Functor) runF :: F a -> ParserState -> a runF = runReader . unF @@ -261,7 +274,7 @@ spaceChar = satisfy $ \c -> c == ' ' || c == '\t' -- | Parses a nonspace, nonnewline character. nonspaceChar :: Parser [Char] st Char -nonspaceChar = satisfy $ \x -> x /= '\t' && x /= '\n' && x /= ' ' && x /= '\r' +nonspaceChar = satisfy $ flip notElem ['\t', '\n', ' ', '\r'] -- | Skips zero or more spaces or tabs. skipSpaces :: Parser [Char] st () @@ -421,7 +434,6 @@ uri :: Parser [Char] st (String, String) uri = try $ do scheme <- uriScheme char ':' - -- /^[\/\w\u0080-\uffff]+|%[A-Fa-f0-9]+|&#?\w+;|(?:[,]+|[\S])[%&~\w\u0080-\uffff]/ -- We allow punctuation except at the end, since -- we don't want the trailing '.' in 'http://google.com.' We want to allow -- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation) @@ -448,6 +460,41 @@ uri = try $ do let uri' = scheme ++ ":" ++ fromEntities str' return (uri', escapeURI uri') +mathInlineWith :: String -> String -> Parser [Char] st String +mathInlineWith op cl = try $ do + string op + notFollowedBy space + words' <- many1Till (count 1 (noneOf " \t\n\\") + <|> (char '\\' >> anyChar >>= \c -> return ['\\',c]) + <|> do (blankline <* notFollowedBy' blankline) <|> + (oneOf " \t" <* skipMany (oneOf " \t")) + notFollowedBy (char '$') + return " " + ) (try $ string cl) + notFollowedBy digit -- to prevent capture of $5 + return $ concat words' + +mathDisplayWith :: String -> String -> Parser [Char] st String +mathDisplayWith op cl = try $ do + string op + many1Till (noneOf "\n" <|> (newline >>~ notFollowedBy' blankline)) (try $ string cl) + +mathDisplay :: Parser [Char] ParserState String +mathDisplay = + (guardEnabled Ext_tex_math_dollars >> mathDisplayWith "$$" "$$") + <|> (guardEnabled Ext_tex_math_single_backslash >> + mathDisplayWith "\\[" "\\]") + <|> (guardEnabled Ext_tex_math_double_backslash >> + mathDisplayWith "\\\\[" "\\\\]") + +mathInline :: Parser [Char] ParserState String +mathInline = + (guardEnabled Ext_tex_math_dollars >> mathInlineWith "$" "$") + <|> (guardEnabled Ext_tex_math_single_backslash >> + mathInlineWith "\\(" "\\)") + <|> (guardEnabled Ext_tex_math_double_backslash >> + mathInlineWith "\\\\(" "\\\\)") + -- | Applies a parser, returns tuple of its results and its horizontal -- displacement (the difference between the source column at the end -- and the source column at the beginning). Vertical displacement @@ -462,7 +509,7 @@ withHorizDisplacement parser = do -- | Applies a parser and returns the raw string that was parsed, -- along with the value produced by the parser. -withRaw :: Parser [Char] st a -> Parser [Char] st (a, [Char]) +withRaw :: Monad m => ParsecT [Char] st m a -> ParsecT [Char] st m (a, [Char]) withRaw parser = do pos1 <- getPosition inp <- getInput @@ -802,6 +849,7 @@ data ParserState = ParserState stateNotes :: NoteTable, -- ^ List of notes (raw bodies) stateNotes' :: NoteTable', -- ^ List of notes (parsed bodies) stateMeta :: Meta, -- ^ Document metadata + stateMeta' :: F Meta, -- ^ Document metadata stateHeaderTable :: [HeaderType], -- ^ Ordered list of header types used stateHeaders :: M.Map Inlines String, -- ^ List of headers and ids (used for implicit ref links) stateIdentifiers :: [String], -- ^ List of header identifiers used @@ -810,6 +858,11 @@ data ParserState = ParserState stateHasChapters :: Bool, -- ^ True if \chapter encountered stateMacros :: [Macro], -- ^ List of macros defined so far stateRstDefaultRole :: String, -- ^ Current rST default interpreted text role + stateRstCustomRoles :: M.Map String (String, Maybe String, Attr -> (String, Attr)), -- ^ Current rST custom text roles + -- Triple represents: 1) Base role, 2) Optional format (only for :raw: + -- roles), 3) Source language annotation for code (could be used to + -- annotate role classes too). + stateCaption :: Maybe Inlines, -- ^ Caption in current environment stateWarnings :: [String] -- ^ Warnings generated by the parser } @@ -822,6 +875,48 @@ instance HasMeta ParserState where deleteMeta field st = st{ stateMeta = deleteMeta field $ stateMeta st } +class HasReaderOptions st where + extractReaderOptions :: st -> ReaderOptions + getOption :: (ReaderOptions -> b) -> Parser s st b + -- default + getOption f = (f . extractReaderOptions) `fmap` getState + +instance HasReaderOptions ParserState where + extractReaderOptions = stateOptions + +class HasHeaderMap st where + extractHeaderMap :: st -> M.Map Inlines String + updateHeaderMap :: (M.Map Inlines String -> M.Map Inlines String) -> + st -> st + +instance HasHeaderMap ParserState where + extractHeaderMap = stateHeaders + updateHeaderMap f st = st{ stateHeaders = f $ stateHeaders st } + +class HasIdentifierList st where + extractIdentifierList :: st -> [String] + updateIdentifierList :: ([String] -> [String]) -> st -> st + +instance HasIdentifierList ParserState where + extractIdentifierList = stateIdentifiers + updateIdentifierList f st = st{ stateIdentifiers = f $ stateIdentifiers st } + +class HasMacros st where + extractMacros :: st -> [Macro] + updateMacros :: ([Macro] -> [Macro]) -> st -> st + +instance HasMacros ParserState where + extractMacros = stateMacros + updateMacros f st = st{ stateMacros = f $ stateMacros st } + +class HasLastStrPosition st where + setLastStrPos :: SourcePos -> st -> st + getLastStrPos :: st -> Maybe SourcePos + +instance HasLastStrPosition ParserState where + setLastStrPos pos st = st{ stateLastStrPos = Just pos } + getLastStrPos st = stateLastStrPos st + defaultParserState :: ParserState defaultParserState = ParserState { stateOptions = def, @@ -835,6 +930,7 @@ defaultParserState = stateNotes = [], stateNotes' = [], stateMeta = nullMeta, + stateMeta' = return nullMeta, stateHeaderTable = [], stateHeaders = M.empty, stateIdentifiers = [], @@ -843,19 +939,29 @@ defaultParserState = stateHasChapters = False, stateMacros = [], stateRstDefaultRole = "title-reference", + stateRstCustomRoles = M.empty, + stateCaption = Nothing, stateWarnings = []} -getOption :: (ReaderOptions -> a) -> Parser s ParserState a -getOption f = (f . stateOptions) `fmap` getState - -- | Succeed only if the extension is enabled. -guardEnabled :: Extension -> Parser s ParserState () +guardEnabled :: HasReaderOptions st => Extension -> Parser s st () guardEnabled ext = getOption readerExtensions >>= guard . Set.member ext -- | Succeed only if the extension is disabled. -guardDisabled :: Extension -> Parser s ParserState () +guardDisabled :: HasReaderOptions st => Extension -> Parser s st () guardDisabled ext = getOption readerExtensions >>= guard . not . Set.member ext +-- | Update the position on which the last string ended. +updateLastStrPos :: HasLastStrPosition st => Parser s st () +updateLastStrPos = getPosition >>= updateState . setLastStrPos + +-- | Whether we are right after the end of a string. +notAfterString :: HasLastStrPosition st => Parser s st Bool +notAfterString = do + pos <- getPosition + st <- getState + return $ getLastStrPos st /= Just pos + data HeaderType = SingleHeader Char -- ^ Single line of characters underneath | DoubleHeader Char -- ^ Lines of characters above and below @@ -885,21 +991,47 @@ type KeyTable = M.Map Key Target type SubstTable = M.Map Key Inlines +-- | Add header to the list of headers in state, together +-- with its associated identifier. If the identifier is null +-- and the auto_identifers extension is set, generate a new +-- unique identifier, and update the list of identifiers +-- in state. +registerHeader :: (HasReaderOptions st, HasHeaderMap st, HasIdentifierList st) + => Attr -> Inlines -> Parser s st Attr +registerHeader (ident,classes,kvs) header' = do + ids <- extractIdentifierList `fmap` getState + exts <- getOption readerExtensions + let insert' = M.insertWith (\_new old -> old) + if null ident && Ext_auto_identifiers `Set.member` exts + then do + let id' = uniqueIdent (B.toList header') ids + let id'' = if Ext_ascii_identifiers `Set.member` exts + then catMaybes $ map toAsciiChar id' + else id' + updateState $ updateIdentifierList $ + if id' == id'' then (id' :) else ([id', id''] ++) + updateState $ updateHeaderMap $ insert' header' id' + return (id'',classes,kvs) + else do + unless (null ident) $ + updateState $ updateHeaderMap $ insert' header' ident + return (ident,classes,kvs) + -- | Fail unless we're in "smart typography" mode. -failUnlessSmart :: Parser [tok] ParserState () +failUnlessSmart :: HasReaderOptions st => Parser s st () failUnlessSmart = getOption readerSmart >>= guard -smartPunctuation :: Parser [Char] ParserState Inline - -> Parser [Char] ParserState Inline +smartPunctuation :: Parser [Char] ParserState Inlines + -> Parser [Char] ParserState Inlines smartPunctuation inlineParser = do failUnlessSmart choice [ quoted inlineParser, apostrophe, dash, ellipses ] -apostrophe :: Parser [Char] ParserState Inline -apostrophe = (char '\'' <|> char '\8217') >> return (Str "\x2019") +apostrophe :: Parser [Char] ParserState Inlines +apostrophe = (char '\'' <|> char '\8217') >> return (B.str "\x2019") -quoted :: Parser [Char] ParserState Inline - -> Parser [Char] ParserState Inline +quoted :: Parser [Char] ParserState Inlines + -> Parser [Char] ParserState Inlines quoted inlineParser = doubleQuoted inlineParser <|> singleQuoted inlineParser withQuoteContext :: QuoteContext @@ -914,20 +1046,19 @@ withQuoteContext context parser = do setState newState { stateQuoteContext = oldQuoteContext } return result -singleQuoted :: Parser [Char] ParserState Inline - -> Parser [Char] ParserState Inline +singleQuoted :: Parser [Char] ParserState Inlines + -> Parser [Char] ParserState Inlines singleQuoted inlineParser = try $ do singleQuoteStart withQuoteContext InSingleQuote $ many1Till inlineParser singleQuoteEnd >>= - return . Quoted SingleQuote . normalizeSpaces + return . B.singleQuoted . mconcat -doubleQuoted :: Parser [Char] ParserState Inline - -> Parser [Char] ParserState Inline +doubleQuoted :: Parser [Char] ParserState Inlines + -> Parser [Char] ParserState Inlines doubleQuoted inlineParser = try $ do doubleQuoteStart - withQuoteContext InDoubleQuote $ do - contents <- manyTill inlineParser doubleQuoteEnd - return . Quoted DoubleQuote . normalizeSpaces $ contents + withQuoteContext InDoubleQuote $ manyTill inlineParser doubleQuoteEnd >>= + return . B.doubleQuoted . mconcat failIfInQuoteContext :: QuoteContext -> Parser [tok] ParserState () failIfInQuoteContext context = do @@ -942,17 +1073,11 @@ charOrRef cs = guard (c `elem` cs) return c) -updateLastStrPos :: Parser [Char] ParserState () -updateLastStrPos = getPosition >>= \p -> - updateState $ \s -> s{ stateLastStrPos = Just p } - singleQuoteStart :: Parser [Char] ParserState () singleQuoteStart = do failIfInQuoteContext InSingleQuote - pos <- getPosition - st <- getState -- single quote start can't be right after str - guard $ stateLastStrPos st /= Just pos + guard =<< notAfterString () <$ charOrRef "'\8216\145" singleQuoteEnd :: Parser [Char] st () @@ -964,24 +1089,24 @@ doubleQuoteStart :: Parser [Char] ParserState () doubleQuoteStart = do failIfInQuoteContext InDoubleQuote try $ do charOrRef "\"\8220\147" - notFollowedBy (satisfy (\c -> c == ' ' || c == '\t' || c == '\n')) + notFollowedBy . satisfy $ flip elem [' ', '\t', '\n'] doubleQuoteEnd :: Parser [Char] st () doubleQuoteEnd = do charOrRef "\"\8221\148" return () -ellipses :: Parser [Char] st Inline +ellipses :: Parser [Char] st Inlines ellipses = do try (charOrRef "\8230\133") <|> try (string "..." >> return '…') - return (Str "\8230") + return (B.str "\8230") -dash :: Parser [Char] ParserState Inline +dash :: Parser [Char] ParserState Inlines dash = do oldDashes <- getOption readerOldDashes if oldDashes then emDashOld <|> enDashOld - else Str `fmap` (hyphenDash <|> emDash <|> enDash) + else B.str `fmap` (hyphenDash <|> emDash <|> enDash) -- Two hyphens = en-dash, three = em-dash hyphenDash :: Parser [Char] st String @@ -999,16 +1124,16 @@ enDash = do try (charOrRef "\8212\151") return "\8211" -enDashOld :: Parser [Char] st Inline +enDashOld :: Parser [Char] st Inlines enDashOld = do try (charOrRef "\8211\150") <|> try (char '-' >> lookAhead (satisfy isDigit) >> return '–') - return (Str "\8211") + return (B.str "\8211") -emDashOld :: Parser [Char] st Inline +emDashOld :: Parser [Char] st Inlines emDashOld = do try (charOrRef "\8212\151") <|> (try $ string "--" >> optional (char '-') >> return '-') - return (Str "\8212") + return (B.str "\8212") -- This is used to prevent exponential blowups for things like: -- a**a*a**a*a**a*a**a*a**a*a**a*a** @@ -1022,12 +1147,24 @@ nested p = do updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } return res +citeKey :: HasLastStrPosition st => Parser [Char] st (Bool, String) +citeKey = try $ do + guard =<< notAfterString + suppress_author <- option False (char '-' *> return True) + char '@' + firstChar <- letter <|> char '_' + let regchar = satisfy (\c -> isAlphaNum c || c == '_') + let internal p = try $ p <* lookAhead regchar + rest <- many $ regchar <|> internal (oneOf ":.#$%&-+?<>~/") + let key = firstChar:rest + return (suppress_author, key) + -- -- Macros -- -- | Parse a \newcommand or \renewcommand macro definition. -macro :: Parser [Char] ParserState Blocks +macro :: (HasMacros st, HasReaderOptions st) => Parser [Char] st Blocks macro = do apply <- getOption readerApplyMacros inp <- getInput @@ -1037,7 +1174,7 @@ macro = do if apply then do updateState $ \st -> - st { stateMacros = ms ++ stateMacros st } + updateMacros (ms ++) st return mempty else return $ rawBlock "latex" def' @@ -1046,7 +1183,6 @@ applyMacros' :: String -> Parser [Char] ParserState String applyMacros' target = do apply <- getOption readerApplyMacros if apply - then do macros <- liftM stateMacros getState + then do macros <- extractMacros `fmap` getState return $ applyMacros macros target else return target - diff --git a/src/Text/Pandoc/Pretty.hs b/src/Text/Pandoc/Pretty.hs index 21121a506..d25ba725f 100644 --- a/src/Text/Pandoc/Pretty.hs +++ b/src/Text/Pandoc/Pretty.hs @@ -1,6 +1,6 @@ -{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE GeneralizedNewtypeDeriving, CPP #-} {- -Copyright (C) 2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2010-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111(-1)307 USA {- | Module : Text.Pandoc.Pretty - Copyright : Copyright (C) 2010 John MacFarlane + Copyright : Copyright (C) 2010-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -60,6 +60,7 @@ module Text.Pandoc.Pretty ( , hsep , vcat , vsep + , nestle , chomp , inside , braces @@ -72,7 +73,7 @@ module Text.Pandoc.Pretty ( ) where -import Data.Sequence (Seq, fromList, (<|), singleton, mapWithIndex) +import Data.Sequence (Seq, fromList, (<|), singleton, mapWithIndex, viewl, ViewL(..)) import Data.Foldable (toList) import Data.List (intercalate) import Data.Monoid @@ -80,8 +81,7 @@ import Data.String import Control.Monad.State import Data.Char (isSpace) -data Monoid a => - RenderState a = RenderState{ +data RenderState a = RenderState{ output :: [a] -- ^ In reverse order , prefix :: String , usePrefix :: Bool @@ -186,6 +186,14 @@ vcat = foldr ($$) empty vsep :: [Doc] -> Doc vsep = foldr ($+$) empty +-- | Removes leading blank lines from a 'Doc'. +nestle :: Doc -> Doc +nestle (Doc d) = Doc $ go d + where go x = case viewl x of + (BlankLine :< rest) -> go rest + (NewLine :< rest) -> go rest + _ -> x + -- | Chomps trailing blank space off of a 'Doc'. chomp :: Doc -> Doc chomp d = Doc (fromList dl') diff --git a/src/Text/Pandoc/Process.hs b/src/Text/Pandoc/Process.hs new file mode 100644 index 000000000..19872b405 --- /dev/null +++ b/src/Text/Pandoc/Process.hs @@ -0,0 +1,104 @@ +{- +Copyright (C) 2013-2014 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Process + Copyright : Copyright (C) 2013-2014 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +ByteString variant of 'readProcessWithExitCode'. +-} +module Text.Pandoc.Process (pipeProcess) +where +import System.Process +import System.Exit (ExitCode (..)) +import Control.Exception +import System.IO (hClose, hFlush) +import Control.Concurrent (putMVar, takeMVar, newEmptyMVar, forkIO) +import Control.Monad (unless) +import qualified Data.ByteString.Lazy as BL + +{- | +Version of 'System.Process.readProcessWithExitCode' that uses lazy bytestrings +instead of strings and allows setting environment variables. + +@readProcessWithExitCode@ creates an external process, reads its +standard output and standard error strictly, waits until the process +terminates, and then returns the 'ExitCode' of the process, +the standard output, and the standard error. + +If an asynchronous exception is thrown to the thread executing +@readProcessWithExitCode@, the forked process will be terminated and +@readProcessWithExitCode@ will wait (block) until the process has been +terminated. +-} + +pipeProcess + :: Maybe [(String, String)] -- ^ environment variables + -> FilePath -- ^ Filename of the executable (see 'proc' for details) + -> [String] -- ^ any arguments + -> BL.ByteString -- ^ standard input + -> IO (ExitCode,BL.ByteString,BL.ByteString) -- ^ exitcode, stdout, stderr +pipeProcess mbenv cmd args input = + mask $ \restore -> do + (Just inh, Just outh, Just errh, pid) <- createProcess (proc cmd args) + { env = mbenv, + std_in = CreatePipe, + std_out = CreatePipe, + std_err = CreatePipe } + flip onException + (do hClose inh; hClose outh; hClose errh; + terminateProcess pid; waitForProcess pid) $ restore $ do + -- fork off a thread to start consuming stdout + out <- BL.hGetContents outh + waitOut <- forkWait $ evaluate $ BL.length out + + -- fork off a thread to start consuming stderr + err <- BL.hGetContents errh + waitErr <- forkWait $ evaluate $ BL.length err + + -- now write and flush any input + let writeInput = do + unless (BL.null input) $ do + BL.hPutStr inh input + hFlush inh + hClose inh + + writeInput + + -- wait on the output + waitOut + waitErr + + hClose outh + hClose errh + + -- wait on the process + ex <- waitForProcess pid + + return (ex, out, err) + +forkWait :: IO a -> IO (IO a) +forkWait a = do + res <- newEmptyMVar + _ <- mask $ \restore -> forkIO $ try (restore a) >>= putMVar res + return (takeMVar res >>= either (\ex -> throwIO (ex :: SomeException)) return) diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs index 0058e889c..cf1d5132e 100644 --- a/src/Text/Pandoc/Readers/DocBook.hs +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -1,16 +1,18 @@ module Text.Pandoc.Readers.DocBook ( readDocBook ) where -import Data.Char (toUpper, isDigit) +import Data.Char (toUpper) +import Text.Pandoc.Shared (safeRead) import Text.Pandoc.Options import Text.Pandoc.Definition import Text.Pandoc.Builder import Text.XML.Light -import Text.HTML.TagSoup.Entity (lookupEntity) +import Text.Pandoc.Compat.TagSoupEntity (lookupEntity) import Data.Generics import Data.Monoid import Data.Char (isSpace) import Control.Monad.State import Control.Applicative ((<$>)) import Data.List (intersperse) +import Data.Maybe (fromMaybe) {- @@ -43,7 +45,7 @@ List of all DocBook tags, with [x] indicating implemented, [ ] audioobject - A wrapper for audio data and its associated meta-information [x] author - The name of an individual author [ ] authorblurb - A short description or note about an author -[ ] authorgroup - Wrapper for author information when a document has +[x] authorgroup - Wrapper for author information when a document has multiple authors or collabarators [x] authorinitials - The initials or other short identifier for an author [o] beginpage - The location of a page break in a print version of the document @@ -339,7 +341,7 @@ List of all DocBook tags, with [x] indicating implemented, [x] refsectioninfo - Meta-information for a refsection [ ] refsynopsisdiv - A syntactic synopsis of the subject of the reference page [ ] refsynopsisdivinfo - Meta-information for a RefSynopsisDiv -[ ] releaseinfo - Information about a particular release of a document +[x] releaseinfo - Information about a particular release of a document [ ] remark - A remark (or comment) intended for presentation in a draft manuscript [ ] replaceable - Content that may or must be replaced by the user @@ -490,34 +492,40 @@ List of all DocBook tags, with [x] indicating implemented, anything else [ ] xref - A cross reference to another part of the document [ ] year - The year of publication of a document - +[x] ?asciidoc-br? - line break from asciidoc docbook output -} type DB = State DBState data DBState = DBState{ dbSectionLevel :: Int , dbQuoteType :: QuoteType - , dbDocTitle :: Inlines - , dbDocAuthors :: [Inlines] - , dbDocDate :: Inlines + , dbMeta :: Meta + , dbAcceptsMeta :: Bool , dbBook :: Bool , dbFigureTitle :: Inlines } deriving Show readDocBook :: ReaderOptions -> String -> Pandoc -readDocBook _ inp = setTitle (dbDocTitle st') - $ setAuthors (dbDocAuthors st') - $ setDate (dbDocDate st') - $ doc $ mconcat bs - where (bs, st') = runState (mapM parseBlock $ normalizeTree $ parseXML inp) +readDocBook _ inp = Pandoc (dbMeta st') (toList $ mconcat bs) + where (bs, st') = runState (mapM parseBlock $ normalizeTree $ parseXML inp') DBState{ dbSectionLevel = 0 , dbQuoteType = DoubleQuote - , dbDocTitle = mempty - , dbDocAuthors = [] - , dbDocDate = mempty + , dbMeta = mempty + , dbAcceptsMeta = False , dbBook = False , dbFigureTitle = mempty } + inp' = handleInstructions inp + +-- We treat <?asciidoc-br?> specially (issue #1236), converting it +-- to <br/>, since xml-light doesn't parse the instruction correctly. +-- Other xml instructions are simply removed from the input stream. +handleInstructions :: String -> String +handleInstructions ('<':'?':'a':'s':'c':'i':'i':'d':'o':'c':'-':'b':'r':'?':'>':xs) = '<':'b':'r':'/':'>': handleInstructions xs +handleInstructions xs = case break (=='<') xs of + (ys, []) -> ys + ([], '<':zs) -> '<' : handleInstructions zs + (ys, zs) -> ys ++ handleInstructions zs getFigure :: Element -> DB Blocks getFigure e = do @@ -558,6 +566,30 @@ attrValue attr elt = named :: String -> Element -> Bool named s e = qName (elName e) == s +-- + +acceptingMetadata :: DB a -> DB a +acceptingMetadata p = do + modify (\s -> s { dbAcceptsMeta = True } ) + res <- p + modify (\s -> s { dbAcceptsMeta = False }) + return res + +checkInMeta :: Monoid a => DB () -> DB a +checkInMeta p = do + accepts <- dbAcceptsMeta <$> get + when accepts p + return mempty + + + +addMeta :: ToMetaValue a => String -> a -> DB () +addMeta field val = modify (setMeta field val) + +instance HasMeta DBState where + setMeta field v s = s {dbMeta = setMeta field v (dbMeta s)} + deleteMeta field s = s {dbMeta = deleteMeta field (dbMeta s)} + isBlockElement :: Content -> Bool isBlockElement (Elem e) = qName (elName e) `elem` blocktags where blocktags = ["toc","index","para","formalpara","simpara", @@ -604,6 +636,7 @@ getImage e = do getBlocks :: Element -> DB Blocks getBlocks e = mconcat <$> (mapM parseBlock $ elContent e) + parseBlock :: Content -> DB Blocks parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE parseBlock (Text (CData _ s _)) = if all isSpace s @@ -617,10 +650,10 @@ parseBlock (Elem e) = "para" -> parseMixed para (elContent e) "formalpara" -> do tit <- case filterChild (named "title") e of - Just t -> (<> str "." <> linebreak) <$> emph - <$> getInlines t + Just t -> (para . strong . (<> str ".")) <$> + getInlines t Nothing -> return mempty - addToStart tit <$> parseMixed para (elContent e) + (tit <>) <$> parseMixed para (elContent e) "simpara" -> parseMixed para (elContent e) "ackno" -> parseMixed para (elContent e) "epigraph" -> parseBlockquote @@ -628,7 +661,11 @@ parseBlock (Elem e) = "attribution" -> return mempty "titleabbrev" -> return mempty "authorinitials" -> return mempty - "title" -> return mempty -- handled by getTitle or sect or figure + "title" -> checkInMeta getTitle + "author" -> checkInMeta getAuthor + "authorgroup" -> checkInMeta getAuthorGroup + "releaseinfo" -> checkInMeta (getInlines e >>= addMeta "release") + "date" -> checkInMeta getDate "bibliography" -> sect 0 "bibliodiv" -> sect 1 "biblioentry" -> parseMixed para (elContent e) @@ -682,18 +719,17 @@ parseBlock (Elem e) = "lowerroman" -> LowerRoman "upperroman" -> UpperRoman _ -> Decimal - let start = case attrValue "override" <$> - filterElement (named "listitem") e of - Just x@(_:_) | all isDigit x -> read x - _ -> 1 + let start = fromMaybe 1 $ + (attrValue "override" <$> filterElement (named "listitem") e) + >>= safeRead orderedListWith (start,listStyle,DefaultDelim) <$> listitems "variablelist" -> definitionList <$> deflistitems "figure" -> getFigure e "mediaobject" -> para <$> getImage e "caption" -> return mempty - "info" -> getTitle >> getAuthors >> getDate >> return mempty - "articleinfo" -> getTitle >> getAuthors >> getDate >> return mempty + "info" -> metaBlock + "articleinfo" -> metaBlock "sectioninfo" -> return mempty -- keywords & other metadata "refsectioninfo" -> return mempty -- keywords & other metadata "refsect1info" -> return mempty -- keywords & other metadata @@ -707,10 +743,10 @@ parseBlock (Elem e) = "chapterinfo" -> return mempty -- keywords & other metadata "glossaryinfo" -> return mempty -- keywords & other metadata "appendixinfo" -> return mempty -- keywords & other metadata - "bookinfo" -> getTitle >> getAuthors >> getDate >> return mempty + "bookinfo" -> metaBlock "article" -> modify (\st -> st{ dbBook = False }) >> - getTitle >> getBlocks e - "book" -> modify (\st -> st{ dbBook = True }) >> getTitle >> getBlocks e + getBlocks e + "book" -> modify (\st -> st{ dbBook = True }) >> getBlocks e "table" -> parseTable "informaltable" -> parseTable "literallayout" -> codeBlockWithLang @@ -756,30 +792,25 @@ parseBlock (Elem e) = terms' <- mapM getInlines terms items' <- mapM getBlocks items return (mconcat $ intersperse (str "; ") terms', items') - getTitle = case filterChild (named "title") e of - Just t -> do - tit <- getInlines t - subtit <- case filterChild (named "subtitle") e of - Just s -> (text ": " <>) <$> - getInlines s - Nothing -> return mempty - modify $ \st -> st{dbDocTitle = tit <> subtit} - Nothing -> return () - getAuthors = do - auths <- mapM getInlines - $ filterChildren (named "author") e - modify $ \st -> st{dbDocAuthors = auths} - getDate = case filterChild (named "date") e of - Just t -> do - dat <- getInlines t - modify $ \st -> st{dbDocDate = dat} - Nothing -> return () + getTitle = do + tit <- getInlines e + subtit <- case filterChild (named "subtitle") e of + Just s -> (text ": " <>) <$> + getInlines s + Nothing -> return mempty + addMeta "title" (tit <> subtit) + + getAuthor = (:[]) <$> getInlines e >>= addMeta "author" + getAuthorGroup = do + let terms = filterChildren (named "author") e + mapM getInlines terms >>= addMeta "author" + getDate = getInlines e >>= addMeta "date" parseTable = do let isCaption x = named "title" x || named "caption" x caption <- case filterChild isCaption e of Just t -> getInlines t Nothing -> return mempty - let e' = maybe e id $ filterChild (named "tgroup") e + let e' = fromMaybe e $ filterChild (named "tgroup") e let isColspec x = named "colspec" x || named "col" x let colspecs = case filterChild (named "colgroup") e' of Just c -> filterChildren isColspec c @@ -801,11 +832,14 @@ parseBlock (Elem e) = Just "center" -> AlignCenter _ -> AlignDefault let toWidth c = case findAttr (unqual "colwidth") c of - Just w -> read $ filter (\x -> + Just w -> fromMaybe 0 + $ safeRead $ '0': filter (\x -> (x >= '0' && x <= '9') || x == '.') w Nothing -> 0 :: Double - let numrows = maximum $ map length bodyrows + let numrows = case bodyrows of + [] -> 0 + xs -> maximum $ map length xs let aligns = case colspecs of [] -> replicate numrows AlignDefault cs -> map toAlignment cs @@ -832,6 +866,7 @@ parseBlock (Elem e) = b <- getBlocks e modify $ \st -> st{ dbSectionLevel = n - 1 } return $ header n' headerText <> b + metaBlock = acceptingMetadata (getBlocks e) >> return mempty getInlines :: Element -> DB Inlines getInlines e' = (trimInlines . mconcat) <$> (mapM parseInline $ elContent e') @@ -895,6 +930,11 @@ parseInline (Elem e) = _ -> emph <$> innerInlines "footnote" -> (note . mconcat) <$> (mapM parseBlock $ elContent e) "title" -> return mempty + "affiliation" -> return mempty + -- Note: this isn't a real docbook tag; it's what we convert + -- <?asciidor-br?> to in handleInstructions, above. A kludge to + -- work around xml-light's inability to parse an instruction. + "br" -> return linebreak _ -> innerInlines where innerInlines = (trimInlines . mconcat) <$> (mapM parseInline $ elContent e) diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs new file mode 100644 index 000000000..71baa5dde --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -0,0 +1,489 @@ +{-# LANGUAGE PatternGuards #-} + +{- +Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Conversion of Docx type (defined in Text.Pandoc.Readers.Docx.Parse) +to 'Pandoc' document. -} + +{- +Current state of implementation of Docx entities ([x] means +implemented, [-] means partially implemented): + +* Blocks + + - [X] Para + - [X] CodeBlock (styled with `SourceCode`) + - [X] BlockQuote (styled with `Quote`, `BlockQuote`, or, optionally, + indented) + - [X] OrderedList + - [X] BulletList + - [X] DefinitionList (styled with adjacent `DefinitionTerm` and `Definition`) + - [X] Header (styled with `Heading#`) + - [ ] HorizontalRule + - [-] Table (column widths and alignments not yet implemented) + +* Inlines + + - [X] Str + - [X] Emph (From italics. `underline` currently read as span. In + future, it might optionally be emph as well) + - [X] Strong + - [X] Strikeout + - [X] Superscript + - [X] Subscript + - [X] SmallCaps + - [ ] Quoted + - [ ] Cite + - [X] Code (styled with `VerbatimChar`) + - [X] Space + - [X] LineBreak (these are invisible in Word: entered with Shift-Return) + - [ ] Math + - [X] Link (links to an arbitrary bookmark create a span with the target as + id and "anchor" class) + - [-] Image (Links to path in archive. Future option for + data-encoded URI likely.) + - [X] Note (Footnotes and Endnotes are silently combined.) +-} + +module Text.Pandoc.Readers.Docx + ( readDocx + ) where + +import Codec.Archive.Zip +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Builder (text, toList) +import Text.Pandoc.MIME (getMimeType) +import Text.Pandoc.UTF8 (toString) +import Text.Pandoc.Walk +import Text.Pandoc.Readers.Docx.Parse +import Text.Pandoc.Readers.Docx.Lists +import Text.Pandoc.Readers.Docx.Reducible +import Text.Pandoc.Shared +import Data.Maybe (mapMaybe) +import Data.List (delete, isPrefixOf, (\\)) +import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as B +import Data.ByteString.Base64 (encode) +import System.FilePath (combine) +import qualified Data.Map as M +import Control.Monad.Reader +import Control.Monad.State + +readDocx :: ReaderOptions + -> B.ByteString + -> Pandoc +readDocx opts bytes = + case archiveToDocx (toArchive bytes) of + Just docx -> Pandoc nullMeta (docxToBlocks opts docx) + Nothing -> error $ "couldn't parse docx file" + +data DState = DState { docxAnchorMap :: M.Map String String } + +data DEnv = DEnv { docxOptions :: ReaderOptions + , docxDocument :: Docx} + +type DocxContext = ReaderT DEnv (State DState) + +evalDocxContext :: DocxContext a -> DEnv -> DState -> a +evalDocxContext ctx env st = evalState (runReaderT ctx env) st + +concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b] +concatMapM f xs = liftM concat (mapM f xs) + +-- This is empty, but we put it in for future-proofing. +spansToKeep :: [String] +spansToKeep = [] + +divsToKeep :: [String] +divsToKeep = ["list-item", "Definition", "DefinitionTerm"] + +runStyleToContainers :: RunStyle -> [Container Inline] +runStyleToContainers rPr = + let spanClassToContainers :: String -> [Container Inline] + spanClassToContainers s | s `elem` codeSpans = + [Container $ (\ils -> Code ("", [], []) (concatMap ilToCode ils))] + spanClassToContainers s | s `elem` spansToKeep = + [Container $ Span ("", [s], [])] + spanClassToContainers _ = [] + + classContainers = case rStyle rPr of + Nothing -> [] + Just s -> spanClassToContainers s + + formatters = map Container $ mapMaybe id + [ if isBold rPr then (Just Strong) else Nothing + , if isItalic rPr then (Just Emph) else Nothing + , if isSmallCaps rPr then (Just SmallCaps) else Nothing + , if isStrike rPr then (Just Strikeout) else Nothing + , if isSuperScript rPr then (Just Superscript) else Nothing + , if isSubScript rPr then (Just Subscript) else Nothing + , underline rPr >>= (\f -> Just $ Span ("", [], [("underline", f)])) + ] + in + classContainers ++ formatters + + +divAttrToContainers :: [String] -> [(String, String)] -> [Container Block] +divAttrToContainers (c:cs) _ | Just n <- isHeaderClass c = + [Container $ \_ -> + Header n ("", delete ("Heading" ++ show n) cs, []) []] +divAttrToContainers (c:cs) kvs | c `elem` divsToKeep = + (Container $ Div ("", [c], [])) : (divAttrToContainers cs kvs) +divAttrToContainers (c:cs) kvs | c `elem` codeDivs = + -- This is a bit of a cludge. We make the codeblock from the raw + -- parparts in bodyPartToBlocks. But we need something to match against. + (Container $ \_ -> CodeBlock ("", [], []) "") : (divAttrToContainers cs kvs) +divAttrToContainers (c:cs) kvs | c `elem` listParagraphDivs = + let kvs' = filter (\(k,_) -> k /= "indent") kvs + in + (Container $ Div ("", [c], [])) : (divAttrToContainers cs kvs') +divAttrToContainers (c:cs) kvs | c `elem` blockQuoteDivs = + (Container BlockQuote) : (divAttrToContainers (cs \\ blockQuoteDivs) kvs) +divAttrToContainers (_:cs) kvs = divAttrToContainers cs kvs +divAttrToContainers [] kvs | Just numString <- lookup "indent" kvs = + let kvs' = filter (\(k,_) -> k /= "indent") kvs + in + case numString of + "0" -> divAttrToContainers [] kvs' + ('-' : _) -> divAttrToContainers [] kvs' + _ -> (Container BlockQuote) : divAttrToContainers [] kvs' +divAttrToContainers _ _ = [] + + +parStyleToContainers :: ParagraphStyle -> [Container Block] +parStyleToContainers pPr = + let classes = pStyle pPr + kvs = case indent pPr of + Just n -> [("indent", show n)] + Nothing -> [] + in + divAttrToContainers classes kvs + + +strToInlines :: String -> [Inline] +strToInlines = toList . text + +codeSpans :: [String] +codeSpans = ["VerbatimChar"] + +blockQuoteDivs :: [String] +blockQuoteDivs = ["Quote", "BlockQuote"] + +codeDivs :: [String] +codeDivs = ["SourceCode"] + +runElemToInlines :: RunElem -> [Inline] +runElemToInlines (TextRun s) = strToInlines s +runElemToInlines (LnBrk) = [LineBreak] +runElemToInlines (Tab) = [Space] + +runElemToString :: RunElem -> String +runElemToString (TextRun s) = s +runElemToString (LnBrk) = ['\n'] +runElemToString (Tab) = ['\t'] + +runElemsToString :: [RunElem] -> String +runElemsToString = concatMap runElemToString + +runToString :: Run -> String +runToString (Run _ runElems) = runElemsToString runElems +runToString _ = "" + +parPartToString :: ParPart -> String +parPartToString (PlainRun run) = runToString run +parPartToString (InternalHyperLink _ runs) = concatMap runToString runs +parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs +parPartToString _ = "" + + +inlineCodeContainer :: Container Inline -> Bool +inlineCodeContainer (Container f) = case f [] of + Code _ "" -> True + _ -> False +inlineCodeContainer _ = False + + +runToInlines :: Run -> DocxContext [Inline] +runToInlines (Run rs runElems) + | any inlineCodeContainer (runStyleToContainers rs) = + return $ + rebuild (runStyleToContainers rs) $ [Str $ runElemsToString runElems] + | otherwise = + return $ + rebuild (runStyleToContainers rs) (concatMap runElemToInlines runElems) +runToInlines (Footnote fnId) = do + (Docx _ notes _ _ _ ) <- asks docxDocument + case (getFootNote fnId notes) of + Just bodyParts -> do + blks <- concatMapM bodyPartToBlocks bodyParts + return $ [Note blks] + Nothing -> return [Note []] +runToInlines (Endnote fnId) = do + (Docx _ notes _ _ _ ) <- asks docxDocument + case (getEndNote fnId notes) of + Just bodyParts -> do + blks <- concatMapM bodyPartToBlocks bodyParts + return $ [Note blks] + Nothing -> return [Note []] + +parPartToInlines :: ParPart -> DocxContext [Inline] +parPartToInlines (PlainRun r) = runToInlines r +parPartToInlines (Insertion _ author date runs) = do + opts <- asks docxOptions + case readerTrackChanges opts of + AcceptChanges -> concatMapM runToInlines runs >>= return + RejectChanges -> return [] + AllChanges -> do + ils <- (concatMapM runToInlines runs) + return [Span + ("", ["insertion"], [("author", author), ("date", date)]) + ils] +parPartToInlines (Deletion _ author date runs) = do + opts <- asks docxOptions + case readerTrackChanges opts of + AcceptChanges -> return [] + RejectChanges -> concatMapM runToInlines runs >>= return + AllChanges -> do + ils <- concatMapM runToInlines runs + return [Span + ("", ["deletion"], [("author", author), ("date", date)]) + ils] +parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = return [] +parPartToInlines (BookMark _ anchor) = + -- We record these, so we can make sure not to overwrite + -- user-defined anchor links with header auto ids. + do + -- Get the anchor map. + anchorMap <- gets docxAnchorMap + -- Check to see if the id is already in there. Rewrite if + -- necessary. This will have the possible effect of rewriting + -- user-defined anchor links. However, since these are not defined + -- in pandoc, it seems like a necessary evil to avoid an extra + -- pass. + let newAnchor = case anchor `elem` (M.elems anchorMap) of + True -> uniqueIdent [Str anchor] (M.elems anchorMap) + False -> anchor + put DState{ docxAnchorMap = M.insert anchor newAnchor anchorMap} + return [Span (anchor, ["anchor"], []) []] +parPartToInlines (Drawing relid) = do + (Docx _ _ _ rels _) <- asks docxDocument + return $ case lookupRelationship relid rels of + Just target -> [Image [] (combine "word" target, "")] + Nothing -> [Image [] ("", "")] +parPartToInlines (InternalHyperLink anchor runs) = do + ils <- concatMapM runToInlines runs + return [Link ils ('#' : anchor, "")] +parPartToInlines (ExternalHyperLink relid runs) = do + (Docx _ _ _ rels _) <- asks docxDocument + rs <- concatMapM runToInlines runs + return $ case lookupRelationship relid rels of + Just target -> + [Link rs (target, "")] + Nothing -> + [Link rs ("", "")] + +isAnchorSpan :: Inline -> Bool +isAnchorSpan (Span (ident, classes, kvs) ils) = + (not . null) ident && + classes == ["anchor"] && + null kvs && + null ils +isAnchorSpan _ = False + +dummyAnchors :: [String] +dummyAnchors = ["_GoBack"] + +makeHeaderAnchor :: Block -> DocxContext Block +-- If there is an anchor already there (an anchor span in the header, +-- to be exact), we rename and associate the new id with the old one. +makeHeaderAnchor (Header n (_, classes, kvs) ils) + | (x : xs) <- filter isAnchorSpan ils + , (Span (ident, _, _) _) <- x + , notElem ident dummyAnchors = + do + hdrIDMap <- gets docxAnchorMap + let newIdent = uniqueIdent ils (M.elems hdrIDMap) + put DState{docxAnchorMap = M.insert ident newIdent hdrIDMap} + return $ Header n (newIdent, classes, kvs) (ils \\ (x:xs)) +-- Otherwise we just give it a name, and register that name (associate +-- it with itself.) +makeHeaderAnchor (Header n (_, classes, kvs) ils) = + do + hdrIDMap <- gets docxAnchorMap + let newIdent = uniqueIdent ils (M.elems hdrIDMap) + put DState{docxAnchorMap = M.insert newIdent newIdent hdrIDMap} + return $ Header n (newIdent, classes, kvs) ils +makeHeaderAnchor blk = return blk + + +parPartsToInlines :: [ParPart] -> DocxContext [Inline] +parPartsToInlines parparts = do + ils <- concatMapM parPartToInlines parparts >>= + -- TODO: Option for self-containted images + (if False then (walkM makeImagesSelfContained) else return) + return $ reduceList $ ils + +cellToBlocks :: Cell -> DocxContext [Block] +cellToBlocks (Cell bps) = concatMapM bodyPartToBlocks bps + +rowToBlocksList :: Row -> DocxContext [[Block]] +rowToBlocksList (Row cells) = mapM cellToBlocks cells + +isBlockCodeContainer :: Container Block -> Bool +isBlockCodeContainer (Container f) | CodeBlock _ _ <- f [] = True +isBlockCodeContainer _ = False + +isHeaderContainer :: Container Block -> Bool +isHeaderContainer (Container f) | Header _ _ _ <- f [] = True +isHeaderContainer _ = False + +bodyPartToBlocks :: BodyPart -> DocxContext [Block] +bodyPartToBlocks (Paragraph pPr parparts) + | any isBlockCodeContainer (parStyleToContainers pPr) = + let + otherConts = filter (not . isBlockCodeContainer) (parStyleToContainers pPr) + in + return $ + rebuild + otherConts + [CodeBlock ("", [], []) (concatMap parPartToString parparts)] +bodyPartToBlocks (Paragraph pPr parparts) + | any isHeaderContainer (parStyleToContainers pPr) = do + ils <- parPartsToInlines parparts >>= (return . normalizeSpaces) + let (Container hdrFun) = head $ filter isHeaderContainer (parStyleToContainers pPr) + Header n attr _ = hdrFun [] + hdr <- makeHeaderAnchor $ Header n attr ils + return [hdr] +bodyPartToBlocks (Paragraph pPr parparts) = do + ils <- parPartsToInlines parparts >>= (return . normalizeSpaces) + case ils of + [] -> return [] + _ -> do + return $ + rebuild + (parStyleToContainers pPr) + [Para ils] +bodyPartToBlocks (ListItem pPr numId lvl parparts) = do + (Docx _ _ numbering _ _) <- asks docxDocument + let + kvs = case lookupLevel numId lvl numbering of + Just (_, fmt, txt, Just start) -> [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + , ("start", (show start)) + ] + + Just (_, fmt, txt, Nothing) -> [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + ] + Nothing -> [] + blks <- bodyPartToBlocks (Paragraph pPr parparts) + return $ [Div ("", ["list-item"], kvs) blks] +bodyPartToBlocks (Tbl _ _ _ []) = + return [Para []] +bodyPartToBlocks (Tbl cap _ look (r:rs)) = do + let caption = strToInlines cap + (hdr, rows) = case firstRowFormatting look of + True -> (Just r, rs) + False -> (Nothing, r:rs) + hdrCells <- case hdr of + Just r' -> rowToBlocksList r' + Nothing -> return [] + + cells <- mapM rowToBlocksList rows + + let size = case null hdrCells of + True -> length $ head cells + False -> length $ hdrCells + -- + -- The two following variables (horizontal column alignment and + -- relative column widths) go to the default at the + -- moment. Width information is in the TblGrid field of the Tbl, + -- so should be possible. Alignment might be more difficult, + -- since there doesn't seem to be a column entity in docx. + alignments = replicate size AlignDefault + widths = replicate size 0 :: [Double] + + return [Table caption alignments widths hdrCells cells] + +-- replace targets with generated anchors. +rewriteLink :: Inline -> DocxContext Inline +rewriteLink l@(Link ils ('#':target, title)) = do + anchorMap <- gets docxAnchorMap + return $ case M.lookup target anchorMap of + Just newTarget -> (Link ils ('#':newTarget, title)) + Nothing -> l +rewriteLink il = return il + +makeImagesSelfContained :: Inline -> DocxContext Inline +makeImagesSelfContained i@(Image alt (uri, title)) = do + (Docx _ _ _ _ media) <- asks docxDocument + return $ case lookup uri media of + Just bs -> + case getMimeType uri of + Just mime -> + let data_uri = "data:" ++ mime ++ ";base64," ++ + toString (encode $ BS.concat $ B.toChunks bs) + in + Image alt (data_uri, title) + Nothing -> i + Nothing -> i +makeImagesSelfContained inline = return inline + +bodyToBlocks :: Body -> DocxContext [Block] +bodyToBlocks (Body bps) = do + blks <- concatMapM bodyPartToBlocks bps >>= + walkM rewriteLink + return $ + blocksToDefinitions $ + blocksToBullets $ blks + +docxToBlocks :: ReaderOptions -> Docx -> [Block] +docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = + let dState = DState { docxAnchorMap = M.empty } + dEnv = DEnv { docxOptions = opts + , docxDocument = d} + in + evalDocxContext (bodyToBlocks body) dEnv dState + +ilToCode :: Inline -> String +ilToCode (Str s) = s +ilToCode Space = " " +ilToCode _ = "" + +isHeaderClass :: String -> Maybe Int +isHeaderClass s | "Heading" `isPrefixOf` s = + case reads (drop (length "Heading") s) :: [(Int, String)] of + [] -> Nothing + ((n, "") : []) -> Just n + _ -> Nothing +isHeaderClass _ = Nothing diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs new file mode 100644 index 000000000..1e37d0076 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Lists.hs @@ -0,0 +1,227 @@ +{- +Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Lists + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Functions for converting flat docx paragraphs into nested lists. +-} + +module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets + , blocksToDefinitions + , listParagraphDivs + ) where + +import Text.Pandoc.JSON +import Text.Pandoc.Generic (bottomUp) +import Text.Pandoc.Shared (trim) +import Control.Monad +import Data.List +import Data.Maybe + +isListItem :: Block -> Bool +isListItem (Div (_, classes, _) _) | "list-item" `elem` classes = True +isListItem _ = False + +getLevel :: Block -> Maybe Integer +getLevel (Div (_, _, kvs) _) = liftM read $ lookup "level" kvs +getLevel _ = Nothing + +getLevelN :: Block -> Integer +getLevelN b = case getLevel b of + Just n -> n + Nothing -> -1 + +getNumId :: Block -> Maybe Integer +getNumId (Div (_, _, kvs) _) = liftM read $ lookup "num-id" kvs +getNumId _ = Nothing + +getNumIdN :: Block -> Integer +getNumIdN b = case getNumId b of + Just n -> n + Nothing -> -1 + +getText :: Block -> Maybe String +getText (Div (_, _, kvs) _) = lookup "text" kvs +getText _ = Nothing + +data ListType = Itemized | Enumerated ListAttributes + +listStyleMap :: [(String, ListNumberStyle)] +listStyleMap = [("upperLetter", UpperAlpha), + ("lowerLetter", LowerAlpha), + ("upperRoman", UpperRoman), + ("lowerRoman", LowerRoman), + ("decimal", Decimal)] + +listDelimMap :: [(String, ListNumberDelim)] +listDelimMap = [("%1)", OneParen), + ("(%1)", TwoParens), + ("%1.", Period)] + +getListType :: Block -> Maybe ListType +getListType b@(Div (_, _, kvs) _) | isListItem b = + let + start = lookup "start" kvs + frmt = lookup "format" kvs + txt = lookup "text" kvs + in + case frmt of + Just "bullet" -> Just Itemized + Just f -> + case txt of + Just t -> Just $ Enumerated ( + read (fromMaybe "1" start) :: Int, + fromMaybe DefaultStyle (lookup f listStyleMap), + fromMaybe DefaultDelim (lookup t listDelimMap)) + Nothing -> Nothing + _ -> Nothing +getListType _ = Nothing + +listParagraphDivs :: [String] +listParagraphDivs = ["ListParagraph"] + +-- This is a first stab at going through and attaching meaning to list +-- paragraphs, without an item marker, following a list item. We +-- assume that these are paragraphs in the same item. + +handleListParagraphs :: [Block] -> [Block] +handleListParagraphs [] = [] +handleListParagraphs ( + (Div attr1@(_, classes1, _) blks1) : + (Div (ident2, classes2, kvs2) blks2) : + blks + ) | "list-item" `elem` classes1 && + not ("list-item" `elem` classes2) && + (not . null) (listParagraphDivs `intersect` classes2) = + -- We don't want to keep this indent. + let newDiv2 = + (Div (ident2, classes2, filter (\kv -> fst kv /= "indent") kvs2) blks2) + in + handleListParagraphs ((Div attr1 (blks1 ++ [newDiv2])) : blks) +handleListParagraphs (blk:blks) = blk : (handleListParagraphs blks) + +separateBlocks' :: Block -> [[Block]] -> [[Block]] +separateBlocks' blk ([] : []) = [[blk]] +separateBlocks' b@(BulletList _) acc = (init acc) ++ [(last acc) ++ [b]] +separateBlocks' b@(OrderedList _ _) acc = (init acc) ++ [(last acc) ++ [b]] +-- The following is for the invisible bullet lists. This is how +-- pandoc-generated ooxml does multiparagraph item lists. +separateBlocks' b acc | liftM trim (getText b) == Just "" = + (init acc) ++ [(last acc) ++ [b]] +separateBlocks' b acc = acc ++ [[b]] + +separateBlocks :: [Block] -> [[Block]] +separateBlocks blks = foldr separateBlocks' [[]] (reverse blks) + +flatToBullets' :: Integer -> [Block] -> [Block] +flatToBullets' _ [] = [] +flatToBullets' num xs@(b : elems) + | getLevelN b == num = b : (flatToBullets' num elems) + | otherwise = + let bNumId = getNumIdN b + bLevel = getLevelN b + (children, remaining) = + span + (\b' -> + ((getLevelN b') > bLevel || + ((getLevelN b') == bLevel && (getNumIdN b') == bNumId))) + xs + in + case getListType b of + Just (Enumerated attr) -> + (OrderedList attr (separateBlocks $ flatToBullets' bLevel children)) : + (flatToBullets' num remaining) + _ -> + (BulletList (separateBlocks $ flatToBullets' bLevel children)) : + (flatToBullets' num remaining) + +flatToBullets :: [Block] -> [Block] +flatToBullets elems = flatToBullets' (-1) elems + +blocksToBullets :: [Block] -> [Block] +blocksToBullets blks = + bottomUp removeListDivs $ + flatToBullets $ (handleListParagraphs blks) + +plainParaInlines :: Block -> [Inline] +plainParaInlines (Plain ils) = ils +plainParaInlines (Para ils) = ils +plainParaInlines _ = [] + +blocksToDefinitions' :: [([Inline], [[Block]])] -> [Block] -> [Block] -> [Block] +blocksToDefinitions' [] acc [] = reverse acc +blocksToDefinitions' defAcc acc [] = + reverse $ (DefinitionList (reverse defAcc)) : acc +blocksToDefinitions' defAcc acc + ((Div (_, classes1, _) blks1) : (Div (ident2, classes2, kvs2) blks2) : blks) + | "DefinitionTerm" `elem` classes1 && "Definition" `elem` classes2 = + let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) + pair = case remainingAttr2 == ("", [], []) of + True -> (concatMap plainParaInlines blks1, [blks2]) + False -> (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]]) + in + blocksToDefinitions' (pair : defAcc) acc blks +blocksToDefinitions' defAcc acc + ((Div (ident2, classes2, kvs2) blks2) : blks) + | (not . null) defAcc && "Definition" `elem` classes2 = + let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) + defItems2 = case remainingAttr2 == ("", [], []) of + True -> blks2 + False -> [Div remainingAttr2 blks2] + ((defTerm, defItems):defs) = defAcc + defAcc' = case null defItems of + True -> (defTerm, [defItems2]) : defs + False -> (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs + in + blocksToDefinitions' defAcc' acc blks +blocksToDefinitions' [] acc (b:blks) = + blocksToDefinitions' [] (b:acc) blks +blocksToDefinitions' defAcc acc (b:blks) = + blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks + +removeListDivs' :: Block -> [Block] +removeListDivs' (Div (ident, classes, kvs) blks) + | "list-item" `elem` classes = + case delete "list-item" classes of + [] -> blks + classes' -> [Div (ident, classes', kvs) $ blks] +removeListDivs' (Div (ident, classes, kvs) blks) + | not $ null $ listParagraphDivs `intersect` classes = + case classes \\ listParagraphDivs of + [] -> blks + classes' -> [Div (ident, classes', kvs) blks] +removeListDivs' blk = [blk] + +removeListDivs :: [Block] -> [Block] +removeListDivs = concatMap removeListDivs' + + + +blocksToDefinitions :: [Block] -> [Block] +blocksToDefinitions = blocksToDefinitions' [] [] + + + + diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs new file mode 100644 index 000000000..07f34450d --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -0,0 +1,596 @@ +{- +Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Parse + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Conversion of docx archive into Docx haskell type +-} + + +module Text.Pandoc.Readers.Docx.Parse ( Docx(..) + , Document(..) + , Body(..) + , BodyPart(..) + , TblLook(..) + , ParPart(..) + , Run(..) + , RunElem(..) + , Notes + , Numbering + , Relationship + , Media + , RunStyle(..) + , ParagraphStyle(..) + , Row(..) + , Cell(..) + , getFootNote + , getEndNote + , lookupLevel + , lookupRelationship + , archiveToDocx + ) where +import Codec.Archive.Zip +import Text.XML.Light +import Data.Maybe +import Data.List +import System.FilePath +import Data.Bits ((.|.)) +import qualified Data.ByteString.Lazy as B +import qualified Text.Pandoc.UTF8 as UTF8 + +attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) +attrToNSPair _ = Nothing + + +type NameSpaces = [(String, String)] + +data Docx = Docx Document Notes Numbering [Relationship] Media + deriving Show + +archiveToDocx :: Archive -> Maybe Docx +archiveToDocx archive = do + let notes = archiveToNotes archive + rels = archiveToRelationships archive + media = archiveToMedia archive + doc <- archiveToDocument archive + numbering <- archiveToNumbering archive + return $ Docx doc notes numbering rels media + +data Document = Document NameSpaces Body + deriving Show + +archiveToDocument :: Archive -> Maybe Document +archiveToDocument zf = do + entry <- findEntryByPath "word/document.xml" zf + docElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + let namespaces = mapMaybe attrToNSPair (elAttribs docElem) + bodyElem <- findChild (QName "body" (lookup "w" namespaces) Nothing) docElem + body <- elemToBody namespaces bodyElem + return $ Document namespaces body + +type Media = [(FilePath, B.ByteString)] + +filePathIsMedia :: FilePath -> Bool +filePathIsMedia fp = + let (dir, _) = splitFileName fp + in + (dir == "word/media/") + +getMediaPair :: Archive -> FilePath -> Maybe (FilePath, B.ByteString) +getMediaPair zf fp = + case findEntryByPath fp zf of + Just e -> Just (fp, fromEntry e) + Nothing -> Nothing + +archiveToMedia :: Archive -> Media +archiveToMedia zf = + mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf)) + +data Numbering = Numbering NameSpaces [Numb] [AbstractNumb] + deriving Show + +data Numb = Numb String String -- right now, only a key to an abstract num + deriving Show + +data AbstractNumb = AbstractNumb String [Level] + deriving Show + +-- (ilvl, format, string, start) +type Level = (String, String, String, Maybe Integer) + +lookupLevel :: String -> String -> Numbering -> Maybe Level +lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do + absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs + lvls <- lookup absNumId $ map (\(AbstractNumb aid ls) -> (aid, ls)) absNumbs + lvl <- lookup ilvl $ map (\l@(i, _, _, _) -> (i, l)) lvls + return lvl + +numElemToNum :: NameSpaces -> Element -> Maybe Numb +numElemToNum ns element | + qName (elName element) == "num" && + qURI (elName element) == (lookup "w" ns) = do + numId <- findAttr (QName "numId" (lookup "w" ns) (Just "w")) element + absNumId <- findChild (QName "abstractNumId" (lookup "w" ns) (Just "w")) element + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + return $ Numb numId absNumId +numElemToNum _ _ = Nothing + +absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb +absNumElemToAbsNum ns element | + qName (elName element) == "abstractNum" && + qURI (elName element) == (lookup "w" ns) = do + absNumId <- findAttr + (QName "abstractNumId" (lookup "w" ns) (Just "w")) + element + let levelElems = findChildren + (QName "lvl" (lookup "w" ns) (Just "w")) + element + levels = mapMaybe (levelElemToLevel ns) levelElems + return $ AbstractNumb absNumId levels +absNumElemToAbsNum _ _ = Nothing + +levelElemToLevel :: NameSpaces -> Element -> Maybe Level +levelElemToLevel ns element | + qName (elName element) == "lvl" && + qURI (elName element) == (lookup "w" ns) = do + ilvl <- findAttr (QName "ilvl" (lookup "w" ns) (Just "w")) element + fmt <- findChild (QName "numFmt" (lookup "w" ns) (Just "w")) element + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + txt <- findChild (QName "lvlText" (lookup "w" ns) (Just "w")) element + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + let start = findChild (QName "start" (lookup "w" ns) (Just "w")) element + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) + return (ilvl, fmt, txt, start) +levelElemToLevel _ _ = Nothing + +archiveToNumbering :: Archive -> Maybe Numbering +archiveToNumbering zf = + case findEntryByPath "word/numbering.xml" zf of + Nothing -> Just $ Numbering [] [] [] + Just entry -> do + numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + let namespaces = mapMaybe attrToNSPair (elAttribs numberingElem) + numElems = findChildren + (QName "num" (lookup "w" namespaces) (Just "w")) + numberingElem + absNumElems = findChildren + (QName "abstractNum" (lookup "w" namespaces) (Just "w")) + numberingElem + nums = mapMaybe (numElemToNum namespaces) numElems + absNums = mapMaybe (absNumElemToAbsNum namespaces) absNumElems + return $ Numbering namespaces nums absNums + +data Notes = Notes NameSpaces (Maybe [(String, [BodyPart])]) (Maybe [(String, [BodyPart])]) + deriving Show + +noteElemToNote :: NameSpaces -> Element -> Maybe (String, [BodyPart]) +noteElemToNote ns element + | qName (elName element) `elem` ["endnote", "footnote"] && + qURI (elName element) == (lookup "w" ns) = + do + noteId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element + let bps = mapMaybe (elemToBodyPart ns) + $ elChildren element + return $ (noteId, bps) +noteElemToNote _ _ = Nothing + +getFootNote :: String -> Notes -> Maybe [BodyPart] +getFootNote s (Notes _ fns _) = fns >>= (lookup s) + +getEndNote :: String -> Notes -> Maybe [BodyPart] +getEndNote s (Notes _ _ ens) = ens >>= (lookup s) + +elemToNotes :: NameSpaces -> String -> Element -> Maybe [(String, [BodyPart])] +elemToNotes ns notetype element + | qName (elName element) == (notetype ++ "s") && + qURI (elName element) == (lookup "w" ns) = + Just $ mapMaybe (noteElemToNote ns) + $ findChildren (QName notetype (lookup "w" ns) (Just "w")) element +elemToNotes _ _ _ = Nothing + +archiveToNotes :: Archive -> Notes +archiveToNotes zf = + let fnElem = findEntryByPath "word/footnotes.xml" zf + >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + enElem = findEntryByPath "word/endnotes.xml" zf + >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + fn_namespaces = case fnElem of + Just e -> mapMaybe attrToNSPair (elAttribs e) + Nothing -> [] + en_namespaces = case enElem of + Just e -> mapMaybe attrToNSPair (elAttribs e) + Nothing -> [] + ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces + fn = fnElem >>= (elemToNotes ns "footnote") + en = enElem >>= (elemToNotes ns "endnote") + in + Notes ns fn en + + +data Relationship = Relationship (RelId, Target) + deriving Show + +lookupRelationship :: RelId -> [Relationship] -> Maybe Target +lookupRelationship relid rels = + lookup relid (map (\(Relationship pair) -> pair) rels) + +filePathIsRel :: FilePath -> Bool +filePathIsRel fp = + let (dir, name) = splitFileName fp + in + (dir == "word/_rels/") && ((takeExtension name) == ".rels") + +relElemToRelationship :: Element -> Maybe Relationship +relElemToRelationship element | qName (elName element) == "Relationship" = + do + relId <- findAttr (QName "Id" Nothing Nothing) element + target <- findAttr (QName "Target" Nothing Nothing) element + return $ Relationship (relId, target) +relElemToRelationship _ = Nothing + + +archiveToRelationships :: Archive -> [Relationship] +archiveToRelationships archive = + let relPaths = filter filePathIsRel (filesInArchive archive) + entries = mapMaybe (\f -> findEntryByPath f archive) relPaths + relElems = mapMaybe (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries + rels = mapMaybe relElemToRelationship $ concatMap elChildren relElems + in + rels + +data Body = Body [BodyPart] + deriving Show + +elemToBody :: NameSpaces -> Element -> Maybe Body +elemToBody ns element | qName (elName element) == "body" && qURI (elName element) == (lookup "w" ns) = + Just $ Body + $ mapMaybe (elemToBodyPart ns) $ elChildren element +elemToBody _ _ = Nothing + +elemToNumInfo :: NameSpaces -> Element -> Maybe (String, String) +elemToNumInfo ns element + | qName (elName element) == "p" && + qURI (elName element) == (lookup "w" ns) = + do + pPr <- findChild (QName "pPr" (lookup "w" ns) (Just "w")) element + numPr <- findChild (QName "numPr" (lookup "w" ns) (Just "w")) pPr + lvl <- findChild (QName "ilvl" (lookup "w" ns) (Just "w")) numPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")) + numId <- findChild (QName "numId" (lookup "w" ns) (Just "w")) numPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")) + return (numId, lvl) +elemToNumInfo _ _ = Nothing + +elemToBodyPart :: NameSpaces -> Element -> Maybe BodyPart +elemToBodyPart ns element + | qName (elName element) == "p" && + qURI (elName element) == (lookup "w" ns) = + let parstyle = elemToParagraphStyle ns element + parparts = mapMaybe (elemToParPart ns) + $ elChildren element + in + case elemToNumInfo ns element of + Just (numId, lvl) -> Just $ ListItem parstyle numId lvl parparts + Nothing -> Just $ Paragraph parstyle parparts + | qName (elName element) == "tbl" && + qURI (elName element) == (lookup "w" ns) = + let + caption = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element + >>= findChild (QName "tblCaption" (lookup "w" ns) (Just "w")) + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + grid = case + findChild (QName "tblGrid" (lookup "w" ns) (Just "w")) element + of + Just g -> elemToTblGrid ns g + Nothing -> [] + tblLook = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element + >>= findChild (QName "tblLook" (lookup "w" ns) (Just "w")) + >>= elemToTblLook ns + in + Just $ Tbl + (fromMaybe "" caption) + grid + (fromMaybe defaultTblLook tblLook) + (mapMaybe (elemToRow ns) (elChildren element)) + | otherwise = Nothing + +elemToTblLook :: NameSpaces -> Element -> Maybe TblLook +elemToTblLook ns element + | qName (elName element) == "tblLook" && + qURI (elName element) == (lookup "w" ns) = + let firstRow = findAttr (QName "firstRow" (lookup "w" ns) (Just "w")) element + val = findAttr (QName "val" (lookup "w" ns) (Just "w")) element + firstRowFmt = + case firstRow of + Just "1" -> True + Just _ -> False + Nothing -> case val of + Just bitMask -> testBitMask bitMask 0x020 + Nothing -> False + in + Just $ TblLook{firstRowFormatting = firstRowFmt} +elemToTblLook _ _ = Nothing + +testBitMask :: String -> Int -> Bool +testBitMask bitMaskS n = + case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of + [] -> False + ((n', _) : _) -> ((n' .|. n) /= 0) + +data ParagraphStyle = ParagraphStyle { pStyle :: [String] + , indent :: Maybe Integer + } + deriving Show + +defaultParagraphStyle :: ParagraphStyle +defaultParagraphStyle = ParagraphStyle { pStyle = [] + , indent = Nothing + } + +elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle +elemToParagraphStyle ns element = + case findChild (QName "pPr" (lookup "w" ns) (Just "w")) element of + Just pPr -> + ParagraphStyle + {pStyle = + mapMaybe + (findAttr (QName "val" (lookup "w" ns) (Just "w"))) + (findChildren (QName "pStyle" (lookup "w" ns) (Just "w")) pPr) + , indent = + findChild (QName "ind" (lookup "w" ns) (Just "w")) pPr >>= + findAttr (QName "left" (lookup "w" ns) (Just "w")) >>= + stringToInteger + } + Nothing -> defaultParagraphStyle + + +data BodyPart = Paragraph ParagraphStyle [ParPart] + | ListItem ParagraphStyle String String [ParPart] + | Tbl String TblGrid TblLook [Row] + + deriving Show + +type TblGrid = [Integer] + +data TblLook = TblLook {firstRowFormatting::Bool} + deriving Show + +defaultTblLook :: TblLook +defaultTblLook = TblLook{firstRowFormatting = False} + +stringToInteger :: String -> Maybe Integer +stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) + +elemToTblGrid :: NameSpaces -> Element -> TblGrid +elemToTblGrid ns element + | qName (elName element) == "tblGrid" && + qURI (elName element) == (lookup "w" ns) = + let + cols = findChildren (QName "gridCol" (lookup "w" ns) (Just "w")) element + in + mapMaybe (\e -> + findAttr (QName "val" (lookup "w" ns) (Just ("w"))) e + >>= stringToInteger + ) + cols +elemToTblGrid _ _ = [] + +data Row = Row [Cell] + deriving Show + + +elemToRow :: NameSpaces -> Element -> Maybe Row +elemToRow ns element + | qName (elName element) == "tr" && + qURI (elName element) == (lookup "w" ns) = + let + cells = findChildren (QName "tc" (lookup "w" ns) (Just "w")) element + in + Just $ Row (mapMaybe (elemToCell ns) cells) +elemToRow _ _ = Nothing + +data Cell = Cell [BodyPart] + deriving Show + +elemToCell :: NameSpaces -> Element -> Maybe Cell +elemToCell ns element + | qName (elName element) == "tc" && + qURI (elName element) == (lookup "w" ns) = + Just $ Cell (mapMaybe (elemToBodyPart ns) (elChildren element)) +elemToCell _ _ = Nothing + +data ParPart = PlainRun Run + | Insertion ChangeId Author ChangeDate [Run] + | Deletion ChangeId Author ChangeDate [Run] + | BookMark BookMarkId Anchor + | InternalHyperLink Anchor [Run] + | ExternalHyperLink RelId [Run] + | Drawing String + deriving Show + +data Run = Run RunStyle [RunElem] + | Footnote String + | Endnote String + deriving Show + +data RunElem = TextRun String | LnBrk | Tab + deriving Show + +data RunStyle = RunStyle { isBold :: Bool + , isItalic :: Bool + , isSmallCaps :: Bool + , isStrike :: Bool + , isSuperScript :: Bool + , isSubScript :: Bool + , underline :: Maybe String + , rStyle :: Maybe String } + deriving Show + +defaultRunStyle :: RunStyle +defaultRunStyle = RunStyle { isBold = False + , isItalic = False + , isSmallCaps = False + , isStrike = False + , isSuperScript = False + , isSubScript = False + , underline = Nothing + , rStyle = Nothing + } + +elemToRunStyle :: NameSpaces -> Element -> RunStyle +elemToRunStyle ns element = + case findChild (QName "rPr" (lookup "w" ns) (Just "w")) element of + Just rPr -> + RunStyle + { + isBold = isJust $ findChild (QName "b" (lookup "w" ns) (Just "w")) rPr + , isItalic = isJust $ findChild (QName "i" (lookup "w" ns) (Just "w")) rPr + , isSmallCaps = isJust $ findChild (QName "smallCaps" (lookup "w" ns) (Just "w")) rPr + , isStrike = isJust $ findChild (QName "strike" (lookup "w" ns) (Just "w")) rPr + , isSuperScript = + (Just "superscript" == + (findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")))) + , isSubScript = + (Just "subscript" == + (findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")))) + , underline = + findChild (QName "u" (lookup "w" ns) (Just "w")) rPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")) + , rStyle = + findChild (QName "rStyle" (lookup "w" ns) (Just "w")) rPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")) + } + Nothing -> defaultRunStyle + +elemToRun :: NameSpaces -> Element -> Maybe Run +elemToRun ns element + | qName (elName element) == "r" && + qURI (elName element) == (lookup "w" ns) = + case + findChild (QName "footnoteReference" (lookup "w" ns) (Just "w")) element >>= + findAttr (QName "id" (lookup "w" ns) (Just "w")) + of + Just s -> Just $ Footnote s + Nothing -> + case + findChild (QName "endnoteReference" (lookup "w" ns) (Just "w")) element >>= + findAttr (QName "id" (lookup "w" ns) (Just "w")) + of + Just s -> Just $ Endnote s + Nothing -> Just $ + Run (elemToRunStyle ns element) + (elemToRunElems ns element) +elemToRun _ _ = Nothing + +elemToRunElem :: NameSpaces -> Element -> Maybe RunElem +elemToRunElem ns element + | (qName (elName element) == "t" || qName (elName element) == "delText") && + qURI (elName element) == (lookup "w" ns) = + Just $ TextRun (strContent element) + | qName (elName element) == "br" && + qURI (elName element) == (lookup "w" ns) = + Just $ LnBrk + | qName (elName element) == "tab" && + qURI (elName element) == (lookup "w" ns) = + Just $ Tab + | otherwise = Nothing + + +elemToRunElems :: NameSpaces -> Element -> [RunElem] +elemToRunElems ns element + | qName (elName element) == "r" && + qURI (elName element) == (lookup "w" ns) = + mapMaybe (elemToRunElem ns) (elChildren element) + | otherwise = [] + +elemToDrawing :: NameSpaces -> Element -> Maybe ParPart +elemToDrawing ns element + | qName (elName element) == "drawing" && + qURI (elName element) == (lookup "w" ns) = + let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" + in + findElement (QName "blip" (Just a_ns) (Just "a")) element + >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) + >>= (\s -> Just $ Drawing s) +elemToDrawing _ _ = Nothing + + +elemToParPart :: NameSpaces -> Element -> Maybe ParPart +elemToParPart ns element + | qName (elName element) == "r" && + qURI (elName element) == (lookup "w" ns) = + case findChild (QName "drawing" (lookup "w" ns) (Just "w")) element of + Just drawingElem -> elemToDrawing ns drawingElem + Nothing -> do + r <- elemToRun ns element + return $ PlainRun r +elemToParPart ns element + | qName (elName element) == "ins" && + qURI (elName element) == (lookup "w" ns) = do + cId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element + cAuthor <- findAttr (QName "author" (lookup "w" ns) (Just "w")) element + cDate <- findAttr (QName "date" (lookup "w" ns) (Just "w")) element + let runs = mapMaybe (elemToRun ns) (elChildren element) + return $ Insertion cId cAuthor cDate runs +elemToParPart ns element + | qName (elName element) == "del" && + qURI (elName element) == (lookup "w" ns) = do + cId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element + cAuthor <- findAttr (QName "author" (lookup "w" ns) (Just "w")) element + cDate <- findAttr (QName "date" (lookup "w" ns) (Just "w")) element + let runs = mapMaybe (elemToRun ns) (elChildren element) + return $ Deletion cId cAuthor cDate runs +elemToParPart ns element + | qName (elName element) == "bookmarkStart" && + qURI (elName element) == (lookup "w" ns) = do + bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element + bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) element + return $ BookMark bmId bmName +elemToParPart ns element + | qName (elName element) == "hyperlink" && + qURI (elName element) == (lookup "w" ns) = + let runs = mapMaybe (elemToRun ns) + $ findChildren (QName "r" (lookup "w" ns) (Just "w")) element + in + case findAttr (QName "anchor" (lookup "w" ns) (Just "w")) element of + Just anchor -> + Just $ InternalHyperLink anchor runs + Nothing -> + case findAttr (QName "id" (lookup "r" ns) (Just "r")) element of + Just relId -> Just $ ExternalHyperLink relId runs + Nothing -> Nothing +elemToParPart _ _ = Nothing + +type Target = String +type Anchor = String +type BookMarkId = String +type RelId = String +type ChangeId = String +type Author = String +type ChangeDate = String diff --git a/src/Text/Pandoc/Readers/Docx/Reducible.hs b/src/Text/Pandoc/Readers/Docx/Reducible.hs new file mode 100644 index 000000000..8c105d1f1 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Reducible.hs @@ -0,0 +1,181 @@ +{-# LANGUAGE OverloadedStrings #-} + +{- +Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Reducible + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Typeclass for combining adjacent blocks and inlines correctly. +-} + + +module Text.Pandoc.Readers.Docx.Reducible ((<++>), + (<+++>), + Reducible, + Container(..), + container, + innards, + reduceList, + reduceListB, + rebuild) + where + +import Text.Pandoc.Builder +import Data.List ((\\), intersect) + +data Container a = Container ([a] -> a) | NullContainer + +instance (Eq a) => Eq (Container a) where + (Container x) == (Container y) = ((x []) == (y [])) + NullContainer == NullContainer = True + _ == _ = False + +instance (Show a) => Show (Container a) where + show (Container x) = "Container {" ++ + (reverse $ drop 3 $ reverse $ show $ x []) ++ + "}" + show (NullContainer) = "NullContainer" + +class Reducible a where + (<++>) :: a -> a -> [a] + container :: a -> Container a + innards :: a -> [a] + isSpace :: a -> Bool + +(<+++>) :: (Reducible a) => Many a -> Many a -> Many a +mr <+++> ms = fromList $ reduceList $ toList mr ++ toList ms + +reduceListB :: (Reducible a) => Many a -> Many a +reduceListB = fromList . reduceList . toList + +reduceList' :: (Reducible a) => [a] -> [a] -> [a] +reduceList' acc [] = acc +reduceList' [] (x:xs) = reduceList' [x] xs +reduceList' as (x:xs) = reduceList' (init as ++ (last as <++> x) ) xs + +reduceList :: (Reducible a) => [a] -> [a] +reduceList = reduceList' [] + +combineReducibles :: (Reducible a, Eq a) => a -> a -> [a] +combineReducibles r s = + let (conts, rs) = topLevelContainers r + (conts', ss) = topLevelContainers s + shared = conts `intersect` conts' + remaining = conts \\ shared + remaining' = conts' \\ shared + in + case null shared of + True -> case (not . null) rs && isSpace (last rs) of + True -> rebuild conts (init rs) ++ [last rs, s] + False -> [r,s] + False -> rebuild + shared $ + reduceList $ + (rebuild remaining rs) ++ (rebuild remaining' ss) + +instance Reducible Inline where + s1@(Span (id1, classes1, kvs1) ils1) <++> s2@(Span (id2, classes2, kvs2) ils2) = + let classes' = classes1 `intersect` classes2 + kvs' = kvs1 `intersect` kvs2 + classes1' = classes1 \\ classes' + kvs1' = kvs1 \\ kvs' + classes2' = classes2 \\ classes' + kvs2' = kvs2 \\ kvs' + in + case null classes' && null kvs' of + True -> [s1,s2] + False -> let attr' = ("", classes', kvs') + attr1' = (id1, classes1', kvs1') + attr2' = (id2, classes2', kvs2') + s1' = case null classes1' && null kvs1' of + True -> ils1 + False -> [Span attr1' ils1] + s2' = case null classes2' && null kvs2' of + True -> ils2 + False -> [Span attr2' ils2] + in + [Span attr' $ reduceList $ s1' ++ s2'] + + (Str x) <++> (Str y) = [Str (x++y)] + il <++> il' = combineReducibles il il' + + container (Emph _) = Container Emph + container (Strong _) = Container Strong + container (Strikeout _) = Container Strikeout + container (Subscript _) = Container Subscript + container (Superscript _) = Container Superscript + container (Quoted qt _) = Container $ Quoted qt + container (Cite cs _) = Container $ Cite cs + container (Span attr _) = Container $ Span attr + container _ = NullContainer + + innards (Emph ils) = ils + innards (Strong ils) = ils + innards (Strikeout ils) = ils + innards (Subscript ils) = ils + innards (Superscript ils) = ils + innards (Quoted _ ils) = ils + innards (Cite _ ils) = ils + innards (Span _ ils) = ils + innards _ = [] + + isSpace Space = True + isSpace _ = False + +instance Reducible Block where + (Div (ident, classes, kvs) blks) <++> blk | "list-item" `elem` classes = + [Div (ident, classes, kvs) (reduceList blks), blk] + + blk <++> blk' = combineReducibles blk blk' + + container (BlockQuote _) = Container BlockQuote + container (Div attr _) = Container $ Div attr + container _ = NullContainer + + innards (BlockQuote bs) = bs + innards (Div _ bs) = bs + innards _ = [] + + isSpace _ = False + + +topLevelContainers' :: (Reducible a) => [a] -> ([Container a], [a]) +topLevelContainers' (r : []) = case container r of + NullContainer -> ([], [r]) + _ -> + let (conts, inns) = topLevelContainers' (innards r) + in + ((container r) : conts, inns) +topLevelContainers' rs = ([], rs) + +topLevelContainers :: (Reducible a) => a -> ([Container a], [a]) +topLevelContainers il = topLevelContainers' [il] + +rebuild :: [Container a] -> [a] -> [a] +rebuild [] xs = xs +rebuild ((Container f) : cs) xs = rebuild cs $ [f xs] +rebuild (NullContainer : cs) xs = rebuild cs $ xs + + diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index f6657a4d1..552e8a251 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.HTML - Copyright : Copyright (C) 2006-2010 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -40,6 +40,7 @@ import Text.HTML.TagSoup import Text.HTML.TagSoup.Match import Text.Pandoc.Definition import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder (Blocks, Inlines, trimInlines) import Text.Pandoc.Shared import Text.Pandoc.Options import Text.Pandoc.Parsing @@ -47,7 +48,10 @@ import Data.Maybe ( fromMaybe, isJust ) import Data.List ( intercalate ) import Data.Char ( isDigit ) import Control.Monad ( liftM, guard, when, mzero ) -import Control.Applicative ( (<$>), (<$) ) +import Control.Applicative ( (<$>), (<$), (<*) ) +import Data.Monoid +import Text.Printf (printf) +import Debug.Trace (trace) isSpace :: Char -> Bool isSpace ' ' = True @@ -66,39 +70,56 @@ readHtml opts inp = where tags = canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } inp parseDoc = do - blocks <- (fixPlains False . concat) <$> manyTill block eof + blocks <- (fixPlains False) . mconcat <$> manyTill block eof meta <- stateMeta <$> getState - return $ Pandoc meta blocks + return $ Pandoc meta (B.toList blocks) type TagParser = Parser [Tag String] ParserState -pBody :: TagParser [Block] +pBody :: TagParser Blocks pBody = pInTags "body" block -pHead :: TagParser [Block] -pHead = pInTags "head" $ pTitle <|> ([] <$ pAnyTag) - where pTitle = pInTags "title" inline >>= setTitle . normalizeSpaces - setTitle t = [] <$ (updateState $ B.setMeta "title" (B.fromList t)) - -block :: TagParser [Block] -block = choice +pHead :: TagParser Blocks +pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag) + where pTitle = pInTags "title" inline >>= setTitle . trimInlines + setTitle t = mempty <$ (updateState $ B.setMeta "title" t) + pMetaTag = do + mt <- pSatisfy (~== TagOpen "meta" []) + let name = fromAttrib "name" mt + if null name + then return mempty + else do + let content = fromAttrib "content" mt + updateState $ B.setMeta name (B.text content) + return mempty + +block :: TagParser Blocks +block = do + tr <- getOption readerTrace + pos <- getPosition + res <- choice [ pPara , pHeader , pBlockQuote , pCodeBlock , pList , pHrule - , pSimpleTable + , pTable , pHead , pBody , pPlain + , pDiv , pRawHtmlBlock ] + when tr $ trace (printf "line %d: %s" (sourceLine pos) + (take 60 $ show $ B.toList res)) (return ()) + return res -pList :: TagParser [Block] + +pList :: TagParser Blocks pList = pBulletList <|> pOrderedList <|> pDefinitionList -pBulletList :: TagParser [Block] +pBulletList :: TagParser Blocks pBulletList = try $ do pSatisfy (~== TagOpen "ul" []) let nonItem = pSatisfy (\t -> @@ -108,9 +129,9 @@ pBulletList = try $ do -- treat it as a list item, though it's not valid xhtml... skipMany nonItem items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul") - return [BulletList $ map (fixPlains True) items] + return $ B.bulletList $ map (fixPlains True) items -pOrderedList :: TagParser [Block] +pOrderedList :: TagParser Blocks pOrderedList = try $ do TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" []) let (start, style) = (sta', sty') @@ -136,27 +157,27 @@ pOrderedList = try $ do -- treat it as a list item, though it's not valid xhtml... skipMany nonItem items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol") - return [OrderedList (start, style, DefaultDelim) $ map (fixPlains True) items] + return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items -pDefinitionList :: TagParser [Block] +pDefinitionList :: TagParser Blocks pDefinitionList = try $ do pSatisfy (~== TagOpen "dl" []) items <- manyTill pDefListItem (pCloses "dl") - return [DefinitionList items] + return $ B.definitionList items -pDefListItem :: TagParser ([Inline],[[Block]]) +pDefListItem :: TagParser (Inlines, [Blocks]) pDefListItem = try $ do let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) && not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl")) terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline) defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block) skipMany nonItem - let term = intercalate [LineBreak] terms + let term = foldl1 (\x y -> x <> B.linebreak <> y) terms return (term, map (fixPlains True) defs) -fixPlains :: Bool -> [Block] -> [Block] -fixPlains inList bs = if any isParaish bs - then map plainToPara bs +fixPlains :: Bool -> Blocks -> Blocks +fixPlains inList bs = if any isParaish bs' + then B.fromList $ map plainToPara bs' else bs where isParaish (Para _) = True isParaish (CodeBlock _ _) = True @@ -168,6 +189,7 @@ fixPlains inList bs = if any isParaish bs isParaish _ = False plainToPara (Plain xs) = Para xs plainToPara x = x + bs' = B.toList bs pRawTag :: TagParser String pRawTag = do @@ -177,13 +199,20 @@ pRawTag = do then return [] else return $ renderTags' [tag] -pRawHtmlBlock :: TagParser [Block] +pDiv :: TagParser Blocks +pDiv = try $ do + getOption readerParseRaw >>= guard + TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="div") (const True) + contents <- pInTags "div" block + return $ B.divWith (mkAttr attr) contents + +pRawHtmlBlock :: TagParser Blocks pRawHtmlBlock = do raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag parseRaw <- getOption readerParseRaw if parseRaw && not (null raw) - then return [RawBlock "html" raw] - else return [] + then return $ B.rawBlock "html" raw + else return mempty pHtmlBlock :: String -> TagParser String pHtmlBlock t = try $ do @@ -191,70 +220,96 @@ pHtmlBlock t = try $ do contents <- manyTill pAnyTag (pSatisfy (~== TagClose t)) return $ renderTags' $ [open] ++ contents ++ [TagClose t] -pHeader :: TagParser [Block] +pHeader :: TagParser Blocks pHeader = try $ do TagOpen tagtype attr <- pSatisfy $ tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) (const True) let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")] let level = read (drop 1 tagtype) - contents <- liftM concat $ manyTill inline (pCloses tagtype <|> eof) - let ident = maybe "" id $ lookup "id" attr + contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) + let ident = fromMaybe "" $ lookup "id" attr let classes = maybe [] words $ lookup "class" attr let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"] return $ if bodyTitle - then [] -- skip a representation of the title in the body - else [Header level (ident, classes, keyvals) $ - normalizeSpaces contents] + then mempty -- skip a representation of the title in the body + else B.headerWith (ident, classes, keyvals) level contents -pHrule :: TagParser [Block] +pHrule :: TagParser Blocks pHrule = do pSelfClosing (=="hr") (const True) - return [HorizontalRule] + return B.horizontalRule -pSimpleTable :: TagParser [Block] -pSimpleTable = try $ do +pTable :: TagParser Blocks +pTable = try $ do TagOpen _ _ <- pSatisfy (~== TagOpen "table" []) skipMany pBlank - caption <- option [] $ pInTags "caption" inline >>~ skipMany pBlank - skipMany $ (pInTags "col" block >> skipMany pBlank) <|> - (pInTags "colgroup" block >> skipMany pBlank) + caption <- option mempty $ pInTags "caption" inline >>~ skipMany pBlank + -- TODO actually read these and take width information from them + widths' <- pColgroup <|> many pCol head' <- option [] $ pOptInTag "thead" $ pInTags "tr" (pCell "th") skipMany pBlank rows <- pOptInTag "tbody" $ many1 $ try $ skipMany pBlank >> pInTags "tr" (pCell "td") skipMany pBlank TagClose _ <- pSatisfy (~== TagClose "table") - let cols = maximum $ map length rows - let aligns = replicate cols AlignLeft - let widths = replicate cols 0 - return [Table caption aligns widths head' rows] + let isSinglePlain x = case B.toList x of + [Plain _] -> True + _ -> False + let isSimple = all isSinglePlain $ concat (head':rows) + let cols = length $ if null head' then head rows else head' + -- fail if there are colspans or rowspans + guard $ all (\r -> length r == cols) rows + let aligns = replicate cols AlignDefault + let widths = if null widths' + then if isSimple + then replicate cols 0 + else replicate cols (1.0 / fromIntegral cols) + else widths' + return $ B.table caption (zip aligns widths) head' rows + +pCol :: TagParser Double +pCol = try $ do + TagOpen _ attribs <- pSatisfy (~== TagOpen "col" []) + skipMany pBlank + optional $ pSatisfy (~== TagClose "col") + skipMany pBlank + return $ case lookup "width" attribs of + Just x | not (null x) && last x == '%' -> + fromMaybe 0.0 $ safeRead ('0':'.':init x) + _ -> 0.0 + +pColgroup :: TagParser [Double] +pColgroup = try $ do + pSatisfy (~== TagOpen "colgroup" []) + skipMany pBlank + manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank -pCell :: String -> TagParser [TableCell] +pCell :: String -> TagParser [Blocks] pCell celltype = try $ do skipMany pBlank - res <- pInTags celltype pPlain + res <- pInTags celltype block skipMany pBlank return [res] -pBlockQuote :: TagParser [Block] +pBlockQuote :: TagParser Blocks pBlockQuote = do contents <- pInTags "blockquote" block - return [BlockQuote $ fixPlains False contents] + return $ B.blockQuote $ fixPlains False contents -pPlain :: TagParser [Block] +pPlain :: TagParser Blocks pPlain = do - contents <- liftM (normalizeSpaces . concat) $ many1 inline - if null contents - then return [] - else return [Plain contents] + contents <- trimInlines . mconcat <$> many1 inline + if B.isNull contents + then return mempty + else return $ B.plain contents -pPara :: TagParser [Block] +pPara :: TagParser Blocks pPara = do - contents <- pInTags "p" inline - return [Para $ normalizeSpaces contents] + contents <- trimInlines <$> pInTags "p" inline + return $ B.para contents -pCodeBlock :: TagParser [Block] +pCodeBlock :: TagParser Blocks pCodeBlock = try $ do TagOpen _ attr <- pSatisfy (~== TagOpen "pre" []) contents <- manyTill pAnyTag (pCloses "pre" <|> eof) @@ -267,13 +322,9 @@ pCodeBlock = try $ do let result = case reverse result' of '\n':_ -> init result' _ -> result' - let attribsId = fromMaybe "" $ lookup "id" attr - let attribsClasses = words $ fromMaybe "" $ lookup "class" attr - let attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr - let attribs = (attribsId, attribsClasses, attribsKV) - return [CodeBlock attribs result] + return $ B.codeBlockWith (mkAttr attr) result -inline :: TagParser [Inline] +inline :: TagParser Inlines inline = choice [ pTagText , pQ @@ -286,6 +337,7 @@ inline = choice , pLink , pImage , pCode + , pSpan , pRawHtmlInline ] @@ -312,7 +364,7 @@ pSelfClosing f g = do optional $ pSatisfy (tagClose f) return open -pQ :: TagParser [Inline] +pQ :: TagParser Inlines pQ = do quoteContext <- stateQuoteContext `fmap` getState let quoteType = case quoteContext of @@ -321,79 +373,84 @@ pQ = do let innerQuoteContext = if quoteType == SingleQuote then InSingleQuote else InDoubleQuote - withQuoteContext innerQuoteContext $ pInlinesInTags "q" (Quoted quoteType) + let constructor = case quoteType of + SingleQuote -> B.singleQuoted + DoubleQuote -> B.doubleQuoted + withQuoteContext innerQuoteContext $ + pInlinesInTags "q" constructor -pEmph :: TagParser [Inline] -pEmph = pInlinesInTags "em" Emph <|> pInlinesInTags "i" Emph +pEmph :: TagParser Inlines +pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph -pStrong :: TagParser [Inline] -pStrong = pInlinesInTags "strong" Strong <|> pInlinesInTags "b" Strong +pStrong :: TagParser Inlines +pStrong = pInlinesInTags "strong" B.strong <|> pInlinesInTags "b" B.strong -pSuperscript :: TagParser [Inline] -pSuperscript = pInlinesInTags "sup" Superscript +pSuperscript :: TagParser Inlines +pSuperscript = pInlinesInTags "sup" B.superscript -pSubscript :: TagParser [Inline] -pSubscript = pInlinesInTags "sub" Subscript +pSubscript :: TagParser Inlines +pSubscript = pInlinesInTags "sub" B.subscript -pStrikeout :: TagParser [Inline] +pStrikeout :: TagParser Inlines pStrikeout = do - pInlinesInTags "s" Strikeout <|> - pInlinesInTags "strike" Strikeout <|> - pInlinesInTags "del" Strikeout <|> + pInlinesInTags "s" B.strikeout <|> + pInlinesInTags "strike" B.strikeout <|> + pInlinesInTags "del" B.strikeout <|> try (do pSatisfy (~== TagOpen "span" [("class","strikeout")]) - contents <- liftM concat $ manyTill inline (pCloses "span") - return [Strikeout contents]) + contents <- mconcat <$> manyTill inline (pCloses "span") + return $ B.strikeout contents) -pLineBreak :: TagParser [Inline] +pLineBreak :: TagParser Inlines pLineBreak = do pSelfClosing (=="br") (const True) - return [LineBreak] + return B.linebreak -pLink :: TagParser [Inline] +pLink :: TagParser Inlines pLink = try $ do tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href")) let url = fromAttrib "href" tag let title = fromAttrib "title" tag - lab <- liftM concat $ manyTill inline (pCloses "a") - return [Link (normalizeSpaces lab) (escapeURI url, title)] + lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a") + return $ B.link (escapeURI url) title lab -pImage :: TagParser [Inline] +pImage :: TagParser Inlines pImage = do tag <- pSelfClosing (=="img") (isJust . lookup "src") let url = fromAttrib "src" tag let title = fromAttrib "title" tag let alt = fromAttrib "alt" tag - return [Image (B.toList $ B.text alt) (escapeURI url, title)] + return $ B.image (escapeURI url) title (B.text alt) -pCode :: TagParser [Inline] +pCode :: TagParser Inlines pCode = try $ do (TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) result <- manyTill pAnyTag (pCloses open) - let ident = fromMaybe "" $ lookup "id" attr - let classes = words $ fromMaybe [] $ lookup "class" attr - let rest = filter (\(x,_) -> x /= "id" && x /= "class") attr - return [Code (ident,classes,rest) - $ intercalate " " $ lines $ innerText result] + return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ innerText result -pRawHtmlInline :: TagParser [Inline] +pSpan :: TagParser Inlines +pSpan = try $ do + getOption readerParseRaw >>= guard + TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) + contents <- pInTags "span" inline + return $ B.spanWith (mkAttr attr) contents + +pRawHtmlInline :: TagParser Inlines pRawHtmlInline = do result <- pSatisfy (tagComment (const True)) <|> pSatisfy isInlineTag parseRaw <- getOption readerParseRaw if parseRaw - then return [RawInline "html" $ renderTags' [result]] - else return [] + then return $ B.rawInline "html" $ renderTags' [result] + else return mempty -pInlinesInTags :: String -> ([Inline] -> Inline) - -> TagParser [Inline] -pInlinesInTags tagtype f = do - contents <- pInTags tagtype inline - return [f $ normalizeSpaces contents] +pInlinesInTags :: String -> (Inlines -> Inlines) + -> TagParser Inlines +pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline -pInTags :: String -> TagParser [a] - -> TagParser [a] +pInTags :: (Monoid a) => String -> TagParser a + -> TagParser a pInTags tagtype parser = try $ do pSatisfy (~== TagOpen tagtype []) - liftM concat $ manyTill parser (pCloses tagtype <|> eof) + mconcat <$> manyTill parser (pCloses tagtype <|> eof) pOptInTag :: String -> TagParser a -> TagParser a @@ -409,56 +466,65 @@ pCloses :: String -> TagParser () pCloses tagtype = try $ do t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag case t of - (TagClose t') | t' == tagtype -> pAnyTag >> return () + (TagClose t') | t' == tagtype -> pAnyTag >> return () (TagOpen t' _) | t' `closes` tagtype -> return () (TagClose "ul") | tagtype == "li" -> return () (TagClose "ol") | tagtype == "li" -> return () (TagClose "dl") | tagtype == "li" -> return () + (TagClose "table") | tagtype == "td" -> return () + (TagClose "table") | tagtype == "tr" -> return () _ -> mzero -pTagText :: TagParser [Inline] +pTagText :: TagParser Inlines pTagText = try $ do (TagText str) <- pSatisfy isTagText st <- getState case runParser (many pTagContents) st "text" str of Left _ -> fail $ "Could not parse `" ++ str ++ "'" - Right result -> return result + Right result -> return $ mconcat result pBlank :: TagParser () pBlank = try $ do (TagText str) <- pSatisfy isTagText guard $ all isSpace str -pTagContents :: Parser [Char] ParserState Inline +pTagContents :: Parser [Char] ParserState Inlines pTagContents = - pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad - -pStr :: Parser [Char] ParserState Inline + B.displayMath <$> mathDisplay + <|> B.math <$> mathInline + <|> pStr + <|> pSpace + <|> smartPunctuation pTagContents + <|> pSymbol + <|> pBad + +pStr :: Parser [Char] ParserState Inlines pStr = do result <- many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) && not (isBad c) pos <- getPosition updateState $ \s -> s{ stateLastStrPos = Just pos } - return $ Str result + return $ B.str result isSpecial :: Char -> Bool isSpecial '"' = True isSpecial '\'' = True isSpecial '.' = True isSpecial '-' = True +isSpecial '$' = True isSpecial '\8216' = True isSpecial '\8217' = True isSpecial '\8220' = True isSpecial '\8221' = True isSpecial _ = False -pSymbol :: Parser [Char] ParserState Inline -pSymbol = satisfy isSpecial >>= return . Str . (:[]) +pSymbol :: Parser [Char] ParserState Inlines +pSymbol = satisfy isSpecial >>= return . B.str . (:[]) isBad :: Char -> Bool isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML -pBad :: Parser [Char] ParserState Inline +pBad :: Parser [Char] ParserState Inlines pBad = do c <- satisfy isBad let c' = case c of @@ -490,10 +556,10 @@ pBad = do '\158' -> '\382' '\159' -> '\376' _ -> '?' - return $ Str [c'] + return $ B.str [c'] -pSpace :: Parser [Char] ParserState Inline -pSpace = many1 (satisfy isSpace) >> return Space +pSpace :: Parser [Char] ParserState Inlines +pSpace = many1 (satisfy isSpace) >> return B.space -- -- Constants @@ -521,7 +587,7 @@ blockHtmlTags = ["address", "article", "aside", "blockquote", "body", "button", "noframes", "noscript", "object", "ol", "output", "p", "pre", "progress", "section", "table", "tbody", "textarea", "thead", "tfoot", "ul", "dd", "dt", "frameset", "li", "tbody", "td", "tfoot", - "th", "thead", "tr", "script", "style", "video"] + "th", "thead", "tr", "script", "style", "svg", "video"] -- We want to allow raw docbook in markdown documents, so we -- include docbook block tags here too. @@ -543,15 +609,19 @@ blockTags :: [String] blockTags = blockHtmlTags ++ blockDocBookTags isInlineTag :: Tag String -> Bool -isInlineTag t = tagOpen (`notElem` blockTags) (const True) t || - tagClose (`notElem` blockTags) t || +isInlineTag t = tagOpen isInlineTagName (const True) t || + tagClose isInlineTagName t || tagComment (const True) t + where isInlineTagName x = x `notElem` blockTags isBlockTag :: Tag String -> Bool -isBlockTag t = tagOpen (`elem` blocktags) (const True) t || - tagClose (`elem` blocktags) t || +isBlockTag t = tagOpen isBlockTagName (const True) t || + tagClose isBlockTagName t || tagComment (const True) t - where blocktags = blockTags ++ eitherBlockOrInline + where isBlockTagName ('?':_) = True + isBlockTagName ('!':_) = True + isBlockTagName x = x `elem` blockTags + || x `elem` eitherBlockOrInline isTextTag :: Tag String -> Bool isTextTag = tagText (const True) @@ -560,7 +630,7 @@ isCommentTag :: Tag String -> Bool isCommentTag = tagComment (const True) -- taken from HXT and extended - +-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags closes :: String -> String -> Bool _ `closes` "body" = False _ `closes` "html" = False @@ -568,11 +638,18 @@ _ `closes` "html" = False "li" `closes` "li" = True "th" `closes` t | t `elem` ["th","td"] = True "tr" `closes` t | t `elem` ["th","td","tr"] = True +"dd" `closes` t | t `elem` ["dt", "dd"] = True "dt" `closes` t | t `elem` ["dt","dd"] = True -"hr" `closes` "p" = True -"p" `closes` "p" = True +"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True +"optgroup" `closes` "optgroup" = True +"optgroup" `closes` "option" = True +"option" `closes` "option" = True +-- http://www.w3.org/TR/html-markup/p.html +x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote", + "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "header", "hr", "menu", "nav", "ol", "p", "pre", "section", + "table", "ul"] = True "meta" `closes` "meta" = True -"colgroup" `closes` "colgroup" = True "form" `closes` "form" = True "label" `closes` "label" = True "map" `closes` "map" = True @@ -620,3 +697,11 @@ htmlTag f = try $ do _ -> do rendered <- manyTill anyChar (char '>') return (next, rendered ++ ">") + +mkAttr :: [(String, String)] -> Attr +mkAttr attr = (attribsId, attribsClasses, attribsKV) + where attribsId = fromMaybe "" $ lookup "id" attr + attribsClasses = words $ fromMaybe "" $ lookup "class" attr + attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr + + diff --git a/src/Text/Pandoc/Readers/Haddock.hs b/src/Text/Pandoc/Readers/Haddock.hs index 0e74406ef..4b46c869d 100644 --- a/src/Text/Pandoc/Readers/Haddock.hs +++ b/src/Text/Pandoc/Readers/Haddock.hs @@ -3,7 +3,8 @@ Copyright : Copyright (C) 2013 David Lazar License : GNU GPL, version 2 or above - Maintainer : David Lazar <lazar6@illinois.edu> + Maintainer : David Lazar <lazar6@illinois.edu>, + John MacFarlane <jgm@berkeley.edu> Stability : alpha Conversion of Haddock markup to 'Pandoc' document. @@ -12,30 +13,126 @@ module Text.Pandoc.Readers.Haddock ( readHaddock ) where -import Text.Pandoc.Builder +import Text.Pandoc.Builder (Blocks, Inlines) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Shared (trim, splitBy) +import Data.Monoid +import Data.List (intersperse, stripPrefix) +import Data.Maybe (fromMaybe) +import Text.Pandoc.Definition import Text.Pandoc.Options -import Text.Pandoc.Readers.Haddock.Lex -import Text.Pandoc.Readers.Haddock.Parse +import Documentation.Haddock.Parser +import Documentation.Haddock.Types +import Debug.Trace (trace) -- | Parse Haddock markup and return a 'Pandoc' document. readHaddock :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse -> Pandoc -readHaddock _ s = Pandoc nullMeta blocks +readHaddock opts = B.doc . docHToBlocks . trace' . parseParas + where trace' x = if readerTrace opts + then trace (show x) x + else x + +docHToBlocks :: DocH String Identifier -> Blocks +docHToBlocks d' = + case d' of + DocEmpty -> mempty + DocAppend (DocParagraph (DocHeader h)) (DocParagraph (DocAName ident)) -> + B.headerWith (ident,[],[]) (headerLevel h) + (docHToInlines False $ headerTitle h) + DocAppend d1 d2 -> mappend (docHToBlocks d1) (docHToBlocks d2) + DocString _ -> inlineFallback + DocParagraph (DocAName h) -> B.plain $ docHToInlines False $ DocAName h + DocParagraph x -> B.para $ docHToInlines False x + DocIdentifier _ -> inlineFallback + DocIdentifierUnchecked _ -> inlineFallback + DocModule s -> B.plain $ docHToInlines False $ DocModule s + DocWarning _ -> mempty -- TODO + DocEmphasis _ -> inlineFallback + DocMonospaced _ -> inlineFallback + DocBold _ -> inlineFallback + DocHeader h -> B.header (headerLevel h) + (docHToInlines False $ headerTitle h) + DocUnorderedList items -> B.bulletList (map docHToBlocks items) + DocOrderedList items -> B.orderedList (map docHToBlocks items) + DocDefList items -> B.definitionList (map (\(d,t) -> + (docHToInlines False d, + [consolidatePlains $ docHToBlocks t])) items) + DocCodeBlock (DocString s) -> B.codeBlockWith ("",[],[]) s + DocCodeBlock d -> B.para $ docHToInlines True d + DocHyperlink _ -> inlineFallback + DocPic _ -> inlineFallback + DocAName _ -> inlineFallback + DocProperty s -> B.codeBlockWith ("",["property","haskell"],[]) (trim s) + DocExamples es -> mconcat $ map (\e -> + makeExample ">>>" (exampleExpression e) (exampleResult e)) es + + where inlineFallback = B.plain $ docHToInlines False d' + consolidatePlains = B.fromList . consolidatePlains' . B.toList + consolidatePlains' zs@(Plain _ : _) = + let (xs, ys) = span isPlain zs in + Para (concatMap extractContents xs) : consolidatePlains' ys + consolidatePlains' (x : xs) = x : consolidatePlains' xs + consolidatePlains' [] = [] + isPlain (Plain _) = True + isPlain _ = False + extractContents (Plain xs) = xs + extractContents _ = [] + +docHToInlines :: Bool -> DocH String Identifier -> Inlines +docHToInlines isCode d' = + case d' of + DocEmpty -> mempty + DocAppend d1 d2 -> mappend (docHToInlines isCode d1) + (docHToInlines isCode d2) + DocString s + | isCode -> mconcat $ intersperse B.linebreak + $ map B.code $ splitBy (=='\n') s + | otherwise -> B.text s + DocParagraph _ -> mempty + DocIdentifier (_,s,_) -> B.codeWith ("",["haskell","identifier"],[]) s + DocIdentifierUnchecked s -> B.codeWith ("",["haskell","identifier"],[]) s + DocModule s -> B.codeWith ("",["haskell","module"],[]) s + DocWarning _ -> mempty -- TODO + DocEmphasis d -> B.emph (docHToInlines isCode d) + DocMonospaced (DocString s) -> B.code s + DocMonospaced d -> docHToInlines True d + DocBold d -> B.strong (docHToInlines isCode d) + DocHeader _ -> mempty + DocUnorderedList _ -> mempty + DocOrderedList _ -> mempty + DocDefList _ -> mempty + DocCodeBlock _ -> mempty + DocHyperlink h -> B.link (hyperlinkUrl h) (hyperlinkUrl h) + (maybe (B.text $ hyperlinkUrl h) B.text $ hyperlinkLabel h) + DocPic p -> B.image (pictureUri p) (fromMaybe (pictureUri p) $ pictureTitle p) + (maybe mempty B.text $ pictureTitle p) + DocAName s -> B.spanWith (s,["anchor"],[]) mempty + DocProperty _ -> mempty + DocExamples _ -> mempty + +-- | Create an 'Example', stripping superfluous characters as appropriate +makeExample :: String -> String -> [String] -> Blocks +makeExample prompt expression result = + B.para $ B.codeWith ("",["prompt"],[]) prompt + <> B.space + <> B.codeWith ([], ["haskell","expr"], []) (trim expression) + <> B.linebreak + <> (mconcat $ intersperse B.linebreak $ map coder result') where - blocks = case parseParas (tokenise s (0,0)) of - Left [] -> error "parse failure" - Left (tok:_) -> error $ "parse failure " ++ pos (tokenPos tok) - where pos (l, c) = "(line " ++ show l ++ ", column " ++ show c ++ ")" - Right x -> mergeLists (toList x) - --- similar to 'docAppend' in Haddock.Doc -mergeLists :: [Block] -> [Block] -mergeLists (BulletList xs : BulletList ys : blocks) - = mergeLists (BulletList (xs ++ ys) : blocks) -mergeLists (OrderedList _ xs : OrderedList a ys : blocks) - = mergeLists (OrderedList a (xs ++ ys) : blocks) -mergeLists (DefinitionList xs : DefinitionList ys : blocks) - = mergeLists (DefinitionList (xs ++ ys) : blocks) -mergeLists (x : blocks) = x : mergeLists blocks -mergeLists [] = [] + -- 1. drop trailing whitespace from the prompt, remember the prefix + prefix = takeWhile (`elem` " \t") prompt + + -- 2. drop, if possible, the exact same sequence of whitespace + -- characters from each result line + -- + -- 3. interpret lines that only contain the string "<BLANKLINE>" as an + -- empty line + result' = map (substituteBlankLine . tryStripPrefix prefix) result + where + tryStripPrefix xs ys = fromMaybe ys $ stripPrefix xs ys + + substituteBlankLine "<BLANKLINE>" = "" + substituteBlankLine line = line + coder = B.codeWith ([], ["result"], []) diff --git a/src/Text/Pandoc/Readers/Haddock/Lex.x b/src/Text/Pandoc/Readers/Haddock/Lex.x deleted file mode 100644 index 120e96ebf..000000000 --- a/src/Text/Pandoc/Readers/Haddock/Lex.x +++ /dev/null @@ -1,171 +0,0 @@ --- --- Haddock - A Haskell Documentation Tool --- --- (c) Simon Marlow 2002 --- --- This file was modified and integrated into GHC by David Waern 2006. --- Then moved back into Haddock by Isaac Dupree in 2009 :-) --- Then copied into Pandoc by David Lazar in 2013 :-D - -{ -{-# LANGUAGE BangPatterns #-} -- Generated by Alex -{-# OPTIONS -Wwarn -w #-} --- The above warning supression flag is a temporary kludge. --- While working on this module you are encouraged to remove it and fix --- any warnings in the module. See --- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings --- for details - -module Text.Pandoc.Readers.Haddock.Lex ( - Token(..), - LToken, - tokenise, - tokenPos - ) where - -import Data.Char -import Numeric (readHex) -} - -%wrapper "posn" - -$ws = $white # \n -$digit = [0-9] -$hexdigit = [0-9a-fA-F] -$special = [\"\@] -$alphanum = [A-Za-z0-9] -$ident = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~\:] - -:- - --- beginning of a paragraph -<0,para> { - $ws* \n ; - $ws* \> { begin birdtrack } - $ws* prop \> .* \n { strtoken TokProperty `andBegin` property} - $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr } - $ws* [\*\-] { token TokBullet `andBegin` string } - $ws* \[ { token TokDefStart `andBegin` def } - $ws* \( $digit+ \) { token TokNumber `andBegin` string } - $ws* $digit+ \. { token TokNumber `andBegin` string } - $ws* { begin string } -} - --- beginning of a line -<line> { - $ws* \> { begin birdtrack } - $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr } - - $ws* \n { token TokPara `andBegin` para } - -- ^ Here, we really want to be able to say - -- $ws* (\n | <eof>) { token TokPara `andBegin` para} - -- because otherwise a trailing line of whitespace will result in - -- a spurious TokString at the end of a docstring. We don't have <eof>, - -- though (NOW I realise what it was for :-). To get around this, we always - -- append \n to the end of a docstring. - - () { begin string } -} - -<birdtrack> .* \n? { strtokenNL TokBirdTrack `andBegin` line } - -<property> () { token TokPara `andBegin` para } - -<example> { - $ws* \n { token TokPara `andBegin` para } - $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr } - () { begin exampleresult } -} - -<exampleexpr> .* \n { strtokenNL TokExampleExpression `andBegin` example } - -<exampleresult> .* \n { strtokenNL TokExampleResult `andBegin` example } - -<string,def> { - $special { strtoken $ \s -> TokSpecial (head s) } - \<\< [^\>]* \>\> { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) } - \< [^\>]* \> { strtoken $ \s -> TokURL (init (tail s)) } - \# [^\#]* \# { strtoken $ \s -> TokAName (init (tail s)) } - \/ [^\/]* \/ { strtoken $ \s -> TokEmphasis (init (tail s)) } - [\'\`] $ident+ [\'\`] { strtoken $ \s -> TokIdent (init (tail s)) } - \\ . { strtoken (TokString . tail) } - "&#" $digit+ \; { strtoken $ \s -> TokString [chr (read (init (drop 2 s)))] } - "&#" [xX] $hexdigit+ \; - { strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] -> TokString [chr n] } - -- allow special characters through if they don't fit one of the previous - -- patterns. - [\/\'\`\<\#\&\\] { strtoken TokString } - [^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line } - [^ $special \/ \< \# \n \'\` \& \\ \]]+ { strtoken TokString } -} - -<def> { - \] { token TokDefEnd `andBegin` string } -} - --- ']' doesn't have any special meaning outside of the [...] at the beginning --- of a definition paragraph. -<string> { - \] { strtoken TokString } -} - -{ --- | A located token -type LToken = (Token, AlexPosn) - -data Token - = TokPara - | TokNumber - | TokBullet - | TokDefStart - | TokDefEnd - | TokSpecial Char - | TokIdent String - | TokString String - | TokURL String - | TokPic String - | TokEmphasis String - | TokAName String - | TokBirdTrack String - | TokProperty String - | TokExamplePrompt String - | TokExampleExpression String - | TokExampleResult String - deriving Show - -tokenPos :: LToken -> (Int, Int) -tokenPos t = let AlexPn _ line col = snd t in (line, col) - -type StartCode = Int -type Action = AlexPosn -> String -> StartCode -> (StartCode -> [LToken]) -> [LToken] - -tokenise :: String -> (Int, Int) -> [LToken] -tokenise str (line, col) = go (posn,'\n',[],eofHack str) para - where posn = AlexPn 0 line col - go inp@(pos,_,_,str) sc = - case alexScan inp sc of - AlexEOF -> [] - AlexError _ -> [] - AlexSkip inp' len -> go inp' sc - AlexToken inp' len act -> act pos (take len str) sc (\sc -> go inp' sc) - --- NB. we add a final \n to the string, (see comment in the beginning of line --- production above). -eofHack str = str++"\n" - -andBegin :: Action -> StartCode -> Action -andBegin act new_sc = \pos str _ cont -> act pos str new_sc cont - -token :: Token -> Action -token t = \pos _ sc cont -> (t, pos) : cont sc - -strtoken, strtokenNL :: (String -> Token) -> Action -strtoken t = \pos str sc cont -> (t str, pos) : cont sc -strtokenNL t = \pos str sc cont -> (t (filter (/= '\r') str), pos) : cont sc --- ^ We only want LF line endings in our internal doc string format, so we --- filter out all CRs. - -begin :: StartCode -> Action -begin sc = \_ _ _ cont -> cont sc - -} diff --git a/src/Text/Pandoc/Readers/Haddock/Parse.y b/src/Text/Pandoc/Readers/Haddock/Parse.y deleted file mode 100644 index 9c2bbc8a9..000000000 --- a/src/Text/Pandoc/Readers/Haddock/Parse.y +++ /dev/null @@ -1,178 +0,0 @@ --- This code was copied from the 'haddock' package, modified, and integrated --- into Pandoc by David Lazar. -{ -{-# LANGUAGE BangPatterns #-} -- required for versions of Happy before 1.18.6 --- The above warning supression flag is a temporary kludge. --- While working on this module you are encouraged to remove it and fix --- any warnings in the module. See --- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings --- for details - -module Text.Pandoc.Readers.Haddock.Parse (parseString, parseParas) where - -import Text.Pandoc.Readers.Haddock.Lex -import Text.Pandoc.Builder -import Text.Pandoc.Shared (trim, trimr) -import Data.Generics (everywhere, mkT) -import Data.Char (isSpace) -import Data.Maybe (fromMaybe) -import Data.List (stripPrefix, intersperse) -import Data.Monoid (mempty, mconcat) -} - -%expect 0 - -%tokentype { LToken } - -%token - '/' { (TokSpecial '/',_) } - '@' { (TokSpecial '@',_) } - '[' { (TokDefStart,_) } - ']' { (TokDefEnd,_) } - DQUO { (TokSpecial '\"',_) } - URL { (TokURL $$,_) } - PIC { (TokPic $$,_) } - ANAME { (TokAName $$,_) } - '/../' { (TokEmphasis $$,_) } - '-' { (TokBullet,_) } - '(n)' { (TokNumber,_) } - '>..' { (TokBirdTrack $$,_) } - PROP { (TokProperty $$,_) } - PROMPT { (TokExamplePrompt $$,_) } - RESULT { (TokExampleResult $$,_) } - EXP { (TokExampleExpression $$,_) } - IDENT { (TokIdent $$,_) } - PARA { (TokPara,_) } - STRING { (TokString $$,_) } - -%monad { Either [LToken] } - -%name parseParas doc -%name parseString seq - -%% - -doc :: { Blocks } - : apara PARA doc { $1 <> $3 } - | PARA doc { $2 } - | apara { $1 } - | {- empty -} { mempty } - -apara :: { Blocks } - : ulpara { bulletList [$1] } - | olpara { orderedList [$1] } - | defpara { definitionList [$1] } - | para { $1 } - -ulpara :: { Blocks } - : '-' para { $2 } - -olpara :: { Blocks } - : '(n)' para { $2 } - -defpara :: { (Inlines, [Blocks]) } - : '[' seq ']' seq { (trimInlines $2, [plain $ trimInlines $4]) } - -para :: { Blocks } - : seq { para' $1 } - | codepara { codeBlockWith ([], ["haskell"], []) $1 } - | property { $1 } - | examples { $1 } - -codepara :: { String } - : '>..' codepara { $1 ++ $2 } - | '>..' { $1 } - -property :: { Blocks } - : PROP { makeProperty $1 } - -examples :: { Blocks } - : example examples { $1 <> $2 } - | example { $1 } - -example :: { Blocks } - : PROMPT EXP result { makeExample $1 $2 (lines $3) } - | PROMPT EXP { makeExample $1 $2 [] } - -result :: { String } - : RESULT result { $1 ++ $2 } - | RESULT { $1 } - -seq :: { Inlines } - : elem seq { $1 <> $2 } - | elem { $1 } - -elem :: { Inlines } - : elem1 { $1 } - | '@' seq1 '@' { monospace $2 } - -seq1 :: { Inlines } - : PARA seq1 { linebreak <> $2 } - | elem1 seq1 { $1 <> $2 } - | elem1 { $1 } - -elem1 :: { Inlines } - : STRING { text $1 } - | '/../' { emph (str $1) } - | URL { makeHyperlink $1 } - | PIC { image $1 $1 mempty } - | ANAME { mempty } -- TODO - | IDENT { codeWith ([], ["haskell"], []) $1 } - | DQUO strings DQUO { codeWith ([], ["haskell"], []) $2 } - -strings :: { String } - : STRING { $1 } - | STRING strings { $1 ++ $2 } - -{ -happyError :: [LToken] -> Either [LToken] a -happyError toks = Left toks - -para' :: Inlines -> Blocks -para' = para . trimInlines - -monospace :: Inlines -> Inlines -monospace = everywhere (mkT go) - where - go (Str s) = Code nullAttr s - go x = x - --- | Create a `Hyperlink` from given string. --- --- A hyperlink consists of a URL and an optional label. The label is separated --- from the url by one or more whitespace characters. -makeHyperlink :: String -> Inlines -makeHyperlink input = case break isSpace $ trim input of - (url, "") -> link url url (str url) - (url, lb) -> link url url (trimInlines $ text lb) - -makeProperty :: String -> Blocks -makeProperty s = case trim s of - 'p':'r':'o':'p':'>':xs -> - codeBlockWith ([], ["property"], []) (dropWhile isSpace xs) - xs -> - error $ "makeProperty: invalid input " ++ show xs - --- | Create an 'Example', stripping superfluous characters as appropriate -makeExample :: String -> String -> [String] -> Blocks -makeExample prompt expression result = - para $ codeWith ([], ["haskell","expr"], []) (trim expression) - <> linebreak - <> (mconcat $ intersperse linebreak $ map coder result') - where - -- 1. drop trailing whitespace from the prompt, remember the prefix - prefix = takeWhile isSpace prompt - - -- 2. drop, if possible, the exact same sequence of whitespace - -- characters from each result line - -- - -- 3. interpret lines that only contain the string "<BLANKLINE>" as an - -- empty line - result' = map (substituteBlankLine . tryStripPrefix prefix) result - where - tryStripPrefix xs ys = fromMaybe ys $ stripPrefix xs ys - - substituteBlankLine "<BLANKLINE>" = "" - substituteBlankLine line = line - coder = codeWith ([], ["result"], []) -} diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 1a22f2ad2..97bfaa455 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -1,6 +1,6 @@ -{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE ScopedTypeVariables, OverloadedStrings #-} {- -Copyright (C) 2006-2012 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.LaTeX - Copyright : Copyright (C) 2006-2012 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -31,22 +31,26 @@ Conversion of LaTeX to 'Pandoc' document. module Text.Pandoc.Readers.LaTeX ( readLaTeX, rawLaTeXInline, rawLaTeXBlock, + inlineCommand, handleIncludes ) where import Text.Pandoc.Definition -import Text.Pandoc.Generic +import Text.Pandoc.Walk import Text.Pandoc.Shared import Text.Pandoc.Options -import Text.Pandoc.Biblio (processBiblio) -import Text.Pandoc.Parsing hiding ((<|>), many, optional, space) +import Text.Pandoc.Parsing hiding ((<|>), many, optional, space, + mathDisplay, mathInline) +import Text.Parsec.Prim (ParsecT, runParserT) import qualified Text.Pandoc.UTF8 as UTF8 import Data.Char ( chr, ord ) +import Control.Monad.Trans (lift) import Control.Monad import Text.Pandoc.Builder -import Data.Char (isLetter, isPunctuation, isSpace) +import Data.Char (isLetter, isAlphaNum) import Control.Applicative import Data.Monoid +import Data.Maybe (fromMaybe) import System.Environment (getEnv) import System.FilePath (replaceExtension, (</>)) import Data.List (intercalate, intersperse) @@ -67,9 +71,7 @@ parseLaTeX = do eof st <- getState let meta = stateMeta st - refs <- getOption readerReferences - mbsty <- getOption readerCitationStyle - let (Pandoc _ bs') = processBiblio mbsty refs $ doc bs + let (Pandoc _ bs') = doc bs return $ Pandoc meta bs' type LP = Parser [Char] ParserState @@ -124,7 +126,7 @@ comment :: LP () comment = do char '%' skipMany (satisfy (/='\n')) - newline + optional newline return () bgroup :: LP () @@ -165,28 +167,40 @@ mathChars = concat <$> <|> (\c -> ['\\',c]) <$> (try $ char '\\' *> anyChar) ) +quoted' :: (Inlines -> Inlines) -> LP String -> LP () -> LP Inlines +quoted' f starter ender = do + startchs <- starter + try ((f . mconcat) <$> manyTill inline ender) <|> lit startchs + double_quote :: LP Inlines -double_quote = (doubleQuoted . mconcat) <$> - (try $ string "``" *> manyTill inline (try $ string "''")) +double_quote = + ( quoted' doubleQuoted (try $ string "``") (void $ try $ string "''") + <|> quoted' doubleQuoted (string "“") (void $ char '”') + -- the following is used by babel for localized quotes: + <|> quoted' doubleQuoted (try $ string "\"`") (void $ try $ string "\"'") + <|> quoted' doubleQuoted (string "\"") (void $ char '"') + ) single_quote :: LP Inlines -single_quote = (singleQuoted . mconcat) <$> - (try $ char '`' *> manyTill inline (try $ char '\'' >> notFollowedBy letter)) +single_quote = + ( quoted' singleQuoted (string "`") (try $ char '\'' >> notFollowedBy letter) + <|> quoted' singleQuoted (string "‘") (try $ char '’' >> notFollowedBy letter) + ) inline :: LP Inlines inline = (mempty <$ comment) <|> (space <$ sp) <|> inlineText <|> inlineCommand - <|> grouped inline + <|> inlineGroup <|> (char '-' *> option (str "-") ((char '-') *> option (str "–") (str "—" <$ char '-'))) <|> double_quote <|> single_quote - <|> (str "“" <$ try (string "``")) -- nb. {``} won't be caught by double_quote <|> (str "”" <$ try (string "''")) - <|> (str "‘" <$ char '`') -- nb. {`} won't be caught by single_quote + <|> (str "”" <$ char '”') <|> (str "’" <$ char '\'') + <|> (str "’" <$ char '’') <|> (str "\160" <$ char '~') <|> (mathDisplay $ string "$$" *> mathChars <* string "$$") <|> (mathInline $ char '$' *> mathChars <* char '$') @@ -201,6 +215,15 @@ inline = (mempty <$ comment) inlines :: LP Inlines inlines = mconcat <$> many (notFollowedBy (char '}') *> inline) +inlineGroup :: LP Inlines +inlineGroup = do + ils <- grouped inline + if isNull ils + then return mempty + else return $ spanWith nullAttr ils + -- we need the span so we can detitlecase bibtex entries; + -- we need to know when something is {C}apitalized + block :: LP Blocks block = (mempty <$ comment) <|> (mempty <$ ((spaceChar <|> newline) *> spaces)) @@ -282,6 +305,13 @@ blockCommands = M.fromList $ , ("item", skipopts *> loose_item) , ("documentclass", skipopts *> braced *> preamble) , ("centerline", (para . trimInlines) <$> (skipopts *> tok)) + , ("caption", skipopts *> tok >>= setCaption) + , ("PandocStartInclude", startInclude) + , ("PandocEndInclude", endInclude) + , ("bibliography", mempty <$ (skipopts *> braced >>= + addMeta "bibliography" . splitBibs)) + , ("addbibresource", mempty <$ (skipopts *> braced >>= + addMeta "bibliography" . splitBibs)) ] ++ map ignoreBlocks -- these commands will be ignored unless --parse-raw is specified, -- in which case they will appear as raw latex blocks @@ -289,7 +319,7 @@ blockCommands = M.fromList $ -- newcommand, etc. should be parsed by macro, but we need this -- here so these aren't parsed as inline commands to ignore , "special", "pdfannot", "pdfstringdef" - , "bibliography", "bibliographystyle" + , "bibliographystyle" , "maketitle", "makeindex", "makeglossary" , "addcontentsline", "addtocontents", "addtocounter" -- \ignore{} is used conventionally in literate haskell for definitions @@ -301,7 +331,19 @@ blockCommands = M.fromList $ ] addMeta :: ToMetaValue a => String -> a -> LP () -addMeta field val = updateState $ setMeta field val +addMeta field val = updateState $ \st -> + st{ stateMeta = addMetaField field val $ stateMeta st } + +splitBibs :: String -> [Inlines] +splitBibs = map (str . flip replaceExtension "bib" . trim) . splitBy (==',') + +setCaption :: Inlines -> LP Blocks +setCaption ils = do + updateState $ \st -> st{ stateCaption = Just ils } + return mempty + +resetCaption :: LP () +resetCaption = updateState $ \st -> st{ stateCaption = Nothing } authors :: LP () authors = try $ do @@ -312,15 +354,17 @@ authors = try $ do -- skip e.g. \vspace{10pt} auths <- sepBy oneAuthor (controlSeq "and") char '}' - addMeta "authors" (map trimInlines auths) + addMeta "author" (map trimInlines auths) section :: Attr -> Int -> LP Blocks -section attr lvl = do +section (ident, classes, kvs) lvl = do hasChapters <- stateHasChapters `fmap` getState let lvl' = if hasChapters then lvl + 1 else lvl skipopts contents <- grouped inline - return $ headerWith attr lvl' contents + lab <- option ident $ try (spaces >> controlSeq "label" >> spaces >> braced) + attr' <- registerHeader (lab, classes, kvs) contents + return $ headerWith attr' lvl' contents inlineCommand :: LP Inlines inlineCommand = try $ do @@ -353,17 +397,18 @@ isBlockCommand s = maybe False (const True) $ M.lookup s blockCommands inlineCommands :: M.Map String (LP Inlines) inlineCommands = M.fromList $ - [ ("emph", emph <$> tok) - , ("textit", emph <$> tok) - , ("textsl", emph <$> tok) - , ("textsc", smallcaps <$> tok) - , ("sout", strikeout <$> tok) - , ("textsuperscript", superscript <$> tok) - , ("textsubscript", subscript <$> tok) + [ ("emph", extractSpaces emph <$> tok) + , ("textit", extractSpaces emph <$> tok) + , ("textsl", extractSpaces emph <$> tok) + , ("textsc", extractSpaces smallcaps <$> tok) + , ("sout", extractSpaces strikeout <$> tok) + , ("textsuperscript", extractSpaces superscript <$> tok) + , ("textsubscript", extractSpaces subscript <$> tok) , ("textbackslash", lit "\\") , ("backslash", lit "\\") , ("slash", lit "/") - , ("textbf", strong <$> tok) + , ("textbf", extractSpaces strong <$> tok) + , ("textnormal", extractSpaces (spanWith ("",["nodecor"],[])) <$> tok) , ("ldots", lit "…") , ("dots", lit "…") , ("mdots", lit "…") @@ -383,15 +428,15 @@ inlineCommands = M.fromList $ , ("{", lit "{") , ("}", lit "}") -- old TeX commands - , ("em", emph <$> inlines) - , ("it", emph <$> inlines) - , ("sl", emph <$> inlines) - , ("bf", strong <$> inlines) + , ("em", extractSpaces emph <$> inlines) + , ("it", extractSpaces emph <$> inlines) + , ("sl", extractSpaces emph <$> inlines) + , ("bf", extractSpaces strong <$> inlines) , ("rm", inlines) - , ("itshape", emph <$> inlines) - , ("slshape", emph <$> inlines) - , ("scshape", smallcaps <$> inlines) - , ("bfseries", strong <$> inlines) + , ("itshape", extractSpaces emph <$> inlines) + , ("slshape", extractSpaces emph <$> inlines) + , ("scshape", extractSpaces smallcaps <$> inlines) + , ("bfseries", extractSpaces strong <$> inlines) , ("/", pure mempty) -- italic correction , ("aa", lit "å") , ("AA", lit "Å") @@ -402,6 +447,8 @@ inlineCommands = M.fromList $ , ("l", lit "ł") , ("ae", lit "æ") , ("AE", lit "Æ") + , ("oe", lit "œ") + , ("OE", lit "Œ") , ("pounds", lit "£") , ("euro", lit "€") , ("copyright", lit "©") @@ -415,6 +462,8 @@ inlineCommands = M.fromList $ , (".", option (str ".") $ try $ tok >>= accent dot) , ("=", option (str "=") $ try $ tok >>= accent macron) , ("c", option (str "c") $ try $ tok >>= accent cedilla) + , ("v", option (str "v") $ try $ tok >>= accent hacek) + , ("u", option (str "u") $ try $ tok >>= accent breve) , ("i", lit "i") , ("\\", linebreak <$ (optional (bracketed inline) *> optional sp)) , (",", pure mempty) @@ -430,6 +479,7 @@ inlineCommands = M.fromList $ , ("footnote", (note . mconcat) <$> (char '{' *> manyTill block (char '}'))) , ("verb", doverb) , ("lstinline", doverb) + , ("Verb", doverb) , ("texttt", (code . stringify . toList) <$> tok) , ("url", (unescapeURL <$> braced) >>= \url -> pure (link url "" (str url))) @@ -488,25 +538,21 @@ inlineCommands = M.fromList $ , ("citeauthor", (try (tok *> optional sp *> controlSeq "citetext") *> complexNatbibCitation AuthorInText) <|> citation "citeauthor" AuthorInText False) + , ("nocite", mempty <$ (citation "nocite" NormalCitation False >>= + addMeta "nocite")) ] ++ map ignoreInlines -- these commands will be ignored unless --parse-raw is specified, -- in which case they will appear as raw latex blocks: - [ "noindent", "index", "nocite" ] + [ "noindent", "index" ] mkImage :: String -> LP Inlines mkImage src = do - -- try for a caption - (alt, tit) <- option (str "image", "") $ try $ do - spaces - controlSeq "caption" - optional (char '*') - ils <- grouped inline - return (ils, "fig:") + let alt = str "image" case takeExtension src of "" -> do defaultExt <- getOption readerDefaultImageExtension - return $ image (addExtension src defaultExt) tit alt - _ -> return $ image src tit alt + return $ image (addExtension src defaultExt) "" alt + _ -> return $ image src "" alt inNote :: Inlines -> Inlines inNote ils = @@ -514,9 +560,7 @@ inNote ils = unescapeURL :: String -> String unescapeURL ('\\':x:xs) | isEscapable x = x:unescapeURL xs - where isEscapable '%' = True - isEscapable '#' = True - isEscapable _ = False + where isEscapable c = c `elem` "#$%&~_^\\{}" unescapeURL (x:xs) = x:unescapeURL xs unescapeURL [] = "" @@ -539,137 +583,196 @@ doLHSverb = codeWith ("",["haskell"],[]) <$> manyTill (satisfy (/='\n')) (char ' lit :: String -> LP Inlines lit = pure . str -accent :: (Char -> Char) -> Inlines -> LP Inlines +accent :: (Char -> String) -> Inlines -> LP Inlines accent f ils = case toList ils of - (Str (x:xs) : ys) -> return $ fromList $ (Str (f x : xs) : ys) + (Str (x:xs) : ys) -> return $ fromList $ (Str (f x ++ xs) : ys) [] -> mzero _ -> return ils -grave :: Char -> Char -grave 'A' = 'À' -grave 'E' = 'È' -grave 'I' = 'Ì' -grave 'O' = 'Ò' -grave 'U' = 'Ù' -grave 'a' = 'à' -grave 'e' = 'è' -grave 'i' = 'ì' -grave 'o' = 'ò' -grave 'u' = 'ù' -grave c = c - -acute :: Char -> Char -acute 'A' = 'Á' -acute 'E' = 'É' -acute 'I' = 'Í' -acute 'O' = 'Ó' -acute 'U' = 'Ú' -acute 'Y' = 'Ý' -acute 'a' = 'á' -acute 'e' = 'é' -acute 'i' = 'í' -acute 'o' = 'ó' -acute 'u' = 'ú' -acute 'y' = 'ý' -acute 'C' = 'Ć' -acute 'c' = 'ć' -acute 'L' = 'Ĺ' -acute 'l' = 'ĺ' -acute 'N' = 'Ń' -acute 'n' = 'ń' -acute 'R' = 'Ŕ' -acute 'r' = 'ŕ' -acute 'S' = 'Ś' -acute 's' = 'ś' -acute 'Z' = 'Ź' -acute 'z' = 'ź' -acute c = c - -circ :: Char -> Char -circ 'A' = 'Â' -circ 'E' = 'Ê' -circ 'I' = 'Î' -circ 'O' = 'Ô' -circ 'U' = 'Û' -circ 'a' = 'â' -circ 'e' = 'ê' -circ 'i' = 'î' -circ 'o' = 'ô' -circ 'u' = 'û' -circ 'C' = 'Ĉ' -circ 'c' = 'ĉ' -circ 'G' = 'Ĝ' -circ 'g' = 'ĝ' -circ 'H' = 'Ĥ' -circ 'h' = 'ĥ' -circ 'J' = 'Ĵ' -circ 'j' = 'ĵ' -circ 'S' = 'Ŝ' -circ 's' = 'ŝ' -circ 'W' = 'Ŵ' -circ 'w' = 'ŵ' -circ 'Y' = 'Ŷ' -circ 'y' = 'ŷ' -circ c = c - -tilde :: Char -> Char -tilde 'A' = 'Ã' -tilde 'a' = 'ã' -tilde 'O' = 'Õ' -tilde 'o' = 'õ' -tilde 'I' = 'Ĩ' -tilde 'i' = 'ĩ' -tilde 'U' = 'Ũ' -tilde 'u' = 'ũ' -tilde 'N' = 'Ñ' -tilde 'n' = 'ñ' -tilde c = c - -umlaut :: Char -> Char -umlaut 'A' = 'Ä' -umlaut 'E' = 'Ë' -umlaut 'I' = 'Ï' -umlaut 'O' = 'Ö' -umlaut 'U' = 'Ü' -umlaut 'a' = 'ä' -umlaut 'e' = 'ë' -umlaut 'i' = 'ï' -umlaut 'o' = 'ö' -umlaut 'u' = 'ü' -umlaut c = c - -dot :: Char -> Char -dot 'C' = 'Ċ' -dot 'c' = 'ċ' -dot 'E' = 'Ė' -dot 'e' = 'ė' -dot 'G' = 'Ġ' -dot 'g' = 'ġ' -dot 'I' = 'İ' -dot 'Z' = 'Ż' -dot 'z' = 'ż' -dot c = c - -macron :: Char -> Char -macron 'A' = 'Ā' -macron 'E' = 'Ē' -macron 'I' = 'Ī' -macron 'O' = 'Ō' -macron 'U' = 'Ū' -macron 'a' = 'ā' -macron 'e' = 'ē' -macron 'i' = 'ī' -macron 'o' = 'ō' -macron 'u' = 'ū' -macron c = c - -cedilla :: Char -> Char -cedilla 'c' = 'ç' -cedilla 'C' = 'Ç' -cedilla 's' = 'ş' -cedilla 'S' = 'Ş' -cedilla c = c +grave :: Char -> String +grave 'A' = "À" +grave 'E' = "È" +grave 'I' = "Ì" +grave 'O' = "Ò" +grave 'U' = "Ù" +grave 'a' = "à" +grave 'e' = "è" +grave 'i' = "ì" +grave 'o' = "ò" +grave 'u' = "ù" +grave c = [c] + +acute :: Char -> String +acute 'A' = "Á" +acute 'E' = "É" +acute 'I' = "Í" +acute 'O' = "Ó" +acute 'U' = "Ú" +acute 'Y' = "Ý" +acute 'a' = "á" +acute 'e' = "é" +acute 'i' = "í" +acute 'o' = "ó" +acute 'u' = "ú" +acute 'y' = "ý" +acute 'C' = "Ć" +acute 'c' = "ć" +acute 'L' = "Ĺ" +acute 'l' = "ĺ" +acute 'N' = "Ń" +acute 'n' = "ń" +acute 'R' = "Ŕ" +acute 'r' = "ŕ" +acute 'S' = "Ś" +acute 's' = "ś" +acute 'Z' = "Ź" +acute 'z' = "ź" +acute c = [c] + +circ :: Char -> String +circ 'A' = "Â" +circ 'E' = "Ê" +circ 'I' = "Î" +circ 'O' = "Ô" +circ 'U' = "Û" +circ 'a' = "â" +circ 'e' = "ê" +circ 'i' = "î" +circ 'o' = "ô" +circ 'u' = "û" +circ 'C' = "Ĉ" +circ 'c' = "ĉ" +circ 'G' = "Ĝ" +circ 'g' = "ĝ" +circ 'H' = "Ĥ" +circ 'h' = "ĥ" +circ 'J' = "Ĵ" +circ 'j' = "ĵ" +circ 'S' = "Ŝ" +circ 's' = "ŝ" +circ 'W' = "Ŵ" +circ 'w' = "ŵ" +circ 'Y' = "Ŷ" +circ 'y' = "ŷ" +circ c = [c] + +tilde :: Char -> String +tilde 'A' = "Ã" +tilde 'a' = "ã" +tilde 'O' = "Õ" +tilde 'o' = "õ" +tilde 'I' = "Ĩ" +tilde 'i' = "ĩ" +tilde 'U' = "Ũ" +tilde 'u' = "ũ" +tilde 'N' = "Ñ" +tilde 'n' = "ñ" +tilde c = [c] + +umlaut :: Char -> String +umlaut 'A' = "Ä" +umlaut 'E' = "Ë" +umlaut 'I' = "Ï" +umlaut 'O' = "Ö" +umlaut 'U' = "Ü" +umlaut 'a' = "ä" +umlaut 'e' = "ë" +umlaut 'i' = "ï" +umlaut 'o' = "ö" +umlaut 'u' = "ü" +umlaut c = [c] + +dot :: Char -> String +dot 'C' = "Ċ" +dot 'c' = "ċ" +dot 'E' = "Ė" +dot 'e' = "ė" +dot 'G' = "Ġ" +dot 'g' = "ġ" +dot 'I' = "İ" +dot 'Z' = "Ż" +dot 'z' = "ż" +dot c = [c] + +macron :: Char -> String +macron 'A' = "Ā" +macron 'E' = "Ē" +macron 'I' = "Ī" +macron 'O' = "Ō" +macron 'U' = "Ū" +macron 'a' = "ā" +macron 'e' = "ē" +macron 'i' = "ī" +macron 'o' = "ō" +macron 'u' = "ū" +macron c = [c] + +cedilla :: Char -> String +cedilla 'c' = "ç" +cedilla 'C' = "Ç" +cedilla 's' = "ş" +cedilla 'S' = "Ş" +cedilla 't' = "ţ" +cedilla 'T' = "Ţ" +cedilla 'e' = "ȩ" +cedilla 'E' = "Ȩ" +cedilla 'h' = "ḩ" +cedilla 'H' = "Ḩ" +cedilla 'o' = "o̧" +cedilla 'O' = "O̧" +cedilla c = [c] + +hacek :: Char -> String +hacek 'A' = "Ǎ" +hacek 'a' = "ǎ" +hacek 'C' = "Č" +hacek 'c' = "č" +hacek 'D' = "Ď" +hacek 'd' = "ď" +hacek 'E' = "Ě" +hacek 'e' = "ě" +hacek 'G' = "Ǧ" +hacek 'g' = "ǧ" +hacek 'H' = "Ȟ" +hacek 'h' = "ȟ" +hacek 'I' = "Ǐ" +hacek 'i' = "ǐ" +hacek 'j' = "ǰ" +hacek 'K' = "Ǩ" +hacek 'k' = "ǩ" +hacek 'L' = "Ľ" +hacek 'l' = "ľ" +hacek 'N' = "Ň" +hacek 'n' = "ň" +hacek 'O' = "Ǒ" +hacek 'o' = "ǒ" +hacek 'R' = "Ř" +hacek 'r' = "ř" +hacek 'S' = "Š" +hacek 's' = "š" +hacek 'T' = "Ť" +hacek 't' = "ť" +hacek 'U' = "Ǔ" +hacek 'u' = "ǔ" +hacek 'Z' = "Ž" +hacek 'z' = "ž" +hacek c = [c] + +breve :: Char -> String +breve 'A' = "Ă" +breve 'a' = "ă" +breve 'E' = "Ĕ" +breve 'e' = "ĕ" +breve 'G' = "Ğ" +breve 'g' = "ğ" +breve 'I' = "Ĭ" +breve 'i' = "ĭ" +breve 'O' = "Ŏ" +breve 'o' = "ŏ" +breve 'U' = "Ŭ" +breve 'u' = "ŭ" +breve c = [c] tok :: LP Inlines tok = try $ grouped inline <|> inlineCommand <|> str <$> (count 1 $ inlineChar) @@ -684,7 +787,7 @@ inlineText :: LP Inlines inlineText = str <$> many1 inlineChar inlineChar :: LP Char -inlineChar = noneOf "\\$%^_&~#{}^'`-[] \t\n" +inlineChar = noneOf "\\$%^_&~#{}^'`\"‘’“”-[] \t\n" environment :: LP Blocks environment = do @@ -703,31 +806,107 @@ rawEnv name = do (withRaw (env name blocks) >>= applyMacros' . snd) else env name blocks +---- + +type IncludeParser = ParsecT [Char] [String] IO String + -- | Replace "include" commands with file contents. handleIncludes :: String -> IO String -handleIncludes = handleIncludes' [] - --- parents parameter prevents infinite include loops -handleIncludes' :: [FilePath] -> String -> IO String -handleIncludes' _ [] = return [] -handleIncludes' parents ('\\':'%':xs) = - ("\\%"++) `fmap` handleIncludes' parents xs -handleIncludes' parents ('%':xs) = handleIncludes' parents - $ drop 1 $ dropWhile (/='\n') xs -handleIncludes' parents ('\\':xs) = - case runParser include defaultParserState "input" ('\\':xs) of - Right (fs, rest) -> do yss <- mapM (\f -> if f `elem` parents - then "" <$ warn ("Include file loop in '" - ++ f ++ "'.") - else readTeXFile f >>= - handleIncludes' (f:parents)) fs - rest' <- handleIncludes' parents rest - return $ intercalate "\n" yss ++ rest' - _ -> case runParser (verbCmd <|> verbatimEnv) defaultParserState - "input" ('\\':xs) of - Right (r, rest) -> (r ++) `fmap` handleIncludes' parents rest - _ -> ('\\':) `fmap` handleIncludes' parents xs -handleIncludes' parents (x:xs) = (x:) `fmap` handleIncludes' parents xs +handleIncludes s = do + res <- runParserT includeParser' [] "input" s + case res of + Right s' -> return s' + Left e -> error $ show e + +includeParser' :: IncludeParser +includeParser' = + concat <$> many (comment' <|> escaped' <|> blob' <|> include' + <|> startMarker' <|> endMarker' + <|> verbCmd' <|> verbatimEnv' <|> backslash') + +comment' :: IncludeParser +comment' = do + char '%' + xs <- manyTill anyChar newline + return ('%':xs ++ "\n") + +escaped' :: IncludeParser +escaped' = try $ string "\\%" <|> string "\\\\" + +verbCmd' :: IncludeParser +verbCmd' = fmap snd <$> + withRaw $ try $ do + string "\\verb" + c <- anyChar + manyTill anyChar (char c) + +verbatimEnv' :: IncludeParser +verbatimEnv' = fmap snd <$> + withRaw $ try $ do + string "\\begin" + name <- braced' + guard $ name `elem` ["verbatim", "Verbatim", "lstlisting", + "minted", "alltt"] + manyTill anyChar (try $ string $ "\\end{" ++ name ++ "}") + +blob' :: IncludeParser +blob' = try $ many1 (noneOf "\\%") + +backslash' :: IncludeParser +backslash' = string "\\" + +braced' :: IncludeParser +braced' = try $ char '{' *> manyTill (satisfy (/='}')) (char '}') + +include' :: IncludeParser +include' = do + fs' <- try $ do + char '\\' + name <- try (string "include") + <|> try (string "input") + <|> string "usepackage" + -- skip options + skipMany $ try $ char '[' *> (manyTill anyChar (char ']')) + fs <- (map trim . splitBy (==',')) <$> braced' + return $ if name == "usepackage" + then map (flip replaceExtension ".sty") fs + else map (flip replaceExtension ".tex") fs + pos <- getPosition + containers <- getState + let fn = case containers of + (f':_) -> f' + [] -> "input" + -- now process each include file in order... + rest <- getInput + results' <- forM fs' (\f -> do + when (f `elem` containers) $ + fail "Include file loop!" + contents <- lift $ readTeXFile f + return $ "\\PandocStartInclude{" ++ f ++ "}" ++ + contents ++ "\\PandocEndInclude{" ++ + fn ++ "}{" ++ show (sourceLine pos) ++ "}{" + ++ show (sourceColumn pos) ++ "}") + setInput $ concat results' ++ rest + return "" + +startMarker' :: IncludeParser +startMarker' = try $ do + string "\\PandocStartInclude" + fn <- braced' + updateState (fn:) + setPosition $ newPos fn 1 1 + return $ "\\PandocStartInclude{" ++ fn ++ "}" + +endMarker' :: IncludeParser +endMarker' = try $ do + string "\\PandocEndInclude" + fn <- braced' + ln <- braced' + co <- braced' + updateState tail + setPosition $ newPos fn (fromMaybe 1 $ safeRead ln) (fromMaybe 1 $ safeRead co) + return $ "\\PandocEndInclude{" ++ fn ++ "}{" ++ ln ++ "}{" ++ + co ++ "}" readTeXFile :: FilePath -> IO String readTeXFile f = do @@ -742,27 +921,7 @@ readFileFromDirs (d:ds) f = E.catch (UTF8.readFile $ d </> f) $ \(_ :: E.SomeException) -> readFileFromDirs ds f -include :: LP ([FilePath], String) -include = do - name <- controlSeq "include" - <|> controlSeq "input" - <|> controlSeq "usepackage" - skipopts - fs <- (splitBy (==',')) <$> braced - rest <- getInput - let fs' = if name == "usepackage" - then map (flip replaceExtension ".sty") fs - else map (flip replaceExtension ".tex") fs - return (fs', rest) - -verbCmd :: LP (String, String) -verbCmd = do - (_,r) <- withRaw $ do - controlSeq "verb" - c <- anyChar - manyTill anyChar (char c) - rest <- getInput - return (r, rest) +---- keyval :: LP (String, String) keyval = try $ do @@ -778,24 +937,12 @@ keyvals :: LP [(String, String)] keyvals = try $ char '[' *> manyTill keyval (char ']') alltt :: String -> LP Blocks -alltt t = bottomUp strToCode <$> parseFromString blocks +alltt t = walk strToCode <$> parseFromString blocks (substitute " " "\\ " $ substitute "%" "\\%" $ concat $ intersperse "\\\\\n" $ lines t) where strToCode (Str s) = Code nullAttr s strToCode x = x -verbatimEnv :: LP (String, String) -verbatimEnv = do - (_,r) <- withRaw $ do - controlSeq "begin" - name <- braced - guard $ name == "verbatim" || name == "Verbatim" || - name == "lstlisting" || name == "minted" || - name == "alltt" - verbEnv name - rest <- getInput - return (r,rest) - rawLaTeXBlock :: Parser [Char] ParserState String rawLaTeXBlock = snd <$> try (withRaw (environment <|> blockCommand)) @@ -804,12 +951,33 @@ rawLaTeXInline = do raw <- (snd <$> withRaw inlineCommand) <|> (snd <$> withRaw blockCommand) RawInline "latex" <$> applyMacros' raw +addImageCaption :: Blocks -> LP Blocks +addImageCaption = walkM go + where go (Image alt (src,tit)) = do + mbcapt <- stateCaption <$> getState + case mbcapt of + Just ils -> return (Image (toList ils) (src, "fig:")) + Nothing -> return (Image alt (src,tit)) + go x = return x + +addTableCaption :: Blocks -> LP Blocks +addTableCaption = walkM go + where go (Table c als ws hs rs) = do + mbcapt <- stateCaption <$> getState + case mbcapt of + Just ils -> return (Table (toList ils) als ws hs rs) + Nothing -> return (Table c als ws hs rs) + go x = return x + environments :: M.Map String (LP Blocks) environments = M.fromList [ ("document", env "document" blocks <* skipMany anyChar) , ("letter", env "letter" letter_contents) - , ("figure", env "figure" $ skipopts *> blocks) + , ("figure", env "figure" $ + resetCaption *> skipopts *> blocks >>= addImageCaption) , ("center", env "center" blocks) + , ("table", env "table" $ + resetCaption *> skipopts *> blocks >>= addTableCaption) , ("tabular", env "tabular" simpTable) , ("quote", blockQuote <$> env "quote" blocks) , ("quotation", blockQuote <$> env "quotation" blocks) @@ -838,7 +1006,7 @@ environments = M.fromList lookup "numbers" options == Just "left" ] ++ maybe [] (:[]) (lookup "language" options >>= fromListingsLanguage) - let attr = ("",classes,kvs) + let attr = (fromMaybe "" (lookup "label" options),classes,kvs) codeBlockWith attr <$> (verbEnv "lstlisting")) , ("minted", do options <- option [] keyvals lang <- grouped (many1 $ satisfy (/='}')) @@ -966,15 +1134,15 @@ paragraph = do preamble :: LP Blocks preamble = mempty <$> manyTill preambleBlock beginDoc - where beginDoc = lookAhead $ controlSeq "begin" *> string "{document}" - preambleBlock = (mempty <$ comment) - <|> (mempty <$ sp) - <|> (mempty <$ blanklines) - <|> (mempty <$ macro) - <|> blockCommand - <|> (mempty <$ anyControlSeq) - <|> (mempty <$ braced) - <|> (mempty <$ anyChar) + where beginDoc = lookAhead $ try $ controlSeq "begin" *> string "{document}" + preambleBlock = (void comment) + <|> (void sp) + <|> (void blanklines) + <|> (void macro) + <|> (void blockCommand) + <|> (void anyControlSeq) + <|> (void braced) + <|> (void anyChar) ------- @@ -986,12 +1154,8 @@ addPrefix _ _ = [] addSuffix :: [Inline] -> [Citation] -> [Citation] addSuffix s ks@(_:_) = - let k = last ks - s' = case s of - (Str (c:_):_) - | not (isPunctuation c || isSpace c) -> Str "," : Space : s - _ -> s - in init ks ++ [k {citationSuffix = citationSuffix k ++ s'}] + let k = last ks + in init ks ++ [k {citationSuffix = citationSuffix k ++ s}] addSuffix _ _ = [] simpleCiteArgs :: LP [Citation] @@ -999,6 +1163,7 @@ simpleCiteArgs = try $ do first <- optionMaybe $ toList <$> opt second <- optionMaybe $ toList <$> opt char '{' + optional sp keys <- manyTill citationLabel (char '}') let (pre, suf) = case (first , second ) of (Just s , Nothing) -> (mempty, s ) @@ -1014,18 +1179,24 @@ simpleCiteArgs = try $ do return $ addPrefix pre $ addSuffix suf $ map conv keys citationLabel :: LP String -citationLabel = trim <$> - (many1 (satisfy $ \c -> c /=',' && c /='}') <* optional (char ',') <* optional sp) +citationLabel = optional sp *> + (many1 (satisfy isBibtexKeyChar) + <* optional sp + <* optional (char ',') + <* optional sp) + where isBibtexKeyChar c = isAlphaNum c || c `elem` ".:;?!`'()/*@_+=-[]*" cites :: CitationMode -> Bool -> LP [Citation] cites mode multi = try $ do cits <- if multi then many1 simpleCiteArgs else count 1 simpleCiteArgs - let (c:cs) = concat cits + let cs = concat cits return $ case mode of - AuthorInText -> c {citationMode = mode} : cs - _ -> map (\a -> a {citationMode = mode}) (c:cs) + AuthorInText -> case cs of + (c:rest) -> c {citationMode = mode} : rest + [] -> [] + _ -> map (\a -> a {citationMode = mode}) cs citation :: String -> CitationMode -> Bool -> LP Inlines citation name mode multi = do @@ -1057,12 +1228,14 @@ complexNatbibCitation mode = try $ do parseAligns :: LP [Alignment] parseAligns = try $ do char '{' - optional $ char '|' + let maybeBar = skipMany $ sp <|> () <$ char '|' <|> () <$ try (string "@{}") + maybeBar let cAlign = AlignCenter <$ char 'c' let lAlign = AlignLeft <$ char 'l' let rAlign = AlignRight <$ char 'r' - let alignChar = optional sp *> (cAlign <|> lAlign <|> rAlign) - aligns' <- sepEndBy alignChar (optional $ char '|') + let parAlign = AlignLeft <$ (char 'p' >> braced) + let alignChar = cAlign <|> lAlign <|> rAlign <|> parAlign + aligns' <- sepEndBy alignChar maybeBar spaces char '}' spaces @@ -1082,10 +1255,14 @@ parseTableRow :: Int -- ^ number of columns parseTableRow cols = try $ do let tableCellInline = notFollowedBy (amp <|> lbreak) >> inline let tableCell = (plain . trimInlines . mconcat) <$> many tableCellInline - cells' <- sepBy tableCell amp - guard $ length cells' == cols + cells' <- sepBy1 tableCell amp + let numcells = length cells' + guard $ numcells <= cols && numcells >= 1 + guard $ cells' /= [mempty] + -- note: a & b in a three-column table leaves an empty 3rd cell: + let cells'' = cells' ++ replicate (cols - numcells) mempty spaces - return cells' + return cells'' simpTable :: LP Blocks simpTable = try $ do @@ -1096,9 +1273,23 @@ simpTable = try $ do header' <- option [] $ try (parseTableRow cols <* lbreak <* hline) rows <- sepEndBy (parseTableRow cols) (lbreak <* optional hline) spaces + skipMany (comment *> spaces) let header'' = if null header' then replicate cols mempty else header' lookAhead $ controlSeq "end" -- make sure we're at end return $ table mempty (zip aligns (repeat 0)) header'' rows +startInclude :: LP Blocks +startInclude = do + fn <- braced + setPosition $ newPos fn 1 1 + return mempty + +endInclude :: LP Blocks +endInclude = do + fn <- braced + ln <- braced + co <- braced + setPosition $ newPos fn (fromMaybe 1 $ safeRead ln) (fromMaybe 1 $ safeRead co) + return mempty diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index a3500fbcf..690256224 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -1,6 +1,6 @@ {-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7 {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Markdown - Copyright : Copyright (C) 2006-2013 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -33,6 +33,7 @@ module Text.Pandoc.Readers.Markdown ( readMarkdown, import Data.List ( transpose, sortBy, findIndex, intersperse, intercalate ) import qualified Data.Map as M +import Data.Scientific (coefficient, base10Exponent) import Data.Ord ( comparing ) import Data.Char ( isAlphaNum, toLower ) import Data.Maybe @@ -49,13 +50,10 @@ import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>)) import Text.Pandoc.Options import Text.Pandoc.Shared import Text.Pandoc.XML (fromEntities) -import Text.Pandoc.Asciify (toAsciiChar) import Text.Pandoc.Parsing hiding (tableWith) import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock ) import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, isInlineTag, isBlockTag, isTextTag, isCommentTag ) -import Text.Pandoc.Biblio (processBiblio) -import qualified Text.CSL as CSL import Data.Monoid (mconcat, mempty) import Control.Applicative ((<$>), (<*), (*>), (<$)) import Control.Monad @@ -63,6 +61,8 @@ import System.FilePath (takeExtension, addExtension) import Text.HTML.TagSoup import Text.HTML.TagSoup.Match (tagOpen) import qualified Data.Set as Set +import Text.Printf (printf) +import Debug.Trace (trace) type MarkdownParser = Parser [Char] ParserState @@ -203,13 +203,10 @@ dateLine = try $ do skipSpaces trimInlinesF . mconcat <$> manyTill inline newline -titleBlock :: MarkdownParser (F (Pandoc -> Pandoc)) -titleBlock = pandocTitleBlock - <|> yamlTitleBlock - <|> mmdTitleBlock - <|> return (return id) +titleBlock :: MarkdownParser () +titleBlock = pandocTitleBlock <|> mmdTitleBlock -pandocTitleBlock :: MarkdownParser (F (Pandoc -> Pandoc)) +pandocTitleBlock :: MarkdownParser () pandocTitleBlock = try $ do guardEnabled Ext_pandoc_title_block lookAhead (char '%') @@ -217,49 +214,61 @@ pandocTitleBlock = try $ do author <- option (return []) authorsLine date <- option mempty dateLine optional blanklines - return $ do - title' <- title - author' <- author - date' <- date - return $ B.setMeta "title" title' - . B.setMeta "author" author' - . B.setMeta "date" date' - -yamlTitleBlock :: MarkdownParser (F (Pandoc -> Pandoc)) -yamlTitleBlock = try $ do + let meta' = do title' <- title + author' <- author + date' <- date + return $ + (if B.isNull title' then id else B.setMeta "title" title') + . (if null author' then id else B.setMeta "author" author') + . (if B.isNull date' then id else B.setMeta "date" date') + $ nullMeta + updateState $ \st -> st{ stateMeta' = stateMeta' st <> meta' } + +yamlMetaBlock :: MarkdownParser (F Blocks) +yamlMetaBlock = try $ do guardEnabled Ext_yaml_metadata_block pos <- getPosition string "---" blankline - rawYaml <- unlines <$> manyTill anyLine stopLine + notFollowedBy blankline -- if --- is followed by a blank it's an HRULE + rawYamlLines <- manyTill anyLine stopLine + -- by including --- and ..., we allow yaml blocks with just comments: + let rawYaml = unlines ("---" : (rawYamlLines ++ ["..."])) optional blanklines opts <- stateOptions <$> getState - case Yaml.decodeEither' $ UTF8.fromString rawYaml of - Right (Yaml.Object hashmap) -> return $ return $ - H.foldrWithKey (\k v f -> - if ignorable k - then f - else B.setMeta (T.unpack k) (yamlToMeta opts v) . f) - id hashmap - Right _ -> do - addWarning (Just pos) "YAML header is not an object" - return $ return id - Left err' -> do - case err' of - InvalidYaml (Just YamlParseException{ - yamlProblem = problem - , yamlContext = _ctxt - , yamlProblemMark = Yaml.YamlMark { - yamlLine = yline - , yamlColumn = ycol - }}) -> - addWarning (Just $ setSourceLine - (setSourceColumn pos (sourceColumn pos + ycol)) - (sourceLine pos + 1 + yline)) - $ "Could not parse YAML header: " ++ problem - _ -> addWarning (Just pos) - $ "Could not parse YAML header: " ++ show err' - return $ return id + meta' <- case Yaml.decodeEither' $ UTF8.fromString rawYaml of + Right (Yaml.Object hashmap) -> return $ return $ + H.foldrWithKey (\k v m -> + if ignorable k + then m + else B.setMeta (T.unpack k) + (yamlToMeta opts v) m) + nullMeta hashmap + Right Yaml.Null -> return $ return nullMeta + Right _ -> do + addWarning (Just pos) "YAML header is not an object" + return $ return nullMeta + Left err' -> do + case err' of + InvalidYaml (Just YamlParseException{ + yamlProblem = problem + , yamlContext = _ctxt + , yamlProblemMark = Yaml.YamlMark { + yamlLine = yline + , yamlColumn = ycol + }}) -> + addWarning (Just $ setSourceLine + (setSourceColumn pos + (sourceColumn pos + ycol)) + (sourceLine pos + 1 + yline)) + $ "Could not parse YAML header: " ++ + problem + _ -> addWarning (Just pos) + $ "Could not parse YAML header: " ++ + show err' + return $ return nullMeta + updateState $ \st -> st{ stateMeta' = stateMeta' st <> meta' } + return mempty -- ignore fields ending with _ ignorable :: Text -> Bool @@ -277,8 +286,12 @@ toMetaValue opts x = yamlToMeta :: ReaderOptions -> Yaml.Value -> MetaValue yamlToMeta opts (Yaml.String t) = toMetaValue opts t -yamlToMeta _ (Yaml.Number n) = MetaString $ show n -yamlToMeta _ (Yaml.Bool b) = MetaString $ map toLower $ show b +yamlToMeta _ (Yaml.Number n) + -- avoid decimal points for numbers that don't need them: + | base10Exponent n >= 0 = MetaString $ show + $ coefficient n * (10 ^ base10Exponent n) + | otherwise = MetaString $ show n +yamlToMeta _ (Yaml.Bool b) = MetaBool b yamlToMeta opts (Yaml.Array xs) = B.toMetaValue $ map (yamlToMeta opts) $ V.toList xs yamlToMeta opts (Yaml.Object o) = MetaMap $ H.foldrWithKey (\k v m -> @@ -292,13 +305,13 @@ yamlToMeta _ _ = MetaString "" stopLine :: MarkdownParser () stopLine = try $ (string "---" <|> string "...") >> blankline >> return () -mmdTitleBlock :: MarkdownParser (F (Pandoc -> Pandoc)) +mmdTitleBlock :: MarkdownParser () mmdTitleBlock = try $ do guardEnabled Ext_mmd_title_block kvPairs <- many1 kvPair blanklines - return $ return $ \(Pandoc m bs) -> - Pandoc (foldl (\m' (k,v) -> addMetaField k v m') m kvPairs) bs + updateState $ \st -> st{ stateMeta' = stateMeta' st <> + return (Meta $ M.fromList kvPairs) } kvPair :: MarkdownParser (String, MetaValue) kvPair = try $ do @@ -315,15 +328,12 @@ parseMarkdown = do updateState $ \state -> state { stateOptions = let oldOpts = stateOptions state in oldOpts{ readerParseRaw = True } } - titleTrans <- option (return id) titleBlock + optional titleBlock blocks <- parseBlocks st <- getState - mbsty <- getOption readerCitationStyle - refs <- getOption readerReferences - return $ processBiblio mbsty refs - $ runF titleTrans st - $ B.doc - $ runF blocks st + let meta = runF (stateMeta' st) st + let Pandoc _ bs = B.doc $ runF blocks st + return $ Pandoc meta bs addWarning :: Maybe SourcePos -> String -> MarkdownParser () addWarning mbpos msg = @@ -339,10 +349,8 @@ referenceKey = try $ do char ':' skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[') let sourceURL = liftM unwords $ many $ try $ do - notFollowedBy' referenceTitle - skipMany spaceChar - optional $ newline >> notFollowedBy blankline skipMany spaceChar + notFollowedBy' referenceTitle notFollowedBy' (() <$ reference) many1 $ notFollowedBy space >> litChar let betweenAngles = try $ char '<' >> @@ -351,7 +359,7 @@ referenceKey = try $ do tit <- option "" referenceTitle -- currently we just ignore MMD-style link/image attributes _kvs <- option [] $ guardEnabled Ext_link_attributes - >> many (spnl >> keyValAttr) + >> many (try $ spnl >> keyValAttr) blanklines let target = (escapeURI $ trimr src, tit) st <- getState @@ -437,19 +445,26 @@ parseBlocks :: MarkdownParser (F Blocks) parseBlocks = mconcat <$> manyTill block eof block :: MarkdownParser (F Blocks) -block = choice [ mempty <$ blanklines +block = do + tr <- getOption readerTrace + pos <- getPosition + res <- choice [ mempty <$ blanklines , codeBlockFenced + , yamlMetaBlock , guardEnabled Ext_latex_macros *> (macro >>= return . return) + -- note: bulletList needs to be before header because of + -- the possibility of empty list items: - + , bulletList , header , lhsCodeBlock , rawTeXBlock + , divHtml , htmlBlock , table , lineBlock , codeBlockIndented , blockQuote , hrule - , bulletList , orderedList , definitionList , noteBlock @@ -458,6 +473,11 @@ block = choice [ mempty <$ blanklines , para , plain ] <?> "block" + when tr $ do + st <- getState + trace (printf "line %d: %s" (sourceLine pos) + (take 60 $ show $ B.toList $ runF res st)) (return ()) + return res -- -- header blocks @@ -466,39 +486,15 @@ block = choice [ mempty <$ blanklines header :: MarkdownParser (F Blocks) header = setextHeader <|> atxHeader <?> "header" --- returns unique identifier -addToHeaderList :: Attr -> F Inlines -> MarkdownParser Attr -addToHeaderList (ident,classes,kvs) text = do - let header' = runF text defaultParserState - exts <- getOption readerExtensions - let insert' = M.insertWith (\_new old -> old) - if null ident && Ext_auto_identifiers `Set.member` exts - then do - ids <- stateIdentifiers `fmap` getState - let id' = uniqueIdent (B.toList header') ids - let id'' = if Ext_ascii_identifiers `Set.member` exts - then catMaybes $ map toAsciiChar id' - else id' - updateState $ \st -> st{ - stateIdentifiers = if id' == id'' - then id' : ids - else id' : id'' : ids, - stateHeaders = insert' header' id' $ stateHeaders st } - return (id'',classes,kvs) - else do - unless (null ident) $ - updateState $ \st -> st{ - stateHeaders = insert' header' ident $ stateHeaders st } - return (ident,classes,kvs) - atxHeader :: MarkdownParser (F Blocks) atxHeader = try $ do level <- many1 (char '#') >>= return . length - notFollowedBy (char '.' <|> char ')') -- this would be a list + notFollowedBy $ guardEnabled Ext_fancy_lists >> + (char '.' <|> char ')') -- this would be a list skipSpaces text <- trimInlinesF . mconcat <$> many (notFollowedBy atxClosing >> inline) attr <- atxClosing - attr' <- addToHeaderList attr text + attr' <- registerHeader attr (runF text defaultParserState) return $ B.headerWith attr' level <$> text atxClosing :: MarkdownParser Attr @@ -537,7 +533,7 @@ setextHeader = try $ do many (char underlineChar) blanklines let level = (fromMaybe 0 $ findIndex (== underlineChar) setextHChars) + 1 - attr' <- addToHeaderList attr text + attr' <- registerHeader attr (runF text defaultParserState) return $ B.headerWith attr' level <$> text -- @@ -622,12 +618,19 @@ codeBlockFenced = try $ do skipMany spaceChar attr <- option ([],[],[]) $ try (guardEnabled Ext_fenced_code_attributes >> attributes) - <|> ((\x -> ("",[x],[])) <$> identifier) + <|> ((\x -> ("",[toLanguageId x],[])) <$> many1 nonspaceChar) blankline contents <- manyTill anyLine (blockDelimiter (== c) (Just size)) blanklines return $ return $ B.codeBlockWith attr $ intercalate "\n" contents +-- correctly handle github language identifiers +toLanguageId :: String -> String +toLanguageId = map toLower . go + where go "c++" = "cpp" + go "objective-c" = "objectivec" + go x = x + codeBlockIndented :: MarkdownParser (F Blocks) codeBlockIndented = do contents <- many1 (indentedLine <|> @@ -718,7 +721,7 @@ bulletListStart = try $ do skipNonindentSpaces notFollowedBy' (() <$ hrule) -- because hrules start out just like lists satisfy isBulletListMarker - spaceChar + spaceChar <|> lookAhead newline skipSpaces anyOrderedListStart :: MarkdownParser (Int, ListNumberStyle, ListNumberDelim) @@ -746,11 +749,16 @@ listStart = bulletListStart <|> (anyOrderedListStart >> return ()) -- parse a line of a list item (start = parser for beginning of list item) listLine :: MarkdownParser String listLine = try $ do - notFollowedBy blankline notFollowedBy' (do indentSpaces - many (spaceChar) + many spaceChar listStart) - chunks <- manyTill (liftM snd (htmlTag isCommentTag) <|> count 1 anyChar) newline + notFollowedBy' $ htmlTag (~== TagClose "div") + optional (() <$ indentSpaces) + chunks <- manyTill + ( many1 (satisfy $ \c -> c /= '\n' && c /= '<') + <|> liftM snd (htmlTag isCommentTag) + <|> count 1 anyChar + ) newline return $ concat chunks -- parse raw text for one list item, excluding start marker and continuations @@ -759,7 +767,7 @@ rawListItem :: MarkdownParser a rawListItem start = try $ do start first <- listLine - rest <- many (notFollowedBy listStart >> listLine) + rest <- many (notFollowedBy listStart >> notFollowedBy blankline >> listLine) blanks <- many blankline return $ unlines (first:rest) ++ blanks @@ -777,6 +785,7 @@ listContinuationLine :: MarkdownParser String listContinuationLine = try $ do notFollowedBy blankline notFollowedBy' listStart + notFollowedBy' $ htmlTag (~== TagClose "div") optional indentSpaces result <- anyLine return $ result ++ "\n" @@ -801,8 +810,8 @@ listItem start = try $ do orderedList :: MarkdownParser (F Blocks) orderedList = try $ do (start, style, delim) <- lookAhead anyOrderedListStart - unless ((style == DefaultStyle || style == Decimal || style == Example) && - (delim == DefaultDelim || delim == Period)) $ + unless (style `elem` [DefaultStyle, Decimal, Example] && + delim `elem` [DefaultDelim, Period]) $ guardEnabled Ext_fancy_lists when (style == Example) $ guardEnabled Ext_example_lists items <- fmap sequence $ many1 $ listItem @@ -863,22 +872,6 @@ definitionList = do items <- fmap sequence $ many1 definitionListItem return $ B.definitionList <$> fmap compactify'DL items -compactify'DL :: [(Inlines, [Blocks])] -> [(Inlines, [Blocks])] -compactify'DL items = - let defs = concatMap snd items - defBlocks = reverse $ concatMap B.toList defs - isPara (Para _) = True - isPara _ = False - in case defBlocks of - (Para x:_) -> if not $ any isPara (drop 1 defBlocks) - then let (t,ds) = last items - lastDef = B.toList $ last ds - ds' = init ds ++ - [B.fromList $ init lastDef ++ [Plain x]] - in init items ++ [(t, ds')] - else items - _ -> items - -- -- paragraph block -- @@ -891,8 +884,11 @@ para = try $ do $ try $ do newline (blanklines >> return mempty) - <|> (guardDisabled Ext_blank_before_blockquote >> lookAhead blockQuote) - <|> (guardDisabled Ext_blank_before_header >> lookAhead header) + <|> (guardDisabled Ext_blank_before_blockquote >> () <$ lookAhead blockQuote) + <|> (guardEnabled Ext_backtick_code_blocks >> () <$ lookAhead codeBlockFenced) + <|> (guardDisabled Ext_blank_before_header >> () <$ lookAhead header) + <|> (guardEnabled Ext_lists_without_preceding_blankline >> + () <$ lookAhead listStart) return $ do result' <- result case B.toList result' of @@ -911,7 +907,9 @@ plain = fmap B.plain . trimInlinesF . mconcat <$> many1 inline -- htmlElement :: MarkdownParser String -htmlElement = strictHtmlBlock <|> liftM snd (htmlTag isBlockTag) +htmlElement = rawVerbatimBlock + <|> strictHtmlBlock + <|> liftM snd (htmlTag isBlockTag) htmlBlock :: MarkdownParser (F Blocks) htmlBlock = do @@ -932,8 +930,8 @@ strictHtmlBlock = htmlInBalanced (not . isInlineTag) rawVerbatimBlock :: MarkdownParser String rawVerbatimBlock = try $ do - (TagOpen tag _, open) <- htmlTag (tagOpen (\t -> - t == "pre" || t == "style" || t == "script") + (TagOpen tag _, open) <- htmlTag (tagOpen (flip elem + ["pre", "style", "script"]) (const True)) contents <- manyTill anyChar (htmlTag (~== TagClose tag)) return $ open ++ contents ++ renderTags [TagClose tag] @@ -941,8 +939,10 @@ rawVerbatimBlock = try $ do rawTeXBlock :: MarkdownParser (F Blocks) rawTeXBlock = do guardEnabled Ext_raw_tex - result <- (B.rawBlock "latex" <$> rawLaTeXBlock) - <|> (B.rawBlock "context" <$> rawConTeXtEnvironment) + result <- (B.rawBlock "latex" . concat <$> + rawLaTeXBlock `sepEndBy1` blankline) + <|> (B.rawBlock "context" . concat <$> + rawConTeXtEnvironment `sepEndBy1` blankline) spaces return $ return result @@ -951,6 +951,8 @@ rawHtmlBlocks = do htmlBlocks <- many1 $ try $ do s <- rawVerbatimBlock <|> try ( do (t,raw) <- htmlTag isBlockTag + guard $ t ~/= TagOpen "div" [] && + t ~/= TagClose "div" exts <- getOption readerExtensions -- if open tag, need markdown="1" if -- markdown_attributes extension is set @@ -1117,13 +1119,11 @@ multilineTable headless = multilineTableHeader :: Bool -- ^ Headerless table -> MarkdownParser (F [Blocks], [Alignment], [Int]) multilineTableHeader headless = try $ do - if headless - then return '\n' - else tableSep >>~ notFollowedBy blankline + unless headless $ + tableSep >> notFollowedBy blankline rawContent <- if headless then return $ repeat "" - else many1 - (notFollowedBy tableSep >> many1Till anyChar newline) + else many1 $ notFollowedBy tableSep >> anyLine initSp <- nonindentSpaces dashes <- many1 (dashedLine '-') newline @@ -1133,12 +1133,12 @@ multilineTableHeader headless = try $ do then liftM (map (:[]) . tail . splitStringByIndices (init indices)) $ lookAhead anyLine else return $ transpose $ map - (\ln -> tail $ splitStringByIndices (init indices) ln) + (tail . splitStringByIndices (init indices)) rawContent let aligns = zipWith alignType rawHeadsList lengths let rawHeads = if headless then replicate (length dashes) "" - else map unwords rawHeadsList + else map (unlines . map trim) rawHeadsList heads <- fmap sequence $ mapM (parseFromString (mconcat <$> many plain)) $ map trim rawHeads @@ -1195,7 +1195,7 @@ gridTableHeader headless = try $ do -- RST does not have a notion of alignments let rawHeads = if headless then replicate (length dashes) "" - else map unwords $ transpose + else map (unlines . map trim) $ transpose $ map (gridTableSplitLine indices) rawContent heads <- fmap sequence $ mapM (parseFromString parseBlocks . trim) rawHeads return (heads, aligns, indices) @@ -1227,11 +1227,20 @@ removeOneLeadingSpace xs = gridTableFooter :: MarkdownParser [Char] gridTableFooter = blanklines +pipeBreak :: MarkdownParser [Alignment] +pipeBreak = try $ do + nonindentSpaces + openPipe <- (True <$ char '|') <|> return False + first <- pipeTableHeaderPart + rest <- many $ sepPipe *> pipeTableHeaderPart + -- surrounding pipes needed for a one-column table: + guard $ not (null rest && not openPipe) + optional (char '|') + blankline + return (first:rest) + pipeTable :: MarkdownParser ([Alignment], [Double], F [Blocks], F [[Blocks]]) pipeTable = try $ do - let pipeBreak = nonindentSpaces *> optional (char '|') *> - pipeTableHeaderPart `sepBy1` sepPipe <* - optional (char '|') <* blankline (heads,aligns) <- try ( pipeBreak >>= \als -> return (return $ replicate (length als) mempty, als)) <|> ( pipeTableRow >>= \row -> pipeBreak >>= \als -> @@ -1250,12 +1259,13 @@ sepPipe = try $ do pipeTableRow :: MarkdownParser (F [Blocks]) pipeTableRow = do nonindentSpaces - optional (char '|') + openPipe <- (True <$ char '|') <|> return False let cell = mconcat <$> many (notFollowedBy (blankline <|> char '|') >> inline) first <- cell - sepPipe - rest <- cell `sepBy1` sepPipe + rest <- many $ sepPipe *> cell + -- surrounding pipes needed for a one-column table: + guard $ not (null rest && not openPipe) optional (char '|') blankline let cells = sequence (first:rest) @@ -1340,19 +1350,18 @@ inline = choice [ whitespace , str , endline , code - , fours - , strong - , emph + , strongOrEmph , note , cite , link , image , math , strikeout - , superscript , subscript + , superscript , inlineNote -- after superscript because of ^[link](/foo)^ , autoLink + , spanHtml , rawHtmlInline , escapedChar , rawLaTeXInline' @@ -1382,7 +1391,7 @@ ltSign :: MarkdownParser (F Inlines) ltSign = do guardDisabled Ext_raw_html <|> guardDisabled Ext_markdown_in_html_blocks - <|> (notFollowedBy' rawHtmlBlocks >> return ()) + <|> (notFollowedBy' (htmlTag isBlockTag) >> return ()) char '<' return $ return $ B.str "<" @@ -1422,47 +1431,57 @@ math :: MarkdownParser (F Inlines) math = (return . B.displayMath <$> (mathDisplay >>= applyMacros')) <|> (return . B.math <$> (mathInline >>= applyMacros')) -mathDisplay :: MarkdownParser String -mathDisplay = - (guardEnabled Ext_tex_math_dollars >> mathDisplayWith "$$" "$$") - <|> (guardEnabled Ext_tex_math_single_backslash >> - mathDisplayWith "\\[" "\\]") - <|> (guardEnabled Ext_tex_math_double_backslash >> - mathDisplayWith "\\\\[" "\\\\]") - -mathDisplayWith :: String -> String -> MarkdownParser String -mathDisplayWith op cl = try $ do - string op - many1Till (noneOf "\n" <|> (newline >>~ notFollowedBy' blankline)) (try $ string cl) - -mathInline :: MarkdownParser String -mathInline = - (guardEnabled Ext_tex_math_dollars >> mathInlineWith "$" "$") - <|> (guardEnabled Ext_tex_math_single_backslash >> - mathInlineWith "\\(" "\\)") - <|> (guardEnabled Ext_tex_math_double_backslash >> - mathInlineWith "\\\\(" "\\\\)") - -mathInlineWith :: String -> String -> MarkdownParser String -mathInlineWith op cl = try $ do - string op - notFollowedBy space - words' <- many1Till (count 1 (noneOf "\n\\") - <|> (char '\\' >> anyChar >>= \c -> return ['\\',c]) - <|> count 1 newline <* notFollowedBy' blankline - *> return " ") - (try $ string cl) - notFollowedBy digit -- to prevent capture of $5 - return $ concat words' - --- to avoid performance problems, treat 4 or more _ or * or ~ or ^ in a row --- as a literal rather than attempting to parse for emph/strong/strikeout/super/sub -fours :: Parser [Char] st (F Inlines) -fours = try $ do - x <- char '*' <|> char '_' <|> char '~' <|> char '^' - count 2 $ satisfy (==x) - rest <- many1 (satisfy (==x)) - return $ return $ B.str (x:x:x:rest) +-- Parses material enclosed in *s, **s, _s, or __s. +-- Designed to avoid backtracking. +enclosure :: Char + -> MarkdownParser (F Inlines) +enclosure c = do + cs <- many1 (char c) + (return (B.str cs) <>) <$> whitespace + <|> case length cs of + 3 -> three c + 2 -> two c mempty + 1 -> one c mempty + _ -> return (return $ B.str cs) + +-- Parse inlines til you hit one c or a sequence of two cs. +-- If one c, emit emph and then parse two. +-- If two cs, emit strong and then parse one. +-- Otherwise, emit ccc then the results. +three :: Char -> MarkdownParser (F Inlines) +three c = do + contents <- mconcat <$> many (notFollowedBy (char c) >> inline) + (try (string [c,c,c]) >> return ((B.strong . B.emph) <$> contents)) + <|> (try (string [c,c]) >> one c (B.strong <$> contents)) + <|> (char c >> two c (B.emph <$> contents)) + <|> return (return (B.str [c,c,c]) <> contents) + +-- Parse inlines til you hit two c's, and emit strong. +-- If you never do hit two cs, emit ** plus inlines parsed. +two :: Char -> F Inlines -> MarkdownParser (F Inlines) +two c prefix' = do + let ender = try $ string [c,c] + contents <- mconcat <$> many (try $ notFollowedBy ender >> inline) + (ender >> return (B.strong <$> (prefix' <> contents))) + <|> return (return (B.str [c,c]) <> (prefix' <> contents)) + +-- Parse inlines til you hit a c, and emit emph. +-- If you never hit a c, emit * plus inlines parsed. +one :: Char -> F Inlines -> MarkdownParser (F Inlines) +one c prefix' = do + contents <- mconcat <$> many ( (notFollowedBy (char c) >> inline) + <|> try (string [c,c] >> + notFollowedBy (char c) >> + two c mempty) ) + (char c >> return (B.emph <$> (prefix' <> contents))) + <|> return (return (B.str [c]) <> (prefix' <> contents)) + +strongOrEmph :: MarkdownParser (F Inlines) +strongOrEmph = enclosure '*' <|> (checkIntraword >> enclosure '_') + where checkIntraword = do + exts <- getOption readerExtensions + when (Ext_intraword_underscores `Set.member` exts) $ do + guard =<< notAfterString -- | Parses a list of inlines between start and end delimiters. inlinesBetween :: (Show b) @@ -1474,28 +1493,6 @@ inlinesBetween start end = where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) innerSpace = try $ whitespace >>~ notFollowedBy' end -emph :: MarkdownParser (F Inlines) -emph = fmap B.emph <$> nested - (inlinesBetween starStart starEnd <|> inlinesBetween ulStart ulEnd) - where starStart = char '*' >> lookAhead nonspaceChar - starEnd = notFollowedBy' (() <$ strong) >> char '*' - ulStart = checkIntraword >> char '_' >> lookAhead nonspaceChar - ulEnd = notFollowedBy' (() <$ strong) >> char '_' - checkIntraword = do - exts <- getOption readerExtensions - when (Ext_intraword_underscores `Set.member` exts) $ do - pos <- getPosition - lastStrPos <- stateLastStrPos <$> getState - guard $ lastStrPos /= Just pos - -strong :: MarkdownParser (F Inlines) -strong = fmap B.strong <$> nested - (inlinesBetween starStart starEnd <|> inlinesBetween ulStart ulEnd) - where starStart = string "**" >> lookAhead nonspaceChar - starEnd = try $ string "**" - ulStart = string "__" >> lookAhead nonspaceChar - ulEnd = try $ string "__" - strikeout :: MarkdownParser (F Inlines) strikeout = fmap B.strikeout <$> (guardEnabled Ext_strikeout >> inlinesBetween strikeStart strikeEnd) @@ -1526,8 +1523,7 @@ nonEndline = satisfy (/='\n') str :: MarkdownParser (F Inlines) str = do result <- many1 alphaNum - pos <- getPosition - updateState $ \s -> s{ stateLastStrPos = Just pos } + updateLastStrPos let spacesToNbr = map (\c -> if c == ' ' then '\160' else c) isSmart <- getOption readerSmart if isSmart @@ -1558,14 +1554,17 @@ endline :: MarkdownParser (F Inlines) endline = try $ do newline notFollowedBy blankline - guardEnabled Ext_blank_before_blockquote <|> notFollowedBy emailBlockQuoteStart - guardEnabled Ext_blank_before_header <|> notFollowedBy (char '#') -- atx header -- parse potential list-starts differently if in a list: st <- getState - when (stateParserContext st == ListItemState) $ do - notFollowedBy' bulletListStart - notFollowedBy' anyOrderedListStart - (guardEnabled Ext_hard_line_breaks >> return (return B.linebreak)) + when (stateParserContext st == ListItemState) $ notFollowedBy listStart + guardDisabled Ext_lists_without_preceding_blankline <|> notFollowedBy listStart + guardEnabled Ext_blank_before_blockquote <|> notFollowedBy emailBlockQuoteStart + guardEnabled Ext_blank_before_header <|> notFollowedBy (char '#') -- atx header + guardDisabled Ext_backtick_code_blocks <|> + notFollowedBy (() <$ (lookAhead (char '`') >> codeBlockFenced)) + (eof >> return mempty) + <|> (guardEnabled Ext_hard_line_breaks >> return (return B.linebreak)) + <|> (guardEnabled Ext_ignore_line_breaks >> return mempty) <|> (return $ return B.space) -- @@ -1660,6 +1659,7 @@ bareURL :: MarkdownParser (F Inlines) bareURL = try $ do guardEnabled Ext_autolink_bare_uris (orig, src) <- uri <|> emailAddress + notFollowedBy $ try $ spaces >> htmlTag (~== TagClose "a") return $ return $ B.link src "" (B.str orig) autoLink :: MarkdownParser (F Inlines) @@ -1730,6 +1730,38 @@ inBrackets parser = do char ']' return $ "[" ++ contents ++ "]" +spanHtml :: MarkdownParser (F Inlines) +spanHtml = try $ do + guardEnabled Ext_markdown_in_html_blocks + (TagOpen _ attrs, _) <- htmlTag (~== TagOpen "span" []) + contents <- mconcat <$> manyTill inline (htmlTag (~== TagClose "span")) + let ident = fromMaybe "" $ lookup "id" attrs + let classes = maybe [] words $ lookup "class" attrs + let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + case lookup "style" keyvals of + Just s | null ident && null classes && + map toLower (filter (`notElem` " \t;") s) == + "font-variant:small-caps" + -> return $ B.smallcaps <$> contents + _ -> return $ B.spanWith (ident, classes, keyvals) <$> contents + +divHtml :: MarkdownParser (F Blocks) +divHtml = try $ do + guardEnabled Ext_markdown_in_html_blocks + (TagOpen _ attrs, rawtag) <- htmlTag (~== TagOpen "div" []) + bls <- option "" (blankline >> option "" blanklines) + contents <- mconcat <$> + many (notFollowedBy' (htmlTag (~== TagClose "div")) >> block) + closed <- option False (True <$ htmlTag (~== TagClose "div")) + if closed + then do + let ident = fromMaybe "" $ lookup "id" attrs + let classes = maybe [] words $ lookup "class" attrs + let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + return $ B.divWith (ident, classes, keyvals) <$> contents + else -- avoid backtracing + return $ return (B.rawBlock "html" (rawtag <> bls)) <> contents + rawHtmlInline :: MarkdownParser (F Inlines) rawHtmlInline = do guardEnabled Ext_raw_html @@ -1745,11 +1777,12 @@ rawHtmlInline = do cite :: MarkdownParser (F Inlines) cite = do guardEnabled Ext_citations - getOption readerReferences >>= guard . not . null - citations <- textualCite <|> normalCite - return $ flip B.cite mempty <$> citations + citations <- textualCite + <|> do (cs, raw) <- withRaw normalCite + return $ (flip B.cite (B.text raw)) <$> cs + return citations -textualCite :: MarkdownParser (F [Citation]) +textualCite :: MarkdownParser (F Inlines) textualCite = try $ do (_, key) <- citeKey let first = Citation{ citationId = key @@ -1759,10 +1792,18 @@ textualCite = try $ do , citationNoteNum = 0 , citationHash = 0 } - mbrest <- option Nothing $ try $ spnl >> Just <$> normalCite + mbrest <- option Nothing $ try $ spnl >> Just <$> withRaw normalCite case mbrest of - Just rest -> return $ (first:) <$> rest - Nothing -> option (return [first]) $ bareloc first + Just (rest, raw) -> + return $ (flip B.cite (B.text $ '@':key ++ " " ++ raw) . (first:)) + <$> rest + Nothing -> + (do (cs, raw) <- withRaw $ bareloc first + return $ (flip B.cite (B.text $ '@':key ++ " " ++ raw)) <$> cs) + <|> return (do st <- askF + return $ case M.lookup key (stateExamples st) of + Just n -> B.str (show n) + _ -> B.cite [first] $ B.str $ '@':key) bareloc :: Citation -> MarkdownParser (F [Citation]) bareloc c = try $ do @@ -1786,18 +1827,6 @@ normalCite = try $ do char ']' return citations -citeKey :: MarkdownParser (Bool, String) -citeKey = try $ do - suppress_author <- option False (char '-' >> return True) - char '@' - first <- letter - let internal p = try $ p >>~ lookAhead (letter <|> digit) - rest <- many $ letter <|> digit <|> internal (oneOf ":.#$%&-_+?<>~/") - let key = first:rest - citations' <- map CSL.refId <$> getOption readerReferences - guard $ key `elem` citations' - return (suppress_author, key) - suffix :: MarkdownParser (F Inlines) suffix = try $ do hasSpace <- option False (notFollowedBy nonspaceChar >> return True) @@ -1836,7 +1865,7 @@ smart :: MarkdownParser (F Inlines) smart = do getOption readerSmart >>= guard doubleQuoted <|> singleQuoted <|> - choice (map (return . B.singleton <$>) [apostrophe, dash, ellipses]) + choice (map (return <$>) [apostrophe, dash, ellipses]) singleQuoted :: MarkdownParser (F Inlines) singleQuoted = try $ do @@ -1855,4 +1884,3 @@ doubleQuoted = try $ do (withQuoteContext InDoubleQuote $ doubleQuoteEnd >> return (fmap B.doubleQuoted . trimInlinesF $ contents)) <|> (return $ return (B.str "\8220") <> contents) - diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs index 56049e035..f1dcce8f7 100644 --- a/src/Text/Pandoc/Readers/MediaWiki.hs +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -1,6 +1,7 @@ -{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7 +{-# LANGUAGE RelaxedPolyRec, FlexibleInstances, TypeSynonymInstances #-} +-- RelaxedPolyRec needed for inlinesBetween on GHC < 7 {- - Copyright (C) 2012 John MacFarlane <jgm@berkeley.edu> + Copyright (C) 2012-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +20,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.MediaWiki - Copyright : Copyright (C) 2012 John MacFarlane + Copyright : Copyright (C) 2012-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -42,8 +43,8 @@ import Text.Pandoc.Options import Text.Pandoc.Readers.HTML ( htmlTag, isBlockTag, isCommentTag ) import Text.Pandoc.XML ( fromEntities ) import Text.Pandoc.Parsing hiding ( nested ) -import Text.Pandoc.Generic ( bottomUp ) -import Text.Pandoc.Shared ( stripTrailingNewlines, safeRead ) +import Text.Pandoc.Walk ( walk ) +import Text.Pandoc.Shared ( stripTrailingNewlines, safeRead, stringify, trim ) import Data.Monoid (mconcat, mempty) import Control.Applicative ((<$>), (<*), (*>), (<$)) import Control.Monad @@ -51,7 +52,11 @@ import Data.List (intersperse, intercalate, isPrefixOf ) import Text.HTML.TagSoup import Data.Sequence (viewl, ViewL(..), (<|)) import qualified Data.Foldable as F +import qualified Data.Map as M import Data.Char (isDigit, isSpace) +import Data.Maybe (fromMaybe) +import Text.Printf (printf) +import Debug.Trace (trace) -- | Read mediawiki from an input string and return a Pandoc document. readMediaWiki :: ReaderOptions -- ^ Reader options @@ -62,6 +67,8 @@ readMediaWiki opts s = , mwMaxNestingLevel = 4 , mwNextLinkNumber = 1 , mwCategoryLinks = [] + , mwHeaderMap = M.empty + , mwIdentifierList = [] } "source" (s ++ "\n") of Left err' -> error $ "\nError:\n" ++ show err' @@ -71,10 +78,23 @@ data MWState = MWState { mwOptions :: ReaderOptions , mwMaxNestingLevel :: Int , mwNextLinkNumber :: Int , mwCategoryLinks :: [Inlines] + , mwHeaderMap :: M.Map Inlines String + , mwIdentifierList :: [String] } type MWParser = Parser [Char] MWState +instance HasReaderOptions MWState where + extractReaderOptions = mwOptions + +instance HasHeaderMap MWState where + extractHeaderMap = mwHeaderMap + updateHeaderMap f st = st{ mwHeaderMap = f $ mwHeaderMap st } + +instance HasIdentifierList MWState where + extractIdentifierList = mwIdentifierList + updateIdentifierList f st = st{ mwIdentifierList = f $ mwIdentifierList st } + -- -- auxiliary functions -- @@ -91,7 +111,7 @@ nested p = do return res specialChars :: [Char] -specialChars = "'[]<=&*{}|\"" +specialChars = "'[]<=&*{}|\":\\" spaceChars :: [Char] spaceChars = " \n\t" @@ -131,9 +151,16 @@ inlinesInTags tag = try $ do blocksInTags :: String -> MWParser Blocks blocksInTags tag = try $ do (_,raw) <- htmlTag (~== TagOpen tag []) + let closer = if tag == "li" + then htmlTag (~== TagClose "li") + <|> lookAhead ( + htmlTag (~== TagOpen "li" []) + <|> htmlTag (~== TagClose "ol") + <|> htmlTag (~== TagClose "ul")) + else htmlTag (~== TagClose tag) if '/' `elem` raw -- self-closing tag then return mempty - else mconcat <$> manyTill block (htmlTag (~== TagClose tag)) + else mconcat <$> manyTill block closer charsInTags :: String -> MWParser [Char] charsInTags tag = try $ do @@ -162,7 +189,10 @@ parseMediaWiki = do -- block :: MWParser Blocks -block = mempty <$ skipMany1 blankline +block = do + tr <- getOption readerTrace + pos <- getPosition + res <- mempty <$ skipMany1 blankline <|> table <|> header <|> hrule @@ -174,6 +204,10 @@ block = mempty <$ skipMany1 blankline <|> blockTag <|> (B.rawBlock "mediawiki" <$> template) <|> para + when tr $ + trace (printf "line %d: %s" (sourceLine pos) + (take 60 $ show $ B.toList res)) (return ()) + return res para :: MWParser Blocks para = do @@ -187,7 +221,7 @@ table = do tableStart styles <- option [] parseAttrs <* blankline let tableWidth = case lookup "width" styles of - Just w -> maybe 1.0 id $ parseWidth w + Just w -> fromMaybe 1.0 $ parseWidth w Nothing -> 1.0 caption <- option mempty tableCaption optional rowsep @@ -202,6 +236,7 @@ table = do let widths' = map (\w -> if w == 0 then defaultwidth else w) widths let cellspecs = zip (map fst cellspecs') widths' rows' <- many $ try $ rowsep *> (map snd <$> tableRow) + optional blanklines tableEnd let cols = length hdr let (headers,rows) = if hasheader @@ -250,7 +285,7 @@ tableCaption = try $ do (trimInlines . mconcat) <$> many (notFollowedBy (cellsep <|> rowsep) *> inline) tableRow :: MWParser [((Alignment, Double), Blocks)] -tableRow = try $ many tableCell +tableRow = try $ skipMany htmlComment *> many tableCell tableCell :: MWParser ((Alignment, Double), Blocks) tableCell = try $ do @@ -268,7 +303,7 @@ tableCell = try $ do Just "center" -> AlignCenter _ -> AlignDefault let width = case lookup "width" attrs of - Just xs -> maybe 0.0 id $ parseWidth xs + Just xs -> fromMaybe 0.0 $ parseWidth xs Nothing -> 0.0 return ((align, width), bs) @@ -282,6 +317,7 @@ template :: MWParser String template = try $ do string "{{" notFollowedBy (char '{') + lookAhead $ letter <|> digit <|> char ':' let chunk = template <|> variable <|> many1 (noneOf "{}") <|> count 1 anyChar contents <- manyTill chunk (try $ string "}}") return $ "{{" ++ concat contents ++ "}}" @@ -342,7 +378,7 @@ preformatted = try $ do spacesStr _ = False if F.all spacesStr contents then return mempty - else return $ B.para $ bottomUp strToCode contents + else return $ B.para $ walk strToCode contents header :: MWParser Blocks header = try $ do @@ -351,7 +387,8 @@ header = try $ do let lev = length eqs guard $ lev <= 6 contents <- trimInlines . mconcat <$> manyTill inline (count lev $ char '=') - return $ B.header lev contents + attr <- registerHeader nullAttr contents + return $ B.headerWith attr lev contents bulletList :: MWParser Blocks bulletList = B.bulletList <$> @@ -362,15 +399,13 @@ bulletList = B.bulletList <$> orderedList :: MWParser Blocks orderedList = (B.orderedList <$> many1 (listItem '#')) - <|> (B.orderedList <$> (htmlTag (~== TagOpen "ul" []) *> spaces *> - many (listItem '#' <|> li) <* - optional (htmlTag (~== TagClose "ul")))) - <|> do (tag,_) <- htmlTag (~== TagOpen "ol" []) - spaces - items <- many (listItem '#' <|> li) - optional (htmlTag (~== TagClose "ol")) - let start = maybe 1 id $ safeRead $ fromAttrib "start" tag - return $ B.orderedListWith (start, DefaultStyle, DefaultDelim) items + <|> try + (do (tag,_) <- htmlTag (~== TagOpen "ol" []) + spaces + items <- many (listItem '#' <|> li) + optional (htmlTag (~== TagClose "ol")) + let start = fromMaybe 1 $ safeRead $ fromAttrib "start" tag + return $ B.orderedListWith (start, DefaultStyle, DefaultDelim) items) definitionList :: MWParser Blocks definitionList = B.definitionList <$> many1 defListItem @@ -380,8 +415,9 @@ defListItem = try $ do terms <- mconcat . intersperse B.linebreak <$> many defListTerm -- we allow dd with no dt, or dt with no dd defs <- if B.isNull terms - then many1 $ listItem ':' - else many $ listItem ':' + then notFollowedBy (try $ string ":<math>") *> + many1 (listItem ':') + else many (listItem ':') return (terms, defs) defListTerm :: MWParser Inlines @@ -413,7 +449,8 @@ listItem c = try $ do skipMany spaceChar first <- concat <$> manyTill listChunk newline rest <- many - (try $ string extras *> (concat <$> manyTill listChunk newline)) + (try $ string extras *> lookAhead listStartChar *> + (concat <$> manyTill listChunk newline)) contents <- parseFromString (many1 $ listItem' c) (unlines (first : rest)) case c of @@ -462,6 +499,7 @@ inline = whitespace <|> image <|> internalLink <|> externalLink + <|> math <|> inlineTag <|> B.singleton <$> charRef <|> inlineHtml @@ -472,6 +510,16 @@ inline = whitespace str :: MWParser Inlines str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars) +math :: MWParser Inlines +math = (B.displayMath . trim <$> try (char ':' >> charsInTags "math")) + <|> (B.math . trim <$> charsInTags "math") + <|> (B.displayMath . trim <$> try (dmStart *> manyTill anyChar dmEnd)) + <|> (B.math . trim <$> try (mStart *> manyTill (satisfy (/='\n')) mEnd)) + where dmStart = string "\\[" + dmEnd = try (string "\\]") + mStart = string "\\(" + mEnd = try (string "\\)") + variable :: MWParser String variable = try $ do string "{{{" @@ -495,7 +543,6 @@ inlineTag = do TagOpen "del" _ -> B.strikeout <$> inlinesInTags "del" TagOpen "sub" _ -> B.subscript <$> inlinesInTags "sub" TagOpen "sup" _ -> B.superscript <$> inlinesInTags "sup" - TagOpen "math" _ -> B.math <$> charsInTags "math" TagOpen "code" _ -> B.code <$> charsInTags "code" TagOpen "tt" _ -> B.code <$> charsInTags "tt" TagOpen "hask" _ -> B.codeWith ("",["haskell"],[]) <$> charsInTags "hask" @@ -520,15 +567,19 @@ endline = () <$ try (newline <* notFollowedBy' header <* notFollowedBy anyListStart) +imageIdentifiers :: [MWParser ()] +imageIdentifiers = [sym (identifier ++ ":") | identifier <- identifiers] + where identifiers = ["File", "Image", "Archivo", "Datei", "Fichier"] + image :: MWParser Inlines image = try $ do sym "[[" - sym "File:" + choice imageIdentifiers fname <- many1 (noneOf "|]") _ <- many (try $ char '|' *> imageOption) caption <- (B.str fname <$ sym "]]") <|> try (char '|' *> (mconcat <$> manyTill inline (sym "]]"))) - return $ B.image fname "image" caption + return $ B.image fname ("fig:" ++ stringify caption) caption imageOption :: MWParser String imageOption = diff --git a/src/Text/Pandoc/Readers/Native.hs b/src/Text/Pandoc/Readers/Native.hs index c5d4cb98a..f4dfa62c1 100644 --- a/src/Text/Pandoc/Readers/Native.hs +++ b/src/Text/Pandoc/Readers/Native.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2011 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2011-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Native - Copyright : Copyright (C) 2011 John MacFarlane + Copyright : Copyright (C) 2011-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Readers/OPML.hs b/src/Text/Pandoc/Readers/OPML.hs index c9726d195..35d01e877 100644 --- a/src/Text/Pandoc/Readers/OPML.hs +++ b/src/Text/Pandoc/Readers/OPML.hs @@ -6,7 +6,7 @@ import Text.Pandoc.Builder import Text.Pandoc.Readers.HTML (readHtml) import Text.Pandoc.Readers.Markdown (readMarkdown) import Text.XML.Light -import Text.HTML.TagSoup.Entity (lookupEntity) +import Text.Pandoc.Compat.TagSoupEntity (lookupEntity) import Data.Generics import Data.Monoid import Control.Monad.State diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs new file mode 100644 index 000000000..7a35e2ca0 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org.hs @@ -0,0 +1,1379 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{- +Copyright (C) 2014 Albert Krewinkel <tarleb@moltkeplatz.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org + Copyright : Copyright (C) 2014 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel <tarleb@moltkeplatz.de> + +Conversion of org-mode formatted plain text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Org ( readOrg ) where + +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder ( Inlines, Blocks, HasMeta(..), (<>) + , trimInlines ) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import qualified Text.Pandoc.Parsing as P +import Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF + , newline, orderedListMarker + , parseFromString + ) +import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline) +import Text.Pandoc.Shared (compactify', compactify'DL) +import Text.TeXMath (texMathToPandoc, DisplayType(..)) + +import Control.Applicative ( Applicative, pure + , (<$>), (<$), (<*>), (<*), (*>), (<**>) ) +import Control.Arrow (first) +import Control.Monad (foldM, guard, liftM, liftM2, mplus, mzero, when) +import Control.Monad.Reader (Reader, runReader, ask, asks) +import Data.Char (isAlphaNum, toLower) +import Data.Default +import Data.List (intersperse, isPrefixOf, isSuffixOf) +import qualified Data.Map as M +import Data.Maybe (fromMaybe, isJust) +import Data.Monoid (Monoid, mconcat, mempty, mappend) +import Network.HTTP (urlEncode) + +-- | Parse org-mode string and return a Pandoc document. +readOrg :: ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> Pandoc +readOrg opts s = readWith parseOrg def{ orgStateOptions = opts } (s ++ "\n\n") + +type OrgParser = Parser [Char] OrgParserState + +parseOrg :: OrgParser Pandoc +parseOrg = do + blocks' <- parseBlocks + st <- getState + let meta = runF (orgStateMeta' st) st + return $ Pandoc meta $ filter (/= Null) (B.toList $ runF blocks' st) + +-- +-- Parser State for Org +-- + +type OrgNoteRecord = (String, F Blocks) +type OrgNoteTable = [OrgNoteRecord] + +type OrgBlockAttributes = M.Map String String + +type OrgLinkFormatters = M.Map String (String -> String) + +-- | Org-mode parser state +data OrgParserState = OrgParserState + { orgStateOptions :: ReaderOptions + , orgStateAnchorIds :: [String] + , orgStateBlockAttributes :: OrgBlockAttributes + , orgStateEmphasisCharStack :: [Char] + , orgStateEmphasisNewlines :: Maybe Int + , orgStateLastForbiddenCharPos :: Maybe SourcePos + , orgStateLastPreCharPos :: Maybe SourcePos + , orgStateLastStrPos :: Maybe SourcePos + , orgStateLinkFormatters :: OrgLinkFormatters + , orgStateMeta :: Meta + , orgStateMeta' :: F Meta + , orgStateNotes' :: OrgNoteTable + } + +instance HasReaderOptions OrgParserState where + extractReaderOptions = orgStateOptions + +instance HasMeta OrgParserState where + setMeta field val st = + st{ orgStateMeta = setMeta field val $ orgStateMeta st } + deleteMeta field st = + st{ orgStateMeta = deleteMeta field $ orgStateMeta st } + +instance HasLastStrPosition OrgParserState where + getLastStrPos = orgStateLastStrPos + setLastStrPos pos st = st{ orgStateLastStrPos = Just pos } + +instance Default OrgParserState where + def = defaultOrgParserState + +defaultOrgParserState :: OrgParserState +defaultOrgParserState = OrgParserState + { orgStateOptions = def + , orgStateAnchorIds = [] + , orgStateBlockAttributes = M.empty + , orgStateEmphasisCharStack = [] + , orgStateEmphasisNewlines = Nothing + , orgStateLastForbiddenCharPos = Nothing + , orgStateLastPreCharPos = Nothing + , orgStateLastStrPos = Nothing + , orgStateLinkFormatters = M.empty + , orgStateMeta = nullMeta + , orgStateMeta' = return nullMeta + , orgStateNotes' = [] + } + +recordAnchorId :: String -> OrgParser () +recordAnchorId i = updateState $ \s -> + s{ orgStateAnchorIds = i : (orgStateAnchorIds s) } + +addBlockAttribute :: String -> String -> OrgParser () +addBlockAttribute key val = updateState $ \s -> + let attrs = orgStateBlockAttributes s + in s{ orgStateBlockAttributes = M.insert key val attrs } + +lookupBlockAttribute :: String -> OrgParser (Maybe String) +lookupBlockAttribute key = + M.lookup key . orgStateBlockAttributes <$> getState + +resetBlockAttributes :: OrgParser () +resetBlockAttributes = updateState $ \s -> + s{ orgStateBlockAttributes = orgStateBlockAttributes def } + +updateLastForbiddenCharPos :: OrgParser () +updateLastForbiddenCharPos = getPosition >>= \p -> + updateState $ \s -> s{ orgStateLastForbiddenCharPos = Just p} + +updateLastPreCharPos :: OrgParser () +updateLastPreCharPos = getPosition >>= \p -> + updateState $ \s -> s{ orgStateLastPreCharPos = Just p} + +pushToInlineCharStack :: Char -> OrgParser () +pushToInlineCharStack c = updateState $ \s -> + s{ orgStateEmphasisCharStack = c:orgStateEmphasisCharStack s } + +popInlineCharStack :: OrgParser () +popInlineCharStack = updateState $ \s -> + s{ orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ s } + +surroundingEmphasisChar :: OrgParser [Char] +surroundingEmphasisChar = + take 1 . drop 1 . orgStateEmphasisCharStack <$> getState + +startEmphasisNewlinesCounting :: Int -> OrgParser () +startEmphasisNewlinesCounting maxNewlines = updateState $ \s -> + s{ orgStateEmphasisNewlines = Just maxNewlines } + +decEmphasisNewlinesCount :: OrgParser () +decEmphasisNewlinesCount = updateState $ \s -> + s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s } + +newlinesCountWithinLimits :: OrgParser Bool +newlinesCountWithinLimits = do + st <- getState + return $ ((< 0) <$> orgStateEmphasisNewlines st) /= Just True + +resetEmphasisNewlines :: OrgParser () +resetEmphasisNewlines = updateState $ \s -> + s{ orgStateEmphasisNewlines = Nothing } + +addLinkFormat :: String + -> (String -> String) + -> OrgParser () +addLinkFormat key formatter = updateState $ \s -> + let fs = orgStateLinkFormatters s + in s{ orgStateLinkFormatters = M.insert key formatter fs } + +addToNotesTable :: OrgNoteRecord -> OrgParser () +addToNotesTable note = do + oldnotes <- orgStateNotes' <$> getState + updateState $ \s -> s{ orgStateNotes' = note:oldnotes } + +-- The version Text.Pandoc.Parsing cannot be used, as we need additional parts +-- of the state saved and restored. +parseFromString :: OrgParser a -> String -> OrgParser a +parseFromString parser str' = do + oldLastPreCharPos <- orgStateLastPreCharPos <$> getState + updateState $ \s -> s{ orgStateLastPreCharPos = Nothing } + result <- P.parseFromString parser str' + updateState $ \s -> s{ orgStateLastPreCharPos = oldLastPreCharPos } + return result + + +-- +-- Adaptions and specializations of parsing utilities +-- + +newtype F a = F { unF :: Reader OrgParserState a + } deriving (Monad, Applicative, Functor) + +runF :: F a -> OrgParserState -> a +runF = runReader . unF + +askF :: F OrgParserState +askF = F ask + +asksF :: (OrgParserState -> a) -> F a +asksF f = F $ asks f + +instance Monoid a => Monoid (F a) where + mempty = return mempty + mappend = liftM2 mappend + mconcat = fmap mconcat . sequence + +trimInlinesF :: F Inlines -> F Inlines +trimInlinesF = liftM trimInlines + +returnF :: a -> OrgParser (F a) +returnF = return . return + + +-- | Like @Text.Parsec.Char.newline@, but causes additional state changes. +newline :: OrgParser Char +newline = + P.newline + <* updateLastPreCharPos + <* updateLastForbiddenCharPos + +-- +-- parsing blocks +-- + +parseBlocks :: OrgParser (F Blocks) +parseBlocks = mconcat <$> manyTill block eof + +block :: OrgParser (F Blocks) +block = choice [ mempty <$ blanklines + , optionalAttributes $ choice + [ orgBlock + , figure + , table + ] + , example + , drawer + , specialLine + , header + , return <$> hline + , list + , latexFragment + , noteBlock + , paraOrPlain + ] <?> "block" + +optionalAttributes :: OrgParser (F Blocks) -> OrgParser (F Blocks) +optionalAttributes parser = try $ + resetBlockAttributes *> parseBlockAttributes *> parser + +parseBlockAttributes :: OrgParser () +parseBlockAttributes = do + attrs <- many attribute + () <$ mapM (uncurry parseAndAddAttribute) attrs + where + attribute :: OrgParser (String, String) + attribute = try $ do + key <- metaLineStart *> many1Till nonspaceChar (char ':') + val <- skipSpaces *> anyLine + return (map toLower key, val) + +parseAndAddAttribute :: String -> String -> OrgParser () +parseAndAddAttribute key value = do + let key' = map toLower key + () <$ addBlockAttribute key' value + +lookupInlinesAttr :: String -> OrgParser (Maybe (F Inlines)) +lookupInlinesAttr attr = try $ do + val <- lookupBlockAttribute attr + maybe (return Nothing) + (fmap Just . parseFromString parseInlines) + val + + +-- +-- Org Blocks (#+BEGIN_... / #+END_...) +-- + +type BlockProperties = (Int, String) -- (Indentation, Block-Type) + +orgBlock :: OrgParser (F Blocks) +orgBlock = try $ do + blockProp@(_, blkType) <- blockHeaderStart + ($ blockProp) $ + case blkType of + "comment" -> withRaw' (const mempty) + "html" -> withRaw' (return . (B.rawBlock blkType)) + "latex" -> withRaw' (return . (B.rawBlock blkType)) + "ascii" -> withRaw' (return . (B.rawBlock blkType)) + "example" -> withRaw' (return . exampleCode) + "quote" -> withParsed (fmap B.blockQuote) + "verse" -> verseBlock + "src" -> codeBlock + _ -> withParsed (fmap $ divWithClass blkType) + +blockHeaderStart :: OrgParser (Int, String) +blockHeaderStart = try $ (,) <$> indent <*> blockType + where + indent = length <$> many spaceChar + blockType = map toLower <$> (stringAnyCase "#+begin_" *> orgArgWord) + +withRaw' :: (String -> F Blocks) -> BlockProperties -> OrgParser (F Blocks) +withRaw' f blockProp = (ignHeaders *> (f <$> rawBlockContent blockProp)) + +withParsed :: (F Blocks -> F Blocks) -> BlockProperties -> OrgParser (F Blocks) +withParsed f blockProp = (ignHeaders *> (f <$> parsedBlockContent blockProp)) + +ignHeaders :: OrgParser () +ignHeaders = (() <$ newline) <|> (() <$ anyLine) + +divWithClass :: String -> Blocks -> Blocks +divWithClass cls = B.divWith ("", [cls], []) + +verseBlock :: BlockProperties -> OrgParser (F Blocks) +verseBlock blkProp = try $ do + ignHeaders + content <- rawBlockContent blkProp + fmap B.para . mconcat . intersperse (pure B.linebreak) + <$> mapM (parseFromString parseInlines) (lines content) + +codeBlock :: BlockProperties -> OrgParser (F Blocks) +codeBlock blkProp = do + skipSpaces + (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) + id' <- fromMaybe "" <$> lookupBlockAttribute "name" + content <- rawBlockContent blkProp + let codeBlck = B.codeBlockWith ( id', classes, kv ) content + maybe (pure codeBlck) (labelDiv codeBlck) <$> lookupInlinesAttr "caption" + where + labelDiv blk value = + B.divWith nullAttr <$> (mappend <$> labelledBlock value + <*> pure blk) + labelledBlock = fmap (B.plain . B.spanWith ("", ["label"], [])) + +rawBlockContent :: BlockProperties -> OrgParser String +rawBlockContent (indent, blockType) = try $ + unlines . map commaEscaped <$> manyTill indentedLine blockEnder + where + indentedLine = try $ ("" <$ blankline) <|> (indentWith indent *> anyLine) + blockEnder = try $ indentWith indent *> stringAnyCase ("#+end_" <> blockType) + +parsedBlockContent :: BlockProperties -> OrgParser (F Blocks) +parsedBlockContent blkProps = try $ do + raw <- rawBlockContent blkProps + parseFromString parseBlocks (raw ++ "\n") + +-- indent by specified number of spaces (or equiv. tabs) +indentWith :: Int -> OrgParser String +indentWith num = do + tabStop <- getOption readerTabStop + if num < tabStop + then count num (char ' ') + else choice [ try (count num (char ' ')) + , try (char '\t' >> count (num - tabStop) (char ' ')) ] + +type SwitchOption = (Char, Maybe String) + +orgArgWord :: OrgParser String +orgArgWord = many1 orgArgWordChar + +-- | Parse code block arguments +-- TODO: We currently don't handle switches. +codeHeaderArgs :: OrgParser ([String], [(String, String)]) +codeHeaderArgs = try $ do + language <- skipSpaces *> orgArgWord + _ <- skipSpaces *> (try $ switch `sepBy` (many1 spaceChar)) + parameters <- manyTill blockOption newline + let pandocLang = translateLang language + return $ + if hasRundocParameters parameters + then ( [ pandocLang, rundocBlockClass ] + , map toRundocAttrib (("language", language) : parameters) + ) + else ([ pandocLang ], parameters) + where hasRundocParameters = not . null + +switch :: OrgParser SwitchOption +switch = try $ simpleSwitch <|> lineNumbersSwitch + where + simpleSwitch = (\c -> (c, Nothing)) <$> (oneOf "-+" *> letter) + lineNumbersSwitch = (\ls -> ('l', Just ls)) <$> + (string "-l \"" *> many1Till nonspaceChar (char '"')) + +translateLang :: String -> String +translateLang "C" = "c" +translateLang "C++" = "cpp" +translateLang "emacs-lisp" = "commonlisp" -- emacs lisp is not supported +translateLang "js" = "javascript" +translateLang "lisp" = "commonlisp" +translateLang "R" = "r" +translateLang "sh" = "bash" +translateLang "sqlite" = "sql" +translateLang cs = cs + +-- | Prefix used for Rundoc classes and arguments. +rundocPrefix :: String +rundocPrefix = "rundoc-" + +-- | The class-name used to mark rundoc blocks. +rundocBlockClass :: String +rundocBlockClass = rundocPrefix ++ "block" + +blockOption :: OrgParser (String, String) +blockOption = try $ (,) <$> orgArgKey <*> orgParamValue + +inlineBlockOption :: OrgParser (String, String) +inlineBlockOption = try $ (,) <$> orgArgKey <*> orgInlineParamValue + +orgArgKey :: OrgParser String +orgArgKey = try $ + skipSpaces *> char ':' + *> many1 orgArgWordChar + +orgParamValue :: OrgParser String +orgParamValue = try $ + skipSpaces *> many1 (noneOf "\t\n\r ") <* skipSpaces + +orgInlineParamValue :: OrgParser String +orgInlineParamValue = try $ + skipSpaces *> many1 (noneOf "\t\n\r ]") <* skipSpaces + +orgArgWordChar :: OrgParser Char +orgArgWordChar = alphaNum <|> oneOf "-_" + +toRundocAttrib :: (String, String) -> (String, String) +toRundocAttrib = first ("rundoc-" ++) + +commaEscaped :: String -> String +commaEscaped (',':cs@('*':_)) = cs +commaEscaped (',':cs@('#':'+':_)) = cs +commaEscaped cs = cs + +example :: OrgParser (F Blocks) +example = try $ do + return . return . exampleCode =<< unlines <$> many1 exampleLine + +exampleCode :: String -> Blocks +exampleCode = B.codeBlockWith ("", ["example"], []) + +exampleLine :: OrgParser String +exampleLine = try $ string ": " *> anyLine + +-- Drawers for properties or a logbook +drawer :: OrgParser (F Blocks) +drawer = try $ do + drawerStart + manyTill drawerLine (try drawerEnd) + return mempty + +drawerStart :: OrgParser String +drawerStart = try $ + skipSpaces *> drawerName <* skipSpaces <* P.newline + where drawerName = try $ char ':' *> validDrawerName <* char ':' + validDrawerName = stringAnyCase "PROPERTIES" + <|> stringAnyCase "LOGBOOK" + +drawerLine :: OrgParser String +drawerLine = try anyLine + +drawerEnd :: OrgParser String +drawerEnd = try $ + skipSpaces *> stringAnyCase ":END:" <* skipSpaces <* P.newline + + +-- +-- Figures +-- + +-- Figures (Image on a line by itself, preceded by name and/or caption) +figure :: OrgParser (F Blocks) +figure = try $ do + (cap, nam) <- nameAndCaption + src <- skipSpaces *> selfTarget <* skipSpaces <* P.newline + guard (isImageFilename src) + return $ do + cap' <- cap + return $ B.para $ B.image src nam cap' + where + nameAndCaption = + do + maybeCap <- lookupInlinesAttr "caption" + maybeNam <- lookupBlockAttribute "name" + guard $ isJust maybeCap || isJust maybeNam + return ( fromMaybe mempty maybeCap + , maybe mempty withFigPrefix maybeNam ) + withFigPrefix cs = + if "fig:" `isPrefixOf` cs + then cs + else "fig:" ++ cs + +-- +-- Comments, Options and Metadata +specialLine :: OrgParser (F Blocks) +specialLine = fmap return . try $ metaLine <|> commentLine + +metaLine :: OrgParser Blocks +metaLine = try $ mempty + <$ (metaLineStart *> (optionLine <|> declarationLine)) + +commentLine :: OrgParser Blocks +commentLine = try $ commentLineStart *> anyLine *> pure mempty + +-- The order, in which blocks are tried, makes sure that we're not looking at +-- the beginning of a block, so we don't need to check for it +metaLineStart :: OrgParser String +metaLineStart = try $ mappend <$> many spaceChar <*> string "#+" + +commentLineStart :: OrgParser String +commentLineStart = try $ mappend <$> many spaceChar <*> string "# " + +declarationLine :: OrgParser () +declarationLine = try $ do + key <- metaKey + inlinesF <- metaInlines + updateState $ \st -> + let meta' = B.setMeta <$> pure key <*> inlinesF <*> pure nullMeta + in st { orgStateMeta' = orgStateMeta' st <> meta' } + return () + +metaInlines :: OrgParser (F MetaValue) +metaInlines = fmap (MetaInlines . B.toList) <$> inlinesTillNewline + +metaKey :: OrgParser String +metaKey = map toLower <$> many1 (noneOf ": \n\r") + <* char ':' + <* skipSpaces + +optionLine :: OrgParser () +optionLine = try $ do + key <- metaKey + case key of + "link" -> parseLinkFormat >>= uncurry addLinkFormat + _ -> mzero + +parseLinkFormat :: OrgParser ((String, String -> String)) +parseLinkFormat = try $ do + linkType <- (:) <$> letter <*> many (alphaNum <|> oneOf "-_") <* skipSpaces + linkSubst <- parseFormat + return (linkType, linkSubst) + +-- | An ad-hoc, single-argument-only implementation of a printf-style format +-- parser. +parseFormat :: OrgParser (String -> String) +parseFormat = try $ do + replacePlain <|> replaceUrl <|> justAppend + where + -- inefficient, but who cares + replacePlain = try $ (\x -> concat . flip intersperse x) + <$> sequence [tillSpecifier 's', rest] + replaceUrl = try $ (\x -> concat . flip intersperse x . urlEncode) + <$> sequence [tillSpecifier 'h', rest] + justAppend = try $ (++) <$> rest + + rest = manyTill anyChar (eof <|> () <$ oneOf "\n\r") + tillSpecifier c = manyTill (noneOf "\n\r") (try $ string ('%':c:"")) + +-- +-- Headers +-- + +-- | Headers +header :: OrgParser (F Blocks) +header = try $ do + level <- headerStart + title <- inlinesTillNewline + return $ B.header level <$> title + +headerStart :: OrgParser Int +headerStart = try $ + (length <$> many1 (char '*')) <* many1 (char ' ') + + +-- Don't use (or need) the reader wrapper here, we want hline to be +-- @show@able. Otherwise we can't use it with @notFollowedBy'@. + +-- | Horizontal Line (five -- dashes or more) +hline :: OrgParser Blocks +hline = try $ do + skipSpaces + string "-----" + many (char '-') + skipSpaces + newline + return B.horizontalRule + +-- +-- Tables +-- + +data OrgTableRow = OrgContentRow (F [Blocks]) + | OrgAlignRow [Alignment] + | OrgHlineRow + +data OrgTable = OrgTable + { orgTableColumns :: Int + , orgTableAlignments :: [Alignment] + , orgTableHeader :: [Blocks] + , orgTableRows :: [[Blocks]] + } + +table :: OrgParser (F Blocks) +table = try $ do + lookAhead tableStart + do + rows <- tableRows + cptn <- fromMaybe (pure "") <$> lookupInlinesAttr "caption" + return $ (<$> cptn) . orgToPandocTable . normalizeTable =<< rowsToTable rows + +orgToPandocTable :: OrgTable + -> Inlines + -> Blocks +orgToPandocTable (OrgTable _ aligns heads lns) caption = + B.table caption (zip aligns $ repeat 0) heads lns + +tableStart :: OrgParser Char +tableStart = try $ skipSpaces *> char '|' + +tableRows :: OrgParser [OrgTableRow] +tableRows = try $ many (tableAlignRow <|> tableHline <|> tableContentRow) + +tableContentRow :: OrgParser OrgTableRow +tableContentRow = try $ + OrgContentRow . sequence <$> (tableStart *> manyTill tableContentCell newline) + +tableContentCell :: OrgParser (F Blocks) +tableContentCell = try $ + fmap B.plain . trimInlinesF . mconcat <$> many1Till inline endOfCell + +endOfCell :: OrgParser Char +endOfCell = try $ char '|' <|> lookAhead newline + +tableAlignRow :: OrgParser OrgTableRow +tableAlignRow = try $ + OrgAlignRow <$> (tableStart *> manyTill tableAlignCell newline) + +tableAlignCell :: OrgParser Alignment +tableAlignCell = + choice [ try $ emptyCell *> return AlignDefault + , try $ skipSpaces + *> char '<' + *> tableAlignFromChar + <* many digit + <* char '>' + <* emptyCell + ] <?> "alignment info" + where emptyCell = try $ skipSpaces *> endOfCell + +tableAlignFromChar :: OrgParser Alignment +tableAlignFromChar = try $ choice [ char 'l' *> return AlignLeft + , char 'c' *> return AlignCenter + , char 'r' *> return AlignRight + ] + +tableHline :: OrgParser OrgTableRow +tableHline = try $ + OrgHlineRow <$ (tableStart *> char '-' *> anyLine) + +rowsToTable :: [OrgTableRow] + -> F OrgTable +rowsToTable = foldM (flip rowToContent) zeroTable + where zeroTable = OrgTable 0 mempty mempty mempty + +normalizeTable :: OrgTable + -> OrgTable +normalizeTable (OrgTable cols aligns heads lns) = + let aligns' = fillColumns aligns AlignDefault + heads' = if heads == mempty + then mempty + else fillColumns heads (B.plain mempty) + lns' = map (`fillColumns` B.plain mempty) lns + fillColumns base padding = take cols $ base ++ repeat padding + in OrgTable cols aligns' heads' lns' + + +-- One or more horizontal rules after the first content line mark the previous +-- line as a header. All other horizontal lines are discarded. +rowToContent :: OrgTableRow + -> OrgTable + -> F OrgTable +rowToContent OrgHlineRow t = maybeBodyToHeader t +rowToContent (OrgAlignRow as) t = setLongestRow as =<< setAligns as t +rowToContent (OrgContentRow rf) t = do + rs <- rf + setLongestRow rs =<< appendToBody rs t + +setLongestRow :: [a] + -> OrgTable + -> F OrgTable +setLongestRow rs t = + return t{ orgTableColumns = max (length rs) (orgTableColumns t) } + +maybeBodyToHeader :: OrgTable + -> F OrgTable +maybeBodyToHeader t = case t of + OrgTable{ orgTableHeader = [], orgTableRows = b:[] } -> + return t{ orgTableHeader = b , orgTableRows = [] } + _ -> return t + +appendToBody :: [Blocks] + -> OrgTable + -> F OrgTable +appendToBody r t = return t{ orgTableRows = orgTableRows t ++ [r] } + +setAligns :: [Alignment] + -> OrgTable + -> F OrgTable +setAligns aligns t = return $ t{ orgTableAlignments = aligns } + + +-- +-- LaTeX fragments +-- +latexFragment :: OrgParser (F Blocks) +latexFragment = try $ do + envName <- latexEnvStart + content <- mconcat <$> manyTill anyLineNewline (latexEnd envName) + return . return $ B.rawBlock "latex" (content `inLatexEnv` envName) + where + c `inLatexEnv` e = mconcat [ "\\begin{", e, "}\n" + , c + , "\\end{", e, "}\n" + ] + +latexEnvStart :: OrgParser String +latexEnvStart = try $ do + skipSpaces *> string "\\begin{" + *> latexEnvName + <* string "}" + <* blankline + +latexEnd :: String -> OrgParser () +latexEnd envName = try $ + () <$ skipSpaces + <* string ("\\end{" ++ envName ++ "}") + <* blankline + +-- | Parses a LaTeX environment name. +latexEnvName :: OrgParser String +latexEnvName = try $ do + mappend <$> many1 alphaNum + <*> option "" (string "*") + + +-- +-- Footnote defintions +-- +noteBlock :: OrgParser (F Blocks) +noteBlock = try $ do + ref <- noteMarker <* skipSpaces + content <- mconcat <$> blocksTillHeaderOrNote + addToNotesTable (ref, content) + return mempty + where + blocksTillHeaderOrNote = + many1Till block (eof <|> () <$ lookAhead noteMarker + <|> () <$ lookAhead headerStart) + +-- Paragraphs or Plain text +paraOrPlain :: OrgParser (F Blocks) +paraOrPlain = try $ + parseInlines <**> (fmap <$> option B.plain (try $ newline *> pure B.para)) + +inlinesTillNewline :: OrgParser (F Inlines) +inlinesTillNewline = trimInlinesF . mconcat <$> manyTill inline newline + + +-- +-- list blocks +-- + +list :: OrgParser (F Blocks) +list = choice [ definitionList, bulletList, orderedList ] <?> "list" + +definitionList :: OrgParser (F Blocks) +definitionList = fmap B.definitionList . fmap compactify'DL . sequence + <$> many1 (definitionListItem bulletListStart) + +bulletList :: OrgParser (F Blocks) +bulletList = fmap B.bulletList . fmap compactify' . sequence + <$> many1 (listItem bulletListStart) + +orderedList :: OrgParser (F Blocks) +orderedList = fmap B.orderedList . fmap compactify' . sequence + <$> many1 (listItem orderedListStart) + +genericListStart :: OrgParser String + -> OrgParser Int +genericListStart listMarker = try $ + (+) <$> (length <$> many spaceChar) + <*> (length <$> listMarker <* many1 spaceChar) + +-- parses bullet list start and returns its length (excl. following whitespace) +bulletListStart :: OrgParser Int +bulletListStart = genericListStart bulletListMarker + where bulletListMarker = pure <$> oneOf "*-+" + +orderedListStart :: OrgParser Int +orderedListStart = genericListStart orderedListMarker + -- Ordered list markers allowed in org-mode + where orderedListMarker = mappend <$> many1 digit <*> (pure <$> oneOf ".)") + +definitionListItem :: OrgParser Int + -> OrgParser (F (Inlines, [Blocks])) +definitionListItem parseMarkerGetLength = try $ do + markerLength <- parseMarkerGetLength + term <- manyTill (noneOf "\n\r") (try $ string "::") + line1 <- anyLineNewline + blank <- option "" ("\n" <$ blankline) + cont <- concat <$> many (listContinuation markerLength) + term' <- parseFromString inline term + contents' <- parseFromString parseBlocks $ line1 ++ blank ++ cont + return $ (,) <$> term' <*> fmap (:[]) contents' + + +-- parse raw text for one list item, excluding start marker and continuations +listItem :: OrgParser Int + -> OrgParser (F Blocks) +listItem start = try $ do + markerLength <- try start + firstLine <- anyLineNewline + blank <- option "" ("\n" <$ blankline) + rest <- concat <$> many (listContinuation markerLength) + parseFromString parseBlocks $ firstLine ++ blank ++ rest + +-- continuation of a list item - indented and separated by blankline or endline. +-- Note: nested lists are parsed as continuations. +listContinuation :: Int + -> OrgParser String +listContinuation markerLength = try $ + notFollowedBy' blankline + *> (mappend <$> (concat <$> many1 listLine) + <*> many blankline) + where listLine = try $ indentWith markerLength *> anyLineNewline + +anyLineNewline :: OrgParser String +anyLineNewline = (++ "\n") <$> anyLine + + +-- +-- inline +-- + +inline :: OrgParser (F Inlines) +inline = + choice [ whitespace + , linebreak + , cite + , footnote + , linkOrImage + , anchor + , inlineCodeBlock + , str + , endline + , emph + , strong + , strikeout + , underline + , code + , math + , displayMath + , verbatim + , subscript + , superscript + , inlineLaTeX + , symbol + ] <* (guard =<< newlinesCountWithinLimits) + <?> "inline" + +parseInlines :: OrgParser (F Inlines) +parseInlines = trimInlinesF . mconcat <$> many1 inline + +-- treat these as potentially non-text when parsing inline: +specialChars :: [Char] +specialChars = "\"$'()*+-./:<=>[\\]^_{|}~" + + +whitespace :: OrgParser (F Inlines) +whitespace = pure B.space <$ skipMany1 spaceChar + <* updateLastPreCharPos + <* updateLastForbiddenCharPos + <?> "whitespace" + +linebreak :: OrgParser (F Inlines) +linebreak = try $ pure B.linebreak <$ string "\\\\" <* skipSpaces <* newline + +str :: OrgParser (F Inlines) +str = return . B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") + <* updateLastStrPos + +-- | An endline character that can be treated as a space, not a structural +-- break. This should reflect the values of the Emacs variable +-- @org-element-pagaraph-separate@. +endline :: OrgParser (F Inlines) +endline = try $ do + newline + notFollowedBy blankline + notFollowedBy' exampleLine + notFollowedBy' hline + notFollowedBy' noteMarker + notFollowedBy' tableStart + notFollowedBy' drawerStart + notFollowedBy' headerStart + notFollowedBy' metaLineStart + notFollowedBy' latexEnvStart + notFollowedBy' commentLineStart + notFollowedBy' bulletListStart + notFollowedBy' orderedListStart + decEmphasisNewlinesCount + guard =<< newlinesCountWithinLimits + updateLastPreCharPos + return . return $ B.space + +cite :: OrgParser (F Inlines) +cite = try $ do + guardEnabled Ext_citations + (cs, raw) <- withRaw normalCite + return $ (flip B.cite (B.text raw)) <$> cs + +normalCite :: OrgParser (F [Citation]) +normalCite = try $ char '[' + *> skipSpaces + *> citeList + <* skipSpaces + <* char ']' + +citeList :: OrgParser (F [Citation]) +citeList = sequence <$> sepBy1 citation (try $ char ';' *> skipSpaces) + +citation :: OrgParser (F Citation) +citation = try $ do + pref <- prefix + (suppress_author, key) <- citeKey + suff <- suffix + return $ do + x <- pref + y <- suff + return $ Citation{ citationId = key + , citationPrefix = B.toList x + , citationSuffix = B.toList y + , citationMode = if suppress_author + then SuppressAuthor + else NormalCitation + , citationNoteNum = 0 + , citationHash = 0 + } + where + prefix = trimInlinesF . mconcat <$> + manyTill inline (char ']' <|> (']' <$ lookAhead citeKey)) + suffix = try $ do + hasSpace <- option False (notFollowedBy nonspaceChar >> return True) + skipSpaces + rest <- trimInlinesF . mconcat <$> + many (notFollowedBy (oneOf ";]") *> inline) + return $ if hasSpace + then (B.space <>) <$> rest + else rest + +footnote :: OrgParser (F Inlines) +footnote = try $ inlineNote <|> referencedNote + +inlineNote :: OrgParser (F Inlines) +inlineNote = try $ do + string "[fn:" + ref <- many alphaNum + char ':' + note <- fmap B.para . trimInlinesF . mconcat <$> many1Till inline (char ']') + when (not $ null ref) $ + addToNotesTable ("fn:" ++ ref, note) + return $ B.note <$> note + +referencedNote :: OrgParser (F Inlines) +referencedNote = try $ do + ref <- noteMarker + return $ do + notes <- asksF orgStateNotes' + case lookup ref notes of + Nothing -> return $ B.str $ "[" ++ ref ++ "]" + Just contents -> do + st <- askF + let contents' = runF contents st{ orgStateNotes' = [] } + return $ B.note contents' + +noteMarker :: OrgParser String +noteMarker = try $ do + char '[' + choice [ many1Till digit (char ']') + , (++) <$> string "fn:" + <*> many1Till (noneOf "\n\r\t ") (char ']') + ] + +linkOrImage :: OrgParser (F Inlines) +linkOrImage = explicitOrImageLink + <|> selflinkOrImage + <|> angleLink + <|> plainLink + <?> "link or image" + +explicitOrImageLink :: OrgParser (F Inlines) +explicitOrImageLink = try $ do + char '[' + srcF <- applyCustomLinkFormat =<< linkTarget + title <- enclosedRaw (char '[') (char ']') + title' <- parseFromString (mconcat <$> many inline) title + char ']' + return $ do + src <- srcF + if isImageFilename src && isImageFilename title + then pure $ B.link src "" $ B.image title mempty mempty + else linkToInlinesF src =<< title' + +selflinkOrImage :: OrgParser (F Inlines) +selflinkOrImage = try $ do + src <- char '[' *> linkTarget <* char ']' + return $ linkToInlinesF src (B.str src) + +plainLink :: OrgParser (F Inlines) +plainLink = try $ do + (orig, src) <- uri + returnF $ B.link src "" (B.str orig) + +angleLink :: OrgParser (F Inlines) +angleLink = try $ do + char '<' + link <- plainLink + char '>' + return link + +selfTarget :: OrgParser String +selfTarget = try $ char '[' *> linkTarget <* char ']' + +linkTarget :: OrgParser String +linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]") + +applyCustomLinkFormat :: String -> OrgParser (F String) +applyCustomLinkFormat link = do + let (linkType, rest) = break (== ':') link + return $ do + formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters + return $ maybe link ($ drop 1 rest) formatter + + +linkToInlinesF :: String -> Inlines -> F Inlines +linkToInlinesF s@('#':_) = pure . B.link s "" +linkToInlinesF s + | isImageFilename s = const . pure $ B.image s "" "" + | isUri s = pure . B.link s "" + | isRelativeUrl s = pure . B.link s "" +linkToInlinesF s = \title -> do + anchorB <- (s `elem`) <$> asksF orgStateAnchorIds + if anchorB + then pure $ B.link ('#':s) "" title + else pure $ B.emph title + +isRelativeUrl :: String -> Bool +isRelativeUrl s = (':' `notElem` s) && ("./" `isPrefixOf` s) + +isUri :: String -> Bool +isUri s = let (scheme, path) = break (== ':') s + in all (\c -> isAlphaNum c || c `elem` ".-") scheme + && not (null path) + +isImageFilename :: String -> Bool +isImageFilename filename = + any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions && + (any (\x -> (x++":") `isPrefixOf` filename) protocols || + ':' `notElem` filename) + where + imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ] + protocols = [ "file", "http", "https" ] + +-- | Parse an anchor like @<<anchor-id>>@ and return an empty span with +-- @anchor-id@ set as id. Legal anchors in org-mode are defined through +-- @org-target-regexp@, which is fairly liberal. Since no link is created if +-- @anchor-id@ contains spaces, we are more restrictive in what is accepted as +-- an anchor. + +anchor :: OrgParser (F Inlines) +anchor = try $ do + anchorId <- parseAnchor + recordAnchorId anchorId + returnF $ B.spanWith (solidify anchorId, [], []) mempty + where + parseAnchor = string "<<" + *> many1 (noneOf "\t\n\r<>\"' ") + <* string ">>" + <* skipSpaces + +-- | Replace every char but [a-zA-Z0-9_.-:] with a hypen '-'. This mirrors +-- the org function @org-export-solidify-link-text@. + +solidify :: String -> String +solidify = map replaceSpecialChar + where replaceSpecialChar c + | isAlphaNum c = c + | c `elem` "_.-:" = c + | otherwise = '-' + +-- | Parses an inline code block and marks it as an babel block. +inlineCodeBlock :: OrgParser (F Inlines) +inlineCodeBlock = try $ do + string "src_" + lang <- many1 orgArgWordChar + opts <- option [] $ enclosedByPair '[' ']' inlineBlockOption + inlineCode <- enclosedByPair '{' '}' (noneOf "\n\r") + let attrClasses = [translateLang lang, rundocBlockClass] + let attrKeyVal = map toRundocAttrib (("language", lang) : opts) + returnF $ B.codeWith ("", attrClasses, attrKeyVal) inlineCode + +enclosedByPair :: Char -- ^ opening char + -> Char -- ^ closing char + -> OrgParser a -- ^ parser + -> OrgParser [a] +enclosedByPair s e p = char s *> many1Till p (char e) + +emph :: OrgParser (F Inlines) +emph = fmap B.emph <$> emphasisBetween '/' + +strong :: OrgParser (F Inlines) +strong = fmap B.strong <$> emphasisBetween '*' + +strikeout :: OrgParser (F Inlines) +strikeout = fmap B.strikeout <$> emphasisBetween '+' + +-- There is no underline, so we use strong instead. +underline :: OrgParser (F Inlines) +underline = fmap B.strong <$> emphasisBetween '_' + +verbatim :: OrgParser (F Inlines) +verbatim = return . B.code <$> verbatimBetween '=' + +code :: OrgParser (F Inlines) +code = return . B.code <$> verbatimBetween '~' + +subscript :: OrgParser (F Inlines) +subscript = fmap B.subscript <$> try (char '_' *> subOrSuperExpr) + +superscript :: OrgParser (F Inlines) +superscript = fmap B.superscript <$> try (char '^' *> subOrSuperExpr) + +math :: OrgParser (F Inlines) +math = return . B.math <$> choice [ math1CharBetween '$' + , mathStringBetween '$' + , rawMathBetween "\\(" "\\)" + ] + +displayMath :: OrgParser (F Inlines) +displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]" + , rawMathBetween "$$" "$$" + ] +symbol :: OrgParser (F Inlines) +symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions) + where updatePositions c + | c `elem` emphasisPreChars = c <$ updateLastPreCharPos + | c `elem` emphasisForbiddenBorderChars = c <$ updateLastForbiddenCharPos + | otherwise = return c + +emphasisBetween :: Char + -> OrgParser (F Inlines) +emphasisBetween c = try $ do + startEmphasisNewlinesCounting emphasisAllowedNewlines + res <- enclosedInlines (emphasisStart c) (emphasisEnd c) + isTopLevelEmphasis <- null . orgStateEmphasisCharStack <$> getState + when isTopLevelEmphasis + resetEmphasisNewlines + return res + +verbatimBetween :: Char + -> OrgParser String +verbatimBetween c = try $ + emphasisStart c *> + many1TillNOrLessNewlines 1 (noneOf "\n\r") (emphasisEnd c) + +-- | Parses a raw string delimited by @c@ using Org's math rules +mathStringBetween :: Char + -> OrgParser String +mathStringBetween c = try $ do + mathStart c + body <- many1TillNOrLessNewlines mathAllowedNewlines + (noneOf (c:"\n\r")) + (lookAhead $ mathEnd c) + final <- mathEnd c + return $ body ++ [final] + +-- | Parse a single character between @c@ using math rules +math1CharBetween :: Char + -> OrgParser String +math1CharBetween c = try $ do + char c + res <- noneOf $ c:mathForbiddenBorderChars + char c + eof <|> () <$ lookAhead (oneOf mathPostChars) + return [res] + +rawMathBetween :: String + -> String + -> OrgParser String +rawMathBetween s e = try $ string s *> manyTill anyChar (try $ string e) + +-- | Parses the start (opening character) of emphasis +emphasisStart :: Char -> OrgParser Char +emphasisStart c = try $ do + guard =<< afterEmphasisPreChar + guard =<< notAfterString + char c + lookAhead (noneOf emphasisForbiddenBorderChars) + pushToInlineCharStack c + return c + +-- | Parses the closing character of emphasis +emphasisEnd :: Char -> OrgParser Char +emphasisEnd c = try $ do + guard =<< notAfterForbiddenBorderChar + char c + eof <|> () <$ lookAhead acceptablePostChars + updateLastStrPos + popInlineCharStack + return c + where acceptablePostChars = + surroundingEmphasisChar >>= \x -> oneOf (x ++ emphasisPostChars) + +mathStart :: Char -> OrgParser Char +mathStart c = try $ + char c <* notFollowedBy' (oneOf (c:mathForbiddenBorderChars)) + +mathEnd :: Char -> OrgParser Char +mathEnd c = try $ do + res <- noneOf (c:mathForbiddenBorderChars) + char c + eof <|> () <$ lookAhead (oneOf mathPostChars) + return res + + +enclosedInlines :: OrgParser a + -> OrgParser b + -> OrgParser (F Inlines) +enclosedInlines start end = try $ + trimInlinesF . mconcat <$> enclosed start end inline + +enclosedRaw :: OrgParser a + -> OrgParser b + -> OrgParser String +enclosedRaw start end = try $ + start *> (onSingleLine <|> spanningTwoLines) + where onSingleLine = try $ many1Till (noneOf "\n\r") end + spanningTwoLines = try $ + anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine + +-- | Like many1Till, but parses at most @n+1@ lines. @p@ must not consume +-- newlines. +many1TillNOrLessNewlines :: Int + -> OrgParser Char + -> OrgParser a + -> OrgParser String +many1TillNOrLessNewlines n p end = try $ + nMoreLines (Just n) mempty >>= oneOrMore + where + nMoreLines Nothing cs = return cs + nMoreLines (Just 0) cs = try $ (cs ++) <$> finalLine + nMoreLines k cs = try $ (final k cs <|> rest k cs) + >>= uncurry nMoreLines + final _ cs = (\x -> (Nothing, cs ++ x)) <$> try finalLine + rest m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> try (manyTill p P.newline) + finalLine = try $ manyTill p end + minus1 k = k - 1 + oneOrMore cs = guard (not $ null cs) *> return cs + +-- Org allows customization of the way it reads emphasis. We use the defaults +-- here (see, e.g., the Emacs Lisp variable `org-emphasis-regexp-components` +-- for details). + +-- | Chars allowed to occur before emphasis (spaces and newlines are ok, too) +emphasisPreChars :: [Char] +emphasisPreChars = "\t \"'({" + +-- | Chars allowed at after emphasis +emphasisPostChars :: [Char] +emphasisPostChars = "\t\n !\"'),-.:;?\\}" + +-- | Chars not allowed at the (inner) border of emphasis +emphasisForbiddenBorderChars :: [Char] +emphasisForbiddenBorderChars = "\t\n\r \"'," + +-- | The maximum number of newlines within +emphasisAllowedNewlines :: Int +emphasisAllowedNewlines = 1 + +-- LaTeX-style math: see `org-latex-regexps` for details + +-- | Chars allowed after an inline ($...$) math statement +mathPostChars :: [Char] +mathPostChars = "\t\n \"'),-.:;?" + +-- | Chars not allowed at the (inner) border of math +mathForbiddenBorderChars :: [Char] +mathForbiddenBorderChars = "\t\n\r ,;.$" + +-- | Maximum number of newlines in an inline math statement +mathAllowedNewlines :: Int +mathAllowedNewlines = 2 + +-- | Whether we are right behind a char allowed before emphasis +afterEmphasisPreChar :: OrgParser Bool +afterEmphasisPreChar = do + pos <- getPosition + lastPrePos <- orgStateLastPreCharPos <$> getState + return . fromMaybe True $ (== pos) <$> lastPrePos + +-- | Whether the parser is right after a forbidden border char +notAfterForbiddenBorderChar :: OrgParser Bool +notAfterForbiddenBorderChar = do + pos <- getPosition + lastFBCPos <- orgStateLastForbiddenCharPos <$> getState + return $ lastFBCPos /= Just pos + +-- | Read a sub- or superscript expression +subOrSuperExpr :: OrgParser (F Inlines) +subOrSuperExpr = try $ + choice [ id <$> charsInBalanced '{' '}' (noneOf "\n\r") + , enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r") + , simpleSubOrSuperString + ] >>= parseFromString (mconcat <$> many inline) + where enclosing (left, right) s = left : s ++ [right] + +simpleSubOrSuperString :: OrgParser String +simpleSubOrSuperString = try $ + choice [ string "*" + , mappend <$> option [] ((:[]) <$> oneOf "+-") + <*> many1 alphaNum + ] + +inlineLaTeX :: OrgParser (F Inlines) +inlineLaTeX = try $ do + cmd <- inlineLaTeXCommand + maybe mzero returnF $ parseAsMath cmd `mplus` parseAsInlineLaTeX cmd + where + parseAsMath :: String -> Maybe Inlines + parseAsMath cs = maybeRight $ B.fromList <$> texMathToPandoc DisplayInline cs + + parseAsInlineLaTeX :: String -> Maybe Inlines + parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs + + state :: ParserState + state = def{ stateOptions = def{ readerParseRaw = True }} + +maybeRight :: Either a b -> Maybe b +maybeRight = either (const Nothing) Just + +inlineLaTeXCommand :: OrgParser String +inlineLaTeXCommand = try $ do + rest <- getInput + case runParser rawLaTeXInline def "source" rest of + Right (RawInline _ cs) -> do + let len = length cs + count len anyChar + return cs + _ -> mzero diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index 34962b553..fa8438e70 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -1,5 +1,6 @@ +{-# LANGUAGE OverloadedStrings #-} {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.RST - Copyright : Copyright (C) 2006-2010 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -35,12 +36,13 @@ import Text.Pandoc.Builder (setMeta, fromList) import Text.Pandoc.Shared import Text.Pandoc.Parsing import Text.Pandoc.Options -import Control.Monad ( when, liftM, guard, mzero ) +import Control.Monad ( when, liftM, guard, mzero, mplus ) import Data.List ( findIndex, intersperse, intercalate, transpose, sort, deleteFirstsBy, isSuffixOf ) +import Data.Maybe (fromMaybe) import qualified Data.Map as M import Text.Printf ( printf ) -import Control.Applicative ((<$>), (<$), (<*), (*>)) +import Control.Applicative ((<$>), (<$), (<*), (*>), (<*>)) import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>)) import qualified Text.Pandoc.Builder as B import Data.Monoid (mconcat, mempty) @@ -111,15 +113,16 @@ titleTransform (bs, meta) = metaFromDefList :: [([Inline], [[Block]])] -> Meta -> Meta metaFromDefList ds meta = adjustAuthors $ foldr f meta ds where f (k,v) = setMeta (map toLower $ stringify k) (mconcat $ map fromList v) - adjustAuthors (Meta metamap) = Meta $ M.adjust toPlain "author" + adjustAuthors (Meta metamap) = Meta $ M.adjust splitAuthors "author" $ M.adjust toPlain "date" $ M.adjust toPlain "title" - $ M.adjust splitAuthors "authors" + $ M.mapKeys (\k -> if k == "authors" then "author" else k) $ metamap toPlain (MetaBlocks [Para xs]) = MetaInlines xs toPlain x = x - splitAuthors (MetaBlocks [Para xs]) = MetaList $ map MetaInlines - $ splitAuthors' xs + splitAuthors (MetaBlocks [Para xs]) + = MetaList $ map MetaInlines + $ splitAuthors' xs splitAuthors x = x splitAuthors' = map normalizeSpaces . splitOnSemi . concatMap factorSemi @@ -183,22 +186,22 @@ block = choice [ codeBlock -- field list -- -rawFieldListItem :: String -> RSTParser (String, String) -rawFieldListItem indent = try $ do - string indent +rawFieldListItem :: Int -> RSTParser (String, String) +rawFieldListItem minIndent = try $ do + indent <- length <$> many (char ' ') + guard $ indent >= minIndent char ':' name <- many1Till (noneOf "\n") (char ':') (() <$ lookAhead newline) <|> skipMany1 spaceChar first <- anyLine - rest <- option "" $ try $ do lookAhead (string indent >> spaceChar) + rest <- option "" $ try $ do lookAhead (count indent (char ' ') >> spaceChar) indentedBlock let raw = (if null first then "" else (first ++ "\n")) ++ rest ++ "\n" return (name, raw) -fieldListItem :: String - -> RSTParser (Inlines, [Blocks]) -fieldListItem indent = try $ do - (name, raw) <- rawFieldListItem indent +fieldListItem :: Int -> RSTParser (Inlines, [Blocks]) +fieldListItem minIndent = try $ do + (name, raw) <- rawFieldListItem minIndent let term = B.str name contents <- parseFromString parseBlocks raw optional blanklines @@ -206,7 +209,7 @@ fieldListItem indent = try $ do fieldList :: RSTParser Blocks fieldList = try $ do - indent <- lookAhead $ many spaceChar + indent <- length <$> lookAhead (many spaceChar) items <- many1 $ fieldListItem indent case items of [] -> return mempty @@ -274,7 +277,8 @@ doubleHeader = try $ do Just ind -> (headerTable, ind + 1) Nothing -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1) setState (state { stateHeaderTable = headerTable' }) - return $ B.header level txt + attr <- registerHeader nullAttr txt + return $ B.headerWith attr level txt -- a header with line on the bottom only singleHeader :: RSTParser Blocks @@ -294,7 +298,8 @@ singleHeader = try $ do Just ind -> (headerTable, ind + 1) Nothing -> (headerTable ++ [SingleHeader c], (length headerTable) + 1) setState (state { stateHeaderTable = headerTable' }) - return $ B.header level txt + attr <- registerHeader nullAttr txt + return $ B.headerWith attr level txt -- -- hrule block @@ -344,14 +349,25 @@ lhsCodeBlock = try $ do getPosition >>= guard . (==1) . sourceColumn guardEnabled Ext_literate_haskell optional codeBlockStart - lns <- many1 birdTrackLine - -- if (as is normal) there is always a space after >, drop it - let lns' = if all (\ln -> null ln || take 1 ln == " ") lns - then map (drop 1) lns - else lns + lns <- latexCodeBlock <|> birdCodeBlock blanklines return $ B.codeBlockWith ("", ["sourceCode", "literate", "haskell"], []) - $ intercalate "\n" lns' + $ intercalate "\n" lns + +latexCodeBlock :: Parser [Char] st [[Char]] +latexCodeBlock = try $ do + try (latexBlockLine "\\begin{code}") + many1Till anyLine (try $ latexBlockLine "\\end{code}") + where + latexBlockLine s = skipMany spaceChar >> string s >> blankline + +birdCodeBlock :: Parser [Char] st [[Char]] +birdCodeBlock = filterSpace <$> many1 birdTrackLine + where filterSpace lns = + -- if (as is normal) there is always a space after >, drop it + if all (\ln -> null ln || take 1 ln == " ") lns + then map (drop 1) lns + else lns birdTrackLine :: Parser [Char] st [Char] birdTrackLine = char '>' >> anyLine @@ -506,17 +522,17 @@ directive' = do skipMany spaceChar top <- many $ satisfy (/='\n') <|> try (char '\n' <* - notFollowedBy' (rawFieldListItem " ") <* + notFollowedBy' (rawFieldListItem 3) <* count 3 (char ' ') <* notFollowedBy blankline) newline - fields <- many $ rawFieldListItem " " + fields <- many $ rawFieldListItem 3 body <- option "" $ try $ blanklines >> indentedBlock optional blanklines let body' = body ++ "\n\n" case label of "raw" -> return $ B.rawBlock (trim top) (stripTrailingNewlines body) - "role" -> return mempty + "role" -> addNewRole top $ map (\(k,v) -> (k, trim v)) fields "container" -> parseFromString parseBlocks body' "replace" -> B.para <$> -- consumed by substKey parseFromString (trimInlines . mconcat <$> many inline) @@ -561,12 +577,15 @@ directive' = do role -> role }) "code" -> codeblock (lookup "number-lines" fields) (trim top) body "code-block" -> codeblock (lookup "number-lines" fields) (trim top) body + "aafig" -> do + let attribs = ("", ["aafig"], map (\(k,v) -> (k, trimr v)) fields) + return $ B.codeBlockWith attribs $ stripTrailingNewlines body "math" -> return $ B.para $ mconcat $ map B.displayMath $ toChunks $ top ++ "\n\n" ++ body "figure" -> do (caption, legend) <- parseFromString extractCaption body' let src = escapeURI $ trim top - return $ B.para (B.image src "" caption) <> legend + return $ B.para (B.image src "fig:" caption) <> legend "image" -> do let src = escapeURI $ trim top let alt = B.str $ maybe "image" trim $ lookup "alt" fields @@ -577,7 +596,38 @@ directive' = do Nothing -> B.image src "" alt _ -> return mempty --- Can contain haracter codes as decimal numbers or +-- TODO: +-- - Silently ignores illegal fields +-- - Silently drops classes +-- - Only supports :format: fields with a single format for :raw: roles, +-- change Text.Pandoc.Definition.Format to fix +addNewRole :: String -> [(String, String)] -> RSTParser Blocks +addNewRole roleString fields = do + (role, parentRole) <- parseFromString inheritedRole roleString + customRoles <- stateRstCustomRoles <$> getState + baseRole <- case M.lookup parentRole customRoles of + Just (base, _, _) -> return base + Nothing -> return parentRole + + let fmt = if baseRole == "raw" then lookup "format" fields else Nothing + annotate = maybe id addLanguage $ + if baseRole == "code" + then lookup "language" fields + else Nothing + + updateState $ \s -> s { + stateRstCustomRoles = + M.insert role (baseRole, fmt, (,) parentRole . annotate) customRoles + } + + return $ B.singleton Null + where + addLanguage lang (ident, classes, keyValues) = + (ident, "sourceCode" : lang : classes, keyValues) + inheritedRole = + (,) <$> roleNameEndingIn (char '(') <*> roleNameEndingIn (char ')') + +-- Can contain character codes as decimal numbers or -- hexadecimal numbers, prefixed by 0x, x, \x, U+, u, or \u -- or as XML-style hexadecimal character entities, e.g. ᨫ -- or text, which is used as-is. Comments start with .. @@ -916,17 +966,56 @@ strong = B.strong . trimInlines . mconcat <$> -- Note, this doesn't precisely implement the complex rule in -- http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules -- but it should be good enough for most purposes +-- +-- TODO: +-- - Classes are silently discarded in addNewRole +-- - Lacks sensible implementation for title-reference (which is the default) +-- - Allows direct use of the :raw: role, rST only allows inherited use. interpretedRole :: RSTParser Inlines interpretedRole = try $ do (role, contents) <- roleBefore <|> roleAfter - case role of - "sup" -> return $ B.superscript $ B.str contents - "sub" -> return $ B.subscript $ B.str contents - "math" -> return $ B.math contents - _ -> return $ B.str contents --unknown + renderRole contents Nothing role nullAttr + +renderRole :: String -> Maybe String -> String -> Attr -> RSTParser Inlines +renderRole contents fmt role attr = case role of + "sup" -> return $ B.superscript $ B.str contents + "superscript" -> return $ B.superscript $ B.str contents + "sub" -> return $ B.subscript $ B.str contents + "subscript" -> return $ B.subscript $ B.str contents + "emphasis" -> return $ B.emph $ B.str contents + "strong" -> return $ B.strong $ B.str contents + "rfc-reference" -> return $ rfcLink contents + "RFC" -> return $ rfcLink contents + "pep-reference" -> return $ pepLink contents + "PEP" -> return $ pepLink contents + "literal" -> return $ B.str contents + "math" -> return $ B.math contents + "title-reference" -> titleRef contents + "title" -> titleRef contents + "t" -> titleRef contents + "code" -> return $ B.codeWith attr contents + "raw" -> return $ B.rawInline (fromMaybe "" fmt) contents + custom -> do + customRole <- stateRstCustomRoles <$> getState + case M.lookup custom customRole of + Just (_, newFmt, inherit) -> let + fmtStr = fmt `mplus` newFmt + (newRole, newAttr) = inherit attr + in renderRole contents fmtStr newRole newAttr + Nothing -> return $ B.str contents -- Undefined role + where + titleRef ref = return $ B.str ref -- FIXME: Not a sensible behaviour + rfcLink rfcNo = B.link rfcUrl ("RFC " ++ rfcNo) $ B.str ("RFC " ++ rfcNo) + where rfcUrl = "http://www.faqs.org/rfcs/rfc" ++ rfcNo ++ ".html" + pepLink pepNo = B.link pepUrl ("PEP " ++ pepNo) $ B.str ("PEP " ++ pepNo) + where padNo = replicate (4 - length pepNo) '0' ++ pepNo + pepUrl = "http://www.python.org/dev/peps/pep-" ++ padNo ++ "/" + +roleNameEndingIn :: RSTParser Char -> RSTParser String +roleNameEndingIn end = many1Till (letter <|> char '-') end roleMarker :: RSTParser String -roleMarker = char ':' *> many1Till (letter <|> char '-') (char ':') +roleMarker = char ':' *> roleNameEndingIn (char ':') roleBefore :: RSTParser (String,String) roleBefore = try $ do @@ -1055,7 +1144,7 @@ smart :: RSTParser Inlines smart = do getOption readerSmart >>= guard doubleQuoted <|> singleQuoted <|> - choice (map (B.singleton <$>) [apostrophe, dash, ellipses]) + choice [apostrophe, dash, ellipses] singleQuoted :: RSTParser Inlines singleQuoted = try $ do diff --git a/src/Text/Pandoc/Readers/TeXMath.hs b/src/Text/Pandoc/Readers/TeXMath.hs index fe49a992e..f03eae044 100644 --- a/src/Text/Pandoc/Readers/TeXMath.hs +++ b/src/Text/Pandoc/Readers/TeXMath.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2007-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2007-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.TeXMath - Copyright : Copyright (C) 2007-2010 John MacFarlane + Copyright : Copyright (C) 2007-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -27,96 +27,30 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of TeX math to a list of 'Pandoc' inline elements. -} -module Text.Pandoc.Readers.TeXMath ( readTeXMath ) where +module Text.Pandoc.Readers.TeXMath ( readTeXMath, readTeXMath' ) where import Text.Pandoc.Definition -import Text.TeXMath.Types -import Text.TeXMath.Parser +import Text.TeXMath -- | Converts a raw TeX math formula to a list of 'Pandoc' inlines. --- Defaults to raw formula between @$@ characters if entire formula +-- Defaults to raw formula between @$@ or @$$@ characters if entire formula -- can't be converted. +readTeXMath' :: MathType + -> String -- ^ String to parse (assumes @'\n'@ line endings) + -> [Inline] +readTeXMath' mt inp = case texMathToPandoc dt inp of + Left _ -> [Str (delim ++ inp ++ delim)] + Right res -> res + where (dt, delim) = case mt of + DisplayMath -> (DisplayBlock, "$$") + InlineMath -> (DisplayInline, "$") + +{-# DEPRECATED readTeXMath "Use readTeXMath' from Text.Pandoc.JSON instead" #-} +-- | Converts a raw TeX math formula to a list of 'Pandoc' inlines. +-- Defaults to raw formula between @$@ characters if entire formula +-- can't be converted. (This is provided for backwards compatibility; +-- it is better to use @readTeXMath'@, which properly distinguishes +-- between display and inline math.) readTeXMath :: String -- ^ String to parse (assumes @'\n'@ line endings) -> [Inline] -readTeXMath inp = case texMathToPandoc inp of - Left _ -> [Str ("$" ++ inp ++ "$")] - Right res -> res - -texMathToPandoc :: String -> Either String [Inline] -texMathToPandoc inp = inp `seq` - case parseFormula inp of - Left err -> Left err - Right exps -> case expsToInlines exps of - Nothing -> Left "Formula too complex for [Inline]" - Just r -> Right r - -expsToInlines :: [Exp] -> Maybe [Inline] -expsToInlines xs = do - res <- mapM expToInlines xs - return (concat res) - -expToInlines :: Exp -> Maybe [Inline] -expToInlines (ENumber s) = Just [Str s] -expToInlines (EIdentifier s) = Just [Emph [Str s]] -expToInlines (EMathOperator s) = Just [Str s] -expToInlines (ESymbol t s) = Just $ addSpace t (Str s) - where addSpace Op x = [x, thinspace] - addSpace Bin x = [medspace, x, medspace] - addSpace Rel x = [widespace, x, widespace] - addSpace Pun x = [x, thinspace] - addSpace _ x = [x] - thinspace = Str "\x2006" - medspace = Str "\x2005" - widespace = Str "\x2004" -expToInlines (EStretchy x) = expToInlines x -expToInlines (EDelimited start end xs) = do - xs' <- mapM expToInlines xs - return $ [Str start] ++ concat xs' ++ [Str end] -expToInlines (EGrouped xs) = expsToInlines xs -expToInlines (ESpace "0.167em") = Just [Str "\x2009"] -expToInlines (ESpace "0.222em") = Just [Str "\x2005"] -expToInlines (ESpace "0.278em") = Just [Str "\x2004"] -expToInlines (ESpace "0.333em") = Just [Str "\x2004"] -expToInlines (ESpace "1em") = Just [Str "\x2001"] -expToInlines (ESpace "2em") = Just [Str "\x2001\x2001"] -expToInlines (ESpace _) = Just [Str " "] -expToInlines (EBinary _ _ _) = Nothing -expToInlines (ESub x y) = do - x' <- expToInlines x - y' <- expToInlines y - return $ x' ++ [Subscript y'] -expToInlines (ESuper x y) = do - x' <- expToInlines x - y' <- expToInlines y - return $ x' ++ [Superscript y'] -expToInlines (ESubsup x y z) = do - x' <- expToInlines x - y' <- expToInlines y - z' <- expToInlines z - return $ x' ++ [Subscript y'] ++ [Superscript z'] -expToInlines (EDown x y) = expToInlines (ESub x y) -expToInlines (EUp x y) = expToInlines (ESuper x y) -expToInlines (EDownup x y z) = expToInlines (ESubsup x y z) -expToInlines (EText TextNormal x) = Just [Str x] -expToInlines (EText TextBold x) = Just [Strong [Str x]] -expToInlines (EText TextMonospace x) = Just [Code nullAttr x] -expToInlines (EText TextItalic x) = Just [Emph [Str x]] -expToInlines (EText _ x) = Just [Str x] -expToInlines (EOver (EGrouped [EIdentifier [c]]) (ESymbol Accent [accent])) = - case accent of - '\x203E' -> Just [Emph [Str [c,'\x0304']]] -- bar - '\x00B4' -> Just [Emph [Str [c,'\x0301']]] -- acute - '\x0060' -> Just [Emph [Str [c,'\x0300']]] -- grave - '\x02D8' -> Just [Emph [Str [c,'\x0306']]] -- breve - '\x02C7' -> Just [Emph [Str [c,'\x030C']]] -- check - '.' -> Just [Emph [Str [c,'\x0307']]] -- dot - '\x00B0' -> Just [Emph [Str [c,'\x030A']]] -- ring - '\x20D7' -> Just [Emph [Str [c,'\x20D7']]] -- arrow right - '\x20D6' -> Just [Emph [Str [c,'\x20D6']]] -- arrow left - '\x005E' -> Just [Emph [Str [c,'\x0302']]] -- hat - '\x0302' -> Just [Emph [Str [c,'\x0302']]] -- hat - '~' -> Just [Emph [Str [c,'\x0303']]] -- tilde - _ -> Nothing -expToInlines _ = Nothing - - +readTeXMath = readTeXMath' InlineMath diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index a1687a691..6d839ec1d 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -1,5 +1,6 @@ {- -Copyright (C) 2010 Paul Rivier <paul*rivier#demotera*com> | tr '*#' '.@' +Copyright (C) 2010-2014 Paul Rivier <paul*rivier#demotera*com> | tr '*#' '.@' + and John MacFarlane This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Textile - Copyright : Copyright (C) 2010-2012 Paul Rivier and John MacFarlane + Copyright : Copyright (C) 2010-2014 Paul Rivier and John MacFarlane License : GNU GPL, version 2 or above Maintainer : Paul Rivier <paul*rivier#demotera*com> @@ -50,18 +51,22 @@ TODO : refactor common patterns across readers : module Text.Pandoc.Readers.Textile ( readTextile) where - import Text.Pandoc.Definition -import Text.Pandoc.Shared +import Text.Pandoc.Builder (Inlines, Blocks, trimInlines) +import qualified Text.Pandoc.Builder as B import Text.Pandoc.Options import Text.Pandoc.Parsing import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag ) import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock ) +import Text.HTML.TagSoup (parseTags, innerText, fromAttrib, Tag(..)) import Text.HTML.TagSoup.Match import Data.List ( intercalate ) -import Data.Char ( digitToInt, isUpper ) -import Control.Monad ( guard, liftM ) -import Control.Applicative ((<$>), (*>), (<*)) +import Data.Char ( digitToInt, isUpper) +import Control.Monad ( guard, liftM, when ) +import Text.Printf +import Control.Applicative ((<$>), (*>), (<*), (<$)) +import Data.Monoid +import Debug.Trace (trace) -- | Parse a Textile text and return a Pandoc document. readTextile :: ReaderOptions -- ^ Reader options @@ -93,7 +98,7 @@ parseTextile = do updateState $ \s -> s { stateNotes = reverse reversedNotes } -- now parse it for real... blocks <- parseBlocks - return $ Pandoc nullMeta blocks -- FIXME + return $ Pandoc nullMeta (B.toList blocks) -- FIXME noteMarker :: Parser [Char] ParserState [Char] noteMarker = skipMany spaceChar >> string "fn" >> manyTill digit (char '.') @@ -113,11 +118,11 @@ noteBlock = try $ do return $ replicate (sourceLine endPos - sourceLine startPos) '\n' -- | Parse document blocks -parseBlocks :: Parser [Char] ParserState [Block] -parseBlocks = manyTill block eof +parseBlocks :: Parser [Char] ParserState Blocks +parseBlocks = mconcat <$> manyTill block eof -- | Block parsers list tried in definition order -blockParsers :: [Parser [Char] ParserState Block] +blockParsers :: [Parser [Char] ParserState Blocks] blockParsers = [ codeBlock , header , blockQuote @@ -128,32 +133,45 @@ blockParsers = [ codeBlock , rawLaTeXBlock' , maybeExplicitBlock "table" table , maybeExplicitBlock "p" para + , endBlock ] --- | Any block in the order of definition of blockParsers -block :: Parser [Char] ParserState Block -block = choice blockParsers <?> "block" +endBlock :: Parser [Char] ParserState Blocks +endBlock = string "\n\n" >> return mempty -commentBlock :: Parser [Char] ParserState Block +-- | Any block in the order of definition of blockParsers +block :: Parser [Char] ParserState Blocks +block = do + res <- choice blockParsers <?> "block" + pos <- getPosition + tr <- getOption readerTrace + when tr $ + trace (printf "line %d: %s" (sourceLine pos) + (take 60 $ show $ B.toList res)) (return ()) + return res + +commentBlock :: Parser [Char] ParserState Blocks commentBlock = try $ do string "###." manyTill anyLine blanklines - return Null + return mempty -codeBlock :: Parser [Char] ParserState Block +codeBlock :: Parser [Char] ParserState Blocks codeBlock = codeBlockBc <|> codeBlockPre -codeBlockBc :: Parser [Char] ParserState Block +codeBlockBc :: Parser [Char] ParserState Blocks codeBlockBc = try $ do string "bc. " contents <- manyTill anyLine blanklines - return $ CodeBlock ("",[],[]) $ unlines contents + return $ B.codeBlock (unlines contents) -- | Code Blocks in Textile are between <pre> and </pre> -codeBlockPre :: Parser [Char] ParserState Block +codeBlockPre :: Parser [Char] ParserState Blocks codeBlockPre = try $ do - htmlTag (tagOpen (=="pre") null) - result' <- manyTill anyChar (try $ htmlTag (tagClose (=="pre")) >> blockBreak) + (t@(TagOpen _ attrs),_) <- htmlTag (tagOpen (=="pre") (const True)) + result' <- (innerText . parseTags) `fmap` -- remove internal tags + manyTill anyChar (htmlTag (tagClose (=="pre"))) + optional blanklines -- drop leading newline if any let result'' = case result' of '\n':xs -> xs @@ -162,28 +180,32 @@ codeBlockPre = try $ do let result''' = case reverse result'' of '\n':_ -> init result'' _ -> result'' - return $ CodeBlock ("",[],[]) result''' + let classes = words $ fromAttrib "class" t + let ident = fromAttrib "id" t + let kvs = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + return $ B.codeBlockWith (ident,classes,kvs) result''' -- | Header of the form "hN. content" with N in 1..6 -header :: Parser [Char] ParserState Block +header :: Parser [Char] ParserState Blocks header = try $ do char 'h' level <- digitToInt <$> oneOf "123456" attr <- attributes char '.' - whitespace - name <- normalizeSpaces <$> manyTill inline blockBreak - return $ Header level attr name + lookAhead whitespace + name <- trimInlines . mconcat <$> manyTill inline blockBreak + attr' <- registerHeader attr name + return $ B.headerWith attr' level name -- | Blockquote of the form "bq. content" -blockQuote :: Parser [Char] ParserState Block +blockQuote :: Parser [Char] ParserState Blocks blockQuote = try $ do string "bq" >> attributes >> char '.' >> whitespace - BlockQuote . singleton <$> para + B.blockQuote <$> para -- Horizontal rule -hrule :: Parser [Char] st Block +hrule :: Parser [Char] st Blocks hrule = try $ do skipSpaces start <- oneOf "-*" @@ -191,62 +213,62 @@ hrule = try $ do skipMany (spaceChar <|> char start) newline optional blanklines - return HorizontalRule + return B.horizontalRule -- Lists handling -- | Can be a bullet list or an ordered list. This implementation is -- strict in the nesting, sublist must start at exactly "parent depth -- plus one" -anyList :: Parser [Char] ParserState Block +anyList :: Parser [Char] ParserState Blocks anyList = try $ anyListAtDepth 1 <* blanklines -- | This allow one type of list to be nested into an other type, -- provided correct nesting -anyListAtDepth :: Int -> Parser [Char] ParserState Block +anyListAtDepth :: Int -> Parser [Char] ParserState Blocks anyListAtDepth depth = choice [ bulletListAtDepth depth, orderedListAtDepth depth, definitionList ] -- | Bullet List of given depth, depth being the number of leading '*' -bulletListAtDepth :: Int -> Parser [Char] ParserState Block -bulletListAtDepth depth = try $ BulletList <$> many1 (bulletListItemAtDepth depth) +bulletListAtDepth :: Int -> Parser [Char] ParserState Blocks +bulletListAtDepth depth = try $ B.bulletList <$> many1 (bulletListItemAtDepth depth) -- | Bullet List Item of given depth, depth being the number of -- leading '*' -bulletListItemAtDepth :: Int -> Parser [Char] ParserState [Block] +bulletListItemAtDepth :: Int -> Parser [Char] ParserState Blocks bulletListItemAtDepth = genericListItemAtDepth '*' -- | Ordered List of given depth, depth being the number of -- leading '#' -orderedListAtDepth :: Int -> Parser [Char] ParserState Block +orderedListAtDepth :: Int -> Parser [Char] ParserState Blocks orderedListAtDepth depth = try $ do items <- many1 (orderedListItemAtDepth depth) - return (OrderedList (1, DefaultStyle, DefaultDelim) items) + return $ B.orderedList items -- | Ordered List Item of given depth, depth being the number of -- leading '#' -orderedListItemAtDepth :: Int -> Parser [Char] ParserState [Block] +orderedListItemAtDepth :: Int -> Parser [Char] ParserState Blocks orderedListItemAtDepth = genericListItemAtDepth '#' -- | Common implementation of list items -genericListItemAtDepth :: Char -> Int -> Parser [Char] ParserState [Block] +genericListItemAtDepth :: Char -> Int -> Parser [Char] ParserState Blocks genericListItemAtDepth c depth = try $ do count depth (char c) >> attributes >> whitespace - p <- many listInline + p <- mconcat <$> many listInline newline - sublist <- option [] (singleton <$> anyListAtDepth (depth + 1)) - return (Plain p : sublist) + sublist <- option mempty (anyListAtDepth (depth + 1)) + return $ (B.plain p) <> sublist -- | A definition list is a set of consecutive definition items -definitionList :: Parser [Char] ParserState Block -definitionList = try $ DefinitionList <$> many1 definitionListItem +definitionList :: Parser [Char] ParserState Blocks +definitionList = try $ B.definitionList <$> many1 definitionListItem -- | List start character. listStart :: Parser [Char] st Char listStart = oneOf "*#-" -listInline :: Parser [Char] ParserState Inline +listInline :: Parser [Char] ParserState Inlines listInline = try (notFollowedBy newline >> inline) <|> try (endline <* notFollowedBy listStart) @@ -254,16 +276,16 @@ listInline = try (notFollowedBy newline >> inline) -- the term defined, then spaces and ":=". The definition follows, on -- the same single line, or spaned on multiple line, after a line -- break. -definitionListItem :: Parser [Char] ParserState ([Inline], [[Block]]) +definitionListItem :: Parser [Char] ParserState (Inlines, [Blocks]) definitionListItem = try $ do string "- " - term <- many1Till inline (try (whitespace >> string ":=")) + term <- mconcat <$> many1Till inline (try (whitespace >> string ":=")) def' <- multilineDef <|> inlineDef return (term, def') - where inlineDef :: Parser [Char] ParserState [[Block]] - inlineDef = liftM (\d -> [[Plain d]]) - $ optional whitespace >> many listInline <* newline - multilineDef :: Parser [Char] ParserState [[Block]] + where inlineDef :: Parser [Char] ParserState [Blocks] + inlineDef = liftM (\d -> [B.plain d]) + $ optional whitespace >> (trimInlines . mconcat <$> many listInline) <* newline + multilineDef :: Parser [Char] ParserState [Blocks] multilineDef = try $ do optional whitespace >> newline s <- many1Till anyChar (try (string "=:" >> newline)) @@ -275,64 +297,62 @@ definitionListItem = try $ do -- blocks support, we have to lookAhead for a rawHtmlBlock. blockBreak :: Parser [Char] ParserState () blockBreak = try (newline >> blanklines >> return ()) <|> - (lookAhead rawHtmlBlock >> return ()) + try (optional spaces >> lookAhead rawHtmlBlock >> return ()) -- raw content -- | A raw Html Block, optionally followed by blanklines -rawHtmlBlock :: Parser [Char] ParserState Block +rawHtmlBlock :: Parser [Char] ParserState Blocks rawHtmlBlock = try $ do (_,b) <- htmlTag isBlockTag optional blanklines - return $ RawBlock "html" b + return $ B.rawBlock "html" b -- | Raw block of LaTeX content -rawLaTeXBlock' :: Parser [Char] ParserState Block +rawLaTeXBlock' :: Parser [Char] ParserState Blocks rawLaTeXBlock' = do guardEnabled Ext_raw_tex - RawBlock "latex" <$> (rawLaTeXBlock <* spaces) + B.rawBlock "latex" <$> (rawLaTeXBlock <* spaces) -- | In textile, paragraphs are separated by blank lines. -para :: Parser [Char] ParserState Block -para = try $ Para . normalizeSpaces <$> manyTill inline blockBreak - +para :: Parser [Char] ParserState Blocks +para = B.para . trimInlines . mconcat <$> manyTill inline blockBreak -- Tables -- | A table cell spans until a pipe | -tableCell :: Parser [Char] ParserState TableCell +tableCell :: Parser [Char] ParserState Blocks tableCell = do c <- many1 (noneOf "|\n") - content <- parseFromString (many1 inline) c - return $ [ Plain $ normalizeSpaces content ] + content <- trimInlines . mconcat <$> parseFromString (many1 inline) c + return $ B.plain content -- | A table row is made of many table cells -tableRow :: Parser [Char] ParserState [TableCell] +tableRow :: Parser [Char] ParserState [Blocks] tableRow = try $ ( char '|' *> (endBy1 tableCell (optional blankline *> char '|')) <* newline) -- | Many table rows -tableRows :: Parser [Char] ParserState [[TableCell]] +tableRows :: Parser [Char] ParserState [[Blocks]] tableRows = many1 tableRow -- | Table headers are made of cells separated by a tag "|_." -tableHeaders :: Parser [Char] ParserState [TableCell] +tableHeaders :: Parser [Char] ParserState [Blocks] tableHeaders = let separator = (try $ string "|_.") in try $ ( separator *> (sepBy1 tableCell separator) <* char '|' <* newline ) -- | A table with an optional header. Current implementation can -- handle tables with and without header, but will parse cells -- alignment attributes as content. -table :: Parser [Char] ParserState Block +table :: Parser [Char] ParserState Blocks table = try $ do - headers <- option [] tableHeaders + headers <- option mempty tableHeaders rows <- tableRows blanklines let nbOfCols = max (length headers) (length $ head rows) - return $ Table [] - (replicate nbOfCols AlignDefault) - (replicate nbOfCols 0.0) + return $ B.table mempty + (zip (replicate nbOfCols AlignDefault) (replicate nbOfCols 0.0)) headers rows @@ -340,8 +360,8 @@ table = try $ do -- | Blocks like 'p' and 'table' do not need explicit block tag. -- However, they can be used to set HTML/CSS attributes when needed. maybeExplicitBlock :: String -- ^ block tag name - -> Parser [Char] ParserState Block -- ^ implicit block - -> Parser [Char] ParserState Block + -> Parser [Char] ParserState Blocks -- ^ implicit block + -> Parser [Char] ParserState Blocks maybeExplicitBlock name blk = try $ do optional $ try $ string name >> attributes >> char '.' >> optional whitespace >> optional endline @@ -355,73 +375,74 @@ maybeExplicitBlock name blk = try $ do -- | Any inline element -inline :: Parser [Char] ParserState Inline -inline = choice inlineParsers <?> "inline" +inline :: Parser [Char] ParserState Inlines +inline = do + choice inlineParsers <?> "inline" -- | Inline parsers tried in order -inlineParsers :: [Parser [Char] ParserState Inline] +inlineParsers :: [Parser [Char] ParserState Inlines] inlineParsers = [ str , whitespace , endline , code , escapedInline - , htmlSpan + , inlineMarkup + , groupedInlineMarkup , rawHtmlInline , rawLaTeXInline' , note - , try $ (char '[' *> inlineMarkup <* char ']') - , inlineMarkup , link , image , mark - , (Str . (:[])) <$> characterReference + , (B.str . (:[])) <$> characterReference , smartPunctuation inline , symbol ] -- | Inline markups -inlineMarkup :: Parser [Char] ParserState Inline -inlineMarkup = choice [ simpleInline (string "??") (Cite []) - , simpleInline (string "**") Strong - , simpleInline (string "__") Emph - , simpleInline (char '*') Strong - , simpleInline (char '_') Emph - , simpleInline (char '+') Emph -- approximates underline - , simpleInline (char '-' <* notFollowedBy (char '-')) Strikeout - , simpleInline (char '^') Superscript - , simpleInline (char '~') Subscript +inlineMarkup :: Parser [Char] ParserState Inlines +inlineMarkup = choice [ simpleInline (string "??") (B.cite []) + , simpleInline (string "**") B.strong + , simpleInline (string "__") B.emph + , simpleInline (char '*') B.strong + , simpleInline (char '_') B.emph + , simpleInline (char '+') B.emph -- approximates underline + , simpleInline (char '-' <* notFollowedBy (char '-')) B.strikeout + , simpleInline (char '^') B.superscript + , simpleInline (char '~') B.subscript + , simpleInline (char '%') id ] -- | Trademark, registered, copyright -mark :: Parser [Char] st Inline +mark :: Parser [Char] st Inlines mark = try $ char '(' >> (try tm <|> try reg <|> copy) -reg :: Parser [Char] st Inline +reg :: Parser [Char] st Inlines reg = do oneOf "Rr" char ')' - return $ Str "\174" + return $ B.str "\174" -tm :: Parser [Char] st Inline +tm :: Parser [Char] st Inlines tm = do oneOf "Tt" oneOf "Mm" char ')' - return $ Str "\8482" + return $ B.str "\8482" -copy :: Parser [Char] st Inline +copy :: Parser [Char] st Inlines copy = do oneOf "Cc" char ')' - return $ Str "\169" + return $ B.str "\169" -note :: Parser [Char] ParserState Inline +note :: Parser [Char] ParserState Inlines note = try $ do ref <- (char '[' *> many1 digit <* char ']') notes <- stateNotes <$> getState case lookup ref notes of Nothing -> fail "note not found" - Just raw -> liftM Note $ parseFromString parseBlocks raw + Just raw -> B.note <$> parseFromString parseBlocks raw -- | Special chars markupChars :: [Char] @@ -442,7 +463,7 @@ wordBoundaries = markupChars ++ stringBreakers hyphenedWords :: Parser [Char] ParserState String hyphenedWords = do x <- wordChunk - xs <- many (try $ char '-' >> wordChunk) + xs <- many (try $ char '-' >> wordChunk) return $ intercalate "-" (x:xs) wordChunk :: Parser [Char] ParserState String @@ -454,99 +475,99 @@ wordChunk = try $ do return $ hd:tl -- | Any string -str :: Parser [Char] ParserState Inline +str :: Parser [Char] ParserState Inlines str = do baseStr <- hyphenedWords -- RedCloth compliance : if parsed word is uppercase and immediatly -- followed by parens, parens content is unconditionally word acronym fullStr <- option baseStr $ try $ do guard $ all isUpper baseStr - acro <- enclosed (char '(') (char ')') anyChar + acro <- enclosed (char '(') (char ')') anyChar' return $ concat [baseStr, " (", acro, ")"] updateLastStrPos - return $ Str fullStr - --- | Textile allows HTML span infos, we discard them -htmlSpan :: Parser [Char] ParserState Inline -htmlSpan = try $ Str <$> ( char '%' *> attributes *> manyTill anyChar (char '%') ) + return $ B.str fullStr -- | Some number of space chars -whitespace :: Parser [Char] ParserState Inline -whitespace = many1 spaceChar >> return Space <?> "whitespace" +whitespace :: Parser [Char] ParserState Inlines +whitespace = many1 spaceChar >> return B.space <?> "whitespace" -- | In Textile, an isolated endline character is a line break -endline :: Parser [Char] ParserState Inline +endline :: Parser [Char] ParserState Inlines endline = try $ do newline >> notFollowedBy blankline - return LineBreak + return B.linebreak -rawHtmlInline :: Parser [Char] ParserState Inline -rawHtmlInline = RawInline "html" . snd <$> htmlTag isInlineTag +rawHtmlInline :: Parser [Char] ParserState Inlines +rawHtmlInline = B.rawInline "html" . snd <$> htmlTag isInlineTag -- | Raw LaTeX Inline -rawLaTeXInline' :: Parser [Char] ParserState Inline +rawLaTeXInline' :: Parser [Char] ParserState Inlines rawLaTeXInline' = try $ do guardEnabled Ext_raw_tex - rawLaTeXInline + B.singleton <$> rawLaTeXInline -- | Textile standard link syntax is "label":target. But we -- can also have ["label":target]. -link :: Parser [Char] ParserState Inline -link = linkB <|> linkNoB - -linkNoB :: Parser [Char] ParserState Inline -linkNoB = try $ do - name <- surrounded (char '"') inline - char ':' - let stopChars = "!.,;:" - url <- manyTill nonspaceChar (lookAhead $ space <|> try (oneOf stopChars >> (space <|> newline))) - let name' = if name == [Str "$"] then [Str url] else name - return $ Link name' (url, "") - -linkB :: Parser [Char] ParserState Inline -linkB = try $ do - char '[' - name <- surrounded (char '"') inline +link :: Parser [Char] ParserState Inlines +link = try $ do + bracketed <- (True <$ char '[') <|> return False + char '"' *> notFollowedBy (oneOf " \t\n\r") + attr <- attributes + name <- trimInlines . mconcat <$> + withQuoteContext InDoubleQuote (many1Till inline (char '"')) char ':' - url <- manyTill nonspaceChar (char ']') - let name' = if name == [Str "$"] then [Str url] else name - return $ Link name' (url, "") + let stop = if bracketed + then char ']' + else lookAhead $ space <|> + try (oneOf "!.,;:" *> (space <|> newline)) + url <- manyTill nonspaceChar stop + let name' = if B.toList name == [Str "$"] then B.str url else name + return $ if attr == nullAttr + then B.link url "" name' + else B.spanWith attr $ B.link url "" name' -- | image embedding -image :: Parser [Char] ParserState Inline +image :: Parser [Char] ParserState Inlines image = try $ do char '!' >> notFollowedBy space - src <- manyTill anyChar (lookAhead $ oneOf "!(") - alt <- option "" (try $ (char '(' >> manyTill anyChar (char ')'))) + src <- manyTill anyChar' (lookAhead $ oneOf "!(") + alt <- option "" (try $ (char '(' >> manyTill anyChar' (char ')'))) char '!' - return $ Image [Str alt] (src, alt) + return $ B.image src alt (B.str alt) -escapedInline :: Parser [Char] ParserState Inline +escapedInline :: Parser [Char] ParserState Inlines escapedInline = escapedEqs <|> escapedTag -escapedEqs :: Parser [Char] ParserState Inline -escapedEqs = Str <$> (try $ string "==" *> manyTill anyChar (try $ string "==")) +escapedEqs :: Parser [Char] ParserState Inlines +escapedEqs = B.str <$> + (try $ string "==" *> manyTill anyChar' (try $ string "==")) -- | literal text escaped btw <notextile> tags -escapedTag :: Parser [Char] ParserState Inline -escapedTag = Str <$> - (try $ string "<notextile>" *> manyTill anyChar (try $ string "</notextile>")) +escapedTag :: Parser [Char] ParserState Inlines +escapedTag = B.str <$> + (try $ string "<notextile>" *> + manyTill anyChar' (try $ string "</notextile>")) -- | Any special symbol defined in wordBoundaries -symbol :: Parser [Char] ParserState Inline -symbol = Str . singleton <$> (oneOf wordBoundaries <|> oneOf markupChars) +symbol :: Parser [Char] ParserState Inlines +symbol = B.str . singleton <$> (oneOf wordBoundaries <|> oneOf markupChars) -- | Inline code -code :: Parser [Char] ParserState Inline +code :: Parser [Char] ParserState Inlines code = code1 <|> code2 -code1 :: Parser [Char] ParserState Inline -code1 = Code nullAttr <$> surrounded (char '@') anyChar +-- any character except a newline before a blank line +anyChar' :: Parser [Char] ParserState Char +anyChar' = + satisfy (/='\n') <|> (try $ char '\n' <* notFollowedBy blankline) + +code1 :: Parser [Char] ParserState Inlines +code1 = B.code <$> surrounded (char '@') anyChar' -code2 :: Parser [Char] ParserState Inline +code2 :: Parser [Char] ParserState Inlines code2 = do htmlTag (tagOpen (=="tt") null) - Code nullAttr <$> manyTill anyChar (try $ htmlTag $ tagClose (=="tt")) + B.code <$> manyTill anyChar' (try $ htmlTag $ tagClose (=="tt")) -- | Html / CSS attributes attributes :: Parser [Char] ParserState Attr @@ -558,7 +579,7 @@ attribute = classIdAttr <|> styleAttr <|> langAttr classIdAttr :: Parser [Char] ParserState (Attr -> Attr) classIdAttr = try $ do -- (class class #id) char '(' - ws <- words `fmap` manyTill anyChar (char ')') + ws <- words `fmap` manyTill anyChar' (char ')') case reverse ws of [] -> return $ \(_,_,keyvals) -> ("",[],keyvals) (('#':ident'):classes') -> return $ \(_,_,keyvals) -> @@ -568,28 +589,49 @@ classIdAttr = try $ do -- (class class #id) styleAttr :: Parser [Char] ParserState (Attr -> Attr) styleAttr = do - style <- try $ enclosed (char '{') (char '}') anyChar + style <- try $ enclosed (char '{') (char '}') anyChar' return $ \(id',classes,keyvals) -> (id',classes,("style",style):keyvals) langAttr :: Parser [Char] ParserState (Attr -> Attr) langAttr = do - lang <- try $ enclosed (char '[') (char ']') anyChar + lang <- try $ enclosed (char '[') (char ']') alphaNum return $ \(id',classes,keyvals) -> (id',classes,("lang",lang):keyvals) -- | Parses material surrounded by a parser. surrounded :: Parser [Char] st t -- ^ surrounding parser -> Parser [Char] st a -- ^ content parser (to be used repeatedly) -> Parser [Char] st [a] -surrounded border = enclosed (border *> notFollowedBy (oneOf " \t\n\r")) (try border) +surrounded border = + enclosed (border *> notFollowedBy (oneOf " \t\n\r")) (try border) --- | Inlines are most of the time of the same form simpleInline :: Parser [Char] ParserState t -- ^ surrounding parser - -> ([Inline] -> Inline) -- ^ Inline constructor - -> Parser [Char] ParserState Inline -- ^ content parser (to be used repeatedly) -simpleInline border construct = surrounded border (inlineWithAttribute) >>= - return . construct . normalizeSpaces - where inlineWithAttribute = (try $ optional attributes) >> inline + -> (Inlines -> Inlines) -- ^ Inline constructor + -> Parser [Char] ParserState Inlines -- ^ content parser (to be used repeatedly) +simpleInline border construct = try $ do + st <- getState + pos <- getPosition + let afterString = stateLastStrPos st == Just pos + guard $ not afterString + border *> notFollowedBy (oneOf " \t\n\r") + attr <- attributes + body <- trimInlines . mconcat <$> + withQuoteContext InSingleQuote + (manyTill inline (try border <* notFollowedBy alphaNum)) + return $ construct $ + if attr == nullAttr + then body + else B.spanWith attr body + +groupedInlineMarkup :: Parser [Char] ParserState Inlines +groupedInlineMarkup = try $ do + char '[' + sp1 <- option mempty $ B.space <$ whitespace + result <- withQuoteContext InSingleQuote inlineMarkup + sp2 <- option mempty $ B.space <$ whitespace + char ']' + return $ sp1 <> result <> sp2 -- | Create a singleton list singleton :: a -> [a] singleton x = [x] + diff --git a/src/Text/Pandoc/SelfContained.hs b/src/Text/Pandoc/SelfContained.hs index c4613992a..2a2f56281 100644 --- a/src/Text/Pandoc/SelfContained.hs +++ b/src/Text/Pandoc/SelfContained.hs @@ -1,6 +1,6 @@ {-# LANGUAGE OverloadedStrings #-} {- -Copyright (C) 2011 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2011-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.SelfContained - Copyright : Copyright (C) 2011 John MacFarlane + Copyright : Copyright (C) 2011-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -32,7 +32,7 @@ the HTML using data URIs. -} module Text.Pandoc.SelfContained ( makeSelfContained ) where import Text.HTML.TagSoup -import Network.URI (isAbsoluteURI, escapeURIString) +import Network.URI (isURI, escapeURIString) import Data.ByteString.Base64 import qualified Data.ByteString.Char8 as B import Data.ByteString (ByteString) @@ -40,7 +40,7 @@ import System.FilePath (takeExtension, dropExtension, takeDirectory, (</>)) import Data.Char (toLower, isAscii, isAlphaNum) import Codec.Compression.GZip as Gzip import qualified Data.ByteString.Lazy as L -import Text.Pandoc.Shared (renderTags', openURL, readDataFile) +import Text.Pandoc.Shared (renderTags', openURL, readDataFile, err) import Text.Pandoc.UTF8 (toString, fromString) import Text.Pandoc.MIME (getMimeType) import System.Directory (doesFileExist) @@ -50,14 +50,16 @@ isOk c = isAscii c && isAlphaNum c convertTag :: Maybe FilePath -> Tag String -> IO (Tag String) convertTag userdata t@(TagOpen tagname as) - | tagname `elem` ["img", "embed", "video", "input", "audio", "source"] = - case fromAttrib "src" t of - [] -> return t - src -> do - (raw, mime) <- getRaw userdata (fromAttrib "type" t) src - let enc = "data:" ++ mime ++ ";base64," ++ toString (encode raw) - return $ TagOpen tagname - (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"]) + | tagname `elem` ["img", "embed", "video", "input", "audio", "source"] = do + as' <- mapM processAttribute as + return $ TagOpen tagname as' + where processAttribute (x,y) = + if x == "src" || x == "href" || x == "poster" + then do + (raw, mime) <- getRaw userdata (fromAttrib "type" t) y + let enc = "data:" ++ mime ++ ";base64," ++ toString (encode raw) + return (x, enc) + else return (x,y) convertTag userdata t@(TagOpen "script" as) = case fromAttrib "src" t of [] -> return t @@ -86,7 +88,7 @@ cssURLs userdata d orig = "\"" -> B.takeWhile (/='"') $ B.drop 1 u "'" -> B.takeWhile (/='\'') $ B.drop 1 u _ -> u - let url' = if isAbsoluteURI url + let url' = if isURI url then url else d </> url (raw, mime) <- getRaw userdata "" url' @@ -97,8 +99,8 @@ cssURLs userdata d orig = getItem :: Maybe FilePath -> String -> IO (ByteString, Maybe String) getItem userdata f = - if isAbsoluteURI f - then openURL f + if isURI f + then openURL f >>= either handleErr return else do -- strip off trailing query or fragment part, if relative URL. -- this is needed for things like cmunrm.eot?#iefix, @@ -110,6 +112,7 @@ getItem userdata f = exists <- doesFileExist f' cont <- if exists then B.readFile f' else readDataFile userdata f' return (cont, mime) + where handleErr e = err 61 $ "Failed to retrieve " ++ f ++ "\n" ++ show e getRaw :: Maybe FilePath -> String -> String -> IO (ByteString, String) getRaw userdata mimetype src = do diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 09086da1f..5b0d9b6b4 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -1,6 +1,7 @@ -{-# LANGUAGE DeriveDataTypeable, CPP #-} +{-# LANGUAGE DeriveDataTypeable, CPP, MultiParamTypeClasses, + FlexibleContexts, ScopedTypeVariables #-} {- -Copyright (C) 2006-2013 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +20,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Shared - Copyright : Copyright (C) 2006-2013 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -34,6 +35,7 @@ module Text.Pandoc.Shared ( splitByIndices, splitStringByIndices, substitute, + ordNub, -- * Text processing backslashEscapes, escapeStringUsing, @@ -51,10 +53,12 @@ module Text.Pandoc.Shared ( -- * Pandoc block and inline list processing orderedListMarkers, normalizeSpaces, + extractSpaces, normalize, stringify, compactify, compactify', + compactify'DL, Element (..), hierarchicalize, uniqueIdent, @@ -79,8 +83,9 @@ module Text.Pandoc.Shared ( ) where import Text.Pandoc.Definition +import Text.Pandoc.Walk import Text.Pandoc.Generic -import Text.Pandoc.Builder (Blocks, ToMetaValue(..)) +import Text.Pandoc.Builder (Inlines, Blocks, ToMetaValue(..)) import qualified Text.Pandoc.Builder as B import qualified Text.Pandoc.UTF8 as UTF8 import System.Environment (getProgName) @@ -89,12 +94,15 @@ import Data.Char ( toLower, isLower, isUpper, isAlpha, isLetter, isDigit, isSpace ) import Data.List ( find, isPrefixOf, intercalate ) import qualified Data.Map as M -import Network.URI ( escapeURIString, isAbsoluteURI, unEscapeString ) +import Network.URI ( escapeURIString, isURI, nonStrictRelativeTo, + unEscapeString, parseURIReference ) +import qualified Data.Set as Set import System.Directory import Text.Pandoc.MIME (getMimeType) import System.FilePath ( (</>), takeExtension, dropExtension ) import Data.Generics (Typeable, Data) import qualified Control.Monad.State as S +import qualified Control.Exception as E import Control.Monad (msum, unless) import Text.Pandoc.Pretty (charWidth) import System.Locale (defaultTimeLocale) @@ -103,8 +111,10 @@ import System.IO (stderr) import Text.HTML.TagSoup (renderTagsOptions, RenderOptions(..), Tag(..), renderOptions) import qualified Data.ByteString as BS -import Data.ByteString.Lazy (toChunks) import qualified Data.ByteString.Char8 as B8 +import Text.Pandoc.Compat.Monoid +import Data.ByteString.Base64 (decodeLenient) +import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr) #ifdef EMBED_DATA_FILES import Text.Pandoc.Data (dataFiles) @@ -112,10 +122,16 @@ import System.FilePath ( joinPath, splitDirectories ) #else import Paths_pandoc (getDataFileName) #endif -#ifdef HTTP_CONDUIT -import Network.HTTP.Conduit (httpLbs, parseUrl, withManager, - responseBody, responseHeaders) +#ifdef HTTP_CLIENT +import Data.ByteString.Lazy (toChunks) +import Network.HTTP.Client (httpLbs, parseUrl, withManager, + responseBody, responseHeaders, + Request(port,host)) +import Network.HTTP.Client.Internal (addProxy) +import Network.HTTP.Client.TLS (tlsManagerSettings) +import System.Environment (getEnv) import Network.HTTP.Types.Header ( hContentType) +import Network (withSocketsDo) #else import Network.URI (parseURI) import Network.HTTP (findHeader, rspBody, @@ -162,6 +178,13 @@ substitute target replacement lst@(x:xs) = then replacement ++ substitute target replacement (drop (length target) lst) else x : substitute target replacement xs +ordNub :: (Ord a) => [a] -> [a] +ordNub l = go Set.empty l + where + go _ [] = [] + go s (x:xs) = if x `Set.member` s then go s xs + else x : go (Set.insert x s) xs + -- -- Text processing -- @@ -225,9 +248,9 @@ toRomanNumeral x = _ | x >= 50 -> "L" ++ toRomanNumeral (x - 50) _ | x >= 40 -> "XL" ++ toRomanNumeral (x - 40) _ | x >= 10 -> "X" ++ toRomanNumeral (x - 10) - _ | x >= 9 -> "IX" ++ toRomanNumeral (x - 5) + _ | x == 9 -> "IX" _ | x >= 5 -> "V" ++ toRomanNumeral (x - 5) - _ | x >= 4 -> "IV" ++ toRomanNumeral (x - 4) + _ | x == 4 -> "IV" _ | x >= 1 -> "I" ++ toRomanNumeral (x - 1) _ -> "" @@ -265,7 +288,7 @@ normalizeDate s = fmap (formatTime defaultTimeLocale "%F") (msum $ map (\fs -> parsetimeWith fs s) formats :: Maybe Day) where parsetimeWith = parseTime defaultTimeLocale formats = ["%x","%m/%d/%Y", "%D","%F", "%d %b %Y", - "%d %B %Y", "%b. %d, %Y", "%B %d, %Y"] + "%d %B %Y", "%b. %d, %Y", "%B %d, %Y", "%Y"] -- -- Pandoc block and inline list processing @@ -310,6 +333,20 @@ isSpaceOrEmpty Space = True isSpaceOrEmpty (Str "") = True isSpaceOrEmpty _ = False +-- | Extract the leading and trailing spaces from inside an inline element +-- and place them outside the element. + +extractSpaces :: (Inlines -> Inlines) -> Inlines -> Inlines +extractSpaces f is = + let contents = B.unMany is + left = case viewl contents of + (Space :< _) -> B.space + _ -> mempty + right = case viewr contents of + (_ :> Space) -> B.space + _ -> mempty in + (left <> f (B.trimInlines . B.Many $ contents) <> right) + -- | Normalize @Pandoc@ document, consolidating doubled 'Space's, -- combining adjacent 'Str's and 'Emph's, remove 'Null's and -- empty elements, etc. @@ -380,9 +417,11 @@ consolidateInlines (Code a1 x : Code a2 y : zs) | a1 == a2 = consolidateInlines (x : xs) = x : consolidateInlines xs consolidateInlines [] = [] --- | Convert list of inlines to a string with formatting removed. -stringify :: [Inline] -> String -stringify = queryWith go +-- | Convert pandoc structure to a string with formatting removed. +-- Footnotes are skipped (since we don't want their contents in link +-- labels). +stringify :: Walkable Inline a => a -> String +stringify = query go . walk deNote where go :: Inline -> [Char] go Space = " " go (Str x) = x @@ -390,6 +429,8 @@ stringify = queryWith go go (Math _ x) = x go LineBreak = " " go _ = "" + deNote (Note _) = Str "" + deNote x = x -- | Change final list item from @Para@ to @Plain@ if the list contains -- no other @Para@ blocks. @@ -422,6 +463,21 @@ compactify' items = _ -> items _ -> items +-- | Like @compactify'@, but akts on items of definition lists. +compactify'DL :: [(Inlines, [Blocks])] -> [(Inlines, [Blocks])] +compactify'DL items = + let defs = concatMap snd items + defBlocks = reverse $ concatMap B.toList defs + in case defBlocks of + (Para x:_) -> if not $ any isPara (drop 1 defBlocks) + then let (t,ds) = last items + lastDef = B.toList $ last ds + ds' = init ds ++ + [B.fromList $ init lastDef ++ [Plain x]] + in init items ++ [(t, ds')] + else items + _ -> items + isPara :: Block -> Bool isPara (Para _) = True isPara _ = False @@ -432,6 +488,29 @@ data Element = Blk Block -- lvl num attributes label contents deriving (Eq, Read, Show, Typeable, Data) +instance Walkable Inline Element where + walk f (Blk x) = Blk (walk f x) + walk f (Sec lev nums attr ils elts) = Sec lev nums attr (walk f ils) (walk f elts) + walkM f (Blk x) = Blk `fmap` walkM f x + walkM f (Sec lev nums attr ils elts) = do + ils' <- walkM f ils + elts' <- walkM f elts + return $ Sec lev nums attr ils' elts' + query f (Blk x) = query f x + query f (Sec _ _ _ ils elts) = query f ils <> query f elts + +instance Walkable Block Element where + walk f (Blk x) = Blk (walk f x) + walk f (Sec lev nums attr ils elts) = Sec lev nums attr (walk f ils) (walk f elts) + walkM f (Blk x) = Blk `fmap` walkM f x + walkM f (Sec lev nums attr ils elts) = do + ils' <- walkM f ils + elts' <- walkM f elts + return $ Sec lev nums attr ils' elts' + query f (Blk x) = query f x + query f (Sec _ _ _ ils elts) = query f ils <> query f elts + + -- | Convert Pandoc inline list to plain text identifier. HTML -- identifiers must start with a letter, and may contain only -- letters, digits, and the characters _-. @@ -492,14 +571,14 @@ isHeaderBlock _ = False -- | Shift header levels up or down. headerShift :: Int -> Pandoc -> Pandoc -headerShift n = bottomUp shift +headerShift n = walk shift where shift :: Block -> Block shift (Header level attr inner) = Header (level + n) attr inner shift x = x -- | Detect if a list is tight. isTightList :: [[Block]] -> Bool -isTightList = and . map firstIsPlain +isTightList = all firstIsPlain where firstIsPlain (Plain _ : _) = True firstIsPlain _ = False @@ -512,8 +591,10 @@ addMetaField :: ToMetaValue a -> Meta addMetaField key val (Meta meta) = Meta $ M.insertWith combine key (toMetaValue val) meta - where combine newval (MetaList xs) = MetaList (xs ++ [newval]) + where combine newval (MetaList xs) = MetaList (xs ++ tolist newval) combine newval x = MetaList [x, newval] + tolist (MetaList ys) = ys + tolist y = [y] -- | Create 'Meta' from old-style title, authors, date. This is -- provided to ease the transition from the old API. @@ -531,14 +612,10 @@ makeMeta title authors date = -- | Render HTML tags. renderTags' :: [Tag String] -> String renderTags' = renderTagsOptions - renderOptions{ optMinimize = \x -> - let y = map toLower x - in y == "hr" || y == "br" || - y == "img" || y == "meta" || - y == "link" - , optRawTag = \x -> - let y = map toLower x - in y == "script" || y == "style" } + renderOptions{ optMinimize = matchTags ["hr", "br", "img", + "meta", "link"] + , optRawTag = matchTags ["script", "style"] } + where matchTags = \tags -> flip elem tags . map toLower -- -- File handling @@ -557,8 +634,7 @@ readDefaultDataFile :: FilePath -> IO BS.ByteString readDefaultDataFile fname = #ifdef EMBED_DATA_FILES case lookup (makeCanonical fname) dataFiles of - Nothing -> ioError $ userError - $ "Data file `" ++ fname ++ "' does not exist" + Nothing -> err 97 $ "Could not find data file " ++ fname Just contents -> return contents where makeCanonical = joinPath . transformPathParts . splitDirectories transformPathParts = reverse . foldl go [] @@ -566,7 +642,12 @@ readDefaultDataFile fname = go (_:as) ".." = as go as x = x : as #else - getDataFileName ("data" </> fname) >>= BS.readFile + getDataFileName ("data" </> fname) >>= checkExistence >>= BS.readFile + where checkExistence fn = do + exists <- doesFileExist fn + if exists + then return fn + else err 97 ("Could not find data file " ++ fname) #endif -- | Read file from specified user data directory or, if not found there, from @@ -586,34 +667,45 @@ readDataFileUTF8 userDir fname = -- | Fetch an image or other item from the local filesystem or the net. -- Returns raw content and maybe mime type. -fetchItem :: String -> String -> IO (BS.ByteString, Maybe String) -fetchItem sourceDir s = - case s of - _ | isAbsoluteURI s -> openURL s - | isAbsoluteURI sourceDir -> openURL $ sourceDir ++ "/" ++ s - | otherwise -> do +fetchItem :: Maybe String -> String + -> IO (Either E.SomeException (BS.ByteString, Maybe String)) +fetchItem sourceURL s + | isURI s = openURL s + | otherwise = + case sourceURL >>= parseURIReference of + Just u -> case parseURIReference s of + Just s' -> openURL $ show $ + s' `nonStrictRelativeTo` u + Nothing -> openURL $ show u ++ "/" ++ s + Nothing -> E.try readLocalFile + where readLocalFile = do let mime = case takeExtension s of - ".gz" -> getMimeType $ dropExtension s - x -> getMimeType x - let f = sourceDir </> s - cont <- BS.readFile f + ".gz" -> getMimeType $ dropExtension s + x -> getMimeType x + cont <- BS.readFile s return (cont, mime) -- | Read from a URL and return raw data and maybe mime type. -openURL :: String -> IO (BS.ByteString, Maybe String) +openURL :: String -> IO (Either E.SomeException (BS.ByteString, Maybe String)) openURL u | "data:" `isPrefixOf` u = let mime = takeWhile (/=',') $ drop 5 u contents = B8.pack $ unEscapeString $ drop 1 $ dropWhile (/=',') u - in return (contents, Just mime) -#ifdef HTTP_CONDUIT - | otherwise = do + in return $ Right (decodeLenient contents, Just mime) +#ifdef HTTP_CLIENT + | otherwise = withSocketsDo $ E.try $ do req <- parseUrl u - resp <- withManager $ httpLbs req + (proxy :: Either E.SomeException String) <- E.try $ getEnv "http_proxy" + let req' = case proxy of + Left _ -> req + Right pr -> case parseUrl pr of + Just r -> addProxy (host r) (port r) req + Nothing -> req + resp <- withManager tlsManagerSettings $ httpLbs req' return (BS.concat $ toChunks $ responseBody resp, UTF8.toString `fmap` lookup hContentType (responseHeaders resp)) #else - | otherwise = getBodyAndMimeType `fmap` browse + | otherwise = E.try $ getBodyAndMimeType `fmap` browse (do S.liftIO $ UTF8.hPutStrLn stderr $ "Fetching " ++ u ++ "..." setOutHandler $ const (return ()) setAllowRedirects True @@ -651,5 +743,3 @@ safeRead s = case reads s of (d,x):_ | all isSpace x -> return d _ -> fail $ "Could not read `" ++ s ++ "'" - - diff --git a/src/Text/Pandoc/Slides.hs b/src/Text/Pandoc/Slides.hs index 2bbdb120f..2b863c780 100644 --- a/src/Text/Pandoc/Slides.hs +++ b/src/Text/Pandoc/Slides.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2012 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2012-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Slides - Copyright : Copyright (C) 2012 John MacFarlane + Copyright : Copyright (C) 2012-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -46,13 +46,18 @@ getSlideLevel = go 6 -- | Prepare a block list to be passed to hierarchicalize. prepSlides :: Int -> [Block] -> [Block] -prepSlides slideLevel = ensureStartWithH . splitHrule +prepSlides slideLevel = ensureStartWithH . splitHrule . extractRefsHeader where splitHrule (HorizontalRule : Header n attr xs : ys) | n == slideLevel = Header slideLevel attr xs : splitHrule ys splitHrule (HorizontalRule : xs) = Header slideLevel nullAttr [Str "\0"] : splitHrule xs splitHrule (x : xs) = x : splitHrule xs splitHrule [] = [] + extractRefsHeader bs = + case reverse bs of + (Div ("",["references"],[]) (Header n attrs xs : ys) : zs) + -> reverse zs ++ (Header n attrs xs : [Div ("",["references"],[]) ys]) + _ -> bs ensureStartWithH bs@(Header n _ _:_) | n <= slideLevel = bs ensureStartWithH bs = Header slideLevel nullAttr [Str "\0"] : bs diff --git a/src/Text/Pandoc/Templates.hs b/src/Text/Pandoc/Templates.hs index c95c84ca8..4ae6a6d8a 100644 --- a/src/Text/Pandoc/Templates.hs +++ b/src/Text/Pandoc/Templates.hs @@ -1,7 +1,7 @@ {-# LANGUAGE TypeSynonymInstances, FlexibleInstances, CPP, OverloadedStrings, GeneralizedNewtypeDeriving #-} {- -Copyright (C) 2009-2013 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2009-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,7 +20,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Templates - Copyright : Copyright (C) 2009-2013 John MacFarlane + Copyright : Copyright (C) 2009-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -96,14 +96,14 @@ module Text.Pandoc.Templates ( renderTemplate import Data.Char (isAlphaNum) import Control.Monad (guard, when) import Data.Aeson (ToJSON(..), Value(..)) -import qualified Data.Attoparsec.Text as A -import Data.Attoparsec.Text (Parser) +import qualified Text.Parsec as P +import Text.Parsec.Text (Parser) import Control.Applicative import qualified Data.Text as T import Data.Text (Text) import Data.Text.Encoding (encodeUtf8) -import Data.Monoid ((<>), Monoid(..)) -import Data.List (intersperse, nub) +import Text.Pandoc.Compat.Monoid ((<>), Monoid(..)) +import Data.List (intersperse) import System.FilePath ((</>), (<.>)) import qualified Data.Map as M import qualified Data.HashMap.Strict as H @@ -116,7 +116,8 @@ import Text.Blaze.Internal (preEscapedText) import Text.Blaze (preEscapedText, Html) #endif import Data.ByteString.Lazy (ByteString, fromChunks) -import Text.Pandoc.Shared (readDataFileUTF8) +import Text.Pandoc.Shared (readDataFileUTF8, ordNub) +import Data.Vector ((!?)) -- | Get default template for the specified writer. getDefaultTemplate :: (Maybe FilePath) -- ^ User data directory to search first @@ -162,7 +163,7 @@ varListToJSON assoc = toJSON $ M.fromList assoc' where assoc' = [(T.pack k, toVal [T.pack z | (y,z) <- assoc, not (null z), y == k]) - | k <- nub $ map fst assoc ] + | k <- ordNub $ map fst assoc ] toVal [x] = toJSON x toVal [] = Null toVal xs = toJSON xs @@ -171,7 +172,10 @@ renderTemplate :: (ToJSON a, TemplateTarget b) => Template -> a -> b renderTemplate (Template f) context = toTarget $ f $ toJSON context compileTemplate :: Text -> Either String Template -compileTemplate template = A.parseOnly pTemplate template +compileTemplate template = + case P.parse (pTemplate <* P.eof) "template" template of + Left e -> Left (show e) + Right x -> Right x -- | Like 'renderTemplate', but compiles the template first, -- raising an error if compilation fails. @@ -185,10 +189,11 @@ var = Template . resolveVar resolveVar :: Variable -> Value -> Text resolveVar var' val = case multiLookup var' val of - Just (Array vec) -> mconcat $ map (resolveVar []) $ toList vec + Just (Array vec) -> maybe mempty (resolveVar []) $ vec !? 0 Just (String t) -> T.stripEnd t Just (Number n) -> T.pack $ show n Just (Bool True) -> "true" + Just (Object _) -> "true" Just _ -> mempty Nothing -> mempty @@ -212,7 +217,7 @@ iter var' template sep = Template $ \val -> unTemplate Just (Array vec) -> mconcat $ intersperse sep $ map (setVar template var') $ toList vec - Just x -> setVar template var' x + Just x -> cond var' (setVar template var' x) mempty Nothing -> mempty) val setVar :: Template -> Variable -> Value -> Template @@ -228,7 +233,7 @@ replaceVar _ _ old = old pTemplate :: Parser Template pTemplate = do - sp <- A.option mempty pInitialSpace + sp <- P.option mempty pInitialSpace rest <- mconcat <$> many (pConditional <|> pFor <|> pNewline <|> @@ -237,40 +242,43 @@ pTemplate = do pEscapedDollar) return $ sp <> rest +takeWhile1 :: (Char -> Bool) -> Parser Text +takeWhile1 f = T.pack <$> P.many1 (P.satisfy f) + pLit :: Parser Template -pLit = lit <$> A.takeWhile1 (\x -> x /='$' && x /= '\n') +pLit = lit <$> takeWhile1 (\x -> x /='$' && x /= '\n') pNewline :: Parser Template pNewline = do - A.char '\n' - sp <- A.option mempty pInitialSpace + P.char '\n' + sp <- P.option mempty pInitialSpace return $ lit "\n" <> sp pInitialSpace :: Parser Template pInitialSpace = do - sps <- A.takeWhile1 (==' ') + sps <- takeWhile1 (==' ') let indentVar = if T.null sps then id else indent (T.length sps) - v <- A.option mempty $ indentVar <$> pVar + v <- P.option mempty $ indentVar <$> pVar return $ lit sps <> v pEscapedDollar :: Parser Template -pEscapedDollar = lit "$" <$ A.string "$$" +pEscapedDollar = lit "$" <$ P.try (P.string "$$") pVar :: Parser Template -pVar = var <$> (A.char '$' *> pIdent <* A.char '$') +pVar = var <$> (P.try $ P.char '$' *> pIdent <* P.char '$') pIdent :: Parser [Text] pIdent = do first <- pIdentPart - rest <- many (A.char '.' *> pIdentPart) + rest <- many (P.char '.' *> pIdentPart) return (first:rest) pIdentPart :: Parser Text -pIdentPart = do - first <- A.letter - rest <- A.takeWhile (\c -> isAlphaNum c || c == '_' || c == '-') +pIdentPart = P.try $ do + first <- P.letter + rest <- T.pack <$> P.many (P.satisfy (\c -> isAlphaNum c || c == '_' || c == '-')) let id' = T.singleton first <> rest guard $ id' `notElem` reservedWords return id' @@ -279,38 +287,38 @@ reservedWords :: [Text] reservedWords = ["else","endif","for","endfor","sep"] skipEndline :: Parser () -skipEndline = A.skipWhile (`elem` " \t") >> A.char '\n' >> return () +skipEndline = P.try $ P.skipMany (P.satisfy (`elem` " \t")) >> P.char '\n' >> return () pConditional :: Parser Template pConditional = do - A.string "$if(" + P.try $ P.string "$if(" id' <- pIdent - A.string ")$" + P.string ")$" -- if newline after the "if", then a newline after "endif" will be swallowed - multiline <- A.option False (True <$ skipEndline) + multiline <- P.option False (True <$ skipEndline) ifContents <- pTemplate - elseContents <- A.option mempty $ - do A.string "$else$" - when multiline $ A.option () skipEndline + elseContents <- P.option mempty $ P.try $ + do P.string "$else$" + when multiline $ P.option () skipEndline pTemplate - A.string "$endif$" - when multiline $ A.option () skipEndline + P.string "$endif$" + when multiline $ P.option () skipEndline return $ cond id' ifContents elseContents pFor :: Parser Template pFor = do - A.string "$for(" + P.try $ P.string "$for(" id' <- pIdent - A.string ")$" + P.string ")$" -- if newline after the "for", then a newline after "endfor" will be swallowed - multiline <- A.option False $ skipEndline >> return True + multiline <- P.option False $ skipEndline >> return True contents <- pTemplate - sep <- A.option mempty $ - do A.string "$sep$" - when multiline $ A.option () skipEndline + sep <- P.option mempty $ + do P.try $ P.string "$sep$" + when multiline $ P.option () skipEndline pTemplate - A.string "$endfor$" - when multiline $ A.option () skipEndline + P.string "$endfor$" + when multiline $ P.option () skipEndline return $ iter id' contents sep indent :: Int -> Template -> Template diff --git a/src/Text/Pandoc/UTF8.hs b/src/Text/Pandoc/UTF8.hs index 9fa743cd9..cf25de85b 100644 --- a/src/Text/Pandoc/UTF8.hs +++ b/src/Text/Pandoc/UTF8.hs @@ -1,5 +1,6 @@ +{-# LANGUAGE CPP #-} {- -Copyright (C) 2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2010-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.UTF8 - Copyright : Copyright (C) 2010 John MacFarlane + Copyright : Copyright (C) 2010-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -50,8 +51,7 @@ import System.IO hiding (readFile, writeFile, getContents, #if MIN_VERSION_base(4,6,0) import Prelude hiding (readFile, writeFile, getContents, putStr, putStrLn) #else -import Prelude hiding (readFile, writeFile, getContents, putStr, putStrLn, - catch) +import Prelude hiding (readFile, writeFile, getContents, putStr, putStrLn) #endif import qualified System.IO as IO import qualified Data.ByteString.Char8 as B diff --git a/src/Text/Pandoc/UUID.hs b/src/Text/Pandoc/UUID.hs index 082644eea..eebfe09d2 100644 --- a/src/Text/Pandoc/UUID.hs +++ b/src/Text/Pandoc/UUID.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2010-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.UUID - Copyright : Copyright (C) 2010 John MacFarlane + Copyright : Copyright (C) 2010-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Writers/AsciiDoc.hs b/src/Text/Pandoc/Writers/AsciiDoc.hs index 6c3c6955e..19112d8f5 100644 --- a/src/Text/Pandoc/Writers/AsciiDoc.hs +++ b/src/Text/Pandoc/Writers/AsciiDoc.hs @@ -1,6 +1,6 @@ {-# LANGUAGE OverloadedStrings #-} {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.AsciiDoc - Copyright : Copyright (C) 2006-2010 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -132,7 +132,9 @@ blockToAsciiDoc opts (Para inlines) = do then text "\\" else empty return $ esc <> contents <> blankline -blockToAsciiDoc _ (RawBlock _ _) = return empty +blockToAsciiDoc _ (RawBlock f s) + | f == "asciidoc" = return $ text s + | otherwise = return empty blockToAsciiDoc _ HorizontalRule = return $ blankline <> text "'''''" <> blankline blockToAsciiDoc opts (Header level (ident,_,_) inlines) = do @@ -215,7 +217,9 @@ blockToAsciiDoc opts (Table caption aligns widths headers rows) = do let makeCell [Plain x] = do d <- blockListToAsciiDoc opts [Plain x] return $ text "|" <> chomp d makeCell [Para x] = makeCell [Plain x] - makeCell _ = return $ text "|" <> "[multiblock cell omitted]" + makeCell [] = return $ text "|" + makeCell bs = do d <- blockListToAsciiDoc opts bs + return $ text "a|" $$ d let makeRow cells = hsep `fmap` mapM makeCell cells rows' <- mapM makeRow rows head' <- makeRow headers @@ -225,7 +229,7 @@ blockToAsciiDoc opts (Table caption aligns widths headers rows) = do else 100000 let maxwidth = maximum $ map offset (head':rows') let body = if maxwidth > colwidth then vsep rows' else vcat rows' - let border = text $ "|" ++ replicate ((min maxwidth colwidth) - 1) '=' + let border = text $ "|" ++ replicate (max 5 (min maxwidth colwidth) - 1) '=' return $ caption'' $$ tablespec $$ border $$ head'' $$ body $$ border $$ blankline blockToAsciiDoc opts (BulletList items) = do @@ -246,6 +250,7 @@ blockToAsciiDoc opts (OrderedList (_start, sty, _delim) items) = do blockToAsciiDoc opts (DefinitionList items) = do contents <- mapM (definitionListItemToAsciiDoc opts) items return $ cat contents <> blankline +blockToAsciiDoc opts (Div _ bs) = blockListToAsciiDoc opts bs -- | Convert bullet list item (list of blocks) to asciidoc. bulletListItemToAsciiDoc :: WriterOptions -> [Block] -> State WriterState Doc @@ -346,7 +351,9 @@ inlineToAsciiDoc _ (Math InlineMath str) = return $ "latexmath:[$" <> text str <> "$]" inlineToAsciiDoc _ (Math DisplayMath str) = return $ "latexmath:[\\[" <> text str <> "\\]]" -inlineToAsciiDoc _ (RawInline _ _) = return empty +inlineToAsciiDoc _ (RawInline f s) + | f == "asciidoc" = return $ text s + | otherwise = return empty inlineToAsciiDoc _ (LineBreak) = return $ " +" <> cr inlineToAsciiDoc _ Space = return space inlineToAsciiDoc opts (Cite _ lst) = inlineListToAsciiDoc opts lst @@ -383,3 +390,4 @@ inlineToAsciiDoc opts (Note [Plain inlines]) = do return $ text "footnote:[" <> contents <> "]" -- asciidoc can't handle blank lines in notes inlineToAsciiDoc _ (Note _) = return "[multiblock footnote omitted]" +inlineToAsciiDoc opts (Span _ ils) = inlineListToAsciiDoc opts ils diff --git a/src/Text/Pandoc/Writers/ConTeXt.hs b/src/Text/Pandoc/Writers/ConTeXt.hs index 32588dc8f..3b321cc19 100644 --- a/src/Text/Pandoc/Writers/ConTeXt.hs +++ b/src/Text/Pandoc/Writers/ConTeXt.hs @@ -1,6 +1,6 @@ {-# LANGUAGE OverloadedStrings #-} {- -Copyright (C) 2007-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2007-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.ConTeXt - Copyright : Copyright (C) 2007-2010 John MacFarlane + Copyright : Copyright (C) 2007-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -33,9 +33,9 @@ import Text.Pandoc.Definition import Text.Pandoc.Shared import Text.Pandoc.Writers.Shared import Text.Pandoc.Options -import Text.Pandoc.Generic (queryWith) +import Text.Pandoc.Walk (query) import Text.Printf ( printf ) -import Data.List ( intercalate, isPrefixOf ) +import Data.List ( intercalate ) import Control.Monad.State import Text.Pandoc.Pretty import Text.Pandoc.Templates ( renderTemplate' ) @@ -130,7 +130,7 @@ blockToConTeXt (Plain lst) = inlineListToConTeXt lst -- title beginning with fig: indicates that the image is a figure blockToConTeXt (Para [Image txt (src,'f':'i':'g':':':_)]) = do capt <- inlineListToConTeXt txt - return $ blankline $$ "\\placefigure[here,nonumber]" <> braces capt <> + return $ blankline $$ "\\placefigure" <> braces capt <> braces ("\\externalfigure" <> brackets (text src)) <> blankline blockToConTeXt (Para lst) = do contents <- inlineListToConTeXt lst @@ -143,6 +143,7 @@ blockToConTeXt (CodeBlock _ str) = -- blankline because \stoptyping can't have anything after it, inc. '}' blockToConTeXt (RawBlock "context" str) = return $ text str <> blankline blockToConTeXt (RawBlock _ _ ) = return empty +blockToConTeXt (Div _ bs) = blockListToConTeXt bs blockToConTeXt (BulletList lst) = do contents <- mapM listItemToConTeXt lst return $ ("\\startitemize" <> if isTightList lst @@ -204,9 +205,9 @@ blockToConTeXt (Table caption aligns widths heads rows) = do else liftM ($$ "\\HL") $ tableRowToConTeXt heads captionText <- inlineListToConTeXt caption rows' <- mapM tableRowToConTeXt rows - return $ "\\placetable" <> brackets ("here" <> if null caption - then ",none" - else "") + return $ "\\placetable" <> (if null caption + then brackets "none" + else empty) <> braces captionText $$ "\\starttable" <> brackets (text colDescriptors) $$ "\\HL" $$ headers $$ @@ -282,14 +283,6 @@ inlineToConTeXt (RawInline "tex" str) = return $ text str inlineToConTeXt (RawInline _ _) = return empty inlineToConTeXt (LineBreak) = return $ text "\\crlf" <> cr inlineToConTeXt Space = return space --- autolink -inlineToConTeXt (Link [Str str] (src, tit)) - | if "mailto:" `isPrefixOf` src - then src == escapeURI ("mailto:" ++ str) - else src == escapeURI str = - inlineToConTeXt (Link - [RawInline "context" "\\hyphenatedurl{", Str str, RawInline "context" "}"] - (src, tit)) -- Handle HTML-like internal document references to sections inlineToConTeXt (Link txt (('#' : ref), _)) = do opts <- gets stOptions @@ -304,6 +297,7 @@ inlineToConTeXt (Link txt (('#' : ref), _)) = do <> brackets (text ref) inlineToConTeXt (Link txt (src, _)) = do + let isAutolink = txt == [Str src] st <- get let next = stNextRef st put $ st {stNextRef = next + 1} @@ -312,8 +306,9 @@ inlineToConTeXt (Link txt (src, _)) = do return $ "\\useURL" <> brackets (text ref) <> brackets (text $ escapeStringUsing [('#',"\\#"),('%',"\\%")] src) - <> brackets empty - <> brackets label + <> (if isAutolink + then empty + else brackets empty <> brackets label) <> "\\from" <> brackets (text ref) inlineToConTeXt (Image _ (src, _)) = do @@ -325,11 +320,12 @@ inlineToConTeXt (Note contents) = do contents' <- blockListToConTeXt contents let codeBlock x@(CodeBlock _ _) = [x] codeBlock _ = [] - let codeBlocks = queryWith codeBlock contents + let codeBlocks = query codeBlock contents return $ if null codeBlocks then text "\\footnote{" <> nest 2 contents' <> char '}' else text "\\startbuffer " <> nest 2 contents' <> text "\\stopbuffer\\footnote{\\getbuffer}" +inlineToConTeXt (Span _ ils) = inlineListToConTeXt ils -- | Craft the section header, inserting the secton reference, if supplied. sectionHeader :: Attr diff --git a/src/Text/Pandoc/Writers/Custom.hs b/src/Text/Pandoc/Writers/Custom.hs index 732497616..88f590c43 100644 --- a/src/Text/Pandoc/Writers/Custom.hs +++ b/src/Text/Pandoc/Writers/Custom.hs @@ -1,6 +1,6 @@ {-# LANGUAGE OverlappingInstances, FlexibleInstances, OverloadedStrings #-} {-# OPTIONS_GHC -fno-warn-orphans #-} -{- Copyright (C) 2012 John MacFarlane <jgm@berkeley.edu> +{- Copyright (C) 2012-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.Custom - Copyright : Copyright (C) 2012 John MacFarlane + Copyright : Copyright (C) 2012-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -33,6 +33,7 @@ module Text.Pandoc.Writers.Custom ( writeCustom ) where import Text.Pandoc.Definition import Text.Pandoc.Options import Data.List ( intersperse ) +import Data.Char ( toLower ) import Scripting.Lua (LuaState, StackValue, callfunc) import qualified Scripting.Lua as Lua import Text.Pandoc.UTF8 (fromString, toString) @@ -78,6 +79,11 @@ instance StackValue a => StackValue [a] where return (Just lst) valuetype _ = Lua.TTABLE +instance StackValue Format where + push lua (Format f) = Lua.push lua (map toLower f) + peek l n = fmap Format `fmap` Lua.peek l n + valuetype _ = Lua.TSTRING + instance (StackValue a, StackValue b) => StackValue (M.Map a b) where push lua m = do let xs = M.toList m @@ -110,12 +116,14 @@ instance StackValue [Block] where instance StackValue MetaValue where push l (MetaMap m) = Lua.push l m push l (MetaList xs) = Lua.push l xs + push l (MetaBool x) = Lua.push l x push l (MetaString s) = Lua.push l s push l (MetaInlines ils) = Lua.push l ils push l (MetaBlocks bs) = Lua.push l bs peek _ _ = undefined valuetype (MetaMap _) = Lua.TTABLE valuetype (MetaList _) = Lua.TTABLE + valuetype (MetaBool _) = Lua.TBOOLEAN valuetype (MetaString _) = Lua.TSTRING valuetype (MetaInlines _) = Lua.TSTRING valuetype (MetaBlocks _) = Lua.TSTRING @@ -123,7 +131,7 @@ instance StackValue MetaValue where -- | Convert Pandoc to custom markup. writeCustom :: FilePath -> WriterOptions -> Pandoc -> IO String writeCustom luaFile opts doc = do - luaScript <- readFile luaFile + luaScript <- C8.unpack `fmap` C8.readFile luaFile lua <- Lua.newstate Lua.openlibs lua Lua.loadstring lua luaScript "custom" @@ -176,6 +184,9 @@ blockToCustom lua (OrderedList (num,sty,delim) items) = blockToCustom lua (DefinitionList items) = callfunc lua "DefinitionList" items +blockToCustom lua (Div attr items) = + callfunc lua "Div" items (attrToMap attr) + -- | Convert list of Pandoc block elements to Custom. blockListToCustom :: LuaState -- ^ Options -> [Block] -- ^ List of block elements @@ -238,3 +249,5 @@ inlineToCustom lua (Image alt (src,tit)) = inlineToCustom lua (Note contents) = callfunc lua "Note" contents +inlineToCustom lua (Span attr items) = + callfunc lua "Span" items (attrToMap attr) diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs index 6f4b61a79..ba6a92a08 100644 --- a/src/Text/Pandoc/Writers/Docbook.hs +++ b/src/Text/Pandoc/Writers/Docbook.hs @@ -1,5 +1,6 @@ +{-# LANGUAGE OverloadedStrings #-} {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.Docbook - Copyright : Copyright (C) 2006-2010 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -31,12 +32,14 @@ module Text.Pandoc.Writers.Docbook ( writeDocbook) where import Text.Pandoc.Definition import Text.Pandoc.XML import Text.Pandoc.Shared +import Text.Pandoc.Walk import Text.Pandoc.Writers.Shared import Text.Pandoc.Options import Text.Pandoc.Templates (renderTemplate') import Text.Pandoc.Readers.TeXMath import Data.List ( isPrefixOf, intercalate, isSuffixOf ) import Data.Char ( toLower ) +import Data.Monoid ( Any(..) ) import Text.Pandoc.Highlighting ( languages, languagesByExtension ) import Text.Pandoc.Pretty import qualified Text.Pandoc.Builder as B @@ -84,8 +87,9 @@ writeDocbook opts (Pandoc meta blocks) = auths' = map (authorToDocbook opts) $ docAuthors meta meta' = B.setMeta "author" auths' meta Just metadata = metaToJSON opts - (Just . render colwidth . blocksToDocbook opts) - (Just . render colwidth . inlinesToDocbook opts) + (Just . render colwidth . (vcat . + (map (elementToDocbook opts' startLvl)) . hierarchicalize)) + (Just . render colwidth . inlinesToDocbook opts') meta' main = render' $ vcat (map (elementToDocbook opts' startLvl) elements) context = defField "body" main @@ -148,6 +152,7 @@ listItemToDocbook opts item = -- | Convert a Pandoc block element to Docbook. blockToDocbook :: WriterOptions -> Block -> Doc blockToDocbook _ Null = empty +blockToDocbook opts (Div _ bs) = blocksToDocbook opts $ map plainToPara bs blockToDocbook _ (Header _ _ _) = empty -- should not occur after hierarchicalize blockToDocbook opts (Plain lst) = inlinesToDocbook opts lst -- title beginning with fig: indicates that the image is a figure @@ -162,8 +167,9 @@ blockToDocbook opts (Para [Image txt (src,'f':'i':'g':':':_)]) = (inTagsIndented "imageobject" (selfClosingTag "imagedata" [("fileref",src)])) $$ inTagsSimple "textobject" (inTagsSimple "phrase" alt)) -blockToDocbook opts (Para lst) = - inTagsIndented "para" $ inlinesToDocbook opts lst +blockToDocbook opts (Para lst) + | hasLineBreaks lst = flush $ nowrap $ inTagsSimple "literallayout" $ inlinesToDocbook opts lst + | otherwise = inTagsIndented "para" $ inlinesToDocbook opts lst blockToDocbook opts (BlockQuote blocks) = inTagsIndented "blockquote" $ blocksToDocbook opts blocks blockToDocbook _ (CodeBlock (_,classes,_) str) = @@ -179,10 +185,11 @@ blockToDocbook _ (CodeBlock (_,classes,_) str) = else languagesByExtension . map toLower $ s langs = concatMap langsFrom classes blockToDocbook opts (BulletList lst) = - inTagsIndented "itemizedlist" $ listItemsToDocbook opts lst + let attribs = [("spacing", "compact") | isTightList lst] + in inTags True "itemizedlist" attribs $ listItemsToDocbook opts lst blockToDocbook _ (OrderedList _ []) = empty blockToDocbook opts (OrderedList (start, numstyle, _) (first:rest)) = - let attribs = case numstyle of + let numeration = case numstyle of DefaultStyle -> [] Decimal -> [("numeration", "arabic")] Example -> [("numeration", "arabic")] @@ -190,18 +197,21 @@ blockToDocbook opts (OrderedList (start, numstyle, _) (first:rest)) = LowerAlpha -> [("numeration", "loweralpha")] UpperRoman -> [("numeration", "upperroman")] LowerRoman -> [("numeration", "lowerroman")] - items = if start == 1 - then listItemsToDocbook opts (first:rest) - else (inTags True "listitem" [("override",show start)] - (blocksToDocbook opts $ map plainToPara first)) $$ - listItemsToDocbook opts rest + spacing = [("spacing", "compact") | isTightList (first:rest)] + attribs = numeration ++ spacing + items = if start == 1 + then listItemsToDocbook opts (first:rest) + else (inTags True "listitem" [("override",show start)] + (blocksToDocbook opts $ map plainToPara first)) $$ + listItemsToDocbook opts rest in inTags True "orderedlist" attribs items blockToDocbook opts (DefinitionList lst) = - inTagsIndented "variablelist" $ deflistItemsToDocbook opts lst -blockToDocbook _ (RawBlock "docbook" str) = text str -- raw XML block --- we allow html for compatibility with earlier versions of pandoc -blockToDocbook _ (RawBlock "html" str) = text str -- raw XML block -blockToDocbook _ (RawBlock _ _) = empty + let attribs = [("spacing", "compact") | isTightList $ concatMap snd lst] + in inTags True "variablelist" attribs $ deflistItemsToDocbook opts lst +blockToDocbook _ (RawBlock f str) + | f == "docbook" = text str -- raw XML block + | f == "html" = text str -- allow html for backwards compatibility + | otherwise = empty blockToDocbook _ HorizontalRule = empty -- not semantic blockToDocbook opts (Table caption aligns widths headers rows) = let captionDoc = if null caption @@ -223,6 +233,16 @@ blockToDocbook opts (Table caption aligns widths headers rows) = (inTags True "tgroup" [("cols", show (length headers))] $ coltags $$ head' $$ body') +hasLineBreaks :: [Inline] -> Bool +hasLineBreaks = getAny . query isLineBreak . walk removeNote + where + removeNote :: Inline -> Inline + removeNote (Note _) = Str "" + removeNote x = x + isLineBreak :: Inline -> Any + isLineBreak LineBreak = Any True + isLineBreak _ = Any False + alignmentToString :: Alignment -> [Char] alignmentToString alignment = case alignment of AlignLeft -> "left" @@ -267,6 +287,8 @@ inlineToDocbook opts (Quoted _ lst) = inTagsSimple "quote" $ inlinesToDocbook opts lst inlineToDocbook opts (Cite _ lst) = inlinesToDocbook opts lst +inlineToDocbook opts (Span _ ils) = + inlinesToDocbook opts ils inlineToDocbook _ (Code _ str) = inTagsSimple "literal" $ text (escapeStringForXML str) inlineToDocbook opts (Math t str) @@ -277,8 +299,8 @@ inlineToDocbook opts (Math t str) $ fixNS $ removeAttr r Left _ -> inlinesToDocbook opts - $ readTeXMath str - | otherwise = inlinesToDocbook opts $ readTeXMath str + $ readTeXMath' t str + | otherwise = inlinesToDocbook opts $ readTeXMath' t str where (dt, tagtype) = case t of InlineMath -> (DisplayInline,"inlineequation") DisplayMath -> (DisplayBlock,"informalequation") @@ -288,7 +310,7 @@ inlineToDocbook opts (Math t str) fixNS = everywhere (mkT fixNS') inlineToDocbook _ (RawInline f x) | f == "html" || f == "docbook" = text x | otherwise = empty -inlineToDocbook _ LineBreak = flush $ inTagsSimple "literallayout" (text "\n") +inlineToDocbook _ LineBreak = text "\n" inlineToDocbook _ Space = space inlineToDocbook opts (Link txt (src, _)) = if isPrefixOf "mailto:" src diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs index e899200f6..31e64f14e 100644 --- a/src/Text/Pandoc/Writers/Docx.hs +++ b/src/Text/Pandoc/Writers/Docx.hs @@ -1,6 +1,6 @@ {-# LANGUAGE ScopedTypeVariables #-} {- -Copyright (C) 2012 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2012-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.Docx - Copyright : Copyright (C) 2012 John MacFarlane + Copyright : Copyright (C) 2012-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -29,22 +29,25 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of 'Pandoc' documents to docx. -} module Text.Pandoc.Writers.Docx ( writeDocx ) where -import Data.List ( intercalate, groupBy ) +import Data.Maybe (fromMaybe) +import Data.List ( intercalate, isPrefixOf, isSuffixOf ) import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy.Char8 as BL8 import qualified Data.Map as M import qualified Text.Pandoc.UTF8 as UTF8 -import Data.Monoid ((<>)) +import Text.Pandoc.Compat.Monoid ((<>)) import Codec.Archive.Zip import Data.Time.Clock.POSIX import Text.Pandoc.Definition import Text.Pandoc.Generic import Text.Pandoc.ImageSize import Text.Pandoc.Shared hiding (Element) +import Text.Pandoc.Writers.Shared (fixDisplayMath) import Text.Pandoc.Options import Text.Pandoc.Readers.TeXMath import Text.Pandoc.Highlighting ( highlight ) +import Text.Pandoc.Walk import Text.Highlighting.Kate.Types () import Text.XML.Light import Text.TeXMath @@ -54,8 +57,33 @@ import Data.Unique (hashUnique, newUnique) import System.Random (randomRIO) import Text.Printf (printf) import qualified Control.Exception as E -import System.FilePath (takeExtension) -import Text.Pandoc.MIME (getMimeType) +import Text.Pandoc.MIME (getMimeType, extensionFromMimeType) +import Control.Applicative ((<|>)) +import Data.Maybe (mapMaybe) + +data ListMarker = NoMarker + | BulletMarker + | NumberMarker ListNumberStyle ListNumberDelim Int + deriving (Show, Read, Eq, Ord) + +listMarkerToId :: ListMarker -> String +listMarkerToId NoMarker = "990" +listMarkerToId BulletMarker = "991" +listMarkerToId (NumberMarker sty delim n) = + '9' : '9' : styNum : delimNum : show n + where styNum = case sty of + DefaultStyle -> '2' + Example -> '3' + Decimal -> '4' + LowerRoman -> '5' + UpperRoman -> '6' + LowerAlpha -> '7' + UpperAlpha -> '8' + delimNum = case delim of + DefaultDelim -> '0' + Period -> '1' + OneParen -> '2' + TwoParens -> '3' data WriterState = WriterState{ stTextProperties :: [Element] @@ -66,15 +94,9 @@ data WriterState = WriterState{ , stImages :: M.Map FilePath (String, String, Maybe String, Element, B.ByteString) , stListLevel :: Int , stListNumId :: Int - , stNumStyles :: M.Map ListMarker Int , stLists :: [ListMarker] } -data ListMarker = NoMarker - | BulletMarker - | NumberMarker ListNumberStyle ListNumberDelim Int - deriving (Show, Read, Eq, Ord) - defaultWriterState :: WriterState defaultWriterState = WriterState{ stTextProperties = [] @@ -85,7 +107,6 @@ defaultWriterState = WriterState{ , stImages = M.empty , stListLevel = -1 , stListNumId = 1 - , stNumStyles = M.fromList [(NoMarker, 0)] , stLists = [NoMarker] } @@ -108,17 +129,52 @@ writeDocx :: WriterOptions -- ^ Writer options -> IO BL.ByteString writeDocx opts doc@(Pandoc meta _) = do let datadir = writerUserDataDir opts - let doc' = bottomUp (concatMap fixDisplayMath) doc + let doc' = walk fixDisplayMath doc refArchive <- liftM (toArchive . toLazy) $ case writerReferenceDocx opts of Just f -> B.readFile f Nothing -> readDataFile datadir "reference.docx" + distArchive <- liftM (toArchive . toLazy) $ readDataFile Nothing "reference.docx" ((contents, footnotes), st) <- runStateT (writeOpenXML opts{writerWrapText = False} doc') defaultWriterState epochtime <- floor `fmap` getPOSIXTime let imgs = M.elems $ stImages st + -- create entries for images in word/media/... + let toImageEntry (_,path,_,_,img) = toEntry ("word/" ++ path) epochtime $ toLazy img + let imageEntries = map toImageEntry imgs + + -- adjust contents to add sectPr from reference.docx + parsedDoc <- parseXml refArchive distArchive "word/document.xml" + let wname f qn = qPrefix qn == Just "w" && f (qName qn) + let mbsectpr = filterElementName (wname (=="sectPr")) parsedDoc + + let sectpr = maybe (mknode "w:sectPr" [] $ ()) id mbsectpr + + let stdAttributes = + [("xmlns:w","http://schemas.openxmlformats.org/wordprocessingml/2006/main") + ,("xmlns:m","http://schemas.openxmlformats.org/officeDocument/2006/math") + ,("xmlns:r","http://schemas.openxmlformats.org/officeDocument/2006/relationships") + ,("xmlns:o","urn:schemas-microsoft-com:office:office") + ,("xmlns:v","urn:schemas-microsoft-com:vml") + ,("xmlns:w10","urn:schemas-microsoft-com:office:word") + ,("xmlns:a","http://schemas.openxmlformats.org/drawingml/2006/main") + ,("xmlns:pic","http://schemas.openxmlformats.org/drawingml/2006/picture") + ,("xmlns:wp","http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing")] + + let contents' = contents ++ [sectpr] + let docContents = mknode "w:document" stdAttributes + $ mknode "w:body" [] $ contents' + + parsedRels <- parseXml refArchive distArchive "word/_rels/document.xml.rels" + let isHeaderNode e = findAttr (QName "Type" Nothing Nothing) e == Just "http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" + let isFooterNode e = findAttr (QName "Type" Nothing Nothing) e == Just "http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer" + let headers = filterElements isHeaderNode parsedRels + let footers = filterElements isFooterNode parsedRels + + let extractTarget e = findAttr (QName "Target" Nothing Nothing) e + -- we create [Content_Types].xml and word/_rels/document.xml.rels -- from scratch rather than reading from reference.docx, -- because Word sometimes changes these files when a reference.docx is modified, @@ -129,8 +185,12 @@ writeDocx opts doc@(Pandoc meta _) = do let mkOverrideNode (part', contentType') = mknode "Override" [("PartName",part'),("ContentType",contentType')] () let mkImageOverride (_, imgpath, mbMimeType, _, _) = - mkOverrideNode ("/word/" ++ imgpath, maybe "application/octet-stream" id mbMimeType) - let overrides = map mkOverrideNode + mkOverrideNode ("/word/" ++ imgpath, + fromMaybe "application/octet-stream" mbMimeType) + let mkMediaOverride imgpath = mkOverrideNode ('/':imgpath, + fromMaybe "application/octet-stream" + $ getMimeType imgpath) + let overrides = map mkOverrideNode ( [("/word/webSettings.xml", "application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml") ,("/word/numbering.xml", @@ -151,7 +211,15 @@ writeDocx opts doc@(Pandoc meta _) = do "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml") ,("/word/footnotes.xml", "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml") - ] ++ map mkImageOverride imgs + ] ++ + map (\x -> (maybe "" ("/word/" ++) $ extractTarget x, + "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml")) headers ++ + map (\x -> (maybe "" ("/word/" ++) $ extractTarget x, + "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml")) footers) ++ + map mkImageOverride imgs ++ + map mkMediaOverride [ eRelativePath e | e <- zEntries refArchive + , "word/media/" `isPrefixOf` eRelativePath e ] + let defaultnodes = [mknode "Default" [("Extension","xml"),("ContentType","application/xml")] (), mknode "Default" @@ -186,7 +254,9 @@ writeDocx opts doc@(Pandoc meta _) = do "theme/theme1.xml") ,("http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes", "rId7", - "footnotes.xml")] + "footnotes.xml") + ] ++ + headers ++ footers let toImgRel (ident,path,_,_,_) = mknode "Relationship" [("Type","http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"),("Id",ident),("Target",path)] () let imgrels = map toImgRel imgs let toLinkRel (src,ident) = mknode "Relationship" [("Type","http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"),("Id",ident),("Target",src),("TargetMode","External") ] () @@ -195,15 +265,14 @@ writeDocx opts doc@(Pandoc meta _) = do let relEntry = toEntry "word/_rels/document.xml.rels" epochtime $ renderXml reldoc - -- create entries for images in word/media/... - let toImageEntry (_,path,_,_,img) = toEntry ("word/" ++ path) epochtime $ toLazy img - let imageEntries = map toImageEntry imgs -- word/document.xml - let contentEntry = toEntry "word/document.xml" epochtime $ renderXml contents + let contentEntry = toEntry "word/document.xml" epochtime + $ renderXml docContents -- footnotes - let footnotesEntry = toEntry "word/footnotes.xml" epochtime $ renderXml footnotes + let notes = mknode "w:footnotes" stdAttributes footnotes + let footnotesEntry = toEntry "word/footnotes.xml" epochtime $ renderXml notes -- footnote rels let footnoteRelEntry = toEntry "word/_rels/footnotes.xml.rels" epochtime @@ -213,14 +282,22 @@ writeDocx opts doc@(Pandoc meta _) = do -- styles let newstyles = styleToOpenXml $ writerHighlightStyle opts let stylepath = "word/styles.xml" - styledoc <- parseXml refArchive stylepath - let styledoc' = styledoc{ elContent = elContent styledoc ++ map Elem newstyles } + styledoc <- parseXml refArchive distArchive stylepath + let styledoc' = styledoc{ elContent = elContent styledoc ++ + [Elem x | x <- newstyles, writerHighlight opts] } let styleEntry = toEntry stylepath epochtime $ renderXml styledoc' -- construct word/numbering.xml let numpath = "word/numbering.xml" - numEntry <- (toEntry numpath epochtime . renderXml) - `fmap` mkNumbering (stNumStyles st) (stLists st) + numbering <- parseXml refArchive distArchive numpath + newNumElts <- mkNumbering (stLists st) + let allElts = onlyElems (elContent numbering) ++ newNumElts + let numEntry = toEntry numpath epochtime $ renderXml numbering{ elContent = + -- we want all the abstractNums first, then the nums, + -- otherwise things break: + [Elem e | e <- allElts + , qName (elName e) == "abstractNum" ] ++ + [Elem e | e <- allElts, qName (elName e) == "num" ] } let docPropsPath = "docProps/core.xml" let docProps = mknode "cp:coreProperties" [("xmlns:cp","http://schemas.openxmlformats.org/package/2006/metadata/core-properties") @@ -229,10 +306,11 @@ writeDocx opts doc@(Pandoc meta _) = do ,("xmlns:dcmitype","http://purl.org/dc/dcmitype/") ,("xmlns:xsi","http://www.w3.org/2001/XMLSchema-instance")] $ mknode "dc:title" [] (stringify $ docTitle meta) - : mknode "dcterms:created" [("xsi:type","dcterms:W3CDTF")] - (maybe "" id $ normalizeDate $ stringify $ docDate meta) - : mknode "dcterms:modified" [("xsi:type","dcterms:W3CDTF")] () -- put current time here - : map (mknode "dc:creator" [] . stringify) (docAuthors meta) + : mknode "dc:creator" [] (intercalate "; " (map stringify $ docAuthors meta)) + : maybe [] + (\x -> [ mknode "dcterms:created" [("xsi:type","dcterms:W3CDTF")] $ x + , mknode "dcterms:modified" [("xsi:type","dcterms:W3CDTF")] $ x + ]) (normalizeDate $ stringify $ docDate meta) let docPropsEntry = toEntry docPropsPath epochtime $ renderXml docProps let relsPath = "_rels/.rels" @@ -245,24 +323,41 @@ writeDocx opts doc@(Pandoc meta _) = do ,("Type","http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties") ,("Target","docProps/app.xml")] , [("Id","rId3") - ,("Type","http://schemas.openxmlformats.org/officedocument/2006/relationships/metadata/core-properties") + ,("Type","http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties") ,("Target","docProps/core.xml")] ] let relsEntry = toEntry relsPath epochtime $ renderXml rels - let entryFromArchive path = (toEntry path epochtime . renderXml) `fmap` - parseXml refArchive path - docPropsAppEntry <- entryFromArchive "docProps/app.xml" - themeEntry <- entryFromArchive "word/theme/theme1.xml" - fontTableEntry <- entryFromArchive "word/fontTable.xml" - webSettingsEntry <- entryFromArchive "word/webSettings.xml" + let entryFromArchive arch path = + maybe (fail $ path ++ " corrupt or missing in reference docx") + return + (findEntryByPath path arch `mplus` findEntryByPath path distArchive) + docPropsAppEntry <- entryFromArchive refArchive "docProps/app.xml" + themeEntry <- entryFromArchive refArchive "word/theme/theme1.xml" + fontTableEntry <- entryFromArchive refArchive "word/fontTable.xml" + -- we use dist archive for settings.xml, because Word sometimes + -- adds references to footnotes or endnotes we don't have... + settingsEntry <- entryFromArchive distArchive "word/settings.xml" + webSettingsEntry <- entryFromArchive refArchive "word/webSettings.xml" + headerFooterEntries <- mapM (entryFromArchive refArchive) $ + mapMaybe (\e -> fmap ("word/" ++) $ extractTarget e) + (headers ++ footers) + let miscRelEntries = [ e | e <- zEntries refArchive + , "word/_rels/" `isPrefixOf` (eRelativePath e) + , ".xml.rels" `isSuffixOf` (eRelativePath e) + , eRelativePath e /= "word/_rels/document.xml.rels" + , eRelativePath e /= "word/_rels/footnotes.xml.rels" ] + let otherMediaEntries = [ e | e <- zEntries refArchive + , "word/media/" `isPrefixOf` eRelativePath e ] -- Create archive let archive = foldr addEntryToArchive emptyArchive $ contentTypesEntry : relsEntry : contentEntry : relEntry : footnoteRelEntry : numEntry : styleEntry : footnotesEntry : docPropsEntry : docPropsAppEntry : themeEntry : - fontTableEntry : webSettingsEntry : imageEntries + fontTableEntry : settingsEntry : webSettingsEntry : + imageEntries ++ headerFooterEntries ++ + miscRelEntries ++ otherMediaEntries return $ fromArchive archive styleToOpenXml :: Style -> [Element] @@ -300,29 +395,30 @@ styleToOpenXml style = parStyle : map toStyle alltoktypes $ backgroundColor style ) ] -mkNumbering :: M.Map ListMarker Int -> [ListMarker] -> IO Element -mkNumbering markers lists = do - elts <- mapM mkAbstractNum (M.toList markers) - return $ mknode "w:numbering" - [("xmlns:w","http://schemas.openxmlformats.org/wordprocessingml/2006/main")] - $ elts ++ zipWith (mkNum markers) lists [1..(length lists)] +-- this is the lowest number used for a list numId +baseListId :: Int +baseListId = 1000 -mkNum :: M.Map ListMarker Int -> ListMarker -> Int -> Element -mkNum markers marker numid = +mkNumbering :: [ListMarker] -> IO [Element] +mkNumbering lists = do + elts <- mapM mkAbstractNum (ordNub lists) + return $ elts ++ zipWith mkNum lists [baseListId..(baseListId + length lists - 1)] + +mkNum :: ListMarker -> Int -> Element +mkNum marker numid = mknode "w:num" [("w:numId",show numid)] - $ mknode "w:abstractNumId" [("w:val",show absnumid)] () + $ mknode "w:abstractNumId" [("w:val",listMarkerToId marker)] () : case marker of NoMarker -> [] BulletMarker -> [] NumberMarker _ _ start -> map (\lvl -> mknode "w:lvlOverride" [("w:ilvl",show (lvl :: Int))] $ mknode "w:startOverride" [("w:val",show start)] ()) [0..6] - where absnumid = maybe 0 id $ M.lookup marker markers -mkAbstractNum :: (ListMarker,Int) -> IO Element -mkAbstractNum (marker,numid) = do +mkAbstractNum :: ListMarker -> IO Element +mkAbstractNum marker = do nsid <- randomRIO (0x10000000 :: Integer, 0xFFFFFFFF :: Integer) - return $ mknode "w:abstractNum" [("w:abstractNumId",show numid)] + return $ mknode "w:abstractNum" [("w:abstractNumId",listMarkerToId marker)] $ mknode "w:nsid" [("w:val", printf "%8x" nsid)] () : mknode "w:multiLevelType" [("w:val","multilevel")] () : map (mkLvl marker) [0..6] @@ -374,10 +470,11 @@ mkLvl marker lvl = patternFor _ s = s ++ "." getNumId :: WS Int -getNumId = length `fmap` gets stLists +getNumId = ((999 +) . length) `fmap` gets stLists --- | Convert Pandoc document to two OpenXML elements (the main document and footnotes). -writeOpenXML :: WriterOptions -> Pandoc -> WS (Element, Element) +-- | Convert Pandoc document to two lists of +-- OpenXML elements (the main document and footnotes). +writeOpenXML :: WriterOptions -> Pandoc -> WS ([Element], [Element]) writeOpenXML opts (Pandoc meta blocks) = do let tit = docTitle meta ++ case lookupMeta "subtitle" meta of Just (MetaBlocks [Plain xs]) -> LineBreak : xs @@ -395,19 +492,7 @@ writeOpenXML opts (Pandoc meta blocks) = do doc' <- blocksToOpenXML opts blocks' notes' <- reverse `fmap` gets stFootnotes let meta' = title ++ authors ++ date - let stdAttributes = - [("xmlns:w","http://schemas.openxmlformats.org/wordprocessingml/2006/main") - ,("xmlns:m","http://schemas.openxmlformats.org/officeDocument/2006/math") - ,("xmlns:r","http://schemas.openxmlformats.org/officeDocument/2006/relationships") - ,("xmlns:o","urn:schemas-microsoft-com:office:office") - ,("xmlns:v","urn:schemas-microsoft-com:vml") - ,("xmlns:w10","urn:schemas-microsoft-com:office:word") - ,("xmlns:a","http://schemas.openxmlformats.org/drawingml/2006/main") - ,("xmlns:pic","http://schemas.openxmlformats.org/drawingml/2006/picture") - ,("xmlns:wp","http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing")] - let doc = mknode "w:document" stdAttributes $ mknode "w:body" [] (meta' ++ doc') - let notes = mknode "w:footnotes" stdAttributes notes' - return (doc, notes) + return (meta' ++ doc', notes') -- | Convert a list of Pandoc blocks to OpenXML. blocksToOpenXML :: WriterOptions -> [Block] -> WS [Element] @@ -427,6 +512,7 @@ getUniqueId = liftIO $ (show . (+ 20) . hashUnique) `fmap` newUnique -- | Convert a Pandoc block element to OpenXML. blockToOpenXML :: WriterOptions -> Block -> WS [Element] blockToOpenXML _ Null = return [] +blockToOpenXML opts (Div _ bs) = blocksToOpenXML opts bs blockToOpenXML opts (Header lev (ident,_,_) lst) = do contents <- withParaProp (pStyle $ "Heading" ++ show lev) $ blockToOpenXML opts (Para lst) @@ -458,8 +544,8 @@ blockToOpenXML opts (Para lst) = do contents <- inlinesToOpenXML opts lst return [mknode "w:p" [] (paraProps ++ contents)] blockToOpenXML _ (RawBlock format str) - | format == "openxml" = return [ x | Elem x <- parseXML str ] - | otherwise = return [] + | format == Format "openxml" = return [ x | Elem x <- parseXML str ] + | otherwise = return [] blockToOpenXML opts (BlockQuote blocks) = withParaProp (pStyle "BlockQuote") $ blocksToOpenXML opts blocks blockToOpenXML opts (CodeBlock attrs str) = @@ -485,10 +571,12 @@ blockToOpenXML opts (Table caption aligns widths headers rows) = do [ mknode "w:tcBorders" [] $ mknode "w:bottom" [("w:val","single")] () , mknode "w:vAlign" [("w:val","bottom")] () ] + let emptyCell = [mknode "w:p" [] [mknode "w:pPr" [] $ + [mknode "w:pStyle" [("w:val","Compact")] ()]]] let mkcell border contents = mknode "w:tc" [] $ [ borderProps | border ] ++ if null contents - then [mknode "w:p" [] ()] + then emptyCell else contents let mkrow border cells = mknode "w:tr" [] $ map (mkcell border) cells let textwidth = 7920 -- 5.5 in in twips, 1/20 pt @@ -533,17 +621,13 @@ addList :: ListMarker -> WS () addList marker = do lists <- gets stLists modify $ \st -> st{ stLists = lists ++ [marker] } - numStyles <- gets stNumStyles - case M.lookup marker numStyles of - Just _ -> return () - Nothing -> modify $ \st -> - st{ stNumStyles = M.insert marker (M.size numStyles + 1) numStyles } listItemToOpenXML :: WriterOptions -> Int -> [Block] -> WS [Element] listItemToOpenXML _ _ [] = return [] listItemToOpenXML opts numid (first:rest) = do first' <- withNumId numid $ blockToOpenXML opts first - rest' <- withNumId 1 $ blocksToOpenXML opts rest + -- baseListId is the code for no list marker: + rest' <- withNumId baseListId $ blocksToOpenXML opts rest return $ first' ++ rest' alignmentToString :: Alignment -> [Char] @@ -632,6 +716,12 @@ formattedString str = do inlineToOpenXML :: WriterOptions -> Inline -> WS [Element] inlineToOpenXML _ (Str str) = formattedString str inlineToOpenXML opts Space = inlineToOpenXML opts (Str " ") +inlineToOpenXML opts (Span (_,classes,_) ils) = do + let off x = withTextProp (mknode x [("w:val","0")] ()) + ((if "csl-no-emph" `elem` classes then off "w:i" else id) . + (if "csl-no-strong" `elem` classes then off "w:b" else id) . + (if "csl-no-smallcaps" `elem` classes then off "w:smallCaps" else id)) + $ inlinesToOpenXML opts ils inlineToOpenXML opts (Strong lst) = withTextProp (mknode "w:b" [] ()) $ inlinesToOpenXML opts lst inlineToOpenXML opts (Emph lst) = @@ -650,8 +740,8 @@ inlineToOpenXML opts (Strikeout lst) = $ inlinesToOpenXML opts lst inlineToOpenXML _ LineBreak = return [br] inlineToOpenXML _ (RawInline f str) - | f == "openxml" = return [ x | Elem x <- parseXML str ] - | otherwise = return [] + | f == Format "openxml" = return [ x | Elem x <- parseXML str ] + | otherwise = return [] inlineToOpenXML opts (Quoted quoteType lst) = inlinesToOpenXML opts $ [Str open] ++ lst ++ [Str close] where (open, close) = case quoteType of @@ -663,15 +753,18 @@ inlineToOpenXML opts (Math mathType str) = do else DisplayInline case texMathToOMML displayType str of Right r -> return [r] - Left _ -> inlinesToOpenXML opts (readTeXMath str) + Left _ -> inlinesToOpenXML opts (readTeXMath' mathType str) inlineToOpenXML opts (Cite _ lst) = inlinesToOpenXML opts lst -inlineToOpenXML _ (Code attrs str) = +inlineToOpenXML opts (Code attrs str) = withTextProp (rStyle "VerbatimChar") - $ case highlight formatOpenXML attrs str of - Nothing -> intercalate [br] - `fmap` (mapM formattedString $ lines str) - Just h -> return h - where formatOpenXML _fmtOpts = intercalate [br] . map (map toHlTok) + $ if writerHighlight opts + then case highlight formatOpenXML attrs str of + Nothing -> unhighlighted + Just h -> return h + else unhighlighted + where unhighlighted = intercalate [br] `fmap` + (mapM formattedString $ lines str) + formatOpenXML _fmtOpts = intercalate [br] . map (map toHlTok) toHlTok (toktype,tok) = mknode "w:r" [] [ mknode "w:rPr" [] [ rStyle $ show toktype ] @@ -682,7 +775,7 @@ inlineToOpenXML opts (Note bs) = do let notemarker = mknode "w:r" [] [ mknode "w:rPr" [] (rStyle "FootnoteRef") , mknode "w:footnoteRef" [] () ] - let notemarkerXml = RawInline "openxml" $ ppElement notemarker + let notemarkerXml = RawInline (Format "openxml") $ ppElement notemarker let insertNoteRef (Plain ils : xs) = Plain (notemarkerXml : ils) : xs insertNoteRef (Para ils : xs) = Para (notemarkerXml : ils) : xs insertNoteRef xs = Para [notemarkerXml] : xs @@ -721,14 +814,13 @@ inlineToOpenXML opts (Image alt (src, tit)) = do case M.lookup src imgs of Just (_,_,_,elt,_) -> return [elt] Nothing -> do - let sourceDir = writerSourceDirectory opts - res <- liftIO $ E.try $ fetchItem sourceDir src + res <- liftIO $ fetchItem (writerSourceURL opts) src case res of Left (_ :: E.SomeException) -> do liftIO $ warn $ "Could not find image `" ++ src ++ "', skipping..." -- emit alt text inlinesToOpenXML opts alt - Right (img, _) -> do + Right (img, mt) -> do ident <- ("rId"++) `fmap` getUniqueId let size = imageSize img let (xpt,ypt) = maybe (120,120) sizeInPoints size @@ -767,18 +859,21 @@ inlineToOpenXML opts (Image alt (src, tit)) = do , mknode "wp:effectExtent" [("b","0"),("l","0"),("r","0"),("t","0")] () , mknode "wp:docPr" [("descr",tit),("id","1"),("name","Picture")] () , graphic ] - let imgext = case imageType img of - Just Png -> ".png" - Just Jpeg -> ".jpeg" - Just Gif -> ".gif" - Just Pdf -> ".pdf" - Nothing -> takeExtension src + let imgext = case mt >>= extensionFromMimeType of + Just x -> '.':x + Nothing -> case imageType img of + Just Png -> ".png" + Just Jpeg -> ".jpeg" + Just Gif -> ".gif" + Just Pdf -> ".pdf" + Just Eps -> ".eps" + Nothing -> "" if null imgext then -- without an extension there is no rule for content type inlinesToOpenXML opts alt -- return alt to avoid corrupted docx else do let imgpath = "media/" ++ ident ++ imgext - let mbMimeType = getMimeType imgpath + let mbMimeType = mt <|> getMimeType imgpath -- insert mime type to use in constructing [Content_Types].xml modify $ \st -> st{ stImages = M.insert src (ident, imgpath, mbMimeType, imgElt, img) @@ -788,32 +883,10 @@ inlineToOpenXML opts (Image alt (src, tit)) = do br :: Element br = mknode "w:r" [] [mknode "w:br" [("w:type","textWrapping")] () ] -parseXml :: Archive -> String -> IO Element -parseXml refArchive relpath = - case (findEntryByPath relpath refArchive >>= parseXMLDoc . UTF8.toStringLazy . fromEntry) of - Just d -> return d - Nothing -> fail $ relpath ++ " missing in reference docx" - -isDisplayMath :: Inline -> Bool -isDisplayMath (Math DisplayMath _) = True -isDisplayMath _ = False - -stripLeadingTrailingSpace :: [Inline] -> [Inline] -stripLeadingTrailingSpace = go . reverse . go . reverse - where go (Space:xs) = xs - go xs = xs - -fixDisplayMath :: Block -> [Block] -fixDisplayMath (Plain lst) - | any isDisplayMath lst && not (all isDisplayMath lst) = - -- chop into several paragraphs so each displaymath is its own - map (Plain . stripLeadingTrailingSpace) $ - groupBy (\x y -> (isDisplayMath x && isDisplayMath y) || - not (isDisplayMath x || isDisplayMath y)) lst -fixDisplayMath (Para lst) - | any isDisplayMath lst && not (all isDisplayMath lst) = - -- chop into several paragraphs so each displaymath is its own - map (Para . stripLeadingTrailingSpace) $ - groupBy (\x y -> (isDisplayMath x && isDisplayMath y) || - not (isDisplayMath x || isDisplayMath y)) lst -fixDisplayMath x = [x] +parseXml :: Archive -> Archive -> String -> IO Element +parseXml refArchive distArchive relpath = + case ((findEntryByPath relpath refArchive `mplus` + findEntryByPath relpath distArchive) + >>= parseXMLDoc . UTF8.toStringLazy . fromEntry) of + Just d -> return d + Nothing -> fail $ relpath ++ " corrupt or missing in reference docx" diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index f171a2560..b6687c330 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -1,6 +1,6 @@ -{-# LANGUAGE PatternGuards #-} +{-# LANGUAGE PatternGuards, CPP, ScopedTypeVariables #-} {- -Copyright (C) 2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2010-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.EPUB - Copyright : Copyright (C) 2010 John MacFarlane + Copyright : Copyright (C) 2010-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -30,16 +30,18 @@ Conversion of 'Pandoc' documents to EPUB. -} module Text.Pandoc.Writers.EPUB ( writeEPUB ) where import Data.IORef -import Data.Maybe ( fromMaybe, isNothing ) +import qualified Data.Map as M +import Data.Maybe ( fromMaybe ) import Data.List ( isInfixOf, intercalate ) import System.Environment ( getEnv ) import Text.Printf (printf) -import System.FilePath ( (</>), takeBaseName, takeExtension, takeFileName ) +import System.FilePath ( (</>), takeExtension, takeFileName ) import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Char8 as B8 -import Text.Pandoc.UTF8 ( fromStringLazy, toString ) +import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.SelfContained ( makeSelfContained ) import Codec.Archive.Zip +import Control.Applicative ((<$>)) import Data.Time.Clock.POSIX import Data.Time import System.Locale @@ -48,25 +50,18 @@ import qualified Text.Pandoc.Shared as Shared import Text.Pandoc.Builder (fromList, setMeta) import Text.Pandoc.Options import Text.Pandoc.Definition -import Text.Pandoc.Generic +import Text.Pandoc.Walk import Control.Monad.State import Text.XML.Light hiding (ppTopElement) import Text.Pandoc.UUID import Text.Pandoc.Writers.HTML import Text.Pandoc.Writers.Markdown ( writePlain ) -import Data.Char ( toLower ) -import Network.URI ( isAbsoluteURI, unEscapeString ) +import Data.Char ( toLower, isDigit, isAlphaNum ) +import Network.URI ( unEscapeString ) import Text.Pandoc.MIME (getMimeType) -#if MIN_VERSION_base(4,6,0) -#else -import Prelude hiding (catch) -#endif -import Control.Exception (catch, SomeException) -#if MIN_VERSION_blaze_html(0,5,0) +import qualified Control.Exception as E import Text.Blaze.Html.Renderer.Utf8 (renderHtml) -#else -import Text.Blaze.Renderer.Utf8 (renderHtml) -#endif +import Text.HTML.TagSoup -- A Chapter includes a list of blocks and maybe a section -- number offset. Note, some chapters are unnumbered. The section @@ -74,12 +69,260 @@ import Text.Blaze.Renderer.Utf8 (renderHtml) -- in filenames, chapter0003.xhtml. data Chapter = Chapter (Maybe [Int]) [Block] +data EPUBMetadata = EPUBMetadata{ + epubIdentifier :: [Identifier] + , epubTitle :: [Title] + , epubDate :: [Date] + , epubLanguage :: String + , epubCreator :: [Creator] + , epubContributor :: [Creator] + , epubSubject :: [String] + , epubDescription :: Maybe String + , epubType :: Maybe String + , epubFormat :: Maybe String + , epubPublisher :: Maybe String + , epubSource :: Maybe String + , epubRelation :: Maybe String + , epubCoverage :: Maybe String + , epubRights :: Maybe String + , epubCoverImage :: Maybe String + , epubStylesheet :: Maybe Stylesheet + } deriving Show + +data Stylesheet = StylesheetPath FilePath + | StylesheetContents String + deriving Show + +data Date = Date{ + dateText :: String + , dateEvent :: Maybe String + } deriving Show + +data Creator = Creator{ + creatorText :: String + , creatorRole :: Maybe String + , creatorFileAs :: Maybe String + } deriving Show + +data Identifier = Identifier{ + identifierText :: String + , identifierScheme :: Maybe String + } deriving Show + +data Title = Title{ + titleText :: String + , titleFileAs :: Maybe String + , titleType :: Maybe String + } deriving Show + +dcName :: String -> QName +dcName n = QName n Nothing (Just "dc") + +dcNode :: Node t => String -> t -> Element +dcNode = node . dcName + +opfName :: String -> QName +opfName n = QName n Nothing (Just "opf") + +plainify :: [Inline] -> String +plainify t = + trimr $ writePlain def{ writerStandalone = False } + $ Pandoc nullMeta [Plain $ walk removeNote t] + +removeNote :: Inline -> Inline +removeNote (Note _) = Str "" +removeNote x = x + +toId :: FilePath -> String +toId = map (\x -> if isAlphaNum x || x == '-' || x == '_' + then x + else '_') . takeFileName + +getEPUBMetadata :: WriterOptions -> Meta -> IO EPUBMetadata +getEPUBMetadata opts meta = do + let md = metadataFromMeta opts meta + let elts = onlyElems $ parseXML $ writerEpubMetadata opts + let md' = foldr addMetadataFromXML md elts + let addIdentifier m = + if null (epubIdentifier m) + then do + randomId <- fmap show getRandomUUID + return $ m{ epubIdentifier = [Identifier randomId Nothing] } + else return m + let addLanguage m = + if null (epubLanguage m) + then case lookup "lang" (writerVariables opts) of + Just x -> return m{ epubLanguage = x } + Nothing -> do + localeLang <- E.catch (liftM + (map (\c -> if c == '_' then '-' else c) . + takeWhile (/='.')) $ getEnv "LANG") + (\e -> let _ = (e :: E.SomeException) in return "en-US") + return m{ epubLanguage = localeLang } + else return m + let fixDate m = + if null (epubDate m) + then do + currentTime <- getCurrentTime + return $ m{ epubDate = [ Date{ + dateText = showDateTimeISO8601 currentTime + , dateEvent = Nothing } ] } + else return m + let addAuthor m = + if any (\c -> creatorRole c == Just "aut") $ epubCreator m + then return m + else do + let authors' = map plainify $ docAuthors meta + let toAuthor name = Creator{ creatorText = name + , creatorRole = Just "aut" + , creatorFileAs = Nothing } + return $ m{ epubCreator = map toAuthor authors' ++ epubCreator m } + addIdentifier md' >>= fixDate >>= addAuthor >>= addLanguage + +addMetadataFromXML :: Element -> EPUBMetadata -> EPUBMetadata +addMetadataFromXML e@(Element (QName name _ (Just "dc")) attrs _ _) md + | name == "identifier" = md{ epubIdentifier = + Identifier{ identifierText = strContent e + , identifierScheme = lookupAttr (opfName "scheme") attrs + } : epubIdentifier md } + | name == "title" = md{ epubTitle = + Title{ titleText = strContent e + , titleFileAs = getAttr "file-as" + , titleType = getAttr "type" + } : epubTitle md } + | name == "date" = md{ epubDate = + Date{ dateText = fromMaybe "" $ normalizeDate' $ strContent e + , dateEvent = getAttr "event" + } : epubDate md } + | name == "language" = md{ epubLanguage = strContent e } + | name == "creator" = md{ epubCreator = + Creator{ creatorText = strContent e + , creatorRole = getAttr "role" + , creatorFileAs = getAttr "file-as" + } : epubCreator md } + | name == "contributor" = md{ epubContributor = + Creator { creatorText = strContent e + , creatorRole = getAttr "role" + , creatorFileAs = getAttr "file-as" + } : epubContributor md } + | name == "subject" = md{ epubSubject = strContent e : epubSubject md } + | name == "description" = md { epubDescription = Just $ strContent e } + | name == "type" = md { epubType = Just $ strContent e } + | name == "format" = md { epubFormat = Just $ strContent e } + | name == "type" = md { epubType = Just $ strContent e } + | name == "publisher" = md { epubPublisher = Just $ strContent e } + | name == "source" = md { epubSource = Just $ strContent e } + | name == "relation" = md { epubRelation = Just $ strContent e } + | name == "coverage" = md { epubCoverage = Just $ strContent e } + | name == "rights" = md { epubRights = Just $ strContent e } + | otherwise = md + where getAttr n = lookupAttr (opfName n) attrs +addMetadataFromXML _ md = md + +metaValueToString :: MetaValue -> String +metaValueToString (MetaString s) = s +metaValueToString (MetaInlines ils) = plainify ils +metaValueToString (MetaBlocks bs) = plainify $ query (:[]) bs +metaValueToString (MetaBool b) = show b +metaValueToString _ = "" + +getList :: String -> Meta -> (MetaValue -> a) -> [a] +getList s meta handleMetaValue = + case lookupMeta s meta of + Just (MetaList xs) -> map handleMetaValue xs + Just mv -> [handleMetaValue mv] + Nothing -> [] + +getIdentifier :: Meta -> [Identifier] +getIdentifier meta = getList "identifier" meta handleMetaValue + where handleMetaValue (MetaMap m) = + Identifier{ identifierText = maybe "" metaValueToString + $ M.lookup "text" m + , identifierScheme = metaValueToString <$> + M.lookup "scheme" m } + handleMetaValue mv = Identifier (metaValueToString mv) Nothing + +getTitle :: Meta -> [Title] +getTitle meta = getList "title" meta handleMetaValue + where handleMetaValue (MetaMap m) = + Title{ titleText = maybe "" metaValueToString $ M.lookup "text" m + , titleFileAs = metaValueToString <$> M.lookup "file-as" m + , titleType = metaValueToString <$> M.lookup "type" m } + handleMetaValue mv = Title (metaValueToString mv) Nothing Nothing + +getCreator :: String -> Meta -> [Creator] +getCreator s meta = getList s meta handleMetaValue + where handleMetaValue (MetaMap m) = + Creator{ creatorText = maybe "" metaValueToString $ M.lookup "text" m + , creatorFileAs = metaValueToString <$> M.lookup "file-as" m + , creatorRole = metaValueToString <$> M.lookup "role" m } + handleMetaValue mv = Creator (metaValueToString mv) Nothing Nothing + +getDate :: String -> Meta -> [Date] +getDate s meta = getList s meta handleMetaValue + where handleMetaValue (MetaMap m) = + Date{ dateText = maybe "" id $ + M.lookup "text" m >>= normalizeDate' . metaValueToString + , dateEvent = metaValueToString <$> M.lookup "event" m } + handleMetaValue mv = Date { dateText = maybe "" + id $ normalizeDate' $ metaValueToString mv + , dateEvent = Nothing } + +simpleList :: String -> Meta -> [String] +simpleList s meta = + case lookupMeta s meta of + Just (MetaList xs) -> map metaValueToString xs + Just x -> [metaValueToString x] + Nothing -> [] + +metadataFromMeta :: WriterOptions -> Meta -> EPUBMetadata +metadataFromMeta opts meta = EPUBMetadata{ + epubIdentifier = identifiers + , epubTitle = titles + , epubDate = date + , epubLanguage = language + , epubCreator = creators + , epubContributor = contributors + , epubSubject = subjects + , epubDescription = description + , epubType = epubtype + , epubFormat = format + , epubPublisher = publisher + , epubSource = source + , epubRelation = relation + , epubCoverage = coverage + , epubRights = rights + , epubCoverImage = coverImage + , epubStylesheet = stylesheet + } + where identifiers = getIdentifier meta + titles = getTitle meta + date = getDate "date" meta + language = maybe "" metaValueToString $ + lookupMeta "language" meta `mplus` lookupMeta "lang" meta + creators = getCreator "creator" meta + contributors = getCreator "contributor" meta + subjects = simpleList "subject" meta + description = metaValueToString <$> lookupMeta "description" meta + epubtype = metaValueToString <$> lookupMeta "type" meta + format = metaValueToString <$> lookupMeta "format" meta + publisher = metaValueToString <$> lookupMeta "publisher" meta + source = metaValueToString <$> lookupMeta "source" meta + relation = metaValueToString <$> lookupMeta "relation" meta + coverage = metaValueToString <$> lookupMeta "coverage" meta + rights = metaValueToString <$> lookupMeta "rights" meta + coverImage = lookup "epub-cover-image" (writerVariables opts) `mplus` + (metaValueToString <$> lookupMeta "cover-image" meta) + stylesheet = (StylesheetContents <$> writerEpubStylesheet opts) `mplus` + ((StylesheetPath . metaValueToString) <$> + lookupMeta "stylesheet" meta) + -- | Produce an EPUB file from a Pandoc document. writeEPUB :: WriterOptions -- ^ Writer options -> Pandoc -- ^ Document to convert -> IO B.ByteString writeEPUB opts doc@(Pandoc meta _) = do - let version = maybe EPUB2 id (writerEpubVersion opts) + let version = fromMaybe EPUB2 (writerEpubVersion opts) let epub3 = version == EPUB3 epochtime <- floor `fmap` getPOSIXTime let mkEntry path content = toEntry path epochtime content @@ -97,17 +340,16 @@ writeEPUB opts doc@(Pandoc meta _) = do then MathML Nothing else writerHTMLMathMethod opts , writerWrapText = False } - let sourceDir = writerSourceDirectory opts' - let mbCoverImage = lookup "epub-cover-image" vars + metadata <- getEPUBMetadata opts' meta -- cover page (cpgEntry, cpicEntry) <- - case mbCoverImage of + case epubCoverImage metadata of Nothing -> return ([],[]) Just img -> do - let coverImage = "cover-image" ++ takeExtension img + let coverImage = "media/" ++ takeFileName img let cpContent = renderHtml $ writeHtml opts' - (Pandoc meta [RawBlock "html" $ "<div id=\"cover-image\">\n<img src=\"" ++ coverImage ++ "\" alt=\"cover image\" />\n</div>"]) + (Pandoc meta [RawBlock (Format "html") $ "<div id=\"cover-image\">\n<img src=\"" ++ coverImage ++ "\" alt=\"cover image\" />\n</div>"]) imgContent <- B.readFile img return ( [mkEntry "cover.xhtml" cpContent] , [mkEntry coverImage imgContent] ) @@ -119,14 +361,19 @@ writeEPUB opts doc@(Pandoc meta _) = do let tpEntry = mkEntry "title_page.xhtml" tpContent -- handle pictures - picsRef <- newIORef [] - Pandoc _ blocks <- bottomUpM - (transformInline opts' sourceDir picsRef) doc - pics <- readIORef picsRef - let readPicEntry (oldsrc, newsrc) = do - (img,_) <- fetchItem sourceDir oldsrc - return $ toEntry newsrc epochtime $ B.fromChunks . (:[]) $ img - picEntries <- mapM readPicEntry pics + mediaRef <- newIORef [] + Pandoc _ blocks <- walkM (transformInline opts' mediaRef) doc >>= + walkM (transformBlock opts' mediaRef) + pics <- readIORef mediaRef + let readPicEntry entries (oldsrc, newsrc) = do + res <- fetchItem (writerSourceURL opts') oldsrc + case res of + Left _ -> do + warn $ "Could not find media `" ++ oldsrc ++ "', skipping..." + return entries + Right (img,_) -> return $ + (toEntry newsrc epochtime $ B.fromChunks . (:[]) $ img) : entries + picEntries <- foldM readPicEntry [] pics -- handle fonts let mkFontEntry f = mkEntry (takeFileName f) `fmap` B.readFile f @@ -179,59 +426,60 @@ writeEPUB opts doc@(Pandoc meta _) = do chapToEntry num (Chapter mbnum bs) = mkEntry (showChapter num) $ renderHtml $ writeHtml opts'{ writerNumberOffset = - maybe [] id mbnum } + fromMaybe [] mbnum } $ case bs of (Header _ _ xs : _) -> - Pandoc (setMeta "title" (fromList xs) nullMeta) bs + -- remove notes or we get doubled footnotes + Pandoc (setMeta "title" (walk removeNote $ fromList xs) + nullMeta) bs _ -> Pandoc nullMeta bs let chapterEntries = zipWith chapToEntry [1..] chapters -- incredibly inefficient (TODO): - let containsMathML ent = "<math" `isInfixOf` (B8.unpack $ fromEntry ent) + let containsMathML ent = epub3 && + "<math" `isInfixOf` (B8.unpack $ fromEntry ent) + let containsSVG ent = epub3 && + "<svg" `isInfixOf` (B8.unpack $ fromEntry ent) + let props ent = ["mathml" | containsMathML ent] ++ ["svg" | containsSVG ent] -- contents.opf - localeLang <- catch (liftM (map (\c -> if c == '_' then '-' else c) . - takeWhile (/='.')) $ getEnv "LANG") - (\e -> let _ = (e :: SomeException) in return "en-US") - let lang = case lookup "lang" (writerVariables opts') of - Just x -> x - Nothing -> localeLang - uuid <- getRandomUUID let chapterNode ent = unode "item" ! - ([("id", takeBaseName $ eRelativePath ent), + ([("id", toId $ eRelativePath ent), ("href", eRelativePath ent), ("media-type", "application/xhtml+xml")] - ++ [("properties","mathml") | epub3 && - containsMathML ent]) $ () + ++ case props ent of + [] -> [] + xs -> [("properties", unwords xs)]) + $ () let chapterRefNode ent = unode "itemref" ! - [("idref", takeBaseName $ eRelativePath ent)] $ () + [("idref", toId $ eRelativePath ent)] $ () let pictureNode ent = unode "item" ! - [("id", takeBaseName $ eRelativePath ent), + [("id", toId $ eRelativePath ent), ("href", eRelativePath ent), ("media-type", fromMaybe "application/octet-stream" - $ imageTypeOf $ eRelativePath ent)] $ () + $ mediaTypeOf $ eRelativePath ent)] $ () let fontNode ent = unode "item" ! - [("id", takeBaseName $ eRelativePath ent), + [("id", toId $ eRelativePath ent), ("href", eRelativePath ent), - ("media-type", maybe "" id $ getMimeType $ eRelativePath ent)] $ () - let plainify t = trimr $ - writePlain opts'{ writerStandalone = False } $ - Pandoc meta [Plain t] - let plainTitle = plainify $ docTitle meta - let plainAuthors = map plainify $ docAuthors meta + ("media-type", fromMaybe "" $ getMimeType $ eRelativePath ent)] $ () + let plainTitle = case docTitle meta of + [] -> case epubTitle metadata of + [] -> "UNTITLED" + (x:_) -> titleText x + x -> plainify x + let uuid = case epubIdentifier metadata of + (x:_) -> identifierText x -- use first identifier as UUID + [] -> error "epubIdentifier is null" -- shouldn't happen currentTime <- getCurrentTime - let plainDate = maybe (showDateTimeISO8601 currentTime) id - $ normalizeDate $ stringify $ docDate meta - let contentsData = fromStringLazy $ ppTopElement $ + let contentsData = UTF8.fromStringLazy $ ppTopElement $ unode "package" ! [("version", case version of EPUB2 -> "2.0" EPUB3 -> "3.0") ,("xmlns","http://www.idpf.org/2007/opf") - ,("unique-identifier","BookId")] $ - [ metadataElement version (writerEpubMetadata opts') - uuid lang plainTitle plainAuthors plainDate currentTime mbCoverImage + ,("unique-identifier","epub-id-1")] $ + [ metadataElement version metadata currentTime , unode "manifest" $ [ unode "item" ! [("id","ncx"), ("href","toc.ncx") ,("media-type","application/x-dtbncx+xml")] $ () @@ -243,14 +491,19 @@ writeEPUB opts doc@(Pandoc meta _) = do [("properties","nav") | epub3 ]) $ () ] ++ map chapterNode (cpgEntry ++ (tpEntry : chapterEntries)) ++ - map pictureNode (cpicEntry ++ picEntries) ++ + (case cpicEntry of + [] -> [] + (x:_) -> [add_attrs + [Attr (unqual "properties") "cover-image" | epub3] + (pictureNode x)]) ++ + map pictureNode picEntries ++ map fontNode fontEntries , unode "spine" ! [("toc","ncx")] $ - case mbCoverImage of + case epubCoverImage metadata of Nothing -> [] Just _ -> [ unode "itemref" ! - [("idref", "cover"),("linear","no")] $ () ] - ++ ((unode "itemref" ! [("idref", "title_page") + [("idref", "cover_xhtml"),("linear","no")] $ () ] + ++ ((unode "itemref" ! [("idref", "title_page_xhtml") ,("linear", if null (docTitle meta) then "no" else "yes")] $ ()) : @@ -260,8 +513,13 @@ writeEPUB opts doc@(Pandoc meta _) = do else "no")] $ ()) : map chapterRefNode chapterEntries) , unode "guide" $ - unode "reference" ! - [("type","toc"),("title",plainTitle),("href","nav.xhtml")] $ () + [ unode "reference" ! + [("type","toc"),("title",plainTitle), + ("href","nav.xhtml")] $ () + ] ++ + [ unode "reference" ! + [("type","cover"),("title","Cover"),("href","cover.xhtml")] $ () | epubCoverImage metadata /= Nothing + ] ] let contentsEntry = mkEntry "content.opf" contentsData @@ -303,22 +561,22 @@ writeEPUB opts doc@(Pandoc meta _) = do [ unode "navLabel" $ unode "text" (plainify $ docTitle meta) , unode "content" ! [("src","title_page.xhtml")] $ () ] - let tocData = fromStringLazy $ ppTopElement $ + let tocData = UTF8.fromStringLazy $ ppTopElement $ unode "ncx" ! [("version","2005-1") ,("xmlns","http://www.daisy.org/z3986/2005/ncx/")] $ [ unode "head" $ [ unode "meta" ! [("name","dtb:uid") - ,("content", show uuid)] $ () + ,("content", uuid)] $ () , unode "meta" ! [("name","dtb:depth") ,("content", "1")] $ () , unode "meta" ! [("name","dtb:totalPageCount") ,("content", "0")] $ () , unode "meta" ! [("name","dtb:maxPageNumber") ,("content", "0")] $ () - ] ++ case mbCoverImage of + ] ++ case epubCoverImage metadata of Nothing -> [] - Just _ -> [unode "meta" ! [("name","cover"), - ("content","cover-image")] $ ()] + Just img -> [unode "meta" ! [("name","cover"), + ("content", toId img)] $ ()] , unode "docTitle" $ unode "text" $ plainTitle , unode "navMap" $ tpNode : evalState (mapM (navPointNode navMapFormatter) secs) 1 @@ -335,7 +593,7 @@ writeEPUB opts doc@(Pandoc meta _) = do (_:_) -> [unode "ol" ! [("class","toc")] $ subs] let navtag = if epub3 then "nav" else "div" - let navData = fromStringLazy $ ppTopElement $ + let navData = UTF8.fromStringLazy $ ppTopElement $ unode "html" ! [("xmlns","http://www.w3.org/1999/xhtml") ,("xmlns:epub","http://www.idpf.org/2007/ops")] $ [ unode "head" $ @@ -349,10 +607,10 @@ writeEPUB opts doc@(Pandoc meta _) = do let navEntry = mkEntry "nav.xhtml" navData -- mimetype - let mimetypeEntry = mkEntry "mimetype" $ fromStringLazy "application/epub+zip" + let mimetypeEntry = mkEntry "mimetype" $ UTF8.fromStringLazy "application/epub+zip" -- container.xml - let containerData = fromStringLazy $ ppTopElement $ + let containerData = UTF8.fromStringLazy $ ppTopElement $ unode "container" ! [("version","1.0") ,("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")] $ unode "rootfiles" $ @@ -361,18 +619,19 @@ writeEPUB opts doc@(Pandoc meta _) = do let containerEntry = mkEntry "META-INF/container.xml" containerData -- com.apple.ibooks.display-options.xml - let apple = fromStringLazy $ ppTopElement $ + let apple = UTF8.fromStringLazy $ ppTopElement $ unode "display_options" $ unode "platform" ! [("name","*")] $ unode "option" ! [("name","specified-fonts")] $ "true" let appleEntry = mkEntry "META-INF/com.apple.ibooks.display-options.xml" apple -- stylesheet - stylesheet <- case writerEpubStylesheet opts of - Just s -> return s - Nothing -> toString `fmap` + stylesheet <- case epubStylesheet metadata of + Just (StylesheetPath fp) -> UTF8.readFile fp + Just (StylesheetContents s) -> return s + Nothing -> UTF8.toString `fmap` readDataFile (writerUserDataDir opts) "epub.css" - let stylesheetEntry = mkEntry "stylesheet.css" $ fromStringLazy stylesheet + let stylesheetEntry = mkEntry "stylesheet.css" $ UTF8.fromStringLazy stylesheet -- construct archive let archive = foldr addEntryToArchive emptyArchive @@ -381,64 +640,167 @@ writeEPUB opts doc@(Pandoc meta _) = do (picEntries ++ cpicEntry ++ cpgEntry ++ chapterEntries ++ fontEntries)) return $ fromArchive archive -metadataElement :: EPUBVersion -> String -> UUID -> String -> String -> [String] - -> String -> UTCTime -> Maybe a -> Element -metadataElement version metadataXML uuid lang title authors date currentTime mbCoverImage = - let userNodes = parseXML metadataXML - elt = unode "metadata" ! [("xmlns:dc","http://purl.org/dc/elements/1.1/") - ,("xmlns:opf","http://www.idpf.org/2007/opf")] $ - filter isMetadataElement $ onlyElems userNodes - dublinElements = ["contributor","coverage","creator","date", - "description","format","identifier","language","publisher", - "relation","rights","source","subject","title","type"] - isMetadataElement e = (qPrefix (elName e) == Just "dc" && - qName (elName e) `elem` dublinElements) || - (qPrefix (elName e) == Nothing && - qName (elName e) `elem` ["link","meta"]) - contains e n = not (null (findElements (QName n Nothing (Just "dc")) e)) - newNodes = [ unode "dc:title" title | not (elt `contains` "title") ] ++ - [ unode "dc:language" lang | not (elt `contains` "language") ] ++ - [ unode "dc:identifier" ! [("id","BookId")] $ show uuid | - not (elt `contains` "identifier") ] ++ - [ unode "dc:creator" ! [("opf:role","aut") | version == EPUB2] - $ a | a <- authors, not (elt `contains` "creator") ] ++ - [ unode "dc:date" date | not (elt `contains` "date") ] ++ - [ unode "meta" ! [("property", "dcterms:modified")] $ - (showDateTimeISO8601 currentTime) | version == EPUB3] ++ - [ unode "meta" ! [("name","cover"), ("content","cover-image")] $ () | - not (isNothing mbCoverImage) ] - in elt{ elContent = elContent elt ++ map Elem newNodes } +metadataElement :: EPUBVersion -> EPUBMetadata -> UTCTime -> Element +metadataElement version md currentTime = + unode "metadata" ! [("xmlns:dc","http://purl.org/dc/elements/1.1/") + ,("xmlns:opf","http://www.idpf.org/2007/opf")] $ mdNodes + where mdNodes = identifierNodes ++ titleNodes ++ dateNodes ++ languageNodes + ++ creatorNodes ++ contributorNodes ++ subjectNodes + ++ descriptionNodes ++ typeNodes ++ formatNodes + ++ publisherNodes ++ sourceNodes ++ relationNodes + ++ coverageNodes ++ rightsNodes ++ coverImageNodes + ++ modifiedNodes + withIds base f = concat . zipWith f (map (\x -> base ++ ('-' : show x)) + ([1..] :: [Int])) + identifierNodes = withIds "epub-id" toIdentifierNode $ + epubIdentifier md + titleNodes = withIds "epub-title" toTitleNode $ epubTitle md + dateNodes = if version == EPUB2 + then withIds "epub-date" toDateNode $ epubDate md + else -- epub3 allows only one dc:date + -- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-opf-dcdate + case epubDate md of + [] -> [] + (x:_) -> [dcNode "date" ! [("id","epub-date")] + $ dateText x] + languageNodes = [dcTag "language" $ epubLanguage md] + creatorNodes = withIds "epub-creator" (toCreatorNode "creator") $ + epubCreator md + contributorNodes = withIds "epub-contributor" + (toCreatorNode "contributor") $ epubContributor md + subjectNodes = map (dcTag "subject") $ epubSubject md + descriptionNodes = maybe [] (dcTag' "description") $ epubDescription md + typeNodes = maybe [] (dcTag' "type") $ epubType md + formatNodes = maybe [] (dcTag' "format") $ epubFormat md + publisherNodes = maybe [] (dcTag' "publisher") $ epubPublisher md + sourceNodes = maybe [] (dcTag' "source") $ epubSource md + relationNodes = maybe [] (dcTag' "relation") $ epubRelation md + coverageNodes = maybe [] (dcTag' "coverage") $ epubCoverage md + rightsNodes = maybe [] (dcTag' "rights") $ epubRights md + coverImageNodes = maybe [] + (\img -> [unode "meta" ! [("name","cover"), + ("content",toId img)] $ ()]) + $ epubCoverImage md + modifiedNodes = [ unode "meta" ! [("property", "dcterms:modified")] $ + (showDateTimeISO8601 currentTime) | version == EPUB3 ] + dcTag n s = unode ("dc:" ++ n) s + dcTag' n s = [dcTag n s] + toIdentifierNode id' (Identifier txt scheme) + | version == EPUB2 = [dcNode "identifier" ! + ([("id",id')] ++ maybe [] (\x -> [("opf:scheme", x)]) scheme) $ + txt] + | otherwise = [dcNode "identifier" ! [("id",id')] $ txt] ++ + maybe [] (\x -> [unode "meta" ! + [("refines",'#':id'),("property","identifier-type"), + ("scheme","onix:codelist5")] $ x]) + (schemeToOnix `fmap` scheme) + toCreatorNode s id' creator + | version == EPUB2 = [dcNode s ! + (("id",id') : + maybe [] (\x -> [("opf:file-as",x)]) (creatorFileAs creator) ++ + maybe [] (\x -> [("opf:role",x)]) + (creatorRole creator >>= toRelator)) $ creatorText creator] + | otherwise = [dcNode s ! [("id",id')] $ creatorText creator] ++ + maybe [] (\x -> [unode "meta" ! + [("refines",'#':id'),("property","file-as")] $ x]) + (creatorFileAs creator) ++ + maybe [] (\x -> [unode "meta" ! + [("refines",'#':id'),("property","role"), + ("scheme","marc:relators")] $ x]) + (creatorRole creator >>= toRelator) + toTitleNode id' title + | version == EPUB2 = [dcNode "title" ! + (("id",id') : + maybe [] (\x -> [("opf:file-as",x)]) (titleFileAs title) ++ + maybe [] (\x -> [("opf:title-type",x)]) (titleType title)) $ + titleText title] + | otherwise = [dcNode "title" ! [("id",id')] $ titleText title] + ++ + maybe [] (\x -> [unode "meta" ! + [("refines",'#':id'),("property","file-as")] $ x]) + (titleFileAs title) ++ + maybe [] (\x -> [unode "meta" ! + [("refines",'#':id'),("property","title-type")] $ x]) + (titleType title) + toDateNode id' date = [dcNode "date" ! + (("id",id') : + maybe [] (\x -> [("opf:event",x)]) (dateEvent date)) $ + dateText date] + schemeToOnix "ISBN-10" = "02" + schemeToOnix "GTIN-13" = "03" + schemeToOnix "UPC" = "04" + schemeToOnix "ISMN-10" = "05" + schemeToOnix "DOI" = "06" + schemeToOnix "LCCN" = "13" + schemeToOnix "GTIN-14" = "14" + schemeToOnix "ISBN-13" = "15" + schemeToOnix "Legal deposit number" = "17" + schemeToOnix "URN" = "22" + schemeToOnix "OCLC" = "23" + schemeToOnix "ISMN-13" = "25" + schemeToOnix "ISBN-A" = "26" + schemeToOnix "JP" = "27" + schemeToOnix "OLCC" = "28" + schemeToOnix _ = "01" showDateTimeISO8601 :: UTCTime -> String showDateTimeISO8601 = formatTime defaultTimeLocale "%FT%TZ" +transformTag :: WriterOptions + -> IORef [(FilePath, FilePath)] -- ^ (oldpath, newpath) media + -> Tag String + -> IO (Tag String) +transformTag opts mediaRef tag@(TagOpen name attr) + | name == "video" || name == "source" || name == "img" = do + let src = fromAttrib "src" tag + let poster = fromAttrib "poster" tag + let oldsrc = maybe src (</> src) $ writerSourceURL opts + let oldposter = maybe poster (</> poster) $ writerSourceURL opts + newsrc <- modifyMediaRef mediaRef oldsrc + newposter <- modifyMediaRef mediaRef oldposter + let attr' = filter (\(x,_) -> x /= "src" && x /= "poster") attr ++ + [("src", newsrc) | not (null newsrc)] ++ + [("poster", newposter) | not (null newposter)] + return $ TagOpen name attr' +transformTag _ _ tag = return tag + +modifyMediaRef :: IORef [(FilePath, FilePath)] -> FilePath -> IO FilePath +modifyMediaRef _ "" = return "" +modifyMediaRef mediaRef oldsrc = do + media <- readIORef mediaRef + case lookup oldsrc media of + Just n -> return n + Nothing -> do + let new = "media/file" ++ show (length media) ++ + takeExtension oldsrc + modifyIORef mediaRef ( (oldsrc, new): ) + return new + +transformBlock :: WriterOptions + -> IORef [(FilePath, FilePath)] -- ^ (oldpath, newpath) media + -> Block + -> IO Block +transformBlock opts mediaRef (RawBlock fmt raw) + | fmt == Format "html" = do + let tags = parseTags raw + tags' <- mapM (transformTag opts mediaRef) tags + return $ RawBlock fmt (renderTags tags') +transformBlock _ _ b = return b + transformInline :: WriterOptions - -> FilePath - -> IORef [(FilePath, FilePath)] -- ^ (oldpath, newpath) images + -> IORef [(FilePath, FilePath)] -- ^ (oldpath, newpath) media -> Inline -> IO Inline -transformInline opts sourceDir picsRef (Image lab (src,tit)) - | isAbsoluteURI src = do - raw <- makeSelfContained Nothing - $ writeHtmlInline opts (Image lab (src,tit)) - return $ RawInline "html" raw - | otherwise = do +transformInline opts mediaRef (Image lab (src,tit)) = do let src' = unEscapeString src - pics <- readIORef picsRef - let oldsrc = sourceDir </> src' - let ext = takeExtension src' - newsrc <- case lookup oldsrc pics of - Just n -> return n - Nothing -> do - let new = "images/img" ++ show (length pics) ++ ext - modifyIORef picsRef ( (oldsrc, new): ) - return new + let oldsrc = maybe src' (</> src) $ writerSourceURL opts + newsrc <- modifyMediaRef mediaRef oldsrc return $ Image lab (newsrc, tit) -transformInline opts _ _ (x@(Math _ _)) +transformInline opts _ (x@(Math _ _)) | WebTeX _ <- writerHTMLMathMethod opts = do raw <- makeSelfContained Nothing $ writeHtmlInline opts x - return $ RawInline "html" raw -transformInline _ _ _ x = return x + return $ RawInline (Format "html") raw +transformInline _ _ x = return x writeHtmlInline :: WriterOptions -> Inline -> String writeHtmlInline opts z = trimr $ @@ -462,16 +824,12 @@ ppTopElement = ("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" ++) . unEntity . Nothing -> '&':'#':unEntity xs unEntity (x:xs) = x : unEntity xs -imageTypeOf :: FilePath -> Maybe String -imageTypeOf x = case drop 1 (map toLower (takeExtension x)) of - "jpg" -> Just "image/jpeg" - "jpeg" -> Just "image/jpeg" - "jfif" -> Just "image/jpeg" - "png" -> Just "image/png" - "gif" -> Just "image/gif" - "svg" -> Just "image/svg+xml" - _ -> Nothing - +mediaTypeOf :: FilePath -> Maybe String +mediaTypeOf x = case getMimeType x of + Just y@('i':'m':'a':'g':'e':_) -> Just y + Just y@('v':'i':'d':'e':'o':_) -> Just y + Just y@('a':'u':'d':'i':'o':_) -> Just y + _ -> Nothing data IdentState = IdentState{ chapterNumber :: Int, @@ -519,9 +877,293 @@ correlateRefs chapterHeaderLevel bs = -- Replace internal link references using the table produced -- by correlateRefs. replaceRefs :: [(String,String)] -> [Block] -> [Block] -replaceRefs refTable = bottomUp replaceOneRef +replaceRefs refTable = walk replaceOneRef where replaceOneRef x@(Link lab ('#':xs,tit)) = case lookup xs refTable of Just url -> Link lab (url,tit) Nothing -> x replaceOneRef x = x + +-- Variant of normalizeDate that allows partial dates: YYYY, YYYY-MM +normalizeDate' :: String -> Maybe String +normalizeDate' xs = + let xs' = trim xs in + case xs' of + [y1,y2,y3,y4] | all isDigit [y1,y2,y3,y4] -> Just xs' -- YYYY + [y1,y2,y3,y4,'-',m1,m2] | all isDigit [y1,y2,y3,y4,m1,m2] -- YYYY-MM + -> Just xs' + _ -> normalizeDate xs' + +toRelator :: String -> Maybe String +toRelator x + | x `elem` relators = Just x + | otherwise = lookup (map toLower x) relatorMap + +relators :: [String] +relators = map snd relatorMap + +relatorMap :: [(String, String)] +relatorMap = + [("abridger", "abr") + ,("actor", "act") + ,("adapter", "adp") + ,("addressee", "rcp") + ,("analyst", "anl") + ,("animator", "anm") + ,("annotator", "ann") + ,("appellant", "apl") + ,("appellee", "ape") + ,("applicant", "app") + ,("architect", "arc") + ,("arranger", "arr") + ,("art copyist", "acp") + ,("art director", "adi") + ,("artist", "art") + ,("artistic director", "ard") + ,("assignee", "asg") + ,("associated name", "asn") + ,("attributed name", "att") + ,("auctioneer", "auc") + ,("author", "aut") + ,("author in quotations or text abstracts", "aqt") + ,("author of afterword, colophon, etc.", "aft") + ,("author of dialog", "aud") + ,("author of introduction, etc.", "aui") + ,("autographer", "ato") + ,("bibliographic antecedent", "ant") + ,("binder", "bnd") + ,("binding designer", "bdd") + ,("blurb writer", "blw") + ,("book designer", "bkd") + ,("book producer", "bkp") + ,("bookjacket designer", "bjd") + ,("bookplate designer", "bpd") + ,("bookseller", "bsl") + ,("braille embosser", "brl") + ,("broadcaster", "brd") + ,("calligrapher", "cll") + ,("cartographer", "ctg") + ,("caster", "cas") + ,("censor", "cns") + ,("choreographer", "chr") + ,("cinematographer", "cng") + ,("client", "cli") + ,("collection registrar", "cor") + ,("collector", "col") + ,("collotyper", "clt") + ,("colorist", "clr") + ,("commentator", "cmm") + ,("commentator for written text", "cwt") + ,("compiler", "com") + ,("complainant", "cpl") + ,("complainant-appellant", "cpt") + ,("complainant-appellee", "cpe") + ,("composer", "cmp") + ,("compositor", "cmt") + ,("conceptor", "ccp") + ,("conductor", "cnd") + ,("conservator", "con") + ,("consultant", "csl") + ,("consultant to a project", "csp") + ,("contestant", "cos") + ,("contestant-appellant", "cot") + ,("contestant-appellee", "coe") + ,("contestee", "cts") + ,("contestee-appellant", "ctt") + ,("contestee-appellee", "cte") + ,("contractor", "ctr") + ,("contributor", "ctb") + ,("copyright claimant", "cpc") + ,("copyright holder", "cph") + ,("corrector", "crr") + ,("correspondent", "crp") + ,("costume designer", "cst") + ,("court governed", "cou") + ,("court reporter", "crt") + ,("cover designer", "cov") + ,("creator", "cre") + ,("curator", "cur") + ,("dancer", "dnc") + ,("data contributor", "dtc") + ,("data manager", "dtm") + ,("dedicatee", "dte") + ,("dedicator", "dto") + ,("defendant", "dfd") + ,("defendant-appellant", "dft") + ,("defendant-appellee", "dfe") + ,("degree granting institution", "dgg") + ,("delineator", "dln") + ,("depicted", "dpc") + ,("depositor", "dpt") + ,("designer", "dsr") + ,("director", "drt") + ,("dissertant", "dis") + ,("distribution place", "dbp") + ,("distributor", "dst") + ,("donor", "dnr") + ,("draftsman", "drm") + ,("dubious author", "dub") + ,("editor", "edt") + ,("editor of compilation", "edc") + ,("editor of moving image work", "edm") + ,("electrician", "elg") + ,("electrotyper", "elt") + ,("enacting jurisdiction", "enj") + ,("engineer", "eng") + ,("engraver", "egr") + ,("etcher", "etr") + ,("event place", "evp") + ,("expert", "exp") + ,("facsimilist", "fac") + ,("field director", "fld") + ,("film director", "fmd") + ,("film distributor", "fds") + ,("film editor", "flm") + ,("film producer", "fmp") + ,("filmmaker", "fmk") + ,("first party", "fpy") + ,("forger", "frg") + ,("former owner", "fmo") + ,("funder", "fnd") + ,("geographic information specialist", "gis") + ,("honoree", "hnr") + ,("host", "hst") + ,("host institution", "his") + ,("illuminator", "ilu") + ,("illustrator", "ill") + ,("inscriber", "ins") + ,("instrumentalist", "itr") + ,("interviewee", "ive") + ,("interviewer", "ivr") + ,("inventor", "inv") + ,("issuing body", "isb") + ,("judge", "jud") + ,("jurisdiction governed", "jug") + ,("laboratory", "lbr") + ,("laboratory director", "ldr") + ,("landscape architect", "lsa") + ,("lead", "led") + ,("lender", "len") + ,("libelant", "lil") + ,("libelant-appellant", "lit") + ,("libelant-appellee", "lie") + ,("libelee", "lel") + ,("libelee-appellant", "let") + ,("libelee-appellee", "lee") + ,("librettist", "lbt") + ,("licensee", "lse") + ,("licensor", "lso") + ,("lighting designer", "lgd") + ,("lithographer", "ltg") + ,("lyricist", "lyr") + ,("manufacture place", "mfp") + ,("manufacturer", "mfr") + ,("marbler", "mrb") + ,("markup editor", "mrk") + ,("metadata contact", "mdc") + ,("metal-engraver", "mte") + ,("moderator", "mod") + ,("monitor", "mon") + ,("music copyist", "mcp") + ,("musical director", "msd") + ,("musician", "mus") + ,("narrator", "nrt") + ,("onscreen presenter", "osp") + ,("opponent", "opn") + ,("organizer of meeting", "orm") + ,("originator", "org") + ,("other", "oth") + ,("owner", "own") + ,("panelist", "pan") + ,("papermaker", "ppm") + ,("patent applicant", "pta") + ,("patent holder", "pth") + ,("patron", "pat") + ,("performer", "prf") + ,("permitting agency", "pma") + ,("photographer", "pht") + ,("plaintiff", "ptf") + ,("plaintiff-appellant", "ptt") + ,("plaintiff-appellee", "pte") + ,("platemaker", "plt") + ,("praeses", "pra") + ,("presenter", "pre") + ,("printer", "prt") + ,("printer of plates", "pop") + ,("printmaker", "prm") + ,("process contact", "prc") + ,("producer", "pro") + ,("production company", "prn") + ,("production designer", "prs") + ,("production manager", "pmn") + ,("production personnel", "prd") + ,("production place", "prp") + ,("programmer", "prg") + ,("project director", "pdr") + ,("proofreader", "pfr") + ,("provider", "prv") + ,("publication place", "pup") + ,("publisher", "pbl") + ,("publishing director", "pbd") + ,("puppeteer", "ppt") + ,("radio director", "rdd") + ,("radio producer", "rpc") + ,("recording engineer", "rce") + ,("recordist", "rcd") + ,("redaktor", "red") + ,("renderer", "ren") + ,("reporter", "rpt") + ,("repository", "rps") + ,("research team head", "rth") + ,("research team member", "rtm") + ,("researcher", "res") + ,("respondent", "rsp") + ,("respondent-appellant", "rst") + ,("respondent-appellee", "rse") + ,("responsible party", "rpy") + ,("restager", "rsg") + ,("restorationist", "rsr") + ,("reviewer", "rev") + ,("rubricator", "rbr") + ,("scenarist", "sce") + ,("scientific advisor", "sad") + ,("screenwriter", "aus") + ,("scribe", "scr") + ,("sculptor", "scl") + ,("second party", "spy") + ,("secretary", "sec") + ,("seller", "sll") + ,("set designer", "std") + ,("setting", "stg") + ,("signer", "sgn") + ,("singer", "sng") + ,("sound designer", "sds") + ,("speaker", "spk") + ,("sponsor", "spn") + ,("stage director", "sgd") + ,("stage manager", "stm") + ,("standards body", "stn") + ,("stereotyper", "str") + ,("storyteller", "stl") + ,("supporting host", "sht") + ,("surveyor", "srv") + ,("teacher", "tch") + ,("technical director", "tcd") + ,("television director", "tld") + ,("television producer", "tlp") + ,("thesis advisor", "ths") + ,("transcriber", "trc") + ,("translator", "trl") + ,("type designer", "tyd") + ,("typographer", "tyg") + ,("university place", "uvp") + ,("videographer", "vdg") + ,("witness", "wit") + ,("wood engraver", "wde") + ,("woodcutter", "wdc") + ,("writer of accompanying material", "wam") + ,("writer of added commentary", "wac") + ,("writer of added lyrics", "wal") + ,("writer of added text", "wat") + ] + diff --git a/src/Text/Pandoc/Writers/FB2.hs b/src/Text/Pandoc/Writers/FB2.hs index 27f0c8305..803617f95 100644 --- a/src/Text/Pandoc/Writers/FB2.hs +++ b/src/Text/Pandoc/Writers/FB2.hs @@ -44,8 +44,8 @@ import qualified Text.XML.Light.Cursor as XC import Text.Pandoc.Definition import Text.Pandoc.Options (WriterOptions(..), HTMLMathMethod(..), def) -import Text.Pandoc.Shared (orderedListMarkers) -import Text.Pandoc.Generic (bottomUp) +import Text.Pandoc.Shared (orderedListMarkers, isHeaderBlock) +import Text.Pandoc.Walk -- | Data to be written at the end of the document: -- (foot)notes, URLs, references, images. @@ -157,9 +157,7 @@ renderSection level (ttl, body) = do else cMapM blockToXml body return $ el "section" (title ++ content) where - hasSubsections = any isHeader - isHeader (Header _ _ _) = True - isHeader _ = False + hasSubsections = any isHeaderBlock -- | Only <p> and <empty-line> are allowed within <title> in FB2. formatTitle :: [Inline] -> [Content] @@ -324,6 +322,7 @@ blockToXml (CodeBlock _ s) = return . spaceBeforeAfter . map (el "p" . el "code") . lines $ s blockToXml (RawBlock _ s) = return . spaceBeforeAfter . map (el "p" . el "code") . lines $ s +blockToXml (Div _ bs) = cMapM blockToXml bs blockToXml (BlockQuote bs) = liftM (list . el "cite") $ cMapM blockToXml bs blockToXml (OrderedList a bss) = do state <- get @@ -422,15 +421,20 @@ indent = indentBlock indentLines ins = let lns = split isLineBreak ins :: [[Inline]] in intercalate [LineBreak] $ map ((Str spacer):) lns +capitalize :: Inline -> Inline +capitalize (Str xs) = Str $ map toUpper xs +capitalize x = x + -- | Convert a Pandoc's Inline element to FictionBook XML representation. toXml :: Inline -> FBM [Content] toXml (Str s) = return [txt s] +toXml (Span _ ils) = cMapM toXml ils toXml (Emph ss) = list `liftM` wrap "emphasis" ss toXml (Strong ss) = list `liftM` wrap "strong" ss toXml (Strikeout ss) = list `liftM` wrap "strikethrough" ss toXml (Superscript ss) = list `liftM` wrap "sup" ss toXml (Subscript ss) = list `liftM` wrap "sub" ss -toXml (SmallCaps ss) = cMapM toXml $ bottomUp (map toUpper) ss +toXml (SmallCaps ss) = cMapM toXml $ walk capitalize ss toXml (Quoted SingleQuote ss) = do -- FIXME: should be language-specific inner <- cMapM toXml ss return $ [txt "‘"] ++ inner ++ [txt "’"] @@ -560,6 +564,7 @@ list = (:[]) plain :: Inline -> String plain (Str s) = s plain (Emph ss) = concat (map plain ss) +plain (Span _ ss) = concat (map plain ss) plain (Strong ss) = concat (map plain ss) plain (Strikeout ss) = concat (map plain ss) plain (Superscript ss) = concat (map plain ss) diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index 57bf2a349..9a26cf2ac 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings, CPP #-} {-# OPTIONS_GHC -fno-warn-deprecations #-} {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,7 +20,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.HTML - Copyright : Copyright (C) 2006-2010 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -39,13 +39,14 @@ import Text.Pandoc.Readers.TeXMath import Text.Pandoc.Slides import Text.Pandoc.Highlighting ( highlight, styleToCss, formatHtmlInline, formatHtmlBlock ) -import Text.Pandoc.XML (fromEntities) +import Text.Pandoc.XML (fromEntities, escapeStringForXML) +import Network.URI ( parseURIReference, URI(..) ) import Network.HTTP ( urlEncode ) import Numeric ( showHex ) import Data.Char ( ord, toLower ) import Data.List ( isPrefixOf, intersperse ) import Data.String ( fromString ) -import Data.Maybe ( catMaybes ) +import Data.Maybe ( catMaybes, fromMaybe ) import Control.Monad.State import Text.Blaze.Html hiding(contents) import Text.Blaze.Internal(preEscapedString) @@ -115,9 +116,10 @@ pandocToHtml opts (Pandoc meta blocks) = do (fmap renderHtml . blockListToHtml opts) (fmap renderHtml . inlineListToHtml opts) meta - let authsMeta = map stringify $ docAuthors meta - let dateMeta = stringify $ docDate meta - let slideLevel = maybe (getSlideLevel blocks) id $ writerSlideLevel opts + let stringifyHTML = escapeStringForXML . stringify + let authsMeta = map stringifyHTML $ docAuthors meta + let dateMeta = stringifyHTML $ docDate meta + let slideLevel = fromMaybe (getSlideLevel blocks) $ writerSlideLevel opts let sects = hierarchicalize $ if writerSlideVariant opts == NoSlides then blocks @@ -143,7 +145,11 @@ pandocToHtml opts (Pandoc meta blocks) = do MathJax url -> H.script ! A.src (toValue url) ! A.type_ "text/javascript" - $ mempty + $ case writerSlideVariant opts of + SlideousSlides -> + preEscapedString + "MathJax.Hub.Queue([\"Typeset\",MathJax.Hub]);" + _ -> mempty JsMath (Just url) -> H.script ! A.src (toValue url) ! A.type_ "text/javascript" @@ -167,7 +173,7 @@ pandocToHtml opts (Pandoc meta blocks) = do maybe id (defField "toc" . renderHtml) toc $ defField "author-meta" authsMeta $ maybe id (defField "date-meta") (normalizeDate dateMeta) $ - defField "pagetitle" (stringify $ docTitle meta) $ + defField "pagetitle" (stringifyHTML $ docTitle meta) $ defField "idprefix" (writerIdentifierPrefix opts) $ -- these should maybe be set in pandoc.hs defField "slidy-url" @@ -267,11 +273,23 @@ elementToHtml slideLevel opts (Sec level num (id',classes,keyvals) title' elemen else blockToHtml opts (Header level' (id',classes,keyvals) title') let isSec (Sec _ _ _ _ _) = True isSec (Blk _) = False + let isPause (Blk x) = x == Para [Str ".",Space,Str ".",Space,Str "."] + isPause _ = False + let fragmentClass = case writerSlideVariant opts of + RevealJsSlides -> "fragment" + _ -> "incremental" + let inDiv xs = Blk (RawBlock (Format "html") ("<div class=\"" + ++ fragmentClass ++ "\">")) : + (xs ++ [Blk (RawBlock (Format "html") "</div>")]) innerContents <- mapM (elementToHtml slideLevel opts) $ if titleSlide -- title slides have no content of their own then filter isSec elements - else elements + else if slide + then case splitBy isPause elements of + [] -> [] + (x:xs) -> x ++ concatMap inDiv xs + else elements let inNl x = mconcat $ nl opts : intersperse (nl opts) x ++ [nl opts] let classes' = ["titleslide" | titleSlide] ++ ["slide" | slide] ++ ["section" | (slide || writerSectionDivs opts) && @@ -379,7 +397,10 @@ imageExts = [ "art", "bmp", "cdr", "cdt", "cpt", "cr2", "crw", "djvu", "erf", treatAsImage :: FilePath -> Bool treatAsImage fp = - let ext = map toLower $ drop 1 $ takeExtension fp + let path = case uriPath `fmap` parseURIReference fp of + Nothing -> fp + Just up -> up + ext = map toLower $ drop 1 $ takeExtension path in null ext || ext `elem` imageExts -- | Convert Pandoc block element to HTML. @@ -400,15 +421,22 @@ blockToHtml opts (Para [Image txt (s,'f':'i':'g':':':tit)]) = do [nl opts, img, capt, nl opts] else H.div ! A.class_ "figure" $ mconcat [nl opts, img, capt, nl opts] --- . . . indicates a pause in a slideshow -blockToHtml opts (Para [Str ".",Space,Str ".",Space,Str "."]) - | writerSlideVariant opts == RevealJsSlides = - blockToHtml opts (RawBlock "html" "<div class=\"fragment\" />") blockToHtml opts (Para lst) = do contents <- inlineListToHtml opts lst return $ H.p contents -blockToHtml _ (RawBlock "html" str) = return $ preEscapedString str -blockToHtml _ (RawBlock _ _) = return mempty +blockToHtml opts (Div attr@(_,classes,_) bs) = do + contents <- blockListToHtml opts bs + let contents' = nl opts >> contents >> nl opts + return $ + if "notes" `elem` classes + then case writerSlideVariant opts of + RevealJsSlides -> addAttrs opts attr $ H5.aside $ contents' + NoSlides -> addAttrs opts attr $ H.div $ contents' + _ -> mempty + else addAttrs opts attr $ H.div $ contents' +blockToHtml _ (RawBlock f str) + | f == Format "html" = return $ preEscapedString str + | otherwise = return mempty blockToHtml opts (HorizontalRule) = return $ if writerHtml5 opts then H5.hr else H.hr blockToHtml opts (CodeBlock (id',classes,keyvals) rawCode) = do let tolhs = isEnabled Ext_literate_haskell opts && @@ -422,7 +450,10 @@ blockToHtml opts (CodeBlock (id',classes,keyvals) rawCode) = do adjCode = if tolhs then unlines . map ("> " ++) . lines $ rawCode else rawCode - case highlight formatHtmlBlock (id',classes',keyvals) adjCode of + hlCode = if writerHighlight opts -- check highlighting options + then highlight formatHtmlBlock (id',classes',keyvals) adjCode + else Nothing + case hlCode of Nothing -> return $ addAttrs opts (id',classes,keyvals) $ H.pre $ H.code $ toHtml adjCode Just h -> modify (\st -> st{ stHighlighting = True }) >> @@ -448,28 +479,22 @@ blockToHtml opts (BlockQuote blocks) = else do contents <- blockListToHtml opts blocks return $ H.blockquote $ nl opts >> contents >> nl opts -blockToHtml opts (Header level (ident,_,_) lst) = do +blockToHtml opts (Header level (_,classes,_) lst) = do contents <- inlineListToHtml opts lst secnum <- liftM stSecNum get let contents' = if writerNumberSections opts && not (null secnum) + && "unnumbered" `notElem` classes then (H.span ! A.class_ "header-section-number" $ toHtml $ showSecNum secnum) >> strToHtml " " >> contents else contents - let revealSlash = ['/' | writerSlideVariant opts == RevealJsSlides] - let contents'' = if writerTableOfContents opts && not (null ident) - then H.a ! A.href (toValue $ - '#' : revealSlash ++ - writerIdentifierPrefix opts ++ - ident) $ contents' - else contents' return $ case level of - 1 -> H.h1 contents'' - 2 -> H.h2 contents'' - 3 -> H.h3 contents'' - 4 -> H.h4 contents'' - 5 -> H.h5 contents'' - 6 -> H.h6 contents'' - _ -> H.p contents'' + 1 -> H.h1 contents' + 2 -> H.h2 contents' + 3 -> H.h3 contents' + 4 -> H.h4 contents' + 5 -> H.h5 contents' + 6 -> H.h6 contents' + _ -> H.p contents' blockToHtml opts (BulletList lst) = do contents <- mapM (blockListToHtml opts) lst return $ unordList opts contents @@ -497,7 +522,7 @@ blockToHtml opts (DefinitionList lst) = do contents <- mapM (\(term, defs) -> do term' <- if null term then return mempty - else liftM (H.dt) $ inlineListToHtml opts term + else liftM H.dt $ inlineListToHtml opts term defs' <- mapM ((liftM (\x -> H.dd $ (x >> nl opts))) . blockListToHtml opts) defs return $ mconcat $ nl opts : term' : nl opts : @@ -512,11 +537,16 @@ blockToHtml opts (Table capt aligns widths headers rows') = do let percent w = show (truncate (100*w) :: Integer) ++ "%" let coltags = if all (== 0.0) widths then mempty - else mconcat $ map (\w -> - if writerHtml5 opts - then H.col ! A.style (toValue $ "width: " ++ percent w) - else H.col ! A.width (toValue $ percent w) >> nl opts) - widths + else do + H.colgroup $ do + nl opts + mapM_ (\w -> do + if writerHtml5 opts + then H.col ! A.style (toValue $ "width: " ++ + percent w) + else H.col ! A.width (toValue $ percent w) + nl opts) widths + nl opts head' <- if all null headers then return mempty else do @@ -572,8 +602,7 @@ toListItem opts item = nl opts >> H.li item blockListToHtml :: WriterOptions -> [Block] -> State WriterState Html blockListToHtml opts lst = - mapM (blockToHtml opts) lst >>= - return . mconcat . intersperse (nl opts) + fmap (mconcat . intersperse (nl opts)) $ mapM (blockToHtml opts) lst -- | Convert list of Pandoc inline elements to HTML. inlineListToHtml :: WriterOptions -> [Inline] -> State WriterState Html @@ -587,16 +616,35 @@ inlineToHtml opts inline = (Str str) -> return $ strToHtml str (Space) -> return $ strToHtml " " (LineBreak) -> return $ if writerHtml5 opts then H5.br else H.br + (Span (id',classes,kvs) ils) + -> inlineListToHtml opts ils >>= + return . addAttrs opts attr' . H.span + where attr' = (id',classes',kvs') + classes' = filter (`notElem` ["csl-no-emph", + "csl-no-strong", + "csl-no-smallcaps"]) classes + kvs' = if null styles + then kvs + else (("style", concat styles) : kvs) + styles = ["font-style:normal;" + | "csl-no-emph" `elem` classes] + ++ ["font-weight:normal;" + | "csl-no-strong" `elem` classes] + ++ ["font-variant:normal;" + | "csl-no-smallcaps" `elem` classes] (Emph lst) -> inlineListToHtml opts lst >>= return . H.em (Strong lst) -> inlineListToHtml opts lst >>= return . H.strong - (Code attr str) -> case highlight formatHtmlInline attr str of + (Code attr str) -> case hlCode of Nothing -> return $ addAttrs opts attr $ H.code $ strToHtml str Just h -> do modify $ \st -> st{ stHighlighting = True } return $ addAttrs opts (id',[],keyvals) h - where (id',_,keyvals) = attr + where (id',_,keyvals) = attr + hlCode = if writerHighlight opts + then highlight formatHtmlInline attr str + else Nothing (Strikeout lst) -> inlineListToHtml opts lst >>= return . H.del (SmallCaps lst) -> inlineListToHtml opts lst >>= @@ -654,25 +702,27 @@ inlineToHtml opts inline = Right r -> return $ preEscapedString $ ppcElement conf r Left _ -> inlineListToHtml opts - (readTeXMath str) >>= return . + (readTeXMath' t str) >>= return . (H.span ! A.class_ "math") MathJax _ -> return $ H.span ! A.class_ "math" $ toHtml $ case t of InlineMath -> "\\(" ++ str ++ "\\)" DisplayMath -> "\\[" ++ str ++ "\\]" PlainMath -> do - x <- inlineListToHtml opts (readTeXMath str) + x <- inlineListToHtml opts (readTeXMath' t str) let m = H.span ! A.class_ "math" $ x let brtag = if writerHtml5 opts then H5.br else H.br return $ case t of InlineMath -> m DisplayMath -> brtag >> m >> brtag ) - (RawInline "latex" str) -> case writerHTMLMathMethod opts of + (RawInline f str) + | f == Format "latex" -> + case writerHTMLMathMethod opts of LaTeXMathML _ -> do modify (\st -> st {stMath = True}) return $ toHtml str _ -> return mempty - (RawInline "html" str) -> return $ preEscapedString str - (RawInline _ _) -> return mempty + | f == Format "html" -> return $ preEscapedString str + | otherwise -> return mempty (Link [Str str] (s,_)) | "mailto:" `isPrefixOf` s && s == escapeURI ("mailto" ++ str) -> -- autolink @@ -709,7 +759,9 @@ inlineToHtml opts inline = else [A.title $ toValue tit]) return $ foldl (!) H5.embed attributes -- note: null title included, as in Markdown.pl - (Note contents) -> do + (Note contents) + | writerIgnoreNotes opts -> return mempty + | otherwise -> do st <- get let notes = stNotes st let number = (length notes) + 1 @@ -724,11 +776,11 @@ inlineToHtml opts inline = writerIdentifierPrefix opts ++ "fn" ++ ref) ! A.class_ "footnoteRef" ! prefixedId opts ("fnref" ++ ref) + $ H.sup $ toHtml ref - let link' = case writerEpubVersion opts of - Just EPUB3 -> link ! customAttribute "epub:type" "noteref" - _ -> link - return $ H.sup $ link' + return $ case writerEpubVersion opts of + Just EPUB3 -> link ! customAttribute "epub:type" "noteref" + _ -> link (Cite cits il)-> do contents <- inlineListToHtml opts il let citationIds = unwords $ map citationId cits let result = H.span ! A.class_ "citation" $ contents diff --git a/src/Text/Pandoc/Writers/Haddock.hs b/src/Text/Pandoc/Writers/Haddock.hs new file mode 100644 index 000000000..1c82839d0 --- /dev/null +++ b/src/Text/Pandoc/Writers/Haddock.hs @@ -0,0 +1,346 @@ +{-# LANGUAGE OverloadedStrings, TupleSections, ScopedTypeVariables #-} +{- +Copyright (C) 2014 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.Haddock + Copyright : Copyright (C) 2014 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to haddock markup. + +Haddock: <http://www.haskell.org/haddock/doc/html/> +-} +module Text.Pandoc.Writers.Haddock (writeHaddock) where +import Text.Pandoc.Definition +import Text.Pandoc.Templates (renderTemplate') +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Options +import Data.List ( intersperse, transpose ) +import Text.Pandoc.Pretty +import Control.Monad.State +import Text.Pandoc.Readers.TeXMath (readTeXMath') +import Network.URI (isURI) +import Data.Default + +type Notes = [[Block]] +data WriterState = WriterState { stNotes :: Notes } +instance Default WriterState + where def = WriterState{ stNotes = [] } + +-- | Convert Pandoc to Haddock. +writeHaddock :: WriterOptions -> Pandoc -> String +writeHaddock opts document = + evalState (pandocToHaddock opts{ + writerWrapText = writerWrapText opts } document) def + +-- | Return haddock representation of document. +pandocToHaddock :: WriterOptions -> Pandoc -> State WriterState String +pandocToHaddock opts (Pandoc meta blocks) = do + let colwidth = if writerWrapText opts + then Just $ writerColumns opts + else Nothing + body <- blockListToHaddock opts blocks + st <- get + notes' <- notesToHaddock opts (reverse $ stNotes st) + let render' :: Doc -> String + render' = render colwidth + let main = render' $ body <> + (if isEmpty notes' then empty else blankline <> notes') + metadata <- metaToJSON opts + (fmap (render colwidth) . blockListToHaddock opts) + (fmap (render colwidth) . inlineListToHaddock opts) + meta + let context = defField "body" main + $ metadata + if writerStandalone opts + then return $ renderTemplate' (writerTemplate opts) context + else return main + +-- | Return haddock representation of notes. +notesToHaddock :: WriterOptions -> [[Block]] -> State WriterState Doc +notesToHaddock opts notes = + if null notes + then return empty + else do + contents <- blockToHaddock opts $ OrderedList (1,DefaultStyle,DefaultDelim) notes + return $ text "#notes#" <> blankline <> contents + +-- | Escape special characters for Haddock. +escapeString :: String -> String +escapeString = escapeStringUsing haddockEscapes + where haddockEscapes = backslashEscapes "\\/'`\"@<" + +-- | Convert Pandoc block element to haddock. +blockToHaddock :: WriterOptions -- ^ Options + -> Block -- ^ Block element + -> State WriterState Doc +blockToHaddock _ Null = return empty +blockToHaddock opts (Div _ ils) = do + contents <- blockListToHaddock opts ils + return $ contents <> blankline +blockToHaddock opts (Plain inlines) = do + contents <- inlineListToHaddock opts inlines + return $ contents <> cr +-- title beginning with fig: indicates figure +blockToHaddock opts (Para [Image alt (src,'f':'i':'g':':':tit)]) = + blockToHaddock opts (Para [Image alt (src,tit)]) +blockToHaddock opts (Para inlines) = + -- TODO: if it contains linebreaks, we need to use a @...@ block + (<> blankline) `fmap` blockToHaddock opts (Plain inlines) +blockToHaddock _ (RawBlock f str) + | f == "haddock" = do + return $ text str <> text "\n" + | otherwise = return empty +blockToHaddock opts HorizontalRule = + return $ blankline <> text (replicate (writerColumns opts) '_') <> blankline +blockToHaddock opts (Header level (ident,_,_) inlines) = do + contents <- inlineListToHaddock opts inlines + let attr' = if null ident + then empty + else cr <> text "#" <> text ident <> text "#" + return $ nowrap (text (replicate level '=') <> space <> contents) + <> attr' <> blankline +blockToHaddock _ (CodeBlock (_,_,_) str) = + return $ prefixed "> " (text str) <> blankline +-- Nothing in haddock corresponds to block quotes: +blockToHaddock opts (BlockQuote blocks) = + blockListToHaddock opts blocks +-- Haddock doesn't have tables. Use haddock tables in code. +blockToHaddock opts (Table caption aligns widths headers rows) = do + caption' <- inlineListToHaddock opts caption + let caption'' = if null caption + then empty + else blankline <> caption' <> blankline + rawHeaders <- mapM (blockListToHaddock opts) headers + rawRows <- mapM (mapM (blockListToHaddock opts)) rows + let isSimple = all (==0) widths + let isPlainBlock (Plain _) = True + isPlainBlock _ = False + let hasBlocks = not (all isPlainBlock $ concat . concat $ headers:rows) + (nst,tbl) <- case True of + _ | isSimple -> fmap (nest 2,) $ + pandocTable opts (all null headers) aligns widths + rawHeaders rawRows + | not hasBlocks -> fmap (nest 2,) $ + pandocTable opts (all null headers) aligns widths + rawHeaders rawRows + | otherwise -> fmap (id,) $ + gridTable opts (all null headers) aligns widths + rawHeaders rawRows + return $ (prefixed "> " $ nst $ tbl $$ blankline $$ caption'') $$ blankline +blockToHaddock opts (BulletList items) = do + contents <- mapM (bulletListItemToHaddock opts) items + return $ cat contents <> blankline +blockToHaddock opts (OrderedList (start,_,delim) items) = do + let attribs = (start, Decimal, delim) + let markers = orderedListMarkers attribs + let markers' = map (\m -> if length m < 3 + then m ++ replicate (3 - length m) ' ' + else m) markers + contents <- mapM (\(item, num) -> orderedListItemToHaddock opts item num) $ + zip markers' items + return $ cat contents <> blankline +blockToHaddock opts (DefinitionList items) = do + contents <- mapM (definitionListItemToHaddock opts) items + return $ cat contents <> blankline + +pandocTable :: WriterOptions -> Bool -> [Alignment] -> [Double] + -> [Doc] -> [[Doc]] -> State WriterState Doc +pandocTable opts headless aligns widths rawHeaders rawRows = do + let isSimple = all (==0) widths + let alignHeader alignment = case alignment of + AlignLeft -> lblock + AlignCenter -> cblock + AlignRight -> rblock + AlignDefault -> lblock + let numChars = maximum . map offset + let widthsInChars = if isSimple + then map ((+2) . numChars) + $ transpose (rawHeaders : rawRows) + else map + (floor . (fromIntegral (writerColumns opts) *)) + widths + let makeRow = hcat . intersperse (lblock 1 (text " ")) . + (zipWith3 alignHeader aligns widthsInChars) + let rows' = map makeRow rawRows + let head' = makeRow rawHeaders + let maxRowHeight = maximum $ map height (head':rows') + let underline = cat $ intersperse (text " ") $ + map (\width -> text (replicate width '-')) widthsInChars + let border = if maxRowHeight > 1 + then text (replicate (sum widthsInChars + + length widthsInChars - 1) '-') + else if headless + then underline + else empty + let head'' = if headless + then empty + else border <> cr <> head' + let body = if maxRowHeight > 1 + then vsep rows' + else vcat rows' + let bottom = if headless + then underline + else border + return $ head'' $$ underline $$ body $$ bottom + +gridTable :: WriterOptions -> Bool -> [Alignment] -> [Double] + -> [Doc] -> [[Doc]] -> State WriterState Doc +gridTable opts headless _aligns widths headers' rawRows = do + let numcols = length headers' + let widths' = if all (==0) widths + then replicate numcols (1.0 / fromIntegral numcols) + else widths + let widthsInChars = map (floor . (fromIntegral (writerColumns opts) *)) widths' + let hpipeBlocks blocks = hcat [beg, middle, end] + where h = maximum (map height blocks) + sep' = lblock 3 $ vcat (map text $ replicate h " | ") + beg = lblock 2 $ vcat (map text $ replicate h "| ") + end = lblock 2 $ vcat (map text $ replicate h " |") + middle = chomp $ hcat $ intersperse sep' blocks + let makeRow = hpipeBlocks . zipWith lblock widthsInChars + let head' = makeRow headers' + let rows' = map (makeRow . map chomp) rawRows + let border ch = char '+' <> char ch <> + (hcat $ intersperse (char ch <> char '+' <> char ch) $ + map (\l -> text $ replicate l ch) widthsInChars) <> + char ch <> char '+' + let body = vcat $ intersperse (border '-') rows' + let head'' = if headless + then empty + else head' $$ border '=' + return $ border '-' $$ head'' $$ body $$ border '-' + +-- | Convert bullet list item (list of blocks) to haddock +bulletListItemToHaddock :: WriterOptions -> [Block] -> State WriterState Doc +bulletListItemToHaddock opts items = do + contents <- blockListToHaddock opts items + let sps = replicate (writerTabStop opts - 2) ' ' + let start = text ('-' : ' ' : sps) + -- remove trailing blank line if it is a tight list + let contents' = case reverse items of + (BulletList xs:_) | isTightList xs -> + chomp contents <> cr + (OrderedList _ xs:_) | isTightList xs -> + chomp contents <> cr + _ -> contents + return $ hang (writerTabStop opts) start $ contents' <> cr + +-- | Convert ordered list item (a list of blocks) to haddock +orderedListItemToHaddock :: WriterOptions -- ^ options + -> String -- ^ list item marker + -> [Block] -- ^ list item (list of blocks) + -> State WriterState Doc +orderedListItemToHaddock opts marker items = do + contents <- blockListToHaddock opts items + let sps = case length marker - writerTabStop opts of + n | n > 0 -> text $ replicate n ' ' + _ -> text " " + let start = text marker <> sps + return $ hang (writerTabStop opts) start $ contents <> cr + +-- | Convert definition list item (label, list of blocks) to haddock +definitionListItemToHaddock :: WriterOptions + -> ([Inline],[[Block]]) + -> State WriterState Doc +definitionListItemToHaddock opts (label, defs) = do + labelText <- inlineListToHaddock opts label + defs' <- mapM (mapM (blockToHaddock opts)) defs + let contents = vcat $ map (\d -> hang 4 empty $ vcat d <> cr) defs' + return $ nowrap (brackets labelText) <> cr <> contents <> cr + +-- | Convert list of Pandoc block elements to haddock +blockListToHaddock :: WriterOptions -- ^ Options + -> [Block] -- ^ List of block elements + -> State WriterState Doc +blockListToHaddock opts blocks = + mapM (blockToHaddock opts) blocks >>= return . cat + +-- | Convert list of Pandoc inline elements to haddock. +inlineListToHaddock :: WriterOptions -> [Inline] -> State WriterState Doc +inlineListToHaddock opts lst = + mapM (inlineToHaddock opts) lst >>= return . cat + +-- | Convert Pandoc inline element to haddock. +inlineToHaddock :: WriterOptions -> Inline -> State WriterState Doc +inlineToHaddock opts (Span (ident,_,_) ils) = do + contents <- inlineListToHaddock opts ils + if not (null ident) && null ils + then return $ "#" <> text ident <> "#" + else return contents +inlineToHaddock opts (Emph lst) = do + contents <- inlineListToHaddock opts lst + return $ "/" <> contents <> "/" +inlineToHaddock opts (Strong lst) = do + contents <- inlineListToHaddock opts lst + return $ "__" <> contents <> "__" +inlineToHaddock opts (Strikeout lst) = do + contents <- inlineListToHaddock opts lst + -- not supported in haddock, but we fake it: + return $ "~~" <> contents <> "~~" +-- not supported in haddock: +inlineToHaddock opts (Superscript lst) = inlineListToHaddock opts lst +-- not supported in haddock: +inlineToHaddock opts (Subscript lst) = inlineListToHaddock opts lst +-- not supported in haddock: +inlineToHaddock opts (SmallCaps lst) = inlineListToHaddock opts lst +inlineToHaddock opts (Quoted SingleQuote lst) = do + contents <- inlineListToHaddock opts lst + return $ "‘" <> contents <> "’" +inlineToHaddock opts (Quoted DoubleQuote lst) = do + contents <- inlineListToHaddock opts lst + return $ "“" <> contents <> "”" +inlineToHaddock _ (Code _ str) = + return $ "@" <> text (escapeString str) <> "@" +inlineToHaddock _ (Str str) = do + return $ text $ escapeString str +inlineToHaddock opts (Math mt str) = do + let adjust x = case mt of + DisplayMath -> cr <> x <> cr + InlineMath -> x + adjust `fmap` (inlineListToHaddock opts $ readTeXMath' mt str) +inlineToHaddock _ (RawInline f str) + | f == "haddock" = return $ text str + | otherwise = return empty +-- no line break in haddock (see above on CodeBlock) +inlineToHaddock _ (LineBreak) = return cr +inlineToHaddock _ Space = return space +inlineToHaddock opts (Cite _ lst) = inlineListToHaddock opts lst +inlineToHaddock opts (Link txt (src, _)) = do + linktext <- inlineListToHaddock opts txt + let useAuto = isURI src && + case txt of + [Str s] | escapeURI s == src -> True + _ -> False + return $ nowrap $ "<" <> text src <> + (if useAuto then empty else space <> linktext) <> ">" +inlineToHaddock opts (Image alternate (source, tit)) = do + linkhaddock <- inlineToHaddock opts (Link alternate (source, tit)) + return $ "<" <> linkhaddock <> ">" +-- haddock doesn't have notes, but we can fake it: +inlineToHaddock opts (Note contents) = do + modify (\st -> st{ stNotes = contents : stNotes st }) + st <- get + let ref = text $ writerIdentifierPrefix opts ++ show (length $ stNotes st) + return $ "<#notes [" <> ref <> "]>" diff --git a/src/Text/Pandoc/Writers/ICML.hs b/src/Text/Pandoc/Writers/ICML.hs new file mode 100644 index 000000000..19d486b25 --- /dev/null +++ b/src/Text/Pandoc/Writers/ICML.hs @@ -0,0 +1,525 @@ +{-# LANGUAGE OverloadedStrings #-} + +{- | + Module : Text.Pandoc.Writers.ICML + Copyright : Copyright (C) 2013 github.com/mb21 + License : GNU GPL, version 2 or above + + Stability : alpha + +Conversion of 'Pandoc' documents to Adobe InCopy ICML, a stand-alone XML format +which is a subset of the zipped IDML format for which the documentation is +available here: http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/indesign/sdk/cs6/idml/idml-specification.pdf +InCopy is the companion word-processor to Adobe InDesign and ICML documents can be integrated +into InDesign with File -> Place. +-} +module Text.Pandoc.Writers.ICML (writeICML) where +import Text.Pandoc.Definition +import Text.Pandoc.XML +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Shared (splitBy) +import Text.Pandoc.Options +import Text.Pandoc.Templates (renderTemplate') +import Text.Pandoc.Pretty +import Data.List (isPrefixOf, isInfixOf, stripPrefix) +import Data.Text as Text (breakOnAll, pack) +import Data.Monoid (mappend) +import Control.Monad.State +import qualified Data.Set as Set + +type Style = [String] +type Hyperlink = [(Int, String)] + +data WriterState = WriterState{ + blockStyles :: Set.Set String + , inlineStyles :: Set.Set String + , links :: Hyperlink + , listDepth :: Int + , maxListDepth :: Int + } + +type WS a = State WriterState a + +defaultWriterState :: WriterState +defaultWriterState = WriterState{ + blockStyles = Set.empty + , inlineStyles = Set.empty + , links = [] + , listDepth = 1 + , maxListDepth = 0 + } + +-- inline names (appear in InDesign's character styles pane) +emphName :: String +strongName :: String +strikeoutName :: String +superscriptName :: String +subscriptName :: String +smallCapsName :: String +codeName :: String +linkName :: String +emphName = "Italic" +strongName = "Bold" +strikeoutName = "Strikeout" +superscriptName = "Superscript" +subscriptName = "Subscript" +smallCapsName = "SmallCaps" +codeName = "Code" +linkName = "Link" + +-- block element names (appear in InDesign's paragraph styles pane) +paragraphName :: String +codeBlockName :: String +rawBlockName :: String +blockQuoteName :: String +orderedListName :: String +bulletListName :: String +defListTermName :: String +defListDefName :: String +headerName :: String +tableName :: String +tableHeaderName :: String +tableCaptionName :: String +alignLeftName :: String +alignRightName :: String +alignCenterName :: String +firstListItemName :: String +beginsWithName :: String +lowerRomanName :: String +upperRomanName :: String +lowerAlphaName :: String +upperAlphaName :: String +subListParName :: String +footnoteName :: String +paragraphName = "Paragraph" +codeBlockName = "CodeBlock" +rawBlockName = "Rawblock" +blockQuoteName = "Blockquote" +orderedListName = "NumList" +bulletListName = "BulList" +defListTermName = "DefListTerm" +defListDefName = "DefListDef" +headerName = "Header" +tableName = "TablePar" +tableHeaderName = "TableHeader" +tableCaptionName = "TableCaption" +alignLeftName = "LeftAlign" +alignRightName = "RightAlign" +alignCenterName = "CenterAlign" +firstListItemName = "first" +beginsWithName = "beginsWith-" +lowerRomanName = "lowerRoman" +upperRomanName = "upperRoman" +lowerAlphaName = "lowerAlpha" +upperAlphaName = "upperAlpha" +subListParName = "subParagraph" +footnoteName = "Footnote" + + +-- | Convert Pandoc document to string in ICML format. +writeICML :: WriterOptions -> Pandoc -> String +writeICML opts (Pandoc meta blocks) = + let colwidth = if writerWrapText opts + then Just $ writerColumns opts + else Nothing + render' = render colwidth + renderMeta f s = Just $ render' $ fst $ runState (f opts [] s) defaultWriterState + Just metadata = metaToJSON opts + (renderMeta blocksToICML) + (renderMeta inlinesToICML) + meta + (doc, st) = runState (blocksToICML opts [] blocks) defaultWriterState + main = render' doc + context = defField "body" main + $ defField "charStyles" (render' $ charStylesToDoc st) + $ defField "parStyles" (render' $ parStylesToDoc st) + $ defField "hyperlinks" (render' $ hyperlinksToDoc $ links st) + $ metadata + in if writerStandalone opts + then renderTemplate' (writerTemplate opts) context + else main + +-- | Auxilary functions for parStylesToDoc and charStylesToDoc. +contains :: String -> (String, (String, String)) -> [(String, String)] +contains s rule = + if isInfixOf (fst rule) s + then [snd rule] + else [] + +-- | The monospaced font to use as default. +monospacedFont :: Doc +monospacedFont = inTags False "AppliedFont" [("type", "string")] $ text "Courier New" + +-- | How much to indent blockquotes etc. +defaultIndent :: Int +defaultIndent = 20 + +-- | How much to indent numbered lists before the number. +defaultListIndent :: Int +defaultListIndent = 10 + +-- other constants +lineSeparator :: String +lineSeparator = "
" + +-- | Convert a WriterState with its block styles to the ICML listing of Paragraph Styles. +parStylesToDoc :: WriterState -> Doc +parStylesToDoc st = vcat $ map makeStyle $ Set.toAscList $ blockStyles st + where + makeStyle s = + let countSubStrs sub str = length $ Text.breakOnAll (Text.pack sub) (Text.pack str) + attrs = concat $ map (contains s) $ [ + (defListTermName, ("BulletsAndNumberingListType", "BulletList")) + , (defListTermName, ("FontStyle", "Bold")) + , (tableHeaderName, ("FontStyle", "Bold")) + , (alignLeftName, ("Justification", "LeftAlign")) + , (alignRightName, ("Justification", "RightAlign")) + , (alignCenterName, ("Justification", "CenterAlign")) + , (headerName++"1", ("PointSize", "36")) + , (headerName++"2", ("PointSize", "30")) + , (headerName++"3", ("PointSize", "24")) + , (headerName++"4", ("PointSize", "18")) + , (headerName++"5", ("PointSize", "14")) + ] + -- what is the most nested list type, if any? + (isBulletList, isOrderedList) = findList $ reverse $ splitBy (==' ') s + where + findList [] = (False, False) + findList (x:xs) | x == bulletListName = (True, False) + | x == orderedListName = (False, True) + | otherwise = findList xs + nBuls = countSubStrs bulletListName s + nOrds = countSubStrs orderedListName s + attrs' = numbering ++ listType ++ indent ++ attrs + where + numbering | isOrderedList = [("NumberingExpression", "^#.^t"), ("NumberingLevel", show nOrds)] + | otherwise = [] + listType | isOrderedList && (not $ isInfixOf subListParName s) + = [("BulletsAndNumberingListType", "NumberedList")] + | isBulletList && (not $ isInfixOf subListParName s) + = [("BulletsAndNumberingListType", "BulletList")] + | otherwise = [] + indent = [("LeftIndent", show indt)] + where + nBlockQuotes = countSubStrs blockQuoteName s + nDefLists = countSubStrs defListDefName s + indt = max 0 $ defaultListIndent*(nBuls + nOrds - 1) + defaultIndent*(nBlockQuotes + nDefLists) + props = inTags True "Properties" [] $ (basedOn $$ tabList $$ numbForm) + where + font = if isInfixOf codeBlockName s + then monospacedFont + else empty + basedOn = inTags False "BasedOn" [("type", "object")] (text "$ID/NormalParagraphStyle") $$ font + tabList = if isBulletList + then inTags True "TabList" [("type","list")] $ inTags True "ListItem" [("type","record")] + $ vcat [ + inTags False "Alignment" [("type","enumeration")] $ text "LeftAlign" + , inTags False "AlignmentCharacter" [("type","string")] $ text "." + , selfClosingTag "Leader" [("type","string")] + , inTags False "Position" [("type","unit")] $ text + $ show $ defaultListIndent * (nBuls + nOrds) + ] + else empty + makeNumb name = inTags False "NumberingFormat" [("type", "string")] (text name) + numbForm | isInfixOf lowerRomanName s = makeNumb "i, ii, iii, iv..." + | isInfixOf upperRomanName s = makeNumb "I, II, III, IV..." + | isInfixOf lowerAlphaName s = makeNumb "a, b, c, d..." + | isInfixOf upperAlphaName s = makeNumb "A, B, C, D..." + | otherwise = empty + in inTags True "ParagraphStyle" ([("Self", "ParagraphStyle/"++s), ("Name", s)] ++ attrs') props + +-- | Convert a WriterState with its inline styles to the ICML listing of Character Styles. +charStylesToDoc :: WriterState -> Doc +charStylesToDoc st = vcat $ map makeStyle $ Set.toAscList $ inlineStyles st + where + makeStyle s = + let attrs = concat $ map (contains s) [ + (strikeoutName, ("StrikeThru", "true")) + , (superscriptName, ("Position", "Superscript")) + , (subscriptName, ("Position", "Subscript")) + , (smallCapsName, ("Capitalization", "SmallCaps")) + ] + attrs' | isInfixOf emphName s && isInfixOf strongName s = ("FontStyle", "Bold Italic") : attrs + | isInfixOf strongName s = ("FontStyle", "Bold") : attrs + | isInfixOf emphName s = ("FontStyle", "Italic") : attrs + | otherwise = attrs + props = inTags True "Properties" [] $ + inTags False "BasedOn" [("type", "object")] (text "$ID/NormalCharacterStyle") $$ font + where + font = + if isInfixOf codeName s + then monospacedFont + else empty + in inTags True "CharacterStyle" ([("Self", "CharacterStyle/"++s), ("Name", s)] ++ attrs') props + +-- | Convert a list of (identifier, url) pairs to the ICML listing of hyperlinks. +hyperlinksToDoc :: Hyperlink -> Doc +hyperlinksToDoc [] = empty +hyperlinksToDoc (x:xs) = hyp x $$ hyperlinksToDoc xs + where + hyp (ident, url) = hdest $$ hlink + where + hdest = selfClosingTag "HyperlinkURLDestination" + [("Self", "HyperlinkURLDestination/"++url), ("Name","link"), ("DestinationURL",url), ("DestinationUniqueKey","1")] + hlink = inTags True "Hyperlink" [("Self","uf-"++show ident), ("Name",url), + ("Source","htss-"++show ident), ("Visible","true"), ("DestinationUniqueKey","1")] + $ inTags True "Properties" [] + $ inTags False "BorderColor" [("type","enumeration")] (text "Black") + $$ (inTags False "Destination" [("type","object")] + $ text $ "HyperlinkURLDestination/"++(escapeStringForXML url)) + + +-- | Convert a list of Pandoc blocks to ICML. +blocksToICML :: WriterOptions -> Style -> [Block] -> WS Doc +blocksToICML opts style lst = vcat `fmap` mapM (blockToICML opts style) lst + +-- | Convert a Pandoc block element to ICML. +blockToICML :: WriterOptions -> Style -> Block -> WS Doc +blockToICML opts style (Plain lst) = parStyle opts style lst +blockToICML opts style (Para lst) = parStyle opts (paragraphName:style) lst +blockToICML opts style (CodeBlock _ str) = parStyle opts (codeBlockName:style) $ [Str str] +blockToICML opts style (RawBlock _ str) = parStyle opts (rawBlockName:style) $ [Str str] +blockToICML opts style (BlockQuote blocks) = blocksToICML opts (blockQuoteName:style) blocks +blockToICML opts style (OrderedList attribs lst) = listItemsToICML opts orderedListName style (Just attribs) lst +blockToICML opts style (BulletList lst) = listItemsToICML opts bulletListName style Nothing lst +blockToICML opts style (DefinitionList lst) = vcat `fmap` mapM (definitionListItemToICML opts style) lst +blockToICML opts style (Header lvl _ lst) = + let stl = (headerName ++ show lvl):style + in parStyle opts stl lst +blockToICML _ _ HorizontalRule = return empty -- we could insert a page break instead +blockToICML opts style (Table caption aligns widths headers rows) = + let style' = tableName : style + noHeader = all null headers + nrHeaders = if noHeader + then "0" + else "1" + nrRows = length rows + nrCols = if null rows + then 0 + else length $ head rows + rowsToICML [] _ = return empty + rowsToICML (col:rest) rowNr = + liftM2 ($$) (colsToICML col rowNr (0::Int)) $ rowsToICML rest (rowNr+1) + colsToICML [] _ _ = return empty + colsToICML (cell:rest) rowNr colNr = do + let stl = if rowNr == 0 && not noHeader + then tableHeaderName:style' + else style' + alig = aligns !! colNr + stl' | alig == AlignLeft = alignLeftName : stl + | alig == AlignRight = alignRightName : stl + | alig == AlignCenter = alignCenterName : stl + | otherwise = stl + c <- blocksToICML opts stl' cell + let cl = return $ inTags True "Cell" + [("Name", show colNr ++":"++ show rowNr), ("AppliedCellStyle","CellStyle/Cell")] c + liftM2 ($$) cl $ colsToICML rest rowNr (colNr+1) + in do + let tabl = if noHeader + then rows + else headers:rows + cells <- rowsToICML tabl (0::Int) + let colWidths w = if w > 0 + then [("SingleColumnWidth",show $ 500 * w)] + else [] + let tupToDoc tup = selfClosingTag "Column" $ [("Name",show $ fst tup)] ++ (colWidths $ snd tup) + let colDescs = vcat $ map tupToDoc $ zip [0..nrCols-1] widths + let tableDoc = return $ inTags True "Table" [ + ("AppliedTableStyle","TableStyle/Table") + , ("HeaderRowCount", nrHeaders) + , ("BodyRowCount", show nrRows) + , ("ColumnCount", show nrCols) + ] (colDescs $$ cells) + liftM2 ($$) tableDoc $ parStyle opts (tableCaptionName:style) caption +blockToICML opts style (Div _ lst) = blocksToICML opts style lst +blockToICML _ _ Null = return empty + +-- | Convert a list of lists of blocks to ICML list items. +listItemsToICML :: WriterOptions -> String -> Style -> Maybe ListAttributes -> [[Block]] -> WS Doc +listItemsToICML _ _ _ _ [] = return empty +listItemsToICML opts listType style attribs (first:rest) = do + st <- get + put st{ listDepth = 1 + listDepth st} + let stl = listType:style + let f = listItemToICML opts stl True attribs first + let r = map (listItemToICML opts stl False attribs) rest + docs <- sequence $ f:r + s <- get + let maxD = max (maxListDepth s) (listDepth s) + put s{ listDepth = 1, maxListDepth = maxD } + return $ vcat docs + +-- | Convert a list of blocks to ICML list items. +listItemToICML :: WriterOptions -> Style -> Bool-> Maybe ListAttributes -> [Block] -> WS Doc +listItemToICML opts style isFirst attribs item = + let makeNumbStart (Just (beginsWith, numbStl, _)) = + let doN DefaultStyle = [] + doN LowerRoman = [lowerRomanName] + doN UpperRoman = [upperRomanName] + doN LowerAlpha = [lowerAlphaName] + doN UpperAlpha = [upperAlphaName] + doN _ = [] + bw = if beginsWith > 1 + then [beginsWithName ++ show beginsWith] + else [] + in doN numbStl ++ bw + makeNumbStart Nothing = [] + stl = if isFirst + then firstListItemName:style + else style + stl' = makeNumbStart attribs ++ stl + in if length item > 1 + then do + let insertTab (Para lst) = blockToICML opts (subListParName:style) $ Para $ (Str "\t"):lst + insertTab block = blockToICML opts style block + f <- blockToICML opts stl' $ head item + r <- fmap vcat $ mapM insertTab $ tail item + return $ f $$ r + else blocksToICML opts stl' item + +definitionListItemToICML :: WriterOptions -> Style -> ([Inline],[[Block]]) -> WS Doc +definitionListItemToICML opts style (term,defs) = do + term' <- parStyle opts (defListTermName:style) term + defs' <- vcat `fmap` mapM (blocksToICML opts (defListDefName:style)) defs + return $ term' $$ defs' + + +-- | Convert a list of inline elements to ICML. +inlinesToICML :: WriterOptions -> Style -> [Inline] -> WS Doc +inlinesToICML opts style lst = vcat `fmap` mapM (inlineToICML opts style) (mergeSpaces lst) + +-- | Convert an inline element to ICML. +inlineToICML :: WriterOptions -> Style -> Inline -> WS Doc +inlineToICML _ style (Str str) = charStyle style $ text $ escapeStringForXML str +inlineToICML opts style (Emph lst) = inlinesToICML opts (emphName:style) lst +inlineToICML opts style (Strong lst) = inlinesToICML opts (strongName:style) lst +inlineToICML opts style (Strikeout lst) = inlinesToICML opts (strikeoutName:style) lst +inlineToICML opts style (Superscript lst) = inlinesToICML opts (superscriptName:style) lst +inlineToICML opts style (Subscript lst) = inlinesToICML opts (subscriptName:style) lst +inlineToICML opts style (SmallCaps lst) = inlinesToICML opts (smallCapsName:style) lst +inlineToICML opts style (Quoted SingleQuote lst) = inlinesToICML opts style $ [Str "‘"] ++ lst ++ [Str "’"] +inlineToICML opts style (Quoted DoubleQuote lst) = inlinesToICML opts style $ [Str "“"] ++ lst ++ [Str "”"] +inlineToICML opts style (Cite _ lst) = footnoteToICML opts style [Para lst] +inlineToICML _ style (Code _ str) = charStyle (codeName:style) $ text $ escapeStringForXML str +inlineToICML _ style Space = charStyle style space +inlineToICML _ style LineBreak = charStyle style $ text lineSeparator +inlineToICML _ style (Math _ str) = charStyle style $ text $ escapeStringForXML str --InDesign doesn't really do math +inlineToICML _ style (RawInline _ str) = charStyle style $ text $ escapeStringForXML str +inlineToICML opts style (Link lst (url, title)) = do + content <- inlinesToICML opts (linkName:style) lst + state $ \st -> + let ident = if null $ links st + then 1::Int + else 1 + (fst $ head $ links st) + newst = st{ links = (ident, url):(links st) } + cont = inTags True "HyperlinkTextSource" + [("Self","htss-"++show ident), ("Name",title), ("Hidden","false")] content + in (cont, newst) +inlineToICML opts style (Image alt target) = imageICML opts style alt target +inlineToICML opts style (Note lst) = footnoteToICML opts style lst +inlineToICML opts style (Span _ lst) = inlinesToICML opts style lst + +-- | Convert a list of block elements to an ICML footnote. +footnoteToICML :: WriterOptions -> Style -> [Block] -> WS Doc +footnoteToICML opts style lst = + let insertTab (Para ls) = blockToICML opts (footnoteName:style) $ Para $ (Str "\t"):ls + insertTab block = blockToICML opts (footnoteName:style) block + in do + contents <- mapM insertTab lst + let number = inTags True "ParagraphStyleRange" [] $ + inTags True "CharacterStyleRange" [] $ inTagsSimple "Content" "<?ACE 4?>" + return $ inTags True "CharacterStyleRange" + [("AppliedCharacterStyle","$ID/NormalCharacterStyle"), ("Position","Superscript")] + $ inTags True "Footnote" [] $ number $$ vcat contents + +-- | Auxiliary function to merge Space elements into the adjacent Strs. +mergeSpaces :: [Inline] -> [Inline] +mergeSpaces ((Str s):(Space:((Str s'):xs))) = mergeSpaces $ Str(s++" "++s') : xs +mergeSpaces (Space:((Str s):xs)) = mergeSpaces $ Str (" "++s) : xs +mergeSpaces ((Str s):(Space:xs)) = mergeSpaces $ Str (s++" ") : xs +mergeSpaces (x:xs) = x : (mergeSpaces xs) +mergeSpaces [] = [] + +-- | Wrap a list of inline elements in an ICML Paragraph Style +parStyle :: WriterOptions -> Style -> [Inline] -> WS Doc +parStyle opts style lst = + let slipIn x y = if null y + then x + else x ++ " > " ++ y + stlStr = foldr slipIn [] $ reverse style + stl = if null stlStr + then "" + else "ParagraphStyle/" ++ stlStr + attrs = ("AppliedParagraphStyle", stl) + attrs' = if firstListItemName `elem` style + then let ats = attrs : [("NumberingContinue", "false")] + begins = filter (isPrefixOf beginsWithName) style + in if null begins + then ats + else let i = maybe "" id $ stripPrefix beginsWithName $ head begins + in ("NumberingStartAt", i) : ats + else [attrs] + in do + content <- inlinesToICML opts [] lst + let cont = inTags True "ParagraphStyleRange" attrs' + $ mappend content $ selfClosingTag "Br" [] + state $ \st -> (cont, st{ blockStyles = Set.insert stlStr $ blockStyles st }) + +-- | Wrap a Doc in an ICML Character Style. +charStyle :: Style -> Doc -> WS Doc +charStyle style content = + let (stlStr, attrs) = styleToStrAttr style + doc = inTags True "CharacterStyleRange" attrs $ inTagsSimple "Content" $ flush content + in do + state $ \st -> + let styles = if null stlStr + then st + else st{ inlineStyles = Set.insert stlStr $ inlineStyles st } + in (doc, styles) + +-- | Transform a Style to a tuple of String (eliminating duplicates and ordered) and corresponding attribute. +styleToStrAttr :: Style -> (String, [(String, String)]) +styleToStrAttr style = + let stlStr = unwords $ Set.toAscList $ Set.fromList style + stl = if null style + then "$ID/NormalCharacterStyle" + else "CharacterStyle/" ++ stlStr + attrs = [("AppliedCharacterStyle", stl)] + in (stlStr, attrs) + +-- | Assemble an ICML Image. +imageICML :: WriterOptions -> Style -> [Inline] -> Target -> WS Doc +imageICML _ style _ (linkURI, _) = + let imgWidth = 300::Int --TODO: set width, height dynamically as in Docx.hs + imgHeight = 200::Int + scaleFact = show (1::Double) --TODO: set scaling factor so image is scaled exactly to imgWidth x imgHeight + hw = show $ imgWidth `div` 2 + hh = show $ imgHeight `div` 2 + qw = show $ imgWidth `div` 4 + qh = show $ imgHeight `div` 4 + (stlStr, attrs) = styleToStrAttr style + props = inTags True "Properties" [] $ inTags True "PathGeometry" [] + $ inTags True "GeometryPathType" [("PathOpen","false")] + $ inTags True "PathPointArray" [] + $ vcat [ + selfClosingTag "PathPointType" [("Anchor", "-"++qw++" -"++qh), + ("LeftDirection", "-"++qw++" -"++qh), ("RightDirection", "-"++qw++" -"++qh)] + , selfClosingTag "PathPointType" [("Anchor", "-"++qw++" "++qh), + ("LeftDirection", "-"++qw++" "++qh), ("RightDirection", "-"++qw++" "++qh)] + , selfClosingTag "PathPointType" [("Anchor", qw++" "++qh), + ("LeftDirection", qw++" "++qh), ("RightDirection", qw++" "++qh)] + , selfClosingTag "PathPointType" [("Anchor", qw++" -"++qh), + ("LeftDirection", qw++" -"++qh), ("RightDirection", qw++" -"++qh)] + ] + image = inTags True "Image" + [("Self","ue6"), ("ItemTransform", scaleFact++" 0 0 "++scaleFact++" -"++qw++" -"++qh)] + $ vcat [ + inTags True "Properties" [] $ inTags True "Profile" [("type","string")] $ text "$ID/Embedded" + $$ selfClosingTag "GraphicBounds" [("Left","0"), ("Top","0"), ("Right", hw), ("Bottom", hh)] + , selfClosingTag "Link" [("Self", "ueb"), ("LinkResourceURI", linkURI)] + ] + doc = inTags True "CharacterStyleRange" attrs + $ inTags True "Rectangle" [("Self","uec"), ("ItemTransform", "1 0 0 1 "++qw++" -"++qh)] + $ (props $$ image) + in do + state $ \st -> (doc, st{ inlineStyles = Set.insert stlStr $ inlineStyles st } ) diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs index 2b4a608a7..100bf900d 100644 --- a/src/Text/Pandoc/Writers/LaTeX.hs +++ b/src/Text/Pandoc/Writers/LaTeX.hs @@ -1,6 +1,6 @@ {-# LANGUAGE OverloadedStrings, ScopedTypeVariables #-} {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.LaTeX - Copyright : Copyright (C) 2006-2010 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -30,20 +30,20 @@ Conversion of 'Pandoc' format into LaTeX. -} module Text.Pandoc.Writers.LaTeX ( writeLaTeX ) where import Text.Pandoc.Definition -import Text.Pandoc.Generic +import Text.Pandoc.Walk import Text.Pandoc.Shared import Text.Pandoc.Writers.Shared import Text.Pandoc.Options import Text.Pandoc.Templates import Text.Printf ( printf ) -import Network.URI ( isAbsoluteURI, unEscapeString ) +import Network.URI ( isURI, unEscapeString ) import Data.List ( (\\), isSuffixOf, isInfixOf, isPrefixOf, intercalate, intersperse ) -import Data.Char ( toLower, isPunctuation ) +import Data.Char ( toLower, isPunctuation, isAscii, isLetter, isDigit, ord ) +import Data.Maybe ( fromMaybe ) import Control.Applicative ((<|>)) import Control.Monad.State import Text.Pandoc.Pretty -import System.FilePath (dropExtension) import Text.Pandoc.Slides import Text.Pandoc.Highlighting (highlight, styleToLaTeX, formatLaTeXInline, formatLaTeXBlock, @@ -51,6 +51,9 @@ import Text.Pandoc.Highlighting (highlight, styleToLaTeX, data WriterState = WriterState { stInNote :: Bool -- true if we're in a note + , stInQuote :: Bool -- true if in a blockquote + , stInMinipage :: Bool -- true if in minipage + , stNotes :: [Doc] -- notes in a minipage , stOLLevel :: Int -- level of ordered list nesting , stOptions :: WriterOptions -- writer options, so they don't have to be parameter , stVerbInNote :: Bool -- true if document has verbatim text in note @@ -71,7 +74,8 @@ data WriterState = writeLaTeX :: WriterOptions -> Pandoc -> String writeLaTeX options document = evalState (pandocToLaTeX options document) $ - WriterState { stInNote = False, + WriterState { stInNote = False, stInQuote = False, + stInMinipage = False, stNotes = [], stOLLevel = 1, stOptions = options, stVerbInNote = False, stTable = False, stStrikeout = False, @@ -83,10 +87,17 @@ writeLaTeX options document = pandocToLaTeX :: WriterOptions -> Pandoc -> State WriterState String pandocToLaTeX options (Pandoc meta blocks) = do + -- Strip off final 'references' header if --natbib or --biblatex + let method = writerCiteMethod options + let blocks' = if method == Biblatex || method == Natbib + then case reverse blocks of + (Div (_,["references"],_) _):xs -> reverse xs + _ -> blocks + else blocks -- see if there are internal links let isInternalLink (Link _ ('#':xs,_)) = [xs] isInternalLink _ = [] - modify $ \s -> s{ stInternalLinks = queryWith isInternalLink blocks } + modify $ \s -> s{ stInternalLinks = query isInternalLink blocks' } let template = writerTemplate options -- set stBook depending on documentclass let bookClasses = ["memoir","book","report","scrreprt","scrbook"] @@ -108,30 +119,31 @@ pandocToLaTeX options (Pandoc meta blocks) = do (fmap (render colwidth) . blockListToLaTeX) (fmap (render colwidth) . inlineListToLaTeX) meta - let (blocks', lastHeader) = if writerCiteMethod options == Citeproc then - (blocks, []) - else case last blocks of - Header 1 _ il -> (init blocks, il) - _ -> (blocks, []) - blocks'' <- if writerBeamer options - then toSlides blocks' - else return blocks' - body <- mapM (elementToLaTeX options) $ hierarchicalize blocks'' + let (blocks'', lastHeader) = if writerCiteMethod options == Citeproc then + (blocks', []) + else case last blocks' of + Header 1 _ il -> (init blocks', il) + _ -> (blocks', []) + blocks''' <- if writerBeamer options + then toSlides blocks'' + else return blocks'' + body <- mapM (elementToLaTeX options) $ hierarchicalize blocks''' (biblioTitle :: String) <- liftM (render colwidth) $ inlineListToLaTeX lastHeader let main = render colwidth $ vsep body st <- get - let biblioFiles = intercalate "," $ map dropExtension $ writerBiblioFiles options + titleMeta <- stringToLaTeX TextString $ stringify $ docTitle meta + authorsMeta <- mapM (stringToLaTeX TextString . stringify) $ docAuthors meta let context = defField "toc" (writerTableOfContents options) $ defField "toc-depth" (show (writerTOCDepth options - - if writerChapters options + if stBook st then 1 else 0)) $ defField "body" main $ - defField "title-meta" (stringify $ docTitle meta) $ - defField "author-meta" (intercalate "; " $ map stringify $ docAuthors meta) $ + defField "title-meta" titleMeta $ + defField "author-meta" (intercalate "; " authorsMeta) $ defField "documentclass" (if writerBeamer options then ("beamer" :: String) - else if writerChapters options + else if stBook st then "book" else "article") $ defField "verbatim-in-note" (stVerbInNote st) $ @@ -152,11 +164,9 @@ pandocToLaTeX options (Pandoc meta blocks) = do $ writerHighlightStyle options ) else id) $ (case writerCiteMethod options of - Natbib -> defField "biblio-files" biblioFiles . - defField "biblio-title" biblioTitle . + Natbib -> defField "biblio-title" biblioTitle . defField "natbib" True - Biblatex -> defField "biblio-files" biblioFiles . - defField "biblio-title" biblioTitle . + Biblatex -> defField "biblio-title" biblioTitle . defField "biblatex" True _ -> id) $ metadata @@ -183,7 +193,7 @@ stringToLaTeX _ [] = return "" stringToLaTeX ctx (x:xs) = do opts <- gets stOptions rest <- stringToLaTeX ctx xs - let ligatures = writerTeXLigatures opts && not (ctx == CodeString) + let ligatures = writerTeXLigatures opts && ctx == TextString let isUrl = ctx == URLString when (x == '€') $ modify $ \st -> st{ stUsesEuro = True } @@ -197,17 +207,20 @@ stringToLaTeX ctx (x:xs) = do '&' -> "\\&" ++ rest '_' | not isUrl -> "\\_" ++ rest '#' -> "\\#" ++ rest - '-' -> case xs of -- prevent adjacent hyphens from forming ligatures - ('-':_) -> "-{}" ++ rest + '-' | not isUrl -> case xs of + -- prevent adjacent hyphens from forming ligatures + ('-':_) -> "-\\/" ++ rest _ -> '-' : rest '~' | not isUrl -> "\\textasciitilde{}" ++ rest '^' -> "\\^{}" ++ rest - '\\' -> "\\textbackslash{}" ++ rest + '\\'| isUrl -> '/' : rest -- NB. / works as path sep even on Windows + | otherwise -> "\\textbackslash{}" ++ rest '|' -> "\\textbar{}" ++ rest '<' -> "\\textless{}" ++ rest '>' -> "\\textgreater{}" ++ rest '[' -> "{[}" ++ rest -- to avoid interpretation as ']' -> "{]}" ++ rest -- optional arguments + '\'' | ctx == CodeString -> "\\textquotesingle{}" ++ rest '\160' -> "~" ++ rest '\x2026' -> "\\ldots{}" ++ rest '\x2018' | ligatures -> "`" ++ rest @@ -218,6 +231,14 @@ stringToLaTeX ctx (x:xs) = do '\x2013' | ligatures -> "--" ++ rest _ -> x : rest +toLabel :: String -> State WriterState String +toLabel z = go `fmap` stringToLaTeX URLString z + where go [] = "" + go (x:xs) + | (isLetter x || isDigit x) && isAscii x = x:go xs + | elem x "-+=:;." = x:go xs + | otherwise = "ux" ++ printf "%x" (ord x) ++ go xs + -- | Puts contents into LaTeX command. inCmd :: String -> Doc -> Doc inCmd cmd contents = char '\\' <> text cmd <> braces contents @@ -225,13 +246,13 @@ inCmd cmd contents = char '\\' <> text cmd <> braces contents toSlides :: [Block] -> State WriterState [Block] toSlides bs = do opts <- gets stOptions - let slideLevel = maybe (getSlideLevel bs) id $ writerSlideLevel opts + let slideLevel = fromMaybe (getSlideLevel bs) $ writerSlideLevel opts let bs' = prepSlides slideLevel bs concat `fmap` (mapM (elementToBeamer slideLevel) $ hierarchicalize bs') elementToBeamer :: Int -> Element -> State WriterState [Block] elementToBeamer _slideLevel (Blk b) = return [b] -elementToBeamer slideLevel (Sec lvl _num (ident,classes,_) tit elts) +elementToBeamer slideLevel (Sec lvl _num (ident,classes,kvs) tit elts) | lvl > slideLevel = do bs <- concat `fmap` mapM (elementToBeamer slideLevel) elts return $ Para ( RawInline "latex" "\\begin{block}{" @@ -239,7 +260,7 @@ elementToBeamer slideLevel (Sec lvl _num (ident,classes,_) tit elts) : bs ++ [RawBlock "latex" "\\end{block}"] | lvl < slideLevel = do bs <- concat `fmap` mapM (elementToBeamer slideLevel) elts - return $ (Header lvl (ident,classes,[]) tit) : bs + return $ (Header lvl (ident,classes,kvs) tit) : bs | otherwise = do -- lvl == slideLevel -- note: [fragile] is required or verbatim breaks let hasCodeBlock (CodeBlock _ _) = [True] @@ -247,17 +268,20 @@ elementToBeamer slideLevel (Sec lvl _num (ident,classes,_) tit elts) let hasCode (Code _ _) = [True] hasCode _ = [] opts <- gets stOptions - let fragile = if not $ null $ queryWith hasCodeBlock elts ++ + let fragile = not $ null $ query hasCodeBlock elts ++ if writerListings opts - then queryWith hasCode elts + then query hasCode elts else [] - then "[fragile]" - else "" - let slideStart = Para $ RawInline "latex" ("\\begin{frame}" ++ fragile) : + let allowframebreaks = "allowframebreaks" `elem` classes + let optionslist = ["fragile" | fragile] ++ + ["allowframebreaks" | allowframebreaks] + let options = if null optionslist + then "" + else "[" ++ intercalate "," optionslist ++ "]" + let slideStart = Para $ RawInline "latex" ("\\begin{frame}" ++ options) : if tit == [Str "\0"] -- marker for hrule then [] - else (RawInline "latex" "\\frametitle{") : tit ++ - [RawInline "latex" "}"] + else (RawInline "latex" "{") : tit ++ [RawInline "latex" "}"] let slideEnd = RawBlock "latex" "\\end{frame}" -- now carve up slide into blocks if there are sections inside bs <- concat `fmap` mapM (elementToBeamer slideLevel) elts @@ -278,16 +302,24 @@ isLineBreakOrSpace _ = False blockToLaTeX :: Block -- ^ Block to convert -> State WriterState Doc blockToLaTeX Null = return empty +blockToLaTeX (Div (_,classes,_) bs) = do + beamer <- writerBeamer `fmap` gets stOptions + contents <- blockListToLaTeX bs + if beamer && "notes" `elem` classes -- speaker notes + then return $ "\\note" <> braces contents + else return contents blockToLaTeX (Plain lst) = inlineListToLaTeX $ dropWhile isLineBreakOrSpace lst -- title beginning with fig: indicates that the image is a figure blockToLaTeX (Para [Image txt (src,'f':'i':'g':':':tit)]) = do - capt <- if null txt - then return empty - else (\c -> "\\caption" <> braces c) `fmap` inlineListToLaTeX txt + inNote <- gets stInNote + capt <- inlineListToLaTeX txt img <- inlineToLaTeX (Image txt (src,tit)) - return $ "\\begin{figure}[htbp]" $$ "\\centering" $$ img $$ - capt $$ "\\end{figure}" + return $ if inNote + -- can't have figures in notes + then "\\begin{center}" $$ img $+$ capt $$ "\\end{center}" + else "\\begin{figure}[htbp]" $$ "\\centering" $$ img $$ + ("\\caption" <> braces capt) $$ "\\end{figure}" -- . . . indicates pause in beamer slides blockToLaTeX (Para [Str ".",Space,Str ".",Space,Str "."]) = do beamer <- writerBeamer `fmap` gets stOptions @@ -306,53 +338,68 @@ blockToLaTeX (BlockQuote lst) = do modify $ \s -> s{ stIncremental = oldIncremental } return result _ -> do + oldInQuote <- gets stInQuote + modify (\s -> s{stInQuote = True}) contents <- blockListToLaTeX lst + modify (\s -> s{stInQuote = oldInQuote}) return $ "\\begin{quote}" $$ contents $$ "\\end{quote}" -blockToLaTeX (CodeBlock (_,classes,keyvalAttr) str) = do +blockToLaTeX (CodeBlock (identifier,classes,keyvalAttr) str) = do opts <- gets stOptions + ref <- toLabel identifier + let linkAnchor = if null identifier + then empty + else "\\hyperdef{}" <> braces (text ref) <> + braces ("\\label" <> braces (text ref)) + let lhsCodeBlock = do + modify $ \s -> s{ stLHS = True } + return $ flush (linkAnchor $$ "\\begin{code}" $$ text str $$ + "\\end{code}") $$ cr + let rawCodeBlock = do + st <- get + env <- if stInNote st + then modify (\s -> s{ stVerbInNote = True }) >> + return "Verbatim" + else return "verbatim" + return $ flush (linkAnchor $$ text ("\\begin{" ++ env ++ "}") $$ + text str $$ text ("\\end{" ++ env ++ "}")) <> cr + let listingsCodeBlock = do + st <- get + let params = if writerListings (stOptions st) + then (case getListingsLanguage classes of + Just l -> [ "language=" ++ l ] + Nothing -> []) ++ + [ "numbers=left" | "numberLines" `elem` classes + || "number" `elem` classes + || "number-lines" `elem` classes ] ++ + [ (if key == "startFrom" + then "firstnumber" + else key) ++ "=" ++ attr | + (key,attr) <- keyvalAttr ] ++ + (if identifier == "" + then [] + else [ "label=" ++ ref ]) + + else [] + printParams + | null params = empty + | otherwise = brackets $ hcat (intersperse ", " (map text params)) + return $ flush ("\\begin{lstlisting}" <> printParams $$ text str $$ + "\\end{lstlisting}") $$ cr + let highlightedCodeBlock = + case highlight formatLaTeXBlock ("",classes,keyvalAttr) str of + Nothing -> rawCodeBlock + Just h -> modify (\st -> st{ stHighlighting = True }) >> + return (flush $ linkAnchor $$ text h) case () of _ | isEnabled Ext_literate_haskell opts && "haskell" `elem` classes && "literate" `elem` classes -> lhsCodeBlock | writerListings opts -> listingsCodeBlock | writerHighlight opts && not (null classes) -> highlightedCodeBlock | otherwise -> rawCodeBlock - where lhsCodeBlock = do - modify $ \s -> s{ stLHS = True } - return $ flush ("\\begin{code}" $$ text str $$ "\\end{code}") $$ cr - rawCodeBlock = do - st <- get - env <- if stInNote st - then modify (\s -> s{ stVerbInNote = True }) >> - return "Verbatim" - else return "verbatim" - return $ flush (text ("\\begin{" ++ env ++ "}") $$ text str $$ - text ("\\end{" ++ env ++ "}")) <> cr - listingsCodeBlock = do - st <- get - let params = if writerListings (stOptions st) - then (case getListingsLanguage classes of - Just l -> [ "language=" ++ l ] - Nothing -> []) ++ - [ "numbers=left" | "numberLines" `elem` classes - || "number" `elem` classes - || "number-lines" `elem` classes ] ++ - [ (if key == "startFrom" - then "firstnumber" - else key) ++ "=" ++ attr | - (key,attr) <- keyvalAttr ] - else [] - printParams - | null params = empty - | otherwise = brackets $ hcat (intersperse ", " (map text params)) - return $ flush ("\\begin{lstlisting}" <> printParams $$ text str $$ - "\\end{lstlisting}") $$ cr - highlightedCodeBlock = - case highlight formatLaTeXBlock ("",classes,keyvalAttr) str of - Nothing -> rawCodeBlock - Just h -> modify (\st -> st{ stHighlighting = True }) >> - return (flush $ text h) -blockToLaTeX (RawBlock "latex" x) = return $ text x -blockToLaTeX (RawBlock _ _) = return empty +blockToLaTeX (RawBlock f x) + | f == Format "latex" || f == Format "tex" + = return $ text x + | otherwise = return empty blockToLaTeX (BulletList []) = return empty -- otherwise latex error blockToLaTeX (BulletList lst) = do incremental <- gets stIncremental @@ -407,7 +454,7 @@ blockToLaTeX (DefinitionList lst) = do incremental <- gets stIncremental let inc = if incremental then "[<+->]" else "" items <- mapM defListItemToLaTeX lst - let spacing = if and $ map isTightList (map snd lst) + let spacing = if all isTightList (map snd lst) then text "\\itemsep1pt\\parskip0pt\\parsep0pt" else empty return $ text ("\\begin{description}" ++ inc) $$ spacing $$ vcat items $$ @@ -419,12 +466,12 @@ blockToLaTeX (Header level (id',classes,_) lst) = blockToLaTeX (Table caption aligns widths heads rows) = do headers <- if all null heads then return empty - else ($$ "\\hline\\noalign{\\medskip}") `fmap` + else ($$ "\\midrule\\endhead") `fmap` (tableRowToLaTeX True aligns widths) heads captionText <- inlineListToLaTeX caption let capt = if isEmpty captionText then empty - else text "\\noalign{\\medskip}" + else text "\\addlinespace" $$ text "\\caption" <> braces captionText rows' <- mapM (tableRowToLaTeX False aligns widths) rows let colDescriptors = text $ concat $ map toColDescriptor aligns @@ -432,10 +479,10 @@ blockToLaTeX (Table caption aligns widths heads rows) = do return $ "\\begin{longtable}[c]" <> braces ("@{}" <> colDescriptors <> "@{}") -- the @{} removes extra space at beginning and end - $$ "\\hline\\noalign{\\medskip}" + $$ "\\toprule\\addlinespace" $$ headers $$ vcat rows' - $$ "\\hline" + $$ "\\bottomrule" $$ capt $$ "\\end{longtable}" @@ -456,19 +503,61 @@ tableRowToLaTeX :: Bool -> [[Block]] -> State WriterState Doc tableRowToLaTeX header aligns widths cols = do - renderedCells <- mapM blockListToLaTeX cols + -- scale factor compensates for extra space between columns + -- so the whole table isn't larger than columnwidth + let scaleFactor = 0.97 ** fromIntegral (length aligns) + let widths' = map (scaleFactor *) widths + cells <- mapM (tableCellToLaTeX header) $ zip3 widths' aligns cols + return $ hsep (intersperse "&" cells) $$ "\\\\\\addlinespace" + +-- For simple latex tables (without minipages or parboxes), +-- we need to go to some lengths to get line breaks working: +-- as LineBreak bs = \vtop{\hbox{\strut as}\hbox{\strut bs}}. +fixLineBreaks :: Block -> Block +fixLineBreaks (Para ils) = Para $ fixLineBreaks' ils +fixLineBreaks (Plain ils) = Plain $ fixLineBreaks' ils +fixLineBreaks x = x + +fixLineBreaks' :: [Inline] -> [Inline] +fixLineBreaks' ils = case splitBy (== LineBreak) ils of + [] -> [] + [xs] -> xs + chunks -> RawInline "tex" "\\vtop{" : + concatMap tohbox chunks ++ + [RawInline "tex" "}"] + where tohbox ys = RawInline "tex" "\\hbox{\\strut " : ys ++ + [RawInline "tex" "}"] + +tableCellToLaTeX :: Bool -> (Double, Alignment, [Block]) + -> State WriterState Doc +tableCellToLaTeX _ (0, _, blocks) = + blockListToLaTeX $ walk fixLineBreaks blocks +tableCellToLaTeX header (width, align, blocks) = do + modify $ \st -> st{ stInMinipage = True, stNotes = [] } + cellContents <- blockListToLaTeX blocks + notes <- gets stNotes + modify $ \st -> st{ stInMinipage = False, stNotes = [] } let valign = text $ if header then "[b]" else "[t]" - let halign x = case x of - AlignLeft -> "\\raggedright" - AlignRight -> "\\raggedleft" - AlignCenter -> "\\centering" - AlignDefault -> "\\raggedright" - let toCell 0 _ c = c - toCell w a c = "\\begin{minipage}" <> valign <> - braces (text (printf "%.2f\\columnwidth" w)) <> - (halign a <> cr <> c <> cr) <> "\\end{minipage}" - let cells = zipWith3 toCell widths aligns renderedCells - return $ hsep (intersperse "&" cells) $$ "\\\\\\noalign{\\medskip}" + let halign = case align of + AlignLeft -> "\\raggedright" + AlignRight -> "\\raggedleft" + AlignCenter -> "\\centering" + AlignDefault -> "\\raggedright" + return $ ("\\begin{minipage}" <> valign <> + braces (text (printf "%.2f\\columnwidth" width)) <> + (halign <> cr <> cellContents <> cr) <> "\\end{minipage}") + $$ case notes of + [] -> empty + ns -> (case length ns of + n | n > 1 -> "\\addtocounter" <> + braces "footnote" <> + braces (text $ show $ 1 - n) + | otherwise -> empty) + $$ + vcat (intersperse + ("\\addtocounter" <> braces "footnote" <> braces "1") + $ map (\x -> "\\footnotetext" <> braces x) + $ reverse ns) listItemToLaTeX :: [Block] -> State WriterState Doc listItemToLaTeX lst = blockListToLaTeX lst >>= return . (text "\\item" $$) . @@ -477,8 +566,15 @@ listItemToLaTeX lst = blockListToLaTeX lst >>= return . (text "\\item" $$) . defListItemToLaTeX :: ([Inline], [[Block]]) -> State WriterState Doc defListItemToLaTeX (term, defs) = do term' <- inlineListToLaTeX term + -- put braces around term if it contains an internal link, + -- since otherwise we get bad bracket interactions: \item[\hyperref[..] + let isInternalLink (Link _ ('#':_,_)) = True + isInternalLink _ = False + let term'' = if any isInternalLink term + then braces term' + else term' def' <- liftM vsep $ mapM blockListToLaTeX defs - return $ "\\item" <> brackets term' $$ def' + return $ "\\item" <> brackets term'' $$ def' -- | Craft the section header, inserting the secton reference, if supplied. sectionHeader :: Bool -- True for unnumbered @@ -488,17 +584,19 @@ sectionHeader :: Bool -- True for unnumbered -> State WriterState Doc sectionHeader unnumbered ref level lst = do txt <- inlineListToLaTeX lst + lab <- text `fmap` toLabel ref let noNote (Note _) = Str "" noNote x = x - let lstNoNotes = bottomUp noNote lst + let lstNoNotes = walk noNote lst + txtNoNotes <- inlineListToLaTeX lstNoNotes let star = if unnumbered then text "*" else empty - -- footnotes in sections don't work unless you specify an optional - -- argument: \section[mysec]{mysec\footnote{blah}} - optional <- if lstNoNotes == lst + -- footnotes in sections don't work (except for starred variants) + -- unless you specify an optional argument: + -- \section[mysec]{mysec\footnote{blah}} + optional <- if unnumbered || lstNoNotes == lst then return empty else do - res <- inlineListToLaTeX lstNoNotes - return $ char '[' <> res <> char ']' + return $ brackets txtNoNotes let stuffing = star <> optional <> braces txt book <- gets stBook opts <- gets stOptions @@ -507,13 +605,13 @@ sectionHeader unnumbered ref level lst = do let refLabel x = (if ref `elem` internalLinks then text "\\hyperdef" <> braces empty - <> braces (text ref) + <> braces lab <> braces x else x) - let headerWith x y r = refLabel $ text x <> y <> - if null r + let headerWith x y = refLabel $ text x <> y <> + if null ref then empty - else text "\\label" <> braces (text r) + else text "\\label" <> braces lab let sectionType = case level' of 0 | writerBeamer opts -> "part" | otherwise -> "chapter" @@ -523,13 +621,20 @@ sectionHeader unnumbered ref level lst = do 4 -> "paragraph" 5 -> "subparagraph" _ -> "" + inQuote <- gets stInQuote + let prefix = if inQuote && level' >= 4 + then text "\\mbox{}%" + -- needed for \paragraph, \subparagraph in quote environment + -- see http://tex.stackexchange.com/questions/169830/ + else empty return $ if level' > 5 then txt - else headerWith ('\\':sectionType) stuffing ref + else prefix $$ + headerWith ('\\':sectionType) stuffing $$ if unnumbered then "\\addcontentsline{toc}" <> braces (text sectionType) <> - braces txt + braces txtNoNotes else empty -- | Convert list of inline elements to LaTeX. @@ -556,12 +661,29 @@ isQuoted _ = False -- | Convert inline element to LaTeX inlineToLaTeX :: Inline -- ^ Inline to convert -> State WriterState Doc +inlineToLaTeX (Span (id',classes,_) ils) = do + let noEmph = "csl-no-emph" `elem` classes + let noStrong = "csl-no-strong" `elem` classes + let noSmallCaps = "csl-no-smallcaps" `elem` classes + label' <- if null id' + then return empty + else toLabel id' >>= \x -> + return (text "\\label" <> braces (text x)) + fmap (label' <>) + ((if noEmph then inCmd "textup" else id) . + (if noStrong then inCmd "textnormal" else id) . + (if noSmallCaps then inCmd "textnormal" else id) . + (if not (noEmph || noStrong || noSmallCaps) + then braces + else id)) `fmap` inlineListToLaTeX ils inlineToLaTeX (Emph lst) = inlineListToLaTeX lst >>= return . inCmd "emph" inlineToLaTeX (Strong lst) = inlineListToLaTeX lst >>= return . inCmd "textbf" inlineToLaTeX (Strikeout lst) = do - contents <- inlineListToLaTeX lst + -- we need to protect VERB in an mbox or we get an error + -- see #1294 + contents <- inlineListToLaTeX $ protectCode lst modify $ \s -> s{ stStrikeout = True } return $ inCmd "sout" contents inlineToLaTeX (Superscript lst) = @@ -624,15 +746,16 @@ inlineToLaTeX (Math InlineMath str) = return $ char '$' <> text str <> char '$' inlineToLaTeX (Math DisplayMath str) = return $ "\\[" <> text str <> "\\]" -inlineToLaTeX (RawInline "latex" str) = return $ text str -inlineToLaTeX (RawInline "tex" str) = return $ text str -inlineToLaTeX (RawInline _ _) = return empty +inlineToLaTeX (RawInline f str) + | f == Format "latex" || f == Format "tex" + = return $ text str + | otherwise = return empty inlineToLaTeX (LineBreak) = return "\\\\" inlineToLaTeX Space = return space inlineToLaTeX (Link txt ('#':ident, _)) = do contents <- inlineListToLaTeX txt - ident' <- stringToLaTeX URLString ident - return $ text "\\hyperref" <> brackets (text ident') <> braces contents + lab <- toLabel ident + return $ text "\\hyperref" <> brackets (text lab) <> braces contents inlineToLaTeX (Link txt (src, _)) = case txt of [Str x] | x == src -> -- autolink @@ -645,19 +768,33 @@ inlineToLaTeX (Link txt (src, _)) = contents <> char '}' inlineToLaTeX (Image _ (source, _)) = do modify $ \s -> s{ stGraphics = True } - let source' = if isAbsoluteURI source + let source' = if isURI source then source else unEscapeString source - return $ "\\includegraphics" <> braces (text source') + source'' <- stringToLaTeX URLString source' + return $ "\\includegraphics" <> braces (text source'') inlineToLaTeX (Note contents) = do + inMinipage <- gets stInMinipage modify (\s -> s{stInNote = True}) contents' <- blockListToLaTeX contents modify (\s -> s {stInNote = False}) let optnl = case reverse contents of (CodeBlock _ _ : _) -> cr _ -> empty - return $ "\\footnote" <> braces (nest 2 contents' <> optnl) - -- note: a \n before } needed when note ends with a Verbatim environment + let noteContents = nest 2 contents' <> optnl + modify $ \st -> st{ stNotes = noteContents : stNotes st } + return $ + if inMinipage + then "\\footnotemark{}" + -- note: a \n before } needed when note ends with a Verbatim environment + else "\\footnote" <> braces noteContents + +protectCode :: [Inline] -> [Inline] +protectCode [] = [] +protectCode (x@(Code ("",[],[]) _) : xs) = x : protectCode xs +protectCode (x@(Code _ _) : xs) = ltx "\\mbox{" : x : ltx "}" : xs + where ltx = RawInline (Format "latex") +protectCode (x : xs) = x : protectCode xs citationsToNatbib :: [Citation] -> State WriterState Doc citationsToNatbib (one:[]) @@ -678,9 +815,9 @@ citationsToNatbib cits | noPrefix (tail cits) && noSuffix (init cits) && ismode NormalCitation cits = citeCommand "citep" p s ks where - noPrefix = and . map (null . citationPrefix) - noSuffix = and . map (null . citationSuffix) - ismode m = and . map (((==) m) . citationMode) + noPrefix = all (null . citationPrefix) + noSuffix = all (null . citationSuffix) + ismode m = all (((==) m) . citationMode) p = citationPrefix $ head $ cits s = citationSuffix $ last $ cits ks = intercalate ", " $ map citationId cits diff --git a/src/Text/Pandoc/Writers/Man.hs b/src/Text/Pandoc/Writers/Man.hs index 0508b6c27..41eb3e5be 100644 --- a/src/Text/Pandoc/Writers/Man.hs +++ b/src/Text/Pandoc/Writers/Man.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2007-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2007-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.Man - Copyright : Copyright (C) 2007-2010 John MacFarlane + Copyright : Copyright (C) 2007-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -160,14 +160,16 @@ blockToMan :: WriterOptions -- ^ Options -> Block -- ^ Block element -> State WriterState Doc blockToMan _ Null = return empty +blockToMan opts (Div _ bs) = blockListToMan opts bs blockToMan opts (Plain inlines) = liftM vcat $ mapM (inlineListToMan opts) $ splitSentences inlines blockToMan opts (Para inlines) = do contents <- liftM vcat $ mapM (inlineListToMan opts) $ splitSentences inlines return $ text ".PP" $$ contents -blockToMan _ (RawBlock "man" str) = return $ text str -blockToMan _ (RawBlock _ _) = return empty +blockToMan _ (RawBlock f str) + | f == Format "man" = return $ text str + | otherwise = return empty blockToMan _ HorizontalRule = return $ text ".PP" $$ text " * * * * *" blockToMan opts (Header level _ inlines) = do contents <- inlineListToMan opts inlines @@ -281,7 +283,7 @@ definitionListItemToMan opts (label, defs) = do mapM (\item -> blockToMan opts item) rest first' <- blockToMan opts first return $ first' $$ text ".RS" $$ rest' $$ text ".RE" - return $ text ".TP" $$ text ".B " <> labelText $$ contents + return $ text ".TP" $$ nowrap (text ".B " <> labelText) $$ contents -- | Convert list of Pandoc block elements to man. blockListToMan :: WriterOptions -- ^ Options @@ -300,6 +302,7 @@ inlineListToMan opts lst = mapM (inlineToMan opts) lst >>= (return . hcat) -- | Convert Pandoc inline element to man. inlineToMan :: WriterOptions -> Inline -> State WriterState Doc +inlineToMan opts (Span _ ils) = inlineListToMan opts ils inlineToMan opts (Emph lst) = do contents <- inlineListToMan opts lst return $ text "\\f[I]" <> contents <> text "\\f[]" @@ -327,12 +330,14 @@ inlineToMan opts (Cite _ lst) = inlineToMan _ (Code _ str) = return $ text $ "\\f[C]" ++ escapeCode str ++ "\\f[]" inlineToMan _ (Str str) = return $ text $ escapeString str -inlineToMan opts (Math InlineMath str) = inlineListToMan opts $ readTeXMath str +inlineToMan opts (Math InlineMath str) = + inlineListToMan opts $ readTeXMath' InlineMath str inlineToMan opts (Math DisplayMath str) = do - contents <- inlineListToMan opts $ readTeXMath str + contents <- inlineListToMan opts $ readTeXMath' DisplayMath str return $ cr <> text ".RS" $$ contents $$ text ".RE" -inlineToMan _ (RawInline "man" str) = return $ text str -inlineToMan _ (RawInline _ _) = return empty +inlineToMan _ (RawInline f str) + | f == Format "man" = return $ text str + | otherwise = return empty inlineToMan _ (LineBreak) = return $ cr <> text ".PD 0" $$ text ".P" $$ text ".PD" <> cr inlineToMan _ Space = return space diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs index 80402a757..a67271a5d 100644 --- a/src/Text/Pandoc/Writers/Markdown.hs +++ b/src/Text/Pandoc/Writers/Markdown.hs @@ -1,6 +1,6 @@ {-# LANGUAGE OverloadedStrings, TupleSections, ScopedTypeVariables #-} {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.Markdown - Copyright : Copyright (C) 2006-2010 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -32,22 +32,22 @@ Markdown: <http://daringfireball.net/projects/markdown/> -} module Text.Pandoc.Writers.Markdown (writeMarkdown, writePlain) where import Text.Pandoc.Definition -import Text.Pandoc.Generic +import Text.Pandoc.Walk import Text.Pandoc.Templates (renderTemplate') import Text.Pandoc.Shared import Text.Pandoc.Writers.Shared import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (blankline, char, space) import Data.List ( group, isPrefixOf, find, intersperse, transpose, sortBy ) -import Data.Char ( isSpace ) +import Data.Char ( isSpace, isPunctuation ) import Data.Ord ( comparing ) import Text.Pandoc.Pretty import Control.Monad.State import qualified Data.Set as Set import Text.Pandoc.Writers.HTML (writeHtmlString) -import Text.Pandoc.Readers.TeXMath (readTeXMath) +import Text.Pandoc.Readers.TeXMath (readTeXMath') import Text.HTML.TagSoup (renderTags, parseTags, isTagText, Tag(..)) -import Network.URI (isAbsoluteURI) +import Network.URI (isURI) import Data.Default import Data.Yaml (Value(Object,String,Array,Bool,Number)) import qualified Data.HashMap.Strict as H @@ -82,7 +82,7 @@ writePlain opts document = where document' = plainify document plainify :: Pandoc -> Pandoc -plainify = bottomUp go +plainify = walk go where go :: Inline -> Inline go (Emph xs) = SmallCaps xs go (Strong xs) = SmallCaps xs @@ -143,7 +143,7 @@ jsonToYaml (Object hashmap) = | otherwise -> (k' <> ":") $$ x (k', Object _, x) -> (k' <> ":") $$ nest 2 x (_, String "", _) -> empty - (k', _, x) -> k' <> ":" <> space <> x) + (k', _, x) -> k' <> ":" <> space <> hang 2 "" x) $ sortBy (comparing fst) $ H.toList hashmap jsonToYaml (Array vec) = vcat $ map (\v -> hang 2 "- " (jsonToYaml v)) $ V.toList vec @@ -151,7 +151,7 @@ jsonToYaml (String "") = empty jsonToYaml (String s) = case T.unpack s of x | '\n' `elem` x -> hang 2 ("|" <> cr) $ text x - | not (any (`elem` x) "\"'#:[]{}?-") -> text x + | not (any isPunctuation x) -> text x | otherwise -> text $ "'" ++ substitute "'" "''" x ++ "'" jsonToYaml (Bool b) = text $ show b jsonToYaml (Number n) = text $ show n @@ -186,7 +186,13 @@ pandocToMarkdown opts (Pandoc meta blocks) = do let toc = if writerTableOfContents opts then tableOfContents opts headerBlocks else empty - body <- blockListToMarkdown opts blocks + -- Strip off final 'references' header if markdown citations enabled + let blocks' = if not isPlain && isEnabled Ext_citations opts + then case reverse blocks of + (Div (_,["references"],_) _):xs -> reverse xs + _ -> blocks + else blocks + body <- blockListToMarkdown opts blocks' st <- get notes' <- notesToMarkdown opts (reverse $ stNotes st) st' <- get -- note that the notes may contain refs @@ -301,22 +307,34 @@ blockToMarkdown :: WriterOptions -- ^ Options -> Block -- ^ Block element -> State WriterState Doc blockToMarkdown _ Null = return empty +blockToMarkdown opts (Div attrs ils) = do + isPlain <- gets stPlain + contents <- blockListToMarkdown opts ils + return $ if isPlain || not (isEnabled Ext_markdown_in_html_blocks opts) + then contents <> blankline + else tagWithAttrs "div" attrs <> blankline <> + contents <> blankline <> "</div>" <> blankline blockToMarkdown opts (Plain inlines) = do contents <- inlineListToMarkdown opts inlines - return $ contents <> cr + -- escape if para starts with ordered list marker + st <- get + let colwidth = if writerWrapText opts + then Just $ writerColumns opts + else Nothing + let rendered = render colwidth contents + let escapeDelimiter (x:xs) | x `elem` ".()" = '\\':x:xs + | otherwise = x : escapeDelimiter xs + escapeDelimiter [] = [] + let contents' = if isEnabled Ext_all_symbols_escapable opts && + not (stPlain st) && beginsWithOrderedListMarker rendered + then text $ escapeDelimiter rendered + else contents + return $ contents' <> cr -- title beginning with fig: indicates figure blockToMarkdown opts (Para [Image alt (src,'f':'i':'g':':':tit)]) = blockToMarkdown opts (Para [Image alt (src,tit)]) -blockToMarkdown opts (Para inlines) = do - contents <- inlineListToMarkdown opts inlines - -- escape if para starts with ordered list marker - st <- get - let esc = if isEnabled Ext_all_symbols_escapable opts && - not (stPlain st) && - beginsWithOrderedListMarker (render Nothing contents) - then text "\x200B" -- zero-width space, a hack - else empty - return $ esc <> contents <> blankline +blockToMarkdown opts (Para inlines) = + (<> blankline) `fmap` blockToMarkdown opts (Plain inlines) blockToMarkdown opts (RawBlock f str) | f == "html" = do st <- get @@ -325,7 +343,7 @@ blockToMarkdown opts (RawBlock f str) else return $ if isEnabled Ext_markdown_attribute opts then text (addMarkdownAttribute str) <> text "\n" else text str <> text "\n" - | f == "latex" || f == "tex" || f == "markdown" = do + | f `elem` ["latex", "tex", "markdown"] = do st <- get if stPlain st then return empty @@ -368,23 +386,27 @@ blockToMarkdown opts (CodeBlock (_,classes,_) str) isEnabled Ext_literate_haskell opts = return $ prefixed "> " (text str) <> blankline blockToMarkdown opts (CodeBlock attribs str) = return $ - case attribs of - x | x /= nullAttr && isEnabled Ext_fenced_code_blocks opts -> - tildes <> " " <> attrs <> cr <> text str <> - cr <> tildes <> blankline - (_,(cls:_),_) | isEnabled Ext_backtick_code_blocks opts -> - backticks <> " " <> text cls <> cr <> text str <> - cr <> backticks <> blankline + case attribs == nullAttr of + False | isEnabled Ext_backtick_code_blocks opts -> + backticks <> attrs <> cr <> text str <> cr <> backticks <> blankline + | isEnabled Ext_fenced_code_blocks opts -> + tildes <> attrs <> cr <> text str <> cr <> tildes <> blankline _ -> nest (writerTabStop opts) (text str) <> blankline where tildes = text $ case [ln | ln <- lines str, all (=='~') ln] of [] -> "~~~~" xs -> case maximum $ map length xs of n | n < 3 -> "~~~~" | otherwise -> replicate (n+1) '~' - backticks = text "```" + backticks = text $ case [ln | ln <- lines str, all (=='`') ln] of + [] -> "```" + xs -> case maximum $ map length xs of + n | n < 3 -> "```" + | otherwise -> replicate (n+1) '`' attrs = if isEnabled Ext_fenced_code_attributes opts - then nowrap $ attrsToMarkdown attribs - else empty + then nowrap $ " " <> attrsToMarkdown attribs + else case attribs of + (_,[cls],_) -> " " <> text cls + _ -> empty blockToMarkdown opts (BlockQuote blocks) = do st <- get -- if we're writing literate haskell, put a space before the bird tracks @@ -456,16 +478,24 @@ addMarkdownAttribute s = pipeTable :: Bool -> [Alignment] -> [Doc] -> [[Doc]] -> State WriterState Doc pipeTable headless aligns rawHeaders rawRows = do + let sp = text " " + let blockFor AlignLeft x y = lblock (x + 2) (sp <> y) <> lblock 0 empty + blockFor AlignCenter x y = cblock (x + 2) (sp <> y) <> lblock 0 empty + blockFor AlignRight x y = rblock (x + 2) (sp <> y) <> lblock 0 empty + blockFor _ x y = lblock (x + 2) (sp <> y) <> lblock 0 empty + let widths = map (max 3 . maximum . map offset) $ transpose (rawHeaders : rawRows) let torow cs = nowrap $ text "|" <> - hcat (intersperse (text "|") $ map chomp cs) <> text "|" - let toborder (a, h) = let wid = max (offset h) 3 - in text $ case a of - AlignLeft -> ':':replicate (wid - 1) '-' - AlignCenter -> ':':replicate (wid - 2) '-' ++ ":" - AlignRight -> replicate (wid - 1) '-' ++ ":" - AlignDefault -> replicate wid '-' + hcat (intersperse (text "|") $ + zipWith3 blockFor aligns widths (map chomp cs)) + <> text "|" + let toborder (a, w) = text $ case a of + AlignLeft -> ':':replicate (w + 1) '-' + AlignCenter -> ':':replicate w '-' ++ ":" + AlignRight -> replicate (w + 1) '-' ++ ":" + AlignDefault -> replicate (w + 2) '-' let header = if headless then empty else torow rawHeaders - let border = torow $ map toborder $ zip aligns rawHeaders + let border = nowrap $ text "|" <> hcat (intersperse (text "|") $ + map toborder $ zip aligns widths) <> text "|" let body = vcat $ map torow rawRows return $ header $$ border $$ body @@ -542,7 +572,14 @@ bulletListItemToMarkdown opts items = do contents <- blockListToMarkdown opts items let sps = replicate (writerTabStop opts - 2) ' ' let start = text ('-' : ' ' : sps) - return $ hang (writerTabStop opts) start $ contents <> cr + -- remove trailing blank line if it is a tight list + let contents' = case reverse items of + (BulletList xs:_) | isTightList xs -> + chomp contents <> cr + (OrderedList _ xs:_) | isTightList xs -> + chomp contents <> cr + _ -> contents + return $ hang (writerTabStop opts) start $ contents' <> cr -- | Convert ordered list item (a list of blocks) to markdown. orderedListItemToMarkdown :: WriterOptions -- ^ options @@ -608,10 +645,11 @@ getReference label (src, tit) = do Nothing -> do let label' = case find ((== label) . fst) (stRefs st) of Just _ -> -- label is used; generate numerical label - case find (\n -> not (any (== [Str (show n)]) - (map fst (stRefs st)))) [1..(10000 :: Integer)] of - Just x -> [Str (show x)] - Nothing -> error "no unique label" + case find (\n -> notElem [Str (show n)] + (map fst (stRefs st))) + [1..(10000 :: Integer)] of + Just x -> [Str (show x)] + Nothing -> error "no unique label" Nothing -> label modify (\s -> s{ stRefs = (label', (src,tit)) : stRefs st }) return label' @@ -628,6 +666,12 @@ escapeSpaces x = x -- | Convert Pandoc inline element to markdown. inlineToMarkdown :: WriterOptions -> Inline -> State WriterState Doc +inlineToMarkdown opts (Span attrs ils) = do + st <- get + contents <- inlineListToMarkdown opts ils + return $ if stPlain st + then contents + else tagWithAttrs "span" attrs <> contents <> text "</span>" inlineToMarkdown opts (Emph lst) = do contents <- inlineListToMarkdown opts lst return $ "*" <> contents <> "*" @@ -640,13 +684,13 @@ inlineToMarkdown opts (Strikeout lst) = do then "~~" <> contents <> "~~" else "<s>" <> contents <> "</s>" inlineToMarkdown opts (Superscript lst) = do - let lst' = bottomUp escapeSpaces lst + let lst' = walk escapeSpaces lst contents <- inlineListToMarkdown opts lst' return $ if isEnabled Ext_superscript opts then "^" <> contents <> "^" else "<sup>" <> contents <> "</sup>" inlineToMarkdown opts (Subscript lst) = do - let lst' = bottomUp escapeSpaces lst + let lst' = walk escapeSpaces lst contents <- inlineListToMarkdown opts lst' return $ if isEnabled Ext_subscript opts then "~" <> contents <> "~" @@ -681,7 +725,7 @@ inlineToMarkdown opts (Math InlineMath str) return $ "\\(" <> text str <> "\\)" | isEnabled Ext_tex_math_double_backslash opts = return $ "\\\\(" <> text str <> "\\\\)" - | otherwise = inlineListToMarkdown opts $ readTeXMath str + | otherwise = inlineListToMarkdown opts $ readTeXMath' InlineMath str inlineToMarkdown opts (Math DisplayMath str) | isEnabled Ext_tex_math_dollars opts = return $ "$$" <> text str <> "$$" @@ -690,7 +734,7 @@ inlineToMarkdown opts (Math DisplayMath str) | isEnabled Ext_tex_math_double_backslash opts = return $ "\\\\[" <> text str <> "\\\\]" | otherwise = (\x -> cr <> x <> cr) `fmap` - inlineListToMarkdown opts (readTeXMath str) + inlineListToMarkdown opts (readTeXMath' DisplayMath str) inlineToMarkdown opts (RawInline f str) | f == "html" || f == "markdown" || (isEnabled Ext_raw_tex opts && (f == "latex" || f == "tex")) = @@ -701,17 +745,20 @@ inlineToMarkdown opts (LineBreak) | isEnabled Ext_escaped_line_breaks opts = return $ "\\" <> cr | otherwise = return $ " " <> cr inlineToMarkdown _ Space = return space -inlineToMarkdown opts (Cite (c:cs) lst@[RawInline "latex" _]) +inlineToMarkdown opts (Cite [] lst) = inlineListToMarkdown opts lst +inlineToMarkdown opts (Cite (c:cs) lst) | not (isEnabled Ext_citations opts) = inlineListToMarkdown opts lst - | citationMode c == AuthorInText = do - suffs <- inlineListToMarkdown opts $ citationSuffix c - rest <- mapM convertOne cs - let inbr = suffs <+> joincits rest - br = if isEmpty inbr then empty else char '[' <> inbr <> char ']' - return $ text ("@" ++ citationId c) <+> br - | otherwise = do - cits <- mapM convertOne (c:cs) - return $ text "[" <> joincits cits <> text "]" + | otherwise = + if citationMode c == AuthorInText + then do + suffs <- inlineListToMarkdown opts $ citationSuffix c + rest <- mapM convertOne cs + let inbr = suffs <+> joincits rest + br = if isEmpty inbr then empty else char '[' <> inbr <> char ']' + return $ text ("@" ++ citationId c) <+> br + else do + cits <- mapM convertOne (c:cs) + return $ text "[" <> joincits cits <> text "]" where joincits = hcat . intersperse (text "; ") . filter (not . isEmpty) convertOne Citation { citationId = k @@ -728,14 +775,13 @@ inlineToMarkdown opts (Cite (c:cs) lst@[RawInline "latex" _]) return $ pdoc <+> r modekey SuppressAuthor = "-" modekey _ = "" -inlineToMarkdown opts (Cite _ lst) = inlineListToMarkdown opts lst inlineToMarkdown opts (Link txt (src, tit)) = do linktext <- inlineListToMarkdown opts txt let linktitle = if null tit then empty else text $ " \"" ++ tit ++ "\"" let srcSuffix = if isPrefixOf "mailto:" src then drop 7 src else src - let useAuto = isAbsoluteURI src && + let useAuto = isURI src && case txt of [Str s] | escapeURI s == srcSuffix -> True _ -> False diff --git a/src/Text/Pandoc/Writers/MediaWiki.hs b/src/Text/Pandoc/Writers/MediaWiki.hs index b3b319c2a..3b987ba2b 100644 --- a/src/Text/Pandoc/Writers/MediaWiki.hs +++ b/src/Text/Pandoc/Writers/MediaWiki.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2008-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2008-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.MediaWiki - Copyright : Copyright (C) 2008-2010 John MacFarlane + Copyright : Copyright (C) 2008-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -34,9 +34,10 @@ import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Shared import Text.Pandoc.Writers.Shared +import Text.Pandoc.Pretty (render) import Text.Pandoc.Templates (renderTemplate') import Text.Pandoc.XML ( escapeStringForXML ) -import Data.List ( intersect, intercalate ) +import Data.List ( intersect, intercalate, intersperse ) import Network.URI ( isURI ) import Control.Monad.State @@ -50,7 +51,7 @@ data WriterState = WriterState { writeMediaWiki :: WriterOptions -> Pandoc -> String writeMediaWiki opts document = evalState (pandocToMediaWiki opts document) - (WriterState { stNotes = False, stListLevel = [], stUseTags = False }) + WriterState { stNotes = False, stListLevel = [], stUseTags = False } -- | Return MediaWiki representation of document. pandocToMediaWiki :: WriterOptions -> Pandoc -> State WriterState String @@ -83,6 +84,11 @@ blockToMediaWiki :: WriterOptions -- ^ Options blockToMediaWiki _ Null = return "" +blockToMediaWiki opts (Div attrs bs) = do + contents <- blockListToMediaWiki opts bs + return $ render Nothing (tagWithAttrs "div" attrs) ++ "\n\n" ++ + contents ++ "\n\n" ++ "</div>" + blockToMediaWiki opts (Plain inlines) = inlineListToMediaWiki opts inlines @@ -104,9 +110,10 @@ blockToMediaWiki opts (Para inlines) = do then "<p>" ++ contents ++ "</p>" else contents ++ if null listLevel then "\n" else "" -blockToMediaWiki _ (RawBlock "mediawiki" str) = return str -blockToMediaWiki _ (RawBlock "html" str) = return str -blockToMediaWiki _ (RawBlock _ _) = return "" +blockToMediaWiki _ (RawBlock f str) + | f == Format "mediawiki" = return str + | f == Format "html" = return str + | otherwise = return "" blockToMediaWiki _ HorizontalRule = return "\n-----\n" @@ -135,25 +142,17 @@ blockToMediaWiki opts (BlockQuote blocks) = do return $ "<blockquote>" ++ contents ++ "</blockquote>" blockToMediaWiki opts (Table capt aligns widths headers rows') = do - let alignStrings = map alignmentToString aligns - captionDoc <- if null capt - then return "" - else do - c <- inlineListToMediaWiki opts capt - return $ "<caption>" ++ c ++ "</caption>\n" - let percent w = show (truncate (100*w) :: Integer) ++ "%" - let coltags = if all (== 0.0) widths - then "" - else unlines $ map - (\w -> "<col width=\"" ++ percent w ++ "\" />") widths - head' <- if all null headers - then return "" - else do - hs <- tableRowToMediaWiki opts alignStrings 0 headers - return $ "<thead>\n" ++ hs ++ "\n</thead>\n" - body' <- zipWithM (tableRowToMediaWiki opts alignStrings) [1..] rows' - return $ "<table>\n" ++ captionDoc ++ coltags ++ head' ++ - "<tbody>\n" ++ unlines body' ++ "</tbody>\n</table>\n" + caption <- if null capt + then return "" + else do + c <- inlineListToMediaWiki opts capt + return $ "|+ " ++ trimr c ++ "\n" + let headless = all null headers + let allrows = if headless then rows' else headers:rows' + tableBody <- (concat . intersperse "|-\n") `fmap` + mapM (tableRowToMediaWiki opts headless aligns widths) + (zip [1..] allrows) + return $ "{|\n" ++ caption ++ tableBody ++ "|}\n" blockToMediaWiki opts x@(BulletList items) = do oldUseTags <- get >>= return . stUseTags @@ -285,20 +284,34 @@ vcat = intercalate "\n" -- Auxiliary functions for tables: tableRowToMediaWiki :: WriterOptions - -> [String] - -> Int - -> [[Block]] + -> Bool + -> [Alignment] + -> [Double] + -> (Int, [[Block]]) -> State WriterState String -tableRowToMediaWiki opts alignStrings rownum cols' = do - let celltype = if rownum == 0 then "th" else "td" - let rowclass = case rownum of - 0 -> "header" - x | x `rem` 2 == 1 -> "odd" - _ -> "even" - cols'' <- sequence $ zipWith - (\alignment item -> tableItemToMediaWiki opts celltype alignment item) - alignStrings cols' - return $ "<tr class=\"" ++ rowclass ++ "\">\n" ++ unlines cols'' ++ "</tr>" +tableRowToMediaWiki opts headless alignments widths (rownum, cells) = do + cells' <- mapM (\cellData -> + tableCellToMediaWiki opts headless rownum cellData) + $ zip3 alignments widths cells + return $ unlines cells' + +tableCellToMediaWiki :: WriterOptions + -> Bool + -> Int + -> (Alignment, Double, [Block]) + -> State WriterState String +tableCellToMediaWiki opts headless rownum (alignment, width, bs) = do + contents <- blockListToMediaWiki opts bs + let marker = if rownum == 1 && not headless then "!" else "|" + let percent w = show (truncate (100*w) :: Integer) ++ "%" + let attrs = ["align=" ++ show (alignmentToString alignment) | + alignment /= AlignDefault && alignment /= AlignLeft] ++ + ["width=\"" ++ percent width ++ "\"" | + width /= 0.0 && rownum == 1] + let attr = if null attrs + then "" + else unwords attrs ++ "|" + return $ marker ++ attr ++ trimr contents alignmentToString :: Alignment -> [Char] alignmentToString alignment = case alignment of @@ -307,17 +320,6 @@ alignmentToString alignment = case alignment of AlignCenter -> "center" AlignDefault -> "left" -tableItemToMediaWiki :: WriterOptions - -> String - -> String - -> [Block] - -> State WriterState String -tableItemToMediaWiki opts celltype align' item = do - let mkcell x = "<" ++ celltype ++ " align=\"" ++ align' ++ "\">" ++ - x ++ "</" ++ celltype ++ ">" - contents <- blockListToMediaWiki opts item - return $ mkcell contents - -- | Convert list of Pandoc block elements to MediaWiki. blockListToMediaWiki :: WriterOptions -- ^ Options -> [Block] -- ^ List of block elements @@ -333,6 +335,10 @@ inlineListToMediaWiki opts lst = -- | Convert Pandoc inline element to MediaWiki. inlineToMediaWiki :: WriterOptions -> Inline -> State WriterState String +inlineToMediaWiki opts (Span attrs ils) = do + contents <- inlineListToMediaWiki opts ils + return $ render Nothing (tagWithAttrs "span" attrs) ++ contents ++ "</span>" + inlineToMediaWiki opts (Emph lst) = do contents <- inlineListToMediaWiki opts lst return $ "''" ++ contents ++ "''" @@ -373,9 +379,10 @@ inlineToMediaWiki _ (Str str) = return $ escapeString str inlineToMediaWiki _ (Math _ str) = return $ "<math>" ++ str ++ "</math>" -- note: str should NOT be escaped -inlineToMediaWiki _ (RawInline "mediawiki" str) = return str -inlineToMediaWiki _ (RawInline "html" str) = return str -inlineToMediaWiki _ (RawInline _ _) = return "" +inlineToMediaWiki _ (RawInline f str) + | f == Format "mediawiki" = return str + | f == Format "html" = return str + | otherwise = return "" inlineToMediaWiki _ (LineBreak) = return "<br />" diff --git a/src/Text/Pandoc/Writers/Native.hs b/src/Text/Pandoc/Writers/Native.hs index 090b97433..cb821e40b 100644 --- a/src/Text/Pandoc/Writers/Native.hs +++ b/src/Text/Pandoc/Writers/Native.hs @@ -1,6 +1,6 @@ {-# LANGUAGE OverloadedStrings #-} {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.Native - Copyright : Copyright (C) 2006-2010 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Writers/ODT.hs b/src/Text/Pandoc/Writers/ODT.hs index db27286e8..15f7c8be8 100644 --- a/src/Text/Pandoc/Writers/ODT.hs +++ b/src/Text/Pandoc/Writers/ODT.hs @@ -1,6 +1,6 @@ {-# LANGUAGE ScopedTypeVariables #-} {- -Copyright (C) 2008-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2008-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.ODT - Copyright : Copyright (C) 2008-2010 John MacFarlane + Copyright : Copyright (C) 2008-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -30,7 +30,10 @@ Conversion of 'Pandoc' documents to ODT. -} module Text.Pandoc.Writers.ODT ( writeODT ) where import Data.IORef -import Data.List ( isPrefixOf ) +import Data.List ( isPrefixOf, isSuffixOf ) +import Data.Maybe ( fromMaybe ) +import Text.XML.Light.Output +import Text.TeXMath import qualified Data.ByteString.Lazy as B import Text.Pandoc.UTF8 ( fromStringLazy ) import Codec.Archive.Zip @@ -39,15 +42,15 @@ import Text.Pandoc.Shared ( stringify, readDataFile, fetchItem, warn ) import Text.Pandoc.ImageSize ( imageSize, sizeInPoints ) import Text.Pandoc.MIME ( getMimeType ) import Text.Pandoc.Definition -import Text.Pandoc.Generic +import Text.Pandoc.Walk +import Text.Pandoc.Writers.Shared ( fixDisplayMath ) import Text.Pandoc.Writers.OpenDocument ( writeOpenDocument ) import Control.Monad (liftM) -import Control.Monad.Trans (liftIO) import Text.Pandoc.XML import Text.Pandoc.Pretty import qualified Control.Exception as E import Data.Time.Clock.POSIX ( getPOSIXTime ) -import System.FilePath ( takeExtension ) +import System.FilePath ( takeExtension, takeDirectory ) -- | Produce an ODT file from a Pandoc document. writeODT :: WriterOptions -- ^ Writer options @@ -61,34 +64,44 @@ writeODT opts doc@(Pandoc meta _) = do Just f -> B.readFile f Nothing -> (B.fromChunks . (:[])) `fmap` readDataFile datadir "reference.odt" - -- handle pictures + -- handle formulas and pictures picEntriesRef <- newIORef ([] :: [Entry]) - let sourceDir = writerSourceDirectory opts - doc' <- bottomUpM (transformPic sourceDir picEntriesRef) doc + doc' <- walkM (transformPicMath opts picEntriesRef) $ walk fixDisplayMath doc let newContents = writeOpenDocument opts{writerWrapText = False} doc' epochtime <- floor `fmap` getPOSIXTime - let contentEntry = toEntry "content.xml" epochtime $ fromStringLazy newContents + let contentEntry = toEntry "content.xml" epochtime + $ fromStringLazy newContents picEntries <- readIORef picEntriesRef - let archive = foldr addEntryToArchive refArchive $ contentEntry : picEntries + let archive = foldr addEntryToArchive refArchive + $ contentEntry : picEntries -- construct META-INF/manifest.xml based on archive let toFileEntry fp = case getMimeType fp of - Nothing -> empty + Nothing -> if "Formula-" `isPrefixOf` fp && "/" `isSuffixOf` fp + then selfClosingTag "manifest:file-entry" + [("manifest:media-type","application/vnd.oasis.opendocument.formula") + ,("manifest:full-path",fp)] + else empty Just m -> selfClosingTag "manifest:file-entry" [("manifest:media-type", m) ,("manifest:full-path", fp) + ,("manifest:version", "1.2") ] - let files = [ ent | ent <- filesInArchive archive, not ("META-INF" `isPrefixOf` ent) ] + let files = [ ent | ent <- filesInArchive archive, + not ("META-INF" `isPrefixOf` ent) ] + let formulas = [ takeDirectory ent ++ "/" | ent <- filesInArchive archive, + "Formula-" `isPrefixOf` ent, takeExtension ent == ".xml" ] let manifestEntry = toEntry "META-INF/manifest.xml" epochtime $ fromStringLazy $ render Nothing $ text "<?xml version=\"1.0\" encoding=\"utf-8\"?>" $$ ( inTags True "manifest:manifest" - [("xmlns:manifest","urn:oasis:names:tc:opendocument:xmlns:manifest:1.0")] + [("xmlns:manifest","urn:oasis:names:tc:opendocument:xmlns:manifest:1.0") + ,("manifest:version","1.2")] $ ( selfClosingTag "manifest:file-entry" [("manifest:media-type","application/vnd.oasis.opendocument.text") - ,("manifest:version","1.2") ,("manifest:full-path","/")] $$ vcat ( map toFileEntry $ files ) + $$ vcat ( map toFileEntry $ formulas ) ) ) let archive' = addEntryToArchive manifestEntry archive @@ -109,19 +122,23 @@ writeODT opts doc@(Pandoc meta _) = do ) ) ) - let archive'' = addEntryToArchive metaEntry archive' + -- make sure mimetype is first + let mimetypeEntry = toEntry "mimetype" epochtime + $ fromStringLazy "application/vnd.oasis.opendocument.text" + let archive'' = addEntryToArchive mimetypeEntry + $ addEntryToArchive metaEntry archive' return $ fromArchive archive'' -transformPic :: FilePath -> IORef [Entry] -> Inline -> IO Inline -transformPic sourceDir entriesRef (Image lab (src,_)) = do - res <- liftIO $ E.try $ fetchItem sourceDir src +transformPicMath :: WriterOptions -> IORef [Entry] -> Inline -> IO Inline +transformPicMath opts entriesRef (Image lab (src,_)) = do + res <- fetchItem (writerSourceURL opts) src case res of Left (_ :: E.SomeException) -> do - liftIO $ warn $ "Could not find image `" ++ src ++ "', skipping..." + warn $ "Could not find image `" ++ src ++ "', skipping..." return $ Emph lab Right (img, _) -> do let size = imageSize img - let (w,h) = maybe (0,0) id $ sizeInPoints `fmap` size + let (w,h) = fromMaybe (0,0) $ sizeInPoints `fmap` size let tit' = show w ++ "x" ++ show h entries <- readIORef entriesRef let newsrc = "Pictures/" ++ show (length entries) ++ takeExtension src @@ -130,5 +147,29 @@ transformPic sourceDir entriesRef (Image lab (src,_)) = do let entry = toEntry newsrc epochtime $ toLazy img modifyIORef entriesRef (entry:) return $ Image lab (newsrc, tit') -transformPic _ _ x = return x +transformPicMath _ entriesRef (Math t math) = do + entries <- readIORef entriesRef + let dt = if t == InlineMath then DisplayInline else DisplayBlock + case texMathToMathML dt math of + Left _ -> return $ Math t math + Right r -> do + let conf = useShortEmptyTags (const False) defaultConfigPP + let mathml = ppcTopElement conf r + epochtime <- floor `fmap` getPOSIXTime + let dirname = "Formula-" ++ show (length entries) ++ "/" + let fname = dirname ++ "content.xml" + let entry = toEntry fname epochtime (fromStringLazy mathml) + modifyIORef entriesRef (entry:) + return $ RawInline (Format "opendocument") $ render Nothing $ + inTags False "draw:frame" [("text:anchor-type", + if t == DisplayMath + then "paragraph" + else "as-char") + ,("style:vertical-pos", "middle") + ,("style:vertical-rel", "text")] $ + selfClosingTag "draw:object" [("xlink:href", dirname) + , ("xlink:type", "simple") + , ("xlink:show", "embed") + , ("xlink:actuate", "onLoad")] +transformPicMath _ _ x = return x diff --git a/src/Text/Pandoc/Writers/OPML.hs b/src/Text/Pandoc/Writers/OPML.hs index f6926c1dc..dd359f3f5 100644 --- a/src/Text/Pandoc/Writers/OPML.hs +++ b/src/Text/Pandoc/Writers/OPML.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2013 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2013-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.OPML - Copyright : Copyright (C) 2013 John MacFarlane + Copyright : Copyright (C) 2013-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Writers/OpenDocument.hs b/src/Text/Pandoc/Writers/OpenDocument.hs index 30f99c3e4..b6da2694c 100644 --- a/src/Text/Pandoc/Writers/OpenDocument.hs +++ b/src/Text/Pandoc/Writers/OpenDocument.hs @@ -1,7 +1,7 @@ -{-# LANGUAGE PatternGuards #-} +{-# LANGUAGE PatternGuards, OverloadedStrings #-} {- -Copyright (C) 2008-2010 Andrea Rossato <andrea.rossato@ing.unitn.it> -and John MacFarlane. +Copyright (C) 2008-2014 Andrea Rossato <andrea.rossato@ing.unitn.it> + and John MacFarlane. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,7 +20,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.OpenDocument - Copyright : Copyright (C) 2008-2010 Andrea Rossato and John MacFarlane + Copyright : Copyright (C) 2008-2014 Andrea Rossato and John MacFarlane License : GNU GPL, version 2 or above Maintainer : Andrea Rossato <andrea.rossato@ing.unitn.it> @@ -64,6 +64,7 @@ data WriterState = , stInDefinition :: Bool , stTight :: Bool , stFirstPara :: Bool + , stImageId :: Int } defaultWriterState :: WriterState @@ -78,6 +79,7 @@ defaultWriterState = , stInDefinition = False , stTight = False , stFirstPara = False + , stImageId = 1 } when :: Bool -> Doc -> Doc @@ -283,8 +285,13 @@ blocksToOpenDocument o b = vcat <$> mapM (blockToOpenDocument o) b -- | Convert a Pandoc block element to OpenDocument. blockToOpenDocument :: WriterOptions -> Block -> State WriterState Doc blockToOpenDocument o bs - | Plain b <- bs = inParagraphTags =<< inlinesToOpenDocument o b - | Para b <- bs = inParagraphTags =<< inlinesToOpenDocument o b + | Plain b <- bs = if null b + then return empty + else inParagraphTags =<< inlinesToOpenDocument o b + | Para b <- bs = if null b + then return empty + else inParagraphTags =<< inlinesToOpenDocument o b + | Div _ xs <- bs = blocksToOpenDocument o xs | Header i _ b <- bs = setFirstPara >> (inHeaderTags i =<< inlinesToOpenDocument o b) | BlockQuote b <- bs = setFirstPara >> mkBlockQuote b @@ -295,7 +302,9 @@ blockToOpenDocument o bs | Table c a w h r <- bs = setFirstPara >> table c a w h r | HorizontalRule <- bs = setFirstPara >> return (selfClosingTag "text:p" [ ("text:style-name", "Horizontal_20_Line") ]) - | RawBlock _ _ <- bs = return empty + | RawBlock f s <- bs = if f == Format "opendocument" + then return $ text s + else return empty | Null <- bs = return empty | otherwise = return empty where @@ -360,6 +369,7 @@ inlinesToOpenDocument o l = hcat <$> mapM (inlineToOpenDocument o) l inlineToOpenDocument :: WriterOptions -> Inline -> State WriterState Doc inlineToOpenDocument o ils | Space <- ils = inTextStyle space + | Span _ xs <- ils = inlinesToOpenDocument o xs | LineBreak <- ils = return $ selfClosingTag "text:line-break" [] | Str s <- ils = inTextStyle $ handleSpaces $ escapeStringForXML s | Emph l <- ils = withTextStyle Italic $ inlinesToOpenDocument o l @@ -369,23 +379,27 @@ inlineToOpenDocument o ils | Subscript l <- ils = withTextStyle Sub $ inlinesToOpenDocument o l | SmallCaps l <- ils = withTextStyle SmallC $ inlinesToOpenDocument o l | Quoted t l <- ils = inQuotes t <$> inlinesToOpenDocument o l - | Code _ s <- ils = preformatted s - | Math _ s <- ils = inlinesToOpenDocument o (readTeXMath s) + | Code _ s <- ils = withTextStyle Pre $ inTextStyle $ preformatted s + | Math t s <- ils = inlinesToOpenDocument o (readTeXMath' t s) | Cite _ l <- ils = inlinesToOpenDocument o l - | RawInline "opendocument" s <- ils = preformatted s - | RawInline "html" s <- ils = preformatted s -- for backwards compat. - | RawInline _ _ <- ils = return empty + | RawInline f s <- ils = if f == Format "opendocument" + then return $ text s + else return empty | Link l (s,t) <- ils = mkLink s t <$> inlinesToOpenDocument o l - | Image _ (s,t) <- ils = return $ mkImg s t + | Image _ (s,t) <- ils = mkImg s t | Note l <- ils = mkNote l | otherwise = return empty where - preformatted = return . inSpanTags "Teletype" . handleSpaces . escapeStringForXML + preformatted s = handleSpaces $ escapeStringForXML s mkLink s t = inTags False "text:a" [ ("xlink:type" , "simple") , ("xlink:href" , s ) , ("office:name", t ) ] . inSpanTags "Definition" - mkImg s t = inTags False "draw:frame" (attrsFromTitle t) $ + mkImg s t = do + id' <- gets stImageId + modify (\st -> st{ stImageId = id' + 1 }) + return $ inTags False "draw:frame" + (("draw:name", "img" ++ show id'):attrsFromTitle t) $ selfClosingTag "draw:image" [ ("xlink:href" , s ) , ("xlink:type" , "simple") , ("xlink:show" , "embed" ) @@ -457,7 +471,8 @@ tableStyle :: Int -> [(Char,Double)] -> Doc tableStyle num wcs = let tableId = "Table" ++ show (num + 1) table = inTags True "style:style" - [("style:name", tableId)] $ + [("style:name", tableId) + ,("style:family", "table")] $ selfClosingTag "style:table-properties" [("table:align" , "center")] colStyle (c,0) = selfClosingTag "style:style" @@ -489,14 +504,16 @@ paraStyle parent attrs = do tight = if t then [ ("fo:margin-top" , "0in" ) , ("fo:margin-bottom" , "0in" )] else [] - indent = when (i /= 0 || b || t) $ - selfClosingTag "style:paragraph-properties" $ - [ ("fo:margin-left" , indentVal) + indent = if (i /= 0 || b) + then [ ("fo:margin-left" , indentVal) , ("fo:margin-right" , "0in" ) , ("fo:text-indent" , "0in" ) , ("style:auto-text-indent" , "false" )] - ++ tight - addParaStyle $ inTags True "style:style" (styleAttr ++ attrs) indent + else [] + attributes = indent ++ tight + paraProps = when (not $ null attributes) $ + selfClosingTag "style:paragraph-properties" attributes + addParaStyle $ inTags True "style:style" (styleAttr ++ attrs) paraProps return pn paraListStyle :: Int -> State WriterState Int @@ -517,7 +534,8 @@ paraTableStyles t s (a:xs) [ ("fo:text-align", x) , ("style:justify-single-word", "false")] -data TextStyle = Italic | Bold | Strike | Sub | Sup | SmallC deriving ( Eq,Ord ) +data TextStyle = Italic | Bold | Strike | Sub | Sup | SmallC | Pre + deriving ( Eq,Ord ) textStyleAttr :: TextStyle -> [(String,String)] textStyleAttr s @@ -531,5 +549,8 @@ textStyleAttr s | Sub <- s = [("style:text-position" ,"sub 58%" )] | Sup <- s = [("style:text-position" ,"super 58%" )] | SmallC <- s = [("fo:font-variant" ,"small-caps")] + | Pre <- s = [("style:font-name" ,"Courier New") + ,("style:font-name-asian" ,"Courier New") + ,("style:font-name-complex" ,"Courier New")] | otherwise = [] diff --git a/src/Text/Pandoc/Writers/Org.hs b/src/Text/Pandoc/Writers/Org.hs index 40e8abf7e..87046537c 100644 --- a/src/Text/Pandoc/Writers/Org.hs +++ b/src/Text/Pandoc/Writers/Org.hs @@ -1,6 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- -Copyright (C) 2006-2010 Puneeth Chaganti <punchagan@gmail.com> +Copyright (C) 2010-2014 Puneeth Chaganti <punchagan@gmail.com> + and John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +20,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.Org - Copyright : Copyright (C) 2010 Puneeth Chaganti + Copyright : Copyright (C) 2010-2014 Puneeth Chaganti and John MacFarlane License : GNU GPL, version 2 or above Maintainer : Puneeth Chaganti <punchagan@gmail.com> @@ -106,6 +107,14 @@ escapeString = escapeStringUsing $ blockToOrg :: Block -- ^ Block element -> State WriterState Doc blockToOrg Null = return empty +blockToOrg (Div attrs bs) = do + contents <- blockListToOrg bs + let startTag = tagWithAttrs "div" attrs + let endTag = text "</div>" + return $ blankline $$ "#+BEGIN_HTML" $$ + nest 2 startTag $$ "#+END_HTML" $$ blankline $$ + contents $$ blankline $$ "#+BEGIN_HTML" $$ + nest 2 endTag $$ "#+END_HTML" $$ blankline blockToOrg (Plain inlines) = inlineListToOrg inlines -- title beginning with fig: indicates that the image is a figure blockToOrg (Para [Image txt (src,'f':'i':'g':':':tit)]) = do @@ -121,7 +130,7 @@ blockToOrg (Para inlines) = do blockToOrg (RawBlock "html" str) = return $ blankline $$ "#+BEGIN_HTML" $$ nest 2 (text str) $$ "#+END_HTML" $$ blankline -blockToOrg (RawBlock f str) | f == "org" || f == "latex" || f == "tex" = +blockToOrg (RawBlock f str) | f `elem` ["org", "latex", "tex"] = return $ text str blockToOrg (RawBlock _ _) = return empty blockToOrg HorizontalRule = return $ blankline $$ "--------------" $$ blankline @@ -229,6 +238,8 @@ inlineListToOrg lst = mapM inlineToOrg lst >>= return . hcat -- | Convert Pandoc inline element to Org. inlineToOrg :: Inline -> State WriterState Doc +inlineToOrg (Span _ lst) = + inlineListToOrg lst inlineToOrg (Emph lst) = do contents <- inlineListToOrg lst return $ "/" <> contents <> "/" @@ -261,7 +272,7 @@ inlineToOrg (Math t str) = do else "$$" <> text str <> "$$" inlineToOrg (RawInline f str) | f == "tex" || f == "latex" = return $ text str inlineToOrg (RawInline _ _) = return empty -inlineToOrg (LineBreak) = return cr -- there's no line break in Org +inlineToOrg (LineBreak) = return (text "\\\\" <> cr) inlineToOrg Space = return space inlineToOrg (Link txt (src, _)) = do case txt of diff --git a/src/Text/Pandoc/Writers/RST.hs b/src/Text/Pandoc/Writers/RST.hs index 606793842..31c97349b 100644 --- a/src/Text/Pandoc/Writers/RST.hs +++ b/src/Text/Pandoc/Writers/RST.hs @@ -1,6 +1,6 @@ {-# LANGUAGE OverloadedStrings #-} {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.RST - Copyright : Copyright (C) 2006-2010 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -38,11 +38,11 @@ import Text.Pandoc.Writers.Shared import Text.Pandoc.Templates (renderTemplate') import Text.Pandoc.Builder (deleteMeta) import Data.List ( isPrefixOf, intersperse, transpose ) -import Network.URI (isAbsoluteURI) +import Network.URI (isURI) import Text.Pandoc.Pretty import Control.Monad.State import Control.Applicative ( (<$>) ) -import Data.Char (isSpace) +import Data.Char (isSpace, toLower) type Refs = [([Inline], Target)] @@ -161,6 +161,11 @@ bordered contents c = blockToRST :: Block -- ^ Block element -> State WriterState Doc blockToRST Null = return empty +blockToRST (Div attr bs) = do + contents <- blockListToRST bs + let startTag = ".. raw:: html" $+$ nest 3 (tagWithAttrs "div" attr) + let endTag = ".. raw:: html" $+$ nest 3 "</div>" + return $ blankline <> startTag $+$ contents $+$ endTag $$ blankline blockToRST (Plain inlines) = inlineListToRST inlines -- title beginning with fig: indicates that the image is a figure blockToRST (Para [Image txt (src,'f':'i':'g':':':tit)]) = do @@ -175,9 +180,11 @@ blockToRST (Para inlines) | otherwise = do contents <- inlineListToRST inlines return $ contents <> blankline -blockToRST (RawBlock f str) = - return $ blankline <> ".. raw:: " <> text f $+$ - (nest 3 $ text str) $$ blankline +blockToRST (RawBlock f@(Format f') str) + | f == "rst" = return $ text str + | otherwise = return $ blankline <> ".. raw:: " <> + text (map toLower f') $+$ + (nest 3 $ text str) $$ blankline blockToRST HorizontalRule = return $ blankline $$ "--------------" $$ blankline blockToRST (Header level _ inlines) = do @@ -212,11 +219,15 @@ blockToRST (Table caption _ widths headers rows) = do else blankline <> text "Table: " <> caption' headers' <- mapM blockListToRST headers rawRows <- mapM (mapM blockListToRST) rows - let isSimple = all (==0) widths && all (all (\bs -> length bs <= 1)) rows + -- let isSimpleCell [Plain _] = True + -- isSimpleCell [Para _] = True + -- isSimpleCell [] = True + -- isSimpleCell _ = False + -- let isSimple = all (==0) widths && all (all isSimpleCell) rows let numChars = maximum . map offset opts <- get >>= return . stOptions let widthsInChars = - if isSimple + if all (== 0) widths then map ((+2) . numChars) $ transpose (headers' : rawRows) else map (floor . (fromIntegral (writerColumns opts) *)) widths let hpipeBlocks blocks = hcat [beg, middle, end] @@ -280,7 +291,7 @@ definitionListItemToRST (label, defs) = do label' <- inlineListToRST label contents <- liftM vcat $ mapM blockListToRST defs tabstop <- get >>= (return . writerTabStop . stOptions) - return $ label' $$ nest tabstop (contents <> cr) + return $ label' $$ nest tabstop (nestle contents <> cr) -- | Convert list of Pandoc block elements to RST. blockListToRST :: [Block] -- ^ List of block elements @@ -289,8 +300,14 @@ blockListToRST blocks = mapM blockToRST blocks >>= return . vcat -- | Convert list of Pandoc inline elements to RST. inlineListToRST :: [Inline] -> State WriterState Doc -inlineListToRST lst = mapM inlineToRST (insertBS lst) >>= return . hcat - where insertBS :: [Inline] -> [Inline] -- insert '\ ' where needed +inlineListToRST lst = + mapM inlineToRST (removeSpaceAfterDisplayMath $ insertBS lst) >>= return . hcat + where -- remove spaces after displaymath, as they screw up indentation: + removeSpaceAfterDisplayMath (Math DisplayMath x : zs) = + Math DisplayMath x : dropWhile (==Space) zs + removeSpaceAfterDisplayMath (x:xs) = x : removeSpaceAfterDisplayMath xs + removeSpaceAfterDisplayMath [] = [] + insertBS :: [Inline] -> [Inline] -- insert '\ ' where needed insertBS (x:y:z:zs) | isComplex y && surroundComplex x z = x : y : RawInline "rst" "\\ " : insertBS (z:zs) @@ -338,6 +355,7 @@ inlineListToRST lst = mapM inlineToRST (insertBS lst) >>= return . hcat -- | Convert Pandoc inline element to RST. inlineToRST :: Inline -> State WriterState Doc +inlineToRST (Span _ ils) = inlineListToRST ils inlineToRST (Emph lst) = do contents <- inlineListToRST lst return $ "*" <> contents <> "*" @@ -372,13 +390,14 @@ inlineToRST (Math t str) = do then blankline $$ ".. math::" $$ blankline $$ nest 3 (text str) $$ blankline else blankline $$ (".. math:: " <> text str) $$ blankline -inlineToRST (RawInline "rst" x) = return $ text x -inlineToRST (RawInline _ _) = return empty +inlineToRST (RawInline f x) + | f == "rst" = return $ text x + | otherwise = return empty inlineToRST (LineBreak) = return cr -- there's no line break in RST (see Para) inlineToRST Space = return space -- autolink inlineToRST (Link [Str str] (src, _)) - | isAbsoluteURI src && + | isURI src && if "mailto:" `isPrefixOf` src then src == escapeURI ("mailto:" ++ str) else src == escapeURI str = do diff --git a/src/Text/Pandoc/Writers/RTF.hs b/src/Text/Pandoc/Writers/RTF.hs index 0db1c52c4..e0428aaa8 100644 --- a/src/Text/Pandoc/Writers/RTF.hs +++ b/src/Text/Pandoc/Writers/RTF.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.RTF - Copyright : Copyright (C) 2006-2010 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -34,13 +34,13 @@ import Text.Pandoc.Shared import Text.Pandoc.Writers.Shared import Text.Pandoc.Readers.TeXMath import Text.Pandoc.Templates (renderTemplate') -import Text.Pandoc.Generic (bottomUpM) +import Text.Pandoc.Walk import Data.List ( isSuffixOf, intercalate ) import Data.Char ( ord, chr, isDigit, toLower ) import System.FilePath ( takeExtension ) import qualified Data.ByteString as B import Text.Printf ( printf ) -import Network.URI ( isAbsoluteURI, unEscapeString ) +import Network.URI ( isURI, unEscapeString ) import qualified Control.Exception as E -- | Convert Image inlines into a raw RTF embedded image, read from a file. @@ -48,7 +48,7 @@ import qualified Control.Exception as E rtfEmbedImage :: Inline -> IO Inline rtfEmbedImage x@(Image _ (src,_)) = do let ext = map toLower (takeExtension src) - if ext `elem` [".jpg",".jpeg",".png"] && not (isAbsoluteURI src) + if ext `elem` [".jpg",".jpeg",".png"] && not (isURI src) then do let src' = unEscapeString src imgdata <- E.catch (B.readFile src') @@ -62,7 +62,7 @@ rtfEmbedImage x@(Image _ (src,_)) = do let raw = "{\\pict" ++ filetype ++ " " ++ concat bytes ++ "}" return $ if B.null imgdata then x - else RawInline "rtf" raw + else RawInline (Format "rtf") raw else return x rtfEmbedImage x = return x @@ -70,7 +70,7 @@ rtfEmbedImage x = return x -- images embedded as encoded binary data. writeRTFWithEmbeddedImages :: WriterOptions -> Pandoc -> IO String writeRTFWithEmbeddedImages options doc = - writeRTF options `fmap` bottomUpM rtfEmbedImage doc + writeRTF options `fmap` walkM rtfEmbedImage doc -- | Convert Pandoc to a string in rich text format. writeRTF :: WriterOptions -> Pandoc -> String @@ -208,6 +208,8 @@ blockToRTF :: Int -- ^ indent level -> Block -- ^ block to convert -> String blockToRTF _ _ Null = "" +blockToRTF indent alignment (Div _ bs) = + concatMap (blockToRTF indent alignment) bs blockToRTF indent alignment (Plain lst) = rtfCompact indent 0 alignment $ inlineListToRTF lst blockToRTF indent alignment (Para lst) = @@ -216,8 +218,9 @@ blockToRTF indent alignment (BlockQuote lst) = concatMap (blockToRTF (indent + indentIncrement) alignment) lst blockToRTF indent _ (CodeBlock _ str) = rtfPar indent 0 AlignLeft ("\\f1 " ++ (codeStringToRTF str)) -blockToRTF _ _ (RawBlock "rtf" str) = str -blockToRTF _ _ (RawBlock _ _) = "" +blockToRTF _ _ (RawBlock f str) + | f == Format "rtf" = str + | otherwise = "" blockToRTF indent alignment (BulletList lst) = spaceAtEnd $ concatMap (listItemToRTF alignment indent (bulletMarker indent)) lst blockToRTF indent alignment (OrderedList attribs lst) = spaceAtEnd $ concat $ @@ -256,7 +259,7 @@ tableRowToRTF header indent aligns sizes' cols = tableItemToRTF :: Int -> Alignment -> [Block] -> String tableItemToRTF indent alignment item = let contents = concatMap (blockToRTF indent alignment) item - in "{\\intbl " ++ contents ++ "\\cell}\n" + in "{" ++ substitute "\\pard" "\\pard\\intbl" contents ++ "\\cell}\n" -- | Ensure that there's the same amount of space after compact -- lists as after regular lists. @@ -308,6 +311,7 @@ inlineListToRTF lst = concatMap inlineToRTF lst -- | Convert inline item to RTF. inlineToRTF :: Inline -- ^ inline to convert -> String +inlineToRTF (Span _ lst) = inlineListToRTF lst inlineToRTF (Emph lst) = "{\\i " ++ (inlineListToRTF lst) ++ "}" inlineToRTF (Strong lst) = "{\\b " ++ (inlineListToRTF lst) ++ "}" inlineToRTF (Strikeout lst) = "{\\strike " ++ (inlineListToRTF lst) ++ "}" @@ -320,10 +324,11 @@ inlineToRTF (Quoted DoubleQuote lst) = "\\u8220\"" ++ (inlineListToRTF lst) ++ "\\u8221\"" inlineToRTF (Code _ str) = "{\\f1 " ++ (codeStringToRTF str) ++ "}" inlineToRTF (Str str) = stringToRTF str -inlineToRTF (Math _ str) = inlineListToRTF $ readTeXMath str +inlineToRTF (Math t str) = inlineListToRTF $ readTeXMath' t str inlineToRTF (Cite _ lst) = inlineListToRTF lst -inlineToRTF (RawInline "rtf" str) = str -inlineToRTF (RawInline _ _) = "" +inlineToRTF (RawInline f str) + | f == Format "rtf" = str + | otherwise = "" inlineToRTF (LineBreak) = "\\line " inlineToRTF Space = " " inlineToRTF (Link text (src, _)) = diff --git a/src/Text/Pandoc/Writers/Shared.hs b/src/Text/Pandoc/Writers/Shared.hs index c6c30d070..800e741a4 100644 --- a/src/Text/Pandoc/Writers/Shared.hs +++ b/src/Text/Pandoc/Writers/Shared.hs @@ -1,5 +1,6 @@ +{-# LANGUAGE OverloadedStrings #-} {- -Copyright (C) 2013 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2013-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.Shared - Copyright : Copyright (C) 2013 John MacFarlane + Copyright : Copyright (C) 2013-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -32,9 +33,13 @@ module Text.Pandoc.Writers.Shared ( , getField , setField , defField + , tagWithAttrs + , fixDisplayMath ) where import Text.Pandoc.Definition +import Text.Pandoc.Pretty +import Text.Pandoc.XML (escapeStringForXML) import Control.Monad (liftM) import Text.Pandoc.Options (WriterOptions(..)) import qualified Data.HashMap.Strict as H @@ -42,6 +47,7 @@ import qualified Data.Map as M import qualified Data.Text as T import Data.Aeson (FromJSON(..), fromJSON, ToJSON (..), Value(Object), Result(..)) import qualified Data.Traversable as Traversable +import Data.List ( groupBy ) -- | Create JSON value for template from a 'Meta' and an association list -- of variables, specified at the command line or in the writer. @@ -61,8 +67,7 @@ metaToJSON opts blockWriter inlineWriter (Meta metamap) renderedMap <- Traversable.mapM (metaValueToJSON blockWriter inlineWriter) metamap - return $ M.foldWithKey (\key val obj -> defField key val obj) - baseContext renderedMap + return $ M.foldWithKey defField baseContext renderedMap | otherwise = return (Object H.empty) metaValueToJSON :: Monad m @@ -74,6 +79,7 @@ metaValueToJSON blockWriter inlineWriter (MetaMap metamap) = liftM toJSON $ Traversable.mapM (metaValueToJSON blockWriter inlineWriter) metamap metaValueToJSON blockWriter inlineWriter (MetaList xs) = liftM toJSON $ Traversable.mapM (metaValueToJSON blockWriter inlineWriter) xs +metaValueToJSON _ _ (MetaBool b) = return $ toJSON b metaValueToJSON _ _ (MetaString s) = return $ toJSON s metaValueToJSON blockWriter _ (MetaBlocks bs) = liftM toJSON $ blockWriter bs metaValueToJSON _ inlineWriter (MetaInlines bs) = liftM toJSON $ inlineWriter bs @@ -119,3 +125,41 @@ defField field val (Object hashmap) = where f _newval oldval = oldval defField _ _ x = x +-- Produce an HTML tag with the given pandoc attributes. +tagWithAttrs :: String -> Attr -> Doc +tagWithAttrs tag (ident,classes,kvs) = hsep + ["<" <> text tag + ,if null ident + then empty + else "id=" <> doubleQuotes (text ident) + ,if null classes + then empty + else "class=" <> doubleQuotes (text (unwords classes)) + ,hsep (map (\(k,v) -> text k <> "=" <> + doubleQuotes (text (escapeStringForXML v))) kvs) + ] <> ">" + +isDisplayMath :: Inline -> Bool +isDisplayMath (Math DisplayMath _) = True +isDisplayMath _ = False + +stripLeadingTrailingSpace :: [Inline] -> [Inline] +stripLeadingTrailingSpace = go . reverse . go . reverse + where go (Space:xs) = xs + go xs = xs + +-- Put display math in its own block (for ODT/DOCX). +fixDisplayMath :: Block -> Block +fixDisplayMath (Plain lst) + | any isDisplayMath lst && not (all isDisplayMath lst) = + -- chop into several paragraphs so each displaymath is its own + Div ("",["math"],[]) $ map (Plain . stripLeadingTrailingSpace) $ + groupBy (\x y -> (isDisplayMath x && isDisplayMath y) || + not (isDisplayMath x || isDisplayMath y)) lst +fixDisplayMath (Para lst) + | any isDisplayMath lst && not (all isDisplayMath lst) = + -- chop into several paragraphs so each displaymath is its own + Div ("",["math"],[]) $ map (Para . stripLeadingTrailingSpace) $ + groupBy (\x y -> (isDisplayMath x && isDisplayMath y) || + not (isDisplayMath x || isDisplayMath y)) lst +fixDisplayMath x = x diff --git a/src/Text/Pandoc/Writers/Texinfo.hs b/src/Text/Pandoc/Writers/Texinfo.hs index 0f57d14b2..8ac717bab 100644 --- a/src/Text/Pandoc/Writers/Texinfo.hs +++ b/src/Text/Pandoc/Writers/Texinfo.hs @@ -1,5 +1,6 @@ +{-# LANGUAGE OverloadedStrings #-} {- -Copyright (C) 2008-2010 John MacFarlane and Peter Wang +Copyright (C) 2008-2014 John MacFarlane and Peter Wang This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.Texinfo - Copyright : Copyright (C) 2008-2010 John MacFarlane and Peter Wang + Copyright : Copyright (C) 2008-2014 John MacFarlane and Peter Wang License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -39,7 +40,7 @@ import Data.Ord ( comparing ) import Data.Char ( chr, ord ) import Control.Monad.State import Text.Pandoc.Pretty -import Network.URI ( isAbsoluteURI, unEscapeString ) +import Network.URI ( isURI, unEscapeString ) import System.FilePath data WriterState = @@ -123,6 +124,8 @@ blockToTexinfo :: Block -- ^ Block to convert blockToTexinfo Null = return empty +blockToTexinfo (Div _ bs) = blockListToTexinfo bs + blockToTexinfo (Plain lst) = inlineListToTexinfo lst @@ -150,10 +153,11 @@ blockToTexinfo (CodeBlock _ str) = do flush (text str) $$ text "@end verbatim" <> blankline -blockToTexinfo (RawBlock "texinfo" str) = return $ text str -blockToTexinfo (RawBlock "latex" str) = - return $ text "@tex" $$ text str $$ text "@end tex" -blockToTexinfo (RawBlock _ _) = return empty +blockToTexinfo (RawBlock f str) + | f == "texinfo" = return $ text str + | f == "latex" || f == "tex" = + return $ text "@tex" $$ text str $$ text "@end tex" + | otherwise = return empty blockToTexinfo (BulletList lst) = do items <- mapM listItemToTexinfo lst @@ -289,7 +293,7 @@ blockListToTexinfo (x:xs) = do case x of Header level _ _ -> do -- We need need to insert a menu for this node. - let (before, after) = break isHeader xs + let (before, after) = break isHeaderBlock xs before' <- blockListToTexinfo before let menu = if level < 4 then collectNodes (level + 1) after @@ -311,10 +315,6 @@ blockListToTexinfo (x:xs) = do xs' <- blockListToTexinfo xs return $ x' $$ xs' -isHeader :: Block -> Bool -isHeader (Header _ _ _) = True -isHeader _ = False - collectNodes :: Int -> [Block] -> [Block] collectNodes _ [] = [] collectNodes level (x:xs) = @@ -374,6 +374,9 @@ disallowedInNode c = c `elem` ".,:()" inlineToTexinfo :: Inline -- ^ Inline to convert -> State WriterState Doc +inlineToTexinfo (Span _ lst) = + inlineListToTexinfo lst + inlineToTexinfo (Emph lst) = inlineListToTexinfo lst >>= return . inCmd "emph" @@ -413,10 +416,11 @@ inlineToTexinfo (Cite _ lst) = inlineListToTexinfo lst inlineToTexinfo (Str str) = return $ text (stringToTexinfo str) inlineToTexinfo (Math _ str) = return $ inCmd "math" $ text str -inlineToTexinfo (RawInline f str) | f == "latex" || f == "tex" = - return $ text "@tex" $$ text str $$ text "@end tex" -inlineToTexinfo (RawInline "texinfo" str) = return $ text str -inlineToTexinfo (RawInline _ _) = return empty +inlineToTexinfo (RawInline f str) + | f == "latex" || f == "tex" = + return $ text "@tex" $$ text str $$ text "@end tex" + | f == "texinfo" = return $ text str + | otherwise = return empty inlineToTexinfo (LineBreak) = return $ text "@*" inlineToTexinfo Space = return $ char ' ' @@ -440,7 +444,7 @@ inlineToTexinfo (Image alternate (source, _)) = do where ext = drop 1 $ takeExtension source' base = dropExtension source' - source' = if isAbsoluteURI source + source' = if isURI source then source else unEscapeString source diff --git a/src/Text/Pandoc/Writers/Textile.hs b/src/Text/Pandoc/Writers/Textile.hs index 3288ce222..3a6982a01 100644 --- a/src/Text/Pandoc/Writers/Textile.hs +++ b/src/Text/Pandoc/Writers/Textile.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2010-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Writers.Textile - Copyright : Copyright (C) 2010 John MacFarlane + Copyright : Copyright (C) 2010-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -33,6 +33,7 @@ module Text.Pandoc.Writers.Textile ( writeTextile ) where import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Shared +import Text.Pandoc.Pretty (render) import Text.Pandoc.Writers.Shared import Text.Pandoc.Templates (renderTemplate') import Text.Pandoc.XML ( escapeStringForXML ) @@ -50,7 +51,7 @@ data WriterState = WriterState { writeTextile :: WriterOptions -> Pandoc -> String writeTextile opts document = evalState (pandocToTextile opts document) - (WriterState { stNotes = [], stListLevel = [], stUseTags = False }) + WriterState { stNotes = [], stListLevel = [], stUseTags = False } -- | Return Textile representation of document. pandocToTextile :: WriterOptions -> Pandoc -> State WriterState String @@ -101,6 +102,12 @@ blockToTextile :: WriterOptions -- ^ Options blockToTextile _ Null = return "" +blockToTextile opts (Div attr bs) = do + let startTag = render Nothing $ tagWithAttrs "div" attr + let endTag = "</div>" + contents <- blockListToTextile opts bs + return $ startTag ++ "\n\n" ++ contents ++ "\n\n" ++ endTag ++ "\n" + blockToTextile opts (Plain inlines) = inlineListToTextile opts inlines @@ -118,10 +125,9 @@ blockToTextile opts (Para inlines) = do then "<p>" ++ contents ++ "</p>" else contents ++ if null listLevel then "\n" else "" -blockToTextile _ (RawBlock f str) = - if f == "html" || f == "textile" - then return str - else return "" +blockToTextile _ (RawBlock f str) + | f == Format "html" || f == Format "textile" = return str + | otherwise = return "" blockToTextile _ HorizontalRule = return "<hr />\n" @@ -343,6 +349,9 @@ inlineListToTextile opts lst = -- | Convert Pandoc inline element to Textile. inlineToTextile :: WriterOptions -> Inline -> State WriterState String +inlineToTextile opts (Span _ lst) = + inlineListToTextile opts lst + inlineToTextile opts (Emph lst) = do contents <- inlineListToTextile opts lst return $ if '_' `elem` contents @@ -395,10 +404,9 @@ inlineToTextile _ (Str str) = return $ escapeStringForTextile str inlineToTextile _ (Math _ str) = return $ "<span class=\"math\">" ++ escapeStringForXML str ++ "</math>" -inlineToTextile _ (RawInline f str) = - if f == "html" || f == "textile" - then return str - else return "" +inlineToTextile _ (RawInline f str) + | f == Format "html" || f == Format "textile" = return str + | otherwise = return "" inlineToTextile _ (LineBreak) = return "\n" diff --git a/src/Text/Pandoc/XML.hs b/src/Text/Pandoc/XML.hs index 89ae81a10..8000368aa 100644 --- a/src/Text/Pandoc/XML.hs +++ b/src/Text/Pandoc/XML.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.XML - Copyright : Copyright (C) 2006-2010 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -38,7 +38,7 @@ module Text.Pandoc.XML ( escapeCharForXML, import Text.Pandoc.Pretty import Data.Char (ord, isAscii, isSpace) -import Text.HTML.TagSoup.Entity (lookupEntity) +import Text.Pandoc.Compat.TagSoupEntity (lookupEntity) -- | Escape one character as needed for XML. escapeCharForXML :: Char -> String |