diff options
Diffstat (limited to 'src/Text/Pandoc/Shared.hs')
-rw-r--r-- | src/Text/Pandoc/Shared.hs | 351 |
1 files changed, 254 insertions, 97 deletions
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 5b0d9b6b4..a91ca9115 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -1,5 +1,5 @@ {-# LANGUAGE DeriveDataTypeable, CPP, MultiParamTypeClasses, - FlexibleContexts, ScopedTypeVariables #-} + FlexibleContexts, ScopedTypeVariables, PatternGuards #-} {- Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> @@ -55,7 +55,11 @@ module Text.Pandoc.Shared ( normalizeSpaces, extractSpaces, normalize, + normalizeInlines, + normalizeBlocks, + removeFormatting, stringify, + capitalize, compactify, compactify', compactify'DL, @@ -74,17 +78,21 @@ module Text.Pandoc.Shared ( readDataFile, readDataFileUTF8, fetchItem, + fetchItem', openURL, + collapseFilePath, -- * Error handling err, warn, -- * Safe read - safeRead + safeRead, + -- * Temp directory + withTempDir ) where import Text.Pandoc.Definition import Text.Pandoc.Walk -import Text.Pandoc.Generic +import Text.Pandoc.MediaBag (MediaBag, lookupMedia) import Text.Pandoc.Builder (Inlines, Blocks, ToMetaValue(..)) import qualified Text.Pandoc.Builder as B import qualified Text.Pandoc.UTF8 as UTF8 @@ -92,14 +100,15 @@ import System.Environment (getProgName) import System.Exit (exitWith, ExitCode(..)) import Data.Char ( toLower, isLower, isUpper, isAlpha, isLetter, isDigit, isSpace ) -import Data.List ( find, isPrefixOf, intercalate ) +import Data.List ( find, stripPrefix, intercalate ) import qualified Data.Map as M import Network.URI ( escapeURIString, isURI, nonStrictRelativeTo, - unEscapeString, parseURIReference ) + unEscapeString, parseURIReference, isAllowedInURI ) import qualified Data.Set as Set import System.Directory +import System.FilePath (joinPath, splitDirectories) import Text.Pandoc.MIME (getMimeType) -import System.FilePath ( (</>), takeExtension, dropExtension ) +import System.FilePath ( (</>), takeExtension, dropExtension) import Data.Generics (Typeable, Data) import qualified Control.Monad.State as S import qualified Control.Exception as E @@ -108,6 +117,7 @@ import Text.Pandoc.Pretty (charWidth) import System.Locale (defaultTimeLocale) import Data.Time import System.IO (stderr) +import System.IO.Temp import Text.HTML.TagSoup (renderTagsOptions, RenderOptions(..), Tag(..), renderOptions) import qualified Data.ByteString as BS @@ -115,6 +125,7 @@ import qualified Data.ByteString.Char8 as B8 import Text.Pandoc.Compat.Monoid import Data.ByteString.Base64 (decodeLenient) import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr) +import qualified Data.Text as T (toUpper, pack, unpack) #ifdef EMBED_DATA_FILES import Text.Pandoc.Data (dataFiles) @@ -174,9 +185,9 @@ substitute :: (Eq a) => [a] -> [a] -> [a] -> [a] substitute _ _ [] = [] substitute [] _ xs = xs substitute target replacement lst@(x:xs) = - if target `isPrefixOf` lst - then replacement ++ substitute target replacement (drop (length target) lst) - else x : substitute target replacement xs + case stripPrefix target lst of + Just lst' -> replacement ++ substitute target replacement lst' + Nothing -> x : substitute target replacement xs ordNub :: (Ord a) => [a] -> [a] ordNub l = go Set.empty l @@ -334,10 +345,10 @@ isSpaceOrEmpty (Str "") = True isSpaceOrEmpty _ = False -- | Extract the leading and trailing spaces from inside an inline element --- and place them outside the element. +-- and place them outside the element. extractSpaces :: (Inlines -> Inlines) -> Inlines -> Inlines -extractSpaces f is = +extractSpaces f is = let contents = B.unMany is left = case viewl contents of (Space :< _) -> B.space @@ -350,72 +361,160 @@ extractSpaces f is = -- | Normalize @Pandoc@ document, consolidating doubled 'Space's, -- combining adjacent 'Str's and 'Emph's, remove 'Null's and -- empty elements, etc. -normalize :: (Eq a, Data a) => a -> a -normalize = topDown removeEmptyBlocks . - topDown consolidateInlines . - bottomUp (removeEmptyInlines . removeTrailingInlineSpaces) - -removeEmptyBlocks :: [Block] -> [Block] -removeEmptyBlocks (Null : xs) = removeEmptyBlocks xs -removeEmptyBlocks (BulletList [] : xs) = removeEmptyBlocks xs -removeEmptyBlocks (OrderedList _ [] : xs) = removeEmptyBlocks xs -removeEmptyBlocks (DefinitionList [] : xs) = removeEmptyBlocks xs -removeEmptyBlocks (RawBlock _ [] : xs) = removeEmptyBlocks xs -removeEmptyBlocks (x:xs) = x : removeEmptyBlocks xs -removeEmptyBlocks [] = [] - -removeEmptyInlines :: [Inline] -> [Inline] -removeEmptyInlines (Emph [] : zs) = removeEmptyInlines zs -removeEmptyInlines (Strong [] : zs) = removeEmptyInlines zs -removeEmptyInlines (Subscript [] : zs) = removeEmptyInlines zs -removeEmptyInlines (Superscript [] : zs) = removeEmptyInlines zs -removeEmptyInlines (SmallCaps [] : zs) = removeEmptyInlines zs -removeEmptyInlines (Strikeout [] : zs) = removeEmptyInlines zs -removeEmptyInlines (RawInline _ [] : zs) = removeEmptyInlines zs -removeEmptyInlines (Code _ [] : zs) = removeEmptyInlines zs -removeEmptyInlines (Str "" : zs) = removeEmptyInlines zs -removeEmptyInlines (x : xs) = x : removeEmptyInlines xs -removeEmptyInlines [] = [] - -removeTrailingInlineSpaces :: [Inline] -> [Inline] -removeTrailingInlineSpaces = reverse . removeLeadingInlineSpaces . reverse - -removeLeadingInlineSpaces :: [Inline] -> [Inline] -removeLeadingInlineSpaces = dropWhile isSpaceOrEmpty - -consolidateInlines :: [Inline] -> [Inline] -consolidateInlines (Str x : ys) = +normalize :: Pandoc -> Pandoc +normalize (Pandoc (Meta meta) blocks) = + Pandoc (Meta $ M.map go meta) (normalizeBlocks blocks) + where go (MetaInlines xs) = MetaInlines $ normalizeInlines xs + go (MetaBlocks xs) = MetaBlocks $ normalizeBlocks xs + go (MetaList ms) = MetaList $ map go ms + go (MetaMap m) = MetaMap $ M.map go m + go x = x + +normalizeBlocks :: [Block] -> [Block] +normalizeBlocks (Null : xs) = normalizeBlocks xs +normalizeBlocks (Div attr bs : xs) = + Div attr (normalizeBlocks bs) : normalizeBlocks xs +normalizeBlocks (BlockQuote bs : xs) = + case normalizeBlocks bs of + [] -> normalizeBlocks xs + bs' -> BlockQuote bs' : normalizeBlocks xs +normalizeBlocks (BulletList [] : xs) = normalizeBlocks xs +normalizeBlocks (BulletList items : xs) = + BulletList (map normalizeBlocks items) : normalizeBlocks xs +normalizeBlocks (OrderedList _ [] : xs) = normalizeBlocks xs +normalizeBlocks (OrderedList attr items : xs) = + OrderedList attr (map normalizeBlocks items) : normalizeBlocks xs +normalizeBlocks (DefinitionList [] : xs) = normalizeBlocks xs +normalizeBlocks (DefinitionList items : xs) = + DefinitionList (map go items) : normalizeBlocks xs + where go (ils, bs) = (normalizeInlines ils, map normalizeBlocks bs) +normalizeBlocks (RawBlock _ "" : xs) = normalizeBlocks xs +normalizeBlocks (RawBlock f x : xs) = + case normalizeBlocks xs of + (RawBlock f' x' : rest) | f' == f -> + RawBlock f (x ++ ('\n':x')) : rest + rest -> RawBlock f x : rest +normalizeBlocks (Para ils : xs) = + case normalizeInlines ils of + [] -> normalizeBlocks xs + ils' -> Para ils' : normalizeBlocks xs +normalizeBlocks (Plain ils : xs) = + case normalizeInlines ils of + [] -> normalizeBlocks xs + ils' -> Plain ils' : normalizeBlocks xs +normalizeBlocks (Header lev attr ils : xs) = + Header lev attr (normalizeInlines ils) : normalizeBlocks xs +normalizeBlocks (Table capt aligns widths hdrs rows : xs) = + Table (normalizeInlines capt) aligns widths + (map normalizeBlocks hdrs) (map (map normalizeBlocks) rows) + : normalizeBlocks xs +normalizeBlocks (x:xs) = x : normalizeBlocks xs +normalizeBlocks [] = [] + +normalizeInlines :: [Inline] -> [Inline] +normalizeInlines (Str x : ys) = case concat (x : map fromStr strs) of - "" -> consolidateInlines rest - n -> Str n : consolidateInlines rest + "" -> rest + n -> Str n : rest where - (strs, rest) = span isStr ys + (strs, rest) = span isStr $ normalizeInlines ys isStr (Str _) = True isStr _ = False fromStr (Str z) = z - fromStr _ = error "consolidateInlines - fromStr - not a Str" -consolidateInlines (Space : ys) = Space : rest + fromStr _ = error "normalizeInlines - fromStr - not a Str" +normalizeInlines (Space : ys) = + if null rest + then [] + else Space : rest where isSp Space = True isSp _ = False - rest = consolidateInlines $ dropWhile isSp ys -consolidateInlines (Emph xs : Emph ys : zs) = consolidateInlines $ - Emph (xs ++ ys) : zs -consolidateInlines (Strong xs : Strong ys : zs) = consolidateInlines $ - Strong (xs ++ ys) : zs -consolidateInlines (Subscript xs : Subscript ys : zs) = consolidateInlines $ - Subscript (xs ++ ys) : zs -consolidateInlines (Superscript xs : Superscript ys : zs) = consolidateInlines $ - Superscript (xs ++ ys) : zs -consolidateInlines (SmallCaps xs : SmallCaps ys : zs) = consolidateInlines $ - SmallCaps (xs ++ ys) : zs -consolidateInlines (Strikeout xs : Strikeout ys : zs) = consolidateInlines $ - Strikeout (xs ++ ys) : zs -consolidateInlines (RawInline f x : RawInline f' y : zs) | f == f' = - consolidateInlines $ RawInline f (x ++ y) : zs -consolidateInlines (Code a1 x : Code a2 y : zs) | a1 == a2 = - consolidateInlines $ Code a1 (x ++ y) : zs -consolidateInlines (x : xs) = x : consolidateInlines xs -consolidateInlines [] = [] + rest = dropWhile isSp $ normalizeInlines ys +normalizeInlines (Emph xs : zs) = + case normalizeInlines zs of + (Emph ys : rest) -> normalizeInlines $ + Emph (normalizeInlines $ xs ++ ys) : rest + rest -> case normalizeInlines xs of + [] -> rest + xs' -> Emph xs' : rest +normalizeInlines (Strong xs : zs) = + case normalizeInlines zs of + (Strong ys : rest) -> normalizeInlines $ + Strong (normalizeInlines $ xs ++ ys) : rest + rest -> case normalizeInlines xs of + [] -> rest + xs' -> Strong xs' : rest +normalizeInlines (Subscript xs : zs) = + case normalizeInlines zs of + (Subscript ys : rest) -> normalizeInlines $ + Subscript (normalizeInlines $ xs ++ ys) : rest + rest -> case normalizeInlines xs of + [] -> rest + xs' -> Subscript xs' : rest +normalizeInlines (Superscript xs : zs) = + case normalizeInlines zs of + (Superscript ys : rest) -> normalizeInlines $ + Superscript (normalizeInlines $ xs ++ ys) : rest + rest -> case normalizeInlines xs of + [] -> rest + xs' -> Superscript xs' : rest +normalizeInlines (SmallCaps xs : zs) = + case normalizeInlines zs of + (SmallCaps ys : rest) -> normalizeInlines $ + SmallCaps (normalizeInlines $ xs ++ ys) : rest + rest -> case normalizeInlines xs of + [] -> rest + xs' -> SmallCaps xs' : rest +normalizeInlines (Strikeout xs : zs) = + case normalizeInlines zs of + (Strikeout ys : rest) -> normalizeInlines $ + Strikeout (normalizeInlines $ xs ++ ys) : rest + rest -> case normalizeInlines xs of + [] -> rest + xs' -> Strikeout xs' : rest +normalizeInlines (RawInline _ [] : ys) = normalizeInlines ys +normalizeInlines (RawInline f xs : zs) = + case normalizeInlines zs of + (RawInline f' ys : rest) | f == f' -> normalizeInlines $ + RawInline f (xs ++ ys) : rest + rest -> RawInline f xs : rest +normalizeInlines (Code _ "" : ys) = normalizeInlines ys +normalizeInlines (Code attr xs : zs) = + case normalizeInlines zs of + (Code attr' ys : rest) | attr == attr' -> normalizeInlines $ + Code attr (xs ++ ys) : rest + rest -> Code attr xs : rest +-- allow empty spans, they may carry identifiers etc. +-- normalizeInlines (Span _ [] : ys) = normalizeInlines ys +normalizeInlines (Span attr xs : zs) = + case normalizeInlines zs of + (Span attr' ys : rest) | attr == attr' -> normalizeInlines $ + Span attr (normalizeInlines $ xs ++ ys) : rest + rest -> Span attr (normalizeInlines xs) : rest +normalizeInlines (Note bs : ys) = Note (normalizeBlocks bs) : + normalizeInlines ys +normalizeInlines (Quoted qt ils : ys) = + Quoted qt (normalizeInlines ils) : normalizeInlines ys +normalizeInlines (Link ils t : ys) = + Link (normalizeInlines ils) t : normalizeInlines ys +normalizeInlines (Image ils t : ys) = + Image (normalizeInlines ils) t : normalizeInlines ys +normalizeInlines (Cite cs ils : ys) = + Cite cs (normalizeInlines ils) : normalizeInlines ys +normalizeInlines (x : xs) = x : normalizeInlines xs +normalizeInlines [] = [] + +-- | Extract inlines, removing formatting. +removeFormatting :: Walkable Inline a => a -> [Inline] +removeFormatting = query go . walk deNote + where go :: Inline -> [Inline] + go (Str xs) = [Str xs] + go Space = [Space] + go (Code _ x) = [Str x] + go (Math _ x) = [Str x] + go LineBreak = [Space] + go _ = [] + deNote (Note _) = Str "" + deNote x = x -- | Convert pandoc structure to a string with formatting removed. -- Footnotes are skipped (since we don't want their contents in link @@ -432,6 +531,17 @@ stringify = query go . walk deNote deNote (Note _) = Str "" deNote x = x +-- | Bring all regular text in a pandoc structure to uppercase. +-- +-- This function correctly handles cases where a lowercase character doesn't +-- match to a single uppercase character – e.g. “Straße” would be converted +-- to “STRASSE”, not “STRAßE”. +capitalize :: Walkable Inline a => a -> a +capitalize = walk go + where go :: Inline -> Inline + go (Str s) = Str (T.unpack $ T.toUpper $ T.pack s) + go x = x + -- | Change final list item from @Para@ to @Plain@ if the list contains -- no other @Para@ blocks. compactify :: [[Block]] -- ^ List of list items (each a list of blocks) @@ -463,20 +573,22 @@ compactify' items = _ -> items _ -> items --- | Like @compactify'@, but akts on items of definition lists. +-- | Like @compactify'@, but acts on items of definition lists. compactify'DL :: [(Inlines, [Blocks])] -> [(Inlines, [Blocks])] compactify'DL items = let defs = concatMap snd items - defBlocks = reverse $ concatMap B.toList defs - in case defBlocks of - (Para x:_) -> if not $ any isPara (drop 1 defBlocks) - then let (t,ds) = last items - lastDef = B.toList $ last ds - ds' = init ds ++ - [B.fromList $ init lastDef ++ [Plain x]] - in init items ++ [(t, ds')] - else items - _ -> items + in case reverse (concatMap B.toList defs) of + (Para x:xs) + | not (any isPara xs) -> + let (t,ds) = last items + lastDef = B.toList $ last ds + ds' = init ds ++ + if null lastDef + then [B.fromList lastDef] + else [B.fromList $ init lastDef ++ [Plain x]] + in init items ++ [(t, ds')] + | otherwise -> items + _ -> items isPara :: Block -> Bool isPara (Para _) = True @@ -669,28 +781,38 @@ readDataFileUTF8 userDir fname = -- Returns raw content and maybe mime type. fetchItem :: Maybe String -> String -> IO (Either E.SomeException (BS.ByteString, Maybe String)) -fetchItem sourceURL s - | isURI s = openURL s - | otherwise = - case sourceURL >>= parseURIReference of - Just u -> case parseURIReference s of - Just s' -> openURL $ show $ - s' `nonStrictRelativeTo` u - Nothing -> openURL $ show u ++ "/" ++ s - Nothing -> E.try readLocalFile +fetchItem sourceURL s = + case (sourceURL >>= parseURIReference . ensureEscaped, ensureEscaped s) of + (_, s') | isURI s' -> openURL s' + (Just u, s') -> -- try fetching from relative path at source + case parseURIReference s' of + Just u' -> openURL $ show $ u' `nonStrictRelativeTo` u + Nothing -> openURL s' -- will throw error + (Nothing, _) -> E.try readLocalFile -- get from local file system where readLocalFile = do - let mime = case takeExtension s of - ".gz" -> getMimeType $ dropExtension s - x -> getMimeType x - cont <- BS.readFile s + cont <- BS.readFile fp return (cont, mime) + dropFragmentAndQuery = takeWhile (\c -> c /= '?' && c /= '#') + fp = unEscapeString $ dropFragmentAndQuery s + mime = case takeExtension fp of + ".gz" -> getMimeType $ dropExtension fp + x -> getMimeType x + ensureEscaped = escapeURIString isAllowedInURI + +-- | Like 'fetchItem', but also looks for items in a 'MediaBag'. +fetchItem' :: MediaBag -> Maybe String -> String + -> IO (Either E.SomeException (BS.ByteString, Maybe String)) +fetchItem' media sourceURL s = do + case lookupMedia s media of + Nothing -> fetchItem sourceURL s + Just (mime, bs) -> return $ Right (BS.concat $ toChunks bs, Just mime) -- | Read from a URL and return raw data and maybe mime type. openURL :: String -> IO (Either E.SomeException (BS.ByteString, Maybe String)) openURL u - | "data:" `isPrefixOf` u = - let mime = takeWhile (/=',') $ drop 5 u - contents = B8.pack $ unEscapeString $ drop 1 $ dropWhile (/=',') u + | Just u' <- stripPrefix "data:" u = + let mime = takeWhile (/=',') u' + contents = B8.pack $ unEscapeString $ drop 1 $ dropWhile (/=',') u' in return $ Right (decodeLenient contents, Just mime) #ifdef HTTP_CLIENT | otherwise = withSocketsDo $ E.try $ do @@ -734,6 +856,29 @@ warn msg = do name <- getProgName UTF8.hPutStrLn stderr $ name ++ ": " ++ msg +-- | Remove intermediate "." and ".." directories from a path. +-- +-- @ +-- collapseFilePath "./foo" == "foo" +-- collapseFilePath "/bar/../baz" == "/baz" +-- collapseFilePath "/../baz" == "/../baz" +-- collapseFilePath "parent/foo/baz/../bar" == "parent/foo/bar" +-- collapseFilePath "parent/foo/baz/../../bar" == "parent/bar" +-- collapseFilePath "parent/foo/.." == "parent" +-- collapseFilePath "/parent/foo/../../bar" == "/bar" +-- @ +collapseFilePath :: FilePath -> FilePath +collapseFilePath = joinPath . reverse . foldl go [] . splitDirectories + where + go rs "." = rs + go r@(p:rs) ".." = case p of + ".." -> ("..":r) + "/" -> ("..":r) + _ -> rs + go _ "/" = ["/"] + go rs x = x:rs + + -- -- Safe read -- @@ -743,3 +888,15 @@ safeRead s = case reads s of (d,x):_ | all isSpace x -> return d _ -> fail $ "Could not read `" ++ s ++ "'" + +-- +-- Temp directory +-- + +withTempDir :: String -> (FilePath -> IO a) -> IO a +withTempDir = +#ifdef _WINDOWS + withTempDirectory "." +#else + withSystemTempDirectory +#endif |