aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/Docx.hs66
-rw-r--r--src/Text/Pandoc/Readers/Docx/Combine.hs154
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse.hs90
-rw-r--r--src/Text/Pandoc/Readers/Docx/Reducible.hs181
-rw-r--r--src/Text/Pandoc/Readers/EPUB.hs2
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs96
-rw-r--r--src/Text/Pandoc/Readers/LaTeX.hs29
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs54
-rw-r--r--src/Text/Pandoc/Readers/MediaWiki.hs5
-rw-r--r--src/Text/Pandoc/Readers/Odt/ContentReader.hs3
-rw-r--r--src/Text/Pandoc/Readers/Org.hs69
-rw-r--r--src/Text/Pandoc/Readers/Textile.hs21
12 files changed, 425 insertions, 345 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index 44f67ce75..604bc20de 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -50,8 +50,7 @@ implemented, [-] means partially implemented):
* Inlines
- [X] Str
- - [X] Emph (From italics. `underline` currently read as span. In
- future, it might optionally be emph as well)
+ - [X] Emph (italics and underline both read as Emph)
- [X] Strong
- [X] Strikeout
- [X] Superscript
@@ -62,16 +61,16 @@ implemented, [-] means partially implemented):
- [X] Code (styled with `VerbatimChar`)
- [X] Space
- [X] LineBreak (these are invisible in Word: entered with Shift-Return)
- - [ ] Math
+ - [X] Math
- [X] Link (links to an arbitrary bookmark create a span with the target as
id and "anchor" class)
- - [-] Image (Links to path in archive. Future option for
- data-encoded URI likely.)
+ - [X] Image
- [X] Note (Footnotes and Endnotes are silently combined.)
-}
module Text.Pandoc.Readers.Docx
- ( readDocx
+ ( readDocxWithWarnings
+ , readDocx
) where
import Codec.Archive.Zip
@@ -81,7 +80,7 @@ import Text.Pandoc.Builder
import Text.Pandoc.Walk
import Text.Pandoc.Readers.Docx.Parse
import Text.Pandoc.Readers.Docx.Lists
-import Text.Pandoc.Readers.Docx.Reducible
+import Text.Pandoc.Readers.Docx.Combine
import Text.Pandoc.Shared
import Text.Pandoc.MediaBag (insertMedia, MediaBag)
import Data.List (delete, (\\), intersect)
@@ -89,6 +88,7 @@ import Text.TeXMath (writeTeX)
import Data.Default (Default)
import qualified Data.ByteString.Lazy as B
import qualified Data.Map as M
+import qualified Data.Set as Set
import Control.Monad.Reader
import Control.Monad.State
import Data.Sequence (ViewL(..), viewl)
@@ -97,14 +97,22 @@ import qualified Data.Sequence as Seq (null)
import Text.Pandoc.Error
import Text.Pandoc.Compat.Except
+readDocxWithWarnings :: ReaderOptions
+ -> B.ByteString
+ -> Either PandocError (Pandoc, MediaBag, [String])
+readDocxWithWarnings opts bytes =
+ case archiveToDocxWithWarnings (toArchive bytes) of
+ Right (docx, warnings) -> do
+ (meta, blks, mediaBag) <- docxToOutput opts docx
+ return (Pandoc meta blks, mediaBag, warnings)
+ Left _ -> Left (ParseFailure "couldn't parse docx file")
+
readDocx :: ReaderOptions
-> B.ByteString
-> Either PandocError (Pandoc, MediaBag)
-readDocx opts bytes =
- case archiveToDocx (toArchive bytes) of
- Right docx -> (\(meta, blks, mediaBag) -> (Pandoc meta blks, mediaBag))
- <$> (docxToOutput opts docx)
- Left _ -> Left (ParseFailure "couldn't parse docx file")
+readDocx opts bytes = do
+ (pandoc, mediaBag, _) <- readDocxWithWarnings opts bytes
+ return (pandoc, mediaBag)
data DState = DState { docxAnchorMap :: M.Map String String
, docxMediaBag :: MediaBag
@@ -166,7 +174,7 @@ bodyPartsToMeta' (bp : bps)
| (Paragraph pPr parParts) <- bp
, (c : _)<- intersect (pStyle pPr) (M.keys metaStyles)
, (Just metaField) <- M.lookup c metaStyles = do
- inlines <- concatReduce <$> mapM parPartToInlines parParts
+ inlines <- smushInlines <$> mapM parPartToInlines parParts
remaining <- bodyPartsToMeta' bps
let
f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils']
@@ -290,13 +298,13 @@ runToInlines (Run rs runElems)
Just SubScrpt -> subscript codeString
_ -> codeString
| otherwise = do
- let ils = concatReduce (map runElemToInlines runElems)
+ let ils = smushInlines (map runElemToInlines runElems)
return $ (runStyleToTransform $ resolveDependentRunStyle rs) ils
runToInlines (Footnote bps) = do
- blksList <- concatReduce <$> (mapM bodyPartToBlocks bps)
+ blksList <- smushBlocks <$> (mapM bodyPartToBlocks bps)
return $ note blksList
runToInlines (Endnote bps) = do
- blksList <- concatReduce <$> (mapM bodyPartToBlocks bps)
+ blksList <- smushBlocks <$> (mapM bodyPartToBlocks bps)
return $ note blksList
runToInlines (InlineDrawing fp bs ext) = do
mediaBag <- gets docxMediaBag
@@ -315,19 +323,19 @@ parPartToInlines (PlainRun r) = runToInlines r
parPartToInlines (Insertion _ author date runs) = do
opts <- asks docxOptions
case readerTrackChanges opts of
- AcceptChanges -> concatReduce <$> mapM runToInlines runs
+ AcceptChanges -> smushInlines <$> mapM runToInlines runs
RejectChanges -> return mempty
AllChanges -> do
- ils <- concatReduce <$> mapM runToInlines runs
+ ils <- smushInlines <$> mapM runToInlines runs
let attr = ("", ["insertion"], [("author", author), ("date", date)])
return $ spanWith attr ils
parPartToInlines (Deletion _ author date runs) = do
opts <- asks docxOptions
case readerTrackChanges opts of
AcceptChanges -> return mempty
- RejectChanges -> concatReduce <$> mapM runToInlines runs
+ RejectChanges -> smushInlines <$> mapM runToInlines runs
AllChanges -> do
- ils <- concatReduce <$> mapM runToInlines runs
+ ils <- smushInlines <$> mapM runToInlines runs
let attr = ("", ["deletion"], [("author", author), ("date", date)])
return $ spanWith attr ils
parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors =
@@ -350,7 +358,7 @@ parPartToInlines (BookMark _ anchor) =
-- avoid an extra pass.
let newAnchor =
if not inHdrBool && anchor `elem` (M.elems anchorMap)
- then uniqueIdent [Str anchor] (M.elems anchorMap)
+ then uniqueIdent [Str anchor] (Set.fromList $ M.elems anchorMap)
else anchor
unless inHdrBool
(modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap})
@@ -360,10 +368,10 @@ parPartToInlines (Drawing fp bs ext) = do
modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag }
return $ imageWith (extentToAttr ext) fp "" ""
parPartToInlines (InternalHyperLink anchor runs) = do
- ils <- concatReduce <$> mapM runToInlines runs
+ ils <- smushInlines <$> mapM runToInlines runs
return $ link ('#' : anchor) "" ils
parPartToInlines (ExternalHyperLink target runs) = do
- ils <- concatReduce <$> mapM runToInlines runs
+ ils <- smushInlines <$> mapM runToInlines runs
return $ link target "" ils
parPartToInlines (PlainOMath exps) = do
return $ math $ writeTeX exps
@@ -393,7 +401,7 @@ makeHeaderAnchor' (Header n (_, classes, kvs) ils)
| (c:cs) <- filter isAnchorSpan ils
, (Span (ident, ["anchor"], _) _) <- c = do
hdrIDMap <- gets docxAnchorMap
- let newIdent = uniqueIdent ils (M.elems hdrIDMap)
+ let newIdent = uniqueIdent ils (Set.fromList $ M.elems hdrIDMap)
modify $ \s -> s {docxAnchorMap = M.insert ident newIdent hdrIDMap}
return $ Header n (newIdent, classes, kvs) (ils \\ (c:cs))
-- Otherwise we just give it a name, and register that name (associate
@@ -401,7 +409,7 @@ makeHeaderAnchor' (Header n (_, classes, kvs) ils)
makeHeaderAnchor' (Header n (_, classes, kvs) ils) =
do
hdrIDMap <- gets docxAnchorMap
- let newIdent = uniqueIdent ils (M.elems hdrIDMap)
+ let newIdent = uniqueIdent ils (Set.fromList $ M.elems hdrIDMap)
modify $ \s -> s {docxAnchorMap = M.insert newIdent newIdent hdrIDMap}
return $ Header n (newIdent, classes, kvs) ils
makeHeaderAnchor' blk = return blk
@@ -416,7 +424,7 @@ singleParaToPlain blks = blks
cellToBlocks :: Cell -> DocxContext Blocks
cellToBlocks (Cell bps) = do
- blks <- concatReduce <$> mapM bodyPartToBlocks bps
+ blks <- smushBlocks <$> mapM bodyPartToBlocks bps
return $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks
rowToBlocksList :: Row -> DocxContext [Blocks]
@@ -478,11 +486,11 @@ bodyPartToBlocks (Paragraph pPr parparts)
$ concatMap parPartToString parparts
| Just (style, n) <- pHeading pPr = do
ils <- local (\s-> s{docxInHeaderBlock=True}) $
- (concatReduce <$> mapM parPartToInlines parparts)
+ (smushInlines <$> mapM parPartToInlines parparts)
makeHeaderAnchor $
headerWith ("", delete style (pStyle pPr), []) n ils
| otherwise = do
- ils <- concatReduce <$> mapM parPartToInlines parparts >>=
+ ils <- smushInlines <$> mapM parPartToInlines parparts >>=
(return . fromList . trimLineBreaks . normalizeSpaces . toList)
dropIls <- gets docxDropCap
let ils' = dropIls <> ils
@@ -560,7 +568,7 @@ bodyToOutput :: Body -> DocxContext (Meta, [Block], MediaBag)
bodyToOutput (Body bps) = do
let (metabps, blkbps) = sepBodyParts bps
meta <- bodyPartsToMeta metabps
- blks <- concatReduce <$> mapM bodyPartToBlocks blkbps
+ blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps
blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks
mediaBag <- gets docxMediaBag
return $ (meta,
diff --git a/src/Text/Pandoc/Readers/Docx/Combine.hs b/src/Text/Pandoc/Readers/Docx/Combine.hs
new file mode 100644
index 000000000..39e0df825
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Docx/Combine.hs
@@ -0,0 +1,154 @@
+{-# LANGUAGE TypeSynonymInstances, FlexibleInstances,
+ PatternGuards #-}
+
+module Text.Pandoc.Readers.Docx.Combine ( smushInlines
+ , smushBlocks
+ )
+ where
+
+import Text.Pandoc.Builder
+import Data.List
+import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr, (><), (|>))
+import qualified Data.Sequence as Seq (null)
+
+data Modifier a = Modifier (a -> a)
+ | AttrModifier (Attr -> a -> a) Attr
+ | NullModifier
+
+spaceOutInlinesL :: Inlines -> (Inlines, Inlines)
+spaceOutInlinesL ms = (l, stackInlines fs (m' <> r))
+ where (l, m, r) = spaceOutInlines ms
+ (fs, m') = unstackInlines m
+
+spaceOutInlinesR :: Inlines -> (Inlines, Inlines)
+spaceOutInlinesR ms = (stackInlines fs (l <> m'), r)
+ where (l, m, r) = spaceOutInlines ms
+ (fs, m') = unstackInlines m
+
+spaceOutInlines :: Inlines -> (Inlines, Inlines, Inlines)
+spaceOutInlines ils =
+ let (fs, ils') = unstackInlines ils
+ contents = unMany ils'
+ left = case viewl contents of
+ (Space :< _) -> space
+ _ -> mempty
+ right = case viewr contents of
+ (_ :> Space) -> space
+ _ -> mempty in
+ (left, (stackInlines fs $ trimInlines . Many $ contents), right)
+
+stackInlines :: [Modifier Inlines] -> Inlines -> Inlines
+stackInlines [] ms = ms
+stackInlines (NullModifier : fs) ms = stackInlines fs ms
+stackInlines ((Modifier f) : fs) ms =
+ if isEmpty ms
+ then stackInlines fs ms
+ else f $ stackInlines fs ms
+stackInlines ((AttrModifier f attr) : fs) ms = f attr $ stackInlines fs ms
+
+unstackInlines :: Inlines -> ([Modifier Inlines], Inlines)
+unstackInlines ms = case ilModifier ms of
+ NullModifier -> ([], ms)
+ _ -> (f : fs, ms') where
+ f = ilModifier ms
+ (fs, ms') = unstackInlines $ ilInnards ms
+
+ilModifier :: Inlines -> Modifier Inlines
+ilModifier ils = case viewl (unMany ils) of
+ (x :< xs) | Seq.null xs -> case x of
+ (Emph _) -> Modifier emph
+ (Strong _) -> Modifier strong
+ (SmallCaps _) -> Modifier smallcaps
+ (Strikeout _) -> Modifier strikeout
+ (Superscript _) -> Modifier superscript
+ (Subscript _) -> Modifier subscript
+ (Link attr _ tgt) -> Modifier $ linkWith attr (fst tgt) (snd tgt)
+ (Span attr _) -> AttrModifier spanWith attr
+ _ -> NullModifier
+ _ -> NullModifier
+
+ilInnards :: Inlines -> Inlines
+ilInnards ils = case viewl (unMany ils) of
+ (x :< xs) | Seq.null xs -> case x of
+ (Emph lst) -> fromList lst
+ (Strong lst) -> fromList lst
+ (SmallCaps lst) -> fromList lst
+ (Strikeout lst) -> fromList lst
+ (Superscript lst) -> fromList lst
+ (Subscript lst) -> fromList lst
+ (Link _ lst _) -> fromList lst
+ (Span _ lst) -> fromList lst
+ _ -> ils
+ _ -> ils
+
+inlinesL :: Inlines -> (Inlines, Inlines)
+inlinesL ils = case viewl $ unMany ils of
+ (s :< sq) -> (singleton s, Many sq)
+ _ -> (mempty, ils)
+
+inlinesR :: Inlines -> (Inlines, Inlines)
+inlinesR ils = case viewr $ unMany ils of
+ (sq :> s) -> (Many sq, singleton s)
+ _ -> (ils, mempty)
+
+combineInlines :: Inlines -> Inlines -> Inlines
+combineInlines x y =
+ let (xs', x') = inlinesR x
+ (y', ys') = inlinesL y
+ in
+ xs' <> (combineSingletonInlines x' y') <> ys'
+
+combineSingletonInlines :: Inlines -> Inlines -> Inlines
+combineSingletonInlines x y =
+ let (xfs, xs) = unstackInlines x
+ (yfs, ys) = unstackInlines y
+ shared = xfs `intersect` yfs
+ x_remaining = xfs \\ shared
+ y_remaining = yfs \\ shared
+ x_rem_attr = filter isAttrModifier x_remaining
+ y_rem_attr = filter isAttrModifier y_remaining
+ in
+ case null shared of
+ True | isEmpty xs && isEmpty ys ->
+ stackInlines (x_rem_attr ++ y_rem_attr) mempty
+ | isEmpty xs ->
+ let (sp, y') = spaceOutInlinesL y in
+ (stackInlines x_rem_attr mempty) <> sp <> y'
+ | isEmpty ys ->
+ let (x', sp) = spaceOutInlinesR x in
+ x' <> sp <> (stackInlines y_rem_attr mempty)
+ | otherwise ->
+ let (x', xsp) = spaceOutInlinesR x
+ (ysp, y') = spaceOutInlinesL y
+ in
+ x' <> xsp <> ysp <> y'
+ False -> stackInlines shared $
+ combineInlines
+ (stackInlines x_remaining xs)
+ (stackInlines y_remaining ys)
+
+combineBlocks :: Blocks -> Blocks -> Blocks
+combineBlocks bs cs
+ | bs' :> (BlockQuote bs'') <- viewr (unMany bs)
+ , (BlockQuote cs'') :< cs' <- viewl (unMany cs) =
+ Many $ (bs' |> (BlockQuote (bs'' <> cs''))) >< cs'
+combineBlocks bs cs = bs <> cs
+
+instance (Monoid a, Eq a) => Eq (Modifier a) where
+ (Modifier f) == (Modifier g) = (f mempty == g mempty)
+ (AttrModifier f attr) == (AttrModifier g attr') = (f attr mempty == g attr' mempty)
+ (NullModifier) == (NullModifier) = True
+ _ == _ = False
+
+isEmpty :: (Monoid a, Eq a) => a -> Bool
+isEmpty x = x == mempty
+
+isAttrModifier :: Modifier a -> Bool
+isAttrModifier (AttrModifier _ _) = True
+isAttrModifier _ = False
+
+smushInlines :: [Inlines] -> Inlines
+smushInlines xs = foldl combineInlines mempty xs
+
+smushBlocks :: [Blocks] -> Blocks
+smushBlocks xs = foldl combineBlocks mempty xs
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index eec8b12c9..364483929 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -50,6 +50,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
, Row(..)
, Cell(..)
, archiveToDocx
+ , archiveToDocxWithWarnings
) where
import Codec.Archive.Zip
import Text.XML.Light
@@ -60,6 +61,7 @@ import Data.Bits ((.|.))
import qualified Data.ByteString.Lazy as B
import qualified Text.Pandoc.UTF8 as UTF8
import Control.Monad.Reader
+import Control.Monad.State
import Control.Applicative ((<|>))
import qualified Data.Map as M
import Text.Pandoc.Compat.Except
@@ -81,16 +83,20 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes
}
deriving Show
+data ReaderState = ReaderState { stateWarnings :: [String] }
+ deriving Show
+
+
data DocxError = DocxError | WrongElem
deriving Show
instance Error DocxError where
noMsg = WrongElem
-type D = ExceptT DocxError (Reader ReaderEnv)
+type D = ExceptT DocxError (ReaderT ReaderEnv (State ReaderState))
-runD :: D a -> ReaderEnv -> Either DocxError a
-runD dx re = runReader (runExceptT dx) re
+runD :: D a -> ReaderEnv -> ReaderState -> (Either DocxError a, ReaderState)
+runD dx re rs = runState (runReaderT (runExceptT dx) re) rs
maybeToD :: Maybe a -> D a
maybeToD (Just a) = return a
@@ -257,7 +263,10 @@ type Author = String
type ChangeDate = String
archiveToDocx :: Archive -> Either DocxError Docx
-archiveToDocx archive = do
+archiveToDocx archive = fst <$> archiveToDocxWithWarnings archive
+
+archiveToDocxWithWarnings :: Archive -> Either DocxError (Docx, [String])
+archiveToDocxWithWarnings archive = do
let notes = archiveToNotes archive
numbering = archiveToNumbering archive
rels = archiveToRelationships archive
@@ -265,8 +274,12 @@ archiveToDocx archive = do
(styles, parstyles) = archiveToStyles archive
rEnv =
ReaderEnv notes numbering rels media Nothing styles parstyles InDocument
- doc <- runD (archiveToDocument archive) rEnv
- return $ Docx doc
+ rState = ReaderState { stateWarnings = [] }
+ (eitherDoc, st) = runD (archiveToDocument archive) rEnv rState
+ case eitherDoc of
+ Right doc -> Right (Docx doc, stateWarnings st)
+ Left e -> Left e
+
archiveToDocument :: Archive -> D Document
@@ -576,12 +589,14 @@ elemToBodyPart ns element
sty <- asks envParStyles
let parstyle = elemToParagraphStyle ns element sty
parparts <- mapD (elemToParPart ns) (elChildren element)
- case pNumInfo parstyle of
- Just (numId, lvl) -> do
- num <- asks envNumbering
- let levelInfo = lookupLevel numId lvl num
- return $ ListItem parstyle numId lvl levelInfo parparts
- Nothing -> return $ Paragraph parstyle parparts
+ -- Word uses list enumeration for numbered headings, so we only
+ -- want to infer a list from the styles if it is NOT a heading.
+ case pHeading parstyle of
+ Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do
+ num <- asks envNumbering
+ let levelInfo = lookupLevel numId lvl num
+ return $ ListItem parstyle numId lvl levelInfo parparts
+ _ -> return $ Paragraph parstyle parparts
elemToBodyPart ns element
| isElem ns "w" "tbl" element = do
let caption' = findChild (elemName ns "w" "tblPr") element
@@ -702,36 +717,58 @@ elemToExtent drawingElem =
getDim at = findElement (QName "extent" (Just wp_ns) (Just "wp")) drawingElem
>>= findAttr (QName at Nothing Nothing) >>= safeRead
-elemToRun :: NameSpaces -> Element -> D Run
-elemToRun ns element
- | isElem ns "w" "r" element
- , Just drawingElem <- findChild (elemName ns "w" "drawing") element =
+
+childElemToRun :: NameSpaces -> Element -> D Run
+childElemToRun ns element
+ | isElem ns "w" "drawing" element =
let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
- drawing = findElement (QName "blip" (Just a_ns) (Just "a")) drawingElem
+ drawing = findElement (QName "blip" (Just a_ns) (Just "a")) element
>>= findAttr (QName "embed" (lookup "r" ns) (Just "r"))
in
case drawing of
Just s -> expandDrawingId s >>=
- (\(fp, bs) -> return $ InlineDrawing fp bs $ elemToExtent drawingElem)
+ (\(fp, bs) -> return $ InlineDrawing fp bs $ elemToExtent element)
Nothing -> throwError WrongElem
-elemToRun ns element
- | isElem ns "w" "r" element
- , Just ref <- findChild (elemName ns "w" "footnoteReference") element
- , Just fnId <- findAttr (elemName ns "w" "id") ref = do
+childElemToRun ns element
+ | isElem ns "w" "footnoteReference" element
+ , Just fnId <- findAttr (elemName ns "w" "id") element = do
notes <- asks envNotes
case lookupFootnote fnId notes of
Just e -> do bps <- local (\r -> r {envLocation=InFootnote}) $ mapD (elemToBodyPart ns) (elChildren e)
return $ Footnote bps
Nothing -> return $ Footnote []
-elemToRun ns element
- | isElem ns "w" "r" element
- , Just ref <- findChild (elemName ns "w" "endnoteReference") element
- , Just enId <- findAttr (elemName ns "w" "id") ref = do
+childElemToRun ns element
+ | isElem ns "w" "endnoteReference" element
+ , Just enId <- findAttr (elemName ns "w" "id") element = do
notes <- asks envNotes
case lookupEndnote enId notes of
Just e -> do bps <- local (\r -> r {envLocation=InEndnote}) $ mapD (elemToBodyPart ns) (elChildren e)
return $ Endnote bps
Nothing -> return $ Endnote []
+childElemToRun _ _ = throwError WrongElem
+
+elemToRun :: NameSpaces -> Element -> D Run
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just altCont <- findChild (elemName ns "mc" "AlternateContent") element =
+ do let choices = findChildren (elemName ns "mc" "Choice") altCont
+ choiceChildren = map head $ filter (not . null) $ map elChildren choices
+ outputs <- mapD (childElemToRun ns) choiceChildren
+ case outputs of
+ r : _ -> return r
+ [] -> throwError WrongElem
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just drawingElem <- findChild (elemName ns "w" "drawing") element =
+ childElemToRun ns drawingElem
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just ref <- findChild (elemName ns "w" "footnoteReference") element =
+ childElemToRun ns ref
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just ref <- findChild (elemName ns "w" "endnoteReference") element =
+ childElemToRun ns ref
elemToRun ns element
| isElem ns "w" "r" element = do
runElems <- elemToRunElems ns element
@@ -940,3 +977,4 @@ elemToRunElems _ _ = throwError WrongElem
setFont :: Maybe Font -> ReaderEnv -> ReaderEnv
setFont f s = s{envFont = f}
+
diff --git a/src/Text/Pandoc/Readers/Docx/Reducible.hs b/src/Text/Pandoc/Readers/Docx/Reducible.hs
deleted file mode 100644
index c93b40119..000000000
--- a/src/Text/Pandoc/Readers/Docx/Reducible.hs
+++ /dev/null
@@ -1,181 +0,0 @@
-{-# LANGUAGE TypeSynonymInstances, FlexibleInstances,
- PatternGuards #-}
-
-module Text.Pandoc.Readers.Docx.Reducible ( concatReduce
- , (<+>)
- )
- where
-
-
-import Text.Pandoc.Builder
-import Data.List
-import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr)
-import qualified Data.Sequence as Seq (null)
-
-data Modifier a = Modifier (a -> a)
- | AttrModifier (Attr -> a -> a) Attr
- | NullModifier
-
-class (Eq a) => Modifiable a where
- modifier :: a -> Modifier a
- innards :: a -> a
- getL :: a -> (a, a)
- getR :: a -> (a, a)
- spaceOut :: a -> (a, a, a)
-
-spaceOutL :: (Monoid a, Modifiable a) => a -> (a, a)
-spaceOutL ms = (l, stack fs (m' <> r))
- where (l, m, r) = spaceOut ms
- (fs, m') = unstack m
-
-spaceOutR :: (Monoid a, Modifiable a) => a -> (a, a)
-spaceOutR ms = (stack fs (l <> m'), r)
- where (l, m, r) = spaceOut ms
- (fs, m') = unstack m
-
-instance (Monoid a, Show a) => Show (Modifier a) where
- show (Modifier f) = show $ f mempty
- show (AttrModifier f attr) = show $ f attr mempty
- show (NullModifier) = "NullModifier"
-
-instance (Monoid a, Eq a) => Eq (Modifier a) where
- (Modifier f) == (Modifier g) = (f mempty == g mempty)
- (AttrModifier f attr) == (AttrModifier g attr') = (f attr mempty == g attr' mempty)
- (NullModifier) == (NullModifier) = True
- _ == _ = False
-
-instance Modifiable Inlines where
- modifier ils = case viewl (unMany ils) of
- (x :< xs) | Seq.null xs -> case x of
- (Emph _) -> Modifier emph
- (Strong _) -> Modifier strong
- (SmallCaps _) -> Modifier smallcaps
- (Strikeout _) -> Modifier strikeout
- (Superscript _) -> Modifier superscript
- (Subscript _) -> Modifier subscript
- (Span attr _) -> AttrModifier spanWith attr
- _ -> NullModifier
- _ -> NullModifier
-
- innards ils = case viewl (unMany ils) of
- (x :< xs) | Seq.null xs -> case x of
- (Emph lst) -> fromList lst
- (Strong lst) -> fromList lst
- (SmallCaps lst) -> fromList lst
- (Strikeout lst) -> fromList lst
- (Superscript lst) -> fromList lst
- (Subscript lst) -> fromList lst
- (Span _ lst) -> fromList lst
- _ -> ils
- _ -> ils
-
- getL ils = case viewl $ unMany ils of
- (s :< sq) -> (singleton s, Many sq)
- _ -> (mempty, ils)
-
- getR ils = case viewr $ unMany ils of
- (sq :> s) -> (Many sq, singleton s)
- _ -> (ils, mempty)
-
- spaceOut ils =
- let (fs, ils') = unstack ils
- contents = unMany ils'
- left = case viewl contents of
- (Space :< _) -> space
- _ -> mempty
- right = case viewr contents of
- (_ :> Space) -> space
- _ -> mempty in
- (left, (stack fs $ trimInlines .Many $ contents), right)
-
-instance Modifiable Blocks where
- modifier blks = case viewl (unMany blks) of
- (x :< xs) | Seq.null xs -> case x of
- (BlockQuote _) -> Modifier blockQuote
- -- (Div attr _) -> AttrModifier divWith attr
- _ -> NullModifier
- _ -> NullModifier
-
- innards blks = case viewl (unMany blks) of
- (x :< xs) | Seq.null xs -> case x of
- (BlockQuote lst) -> fromList lst
- -- (Div attr lst) -> fromList lst
- _ -> blks
- _ -> blks
-
- spaceOut blks = (mempty, blks, mempty)
-
- getL ils = case viewl $ unMany ils of
- (s :< sq) -> (singleton s, Many sq)
- _ -> (mempty, ils)
-
- getR ils = case viewr $ unMany ils of
- (sq :> s) -> (Many sq, singleton s)
- _ -> (ils, mempty)
-
-
-unstack :: (Modifiable a) => a -> ([Modifier a], a)
-unstack ms = case modifier ms of
- NullModifier -> ([], ms)
- _ -> (f : fs, ms') where
- f = modifier ms
- (fs, ms') = unstack $ innards ms
-
-stack :: (Monoid a, Modifiable a) => [Modifier a] -> a -> a
-stack [] ms = ms
-stack (NullModifier : fs) ms = stack fs ms
-stack ((Modifier f) : fs) ms =
- if isEmpty ms
- then stack fs ms
- else f $ stack fs ms
-stack ((AttrModifier f attr) : fs) ms = f attr $ stack fs ms
-
-isEmpty :: (Monoid a, Eq a) => a -> Bool
-isEmpty x = x == mempty
-
-
-combine :: (Monoid a, Modifiable a, Eq a) => a -> a -> a
-combine x y =
- let (xs', x') = getR x
- (y', ys') = getL y
- in
- xs' <> (combineSingleton x' y') <> ys'
-
-isAttrModifier :: Modifier a -> Bool
-isAttrModifier (AttrModifier _ _) = True
-isAttrModifier _ = False
-
-combineSingleton :: (Monoid a, Modifiable a, Eq a) => a -> a -> a
-combineSingleton x y =
- let (xfs, xs) = unstack x
- (yfs, ys) = unstack y
- shared = xfs `intersect` yfs
- x_remaining = xfs \\ shared
- y_remaining = yfs \\ shared
- x_rem_attr = filter isAttrModifier x_remaining
- y_rem_attr = filter isAttrModifier y_remaining
- in
- case null shared of
- True | isEmpty xs && isEmpty ys ->
- stack (x_rem_attr ++ y_rem_attr) mempty
- | isEmpty xs ->
- let (sp, y') = spaceOutL y in
- (stack x_rem_attr mempty) <> sp <> y'
- | isEmpty ys ->
- let (x', sp) = spaceOutR x in
- x' <> sp <> (stack y_rem_attr mempty)
- | otherwise ->
- let (x', xsp) = spaceOutR x
- (ysp, y') = spaceOutL y
- in
- x' <> xsp <> ysp <> y'
- False -> stack shared $
- combine
- (stack x_remaining xs)
- (stack y_remaining ys)
-
-(<+>) :: (Monoid a, Modifiable a, Eq a) => a -> a -> a
-x <+> y = combine x y
-
-concatReduce :: (Monoid a, Modifiable a) => [a] -> a
-concatReduce xs = foldl combine mempty xs
diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs
index 79aa540f6..07d282708 100644
--- a/src/Text/Pandoc/Readers/EPUB.hs
+++ b/src/Text/Pandoc/Readers/EPUB.hs
@@ -30,7 +30,7 @@ import Control.Monad (guard, liftM, when)
import Data.List (isPrefixOf, isInfixOf)
import Data.Maybe (mapMaybe, fromMaybe)
import qualified Data.Map as M (Map, lookup, fromList, elems)
-import Control.DeepSeq.Generics (deepseq, NFData)
+import Control.DeepSeq (deepseq, NFData)
import Debug.Trace (trace)
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index a34e2fb5c..959a2d16f 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -43,7 +43,7 @@ import Text.HTML.TagSoup.Match
import Text.Pandoc.Definition
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..))
-import Text.Pandoc.Shared ( extractSpaces, renderTags'
+import Text.Pandoc.Shared ( extractSpaces, renderTags', addMetaField
, escapeURI, safeRead, mapLeft )
import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace)
, Extension (Ext_epub_html_exts,
@@ -52,9 +52,9 @@ import Text.Pandoc.Parsing hiding ((<|>))
import Text.Pandoc.Walk
import qualified Data.Map as M
import Data.Maybe ( fromMaybe, isJust)
-import Data.List ( intercalate, isInfixOf, isPrefixOf, isSuffixOf )
+import Data.List ( intercalate, isInfixOf, isPrefixOf )
import Data.Char ( isDigit )
-import Control.Monad ( liftM, guard, when, mzero, void, unless )
+import Control.Monad ( guard, when, mzero, void, unless )
import Control.Arrow ((***))
import Control.Applicative ( (<|>) )
import Data.Monoid (First (..))
@@ -63,12 +63,12 @@ import Debug.Trace (trace)
import Text.TeXMath (readMathML, writeTeX)
import Data.Default (Default (..), def)
import Control.Monad.Reader (Reader,ask, asks, local, runReader)
-import Network.URI (isURI)
+import Network.URI (URI, parseURIReference, nonStrictRelativeTo)
import Text.Pandoc.Error
import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
import Text.Pandoc.Compat.Monoid ((<>))
import Text.Parsec.Error
-
+import qualified Data.Set as Set
-- | Convert HTML-formatted string to 'Pandoc' document.
readHtml :: ReaderOptions -- ^ Reader options
@@ -77,7 +77,7 @@ readHtml :: ReaderOptions -- ^ Reader options
readHtml opts inp =
mapLeft (ParseFailure . getError) . flip runReader def $
runParserT parseDoc
- (HTMLState def{ stateOptions = opts } [] Nothing [] M.empty)
+ (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty)
"source" tags
where tags = stripPrefixes . canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True } inp
@@ -103,8 +103,8 @@ data HTMLState =
HTMLState
{ parserState :: ParserState,
noteTable :: [(String, Blocks)],
- baseHref :: Maybe String,
- identifiers :: [String],
+ baseHref :: Maybe URI,
+ identifiers :: Set.Set String,
headerMap :: M.Map Inlines String
}
@@ -137,19 +137,17 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag
then return mempty
else do
let content = fromAttrib "content" mt
- updateState $ B.setMeta name (B.text content)
+ updateState $ \s ->
+ let ps = parserState s in
+ s{ parserState = ps{
+ stateMeta = addMetaField name (B.text content)
+ (stateMeta ps) } }
return mempty
pBaseTag = do
bt <- pSatisfy (~== TagOpen "base" [])
- let baseH = fromAttrib "href" bt
- if null baseH
- then return mempty
- else do
- let baseH' = case reverse baseH of
- '/':_ -> baseH
- _ -> baseH ++ "/"
- updateState $ \st -> st{ baseHref = Just baseH' }
- return mempty
+ updateState $ \st -> st{ baseHref =
+ parseURIReference $ fromAttrib "href" bt }
+ return mempty
block :: TagParser Blocks
block = do
@@ -441,6 +439,7 @@ pTable = try $ do
-- fail on empty table
guard $ not $ null head' && null rows
let isSinglePlain x = case B.toList x of
+ [] -> True
[Plain _] -> True
_ -> False
let isSimple = all isSinglePlain $ concat (head':rows)
@@ -605,9 +604,9 @@ pLink = try $ do
tag <- pSatisfy $ tagOpenLit "a" (const True)
mbBaseHref <- baseHref <$> getState
let url' = fromAttrib "href" tag
- let url = case (isURI url', mbBaseHref) of
- (False, Just h) -> h ++ url'
- _ -> url'
+ let url = case (parseURIReference url', mbBaseHref) of
+ (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs)
+ _ -> url'
let title = fromAttrib "title" tag
let uid = fromAttrib "id" tag
let cls = words $ fromAttrib "class" tag
@@ -619,9 +618,9 @@ pImage = do
tag <- pSelfClosing (=="img") (isJust . lookup "src")
mbBaseHref <- baseHref <$> getState
let url' = fromAttrib "src" tag
- let url = case (isURI url', mbBaseHref) of
- (False, Just h) -> h ++ url'
- _ -> url'
+ let url = case (parseURIReference url', mbBaseHref) of
+ (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs)
+ _ -> url'
let title = fromAttrib "title" tag
let alt = fromAttrib "alt" tag
let uid = fromAttrib "id" tag
@@ -925,14 +924,45 @@ htmlInBalanced :: (Monad m)
=> (Tag String -> Bool)
-> ParserT String st m String
htmlInBalanced f = try $ do
- (TagOpen t _, tag) <- htmlTag f
- guard $ not $ "/>" `isSuffixOf` tag -- not a self-closing tag
- let stopper = htmlTag (~== TagClose t)
- let anytag = snd <$> htmlTag (const True)
- contents <- many $ notFollowedBy' stopper >>
- (htmlInBalanced f <|> anytag <|> count 1 anyChar)
- endtag <- liftM snd stopper
- return $ tag ++ concat contents ++ endtag
+ lookAhead (char '<')
+ inp <- getInput
+ let ts = canonicalizeTags $
+ parseTagsOptions parseOptions{ optTagWarning = True,
+ optTagPosition = True } inp
+ case ts of
+ (TagPosition sr sc : t@(TagOpen tn _) : rest) -> do
+ guard $ f t
+ guard $ not $ hasTagWarning (t : take 1 rest)
+ case htmlInBalanced' tn (t:rest) of
+ [] -> mzero
+ xs -> case reverse xs of
+ (TagClose _ : TagPosition er ec : _) -> do
+ let ls = er - sr
+ let cs = ec - sc
+ lscontents <- concat <$> count ls anyLine
+ cscontents <- count cs anyChar
+ (_,closetag) <- htmlTag (~== TagClose tn)
+ return (lscontents ++ cscontents ++ closetag)
+ _ -> mzero
+ _ -> mzero
+
+htmlInBalanced' :: String
+ -> [Tag String]
+ -> [Tag String]
+htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts
+ where go :: Int -> [Tag String] -> Maybe [Tag String]
+ go n (t@(TagOpen tn' _):rest) | tn' == tagname =
+ (t :) <$> go (n + 1) rest
+ go 1 (t@(TagClose tn'):_) | tn' == tagname =
+ return [t]
+ go n (t@(TagClose tn'):rest) | tn' == tagname =
+ (t :) <$> go (n - 1) rest
+ go n (t:ts') = (t :) <$> go n ts'
+ go _ [] = mzero
+
+hasTagWarning :: [Tag String] -> Bool
+hasTagWarning (TagWarning _:_) = True
+hasTagWarning _ = False
-- | Matches a tag meeting a certain condition.
htmlTag :: Monad m
@@ -941,8 +971,6 @@ htmlTag :: Monad m
htmlTag f = try $ do
lookAhead (char '<')
inp <- getInput
- let hasTagWarning (TagWarning _:_) = True
- hasTagWarning _ = False
let (next : rest) = canonicalizeTags $ parseTagsOptions
parseOptions{ optTagWarning = True } inp
guard $ f next
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
index d2e8d9d17..2be55c9da 100644
--- a/src/Text/Pandoc/Readers/LaTeX.hs
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -166,10 +166,18 @@ mathInline :: LP String -> LP Inlines
mathInline p = math <$> (try p >>= applyMacros')
mathChars :: LP String
-mathChars = (concat <$>) $
- many $
- many1 (satisfy (\c -> c /= '$' && c /='\\'))
- <|> (\c -> ['\\',c]) <$> try (char '\\' *> anyChar)
+mathChars =
+ concat <$> many (escapedChar
+ <|> (snd <$> withRaw braced)
+ <|> many1 (satisfy isOrdChar))
+ where escapedChar = try $ do char '\\'
+ c <- anyChar
+ return ['\\',c]
+ isOrdChar '$' = False
+ isOrdChar '{' = False
+ isOrdChar '}' = False
+ isOrdChar '\\' = False
+ isOrdChar _ = True
quoted' :: (Inlines -> Inlines) -> LP String -> LP () -> LP Inlines
quoted' f starter ender = do
@@ -179,10 +187,11 @@ quoted' f starter ender = do
then do
ils <- many (notFollowedBy ender >> inline)
(ender >> return (f (mconcat ils))) <|>
- lit (case startchs of
- "``" -> "“"
- "`" -> "‘"
- _ -> startchs)
+ (<> mconcat ils) <$>
+ lit (case startchs of
+ "``" -> "“"
+ "`" -> "‘"
+ _ -> startchs)
else lit startchs
doubleQuote :: LP Inlines
@@ -421,7 +430,8 @@ inlineCommand = try $ do
else if parseRaw
then return $ rawInline "latex" rawcommand
else return mempty
- lookupListDefault mzero [name',name] inlineCommands
+ (lookupListDefault mzero [name',name] inlineCommands <*
+ optional (try (string "{}")))
<|> raw
unlessParseRaw :: LP ()
@@ -434,6 +444,7 @@ isBlockCommand s = s `M.member` blockCommands
inlineEnvironments :: M.Map String (LP Inlines)
inlineEnvironments = M.fromList
[ ("displaymath", mathEnv id Nothing "displaymath")
+ , ("math", math <$> verbEnv "math")
, ("equation", mathEnv id Nothing "equation")
, ("equation*", mathEnv id Nothing "equation*")
, ("gather", mathEnv id (Just "gathered") "gather")
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 77c3a1016..b5d175453 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -36,7 +36,7 @@ import Data.List ( transpose, sortBy, findIndex, intersperse, intercalate )
import qualified Data.Map as M
import Data.Scientific (coefficient, base10Exponent)
import Data.Ord ( comparing )
-import Data.Char ( isSpace, isAlphaNum, toLower )
+import Data.Char ( isSpace, isAlphaNum, toLower, isPunctuation )
import Data.Maybe
import Text.Pandoc.Definition
import Text.Pandoc.Emoji (emojis)
@@ -61,7 +61,6 @@ import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, isInlineTag, isBlockT
import Control.Monad
import System.FilePath (takeExtension, addExtension)
import Text.HTML.TagSoup
-import Text.HTML.TagSoup.Match (tagOpen)
import qualified Data.Set as Set
import Text.Printf (printf)
import Debug.Trace (trace)
@@ -1052,12 +1051,11 @@ strictHtmlBlock :: MarkdownParser String
strictHtmlBlock = htmlInBalanced (not . isInlineTag)
rawVerbatimBlock :: MarkdownParser String
-rawVerbatimBlock = try $ do
- (TagOpen tag _, open) <- htmlTag (tagOpen (flip elem
- ["pre", "style", "script"])
- (const True))
- contents <- manyTill anyChar (htmlTag (~== TagClose tag))
- return $ open ++ contents ++ renderTags' [TagClose tag]
+rawVerbatimBlock = htmlInBalanced isVerbTag
+ where isVerbTag (TagOpen "pre" _) = True
+ isVerbTag (TagOpen "style" _) = True
+ isVerbTag (TagOpen "script" _) = True
+ isVerbTag _ = False
rawTeXBlock :: MarkdownParser (F Blocks)
rawTeXBlock = do
@@ -1356,16 +1354,18 @@ pipeTable = try $ do
nonindentSpaces
lookAhead nonspaceChar
(heads,(aligns, seplengths)) <- (,) <$> pipeTableRow <*> pipeBreak
+ let heads' = take (length aligns) <$> heads
lines' <- many pipeTableRow
+ let lines'' = map (take (length aligns) <$>) lines'
let maxlength = maximum $
- map (\x -> length . stringify $ runF x def) (heads : lines')
+ map (\x -> length . stringify $ runF x def) (heads' : lines'')
numColumns <- getOption readerColumns
let widths = if maxlength > numColumns
then map (\len ->
fromIntegral (len + 1) / fromIntegral numColumns)
seplengths
else replicate (length aligns) 0.0
- return $ (aligns, widths, heads, sequence lines')
+ return $ (aligns, widths, heads', sequence lines'')
sepPipe :: MarkdownParser ()
sepPipe = try $ do
@@ -1374,25 +1374,27 @@ sepPipe = try $ do
-- parse a row, also returning probable alignments for org-table cells
pipeTableRow :: MarkdownParser (F [Blocks])
-pipeTableRow = do
+pipeTableRow = try $ do
+ scanForPipe
skipMany spaceChar
openPipe <- (True <$ char '|') <|> return False
- let cell = mconcat <$>
- many (notFollowedBy (blankline <|> char '|') >> inline)
- first <- cell
- rest <- many $ sepPipe *> cell
+ -- split into cells
+ let chunk = void (code <|> rawHtmlInline <|> escapedChar <|> rawLaTeXInline')
+ <|> void (noneOf "|\n\r")
+ let cellContents = ((trim . snd) <$> withRaw (many chunk)) >>=
+ parseFromString pipeTableCell
+ cells <- cellContents `sepEndBy1` (char '|')
-- surrounding pipes needed for a one-column table:
- guard $ not (null rest && not openPipe)
- optional (char '|')
+ guard $ not (length cells == 1 && not openPipe)
blankline
- let cells = sequence (first:rest)
- return $ do
- cells' <- cells
- return $ map
- (\ils ->
- case trimInlines ils of
- ils' | B.isNull ils' -> mempty
- | otherwise -> B.plain $ ils') cells'
+ return $ sequence cells
+
+pipeTableCell :: MarkdownParser (F Blocks)
+pipeTableCell = do
+ result <- many inline
+ if null result
+ then return mempty
+ else return $ B.plain . mconcat <$> sequence result
pipeTableHeaderPart :: Parser [Char] st (Alignment, Int)
pipeTableHeaderPart = try $ do
@@ -1554,7 +1556,7 @@ math :: MarkdownParser (F Inlines)
math = (return . B.displayMath <$> (mathDisplay >>= applyMacros'))
<|> (return . B.math <$> (mathInline >>= applyMacros')) <+?>
((getOption readerSmart >>= guard) *> (return <$> apostrophe)
- <* notFollowedBy space)
+ <* notFollowedBy (space <|> satisfy isPunctuation))
-- Parses material enclosed in *s, **s, _s, or __s.
-- Designed to avoid backtracking.
diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs
index d29ec50e7..950497992 100644
--- a/src/Text/Pandoc/Readers/MediaWiki.hs
+++ b/src/Text/Pandoc/Readers/MediaWiki.hs
@@ -52,6 +52,7 @@ import Text.HTML.TagSoup
import Data.Sequence (viewl, ViewL(..), (<|))
import qualified Data.Foldable as F
import qualified Data.Map as M
+import qualified Data.Set as Set
import Data.Char (isDigit, isSpace)
import Data.Maybe (fromMaybe)
import Text.Printf (printf)
@@ -69,7 +70,7 @@ readMediaWiki opts s =
, mwNextLinkNumber = 1
, mwCategoryLinks = []
, mwHeaderMap = M.empty
- , mwIdentifierList = []
+ , mwIdentifierList = Set.empty
}
(s ++ "\n")
@@ -78,7 +79,7 @@ data MWState = MWState { mwOptions :: ReaderOptions
, mwNextLinkNumber :: Int
, mwCategoryLinks :: [Inlines]
, mwHeaderMap :: M.Map Inlines String
- , mwIdentifierList :: [String]
+ , mwIdentifierList :: Set.Set String
}
type MWParser = Parser [Char] MWState
diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs
index 1f1c57646..8c475eefc 100644
--- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs
+++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs
@@ -61,6 +61,7 @@ import Text.Pandoc.Readers.Odt.Generic.XMLConverter
import Text.Pandoc.Readers.Odt.Generic.Fallible
import Text.Pandoc.Readers.Odt.Generic.Utils
+import qualified Data.Set as Set
--------------------------------------------------------------------------------
-- State
@@ -221,7 +222,7 @@ getPrettyAnchor = proc (baseIdent, uglyAnchor) -> do
getHeaderAnchor :: OdtReaderSafe Inlines Anchor
getHeaderAnchor = proc title -> do
state <- getExtraState -< ()
- let anchor = uniqueIdent (toList title) (usedAnchors state)
+ let anchor = uniqueIdent (toList title) (Set.fromList $ usedAnchors state)
modifyExtraState (putPrettyAnchor anchor anchor) -<< anchor
diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs
index c7906618c..7dd611be3 100644
--- a/src/Text/Pandoc/Readers/Org.hs
+++ b/src/Text/Pandoc/Readers/Org.hs
@@ -2,7 +2,7 @@
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE MultiParamTypeClasses, FlexibleContexts, FlexibleInstances #-}
{-
-Copyright (C) 2014-2015 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de>
+Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -21,7 +21,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Text.Pandoc.Readers.Org
- Copyright : Copyright (C) 2014-2015 Albert Krewinkel
+ Copyright : Copyright (C) 2014-2016 Albert Krewinkel
License : GNU GPL, version 2 or above
Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de>
@@ -53,6 +53,7 @@ import Data.Char (isAlphaNum, toLower)
import Data.Default
import Data.List (intersperse, isPrefixOf, isSuffixOf)
import qualified Data.Map as M
+import qualified Data.Set as Set
import Data.Maybe (fromMaybe, isJust)
import Network.HTTP (urlEncode)
@@ -144,7 +145,7 @@ data OrgParserState = OrgParserState
, orgStateMeta' :: F Meta
, orgStateNotes' :: OrgNoteTable
, orgStateParserContext :: ParserContext
- , orgStateIdentifiers :: [String]
+ , orgStateIdentifiers :: Set.Set String
, orgStateHeaderMap :: M.Map Inlines String
}
@@ -186,7 +187,7 @@ defaultOrgParserState = OrgParserState
, orgStateMeta' = return nullMeta
, orgStateNotes' = []
, orgStateParserContext = NullState
- , orgStateIdentifiers = []
+ , orgStateIdentifiers = Set.empty
, orgStateHeaderMap = M.empty
}
@@ -628,7 +629,7 @@ figure = try $ do
maybeNam <- lookupBlockAttribute "name"
guard $ isJust maybeCap || isJust maybeNam
return ( fromMaybe mempty maybeCap
- , maybe mempty withFigPrefix maybeNam )
+ , withFigPrefix $ fromMaybe mempty maybeNam )
withFigPrefix cs =
if "fig:" `isPrefixOf` cs
then cs
@@ -1238,37 +1239,37 @@ applyCustomLinkFormat link = do
formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters
return $ maybe link ($ drop 1 rest) formatter
--- TODO: might be a lot smarter/cleaner to use parsec and ADTs for this kind
--- of parsing.
+-- | Take a link and return a function which produces new inlines when given
+-- description inlines.
linkToInlinesF :: String -> Inlines -> F Inlines
-linkToInlinesF s =
+linkToInlinesF linkStr =
+ case linkStr of
+ "" -> pure . B.link mempty "" -- wiki link (empty by convention)
+ ('#':_) -> pure . B.link linkStr "" -- document-local fraction
+ _ -> case cleanLinkString linkStr of
+ (Just cleanedLink) -> if isImageFilename cleanedLink
+ then const . pure $ B.image cleanedLink "" ""
+ else pure . B.link cleanedLink ""
+ Nothing -> internalLink linkStr -- other internal link
+
+-- | Cleanup and canonicalize a string describing a link. Return @Nothing@ if
+-- the string does not appear to be a link.
+cleanLinkString :: String -> Maybe String
+cleanLinkString s =
case s of
- "" -> pure . B.link "" ""
- ('#':_) -> pure . B.link s ""
- _ | isImageFilename s -> const . pure $ B.image s "" ""
- _ | isFileLink s -> pure . B.link (dropLinkType s) ""
- _ | isUri s -> pure . B.link s ""
- _ | isAbsoluteFilePath s -> pure . B.link ("file://" ++ s) ""
- _ | isRelativeFilePath s -> pure . B.link s ""
- _ -> internalLink s
-
-isFileLink :: String -> Bool
-isFileLink s = ("file:" `isPrefixOf` s) && not ("file://" `isPrefixOf` s)
-
-dropLinkType :: String -> String
-dropLinkType = tail . snd . break (== ':')
-
-isRelativeFilePath :: String -> Bool
-isRelativeFilePath s = (("./" `isPrefixOf` s) || ("../" `isPrefixOf` s)) &&
- (':' `notElem` s)
-
-isUri :: String -> Bool
-isUri s = let (scheme, path) = break (== ':') s
- in all (\c -> isAlphaNum c || c `elem` (".-" :: String)) scheme
- && not (null path)
-
-isAbsoluteFilePath :: String -> Bool
-isAbsoluteFilePath = ('/' ==) . head
+ '/':_ -> Just $ "file://" ++ s -- absolute path
+ '.':'/':_ -> Just s -- relative path
+ '.':'.':'/':_ -> Just s -- relative path
+ -- Relative path or URL (file schema)
+ 'f':'i':'l':'e':':':s' -> Just $ if ("//" `isPrefixOf` s') then s else s'
+ _ | isUrl s -> Just s -- URL
+ _ -> Nothing
+ where
+ isUrl :: String -> Bool
+ isUrl cs =
+ let (scheme, path) = break (== ':') cs
+ in all (\c -> isAlphaNum c || c `elem` (".-"::String)) scheme
+ && not (null path)
isImageFilename :: String -> Bool
isImageFilename filename =
diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs
index dd1d289a3..6f64540f8 100644
--- a/src/Text/Pandoc/Readers/Textile.hs
+++ b/src/Text/Pandoc/Readers/Textile.hs
@@ -583,7 +583,18 @@ code2 = do
-- | Html / CSS attributes
attributes :: Parser [Char] ParserState Attr
-attributes = (foldl (flip ($)) ("",[],[])) `fmap` many attribute
+attributes = (foldl (flip ($)) ("",[],[])) <$>
+ try (do special <- option id specialAttribute
+ attrs <- many attribute
+ return (special : attrs))
+
+specialAttribute :: Parser [Char] ParserState (Attr -> Attr)
+specialAttribute = do
+ alignStr <- ("center" <$ char '=') <|>
+ ("justify" <$ try (string "<>")) <|>
+ ("right" <$ char '>') <|>
+ ("left" <$ char '<')
+ return $ addStyle ("text-align:" ++ alignStr)
attribute :: Parser [Char] ParserState (Attr -> Attr)
attribute = classIdAttr <|> styleAttr <|> langAttr
@@ -602,7 +613,13 @@ classIdAttr = try $ do -- (class class #id)
styleAttr :: Parser [Char] ParserState (Attr -> Attr)
styleAttr = do
style <- try $ enclosed (char '{') (char '}') anyChar'
- return $ \(id',classes,keyvals) -> (id',classes,("style",style):keyvals)
+ return $ addStyle style
+
+addStyle :: String -> Attr -> Attr
+addStyle style (id',classes,keyvals) =
+ (id',classes,keyvals')
+ where keyvals' = ("style", style') : [(k,v) | (k,v) <- keyvals, k /= "style"]
+ style' = style ++ ";" ++ concat [v | ("style",v) <- keyvals]
langAttr :: Parser [Char] ParserState (Attr -> Attr)
langAttr = do