diff options
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 66 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Combine.hs | 154 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 90 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Reducible.hs | 181 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/EPUB.hs | 2 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 96 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/LaTeX.hs | 29 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 54 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/MediaWiki.hs | 5 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Odt/ContentReader.hs | 3 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Org.hs | 69 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Textile.hs | 21 |
12 files changed, 425 insertions, 345 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 44f67ce75..604bc20de 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -50,8 +50,7 @@ implemented, [-] means partially implemented): * Inlines - [X] Str - - [X] Emph (From italics. `underline` currently read as span. In - future, it might optionally be emph as well) + - [X] Emph (italics and underline both read as Emph) - [X] Strong - [X] Strikeout - [X] Superscript @@ -62,16 +61,16 @@ implemented, [-] means partially implemented): - [X] Code (styled with `VerbatimChar`) - [X] Space - [X] LineBreak (these are invisible in Word: entered with Shift-Return) - - [ ] Math + - [X] Math - [X] Link (links to an arbitrary bookmark create a span with the target as id and "anchor" class) - - [-] Image (Links to path in archive. Future option for - data-encoded URI likely.) + - [X] Image - [X] Note (Footnotes and Endnotes are silently combined.) -} module Text.Pandoc.Readers.Docx - ( readDocx + ( readDocxWithWarnings + , readDocx ) where import Codec.Archive.Zip @@ -81,7 +80,7 @@ import Text.Pandoc.Builder import Text.Pandoc.Walk import Text.Pandoc.Readers.Docx.Parse import Text.Pandoc.Readers.Docx.Lists -import Text.Pandoc.Readers.Docx.Reducible +import Text.Pandoc.Readers.Docx.Combine import Text.Pandoc.Shared import Text.Pandoc.MediaBag (insertMedia, MediaBag) import Data.List (delete, (\\), intersect) @@ -89,6 +88,7 @@ import Text.TeXMath (writeTeX) import Data.Default (Default) import qualified Data.ByteString.Lazy as B import qualified Data.Map as M +import qualified Data.Set as Set import Control.Monad.Reader import Control.Monad.State import Data.Sequence (ViewL(..), viewl) @@ -97,14 +97,22 @@ import qualified Data.Sequence as Seq (null) import Text.Pandoc.Error import Text.Pandoc.Compat.Except +readDocxWithWarnings :: ReaderOptions + -> B.ByteString + -> Either PandocError (Pandoc, MediaBag, [String]) +readDocxWithWarnings opts bytes = + case archiveToDocxWithWarnings (toArchive bytes) of + Right (docx, warnings) -> do + (meta, blks, mediaBag) <- docxToOutput opts docx + return (Pandoc meta blks, mediaBag, warnings) + Left _ -> Left (ParseFailure "couldn't parse docx file") + readDocx :: ReaderOptions -> B.ByteString -> Either PandocError (Pandoc, MediaBag) -readDocx opts bytes = - case archiveToDocx (toArchive bytes) of - Right docx -> (\(meta, blks, mediaBag) -> (Pandoc meta blks, mediaBag)) - <$> (docxToOutput opts docx) - Left _ -> Left (ParseFailure "couldn't parse docx file") +readDocx opts bytes = do + (pandoc, mediaBag, _) <- readDocxWithWarnings opts bytes + return (pandoc, mediaBag) data DState = DState { docxAnchorMap :: M.Map String String , docxMediaBag :: MediaBag @@ -166,7 +174,7 @@ bodyPartsToMeta' (bp : bps) | (Paragraph pPr parParts) <- bp , (c : _)<- intersect (pStyle pPr) (M.keys metaStyles) , (Just metaField) <- M.lookup c metaStyles = do - inlines <- concatReduce <$> mapM parPartToInlines parParts + inlines <- smushInlines <$> mapM parPartToInlines parParts remaining <- bodyPartsToMeta' bps let f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils'] @@ -290,13 +298,13 @@ runToInlines (Run rs runElems) Just SubScrpt -> subscript codeString _ -> codeString | otherwise = do - let ils = concatReduce (map runElemToInlines runElems) + let ils = smushInlines (map runElemToInlines runElems) return $ (runStyleToTransform $ resolveDependentRunStyle rs) ils runToInlines (Footnote bps) = do - blksList <- concatReduce <$> (mapM bodyPartToBlocks bps) + blksList <- smushBlocks <$> (mapM bodyPartToBlocks bps) return $ note blksList runToInlines (Endnote bps) = do - blksList <- concatReduce <$> (mapM bodyPartToBlocks bps) + blksList <- smushBlocks <$> (mapM bodyPartToBlocks bps) return $ note blksList runToInlines (InlineDrawing fp bs ext) = do mediaBag <- gets docxMediaBag @@ -315,19 +323,19 @@ parPartToInlines (PlainRun r) = runToInlines r parPartToInlines (Insertion _ author date runs) = do opts <- asks docxOptions case readerTrackChanges opts of - AcceptChanges -> concatReduce <$> mapM runToInlines runs + AcceptChanges -> smushInlines <$> mapM runToInlines runs RejectChanges -> return mempty AllChanges -> do - ils <- concatReduce <$> mapM runToInlines runs + ils <- smushInlines <$> mapM runToInlines runs let attr = ("", ["insertion"], [("author", author), ("date", date)]) return $ spanWith attr ils parPartToInlines (Deletion _ author date runs) = do opts <- asks docxOptions case readerTrackChanges opts of AcceptChanges -> return mempty - RejectChanges -> concatReduce <$> mapM runToInlines runs + RejectChanges -> smushInlines <$> mapM runToInlines runs AllChanges -> do - ils <- concatReduce <$> mapM runToInlines runs + ils <- smushInlines <$> mapM runToInlines runs let attr = ("", ["deletion"], [("author", author), ("date", date)]) return $ spanWith attr ils parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = @@ -350,7 +358,7 @@ parPartToInlines (BookMark _ anchor) = -- avoid an extra pass. let newAnchor = if not inHdrBool && anchor `elem` (M.elems anchorMap) - then uniqueIdent [Str anchor] (M.elems anchorMap) + then uniqueIdent [Str anchor] (Set.fromList $ M.elems anchorMap) else anchor unless inHdrBool (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap}) @@ -360,10 +368,10 @@ parPartToInlines (Drawing fp bs ext) = do modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag } return $ imageWith (extentToAttr ext) fp "" "" parPartToInlines (InternalHyperLink anchor runs) = do - ils <- concatReduce <$> mapM runToInlines runs + ils <- smushInlines <$> mapM runToInlines runs return $ link ('#' : anchor) "" ils parPartToInlines (ExternalHyperLink target runs) = do - ils <- concatReduce <$> mapM runToInlines runs + ils <- smushInlines <$> mapM runToInlines runs return $ link target "" ils parPartToInlines (PlainOMath exps) = do return $ math $ writeTeX exps @@ -393,7 +401,7 @@ makeHeaderAnchor' (Header n (_, classes, kvs) ils) | (c:cs) <- filter isAnchorSpan ils , (Span (ident, ["anchor"], _) _) <- c = do hdrIDMap <- gets docxAnchorMap - let newIdent = uniqueIdent ils (M.elems hdrIDMap) + let newIdent = uniqueIdent ils (Set.fromList $ M.elems hdrIDMap) modify $ \s -> s {docxAnchorMap = M.insert ident newIdent hdrIDMap} return $ Header n (newIdent, classes, kvs) (ils \\ (c:cs)) -- Otherwise we just give it a name, and register that name (associate @@ -401,7 +409,7 @@ makeHeaderAnchor' (Header n (_, classes, kvs) ils) makeHeaderAnchor' (Header n (_, classes, kvs) ils) = do hdrIDMap <- gets docxAnchorMap - let newIdent = uniqueIdent ils (M.elems hdrIDMap) + let newIdent = uniqueIdent ils (Set.fromList $ M.elems hdrIDMap) modify $ \s -> s {docxAnchorMap = M.insert newIdent newIdent hdrIDMap} return $ Header n (newIdent, classes, kvs) ils makeHeaderAnchor' blk = return blk @@ -416,7 +424,7 @@ singleParaToPlain blks = blks cellToBlocks :: Cell -> DocxContext Blocks cellToBlocks (Cell bps) = do - blks <- concatReduce <$> mapM bodyPartToBlocks bps + blks <- smushBlocks <$> mapM bodyPartToBlocks bps return $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks rowToBlocksList :: Row -> DocxContext [Blocks] @@ -478,11 +486,11 @@ bodyPartToBlocks (Paragraph pPr parparts) $ concatMap parPartToString parparts | Just (style, n) <- pHeading pPr = do ils <- local (\s-> s{docxInHeaderBlock=True}) $ - (concatReduce <$> mapM parPartToInlines parparts) + (smushInlines <$> mapM parPartToInlines parparts) makeHeaderAnchor $ headerWith ("", delete style (pStyle pPr), []) n ils | otherwise = do - ils <- concatReduce <$> mapM parPartToInlines parparts >>= + ils <- smushInlines <$> mapM parPartToInlines parparts >>= (return . fromList . trimLineBreaks . normalizeSpaces . toList) dropIls <- gets docxDropCap let ils' = dropIls <> ils @@ -560,7 +568,7 @@ bodyToOutput :: Body -> DocxContext (Meta, [Block], MediaBag) bodyToOutput (Body bps) = do let (metabps, blkbps) = sepBodyParts bps meta <- bodyPartsToMeta metabps - blks <- concatReduce <$> mapM bodyPartToBlocks blkbps + blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks mediaBag <- gets docxMediaBag return $ (meta, diff --git a/src/Text/Pandoc/Readers/Docx/Combine.hs b/src/Text/Pandoc/Readers/Docx/Combine.hs new file mode 100644 index 000000000..39e0df825 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Combine.hs @@ -0,0 +1,154 @@ +{-# LANGUAGE TypeSynonymInstances, FlexibleInstances, + PatternGuards #-} + +module Text.Pandoc.Readers.Docx.Combine ( smushInlines + , smushBlocks + ) + where + +import Text.Pandoc.Builder +import Data.List +import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr, (><), (|>)) +import qualified Data.Sequence as Seq (null) + +data Modifier a = Modifier (a -> a) + | AttrModifier (Attr -> a -> a) Attr + | NullModifier + +spaceOutInlinesL :: Inlines -> (Inlines, Inlines) +spaceOutInlinesL ms = (l, stackInlines fs (m' <> r)) + where (l, m, r) = spaceOutInlines ms + (fs, m') = unstackInlines m + +spaceOutInlinesR :: Inlines -> (Inlines, Inlines) +spaceOutInlinesR ms = (stackInlines fs (l <> m'), r) + where (l, m, r) = spaceOutInlines ms + (fs, m') = unstackInlines m + +spaceOutInlines :: Inlines -> (Inlines, Inlines, Inlines) +spaceOutInlines ils = + let (fs, ils') = unstackInlines ils + contents = unMany ils' + left = case viewl contents of + (Space :< _) -> space + _ -> mempty + right = case viewr contents of + (_ :> Space) -> space + _ -> mempty in + (left, (stackInlines fs $ trimInlines . Many $ contents), right) + +stackInlines :: [Modifier Inlines] -> Inlines -> Inlines +stackInlines [] ms = ms +stackInlines (NullModifier : fs) ms = stackInlines fs ms +stackInlines ((Modifier f) : fs) ms = + if isEmpty ms + then stackInlines fs ms + else f $ stackInlines fs ms +stackInlines ((AttrModifier f attr) : fs) ms = f attr $ stackInlines fs ms + +unstackInlines :: Inlines -> ([Modifier Inlines], Inlines) +unstackInlines ms = case ilModifier ms of + NullModifier -> ([], ms) + _ -> (f : fs, ms') where + f = ilModifier ms + (fs, ms') = unstackInlines $ ilInnards ms + +ilModifier :: Inlines -> Modifier Inlines +ilModifier ils = case viewl (unMany ils) of + (x :< xs) | Seq.null xs -> case x of + (Emph _) -> Modifier emph + (Strong _) -> Modifier strong + (SmallCaps _) -> Modifier smallcaps + (Strikeout _) -> Modifier strikeout + (Superscript _) -> Modifier superscript + (Subscript _) -> Modifier subscript + (Link attr _ tgt) -> Modifier $ linkWith attr (fst tgt) (snd tgt) + (Span attr _) -> AttrModifier spanWith attr + _ -> NullModifier + _ -> NullModifier + +ilInnards :: Inlines -> Inlines +ilInnards ils = case viewl (unMany ils) of + (x :< xs) | Seq.null xs -> case x of + (Emph lst) -> fromList lst + (Strong lst) -> fromList lst + (SmallCaps lst) -> fromList lst + (Strikeout lst) -> fromList lst + (Superscript lst) -> fromList lst + (Subscript lst) -> fromList lst + (Link _ lst _) -> fromList lst + (Span _ lst) -> fromList lst + _ -> ils + _ -> ils + +inlinesL :: Inlines -> (Inlines, Inlines) +inlinesL ils = case viewl $ unMany ils of + (s :< sq) -> (singleton s, Many sq) + _ -> (mempty, ils) + +inlinesR :: Inlines -> (Inlines, Inlines) +inlinesR ils = case viewr $ unMany ils of + (sq :> s) -> (Many sq, singleton s) + _ -> (ils, mempty) + +combineInlines :: Inlines -> Inlines -> Inlines +combineInlines x y = + let (xs', x') = inlinesR x + (y', ys') = inlinesL y + in + xs' <> (combineSingletonInlines x' y') <> ys' + +combineSingletonInlines :: Inlines -> Inlines -> Inlines +combineSingletonInlines x y = + let (xfs, xs) = unstackInlines x + (yfs, ys) = unstackInlines y + shared = xfs `intersect` yfs + x_remaining = xfs \\ shared + y_remaining = yfs \\ shared + x_rem_attr = filter isAttrModifier x_remaining + y_rem_attr = filter isAttrModifier y_remaining + in + case null shared of + True | isEmpty xs && isEmpty ys -> + stackInlines (x_rem_attr ++ y_rem_attr) mempty + | isEmpty xs -> + let (sp, y') = spaceOutInlinesL y in + (stackInlines x_rem_attr mempty) <> sp <> y' + | isEmpty ys -> + let (x', sp) = spaceOutInlinesR x in + x' <> sp <> (stackInlines y_rem_attr mempty) + | otherwise -> + let (x', xsp) = spaceOutInlinesR x + (ysp, y') = spaceOutInlinesL y + in + x' <> xsp <> ysp <> y' + False -> stackInlines shared $ + combineInlines + (stackInlines x_remaining xs) + (stackInlines y_remaining ys) + +combineBlocks :: Blocks -> Blocks -> Blocks +combineBlocks bs cs + | bs' :> (BlockQuote bs'') <- viewr (unMany bs) + , (BlockQuote cs'') :< cs' <- viewl (unMany cs) = + Many $ (bs' |> (BlockQuote (bs'' <> cs''))) >< cs' +combineBlocks bs cs = bs <> cs + +instance (Monoid a, Eq a) => Eq (Modifier a) where + (Modifier f) == (Modifier g) = (f mempty == g mempty) + (AttrModifier f attr) == (AttrModifier g attr') = (f attr mempty == g attr' mempty) + (NullModifier) == (NullModifier) = True + _ == _ = False + +isEmpty :: (Monoid a, Eq a) => a -> Bool +isEmpty x = x == mempty + +isAttrModifier :: Modifier a -> Bool +isAttrModifier (AttrModifier _ _) = True +isAttrModifier _ = False + +smushInlines :: [Inlines] -> Inlines +smushInlines xs = foldl combineInlines mempty xs + +smushBlocks :: [Blocks] -> Blocks +smushBlocks xs = foldl combineBlocks mempty xs diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index eec8b12c9..364483929 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -50,6 +50,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , Row(..) , Cell(..) , archiveToDocx + , archiveToDocxWithWarnings ) where import Codec.Archive.Zip import Text.XML.Light @@ -60,6 +61,7 @@ import Data.Bits ((.|.)) import qualified Data.ByteString.Lazy as B import qualified Text.Pandoc.UTF8 as UTF8 import Control.Monad.Reader +import Control.Monad.State import Control.Applicative ((<|>)) import qualified Data.Map as M import Text.Pandoc.Compat.Except @@ -81,16 +83,20 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes } deriving Show +data ReaderState = ReaderState { stateWarnings :: [String] } + deriving Show + + data DocxError = DocxError | WrongElem deriving Show instance Error DocxError where noMsg = WrongElem -type D = ExceptT DocxError (Reader ReaderEnv) +type D = ExceptT DocxError (ReaderT ReaderEnv (State ReaderState)) -runD :: D a -> ReaderEnv -> Either DocxError a -runD dx re = runReader (runExceptT dx) re +runD :: D a -> ReaderEnv -> ReaderState -> (Either DocxError a, ReaderState) +runD dx re rs = runState (runReaderT (runExceptT dx) re) rs maybeToD :: Maybe a -> D a maybeToD (Just a) = return a @@ -257,7 +263,10 @@ type Author = String type ChangeDate = String archiveToDocx :: Archive -> Either DocxError Docx -archiveToDocx archive = do +archiveToDocx archive = fst <$> archiveToDocxWithWarnings archive + +archiveToDocxWithWarnings :: Archive -> Either DocxError (Docx, [String]) +archiveToDocxWithWarnings archive = do let notes = archiveToNotes archive numbering = archiveToNumbering archive rels = archiveToRelationships archive @@ -265,8 +274,12 @@ archiveToDocx archive = do (styles, parstyles) = archiveToStyles archive rEnv = ReaderEnv notes numbering rels media Nothing styles parstyles InDocument - doc <- runD (archiveToDocument archive) rEnv - return $ Docx doc + rState = ReaderState { stateWarnings = [] } + (eitherDoc, st) = runD (archiveToDocument archive) rEnv rState + case eitherDoc of + Right doc -> Right (Docx doc, stateWarnings st) + Left e -> Left e + archiveToDocument :: Archive -> D Document @@ -576,12 +589,14 @@ elemToBodyPart ns element sty <- asks envParStyles let parstyle = elemToParagraphStyle ns element sty parparts <- mapD (elemToParPart ns) (elChildren element) - case pNumInfo parstyle of - Just (numId, lvl) -> do - num <- asks envNumbering - let levelInfo = lookupLevel numId lvl num - return $ ListItem parstyle numId lvl levelInfo parparts - Nothing -> return $ Paragraph parstyle parparts + -- Word uses list enumeration for numbered headings, so we only + -- want to infer a list from the styles if it is NOT a heading. + case pHeading parstyle of + Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do + num <- asks envNumbering + let levelInfo = lookupLevel numId lvl num + return $ ListItem parstyle numId lvl levelInfo parparts + _ -> return $ Paragraph parstyle parparts elemToBodyPart ns element | isElem ns "w" "tbl" element = do let caption' = findChild (elemName ns "w" "tblPr") element @@ -702,36 +717,58 @@ elemToExtent drawingElem = getDim at = findElement (QName "extent" (Just wp_ns) (Just "wp")) drawingElem >>= findAttr (QName at Nothing Nothing) >>= safeRead -elemToRun :: NameSpaces -> Element -> D Run -elemToRun ns element - | isElem ns "w" "r" element - , Just drawingElem <- findChild (elemName ns "w" "drawing") element = + +childElemToRun :: NameSpaces -> Element -> D Run +childElemToRun ns element + | isElem ns "w" "drawing" element = let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" - drawing = findElement (QName "blip" (Just a_ns) (Just "a")) drawingElem + drawing = findElement (QName "blip" (Just a_ns) (Just "a")) element >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) in case drawing of Just s -> expandDrawingId s >>= - (\(fp, bs) -> return $ InlineDrawing fp bs $ elemToExtent drawingElem) + (\(fp, bs) -> return $ InlineDrawing fp bs $ elemToExtent element) Nothing -> throwError WrongElem -elemToRun ns element - | isElem ns "w" "r" element - , Just ref <- findChild (elemName ns "w" "footnoteReference") element - , Just fnId <- findAttr (elemName ns "w" "id") ref = do +childElemToRun ns element + | isElem ns "w" "footnoteReference" element + , Just fnId <- findAttr (elemName ns "w" "id") element = do notes <- asks envNotes case lookupFootnote fnId notes of Just e -> do bps <- local (\r -> r {envLocation=InFootnote}) $ mapD (elemToBodyPart ns) (elChildren e) return $ Footnote bps Nothing -> return $ Footnote [] -elemToRun ns element - | isElem ns "w" "r" element - , Just ref <- findChild (elemName ns "w" "endnoteReference") element - , Just enId <- findAttr (elemName ns "w" "id") ref = do +childElemToRun ns element + | isElem ns "w" "endnoteReference" element + , Just enId <- findAttr (elemName ns "w" "id") element = do notes <- asks envNotes case lookupEndnote enId notes of Just e -> do bps <- local (\r -> r {envLocation=InEndnote}) $ mapD (elemToBodyPart ns) (elChildren e) return $ Endnote bps Nothing -> return $ Endnote [] +childElemToRun _ _ = throwError WrongElem + +elemToRun :: NameSpaces -> Element -> D Run +elemToRun ns element + | isElem ns "w" "r" element + , Just altCont <- findChild (elemName ns "mc" "AlternateContent") element = + do let choices = findChildren (elemName ns "mc" "Choice") altCont + choiceChildren = map head $ filter (not . null) $ map elChildren choices + outputs <- mapD (childElemToRun ns) choiceChildren + case outputs of + r : _ -> return r + [] -> throwError WrongElem +elemToRun ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChild (elemName ns "w" "drawing") element = + childElemToRun ns drawingElem +elemToRun ns element + | isElem ns "w" "r" element + , Just ref <- findChild (elemName ns "w" "footnoteReference") element = + childElemToRun ns ref +elemToRun ns element + | isElem ns "w" "r" element + , Just ref <- findChild (elemName ns "w" "endnoteReference") element = + childElemToRun ns ref elemToRun ns element | isElem ns "w" "r" element = do runElems <- elemToRunElems ns element @@ -940,3 +977,4 @@ elemToRunElems _ _ = throwError WrongElem setFont :: Maybe Font -> ReaderEnv -> ReaderEnv setFont f s = s{envFont = f} + diff --git a/src/Text/Pandoc/Readers/Docx/Reducible.hs b/src/Text/Pandoc/Readers/Docx/Reducible.hs deleted file mode 100644 index c93b40119..000000000 --- a/src/Text/Pandoc/Readers/Docx/Reducible.hs +++ /dev/null @@ -1,181 +0,0 @@ -{-# LANGUAGE TypeSynonymInstances, FlexibleInstances, - PatternGuards #-} - -module Text.Pandoc.Readers.Docx.Reducible ( concatReduce - , (<+>) - ) - where - - -import Text.Pandoc.Builder -import Data.List -import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr) -import qualified Data.Sequence as Seq (null) - -data Modifier a = Modifier (a -> a) - | AttrModifier (Attr -> a -> a) Attr - | NullModifier - -class (Eq a) => Modifiable a where - modifier :: a -> Modifier a - innards :: a -> a - getL :: a -> (a, a) - getR :: a -> (a, a) - spaceOut :: a -> (a, a, a) - -spaceOutL :: (Monoid a, Modifiable a) => a -> (a, a) -spaceOutL ms = (l, stack fs (m' <> r)) - where (l, m, r) = spaceOut ms - (fs, m') = unstack m - -spaceOutR :: (Monoid a, Modifiable a) => a -> (a, a) -spaceOutR ms = (stack fs (l <> m'), r) - where (l, m, r) = spaceOut ms - (fs, m') = unstack m - -instance (Monoid a, Show a) => Show (Modifier a) where - show (Modifier f) = show $ f mempty - show (AttrModifier f attr) = show $ f attr mempty - show (NullModifier) = "NullModifier" - -instance (Monoid a, Eq a) => Eq (Modifier a) where - (Modifier f) == (Modifier g) = (f mempty == g mempty) - (AttrModifier f attr) == (AttrModifier g attr') = (f attr mempty == g attr' mempty) - (NullModifier) == (NullModifier) = True - _ == _ = False - -instance Modifiable Inlines where - modifier ils = case viewl (unMany ils) of - (x :< xs) | Seq.null xs -> case x of - (Emph _) -> Modifier emph - (Strong _) -> Modifier strong - (SmallCaps _) -> Modifier smallcaps - (Strikeout _) -> Modifier strikeout - (Superscript _) -> Modifier superscript - (Subscript _) -> Modifier subscript - (Span attr _) -> AttrModifier spanWith attr - _ -> NullModifier - _ -> NullModifier - - innards ils = case viewl (unMany ils) of - (x :< xs) | Seq.null xs -> case x of - (Emph lst) -> fromList lst - (Strong lst) -> fromList lst - (SmallCaps lst) -> fromList lst - (Strikeout lst) -> fromList lst - (Superscript lst) -> fromList lst - (Subscript lst) -> fromList lst - (Span _ lst) -> fromList lst - _ -> ils - _ -> ils - - getL ils = case viewl $ unMany ils of - (s :< sq) -> (singleton s, Many sq) - _ -> (mempty, ils) - - getR ils = case viewr $ unMany ils of - (sq :> s) -> (Many sq, singleton s) - _ -> (ils, mempty) - - spaceOut ils = - let (fs, ils') = unstack ils - contents = unMany ils' - left = case viewl contents of - (Space :< _) -> space - _ -> mempty - right = case viewr contents of - (_ :> Space) -> space - _ -> mempty in - (left, (stack fs $ trimInlines .Many $ contents), right) - -instance Modifiable Blocks where - modifier blks = case viewl (unMany blks) of - (x :< xs) | Seq.null xs -> case x of - (BlockQuote _) -> Modifier blockQuote - -- (Div attr _) -> AttrModifier divWith attr - _ -> NullModifier - _ -> NullModifier - - innards blks = case viewl (unMany blks) of - (x :< xs) | Seq.null xs -> case x of - (BlockQuote lst) -> fromList lst - -- (Div attr lst) -> fromList lst - _ -> blks - _ -> blks - - spaceOut blks = (mempty, blks, mempty) - - getL ils = case viewl $ unMany ils of - (s :< sq) -> (singleton s, Many sq) - _ -> (mempty, ils) - - getR ils = case viewr $ unMany ils of - (sq :> s) -> (Many sq, singleton s) - _ -> (ils, mempty) - - -unstack :: (Modifiable a) => a -> ([Modifier a], a) -unstack ms = case modifier ms of - NullModifier -> ([], ms) - _ -> (f : fs, ms') where - f = modifier ms - (fs, ms') = unstack $ innards ms - -stack :: (Monoid a, Modifiable a) => [Modifier a] -> a -> a -stack [] ms = ms -stack (NullModifier : fs) ms = stack fs ms -stack ((Modifier f) : fs) ms = - if isEmpty ms - then stack fs ms - else f $ stack fs ms -stack ((AttrModifier f attr) : fs) ms = f attr $ stack fs ms - -isEmpty :: (Monoid a, Eq a) => a -> Bool -isEmpty x = x == mempty - - -combine :: (Monoid a, Modifiable a, Eq a) => a -> a -> a -combine x y = - let (xs', x') = getR x - (y', ys') = getL y - in - xs' <> (combineSingleton x' y') <> ys' - -isAttrModifier :: Modifier a -> Bool -isAttrModifier (AttrModifier _ _) = True -isAttrModifier _ = False - -combineSingleton :: (Monoid a, Modifiable a, Eq a) => a -> a -> a -combineSingleton x y = - let (xfs, xs) = unstack x - (yfs, ys) = unstack y - shared = xfs `intersect` yfs - x_remaining = xfs \\ shared - y_remaining = yfs \\ shared - x_rem_attr = filter isAttrModifier x_remaining - y_rem_attr = filter isAttrModifier y_remaining - in - case null shared of - True | isEmpty xs && isEmpty ys -> - stack (x_rem_attr ++ y_rem_attr) mempty - | isEmpty xs -> - let (sp, y') = spaceOutL y in - (stack x_rem_attr mempty) <> sp <> y' - | isEmpty ys -> - let (x', sp) = spaceOutR x in - x' <> sp <> (stack y_rem_attr mempty) - | otherwise -> - let (x', xsp) = spaceOutR x - (ysp, y') = spaceOutL y - in - x' <> xsp <> ysp <> y' - False -> stack shared $ - combine - (stack x_remaining xs) - (stack y_remaining ys) - -(<+>) :: (Monoid a, Modifiable a, Eq a) => a -> a -> a -x <+> y = combine x y - -concatReduce :: (Monoid a, Modifiable a) => [a] -> a -concatReduce xs = foldl combine mempty xs diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs index 79aa540f6..07d282708 100644 --- a/src/Text/Pandoc/Readers/EPUB.hs +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -30,7 +30,7 @@ import Control.Monad (guard, liftM, when) import Data.List (isPrefixOf, isInfixOf) import Data.Maybe (mapMaybe, fromMaybe) import qualified Data.Map as M (Map, lookup, fromList, elems) -import Control.DeepSeq.Generics (deepseq, NFData) +import Control.DeepSeq (deepseq, NFData) import Debug.Trace (trace) diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index a34e2fb5c..959a2d16f 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -43,7 +43,7 @@ import Text.HTML.TagSoup.Match import Text.Pandoc.Definition import qualified Text.Pandoc.Builder as B import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..)) -import Text.Pandoc.Shared ( extractSpaces, renderTags' +import Text.Pandoc.Shared ( extractSpaces, renderTags', addMetaField , escapeURI, safeRead, mapLeft ) import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace) , Extension (Ext_epub_html_exts, @@ -52,9 +52,9 @@ import Text.Pandoc.Parsing hiding ((<|>)) import Text.Pandoc.Walk import qualified Data.Map as M import Data.Maybe ( fromMaybe, isJust) -import Data.List ( intercalate, isInfixOf, isPrefixOf, isSuffixOf ) +import Data.List ( intercalate, isInfixOf, isPrefixOf ) import Data.Char ( isDigit ) -import Control.Monad ( liftM, guard, when, mzero, void, unless ) +import Control.Monad ( guard, when, mzero, void, unless ) import Control.Arrow ((***)) import Control.Applicative ( (<|>) ) import Data.Monoid (First (..)) @@ -63,12 +63,12 @@ import Debug.Trace (trace) import Text.TeXMath (readMathML, writeTeX) import Data.Default (Default (..), def) import Control.Monad.Reader (Reader,ask, asks, local, runReader) -import Network.URI (isURI) +import Network.URI (URI, parseURIReference, nonStrictRelativeTo) import Text.Pandoc.Error import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps) import Text.Pandoc.Compat.Monoid ((<>)) import Text.Parsec.Error - +import qualified Data.Set as Set -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ReaderOptions -- ^ Reader options @@ -77,7 +77,7 @@ readHtml :: ReaderOptions -- ^ Reader options readHtml opts inp = mapLeft (ParseFailure . getError) . flip runReader def $ runParserT parseDoc - (HTMLState def{ stateOptions = opts } [] Nothing [] M.empty) + (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty) "source" tags where tags = stripPrefixes . canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } inp @@ -103,8 +103,8 @@ data HTMLState = HTMLState { parserState :: ParserState, noteTable :: [(String, Blocks)], - baseHref :: Maybe String, - identifiers :: [String], + baseHref :: Maybe URI, + identifiers :: Set.Set String, headerMap :: M.Map Inlines String } @@ -137,19 +137,17 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag then return mempty else do let content = fromAttrib "content" mt - updateState $ B.setMeta name (B.text content) + updateState $ \s -> + let ps = parserState s in + s{ parserState = ps{ + stateMeta = addMetaField name (B.text content) + (stateMeta ps) } } return mempty pBaseTag = do bt <- pSatisfy (~== TagOpen "base" []) - let baseH = fromAttrib "href" bt - if null baseH - then return mempty - else do - let baseH' = case reverse baseH of - '/':_ -> baseH - _ -> baseH ++ "/" - updateState $ \st -> st{ baseHref = Just baseH' } - return mempty + updateState $ \st -> st{ baseHref = + parseURIReference $ fromAttrib "href" bt } + return mempty block :: TagParser Blocks block = do @@ -441,6 +439,7 @@ pTable = try $ do -- fail on empty table guard $ not $ null head' && null rows let isSinglePlain x = case B.toList x of + [] -> True [Plain _] -> True _ -> False let isSimple = all isSinglePlain $ concat (head':rows) @@ -605,9 +604,9 @@ pLink = try $ do tag <- pSatisfy $ tagOpenLit "a" (const True) mbBaseHref <- baseHref <$> getState let url' = fromAttrib "href" tag - let url = case (isURI url', mbBaseHref) of - (False, Just h) -> h ++ url' - _ -> url' + let url = case (parseURIReference url', mbBaseHref) of + (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs) + _ -> url' let title = fromAttrib "title" tag let uid = fromAttrib "id" tag let cls = words $ fromAttrib "class" tag @@ -619,9 +618,9 @@ pImage = do tag <- pSelfClosing (=="img") (isJust . lookup "src") mbBaseHref <- baseHref <$> getState let url' = fromAttrib "src" tag - let url = case (isURI url', mbBaseHref) of - (False, Just h) -> h ++ url' - _ -> url' + let url = case (parseURIReference url', mbBaseHref) of + (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs) + _ -> url' let title = fromAttrib "title" tag let alt = fromAttrib "alt" tag let uid = fromAttrib "id" tag @@ -925,14 +924,45 @@ htmlInBalanced :: (Monad m) => (Tag String -> Bool) -> ParserT String st m String htmlInBalanced f = try $ do - (TagOpen t _, tag) <- htmlTag f - guard $ not $ "/>" `isSuffixOf` tag -- not a self-closing tag - let stopper = htmlTag (~== TagClose t) - let anytag = snd <$> htmlTag (const True) - contents <- many $ notFollowedBy' stopper >> - (htmlInBalanced f <|> anytag <|> count 1 anyChar) - endtag <- liftM snd stopper - return $ tag ++ concat contents ++ endtag + lookAhead (char '<') + inp <- getInput + let ts = canonicalizeTags $ + parseTagsOptions parseOptions{ optTagWarning = True, + optTagPosition = True } inp + case ts of + (TagPosition sr sc : t@(TagOpen tn _) : rest) -> do + guard $ f t + guard $ not $ hasTagWarning (t : take 1 rest) + case htmlInBalanced' tn (t:rest) of + [] -> mzero + xs -> case reverse xs of + (TagClose _ : TagPosition er ec : _) -> do + let ls = er - sr + let cs = ec - sc + lscontents <- concat <$> count ls anyLine + cscontents <- count cs anyChar + (_,closetag) <- htmlTag (~== TagClose tn) + return (lscontents ++ cscontents ++ closetag) + _ -> mzero + _ -> mzero + +htmlInBalanced' :: String + -> [Tag String] + -> [Tag String] +htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts + where go :: Int -> [Tag String] -> Maybe [Tag String] + go n (t@(TagOpen tn' _):rest) | tn' == tagname = + (t :) <$> go (n + 1) rest + go 1 (t@(TagClose tn'):_) | tn' == tagname = + return [t] + go n (t@(TagClose tn'):rest) | tn' == tagname = + (t :) <$> go (n - 1) rest + go n (t:ts') = (t :) <$> go n ts' + go _ [] = mzero + +hasTagWarning :: [Tag String] -> Bool +hasTagWarning (TagWarning _:_) = True +hasTagWarning _ = False -- | Matches a tag meeting a certain condition. htmlTag :: Monad m @@ -941,8 +971,6 @@ htmlTag :: Monad m htmlTag f = try $ do lookAhead (char '<') inp <- getInput - let hasTagWarning (TagWarning _:_) = True - hasTagWarning _ = False let (next : rest) = canonicalizeTags $ parseTagsOptions parseOptions{ optTagWarning = True } inp guard $ f next diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index d2e8d9d17..2be55c9da 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -166,10 +166,18 @@ mathInline :: LP String -> LP Inlines mathInline p = math <$> (try p >>= applyMacros') mathChars :: LP String -mathChars = (concat <$>) $ - many $ - many1 (satisfy (\c -> c /= '$' && c /='\\')) - <|> (\c -> ['\\',c]) <$> try (char '\\' *> anyChar) +mathChars = + concat <$> many (escapedChar + <|> (snd <$> withRaw braced) + <|> many1 (satisfy isOrdChar)) + where escapedChar = try $ do char '\\' + c <- anyChar + return ['\\',c] + isOrdChar '$' = False + isOrdChar '{' = False + isOrdChar '}' = False + isOrdChar '\\' = False + isOrdChar _ = True quoted' :: (Inlines -> Inlines) -> LP String -> LP () -> LP Inlines quoted' f starter ender = do @@ -179,10 +187,11 @@ quoted' f starter ender = do then do ils <- many (notFollowedBy ender >> inline) (ender >> return (f (mconcat ils))) <|> - lit (case startchs of - "``" -> "“" - "`" -> "‘" - _ -> startchs) + (<> mconcat ils) <$> + lit (case startchs of + "``" -> "“" + "`" -> "‘" + _ -> startchs) else lit startchs doubleQuote :: LP Inlines @@ -421,7 +430,8 @@ inlineCommand = try $ do else if parseRaw then return $ rawInline "latex" rawcommand else return mempty - lookupListDefault mzero [name',name] inlineCommands + (lookupListDefault mzero [name',name] inlineCommands <* + optional (try (string "{}"))) <|> raw unlessParseRaw :: LP () @@ -434,6 +444,7 @@ isBlockCommand s = s `M.member` blockCommands inlineEnvironments :: M.Map String (LP Inlines) inlineEnvironments = M.fromList [ ("displaymath", mathEnv id Nothing "displaymath") + , ("math", math <$> verbEnv "math") , ("equation", mathEnv id Nothing "equation") , ("equation*", mathEnv id Nothing "equation*") , ("gather", mathEnv id (Just "gathered") "gather") diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 77c3a1016..b5d175453 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -36,7 +36,7 @@ import Data.List ( transpose, sortBy, findIndex, intersperse, intercalate ) import qualified Data.Map as M import Data.Scientific (coefficient, base10Exponent) import Data.Ord ( comparing ) -import Data.Char ( isSpace, isAlphaNum, toLower ) +import Data.Char ( isSpace, isAlphaNum, toLower, isPunctuation ) import Data.Maybe import Text.Pandoc.Definition import Text.Pandoc.Emoji (emojis) @@ -61,7 +61,6 @@ import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, isInlineTag, isBlockT import Control.Monad import System.FilePath (takeExtension, addExtension) import Text.HTML.TagSoup -import Text.HTML.TagSoup.Match (tagOpen) import qualified Data.Set as Set import Text.Printf (printf) import Debug.Trace (trace) @@ -1052,12 +1051,11 @@ strictHtmlBlock :: MarkdownParser String strictHtmlBlock = htmlInBalanced (not . isInlineTag) rawVerbatimBlock :: MarkdownParser String -rawVerbatimBlock = try $ do - (TagOpen tag _, open) <- htmlTag (tagOpen (flip elem - ["pre", "style", "script"]) - (const True)) - contents <- manyTill anyChar (htmlTag (~== TagClose tag)) - return $ open ++ contents ++ renderTags' [TagClose tag] +rawVerbatimBlock = htmlInBalanced isVerbTag + where isVerbTag (TagOpen "pre" _) = True + isVerbTag (TagOpen "style" _) = True + isVerbTag (TagOpen "script" _) = True + isVerbTag _ = False rawTeXBlock :: MarkdownParser (F Blocks) rawTeXBlock = do @@ -1356,16 +1354,18 @@ pipeTable = try $ do nonindentSpaces lookAhead nonspaceChar (heads,(aligns, seplengths)) <- (,) <$> pipeTableRow <*> pipeBreak + let heads' = take (length aligns) <$> heads lines' <- many pipeTableRow + let lines'' = map (take (length aligns) <$>) lines' let maxlength = maximum $ - map (\x -> length . stringify $ runF x def) (heads : lines') + map (\x -> length . stringify $ runF x def) (heads' : lines'') numColumns <- getOption readerColumns let widths = if maxlength > numColumns then map (\len -> fromIntegral (len + 1) / fromIntegral numColumns) seplengths else replicate (length aligns) 0.0 - return $ (aligns, widths, heads, sequence lines') + return $ (aligns, widths, heads', sequence lines'') sepPipe :: MarkdownParser () sepPipe = try $ do @@ -1374,25 +1374,27 @@ sepPipe = try $ do -- parse a row, also returning probable alignments for org-table cells pipeTableRow :: MarkdownParser (F [Blocks]) -pipeTableRow = do +pipeTableRow = try $ do + scanForPipe skipMany spaceChar openPipe <- (True <$ char '|') <|> return False - let cell = mconcat <$> - many (notFollowedBy (blankline <|> char '|') >> inline) - first <- cell - rest <- many $ sepPipe *> cell + -- split into cells + let chunk = void (code <|> rawHtmlInline <|> escapedChar <|> rawLaTeXInline') + <|> void (noneOf "|\n\r") + let cellContents = ((trim . snd) <$> withRaw (many chunk)) >>= + parseFromString pipeTableCell + cells <- cellContents `sepEndBy1` (char '|') -- surrounding pipes needed for a one-column table: - guard $ not (null rest && not openPipe) - optional (char '|') + guard $ not (length cells == 1 && not openPipe) blankline - let cells = sequence (first:rest) - return $ do - cells' <- cells - return $ map - (\ils -> - case trimInlines ils of - ils' | B.isNull ils' -> mempty - | otherwise -> B.plain $ ils') cells' + return $ sequence cells + +pipeTableCell :: MarkdownParser (F Blocks) +pipeTableCell = do + result <- many inline + if null result + then return mempty + else return $ B.plain . mconcat <$> sequence result pipeTableHeaderPart :: Parser [Char] st (Alignment, Int) pipeTableHeaderPart = try $ do @@ -1554,7 +1556,7 @@ math :: MarkdownParser (F Inlines) math = (return . B.displayMath <$> (mathDisplay >>= applyMacros')) <|> (return . B.math <$> (mathInline >>= applyMacros')) <+?> ((getOption readerSmart >>= guard) *> (return <$> apostrophe) - <* notFollowedBy space) + <* notFollowedBy (space <|> satisfy isPunctuation)) -- Parses material enclosed in *s, **s, _s, or __s. -- Designed to avoid backtracking. diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs index d29ec50e7..950497992 100644 --- a/src/Text/Pandoc/Readers/MediaWiki.hs +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -52,6 +52,7 @@ import Text.HTML.TagSoup import Data.Sequence (viewl, ViewL(..), (<|)) import qualified Data.Foldable as F import qualified Data.Map as M +import qualified Data.Set as Set import Data.Char (isDigit, isSpace) import Data.Maybe (fromMaybe) import Text.Printf (printf) @@ -69,7 +70,7 @@ readMediaWiki opts s = , mwNextLinkNumber = 1 , mwCategoryLinks = [] , mwHeaderMap = M.empty - , mwIdentifierList = [] + , mwIdentifierList = Set.empty } (s ++ "\n") @@ -78,7 +79,7 @@ data MWState = MWState { mwOptions :: ReaderOptions , mwNextLinkNumber :: Int , mwCategoryLinks :: [Inlines] , mwHeaderMap :: M.Map Inlines String - , mwIdentifierList :: [String] + , mwIdentifierList :: Set.Set String } type MWParser = Parser [Char] MWState diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs index 1f1c57646..8c475eefc 100644 --- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs +++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs @@ -61,6 +61,7 @@ import Text.Pandoc.Readers.Odt.Generic.XMLConverter import Text.Pandoc.Readers.Odt.Generic.Fallible import Text.Pandoc.Readers.Odt.Generic.Utils +import qualified Data.Set as Set -------------------------------------------------------------------------------- -- State @@ -221,7 +222,7 @@ getPrettyAnchor = proc (baseIdent, uglyAnchor) -> do getHeaderAnchor :: OdtReaderSafe Inlines Anchor getHeaderAnchor = proc title -> do state <- getExtraState -< () - let anchor = uniqueIdent (toList title) (usedAnchors state) + let anchor = uniqueIdent (toList title) (Set.fromList $ usedAnchors state) modifyExtraState (putPrettyAnchor anchor anchor) -<< anchor diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index c7906618c..7dd611be3 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -2,7 +2,7 @@ {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE MultiParamTypeClasses, FlexibleContexts, FlexibleInstances #-} {- -Copyright (C) 2014-2015 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> +Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -21,7 +21,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Org - Copyright : Copyright (C) 2014-2015 Albert Krewinkel + Copyright : Copyright (C) 2014-2016 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -53,6 +53,7 @@ import Data.Char (isAlphaNum, toLower) import Data.Default import Data.List (intersperse, isPrefixOf, isSuffixOf) import qualified Data.Map as M +import qualified Data.Set as Set import Data.Maybe (fromMaybe, isJust) import Network.HTTP (urlEncode) @@ -144,7 +145,7 @@ data OrgParserState = OrgParserState , orgStateMeta' :: F Meta , orgStateNotes' :: OrgNoteTable , orgStateParserContext :: ParserContext - , orgStateIdentifiers :: [String] + , orgStateIdentifiers :: Set.Set String , orgStateHeaderMap :: M.Map Inlines String } @@ -186,7 +187,7 @@ defaultOrgParserState = OrgParserState , orgStateMeta' = return nullMeta , orgStateNotes' = [] , orgStateParserContext = NullState - , orgStateIdentifiers = [] + , orgStateIdentifiers = Set.empty , orgStateHeaderMap = M.empty } @@ -628,7 +629,7 @@ figure = try $ do maybeNam <- lookupBlockAttribute "name" guard $ isJust maybeCap || isJust maybeNam return ( fromMaybe mempty maybeCap - , maybe mempty withFigPrefix maybeNam ) + , withFigPrefix $ fromMaybe mempty maybeNam ) withFigPrefix cs = if "fig:" `isPrefixOf` cs then cs @@ -1238,37 +1239,37 @@ applyCustomLinkFormat link = do formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters return $ maybe link ($ drop 1 rest) formatter --- TODO: might be a lot smarter/cleaner to use parsec and ADTs for this kind --- of parsing. +-- | Take a link and return a function which produces new inlines when given +-- description inlines. linkToInlinesF :: String -> Inlines -> F Inlines -linkToInlinesF s = +linkToInlinesF linkStr = + case linkStr of + "" -> pure . B.link mempty "" -- wiki link (empty by convention) + ('#':_) -> pure . B.link linkStr "" -- document-local fraction + _ -> case cleanLinkString linkStr of + (Just cleanedLink) -> if isImageFilename cleanedLink + then const . pure $ B.image cleanedLink "" "" + else pure . B.link cleanedLink "" + Nothing -> internalLink linkStr -- other internal link + +-- | Cleanup and canonicalize a string describing a link. Return @Nothing@ if +-- the string does not appear to be a link. +cleanLinkString :: String -> Maybe String +cleanLinkString s = case s of - "" -> pure . B.link "" "" - ('#':_) -> pure . B.link s "" - _ | isImageFilename s -> const . pure $ B.image s "" "" - _ | isFileLink s -> pure . B.link (dropLinkType s) "" - _ | isUri s -> pure . B.link s "" - _ | isAbsoluteFilePath s -> pure . B.link ("file://" ++ s) "" - _ | isRelativeFilePath s -> pure . B.link s "" - _ -> internalLink s - -isFileLink :: String -> Bool -isFileLink s = ("file:" `isPrefixOf` s) && not ("file://" `isPrefixOf` s) - -dropLinkType :: String -> String -dropLinkType = tail . snd . break (== ':') - -isRelativeFilePath :: String -> Bool -isRelativeFilePath s = (("./" `isPrefixOf` s) || ("../" `isPrefixOf` s)) && - (':' `notElem` s) - -isUri :: String -> Bool -isUri s = let (scheme, path) = break (== ':') s - in all (\c -> isAlphaNum c || c `elem` (".-" :: String)) scheme - && not (null path) - -isAbsoluteFilePath :: String -> Bool -isAbsoluteFilePath = ('/' ==) . head + '/':_ -> Just $ "file://" ++ s -- absolute path + '.':'/':_ -> Just s -- relative path + '.':'.':'/':_ -> Just s -- relative path + -- Relative path or URL (file schema) + 'f':'i':'l':'e':':':s' -> Just $ if ("//" `isPrefixOf` s') then s else s' + _ | isUrl s -> Just s -- URL + _ -> Nothing + where + isUrl :: String -> Bool + isUrl cs = + let (scheme, path) = break (== ':') cs + in all (\c -> isAlphaNum c || c `elem` (".-"::String)) scheme + && not (null path) isImageFilename :: String -> Bool isImageFilename filename = diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index dd1d289a3..6f64540f8 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -583,7 +583,18 @@ code2 = do -- | Html / CSS attributes attributes :: Parser [Char] ParserState Attr -attributes = (foldl (flip ($)) ("",[],[])) `fmap` many attribute +attributes = (foldl (flip ($)) ("",[],[])) <$> + try (do special <- option id specialAttribute + attrs <- many attribute + return (special : attrs)) + +specialAttribute :: Parser [Char] ParserState (Attr -> Attr) +specialAttribute = do + alignStr <- ("center" <$ char '=') <|> + ("justify" <$ try (string "<>")) <|> + ("right" <$ char '>') <|> + ("left" <$ char '<') + return $ addStyle ("text-align:" ++ alignStr) attribute :: Parser [Char] ParserState (Attr -> Attr) attribute = classIdAttr <|> styleAttr <|> langAttr @@ -602,7 +613,13 @@ classIdAttr = try $ do -- (class class #id) styleAttr :: Parser [Char] ParserState (Attr -> Attr) styleAttr = do style <- try $ enclosed (char '{') (char '}') anyChar' - return $ \(id',classes,keyvals) -> (id',classes,("style",style):keyvals) + return $ addStyle style + +addStyle :: String -> Attr -> Attr +addStyle style (id',classes,keyvals) = + (id',classes,keyvals') + where keyvals' = ("style", style') : [(k,v) | (k,v) <- keyvals, k /= "style"] + style' = style ++ ";" ++ concat [v | ("style",v) <- keyvals] langAttr :: Parser [Char] ParserState (Attr -> Attr) langAttr = do |