diff options
| author | Clare Macrae <github@cfmacrae.fastmail.co.uk> | 2014-06-29 19:22:31 +0100 |
|---|---|---|
| committer | Clare Macrae <github@cfmacrae.fastmail.co.uk> | 2014-06-29 19:22:31 +0100 |
| commit | 717e16660d1ee83f690b35d0aa9b60c8ac9d6b61 (patch) | |
| tree | aa850d4ee99fa0b14da9ba0396ba6aa67e2037e3 /src/Text/Pandoc/Readers | |
| parent | fccfc8429cf4d002df37977f03508c9aae457416 (diff) | |
| parent | ce69021e42d7bf50deccba2a52ed4717f6ddac10 (diff) | |
| download | pandoc-717e16660d1ee83f690b35d0aa9b60c8ac9d6b61.tar.gz | |
Merge remote-tracking branch 'jgm/master' into dokuwiki
Diffstat (limited to 'src/Text/Pandoc/Readers')
| -rw-r--r-- | src/Text/Pandoc/Readers/DocBook.hs | 140 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 489 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx/Lists.hs | 227 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 596 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx/Reducible.hs | 181 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 343 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Haddock.hs | 139 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Haddock/Lex.x | 171 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Haddock/Parse.y | 178 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/LaTeX.hs | 719 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 500 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/MediaWiki.hs | 107 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Native.hs | 4 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/OPML.hs | 2 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Org.hs | 1379 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/RST.hs | 163 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/TeXMath.hs | 110 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Textile.hs | 364 |
18 files changed, 4446 insertions, 1366 deletions
diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs index 0058e889c..cf1d5132e 100644 --- a/src/Text/Pandoc/Readers/DocBook.hs +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -1,16 +1,18 @@ module Text.Pandoc.Readers.DocBook ( readDocBook ) where -import Data.Char (toUpper, isDigit) +import Data.Char (toUpper) +import Text.Pandoc.Shared (safeRead) import Text.Pandoc.Options import Text.Pandoc.Definition import Text.Pandoc.Builder import Text.XML.Light -import Text.HTML.TagSoup.Entity (lookupEntity) +import Text.Pandoc.Compat.TagSoupEntity (lookupEntity) import Data.Generics import Data.Monoid import Data.Char (isSpace) import Control.Monad.State import Control.Applicative ((<$>)) import Data.List (intersperse) +import Data.Maybe (fromMaybe) {- @@ -43,7 +45,7 @@ List of all DocBook tags, with [x] indicating implemented, [ ] audioobject - A wrapper for audio data and its associated meta-information [x] author - The name of an individual author [ ] authorblurb - A short description or note about an author -[ ] authorgroup - Wrapper for author information when a document has +[x] authorgroup - Wrapper for author information when a document has multiple authors or collabarators [x] authorinitials - The initials or other short identifier for an author [o] beginpage - The location of a page break in a print version of the document @@ -339,7 +341,7 @@ List of all DocBook tags, with [x] indicating implemented, [x] refsectioninfo - Meta-information for a refsection [ ] refsynopsisdiv - A syntactic synopsis of the subject of the reference page [ ] refsynopsisdivinfo - Meta-information for a RefSynopsisDiv -[ ] releaseinfo - Information about a particular release of a document +[x] releaseinfo - Information about a particular release of a document [ ] remark - A remark (or comment) intended for presentation in a draft manuscript [ ] replaceable - Content that may or must be replaced by the user @@ -490,34 +492,40 @@ List of all DocBook tags, with [x] indicating implemented, anything else [ ] xref - A cross reference to another part of the document [ ] year - The year of publication of a document - +[x] ?asciidoc-br? - line break from asciidoc docbook output -} type DB = State DBState data DBState = DBState{ dbSectionLevel :: Int , dbQuoteType :: QuoteType - , dbDocTitle :: Inlines - , dbDocAuthors :: [Inlines] - , dbDocDate :: Inlines + , dbMeta :: Meta + , dbAcceptsMeta :: Bool , dbBook :: Bool , dbFigureTitle :: Inlines } deriving Show readDocBook :: ReaderOptions -> String -> Pandoc -readDocBook _ inp = setTitle (dbDocTitle st') - $ setAuthors (dbDocAuthors st') - $ setDate (dbDocDate st') - $ doc $ mconcat bs - where (bs, st') = runState (mapM parseBlock $ normalizeTree $ parseXML inp) +readDocBook _ inp = Pandoc (dbMeta st') (toList $ mconcat bs) + where (bs, st') = runState (mapM parseBlock $ normalizeTree $ parseXML inp') DBState{ dbSectionLevel = 0 , dbQuoteType = DoubleQuote - , dbDocTitle = mempty - , dbDocAuthors = [] - , dbDocDate = mempty + , dbMeta = mempty + , dbAcceptsMeta = False , dbBook = False , dbFigureTitle = mempty } + inp' = handleInstructions inp + +-- We treat <?asciidoc-br?> specially (issue #1236), converting it +-- to <br/>, since xml-light doesn't parse the instruction correctly. +-- Other xml instructions are simply removed from the input stream. +handleInstructions :: String -> String +handleInstructions ('<':'?':'a':'s':'c':'i':'i':'d':'o':'c':'-':'b':'r':'?':'>':xs) = '<':'b':'r':'/':'>': handleInstructions xs +handleInstructions xs = case break (=='<') xs of + (ys, []) -> ys + ([], '<':zs) -> '<' : handleInstructions zs + (ys, zs) -> ys ++ handleInstructions zs getFigure :: Element -> DB Blocks getFigure e = do @@ -558,6 +566,30 @@ attrValue attr elt = named :: String -> Element -> Bool named s e = qName (elName e) == s +-- + +acceptingMetadata :: DB a -> DB a +acceptingMetadata p = do + modify (\s -> s { dbAcceptsMeta = True } ) + res <- p + modify (\s -> s { dbAcceptsMeta = False }) + return res + +checkInMeta :: Monoid a => DB () -> DB a +checkInMeta p = do + accepts <- dbAcceptsMeta <$> get + when accepts p + return mempty + + + +addMeta :: ToMetaValue a => String -> a -> DB () +addMeta field val = modify (setMeta field val) + +instance HasMeta DBState where + setMeta field v s = s {dbMeta = setMeta field v (dbMeta s)} + deleteMeta field s = s {dbMeta = deleteMeta field (dbMeta s)} + isBlockElement :: Content -> Bool isBlockElement (Elem e) = qName (elName e) `elem` blocktags where blocktags = ["toc","index","para","formalpara","simpara", @@ -604,6 +636,7 @@ getImage e = do getBlocks :: Element -> DB Blocks getBlocks e = mconcat <$> (mapM parseBlock $ elContent e) + parseBlock :: Content -> DB Blocks parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE parseBlock (Text (CData _ s _)) = if all isSpace s @@ -617,10 +650,10 @@ parseBlock (Elem e) = "para" -> parseMixed para (elContent e) "formalpara" -> do tit <- case filterChild (named "title") e of - Just t -> (<> str "." <> linebreak) <$> emph - <$> getInlines t + Just t -> (para . strong . (<> str ".")) <$> + getInlines t Nothing -> return mempty - addToStart tit <$> parseMixed para (elContent e) + (tit <>) <$> parseMixed para (elContent e) "simpara" -> parseMixed para (elContent e) "ackno" -> parseMixed para (elContent e) "epigraph" -> parseBlockquote @@ -628,7 +661,11 @@ parseBlock (Elem e) = "attribution" -> return mempty "titleabbrev" -> return mempty "authorinitials" -> return mempty - "title" -> return mempty -- handled by getTitle or sect or figure + "title" -> checkInMeta getTitle + "author" -> checkInMeta getAuthor + "authorgroup" -> checkInMeta getAuthorGroup + "releaseinfo" -> checkInMeta (getInlines e >>= addMeta "release") + "date" -> checkInMeta getDate "bibliography" -> sect 0 "bibliodiv" -> sect 1 "biblioentry" -> parseMixed para (elContent e) @@ -682,18 +719,17 @@ parseBlock (Elem e) = "lowerroman" -> LowerRoman "upperroman" -> UpperRoman _ -> Decimal - let start = case attrValue "override" <$> - filterElement (named "listitem") e of - Just x@(_:_) | all isDigit x -> read x - _ -> 1 + let start = fromMaybe 1 $ + (attrValue "override" <$> filterElement (named "listitem") e) + >>= safeRead orderedListWith (start,listStyle,DefaultDelim) <$> listitems "variablelist" -> definitionList <$> deflistitems "figure" -> getFigure e "mediaobject" -> para <$> getImage e "caption" -> return mempty - "info" -> getTitle >> getAuthors >> getDate >> return mempty - "articleinfo" -> getTitle >> getAuthors >> getDate >> return mempty + "info" -> metaBlock + "articleinfo" -> metaBlock "sectioninfo" -> return mempty -- keywords & other metadata "refsectioninfo" -> return mempty -- keywords & other metadata "refsect1info" -> return mempty -- keywords & other metadata @@ -707,10 +743,10 @@ parseBlock (Elem e) = "chapterinfo" -> return mempty -- keywords & other metadata "glossaryinfo" -> return mempty -- keywords & other metadata "appendixinfo" -> return mempty -- keywords & other metadata - "bookinfo" -> getTitle >> getAuthors >> getDate >> return mempty + "bookinfo" -> metaBlock "article" -> modify (\st -> st{ dbBook = False }) >> - getTitle >> getBlocks e - "book" -> modify (\st -> st{ dbBook = True }) >> getTitle >> getBlocks e + getBlocks e + "book" -> modify (\st -> st{ dbBook = True }) >> getBlocks e "table" -> parseTable "informaltable" -> parseTable "literallayout" -> codeBlockWithLang @@ -756,30 +792,25 @@ parseBlock (Elem e) = terms' <- mapM getInlines terms items' <- mapM getBlocks items return (mconcat $ intersperse (str "; ") terms', items') - getTitle = case filterChild (named "title") e of - Just t -> do - tit <- getInlines t - subtit <- case filterChild (named "subtitle") e of - Just s -> (text ": " <>) <$> - getInlines s - Nothing -> return mempty - modify $ \st -> st{dbDocTitle = tit <> subtit} - Nothing -> return () - getAuthors = do - auths <- mapM getInlines - $ filterChildren (named "author") e - modify $ \st -> st{dbDocAuthors = auths} - getDate = case filterChild (named "date") e of - Just t -> do - dat <- getInlines t - modify $ \st -> st{dbDocDate = dat} - Nothing -> return () + getTitle = do + tit <- getInlines e + subtit <- case filterChild (named "subtitle") e of + Just s -> (text ": " <>) <$> + getInlines s + Nothing -> return mempty + addMeta "title" (tit <> subtit) + + getAuthor = (:[]) <$> getInlines e >>= addMeta "author" + getAuthorGroup = do + let terms = filterChildren (named "author") e + mapM getInlines terms >>= addMeta "author" + getDate = getInlines e >>= addMeta "date" parseTable = do let isCaption x = named "title" x || named "caption" x caption <- case filterChild isCaption e of Just t -> getInlines t Nothing -> return mempty - let e' = maybe e id $ filterChild (named "tgroup") e + let e' = fromMaybe e $ filterChild (named "tgroup") e let isColspec x = named "colspec" x || named "col" x let colspecs = case filterChild (named "colgroup") e' of Just c -> filterChildren isColspec c @@ -801,11 +832,14 @@ parseBlock (Elem e) = Just "center" -> AlignCenter _ -> AlignDefault let toWidth c = case findAttr (unqual "colwidth") c of - Just w -> read $ filter (\x -> + Just w -> fromMaybe 0 + $ safeRead $ '0': filter (\x -> (x >= '0' && x <= '9') || x == '.') w Nothing -> 0 :: Double - let numrows = maximum $ map length bodyrows + let numrows = case bodyrows of + [] -> 0 + xs -> maximum $ map length xs let aligns = case colspecs of [] -> replicate numrows AlignDefault cs -> map toAlignment cs @@ -832,6 +866,7 @@ parseBlock (Elem e) = b <- getBlocks e modify $ \st -> st{ dbSectionLevel = n - 1 } return $ header n' headerText <> b + metaBlock = acceptingMetadata (getBlocks e) >> return mempty getInlines :: Element -> DB Inlines getInlines e' = (trimInlines . mconcat) <$> (mapM parseInline $ elContent e') @@ -895,6 +930,11 @@ parseInline (Elem e) = _ -> emph <$> innerInlines "footnote" -> (note . mconcat) <$> (mapM parseBlock $ elContent e) "title" -> return mempty + "affiliation" -> return mempty + -- Note: this isn't a real docbook tag; it's what we convert + -- <?asciidor-br?> to in handleInstructions, above. A kludge to + -- work around xml-light's inability to parse an instruction. + "br" -> return linebreak _ -> innerInlines where innerInlines = (trimInlines . mconcat) <$> (mapM parseInline $ elContent e) diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs new file mode 100644 index 000000000..71baa5dde --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -0,0 +1,489 @@ +{-# LANGUAGE PatternGuards #-} + +{- +Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Conversion of Docx type (defined in Text.Pandoc.Readers.Docx.Parse) +to 'Pandoc' document. -} + +{- +Current state of implementation of Docx entities ([x] means +implemented, [-] means partially implemented): + +* Blocks + + - [X] Para + - [X] CodeBlock (styled with `SourceCode`) + - [X] BlockQuote (styled with `Quote`, `BlockQuote`, or, optionally, + indented) + - [X] OrderedList + - [X] BulletList + - [X] DefinitionList (styled with adjacent `DefinitionTerm` and `Definition`) + - [X] Header (styled with `Heading#`) + - [ ] HorizontalRule + - [-] Table (column widths and alignments not yet implemented) + +* Inlines + + - [X] Str + - [X] Emph (From italics. `underline` currently read as span. In + future, it might optionally be emph as well) + - [X] Strong + - [X] Strikeout + - [X] Superscript + - [X] Subscript + - [X] SmallCaps + - [ ] Quoted + - [ ] Cite + - [X] Code (styled with `VerbatimChar`) + - [X] Space + - [X] LineBreak (these are invisible in Word: entered with Shift-Return) + - [ ] Math + - [X] Link (links to an arbitrary bookmark create a span with the target as + id and "anchor" class) + - [-] Image (Links to path in archive. Future option for + data-encoded URI likely.) + - [X] Note (Footnotes and Endnotes are silently combined.) +-} + +module Text.Pandoc.Readers.Docx + ( readDocx + ) where + +import Codec.Archive.Zip +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Builder (text, toList) +import Text.Pandoc.MIME (getMimeType) +import Text.Pandoc.UTF8 (toString) +import Text.Pandoc.Walk +import Text.Pandoc.Readers.Docx.Parse +import Text.Pandoc.Readers.Docx.Lists +import Text.Pandoc.Readers.Docx.Reducible +import Text.Pandoc.Shared +import Data.Maybe (mapMaybe) +import Data.List (delete, isPrefixOf, (\\)) +import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as B +import Data.ByteString.Base64 (encode) +import System.FilePath (combine) +import qualified Data.Map as M +import Control.Monad.Reader +import Control.Monad.State + +readDocx :: ReaderOptions + -> B.ByteString + -> Pandoc +readDocx opts bytes = + case archiveToDocx (toArchive bytes) of + Just docx -> Pandoc nullMeta (docxToBlocks opts docx) + Nothing -> error $ "couldn't parse docx file" + +data DState = DState { docxAnchorMap :: M.Map String String } + +data DEnv = DEnv { docxOptions :: ReaderOptions + , docxDocument :: Docx} + +type DocxContext = ReaderT DEnv (State DState) + +evalDocxContext :: DocxContext a -> DEnv -> DState -> a +evalDocxContext ctx env st = evalState (runReaderT ctx env) st + +concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b] +concatMapM f xs = liftM concat (mapM f xs) + +-- This is empty, but we put it in for future-proofing. +spansToKeep :: [String] +spansToKeep = [] + +divsToKeep :: [String] +divsToKeep = ["list-item", "Definition", "DefinitionTerm"] + +runStyleToContainers :: RunStyle -> [Container Inline] +runStyleToContainers rPr = + let spanClassToContainers :: String -> [Container Inline] + spanClassToContainers s | s `elem` codeSpans = + [Container $ (\ils -> Code ("", [], []) (concatMap ilToCode ils))] + spanClassToContainers s | s `elem` spansToKeep = + [Container $ Span ("", [s], [])] + spanClassToContainers _ = [] + + classContainers = case rStyle rPr of + Nothing -> [] + Just s -> spanClassToContainers s + + formatters = map Container $ mapMaybe id + [ if isBold rPr then (Just Strong) else Nothing + , if isItalic rPr then (Just Emph) else Nothing + , if isSmallCaps rPr then (Just SmallCaps) else Nothing + , if isStrike rPr then (Just Strikeout) else Nothing + , if isSuperScript rPr then (Just Superscript) else Nothing + , if isSubScript rPr then (Just Subscript) else Nothing + , underline rPr >>= (\f -> Just $ Span ("", [], [("underline", f)])) + ] + in + classContainers ++ formatters + + +divAttrToContainers :: [String] -> [(String, String)] -> [Container Block] +divAttrToContainers (c:cs) _ | Just n <- isHeaderClass c = + [Container $ \_ -> + Header n ("", delete ("Heading" ++ show n) cs, []) []] +divAttrToContainers (c:cs) kvs | c `elem` divsToKeep = + (Container $ Div ("", [c], [])) : (divAttrToContainers cs kvs) +divAttrToContainers (c:cs) kvs | c `elem` codeDivs = + -- This is a bit of a cludge. We make the codeblock from the raw + -- parparts in bodyPartToBlocks. But we need something to match against. + (Container $ \_ -> CodeBlock ("", [], []) "") : (divAttrToContainers cs kvs) +divAttrToContainers (c:cs) kvs | c `elem` listParagraphDivs = + let kvs' = filter (\(k,_) -> k /= "indent") kvs + in + (Container $ Div ("", [c], [])) : (divAttrToContainers cs kvs') +divAttrToContainers (c:cs) kvs | c `elem` blockQuoteDivs = + (Container BlockQuote) : (divAttrToContainers (cs \\ blockQuoteDivs) kvs) +divAttrToContainers (_:cs) kvs = divAttrToContainers cs kvs +divAttrToContainers [] kvs | Just numString <- lookup "indent" kvs = + let kvs' = filter (\(k,_) -> k /= "indent") kvs + in + case numString of + "0" -> divAttrToContainers [] kvs' + ('-' : _) -> divAttrToContainers [] kvs' + _ -> (Container BlockQuote) : divAttrToContainers [] kvs' +divAttrToContainers _ _ = [] + + +parStyleToContainers :: ParagraphStyle -> [Container Block] +parStyleToContainers pPr = + let classes = pStyle pPr + kvs = case indent pPr of + Just n -> [("indent", show n)] + Nothing -> [] + in + divAttrToContainers classes kvs + + +strToInlines :: String -> [Inline] +strToInlines = toList . text + +codeSpans :: [String] +codeSpans = ["VerbatimChar"] + +blockQuoteDivs :: [String] +blockQuoteDivs = ["Quote", "BlockQuote"] + +codeDivs :: [String] +codeDivs = ["SourceCode"] + +runElemToInlines :: RunElem -> [Inline] +runElemToInlines (TextRun s) = strToInlines s +runElemToInlines (LnBrk) = [LineBreak] +runElemToInlines (Tab) = [Space] + +runElemToString :: RunElem -> String +runElemToString (TextRun s) = s +runElemToString (LnBrk) = ['\n'] +runElemToString (Tab) = ['\t'] + +runElemsToString :: [RunElem] -> String +runElemsToString = concatMap runElemToString + +runToString :: Run -> String +runToString (Run _ runElems) = runElemsToString runElems +runToString _ = "" + +parPartToString :: ParPart -> String +parPartToString (PlainRun run) = runToString run +parPartToString (InternalHyperLink _ runs) = concatMap runToString runs +parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs +parPartToString _ = "" + + +inlineCodeContainer :: Container Inline -> Bool +inlineCodeContainer (Container f) = case f [] of + Code _ "" -> True + _ -> False +inlineCodeContainer _ = False + + +runToInlines :: Run -> DocxContext [Inline] +runToInlines (Run rs runElems) + | any inlineCodeContainer (runStyleToContainers rs) = + return $ + rebuild (runStyleToContainers rs) $ [Str $ runElemsToString runElems] + | otherwise = + return $ + rebuild (runStyleToContainers rs) (concatMap runElemToInlines runElems) +runToInlines (Footnote fnId) = do + (Docx _ notes _ _ _ ) <- asks docxDocument + case (getFootNote fnId notes) of + Just bodyParts -> do + blks <- concatMapM bodyPartToBlocks bodyParts + return $ [Note blks] + Nothing -> return [Note []] +runToInlines (Endnote fnId) = do + (Docx _ notes _ _ _ ) <- asks docxDocument + case (getEndNote fnId notes) of + Just bodyParts -> do + blks <- concatMapM bodyPartToBlocks bodyParts + return $ [Note blks] + Nothing -> return [Note []] + +parPartToInlines :: ParPart -> DocxContext [Inline] +parPartToInlines (PlainRun r) = runToInlines r +parPartToInlines (Insertion _ author date runs) = do + opts <- asks docxOptions + case readerTrackChanges opts of + AcceptChanges -> concatMapM runToInlines runs >>= return + RejectChanges -> return [] + AllChanges -> do + ils <- (concatMapM runToInlines runs) + return [Span + ("", ["insertion"], [("author", author), ("date", date)]) + ils] +parPartToInlines (Deletion _ author date runs) = do + opts <- asks docxOptions + case readerTrackChanges opts of + AcceptChanges -> return [] + RejectChanges -> concatMapM runToInlines runs >>= return + AllChanges -> do + ils <- concatMapM runToInlines runs + return [Span + ("", ["deletion"], [("author", author), ("date", date)]) + ils] +parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = return [] +parPartToInlines (BookMark _ anchor) = + -- We record these, so we can make sure not to overwrite + -- user-defined anchor links with header auto ids. + do + -- Get the anchor map. + anchorMap <- gets docxAnchorMap + -- Check to see if the id is already in there. Rewrite if + -- necessary. This will have the possible effect of rewriting + -- user-defined anchor links. However, since these are not defined + -- in pandoc, it seems like a necessary evil to avoid an extra + -- pass. + let newAnchor = case anchor `elem` (M.elems anchorMap) of + True -> uniqueIdent [Str anchor] (M.elems anchorMap) + False -> anchor + put DState{ docxAnchorMap = M.insert anchor newAnchor anchorMap} + return [Span (anchor, ["anchor"], []) []] +parPartToInlines (Drawing relid) = do + (Docx _ _ _ rels _) <- asks docxDocument + return $ case lookupRelationship relid rels of + Just target -> [Image [] (combine "word" target, "")] + Nothing -> [Image [] ("", "")] +parPartToInlines (InternalHyperLink anchor runs) = do + ils <- concatMapM runToInlines runs + return [Link ils ('#' : anchor, "")] +parPartToInlines (ExternalHyperLink relid runs) = do + (Docx _ _ _ rels _) <- asks docxDocument + rs <- concatMapM runToInlines runs + return $ case lookupRelationship relid rels of + Just target -> + [Link rs (target, "")] + Nothing -> + [Link rs ("", "")] + +isAnchorSpan :: Inline -> Bool +isAnchorSpan (Span (ident, classes, kvs) ils) = + (not . null) ident && + classes == ["anchor"] && + null kvs && + null ils +isAnchorSpan _ = False + +dummyAnchors :: [String] +dummyAnchors = ["_GoBack"] + +makeHeaderAnchor :: Block -> DocxContext Block +-- If there is an anchor already there (an anchor span in the header, +-- to be exact), we rename and associate the new id with the old one. +makeHeaderAnchor (Header n (_, classes, kvs) ils) + | (x : xs) <- filter isAnchorSpan ils + , (Span (ident, _, _) _) <- x + , notElem ident dummyAnchors = + do + hdrIDMap <- gets docxAnchorMap + let newIdent = uniqueIdent ils (M.elems hdrIDMap) + put DState{docxAnchorMap = M.insert ident newIdent hdrIDMap} + return $ Header n (newIdent, classes, kvs) (ils \\ (x:xs)) +-- Otherwise we just give it a name, and register that name (associate +-- it with itself.) +makeHeaderAnchor (Header n (_, classes, kvs) ils) = + do + hdrIDMap <- gets docxAnchorMap + let newIdent = uniqueIdent ils (M.elems hdrIDMap) + put DState{docxAnchorMap = M.insert newIdent newIdent hdrIDMap} + return $ Header n (newIdent, classes, kvs) ils +makeHeaderAnchor blk = return blk + + +parPartsToInlines :: [ParPart] -> DocxContext [Inline] +parPartsToInlines parparts = do + ils <- concatMapM parPartToInlines parparts >>= + -- TODO: Option for self-containted images + (if False then (walkM makeImagesSelfContained) else return) + return $ reduceList $ ils + +cellToBlocks :: Cell -> DocxContext [Block] +cellToBlocks (Cell bps) = concatMapM bodyPartToBlocks bps + +rowToBlocksList :: Row -> DocxContext [[Block]] +rowToBlocksList (Row cells) = mapM cellToBlocks cells + +isBlockCodeContainer :: Container Block -> Bool +isBlockCodeContainer (Container f) | CodeBlock _ _ <- f [] = True +isBlockCodeContainer _ = False + +isHeaderContainer :: Container Block -> Bool +isHeaderContainer (Container f) | Header _ _ _ <- f [] = True +isHeaderContainer _ = False + +bodyPartToBlocks :: BodyPart -> DocxContext [Block] +bodyPartToBlocks (Paragraph pPr parparts) + | any isBlockCodeContainer (parStyleToContainers pPr) = + let + otherConts = filter (not . isBlockCodeContainer) (parStyleToContainers pPr) + in + return $ + rebuild + otherConts + [CodeBlock ("", [], []) (concatMap parPartToString parparts)] +bodyPartToBlocks (Paragraph pPr parparts) + | any isHeaderContainer (parStyleToContainers pPr) = do + ils <- parPartsToInlines parparts >>= (return . normalizeSpaces) + let (Container hdrFun) = head $ filter isHeaderContainer (parStyleToContainers pPr) + Header n attr _ = hdrFun [] + hdr <- makeHeaderAnchor $ Header n attr ils + return [hdr] +bodyPartToBlocks (Paragraph pPr parparts) = do + ils <- parPartsToInlines parparts >>= (return . normalizeSpaces) + case ils of + [] -> return [] + _ -> do + return $ + rebuild + (parStyleToContainers pPr) + [Para ils] +bodyPartToBlocks (ListItem pPr numId lvl parparts) = do + (Docx _ _ numbering _ _) <- asks docxDocument + let + kvs = case lookupLevel numId lvl numbering of + Just (_, fmt, txt, Just start) -> [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + , ("start", (show start)) + ] + + Just (_, fmt, txt, Nothing) -> [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + ] + Nothing -> [] + blks <- bodyPartToBlocks (Paragraph pPr parparts) + return $ [Div ("", ["list-item"], kvs) blks] +bodyPartToBlocks (Tbl _ _ _ []) = + return [Para []] +bodyPartToBlocks (Tbl cap _ look (r:rs)) = do + let caption = strToInlines cap + (hdr, rows) = case firstRowFormatting look of + True -> (Just r, rs) + False -> (Nothing, r:rs) + hdrCells <- case hdr of + Just r' -> rowToBlocksList r' + Nothing -> return [] + + cells <- mapM rowToBlocksList rows + + let size = case null hdrCells of + True -> length $ head cells + False -> length $ hdrCells + -- + -- The two following variables (horizontal column alignment and + -- relative column widths) go to the default at the + -- moment. Width information is in the TblGrid field of the Tbl, + -- so should be possible. Alignment might be more difficult, + -- since there doesn't seem to be a column entity in docx. + alignments = replicate size AlignDefault + widths = replicate size 0 :: [Double] + + return [Table caption alignments widths hdrCells cells] + +-- replace targets with generated anchors. +rewriteLink :: Inline -> DocxContext Inline +rewriteLink l@(Link ils ('#':target, title)) = do + anchorMap <- gets docxAnchorMap + return $ case M.lookup target anchorMap of + Just newTarget -> (Link ils ('#':newTarget, title)) + Nothing -> l +rewriteLink il = return il + +makeImagesSelfContained :: Inline -> DocxContext Inline +makeImagesSelfContained i@(Image alt (uri, title)) = do + (Docx _ _ _ _ media) <- asks docxDocument + return $ case lookup uri media of + Just bs -> + case getMimeType uri of + Just mime -> + let data_uri = "data:" ++ mime ++ ";base64," ++ + toString (encode $ BS.concat $ B.toChunks bs) + in + Image alt (data_uri, title) + Nothing -> i + Nothing -> i +makeImagesSelfContained inline = return inline + +bodyToBlocks :: Body -> DocxContext [Block] +bodyToBlocks (Body bps) = do + blks <- concatMapM bodyPartToBlocks bps >>= + walkM rewriteLink + return $ + blocksToDefinitions $ + blocksToBullets $ blks + +docxToBlocks :: ReaderOptions -> Docx -> [Block] +docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = + let dState = DState { docxAnchorMap = M.empty } + dEnv = DEnv { docxOptions = opts + , docxDocument = d} + in + evalDocxContext (bodyToBlocks body) dEnv dState + +ilToCode :: Inline -> String +ilToCode (Str s) = s +ilToCode Space = " " +ilToCode _ = "" + +isHeaderClass :: String -> Maybe Int +isHeaderClass s | "Heading" `isPrefixOf` s = + case reads (drop (length "Heading") s) :: [(Int, String)] of + [] -> Nothing + ((n, "") : []) -> Just n + _ -> Nothing +isHeaderClass _ = Nothing diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs new file mode 100644 index 000000000..1e37d0076 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Lists.hs @@ -0,0 +1,227 @@ +{- +Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Lists + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Functions for converting flat docx paragraphs into nested lists. +-} + +module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets + , blocksToDefinitions + , listParagraphDivs + ) where + +import Text.Pandoc.JSON +import Text.Pandoc.Generic (bottomUp) +import Text.Pandoc.Shared (trim) +import Control.Monad +import Data.List +import Data.Maybe + +isListItem :: Block -> Bool +isListItem (Div (_, classes, _) _) | "list-item" `elem` classes = True +isListItem _ = False + +getLevel :: Block -> Maybe Integer +getLevel (Div (_, _, kvs) _) = liftM read $ lookup "level" kvs +getLevel _ = Nothing + +getLevelN :: Block -> Integer +getLevelN b = case getLevel b of + Just n -> n + Nothing -> -1 + +getNumId :: Block -> Maybe Integer +getNumId (Div (_, _, kvs) _) = liftM read $ lookup "num-id" kvs +getNumId _ = Nothing + +getNumIdN :: Block -> Integer +getNumIdN b = case getNumId b of + Just n -> n + Nothing -> -1 + +getText :: Block -> Maybe String +getText (Div (_, _, kvs) _) = lookup "text" kvs +getText _ = Nothing + +data ListType = Itemized | Enumerated ListAttributes + +listStyleMap :: [(String, ListNumberStyle)] +listStyleMap = [("upperLetter", UpperAlpha), + ("lowerLetter", LowerAlpha), + ("upperRoman", UpperRoman), + ("lowerRoman", LowerRoman), + ("decimal", Decimal)] + +listDelimMap :: [(String, ListNumberDelim)] +listDelimMap = [("%1)", OneParen), + ("(%1)", TwoParens), + ("%1.", Period)] + +getListType :: Block -> Maybe ListType +getListType b@(Div (_, _, kvs) _) | isListItem b = + let + start = lookup "start" kvs + frmt = lookup "format" kvs + txt = lookup "text" kvs + in + case frmt of + Just "bullet" -> Just Itemized + Just f -> + case txt of + Just t -> Just $ Enumerated ( + read (fromMaybe "1" start) :: Int, + fromMaybe DefaultStyle (lookup f listStyleMap), + fromMaybe DefaultDelim (lookup t listDelimMap)) + Nothing -> Nothing + _ -> Nothing +getListType _ = Nothing + +listParagraphDivs :: [String] +listParagraphDivs = ["ListParagraph"] + +-- This is a first stab at going through and attaching meaning to list +-- paragraphs, without an item marker, following a list item. We +-- assume that these are paragraphs in the same item. + +handleListParagraphs :: [Block] -> [Block] +handleListParagraphs [] = [] +handleListParagraphs ( + (Div attr1@(_, classes1, _) blks1) : + (Div (ident2, classes2, kvs2) blks2) : + blks + ) | "list-item" `elem` classes1 && + not ("list-item" `elem` classes2) && + (not . null) (listParagraphDivs `intersect` classes2) = + -- We don't want to keep this indent. + let newDiv2 = + (Div (ident2, classes2, filter (\kv -> fst kv /= "indent") kvs2) blks2) + in + handleListParagraphs ((Div attr1 (blks1 ++ [newDiv2])) : blks) +handleListParagraphs (blk:blks) = blk : (handleListParagraphs blks) + +separateBlocks' :: Block -> [[Block]] -> [[Block]] +separateBlocks' blk ([] : []) = [[blk]] +separateBlocks' b@(BulletList _) acc = (init acc) ++ [(last acc) ++ [b]] +separateBlocks' b@(OrderedList _ _) acc = (init acc) ++ [(last acc) ++ [b]] +-- The following is for the invisible bullet lists. This is how +-- pandoc-generated ooxml does multiparagraph item lists. +separateBlocks' b acc | liftM trim (getText b) == Just "" = + (init acc) ++ [(last acc) ++ [b]] +separateBlocks' b acc = acc ++ [[b]] + +separateBlocks :: [Block] -> [[Block]] +separateBlocks blks = foldr separateBlocks' [[]] (reverse blks) + +flatToBullets' :: Integer -> [Block] -> [Block] +flatToBullets' _ [] = [] +flatToBullets' num xs@(b : elems) + | getLevelN b == num = b : (flatToBullets' num elems) + | otherwise = + let bNumId = getNumIdN b + bLevel = getLevelN b + (children, remaining) = + span + (\b' -> + ((getLevelN b') > bLevel || + ((getLevelN b') == bLevel && (getNumIdN b') == bNumId))) + xs + in + case getListType b of + Just (Enumerated attr) -> + (OrderedList attr (separateBlocks $ flatToBullets' bLevel children)) : + (flatToBullets' num remaining) + _ -> + (BulletList (separateBlocks $ flatToBullets' bLevel children)) : + (flatToBullets' num remaining) + +flatToBullets :: [Block] -> [Block] +flatToBullets elems = flatToBullets' (-1) elems + +blocksToBullets :: [Block] -> [Block] +blocksToBullets blks = + bottomUp removeListDivs $ + flatToBullets $ (handleListParagraphs blks) + +plainParaInlines :: Block -> [Inline] +plainParaInlines (Plain ils) = ils +plainParaInlines (Para ils) = ils +plainParaInlines _ = [] + +blocksToDefinitions' :: [([Inline], [[Block]])] -> [Block] -> [Block] -> [Block] +blocksToDefinitions' [] acc [] = reverse acc +blocksToDefinitions' defAcc acc [] = + reverse $ (DefinitionList (reverse defAcc)) : acc +blocksToDefinitions' defAcc acc + ((Div (_, classes1, _) blks1) : (Div (ident2, classes2, kvs2) blks2) : blks) + | "DefinitionTerm" `elem` classes1 && "Definition" `elem` classes2 = + let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) + pair = case remainingAttr2 == ("", [], []) of + True -> (concatMap plainParaInlines blks1, [blks2]) + False -> (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]]) + in + blocksToDefinitions' (pair : defAcc) acc blks +blocksToDefinitions' defAcc acc + ((Div (ident2, classes2, kvs2) blks2) : blks) + | (not . null) defAcc && "Definition" `elem` classes2 = + let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) + defItems2 = case remainingAttr2 == ("", [], []) of + True -> blks2 + False -> [Div remainingAttr2 blks2] + ((defTerm, defItems):defs) = defAcc + defAcc' = case null defItems of + True -> (defTerm, [defItems2]) : defs + False -> (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs + in + blocksToDefinitions' defAcc' acc blks +blocksToDefinitions' [] acc (b:blks) = + blocksToDefinitions' [] (b:acc) blks +blocksToDefinitions' defAcc acc (b:blks) = + blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks + +removeListDivs' :: Block -> [Block] +removeListDivs' (Div (ident, classes, kvs) blks) + | "list-item" `elem` classes = + case delete "list-item" classes of + [] -> blks + classes' -> [Div (ident, classes', kvs) $ blks] +removeListDivs' (Div (ident, classes, kvs) blks) + | not $ null $ listParagraphDivs `intersect` classes = + case classes \\ listParagraphDivs of + [] -> blks + classes' -> [Div (ident, classes', kvs) blks] +removeListDivs' blk = [blk] + +removeListDivs :: [Block] -> [Block] +removeListDivs = concatMap removeListDivs' + + + +blocksToDefinitions :: [Block] -> [Block] +blocksToDefinitions = blocksToDefinitions' [] [] + + + + diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs new file mode 100644 index 000000000..07f34450d --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -0,0 +1,596 @@ +{- +Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Parse + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Conversion of docx archive into Docx haskell type +-} + + +module Text.Pandoc.Readers.Docx.Parse ( Docx(..) + , Document(..) + , Body(..) + , BodyPart(..) + , TblLook(..) + , ParPart(..) + , Run(..) + , RunElem(..) + , Notes + , Numbering + , Relationship + , Media + , RunStyle(..) + , ParagraphStyle(..) + , Row(..) + , Cell(..) + , getFootNote + , getEndNote + , lookupLevel + , lookupRelationship + , archiveToDocx + ) where +import Codec.Archive.Zip +import Text.XML.Light +import Data.Maybe +import Data.List +import System.FilePath +import Data.Bits ((.|.)) +import qualified Data.ByteString.Lazy as B +import qualified Text.Pandoc.UTF8 as UTF8 + +attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) +attrToNSPair _ = Nothing + + +type NameSpaces = [(String, String)] + +data Docx = Docx Document Notes Numbering [Relationship] Media + deriving Show + +archiveToDocx :: Archive -> Maybe Docx +archiveToDocx archive = do + let notes = archiveToNotes archive + rels = archiveToRelationships archive + media = archiveToMedia archive + doc <- archiveToDocument archive + numbering <- archiveToNumbering archive + return $ Docx doc notes numbering rels media + +data Document = Document NameSpaces Body + deriving Show + +archiveToDocument :: Archive -> Maybe Document +archiveToDocument zf = do + entry <- findEntryByPath "word/document.xml" zf + docElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + let namespaces = mapMaybe attrToNSPair (elAttribs docElem) + bodyElem <- findChild (QName "body" (lookup "w" namespaces) Nothing) docElem + body <- elemToBody namespaces bodyElem + return $ Document namespaces body + +type Media = [(FilePath, B.ByteString)] + +filePathIsMedia :: FilePath -> Bool +filePathIsMedia fp = + let (dir, _) = splitFileName fp + in + (dir == "word/media/") + +getMediaPair :: Archive -> FilePath -> Maybe (FilePath, B.ByteString) +getMediaPair zf fp = + case findEntryByPath fp zf of + Just e -> Just (fp, fromEntry e) + Nothing -> Nothing + +archiveToMedia :: Archive -> Media +archiveToMedia zf = + mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf)) + +data Numbering = Numbering NameSpaces [Numb] [AbstractNumb] + deriving Show + +data Numb = Numb String String -- right now, only a key to an abstract num + deriving Show + +data AbstractNumb = AbstractNumb String [Level] + deriving Show + +-- (ilvl, format, string, start) +type Level = (String, String, String, Maybe Integer) + +lookupLevel :: String -> String -> Numbering -> Maybe Level +lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do + absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs + lvls <- lookup absNumId $ map (\(AbstractNumb aid ls) -> (aid, ls)) absNumbs + lvl <- lookup ilvl $ map (\l@(i, _, _, _) -> (i, l)) lvls + return lvl + +numElemToNum :: NameSpaces -> Element -> Maybe Numb +numElemToNum ns element | + qName (elName element) == "num" && + qURI (elName element) == (lookup "w" ns) = do + numId <- findAttr (QName "numId" (lookup "w" ns) (Just "w")) element + absNumId <- findChild (QName "abstractNumId" (lookup "w" ns) (Just "w")) element + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + return $ Numb numId absNumId +numElemToNum _ _ = Nothing + +absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb +absNumElemToAbsNum ns element | + qName (elName element) == "abstractNum" && + qURI (elName element) == (lookup "w" ns) = do + absNumId <- findAttr + (QName "abstractNumId" (lookup "w" ns) (Just "w")) + element + let levelElems = findChildren + (QName "lvl" (lookup "w" ns) (Just "w")) + element + levels = mapMaybe (levelElemToLevel ns) levelElems + return $ AbstractNumb absNumId levels +absNumElemToAbsNum _ _ = Nothing + +levelElemToLevel :: NameSpaces -> Element -> Maybe Level +levelElemToLevel ns element | + qName (elName element) == "lvl" && + qURI (elName element) == (lookup "w" ns) = do + ilvl <- findAttr (QName "ilvl" (lookup "w" ns) (Just "w")) element + fmt <- findChild (QName "numFmt" (lookup "w" ns) (Just "w")) element + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + txt <- findChild (QName "lvlText" (lookup "w" ns) (Just "w")) element + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + let start = findChild (QName "start" (lookup "w" ns) (Just "w")) element + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) + return (ilvl, fmt, txt, start) +levelElemToLevel _ _ = Nothing + +archiveToNumbering :: Archive -> Maybe Numbering +archiveToNumbering zf = + case findEntryByPath "word/numbering.xml" zf of + Nothing -> Just $ Numbering [] [] [] + Just entry -> do + numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + let namespaces = mapMaybe attrToNSPair (elAttribs numberingElem) + numElems = findChildren + (QName "num" (lookup "w" namespaces) (Just "w")) + numberingElem + absNumElems = findChildren + (QName "abstractNum" (lookup "w" namespaces) (Just "w")) + numberingElem + nums = mapMaybe (numElemToNum namespaces) numElems + absNums = mapMaybe (absNumElemToAbsNum namespaces) absNumElems + return $ Numbering namespaces nums absNums + +data Notes = Notes NameSpaces (Maybe [(String, [BodyPart])]) (Maybe [(String, [BodyPart])]) + deriving Show + +noteElemToNote :: NameSpaces -> Element -> Maybe (String, [BodyPart]) +noteElemToNote ns element + | qName (elName element) `elem` ["endnote", "footnote"] && + qURI (elName element) == (lookup "w" ns) = + do + noteId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element + let bps = mapMaybe (elemToBodyPart ns) + $ elChildren element + return $ (noteId, bps) +noteElemToNote _ _ = Nothing + +getFootNote :: String -> Notes -> Maybe [BodyPart] +getFootNote s (Notes _ fns _) = fns >>= (lookup s) + +getEndNote :: String -> Notes -> Maybe [BodyPart] +getEndNote s (Notes _ _ ens) = ens >>= (lookup s) + +elemToNotes :: NameSpaces -> String -> Element -> Maybe [(String, [BodyPart])] +elemToNotes ns notetype element + | qName (elName element) == (notetype ++ "s") && + qURI (elName element) == (lookup "w" ns) = + Just $ mapMaybe (noteElemToNote ns) + $ findChildren (QName notetype (lookup "w" ns) (Just "w")) element +elemToNotes _ _ _ = Nothing + +archiveToNotes :: Archive -> Notes +archiveToNotes zf = + let fnElem = findEntryByPath "word/footnotes.xml" zf + >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + enElem = findEntryByPath "word/endnotes.xml" zf + >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + fn_namespaces = case fnElem of + Just e -> mapMaybe attrToNSPair (elAttribs e) + Nothing -> [] + en_namespaces = case enElem of + Just e -> mapMaybe attrToNSPair (elAttribs e) + Nothing -> [] + ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces + fn = fnElem >>= (elemToNotes ns "footnote") + en = enElem >>= (elemToNotes ns "endnote") + in + Notes ns fn en + + +data Relationship = Relationship (RelId, Target) + deriving Show + +lookupRelationship :: RelId -> [Relationship] -> Maybe Target +lookupRelationship relid rels = + lookup relid (map (\(Relationship pair) -> pair) rels) + +filePathIsRel :: FilePath -> Bool +filePathIsRel fp = + let (dir, name) = splitFileName fp + in + (dir == "word/_rels/") && ((takeExtension name) == ".rels") + +relElemToRelationship :: Element -> Maybe Relationship +relElemToRelationship element | qName (elName element) == "Relationship" = + do + relId <- findAttr (QName "Id" Nothing Nothing) element + target <- findAttr (QName "Target" Nothing Nothing) element + return $ Relationship (relId, target) +relElemToRelationship _ = Nothing + + +archiveToRelationships :: Archive -> [Relationship] +archiveToRelationships archive = + let relPaths = filter filePathIsRel (filesInArchive archive) + entries = mapMaybe (\f -> findEntryByPath f archive) relPaths + relElems = mapMaybe (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries + rels = mapMaybe relElemToRelationship $ concatMap elChildren relElems + in + rels + +data Body = Body [BodyPart] + deriving Show + +elemToBody :: NameSpaces -> Element -> Maybe Body +elemToBody ns element | qName (elName element) == "body" && qURI (elName element) == (lookup "w" ns) = + Just $ Body + $ mapMaybe (elemToBodyPart ns) $ elChildren element +elemToBody _ _ = Nothing + +elemToNumInfo :: NameSpaces -> Element -> Maybe (String, String) +elemToNumInfo ns element + | qName (elName element) == "p" && + qURI (elName element) == (lookup "w" ns) = + do + pPr <- findChild (QName "pPr" (lookup "w" ns) (Just "w")) element + numPr <- findChild (QName "numPr" (lookup "w" ns) (Just "w")) pPr + lvl <- findChild (QName "ilvl" (lookup "w" ns) (Just "w")) numPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")) + numId <- findChild (QName "numId" (lookup "w" ns) (Just "w")) numPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")) + return (numId, lvl) +elemToNumInfo _ _ = Nothing + +elemToBodyPart :: NameSpaces -> Element -> Maybe BodyPart +elemToBodyPart ns element + | qName (elName element) == "p" && + qURI (elName element) == (lookup "w" ns) = + let parstyle = elemToParagraphStyle ns element + parparts = mapMaybe (elemToParPart ns) + $ elChildren element + in + case elemToNumInfo ns element of + Just (numId, lvl) -> Just $ ListItem parstyle numId lvl parparts + Nothing -> Just $ Paragraph parstyle parparts + | qName (elName element) == "tbl" && + qURI (elName element) == (lookup "w" ns) = + let + caption = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element + >>= findChild (QName "tblCaption" (lookup "w" ns) (Just "w")) + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + grid = case + findChild (QName "tblGrid" (lookup "w" ns) (Just "w")) element + of + Just g -> elemToTblGrid ns g + Nothing -> [] + tblLook = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element + >>= findChild (QName "tblLook" (lookup "w" ns) (Just "w")) + >>= elemToTblLook ns + in + Just $ Tbl + (fromMaybe "" caption) + grid + (fromMaybe defaultTblLook tblLook) + (mapMaybe (elemToRow ns) (elChildren element)) + | otherwise = Nothing + +elemToTblLook :: NameSpaces -> Element -> Maybe TblLook +elemToTblLook ns element + | qName (elName element) == "tblLook" && + qURI (elName element) == (lookup "w" ns) = + let firstRow = findAttr (QName "firstRow" (lookup "w" ns) (Just "w")) element + val = findAttr (QName "val" (lookup "w" ns) (Just "w")) element + firstRowFmt = + case firstRow of + Just "1" -> True + Just _ -> False + Nothing -> case val of + Just bitMask -> testBitMask bitMask 0x020 + Nothing -> False + in + Just $ TblLook{firstRowFormatting = firstRowFmt} +elemToTblLook _ _ = Nothing + +testBitMask :: String -> Int -> Bool +testBitMask bitMaskS n = + case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of + [] -> False + ((n', _) : _) -> ((n' .|. n) /= 0) + +data ParagraphStyle = ParagraphStyle { pStyle :: [String] + , indent :: Maybe Integer + } + deriving Show + +defaultParagraphStyle :: ParagraphStyle +defaultParagraphStyle = ParagraphStyle { pStyle = [] + , indent = Nothing + } + +elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle +elemToParagraphStyle ns element = + case findChild (QName "pPr" (lookup "w" ns) (Just "w")) element of + Just pPr -> + ParagraphStyle + {pStyle = + mapMaybe + (findAttr (QName "val" (lookup "w" ns) (Just "w"))) + (findChildren (QName "pStyle" (lookup "w" ns) (Just "w")) pPr) + , indent = + findChild (QName "ind" (lookup "w" ns) (Just "w")) pPr >>= + findAttr (QName "left" (lookup "w" ns) (Just "w")) >>= + stringToInteger + } + Nothing -> defaultParagraphStyle + + +data BodyPart = Paragraph ParagraphStyle [ParPart] + | ListItem ParagraphStyle String String [ParPart] + | Tbl String TblGrid TblLook [Row] + + deriving Show + +type TblGrid = [Integer] + +data TblLook = TblLook {firstRowFormatting::Bool} + deriving Show + +defaultTblLook :: TblLook +defaultTblLook = TblLook{firstRowFormatting = False} + +stringToInteger :: String -> Maybe Integer +stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) + +elemToTblGrid :: NameSpaces -> Element -> TblGrid +elemToTblGrid ns element + | qName (elName element) == "tblGrid" && + qURI (elName element) == (lookup "w" ns) = + let + cols = findChildren (QName "gridCol" (lookup "w" ns) (Just "w")) element + in + mapMaybe (\e -> + findAttr (QName "val" (lookup "w" ns) (Just ("w"))) e + >>= stringToInteger + ) + cols +elemToTblGrid _ _ = [] + +data Row = Row [Cell] + deriving Show + + +elemToRow :: NameSpaces -> Element -> Maybe Row +elemToRow ns element + | qName (elName element) == "tr" && + qURI (elName element) == (lookup "w" ns) = + let + cells = findChildren (QName "tc" (lookup "w" ns) (Just "w")) element + in + Just $ Row (mapMaybe (elemToCell ns) cells) +elemToRow _ _ = Nothing + +data Cell = Cell [BodyPart] + deriving Show + +elemToCell :: NameSpaces -> Element -> Maybe Cell +elemToCell ns element + | qName (elName element) == "tc" && + qURI (elName element) == (lookup "w" ns) = + Just $ Cell (mapMaybe (elemToBodyPart ns) (elChildren element)) +elemToCell _ _ = Nothing + +data ParPart = PlainRun Run + | Insertion ChangeId Author ChangeDate [Run] + | Deletion ChangeId Author ChangeDate [Run] + | BookMark BookMarkId Anchor + | InternalHyperLink Anchor [Run] + | ExternalHyperLink RelId [Run] + | Drawing String + deriving Show + +data Run = Run RunStyle [RunElem] + | Footnote String + | Endnote String + deriving Show + +data RunElem = TextRun String | LnBrk | Tab + deriving Show + +data RunStyle = RunStyle { isBold :: Bool + , isItalic :: Bool + , isSmallCaps :: Bool + , isStrike :: Bool + , isSuperScript :: Bool + , isSubScript :: Bool + , underline :: Maybe String + , rStyle :: Maybe String } + deriving Show + +defaultRunStyle :: RunStyle +defaultRunStyle = RunStyle { isBold = False + , isItalic = False + , isSmallCaps = False + , isStrike = False + , isSuperScript = False + , isSubScript = False + , underline = Nothing + , rStyle = Nothing + } + +elemToRunStyle :: NameSpaces -> Element -> RunStyle +elemToRunStyle ns element = + case findChild (QName "rPr" (lookup "w" ns) (Just "w")) element of + Just rPr -> + RunStyle + { + isBold = isJust $ findChild (QName "b" (lookup "w" ns) (Just "w")) rPr + , isItalic = isJust $ findChild (QName "i" (lookup "w" ns) (Just "w")) rPr + , isSmallCaps = isJust $ findChild (QName "smallCaps" (lookup "w" ns) (Just "w")) rPr + , isStrike = isJust $ findChild (QName "strike" (lookup "w" ns) (Just "w")) rPr + , isSuperScript = + (Just "superscript" == + (findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")))) + , isSubScript = + (Just "subscript" == + (findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")))) + , underline = + findChild (QName "u" (lookup "w" ns) (Just "w")) rPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")) + , rStyle = + findChild (QName "rStyle" (lookup "w" ns) (Just "w")) rPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")) + } + Nothing -> defaultRunStyle + +elemToRun :: NameSpaces -> Element -> Maybe Run +elemToRun ns element + | qName (elName element) == "r" && + qURI (elName element) == (lookup "w" ns) = + case + findChild (QName "footnoteReference" (lookup "w" ns) (Just "w")) element >>= + findAttr (QName "id" (lookup "w" ns) (Just "w")) + of + Just s -> Just $ Footnote s + Nothing -> + case + findChild (QName "endnoteReference" (lookup "w" ns) (Just "w")) element >>= + findAttr (QName "id" (lookup "w" ns) (Just "w")) + of + Just s -> Just $ Endnote s + Nothing -> Just $ + Run (elemToRunStyle ns element) + (elemToRunElems ns element) +elemToRun _ _ = Nothing + +elemToRunElem :: NameSpaces -> Element -> Maybe RunElem +elemToRunElem ns element + | (qName (elName element) == "t" || qName (elName element) == "delText") && + qURI (elName element) == (lookup "w" ns) = + Just $ TextRun (strContent element) + | qName (elName element) == "br" && + qURI (elName element) == (lookup "w" ns) = + Just $ LnBrk + | qName (elName element) == "tab" && + qURI (elName element) == (lookup "w" ns) = + Just $ Tab + | otherwise = Nothing + + +elemToRunElems :: NameSpaces -> Element -> [RunElem] +elemToRunElems ns element + | qName (elName element) == "r" && + qURI (elName element) == (lookup "w" ns) = + mapMaybe (elemToRunElem ns) (elChildren element) + | otherwise = [] + +elemToDrawing :: NameSpaces -> Element -> Maybe ParPart +elemToDrawing ns element + | qName (elName element) == "drawing" && + qURI (elName element) == (lookup "w" ns) = + let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" + in + findElement (QName "blip" (Just a_ns) (Just "a")) element + >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) + >>= (\s -> Just $ Drawing s) +elemToDrawing _ _ = Nothing + + +elemToParPart :: NameSpaces -> Element -> Maybe ParPart +elemToParPart ns element + | qName (elName element) == "r" && + qURI (elName element) == (lookup "w" ns) = + case findChild (QName "drawing" (lookup "w" ns) (Just "w")) element of + Just drawingElem -> elemToDrawing ns drawingElem + Nothing -> do + r <- elemToRun ns element + return $ PlainRun r +elemToParPart ns element + | qName (elName element) == "ins" && + qURI (elName element) == (lookup "w" ns) = do + cId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element + cAuthor <- findAttr (QName "author" (lookup "w" ns) (Just "w")) element + cDate <- findAttr (QName "date" (lookup "w" ns) (Just "w")) element + let runs = mapMaybe (elemToRun ns) (elChildren element) + return $ Insertion cId cAuthor cDate runs +elemToParPart ns element + | qName (elName element) == "del" && + qURI (elName element) == (lookup "w" ns) = do + cId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element + cAuthor <- findAttr (QName "author" (lookup "w" ns) (Just "w")) element + cDate <- findAttr (QName "date" (lookup "w" ns) (Just "w")) element + let runs = mapMaybe (elemToRun ns) (elChildren element) + return $ Deletion cId cAuthor cDate runs +elemToParPart ns element + | qName (elName element) == "bookmarkStart" && + qURI (elName element) == (lookup "w" ns) = do + bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element + bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) element + return $ BookMark bmId bmName +elemToParPart ns element + | qName (elName element) == "hyperlink" && + qURI (elName element) == (lookup "w" ns) = + let runs = mapMaybe (elemToRun ns) + $ findChildren (QName "r" (lookup "w" ns) (Just "w")) element + in + case findAttr (QName "anchor" (lookup "w" ns) (Just "w")) element of + Just anchor -> + Just $ InternalHyperLink anchor runs + Nothing -> + case findAttr (QName "id" (lookup "r" ns) (Just "r")) element of + Just relId -> Just $ ExternalHyperLink relId runs + Nothing -> Nothing +elemToParPart _ _ = Nothing + +type Target = String +type Anchor = String +type BookMarkId = String +type RelId = String +type ChangeId = String +type Author = String +type ChangeDate = String diff --git a/src/Text/Pandoc/Readers/Docx/Reducible.hs b/src/Text/Pandoc/Readers/Docx/Reducible.hs new file mode 100644 index 000000000..8c105d1f1 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Reducible.hs @@ -0,0 +1,181 @@ +{-# LANGUAGE OverloadedStrings #-} + +{- +Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Reducible + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Typeclass for combining adjacent blocks and inlines correctly. +-} + + +module Text.Pandoc.Readers.Docx.Reducible ((<++>), + (<+++>), + Reducible, + Container(..), + container, + innards, + reduceList, + reduceListB, + rebuild) + where + +import Text.Pandoc.Builder +import Data.List ((\\), intersect) + +data Container a = Container ([a] -> a) | NullContainer + +instance (Eq a) => Eq (Container a) where + (Container x) == (Container y) = ((x []) == (y [])) + NullContainer == NullContainer = True + _ == _ = False + +instance (Show a) => Show (Container a) where + show (Container x) = "Container {" ++ + (reverse $ drop 3 $ reverse $ show $ x []) ++ + "}" + show (NullContainer) = "NullContainer" + +class Reducible a where + (<++>) :: a -> a -> [a] + container :: a -> Container a + innards :: a -> [a] + isSpace :: a -> Bool + +(<+++>) :: (Reducible a) => Many a -> Many a -> Many a +mr <+++> ms = fromList $ reduceList $ toList mr ++ toList ms + +reduceListB :: (Reducible a) => Many a -> Many a +reduceListB = fromList . reduceList . toList + +reduceList' :: (Reducible a) => [a] -> [a] -> [a] +reduceList' acc [] = acc +reduceList' [] (x:xs) = reduceList' [x] xs +reduceList' as (x:xs) = reduceList' (init as ++ (last as <++> x) ) xs + +reduceList :: (Reducible a) => [a] -> [a] +reduceList = reduceList' [] + +combineReducibles :: (Reducible a, Eq a) => a -> a -> [a] +combineReducibles r s = + let (conts, rs) = topLevelContainers r + (conts', ss) = topLevelContainers s + shared = conts `intersect` conts' + remaining = conts \\ shared + remaining' = conts' \\ shared + in + case null shared of + True -> case (not . null) rs && isSpace (last rs) of + True -> rebuild conts (init rs) ++ [last rs, s] + False -> [r,s] + False -> rebuild + shared $ + reduceList $ + (rebuild remaining rs) ++ (rebuild remaining' ss) + +instance Reducible Inline where + s1@(Span (id1, classes1, kvs1) ils1) <++> s2@(Span (id2, classes2, kvs2) ils2) = + let classes' = classes1 `intersect` classes2 + kvs' = kvs1 `intersect` kvs2 + classes1' = classes1 \\ classes' + kvs1' = kvs1 \\ kvs' + classes2' = classes2 \\ classes' + kvs2' = kvs2 \\ kvs' + in + case null classes' && null kvs' of + True -> [s1,s2] + False -> let attr' = ("", classes', kvs') + attr1' = (id1, classes1', kvs1') + attr2' = (id2, classes2', kvs2') + s1' = case null classes1' && null kvs1' of + True -> ils1 + False -> [Span attr1' ils1] + s2' = case null classes2' && null kvs2' of + True -> ils2 + False -> [Span attr2' ils2] + in + [Span attr' $ reduceList $ s1' ++ s2'] + + (Str x) <++> (Str y) = [Str (x++y)] + il <++> il' = combineReducibles il il' + + container (Emph _) = Container Emph + container (Strong _) = Container Strong + container (Strikeout _) = Container Strikeout + container (Subscript _) = Container Subscript + container (Superscript _) = Container Superscript + container (Quoted qt _) = Container $ Quoted qt + container (Cite cs _) = Container $ Cite cs + container (Span attr _) = Container $ Span attr + container _ = NullContainer + + innards (Emph ils) = ils + innards (Strong ils) = ils + innards (Strikeout ils) = ils + innards (Subscript ils) = ils + innards (Superscript ils) = ils + innards (Quoted _ ils) = ils + innards (Cite _ ils) = ils + innards (Span _ ils) = ils + innards _ = [] + + isSpace Space = True + isSpace _ = False + +instance Reducible Block where + (Div (ident, classes, kvs) blks) <++> blk | "list-item" `elem` classes = + [Div (ident, classes, kvs) (reduceList blks), blk] + + blk <++> blk' = combineReducibles blk blk' + + container (BlockQuote _) = Container BlockQuote + container (Div attr _) = Container $ Div attr + container _ = NullContainer + + innards (BlockQuote bs) = bs + innards (Div _ bs) = bs + innards _ = [] + + isSpace _ = False + + +topLevelContainers' :: (Reducible a) => [a] -> ([Container a], [a]) +topLevelContainers' (r : []) = case container r of + NullContainer -> ([], [r]) + _ -> + let (conts, inns) = topLevelContainers' (innards r) + in + ((container r) : conts, inns) +topLevelContainers' rs = ([], rs) + +topLevelContainers :: (Reducible a) => a -> ([Container a], [a]) +topLevelContainers il = topLevelContainers' [il] + +rebuild :: [Container a] -> [a] -> [a] +rebuild [] xs = xs +rebuild ((Container f) : cs) xs = rebuild cs $ [f xs] +rebuild (NullContainer : cs) xs = rebuild cs $ xs + + diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index f6657a4d1..552e8a251 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.HTML - Copyright : Copyright (C) 2006-2010 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -40,6 +40,7 @@ import Text.HTML.TagSoup import Text.HTML.TagSoup.Match import Text.Pandoc.Definition import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder (Blocks, Inlines, trimInlines) import Text.Pandoc.Shared import Text.Pandoc.Options import Text.Pandoc.Parsing @@ -47,7 +48,10 @@ import Data.Maybe ( fromMaybe, isJust ) import Data.List ( intercalate ) import Data.Char ( isDigit ) import Control.Monad ( liftM, guard, when, mzero ) -import Control.Applicative ( (<$>), (<$) ) +import Control.Applicative ( (<$>), (<$), (<*) ) +import Data.Monoid +import Text.Printf (printf) +import Debug.Trace (trace) isSpace :: Char -> Bool isSpace ' ' = True @@ -66,39 +70,56 @@ readHtml opts inp = where tags = canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } inp parseDoc = do - blocks <- (fixPlains False . concat) <$> manyTill block eof + blocks <- (fixPlains False) . mconcat <$> manyTill block eof meta <- stateMeta <$> getState - return $ Pandoc meta blocks + return $ Pandoc meta (B.toList blocks) type TagParser = Parser [Tag String] ParserState -pBody :: TagParser [Block] +pBody :: TagParser Blocks pBody = pInTags "body" block -pHead :: TagParser [Block] -pHead = pInTags "head" $ pTitle <|> ([] <$ pAnyTag) - where pTitle = pInTags "title" inline >>= setTitle . normalizeSpaces - setTitle t = [] <$ (updateState $ B.setMeta "title" (B.fromList t)) - -block :: TagParser [Block] -block = choice +pHead :: TagParser Blocks +pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag) + where pTitle = pInTags "title" inline >>= setTitle . trimInlines + setTitle t = mempty <$ (updateState $ B.setMeta "title" t) + pMetaTag = do + mt <- pSatisfy (~== TagOpen "meta" []) + let name = fromAttrib "name" mt + if null name + then return mempty + else do + let content = fromAttrib "content" mt + updateState $ B.setMeta name (B.text content) + return mempty + +block :: TagParser Blocks +block = do + tr <- getOption readerTrace + pos <- getPosition + res <- choice [ pPara , pHeader , pBlockQuote , pCodeBlock , pList , pHrule - , pSimpleTable + , pTable , pHead , pBody , pPlain + , pDiv , pRawHtmlBlock ] + when tr $ trace (printf "line %d: %s" (sourceLine pos) + (take 60 $ show $ B.toList res)) (return ()) + return res -pList :: TagParser [Block] + +pList :: TagParser Blocks pList = pBulletList <|> pOrderedList <|> pDefinitionList -pBulletList :: TagParser [Block] +pBulletList :: TagParser Blocks pBulletList = try $ do pSatisfy (~== TagOpen "ul" []) let nonItem = pSatisfy (\t -> @@ -108,9 +129,9 @@ pBulletList = try $ do -- treat it as a list item, though it's not valid xhtml... skipMany nonItem items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul") - return [BulletList $ map (fixPlains True) items] + return $ B.bulletList $ map (fixPlains True) items -pOrderedList :: TagParser [Block] +pOrderedList :: TagParser Blocks pOrderedList = try $ do TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" []) let (start, style) = (sta', sty') @@ -136,27 +157,27 @@ pOrderedList = try $ do -- treat it as a list item, though it's not valid xhtml... skipMany nonItem items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol") - return [OrderedList (start, style, DefaultDelim) $ map (fixPlains True) items] + return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items -pDefinitionList :: TagParser [Block] +pDefinitionList :: TagParser Blocks pDefinitionList = try $ do pSatisfy (~== TagOpen "dl" []) items <- manyTill pDefListItem (pCloses "dl") - return [DefinitionList items] + return $ B.definitionList items -pDefListItem :: TagParser ([Inline],[[Block]]) +pDefListItem :: TagParser (Inlines, [Blocks]) pDefListItem = try $ do let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) && not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl")) terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline) defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block) skipMany nonItem - let term = intercalate [LineBreak] terms + let term = foldl1 (\x y -> x <> B.linebreak <> y) terms return (term, map (fixPlains True) defs) -fixPlains :: Bool -> [Block] -> [Block] -fixPlains inList bs = if any isParaish bs - then map plainToPara bs +fixPlains :: Bool -> Blocks -> Blocks +fixPlains inList bs = if any isParaish bs' + then B.fromList $ map plainToPara bs' else bs where isParaish (Para _) = True isParaish (CodeBlock _ _) = True @@ -168,6 +189,7 @@ fixPlains inList bs = if any isParaish bs isParaish _ = False plainToPara (Plain xs) = Para xs plainToPara x = x + bs' = B.toList bs pRawTag :: TagParser String pRawTag = do @@ -177,13 +199,20 @@ pRawTag = do then return [] else return $ renderTags' [tag] -pRawHtmlBlock :: TagParser [Block] +pDiv :: TagParser Blocks +pDiv = try $ do + getOption readerParseRaw >>= guard + TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="div") (const True) + contents <- pInTags "div" block + return $ B.divWith (mkAttr attr) contents + +pRawHtmlBlock :: TagParser Blocks pRawHtmlBlock = do raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag parseRaw <- getOption readerParseRaw if parseRaw && not (null raw) - then return [RawBlock "html" raw] - else return [] + then return $ B.rawBlock "html" raw + else return mempty pHtmlBlock :: String -> TagParser String pHtmlBlock t = try $ do @@ -191,70 +220,96 @@ pHtmlBlock t = try $ do contents <- manyTill pAnyTag (pSatisfy (~== TagClose t)) return $ renderTags' $ [open] ++ contents ++ [TagClose t] -pHeader :: TagParser [Block] +pHeader :: TagParser Blocks pHeader = try $ do TagOpen tagtype attr <- pSatisfy $ tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) (const True) let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")] let level = read (drop 1 tagtype) - contents <- liftM concat $ manyTill inline (pCloses tagtype <|> eof) - let ident = maybe "" id $ lookup "id" attr + contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) + let ident = fromMaybe "" $ lookup "id" attr let classes = maybe [] words $ lookup "class" attr let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"] return $ if bodyTitle - then [] -- skip a representation of the title in the body - else [Header level (ident, classes, keyvals) $ - normalizeSpaces contents] + then mempty -- skip a representation of the title in the body + else B.headerWith (ident, classes, keyvals) level contents -pHrule :: TagParser [Block] +pHrule :: TagParser Blocks pHrule = do pSelfClosing (=="hr") (const True) - return [HorizontalRule] + return B.horizontalRule -pSimpleTable :: TagParser [Block] -pSimpleTable = try $ do +pTable :: TagParser Blocks +pTable = try $ do TagOpen _ _ <- pSatisfy (~== TagOpen "table" []) skipMany pBlank - caption <- option [] $ pInTags "caption" inline >>~ skipMany pBlank - skipMany $ (pInTags "col" block >> skipMany pBlank) <|> - (pInTags "colgroup" block >> skipMany pBlank) + caption <- option mempty $ pInTags "caption" inline >>~ skipMany pBlank + -- TODO actually read these and take width information from them + widths' <- pColgroup <|> many pCol head' <- option [] $ pOptInTag "thead" $ pInTags "tr" (pCell "th") skipMany pBlank rows <- pOptInTag "tbody" $ many1 $ try $ skipMany pBlank >> pInTags "tr" (pCell "td") skipMany pBlank TagClose _ <- pSatisfy (~== TagClose "table") - let cols = maximum $ map length rows - let aligns = replicate cols AlignLeft - let widths = replicate cols 0 - return [Table caption aligns widths head' rows] + let isSinglePlain x = case B.toList x of + [Plain _] -> True + _ -> False + let isSimple = all isSinglePlain $ concat (head':rows) + let cols = length $ if null head' then head rows else head' + -- fail if there are colspans or rowspans + guard $ all (\r -> length r == cols) rows + let aligns = replicate cols AlignDefault + let widths = if null widths' + then if isSimple + then replicate cols 0 + else replicate cols (1.0 / fromIntegral cols) + else widths' + return $ B.table caption (zip aligns widths) head' rows + +pCol :: TagParser Double +pCol = try $ do + TagOpen _ attribs <- pSatisfy (~== TagOpen "col" []) + skipMany pBlank + optional $ pSatisfy (~== TagClose "col") + skipMany pBlank + return $ case lookup "width" attribs of + Just x | not (null x) && last x == '%' -> + fromMaybe 0.0 $ safeRead ('0':'.':init x) + _ -> 0.0 + +pColgroup :: TagParser [Double] +pColgroup = try $ do + pSatisfy (~== TagOpen "colgroup" []) + skipMany pBlank + manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank -pCell :: String -> TagParser [TableCell] +pCell :: String -> TagParser [Blocks] pCell celltype = try $ do skipMany pBlank - res <- pInTags celltype pPlain + res <- pInTags celltype block skipMany pBlank return [res] -pBlockQuote :: TagParser [Block] +pBlockQuote :: TagParser Blocks pBlockQuote = do contents <- pInTags "blockquote" block - return [BlockQuote $ fixPlains False contents] + return $ B.blockQuote $ fixPlains False contents -pPlain :: TagParser [Block] +pPlain :: TagParser Blocks pPlain = do - contents <- liftM (normalizeSpaces . concat) $ many1 inline - if null contents - then return [] - else return [Plain contents] + contents <- trimInlines . mconcat <$> many1 inline + if B.isNull contents + then return mempty + else return $ B.plain contents -pPara :: TagParser [Block] +pPara :: TagParser Blocks pPara = do - contents <- pInTags "p" inline - return [Para $ normalizeSpaces contents] + contents <- trimInlines <$> pInTags "p" inline + return $ B.para contents -pCodeBlock :: TagParser [Block] +pCodeBlock :: TagParser Blocks pCodeBlock = try $ do TagOpen _ attr <- pSatisfy (~== TagOpen "pre" []) contents <- manyTill pAnyTag (pCloses "pre" <|> eof) @@ -267,13 +322,9 @@ pCodeBlock = try $ do let result = case reverse result' of '\n':_ -> init result' _ -> result' - let attribsId = fromMaybe "" $ lookup "id" attr - let attribsClasses = words $ fromMaybe "" $ lookup "class" attr - let attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr - let attribs = (attribsId, attribsClasses, attribsKV) - return [CodeBlock attribs result] + return $ B.codeBlockWith (mkAttr attr) result -inline :: TagParser [Inline] +inline :: TagParser Inlines inline = choice [ pTagText , pQ @@ -286,6 +337,7 @@ inline = choice , pLink , pImage , pCode + , pSpan , pRawHtmlInline ] @@ -312,7 +364,7 @@ pSelfClosing f g = do optional $ pSatisfy (tagClose f) return open -pQ :: TagParser [Inline] +pQ :: TagParser Inlines pQ = do quoteContext <- stateQuoteContext `fmap` getState let quoteType = case quoteContext of @@ -321,79 +373,84 @@ pQ = do let innerQuoteContext = if quoteType == SingleQuote then InSingleQuote else InDoubleQuote - withQuoteContext innerQuoteContext $ pInlinesInTags "q" (Quoted quoteType) + let constructor = case quoteType of + SingleQuote -> B.singleQuoted + DoubleQuote -> B.doubleQuoted + withQuoteContext innerQuoteContext $ + pInlinesInTags "q" constructor -pEmph :: TagParser [Inline] -pEmph = pInlinesInTags "em" Emph <|> pInlinesInTags "i" Emph +pEmph :: TagParser Inlines +pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph -pStrong :: TagParser [Inline] -pStrong = pInlinesInTags "strong" Strong <|> pInlinesInTags "b" Strong +pStrong :: TagParser Inlines +pStrong = pInlinesInTags "strong" B.strong <|> pInlinesInTags "b" B.strong -pSuperscript :: TagParser [Inline] -pSuperscript = pInlinesInTags "sup" Superscript +pSuperscript :: TagParser Inlines +pSuperscript = pInlinesInTags "sup" B.superscript -pSubscript :: TagParser [Inline] -pSubscript = pInlinesInTags "sub" Subscript +pSubscript :: TagParser Inlines +pSubscript = pInlinesInTags "sub" B.subscript -pStrikeout :: TagParser [Inline] +pStrikeout :: TagParser Inlines pStrikeout = do - pInlinesInTags "s" Strikeout <|> - pInlinesInTags "strike" Strikeout <|> - pInlinesInTags "del" Strikeout <|> + pInlinesInTags "s" B.strikeout <|> + pInlinesInTags "strike" B.strikeout <|> + pInlinesInTags "del" B.strikeout <|> try (do pSatisfy (~== TagOpen "span" [("class","strikeout")]) - contents <- liftM concat $ manyTill inline (pCloses "span") - return [Strikeout contents]) + contents <- mconcat <$> manyTill inline (pCloses "span") + return $ B.strikeout contents) -pLineBreak :: TagParser [Inline] +pLineBreak :: TagParser Inlines pLineBreak = do pSelfClosing (=="br") (const True) - return [LineBreak] + return B.linebreak -pLink :: TagParser [Inline] +pLink :: TagParser Inlines pLink = try $ do tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href")) let url = fromAttrib "href" tag let title = fromAttrib "title" tag - lab <- liftM concat $ manyTill inline (pCloses "a") - return [Link (normalizeSpaces lab) (escapeURI url, title)] + lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a") + return $ B.link (escapeURI url) title lab -pImage :: TagParser [Inline] +pImage :: TagParser Inlines pImage = do tag <- pSelfClosing (=="img") (isJust . lookup "src") let url = fromAttrib "src" tag let title = fromAttrib "title" tag let alt = fromAttrib "alt" tag - return [Image (B.toList $ B.text alt) (escapeURI url, title)] + return $ B.image (escapeURI url) title (B.text alt) -pCode :: TagParser [Inline] +pCode :: TagParser Inlines pCode = try $ do (TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) result <- manyTill pAnyTag (pCloses open) - let ident = fromMaybe "" $ lookup "id" attr - let classes = words $ fromMaybe [] $ lookup "class" attr - let rest = filter (\(x,_) -> x /= "id" && x /= "class") attr - return [Code (ident,classes,rest) - $ intercalate " " $ lines $ innerText result] + return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ innerText result -pRawHtmlInline :: TagParser [Inline] +pSpan :: TagParser Inlines +pSpan = try $ do + getOption readerParseRaw >>= guard + TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) + contents <- pInTags "span" inline + return $ B.spanWith (mkAttr attr) contents + +pRawHtmlInline :: TagParser Inlines pRawHtmlInline = do result <- pSatisfy (tagComment (const True)) <|> pSatisfy isInlineTag parseRaw <- getOption readerParseRaw if parseRaw - then return [RawInline "html" $ renderTags' [result]] - else return [] + then return $ B.rawInline "html" $ renderTags' [result] + else return mempty -pInlinesInTags :: String -> ([Inline] -> Inline) - -> TagParser [Inline] -pInlinesInTags tagtype f = do - contents <- pInTags tagtype inline - return [f $ normalizeSpaces contents] +pInlinesInTags :: String -> (Inlines -> Inlines) + -> TagParser Inlines +pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline -pInTags :: String -> TagParser [a] - -> TagParser [a] +pInTags :: (Monoid a) => String -> TagParser a + -> TagParser a pInTags tagtype parser = try $ do pSatisfy (~== TagOpen tagtype []) - liftM concat $ manyTill parser (pCloses tagtype <|> eof) + mconcat <$> manyTill parser (pCloses tagtype <|> eof) pOptInTag :: String -> TagParser a -> TagParser a @@ -409,56 +466,65 @@ pCloses :: String -> TagParser () pCloses tagtype = try $ do t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag case t of - (TagClose t') | t' == tagtype -> pAnyTag >> return () + (TagClose t') | t' == tagtype -> pAnyTag >> return () (TagOpen t' _) | t' `closes` tagtype -> return () (TagClose "ul") | tagtype == "li" -> return () (TagClose "ol") | tagtype == "li" -> return () (TagClose "dl") | tagtype == "li" -> return () + (TagClose "table") | tagtype == "td" -> return () + (TagClose "table") | tagtype == "tr" -> return () _ -> mzero -pTagText :: TagParser [Inline] +pTagText :: TagParser Inlines pTagText = try $ do (TagText str) <- pSatisfy isTagText st <- getState case runParser (many pTagContents) st "text" str of Left _ -> fail $ "Could not parse `" ++ str ++ "'" - Right result -> return result + Right result -> return $ mconcat result pBlank :: TagParser () pBlank = try $ do (TagText str) <- pSatisfy isTagText guard $ all isSpace str -pTagContents :: Parser [Char] ParserState Inline +pTagContents :: Parser [Char] ParserState Inlines pTagContents = - pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad - -pStr :: Parser [Char] ParserState Inline + B.displayMath <$> mathDisplay + <|> B.math <$> mathInline + <|> pStr + <|> pSpace + <|> smartPunctuation pTagContents + <|> pSymbol + <|> pBad + +pStr :: Parser [Char] ParserState Inlines pStr = do result <- many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) && not (isBad c) pos <- getPosition updateState $ \s -> s{ stateLastStrPos = Just pos } - return $ Str result + return $ B.str result isSpecial :: Char -> Bool isSpecial '"' = True isSpecial '\'' = True isSpecial '.' = True isSpecial '-' = True +isSpecial '$' = True isSpecial '\8216' = True isSpecial '\8217' = True isSpecial '\8220' = True isSpecial '\8221' = True isSpecial _ = False -pSymbol :: Parser [Char] ParserState Inline -pSymbol = satisfy isSpecial >>= return . Str . (:[]) +pSymbol :: Parser [Char] ParserState Inlines +pSymbol = satisfy isSpecial >>= return . B.str . (:[]) isBad :: Char -> Bool isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML -pBad :: Parser [Char] ParserState Inline +pBad :: Parser [Char] ParserState Inlines pBad = do c <- satisfy isBad let c' = case c of @@ -490,10 +556,10 @@ pBad = do '\158' -> '\382' '\159' -> '\376' _ -> '?' - return $ Str [c'] + return $ B.str [c'] -pSpace :: Parser [Char] ParserState Inline -pSpace = many1 (satisfy isSpace) >> return Space +pSpace :: Parser [Char] ParserState Inlines +pSpace = many1 (satisfy isSpace) >> return B.space -- -- Constants @@ -521,7 +587,7 @@ blockHtmlTags = ["address", "article", "aside", "blockquote", "body", "button", "noframes", "noscript", "object", "ol", "output", "p", "pre", "progress", "section", "table", "tbody", "textarea", "thead", "tfoot", "ul", "dd", "dt", "frameset", "li", "tbody", "td", "tfoot", - "th", "thead", "tr", "script", "style", "video"] + "th", "thead", "tr", "script", "style", "svg", "video"] -- We want to allow raw docbook in markdown documents, so we -- include docbook block tags here too. @@ -543,15 +609,19 @@ blockTags :: [String] blockTags = blockHtmlTags ++ blockDocBookTags isInlineTag :: Tag String -> Bool -isInlineTag t = tagOpen (`notElem` blockTags) (const True) t || - tagClose (`notElem` blockTags) t || +isInlineTag t = tagOpen isInlineTagName (const True) t || + tagClose isInlineTagName t || tagComment (const True) t + where isInlineTagName x = x `notElem` blockTags isBlockTag :: Tag String -> Bool -isBlockTag t = tagOpen (`elem` blocktags) (const True) t || - tagClose (`elem` blocktags) t || +isBlockTag t = tagOpen isBlockTagName (const True) t || + tagClose isBlockTagName t || tagComment (const True) t - where blocktags = blockTags ++ eitherBlockOrInline + where isBlockTagName ('?':_) = True + isBlockTagName ('!':_) = True + isBlockTagName x = x `elem` blockTags + || x `elem` eitherBlockOrInline isTextTag :: Tag String -> Bool isTextTag = tagText (const True) @@ -560,7 +630,7 @@ isCommentTag :: Tag String -> Bool isCommentTag = tagComment (const True) -- taken from HXT and extended - +-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags closes :: String -> String -> Bool _ `closes` "body" = False _ `closes` "html" = False @@ -568,11 +638,18 @@ _ `closes` "html" = False "li" `closes` "li" = True "th" `closes` t | t `elem` ["th","td"] = True "tr" `closes` t | t `elem` ["th","td","tr"] = True +"dd" `closes` t | t `elem` ["dt", "dd"] = True "dt" `closes` t | t `elem` ["dt","dd"] = True -"hr" `closes` "p" = True -"p" `closes` "p" = True +"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True +"optgroup" `closes` "optgroup" = True +"optgroup" `closes` "option" = True +"option" `closes` "option" = True +-- http://www.w3.org/TR/html-markup/p.html +x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote", + "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "header", "hr", "menu", "nav", "ol", "p", "pre", "section", + "table", "ul"] = True "meta" `closes` "meta" = True -"colgroup" `closes` "colgroup" = True "form" `closes` "form" = True "label" `closes` "label" = True "map" `closes` "map" = True @@ -620,3 +697,11 @@ htmlTag f = try $ do _ -> do rendered <- manyTill anyChar (char '>') return (next, rendered ++ ">") + +mkAttr :: [(String, String)] -> Attr +mkAttr attr = (attribsId, attribsClasses, attribsKV) + where attribsId = fromMaybe "" $ lookup "id" attr + attribsClasses = words $ fromMaybe "" $ lookup "class" attr + attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr + + diff --git a/src/Text/Pandoc/Readers/Haddock.hs b/src/Text/Pandoc/Readers/Haddock.hs index 0e74406ef..4b46c869d 100644 --- a/src/Text/Pandoc/Readers/Haddock.hs +++ b/src/Text/Pandoc/Readers/Haddock.hs @@ -3,7 +3,8 @@ Copyright : Copyright (C) 2013 David Lazar License : GNU GPL, version 2 or above - Maintainer : David Lazar <lazar6@illinois.edu> + Maintainer : David Lazar <lazar6@illinois.edu>, + John MacFarlane <jgm@berkeley.edu> Stability : alpha Conversion of Haddock markup to 'Pandoc' document. @@ -12,30 +13,126 @@ module Text.Pandoc.Readers.Haddock ( readHaddock ) where -import Text.Pandoc.Builder +import Text.Pandoc.Builder (Blocks, Inlines) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Shared (trim, splitBy) +import Data.Monoid +import Data.List (intersperse, stripPrefix) +import Data.Maybe (fromMaybe) +import Text.Pandoc.Definition import Text.Pandoc.Options -import Text.Pandoc.Readers.Haddock.Lex -import Text.Pandoc.Readers.Haddock.Parse +import Documentation.Haddock.Parser +import Documentation.Haddock.Types +import Debug.Trace (trace) -- | Parse Haddock markup and return a 'Pandoc' document. readHaddock :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse -> Pandoc -readHaddock _ s = Pandoc nullMeta blocks +readHaddock opts = B.doc . docHToBlocks . trace' . parseParas + where trace' x = if readerTrace opts + then trace (show x) x + else x + +docHToBlocks :: DocH String Identifier -> Blocks +docHToBlocks d' = + case d' of + DocEmpty -> mempty + DocAppend (DocParagraph (DocHeader h)) (DocParagraph (DocAName ident)) -> + B.headerWith (ident,[],[]) (headerLevel h) + (docHToInlines False $ headerTitle h) + DocAppend d1 d2 -> mappend (docHToBlocks d1) (docHToBlocks d2) + DocString _ -> inlineFallback + DocParagraph (DocAName h) -> B.plain $ docHToInlines False $ DocAName h + DocParagraph x -> B.para $ docHToInlines False x + DocIdentifier _ -> inlineFallback + DocIdentifierUnchecked _ -> inlineFallback + DocModule s -> B.plain $ docHToInlines False $ DocModule s + DocWarning _ -> mempty -- TODO + DocEmphasis _ -> inlineFallback + DocMonospaced _ -> inlineFallback + DocBold _ -> inlineFallback + DocHeader h -> B.header (headerLevel h) + (docHToInlines False $ headerTitle h) + DocUnorderedList items -> B.bulletList (map docHToBlocks items) + DocOrderedList items -> B.orderedList (map docHToBlocks items) + DocDefList items -> B.definitionList (map (\(d,t) -> + (docHToInlines False d, + [consolidatePlains $ docHToBlocks t])) items) + DocCodeBlock (DocString s) -> B.codeBlockWith ("",[],[]) s + DocCodeBlock d -> B.para $ docHToInlines True d + DocHyperlink _ -> inlineFallback + DocPic _ -> inlineFallback + DocAName _ -> inlineFallback + DocProperty s -> B.codeBlockWith ("",["property","haskell"],[]) (trim s) + DocExamples es -> mconcat $ map (\e -> + makeExample ">>>" (exampleExpression e) (exampleResult e)) es + + where inlineFallback = B.plain $ docHToInlines False d' + consolidatePlains = B.fromList . consolidatePlains' . B.toList + consolidatePlains' zs@(Plain _ : _) = + let (xs, ys) = span isPlain zs in + Para (concatMap extractContents xs) : consolidatePlains' ys + consolidatePlains' (x : xs) = x : consolidatePlains' xs + consolidatePlains' [] = [] + isPlain (Plain _) = True + isPlain _ = False + extractContents (Plain xs) = xs + extractContents _ = [] + +docHToInlines :: Bool -> DocH String Identifier -> Inlines +docHToInlines isCode d' = + case d' of + DocEmpty -> mempty + DocAppend d1 d2 -> mappend (docHToInlines isCode d1) + (docHToInlines isCode d2) + DocString s + | isCode -> mconcat $ intersperse B.linebreak + $ map B.code $ splitBy (=='\n') s + | otherwise -> B.text s + DocParagraph _ -> mempty + DocIdentifier (_,s,_) -> B.codeWith ("",["haskell","identifier"],[]) s + DocIdentifierUnchecked s -> B.codeWith ("",["haskell","identifier"],[]) s + DocModule s -> B.codeWith ("",["haskell","module"],[]) s + DocWarning _ -> mempty -- TODO + DocEmphasis d -> B.emph (docHToInlines isCode d) + DocMonospaced (DocString s) -> B.code s + DocMonospaced d -> docHToInlines True d + DocBold d -> B.strong (docHToInlines isCode d) + DocHeader _ -> mempty + DocUnorderedList _ -> mempty + DocOrderedList _ -> mempty + DocDefList _ -> mempty + DocCodeBlock _ -> mempty + DocHyperlink h -> B.link (hyperlinkUrl h) (hyperlinkUrl h) + (maybe (B.text $ hyperlinkUrl h) B.text $ hyperlinkLabel h) + DocPic p -> B.image (pictureUri p) (fromMaybe (pictureUri p) $ pictureTitle p) + (maybe mempty B.text $ pictureTitle p) + DocAName s -> B.spanWith (s,["anchor"],[]) mempty + DocProperty _ -> mempty + DocExamples _ -> mempty + +-- | Create an 'Example', stripping superfluous characters as appropriate +makeExample :: String -> String -> [String] -> Blocks +makeExample prompt expression result = + B.para $ B.codeWith ("",["prompt"],[]) prompt + <> B.space + <> B.codeWith ([], ["haskell","expr"], []) (trim expression) + <> B.linebreak + <> (mconcat $ intersperse B.linebreak $ map coder result') where - blocks = case parseParas (tokenise s (0,0)) of - Left [] -> error "parse failure" - Left (tok:_) -> error $ "parse failure " ++ pos (tokenPos tok) - where pos (l, c) = "(line " ++ show l ++ ", column " ++ show c ++ ")" - Right x -> mergeLists (toList x) - --- similar to 'docAppend' in Haddock.Doc -mergeLists :: [Block] -> [Block] -mergeLists (BulletList xs : BulletList ys : blocks) - = mergeLists (BulletList (xs ++ ys) : blocks) -mergeLists (OrderedList _ xs : OrderedList a ys : blocks) - = mergeLists (OrderedList a (xs ++ ys) : blocks) -mergeLists (DefinitionList xs : DefinitionList ys : blocks) - = mergeLists (DefinitionList (xs ++ ys) : blocks) -mergeLists (x : blocks) = x : mergeLists blocks -mergeLists [] = [] + -- 1. drop trailing whitespace from the prompt, remember the prefix + prefix = takeWhile (`elem` " \t") prompt + + -- 2. drop, if possible, the exact same sequence of whitespace + -- characters from each result line + -- + -- 3. interpret lines that only contain the string "<BLANKLINE>" as an + -- empty line + result' = map (substituteBlankLine . tryStripPrefix prefix) result + where + tryStripPrefix xs ys = fromMaybe ys $ stripPrefix xs ys + + substituteBlankLine "<BLANKLINE>" = "" + substituteBlankLine line = line + coder = B.codeWith ([], ["result"], []) diff --git a/src/Text/Pandoc/Readers/Haddock/Lex.x b/src/Text/Pandoc/Readers/Haddock/Lex.x deleted file mode 100644 index 120e96ebf..000000000 --- a/src/Text/Pandoc/Readers/Haddock/Lex.x +++ /dev/null @@ -1,171 +0,0 @@ --- --- Haddock - A Haskell Documentation Tool --- --- (c) Simon Marlow 2002 --- --- This file was modified and integrated into GHC by David Waern 2006. --- Then moved back into Haddock by Isaac Dupree in 2009 :-) --- Then copied into Pandoc by David Lazar in 2013 :-D - -{ -{-# LANGUAGE BangPatterns #-} -- Generated by Alex -{-# OPTIONS -Wwarn -w #-} --- The above warning supression flag is a temporary kludge. --- While working on this module you are encouraged to remove it and fix --- any warnings in the module. See --- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings --- for details - -module Text.Pandoc.Readers.Haddock.Lex ( - Token(..), - LToken, - tokenise, - tokenPos - ) where - -import Data.Char -import Numeric (readHex) -} - -%wrapper "posn" - -$ws = $white # \n -$digit = [0-9] -$hexdigit = [0-9a-fA-F] -$special = [\"\@] -$alphanum = [A-Za-z0-9] -$ident = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~\:] - -:- - --- beginning of a paragraph -<0,para> { - $ws* \n ; - $ws* \> { begin birdtrack } - $ws* prop \> .* \n { strtoken TokProperty `andBegin` property} - $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr } - $ws* [\*\-] { token TokBullet `andBegin` string } - $ws* \[ { token TokDefStart `andBegin` def } - $ws* \( $digit+ \) { token TokNumber `andBegin` string } - $ws* $digit+ \. { token TokNumber `andBegin` string } - $ws* { begin string } -} - --- beginning of a line -<line> { - $ws* \> { begin birdtrack } - $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr } - - $ws* \n { token TokPara `andBegin` para } - -- ^ Here, we really want to be able to say - -- $ws* (\n | <eof>) { token TokPara `andBegin` para} - -- because otherwise a trailing line of whitespace will result in - -- a spurious TokString at the end of a docstring. We don't have <eof>, - -- though (NOW I realise what it was for :-). To get around this, we always - -- append \n to the end of a docstring. - - () { begin string } -} - -<birdtrack> .* \n? { strtokenNL TokBirdTrack `andBegin` line } - -<property> () { token TokPara `andBegin` para } - -<example> { - $ws* \n { token TokPara `andBegin` para } - $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr } - () { begin exampleresult } -} - -<exampleexpr> .* \n { strtokenNL TokExampleExpression `andBegin` example } - -<exampleresult> .* \n { strtokenNL TokExampleResult `andBegin` example } - -<string,def> { - $special { strtoken $ \s -> TokSpecial (head s) } - \<\< [^\>]* \>\> { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) } - \< [^\>]* \> { strtoken $ \s -> TokURL (init (tail s)) } - \# [^\#]* \# { strtoken $ \s -> TokAName (init (tail s)) } - \/ [^\/]* \/ { strtoken $ \s -> TokEmphasis (init (tail s)) } - [\'\`] $ident+ [\'\`] { strtoken $ \s -> TokIdent (init (tail s)) } - \\ . { strtoken (TokString . tail) } - "&#" $digit+ \; { strtoken $ \s -> TokString [chr (read (init (drop 2 s)))] } - "&#" [xX] $hexdigit+ \; - { strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] -> TokString [chr n] } - -- allow special characters through if they don't fit one of the previous - -- patterns. - [\/\'\`\<\#\&\\] { strtoken TokString } - [^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line } - [^ $special \/ \< \# \n \'\` \& \\ \]]+ { strtoken TokString } -} - -<def> { - \] { token TokDefEnd `andBegin` string } -} - --- ']' doesn't have any special meaning outside of the [...] at the beginning --- of a definition paragraph. -<string> { - \] { strtoken TokString } -} - -{ --- | A located token -type LToken = (Token, AlexPosn) - -data Token - = TokPara - | TokNumber - | TokBullet - | TokDefStart - | TokDefEnd - | TokSpecial Char - | TokIdent String - | TokString String - | TokURL String - | TokPic String - | TokEmphasis String - | TokAName String - | TokBirdTrack String - | TokProperty String - | TokExamplePrompt String - | TokExampleExpression String - | TokExampleResult String - deriving Show - -tokenPos :: LToken -> (Int, Int) -tokenPos t = let AlexPn _ line col = snd t in (line, col) - -type StartCode = Int -type Action = AlexPosn -> String -> StartCode -> (StartCode -> [LToken]) -> [LToken] - -tokenise :: String -> (Int, Int) -> [LToken] -tokenise str (line, col) = go (posn,'\n',[],eofHack str) para - where posn = AlexPn 0 line col - go inp@(pos,_,_,str) sc = - case alexScan inp sc of - AlexEOF -> [] - AlexError _ -> [] - AlexSkip inp' len -> go inp' sc - AlexToken inp' len act -> act pos (take len str) sc (\sc -> go inp' sc) - --- NB. we add a final \n to the string, (see comment in the beginning of line --- production above). -eofHack str = str++"\n" - -andBegin :: Action -> StartCode -> Action -andBegin act new_sc = \pos str _ cont -> act pos str new_sc cont - -token :: Token -> Action -token t = \pos _ sc cont -> (t, pos) : cont sc - -strtoken, strtokenNL :: (String -> Token) -> Action -strtoken t = \pos str sc cont -> (t str, pos) : cont sc -strtokenNL t = \pos str sc cont -> (t (filter (/= '\r') str), pos) : cont sc --- ^ We only want LF line endings in our internal doc string format, so we --- filter out all CRs. - -begin :: StartCode -> Action -begin sc = \_ _ _ cont -> cont sc - -} diff --git a/src/Text/Pandoc/Readers/Haddock/Parse.y b/src/Text/Pandoc/Readers/Haddock/Parse.y deleted file mode 100644 index 9c2bbc8a9..000000000 --- a/src/Text/Pandoc/Readers/Haddock/Parse.y +++ /dev/null @@ -1,178 +0,0 @@ --- This code was copied from the 'haddock' package, modified, and integrated --- into Pandoc by David Lazar. -{ -{-# LANGUAGE BangPatterns #-} -- required for versions of Happy before 1.18.6 --- The above warning supression flag is a temporary kludge. --- While working on this module you are encouraged to remove it and fix --- any warnings in the module. See --- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings --- for details - -module Text.Pandoc.Readers.Haddock.Parse (parseString, parseParas) where - -import Text.Pandoc.Readers.Haddock.Lex -import Text.Pandoc.Builder -import Text.Pandoc.Shared (trim, trimr) -import Data.Generics (everywhere, mkT) -import Data.Char (isSpace) -import Data.Maybe (fromMaybe) -import Data.List (stripPrefix, intersperse) -import Data.Monoid (mempty, mconcat) -} - -%expect 0 - -%tokentype { LToken } - -%token - '/' { (TokSpecial '/',_) } - '@' { (TokSpecial '@',_) } - '[' { (TokDefStart,_) } - ']' { (TokDefEnd,_) } - DQUO { (TokSpecial '\"',_) } - URL { (TokURL $$,_) } - PIC { (TokPic $$,_) } - ANAME { (TokAName $$,_) } - '/../' { (TokEmphasis $$,_) } - '-' { (TokBullet,_) } - '(n)' { (TokNumber,_) } - '>..' { (TokBirdTrack $$,_) } - PROP { (TokProperty $$,_) } - PROMPT { (TokExamplePrompt $$,_) } - RESULT { (TokExampleResult $$,_) } - EXP { (TokExampleExpression $$,_) } - IDENT { (TokIdent $$,_) } - PARA { (TokPara,_) } - STRING { (TokString $$,_) } - -%monad { Either [LToken] } - -%name parseParas doc -%name parseString seq - -%% - -doc :: { Blocks } - : apara PARA doc { $1 <> $3 } - | PARA doc { $2 } - | apara { $1 } - | {- empty -} { mempty } - -apara :: { Blocks } - : ulpara { bulletList [$1] } - | olpara { orderedList [$1] } - | defpara { definitionList [$1] } - | para { $1 } - -ulpara :: { Blocks } - : '-' para { $2 } - -olpara :: { Blocks } - : '(n)' para { $2 } - -defpara :: { (Inlines, [Blocks]) } - : '[' seq ']' seq { (trimInlines $2, [plain $ trimInlines $4]) } - -para :: { Blocks } - : seq { para' $1 } - | codepara { codeBlockWith ([], ["haskell"], []) $1 } - | property { $1 } - | examples { $1 } - -codepara :: { String } - : '>..' codepara { $1 ++ $2 } - | '>..' { $1 } - -property :: { Blocks } - : PROP { makeProperty $1 } - -examples :: { Blocks } - : example examples { $1 <> $2 } - | example { $1 } - -example :: { Blocks } - : PROMPT EXP result { makeExample $1 $2 (lines $3) } - | PROMPT EXP { makeExample $1 $2 [] } - -result :: { String } - : RESULT result { $1 ++ $2 } - | RESULT { $1 } - -seq :: { Inlines } - : elem seq { $1 <> $2 } - | elem { $1 } - -elem :: { Inlines } - : elem1 { $1 } - | '@' seq1 '@' { monospace $2 } - -seq1 :: { Inlines } - : PARA seq1 { linebreak <> $2 } - | elem1 seq1 { $1 <> $2 } - | elem1 { $1 } - -elem1 :: { Inlines } - : STRING { text $1 } - | '/../' { emph (str $1) } - | URL { makeHyperlink $1 } - | PIC { image $1 $1 mempty } - | ANAME { mempty } -- TODO - | IDENT { codeWith ([], ["haskell"], []) $1 } - | DQUO strings DQUO { codeWith ([], ["haskell"], []) $2 } - -strings :: { String } - : STRING { $1 } - | STRING strings { $1 ++ $2 } - -{ -happyError :: [LToken] -> Either [LToken] a -happyError toks = Left toks - -para' :: Inlines -> Blocks -para' = para . trimInlines - -monospace :: Inlines -> Inlines -monospace = everywhere (mkT go) - where - go (Str s) = Code nullAttr s - go x = x - --- | Create a `Hyperlink` from given string. --- --- A hyperlink consists of a URL and an optional label. The label is separated --- from the url by one or more whitespace characters. -makeHyperlink :: String -> Inlines -makeHyperlink input = case break isSpace $ trim input of - (url, "") -> link url url (str url) - (url, lb) -> link url url (trimInlines $ text lb) - -makeProperty :: String -> Blocks -makeProperty s = case trim s of - 'p':'r':'o':'p':'>':xs -> - codeBlockWith ([], ["property"], []) (dropWhile isSpace xs) - xs -> - error $ "makeProperty: invalid input " ++ show xs - --- | Create an 'Example', stripping superfluous characters as appropriate -makeExample :: String -> String -> [String] -> Blocks -makeExample prompt expression result = - para $ codeWith ([], ["haskell","expr"], []) (trim expression) - <> linebreak - <> (mconcat $ intersperse linebreak $ map coder result') - where - -- 1. drop trailing whitespace from the prompt, remember the prefix - prefix = takeWhile isSpace prompt - - -- 2. drop, if possible, the exact same sequence of whitespace - -- characters from each result line - -- - -- 3. interpret lines that only contain the string "<BLANKLINE>" as an - -- empty line - result' = map (substituteBlankLine . tryStripPrefix prefix) result - where - tryStripPrefix xs ys = fromMaybe ys $ stripPrefix xs ys - - substituteBlankLine "<BLANKLINE>" = "" - substituteBlankLine line = line - coder = codeWith ([], ["result"], []) -} diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 1a22f2ad2..97bfaa455 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -1,6 +1,6 @@ -{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE ScopedTypeVariables, OverloadedStrings #-} {- -Copyright (C) 2006-2012 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.LaTeX - Copyright : Copyright (C) 2006-2012 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -31,22 +31,26 @@ Conversion of LaTeX to 'Pandoc' document. module Text.Pandoc.Readers.LaTeX ( readLaTeX, rawLaTeXInline, rawLaTeXBlock, + inlineCommand, handleIncludes ) where import Text.Pandoc.Definition -import Text.Pandoc.Generic +import Text.Pandoc.Walk import Text.Pandoc.Shared import Text.Pandoc.Options -import Text.Pandoc.Biblio (processBiblio) -import Text.Pandoc.Parsing hiding ((<|>), many, optional, space) +import Text.Pandoc.Parsing hiding ((<|>), many, optional, space, + mathDisplay, mathInline) +import Text.Parsec.Prim (ParsecT, runParserT) import qualified Text.Pandoc.UTF8 as UTF8 import Data.Char ( chr, ord ) +import Control.Monad.Trans (lift) import Control.Monad import Text.Pandoc.Builder -import Data.Char (isLetter, isPunctuation, isSpace) +import Data.Char (isLetter, isAlphaNum) import Control.Applicative import Data.Monoid +import Data.Maybe (fromMaybe) import System.Environment (getEnv) import System.FilePath (replaceExtension, (</>)) import Data.List (intercalate, intersperse) @@ -67,9 +71,7 @@ parseLaTeX = do eof st <- getState let meta = stateMeta st - refs <- getOption readerReferences - mbsty <- getOption readerCitationStyle - let (Pandoc _ bs') = processBiblio mbsty refs $ doc bs + let (Pandoc _ bs') = doc bs return $ Pandoc meta bs' type LP = Parser [Char] ParserState @@ -124,7 +126,7 @@ comment :: LP () comment = do char '%' skipMany (satisfy (/='\n')) - newline + optional newline return () bgroup :: LP () @@ -165,28 +167,40 @@ mathChars = concat <$> <|> (\c -> ['\\',c]) <$> (try $ char '\\' *> anyChar) ) +quoted' :: (Inlines -> Inlines) -> LP String -> LP () -> LP Inlines +quoted' f starter ender = do + startchs <- starter + try ((f . mconcat) <$> manyTill inline ender) <|> lit startchs + double_quote :: LP Inlines -double_quote = (doubleQuoted . mconcat) <$> - (try $ string "``" *> manyTill inline (try $ string "''")) +double_quote = + ( quoted' doubleQuoted (try $ string "``") (void $ try $ string "''") + <|> quoted' doubleQuoted (string "“") (void $ char '”') + -- the following is used by babel for localized quotes: + <|> quoted' doubleQuoted (try $ string "\"`") (void $ try $ string "\"'") + <|> quoted' doubleQuoted (string "\"") (void $ char '"') + ) single_quote :: LP Inlines -single_quote = (singleQuoted . mconcat) <$> - (try $ char '`' *> manyTill inline (try $ char '\'' >> notFollowedBy letter)) +single_quote = + ( quoted' singleQuoted (string "`") (try $ char '\'' >> notFollowedBy letter) + <|> quoted' singleQuoted (string "‘") (try $ char '’' >> notFollowedBy letter) + ) inline :: LP Inlines inline = (mempty <$ comment) <|> (space <$ sp) <|> inlineText <|> inlineCommand - <|> grouped inline + <|> inlineGroup <|> (char '-' *> option (str "-") ((char '-') *> option (str "–") (str "—" <$ char '-'))) <|> double_quote <|> single_quote - <|> (str "“" <$ try (string "``")) -- nb. {``} won't be caught by double_quote <|> (str "”" <$ try (string "''")) - <|> (str "‘" <$ char '`') -- nb. {`} won't be caught by single_quote + <|> (str "”" <$ char '”') <|> (str "’" <$ char '\'') + <|> (str "’" <$ char '’') <|> (str "\160" <$ char '~') <|> (mathDisplay $ string "$$" *> mathChars <* string "$$") <|> (mathInline $ char '$' *> mathChars <* char '$') @@ -201,6 +215,15 @@ inline = (mempty <$ comment) inlines :: LP Inlines inlines = mconcat <$> many (notFollowedBy (char '}') *> inline) +inlineGroup :: LP Inlines +inlineGroup = do + ils <- grouped inline + if isNull ils + then return mempty + else return $ spanWith nullAttr ils + -- we need the span so we can detitlecase bibtex entries; + -- we need to know when something is {C}apitalized + block :: LP Blocks block = (mempty <$ comment) <|> (mempty <$ ((spaceChar <|> newline) *> spaces)) @@ -282,6 +305,13 @@ blockCommands = M.fromList $ , ("item", skipopts *> loose_item) , ("documentclass", skipopts *> braced *> preamble) , ("centerline", (para . trimInlines) <$> (skipopts *> tok)) + , ("caption", skipopts *> tok >>= setCaption) + , ("PandocStartInclude", startInclude) + , ("PandocEndInclude", endInclude) + , ("bibliography", mempty <$ (skipopts *> braced >>= + addMeta "bibliography" . splitBibs)) + , ("addbibresource", mempty <$ (skipopts *> braced >>= + addMeta "bibliography" . splitBibs)) ] ++ map ignoreBlocks -- these commands will be ignored unless --parse-raw is specified, -- in which case they will appear as raw latex blocks @@ -289,7 +319,7 @@ blockCommands = M.fromList $ -- newcommand, etc. should be parsed by macro, but we need this -- here so these aren't parsed as inline commands to ignore , "special", "pdfannot", "pdfstringdef" - , "bibliography", "bibliographystyle" + , "bibliographystyle" , "maketitle", "makeindex", "makeglossary" , "addcontentsline", "addtocontents", "addtocounter" -- \ignore{} is used conventionally in literate haskell for definitions @@ -301,7 +331,19 @@ blockCommands = M.fromList $ ] addMeta :: ToMetaValue a => String -> a -> LP () -addMeta field val = updateState $ setMeta field val +addMeta field val = updateState $ \st -> + st{ stateMeta = addMetaField field val $ stateMeta st } + +splitBibs :: String -> [Inlines] +splitBibs = map (str . flip replaceExtension "bib" . trim) . splitBy (==',') + +setCaption :: Inlines -> LP Blocks +setCaption ils = do + updateState $ \st -> st{ stateCaption = Just ils } + return mempty + +resetCaption :: LP () +resetCaption = updateState $ \st -> st{ stateCaption = Nothing } authors :: LP () authors = try $ do @@ -312,15 +354,17 @@ authors = try $ do -- skip e.g. \vspace{10pt} auths <- sepBy oneAuthor (controlSeq "and") char '}' - addMeta "authors" (map trimInlines auths) + addMeta "author" (map trimInlines auths) section :: Attr -> Int -> LP Blocks -section attr lvl = do +section (ident, classes, kvs) lvl = do hasChapters <- stateHasChapters `fmap` getState let lvl' = if hasChapters then lvl + 1 else lvl skipopts contents <- grouped inline - return $ headerWith attr lvl' contents + lab <- option ident $ try (spaces >> controlSeq "label" >> spaces >> braced) + attr' <- registerHeader (lab, classes, kvs) contents + return $ headerWith attr' lvl' contents inlineCommand :: LP Inlines inlineCommand = try $ do @@ -353,17 +397,18 @@ isBlockCommand s = maybe False (const True) $ M.lookup s blockCommands inlineCommands :: M.Map String (LP Inlines) inlineCommands = M.fromList $ - [ ("emph", emph <$> tok) - , ("textit", emph <$> tok) - , ("textsl", emph <$> tok) - , ("textsc", smallcaps <$> tok) - , ("sout", strikeout <$> tok) - , ("textsuperscript", superscript <$> tok) - , ("textsubscript", subscript <$> tok) + [ ("emph", extractSpaces emph <$> tok) + , ("textit", extractSpaces emph <$> tok) + , ("textsl", extractSpaces emph <$> tok) + , ("textsc", extractSpaces smallcaps <$> tok) + , ("sout", extractSpaces strikeout <$> tok) + , ("textsuperscript", extractSpaces superscript <$> tok) + , ("textsubscript", extractSpaces subscript <$> tok) , ("textbackslash", lit "\\") , ("backslash", lit "\\") , ("slash", lit "/") - , ("textbf", strong <$> tok) + , ("textbf", extractSpaces strong <$> tok) + , ("textnormal", extractSpaces (spanWith ("",["nodecor"],[])) <$> tok) , ("ldots", lit "…") , ("dots", lit "…") , ("mdots", lit "…") @@ -383,15 +428,15 @@ inlineCommands = M.fromList $ , ("{", lit "{") , ("}", lit "}") -- old TeX commands - , ("em", emph <$> inlines) - , ("it", emph <$> inlines) - , ("sl", emph <$> inlines) - , ("bf", strong <$> inlines) + , ("em", extractSpaces emph <$> inlines) + , ("it", extractSpaces emph <$> inlines) + , ("sl", extractSpaces emph <$> inlines) + , ("bf", extractSpaces strong <$> inlines) , ("rm", inlines) - , ("itshape", emph <$> inlines) - , ("slshape", emph <$> inlines) - , ("scshape", smallcaps <$> inlines) - , ("bfseries", strong <$> inlines) + , ("itshape", extractSpaces emph <$> inlines) + , ("slshape", extractSpaces emph <$> inlines) + , ("scshape", extractSpaces smallcaps <$> inlines) + , ("bfseries", extractSpaces strong <$> inlines) , ("/", pure mempty) -- italic correction , ("aa", lit "å") , ("AA", lit "Å") @@ -402,6 +447,8 @@ inlineCommands = M.fromList $ , ("l", lit "ł") , ("ae", lit "æ") , ("AE", lit "Æ") + , ("oe", lit "œ") + , ("OE", lit "Œ") , ("pounds", lit "£") , ("euro", lit "€") , ("copyright", lit "©") @@ -415,6 +462,8 @@ inlineCommands = M.fromList $ , (".", option (str ".") $ try $ tok >>= accent dot) , ("=", option (str "=") $ try $ tok >>= accent macron) , ("c", option (str "c") $ try $ tok >>= accent cedilla) + , ("v", option (str "v") $ try $ tok >>= accent hacek) + , ("u", option (str "u") $ try $ tok >>= accent breve) , ("i", lit "i") , ("\\", linebreak <$ (optional (bracketed inline) *> optional sp)) , (",", pure mempty) @@ -430,6 +479,7 @@ inlineCommands = M.fromList $ , ("footnote", (note . mconcat) <$> (char '{' *> manyTill block (char '}'))) , ("verb", doverb) , ("lstinline", doverb) + , ("Verb", doverb) , ("texttt", (code . stringify . toList) <$> tok) , ("url", (unescapeURL <$> braced) >>= \url -> pure (link url "" (str url))) @@ -488,25 +538,21 @@ inlineCommands = M.fromList $ , ("citeauthor", (try (tok *> optional sp *> controlSeq "citetext") *> complexNatbibCitation AuthorInText) <|> citation "citeauthor" AuthorInText False) + , ("nocite", mempty <$ (citation "nocite" NormalCitation False >>= + addMeta "nocite")) ] ++ map ignoreInlines -- these commands will be ignored unless --parse-raw is specified, -- in which case they will appear as raw latex blocks: - [ "noindent", "index", "nocite" ] + [ "noindent", "index" ] mkImage :: String -> LP Inlines mkImage src = do - -- try for a caption - (alt, tit) <- option (str "image", "") $ try $ do - spaces - controlSeq "caption" - optional (char '*') - ils <- grouped inline - return (ils, "fig:") + let alt = str "image" case takeExtension src of "" -> do defaultExt <- getOption readerDefaultImageExtension - return $ image (addExtension src defaultExt) tit alt - _ -> return $ image src tit alt + return $ image (addExtension src defaultExt) "" alt + _ -> return $ image src "" alt inNote :: Inlines -> Inlines inNote ils = @@ -514,9 +560,7 @@ inNote ils = unescapeURL :: String -> String unescapeURL ('\\':x:xs) | isEscapable x = x:unescapeURL xs - where isEscapable '%' = True - isEscapable '#' = True - isEscapable _ = False + where isEscapable c = c `elem` "#$%&~_^\\{}" unescapeURL (x:xs) = x:unescapeURL xs unescapeURL [] = "" @@ -539,137 +583,196 @@ doLHSverb = codeWith ("",["haskell"],[]) <$> manyTill (satisfy (/='\n')) (char ' lit :: String -> LP Inlines lit = pure . str -accent :: (Char -> Char) -> Inlines -> LP Inlines +accent :: (Char -> String) -> Inlines -> LP Inlines accent f ils = case toList ils of - (Str (x:xs) : ys) -> return $ fromList $ (Str (f x : xs) : ys) + (Str (x:xs) : ys) -> return $ fromList $ (Str (f x ++ xs) : ys) [] -> mzero _ -> return ils -grave :: Char -> Char -grave 'A' = 'À' -grave 'E' = 'È' -grave 'I' = 'Ì' -grave 'O' = 'Ò' -grave 'U' = 'Ù' -grave 'a' = 'à' -grave 'e' = 'è' -grave 'i' = 'ì' -grave 'o' = 'ò' -grave 'u' = 'ù' -grave c = c - -acute :: Char -> Char -acute 'A' = 'Á' -acute 'E' = 'É' -acute 'I' = 'Í' -acute 'O' = 'Ó' -acute 'U' = 'Ú' -acute 'Y' = 'Ý' -acute 'a' = 'á' -acute 'e' = 'é' -acute 'i' = 'í' -acute 'o' = 'ó' -acute 'u' = 'ú' -acute 'y' = 'ý' -acute 'C' = 'Ć' -acute 'c' = 'ć' -acute 'L' = 'Ĺ' -acute 'l' = 'ĺ' -acute 'N' = 'Ń' -acute 'n' = 'ń' -acute 'R' = 'Ŕ' -acute 'r' = 'ŕ' -acute 'S' = 'Ś' -acute 's' = 'ś' -acute 'Z' = 'Ź' -acute 'z' = 'ź' -acute c = c - -circ :: Char -> Char -circ 'A' = 'Â' -circ 'E' = 'Ê' -circ 'I' = 'Î' -circ 'O' = 'Ô' -circ 'U' = 'Û' -circ 'a' = 'â' -circ 'e' = 'ê' -circ 'i' = 'î' -circ 'o' = 'ô' -circ 'u' = 'û' -circ 'C' = 'Ĉ' -circ 'c' = 'ĉ' -circ 'G' = 'Ĝ' -circ 'g' = 'ĝ' -circ 'H' = 'Ĥ' -circ 'h' = 'ĥ' -circ 'J' = 'Ĵ' -circ 'j' = 'ĵ' -circ 'S' = 'Ŝ' -circ 's' = 'ŝ' -circ 'W' = 'Ŵ' -circ 'w' = 'ŵ' -circ 'Y' = 'Ŷ' -circ 'y' = 'ŷ' -circ c = c - -tilde :: Char -> Char -tilde 'A' = 'Ã' -tilde 'a' = 'ã' -tilde 'O' = 'Õ' -tilde 'o' = 'õ' -tilde 'I' = 'Ĩ' -tilde 'i' = 'ĩ' -tilde 'U' = 'Ũ' -tilde 'u' = 'ũ' -tilde 'N' = 'Ñ' -tilde 'n' = 'ñ' -tilde c = c - -umlaut :: Char -> Char -umlaut 'A' = 'Ä' -umlaut 'E' = 'Ë' -umlaut 'I' = 'Ï' -umlaut 'O' = 'Ö' -umlaut 'U' = 'Ü' -umlaut 'a' = 'ä' -umlaut 'e' = 'ë' -umlaut 'i' = 'ï' -umlaut 'o' = 'ö' -umlaut 'u' = 'ü' -umlaut c = c - -dot :: Char -> Char -dot 'C' = 'Ċ' -dot 'c' = 'ċ' -dot 'E' = 'Ė' -dot 'e' = 'ė' -dot 'G' = 'Ġ' -dot 'g' = 'ġ' -dot 'I' = 'İ' -dot 'Z' = 'Ż' -dot 'z' = 'ż' -dot c = c - -macron :: Char -> Char -macron 'A' = 'Ā' -macron 'E' = 'Ē' -macron 'I' = 'Ī' -macron 'O' = 'Ō' -macron 'U' = 'Ū' -macron 'a' = 'ā' -macron 'e' = 'ē' -macron 'i' = 'ī' -macron 'o' = 'ō' -macron 'u' = 'ū' -macron c = c - -cedilla :: Char -> Char -cedilla 'c' = 'ç' -cedilla 'C' = 'Ç' -cedilla 's' = 'ş' -cedilla 'S' = 'Ş' -cedilla c = c +grave :: Char -> String +grave 'A' = "À" +grave 'E' = "È" +grave 'I' = "Ì" +grave 'O' = "Ò" +grave 'U' = "Ù" +grave 'a' = "à" +grave 'e' = "è" +grave 'i' = "ì" +grave 'o' = "ò" +grave 'u' = "ù" +grave c = [c] + +acute :: Char -> String +acute 'A' = "Á" +acute 'E' = "É" +acute 'I' = "Í" +acute 'O' = "Ó" +acute 'U' = "Ú" +acute 'Y' = "Ý" +acute 'a' = "á" +acute 'e' = "é" +acute 'i' = "í" +acute 'o' = "ó" +acute 'u' = "ú" +acute 'y' = "ý" +acute 'C' = "Ć" +acute 'c' = "ć" +acute 'L' = "Ĺ" +acute 'l' = "ĺ" +acute 'N' = "Ń" +acute 'n' = "ń" +acute 'R' = "Ŕ" +acute 'r' = "ŕ" +acute 'S' = "Ś" +acute 's' = "ś" +acute 'Z' = "Ź" +acute 'z' = "ź" +acute c = [c] + +circ :: Char -> String +circ 'A' = "Â" +circ 'E' = "Ê" +circ 'I' = "Î" +circ 'O' = "Ô" +circ 'U' = "Û" +circ 'a' = "â" +circ 'e' = "ê" +circ 'i' = "î" +circ 'o' = "ô" +circ 'u' = "û" +circ 'C' = "Ĉ" +circ 'c' = "ĉ" +circ 'G' = "Ĝ" +circ 'g' = "ĝ" +circ 'H' = "Ĥ" +circ 'h' = "ĥ" +circ 'J' = "Ĵ" +circ 'j' = "ĵ" +circ 'S' = "Ŝ" +circ 's' = "ŝ" +circ 'W' = "Ŵ" +circ 'w' = "ŵ" +circ 'Y' = "Ŷ" +circ 'y' = "ŷ" +circ c = [c] + +tilde :: Char -> String +tilde 'A' = "Ã" +tilde 'a' = "ã" +tilde 'O' = "Õ" +tilde 'o' = "õ" +tilde 'I' = "Ĩ" +tilde 'i' = "ĩ" +tilde 'U' = "Ũ" +tilde 'u' = "ũ" +tilde 'N' = "Ñ" +tilde 'n' = "ñ" +tilde c = [c] + +umlaut :: Char -> String +umlaut 'A' = "Ä" +umlaut 'E' = "Ë" +umlaut 'I' = "Ï" +umlaut 'O' = "Ö" +umlaut 'U' = "Ü" +umlaut 'a' = "ä" +umlaut 'e' = "ë" +umlaut 'i' = "ï" +umlaut 'o' = "ö" +umlaut 'u' = "ü" +umlaut c = [c] + +dot :: Char -> String +dot 'C' = "Ċ" +dot 'c' = "ċ" +dot 'E' = "Ė" +dot 'e' = "ė" +dot 'G' = "Ġ" +dot 'g' = "ġ" +dot 'I' = "İ" +dot 'Z' = "Ż" +dot 'z' = "ż" +dot c = [c] + +macron :: Char -> String +macron 'A' = "Ā" +macron 'E' = "Ē" +macron 'I' = "Ī" +macron 'O' = "Ō" +macron 'U' = "Ū" +macron 'a' = "ā" +macron 'e' = "ē" +macron 'i' = "ī" +macron 'o' = "ō" +macron 'u' = "ū" +macron c = [c] + +cedilla :: Char -> String +cedilla 'c' = "ç" +cedilla 'C' = "Ç" +cedilla 's' = "ş" +cedilla 'S' = "Ş" +cedilla 't' = "ţ" +cedilla 'T' = "Ţ" +cedilla 'e' = "ȩ" +cedilla 'E' = "Ȩ" +cedilla 'h' = "ḩ" +cedilla 'H' = "Ḩ" +cedilla 'o' = "o̧" +cedilla 'O' = "O̧" +cedilla c = [c] + +hacek :: Char -> String +hacek 'A' = "Ǎ" +hacek 'a' = "ǎ" +hacek 'C' = "Č" +hacek 'c' = "č" +hacek 'D' = "Ď" +hacek 'd' = "ď" +hacek 'E' = "Ě" +hacek 'e' = "ě" +hacek 'G' = "Ǧ" +hacek 'g' = "ǧ" +hacek 'H' = "Ȟ" +hacek 'h' = "ȟ" +hacek 'I' = "Ǐ" +hacek 'i' = "ǐ" +hacek 'j' = "ǰ" +hacek 'K' = "Ǩ" +hacek 'k' = "ǩ" +hacek 'L' = "Ľ" +hacek 'l' = "ľ" +hacek 'N' = "Ň" +hacek 'n' = "ň" +hacek 'O' = "Ǒ" +hacek 'o' = "ǒ" +hacek 'R' = "Ř" +hacek 'r' = "ř" +hacek 'S' = "Š" +hacek 's' = "š" +hacek 'T' = "Ť" +hacek 't' = "ť" +hacek 'U' = "Ǔ" +hacek 'u' = "ǔ" +hacek 'Z' = "Ž" +hacek 'z' = "ž" +hacek c = [c] + +breve :: Char -> String +breve 'A' = "Ă" +breve 'a' = "ă" +breve 'E' = "Ĕ" +breve 'e' = "ĕ" +breve 'G' = "Ğ" +breve 'g' = "ğ" +breve 'I' = "Ĭ" +breve 'i' = "ĭ" +breve 'O' = "Ŏ" +breve 'o' = "ŏ" +breve 'U' = "Ŭ" +breve 'u' = "ŭ" +breve c = [c] tok :: LP Inlines tok = try $ grouped inline <|> inlineCommand <|> str <$> (count 1 $ inlineChar) @@ -684,7 +787,7 @@ inlineText :: LP Inlines inlineText = str <$> many1 inlineChar inlineChar :: LP Char -inlineChar = noneOf "\\$%^_&~#{}^'`-[] \t\n" +inlineChar = noneOf "\\$%^_&~#{}^'`\"‘’“”-[] \t\n" environment :: LP Blocks environment = do @@ -703,31 +806,107 @@ rawEnv name = do (withRaw (env name blocks) >>= applyMacros' . snd) else env name blocks +---- + +type IncludeParser = ParsecT [Char] [String] IO String + -- | Replace "include" commands with file contents. handleIncludes :: String -> IO String -handleIncludes = handleIncludes' [] - --- parents parameter prevents infinite include loops -handleIncludes' :: [FilePath] -> String -> IO String -handleIncludes' _ [] = return [] -handleIncludes' parents ('\\':'%':xs) = - ("\\%"++) `fmap` handleIncludes' parents xs -handleIncludes' parents ('%':xs) = handleIncludes' parents - $ drop 1 $ dropWhile (/='\n') xs -handleIncludes' parents ('\\':xs) = - case runParser include defaultParserState "input" ('\\':xs) of - Right (fs, rest) -> do yss <- mapM (\f -> if f `elem` parents - then "" <$ warn ("Include file loop in '" - ++ f ++ "'.") - else readTeXFile f >>= - handleIncludes' (f:parents)) fs - rest' <- handleIncludes' parents rest - return $ intercalate "\n" yss ++ rest' - _ -> case runParser (verbCmd <|> verbatimEnv) defaultParserState - "input" ('\\':xs) of - Right (r, rest) -> (r ++) `fmap` handleIncludes' parents rest - _ -> ('\\':) `fmap` handleIncludes' parents xs -handleIncludes' parents (x:xs) = (x:) `fmap` handleIncludes' parents xs +handleIncludes s = do + res <- runParserT includeParser' [] "input" s + case res of + Right s' -> return s' + Left e -> error $ show e + +includeParser' :: IncludeParser +includeParser' = + concat <$> many (comment' <|> escaped' <|> blob' <|> include' + <|> startMarker' <|> endMarker' + <|> verbCmd' <|> verbatimEnv' <|> backslash') + +comment' :: IncludeParser +comment' = do + char '%' + xs <- manyTill anyChar newline + return ('%':xs ++ "\n") + +escaped' :: IncludeParser +escaped' = try $ string "\\%" <|> string "\\\\" + +verbCmd' :: IncludeParser +verbCmd' = fmap snd <$> + withRaw $ try $ do + string "\\verb" + c <- anyChar + manyTill anyChar (char c) + +verbatimEnv' :: IncludeParser +verbatimEnv' = fmap snd <$> + withRaw $ try $ do + string "\\begin" + name <- braced' + guard $ name `elem` ["verbatim", "Verbatim", "lstlisting", + "minted", "alltt"] + manyTill anyChar (try $ string $ "\\end{" ++ name ++ "}") + +blob' :: IncludeParser +blob' = try $ many1 (noneOf "\\%") + +backslash' :: IncludeParser +backslash' = string "\\" + +braced' :: IncludeParser +braced' = try $ char '{' *> manyTill (satisfy (/='}')) (char '}') + +include' :: IncludeParser +include' = do + fs' <- try $ do + char '\\' + name <- try (string "include") + <|> try (string "input") + <|> string "usepackage" + -- skip options + skipMany $ try $ char '[' *> (manyTill anyChar (char ']')) + fs <- (map trim . splitBy (==',')) <$> braced' + return $ if name == "usepackage" + then map (flip replaceExtension ".sty") fs + else map (flip replaceExtension ".tex") fs + pos <- getPosition + containers <- getState + let fn = case containers of + (f':_) -> f' + [] -> "input" + -- now process each include file in order... + rest <- getInput + results' <- forM fs' (\f -> do + when (f `elem` containers) $ + fail "Include file loop!" + contents <- lift $ readTeXFile f + return $ "\\PandocStartInclude{" ++ f ++ "}" ++ + contents ++ "\\PandocEndInclude{" ++ + fn ++ "}{" ++ show (sourceLine pos) ++ "}{" + ++ show (sourceColumn pos) ++ "}") + setInput $ concat results' ++ rest + return "" + +startMarker' :: IncludeParser +startMarker' = try $ do + string "\\PandocStartInclude" + fn <- braced' + updateState (fn:) + setPosition $ newPos fn 1 1 + return $ "\\PandocStartInclude{" ++ fn ++ "}" + +endMarker' :: IncludeParser +endMarker' = try $ do + string "\\PandocEndInclude" + fn <- braced' + ln <- braced' + co <- braced' + updateState tail + setPosition $ newPos fn (fromMaybe 1 $ safeRead ln) (fromMaybe 1 $ safeRead co) + return $ "\\PandocEndInclude{" ++ fn ++ "}{" ++ ln ++ "}{" ++ + co ++ "}" readTeXFile :: FilePath -> IO String readTeXFile f = do @@ -742,27 +921,7 @@ readFileFromDirs (d:ds) f = E.catch (UTF8.readFile $ d </> f) $ \(_ :: E.SomeException) -> readFileFromDirs ds f -include :: LP ([FilePath], String) -include = do - name <- controlSeq "include" - <|> controlSeq "input" - <|> controlSeq "usepackage" - skipopts - fs <- (splitBy (==',')) <$> braced - rest <- getInput - let fs' = if name == "usepackage" - then map (flip replaceExtension ".sty") fs - else map (flip replaceExtension ".tex") fs - return (fs', rest) - -verbCmd :: LP (String, String) -verbCmd = do - (_,r) <- withRaw $ do - controlSeq "verb" - c <- anyChar - manyTill anyChar (char c) - rest <- getInput - return (r, rest) +---- keyval :: LP (String, String) keyval = try $ do @@ -778,24 +937,12 @@ keyvals :: LP [(String, String)] keyvals = try $ char '[' *> manyTill keyval (char ']') alltt :: String -> LP Blocks -alltt t = bottomUp strToCode <$> parseFromString blocks +alltt t = walk strToCode <$> parseFromString blocks (substitute " " "\\ " $ substitute "%" "\\%" $ concat $ intersperse "\\\\\n" $ lines t) where strToCode (Str s) = Code nullAttr s strToCode x = x -verbatimEnv :: LP (String, String) -verbatimEnv = do - (_,r) <- withRaw $ do - controlSeq "begin" - name <- braced - guard $ name == "verbatim" || name == "Verbatim" || - name == "lstlisting" || name == "minted" || - name == "alltt" - verbEnv name - rest <- getInput - return (r,rest) - rawLaTeXBlock :: Parser [Char] ParserState String rawLaTeXBlock = snd <$> try (withRaw (environment <|> blockCommand)) @@ -804,12 +951,33 @@ rawLaTeXInline = do raw <- (snd <$> withRaw inlineCommand) <|> (snd <$> withRaw blockCommand) RawInline "latex" <$> applyMacros' raw +addImageCaption :: Blocks -> LP Blocks +addImageCaption = walkM go + where go (Image alt (src,tit)) = do + mbcapt <- stateCaption <$> getState + case mbcapt of + Just ils -> return (Image (toList ils) (src, "fig:")) + Nothing -> return (Image alt (src,tit)) + go x = return x + +addTableCaption :: Blocks -> LP Blocks +addTableCaption = walkM go + where go (Table c als ws hs rs) = do + mbcapt <- stateCaption <$> getState + case mbcapt of + Just ils -> return (Table (toList ils) als ws hs rs) + Nothing -> return (Table c als ws hs rs) + go x = return x + environments :: M.Map String (LP Blocks) environments = M.fromList [ ("document", env "document" blocks <* skipMany anyChar) , ("letter", env "letter" letter_contents) - , ("figure", env "figure" $ skipopts *> blocks) + , ("figure", env "figure" $ + resetCaption *> skipopts *> blocks >>= addImageCaption) , ("center", env "center" blocks) + , ("table", env "table" $ + resetCaption *> skipopts *> blocks >>= addTableCaption) , ("tabular", env "tabular" simpTable) , ("quote", blockQuote <$> env "quote" blocks) , ("quotation", blockQuote <$> env "quotation" blocks) @@ -838,7 +1006,7 @@ environments = M.fromList lookup "numbers" options == Just "left" ] ++ maybe [] (:[]) (lookup "language" options >>= fromListingsLanguage) - let attr = ("",classes,kvs) + let attr = (fromMaybe "" (lookup "label" options),classes,kvs) codeBlockWith attr <$> (verbEnv "lstlisting")) , ("minted", do options <- option [] keyvals lang <- grouped (many1 $ satisfy (/='}')) @@ -966,15 +1134,15 @@ paragraph = do preamble :: LP Blocks preamble = mempty <$> manyTill preambleBlock beginDoc - where beginDoc = lookAhead $ controlSeq "begin" *> string "{document}" - preambleBlock = (mempty <$ comment) - <|> (mempty <$ sp) - <|> (mempty <$ blanklines) - <|> (mempty <$ macro) - <|> blockCommand - <|> (mempty <$ anyControlSeq) - <|> (mempty <$ braced) - <|> (mempty <$ anyChar) + where beginDoc = lookAhead $ try $ controlSeq "begin" *> string "{document}" + preambleBlock = (void comment) + <|> (void sp) + <|> (void blanklines) + <|> (void macro) + <|> (void blockCommand) + <|> (void anyControlSeq) + <|> (void braced) + <|> (void anyChar) ------- @@ -986,12 +1154,8 @@ addPrefix _ _ = [] addSuffix :: [Inline] -> [Citation] -> [Citation] addSuffix s ks@(_:_) = - let k = last ks - s' = case s of - (Str (c:_):_) - | not (isPunctuation c || isSpace c) -> Str "," : Space : s - _ -> s - in init ks ++ [k {citationSuffix = citationSuffix k ++ s'}] + let k = last ks + in init ks ++ [k {citationSuffix = citationSuffix k ++ s}] addSuffix _ _ = [] simpleCiteArgs :: LP [Citation] @@ -999,6 +1163,7 @@ simpleCiteArgs = try $ do first <- optionMaybe $ toList <$> opt second <- optionMaybe $ toList <$> opt char '{' + optional sp keys <- manyTill citationLabel (char '}') let (pre, suf) = case (first , second ) of (Just s , Nothing) -> (mempty, s ) @@ -1014,18 +1179,24 @@ simpleCiteArgs = try $ do return $ addPrefix pre $ addSuffix suf $ map conv keys citationLabel :: LP String -citationLabel = trim <$> - (many1 (satisfy $ \c -> c /=',' && c /='}') <* optional (char ',') <* optional sp) +citationLabel = optional sp *> + (many1 (satisfy isBibtexKeyChar) + <* optional sp + <* optional (char ',') + <* optional sp) + where isBibtexKeyChar c = isAlphaNum c || c `elem` ".:;?!`'()/*@_+=-[]*" cites :: CitationMode -> Bool -> LP [Citation] cites mode multi = try $ do cits <- if multi then many1 simpleCiteArgs else count 1 simpleCiteArgs - let (c:cs) = concat cits + let cs = concat cits return $ case mode of - AuthorInText -> c {citationMode = mode} : cs - _ -> map (\a -> a {citationMode = mode}) (c:cs) + AuthorInText -> case cs of + (c:rest) -> c {citationMode = mode} : rest + [] -> [] + _ -> map (\a -> a {citationMode = mode}) cs citation :: String -> CitationMode -> Bool -> LP Inlines citation name mode multi = do @@ -1057,12 +1228,14 @@ complexNatbibCitation mode = try $ do parseAligns :: LP [Alignment] parseAligns = try $ do char '{' - optional $ char '|' + let maybeBar = skipMany $ sp <|> () <$ char '|' <|> () <$ try (string "@{}") + maybeBar let cAlign = AlignCenter <$ char 'c' let lAlign = AlignLeft <$ char 'l' let rAlign = AlignRight <$ char 'r' - let alignChar = optional sp *> (cAlign <|> lAlign <|> rAlign) - aligns' <- sepEndBy alignChar (optional $ char '|') + let parAlign = AlignLeft <$ (char 'p' >> braced) + let alignChar = cAlign <|> lAlign <|> rAlign <|> parAlign + aligns' <- sepEndBy alignChar maybeBar spaces char '}' spaces @@ -1082,10 +1255,14 @@ parseTableRow :: Int -- ^ number of columns parseTableRow cols = try $ do let tableCellInline = notFollowedBy (amp <|> lbreak) >> inline let tableCell = (plain . trimInlines . mconcat) <$> many tableCellInline - cells' <- sepBy tableCell amp - guard $ length cells' == cols + cells' <- sepBy1 tableCell amp + let numcells = length cells' + guard $ numcells <= cols && numcells >= 1 + guard $ cells' /= [mempty] + -- note: a & b in a three-column table leaves an empty 3rd cell: + let cells'' = cells' ++ replicate (cols - numcells) mempty spaces - return cells' + return cells'' simpTable :: LP Blocks simpTable = try $ do @@ -1096,9 +1273,23 @@ simpTable = try $ do header' <- option [] $ try (parseTableRow cols <* lbreak <* hline) rows <- sepEndBy (parseTableRow cols) (lbreak <* optional hline) spaces + skipMany (comment *> spaces) let header'' = if null header' then replicate cols mempty else header' lookAhead $ controlSeq "end" -- make sure we're at end return $ table mempty (zip aligns (repeat 0)) header'' rows +startInclude :: LP Blocks +startInclude = do + fn <- braced + setPosition $ newPos fn 1 1 + return mempty + +endInclude :: LP Blocks +endInclude = do + fn <- braced + ln <- braced + co <- braced + setPosition $ newPos fn (fromMaybe 1 $ safeRead ln) (fromMaybe 1 $ safeRead co) + return mempty diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index a3500fbcf..690256224 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -1,6 +1,6 @@ {-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7 {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Markdown - Copyright : Copyright (C) 2006-2013 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -33,6 +33,7 @@ module Text.Pandoc.Readers.Markdown ( readMarkdown, import Data.List ( transpose, sortBy, findIndex, intersperse, intercalate ) import qualified Data.Map as M +import Data.Scientific (coefficient, base10Exponent) import Data.Ord ( comparing ) import Data.Char ( isAlphaNum, toLower ) import Data.Maybe @@ -49,13 +50,10 @@ import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>)) import Text.Pandoc.Options import Text.Pandoc.Shared import Text.Pandoc.XML (fromEntities) -import Text.Pandoc.Asciify (toAsciiChar) import Text.Pandoc.Parsing hiding (tableWith) import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock ) import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, isInlineTag, isBlockTag, isTextTag, isCommentTag ) -import Text.Pandoc.Biblio (processBiblio) -import qualified Text.CSL as CSL import Data.Monoid (mconcat, mempty) import Control.Applicative ((<$>), (<*), (*>), (<$)) import Control.Monad @@ -63,6 +61,8 @@ import System.FilePath (takeExtension, addExtension) import Text.HTML.TagSoup import Text.HTML.TagSoup.Match (tagOpen) import qualified Data.Set as Set +import Text.Printf (printf) +import Debug.Trace (trace) type MarkdownParser = Parser [Char] ParserState @@ -203,13 +203,10 @@ dateLine = try $ do skipSpaces trimInlinesF . mconcat <$> manyTill inline newline -titleBlock :: MarkdownParser (F (Pandoc -> Pandoc)) -titleBlock = pandocTitleBlock - <|> yamlTitleBlock - <|> mmdTitleBlock - <|> return (return id) +titleBlock :: MarkdownParser () +titleBlock = pandocTitleBlock <|> mmdTitleBlock -pandocTitleBlock :: MarkdownParser (F (Pandoc -> Pandoc)) +pandocTitleBlock :: MarkdownParser () pandocTitleBlock = try $ do guardEnabled Ext_pandoc_title_block lookAhead (char '%') @@ -217,49 +214,61 @@ pandocTitleBlock = try $ do author <- option (return []) authorsLine date <- option mempty dateLine optional blanklines - return $ do - title' <- title - author' <- author - date' <- date - return $ B.setMeta "title" title' - . B.setMeta "author" author' - . B.setMeta "date" date' - -yamlTitleBlock :: MarkdownParser (F (Pandoc -> Pandoc)) -yamlTitleBlock = try $ do + let meta' = do title' <- title + author' <- author + date' <- date + return $ + (if B.isNull title' then id else B.setMeta "title" title') + . (if null author' then id else B.setMeta "author" author') + . (if B.isNull date' then id else B.setMeta "date" date') + $ nullMeta + updateState $ \st -> st{ stateMeta' = stateMeta' st <> meta' } + +yamlMetaBlock :: MarkdownParser (F Blocks) +yamlMetaBlock = try $ do guardEnabled Ext_yaml_metadata_block pos <- getPosition string "---" blankline - rawYaml <- unlines <$> manyTill anyLine stopLine + notFollowedBy blankline -- if --- is followed by a blank it's an HRULE + rawYamlLines <- manyTill anyLine stopLine + -- by including --- and ..., we allow yaml blocks with just comments: + let rawYaml = unlines ("---" : (rawYamlLines ++ ["..."])) optional blanklines opts <- stateOptions <$> getState - case Yaml.decodeEither' $ UTF8.fromString rawYaml of - Right (Yaml.Object hashmap) -> return $ return $ - H.foldrWithKey (\k v f -> - if ignorable k - then f - else B.setMeta (T.unpack k) (yamlToMeta opts v) . f) - id hashmap - Right _ -> do - addWarning (Just pos) "YAML header is not an object" - return $ return id - Left err' -> do - case err' of - InvalidYaml (Just YamlParseException{ - yamlProblem = problem - , yamlContext = _ctxt - , yamlProblemMark = Yaml.YamlMark { - yamlLine = yline - , yamlColumn = ycol - }}) -> - addWarning (Just $ setSourceLine - (setSourceColumn pos (sourceColumn pos + ycol)) - (sourceLine pos + 1 + yline)) - $ "Could not parse YAML header: " ++ problem - _ -> addWarning (Just pos) - $ "Could not parse YAML header: " ++ show err' - return $ return id + meta' <- case Yaml.decodeEither' $ UTF8.fromString rawYaml of + Right (Yaml.Object hashmap) -> return $ return $ + H.foldrWithKey (\k v m -> + if ignorable k + then m + else B.setMeta (T.unpack k) + (yamlToMeta opts v) m) + nullMeta hashmap + Right Yaml.Null -> return $ return nullMeta + Right _ -> do + addWarning (Just pos) "YAML header is not an object" + return $ return nullMeta + Left err' -> do + case err' of + InvalidYaml (Just YamlParseException{ + yamlProblem = problem + , yamlContext = _ctxt + , yamlProblemMark = Yaml.YamlMark { + yamlLine = yline + , yamlColumn = ycol + }}) -> + addWarning (Just $ setSourceLine + (setSourceColumn pos + (sourceColumn pos + ycol)) + (sourceLine pos + 1 + yline)) + $ "Could not parse YAML header: " ++ + problem + _ -> addWarning (Just pos) + $ "Could not parse YAML header: " ++ + show err' + return $ return nullMeta + updateState $ \st -> st{ stateMeta' = stateMeta' st <> meta' } + return mempty -- ignore fields ending with _ ignorable :: Text -> Bool @@ -277,8 +286,12 @@ toMetaValue opts x = yamlToMeta :: ReaderOptions -> Yaml.Value -> MetaValue yamlToMeta opts (Yaml.String t) = toMetaValue opts t -yamlToMeta _ (Yaml.Number n) = MetaString $ show n -yamlToMeta _ (Yaml.Bool b) = MetaString $ map toLower $ show b +yamlToMeta _ (Yaml.Number n) + -- avoid decimal points for numbers that don't need them: + | base10Exponent n >= 0 = MetaString $ show + $ coefficient n * (10 ^ base10Exponent n) + | otherwise = MetaString $ show n +yamlToMeta _ (Yaml.Bool b) = MetaBool b yamlToMeta opts (Yaml.Array xs) = B.toMetaValue $ map (yamlToMeta opts) $ V.toList xs yamlToMeta opts (Yaml.Object o) = MetaMap $ H.foldrWithKey (\k v m -> @@ -292,13 +305,13 @@ yamlToMeta _ _ = MetaString "" stopLine :: MarkdownParser () stopLine = try $ (string "---" <|> string "...") >> blankline >> return () -mmdTitleBlock :: MarkdownParser (F (Pandoc -> Pandoc)) +mmdTitleBlock :: MarkdownParser () mmdTitleBlock = try $ do guardEnabled Ext_mmd_title_block kvPairs <- many1 kvPair blanklines - return $ return $ \(Pandoc m bs) -> - Pandoc (foldl (\m' (k,v) -> addMetaField k v m') m kvPairs) bs + updateState $ \st -> st{ stateMeta' = stateMeta' st <> + return (Meta $ M.fromList kvPairs) } kvPair :: MarkdownParser (String, MetaValue) kvPair = try $ do @@ -315,15 +328,12 @@ parseMarkdown = do updateState $ \state -> state { stateOptions = let oldOpts = stateOptions state in oldOpts{ readerParseRaw = True } } - titleTrans <- option (return id) titleBlock + optional titleBlock blocks <- parseBlocks st <- getState - mbsty <- getOption readerCitationStyle - refs <- getOption readerReferences - return $ processBiblio mbsty refs - $ runF titleTrans st - $ B.doc - $ runF blocks st + let meta = runF (stateMeta' st) st + let Pandoc _ bs = B.doc $ runF blocks st + return $ Pandoc meta bs addWarning :: Maybe SourcePos -> String -> MarkdownParser () addWarning mbpos msg = @@ -339,10 +349,8 @@ referenceKey = try $ do char ':' skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[') let sourceURL = liftM unwords $ many $ try $ do - notFollowedBy' referenceTitle - skipMany spaceChar - optional $ newline >> notFollowedBy blankline skipMany spaceChar + notFollowedBy' referenceTitle notFollowedBy' (() <$ reference) many1 $ notFollowedBy space >> litChar let betweenAngles = try $ char '<' >> @@ -351,7 +359,7 @@ referenceKey = try $ do tit <- option "" referenceTitle -- currently we just ignore MMD-style link/image attributes _kvs <- option [] $ guardEnabled Ext_link_attributes - >> many (spnl >> keyValAttr) + >> many (try $ spnl >> keyValAttr) blanklines let target = (escapeURI $ trimr src, tit) st <- getState @@ -437,19 +445,26 @@ parseBlocks :: MarkdownParser (F Blocks) parseBlocks = mconcat <$> manyTill block eof block :: MarkdownParser (F Blocks) -block = choice [ mempty <$ blanklines +block = do + tr <- getOption readerTrace + pos <- getPosition + res <- choice [ mempty <$ blanklines , codeBlockFenced + , yamlMetaBlock , guardEnabled Ext_latex_macros *> (macro >>= return . return) + -- note: bulletList needs to be before header because of + -- the possibility of empty list items: - + , bulletList , header , lhsCodeBlock , rawTeXBlock + , divHtml , htmlBlock , table , lineBlock , codeBlockIndented , blockQuote , hrule - , bulletList , orderedList , definitionList , noteBlock @@ -458,6 +473,11 @@ block = choice [ mempty <$ blanklines , para , plain ] <?> "block" + when tr $ do + st <- getState + trace (printf "line %d: %s" (sourceLine pos) + (take 60 $ show $ B.toList $ runF res st)) (return ()) + return res -- -- header blocks @@ -466,39 +486,15 @@ block = choice [ mempty <$ blanklines header :: MarkdownParser (F Blocks) header = setextHeader <|> atxHeader <?> "header" --- returns unique identifier -addToHeaderList :: Attr -> F Inlines -> MarkdownParser Attr -addToHeaderList (ident,classes,kvs) text = do - let header' = runF text defaultParserState - exts <- getOption readerExtensions - let insert' = M.insertWith (\_new old -> old) - if null ident && Ext_auto_identifiers `Set.member` exts - then do - ids <- stateIdentifiers `fmap` getState - let id' = uniqueIdent (B.toList header') ids - let id'' = if Ext_ascii_identifiers `Set.member` exts - then catMaybes $ map toAsciiChar id' - else id' - updateState $ \st -> st{ - stateIdentifiers = if id' == id'' - then id' : ids - else id' : id'' : ids, - stateHeaders = insert' header' id' $ stateHeaders st } - return (id'',classes,kvs) - else do - unless (null ident) $ - updateState $ \st -> st{ - stateHeaders = insert' header' ident $ stateHeaders st } - return (ident,classes,kvs) - atxHeader :: MarkdownParser (F Blocks) atxHeader = try $ do level <- many1 (char '#') >>= return . length - notFollowedBy (char '.' <|> char ')') -- this would be a list + notFollowedBy $ guardEnabled Ext_fancy_lists >> + (char '.' <|> char ')') -- this would be a list skipSpaces text <- trimInlinesF . mconcat <$> many (notFollowedBy atxClosing >> inline) attr <- atxClosing - attr' <- addToHeaderList attr text + attr' <- registerHeader attr (runF text defaultParserState) return $ B.headerWith attr' level <$> text atxClosing :: MarkdownParser Attr @@ -537,7 +533,7 @@ setextHeader = try $ do many (char underlineChar) blanklines let level = (fromMaybe 0 $ findIndex (== underlineChar) setextHChars) + 1 - attr' <- addToHeaderList attr text + attr' <- registerHeader attr (runF text defaultParserState) return $ B.headerWith attr' level <$> text -- @@ -622,12 +618,19 @@ codeBlockFenced = try $ do skipMany spaceChar attr <- option ([],[],[]) $ try (guardEnabled Ext_fenced_code_attributes >> attributes) - <|> ((\x -> ("",[x],[])) <$> identifier) + <|> ((\x -> ("",[toLanguageId x],[])) <$> many1 nonspaceChar) blankline contents <- manyTill anyLine (blockDelimiter (== c) (Just size)) blanklines return $ return $ B.codeBlockWith attr $ intercalate "\n" contents +-- correctly handle github language identifiers +toLanguageId :: String -> String +toLanguageId = map toLower . go + where go "c++" = "cpp" + go "objective-c" = "objectivec" + go x = x + codeBlockIndented :: MarkdownParser (F Blocks) codeBlockIndented = do contents <- many1 (indentedLine <|> @@ -718,7 +721,7 @@ bulletListStart = try $ do skipNonindentSpaces notFollowedBy' (() <$ hrule) -- because hrules start out just like lists satisfy isBulletListMarker - spaceChar + spaceChar <|> lookAhead newline skipSpaces anyOrderedListStart :: MarkdownParser (Int, ListNumberStyle, ListNumberDelim) @@ -746,11 +749,16 @@ listStart = bulletListStart <|> (anyOrderedListStart >> return ()) -- parse a line of a list item (start = parser for beginning of list item) listLine :: MarkdownParser String listLine = try $ do - notFollowedBy blankline notFollowedBy' (do indentSpaces - many (spaceChar) + many spaceChar listStart) - chunks <- manyTill (liftM snd (htmlTag isCommentTag) <|> count 1 anyChar) newline + notFollowedBy' $ htmlTag (~== TagClose "div") + optional (() <$ indentSpaces) + chunks <- manyTill + ( many1 (satisfy $ \c -> c /= '\n' && c /= '<') + <|> liftM snd (htmlTag isCommentTag) + <|> count 1 anyChar + ) newline return $ concat chunks -- parse raw text for one list item, excluding start marker and continuations @@ -759,7 +767,7 @@ rawListItem :: MarkdownParser a rawListItem start = try $ do start first <- listLine - rest <- many (notFollowedBy listStart >> listLine) + rest <- many (notFollowedBy listStart >> notFollowedBy blankline >> listLine) blanks <- many blankline return $ unlines (first:rest) ++ blanks @@ -777,6 +785,7 @@ listContinuationLine :: MarkdownParser String listContinuationLine = try $ do notFollowedBy blankline notFollowedBy' listStart + notFollowedBy' $ htmlTag (~== TagClose "div") optional indentSpaces result <- anyLine return $ result ++ "\n" @@ -801,8 +810,8 @@ listItem start = try $ do orderedList :: MarkdownParser (F Blocks) orderedList = try $ do (start, style, delim) <- lookAhead anyOrderedListStart - unless ((style == DefaultStyle || style == Decimal || style == Example) && - (delim == DefaultDelim || delim == Period)) $ + unless (style `elem` [DefaultStyle, Decimal, Example] && + delim `elem` [DefaultDelim, Period]) $ guardEnabled Ext_fancy_lists when (style == Example) $ guardEnabled Ext_example_lists items <- fmap sequence $ many1 $ listItem @@ -863,22 +872,6 @@ definitionList = do items <- fmap sequence $ many1 definitionListItem return $ B.definitionList <$> fmap compactify'DL items -compactify'DL :: [(Inlines, [Blocks])] -> [(Inlines, [Blocks])] -compactify'DL items = - let defs = concatMap snd items - defBlocks = reverse $ concatMap B.toList defs - isPara (Para _) = True - isPara _ = False - in case defBlocks of - (Para x:_) -> if not $ any isPara (drop 1 defBlocks) - then let (t,ds) = last items - lastDef = B.toList $ last ds - ds' = init ds ++ - [B.fromList $ init lastDef ++ [Plain x]] - in init items ++ [(t, ds')] - else items - _ -> items - -- -- paragraph block -- @@ -891,8 +884,11 @@ para = try $ do $ try $ do newline (blanklines >> return mempty) - <|> (guardDisabled Ext_blank_before_blockquote >> lookAhead blockQuote) - <|> (guardDisabled Ext_blank_before_header >> lookAhead header) + <|> (guardDisabled Ext_blank_before_blockquote >> () <$ lookAhead blockQuote) + <|> (guardEnabled Ext_backtick_code_blocks >> () <$ lookAhead codeBlockFenced) + <|> (guardDisabled Ext_blank_before_header >> () <$ lookAhead header) + <|> (guardEnabled Ext_lists_without_preceding_blankline >> + () <$ lookAhead listStart) return $ do result' <- result case B.toList result' of @@ -911,7 +907,9 @@ plain = fmap B.plain . trimInlinesF . mconcat <$> many1 inline -- htmlElement :: MarkdownParser String -htmlElement = strictHtmlBlock <|> liftM snd (htmlTag isBlockTag) +htmlElement = rawVerbatimBlock + <|> strictHtmlBlock + <|> liftM snd (htmlTag isBlockTag) htmlBlock :: MarkdownParser (F Blocks) htmlBlock = do @@ -932,8 +930,8 @@ strictHtmlBlock = htmlInBalanced (not . isInlineTag) rawVerbatimBlock :: MarkdownParser String rawVerbatimBlock = try $ do - (TagOpen tag _, open) <- htmlTag (tagOpen (\t -> - t == "pre" || t == "style" || t == "script") + (TagOpen tag _, open) <- htmlTag (tagOpen (flip elem + ["pre", "style", "script"]) (const True)) contents <- manyTill anyChar (htmlTag (~== TagClose tag)) return $ open ++ contents ++ renderTags [TagClose tag] @@ -941,8 +939,10 @@ rawVerbatimBlock = try $ do rawTeXBlock :: MarkdownParser (F Blocks) rawTeXBlock = do guardEnabled Ext_raw_tex - result <- (B.rawBlock "latex" <$> rawLaTeXBlock) - <|> (B.rawBlock "context" <$> rawConTeXtEnvironment) + result <- (B.rawBlock "latex" . concat <$> + rawLaTeXBlock `sepEndBy1` blankline) + <|> (B.rawBlock "context" . concat <$> + rawConTeXtEnvironment `sepEndBy1` blankline) spaces return $ return result @@ -951,6 +951,8 @@ rawHtmlBlocks = do htmlBlocks <- many1 $ try $ do s <- rawVerbatimBlock <|> try ( do (t,raw) <- htmlTag isBlockTag + guard $ t ~/= TagOpen "div" [] && + t ~/= TagClose "div" exts <- getOption readerExtensions -- if open tag, need markdown="1" if -- markdown_attributes extension is set @@ -1117,13 +1119,11 @@ multilineTable headless = multilineTableHeader :: Bool -- ^ Headerless table -> MarkdownParser (F [Blocks], [Alignment], [Int]) multilineTableHeader headless = try $ do - if headless - then return '\n' - else tableSep >>~ notFollowedBy blankline + unless headless $ + tableSep >> notFollowedBy blankline rawContent <- if headless then return $ repeat "" - else many1 - (notFollowedBy tableSep >> many1Till anyChar newline) + else many1 $ notFollowedBy tableSep >> anyLine initSp <- nonindentSpaces dashes <- many1 (dashedLine '-') newline @@ -1133,12 +1133,12 @@ multilineTableHeader headless = try $ do then liftM (map (:[]) . tail . splitStringByIndices (init indices)) $ lookAhead anyLine else return $ transpose $ map - (\ln -> tail $ splitStringByIndices (init indices) ln) + (tail . splitStringByIndices (init indices)) rawContent let aligns = zipWith alignType rawHeadsList lengths let rawHeads = if headless then replicate (length dashes) "" - else map unwords rawHeadsList + else map (unlines . map trim) rawHeadsList heads <- fmap sequence $ mapM (parseFromString (mconcat <$> many plain)) $ map trim rawHeads @@ -1195,7 +1195,7 @@ gridTableHeader headless = try $ do -- RST does not have a notion of alignments let rawHeads = if headless then replicate (length dashes) "" - else map unwords $ transpose + else map (unlines . map trim) $ transpose $ map (gridTableSplitLine indices) rawContent heads <- fmap sequence $ mapM (parseFromString parseBlocks . trim) rawHeads return (heads, aligns, indices) @@ -1227,11 +1227,20 @@ removeOneLeadingSpace xs = gridTableFooter :: MarkdownParser [Char] gridTableFooter = blanklines +pipeBreak :: MarkdownParser [Alignment] +pipeBreak = try $ do + nonindentSpaces + openPipe <- (True <$ char '|') <|> return False + first <- pipeTableHeaderPart + rest <- many $ sepPipe *> pipeTableHeaderPart + -- surrounding pipes needed for a one-column table: + guard $ not (null rest && not openPipe) + optional (char '|') + blankline + return (first:rest) + pipeTable :: MarkdownParser ([Alignment], [Double], F [Blocks], F [[Blocks]]) pipeTable = try $ do - let pipeBreak = nonindentSpaces *> optional (char '|') *> - pipeTableHeaderPart `sepBy1` sepPipe <* - optional (char '|') <* blankline (heads,aligns) <- try ( pipeBreak >>= \als -> return (return $ replicate (length als) mempty, als)) <|> ( pipeTableRow >>= \row -> pipeBreak >>= \als -> @@ -1250,12 +1259,13 @@ sepPipe = try $ do pipeTableRow :: MarkdownParser (F [Blocks]) pipeTableRow = do nonindentSpaces - optional (char '|') + openPipe <- (True <$ char '|') <|> return False let cell = mconcat <$> many (notFollowedBy (blankline <|> char '|') >> inline) first <- cell - sepPipe - rest <- cell `sepBy1` sepPipe + rest <- many $ sepPipe *> cell + -- surrounding pipes needed for a one-column table: + guard $ not (null rest && not openPipe) optional (char '|') blankline let cells = sequence (first:rest) @@ -1340,19 +1350,18 @@ inline = choice [ whitespace , str , endline , code - , fours - , strong - , emph + , strongOrEmph , note , cite , link , image , math , strikeout - , superscript , subscript + , superscript , inlineNote -- after superscript because of ^[link](/foo)^ , autoLink + , spanHtml , rawHtmlInline , escapedChar , rawLaTeXInline' @@ -1382,7 +1391,7 @@ ltSign :: MarkdownParser (F Inlines) ltSign = do guardDisabled Ext_raw_html <|> guardDisabled Ext_markdown_in_html_blocks - <|> (notFollowedBy' rawHtmlBlocks >> return ()) + <|> (notFollowedBy' (htmlTag isBlockTag) >> return ()) char '<' return $ return $ B.str "<" @@ -1422,47 +1431,57 @@ math :: MarkdownParser (F Inlines) math = (return . B.displayMath <$> (mathDisplay >>= applyMacros')) <|> (return . B.math <$> (mathInline >>= applyMacros')) -mathDisplay :: MarkdownParser String -mathDisplay = - (guardEnabled Ext_tex_math_dollars >> mathDisplayWith "$$" "$$") - <|> (guardEnabled Ext_tex_math_single_backslash >> - mathDisplayWith "\\[" "\\]") - <|> (guardEnabled Ext_tex_math_double_backslash >> - mathDisplayWith "\\\\[" "\\\\]") - -mathDisplayWith :: String -> String -> MarkdownParser String -mathDisplayWith op cl = try $ do - string op - many1Till (noneOf "\n" <|> (newline >>~ notFollowedBy' blankline)) (try $ string cl) - -mathInline :: MarkdownParser String -mathInline = - (guardEnabled Ext_tex_math_dollars >> mathInlineWith "$" "$") - <|> (guardEnabled Ext_tex_math_single_backslash >> - mathInlineWith "\\(" "\\)") - <|> (guardEnabled Ext_tex_math_double_backslash >> - mathInlineWith "\\\\(" "\\\\)") - -mathInlineWith :: String -> String -> MarkdownParser String -mathInlineWith op cl = try $ do - string op - notFollowedBy space - words' <- many1Till (count 1 (noneOf "\n\\") - <|> (char '\\' >> anyChar >>= \c -> return ['\\',c]) - <|> count 1 newline <* notFollowedBy' blankline - *> return " ") - (try $ string cl) - notFollowedBy digit -- to prevent capture of $5 - return $ concat words' - --- to avoid performance problems, treat 4 or more _ or * or ~ or ^ in a row --- as a literal rather than attempting to parse for emph/strong/strikeout/super/sub -fours :: Parser [Char] st (F Inlines) -fours = try $ do - x <- char '*' <|> char '_' <|> char '~' <|> char '^' - count 2 $ satisfy (==x) - rest <- many1 (satisfy (==x)) - return $ return $ B.str (x:x:x:rest) +-- Parses material enclosed in *s, **s, _s, or __s. +-- Designed to avoid backtracking. +enclosure :: Char + -> MarkdownParser (F Inlines) +enclosure c = do + cs <- many1 (char c) + (return (B.str cs) <>) <$> whitespace + <|> case length cs of + 3 -> three c + 2 -> two c mempty + 1 -> one c mempty + _ -> return (return $ B.str cs) + +-- Parse inlines til you hit one c or a sequence of two cs. +-- If one c, emit emph and then parse two. +-- If two cs, emit strong and then parse one. +-- Otherwise, emit ccc then the results. +three :: Char -> MarkdownParser (F Inlines) +three c = do + contents <- mconcat <$> many (notFollowedBy (char c) >> inline) + (try (string [c,c,c]) >> return ((B.strong . B.emph) <$> contents)) + <|> (try (string [c,c]) >> one c (B.strong <$> contents)) + <|> (char c >> two c (B.emph <$> contents)) + <|> return (return (B.str [c,c,c]) <> contents) + +-- Parse inlines til you hit two c's, and emit strong. +-- If you never do hit two cs, emit ** plus inlines parsed. +two :: Char -> F Inlines -> MarkdownParser (F Inlines) +two c prefix' = do + let ender = try $ string [c,c] + contents <- mconcat <$> many (try $ notFollowedBy ender >> inline) + (ender >> return (B.strong <$> (prefix' <> contents))) + <|> return (return (B.str [c,c]) <> (prefix' <> contents)) + +-- Parse inlines til you hit a c, and emit emph. +-- If you never hit a c, emit * plus inlines parsed. +one :: Char -> F Inlines -> MarkdownParser (F Inlines) +one c prefix' = do + contents <- mconcat <$> many ( (notFollowedBy (char c) >> inline) + <|> try (string [c,c] >> + notFollowedBy (char c) >> + two c mempty) ) + (char c >> return (B.emph <$> (prefix' <> contents))) + <|> return (return (B.str [c]) <> (prefix' <> contents)) + +strongOrEmph :: MarkdownParser (F Inlines) +strongOrEmph = enclosure '*' <|> (checkIntraword >> enclosure '_') + where checkIntraword = do + exts <- getOption readerExtensions + when (Ext_intraword_underscores `Set.member` exts) $ do + guard =<< notAfterString -- | Parses a list of inlines between start and end delimiters. inlinesBetween :: (Show b) @@ -1474,28 +1493,6 @@ inlinesBetween start end = where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) innerSpace = try $ whitespace >>~ notFollowedBy' end -emph :: MarkdownParser (F Inlines) -emph = fmap B.emph <$> nested - (inlinesBetween starStart starEnd <|> inlinesBetween ulStart ulEnd) - where starStart = char '*' >> lookAhead nonspaceChar - starEnd = notFollowedBy' (() <$ strong) >> char '*' - ulStart = checkIntraword >> char '_' >> lookAhead nonspaceChar - ulEnd = notFollowedBy' (() <$ strong) >> char '_' - checkIntraword = do - exts <- getOption readerExtensions - when (Ext_intraword_underscores `Set.member` exts) $ do - pos <- getPosition - lastStrPos <- stateLastStrPos <$> getState - guard $ lastStrPos /= Just pos - -strong :: MarkdownParser (F Inlines) -strong = fmap B.strong <$> nested - (inlinesBetween starStart starEnd <|> inlinesBetween ulStart ulEnd) - where starStart = string "**" >> lookAhead nonspaceChar - starEnd = try $ string "**" - ulStart = string "__" >> lookAhead nonspaceChar - ulEnd = try $ string "__" - strikeout :: MarkdownParser (F Inlines) strikeout = fmap B.strikeout <$> (guardEnabled Ext_strikeout >> inlinesBetween strikeStart strikeEnd) @@ -1526,8 +1523,7 @@ nonEndline = satisfy (/='\n') str :: MarkdownParser (F Inlines) str = do result <- many1 alphaNum - pos <- getPosition - updateState $ \s -> s{ stateLastStrPos = Just pos } + updateLastStrPos let spacesToNbr = map (\c -> if c == ' ' then '\160' else c) isSmart <- getOption readerSmart if isSmart @@ -1558,14 +1554,17 @@ endline :: MarkdownParser (F Inlines) endline = try $ do newline notFollowedBy blankline - guardEnabled Ext_blank_before_blockquote <|> notFollowedBy emailBlockQuoteStart - guardEnabled Ext_blank_before_header <|> notFollowedBy (char '#') -- atx header -- parse potential list-starts differently if in a list: st <- getState - when (stateParserContext st == ListItemState) $ do - notFollowedBy' bulletListStart - notFollowedBy' anyOrderedListStart - (guardEnabled Ext_hard_line_breaks >> return (return B.linebreak)) + when (stateParserContext st == ListItemState) $ notFollowedBy listStart + guardDisabled Ext_lists_without_preceding_blankline <|> notFollowedBy listStart + guardEnabled Ext_blank_before_blockquote <|> notFollowedBy emailBlockQuoteStart + guardEnabled Ext_blank_before_header <|> notFollowedBy (char '#') -- atx header + guardDisabled Ext_backtick_code_blocks <|> + notFollowedBy (() <$ (lookAhead (char '`') >> codeBlockFenced)) + (eof >> return mempty) + <|> (guardEnabled Ext_hard_line_breaks >> return (return B.linebreak)) + <|> (guardEnabled Ext_ignore_line_breaks >> return mempty) <|> (return $ return B.space) -- @@ -1660,6 +1659,7 @@ bareURL :: MarkdownParser (F Inlines) bareURL = try $ do guardEnabled Ext_autolink_bare_uris (orig, src) <- uri <|> emailAddress + notFollowedBy $ try $ spaces >> htmlTag (~== TagClose "a") return $ return $ B.link src "" (B.str orig) autoLink :: MarkdownParser (F Inlines) @@ -1730,6 +1730,38 @@ inBrackets parser = do char ']' return $ "[" ++ contents ++ "]" +spanHtml :: MarkdownParser (F Inlines) +spanHtml = try $ do + guardEnabled Ext_markdown_in_html_blocks + (TagOpen _ attrs, _) <- htmlTag (~== TagOpen "span" []) + contents <- mconcat <$> manyTill inline (htmlTag (~== TagClose "span")) + let ident = fromMaybe "" $ lookup "id" attrs + let classes = maybe [] words $ lookup "class" attrs + let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + case lookup "style" keyvals of + Just s | null ident && null classes && + map toLower (filter (`notElem` " \t;") s) == + "font-variant:small-caps" + -> return $ B.smallcaps <$> contents + _ -> return $ B.spanWith (ident, classes, keyvals) <$> contents + +divHtml :: MarkdownParser (F Blocks) +divHtml = try $ do + guardEnabled Ext_markdown_in_html_blocks + (TagOpen _ attrs, rawtag) <- htmlTag (~== TagOpen "div" []) + bls <- option "" (blankline >> option "" blanklines) + contents <- mconcat <$> + many (notFollowedBy' (htmlTag (~== TagClose "div")) >> block) + closed <- option False (True <$ htmlTag (~== TagClose "div")) + if closed + then do + let ident = fromMaybe "" $ lookup "id" attrs + let classes = maybe [] words $ lookup "class" attrs + let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + return $ B.divWith (ident, classes, keyvals) <$> contents + else -- avoid backtracing + return $ return (B.rawBlock "html" (rawtag <> bls)) <> contents + rawHtmlInline :: MarkdownParser (F Inlines) rawHtmlInline = do guardEnabled Ext_raw_html @@ -1745,11 +1777,12 @@ rawHtmlInline = do cite :: MarkdownParser (F Inlines) cite = do guardEnabled Ext_citations - getOption readerReferences >>= guard . not . null - citations <- textualCite <|> normalCite - return $ flip B.cite mempty <$> citations + citations <- textualCite + <|> do (cs, raw) <- withRaw normalCite + return $ (flip B.cite (B.text raw)) <$> cs + return citations -textualCite :: MarkdownParser (F [Citation]) +textualCite :: MarkdownParser (F Inlines) textualCite = try $ do (_, key) <- citeKey let first = Citation{ citationId = key @@ -1759,10 +1792,18 @@ textualCite = try $ do , citationNoteNum = 0 , citationHash = 0 } - mbrest <- option Nothing $ try $ spnl >> Just <$> normalCite + mbrest <- option Nothing $ try $ spnl >> Just <$> withRaw normalCite case mbrest of - Just rest -> return $ (first:) <$> rest - Nothing -> option (return [first]) $ bareloc first + Just (rest, raw) -> + return $ (flip B.cite (B.text $ '@':key ++ " " ++ raw) . (first:)) + <$> rest + Nothing -> + (do (cs, raw) <- withRaw $ bareloc first + return $ (flip B.cite (B.text $ '@':key ++ " " ++ raw)) <$> cs) + <|> return (do st <- askF + return $ case M.lookup key (stateExamples st) of + Just n -> B.str (show n) + _ -> B.cite [first] $ B.str $ '@':key) bareloc :: Citation -> MarkdownParser (F [Citation]) bareloc c = try $ do @@ -1786,18 +1827,6 @@ normalCite = try $ do char ']' return citations -citeKey :: MarkdownParser (Bool, String) -citeKey = try $ do - suppress_author <- option False (char '-' >> return True) - char '@' - first <- letter - let internal p = try $ p >>~ lookAhead (letter <|> digit) - rest <- many $ letter <|> digit <|> internal (oneOf ":.#$%&-_+?<>~/") - let key = first:rest - citations' <- map CSL.refId <$> getOption readerReferences - guard $ key `elem` citations' - return (suppress_author, key) - suffix :: MarkdownParser (F Inlines) suffix = try $ do hasSpace <- option False (notFollowedBy nonspaceChar >> return True) @@ -1836,7 +1865,7 @@ smart :: MarkdownParser (F Inlines) smart = do getOption readerSmart >>= guard doubleQuoted <|> singleQuoted <|> - choice (map (return . B.singleton <$>) [apostrophe, dash, ellipses]) + choice (map (return <$>) [apostrophe, dash, ellipses]) singleQuoted :: MarkdownParser (F Inlines) singleQuoted = try $ do @@ -1855,4 +1884,3 @@ doubleQuoted = try $ do (withQuoteContext InDoubleQuote $ doubleQuoteEnd >> return (fmap B.doubleQuoted . trimInlinesF $ contents)) <|> (return $ return (B.str "\8220") <> contents) - diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs index 56049e035..f1dcce8f7 100644 --- a/src/Text/Pandoc/Readers/MediaWiki.hs +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -1,6 +1,7 @@ -{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7 +{-# LANGUAGE RelaxedPolyRec, FlexibleInstances, TypeSynonymInstances #-} +-- RelaxedPolyRec needed for inlinesBetween on GHC < 7 {- - Copyright (C) 2012 John MacFarlane <jgm@berkeley.edu> + Copyright (C) 2012-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +20,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.MediaWiki - Copyright : Copyright (C) 2012 John MacFarlane + Copyright : Copyright (C) 2012-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -42,8 +43,8 @@ import Text.Pandoc.Options import Text.Pandoc.Readers.HTML ( htmlTag, isBlockTag, isCommentTag ) import Text.Pandoc.XML ( fromEntities ) import Text.Pandoc.Parsing hiding ( nested ) -import Text.Pandoc.Generic ( bottomUp ) -import Text.Pandoc.Shared ( stripTrailingNewlines, safeRead ) +import Text.Pandoc.Walk ( walk ) +import Text.Pandoc.Shared ( stripTrailingNewlines, safeRead, stringify, trim ) import Data.Monoid (mconcat, mempty) import Control.Applicative ((<$>), (<*), (*>), (<$)) import Control.Monad @@ -51,7 +52,11 @@ import Data.List (intersperse, intercalate, isPrefixOf ) import Text.HTML.TagSoup import Data.Sequence (viewl, ViewL(..), (<|)) import qualified Data.Foldable as F +import qualified Data.Map as M import Data.Char (isDigit, isSpace) +import Data.Maybe (fromMaybe) +import Text.Printf (printf) +import Debug.Trace (trace) -- | Read mediawiki from an input string and return a Pandoc document. readMediaWiki :: ReaderOptions -- ^ Reader options @@ -62,6 +67,8 @@ readMediaWiki opts s = , mwMaxNestingLevel = 4 , mwNextLinkNumber = 1 , mwCategoryLinks = [] + , mwHeaderMap = M.empty + , mwIdentifierList = [] } "source" (s ++ "\n") of Left err' -> error $ "\nError:\n" ++ show err' @@ -71,10 +78,23 @@ data MWState = MWState { mwOptions :: ReaderOptions , mwMaxNestingLevel :: Int , mwNextLinkNumber :: Int , mwCategoryLinks :: [Inlines] + , mwHeaderMap :: M.Map Inlines String + , mwIdentifierList :: [String] } type MWParser = Parser [Char] MWState +instance HasReaderOptions MWState where + extractReaderOptions = mwOptions + +instance HasHeaderMap MWState where + extractHeaderMap = mwHeaderMap + updateHeaderMap f st = st{ mwHeaderMap = f $ mwHeaderMap st } + +instance HasIdentifierList MWState where + extractIdentifierList = mwIdentifierList + updateIdentifierList f st = st{ mwIdentifierList = f $ mwIdentifierList st } + -- -- auxiliary functions -- @@ -91,7 +111,7 @@ nested p = do return res specialChars :: [Char] -specialChars = "'[]<=&*{}|\"" +specialChars = "'[]<=&*{}|\":\\" spaceChars :: [Char] spaceChars = " \n\t" @@ -131,9 +151,16 @@ inlinesInTags tag = try $ do blocksInTags :: String -> MWParser Blocks blocksInTags tag = try $ do (_,raw) <- htmlTag (~== TagOpen tag []) + let closer = if tag == "li" + then htmlTag (~== TagClose "li") + <|> lookAhead ( + htmlTag (~== TagOpen "li" []) + <|> htmlTag (~== TagClose "ol") + <|> htmlTag (~== TagClose "ul")) + else htmlTag (~== TagClose tag) if '/' `elem` raw -- self-closing tag then return mempty - else mconcat <$> manyTill block (htmlTag (~== TagClose tag)) + else mconcat <$> manyTill block closer charsInTags :: String -> MWParser [Char] charsInTags tag = try $ do @@ -162,7 +189,10 @@ parseMediaWiki = do -- block :: MWParser Blocks -block = mempty <$ skipMany1 blankline +block = do + tr <- getOption readerTrace + pos <- getPosition + res <- mempty <$ skipMany1 blankline <|> table <|> header <|> hrule @@ -174,6 +204,10 @@ block = mempty <$ skipMany1 blankline <|> blockTag <|> (B.rawBlock "mediawiki" <$> template) <|> para + when tr $ + trace (printf "line %d: %s" (sourceLine pos) + (take 60 $ show $ B.toList res)) (return ()) + return res para :: MWParser Blocks para = do @@ -187,7 +221,7 @@ table = do tableStart styles <- option [] parseAttrs <* blankline let tableWidth = case lookup "width" styles of - Just w -> maybe 1.0 id $ parseWidth w + Just w -> fromMaybe 1.0 $ parseWidth w Nothing -> 1.0 caption <- option mempty tableCaption optional rowsep @@ -202,6 +236,7 @@ table = do let widths' = map (\w -> if w == 0 then defaultwidth else w) widths let cellspecs = zip (map fst cellspecs') widths' rows' <- many $ try $ rowsep *> (map snd <$> tableRow) + optional blanklines tableEnd let cols = length hdr let (headers,rows) = if hasheader @@ -250,7 +285,7 @@ tableCaption = try $ do (trimInlines . mconcat) <$> many (notFollowedBy (cellsep <|> rowsep) *> inline) tableRow :: MWParser [((Alignment, Double), Blocks)] -tableRow = try $ many tableCell +tableRow = try $ skipMany htmlComment *> many tableCell tableCell :: MWParser ((Alignment, Double), Blocks) tableCell = try $ do @@ -268,7 +303,7 @@ tableCell = try $ do Just "center" -> AlignCenter _ -> AlignDefault let width = case lookup "width" attrs of - Just xs -> maybe 0.0 id $ parseWidth xs + Just xs -> fromMaybe 0.0 $ parseWidth xs Nothing -> 0.0 return ((align, width), bs) @@ -282,6 +317,7 @@ template :: MWParser String template = try $ do string "{{" notFollowedBy (char '{') + lookAhead $ letter <|> digit <|> char ':' let chunk = template <|> variable <|> many1 (noneOf "{}") <|> count 1 anyChar contents <- manyTill chunk (try $ string "}}") return $ "{{" ++ concat contents ++ "}}" @@ -342,7 +378,7 @@ preformatted = try $ do spacesStr _ = False if F.all spacesStr contents then return mempty - else return $ B.para $ bottomUp strToCode contents + else return $ B.para $ walk strToCode contents header :: MWParser Blocks header = try $ do @@ -351,7 +387,8 @@ header = try $ do let lev = length eqs guard $ lev <= 6 contents <- trimInlines . mconcat <$> manyTill inline (count lev $ char '=') - return $ B.header lev contents + attr <- registerHeader nullAttr contents + return $ B.headerWith attr lev contents bulletList :: MWParser Blocks bulletList = B.bulletList <$> @@ -362,15 +399,13 @@ bulletList = B.bulletList <$> orderedList :: MWParser Blocks orderedList = (B.orderedList <$> many1 (listItem '#')) - <|> (B.orderedList <$> (htmlTag (~== TagOpen "ul" []) *> spaces *> - many (listItem '#' <|> li) <* - optional (htmlTag (~== TagClose "ul")))) - <|> do (tag,_) <- htmlTag (~== TagOpen "ol" []) - spaces - items <- many (listItem '#' <|> li) - optional (htmlTag (~== TagClose "ol")) - let start = maybe 1 id $ safeRead $ fromAttrib "start" tag - return $ B.orderedListWith (start, DefaultStyle, DefaultDelim) items + <|> try + (do (tag,_) <- htmlTag (~== TagOpen "ol" []) + spaces + items <- many (listItem '#' <|> li) + optional (htmlTag (~== TagClose "ol")) + let start = fromMaybe 1 $ safeRead $ fromAttrib "start" tag + return $ B.orderedListWith (start, DefaultStyle, DefaultDelim) items) definitionList :: MWParser Blocks definitionList = B.definitionList <$> many1 defListItem @@ -380,8 +415,9 @@ defListItem = try $ do terms <- mconcat . intersperse B.linebreak <$> many defListTerm -- we allow dd with no dt, or dt with no dd defs <- if B.isNull terms - then many1 $ listItem ':' - else many $ listItem ':' + then notFollowedBy (try $ string ":<math>") *> + many1 (listItem ':') + else many (listItem ':') return (terms, defs) defListTerm :: MWParser Inlines @@ -413,7 +449,8 @@ listItem c = try $ do skipMany spaceChar first <- concat <$> manyTill listChunk newline rest <- many - (try $ string extras *> (concat <$> manyTill listChunk newline)) + (try $ string extras *> lookAhead listStartChar *> + (concat <$> manyTill listChunk newline)) contents <- parseFromString (many1 $ listItem' c) (unlines (first : rest)) case c of @@ -462,6 +499,7 @@ inline = whitespace <|> image <|> internalLink <|> externalLink + <|> math <|> inlineTag <|> B.singleton <$> charRef <|> inlineHtml @@ -472,6 +510,16 @@ inline = whitespace str :: MWParser Inlines str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars) +math :: MWParser Inlines +math = (B.displayMath . trim <$> try (char ':' >> charsInTags "math")) + <|> (B.math . trim <$> charsInTags "math") + <|> (B.displayMath . trim <$> try (dmStart *> manyTill anyChar dmEnd)) + <|> (B.math . trim <$> try (mStart *> manyTill (satisfy (/='\n')) mEnd)) + where dmStart = string "\\[" + dmEnd = try (string "\\]") + mStart = string "\\(" + mEnd = try (string "\\)") + variable :: MWParser String variable = try $ do string "{{{" @@ -495,7 +543,6 @@ inlineTag = do TagOpen "del" _ -> B.strikeout <$> inlinesInTags "del" TagOpen "sub" _ -> B.subscript <$> inlinesInTags "sub" TagOpen "sup" _ -> B.superscript <$> inlinesInTags "sup" - TagOpen "math" _ -> B.math <$> charsInTags "math" TagOpen "code" _ -> B.code <$> charsInTags "code" TagOpen "tt" _ -> B.code <$> charsInTags "tt" TagOpen "hask" _ -> B.codeWith ("",["haskell"],[]) <$> charsInTags "hask" @@ -520,15 +567,19 @@ endline = () <$ try (newline <* notFollowedBy' header <* notFollowedBy anyListStart) +imageIdentifiers :: [MWParser ()] +imageIdentifiers = [sym (identifier ++ ":") | identifier <- identifiers] + where identifiers = ["File", "Image", "Archivo", "Datei", "Fichier"] + image :: MWParser Inlines image = try $ do sym "[[" - sym "File:" + choice imageIdentifiers fname <- many1 (noneOf "|]") _ <- many (try $ char '|' *> imageOption) caption <- (B.str fname <$ sym "]]") <|> try (char '|' *> (mconcat <$> manyTill inline (sym "]]"))) - return $ B.image fname "image" caption + return $ B.image fname ("fig:" ++ stringify caption) caption imageOption :: MWParser String imageOption = diff --git a/src/Text/Pandoc/Readers/Native.hs b/src/Text/Pandoc/Readers/Native.hs index c5d4cb98a..f4dfa62c1 100644 --- a/src/Text/Pandoc/Readers/Native.hs +++ b/src/Text/Pandoc/Readers/Native.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2011 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2011-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Native - Copyright : Copyright (C) 2011 John MacFarlane + Copyright : Copyright (C) 2011-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Readers/OPML.hs b/src/Text/Pandoc/Readers/OPML.hs index c9726d195..35d01e877 100644 --- a/src/Text/Pandoc/Readers/OPML.hs +++ b/src/Text/Pandoc/Readers/OPML.hs @@ -6,7 +6,7 @@ import Text.Pandoc.Builder import Text.Pandoc.Readers.HTML (readHtml) import Text.Pandoc.Readers.Markdown (readMarkdown) import Text.XML.Light -import Text.HTML.TagSoup.Entity (lookupEntity) +import Text.Pandoc.Compat.TagSoupEntity (lookupEntity) import Data.Generics import Data.Monoid import Control.Monad.State diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs new file mode 100644 index 000000000..7a35e2ca0 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org.hs @@ -0,0 +1,1379 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{- +Copyright (C) 2014 Albert Krewinkel <tarleb@moltkeplatz.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org + Copyright : Copyright (C) 2014 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel <tarleb@moltkeplatz.de> + +Conversion of org-mode formatted plain text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Org ( readOrg ) where + +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder ( Inlines, Blocks, HasMeta(..), (<>) + , trimInlines ) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import qualified Text.Pandoc.Parsing as P +import Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF + , newline, orderedListMarker + , parseFromString + ) +import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline) +import Text.Pandoc.Shared (compactify', compactify'DL) +import Text.TeXMath (texMathToPandoc, DisplayType(..)) + +import Control.Applicative ( Applicative, pure + , (<$>), (<$), (<*>), (<*), (*>), (<**>) ) +import Control.Arrow (first) +import Control.Monad (foldM, guard, liftM, liftM2, mplus, mzero, when) +import Control.Monad.Reader (Reader, runReader, ask, asks) +import Data.Char (isAlphaNum, toLower) +import Data.Default +import Data.List (intersperse, isPrefixOf, isSuffixOf) +import qualified Data.Map as M +import Data.Maybe (fromMaybe, isJust) +import Data.Monoid (Monoid, mconcat, mempty, mappend) +import Network.HTTP (urlEncode) + +-- | Parse org-mode string and return a Pandoc document. +readOrg :: ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> Pandoc +readOrg opts s = readWith parseOrg def{ orgStateOptions = opts } (s ++ "\n\n") + +type OrgParser = Parser [Char] OrgParserState + +parseOrg :: OrgParser Pandoc +parseOrg = do + blocks' <- parseBlocks + st <- getState + let meta = runF (orgStateMeta' st) st + return $ Pandoc meta $ filter (/= Null) (B.toList $ runF blocks' st) + +-- +-- Parser State for Org +-- + +type OrgNoteRecord = (String, F Blocks) +type OrgNoteTable = [OrgNoteRecord] + +type OrgBlockAttributes = M.Map String String + +type OrgLinkFormatters = M.Map String (String -> String) + +-- | Org-mode parser state +data OrgParserState = OrgParserState + { orgStateOptions :: ReaderOptions + , orgStateAnchorIds :: [String] + , orgStateBlockAttributes :: OrgBlockAttributes + , orgStateEmphasisCharStack :: [Char] + , orgStateEmphasisNewlines :: Maybe Int + , orgStateLastForbiddenCharPos :: Maybe SourcePos + , orgStateLastPreCharPos :: Maybe SourcePos + , orgStateLastStrPos :: Maybe SourcePos + , orgStateLinkFormatters :: OrgLinkFormatters + , orgStateMeta :: Meta + , orgStateMeta' :: F Meta + , orgStateNotes' :: OrgNoteTable + } + +instance HasReaderOptions OrgParserState where + extractReaderOptions = orgStateOptions + +instance HasMeta OrgParserState where + setMeta field val st = + st{ orgStateMeta = setMeta field val $ orgStateMeta st } + deleteMeta field st = + st{ orgStateMeta = deleteMeta field $ orgStateMeta st } + +instance HasLastStrPosition OrgParserState where + getLastStrPos = orgStateLastStrPos + setLastStrPos pos st = st{ orgStateLastStrPos = Just pos } + +instance Default OrgParserState where + def = defaultOrgParserState + +defaultOrgParserState :: OrgParserState +defaultOrgParserState = OrgParserState + { orgStateOptions = def + , orgStateAnchorIds = [] + , orgStateBlockAttributes = M.empty + , orgStateEmphasisCharStack = [] + , orgStateEmphasisNewlines = Nothing + , orgStateLastForbiddenCharPos = Nothing + , orgStateLastPreCharPos = Nothing + , orgStateLastStrPos = Nothing + , orgStateLinkFormatters = M.empty + , orgStateMeta = nullMeta + , orgStateMeta' = return nullMeta + , orgStateNotes' = [] + } + +recordAnchorId :: String -> OrgParser () +recordAnchorId i = updateState $ \s -> + s{ orgStateAnchorIds = i : (orgStateAnchorIds s) } + +addBlockAttribute :: String -> String -> OrgParser () +addBlockAttribute key val = updateState $ \s -> + let attrs = orgStateBlockAttributes s + in s{ orgStateBlockAttributes = M.insert key val attrs } + +lookupBlockAttribute :: String -> OrgParser (Maybe String) +lookupBlockAttribute key = + M.lookup key . orgStateBlockAttributes <$> getState + +resetBlockAttributes :: OrgParser () +resetBlockAttributes = updateState $ \s -> + s{ orgStateBlockAttributes = orgStateBlockAttributes def } + +updateLastForbiddenCharPos :: OrgParser () +updateLastForbiddenCharPos = getPosition >>= \p -> + updateState $ \s -> s{ orgStateLastForbiddenCharPos = Just p} + +updateLastPreCharPos :: OrgParser () +updateLastPreCharPos = getPosition >>= \p -> + updateState $ \s -> s{ orgStateLastPreCharPos = Just p} + +pushToInlineCharStack :: Char -> OrgParser () +pushToInlineCharStack c = updateState $ \s -> + s{ orgStateEmphasisCharStack = c:orgStateEmphasisCharStack s } + +popInlineCharStack :: OrgParser () +popInlineCharStack = updateState $ \s -> + s{ orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ s } + +surroundingEmphasisChar :: OrgParser [Char] +surroundingEmphasisChar = + take 1 . drop 1 . orgStateEmphasisCharStack <$> getState + +startEmphasisNewlinesCounting :: Int -> OrgParser () +startEmphasisNewlinesCounting maxNewlines = updateState $ \s -> + s{ orgStateEmphasisNewlines = Just maxNewlines } + +decEmphasisNewlinesCount :: OrgParser () +decEmphasisNewlinesCount = updateState $ \s -> + s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s } + +newlinesCountWithinLimits :: OrgParser Bool +newlinesCountWithinLimits = do + st <- getState + return $ ((< 0) <$> orgStateEmphasisNewlines st) /= Just True + +resetEmphasisNewlines :: OrgParser () +resetEmphasisNewlines = updateState $ \s -> + s{ orgStateEmphasisNewlines = Nothing } + +addLinkFormat :: String + -> (String -> String) + -> OrgParser () +addLinkFormat key formatter = updateState $ \s -> + let fs = orgStateLinkFormatters s + in s{ orgStateLinkFormatters = M.insert key formatter fs } + +addToNotesTable :: OrgNoteRecord -> OrgParser () +addToNotesTable note = do + oldnotes <- orgStateNotes' <$> getState + updateState $ \s -> s{ orgStateNotes' = note:oldnotes } + +-- The version Text.Pandoc.Parsing cannot be used, as we need additional parts +-- of the state saved and restored. +parseFromString :: OrgParser a -> String -> OrgParser a +parseFromString parser str' = do + oldLastPreCharPos <- orgStateLastPreCharPos <$> getState + updateState $ \s -> s{ orgStateLastPreCharPos = Nothing } + result <- P.parseFromString parser str' + updateState $ \s -> s{ orgStateLastPreCharPos = oldLastPreCharPos } + return result + + +-- +-- Adaptions and specializations of parsing utilities +-- + +newtype F a = F { unF :: Reader OrgParserState a + } deriving (Monad, Applicative, Functor) + +runF :: F a -> OrgParserState -> a +runF = runReader . unF + +askF :: F OrgParserState +askF = F ask + +asksF :: (OrgParserState -> a) -> F a +asksF f = F $ asks f + +instance Monoid a => Monoid (F a) where + mempty = return mempty + mappend = liftM2 mappend + mconcat = fmap mconcat . sequence + +trimInlinesF :: F Inlines -> F Inlines +trimInlinesF = liftM trimInlines + +returnF :: a -> OrgParser (F a) +returnF = return . return + + +-- | Like @Text.Parsec.Char.newline@, but causes additional state changes. +newline :: OrgParser Char +newline = + P.newline + <* updateLastPreCharPos + <* updateLastForbiddenCharPos + +-- +-- parsing blocks +-- + +parseBlocks :: OrgParser (F Blocks) +parseBlocks = mconcat <$> manyTill block eof + +block :: OrgParser (F Blocks) +block = choice [ mempty <$ blanklines + , optionalAttributes $ choice + [ orgBlock + , figure + , table + ] + , example + , drawer + , specialLine + , header + , return <$> hline + , list + , latexFragment + , noteBlock + , paraOrPlain + ] <?> "block" + +optionalAttributes :: OrgParser (F Blocks) -> OrgParser (F Blocks) +optionalAttributes parser = try $ + resetBlockAttributes *> parseBlockAttributes *> parser + +parseBlockAttributes :: OrgParser () +parseBlockAttributes = do + attrs <- many attribute + () <$ mapM (uncurry parseAndAddAttribute) attrs + where + attribute :: OrgParser (String, String) + attribute = try $ do + key <- metaLineStart *> many1Till nonspaceChar (char ':') + val <- skipSpaces *> anyLine + return (map toLower key, val) + +parseAndAddAttribute :: String -> String -> OrgParser () +parseAndAddAttribute key value = do + let key' = map toLower key + () <$ addBlockAttribute key' value + +lookupInlinesAttr :: String -> OrgParser (Maybe (F Inlines)) +lookupInlinesAttr attr = try $ do + val <- lookupBlockAttribute attr + maybe (return Nothing) + (fmap Just . parseFromString parseInlines) + val + + +-- +-- Org Blocks (#+BEGIN_... / #+END_...) +-- + +type BlockProperties = (Int, String) -- (Indentation, Block-Type) + +orgBlock :: OrgParser (F Blocks) +orgBlock = try $ do + blockProp@(_, blkType) <- blockHeaderStart + ($ blockProp) $ + case blkType of + "comment" -> withRaw' (const mempty) + "html" -> withRaw' (return . (B.rawBlock blkType)) + "latex" -> withRaw' (return . (B.rawBlock blkType)) + "ascii" -> withRaw' (return . (B.rawBlock blkType)) + "example" -> withRaw' (return . exampleCode) + "quote" -> withParsed (fmap B.blockQuote) + "verse" -> verseBlock + "src" -> codeBlock + _ -> withParsed (fmap $ divWithClass blkType) + +blockHeaderStart :: OrgParser (Int, String) +blockHeaderStart = try $ (,) <$> indent <*> blockType + where + indent = length <$> many spaceChar + blockType = map toLower <$> (stringAnyCase "#+begin_" *> orgArgWord) + +withRaw' :: (String -> F Blocks) -> BlockProperties -> OrgParser (F Blocks) +withRaw' f blockProp = (ignHeaders *> (f <$> rawBlockContent blockProp)) + +withParsed :: (F Blocks -> F Blocks) -> BlockProperties -> OrgParser (F Blocks) +withParsed f blockProp = (ignHeaders *> (f <$> parsedBlockContent blockProp)) + +ignHeaders :: OrgParser () +ignHeaders = (() <$ newline) <|> (() <$ anyLine) + +divWithClass :: String -> Blocks -> Blocks +divWithClass cls = B.divWith ("", [cls], []) + +verseBlock :: BlockProperties -> OrgParser (F Blocks) +verseBlock blkProp = try $ do + ignHeaders + content <- rawBlockContent blkProp + fmap B.para . mconcat . intersperse (pure B.linebreak) + <$> mapM (parseFromString parseInlines) (lines content) + +codeBlock :: BlockProperties -> OrgParser (F Blocks) +codeBlock blkProp = do + skipSpaces + (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) + id' <- fromMaybe "" <$> lookupBlockAttribute "name" + content <- rawBlockContent blkProp + let codeBlck = B.codeBlockWith ( id', classes, kv ) content + maybe (pure codeBlck) (labelDiv codeBlck) <$> lookupInlinesAttr "caption" + where + labelDiv blk value = + B.divWith nullAttr <$> (mappend <$> labelledBlock value + <*> pure blk) + labelledBlock = fmap (B.plain . B.spanWith ("", ["label"], [])) + +rawBlockContent :: BlockProperties -> OrgParser String +rawBlockContent (indent, blockType) = try $ + unlines . map commaEscaped <$> manyTill indentedLine blockEnder + where + indentedLine = try $ ("" <$ blankline) <|> (indentWith indent *> anyLine) + blockEnder = try $ indentWith indent *> stringAnyCase ("#+end_" <> blockType) + +parsedBlockContent :: BlockProperties -> OrgParser (F Blocks) +parsedBlockContent blkProps = try $ do + raw <- rawBlockContent blkProps + parseFromString parseBlocks (raw ++ "\n") + +-- indent by specified number of spaces (or equiv. tabs) +indentWith :: Int -> OrgParser String +indentWith num = do + tabStop <- getOption readerTabStop + if num < tabStop + then count num (char ' ') + else choice [ try (count num (char ' ')) + , try (char '\t' >> count (num - tabStop) (char ' ')) ] + +type SwitchOption = (Char, Maybe String) + +orgArgWord :: OrgParser String +orgArgWord = many1 orgArgWordChar + +-- | Parse code block arguments +-- TODO: We currently don't handle switches. +codeHeaderArgs :: OrgParser ([String], [(String, String)]) +codeHeaderArgs = try $ do + language <- skipSpaces *> orgArgWord + _ <- skipSpaces *> (try $ switch `sepBy` (many1 spaceChar)) + parameters <- manyTill blockOption newline + let pandocLang = translateLang language + return $ + if hasRundocParameters parameters + then ( [ pandocLang, rundocBlockClass ] + , map toRundocAttrib (("language", language) : parameters) + ) + else ([ pandocLang ], parameters) + where hasRundocParameters = not . null + +switch :: OrgParser SwitchOption +switch = try $ simpleSwitch <|> lineNumbersSwitch + where + simpleSwitch = (\c -> (c, Nothing)) <$> (oneOf "-+" *> letter) + lineNumbersSwitch = (\ls -> ('l', Just ls)) <$> + (string "-l \"" *> many1Till nonspaceChar (char '"')) + +translateLang :: String -> String +translateLang "C" = "c" +translateLang "C++" = "cpp" +translateLang "emacs-lisp" = "commonlisp" -- emacs lisp is not supported +translateLang "js" = "javascript" +translateLang "lisp" = "commonlisp" +translateLang "R" = "r" +translateLang "sh" = "bash" +translateLang "sqlite" = "sql" +translateLang cs = cs + +-- | Prefix used for Rundoc classes and arguments. +rundocPrefix :: String +rundocPrefix = "rundoc-" + +-- | The class-name used to mark rundoc blocks. +rundocBlockClass :: String +rundocBlockClass = rundocPrefix ++ "block" + +blockOption :: OrgParser (String, String) +blockOption = try $ (,) <$> orgArgKey <*> orgParamValue + +inlineBlockOption :: OrgParser (String, String) +inlineBlockOption = try $ (,) <$> orgArgKey <*> orgInlineParamValue + +orgArgKey :: OrgParser String +orgArgKey = try $ + skipSpaces *> char ':' + *> many1 orgArgWordChar + +orgParamValue :: OrgParser String +orgParamValue = try $ + skipSpaces *> many1 (noneOf "\t\n\r ") <* skipSpaces + +orgInlineParamValue :: OrgParser String +orgInlineParamValue = try $ + skipSpaces *> many1 (noneOf "\t\n\r ]") <* skipSpaces + +orgArgWordChar :: OrgParser Char +orgArgWordChar = alphaNum <|> oneOf "-_" + +toRundocAttrib :: (String, String) -> (String, String) +toRundocAttrib = first ("rundoc-" ++) + +commaEscaped :: String -> String +commaEscaped (',':cs@('*':_)) = cs +commaEscaped (',':cs@('#':'+':_)) = cs +commaEscaped cs = cs + +example :: OrgParser (F Blocks) +example = try $ do + return . return . exampleCode =<< unlines <$> many1 exampleLine + +exampleCode :: String -> Blocks +exampleCode = B.codeBlockWith ("", ["example"], []) + +exampleLine :: OrgParser String +exampleLine = try $ string ": " *> anyLine + +-- Drawers for properties or a logbook +drawer :: OrgParser (F Blocks) +drawer = try $ do + drawerStart + manyTill drawerLine (try drawerEnd) + return mempty + +drawerStart :: OrgParser String +drawerStart = try $ + skipSpaces *> drawerName <* skipSpaces <* P.newline + where drawerName = try $ char ':' *> validDrawerName <* char ':' + validDrawerName = stringAnyCase "PROPERTIES" + <|> stringAnyCase "LOGBOOK" + +drawerLine :: OrgParser String +drawerLine = try anyLine + +drawerEnd :: OrgParser String +drawerEnd = try $ + skipSpaces *> stringAnyCase ":END:" <* skipSpaces <* P.newline + + +-- +-- Figures +-- + +-- Figures (Image on a line by itself, preceded by name and/or caption) +figure :: OrgParser (F Blocks) +figure = try $ do + (cap, nam) <- nameAndCaption + src <- skipSpaces *> selfTarget <* skipSpaces <* P.newline + guard (isImageFilename src) + return $ do + cap' <- cap + return $ B.para $ B.image src nam cap' + where + nameAndCaption = + do + maybeCap <- lookupInlinesAttr "caption" + maybeNam <- lookupBlockAttribute "name" + guard $ isJust maybeCap || isJust maybeNam + return ( fromMaybe mempty maybeCap + , maybe mempty withFigPrefix maybeNam ) + withFigPrefix cs = + if "fig:" `isPrefixOf` cs + then cs + else "fig:" ++ cs + +-- +-- Comments, Options and Metadata +specialLine :: OrgParser (F Blocks) +specialLine = fmap return . try $ metaLine <|> commentLine + +metaLine :: OrgParser Blocks +metaLine = try $ mempty + <$ (metaLineStart *> (optionLine <|> declarationLine)) + +commentLine :: OrgParser Blocks +commentLine = try $ commentLineStart *> anyLine *> pure mempty + +-- The order, in which blocks are tried, makes sure that we're not looking at +-- the beginning of a block, so we don't need to check for it +metaLineStart :: OrgParser String +metaLineStart = try $ mappend <$> many spaceChar <*> string "#+" + +commentLineStart :: OrgParser String +commentLineStart = try $ mappend <$> many spaceChar <*> string "# " + +declarationLine :: OrgParser () +declarationLine = try $ do + key <- metaKey + inlinesF <- metaInlines + updateState $ \st -> + let meta' = B.setMeta <$> pure key <*> inlinesF <*> pure nullMeta + in st { orgStateMeta' = orgStateMeta' st <> meta' } + return () + +metaInlines :: OrgParser (F MetaValue) +metaInlines = fmap (MetaInlines . B.toList) <$> inlinesTillNewline + +metaKey :: OrgParser String +metaKey = map toLower <$> many1 (noneOf ": \n\r") + <* char ':' + <* skipSpaces + +optionLine :: OrgParser () +optionLine = try $ do + key <- metaKey + case key of + "link" -> parseLinkFormat >>= uncurry addLinkFormat + _ -> mzero + +parseLinkFormat :: OrgParser ((String, String -> String)) +parseLinkFormat = try $ do + linkType <- (:) <$> letter <*> many (alphaNum <|> oneOf "-_") <* skipSpaces + linkSubst <- parseFormat + return (linkType, linkSubst) + +-- | An ad-hoc, single-argument-only implementation of a printf-style format +-- parser. +parseFormat :: OrgParser (String -> String) +parseFormat = try $ do + replacePlain <|> replaceUrl <|> justAppend + where + -- inefficient, but who cares + replacePlain = try $ (\x -> concat . flip intersperse x) + <$> sequence [tillSpecifier 's', rest] + replaceUrl = try $ (\x -> concat . flip intersperse x . urlEncode) + <$> sequence [tillSpecifier 'h', rest] + justAppend = try $ (++) <$> rest + + rest = manyTill anyChar (eof <|> () <$ oneOf "\n\r") + tillSpecifier c = manyTill (noneOf "\n\r") (try $ string ('%':c:"")) + +-- +-- Headers +-- + +-- | Headers +header :: OrgParser (F Blocks) +header = try $ do + level <- headerStart + title <- inlinesTillNewline + return $ B.header level <$> title + +headerStart :: OrgParser Int +headerStart = try $ + (length <$> many1 (char '*')) <* many1 (char ' ') + + +-- Don't use (or need) the reader wrapper here, we want hline to be +-- @show@able. Otherwise we can't use it with @notFollowedBy'@. + +-- | Horizontal Line (five -- dashes or more) +hline :: OrgParser Blocks +hline = try $ do + skipSpaces + string "-----" + many (char '-') + skipSpaces + newline + return B.horizontalRule + +-- +-- Tables +-- + +data OrgTableRow = OrgContentRow (F [Blocks]) + | OrgAlignRow [Alignment] + | OrgHlineRow + +data OrgTable = OrgTable + { orgTableColumns :: Int + , orgTableAlignments :: [Alignment] + , orgTableHeader :: [Blocks] + , orgTableRows :: [[Blocks]] + } + +table :: OrgParser (F Blocks) +table = try $ do + lookAhead tableStart + do + rows <- tableRows + cptn <- fromMaybe (pure "") <$> lookupInlinesAttr "caption" + return $ (<$> cptn) . orgToPandocTable . normalizeTable =<< rowsToTable rows + +orgToPandocTable :: OrgTable + -> Inlines + -> Blocks +orgToPandocTable (OrgTable _ aligns heads lns) caption = + B.table caption (zip aligns $ repeat 0) heads lns + +tableStart :: OrgParser Char +tableStart = try $ skipSpaces *> char '|' + +tableRows :: OrgParser [OrgTableRow] +tableRows = try $ many (tableAlignRow <|> tableHline <|> tableContentRow) + +tableContentRow :: OrgParser OrgTableRow +tableContentRow = try $ + OrgContentRow . sequence <$> (tableStart *> manyTill tableContentCell newline) + +tableContentCell :: OrgParser (F Blocks) +tableContentCell = try $ + fmap B.plain . trimInlinesF . mconcat <$> many1Till inline endOfCell + +endOfCell :: OrgParser Char +endOfCell = try $ char '|' <|> lookAhead newline + +tableAlignRow :: OrgParser OrgTableRow +tableAlignRow = try $ + OrgAlignRow <$> (tableStart *> manyTill tableAlignCell newline) + +tableAlignCell :: OrgParser Alignment +tableAlignCell = + choice [ try $ emptyCell *> return AlignDefault + , try $ skipSpaces + *> char '<' + *> tableAlignFromChar + <* many digit + <* char '>' + <* emptyCell + ] <?> "alignment info" + where emptyCell = try $ skipSpaces *> endOfCell + +tableAlignFromChar :: OrgParser Alignment +tableAlignFromChar = try $ choice [ char 'l' *> return AlignLeft + , char 'c' *> return AlignCenter + , char 'r' *> return AlignRight + ] + +tableHline :: OrgParser OrgTableRow +tableHline = try $ + OrgHlineRow <$ (tableStart *> char '-' *> anyLine) + +rowsToTable :: [OrgTableRow] + -> F OrgTable +rowsToTable = foldM (flip rowToContent) zeroTable + where zeroTable = OrgTable 0 mempty mempty mempty + +normalizeTable :: OrgTable + -> OrgTable +normalizeTable (OrgTable cols aligns heads lns) = + let aligns' = fillColumns aligns AlignDefault + heads' = if heads == mempty + then mempty + else fillColumns heads (B.plain mempty) + lns' = map (`fillColumns` B.plain mempty) lns + fillColumns base padding = take cols $ base ++ repeat padding + in OrgTable cols aligns' heads' lns' + + +-- One or more horizontal rules after the first content line mark the previous +-- line as a header. All other horizontal lines are discarded. +rowToContent :: OrgTableRow + -> OrgTable + -> F OrgTable +rowToContent OrgHlineRow t = maybeBodyToHeader t +rowToContent (OrgAlignRow as) t = setLongestRow as =<< setAligns as t +rowToContent (OrgContentRow rf) t = do + rs <- rf + setLongestRow rs =<< appendToBody rs t + +setLongestRow :: [a] + -> OrgTable + -> F OrgTable +setLongestRow rs t = + return t{ orgTableColumns = max (length rs) (orgTableColumns t) } + +maybeBodyToHeader :: OrgTable + -> F OrgTable +maybeBodyToHeader t = case t of + OrgTable{ orgTableHeader = [], orgTableRows = b:[] } -> + return t{ orgTableHeader = b , orgTableRows = [] } + _ -> return t + +appendToBody :: [Blocks] + -> OrgTable + -> F OrgTable +appendToBody r t = return t{ orgTableRows = orgTableRows t ++ [r] } + +setAligns :: [Alignment] + -> OrgTable + -> F OrgTable +setAligns aligns t = return $ t{ orgTableAlignments = aligns } + + +-- +-- LaTeX fragments +-- +latexFragment :: OrgParser (F Blocks) +latexFragment = try $ do + envName <- latexEnvStart + content <- mconcat <$> manyTill anyLineNewline (latexEnd envName) + return . return $ B.rawBlock "latex" (content `inLatexEnv` envName) + where + c `inLatexEnv` e = mconcat [ "\\begin{", e, "}\n" + , c + , "\\end{", e, "}\n" + ] + +latexEnvStart :: OrgParser String +latexEnvStart = try $ do + skipSpaces *> string "\\begin{" + *> latexEnvName + <* string "}" + <* blankline + +latexEnd :: String -> OrgParser () +latexEnd envName = try $ + () <$ skipSpaces + <* string ("\\end{" ++ envName ++ "}") + <* blankline + +-- | Parses a LaTeX environment name. +latexEnvName :: OrgParser String +latexEnvName = try $ do + mappend <$> many1 alphaNum + <*> option "" (string "*") + + +-- +-- Footnote defintions +-- +noteBlock :: OrgParser (F Blocks) +noteBlock = try $ do + ref <- noteMarker <* skipSpaces + content <- mconcat <$> blocksTillHeaderOrNote + addToNotesTable (ref, content) + return mempty + where + blocksTillHeaderOrNote = + many1Till block (eof <|> () <$ lookAhead noteMarker + <|> () <$ lookAhead headerStart) + +-- Paragraphs or Plain text +paraOrPlain :: OrgParser (F Blocks) +paraOrPlain = try $ + parseInlines <**> (fmap <$> option B.plain (try $ newline *> pure B.para)) + +inlinesTillNewline :: OrgParser (F Inlines) +inlinesTillNewline = trimInlinesF . mconcat <$> manyTill inline newline + + +-- +-- list blocks +-- + +list :: OrgParser (F Blocks) +list = choice [ definitionList, bulletList, orderedList ] <?> "list" + +definitionList :: OrgParser (F Blocks) +definitionList = fmap B.definitionList . fmap compactify'DL . sequence + <$> many1 (definitionListItem bulletListStart) + +bulletList :: OrgParser (F Blocks) +bulletList = fmap B.bulletList . fmap compactify' . sequence + <$> many1 (listItem bulletListStart) + +orderedList :: OrgParser (F Blocks) +orderedList = fmap B.orderedList . fmap compactify' . sequence + <$> many1 (listItem orderedListStart) + +genericListStart :: OrgParser String + -> OrgParser Int +genericListStart listMarker = try $ + (+) <$> (length <$> many spaceChar) + <*> (length <$> listMarker <* many1 spaceChar) + +-- parses bullet list start and returns its length (excl. following whitespace) +bulletListStart :: OrgParser Int +bulletListStart = genericListStart bulletListMarker + where bulletListMarker = pure <$> oneOf "*-+" + +orderedListStart :: OrgParser Int +orderedListStart = genericListStart orderedListMarker + -- Ordered list markers allowed in org-mode + where orderedListMarker = mappend <$> many1 digit <*> (pure <$> oneOf ".)") + +definitionListItem :: OrgParser Int + -> OrgParser (F (Inlines, [Blocks])) +definitionListItem parseMarkerGetLength = try $ do + markerLength <- parseMarkerGetLength + term <- manyTill (noneOf "\n\r") (try $ string "::") + line1 <- anyLineNewline + blank <- option "" ("\n" <$ blankline) + cont <- concat <$> many (listContinuation markerLength) + term' <- parseFromString inline term + contents' <- parseFromString parseBlocks $ line1 ++ blank ++ cont + return $ (,) <$> term' <*> fmap (:[]) contents' + + +-- parse raw text for one list item, excluding start marker and continuations +listItem :: OrgParser Int + -> OrgParser (F Blocks) +listItem start = try $ do + markerLength <- try start + firstLine <- anyLineNewline + blank <- option "" ("\n" <$ blankline) + rest <- concat <$> many (listContinuation markerLength) + parseFromString parseBlocks $ firstLine ++ blank ++ rest + +-- continuation of a list item - indented and separated by blankline or endline. +-- Note: nested lists are parsed as continuations. +listContinuation :: Int + -> OrgParser String +listContinuation markerLength = try $ + notFollowedBy' blankline + *> (mappend <$> (concat <$> many1 listLine) + <*> many blankline) + where listLine = try $ indentWith markerLength *> anyLineNewline + +anyLineNewline :: OrgParser String +anyLineNewline = (++ "\n") <$> anyLine + + +-- +-- inline +-- + +inline :: OrgParser (F Inlines) +inline = + choice [ whitespace + , linebreak + , cite + , footnote + , linkOrImage + , anchor + , inlineCodeBlock + , str + , endline + , emph + , strong + , strikeout + , underline + , code + , math + , displayMath + , verbatim + , subscript + , superscript + , inlineLaTeX + , symbol + ] <* (guard =<< newlinesCountWithinLimits) + <?> "inline" + +parseInlines :: OrgParser (F Inlines) +parseInlines = trimInlinesF . mconcat <$> many1 inline + +-- treat these as potentially non-text when parsing inline: +specialChars :: [Char] +specialChars = "\"$'()*+-./:<=>[\\]^_{|}~" + + +whitespace :: OrgParser (F Inlines) +whitespace = pure B.space <$ skipMany1 spaceChar + <* updateLastPreCharPos + <* updateLastForbiddenCharPos + <?> "whitespace" + +linebreak :: OrgParser (F Inlines) +linebreak = try $ pure B.linebreak <$ string "\\\\" <* skipSpaces <* newline + +str :: OrgParser (F Inlines) +str = return . B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") + <* updateLastStrPos + +-- | An endline character that can be treated as a space, not a structural +-- break. This should reflect the values of the Emacs variable +-- @org-element-pagaraph-separate@. +endline :: OrgParser (F Inlines) +endline = try $ do + newline + notFollowedBy blankline + notFollowedBy' exampleLine + notFollowedBy' hline + notFollowedBy' noteMarker + notFollowedBy' tableStart + notFollowedBy' drawerStart + notFollowedBy' headerStart + notFollowedBy' metaLineStart + notFollowedBy' latexEnvStart + notFollowedBy' commentLineStart + notFollowedBy' bulletListStart + notFollowedBy' orderedListStart + decEmphasisNewlinesCount + guard =<< newlinesCountWithinLimits + updateLastPreCharPos + return . return $ B.space + +cite :: OrgParser (F Inlines) +cite = try $ do + guardEnabled Ext_citations + (cs, raw) <- withRaw normalCite + return $ (flip B.cite (B.text raw)) <$> cs + +normalCite :: OrgParser (F [Citation]) +normalCite = try $ char '[' + *> skipSpaces + *> citeList + <* skipSpaces + <* char ']' + +citeList :: OrgParser (F [Citation]) +citeList = sequence <$> sepBy1 citation (try $ char ';' *> skipSpaces) + +citation :: OrgParser (F Citation) +citation = try $ do + pref <- prefix + (suppress_author, key) <- citeKey + suff <- suffix + return $ do + x <- pref + y <- suff + return $ Citation{ citationId = key + , citationPrefix = B.toList x + , citationSuffix = B.toList y + , citationMode = if suppress_author + then SuppressAuthor + else NormalCitation + , citationNoteNum = 0 + , citationHash = 0 + } + where + prefix = trimInlinesF . mconcat <$> + manyTill inline (char ']' <|> (']' <$ lookAhead citeKey)) + suffix = try $ do + hasSpace <- option False (notFollowedBy nonspaceChar >> return True) + skipSpaces + rest <- trimInlinesF . mconcat <$> + many (notFollowedBy (oneOf ";]") *> inline) + return $ if hasSpace + then (B.space <>) <$> rest + else rest + +footnote :: OrgParser (F Inlines) +footnote = try $ inlineNote <|> referencedNote + +inlineNote :: OrgParser (F Inlines) +inlineNote = try $ do + string "[fn:" + ref <- many alphaNum + char ':' + note <- fmap B.para . trimInlinesF . mconcat <$> many1Till inline (char ']') + when (not $ null ref) $ + addToNotesTable ("fn:" ++ ref, note) + return $ B.note <$> note + +referencedNote :: OrgParser (F Inlines) +referencedNote = try $ do + ref <- noteMarker + return $ do + notes <- asksF orgStateNotes' + case lookup ref notes of + Nothing -> return $ B.str $ "[" ++ ref ++ "]" + Just contents -> do + st <- askF + let contents' = runF contents st{ orgStateNotes' = [] } + return $ B.note contents' + +noteMarker :: OrgParser String +noteMarker = try $ do + char '[' + choice [ many1Till digit (char ']') + , (++) <$> string "fn:" + <*> many1Till (noneOf "\n\r\t ") (char ']') + ] + +linkOrImage :: OrgParser (F Inlines) +linkOrImage = explicitOrImageLink + <|> selflinkOrImage + <|> angleLink + <|> plainLink + <?> "link or image" + +explicitOrImageLink :: OrgParser (F Inlines) +explicitOrImageLink = try $ do + char '[' + srcF <- applyCustomLinkFormat =<< linkTarget + title <- enclosedRaw (char '[') (char ']') + title' <- parseFromString (mconcat <$> many inline) title + char ']' + return $ do + src <- srcF + if isImageFilename src && isImageFilename title + then pure $ B.link src "" $ B.image title mempty mempty + else linkToInlinesF src =<< title' + +selflinkOrImage :: OrgParser (F Inlines) +selflinkOrImage = try $ do + src <- char '[' *> linkTarget <* char ']' + return $ linkToInlinesF src (B.str src) + +plainLink :: OrgParser (F Inlines) +plainLink = try $ do + (orig, src) <- uri + returnF $ B.link src "" (B.str orig) + +angleLink :: OrgParser (F Inlines) +angleLink = try $ do + char '<' + link <- plainLink + char '>' + return link + +selfTarget :: OrgParser String +selfTarget = try $ char '[' *> linkTarget <* char ']' + +linkTarget :: OrgParser String +linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]") + +applyCustomLinkFormat :: String -> OrgParser (F String) +applyCustomLinkFormat link = do + let (linkType, rest) = break (== ':') link + return $ do + formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters + return $ maybe link ($ drop 1 rest) formatter + + +linkToInlinesF :: String -> Inlines -> F Inlines +linkToInlinesF s@('#':_) = pure . B.link s "" +linkToInlinesF s + | isImageFilename s = const . pure $ B.image s "" "" + | isUri s = pure . B.link s "" + | isRelativeUrl s = pure . B.link s "" +linkToInlinesF s = \title -> do + anchorB <- (s `elem`) <$> asksF orgStateAnchorIds + if anchorB + then pure $ B.link ('#':s) "" title + else pure $ B.emph title + +isRelativeUrl :: String -> Bool +isRelativeUrl s = (':' `notElem` s) && ("./" `isPrefixOf` s) + +isUri :: String -> Bool +isUri s = let (scheme, path) = break (== ':') s + in all (\c -> isAlphaNum c || c `elem` ".-") scheme + && not (null path) + +isImageFilename :: String -> Bool +isImageFilename filename = + any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions && + (any (\x -> (x++":") `isPrefixOf` filename) protocols || + ':' `notElem` filename) + where + imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ] + protocols = [ "file", "http", "https" ] + +-- | Parse an anchor like @<<anchor-id>>@ and return an empty span with +-- @anchor-id@ set as id. Legal anchors in org-mode are defined through +-- @org-target-regexp@, which is fairly liberal. Since no link is created if +-- @anchor-id@ contains spaces, we are more restrictive in what is accepted as +-- an anchor. + +anchor :: OrgParser (F Inlines) +anchor = try $ do + anchorId <- parseAnchor + recordAnchorId anchorId + returnF $ B.spanWith (solidify anchorId, [], []) mempty + where + parseAnchor = string "<<" + *> many1 (noneOf "\t\n\r<>\"' ") + <* string ">>" + <* skipSpaces + +-- | Replace every char but [a-zA-Z0-9_.-:] with a hypen '-'. This mirrors +-- the org function @org-export-solidify-link-text@. + +solidify :: String -> String +solidify = map replaceSpecialChar + where replaceSpecialChar c + | isAlphaNum c = c + | c `elem` "_.-:" = c + | otherwise = '-' + +-- | Parses an inline code block and marks it as an babel block. +inlineCodeBlock :: OrgParser (F Inlines) +inlineCodeBlock = try $ do + string "src_" + lang <- many1 orgArgWordChar + opts <- option [] $ enclosedByPair '[' ']' inlineBlockOption + inlineCode <- enclosedByPair '{' '}' (noneOf "\n\r") + let attrClasses = [translateLang lang, rundocBlockClass] + let attrKeyVal = map toRundocAttrib (("language", lang) : opts) + returnF $ B.codeWith ("", attrClasses, attrKeyVal) inlineCode + +enclosedByPair :: Char -- ^ opening char + -> Char -- ^ closing char + -> OrgParser a -- ^ parser + -> OrgParser [a] +enclosedByPair s e p = char s *> many1Till p (char e) + +emph :: OrgParser (F Inlines) +emph = fmap B.emph <$> emphasisBetween '/' + +strong :: OrgParser (F Inlines) +strong = fmap B.strong <$> emphasisBetween '*' + +strikeout :: OrgParser (F Inlines) +strikeout = fmap B.strikeout <$> emphasisBetween '+' + +-- There is no underline, so we use strong instead. +underline :: OrgParser (F Inlines) +underline = fmap B.strong <$> emphasisBetween '_' + +verbatim :: OrgParser (F Inlines) +verbatim = return . B.code <$> verbatimBetween '=' + +code :: OrgParser (F Inlines) +code = return . B.code <$> verbatimBetween '~' + +subscript :: OrgParser (F Inlines) +subscript = fmap B.subscript <$> try (char '_' *> subOrSuperExpr) + +superscript :: OrgParser (F Inlines) +superscript = fmap B.superscript <$> try (char '^' *> subOrSuperExpr) + +math :: OrgParser (F Inlines) +math = return . B.math <$> choice [ math1CharBetween '$' + , mathStringBetween '$' + , rawMathBetween "\\(" "\\)" + ] + +displayMath :: OrgParser (F Inlines) +displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]" + , rawMathBetween "$$" "$$" + ] +symbol :: OrgParser (F Inlines) +symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions) + where updatePositions c + | c `elem` emphasisPreChars = c <$ updateLastPreCharPos + | c `elem` emphasisForbiddenBorderChars = c <$ updateLastForbiddenCharPos + | otherwise = return c + +emphasisBetween :: Char + -> OrgParser (F Inlines) +emphasisBetween c = try $ do + startEmphasisNewlinesCounting emphasisAllowedNewlines + res <- enclosedInlines (emphasisStart c) (emphasisEnd c) + isTopLevelEmphasis <- null . orgStateEmphasisCharStack <$> getState + when isTopLevelEmphasis + resetEmphasisNewlines + return res + +verbatimBetween :: Char + -> OrgParser String +verbatimBetween c = try $ + emphasisStart c *> + many1TillNOrLessNewlines 1 (noneOf "\n\r") (emphasisEnd c) + +-- | Parses a raw string delimited by @c@ using Org's math rules +mathStringBetween :: Char + -> OrgParser String +mathStringBetween c = try $ do + mathStart c + body <- many1TillNOrLessNewlines mathAllowedNewlines + (noneOf (c:"\n\r")) + (lookAhead $ mathEnd c) + final <- mathEnd c + return $ body ++ [final] + +-- | Parse a single character between @c@ using math rules +math1CharBetween :: Char + -> OrgParser String +math1CharBetween c = try $ do + char c + res <- noneOf $ c:mathForbiddenBorderChars + char c + eof <|> () <$ lookAhead (oneOf mathPostChars) + return [res] + +rawMathBetween :: String + -> String + -> OrgParser String +rawMathBetween s e = try $ string s *> manyTill anyChar (try $ string e) + +-- | Parses the start (opening character) of emphasis +emphasisStart :: Char -> OrgParser Char +emphasisStart c = try $ do + guard =<< afterEmphasisPreChar + guard =<< notAfterString + char c + lookAhead (noneOf emphasisForbiddenBorderChars) + pushToInlineCharStack c + return c + +-- | Parses the closing character of emphasis +emphasisEnd :: Char -> OrgParser Char +emphasisEnd c = try $ do + guard =<< notAfterForbiddenBorderChar + char c + eof <|> () <$ lookAhead acceptablePostChars + updateLastStrPos + popInlineCharStack + return c + where acceptablePostChars = + surroundingEmphasisChar >>= \x -> oneOf (x ++ emphasisPostChars) + +mathStart :: Char -> OrgParser Char +mathStart c = try $ + char c <* notFollowedBy' (oneOf (c:mathForbiddenBorderChars)) + +mathEnd :: Char -> OrgParser Char +mathEnd c = try $ do + res <- noneOf (c:mathForbiddenBorderChars) + char c + eof <|> () <$ lookAhead (oneOf mathPostChars) + return res + + +enclosedInlines :: OrgParser a + -> OrgParser b + -> OrgParser (F Inlines) +enclosedInlines start end = try $ + trimInlinesF . mconcat <$> enclosed start end inline + +enclosedRaw :: OrgParser a + -> OrgParser b + -> OrgParser String +enclosedRaw start end = try $ + start *> (onSingleLine <|> spanningTwoLines) + where onSingleLine = try $ many1Till (noneOf "\n\r") end + spanningTwoLines = try $ + anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine + +-- | Like many1Till, but parses at most @n+1@ lines. @p@ must not consume +-- newlines. +many1TillNOrLessNewlines :: Int + -> OrgParser Char + -> OrgParser a + -> OrgParser String +many1TillNOrLessNewlines n p end = try $ + nMoreLines (Just n) mempty >>= oneOrMore + where + nMoreLines Nothing cs = return cs + nMoreLines (Just 0) cs = try $ (cs ++) <$> finalLine + nMoreLines k cs = try $ (final k cs <|> rest k cs) + >>= uncurry nMoreLines + final _ cs = (\x -> (Nothing, cs ++ x)) <$> try finalLine + rest m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> try (manyTill p P.newline) + finalLine = try $ manyTill p end + minus1 k = k - 1 + oneOrMore cs = guard (not $ null cs) *> return cs + +-- Org allows customization of the way it reads emphasis. We use the defaults +-- here (see, e.g., the Emacs Lisp variable `org-emphasis-regexp-components` +-- for details). + +-- | Chars allowed to occur before emphasis (spaces and newlines are ok, too) +emphasisPreChars :: [Char] +emphasisPreChars = "\t \"'({" + +-- | Chars allowed at after emphasis +emphasisPostChars :: [Char] +emphasisPostChars = "\t\n !\"'),-.:;?\\}" + +-- | Chars not allowed at the (inner) border of emphasis +emphasisForbiddenBorderChars :: [Char] +emphasisForbiddenBorderChars = "\t\n\r \"'," + +-- | The maximum number of newlines within +emphasisAllowedNewlines :: Int +emphasisAllowedNewlines = 1 + +-- LaTeX-style math: see `org-latex-regexps` for details + +-- | Chars allowed after an inline ($...$) math statement +mathPostChars :: [Char] +mathPostChars = "\t\n \"'),-.:;?" + +-- | Chars not allowed at the (inner) border of math +mathForbiddenBorderChars :: [Char] +mathForbiddenBorderChars = "\t\n\r ,;.$" + +-- | Maximum number of newlines in an inline math statement +mathAllowedNewlines :: Int +mathAllowedNewlines = 2 + +-- | Whether we are right behind a char allowed before emphasis +afterEmphasisPreChar :: OrgParser Bool +afterEmphasisPreChar = do + pos <- getPosition + lastPrePos <- orgStateLastPreCharPos <$> getState + return . fromMaybe True $ (== pos) <$> lastPrePos + +-- | Whether the parser is right after a forbidden border char +notAfterForbiddenBorderChar :: OrgParser Bool +notAfterForbiddenBorderChar = do + pos <- getPosition + lastFBCPos <- orgStateLastForbiddenCharPos <$> getState + return $ lastFBCPos /= Just pos + +-- | Read a sub- or superscript expression +subOrSuperExpr :: OrgParser (F Inlines) +subOrSuperExpr = try $ + choice [ id <$> charsInBalanced '{' '}' (noneOf "\n\r") + , enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r") + , simpleSubOrSuperString + ] >>= parseFromString (mconcat <$> many inline) + where enclosing (left, right) s = left : s ++ [right] + +simpleSubOrSuperString :: OrgParser String +simpleSubOrSuperString = try $ + choice [ string "*" + , mappend <$> option [] ((:[]) <$> oneOf "+-") + <*> many1 alphaNum + ] + +inlineLaTeX :: OrgParser (F Inlines) +inlineLaTeX = try $ do + cmd <- inlineLaTeXCommand + maybe mzero returnF $ parseAsMath cmd `mplus` parseAsInlineLaTeX cmd + where + parseAsMath :: String -> Maybe Inlines + parseAsMath cs = maybeRight $ B.fromList <$> texMathToPandoc DisplayInline cs + + parseAsInlineLaTeX :: String -> Maybe Inlines + parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs + + state :: ParserState + state = def{ stateOptions = def{ readerParseRaw = True }} + +maybeRight :: Either a b -> Maybe b +maybeRight = either (const Nothing) Just + +inlineLaTeXCommand :: OrgParser String +inlineLaTeXCommand = try $ do + rest <- getInput + case runParser rawLaTeXInline def "source" rest of + Right (RawInline _ cs) -> do + let len = length cs + count len anyChar + return cs + _ -> mzero diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index 34962b553..fa8438e70 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -1,5 +1,6 @@ +{-# LANGUAGE OverloadedStrings #-} {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.RST - Copyright : Copyright (C) 2006-2010 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -35,12 +36,13 @@ import Text.Pandoc.Builder (setMeta, fromList) import Text.Pandoc.Shared import Text.Pandoc.Parsing import Text.Pandoc.Options -import Control.Monad ( when, liftM, guard, mzero ) +import Control.Monad ( when, liftM, guard, mzero, mplus ) import Data.List ( findIndex, intersperse, intercalate, transpose, sort, deleteFirstsBy, isSuffixOf ) +import Data.Maybe (fromMaybe) import qualified Data.Map as M import Text.Printf ( printf ) -import Control.Applicative ((<$>), (<$), (<*), (*>)) +import Control.Applicative ((<$>), (<$), (<*), (*>), (<*>)) import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>)) import qualified Text.Pandoc.Builder as B import Data.Monoid (mconcat, mempty) @@ -111,15 +113,16 @@ titleTransform (bs, meta) = metaFromDefList :: [([Inline], [[Block]])] -> Meta -> Meta metaFromDefList ds meta = adjustAuthors $ foldr f meta ds where f (k,v) = setMeta (map toLower $ stringify k) (mconcat $ map fromList v) - adjustAuthors (Meta metamap) = Meta $ M.adjust toPlain "author" + adjustAuthors (Meta metamap) = Meta $ M.adjust splitAuthors "author" $ M.adjust toPlain "date" $ M.adjust toPlain "title" - $ M.adjust splitAuthors "authors" + $ M.mapKeys (\k -> if k == "authors" then "author" else k) $ metamap toPlain (MetaBlocks [Para xs]) = MetaInlines xs toPlain x = x - splitAuthors (MetaBlocks [Para xs]) = MetaList $ map MetaInlines - $ splitAuthors' xs + splitAuthors (MetaBlocks [Para xs]) + = MetaList $ map MetaInlines + $ splitAuthors' xs splitAuthors x = x splitAuthors' = map normalizeSpaces . splitOnSemi . concatMap factorSemi @@ -183,22 +186,22 @@ block = choice [ codeBlock -- field list -- -rawFieldListItem :: String -> RSTParser (String, String) -rawFieldListItem indent = try $ do - string indent +rawFieldListItem :: Int -> RSTParser (String, String) +rawFieldListItem minIndent = try $ do + indent <- length <$> many (char ' ') + guard $ indent >= minIndent char ':' name <- many1Till (noneOf "\n") (char ':') (() <$ lookAhead newline) <|> skipMany1 spaceChar first <- anyLine - rest <- option "" $ try $ do lookAhead (string indent >> spaceChar) + rest <- option "" $ try $ do lookAhead (count indent (char ' ') >> spaceChar) indentedBlock let raw = (if null first then "" else (first ++ "\n")) ++ rest ++ "\n" return (name, raw) -fieldListItem :: String - -> RSTParser (Inlines, [Blocks]) -fieldListItem indent = try $ do - (name, raw) <- rawFieldListItem indent +fieldListItem :: Int -> RSTParser (Inlines, [Blocks]) +fieldListItem minIndent = try $ do + (name, raw) <- rawFieldListItem minIndent let term = B.str name contents <- parseFromString parseBlocks raw optional blanklines @@ -206,7 +209,7 @@ fieldListItem indent = try $ do fieldList :: RSTParser Blocks fieldList = try $ do - indent <- lookAhead $ many spaceChar + indent <- length <$> lookAhead (many spaceChar) items <- many1 $ fieldListItem indent case items of [] -> return mempty @@ -274,7 +277,8 @@ doubleHeader = try $ do Just ind -> (headerTable, ind + 1) Nothing -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1) setState (state { stateHeaderTable = headerTable' }) - return $ B.header level txt + attr <- registerHeader nullAttr txt + return $ B.headerWith attr level txt -- a header with line on the bottom only singleHeader :: RSTParser Blocks @@ -294,7 +298,8 @@ singleHeader = try $ do Just ind -> (headerTable, ind + 1) Nothing -> (headerTable ++ [SingleHeader c], (length headerTable) + 1) setState (state { stateHeaderTable = headerTable' }) - return $ B.header level txt + attr <- registerHeader nullAttr txt + return $ B.headerWith attr level txt -- -- hrule block @@ -344,14 +349,25 @@ lhsCodeBlock = try $ do getPosition >>= guard . (==1) . sourceColumn guardEnabled Ext_literate_haskell optional codeBlockStart - lns <- many1 birdTrackLine - -- if (as is normal) there is always a space after >, drop it - let lns' = if all (\ln -> null ln || take 1 ln == " ") lns - then map (drop 1) lns - else lns + lns <- latexCodeBlock <|> birdCodeBlock blanklines return $ B.codeBlockWith ("", ["sourceCode", "literate", "haskell"], []) - $ intercalate "\n" lns' + $ intercalate "\n" lns + +latexCodeBlock :: Parser [Char] st [[Char]] +latexCodeBlock = try $ do + try (latexBlockLine "\\begin{code}") + many1Till anyLine (try $ latexBlockLine "\\end{code}") + where + latexBlockLine s = skipMany spaceChar >> string s >> blankline + +birdCodeBlock :: Parser [Char] st [[Char]] +birdCodeBlock = filterSpace <$> many1 birdTrackLine + where filterSpace lns = + -- if (as is normal) there is always a space after >, drop it + if all (\ln -> null ln || take 1 ln == " ") lns + then map (drop 1) lns + else lns birdTrackLine :: Parser [Char] st [Char] birdTrackLine = char '>' >> anyLine @@ -506,17 +522,17 @@ directive' = do skipMany spaceChar top <- many $ satisfy (/='\n') <|> try (char '\n' <* - notFollowedBy' (rawFieldListItem " ") <* + notFollowedBy' (rawFieldListItem 3) <* count 3 (char ' ') <* notFollowedBy blankline) newline - fields <- many $ rawFieldListItem " " + fields <- many $ rawFieldListItem 3 body <- option "" $ try $ blanklines >> indentedBlock optional blanklines let body' = body ++ "\n\n" case label of "raw" -> return $ B.rawBlock (trim top) (stripTrailingNewlines body) - "role" -> return mempty + "role" -> addNewRole top $ map (\(k,v) -> (k, trim v)) fields "container" -> parseFromString parseBlocks body' "replace" -> B.para <$> -- consumed by substKey parseFromString (trimInlines . mconcat <$> many inline) @@ -561,12 +577,15 @@ directive' = do role -> role }) "code" -> codeblock (lookup "number-lines" fields) (trim top) body "code-block" -> codeblock (lookup "number-lines" fields) (trim top) body + "aafig" -> do + let attribs = ("", ["aafig"], map (\(k,v) -> (k, trimr v)) fields) + return $ B.codeBlockWith attribs $ stripTrailingNewlines body "math" -> return $ B.para $ mconcat $ map B.displayMath $ toChunks $ top ++ "\n\n" ++ body "figure" -> do (caption, legend) <- parseFromString extractCaption body' let src = escapeURI $ trim top - return $ B.para (B.image src "" caption) <> legend + return $ B.para (B.image src "fig:" caption) <> legend "image" -> do let src = escapeURI $ trim top let alt = B.str $ maybe "image" trim $ lookup "alt" fields @@ -577,7 +596,38 @@ directive' = do Nothing -> B.image src "" alt _ -> return mempty --- Can contain haracter codes as decimal numbers or +-- TODO: +-- - Silently ignores illegal fields +-- - Silently drops classes +-- - Only supports :format: fields with a single format for :raw: roles, +-- change Text.Pandoc.Definition.Format to fix +addNewRole :: String -> [(String, String)] -> RSTParser Blocks +addNewRole roleString fields = do + (role, parentRole) <- parseFromString inheritedRole roleString + customRoles <- stateRstCustomRoles <$> getState + baseRole <- case M.lookup parentRole customRoles of + Just (base, _, _) -> return base + Nothing -> return parentRole + + let fmt = if baseRole == "raw" then lookup "format" fields else Nothing + annotate = maybe id addLanguage $ + if baseRole == "code" + then lookup "language" fields + else Nothing + + updateState $ \s -> s { + stateRstCustomRoles = + M.insert role (baseRole, fmt, (,) parentRole . annotate) customRoles + } + + return $ B.singleton Null + where + addLanguage lang (ident, classes, keyValues) = + (ident, "sourceCode" : lang : classes, keyValues) + inheritedRole = + (,) <$> roleNameEndingIn (char '(') <*> roleNameEndingIn (char ')') + +-- Can contain character codes as decimal numbers or -- hexadecimal numbers, prefixed by 0x, x, \x, U+, u, or \u -- or as XML-style hexadecimal character entities, e.g. ᨫ -- or text, which is used as-is. Comments start with .. @@ -916,17 +966,56 @@ strong = B.strong . trimInlines . mconcat <$> -- Note, this doesn't precisely implement the complex rule in -- http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules -- but it should be good enough for most purposes +-- +-- TODO: +-- - Classes are silently discarded in addNewRole +-- - Lacks sensible implementation for title-reference (which is the default) +-- - Allows direct use of the :raw: role, rST only allows inherited use. interpretedRole :: RSTParser Inlines interpretedRole = try $ do (role, contents) <- roleBefore <|> roleAfter - case role of - "sup" -> return $ B.superscript $ B.str contents - "sub" -> return $ B.subscript $ B.str contents - "math" -> return $ B.math contents - _ -> return $ B.str contents --unknown + renderRole contents Nothing role nullAttr + +renderRole :: String -> Maybe String -> String -> Attr -> RSTParser Inlines +renderRole contents fmt role attr = case role of + "sup" -> return $ B.superscript $ B.str contents + "superscript" -> return $ B.superscript $ B.str contents + "sub" -> return $ B.subscript $ B.str contents + "subscript" -> return $ B.subscript $ B.str contents + "emphasis" -> return $ B.emph $ B.str contents + "strong" -> return $ B.strong $ B.str contents + "rfc-reference" -> return $ rfcLink contents + "RFC" -> return $ rfcLink contents + "pep-reference" -> return $ pepLink contents + "PEP" -> return $ pepLink contents + "literal" -> return $ B.str contents + "math" -> return $ B.math contents + "title-reference" -> titleRef contents + "title" -> titleRef contents + "t" -> titleRef contents + "code" -> return $ B.codeWith attr contents + "raw" -> return $ B.rawInline (fromMaybe "" fmt) contents + custom -> do + customRole <- stateRstCustomRoles <$> getState + case M.lookup custom customRole of + Just (_, newFmt, inherit) -> let + fmtStr = fmt `mplus` newFmt + (newRole, newAttr) = inherit attr + in renderRole contents fmtStr newRole newAttr + Nothing -> return $ B.str contents -- Undefined role + where + titleRef ref = return $ B.str ref -- FIXME: Not a sensible behaviour + rfcLink rfcNo = B.link rfcUrl ("RFC " ++ rfcNo) $ B.str ("RFC " ++ rfcNo) + where rfcUrl = "http://www.faqs.org/rfcs/rfc" ++ rfcNo ++ ".html" + pepLink pepNo = B.link pepUrl ("PEP " ++ pepNo) $ B.str ("PEP " ++ pepNo) + where padNo = replicate (4 - length pepNo) '0' ++ pepNo + pepUrl = "http://www.python.org/dev/peps/pep-" ++ padNo ++ "/" + +roleNameEndingIn :: RSTParser Char -> RSTParser String +roleNameEndingIn end = many1Till (letter <|> char '-') end roleMarker :: RSTParser String -roleMarker = char ':' *> many1Till (letter <|> char '-') (char ':') +roleMarker = char ':' *> roleNameEndingIn (char ':') roleBefore :: RSTParser (String,String) roleBefore = try $ do @@ -1055,7 +1144,7 @@ smart :: RSTParser Inlines smart = do getOption readerSmart >>= guard doubleQuoted <|> singleQuoted <|> - choice (map (B.singleton <$>) [apostrophe, dash, ellipses]) + choice [apostrophe, dash, ellipses] singleQuoted :: RSTParser Inlines singleQuoted = try $ do diff --git a/src/Text/Pandoc/Readers/TeXMath.hs b/src/Text/Pandoc/Readers/TeXMath.hs index fe49a992e..f03eae044 100644 --- a/src/Text/Pandoc/Readers/TeXMath.hs +++ b/src/Text/Pandoc/Readers/TeXMath.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2007-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2007-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.TeXMath - Copyright : Copyright (C) 2007-2010 John MacFarlane + Copyright : Copyright (C) 2007-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -27,96 +27,30 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of TeX math to a list of 'Pandoc' inline elements. -} -module Text.Pandoc.Readers.TeXMath ( readTeXMath ) where +module Text.Pandoc.Readers.TeXMath ( readTeXMath, readTeXMath' ) where import Text.Pandoc.Definition -import Text.TeXMath.Types -import Text.TeXMath.Parser +import Text.TeXMath -- | Converts a raw TeX math formula to a list of 'Pandoc' inlines. --- Defaults to raw formula between @$@ characters if entire formula +-- Defaults to raw formula between @$@ or @$$@ characters if entire formula -- can't be converted. +readTeXMath' :: MathType + -> String -- ^ String to parse (assumes @'\n'@ line endings) + -> [Inline] +readTeXMath' mt inp = case texMathToPandoc dt inp of + Left _ -> [Str (delim ++ inp ++ delim)] + Right res -> res + where (dt, delim) = case mt of + DisplayMath -> (DisplayBlock, "$$") + InlineMath -> (DisplayInline, "$") + +{-# DEPRECATED readTeXMath "Use readTeXMath' from Text.Pandoc.JSON instead" #-} +-- | Converts a raw TeX math formula to a list of 'Pandoc' inlines. +-- Defaults to raw formula between @$@ characters if entire formula +-- can't be converted. (This is provided for backwards compatibility; +-- it is better to use @readTeXMath'@, which properly distinguishes +-- between display and inline math.) readTeXMath :: String -- ^ String to parse (assumes @'\n'@ line endings) -> [Inline] -readTeXMath inp = case texMathToPandoc inp of - Left _ -> [Str ("$" ++ inp ++ "$")] - Right res -> res - -texMathToPandoc :: String -> Either String [Inline] -texMathToPandoc inp = inp `seq` - case parseFormula inp of - Left err -> Left err - Right exps -> case expsToInlines exps of - Nothing -> Left "Formula too complex for [Inline]" - Just r -> Right r - -expsToInlines :: [Exp] -> Maybe [Inline] -expsToInlines xs = do - res <- mapM expToInlines xs - return (concat res) - -expToInlines :: Exp -> Maybe [Inline] -expToInlines (ENumber s) = Just [Str s] -expToInlines (EIdentifier s) = Just [Emph [Str s]] -expToInlines (EMathOperator s) = Just [Str s] -expToInlines (ESymbol t s) = Just $ addSpace t (Str s) - where addSpace Op x = [x, thinspace] - addSpace Bin x = [medspace, x, medspace] - addSpace Rel x = [widespace, x, widespace] - addSpace Pun x = [x, thinspace] - addSpace _ x = [x] - thinspace = Str "\x2006" - medspace = Str "\x2005" - widespace = Str "\x2004" -expToInlines (EStretchy x) = expToInlines x -expToInlines (EDelimited start end xs) = do - xs' <- mapM expToInlines xs - return $ [Str start] ++ concat xs' ++ [Str end] -expToInlines (EGrouped xs) = expsToInlines xs -expToInlines (ESpace "0.167em") = Just [Str "\x2009"] -expToInlines (ESpace "0.222em") = Just [Str "\x2005"] -expToInlines (ESpace "0.278em") = Just [Str "\x2004"] -expToInlines (ESpace "0.333em") = Just [Str "\x2004"] -expToInlines (ESpace "1em") = Just [Str "\x2001"] -expToInlines (ESpace "2em") = Just [Str "\x2001\x2001"] -expToInlines (ESpace _) = Just [Str " "] -expToInlines (EBinary _ _ _) = Nothing -expToInlines (ESub x y) = do - x' <- expToInlines x - y' <- expToInlines y - return $ x' ++ [Subscript y'] -expToInlines (ESuper x y) = do - x' <- expToInlines x - y' <- expToInlines y - return $ x' ++ [Superscript y'] -expToInlines (ESubsup x y z) = do - x' <- expToInlines x - y' <- expToInlines y - z' <- expToInlines z - return $ x' ++ [Subscript y'] ++ [Superscript z'] -expToInlines (EDown x y) = expToInlines (ESub x y) -expToInlines (EUp x y) = expToInlines (ESuper x y) -expToInlines (EDownup x y z) = expToInlines (ESubsup x y z) -expToInlines (EText TextNormal x) = Just [Str x] -expToInlines (EText TextBold x) = Just [Strong [Str x]] -expToInlines (EText TextMonospace x) = Just [Code nullAttr x] -expToInlines (EText TextItalic x) = Just [Emph [Str x]] -expToInlines (EText _ x) = Just [Str x] -expToInlines (EOver (EGrouped [EIdentifier [c]]) (ESymbol Accent [accent])) = - case accent of - '\x203E' -> Just [Emph [Str [c,'\x0304']]] -- bar - '\x00B4' -> Just [Emph [Str [c,'\x0301']]] -- acute - '\x0060' -> Just [Emph [Str [c,'\x0300']]] -- grave - '\x02D8' -> Just [Emph [Str [c,'\x0306']]] -- breve - '\x02C7' -> Just [Emph [Str [c,'\x030C']]] -- check - '.' -> Just [Emph [Str [c,'\x0307']]] -- dot - '\x00B0' -> Just [Emph [Str [c,'\x030A']]] -- ring - '\x20D7' -> Just [Emph [Str [c,'\x20D7']]] -- arrow right - '\x20D6' -> Just [Emph [Str [c,'\x20D6']]] -- arrow left - '\x005E' -> Just [Emph [Str [c,'\x0302']]] -- hat - '\x0302' -> Just [Emph [Str [c,'\x0302']]] -- hat - '~' -> Just [Emph [Str [c,'\x0303']]] -- tilde - _ -> Nothing -expToInlines _ = Nothing - - +readTeXMath = readTeXMath' InlineMath diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index a1687a691..6d839ec1d 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -1,5 +1,6 @@ {- -Copyright (C) 2010 Paul Rivier <paul*rivier#demotera*com> | tr '*#' '.@' +Copyright (C) 2010-2014 Paul Rivier <paul*rivier#demotera*com> | tr '*#' '.@' + and John MacFarlane This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Textile - Copyright : Copyright (C) 2010-2012 Paul Rivier and John MacFarlane + Copyright : Copyright (C) 2010-2014 Paul Rivier and John MacFarlane License : GNU GPL, version 2 or above Maintainer : Paul Rivier <paul*rivier#demotera*com> @@ -50,18 +51,22 @@ TODO : refactor common patterns across readers : module Text.Pandoc.Readers.Textile ( readTextile) where - import Text.Pandoc.Definition -import Text.Pandoc.Shared +import Text.Pandoc.Builder (Inlines, Blocks, trimInlines) +import qualified Text.Pandoc.Builder as B import Text.Pandoc.Options import Text.Pandoc.Parsing import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag ) import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock ) +import Text.HTML.TagSoup (parseTags, innerText, fromAttrib, Tag(..)) import Text.HTML.TagSoup.Match import Data.List ( intercalate ) -import Data.Char ( digitToInt, isUpper ) -import Control.Monad ( guard, liftM ) -import Control.Applicative ((<$>), (*>), (<*)) +import Data.Char ( digitToInt, isUpper) +import Control.Monad ( guard, liftM, when ) +import Text.Printf +import Control.Applicative ((<$>), (*>), (<*), (<$)) +import Data.Monoid +import Debug.Trace (trace) -- | Parse a Textile text and return a Pandoc document. readTextile :: ReaderOptions -- ^ Reader options @@ -93,7 +98,7 @@ parseTextile = do updateState $ \s -> s { stateNotes = reverse reversedNotes } -- now parse it for real... blocks <- parseBlocks - return $ Pandoc nullMeta blocks -- FIXME + return $ Pandoc nullMeta (B.toList blocks) -- FIXME noteMarker :: Parser [Char] ParserState [Char] noteMarker = skipMany spaceChar >> string "fn" >> manyTill digit (char '.') @@ -113,11 +118,11 @@ noteBlock = try $ do return $ replicate (sourceLine endPos - sourceLine startPos) '\n' -- | Parse document blocks -parseBlocks :: Parser [Char] ParserState [Block] -parseBlocks = manyTill block eof +parseBlocks :: Parser [Char] ParserState Blocks +parseBlocks = mconcat <$> manyTill block eof -- | Block parsers list tried in definition order -blockParsers :: [Parser [Char] ParserState Block] +blockParsers :: [Parser [Char] ParserState Blocks] blockParsers = [ codeBlock , header , blockQuote @@ -128,32 +133,45 @@ blockParsers = [ codeBlock , rawLaTeXBlock' , maybeExplicitBlock "table" table , maybeExplicitBlock "p" para + , endBlock ] --- | Any block in the order of definition of blockParsers -block :: Parser [Char] ParserState Block -block = choice blockParsers <?> "block" +endBlock :: Parser [Char] ParserState Blocks +endBlock = string "\n\n" >> return mempty -commentBlock :: Parser [Char] ParserState Block +-- | Any block in the order of definition of blockParsers +block :: Parser [Char] ParserState Blocks +block = do + res <- choice blockParsers <?> "block" + pos <- getPosition + tr <- getOption readerTrace + when tr $ + trace (printf "line %d: %s" (sourceLine pos) + (take 60 $ show $ B.toList res)) (return ()) + return res + +commentBlock :: Parser [Char] ParserState Blocks commentBlock = try $ do string "###." manyTill anyLine blanklines - return Null + return mempty -codeBlock :: Parser [Char] ParserState Block +codeBlock :: Parser [Char] ParserState Blocks codeBlock = codeBlockBc <|> codeBlockPre -codeBlockBc :: Parser [Char] ParserState Block +codeBlockBc :: Parser [Char] ParserState Blocks codeBlockBc = try $ do string "bc. " contents <- manyTill anyLine blanklines - return $ CodeBlock ("",[],[]) $ unlines contents + return $ B.codeBlock (unlines contents) -- | Code Blocks in Textile are between <pre> and </pre> -codeBlockPre :: Parser [Char] ParserState Block +codeBlockPre :: Parser [Char] ParserState Blocks codeBlockPre = try $ do - htmlTag (tagOpen (=="pre") null) - result' <- manyTill anyChar (try $ htmlTag (tagClose (=="pre")) >> blockBreak) + (t@(TagOpen _ attrs),_) <- htmlTag (tagOpen (=="pre") (const True)) + result' <- (innerText . parseTags) `fmap` -- remove internal tags + manyTill anyChar (htmlTag (tagClose (=="pre"))) + optional blanklines -- drop leading newline if any let result'' = case result' of '\n':xs -> xs @@ -162,28 +180,32 @@ codeBlockPre = try $ do let result''' = case reverse result'' of '\n':_ -> init result'' _ -> result'' - return $ CodeBlock ("",[],[]) result''' + let classes = words $ fromAttrib "class" t + let ident = fromAttrib "id" t + let kvs = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + return $ B.codeBlockWith (ident,classes,kvs) result''' -- | Header of the form "hN. content" with N in 1..6 -header :: Parser [Char] ParserState Block +header :: Parser [Char] ParserState Blocks header = try $ do char 'h' level <- digitToInt <$> oneOf "123456" attr <- attributes char '.' - whitespace - name <- normalizeSpaces <$> manyTill inline blockBreak - return $ Header level attr name + lookAhead whitespace + name <- trimInlines . mconcat <$> manyTill inline blockBreak + attr' <- registerHeader attr name + return $ B.headerWith attr' level name -- | Blockquote of the form "bq. content" -blockQuote :: Parser [Char] ParserState Block +blockQuote :: Parser [Char] ParserState Blocks blockQuote = try $ do string "bq" >> attributes >> char '.' >> whitespace - BlockQuote . singleton <$> para + B.blockQuote <$> para -- Horizontal rule -hrule :: Parser [Char] st Block +hrule :: Parser [Char] st Blocks hrule = try $ do skipSpaces start <- oneOf "-*" @@ -191,62 +213,62 @@ hrule = try $ do skipMany (spaceChar <|> char start) newline optional blanklines - return HorizontalRule + return B.horizontalRule -- Lists handling -- | Can be a bullet list or an ordered list. This implementation is -- strict in the nesting, sublist must start at exactly "parent depth -- plus one" -anyList :: Parser [Char] ParserState Block +anyList :: Parser [Char] ParserState Blocks anyList = try $ anyListAtDepth 1 <* blanklines -- | This allow one type of list to be nested into an other type, -- provided correct nesting -anyListAtDepth :: Int -> Parser [Char] ParserState Block +anyListAtDepth :: Int -> Parser [Char] ParserState Blocks anyListAtDepth depth = choice [ bulletListAtDepth depth, orderedListAtDepth depth, definitionList ] -- | Bullet List of given depth, depth being the number of leading '*' -bulletListAtDepth :: Int -> Parser [Char] ParserState Block -bulletListAtDepth depth = try $ BulletList <$> many1 (bulletListItemAtDepth depth) +bulletListAtDepth :: Int -> Parser [Char] ParserState Blocks +bulletListAtDepth depth = try $ B.bulletList <$> many1 (bulletListItemAtDepth depth) -- | Bullet List Item of given depth, depth being the number of -- leading '*' -bulletListItemAtDepth :: Int -> Parser [Char] ParserState [Block] +bulletListItemAtDepth :: Int -> Parser [Char] ParserState Blocks bulletListItemAtDepth = genericListItemAtDepth '*' -- | Ordered List of given depth, depth being the number of -- leading '#' -orderedListAtDepth :: Int -> Parser [Char] ParserState Block +orderedListAtDepth :: Int -> Parser [Char] ParserState Blocks orderedListAtDepth depth = try $ do items <- many1 (orderedListItemAtDepth depth) - return (OrderedList (1, DefaultStyle, DefaultDelim) items) + return $ B.orderedList items -- | Ordered List Item of given depth, depth being the number of -- leading '#' -orderedListItemAtDepth :: Int -> Parser [Char] ParserState [Block] +orderedListItemAtDepth :: Int -> Parser [Char] ParserState Blocks orderedListItemAtDepth = genericListItemAtDepth '#' -- | Common implementation of list items -genericListItemAtDepth :: Char -> Int -> Parser [Char] ParserState [Block] +genericListItemAtDepth :: Char -> Int -> Parser [Char] ParserState Blocks genericListItemAtDepth c depth = try $ do count depth (char c) >> attributes >> whitespace - p <- many listInline + p <- mconcat <$> many listInline newline - sublist <- option [] (singleton <$> anyListAtDepth (depth + 1)) - return (Plain p : sublist) + sublist <- option mempty (anyListAtDepth (depth + 1)) + return $ (B.plain p) <> sublist -- | A definition list is a set of consecutive definition items -definitionList :: Parser [Char] ParserState Block -definitionList = try $ DefinitionList <$> many1 definitionListItem +definitionList :: Parser [Char] ParserState Blocks +definitionList = try $ B.definitionList <$> many1 definitionListItem -- | List start character. listStart :: Parser [Char] st Char listStart = oneOf "*#-" -listInline :: Parser [Char] ParserState Inline +listInline :: Parser [Char] ParserState Inlines listInline = try (notFollowedBy newline >> inline) <|> try (endline <* notFollowedBy listStart) @@ -254,16 +276,16 @@ listInline = try (notFollowedBy newline >> inline) -- the term defined, then spaces and ":=". The definition follows, on -- the same single line, or spaned on multiple line, after a line -- break. -definitionListItem :: Parser [Char] ParserState ([Inline], [[Block]]) +definitionListItem :: Parser [Char] ParserState (Inlines, [Blocks]) definitionListItem = try $ do string "- " - term <- many1Till inline (try (whitespace >> string ":=")) + term <- mconcat <$> many1Till inline (try (whitespace >> string ":=")) def' <- multilineDef <|> inlineDef return (term, def') - where inlineDef :: Parser [Char] ParserState [[Block]] - inlineDef = liftM (\d -> [[Plain d]]) - $ optional whitespace >> many listInline <* newline - multilineDef :: Parser [Char] ParserState [[Block]] + where inlineDef :: Parser [Char] ParserState [Blocks] + inlineDef = liftM (\d -> [B.plain d]) + $ optional whitespace >> (trimInlines . mconcat <$> many listInline) <* newline + multilineDef :: Parser [Char] ParserState [Blocks] multilineDef = try $ do optional whitespace >> newline s <- many1Till anyChar (try (string "=:" >> newline)) @@ -275,64 +297,62 @@ definitionListItem = try $ do -- blocks support, we have to lookAhead for a rawHtmlBlock. blockBreak :: Parser [Char] ParserState () blockBreak = try (newline >> blanklines >> return ()) <|> - (lookAhead rawHtmlBlock >> return ()) + try (optional spaces >> lookAhead rawHtmlBlock >> return ()) -- raw content -- | A raw Html Block, optionally followed by blanklines -rawHtmlBlock :: Parser [Char] ParserState Block +rawHtmlBlock :: Parser [Char] ParserState Blocks rawHtmlBlock = try $ do (_,b) <- htmlTag isBlockTag optional blanklines - return $ RawBlock "html" b + return $ B.rawBlock "html" b -- | Raw block of LaTeX content -rawLaTeXBlock' :: Parser [Char] ParserState Block +rawLaTeXBlock' :: Parser [Char] ParserState Blocks rawLaTeXBlock' = do guardEnabled Ext_raw_tex - RawBlock "latex" <$> (rawLaTeXBlock <* spaces) + B.rawBlock "latex" <$> (rawLaTeXBlock <* spaces) -- | In textile, paragraphs are separated by blank lines. -para :: Parser [Char] ParserState Block -para = try $ Para . normalizeSpaces <$> manyTill inline blockBreak - +para :: Parser [Char] ParserState Blocks +para = B.para . trimInlines . mconcat <$> manyTill inline blockBreak -- Tables -- | A table cell spans until a pipe | -tableCell :: Parser [Char] ParserState TableCell +tableCell :: Parser [Char] ParserState Blocks tableCell = do c <- many1 (noneOf "|\n") - content <- parseFromString (many1 inline) c - return $ [ Plain $ normalizeSpaces content ] + content <- trimInlines . mconcat <$> parseFromString (many1 inline) c + return $ B.plain content -- | A table row is made of many table cells -tableRow :: Parser [Char] ParserState [TableCell] +tableRow :: Parser [Char] ParserState [Blocks] tableRow = try $ ( char '|' *> (endBy1 tableCell (optional blankline *> char '|')) <* newline) -- | Many table rows -tableRows :: Parser [Char] ParserState [[TableCell]] +tableRows :: Parser [Char] ParserState [[Blocks]] tableRows = many1 tableRow -- | Table headers are made of cells separated by a tag "|_." -tableHeaders :: Parser [Char] ParserState [TableCell] +tableHeaders :: Parser [Char] ParserState [Blocks] tableHeaders = let separator = (try $ string "|_.") in try $ ( separator *> (sepBy1 tableCell separator) <* char '|' <* newline ) -- | A table with an optional header. Current implementation can -- handle tables with and without header, but will parse cells -- alignment attributes as content. -table :: Parser [Char] ParserState Block +table :: Parser [Char] ParserState Blocks table = try $ do - headers <- option [] tableHeaders + headers <- option mempty tableHeaders rows <- tableRows blanklines let nbOfCols = max (length headers) (length $ head rows) - return $ Table [] - (replicate nbOfCols AlignDefault) - (replicate nbOfCols 0.0) + return $ B.table mempty + (zip (replicate nbOfCols AlignDefault) (replicate nbOfCols 0.0)) headers rows @@ -340,8 +360,8 @@ table = try $ do -- | Blocks like 'p' and 'table' do not need explicit block tag. -- However, they can be used to set HTML/CSS attributes when needed. maybeExplicitBlock :: String -- ^ block tag name - -> Parser [Char] ParserState Block -- ^ implicit block - -> Parser [Char] ParserState Block + -> Parser [Char] ParserState Blocks -- ^ implicit block + -> Parser [Char] ParserState Blocks maybeExplicitBlock name blk = try $ do optional $ try $ string name >> attributes >> char '.' >> optional whitespace >> optional endline @@ -355,73 +375,74 @@ maybeExplicitBlock name blk = try $ do -- | Any inline element -inline :: Parser [Char] ParserState Inline -inline = choice inlineParsers <?> "inline" +inline :: Parser [Char] ParserState Inlines +inline = do + choice inlineParsers <?> "inline" -- | Inline parsers tried in order -inlineParsers :: [Parser [Char] ParserState Inline] +inlineParsers :: [Parser [Char] ParserState Inlines] inlineParsers = [ str , whitespace , endline , code , escapedInline - , htmlSpan + , inlineMarkup + , groupedInlineMarkup , rawHtmlInline , rawLaTeXInline' , note - , try $ (char '[' *> inlineMarkup <* char ']') - , inlineMarkup , link , image , mark - , (Str . (:[])) <$> characterReference + , (B.str . (:[])) <$> characterReference , smartPunctuation inline , symbol ] -- | Inline markups -inlineMarkup :: Parser [Char] ParserState Inline -inlineMarkup = choice [ simpleInline (string "??") (Cite []) - , simpleInline (string "**") Strong - , simpleInline (string "__") Emph - , simpleInline (char '*') Strong - , simpleInline (char '_') Emph - , simpleInline (char '+') Emph -- approximates underline - , simpleInline (char '-' <* notFollowedBy (char '-')) Strikeout - , simpleInline (char '^') Superscript - , simpleInline (char '~') Subscript +inlineMarkup :: Parser [Char] ParserState Inlines +inlineMarkup = choice [ simpleInline (string "??") (B.cite []) + , simpleInline (string "**") B.strong + , simpleInline (string "__") B.emph + , simpleInline (char '*') B.strong + , simpleInline (char '_') B.emph + , simpleInline (char '+') B.emph -- approximates underline + , simpleInline (char '-' <* notFollowedBy (char '-')) B.strikeout + , simpleInline (char '^') B.superscript + , simpleInline (char '~') B.subscript + , simpleInline (char '%') id ] -- | Trademark, registered, copyright -mark :: Parser [Char] st Inline +mark :: Parser [Char] st Inlines mark = try $ char '(' >> (try tm <|> try reg <|> copy) -reg :: Parser [Char] st Inline +reg :: Parser [Char] st Inlines reg = do oneOf "Rr" char ')' - return $ Str "\174" + return $ B.str "\174" -tm :: Parser [Char] st Inline +tm :: Parser [Char] st Inlines tm = do oneOf "Tt" oneOf "Mm" char ')' - return $ Str "\8482" + return $ B.str "\8482" -copy :: Parser [Char] st Inline +copy :: Parser [Char] st Inlines copy = do oneOf "Cc" char ')' - return $ Str "\169" + return $ B.str "\169" -note :: Parser [Char] ParserState Inline +note :: Parser [Char] ParserState Inlines note = try $ do ref <- (char '[' *> many1 digit <* char ']') notes <- stateNotes <$> getState case lookup ref notes of Nothing -> fail "note not found" - Just raw -> liftM Note $ parseFromString parseBlocks raw + Just raw -> B.note <$> parseFromString parseBlocks raw -- | Special chars markupChars :: [Char] @@ -442,7 +463,7 @@ wordBoundaries = markupChars ++ stringBreakers hyphenedWords :: Parser [Char] ParserState String hyphenedWords = do x <- wordChunk - xs <- many (try $ char '-' >> wordChunk) + xs <- many (try $ char '-' >> wordChunk) return $ intercalate "-" (x:xs) wordChunk :: Parser [Char] ParserState String @@ -454,99 +475,99 @@ wordChunk = try $ do return $ hd:tl -- | Any string -str :: Parser [Char] ParserState Inline +str :: Parser [Char] ParserState Inlines str = do baseStr <- hyphenedWords -- RedCloth compliance : if parsed word is uppercase and immediatly -- followed by parens, parens content is unconditionally word acronym fullStr <- option baseStr $ try $ do guard $ all isUpper baseStr - acro <- enclosed (char '(') (char ')') anyChar + acro <- enclosed (char '(') (char ')') anyChar' return $ concat [baseStr, " (", acro, ")"] updateLastStrPos - return $ Str fullStr - --- | Textile allows HTML span infos, we discard them -htmlSpan :: Parser [Char] ParserState Inline -htmlSpan = try $ Str <$> ( char '%' *> attributes *> manyTill anyChar (char '%') ) + return $ B.str fullStr -- | Some number of space chars -whitespace :: Parser [Char] ParserState Inline -whitespace = many1 spaceChar >> return Space <?> "whitespace" +whitespace :: Parser [Char] ParserState Inlines +whitespace = many1 spaceChar >> return B.space <?> "whitespace" -- | In Textile, an isolated endline character is a line break -endline :: Parser [Char] ParserState Inline +endline :: Parser [Char] ParserState Inlines endline = try $ do newline >> notFollowedBy blankline - return LineBreak + return B.linebreak -rawHtmlInline :: Parser [Char] ParserState Inline -rawHtmlInline = RawInline "html" . snd <$> htmlTag isInlineTag +rawHtmlInline :: Parser [Char] ParserState Inlines +rawHtmlInline = B.rawInline "html" . snd <$> htmlTag isInlineTag -- | Raw LaTeX Inline -rawLaTeXInline' :: Parser [Char] ParserState Inline +rawLaTeXInline' :: Parser [Char] ParserState Inlines rawLaTeXInline' = try $ do guardEnabled Ext_raw_tex - rawLaTeXInline + B.singleton <$> rawLaTeXInline -- | Textile standard link syntax is "label":target. But we -- can also have ["label":target]. -link :: Parser [Char] ParserState Inline -link = linkB <|> linkNoB - -linkNoB :: Parser [Char] ParserState Inline -linkNoB = try $ do - name <- surrounded (char '"') inline - char ':' - let stopChars = "!.,;:" - url <- manyTill nonspaceChar (lookAhead $ space <|> try (oneOf stopChars >> (space <|> newline))) - let name' = if name == [Str "$"] then [Str url] else name - return $ Link name' (url, "") - -linkB :: Parser [Char] ParserState Inline -linkB = try $ do - char '[' - name <- surrounded (char '"') inline +link :: Parser [Char] ParserState Inlines +link = try $ do + bracketed <- (True <$ char '[') <|> return False + char '"' *> notFollowedBy (oneOf " \t\n\r") + attr <- attributes + name <- trimInlines . mconcat <$> + withQuoteContext InDoubleQuote (many1Till inline (char '"')) char ':' - url <- manyTill nonspaceChar (char ']') - let name' = if name == [Str "$"] then [Str url] else name - return $ Link name' (url, "") + let stop = if bracketed + then char ']' + else lookAhead $ space <|> + try (oneOf "!.,;:" *> (space <|> newline)) + url <- manyTill nonspaceChar stop + let name' = if B.toList name == [Str "$"] then B.str url else name + return $ if attr == nullAttr + then B.link url "" name' + else B.spanWith attr $ B.link url "" name' -- | image embedding -image :: Parser [Char] ParserState Inline +image :: Parser [Char] ParserState Inlines image = try $ do char '!' >> notFollowedBy space - src <- manyTill anyChar (lookAhead $ oneOf "!(") - alt <- option "" (try $ (char '(' >> manyTill anyChar (char ')'))) + src <- manyTill anyChar' (lookAhead $ oneOf "!(") + alt <- option "" (try $ (char '(' >> manyTill anyChar' (char ')'))) char '!' - return $ Image [Str alt] (src, alt) + return $ B.image src alt (B.str alt) -escapedInline :: Parser [Char] ParserState Inline +escapedInline :: Parser [Char] ParserState Inlines escapedInline = escapedEqs <|> escapedTag -escapedEqs :: Parser [Char] ParserState Inline -escapedEqs = Str <$> (try $ string "==" *> manyTill anyChar (try $ string "==")) +escapedEqs :: Parser [Char] ParserState Inlines +escapedEqs = B.str <$> + (try $ string "==" *> manyTill anyChar' (try $ string "==")) -- | literal text escaped btw <notextile> tags -escapedTag :: Parser [Char] ParserState Inline -escapedTag = Str <$> - (try $ string "<notextile>" *> manyTill anyChar (try $ string "</notextile>")) +escapedTag :: Parser [Char] ParserState Inlines +escapedTag = B.str <$> + (try $ string "<notextile>" *> + manyTill anyChar' (try $ string "</notextile>")) -- | Any special symbol defined in wordBoundaries -symbol :: Parser [Char] ParserState Inline -symbol = Str . singleton <$> (oneOf wordBoundaries <|> oneOf markupChars) +symbol :: Parser [Char] ParserState Inlines +symbol = B.str . singleton <$> (oneOf wordBoundaries <|> oneOf markupChars) -- | Inline code -code :: Parser [Char] ParserState Inline +code :: Parser [Char] ParserState Inlines code = code1 <|> code2 -code1 :: Parser [Char] ParserState Inline -code1 = Code nullAttr <$> surrounded (char '@') anyChar +-- any character except a newline before a blank line +anyChar' :: Parser [Char] ParserState Char +anyChar' = + satisfy (/='\n') <|> (try $ char '\n' <* notFollowedBy blankline) + +code1 :: Parser [Char] ParserState Inlines +code1 = B.code <$> surrounded (char '@') anyChar' -code2 :: Parser [Char] ParserState Inline +code2 :: Parser [Char] ParserState Inlines code2 = do htmlTag (tagOpen (=="tt") null) - Code nullAttr <$> manyTill anyChar (try $ htmlTag $ tagClose (=="tt")) + B.code <$> manyTill anyChar' (try $ htmlTag $ tagClose (=="tt")) -- | Html / CSS attributes attributes :: Parser [Char] ParserState Attr @@ -558,7 +579,7 @@ attribute = classIdAttr <|> styleAttr <|> langAttr classIdAttr :: Parser [Char] ParserState (Attr -> Attr) classIdAttr = try $ do -- (class class #id) char '(' - ws <- words `fmap` manyTill anyChar (char ')') + ws <- words `fmap` manyTill anyChar' (char ')') case reverse ws of [] -> return $ \(_,_,keyvals) -> ("",[],keyvals) (('#':ident'):classes') -> return $ \(_,_,keyvals) -> @@ -568,28 +589,49 @@ classIdAttr = try $ do -- (class class #id) styleAttr :: Parser [Char] ParserState (Attr -> Attr) styleAttr = do - style <- try $ enclosed (char '{') (char '}') anyChar + style <- try $ enclosed (char '{') (char '}') anyChar' return $ \(id',classes,keyvals) -> (id',classes,("style",style):keyvals) langAttr :: Parser [Char] ParserState (Attr -> Attr) langAttr = do - lang <- try $ enclosed (char '[') (char ']') anyChar + lang <- try $ enclosed (char '[') (char ']') alphaNum return $ \(id',classes,keyvals) -> (id',classes,("lang",lang):keyvals) -- | Parses material surrounded by a parser. surrounded :: Parser [Char] st t -- ^ surrounding parser -> Parser [Char] st a -- ^ content parser (to be used repeatedly) -> Parser [Char] st [a] -surrounded border = enclosed (border *> notFollowedBy (oneOf " \t\n\r")) (try border) +surrounded border = + enclosed (border *> notFollowedBy (oneOf " \t\n\r")) (try border) --- | Inlines are most of the time of the same form simpleInline :: Parser [Char] ParserState t -- ^ surrounding parser - -> ([Inline] -> Inline) -- ^ Inline constructor - -> Parser [Char] ParserState Inline -- ^ content parser (to be used repeatedly) -simpleInline border construct = surrounded border (inlineWithAttribute) >>= - return . construct . normalizeSpaces - where inlineWithAttribute = (try $ optional attributes) >> inline + -> (Inlines -> Inlines) -- ^ Inline constructor + -> Parser [Char] ParserState Inlines -- ^ content parser (to be used repeatedly) +simpleInline border construct = try $ do + st <- getState + pos <- getPosition + let afterString = stateLastStrPos st == Just pos + guard $ not afterString + border *> notFollowedBy (oneOf " \t\n\r") + attr <- attributes + body <- trimInlines . mconcat <$> + withQuoteContext InSingleQuote + (manyTill inline (try border <* notFollowedBy alphaNum)) + return $ construct $ + if attr == nullAttr + then body + else B.spanWith attr body + +groupedInlineMarkup :: Parser [Char] ParserState Inlines +groupedInlineMarkup = try $ do + char '[' + sp1 <- option mempty $ B.space <$ whitespace + result <- withQuoteContext InSingleQuote inlineMarkup + sp2 <- option mempty $ B.space <$ whitespace + char ']' + return $ sp1 <> result <> sp2 -- | Create a singleton list singleton :: a -> [a] singleton x = [x] + |
