diff options
Diffstat (limited to 'src/Text/Pandoc/Readers')
21 files changed, 6580 insertions, 1142 deletions
diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs index 56cb16b20..663960a87 100644 --- a/src/Text/Pandoc/Readers/DocBook.hs +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -6,6 +6,7 @@ import Text.Pandoc.Definition import Text.Pandoc.Builder import Text.XML.Light import Text.Pandoc.Compat.TagSoupEntity (lookupEntity) +import Data.Either (rights) import Data.Generics import Data.Monoid import Data.Char (isSpace) @@ -13,6 +14,7 @@ import Control.Monad.State import Control.Applicative ((<$>)) import Data.List (intersperse) import Data.Maybe (fromMaybe) +import Text.TeXMath (readMathML, writeTeX) {- @@ -45,7 +47,7 @@ List of all DocBook tags, with [x] indicating implemented, [ ] audioobject - A wrapper for audio data and its associated meta-information [x] author - The name of an individual author [ ] authorblurb - A short description or note about an author -[ ] authorgroup - Wrapper for author information when a document has +[x] authorgroup - Wrapper for author information when a document has multiple authors or collabarators [x] authorinitials - The initials or other short identifier for an author [o] beginpage - The location of a page break in a print version of the document @@ -68,8 +70,8 @@ List of all DocBook tags, with [x] indicating implemented, [x] book - A book [x] bookinfo - Meta-information for a Book [x] bridgehead - A free-floating heading -[ ] callout - A “called out” description of a marked Area -[ ] calloutlist - A list of Callouts +[x] callout - A “called out” description of a marked Area +[x] calloutlist - A list of Callouts [x] caption - A caption [x] caution - A note of caution [x] chapter - A chapter, as of a book @@ -79,7 +81,7 @@ List of all DocBook tags, with [x] indicating implemented, [ ] citerefentry - A citation to a reference page [ ] citetitle - The title of a cited work [ ] city - The name of a city in an address -[ ] classname - The name of a class, in the object-oriented programming sense +[x] classname - The name of a class, in the object-oriented programming sense [ ] classsynopsis - The syntax summary for a class definition [ ] classsynopsisinfo - Information supplementing the contents of a ClassSynopsis @@ -126,7 +128,7 @@ List of all DocBook tags, with [x] indicating implemented, [ ] envar - A software environment variable [x] epigraph - A short inscription at the beginning of a document or component note: also handle embedded attribution tag -[ ] equation - A displayed mathematical equation +[x] equation - A displayed mathematical equation [ ] errorcode - An error code [ ] errorname - An error name [ ] errortext - An error message. @@ -167,9 +169,9 @@ List of all DocBook tags, with [x] indicating implemented, [ ] guibutton - The text on a button in a GUI [ ] guiicon - Graphic and/or text appearing as a icon in a GUI [ ] guilabel - The text of a label in a GUI -[ ] guimenu - The name of a menu in a GUI -[ ] guimenuitem - The name of a terminal menu item in a GUI -[ ] guisubmenu - The name of a submenu in a GUI +[x] guimenu - The name of a menu in a GUI +[x] guimenuitem - The name of a terminal menu item in a GUI +[x] guisubmenu - The name of a submenu in a GUI [ ] hardware - A physical part of a computer system [ ] highlights - A summary of the main points of the discussed component [ ] holder - The name of the individual or organization that holds a copyright @@ -185,12 +187,12 @@ List of all DocBook tags, with [x] indicating implemented, [x] indexinfo - Meta-information for an Index [x] indexterm - A wrapper for terms to be indexed [x] info - A wrapper for information about a component or other block. (DocBook v5) -[ ] informalequation - A displayed mathematical equation without a title +[x] informalequation - A displayed mathematical equation without a title [ ] informalexample - A displayed example without a title [ ] informalfigure - A untitled figure [ ] informaltable - A table without a title [ ] initializer - The initializer for a FieldSynopsis -[ ] inlineequation - A mathematical equation or expression occurring inline +[x] inlineequation - A mathematical equation or expression occurring inline [ ] inlinegraphic - An object containing or pointing to graphical data that will be rendered inline [x] inlinemediaobject - An inline media object (video, audio, image, and so on) @@ -204,10 +206,10 @@ List of all DocBook tags, with [x] indicating implemented, other dingbat [ ] itermset - A set of index terms in the meta-information of a document [ ] jobtitle - The title of an individual in an organization -[ ] keycap - The text printed on a key on a keyboard +[x] keycap - The text printed on a key on a keyboard [ ] keycode - The internal, frequently numeric, identifier for a key on a keyboard -[ ] keycombo - A combination of input actions +[x] keycombo - A combination of input actions [ ] keysym - The symbolic name of a key on a keyboard [ ] keyword - One of a set of keywords describing the content of a document [ ] keywordset - A set of keywords describing the content of a document @@ -235,11 +237,11 @@ List of all DocBook tags, with [x] indicating implemented, [x] mediaobject - A displayed media object (video, audio, image, etc.) [ ] mediaobjectco - A media object that contains callouts [x] member - An element of a simple list -[ ] menuchoice - A selection or series of selections from a menu +[x] menuchoice - A selection or series of selections from a menu [ ] methodname - The name of a method [ ] methodparam - Parameters to a method [ ] methodsynopsis - A syntax summary for a method -[ ] mml:math - A MathML equation +[x] mml:math - A MathML equation [ ] modespec - Application-specific information necessary for the completion of an OLink [ ] modifier - Modifiers in a synopsis @@ -341,7 +343,7 @@ List of all DocBook tags, with [x] indicating implemented, [x] refsectioninfo - Meta-information for a refsection [ ] refsynopsisdiv - A syntactic synopsis of the subject of the reference page [ ] refsynopsisdivinfo - Meta-information for a RefSynopsisDiv -[ ] releaseinfo - Information about a particular release of a document +[x] releaseinfo - Information about a particular release of a document [ ] remark - A remark (or comment) intended for presentation in a draft manuscript [ ] replaceable - Content that may or must be replaced by the user @@ -469,7 +471,7 @@ List of all DocBook tags, with [x] indicating implemented, [ ] token - A unit of information [x] tr - A row in an HTML table [ ] trademark - A trademark -[ ] type - The classification of a value +[x] type - The classification of a value [x] ulink - A link that addresses its target by means of a URL (Uniform Resource Locator) [x] uri - A Uniform Resource Identifier @@ -492,34 +494,40 @@ List of all DocBook tags, with [x] indicating implemented, anything else [ ] xref - A cross reference to another part of the document [ ] year - The year of publication of a document - +[x] ?asciidoc-br? - line break from asciidoc docbook output -} type DB = State DBState data DBState = DBState{ dbSectionLevel :: Int , dbQuoteType :: QuoteType - , dbDocTitle :: Inlines - , dbDocAuthors :: [Inlines] - , dbDocDate :: Inlines + , dbMeta :: Meta + , dbAcceptsMeta :: Bool , dbBook :: Bool , dbFigureTitle :: Inlines } deriving Show readDocBook :: ReaderOptions -> String -> Pandoc -readDocBook _ inp = setTitle (dbDocTitle st') - $ setAuthors (dbDocAuthors st') - $ setDate (dbDocDate st') - $ doc $ mconcat bs - where (bs, st') = runState (mapM parseBlock $ normalizeTree $ parseXML inp) +readDocBook _ inp = Pandoc (dbMeta st') (toList $ mconcat bs) + where (bs, st') = runState (mapM parseBlock $ normalizeTree $ parseXML inp') DBState{ dbSectionLevel = 0 , dbQuoteType = DoubleQuote - , dbDocTitle = mempty - , dbDocAuthors = [] - , dbDocDate = mempty + , dbMeta = mempty + , dbAcceptsMeta = False , dbBook = False , dbFigureTitle = mempty } + inp' = handleInstructions inp + +-- We treat <?asciidoc-br?> specially (issue #1236), converting it +-- to <br/>, since xml-light doesn't parse the instruction correctly. +-- Other xml instructions are simply removed from the input stream. +handleInstructions :: String -> String +handleInstructions ('<':'?':'a':'s':'c':'i':'i':'d':'o':'c':'-':'b':'r':'?':'>':xs) = '<':'b':'r':'/':'>': handleInstructions xs +handleInstructions xs = case break (=='<') xs of + (ys, []) -> ys + ([], '<':zs) -> '<' : handleInstructions zs + (ys, zs) -> ys ++ handleInstructions zs getFigure :: Element -> DB Blocks getFigure e = do @@ -560,6 +568,30 @@ attrValue attr elt = named :: String -> Element -> Bool named s e = qName (elName e) == s +-- + +acceptingMetadata :: DB a -> DB a +acceptingMetadata p = do + modify (\s -> s { dbAcceptsMeta = True } ) + res <- p + modify (\s -> s { dbAcceptsMeta = False }) + return res + +checkInMeta :: Monoid a => DB () -> DB a +checkInMeta p = do + accepts <- dbAcceptsMeta <$> get + when accepts p + return mempty + + + +addMeta :: ToMetaValue a => String -> a -> DB () +addMeta field val = modify (setMeta field val) + +instance HasMeta DBState where + setMeta field v s = s {dbMeta = setMeta field v (dbMeta s)} + deleteMeta field s = s {dbMeta = deleteMeta field (dbMeta s)} + isBlockElement :: Content -> Bool isBlockElement (Elem e) = qName (elName e) `elem` blocktags where blocktags = ["toc","index","para","formalpara","simpara", @@ -571,7 +603,7 @@ isBlockElement (Elem e) = qName (elName e) `elem` blocktags "important","caution","note","tip","warning","qandadiv", "question","answer","abstract","itemizedlist","orderedlist", "variablelist","article","book","table","informaltable", - "screen","programlisting","example"] + "screen","programlisting","example","calloutlist"] isBlockElement _ = False -- Trim leading and trailing newline characters @@ -606,6 +638,7 @@ getImage e = do getBlocks :: Element -> DB Blocks getBlocks e = mconcat <$> (mapM parseBlock $ elContent e) + parseBlock :: Content -> DB Blocks parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE parseBlock (Text (CData _ s _)) = if all isSpace s @@ -619,10 +652,10 @@ parseBlock (Elem e) = "para" -> parseMixed para (elContent e) "formalpara" -> do tit <- case filterChild (named "title") e of - Just t -> (<> str "." <> linebreak) <$> emph - <$> getInlines t + Just t -> (para . strong . (<> str ".")) <$> + getInlines t Nothing -> return mempty - addToStart tit <$> parseMixed para (elContent e) + (tit <>) <$> parseMixed para (elContent e) "simpara" -> parseMixed para (elContent e) "ackno" -> parseMixed para (elContent e) "epigraph" -> parseBlockquote @@ -630,7 +663,11 @@ parseBlock (Elem e) = "attribution" -> return mempty "titleabbrev" -> return mempty "authorinitials" -> return mempty - "title" -> return mempty -- handled by getTitle or sect or figure + "title" -> checkInMeta getTitle + "author" -> checkInMeta getAuthor + "authorgroup" -> checkInMeta getAuthorGroup + "releaseinfo" -> checkInMeta (getInlines e >>= addMeta "release") + "date" -> checkInMeta getDate "bibliography" -> sect 0 "bibliodiv" -> sect 1 "biblioentry" -> parseMixed para (elContent e) @@ -675,6 +712,7 @@ parseBlock (Elem e) = "question" -> addToStart (strong (str "Q:") <> str " ") <$> getBlocks e "answer" -> addToStart (strong (str "A:") <> str " ") <$> getBlocks e "abstract" -> blockQuote <$> getBlocks e + "calloutlist" -> bulletList <$> callouts "itemizedlist" -> bulletList <$> listitems "orderedlist" -> do let listStyle = case attrValue "numeration" e of @@ -693,8 +731,8 @@ parseBlock (Elem e) = "figure" -> getFigure e "mediaobject" -> para <$> getImage e "caption" -> return mempty - "info" -> getTitle >> getAuthors >> getDate >> return mempty - "articleinfo" -> getTitle >> getAuthors >> getDate >> return mempty + "info" -> metaBlock + "articleinfo" -> metaBlock "sectioninfo" -> return mempty -- keywords & other metadata "refsectioninfo" -> return mempty -- keywords & other metadata "refsect1info" -> return mempty -- keywords & other metadata @@ -708,10 +746,10 @@ parseBlock (Elem e) = "chapterinfo" -> return mempty -- keywords & other metadata "glossaryinfo" -> return mempty -- keywords & other metadata "appendixinfo" -> return mempty -- keywords & other metadata - "bookinfo" -> getTitle >> getAuthors >> getDate >> return mempty + "bookinfo" -> metaBlock "article" -> modify (\st -> st{ dbBook = False }) >> - getTitle >> getBlocks e - "book" -> modify (\st -> st{ dbBook = True }) >> getTitle >> getBlocks e + getBlocks e + "book" -> modify (\st -> st{ dbBook = True }) >> getBlocks e "table" -> parseTable "informaltable" -> parseTable "literallayout" -> codeBlockWithLang @@ -734,7 +772,7 @@ parseBlock (Elem e) = "" -> [] x -> [x] return $ codeBlockWith (attrValue "id" e, classes', []) - $ trimNl $ strContent e + $ trimNl $ strContentRecursive e parseBlockquote = do attrib <- case filterChild (named "attribution") e of Nothing -> return mempty @@ -743,6 +781,7 @@ parseBlock (Elem e) = contents <- getBlocks e return $ blockQuote (contents <> attrib) listitems = mapM getBlocks $ filterChildren (named "listitem") e + callouts = mapM getBlocks $ filterChildren (named "callout") e deflistitems = mapM parseVarListEntry $ filterChildren (named "varlistentry") e parseVarListEntry e' = do @@ -757,24 +796,19 @@ parseBlock (Elem e) = terms' <- mapM getInlines terms items' <- mapM getBlocks items return (mconcat $ intersperse (str "; ") terms', items') - getTitle = case filterChild (named "title") e of - Just t -> do - tit <- getInlines t - subtit <- case filterChild (named "subtitle") e of - Just s -> (text ": " <>) <$> - getInlines s - Nothing -> return mempty - modify $ \st -> st{dbDocTitle = tit <> subtit} - Nothing -> return () - getAuthors = do - auths <- mapM getInlines - $ filterChildren (named "author") e - modify $ \st -> st{dbDocAuthors = auths} - getDate = case filterChild (named "date") e of - Just t -> do - dat <- getInlines t - modify $ \st -> st{dbDocDate = dat} - Nothing -> return () + getTitle = do + tit <- getInlines e + subtit <- case filterChild (named "subtitle") e of + Just s -> (text ": " <>) <$> + getInlines s + Nothing -> return mempty + addMeta "title" (tit <> subtit) + + getAuthor = (:[]) <$> getInlines e >>= addMeta "author" + getAuthorGroup = do + let terms = filterChildren (named "author") e + mapM getInlines terms >>= addMeta "author" + getDate = getInlines e >>= addMeta "date" parseTable = do let isCaption x = named "title" x || named "caption" x caption <- case filterChild isCaption e of @@ -834,18 +868,31 @@ parseBlock (Elem e) = Nothing -> return mempty modify $ \st -> st{ dbSectionLevel = n } b <- getBlocks e + let ident = attrValue "id" e modify $ \st -> st{ dbSectionLevel = n - 1 } - return $ header n' headerText <> b + return $ headerWith (ident,[],[]) n' headerText <> b + metaBlock = acceptingMetadata (getBlocks e) >> return mempty getInlines :: Element -> DB Inlines getInlines e' = (trimInlines . mconcat) <$> (mapM parseInline $ elContent e') +strContentRecursive :: Element -> String +strContentRecursive = strContent . + (\e' -> e'{ elContent = map elementToStr $ elContent e' }) + +elementToStr :: Content -> Content +elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing +elementToStr x = x + parseInline :: Content -> DB Inlines parseInline (Text (CData _ s _)) = return $ text s parseInline (CRef ref) = return $ maybe (text $ map toUpper ref) (text . (:[])) $ lookupEntity ref parseInline (Elem e) = case qName (elName e) of + "equation" -> equation displayMath + "informalequation" -> equation displayMath + "inlineequation" -> equation math "subscript" -> subscript <$> innerInlines "superscript" -> superscript <$> innerInlines "inlinemediaobject" -> getImage e @@ -860,6 +907,7 @@ parseInline (Elem e) = else doubleQuoted contents "simplelist" -> simpleList "segmentedlist" -> segmentedList + "classname" -> codeWithLang "code" -> codeWithLang "filename" -> codeWithLang "literal" -> codeWithLang @@ -879,6 +927,10 @@ parseInline (Elem e) = "constant" -> codeWithLang "userinput" -> codeWithLang "varargs" -> return $ code "(...)" + "keycap" -> return (str $ strContent e) + "keycombo" -> keycombo <$> (mapM parseInline $ elContent e) + "menuchoice" -> menuchoice <$> (mapM parseInline $ + filter isGuiMenu $ elContent e) "xref" -> return $ str "?" -- so at least you know something is there "email" -> return $ link ("mailto:" ++ strContent e) "" $ str $ strContent e @@ -899,14 +951,26 @@ parseInline (Elem e) = _ -> emph <$> innerInlines "footnote" -> (note . mconcat) <$> (mapM parseBlock $ elContent e) "title" -> return mempty + "affiliation" -> return mempty + -- Note: this isn't a real docbook tag; it's what we convert + -- <?asciidor-br?> to in handleInstructions, above. A kludge to + -- work around xml-light's inability to parse an instruction. + "br" -> return linebreak _ -> innerInlines where innerInlines = (trimInlines . mconcat) <$> (mapM parseInline $ elContent e) + equation constructor = return $ mconcat $ + map (constructor . writeTeX) + $ rights + $ map (readMathML . showElement . everywhere (mkT removePrefix)) + $ filterChildren (\x -> qName (elName x) == "math" && + qPrefix (elName x) == Just "mml") e + removePrefix elname = elname { qPrefix = Nothing } codeWithLang = do let classes' = case attrValue "language" e of "" -> [] l -> [l] - return $ codeWith (attrValue "id" e,classes',[]) $ strContent e + return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e simpleList = (mconcat . intersperse (str "," <> space)) <$> mapM getInlines (filterChildren (named "member") e) segmentedList = do @@ -921,3 +985,10 @@ parseInline (Elem e) = then mempty else strong tit <> linebreak return $ linebreak <> tit' <> segs + keycombo = spanWith ("",["keycombo"],[]) . + mconcat . intersperse (str "+") + menuchoice = spanWith ("",["menuchoice"],[]) . + mconcat . intersperse (text " > ") + isGuiMenu (Elem x) = named "guimenu" x || named "guisubmenu" x || + named "guimenuitem" x + isGuiMenu _ = False diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs new file mode 100644 index 000000000..64eb0322f --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -0,0 +1,549 @@ +{-# LANGUAGE PatternGuards, OverloadedStrings #-} + +{- +Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Conversion of Docx type (defined in Text.Pandoc.Readers.Docx.Parse) +to 'Pandoc' document. -} + +{- +Current state of implementation of Docx entities ([x] means +implemented, [-] means partially implemented): + +* Blocks + + - [X] Para + - [X] CodeBlock (styled with `SourceCode`) + - [X] BlockQuote (styled with `Quote`, `BlockQuote`, or, optionally, + indented) + - [X] OrderedList + - [X] BulletList + - [X] DefinitionList (styled with adjacent `DefinitionTerm` and `Definition`) + - [X] Header (styled with `Heading#`) + - [ ] HorizontalRule + - [-] Table (column widths and alignments not yet implemented) + +* Inlines + + - [X] Str + - [X] Emph (From italics. `underline` currently read as span. In + future, it might optionally be emph as well) + - [X] Strong + - [X] Strikeout + - [X] Superscript + - [X] Subscript + - [X] SmallCaps + - [ ] Quoted + - [ ] Cite + - [X] Code (styled with `VerbatimChar`) + - [X] Space + - [X] LineBreak (these are invisible in Word: entered with Shift-Return) + - [ ] Math + - [X] Link (links to an arbitrary bookmark create a span with the target as + id and "anchor" class) + - [-] Image (Links to path in archive. Future option for + data-encoded URI likely.) + - [X] Note (Footnotes and Endnotes are silently combined.) +-} + +module Text.Pandoc.Readers.Docx + ( readDocx + ) where + +import Codec.Archive.Zip +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Builder +import Text.Pandoc.Walk +import Text.Pandoc.Readers.Docx.Parse +import Text.Pandoc.Readers.Docx.Lists +import Text.Pandoc.Readers.Docx.Reducible +import Text.Pandoc.Shared +import Text.Pandoc.MediaBag (insertMedia, MediaBag) +import Data.List (delete, (\\), intersect) +import Data.Monoid +import Text.TeXMath (writeTeX) +import Data.Default (Default) +import qualified Data.ByteString.Lazy as B +import qualified Data.Map as M +import Control.Monad.Reader +import Control.Monad.State +import Control.Applicative ((<$>)) +import Data.Sequence (ViewL(..), viewl) +import qualified Data.Sequence as Seq (null) + +readDocx :: ReaderOptions + -> B.ByteString + -> (Pandoc, MediaBag) +readDocx opts bytes = + case archiveToDocx (toArchive bytes) of + Right docx -> (Pandoc meta blks, mediaBag) where + (meta, blks, mediaBag) = (docxToOutput opts docx) + Left _ -> error $ "couldn't parse docx file" + +data DState = DState { docxAnchorMap :: M.Map String String + , docxMediaBag :: MediaBag + , docxDropCap :: Inlines + } + +instance Default DState where + def = DState { docxAnchorMap = M.empty + , docxMediaBag = mempty + , docxDropCap = mempty + } + +data DEnv = DEnv { docxOptions :: ReaderOptions + , docxInHeaderBlock :: Bool } + +instance Default DEnv where + def = DEnv def False + +type DocxContext = ReaderT DEnv (State DState) + +evalDocxContext :: DocxContext a -> DEnv -> DState -> a +evalDocxContext ctx env st = evalState (runReaderT ctx env) st + +-- This is empty, but we put it in for future-proofing. +spansToKeep :: [String] +spansToKeep = [] + +divsToKeep :: [String] +divsToKeep = ["list-item", "Definition", "DefinitionTerm"] + +metaStyles :: M.Map String String +metaStyles = M.fromList [ ("Title", "title") + , ("Subtitle", "subtitle") + , ("Author", "author") + , ("Date", "date") + , ("Abstract", "abstract")] + +sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart]) +sepBodyParts = span (\bp -> (isMetaPar bp || isEmptyPar bp)) + +isMetaPar :: BodyPart -> Bool +isMetaPar (Paragraph pPr _) = + not $ null $ intersect (pStyle pPr) (M.keys metaStyles) +isMetaPar _ = False + +isEmptyPar :: BodyPart -> Bool +isEmptyPar (Paragraph _ parParts) = + all isEmptyParPart parParts + where + isEmptyParPart (PlainRun (Run _ runElems)) = all isEmptyElem runElems + isEmptyParPart _ = False + isEmptyElem (TextRun s) = trim s == "" + isEmptyElem _ = True +isEmptyPar _ = False + +bodyPartsToMeta' :: [BodyPart] -> DocxContext (M.Map String MetaValue) +bodyPartsToMeta' [] = return M.empty +bodyPartsToMeta' (bp : bps) + | (Paragraph pPr parParts) <- bp + , (c : _)<- intersect (pStyle pPr) (M.keys metaStyles) + , (Just metaField) <- M.lookup c metaStyles = do + inlines <- concatReduce <$> mapM parPartToInlines parParts + remaining <- bodyPartsToMeta' bps + let + f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils'] + f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks ((Para ils) : blks) + f m (MetaList mv) = MetaList (m : mv) + f m n = MetaList [m, n] + return $ M.insertWith f metaField (MetaInlines (toList inlines)) remaining +bodyPartsToMeta' (_ : bps) = bodyPartsToMeta' bps + +bodyPartsToMeta :: [BodyPart] -> DocxContext Meta +bodyPartsToMeta bps = do + mp <- bodyPartsToMeta' bps + let mp' = + case M.lookup "author" mp of + Just mv -> M.insert "author" (fixAuthors mv) mp + Nothing -> mp + return $ Meta mp' + +fixAuthors :: MetaValue -> MetaValue +fixAuthors (MetaBlocks blks) = + MetaList $ map g $ filter f blks + where f (Para _) = True + f _ = False + g (Para ils) = MetaInlines ils + g _ = MetaInlines [] +fixAuthors mv = mv + +codeStyles :: [String] +codeStyles = ["VerbatimChar"] + +codeDivs :: [String] +codeDivs = ["SourceCode"] + +runElemToInlines :: RunElem -> Inlines +runElemToInlines (TextRun s) = text s +runElemToInlines (LnBrk) = linebreak +runElemToInlines (Tab) = space + +runElemToString :: RunElem -> String +runElemToString (TextRun s) = s +runElemToString (LnBrk) = ['\n'] +runElemToString (Tab) = ['\t'] + +runToString :: Run -> String +runToString (Run _ runElems) = concatMap runElemToString runElems +runToString _ = "" + +parPartToString :: ParPart -> String +parPartToString (PlainRun run) = runToString run +parPartToString (InternalHyperLink _ runs) = concatMap runToString runs +parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs +parPartToString _ = "" + +blacklistedCharStyles :: [String] +blacklistedCharStyles = ["Hyperlink"] + +resolveDependentRunStyle :: RunStyle -> RunStyle +resolveDependentRunStyle rPr + | Just (s, _) <- rStyle rPr, s `elem` blacklistedCharStyles = + rPr + | Just (_, cs) <- rStyle rPr = + let rPr' = resolveDependentRunStyle cs + in + RunStyle { isBold = case isBold rPr of + Just bool -> Just bool + Nothing -> isBold rPr' + , isItalic = case isItalic rPr of + Just bool -> Just bool + Nothing -> isItalic rPr' + , isSmallCaps = case isSmallCaps rPr of + Just bool -> Just bool + Nothing -> isSmallCaps rPr' + , isStrike = case isStrike rPr of + Just bool -> Just bool + Nothing -> isStrike rPr' + , rVertAlign = case rVertAlign rPr of + Just valign -> Just valign + Nothing -> rVertAlign rPr' + , rUnderline = case rUnderline rPr of + Just ulstyle -> Just ulstyle + Nothing -> rUnderline rPr' + , rStyle = rStyle rPr } + | otherwise = rPr + +runStyleToTransform :: RunStyle -> (Inlines -> Inlines) +runStyleToTransform rPr + | Just (s, _) <- rStyle rPr + , s `elem` spansToKeep = + let rPr' = rPr{rStyle = Nothing} + in + (spanWith ("", [s], [])) . (runStyleToTransform rPr') + | Just True <- isItalic rPr = + emph . (runStyleToTransform rPr {isItalic = Nothing}) + | Just True <- isBold rPr = + strong . (runStyleToTransform rPr {isBold = Nothing}) + | Just True <- isSmallCaps rPr = + smallcaps . (runStyleToTransform rPr {isSmallCaps = Nothing}) + | Just True <- isStrike rPr = + strikeout . (runStyleToTransform rPr {isStrike = Nothing}) + | Just SupScrpt <- rVertAlign rPr = + superscript . (runStyleToTransform rPr {rVertAlign = Nothing}) + | Just SubScrpt <- rVertAlign rPr = + subscript . (runStyleToTransform rPr {rVertAlign = Nothing}) + | Just "single" <- rUnderline rPr = + emph . (runStyleToTransform rPr {rUnderline = Nothing}) + | otherwise = id + +runToInlines :: Run -> DocxContext Inlines +runToInlines (Run rs runElems) + | Just (s, _) <- rStyle rs + , s `elem` codeStyles = + return $ code $ concatMap runElemToString runElems + | otherwise = do + let ils = concatReduce (map runElemToInlines runElems) + return $ (runStyleToTransform $ resolveDependentRunStyle rs) ils +runToInlines (Footnote bps) = do + blksList <- concatReduce <$> (mapM bodyPartToBlocks bps) + return $ note blksList +runToInlines (Endnote bps) = do + blksList <- concatReduce <$> (mapM bodyPartToBlocks bps) + return $ note blksList +runToInlines (InlineDrawing fp bs) = do + mediaBag <- gets docxMediaBag + modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag } + return $ image fp "" "" + +parPartToInlines :: ParPart -> DocxContext Inlines +parPartToInlines (PlainRun r) = runToInlines r +parPartToInlines (Insertion _ author date runs) = do + opts <- asks docxOptions + case readerTrackChanges opts of + AcceptChanges -> concatReduce <$> mapM runToInlines runs + RejectChanges -> return mempty + AllChanges -> do + ils <- concatReduce <$> mapM runToInlines runs + let attr = ("", ["insertion"], [("author", author), ("date", date)]) + return $ spanWith attr ils +parPartToInlines (Deletion _ author date runs) = do + opts <- asks docxOptions + case readerTrackChanges opts of + AcceptChanges -> return mempty + RejectChanges -> concatReduce <$> mapM runToInlines runs + AllChanges -> do + ils <- concatReduce <$> mapM runToInlines runs + let attr = ("", ["deletion"], [("author", author), ("date", date)]) + return $ spanWith attr ils +parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = + return mempty +parPartToInlines (BookMark _ anchor) = + -- We record these, so we can make sure not to overwrite + -- user-defined anchor links with header auto ids. + do + -- get whether we're in a header. + inHdrBool <- asks docxInHeaderBlock + -- Get the anchor map. + anchorMap <- gets docxAnchorMap + -- We don't want to rewrite if we're in a header, since we'll take + -- care of that later, when we make the header anchor. If the + -- bookmark were already in uniqueIdent form, this would lead to a + -- duplication. Otherwise, we check to see if the id is already in + -- there. Rewrite if necessary. This will have the possible effect + -- of rewriting user-defined anchor links. However, since these + -- are not defined in pandoc, it seems like a necessary evil to + -- avoid an extra pass. + let newAnchor = + if not inHdrBool && anchor `elem` (M.elems anchorMap) + then uniqueIdent [Str anchor] (M.elems anchorMap) + else anchor + unless inHdrBool + (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap}) + return $ spanWith (newAnchor, ["anchor"], []) mempty +parPartToInlines (Drawing fp bs) = do + mediaBag <- gets docxMediaBag + modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag } + return $ image fp "" "" +parPartToInlines (InternalHyperLink anchor runs) = do + ils <- concatReduce <$> mapM runToInlines runs + return $ link ('#' : anchor) "" ils +parPartToInlines (ExternalHyperLink target runs) = do + ils <- concatReduce <$> mapM runToInlines runs + return $ link target "" ils +parPartToInlines (PlainOMath exps) = do + return $ math $ writeTeX exps + +isAnchorSpan :: Inline -> Bool +isAnchorSpan (Span (_, classes, kvs) ils) = + classes == ["anchor"] && + null kvs && + null ils +isAnchorSpan _ = False + +dummyAnchors :: [String] +dummyAnchors = ["_GoBack"] + +makeHeaderAnchor :: Blocks -> DocxContext Blocks +makeHeaderAnchor bs = case viewl $ unMany bs of + (x :< xs) -> do + x' <- (makeHeaderAnchor' x) + xs' <- (makeHeaderAnchor $ Many xs) + return $ (singleton x') <> xs' + EmptyL -> return mempty + +makeHeaderAnchor' :: Block -> DocxContext Block +-- If there is an anchor already there (an anchor span in the header, +-- to be exact), we rename and associate the new id with the old one. +makeHeaderAnchor' (Header n (_, classes, kvs) ils) + | (c:cs) <- filter isAnchorSpan ils + , (Span (ident, ["anchor"], _) _) <- c = do + hdrIDMap <- gets docxAnchorMap + let newIdent = uniqueIdent ils (M.elems hdrIDMap) + modify $ \s -> s {docxAnchorMap = M.insert ident newIdent hdrIDMap} + return $ Header n (newIdent, classes, kvs) (ils \\ (c:cs)) +-- Otherwise we just give it a name, and register that name (associate +-- it with itself.) +makeHeaderAnchor' (Header n (_, classes, kvs) ils) = + do + hdrIDMap <- gets docxAnchorMap + let newIdent = uniqueIdent ils (M.elems hdrIDMap) + modify $ \s -> s {docxAnchorMap = M.insert newIdent newIdent hdrIDMap} + return $ Header n (newIdent, classes, kvs) ils +makeHeaderAnchor' blk = return blk + +-- Rewrite a standalone paragraph block as a plain +singleParaToPlain :: Blocks -> Blocks +singleParaToPlain blks + | (Para (ils) :< seeq) <- viewl $ unMany blks + , Seq.null seeq = + singleton $ Plain ils +singleParaToPlain blks = blks + +cellToBlocks :: Cell -> DocxContext Blocks +cellToBlocks (Cell bps) = concatReduce <$> mapM bodyPartToBlocks bps + +rowToBlocksList :: Row -> DocxContext [Blocks] +rowToBlocksList (Row cells) = do + blksList <- mapM cellToBlocks cells + return $ map singleParaToPlain blksList + +trimLineBreaks :: [Inline] -> [Inline] +trimLineBreaks [] = [] +trimLineBreaks (LineBreak : ils) = trimLineBreaks ils +trimLineBreaks ils + | (LineBreak : ils') <- reverse ils = trimLineBreaks (reverse ils') +trimLineBreaks ils = ils + +parStyleToTransform :: ParagraphStyle -> (Blocks -> Blocks) +parStyleToTransform pPr + | (c:cs) <- pStyle pPr + , c `elem` divsToKeep = + let pPr' = pPr { pStyle = cs } + in + (divWith ("", [c], [])) . (parStyleToTransform pPr') + | (c:cs) <- pStyle pPr, + c `elem` listParagraphDivs = + let pPr' = pPr { pStyle = cs, indentation = Nothing} + in + (divWith ("", [c], [])) . (parStyleToTransform pPr') + | (_:cs) <- pStyle pPr + , Just True <- pBlockQuote pPr = + let pPr' = pPr { pStyle = cs } + in + blockQuote . (parStyleToTransform pPr') + | (_:cs) <- pStyle pPr = + let pPr' = pPr { pStyle = cs} + in + parStyleToTransform pPr' + | null (pStyle pPr) + , Just left <- indentation pPr >>= leftParIndent + , Just hang <- indentation pPr >>= hangingParIndent = + let pPr' = pPr { indentation = Nothing } + in + case (left - hang) > 0 of + True -> blockQuote . (parStyleToTransform pPr') + False -> parStyleToTransform pPr' + | null (pStyle pPr), + Just left <- indentation pPr >>= leftParIndent = + let pPr' = pPr { indentation = Nothing } + in + case left > 0 of + True -> blockQuote . (parStyleToTransform pPr') + False -> parStyleToTransform pPr' +parStyleToTransform _ = id + +bodyPartToBlocks :: BodyPart -> DocxContext Blocks +bodyPartToBlocks (Paragraph pPr parparts) + | not $ null $ codeDivs `intersect` (pStyle pPr) = + return + $ parStyleToTransform pPr + $ codeBlock + $ concatMap parPartToString parparts + | Just (style, n) <- pHeading pPr = do + ils <- local (\s-> s{docxInHeaderBlock=True}) $ + (concatReduce <$> mapM parPartToInlines parparts) + makeHeaderAnchor $ + headerWith ("", delete style (pStyle pPr), []) n ils + | otherwise = do + ils <- concatReduce <$> mapM parPartToInlines parparts >>= + (return . fromList . trimLineBreaks . normalizeSpaces . toList) + dropIls <- gets docxDropCap + let ils' = dropIls <> ils + if dropCap pPr + then do modify $ \s -> s { docxDropCap = ils' } + return mempty + else do modify $ \s -> s { docxDropCap = mempty } + return $ case isNull ils' of + True -> mempty + _ -> parStyleToTransform pPr $ para ils' +bodyPartToBlocks (ListItem pPr numId lvl levelInfo parparts) = do + let + kvs = case levelInfo of + (_, fmt, txt, Just start) -> [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + , ("start", (show start)) + ] + + (_, fmt, txt, Nothing) -> [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + ] + blks <- bodyPartToBlocks (Paragraph pPr parparts) + return $ divWith ("", ["list-item"], kvs) blks +bodyPartToBlocks (Tbl _ _ _ []) = + return $ para mempty +bodyPartToBlocks (Tbl cap _ look (r:rs)) = do + let caption = text cap + (hdr, rows) = case firstRowFormatting look of + True -> (Just r, rs) + False -> (Nothing, r:rs) + hdrCells <- case hdr of + Just r' -> rowToBlocksList r' + Nothing -> return [] + + cells <- mapM rowToBlocksList rows + + let size = case null hdrCells of + True -> length $ head cells + False -> length $ hdrCells + -- + -- The two following variables (horizontal column alignment and + -- relative column widths) go to the default at the + -- moment. Width information is in the TblGrid field of the Tbl, + -- so should be possible. Alignment might be more difficult, + -- since there doesn't seem to be a column entity in docx. + alignments = replicate size AlignDefault + widths = replicate size 0 :: [Double] + + return $ table caption (zip alignments widths) hdrCells cells +bodyPartToBlocks (OMathPara e) = do + return $ para $ displayMath (writeTeX e) + + +-- replace targets with generated anchors. +rewriteLink' :: Inline -> DocxContext Inline +rewriteLink' l@(Link ils ('#':target, title)) = do + anchorMap <- gets docxAnchorMap + return $ case M.lookup target anchorMap of + Just newTarget -> (Link ils ('#':newTarget, title)) + Nothing -> l +rewriteLink' il = return il + +rewriteLinks :: [Block] -> DocxContext [Block] +rewriteLinks = mapM (walkM rewriteLink') + +bodyToOutput :: Body -> DocxContext (Meta, [Block], MediaBag) +bodyToOutput (Body bps) = do + let (metabps, blkbps) = sepBodyParts bps + meta <- bodyPartsToMeta metabps + blks <- concatReduce <$> mapM bodyPartToBlocks blkbps + blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks + mediaBag <- gets docxMediaBag + return $ (meta, + blks', + mediaBag) + +docxToOutput :: ReaderOptions -> Docx -> (Meta, [Block], MediaBag) +docxToOutput opts (Docx (Document _ body)) = + let dEnv = def { docxOptions = opts} in + evalDocxContext (bodyToOutput body) dEnv def diff --git a/src/Text/Pandoc/Readers/Docx/Fonts.hs b/src/Text/Pandoc/Readers/Docx/Fonts.hs new file mode 100644 index 000000000..b44c71412 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Fonts.hs @@ -0,0 +1,238 @@ +{- +Copyright (C) 2014 Matthew Pickering <matthewtpickering@gmail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Fonts + Copyright : Copyright (C) 2014 Matthew Pickering + License : GNU GPL, version 2 or above + + Maintainer : Matthew Pickering <matthewtpickering@gmail.com> + Stability : alpha + Portability : portable + +Utilities to convert between font codepoints and unicode characters. +-} +module Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) where + + +-- | Enumeration of recognised fonts +data Font = Symbol -- ^ <http://en.wikipedia.org/wiki/Symbol_(typeface) Adobe Symbol> + deriving (Show, Eq) + +-- | Given a font and codepoint, returns the corresponding unicode +-- character +getUnicode :: Font -> Char -> Maybe Char +getUnicode Symbol c = lookup c symbol + +-- Generated from lib/fonts/symbol.txt +symbol :: [(Char, Char)] +symbol = + [ (' ',' ') + , (' ','\160') + , ('!','!') + , ('"','\8704') + , ('#','#') + , ('$','\8707') + , ('%','%') + , ('&','&') + , ('\'','\8715') + , ('(','(') + , (')',')') + , ('*','\8727') + , ('+','+') + , (',',',') + , ('-','\8722') + , ('.','.') + , ('/','/') + , ('0','0') + , ('1','1') + , ('2','2') + , ('3','3') + , ('4','4') + , ('5','5') + , ('6','6') + , ('7','7') + , ('8','8') + , ('9','9') + , (':',':') + , (';',';') + , ('<','<') + , ('=','=') + , ('>','>') + , ('?','?') + , ('@','\8773') + , ('A','\913') + , ('B','\914') + , ('C','\935') + , ('D','\916') + , ('D','\8710') + , ('E','\917') + , ('F','\934') + , ('G','\915') + , ('H','\919') + , ('I','\921') + , ('J','\977') + , ('K','\922') + , ('L','\923') + , ('M','\924') + , ('N','\925') + , ('O','\927') + , ('P','\928') + , ('Q','\920') + , ('R','\929') + , ('S','\931') + , ('T','\932') + , ('U','\933') + , ('V','\962') + , ('W','\937') + , ('W','\8486') + , ('X','\926') + , ('Y','\936') + , ('Z','\918') + , ('[','[') + , ('\\','\8756') + , (']',']') + , ('^','\8869') + , ('_','_') + , ('`','\63717') + , ('a','\945') + , ('b','\946') + , ('c','\967') + , ('d','\948') + , ('e','\949') + , ('f','\966') + , ('g','\947') + , ('h','\951') + , ('i','\953') + , ('j','\981') + , ('k','\954') + , ('l','\955') + , ('m','\181') + , ('m','\956') + , ('n','\957') + , ('o','\959') + , ('p','\960') + , ('q','\952') + , ('r','\961') + , ('s','\963') + , ('t','\964') + , ('u','\965') + , ('v','\982') + , ('w','\969') + , ('x','\958') + , ('y','\968') + , ('z','\950') + , ('{','{') + , ('|','|') + , ('}','}') + , ('~','\8764') + , ('\160','\8364') + , ('\161','\978') + , ('\162','\8242') + , ('\163','\8804') + , ('\164','\8260') + , ('\164','\8725') + , ('\165','\8734') + , ('\166','\402') + , ('\167','\9827') + , ('\168','\9830') + , ('\169','\9829') + , ('\170','\9824') + , ('\171','\8596') + , ('\172','\8592') + , ('\173','\8593') + , ('\174','\8594') + , ('\175','\8595') + , ('\176','\176') + , ('\177','\177') + , ('\178','\8243') + , ('\179','\8805') + , ('\180','\215') + , ('\181','\8733') + , ('\182','\8706') + , ('\183','\8226') + , ('\184','\247') + , ('\185','\8800') + , ('\186','\8801') + , ('\187','\8776') + , ('\188','\8230') + , ('\189','\63718') + , ('\190','\63719') + , ('\191','\8629') + , ('\192','\8501') + , ('\193','\8465') + , ('\194','\8476') + , ('\195','\8472') + , ('\196','\8855') + , ('\197','\8853') + , ('\198','\8709') + , ('\199','\8745') + , ('\200','\8746') + , ('\201','\8835') + , ('\202','\8839') + , ('\203','\8836') + , ('\204','\8834') + , ('\205','\8838') + , ('\206','\8712') + , ('\207','\8713') + , ('\208','\8736') + , ('\209','\8711') + , ('\210','\63194') + , ('\211','\63193') + , ('\212','\63195') + , ('\213','\8719') + , ('\214','\8730') + , ('\215','\8901') + , ('\216','\172') + , ('\217','\8743') + , ('\218','\8744') + , ('\219','\8660') + , ('\220','\8656') + , ('\221','\8657') + , ('\222','\8658') + , ('\223','\8659') + , ('\224','\9674') + , ('\225','\9001') + , ('\226','\63720') + , ('\227','\63721') + , ('\228','\63722') + , ('\229','\8721') + , ('\230','\63723') + , ('\231','\63724') + , ('\232','\63725') + , ('\233','\63726') + , ('\234','\63727') + , ('\235','\63728') + , ('\236','\63729') + , ('\237','\63730') + , ('\238','\63731') + , ('\239','\63732') + , ('\241','\9002') + , ('\242','\8747') + , ('\243','\8992') + , ('\244','\63733') + , ('\245','\8993') + , ('\246','\63734') + , ('\247','\63735') + , ('\248','\63736') + , ('\249','\63737') + , ('\250','\63738') + , ('\251','\63739') + , ('\252','\63740') + , ('\253','\63741') + , ('\254','\63742')] diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs new file mode 100644 index 000000000..c265ad074 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Lists.hs @@ -0,0 +1,229 @@ +{- +Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Lists + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Functions for converting flat docx paragraphs into nested lists. +-} + +module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets + , blocksToDefinitions + , listParagraphDivs + ) where + +import Text.Pandoc.JSON +import Text.Pandoc.Generic (bottomUp) +import Text.Pandoc.Shared (trim) +import Control.Monad +import Data.List +import Data.Maybe + +isListItem :: Block -> Bool +isListItem (Div (_, classes, _) _) | "list-item" `elem` classes = True +isListItem _ = False + +getLevel :: Block -> Maybe Integer +getLevel (Div (_, _, kvs) _) = liftM read $ lookup "level" kvs +getLevel _ = Nothing + +getLevelN :: Block -> Integer +getLevelN b = case getLevel b of + Just n -> n + Nothing -> -1 + +getNumId :: Block -> Maybe Integer +getNumId (Div (_, _, kvs) _) = liftM read $ lookup "num-id" kvs +getNumId _ = Nothing + +getNumIdN :: Block -> Integer +getNumIdN b = case getNumId b of + Just n -> n + Nothing -> -1 + +getText :: Block -> Maybe String +getText (Div (_, _, kvs) _) = lookup "text" kvs +getText _ = Nothing + +data ListType = Itemized | Enumerated ListAttributes + +listStyleMap :: [(String, ListNumberStyle)] +listStyleMap = [("upperLetter", UpperAlpha), + ("lowerLetter", LowerAlpha), + ("upperRoman", UpperRoman), + ("lowerRoman", LowerRoman), + ("decimal", Decimal)] + +listDelimMap :: [(String, ListNumberDelim)] +listDelimMap = [("%1)", OneParen), + ("(%1)", TwoParens), + ("%1.", Period)] + +getListType :: Block -> Maybe ListType +getListType b@(Div (_, _, kvs) _) | isListItem b = + let + start = lookup "start" kvs + frmt = lookup "format" kvs + txt = lookup "text" kvs + in + case frmt of + Just "bullet" -> Just Itemized + Just f -> + case txt of + Just t -> Just $ Enumerated ( + read (fromMaybe "1" start) :: Int, + fromMaybe DefaultStyle (lookup f listStyleMap), + fromMaybe DefaultDelim (lookup t listDelimMap)) + Nothing -> Nothing + _ -> Nothing +getListType _ = Nothing + +listParagraphDivs :: [String] +listParagraphDivs = ["ListParagraph"] + +-- This is a first stab at going through and attaching meaning to list +-- paragraphs, without an item marker, following a list item. We +-- assume that these are paragraphs in the same item. + +handleListParagraphs :: [Block] -> [Block] +handleListParagraphs [] = [] +handleListParagraphs ( + (Div attr1@(_, classes1, _) blks1) : + (Div (ident2, classes2, kvs2) blks2) : + blks + ) | "list-item" `elem` classes1 && + not ("list-item" `elem` classes2) && + (not . null) (listParagraphDivs `intersect` classes2) = + -- We don't want to keep this indent. + let newDiv2 = + (Div (ident2, classes2, filter (\kv -> fst kv /= "indent") kvs2) blks2) + in + handleListParagraphs ((Div attr1 (blks1 ++ [newDiv2])) : blks) +handleListParagraphs (blk:blks) = blk : (handleListParagraphs blks) + +separateBlocks' :: Block -> [[Block]] -> [[Block]] +separateBlocks' blk ([] : []) = [[blk]] +separateBlocks' b@(BulletList _) acc = (init acc) ++ [(last acc) ++ [b]] +separateBlocks' b@(OrderedList _ _) acc = (init acc) ++ [(last acc) ++ [b]] +-- The following is for the invisible bullet lists. This is how +-- pandoc-generated ooxml does multiparagraph item lists. +separateBlocks' b acc | liftM trim (getText b) == Just "" = + (init acc) ++ [(last acc) ++ [b]] +separateBlocks' b acc = acc ++ [[b]] + +separateBlocks :: [Block] -> [[Block]] +separateBlocks blks = foldr separateBlocks' [[]] (reverse blks) + +flatToBullets' :: Integer -> [Block] -> [Block] +flatToBullets' _ [] = [] +flatToBullets' num xs@(b : elems) + | getLevelN b == num = b : (flatToBullets' num elems) + | otherwise = + let bNumId = getNumIdN b + bLevel = getLevelN b + (children, remaining) = + span + (\b' -> + ((getLevelN b') > bLevel || + ((getLevelN b') == bLevel && (getNumIdN b') == bNumId))) + xs + in + case getListType b of + Just (Enumerated attr) -> + (OrderedList attr (separateBlocks $ flatToBullets' bLevel children)) : + (flatToBullets' num remaining) + _ -> + (BulletList (separateBlocks $ flatToBullets' bLevel children)) : + (flatToBullets' num remaining) + +flatToBullets :: [Block] -> [Block] +flatToBullets elems = flatToBullets' (-1) elems + +singleItemHeaderToHeader :: Block -> Block +singleItemHeaderToHeader (OrderedList _ [[h@(Header _ _ _)]]) = h +singleItemHeaderToHeader blk = blk + + +blocksToBullets :: [Block] -> [Block] +blocksToBullets blks = + map singleItemHeaderToHeader $ + bottomUp removeListDivs $ + flatToBullets $ (handleListParagraphs blks) + +plainParaInlines :: Block -> [Inline] +plainParaInlines (Plain ils) = ils +plainParaInlines (Para ils) = ils +plainParaInlines _ = [] + +blocksToDefinitions' :: [([Inline], [[Block]])] -> [Block] -> [Block] -> [Block] +blocksToDefinitions' [] acc [] = reverse acc +blocksToDefinitions' defAcc acc [] = + reverse $ (DefinitionList (reverse defAcc)) : acc +blocksToDefinitions' defAcc acc + ((Div (_, classes1, _) blks1) : (Div (ident2, classes2, kvs2) blks2) : blks) + | "DefinitionTerm" `elem` classes1 && "Definition" `elem` classes2 = + let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) + pair = case remainingAttr2 == ("", [], []) of + True -> (concatMap plainParaInlines blks1, [blks2]) + False -> (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]]) + in + blocksToDefinitions' (pair : defAcc) acc blks +blocksToDefinitions' defAcc acc + ((Div (ident2, classes2, kvs2) blks2) : blks) + | (not . null) defAcc && "Definition" `elem` classes2 = + let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) + defItems2 = case remainingAttr2 == ("", [], []) of + True -> blks2 + False -> [Div remainingAttr2 blks2] + ((defTerm, defItems):defs) = defAcc + defAcc' = case null defItems of + True -> (defTerm, [defItems2]) : defs + False -> (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs + in + blocksToDefinitions' defAcc' acc blks +blocksToDefinitions' [] acc (b:blks) = + blocksToDefinitions' [] (b:acc) blks +blocksToDefinitions' defAcc acc (b:blks) = + blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks + +removeListDivs' :: Block -> [Block] +removeListDivs' (Div (ident, classes, kvs) blks) + | "list-item" `elem` classes = + case delete "list-item" classes of + [] -> blks + classes' -> [Div (ident, classes', kvs) $ blks] +removeListDivs' (Div (ident, classes, kvs) blks) + | not $ null $ listParagraphDivs `intersect` classes = + case classes \\ listParagraphDivs of + [] -> blks + classes' -> [Div (ident, classes', kvs) blks] +removeListDivs' blk = [blk] + +removeListDivs :: [Block] -> [Block] +removeListDivs = concatMap removeListDivs' + + + +blocksToDefinitions :: [Block] -> [Block] +blocksToDefinitions = blocksToDefinitions' [] [] diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs new file mode 100644 index 000000000..5fd6b7a81 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -0,0 +1,911 @@ +{-# LANGUAGE PatternGuards, ViewPatterns, FlexibleInstances #-} + +{- +Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Parse + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Conversion of docx archive into Docx haskell type +-} + +module Text.Pandoc.Readers.Docx.Parse ( Docx(..) + , Document(..) + , Body(..) + , BodyPart(..) + , TblLook(..) + , ParPart(..) + , Run(..) + , RunElem(..) + , Notes + , Numbering + , Relationship + , Media + , RunStyle(..) + , VertAlign(..) + , ParIndentation(..) + , ParagraphStyle(..) + , Row(..) + , Cell(..) + , archiveToDocx + ) where +import Codec.Archive.Zip +import Text.XML.Light +import Data.Maybe +import Data.List +import System.FilePath +import Data.Bits ((.|.)) +import qualified Data.ByteString.Lazy as B +import qualified Text.Pandoc.UTF8 as UTF8 +import Control.Monad.Reader +import Control.Applicative ((<$>), (<|>)) +import qualified Data.Map as M +import Text.Pandoc.Compat.Except +import Text.TeXMath.Readers.OMML (readOMML) +import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) +import Text.TeXMath (Exp) +import Data.Char (readLitChar, ord, chr, isDigit) + +data ReaderEnv = ReaderEnv { envNotes :: Notes + , envNumbering :: Numbering + , envRelationships :: [Relationship] + , envMedia :: Media + , envFont :: Maybe Font + , envCharStyles :: CharStyleMap + , envParStyles :: ParStyleMap + } + deriving Show + +data DocxError = DocxError | WrongElem + deriving Show + +instance Error DocxError where + noMsg = WrongElem + +type D = ExceptT DocxError (Reader ReaderEnv) + +runD :: D a -> ReaderEnv -> Either DocxError a +runD dx re = runReader (runExceptT dx ) re + +maybeToD :: Maybe a -> D a +maybeToD (Just a) = return a +maybeToD Nothing = throwError DocxError + +eitherToD :: Either a b -> D b +eitherToD (Right b) = return b +eitherToD (Left _) = throwError DocxError + +concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b] +concatMapM f xs = liftM concat (mapM f xs) + + +-- This is similar to `mapMaybe`: it maps a function returning the D +-- monad over a list, and only keeps the non-erroring return values. +mapD :: (a -> D b) -> [a] -> D [b] +mapD f xs = + let handler x = (f x >>= (\y-> return [y])) `catchError` (\_ -> return []) + in + concatMapM handler xs + +type NameSpaces = [(String, String)] + +data Docx = Docx Document + deriving Show + +data Document = Document NameSpaces Body + deriving Show + +data Body = Body [BodyPart] + deriving Show + +type Media = [(FilePath, B.ByteString)] + +type CharStyle = (String, RunStyle) + +type ParStyle = (String, ParStyleData) + +type CharStyleMap = M.Map String RunStyle + +type ParStyleMap = M.Map String ParStyleData + +data Numbering = Numbering NameSpaces [Numb] [AbstractNumb] + deriving Show + +data Numb = Numb String String -- right now, only a key to an abstract num + deriving Show + +data AbstractNumb = AbstractNumb String [Level] + deriving Show + +-- (ilvl, format, string, start) +type Level = (String, String, String, Maybe Integer) + +data Relationship = Relationship (RelId, Target) + deriving Show + +data Notes = Notes NameSpaces + (Maybe (M.Map String Element)) + (Maybe (M.Map String Element)) + deriving Show + +data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer + , rightParIndent :: Maybe Integer + , hangingParIndent :: Maybe Integer} + deriving Show + +data ParagraphStyle = ParagraphStyle { pStyle :: [String] + , indentation :: Maybe ParIndentation + , dropCap :: Bool + , pHeading :: Maybe (String, Int) + , pBlockQuote :: Maybe Bool + } + deriving Show + +defaultParagraphStyle :: ParagraphStyle +defaultParagraphStyle = ParagraphStyle { pStyle = [] + , indentation = Nothing + , dropCap = False + , pHeading = Nothing + , pBlockQuote = Nothing + } + + +data BodyPart = Paragraph ParagraphStyle [ParPart] + | ListItem ParagraphStyle String String Level [ParPart] + | Tbl String TblGrid TblLook [Row] + | OMathPara [Exp] + deriving Show + +type TblGrid = [Integer] + +data TblLook = TblLook {firstRowFormatting::Bool} + deriving Show + +defaultTblLook :: TblLook +defaultTblLook = TblLook{firstRowFormatting = False} + +data Row = Row [Cell] + deriving Show + +data Cell = Cell [BodyPart] + deriving Show + +data ParPart = PlainRun Run + | Insertion ChangeId Author ChangeDate [Run] + | Deletion ChangeId Author ChangeDate [Run] + | BookMark BookMarkId Anchor + | InternalHyperLink Anchor [Run] + | ExternalHyperLink URL [Run] + | Drawing FilePath B.ByteString + | PlainOMath [Exp] + deriving Show + +data Run = Run RunStyle [RunElem] + | Footnote [BodyPart] + | Endnote [BodyPart] + | InlineDrawing FilePath B.ByteString + deriving Show + +data RunElem = TextRun String | LnBrk | Tab + deriving Show + +data VertAlign = BaseLn | SupScrpt | SubScrpt + deriving Show + +data RunStyle = RunStyle { isBold :: Maybe Bool + , isItalic :: Maybe Bool + , isSmallCaps :: Maybe Bool + , isStrike :: Maybe Bool + , rVertAlign :: Maybe VertAlign + , rUnderline :: Maybe String + , rStyle :: Maybe CharStyle} + deriving Show + +data ParStyleData = ParStyleData { headingLev :: Maybe (String, Int) + , isBlockQuote :: Maybe Bool + , psStyle :: Maybe ParStyle} + deriving Show + +defaultRunStyle :: RunStyle +defaultRunStyle = RunStyle { isBold = Nothing + , isItalic = Nothing + , isSmallCaps = Nothing + , isStrike = Nothing + , rVertAlign = Nothing + , rUnderline = Nothing + , rStyle = Nothing} + + +type Target = String +type Anchor = String +type URL = String +type BookMarkId = String +type RelId = String +type ChangeId = String +type Author = String +type ChangeDate = String + +attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) +attrToNSPair _ = Nothing + +archiveToDocx :: Archive -> Either DocxError Docx +archiveToDocx archive = do + let notes = archiveToNotes archive + numbering = archiveToNumbering archive + rels = archiveToRelationships archive + media = archiveToMedia archive + (styles, parstyles) = archiveToStyles archive + rEnv = ReaderEnv notes numbering rels media Nothing styles parstyles + doc <- runD (archiveToDocument archive) rEnv + return $ Docx doc + + +archiveToDocument :: Archive -> D Document +archiveToDocument zf = do + entry <- maybeToD $ findEntryByPath "word/document.xml" zf + docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + let namespaces = mapMaybe attrToNSPair (elAttribs docElem) + bodyElem <- maybeToD $ findChild (elemName namespaces "w" "body") docElem + body <- elemToBody namespaces bodyElem + return $ Document namespaces body + +elemToBody :: NameSpaces -> Element -> D Body +elemToBody ns element | isElem ns "w" "body" element = + mapD (elemToBodyPart ns) (elChildren element) >>= + (\bps -> return $ Body bps) +elemToBody _ _ = throwError WrongElem + +archiveToStyles :: Archive -> (CharStyleMap, ParStyleMap) +archiveToStyles zf = + let stylesElem = findEntryByPath "word/styles.xml" zf >>= + (parseXMLDoc . UTF8.toStringLazy . fromEntry) + in + case stylesElem of + Nothing -> (M.empty, M.empty) + Just styElem -> + let namespaces = mapMaybe attrToNSPair (elAttribs styElem) + in + ( M.fromList $ buildBasedOnList namespaces styElem + (Nothing :: Maybe CharStyle), + M.fromList $ buildBasedOnList namespaces styElem + (Nothing :: Maybe ParStyle) ) + +isBasedOnStyle :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> Bool +isBasedOnStyle ns element parentStyle + | isElem ns "w" "style" element + , Just styleType <- findAttr (elemName ns "w" "type") element + , styleType == cStyleType parentStyle + , Just basedOnVal <- findChild (elemName ns "w" "basedOn") element >>= + findAttr (elemName ns "w" "val") + , Just ps <- parentStyle = (basedOnVal == getStyleId ps) + | isElem ns "w" "style" element + , Just styleType <- findAttr (elemName ns "w" "type") element + , styleType == cStyleType parentStyle + , Nothing <- findChild (elemName ns "w" "basedOn") element + , Nothing <- parentStyle = True + | otherwise = False + +class ElemToStyle a where + cStyleType :: Maybe a -> String + elemToStyle :: NameSpaces -> Element -> Maybe a -> Maybe a + getStyleId :: a -> String + +instance ElemToStyle CharStyle where + cStyleType _ = "character" + elemToStyle ns element parentStyle + | isElem ns "w" "style" element + , Just "character" <- findAttr (elemName ns "w" "type") element + , Just styleId <- findAttr (elemName ns "w" "styleId") element = + Just (styleId, elemToRunStyle ns element parentStyle) + | otherwise = Nothing + getStyleId s = fst s + +instance ElemToStyle ParStyle where + cStyleType _ = "paragraph" + elemToStyle ns element parentStyle + | isElem ns "w" "style" element + , Just "paragraph" <- findAttr (elemName ns "w" "type") element + , Just styleId <- findAttr (elemName ns "w" "styleId") element = + Just (styleId, elemToParStyleData ns element parentStyle) + | otherwise = Nothing + getStyleId s = fst s + +getStyleChildren :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> [a] +getStyleChildren ns element parentStyle + | isElem ns "w" "styles" element = + mapMaybe (\e -> elemToStyle ns e parentStyle) $ + filterChildren (\e' -> isBasedOnStyle ns e' parentStyle) element + | otherwise = [] + +buildBasedOnList :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> [a] +buildBasedOnList ns element rootStyle = + case (getStyleChildren ns element rootStyle) of + [] -> [] + stys -> stys ++ + (concatMap (\s -> buildBasedOnList ns element (Just s)) stys) + +archiveToNotes :: Archive -> Notes +archiveToNotes zf = + let fnElem = findEntryByPath "word/footnotes.xml" zf + >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + enElem = findEntryByPath "word/endnotes.xml" zf + >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + fn_namespaces = case fnElem of + Just e -> mapMaybe attrToNSPair (elAttribs e) + Nothing -> [] + en_namespaces = case enElem of + Just e -> mapMaybe attrToNSPair (elAttribs e) + Nothing -> [] + ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces + fn = fnElem >>= (elemToNotes ns "footnote") + en = enElem >>= (elemToNotes ns "endnote") + in + Notes ns fn en + +filePathIsRel :: FilePath -> Bool +filePathIsRel fp = + let (dir, name) = splitFileName fp + in + (dir == "word/_rels/") && ((takeExtension name) == ".rels") + +relElemToRelationship :: Element -> Maybe Relationship +relElemToRelationship element | qName (elName element) == "Relationship" = + do + relId <- findAttr (QName "Id" Nothing Nothing) element + target <- findAttr (QName "Target" Nothing Nothing) element + return $ Relationship (relId, target) +relElemToRelationship _ = Nothing + + +archiveToRelationships :: Archive -> [Relationship] +archiveToRelationships archive = + let relPaths = filter filePathIsRel (filesInArchive archive) + entries = mapMaybe (\f -> findEntryByPath f archive) relPaths + relElems = mapMaybe (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries + rels = mapMaybe relElemToRelationship $ concatMap elChildren relElems + in + rels + +filePathIsMedia :: FilePath -> Bool +filePathIsMedia fp = + let (dir, _) = splitFileName fp + in + (dir == "word/media/") + +getMediaPair :: Archive -> FilePath -> Maybe (FilePath, B.ByteString) +getMediaPair zf fp = + case findEntryByPath fp zf of + Just e -> Just (fp, fromEntry e) + Nothing -> Nothing + +archiveToMedia :: Archive -> Media +archiveToMedia zf = + mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf)) + +lookupLevel :: String -> String -> Numbering -> Maybe Level +lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do + absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs + lvls <- lookup absNumId $ map (\(AbstractNumb aid ls) -> (aid, ls)) absNumbs + lvl <- lookup ilvl $ map (\l@(i, _, _, _) -> (i, l)) lvls + return lvl + +numElemToNum :: NameSpaces -> Element -> Maybe Numb +numElemToNum ns element | + qName (elName element) == "num" && + qURI (elName element) == (lookup "w" ns) = do + numId <- findAttr (QName "numId" (lookup "w" ns) (Just "w")) element + absNumId <- findChild (QName "abstractNumId" (lookup "w" ns) (Just "w")) element + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + return $ Numb numId absNumId +numElemToNum _ _ = Nothing + +absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb +absNumElemToAbsNum ns element | + qName (elName element) == "abstractNum" && + qURI (elName element) == (lookup "w" ns) = do + absNumId <- findAttr + (QName "abstractNumId" (lookup "w" ns) (Just "w")) + element + let levelElems = findChildren + (QName "lvl" (lookup "w" ns) (Just "w")) + element + levels = mapMaybe (levelElemToLevel ns) levelElems + return $ AbstractNumb absNumId levels +absNumElemToAbsNum _ _ = Nothing + +levelElemToLevel :: NameSpaces -> Element -> Maybe Level +levelElemToLevel ns element | + qName (elName element) == "lvl" && + qURI (elName element) == (lookup "w" ns) = do + ilvl <- findAttr (QName "ilvl" (lookup "w" ns) (Just "w")) element + fmt <- findChild (QName "numFmt" (lookup "w" ns) (Just "w")) element + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + txt <- findChild (QName "lvlText" (lookup "w" ns) (Just "w")) element + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + let start = findChild (QName "start" (lookup "w" ns) (Just "w")) element + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) + return (ilvl, fmt, txt, start) +levelElemToLevel _ _ = Nothing + +archiveToNumbering' :: Archive -> Maybe Numbering +archiveToNumbering' zf = do + case findEntryByPath "word/numbering.xml" zf of + Nothing -> Just $ Numbering [] [] [] + Just entry -> do + numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + let namespaces = mapMaybe attrToNSPair (elAttribs numberingElem) + numElems = findChildren + (QName "num" (lookup "w" namespaces) (Just "w")) + numberingElem + absNumElems = findChildren + (QName "abstractNum" (lookup "w" namespaces) (Just "w")) + numberingElem + nums = mapMaybe (numElemToNum namespaces) numElems + absNums = mapMaybe (absNumElemToAbsNum namespaces) absNumElems + return $ Numbering namespaces nums absNums + +archiveToNumbering :: Archive -> Numbering +archiveToNumbering archive = + fromMaybe (Numbering [] [] []) (archiveToNumbering' archive) + +elemToNotes :: NameSpaces -> String -> Element -> Maybe (M.Map String Element) +elemToNotes ns notetype element + | isElem ns "w" (notetype ++ "s") element = + let pairs = mapMaybe + (\e -> findAttr (elemName ns "w" "id") e >>= + (\a -> Just (a, e))) + (findChildren (elemName ns "w" notetype) element) + in + Just $ M.fromList $ pairs +elemToNotes _ _ _ = Nothing + +--------------------------------------------- +--------------------------------------------- + +elemName :: NameSpaces -> String -> String -> QName +elemName ns prefix name = (QName name (lookup prefix ns) (Just prefix)) + +isElem :: NameSpaces -> String -> String -> Element -> Bool +isElem ns prefix name element = + qName (elName element) == name && + qURI (elName element) == (lookup prefix ns) + + +elemToTblGrid :: NameSpaces -> Element -> D TblGrid +elemToTblGrid ns element | isElem ns "w" "tblGrid" element = + let cols = findChildren (elemName ns "w" "gridCol") element + in + mapD (\e -> maybeToD (findAttr (elemName ns "w" "val") e >>= stringToInteger)) + cols +elemToTblGrid _ _ = throwError WrongElem + +elemToTblLook :: NameSpaces -> Element -> D TblLook +elemToTblLook ns element | isElem ns "w" "tblLook" element = + let firstRow = findAttr (elemName ns "w" "firstRow") element + val = findAttr (elemName ns "w" "val") element + firstRowFmt = + case firstRow of + Just "1" -> True + Just _ -> False + Nothing -> case val of + Just bitMask -> testBitMask bitMask 0x020 + Nothing -> False + in + return $ TblLook{firstRowFormatting = firstRowFmt} +elemToTblLook _ _ = throwError WrongElem + +elemToRow :: NameSpaces -> Element -> D Row +elemToRow ns element | isElem ns "w" "tr" element = + do + let cellElems = findChildren (elemName ns "w" "tc") element + cells <- mapD (elemToCell ns) cellElems + return $ Row cells +elemToRow _ _ = throwError WrongElem + +elemToCell :: NameSpaces -> Element -> D Cell +elemToCell ns element | isElem ns "w" "tc" element = + do + cellContents <- mapD (elemToBodyPart ns) (elChildren element) + return $ Cell cellContents +elemToCell _ _ = throwError WrongElem + +elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation +elemToParIndentation ns element | isElem ns "w" "ind" element = + Just $ ParIndentation { + leftParIndent = + findAttr (QName "left" (lookup "w" ns) (Just "w")) element >>= + stringToInteger + , rightParIndent = + findAttr (QName "right" (lookup "w" ns) (Just "w")) element >>= + stringToInteger + , hangingParIndent = + findAttr (QName "hanging" (lookup "w" ns) (Just "w")) element >>= + stringToInteger} +elemToParIndentation _ _ = Nothing + + +elemToNumInfo :: NameSpaces -> Element -> Maybe (String, String) +elemToNumInfo ns element | isElem ns "w" "p" element = do + let pPr = findChild (elemName ns "w" "pPr") element + numPr = pPr >>= findChild (elemName ns "w" "numPr") + lvl <- numPr >>= + findChild (elemName ns "w" "ilvl") >>= + findAttr (elemName ns "w" "val") + numId <- numPr >>= + findChild (elemName ns "w" "numId") >>= + findAttr (elemName ns "w" "val") + return (numId, lvl) +elemToNumInfo _ _ = Nothing + +testBitMask :: String -> Int -> Bool +testBitMask bitMaskS n = + case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of + [] -> False + ((n', _) : _) -> ((n' .|. n) /= 0) + +stringToInteger :: String -> Maybe Integer +stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) + +elemToBodyPart :: NameSpaces -> Element -> D BodyPart +elemToBodyPart ns element + | isElem ns "w" "p" element + , (c:_) <- findChildren (elemName ns "m" "oMathPara") element = + do + expsLst <- eitherToD $ readOMML $ showElement c + return $ OMathPara expsLst +elemToBodyPart ns element + | isElem ns "w" "p" element + , Just (numId, lvl) <- elemToNumInfo ns element = do + sty <- asks envParStyles + let parstyle = elemToParagraphStyle ns element sty + parparts <- mapD (elemToParPart ns) (elChildren element) + num <- asks envNumbering + case lookupLevel numId lvl num of + Just levelInfo -> return $ ListItem parstyle numId lvl levelInfo parparts + Nothing -> throwError WrongElem +elemToBodyPart ns element + | isElem ns "w" "p" element = do + sty <- asks envParStyles + let parstyle = elemToParagraphStyle ns element sty + parparts <- mapD (elemToParPart ns) (elChildren element) + return $ Paragraph parstyle parparts +elemToBodyPart ns element + | isElem ns "w" "tbl" element = do + let caption' = findChild (elemName ns "w" "tblPr") element + >>= findChild (elemName ns "w" "tblCaption") + >>= findAttr (elemName ns "w" "val") + caption = (fromMaybe "" caption') + grid' = case findChild (elemName ns "w" "tblGrid") element of + Just g -> elemToTblGrid ns g + Nothing -> return [] + tblLook' = case findChild (elemName ns "w" "tblPr") element >>= + findChild (elemName ns "w" "tblLook") + of + Just l -> elemToTblLook ns l + Nothing -> return defaultTblLook + + grid <- grid' + tblLook <- tblLook' + rows <- mapD (elemToRow ns) (elChildren element) + return $ Tbl caption grid tblLook rows +elemToBodyPart _ _ = throwError WrongElem + +lookupRelationship :: RelId -> [Relationship] -> Maybe Target +lookupRelationship relid rels = + lookup relid (map (\(Relationship pair) -> pair) rels) + +expandDrawingId :: String -> D (FilePath, B.ByteString) +expandDrawingId s = do + target <- asks (lookupRelationship s . envRelationships) + case target of + Just filepath -> do + bytes <- asks (lookup ("word/" ++ filepath) . envMedia) + case bytes of + Just bs -> return (filepath, bs) + Nothing -> throwError DocxError + Nothing -> throwError DocxError + +elemToParPart :: NameSpaces -> Element -> D ParPart +elemToParPart ns element + | isElem ns "w" "r" element + , Just _ <- findChild (elemName ns "w" "drawing") element = + let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" + drawing = findElement (QName "blip" (Just a_ns) (Just "a")) element + >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) + in + case drawing of + Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp bs) + Nothing -> throwError WrongElem +elemToParPart ns element + | isElem ns "w" "r" element = + elemToRun ns element >>= (\r -> return $ PlainRun r) +elemToParPart ns element + | isElem ns "w" "ins" element + , Just cId <- findAttr (elemName ns "w" "id") element + , Just cAuthor <- findAttr (elemName ns "w" "author") element + , Just cDate <- findAttr (elemName ns "w" "date") element = do + runs <- mapD (elemToRun ns) (elChildren element) + return $ Insertion cId cAuthor cDate runs +elemToParPart ns element + | isElem ns "w" "del" element + , Just cId <- findAttr (elemName ns "w" "id") element + , Just cAuthor <- findAttr (elemName ns "w" "author") element + , Just cDate <- findAttr (elemName ns "w" "date") element = do + runs <- mapD (elemToRun ns) (elChildren element) + return $ Deletion cId cAuthor cDate runs +elemToParPart ns element + | isElem ns "w" "bookmarkStart" element + , Just bmId <- findAttr (elemName ns "w" "id") element + , Just bmName <- findAttr (elemName ns "w" "name") element = + return $ BookMark bmId bmName +elemToParPart ns element + | isElem ns "w" "hyperlink" element + , Just relId <- findAttr (elemName ns "r" "id") element = do + runs <- mapD (elemToRun ns) (elChildren element) + rels <- asks envRelationships + case lookupRelationship relId rels of + Just target -> do + case findAttr (elemName ns "w" "anchor") element of + Just anchor -> return $ ExternalHyperLink (target ++ '#':anchor) runs + Nothing -> return $ ExternalHyperLink target runs + Nothing -> return $ ExternalHyperLink "" runs +elemToParPart ns element + | isElem ns "w" "hyperlink" element + , Just anchor <- findAttr (elemName ns "w" "anchor") element = do + runs <- mapD (elemToRun ns) (elChildren element) + return $ InternalHyperLink anchor runs +elemToParPart ns element + | isElem ns "m" "oMath" element = + (eitherToD $ readOMML $ showElement element) >>= (return . PlainOMath) +elemToParPart _ _ = throwError WrongElem + +lookupFootnote :: String -> Notes -> Maybe Element +lookupFootnote s (Notes _ fns _) = fns >>= (M.lookup s) + +lookupEndnote :: String -> Notes -> Maybe Element +lookupEndnote s (Notes _ _ ens) = ens >>= (M.lookup s) + +elemToRun :: NameSpaces -> Element -> D Run +elemToRun ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChild (elemName ns "w" "drawing") element = + let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" + drawing = findElement (QName "blip" (Just a_ns) (Just "a")) drawingElem + >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) + in + case drawing of + Just s -> expandDrawingId s >>= + (\(fp, bs) -> return $ InlineDrawing fp bs) + Nothing -> throwError WrongElem +elemToRun ns element + | isElem ns "w" "r" element + , Just ref <- findChild (elemName ns "w" "footnoteReference") element + , Just fnId <- findAttr (elemName ns "w" "id") ref = do + notes <- asks envNotes + case lookupFootnote fnId notes of + Just e -> do bps <- mapD (elemToBodyPart ns) (elChildren e) + return $ Footnote bps + Nothing -> return $ Footnote [] +elemToRun ns element + | isElem ns "w" "r" element + , Just ref <- findChild (elemName ns "w" "endnoteReference") element + , Just enId <- findAttr (elemName ns "w" "id") ref = do + notes <- asks envNotes + case lookupEndnote enId notes of + Just e -> do bps <- mapD (elemToBodyPart ns) (elChildren e) + return $ Endnote bps + Nothing -> return $ Endnote [] +elemToRun ns element + | isElem ns "w" "r" element = do + runElems <- elemToRunElems ns element + runStyle <- elemToRunStyleD ns element + return $ Run runStyle runElems +elemToRun _ _ = throwError WrongElem + +getParentStyleValue :: (ParStyleData -> Maybe a) -> ParStyleData -> Maybe a +getParentStyleValue field style + | Just value <- field style = Just value + | Just parentStyle <- psStyle style + = getParentStyleValue field (snd parentStyle) +getParentStyleValue _ _ = Nothing + +getParStyleField :: (ParStyleData -> Maybe a) -> ParStyleMap -> [String] -> + Maybe a +getParStyleField field stylemap styles + | x <- mapMaybe (\x -> M.lookup x stylemap) styles + , (y:_) <- mapMaybe (getParentStyleValue field) x + = Just y +getParStyleField _ _ _ = Nothing + +elemToParagraphStyle :: NameSpaces -> Element -> ParStyleMap -> ParagraphStyle +elemToParagraphStyle ns element sty + | Just pPr <- findChild (elemName ns "w" "pPr") element = + let style = + mapMaybe + (findAttr (elemName ns "w" "val")) + (findChildren (elemName ns "w" "pStyle") pPr) + in ParagraphStyle + {pStyle = style + , indentation = + findChild (elemName ns "w" "ind") pPr >>= + elemToParIndentation ns + , dropCap = + case + findChild (elemName ns "w" "framePr") pPr >>= + findAttr (elemName ns "w" "dropCap") + of + Just "none" -> False + Just _ -> True + Nothing -> False + , pHeading = getParStyleField headingLev sty style + , pBlockQuote = getParStyleField isBlockQuote sty style + } +elemToParagraphStyle _ _ _ = defaultParagraphStyle + +checkOnOff :: NameSpaces -> Element -> QName -> Maybe Bool +checkOnOff ns rPr tag + | Just t <- findChild tag rPr + , Just val <- findAttr (elemName ns "w" "val") t = + Just $ case val of + "true" -> True + "false" -> False + "on" -> True + "off" -> False + "1" -> True + "0" -> False + _ -> False + | Just _ <- findChild tag rPr = Just True +checkOnOff _ _ _ = Nothing + +elemToRunStyleD :: NameSpaces -> Element -> D RunStyle +elemToRunStyleD ns element + | Just rPr <- findChild (elemName ns "w" "rPr") element = do + charStyles <- asks envCharStyles + let parentSty = case + findChild (elemName ns "w" "rStyle") rPr >>= + findAttr (elemName ns "w" "val") + of + Just styName | Just style <- M.lookup styName charStyles -> + Just (styName, style) + _ -> Nothing + return $ elemToRunStyle ns element parentSty +elemToRunStyleD _ _ = return defaultRunStyle + +elemToRunStyle :: NameSpaces -> Element -> Maybe CharStyle -> RunStyle +elemToRunStyle ns element parentStyle + | Just rPr <- findChild (elemName ns "w" "rPr") element = + RunStyle + { + isBold = checkOnOff ns rPr (elemName ns "w" "b") + , isItalic = checkOnOff ns rPr (elemName ns "w" "i") + , isSmallCaps = checkOnOff ns rPr (elemName ns "w" "smallCaps") + , isStrike = checkOnOff ns rPr (elemName ns "w" "strike") + , rVertAlign = + findChild (elemName ns "w" "vertAlign") rPr >>= + findAttr (elemName ns "w" "val") >>= + \v -> Just $ case v of + "superscript" -> SupScrpt + "subscript" -> SubScrpt + _ -> BaseLn + , rUnderline = + findChild (elemName ns "w" "u") rPr >>= + findAttr (elemName ns "w" "val") + , rStyle = parentStyle + } +elemToRunStyle _ _ _ = defaultRunStyle + +isNumericNotNull :: String -> Bool +isNumericNotNull str = (str /= []) && (all isDigit str) + +getHeaderLevel :: NameSpaces -> Element -> Maybe (String,Int) +getHeaderLevel ns element + | Just styleId <- findAttr (elemName ns "w" "styleId") element + , Just index <- stripPrefix "Heading" styleId + , isNumericNotNull index = Just (styleId, read index) + | Just styleId <- findAttr (elemName ns "w" "styleId") element + , Just index <- findChild (elemName ns "w" "name") element >>= + findAttr (elemName ns "w" "val") >>= + stripPrefix "heading " + , isNumericNotNull index = Just (styleId, read index) +getHeaderLevel _ _ = Nothing + +blockQuoteStyleIds :: [String] +blockQuoteStyleIds = ["Quote", "BlockQuote", "BlockQuotation"] + +blockQuoteStyleNames :: [String] +blockQuoteStyleNames = ["Quote", "Block Text"] + +getBlockQuote :: NameSpaces -> Element -> Maybe Bool +getBlockQuote ns element + | Just styleId <- findAttr (elemName ns "w" "styleId") element + , styleId `elem` blockQuoteStyleIds = Just True + | Just styleName <- findChild (elemName ns "w" "name") element >>= + findAttr (elemName ns "w" "val") + , styleName `elem` blockQuoteStyleNames = Just True +getBlockQuote _ _ = Nothing + +elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> ParStyleData +elemToParStyleData ns element parentStyle = + ParStyleData + { + headingLev = getHeaderLevel ns element + , isBlockQuote = getBlockQuote ns element + , psStyle = parentStyle + } + +elemToRunElem :: NameSpaces -> Element -> D RunElem +elemToRunElem ns element + | isElem ns "w" "t" element + || isElem ns "w" "delText" element + || isElem ns "m" "t" element = do + let str = strContent element + font <- asks envFont + case font of + Nothing -> return $ TextRun str + Just f -> return . TextRun $ + map (\x -> fromMaybe x . getUnicode f . lowerFromPrivate $ x) str + | isElem ns "w" "br" element = return LnBrk + | isElem ns "w" "tab" element = return Tab + | isElem ns "w" "sym" element = return (getSymChar ns element) + | otherwise = throwError WrongElem + where + lowerFromPrivate (ord -> c) + | c >= ord '\xF000' = chr $ c - ord '\xF000' + | otherwise = chr c + +-- The char attribute is a hex string +getSymChar :: NameSpaces -> Element -> RunElem +getSymChar ns element + | Just s <- lowerFromPrivate <$> getCodepoint + , Just font <- getFont = + let [(char, _)] = readLitChar ("\\x" ++ s) in + TextRun . maybe "" (:[]) $ getUnicode font char + where + getCodepoint = findAttr (elemName ns "w" "char") element + getFont = stringToFont =<< findAttr (elemName ns "w" "font") element + lowerFromPrivate ('F':xs) = '0':xs + lowerFromPrivate xs = xs +getSymChar _ _ = TextRun "" + +stringToFont :: String -> Maybe Font +stringToFont "Symbol" = Just Symbol +stringToFont _ = Nothing + +elemToRunElems :: NameSpaces -> Element -> D [RunElem] +elemToRunElems ns element + | isElem ns "w" "r" element + || isElem ns "m" "r" element = do + let qualName = elemName ns "w" + let font = do + fontElem <- findElement (qualName "rFonts") element + stringToFont =<< + (foldr (<|>) Nothing $ + map (flip findAttr fontElem . qualName) ["ascii", "hAnsi"]) + local (setFont font) (mapD (elemToRunElem ns) (elChildren element)) +elemToRunElems _ _ = throwError WrongElem + +setFont :: Maybe Font -> ReaderEnv -> ReaderEnv +setFont f s = s{envFont = f} diff --git a/src/Text/Pandoc/Readers/Docx/Reducible.hs b/src/Text/Pandoc/Readers/Docx/Reducible.hs new file mode 100644 index 000000000..8269ca88d --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Reducible.hs @@ -0,0 +1,182 @@ +{-# LANGUAGE TypeSynonymInstances, FlexibleInstances, + PatternGuards #-} + +module Text.Pandoc.Readers.Docx.Reducible ( concatReduce + , (<+>) + ) + where + + +import Text.Pandoc.Builder +import Data.Monoid +import Data.List +import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr) +import qualified Data.Sequence as Seq (null) + +data Modifier a = Modifier (a -> a) + | AttrModifier (Attr -> a -> a) Attr + | NullModifier + +class (Eq a) => Modifiable a where + modifier :: a -> Modifier a + innards :: a -> a + getL :: a -> (a, a) + getR :: a -> (a, a) + spaceOut :: a -> (a, a, a) + +spaceOutL :: (Monoid a, Modifiable a) => a -> (a, a) +spaceOutL ms = (l, stack fs (m' <> r)) + where (l, m, r) = spaceOut ms + (fs, m') = unstack m + +spaceOutR :: (Monoid a, Modifiable a) => a -> (a, a) +spaceOutR ms = (stack fs (l <> m'), r) + where (l, m, r) = spaceOut ms + (fs, m') = unstack m + +instance (Monoid a, Show a) => Show (Modifier a) where + show (Modifier f) = show $ f mempty + show (AttrModifier f attr) = show $ f attr mempty + show (NullModifier) = "NullModifier" + +instance (Monoid a, Eq a) => Eq (Modifier a) where + (Modifier f) == (Modifier g) = (f mempty == g mempty) + (AttrModifier f attr) == (AttrModifier g attr') = (f attr mempty == g attr' mempty) + (NullModifier) == (NullModifier) = True + _ == _ = False + +instance Modifiable Inlines where + modifier ils = case viewl (unMany ils) of + (x :< xs) | Seq.null xs -> case x of + (Emph _) -> Modifier emph + (Strong _) -> Modifier strong + (SmallCaps _) -> Modifier smallcaps + (Strikeout _) -> Modifier strikeout + (Superscript _) -> Modifier superscript + (Subscript _) -> Modifier subscript + (Span attr _) -> AttrModifier spanWith attr + _ -> NullModifier + _ -> NullModifier + + innards ils = case viewl (unMany ils) of + (x :< xs) | Seq.null xs -> case x of + (Emph lst) -> fromList lst + (Strong lst) -> fromList lst + (SmallCaps lst) -> fromList lst + (Strikeout lst) -> fromList lst + (Superscript lst) -> fromList lst + (Subscript lst) -> fromList lst + (Span _ lst) -> fromList lst + _ -> ils + _ -> ils + + getL ils = case viewl $ unMany ils of + (s :< sq) -> (singleton s, Many sq) + _ -> (mempty, ils) + + getR ils = case viewr $ unMany ils of + (sq :> s) -> (Many sq, singleton s) + _ -> (ils, mempty) + + spaceOut ils = + let (fs, ils') = unstack ils + contents = unMany ils' + left = case viewl contents of + (Space :< _) -> space + _ -> mempty + right = case viewr contents of + (_ :> Space) -> space + _ -> mempty in + (left, (stack fs $ trimInlines .Many $ contents), right) + +instance Modifiable Blocks where + modifier blks = case viewl (unMany blks) of + (x :< xs) | Seq.null xs -> case x of + (BlockQuote _) -> Modifier blockQuote + -- (Div attr _) -> AttrModifier divWith attr + _ -> NullModifier + _ -> NullModifier + + innards blks = case viewl (unMany blks) of + (x :< xs) | Seq.null xs -> case x of + (BlockQuote lst) -> fromList lst + -- (Div attr lst) -> fromList lst + _ -> blks + _ -> blks + + spaceOut blks = (mempty, blks, mempty) + + getL ils = case viewl $ unMany ils of + (s :< sq) -> (singleton s, Many sq) + _ -> (mempty, ils) + + getR ils = case viewr $ unMany ils of + (sq :> s) -> (Many sq, singleton s) + _ -> (ils, mempty) + + +unstack :: (Modifiable a) => a -> ([Modifier a], a) +unstack ms = case modifier ms of + NullModifier -> ([], ms) + _ -> (f : fs, ms') where + f = modifier ms + (fs, ms') = unstack $ innards ms + +stack :: (Monoid a, Modifiable a) => [Modifier a] -> a -> a +stack [] ms = ms +stack (NullModifier : fs) ms = stack fs ms +stack ((Modifier f) : fs) ms = + if isEmpty ms + then stack fs ms + else f $ stack fs ms +stack ((AttrModifier f attr) : fs) ms = f attr $ stack fs ms + +isEmpty :: (Monoid a, Eq a) => a -> Bool +isEmpty x = x == mempty + + +combine :: (Monoid a, Modifiable a, Eq a) => a -> a -> a +combine x y = + let (xs', x') = getR x + (y', ys') = getL y + in + xs' <> (combineSingleton x' y') <> ys' + +isAttrModifier :: Modifier a -> Bool +isAttrModifier (AttrModifier _ _) = True +isAttrModifier _ = False + +combineSingleton :: (Monoid a, Modifiable a, Eq a) => a -> a -> a +combineSingleton x y = + let (xfs, xs) = unstack x + (yfs, ys) = unstack y + shared = xfs `intersect` yfs + x_remaining = xfs \\ shared + y_remaining = yfs \\ shared + x_rem_attr = filter isAttrModifier x_remaining + y_rem_attr = filter isAttrModifier y_remaining + in + case null shared of + True | isEmpty xs && isEmpty ys -> + stack (x_rem_attr ++ y_rem_attr) mempty + | isEmpty xs -> + let (sp, y') = spaceOutL y in + (stack x_rem_attr mempty) <> sp <> y' + | isEmpty ys -> + let (x', sp) = spaceOutR x in + x' <> sp <> (stack y_rem_attr mempty) + | otherwise -> + let (x', xsp) = spaceOutR x + (ysp, y') = spaceOutL y + in + x' <> xsp <> ysp <> y' + False -> stack shared $ + combine + (stack x_remaining xs) + (stack y_remaining ys) + +(<+>) :: (Monoid a, Modifiable a, Eq a) => a -> a -> a +x <+> y = combine x y + +concatReduce :: (Monoid a, Modifiable a) => [a] -> a +concatReduce xs = foldl combine mempty xs diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs new file mode 100644 index 000000000..b061d8683 --- /dev/null +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -0,0 +1,283 @@ +{-# LANGUAGE + ViewPatterns + , StandaloneDeriving + , TupleSections + , FlexibleContexts #-} + +module Text.Pandoc.Readers.EPUB + (readEPUB) + where + +import Text.XML.Light +import Text.Pandoc.Definition hiding (Attr) +import Text.Pandoc.Walk (walk, query) +import Text.Pandoc.Readers.HTML (readHtml) +import Text.Pandoc.Options ( ReaderOptions(..), readerTrace) +import Text.Pandoc.Shared (escapeURI, collapseFilePath, addMetaField) +import Text.Pandoc.MediaBag (MediaBag, insertMedia) +import Text.Pandoc.Compat.Except (MonadError, throwError, runExcept, Except) +import Text.Pandoc.MIME (MimeType) +import qualified Text.Pandoc.Builder as B +import Codec.Archive.Zip ( Archive (..), toArchive, fromEntry + , findEntryByPath, Entry) +import qualified Data.ByteString.Lazy as BL (ByteString) +import System.FilePath ( takeFileName, (</>), dropFileName, normalise + , dropFileName + , splitFileName ) +import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy) +import Control.Applicative ((<$>)) +import Control.Monad (guard, liftM, when) +import Data.Monoid (mempty, (<>)) +import Data.List (isPrefixOf, isInfixOf) +import Data.Maybe (mapMaybe, fromMaybe) +import qualified Data.Map as M (Map, lookup, fromList, elems) +import Control.DeepSeq.Generics (deepseq, NFData) + +import Debug.Trace (trace) + +type Items = M.Map String (FilePath, MimeType) + +readEPUB :: ReaderOptions -> BL.ByteString -> (Pandoc, MediaBag) +readEPUB opts bytes = runEPUB (archiveToEPUB opts $ toArchive bytes) + +runEPUB :: Except String a -> a +runEPUB = either error id . runExcept + +-- Note that internal reference are aggresively normalised so that all ids +-- are of the form "filename#id" +-- +archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag) +archiveToEPUB os archive = do + -- root is path to folder with manifest file in + (root, content) <- getManifest archive + meta <- parseMeta content + (cover, items) <- parseManifest content + -- No need to collapse here as the image path is from the manifest file + let coverDoc = fromMaybe mempty (imageToPandoc <$> cover) + spine <- parseSpine items content + let escapedSpine = map (escapeURI . takeFileName . fst) spine + Pandoc _ bs <- + foldM' (\a b -> ((a <>) . walk (prependHash escapedSpine)) + `liftM` parseSpineElem root b) mempty spine + let ast = coverDoc <> (Pandoc meta bs) + let mediaBag = fetchImages (M.elems items) root archive ast + return $ (ast, mediaBag) + where + os' = os {readerParseRaw = True} + parseSpineElem :: MonadError String m => FilePath -> (FilePath, MimeType) -> m Pandoc + parseSpineElem (normalise -> r) (normalise -> path, mime) = do + when (readerTrace os) (traceM path) + doc <- mimeToReader mime r path + let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty + return $ docSpan <> doc + mimeToReader :: MonadError String m => MimeType -> FilePath -> FilePath -> m Pandoc + mimeToReader "application/xhtml+xml" (normalise -> root) (normalise -> path) = do + fname <- findEntryByPathE (root </> path) archive + return $ fixInternalReferences path . + readHtml os' . + UTF8.toStringLazy $ + fromEntry fname + mimeToReader s _ path + | s `elem` imageMimes = return $ imageToPandoc path + | otherwise = return $ mempty + +-- paths should be absolute when this function is called +-- renameImages should do this +fetchImages :: [(FilePath, MimeType)] + -> FilePath -- ^ Root + -> Archive + -> Pandoc + -> MediaBag +fetchImages mimes root arc (query iq -> links) = + foldr (uncurry3 insertMedia) mempty + (mapMaybe getEntry links) + where + getEntry link = + let abslink = normalise (root </> link) in + (link , lookup link mimes, ) . fromEntry + <$> findEntryByPath abslink arc + +iq :: Inline -> [FilePath] +iq (Image _ (url, _)) = [url] +iq _ = [] + +-- Remove relative paths +renameImages :: FilePath -> Inline -> Inline +renameImages root (Image a (url, b)) = Image a (collapseFilePath (root </> url), b) +renameImages _ x = x + +imageToPandoc :: FilePath -> Pandoc +imageToPandoc s = B.doc . B.para $ B.image s "" mempty + +imageMimes :: [MimeType] +imageMimes = ["image/gif", "image/jpeg", "image/png"] + +type CoverImage = FilePath + +parseManifest :: (MonadError String m) => Element -> m (Maybe CoverImage, Items) +parseManifest content = do + manifest <- findElementE (dfName "manifest") content + let items = findChildren (dfName "item") manifest + r <- mapM parseItem items + let cover = findAttr (emptyName "href") =<< filterChild findCover manifest + return (cover, (M.fromList r)) + where + findCover e = maybe False (isInfixOf "cover-image") + (findAttr (emptyName "properties") e) + parseItem e = do + uid <- findAttrE (emptyName "id") e + href <- findAttrE (emptyName "href") e + mime <- findAttrE (emptyName "media-type") e + return (uid, (href, mime)) + +parseSpine :: MonadError String m => Items -> Element -> m [(FilePath, MimeType)] +parseSpine is e = do + spine <- findElementE (dfName "spine") e + let itemRefs = findChildren (dfName "itemref") spine + mapM (mkE "parseSpine" . (flip M.lookup is)) $ mapMaybe parseItemRef itemRefs + where + parseItemRef ref = do + let linear = maybe True (== "yes") (findAttr (emptyName "linear") ref) + guard linear + findAttr (emptyName "idref") ref + +parseMeta :: MonadError String m => Element -> m Meta +parseMeta content = do + meta <- findElementE (dfName "metadata") content + let dcspace (QName _ (Just "http://purl.org/dc/elements/1.1/") (Just "dc")) = True + dcspace _ = False + let dcs = filterChildrenName dcspace meta + let r = foldr parseMetaItem nullMeta dcs + return r + +-- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem +parseMetaItem :: Element -> Meta -> Meta +parseMetaItem e@(stripNamespace . elName -> field) meta = + addMetaField (renameMeta field) (B.str $ strContent e) meta + +renameMeta :: String -> String +renameMeta "creator" = "author" +renameMeta s = s + +getManifest :: MonadError String m => Archive -> m (String, Element) +getManifest archive = do + metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive + docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry + let namespaces = mapMaybe attrToNSPair (elAttribs docElem) + ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces) + as <- liftM ((map attrToPair) . elAttribs) + (findElementE (QName "rootfile" (Just ns) Nothing) docElem) + manifestFile <- mkE "Root not found" (lookup "full-path" as) + let rootdir = dropFileName manifestFile + --mime <- lookup "media-type" as + manifest <- findEntryByPathE manifestFile archive + liftM ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest) + +-- Fixup + +fixInternalReferences :: FilePath -> Pandoc -> Pandoc +fixInternalReferences pathToFile = + (walk $ renameImages root) + . (walk normalisePath) + . (walk $ fixBlockIRs filename) + . (walk $ fixInlineIRs filename) + where + (root, escapeURI -> filename) = splitFileName pathToFile + +fixInlineIRs :: String -> Inline -> Inline +fixInlineIRs s (Span as v) = + Span (fixAttrs s as) v +fixInlineIRs s (Code as code) = + Code (fixAttrs s as) code +fixInlineIRs s (Link t ('#':url, tit)) = + Link t (addHash s url, tit) +fixInlineIRs _ v = v + +normalisePath :: Inline -> Inline +normalisePath (Link t (url, tit)) = + let (path, uid) = span (/= '#') url in + Link t (takeFileName path ++ uid, tit) +normalisePath s = s + +prependHash :: [String] -> Inline -> Inline +prependHash ps l@(Link is (url, tit)) + | or [s `isPrefixOf` url | s <- ps] = + Link is ('#':url, tit) + | otherwise = l +prependHash _ i = i + +fixBlockIRs :: String -> Block -> Block +fixBlockIRs s (Div as b) = + Div (fixAttrs s as) b +fixBlockIRs s (Header i as b) = + Header i (fixAttrs s as) b +fixBlockIRs s (CodeBlock as code) = + CodeBlock (fixAttrs s as) code +fixBlockIRs _ b = b + +fixAttrs :: FilePath -> B.Attr -> B.Attr +fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . null) cs, removeEPUBAttrs kvs) + +addHash :: String -> String -> String +addHash _ "" = "" +addHash s ident = s ++ "#" ++ ident + +removeEPUBAttrs :: [(String, String)] -> [(String, String)] +removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs + +isEPUBAttr :: (String, String) -> Bool +isEPUBAttr (k, _) = "epub:" `isPrefixOf` k + +-- Library + +-- Strict version of foldM +foldM' :: (Monad m, NFData a) => (a -> b -> m a) -> a -> [b] -> m a +foldM' _ z [] = return z +foldM' f z (x:xs) = do + z' <- f z x + z' `deepseq` foldM' f z' xs + +uncurry3 :: (a -> b -> c -> d) -> (a, b, c) -> d +uncurry3 f (a, b, c) = f a b c + +traceM :: Monad m => String -> m () +traceM = flip trace (return ()) + +-- Utility + +stripNamespace :: QName -> String +stripNamespace (QName v _ _) = v + +attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val) +attrToNSPair _ = Nothing + +attrToPair :: Attr -> (String, String) +attrToPair (Attr (QName name _ _) val) = (name, val) + +defaultNameSpace :: Maybe String +defaultNameSpace = Just "http://www.idpf.org/2007/opf" + +dfName :: String -> QName +dfName s = QName s defaultNameSpace Nothing + +emptyName :: String -> QName +emptyName s = QName s Nothing Nothing + +-- Convert Maybe interface to Either + +findAttrE :: MonadError String m => QName -> Element -> m String +findAttrE q e = mkE "findAttr" $ findAttr q e + +findEntryByPathE :: MonadError String m => FilePath -> Archive -> m Entry +findEntryByPathE (normalise -> path) a = + mkE ("No entry on path: " ++ path) $ findEntryByPath path a + +parseXMLDocE :: MonadError String m => String -> m Element +parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc + +findElementE :: MonadError String m => QName -> Element -> m Element +findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x + +mkE :: MonadError String m => String -> Maybe a -> m a +mkE s = maybe (throwError s) return diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index d1e4d0024..2a23f2a62 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -1,5 +1,6 @@ +{-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses #-} {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.HTML - Copyright : Copyright (C) 2006-2010 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -40,58 +41,102 @@ import Text.HTML.TagSoup import Text.HTML.TagSoup.Match import Text.Pandoc.Definition import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Shared -import Text.Pandoc.Options -import Text.Pandoc.Parsing -import Data.Maybe ( fromMaybe, isJust ) -import Data.List ( intercalate ) +import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..)) +import Text.Pandoc.Shared ( extractSpaces, renderTags' + , escapeURI, safeRead ) +import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace) + , Extension (Ext_epub_html_exts, + Ext_native_divs, Ext_native_spans)) +import Text.Pandoc.Parsing hiding ((<|>)) +import Text.Pandoc.Walk +import Data.Maybe ( fromMaybe, isJust) +import Data.List ( intercalate, isInfixOf ) import Data.Char ( isDigit ) -import Control.Monad ( liftM, guard, when, mzero ) -import Control.Applicative ( (<$>), (<$), (<*) ) +import Control.Monad ( liftM, guard, when, mzero, void, unless ) +import Control.Arrow ((***)) +import Control.Applicative ( (<$>), (<$), (<*), (*>), (<|>)) +import Data.Monoid (mconcat, Monoid, mempty, (<>), First (..)) +import Text.Printf (printf) +import Debug.Trace (trace) +import Text.TeXMath (readMathML, writeTeX) +import Data.Default (Default (..), def) +import Control.Monad.Reader (Reader,ask, asks, local, runReader) -isSpace :: Char -> Bool -isSpace ' ' = True -isSpace '\t' = True -isSpace '\n' = True -isSpace _ = False -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse (assumes @'\n'@ line endings) -> Pandoc readHtml opts inp = - case runParser parseDoc def{ stateOptions = opts } "source" tags of + case flip runReader def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } []) "source" tags of Left err' -> error $ "\nError at " ++ show err' Right result -> result - where tags = canonicalizeTags $ + where tags = stripPrefixes . canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } inp parseDoc = do - blocks <- (fixPlains False . concat) <$> manyTill block eof - meta <- stateMeta <$> getState - return $ Pandoc meta blocks + blocks <- (fixPlains False) . mconcat <$> manyTill block eof + meta <- stateMeta . parserState <$> getState + bs' <- replaceNotes (B.toList blocks) + return $ Pandoc meta bs' + +replaceNotes :: [Block] -> TagParser [Block] +replaceNotes = walkM replaceNotes' + +replaceNotes' :: Inline -> TagParser Inline +replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes + where + getNotes = noteTable <$> getState +replaceNotes' x = return x + +data HTMLState = + HTMLState + { parserState :: ParserState, + noteTable :: [(String, Blocks)] + } -type TagParser = Parser [Tag String] ParserState +data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext + , inChapter :: Bool -- ^ Set if in chapter section + , inPlain :: Bool -- ^ Set if in pPlain + } -pBody :: TagParser [Block] +setInChapter :: HTMLParser s a -> HTMLParser s a +setInChapter = local (\s -> s {inChapter = True}) + +setInPlain :: HTMLParser s a -> HTMLParser s a +setInPlain = local (\s -> s {inPlain = True}) + +type HTMLParser s = ParserT s HTMLState (Reader HTMLLocal) + +type TagParser = HTMLParser [Tag String] + +pBody :: TagParser Blocks pBody = pInTags "body" block -pHead :: TagParser [Block] -pHead = pInTags "head" $ pTitle <|> pMetaTag <|> ([] <$ pAnyTag) - where pTitle = pInTags "title" inline >>= setTitle . normalizeSpaces - setTitle t = [] <$ (updateState $ B.setMeta "title" (B.fromList t)) +pHead :: TagParser Blocks +pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag) + where pTitle = pInTags "title" inline >>= setTitle . trimInlines + setTitle t = mempty <$ (updateState $ B.setMeta "title" t) pMetaTag = do mt <- pSatisfy (~== TagOpen "meta" []) let name = fromAttrib "name" mt if null name - then return [] + then return mempty else do let content = fromAttrib "content" mt updateState $ B.setMeta name (B.text content) - return [] + return mempty -block :: TagParser [Block] -block = choice - [ pPara +block :: TagParser Blocks +block = do + tr <- getOption readerTrace + pos <- getPosition + res <- choice + [ eSection + , eSwitch B.para block + , mempty <$ eFootnote + , mempty <$ eTOC + , mempty <$ eTitlePage + , pPara , pHeader , pBlockQuote , pCodeBlock @@ -100,15 +145,76 @@ block = choice , pTable , pHead , pBody - , pPlain , pDiv + , pPlain , pRawHtmlBlock ] + when tr $ trace (printf "line %d: %s" (sourceLine pos) + (take 60 $ show $ B.toList res)) (return ()) + return res + +namespaces :: [(String, TagParser Inlines)] +namespaces = [(mathMLNamespace, pMath True)] + +mathMLNamespace :: String +mathMLNamespace = "http://www.w3.org/1998/Math/MathML" + +eSwitch :: Monoid a => (Inlines -> a) -> TagParser a -> TagParser a +eSwitch constructor parser = try $ do + guardEnabled Ext_epub_html_exts + pSatisfy (~== TagOpen "switch" []) + cases <- getFirst . mconcat <$> + manyTill (First <$> (eCase <* skipMany pBlank) ) + (lookAhead $ try $ pSatisfy (~== TagOpen "default" [])) + skipMany pBlank + fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank) + skipMany pBlank + pSatisfy (~== TagClose "switch") + return $ maybe fallback constructor cases -pList :: TagParser [Block] +eCase :: TagParser (Maybe Inlines) +eCase = do + skipMany pBlank + TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" []) + case (flip lookup namespaces) =<< lookup "required-namespace" attr of + Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank)) + Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case")) + +eFootnote :: TagParser () +eFootnote = try $ do + let notes = ["footnote", "rearnote"] + guardEnabled Ext_epub_html_exts + (TagOpen tag attr) <- lookAhead $ pAnyTag + guard (maybe False (flip elem notes) (lookup "type" attr)) + let ident = fromMaybe "" (lookup "id" attr) + content <- pInTags tag block + addNote ident content + +addNote :: String -> Blocks -> TagParser () +addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)}) + +eNoteref :: TagParser Inlines +eNoteref = try $ do + guardEnabled Ext_epub_html_exts + TagOpen tag attr <- lookAhead $ pAnyTag + guard (maybe False (== "noteref") (lookup "type" attr)) + let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr) + guard (not (null ident)) + pInTags tag block + return $ B.rawInline "noteref" ident + +-- Strip TOC if there is one, better to generate again +eTOC :: TagParser () +eTOC = try $ do + guardEnabled Ext_epub_html_exts + (TagOpen tag attr) <- lookAhead $ pAnyTag + guard (maybe False (== "toc") (lookup "type" attr)) + void (pInTags tag block) + +pList :: TagParser Blocks pList = pBulletList <|> pOrderedList <|> pDefinitionList -pBulletList :: TagParser [Block] +pBulletList :: TagParser Blocks pBulletList = try $ do pSatisfy (~== TagOpen "ul" []) let nonItem = pSatisfy (\t -> @@ -117,10 +223,16 @@ pBulletList = try $ do -- note: if they have an <ol> or <ul> not in scope of a <li>, -- treat it as a list item, though it's not valid xhtml... skipMany nonItem - items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul") - return [BulletList $ map (fixPlains True) items] + items <- manyTill (pListItem nonItem) (pCloses "ul") + return $ B.bulletList $ map (fixPlains True) items -pOrderedList :: TagParser [Block] +pListItem :: TagParser a -> TagParser Blocks +pListItem nonItem = do + TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" []) + let liDiv = maybe mempty (\x -> B.divWith (x, [], []) mempty) (lookup "id" attr) + (liDiv <>) <$> pInTags "li" block <* skipMany nonItem + +pOrderedList :: TagParser Blocks pOrderedList = try $ do TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" []) let (start, style) = (sta', sty') @@ -145,28 +257,28 @@ pOrderedList = try $ do -- note: if they have an <ol> or <ul> not in scope of a <li>, -- treat it as a list item, though it's not valid xhtml... skipMany nonItem - items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol") - return [OrderedList (start, style, DefaultDelim) $ map (fixPlains True) items] + items <- manyTill (pListItem nonItem) (pCloses "ol") + return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items -pDefinitionList :: TagParser [Block] +pDefinitionList :: TagParser Blocks pDefinitionList = try $ do pSatisfy (~== TagOpen "dl" []) items <- manyTill pDefListItem (pCloses "dl") - return [DefinitionList items] + return $ B.definitionList items -pDefListItem :: TagParser ([Inline],[[Block]]) +pDefListItem :: TagParser (Inlines, [Blocks]) pDefListItem = try $ do let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) && not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl")) terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline) defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block) skipMany nonItem - let term = intercalate [LineBreak] terms + let term = foldl1 (\x y -> x <> B.linebreak <> y) terms return (term, map (fixPlains True) defs) -fixPlains :: Bool -> [Block] -> [Block] -fixPlains inList bs = if any isParaish bs - then map plainToPara bs +fixPlains :: Bool -> Blocks -> Blocks +fixPlains inList bs = if any isParaish bs' + then B.fromList $ map plainToPara bs' else bs where isParaish (Para _) = True isParaish (CodeBlock _ _) = True @@ -178,29 +290,30 @@ fixPlains inList bs = if any isParaish bs isParaish _ = False plainToPara (Plain xs) = Para xs plainToPara x = x + bs' = B.toList bs pRawTag :: TagParser String pRawTag = do tag <- pAnyTag - let ignorable x = x `elem` ["html","head","body"] + let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"] if tagOpen ignorable (const True) tag || tagClose ignorable tag then return [] else return $ renderTags' [tag] -pDiv :: TagParser [Block] +pDiv :: TagParser Blocks pDiv = try $ do - getOption readerParseRaw >>= guard + guardEnabled Ext_native_divs TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="div") (const True) contents <- pInTags "div" block - return [Div (mkAttr attr) contents] + return $ B.divWith (mkAttr attr) contents -pRawHtmlBlock :: TagParser [Block] +pRawHtmlBlock :: TagParser Blocks pRawHtmlBlock = do raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag parseRaw <- getOption readerParseRaw if parseRaw && not (null raw) - then return [RawBlock (Format "html") raw] - else return [] + then return $ B.rawBlock "html" raw + else return mempty pHtmlBlock :: String -> TagParser String pHtmlBlock t = try $ do @@ -208,32 +321,57 @@ pHtmlBlock t = try $ do contents <- manyTill pAnyTag (pSatisfy (~== TagClose t)) return $ renderTags' $ [open] ++ contents ++ [TagClose t] -pHeader :: TagParser [Block] +-- Sets chapter context +eSection :: TagParser Blocks +eSection = try $ do + let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as) + let sectTag = tagOpen (`elem` sectioningContent) matchChapter + TagOpen tag _ <- lookAhead $ pSatisfy sectTag + setInChapter (pInTags tag block) + +headerLevel :: String -> TagParser Int +headerLevel tagtype = do + let level = read (drop 1 tagtype) + (try $ do + guardEnabled Ext_epub_html_exts + asks inChapter >>= guard + return (level - 1)) + <|> + return level + +eTitlePage :: TagParser () +eTitlePage = try $ do + let isTitlePage as = maybe False (isInfixOf "titlepage") (lookup "type" as) + let groupTag = tagOpen (\x -> x `elem` groupingContent || x == "section") + isTitlePage + TagOpen tag _ <- lookAhead $ pSatisfy groupTag + () <$ pInTags tag block + +pHeader :: TagParser Blocks pHeader = try $ do TagOpen tagtype attr <- pSatisfy $ tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) (const True) let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")] - let level = read (drop 1 tagtype) - contents <- liftM concat $ manyTill inline (pCloses tagtype <|> eof) + level <- headerLevel tagtype + contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) let ident = fromMaybe "" $ lookup "id" attr let classes = maybe [] words $ lookup "class" attr let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"] return $ if bodyTitle - then [] -- skip a representation of the title in the body - else [Header level (ident, classes, keyvals) $ - normalizeSpaces contents] + then mempty -- skip a representation of the title in the body + else B.headerWith (ident, classes, keyvals) level contents -pHrule :: TagParser [Block] +pHrule :: TagParser Blocks pHrule = do pSelfClosing (=="hr") (const True) - return [HorizontalRule] + return B.horizontalRule -pTable :: TagParser [Block] +pTable :: TagParser Blocks pTable = try $ do TagOpen _ _ <- pSatisfy (~== TagOpen "table" []) skipMany pBlank - caption <- option [] $ pInTags "caption" inline >>~ skipMany pBlank + caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank -- TODO actually read these and take width information from them widths' <- pColgroup <|> many pCol head' <- option [] $ pOptInTag "thead" $ pInTags "tr" (pCell "th") @@ -242,26 +380,25 @@ pTable = try $ do $ many1 $ try $ skipMany pBlank >> pInTags "tr" (pCell "td") skipMany pBlank TagClose _ <- pSatisfy (~== TagClose "table") - let isSinglePlain [] = True - isSinglePlain [Plain _] = True - isSinglePlain _ = False + let isSinglePlain x = case B.toList x of + [Plain _] -> True + _ -> False let isSimple = all isSinglePlain $ concat (head':rows) - let cols = length $ if null head' - then head rows - else head' + let cols = length $ if null head' then head rows else head' -- fail if there are colspans or rowspans guard $ all (\r -> length r == cols) rows - let aligns = replicate cols AlignLeft + let aligns = replicate cols AlignDefault let widths = if null widths' then if isSimple then replicate cols 0 else replicate cols (1.0 / fromIntegral cols) else widths' - return [Table caption aligns widths head' rows] + return $ B.table caption (zip aligns widths) head' rows pCol :: TagParser Double pCol = try $ do TagOpen _ attribs <- pSatisfy (~== TagOpen "col" []) + skipMany pBlank optional $ pSatisfy (~== TagClose "col") skipMany pBlank return $ case lookup "width" attribs of @@ -275,35 +412,35 @@ pColgroup = try $ do skipMany pBlank manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank -pCell :: String -> TagParser [TableCell] +pCell :: String -> TagParser [Blocks] pCell celltype = try $ do skipMany pBlank res <- pInTags celltype block skipMany pBlank return [res] -pBlockQuote :: TagParser [Block] +pBlockQuote :: TagParser Blocks pBlockQuote = do contents <- pInTags "blockquote" block - return [BlockQuote $ fixPlains False contents] + return $ B.blockQuote $ fixPlains False contents -pPlain :: TagParser [Block] +pPlain :: TagParser Blocks pPlain = do - contents <- liftM (normalizeSpaces . concat) $ many1 inline - if null contents - then return [] - else return [Plain contents] + contents <- setInPlain $ trimInlines . mconcat <$> many1 inline + if B.isNull contents + then return mempty + else return $ B.plain contents -pPara :: TagParser [Block] +pPara :: TagParser Blocks pPara = do - contents <- pInTags "p" inline - return [Para $ normalizeSpaces contents] + contents <- trimInlines <$> pInTags "p" inline + return $ B.para contents -pCodeBlock :: TagParser [Block] +pCodeBlock :: TagParser Blocks pCodeBlock = try $ do TagOpen _ attr <- pSatisfy (~== TagOpen "pre" []) contents <- manyTill pAnyTag (pCloses "pre" <|> eof) - let rawText = concatMap fromTagText $ filter isTagText contents + let rawText = concatMap tagToString contents -- drop leading newline if any let result' = case rawText of '\n':xs -> xs @@ -312,11 +449,18 @@ pCodeBlock = try $ do let result = case reverse result' of '\n':_ -> init result' _ -> result' - return [CodeBlock (mkAttr attr) result] + return $ B.codeBlockWith (mkAttr attr) result -inline :: TagParser [Inline] +tagToString :: Tag String -> String +tagToString (TagText s) = s +tagToString (TagOpen "br" _) = "\n" +tagToString _ = "" + +inline :: TagParser Inlines inline = choice - [ pTagText + [ eNoteref + , eSwitch id inline + , pTagText , pQ , pEmph , pStrong @@ -328,6 +472,7 @@ inline = choice , pImage , pCode , pSpan + , pMath False , pRawHtmlInline ] @@ -354,91 +499,130 @@ pSelfClosing f g = do optional $ pSatisfy (tagClose f) return open -pQ :: TagParser [Inline] +pQ :: TagParser Inlines pQ = do - quoteContext <- stateQuoteContext `fmap` getState - let quoteType = case quoteContext of + context <- asks quoteContext + let quoteType = case context of InDoubleQuote -> SingleQuote _ -> DoubleQuote let innerQuoteContext = if quoteType == SingleQuote then InSingleQuote else InDoubleQuote - withQuoteContext innerQuoteContext $ pInlinesInTags "q" (Quoted quoteType) + let constructor = case quoteType of + SingleQuote -> B.singleQuoted + DoubleQuote -> B.doubleQuoted + withQuoteContext innerQuoteContext $ + pInlinesInTags "q" constructor -pEmph :: TagParser [Inline] -pEmph = pInlinesInTags "em" Emph <|> pInlinesInTags "i" Emph +pEmph :: TagParser Inlines +pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph -pStrong :: TagParser [Inline] -pStrong = pInlinesInTags "strong" Strong <|> pInlinesInTags "b" Strong +pStrong :: TagParser Inlines +pStrong = pInlinesInTags "strong" B.strong <|> pInlinesInTags "b" B.strong -pSuperscript :: TagParser [Inline] -pSuperscript = pInlinesInTags "sup" Superscript +pSuperscript :: TagParser Inlines +pSuperscript = pInlinesInTags "sup" B.superscript -pSubscript :: TagParser [Inline] -pSubscript = pInlinesInTags "sub" Subscript +pSubscript :: TagParser Inlines +pSubscript = pInlinesInTags "sub" B.subscript -pStrikeout :: TagParser [Inline] +pStrikeout :: TagParser Inlines pStrikeout = do - pInlinesInTags "s" Strikeout <|> - pInlinesInTags "strike" Strikeout <|> - pInlinesInTags "del" Strikeout <|> + pInlinesInTags "s" B.strikeout <|> + pInlinesInTags "strike" B.strikeout <|> + pInlinesInTags "del" B.strikeout <|> try (do pSatisfy (~== TagOpen "span" [("class","strikeout")]) - contents <- liftM concat $ manyTill inline (pCloses "span") - return [Strikeout contents]) + contents <- mconcat <$> manyTill inline (pCloses "span") + return $ B.strikeout contents) -pLineBreak :: TagParser [Inline] +pLineBreak :: TagParser Inlines pLineBreak = do pSelfClosing (=="br") (const True) - return [LineBreak] + return B.linebreak + +pLink :: TagParser Inlines +pLink = pRelLink <|> pAnchor -pLink :: TagParser [Inline] -pLink = try $ do +pAnchor :: TagParser Inlines +pAnchor = try $ do + tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "id")) + return $ B.spanWith (fromAttrib "id" tag , [], []) mempty + +pRelLink :: TagParser Inlines +pRelLink = try $ do tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href")) let url = fromAttrib "href" tag let title = fromAttrib "title" tag - lab <- liftM concat $ manyTill inline (pCloses "a") - return [Link (normalizeSpaces lab) (escapeURI url, title)] - -pImage :: TagParser [Inline] + let uid = fromAttrib "id" tag + let spanC = case uid of + [] -> id + s -> B.spanWith (s, [], []) + lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a") + return $ spanC $ B.link (escapeURI url) title lab + +pImage :: TagParser Inlines pImage = do tag <- pSelfClosing (=="img") (isJust . lookup "src") let url = fromAttrib "src" tag let title = fromAttrib "title" tag let alt = fromAttrib "alt" tag - return [Image (B.toList $ B.text alt) (escapeURI url, title)] + return $ B.image (escapeURI url) title (B.text alt) -pCode :: TagParser [Inline] +pCode :: TagParser Inlines pCode = try $ do (TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) result <- manyTill pAnyTag (pCloses open) - return [Code (mkAttr attr) $ intercalate " " $ lines $ innerText result] + return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ innerText result -pSpan :: TagParser [Inline] +pSpan :: TagParser Inlines pSpan = try $ do - getOption readerParseRaw >>= guard + guardEnabled Ext_native_spans TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) contents <- pInTags "span" inline - return [Span (mkAttr attr) contents] - -pRawHtmlInline :: TagParser [Inline] + let attr' = mkAttr attr + return $ case attr' of + ("",[],[("style",s)]) + | filter (`notElem` " \t;") s == "font-variant:small-caps" -> + B.smallcaps contents + _ -> B.spanWith (mkAttr attr) contents + +pRawHtmlInline :: TagParser Inlines pRawHtmlInline = do - result <- pSatisfy (tagComment (const True)) <|> pSatisfy isInlineTag + inplain <- asks inPlain + result <- pSatisfy (tagComment (const True)) + <|> if inplain + then pSatisfy (not . isBlockTag) + else pSatisfy isInlineTag parseRaw <- getOption readerParseRaw if parseRaw - then return [RawInline (Format "html") $ renderTags' [result]] - else return [] - -pInlinesInTags :: String -> ([Inline] -> Inline) - -> TagParser [Inline] -pInlinesInTags tagtype f = do - contents <- pInTags tagtype inline - return [f $ normalizeSpaces contents] - -pInTags :: String -> TagParser [a] - -> TagParser [a] + then return $ B.rawInline "html" $ renderTags' [result] + else return mempty + +mathMLToTeXMath :: String -> Either String String +mathMLToTeXMath s = writeTeX <$> readMathML s + +pMath :: Bool -> TagParser Inlines +pMath inCase = try $ do + open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True) + unless (inCase) (guard (maybe False (== mathMLNamespace) (lookup "xmlns" attr))) + contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math")) + let math = mathMLToTeXMath $ + (renderTags $ [open] ++ contents ++ [TagClose "math"]) + let constructor = + maybe B.math (\x -> if (x == "inline") then B.math else B.displayMath) + (lookup "display" attr) + return $ either (const mempty) + (\x -> if null x then mempty else constructor x) math + +pInlinesInTags :: String -> (Inlines -> Inlines) + -> TagParser Inlines +pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline + +pInTags :: (Monoid a) => String -> TagParser a + -> TagParser a pInTags tagtype parser = try $ do pSatisfy (~== TagOpen tagtype []) - liftM concat $ manyTill parser (pCloses tagtype <|> eof) + mconcat <$> manyTill parser (pCloses tagtype <|> eof) pOptInTag :: String -> TagParser a -> TagParser a @@ -454,43 +638,47 @@ pCloses :: String -> TagParser () pCloses tagtype = try $ do t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag case t of - (TagClose t') | t' == tagtype -> pAnyTag >> return () + (TagClose t') | t' == tagtype -> pAnyTag >> return () (TagOpen t' _) | t' `closes` tagtype -> return () (TagClose "ul") | tagtype == "li" -> return () (TagClose "ol") | tagtype == "li" -> return () (TagClose "dl") | tagtype == "li" -> return () + (TagClose "table") | tagtype == "td" -> return () + (TagClose "table") | tagtype == "tr" -> return () _ -> mzero -pTagText :: TagParser [Inline] +pTagText :: TagParser Inlines pTagText = try $ do (TagText str) <- pSatisfy isTagText st <- getState - case runParser (many pTagContents) st "text" str of + qu <- ask + case flip runReader qu $ runParserT (many pTagContents) st "text" str of Left _ -> fail $ "Could not parse `" ++ str ++ "'" - Right result -> return result + Right result -> return $ mconcat result pBlank :: TagParser () pBlank = try $ do (TagText str) <- pSatisfy isTagText guard $ all isSpace str -pTagContents :: Parser [Char] ParserState Inline +type InlinesParser = HTMLParser String + +pTagContents :: InlinesParser Inlines pTagContents = - Math DisplayMath `fmap` mathDisplay - <|> Math InlineMath `fmap` mathInline + B.displayMath <$> mathDisplay + <|> B.math <$> mathInline <|> pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad -pStr :: Parser [Char] ParserState Inline +pStr :: InlinesParser Inlines pStr = do result <- many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) && not (isBad c) - pos <- getPosition - updateState $ \s -> s{ stateLastStrPos = Just pos } - return $ Str result + updateLastStrPos + return $ B.str result isSpecial :: Char -> Bool isSpecial '"' = True @@ -504,13 +692,13 @@ isSpecial '\8220' = True isSpecial '\8221' = True isSpecial _ = False -pSymbol :: Parser [Char] ParserState Inline -pSymbol = satisfy isSpecial >>= return . Str . (:[]) +pSymbol :: InlinesParser Inlines +pSymbol = satisfy isSpecial >>= return . B.str . (:[]) isBad :: Char -> Bool isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML -pBad :: Parser [Char] ParserState Inline +pBad :: InlinesParser Inlines pBad = do c <- satisfy isBad let c' = case c of @@ -542,18 +730,20 @@ pBad = do '\158' -> '\382' '\159' -> '\376' _ -> '?' - return $ Str [c'] + return $ B.str [c'] -pSpace :: Parser [Char] ParserState Inline -pSpace = many1 (satisfy isSpace) >> return Space +pSpace :: InlinesParser Inlines +pSpace = many1 (satisfy isSpace) >> return B.space -- -- Constants -- eitherBlockOrInline :: [String] -eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins", - "map", "area", "object"] +eitherBlockOrInline = ["audio", "applet", "button", "iframe", "embed", + "del", "ins", + "progress", "map", "area", "noscript", "script", + "object", "svg", "video", "source"] {- inlineHtmlTags :: [[Char]] @@ -565,15 +755,17 @@ inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", -} blockHtmlTags :: [String] -blockHtmlTags = ["address", "article", "aside", "blockquote", "body", "button", "canvas", +blockHtmlTags = ["?xml", "!DOCTYPE", "address", "article", "aside", + "blockquote", "body", "button", "canvas", "caption", "center", "col", "colgroup", "dd", "dir", "div", - "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", - "form", "h1", "h2", "h3", "h4", - "h5", "h6", "head", "header", "hgroup", "hr", "html", "isindex", "map", "menu", - "noframes", "noscript", "object", "ol", "output", "p", "pre", "progress", - "section", "table", "tbody", "textarea", "thead", "tfoot", "ul", "dd", + "dl", "dt", "fieldset", "figcaption", "figure", + "footer", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "head", "header", "hgroup", "hr", "html", + "isindex", "menu", "noframes", "ol", "output", "p", "pre", + "section", "table", "tbody", "textarea", + "thead", "tfoot", "ul", "dd", "dt", "frameset", "li", "tbody", "td", "tfoot", - "th", "thead", "tr", "script", "style", "svg", "video"] + "th", "thead", "tr", "script", "style"] -- We want to allow raw docbook in markdown documents, so we -- include docbook block tags here too. @@ -591,19 +783,26 @@ blockDocBookTags = ["calloutlist", "bibliolist", "glosslist", "itemizedlist", "classsynopsis", "blockquote", "epigraph", "msgset", "sidebar", "title"] +epubTags :: [String] +epubTags = ["case", "switch", "default"] + blockTags :: [String] -blockTags = blockHtmlTags ++ blockDocBookTags +blockTags = blockHtmlTags ++ blockDocBookTags ++ epubTags isInlineTag :: Tag String -> Bool -isInlineTag t = tagOpen (`notElem` blockTags) (const True) t || - tagClose (`notElem` blockTags) t || +isInlineTag t = tagOpen isInlineTagName (const True) t || + tagClose isInlineTagName t || tagComment (const True) t + where isInlineTagName x = x `notElem` blockTags isBlockTag :: Tag String -> Bool -isBlockTag t = tagOpen (`elem` blocktags) (const True) t || - tagClose (`elem` blocktags) t || +isBlockTag t = tagOpen isBlockTagName (const True) t || + tagClose isBlockTagName t || tagComment (const True) t - where blocktags = blockTags ++ eitherBlockOrInline + where isBlockTagName ('?':_) = True + isBlockTagName ('!':_) = True + isBlockTagName x = x `elem` blockTags + || x `elem` eitherBlockOrInline isTextTag :: Tag String -> Bool isTextTag = tagText (const True) @@ -612,7 +811,7 @@ isCommentTag :: Tag String -> Bool isCommentTag = tagComment (const True) -- taken from HXT and extended - +-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags closes :: String -> String -> Bool _ `closes` "body" = False _ `closes` "html" = False @@ -620,11 +819,18 @@ _ `closes` "html" = False "li" `closes` "li" = True "th" `closes` t | t `elem` ["th","td"] = True "tr" `closes` t | t `elem` ["th","td","tr"] = True +"dd" `closes` t | t `elem` ["dt", "dd"] = True "dt" `closes` t | t `elem` ["dt","dd"] = True -"hr" `closes` "p" = True -"p" `closes` "p" = True +"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True +"optgroup" `closes` "optgroup" = True +"optgroup" `closes` "option" = True +"option" `closes` "option" = True +-- http://www.w3.org/TR/html-markup/p.html +x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote", + "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "header", "hr", "menu", "nav", "ol", "p", "pre", "section", + "table", "ul"] = True "meta" `closes` "meta" = True -"colgroup" `closes` "colgroup" = True "form" `closes` "form" = True "label" `closes` "label" = True "map" `closes` "map" = True @@ -645,19 +851,23 @@ _ `closes` _ = False --- parsers for use in markdown, textile readers -- | Matches a stretch of HTML in balanced tags. -htmlInBalanced :: (Tag String -> Bool) -> Parser [Char] ParserState String +htmlInBalanced :: (Monad m) + => (Tag String -> Bool) + -> ParserT String st m String htmlInBalanced f = try $ do (TagOpen t _, tag) <- htmlTag f guard $ '/' `notElem` tag -- not a self-closing tag let stopper = htmlTag (~== TagClose t) - let anytag = liftM snd $ htmlTag (const True) + let anytag = snd <$> htmlTag (const True) contents <- many $ notFollowedBy' stopper >> (htmlInBalanced f <|> anytag <|> count 1 anyChar) endtag <- liftM snd stopper return $ tag ++ concat contents ++ endtag -- | Matches a tag meeting a certain condition. -htmlTag :: (Tag String -> Bool) -> Parser [Char] st (Tag String, String) +htmlTag :: Monad m + => (Tag String -> Bool) + -> ParserT [Char] st m (Tag String, String) htmlTag f = try $ do lookAhead $ char '<' >> (oneOf "/!?" <|> letter) (next : _) <- getInput >>= return . canonicalizeTags . parseTags @@ -676,6 +886,79 @@ htmlTag f = try $ do mkAttr :: [(String, String)] -> Attr mkAttr attr = (attribsId, attribsClasses, attribsKV) where attribsId = fromMaybe "" $ lookup "id" attr - attribsClasses = words $ fromMaybe "" $ lookup "class" attr + attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr + epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr + +-- Strip namespace prefixes +stripPrefixes :: [Tag String] -> [Tag String] +stripPrefixes = map stripPrefix + +stripPrefix :: Tag String -> Tag String +stripPrefix (TagOpen s as) = + TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as) +stripPrefix (TagClose s) = TagClose (stripPrefix' s) +stripPrefix x = x + +stripPrefix' :: String -> String +stripPrefix' s = + case span (/= ':') s of + (_, "") -> s + (_, (_:ts)) -> ts + +isSpace :: Char -> Bool +isSpace ' ' = True +isSpace '\t' = True +isSpace '\n' = True +isSpace '\r' = True +isSpace _ = False + +-- Instances + +-- This signature should be more general +-- MonadReader HTMLLocal m => HasQuoteContext st m +instance HasQuoteContext st (Reader HTMLLocal) where + getQuoteContext = asks quoteContext + withQuoteContext q = local (\s -> s{quoteContext = q}) +instance HasReaderOptions HTMLState where + extractReaderOptions = extractReaderOptions . parserState + +instance Default HTMLState where + def = HTMLState def [] + +instance HasMeta HTMLState where + setMeta s b st = st {parserState = setMeta s b $ parserState st} + deleteMeta s st = st {parserState = deleteMeta s $ parserState st} + +instance Default HTMLLocal where + def = HTMLLocal NoQuote False False + +instance HasLastStrPosition HTMLState where + setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} + getLastStrPos = getLastStrPos . parserState + + +-- EPUB Specific +-- +-- +sectioningContent :: [String] +sectioningContent = ["article", "aside", "nav", "section"] + + +groupingContent :: [String] +groupingContent = ["p", "hr", "pre", "blockquote", "ol" + , "ul", "li", "dl", "dt", "dt", "dd" + , "figure", "figcaption", "div", "main"] + + +{- + +types :: [(String, ([String], Int))] +types = -- Document divisions + map (\s -> (s, (["section", "body"], 0))) + ["volume", "part", "chapter", "division"] + ++ -- Document section and components + [ + ("abstract", ([], 0))] +-} diff --git a/src/Text/Pandoc/Readers/Haddock.hs b/src/Text/Pandoc/Readers/Haddock.hs index 0e74406ef..4b46c869d 100644 --- a/src/Text/Pandoc/Readers/Haddock.hs +++ b/src/Text/Pandoc/Readers/Haddock.hs @@ -3,7 +3,8 @@ Copyright : Copyright (C) 2013 David Lazar License : GNU GPL, version 2 or above - Maintainer : David Lazar <lazar6@illinois.edu> + Maintainer : David Lazar <lazar6@illinois.edu>, + John MacFarlane <jgm@berkeley.edu> Stability : alpha Conversion of Haddock markup to 'Pandoc' document. @@ -12,30 +13,126 @@ module Text.Pandoc.Readers.Haddock ( readHaddock ) where -import Text.Pandoc.Builder +import Text.Pandoc.Builder (Blocks, Inlines) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Shared (trim, splitBy) +import Data.Monoid +import Data.List (intersperse, stripPrefix) +import Data.Maybe (fromMaybe) +import Text.Pandoc.Definition import Text.Pandoc.Options -import Text.Pandoc.Readers.Haddock.Lex -import Text.Pandoc.Readers.Haddock.Parse +import Documentation.Haddock.Parser +import Documentation.Haddock.Types +import Debug.Trace (trace) -- | Parse Haddock markup and return a 'Pandoc' document. readHaddock :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse -> Pandoc -readHaddock _ s = Pandoc nullMeta blocks +readHaddock opts = B.doc . docHToBlocks . trace' . parseParas + where trace' x = if readerTrace opts + then trace (show x) x + else x + +docHToBlocks :: DocH String Identifier -> Blocks +docHToBlocks d' = + case d' of + DocEmpty -> mempty + DocAppend (DocParagraph (DocHeader h)) (DocParagraph (DocAName ident)) -> + B.headerWith (ident,[],[]) (headerLevel h) + (docHToInlines False $ headerTitle h) + DocAppend d1 d2 -> mappend (docHToBlocks d1) (docHToBlocks d2) + DocString _ -> inlineFallback + DocParagraph (DocAName h) -> B.plain $ docHToInlines False $ DocAName h + DocParagraph x -> B.para $ docHToInlines False x + DocIdentifier _ -> inlineFallback + DocIdentifierUnchecked _ -> inlineFallback + DocModule s -> B.plain $ docHToInlines False $ DocModule s + DocWarning _ -> mempty -- TODO + DocEmphasis _ -> inlineFallback + DocMonospaced _ -> inlineFallback + DocBold _ -> inlineFallback + DocHeader h -> B.header (headerLevel h) + (docHToInlines False $ headerTitle h) + DocUnorderedList items -> B.bulletList (map docHToBlocks items) + DocOrderedList items -> B.orderedList (map docHToBlocks items) + DocDefList items -> B.definitionList (map (\(d,t) -> + (docHToInlines False d, + [consolidatePlains $ docHToBlocks t])) items) + DocCodeBlock (DocString s) -> B.codeBlockWith ("",[],[]) s + DocCodeBlock d -> B.para $ docHToInlines True d + DocHyperlink _ -> inlineFallback + DocPic _ -> inlineFallback + DocAName _ -> inlineFallback + DocProperty s -> B.codeBlockWith ("",["property","haskell"],[]) (trim s) + DocExamples es -> mconcat $ map (\e -> + makeExample ">>>" (exampleExpression e) (exampleResult e)) es + + where inlineFallback = B.plain $ docHToInlines False d' + consolidatePlains = B.fromList . consolidatePlains' . B.toList + consolidatePlains' zs@(Plain _ : _) = + let (xs, ys) = span isPlain zs in + Para (concatMap extractContents xs) : consolidatePlains' ys + consolidatePlains' (x : xs) = x : consolidatePlains' xs + consolidatePlains' [] = [] + isPlain (Plain _) = True + isPlain _ = False + extractContents (Plain xs) = xs + extractContents _ = [] + +docHToInlines :: Bool -> DocH String Identifier -> Inlines +docHToInlines isCode d' = + case d' of + DocEmpty -> mempty + DocAppend d1 d2 -> mappend (docHToInlines isCode d1) + (docHToInlines isCode d2) + DocString s + | isCode -> mconcat $ intersperse B.linebreak + $ map B.code $ splitBy (=='\n') s + | otherwise -> B.text s + DocParagraph _ -> mempty + DocIdentifier (_,s,_) -> B.codeWith ("",["haskell","identifier"],[]) s + DocIdentifierUnchecked s -> B.codeWith ("",["haskell","identifier"],[]) s + DocModule s -> B.codeWith ("",["haskell","module"],[]) s + DocWarning _ -> mempty -- TODO + DocEmphasis d -> B.emph (docHToInlines isCode d) + DocMonospaced (DocString s) -> B.code s + DocMonospaced d -> docHToInlines True d + DocBold d -> B.strong (docHToInlines isCode d) + DocHeader _ -> mempty + DocUnorderedList _ -> mempty + DocOrderedList _ -> mempty + DocDefList _ -> mempty + DocCodeBlock _ -> mempty + DocHyperlink h -> B.link (hyperlinkUrl h) (hyperlinkUrl h) + (maybe (B.text $ hyperlinkUrl h) B.text $ hyperlinkLabel h) + DocPic p -> B.image (pictureUri p) (fromMaybe (pictureUri p) $ pictureTitle p) + (maybe mempty B.text $ pictureTitle p) + DocAName s -> B.spanWith (s,["anchor"],[]) mempty + DocProperty _ -> mempty + DocExamples _ -> mempty + +-- | Create an 'Example', stripping superfluous characters as appropriate +makeExample :: String -> String -> [String] -> Blocks +makeExample prompt expression result = + B.para $ B.codeWith ("",["prompt"],[]) prompt + <> B.space + <> B.codeWith ([], ["haskell","expr"], []) (trim expression) + <> B.linebreak + <> (mconcat $ intersperse B.linebreak $ map coder result') where - blocks = case parseParas (tokenise s (0,0)) of - Left [] -> error "parse failure" - Left (tok:_) -> error $ "parse failure " ++ pos (tokenPos tok) - where pos (l, c) = "(line " ++ show l ++ ", column " ++ show c ++ ")" - Right x -> mergeLists (toList x) - --- similar to 'docAppend' in Haddock.Doc -mergeLists :: [Block] -> [Block] -mergeLists (BulletList xs : BulletList ys : blocks) - = mergeLists (BulletList (xs ++ ys) : blocks) -mergeLists (OrderedList _ xs : OrderedList a ys : blocks) - = mergeLists (OrderedList a (xs ++ ys) : blocks) -mergeLists (DefinitionList xs : DefinitionList ys : blocks) - = mergeLists (DefinitionList (xs ++ ys) : blocks) -mergeLists (x : blocks) = x : mergeLists blocks -mergeLists [] = [] + -- 1. drop trailing whitespace from the prompt, remember the prefix + prefix = takeWhile (`elem` " \t") prompt + + -- 2. drop, if possible, the exact same sequence of whitespace + -- characters from each result line + -- + -- 3. interpret lines that only contain the string "<BLANKLINE>" as an + -- empty line + result' = map (substituteBlankLine . tryStripPrefix prefix) result + where + tryStripPrefix xs ys = fromMaybe ys $ stripPrefix xs ys + + substituteBlankLine "<BLANKLINE>" = "" + substituteBlankLine line = line + coder = B.codeWith ([], ["result"], []) diff --git a/src/Text/Pandoc/Readers/Haddock/Lex.x b/src/Text/Pandoc/Readers/Haddock/Lex.x deleted file mode 100644 index 120e96ebf..000000000 --- a/src/Text/Pandoc/Readers/Haddock/Lex.x +++ /dev/null @@ -1,171 +0,0 @@ --- --- Haddock - A Haskell Documentation Tool --- --- (c) Simon Marlow 2002 --- --- This file was modified and integrated into GHC by David Waern 2006. --- Then moved back into Haddock by Isaac Dupree in 2009 :-) --- Then copied into Pandoc by David Lazar in 2013 :-D - -{ -{-# LANGUAGE BangPatterns #-} -- Generated by Alex -{-# OPTIONS -Wwarn -w #-} --- The above warning supression flag is a temporary kludge. --- While working on this module you are encouraged to remove it and fix --- any warnings in the module. See --- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings --- for details - -module Text.Pandoc.Readers.Haddock.Lex ( - Token(..), - LToken, - tokenise, - tokenPos - ) where - -import Data.Char -import Numeric (readHex) -} - -%wrapper "posn" - -$ws = $white # \n -$digit = [0-9] -$hexdigit = [0-9a-fA-F] -$special = [\"\@] -$alphanum = [A-Za-z0-9] -$ident = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~\:] - -:- - --- beginning of a paragraph -<0,para> { - $ws* \n ; - $ws* \> { begin birdtrack } - $ws* prop \> .* \n { strtoken TokProperty `andBegin` property} - $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr } - $ws* [\*\-] { token TokBullet `andBegin` string } - $ws* \[ { token TokDefStart `andBegin` def } - $ws* \( $digit+ \) { token TokNumber `andBegin` string } - $ws* $digit+ \. { token TokNumber `andBegin` string } - $ws* { begin string } -} - --- beginning of a line -<line> { - $ws* \> { begin birdtrack } - $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr } - - $ws* \n { token TokPara `andBegin` para } - -- ^ Here, we really want to be able to say - -- $ws* (\n | <eof>) { token TokPara `andBegin` para} - -- because otherwise a trailing line of whitespace will result in - -- a spurious TokString at the end of a docstring. We don't have <eof>, - -- though (NOW I realise what it was for :-). To get around this, we always - -- append \n to the end of a docstring. - - () { begin string } -} - -<birdtrack> .* \n? { strtokenNL TokBirdTrack `andBegin` line } - -<property> () { token TokPara `andBegin` para } - -<example> { - $ws* \n { token TokPara `andBegin` para } - $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr } - () { begin exampleresult } -} - -<exampleexpr> .* \n { strtokenNL TokExampleExpression `andBegin` example } - -<exampleresult> .* \n { strtokenNL TokExampleResult `andBegin` example } - -<string,def> { - $special { strtoken $ \s -> TokSpecial (head s) } - \<\< [^\>]* \>\> { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) } - \< [^\>]* \> { strtoken $ \s -> TokURL (init (tail s)) } - \# [^\#]* \# { strtoken $ \s -> TokAName (init (tail s)) } - \/ [^\/]* \/ { strtoken $ \s -> TokEmphasis (init (tail s)) } - [\'\`] $ident+ [\'\`] { strtoken $ \s -> TokIdent (init (tail s)) } - \\ . { strtoken (TokString . tail) } - "&#" $digit+ \; { strtoken $ \s -> TokString [chr (read (init (drop 2 s)))] } - "&#" [xX] $hexdigit+ \; - { strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] -> TokString [chr n] } - -- allow special characters through if they don't fit one of the previous - -- patterns. - [\/\'\`\<\#\&\\] { strtoken TokString } - [^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line } - [^ $special \/ \< \# \n \'\` \& \\ \]]+ { strtoken TokString } -} - -<def> { - \] { token TokDefEnd `andBegin` string } -} - --- ']' doesn't have any special meaning outside of the [...] at the beginning --- of a definition paragraph. -<string> { - \] { strtoken TokString } -} - -{ --- | A located token -type LToken = (Token, AlexPosn) - -data Token - = TokPara - | TokNumber - | TokBullet - | TokDefStart - | TokDefEnd - | TokSpecial Char - | TokIdent String - | TokString String - | TokURL String - | TokPic String - | TokEmphasis String - | TokAName String - | TokBirdTrack String - | TokProperty String - | TokExamplePrompt String - | TokExampleExpression String - | TokExampleResult String - deriving Show - -tokenPos :: LToken -> (Int, Int) -tokenPos t = let AlexPn _ line col = snd t in (line, col) - -type StartCode = Int -type Action = AlexPosn -> String -> StartCode -> (StartCode -> [LToken]) -> [LToken] - -tokenise :: String -> (Int, Int) -> [LToken] -tokenise str (line, col) = go (posn,'\n',[],eofHack str) para - where posn = AlexPn 0 line col - go inp@(pos,_,_,str) sc = - case alexScan inp sc of - AlexEOF -> [] - AlexError _ -> [] - AlexSkip inp' len -> go inp' sc - AlexToken inp' len act -> act pos (take len str) sc (\sc -> go inp' sc) - --- NB. we add a final \n to the string, (see comment in the beginning of line --- production above). -eofHack str = str++"\n" - -andBegin :: Action -> StartCode -> Action -andBegin act new_sc = \pos str _ cont -> act pos str new_sc cont - -token :: Token -> Action -token t = \pos _ sc cont -> (t, pos) : cont sc - -strtoken, strtokenNL :: (String -> Token) -> Action -strtoken t = \pos str sc cont -> (t str, pos) : cont sc -strtokenNL t = \pos str sc cont -> (t (filter (/= '\r') str), pos) : cont sc --- ^ We only want LF line endings in our internal doc string format, so we --- filter out all CRs. - -begin :: StartCode -> Action -begin sc = \_ _ _ cont -> cont sc - -} diff --git a/src/Text/Pandoc/Readers/Haddock/Parse.y b/src/Text/Pandoc/Readers/Haddock/Parse.y deleted file mode 100644 index 9c2bbc8a9..000000000 --- a/src/Text/Pandoc/Readers/Haddock/Parse.y +++ /dev/null @@ -1,178 +0,0 @@ --- This code was copied from the 'haddock' package, modified, and integrated --- into Pandoc by David Lazar. -{ -{-# LANGUAGE BangPatterns #-} -- required for versions of Happy before 1.18.6 --- The above warning supression flag is a temporary kludge. --- While working on this module you are encouraged to remove it and fix --- any warnings in the module. See --- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings --- for details - -module Text.Pandoc.Readers.Haddock.Parse (parseString, parseParas) where - -import Text.Pandoc.Readers.Haddock.Lex -import Text.Pandoc.Builder -import Text.Pandoc.Shared (trim, trimr) -import Data.Generics (everywhere, mkT) -import Data.Char (isSpace) -import Data.Maybe (fromMaybe) -import Data.List (stripPrefix, intersperse) -import Data.Monoid (mempty, mconcat) -} - -%expect 0 - -%tokentype { LToken } - -%token - '/' { (TokSpecial '/',_) } - '@' { (TokSpecial '@',_) } - '[' { (TokDefStart,_) } - ']' { (TokDefEnd,_) } - DQUO { (TokSpecial '\"',_) } - URL { (TokURL $$,_) } - PIC { (TokPic $$,_) } - ANAME { (TokAName $$,_) } - '/../' { (TokEmphasis $$,_) } - '-' { (TokBullet,_) } - '(n)' { (TokNumber,_) } - '>..' { (TokBirdTrack $$,_) } - PROP { (TokProperty $$,_) } - PROMPT { (TokExamplePrompt $$,_) } - RESULT { (TokExampleResult $$,_) } - EXP { (TokExampleExpression $$,_) } - IDENT { (TokIdent $$,_) } - PARA { (TokPara,_) } - STRING { (TokString $$,_) } - -%monad { Either [LToken] } - -%name parseParas doc -%name parseString seq - -%% - -doc :: { Blocks } - : apara PARA doc { $1 <> $3 } - | PARA doc { $2 } - | apara { $1 } - | {- empty -} { mempty } - -apara :: { Blocks } - : ulpara { bulletList [$1] } - | olpara { orderedList [$1] } - | defpara { definitionList [$1] } - | para { $1 } - -ulpara :: { Blocks } - : '-' para { $2 } - -olpara :: { Blocks } - : '(n)' para { $2 } - -defpara :: { (Inlines, [Blocks]) } - : '[' seq ']' seq { (trimInlines $2, [plain $ trimInlines $4]) } - -para :: { Blocks } - : seq { para' $1 } - | codepara { codeBlockWith ([], ["haskell"], []) $1 } - | property { $1 } - | examples { $1 } - -codepara :: { String } - : '>..' codepara { $1 ++ $2 } - | '>..' { $1 } - -property :: { Blocks } - : PROP { makeProperty $1 } - -examples :: { Blocks } - : example examples { $1 <> $2 } - | example { $1 } - -example :: { Blocks } - : PROMPT EXP result { makeExample $1 $2 (lines $3) } - | PROMPT EXP { makeExample $1 $2 [] } - -result :: { String } - : RESULT result { $1 ++ $2 } - | RESULT { $1 } - -seq :: { Inlines } - : elem seq { $1 <> $2 } - | elem { $1 } - -elem :: { Inlines } - : elem1 { $1 } - | '@' seq1 '@' { monospace $2 } - -seq1 :: { Inlines } - : PARA seq1 { linebreak <> $2 } - | elem1 seq1 { $1 <> $2 } - | elem1 { $1 } - -elem1 :: { Inlines } - : STRING { text $1 } - | '/../' { emph (str $1) } - | URL { makeHyperlink $1 } - | PIC { image $1 $1 mempty } - | ANAME { mempty } -- TODO - | IDENT { codeWith ([], ["haskell"], []) $1 } - | DQUO strings DQUO { codeWith ([], ["haskell"], []) $2 } - -strings :: { String } - : STRING { $1 } - | STRING strings { $1 ++ $2 } - -{ -happyError :: [LToken] -> Either [LToken] a -happyError toks = Left toks - -para' :: Inlines -> Blocks -para' = para . trimInlines - -monospace :: Inlines -> Inlines -monospace = everywhere (mkT go) - where - go (Str s) = Code nullAttr s - go x = x - --- | Create a `Hyperlink` from given string. --- --- A hyperlink consists of a URL and an optional label. The label is separated --- from the url by one or more whitespace characters. -makeHyperlink :: String -> Inlines -makeHyperlink input = case break isSpace $ trim input of - (url, "") -> link url url (str url) - (url, lb) -> link url url (trimInlines $ text lb) - -makeProperty :: String -> Blocks -makeProperty s = case trim s of - 'p':'r':'o':'p':'>':xs -> - codeBlockWith ([], ["property"], []) (dropWhile isSpace xs) - xs -> - error $ "makeProperty: invalid input " ++ show xs - --- | Create an 'Example', stripping superfluous characters as appropriate -makeExample :: String -> String -> [String] -> Blocks -makeExample prompt expression result = - para $ codeWith ([], ["haskell","expr"], []) (trim expression) - <> linebreak - <> (mconcat $ intersperse linebreak $ map coder result') - where - -- 1. drop trailing whitespace from the prompt, remember the prefix - prefix = takeWhile isSpace prompt - - -- 2. drop, if possible, the exact same sequence of whitespace - -- characters from each result line - -- - -- 3. interpret lines that only contain the string "<BLANKLINE>" as an - -- empty line - result' = map (substituteBlankLine . tryStripPrefix prefix) result - where - tryStripPrefix xs ys = fromMaybe ys $ stripPrefix xs ys - - substituteBlankLine "<BLANKLINE>" = "" - substituteBlankLine line = line - coder = codeWith ([], ["result"], []) -} diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 51271edc5..9420d602f 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -1,6 +1,6 @@ {-# LANGUAGE ScopedTypeVariables, OverloadedStrings #-} {- -Copyright (C) 2006-2012 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.LaTeX - Copyright : Copyright (C) 2006-2012 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -31,6 +31,7 @@ Conversion of LaTeX to 'Pandoc' document. module Text.Pandoc.Readers.LaTeX ( readLaTeX, rawLaTeXInline, rawLaTeXBlock, + inlineCommand, handleIncludes ) where @@ -42,6 +43,7 @@ import Text.Pandoc.Parsing hiding ((<|>), many, optional, space, mathDisplay, mathInline) import qualified Text.Pandoc.UTF8 as UTF8 import Data.Char ( chr, ord ) +import Control.Monad.Trans (lift) import Control.Monad import Text.Pandoc.Builder import Data.Char (isLetter, isAlphaNum) @@ -101,7 +103,7 @@ dimenarg = try $ do sp :: LP () sp = skipMany1 $ satisfy (\c -> c == ' ' || c == '\t') - <|> (try $ newline >>~ lookAhead anyChar >>~ notFollowedBy blankline) + <|> (try $ newline <* lookAhead anyChar <* notFollowedBy blankline) isLowerHex :: Char -> Bool isLowerHex x = x >= '0' && x <= '9' || x >= 'a' && x <= 'f' @@ -123,7 +125,7 @@ comment :: LP () comment = do char '%' skipMany (satisfy (/='\n')) - newline + optional newline return () bgroup :: LP () @@ -302,6 +304,13 @@ blockCommands = M.fromList $ , ("item", skipopts *> loose_item) , ("documentclass", skipopts *> braced *> preamble) , ("centerline", (para . trimInlines) <$> (skipopts *> tok)) + , ("caption", skipopts *> tok >>= setCaption) + , ("PandocStartInclude", startInclude) + , ("PandocEndInclude", endInclude) + , ("bibliography", mempty <$ (skipopts *> braced >>= + addMeta "bibliography" . splitBibs)) + , ("addbibresource", mempty <$ (skipopts *> braced >>= + addMeta "bibliography" . splitBibs)) ] ++ map ignoreBlocks -- these commands will be ignored unless --parse-raw is specified, -- in which case they will appear as raw latex blocks @@ -309,7 +318,7 @@ blockCommands = M.fromList $ -- newcommand, etc. should be parsed by macro, but we need this -- here so these aren't parsed as inline commands to ignore , "special", "pdfannot", "pdfstringdef" - , "bibliography", "bibliographystyle" + , "bibliographystyle" , "maketitle", "makeindex", "makeglossary" , "addcontentsline", "addtocontents", "addtocounter" -- \ignore{} is used conventionally in literate haskell for definitions @@ -321,7 +330,19 @@ blockCommands = M.fromList $ ] addMeta :: ToMetaValue a => String -> a -> LP () -addMeta field val = updateState $ setMeta field val +addMeta field val = updateState $ \st -> + st{ stateMeta = addMetaField field val $ stateMeta st } + +splitBibs :: String -> [Inlines] +splitBibs = map (str . flip replaceExtension "bib" . trim) . splitBy (==',') + +setCaption :: Inlines -> LP Blocks +setCaption ils = do + updateState $ \st -> st{ stateCaption = Just ils } + return mempty + +resetCaption :: LP () +resetCaption = updateState $ \st -> st{ stateCaption = Nothing } authors :: LP () authors = try $ do @@ -332,7 +353,7 @@ authors = try $ do -- skip e.g. \vspace{10pt} auths <- sepBy oneAuthor (controlSeq "and") char '}' - addMeta "authors" (map trimInlines auths) + addMeta "author" (map trimInlines auths) section :: Attr -> Int -> LP Blocks section (ident, classes, kvs) lvl = do @@ -375,18 +396,18 @@ isBlockCommand s = maybe False (const True) $ M.lookup s blockCommands inlineCommands :: M.Map String (LP Inlines) inlineCommands = M.fromList $ - [ ("emph", emph <$> tok) - , ("textit", emph <$> tok) - , ("textsl", emph <$> tok) - , ("textsc", smallcaps <$> tok) - , ("sout", strikeout <$> tok) - , ("textsuperscript", superscript <$> tok) - , ("textsubscript", subscript <$> tok) + [ ("emph", extractSpaces emph <$> tok) + , ("textit", extractSpaces emph <$> tok) + , ("textsl", extractSpaces emph <$> tok) + , ("textsc", extractSpaces smallcaps <$> tok) + , ("sout", extractSpaces strikeout <$> tok) + , ("textsuperscript", extractSpaces superscript <$> tok) + , ("textsubscript", extractSpaces subscript <$> tok) , ("textbackslash", lit "\\") , ("backslash", lit "\\") , ("slash", lit "/") - , ("textbf", strong <$> tok) - , ("textnormal", spanWith ("",["nodecor"],[]) <$> tok) + , ("textbf", extractSpaces strong <$> tok) + , ("textnormal", extractSpaces (spanWith ("",["nodecor"],[])) <$> tok) , ("ldots", lit "…") , ("dots", lit "…") , ("mdots", lit "…") @@ -406,15 +427,15 @@ inlineCommands = M.fromList $ , ("{", lit "{") , ("}", lit "}") -- old TeX commands - , ("em", emph <$> inlines) - , ("it", emph <$> inlines) - , ("sl", emph <$> inlines) - , ("bf", strong <$> inlines) + , ("em", extractSpaces emph <$> inlines) + , ("it", extractSpaces emph <$> inlines) + , ("sl", extractSpaces emph <$> inlines) + , ("bf", extractSpaces strong <$> inlines) , ("rm", inlines) - , ("itshape", emph <$> inlines) - , ("slshape", emph <$> inlines) - , ("scshape", smallcaps <$> inlines) - , ("bfseries", strong <$> inlines) + , ("itshape", extractSpaces emph <$> inlines) + , ("slshape", extractSpaces emph <$> inlines) + , ("scshape", extractSpaces smallcaps <$> inlines) + , ("bfseries", extractSpaces strong <$> inlines) , ("/", pure mempty) -- italic correction , ("aa", lit "å") , ("AA", lit "Å") @@ -473,6 +494,7 @@ inlineCommands = M.fromList $ , ("citealp", citation "citealp" NormalCitation False) , ("citealp*", citation "citealp*" NormalCitation False) , ("autocite", citation "autocite" NormalCitation False) + , ("smartcite", citation "smartcite" NormalCitation False) , ("footcite", inNote <$> citation "footcite" NormalCitation False) , ("parencite", citation "parencite" NormalCitation False) , ("supercite", citation "supercite" NormalCitation False) @@ -495,6 +517,7 @@ inlineCommands = M.fromList $ , ("supercites", citation "supercites" NormalCitation True) , ("footcitetexts", inNote <$> citation "footcitetexts" NormalCitation True) , ("Autocite", citation "Autocite" NormalCitation False) + , ("Smartcite", citation "Smartcite" NormalCitation False) , ("Footcite", citation "Footcite" NormalCitation False) , ("Parencite", citation "Parencite" NormalCitation False) , ("Supercite", citation "Supercite" NormalCitation False) @@ -516,25 +539,21 @@ inlineCommands = M.fromList $ , ("citeauthor", (try (tok *> optional sp *> controlSeq "citetext") *> complexNatbibCitation AuthorInText) <|> citation "citeauthor" AuthorInText False) + , ("nocite", mempty <$ (citation "nocite" NormalCitation False >>= + addMeta "nocite")) ] ++ map ignoreInlines -- these commands will be ignored unless --parse-raw is specified, -- in which case they will appear as raw latex blocks: - [ "noindent", "index", "nocite" ] + [ "noindent", "index" ] mkImage :: String -> LP Inlines mkImage src = do - -- try for a caption - (alt, tit) <- option (str "image", "") $ try $ do - spaces - controlSeq "caption" - optional (char '*') - ils <- grouped inline - return (ils, "fig:") + let alt = str "image" case takeExtension src of "" -> do defaultExt <- getOption readerDefaultImageExtension - return $ image (addExtension src defaultExt) tit alt - _ -> return $ image src tit alt + return $ image (addExtension src defaultExt) "" alt + _ -> return $ image src "" alt inNote :: Inlines -> Inlines inNote ils = @@ -788,31 +807,107 @@ rawEnv name = do (withRaw (env name blocks) >>= applyMacros' . snd) else env name blocks +---- + +type IncludeParser = ParserT [Char] [String] IO String + -- | Replace "include" commands with file contents. handleIncludes :: String -> IO String -handleIncludes = handleIncludes' [] - --- parents parameter prevents infinite include loops -handleIncludes' :: [FilePath] -> String -> IO String -handleIncludes' _ [] = return [] -handleIncludes' parents ('\\':'%':xs) = - ("\\%"++) `fmap` handleIncludes' parents xs -handleIncludes' parents ('%':xs) = handleIncludes' parents - $ drop 1 $ dropWhile (/='\n') xs -handleIncludes' parents ('\\':xs) = - case runParser include defaultParserState "input" ('\\':xs) of - Right (fs, rest) -> do yss <- mapM (\f -> if f `elem` parents - then "" <$ warn ("Include file loop in '" - ++ f ++ "'.") - else readTeXFile f >>= - handleIncludes' (f:parents)) fs - rest' <- handleIncludes' parents rest - return $ intercalate "\n" yss ++ rest' - _ -> case runParser (verbCmd <|> verbatimEnv) defaultParserState - "input" ('\\':xs) of - Right (r, rest) -> (r ++) `fmap` handleIncludes' parents rest - _ -> ('\\':) `fmap` handleIncludes' parents xs -handleIncludes' parents (x:xs) = (x:) `fmap` handleIncludes' parents xs +handleIncludes s = do + res <- runParserT includeParser' [] "input" s + case res of + Right s' -> return s' + Left e -> error $ show e + +includeParser' :: IncludeParser +includeParser' = + concat <$> many (comment' <|> escaped' <|> blob' <|> include' + <|> startMarker' <|> endMarker' + <|> verbCmd' <|> verbatimEnv' <|> backslash') + +comment' :: IncludeParser +comment' = do + char '%' + xs <- manyTill anyChar newline + return ('%':xs ++ "\n") + +escaped' :: IncludeParser +escaped' = try $ string "\\%" <|> string "\\\\" + +verbCmd' :: IncludeParser +verbCmd' = fmap snd <$> + withRaw $ try $ do + string "\\verb" + c <- anyChar + manyTill anyChar (char c) + +verbatimEnv' :: IncludeParser +verbatimEnv' = fmap snd <$> + withRaw $ try $ do + string "\\begin" + name <- braced' + guard $ name `elem` ["verbatim", "Verbatim", "lstlisting", + "minted", "alltt"] + manyTill anyChar (try $ string $ "\\end{" ++ name ++ "}") + +blob' :: IncludeParser +blob' = try $ many1 (noneOf "\\%") + +backslash' :: IncludeParser +backslash' = string "\\" + +braced' :: IncludeParser +braced' = try $ char '{' *> manyTill (satisfy (/='}')) (char '}') + +include' :: IncludeParser +include' = do + fs' <- try $ do + char '\\' + name <- try (string "include") + <|> try (string "input") + <|> string "usepackage" + -- skip options + skipMany $ try $ char '[' *> (manyTill anyChar (char ']')) + fs <- (map trim . splitBy (==',')) <$> braced' + return $ if name == "usepackage" + then map (flip replaceExtension ".sty") fs + else map (flip replaceExtension ".tex") fs + pos <- getPosition + containers <- getState + let fn = case containers of + (f':_) -> f' + [] -> "input" + -- now process each include file in order... + rest <- getInput + results' <- forM fs' (\f -> do + when (f `elem` containers) $ + fail "Include file loop!" + contents <- lift $ readTeXFile f + return $ "\\PandocStartInclude{" ++ f ++ "}" ++ + contents ++ "\\PandocEndInclude{" ++ + fn ++ "}{" ++ show (sourceLine pos) ++ "}{" + ++ show (sourceColumn pos) ++ "}") + setInput $ concat results' ++ rest + return "" + +startMarker' :: IncludeParser +startMarker' = try $ do + string "\\PandocStartInclude" + fn <- braced' + updateState (fn:) + setPosition $ newPos fn 1 1 + return $ "\\PandocStartInclude{" ++ fn ++ "}" + +endMarker' :: IncludeParser +endMarker' = try $ do + string "\\PandocEndInclude" + fn <- braced' + ln <- braced' + co <- braced' + updateState tail + setPosition $ newPos fn (fromMaybe 1 $ safeRead ln) (fromMaybe 1 $ safeRead co) + return $ "\\PandocEndInclude{" ++ fn ++ "}{" ++ ln ++ "}{" ++ + co ++ "}" readTeXFile :: FilePath -> IO String readTeXFile f = do @@ -827,27 +922,7 @@ readFileFromDirs (d:ds) f = E.catch (UTF8.readFile $ d </> f) $ \(_ :: E.SomeException) -> readFileFromDirs ds f -include :: LP ([FilePath], String) -include = do - name <- controlSeq "include" - <|> controlSeq "input" - <|> controlSeq "usepackage" - skipopts - fs <- (splitBy (==',')) <$> braced - rest <- getInput - let fs' = if name == "usepackage" - then map (flip replaceExtension ".sty") fs - else map (flip replaceExtension ".tex") fs - return (fs', rest) - -verbCmd :: LP (String, String) -verbCmd = do - (_,r) <- withRaw $ do - controlSeq "verb" - c <- anyChar - manyTill anyChar (char c) - rest <- getInput - return (r, rest) +---- keyval :: LP (String, String) keyval = try $ do @@ -869,17 +944,6 @@ alltt t = walk strToCode <$> parseFromString blocks where strToCode (Str s) = Code nullAttr s strToCode x = x -verbatimEnv :: LP (String, String) -verbatimEnv = do - (_,r) <- withRaw $ do - controlSeq "begin" - name <- braced - guard $ name `elem` ["verbatim", "Verbatim", "lstlisting", - "minted", "alltt"] - verbEnv name - rest <- getInput - return (r,rest) - rawLaTeXBlock :: Parser [Char] ParserState String rawLaTeXBlock = snd <$> try (withRaw (environment <|> blockCommand)) @@ -888,12 +952,33 @@ rawLaTeXInline = do raw <- (snd <$> withRaw inlineCommand) <|> (snd <$> withRaw blockCommand) RawInline "latex" <$> applyMacros' raw +addImageCaption :: Blocks -> LP Blocks +addImageCaption = walkM go + where go (Image alt (src,tit)) = do + mbcapt <- stateCaption <$> getState + case mbcapt of + Just ils -> return (Image (toList ils) (src, "fig:")) + Nothing -> return (Image alt (src,tit)) + go x = return x + +addTableCaption :: Blocks -> LP Blocks +addTableCaption = walkM go + where go (Table c als ws hs rs) = do + mbcapt <- stateCaption <$> getState + case mbcapt of + Just ils -> return (Table (toList ils) als ws hs rs) + Nothing -> return (Table c als ws hs rs) + go x = return x + environments :: M.Map String (LP Blocks) environments = M.fromList [ ("document", env "document" blocks <* skipMany anyChar) , ("letter", env "letter" letter_contents) - , ("figure", env "figure" $ skipopts *> blocks) + , ("figure", env "figure" $ + resetCaption *> skipopts *> blocks >>= addImageCaption) , ("center", env "center" blocks) + , ("table", env "table" $ + resetCaption *> skipopts *> blocks >>= addTableCaption) , ("tabular", env "tabular" simpTable) , ("quote", blockQuote <$> env "quote" blocks) , ("quotation", blockQuote <$> env "quotation" blocks) @@ -1050,7 +1135,7 @@ paragraph = do preamble :: LP Blocks preamble = mempty <$> manyTill preambleBlock beginDoc - where beginDoc = lookAhead $ controlSeq "begin" *> string "{document}" + where beginDoc = lookAhead $ try $ controlSeq "begin" *> string "{document}" preambleBlock = (void comment) <|> (void sp) <|> (void blanklines) @@ -1144,12 +1229,13 @@ complexNatbibCitation mode = try $ do parseAligns :: LP [Alignment] parseAligns = try $ do char '{' - let maybeBar = try $ spaces >> optional (char '|') + let maybeBar = skipMany $ sp <|> () <$ char '|' <|> () <$ try (string "@{}") maybeBar let cAlign = AlignCenter <$ char 'c' let lAlign = AlignLeft <$ char 'l' let rAlign = AlignRight <$ char 'r' - let alignChar = optional sp *> (cAlign <|> lAlign <|> rAlign) + let parAlign = AlignLeft <$ (char 'p' >> braced) + let alignChar = cAlign <|> lAlign <|> rAlign <|> parAlign aligns' <- sepEndBy alignChar maybeBar spaces char '}' @@ -1170,10 +1256,14 @@ parseTableRow :: Int -- ^ number of columns parseTableRow cols = try $ do let tableCellInline = notFollowedBy (amp <|> lbreak) >> inline let tableCell = (plain . trimInlines . mconcat) <$> many tableCellInline - cells' <- sepBy tableCell amp - guard $ length cells' == cols + cells' <- sepBy1 tableCell amp + let numcells = length cells' + guard $ numcells <= cols && numcells >= 1 + guard $ cells' /= [mempty] + -- note: a & b in a three-column table leaves an empty 3rd cell: + let cells'' = cells' ++ replicate (cols - numcells) mempty spaces - return cells' + return cells'' simpTable :: LP Blocks simpTable = try $ do @@ -1184,9 +1274,23 @@ simpTable = try $ do header' <- option [] $ try (parseTableRow cols <* lbreak <* hline) rows <- sepEndBy (parseTableRow cols) (lbreak <* optional hline) spaces + skipMany (comment *> spaces) let header'' = if null header' then replicate cols mempty else header' lookAhead $ controlSeq "end" -- make sure we're at end return $ table mempty (zip aligns (repeat 0)) header'' rows +startInclude :: LP Blocks +startInclude = do + fn <- braced + setPosition $ newPos fn 1 1 + return mempty + +endInclude :: LP Blocks +endInclude = do + fn <- braced + ln <- braced + co <- braced + setPosition $ newPos fn (fromMaybe 1 $ safeRead ln) (fromMaybe 1 $ safeRead co) + return mempty diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 8a41cef49..2ca3b0eb6 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -1,6 +1,6 @@ {-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7 {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Markdown - Copyright : Copyright (C) 2006-2013 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -33,6 +33,7 @@ module Text.Pandoc.Readers.Markdown ( readMarkdown, import Data.List ( transpose, sortBy, findIndex, intersperse, intercalate ) import qualified Data.Map as M +import Data.Scientific (coefficient, base10Exponent) import Data.Ord ( comparing ) import Data.Char ( isAlphaNum, toLower ) import Data.Maybe @@ -60,6 +61,8 @@ import System.FilePath (takeExtension, addExtension) import Text.HTML.TagSoup import Text.HTML.TagSoup.Match (tagOpen) import qualified Data.Set as Set +import Text.Printf (printf) +import Debug.Trace (trace) type MarkdownParser = Parser [Char] ParserState @@ -76,11 +79,7 @@ readMarkdownWithWarnings :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse (assuming @'\n'@ line endings) -> (Pandoc, [String]) readMarkdownWithWarnings opts s = - (readWith parseMarkdownWithWarnings) def{ stateOptions = opts } (s ++ "\n\n") - where parseMarkdownWithWarnings = do - doc <- parseMarkdown - warnings <- stateWarnings <$> getState - return (doc, warnings) + (readWithWarnings parseMarkdown) def{ stateOptions = opts } (s ++ "\n\n") trimInlinesF :: F Inlines -> F Inlines trimInlinesF = liftM trimInlines @@ -114,6 +113,12 @@ isBlank _ = False -- auxiliary functions -- +-- | Succeeds when we're in list context. +inList :: MarkdownParser () +inList = do + ctx <- stateParserContext <$> getState + guard (ctx == ListItemState) + isNull :: F Inlines -> Bool isNull ils = B.isNull $ runF ils def @@ -138,14 +143,16 @@ nonindentSpaces = do then return sps else unexpected "indented line" -skipNonindentSpaces :: MarkdownParser () +-- returns number of spaces parsed +skipNonindentSpaces :: MarkdownParser Int skipNonindentSpaces = do tabStop <- getOption readerTabStop - atMostSpaces (tabStop - 1) + atMostSpaces (tabStop - 1) <* notFollowedBy (char ' ') -atMostSpaces :: Int -> MarkdownParser () -atMostSpaces 0 = notFollowedBy (char ' ') -atMostSpaces n = (char ' ' >> atMostSpaces (n-1)) <|> return () +atMostSpaces :: Int -> MarkdownParser Int +atMostSpaces n + | n > 0 = (char ' ' >> (+1) <$> atMostSpaces (n-1)) <|> return 0 + | otherwise = return 0 litChar :: MarkdownParser Char litChar = escapedChar' @@ -283,7 +290,11 @@ toMetaValue opts x = yamlToMeta :: ReaderOptions -> Yaml.Value -> MetaValue yamlToMeta opts (Yaml.String t) = toMetaValue opts t -yamlToMeta _ (Yaml.Number n) = MetaString $ show n +yamlToMeta _ (Yaml.Number n) + -- avoid decimal points for numbers that don't need them: + | base10Exponent n >= 0 = MetaString $ show + $ coefficient n * (10 ^ base10Exponent n) + | otherwise = MetaString $ show n yamlToMeta _ (Yaml.Bool b) = MetaBool b yamlToMeta opts (Yaml.Array xs) = B.toMetaValue $ map (yamlToMeta opts) $ V.toList xs @@ -328,12 +339,6 @@ parseMarkdown = do let Pandoc _ bs = B.doc $ runF blocks st return $ Pandoc meta bs -addWarning :: Maybe SourcePos -> String -> MarkdownParser () -addWarning mbpos msg = - updateState $ \st -> st{ - stateWarnings = (msg ++ maybe "" (\pos -> " " ++ show pos) mbpos) : - stateWarnings st } - referenceKey :: MarkdownParser (F Blocks) referenceKey = try $ do pos <- getPosition @@ -342,19 +347,16 @@ referenceKey = try $ do char ':' skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[') let sourceURL = liftM unwords $ many $ try $ do - notFollowedBy' referenceTitle - skipMany spaceChar - optional $ newline >> notFollowedBy blankline skipMany spaceChar + notFollowedBy' referenceTitle notFollowedBy' (() <$ reference) many1 $ notFollowedBy space >> litChar - let betweenAngles = try $ char '<' >> - manyTill (escapedChar' <|> litChar) (char '>') + let betweenAngles = try $ char '<' >> manyTill litChar (char '>') src <- try betweenAngles <|> sourceURL tit <- option "" referenceTitle -- currently we just ignore MMD-style link/image attributes _kvs <- option [] $ guardEnabled Ext_link_attributes - >> many (spnl >> keyValAttr) + >> many (try $ spnl >> keyValAttr) blanklines let target = (escapeURI $ trimr src, tit) st <- getState @@ -440,7 +442,10 @@ parseBlocks :: MarkdownParser (F Blocks) parseBlocks = mconcat <$> manyTill block eof block :: MarkdownParser (F Blocks) -block = choice [ mempty <$ blanklines +block = do + tr <- getOption readerTrace + pos <- getPosition + res <- choice [ mempty <$ blanklines , codeBlockFenced , yamlMetaBlock , guardEnabled Ext_latex_macros *> (macro >>= return . return) @@ -465,6 +470,11 @@ block = choice [ mempty <$ blanklines , para , plain ] <?> "block" + when tr $ do + st <- getState + trace (printf "line %d: %s" (sourceLine pos) + (take 60 $ show $ B.toList $ runF res st)) (return ()) + return res -- -- header blocks @@ -558,7 +568,7 @@ attributes :: MarkdownParser Attr attributes = try $ do char '{' spnl - attrs <- many (attribute >>~ spnl) + attrs <- many (attribute <* spnl) char '}' return $ foldl (\x f -> f x) nullAttr attrs @@ -605,12 +615,19 @@ codeBlockFenced = try $ do skipMany spaceChar attr <- option ([],[],[]) $ try (guardEnabled Ext_fenced_code_attributes >> attributes) - <|> ((\x -> ("",[x],[])) <$> identifier) + <|> ((\x -> ("",[toLanguageId x],[])) <$> many1 nonspaceChar) blankline contents <- manyTill anyLine (blockDelimiter (== c) (Just size)) blanklines return $ return $ B.codeBlockWith attr $ intercalate "\n" contents +-- correctly handle github language identifiers +toLanguageId :: String -> String +toLanguageId = map toLower . go + where go "c++" = "cpp" + go "objective-c" = "objectivec" + go x = x + codeBlockIndented :: MarkdownParser (F Blocks) codeBlockIndented = do contents <- many1 (indentedLine <|> @@ -668,7 +685,7 @@ birdTrackLine c = try $ do -- emailBlockQuoteStart :: MarkdownParser Char -emailBlockQuoteStart = try $ skipNonindentSpaces >> char '>' >>~ optional (char ' ') +emailBlockQuoteStart = try $ skipNonindentSpaces >> char '>' <* optional (char ' ') emailBlockQuote :: MarkdownParser [String] emailBlockQuote = try $ do @@ -698,54 +715,64 @@ blockQuote = do bulletListStart :: MarkdownParser () bulletListStart = try $ do optional newline -- if preceded by a Plain block in a list context + startpos <- sourceColumn <$> getPosition skipNonindentSpaces notFollowedBy' (() <$ hrule) -- because hrules start out just like lists satisfy isBulletListMarker - spaceChar <|> lookAhead newline - skipSpaces + endpos <- sourceColumn <$> getPosition + tabStop <- getOption readerTabStop + lookAhead (newline <|> spaceChar) + () <$ atMostSpaces (tabStop - (endpos - startpos)) anyOrderedListStart :: MarkdownParser (Int, ListNumberStyle, ListNumberDelim) anyOrderedListStart = try $ do optional newline -- if preceded by a Plain block in a list context + startpos <- sourceColumn <$> getPosition skipNonindentSpaces notFollowedBy $ string "p." >> spaceChar >> digit -- page number - (guardDisabled Ext_fancy_lists >> - do many1 digit - char '.' - spaceChar - return (1, DefaultStyle, DefaultDelim)) - <|> do (num, style, delim) <- anyOrderedListMarker - -- if it could be an abbreviated first name, insist on more than one space - if delim == Period && (style == UpperAlpha || (style == UpperRoman && - num `elem` [1, 5, 10, 50, 100, 500, 1000])) - then char '\t' <|> (try $ char ' ' >> spaceChar) - else spaceChar - skipSpaces - return (num, style, delim) + res <- do guardDisabled Ext_fancy_lists + start <- many1 digit >>= safeRead + char '.' + return (start, DefaultStyle, DefaultDelim) + <|> do (num, style, delim) <- anyOrderedListMarker + -- if it could be an abbreviated first name, + -- insist on more than one space + when (delim == Period && (style == UpperAlpha || + (style == UpperRoman && + num `elem` [1, 5, 10, 50, 100, 500, 1000]))) $ + () <$ spaceChar + return (num, style, delim) + endpos <- sourceColumn <$> getPosition + tabStop <- getOption readerTabStop + lookAhead (newline <|> spaceChar) + atMostSpaces (tabStop - (endpos - startpos)) + return res listStart :: MarkdownParser () listStart = bulletListStart <|> (anyOrderedListStart >> return ()) --- parse a line of a list item (start = parser for beginning of list item) listLine :: MarkdownParser String listLine = try $ do notFollowedBy' (do indentSpaces many spaceChar listStart) - notFollowedBy' $ htmlTag (~== TagClose "div") - chunks <- manyTill + notFollowedByHtmlCloser + optional (() <$ indentSpaces) + listLineCommon + +listLineCommon :: MarkdownParser String +listLineCommon = concat <$> manyTill ( many1 (satisfy $ \c -> c /= '\n' && c /= '<') <|> liftM snd (htmlTag isCommentTag) <|> count 1 anyChar ) newline - return $ concat chunks -- parse raw text for one list item, excluding start marker and continuations rawListItem :: MarkdownParser a -> MarkdownParser String rawListItem start = try $ do start - first <- listLine + first <- listLineCommon rest <- many (notFollowedBy listStart >> notFollowedBy blankline >> listLine) blanks <- many blankline return $ unlines (first:rest) ++ blanks @@ -760,11 +787,18 @@ listContinuation = try $ do blanks <- many blankline return $ concat result ++ blanks +notFollowedByHtmlCloser :: MarkdownParser () +notFollowedByHtmlCloser = do + inHtmlBlock <- stateInHtmlBlock <$> getState + case inHtmlBlock of + Just t -> notFollowedBy' $ htmlTag (~== TagClose t) + Nothing -> return () + listContinuationLine :: MarkdownParser String listContinuationLine = try $ do notFollowedBy blankline notFollowedBy' listStart - notFollowedBy' $ htmlTag (~== TagClose "div") + notFollowedByHtmlCloser optional indentSpaces result <- anyLine return $ result ++ "\n" @@ -796,8 +830,14 @@ orderedList = try $ do items <- fmap sequence $ many1 $ listItem ( try $ do optional newline -- if preceded by Plain block in a list + startpos <- sourceColumn <$> getPosition skipNonindentSpaces - orderedListMarker style delim ) + res <- orderedListMarker style delim + endpos <- sourceColumn <$> getPosition + tabStop <- getOption readerTabStop + lookAhead (newline <|> spaceChar) + atMostSpaces (tabStop - (endpos - startpos)) + return res ) start' <- option 1 $ guardEnabled Ext_startnum >> return start return $ B.orderedListWith (start', style, delim) <$> fmap compactify' items @@ -819,53 +859,52 @@ defListMarker = do else mzero return () -definitionListItem :: MarkdownParser (F (Inlines, [Blocks])) -definitionListItem = try $ do - -- first, see if this has any chance of being a definition list: - lookAhead (anyLine >> optional blankline >> defListMarker) - term <- trimInlinesF . mconcat <$> manyTill inline newline - optional blankline - raw <- many1 defRawBlock - state <- getState - let oldContext = stateParserContext state - -- parse the extracted block, which may contain various block elements: +definitionListItem :: Bool -> MarkdownParser (F (Inlines, [Blocks])) +definitionListItem compact = try $ do + rawLine' <- anyLine + raw <- many1 $ defRawBlock compact + term <- parseFromString (trimInlinesF . mconcat <$> many inline) rawLine' contents <- mapM (parseFromString parseBlocks) raw - updateState (\st -> st {stateParserContext = oldContext}) + optional blanklines return $ liftM2 (,) term (sequence contents) -defRawBlock :: MarkdownParser String -defRawBlock = try $ do +defRawBlock :: Bool -> MarkdownParser String +defRawBlock compact = try $ do + hasBlank <- option False $ blankline >> return True defListMarker firstline <- anyLine - rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine) - trailing <- option "" blanklines - cont <- liftM concat $ many $ do - lns <- many1 $ notFollowedBy blankline >> indentSpaces >> anyLine - trl <- option "" blanklines - return $ unlines lns ++ trl - return $ firstline ++ "\n" ++ unlines rawlines ++ trailing ++ cont + let dline = try + ( do notFollowedBy blankline + if compact -- laziness not compatible with compact + then () <$ indentSpaces + else (() <$ indentSpaces) + <|> notFollowedBy defListMarker + anyLine ) + rawlines <- many dline + cont <- liftM concat $ many $ try $ do + trailing <- option "" blanklines + ln <- indentSpaces >> notFollowedBy blankline >> anyLine + lns <- many dline + return $ trailing ++ unlines (ln:lns) + return $ trimr (firstline ++ "\n" ++ unlines rawlines ++ cont) ++ + if hasBlank || not (null cont) then "\n\n" else "" definitionList :: MarkdownParser (F Blocks) -definitionList = do - guardEnabled Ext_definition_lists - items <- fmap sequence $ many1 definitionListItem +definitionList = try $ do + lookAhead (anyLine >> optional blankline >> defListMarker) + compactDefinitionList <|> normalDefinitionList + +compactDefinitionList :: MarkdownParser (F Blocks) +compactDefinitionList = do + guardEnabled Ext_compact_definition_lists + items <- fmap sequence $ many1 $ definitionListItem True return $ B.definitionList <$> fmap compactify'DL items -compactify'DL :: [(Inlines, [Blocks])] -> [(Inlines, [Blocks])] -compactify'DL items = - let defs = concatMap snd items - defBlocks = reverse $ concatMap B.toList defs - isPara (Para _) = True - isPara _ = False - in case defBlocks of - (Para x:_) -> if not $ any isPara (drop 1 defBlocks) - then let (t,ds) = last items - lastDef = B.toList $ last ds - ds' = init ds ++ - [B.fromList $ init lastDef ++ [Plain x]] - in init items ++ [(t, ds')] - else items - _ -> items +normalDefinitionList :: MarkdownParser (F Blocks) +normalDefinitionList = do + guardEnabled Ext_definition_lists + items <- fmap sequence $ many1 $ definitionListItem False + return $ B.definitionList <$> items -- -- paragraph block @@ -883,7 +922,15 @@ para = try $ do <|> (guardEnabled Ext_backtick_code_blocks >> () <$ lookAhead codeBlockFenced) <|> (guardDisabled Ext_blank_before_header >> () <$ lookAhead header) <|> (guardEnabled Ext_lists_without_preceding_blankline >> + -- Avoid creating a paragraph in a nested list. + notFollowedBy' inList >> () <$ lookAhead listStart) + <|> do guardEnabled Ext_native_divs + inHtmlBlock <- stateInHtmlBlock <$> getState + case inHtmlBlock of + Just "div" -> () <$ + lookAhead (htmlTag (~== TagClose "div")) + _ -> mzero return $ do result' <- result case B.toList result' of @@ -909,16 +956,34 @@ htmlElement = rawVerbatimBlock htmlBlock :: MarkdownParser (F Blocks) htmlBlock = do guardEnabled Ext_raw_html - res <- (guardEnabled Ext_markdown_in_html_blocks >> rawHtmlBlocks) - <|> htmlBlock' - return $ return $ B.rawBlock "html" res - -htmlBlock' :: MarkdownParser String + try (do + (TagOpen t attrs) <- lookAhead $ fst <$> htmlTag isBlockTag + (guard (t `elem` ["pre","style","script"]) >> + (return . B.rawBlock "html") <$> rawVerbatimBlock) + <|> (do guardEnabled Ext_markdown_attribute + oldMarkdownAttribute <- stateMarkdownAttribute <$> getState + markdownAttribute <- + case lookup "markdown" attrs of + Just "0" -> False <$ updateState (\st -> st{ + stateMarkdownAttribute = False }) + Just _ -> True <$ updateState (\st -> st{ + stateMarkdownAttribute = True }) + Nothing -> return oldMarkdownAttribute + res <- if markdownAttribute + then rawHtmlBlocks + else htmlBlock' + updateState $ \st -> st{ stateMarkdownAttribute = + oldMarkdownAttribute } + return res) + <|> (guardEnabled Ext_markdown_in_html_blocks >> rawHtmlBlocks)) + <|> htmlBlock' + +htmlBlock' :: MarkdownParser (F Blocks) htmlBlock' = try $ do first <- htmlElement - finalSpace <- many spaceChar - finalNewlines <- many newline - return $ first ++ finalSpace ++ finalNewlines + skipMany spaceChar + optional blanklines + return $ return $ B.rawBlock "html" first strictHtmlBlock :: MarkdownParser String strictHtmlBlock = htmlInBalanced (not . isInlineTag) @@ -929,46 +994,36 @@ rawVerbatimBlock = try $ do ["pre", "style", "script"]) (const True)) contents <- manyTill anyChar (htmlTag (~== TagClose tag)) - return $ open ++ contents ++ renderTags [TagClose tag] + return $ open ++ contents ++ renderTags' [TagClose tag] rawTeXBlock :: MarkdownParser (F Blocks) rawTeXBlock = do guardEnabled Ext_raw_tex - result <- (B.rawBlock "latex" <$> rawLaTeXBlock) - <|> (B.rawBlock "context" <$> rawConTeXtEnvironment) + result <- (B.rawBlock "latex" . concat <$> + rawLaTeXBlock `sepEndBy1` blankline) + <|> (B.rawBlock "context" . concat <$> + rawConTeXtEnvironment `sepEndBy1` blankline) spaces return $ return result -rawHtmlBlocks :: MarkdownParser String +rawHtmlBlocks :: MarkdownParser (F Blocks) rawHtmlBlocks = do - htmlBlocks <- many1 $ try $ do - s <- rawVerbatimBlock <|> try ( - do (t,raw) <- htmlTag isBlockTag - exts <- getOption readerExtensions - -- if open tag, need markdown="1" if - -- markdown_attributes extension is set - case t of - TagOpen _ as - | Ext_markdown_attribute `Set.member` - exts -> - if "markdown" `notElem` - map fst as - then mzero - else return $ - stripMarkdownAttribute raw - | otherwise -> return raw - _ -> return raw ) - sps <- do sp1 <- many spaceChar - sp2 <- option "" (blankline >> return "\n") - sp3 <- many spaceChar - sp4 <- option "" blanklines - return $ sp1 ++ sp2 ++ sp3 ++ sp4 - -- note: we want raw html to be able to - -- precede a code block, when separated - -- by a blank line - return $ s ++ sps - let combined = concat htmlBlocks - return $ if last combined == '\n' then init combined else combined + (TagOpen tagtype _, raw) <- htmlTag isBlockTag + -- try to find closing tag + -- we set stateInHtmlBlock so that closing tags that can be either block or + -- inline will not be parsed as inline tags + oldInHtmlBlock <- stateInHtmlBlock <$> getState + updateState $ \st -> st{ stateInHtmlBlock = Just tagtype } + let closer = htmlTag (\x -> x ~== TagClose tagtype) + contents <- mconcat <$> many (notFollowedBy' closer >> block) + result <- + (closer >>= \(_, rawcloser) -> return ( + return (B.rawBlock "html" $ stripMarkdownAttribute raw) <> + contents <> + return (B.rawBlock "html" rawcloser))) + <|> return (return (B.rawBlock "html" raw) <> contents) + updateState $ \st -> st{ stateInHtmlBlock = oldInHtmlBlock } + return result -- remove markdown="1" attribute stripMarkdownAttribute :: String -> String @@ -1110,13 +1165,11 @@ multilineTable headless = multilineTableHeader :: Bool -- ^ Headerless table -> MarkdownParser (F [Blocks], [Alignment], [Int]) multilineTableHeader headless = try $ do - if headless - then return '\n' - else tableSep >>~ notFollowedBy blankline + unless headless $ + tableSep >> notFollowedBy blankline rawContent <- if headless then return $ repeat "" - else many1 - (notFollowedBy tableSep >> many1Till anyChar newline) + else many1 $ notFollowedBy tableSep >> anyLine initSp <- nonindentSpaces dashes <- many1 (dashedLine '-') newline @@ -1158,7 +1211,7 @@ gridPart ch = do return (length dashes, length dashes + 1) gridDashedLines :: Char -> Parser [Char] st [(Int,Int)] -gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) >>~ blankline +gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) <* blankline removeFinalBar :: String -> String removeFinalBar = @@ -1220,11 +1273,20 @@ removeOneLeadingSpace xs = gridTableFooter :: MarkdownParser [Char] gridTableFooter = blanklines +pipeBreak :: MarkdownParser [Alignment] +pipeBreak = try $ do + nonindentSpaces + openPipe <- (True <$ char '|') <|> return False + first <- pipeTableHeaderPart + rest <- many $ sepPipe *> pipeTableHeaderPart + -- surrounding pipes needed for a one-column table: + guard $ not (null rest && not openPipe) + optional (char '|') + blankline + return (first:rest) + pipeTable :: MarkdownParser ([Alignment], [Double], F [Blocks], F [[Blocks]]) pipeTable = try $ do - let pipeBreak = nonindentSpaces *> optional (char '|') *> - pipeTableHeaderPart `sepBy1` sepPipe <* - optional (char '|') <* blankline (heads,aligns) <- try ( pipeBreak >>= \als -> return (return $ replicate (length als) mempty, als)) <|> ( pipeTableRow >>= \row -> pipeBreak >>= \als -> @@ -1243,12 +1305,13 @@ sepPipe = try $ do pipeTableRow :: MarkdownParser (F [Blocks]) pipeTableRow = do nonindentSpaces - optional (char '|') + openPipe <- (True <$ char '|') <|> return False let cell = mconcat <$> many (notFollowedBy (blankline <|> char '|') >> inline) first <- cell - sepPipe - rest <- cell `sepBy1` sepPipe + rest <- many $ sepPipe *> cell + -- surrounding pipes needed for a one-column table: + guard $ not (null rest && not openPipe) optional (char '|') blankline let cells = sequence (first:rest) @@ -1373,8 +1436,7 @@ escapedChar = do ltSign :: MarkdownParser (F Inlines) ltSign = do guardDisabled Ext_raw_html - <|> guardDisabled Ext_markdown_in_html_blocks - <|> (notFollowedBy' rawHtmlBlocks >> return ()) + <|> (notFollowedByHtmlCloser >> notFollowedBy' (htmlTag isBlockTag)) char '<' return $ return $ B.str "<" @@ -1419,54 +1481,60 @@ math = (return . B.displayMath <$> (mathDisplay >>= applyMacros')) enclosure :: Char -> MarkdownParser (F Inlines) enclosure c = do + -- we can't start an enclosure with _ if after a string and + -- the intraword_underscores extension is enabled: + guardDisabled Ext_intraword_underscores + <|> guard (c == '*') + <|> (guard =<< notAfterString) cs <- many1 (char c) (return (B.str cs) <>) <$> whitespace - <|> case length cs of + <|> do + case length cs of 3 -> three c 2 -> two c mempty 1 -> one c mempty _ -> return (return $ B.str cs) +ender :: Char -> Int -> MarkdownParser () +ender c n = try $ do + count n (char c) + guard (c == '*') + <|> guardDisabled Ext_intraword_underscores + <|> notFollowedBy alphaNum + -- Parse inlines til you hit one c or a sequence of two cs. -- If one c, emit emph and then parse two. -- If two cs, emit strong and then parse one. -- Otherwise, emit ccc then the results. three :: Char -> MarkdownParser (F Inlines) three c = do - contents <- mconcat <$> many (notFollowedBy (char c) >> inline) - (try (string [c,c,c]) >> return ((B.strong . B.emph) <$> contents)) - <|> (try (string [c,c]) >> one c (B.strong <$> contents)) - <|> (char c >> two c (B.emph <$> contents)) + contents <- mconcat <$> many (notFollowedBy (ender c 1) >> inline) + (ender c 3 >> return ((B.strong . B.emph) <$> contents)) + <|> (ender c 2 >> one c (B.strong <$> contents)) + <|> (ender c 1 >> two c (B.emph <$> contents)) <|> return (return (B.str [c,c,c]) <> contents) -- Parse inlines til you hit two c's, and emit strong. -- If you never do hit two cs, emit ** plus inlines parsed. two :: Char -> F Inlines -> MarkdownParser (F Inlines) two c prefix' = do - let ender = try $ string [c,c] - contents <- mconcat <$> many (try $ notFollowedBy ender >> inline) - (ender >> return (B.strong <$> (prefix' <> contents))) + contents <- mconcat <$> many (try $ notFollowedBy (ender c 2) >> inline) + (ender c 2 >> return (B.strong <$> (prefix' <> contents))) <|> return (return (B.str [c,c]) <> (prefix' <> contents)) -- Parse inlines til you hit a c, and emit emph. -- If you never hit a c, emit * plus inlines parsed. one :: Char -> F Inlines -> MarkdownParser (F Inlines) one c prefix' = do - contents <- mconcat <$> many ( (notFollowedBy (char c) >> inline) + contents <- mconcat <$> many ( (notFollowedBy (ender c 1) >> inline) <|> try (string [c,c] >> - notFollowedBy (char c) >> + notFollowedBy (ender c 1) >> two c mempty) ) - (char c >> return (B.emph <$> (prefix' <> contents))) + (ender c 1 >> return (B.emph <$> (prefix' <> contents))) <|> return (return (B.str [c]) <> (prefix' <> contents)) strongOrEmph :: MarkdownParser (F Inlines) -strongOrEmph = enclosure '*' <|> (checkIntraword >> enclosure '_') - where checkIntraword = do - exts <- getOption readerExtensions - when (Ext_intraword_underscores `Set.member` exts) $ do - pos <- getPosition - lastStrPos <- stateLastStrPos <$> getState - guard $ lastStrPos /= Just pos +strongOrEmph = enclosure '*' <|> enclosure '_' -- | Parses a list of inlines between start and end delimiters. inlinesBetween :: (Show b) @@ -1476,7 +1544,7 @@ inlinesBetween :: (Show b) inlinesBetween start end = (trimInlinesF . mconcat) <$> try (start >> many1Till inner end) where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) - innerSpace = try $ whitespace >>~ notFollowedBy' end + innerSpace = try $ whitespace <* notFollowedBy' end strikeout :: MarkdownParser (F Inlines) strikeout = fmap B.strikeout <$> @@ -1508,8 +1576,7 @@ nonEndline = satisfy (/='\n') str :: MarkdownParser (F Inlines) str = do result <- many1 alphaNum - pos <- getPosition - updateState $ \s -> s{ stateLastStrPos = Just pos } + updateLastStrPos let spacesToNbr = map (\c -> if c == ' ' then '\160' else c) isSmart <- getOption readerSmart if isSmart @@ -1541,14 +1608,15 @@ endline = try $ do newline notFollowedBy blankline -- parse potential list-starts differently if in a list: - st <- getState - when (stateParserContext st == ListItemState) $ notFollowedBy listStart + notFollowedBy (inList >> listStart) guardDisabled Ext_lists_without_preceding_blankline <|> notFollowedBy listStart guardEnabled Ext_blank_before_blockquote <|> notFollowedBy emailBlockQuoteStart guardEnabled Ext_blank_before_header <|> notFollowedBy (char '#') -- atx header - guardEnabled Ext_backtick_code_blocks >> + guardDisabled Ext_backtick_code_blocks <|> notFollowedBy (() <$ (lookAhead (char '`') >> codeBlockFenced)) - (guardEnabled Ext_hard_line_breaks >> return (return B.linebreak)) + notFollowedByHtmlCloser + (eof >> return mempty) + <|> (guardEnabled Ext_hard_line_breaks >> return (return B.linebreak)) <|> (guardEnabled Ext_ignore_line_breaks >> return mempty) <|> (return $ return B.space) @@ -1571,8 +1639,10 @@ source :: MarkdownParser (String, String) source = do char '(' skipSpaces - let urlChunk = try $ notFollowedBy (oneOf "\"')") >> - (parenthesizedChars <|> count 1 litChar) + let urlChunk = + try parenthesizedChars + <|> (notFollowedBy (oneOf " )") >> (count 1 litChar)) + <|> try (many1 spaceChar <* notFollowedBy (oneOf "\"')")) let sourceURL = (unwords . words . concat) <$> many urlChunk let betweenAngles = try $ char '<' >> manyTill litChar (char '>') @@ -1717,31 +1787,55 @@ inBrackets parser = do spanHtml :: MarkdownParser (F Inlines) spanHtml = try $ do - guardEnabled Ext_markdown_in_html_blocks + guardEnabled Ext_native_spans (TagOpen _ attrs, _) <- htmlTag (~== TagOpen "span" []) contents <- mconcat <$> manyTill inline (htmlTag (~== TagClose "span")) let ident = fromMaybe "" $ lookup "id" attrs let classes = maybe [] words $ lookup "class" attrs let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] - return $ B.spanWith (ident, classes, keyvals) <$> contents + case lookup "style" keyvals of + Just s | null ident && null classes && + map toLower (filter (`notElem` " \t;") s) == + "font-variant:small-caps" + -> return $ B.smallcaps <$> contents + _ -> return $ B.spanWith (ident, classes, keyvals) <$> contents divHtml :: MarkdownParser (F Blocks) divHtml = try $ do - guardEnabled Ext_markdown_in_html_blocks - (TagOpen _ attrs, _) <- htmlTag (~== TagOpen "div" []) - contents <- mconcat <$> manyTill block (htmlTag (~== TagClose "div")) - let ident = fromMaybe "" $ lookup "id" attrs - let classes = maybe [] words $ lookup "class" attrs - let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] - return $ B.divWith (ident, classes, keyvals) <$> contents + guardEnabled Ext_native_divs + (TagOpen _ attrs, rawtag) <- htmlTag (~== TagOpen "div" []) + -- we set stateInHtmlBlock so that closing tags that can be either block or + -- inline will not be parsed as inline tags + oldInHtmlBlock <- stateInHtmlBlock <$> getState + updateState $ \st -> st{ stateInHtmlBlock = Just "div" } + bls <- option "" (blankline >> option "" blanklines) + contents <- mconcat <$> + many (notFollowedBy' (htmlTag (~== TagClose "div")) >> block) + closed <- option False (True <$ htmlTag (~== TagClose "div")) + if closed + then do + updateState $ \st -> st{ stateInHtmlBlock = oldInHtmlBlock } + let ident = fromMaybe "" $ lookup "id" attrs + let classes = maybe [] words $ lookup "class" attrs + let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + return $ B.divWith (ident, classes, keyvals) <$> contents + else -- avoid backtracing + return $ return (B.rawBlock "html" (rawtag <> bls)) <> contents rawHtmlInline :: MarkdownParser (F Inlines) rawHtmlInline = do guardEnabled Ext_raw_html + inHtmlBlock <- stateInHtmlBlock <$> getState + let isCloseBlockTag t = case inHtmlBlock of + Just t' -> t ~== TagClose t' + Nothing -> False mdInHtml <- option False $ - guardEnabled Ext_markdown_in_html_blocks >> return True + ( guardEnabled Ext_markdown_in_html_blocks + <|> guardEnabled Ext_markdown_attribute + ) >> return True (_,result) <- htmlTag $ if mdInHtml - then isInlineTag + then (\x -> isInlineTag x && + not (isCloseBlockTag x)) else not . isTextTag return $ return $ B.rawInline "html" result @@ -1800,22 +1894,6 @@ normalCite = try $ do char ']' return citations -citeKey :: MarkdownParser (Bool, String) -citeKey = try $ do - -- make sure we're not right after an alphanumeric, - -- since foo@bar.baz is probably an email address - lastStrPos <- stateLastStrPos <$> getState - pos <- getPosition - guard $ lastStrPos /= Just pos - suppress_author <- option False (char '-' >> return True) - char '@' - first <- letter <|> char '_' - let regchar = satisfy (\c -> isAlphaNum c || c == '_') - let internal p = try $ p >>~ lookAhead regchar - rest <- many $ regchar <|> internal (oneOf ":.#$%&-+?<>~/") - let key = first:rest - return (suppress_author, key) - suffix :: MarkdownParser (F Inlines) suffix = try $ do hasSpace <- option False (notFollowedBy nonspaceChar >> return True) @@ -1854,7 +1932,7 @@ smart :: MarkdownParser (F Inlines) smart = do getOption readerSmart >>= guard doubleQuoted <|> singleQuoted <|> - choice (map (return . B.singleton <$>) [apostrophe, dash, ellipses]) + choice (map (return <$>) [apostrophe, dash, ellipses]) singleQuoted :: MarkdownParser (F Inlines) singleQuoted = try $ do @@ -1873,4 +1951,3 @@ doubleQuoted = try $ do (withQuoteContext InDoubleQuote $ doubleQuoteEnd >> return (fmap B.doubleQuoted . trimInlinesF $ contents)) <|> (return $ return (B.str "\8220") <> contents) - diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs index 794890eb6..e43b8a86c 100644 --- a/src/Text/Pandoc/Readers/MediaWiki.hs +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -1,7 +1,7 @@ {-# LANGUAGE RelaxedPolyRec, FlexibleInstances, TypeSynonymInstances #-} -- RelaxedPolyRec needed for inlinesBetween on GHC < 7 {- - Copyright (C) 2012 John MacFarlane <jgm@berkeley.edu> + Copyright (C) 2012-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,7 +20,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.MediaWiki - Copyright : Copyright (C) 2012 John MacFarlane + Copyright : Copyright (C) 2012-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -55,6 +55,8 @@ import qualified Data.Foldable as F import qualified Data.Map as M import Data.Char (isDigit, isSpace) import Data.Maybe (fromMaybe) +import Text.Printf (printf) +import Debug.Trace (trace) -- | Read mediawiki from an input string and return a Pandoc document. readMediaWiki :: ReaderOptions -- ^ Reader options @@ -82,16 +84,16 @@ data MWState = MWState { mwOptions :: ReaderOptions type MWParser = Parser [Char] MWState -instance HasReaderOptions MWParser where - askReaderOption f = (f . mwOptions) `fmap` getState +instance HasReaderOptions MWState where + extractReaderOptions = mwOptions -instance HasHeaderMap MWParser where - getHeaderMap = fmap mwHeaderMap getState - putHeaderMap hm = updateState $ \st -> st{ mwHeaderMap = hm } +instance HasHeaderMap MWState where + extractHeaderMap = mwHeaderMap + updateHeaderMap f st = st{ mwHeaderMap = f $ mwHeaderMap st } -instance HasIdentifierList MWParser where - getIdentifierList = fmap mwIdentifierList getState - putIdentifierList l = updateState $ \st -> st{ mwIdentifierList = l } +instance HasIdentifierList MWState where + extractIdentifierList = mwIdentifierList + updateIdentifierList f st = st{ mwIdentifierList = f $ mwIdentifierList st } -- -- auxiliary functions @@ -187,7 +189,10 @@ parseMediaWiki = do -- block :: MWParser Blocks -block = mempty <$ skipMany1 blankline +block = do + tr <- getOption readerTrace + pos <- getPosition + res <- mempty <$ skipMany1 blankline <|> table <|> header <|> hrule @@ -199,6 +204,10 @@ block = mempty <$ skipMany1 blankline <|> blockTag <|> (B.rawBlock "mediawiki" <$> template) <|> para + when tr $ + trace (printf "line %d: %s" (sourceLine pos) + (take 60 $ show $ B.toList res)) (return ()) + return res para :: MWParser Blocks para = do @@ -227,6 +236,7 @@ table = do let widths' = map (\w -> if w == 0 then defaultwidth else w) widths let cellspecs = zip (map fst cellspecs') widths' rows' <- many $ try $ rowsep *> (map snd <$> tableRow) + optional blanklines tableEnd let cols = length hdr let (headers,rows) = if hasheader @@ -275,7 +285,7 @@ tableCaption = try $ do (trimInlines . mconcat) <$> many (notFollowedBy (cellsep <|> rowsep) *> inline) tableRow :: MWParser [((Alignment, Double), Blocks)] -tableRow = try $ many tableCell +tableRow = try $ skipMany htmlComment *> many tableCell tableCell :: MWParser ((Alignment, Double), Blocks) tableCell = try $ do @@ -307,6 +317,7 @@ template :: MWParser String template = try $ do string "{{" notFollowedBy (char '{') + lookAhead $ letter <|> digit <|> char ':' let chunk = template <|> variable <|> many1 (noneOf "{}") <|> count 1 anyChar contents <- manyTill chunk (try $ string "}}") return $ "{{" ++ concat contents ++ "}}" @@ -438,7 +449,8 @@ listItem c = try $ do skipMany spaceChar first <- concat <$> manyTill listChunk newline rest <- many - (try $ string extras *> (concat <$> manyTill listChunk newline)) + (try $ string extras *> lookAhead listStartChar *> + (concat <$> manyTill listChunk newline)) contents <- parseFromString (many1 $ listItem' c) (unlines (first : rest)) case c of @@ -555,10 +567,15 @@ endline = () <$ try (newline <* notFollowedBy' header <* notFollowedBy anyListStart) +imageIdentifiers :: [MWParser ()] +imageIdentifiers = [sym (identifier ++ ":") | identifier <- identifiers] + where identifiers = ["File", "Image", "Archivo", "Datei", "Fichier", + "Bild"] + image :: MWParser Inlines image = try $ do sym "[[" - sym "File:" <|> sym "Image:" + choice imageIdentifiers fname <- many1 (noneOf "|]") _ <- many (try $ char '|' *> imageOption) caption <- (B.str fname <$ sym "]]") @@ -618,7 +635,7 @@ inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines inlinesBetween start end = (trimInlines . mconcat) <$> try (start >> many1Till inner end) where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) - innerSpace = try $ whitespace >>~ notFollowedBy' end + innerSpace = try $ whitespace <* notFollowedBy' end emph :: MWParser Inlines emph = B.emph <$> nested (inlinesBetween start end) diff --git a/src/Text/Pandoc/Readers/Native.hs b/src/Text/Pandoc/Readers/Native.hs index c5d4cb98a..f4dfa62c1 100644 --- a/src/Text/Pandoc/Readers/Native.hs +++ b/src/Text/Pandoc/Readers/Native.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2011 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2011-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Native - Copyright : Copyright (C) 2011 John MacFarlane + Copyright : Copyright (C) 2011-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs new file mode 100644 index 000000000..440b6d144 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org.hs @@ -0,0 +1,1487 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{- +Copyright (C) 2014 Albert Krewinkel <tarleb@moltkeplatz.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org + Copyright : Copyright (C) 2014 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel <tarleb@moltkeplatz.de> + +Conversion of org-mode formatted plain text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Org ( readOrg ) where + +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder ( Inlines, Blocks, HasMeta(..), (<>) + , trimInlines ) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import qualified Text.Pandoc.Parsing as P +import Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF + , newline, orderedListMarker + , parseFromString, blanklines + ) +import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline) +import Text.Pandoc.Shared (compactify', compactify'DL) +import Text.TeXMath (readTeX, writePandoc, DisplayType(..)) +import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap + +import Control.Applicative ( Applicative, pure + , (<$>), (<$), (<*>), (<*), (*>) ) +import Control.Arrow (first) +import Control.Monad (foldM, guard, liftM, liftM2, mplus, mzero, when) +import Control.Monad.Reader (Reader, runReader, ask, asks) +import Data.Char (isAlphaNum, toLower) +import Data.Default +import Data.List (intersperse, isPrefixOf, isSuffixOf) +import qualified Data.Map as M +import Data.Maybe (fromMaybe, isJust) +import Data.Monoid (Monoid, mconcat, mempty, mappend) +import Network.HTTP (urlEncode) + +-- | Parse org-mode string and return a Pandoc document. +readOrg :: ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> Pandoc +readOrg opts s = readWith parseOrg def{ orgStateOptions = opts } (s ++ "\n\n") + +type OrgParser = Parser [Char] OrgParserState + +parseOrg :: OrgParser Pandoc +parseOrg = do + blocks' <- parseBlocks + st <- getState + let meta = runF (orgStateMeta' st) st + let removeUnwantedBlocks = dropCommentTrees . filter (/= Null) + return $ Pandoc meta $ removeUnwantedBlocks (B.toList $ runF blocks' st) + +-- | Drop COMMENT headers and the document tree below those headers. +dropCommentTrees :: [Block] -> [Block] +dropCommentTrees [] = [] +dropCommentTrees blks@(b:bs) = + maybe blks (flip dropUntilHeaderAboveLevel bs) $ commentHeaderLevel b + +-- | Return the level of a header starting a comment tree and Nothing +-- otherwise. +commentHeaderLevel :: Block -> Maybe Int +commentHeaderLevel blk = + case blk of + (Header level _ ((Str "COMMENT"):_)) -> Just level + _ -> Nothing + +-- | Drop blocks until a header on or above the given level is seen +dropUntilHeaderAboveLevel :: Int -> [Block] -> [Block] +dropUntilHeaderAboveLevel n = dropWhile (not . isHeaderLevelLowerEq n) + +isHeaderLevelLowerEq :: Int -> Block -> Bool +isHeaderLevelLowerEq n blk = + case blk of + (Header level _ _) -> n >= level + _ -> False + +-- +-- Parser State for Org +-- + +type OrgNoteRecord = (String, F Blocks) +type OrgNoteTable = [OrgNoteRecord] + +type OrgBlockAttributes = M.Map String String + +type OrgLinkFormatters = M.Map String (String -> String) + +-- | Org-mode parser state +data OrgParserState = OrgParserState + { orgStateOptions :: ReaderOptions + , orgStateAnchorIds :: [String] + , orgStateBlockAttributes :: OrgBlockAttributes + , orgStateEmphasisCharStack :: [Char] + , orgStateEmphasisNewlines :: Maybe Int + , orgStateLastForbiddenCharPos :: Maybe SourcePos + , orgStateLastPreCharPos :: Maybe SourcePos + , orgStateLastStrPos :: Maybe SourcePos + , orgStateLinkFormatters :: OrgLinkFormatters + , orgStateMeta :: Meta + , orgStateMeta' :: F Meta + , orgStateNotes' :: OrgNoteTable + } + +instance HasReaderOptions OrgParserState where + extractReaderOptions = orgStateOptions + +instance HasMeta OrgParserState where + setMeta field val st = + st{ orgStateMeta = setMeta field val $ orgStateMeta st } + deleteMeta field st = + st{ orgStateMeta = deleteMeta field $ orgStateMeta st } + +instance HasLastStrPosition OrgParserState where + getLastStrPos = orgStateLastStrPos + setLastStrPos pos st = st{ orgStateLastStrPos = Just pos } + +instance Default OrgParserState where + def = defaultOrgParserState + +defaultOrgParserState :: OrgParserState +defaultOrgParserState = OrgParserState + { orgStateOptions = def + , orgStateAnchorIds = [] + , orgStateBlockAttributes = M.empty + , orgStateEmphasisCharStack = [] + , orgStateEmphasisNewlines = Nothing + , orgStateLastForbiddenCharPos = Nothing + , orgStateLastPreCharPos = Nothing + , orgStateLastStrPos = Nothing + , orgStateLinkFormatters = M.empty + , orgStateMeta = nullMeta + , orgStateMeta' = return nullMeta + , orgStateNotes' = [] + } + +recordAnchorId :: String -> OrgParser () +recordAnchorId i = updateState $ \s -> + s{ orgStateAnchorIds = i : (orgStateAnchorIds s) } + +addBlockAttribute :: String -> String -> OrgParser () +addBlockAttribute key val = updateState $ \s -> + let attrs = orgStateBlockAttributes s + in s{ orgStateBlockAttributes = M.insert key val attrs } + +lookupBlockAttribute :: String -> OrgParser (Maybe String) +lookupBlockAttribute key = + M.lookup key . orgStateBlockAttributes <$> getState + +resetBlockAttributes :: OrgParser () +resetBlockAttributes = updateState $ \s -> + s{ orgStateBlockAttributes = orgStateBlockAttributes def } + +updateLastForbiddenCharPos :: OrgParser () +updateLastForbiddenCharPos = getPosition >>= \p -> + updateState $ \s -> s{ orgStateLastForbiddenCharPos = Just p} + +updateLastPreCharPos :: OrgParser () +updateLastPreCharPos = getPosition >>= \p -> + updateState $ \s -> s{ orgStateLastPreCharPos = Just p} + +pushToInlineCharStack :: Char -> OrgParser () +pushToInlineCharStack c = updateState $ \s -> + s{ orgStateEmphasisCharStack = c:orgStateEmphasisCharStack s } + +popInlineCharStack :: OrgParser () +popInlineCharStack = updateState $ \s -> + s{ orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ s } + +surroundingEmphasisChar :: OrgParser [Char] +surroundingEmphasisChar = + take 1 . drop 1 . orgStateEmphasisCharStack <$> getState + +startEmphasisNewlinesCounting :: Int -> OrgParser () +startEmphasisNewlinesCounting maxNewlines = updateState $ \s -> + s{ orgStateEmphasisNewlines = Just maxNewlines } + +decEmphasisNewlinesCount :: OrgParser () +decEmphasisNewlinesCount = updateState $ \s -> + s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s } + +newlinesCountWithinLimits :: OrgParser Bool +newlinesCountWithinLimits = do + st <- getState + return $ ((< 0) <$> orgStateEmphasisNewlines st) /= Just True + +resetEmphasisNewlines :: OrgParser () +resetEmphasisNewlines = updateState $ \s -> + s{ orgStateEmphasisNewlines = Nothing } + +addLinkFormat :: String + -> (String -> String) + -> OrgParser () +addLinkFormat key formatter = updateState $ \s -> + let fs = orgStateLinkFormatters s + in s{ orgStateLinkFormatters = M.insert key formatter fs } + +addToNotesTable :: OrgNoteRecord -> OrgParser () +addToNotesTable note = do + oldnotes <- orgStateNotes' <$> getState + updateState $ \s -> s{ orgStateNotes' = note:oldnotes } + +-- The version Text.Pandoc.Parsing cannot be used, as we need additional parts +-- of the state saved and restored. +parseFromString :: OrgParser a -> String -> OrgParser a +parseFromString parser str' = do + oldLastPreCharPos <- orgStateLastPreCharPos <$> getState + updateState $ \s -> s{ orgStateLastPreCharPos = Nothing } + result <- P.parseFromString parser str' + updateState $ \s -> s{ orgStateLastPreCharPos = oldLastPreCharPos } + return result + + +-- +-- Adaptions and specializations of parsing utilities +-- + +newtype F a = F { unF :: Reader OrgParserState a + } deriving (Monad, Applicative, Functor) + +runF :: F a -> OrgParserState -> a +runF = runReader . unF + +askF :: F OrgParserState +askF = F ask + +asksF :: (OrgParserState -> a) -> F a +asksF f = F $ asks f + +instance Monoid a => Monoid (F a) where + mempty = return mempty + mappend = liftM2 mappend + mconcat = fmap mconcat . sequence + +trimInlinesF :: F Inlines -> F Inlines +trimInlinesF = liftM trimInlines + +returnF :: a -> OrgParser (F a) +returnF = return . return + + +-- | Like @Text.Parsec.Char.newline@, but causes additional state changes. +newline :: OrgParser Char +newline = + P.newline + <* updateLastPreCharPos + <* updateLastForbiddenCharPos + +-- | Like @Text.Parsec.Char.blanklines@, but causes additional state changes. +blanklines :: OrgParser [Char] +blanklines = + P.blanklines + <* updateLastPreCharPos + <* updateLastForbiddenCharPos + +-- +-- parsing blocks +-- + +parseBlocks :: OrgParser (F Blocks) +parseBlocks = mconcat <$> manyTill block eof + +block :: OrgParser (F Blocks) +block = choice [ mempty <$ blanklines + , optionalAttributes $ choice + [ orgBlock + , figure + , table + ] + , example + , drawer + , specialLine + , header + , return <$> hline + , list + , latexFragment + , noteBlock + , paraOrPlain + ] <?> "block" + +optionalAttributes :: OrgParser (F Blocks) -> OrgParser (F Blocks) +optionalAttributes parser = try $ + resetBlockAttributes *> parseBlockAttributes *> parser + +parseBlockAttributes :: OrgParser () +parseBlockAttributes = do + attrs <- many attribute + mapM_ (uncurry parseAndAddAttribute) attrs + where + attribute :: OrgParser (String, String) + attribute = try $ do + key <- metaLineStart *> many1Till nonspaceChar (char ':') + val <- skipSpaces *> anyLine + return (map toLower key, val) + +parseAndAddAttribute :: String -> String -> OrgParser () +parseAndAddAttribute key value = do + let key' = map toLower key + () <$ addBlockAttribute key' value + +lookupInlinesAttr :: String -> OrgParser (Maybe (F Inlines)) +lookupInlinesAttr attr = try $ do + val <- lookupBlockAttribute attr + maybe (return Nothing) + (fmap Just . parseFromString parseInlines) + val + + +-- +-- Org Blocks (#+BEGIN_... / #+END_...) +-- + +type BlockProperties = (Int, String) -- (Indentation, Block-Type) + +orgBlock :: OrgParser (F Blocks) +orgBlock = try $ do + blockProp@(_, blkType) <- blockHeaderStart + ($ blockProp) $ + case blkType of + "comment" -> withRaw' (const mempty) + "html" -> withRaw' (return . (B.rawBlock blkType)) + "latex" -> withRaw' (return . (B.rawBlock blkType)) + "ascii" -> withRaw' (return . (B.rawBlock blkType)) + "example" -> withRaw' (return . exampleCode) + "quote" -> withParsed (fmap B.blockQuote) + "verse" -> verseBlock + "src" -> codeBlock + _ -> withParsed (fmap $ divWithClass blkType) + +blockHeaderStart :: OrgParser (Int, String) +blockHeaderStart = try $ (,) <$> indent <*> blockType + where + indent = length <$> many spaceChar + blockType = map toLower <$> (stringAnyCase "#+begin_" *> orgArgWord) + +withRaw' :: (String -> F Blocks) -> BlockProperties -> OrgParser (F Blocks) +withRaw' f blockProp = (ignHeaders *> (f <$> rawBlockContent blockProp)) + +withParsed :: (F Blocks -> F Blocks) -> BlockProperties -> OrgParser (F Blocks) +withParsed f blockProp = (ignHeaders *> (f <$> parsedBlockContent blockProp)) + +ignHeaders :: OrgParser () +ignHeaders = (() <$ newline) <|> (() <$ anyLine) + +divWithClass :: String -> Blocks -> Blocks +divWithClass cls = B.divWith ("", [cls], []) + +verseBlock :: BlockProperties -> OrgParser (F Blocks) +verseBlock blkProp = try $ do + ignHeaders + content <- rawBlockContent blkProp + fmap B.para . mconcat . intersperse (pure B.linebreak) + <$> mapM (parseFromString parseInlines) (lines content) + +exportsCode :: [(String, String)] -> Bool +exportsCode attrs = not (("rundoc-exports", "none") `elem` attrs + || ("rundoc-exports", "results") `elem` attrs) + +exportsResults :: [(String, String)] -> Bool +exportsResults attrs = ("rundoc-exports", "results") `elem` attrs + || ("rundoc-exports", "both") `elem` attrs + +followingResultsBlock :: OrgParser (Maybe String) +followingResultsBlock = + optionMaybe (try $ blanklines *> stringAnyCase "#+RESULTS:" + *> blankline + *> (unlines <$> many1 exampleLine)) + +codeBlock :: BlockProperties -> OrgParser (F Blocks) +codeBlock blkProp = do + skipSpaces + (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) + id' <- fromMaybe "" <$> lookupBlockAttribute "name" + content <- rawBlockContent blkProp + resultsContent <- followingResultsBlock + let includeCode = exportsCode kv + let includeResults = exportsResults kv + let codeBlck = B.codeBlockWith ( id', classes, kv ) content + labelledBlck <- maybe (pure codeBlck) + (labelDiv codeBlck) + <$> lookupInlinesAttr "caption" + let resultBlck = pure $ maybe mempty (exampleCode) resultsContent + return $ (if includeCode then labelledBlck else mempty) + <> (if includeResults then resultBlck else mempty) + where + labelDiv blk value = + B.divWith nullAttr <$> (mappend <$> labelledBlock value + <*> pure blk) + labelledBlock = fmap (B.plain . B.spanWith ("", ["label"], [])) + +rawBlockContent :: BlockProperties -> OrgParser String +rawBlockContent (indent, blockType) = try $ + unlines . map commaEscaped <$> manyTill indentedLine blockEnder + where + indentedLine = try $ ("" <$ blankline) <|> (indentWith indent *> anyLine) + blockEnder = try $ indentWith indent *> stringAnyCase ("#+end_" <> blockType) + +parsedBlockContent :: BlockProperties -> OrgParser (F Blocks) +parsedBlockContent blkProps = try $ do + raw <- rawBlockContent blkProps + parseFromString parseBlocks (raw ++ "\n") + +-- indent by specified number of spaces (or equiv. tabs) +indentWith :: Int -> OrgParser String +indentWith num = do + tabStop <- getOption readerTabStop + if num < tabStop + then count num (char ' ') + else choice [ try (count num (char ' ')) + , try (char '\t' >> count (num - tabStop) (char ' ')) ] + +type SwitchOption = (Char, Maybe String) + +orgArgWord :: OrgParser String +orgArgWord = many1 orgArgWordChar + +-- | Parse code block arguments +-- TODO: We currently don't handle switches. +codeHeaderArgs :: OrgParser ([String], [(String, String)]) +codeHeaderArgs = try $ do + language <- skipSpaces *> orgArgWord + _ <- skipSpaces *> (try $ switch `sepBy` (many1 spaceChar)) + parameters <- manyTill blockOption newline + let pandocLang = translateLang language + return $ + if hasRundocParameters parameters + then ( [ pandocLang, rundocBlockClass ] + , map toRundocAttrib (("language", language) : parameters) + ) + else ([ pandocLang ], parameters) + where hasRundocParameters = not . null + +switch :: OrgParser SwitchOption +switch = try $ simpleSwitch <|> lineNumbersSwitch + where + simpleSwitch = (\c -> (c, Nothing)) <$> (oneOf "-+" *> letter) + lineNumbersSwitch = (\ls -> ('l', Just ls)) <$> + (string "-l \"" *> many1Till nonspaceChar (char '"')) + +translateLang :: String -> String +translateLang "C" = "c" +translateLang "C++" = "cpp" +translateLang "emacs-lisp" = "commonlisp" -- emacs lisp is not supported +translateLang "js" = "javascript" +translateLang "lisp" = "commonlisp" +translateLang "R" = "r" +translateLang "sh" = "bash" +translateLang "sqlite" = "sql" +translateLang cs = cs + +-- | Prefix used for Rundoc classes and arguments. +rundocPrefix :: String +rundocPrefix = "rundoc-" + +-- | The class-name used to mark rundoc blocks. +rundocBlockClass :: String +rundocBlockClass = rundocPrefix ++ "block" + +blockOption :: OrgParser (String, String) +blockOption = try $ (,) <$> orgArgKey <*> orgParamValue + +inlineBlockOption :: OrgParser (String, String) +inlineBlockOption = try $ (,) <$> orgArgKey <*> orgInlineParamValue + +orgArgKey :: OrgParser String +orgArgKey = try $ + skipSpaces *> char ':' + *> many1 orgArgWordChar + +orgParamValue :: OrgParser String +orgParamValue = try $ + skipSpaces *> many1 (noneOf "\t\n\r ") <* skipSpaces + +orgInlineParamValue :: OrgParser String +orgInlineParamValue = try $ + skipSpaces *> many1 (noneOf "\t\n\r ]") <* skipSpaces + +orgArgWordChar :: OrgParser Char +orgArgWordChar = alphaNum <|> oneOf "-_" + +toRundocAttrib :: (String, String) -> (String, String) +toRundocAttrib = first ("rundoc-" ++) + +commaEscaped :: String -> String +commaEscaped (',':cs@('*':_)) = cs +commaEscaped (',':cs@('#':'+':_)) = cs +commaEscaped cs = cs + +example :: OrgParser (F Blocks) +example = try $ do + return . return . exampleCode =<< unlines <$> many1 exampleLine + +exampleCode :: String -> Blocks +exampleCode = B.codeBlockWith ("", ["example"], []) + +exampleLine :: OrgParser String +exampleLine = try $ skipSpaces *> string ": " *> anyLine + +-- Drawers for properties or a logbook +drawer :: OrgParser (F Blocks) +drawer = try $ do + drawerStart + manyTill drawerLine (try drawerEnd) + return mempty + +drawerStart :: OrgParser String +drawerStart = try $ + skipSpaces *> drawerName <* skipSpaces <* P.newline + where drawerName = try $ char ':' *> validDrawerName <* char ':' + validDrawerName = stringAnyCase "PROPERTIES" + <|> stringAnyCase "LOGBOOK" + +drawerLine :: OrgParser String +drawerLine = try anyLine + +drawerEnd :: OrgParser String +drawerEnd = try $ + skipSpaces *> stringAnyCase ":END:" <* skipSpaces <* P.newline + + +-- +-- Figures +-- + +-- Figures (Image on a line by itself, preceded by name and/or caption) +figure :: OrgParser (F Blocks) +figure = try $ do + (cap, nam) <- nameAndCaption + src <- skipSpaces *> selfTarget <* skipSpaces <* P.newline + guard (isImageFilename src) + return $ do + cap' <- cap + return $ B.para $ B.image src nam cap' + where + nameAndCaption = + do + maybeCap <- lookupInlinesAttr "caption" + maybeNam <- lookupBlockAttribute "name" + guard $ isJust maybeCap || isJust maybeNam + return ( fromMaybe mempty maybeCap + , maybe mempty withFigPrefix maybeNam ) + withFigPrefix cs = + if "fig:" `isPrefixOf` cs + then cs + else "fig:" ++ cs + +-- +-- Comments, Options and Metadata +specialLine :: OrgParser (F Blocks) +specialLine = fmap return . try $ metaLine <|> commentLine + +metaLine :: OrgParser Blocks +metaLine = try $ mempty + <$ (metaLineStart *> (optionLine <|> declarationLine)) + +commentLine :: OrgParser Blocks +commentLine = try $ commentLineStart *> anyLine *> pure mempty + +-- The order, in which blocks are tried, makes sure that we're not looking at +-- the beginning of a block, so we don't need to check for it +metaLineStart :: OrgParser String +metaLineStart = try $ mappend <$> many spaceChar <*> string "#+" + +commentLineStart :: OrgParser String +commentLineStart = try $ mappend <$> many spaceChar <*> string "# " + +declarationLine :: OrgParser () +declarationLine = try $ do + key <- metaKey + inlinesF <- metaInlines + updateState $ \st -> + let meta' = B.setMeta <$> pure key <*> inlinesF <*> pure nullMeta + in st { orgStateMeta' = orgStateMeta' st <> meta' } + return () + +metaInlines :: OrgParser (F MetaValue) +metaInlines = fmap (MetaInlines . B.toList) <$> inlinesTillNewline + +metaKey :: OrgParser String +metaKey = map toLower <$> many1 (noneOf ": \n\r") + <* char ':' + <* skipSpaces + +optionLine :: OrgParser () +optionLine = try $ do + key <- metaKey + case key of + "link" -> parseLinkFormat >>= uncurry addLinkFormat + _ -> mzero + +parseLinkFormat :: OrgParser ((String, String -> String)) +parseLinkFormat = try $ do + linkType <- (:) <$> letter <*> many (alphaNum <|> oneOf "-_") <* skipSpaces + linkSubst <- parseFormat + return (linkType, linkSubst) + +-- | An ad-hoc, single-argument-only implementation of a printf-style format +-- parser. +parseFormat :: OrgParser (String -> String) +parseFormat = try $ do + replacePlain <|> replaceUrl <|> justAppend + where + -- inefficient, but who cares + replacePlain = try $ (\x -> concat . flip intersperse x) + <$> sequence [tillSpecifier 's', rest] + replaceUrl = try $ (\x -> concat . flip intersperse x . urlEncode) + <$> sequence [tillSpecifier 'h', rest] + justAppend = try $ (++) <$> rest + + rest = manyTill anyChar (eof <|> () <$ oneOf "\n\r") + tillSpecifier c = manyTill (noneOf "\n\r") (try $ string ('%':c:"")) + +-- +-- Headers +-- + +-- | Headers +header :: OrgParser (F Blocks) +header = try $ do + level <- headerStart + title <- inlinesTillNewline + return $ B.header level <$> title + +headerStart :: OrgParser Int +headerStart = try $ + (length <$> many1 (char '*')) <* many1 (char ' ') + + +-- Don't use (or need) the reader wrapper here, we want hline to be +-- @show@able. Otherwise we can't use it with @notFollowedBy'@. + +-- | Horizontal Line (five -- dashes or more) +hline :: OrgParser Blocks +hline = try $ do + skipSpaces + string "-----" + many (char '-') + skipSpaces + newline + return B.horizontalRule + +-- +-- Tables +-- + +data OrgTableRow = OrgContentRow (F [Blocks]) + | OrgAlignRow [Alignment] + | OrgHlineRow + +data OrgTable = OrgTable + { orgTableColumns :: Int + , orgTableAlignments :: [Alignment] + , orgTableHeader :: [Blocks] + , orgTableRows :: [[Blocks]] + } + +table :: OrgParser (F Blocks) +table = try $ do + lookAhead tableStart + do + rows <- tableRows + cptn <- fromMaybe (pure "") <$> lookupInlinesAttr "caption" + return $ (<$> cptn) . orgToPandocTable . normalizeTable =<< rowsToTable rows + +orgToPandocTable :: OrgTable + -> Inlines + -> Blocks +orgToPandocTable (OrgTable _ aligns heads lns) caption = + B.table caption (zip aligns $ repeat 0) heads lns + +tableStart :: OrgParser Char +tableStart = try $ skipSpaces *> char '|' + +tableRows :: OrgParser [OrgTableRow] +tableRows = try $ many (tableAlignRow <|> tableHline <|> tableContentRow) + +tableContentRow :: OrgParser OrgTableRow +tableContentRow = try $ + OrgContentRow . sequence <$> (tableStart *> manyTill tableContentCell newline) + +tableContentCell :: OrgParser (F Blocks) +tableContentCell = try $ + fmap B.plain . trimInlinesF . mconcat <$> many1Till inline endOfCell + +endOfCell :: OrgParser Char +endOfCell = try $ char '|' <|> lookAhead newline + +tableAlignRow :: OrgParser OrgTableRow +tableAlignRow = try $ + OrgAlignRow <$> (tableStart *> manyTill tableAlignCell newline) + +tableAlignCell :: OrgParser Alignment +tableAlignCell = + choice [ try $ emptyCell *> return AlignDefault + , try $ skipSpaces + *> char '<' + *> tableAlignFromChar + <* many digit + <* char '>' + <* emptyCell + ] <?> "alignment info" + where emptyCell = try $ skipSpaces *> endOfCell + +tableAlignFromChar :: OrgParser Alignment +tableAlignFromChar = try $ choice [ char 'l' *> return AlignLeft + , char 'c' *> return AlignCenter + , char 'r' *> return AlignRight + ] + +tableHline :: OrgParser OrgTableRow +tableHline = try $ + OrgHlineRow <$ (tableStart *> char '-' *> anyLine) + +rowsToTable :: [OrgTableRow] + -> F OrgTable +rowsToTable = foldM (flip rowToContent) zeroTable + where zeroTable = OrgTable 0 mempty mempty mempty + +normalizeTable :: OrgTable + -> OrgTable +normalizeTable (OrgTable cols aligns heads lns) = + let aligns' = fillColumns aligns AlignDefault + heads' = if heads == mempty + then mempty + else fillColumns heads (B.plain mempty) + lns' = map (`fillColumns` B.plain mempty) lns + fillColumns base padding = take cols $ base ++ repeat padding + in OrgTable cols aligns' heads' lns' + + +-- One or more horizontal rules after the first content line mark the previous +-- line as a header. All other horizontal lines are discarded. +rowToContent :: OrgTableRow + -> OrgTable + -> F OrgTable +rowToContent OrgHlineRow t = maybeBodyToHeader t +rowToContent (OrgAlignRow as) t = setLongestRow as =<< setAligns as t +rowToContent (OrgContentRow rf) t = do + rs <- rf + setLongestRow rs =<< appendToBody rs t + +setLongestRow :: [a] + -> OrgTable + -> F OrgTable +setLongestRow rs t = + return t{ orgTableColumns = max (length rs) (orgTableColumns t) } + +maybeBodyToHeader :: OrgTable + -> F OrgTable +maybeBodyToHeader t = case t of + OrgTable{ orgTableHeader = [], orgTableRows = b:[] } -> + return t{ orgTableHeader = b , orgTableRows = [] } + _ -> return t + +appendToBody :: [Blocks] + -> OrgTable + -> F OrgTable +appendToBody r t = return t{ orgTableRows = orgTableRows t ++ [r] } + +setAligns :: [Alignment] + -> OrgTable + -> F OrgTable +setAligns aligns t = return $ t{ orgTableAlignments = aligns } + + +-- +-- LaTeX fragments +-- +latexFragment :: OrgParser (F Blocks) +latexFragment = try $ do + envName <- latexEnvStart + content <- mconcat <$> manyTill anyLineNewline (latexEnd envName) + return . return $ B.rawBlock "latex" (content `inLatexEnv` envName) + where + c `inLatexEnv` e = mconcat [ "\\begin{", e, "}\n" + , c + , "\\end{", e, "}\n" + ] + +latexEnvStart :: OrgParser String +latexEnvStart = try $ do + skipSpaces *> string "\\begin{" + *> latexEnvName + <* string "}" + <* blankline + +latexEnd :: String -> OrgParser () +latexEnd envName = try $ + () <$ skipSpaces + <* string ("\\end{" ++ envName ++ "}") + <* blankline + +-- | Parses a LaTeX environment name. +latexEnvName :: OrgParser String +latexEnvName = try $ do + mappend <$> many1 alphaNum + <*> option "" (string "*") + + +-- +-- Footnote defintions +-- +noteBlock :: OrgParser (F Blocks) +noteBlock = try $ do + ref <- noteMarker <* skipSpaces + content <- mconcat <$> blocksTillHeaderOrNote + addToNotesTable (ref, content) + return mempty + where + blocksTillHeaderOrNote = + many1Till block (eof <|> () <$ lookAhead noteMarker + <|> () <$ lookAhead headerStart) + +-- Paragraphs or Plain text +paraOrPlain :: OrgParser (F Blocks) +paraOrPlain = try $ do + ils <- parseInlines + nl <- option False (newline >> return True) + try (guard nl >> notFollowedBy (orderedListStart <|> bulletListStart) >> + return (B.para <$> ils)) + <|> (return (B.plain <$> ils)) + +inlinesTillNewline :: OrgParser (F Inlines) +inlinesTillNewline = trimInlinesF . mconcat <$> manyTill inline newline + + +-- +-- list blocks +-- + +list :: OrgParser (F Blocks) +list = choice [ definitionList, bulletList, orderedList ] <?> "list" + +definitionList :: OrgParser (F Blocks) +definitionList = try $ do n <- lookAhead (bulletListStart' Nothing) + fmap B.definitionList . fmap compactify'DL . sequence + <$> many1 (definitionListItem $ bulletListStart' (Just n)) + +bulletList :: OrgParser (F Blocks) +bulletList = try $ do n <- lookAhead (bulletListStart' Nothing) + fmap B.bulletList . fmap compactify' . sequence + <$> many1 (listItem (bulletListStart' $ Just n)) + +orderedList :: OrgParser (F Blocks) +orderedList = fmap B.orderedList . fmap compactify' . sequence + <$> many1 (listItem orderedListStart) + +genericListStart :: OrgParser String + -> OrgParser Int +genericListStart listMarker = try $ + (+) <$> (length <$> many spaceChar) + <*> (length <$> listMarker <* many1 spaceChar) + +-- parses bullet list marker. maybe we know the indent level +bulletListStart :: OrgParser Int +bulletListStart = bulletListStart' Nothing + +bulletListStart' :: Maybe Int -> OrgParser Int +-- returns length of bulletList prefix, inclusive of marker +bulletListStart' Nothing = do ind <- length <$> many spaceChar + when (ind == 0) $ notFollowedBy (char '*') + oneOf bullets + many1 spaceChar + return (ind + 1) + -- Unindented lists are legal, but they can't use '*' bullets + -- We return n to maintain compatibility with the generic listItem +bulletListStart' (Just n) = do count (n-1) spaceChar + when (n == 1) $ notFollowedBy (char '*') + oneOf bullets + many1 spaceChar + return n + +bullets :: String +bullets = "*+-" + +orderedListStart :: OrgParser Int +orderedListStart = genericListStart orderedListMarker + -- Ordered list markers allowed in org-mode + where orderedListMarker = mappend <$> many1 digit <*> (pure <$> oneOf ".)") + +definitionListItem :: OrgParser Int + -> OrgParser (F (Inlines, [Blocks])) +definitionListItem parseMarkerGetLength = try $ do + markerLength <- parseMarkerGetLength + term <- manyTill (noneOf "\n\r") (try $ string "::") + line1 <- anyLineNewline + blank <- option "" ("\n" <$ blankline) + cont <- concat <$> many (listContinuation markerLength) + term' <- parseFromString parseInlines term + contents' <- parseFromString parseBlocks $ line1 ++ blank ++ cont + return $ (,) <$> term' <*> fmap (:[]) contents' + + +-- parse raw text for one list item, excluding start marker and continuations +listItem :: OrgParser Int + -> OrgParser (F Blocks) +listItem start = try $ do + markerLength <- try start + firstLine <- anyLineNewline + blank <- option "" ("\n" <$ blankline) + rest <- concat <$> many (listContinuation markerLength) + parseFromString parseBlocks $ firstLine ++ blank ++ rest + +-- continuation of a list item - indented and separated by blankline or endline. +-- Note: nested lists are parsed as continuations. +listContinuation :: Int + -> OrgParser String +listContinuation markerLength = try $ + notFollowedBy' blankline + *> (mappend <$> (concat <$> many1 listLine) + <*> many blankline) + where listLine = try $ indentWith markerLength *> anyLineNewline + +anyLineNewline :: OrgParser String +anyLineNewline = (++ "\n") <$> anyLine + + +-- +-- inline +-- + +inline :: OrgParser (F Inlines) +inline = + choice [ whitespace + , linebreak + , cite + , footnote + , linkOrImage + , anchor + , inlineCodeBlock + , str + , endline + , emph + , strong + , strikeout + , underline + , code + , math + , displayMath + , verbatim + , subscript + , superscript + , inlineLaTeX + , symbol + ] <* (guard =<< newlinesCountWithinLimits) + <?> "inline" + +parseInlines :: OrgParser (F Inlines) +parseInlines = trimInlinesF . mconcat <$> many1 inline + +-- treat these as potentially non-text when parsing inline: +specialChars :: [Char] +specialChars = "\"$'()*+-,./:<=>[\\]^_{|}~" + + +whitespace :: OrgParser (F Inlines) +whitespace = pure B.space <$ skipMany1 spaceChar + <* updateLastPreCharPos + <* updateLastForbiddenCharPos + <?> "whitespace" + +linebreak :: OrgParser (F Inlines) +linebreak = try $ pure B.linebreak <$ string "\\\\" <* skipSpaces <* newline + +str :: OrgParser (F Inlines) +str = return . B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") + <* updateLastStrPos + +-- | An endline character that can be treated as a space, not a structural +-- break. This should reflect the values of the Emacs variable +-- @org-element-pagaraph-separate@. +endline :: OrgParser (F Inlines) +endline = try $ do + newline + notFollowedBy blankline + notFollowedBy' exampleLine + notFollowedBy' hline + notFollowedBy' noteMarker + notFollowedBy' tableStart + notFollowedBy' drawerStart + notFollowedBy' headerStart + notFollowedBy' metaLineStart + notFollowedBy' latexEnvStart + notFollowedBy' commentLineStart + notFollowedBy' bulletListStart + notFollowedBy' orderedListStart + decEmphasisNewlinesCount + guard =<< newlinesCountWithinLimits + updateLastPreCharPos + return . return $ B.space + +cite :: OrgParser (F Inlines) +cite = try $ do + guardEnabled Ext_citations + (cs, raw) <- withRaw normalCite + return $ (flip B.cite (B.text raw)) <$> cs + +normalCite :: OrgParser (F [Citation]) +normalCite = try $ char '[' + *> skipSpaces + *> citeList + <* skipSpaces + <* char ']' + +citeList :: OrgParser (F [Citation]) +citeList = sequence <$> sepBy1 citation (try $ char ';' *> skipSpaces) + +citation :: OrgParser (F Citation) +citation = try $ do + pref <- prefix + (suppress_author, key) <- citeKey + suff <- suffix + return $ do + x <- pref + y <- suff + return $ Citation{ citationId = key + , citationPrefix = B.toList x + , citationSuffix = B.toList y + , citationMode = if suppress_author + then SuppressAuthor + else NormalCitation + , citationNoteNum = 0 + , citationHash = 0 + } + where + prefix = trimInlinesF . mconcat <$> + manyTill inline (char ']' <|> (']' <$ lookAhead citeKey)) + suffix = try $ do + hasSpace <- option False (notFollowedBy nonspaceChar >> return True) + skipSpaces + rest <- trimInlinesF . mconcat <$> + many (notFollowedBy (oneOf ";]") *> inline) + return $ if hasSpace + then (B.space <>) <$> rest + else rest + +footnote :: OrgParser (F Inlines) +footnote = try $ inlineNote <|> referencedNote + +inlineNote :: OrgParser (F Inlines) +inlineNote = try $ do + string "[fn:" + ref <- many alphaNum + char ':' + note <- fmap B.para . trimInlinesF . mconcat <$> many1Till inline (char ']') + when (not $ null ref) $ + addToNotesTable ("fn:" ++ ref, note) + return $ B.note <$> note + +referencedNote :: OrgParser (F Inlines) +referencedNote = try $ do + ref <- noteMarker + return $ do + notes <- asksF orgStateNotes' + case lookup ref notes of + Nothing -> return $ B.str $ "[" ++ ref ++ "]" + Just contents -> do + st <- askF + let contents' = runF contents st{ orgStateNotes' = [] } + return $ B.note contents' + +noteMarker :: OrgParser String +noteMarker = try $ do + char '[' + choice [ many1Till digit (char ']') + , (++) <$> string "fn:" + <*> many1Till (noneOf "\n\r\t ") (char ']') + ] + +linkOrImage :: OrgParser (F Inlines) +linkOrImage = explicitOrImageLink + <|> selflinkOrImage + <|> angleLink + <|> plainLink + <?> "link or image" + +explicitOrImageLink :: OrgParser (F Inlines) +explicitOrImageLink = try $ do + char '[' + srcF <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget + title <- enclosedRaw (char '[') (char ']') + title' <- parseFromString (mconcat <$> many inline) title + char ']' + return $ do + src <- srcF + if isImageFilename src && isImageFilename title + then pure $ B.link src "" $ B.image title mempty mempty + else linkToInlinesF src =<< title' + +selflinkOrImage :: OrgParser (F Inlines) +selflinkOrImage = try $ do + src <- char '[' *> linkTarget <* char ']' + return $ linkToInlinesF src (B.str src) + +plainLink :: OrgParser (F Inlines) +plainLink = try $ do + (orig, src) <- uri + returnF $ B.link src "" (B.str orig) + +angleLink :: OrgParser (F Inlines) +angleLink = try $ do + char '<' + link <- plainLink + char '>' + return link + +selfTarget :: OrgParser String +selfTarget = try $ char '[' *> linkTarget <* char ']' + +linkTarget :: OrgParser String +linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]") + +possiblyEmptyLinkTarget :: OrgParser String +possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]") + +applyCustomLinkFormat :: String -> OrgParser (F String) +applyCustomLinkFormat link = do + let (linkType, rest) = break (== ':') link + return $ do + formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters + return $ maybe link ($ drop 1 rest) formatter + +-- TODO: might be a lot smarter/cleaner to use parsec and ADTs for this kind +-- of parsing. +linkToInlinesF :: String -> Inlines -> F Inlines +linkToInlinesF s = + case s of + "" -> pure . B.link "" "" + ('#':_) -> pure . B.link s "" + _ | isImageFilename s -> const . pure $ B.image s "" "" + _ | isFileLink s -> pure . B.link (dropLinkType s) "" + _ | isUri s -> pure . B.link s "" + _ | isAbsoluteFilePath s -> pure . B.link ("file://" ++ s) "" + _ | isRelativeFilePath s -> pure . B.link s "" + _ -> internalLink s + +isFileLink :: String -> Bool +isFileLink s = ("file:" `isPrefixOf` s) && not ("file://" `isPrefixOf` s) + +dropLinkType :: String -> String +dropLinkType = tail . snd . break (== ':') + +isRelativeFilePath :: String -> Bool +isRelativeFilePath s = (("./" `isPrefixOf` s) || ("../" `isPrefixOf` s)) && + (':' `notElem` s) + +isUri :: String -> Bool +isUri s = let (scheme, path) = break (== ':') s + in all (\c -> isAlphaNum c || c `elem` ".-") scheme + && not (null path) + +isAbsoluteFilePath :: String -> Bool +isAbsoluteFilePath = ('/' ==) . head + +isImageFilename :: String -> Bool +isImageFilename filename = + any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions && + (any (\x -> (x++":") `isPrefixOf` filename) protocols || + ':' `notElem` filename) + where + imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ] + protocols = [ "file", "http", "https" ] + +internalLink :: String -> Inlines -> F Inlines +internalLink link title = do + anchorB <- (link `elem`) <$> asksF orgStateAnchorIds + if anchorB + then return $ B.link ('#':link) "" title + else return $ B.emph title + +-- | Parse an anchor like @<<anchor-id>>@ and return an empty span with +-- @anchor-id@ set as id. Legal anchors in org-mode are defined through +-- @org-target-regexp@, which is fairly liberal. Since no link is created if +-- @anchor-id@ contains spaces, we are more restrictive in what is accepted as +-- an anchor. + +anchor :: OrgParser (F Inlines) +anchor = try $ do + anchorId <- parseAnchor + recordAnchorId anchorId + returnF $ B.spanWith (solidify anchorId, [], []) mempty + where + parseAnchor = string "<<" + *> many1 (noneOf "\t\n\r<>\"' ") + <* string ">>" + <* skipSpaces + +-- | Replace every char but [a-zA-Z0-9_.-:] with a hypen '-'. This mirrors +-- the org function @org-export-solidify-link-text@. + +solidify :: String -> String +solidify = map replaceSpecialChar + where replaceSpecialChar c + | isAlphaNum c = c + | c `elem` "_.-:" = c + | otherwise = '-' + +-- | Parses an inline code block and marks it as an babel block. +inlineCodeBlock :: OrgParser (F Inlines) +inlineCodeBlock = try $ do + string "src_" + lang <- many1 orgArgWordChar + opts <- option [] $ enclosedByPair '[' ']' inlineBlockOption + inlineCode <- enclosedByPair '{' '}' (noneOf "\n\r") + let attrClasses = [translateLang lang, rundocBlockClass] + let attrKeyVal = map toRundocAttrib (("language", lang) : opts) + returnF $ B.codeWith ("", attrClasses, attrKeyVal) inlineCode + +enclosedByPair :: Char -- ^ opening char + -> Char -- ^ closing char + -> OrgParser a -- ^ parser + -> OrgParser [a] +enclosedByPair s e p = char s *> many1Till p (char e) + +emph :: OrgParser (F Inlines) +emph = fmap B.emph <$> emphasisBetween '/' + +strong :: OrgParser (F Inlines) +strong = fmap B.strong <$> emphasisBetween '*' + +strikeout :: OrgParser (F Inlines) +strikeout = fmap B.strikeout <$> emphasisBetween '+' + +-- There is no underline, so we use strong instead. +underline :: OrgParser (F Inlines) +underline = fmap B.strong <$> emphasisBetween '_' + +verbatim :: OrgParser (F Inlines) +verbatim = return . B.code <$> verbatimBetween '=' + +code :: OrgParser (F Inlines) +code = return . B.code <$> verbatimBetween '~' + +subscript :: OrgParser (F Inlines) +subscript = fmap B.subscript <$> try (char '_' *> subOrSuperExpr) + +superscript :: OrgParser (F Inlines) +superscript = fmap B.superscript <$> try (char '^' *> subOrSuperExpr) + +math :: OrgParser (F Inlines) +math = return . B.math <$> choice [ math1CharBetween '$' + , mathStringBetween '$' + , rawMathBetween "\\(" "\\)" + ] + +displayMath :: OrgParser (F Inlines) +displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]" + , rawMathBetween "$$" "$$" + ] +symbol :: OrgParser (F Inlines) +symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions) + where updatePositions c = do + when (c `elem` emphasisPreChars) updateLastPreCharPos + when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos + return c + +emphasisBetween :: Char + -> OrgParser (F Inlines) +emphasisBetween c = try $ do + startEmphasisNewlinesCounting emphasisAllowedNewlines + res <- enclosedInlines (emphasisStart c) (emphasisEnd c) + isTopLevelEmphasis <- null . orgStateEmphasisCharStack <$> getState + when isTopLevelEmphasis + resetEmphasisNewlines + return res + +verbatimBetween :: Char + -> OrgParser String +verbatimBetween c = try $ + emphasisStart c *> + many1TillNOrLessNewlines 1 (noneOf "\n\r") (emphasisEnd c) + +-- | Parses a raw string delimited by @c@ using Org's math rules +mathStringBetween :: Char + -> OrgParser String +mathStringBetween c = try $ do + mathStart c + body <- many1TillNOrLessNewlines mathAllowedNewlines + (noneOf (c:"\n\r")) + (lookAhead $ mathEnd c) + final <- mathEnd c + return $ body ++ [final] + +-- | Parse a single character between @c@ using math rules +math1CharBetween :: Char + -> OrgParser String +math1CharBetween c = try $ do + char c + res <- noneOf $ c:mathForbiddenBorderChars + char c + eof <|> () <$ lookAhead (oneOf mathPostChars) + return [res] + +rawMathBetween :: String + -> String + -> OrgParser String +rawMathBetween s e = try $ string s *> manyTill anyChar (try $ string e) + +-- | Parses the start (opening character) of emphasis +emphasisStart :: Char -> OrgParser Char +emphasisStart c = try $ do + guard =<< afterEmphasisPreChar + guard =<< notAfterString + char c + lookAhead (noneOf emphasisForbiddenBorderChars) + pushToInlineCharStack c + return c + +-- | Parses the closing character of emphasis +emphasisEnd :: Char -> OrgParser Char +emphasisEnd c = try $ do + guard =<< notAfterForbiddenBorderChar + char c + eof <|> () <$ lookAhead acceptablePostChars + updateLastStrPos + popInlineCharStack + return c + where acceptablePostChars = + surroundingEmphasisChar >>= \x -> oneOf (x ++ emphasisPostChars) + +mathStart :: Char -> OrgParser Char +mathStart c = try $ + char c <* notFollowedBy' (oneOf (c:mathForbiddenBorderChars)) + +mathEnd :: Char -> OrgParser Char +mathEnd c = try $ do + res <- noneOf (c:mathForbiddenBorderChars) + char c + eof <|> () <$ lookAhead (oneOf mathPostChars) + return res + + +enclosedInlines :: OrgParser a + -> OrgParser b + -> OrgParser (F Inlines) +enclosedInlines start end = try $ + trimInlinesF . mconcat <$> enclosed start end inline + +enclosedRaw :: OrgParser a + -> OrgParser b + -> OrgParser String +enclosedRaw start end = try $ + start *> (onSingleLine <|> spanningTwoLines) + where onSingleLine = try $ many1Till (noneOf "\n\r") end + spanningTwoLines = try $ + anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine + +-- | Like many1Till, but parses at most @n+1@ lines. @p@ must not consume +-- newlines. +many1TillNOrLessNewlines :: Int + -> OrgParser Char + -> OrgParser a + -> OrgParser String +many1TillNOrLessNewlines n p end = try $ + nMoreLines (Just n) mempty >>= oneOrMore + where + nMoreLines Nothing cs = return cs + nMoreLines (Just 0) cs = try $ (cs ++) <$> finalLine + nMoreLines k cs = try $ (final k cs <|> rest k cs) + >>= uncurry nMoreLines + final _ cs = (\x -> (Nothing, cs ++ x)) <$> try finalLine + rest m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> try (manyTill p P.newline) + finalLine = try $ manyTill p end + minus1 k = k - 1 + oneOrMore cs = guard (not $ null cs) *> return cs + +-- Org allows customization of the way it reads emphasis. We use the defaults +-- here (see, e.g., the Emacs Lisp variable `org-emphasis-regexp-components` +-- for details). + +-- | Chars allowed to occur before emphasis (spaces and newlines are ok, too) +emphasisPreChars :: [Char] +emphasisPreChars = "\t \"'({" + +-- | Chars allowed at after emphasis +emphasisPostChars :: [Char] +emphasisPostChars = "\t\n !\"'),-.:;?\\}" + +-- | Chars not allowed at the (inner) border of emphasis +emphasisForbiddenBorderChars :: [Char] +emphasisForbiddenBorderChars = "\t\n\r \"'," + +-- | The maximum number of newlines within +emphasisAllowedNewlines :: Int +emphasisAllowedNewlines = 1 + +-- LaTeX-style math: see `org-latex-regexps` for details + +-- | Chars allowed after an inline ($...$) math statement +mathPostChars :: [Char] +mathPostChars = "\t\n \"'),-.:;?" + +-- | Chars not allowed at the (inner) border of math +mathForbiddenBorderChars :: [Char] +mathForbiddenBorderChars = "\t\n\r ,;.$" + +-- | Maximum number of newlines in an inline math statement +mathAllowedNewlines :: Int +mathAllowedNewlines = 2 + +-- | Whether we are right behind a char allowed before emphasis +afterEmphasisPreChar :: OrgParser Bool +afterEmphasisPreChar = do + pos <- getPosition + lastPrePos <- orgStateLastPreCharPos <$> getState + return . fromMaybe True $ (== pos) <$> lastPrePos + +-- | Whether the parser is right after a forbidden border char +notAfterForbiddenBorderChar :: OrgParser Bool +notAfterForbiddenBorderChar = do + pos <- getPosition + lastFBCPos <- orgStateLastForbiddenCharPos <$> getState + return $ lastFBCPos /= Just pos + +-- | Read a sub- or superscript expression +subOrSuperExpr :: OrgParser (F Inlines) +subOrSuperExpr = try $ + choice [ id <$> charsInBalanced '{' '}' (noneOf "\n\r") + , enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r") + , simpleSubOrSuperString + ] >>= parseFromString (mconcat <$> many inline) + where enclosing (left, right) s = left : s ++ [right] + +simpleSubOrSuperString :: OrgParser String +simpleSubOrSuperString = try $ + choice [ string "*" + , mappend <$> option [] ((:[]) <$> oneOf "+-") + <*> many1 alphaNum + ] + +inlineLaTeX :: OrgParser (F Inlines) +inlineLaTeX = try $ do + cmd <- inlineLaTeXCommand + maybe mzero returnF $ + parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` parseAsInlineLaTeX cmd + where + parseAsMath :: String -> Maybe Inlines + parseAsMath cs = B.fromList <$> texMathToPandoc cs + + parseAsInlineLaTeX :: String -> Maybe Inlines + parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs + + parseAsMathMLSym :: String -> Maybe Inlines + parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs) + -- dropWhileEnd would be nice here, but it's not available before base 4.5 + where clean = reverse . dropWhile (`elem` "{}") . reverse . drop 1 + + state :: ParserState + state = def{ stateOptions = def{ readerParseRaw = True }} + + texMathToPandoc inp = (maybeRight $ readTeX inp) >>= + writePandoc DisplayInline + +maybeRight :: Either a b -> Maybe b +maybeRight = either (const Nothing) Just + +inlineLaTeXCommand :: OrgParser String +inlineLaTeXCommand = try $ do + rest <- getInput + case runParser rawLaTeXInline def "source" rest of + Right (RawInline _ cs) -> do + let len = length cs + count len anyChar + return cs + _ -> mzero diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index c12a1493a..8bfc6f606 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -1,6 +1,6 @@ {-# LANGUAGE OverloadedStrings #-} {- -Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.RST - Copyright : Copyright (C) 2006-2010 John MacFarlane + Copyright : Copyright (C) 2006-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -29,7 +29,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion from reStructuredText to 'Pandoc' document. -} module Text.Pandoc.Readers.RST ( - readRST + readRST, + readRSTWithWarnings ) where import Text.Pandoc.Definition import Text.Pandoc.Builder (setMeta, fromList) @@ -38,15 +39,16 @@ import Text.Pandoc.Parsing import Text.Pandoc.Options import Control.Monad ( when, liftM, guard, mzero ) import Data.List ( findIndex, intersperse, intercalate, - transpose, sort, deleteFirstsBy, isSuffixOf ) + transpose, sort, deleteFirstsBy, isSuffixOf , nub, union) +import Data.Maybe (fromMaybe) import qualified Data.Map as M import Text.Printf ( printf ) -import Control.Applicative ((<$>), (<$), (<*), (*>)) +import Control.Applicative ((<$>), (<$), (<*), (*>), (<*>), pure) import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>)) import qualified Text.Pandoc.Builder as B import Data.Monoid (mconcat, mempty) import Data.Sequence (viewr, ViewR(..)) -import Data.Char (toLower) +import Data.Char (toLower, isHexDigit, isSpace) -- | Parse reStructuredText string and return Pandoc document. readRST :: ReaderOptions -- ^ Reader options @@ -54,6 +56,9 @@ readRST :: ReaderOptions -- ^ Reader options -> Pandoc readRST opts s = (readWith parseRST) def{ stateOptions = opts } (s ++ "\n\n") +readRSTWithWarnings :: ReaderOptions -> String -> (Pandoc, [String]) +readRSTWithWarnings opts s = (readWithWarnings parseRST) def{ stateOptions = opts } (s ++ "\n\n") + type RSTParser = Parser [Char] ParserState -- @@ -112,15 +117,16 @@ titleTransform (bs, meta) = metaFromDefList :: [([Inline], [[Block]])] -> Meta -> Meta metaFromDefList ds meta = adjustAuthors $ foldr f meta ds where f (k,v) = setMeta (map toLower $ stringify k) (mconcat $ map fromList v) - adjustAuthors (Meta metamap) = Meta $ M.adjust toPlain "author" + adjustAuthors (Meta metamap) = Meta $ M.adjust splitAuthors "author" $ M.adjust toPlain "date" $ M.adjust toPlain "title" - $ M.adjust splitAuthors "authors" + $ M.mapKeys (\k -> if k == "authors" then "author" else k) $ metamap toPlain (MetaBlocks [Para xs]) = MetaInlines xs toPlain x = x - splitAuthors (MetaBlocks [Para xs]) = MetaList $ map MetaInlines - $ splitAuthors' xs + splitAuthors (MetaBlocks [Para xs]) + = MetaList $ map MetaInlines + $ splitAuthors' xs splitAuthors x = x splitAuthors' = map normalizeSpaces . splitOnSemi . concatMap factorSemi @@ -184,22 +190,22 @@ block = choice [ codeBlock -- field list -- -rawFieldListItem :: String -> RSTParser (String, String) -rawFieldListItem indent = try $ do - string indent +rawFieldListItem :: Int -> RSTParser (String, String) +rawFieldListItem minIndent = try $ do + indent <- length <$> many (char ' ') + guard $ indent >= minIndent char ':' name <- many1Till (noneOf "\n") (char ':') (() <$ lookAhead newline) <|> skipMany1 spaceChar first <- anyLine - rest <- option "" $ try $ do lookAhead (string indent >> spaceChar) + rest <- option "" $ try $ do lookAhead (count indent (char ' ') >> spaceChar) indentedBlock let raw = (if null first then "" else (first ++ "\n")) ++ rest ++ "\n" return (name, raw) -fieldListItem :: String - -> RSTParser (Inlines, [Blocks]) -fieldListItem indent = try $ do - (name, raw) <- rawFieldListItem indent +fieldListItem :: Int -> RSTParser (Inlines, [Blocks]) +fieldListItem minIndent = try $ do + (name, raw) <- rawFieldListItem minIndent let term = B.str name contents <- parseFromString parseBlocks raw optional blanklines @@ -207,7 +213,7 @@ fieldListItem indent = try $ do fieldList :: RSTParser Blocks fieldList = try $ do - indent <- lookAhead $ many spaceChar + indent <- length <$> lookAhead (many spaceChar) items <- many1 $ fieldListItem indent case items of [] -> return mempty @@ -333,6 +339,13 @@ indentedBlock = try $ do optional blanklines return $ unlines lns +quotedBlock :: Parser [Char] st [Char] +quotedBlock = try $ do + quote <- lookAhead $ oneOf "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" + lns <- many1 $ lookAhead (char quote) >> anyLine + optional blanklines + return $ unlines lns + codeBlockStart :: Parser [Char] st Char codeBlockStart = string "::" >> blankline >> blankline @@ -340,7 +353,8 @@ codeBlock :: Parser [Char] st Blocks codeBlock = try $ codeBlockStart >> codeBlockBody codeBlockBody :: Parser [Char] st Blocks -codeBlockBody = try $ B.codeBlock . stripTrailingNewlines <$> indentedBlock +codeBlockBody = try $ B.codeBlock . stripTrailingNewlines <$> + (indentedBlock <|> quotedBlock) lhsCodeBlock :: RSTParser Blocks lhsCodeBlock = try $ do @@ -458,7 +472,7 @@ listItem :: RSTParser Int listItem start = try $ do (markerLength, first) <- rawListItem start rest <- many (listContinuation markerLength) - blanks <- choice [ try (many blankline >>~ lookAhead start), + blanks <- choice [ try (many blankline <* lookAhead start), many1 blankline ] -- whole list must end with blank. -- parsing with ListItemState forces markers at beginning of lines to -- count as list item markers, even if not separated by blank space. @@ -478,7 +492,7 @@ listItem start = try $ do orderedList :: RSTParser Blocks orderedList = try $ do - (start, style, delim) <- lookAhead (anyOrderedListMarker >>~ spaceChar) + (start, style, delim) <- lookAhead (anyOrderedListMarker <* spaceChar) items <- many1 (listItem (orderedListStart style delim)) let items' = compactify' items return $ B.orderedListWith (start, style, delim) items' @@ -511,7 +525,6 @@ directive = try $ do -- TODO: line-block, parsed-literal, table, csv-table, list-table -- date -- include --- class -- title directive' :: RSTParser Blocks directive' = do @@ -520,17 +533,17 @@ directive' = do skipMany spaceChar top <- many $ satisfy (/='\n') <|> try (char '\n' <* - notFollowedBy' (rawFieldListItem " ") <* + notFollowedBy' (rawFieldListItem 3) <* count 3 (char ' ') <* notFollowedBy blankline) newline - fields <- many $ rawFieldListItem " " + fields <- many $ rawFieldListItem 3 body <- option "" $ try $ blanklines >> indentedBlock optional blanklines let body' = body ++ "\n\n" case label of "raw" -> return $ B.rawBlock (trim top) (stripTrailingNewlines body) - "role" -> return mempty + "role" -> addNewRole top $ map (\(k,v) -> (k, trim v)) fields "container" -> parseFromString parseBlocks body' "replace" -> B.para <$> -- consumed by substKey parseFromString (trimInlines . mconcat <$> many inline) @@ -575,12 +588,15 @@ directive' = do role -> role }) "code" -> codeblock (lookup "number-lines" fields) (trim top) body "code-block" -> codeblock (lookup "number-lines" fields) (trim top) body + "aafig" -> do + let attribs = ("", ["aafig"], map (\(k,v) -> (k, trimr v)) fields) + return $ B.codeBlockWith attribs $ stripTrailingNewlines body "math" -> return $ B.para $ mconcat $ map B.displayMath $ toChunks $ top ++ "\n\n" ++ body "figure" -> do (caption, legend) <- parseFromString extractCaption body' let src = escapeURI $ trim top - return $ B.para (B.image src "" caption) <> legend + return $ B.para (B.image src "fig:" caption) <> legend "image" -> do let src = escapeURI $ trim top let alt = B.str $ maybe "image" trim $ lookup "alt" fields @@ -589,9 +605,71 @@ directive' = do Just t -> B.link (escapeURI $ trim t) "" $ B.image src "" alt Nothing -> B.image src "" alt - _ -> return mempty - --- Can contain haracter codes as decimal numbers or + "class" -> do + let attrs = ("", (splitBy isSpace $ trim top), map (\(k,v) -> (k, trimr v)) fields) + -- directive content or the first immediately following element + children <- case body of + "" -> block + _ -> parseFromString parseBlocks body' + return $ B.divWith attrs children + other -> do + pos <- getPosition + addWarning (Just pos) $ "ignoring unknown directive: " ++ other + return mempty + +-- TODO: +-- - Silently ignores illegal fields +-- - Only supports :format: fields with a single format for :raw: roles, +-- change Text.Pandoc.Definition.Format to fix +addNewRole :: String -> [(String, String)] -> RSTParser Blocks +addNewRole roleString fields = do + (role, parentRole) <- parseFromString inheritedRole roleString + customRoles <- stateRstCustomRoles <$> getState + let (baseRole, baseFmt, baseAttr) = + maybe (parentRole, Nothing, nullAttr) id $ + M.lookup parentRole customRoles + fmt = if parentRole == "raw" then lookup "format" fields else baseFmt + annotate :: [String] -> [String] + annotate = maybe id (:) $ + if parentRole == "code" + then lookup "language" fields + else Nothing + attr = let (ident, classes, keyValues) = baseAttr + -- nub in case role name & language class are the same + in (ident, nub . (role :) . annotate $ classes, keyValues) + + -- warn about syntax we ignore + flip mapM_ fields $ \(key, _) -> case key of + "language" -> when (parentRole /= "code") $ addWarning Nothing $ + "ignoring :language: field because the parent of role :" ++ + role ++ ": is :" ++ parentRole ++ ": not :code:" + "format" -> when (parentRole /= "raw") $ addWarning Nothing $ + "ignoring :format: field because the parent of role :" ++ + role ++ ": is :" ++ parentRole ++ ": not :raw:" + _ -> addWarning Nothing $ "ignoring unknown field :" ++ key ++ + ": in definition of role :" ++ role ++ ": in" + when (parentRole == "raw" && countKeys "format" > 1) $ + addWarning Nothing $ + "ignoring :format: fields after the first in the definition of role :" + ++ role ++": in" + when (parentRole == "code" && countKeys "language" > 1) $ + addWarning Nothing $ + "ignoring :language: fields after the first in the definition of role :" + ++ role ++": in" + + updateState $ \s -> s { + stateRstCustomRoles = + M.insert role (baseRole, fmt, attr) customRoles + } + + return $ B.singleton Null + where + countKeys k = length . filter (== k) . map fst $ fields + inheritedRole = + (,) <$> roleName <*> ((char '(' *> roleName <* char ')') <|> pure "span") + + +-- Can contain character codes as decimal numbers or -- hexadecimal numbers, prefixed by 0x, x, \x, U+, u, or \u -- or as XML-style hexadecimal character entities, e.g. ᨫ -- or text, which is used as-is. Comments start with .. @@ -620,9 +698,6 @@ extractUnicodeChar s = maybe Nothing (\c -> Just (c,rest)) mbc where (ds,rest) = span isHexDigit s mbc = safeRead ('\'':'\\':'x':ds ++ "'") -isHexDigit :: Char -> Bool -isHexDigit c = c `elem` "0123456789ABCDEFabcdef" - extractCaption :: RSTParser (Inlines, Blocks) extractCaption = do capt <- trimInlines . mconcat <$> many inline @@ -711,7 +786,7 @@ simpleReferenceName = do referenceName :: RSTParser Inlines referenceName = quotedReferenceName <|> - (try $ simpleReferenceName >>~ lookAhead (char ':')) <|> + (try $ simpleReferenceName <* lookAhead (char ':')) <|> unquotedReferenceName referenceKey :: RSTParser [Char] @@ -930,17 +1005,61 @@ strong = B.strong . trimInlines . mconcat <$> -- Note, this doesn't precisely implement the complex rule in -- http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules -- but it should be good enough for most purposes +-- +-- TODO: +-- - Classes are silently discarded in addNewRole +-- - Lacks sensible implementation for title-reference (which is the default) +-- - Allows direct use of the :raw: role, rST only allows inherited use. interpretedRole :: RSTParser Inlines interpretedRole = try $ do (role, contents) <- roleBefore <|> roleAfter - case role of - "sup" -> return $ B.superscript $ B.str contents - "sub" -> return $ B.subscript $ B.str contents - "math" -> return $ B.math contents - _ -> return $ B.str contents --unknown + renderRole contents Nothing role nullAttr + +renderRole :: String -> Maybe String -> String -> Attr -> RSTParser Inlines +renderRole contents fmt role attr = case role of + "sup" -> return $ B.superscript $ B.str contents + "superscript" -> return $ B.superscript $ B.str contents + "sub" -> return $ B.subscript $ B.str contents + "subscript" -> return $ B.subscript $ B.str contents + "emphasis" -> return $ B.emph $ B.str contents + "strong" -> return $ B.strong $ B.str contents + "rfc-reference" -> return $ rfcLink contents + "RFC" -> return $ rfcLink contents + "pep-reference" -> return $ pepLink contents + "PEP" -> return $ pepLink contents + "literal" -> return $ B.codeWith attr contents + "math" -> return $ B.math contents + "title-reference" -> titleRef contents + "title" -> titleRef contents + "t" -> titleRef contents + "code" -> return $ B.codeWith (addClass "sourceCode" attr) contents + "span" -> return $ B.spanWith attr $ B.str contents + "raw" -> return $ B.rawInline (fromMaybe "" fmt) contents + custom -> do + customRoles <- stateRstCustomRoles <$> getState + case M.lookup custom customRoles of + Just (newRole, newFmt, newAttr) -> + renderRole contents newFmt newRole newAttr + Nothing -> do + pos <- getPosition + addWarning (Just pos) $ "ignoring unknown role :" ++ custom ++ ": in" + return $ B.str contents -- Undefined role + where + titleRef ref = return $ B.str ref -- FIXME: Not a sensible behaviour + rfcLink rfcNo = B.link rfcUrl ("RFC " ++ rfcNo) $ B.str ("RFC " ++ rfcNo) + where rfcUrl = "http://www.faqs.org/rfcs/rfc" ++ rfcNo ++ ".html" + pepLink pepNo = B.link pepUrl ("PEP " ++ pepNo) $ B.str ("PEP " ++ pepNo) + where padNo = replicate (4 - length pepNo) '0' ++ pepNo + pepUrl = "http://www.python.org/dev/peps/pep-" ++ padNo ++ "/" + +addClass :: String -> Attr -> Attr +addClass c (ident, classes, keyValues) = (ident, union classes [c], keyValues) + +roleName :: RSTParser String +roleName = many1 (letter <|> char '-') roleMarker :: RSTParser String -roleMarker = char ':' *> many1Till (letter <|> char '-') (char ':') +roleMarker = char ':' *> roleName <* char ':' roleBefore :: RSTParser (String,String) roleBefore = try $ do @@ -1001,7 +1120,7 @@ explicitLink = try $ do referenceLink :: RSTParser Inlines referenceLink = try $ do - (label',ref) <- withRaw (quotedReferenceName <|> simpleReferenceName) >>~ + (label',ref) <- withRaw (quotedReferenceName <|> simpleReferenceName) <* char '_' state <- getState let keyTable = stateKeys state @@ -1069,7 +1188,7 @@ smart :: RSTParser Inlines smart = do getOption readerSmart >>= guard doubleQuoted <|> singleQuoted <|> - choice (map (B.singleton <$>) [apostrophe, dash, ellipses]) + choice [apostrophe, dash, ellipses] singleQuoted :: RSTParser Inlines singleQuoted = try $ do diff --git a/src/Text/Pandoc/Readers/TWiki.hs b/src/Text/Pandoc/Readers/TWiki.hs new file mode 100644 index 000000000..c2325c0ea --- /dev/null +++ b/src/Text/Pandoc/Readers/TWiki.hs @@ -0,0 +1,526 @@ +{-# LANGUAGE RelaxedPolyRec, FlexibleInstances, TypeSynonymInstances #-} +-- RelaxedPolyRec needed for inlinesBetween on GHC < 7 +{- + Copyright (C) 2014 Alexander Sulfrian <alexander.sulfrian@fu-berlin.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.TWiki + Copyright : Copyright (C) 2014 Alexander Sulfrian + License : GNU GPL, version 2 or above + + Maintainer : Alexander Sulfrian <alexander.sulfrian@fu-berlin.de> + Stability : alpha + Portability : portable + +Conversion of twiki text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.TWiki ( readTWiki + , readTWikiWithWarnings + ) where + +import Text.Pandoc.Definition +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (enclosed, macro, nested) +import Text.Pandoc.Readers.HTML (htmlTag, isCommentTag) +import Data.Monoid (Monoid, mconcat, mempty) +import Control.Applicative ((<$>), (<*), (*>), (<$)) +import Control.Monad +import Text.Printf (printf) +import Debug.Trace (trace) +import Text.Pandoc.XML (fromEntities) +import Data.Maybe (fromMaybe) +import Text.HTML.TagSoup +import Data.Char (isAlphaNum) +import qualified Data.Foldable as F + +-- | Read twiki from an input string and return a Pandoc document. +readTWiki :: ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> Pandoc +readTWiki opts s = + (readWith parseTWiki) def{ stateOptions = opts } (s ++ "\n\n") + +readTWikiWithWarnings :: ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> (Pandoc, [String]) +readTWikiWithWarnings opts s = + (readWith parseTWikiWithWarnings) def{ stateOptions = opts } (s ++ "\n\n") + where parseTWikiWithWarnings = do + doc <- parseTWiki + warnings <- stateWarnings <$> getState + return (doc, warnings) + +type TWParser = Parser [Char] ParserState + +-- +-- utility functions +-- + +tryMsg :: String -> TWParser a -> TWParser a +tryMsg msg p = try p <?> msg + +skip :: TWParser a -> TWParser () +skip parser = parser >> return () + +nested :: TWParser a -> TWParser a +nested p = do + nestlevel <- stateMaxNestingLevel <$> getState + guard $ nestlevel > 0 + updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 } + res <- p + updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } + return res + +htmlElement :: String -> TWParser (Attr, String) +htmlElement tag = tryMsg tag $ do + (TagOpen _ attr, _) <- htmlTag (~== TagOpen tag []) + content <- manyTill anyChar (endtag <|> endofinput) + return (htmlAttrToPandoc attr, trim content) + where + endtag = skip $ htmlTag (~== TagClose tag) + endofinput = lookAhead $ try $ skipMany blankline >> skipSpaces >> eof + trim = dropWhile (=='\n') . reverse . dropWhile (=='\n') . reverse + +htmlAttrToPandoc :: [Attribute String] -> Attr +htmlAttrToPandoc attrs = (ident, classes, keyvals) + where + ident = fromMaybe "" $ lookup "id" attrs + classes = maybe [] words $ lookup "class" attrs + keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + +parseHtmlContentWithAttrs :: String -> TWParser a -> TWParser (Attr, [a]) +parseHtmlContentWithAttrs tag parser = do + (attr, content) <- htmlElement tag + parsedContent <- try $ parseContent content + return (attr, parsedContent) + where + parseContent = parseFromString $ nested $ manyTill parser endOfContent + endOfContent = try $ skipMany blankline >> skipSpaces >> eof + +parseHtmlContent :: String -> TWParser a -> TWParser [a] +parseHtmlContent tag p = parseHtmlContentWithAttrs tag p >>= return . snd + +-- +-- main parser +-- + +parseTWiki :: TWParser Pandoc +parseTWiki = do + bs <- mconcat <$> many block + spaces + eof + return $ B.doc bs + + +-- +-- block parsers +-- + +block :: TWParser B.Blocks +block = do + tr <- getOption readerTrace + pos <- getPosition + res <- mempty <$ skipMany1 blankline + <|> blockElements + <|> para + skipMany blankline + when tr $ + trace (printf "line %d: %s" (sourceLine pos) + (take 60 $ show $ B.toList res)) (return ()) + return res + +blockElements :: TWParser B.Blocks +blockElements = choice [ separator + , header + , verbatim + , literal + , list "" + , table + , blockQuote + , noautolink + ] + +separator :: TWParser B.Blocks +separator = tryMsg "separator" $ string "---" >> newline >> return B.horizontalRule + +header :: TWParser B.Blocks +header = tryMsg "header" $ do + string "---" + level <- many1 (char '+') >>= return . length + guard $ level <= 6 + classes <- option [] $ string "!!" >> return ["unnumbered"] + skipSpaces + content <- B.trimInlines . mconcat <$> manyTill inline newline + attr <- registerHeader ("", classes, []) content + return $ B.headerWith attr level $ content + +verbatim :: TWParser B.Blocks +verbatim = (htmlElement "verbatim" <|> htmlElement "pre") + >>= return . (uncurry B.codeBlockWith) + +literal :: TWParser B.Blocks +literal = htmlElement "literal" >>= return . rawBlock + where + format (_, _, kvs) = fromMaybe "html" $ lookup "format" kvs + rawBlock (attrs, content) = B.rawBlock (format attrs) content + +list :: String -> TWParser B.Blocks +list prefix = choice [ bulletList prefix + , orderedList prefix + , definitionList prefix] + +definitionList :: String -> TWParser B.Blocks +definitionList prefix = tryMsg "definitionList" $ do + indent <- lookAhead $ string prefix *> (many1 $ string " ") <* string "$ " + elements <- many $ parseDefinitionListItem (prefix ++ concat indent) + return $ B.definitionList elements + where + parseDefinitionListItem :: String -> TWParser (B.Inlines, [B.Blocks]) + parseDefinitionListItem indent = do + string (indent ++ "$ ") >> skipSpaces + term <- many1Till inline $ string ": " + line <- listItemLine indent $ string "$ " + return $ (mconcat term, [line]) + +bulletList :: String -> TWParser B.Blocks +bulletList prefix = tryMsg "bulletList" $ + parseList prefix (char '*') (char ' ') + +orderedList :: String -> TWParser B.Blocks +orderedList prefix = tryMsg "orderedList" $ + parseList prefix (oneOf "1iIaA") (string ". ") + +parseList :: Show a => String -> TWParser Char -> TWParser a -> TWParser B.Blocks +parseList prefix marker delim = do + (indent, style) <- lookAhead $ string prefix *> listStyle <* delim + blocks <- many $ parseListItem (prefix ++ indent) (char style <* delim) + return $ case style of + '1' -> B.orderedListWith (1, DefaultStyle, DefaultDelim) blocks + 'i' -> B.orderedListWith (1, LowerRoman, DefaultDelim) blocks + 'I' -> B.orderedListWith (1, UpperRoman, DefaultDelim) blocks + 'a' -> B.orderedListWith (1, LowerAlpha, DefaultDelim) blocks + 'A' -> B.orderedListWith (1, UpperAlpha, DefaultDelim) blocks + _ -> B.bulletList blocks + where + listStyle = do + indent <- many1 $ string " " + style <- marker + return (concat indent, style) + +parseListItem :: Show a => String -> TWParser a -> TWParser B.Blocks +parseListItem prefix marker = string prefix >> marker >> listItemLine prefix marker + +listItemLine :: Show a => String -> TWParser a -> TWParser B.Blocks +listItemLine prefix marker = lineContent >>= parseContent >>= return . mconcat + where + lineContent = do + content <- anyLine + continuation <- optionMaybe listContinuation + return $ filterSpaces content ++ "\n" ++ (maybe "" (" " ++) continuation) + filterSpaces = reverse . dropWhile (== ' ') . reverse + listContinuation = notFollowedBy (string prefix >> marker) >> + string " " >> lineContent + parseContent = parseFromString $ many1 $ nestedList <|> parseInline + parseInline = many1Till inline (lastNewline <|> newlineBeforeNestedList) >>= + return . B.plain . mconcat + nestedList = list prefix + lastNewline = try $ char '\n' <* eof + newlineBeforeNestedList = try $ char '\n' <* lookAhead nestedList + +table :: TWParser B.Blocks +table = try $ do + tableHead <- optionMaybe $ many1Till tableParseHeader newline >>= return . unzip + rows <- many1 tableParseRow + return $ buildTable mempty rows $ fromMaybe (align rows, columns rows) tableHead + where + buildTable caption rows (aligns, heads) + = B.table caption aligns heads rows + align rows = replicate (columCount rows) (AlignDefault, 0) + columns rows = replicate (columCount rows) mempty + columCount rows = length $ head rows + +tableParseHeader :: TWParser ((Alignment, Double), B.Blocks) +tableParseHeader = try $ do + char '|' + leftSpaces <- many spaceChar >>= return . length + char '*' + content <- tableColumnContent (char '*' >> skipSpaces >> char '|') + char '*' + rightSpaces <- many spaceChar >>= return . length + optional tableEndOfRow + return (tableAlign leftSpaces rightSpaces, content) + where + tableAlign left right + | left >= 2 && left == right = (AlignCenter, 0) + | left > right = (AlignRight, 0) + | otherwise = (AlignLeft, 0) + +tableParseRow :: TWParser [B.Blocks] +tableParseRow = many1Till tableParseColumn newline + +tableParseColumn :: TWParser B.Blocks +tableParseColumn = char '|' *> skipSpaces *> + tableColumnContent (skipSpaces >> char '|') + <* skipSpaces <* optional tableEndOfRow + +tableEndOfRow :: TWParser Char +tableEndOfRow = lookAhead (try $ char '|' >> char '\n') >> char '|' + +tableColumnContent :: Show a => TWParser a -> TWParser B.Blocks +tableColumnContent end = manyTill content (lookAhead $ try end) >>= return . B.plain . mconcat + where + content = continuation <|> inline + continuation = try $ char '\\' >> newline >> return mempty + +blockQuote :: TWParser B.Blocks +blockQuote = parseHtmlContent "blockquote" block >>= return . B.blockQuote . mconcat + +noautolink :: TWParser B.Blocks +noautolink = do + (_, content) <- htmlElement "noautolink" + st <- getState + setState $ st{ stateAllowLinks = False } + blocks <- try $ parseContent content + setState $ st{ stateAllowLinks = True } + return $ mconcat blocks + where + parseContent = parseFromString $ many $ block + +para :: TWParser B.Blocks +para = many1Till inline endOfParaElement >>= return . result . mconcat + where + endOfParaElement = lookAhead $ endOfInput <|> endOfPara <|> newBlockElement + endOfInput = try $ skipMany blankline >> skipSpaces >> eof + endOfPara = try $ blankline >> skipMany1 blankline + newBlockElement = try $ blankline >> skip blockElements + result content = if F.all (==Space) content + then mempty + else B.para $ B.trimInlines content + + +-- +-- inline parsers +-- + +inline :: TWParser B.Inlines +inline = choice [ whitespace + , br + , macro + , strong + , strongHtml + , strongAndEmph + , emph + , emphHtml + , boldCode + , smart + , link + , htmlComment + , code + , codeHtml + , nop + , autoLink + , str + , symbol + ] <?> "inline" + +whitespace :: TWParser B.Inlines +whitespace = (lb <|> regsp) >>= return + where lb = try $ skipMany spaceChar >> linebreak >> return B.space + regsp = try $ skipMany1 spaceChar >> return B.space + +br :: TWParser B.Inlines +br = try $ string "%BR%" >> return B.linebreak + +linebreak :: TWParser B.Inlines +linebreak = newline >> notFollowedBy newline >> (lastNewline <|> innerNewline) + where lastNewline = eof >> return mempty + innerNewline = return B.space + +between :: (Show b, Monoid c) => TWParser a -> TWParser b -> (TWParser b -> TWParser c) -> TWParser c +between start end p = + mconcat <$> try (start >> notFollowedBy whitespace >> many1Till (p end) end) + +enclosed :: (Show a, Monoid b) => TWParser a -> (TWParser a -> TWParser b) -> TWParser b +enclosed sep p = between sep (try $ sep <* endMarker) p + where + endMarker = lookAhead $ skip endSpace <|> skip (oneOf ".,!?:)|") <|> eof + endSpace = (spaceChar <|> newline) >> return B.space + +macro :: TWParser B.Inlines +macro = macroWithParameters <|> withoutParameters + where + withoutParameters = enclosed (char '%') (\_ -> macroName) >>= return . emptySpan + emptySpan name = buildSpan name [] mempty + +macroWithParameters :: TWParser B.Inlines +macroWithParameters = try $ do + char '%' + name <- macroName + (content, kvs) <- attributes + char '%' + return $ buildSpan name kvs $ B.str content + +buildSpan :: String -> [(String, String)] -> B.Inlines -> B.Inlines +buildSpan className kvs = B.spanWith attrs + where + attrs = ("", ["twiki-macro", className] ++ additionalClasses, kvsWithoutClasses) + additionalClasses = maybe [] words $ lookup "class" kvs + kvsWithoutClasses = [(k,v) | (k,v) <- kvs, k /= "class"] + +macroName :: TWParser String +macroName = do + first <- letter + rest <- many $ alphaNum <|> char '_' + return (first:rest) + +attributes :: TWParser (String, [(String, String)]) +attributes = char '{' *> spnl *> many (attribute <* spnl) <* char '}' >>= + return . foldr (either mkContent mkKvs) ([], []) + where + spnl = skipMany (spaceChar <|> newline) + mkContent c ([], kvs) = (c, kvs) + mkContent c (rest, kvs) = (c ++ " " ++ rest, kvs) + mkKvs kv (cont, rest) = (cont, (kv : rest)) + +attribute :: TWParser (Either String (String, String)) +attribute = withKey <|> withoutKey + where + withKey = try $ do + key <- macroName + char '=' + parseValue False >>= return . (curry Right key) + withoutKey = try $ parseValue True >>= return . Left + parseValue allowSpaces = (withQuotes <|> withoutQuotes allowSpaces) >>= return . fromEntities + withQuotes = between (char '"') (char '"') (\_ -> count 1 $ noneOf ['"']) + withoutQuotes allowSpaces + | allowSpaces == True = many1 $ noneOf "}" + | otherwise = many1 $ noneOf " }" + +nestedInlines :: Show a => TWParser a -> TWParser B.Inlines +nestedInlines end = innerSpace <|> nestedInline + where + innerSpace = try $ whitespace <* (notFollowedBy end) + nestedInline = notFollowedBy whitespace >> nested inline + +strong :: TWParser B.Inlines +strong = try $ enclosed (char '*') nestedInlines >>= return . B.strong + +strongHtml :: TWParser B.Inlines +strongHtml = (parseHtmlContent "strong" inline <|> parseHtmlContent "b" inline) + >>= return . B.strong . mconcat + +strongAndEmph :: TWParser B.Inlines +strongAndEmph = try $ enclosed (string "__") nestedInlines >>= return . B.emph . B.strong + +emph :: TWParser B.Inlines +emph = try $ enclosed (char '_') nestedInlines >>= return . B.emph + +emphHtml :: TWParser B.Inlines +emphHtml = (parseHtmlContent "em" inline <|> parseHtmlContent "i" inline) + >>= return . B.emph . mconcat + +nestedString :: Show a => TWParser a -> TWParser String +nestedString end = innerSpace <|> (count 1 nonspaceChar) + where + innerSpace = try $ many1 spaceChar <* notFollowedBy end + +boldCode :: TWParser B.Inlines +boldCode = try $ enclosed (string "==") nestedString >>= return . B.strong . B.code . fromEntities + +htmlComment :: TWParser B.Inlines +htmlComment = htmlTag isCommentTag >> return mempty + +code :: TWParser B.Inlines +code = try $ enclosed (char '=') nestedString >>= return . B.code . fromEntities + +codeHtml :: TWParser B.Inlines +codeHtml = do + (attrs, content) <- parseHtmlContentWithAttrs "code" anyChar + return $ B.codeWith attrs $ fromEntities content + +autoLink :: TWParser B.Inlines +autoLink = try $ do + state <- getState + guard $ stateAllowLinks state + (text, url) <- parseLink + guard $ checkLink (head $ reverse url) + return $ makeLink (text, url) + where + parseLink = notFollowedBy nop >> (uri <|> emailAddress) + makeLink (text, url) = B.link url "" $ B.str text + checkLink c + | c == '/' = True + | otherwise = isAlphaNum c + +str :: TWParser B.Inlines +str = (many1 alphaNum <|> count 1 characterReference) >>= return . B.str + +nop :: TWParser B.Inlines +nop = try $ (skip exclamation <|> skip nopTag) >> followContent + where + exclamation = char '!' + nopTag = stringAnyCase "<nop>" + followContent = many1 nonspaceChar >>= return . B.str . fromEntities + +symbol :: TWParser B.Inlines +symbol = count 1 nonspaceChar >>= return . B.str + +smart :: TWParser B.Inlines +smart = do + getOption readerSmart >>= guard + doubleQuoted <|> singleQuoted <|> + choice [ apostrophe + , dash + , ellipses + ] + +singleQuoted :: TWParser B.Inlines +singleQuoted = try $ do + singleQuoteStart + withQuoteContext InSingleQuote $ + many1Till inline singleQuoteEnd >>= + (return . B.singleQuoted . B.trimInlines . mconcat) + +doubleQuoted :: TWParser B.Inlines +doubleQuoted = try $ do + doubleQuoteStart + contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline) + (withQuoteContext InDoubleQuote $ doubleQuoteEnd >> + return (B.doubleQuoted $ B.trimInlines contents)) + <|> (return $ (B.str "\8220") B.<> contents) + +link :: TWParser B.Inlines +link = try $ do + st <- getState + guard $ stateAllowLinks st + setState $ st{ stateAllowLinks = False } + (url, title, content) <- linkText + setState $ st{ stateAllowLinks = True } + return $ B.link url title content + +linkText :: TWParser (String, String, B.Inlines) +linkText = do + string "[[" + url <- many1Till anyChar (char ']') + content <- option [B.str url] linkContent + char ']' + return (url, "", mconcat content) + where + linkContent = (char '[') >> many1Till anyChar (char ']') >>= parseLinkContent + parseLinkContent = parseFromString $ many1 inline diff --git a/src/Text/Pandoc/Readers/TeXMath.hs b/src/Text/Pandoc/Readers/TeXMath.hs index 6bd617f7e..3fee3051e 100644 --- a/src/Text/Pandoc/Readers/TeXMath.hs +++ b/src/Text/Pandoc/Readers/TeXMath.hs @@ -1,5 +1,5 @@ {- -Copyright (C) 2007-2010 John MacFarlane <jgm@berkeley.edu> +Copyright (C) 2007-2014 John MacFarlane <jgm@berkeley.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.TeXMath - Copyright : Copyright (C) 2007-2010 John MacFarlane + Copyright : Copyright (C) 2007-2014 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -27,7 +27,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Conversion of TeX math to a list of 'Pandoc' inline elements. -} -module Text.Pandoc.Readers.TeXMath ( readTeXMath, readTeXMath' ) where +module Text.Pandoc.Readers.TeXMath ( texMathToInlines ) where import Text.Pandoc.Definition import Text.TeXMath @@ -35,22 +35,14 @@ import Text.TeXMath -- | Converts a raw TeX math formula to a list of 'Pandoc' inlines. -- Defaults to raw formula between @$@ or @$$@ characters if entire formula -- can't be converted. -readTeXMath' :: MathType +texMathToInlines :: MathType -> String -- ^ String to parse (assumes @'\n'@ line endings) -> [Inline] -readTeXMath' mt inp = case texMathToPandoc dt inp of - Left _ -> [Str (delim ++ inp ++ delim)] - Right res -> res +texMathToInlines mt inp = + case writePandoc dt `fmap` readTeX inp of + Right (Just ils) -> ils + _ -> [Str (delim ++ inp ++ delim)] where (dt, delim) = case mt of DisplayMath -> (DisplayBlock, "$$") InlineMath -> (DisplayInline, "$") -{-# DEPRECATED readTeXMath "Use readTeXMath' from Text.Pandoc.JSON instead" #-} --- | Converts a raw TeX math formula to a list of 'Pandoc' inlines. --- Defaults to raw formula between @$@ characters if entire formula --- can't be converted. (This is provided for backwards compatibility; --- it is better to use @readTeXMath'@, which properly distinguishes --- between display and inline math.) -readTeXMath :: String -- ^ String to parse (assumes @'\n'@ line endings) - -> [Inline] -readTeXMath = readTeXMath' InlineMath diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index 93658cdea..ee64e8f2a 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -1,5 +1,6 @@ {- -Copyright (C) 2010 Paul Rivier <paul*rivier#demotera*com> | tr '*#' '.@' +Copyright (C) 2010-2014 Paul Rivier <paul*rivier#demotera*com> | tr '*#' '.@' + and John MacFarlane This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Textile - Copyright : Copyright (C) 2010-2012 Paul Rivier and John MacFarlane + Copyright : Copyright (C) 2010-2014 Paul Rivier and John MacFarlane License : GNU GPL, version 2 or above Maintainer : Paul Rivier <paul*rivier#demotera*com> @@ -50,20 +51,22 @@ TODO : refactor common patterns across readers : module Text.Pandoc.Readers.Textile ( readTextile) where - import Text.Pandoc.Definition +import Text.Pandoc.Builder (Inlines, Blocks, trimInlines) import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Shared import Text.Pandoc.Options import Text.Pandoc.Parsing -import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag ) +import Text.Pandoc.Readers.HTML ( htmlTag, isBlockTag ) import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock ) import Text.HTML.TagSoup (parseTags, innerText, fromAttrib, Tag(..)) import Text.HTML.TagSoup.Match import Data.List ( intercalate ) -import Data.Char ( digitToInt, isUpper ) -import Control.Monad ( guard, liftM ) -import Control.Applicative ((<$>), (*>), (<*)) +import Data.Char ( digitToInt, isUpper) +import Control.Monad ( guard, liftM, when ) +import Text.Printf +import Control.Applicative ((<$>), (*>), (<*), (<$)) +import Data.Monoid +import Debug.Trace (trace) -- | Parse a Textile text and return a Pandoc document. readTextile :: ReaderOptions -- ^ Reader options @@ -95,7 +98,7 @@ parseTextile = do updateState $ \s -> s { stateNotes = reverse reversedNotes } -- now parse it for real... blocks <- parseBlocks - return $ Pandoc nullMeta blocks -- FIXME + return $ Pandoc nullMeta (B.toList blocks) -- FIXME noteMarker :: Parser [Char] ParserState [Char] noteMarker = skipMany spaceChar >> string "fn" >> manyTill digit (char '.') @@ -115,11 +118,11 @@ noteBlock = try $ do return $ replicate (sourceLine endPos - sourceLine startPos) '\n' -- | Parse document blocks -parseBlocks :: Parser [Char] ParserState [Block] -parseBlocks = manyTill block eof +parseBlocks :: Parser [Char] ParserState Blocks +parseBlocks = mconcat <$> manyTill block eof -- | Block parsers list tried in definition order -blockParsers :: [Parser [Char] ParserState Block] +blockParsers :: [Parser [Char] ParserState Blocks] blockParsers = [ codeBlock , header , blockQuote @@ -130,29 +133,37 @@ blockParsers = [ codeBlock , rawLaTeXBlock' , maybeExplicitBlock "table" table , maybeExplicitBlock "p" para + , mempty <$ blanklines ] -- | Any block in the order of definition of blockParsers -block :: Parser [Char] ParserState Block -block = choice blockParsers <?> "block" - -commentBlock :: Parser [Char] ParserState Block +block :: Parser [Char] ParserState Blocks +block = do + res <- choice blockParsers <?> "block" + pos <- getPosition + tr <- getOption readerTrace + when tr $ + trace (printf "line %d: %s" (sourceLine pos) + (take 60 $ show $ B.toList res)) (return ()) + return res + +commentBlock :: Parser [Char] ParserState Blocks commentBlock = try $ do string "###." manyTill anyLine blanklines - return Null + return mempty -codeBlock :: Parser [Char] ParserState Block +codeBlock :: Parser [Char] ParserState Blocks codeBlock = codeBlockBc <|> codeBlockPre -codeBlockBc :: Parser [Char] ParserState Block +codeBlockBc :: Parser [Char] ParserState Blocks codeBlockBc = try $ do string "bc. " contents <- manyTill anyLine blanklines - return $ CodeBlock ("",[],[]) $ unlines contents + return $ B.codeBlock (unlines contents) -- | Code Blocks in Textile are between <pre> and </pre> -codeBlockPre :: Parser [Char] ParserState Block +codeBlockPre :: Parser [Char] ParserState Blocks codeBlockPre = try $ do (t@(TagOpen _ attrs),_) <- htmlTag (tagOpen (=="pre") (const True)) result' <- (innerText . parseTags) `fmap` -- remove internal tags @@ -169,29 +180,29 @@ codeBlockPre = try $ do let classes = words $ fromAttrib "class" t let ident = fromAttrib "id" t let kvs = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] - return $ CodeBlock (ident,classes,kvs) result''' + return $ B.codeBlockWith (ident,classes,kvs) result''' -- | Header of the form "hN. content" with N in 1..6 -header :: Parser [Char] ParserState Block +header :: Parser [Char] ParserState Blocks header = try $ do char 'h' level <- digitToInt <$> oneOf "123456" attr <- attributes char '.' - whitespace - name <- normalizeSpaces <$> manyTill inline blockBreak - attr' <- registerHeader attr (B.fromList name) - return $ Header level attr' name + lookAhead whitespace + name <- trimInlines . mconcat <$> many inline + attr' <- registerHeader attr name + return $ B.headerWith attr' level name -- | Blockquote of the form "bq. content" -blockQuote :: Parser [Char] ParserState Block +blockQuote :: Parser [Char] ParserState Blocks blockQuote = try $ do string "bq" >> attributes >> char '.' >> whitespace - BlockQuote . singleton <$> para + B.blockQuote <$> para -- Horizontal rule -hrule :: Parser [Char] st Block +hrule :: Parser [Char] st Blocks hrule = try $ do skipSpaces start <- oneOf "-*" @@ -199,62 +210,74 @@ hrule = try $ do skipMany (spaceChar <|> char start) newline optional blanklines - return HorizontalRule + return B.horizontalRule -- Lists handling -- | Can be a bullet list or an ordered list. This implementation is -- strict in the nesting, sublist must start at exactly "parent depth -- plus one" -anyList :: Parser [Char] ParserState Block +anyList :: Parser [Char] ParserState Blocks anyList = try $ anyListAtDepth 1 <* blanklines -- | This allow one type of list to be nested into an other type, -- provided correct nesting -anyListAtDepth :: Int -> Parser [Char] ParserState Block +anyListAtDepth :: Int -> Parser [Char] ParserState Blocks anyListAtDepth depth = choice [ bulletListAtDepth depth, orderedListAtDepth depth, definitionList ] -- | Bullet List of given depth, depth being the number of leading '*' -bulletListAtDepth :: Int -> Parser [Char] ParserState Block -bulletListAtDepth depth = try $ BulletList <$> many1 (bulletListItemAtDepth depth) +bulletListAtDepth :: Int -> Parser [Char] ParserState Blocks +bulletListAtDepth depth = try $ B.bulletList <$> many1 (bulletListItemAtDepth depth) -- | Bullet List Item of given depth, depth being the number of -- leading '*' -bulletListItemAtDepth :: Int -> Parser [Char] ParserState [Block] +bulletListItemAtDepth :: Int -> Parser [Char] ParserState Blocks bulletListItemAtDepth = genericListItemAtDepth '*' -- | Ordered List of given depth, depth being the number of -- leading '#' -orderedListAtDepth :: Int -> Parser [Char] ParserState Block +orderedListAtDepth :: Int -> Parser [Char] ParserState Blocks orderedListAtDepth depth = try $ do items <- many1 (orderedListItemAtDepth depth) - return (OrderedList (1, DefaultStyle, DefaultDelim) items) + return $ B.orderedList items -- | Ordered List Item of given depth, depth being the number of -- leading '#' -orderedListItemAtDepth :: Int -> Parser [Char] ParserState [Block] +orderedListItemAtDepth :: Int -> Parser [Char] ParserState Blocks orderedListItemAtDepth = genericListItemAtDepth '#' -- | Common implementation of list items -genericListItemAtDepth :: Char -> Int -> Parser [Char] ParserState [Block] +genericListItemAtDepth :: Char -> Int -> Parser [Char] ParserState Blocks genericListItemAtDepth c depth = try $ do count depth (char c) >> attributes >> whitespace - p <- many listInline + p <- mconcat <$> many listInline newline - sublist <- option [] (singleton <$> anyListAtDepth (depth + 1)) - return (Plain p : sublist) + sublist <- option mempty (anyListAtDepth (depth + 1)) + return $ (B.plain p) <> sublist -- | A definition list is a set of consecutive definition items -definitionList :: Parser [Char] ParserState Block -definitionList = try $ DefinitionList <$> many1 definitionListItem +definitionList :: Parser [Char] ParserState Blocks +definitionList = try $ B.definitionList <$> many1 definitionListItem -- | List start character. -listStart :: Parser [Char] st Char -listStart = oneOf "*#-" +listStart :: Parser [Char] ParserState () +listStart = genericListStart '*' + <|> () <$ genericListStart '#' + <|> () <$ definitionListStart + +genericListStart :: Char -> Parser [Char] st () +genericListStart c = () <$ try (many1 (char c) >> whitespace) + +definitionListStart :: Parser [Char] ParserState Inlines +definitionListStart = try $ do + char '-' + whitespace + trimInlines . mconcat <$> + many1Till inline (try (string ":=")) <* optional whitespace -listInline :: Parser [Char] ParserState Inline +listInline :: Parser [Char] ParserState Inlines listInline = try (notFollowedBy newline >> inline) <|> try (endline <* notFollowedBy listStart) @@ -262,16 +285,15 @@ listInline = try (notFollowedBy newline >> inline) -- the term defined, then spaces and ":=". The definition follows, on -- the same single line, or spaned on multiple line, after a line -- break. -definitionListItem :: Parser [Char] ParserState ([Inline], [[Block]]) +definitionListItem :: Parser [Char] ParserState (Inlines, [Blocks]) definitionListItem = try $ do - string "- " - term <- many1Till inline (try (whitespace >> string ":=")) + term <- definitionListStart def' <- multilineDef <|> inlineDef return (term, def') - where inlineDef :: Parser [Char] ParserState [[Block]] - inlineDef = liftM (\d -> [[Plain d]]) - $ optional whitespace >> many listInline <* newline - multilineDef :: Parser [Char] ParserState [[Block]] + where inlineDef :: Parser [Char] ParserState [Blocks] + inlineDef = liftM (\d -> [B.plain d]) + $ optional whitespace >> (trimInlines . mconcat <$> many listInline) <* newline + multilineDef :: Parser [Char] ParserState [Blocks] multilineDef = try $ do optional whitespace >> newline s <- many1Till anyChar (try (string "=:" >> newline)) @@ -279,68 +301,61 @@ definitionListItem = try $ do ds <- parseFromString parseBlocks (s ++ "\n\n") return [ds] --- | This terminates a block such as a paragraph. Because of raw html --- blocks support, we have to lookAhead for a rawHtmlBlock. -blockBreak :: Parser [Char] ParserState () -blockBreak = try (newline >> blanklines >> return ()) <|> - try (optional spaces >> lookAhead rawHtmlBlock >> return ()) - -- raw content -- | A raw Html Block, optionally followed by blanklines -rawHtmlBlock :: Parser [Char] ParserState Block +rawHtmlBlock :: Parser [Char] ParserState Blocks rawHtmlBlock = try $ do + skipMany spaceChar (_,b) <- htmlTag isBlockTag optional blanklines - return $ RawBlock (Format "html") b + return $ B.rawBlock "html" b -- | Raw block of LaTeX content -rawLaTeXBlock' :: Parser [Char] ParserState Block +rawLaTeXBlock' :: Parser [Char] ParserState Blocks rawLaTeXBlock' = do guardEnabled Ext_raw_tex - RawBlock (Format "latex") <$> (rawLaTeXBlock <* spaces) + B.rawBlock "latex" <$> (rawLaTeXBlock <* spaces) -- | In textile, paragraphs are separated by blank lines. -para :: Parser [Char] ParserState Block -para = try $ Para . normalizeSpaces <$> manyTill inline blockBreak - +para :: Parser [Char] ParserState Blocks +para = B.para . trimInlines . mconcat <$> many1 inline -- Tables -- | A table cell spans until a pipe | -tableCell :: Parser [Char] ParserState TableCell +tableCell :: Parser [Char] ParserState Blocks tableCell = do c <- many1 (noneOf "|\n") - content <- parseFromString (many1 inline) c - return $ [ Plain $ normalizeSpaces content ] + content <- trimInlines . mconcat <$> parseFromString (many1 inline) c + return $ B.plain content -- | A table row is made of many table cells -tableRow :: Parser [Char] ParserState [TableCell] +tableRow :: Parser [Char] ParserState [Blocks] tableRow = try $ ( char '|' *> (endBy1 tableCell (optional blankline *> char '|')) <* newline) -- | Many table rows -tableRows :: Parser [Char] ParserState [[TableCell]] +tableRows :: Parser [Char] ParserState [[Blocks]] tableRows = many1 tableRow -- | Table headers are made of cells separated by a tag "|_." -tableHeaders :: Parser [Char] ParserState [TableCell] +tableHeaders :: Parser [Char] ParserState [Blocks] tableHeaders = let separator = (try $ string "|_.") in try $ ( separator *> (sepBy1 tableCell separator) <* char '|' <* newline ) -- | A table with an optional header. Current implementation can -- handle tables with and without header, but will parse cells -- alignment attributes as content. -table :: Parser [Char] ParserState Block +table :: Parser [Char] ParserState Blocks table = try $ do - headers <- option [] tableHeaders + headers <- option mempty tableHeaders rows <- tableRows blanklines let nbOfCols = max (length headers) (length $ head rows) - return $ Table [] - (replicate nbOfCols AlignDefault) - (replicate nbOfCols 0.0) + return $ B.table mempty + (zip (replicate nbOfCols AlignDefault) (replicate nbOfCols 0.0)) headers rows @@ -348,8 +363,8 @@ table = try $ do -- | Blocks like 'p' and 'table' do not need explicit block tag. -- However, they can be used to set HTML/CSS attributes when needed. maybeExplicitBlock :: String -- ^ block tag name - -> Parser [Char] ParserState Block -- ^ implicit block - -> Parser [Char] ParserState Block + -> Parser [Char] ParserState Blocks -- ^ implicit block + -> Parser [Char] ParserState Blocks maybeExplicitBlock name blk = try $ do optional $ try $ string name >> attributes >> char '.' >> optional whitespace >> optional endline @@ -363,73 +378,74 @@ maybeExplicitBlock name blk = try $ do -- | Any inline element -inline :: Parser [Char] ParserState Inline -inline = choice inlineParsers <?> "inline" +inline :: Parser [Char] ParserState Inlines +inline = do + choice inlineParsers <?> "inline" -- | Inline parsers tried in order -inlineParsers :: [Parser [Char] ParserState Inline] +inlineParsers :: [Parser [Char] ParserState Inlines] inlineParsers = [ str , whitespace , endline , code , escapedInline - , htmlSpan + , inlineMarkup + , groupedInlineMarkup , rawHtmlInline , rawLaTeXInline' , note - , try $ (char '[' *> inlineMarkup <* char ']') - , inlineMarkup , link , image , mark - , (Str . (:[])) <$> characterReference + , (B.str . (:[])) <$> characterReference , smartPunctuation inline , symbol ] -- | Inline markups -inlineMarkup :: Parser [Char] ParserState Inline -inlineMarkup = choice [ simpleInline (string "??") (Cite []) - , simpleInline (string "**") Strong - , simpleInline (string "__") Emph - , simpleInline (char '*') Strong - , simpleInline (char '_') Emph - , simpleInline (char '+') Emph -- approximates underline - , simpleInline (char '-' <* notFollowedBy (char '-')) Strikeout - , simpleInline (char '^') Superscript - , simpleInline (char '~') Subscript +inlineMarkup :: Parser [Char] ParserState Inlines +inlineMarkup = choice [ simpleInline (string "??") (B.cite []) + , simpleInline (string "**") B.strong + , simpleInline (string "__") B.emph + , simpleInline (char '*') B.strong + , simpleInline (char '_') B.emph + , simpleInline (char '+') B.emph -- approximates underline + , simpleInline (char '-' <* notFollowedBy (char '-')) B.strikeout + , simpleInline (char '^') B.superscript + , simpleInline (char '~') B.subscript + , simpleInline (char '%') id ] -- | Trademark, registered, copyright -mark :: Parser [Char] st Inline +mark :: Parser [Char] st Inlines mark = try $ char '(' >> (try tm <|> try reg <|> copy) -reg :: Parser [Char] st Inline +reg :: Parser [Char] st Inlines reg = do oneOf "Rr" char ')' - return $ Str "\174" + return $ B.str "\174" -tm :: Parser [Char] st Inline +tm :: Parser [Char] st Inlines tm = do oneOf "Tt" oneOf "Mm" char ')' - return $ Str "\8482" + return $ B.str "\8482" -copy :: Parser [Char] st Inline +copy :: Parser [Char] st Inlines copy = do oneOf "Cc" char ')' - return $ Str "\169" + return $ B.str "\169" -note :: Parser [Char] ParserState Inline +note :: Parser [Char] ParserState Inlines note = try $ do ref <- (char '[' *> many1 digit <* char ']') notes <- stateNotes <$> getState case lookup ref notes of Nothing -> fail "note not found" - Just raw -> liftM Note $ parseFromString parseBlocks raw + Just raw -> B.note <$> parseFromString parseBlocks raw -- | Special chars markupChars :: [Char] @@ -450,7 +466,7 @@ wordBoundaries = markupChars ++ stringBreakers hyphenedWords :: Parser [Char] ParserState String hyphenedWords = do x <- wordChunk - xs <- many (try $ char '-' >> wordChunk) + xs <- many (try $ char '-' >> wordChunk) return $ intercalate "-" (x:xs) wordChunk :: Parser [Char] ParserState String @@ -462,99 +478,104 @@ wordChunk = try $ do return $ hd:tl -- | Any string -str :: Parser [Char] ParserState Inline +str :: Parser [Char] ParserState Inlines str = do baseStr <- hyphenedWords -- RedCloth compliance : if parsed word is uppercase and immediatly -- followed by parens, parens content is unconditionally word acronym fullStr <- option baseStr $ try $ do guard $ all isUpper baseStr - acro <- enclosed (char '(') (char ')') anyChar + acro <- enclosed (char '(') (char ')') anyChar' return $ concat [baseStr, " (", acro, ")"] updateLastStrPos - return $ Str fullStr - --- | Textile allows HTML span infos, we discard them -htmlSpan :: Parser [Char] ParserState Inline -htmlSpan = try $ Str <$> ( char '%' *> attributes *> manyTill anyChar (char '%') ) + return $ B.str fullStr -- | Some number of space chars -whitespace :: Parser [Char] ParserState Inline -whitespace = many1 spaceChar >> return Space <?> "whitespace" +whitespace :: Parser [Char] st Inlines +whitespace = many1 spaceChar >> return B.space <?> "whitespace" -- | In Textile, an isolated endline character is a line break -endline :: Parser [Char] ParserState Inline +endline :: Parser [Char] ParserState Inlines endline = try $ do - newline >> notFollowedBy blankline - return LineBreak + newline + notFollowedBy blankline + notFollowedBy listStart + notFollowedBy rawHtmlBlock + return B.linebreak -rawHtmlInline :: Parser [Char] ParserState Inline -rawHtmlInline = RawInline (Format "html") . snd <$> htmlTag isInlineTag +rawHtmlInline :: Parser [Char] ParserState Inlines +rawHtmlInline = B.rawInline "html" . snd <$> htmlTag (const True) -- | Raw LaTeX Inline -rawLaTeXInline' :: Parser [Char] ParserState Inline +rawLaTeXInline' :: Parser [Char] ParserState Inlines rawLaTeXInline' = try $ do guardEnabled Ext_raw_tex - rawLaTeXInline + B.singleton <$> rawLaTeXInline -- | Textile standard link syntax is "label":target. But we -- can also have ["label":target]. -link :: Parser [Char] ParserState Inline -link = linkB <|> linkNoB - -linkNoB :: Parser [Char] ParserState Inline -linkNoB = try $ do - name <- surrounded (char '"') inline - char ':' - let stopChars = "!.,;:" - url <- manyTill nonspaceChar (lookAhead $ space <|> try (oneOf stopChars >> (space <|> newline))) - let name' = if name == [Str "$"] then [Str url] else name - return $ Link name' (url, "") - -linkB :: Parser [Char] ParserState Inline -linkB = try $ do - char '[' - name <- surrounded (char '"') inline +link :: Parser [Char] ParserState Inlines +link = try $ do + bracketed <- (True <$ char '[') <|> return False + char '"' *> notFollowedBy (oneOf " \t\n\r") + attr <- attributes + name <- trimInlines . mconcat <$> + withQuoteContext InDoubleQuote (many1Till inline (char '"')) char ':' - url <- manyTill nonspaceChar (char ']') - let name' = if name == [Str "$"] then [Str url] else name - return $ Link name' (url, "") + let stop = if bracketed + then char ']' + else lookAhead $ space <|> + try (oneOf "!.,;:" *> (space <|> newline)) + url <- manyTill nonspaceChar stop + let name' = if B.toList name == [Str "$"] then B.str url else name + return $ if attr == nullAttr + then B.link url "" name' + else B.spanWith attr $ B.link url "" name' -- | image embedding -image :: Parser [Char] ParserState Inline +image :: Parser [Char] ParserState Inlines image = try $ do char '!' >> notFollowedBy space - src <- manyTill anyChar (lookAhead $ oneOf "!(") - alt <- option "" (try $ (char '(' >> manyTill anyChar (char ')'))) + src <- manyTill anyChar' (lookAhead $ oneOf "!(") + alt <- option "" (try $ (char '(' >> manyTill anyChar' (char ')'))) char '!' - return $ Image [Str alt] (src, alt) + return $ B.image src alt (B.str alt) -escapedInline :: Parser [Char] ParserState Inline +escapedInline :: Parser [Char] ParserState Inlines escapedInline = escapedEqs <|> escapedTag -escapedEqs :: Parser [Char] ParserState Inline -escapedEqs = Str <$> (try $ string "==" *> manyTill anyChar (try $ string "==")) +escapedEqs :: Parser [Char] ParserState Inlines +escapedEqs = B.str <$> + (try $ string "==" *> manyTill anyChar' (try $ string "==")) -- | literal text escaped btw <notextile> tags -escapedTag :: Parser [Char] ParserState Inline -escapedTag = Str <$> - (try $ string "<notextile>" *> manyTill anyChar (try $ string "</notextile>")) +escapedTag :: Parser [Char] ParserState Inlines +escapedTag = B.str <$> + (try $ string "<notextile>" *> + manyTill anyChar' (try $ string "</notextile>")) -- | Any special symbol defined in wordBoundaries -symbol :: Parser [Char] ParserState Inline -symbol = Str . singleton <$> (oneOf wordBoundaries <|> oneOf markupChars) +symbol :: Parser [Char] ParserState Inlines +symbol = B.str . singleton <$> (notFollowedBy newline *> + notFollowedBy rawHtmlBlock *> + oneOf wordBoundaries) -- | Inline code -code :: Parser [Char] ParserState Inline +code :: Parser [Char] ParserState Inlines code = code1 <|> code2 -code1 :: Parser [Char] ParserState Inline -code1 = Code nullAttr <$> surrounded (char '@') anyChar +-- any character except a newline before a blank line +anyChar' :: Parser [Char] ParserState Char +anyChar' = + satisfy (/='\n') <|> (try $ char '\n' <* notFollowedBy blankline) -code2 :: Parser [Char] ParserState Inline +code1 :: Parser [Char] ParserState Inlines +code1 = B.code <$> surrounded (char '@') anyChar' + +code2 :: Parser [Char] ParserState Inlines code2 = do htmlTag (tagOpen (=="tt") null) - Code nullAttr <$> manyTill anyChar (try $ htmlTag $ tagClose (=="tt")) + B.code <$> manyTill anyChar' (try $ htmlTag $ tagClose (=="tt")) -- | Html / CSS attributes attributes :: Parser [Char] ParserState Attr @@ -566,7 +587,7 @@ attribute = classIdAttr <|> styleAttr <|> langAttr classIdAttr :: Parser [Char] ParserState (Attr -> Attr) classIdAttr = try $ do -- (class class #id) char '(' - ws <- words `fmap` manyTill anyChar (char ')') + ws <- words `fmap` manyTill anyChar' (char ')') case reverse ws of [] -> return $ \(_,_,keyvals) -> ("",[],keyvals) (('#':ident'):classes') -> return $ \(_,_,keyvals) -> @@ -576,28 +597,50 @@ classIdAttr = try $ do -- (class class #id) styleAttr :: Parser [Char] ParserState (Attr -> Attr) styleAttr = do - style <- try $ enclosed (char '{') (char '}') anyChar + style <- try $ enclosed (char '{') (char '}') anyChar' return $ \(id',classes,keyvals) -> (id',classes,("style",style):keyvals) langAttr :: Parser [Char] ParserState (Attr -> Attr) langAttr = do - lang <- try $ enclosed (char '[') (char ']') anyChar + lang <- try $ enclosed (char '[') (char ']') alphaNum return $ \(id',classes,keyvals) -> (id',classes,("lang",lang):keyvals) -- | Parses material surrounded by a parser. surrounded :: Parser [Char] st t -- ^ surrounding parser -> Parser [Char] st a -- ^ content parser (to be used repeatedly) -> Parser [Char] st [a] -surrounded border = enclosed (border *> notFollowedBy (oneOf " \t\n\r")) (try border) +surrounded border = + enclosed (border *> notFollowedBy (oneOf " \t\n\r")) (try border) --- | Inlines are most of the time of the same form simpleInline :: Parser [Char] ParserState t -- ^ surrounding parser - -> ([Inline] -> Inline) -- ^ Inline constructor - -> Parser [Char] ParserState Inline -- ^ content parser (to be used repeatedly) -simpleInline border construct = surrounded border inlineWithAttribute >>= - return . construct . normalizeSpaces - where inlineWithAttribute = (try $ optional attributes) >> inline + -> (Inlines -> Inlines) -- ^ Inline constructor + -> Parser [Char] ParserState Inlines -- ^ content parser (to be used repeatedly) +simpleInline border construct = try $ do + st <- getState + pos <- getPosition + let afterString = stateLastStrPos st == Just pos + guard $ not afterString + border *> notFollowedBy (oneOf " \t\n\r") + attr <- attributes + body <- trimInlines . mconcat <$> + withQuoteContext InSingleQuote + (manyTill (notFollowedBy newline >> inline) + (try border <* notFollowedBy alphaNum)) + return $ construct $ + if attr == nullAttr + then body + else B.spanWith attr body + +groupedInlineMarkup :: Parser [Char] ParserState Inlines +groupedInlineMarkup = try $ do + char '[' + sp1 <- option mempty $ B.space <$ whitespace + result <- withQuoteContext InSingleQuote inlineMarkup + sp2 <- option mempty $ B.space <$ whitespace + char ']' + return $ sp1 <> result <> sp2 -- | Create a singleton list singleton :: a -> [a] singleton x = [x] + diff --git a/src/Text/Pandoc/Readers/Txt2Tags.hs b/src/Text/Pandoc/Readers/Txt2Tags.hs new file mode 100644 index 000000000..6f8c19ac7 --- /dev/null +++ b/src/Text/Pandoc/Readers/Txt2Tags.hs @@ -0,0 +1,579 @@ +{-# LANGUAGE ViewPatterns #-} +{- +Copyright (C) 2014 Matthew Pickering <matthewtpickering@gmail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Txt2Tags + Copyright : Copyright (C) 2014 Matthew Pickering + License : GNU GPL, version 2 or above + + Maintainer : Matthew Pickering <matthewtpickering@gmail.com> + +Conversion of txt2tags formatted plain text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Txt2Tags ( readTxt2Tags + , getT2TMeta + , T2TMeta (..) + , readTxt2TagsNoMacros) + where + +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder ( Inlines, Blocks, (<>) + , trimInlines ) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Shared (escapeURI,compactify', compactify'DL) +import Text.Pandoc.Parsing hiding (space, spaces, uri, macro) +import Control.Applicative ((<$>), (<$), (<*>), (<*), (*>)) +import Data.Char (toLower) +import Data.List (transpose, intersperse, intercalate) +import Data.Maybe (fromMaybe) +import Data.Monoid (Monoid, mconcat, mempty, mappend) +--import Network.URI (isURI) -- Not sure whether to use this function +import Control.Monad (void, guard, when) +import Data.Default +import Control.Monad.Reader (Reader, runReader, asks) + +import Data.Time.LocalTime (getZonedTime) +import Text.Pandoc.Compat.Directory(getModificationTime) +import Data.Time.Format (formatTime) +import System.Locale (defaultTimeLocale) +import System.IO.Error (catchIOError) + +type T2T = ParserT String ParserState (Reader T2TMeta) + +-- | An object for the T2T macros meta information +-- the contents of each field is simply substituted verbatim into the file +data T2TMeta = T2TMeta { + date :: String -- ^ Current date + , mtime :: String -- ^ Last modification time of infile + , infile :: FilePath -- ^ Input file + , outfile :: FilePath -- ^ Output file + } deriving Show + +instance Default T2TMeta where + def = T2TMeta "" "" "" "" + +-- | Get the meta information required by Txt2Tags macros +getT2TMeta :: [FilePath] -> FilePath -> IO T2TMeta +getT2TMeta inps out = do + curDate <- formatTime defaultTimeLocale "%F" <$> getZonedTime + let getModTime = fmap (formatTime defaultTimeLocale "%T") . + getModificationTime + curMtime <- case inps of + [] -> formatTime defaultTimeLocale "%T" <$> getZonedTime + _ -> catchIOError + (maximum <$> mapM getModTime inps) + (const (return "")) + return $ T2TMeta curDate curMtime (intercalate ", " inps) out + +-- | Read Txt2Tags from an input string returning a Pandoc document +readTxt2Tags :: T2TMeta -> ReaderOptions -> String -> Pandoc +readTxt2Tags t opts s = flip runReader t $ readWithM parseT2T (def {stateOptions = opts}) (s ++ "\n\n") + +-- | Read Txt2Tags (ignoring all macros) from an input string returning +-- a Pandoc document +readTxt2TagsNoMacros :: ReaderOptions -> String -> Pandoc +readTxt2TagsNoMacros = readTxt2Tags def + +parseT2T :: T2T Pandoc +parseT2T = do + -- Parse header if standalone flag is set + standalone <- getOption readerStandalone + when standalone parseHeader + body <- mconcat <$> manyTill block eof + meta' <- stateMeta <$> getState + return $ Pandoc meta' (B.toList body) + +parseHeader :: T2T () +parseHeader = do + () <$ try blankline <|> header + meta <- stateMeta <$> getState + optional blanklines + config <- manyTill setting (notFollowedBy setting) + -- TODO: Handle settings better + let settings = foldr (\(k,v) -> B.setMeta k (MetaString v)) meta config + updateState (\s -> s {stateMeta = settings}) <* optional blanklines + +header :: T2T () +header = titleline >> authorline >> dateline + +headerline :: B.ToMetaValue a => String -> T2T a -> T2T () +headerline field p = (() <$ try blankline) + <|> (p >>= updateState . B.setMeta field) + +titleline :: T2T () +titleline = + headerline "title" (trimInlines . mconcat <$> manyTill inline newline) + +authorline :: T2T () +authorline = + headerline "author" (sepBy author (char ';') <* newline) + where + author = trimInlines . mconcat <$> many (notFollowedBy (char ';' <|> newline) >> inline) + +dateline :: T2T () +dateline = headerline "date" (trimInlines . mconcat <$> manyTill inline newline) + +type Keyword = String +type Value = String + +setting :: T2T (Keyword, Value) +setting = do + string "%!" + keyword <- ignoreSpacesCap (many1 alphaNum) + char ':' + value <- ignoreSpacesCap (manyTill anyChar (newline)) + return (keyword, value) + +-- Blocks + +parseBlocks :: T2T Blocks +parseBlocks = mconcat <$> manyTill block eof + +block :: T2T Blocks +block = do + choice + [ mempty <$ blanklines + , quote + , hrule -- hrule must go above title + , title + , commentBlock + , verbatim + , rawBlock + , taggedBlock + , list + , table + , para + ] + +title :: T2T Blocks +title = try $ balancedTitle '+' <|> balancedTitle '=' + +balancedTitle :: Char -> T2T Blocks +balancedTitle c = try $ do + spaces + level <- length <$> many1 (char c) + guard (level <= 5) -- Max header level 5 + heading <- manyTill (noneOf "\n\r") (count level (char c)) + label <- optionMaybe (enclosed (char '[') (char ']') (alphaNum <|> oneOf "_-")) + many spaceChar *> newline + let attr = maybe nullAttr (\x -> (x, [], [])) label + return $ B.headerWith attr level (trimInlines $ B.text heading) + +para :: T2T Blocks +para = try $ do + ils <- parseInlines + nl <- option False (True <$ newline) + option (B.plain ils) (guard nl >> notFollowedBy listStart >> return (B.para ils)) + where + listStart = try bulletListStart <|> orderedListStart + +commentBlock :: T2T Blocks +commentBlock = try (blockMarkupArea (anyLine) (const mempty) "%%%") <|> comment + +-- Seperator and Strong line treated the same +hrule :: T2T Blocks +hrule = try $ do + spaces + line <- many1 (oneOf "=-_") + guard (length line >= 20) + B.horizontalRule <$ blankline + +quote :: T2T Blocks +quote = try $ do + lookAhead tab + rawQuote <- many1 (tab *> optional spaces *> anyLine) + contents <- parseFromString parseBlocks (intercalate "\n" rawQuote ++ "\n\n") + return $ B.blockQuote contents + +commentLine :: T2T Inlines +commentLine = comment + +-- List Parsing code from Org Reader + +list :: T2T Blocks +list = choice [bulletList, orderedList, definitionList] + +bulletList :: T2T Blocks +bulletList = B.bulletList . compactify' + <$> many1 (listItem bulletListStart parseBlocks) + +orderedList :: T2T Blocks +orderedList = B.orderedList . compactify' + <$> many1 (listItem orderedListStart parseBlocks) + +definitionList :: T2T Blocks +definitionList = try $ do + B.definitionList . compactify'DL <$> + many1 (listItem definitionListStart definitionListEnd) + +definitionListEnd :: T2T (Inlines, [Blocks]) +definitionListEnd = (,) <$> (mconcat <$> manyTill inline newline) <*> ((:[]) <$> parseBlocks) + +genericListStart :: T2T Char + -> T2T Int +genericListStart listMarker = try $ + (2+) <$> (length <$> many spaceChar + <* listMarker <* space <* notFollowedBy space) + +-- parses bullet list \start and returns its length (excl. following whitespace) +bulletListStart :: T2T Int +bulletListStart = genericListStart (char '-') + +orderedListStart :: T2T Int +orderedListStart = genericListStart (char '+' ) + +definitionListStart :: T2T Int +definitionListStart = genericListStart (char ':') + +-- parse raw text for one list item, excluding start marker and continuations +listItem :: T2T Int + -> T2T a + -> T2T a +listItem start end = try $ do + markerLength <- try start + firstLine <- anyLineNewline + blank <- option "" ("\n" <$ blankline) + rest <- concat <$> many (listContinuation markerLength) + parseFromString end $ firstLine ++ blank ++ rest + +-- continuation of a list item - indented and separated by blankline or endline. +-- Note: nested lists are parsed as continuations. +listContinuation :: Int + -> T2T String +listContinuation markerLength = try $ + notFollowedBy' (blankline >> blankline) + *> (mappend <$> (concat <$> many1 listLine) + <*> many blankline) + where listLine = try $ indentWith markerLength *> anyLineNewline + +anyLineNewline :: T2T String +anyLineNewline = (++ "\n") <$> anyLine + +indentWith :: Int -> T2T String +indentWith n = count n space + +-- Table + +table :: T2T Blocks +table = try $ do + tableHeader <- fmap snd <$> option mempty (try headerRow) + rows <- many1 (many commentLine *> tableRow) + let columns = transpose rows + let ncolumns = length columns + let aligns = map (foldr1 findAlign) (map (map fst) columns) + let rows' = map (map snd) rows + let size = maximum (map length rows') + let rowsPadded = map (pad size) rows' + let headerPadded = if (not (null tableHeader)) then pad size tableHeader else mempty + return $ B.table mempty + (zip aligns (replicate ncolumns 0.0)) + headerPadded rowsPadded + +pad :: (Show a, Monoid a) => Int -> [a] -> [a] +pad n xs = xs ++ (replicate (n - length xs) mempty) + + +findAlign :: Alignment -> Alignment -> Alignment +findAlign x y + | x == y = x + | otherwise = AlignDefault + +headerRow :: T2T [(Alignment, Blocks)] +headerRow = genericRow (string "||") + +tableRow :: T2T [(Alignment, Blocks)] +tableRow = genericRow (char '|') + +genericRow :: T2T a -> T2T [(Alignment, Blocks)] +genericRow start = try $ do + spaces *> start + manyTill tableCell newline <?> "genericRow" + + +tableCell :: T2T (Alignment, Blocks) +tableCell = try $ do + leftSpaces <- length <$> lookAhead (many1 space) -- Case of empty cell means we must lookAhead + content <- (manyTill inline (try $ lookAhead (cellEnd))) + rightSpaces <- length <$> many space + let align = + case compare leftSpaces rightSpaces of + LT -> AlignLeft + EQ -> AlignCenter + GT -> AlignRight + endOfCell + return $ (align, B.plain (B.trimInlines $ mconcat content)) + where + cellEnd = (void newline <|> (many1 space *> endOfCell)) + +endOfCell :: T2T () +endOfCell = try (skipMany1 $ char '|') <|> ( () <$ lookAhead newline) + +-- Raw area + +verbatim :: T2T Blocks +verbatim = genericBlock anyLineNewline B.codeBlock "```" + +rawBlock :: T2T Blocks +rawBlock = genericBlock anyLineNewline (B.para . B.str) "\"\"\"" + +taggedBlock :: T2T Blocks +taggedBlock = do + target <- getTarget + genericBlock anyLineNewline (B.rawBlock target) "'''" + +-- Generic + +genericBlock :: Monoid a => T2T a -> (a -> Blocks) -> String -> T2T Blocks +genericBlock p f s = blockMarkupArea p f s <|> blockMarkupLine p f s + +blockMarkupArea :: Monoid a => (T2T a) -> (a -> Blocks) -> String -> T2T Blocks +blockMarkupArea p f s = try $ (do + string s *> blankline + f . mconcat <$> (manyTill p (eof <|> void (string s *> blankline)))) + +blockMarkupLine :: T2T a -> (a -> Blocks) -> String -> T2T Blocks +blockMarkupLine p f s = try (f <$> (string s *> space *> p)) + +-- Can be in either block or inline position +comment :: Monoid a => T2T a +comment = try $ do + atStart + notFollowedBy macro + mempty <$ (char '%' *> anyLine) + +-- Inline + +parseInlines :: T2T Inlines +parseInlines = trimInlines . mconcat <$> many1 inline + +inline :: T2T Inlines +inline = do + choice + [ endline + , macro + , commentLine + , whitespace + , url + , link + , image + , bold + , underline + , code + , raw + , tagged + , strike + , italic + , code + , str + , symbol + ] + +bold :: T2T Inlines +bold = inlineMarkup inline B.strong '*' (B.str) + +underline :: T2T Inlines +underline = inlineMarkup inline B.emph '_' (B.str) + +strike :: T2T Inlines +strike = inlineMarkup inline B.strikeout '-' (B.str) + +italic :: T2T Inlines +italic = inlineMarkup inline B.emph '/' (B.str) + +code :: T2T Inlines +code = inlineMarkup ((:[]) <$> anyChar) B.code '`' id + +raw :: T2T Inlines +raw = inlineMarkup ((:[]) <$> anyChar) B.text '"' id + +tagged :: T2T Inlines +tagged = do + target <- getTarget + inlineMarkup ((:[]) <$> anyChar) (B.rawInline target) '\'' id + +-- Parser for markup indicated by a double character. +-- Inline markup is greedy and glued +-- Greedy meaning ***a*** = Bold [Str "*a*"] +-- Glued meaning that markup must be tight to content +-- Markup can't pass newlines +inlineMarkup :: Monoid a + => (T2T a) -- Content parser + -> (a -> Inlines) -- Constructor + -> Char -- Fence + -> (String -> a) -- Special Case to handle ****** + -> T2T Inlines +inlineMarkup p f c special = try $ do + start <- many1 (char c) + let l = length start + guard (l >= 2) + when (l == 2) (void $ notFollowedBy space) + -- We must make sure that there is no space before the start of the + -- closing tags + body <- optionMaybe (try $ manyTill (noneOf "\n\r") $ + (try $ lookAhead (noneOf " " >> string [c,c] ))) + case body of + Just middle -> do + lastChar <- anyChar + end <- many1 (char c) + let parser inp = parseFromString (mconcat <$> many p) inp + let start' = special (drop 2 start) + body' <- parser (middle ++ [lastChar]) + let end' = special (drop 2 end) + return $ f (start' <> body' <> end') + Nothing -> do -- Either bad or case such as ***** + guard (l >= 5) + let body' = (replicate (l - 4) c) + return $ f (special body') + +link :: T2T Inlines +link = try imageLink <|> titleLink + +-- Link with title +titleLink :: T2T Inlines +titleLink = try $ do + char '[' + notFollowedBy space + tokens <- sepBy1 (many $ noneOf " ]") space + guard (length tokens >= 2) + char ']' + let link' = last tokens + guard (length link' > 0) + let tit = concat (intersperse " " (init tokens)) + return $ B.link link' "" (B.text tit) + +-- Link with image +imageLink :: T2T Inlines +imageLink = try $ do + char '[' + body <- image + many1 space + l <- manyTill (noneOf "\n\r ") (char ']') + return (B.link l "" body) + +macro :: T2T Inlines +macro = try $ do + name <- string "%%" *> oneOfStringsCI (map fst commands) + optional (try $ enclosed (char '(') (char ')') anyChar) + lookAhead (spaceChar <|> oneOf specialChars <|> newline) + maybe (return mempty) (\f -> B.str <$> asks f) (lookup name commands) + where + commands = [ ("date", date), ("mtime", mtime) + , ("infile", infile), ("outfile", outfile)] + +-- raw URLs in text are automatically linked +url :: T2T Inlines +url = try $ do + (rawUrl, escapedUrl) <- (try uri <|> emailAddress) + return $ B.link rawUrl "" (B.str escapedUrl) + +uri :: T2T (String, String) +uri = try $ do + address <- t2tURI + return (address, escapeURI address) + +-- The definition of a URI in the T2T source differs from the +-- actual definition. This is a transcription of the definition in +-- the source of v2.6 +--isT2TURI :: String -> Bool +--isT2TURI (parse t2tURI "" -> Right _) = True +--isT2TURI _ = False + +t2tURI :: T2T String +t2tURI = do + start <- try ((++) <$> proto <*> urlLogin) <|> guess + domain <- many1 chars + sep <- many (char '/') + form' <- option mempty ((:) <$> char '?' <*> many1 form) + anchor' <- option mempty ((:) <$> char '#' <*> many anchor) + return (start ++ domain ++ sep ++ form' ++ anchor') + where + protos = ["http", "https", "ftp", "telnet", "gopher", "wais"] + proto = (++) <$> oneOfStrings protos <*> string "://" + guess = (++) <$> (((++) <$> stringAnyCase "www" <*> option mempty ((:[]) <$> oneOf "23")) + <|> stringAnyCase "ftp") <*> ((:[]) <$> char '.') + login = alphaNum <|> oneOf "_.-" + pass = many (noneOf " @") + chars = alphaNum <|> oneOf "%._/~:,=$@&+-" + anchor = alphaNum <|> oneOf "%._0" + form = chars <|> oneOf ";*" + urlLogin = option mempty $ try ((\x y z -> x ++ y ++ [z]) <$> many1 login <*> option mempty ((:) <$> char ':' <*> pass) <*> char '@') + + +image :: T2T Inlines +image = try $ do + -- List taken from txt2tags source + let extensions = [".jpg", ".jpeg", ".gif", ".png", ".eps", ".bmp"] + char '[' + path <- manyTill (noneOf "\n\t\r ") (try $ lookAhead (oneOfStrings extensions)) + ext <- oneOfStrings extensions + char ']' + return $ B.image (path ++ ext) "" mempty + +-- Characters used in markup +specialChars :: String +specialChars = "%*-_/|:+;" + +tab :: T2T Char +tab = char '\t' + +space :: T2T Char +space = char ' ' + +spaces :: T2T String +spaces = many space + +endline :: T2T Inlines +endline = try $ do + newline + notFollowedBy blankline + notFollowedBy hrule + notFollowedBy title + notFollowedBy verbatim + notFollowedBy rawBlock + notFollowedBy taggedBlock + notFollowedBy quote + notFollowedBy list + notFollowedBy table + return $ B.space + +str :: T2T Inlines +str = try $ do + B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") + +whitespace :: T2T Inlines +whitespace = try $ B.space <$ spaceChar + +symbol :: T2T Inlines +symbol = B.str . (:[]) <$> oneOf specialChars + +-- Utility + +getTarget :: T2T String +getTarget = do + mv <- lookupMeta "target" . stateMeta <$> getState + let MetaString target = fromMaybe (MetaString "html") mv + return target + +atStart :: T2T () +atStart = (sourceColumn <$> getPosition) >>= guard . (== 1) + +ignoreSpacesCap :: T2T String -> T2T String +ignoreSpacesCap p = map toLower <$> (spaces *> p <* spaces) + |