aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/DocBook.hs189
-rw-r--r--src/Text/Pandoc/Readers/Docx.hs549
-rw-r--r--src/Text/Pandoc/Readers/Docx/Fonts.hs238
-rw-r--r--src/Text/Pandoc/Readers/Docx/Lists.hs229
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse.hs911
-rw-r--r--src/Text/Pandoc/Readers/Docx/Reducible.hs182
-rw-r--r--src/Text/Pandoc/Readers/EPUB.hs283
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs629
-rw-r--r--src/Text/Pandoc/Readers/Haddock.hs139
-rw-r--r--src/Text/Pandoc/Readers/Haddock/Lex.x171
-rw-r--r--src/Text/Pandoc/Readers/Haddock/Parse.y178
-rw-r--r--src/Text/Pandoc/Readers/LaTeX.hs296
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs475
-rw-r--r--src/Text/Pandoc/Readers/MediaWiki.hs47
-rw-r--r--src/Text/Pandoc/Readers/Native.hs4
-rw-r--r--src/Text/Pandoc/Readers/Org.hs1487
-rw-r--r--src/Text/Pandoc/Readers/RST.hs203
-rw-r--r--src/Text/Pandoc/Readers/TWiki.hs526
-rw-r--r--src/Text/Pandoc/Readers/TeXMath.hs24
-rw-r--r--src/Text/Pandoc/Readers/Textile.hs383
-rw-r--r--src/Text/Pandoc/Readers/Txt2Tags.hs579
21 files changed, 6580 insertions, 1142 deletions
diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs
index 56cb16b20..663960a87 100644
--- a/src/Text/Pandoc/Readers/DocBook.hs
+++ b/src/Text/Pandoc/Readers/DocBook.hs
@@ -6,6 +6,7 @@ import Text.Pandoc.Definition
import Text.Pandoc.Builder
import Text.XML.Light
import Text.Pandoc.Compat.TagSoupEntity (lookupEntity)
+import Data.Either (rights)
import Data.Generics
import Data.Monoid
import Data.Char (isSpace)
@@ -13,6 +14,7 @@ import Control.Monad.State
import Control.Applicative ((<$>))
import Data.List (intersperse)
import Data.Maybe (fromMaybe)
+import Text.TeXMath (readMathML, writeTeX)
{-
@@ -45,7 +47,7 @@ List of all DocBook tags, with [x] indicating implemented,
[ ] audioobject - A wrapper for audio data and its associated meta-information
[x] author - The name of an individual author
[ ] authorblurb - A short description or note about an author
-[ ] authorgroup - Wrapper for author information when a document has
+[x] authorgroup - Wrapper for author information when a document has
multiple authors or collabarators
[x] authorinitials - The initials or other short identifier for an author
[o] beginpage - The location of a page break in a print version of the document
@@ -68,8 +70,8 @@ List of all DocBook tags, with [x] indicating implemented,
[x] book - A book
[x] bookinfo - Meta-information for a Book
[x] bridgehead - A free-floating heading
-[ ] callout - A “called out” description of a marked Area
-[ ] calloutlist - A list of Callouts
+[x] callout - A “called out” description of a marked Area
+[x] calloutlist - A list of Callouts
[x] caption - A caption
[x] caution - A note of caution
[x] chapter - A chapter, as of a book
@@ -79,7 +81,7 @@ List of all DocBook tags, with [x] indicating implemented,
[ ] citerefentry - A citation to a reference page
[ ] citetitle - The title of a cited work
[ ] city - The name of a city in an address
-[ ] classname - The name of a class, in the object-oriented programming sense
+[x] classname - The name of a class, in the object-oriented programming sense
[ ] classsynopsis - The syntax summary for a class definition
[ ] classsynopsisinfo - Information supplementing the contents of
a ClassSynopsis
@@ -126,7 +128,7 @@ List of all DocBook tags, with [x] indicating implemented,
[ ] envar - A software environment variable
[x] epigraph - A short inscription at the beginning of a document or component
note: also handle embedded attribution tag
-[ ] equation - A displayed mathematical equation
+[x] equation - A displayed mathematical equation
[ ] errorcode - An error code
[ ] errorname - An error name
[ ] errortext - An error message.
@@ -167,9 +169,9 @@ List of all DocBook tags, with [x] indicating implemented,
[ ] guibutton - The text on a button in a GUI
[ ] guiicon - Graphic and/or text appearing as a icon in a GUI
[ ] guilabel - The text of a label in a GUI
-[ ] guimenu - The name of a menu in a GUI
-[ ] guimenuitem - The name of a terminal menu item in a GUI
-[ ] guisubmenu - The name of a submenu in a GUI
+[x] guimenu - The name of a menu in a GUI
+[x] guimenuitem - The name of a terminal menu item in a GUI
+[x] guisubmenu - The name of a submenu in a GUI
[ ] hardware - A physical part of a computer system
[ ] highlights - A summary of the main points of the discussed component
[ ] holder - The name of the individual or organization that holds a copyright
@@ -185,12 +187,12 @@ List of all DocBook tags, with [x] indicating implemented,
[x] indexinfo - Meta-information for an Index
[x] indexterm - A wrapper for terms to be indexed
[x] info - A wrapper for information about a component or other block. (DocBook v5)
-[ ] informalequation - A displayed mathematical equation without a title
+[x] informalequation - A displayed mathematical equation without a title
[ ] informalexample - A displayed example without a title
[ ] informalfigure - A untitled figure
[ ] informaltable - A table without a title
[ ] initializer - The initializer for a FieldSynopsis
-[ ] inlineequation - A mathematical equation or expression occurring inline
+[x] inlineequation - A mathematical equation or expression occurring inline
[ ] inlinegraphic - An object containing or pointing to graphical data
that will be rendered inline
[x] inlinemediaobject - An inline media object (video, audio, image, and so on)
@@ -204,10 +206,10 @@ List of all DocBook tags, with [x] indicating implemented,
other dingbat
[ ] itermset - A set of index terms in the meta-information of a document
[ ] jobtitle - The title of an individual in an organization
-[ ] keycap - The text printed on a key on a keyboard
+[x] keycap - The text printed on a key on a keyboard
[ ] keycode - The internal, frequently numeric, identifier for a key
on a keyboard
-[ ] keycombo - A combination of input actions
+[x] keycombo - A combination of input actions
[ ] keysym - The symbolic name of a key on a keyboard
[ ] keyword - One of a set of keywords describing the content of a document
[ ] keywordset - A set of keywords describing the content of a document
@@ -235,11 +237,11 @@ List of all DocBook tags, with [x] indicating implemented,
[x] mediaobject - A displayed media object (video, audio, image, etc.)
[ ] mediaobjectco - A media object that contains callouts
[x] member - An element of a simple list
-[ ] menuchoice - A selection or series of selections from a menu
+[x] menuchoice - A selection or series of selections from a menu
[ ] methodname - The name of a method
[ ] methodparam - Parameters to a method
[ ] methodsynopsis - A syntax summary for a method
-[ ] mml:math - A MathML equation
+[x] mml:math - A MathML equation
[ ] modespec - Application-specific information necessary for the
completion of an OLink
[ ] modifier - Modifiers in a synopsis
@@ -341,7 +343,7 @@ List of all DocBook tags, with [x] indicating implemented,
[x] refsectioninfo - Meta-information for a refsection
[ ] refsynopsisdiv - A syntactic synopsis of the subject of the reference page
[ ] refsynopsisdivinfo - Meta-information for a RefSynopsisDiv
-[ ] releaseinfo - Information about a particular release of a document
+[x] releaseinfo - Information about a particular release of a document
[ ] remark - A remark (or comment) intended for presentation in a draft
manuscript
[ ] replaceable - Content that may or must be replaced by the user
@@ -469,7 +471,7 @@ List of all DocBook tags, with [x] indicating implemented,
[ ] token - A unit of information
[x] tr - A row in an HTML table
[ ] trademark - A trademark
-[ ] type - The classification of a value
+[x] type - The classification of a value
[x] ulink - A link that addresses its target by means of a URL
(Uniform Resource Locator)
[x] uri - A Uniform Resource Identifier
@@ -492,34 +494,40 @@ List of all DocBook tags, with [x] indicating implemented,
anything else
[ ] xref - A cross reference to another part of the document
[ ] year - The year of publication of a document
-
+[x] ?asciidoc-br? - line break from asciidoc docbook output
-}
type DB = State DBState
data DBState = DBState{ dbSectionLevel :: Int
, dbQuoteType :: QuoteType
- , dbDocTitle :: Inlines
- , dbDocAuthors :: [Inlines]
- , dbDocDate :: Inlines
+ , dbMeta :: Meta
+ , dbAcceptsMeta :: Bool
, dbBook :: Bool
, dbFigureTitle :: Inlines
} deriving Show
readDocBook :: ReaderOptions -> String -> Pandoc
-readDocBook _ inp = setTitle (dbDocTitle st')
- $ setAuthors (dbDocAuthors st')
- $ setDate (dbDocDate st')
- $ doc $ mconcat bs
- where (bs, st') = runState (mapM parseBlock $ normalizeTree $ parseXML inp)
+readDocBook _ inp = Pandoc (dbMeta st') (toList $ mconcat bs)
+ where (bs, st') = runState (mapM parseBlock $ normalizeTree $ parseXML inp')
DBState{ dbSectionLevel = 0
, dbQuoteType = DoubleQuote
- , dbDocTitle = mempty
- , dbDocAuthors = []
- , dbDocDate = mempty
+ , dbMeta = mempty
+ , dbAcceptsMeta = False
, dbBook = False
, dbFigureTitle = mempty
}
+ inp' = handleInstructions inp
+
+-- We treat <?asciidoc-br?> specially (issue #1236), converting it
+-- to <br/>, since xml-light doesn't parse the instruction correctly.
+-- Other xml instructions are simply removed from the input stream.
+handleInstructions :: String -> String
+handleInstructions ('<':'?':'a':'s':'c':'i':'i':'d':'o':'c':'-':'b':'r':'?':'>':xs) = '<':'b':'r':'/':'>': handleInstructions xs
+handleInstructions xs = case break (=='<') xs of
+ (ys, []) -> ys
+ ([], '<':zs) -> '<' : handleInstructions zs
+ (ys, zs) -> ys ++ handleInstructions zs
getFigure :: Element -> DB Blocks
getFigure e = do
@@ -560,6 +568,30 @@ attrValue attr elt =
named :: String -> Element -> Bool
named s e = qName (elName e) == s
+--
+
+acceptingMetadata :: DB a -> DB a
+acceptingMetadata p = do
+ modify (\s -> s { dbAcceptsMeta = True } )
+ res <- p
+ modify (\s -> s { dbAcceptsMeta = False })
+ return res
+
+checkInMeta :: Monoid a => DB () -> DB a
+checkInMeta p = do
+ accepts <- dbAcceptsMeta <$> get
+ when accepts p
+ return mempty
+
+
+
+addMeta :: ToMetaValue a => String -> a -> DB ()
+addMeta field val = modify (setMeta field val)
+
+instance HasMeta DBState where
+ setMeta field v s = s {dbMeta = setMeta field v (dbMeta s)}
+ deleteMeta field s = s {dbMeta = deleteMeta field (dbMeta s)}
+
isBlockElement :: Content -> Bool
isBlockElement (Elem e) = qName (elName e) `elem` blocktags
where blocktags = ["toc","index","para","formalpara","simpara",
@@ -571,7 +603,7 @@ isBlockElement (Elem e) = qName (elName e) `elem` blocktags
"important","caution","note","tip","warning","qandadiv",
"question","answer","abstract","itemizedlist","orderedlist",
"variablelist","article","book","table","informaltable",
- "screen","programlisting","example"]
+ "screen","programlisting","example","calloutlist"]
isBlockElement _ = False
-- Trim leading and trailing newline characters
@@ -606,6 +638,7 @@ getImage e = do
getBlocks :: Element -> DB Blocks
getBlocks e = mconcat <$> (mapM parseBlock $ elContent e)
+
parseBlock :: Content -> DB Blocks
parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE
parseBlock (Text (CData _ s _)) = if all isSpace s
@@ -619,10 +652,10 @@ parseBlock (Elem e) =
"para" -> parseMixed para (elContent e)
"formalpara" -> do
tit <- case filterChild (named "title") e of
- Just t -> (<> str "." <> linebreak) <$> emph
- <$> getInlines t
+ Just t -> (para . strong . (<> str ".")) <$>
+ getInlines t
Nothing -> return mempty
- addToStart tit <$> parseMixed para (elContent e)
+ (tit <>) <$> parseMixed para (elContent e)
"simpara" -> parseMixed para (elContent e)
"ackno" -> parseMixed para (elContent e)
"epigraph" -> parseBlockquote
@@ -630,7 +663,11 @@ parseBlock (Elem e) =
"attribution" -> return mempty
"titleabbrev" -> return mempty
"authorinitials" -> return mempty
- "title" -> return mempty -- handled by getTitle or sect or figure
+ "title" -> checkInMeta getTitle
+ "author" -> checkInMeta getAuthor
+ "authorgroup" -> checkInMeta getAuthorGroup
+ "releaseinfo" -> checkInMeta (getInlines e >>= addMeta "release")
+ "date" -> checkInMeta getDate
"bibliography" -> sect 0
"bibliodiv" -> sect 1
"biblioentry" -> parseMixed para (elContent e)
@@ -675,6 +712,7 @@ parseBlock (Elem e) =
"question" -> addToStart (strong (str "Q:") <> str " ") <$> getBlocks e
"answer" -> addToStart (strong (str "A:") <> str " ") <$> getBlocks e
"abstract" -> blockQuote <$> getBlocks e
+ "calloutlist" -> bulletList <$> callouts
"itemizedlist" -> bulletList <$> listitems
"orderedlist" -> do
let listStyle = case attrValue "numeration" e of
@@ -693,8 +731,8 @@ parseBlock (Elem e) =
"figure" -> getFigure e
"mediaobject" -> para <$> getImage e
"caption" -> return mempty
- "info" -> getTitle >> getAuthors >> getDate >> return mempty
- "articleinfo" -> getTitle >> getAuthors >> getDate >> return mempty
+ "info" -> metaBlock
+ "articleinfo" -> metaBlock
"sectioninfo" -> return mempty -- keywords & other metadata
"refsectioninfo" -> return mempty -- keywords & other metadata
"refsect1info" -> return mempty -- keywords & other metadata
@@ -708,10 +746,10 @@ parseBlock (Elem e) =
"chapterinfo" -> return mempty -- keywords & other metadata
"glossaryinfo" -> return mempty -- keywords & other metadata
"appendixinfo" -> return mempty -- keywords & other metadata
- "bookinfo" -> getTitle >> getAuthors >> getDate >> return mempty
+ "bookinfo" -> metaBlock
"article" -> modify (\st -> st{ dbBook = False }) >>
- getTitle >> getBlocks e
- "book" -> modify (\st -> st{ dbBook = True }) >> getTitle >> getBlocks e
+ getBlocks e
+ "book" -> modify (\st -> st{ dbBook = True }) >> getBlocks e
"table" -> parseTable
"informaltable" -> parseTable
"literallayout" -> codeBlockWithLang
@@ -734,7 +772,7 @@ parseBlock (Elem e) =
"" -> []
x -> [x]
return $ codeBlockWith (attrValue "id" e, classes', [])
- $ trimNl $ strContent e
+ $ trimNl $ strContentRecursive e
parseBlockquote = do
attrib <- case filterChild (named "attribution") e of
Nothing -> return mempty
@@ -743,6 +781,7 @@ parseBlock (Elem e) =
contents <- getBlocks e
return $ blockQuote (contents <> attrib)
listitems = mapM getBlocks $ filterChildren (named "listitem") e
+ callouts = mapM getBlocks $ filterChildren (named "callout") e
deflistitems = mapM parseVarListEntry $ filterChildren
(named "varlistentry") e
parseVarListEntry e' = do
@@ -757,24 +796,19 @@ parseBlock (Elem e) =
terms' <- mapM getInlines terms
items' <- mapM getBlocks items
return (mconcat $ intersperse (str "; ") terms', items')
- getTitle = case filterChild (named "title") e of
- Just t -> do
- tit <- getInlines t
- subtit <- case filterChild (named "subtitle") e of
- Just s -> (text ": " <>) <$>
- getInlines s
- Nothing -> return mempty
- modify $ \st -> st{dbDocTitle = tit <> subtit}
- Nothing -> return ()
- getAuthors = do
- auths <- mapM getInlines
- $ filterChildren (named "author") e
- modify $ \st -> st{dbDocAuthors = auths}
- getDate = case filterChild (named "date") e of
- Just t -> do
- dat <- getInlines t
- modify $ \st -> st{dbDocDate = dat}
- Nothing -> return ()
+ getTitle = do
+ tit <- getInlines e
+ subtit <- case filterChild (named "subtitle") e of
+ Just s -> (text ": " <>) <$>
+ getInlines s
+ Nothing -> return mempty
+ addMeta "title" (tit <> subtit)
+
+ getAuthor = (:[]) <$> getInlines e >>= addMeta "author"
+ getAuthorGroup = do
+ let terms = filterChildren (named "author") e
+ mapM getInlines terms >>= addMeta "author"
+ getDate = getInlines e >>= addMeta "date"
parseTable = do
let isCaption x = named "title" x || named "caption" x
caption <- case filterChild isCaption e of
@@ -834,18 +868,31 @@ parseBlock (Elem e) =
Nothing -> return mempty
modify $ \st -> st{ dbSectionLevel = n }
b <- getBlocks e
+ let ident = attrValue "id" e
modify $ \st -> st{ dbSectionLevel = n - 1 }
- return $ header n' headerText <> b
+ return $ headerWith (ident,[],[]) n' headerText <> b
+ metaBlock = acceptingMetadata (getBlocks e) >> return mempty
getInlines :: Element -> DB Inlines
getInlines e' = (trimInlines . mconcat) <$> (mapM parseInline $ elContent e')
+strContentRecursive :: Element -> String
+strContentRecursive = strContent .
+ (\e' -> e'{ elContent = map elementToStr $ elContent e' })
+
+elementToStr :: Content -> Content
+elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing
+elementToStr x = x
+
parseInline :: Content -> DB Inlines
parseInline (Text (CData _ s _)) = return $ text s
parseInline (CRef ref) =
return $ maybe (text $ map toUpper ref) (text . (:[])) $ lookupEntity ref
parseInline (Elem e) =
case qName (elName e) of
+ "equation" -> equation displayMath
+ "informalequation" -> equation displayMath
+ "inlineequation" -> equation math
"subscript" -> subscript <$> innerInlines
"superscript" -> superscript <$> innerInlines
"inlinemediaobject" -> getImage e
@@ -860,6 +907,7 @@ parseInline (Elem e) =
else doubleQuoted contents
"simplelist" -> simpleList
"segmentedlist" -> segmentedList
+ "classname" -> codeWithLang
"code" -> codeWithLang
"filename" -> codeWithLang
"literal" -> codeWithLang
@@ -879,6 +927,10 @@ parseInline (Elem e) =
"constant" -> codeWithLang
"userinput" -> codeWithLang
"varargs" -> return $ code "(...)"
+ "keycap" -> return (str $ strContent e)
+ "keycombo" -> keycombo <$> (mapM parseInline $ elContent e)
+ "menuchoice" -> menuchoice <$> (mapM parseInline $
+ filter isGuiMenu $ elContent e)
"xref" -> return $ str "?" -- so at least you know something is there
"email" -> return $ link ("mailto:" ++ strContent e) ""
$ str $ strContent e
@@ -899,14 +951,26 @@ parseInline (Elem e) =
_ -> emph <$> innerInlines
"footnote" -> (note . mconcat) <$> (mapM parseBlock $ elContent e)
"title" -> return mempty
+ "affiliation" -> return mempty
+ -- Note: this isn't a real docbook tag; it's what we convert
+ -- <?asciidor-br?> to in handleInstructions, above. A kludge to
+ -- work around xml-light's inability to parse an instruction.
+ "br" -> return linebreak
_ -> innerInlines
where innerInlines = (trimInlines . mconcat) <$>
(mapM parseInline $ elContent e)
+ equation constructor = return $ mconcat $
+ map (constructor . writeTeX)
+ $ rights
+ $ map (readMathML . showElement . everywhere (mkT removePrefix))
+ $ filterChildren (\x -> qName (elName x) == "math" &&
+ qPrefix (elName x) == Just "mml") e
+ removePrefix elname = elname { qPrefix = Nothing }
codeWithLang = do
let classes' = case attrValue "language" e of
"" -> []
l -> [l]
- return $ codeWith (attrValue "id" e,classes',[]) $ strContent e
+ return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e
simpleList = (mconcat . intersperse (str "," <> space)) <$> mapM getInlines
(filterChildren (named "member") e)
segmentedList = do
@@ -921,3 +985,10 @@ parseInline (Elem e) =
then mempty
else strong tit <> linebreak
return $ linebreak <> tit' <> segs
+ keycombo = spanWith ("",["keycombo"],[]) .
+ mconcat . intersperse (str "+")
+ menuchoice = spanWith ("",["menuchoice"],[]) .
+ mconcat . intersperse (text " > ")
+ isGuiMenu (Elem x) = named "guimenu" x || named "guisubmenu" x ||
+ named "guimenuitem" x
+ isGuiMenu _ = False
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
new file mode 100644
index 000000000..64eb0322f
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -0,0 +1,549 @@
+{-# LANGUAGE PatternGuards, OverloadedStrings #-}
+
+{-
+Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Docx
+ Copyright : Copyright (C) 2014 Jesse Rosenthal
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
+ Stability : alpha
+ Portability : portable
+
+Conversion of Docx type (defined in Text.Pandoc.Readers.Docx.Parse)
+to 'Pandoc' document. -}
+
+{-
+Current state of implementation of Docx entities ([x] means
+implemented, [-] means partially implemented):
+
+* Blocks
+
+ - [X] Para
+ - [X] CodeBlock (styled with `SourceCode`)
+ - [X] BlockQuote (styled with `Quote`, `BlockQuote`, or, optionally,
+ indented)
+ - [X] OrderedList
+ - [X] BulletList
+ - [X] DefinitionList (styled with adjacent `DefinitionTerm` and `Definition`)
+ - [X] Header (styled with `Heading#`)
+ - [ ] HorizontalRule
+ - [-] Table (column widths and alignments not yet implemented)
+
+* Inlines
+
+ - [X] Str
+ - [X] Emph (From italics. `underline` currently read as span. In
+ future, it might optionally be emph as well)
+ - [X] Strong
+ - [X] Strikeout
+ - [X] Superscript
+ - [X] Subscript
+ - [X] SmallCaps
+ - [ ] Quoted
+ - [ ] Cite
+ - [X] Code (styled with `VerbatimChar`)
+ - [X] Space
+ - [X] LineBreak (these are invisible in Word: entered with Shift-Return)
+ - [ ] Math
+ - [X] Link (links to an arbitrary bookmark create a span with the target as
+ id and "anchor" class)
+ - [-] Image (Links to path in archive. Future option for
+ data-encoded URI likely.)
+ - [X] Note (Footnotes and Endnotes are silently combined.)
+-}
+
+module Text.Pandoc.Readers.Docx
+ ( readDocx
+ ) where
+
+import Codec.Archive.Zip
+import Text.Pandoc.Definition
+import Text.Pandoc.Options
+import Text.Pandoc.Builder
+import Text.Pandoc.Walk
+import Text.Pandoc.Readers.Docx.Parse
+import Text.Pandoc.Readers.Docx.Lists
+import Text.Pandoc.Readers.Docx.Reducible
+import Text.Pandoc.Shared
+import Text.Pandoc.MediaBag (insertMedia, MediaBag)
+import Data.List (delete, (\\), intersect)
+import Data.Monoid
+import Text.TeXMath (writeTeX)
+import Data.Default (Default)
+import qualified Data.ByteString.Lazy as B
+import qualified Data.Map as M
+import Control.Monad.Reader
+import Control.Monad.State
+import Control.Applicative ((<$>))
+import Data.Sequence (ViewL(..), viewl)
+import qualified Data.Sequence as Seq (null)
+
+readDocx :: ReaderOptions
+ -> B.ByteString
+ -> (Pandoc, MediaBag)
+readDocx opts bytes =
+ case archiveToDocx (toArchive bytes) of
+ Right docx -> (Pandoc meta blks, mediaBag) where
+ (meta, blks, mediaBag) = (docxToOutput opts docx)
+ Left _ -> error $ "couldn't parse docx file"
+
+data DState = DState { docxAnchorMap :: M.Map String String
+ , docxMediaBag :: MediaBag
+ , docxDropCap :: Inlines
+ }
+
+instance Default DState where
+ def = DState { docxAnchorMap = M.empty
+ , docxMediaBag = mempty
+ , docxDropCap = mempty
+ }
+
+data DEnv = DEnv { docxOptions :: ReaderOptions
+ , docxInHeaderBlock :: Bool }
+
+instance Default DEnv where
+ def = DEnv def False
+
+type DocxContext = ReaderT DEnv (State DState)
+
+evalDocxContext :: DocxContext a -> DEnv -> DState -> a
+evalDocxContext ctx env st = evalState (runReaderT ctx env) st
+
+-- This is empty, but we put it in for future-proofing.
+spansToKeep :: [String]
+spansToKeep = []
+
+divsToKeep :: [String]
+divsToKeep = ["list-item", "Definition", "DefinitionTerm"]
+
+metaStyles :: M.Map String String
+metaStyles = M.fromList [ ("Title", "title")
+ , ("Subtitle", "subtitle")
+ , ("Author", "author")
+ , ("Date", "date")
+ , ("Abstract", "abstract")]
+
+sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart])
+sepBodyParts = span (\bp -> (isMetaPar bp || isEmptyPar bp))
+
+isMetaPar :: BodyPart -> Bool
+isMetaPar (Paragraph pPr _) =
+ not $ null $ intersect (pStyle pPr) (M.keys metaStyles)
+isMetaPar _ = False
+
+isEmptyPar :: BodyPart -> Bool
+isEmptyPar (Paragraph _ parParts) =
+ all isEmptyParPart parParts
+ where
+ isEmptyParPart (PlainRun (Run _ runElems)) = all isEmptyElem runElems
+ isEmptyParPart _ = False
+ isEmptyElem (TextRun s) = trim s == ""
+ isEmptyElem _ = True
+isEmptyPar _ = False
+
+bodyPartsToMeta' :: [BodyPart] -> DocxContext (M.Map String MetaValue)
+bodyPartsToMeta' [] = return M.empty
+bodyPartsToMeta' (bp : bps)
+ | (Paragraph pPr parParts) <- bp
+ , (c : _)<- intersect (pStyle pPr) (M.keys metaStyles)
+ , (Just metaField) <- M.lookup c metaStyles = do
+ inlines <- concatReduce <$> mapM parPartToInlines parParts
+ remaining <- bodyPartsToMeta' bps
+ let
+ f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils']
+ f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks ((Para ils) : blks)
+ f m (MetaList mv) = MetaList (m : mv)
+ f m n = MetaList [m, n]
+ return $ M.insertWith f metaField (MetaInlines (toList inlines)) remaining
+bodyPartsToMeta' (_ : bps) = bodyPartsToMeta' bps
+
+bodyPartsToMeta :: [BodyPart] -> DocxContext Meta
+bodyPartsToMeta bps = do
+ mp <- bodyPartsToMeta' bps
+ let mp' =
+ case M.lookup "author" mp of
+ Just mv -> M.insert "author" (fixAuthors mv) mp
+ Nothing -> mp
+ return $ Meta mp'
+
+fixAuthors :: MetaValue -> MetaValue
+fixAuthors (MetaBlocks blks) =
+ MetaList $ map g $ filter f blks
+ where f (Para _) = True
+ f _ = False
+ g (Para ils) = MetaInlines ils
+ g _ = MetaInlines []
+fixAuthors mv = mv
+
+codeStyles :: [String]
+codeStyles = ["VerbatimChar"]
+
+codeDivs :: [String]
+codeDivs = ["SourceCode"]
+
+runElemToInlines :: RunElem -> Inlines
+runElemToInlines (TextRun s) = text s
+runElemToInlines (LnBrk) = linebreak
+runElemToInlines (Tab) = space
+
+runElemToString :: RunElem -> String
+runElemToString (TextRun s) = s
+runElemToString (LnBrk) = ['\n']
+runElemToString (Tab) = ['\t']
+
+runToString :: Run -> String
+runToString (Run _ runElems) = concatMap runElemToString runElems
+runToString _ = ""
+
+parPartToString :: ParPart -> String
+parPartToString (PlainRun run) = runToString run
+parPartToString (InternalHyperLink _ runs) = concatMap runToString runs
+parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs
+parPartToString _ = ""
+
+blacklistedCharStyles :: [String]
+blacklistedCharStyles = ["Hyperlink"]
+
+resolveDependentRunStyle :: RunStyle -> RunStyle
+resolveDependentRunStyle rPr
+ | Just (s, _) <- rStyle rPr, s `elem` blacklistedCharStyles =
+ rPr
+ | Just (_, cs) <- rStyle rPr =
+ let rPr' = resolveDependentRunStyle cs
+ in
+ RunStyle { isBold = case isBold rPr of
+ Just bool -> Just bool
+ Nothing -> isBold rPr'
+ , isItalic = case isItalic rPr of
+ Just bool -> Just bool
+ Nothing -> isItalic rPr'
+ , isSmallCaps = case isSmallCaps rPr of
+ Just bool -> Just bool
+ Nothing -> isSmallCaps rPr'
+ , isStrike = case isStrike rPr of
+ Just bool -> Just bool
+ Nothing -> isStrike rPr'
+ , rVertAlign = case rVertAlign rPr of
+ Just valign -> Just valign
+ Nothing -> rVertAlign rPr'
+ , rUnderline = case rUnderline rPr of
+ Just ulstyle -> Just ulstyle
+ Nothing -> rUnderline rPr'
+ , rStyle = rStyle rPr }
+ | otherwise = rPr
+
+runStyleToTransform :: RunStyle -> (Inlines -> Inlines)
+runStyleToTransform rPr
+ | Just (s, _) <- rStyle rPr
+ , s `elem` spansToKeep =
+ let rPr' = rPr{rStyle = Nothing}
+ in
+ (spanWith ("", [s], [])) . (runStyleToTransform rPr')
+ | Just True <- isItalic rPr =
+ emph . (runStyleToTransform rPr {isItalic = Nothing})
+ | Just True <- isBold rPr =
+ strong . (runStyleToTransform rPr {isBold = Nothing})
+ | Just True <- isSmallCaps rPr =
+ smallcaps . (runStyleToTransform rPr {isSmallCaps = Nothing})
+ | Just True <- isStrike rPr =
+ strikeout . (runStyleToTransform rPr {isStrike = Nothing})
+ | Just SupScrpt <- rVertAlign rPr =
+ superscript . (runStyleToTransform rPr {rVertAlign = Nothing})
+ | Just SubScrpt <- rVertAlign rPr =
+ subscript . (runStyleToTransform rPr {rVertAlign = Nothing})
+ | Just "single" <- rUnderline rPr =
+ emph . (runStyleToTransform rPr {rUnderline = Nothing})
+ | otherwise = id
+
+runToInlines :: Run -> DocxContext Inlines
+runToInlines (Run rs runElems)
+ | Just (s, _) <- rStyle rs
+ , s `elem` codeStyles =
+ return $ code $ concatMap runElemToString runElems
+ | otherwise = do
+ let ils = concatReduce (map runElemToInlines runElems)
+ return $ (runStyleToTransform $ resolveDependentRunStyle rs) ils
+runToInlines (Footnote bps) = do
+ blksList <- concatReduce <$> (mapM bodyPartToBlocks bps)
+ return $ note blksList
+runToInlines (Endnote bps) = do
+ blksList <- concatReduce <$> (mapM bodyPartToBlocks bps)
+ return $ note blksList
+runToInlines (InlineDrawing fp bs) = do
+ mediaBag <- gets docxMediaBag
+ modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag }
+ return $ image fp "" ""
+
+parPartToInlines :: ParPart -> DocxContext Inlines
+parPartToInlines (PlainRun r) = runToInlines r
+parPartToInlines (Insertion _ author date runs) = do
+ opts <- asks docxOptions
+ case readerTrackChanges opts of
+ AcceptChanges -> concatReduce <$> mapM runToInlines runs
+ RejectChanges -> return mempty
+ AllChanges -> do
+ ils <- concatReduce <$> mapM runToInlines runs
+ let attr = ("", ["insertion"], [("author", author), ("date", date)])
+ return $ spanWith attr ils
+parPartToInlines (Deletion _ author date runs) = do
+ opts <- asks docxOptions
+ case readerTrackChanges opts of
+ AcceptChanges -> return mempty
+ RejectChanges -> concatReduce <$> mapM runToInlines runs
+ AllChanges -> do
+ ils <- concatReduce <$> mapM runToInlines runs
+ let attr = ("", ["deletion"], [("author", author), ("date", date)])
+ return $ spanWith attr ils
+parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors =
+ return mempty
+parPartToInlines (BookMark _ anchor) =
+ -- We record these, so we can make sure not to overwrite
+ -- user-defined anchor links with header auto ids.
+ do
+ -- get whether we're in a header.
+ inHdrBool <- asks docxInHeaderBlock
+ -- Get the anchor map.
+ anchorMap <- gets docxAnchorMap
+ -- We don't want to rewrite if we're in a header, since we'll take
+ -- care of that later, when we make the header anchor. If the
+ -- bookmark were already in uniqueIdent form, this would lead to a
+ -- duplication. Otherwise, we check to see if the id is already in
+ -- there. Rewrite if necessary. This will have the possible effect
+ -- of rewriting user-defined anchor links. However, since these
+ -- are not defined in pandoc, it seems like a necessary evil to
+ -- avoid an extra pass.
+ let newAnchor =
+ if not inHdrBool && anchor `elem` (M.elems anchorMap)
+ then uniqueIdent [Str anchor] (M.elems anchorMap)
+ else anchor
+ unless inHdrBool
+ (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap})
+ return $ spanWith (newAnchor, ["anchor"], []) mempty
+parPartToInlines (Drawing fp bs) = do
+ mediaBag <- gets docxMediaBag
+ modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag }
+ return $ image fp "" ""
+parPartToInlines (InternalHyperLink anchor runs) = do
+ ils <- concatReduce <$> mapM runToInlines runs
+ return $ link ('#' : anchor) "" ils
+parPartToInlines (ExternalHyperLink target runs) = do
+ ils <- concatReduce <$> mapM runToInlines runs
+ return $ link target "" ils
+parPartToInlines (PlainOMath exps) = do
+ return $ math $ writeTeX exps
+
+isAnchorSpan :: Inline -> Bool
+isAnchorSpan (Span (_, classes, kvs) ils) =
+ classes == ["anchor"] &&
+ null kvs &&
+ null ils
+isAnchorSpan _ = False
+
+dummyAnchors :: [String]
+dummyAnchors = ["_GoBack"]
+
+makeHeaderAnchor :: Blocks -> DocxContext Blocks
+makeHeaderAnchor bs = case viewl $ unMany bs of
+ (x :< xs) -> do
+ x' <- (makeHeaderAnchor' x)
+ xs' <- (makeHeaderAnchor $ Many xs)
+ return $ (singleton x') <> xs'
+ EmptyL -> return mempty
+
+makeHeaderAnchor' :: Block -> DocxContext Block
+-- If there is an anchor already there (an anchor span in the header,
+-- to be exact), we rename and associate the new id with the old one.
+makeHeaderAnchor' (Header n (_, classes, kvs) ils)
+ | (c:cs) <- filter isAnchorSpan ils
+ , (Span (ident, ["anchor"], _) _) <- c = do
+ hdrIDMap <- gets docxAnchorMap
+ let newIdent = uniqueIdent ils (M.elems hdrIDMap)
+ modify $ \s -> s {docxAnchorMap = M.insert ident newIdent hdrIDMap}
+ return $ Header n (newIdent, classes, kvs) (ils \\ (c:cs))
+-- Otherwise we just give it a name, and register that name (associate
+-- it with itself.)
+makeHeaderAnchor' (Header n (_, classes, kvs) ils) =
+ do
+ hdrIDMap <- gets docxAnchorMap
+ let newIdent = uniqueIdent ils (M.elems hdrIDMap)
+ modify $ \s -> s {docxAnchorMap = M.insert newIdent newIdent hdrIDMap}
+ return $ Header n (newIdent, classes, kvs) ils
+makeHeaderAnchor' blk = return blk
+
+-- Rewrite a standalone paragraph block as a plain
+singleParaToPlain :: Blocks -> Blocks
+singleParaToPlain blks
+ | (Para (ils) :< seeq) <- viewl $ unMany blks
+ , Seq.null seeq =
+ singleton $ Plain ils
+singleParaToPlain blks = blks
+
+cellToBlocks :: Cell -> DocxContext Blocks
+cellToBlocks (Cell bps) = concatReduce <$> mapM bodyPartToBlocks bps
+
+rowToBlocksList :: Row -> DocxContext [Blocks]
+rowToBlocksList (Row cells) = do
+ blksList <- mapM cellToBlocks cells
+ return $ map singleParaToPlain blksList
+
+trimLineBreaks :: [Inline] -> [Inline]
+trimLineBreaks [] = []
+trimLineBreaks (LineBreak : ils) = trimLineBreaks ils
+trimLineBreaks ils
+ | (LineBreak : ils') <- reverse ils = trimLineBreaks (reverse ils')
+trimLineBreaks ils = ils
+
+parStyleToTransform :: ParagraphStyle -> (Blocks -> Blocks)
+parStyleToTransform pPr
+ | (c:cs) <- pStyle pPr
+ , c `elem` divsToKeep =
+ let pPr' = pPr { pStyle = cs }
+ in
+ (divWith ("", [c], [])) . (parStyleToTransform pPr')
+ | (c:cs) <- pStyle pPr,
+ c `elem` listParagraphDivs =
+ let pPr' = pPr { pStyle = cs, indentation = Nothing}
+ in
+ (divWith ("", [c], [])) . (parStyleToTransform pPr')
+ | (_:cs) <- pStyle pPr
+ , Just True <- pBlockQuote pPr =
+ let pPr' = pPr { pStyle = cs }
+ in
+ blockQuote . (parStyleToTransform pPr')
+ | (_:cs) <- pStyle pPr =
+ let pPr' = pPr { pStyle = cs}
+ in
+ parStyleToTransform pPr'
+ | null (pStyle pPr)
+ , Just left <- indentation pPr >>= leftParIndent
+ , Just hang <- indentation pPr >>= hangingParIndent =
+ let pPr' = pPr { indentation = Nothing }
+ in
+ case (left - hang) > 0 of
+ True -> blockQuote . (parStyleToTransform pPr')
+ False -> parStyleToTransform pPr'
+ | null (pStyle pPr),
+ Just left <- indentation pPr >>= leftParIndent =
+ let pPr' = pPr { indentation = Nothing }
+ in
+ case left > 0 of
+ True -> blockQuote . (parStyleToTransform pPr')
+ False -> parStyleToTransform pPr'
+parStyleToTransform _ = id
+
+bodyPartToBlocks :: BodyPart -> DocxContext Blocks
+bodyPartToBlocks (Paragraph pPr parparts)
+ | not $ null $ codeDivs `intersect` (pStyle pPr) =
+ return
+ $ parStyleToTransform pPr
+ $ codeBlock
+ $ concatMap parPartToString parparts
+ | Just (style, n) <- pHeading pPr = do
+ ils <- local (\s-> s{docxInHeaderBlock=True}) $
+ (concatReduce <$> mapM parPartToInlines parparts)
+ makeHeaderAnchor $
+ headerWith ("", delete style (pStyle pPr), []) n ils
+ | otherwise = do
+ ils <- concatReduce <$> mapM parPartToInlines parparts >>=
+ (return . fromList . trimLineBreaks . normalizeSpaces . toList)
+ dropIls <- gets docxDropCap
+ let ils' = dropIls <> ils
+ if dropCap pPr
+ then do modify $ \s -> s { docxDropCap = ils' }
+ return mempty
+ else do modify $ \s -> s { docxDropCap = mempty }
+ return $ case isNull ils' of
+ True -> mempty
+ _ -> parStyleToTransform pPr $ para ils'
+bodyPartToBlocks (ListItem pPr numId lvl levelInfo parparts) = do
+ let
+ kvs = case levelInfo of
+ (_, fmt, txt, Just start) -> [ ("level", lvl)
+ , ("num-id", numId)
+ , ("format", fmt)
+ , ("text", txt)
+ , ("start", (show start))
+ ]
+
+ (_, fmt, txt, Nothing) -> [ ("level", lvl)
+ , ("num-id", numId)
+ , ("format", fmt)
+ , ("text", txt)
+ ]
+ blks <- bodyPartToBlocks (Paragraph pPr parparts)
+ return $ divWith ("", ["list-item"], kvs) blks
+bodyPartToBlocks (Tbl _ _ _ []) =
+ return $ para mempty
+bodyPartToBlocks (Tbl cap _ look (r:rs)) = do
+ let caption = text cap
+ (hdr, rows) = case firstRowFormatting look of
+ True -> (Just r, rs)
+ False -> (Nothing, r:rs)
+ hdrCells <- case hdr of
+ Just r' -> rowToBlocksList r'
+ Nothing -> return []
+
+ cells <- mapM rowToBlocksList rows
+
+ let size = case null hdrCells of
+ True -> length $ head cells
+ False -> length $ hdrCells
+ --
+ -- The two following variables (horizontal column alignment and
+ -- relative column widths) go to the default at the
+ -- moment. Width information is in the TblGrid field of the Tbl,
+ -- so should be possible. Alignment might be more difficult,
+ -- since there doesn't seem to be a column entity in docx.
+ alignments = replicate size AlignDefault
+ widths = replicate size 0 :: [Double]
+
+ return $ table caption (zip alignments widths) hdrCells cells
+bodyPartToBlocks (OMathPara e) = do
+ return $ para $ displayMath (writeTeX e)
+
+
+-- replace targets with generated anchors.
+rewriteLink' :: Inline -> DocxContext Inline
+rewriteLink' l@(Link ils ('#':target, title)) = do
+ anchorMap <- gets docxAnchorMap
+ return $ case M.lookup target anchorMap of
+ Just newTarget -> (Link ils ('#':newTarget, title))
+ Nothing -> l
+rewriteLink' il = return il
+
+rewriteLinks :: [Block] -> DocxContext [Block]
+rewriteLinks = mapM (walkM rewriteLink')
+
+bodyToOutput :: Body -> DocxContext (Meta, [Block], MediaBag)
+bodyToOutput (Body bps) = do
+ let (metabps, blkbps) = sepBodyParts bps
+ meta <- bodyPartsToMeta metabps
+ blks <- concatReduce <$> mapM bodyPartToBlocks blkbps
+ blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks
+ mediaBag <- gets docxMediaBag
+ return $ (meta,
+ blks',
+ mediaBag)
+
+docxToOutput :: ReaderOptions -> Docx -> (Meta, [Block], MediaBag)
+docxToOutput opts (Docx (Document _ body)) =
+ let dEnv = def { docxOptions = opts} in
+ evalDocxContext (bodyToOutput body) dEnv def
diff --git a/src/Text/Pandoc/Readers/Docx/Fonts.hs b/src/Text/Pandoc/Readers/Docx/Fonts.hs
new file mode 100644
index 000000000..b44c71412
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Docx/Fonts.hs
@@ -0,0 +1,238 @@
+{-
+Copyright (C) 2014 Matthew Pickering <matthewtpickering@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Docx.Fonts
+ Copyright : Copyright (C) 2014 Matthew Pickering
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Matthew Pickering <matthewtpickering@gmail.com>
+ Stability : alpha
+ Portability : portable
+
+Utilities to convert between font codepoints and unicode characters.
+-}
+module Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) where
+
+
+-- | Enumeration of recognised fonts
+data Font = Symbol -- ^ <http://en.wikipedia.org/wiki/Symbol_(typeface) Adobe Symbol>
+ deriving (Show, Eq)
+
+-- | Given a font and codepoint, returns the corresponding unicode
+-- character
+getUnicode :: Font -> Char -> Maybe Char
+getUnicode Symbol c = lookup c symbol
+
+-- Generated from lib/fonts/symbol.txt
+symbol :: [(Char, Char)]
+symbol =
+ [ (' ',' ')
+ , (' ','\160')
+ , ('!','!')
+ , ('"','\8704')
+ , ('#','#')
+ , ('$','\8707')
+ , ('%','%')
+ , ('&','&')
+ , ('\'','\8715')
+ , ('(','(')
+ , (')',')')
+ , ('*','\8727')
+ , ('+','+')
+ , (',',',')
+ , ('-','\8722')
+ , ('.','.')
+ , ('/','/')
+ , ('0','0')
+ , ('1','1')
+ , ('2','2')
+ , ('3','3')
+ , ('4','4')
+ , ('5','5')
+ , ('6','6')
+ , ('7','7')
+ , ('8','8')
+ , ('9','9')
+ , (':',':')
+ , (';',';')
+ , ('<','<')
+ , ('=','=')
+ , ('>','>')
+ , ('?','?')
+ , ('@','\8773')
+ , ('A','\913')
+ , ('B','\914')
+ , ('C','\935')
+ , ('D','\916')
+ , ('D','\8710')
+ , ('E','\917')
+ , ('F','\934')
+ , ('G','\915')
+ , ('H','\919')
+ , ('I','\921')
+ , ('J','\977')
+ , ('K','\922')
+ , ('L','\923')
+ , ('M','\924')
+ , ('N','\925')
+ , ('O','\927')
+ , ('P','\928')
+ , ('Q','\920')
+ , ('R','\929')
+ , ('S','\931')
+ , ('T','\932')
+ , ('U','\933')
+ , ('V','\962')
+ , ('W','\937')
+ , ('W','\8486')
+ , ('X','\926')
+ , ('Y','\936')
+ , ('Z','\918')
+ , ('[','[')
+ , ('\\','\8756')
+ , (']',']')
+ , ('^','\8869')
+ , ('_','_')
+ , ('`','\63717')
+ , ('a','\945')
+ , ('b','\946')
+ , ('c','\967')
+ , ('d','\948')
+ , ('e','\949')
+ , ('f','\966')
+ , ('g','\947')
+ , ('h','\951')
+ , ('i','\953')
+ , ('j','\981')
+ , ('k','\954')
+ , ('l','\955')
+ , ('m','\181')
+ , ('m','\956')
+ , ('n','\957')
+ , ('o','\959')
+ , ('p','\960')
+ , ('q','\952')
+ , ('r','\961')
+ , ('s','\963')
+ , ('t','\964')
+ , ('u','\965')
+ , ('v','\982')
+ , ('w','\969')
+ , ('x','\958')
+ , ('y','\968')
+ , ('z','\950')
+ , ('{','{')
+ , ('|','|')
+ , ('}','}')
+ , ('~','\8764')
+ , ('\160','\8364')
+ , ('\161','\978')
+ , ('\162','\8242')
+ , ('\163','\8804')
+ , ('\164','\8260')
+ , ('\164','\8725')
+ , ('\165','\8734')
+ , ('\166','\402')
+ , ('\167','\9827')
+ , ('\168','\9830')
+ , ('\169','\9829')
+ , ('\170','\9824')
+ , ('\171','\8596')
+ , ('\172','\8592')
+ , ('\173','\8593')
+ , ('\174','\8594')
+ , ('\175','\8595')
+ , ('\176','\176')
+ , ('\177','\177')
+ , ('\178','\8243')
+ , ('\179','\8805')
+ , ('\180','\215')
+ , ('\181','\8733')
+ , ('\182','\8706')
+ , ('\183','\8226')
+ , ('\184','\247')
+ , ('\185','\8800')
+ , ('\186','\8801')
+ , ('\187','\8776')
+ , ('\188','\8230')
+ , ('\189','\63718')
+ , ('\190','\63719')
+ , ('\191','\8629')
+ , ('\192','\8501')
+ , ('\193','\8465')
+ , ('\194','\8476')
+ , ('\195','\8472')
+ , ('\196','\8855')
+ , ('\197','\8853')
+ , ('\198','\8709')
+ , ('\199','\8745')
+ , ('\200','\8746')
+ , ('\201','\8835')
+ , ('\202','\8839')
+ , ('\203','\8836')
+ , ('\204','\8834')
+ , ('\205','\8838')
+ , ('\206','\8712')
+ , ('\207','\8713')
+ , ('\208','\8736')
+ , ('\209','\8711')
+ , ('\210','\63194')
+ , ('\211','\63193')
+ , ('\212','\63195')
+ , ('\213','\8719')
+ , ('\214','\8730')
+ , ('\215','\8901')
+ , ('\216','\172')
+ , ('\217','\8743')
+ , ('\218','\8744')
+ , ('\219','\8660')
+ , ('\220','\8656')
+ , ('\221','\8657')
+ , ('\222','\8658')
+ , ('\223','\8659')
+ , ('\224','\9674')
+ , ('\225','\9001')
+ , ('\226','\63720')
+ , ('\227','\63721')
+ , ('\228','\63722')
+ , ('\229','\8721')
+ , ('\230','\63723')
+ , ('\231','\63724')
+ , ('\232','\63725')
+ , ('\233','\63726')
+ , ('\234','\63727')
+ , ('\235','\63728')
+ , ('\236','\63729')
+ , ('\237','\63730')
+ , ('\238','\63731')
+ , ('\239','\63732')
+ , ('\241','\9002')
+ , ('\242','\8747')
+ , ('\243','\8992')
+ , ('\244','\63733')
+ , ('\245','\8993')
+ , ('\246','\63734')
+ , ('\247','\63735')
+ , ('\248','\63736')
+ , ('\249','\63737')
+ , ('\250','\63738')
+ , ('\251','\63739')
+ , ('\252','\63740')
+ , ('\253','\63741')
+ , ('\254','\63742')]
diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs
new file mode 100644
index 000000000..c265ad074
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Docx/Lists.hs
@@ -0,0 +1,229 @@
+{-
+Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Docx.Lists
+ Copyright : Copyright (C) 2014 Jesse Rosenthal
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
+ Stability : alpha
+ Portability : portable
+
+Functions for converting flat docx paragraphs into nested lists.
+-}
+
+module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets
+ , blocksToDefinitions
+ , listParagraphDivs
+ ) where
+
+import Text.Pandoc.JSON
+import Text.Pandoc.Generic (bottomUp)
+import Text.Pandoc.Shared (trim)
+import Control.Monad
+import Data.List
+import Data.Maybe
+
+isListItem :: Block -> Bool
+isListItem (Div (_, classes, _) _) | "list-item" `elem` classes = True
+isListItem _ = False
+
+getLevel :: Block -> Maybe Integer
+getLevel (Div (_, _, kvs) _) = liftM read $ lookup "level" kvs
+getLevel _ = Nothing
+
+getLevelN :: Block -> Integer
+getLevelN b = case getLevel b of
+ Just n -> n
+ Nothing -> -1
+
+getNumId :: Block -> Maybe Integer
+getNumId (Div (_, _, kvs) _) = liftM read $ lookup "num-id" kvs
+getNumId _ = Nothing
+
+getNumIdN :: Block -> Integer
+getNumIdN b = case getNumId b of
+ Just n -> n
+ Nothing -> -1
+
+getText :: Block -> Maybe String
+getText (Div (_, _, kvs) _) = lookup "text" kvs
+getText _ = Nothing
+
+data ListType = Itemized | Enumerated ListAttributes
+
+listStyleMap :: [(String, ListNumberStyle)]
+listStyleMap = [("upperLetter", UpperAlpha),
+ ("lowerLetter", LowerAlpha),
+ ("upperRoman", UpperRoman),
+ ("lowerRoman", LowerRoman),
+ ("decimal", Decimal)]
+
+listDelimMap :: [(String, ListNumberDelim)]
+listDelimMap = [("%1)", OneParen),
+ ("(%1)", TwoParens),
+ ("%1.", Period)]
+
+getListType :: Block -> Maybe ListType
+getListType b@(Div (_, _, kvs) _) | isListItem b =
+ let
+ start = lookup "start" kvs
+ frmt = lookup "format" kvs
+ txt = lookup "text" kvs
+ in
+ case frmt of
+ Just "bullet" -> Just Itemized
+ Just f ->
+ case txt of
+ Just t -> Just $ Enumerated (
+ read (fromMaybe "1" start) :: Int,
+ fromMaybe DefaultStyle (lookup f listStyleMap),
+ fromMaybe DefaultDelim (lookup t listDelimMap))
+ Nothing -> Nothing
+ _ -> Nothing
+getListType _ = Nothing
+
+listParagraphDivs :: [String]
+listParagraphDivs = ["ListParagraph"]
+
+-- This is a first stab at going through and attaching meaning to list
+-- paragraphs, without an item marker, following a list item. We
+-- assume that these are paragraphs in the same item.
+
+handleListParagraphs :: [Block] -> [Block]
+handleListParagraphs [] = []
+handleListParagraphs (
+ (Div attr1@(_, classes1, _) blks1) :
+ (Div (ident2, classes2, kvs2) blks2) :
+ blks
+ ) | "list-item" `elem` classes1 &&
+ not ("list-item" `elem` classes2) &&
+ (not . null) (listParagraphDivs `intersect` classes2) =
+ -- We don't want to keep this indent.
+ let newDiv2 =
+ (Div (ident2, classes2, filter (\kv -> fst kv /= "indent") kvs2) blks2)
+ in
+ handleListParagraphs ((Div attr1 (blks1 ++ [newDiv2])) : blks)
+handleListParagraphs (blk:blks) = blk : (handleListParagraphs blks)
+
+separateBlocks' :: Block -> [[Block]] -> [[Block]]
+separateBlocks' blk ([] : []) = [[blk]]
+separateBlocks' b@(BulletList _) acc = (init acc) ++ [(last acc) ++ [b]]
+separateBlocks' b@(OrderedList _ _) acc = (init acc) ++ [(last acc) ++ [b]]
+-- The following is for the invisible bullet lists. This is how
+-- pandoc-generated ooxml does multiparagraph item lists.
+separateBlocks' b acc | liftM trim (getText b) == Just "" =
+ (init acc) ++ [(last acc) ++ [b]]
+separateBlocks' b acc = acc ++ [[b]]
+
+separateBlocks :: [Block] -> [[Block]]
+separateBlocks blks = foldr separateBlocks' [[]] (reverse blks)
+
+flatToBullets' :: Integer -> [Block] -> [Block]
+flatToBullets' _ [] = []
+flatToBullets' num xs@(b : elems)
+ | getLevelN b == num = b : (flatToBullets' num elems)
+ | otherwise =
+ let bNumId = getNumIdN b
+ bLevel = getLevelN b
+ (children, remaining) =
+ span
+ (\b' ->
+ ((getLevelN b') > bLevel ||
+ ((getLevelN b') == bLevel && (getNumIdN b') == bNumId)))
+ xs
+ in
+ case getListType b of
+ Just (Enumerated attr) ->
+ (OrderedList attr (separateBlocks $ flatToBullets' bLevel children)) :
+ (flatToBullets' num remaining)
+ _ ->
+ (BulletList (separateBlocks $ flatToBullets' bLevel children)) :
+ (flatToBullets' num remaining)
+
+flatToBullets :: [Block] -> [Block]
+flatToBullets elems = flatToBullets' (-1) elems
+
+singleItemHeaderToHeader :: Block -> Block
+singleItemHeaderToHeader (OrderedList _ [[h@(Header _ _ _)]]) = h
+singleItemHeaderToHeader blk = blk
+
+
+blocksToBullets :: [Block] -> [Block]
+blocksToBullets blks =
+ map singleItemHeaderToHeader $
+ bottomUp removeListDivs $
+ flatToBullets $ (handleListParagraphs blks)
+
+plainParaInlines :: Block -> [Inline]
+plainParaInlines (Plain ils) = ils
+plainParaInlines (Para ils) = ils
+plainParaInlines _ = []
+
+blocksToDefinitions' :: [([Inline], [[Block]])] -> [Block] -> [Block] -> [Block]
+blocksToDefinitions' [] acc [] = reverse acc
+blocksToDefinitions' defAcc acc [] =
+ reverse $ (DefinitionList (reverse defAcc)) : acc
+blocksToDefinitions' defAcc acc
+ ((Div (_, classes1, _) blks1) : (Div (ident2, classes2, kvs2) blks2) : blks)
+ | "DefinitionTerm" `elem` classes1 && "Definition" `elem` classes2 =
+ let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2)
+ pair = case remainingAttr2 == ("", [], []) of
+ True -> (concatMap plainParaInlines blks1, [blks2])
+ False -> (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]])
+ in
+ blocksToDefinitions' (pair : defAcc) acc blks
+blocksToDefinitions' defAcc acc
+ ((Div (ident2, classes2, kvs2) blks2) : blks)
+ | (not . null) defAcc && "Definition" `elem` classes2 =
+ let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2)
+ defItems2 = case remainingAttr2 == ("", [], []) of
+ True -> blks2
+ False -> [Div remainingAttr2 blks2]
+ ((defTerm, defItems):defs) = defAcc
+ defAcc' = case null defItems of
+ True -> (defTerm, [defItems2]) : defs
+ False -> (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs
+ in
+ blocksToDefinitions' defAcc' acc blks
+blocksToDefinitions' [] acc (b:blks) =
+ blocksToDefinitions' [] (b:acc) blks
+blocksToDefinitions' defAcc acc (b:blks) =
+ blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks
+
+removeListDivs' :: Block -> [Block]
+removeListDivs' (Div (ident, classes, kvs) blks)
+ | "list-item" `elem` classes =
+ case delete "list-item" classes of
+ [] -> blks
+ classes' -> [Div (ident, classes', kvs) $ blks]
+removeListDivs' (Div (ident, classes, kvs) blks)
+ | not $ null $ listParagraphDivs `intersect` classes =
+ case classes \\ listParagraphDivs of
+ [] -> blks
+ classes' -> [Div (ident, classes', kvs) blks]
+removeListDivs' blk = [blk]
+
+removeListDivs :: [Block] -> [Block]
+removeListDivs = concatMap removeListDivs'
+
+
+
+blocksToDefinitions :: [Block] -> [Block]
+blocksToDefinitions = blocksToDefinitions' [] []
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
new file mode 100644
index 000000000..5fd6b7a81
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -0,0 +1,911 @@
+{-# LANGUAGE PatternGuards, ViewPatterns, FlexibleInstances #-}
+
+{-
+Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Docx.Parse
+ Copyright : Copyright (C) 2014 Jesse Rosenthal
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
+ Stability : alpha
+ Portability : portable
+
+Conversion of docx archive into Docx haskell type
+-}
+
+module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
+ , Document(..)
+ , Body(..)
+ , BodyPart(..)
+ , TblLook(..)
+ , ParPart(..)
+ , Run(..)
+ , RunElem(..)
+ , Notes
+ , Numbering
+ , Relationship
+ , Media
+ , RunStyle(..)
+ , VertAlign(..)
+ , ParIndentation(..)
+ , ParagraphStyle(..)
+ , Row(..)
+ , Cell(..)
+ , archiveToDocx
+ ) where
+import Codec.Archive.Zip
+import Text.XML.Light
+import Data.Maybe
+import Data.List
+import System.FilePath
+import Data.Bits ((.|.))
+import qualified Data.ByteString.Lazy as B
+import qualified Text.Pandoc.UTF8 as UTF8
+import Control.Monad.Reader
+import Control.Applicative ((<$>), (<|>))
+import qualified Data.Map as M
+import Text.Pandoc.Compat.Except
+import Text.TeXMath.Readers.OMML (readOMML)
+import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..))
+import Text.TeXMath (Exp)
+import Data.Char (readLitChar, ord, chr, isDigit)
+
+data ReaderEnv = ReaderEnv { envNotes :: Notes
+ , envNumbering :: Numbering
+ , envRelationships :: [Relationship]
+ , envMedia :: Media
+ , envFont :: Maybe Font
+ , envCharStyles :: CharStyleMap
+ , envParStyles :: ParStyleMap
+ }
+ deriving Show
+
+data DocxError = DocxError | WrongElem
+ deriving Show
+
+instance Error DocxError where
+ noMsg = WrongElem
+
+type D = ExceptT DocxError (Reader ReaderEnv)
+
+runD :: D a -> ReaderEnv -> Either DocxError a
+runD dx re = runReader (runExceptT dx ) re
+
+maybeToD :: Maybe a -> D a
+maybeToD (Just a) = return a
+maybeToD Nothing = throwError DocxError
+
+eitherToD :: Either a b -> D b
+eitherToD (Right b) = return b
+eitherToD (Left _) = throwError DocxError
+
+concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b]
+concatMapM f xs = liftM concat (mapM f xs)
+
+
+-- This is similar to `mapMaybe`: it maps a function returning the D
+-- monad over a list, and only keeps the non-erroring return values.
+mapD :: (a -> D b) -> [a] -> D [b]
+mapD f xs =
+ let handler x = (f x >>= (\y-> return [y])) `catchError` (\_ -> return [])
+ in
+ concatMapM handler xs
+
+type NameSpaces = [(String, String)]
+
+data Docx = Docx Document
+ deriving Show
+
+data Document = Document NameSpaces Body
+ deriving Show
+
+data Body = Body [BodyPart]
+ deriving Show
+
+type Media = [(FilePath, B.ByteString)]
+
+type CharStyle = (String, RunStyle)
+
+type ParStyle = (String, ParStyleData)
+
+type CharStyleMap = M.Map String RunStyle
+
+type ParStyleMap = M.Map String ParStyleData
+
+data Numbering = Numbering NameSpaces [Numb] [AbstractNumb]
+ deriving Show
+
+data Numb = Numb String String -- right now, only a key to an abstract num
+ deriving Show
+
+data AbstractNumb = AbstractNumb String [Level]
+ deriving Show
+
+-- (ilvl, format, string, start)
+type Level = (String, String, String, Maybe Integer)
+
+data Relationship = Relationship (RelId, Target)
+ deriving Show
+
+data Notes = Notes NameSpaces
+ (Maybe (M.Map String Element))
+ (Maybe (M.Map String Element))
+ deriving Show
+
+data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer
+ , rightParIndent :: Maybe Integer
+ , hangingParIndent :: Maybe Integer}
+ deriving Show
+
+data ParagraphStyle = ParagraphStyle { pStyle :: [String]
+ , indentation :: Maybe ParIndentation
+ , dropCap :: Bool
+ , pHeading :: Maybe (String, Int)
+ , pBlockQuote :: Maybe Bool
+ }
+ deriving Show
+
+defaultParagraphStyle :: ParagraphStyle
+defaultParagraphStyle = ParagraphStyle { pStyle = []
+ , indentation = Nothing
+ , dropCap = False
+ , pHeading = Nothing
+ , pBlockQuote = Nothing
+ }
+
+
+data BodyPart = Paragraph ParagraphStyle [ParPart]
+ | ListItem ParagraphStyle String String Level [ParPart]
+ | Tbl String TblGrid TblLook [Row]
+ | OMathPara [Exp]
+ deriving Show
+
+type TblGrid = [Integer]
+
+data TblLook = TblLook {firstRowFormatting::Bool}
+ deriving Show
+
+defaultTblLook :: TblLook
+defaultTblLook = TblLook{firstRowFormatting = False}
+
+data Row = Row [Cell]
+ deriving Show
+
+data Cell = Cell [BodyPart]
+ deriving Show
+
+data ParPart = PlainRun Run
+ | Insertion ChangeId Author ChangeDate [Run]
+ | Deletion ChangeId Author ChangeDate [Run]
+ | BookMark BookMarkId Anchor
+ | InternalHyperLink Anchor [Run]
+ | ExternalHyperLink URL [Run]
+ | Drawing FilePath B.ByteString
+ | PlainOMath [Exp]
+ deriving Show
+
+data Run = Run RunStyle [RunElem]
+ | Footnote [BodyPart]
+ | Endnote [BodyPart]
+ | InlineDrawing FilePath B.ByteString
+ deriving Show
+
+data RunElem = TextRun String | LnBrk | Tab
+ deriving Show
+
+data VertAlign = BaseLn | SupScrpt | SubScrpt
+ deriving Show
+
+data RunStyle = RunStyle { isBold :: Maybe Bool
+ , isItalic :: Maybe Bool
+ , isSmallCaps :: Maybe Bool
+ , isStrike :: Maybe Bool
+ , rVertAlign :: Maybe VertAlign
+ , rUnderline :: Maybe String
+ , rStyle :: Maybe CharStyle}
+ deriving Show
+
+data ParStyleData = ParStyleData { headingLev :: Maybe (String, Int)
+ , isBlockQuote :: Maybe Bool
+ , psStyle :: Maybe ParStyle}
+ deriving Show
+
+defaultRunStyle :: RunStyle
+defaultRunStyle = RunStyle { isBold = Nothing
+ , isItalic = Nothing
+ , isSmallCaps = Nothing
+ , isStrike = Nothing
+ , rVertAlign = Nothing
+ , rUnderline = Nothing
+ , rStyle = Nothing}
+
+
+type Target = String
+type Anchor = String
+type URL = String
+type BookMarkId = String
+type RelId = String
+type ChangeId = String
+type Author = String
+type ChangeDate = String
+
+attrToNSPair :: Attr -> Maybe (String, String)
+attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val)
+attrToNSPair _ = Nothing
+
+archiveToDocx :: Archive -> Either DocxError Docx
+archiveToDocx archive = do
+ let notes = archiveToNotes archive
+ numbering = archiveToNumbering archive
+ rels = archiveToRelationships archive
+ media = archiveToMedia archive
+ (styles, parstyles) = archiveToStyles archive
+ rEnv = ReaderEnv notes numbering rels media Nothing styles parstyles
+ doc <- runD (archiveToDocument archive) rEnv
+ return $ Docx doc
+
+
+archiveToDocument :: Archive -> D Document
+archiveToDocument zf = do
+ entry <- maybeToD $ findEntryByPath "word/document.xml" zf
+ docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
+ let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
+ bodyElem <- maybeToD $ findChild (elemName namespaces "w" "body") docElem
+ body <- elemToBody namespaces bodyElem
+ return $ Document namespaces body
+
+elemToBody :: NameSpaces -> Element -> D Body
+elemToBody ns element | isElem ns "w" "body" element =
+ mapD (elemToBodyPart ns) (elChildren element) >>=
+ (\bps -> return $ Body bps)
+elemToBody _ _ = throwError WrongElem
+
+archiveToStyles :: Archive -> (CharStyleMap, ParStyleMap)
+archiveToStyles zf =
+ let stylesElem = findEntryByPath "word/styles.xml" zf >>=
+ (parseXMLDoc . UTF8.toStringLazy . fromEntry)
+ in
+ case stylesElem of
+ Nothing -> (M.empty, M.empty)
+ Just styElem ->
+ let namespaces = mapMaybe attrToNSPair (elAttribs styElem)
+ in
+ ( M.fromList $ buildBasedOnList namespaces styElem
+ (Nothing :: Maybe CharStyle),
+ M.fromList $ buildBasedOnList namespaces styElem
+ (Nothing :: Maybe ParStyle) )
+
+isBasedOnStyle :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> Bool
+isBasedOnStyle ns element parentStyle
+ | isElem ns "w" "style" element
+ , Just styleType <- findAttr (elemName ns "w" "type") element
+ , styleType == cStyleType parentStyle
+ , Just basedOnVal <- findChild (elemName ns "w" "basedOn") element >>=
+ findAttr (elemName ns "w" "val")
+ , Just ps <- parentStyle = (basedOnVal == getStyleId ps)
+ | isElem ns "w" "style" element
+ , Just styleType <- findAttr (elemName ns "w" "type") element
+ , styleType == cStyleType parentStyle
+ , Nothing <- findChild (elemName ns "w" "basedOn") element
+ , Nothing <- parentStyle = True
+ | otherwise = False
+
+class ElemToStyle a where
+ cStyleType :: Maybe a -> String
+ elemToStyle :: NameSpaces -> Element -> Maybe a -> Maybe a
+ getStyleId :: a -> String
+
+instance ElemToStyle CharStyle where
+ cStyleType _ = "character"
+ elemToStyle ns element parentStyle
+ | isElem ns "w" "style" element
+ , Just "character" <- findAttr (elemName ns "w" "type") element
+ , Just styleId <- findAttr (elemName ns "w" "styleId") element =
+ Just (styleId, elemToRunStyle ns element parentStyle)
+ | otherwise = Nothing
+ getStyleId s = fst s
+
+instance ElemToStyle ParStyle where
+ cStyleType _ = "paragraph"
+ elemToStyle ns element parentStyle
+ | isElem ns "w" "style" element
+ , Just "paragraph" <- findAttr (elemName ns "w" "type") element
+ , Just styleId <- findAttr (elemName ns "w" "styleId") element =
+ Just (styleId, elemToParStyleData ns element parentStyle)
+ | otherwise = Nothing
+ getStyleId s = fst s
+
+getStyleChildren :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> [a]
+getStyleChildren ns element parentStyle
+ | isElem ns "w" "styles" element =
+ mapMaybe (\e -> elemToStyle ns e parentStyle) $
+ filterChildren (\e' -> isBasedOnStyle ns e' parentStyle) element
+ | otherwise = []
+
+buildBasedOnList :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> [a]
+buildBasedOnList ns element rootStyle =
+ case (getStyleChildren ns element rootStyle) of
+ [] -> []
+ stys -> stys ++
+ (concatMap (\s -> buildBasedOnList ns element (Just s)) stys)
+
+archiveToNotes :: Archive -> Notes
+archiveToNotes zf =
+ let fnElem = findEntryByPath "word/footnotes.xml" zf
+ >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
+ enElem = findEntryByPath "word/endnotes.xml" zf
+ >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
+ fn_namespaces = case fnElem of
+ Just e -> mapMaybe attrToNSPair (elAttribs e)
+ Nothing -> []
+ en_namespaces = case enElem of
+ Just e -> mapMaybe attrToNSPair (elAttribs e)
+ Nothing -> []
+ ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces
+ fn = fnElem >>= (elemToNotes ns "footnote")
+ en = enElem >>= (elemToNotes ns "endnote")
+ in
+ Notes ns fn en
+
+filePathIsRel :: FilePath -> Bool
+filePathIsRel fp =
+ let (dir, name) = splitFileName fp
+ in
+ (dir == "word/_rels/") && ((takeExtension name) == ".rels")
+
+relElemToRelationship :: Element -> Maybe Relationship
+relElemToRelationship element | qName (elName element) == "Relationship" =
+ do
+ relId <- findAttr (QName "Id" Nothing Nothing) element
+ target <- findAttr (QName "Target" Nothing Nothing) element
+ return $ Relationship (relId, target)
+relElemToRelationship _ = Nothing
+
+
+archiveToRelationships :: Archive -> [Relationship]
+archiveToRelationships archive =
+ let relPaths = filter filePathIsRel (filesInArchive archive)
+ entries = mapMaybe (\f -> findEntryByPath f archive) relPaths
+ relElems = mapMaybe (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries
+ rels = mapMaybe relElemToRelationship $ concatMap elChildren relElems
+ in
+ rels
+
+filePathIsMedia :: FilePath -> Bool
+filePathIsMedia fp =
+ let (dir, _) = splitFileName fp
+ in
+ (dir == "word/media/")
+
+getMediaPair :: Archive -> FilePath -> Maybe (FilePath, B.ByteString)
+getMediaPair zf fp =
+ case findEntryByPath fp zf of
+ Just e -> Just (fp, fromEntry e)
+ Nothing -> Nothing
+
+archiveToMedia :: Archive -> Media
+archiveToMedia zf =
+ mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf))
+
+lookupLevel :: String -> String -> Numbering -> Maybe Level
+lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do
+ absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs
+ lvls <- lookup absNumId $ map (\(AbstractNumb aid ls) -> (aid, ls)) absNumbs
+ lvl <- lookup ilvl $ map (\l@(i, _, _, _) -> (i, l)) lvls
+ return lvl
+
+numElemToNum :: NameSpaces -> Element -> Maybe Numb
+numElemToNum ns element |
+ qName (elName element) == "num" &&
+ qURI (elName element) == (lookup "w" ns) = do
+ numId <- findAttr (QName "numId" (lookup "w" ns) (Just "w")) element
+ absNumId <- findChild (QName "abstractNumId" (lookup "w" ns) (Just "w")) element
+ >>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
+ return $ Numb numId absNumId
+numElemToNum _ _ = Nothing
+
+absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb
+absNumElemToAbsNum ns element |
+ qName (elName element) == "abstractNum" &&
+ qURI (elName element) == (lookup "w" ns) = do
+ absNumId <- findAttr
+ (QName "abstractNumId" (lookup "w" ns) (Just "w"))
+ element
+ let levelElems = findChildren
+ (QName "lvl" (lookup "w" ns) (Just "w"))
+ element
+ levels = mapMaybe (levelElemToLevel ns) levelElems
+ return $ AbstractNumb absNumId levels
+absNumElemToAbsNum _ _ = Nothing
+
+levelElemToLevel :: NameSpaces -> Element -> Maybe Level
+levelElemToLevel ns element |
+ qName (elName element) == "lvl" &&
+ qURI (elName element) == (lookup "w" ns) = do
+ ilvl <- findAttr (QName "ilvl" (lookup "w" ns) (Just "w")) element
+ fmt <- findChild (QName "numFmt" (lookup "w" ns) (Just "w")) element
+ >>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
+ txt <- findChild (QName "lvlText" (lookup "w" ns) (Just "w")) element
+ >>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
+ let start = findChild (QName "start" (lookup "w" ns) (Just "w")) element
+ >>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
+ >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)])))
+ return (ilvl, fmt, txt, start)
+levelElemToLevel _ _ = Nothing
+
+archiveToNumbering' :: Archive -> Maybe Numbering
+archiveToNumbering' zf = do
+ case findEntryByPath "word/numbering.xml" zf of
+ Nothing -> Just $ Numbering [] [] []
+ Just entry -> do
+ numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
+ let namespaces = mapMaybe attrToNSPair (elAttribs numberingElem)
+ numElems = findChildren
+ (QName "num" (lookup "w" namespaces) (Just "w"))
+ numberingElem
+ absNumElems = findChildren
+ (QName "abstractNum" (lookup "w" namespaces) (Just "w"))
+ numberingElem
+ nums = mapMaybe (numElemToNum namespaces) numElems
+ absNums = mapMaybe (absNumElemToAbsNum namespaces) absNumElems
+ return $ Numbering namespaces nums absNums
+
+archiveToNumbering :: Archive -> Numbering
+archiveToNumbering archive =
+ fromMaybe (Numbering [] [] []) (archiveToNumbering' archive)
+
+elemToNotes :: NameSpaces -> String -> Element -> Maybe (M.Map String Element)
+elemToNotes ns notetype element
+ | isElem ns "w" (notetype ++ "s") element =
+ let pairs = mapMaybe
+ (\e -> findAttr (elemName ns "w" "id") e >>=
+ (\a -> Just (a, e)))
+ (findChildren (elemName ns "w" notetype) element)
+ in
+ Just $ M.fromList $ pairs
+elemToNotes _ _ _ = Nothing
+
+---------------------------------------------
+---------------------------------------------
+
+elemName :: NameSpaces -> String -> String -> QName
+elemName ns prefix name = (QName name (lookup prefix ns) (Just prefix))
+
+isElem :: NameSpaces -> String -> String -> Element -> Bool
+isElem ns prefix name element =
+ qName (elName element) == name &&
+ qURI (elName element) == (lookup prefix ns)
+
+
+elemToTblGrid :: NameSpaces -> Element -> D TblGrid
+elemToTblGrid ns element | isElem ns "w" "tblGrid" element =
+ let cols = findChildren (elemName ns "w" "gridCol") element
+ in
+ mapD (\e -> maybeToD (findAttr (elemName ns "w" "val") e >>= stringToInteger))
+ cols
+elemToTblGrid _ _ = throwError WrongElem
+
+elemToTblLook :: NameSpaces -> Element -> D TblLook
+elemToTblLook ns element | isElem ns "w" "tblLook" element =
+ let firstRow = findAttr (elemName ns "w" "firstRow") element
+ val = findAttr (elemName ns "w" "val") element
+ firstRowFmt =
+ case firstRow of
+ Just "1" -> True
+ Just _ -> False
+ Nothing -> case val of
+ Just bitMask -> testBitMask bitMask 0x020
+ Nothing -> False
+ in
+ return $ TblLook{firstRowFormatting = firstRowFmt}
+elemToTblLook _ _ = throwError WrongElem
+
+elemToRow :: NameSpaces -> Element -> D Row
+elemToRow ns element | isElem ns "w" "tr" element =
+ do
+ let cellElems = findChildren (elemName ns "w" "tc") element
+ cells <- mapD (elemToCell ns) cellElems
+ return $ Row cells
+elemToRow _ _ = throwError WrongElem
+
+elemToCell :: NameSpaces -> Element -> D Cell
+elemToCell ns element | isElem ns "w" "tc" element =
+ do
+ cellContents <- mapD (elemToBodyPart ns) (elChildren element)
+ return $ Cell cellContents
+elemToCell _ _ = throwError WrongElem
+
+elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation
+elemToParIndentation ns element | isElem ns "w" "ind" element =
+ Just $ ParIndentation {
+ leftParIndent =
+ findAttr (QName "left" (lookup "w" ns) (Just "w")) element >>=
+ stringToInteger
+ , rightParIndent =
+ findAttr (QName "right" (lookup "w" ns) (Just "w")) element >>=
+ stringToInteger
+ , hangingParIndent =
+ findAttr (QName "hanging" (lookup "w" ns) (Just "w")) element >>=
+ stringToInteger}
+elemToParIndentation _ _ = Nothing
+
+
+elemToNumInfo :: NameSpaces -> Element -> Maybe (String, String)
+elemToNumInfo ns element | isElem ns "w" "p" element = do
+ let pPr = findChild (elemName ns "w" "pPr") element
+ numPr = pPr >>= findChild (elemName ns "w" "numPr")
+ lvl <- numPr >>=
+ findChild (elemName ns "w" "ilvl") >>=
+ findAttr (elemName ns "w" "val")
+ numId <- numPr >>=
+ findChild (elemName ns "w" "numId") >>=
+ findAttr (elemName ns "w" "val")
+ return (numId, lvl)
+elemToNumInfo _ _ = Nothing
+
+testBitMask :: String -> Int -> Bool
+testBitMask bitMaskS n =
+ case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of
+ [] -> False
+ ((n', _) : _) -> ((n' .|. n) /= 0)
+
+stringToInteger :: String -> Maybe Integer
+stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)])
+
+elemToBodyPart :: NameSpaces -> Element -> D BodyPart
+elemToBodyPart ns element
+ | isElem ns "w" "p" element
+ , (c:_) <- findChildren (elemName ns "m" "oMathPara") element =
+ do
+ expsLst <- eitherToD $ readOMML $ showElement c
+ return $ OMathPara expsLst
+elemToBodyPart ns element
+ | isElem ns "w" "p" element
+ , Just (numId, lvl) <- elemToNumInfo ns element = do
+ sty <- asks envParStyles
+ let parstyle = elemToParagraphStyle ns element sty
+ parparts <- mapD (elemToParPart ns) (elChildren element)
+ num <- asks envNumbering
+ case lookupLevel numId lvl num of
+ Just levelInfo -> return $ ListItem parstyle numId lvl levelInfo parparts
+ Nothing -> throwError WrongElem
+elemToBodyPart ns element
+ | isElem ns "w" "p" element = do
+ sty <- asks envParStyles
+ let parstyle = elemToParagraphStyle ns element sty
+ parparts <- mapD (elemToParPart ns) (elChildren element)
+ return $ Paragraph parstyle parparts
+elemToBodyPart ns element
+ | isElem ns "w" "tbl" element = do
+ let caption' = findChild (elemName ns "w" "tblPr") element
+ >>= findChild (elemName ns "w" "tblCaption")
+ >>= findAttr (elemName ns "w" "val")
+ caption = (fromMaybe "" caption')
+ grid' = case findChild (elemName ns "w" "tblGrid") element of
+ Just g -> elemToTblGrid ns g
+ Nothing -> return []
+ tblLook' = case findChild (elemName ns "w" "tblPr") element >>=
+ findChild (elemName ns "w" "tblLook")
+ of
+ Just l -> elemToTblLook ns l
+ Nothing -> return defaultTblLook
+
+ grid <- grid'
+ tblLook <- tblLook'
+ rows <- mapD (elemToRow ns) (elChildren element)
+ return $ Tbl caption grid tblLook rows
+elemToBodyPart _ _ = throwError WrongElem
+
+lookupRelationship :: RelId -> [Relationship] -> Maybe Target
+lookupRelationship relid rels =
+ lookup relid (map (\(Relationship pair) -> pair) rels)
+
+expandDrawingId :: String -> D (FilePath, B.ByteString)
+expandDrawingId s = do
+ target <- asks (lookupRelationship s . envRelationships)
+ case target of
+ Just filepath -> do
+ bytes <- asks (lookup ("word/" ++ filepath) . envMedia)
+ case bytes of
+ Just bs -> return (filepath, bs)
+ Nothing -> throwError DocxError
+ Nothing -> throwError DocxError
+
+elemToParPart :: NameSpaces -> Element -> D ParPart
+elemToParPart ns element
+ | isElem ns "w" "r" element
+ , Just _ <- findChild (elemName ns "w" "drawing") element =
+ let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
+ drawing = findElement (QName "blip" (Just a_ns) (Just "a")) element
+ >>= findAttr (QName "embed" (lookup "r" ns) (Just "r"))
+ in
+ case drawing of
+ Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp bs)
+ Nothing -> throwError WrongElem
+elemToParPart ns element
+ | isElem ns "w" "r" element =
+ elemToRun ns element >>= (\r -> return $ PlainRun r)
+elemToParPart ns element
+ | isElem ns "w" "ins" element
+ , Just cId <- findAttr (elemName ns "w" "id") element
+ , Just cAuthor <- findAttr (elemName ns "w" "author") element
+ , Just cDate <- findAttr (elemName ns "w" "date") element = do
+ runs <- mapD (elemToRun ns) (elChildren element)
+ return $ Insertion cId cAuthor cDate runs
+elemToParPart ns element
+ | isElem ns "w" "del" element
+ , Just cId <- findAttr (elemName ns "w" "id") element
+ , Just cAuthor <- findAttr (elemName ns "w" "author") element
+ , Just cDate <- findAttr (elemName ns "w" "date") element = do
+ runs <- mapD (elemToRun ns) (elChildren element)
+ return $ Deletion cId cAuthor cDate runs
+elemToParPart ns element
+ | isElem ns "w" "bookmarkStart" element
+ , Just bmId <- findAttr (elemName ns "w" "id") element
+ , Just bmName <- findAttr (elemName ns "w" "name") element =
+ return $ BookMark bmId bmName
+elemToParPart ns element
+ | isElem ns "w" "hyperlink" element
+ , Just relId <- findAttr (elemName ns "r" "id") element = do
+ runs <- mapD (elemToRun ns) (elChildren element)
+ rels <- asks envRelationships
+ case lookupRelationship relId rels of
+ Just target -> do
+ case findAttr (elemName ns "w" "anchor") element of
+ Just anchor -> return $ ExternalHyperLink (target ++ '#':anchor) runs
+ Nothing -> return $ ExternalHyperLink target runs
+ Nothing -> return $ ExternalHyperLink "" runs
+elemToParPart ns element
+ | isElem ns "w" "hyperlink" element
+ , Just anchor <- findAttr (elemName ns "w" "anchor") element = do
+ runs <- mapD (elemToRun ns) (elChildren element)
+ return $ InternalHyperLink anchor runs
+elemToParPart ns element
+ | isElem ns "m" "oMath" element =
+ (eitherToD $ readOMML $ showElement element) >>= (return . PlainOMath)
+elemToParPart _ _ = throwError WrongElem
+
+lookupFootnote :: String -> Notes -> Maybe Element
+lookupFootnote s (Notes _ fns _) = fns >>= (M.lookup s)
+
+lookupEndnote :: String -> Notes -> Maybe Element
+lookupEndnote s (Notes _ _ ens) = ens >>= (M.lookup s)
+
+elemToRun :: NameSpaces -> Element -> D Run
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just drawingElem <- findChild (elemName ns "w" "drawing") element =
+ let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
+ drawing = findElement (QName "blip" (Just a_ns) (Just "a")) drawingElem
+ >>= findAttr (QName "embed" (lookup "r" ns) (Just "r"))
+ in
+ case drawing of
+ Just s -> expandDrawingId s >>=
+ (\(fp, bs) -> return $ InlineDrawing fp bs)
+ Nothing -> throwError WrongElem
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just ref <- findChild (elemName ns "w" "footnoteReference") element
+ , Just fnId <- findAttr (elemName ns "w" "id") ref = do
+ notes <- asks envNotes
+ case lookupFootnote fnId notes of
+ Just e -> do bps <- mapD (elemToBodyPart ns) (elChildren e)
+ return $ Footnote bps
+ Nothing -> return $ Footnote []
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just ref <- findChild (elemName ns "w" "endnoteReference") element
+ , Just enId <- findAttr (elemName ns "w" "id") ref = do
+ notes <- asks envNotes
+ case lookupEndnote enId notes of
+ Just e -> do bps <- mapD (elemToBodyPart ns) (elChildren e)
+ return $ Endnote bps
+ Nothing -> return $ Endnote []
+elemToRun ns element
+ | isElem ns "w" "r" element = do
+ runElems <- elemToRunElems ns element
+ runStyle <- elemToRunStyleD ns element
+ return $ Run runStyle runElems
+elemToRun _ _ = throwError WrongElem
+
+getParentStyleValue :: (ParStyleData -> Maybe a) -> ParStyleData -> Maybe a
+getParentStyleValue field style
+ | Just value <- field style = Just value
+ | Just parentStyle <- psStyle style
+ = getParentStyleValue field (snd parentStyle)
+getParentStyleValue _ _ = Nothing
+
+getParStyleField :: (ParStyleData -> Maybe a) -> ParStyleMap -> [String] ->
+ Maybe a
+getParStyleField field stylemap styles
+ | x <- mapMaybe (\x -> M.lookup x stylemap) styles
+ , (y:_) <- mapMaybe (getParentStyleValue field) x
+ = Just y
+getParStyleField _ _ _ = Nothing
+
+elemToParagraphStyle :: NameSpaces -> Element -> ParStyleMap -> ParagraphStyle
+elemToParagraphStyle ns element sty
+ | Just pPr <- findChild (elemName ns "w" "pPr") element =
+ let style =
+ mapMaybe
+ (findAttr (elemName ns "w" "val"))
+ (findChildren (elemName ns "w" "pStyle") pPr)
+ in ParagraphStyle
+ {pStyle = style
+ , indentation =
+ findChild (elemName ns "w" "ind") pPr >>=
+ elemToParIndentation ns
+ , dropCap =
+ case
+ findChild (elemName ns "w" "framePr") pPr >>=
+ findAttr (elemName ns "w" "dropCap")
+ of
+ Just "none" -> False
+ Just _ -> True
+ Nothing -> False
+ , pHeading = getParStyleField headingLev sty style
+ , pBlockQuote = getParStyleField isBlockQuote sty style
+ }
+elemToParagraphStyle _ _ _ = defaultParagraphStyle
+
+checkOnOff :: NameSpaces -> Element -> QName -> Maybe Bool
+checkOnOff ns rPr tag
+ | Just t <- findChild tag rPr
+ , Just val <- findAttr (elemName ns "w" "val") t =
+ Just $ case val of
+ "true" -> True
+ "false" -> False
+ "on" -> True
+ "off" -> False
+ "1" -> True
+ "0" -> False
+ _ -> False
+ | Just _ <- findChild tag rPr = Just True
+checkOnOff _ _ _ = Nothing
+
+elemToRunStyleD :: NameSpaces -> Element -> D RunStyle
+elemToRunStyleD ns element
+ | Just rPr <- findChild (elemName ns "w" "rPr") element = do
+ charStyles <- asks envCharStyles
+ let parentSty = case
+ findChild (elemName ns "w" "rStyle") rPr >>=
+ findAttr (elemName ns "w" "val")
+ of
+ Just styName | Just style <- M.lookup styName charStyles ->
+ Just (styName, style)
+ _ -> Nothing
+ return $ elemToRunStyle ns element parentSty
+elemToRunStyleD _ _ = return defaultRunStyle
+
+elemToRunStyle :: NameSpaces -> Element -> Maybe CharStyle -> RunStyle
+elemToRunStyle ns element parentStyle
+ | Just rPr <- findChild (elemName ns "w" "rPr") element =
+ RunStyle
+ {
+ isBold = checkOnOff ns rPr (elemName ns "w" "b")
+ , isItalic = checkOnOff ns rPr (elemName ns "w" "i")
+ , isSmallCaps = checkOnOff ns rPr (elemName ns "w" "smallCaps")
+ , isStrike = checkOnOff ns rPr (elemName ns "w" "strike")
+ , rVertAlign =
+ findChild (elemName ns "w" "vertAlign") rPr >>=
+ findAttr (elemName ns "w" "val") >>=
+ \v -> Just $ case v of
+ "superscript" -> SupScrpt
+ "subscript" -> SubScrpt
+ _ -> BaseLn
+ , rUnderline =
+ findChild (elemName ns "w" "u") rPr >>=
+ findAttr (elemName ns "w" "val")
+ , rStyle = parentStyle
+ }
+elemToRunStyle _ _ _ = defaultRunStyle
+
+isNumericNotNull :: String -> Bool
+isNumericNotNull str = (str /= []) && (all isDigit str)
+
+getHeaderLevel :: NameSpaces -> Element -> Maybe (String,Int)
+getHeaderLevel ns element
+ | Just styleId <- findAttr (elemName ns "w" "styleId") element
+ , Just index <- stripPrefix "Heading" styleId
+ , isNumericNotNull index = Just (styleId, read index)
+ | Just styleId <- findAttr (elemName ns "w" "styleId") element
+ , Just index <- findChild (elemName ns "w" "name") element >>=
+ findAttr (elemName ns "w" "val") >>=
+ stripPrefix "heading "
+ , isNumericNotNull index = Just (styleId, read index)
+getHeaderLevel _ _ = Nothing
+
+blockQuoteStyleIds :: [String]
+blockQuoteStyleIds = ["Quote", "BlockQuote", "BlockQuotation"]
+
+blockQuoteStyleNames :: [String]
+blockQuoteStyleNames = ["Quote", "Block Text"]
+
+getBlockQuote :: NameSpaces -> Element -> Maybe Bool
+getBlockQuote ns element
+ | Just styleId <- findAttr (elemName ns "w" "styleId") element
+ , styleId `elem` blockQuoteStyleIds = Just True
+ | Just styleName <- findChild (elemName ns "w" "name") element >>=
+ findAttr (elemName ns "w" "val")
+ , styleName `elem` blockQuoteStyleNames = Just True
+getBlockQuote _ _ = Nothing
+
+elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> ParStyleData
+elemToParStyleData ns element parentStyle =
+ ParStyleData
+ {
+ headingLev = getHeaderLevel ns element
+ , isBlockQuote = getBlockQuote ns element
+ , psStyle = parentStyle
+ }
+
+elemToRunElem :: NameSpaces -> Element -> D RunElem
+elemToRunElem ns element
+ | isElem ns "w" "t" element
+ || isElem ns "w" "delText" element
+ || isElem ns "m" "t" element = do
+ let str = strContent element
+ font <- asks envFont
+ case font of
+ Nothing -> return $ TextRun str
+ Just f -> return . TextRun $
+ map (\x -> fromMaybe x . getUnicode f . lowerFromPrivate $ x) str
+ | isElem ns "w" "br" element = return LnBrk
+ | isElem ns "w" "tab" element = return Tab
+ | isElem ns "w" "sym" element = return (getSymChar ns element)
+ | otherwise = throwError WrongElem
+ where
+ lowerFromPrivate (ord -> c)
+ | c >= ord '\xF000' = chr $ c - ord '\xF000'
+ | otherwise = chr c
+
+-- The char attribute is a hex string
+getSymChar :: NameSpaces -> Element -> RunElem
+getSymChar ns element
+ | Just s <- lowerFromPrivate <$> getCodepoint
+ , Just font <- getFont =
+ let [(char, _)] = readLitChar ("\\x" ++ s) in
+ TextRun . maybe "" (:[]) $ getUnicode font char
+ where
+ getCodepoint = findAttr (elemName ns "w" "char") element
+ getFont = stringToFont =<< findAttr (elemName ns "w" "font") element
+ lowerFromPrivate ('F':xs) = '0':xs
+ lowerFromPrivate xs = xs
+getSymChar _ _ = TextRun ""
+
+stringToFont :: String -> Maybe Font
+stringToFont "Symbol" = Just Symbol
+stringToFont _ = Nothing
+
+elemToRunElems :: NameSpaces -> Element -> D [RunElem]
+elemToRunElems ns element
+ | isElem ns "w" "r" element
+ || isElem ns "m" "r" element = do
+ let qualName = elemName ns "w"
+ let font = do
+ fontElem <- findElement (qualName "rFonts") element
+ stringToFont =<<
+ (foldr (<|>) Nothing $
+ map (flip findAttr fontElem . qualName) ["ascii", "hAnsi"])
+ local (setFont font) (mapD (elemToRunElem ns) (elChildren element))
+elemToRunElems _ _ = throwError WrongElem
+
+setFont :: Maybe Font -> ReaderEnv -> ReaderEnv
+setFont f s = s{envFont = f}
diff --git a/src/Text/Pandoc/Readers/Docx/Reducible.hs b/src/Text/Pandoc/Readers/Docx/Reducible.hs
new file mode 100644
index 000000000..8269ca88d
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Docx/Reducible.hs
@@ -0,0 +1,182 @@
+{-# LANGUAGE TypeSynonymInstances, FlexibleInstances,
+ PatternGuards #-}
+
+module Text.Pandoc.Readers.Docx.Reducible ( concatReduce
+ , (<+>)
+ )
+ where
+
+
+import Text.Pandoc.Builder
+import Data.Monoid
+import Data.List
+import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr)
+import qualified Data.Sequence as Seq (null)
+
+data Modifier a = Modifier (a -> a)
+ | AttrModifier (Attr -> a -> a) Attr
+ | NullModifier
+
+class (Eq a) => Modifiable a where
+ modifier :: a -> Modifier a
+ innards :: a -> a
+ getL :: a -> (a, a)
+ getR :: a -> (a, a)
+ spaceOut :: a -> (a, a, a)
+
+spaceOutL :: (Monoid a, Modifiable a) => a -> (a, a)
+spaceOutL ms = (l, stack fs (m' <> r))
+ where (l, m, r) = spaceOut ms
+ (fs, m') = unstack m
+
+spaceOutR :: (Monoid a, Modifiable a) => a -> (a, a)
+spaceOutR ms = (stack fs (l <> m'), r)
+ where (l, m, r) = spaceOut ms
+ (fs, m') = unstack m
+
+instance (Monoid a, Show a) => Show (Modifier a) where
+ show (Modifier f) = show $ f mempty
+ show (AttrModifier f attr) = show $ f attr mempty
+ show (NullModifier) = "NullModifier"
+
+instance (Monoid a, Eq a) => Eq (Modifier a) where
+ (Modifier f) == (Modifier g) = (f mempty == g mempty)
+ (AttrModifier f attr) == (AttrModifier g attr') = (f attr mempty == g attr' mempty)
+ (NullModifier) == (NullModifier) = True
+ _ == _ = False
+
+instance Modifiable Inlines where
+ modifier ils = case viewl (unMany ils) of
+ (x :< xs) | Seq.null xs -> case x of
+ (Emph _) -> Modifier emph
+ (Strong _) -> Modifier strong
+ (SmallCaps _) -> Modifier smallcaps
+ (Strikeout _) -> Modifier strikeout
+ (Superscript _) -> Modifier superscript
+ (Subscript _) -> Modifier subscript
+ (Span attr _) -> AttrModifier spanWith attr
+ _ -> NullModifier
+ _ -> NullModifier
+
+ innards ils = case viewl (unMany ils) of
+ (x :< xs) | Seq.null xs -> case x of
+ (Emph lst) -> fromList lst
+ (Strong lst) -> fromList lst
+ (SmallCaps lst) -> fromList lst
+ (Strikeout lst) -> fromList lst
+ (Superscript lst) -> fromList lst
+ (Subscript lst) -> fromList lst
+ (Span _ lst) -> fromList lst
+ _ -> ils
+ _ -> ils
+
+ getL ils = case viewl $ unMany ils of
+ (s :< sq) -> (singleton s, Many sq)
+ _ -> (mempty, ils)
+
+ getR ils = case viewr $ unMany ils of
+ (sq :> s) -> (Many sq, singleton s)
+ _ -> (ils, mempty)
+
+ spaceOut ils =
+ let (fs, ils') = unstack ils
+ contents = unMany ils'
+ left = case viewl contents of
+ (Space :< _) -> space
+ _ -> mempty
+ right = case viewr contents of
+ (_ :> Space) -> space
+ _ -> mempty in
+ (left, (stack fs $ trimInlines .Many $ contents), right)
+
+instance Modifiable Blocks where
+ modifier blks = case viewl (unMany blks) of
+ (x :< xs) | Seq.null xs -> case x of
+ (BlockQuote _) -> Modifier blockQuote
+ -- (Div attr _) -> AttrModifier divWith attr
+ _ -> NullModifier
+ _ -> NullModifier
+
+ innards blks = case viewl (unMany blks) of
+ (x :< xs) | Seq.null xs -> case x of
+ (BlockQuote lst) -> fromList lst
+ -- (Div attr lst) -> fromList lst
+ _ -> blks
+ _ -> blks
+
+ spaceOut blks = (mempty, blks, mempty)
+
+ getL ils = case viewl $ unMany ils of
+ (s :< sq) -> (singleton s, Many sq)
+ _ -> (mempty, ils)
+
+ getR ils = case viewr $ unMany ils of
+ (sq :> s) -> (Many sq, singleton s)
+ _ -> (ils, mempty)
+
+
+unstack :: (Modifiable a) => a -> ([Modifier a], a)
+unstack ms = case modifier ms of
+ NullModifier -> ([], ms)
+ _ -> (f : fs, ms') where
+ f = modifier ms
+ (fs, ms') = unstack $ innards ms
+
+stack :: (Monoid a, Modifiable a) => [Modifier a] -> a -> a
+stack [] ms = ms
+stack (NullModifier : fs) ms = stack fs ms
+stack ((Modifier f) : fs) ms =
+ if isEmpty ms
+ then stack fs ms
+ else f $ stack fs ms
+stack ((AttrModifier f attr) : fs) ms = f attr $ stack fs ms
+
+isEmpty :: (Monoid a, Eq a) => a -> Bool
+isEmpty x = x == mempty
+
+
+combine :: (Monoid a, Modifiable a, Eq a) => a -> a -> a
+combine x y =
+ let (xs', x') = getR x
+ (y', ys') = getL y
+ in
+ xs' <> (combineSingleton x' y') <> ys'
+
+isAttrModifier :: Modifier a -> Bool
+isAttrModifier (AttrModifier _ _) = True
+isAttrModifier _ = False
+
+combineSingleton :: (Monoid a, Modifiable a, Eq a) => a -> a -> a
+combineSingleton x y =
+ let (xfs, xs) = unstack x
+ (yfs, ys) = unstack y
+ shared = xfs `intersect` yfs
+ x_remaining = xfs \\ shared
+ y_remaining = yfs \\ shared
+ x_rem_attr = filter isAttrModifier x_remaining
+ y_rem_attr = filter isAttrModifier y_remaining
+ in
+ case null shared of
+ True | isEmpty xs && isEmpty ys ->
+ stack (x_rem_attr ++ y_rem_attr) mempty
+ | isEmpty xs ->
+ let (sp, y') = spaceOutL y in
+ (stack x_rem_attr mempty) <> sp <> y'
+ | isEmpty ys ->
+ let (x', sp) = spaceOutR x in
+ x' <> sp <> (stack y_rem_attr mempty)
+ | otherwise ->
+ let (x', xsp) = spaceOutR x
+ (ysp, y') = spaceOutL y
+ in
+ x' <> xsp <> ysp <> y'
+ False -> stack shared $
+ combine
+ (stack x_remaining xs)
+ (stack y_remaining ys)
+
+(<+>) :: (Monoid a, Modifiable a, Eq a) => a -> a -> a
+x <+> y = combine x y
+
+concatReduce :: (Monoid a, Modifiable a) => [a] -> a
+concatReduce xs = foldl combine mempty xs
diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs
new file mode 100644
index 000000000..b061d8683
--- /dev/null
+++ b/src/Text/Pandoc/Readers/EPUB.hs
@@ -0,0 +1,283 @@
+{-# LANGUAGE
+ ViewPatterns
+ , StandaloneDeriving
+ , TupleSections
+ , FlexibleContexts #-}
+
+module Text.Pandoc.Readers.EPUB
+ (readEPUB)
+ where
+
+import Text.XML.Light
+import Text.Pandoc.Definition hiding (Attr)
+import Text.Pandoc.Walk (walk, query)
+import Text.Pandoc.Readers.HTML (readHtml)
+import Text.Pandoc.Options ( ReaderOptions(..), readerTrace)
+import Text.Pandoc.Shared (escapeURI, collapseFilePath, addMetaField)
+import Text.Pandoc.MediaBag (MediaBag, insertMedia)
+import Text.Pandoc.Compat.Except (MonadError, throwError, runExcept, Except)
+import Text.Pandoc.MIME (MimeType)
+import qualified Text.Pandoc.Builder as B
+import Codec.Archive.Zip ( Archive (..), toArchive, fromEntry
+ , findEntryByPath, Entry)
+import qualified Data.ByteString.Lazy as BL (ByteString)
+import System.FilePath ( takeFileName, (</>), dropFileName, normalise
+ , dropFileName
+ , splitFileName )
+import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy)
+import Control.Applicative ((<$>))
+import Control.Monad (guard, liftM, when)
+import Data.Monoid (mempty, (<>))
+import Data.List (isPrefixOf, isInfixOf)
+import Data.Maybe (mapMaybe, fromMaybe)
+import qualified Data.Map as M (Map, lookup, fromList, elems)
+import Control.DeepSeq.Generics (deepseq, NFData)
+
+import Debug.Trace (trace)
+
+type Items = M.Map String (FilePath, MimeType)
+
+readEPUB :: ReaderOptions -> BL.ByteString -> (Pandoc, MediaBag)
+readEPUB opts bytes = runEPUB (archiveToEPUB opts $ toArchive bytes)
+
+runEPUB :: Except String a -> a
+runEPUB = either error id . runExcept
+
+-- Note that internal reference are aggresively normalised so that all ids
+-- are of the form "filename#id"
+--
+archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag)
+archiveToEPUB os archive = do
+ -- root is path to folder with manifest file in
+ (root, content) <- getManifest archive
+ meta <- parseMeta content
+ (cover, items) <- parseManifest content
+ -- No need to collapse here as the image path is from the manifest file
+ let coverDoc = fromMaybe mempty (imageToPandoc <$> cover)
+ spine <- parseSpine items content
+ let escapedSpine = map (escapeURI . takeFileName . fst) spine
+ Pandoc _ bs <-
+ foldM' (\a b -> ((a <>) . walk (prependHash escapedSpine))
+ `liftM` parseSpineElem root b) mempty spine
+ let ast = coverDoc <> (Pandoc meta bs)
+ let mediaBag = fetchImages (M.elems items) root archive ast
+ return $ (ast, mediaBag)
+ where
+ os' = os {readerParseRaw = True}
+ parseSpineElem :: MonadError String m => FilePath -> (FilePath, MimeType) -> m Pandoc
+ parseSpineElem (normalise -> r) (normalise -> path, mime) = do
+ when (readerTrace os) (traceM path)
+ doc <- mimeToReader mime r path
+ let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty
+ return $ docSpan <> doc
+ mimeToReader :: MonadError String m => MimeType -> FilePath -> FilePath -> m Pandoc
+ mimeToReader "application/xhtml+xml" (normalise -> root) (normalise -> path) = do
+ fname <- findEntryByPathE (root </> path) archive
+ return $ fixInternalReferences path .
+ readHtml os' .
+ UTF8.toStringLazy $
+ fromEntry fname
+ mimeToReader s _ path
+ | s `elem` imageMimes = return $ imageToPandoc path
+ | otherwise = return $ mempty
+
+-- paths should be absolute when this function is called
+-- renameImages should do this
+fetchImages :: [(FilePath, MimeType)]
+ -> FilePath -- ^ Root
+ -> Archive
+ -> Pandoc
+ -> MediaBag
+fetchImages mimes root arc (query iq -> links) =
+ foldr (uncurry3 insertMedia) mempty
+ (mapMaybe getEntry links)
+ where
+ getEntry link =
+ let abslink = normalise (root </> link) in
+ (link , lookup link mimes, ) . fromEntry
+ <$> findEntryByPath abslink arc
+
+iq :: Inline -> [FilePath]
+iq (Image _ (url, _)) = [url]
+iq _ = []
+
+-- Remove relative paths
+renameImages :: FilePath -> Inline -> Inline
+renameImages root (Image a (url, b)) = Image a (collapseFilePath (root </> url), b)
+renameImages _ x = x
+
+imageToPandoc :: FilePath -> Pandoc
+imageToPandoc s = B.doc . B.para $ B.image s "" mempty
+
+imageMimes :: [MimeType]
+imageMimes = ["image/gif", "image/jpeg", "image/png"]
+
+type CoverImage = FilePath
+
+parseManifest :: (MonadError String m) => Element -> m (Maybe CoverImage, Items)
+parseManifest content = do
+ manifest <- findElementE (dfName "manifest") content
+ let items = findChildren (dfName "item") manifest
+ r <- mapM parseItem items
+ let cover = findAttr (emptyName "href") =<< filterChild findCover manifest
+ return (cover, (M.fromList r))
+ where
+ findCover e = maybe False (isInfixOf "cover-image")
+ (findAttr (emptyName "properties") e)
+ parseItem e = do
+ uid <- findAttrE (emptyName "id") e
+ href <- findAttrE (emptyName "href") e
+ mime <- findAttrE (emptyName "media-type") e
+ return (uid, (href, mime))
+
+parseSpine :: MonadError String m => Items -> Element -> m [(FilePath, MimeType)]
+parseSpine is e = do
+ spine <- findElementE (dfName "spine") e
+ let itemRefs = findChildren (dfName "itemref") spine
+ mapM (mkE "parseSpine" . (flip M.lookup is)) $ mapMaybe parseItemRef itemRefs
+ where
+ parseItemRef ref = do
+ let linear = maybe True (== "yes") (findAttr (emptyName "linear") ref)
+ guard linear
+ findAttr (emptyName "idref") ref
+
+parseMeta :: MonadError String m => Element -> m Meta
+parseMeta content = do
+ meta <- findElementE (dfName "metadata") content
+ let dcspace (QName _ (Just "http://purl.org/dc/elements/1.1/") (Just "dc")) = True
+ dcspace _ = False
+ let dcs = filterChildrenName dcspace meta
+ let r = foldr parseMetaItem nullMeta dcs
+ return r
+
+-- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem
+parseMetaItem :: Element -> Meta -> Meta
+parseMetaItem e@(stripNamespace . elName -> field) meta =
+ addMetaField (renameMeta field) (B.str $ strContent e) meta
+
+renameMeta :: String -> String
+renameMeta "creator" = "author"
+renameMeta s = s
+
+getManifest :: MonadError String m => Archive -> m (String, Element)
+getManifest archive = do
+ metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive
+ docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry
+ let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
+ ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces)
+ as <- liftM ((map attrToPair) . elAttribs)
+ (findElementE (QName "rootfile" (Just ns) Nothing) docElem)
+ manifestFile <- mkE "Root not found" (lookup "full-path" as)
+ let rootdir = dropFileName manifestFile
+ --mime <- lookup "media-type" as
+ manifest <- findEntryByPathE manifestFile archive
+ liftM ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest)
+
+-- Fixup
+
+fixInternalReferences :: FilePath -> Pandoc -> Pandoc
+fixInternalReferences pathToFile =
+ (walk $ renameImages root)
+ . (walk normalisePath)
+ . (walk $ fixBlockIRs filename)
+ . (walk $ fixInlineIRs filename)
+ where
+ (root, escapeURI -> filename) = splitFileName pathToFile
+
+fixInlineIRs :: String -> Inline -> Inline
+fixInlineIRs s (Span as v) =
+ Span (fixAttrs s as) v
+fixInlineIRs s (Code as code) =
+ Code (fixAttrs s as) code
+fixInlineIRs s (Link t ('#':url, tit)) =
+ Link t (addHash s url, tit)
+fixInlineIRs _ v = v
+
+normalisePath :: Inline -> Inline
+normalisePath (Link t (url, tit)) =
+ let (path, uid) = span (/= '#') url in
+ Link t (takeFileName path ++ uid, tit)
+normalisePath s = s
+
+prependHash :: [String] -> Inline -> Inline
+prependHash ps l@(Link is (url, tit))
+ | or [s `isPrefixOf` url | s <- ps] =
+ Link is ('#':url, tit)
+ | otherwise = l
+prependHash _ i = i
+
+fixBlockIRs :: String -> Block -> Block
+fixBlockIRs s (Div as b) =
+ Div (fixAttrs s as) b
+fixBlockIRs s (Header i as b) =
+ Header i (fixAttrs s as) b
+fixBlockIRs s (CodeBlock as code) =
+ CodeBlock (fixAttrs s as) code
+fixBlockIRs _ b = b
+
+fixAttrs :: FilePath -> B.Attr -> B.Attr
+fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . null) cs, removeEPUBAttrs kvs)
+
+addHash :: String -> String -> String
+addHash _ "" = ""
+addHash s ident = s ++ "#" ++ ident
+
+removeEPUBAttrs :: [(String, String)] -> [(String, String)]
+removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs
+
+isEPUBAttr :: (String, String) -> Bool
+isEPUBAttr (k, _) = "epub:" `isPrefixOf` k
+
+-- Library
+
+-- Strict version of foldM
+foldM' :: (Monad m, NFData a) => (a -> b -> m a) -> a -> [b] -> m a
+foldM' _ z [] = return z
+foldM' f z (x:xs) = do
+ z' <- f z x
+ z' `deepseq` foldM' f z' xs
+
+uncurry3 :: (a -> b -> c -> d) -> (a, b, c) -> d
+uncurry3 f (a, b, c) = f a b c
+
+traceM :: Monad m => String -> m ()
+traceM = flip trace (return ())
+
+-- Utility
+
+stripNamespace :: QName -> String
+stripNamespace (QName v _ _) = v
+
+attrToNSPair :: Attr -> Maybe (String, String)
+attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val)
+attrToNSPair _ = Nothing
+
+attrToPair :: Attr -> (String, String)
+attrToPair (Attr (QName name _ _) val) = (name, val)
+
+defaultNameSpace :: Maybe String
+defaultNameSpace = Just "http://www.idpf.org/2007/opf"
+
+dfName :: String -> QName
+dfName s = QName s defaultNameSpace Nothing
+
+emptyName :: String -> QName
+emptyName s = QName s Nothing Nothing
+
+-- Convert Maybe interface to Either
+
+findAttrE :: MonadError String m => QName -> Element -> m String
+findAttrE q e = mkE "findAttr" $ findAttr q e
+
+findEntryByPathE :: MonadError String m => FilePath -> Archive -> m Entry
+findEntryByPathE (normalise -> path) a =
+ mkE ("No entry on path: " ++ path) $ findEntryByPath path a
+
+parseXMLDocE :: MonadError String m => String -> m Element
+parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc
+
+findElementE :: MonadError String m => QName -> Element -> m Element
+findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x
+
+mkE :: MonadError String m => String -> Maybe a -> m a
+mkE s = maybe (throwError s) return
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index d1e4d0024..2a23f2a62 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -1,5 +1,6 @@
+{-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses #-}
{-
-Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu>
+Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -18,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Text.Pandoc.Readers.HTML
- Copyright : Copyright (C) 2006-2010 John MacFarlane
+ Copyright : Copyright (C) 2006-2014 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
@@ -40,58 +41,102 @@ import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match
import Text.Pandoc.Definition
import qualified Text.Pandoc.Builder as B
-import Text.Pandoc.Shared
-import Text.Pandoc.Options
-import Text.Pandoc.Parsing
-import Data.Maybe ( fromMaybe, isJust )
-import Data.List ( intercalate )
+import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..))
+import Text.Pandoc.Shared ( extractSpaces, renderTags'
+ , escapeURI, safeRead )
+import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace)
+ , Extension (Ext_epub_html_exts,
+ Ext_native_divs, Ext_native_spans))
+import Text.Pandoc.Parsing hiding ((<|>))
+import Text.Pandoc.Walk
+import Data.Maybe ( fromMaybe, isJust)
+import Data.List ( intercalate, isInfixOf )
import Data.Char ( isDigit )
-import Control.Monad ( liftM, guard, when, mzero )
-import Control.Applicative ( (<$>), (<$), (<*) )
+import Control.Monad ( liftM, guard, when, mzero, void, unless )
+import Control.Arrow ((***))
+import Control.Applicative ( (<$>), (<$), (<*), (*>), (<|>))
+import Data.Monoid (mconcat, Monoid, mempty, (<>), First (..))
+import Text.Printf (printf)
+import Debug.Trace (trace)
+import Text.TeXMath (readMathML, writeTeX)
+import Data.Default (Default (..), def)
+import Control.Monad.Reader (Reader,ask, asks, local, runReader)
-isSpace :: Char -> Bool
-isSpace ' ' = True
-isSpace '\t' = True
-isSpace '\n' = True
-isSpace _ = False
-- | Convert HTML-formatted string to 'Pandoc' document.
readHtml :: ReaderOptions -- ^ Reader options
-> String -- ^ String to parse (assumes @'\n'@ line endings)
-> Pandoc
readHtml opts inp =
- case runParser parseDoc def{ stateOptions = opts } "source" tags of
+ case flip runReader def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } []) "source" tags of
Left err' -> error $ "\nError at " ++ show err'
Right result -> result
- where tags = canonicalizeTags $
+ where tags = stripPrefixes . canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True } inp
parseDoc = do
- blocks <- (fixPlains False . concat) <$> manyTill block eof
- meta <- stateMeta <$> getState
- return $ Pandoc meta blocks
+ blocks <- (fixPlains False) . mconcat <$> manyTill block eof
+ meta <- stateMeta . parserState <$> getState
+ bs' <- replaceNotes (B.toList blocks)
+ return $ Pandoc meta bs'
+
+replaceNotes :: [Block] -> TagParser [Block]
+replaceNotes = walkM replaceNotes'
+
+replaceNotes' :: Inline -> TagParser Inline
+replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes
+ where
+ getNotes = noteTable <$> getState
+replaceNotes' x = return x
+
+data HTMLState =
+ HTMLState
+ { parserState :: ParserState,
+ noteTable :: [(String, Blocks)]
+ }
-type TagParser = Parser [Tag String] ParserState
+data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
+ , inChapter :: Bool -- ^ Set if in chapter section
+ , inPlain :: Bool -- ^ Set if in pPlain
+ }
-pBody :: TagParser [Block]
+setInChapter :: HTMLParser s a -> HTMLParser s a
+setInChapter = local (\s -> s {inChapter = True})
+
+setInPlain :: HTMLParser s a -> HTMLParser s a
+setInPlain = local (\s -> s {inPlain = True})
+
+type HTMLParser s = ParserT s HTMLState (Reader HTMLLocal)
+
+type TagParser = HTMLParser [Tag String]
+
+pBody :: TagParser Blocks
pBody = pInTags "body" block
-pHead :: TagParser [Block]
-pHead = pInTags "head" $ pTitle <|> pMetaTag <|> ([] <$ pAnyTag)
- where pTitle = pInTags "title" inline >>= setTitle . normalizeSpaces
- setTitle t = [] <$ (updateState $ B.setMeta "title" (B.fromList t))
+pHead :: TagParser Blocks
+pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag)
+ where pTitle = pInTags "title" inline >>= setTitle . trimInlines
+ setTitle t = mempty <$ (updateState $ B.setMeta "title" t)
pMetaTag = do
mt <- pSatisfy (~== TagOpen "meta" [])
let name = fromAttrib "name" mt
if null name
- then return []
+ then return mempty
else do
let content = fromAttrib "content" mt
updateState $ B.setMeta name (B.text content)
- return []
+ return mempty
-block :: TagParser [Block]
-block = choice
- [ pPara
+block :: TagParser Blocks
+block = do
+ tr <- getOption readerTrace
+ pos <- getPosition
+ res <- choice
+ [ eSection
+ , eSwitch B.para block
+ , mempty <$ eFootnote
+ , mempty <$ eTOC
+ , mempty <$ eTitlePage
+ , pPara
, pHeader
, pBlockQuote
, pCodeBlock
@@ -100,15 +145,76 @@ block = choice
, pTable
, pHead
, pBody
- , pPlain
, pDiv
+ , pPlain
, pRawHtmlBlock
]
+ when tr $ trace (printf "line %d: %s" (sourceLine pos)
+ (take 60 $ show $ B.toList res)) (return ())
+ return res
+
+namespaces :: [(String, TagParser Inlines)]
+namespaces = [(mathMLNamespace, pMath True)]
+
+mathMLNamespace :: String
+mathMLNamespace = "http://www.w3.org/1998/Math/MathML"
+
+eSwitch :: Monoid a => (Inlines -> a) -> TagParser a -> TagParser a
+eSwitch constructor parser = try $ do
+ guardEnabled Ext_epub_html_exts
+ pSatisfy (~== TagOpen "switch" [])
+ cases <- getFirst . mconcat <$>
+ manyTill (First <$> (eCase <* skipMany pBlank) )
+ (lookAhead $ try $ pSatisfy (~== TagOpen "default" []))
+ skipMany pBlank
+ fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank)
+ skipMany pBlank
+ pSatisfy (~== TagClose "switch")
+ return $ maybe fallback constructor cases
-pList :: TagParser [Block]
+eCase :: TagParser (Maybe Inlines)
+eCase = do
+ skipMany pBlank
+ TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" [])
+ case (flip lookup namespaces) =<< lookup "required-namespace" attr of
+ Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank))
+ Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case"))
+
+eFootnote :: TagParser ()
+eFootnote = try $ do
+ let notes = ["footnote", "rearnote"]
+ guardEnabled Ext_epub_html_exts
+ (TagOpen tag attr) <- lookAhead $ pAnyTag
+ guard (maybe False (flip elem notes) (lookup "type" attr))
+ let ident = fromMaybe "" (lookup "id" attr)
+ content <- pInTags tag block
+ addNote ident content
+
+addNote :: String -> Blocks -> TagParser ()
+addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)})
+
+eNoteref :: TagParser Inlines
+eNoteref = try $ do
+ guardEnabled Ext_epub_html_exts
+ TagOpen tag attr <- lookAhead $ pAnyTag
+ guard (maybe False (== "noteref") (lookup "type" attr))
+ let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr)
+ guard (not (null ident))
+ pInTags tag block
+ return $ B.rawInline "noteref" ident
+
+-- Strip TOC if there is one, better to generate again
+eTOC :: TagParser ()
+eTOC = try $ do
+ guardEnabled Ext_epub_html_exts
+ (TagOpen tag attr) <- lookAhead $ pAnyTag
+ guard (maybe False (== "toc") (lookup "type" attr))
+ void (pInTags tag block)
+
+pList :: TagParser Blocks
pList = pBulletList <|> pOrderedList <|> pDefinitionList
-pBulletList :: TagParser [Block]
+pBulletList :: TagParser Blocks
pBulletList = try $ do
pSatisfy (~== TagOpen "ul" [])
let nonItem = pSatisfy (\t ->
@@ -117,10 +223,16 @@ pBulletList = try $ do
-- note: if they have an <ol> or <ul> not in scope of a <li>,
-- treat it as a list item, though it's not valid xhtml...
skipMany nonItem
- items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul")
- return [BulletList $ map (fixPlains True) items]
+ items <- manyTill (pListItem nonItem) (pCloses "ul")
+ return $ B.bulletList $ map (fixPlains True) items
-pOrderedList :: TagParser [Block]
+pListItem :: TagParser a -> TagParser Blocks
+pListItem nonItem = do
+ TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" [])
+ let liDiv = maybe mempty (\x -> B.divWith (x, [], []) mempty) (lookup "id" attr)
+ (liDiv <>) <$> pInTags "li" block <* skipMany nonItem
+
+pOrderedList :: TagParser Blocks
pOrderedList = try $ do
TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" [])
let (start, style) = (sta', sty')
@@ -145,28 +257,28 @@ pOrderedList = try $ do
-- note: if they have an <ol> or <ul> not in scope of a <li>,
-- treat it as a list item, though it's not valid xhtml...
skipMany nonItem
- items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol")
- return [OrderedList (start, style, DefaultDelim) $ map (fixPlains True) items]
+ items <- manyTill (pListItem nonItem) (pCloses "ol")
+ return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items
-pDefinitionList :: TagParser [Block]
+pDefinitionList :: TagParser Blocks
pDefinitionList = try $ do
pSatisfy (~== TagOpen "dl" [])
items <- manyTill pDefListItem (pCloses "dl")
- return [DefinitionList items]
+ return $ B.definitionList items
-pDefListItem :: TagParser ([Inline],[[Block]])
+pDefListItem :: TagParser (Inlines, [Blocks])
pDefListItem = try $ do
let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) &&
not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl"))
terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline)
defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block)
skipMany nonItem
- let term = intercalate [LineBreak] terms
+ let term = foldl1 (\x y -> x <> B.linebreak <> y) terms
return (term, map (fixPlains True) defs)
-fixPlains :: Bool -> [Block] -> [Block]
-fixPlains inList bs = if any isParaish bs
- then map plainToPara bs
+fixPlains :: Bool -> Blocks -> Blocks
+fixPlains inList bs = if any isParaish bs'
+ then B.fromList $ map plainToPara bs'
else bs
where isParaish (Para _) = True
isParaish (CodeBlock _ _) = True
@@ -178,29 +290,30 @@ fixPlains inList bs = if any isParaish bs
isParaish _ = False
plainToPara (Plain xs) = Para xs
plainToPara x = x
+ bs' = B.toList bs
pRawTag :: TagParser String
pRawTag = do
tag <- pAnyTag
- let ignorable x = x `elem` ["html","head","body"]
+ let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"]
if tagOpen ignorable (const True) tag || tagClose ignorable tag
then return []
else return $ renderTags' [tag]
-pDiv :: TagParser [Block]
+pDiv :: TagParser Blocks
pDiv = try $ do
- getOption readerParseRaw >>= guard
+ guardEnabled Ext_native_divs
TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="div") (const True)
contents <- pInTags "div" block
- return [Div (mkAttr attr) contents]
+ return $ B.divWith (mkAttr attr) contents
-pRawHtmlBlock :: TagParser [Block]
+pRawHtmlBlock :: TagParser Blocks
pRawHtmlBlock = do
raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag
parseRaw <- getOption readerParseRaw
if parseRaw && not (null raw)
- then return [RawBlock (Format "html") raw]
- else return []
+ then return $ B.rawBlock "html" raw
+ else return mempty
pHtmlBlock :: String -> TagParser String
pHtmlBlock t = try $ do
@@ -208,32 +321,57 @@ pHtmlBlock t = try $ do
contents <- manyTill pAnyTag (pSatisfy (~== TagClose t))
return $ renderTags' $ [open] ++ contents ++ [TagClose t]
-pHeader :: TagParser [Block]
+-- Sets chapter context
+eSection :: TagParser Blocks
+eSection = try $ do
+ let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as)
+ let sectTag = tagOpen (`elem` sectioningContent) matchChapter
+ TagOpen tag _ <- lookAhead $ pSatisfy sectTag
+ setInChapter (pInTags tag block)
+
+headerLevel :: String -> TagParser Int
+headerLevel tagtype = do
+ let level = read (drop 1 tagtype)
+ (try $ do
+ guardEnabled Ext_epub_html_exts
+ asks inChapter >>= guard
+ return (level - 1))
+ <|>
+ return level
+
+eTitlePage :: TagParser ()
+eTitlePage = try $ do
+ let isTitlePage as = maybe False (isInfixOf "titlepage") (lookup "type" as)
+ let groupTag = tagOpen (\x -> x `elem` groupingContent || x == "section")
+ isTitlePage
+ TagOpen tag _ <- lookAhead $ pSatisfy groupTag
+ () <$ pInTags tag block
+
+pHeader :: TagParser Blocks
pHeader = try $ do
TagOpen tagtype attr <- pSatisfy $
tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])
(const True)
let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")]
- let level = read (drop 1 tagtype)
- contents <- liftM concat $ manyTill inline (pCloses tagtype <|> eof)
+ level <- headerLevel tagtype
+ contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof)
let ident = fromMaybe "" $ lookup "id" attr
let classes = maybe [] words $ lookup "class" attr
let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"]
return $ if bodyTitle
- then [] -- skip a representation of the title in the body
- else [Header level (ident, classes, keyvals) $
- normalizeSpaces contents]
+ then mempty -- skip a representation of the title in the body
+ else B.headerWith (ident, classes, keyvals) level contents
-pHrule :: TagParser [Block]
+pHrule :: TagParser Blocks
pHrule = do
pSelfClosing (=="hr") (const True)
- return [HorizontalRule]
+ return B.horizontalRule
-pTable :: TagParser [Block]
+pTable :: TagParser Blocks
pTable = try $ do
TagOpen _ _ <- pSatisfy (~== TagOpen "table" [])
skipMany pBlank
- caption <- option [] $ pInTags "caption" inline >>~ skipMany pBlank
+ caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
-- TODO actually read these and take width information from them
widths' <- pColgroup <|> many pCol
head' <- option [] $ pOptInTag "thead" $ pInTags "tr" (pCell "th")
@@ -242,26 +380,25 @@ pTable = try $ do
$ many1 $ try $ skipMany pBlank >> pInTags "tr" (pCell "td")
skipMany pBlank
TagClose _ <- pSatisfy (~== TagClose "table")
- let isSinglePlain [] = True
- isSinglePlain [Plain _] = True
- isSinglePlain _ = False
+ let isSinglePlain x = case B.toList x of
+ [Plain _] -> True
+ _ -> False
let isSimple = all isSinglePlain $ concat (head':rows)
- let cols = length $ if null head'
- then head rows
- else head'
+ let cols = length $ if null head' then head rows else head'
-- fail if there are colspans or rowspans
guard $ all (\r -> length r == cols) rows
- let aligns = replicate cols AlignLeft
+ let aligns = replicate cols AlignDefault
let widths = if null widths'
then if isSimple
then replicate cols 0
else replicate cols (1.0 / fromIntegral cols)
else widths'
- return [Table caption aligns widths head' rows]
+ return $ B.table caption (zip aligns widths) head' rows
pCol :: TagParser Double
pCol = try $ do
TagOpen _ attribs <- pSatisfy (~== TagOpen "col" [])
+ skipMany pBlank
optional $ pSatisfy (~== TagClose "col")
skipMany pBlank
return $ case lookup "width" attribs of
@@ -275,35 +412,35 @@ pColgroup = try $ do
skipMany pBlank
manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
-pCell :: String -> TagParser [TableCell]
+pCell :: String -> TagParser [Blocks]
pCell celltype = try $ do
skipMany pBlank
res <- pInTags celltype block
skipMany pBlank
return [res]
-pBlockQuote :: TagParser [Block]
+pBlockQuote :: TagParser Blocks
pBlockQuote = do
contents <- pInTags "blockquote" block
- return [BlockQuote $ fixPlains False contents]
+ return $ B.blockQuote $ fixPlains False contents
-pPlain :: TagParser [Block]
+pPlain :: TagParser Blocks
pPlain = do
- contents <- liftM (normalizeSpaces . concat) $ many1 inline
- if null contents
- then return []
- else return [Plain contents]
+ contents <- setInPlain $ trimInlines . mconcat <$> many1 inline
+ if B.isNull contents
+ then return mempty
+ else return $ B.plain contents
-pPara :: TagParser [Block]
+pPara :: TagParser Blocks
pPara = do
- contents <- pInTags "p" inline
- return [Para $ normalizeSpaces contents]
+ contents <- trimInlines <$> pInTags "p" inline
+ return $ B.para contents
-pCodeBlock :: TagParser [Block]
+pCodeBlock :: TagParser Blocks
pCodeBlock = try $ do
TagOpen _ attr <- pSatisfy (~== TagOpen "pre" [])
contents <- manyTill pAnyTag (pCloses "pre" <|> eof)
- let rawText = concatMap fromTagText $ filter isTagText contents
+ let rawText = concatMap tagToString contents
-- drop leading newline if any
let result' = case rawText of
'\n':xs -> xs
@@ -312,11 +449,18 @@ pCodeBlock = try $ do
let result = case reverse result' of
'\n':_ -> init result'
_ -> result'
- return [CodeBlock (mkAttr attr) result]
+ return $ B.codeBlockWith (mkAttr attr) result
-inline :: TagParser [Inline]
+tagToString :: Tag String -> String
+tagToString (TagText s) = s
+tagToString (TagOpen "br" _) = "\n"
+tagToString _ = ""
+
+inline :: TagParser Inlines
inline = choice
- [ pTagText
+ [ eNoteref
+ , eSwitch id inline
+ , pTagText
, pQ
, pEmph
, pStrong
@@ -328,6 +472,7 @@ inline = choice
, pImage
, pCode
, pSpan
+ , pMath False
, pRawHtmlInline
]
@@ -354,91 +499,130 @@ pSelfClosing f g = do
optional $ pSatisfy (tagClose f)
return open
-pQ :: TagParser [Inline]
+pQ :: TagParser Inlines
pQ = do
- quoteContext <- stateQuoteContext `fmap` getState
- let quoteType = case quoteContext of
+ context <- asks quoteContext
+ let quoteType = case context of
InDoubleQuote -> SingleQuote
_ -> DoubleQuote
let innerQuoteContext = if quoteType == SingleQuote
then InSingleQuote
else InDoubleQuote
- withQuoteContext innerQuoteContext $ pInlinesInTags "q" (Quoted quoteType)
+ let constructor = case quoteType of
+ SingleQuote -> B.singleQuoted
+ DoubleQuote -> B.doubleQuoted
+ withQuoteContext innerQuoteContext $
+ pInlinesInTags "q" constructor
-pEmph :: TagParser [Inline]
-pEmph = pInlinesInTags "em" Emph <|> pInlinesInTags "i" Emph
+pEmph :: TagParser Inlines
+pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph
-pStrong :: TagParser [Inline]
-pStrong = pInlinesInTags "strong" Strong <|> pInlinesInTags "b" Strong
+pStrong :: TagParser Inlines
+pStrong = pInlinesInTags "strong" B.strong <|> pInlinesInTags "b" B.strong
-pSuperscript :: TagParser [Inline]
-pSuperscript = pInlinesInTags "sup" Superscript
+pSuperscript :: TagParser Inlines
+pSuperscript = pInlinesInTags "sup" B.superscript
-pSubscript :: TagParser [Inline]
-pSubscript = pInlinesInTags "sub" Subscript
+pSubscript :: TagParser Inlines
+pSubscript = pInlinesInTags "sub" B.subscript
-pStrikeout :: TagParser [Inline]
+pStrikeout :: TagParser Inlines
pStrikeout = do
- pInlinesInTags "s" Strikeout <|>
- pInlinesInTags "strike" Strikeout <|>
- pInlinesInTags "del" Strikeout <|>
+ pInlinesInTags "s" B.strikeout <|>
+ pInlinesInTags "strike" B.strikeout <|>
+ pInlinesInTags "del" B.strikeout <|>
try (do pSatisfy (~== TagOpen "span" [("class","strikeout")])
- contents <- liftM concat $ manyTill inline (pCloses "span")
- return [Strikeout contents])
+ contents <- mconcat <$> manyTill inline (pCloses "span")
+ return $ B.strikeout contents)
-pLineBreak :: TagParser [Inline]
+pLineBreak :: TagParser Inlines
pLineBreak = do
pSelfClosing (=="br") (const True)
- return [LineBreak]
+ return B.linebreak
+
+pLink :: TagParser Inlines
+pLink = pRelLink <|> pAnchor
-pLink :: TagParser [Inline]
-pLink = try $ do
+pAnchor :: TagParser Inlines
+pAnchor = try $ do
+ tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "id"))
+ return $ B.spanWith (fromAttrib "id" tag , [], []) mempty
+
+pRelLink :: TagParser Inlines
+pRelLink = try $ do
tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href"))
let url = fromAttrib "href" tag
let title = fromAttrib "title" tag
- lab <- liftM concat $ manyTill inline (pCloses "a")
- return [Link (normalizeSpaces lab) (escapeURI url, title)]
-
-pImage :: TagParser [Inline]
+ let uid = fromAttrib "id" tag
+ let spanC = case uid of
+ [] -> id
+ s -> B.spanWith (s, [], [])
+ lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a")
+ return $ spanC $ B.link (escapeURI url) title lab
+
+pImage :: TagParser Inlines
pImage = do
tag <- pSelfClosing (=="img") (isJust . lookup "src")
let url = fromAttrib "src" tag
let title = fromAttrib "title" tag
let alt = fromAttrib "alt" tag
- return [Image (B.toList $ B.text alt) (escapeURI url, title)]
+ return $ B.image (escapeURI url) title (B.text alt)
-pCode :: TagParser [Inline]
+pCode :: TagParser Inlines
pCode = try $ do
(TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
result <- manyTill pAnyTag (pCloses open)
- return [Code (mkAttr attr) $ intercalate " " $ lines $ innerText result]
+ return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ innerText result
-pSpan :: TagParser [Inline]
+pSpan :: TagParser Inlines
pSpan = try $ do
- getOption readerParseRaw >>= guard
+ guardEnabled Ext_native_spans
TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True)
contents <- pInTags "span" inline
- return [Span (mkAttr attr) contents]
-
-pRawHtmlInline :: TagParser [Inline]
+ let attr' = mkAttr attr
+ return $ case attr' of
+ ("",[],[("style",s)])
+ | filter (`notElem` " \t;") s == "font-variant:small-caps" ->
+ B.smallcaps contents
+ _ -> B.spanWith (mkAttr attr) contents
+
+pRawHtmlInline :: TagParser Inlines
pRawHtmlInline = do
- result <- pSatisfy (tagComment (const True)) <|> pSatisfy isInlineTag
+ inplain <- asks inPlain
+ result <- pSatisfy (tagComment (const True))
+ <|> if inplain
+ then pSatisfy (not . isBlockTag)
+ else pSatisfy isInlineTag
parseRaw <- getOption readerParseRaw
if parseRaw
- then return [RawInline (Format "html") $ renderTags' [result]]
- else return []
-
-pInlinesInTags :: String -> ([Inline] -> Inline)
- -> TagParser [Inline]
-pInlinesInTags tagtype f = do
- contents <- pInTags tagtype inline
- return [f $ normalizeSpaces contents]
-
-pInTags :: String -> TagParser [a]
- -> TagParser [a]
+ then return $ B.rawInline "html" $ renderTags' [result]
+ else return mempty
+
+mathMLToTeXMath :: String -> Either String String
+mathMLToTeXMath s = writeTeX <$> readMathML s
+
+pMath :: Bool -> TagParser Inlines
+pMath inCase = try $ do
+ open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True)
+ unless (inCase) (guard (maybe False (== mathMLNamespace) (lookup "xmlns" attr)))
+ contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math"))
+ let math = mathMLToTeXMath $
+ (renderTags $ [open] ++ contents ++ [TagClose "math"])
+ let constructor =
+ maybe B.math (\x -> if (x == "inline") then B.math else B.displayMath)
+ (lookup "display" attr)
+ return $ either (const mempty)
+ (\x -> if null x then mempty else constructor x) math
+
+pInlinesInTags :: String -> (Inlines -> Inlines)
+ -> TagParser Inlines
+pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline
+
+pInTags :: (Monoid a) => String -> TagParser a
+ -> TagParser a
pInTags tagtype parser = try $ do
pSatisfy (~== TagOpen tagtype [])
- liftM concat $ manyTill parser (pCloses tagtype <|> eof)
+ mconcat <$> manyTill parser (pCloses tagtype <|> eof)
pOptInTag :: String -> TagParser a
-> TagParser a
@@ -454,43 +638,47 @@ pCloses :: String -> TagParser ()
pCloses tagtype = try $ do
t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
case t of
- (TagClose t') | t' == tagtype -> pAnyTag >> return ()
+ (TagClose t') | t' == tagtype -> pAnyTag >> return ()
(TagOpen t' _) | t' `closes` tagtype -> return ()
(TagClose "ul") | tagtype == "li" -> return ()
(TagClose "ol") | tagtype == "li" -> return ()
(TagClose "dl") | tagtype == "li" -> return ()
+ (TagClose "table") | tagtype == "td" -> return ()
+ (TagClose "table") | tagtype == "tr" -> return ()
_ -> mzero
-pTagText :: TagParser [Inline]
+pTagText :: TagParser Inlines
pTagText = try $ do
(TagText str) <- pSatisfy isTagText
st <- getState
- case runParser (many pTagContents) st "text" str of
+ qu <- ask
+ case flip runReader qu $ runParserT (many pTagContents) st "text" str of
Left _ -> fail $ "Could not parse `" ++ str ++ "'"
- Right result -> return result
+ Right result -> return $ mconcat result
pBlank :: TagParser ()
pBlank = try $ do
(TagText str) <- pSatisfy isTagText
guard $ all isSpace str
-pTagContents :: Parser [Char] ParserState Inline
+type InlinesParser = HTMLParser String
+
+pTagContents :: InlinesParser Inlines
pTagContents =
- Math DisplayMath `fmap` mathDisplay
- <|> Math InlineMath `fmap` mathInline
+ B.displayMath <$> mathDisplay
+ <|> B.math <$> mathInline
<|> pStr
<|> pSpace
<|> smartPunctuation pTagContents
<|> pSymbol
<|> pBad
-pStr :: Parser [Char] ParserState Inline
+pStr :: InlinesParser Inlines
pStr = do
result <- many1 $ satisfy $ \c ->
not (isSpace c) && not (isSpecial c) && not (isBad c)
- pos <- getPosition
- updateState $ \s -> s{ stateLastStrPos = Just pos }
- return $ Str result
+ updateLastStrPos
+ return $ B.str result
isSpecial :: Char -> Bool
isSpecial '"' = True
@@ -504,13 +692,13 @@ isSpecial '\8220' = True
isSpecial '\8221' = True
isSpecial _ = False
-pSymbol :: Parser [Char] ParserState Inline
-pSymbol = satisfy isSpecial >>= return . Str . (:[])
+pSymbol :: InlinesParser Inlines
+pSymbol = satisfy isSpecial >>= return . B.str . (:[])
isBad :: Char -> Bool
isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
-pBad :: Parser [Char] ParserState Inline
+pBad :: InlinesParser Inlines
pBad = do
c <- satisfy isBad
let c' = case c of
@@ -542,18 +730,20 @@ pBad = do
'\158' -> '\382'
'\159' -> '\376'
_ -> '?'
- return $ Str [c']
+ return $ B.str [c']
-pSpace :: Parser [Char] ParserState Inline
-pSpace = many1 (satisfy isSpace) >> return Space
+pSpace :: InlinesParser Inlines
+pSpace = many1 (satisfy isSpace) >> return B.space
--
-- Constants
--
eitherBlockOrInline :: [String]
-eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
- "map", "area", "object"]
+eitherBlockOrInline = ["audio", "applet", "button", "iframe", "embed",
+ "del", "ins",
+ "progress", "map", "area", "noscript", "script",
+ "object", "svg", "video", "source"]
{-
inlineHtmlTags :: [[Char]]
@@ -565,15 +755,17 @@ inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
-}
blockHtmlTags :: [String]
-blockHtmlTags = ["address", "article", "aside", "blockquote", "body", "button", "canvas",
+blockHtmlTags = ["?xml", "!DOCTYPE", "address", "article", "aside",
+ "blockquote", "body", "button", "canvas",
"caption", "center", "col", "colgroup", "dd", "dir", "div",
- "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer",
- "form", "h1", "h2", "h3", "h4",
- "h5", "h6", "head", "header", "hgroup", "hr", "html", "isindex", "map", "menu",
- "noframes", "noscript", "object", "ol", "output", "p", "pre", "progress",
- "section", "table", "tbody", "textarea", "thead", "tfoot", "ul", "dd",
+ "dl", "dt", "fieldset", "figcaption", "figure",
+ "footer", "form", "h1", "h2", "h3", "h4",
+ "h5", "h6", "head", "header", "hgroup", "hr", "html",
+ "isindex", "menu", "noframes", "ol", "output", "p", "pre",
+ "section", "table", "tbody", "textarea",
+ "thead", "tfoot", "ul", "dd",
"dt", "frameset", "li", "tbody", "td", "tfoot",
- "th", "thead", "tr", "script", "style", "svg", "video"]
+ "th", "thead", "tr", "script", "style"]
-- We want to allow raw docbook in markdown documents, so we
-- include docbook block tags here too.
@@ -591,19 +783,26 @@ blockDocBookTags = ["calloutlist", "bibliolist", "glosslist", "itemizedlist",
"classsynopsis", "blockquote", "epigraph", "msgset",
"sidebar", "title"]
+epubTags :: [String]
+epubTags = ["case", "switch", "default"]
+
blockTags :: [String]
-blockTags = blockHtmlTags ++ blockDocBookTags
+blockTags = blockHtmlTags ++ blockDocBookTags ++ epubTags
isInlineTag :: Tag String -> Bool
-isInlineTag t = tagOpen (`notElem` blockTags) (const True) t ||
- tagClose (`notElem` blockTags) t ||
+isInlineTag t = tagOpen isInlineTagName (const True) t ||
+ tagClose isInlineTagName t ||
tagComment (const True) t
+ where isInlineTagName x = x `notElem` blockTags
isBlockTag :: Tag String -> Bool
-isBlockTag t = tagOpen (`elem` blocktags) (const True) t ||
- tagClose (`elem` blocktags) t ||
+isBlockTag t = tagOpen isBlockTagName (const True) t ||
+ tagClose isBlockTagName t ||
tagComment (const True) t
- where blocktags = blockTags ++ eitherBlockOrInline
+ where isBlockTagName ('?':_) = True
+ isBlockTagName ('!':_) = True
+ isBlockTagName x = x `elem` blockTags
+ || x `elem` eitherBlockOrInline
isTextTag :: Tag String -> Bool
isTextTag = tagText (const True)
@@ -612,7 +811,7 @@ isCommentTag :: Tag String -> Bool
isCommentTag = tagComment (const True)
-- taken from HXT and extended
-
+-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags
closes :: String -> String -> Bool
_ `closes` "body" = False
_ `closes` "html" = False
@@ -620,11 +819,18 @@ _ `closes` "html" = False
"li" `closes` "li" = True
"th" `closes` t | t `elem` ["th","td"] = True
"tr" `closes` t | t `elem` ["th","td","tr"] = True
+"dd" `closes` t | t `elem` ["dt", "dd"] = True
"dt" `closes` t | t `elem` ["dt","dd"] = True
-"hr" `closes` "p" = True
-"p" `closes` "p" = True
+"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True
+"optgroup" `closes` "optgroup" = True
+"optgroup" `closes` "option" = True
+"option" `closes` "option" = True
+-- http://www.w3.org/TR/html-markup/p.html
+x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote",
+ "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4",
+ "h5", "h6", "header", "hr", "menu", "nav", "ol", "p", "pre", "section",
+ "table", "ul"] = True
"meta" `closes` "meta" = True
-"colgroup" `closes` "colgroup" = True
"form" `closes` "form" = True
"label" `closes` "label" = True
"map" `closes` "map" = True
@@ -645,19 +851,23 @@ _ `closes` _ = False
--- parsers for use in markdown, textile readers
-- | Matches a stretch of HTML in balanced tags.
-htmlInBalanced :: (Tag String -> Bool) -> Parser [Char] ParserState String
+htmlInBalanced :: (Monad m)
+ => (Tag String -> Bool)
+ -> ParserT String st m String
htmlInBalanced f = try $ do
(TagOpen t _, tag) <- htmlTag f
guard $ '/' `notElem` tag -- not a self-closing tag
let stopper = htmlTag (~== TagClose t)
- let anytag = liftM snd $ htmlTag (const True)
+ let anytag = snd <$> htmlTag (const True)
contents <- many $ notFollowedBy' stopper >>
(htmlInBalanced f <|> anytag <|> count 1 anyChar)
endtag <- liftM snd stopper
return $ tag ++ concat contents ++ endtag
-- | Matches a tag meeting a certain condition.
-htmlTag :: (Tag String -> Bool) -> Parser [Char] st (Tag String, String)
+htmlTag :: Monad m
+ => (Tag String -> Bool)
+ -> ParserT [Char] st m (Tag String, String)
htmlTag f = try $ do
lookAhead $ char '<' >> (oneOf "/!?" <|> letter)
(next : _) <- getInput >>= return . canonicalizeTags . parseTags
@@ -676,6 +886,79 @@ htmlTag f = try $ do
mkAttr :: [(String, String)] -> Attr
mkAttr attr = (attribsId, attribsClasses, attribsKV)
where attribsId = fromMaybe "" $ lookup "id" attr
- attribsClasses = words $ fromMaybe "" $ lookup "class" attr
+ attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
+ epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr
+
+-- Strip namespace prefixes
+stripPrefixes :: [Tag String] -> [Tag String]
+stripPrefixes = map stripPrefix
+
+stripPrefix :: Tag String -> Tag String
+stripPrefix (TagOpen s as) =
+ TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as)
+stripPrefix (TagClose s) = TagClose (stripPrefix' s)
+stripPrefix x = x
+
+stripPrefix' :: String -> String
+stripPrefix' s =
+ case span (/= ':') s of
+ (_, "") -> s
+ (_, (_:ts)) -> ts
+
+isSpace :: Char -> Bool
+isSpace ' ' = True
+isSpace '\t' = True
+isSpace '\n' = True
+isSpace '\r' = True
+isSpace _ = False
+
+-- Instances
+
+-- This signature should be more general
+-- MonadReader HTMLLocal m => HasQuoteContext st m
+instance HasQuoteContext st (Reader HTMLLocal) where
+ getQuoteContext = asks quoteContext
+ withQuoteContext q = local (\s -> s{quoteContext = q})
+instance HasReaderOptions HTMLState where
+ extractReaderOptions = extractReaderOptions . parserState
+
+instance Default HTMLState where
+ def = HTMLState def []
+
+instance HasMeta HTMLState where
+ setMeta s b st = st {parserState = setMeta s b $ parserState st}
+ deleteMeta s st = st {parserState = deleteMeta s $ parserState st}
+
+instance Default HTMLLocal where
+ def = HTMLLocal NoQuote False False
+
+instance HasLastStrPosition HTMLState where
+ setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
+ getLastStrPos = getLastStrPos . parserState
+
+
+-- EPUB Specific
+--
+--
+sectioningContent :: [String]
+sectioningContent = ["article", "aside", "nav", "section"]
+
+
+groupingContent :: [String]
+groupingContent = ["p", "hr", "pre", "blockquote", "ol"
+ , "ul", "li", "dl", "dt", "dt", "dd"
+ , "figure", "figcaption", "div", "main"]
+
+
+{-
+
+types :: [(String, ([String], Int))]
+types = -- Document divisions
+ map (\s -> (s, (["section", "body"], 0)))
+ ["volume", "part", "chapter", "division"]
+ ++ -- Document section and components
+ [
+ ("abstract", ([], 0))]
+-}
diff --git a/src/Text/Pandoc/Readers/Haddock.hs b/src/Text/Pandoc/Readers/Haddock.hs
index 0e74406ef..4b46c869d 100644
--- a/src/Text/Pandoc/Readers/Haddock.hs
+++ b/src/Text/Pandoc/Readers/Haddock.hs
@@ -3,7 +3,8 @@
Copyright : Copyright (C) 2013 David Lazar
License : GNU GPL, version 2 or above
- Maintainer : David Lazar <lazar6@illinois.edu>
+ Maintainer : David Lazar <lazar6@illinois.edu>,
+ John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Conversion of Haddock markup to 'Pandoc' document.
@@ -12,30 +13,126 @@ module Text.Pandoc.Readers.Haddock
( readHaddock
) where
-import Text.Pandoc.Builder
+import Text.Pandoc.Builder (Blocks, Inlines)
+import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Shared (trim, splitBy)
+import Data.Monoid
+import Data.List (intersperse, stripPrefix)
+import Data.Maybe (fromMaybe)
+import Text.Pandoc.Definition
import Text.Pandoc.Options
-import Text.Pandoc.Readers.Haddock.Lex
-import Text.Pandoc.Readers.Haddock.Parse
+import Documentation.Haddock.Parser
+import Documentation.Haddock.Types
+import Debug.Trace (trace)
-- | Parse Haddock markup and return a 'Pandoc' document.
readHaddock :: ReaderOptions -- ^ Reader options
-> String -- ^ String to parse
-> Pandoc
-readHaddock _ s = Pandoc nullMeta blocks
+readHaddock opts = B.doc . docHToBlocks . trace' . parseParas
+ where trace' x = if readerTrace opts
+ then trace (show x) x
+ else x
+
+docHToBlocks :: DocH String Identifier -> Blocks
+docHToBlocks d' =
+ case d' of
+ DocEmpty -> mempty
+ DocAppend (DocParagraph (DocHeader h)) (DocParagraph (DocAName ident)) ->
+ B.headerWith (ident,[],[]) (headerLevel h)
+ (docHToInlines False $ headerTitle h)
+ DocAppend d1 d2 -> mappend (docHToBlocks d1) (docHToBlocks d2)
+ DocString _ -> inlineFallback
+ DocParagraph (DocAName h) -> B.plain $ docHToInlines False $ DocAName h
+ DocParagraph x -> B.para $ docHToInlines False x
+ DocIdentifier _ -> inlineFallback
+ DocIdentifierUnchecked _ -> inlineFallback
+ DocModule s -> B.plain $ docHToInlines False $ DocModule s
+ DocWarning _ -> mempty -- TODO
+ DocEmphasis _ -> inlineFallback
+ DocMonospaced _ -> inlineFallback
+ DocBold _ -> inlineFallback
+ DocHeader h -> B.header (headerLevel h)
+ (docHToInlines False $ headerTitle h)
+ DocUnorderedList items -> B.bulletList (map docHToBlocks items)
+ DocOrderedList items -> B.orderedList (map docHToBlocks items)
+ DocDefList items -> B.definitionList (map (\(d,t) ->
+ (docHToInlines False d,
+ [consolidatePlains $ docHToBlocks t])) items)
+ DocCodeBlock (DocString s) -> B.codeBlockWith ("",[],[]) s
+ DocCodeBlock d -> B.para $ docHToInlines True d
+ DocHyperlink _ -> inlineFallback
+ DocPic _ -> inlineFallback
+ DocAName _ -> inlineFallback
+ DocProperty s -> B.codeBlockWith ("",["property","haskell"],[]) (trim s)
+ DocExamples es -> mconcat $ map (\e ->
+ makeExample ">>>" (exampleExpression e) (exampleResult e)) es
+
+ where inlineFallback = B.plain $ docHToInlines False d'
+ consolidatePlains = B.fromList . consolidatePlains' . B.toList
+ consolidatePlains' zs@(Plain _ : _) =
+ let (xs, ys) = span isPlain zs in
+ Para (concatMap extractContents xs) : consolidatePlains' ys
+ consolidatePlains' (x : xs) = x : consolidatePlains' xs
+ consolidatePlains' [] = []
+ isPlain (Plain _) = True
+ isPlain _ = False
+ extractContents (Plain xs) = xs
+ extractContents _ = []
+
+docHToInlines :: Bool -> DocH String Identifier -> Inlines
+docHToInlines isCode d' =
+ case d' of
+ DocEmpty -> mempty
+ DocAppend d1 d2 -> mappend (docHToInlines isCode d1)
+ (docHToInlines isCode d2)
+ DocString s
+ | isCode -> mconcat $ intersperse B.linebreak
+ $ map B.code $ splitBy (=='\n') s
+ | otherwise -> B.text s
+ DocParagraph _ -> mempty
+ DocIdentifier (_,s,_) -> B.codeWith ("",["haskell","identifier"],[]) s
+ DocIdentifierUnchecked s -> B.codeWith ("",["haskell","identifier"],[]) s
+ DocModule s -> B.codeWith ("",["haskell","module"],[]) s
+ DocWarning _ -> mempty -- TODO
+ DocEmphasis d -> B.emph (docHToInlines isCode d)
+ DocMonospaced (DocString s) -> B.code s
+ DocMonospaced d -> docHToInlines True d
+ DocBold d -> B.strong (docHToInlines isCode d)
+ DocHeader _ -> mempty
+ DocUnorderedList _ -> mempty
+ DocOrderedList _ -> mempty
+ DocDefList _ -> mempty
+ DocCodeBlock _ -> mempty
+ DocHyperlink h -> B.link (hyperlinkUrl h) (hyperlinkUrl h)
+ (maybe (B.text $ hyperlinkUrl h) B.text $ hyperlinkLabel h)
+ DocPic p -> B.image (pictureUri p) (fromMaybe (pictureUri p) $ pictureTitle p)
+ (maybe mempty B.text $ pictureTitle p)
+ DocAName s -> B.spanWith (s,["anchor"],[]) mempty
+ DocProperty _ -> mempty
+ DocExamples _ -> mempty
+
+-- | Create an 'Example', stripping superfluous characters as appropriate
+makeExample :: String -> String -> [String] -> Blocks
+makeExample prompt expression result =
+ B.para $ B.codeWith ("",["prompt"],[]) prompt
+ <> B.space
+ <> B.codeWith ([], ["haskell","expr"], []) (trim expression)
+ <> B.linebreak
+ <> (mconcat $ intersperse B.linebreak $ map coder result')
where
- blocks = case parseParas (tokenise s (0,0)) of
- Left [] -> error "parse failure"
- Left (tok:_) -> error $ "parse failure " ++ pos (tokenPos tok)
- where pos (l, c) = "(line " ++ show l ++ ", column " ++ show c ++ ")"
- Right x -> mergeLists (toList x)
-
--- similar to 'docAppend' in Haddock.Doc
-mergeLists :: [Block] -> [Block]
-mergeLists (BulletList xs : BulletList ys : blocks)
- = mergeLists (BulletList (xs ++ ys) : blocks)
-mergeLists (OrderedList _ xs : OrderedList a ys : blocks)
- = mergeLists (OrderedList a (xs ++ ys) : blocks)
-mergeLists (DefinitionList xs : DefinitionList ys : blocks)
- = mergeLists (DefinitionList (xs ++ ys) : blocks)
-mergeLists (x : blocks) = x : mergeLists blocks
-mergeLists [] = []
+ -- 1. drop trailing whitespace from the prompt, remember the prefix
+ prefix = takeWhile (`elem` " \t") prompt
+
+ -- 2. drop, if possible, the exact same sequence of whitespace
+ -- characters from each result line
+ --
+ -- 3. interpret lines that only contain the string "<BLANKLINE>" as an
+ -- empty line
+ result' = map (substituteBlankLine . tryStripPrefix prefix) result
+ where
+ tryStripPrefix xs ys = fromMaybe ys $ stripPrefix xs ys
+
+ substituteBlankLine "<BLANKLINE>" = ""
+ substituteBlankLine line = line
+ coder = B.codeWith ([], ["result"], [])
diff --git a/src/Text/Pandoc/Readers/Haddock/Lex.x b/src/Text/Pandoc/Readers/Haddock/Lex.x
deleted file mode 100644
index 120e96ebf..000000000
--- a/src/Text/Pandoc/Readers/Haddock/Lex.x
+++ /dev/null
@@ -1,171 +0,0 @@
---
--- Haddock - A Haskell Documentation Tool
---
--- (c) Simon Marlow 2002
---
--- This file was modified and integrated into GHC by David Waern 2006.
--- Then moved back into Haddock by Isaac Dupree in 2009 :-)
--- Then copied into Pandoc by David Lazar in 2013 :-D
-
-{
-{-# LANGUAGE BangPatterns #-} -- Generated by Alex
-{-# OPTIONS -Wwarn -w #-}
--- The above warning supression flag is a temporary kludge.
--- While working on this module you are encouraged to remove it and fix
--- any warnings in the module. See
--- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
--- for details
-
-module Text.Pandoc.Readers.Haddock.Lex (
- Token(..),
- LToken,
- tokenise,
- tokenPos
- ) where
-
-import Data.Char
-import Numeric (readHex)
-}
-
-%wrapper "posn"
-
-$ws = $white # \n
-$digit = [0-9]
-$hexdigit = [0-9a-fA-F]
-$special = [\"\@]
-$alphanum = [A-Za-z0-9]
-$ident = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~\:]
-
-:-
-
--- beginning of a paragraph
-<0,para> {
- $ws* \n ;
- $ws* \> { begin birdtrack }
- $ws* prop \> .* \n { strtoken TokProperty `andBegin` property}
- $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr }
- $ws* [\*\-] { token TokBullet `andBegin` string }
- $ws* \[ { token TokDefStart `andBegin` def }
- $ws* \( $digit+ \) { token TokNumber `andBegin` string }
- $ws* $digit+ \. { token TokNumber `andBegin` string }
- $ws* { begin string }
-}
-
--- beginning of a line
-<line> {
- $ws* \> { begin birdtrack }
- $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr }
-
- $ws* \n { token TokPara `andBegin` para }
- -- ^ Here, we really want to be able to say
- -- $ws* (\n | <eof>) { token TokPara `andBegin` para}
- -- because otherwise a trailing line of whitespace will result in
- -- a spurious TokString at the end of a docstring. We don't have <eof>,
- -- though (NOW I realise what it was for :-). To get around this, we always
- -- append \n to the end of a docstring.
-
- () { begin string }
-}
-
-<birdtrack> .* \n? { strtokenNL TokBirdTrack `andBegin` line }
-
-<property> () { token TokPara `andBegin` para }
-
-<example> {
- $ws* \n { token TokPara `andBegin` para }
- $ws* \>\>\> { strtoken TokExamplePrompt `andBegin` exampleexpr }
- () { begin exampleresult }
-}
-
-<exampleexpr> .* \n { strtokenNL TokExampleExpression `andBegin` example }
-
-<exampleresult> .* \n { strtokenNL TokExampleResult `andBegin` example }
-
-<string,def> {
- $special { strtoken $ \s -> TokSpecial (head s) }
- \<\< [^\>]* \>\> { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) }
- \< [^\>]* \> { strtoken $ \s -> TokURL (init (tail s)) }
- \# [^\#]* \# { strtoken $ \s -> TokAName (init (tail s)) }
- \/ [^\/]* \/ { strtoken $ \s -> TokEmphasis (init (tail s)) }
- [\'\`] $ident+ [\'\`] { strtoken $ \s -> TokIdent (init (tail s)) }
- \\ . { strtoken (TokString . tail) }
- "&#" $digit+ \; { strtoken $ \s -> TokString [chr (read (init (drop 2 s)))] }
- "&#" [xX] $hexdigit+ \;
- { strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] -> TokString [chr n] }
- -- allow special characters through if they don't fit one of the previous
- -- patterns.
- [\/\'\`\<\#\&\\] { strtoken TokString }
- [^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line }
- [^ $special \/ \< \# \n \'\` \& \\ \]]+ { strtoken TokString }
-}
-
-<def> {
- \] { token TokDefEnd `andBegin` string }
-}
-
--- ']' doesn't have any special meaning outside of the [...] at the beginning
--- of a definition paragraph.
-<string> {
- \] { strtoken TokString }
-}
-
-{
--- | A located token
-type LToken = (Token, AlexPosn)
-
-data Token
- = TokPara
- | TokNumber
- | TokBullet
- | TokDefStart
- | TokDefEnd
- | TokSpecial Char
- | TokIdent String
- | TokString String
- | TokURL String
- | TokPic String
- | TokEmphasis String
- | TokAName String
- | TokBirdTrack String
- | TokProperty String
- | TokExamplePrompt String
- | TokExampleExpression String
- | TokExampleResult String
- deriving Show
-
-tokenPos :: LToken -> (Int, Int)
-tokenPos t = let AlexPn _ line col = snd t in (line, col)
-
-type StartCode = Int
-type Action = AlexPosn -> String -> StartCode -> (StartCode -> [LToken]) -> [LToken]
-
-tokenise :: String -> (Int, Int) -> [LToken]
-tokenise str (line, col) = go (posn,'\n',[],eofHack str) para
- where posn = AlexPn 0 line col
- go inp@(pos,_,_,str) sc =
- case alexScan inp sc of
- AlexEOF -> []
- AlexError _ -> []
- AlexSkip inp' len -> go inp' sc
- AlexToken inp' len act -> act pos (take len str) sc (\sc -> go inp' sc)
-
--- NB. we add a final \n to the string, (see comment in the beginning of line
--- production above).
-eofHack str = str++"\n"
-
-andBegin :: Action -> StartCode -> Action
-andBegin act new_sc = \pos str _ cont -> act pos str new_sc cont
-
-token :: Token -> Action
-token t = \pos _ sc cont -> (t, pos) : cont sc
-
-strtoken, strtokenNL :: (String -> Token) -> Action
-strtoken t = \pos str sc cont -> (t str, pos) : cont sc
-strtokenNL t = \pos str sc cont -> (t (filter (/= '\r') str), pos) : cont sc
--- ^ We only want LF line endings in our internal doc string format, so we
--- filter out all CRs.
-
-begin :: StartCode -> Action
-begin sc = \_ _ _ cont -> cont sc
-
-}
diff --git a/src/Text/Pandoc/Readers/Haddock/Parse.y b/src/Text/Pandoc/Readers/Haddock/Parse.y
deleted file mode 100644
index 9c2bbc8a9..000000000
--- a/src/Text/Pandoc/Readers/Haddock/Parse.y
+++ /dev/null
@@ -1,178 +0,0 @@
--- This code was copied from the 'haddock' package, modified, and integrated
--- into Pandoc by David Lazar.
-{
-{-# LANGUAGE BangPatterns #-} -- required for versions of Happy before 1.18.6
--- The above warning supression flag is a temporary kludge.
--- While working on this module you are encouraged to remove it and fix
--- any warnings in the module. See
--- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
--- for details
-
-module Text.Pandoc.Readers.Haddock.Parse (parseString, parseParas) where
-
-import Text.Pandoc.Readers.Haddock.Lex
-import Text.Pandoc.Builder
-import Text.Pandoc.Shared (trim, trimr)
-import Data.Generics (everywhere, mkT)
-import Data.Char (isSpace)
-import Data.Maybe (fromMaybe)
-import Data.List (stripPrefix, intersperse)
-import Data.Monoid (mempty, mconcat)
-}
-
-%expect 0
-
-%tokentype { LToken }
-
-%token
- '/' { (TokSpecial '/',_) }
- '@' { (TokSpecial '@',_) }
- '[' { (TokDefStart,_) }
- ']' { (TokDefEnd,_) }
- DQUO { (TokSpecial '\"',_) }
- URL { (TokURL $$,_) }
- PIC { (TokPic $$,_) }
- ANAME { (TokAName $$,_) }
- '/../' { (TokEmphasis $$,_) }
- '-' { (TokBullet,_) }
- '(n)' { (TokNumber,_) }
- '>..' { (TokBirdTrack $$,_) }
- PROP { (TokProperty $$,_) }
- PROMPT { (TokExamplePrompt $$,_) }
- RESULT { (TokExampleResult $$,_) }
- EXP { (TokExampleExpression $$,_) }
- IDENT { (TokIdent $$,_) }
- PARA { (TokPara,_) }
- STRING { (TokString $$,_) }
-
-%monad { Either [LToken] }
-
-%name parseParas doc
-%name parseString seq
-
-%%
-
-doc :: { Blocks }
- : apara PARA doc { $1 <> $3 }
- | PARA doc { $2 }
- | apara { $1 }
- | {- empty -} { mempty }
-
-apara :: { Blocks }
- : ulpara { bulletList [$1] }
- | olpara { orderedList [$1] }
- | defpara { definitionList [$1] }
- | para { $1 }
-
-ulpara :: { Blocks }
- : '-' para { $2 }
-
-olpara :: { Blocks }
- : '(n)' para { $2 }
-
-defpara :: { (Inlines, [Blocks]) }
- : '[' seq ']' seq { (trimInlines $2, [plain $ trimInlines $4]) }
-
-para :: { Blocks }
- : seq { para' $1 }
- | codepara { codeBlockWith ([], ["haskell"], []) $1 }
- | property { $1 }
- | examples { $1 }
-
-codepara :: { String }
- : '>..' codepara { $1 ++ $2 }
- | '>..' { $1 }
-
-property :: { Blocks }
- : PROP { makeProperty $1 }
-
-examples :: { Blocks }
- : example examples { $1 <> $2 }
- | example { $1 }
-
-example :: { Blocks }
- : PROMPT EXP result { makeExample $1 $2 (lines $3) }
- | PROMPT EXP { makeExample $1 $2 [] }
-
-result :: { String }
- : RESULT result { $1 ++ $2 }
- | RESULT { $1 }
-
-seq :: { Inlines }
- : elem seq { $1 <> $2 }
- | elem { $1 }
-
-elem :: { Inlines }
- : elem1 { $1 }
- | '@' seq1 '@' { monospace $2 }
-
-seq1 :: { Inlines }
- : PARA seq1 { linebreak <> $2 }
- | elem1 seq1 { $1 <> $2 }
- | elem1 { $1 }
-
-elem1 :: { Inlines }
- : STRING { text $1 }
- | '/../' { emph (str $1) }
- | URL { makeHyperlink $1 }
- | PIC { image $1 $1 mempty }
- | ANAME { mempty } -- TODO
- | IDENT { codeWith ([], ["haskell"], []) $1 }
- | DQUO strings DQUO { codeWith ([], ["haskell"], []) $2 }
-
-strings :: { String }
- : STRING { $1 }
- | STRING strings { $1 ++ $2 }
-
-{
-happyError :: [LToken] -> Either [LToken] a
-happyError toks = Left toks
-
-para' :: Inlines -> Blocks
-para' = para . trimInlines
-
-monospace :: Inlines -> Inlines
-monospace = everywhere (mkT go)
- where
- go (Str s) = Code nullAttr s
- go x = x
-
--- | Create a `Hyperlink` from given string.
---
--- A hyperlink consists of a URL and an optional label. The label is separated
--- from the url by one or more whitespace characters.
-makeHyperlink :: String -> Inlines
-makeHyperlink input = case break isSpace $ trim input of
- (url, "") -> link url url (str url)
- (url, lb) -> link url url (trimInlines $ text lb)
-
-makeProperty :: String -> Blocks
-makeProperty s = case trim s of
- 'p':'r':'o':'p':'>':xs ->
- codeBlockWith ([], ["property"], []) (dropWhile isSpace xs)
- xs ->
- error $ "makeProperty: invalid input " ++ show xs
-
--- | Create an 'Example', stripping superfluous characters as appropriate
-makeExample :: String -> String -> [String] -> Blocks
-makeExample prompt expression result =
- para $ codeWith ([], ["haskell","expr"], []) (trim expression)
- <> linebreak
- <> (mconcat $ intersperse linebreak $ map coder result')
- where
- -- 1. drop trailing whitespace from the prompt, remember the prefix
- prefix = takeWhile isSpace prompt
-
- -- 2. drop, if possible, the exact same sequence of whitespace
- -- characters from each result line
- --
- -- 3. interpret lines that only contain the string "<BLANKLINE>" as an
- -- empty line
- result' = map (substituteBlankLine . tryStripPrefix prefix) result
- where
- tryStripPrefix xs ys = fromMaybe ys $ stripPrefix xs ys
-
- substituteBlankLine "<BLANKLINE>" = ""
- substituteBlankLine line = line
- coder = codeWith ([], ["result"], [])
-}
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
index 51271edc5..9420d602f 100644
--- a/src/Text/Pandoc/Readers/LaTeX.hs
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -1,6 +1,6 @@
{-# LANGUAGE ScopedTypeVariables, OverloadedStrings #-}
{-
-Copyright (C) 2006-2012 John MacFarlane <jgm@berkeley.edu>
+Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Text.Pandoc.Readers.LaTeX
- Copyright : Copyright (C) 2006-2012 John MacFarlane
+ Copyright : Copyright (C) 2006-2014 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
@@ -31,6 +31,7 @@ Conversion of LaTeX to 'Pandoc' document.
module Text.Pandoc.Readers.LaTeX ( readLaTeX,
rawLaTeXInline,
rawLaTeXBlock,
+ inlineCommand,
handleIncludes
) where
@@ -42,6 +43,7 @@ import Text.Pandoc.Parsing hiding ((<|>), many, optional, space,
mathDisplay, mathInline)
import qualified Text.Pandoc.UTF8 as UTF8
import Data.Char ( chr, ord )
+import Control.Monad.Trans (lift)
import Control.Monad
import Text.Pandoc.Builder
import Data.Char (isLetter, isAlphaNum)
@@ -101,7 +103,7 @@ dimenarg = try $ do
sp :: LP ()
sp = skipMany1 $ satisfy (\c -> c == ' ' || c == '\t')
- <|> (try $ newline >>~ lookAhead anyChar >>~ notFollowedBy blankline)
+ <|> (try $ newline <* lookAhead anyChar <* notFollowedBy blankline)
isLowerHex :: Char -> Bool
isLowerHex x = x >= '0' && x <= '9' || x >= 'a' && x <= 'f'
@@ -123,7 +125,7 @@ comment :: LP ()
comment = do
char '%'
skipMany (satisfy (/='\n'))
- newline
+ optional newline
return ()
bgroup :: LP ()
@@ -302,6 +304,13 @@ blockCommands = M.fromList $
, ("item", skipopts *> loose_item)
, ("documentclass", skipopts *> braced *> preamble)
, ("centerline", (para . trimInlines) <$> (skipopts *> tok))
+ , ("caption", skipopts *> tok >>= setCaption)
+ , ("PandocStartInclude", startInclude)
+ , ("PandocEndInclude", endInclude)
+ , ("bibliography", mempty <$ (skipopts *> braced >>=
+ addMeta "bibliography" . splitBibs))
+ , ("addbibresource", mempty <$ (skipopts *> braced >>=
+ addMeta "bibliography" . splitBibs))
] ++ map ignoreBlocks
-- these commands will be ignored unless --parse-raw is specified,
-- in which case they will appear as raw latex blocks
@@ -309,7 +318,7 @@ blockCommands = M.fromList $
-- newcommand, etc. should be parsed by macro, but we need this
-- here so these aren't parsed as inline commands to ignore
, "special", "pdfannot", "pdfstringdef"
- , "bibliography", "bibliographystyle"
+ , "bibliographystyle"
, "maketitle", "makeindex", "makeglossary"
, "addcontentsline", "addtocontents", "addtocounter"
-- \ignore{} is used conventionally in literate haskell for definitions
@@ -321,7 +330,19 @@ blockCommands = M.fromList $
]
addMeta :: ToMetaValue a => String -> a -> LP ()
-addMeta field val = updateState $ setMeta field val
+addMeta field val = updateState $ \st ->
+ st{ stateMeta = addMetaField field val $ stateMeta st }
+
+splitBibs :: String -> [Inlines]
+splitBibs = map (str . flip replaceExtension "bib" . trim) . splitBy (==',')
+
+setCaption :: Inlines -> LP Blocks
+setCaption ils = do
+ updateState $ \st -> st{ stateCaption = Just ils }
+ return mempty
+
+resetCaption :: LP ()
+resetCaption = updateState $ \st -> st{ stateCaption = Nothing }
authors :: LP ()
authors = try $ do
@@ -332,7 +353,7 @@ authors = try $ do
-- skip e.g. \vspace{10pt}
auths <- sepBy oneAuthor (controlSeq "and")
char '}'
- addMeta "authors" (map trimInlines auths)
+ addMeta "author" (map trimInlines auths)
section :: Attr -> Int -> LP Blocks
section (ident, classes, kvs) lvl = do
@@ -375,18 +396,18 @@ isBlockCommand s = maybe False (const True) $ M.lookup s blockCommands
inlineCommands :: M.Map String (LP Inlines)
inlineCommands = M.fromList $
- [ ("emph", emph <$> tok)
- , ("textit", emph <$> tok)
- , ("textsl", emph <$> tok)
- , ("textsc", smallcaps <$> tok)
- , ("sout", strikeout <$> tok)
- , ("textsuperscript", superscript <$> tok)
- , ("textsubscript", subscript <$> tok)
+ [ ("emph", extractSpaces emph <$> tok)
+ , ("textit", extractSpaces emph <$> tok)
+ , ("textsl", extractSpaces emph <$> tok)
+ , ("textsc", extractSpaces smallcaps <$> tok)
+ , ("sout", extractSpaces strikeout <$> tok)
+ , ("textsuperscript", extractSpaces superscript <$> tok)
+ , ("textsubscript", extractSpaces subscript <$> tok)
, ("textbackslash", lit "\\")
, ("backslash", lit "\\")
, ("slash", lit "/")
- , ("textbf", strong <$> tok)
- , ("textnormal", spanWith ("",["nodecor"],[]) <$> tok)
+ , ("textbf", extractSpaces strong <$> tok)
+ , ("textnormal", extractSpaces (spanWith ("",["nodecor"],[])) <$> tok)
, ("ldots", lit "…")
, ("dots", lit "…")
, ("mdots", lit "…")
@@ -406,15 +427,15 @@ inlineCommands = M.fromList $
, ("{", lit "{")
, ("}", lit "}")
-- old TeX commands
- , ("em", emph <$> inlines)
- , ("it", emph <$> inlines)
- , ("sl", emph <$> inlines)
- , ("bf", strong <$> inlines)
+ , ("em", extractSpaces emph <$> inlines)
+ , ("it", extractSpaces emph <$> inlines)
+ , ("sl", extractSpaces emph <$> inlines)
+ , ("bf", extractSpaces strong <$> inlines)
, ("rm", inlines)
- , ("itshape", emph <$> inlines)
- , ("slshape", emph <$> inlines)
- , ("scshape", smallcaps <$> inlines)
- , ("bfseries", strong <$> inlines)
+ , ("itshape", extractSpaces emph <$> inlines)
+ , ("slshape", extractSpaces emph <$> inlines)
+ , ("scshape", extractSpaces smallcaps <$> inlines)
+ , ("bfseries", extractSpaces strong <$> inlines)
, ("/", pure mempty) -- italic correction
, ("aa", lit "å")
, ("AA", lit "Å")
@@ -473,6 +494,7 @@ inlineCommands = M.fromList $
, ("citealp", citation "citealp" NormalCitation False)
, ("citealp*", citation "citealp*" NormalCitation False)
, ("autocite", citation "autocite" NormalCitation False)
+ , ("smartcite", citation "smartcite" NormalCitation False)
, ("footcite", inNote <$> citation "footcite" NormalCitation False)
, ("parencite", citation "parencite" NormalCitation False)
, ("supercite", citation "supercite" NormalCitation False)
@@ -495,6 +517,7 @@ inlineCommands = M.fromList $
, ("supercites", citation "supercites" NormalCitation True)
, ("footcitetexts", inNote <$> citation "footcitetexts" NormalCitation True)
, ("Autocite", citation "Autocite" NormalCitation False)
+ , ("Smartcite", citation "Smartcite" NormalCitation False)
, ("Footcite", citation "Footcite" NormalCitation False)
, ("Parencite", citation "Parencite" NormalCitation False)
, ("Supercite", citation "Supercite" NormalCitation False)
@@ -516,25 +539,21 @@ inlineCommands = M.fromList $
, ("citeauthor", (try (tok *> optional sp *> controlSeq "citetext") *>
complexNatbibCitation AuthorInText)
<|> citation "citeauthor" AuthorInText False)
+ , ("nocite", mempty <$ (citation "nocite" NormalCitation False >>=
+ addMeta "nocite"))
] ++ map ignoreInlines
-- these commands will be ignored unless --parse-raw is specified,
-- in which case they will appear as raw latex blocks:
- [ "noindent", "index", "nocite" ]
+ [ "noindent", "index" ]
mkImage :: String -> LP Inlines
mkImage src = do
- -- try for a caption
- (alt, tit) <- option (str "image", "") $ try $ do
- spaces
- controlSeq "caption"
- optional (char '*')
- ils <- grouped inline
- return (ils, "fig:")
+ let alt = str "image"
case takeExtension src of
"" -> do
defaultExt <- getOption readerDefaultImageExtension
- return $ image (addExtension src defaultExt) tit alt
- _ -> return $ image src tit alt
+ return $ image (addExtension src defaultExt) "" alt
+ _ -> return $ image src "" alt
inNote :: Inlines -> Inlines
inNote ils =
@@ -788,31 +807,107 @@ rawEnv name = do
(withRaw (env name blocks) >>= applyMacros' . snd)
else env name blocks
+----
+
+type IncludeParser = ParserT [Char] [String] IO String
+
-- | Replace "include" commands with file contents.
handleIncludes :: String -> IO String
-handleIncludes = handleIncludes' []
-
--- parents parameter prevents infinite include loops
-handleIncludes' :: [FilePath] -> String -> IO String
-handleIncludes' _ [] = return []
-handleIncludes' parents ('\\':'%':xs) =
- ("\\%"++) `fmap` handleIncludes' parents xs
-handleIncludes' parents ('%':xs) = handleIncludes' parents
- $ drop 1 $ dropWhile (/='\n') xs
-handleIncludes' parents ('\\':xs) =
- case runParser include defaultParserState "input" ('\\':xs) of
- Right (fs, rest) -> do yss <- mapM (\f -> if f `elem` parents
- then "" <$ warn ("Include file loop in '"
- ++ f ++ "'.")
- else readTeXFile f >>=
- handleIncludes' (f:parents)) fs
- rest' <- handleIncludes' parents rest
- return $ intercalate "\n" yss ++ rest'
- _ -> case runParser (verbCmd <|> verbatimEnv) defaultParserState
- "input" ('\\':xs) of
- Right (r, rest) -> (r ++) `fmap` handleIncludes' parents rest
- _ -> ('\\':) `fmap` handleIncludes' parents xs
-handleIncludes' parents (x:xs) = (x:) `fmap` handleIncludes' parents xs
+handleIncludes s = do
+ res <- runParserT includeParser' [] "input" s
+ case res of
+ Right s' -> return s'
+ Left e -> error $ show e
+
+includeParser' :: IncludeParser
+includeParser' =
+ concat <$> many (comment' <|> escaped' <|> blob' <|> include'
+ <|> startMarker' <|> endMarker'
+ <|> verbCmd' <|> verbatimEnv' <|> backslash')
+
+comment' :: IncludeParser
+comment' = do
+ char '%'
+ xs <- manyTill anyChar newline
+ return ('%':xs ++ "\n")
+
+escaped' :: IncludeParser
+escaped' = try $ string "\\%" <|> string "\\\\"
+
+verbCmd' :: IncludeParser
+verbCmd' = fmap snd <$>
+ withRaw $ try $ do
+ string "\\verb"
+ c <- anyChar
+ manyTill anyChar (char c)
+
+verbatimEnv' :: IncludeParser
+verbatimEnv' = fmap snd <$>
+ withRaw $ try $ do
+ string "\\begin"
+ name <- braced'
+ guard $ name `elem` ["verbatim", "Verbatim", "lstlisting",
+ "minted", "alltt"]
+ manyTill anyChar (try $ string $ "\\end{" ++ name ++ "}")
+
+blob' :: IncludeParser
+blob' = try $ many1 (noneOf "\\%")
+
+backslash' :: IncludeParser
+backslash' = string "\\"
+
+braced' :: IncludeParser
+braced' = try $ char '{' *> manyTill (satisfy (/='}')) (char '}')
+
+include' :: IncludeParser
+include' = do
+ fs' <- try $ do
+ char '\\'
+ name <- try (string "include")
+ <|> try (string "input")
+ <|> string "usepackage"
+ -- skip options
+ skipMany $ try $ char '[' *> (manyTill anyChar (char ']'))
+ fs <- (map trim . splitBy (==',')) <$> braced'
+ return $ if name == "usepackage"
+ then map (flip replaceExtension ".sty") fs
+ else map (flip replaceExtension ".tex") fs
+ pos <- getPosition
+ containers <- getState
+ let fn = case containers of
+ (f':_) -> f'
+ [] -> "input"
+ -- now process each include file in order...
+ rest <- getInput
+ results' <- forM fs' (\f -> do
+ when (f `elem` containers) $
+ fail "Include file loop!"
+ contents <- lift $ readTeXFile f
+ return $ "\\PandocStartInclude{" ++ f ++ "}" ++
+ contents ++ "\\PandocEndInclude{" ++
+ fn ++ "}{" ++ show (sourceLine pos) ++ "}{"
+ ++ show (sourceColumn pos) ++ "}")
+ setInput $ concat results' ++ rest
+ return ""
+
+startMarker' :: IncludeParser
+startMarker' = try $ do
+ string "\\PandocStartInclude"
+ fn <- braced'
+ updateState (fn:)
+ setPosition $ newPos fn 1 1
+ return $ "\\PandocStartInclude{" ++ fn ++ "}"
+
+endMarker' :: IncludeParser
+endMarker' = try $ do
+ string "\\PandocEndInclude"
+ fn <- braced'
+ ln <- braced'
+ co <- braced'
+ updateState tail
+ setPosition $ newPos fn (fromMaybe 1 $ safeRead ln) (fromMaybe 1 $ safeRead co)
+ return $ "\\PandocEndInclude{" ++ fn ++ "}{" ++ ln ++ "}{" ++
+ co ++ "}"
readTeXFile :: FilePath -> IO String
readTeXFile f = do
@@ -827,27 +922,7 @@ readFileFromDirs (d:ds) f =
E.catch (UTF8.readFile $ d </> f) $ \(_ :: E.SomeException) ->
readFileFromDirs ds f
-include :: LP ([FilePath], String)
-include = do
- name <- controlSeq "include"
- <|> controlSeq "input"
- <|> controlSeq "usepackage"
- skipopts
- fs <- (splitBy (==',')) <$> braced
- rest <- getInput
- let fs' = if name == "usepackage"
- then map (flip replaceExtension ".sty") fs
- else map (flip replaceExtension ".tex") fs
- return (fs', rest)
-
-verbCmd :: LP (String, String)
-verbCmd = do
- (_,r) <- withRaw $ do
- controlSeq "verb"
- c <- anyChar
- manyTill anyChar (char c)
- rest <- getInput
- return (r, rest)
+----
keyval :: LP (String, String)
keyval = try $ do
@@ -869,17 +944,6 @@ alltt t = walk strToCode <$> parseFromString blocks
where strToCode (Str s) = Code nullAttr s
strToCode x = x
-verbatimEnv :: LP (String, String)
-verbatimEnv = do
- (_,r) <- withRaw $ do
- controlSeq "begin"
- name <- braced
- guard $ name `elem` ["verbatim", "Verbatim", "lstlisting",
- "minted", "alltt"]
- verbEnv name
- rest <- getInput
- return (r,rest)
-
rawLaTeXBlock :: Parser [Char] ParserState String
rawLaTeXBlock = snd <$> try (withRaw (environment <|> blockCommand))
@@ -888,12 +952,33 @@ rawLaTeXInline = do
raw <- (snd <$> withRaw inlineCommand) <|> (snd <$> withRaw blockCommand)
RawInline "latex" <$> applyMacros' raw
+addImageCaption :: Blocks -> LP Blocks
+addImageCaption = walkM go
+ where go (Image alt (src,tit)) = do
+ mbcapt <- stateCaption <$> getState
+ case mbcapt of
+ Just ils -> return (Image (toList ils) (src, "fig:"))
+ Nothing -> return (Image alt (src,tit))
+ go x = return x
+
+addTableCaption :: Blocks -> LP Blocks
+addTableCaption = walkM go
+ where go (Table c als ws hs rs) = do
+ mbcapt <- stateCaption <$> getState
+ case mbcapt of
+ Just ils -> return (Table (toList ils) als ws hs rs)
+ Nothing -> return (Table c als ws hs rs)
+ go x = return x
+
environments :: M.Map String (LP Blocks)
environments = M.fromList
[ ("document", env "document" blocks <* skipMany anyChar)
, ("letter", env "letter" letter_contents)
- , ("figure", env "figure" $ skipopts *> blocks)
+ , ("figure", env "figure" $
+ resetCaption *> skipopts *> blocks >>= addImageCaption)
, ("center", env "center" blocks)
+ , ("table", env "table" $
+ resetCaption *> skipopts *> blocks >>= addTableCaption)
, ("tabular", env "tabular" simpTable)
, ("quote", blockQuote <$> env "quote" blocks)
, ("quotation", blockQuote <$> env "quotation" blocks)
@@ -1050,7 +1135,7 @@ paragraph = do
preamble :: LP Blocks
preamble = mempty <$> manyTill preambleBlock beginDoc
- where beginDoc = lookAhead $ controlSeq "begin" *> string "{document}"
+ where beginDoc = lookAhead $ try $ controlSeq "begin" *> string "{document}"
preambleBlock = (void comment)
<|> (void sp)
<|> (void blanklines)
@@ -1144,12 +1229,13 @@ complexNatbibCitation mode = try $ do
parseAligns :: LP [Alignment]
parseAligns = try $ do
char '{'
- let maybeBar = try $ spaces >> optional (char '|')
+ let maybeBar = skipMany $ sp <|> () <$ char '|' <|> () <$ try (string "@{}")
maybeBar
let cAlign = AlignCenter <$ char 'c'
let lAlign = AlignLeft <$ char 'l'
let rAlign = AlignRight <$ char 'r'
- let alignChar = optional sp *> (cAlign <|> lAlign <|> rAlign)
+ let parAlign = AlignLeft <$ (char 'p' >> braced)
+ let alignChar = cAlign <|> lAlign <|> rAlign <|> parAlign
aligns' <- sepEndBy alignChar maybeBar
spaces
char '}'
@@ -1170,10 +1256,14 @@ parseTableRow :: Int -- ^ number of columns
parseTableRow cols = try $ do
let tableCellInline = notFollowedBy (amp <|> lbreak) >> inline
let tableCell = (plain . trimInlines . mconcat) <$> many tableCellInline
- cells' <- sepBy tableCell amp
- guard $ length cells' == cols
+ cells' <- sepBy1 tableCell amp
+ let numcells = length cells'
+ guard $ numcells <= cols && numcells >= 1
+ guard $ cells' /= [mempty]
+ -- note: a & b in a three-column table leaves an empty 3rd cell:
+ let cells'' = cells' ++ replicate (cols - numcells) mempty
spaces
- return cells'
+ return cells''
simpTable :: LP Blocks
simpTable = try $ do
@@ -1184,9 +1274,23 @@ simpTable = try $ do
header' <- option [] $ try (parseTableRow cols <* lbreak <* hline)
rows <- sepEndBy (parseTableRow cols) (lbreak <* optional hline)
spaces
+ skipMany (comment *> spaces)
let header'' = if null header'
then replicate cols mempty
else header'
lookAhead $ controlSeq "end" -- make sure we're at end
return $ table mempty (zip aligns (repeat 0)) header'' rows
+startInclude :: LP Blocks
+startInclude = do
+ fn <- braced
+ setPosition $ newPos fn 1 1
+ return mempty
+
+endInclude :: LP Blocks
+endInclude = do
+ fn <- braced
+ ln <- braced
+ co <- braced
+ setPosition $ newPos fn (fromMaybe 1 $ safeRead ln) (fromMaybe 1 $ safeRead co)
+ return mempty
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 8a41cef49..2ca3b0eb6 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -1,6 +1,6 @@
{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7
{-
-Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu>
+Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Text.Pandoc.Readers.Markdown
- Copyright : Copyright (C) 2006-2013 John MacFarlane
+ Copyright : Copyright (C) 2006-2014 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
@@ -33,6 +33,7 @@ module Text.Pandoc.Readers.Markdown ( readMarkdown,
import Data.List ( transpose, sortBy, findIndex, intersperse, intercalate )
import qualified Data.Map as M
+import Data.Scientific (coefficient, base10Exponent)
import Data.Ord ( comparing )
import Data.Char ( isAlphaNum, toLower )
import Data.Maybe
@@ -60,6 +61,8 @@ import System.FilePath (takeExtension, addExtension)
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match (tagOpen)
import qualified Data.Set as Set
+import Text.Printf (printf)
+import Debug.Trace (trace)
type MarkdownParser = Parser [Char] ParserState
@@ -76,11 +79,7 @@ readMarkdownWithWarnings :: ReaderOptions -- ^ Reader options
-> String -- ^ String to parse (assuming @'\n'@ line endings)
-> (Pandoc, [String])
readMarkdownWithWarnings opts s =
- (readWith parseMarkdownWithWarnings) def{ stateOptions = opts } (s ++ "\n\n")
- where parseMarkdownWithWarnings = do
- doc <- parseMarkdown
- warnings <- stateWarnings <$> getState
- return (doc, warnings)
+ (readWithWarnings parseMarkdown) def{ stateOptions = opts } (s ++ "\n\n")
trimInlinesF :: F Inlines -> F Inlines
trimInlinesF = liftM trimInlines
@@ -114,6 +113,12 @@ isBlank _ = False
-- auxiliary functions
--
+-- | Succeeds when we're in list context.
+inList :: MarkdownParser ()
+inList = do
+ ctx <- stateParserContext <$> getState
+ guard (ctx == ListItemState)
+
isNull :: F Inlines -> Bool
isNull ils = B.isNull $ runF ils def
@@ -138,14 +143,16 @@ nonindentSpaces = do
then return sps
else unexpected "indented line"
-skipNonindentSpaces :: MarkdownParser ()
+-- returns number of spaces parsed
+skipNonindentSpaces :: MarkdownParser Int
skipNonindentSpaces = do
tabStop <- getOption readerTabStop
- atMostSpaces (tabStop - 1)
+ atMostSpaces (tabStop - 1) <* notFollowedBy (char ' ')
-atMostSpaces :: Int -> MarkdownParser ()
-atMostSpaces 0 = notFollowedBy (char ' ')
-atMostSpaces n = (char ' ' >> atMostSpaces (n-1)) <|> return ()
+atMostSpaces :: Int -> MarkdownParser Int
+atMostSpaces n
+ | n > 0 = (char ' ' >> (+1) <$> atMostSpaces (n-1)) <|> return 0
+ | otherwise = return 0
litChar :: MarkdownParser Char
litChar = escapedChar'
@@ -283,7 +290,11 @@ toMetaValue opts x =
yamlToMeta :: ReaderOptions -> Yaml.Value -> MetaValue
yamlToMeta opts (Yaml.String t) = toMetaValue opts t
-yamlToMeta _ (Yaml.Number n) = MetaString $ show n
+yamlToMeta _ (Yaml.Number n)
+ -- avoid decimal points for numbers that don't need them:
+ | base10Exponent n >= 0 = MetaString $ show
+ $ coefficient n * (10 ^ base10Exponent n)
+ | otherwise = MetaString $ show n
yamlToMeta _ (Yaml.Bool b) = MetaBool b
yamlToMeta opts (Yaml.Array xs) = B.toMetaValue $ map (yamlToMeta opts)
$ V.toList xs
@@ -328,12 +339,6 @@ parseMarkdown = do
let Pandoc _ bs = B.doc $ runF blocks st
return $ Pandoc meta bs
-addWarning :: Maybe SourcePos -> String -> MarkdownParser ()
-addWarning mbpos msg =
- updateState $ \st -> st{
- stateWarnings = (msg ++ maybe "" (\pos -> " " ++ show pos) mbpos) :
- stateWarnings st }
-
referenceKey :: MarkdownParser (F Blocks)
referenceKey = try $ do
pos <- getPosition
@@ -342,19 +347,16 @@ referenceKey = try $ do
char ':'
skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[')
let sourceURL = liftM unwords $ many $ try $ do
- notFollowedBy' referenceTitle
- skipMany spaceChar
- optional $ newline >> notFollowedBy blankline
skipMany spaceChar
+ notFollowedBy' referenceTitle
notFollowedBy' (() <$ reference)
many1 $ notFollowedBy space >> litChar
- let betweenAngles = try $ char '<' >>
- manyTill (escapedChar' <|> litChar) (char '>')
+ let betweenAngles = try $ char '<' >> manyTill litChar (char '>')
src <- try betweenAngles <|> sourceURL
tit <- option "" referenceTitle
-- currently we just ignore MMD-style link/image attributes
_kvs <- option [] $ guardEnabled Ext_link_attributes
- >> many (spnl >> keyValAttr)
+ >> many (try $ spnl >> keyValAttr)
blanklines
let target = (escapeURI $ trimr src, tit)
st <- getState
@@ -440,7 +442,10 @@ parseBlocks :: MarkdownParser (F Blocks)
parseBlocks = mconcat <$> manyTill block eof
block :: MarkdownParser (F Blocks)
-block = choice [ mempty <$ blanklines
+block = do
+ tr <- getOption readerTrace
+ pos <- getPosition
+ res <- choice [ mempty <$ blanklines
, codeBlockFenced
, yamlMetaBlock
, guardEnabled Ext_latex_macros *> (macro >>= return . return)
@@ -465,6 +470,11 @@ block = choice [ mempty <$ blanklines
, para
, plain
] <?> "block"
+ when tr $ do
+ st <- getState
+ trace (printf "line %d: %s" (sourceLine pos)
+ (take 60 $ show $ B.toList $ runF res st)) (return ())
+ return res
--
-- header blocks
@@ -558,7 +568,7 @@ attributes :: MarkdownParser Attr
attributes = try $ do
char '{'
spnl
- attrs <- many (attribute >>~ spnl)
+ attrs <- many (attribute <* spnl)
char '}'
return $ foldl (\x f -> f x) nullAttr attrs
@@ -605,12 +615,19 @@ codeBlockFenced = try $ do
skipMany spaceChar
attr <- option ([],[],[]) $
try (guardEnabled Ext_fenced_code_attributes >> attributes)
- <|> ((\x -> ("",[x],[])) <$> identifier)
+ <|> ((\x -> ("",[toLanguageId x],[])) <$> many1 nonspaceChar)
blankline
contents <- manyTill anyLine (blockDelimiter (== c) (Just size))
blanklines
return $ return $ B.codeBlockWith attr $ intercalate "\n" contents
+-- correctly handle github language identifiers
+toLanguageId :: String -> String
+toLanguageId = map toLower . go
+ where go "c++" = "cpp"
+ go "objective-c" = "objectivec"
+ go x = x
+
codeBlockIndented :: MarkdownParser (F Blocks)
codeBlockIndented = do
contents <- many1 (indentedLine <|>
@@ -668,7 +685,7 @@ birdTrackLine c = try $ do
--
emailBlockQuoteStart :: MarkdownParser Char
-emailBlockQuoteStart = try $ skipNonindentSpaces >> char '>' >>~ optional (char ' ')
+emailBlockQuoteStart = try $ skipNonindentSpaces >> char '>' <* optional (char ' ')
emailBlockQuote :: MarkdownParser [String]
emailBlockQuote = try $ do
@@ -698,54 +715,64 @@ blockQuote = do
bulletListStart :: MarkdownParser ()
bulletListStart = try $ do
optional newline -- if preceded by a Plain block in a list context
+ startpos <- sourceColumn <$> getPosition
skipNonindentSpaces
notFollowedBy' (() <$ hrule) -- because hrules start out just like lists
satisfy isBulletListMarker
- spaceChar <|> lookAhead newline
- skipSpaces
+ endpos <- sourceColumn <$> getPosition
+ tabStop <- getOption readerTabStop
+ lookAhead (newline <|> spaceChar)
+ () <$ atMostSpaces (tabStop - (endpos - startpos))
anyOrderedListStart :: MarkdownParser (Int, ListNumberStyle, ListNumberDelim)
anyOrderedListStart = try $ do
optional newline -- if preceded by a Plain block in a list context
+ startpos <- sourceColumn <$> getPosition
skipNonindentSpaces
notFollowedBy $ string "p." >> spaceChar >> digit -- page number
- (guardDisabled Ext_fancy_lists >>
- do many1 digit
- char '.'
- spaceChar
- return (1, DefaultStyle, DefaultDelim))
- <|> do (num, style, delim) <- anyOrderedListMarker
- -- if it could be an abbreviated first name, insist on more than one space
- if delim == Period && (style == UpperAlpha || (style == UpperRoman &&
- num `elem` [1, 5, 10, 50, 100, 500, 1000]))
- then char '\t' <|> (try $ char ' ' >> spaceChar)
- else spaceChar
- skipSpaces
- return (num, style, delim)
+ res <- do guardDisabled Ext_fancy_lists
+ start <- many1 digit >>= safeRead
+ char '.'
+ return (start, DefaultStyle, DefaultDelim)
+ <|> do (num, style, delim) <- anyOrderedListMarker
+ -- if it could be an abbreviated first name,
+ -- insist on more than one space
+ when (delim == Period && (style == UpperAlpha ||
+ (style == UpperRoman &&
+ num `elem` [1, 5, 10, 50, 100, 500, 1000]))) $
+ () <$ spaceChar
+ return (num, style, delim)
+ endpos <- sourceColumn <$> getPosition
+ tabStop <- getOption readerTabStop
+ lookAhead (newline <|> spaceChar)
+ atMostSpaces (tabStop - (endpos - startpos))
+ return res
listStart :: MarkdownParser ()
listStart = bulletListStart <|> (anyOrderedListStart >> return ())
--- parse a line of a list item (start = parser for beginning of list item)
listLine :: MarkdownParser String
listLine = try $ do
notFollowedBy' (do indentSpaces
many spaceChar
listStart)
- notFollowedBy' $ htmlTag (~== TagClose "div")
- chunks <- manyTill
+ notFollowedByHtmlCloser
+ optional (() <$ indentSpaces)
+ listLineCommon
+
+listLineCommon :: MarkdownParser String
+listLineCommon = concat <$> manyTill
( many1 (satisfy $ \c -> c /= '\n' && c /= '<')
<|> liftM snd (htmlTag isCommentTag)
<|> count 1 anyChar
) newline
- return $ concat chunks
-- parse raw text for one list item, excluding start marker and continuations
rawListItem :: MarkdownParser a
-> MarkdownParser String
rawListItem start = try $ do
start
- first <- listLine
+ first <- listLineCommon
rest <- many (notFollowedBy listStart >> notFollowedBy blankline >> listLine)
blanks <- many blankline
return $ unlines (first:rest) ++ blanks
@@ -760,11 +787,18 @@ listContinuation = try $ do
blanks <- many blankline
return $ concat result ++ blanks
+notFollowedByHtmlCloser :: MarkdownParser ()
+notFollowedByHtmlCloser = do
+ inHtmlBlock <- stateInHtmlBlock <$> getState
+ case inHtmlBlock of
+ Just t -> notFollowedBy' $ htmlTag (~== TagClose t)
+ Nothing -> return ()
+
listContinuationLine :: MarkdownParser String
listContinuationLine = try $ do
notFollowedBy blankline
notFollowedBy' listStart
- notFollowedBy' $ htmlTag (~== TagClose "div")
+ notFollowedByHtmlCloser
optional indentSpaces
result <- anyLine
return $ result ++ "\n"
@@ -796,8 +830,14 @@ orderedList = try $ do
items <- fmap sequence $ many1 $ listItem
( try $ do
optional newline -- if preceded by Plain block in a list
+ startpos <- sourceColumn <$> getPosition
skipNonindentSpaces
- orderedListMarker style delim )
+ res <- orderedListMarker style delim
+ endpos <- sourceColumn <$> getPosition
+ tabStop <- getOption readerTabStop
+ lookAhead (newline <|> spaceChar)
+ atMostSpaces (tabStop - (endpos - startpos))
+ return res )
start' <- option 1 $ guardEnabled Ext_startnum >> return start
return $ B.orderedListWith (start', style, delim) <$> fmap compactify' items
@@ -819,53 +859,52 @@ defListMarker = do
else mzero
return ()
-definitionListItem :: MarkdownParser (F (Inlines, [Blocks]))
-definitionListItem = try $ do
- -- first, see if this has any chance of being a definition list:
- lookAhead (anyLine >> optional blankline >> defListMarker)
- term <- trimInlinesF . mconcat <$> manyTill inline newline
- optional blankline
- raw <- many1 defRawBlock
- state <- getState
- let oldContext = stateParserContext state
- -- parse the extracted block, which may contain various block elements:
+definitionListItem :: Bool -> MarkdownParser (F (Inlines, [Blocks]))
+definitionListItem compact = try $ do
+ rawLine' <- anyLine
+ raw <- many1 $ defRawBlock compact
+ term <- parseFromString (trimInlinesF . mconcat <$> many inline) rawLine'
contents <- mapM (parseFromString parseBlocks) raw
- updateState (\st -> st {stateParserContext = oldContext})
+ optional blanklines
return $ liftM2 (,) term (sequence contents)
-defRawBlock :: MarkdownParser String
-defRawBlock = try $ do
+defRawBlock :: Bool -> MarkdownParser String
+defRawBlock compact = try $ do
+ hasBlank <- option False $ blankline >> return True
defListMarker
firstline <- anyLine
- rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine)
- trailing <- option "" blanklines
- cont <- liftM concat $ many $ do
- lns <- many1 $ notFollowedBy blankline >> indentSpaces >> anyLine
- trl <- option "" blanklines
- return $ unlines lns ++ trl
- return $ firstline ++ "\n" ++ unlines rawlines ++ trailing ++ cont
+ let dline = try
+ ( do notFollowedBy blankline
+ if compact -- laziness not compatible with compact
+ then () <$ indentSpaces
+ else (() <$ indentSpaces)
+ <|> notFollowedBy defListMarker
+ anyLine )
+ rawlines <- many dline
+ cont <- liftM concat $ many $ try $ do
+ trailing <- option "" blanklines
+ ln <- indentSpaces >> notFollowedBy blankline >> anyLine
+ lns <- many dline
+ return $ trailing ++ unlines (ln:lns)
+ return $ trimr (firstline ++ "\n" ++ unlines rawlines ++ cont) ++
+ if hasBlank || not (null cont) then "\n\n" else ""
definitionList :: MarkdownParser (F Blocks)
-definitionList = do
- guardEnabled Ext_definition_lists
- items <- fmap sequence $ many1 definitionListItem
+definitionList = try $ do
+ lookAhead (anyLine >> optional blankline >> defListMarker)
+ compactDefinitionList <|> normalDefinitionList
+
+compactDefinitionList :: MarkdownParser (F Blocks)
+compactDefinitionList = do
+ guardEnabled Ext_compact_definition_lists
+ items <- fmap sequence $ many1 $ definitionListItem True
return $ B.definitionList <$> fmap compactify'DL items
-compactify'DL :: [(Inlines, [Blocks])] -> [(Inlines, [Blocks])]
-compactify'DL items =
- let defs = concatMap snd items
- defBlocks = reverse $ concatMap B.toList defs
- isPara (Para _) = True
- isPara _ = False
- in case defBlocks of
- (Para x:_) -> if not $ any isPara (drop 1 defBlocks)
- then let (t,ds) = last items
- lastDef = B.toList $ last ds
- ds' = init ds ++
- [B.fromList $ init lastDef ++ [Plain x]]
- in init items ++ [(t, ds')]
- else items
- _ -> items
+normalDefinitionList :: MarkdownParser (F Blocks)
+normalDefinitionList = do
+ guardEnabled Ext_definition_lists
+ items <- fmap sequence $ many1 $ definitionListItem False
+ return $ B.definitionList <$> items
--
-- paragraph block
@@ -883,7 +922,15 @@ para = try $ do
<|> (guardEnabled Ext_backtick_code_blocks >> () <$ lookAhead codeBlockFenced)
<|> (guardDisabled Ext_blank_before_header >> () <$ lookAhead header)
<|> (guardEnabled Ext_lists_without_preceding_blankline >>
+ -- Avoid creating a paragraph in a nested list.
+ notFollowedBy' inList >>
() <$ lookAhead listStart)
+ <|> do guardEnabled Ext_native_divs
+ inHtmlBlock <- stateInHtmlBlock <$> getState
+ case inHtmlBlock of
+ Just "div" -> () <$
+ lookAhead (htmlTag (~== TagClose "div"))
+ _ -> mzero
return $ do
result' <- result
case B.toList result' of
@@ -909,16 +956,34 @@ htmlElement = rawVerbatimBlock
htmlBlock :: MarkdownParser (F Blocks)
htmlBlock = do
guardEnabled Ext_raw_html
- res <- (guardEnabled Ext_markdown_in_html_blocks >> rawHtmlBlocks)
- <|> htmlBlock'
- return $ return $ B.rawBlock "html" res
-
-htmlBlock' :: MarkdownParser String
+ try (do
+ (TagOpen t attrs) <- lookAhead $ fst <$> htmlTag isBlockTag
+ (guard (t `elem` ["pre","style","script"]) >>
+ (return . B.rawBlock "html") <$> rawVerbatimBlock)
+ <|> (do guardEnabled Ext_markdown_attribute
+ oldMarkdownAttribute <- stateMarkdownAttribute <$> getState
+ markdownAttribute <-
+ case lookup "markdown" attrs of
+ Just "0" -> False <$ updateState (\st -> st{
+ stateMarkdownAttribute = False })
+ Just _ -> True <$ updateState (\st -> st{
+ stateMarkdownAttribute = True })
+ Nothing -> return oldMarkdownAttribute
+ res <- if markdownAttribute
+ then rawHtmlBlocks
+ else htmlBlock'
+ updateState $ \st -> st{ stateMarkdownAttribute =
+ oldMarkdownAttribute }
+ return res)
+ <|> (guardEnabled Ext_markdown_in_html_blocks >> rawHtmlBlocks))
+ <|> htmlBlock'
+
+htmlBlock' :: MarkdownParser (F Blocks)
htmlBlock' = try $ do
first <- htmlElement
- finalSpace <- many spaceChar
- finalNewlines <- many newline
- return $ first ++ finalSpace ++ finalNewlines
+ skipMany spaceChar
+ optional blanklines
+ return $ return $ B.rawBlock "html" first
strictHtmlBlock :: MarkdownParser String
strictHtmlBlock = htmlInBalanced (not . isInlineTag)
@@ -929,46 +994,36 @@ rawVerbatimBlock = try $ do
["pre", "style", "script"])
(const True))
contents <- manyTill anyChar (htmlTag (~== TagClose tag))
- return $ open ++ contents ++ renderTags [TagClose tag]
+ return $ open ++ contents ++ renderTags' [TagClose tag]
rawTeXBlock :: MarkdownParser (F Blocks)
rawTeXBlock = do
guardEnabled Ext_raw_tex
- result <- (B.rawBlock "latex" <$> rawLaTeXBlock)
- <|> (B.rawBlock "context" <$> rawConTeXtEnvironment)
+ result <- (B.rawBlock "latex" . concat <$>
+ rawLaTeXBlock `sepEndBy1` blankline)
+ <|> (B.rawBlock "context" . concat <$>
+ rawConTeXtEnvironment `sepEndBy1` blankline)
spaces
return $ return result
-rawHtmlBlocks :: MarkdownParser String
+rawHtmlBlocks :: MarkdownParser (F Blocks)
rawHtmlBlocks = do
- htmlBlocks <- many1 $ try $ do
- s <- rawVerbatimBlock <|> try (
- do (t,raw) <- htmlTag isBlockTag
- exts <- getOption readerExtensions
- -- if open tag, need markdown="1" if
- -- markdown_attributes extension is set
- case t of
- TagOpen _ as
- | Ext_markdown_attribute `Set.member`
- exts ->
- if "markdown" `notElem`
- map fst as
- then mzero
- else return $
- stripMarkdownAttribute raw
- | otherwise -> return raw
- _ -> return raw )
- sps <- do sp1 <- many spaceChar
- sp2 <- option "" (blankline >> return "\n")
- sp3 <- many spaceChar
- sp4 <- option "" blanklines
- return $ sp1 ++ sp2 ++ sp3 ++ sp4
- -- note: we want raw html to be able to
- -- precede a code block, when separated
- -- by a blank line
- return $ s ++ sps
- let combined = concat htmlBlocks
- return $ if last combined == '\n' then init combined else combined
+ (TagOpen tagtype _, raw) <- htmlTag isBlockTag
+ -- try to find closing tag
+ -- we set stateInHtmlBlock so that closing tags that can be either block or
+ -- inline will not be parsed as inline tags
+ oldInHtmlBlock <- stateInHtmlBlock <$> getState
+ updateState $ \st -> st{ stateInHtmlBlock = Just tagtype }
+ let closer = htmlTag (\x -> x ~== TagClose tagtype)
+ contents <- mconcat <$> many (notFollowedBy' closer >> block)
+ result <-
+ (closer >>= \(_, rawcloser) -> return (
+ return (B.rawBlock "html" $ stripMarkdownAttribute raw) <>
+ contents <>
+ return (B.rawBlock "html" rawcloser)))
+ <|> return (return (B.rawBlock "html" raw) <> contents)
+ updateState $ \st -> st{ stateInHtmlBlock = oldInHtmlBlock }
+ return result
-- remove markdown="1" attribute
stripMarkdownAttribute :: String -> String
@@ -1110,13 +1165,11 @@ multilineTable headless =
multilineTableHeader :: Bool -- ^ Headerless table
-> MarkdownParser (F [Blocks], [Alignment], [Int])
multilineTableHeader headless = try $ do
- if headless
- then return '\n'
- else tableSep >>~ notFollowedBy blankline
+ unless headless $
+ tableSep >> notFollowedBy blankline
rawContent <- if headless
then return $ repeat ""
- else many1
- (notFollowedBy tableSep >> many1Till anyChar newline)
+ else many1 $ notFollowedBy tableSep >> anyLine
initSp <- nonindentSpaces
dashes <- many1 (dashedLine '-')
newline
@@ -1158,7 +1211,7 @@ gridPart ch = do
return (length dashes, length dashes + 1)
gridDashedLines :: Char -> Parser [Char] st [(Int,Int)]
-gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) >>~ blankline
+gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) <* blankline
removeFinalBar :: String -> String
removeFinalBar =
@@ -1220,11 +1273,20 @@ removeOneLeadingSpace xs =
gridTableFooter :: MarkdownParser [Char]
gridTableFooter = blanklines
+pipeBreak :: MarkdownParser [Alignment]
+pipeBreak = try $ do
+ nonindentSpaces
+ openPipe <- (True <$ char '|') <|> return False
+ first <- pipeTableHeaderPart
+ rest <- many $ sepPipe *> pipeTableHeaderPart
+ -- surrounding pipes needed for a one-column table:
+ guard $ not (null rest && not openPipe)
+ optional (char '|')
+ blankline
+ return (first:rest)
+
pipeTable :: MarkdownParser ([Alignment], [Double], F [Blocks], F [[Blocks]])
pipeTable = try $ do
- let pipeBreak = nonindentSpaces *> optional (char '|') *>
- pipeTableHeaderPart `sepBy1` sepPipe <*
- optional (char '|') <* blankline
(heads,aligns) <- try ( pipeBreak >>= \als ->
return (return $ replicate (length als) mempty, als))
<|> ( pipeTableRow >>= \row -> pipeBreak >>= \als ->
@@ -1243,12 +1305,13 @@ sepPipe = try $ do
pipeTableRow :: MarkdownParser (F [Blocks])
pipeTableRow = do
nonindentSpaces
- optional (char '|')
+ openPipe <- (True <$ char '|') <|> return False
let cell = mconcat <$>
many (notFollowedBy (blankline <|> char '|') >> inline)
first <- cell
- sepPipe
- rest <- cell `sepBy1` sepPipe
+ rest <- many $ sepPipe *> cell
+ -- surrounding pipes needed for a one-column table:
+ guard $ not (null rest && not openPipe)
optional (char '|')
blankline
let cells = sequence (first:rest)
@@ -1373,8 +1436,7 @@ escapedChar = do
ltSign :: MarkdownParser (F Inlines)
ltSign = do
guardDisabled Ext_raw_html
- <|> guardDisabled Ext_markdown_in_html_blocks
- <|> (notFollowedBy' rawHtmlBlocks >> return ())
+ <|> (notFollowedByHtmlCloser >> notFollowedBy' (htmlTag isBlockTag))
char '<'
return $ return $ B.str "<"
@@ -1419,54 +1481,60 @@ math = (return . B.displayMath <$> (mathDisplay >>= applyMacros'))
enclosure :: Char
-> MarkdownParser (F Inlines)
enclosure c = do
+ -- we can't start an enclosure with _ if after a string and
+ -- the intraword_underscores extension is enabled:
+ guardDisabled Ext_intraword_underscores
+ <|> guard (c == '*')
+ <|> (guard =<< notAfterString)
cs <- many1 (char c)
(return (B.str cs) <>) <$> whitespace
- <|> case length cs of
+ <|> do
+ case length cs of
3 -> three c
2 -> two c mempty
1 -> one c mempty
_ -> return (return $ B.str cs)
+ender :: Char -> Int -> MarkdownParser ()
+ender c n = try $ do
+ count n (char c)
+ guard (c == '*')
+ <|> guardDisabled Ext_intraword_underscores
+ <|> notFollowedBy alphaNum
+
-- Parse inlines til you hit one c or a sequence of two cs.
-- If one c, emit emph and then parse two.
-- If two cs, emit strong and then parse one.
-- Otherwise, emit ccc then the results.
three :: Char -> MarkdownParser (F Inlines)
three c = do
- contents <- mconcat <$> many (notFollowedBy (char c) >> inline)
- (try (string [c,c,c]) >> return ((B.strong . B.emph) <$> contents))
- <|> (try (string [c,c]) >> one c (B.strong <$> contents))
- <|> (char c >> two c (B.emph <$> contents))
+ contents <- mconcat <$> many (notFollowedBy (ender c 1) >> inline)
+ (ender c 3 >> return ((B.strong . B.emph) <$> contents))
+ <|> (ender c 2 >> one c (B.strong <$> contents))
+ <|> (ender c 1 >> two c (B.emph <$> contents))
<|> return (return (B.str [c,c,c]) <> contents)
-- Parse inlines til you hit two c's, and emit strong.
-- If you never do hit two cs, emit ** plus inlines parsed.
two :: Char -> F Inlines -> MarkdownParser (F Inlines)
two c prefix' = do
- let ender = try $ string [c,c]
- contents <- mconcat <$> many (try $ notFollowedBy ender >> inline)
- (ender >> return (B.strong <$> (prefix' <> contents)))
+ contents <- mconcat <$> many (try $ notFollowedBy (ender c 2) >> inline)
+ (ender c 2 >> return (B.strong <$> (prefix' <> contents)))
<|> return (return (B.str [c,c]) <> (prefix' <> contents))
-- Parse inlines til you hit a c, and emit emph.
-- If you never hit a c, emit * plus inlines parsed.
one :: Char -> F Inlines -> MarkdownParser (F Inlines)
one c prefix' = do
- contents <- mconcat <$> many ( (notFollowedBy (char c) >> inline)
+ contents <- mconcat <$> many ( (notFollowedBy (ender c 1) >> inline)
<|> try (string [c,c] >>
- notFollowedBy (char c) >>
+ notFollowedBy (ender c 1) >>
two c mempty) )
- (char c >> return (B.emph <$> (prefix' <> contents)))
+ (ender c 1 >> return (B.emph <$> (prefix' <> contents)))
<|> return (return (B.str [c]) <> (prefix' <> contents))
strongOrEmph :: MarkdownParser (F Inlines)
-strongOrEmph = enclosure '*' <|> (checkIntraword >> enclosure '_')
- where checkIntraword = do
- exts <- getOption readerExtensions
- when (Ext_intraword_underscores `Set.member` exts) $ do
- pos <- getPosition
- lastStrPos <- stateLastStrPos <$> getState
- guard $ lastStrPos /= Just pos
+strongOrEmph = enclosure '*' <|> enclosure '_'
-- | Parses a list of inlines between start and end delimiters.
inlinesBetween :: (Show b)
@@ -1476,7 +1544,7 @@ inlinesBetween :: (Show b)
inlinesBetween start end =
(trimInlinesF . mconcat) <$> try (start >> many1Till inner end)
where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline)
- innerSpace = try $ whitespace >>~ notFollowedBy' end
+ innerSpace = try $ whitespace <* notFollowedBy' end
strikeout :: MarkdownParser (F Inlines)
strikeout = fmap B.strikeout <$>
@@ -1508,8 +1576,7 @@ nonEndline = satisfy (/='\n')
str :: MarkdownParser (F Inlines)
str = do
result <- many1 alphaNum
- pos <- getPosition
- updateState $ \s -> s{ stateLastStrPos = Just pos }
+ updateLastStrPos
let spacesToNbr = map (\c -> if c == ' ' then '\160' else c)
isSmart <- getOption readerSmart
if isSmart
@@ -1541,14 +1608,15 @@ endline = try $ do
newline
notFollowedBy blankline
-- parse potential list-starts differently if in a list:
- st <- getState
- when (stateParserContext st == ListItemState) $ notFollowedBy listStart
+ notFollowedBy (inList >> listStart)
guardDisabled Ext_lists_without_preceding_blankline <|> notFollowedBy listStart
guardEnabled Ext_blank_before_blockquote <|> notFollowedBy emailBlockQuoteStart
guardEnabled Ext_blank_before_header <|> notFollowedBy (char '#') -- atx header
- guardEnabled Ext_backtick_code_blocks >>
+ guardDisabled Ext_backtick_code_blocks <|>
notFollowedBy (() <$ (lookAhead (char '`') >> codeBlockFenced))
- (guardEnabled Ext_hard_line_breaks >> return (return B.linebreak))
+ notFollowedByHtmlCloser
+ (eof >> return mempty)
+ <|> (guardEnabled Ext_hard_line_breaks >> return (return B.linebreak))
<|> (guardEnabled Ext_ignore_line_breaks >> return mempty)
<|> (return $ return B.space)
@@ -1571,8 +1639,10 @@ source :: MarkdownParser (String, String)
source = do
char '('
skipSpaces
- let urlChunk = try $ notFollowedBy (oneOf "\"')") >>
- (parenthesizedChars <|> count 1 litChar)
+ let urlChunk =
+ try parenthesizedChars
+ <|> (notFollowedBy (oneOf " )") >> (count 1 litChar))
+ <|> try (many1 spaceChar <* notFollowedBy (oneOf "\"')"))
let sourceURL = (unwords . words . concat) <$> many urlChunk
let betweenAngles = try $
char '<' >> manyTill litChar (char '>')
@@ -1717,31 +1787,55 @@ inBrackets parser = do
spanHtml :: MarkdownParser (F Inlines)
spanHtml = try $ do
- guardEnabled Ext_markdown_in_html_blocks
+ guardEnabled Ext_native_spans
(TagOpen _ attrs, _) <- htmlTag (~== TagOpen "span" [])
contents <- mconcat <$> manyTill inline (htmlTag (~== TagClose "span"))
let ident = fromMaybe "" $ lookup "id" attrs
let classes = maybe [] words $ lookup "class" attrs
let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"]
- return $ B.spanWith (ident, classes, keyvals) <$> contents
+ case lookup "style" keyvals of
+ Just s | null ident && null classes &&
+ map toLower (filter (`notElem` " \t;") s) ==
+ "font-variant:small-caps"
+ -> return $ B.smallcaps <$> contents
+ _ -> return $ B.spanWith (ident, classes, keyvals) <$> contents
divHtml :: MarkdownParser (F Blocks)
divHtml = try $ do
- guardEnabled Ext_markdown_in_html_blocks
- (TagOpen _ attrs, _) <- htmlTag (~== TagOpen "div" [])
- contents <- mconcat <$> manyTill block (htmlTag (~== TagClose "div"))
- let ident = fromMaybe "" $ lookup "id" attrs
- let classes = maybe [] words $ lookup "class" attrs
- let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"]
- return $ B.divWith (ident, classes, keyvals) <$> contents
+ guardEnabled Ext_native_divs
+ (TagOpen _ attrs, rawtag) <- htmlTag (~== TagOpen "div" [])
+ -- we set stateInHtmlBlock so that closing tags that can be either block or
+ -- inline will not be parsed as inline tags
+ oldInHtmlBlock <- stateInHtmlBlock <$> getState
+ updateState $ \st -> st{ stateInHtmlBlock = Just "div" }
+ bls <- option "" (blankline >> option "" blanklines)
+ contents <- mconcat <$>
+ many (notFollowedBy' (htmlTag (~== TagClose "div")) >> block)
+ closed <- option False (True <$ htmlTag (~== TagClose "div"))
+ if closed
+ then do
+ updateState $ \st -> st{ stateInHtmlBlock = oldInHtmlBlock }
+ let ident = fromMaybe "" $ lookup "id" attrs
+ let classes = maybe [] words $ lookup "class" attrs
+ let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"]
+ return $ B.divWith (ident, classes, keyvals) <$> contents
+ else -- avoid backtracing
+ return $ return (B.rawBlock "html" (rawtag <> bls)) <> contents
rawHtmlInline :: MarkdownParser (F Inlines)
rawHtmlInline = do
guardEnabled Ext_raw_html
+ inHtmlBlock <- stateInHtmlBlock <$> getState
+ let isCloseBlockTag t = case inHtmlBlock of
+ Just t' -> t ~== TagClose t'
+ Nothing -> False
mdInHtml <- option False $
- guardEnabled Ext_markdown_in_html_blocks >> return True
+ ( guardEnabled Ext_markdown_in_html_blocks
+ <|> guardEnabled Ext_markdown_attribute
+ ) >> return True
(_,result) <- htmlTag $ if mdInHtml
- then isInlineTag
+ then (\x -> isInlineTag x &&
+ not (isCloseBlockTag x))
else not . isTextTag
return $ return $ B.rawInline "html" result
@@ -1800,22 +1894,6 @@ normalCite = try $ do
char ']'
return citations
-citeKey :: MarkdownParser (Bool, String)
-citeKey = try $ do
- -- make sure we're not right after an alphanumeric,
- -- since foo@bar.baz is probably an email address
- lastStrPos <- stateLastStrPos <$> getState
- pos <- getPosition
- guard $ lastStrPos /= Just pos
- suppress_author <- option False (char '-' >> return True)
- char '@'
- first <- letter <|> char '_'
- let regchar = satisfy (\c -> isAlphaNum c || c == '_')
- let internal p = try $ p >>~ lookAhead regchar
- rest <- many $ regchar <|> internal (oneOf ":.#$%&-+?<>~/")
- let key = first:rest
- return (suppress_author, key)
-
suffix :: MarkdownParser (F Inlines)
suffix = try $ do
hasSpace <- option False (notFollowedBy nonspaceChar >> return True)
@@ -1854,7 +1932,7 @@ smart :: MarkdownParser (F Inlines)
smart = do
getOption readerSmart >>= guard
doubleQuoted <|> singleQuoted <|>
- choice (map (return . B.singleton <$>) [apostrophe, dash, ellipses])
+ choice (map (return <$>) [apostrophe, dash, ellipses])
singleQuoted :: MarkdownParser (F Inlines)
singleQuoted = try $ do
@@ -1873,4 +1951,3 @@ doubleQuoted = try $ do
(withQuoteContext InDoubleQuote $ doubleQuoteEnd >> return
(fmap B.doubleQuoted . trimInlinesF $ contents))
<|> (return $ return (B.str "\8220") <> contents)
-
diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs
index 794890eb6..e43b8a86c 100644
--- a/src/Text/Pandoc/Readers/MediaWiki.hs
+++ b/src/Text/Pandoc/Readers/MediaWiki.hs
@@ -1,7 +1,7 @@
{-# LANGUAGE RelaxedPolyRec, FlexibleInstances, TypeSynonymInstances #-}
-- RelaxedPolyRec needed for inlinesBetween on GHC < 7
{-
- Copyright (C) 2012 John MacFarlane <jgm@berkeley.edu>
+ Copyright (C) 2012-2014 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -20,7 +20,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Text.Pandoc.Readers.MediaWiki
- Copyright : Copyright (C) 2012 John MacFarlane
+ Copyright : Copyright (C) 2012-2014 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
@@ -55,6 +55,8 @@ import qualified Data.Foldable as F
import qualified Data.Map as M
import Data.Char (isDigit, isSpace)
import Data.Maybe (fromMaybe)
+import Text.Printf (printf)
+import Debug.Trace (trace)
-- | Read mediawiki from an input string and return a Pandoc document.
readMediaWiki :: ReaderOptions -- ^ Reader options
@@ -82,16 +84,16 @@ data MWState = MWState { mwOptions :: ReaderOptions
type MWParser = Parser [Char] MWState
-instance HasReaderOptions MWParser where
- askReaderOption f = (f . mwOptions) `fmap` getState
+instance HasReaderOptions MWState where
+ extractReaderOptions = mwOptions
-instance HasHeaderMap MWParser where
- getHeaderMap = fmap mwHeaderMap getState
- putHeaderMap hm = updateState $ \st -> st{ mwHeaderMap = hm }
+instance HasHeaderMap MWState where
+ extractHeaderMap = mwHeaderMap
+ updateHeaderMap f st = st{ mwHeaderMap = f $ mwHeaderMap st }
-instance HasIdentifierList MWParser where
- getIdentifierList = fmap mwIdentifierList getState
- putIdentifierList l = updateState $ \st -> st{ mwIdentifierList = l }
+instance HasIdentifierList MWState where
+ extractIdentifierList = mwIdentifierList
+ updateIdentifierList f st = st{ mwIdentifierList = f $ mwIdentifierList st }
--
-- auxiliary functions
@@ -187,7 +189,10 @@ parseMediaWiki = do
--
block :: MWParser Blocks
-block = mempty <$ skipMany1 blankline
+block = do
+ tr <- getOption readerTrace
+ pos <- getPosition
+ res <- mempty <$ skipMany1 blankline
<|> table
<|> header
<|> hrule
@@ -199,6 +204,10 @@ block = mempty <$ skipMany1 blankline
<|> blockTag
<|> (B.rawBlock "mediawiki" <$> template)
<|> para
+ when tr $
+ trace (printf "line %d: %s" (sourceLine pos)
+ (take 60 $ show $ B.toList res)) (return ())
+ return res
para :: MWParser Blocks
para = do
@@ -227,6 +236,7 @@ table = do
let widths' = map (\w -> if w == 0 then defaultwidth else w) widths
let cellspecs = zip (map fst cellspecs') widths'
rows' <- many $ try $ rowsep *> (map snd <$> tableRow)
+ optional blanklines
tableEnd
let cols = length hdr
let (headers,rows) = if hasheader
@@ -275,7 +285,7 @@ tableCaption = try $ do
(trimInlines . mconcat) <$> many (notFollowedBy (cellsep <|> rowsep) *> inline)
tableRow :: MWParser [((Alignment, Double), Blocks)]
-tableRow = try $ many tableCell
+tableRow = try $ skipMany htmlComment *> many tableCell
tableCell :: MWParser ((Alignment, Double), Blocks)
tableCell = try $ do
@@ -307,6 +317,7 @@ template :: MWParser String
template = try $ do
string "{{"
notFollowedBy (char '{')
+ lookAhead $ letter <|> digit <|> char ':'
let chunk = template <|> variable <|> many1 (noneOf "{}") <|> count 1 anyChar
contents <- manyTill chunk (try $ string "}}")
return $ "{{" ++ concat contents ++ "}}"
@@ -438,7 +449,8 @@ listItem c = try $ do
skipMany spaceChar
first <- concat <$> manyTill listChunk newline
rest <- many
- (try $ string extras *> (concat <$> manyTill listChunk newline))
+ (try $ string extras *> lookAhead listStartChar *>
+ (concat <$> manyTill listChunk newline))
contents <- parseFromString (many1 $ listItem' c)
(unlines (first : rest))
case c of
@@ -555,10 +567,15 @@ endline = () <$ try (newline <*
notFollowedBy' header <*
notFollowedBy anyListStart)
+imageIdentifiers :: [MWParser ()]
+imageIdentifiers = [sym (identifier ++ ":") | identifier <- identifiers]
+ where identifiers = ["File", "Image", "Archivo", "Datei", "Fichier",
+ "Bild"]
+
image :: MWParser Inlines
image = try $ do
sym "[["
- sym "File:" <|> sym "Image:"
+ choice imageIdentifiers
fname <- many1 (noneOf "|]")
_ <- many (try $ char '|' *> imageOption)
caption <- (B.str fname <$ sym "]]")
@@ -618,7 +635,7 @@ inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines
inlinesBetween start end =
(trimInlines . mconcat) <$> try (start >> many1Till inner end)
where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline)
- innerSpace = try $ whitespace >>~ notFollowedBy' end
+ innerSpace = try $ whitespace <* notFollowedBy' end
emph :: MWParser Inlines
emph = B.emph <$> nested (inlinesBetween start end)
diff --git a/src/Text/Pandoc/Readers/Native.hs b/src/Text/Pandoc/Readers/Native.hs
index c5d4cb98a..f4dfa62c1 100644
--- a/src/Text/Pandoc/Readers/Native.hs
+++ b/src/Text/Pandoc/Readers/Native.hs
@@ -1,5 +1,5 @@
{-
-Copyright (C) 2011 John MacFarlane <jgm@berkeley.edu>
+Copyright (C) 2011-2014 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Text.Pandoc.Readers.Native
- Copyright : Copyright (C) 2011 John MacFarlane
+ Copyright : Copyright (C) 2011-2014 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs
new file mode 100644
index 000000000..440b6d144
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Org.hs
@@ -0,0 +1,1487 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-
+Copyright (C) 2014 Albert Krewinkel <tarleb@moltkeplatz.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Org
+ Copyright : Copyright (C) 2014 Albert Krewinkel
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Albert Krewinkel <tarleb@moltkeplatz.de>
+
+Conversion of org-mode formatted plain text to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.Org ( readOrg ) where
+
+import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Builder ( Inlines, Blocks, HasMeta(..), (<>)
+ , trimInlines )
+import Text.Pandoc.Definition
+import Text.Pandoc.Options
+import qualified Text.Pandoc.Parsing as P
+import Text.Pandoc.Parsing hiding ( F, unF, askF, asksF, runF
+ , newline, orderedListMarker
+ , parseFromString, blanklines
+ )
+import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline)
+import Text.Pandoc.Shared (compactify', compactify'DL)
+import Text.TeXMath (readTeX, writePandoc, DisplayType(..))
+import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap
+
+import Control.Applicative ( Applicative, pure
+ , (<$>), (<$), (<*>), (<*), (*>) )
+import Control.Arrow (first)
+import Control.Monad (foldM, guard, liftM, liftM2, mplus, mzero, when)
+import Control.Monad.Reader (Reader, runReader, ask, asks)
+import Data.Char (isAlphaNum, toLower)
+import Data.Default
+import Data.List (intersperse, isPrefixOf, isSuffixOf)
+import qualified Data.Map as M
+import Data.Maybe (fromMaybe, isJust)
+import Data.Monoid (Monoid, mconcat, mempty, mappend)
+import Network.HTTP (urlEncode)
+
+-- | Parse org-mode string and return a Pandoc document.
+readOrg :: ReaderOptions -- ^ Reader options
+ -> String -- ^ String to parse (assuming @'\n'@ line endings)
+ -> Pandoc
+readOrg opts s = readWith parseOrg def{ orgStateOptions = opts } (s ++ "\n\n")
+
+type OrgParser = Parser [Char] OrgParserState
+
+parseOrg :: OrgParser Pandoc
+parseOrg = do
+ blocks' <- parseBlocks
+ st <- getState
+ let meta = runF (orgStateMeta' st) st
+ let removeUnwantedBlocks = dropCommentTrees . filter (/= Null)
+ return $ Pandoc meta $ removeUnwantedBlocks (B.toList $ runF blocks' st)
+
+-- | Drop COMMENT headers and the document tree below those headers.
+dropCommentTrees :: [Block] -> [Block]
+dropCommentTrees [] = []
+dropCommentTrees blks@(b:bs) =
+ maybe blks (flip dropUntilHeaderAboveLevel bs) $ commentHeaderLevel b
+
+-- | Return the level of a header starting a comment tree and Nothing
+-- otherwise.
+commentHeaderLevel :: Block -> Maybe Int
+commentHeaderLevel blk =
+ case blk of
+ (Header level _ ((Str "COMMENT"):_)) -> Just level
+ _ -> Nothing
+
+-- | Drop blocks until a header on or above the given level is seen
+dropUntilHeaderAboveLevel :: Int -> [Block] -> [Block]
+dropUntilHeaderAboveLevel n = dropWhile (not . isHeaderLevelLowerEq n)
+
+isHeaderLevelLowerEq :: Int -> Block -> Bool
+isHeaderLevelLowerEq n blk =
+ case blk of
+ (Header level _ _) -> n >= level
+ _ -> False
+
+--
+-- Parser State for Org
+--
+
+type OrgNoteRecord = (String, F Blocks)
+type OrgNoteTable = [OrgNoteRecord]
+
+type OrgBlockAttributes = M.Map String String
+
+type OrgLinkFormatters = M.Map String (String -> String)
+
+-- | Org-mode parser state
+data OrgParserState = OrgParserState
+ { orgStateOptions :: ReaderOptions
+ , orgStateAnchorIds :: [String]
+ , orgStateBlockAttributes :: OrgBlockAttributes
+ , orgStateEmphasisCharStack :: [Char]
+ , orgStateEmphasisNewlines :: Maybe Int
+ , orgStateLastForbiddenCharPos :: Maybe SourcePos
+ , orgStateLastPreCharPos :: Maybe SourcePos
+ , orgStateLastStrPos :: Maybe SourcePos
+ , orgStateLinkFormatters :: OrgLinkFormatters
+ , orgStateMeta :: Meta
+ , orgStateMeta' :: F Meta
+ , orgStateNotes' :: OrgNoteTable
+ }
+
+instance HasReaderOptions OrgParserState where
+ extractReaderOptions = orgStateOptions
+
+instance HasMeta OrgParserState where
+ setMeta field val st =
+ st{ orgStateMeta = setMeta field val $ orgStateMeta st }
+ deleteMeta field st =
+ st{ orgStateMeta = deleteMeta field $ orgStateMeta st }
+
+instance HasLastStrPosition OrgParserState where
+ getLastStrPos = orgStateLastStrPos
+ setLastStrPos pos st = st{ orgStateLastStrPos = Just pos }
+
+instance Default OrgParserState where
+ def = defaultOrgParserState
+
+defaultOrgParserState :: OrgParserState
+defaultOrgParserState = OrgParserState
+ { orgStateOptions = def
+ , orgStateAnchorIds = []
+ , orgStateBlockAttributes = M.empty
+ , orgStateEmphasisCharStack = []
+ , orgStateEmphasisNewlines = Nothing
+ , orgStateLastForbiddenCharPos = Nothing
+ , orgStateLastPreCharPos = Nothing
+ , orgStateLastStrPos = Nothing
+ , orgStateLinkFormatters = M.empty
+ , orgStateMeta = nullMeta
+ , orgStateMeta' = return nullMeta
+ , orgStateNotes' = []
+ }
+
+recordAnchorId :: String -> OrgParser ()
+recordAnchorId i = updateState $ \s ->
+ s{ orgStateAnchorIds = i : (orgStateAnchorIds s) }
+
+addBlockAttribute :: String -> String -> OrgParser ()
+addBlockAttribute key val = updateState $ \s ->
+ let attrs = orgStateBlockAttributes s
+ in s{ orgStateBlockAttributes = M.insert key val attrs }
+
+lookupBlockAttribute :: String -> OrgParser (Maybe String)
+lookupBlockAttribute key =
+ M.lookup key . orgStateBlockAttributes <$> getState
+
+resetBlockAttributes :: OrgParser ()
+resetBlockAttributes = updateState $ \s ->
+ s{ orgStateBlockAttributes = orgStateBlockAttributes def }
+
+updateLastForbiddenCharPos :: OrgParser ()
+updateLastForbiddenCharPos = getPosition >>= \p ->
+ updateState $ \s -> s{ orgStateLastForbiddenCharPos = Just p}
+
+updateLastPreCharPos :: OrgParser ()
+updateLastPreCharPos = getPosition >>= \p ->
+ updateState $ \s -> s{ orgStateLastPreCharPos = Just p}
+
+pushToInlineCharStack :: Char -> OrgParser ()
+pushToInlineCharStack c = updateState $ \s ->
+ s{ orgStateEmphasisCharStack = c:orgStateEmphasisCharStack s }
+
+popInlineCharStack :: OrgParser ()
+popInlineCharStack = updateState $ \s ->
+ s{ orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ s }
+
+surroundingEmphasisChar :: OrgParser [Char]
+surroundingEmphasisChar =
+ take 1 . drop 1 . orgStateEmphasisCharStack <$> getState
+
+startEmphasisNewlinesCounting :: Int -> OrgParser ()
+startEmphasisNewlinesCounting maxNewlines = updateState $ \s ->
+ s{ orgStateEmphasisNewlines = Just maxNewlines }
+
+decEmphasisNewlinesCount :: OrgParser ()
+decEmphasisNewlinesCount = updateState $ \s ->
+ s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s }
+
+newlinesCountWithinLimits :: OrgParser Bool
+newlinesCountWithinLimits = do
+ st <- getState
+ return $ ((< 0) <$> orgStateEmphasisNewlines st) /= Just True
+
+resetEmphasisNewlines :: OrgParser ()
+resetEmphasisNewlines = updateState $ \s ->
+ s{ orgStateEmphasisNewlines = Nothing }
+
+addLinkFormat :: String
+ -> (String -> String)
+ -> OrgParser ()
+addLinkFormat key formatter = updateState $ \s ->
+ let fs = orgStateLinkFormatters s
+ in s{ orgStateLinkFormatters = M.insert key formatter fs }
+
+addToNotesTable :: OrgNoteRecord -> OrgParser ()
+addToNotesTable note = do
+ oldnotes <- orgStateNotes' <$> getState
+ updateState $ \s -> s{ orgStateNotes' = note:oldnotes }
+
+-- The version Text.Pandoc.Parsing cannot be used, as we need additional parts
+-- of the state saved and restored.
+parseFromString :: OrgParser a -> String -> OrgParser a
+parseFromString parser str' = do
+ oldLastPreCharPos <- orgStateLastPreCharPos <$> getState
+ updateState $ \s -> s{ orgStateLastPreCharPos = Nothing }
+ result <- P.parseFromString parser str'
+ updateState $ \s -> s{ orgStateLastPreCharPos = oldLastPreCharPos }
+ return result
+
+
+--
+-- Adaptions and specializations of parsing utilities
+--
+
+newtype F a = F { unF :: Reader OrgParserState a
+ } deriving (Monad, Applicative, Functor)
+
+runF :: F a -> OrgParserState -> a
+runF = runReader . unF
+
+askF :: F OrgParserState
+askF = F ask
+
+asksF :: (OrgParserState -> a) -> F a
+asksF f = F $ asks f
+
+instance Monoid a => Monoid (F a) where
+ mempty = return mempty
+ mappend = liftM2 mappend
+ mconcat = fmap mconcat . sequence
+
+trimInlinesF :: F Inlines -> F Inlines
+trimInlinesF = liftM trimInlines
+
+returnF :: a -> OrgParser (F a)
+returnF = return . return
+
+
+-- | Like @Text.Parsec.Char.newline@, but causes additional state changes.
+newline :: OrgParser Char
+newline =
+ P.newline
+ <* updateLastPreCharPos
+ <* updateLastForbiddenCharPos
+
+-- | Like @Text.Parsec.Char.blanklines@, but causes additional state changes.
+blanklines :: OrgParser [Char]
+blanklines =
+ P.blanklines
+ <* updateLastPreCharPos
+ <* updateLastForbiddenCharPos
+
+--
+-- parsing blocks
+--
+
+parseBlocks :: OrgParser (F Blocks)
+parseBlocks = mconcat <$> manyTill block eof
+
+block :: OrgParser (F Blocks)
+block = choice [ mempty <$ blanklines
+ , optionalAttributes $ choice
+ [ orgBlock
+ , figure
+ , table
+ ]
+ , example
+ , drawer
+ , specialLine
+ , header
+ , return <$> hline
+ , list
+ , latexFragment
+ , noteBlock
+ , paraOrPlain
+ ] <?> "block"
+
+optionalAttributes :: OrgParser (F Blocks) -> OrgParser (F Blocks)
+optionalAttributes parser = try $
+ resetBlockAttributes *> parseBlockAttributes *> parser
+
+parseBlockAttributes :: OrgParser ()
+parseBlockAttributes = do
+ attrs <- many attribute
+ mapM_ (uncurry parseAndAddAttribute) attrs
+ where
+ attribute :: OrgParser (String, String)
+ attribute = try $ do
+ key <- metaLineStart *> many1Till nonspaceChar (char ':')
+ val <- skipSpaces *> anyLine
+ return (map toLower key, val)
+
+parseAndAddAttribute :: String -> String -> OrgParser ()
+parseAndAddAttribute key value = do
+ let key' = map toLower key
+ () <$ addBlockAttribute key' value
+
+lookupInlinesAttr :: String -> OrgParser (Maybe (F Inlines))
+lookupInlinesAttr attr = try $ do
+ val <- lookupBlockAttribute attr
+ maybe (return Nothing)
+ (fmap Just . parseFromString parseInlines)
+ val
+
+
+--
+-- Org Blocks (#+BEGIN_... / #+END_...)
+--
+
+type BlockProperties = (Int, String) -- (Indentation, Block-Type)
+
+orgBlock :: OrgParser (F Blocks)
+orgBlock = try $ do
+ blockProp@(_, blkType) <- blockHeaderStart
+ ($ blockProp) $
+ case blkType of
+ "comment" -> withRaw' (const mempty)
+ "html" -> withRaw' (return . (B.rawBlock blkType))
+ "latex" -> withRaw' (return . (B.rawBlock blkType))
+ "ascii" -> withRaw' (return . (B.rawBlock blkType))
+ "example" -> withRaw' (return . exampleCode)
+ "quote" -> withParsed (fmap B.blockQuote)
+ "verse" -> verseBlock
+ "src" -> codeBlock
+ _ -> withParsed (fmap $ divWithClass blkType)
+
+blockHeaderStart :: OrgParser (Int, String)
+blockHeaderStart = try $ (,) <$> indent <*> blockType
+ where
+ indent = length <$> many spaceChar
+ blockType = map toLower <$> (stringAnyCase "#+begin_" *> orgArgWord)
+
+withRaw' :: (String -> F Blocks) -> BlockProperties -> OrgParser (F Blocks)
+withRaw' f blockProp = (ignHeaders *> (f <$> rawBlockContent blockProp))
+
+withParsed :: (F Blocks -> F Blocks) -> BlockProperties -> OrgParser (F Blocks)
+withParsed f blockProp = (ignHeaders *> (f <$> parsedBlockContent blockProp))
+
+ignHeaders :: OrgParser ()
+ignHeaders = (() <$ newline) <|> (() <$ anyLine)
+
+divWithClass :: String -> Blocks -> Blocks
+divWithClass cls = B.divWith ("", [cls], [])
+
+verseBlock :: BlockProperties -> OrgParser (F Blocks)
+verseBlock blkProp = try $ do
+ ignHeaders
+ content <- rawBlockContent blkProp
+ fmap B.para . mconcat . intersperse (pure B.linebreak)
+ <$> mapM (parseFromString parseInlines) (lines content)
+
+exportsCode :: [(String, String)] -> Bool
+exportsCode attrs = not (("rundoc-exports", "none") `elem` attrs
+ || ("rundoc-exports", "results") `elem` attrs)
+
+exportsResults :: [(String, String)] -> Bool
+exportsResults attrs = ("rundoc-exports", "results") `elem` attrs
+ || ("rundoc-exports", "both") `elem` attrs
+
+followingResultsBlock :: OrgParser (Maybe String)
+followingResultsBlock =
+ optionMaybe (try $ blanklines *> stringAnyCase "#+RESULTS:"
+ *> blankline
+ *> (unlines <$> many1 exampleLine))
+
+codeBlock :: BlockProperties -> OrgParser (F Blocks)
+codeBlock blkProp = do
+ skipSpaces
+ (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders)
+ id' <- fromMaybe "" <$> lookupBlockAttribute "name"
+ content <- rawBlockContent blkProp
+ resultsContent <- followingResultsBlock
+ let includeCode = exportsCode kv
+ let includeResults = exportsResults kv
+ let codeBlck = B.codeBlockWith ( id', classes, kv ) content
+ labelledBlck <- maybe (pure codeBlck)
+ (labelDiv codeBlck)
+ <$> lookupInlinesAttr "caption"
+ let resultBlck = pure $ maybe mempty (exampleCode) resultsContent
+ return $ (if includeCode then labelledBlck else mempty)
+ <> (if includeResults then resultBlck else mempty)
+ where
+ labelDiv blk value =
+ B.divWith nullAttr <$> (mappend <$> labelledBlock value
+ <*> pure blk)
+ labelledBlock = fmap (B.plain . B.spanWith ("", ["label"], []))
+
+rawBlockContent :: BlockProperties -> OrgParser String
+rawBlockContent (indent, blockType) = try $
+ unlines . map commaEscaped <$> manyTill indentedLine blockEnder
+ where
+ indentedLine = try $ ("" <$ blankline) <|> (indentWith indent *> anyLine)
+ blockEnder = try $ indentWith indent *> stringAnyCase ("#+end_" <> blockType)
+
+parsedBlockContent :: BlockProperties -> OrgParser (F Blocks)
+parsedBlockContent blkProps = try $ do
+ raw <- rawBlockContent blkProps
+ parseFromString parseBlocks (raw ++ "\n")
+
+-- indent by specified number of spaces (or equiv. tabs)
+indentWith :: Int -> OrgParser String
+indentWith num = do
+ tabStop <- getOption readerTabStop
+ if num < tabStop
+ then count num (char ' ')
+ else choice [ try (count num (char ' '))
+ , try (char '\t' >> count (num - tabStop) (char ' ')) ]
+
+type SwitchOption = (Char, Maybe String)
+
+orgArgWord :: OrgParser String
+orgArgWord = many1 orgArgWordChar
+
+-- | Parse code block arguments
+-- TODO: We currently don't handle switches.
+codeHeaderArgs :: OrgParser ([String], [(String, String)])
+codeHeaderArgs = try $ do
+ language <- skipSpaces *> orgArgWord
+ _ <- skipSpaces *> (try $ switch `sepBy` (many1 spaceChar))
+ parameters <- manyTill blockOption newline
+ let pandocLang = translateLang language
+ return $
+ if hasRundocParameters parameters
+ then ( [ pandocLang, rundocBlockClass ]
+ , map toRundocAttrib (("language", language) : parameters)
+ )
+ else ([ pandocLang ], parameters)
+ where hasRundocParameters = not . null
+
+switch :: OrgParser SwitchOption
+switch = try $ simpleSwitch <|> lineNumbersSwitch
+ where
+ simpleSwitch = (\c -> (c, Nothing)) <$> (oneOf "-+" *> letter)
+ lineNumbersSwitch = (\ls -> ('l', Just ls)) <$>
+ (string "-l \"" *> many1Till nonspaceChar (char '"'))
+
+translateLang :: String -> String
+translateLang "C" = "c"
+translateLang "C++" = "cpp"
+translateLang "emacs-lisp" = "commonlisp" -- emacs lisp is not supported
+translateLang "js" = "javascript"
+translateLang "lisp" = "commonlisp"
+translateLang "R" = "r"
+translateLang "sh" = "bash"
+translateLang "sqlite" = "sql"
+translateLang cs = cs
+
+-- | Prefix used for Rundoc classes and arguments.
+rundocPrefix :: String
+rundocPrefix = "rundoc-"
+
+-- | The class-name used to mark rundoc blocks.
+rundocBlockClass :: String
+rundocBlockClass = rundocPrefix ++ "block"
+
+blockOption :: OrgParser (String, String)
+blockOption = try $ (,) <$> orgArgKey <*> orgParamValue
+
+inlineBlockOption :: OrgParser (String, String)
+inlineBlockOption = try $ (,) <$> orgArgKey <*> orgInlineParamValue
+
+orgArgKey :: OrgParser String
+orgArgKey = try $
+ skipSpaces *> char ':'
+ *> many1 orgArgWordChar
+
+orgParamValue :: OrgParser String
+orgParamValue = try $
+ skipSpaces *> many1 (noneOf "\t\n\r ") <* skipSpaces
+
+orgInlineParamValue :: OrgParser String
+orgInlineParamValue = try $
+ skipSpaces *> many1 (noneOf "\t\n\r ]") <* skipSpaces
+
+orgArgWordChar :: OrgParser Char
+orgArgWordChar = alphaNum <|> oneOf "-_"
+
+toRundocAttrib :: (String, String) -> (String, String)
+toRundocAttrib = first ("rundoc-" ++)
+
+commaEscaped :: String -> String
+commaEscaped (',':cs@('*':_)) = cs
+commaEscaped (',':cs@('#':'+':_)) = cs
+commaEscaped cs = cs
+
+example :: OrgParser (F Blocks)
+example = try $ do
+ return . return . exampleCode =<< unlines <$> many1 exampleLine
+
+exampleCode :: String -> Blocks
+exampleCode = B.codeBlockWith ("", ["example"], [])
+
+exampleLine :: OrgParser String
+exampleLine = try $ skipSpaces *> string ": " *> anyLine
+
+-- Drawers for properties or a logbook
+drawer :: OrgParser (F Blocks)
+drawer = try $ do
+ drawerStart
+ manyTill drawerLine (try drawerEnd)
+ return mempty
+
+drawerStart :: OrgParser String
+drawerStart = try $
+ skipSpaces *> drawerName <* skipSpaces <* P.newline
+ where drawerName = try $ char ':' *> validDrawerName <* char ':'
+ validDrawerName = stringAnyCase "PROPERTIES"
+ <|> stringAnyCase "LOGBOOK"
+
+drawerLine :: OrgParser String
+drawerLine = try anyLine
+
+drawerEnd :: OrgParser String
+drawerEnd = try $
+ skipSpaces *> stringAnyCase ":END:" <* skipSpaces <* P.newline
+
+
+--
+-- Figures
+--
+
+-- Figures (Image on a line by itself, preceded by name and/or caption)
+figure :: OrgParser (F Blocks)
+figure = try $ do
+ (cap, nam) <- nameAndCaption
+ src <- skipSpaces *> selfTarget <* skipSpaces <* P.newline
+ guard (isImageFilename src)
+ return $ do
+ cap' <- cap
+ return $ B.para $ B.image src nam cap'
+ where
+ nameAndCaption =
+ do
+ maybeCap <- lookupInlinesAttr "caption"
+ maybeNam <- lookupBlockAttribute "name"
+ guard $ isJust maybeCap || isJust maybeNam
+ return ( fromMaybe mempty maybeCap
+ , maybe mempty withFigPrefix maybeNam )
+ withFigPrefix cs =
+ if "fig:" `isPrefixOf` cs
+ then cs
+ else "fig:" ++ cs
+
+--
+-- Comments, Options and Metadata
+specialLine :: OrgParser (F Blocks)
+specialLine = fmap return . try $ metaLine <|> commentLine
+
+metaLine :: OrgParser Blocks
+metaLine = try $ mempty
+ <$ (metaLineStart *> (optionLine <|> declarationLine))
+
+commentLine :: OrgParser Blocks
+commentLine = try $ commentLineStart *> anyLine *> pure mempty
+
+-- The order, in which blocks are tried, makes sure that we're not looking at
+-- the beginning of a block, so we don't need to check for it
+metaLineStart :: OrgParser String
+metaLineStart = try $ mappend <$> many spaceChar <*> string "#+"
+
+commentLineStart :: OrgParser String
+commentLineStart = try $ mappend <$> many spaceChar <*> string "# "
+
+declarationLine :: OrgParser ()
+declarationLine = try $ do
+ key <- metaKey
+ inlinesF <- metaInlines
+ updateState $ \st ->
+ let meta' = B.setMeta <$> pure key <*> inlinesF <*> pure nullMeta
+ in st { orgStateMeta' = orgStateMeta' st <> meta' }
+ return ()
+
+metaInlines :: OrgParser (F MetaValue)
+metaInlines = fmap (MetaInlines . B.toList) <$> inlinesTillNewline
+
+metaKey :: OrgParser String
+metaKey = map toLower <$> many1 (noneOf ": \n\r")
+ <* char ':'
+ <* skipSpaces
+
+optionLine :: OrgParser ()
+optionLine = try $ do
+ key <- metaKey
+ case key of
+ "link" -> parseLinkFormat >>= uncurry addLinkFormat
+ _ -> mzero
+
+parseLinkFormat :: OrgParser ((String, String -> String))
+parseLinkFormat = try $ do
+ linkType <- (:) <$> letter <*> many (alphaNum <|> oneOf "-_") <* skipSpaces
+ linkSubst <- parseFormat
+ return (linkType, linkSubst)
+
+-- | An ad-hoc, single-argument-only implementation of a printf-style format
+-- parser.
+parseFormat :: OrgParser (String -> String)
+parseFormat = try $ do
+ replacePlain <|> replaceUrl <|> justAppend
+ where
+ -- inefficient, but who cares
+ replacePlain = try $ (\x -> concat . flip intersperse x)
+ <$> sequence [tillSpecifier 's', rest]
+ replaceUrl = try $ (\x -> concat . flip intersperse x . urlEncode)
+ <$> sequence [tillSpecifier 'h', rest]
+ justAppend = try $ (++) <$> rest
+
+ rest = manyTill anyChar (eof <|> () <$ oneOf "\n\r")
+ tillSpecifier c = manyTill (noneOf "\n\r") (try $ string ('%':c:""))
+
+--
+-- Headers
+--
+
+-- | Headers
+header :: OrgParser (F Blocks)
+header = try $ do
+ level <- headerStart
+ title <- inlinesTillNewline
+ return $ B.header level <$> title
+
+headerStart :: OrgParser Int
+headerStart = try $
+ (length <$> many1 (char '*')) <* many1 (char ' ')
+
+
+-- Don't use (or need) the reader wrapper here, we want hline to be
+-- @show@able. Otherwise we can't use it with @notFollowedBy'@.
+
+-- | Horizontal Line (five -- dashes or more)
+hline :: OrgParser Blocks
+hline = try $ do
+ skipSpaces
+ string "-----"
+ many (char '-')
+ skipSpaces
+ newline
+ return B.horizontalRule
+
+--
+-- Tables
+--
+
+data OrgTableRow = OrgContentRow (F [Blocks])
+ | OrgAlignRow [Alignment]
+ | OrgHlineRow
+
+data OrgTable = OrgTable
+ { orgTableColumns :: Int
+ , orgTableAlignments :: [Alignment]
+ , orgTableHeader :: [Blocks]
+ , orgTableRows :: [[Blocks]]
+ }
+
+table :: OrgParser (F Blocks)
+table = try $ do
+ lookAhead tableStart
+ do
+ rows <- tableRows
+ cptn <- fromMaybe (pure "") <$> lookupInlinesAttr "caption"
+ return $ (<$> cptn) . orgToPandocTable . normalizeTable =<< rowsToTable rows
+
+orgToPandocTable :: OrgTable
+ -> Inlines
+ -> Blocks
+orgToPandocTable (OrgTable _ aligns heads lns) caption =
+ B.table caption (zip aligns $ repeat 0) heads lns
+
+tableStart :: OrgParser Char
+tableStart = try $ skipSpaces *> char '|'
+
+tableRows :: OrgParser [OrgTableRow]
+tableRows = try $ many (tableAlignRow <|> tableHline <|> tableContentRow)
+
+tableContentRow :: OrgParser OrgTableRow
+tableContentRow = try $
+ OrgContentRow . sequence <$> (tableStart *> manyTill tableContentCell newline)
+
+tableContentCell :: OrgParser (F Blocks)
+tableContentCell = try $
+ fmap B.plain . trimInlinesF . mconcat <$> many1Till inline endOfCell
+
+endOfCell :: OrgParser Char
+endOfCell = try $ char '|' <|> lookAhead newline
+
+tableAlignRow :: OrgParser OrgTableRow
+tableAlignRow = try $
+ OrgAlignRow <$> (tableStart *> manyTill tableAlignCell newline)
+
+tableAlignCell :: OrgParser Alignment
+tableAlignCell =
+ choice [ try $ emptyCell *> return AlignDefault
+ , try $ skipSpaces
+ *> char '<'
+ *> tableAlignFromChar
+ <* many digit
+ <* char '>'
+ <* emptyCell
+ ] <?> "alignment info"
+ where emptyCell = try $ skipSpaces *> endOfCell
+
+tableAlignFromChar :: OrgParser Alignment
+tableAlignFromChar = try $ choice [ char 'l' *> return AlignLeft
+ , char 'c' *> return AlignCenter
+ , char 'r' *> return AlignRight
+ ]
+
+tableHline :: OrgParser OrgTableRow
+tableHline = try $
+ OrgHlineRow <$ (tableStart *> char '-' *> anyLine)
+
+rowsToTable :: [OrgTableRow]
+ -> F OrgTable
+rowsToTable = foldM (flip rowToContent) zeroTable
+ where zeroTable = OrgTable 0 mempty mempty mempty
+
+normalizeTable :: OrgTable
+ -> OrgTable
+normalizeTable (OrgTable cols aligns heads lns) =
+ let aligns' = fillColumns aligns AlignDefault
+ heads' = if heads == mempty
+ then mempty
+ else fillColumns heads (B.plain mempty)
+ lns' = map (`fillColumns` B.plain mempty) lns
+ fillColumns base padding = take cols $ base ++ repeat padding
+ in OrgTable cols aligns' heads' lns'
+
+
+-- One or more horizontal rules after the first content line mark the previous
+-- line as a header. All other horizontal lines are discarded.
+rowToContent :: OrgTableRow
+ -> OrgTable
+ -> F OrgTable
+rowToContent OrgHlineRow t = maybeBodyToHeader t
+rowToContent (OrgAlignRow as) t = setLongestRow as =<< setAligns as t
+rowToContent (OrgContentRow rf) t = do
+ rs <- rf
+ setLongestRow rs =<< appendToBody rs t
+
+setLongestRow :: [a]
+ -> OrgTable
+ -> F OrgTable
+setLongestRow rs t =
+ return t{ orgTableColumns = max (length rs) (orgTableColumns t) }
+
+maybeBodyToHeader :: OrgTable
+ -> F OrgTable
+maybeBodyToHeader t = case t of
+ OrgTable{ orgTableHeader = [], orgTableRows = b:[] } ->
+ return t{ orgTableHeader = b , orgTableRows = [] }
+ _ -> return t
+
+appendToBody :: [Blocks]
+ -> OrgTable
+ -> F OrgTable
+appendToBody r t = return t{ orgTableRows = orgTableRows t ++ [r] }
+
+setAligns :: [Alignment]
+ -> OrgTable
+ -> F OrgTable
+setAligns aligns t = return $ t{ orgTableAlignments = aligns }
+
+
+--
+-- LaTeX fragments
+--
+latexFragment :: OrgParser (F Blocks)
+latexFragment = try $ do
+ envName <- latexEnvStart
+ content <- mconcat <$> manyTill anyLineNewline (latexEnd envName)
+ return . return $ B.rawBlock "latex" (content `inLatexEnv` envName)
+ where
+ c `inLatexEnv` e = mconcat [ "\\begin{", e, "}\n"
+ , c
+ , "\\end{", e, "}\n"
+ ]
+
+latexEnvStart :: OrgParser String
+latexEnvStart = try $ do
+ skipSpaces *> string "\\begin{"
+ *> latexEnvName
+ <* string "}"
+ <* blankline
+
+latexEnd :: String -> OrgParser ()
+latexEnd envName = try $
+ () <$ skipSpaces
+ <* string ("\\end{" ++ envName ++ "}")
+ <* blankline
+
+-- | Parses a LaTeX environment name.
+latexEnvName :: OrgParser String
+latexEnvName = try $ do
+ mappend <$> many1 alphaNum
+ <*> option "" (string "*")
+
+
+--
+-- Footnote defintions
+--
+noteBlock :: OrgParser (F Blocks)
+noteBlock = try $ do
+ ref <- noteMarker <* skipSpaces
+ content <- mconcat <$> blocksTillHeaderOrNote
+ addToNotesTable (ref, content)
+ return mempty
+ where
+ blocksTillHeaderOrNote =
+ many1Till block (eof <|> () <$ lookAhead noteMarker
+ <|> () <$ lookAhead headerStart)
+
+-- Paragraphs or Plain text
+paraOrPlain :: OrgParser (F Blocks)
+paraOrPlain = try $ do
+ ils <- parseInlines
+ nl <- option False (newline >> return True)
+ try (guard nl >> notFollowedBy (orderedListStart <|> bulletListStart) >>
+ return (B.para <$> ils))
+ <|> (return (B.plain <$> ils))
+
+inlinesTillNewline :: OrgParser (F Inlines)
+inlinesTillNewline = trimInlinesF . mconcat <$> manyTill inline newline
+
+
+--
+-- list blocks
+--
+
+list :: OrgParser (F Blocks)
+list = choice [ definitionList, bulletList, orderedList ] <?> "list"
+
+definitionList :: OrgParser (F Blocks)
+definitionList = try $ do n <- lookAhead (bulletListStart' Nothing)
+ fmap B.definitionList . fmap compactify'DL . sequence
+ <$> many1 (definitionListItem $ bulletListStart' (Just n))
+
+bulletList :: OrgParser (F Blocks)
+bulletList = try $ do n <- lookAhead (bulletListStart' Nothing)
+ fmap B.bulletList . fmap compactify' . sequence
+ <$> many1 (listItem (bulletListStart' $ Just n))
+
+orderedList :: OrgParser (F Blocks)
+orderedList = fmap B.orderedList . fmap compactify' . sequence
+ <$> many1 (listItem orderedListStart)
+
+genericListStart :: OrgParser String
+ -> OrgParser Int
+genericListStart listMarker = try $
+ (+) <$> (length <$> many spaceChar)
+ <*> (length <$> listMarker <* many1 spaceChar)
+
+-- parses bullet list marker. maybe we know the indent level
+bulletListStart :: OrgParser Int
+bulletListStart = bulletListStart' Nothing
+
+bulletListStart' :: Maybe Int -> OrgParser Int
+-- returns length of bulletList prefix, inclusive of marker
+bulletListStart' Nothing = do ind <- length <$> many spaceChar
+ when (ind == 0) $ notFollowedBy (char '*')
+ oneOf bullets
+ many1 spaceChar
+ return (ind + 1)
+ -- Unindented lists are legal, but they can't use '*' bullets
+ -- We return n to maintain compatibility with the generic listItem
+bulletListStart' (Just n) = do count (n-1) spaceChar
+ when (n == 1) $ notFollowedBy (char '*')
+ oneOf bullets
+ many1 spaceChar
+ return n
+
+bullets :: String
+bullets = "*+-"
+
+orderedListStart :: OrgParser Int
+orderedListStart = genericListStart orderedListMarker
+ -- Ordered list markers allowed in org-mode
+ where orderedListMarker = mappend <$> many1 digit <*> (pure <$> oneOf ".)")
+
+definitionListItem :: OrgParser Int
+ -> OrgParser (F (Inlines, [Blocks]))
+definitionListItem parseMarkerGetLength = try $ do
+ markerLength <- parseMarkerGetLength
+ term <- manyTill (noneOf "\n\r") (try $ string "::")
+ line1 <- anyLineNewline
+ blank <- option "" ("\n" <$ blankline)
+ cont <- concat <$> many (listContinuation markerLength)
+ term' <- parseFromString parseInlines term
+ contents' <- parseFromString parseBlocks $ line1 ++ blank ++ cont
+ return $ (,) <$> term' <*> fmap (:[]) contents'
+
+
+-- parse raw text for one list item, excluding start marker and continuations
+listItem :: OrgParser Int
+ -> OrgParser (F Blocks)
+listItem start = try $ do
+ markerLength <- try start
+ firstLine <- anyLineNewline
+ blank <- option "" ("\n" <$ blankline)
+ rest <- concat <$> many (listContinuation markerLength)
+ parseFromString parseBlocks $ firstLine ++ blank ++ rest
+
+-- continuation of a list item - indented and separated by blankline or endline.
+-- Note: nested lists are parsed as continuations.
+listContinuation :: Int
+ -> OrgParser String
+listContinuation markerLength = try $
+ notFollowedBy' blankline
+ *> (mappend <$> (concat <$> many1 listLine)
+ <*> many blankline)
+ where listLine = try $ indentWith markerLength *> anyLineNewline
+
+anyLineNewline :: OrgParser String
+anyLineNewline = (++ "\n") <$> anyLine
+
+
+--
+-- inline
+--
+
+inline :: OrgParser (F Inlines)
+inline =
+ choice [ whitespace
+ , linebreak
+ , cite
+ , footnote
+ , linkOrImage
+ , anchor
+ , inlineCodeBlock
+ , str
+ , endline
+ , emph
+ , strong
+ , strikeout
+ , underline
+ , code
+ , math
+ , displayMath
+ , verbatim
+ , subscript
+ , superscript
+ , inlineLaTeX
+ , symbol
+ ] <* (guard =<< newlinesCountWithinLimits)
+ <?> "inline"
+
+parseInlines :: OrgParser (F Inlines)
+parseInlines = trimInlinesF . mconcat <$> many1 inline
+
+-- treat these as potentially non-text when parsing inline:
+specialChars :: [Char]
+specialChars = "\"$'()*+-,./:<=>[\\]^_{|}~"
+
+
+whitespace :: OrgParser (F Inlines)
+whitespace = pure B.space <$ skipMany1 spaceChar
+ <* updateLastPreCharPos
+ <* updateLastForbiddenCharPos
+ <?> "whitespace"
+
+linebreak :: OrgParser (F Inlines)
+linebreak = try $ pure B.linebreak <$ string "\\\\" <* skipSpaces <* newline
+
+str :: OrgParser (F Inlines)
+str = return . B.str <$> many1 (noneOf $ specialChars ++ "\n\r ")
+ <* updateLastStrPos
+
+-- | An endline character that can be treated as a space, not a structural
+-- break. This should reflect the values of the Emacs variable
+-- @org-element-pagaraph-separate@.
+endline :: OrgParser (F Inlines)
+endline = try $ do
+ newline
+ notFollowedBy blankline
+ notFollowedBy' exampleLine
+ notFollowedBy' hline
+ notFollowedBy' noteMarker
+ notFollowedBy' tableStart
+ notFollowedBy' drawerStart
+ notFollowedBy' headerStart
+ notFollowedBy' metaLineStart
+ notFollowedBy' latexEnvStart
+ notFollowedBy' commentLineStart
+ notFollowedBy' bulletListStart
+ notFollowedBy' orderedListStart
+ decEmphasisNewlinesCount
+ guard =<< newlinesCountWithinLimits
+ updateLastPreCharPos
+ return . return $ B.space
+
+cite :: OrgParser (F Inlines)
+cite = try $ do
+ guardEnabled Ext_citations
+ (cs, raw) <- withRaw normalCite
+ return $ (flip B.cite (B.text raw)) <$> cs
+
+normalCite :: OrgParser (F [Citation])
+normalCite = try $ char '['
+ *> skipSpaces
+ *> citeList
+ <* skipSpaces
+ <* char ']'
+
+citeList :: OrgParser (F [Citation])
+citeList = sequence <$> sepBy1 citation (try $ char ';' *> skipSpaces)
+
+citation :: OrgParser (F Citation)
+citation = try $ do
+ pref <- prefix
+ (suppress_author, key) <- citeKey
+ suff <- suffix
+ return $ do
+ x <- pref
+ y <- suff
+ return $ Citation{ citationId = key
+ , citationPrefix = B.toList x
+ , citationSuffix = B.toList y
+ , citationMode = if suppress_author
+ then SuppressAuthor
+ else NormalCitation
+ , citationNoteNum = 0
+ , citationHash = 0
+ }
+ where
+ prefix = trimInlinesF . mconcat <$>
+ manyTill inline (char ']' <|> (']' <$ lookAhead citeKey))
+ suffix = try $ do
+ hasSpace <- option False (notFollowedBy nonspaceChar >> return True)
+ skipSpaces
+ rest <- trimInlinesF . mconcat <$>
+ many (notFollowedBy (oneOf ";]") *> inline)
+ return $ if hasSpace
+ then (B.space <>) <$> rest
+ else rest
+
+footnote :: OrgParser (F Inlines)
+footnote = try $ inlineNote <|> referencedNote
+
+inlineNote :: OrgParser (F Inlines)
+inlineNote = try $ do
+ string "[fn:"
+ ref <- many alphaNum
+ char ':'
+ note <- fmap B.para . trimInlinesF . mconcat <$> many1Till inline (char ']')
+ when (not $ null ref) $
+ addToNotesTable ("fn:" ++ ref, note)
+ return $ B.note <$> note
+
+referencedNote :: OrgParser (F Inlines)
+referencedNote = try $ do
+ ref <- noteMarker
+ return $ do
+ notes <- asksF orgStateNotes'
+ case lookup ref notes of
+ Nothing -> return $ B.str $ "[" ++ ref ++ "]"
+ Just contents -> do
+ st <- askF
+ let contents' = runF contents st{ orgStateNotes' = [] }
+ return $ B.note contents'
+
+noteMarker :: OrgParser String
+noteMarker = try $ do
+ char '['
+ choice [ many1Till digit (char ']')
+ , (++) <$> string "fn:"
+ <*> many1Till (noneOf "\n\r\t ") (char ']')
+ ]
+
+linkOrImage :: OrgParser (F Inlines)
+linkOrImage = explicitOrImageLink
+ <|> selflinkOrImage
+ <|> angleLink
+ <|> plainLink
+ <?> "link or image"
+
+explicitOrImageLink :: OrgParser (F Inlines)
+explicitOrImageLink = try $ do
+ char '['
+ srcF <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget
+ title <- enclosedRaw (char '[') (char ']')
+ title' <- parseFromString (mconcat <$> many inline) title
+ char ']'
+ return $ do
+ src <- srcF
+ if isImageFilename src && isImageFilename title
+ then pure $ B.link src "" $ B.image title mempty mempty
+ else linkToInlinesF src =<< title'
+
+selflinkOrImage :: OrgParser (F Inlines)
+selflinkOrImage = try $ do
+ src <- char '[' *> linkTarget <* char ']'
+ return $ linkToInlinesF src (B.str src)
+
+plainLink :: OrgParser (F Inlines)
+plainLink = try $ do
+ (orig, src) <- uri
+ returnF $ B.link src "" (B.str orig)
+
+angleLink :: OrgParser (F Inlines)
+angleLink = try $ do
+ char '<'
+ link <- plainLink
+ char '>'
+ return link
+
+selfTarget :: OrgParser String
+selfTarget = try $ char '[' *> linkTarget <* char ']'
+
+linkTarget :: OrgParser String
+linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]")
+
+possiblyEmptyLinkTarget :: OrgParser String
+possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]")
+
+applyCustomLinkFormat :: String -> OrgParser (F String)
+applyCustomLinkFormat link = do
+ let (linkType, rest) = break (== ':') link
+ return $ do
+ formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters
+ return $ maybe link ($ drop 1 rest) formatter
+
+-- TODO: might be a lot smarter/cleaner to use parsec and ADTs for this kind
+-- of parsing.
+linkToInlinesF :: String -> Inlines -> F Inlines
+linkToInlinesF s =
+ case s of
+ "" -> pure . B.link "" ""
+ ('#':_) -> pure . B.link s ""
+ _ | isImageFilename s -> const . pure $ B.image s "" ""
+ _ | isFileLink s -> pure . B.link (dropLinkType s) ""
+ _ | isUri s -> pure . B.link s ""
+ _ | isAbsoluteFilePath s -> pure . B.link ("file://" ++ s) ""
+ _ | isRelativeFilePath s -> pure . B.link s ""
+ _ -> internalLink s
+
+isFileLink :: String -> Bool
+isFileLink s = ("file:" `isPrefixOf` s) && not ("file://" `isPrefixOf` s)
+
+dropLinkType :: String -> String
+dropLinkType = tail . snd . break (== ':')
+
+isRelativeFilePath :: String -> Bool
+isRelativeFilePath s = (("./" `isPrefixOf` s) || ("../" `isPrefixOf` s)) &&
+ (':' `notElem` s)
+
+isUri :: String -> Bool
+isUri s = let (scheme, path) = break (== ':') s
+ in all (\c -> isAlphaNum c || c `elem` ".-") scheme
+ && not (null path)
+
+isAbsoluteFilePath :: String -> Bool
+isAbsoluteFilePath = ('/' ==) . head
+
+isImageFilename :: String -> Bool
+isImageFilename filename =
+ any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions &&
+ (any (\x -> (x++":") `isPrefixOf` filename) protocols ||
+ ':' `notElem` filename)
+ where
+ imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ]
+ protocols = [ "file", "http", "https" ]
+
+internalLink :: String -> Inlines -> F Inlines
+internalLink link title = do
+ anchorB <- (link `elem`) <$> asksF orgStateAnchorIds
+ if anchorB
+ then return $ B.link ('#':link) "" title
+ else return $ B.emph title
+
+-- | Parse an anchor like @<<anchor-id>>@ and return an empty span with
+-- @anchor-id@ set as id. Legal anchors in org-mode are defined through
+-- @org-target-regexp@, which is fairly liberal. Since no link is created if
+-- @anchor-id@ contains spaces, we are more restrictive in what is accepted as
+-- an anchor.
+
+anchor :: OrgParser (F Inlines)
+anchor = try $ do
+ anchorId <- parseAnchor
+ recordAnchorId anchorId
+ returnF $ B.spanWith (solidify anchorId, [], []) mempty
+ where
+ parseAnchor = string "<<"
+ *> many1 (noneOf "\t\n\r<>\"' ")
+ <* string ">>"
+ <* skipSpaces
+
+-- | Replace every char but [a-zA-Z0-9_.-:] with a hypen '-'. This mirrors
+-- the org function @org-export-solidify-link-text@.
+
+solidify :: String -> String
+solidify = map replaceSpecialChar
+ where replaceSpecialChar c
+ | isAlphaNum c = c
+ | c `elem` "_.-:" = c
+ | otherwise = '-'
+
+-- | Parses an inline code block and marks it as an babel block.
+inlineCodeBlock :: OrgParser (F Inlines)
+inlineCodeBlock = try $ do
+ string "src_"
+ lang <- many1 orgArgWordChar
+ opts <- option [] $ enclosedByPair '[' ']' inlineBlockOption
+ inlineCode <- enclosedByPair '{' '}' (noneOf "\n\r")
+ let attrClasses = [translateLang lang, rundocBlockClass]
+ let attrKeyVal = map toRundocAttrib (("language", lang) : opts)
+ returnF $ B.codeWith ("", attrClasses, attrKeyVal) inlineCode
+
+enclosedByPair :: Char -- ^ opening char
+ -> Char -- ^ closing char
+ -> OrgParser a -- ^ parser
+ -> OrgParser [a]
+enclosedByPair s e p = char s *> many1Till p (char e)
+
+emph :: OrgParser (F Inlines)
+emph = fmap B.emph <$> emphasisBetween '/'
+
+strong :: OrgParser (F Inlines)
+strong = fmap B.strong <$> emphasisBetween '*'
+
+strikeout :: OrgParser (F Inlines)
+strikeout = fmap B.strikeout <$> emphasisBetween '+'
+
+-- There is no underline, so we use strong instead.
+underline :: OrgParser (F Inlines)
+underline = fmap B.strong <$> emphasisBetween '_'
+
+verbatim :: OrgParser (F Inlines)
+verbatim = return . B.code <$> verbatimBetween '='
+
+code :: OrgParser (F Inlines)
+code = return . B.code <$> verbatimBetween '~'
+
+subscript :: OrgParser (F Inlines)
+subscript = fmap B.subscript <$> try (char '_' *> subOrSuperExpr)
+
+superscript :: OrgParser (F Inlines)
+superscript = fmap B.superscript <$> try (char '^' *> subOrSuperExpr)
+
+math :: OrgParser (F Inlines)
+math = return . B.math <$> choice [ math1CharBetween '$'
+ , mathStringBetween '$'
+ , rawMathBetween "\\(" "\\)"
+ ]
+
+displayMath :: OrgParser (F Inlines)
+displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]"
+ , rawMathBetween "$$" "$$"
+ ]
+symbol :: OrgParser (F Inlines)
+symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions)
+ where updatePositions c = do
+ when (c `elem` emphasisPreChars) updateLastPreCharPos
+ when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos
+ return c
+
+emphasisBetween :: Char
+ -> OrgParser (F Inlines)
+emphasisBetween c = try $ do
+ startEmphasisNewlinesCounting emphasisAllowedNewlines
+ res <- enclosedInlines (emphasisStart c) (emphasisEnd c)
+ isTopLevelEmphasis <- null . orgStateEmphasisCharStack <$> getState
+ when isTopLevelEmphasis
+ resetEmphasisNewlines
+ return res
+
+verbatimBetween :: Char
+ -> OrgParser String
+verbatimBetween c = try $
+ emphasisStart c *>
+ many1TillNOrLessNewlines 1 (noneOf "\n\r") (emphasisEnd c)
+
+-- | Parses a raw string delimited by @c@ using Org's math rules
+mathStringBetween :: Char
+ -> OrgParser String
+mathStringBetween c = try $ do
+ mathStart c
+ body <- many1TillNOrLessNewlines mathAllowedNewlines
+ (noneOf (c:"\n\r"))
+ (lookAhead $ mathEnd c)
+ final <- mathEnd c
+ return $ body ++ [final]
+
+-- | Parse a single character between @c@ using math rules
+math1CharBetween :: Char
+ -> OrgParser String
+math1CharBetween c = try $ do
+ char c
+ res <- noneOf $ c:mathForbiddenBorderChars
+ char c
+ eof <|> () <$ lookAhead (oneOf mathPostChars)
+ return [res]
+
+rawMathBetween :: String
+ -> String
+ -> OrgParser String
+rawMathBetween s e = try $ string s *> manyTill anyChar (try $ string e)
+
+-- | Parses the start (opening character) of emphasis
+emphasisStart :: Char -> OrgParser Char
+emphasisStart c = try $ do
+ guard =<< afterEmphasisPreChar
+ guard =<< notAfterString
+ char c
+ lookAhead (noneOf emphasisForbiddenBorderChars)
+ pushToInlineCharStack c
+ return c
+
+-- | Parses the closing character of emphasis
+emphasisEnd :: Char -> OrgParser Char
+emphasisEnd c = try $ do
+ guard =<< notAfterForbiddenBorderChar
+ char c
+ eof <|> () <$ lookAhead acceptablePostChars
+ updateLastStrPos
+ popInlineCharStack
+ return c
+ where acceptablePostChars =
+ surroundingEmphasisChar >>= \x -> oneOf (x ++ emphasisPostChars)
+
+mathStart :: Char -> OrgParser Char
+mathStart c = try $
+ char c <* notFollowedBy' (oneOf (c:mathForbiddenBorderChars))
+
+mathEnd :: Char -> OrgParser Char
+mathEnd c = try $ do
+ res <- noneOf (c:mathForbiddenBorderChars)
+ char c
+ eof <|> () <$ lookAhead (oneOf mathPostChars)
+ return res
+
+
+enclosedInlines :: OrgParser a
+ -> OrgParser b
+ -> OrgParser (F Inlines)
+enclosedInlines start end = try $
+ trimInlinesF . mconcat <$> enclosed start end inline
+
+enclosedRaw :: OrgParser a
+ -> OrgParser b
+ -> OrgParser String
+enclosedRaw start end = try $
+ start *> (onSingleLine <|> spanningTwoLines)
+ where onSingleLine = try $ many1Till (noneOf "\n\r") end
+ spanningTwoLines = try $
+ anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine
+
+-- | Like many1Till, but parses at most @n+1@ lines. @p@ must not consume
+-- newlines.
+many1TillNOrLessNewlines :: Int
+ -> OrgParser Char
+ -> OrgParser a
+ -> OrgParser String
+many1TillNOrLessNewlines n p end = try $
+ nMoreLines (Just n) mempty >>= oneOrMore
+ where
+ nMoreLines Nothing cs = return cs
+ nMoreLines (Just 0) cs = try $ (cs ++) <$> finalLine
+ nMoreLines k cs = try $ (final k cs <|> rest k cs)
+ >>= uncurry nMoreLines
+ final _ cs = (\x -> (Nothing, cs ++ x)) <$> try finalLine
+ rest m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> try (manyTill p P.newline)
+ finalLine = try $ manyTill p end
+ minus1 k = k - 1
+ oneOrMore cs = guard (not $ null cs) *> return cs
+
+-- Org allows customization of the way it reads emphasis. We use the defaults
+-- here (see, e.g., the Emacs Lisp variable `org-emphasis-regexp-components`
+-- for details).
+
+-- | Chars allowed to occur before emphasis (spaces and newlines are ok, too)
+emphasisPreChars :: [Char]
+emphasisPreChars = "\t \"'({"
+
+-- | Chars allowed at after emphasis
+emphasisPostChars :: [Char]
+emphasisPostChars = "\t\n !\"'),-.:;?\\}"
+
+-- | Chars not allowed at the (inner) border of emphasis
+emphasisForbiddenBorderChars :: [Char]
+emphasisForbiddenBorderChars = "\t\n\r \"',"
+
+-- | The maximum number of newlines within
+emphasisAllowedNewlines :: Int
+emphasisAllowedNewlines = 1
+
+-- LaTeX-style math: see `org-latex-regexps` for details
+
+-- | Chars allowed after an inline ($...$) math statement
+mathPostChars :: [Char]
+mathPostChars = "\t\n \"'),-.:;?"
+
+-- | Chars not allowed at the (inner) border of math
+mathForbiddenBorderChars :: [Char]
+mathForbiddenBorderChars = "\t\n\r ,;.$"
+
+-- | Maximum number of newlines in an inline math statement
+mathAllowedNewlines :: Int
+mathAllowedNewlines = 2
+
+-- | Whether we are right behind a char allowed before emphasis
+afterEmphasisPreChar :: OrgParser Bool
+afterEmphasisPreChar = do
+ pos <- getPosition
+ lastPrePos <- orgStateLastPreCharPos <$> getState
+ return . fromMaybe True $ (== pos) <$> lastPrePos
+
+-- | Whether the parser is right after a forbidden border char
+notAfterForbiddenBorderChar :: OrgParser Bool
+notAfterForbiddenBorderChar = do
+ pos <- getPosition
+ lastFBCPos <- orgStateLastForbiddenCharPos <$> getState
+ return $ lastFBCPos /= Just pos
+
+-- | Read a sub- or superscript expression
+subOrSuperExpr :: OrgParser (F Inlines)
+subOrSuperExpr = try $
+ choice [ id <$> charsInBalanced '{' '}' (noneOf "\n\r")
+ , enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r")
+ , simpleSubOrSuperString
+ ] >>= parseFromString (mconcat <$> many inline)
+ where enclosing (left, right) s = left : s ++ [right]
+
+simpleSubOrSuperString :: OrgParser String
+simpleSubOrSuperString = try $
+ choice [ string "*"
+ , mappend <$> option [] ((:[]) <$> oneOf "+-")
+ <*> many1 alphaNum
+ ]
+
+inlineLaTeX :: OrgParser (F Inlines)
+inlineLaTeX = try $ do
+ cmd <- inlineLaTeXCommand
+ maybe mzero returnF $
+ parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` parseAsInlineLaTeX cmd
+ where
+ parseAsMath :: String -> Maybe Inlines
+ parseAsMath cs = B.fromList <$> texMathToPandoc cs
+
+ parseAsInlineLaTeX :: String -> Maybe Inlines
+ parseAsInlineLaTeX cs = maybeRight $ runParser inlineCommand state "" cs
+
+ parseAsMathMLSym :: String -> Maybe Inlines
+ parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs)
+ -- dropWhileEnd would be nice here, but it's not available before base 4.5
+ where clean = reverse . dropWhile (`elem` "{}") . reverse . drop 1
+
+ state :: ParserState
+ state = def{ stateOptions = def{ readerParseRaw = True }}
+
+ texMathToPandoc inp = (maybeRight $ readTeX inp) >>=
+ writePandoc DisplayInline
+
+maybeRight :: Either a b -> Maybe b
+maybeRight = either (const Nothing) Just
+
+inlineLaTeXCommand :: OrgParser String
+inlineLaTeXCommand = try $ do
+ rest <- getInput
+ case runParser rawLaTeXInline def "source" rest of
+ Right (RawInline _ cs) -> do
+ let len = length cs
+ count len anyChar
+ return cs
+ _ -> mzero
diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs
index c12a1493a..8bfc6f606 100644
--- a/src/Text/Pandoc/Readers/RST.hs
+++ b/src/Text/Pandoc/Readers/RST.hs
@@ -1,6 +1,6 @@
{-# LANGUAGE OverloadedStrings #-}
{-
-Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu>
+Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Text.Pandoc.Readers.RST
- Copyright : Copyright (C) 2006-2010 John MacFarlane
+ Copyright : Copyright (C) 2006-2014 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
@@ -29,7 +29,8 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Conversion from reStructuredText to 'Pandoc' document.
-}
module Text.Pandoc.Readers.RST (
- readRST
+ readRST,
+ readRSTWithWarnings
) where
import Text.Pandoc.Definition
import Text.Pandoc.Builder (setMeta, fromList)
@@ -38,15 +39,16 @@ import Text.Pandoc.Parsing
import Text.Pandoc.Options
import Control.Monad ( when, liftM, guard, mzero )
import Data.List ( findIndex, intersperse, intercalate,
- transpose, sort, deleteFirstsBy, isSuffixOf )
+ transpose, sort, deleteFirstsBy, isSuffixOf , nub, union)
+import Data.Maybe (fromMaybe)
import qualified Data.Map as M
import Text.Printf ( printf )
-import Control.Applicative ((<$>), (<$), (<*), (*>))
+import Control.Applicative ((<$>), (<$), (<*), (*>), (<*>), pure)
import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>))
import qualified Text.Pandoc.Builder as B
import Data.Monoid (mconcat, mempty)
import Data.Sequence (viewr, ViewR(..))
-import Data.Char (toLower)
+import Data.Char (toLower, isHexDigit, isSpace)
-- | Parse reStructuredText string and return Pandoc document.
readRST :: ReaderOptions -- ^ Reader options
@@ -54,6 +56,9 @@ readRST :: ReaderOptions -- ^ Reader options
-> Pandoc
readRST opts s = (readWith parseRST) def{ stateOptions = opts } (s ++ "\n\n")
+readRSTWithWarnings :: ReaderOptions -> String -> (Pandoc, [String])
+readRSTWithWarnings opts s = (readWithWarnings parseRST) def{ stateOptions = opts } (s ++ "\n\n")
+
type RSTParser = Parser [Char] ParserState
--
@@ -112,15 +117,16 @@ titleTransform (bs, meta) =
metaFromDefList :: [([Inline], [[Block]])] -> Meta -> Meta
metaFromDefList ds meta = adjustAuthors $ foldr f meta ds
where f (k,v) = setMeta (map toLower $ stringify k) (mconcat $ map fromList v)
- adjustAuthors (Meta metamap) = Meta $ M.adjust toPlain "author"
+ adjustAuthors (Meta metamap) = Meta $ M.adjust splitAuthors "author"
$ M.adjust toPlain "date"
$ M.adjust toPlain "title"
- $ M.adjust splitAuthors "authors"
+ $ M.mapKeys (\k -> if k == "authors" then "author" else k)
$ metamap
toPlain (MetaBlocks [Para xs]) = MetaInlines xs
toPlain x = x
- splitAuthors (MetaBlocks [Para xs]) = MetaList $ map MetaInlines
- $ splitAuthors' xs
+ splitAuthors (MetaBlocks [Para xs])
+ = MetaList $ map MetaInlines
+ $ splitAuthors' xs
splitAuthors x = x
splitAuthors' = map normalizeSpaces .
splitOnSemi . concatMap factorSemi
@@ -184,22 +190,22 @@ block = choice [ codeBlock
-- field list
--
-rawFieldListItem :: String -> RSTParser (String, String)
-rawFieldListItem indent = try $ do
- string indent
+rawFieldListItem :: Int -> RSTParser (String, String)
+rawFieldListItem minIndent = try $ do
+ indent <- length <$> many (char ' ')
+ guard $ indent >= minIndent
char ':'
name <- many1Till (noneOf "\n") (char ':')
(() <$ lookAhead newline) <|> skipMany1 spaceChar
first <- anyLine
- rest <- option "" $ try $ do lookAhead (string indent >> spaceChar)
+ rest <- option "" $ try $ do lookAhead (count indent (char ' ') >> spaceChar)
indentedBlock
let raw = (if null first then "" else (first ++ "\n")) ++ rest ++ "\n"
return (name, raw)
-fieldListItem :: String
- -> RSTParser (Inlines, [Blocks])
-fieldListItem indent = try $ do
- (name, raw) <- rawFieldListItem indent
+fieldListItem :: Int -> RSTParser (Inlines, [Blocks])
+fieldListItem minIndent = try $ do
+ (name, raw) <- rawFieldListItem minIndent
let term = B.str name
contents <- parseFromString parseBlocks raw
optional blanklines
@@ -207,7 +213,7 @@ fieldListItem indent = try $ do
fieldList :: RSTParser Blocks
fieldList = try $ do
- indent <- lookAhead $ many spaceChar
+ indent <- length <$> lookAhead (many spaceChar)
items <- many1 $ fieldListItem indent
case items of
[] -> return mempty
@@ -333,6 +339,13 @@ indentedBlock = try $ do
optional blanklines
return $ unlines lns
+quotedBlock :: Parser [Char] st [Char]
+quotedBlock = try $ do
+ quote <- lookAhead $ oneOf "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
+ lns <- many1 $ lookAhead (char quote) >> anyLine
+ optional blanklines
+ return $ unlines lns
+
codeBlockStart :: Parser [Char] st Char
codeBlockStart = string "::" >> blankline >> blankline
@@ -340,7 +353,8 @@ codeBlock :: Parser [Char] st Blocks
codeBlock = try $ codeBlockStart >> codeBlockBody
codeBlockBody :: Parser [Char] st Blocks
-codeBlockBody = try $ B.codeBlock . stripTrailingNewlines <$> indentedBlock
+codeBlockBody = try $ B.codeBlock . stripTrailingNewlines <$>
+ (indentedBlock <|> quotedBlock)
lhsCodeBlock :: RSTParser Blocks
lhsCodeBlock = try $ do
@@ -458,7 +472,7 @@ listItem :: RSTParser Int
listItem start = try $ do
(markerLength, first) <- rawListItem start
rest <- many (listContinuation markerLength)
- blanks <- choice [ try (many blankline >>~ lookAhead start),
+ blanks <- choice [ try (many blankline <* lookAhead start),
many1 blankline ] -- whole list must end with blank.
-- parsing with ListItemState forces markers at beginning of lines to
-- count as list item markers, even if not separated by blank space.
@@ -478,7 +492,7 @@ listItem start = try $ do
orderedList :: RSTParser Blocks
orderedList = try $ do
- (start, style, delim) <- lookAhead (anyOrderedListMarker >>~ spaceChar)
+ (start, style, delim) <- lookAhead (anyOrderedListMarker <* spaceChar)
items <- many1 (listItem (orderedListStart style delim))
let items' = compactify' items
return $ B.orderedListWith (start, style, delim) items'
@@ -511,7 +525,6 @@ directive = try $ do
-- TODO: line-block, parsed-literal, table, csv-table, list-table
-- date
-- include
--- class
-- title
directive' :: RSTParser Blocks
directive' = do
@@ -520,17 +533,17 @@ directive' = do
skipMany spaceChar
top <- many $ satisfy (/='\n')
<|> try (char '\n' <*
- notFollowedBy' (rawFieldListItem " ") <*
+ notFollowedBy' (rawFieldListItem 3) <*
count 3 (char ' ') <*
notFollowedBy blankline)
newline
- fields <- many $ rawFieldListItem " "
+ fields <- many $ rawFieldListItem 3
body <- option "" $ try $ blanklines >> indentedBlock
optional blanklines
let body' = body ++ "\n\n"
case label of
"raw" -> return $ B.rawBlock (trim top) (stripTrailingNewlines body)
- "role" -> return mempty
+ "role" -> addNewRole top $ map (\(k,v) -> (k, trim v)) fields
"container" -> parseFromString parseBlocks body'
"replace" -> B.para <$> -- consumed by substKey
parseFromString (trimInlines . mconcat <$> many inline)
@@ -575,12 +588,15 @@ directive' = do
role -> role })
"code" -> codeblock (lookup "number-lines" fields) (trim top) body
"code-block" -> codeblock (lookup "number-lines" fields) (trim top) body
+ "aafig" -> do
+ let attribs = ("", ["aafig"], map (\(k,v) -> (k, trimr v)) fields)
+ return $ B.codeBlockWith attribs $ stripTrailingNewlines body
"math" -> return $ B.para $ mconcat $ map B.displayMath
$ toChunks $ top ++ "\n\n" ++ body
"figure" -> do
(caption, legend) <- parseFromString extractCaption body'
let src = escapeURI $ trim top
- return $ B.para (B.image src "" caption) <> legend
+ return $ B.para (B.image src "fig:" caption) <> legend
"image" -> do
let src = escapeURI $ trim top
let alt = B.str $ maybe "image" trim $ lookup "alt" fields
@@ -589,9 +605,71 @@ directive' = do
Just t -> B.link (escapeURI $ trim t) ""
$ B.image src "" alt
Nothing -> B.image src "" alt
- _ -> return mempty
-
--- Can contain haracter codes as decimal numbers or
+ "class" -> do
+ let attrs = ("", (splitBy isSpace $ trim top), map (\(k,v) -> (k, trimr v)) fields)
+ -- directive content or the first immediately following element
+ children <- case body of
+ "" -> block
+ _ -> parseFromString parseBlocks body'
+ return $ B.divWith attrs children
+ other -> do
+ pos <- getPosition
+ addWarning (Just pos) $ "ignoring unknown directive: " ++ other
+ return mempty
+
+-- TODO:
+-- - Silently ignores illegal fields
+-- - Only supports :format: fields with a single format for :raw: roles,
+-- change Text.Pandoc.Definition.Format to fix
+addNewRole :: String -> [(String, String)] -> RSTParser Blocks
+addNewRole roleString fields = do
+ (role, parentRole) <- parseFromString inheritedRole roleString
+ customRoles <- stateRstCustomRoles <$> getState
+ let (baseRole, baseFmt, baseAttr) =
+ maybe (parentRole, Nothing, nullAttr) id $
+ M.lookup parentRole customRoles
+ fmt = if parentRole == "raw" then lookup "format" fields else baseFmt
+ annotate :: [String] -> [String]
+ annotate = maybe id (:) $
+ if parentRole == "code"
+ then lookup "language" fields
+ else Nothing
+ attr = let (ident, classes, keyValues) = baseAttr
+ -- nub in case role name & language class are the same
+ in (ident, nub . (role :) . annotate $ classes, keyValues)
+
+ -- warn about syntax we ignore
+ flip mapM_ fields $ \(key, _) -> case key of
+ "language" -> when (parentRole /= "code") $ addWarning Nothing $
+ "ignoring :language: field because the parent of role :" ++
+ role ++ ": is :" ++ parentRole ++ ": not :code:"
+ "format" -> when (parentRole /= "raw") $ addWarning Nothing $
+ "ignoring :format: field because the parent of role :" ++
+ role ++ ": is :" ++ parentRole ++ ": not :raw:"
+ _ -> addWarning Nothing $ "ignoring unknown field :" ++ key ++
+ ": in definition of role :" ++ role ++ ": in"
+ when (parentRole == "raw" && countKeys "format" > 1) $
+ addWarning Nothing $
+ "ignoring :format: fields after the first in the definition of role :"
+ ++ role ++": in"
+ when (parentRole == "code" && countKeys "language" > 1) $
+ addWarning Nothing $
+ "ignoring :language: fields after the first in the definition of role :"
+ ++ role ++": in"
+
+ updateState $ \s -> s {
+ stateRstCustomRoles =
+ M.insert role (baseRole, fmt, attr) customRoles
+ }
+
+ return $ B.singleton Null
+ where
+ countKeys k = length . filter (== k) . map fst $ fields
+ inheritedRole =
+ (,) <$> roleName <*> ((char '(' *> roleName <* char ')') <|> pure "span")
+
+
+-- Can contain character codes as decimal numbers or
-- hexadecimal numbers, prefixed by 0x, x, \x, U+, u, or \u
-- or as XML-style hexadecimal character entities, e.g. &#x1a2b;
-- or text, which is used as-is. Comments start with ..
@@ -620,9 +698,6 @@ extractUnicodeChar s = maybe Nothing (\c -> Just (c,rest)) mbc
where (ds,rest) = span isHexDigit s
mbc = safeRead ('\'':'\\':'x':ds ++ "'")
-isHexDigit :: Char -> Bool
-isHexDigit c = c `elem` "0123456789ABCDEFabcdef"
-
extractCaption :: RSTParser (Inlines, Blocks)
extractCaption = do
capt <- trimInlines . mconcat <$> many inline
@@ -711,7 +786,7 @@ simpleReferenceName = do
referenceName :: RSTParser Inlines
referenceName = quotedReferenceName <|>
- (try $ simpleReferenceName >>~ lookAhead (char ':')) <|>
+ (try $ simpleReferenceName <* lookAhead (char ':')) <|>
unquotedReferenceName
referenceKey :: RSTParser [Char]
@@ -930,17 +1005,61 @@ strong = B.strong . trimInlines . mconcat <$>
-- Note, this doesn't precisely implement the complex rule in
-- http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules
-- but it should be good enough for most purposes
+--
+-- TODO:
+-- - Classes are silently discarded in addNewRole
+-- - Lacks sensible implementation for title-reference (which is the default)
+-- - Allows direct use of the :raw: role, rST only allows inherited use.
interpretedRole :: RSTParser Inlines
interpretedRole = try $ do
(role, contents) <- roleBefore <|> roleAfter
- case role of
- "sup" -> return $ B.superscript $ B.str contents
- "sub" -> return $ B.subscript $ B.str contents
- "math" -> return $ B.math contents
- _ -> return $ B.str contents --unknown
+ renderRole contents Nothing role nullAttr
+
+renderRole :: String -> Maybe String -> String -> Attr -> RSTParser Inlines
+renderRole contents fmt role attr = case role of
+ "sup" -> return $ B.superscript $ B.str contents
+ "superscript" -> return $ B.superscript $ B.str contents
+ "sub" -> return $ B.subscript $ B.str contents
+ "subscript" -> return $ B.subscript $ B.str contents
+ "emphasis" -> return $ B.emph $ B.str contents
+ "strong" -> return $ B.strong $ B.str contents
+ "rfc-reference" -> return $ rfcLink contents
+ "RFC" -> return $ rfcLink contents
+ "pep-reference" -> return $ pepLink contents
+ "PEP" -> return $ pepLink contents
+ "literal" -> return $ B.codeWith attr contents
+ "math" -> return $ B.math contents
+ "title-reference" -> titleRef contents
+ "title" -> titleRef contents
+ "t" -> titleRef contents
+ "code" -> return $ B.codeWith (addClass "sourceCode" attr) contents
+ "span" -> return $ B.spanWith attr $ B.str contents
+ "raw" -> return $ B.rawInline (fromMaybe "" fmt) contents
+ custom -> do
+ customRoles <- stateRstCustomRoles <$> getState
+ case M.lookup custom customRoles of
+ Just (newRole, newFmt, newAttr) ->
+ renderRole contents newFmt newRole newAttr
+ Nothing -> do
+ pos <- getPosition
+ addWarning (Just pos) $ "ignoring unknown role :" ++ custom ++ ": in"
+ return $ B.str contents -- Undefined role
+ where
+ titleRef ref = return $ B.str ref -- FIXME: Not a sensible behaviour
+ rfcLink rfcNo = B.link rfcUrl ("RFC " ++ rfcNo) $ B.str ("RFC " ++ rfcNo)
+ where rfcUrl = "http://www.faqs.org/rfcs/rfc" ++ rfcNo ++ ".html"
+ pepLink pepNo = B.link pepUrl ("PEP " ++ pepNo) $ B.str ("PEP " ++ pepNo)
+ where padNo = replicate (4 - length pepNo) '0' ++ pepNo
+ pepUrl = "http://www.python.org/dev/peps/pep-" ++ padNo ++ "/"
+
+addClass :: String -> Attr -> Attr
+addClass c (ident, classes, keyValues) = (ident, union classes [c], keyValues)
+
+roleName :: RSTParser String
+roleName = many1 (letter <|> char '-')
roleMarker :: RSTParser String
-roleMarker = char ':' *> many1Till (letter <|> char '-') (char ':')
+roleMarker = char ':' *> roleName <* char ':'
roleBefore :: RSTParser (String,String)
roleBefore = try $ do
@@ -1001,7 +1120,7 @@ explicitLink = try $ do
referenceLink :: RSTParser Inlines
referenceLink = try $ do
- (label',ref) <- withRaw (quotedReferenceName <|> simpleReferenceName) >>~
+ (label',ref) <- withRaw (quotedReferenceName <|> simpleReferenceName) <*
char '_'
state <- getState
let keyTable = stateKeys state
@@ -1069,7 +1188,7 @@ smart :: RSTParser Inlines
smart = do
getOption readerSmart >>= guard
doubleQuoted <|> singleQuoted <|>
- choice (map (B.singleton <$>) [apostrophe, dash, ellipses])
+ choice [apostrophe, dash, ellipses]
singleQuoted :: RSTParser Inlines
singleQuoted = try $ do
diff --git a/src/Text/Pandoc/Readers/TWiki.hs b/src/Text/Pandoc/Readers/TWiki.hs
new file mode 100644
index 000000000..c2325c0ea
--- /dev/null
+++ b/src/Text/Pandoc/Readers/TWiki.hs
@@ -0,0 +1,526 @@
+{-# LANGUAGE RelaxedPolyRec, FlexibleInstances, TypeSynonymInstances #-}
+-- RelaxedPolyRec needed for inlinesBetween on GHC < 7
+{-
+ Copyright (C) 2014 Alexander Sulfrian <alexander.sulfrian@fu-berlin.de>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.TWiki
+ Copyright : Copyright (C) 2014 Alexander Sulfrian
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Alexander Sulfrian <alexander.sulfrian@fu-berlin.de>
+ Stability : alpha
+ Portability : portable
+
+Conversion of twiki text to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.TWiki ( readTWiki
+ , readTWikiWithWarnings
+ ) where
+
+import Text.Pandoc.Definition
+import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Options
+import Text.Pandoc.Parsing hiding (enclosed, macro, nested)
+import Text.Pandoc.Readers.HTML (htmlTag, isCommentTag)
+import Data.Monoid (Monoid, mconcat, mempty)
+import Control.Applicative ((<$>), (<*), (*>), (<$))
+import Control.Monad
+import Text.Printf (printf)
+import Debug.Trace (trace)
+import Text.Pandoc.XML (fromEntities)
+import Data.Maybe (fromMaybe)
+import Text.HTML.TagSoup
+import Data.Char (isAlphaNum)
+import qualified Data.Foldable as F
+
+-- | Read twiki from an input string and return a Pandoc document.
+readTWiki :: ReaderOptions -- ^ Reader options
+ -> String -- ^ String to parse (assuming @'\n'@ line endings)
+ -> Pandoc
+readTWiki opts s =
+ (readWith parseTWiki) def{ stateOptions = opts } (s ++ "\n\n")
+
+readTWikiWithWarnings :: ReaderOptions -- ^ Reader options
+ -> String -- ^ String to parse (assuming @'\n'@ line endings)
+ -> (Pandoc, [String])
+readTWikiWithWarnings opts s =
+ (readWith parseTWikiWithWarnings) def{ stateOptions = opts } (s ++ "\n\n")
+ where parseTWikiWithWarnings = do
+ doc <- parseTWiki
+ warnings <- stateWarnings <$> getState
+ return (doc, warnings)
+
+type TWParser = Parser [Char] ParserState
+
+--
+-- utility functions
+--
+
+tryMsg :: String -> TWParser a -> TWParser a
+tryMsg msg p = try p <?> msg
+
+skip :: TWParser a -> TWParser ()
+skip parser = parser >> return ()
+
+nested :: TWParser a -> TWParser a
+nested p = do
+ nestlevel <- stateMaxNestingLevel <$> getState
+ guard $ nestlevel > 0
+ updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 }
+ res <- p
+ updateState $ \st -> st{ stateMaxNestingLevel = nestlevel }
+ return res
+
+htmlElement :: String -> TWParser (Attr, String)
+htmlElement tag = tryMsg tag $ do
+ (TagOpen _ attr, _) <- htmlTag (~== TagOpen tag [])
+ content <- manyTill anyChar (endtag <|> endofinput)
+ return (htmlAttrToPandoc attr, trim content)
+ where
+ endtag = skip $ htmlTag (~== TagClose tag)
+ endofinput = lookAhead $ try $ skipMany blankline >> skipSpaces >> eof
+ trim = dropWhile (=='\n') . reverse . dropWhile (=='\n') . reverse
+
+htmlAttrToPandoc :: [Attribute String] -> Attr
+htmlAttrToPandoc attrs = (ident, classes, keyvals)
+ where
+ ident = fromMaybe "" $ lookup "id" attrs
+ classes = maybe [] words $ lookup "class" attrs
+ keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"]
+
+parseHtmlContentWithAttrs :: String -> TWParser a -> TWParser (Attr, [a])
+parseHtmlContentWithAttrs tag parser = do
+ (attr, content) <- htmlElement tag
+ parsedContent <- try $ parseContent content
+ return (attr, parsedContent)
+ where
+ parseContent = parseFromString $ nested $ manyTill parser endOfContent
+ endOfContent = try $ skipMany blankline >> skipSpaces >> eof
+
+parseHtmlContent :: String -> TWParser a -> TWParser [a]
+parseHtmlContent tag p = parseHtmlContentWithAttrs tag p >>= return . snd
+
+--
+-- main parser
+--
+
+parseTWiki :: TWParser Pandoc
+parseTWiki = do
+ bs <- mconcat <$> many block
+ spaces
+ eof
+ return $ B.doc bs
+
+
+--
+-- block parsers
+--
+
+block :: TWParser B.Blocks
+block = do
+ tr <- getOption readerTrace
+ pos <- getPosition
+ res <- mempty <$ skipMany1 blankline
+ <|> blockElements
+ <|> para
+ skipMany blankline
+ when tr $
+ trace (printf "line %d: %s" (sourceLine pos)
+ (take 60 $ show $ B.toList res)) (return ())
+ return res
+
+blockElements :: TWParser B.Blocks
+blockElements = choice [ separator
+ , header
+ , verbatim
+ , literal
+ , list ""
+ , table
+ , blockQuote
+ , noautolink
+ ]
+
+separator :: TWParser B.Blocks
+separator = tryMsg "separator" $ string "---" >> newline >> return B.horizontalRule
+
+header :: TWParser B.Blocks
+header = tryMsg "header" $ do
+ string "---"
+ level <- many1 (char '+') >>= return . length
+ guard $ level <= 6
+ classes <- option [] $ string "!!" >> return ["unnumbered"]
+ skipSpaces
+ content <- B.trimInlines . mconcat <$> manyTill inline newline
+ attr <- registerHeader ("", classes, []) content
+ return $ B.headerWith attr level $ content
+
+verbatim :: TWParser B.Blocks
+verbatim = (htmlElement "verbatim" <|> htmlElement "pre")
+ >>= return . (uncurry B.codeBlockWith)
+
+literal :: TWParser B.Blocks
+literal = htmlElement "literal" >>= return . rawBlock
+ where
+ format (_, _, kvs) = fromMaybe "html" $ lookup "format" kvs
+ rawBlock (attrs, content) = B.rawBlock (format attrs) content
+
+list :: String -> TWParser B.Blocks
+list prefix = choice [ bulletList prefix
+ , orderedList prefix
+ , definitionList prefix]
+
+definitionList :: String -> TWParser B.Blocks
+definitionList prefix = tryMsg "definitionList" $ do
+ indent <- lookAhead $ string prefix *> (many1 $ string " ") <* string "$ "
+ elements <- many $ parseDefinitionListItem (prefix ++ concat indent)
+ return $ B.definitionList elements
+ where
+ parseDefinitionListItem :: String -> TWParser (B.Inlines, [B.Blocks])
+ parseDefinitionListItem indent = do
+ string (indent ++ "$ ") >> skipSpaces
+ term <- many1Till inline $ string ": "
+ line <- listItemLine indent $ string "$ "
+ return $ (mconcat term, [line])
+
+bulletList :: String -> TWParser B.Blocks
+bulletList prefix = tryMsg "bulletList" $
+ parseList prefix (char '*') (char ' ')
+
+orderedList :: String -> TWParser B.Blocks
+orderedList prefix = tryMsg "orderedList" $
+ parseList prefix (oneOf "1iIaA") (string ". ")
+
+parseList :: Show a => String -> TWParser Char -> TWParser a -> TWParser B.Blocks
+parseList prefix marker delim = do
+ (indent, style) <- lookAhead $ string prefix *> listStyle <* delim
+ blocks <- many $ parseListItem (prefix ++ indent) (char style <* delim)
+ return $ case style of
+ '1' -> B.orderedListWith (1, DefaultStyle, DefaultDelim) blocks
+ 'i' -> B.orderedListWith (1, LowerRoman, DefaultDelim) blocks
+ 'I' -> B.orderedListWith (1, UpperRoman, DefaultDelim) blocks
+ 'a' -> B.orderedListWith (1, LowerAlpha, DefaultDelim) blocks
+ 'A' -> B.orderedListWith (1, UpperAlpha, DefaultDelim) blocks
+ _ -> B.bulletList blocks
+ where
+ listStyle = do
+ indent <- many1 $ string " "
+ style <- marker
+ return (concat indent, style)
+
+parseListItem :: Show a => String -> TWParser a -> TWParser B.Blocks
+parseListItem prefix marker = string prefix >> marker >> listItemLine prefix marker
+
+listItemLine :: Show a => String -> TWParser a -> TWParser B.Blocks
+listItemLine prefix marker = lineContent >>= parseContent >>= return . mconcat
+ where
+ lineContent = do
+ content <- anyLine
+ continuation <- optionMaybe listContinuation
+ return $ filterSpaces content ++ "\n" ++ (maybe "" (" " ++) continuation)
+ filterSpaces = reverse . dropWhile (== ' ') . reverse
+ listContinuation = notFollowedBy (string prefix >> marker) >>
+ string " " >> lineContent
+ parseContent = parseFromString $ many1 $ nestedList <|> parseInline
+ parseInline = many1Till inline (lastNewline <|> newlineBeforeNestedList) >>=
+ return . B.plain . mconcat
+ nestedList = list prefix
+ lastNewline = try $ char '\n' <* eof
+ newlineBeforeNestedList = try $ char '\n' <* lookAhead nestedList
+
+table :: TWParser B.Blocks
+table = try $ do
+ tableHead <- optionMaybe $ many1Till tableParseHeader newline >>= return . unzip
+ rows <- many1 tableParseRow
+ return $ buildTable mempty rows $ fromMaybe (align rows, columns rows) tableHead
+ where
+ buildTable caption rows (aligns, heads)
+ = B.table caption aligns heads rows
+ align rows = replicate (columCount rows) (AlignDefault, 0)
+ columns rows = replicate (columCount rows) mempty
+ columCount rows = length $ head rows
+
+tableParseHeader :: TWParser ((Alignment, Double), B.Blocks)
+tableParseHeader = try $ do
+ char '|'
+ leftSpaces <- many spaceChar >>= return . length
+ char '*'
+ content <- tableColumnContent (char '*' >> skipSpaces >> char '|')
+ char '*'
+ rightSpaces <- many spaceChar >>= return . length
+ optional tableEndOfRow
+ return (tableAlign leftSpaces rightSpaces, content)
+ where
+ tableAlign left right
+ | left >= 2 && left == right = (AlignCenter, 0)
+ | left > right = (AlignRight, 0)
+ | otherwise = (AlignLeft, 0)
+
+tableParseRow :: TWParser [B.Blocks]
+tableParseRow = many1Till tableParseColumn newline
+
+tableParseColumn :: TWParser B.Blocks
+tableParseColumn = char '|' *> skipSpaces *>
+ tableColumnContent (skipSpaces >> char '|')
+ <* skipSpaces <* optional tableEndOfRow
+
+tableEndOfRow :: TWParser Char
+tableEndOfRow = lookAhead (try $ char '|' >> char '\n') >> char '|'
+
+tableColumnContent :: Show a => TWParser a -> TWParser B.Blocks
+tableColumnContent end = manyTill content (lookAhead $ try end) >>= return . B.plain . mconcat
+ where
+ content = continuation <|> inline
+ continuation = try $ char '\\' >> newline >> return mempty
+
+blockQuote :: TWParser B.Blocks
+blockQuote = parseHtmlContent "blockquote" block >>= return . B.blockQuote . mconcat
+
+noautolink :: TWParser B.Blocks
+noautolink = do
+ (_, content) <- htmlElement "noautolink"
+ st <- getState
+ setState $ st{ stateAllowLinks = False }
+ blocks <- try $ parseContent content
+ setState $ st{ stateAllowLinks = True }
+ return $ mconcat blocks
+ where
+ parseContent = parseFromString $ many $ block
+
+para :: TWParser B.Blocks
+para = many1Till inline endOfParaElement >>= return . result . mconcat
+ where
+ endOfParaElement = lookAhead $ endOfInput <|> endOfPara <|> newBlockElement
+ endOfInput = try $ skipMany blankline >> skipSpaces >> eof
+ endOfPara = try $ blankline >> skipMany1 blankline
+ newBlockElement = try $ blankline >> skip blockElements
+ result content = if F.all (==Space) content
+ then mempty
+ else B.para $ B.trimInlines content
+
+
+--
+-- inline parsers
+--
+
+inline :: TWParser B.Inlines
+inline = choice [ whitespace
+ , br
+ , macro
+ , strong
+ , strongHtml
+ , strongAndEmph
+ , emph
+ , emphHtml
+ , boldCode
+ , smart
+ , link
+ , htmlComment
+ , code
+ , codeHtml
+ , nop
+ , autoLink
+ , str
+ , symbol
+ ] <?> "inline"
+
+whitespace :: TWParser B.Inlines
+whitespace = (lb <|> regsp) >>= return
+ where lb = try $ skipMany spaceChar >> linebreak >> return B.space
+ regsp = try $ skipMany1 spaceChar >> return B.space
+
+br :: TWParser B.Inlines
+br = try $ string "%BR%" >> return B.linebreak
+
+linebreak :: TWParser B.Inlines
+linebreak = newline >> notFollowedBy newline >> (lastNewline <|> innerNewline)
+ where lastNewline = eof >> return mempty
+ innerNewline = return B.space
+
+between :: (Show b, Monoid c) => TWParser a -> TWParser b -> (TWParser b -> TWParser c) -> TWParser c
+between start end p =
+ mconcat <$> try (start >> notFollowedBy whitespace >> many1Till (p end) end)
+
+enclosed :: (Show a, Monoid b) => TWParser a -> (TWParser a -> TWParser b) -> TWParser b
+enclosed sep p = between sep (try $ sep <* endMarker) p
+ where
+ endMarker = lookAhead $ skip endSpace <|> skip (oneOf ".,!?:)|") <|> eof
+ endSpace = (spaceChar <|> newline) >> return B.space
+
+macro :: TWParser B.Inlines
+macro = macroWithParameters <|> withoutParameters
+ where
+ withoutParameters = enclosed (char '%') (\_ -> macroName) >>= return . emptySpan
+ emptySpan name = buildSpan name [] mempty
+
+macroWithParameters :: TWParser B.Inlines
+macroWithParameters = try $ do
+ char '%'
+ name <- macroName
+ (content, kvs) <- attributes
+ char '%'
+ return $ buildSpan name kvs $ B.str content
+
+buildSpan :: String -> [(String, String)] -> B.Inlines -> B.Inlines
+buildSpan className kvs = B.spanWith attrs
+ where
+ attrs = ("", ["twiki-macro", className] ++ additionalClasses, kvsWithoutClasses)
+ additionalClasses = maybe [] words $ lookup "class" kvs
+ kvsWithoutClasses = [(k,v) | (k,v) <- kvs, k /= "class"]
+
+macroName :: TWParser String
+macroName = do
+ first <- letter
+ rest <- many $ alphaNum <|> char '_'
+ return (first:rest)
+
+attributes :: TWParser (String, [(String, String)])
+attributes = char '{' *> spnl *> many (attribute <* spnl) <* char '}' >>=
+ return . foldr (either mkContent mkKvs) ([], [])
+ where
+ spnl = skipMany (spaceChar <|> newline)
+ mkContent c ([], kvs) = (c, kvs)
+ mkContent c (rest, kvs) = (c ++ " " ++ rest, kvs)
+ mkKvs kv (cont, rest) = (cont, (kv : rest))
+
+attribute :: TWParser (Either String (String, String))
+attribute = withKey <|> withoutKey
+ where
+ withKey = try $ do
+ key <- macroName
+ char '='
+ parseValue False >>= return . (curry Right key)
+ withoutKey = try $ parseValue True >>= return . Left
+ parseValue allowSpaces = (withQuotes <|> withoutQuotes allowSpaces) >>= return . fromEntities
+ withQuotes = between (char '"') (char '"') (\_ -> count 1 $ noneOf ['"'])
+ withoutQuotes allowSpaces
+ | allowSpaces == True = many1 $ noneOf "}"
+ | otherwise = many1 $ noneOf " }"
+
+nestedInlines :: Show a => TWParser a -> TWParser B.Inlines
+nestedInlines end = innerSpace <|> nestedInline
+ where
+ innerSpace = try $ whitespace <* (notFollowedBy end)
+ nestedInline = notFollowedBy whitespace >> nested inline
+
+strong :: TWParser B.Inlines
+strong = try $ enclosed (char '*') nestedInlines >>= return . B.strong
+
+strongHtml :: TWParser B.Inlines
+strongHtml = (parseHtmlContent "strong" inline <|> parseHtmlContent "b" inline)
+ >>= return . B.strong . mconcat
+
+strongAndEmph :: TWParser B.Inlines
+strongAndEmph = try $ enclosed (string "__") nestedInlines >>= return . B.emph . B.strong
+
+emph :: TWParser B.Inlines
+emph = try $ enclosed (char '_') nestedInlines >>= return . B.emph
+
+emphHtml :: TWParser B.Inlines
+emphHtml = (parseHtmlContent "em" inline <|> parseHtmlContent "i" inline)
+ >>= return . B.emph . mconcat
+
+nestedString :: Show a => TWParser a -> TWParser String
+nestedString end = innerSpace <|> (count 1 nonspaceChar)
+ where
+ innerSpace = try $ many1 spaceChar <* notFollowedBy end
+
+boldCode :: TWParser B.Inlines
+boldCode = try $ enclosed (string "==") nestedString >>= return . B.strong . B.code . fromEntities
+
+htmlComment :: TWParser B.Inlines
+htmlComment = htmlTag isCommentTag >> return mempty
+
+code :: TWParser B.Inlines
+code = try $ enclosed (char '=') nestedString >>= return . B.code . fromEntities
+
+codeHtml :: TWParser B.Inlines
+codeHtml = do
+ (attrs, content) <- parseHtmlContentWithAttrs "code" anyChar
+ return $ B.codeWith attrs $ fromEntities content
+
+autoLink :: TWParser B.Inlines
+autoLink = try $ do
+ state <- getState
+ guard $ stateAllowLinks state
+ (text, url) <- parseLink
+ guard $ checkLink (head $ reverse url)
+ return $ makeLink (text, url)
+ where
+ parseLink = notFollowedBy nop >> (uri <|> emailAddress)
+ makeLink (text, url) = B.link url "" $ B.str text
+ checkLink c
+ | c == '/' = True
+ | otherwise = isAlphaNum c
+
+str :: TWParser B.Inlines
+str = (many1 alphaNum <|> count 1 characterReference) >>= return . B.str
+
+nop :: TWParser B.Inlines
+nop = try $ (skip exclamation <|> skip nopTag) >> followContent
+ where
+ exclamation = char '!'
+ nopTag = stringAnyCase "<nop>"
+ followContent = many1 nonspaceChar >>= return . B.str . fromEntities
+
+symbol :: TWParser B.Inlines
+symbol = count 1 nonspaceChar >>= return . B.str
+
+smart :: TWParser B.Inlines
+smart = do
+ getOption readerSmart >>= guard
+ doubleQuoted <|> singleQuoted <|>
+ choice [ apostrophe
+ , dash
+ , ellipses
+ ]
+
+singleQuoted :: TWParser B.Inlines
+singleQuoted = try $ do
+ singleQuoteStart
+ withQuoteContext InSingleQuote $
+ many1Till inline singleQuoteEnd >>=
+ (return . B.singleQuoted . B.trimInlines . mconcat)
+
+doubleQuoted :: TWParser B.Inlines
+doubleQuoted = try $ do
+ doubleQuoteStart
+ contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline)
+ (withQuoteContext InDoubleQuote $ doubleQuoteEnd >>
+ return (B.doubleQuoted $ B.trimInlines contents))
+ <|> (return $ (B.str "\8220") B.<> contents)
+
+link :: TWParser B.Inlines
+link = try $ do
+ st <- getState
+ guard $ stateAllowLinks st
+ setState $ st{ stateAllowLinks = False }
+ (url, title, content) <- linkText
+ setState $ st{ stateAllowLinks = True }
+ return $ B.link url title content
+
+linkText :: TWParser (String, String, B.Inlines)
+linkText = do
+ string "[["
+ url <- many1Till anyChar (char ']')
+ content <- option [B.str url] linkContent
+ char ']'
+ return (url, "", mconcat content)
+ where
+ linkContent = (char '[') >> many1Till anyChar (char ']') >>= parseLinkContent
+ parseLinkContent = parseFromString $ many1 inline
diff --git a/src/Text/Pandoc/Readers/TeXMath.hs b/src/Text/Pandoc/Readers/TeXMath.hs
index 6bd617f7e..3fee3051e 100644
--- a/src/Text/Pandoc/Readers/TeXMath.hs
+++ b/src/Text/Pandoc/Readers/TeXMath.hs
@@ -1,5 +1,5 @@
{-
-Copyright (C) 2007-2010 John MacFarlane <jgm@berkeley.edu>
+Copyright (C) 2007-2014 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Text.Pandoc.Readers.TeXMath
- Copyright : Copyright (C) 2007-2010 John MacFarlane
+ Copyright : Copyright (C) 2007-2014 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
@@ -27,7 +27,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Conversion of TeX math to a list of 'Pandoc' inline elements.
-}
-module Text.Pandoc.Readers.TeXMath ( readTeXMath, readTeXMath' ) where
+module Text.Pandoc.Readers.TeXMath ( texMathToInlines ) where
import Text.Pandoc.Definition
import Text.TeXMath
@@ -35,22 +35,14 @@ import Text.TeXMath
-- | Converts a raw TeX math formula to a list of 'Pandoc' inlines.
-- Defaults to raw formula between @$@ or @$$@ characters if entire formula
-- can't be converted.
-readTeXMath' :: MathType
+texMathToInlines :: MathType
-> String -- ^ String to parse (assumes @'\n'@ line endings)
-> [Inline]
-readTeXMath' mt inp = case texMathToPandoc dt inp of
- Left _ -> [Str (delim ++ inp ++ delim)]
- Right res -> res
+texMathToInlines mt inp =
+ case writePandoc dt `fmap` readTeX inp of
+ Right (Just ils) -> ils
+ _ -> [Str (delim ++ inp ++ delim)]
where (dt, delim) = case mt of
DisplayMath -> (DisplayBlock, "$$")
InlineMath -> (DisplayInline, "$")
-{-# DEPRECATED readTeXMath "Use readTeXMath' from Text.Pandoc.JSON instead" #-}
--- | Converts a raw TeX math formula to a list of 'Pandoc' inlines.
--- Defaults to raw formula between @$@ characters if entire formula
--- can't be converted. (This is provided for backwards compatibility;
--- it is better to use @readTeXMath'@, which properly distinguishes
--- between display and inline math.)
-readTeXMath :: String -- ^ String to parse (assumes @'\n'@ line endings)
- -> [Inline]
-readTeXMath = readTeXMath' InlineMath
diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs
index 93658cdea..ee64e8f2a 100644
--- a/src/Text/Pandoc/Readers/Textile.hs
+++ b/src/Text/Pandoc/Readers/Textile.hs
@@ -1,5 +1,6 @@
{-
-Copyright (C) 2010 Paul Rivier <paul*rivier#demotera*com> | tr '*#' '.@'
+Copyright (C) 2010-2014 Paul Rivier <paul*rivier#demotera*com> | tr '*#' '.@'
+ and John MacFarlane
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -18,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Text.Pandoc.Readers.Textile
- Copyright : Copyright (C) 2010-2012 Paul Rivier and John MacFarlane
+ Copyright : Copyright (C) 2010-2014 Paul Rivier and John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : Paul Rivier <paul*rivier#demotera*com>
@@ -50,20 +51,22 @@ TODO : refactor common patterns across readers :
module Text.Pandoc.Readers.Textile ( readTextile) where
-
import Text.Pandoc.Definition
+import Text.Pandoc.Builder (Inlines, Blocks, trimInlines)
import qualified Text.Pandoc.Builder as B
-import Text.Pandoc.Shared
import Text.Pandoc.Options
import Text.Pandoc.Parsing
-import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag )
+import Text.Pandoc.Readers.HTML ( htmlTag, isBlockTag )
import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock )
import Text.HTML.TagSoup (parseTags, innerText, fromAttrib, Tag(..))
import Text.HTML.TagSoup.Match
import Data.List ( intercalate )
-import Data.Char ( digitToInt, isUpper )
-import Control.Monad ( guard, liftM )
-import Control.Applicative ((<$>), (*>), (<*))
+import Data.Char ( digitToInt, isUpper)
+import Control.Monad ( guard, liftM, when )
+import Text.Printf
+import Control.Applicative ((<$>), (*>), (<*), (<$))
+import Data.Monoid
+import Debug.Trace (trace)
-- | Parse a Textile text and return a Pandoc document.
readTextile :: ReaderOptions -- ^ Reader options
@@ -95,7 +98,7 @@ parseTextile = do
updateState $ \s -> s { stateNotes = reverse reversedNotes }
-- now parse it for real...
blocks <- parseBlocks
- return $ Pandoc nullMeta blocks -- FIXME
+ return $ Pandoc nullMeta (B.toList blocks) -- FIXME
noteMarker :: Parser [Char] ParserState [Char]
noteMarker = skipMany spaceChar >> string "fn" >> manyTill digit (char '.')
@@ -115,11 +118,11 @@ noteBlock = try $ do
return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
-- | Parse document blocks
-parseBlocks :: Parser [Char] ParserState [Block]
-parseBlocks = manyTill block eof
+parseBlocks :: Parser [Char] ParserState Blocks
+parseBlocks = mconcat <$> manyTill block eof
-- | Block parsers list tried in definition order
-blockParsers :: [Parser [Char] ParserState Block]
+blockParsers :: [Parser [Char] ParserState Blocks]
blockParsers = [ codeBlock
, header
, blockQuote
@@ -130,29 +133,37 @@ blockParsers = [ codeBlock
, rawLaTeXBlock'
, maybeExplicitBlock "table" table
, maybeExplicitBlock "p" para
+ , mempty <$ blanklines
]
-- | Any block in the order of definition of blockParsers
-block :: Parser [Char] ParserState Block
-block = choice blockParsers <?> "block"
-
-commentBlock :: Parser [Char] ParserState Block
+block :: Parser [Char] ParserState Blocks
+block = do
+ res <- choice blockParsers <?> "block"
+ pos <- getPosition
+ tr <- getOption readerTrace
+ when tr $
+ trace (printf "line %d: %s" (sourceLine pos)
+ (take 60 $ show $ B.toList res)) (return ())
+ return res
+
+commentBlock :: Parser [Char] ParserState Blocks
commentBlock = try $ do
string "###."
manyTill anyLine blanklines
- return Null
+ return mempty
-codeBlock :: Parser [Char] ParserState Block
+codeBlock :: Parser [Char] ParserState Blocks
codeBlock = codeBlockBc <|> codeBlockPre
-codeBlockBc :: Parser [Char] ParserState Block
+codeBlockBc :: Parser [Char] ParserState Blocks
codeBlockBc = try $ do
string "bc. "
contents <- manyTill anyLine blanklines
- return $ CodeBlock ("",[],[]) $ unlines contents
+ return $ B.codeBlock (unlines contents)
-- | Code Blocks in Textile are between <pre> and </pre>
-codeBlockPre :: Parser [Char] ParserState Block
+codeBlockPre :: Parser [Char] ParserState Blocks
codeBlockPre = try $ do
(t@(TagOpen _ attrs),_) <- htmlTag (tagOpen (=="pre") (const True))
result' <- (innerText . parseTags) `fmap` -- remove internal tags
@@ -169,29 +180,29 @@ codeBlockPre = try $ do
let classes = words $ fromAttrib "class" t
let ident = fromAttrib "id" t
let kvs = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"]
- return $ CodeBlock (ident,classes,kvs) result'''
+ return $ B.codeBlockWith (ident,classes,kvs) result'''
-- | Header of the form "hN. content" with N in 1..6
-header :: Parser [Char] ParserState Block
+header :: Parser [Char] ParserState Blocks
header = try $ do
char 'h'
level <- digitToInt <$> oneOf "123456"
attr <- attributes
char '.'
- whitespace
- name <- normalizeSpaces <$> manyTill inline blockBreak
- attr' <- registerHeader attr (B.fromList name)
- return $ Header level attr' name
+ lookAhead whitespace
+ name <- trimInlines . mconcat <$> many inline
+ attr' <- registerHeader attr name
+ return $ B.headerWith attr' level name
-- | Blockquote of the form "bq. content"
-blockQuote :: Parser [Char] ParserState Block
+blockQuote :: Parser [Char] ParserState Blocks
blockQuote = try $ do
string "bq" >> attributes >> char '.' >> whitespace
- BlockQuote . singleton <$> para
+ B.blockQuote <$> para
-- Horizontal rule
-hrule :: Parser [Char] st Block
+hrule :: Parser [Char] st Blocks
hrule = try $ do
skipSpaces
start <- oneOf "-*"
@@ -199,62 +210,74 @@ hrule = try $ do
skipMany (spaceChar <|> char start)
newline
optional blanklines
- return HorizontalRule
+ return B.horizontalRule
-- Lists handling
-- | Can be a bullet list or an ordered list. This implementation is
-- strict in the nesting, sublist must start at exactly "parent depth
-- plus one"
-anyList :: Parser [Char] ParserState Block
+anyList :: Parser [Char] ParserState Blocks
anyList = try $ anyListAtDepth 1 <* blanklines
-- | This allow one type of list to be nested into an other type,
-- provided correct nesting
-anyListAtDepth :: Int -> Parser [Char] ParserState Block
+anyListAtDepth :: Int -> Parser [Char] ParserState Blocks
anyListAtDepth depth = choice [ bulletListAtDepth depth,
orderedListAtDepth depth,
definitionList ]
-- | Bullet List of given depth, depth being the number of leading '*'
-bulletListAtDepth :: Int -> Parser [Char] ParserState Block
-bulletListAtDepth depth = try $ BulletList <$> many1 (bulletListItemAtDepth depth)
+bulletListAtDepth :: Int -> Parser [Char] ParserState Blocks
+bulletListAtDepth depth = try $ B.bulletList <$> many1 (bulletListItemAtDepth depth)
-- | Bullet List Item of given depth, depth being the number of
-- leading '*'
-bulletListItemAtDepth :: Int -> Parser [Char] ParserState [Block]
+bulletListItemAtDepth :: Int -> Parser [Char] ParserState Blocks
bulletListItemAtDepth = genericListItemAtDepth '*'
-- | Ordered List of given depth, depth being the number of
-- leading '#'
-orderedListAtDepth :: Int -> Parser [Char] ParserState Block
+orderedListAtDepth :: Int -> Parser [Char] ParserState Blocks
orderedListAtDepth depth = try $ do
items <- many1 (orderedListItemAtDepth depth)
- return (OrderedList (1, DefaultStyle, DefaultDelim) items)
+ return $ B.orderedList items
-- | Ordered List Item of given depth, depth being the number of
-- leading '#'
-orderedListItemAtDepth :: Int -> Parser [Char] ParserState [Block]
+orderedListItemAtDepth :: Int -> Parser [Char] ParserState Blocks
orderedListItemAtDepth = genericListItemAtDepth '#'
-- | Common implementation of list items
-genericListItemAtDepth :: Char -> Int -> Parser [Char] ParserState [Block]
+genericListItemAtDepth :: Char -> Int -> Parser [Char] ParserState Blocks
genericListItemAtDepth c depth = try $ do
count depth (char c) >> attributes >> whitespace
- p <- many listInline
+ p <- mconcat <$> many listInline
newline
- sublist <- option [] (singleton <$> anyListAtDepth (depth + 1))
- return (Plain p : sublist)
+ sublist <- option mempty (anyListAtDepth (depth + 1))
+ return $ (B.plain p) <> sublist
-- | A definition list is a set of consecutive definition items
-definitionList :: Parser [Char] ParserState Block
-definitionList = try $ DefinitionList <$> many1 definitionListItem
+definitionList :: Parser [Char] ParserState Blocks
+definitionList = try $ B.definitionList <$> many1 definitionListItem
-- | List start character.
-listStart :: Parser [Char] st Char
-listStart = oneOf "*#-"
+listStart :: Parser [Char] ParserState ()
+listStart = genericListStart '*'
+ <|> () <$ genericListStart '#'
+ <|> () <$ definitionListStart
+
+genericListStart :: Char -> Parser [Char] st ()
+genericListStart c = () <$ try (many1 (char c) >> whitespace)
+
+definitionListStart :: Parser [Char] ParserState Inlines
+definitionListStart = try $ do
+ char '-'
+ whitespace
+ trimInlines . mconcat <$>
+ many1Till inline (try (string ":=")) <* optional whitespace
-listInline :: Parser [Char] ParserState Inline
+listInline :: Parser [Char] ParserState Inlines
listInline = try (notFollowedBy newline >> inline)
<|> try (endline <* notFollowedBy listStart)
@@ -262,16 +285,15 @@ listInline = try (notFollowedBy newline >> inline)
-- the term defined, then spaces and ":=". The definition follows, on
-- the same single line, or spaned on multiple line, after a line
-- break.
-definitionListItem :: Parser [Char] ParserState ([Inline], [[Block]])
+definitionListItem :: Parser [Char] ParserState (Inlines, [Blocks])
definitionListItem = try $ do
- string "- "
- term <- many1Till inline (try (whitespace >> string ":="))
+ term <- definitionListStart
def' <- multilineDef <|> inlineDef
return (term, def')
- where inlineDef :: Parser [Char] ParserState [[Block]]
- inlineDef = liftM (\d -> [[Plain d]])
- $ optional whitespace >> many listInline <* newline
- multilineDef :: Parser [Char] ParserState [[Block]]
+ where inlineDef :: Parser [Char] ParserState [Blocks]
+ inlineDef = liftM (\d -> [B.plain d])
+ $ optional whitespace >> (trimInlines . mconcat <$> many listInline) <* newline
+ multilineDef :: Parser [Char] ParserState [Blocks]
multilineDef = try $ do
optional whitespace >> newline
s <- many1Till anyChar (try (string "=:" >> newline))
@@ -279,68 +301,61 @@ definitionListItem = try $ do
ds <- parseFromString parseBlocks (s ++ "\n\n")
return [ds]
--- | This terminates a block such as a paragraph. Because of raw html
--- blocks support, we have to lookAhead for a rawHtmlBlock.
-blockBreak :: Parser [Char] ParserState ()
-blockBreak = try (newline >> blanklines >> return ()) <|>
- try (optional spaces >> lookAhead rawHtmlBlock >> return ())
-
-- raw content
-- | A raw Html Block, optionally followed by blanklines
-rawHtmlBlock :: Parser [Char] ParserState Block
+rawHtmlBlock :: Parser [Char] ParserState Blocks
rawHtmlBlock = try $ do
+ skipMany spaceChar
(_,b) <- htmlTag isBlockTag
optional blanklines
- return $ RawBlock (Format "html") b
+ return $ B.rawBlock "html" b
-- | Raw block of LaTeX content
-rawLaTeXBlock' :: Parser [Char] ParserState Block
+rawLaTeXBlock' :: Parser [Char] ParserState Blocks
rawLaTeXBlock' = do
guardEnabled Ext_raw_tex
- RawBlock (Format "latex") <$> (rawLaTeXBlock <* spaces)
+ B.rawBlock "latex" <$> (rawLaTeXBlock <* spaces)
-- | In textile, paragraphs are separated by blank lines.
-para :: Parser [Char] ParserState Block
-para = try $ Para . normalizeSpaces <$> manyTill inline blockBreak
-
+para :: Parser [Char] ParserState Blocks
+para = B.para . trimInlines . mconcat <$> many1 inline
-- Tables
-- | A table cell spans until a pipe |
-tableCell :: Parser [Char] ParserState TableCell
+tableCell :: Parser [Char] ParserState Blocks
tableCell = do
c <- many1 (noneOf "|\n")
- content <- parseFromString (many1 inline) c
- return $ [ Plain $ normalizeSpaces content ]
+ content <- trimInlines . mconcat <$> parseFromString (many1 inline) c
+ return $ B.plain content
-- | A table row is made of many table cells
-tableRow :: Parser [Char] ParserState [TableCell]
+tableRow :: Parser [Char] ParserState [Blocks]
tableRow = try $ ( char '|' *>
(endBy1 tableCell (optional blankline *> char '|')) <* newline)
-- | Many table rows
-tableRows :: Parser [Char] ParserState [[TableCell]]
+tableRows :: Parser [Char] ParserState [[Blocks]]
tableRows = many1 tableRow
-- | Table headers are made of cells separated by a tag "|_."
-tableHeaders :: Parser [Char] ParserState [TableCell]
+tableHeaders :: Parser [Char] ParserState [Blocks]
tableHeaders = let separator = (try $ string "|_.") in
try $ ( separator *> (sepBy1 tableCell separator) <* char '|' <* newline )
-- | A table with an optional header. Current implementation can
-- handle tables with and without header, but will parse cells
-- alignment attributes as content.
-table :: Parser [Char] ParserState Block
+table :: Parser [Char] ParserState Blocks
table = try $ do
- headers <- option [] tableHeaders
+ headers <- option mempty tableHeaders
rows <- tableRows
blanklines
let nbOfCols = max (length headers) (length $ head rows)
- return $ Table []
- (replicate nbOfCols AlignDefault)
- (replicate nbOfCols 0.0)
+ return $ B.table mempty
+ (zip (replicate nbOfCols AlignDefault) (replicate nbOfCols 0.0))
headers
rows
@@ -348,8 +363,8 @@ table = try $ do
-- | Blocks like 'p' and 'table' do not need explicit block tag.
-- However, they can be used to set HTML/CSS attributes when needed.
maybeExplicitBlock :: String -- ^ block tag name
- -> Parser [Char] ParserState Block -- ^ implicit block
- -> Parser [Char] ParserState Block
+ -> Parser [Char] ParserState Blocks -- ^ implicit block
+ -> Parser [Char] ParserState Blocks
maybeExplicitBlock name blk = try $ do
optional $ try $ string name >> attributes >> char '.' >>
optional whitespace >> optional endline
@@ -363,73 +378,74 @@ maybeExplicitBlock name blk = try $ do
-- | Any inline element
-inline :: Parser [Char] ParserState Inline
-inline = choice inlineParsers <?> "inline"
+inline :: Parser [Char] ParserState Inlines
+inline = do
+ choice inlineParsers <?> "inline"
-- | Inline parsers tried in order
-inlineParsers :: [Parser [Char] ParserState Inline]
+inlineParsers :: [Parser [Char] ParserState Inlines]
inlineParsers = [ str
, whitespace
, endline
, code
, escapedInline
- , htmlSpan
+ , inlineMarkup
+ , groupedInlineMarkup
, rawHtmlInline
, rawLaTeXInline'
, note
- , try $ (char '[' *> inlineMarkup <* char ']')
- , inlineMarkup
, link
, image
, mark
- , (Str . (:[])) <$> characterReference
+ , (B.str . (:[])) <$> characterReference
, smartPunctuation inline
, symbol
]
-- | Inline markups
-inlineMarkup :: Parser [Char] ParserState Inline
-inlineMarkup = choice [ simpleInline (string "??") (Cite [])
- , simpleInline (string "**") Strong
- , simpleInline (string "__") Emph
- , simpleInline (char '*') Strong
- , simpleInline (char '_') Emph
- , simpleInline (char '+') Emph -- approximates underline
- , simpleInline (char '-' <* notFollowedBy (char '-')) Strikeout
- , simpleInline (char '^') Superscript
- , simpleInline (char '~') Subscript
+inlineMarkup :: Parser [Char] ParserState Inlines
+inlineMarkup = choice [ simpleInline (string "??") (B.cite [])
+ , simpleInline (string "**") B.strong
+ , simpleInline (string "__") B.emph
+ , simpleInline (char '*') B.strong
+ , simpleInline (char '_') B.emph
+ , simpleInline (char '+') B.emph -- approximates underline
+ , simpleInline (char '-' <* notFollowedBy (char '-')) B.strikeout
+ , simpleInline (char '^') B.superscript
+ , simpleInline (char '~') B.subscript
+ , simpleInline (char '%') id
]
-- | Trademark, registered, copyright
-mark :: Parser [Char] st Inline
+mark :: Parser [Char] st Inlines
mark = try $ char '(' >> (try tm <|> try reg <|> copy)
-reg :: Parser [Char] st Inline
+reg :: Parser [Char] st Inlines
reg = do
oneOf "Rr"
char ')'
- return $ Str "\174"
+ return $ B.str "\174"
-tm :: Parser [Char] st Inline
+tm :: Parser [Char] st Inlines
tm = do
oneOf "Tt"
oneOf "Mm"
char ')'
- return $ Str "\8482"
+ return $ B.str "\8482"
-copy :: Parser [Char] st Inline
+copy :: Parser [Char] st Inlines
copy = do
oneOf "Cc"
char ')'
- return $ Str "\169"
+ return $ B.str "\169"
-note :: Parser [Char] ParserState Inline
+note :: Parser [Char] ParserState Inlines
note = try $ do
ref <- (char '[' *> many1 digit <* char ']')
notes <- stateNotes <$> getState
case lookup ref notes of
Nothing -> fail "note not found"
- Just raw -> liftM Note $ parseFromString parseBlocks raw
+ Just raw -> B.note <$> parseFromString parseBlocks raw
-- | Special chars
markupChars :: [Char]
@@ -450,7 +466,7 @@ wordBoundaries = markupChars ++ stringBreakers
hyphenedWords :: Parser [Char] ParserState String
hyphenedWords = do
x <- wordChunk
- xs <- many (try $ char '-' >> wordChunk)
+ xs <- many (try $ char '-' >> wordChunk)
return $ intercalate "-" (x:xs)
wordChunk :: Parser [Char] ParserState String
@@ -462,99 +478,104 @@ wordChunk = try $ do
return $ hd:tl
-- | Any string
-str :: Parser [Char] ParserState Inline
+str :: Parser [Char] ParserState Inlines
str = do
baseStr <- hyphenedWords
-- RedCloth compliance : if parsed word is uppercase and immediatly
-- followed by parens, parens content is unconditionally word acronym
fullStr <- option baseStr $ try $ do
guard $ all isUpper baseStr
- acro <- enclosed (char '(') (char ')') anyChar
+ acro <- enclosed (char '(') (char ')') anyChar'
return $ concat [baseStr, " (", acro, ")"]
updateLastStrPos
- return $ Str fullStr
-
--- | Textile allows HTML span infos, we discard them
-htmlSpan :: Parser [Char] ParserState Inline
-htmlSpan = try $ Str <$> ( char '%' *> attributes *> manyTill anyChar (char '%') )
+ return $ B.str fullStr
-- | Some number of space chars
-whitespace :: Parser [Char] ParserState Inline
-whitespace = many1 spaceChar >> return Space <?> "whitespace"
+whitespace :: Parser [Char] st Inlines
+whitespace = many1 spaceChar >> return B.space <?> "whitespace"
-- | In Textile, an isolated endline character is a line break
-endline :: Parser [Char] ParserState Inline
+endline :: Parser [Char] ParserState Inlines
endline = try $ do
- newline >> notFollowedBy blankline
- return LineBreak
+ newline
+ notFollowedBy blankline
+ notFollowedBy listStart
+ notFollowedBy rawHtmlBlock
+ return B.linebreak
-rawHtmlInline :: Parser [Char] ParserState Inline
-rawHtmlInline = RawInline (Format "html") . snd <$> htmlTag isInlineTag
+rawHtmlInline :: Parser [Char] ParserState Inlines
+rawHtmlInline = B.rawInline "html" . snd <$> htmlTag (const True)
-- | Raw LaTeX Inline
-rawLaTeXInline' :: Parser [Char] ParserState Inline
+rawLaTeXInline' :: Parser [Char] ParserState Inlines
rawLaTeXInline' = try $ do
guardEnabled Ext_raw_tex
- rawLaTeXInline
+ B.singleton <$> rawLaTeXInline
-- | Textile standard link syntax is "label":target. But we
-- can also have ["label":target].
-link :: Parser [Char] ParserState Inline
-link = linkB <|> linkNoB
-
-linkNoB :: Parser [Char] ParserState Inline
-linkNoB = try $ do
- name <- surrounded (char '"') inline
- char ':'
- let stopChars = "!.,;:"
- url <- manyTill nonspaceChar (lookAhead $ space <|> try (oneOf stopChars >> (space <|> newline)))
- let name' = if name == [Str "$"] then [Str url] else name
- return $ Link name' (url, "")
-
-linkB :: Parser [Char] ParserState Inline
-linkB = try $ do
- char '['
- name <- surrounded (char '"') inline
+link :: Parser [Char] ParserState Inlines
+link = try $ do
+ bracketed <- (True <$ char '[') <|> return False
+ char '"' *> notFollowedBy (oneOf " \t\n\r")
+ attr <- attributes
+ name <- trimInlines . mconcat <$>
+ withQuoteContext InDoubleQuote (many1Till inline (char '"'))
char ':'
- url <- manyTill nonspaceChar (char ']')
- let name' = if name == [Str "$"] then [Str url] else name
- return $ Link name' (url, "")
+ let stop = if bracketed
+ then char ']'
+ else lookAhead $ space <|>
+ try (oneOf "!.,;:" *> (space <|> newline))
+ url <- manyTill nonspaceChar stop
+ let name' = if B.toList name == [Str "$"] then B.str url else name
+ return $ if attr == nullAttr
+ then B.link url "" name'
+ else B.spanWith attr $ B.link url "" name'
-- | image embedding
-image :: Parser [Char] ParserState Inline
+image :: Parser [Char] ParserState Inlines
image = try $ do
char '!' >> notFollowedBy space
- src <- manyTill anyChar (lookAhead $ oneOf "!(")
- alt <- option "" (try $ (char '(' >> manyTill anyChar (char ')')))
+ src <- manyTill anyChar' (lookAhead $ oneOf "!(")
+ alt <- option "" (try $ (char '(' >> manyTill anyChar' (char ')')))
char '!'
- return $ Image [Str alt] (src, alt)
+ return $ B.image src alt (B.str alt)
-escapedInline :: Parser [Char] ParserState Inline
+escapedInline :: Parser [Char] ParserState Inlines
escapedInline = escapedEqs <|> escapedTag
-escapedEqs :: Parser [Char] ParserState Inline
-escapedEqs = Str <$> (try $ string "==" *> manyTill anyChar (try $ string "=="))
+escapedEqs :: Parser [Char] ParserState Inlines
+escapedEqs = B.str <$>
+ (try $ string "==" *> manyTill anyChar' (try $ string "=="))
-- | literal text escaped btw <notextile> tags
-escapedTag :: Parser [Char] ParserState Inline
-escapedTag = Str <$>
- (try $ string "<notextile>" *> manyTill anyChar (try $ string "</notextile>"))
+escapedTag :: Parser [Char] ParserState Inlines
+escapedTag = B.str <$>
+ (try $ string "<notextile>" *>
+ manyTill anyChar' (try $ string "</notextile>"))
-- | Any special symbol defined in wordBoundaries
-symbol :: Parser [Char] ParserState Inline
-symbol = Str . singleton <$> (oneOf wordBoundaries <|> oneOf markupChars)
+symbol :: Parser [Char] ParserState Inlines
+symbol = B.str . singleton <$> (notFollowedBy newline *>
+ notFollowedBy rawHtmlBlock *>
+ oneOf wordBoundaries)
-- | Inline code
-code :: Parser [Char] ParserState Inline
+code :: Parser [Char] ParserState Inlines
code = code1 <|> code2
-code1 :: Parser [Char] ParserState Inline
-code1 = Code nullAttr <$> surrounded (char '@') anyChar
+-- any character except a newline before a blank line
+anyChar' :: Parser [Char] ParserState Char
+anyChar' =
+ satisfy (/='\n') <|> (try $ char '\n' <* notFollowedBy blankline)
-code2 :: Parser [Char] ParserState Inline
+code1 :: Parser [Char] ParserState Inlines
+code1 = B.code <$> surrounded (char '@') anyChar'
+
+code2 :: Parser [Char] ParserState Inlines
code2 = do
htmlTag (tagOpen (=="tt") null)
- Code nullAttr <$> manyTill anyChar (try $ htmlTag $ tagClose (=="tt"))
+ B.code <$> manyTill anyChar' (try $ htmlTag $ tagClose (=="tt"))
-- | Html / CSS attributes
attributes :: Parser [Char] ParserState Attr
@@ -566,7 +587,7 @@ attribute = classIdAttr <|> styleAttr <|> langAttr
classIdAttr :: Parser [Char] ParserState (Attr -> Attr)
classIdAttr = try $ do -- (class class #id)
char '('
- ws <- words `fmap` manyTill anyChar (char ')')
+ ws <- words `fmap` manyTill anyChar' (char ')')
case reverse ws of
[] -> return $ \(_,_,keyvals) -> ("",[],keyvals)
(('#':ident'):classes') -> return $ \(_,_,keyvals) ->
@@ -576,28 +597,50 @@ classIdAttr = try $ do -- (class class #id)
styleAttr :: Parser [Char] ParserState (Attr -> Attr)
styleAttr = do
- style <- try $ enclosed (char '{') (char '}') anyChar
+ style <- try $ enclosed (char '{') (char '}') anyChar'
return $ \(id',classes,keyvals) -> (id',classes,("style",style):keyvals)
langAttr :: Parser [Char] ParserState (Attr -> Attr)
langAttr = do
- lang <- try $ enclosed (char '[') (char ']') anyChar
+ lang <- try $ enclosed (char '[') (char ']') alphaNum
return $ \(id',classes,keyvals) -> (id',classes,("lang",lang):keyvals)
-- | Parses material surrounded by a parser.
surrounded :: Parser [Char] st t -- ^ surrounding parser
-> Parser [Char] st a -- ^ content parser (to be used repeatedly)
-> Parser [Char] st [a]
-surrounded border = enclosed (border *> notFollowedBy (oneOf " \t\n\r")) (try border)
+surrounded border =
+ enclosed (border *> notFollowedBy (oneOf " \t\n\r")) (try border)
--- | Inlines are most of the time of the same form
simpleInline :: Parser [Char] ParserState t -- ^ surrounding parser
- -> ([Inline] -> Inline) -- ^ Inline constructor
- -> Parser [Char] ParserState Inline -- ^ content parser (to be used repeatedly)
-simpleInline border construct = surrounded border inlineWithAttribute >>=
- return . construct . normalizeSpaces
- where inlineWithAttribute = (try $ optional attributes) >> inline
+ -> (Inlines -> Inlines) -- ^ Inline constructor
+ -> Parser [Char] ParserState Inlines -- ^ content parser (to be used repeatedly)
+simpleInline border construct = try $ do
+ st <- getState
+ pos <- getPosition
+ let afterString = stateLastStrPos st == Just pos
+ guard $ not afterString
+ border *> notFollowedBy (oneOf " \t\n\r")
+ attr <- attributes
+ body <- trimInlines . mconcat <$>
+ withQuoteContext InSingleQuote
+ (manyTill (notFollowedBy newline >> inline)
+ (try border <* notFollowedBy alphaNum))
+ return $ construct $
+ if attr == nullAttr
+ then body
+ else B.spanWith attr body
+
+groupedInlineMarkup :: Parser [Char] ParserState Inlines
+groupedInlineMarkup = try $ do
+ char '['
+ sp1 <- option mempty $ B.space <$ whitespace
+ result <- withQuoteContext InSingleQuote inlineMarkup
+ sp2 <- option mempty $ B.space <$ whitespace
+ char ']'
+ return $ sp1 <> result <> sp2
-- | Create a singleton list
singleton :: a -> [a]
singleton x = [x]
+
diff --git a/src/Text/Pandoc/Readers/Txt2Tags.hs b/src/Text/Pandoc/Readers/Txt2Tags.hs
new file mode 100644
index 000000000..6f8c19ac7
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Txt2Tags.hs
@@ -0,0 +1,579 @@
+{-# LANGUAGE ViewPatterns #-}
+{-
+Copyright (C) 2014 Matthew Pickering <matthewtpickering@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Txt2Tags
+ Copyright : Copyright (C) 2014 Matthew Pickering
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Matthew Pickering <matthewtpickering@gmail.com>
+
+Conversion of txt2tags formatted plain text to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.Txt2Tags ( readTxt2Tags
+ , getT2TMeta
+ , T2TMeta (..)
+ , readTxt2TagsNoMacros)
+ where
+
+import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Builder ( Inlines, Blocks, (<>)
+ , trimInlines )
+import Text.Pandoc.Definition
+import Text.Pandoc.Options
+import Text.Pandoc.Shared (escapeURI,compactify', compactify'DL)
+import Text.Pandoc.Parsing hiding (space, spaces, uri, macro)
+import Control.Applicative ((<$>), (<$), (<*>), (<*), (*>))
+import Data.Char (toLower)
+import Data.List (transpose, intersperse, intercalate)
+import Data.Maybe (fromMaybe)
+import Data.Monoid (Monoid, mconcat, mempty, mappend)
+--import Network.URI (isURI) -- Not sure whether to use this function
+import Control.Monad (void, guard, when)
+import Data.Default
+import Control.Monad.Reader (Reader, runReader, asks)
+
+import Data.Time.LocalTime (getZonedTime)
+import Text.Pandoc.Compat.Directory(getModificationTime)
+import Data.Time.Format (formatTime)
+import System.Locale (defaultTimeLocale)
+import System.IO.Error (catchIOError)
+
+type T2T = ParserT String ParserState (Reader T2TMeta)
+
+-- | An object for the T2T macros meta information
+-- the contents of each field is simply substituted verbatim into the file
+data T2TMeta = T2TMeta {
+ date :: String -- ^ Current date
+ , mtime :: String -- ^ Last modification time of infile
+ , infile :: FilePath -- ^ Input file
+ , outfile :: FilePath -- ^ Output file
+ } deriving Show
+
+instance Default T2TMeta where
+ def = T2TMeta "" "" "" ""
+
+-- | Get the meta information required by Txt2Tags macros
+getT2TMeta :: [FilePath] -> FilePath -> IO T2TMeta
+getT2TMeta inps out = do
+ curDate <- formatTime defaultTimeLocale "%F" <$> getZonedTime
+ let getModTime = fmap (formatTime defaultTimeLocale "%T") .
+ getModificationTime
+ curMtime <- case inps of
+ [] -> formatTime defaultTimeLocale "%T" <$> getZonedTime
+ _ -> catchIOError
+ (maximum <$> mapM getModTime inps)
+ (const (return ""))
+ return $ T2TMeta curDate curMtime (intercalate ", " inps) out
+
+-- | Read Txt2Tags from an input string returning a Pandoc document
+readTxt2Tags :: T2TMeta -> ReaderOptions -> String -> Pandoc
+readTxt2Tags t opts s = flip runReader t $ readWithM parseT2T (def {stateOptions = opts}) (s ++ "\n\n")
+
+-- | Read Txt2Tags (ignoring all macros) from an input string returning
+-- a Pandoc document
+readTxt2TagsNoMacros :: ReaderOptions -> String -> Pandoc
+readTxt2TagsNoMacros = readTxt2Tags def
+
+parseT2T :: T2T Pandoc
+parseT2T = do
+ -- Parse header if standalone flag is set
+ standalone <- getOption readerStandalone
+ when standalone parseHeader
+ body <- mconcat <$> manyTill block eof
+ meta' <- stateMeta <$> getState
+ return $ Pandoc meta' (B.toList body)
+
+parseHeader :: T2T ()
+parseHeader = do
+ () <$ try blankline <|> header
+ meta <- stateMeta <$> getState
+ optional blanklines
+ config <- manyTill setting (notFollowedBy setting)
+ -- TODO: Handle settings better
+ let settings = foldr (\(k,v) -> B.setMeta k (MetaString v)) meta config
+ updateState (\s -> s {stateMeta = settings}) <* optional blanklines
+
+header :: T2T ()
+header = titleline >> authorline >> dateline
+
+headerline :: B.ToMetaValue a => String -> T2T a -> T2T ()
+headerline field p = (() <$ try blankline)
+ <|> (p >>= updateState . B.setMeta field)
+
+titleline :: T2T ()
+titleline =
+ headerline "title" (trimInlines . mconcat <$> manyTill inline newline)
+
+authorline :: T2T ()
+authorline =
+ headerline "author" (sepBy author (char ';') <* newline)
+ where
+ author = trimInlines . mconcat <$> many (notFollowedBy (char ';' <|> newline) >> inline)
+
+dateline :: T2T ()
+dateline = headerline "date" (trimInlines . mconcat <$> manyTill inline newline)
+
+type Keyword = String
+type Value = String
+
+setting :: T2T (Keyword, Value)
+setting = do
+ string "%!"
+ keyword <- ignoreSpacesCap (many1 alphaNum)
+ char ':'
+ value <- ignoreSpacesCap (manyTill anyChar (newline))
+ return (keyword, value)
+
+-- Blocks
+
+parseBlocks :: T2T Blocks
+parseBlocks = mconcat <$> manyTill block eof
+
+block :: T2T Blocks
+block = do
+ choice
+ [ mempty <$ blanklines
+ , quote
+ , hrule -- hrule must go above title
+ , title
+ , commentBlock
+ , verbatim
+ , rawBlock
+ , taggedBlock
+ , list
+ , table
+ , para
+ ]
+
+title :: T2T Blocks
+title = try $ balancedTitle '+' <|> balancedTitle '='
+
+balancedTitle :: Char -> T2T Blocks
+balancedTitle c = try $ do
+ spaces
+ level <- length <$> many1 (char c)
+ guard (level <= 5) -- Max header level 5
+ heading <- manyTill (noneOf "\n\r") (count level (char c))
+ label <- optionMaybe (enclosed (char '[') (char ']') (alphaNum <|> oneOf "_-"))
+ many spaceChar *> newline
+ let attr = maybe nullAttr (\x -> (x, [], [])) label
+ return $ B.headerWith attr level (trimInlines $ B.text heading)
+
+para :: T2T Blocks
+para = try $ do
+ ils <- parseInlines
+ nl <- option False (True <$ newline)
+ option (B.plain ils) (guard nl >> notFollowedBy listStart >> return (B.para ils))
+ where
+ listStart = try bulletListStart <|> orderedListStart
+
+commentBlock :: T2T Blocks
+commentBlock = try (blockMarkupArea (anyLine) (const mempty) "%%%") <|> comment
+
+-- Seperator and Strong line treated the same
+hrule :: T2T Blocks
+hrule = try $ do
+ spaces
+ line <- many1 (oneOf "=-_")
+ guard (length line >= 20)
+ B.horizontalRule <$ blankline
+
+quote :: T2T Blocks
+quote = try $ do
+ lookAhead tab
+ rawQuote <- many1 (tab *> optional spaces *> anyLine)
+ contents <- parseFromString parseBlocks (intercalate "\n" rawQuote ++ "\n\n")
+ return $ B.blockQuote contents
+
+commentLine :: T2T Inlines
+commentLine = comment
+
+-- List Parsing code from Org Reader
+
+list :: T2T Blocks
+list = choice [bulletList, orderedList, definitionList]
+
+bulletList :: T2T Blocks
+bulletList = B.bulletList . compactify'
+ <$> many1 (listItem bulletListStart parseBlocks)
+
+orderedList :: T2T Blocks
+orderedList = B.orderedList . compactify'
+ <$> many1 (listItem orderedListStart parseBlocks)
+
+definitionList :: T2T Blocks
+definitionList = try $ do
+ B.definitionList . compactify'DL <$>
+ many1 (listItem definitionListStart definitionListEnd)
+
+definitionListEnd :: T2T (Inlines, [Blocks])
+definitionListEnd = (,) <$> (mconcat <$> manyTill inline newline) <*> ((:[]) <$> parseBlocks)
+
+genericListStart :: T2T Char
+ -> T2T Int
+genericListStart listMarker = try $
+ (2+) <$> (length <$> many spaceChar
+ <* listMarker <* space <* notFollowedBy space)
+
+-- parses bullet list \start and returns its length (excl. following whitespace)
+bulletListStart :: T2T Int
+bulletListStart = genericListStart (char '-')
+
+orderedListStart :: T2T Int
+orderedListStart = genericListStart (char '+' )
+
+definitionListStart :: T2T Int
+definitionListStart = genericListStart (char ':')
+
+-- parse raw text for one list item, excluding start marker and continuations
+listItem :: T2T Int
+ -> T2T a
+ -> T2T a
+listItem start end = try $ do
+ markerLength <- try start
+ firstLine <- anyLineNewline
+ blank <- option "" ("\n" <$ blankline)
+ rest <- concat <$> many (listContinuation markerLength)
+ parseFromString end $ firstLine ++ blank ++ rest
+
+-- continuation of a list item - indented and separated by blankline or endline.
+-- Note: nested lists are parsed as continuations.
+listContinuation :: Int
+ -> T2T String
+listContinuation markerLength = try $
+ notFollowedBy' (blankline >> blankline)
+ *> (mappend <$> (concat <$> many1 listLine)
+ <*> many blankline)
+ where listLine = try $ indentWith markerLength *> anyLineNewline
+
+anyLineNewline :: T2T String
+anyLineNewline = (++ "\n") <$> anyLine
+
+indentWith :: Int -> T2T String
+indentWith n = count n space
+
+-- Table
+
+table :: T2T Blocks
+table = try $ do
+ tableHeader <- fmap snd <$> option mempty (try headerRow)
+ rows <- many1 (many commentLine *> tableRow)
+ let columns = transpose rows
+ let ncolumns = length columns
+ let aligns = map (foldr1 findAlign) (map (map fst) columns)
+ let rows' = map (map snd) rows
+ let size = maximum (map length rows')
+ let rowsPadded = map (pad size) rows'
+ let headerPadded = if (not (null tableHeader)) then pad size tableHeader else mempty
+ return $ B.table mempty
+ (zip aligns (replicate ncolumns 0.0))
+ headerPadded rowsPadded
+
+pad :: (Show a, Monoid a) => Int -> [a] -> [a]
+pad n xs = xs ++ (replicate (n - length xs) mempty)
+
+
+findAlign :: Alignment -> Alignment -> Alignment
+findAlign x y
+ | x == y = x
+ | otherwise = AlignDefault
+
+headerRow :: T2T [(Alignment, Blocks)]
+headerRow = genericRow (string "||")
+
+tableRow :: T2T [(Alignment, Blocks)]
+tableRow = genericRow (char '|')
+
+genericRow :: T2T a -> T2T [(Alignment, Blocks)]
+genericRow start = try $ do
+ spaces *> start
+ manyTill tableCell newline <?> "genericRow"
+
+
+tableCell :: T2T (Alignment, Blocks)
+tableCell = try $ do
+ leftSpaces <- length <$> lookAhead (many1 space) -- Case of empty cell means we must lookAhead
+ content <- (manyTill inline (try $ lookAhead (cellEnd)))
+ rightSpaces <- length <$> many space
+ let align =
+ case compare leftSpaces rightSpaces of
+ LT -> AlignLeft
+ EQ -> AlignCenter
+ GT -> AlignRight
+ endOfCell
+ return $ (align, B.plain (B.trimInlines $ mconcat content))
+ where
+ cellEnd = (void newline <|> (many1 space *> endOfCell))
+
+endOfCell :: T2T ()
+endOfCell = try (skipMany1 $ char '|') <|> ( () <$ lookAhead newline)
+
+-- Raw area
+
+verbatim :: T2T Blocks
+verbatim = genericBlock anyLineNewline B.codeBlock "```"
+
+rawBlock :: T2T Blocks
+rawBlock = genericBlock anyLineNewline (B.para . B.str) "\"\"\""
+
+taggedBlock :: T2T Blocks
+taggedBlock = do
+ target <- getTarget
+ genericBlock anyLineNewline (B.rawBlock target) "'''"
+
+-- Generic
+
+genericBlock :: Monoid a => T2T a -> (a -> Blocks) -> String -> T2T Blocks
+genericBlock p f s = blockMarkupArea p f s <|> blockMarkupLine p f s
+
+blockMarkupArea :: Monoid a => (T2T a) -> (a -> Blocks) -> String -> T2T Blocks
+blockMarkupArea p f s = try $ (do
+ string s *> blankline
+ f . mconcat <$> (manyTill p (eof <|> void (string s *> blankline))))
+
+blockMarkupLine :: T2T a -> (a -> Blocks) -> String -> T2T Blocks
+blockMarkupLine p f s = try (f <$> (string s *> space *> p))
+
+-- Can be in either block or inline position
+comment :: Monoid a => T2T a
+comment = try $ do
+ atStart
+ notFollowedBy macro
+ mempty <$ (char '%' *> anyLine)
+
+-- Inline
+
+parseInlines :: T2T Inlines
+parseInlines = trimInlines . mconcat <$> many1 inline
+
+inline :: T2T Inlines
+inline = do
+ choice
+ [ endline
+ , macro
+ , commentLine
+ , whitespace
+ , url
+ , link
+ , image
+ , bold
+ , underline
+ , code
+ , raw
+ , tagged
+ , strike
+ , italic
+ , code
+ , str
+ , symbol
+ ]
+
+bold :: T2T Inlines
+bold = inlineMarkup inline B.strong '*' (B.str)
+
+underline :: T2T Inlines
+underline = inlineMarkup inline B.emph '_' (B.str)
+
+strike :: T2T Inlines
+strike = inlineMarkup inline B.strikeout '-' (B.str)
+
+italic :: T2T Inlines
+italic = inlineMarkup inline B.emph '/' (B.str)
+
+code :: T2T Inlines
+code = inlineMarkup ((:[]) <$> anyChar) B.code '`' id
+
+raw :: T2T Inlines
+raw = inlineMarkup ((:[]) <$> anyChar) B.text '"' id
+
+tagged :: T2T Inlines
+tagged = do
+ target <- getTarget
+ inlineMarkup ((:[]) <$> anyChar) (B.rawInline target) '\'' id
+
+-- Parser for markup indicated by a double character.
+-- Inline markup is greedy and glued
+-- Greedy meaning ***a*** = Bold [Str "*a*"]
+-- Glued meaning that markup must be tight to content
+-- Markup can't pass newlines
+inlineMarkup :: Monoid a
+ => (T2T a) -- Content parser
+ -> (a -> Inlines) -- Constructor
+ -> Char -- Fence
+ -> (String -> a) -- Special Case to handle ******
+ -> T2T Inlines
+inlineMarkup p f c special = try $ do
+ start <- many1 (char c)
+ let l = length start
+ guard (l >= 2)
+ when (l == 2) (void $ notFollowedBy space)
+ -- We must make sure that there is no space before the start of the
+ -- closing tags
+ body <- optionMaybe (try $ manyTill (noneOf "\n\r") $
+ (try $ lookAhead (noneOf " " >> string [c,c] )))
+ case body of
+ Just middle -> do
+ lastChar <- anyChar
+ end <- many1 (char c)
+ let parser inp = parseFromString (mconcat <$> many p) inp
+ let start' = special (drop 2 start)
+ body' <- parser (middle ++ [lastChar])
+ let end' = special (drop 2 end)
+ return $ f (start' <> body' <> end')
+ Nothing -> do -- Either bad or case such as *****
+ guard (l >= 5)
+ let body' = (replicate (l - 4) c)
+ return $ f (special body')
+
+link :: T2T Inlines
+link = try imageLink <|> titleLink
+
+-- Link with title
+titleLink :: T2T Inlines
+titleLink = try $ do
+ char '['
+ notFollowedBy space
+ tokens <- sepBy1 (many $ noneOf " ]") space
+ guard (length tokens >= 2)
+ char ']'
+ let link' = last tokens
+ guard (length link' > 0)
+ let tit = concat (intersperse " " (init tokens))
+ return $ B.link link' "" (B.text tit)
+
+-- Link with image
+imageLink :: T2T Inlines
+imageLink = try $ do
+ char '['
+ body <- image
+ many1 space
+ l <- manyTill (noneOf "\n\r ") (char ']')
+ return (B.link l "" body)
+
+macro :: T2T Inlines
+macro = try $ do
+ name <- string "%%" *> oneOfStringsCI (map fst commands)
+ optional (try $ enclosed (char '(') (char ')') anyChar)
+ lookAhead (spaceChar <|> oneOf specialChars <|> newline)
+ maybe (return mempty) (\f -> B.str <$> asks f) (lookup name commands)
+ where
+ commands = [ ("date", date), ("mtime", mtime)
+ , ("infile", infile), ("outfile", outfile)]
+
+-- raw URLs in text are automatically linked
+url :: T2T Inlines
+url = try $ do
+ (rawUrl, escapedUrl) <- (try uri <|> emailAddress)
+ return $ B.link rawUrl "" (B.str escapedUrl)
+
+uri :: T2T (String, String)
+uri = try $ do
+ address <- t2tURI
+ return (address, escapeURI address)
+
+-- The definition of a URI in the T2T source differs from the
+-- actual definition. This is a transcription of the definition in
+-- the source of v2.6
+--isT2TURI :: String -> Bool
+--isT2TURI (parse t2tURI "" -> Right _) = True
+--isT2TURI _ = False
+
+t2tURI :: T2T String
+t2tURI = do
+ start <- try ((++) <$> proto <*> urlLogin) <|> guess
+ domain <- many1 chars
+ sep <- many (char '/')
+ form' <- option mempty ((:) <$> char '?' <*> many1 form)
+ anchor' <- option mempty ((:) <$> char '#' <*> many anchor)
+ return (start ++ domain ++ sep ++ form' ++ anchor')
+ where
+ protos = ["http", "https", "ftp", "telnet", "gopher", "wais"]
+ proto = (++) <$> oneOfStrings protos <*> string "://"
+ guess = (++) <$> (((++) <$> stringAnyCase "www" <*> option mempty ((:[]) <$> oneOf "23"))
+ <|> stringAnyCase "ftp") <*> ((:[]) <$> char '.')
+ login = alphaNum <|> oneOf "_.-"
+ pass = many (noneOf " @")
+ chars = alphaNum <|> oneOf "%._/~:,=$@&+-"
+ anchor = alphaNum <|> oneOf "%._0"
+ form = chars <|> oneOf ";*"
+ urlLogin = option mempty $ try ((\x y z -> x ++ y ++ [z]) <$> many1 login <*> option mempty ((:) <$> char ':' <*> pass) <*> char '@')
+
+
+image :: T2T Inlines
+image = try $ do
+ -- List taken from txt2tags source
+ let extensions = [".jpg", ".jpeg", ".gif", ".png", ".eps", ".bmp"]
+ char '['
+ path <- manyTill (noneOf "\n\t\r ") (try $ lookAhead (oneOfStrings extensions))
+ ext <- oneOfStrings extensions
+ char ']'
+ return $ B.image (path ++ ext) "" mempty
+
+-- Characters used in markup
+specialChars :: String
+specialChars = "%*-_/|:+;"
+
+tab :: T2T Char
+tab = char '\t'
+
+space :: T2T Char
+space = char ' '
+
+spaces :: T2T String
+spaces = many space
+
+endline :: T2T Inlines
+endline = try $ do
+ newline
+ notFollowedBy blankline
+ notFollowedBy hrule
+ notFollowedBy title
+ notFollowedBy verbatim
+ notFollowedBy rawBlock
+ notFollowedBy taggedBlock
+ notFollowedBy quote
+ notFollowedBy list
+ notFollowedBy table
+ return $ B.space
+
+str :: T2T Inlines
+str = try $ do
+ B.str <$> many1 (noneOf $ specialChars ++ "\n\r ")
+
+whitespace :: T2T Inlines
+whitespace = try $ B.space <$ spaceChar
+
+symbol :: T2T Inlines
+symbol = B.str . (:[]) <$> oneOf specialChars
+
+-- Utility
+
+getTarget :: T2T String
+getTarget = do
+ mv <- lookupMeta "target" . stateMeta <$> getState
+ let MetaString target = fromMaybe (MetaString "html") mv
+ return target
+
+atStart :: T2T ()
+atStart = (sourceColumn <$> getPosition) >>= guard . (== 1)
+
+ignoreSpacesCap :: T2T String -> T2T String
+ignoreSpacesCap p = map toLower <$> (spaces *> p <* spaces)
+