aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/DocBook.hs904
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs34
-rw-r--r--src/Text/Pandoc/Readers/LaTeX.hs179
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs288
-rw-r--r--src/Text/Pandoc/Readers/RST.hs263
-rw-r--r--src/Text/Pandoc/Readers/Textile.hs361
6 files changed, 1546 insertions, 483 deletions
diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs
new file mode 100644
index 000000000..62f7c61a0
--- /dev/null
+++ b/src/Text/Pandoc/Readers/DocBook.hs
@@ -0,0 +1,904 @@
+module Text.Pandoc.Readers.DocBook ( readDocBook ) where
+import Data.Char (toUpper, isDigit)
+import Text.Pandoc.Parsing (ParserState(..))
+import Text.Pandoc.Definition
+import Text.Pandoc.Builder
+import Text.XML.Light
+import Text.HTML.TagSoup.Entity (lookupEntity)
+import Data.Generics
+import Data.Monoid
+import Data.Char (isSpace)
+import Control.Monad.State
+import Control.Applicative ((<$>))
+import Data.List (intersperse)
+
+{-
+
+List of all DocBook tags, with [x] indicating implemented,
+[o] meaning intentionally left unimplemented (pass through):
+
+[o] abbrev - An abbreviation, especially one followed by a period
+[x] abstract - A summary
+[o] accel - A graphical user interface (GUI) keyboard shortcut
+[x] ackno - Acknowledgements in an Article
+[o] acronym - An often pronounceable word made from the initial
+[o] action - A response to a user event
+[o] address - A real-world address, generally a postal address
+[ ] affiliation - The institutional affiliation of an individual
+[ ] alt - Text representation for a graphical element
+[o] anchor - A spot in the document
+[x] answer - An answer to a question posed in a QandASet
+[x] appendix - An appendix in a Book or Article
+[x] appendixinfo - Meta-information for an Appendix
+[o] application - The name of a software program
+[x] area - A region defined for a Callout in a graphic or code example
+[x] areaset - A set of related areas in a graphic or code example
+[x] areaspec - A collection of regions in a graphic or code example
+[ ] arg - An argument in a CmdSynopsis
+[x] article - An article
+[x] articleinfo - Meta-information for an Article
+[ ] artpagenums - The page numbers of an article as published
+[x] attribution - The source of a block quote or epigraph
+[ ] audiodata - Pointer to external audio data
+[ ] audioobject - A wrapper for audio data and its associated meta-information
+[x] author - The name of an individual author
+[ ] authorblurb - A short description or note about an author
+[ ] authorgroup - Wrapper for author information when a document has
+ multiple authors or collabarators
+[x] authorinitials - The initials or other short identifier for an author
+[o] beginpage - The location of a page break in a print version of the document
+[ ] bibliocoverage - The spatial or temporal coverage of a document
+[x] bibliodiv - A section of a Bibliography
+[x] biblioentry - An entry in a Bibliography
+[x] bibliography - A bibliography
+[ ] bibliographyinfo - Meta-information for a Bibliography
+[ ] biblioid - An identifier for a document
+[o] bibliolist - A wrapper for a set of bibliography entries
+[ ] bibliomisc - Untyped bibliographic information
+[x] bibliomixed - An entry in a Bibliography
+[ ] bibliomset - A cooked container for related bibliographic information
+[ ] biblioref - A cross reference to a bibliographic entry
+[ ] bibliorelation - The relationship of a document to another
+[ ] biblioset - A raw container for related bibliographic information
+[ ] bibliosource - The source of a document
+[ ] blockinfo - Meta-information for a block element
+[x] blockquote - A quotation set off from the main text
+[x] book - A book
+[x] bookinfo - Meta-information for a Book
+[x] bridgehead - A free-floating heading
+[ ] callout - A “called out” description of a marked Area
+[ ] calloutlist - A list of Callouts
+[x] caption - A caption
+[x] caution - A note of caution
+[x] chapter - A chapter, as of a book
+[x] chapterinfo - Meta-information for a Chapter
+[ ] citation - An inline bibliographic reference to another published work
+[ ] citebiblioid - A citation of a bibliographic identifier
+[ ] citerefentry - A citation to a reference page
+[ ] citetitle - The title of a cited work
+[ ] city - The name of a city in an address
+[ ] classname - The name of a class, in the object-oriented programming sense
+[ ] classsynopsis - The syntax summary for a class definition
+[ ] classsynopsisinfo - Information supplementing the contents of
+ a ClassSynopsis
+[ ] cmdsynopsis - A syntax summary for a software command
+[ ] co - The location of a callout embedded in text
+[x] code - An inline code fragment
+[x] col - Specifications for a column in an HTML table
+[x] colgroup - A group of columns in an HTML table
+[ ] collab - Identifies a collaborator
+[ ] collabname - The name of a collaborator
+[ ] colophon - Text at the back of a book describing facts about its production
+[x] colspec - Specifications for a column in a table
+[x] command - The name of an executable program or other software command
+[x] computeroutput - Data, generally text, displayed or presented by a computer
+[ ] confdates - The dates of a conference for which a document was written
+[ ] confgroup - A wrapper for document meta-information about a conference
+[ ] confnum - An identifier, frequently numerical, associated with a conference for which a document was written
+[ ] confsponsor - The sponsor of a conference for which a document was written
+[ ] conftitle - The title of a conference for which a document was written
+[x] constant - A programming or system constant
+[ ] constraint - A constraint in an EBNF production
+[ ] constraintdef - The definition of a constraint in an EBNF production
+[ ] constructorsynopsis - A syntax summary for a constructor
+[ ] contractnum - The contract number of a document
+[ ] contractsponsor - The sponsor of a contract
+[ ] contrib - A summary of the contributions made to a document by a
+ credited source
+[ ] copyright - Copyright information about a document
+[ ] coref - A cross reference to a co
+[ ] corpauthor - A corporate author, as opposed to an individual
+[ ] corpcredit - A corporation or organization credited in a document
+[ ] corpname - The name of a corporation
+[ ] country - The name of a country
+[ ] database - The name of a database, or part of a database
+[x] date - The date of publication or revision of a document
+[ ] dedication - A wrapper for the dedication section of a book
+[ ] destructorsynopsis - A syntax summary for a destructor
+[ ] edition - The name or number of an edition of a document
+[ ] editor - The name of the editor of a document
+[x] email - An email address
+[x] emphasis - Emphasized text
+[x] entry - A cell in a table
+[ ] entrytbl - A subtable appearing in place of an Entry in a table
+[ ] envar - A software environment variable
+[x] epigraph - A short inscription at the beginning of a document or component
+ note: also handle embedded attribution tag
+[ ] equation - A displayed mathematical equation
+[ ] errorcode - An error code
+[ ] errorname - An error name
+[ ] errortext - An error message.
+[ ] errortype - The classification of an error message
+[ ] example - A formal example, with a title
+[ ] exceptionname - The name of an exception
+[ ] fax - A fax number
+[ ] fieldsynopsis - The name of a field in a class definition
+[ ] figure - A formal figure, generally an illustration, with a title
+[x] filename - The name of a file
+[ ] firstname - The first name of a person
+[ ] firstterm - The first occurrence of a term
+[x] footnote - A footnote
+[ ] footnoteref - A cross reference to a footnote (a footnote mark)
+[x] foreignphrase - A word or phrase in a language other than the primary
+ language of the document
+[x] formalpara - A paragraph with a title
+[ ] funcdef - A function (subroutine) name and its return type
+[ ] funcparams - Parameters for a function referenced through a function
+ pointer in a synopsis
+[ ] funcprototype - The prototype of a function
+[ ] funcsynopsis - The syntax summary for a function definition
+[ ] funcsynopsisinfo - Information supplementing the FuncDefs of a FuncSynopsis
+[x] function - The name of a function or subroutine, as in a
+ programming language
+[x] glossary - A glossary
+[x] glossaryinfo - Meta-information for a Glossary
+[x] glossdef - A definition in a GlossEntry
+[x] glossdiv - A division in a Glossary
+[x] glossentry - An entry in a Glossary or GlossList
+[x] glosslist - A wrapper for a set of GlossEntrys
+[x] glosssee - A cross-reference from one GlossEntry to another
+[x] glossseealso - A cross-reference from one GlossEntry to another
+[x] glossterm - A glossary term
+[ ] graphic - A displayed graphical object (not an inline)
+[ ] graphicco - A graphic that contains callout areas
+[ ] group - A group of elements in a CmdSynopsis
+[ ] guibutton - The text on a button in a GUI
+[ ] guiicon - Graphic and/or text appearing as a icon in a GUI
+[ ] guilabel - The text of a label in a GUI
+[ ] guimenu - The name of a menu in a GUI
+[ ] guimenuitem - The name of a terminal menu item in a GUI
+[ ] guisubmenu - The name of a submenu in a GUI
+[ ] hardware - A physical part of a computer system
+[ ] highlights - A summary of the main points of the discussed component
+[ ] holder - The name of the individual or organization that holds a copyright
+[o] honorific - The title of a person
+[ ] html:form - An HTML form
+[ ] imagedata - Pointer to external image data
+[ ] imageobject - A wrapper for image data and its associated meta-information
+[ ] imageobjectco - A wrapper for an image object with callouts
+[x] important - An admonition set off from the text
+[x] index - An index
+[x] indexdiv - A division in an index
+[x] indexentry - An entry in an index
+[x] indexinfo - Meta-information for an Index
+[x] indexterm - A wrapper for terms to be indexed
+[x] info - A wrapper for information about a component or other block. (DocBook v5)
+[ ] informalequation - A displayed mathematical equation without a title
+[ ] informalexample - A displayed example without a title
+[ ] informalfigure - A untitled figure
+[ ] informaltable - A table without a title
+[ ] initializer - The initializer for a FieldSynopsis
+[ ] inlineequation - A mathematical equation or expression occurring inline
+[ ] inlinegraphic - An object containing or pointing to graphical data
+ that will be rendered inline
+[x] inlinemediaobject - An inline media object (video, audio, image, and so on)
+[ ] interface - An element of a GUI
+[ ] interfacename - The name of an interface
+[ ] invpartnumber - An inventory part number
+[ ] isbn - The International Standard Book Number of a document
+[ ] issn - The International Standard Serial Number of a periodical
+[ ] issuenum - The number of an issue of a journal
+[x] itemizedlist - A list in which each entry is marked with a bullet or
+ other dingbat
+[ ] itermset - A set of index terms in the meta-information of a document
+[ ] jobtitle - The title of an individual in an organization
+[ ] keycap - The text printed on a key on a keyboard
+[ ] keycode - The internal, frequently numeric, identifier for a key
+ on a keyboard
+[ ] keycombo - A combination of input actions
+[ ] keysym - The symbolic name of a key on a keyboard
+[ ] keyword - One of a set of keywords describing the content of a document
+[ ] keywordset - A set of keywords describing the content of a document
+[ ] label - A label on a Question or Answer
+[ ] legalnotice - A statement of legal obligations or requirements
+[ ] lhs - The left-hand side of an EBNF production
+[ ] lineage - The portion of a person's name indicating a relationship to
+ ancestors
+[ ] lineannotation - A comment on a line in a verbatim listing
+[x] link - A hypertext link
+[x] listitem - A wrapper for the elements of a list item
+[x] literal - Inline text that is some literal value
+[x] literallayout - A block of text in which line breaks and white space are
+ to be reproduced faithfully
+[ ] lot - A list of the titles of formal objects (as tables or figures) in
+ a document
+[ ] lotentry - An entry in a list of titles
+[ ] manvolnum - A reference volume number
+[x] markup - A string of formatting markup in text that is to be
+ represented literally
+[ ] mathphrase - A mathematical phrase, an expression that can be represented
+ with ordinary text and a small amount of markup
+[ ] medialabel - A name that identifies the physical medium on which some
+ information resides
+[x] mediaobject - A displayed media object (video, audio, image, etc.)
+[ ] mediaobjectco - A media object that contains callouts
+[x] member - An element of a simple list
+[ ] menuchoice - A selection or series of selections from a menu
+[ ] methodname - The name of a method
+[ ] methodparam - Parameters to a method
+[ ] methodsynopsis - A syntax summary for a method
+[ ] mml:math - A MathML equation
+[ ] modespec - Application-specific information necessary for the
+ completion of an OLink
+[ ] modifier - Modifiers in a synopsis
+[ ] mousebutton - The conventional name of a mouse button
+[ ] msg - A message in a message set
+[ ] msgaud - The audience to which a message in a message set is relevant
+[ ] msgentry - A wrapper for an entry in a message set
+[ ] msgexplan - Explanatory material relating to a message in a message set
+[ ] msginfo - Information about a message in a message set
+[ ] msglevel - The level of importance or severity of a message in a message set
+[ ] msgmain - The primary component of a message in a message set
+[ ] msgorig - The origin of a message in a message set
+[ ] msgrel - A related component of a message in a message set
+[ ] msgset - A detailed set of messages, usually error messages
+[ ] msgsub - A subcomponent of a message in a message set
+[ ] msgtext - The actual text of a message component in a message set
+[ ] nonterminal - A non-terminal in an EBNF production
+[x] note - A message set off from the text
+[ ] objectinfo - Meta-information for an object
+[ ] olink - A link that addresses its target indirectly, through an entity
+[ ] ooclass - A class in an object-oriented programming language
+[ ] ooexception - An exception in an object-oriented programming language
+[ ] oointerface - An interface in an object-oriented programming language
+[x] option - An option for a software command
+[x] optional - Optional information
+[x] orderedlist - A list in which each entry is marked with a sequentially
+ incremented label
+[ ] orgdiv - A division of an organization
+[ ] orgname - The name of an organization other than a corporation
+[ ] otheraddr - Uncategorized information in address
+[ ] othercredit - A person or entity, other than an author or editor,
+ credited in a document
+[ ] othername - A component of a persons name that is not a first name,
+ surname, or lineage
+[ ] package - A package
+[ ] pagenums - The numbers of the pages in a book, for use in a bibliographic
+ entry
+[x] para - A paragraph
+[ ] paramdef - Information about a function parameter in a programming language
+[x] parameter - A value or a symbolic reference to a value
+[ ] part - A division in a book
+[ ] partinfo - Meta-information for a Part
+[ ] partintro - An introduction to the contents of a part
+[ ] personblurb - A short description or note about a person
+[ ] personname - The personal name of an individual
+[ ] phone - A telephone number
+[ ] phrase - A span of text
+[ ] pob - A post office box in an address
+[ ] postcode - A postal code in an address
+[x] preface - Introductory matter preceding the first chapter of a book
+[ ] prefaceinfo - Meta-information for a Preface
+[ ] primary - The primary word or phrase under which an index term should be
+ sorted
+[ ] primaryie - A primary term in an index entry, not in the text
+[ ] printhistory - The printing history of a document
+[ ] procedure - A list of operations to be performed in a well-defined sequence
+[ ] production - A production in a set of EBNF productions
+[ ] productionrecap - A cross-reference to an EBNF production
+[ ] productionset - A set of EBNF productions
+[ ] productname - The formal name of a product
+[ ] productnumber - A number assigned to a product
+[x] programlisting - A literal listing of all or part of a program
+[ ] programlistingco - A program listing with associated areas used in callouts
+[x] prompt - A character or string indicating the start of an input field in
+ a computer display
+[ ] property - A unit of data associated with some part of a computer system
+[ ] pubdate - The date of publication of a document
+[ ] publisher - The publisher of a document
+[ ] publishername - The name of the publisher of a document
+[ ] pubsnumber - A number assigned to a publication other than an ISBN or ISSN
+ or inventory part number
+[x] qandadiv - A titled division in a QandASet
+[o] qandaentry - A question/answer set within a QandASet
+[o] qandaset - A question-and-answer set
+[x] question - A question in a QandASet
+[x] quote - An inline quotation
+[ ] refclass - The scope or other indication of applicability of a
+ reference entry
+[ ] refdescriptor - A description of the topic of a reference page
+[ ] refentry - A reference page (originally a UNIX man-style reference page)
+[ ] refentryinfo - Meta-information for a Refentry
+[ ] refentrytitle - The title of a reference page
+[ ] reference - A collection of reference entries
+[ ] referenceinfo - Meta-information for a Reference
+[ ] refmeta - Meta-information for a reference entry
+[ ] refmiscinfo - Meta-information for a reference entry other than the title
+ and volume number
+[ ] refname - The name of (one of) the subject(s) of a reference page
+[ ] refnamediv - The name, purpose, and classification of a reference page
+[ ] refpurpose - A short (one sentence) synopsis of the topic of a reference
+ page
+[x] refsect1 - A major subsection of a reference entry
+[x] refsect1info - Meta-information for a RefSect1
+[x] refsect2 - A subsection of a RefSect1
+[x] refsect2info - Meta-information for a RefSect2
+[x] refsect3 - A subsection of a RefSect2
+[x] refsect3info - Meta-information for a RefSect3
+[x] refsection - A recursive section in a refentry
+[x] refsectioninfo - Meta-information for a refsection
+[ ] refsynopsisdiv - A syntactic synopsis of the subject of the reference page
+[ ] refsynopsisdivinfo - Meta-information for a RefSynopsisDiv
+[ ] releaseinfo - Information about a particular release of a document
+[ ] remark - A remark (or comment) intended for presentation in a draft
+ manuscript
+[ ] replaceable - Content that may or must be replaced by the user
+[ ] returnvalue - The value returned by a function
+[ ] revdescription - A extended description of a revision to a document
+[ ] revhistory - A history of the revisions to a document
+[ ] revision - An entry describing a single revision in the history of the
+ revisions to a document
+[ ] revnumber - A document revision number
+[ ] revremark - A description of a revision to a document
+[ ] rhs - The right-hand side of an EBNF production
+[x] row - A row in a table
+[ ] sbr - An explicit line break in a command synopsis
+[x] screen - Text that a user sees or might see on a computer screen
+[o] screenco - A screen with associated areas used in callouts
+[o] screeninfo - Information about how a screen shot was produced
+[ ] screenshot - A representation of what the user sees or might see on a
+ computer screen
+[ ] secondary - A secondary word or phrase in an index term
+[ ] secondaryie - A secondary term in an index entry, rather than in the text
+[x] sect1 - A top-level section of document
+[x] sect1info - Meta-information for a Sect1
+[x] sect2 - A subsection within a Sect1
+[x] sect2info - Meta-information for a Sect2
+[x] sect3 - A subsection within a Sect2
+[x] sect3info - Meta-information for a Sect3
+[x] sect4 - A subsection within a Sect3
+[x] sect4info - Meta-information for a Sect4
+[x] sect5 - A subsection within a Sect4
+[x] sect5info - Meta-information for a Sect5
+[x] section - A recursive section
+[x] sectioninfo - Meta-information for a recursive section
+[x] see - Part of an index term directing the reader instead to another entry
+ in the index
+[x] seealso - Part of an index term directing the reader also to another entry
+ in the index
+[ ] seealsoie - A See also entry in an index, rather than in the text
+[ ] seeie - A See entry in an index, rather than in the text
+[x] seg - An element of a list item in a segmented list
+[x] seglistitem - A list item in a segmented list
+[x] segmentedlist - A segmented list, a list of sets of elements
+[x] segtitle - The title of an element of a list item in a segmented list
+[ ] seriesvolnums - Numbers of the volumes in a series of books
+[ ] set - A collection of books
+[ ] setindex - An index to a set of books
+[ ] setindexinfo - Meta-information for a SetIndex
+[ ] setinfo - Meta-information for a Set
+[ ] sgmltag - A component of SGML markup
+[ ] shortaffil - A brief description of an affiliation
+[ ] shortcut - A key combination for an action that is also accessible through
+ a menu
+[ ] sidebar - A portion of a document that is isolated from the main
+ narrative flow
+[ ] sidebarinfo - Meta-information for a Sidebar
+[x] simpara - A paragraph that contains only text and inline markup, no block
+ elements
+[x] simplelist - An undecorated list of single words or short phrases
+[ ] simplemsgentry - A wrapper for a simpler entry in a message set
+[ ] simplesect - A section of a document with no subdivisions
+[ ] spanspec - Formatting information for a spanned column in a table
+[ ] state - A state or province in an address
+[ ] step - A unit of action in a procedure
+[ ] stepalternatives - Alternative steps in a procedure
+[ ] street - A street address in an address
+[ ] structfield - A field in a structure (in the programming language sense)
+[ ] structname - The name of a structure (in the programming language sense)
+[ ] subject - One of a group of terms describing the subject matter of a
+ document
+[ ] subjectset - A set of terms describing the subject matter of a document
+[ ] subjectterm - A term in a group of terms describing the subject matter of
+ a document
+[x] subscript - A subscript (as in H2O, the molecular formula for water)
+[ ] substeps - A wrapper for steps that occur within steps in a procedure
+[x] subtitle - The subtitle of a document
+[x] superscript - A superscript (as in x2, the mathematical notation for x
+ multiplied by itself)
+[ ] surname - A family name; in western cultures the last name
+[ ] svg:svg - An SVG graphic
+[x] symbol - A name that is replaced by a value before processing
+[ ] synopfragment - A portion of a CmdSynopsis broken out from the main body
+ of the synopsis
+[ ] synopfragmentref - A reference to a fragment of a command synopsis
+[ ] synopsis - A general-purpose element for representing the syntax of
+ commands or functions
+[ ] systemitem - A system-related item or term
+[ ] table - A formal table in a document
+[ ] task - A task to be completed
+[ ] taskprerequisites - The prerequisites for a task
+[ ] taskrelated - Information related to a task
+[ ] tasksummary - A summary of a task
+[x] tbody - A wrapper for the rows of a table or informal table
+[x] td - A table entry in an HTML table
+[x] term - The word or phrase being defined or described in a variable list
+[ ] termdef - An inline term definition
+[ ] tertiary - A tertiary word or phrase in an index term
+[ ] tertiaryie - A tertiary term in an index entry, rather than in the text
+[ ] textdata - Pointer to external text data
+[ ] textobject - A wrapper for a text description of an object and its
+ associated meta-information
+[ ] tfoot - A table footer consisting of one or more rows
+[x] tgroup - A wrapper for the main content of a table, or part of a table
+[x] th - A table header entry in an HTML table
+[x] thead - A table header consisting of one or more rows
+[x] tip - A suggestion to the user, set off from the text
+[x] title - The text of the title of a section of a document or of a formal
+ block-level element
+[x] titleabbrev - The abbreviation of a Title
+[x] toc - A table of contents
+[x] tocback - An entry in a table of contents for a back matter component
+[x] tocchap - An entry in a table of contents for a component in the body of
+ a document
+[x] tocentry - A component title in a table of contents
+[x] tocfront - An entry in a table of contents for a front matter component
+[x] toclevel1 - A top-level entry within a table of contents entry for a
+ chapter-like component
+[x] toclevel2 - A second-level entry within a table of contents entry for a
+ chapter-like component
+[x] toclevel3 - A third-level entry within a table of contents entry for a
+ chapter-like component
+[x] toclevel4 - A fourth-level entry within a table of contents entry for a
+ chapter-like component
+[x] toclevel5 - A fifth-level entry within a table of contents entry for a
+ chapter-like component
+[x] tocpart - An entry in a table of contents for a part of a book
+[ ] token - A unit of information
+[x] tr - A row in an HTML table
+[ ] trademark - A trademark
+[ ] type - The classification of a value
+[x] ulink - A link that addresses its target by means of a URL
+ (Uniform Resource Locator)
+[x] uri - A Uniform Resource Identifier
+[x] userinput - Data entered by the user
+[x] varargs - An empty element in a function synopsis indicating a variable
+ number of arguments
+[x] variablelist - A list in which each entry is composed of a set of one or
+ more terms and an associated description
+[x] varlistentry - A wrapper for a set of terms and the associated description
+ in a variable list
+[x] varname - The name of a variable
+[ ] videodata - Pointer to external video data
+[ ] videoobject - A wrapper for video data and its associated meta-information
+[ ] void - An empty element in a function synopsis indicating that the
+ function in question takes no arguments
+[ ] volumenum - The volume number of a document in a set (as of books in a set
+ or articles in a journal)
+[x] warning - An admonition set off from the text
+[x] wordasword - A word meant specifically as a word and not representing
+ anything else
+[ ] xref - A cross reference to another part of the document
+[ ] year - The year of publication of a document
+
+-}
+
+type DB = State DBState
+
+data DBState = DBState{ dbSectionLevel :: Int
+ , dbQuoteType :: QuoteType
+ , dbDocTitle :: Inlines
+ , dbDocAuthors :: [Inlines]
+ , dbDocDate :: Inlines
+ , dbBook :: Bool
+ } deriving Show
+
+readDocBook :: ParserState -> String -> Pandoc
+readDocBook _ inp = setTitle (dbDocTitle st')
+ $ setAuthors (dbDocAuthors st')
+ $ setDate (dbDocDate st')
+ $ doc $ mconcat bs
+ where (bs, st') = runState (mapM parseBlock $ normalizeTree $ parseXML inp)
+ DBState{ dbSectionLevel = 0
+ , dbQuoteType = DoubleQuote
+ , dbDocTitle = mempty
+ , dbDocAuthors = []
+ , dbDocDate = mempty
+ , dbBook = False
+ }
+
+-- normalize input, consolidating adjacent Text and CRef elements
+normalizeTree :: [Content] -> [Content]
+normalizeTree = everywhere (mkT go)
+ where go :: [Content] -> [Content]
+ go (Text (CData CDataRaw _ _):xs) = xs
+ go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
+ Text (CData CDataText (s1 ++ s2) z):xs
+ go (Text (CData CDataText s1 z):CRef r:xs) =
+ Text (CData CDataText (s1 ++ convertEntity r) z):xs
+ go (CRef r:Text (CData CDataText s1 z):xs) =
+ Text (CData CDataText (convertEntity r ++ s1) z):xs
+ go (CRef r1:CRef r2:xs) =
+ Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs
+ go xs = xs
+
+convertEntity :: String -> String
+convertEntity e = maybe (map toUpper e) (:[]) (lookupEntity e)
+
+-- convenience function to get an attribute value, defaulting to ""
+attrValue :: String -> Element -> String
+attrValue attr elt =
+ case lookupAttrBy (\x -> qName x == attr) (elAttribs elt) of
+ Just z -> z
+ Nothing -> ""
+
+-- convenience function
+named :: String -> Element -> Bool
+named s e = qName (elName e) == s
+
+isBlockElement :: Content -> Bool
+isBlockElement (Elem e) = qName (elName e) `elem` blocktags
+ where blocktags = ["toc","index","para","formalpara","simpara",
+ "ackno","epigraph","blockquote","bibliography","bibliodiv",
+ "biblioentry","glossee","glosseealso","glossary",
+ "glossdiv","glosslist","chapter","appendix","preface",
+ "bridgehead","sect1","sect2","sect3","sect4","sect5","section",
+ "refsect1","refsect2","refsect3","refsection",
+ "important","caution","note","tip","warning","qandadiv",
+ "question","answer","abstract","itemizedlist","orderedlist",
+ "variablelist","article","book","table","informaltable",
+ "screen","programlisting","example"]
+isBlockElement _ = False
+
+-- Trim leading and trailing newline characters
+trimNl :: String -> String
+trimNl = reverse . go . reverse . go
+ where go ('\n':xs) = xs
+ go xs = xs
+
+-- meld text into beginning of first paragraph of Blocks.
+-- assumes Blocks start with a Para; if not, does nothing.
+addToStart :: Inlines -> Blocks -> Blocks
+addToStart toadd bs =
+ case toList bs of
+ (Para xs : rest) -> para (toadd <> fromList xs) <> fromList rest
+ _ -> bs
+
+-- function that is used by both mediaobject (in parseBlock)
+-- and inlinemediaobject (in parseInline)
+getImage :: Element -> DB Inlines
+getImage e = do
+ imageUrl <- case filterChild (named "imageobject") e of
+ Nothing -> return mempty
+ Just z -> case filterChild (named "imagedata") z of
+ Nothing -> return mempty
+ Just i -> return $ attrValue "fileref" i
+ caption <- case filterChild
+ (\x -> named "caption" x || named "textobject" x) e of
+ Nothing -> return mempty
+ Just z -> mconcat <$> (mapM parseInline $ elContent z)
+ return $ image imageUrl "" caption
+
+parseBlock :: Content -> DB Blocks
+parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE
+parseBlock (Text (CData _ s _)) = if all isSpace s
+ then return mempty
+ else return $ plain $ trimInlines $ text s
+parseBlock (CRef x) = return $ plain $ str $ map toUpper x
+parseBlock (Elem e) =
+ case qName (elName e) of
+ "toc" -> return mempty -- skip TOC, since in pandoc it's autogenerated
+ "index" -> return mempty -- skip index, since page numbers meaningless
+ "para" -> parseMixed para (elContent e)
+ "formalpara" -> do
+ tit <- case filterChild (named "title") e of
+ Just t -> (<> str "." <> linebreak) <$> emph
+ <$> getInlines t
+ Nothing -> return mempty
+ addToStart tit <$> parseMixed para (elContent e)
+ "simpara" -> parseMixed para (elContent e)
+ "ackno" -> parseMixed para (elContent e)
+ "epigraph" -> parseBlockquote
+ "blockquote" -> parseBlockquote
+ "attribution" -> return mempty
+ "titleabbrev" -> return mempty
+ "authorinitials" -> return mempty
+ "title" -> return mempty -- handled by getTitle or sect
+ "bibliography" -> sect 0
+ "bibliodiv" -> sect 1
+ "biblioentry" -> parseMixed para (elContent e)
+ "bibliomixed" -> parseMixed para (elContent e)
+ "glosssee" -> para . (\ils -> text "See " <> ils <> str ".")
+ <$> getInlines e
+ "glossseealso" -> para . (\ils -> text "See also " <> ils <> str ".")
+ <$> getInlines e
+ "glossary" -> sect 0
+ "glossdiv" -> definitionList <$>
+ mapM parseGlossEntry (filterChildren (named "glossentry") e)
+ "glosslist" -> definitionList <$>
+ mapM parseGlossEntry (filterChildren (named "glossentry") e)
+ "chapter" -> sect 0
+ "appendix" -> sect 0
+ "preface" -> sect 0
+ "bridgehead" -> para . strong <$> getInlines e
+ "sect1" -> sect 1
+ "sect2" -> sect 2
+ "sect3" -> sect 3
+ "sect4" -> sect 4
+ "sect5" -> sect 5
+ "section" -> gets dbSectionLevel >>= sect . (+1)
+ "refsect1" -> sect 1
+ "refsect2" -> sect 2
+ "refsect3" -> sect 3
+ "refsection" -> gets dbSectionLevel >>= sect . (+1)
+ "important" -> blockQuote . (para (strong $ str "Important") <>)
+ <$> getBlocks e
+ "caution" -> blockQuote . (para (strong $ str "Caution") <>)
+ <$> getBlocks e
+ "note" -> blockQuote . (para (strong $ str "Note") <>)
+ <$> getBlocks e
+ "tip" -> blockQuote . (para (strong $ str "Tip") <>)
+ <$> getBlocks e
+ "warning" -> blockQuote . (para (strong $ str "Warning") <>)
+ <$> getBlocks e
+ "area" -> return mempty
+ "areaset" -> return mempty
+ "areaspec" -> return mempty
+ "qandadiv" -> gets dbSectionLevel >>= sect . (+1)
+ "question" -> addToStart (strong (str "Q:") <> str " ") <$> getBlocks e
+ "answer" -> addToStart (strong (str "A:") <> str " ") <$> getBlocks e
+ "abstract" -> blockQuote <$> getBlocks e
+ "itemizedlist" -> bulletList <$> listitems
+ "orderedlist" -> do
+ let listStyle = case attrValue "numeration" e of
+ "arabic" -> Decimal
+ "loweralpha" -> LowerAlpha
+ "upperalpha" -> UpperAlpha
+ "lowerroman" -> LowerRoman
+ "upperroman" -> UpperRoman
+ _ -> Decimal
+ let start = case attrValue "override" <$>
+ filterElement (named "listitem") e of
+ Just x@(_:_) | all isDigit x -> read x
+ _ -> 1
+ orderedListWith (start,listStyle,DefaultDelim)
+ <$> listitems
+ "variablelist" -> definitionList <$> deflistitems
+ "mediaobject" -> para <$> (getImage e)
+ "caption" -> return mempty
+ "info" -> getTitle >> getAuthors >> getDate >> return mempty
+ "articleinfo" -> getTitle >> getAuthors >> getDate >> return mempty
+ "sectioninfo" -> return mempty -- keywords & other metadata
+ "refsectioninfo" -> return mempty -- keywords & other metadata
+ "refsect1info" -> return mempty -- keywords & other metadata
+ "refsect2info" -> return mempty -- keywords & other metadata
+ "refsect3info" -> return mempty -- keywords & other metadata
+ "sect1info" -> return mempty -- keywords & other metadata
+ "sect2info" -> return mempty -- keywords & other metadata
+ "sect3info" -> return mempty -- keywords & other metadata
+ "sect4info" -> return mempty -- keywords & other metadata
+ "sect5info" -> return mempty -- keywords & other metadata
+ "chapterinfo" -> return mempty -- keywords & other metadata
+ "glossaryinfo" -> return mempty -- keywords & other metadata
+ "appendixinfo" -> return mempty -- keywords & other metadata
+ "bookinfo" -> getTitle >> getAuthors >> getDate >> return mempty
+ "article" -> modify (\st -> st{ dbBook = False }) >>
+ getTitle >> getBlocks e
+ "book" -> modify (\st -> st{ dbBook = True }) >> getTitle >> getBlocks e
+ "table" -> parseTable
+ "informaltable" -> parseTable
+ "literallayout" -> codeBlockWithLang
+ "screen" -> codeBlockWithLang
+ "programlisting" -> codeBlockWithLang
+ "?xml" -> return mempty
+ _ -> getBlocks e
+ where getBlocks e' = mconcat <$> (mapM parseBlock $ elContent e')
+ parseMixed container conts = do
+ let (ils,rest) = break isBlockElement conts
+ ils' <- (trimInlines . mconcat) <$> mapM parseInline ils
+ let p = if ils' == mempty then mempty else container ils'
+ case rest of
+ [] -> return p
+ (r:rs) -> do
+ b <- parseBlock r
+ x <- parseMixed container rs
+ return $ p <> b <> x
+ codeBlockWithLang = do
+ let classes' = case attrValue "language" e of
+ "" -> []
+ x -> [x]
+ return $ codeBlockWith (attrValue "id" e, classes', [])
+ $ trimNl $ strContent e
+ parseBlockquote = do
+ attrib <- case filterChild (named "attribution") e of
+ Nothing -> return mempty
+ Just z -> (para . (str "— " <>) . mconcat)
+ <$> (mapM parseInline $ elContent z)
+ contents <- getBlocks e
+ return $ blockQuote (contents <> attrib)
+ listitems = mapM getBlocks $ filterChildren (named "listitem") e
+ deflistitems = mapM parseVarListEntry $ filterChildren
+ (named "varlistentry") e
+ parseVarListEntry e' = do
+ let terms = filterChildren (named "term") e'
+ let items = filterChildren (named "listitem") e'
+ terms' <- mapM getInlines terms
+ items' <- mapM getBlocks items
+ return (mconcat $ intersperse (str "; ") terms', items')
+ parseGlossEntry e' = do
+ let terms = filterChildren (named "glossterm") e'
+ let items = filterChildren (named "glossdef") e'
+ terms' <- mapM getInlines terms
+ items' <- mapM getBlocks items
+ return (mconcat $ intersperse (str "; ") terms', items')
+ getTitle = case filterChild (named "title") e of
+ Just t -> do
+ tit <- getInlines t
+ subtit <- case filterChild (named "subtitle") e of
+ Just s -> (text ": " <>) <$>
+ getInlines s
+ Nothing -> return mempty
+ modify $ \st -> st{dbDocTitle = tit <> subtit}
+ Nothing -> return ()
+ getAuthors = do
+ auths <- mapM getInlines
+ $ filterChildren (named "author") e
+ modify $ \st -> st{dbDocAuthors = auths}
+ getDate = case filterChild (named "date") e of
+ Just t -> do
+ dat <- getInlines t
+ modify $ \st -> st{dbDocDate = dat}
+ Nothing -> return ()
+ parseTable = do
+ let isCaption x = named "title" x || named "caption" x
+ caption <- case filterChild isCaption e of
+ Just t -> getInlines t
+ Nothing -> return mempty
+ let e' = maybe e id $ filterChild (named "tgroup") e
+ let isColspec x = named "colspec" x || named "col" x
+ let colspecs = case filterChild (named "colgroup") e' of
+ Just c -> filterChildren isColspec c
+ _ -> filterChildren isColspec e'
+ let isRow x = named "row" x || named "tr" x
+ headrows <- case filterChild (named "thead") e' of
+ Just h -> case filterChild isRow h of
+ Just x -> parseRow x
+ Nothing -> return []
+ Nothing -> return []
+ bodyrows <- case filterChild (named "tbody") e' of
+ Just b -> mapM parseRow
+ $ filterChildren isRow b
+ Nothing -> mapM parseRow
+ $ filterChildren isRow e'
+ let toAlignment c = case findAttr (unqual "align") c of
+ Just "left" -> AlignLeft
+ Just "right" -> AlignRight
+ Just "center" -> AlignCenter
+ _ -> AlignDefault
+ let toWidth c = case findAttr (unqual "colwidth") c of
+ Just w -> read $ filter (\x ->
+ (x >= '0' && x <= '9')
+ || x == '.') w
+ Nothing -> 0 :: Double
+ let numrows = maximum $ map length bodyrows
+ let aligns = case colspecs of
+ [] -> replicate numrows AlignDefault
+ cs -> map toAlignment cs
+ let widths = case colspecs of
+ [] -> replicate numrows 0
+ cs -> let ws = map toWidth cs
+ tot = sum ws
+ in if all (> 0) ws
+ then map (/ tot) ws
+ else replicate numrows 0
+ let headrows' = if null headrows
+ then replicate numrows mempty
+ else headrows
+ return $ table caption (zip aligns widths)
+ headrows' bodyrows
+ isEntry x = named "entry" x || named "td" x || named "th" x
+ parseRow = mapM (parseMixed plain . elContent) . filterChildren isEntry
+ sect n = do isbook <- gets dbBook
+ let n' = if isbook || n == 0 then n + 1 else n
+ headerText <- case filterChild (named "title") e of
+ Just t -> getInlines t
+ Nothing -> return mempty
+ modify $ \st -> st{ dbSectionLevel = n }
+ b <- getBlocks e
+ modify $ \st -> st{ dbSectionLevel = n - 1 }
+ return $ header n' headerText <> b
+
+getInlines :: Element -> DB Inlines
+getInlines e' = (trimInlines . mconcat) <$> (mapM parseInline $ elContent e')
+
+parseInline :: Content -> DB Inlines
+parseInline (Text (CData _ s _)) = return $ text s
+parseInline (CRef ref) =
+ return $ maybe (text $ map toUpper ref) (text . (:[])) $ lookupEntity ref
+parseInline (Elem e) =
+ case qName (elName e) of
+ "subscript" -> subscript <$> innerInlines
+ "superscript" -> superscript <$> innerInlines
+ "inlinemediaobject" -> getImage e
+ "quote" -> do
+ qt <- gets dbQuoteType
+ let qt' = if qt == SingleQuote then DoubleQuote else SingleQuote
+ modify $ \st -> st{ dbQuoteType = qt' }
+ contents <- innerInlines
+ modify $ \st -> st{ dbQuoteType = qt }
+ return $ if qt == SingleQuote
+ then singleQuoted contents
+ else doubleQuoted contents
+ "simplelist" -> simpleList
+ "segmentedlist" -> segmentedList
+ "code" -> codeWithLang
+ "filename" -> codeWithLang
+ "literal" -> codeWithLang
+ "computeroutput" -> codeWithLang
+ "prompt" -> codeWithLang
+ "parameter" -> codeWithLang
+ "option" -> codeWithLang
+ "optional" -> do x <- getInlines e
+ return $ str "[" <> x <> str "]"
+ "markup" -> codeWithLang
+ "wordasword" -> emph <$> innerInlines
+ "command" -> codeWithLang
+ "varname" -> codeWithLang
+ "function" -> codeWithLang
+ "type" -> codeWithLang
+ "symbol" -> codeWithLang
+ "constant" -> codeWithLang
+ "userinput" -> codeWithLang
+ "varargs" -> return $ code "(...)"
+ "xref" -> return $ str "?" -- so at least you know something is there
+ "email" -> return $ link ("mailto:" ++ strContent e) ""
+ $ code $ strContent e
+ "uri" -> return $ link (strContent e) "" $ code $ strContent e
+ "ulink" -> link (attrValue "url" e) "" <$> innerInlines
+ "link" -> do
+ ils <- innerInlines
+ let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of
+ Just h -> h
+ _ -> ('#' : attrValue "linkend" e)
+ let ils' = if ils == mempty then code href else ils
+ return $ link href "" ils'
+ "foreignphrase" -> emph <$> innerInlines
+ "emphasis" -> case attrValue "role" e of
+ "bold" -> strong <$> innerInlines
+ "strong" -> strong <$> innerInlines
+ "strikethrough" -> strikeout <$> innerInlines
+ _ -> emph <$> innerInlines
+ "footnote" -> (note . mconcat) <$> (mapM parseBlock $ elContent e)
+ "title" -> return mempty
+ _ -> innerInlines
+ where innerInlines = (trimInlines . mconcat) <$>
+ (mapM parseInline $ elContent e)
+ codeWithLang = do
+ let classes' = case attrValue "language" e of
+ "" -> []
+ l -> [l]
+ return $ codeWith (attrValue "id" e,classes',[]) $ strContent e
+ simpleList = (mconcat . intersperse (str "," <> space)) <$> mapM getInlines
+ (filterChildren (named "member") e)
+ segmentedList = do
+ tit <- maybe (return mempty) getInlines $ filterChild (named "title") e
+ segtits <- mapM getInlines $ filterChildren (named "segtitle") e
+ segitems <- mapM (mapM getInlines . filterChildren (named "seg"))
+ $ filterChildren (named "seglistitem") e
+ let toSeg = mconcat . zipWith (\x y -> strong (x <> str ":") <> space <>
+ y <> linebreak) segtits
+ let segs = mconcat $ map toSeg segitems
+ let tit' = if tit == mempty
+ then mempty
+ else strong tit <> linebreak
+ return $ linebreak <> tit' <> segs
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 0c017b2e4..d76524e14 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -36,8 +36,6 @@ module Text.Pandoc.Readers.HTML ( readHtml
, isCommentTag
) where
-import Text.ParserCombinators.Parsec
-import Text.ParserCombinators.Parsec.Pos
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match
import Text.Pandoc.Definition
@@ -46,8 +44,14 @@ import Text.Pandoc.Shared
import Text.Pandoc.Parsing
import Data.Maybe ( fromMaybe, isJust )
import Data.List ( intercalate )
-import Data.Char ( isSpace, isDigit, toLower )
-import Control.Monad ( liftM, guard, when )
+import Data.Char ( isDigit, toLower )
+import Control.Monad ( liftM, guard, when, mzero )
+
+isSpace :: Char -> Bool
+isSpace ' ' = True
+isSpace '\t' = True
+isSpace '\n' = True
+isSpace _ = False
-- | Convert HTML-formatted string to 'Pandoc' document.
readHtml :: ParserState -- ^ Parser state
@@ -62,7 +66,7 @@ readHtml st inp = Pandoc meta blocks
then parseHeader tags
else (Meta [] [] [], tags)
-type TagParser = GenParser (Tag String) ParserState
+type TagParser = Parser [Tag String] ParserState
-- TODO - fix this - not every header has a title tag
parseHeader :: [Tag String] -> (Meta, [Tag String])
@@ -222,6 +226,8 @@ pSimpleTable :: TagParser [Block]
pSimpleTable = try $ do
TagOpen _ _ <- pSatisfy (~== TagOpen "table" [])
skipMany pBlank
+ caption <- option [] $ pInTags "caption" inline >>~ skipMany pBlank
+ skipMany $ pInTags "col" block >> skipMany pBlank
head' <- option [] $ pOptInTag "thead" $ pInTags "tr" (pCell "th")
skipMany pBlank
rows <- pOptInTag "tbody"
@@ -231,7 +237,7 @@ pSimpleTable = try $ do
let cols = maximum $ map length rows
let aligns = replicate cols AlignLeft
let widths = replicate cols 0
- return [Table [] aligns widths head' rows]
+ return [Table caption aligns widths head' rows]
pCell :: String -> TagParser [TableCell]
pCell celltype = try $ do
@@ -409,7 +415,7 @@ pCloses tagtype = try $ do
(TagClose "ul") | tagtype == "li" -> return ()
(TagClose "ol") | tagtype == "li" -> return ()
(TagClose "dl") | tagtype == "li" -> return ()
- _ -> pzero
+ _ -> mzero
pTagText :: TagParser [Inline]
pTagText = try $ do
@@ -424,11 +430,11 @@ pBlank = try $ do
(TagText str) <- pSatisfy isTagText
guard $ all isSpace str
-pTagContents :: GenParser Char ParserState Inline
+pTagContents :: Parser [Char] ParserState Inline
pTagContents =
pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad
-pStr :: GenParser Char ParserState Inline
+pStr :: Parser [Char] ParserState Inline
pStr = do
result <- many1 $ satisfy $ \c ->
not (isSpace c) && not (isSpecial c) && not (isBad c)
@@ -447,13 +453,13 @@ isSpecial '\8220' = True
isSpecial '\8221' = True
isSpecial _ = False
-pSymbol :: GenParser Char ParserState Inline
+pSymbol :: Parser [Char] ParserState Inline
pSymbol = satisfy isSpecial >>= return . Str . (:[])
isBad :: Char -> Bool
isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
-pBad :: GenParser Char ParserState Inline
+pBad :: Parser [Char] ParserState Inline
pBad = do
c <- satisfy isBad
let c' = case c of
@@ -487,7 +493,7 @@ pBad = do
_ -> '?'
return $ Str [c']
-pSpace :: GenParser Char ParserState Inline
+pSpace :: Parser [Char] ParserState Inline
pSpace = many1 (satisfy isSpace) >> return Space
--
@@ -585,7 +591,7 @@ _ `closes` _ = False
--- parsers for use in markdown, textile readers
-- | Matches a stretch of HTML in balanced tags.
-htmlInBalanced :: (Tag String -> Bool) -> GenParser Char ParserState String
+htmlInBalanced :: (Tag String -> Bool) -> Parser [Char] ParserState String
htmlInBalanced f = try $ do
(TagOpen t _, tag) <- htmlTag f
guard $ '/' `notElem` tag -- not a self-closing tag
@@ -598,7 +604,7 @@ htmlInBalanced f = try $ do
return $ tag ++ concat contents ++ endtag
-- | Matches a tag meeting a certain condition.
-htmlTag :: (Tag String -> Bool) -> GenParser Char ParserState (Tag String, String)
+htmlTag :: (Tag String -> Bool) -> Parser [Char] ParserState (Tag String, String)
htmlTag f = try $ do
lookAhead (char '<')
(next : _) <- getInput >>= return . canonicalizeTags . parseTags
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
index 279f90318..351e1fef5 100644
--- a/src/Text/Pandoc/Readers/LaTeX.hs
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -33,10 +33,9 @@ module Text.Pandoc.Readers.LaTeX ( readLaTeX,
handleIncludes
) where
-import Text.ParserCombinators.Parsec hiding ((<|>), space, many, optional)
import Text.Pandoc.Definition
import Text.Pandoc.Shared
-import Text.Pandoc.Parsing
+import Text.Pandoc.Parsing hiding ((<|>), many, optional, space)
import qualified Text.Pandoc.UTF8 as UTF8
import Data.Char ( chr, ord )
import Control.Monad
@@ -64,7 +63,7 @@ parseLaTeX = do
let date' = stateDate st
return $ Pandoc (Meta title' authors' date') $ toList bs
-type LP = GenParser Char ParserState
+type LP = Parser [Char] ParserState
anyControlSeq :: LP String
anyControlSeq = do
@@ -82,9 +81,16 @@ controlSeq name = try $ do
case name of
"" -> mzero
[c] | not (isLetter c) -> string [c]
- cs -> string cs <* optional sp
+ cs -> string cs <* notFollowedBy letter <* optional sp
return name
+dimenarg :: LP String
+dimenarg = try $ do
+ ch <- option "" $ string "="
+ num <- many1 digit
+ dim <- oneOfStrings ["pt","pc","in","bp","cm","mm","dd","cc","sp"]
+ return $ ch ++ num ++ dim
+
sp :: LP ()
sp = skipMany1 $ satisfy (\c -> c == ' ' || c == '\t')
<|> (try $ newline >>~ lookAhead anyChar >>~ notFollowedBy blankline)
@@ -112,18 +118,28 @@ comment = do
newline
return ()
+bgroup :: LP ()
+bgroup = () <$ char '{'
+ <|> () <$ controlSeq "bgroup"
+ <|> () <$ controlSeq "begingroup"
+
+egroup :: LP ()
+egroup = () <$ char '}'
+ <|> () <$ controlSeq "egroup"
+ <|> () <$ controlSeq "endgroup"
+
grouped :: Monoid a => LP a -> LP a
-grouped parser = try $ char '{' *> (mconcat <$> manyTill parser (char '}'))
+grouped parser = try $ bgroup *> (mconcat <$> manyTill parser egroup)
braced :: LP String
-braced = char '{' *> (concat <$> manyTill
+braced = bgroup *> (concat <$> manyTill
( many1 (satisfy (\c -> c /= '\\' && c /= '}' && c /= '{'))
<|> try (string "\\}")
<|> try (string "\\{")
<|> try (string "\\\\")
<|> ((\x -> "{" ++ x ++ "}") <$> braced)
<|> count 1 anyChar
- ) (char '}'))
+ ) egroup)
bracketed :: Monoid a => LP a -> LP a
bracketed parser = try $ char '[' *> (mconcat <$> manyTill parser (char ']'))
@@ -181,7 +197,7 @@ inlines = mconcat <$> many (notFollowedBy (char '}') *> inline)
block :: LP Blocks
block = (mempty <$ comment)
- <|> (mempty <$ ((spaceChar <|> blankline) *> spaces))
+ <|> (mempty <$ ((spaceChar <|> newline) *> spaces))
<|> environment
<|> mempty <$ macro -- TODO improve macros, make them work everywhere
<|> blockCommand
@@ -251,6 +267,7 @@ blockCommands = M.fromList $
, ("end", mzero)
, ("item", skipopts *> loose_item)
, ("documentclass", skipopts *> braced *> preamble)
+ , ("centerline", (para . trimInlines) <$> (skipopts *> tok))
] ++ map ignoreBlocks
-- these commands will be ignored unless --parse-raw is specified,
-- in which case they will appear as raw latex blocks
@@ -281,7 +298,9 @@ authors :: LP ()
authors = try $ do
char '{'
let oneAuthor = mconcat <$>
- many1 (notFollowedBy' (controlSeq "and") >> inline)
+ many1 (notFollowedBy' (controlSeq "and") >>
+ (inline <|> mempty <$ blockCommand))
+ -- skip e.g. \vspace{10pt}
auths <- sepBy oneAuthor (controlSeq "and")
char '}'
updateState (\s -> s { stateAuthors = map (normalizeSpaces . toList) auths })
@@ -304,16 +323,19 @@ inlineCommand = try $ do
parseRaw <- stateParseRaw `fmap` getState
star <- option "" (string "*")
let name' = name ++ star
+ let rawargs = withRaw (skipopts *> option "" dimenarg
+ *> many braced) >>= applyMacros' . snd
+ let raw = if parseRaw
+ then (rawInline "latex" . (('\\':name') ++)) <$> rawargs
+ else mempty <$> rawargs
case M.lookup name' inlineCommands of
- Just p -> p
+ Just p -> p <|> raw
Nothing -> case M.lookup name inlineCommands of
- Just p -> p
- Nothing
- | parseRaw ->
- (rawInline "latex" . (('\\':name') ++)) <$>
- (withRaw (skipopts *> many braced)
- >>= applyMacros' . snd)
- | otherwise -> return mempty
+ Just p -> p <|> raw
+ Nothing -> raw
+
+unlessParseRaw :: LP ()
+unlessParseRaw = getState >>= guard . not . stateParseRaw
isBlockCommand :: String -> Bool
isBlockCommand s = maybe False (const True) $ M.lookup s blockCommands
@@ -333,8 +355,8 @@ inlineCommands = M.fromList $
, ("dots", lit "…")
, ("mdots", lit "…")
, ("sim", lit "~")
- , ("label", inBrackets <$> tok)
- , ("ref", inBrackets <$> tok)
+ , ("label", unlessParseRaw >> (inBrackets <$> tok))
+ , ("ref", unlessParseRaw >> (inBrackets <$> tok))
, ("(", mathInline $ manyTill anyChar (try $ string "\\)"))
, ("[", mathDisplay $ manyTill anyChar (try $ string "\\]"))
, ("ensuremath", mathInline $ braced)
@@ -358,8 +380,6 @@ inlineCommands = M.fromList $
, ("scshape", smallcaps <$> inlines)
, ("bfseries", strong <$> inlines)
, ("/", pure mempty) -- italic correction
- , ("cc", lit "ç")
- , ("cC", lit "Ç")
, ("aa", lit "å")
, ("AA", lit "Å")
, ("ss", lit "ß")
@@ -374,11 +394,12 @@ inlineCommands = M.fromList $
, ("copyright", lit "©")
, ("`", option (str "`") $ try $ tok >>= accent grave)
, ("'", option (str "'") $ try $ tok >>= accent acute)
- , ("^", option (str "^") $ try $ tok >>= accent hat)
- , ("~", option (str "~") $ try $ tok >>= accent circ)
+ , ("^", option (str "^") $ try $ tok >>= accent circ)
+ , ("~", option (str "~") $ try $ tok >>= accent tilde)
, ("\"", option (str "\"") $ try $ tok >>= accent umlaut)
, (".", option (str ".") $ try $ tok >>= accent dot)
, ("=", option (str "=") $ try $ tok >>= accent macron)
+ , ("c", option (str "c") $ try $ tok >>= accent cedilla)
, ("i", lit "i")
, ("\\", linebreak <$ (optional (bracketed inline) *> optional sp))
, (",", pure mempty)
@@ -502,33 +523,66 @@ acute 'E' = 'É'
acute 'I' = 'Í'
acute 'O' = 'Ó'
acute 'U' = 'Ú'
+acute 'Y' = 'Ý'
acute 'a' = 'á'
acute 'e' = 'é'
acute 'i' = 'í'
acute 'o' = 'ó'
acute 'u' = 'ú'
+acute 'y' = 'ý'
+acute 'C' = 'Ć'
+acute 'c' = 'ć'
+acute 'L' = 'Ĺ'
+acute 'l' = 'ĺ'
+acute 'N' = 'Ń'
+acute 'n' = 'ń'
+acute 'R' = 'Ŕ'
+acute 'r' = 'ŕ'
+acute 'S' = 'Ś'
+acute 's' = 'ś'
+acute 'Z' = 'Ź'
+acute 'z' = 'ź'
acute c = c
-hat :: Char -> Char
-hat 'A' = 'Â'
-hat 'E' = 'Ê'
-hat 'I' = 'Î'
-hat 'O' = 'Ô'
-hat 'U' = 'Û'
-hat 'a' = 'ã'
-hat 'e' = 'ê'
-hat 'i' = 'î'
-hat 'o' = 'ô'
-hat 'u' = 'û'
-hat c = c
-
circ :: Char -> Char
-circ 'A' = 'Ã'
-circ 'O' = 'Õ'
-circ 'o' = 'õ'
-circ 'N' = 'Ñ'
-circ 'n' = 'ñ'
-circ c = c
+circ 'A' = 'Â'
+circ 'E' = 'Ê'
+circ 'I' = 'Î'
+circ 'O' = 'Ô'
+circ 'U' = 'Û'
+circ 'a' = 'â'
+circ 'e' = 'ê'
+circ 'i' = 'î'
+circ 'o' = 'ô'
+circ 'u' = 'û'
+circ 'C' = 'Ĉ'
+circ 'c' = 'ĉ'
+circ 'G' = 'Ĝ'
+circ 'g' = 'ĝ'
+circ 'H' = 'Ĥ'
+circ 'h' = 'ĥ'
+circ 'J' = 'Ĵ'
+circ 'j' = 'ĵ'
+circ 'S' = 'Ŝ'
+circ 's' = 'ŝ'
+circ 'W' = 'Ŵ'
+circ 'w' = 'ŵ'
+circ 'Y' = 'Ŷ'
+circ 'y' = 'ŷ'
+circ c = c
+
+tilde :: Char -> Char
+tilde 'A' = 'Ã'
+tilde 'a' = 'ã'
+tilde 'O' = 'Õ'
+tilde 'o' = 'õ'
+tilde 'I' = 'Ĩ'
+tilde 'i' = 'ĩ'
+tilde 'U' = 'Ũ'
+tilde 'u' = 'ũ'
+tilde 'N' = 'Ñ'
+tilde 'n' = 'ñ'
+tilde c = c
umlaut :: Char -> Char
umlaut 'A' = 'Ä'
@@ -568,6 +622,13 @@ macron 'o' = 'ō'
macron 'u' = 'ū'
macron c = c
+cedilla :: Char -> Char
+cedilla 'c' = 'ç'
+cedilla 'C' = 'Ç'
+cedilla 's' = 'ş'
+cedilla 'S' = 'Ş'
+cedilla c = c
+
tok :: LP Inlines
tok = try $ grouped inline <|> inlineCommand <|> str <$> (count 1 $ inlineChar)
@@ -646,15 +707,15 @@ verbatimEnv = do
controlSeq "begin"
name <- braced
guard $ name == "verbatim" || name == "Verbatim" ||
- name == "lstlisting"
+ name == "lstlisting" || name == "minted"
verbEnv name
rest <- getInput
return (r,rest)
-rawLaTeXBlock :: GenParser Char ParserState String
+rawLaTeXBlock :: Parser [Char] ParserState String
rawLaTeXBlock = snd <$> withRaw (environment <|> blockCommand)
-rawLaTeXInline :: GenParser Char ParserState Inline
+rawLaTeXInline :: Parser [Char] ParserState Inline
rawLaTeXInline = do
(res, raw) <- withRaw inlineCommand
if res == mempty
@@ -678,7 +739,9 @@ environments = M.fromList
verbEnv "code"))
, ("verbatim", codeBlock <$> (verbEnv "verbatim"))
, ("Verbatim", codeBlock <$> (verbEnv "Verbatim"))
- , ("lstlisting", codeBlock <$> (verbEnv "listlisting"))
+ , ("lstlisting", codeBlock <$> (verbEnv "lstlisting"))
+ , ("minted", liftA2 (\l c -> codeBlockWith ("",[l],[]) c)
+ (grouped (many1 $ satisfy (/= '}'))) (verbEnv "minted"))
, ("displaymath", mathEnv Nothing "displaymath")
, ("equation", mathEnv Nothing "equation")
, ("equation*", mathEnv Nothing "equation*")
@@ -878,9 +941,9 @@ parseAligns :: LP [Alignment]
parseAligns = try $ do
char '{'
optional $ char '|'
- let cAlign = char 'c' >> return AlignCenter
- let lAlign = char 'l' >> return AlignLeft
- let rAlign = char 'r' >> return AlignRight
+ let cAlign = AlignCenter <$ char 'c'
+ let lAlign = AlignLeft <$ char 'l'
+ let rAlign = AlignRight <$ char 'r'
let alignChar = optional sp *> (cAlign <|> lAlign <|> rAlign)
aligns' <- sepEndBy alignChar (optional $ char '|')
spaces
@@ -891,16 +954,20 @@ parseAligns = try $ do
hline :: LP ()
hline = () <$ (try $ spaces >> controlSeq "hline")
+lbreak :: LP ()
+lbreak = () <$ (try $ spaces *> controlSeq "\\")
+
+amp :: LP ()
+amp = () <$ (try $ spaces *> char '&')
+
parseTableRow :: Int -- ^ number of columns
-> LP [Blocks]
parseTableRow cols = try $ do
- let amp = try $ spaces *> string "&"
- let tableCellInline = notFollowedBy (amp <|> controlSeq "\\") >> inline
- cells' <- sepBy ((plain . trimInlines . mconcat) <$> many tableCellInline) amp
+ let tableCellInline = notFollowedBy (amp <|> lbreak) >> inline
+ let tableCell = (plain . trimInlines . mconcat) <$> many tableCellInline
+ cells' <- sepBy tableCell amp
guard $ length cells' == cols
spaces
- optional $ controlSeq "\\"
- spaces
return cells'
simpTable :: LP Blocks
@@ -909,8 +976,8 @@ simpTable = try $ do
aligns <- parseAligns
let cols = length aligns
optional hline
- header' <- option [] $ try (parseTableRow cols <* hline)
- rows <- many (parseTableRow cols <* optional hline)
+ header' <- option [] $ try (parseTableRow cols <* lbreak <* hline)
+ rows <- sepEndBy (parseTableRow cols) (lbreak <* optional hline)
spaces
let header'' = if null header'
then replicate cols mempty
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 65c80956a..34a6cf7ce 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -43,7 +43,6 @@ import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock )
import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, isInlineTag, isBlockTag,
isTextTag, isCommentTag )
import Text.Pandoc.XML ( fromEntities )
-import Text.ParserCombinators.Parsec
import Control.Monad (when, liftM, guard, mzero)
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match (tagOpen)
@@ -83,14 +82,14 @@ isBlank _ = False
-- auxiliary functions
--
-indentSpaces :: GenParser Char ParserState [Char]
+indentSpaces :: Parser [Char] ParserState [Char]
indentSpaces = try $ do
state <- getState
let tabStop = stateTabStop state
count tabStop (char ' ') <|>
string "\t" <?> "indentation"
-nonindentSpaces :: GenParser Char ParserState [Char]
+nonindentSpaces :: Parser [Char] ParserState [Char]
nonindentSpaces = do
state <- getState
let tabStop = stateTabStop state
@@ -99,30 +98,30 @@ nonindentSpaces = do
then return sps
else unexpected "indented line"
-skipNonindentSpaces :: GenParser Char ParserState ()
+skipNonindentSpaces :: Parser [Char] ParserState ()
skipNonindentSpaces = do
state <- getState
atMostSpaces (stateTabStop state - 1)
-atMostSpaces :: Int -> GenParser Char ParserState ()
+atMostSpaces :: Int -> Parser [Char] ParserState ()
atMostSpaces 0 = notFollowedBy (char ' ')
atMostSpaces n = (char ' ' >> atMostSpaces (n-1)) <|> return ()
-litChar :: GenParser Char ParserState Char
+litChar :: Parser [Char] ParserState Char
litChar = escapedChar'
<|> noneOf "\n"
<|> (newline >> notFollowedBy blankline >> return ' ')
-- | Fail unless we're at beginning of a line.
-failUnlessBeginningOfLine :: GenParser tok st ()
+failUnlessBeginningOfLine :: Parser [tok] st ()
failUnlessBeginningOfLine = do
pos <- getPosition
if sourceColumn pos == 1 then return () else fail "not beginning of line"
-- | Parse a sequence of inline elements between square brackets,
-- including inlines between balanced pairs of square brackets.
-inlinesInBalancedBrackets :: GenParser Char ParserState Inline
- -> GenParser Char ParserState [Inline]
+inlinesInBalancedBrackets :: Parser [Char] ParserState Inline
+ -> Parser [Char] ParserState [Inline]
inlinesInBalancedBrackets parser = try $ do
char '['
result <- manyTill ( (do lookAhead $ try $ do (Str res) <- parser
@@ -137,7 +136,7 @@ inlinesInBalancedBrackets parser = try $ do
-- document structure
--
-titleLine :: GenParser Char ParserState [Inline]
+titleLine :: Parser [Char] ParserState [Inline]
titleLine = try $ do
char '%'
skipSpaces
@@ -146,7 +145,7 @@ titleLine = try $ do
newline
return $ normalizeSpaces res
-authorsLine :: GenParser Char ParserState [[Inline]]
+authorsLine :: Parser [Char] ParserState [[Inline]]
authorsLine = try $ do
char '%'
skipSpaces
@@ -157,14 +156,14 @@ authorsLine = try $ do
newline
return $ filter (not . null) $ map normalizeSpaces authors
-dateLine :: GenParser Char ParserState [Inline]
+dateLine :: Parser [Char] ParserState [Inline]
dateLine = try $ do
char '%'
skipSpaces
date <- manyTill inline newline
return $ normalizeSpaces date
-titleBlock :: GenParser Char ParserState ([Inline], [[Inline]], [Inline])
+titleBlock :: Parser [Char] ParserState ([Inline], [[Inline]], [Inline])
titleBlock = try $ do
failIfStrict
title <- option [] titleLine
@@ -173,7 +172,7 @@ titleBlock = try $ do
optional blanklines
return (title, author, date)
-parseMarkdown :: GenParser Char ParserState Pandoc
+parseMarkdown :: Parser [Char] ParserState Pandoc
parseMarkdown = do
-- markdown allows raw HTML
updateState (\state -> state { stateParseRaw = True })
@@ -182,7 +181,8 @@ parseMarkdown = do
-- docMinusKeys is the raw document with blanks where the keys/notes were...
st <- getState
let firstPassParser = referenceKey
- <|> (if stateStrict st then pzero else noteBlock)
+ <|> (if stateStrict st then mzero else noteBlock)
+ <|> liftM snd (withRaw codeBlockDelimited)
<|> lineClump
docMinusKeys <- liftM concat $ manyTill firstPassParser eof
setInput docMinusKeys
@@ -210,7 +210,7 @@ parseMarkdown = do
-- initial pass for references and notes
--
-referenceKey :: GenParser Char ParserState [Char]
+referenceKey :: Parser [Char] ParserState [Char]
referenceKey = try $ do
startPos <- getPosition
skipNonindentSpaces
@@ -237,7 +237,7 @@ referenceKey = try $ do
-- return blanks so line count isn't affected
return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
-referenceTitle :: GenParser Char ParserState String
+referenceTitle :: Parser [Char] ParserState String
referenceTitle = try $ do
skipSpaces >> optional newline >> skipSpaces
tit <- (charsInBalanced '(' ')' litChar >>= return . unwords . words)
@@ -246,23 +246,23 @@ referenceTitle = try $ do
notFollowedBy (noneOf ")\n")))
return $ fromEntities tit
-noteMarker :: GenParser Char ParserState [Char]
+noteMarker :: Parser [Char] ParserState [Char]
noteMarker = string "[^" >> many1Till (satisfy $ not . isBlank) (char ']')
-rawLine :: GenParser Char ParserState [Char]
+rawLine :: Parser [Char] ParserState [Char]
rawLine = try $ do
notFollowedBy blankline
notFollowedBy' $ try $ skipNonindentSpaces >> noteMarker
optional indentSpaces
anyLine
-rawLines :: GenParser Char ParserState [Char]
+rawLines :: Parser [Char] ParserState [Char]
rawLines = do
first <- anyLine
rest <- many rawLine
return $ unlines (first:rest)
-noteBlock :: GenParser Char ParserState [Char]
+noteBlock :: Parser [Char] ParserState [Char]
noteBlock = try $ do
startPos <- getPosition
skipNonindentSpaces
@@ -286,10 +286,10 @@ noteBlock = try $ do
-- parsing blocks
--
-parseBlocks :: GenParser Char ParserState [Block]
+parseBlocks :: Parser [Char] ParserState [Block]
parseBlocks = manyTill block eof
-block :: GenParser Char ParserState Block
+block :: Parser [Char] ParserState Block
block = do
st <- getState
choice (if stateStrict st
@@ -324,10 +324,10 @@ block = do
-- header blocks
--
-header :: GenParser Char ParserState Block
+header :: Parser [Char] ParserState Block
header = setextHeader <|> atxHeader <?> "header"
-atxHeader :: GenParser Char ParserState Block
+atxHeader :: Parser [Char] ParserState Block
atxHeader = try $ do
level <- many1 (char '#') >>= return . length
notFollowedBy (char '.' <|> char ')') -- this would be a list
@@ -335,10 +335,10 @@ atxHeader = try $ do
text <- manyTill inline atxClosing >>= return . normalizeSpaces
return $ Header level text
-atxClosing :: GenParser Char st [Char]
+atxClosing :: Parser [Char] st [Char]
atxClosing = try $ skipMany (char '#') >> blanklines
-setextHeader :: GenParser Char ParserState Block
+setextHeader :: Parser [Char] ParserState Block
setextHeader = try $ do
-- This lookahead prevents us from wasting time parsing Inlines
-- unless necessary -- it gives a significant performance boost.
@@ -354,7 +354,7 @@ setextHeader = try $ do
-- hrule block
--
-hrule :: GenParser Char st Block
+hrule :: Parser [Char] st Block
hrule = try $ do
skipSpaces
start <- satisfy isHruleChar
@@ -368,12 +368,12 @@ hrule = try $ do
-- code blocks
--
-indentedLine :: GenParser Char ParserState [Char]
+indentedLine :: Parser [Char] ParserState [Char]
indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n")
blockDelimiter :: (Char -> Bool)
-> Maybe Int
- -> GenParser Char st (Int, (String, [String], [(String, String)]), Char)
+ -> Parser [Char] st (Int, (String, [String], [(String, String)]), Char)
blockDelimiter f len = try $ do
c <- lookAhead (satisfy f)
size <- case len of
@@ -387,7 +387,7 @@ blockDelimiter f len = try $ do
blankline
return (size, attr, c)
-attributes :: GenParser Char st ([Char], [[Char]], [([Char], [Char])])
+attributes :: Parser [Char] st ([Char], [[Char]], [([Char], [Char])])
attributes = try $ do
char '{'
spnl
@@ -399,28 +399,28 @@ attributes = try $ do
| otherwise = firstNonNull xs
return (firstNonNull $ reverse ids, concat classes, concat keyvals)
-attribute :: GenParser Char st ([Char], [[Char]], [([Char], [Char])])
+attribute :: Parser [Char] st ([Char], [[Char]], [([Char], [Char])])
attribute = identifierAttr <|> classAttr <|> keyValAttr
-identifier :: GenParser Char st [Char]
+identifier :: Parser [Char] st [Char]
identifier = do
first <- letter
rest <- many $ alphaNum <|> oneOf "-_:."
return (first:rest)
-identifierAttr :: GenParser Char st ([Char], [a], [a1])
+identifierAttr :: Parser [Char] st ([Char], [a], [a1])
identifierAttr = try $ do
char '#'
result <- identifier
return (result,[],[])
-classAttr :: GenParser Char st ([Char], [[Char]], [a])
+classAttr :: Parser [Char] st ([Char], [[Char]], [a])
classAttr = try $ do
char '.'
result <- identifier
return ("",[result],[])
-keyValAttr :: GenParser Char st ([Char], [a], [([Char], [Char])])
+keyValAttr :: Parser [Char] st ([Char], [a], [([Char], [Char])])
keyValAttr = try $ do
key <- identifier
char '='
@@ -429,14 +429,14 @@ keyValAttr = try $ do
<|> many nonspaceChar
return ("",[],[(key,val)])
-codeBlockDelimited :: GenParser Char st Block
+codeBlockDelimited :: Parser [Char] st Block
codeBlockDelimited = try $ do
(size, attr, c) <- blockDelimiter (\c -> c == '~' || c == '`') Nothing
contents <- manyTill anyLine (blockDelimiter (== c) (Just size))
blanklines
return $ CodeBlock attr $ intercalate "\n" contents
-codeBlockIndented :: GenParser Char ParserState Block
+codeBlockIndented :: Parser [Char] ParserState Block
codeBlockIndented = do
contents <- many1 (indentedLine <|>
try (do b <- blanklines
@@ -447,7 +447,7 @@ codeBlockIndented = do
return $ CodeBlock ("", stateIndentedCodeClasses st, []) $
stripTrailingNewlines $ concat contents
-lhsCodeBlock :: GenParser Char ParserState Block
+lhsCodeBlock :: Parser [Char] ParserState Block
lhsCodeBlock = do
failUnlessLHS
liftM (CodeBlock ("",["sourceCode","literate","haskell"],[]))
@@ -455,7 +455,7 @@ lhsCodeBlock = do
<|> liftM (CodeBlock ("",["sourceCode","haskell"],[]))
lhsCodeBlockInverseBird
-lhsCodeBlockLaTeX :: GenParser Char ParserState String
+lhsCodeBlockLaTeX :: Parser [Char] ParserState String
lhsCodeBlockLaTeX = try $ do
string "\\begin{code}"
manyTill spaceChar newline
@@ -463,13 +463,13 @@ lhsCodeBlockLaTeX = try $ do
blanklines
return $ stripTrailingNewlines contents
-lhsCodeBlockBird :: GenParser Char ParserState String
+lhsCodeBlockBird :: Parser [Char] ParserState String
lhsCodeBlockBird = lhsCodeBlockBirdWith '>'
-lhsCodeBlockInverseBird :: GenParser Char ParserState String
+lhsCodeBlockInverseBird :: Parser [Char] ParserState String
lhsCodeBlockInverseBird = lhsCodeBlockBirdWith '<'
-lhsCodeBlockBirdWith :: Char -> GenParser Char ParserState String
+lhsCodeBlockBirdWith :: Char -> Parser [Char] ParserState String
lhsCodeBlockBirdWith c = try $ do
pos <- getPosition
when (sourceColumn pos /= 1) $ fail "Not in first column"
@@ -481,7 +481,7 @@ lhsCodeBlockBirdWith c = try $ do
blanklines
return $ intercalate "\n" lns'
-birdTrackLine :: Char -> GenParser Char st [Char]
+birdTrackLine :: Char -> Parser [Char] st [Char]
birdTrackLine c = try $ do
char c
-- allow html tags on left margin:
@@ -493,10 +493,10 @@ birdTrackLine c = try $ do
-- block quotes
--
-emailBlockQuoteStart :: GenParser Char ParserState Char
+emailBlockQuoteStart :: Parser [Char] ParserState Char
emailBlockQuoteStart = try $ skipNonindentSpaces >> char '>' >>~ optional (char ' ')
-emailBlockQuote :: GenParser Char ParserState [[Char]]
+emailBlockQuote :: Parser [Char] ParserState [[Char]]
emailBlockQuote = try $ do
emailBlockQuoteStart
raw <- sepBy (many (nonEndline <|>
@@ -507,7 +507,7 @@ emailBlockQuote = try $ do
optional blanklines
return raw
-blockQuote :: GenParser Char ParserState Block
+blockQuote :: Parser [Char] ParserState Block
blockQuote = do
raw <- emailBlockQuote
-- parse the extracted block, which may contain various block elements:
@@ -518,7 +518,7 @@ blockQuote = do
-- list blocks
--
-bulletListStart :: GenParser Char ParserState ()
+bulletListStart :: Parser [Char] ParserState ()
bulletListStart = try $ do
optional newline -- if preceded by a Plain block in a list context
skipNonindentSpaces
@@ -527,7 +527,7 @@ bulletListStart = try $ do
spaceChar
skipSpaces
-anyOrderedListStart :: GenParser Char ParserState (Int, ListNumberStyle, ListNumberDelim)
+anyOrderedListStart :: Parser [Char] ParserState (Int, ListNumberStyle, ListNumberDelim)
anyOrderedListStart = try $ do
optional newline -- if preceded by a Plain block in a list context
skipNonindentSpaces
@@ -547,13 +547,12 @@ anyOrderedListStart = try $ do
skipSpaces
return (num, style, delim)
-listStart :: GenParser Char ParserState ()
+listStart :: Parser [Char] ParserState ()
listStart = bulletListStart <|> (anyOrderedListStart >> return ())
-- parse a line of a list item (start = parser for beginning of list item)
-listLine :: GenParser Char ParserState [Char]
+listLine :: Parser [Char] ParserState [Char]
listLine = try $ do
- notFollowedBy' listStart
notFollowedBy blankline
notFollowedBy' (do indentSpaces
many (spaceChar)
@@ -562,24 +561,26 @@ listLine = try $ do
return $ concat chunks ++ "\n"
-- parse raw text for one list item, excluding start marker and continuations
-rawListItem :: GenParser Char ParserState a -> GenParser Char ParserState [Char]
+rawListItem :: Parser [Char] ParserState a
+ -> Parser [Char] ParserState [Char]
rawListItem start = try $ do
start
- result <- many1 listLine
+ first <- listLine
+ rest <- many (notFollowedBy listStart >> listLine)
blanks <- many blankline
- return $ concat result ++ blanks
+ return $ concat (first:rest) ++ blanks
-- continuation of a list item - indented and separated by blankline
-- or (in compact lists) endline.
-- note: nested lists are parsed as continuations
-listContinuation :: GenParser Char ParserState [Char]
+listContinuation :: Parser [Char] ParserState [Char]
listContinuation = try $ do
lookAhead indentSpaces
result <- many1 listContinuationLine
blanks <- many blankline
return $ concat result ++ blanks
-listContinuationLine :: GenParser Char ParserState [Char]
+listContinuationLine :: Parser [Char] ParserState [Char]
listContinuationLine = try $ do
notFollowedBy blankline
notFollowedBy' listStart
@@ -587,8 +588,9 @@ listContinuationLine = try $ do
result <- manyTill anyChar newline
return $ result ++ "\n"
-listItem :: GenParser Char ParserState a -> GenParser Char ParserState [Block]
-listItem start = try $ do
+listItem :: Parser [Char] ParserState a
+ -> Parser [Char] ParserState [Block]
+listItem start = try $ do
first <- rawListItem start
continuations <- many listContinuation
-- parsing with ListItemState forces markers at beginning of lines to
@@ -603,7 +605,7 @@ listItem start = try $ do
updateState (\st -> st {stateParserContext = oldContext})
return contents
-orderedList :: GenParser Char ParserState Block
+orderedList :: Parser [Char] ParserState Block
orderedList = try $ do
(start, style, delim) <- lookAhead anyOrderedListStart
items <- many1 $ listItem $ try $
@@ -612,13 +614,13 @@ orderedList = try $ do
orderedListMarker style delim
return $ OrderedList (start, style, delim) $ compactify items
-bulletList :: GenParser Char ParserState Block
+bulletList :: Parser [Char] ParserState Block
bulletList =
many1 (listItem bulletListStart) >>= return . BulletList . compactify
-- definition lists
-defListMarker :: GenParser Char ParserState ()
+defListMarker :: Parser [Char] ParserState ()
defListMarker = do
sps <- nonindentSpaces
char ':' <|> char '~'
@@ -627,10 +629,10 @@ defListMarker = do
let remaining = tabStop - (length sps + 1)
if remaining > 0
then count remaining (char ' ') <|> string "\t"
- else pzero
+ else mzero
return ()
-definitionListItem :: GenParser Char ParserState ([Inline], [[Block]])
+definitionListItem :: Parser [Char] ParserState ([Inline], [[Block]])
definitionListItem = try $ do
-- first, see if this has any chance of being a definition list:
lookAhead (anyLine >> optional blankline >> defListMarker)
@@ -644,7 +646,7 @@ definitionListItem = try $ do
updateState (\st -> st {stateParserContext = oldContext})
return ((normalizeSpaces term), contents)
-defRawBlock :: GenParser Char ParserState [Char]
+defRawBlock :: Parser [Char] ParserState [Char]
defRawBlock = try $ do
defListMarker
firstline <- anyLine
@@ -656,7 +658,7 @@ defRawBlock = try $ do
return $ unlines lns ++ trl
return $ firstline ++ "\n" ++ unlines rawlines ++ trailing ++ cont
-definitionList :: GenParser Char ParserState Block
+definitionList :: Parser [Char] ParserState Block
definitionList = do
items <- many1 definitionListItem
-- "compactify" the definition list:
@@ -685,7 +687,7 @@ isHtmlOrBlank (Space) = True
isHtmlOrBlank (LineBreak) = True
isHtmlOrBlank _ = False
-para :: GenParser Char ParserState Block
+para :: Parser [Char] ParserState Block
para = try $ do
result <- liftM normalizeSpaces $ many1 inline
guard $ not . all isHtmlOrBlank $ result
@@ -696,17 +698,17 @@ para = try $ do
lookAhead (blockQuote <|> header) >> return "")
return $ Para result
-plain :: GenParser Char ParserState Block
+plain :: Parser [Char] ParserState Block
plain = many1 inline >>~ spaces >>= return . Plain . normalizeSpaces
--
-- raw html
--
-htmlElement :: GenParser Char ParserState [Char]
+htmlElement :: Parser [Char] ParserState [Char]
htmlElement = strictHtmlBlock <|> liftM snd (htmlTag isBlockTag)
-htmlBlock :: GenParser Char ParserState Block
+htmlBlock :: Parser [Char] ParserState Block
htmlBlock = try $ do
failUnlessBeginningOfLine
first <- htmlElement
@@ -714,12 +716,12 @@ htmlBlock = try $ do
finalNewlines <- many newline
return $ RawBlock "html" $ first ++ finalSpace ++ finalNewlines
-strictHtmlBlock :: GenParser Char ParserState [Char]
+strictHtmlBlock :: Parser [Char] ParserState [Char]
strictHtmlBlock = do
failUnlessBeginningOfLine
htmlInBalanced (not . isInlineTag)
-rawVerbatimBlock :: GenParser Char ParserState String
+rawVerbatimBlock :: Parser [Char] ParserState String
rawVerbatimBlock = try $ do
(TagOpen tag _, open) <- htmlTag (tagOpen (\t ->
t == "pre" || t == "style" || t == "script")
@@ -727,7 +729,7 @@ rawVerbatimBlock = try $ do
contents <- manyTill anyChar (htmlTag (~== TagClose tag))
return $ open ++ contents ++ renderTags [TagClose tag]
-rawTeXBlock :: GenParser Char ParserState Block
+rawTeXBlock :: Parser [Char] ParserState Block
rawTeXBlock = do
failIfStrict
result <- liftM (RawBlock "latex") rawLaTeXBlock
@@ -735,7 +737,7 @@ rawTeXBlock = do
spaces
return result
-rawHtmlBlocks :: GenParser Char ParserState Block
+rawHtmlBlocks :: Parser [Char] ParserState Block
rawHtmlBlocks = do
htmlBlocks <- many1 $ do blk <- rawVerbatimBlock <|>
liftM snd (htmlTag isBlockTag)
@@ -759,7 +761,7 @@ rawHtmlBlocks = do
-- Parse a dashed line with optional trailing spaces; return its length
-- and the length including trailing space.
dashedLine :: Char
- -> GenParser Char st (Int, Int)
+ -> Parser [Char] st (Int, Int)
dashedLine ch = do
dashes <- many1 (char ch)
sp <- many spaceChar
@@ -768,7 +770,7 @@ dashedLine ch = do
-- Parse a table header with dashed lines of '-' preceded by
-- one (or zero) line of text.
simpleTableHeader :: Bool -- ^ Headerless table
- -> GenParser Char ParserState ([[Block]], [Alignment], [Int])
+ -> Parser [Char] ParserState ([[Block]], [Alignment], [Int])
simpleTableHeader headless = try $ do
rawContent <- if headless
then return ""
@@ -792,16 +794,16 @@ simpleTableHeader headless = try $ do
return (heads, aligns, indices)
-- Parse a table footer - dashed lines followed by blank line.
-tableFooter :: GenParser Char ParserState [Char]
+tableFooter :: Parser [Char] ParserState [Char]
tableFooter = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> blanklines
-- Parse a table separator - dashed line.
-tableSep :: GenParser Char ParserState Char
+tableSep :: Parser [Char] ParserState Char
tableSep = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> char '\n'
-- Parse a raw line and split it into chunks by indices.
rawTableLine :: [Int]
- -> GenParser Char ParserState [String]
+ -> Parser [Char] ParserState [String]
rawTableLine indices = do
notFollowedBy' (blanklines <|> tableFooter)
line <- many1Till anyChar newline
@@ -810,12 +812,12 @@ rawTableLine indices = do
-- Parse a table line and return a list of lists of blocks (columns).
tableLine :: [Int]
- -> GenParser Char ParserState [[Block]]
+ -> Parser [Char] ParserState [[Block]]
tableLine indices = rawTableLine indices >>= mapM (parseFromString (many plain))
-- Parse a multiline table row and return a list of blocks (columns).
multilineRow :: [Int]
- -> GenParser Char ParserState [[Block]]
+ -> Parser [Char] ParserState [[Block]]
multilineRow indices = do
colLines <- many1 (rawTableLine indices)
let cols = map unlines $ transpose colLines
@@ -823,7 +825,7 @@ multilineRow indices = do
-- Parses a table caption: inlines beginning with 'Table:'
-- and followed by blank lines.
-tableCaption :: GenParser Char ParserState [Inline]
+tableCaption :: Parser [Char] ParserState [Inline]
tableCaption = try $ do
skipNonindentSpaces
string ":" <|> string "Table:"
@@ -833,7 +835,7 @@ tableCaption = try $ do
-- Parse a simple table with '---' header and one line per row.
simpleTable :: Bool -- ^ Headerless table
- -> GenParser Char ParserState Block
+ -> Parser [Char] ParserState Block
simpleTable headless = do
Table c a _w h l <- tableWith (simpleTableHeader headless) tableLine
(return ())
@@ -847,12 +849,12 @@ simpleTable headless = do
-- which may be multiline, separated by blank lines, and
-- ending with a footer (dashed line followed by blank line).
multilineTable :: Bool -- ^ Headerless table
- -> GenParser Char ParserState Block
+ -> Parser [Char] ParserState Block
multilineTable headless =
tableWith (multilineTableHeader headless) multilineRow blanklines tableFooter tableCaption
multilineTableHeader :: Bool -- ^ Headerless table
- -> GenParser Char ParserState ([[Block]], [Alignment], [Int])
+ -> Parser [Char] ParserState ([[Block]], [Alignment], [Int])
multilineTableHeader headless = try $ do
if headless
then return '\n'
@@ -904,10 +906,10 @@ extraTable :: Bool -- ^ Headerless table
extraTable = extraTableWith block tableCaption
gridTable :: Bool -- ^ Headerless table
- -> GenParser Char ParserState Block
+ -> Parser [Char] ParserState Block
gridTable = gridTableWith block tableCaption
-table :: GenParser Char ParserState Block
+table :: Parser [Char] ParserState Block
table = multilineTable False <|> simpleTable True <|>
simpleTable False <|> multilineTable True <|>
extraTable False <|> extraTable True <|>
@@ -917,10 +919,10 @@ table = multilineTable False <|> simpleTable True <|>
-- inline
--
-inline :: GenParser Char ParserState Inline
+inline :: Parser [Char] ParserState Inline
inline = choice inlineParsers <?> "inline"
-inlineParsers :: [GenParser Char ParserState Inline]
+inlineParsers :: [Parser [Char] ParserState Inline]
inlineParsers = [ whitespace
, str
, endline
@@ -947,7 +949,7 @@ inlineParsers = [ whitespace
, symbol
, ltSign ]
-escapedChar' :: GenParser Char ParserState Char
+escapedChar' :: Parser [Char] ParserState Char
escapedChar' = try $ do
char '\\'
state <- getState
@@ -955,7 +957,7 @@ escapedChar' = try $ do
then oneOf "\\`*_{}[]()>#+-.!~"
else satisfy (not . isAlphaNum)
-escapedChar :: GenParser Char ParserState Inline
+escapedChar :: Parser [Char] ParserState Inline
escapedChar = do
result <- escapedChar'
return $ case result of
@@ -963,7 +965,7 @@ escapedChar = do
'\n' -> LineBreak -- "\[newline]" is a linebreak
_ -> Str [result]
-ltSign :: GenParser Char ParserState Inline
+ltSign :: Parser [Char] ParserState Inline
ltSign = do
st <- getState
if stateStrict st
@@ -971,7 +973,7 @@ ltSign = do
else notFollowedBy' rawHtmlBlocks >> char '<' -- unless it starts html
return $ Str ['<']
-exampleRef :: GenParser Char ParserState Inline
+exampleRef :: Parser [Char] ParserState Inline
exampleRef = try $ do
char '@'
lab <- many1 (alphaNum <|> oneOf "-_")
@@ -979,7 +981,7 @@ exampleRef = try $ do
-- later. See the end of parseMarkdown.
return $ Str $ '@' : lab
-symbol :: GenParser Char ParserState Inline
+symbol :: Parser [Char] ParserState Inline
symbol = do
result <- noneOf "<\\\n\t "
<|> try (do lookAhead $ char '\\'
@@ -988,7 +990,7 @@ symbol = do
return $ Str [result]
-- parses inline code, between n `s and n `s
-code :: GenParser Char ParserState Inline
+code :: Parser [Char] ParserState Inline
code = try $ do
starts <- many1 (char '`')
skipSpaces
@@ -999,26 +1001,26 @@ code = try $ do
attr <- option ([],[],[]) (try $ optional whitespace >> attributes)
return $ Code attr $ removeLeadingTrailingSpace $ concat result
-mathWord :: GenParser Char st [Char]
+mathWord :: Parser [Char] st [Char]
mathWord = liftM concat $ many1 mathChunk
-mathChunk :: GenParser Char st [Char]
+mathChunk :: Parser [Char] st [Char]
mathChunk = do char '\\'
c <- anyChar
return ['\\',c]
<|> many1 (satisfy $ \c -> not (isBlank c || c == '\\' || c == '$'))
-math :: GenParser Char ParserState Inline
+math :: Parser [Char] ParserState Inline
math = (mathDisplay >>= applyMacros' >>= return . Math DisplayMath)
<|> (mathInline >>= applyMacros' >>= return . Math InlineMath)
-mathDisplay :: GenParser Char ParserState String
+mathDisplay :: Parser [Char] ParserState String
mathDisplay = try $ do
failIfStrict
string "$$"
many1Till (noneOf "\n" <|> (newline >>~ notFollowedBy' blankline)) (try $ string "$$")
-mathInline :: GenParser Char ParserState String
+mathInline :: Parser [Char] ParserState String
mathInline = try $ do
failIfStrict
char '$'
@@ -1028,20 +1030,20 @@ mathInline = try $ do
notFollowedBy digit
return $ intercalate " " words'
--- to avoid performance problems, treat 4 or more _ or * in a row as a literal
--- rather than attempting to parse for emph/strong
-fours :: GenParser Char st Inline
+-- to avoid performance problems, treat 4 or more _ or * or ~ or ^ in a row
+-- as a literal rather than attempting to parse for emph/strong/strikeout/super/sub
+fours :: Parser [Char] st Inline
fours = try $ do
- x <- char '*' <|> char '_'
+ x <- char '*' <|> char '_' <|> char '~' <|> char '^'
count 2 $ satisfy (==x)
rest <- many1 (satisfy (==x))
return $ Str (x:x:x:rest)
-- | Parses a list of inlines between start and end delimiters.
inlinesBetween :: (Show b)
- => GenParser Char ParserState a
- -> GenParser Char ParserState b
- -> GenParser Char ParserState [Inline]
+ => Parser [Char] ParserState a
+ -> Parser [Char] ParserState b
+ -> Parser [Char] ParserState [Inline]
inlinesBetween start end =
normalizeSpaces `liftM` try (start >> many1Till inner end)
where inner = innerSpace <|> (notFollowedBy' whitespace >> inline)
@@ -1049,8 +1051,8 @@ inlinesBetween start end =
-- This is used to prevent exponential blowups for things like:
-- a**a*a**a*a**a*a**a*a**a*a**a*a**
-nested :: GenParser Char ParserState a
- -> GenParser Char ParserState a
+nested :: Parser [Char] ParserState a
+ -> Parser [Char] ParserState a
nested p = do
nestlevel <- stateMaxNestingLevel `fmap` getState
guard $ nestlevel > 0
@@ -1059,7 +1061,7 @@ nested p = do
updateState $ \st -> st{ stateMaxNestingLevel = nestlevel }
return res
-emph :: GenParser Char ParserState Inline
+emph :: Parser [Char] ParserState Inline
emph = Emph `fmap` nested
(inlinesBetween starStart starEnd <|> inlinesBetween ulStart ulEnd)
where starStart = char '*' >> lookAhead nonspaceChar
@@ -1067,7 +1069,7 @@ emph = Emph `fmap` nested
ulStart = char '_' >> lookAhead nonspaceChar
ulEnd = notFollowedBy' strong >> char '_'
-strong :: GenParser Char ParserState Inline
+strong :: Parser [Char] ParserState Inline
strong = Strong `liftM` nested
(inlinesBetween starStart starEnd <|> inlinesBetween ulStart ulEnd)
where starStart = string "**" >> lookAhead nonspaceChar
@@ -1075,32 +1077,32 @@ strong = Strong `liftM` nested
ulStart = string "__" >> lookAhead nonspaceChar
ulEnd = try $ string "__"
-strikeout :: GenParser Char ParserState Inline
+strikeout :: Parser [Char] ParserState Inline
strikeout = Strikeout `liftM`
(failIfStrict >> inlinesBetween strikeStart strikeEnd)
where strikeStart = string "~~" >> lookAhead nonspaceChar
>> notFollowedBy (char '~')
strikeEnd = try $ string "~~"
-superscript :: GenParser Char ParserState Inline
+superscript :: Parser [Char] ParserState Inline
superscript = failIfStrict >> enclosed (char '^') (char '^')
(notFollowedBy spaceChar >> inline) >>= -- may not contain Space
return . Superscript
-subscript :: GenParser Char ParserState Inline
+subscript :: Parser [Char] ParserState Inline
subscript = failIfStrict >> enclosed (char '~') (char '~')
(notFollowedBy spaceChar >> inline) >>= -- may not contain Space
return . Subscript
-whitespace :: GenParser Char ParserState Inline
+whitespace :: Parser [Char] ParserState Inline
whitespace = spaceChar >>
( (spaceChar >> skipMany spaceChar >> option Space (endline >> return LineBreak))
<|> (skipMany spaceChar >> return Space) ) <?> "whitespace"
-nonEndline :: GenParser Char st Char
+nonEndline :: Parser [Char] st Char
nonEndline = satisfy (/='\n')
-str :: GenParser Char ParserState Inline
+str :: Parser [Char] ParserState Inline
str = do
smart <- stateSmart `fmap` getState
a <- alphaNum
@@ -1133,12 +1135,12 @@ likelyAbbrev x =
"Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.",
"vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.",
"Rev.", "Ph.D.", "M.D.", "M.A.", "p.", "pp.",
- "ch.", "sec." ]
+ "ch.", "sec.", "cf.", "cp."]
abbrPairs = map (break (=='.')) abbrevs
in map snd $ filter (\(y,_) -> y == x) abbrPairs
-- an endline character that can be treated as a space, not a structural break
-endline :: GenParser Char ParserState Inline
+endline :: Parser [Char] ParserState Inline
endline = try $ do
newline
notFollowedBy blankline
@@ -1157,20 +1159,20 @@ endline = try $ do
--
-- a reference label for a link
-reference :: GenParser Char ParserState [Inline]
+reference :: Parser [Char] ParserState [Inline]
reference = do notFollowedBy' (string "[^") -- footnote reference
result <- inlinesInBalancedBrackets inline
return $ normalizeSpaces result
-- source for a link, with optional title
-source :: GenParser Char ParserState (String, [Char])
+source :: Parser [Char] ParserState (String, [Char])
source =
(try $ charsInBalanced '(' ')' litChar >>= parseFromString source') <|>
-- the following is needed for cases like: [ref](/url(a).
(enclosed (char '(') (char ')') litChar >>= parseFromString source')
-- auxiliary function for source
-source' :: GenParser Char ParserState (String, [Char])
+source' :: Parser [Char] ParserState (String, [Char])
source' = do
skipSpaces
let nl = char '\n' >>~ notFollowedBy blankline
@@ -1188,7 +1190,7 @@ source' = do
eof
return (escapeURI $ removeTrailingSpace src, tit)
-linkTitle :: GenParser Char ParserState String
+linkTitle :: Parser [Char] ParserState String
linkTitle = try $ do
(many1 spaceChar >> option '\n' newline) <|> newline
skipSpaces
@@ -1196,7 +1198,7 @@ linkTitle = try $ do
tit <- manyTill litChar (try (char delim >> skipSpaces >> eof))
return $ fromEntities tit
-link :: GenParser Char ParserState Inline
+link :: Parser [Char] ParserState Inline
link = try $ do
lab <- reference
(src, tit) <- source <|> referenceLink lab
@@ -1209,7 +1211,7 @@ delinkify = bottomUp $ concatMap go
-- a link like [this][ref] or [this][] or [this]
referenceLink :: [Inline]
- -> GenParser Char ParserState (String, [Char])
+ -> Parser [Char] ParserState (String, [Char])
referenceLink lab = do
ref <- option [] (try (optional (char ' ') >>
optional (newline >> skipSpaces) >> reference))
@@ -1219,7 +1221,7 @@ referenceLink lab = do
Nothing -> fail "no corresponding key"
Just target -> return target
-autoLink :: GenParser Char ParserState Inline
+autoLink :: Parser [Char] ParserState Inline
autoLink = try $ do
char '<'
(orig, src) <- uri <|> emailAddress
@@ -1229,14 +1231,14 @@ autoLink = try $ do
then Link [Str orig] (src, "")
else Link [Code ("",["url"],[]) orig] (src, "")
-image :: GenParser Char ParserState Inline
+image :: Parser [Char] ParserState Inline
image = try $ do
char '!'
lab <- reference
(src, tit) <- source <|> referenceLink lab
return $ Image lab (src,tit)
-note :: GenParser Char ParserState Inline
+note :: Parser [Char] ParserState Inline
note = try $ do
failIfStrict
ref <- noteMarker
@@ -1253,21 +1255,21 @@ note = try $ do
updateState $ \st -> st{ stateNotes = notes }
return $ Note contents
-inlineNote :: GenParser Char ParserState Inline
+inlineNote :: Parser [Char] ParserState Inline
inlineNote = try $ do
failIfStrict
char '^'
contents <- inlinesInBalancedBrackets inline
return $ Note [Para contents]
-rawLaTeXInline' :: GenParser Char ParserState Inline
+rawLaTeXInline' :: Parser [Char] ParserState Inline
rawLaTeXInline' = try $ do
failIfStrict
lookAhead $ char '\\' >> notFollowedBy' (string "start") -- context env
RawInline _ s <- rawLaTeXInline
return $ RawInline "tex" s -- "tex" because it might be context or latex
-rawConTeXtEnvironment :: GenParser Char st String
+rawConTeXtEnvironment :: Parser [Char] st String
rawConTeXtEnvironment = try $ do
string "\\start"
completion <- inBrackets (letter <|> digit <|> spaceChar)
@@ -1276,14 +1278,14 @@ rawConTeXtEnvironment = try $ do
(try $ string "\\stop" >> string completion)
return $ "\\start" ++ completion ++ concat contents ++ "\\stop" ++ completion
-inBrackets :: (GenParser Char st Char) -> GenParser Char st String
+inBrackets :: (Parser [Char] st Char) -> Parser [Char] st String
inBrackets parser = do
char '['
contents <- many parser
char ']'
return $ "[" ++ contents ++ "]"
-rawHtmlInline :: GenParser Char ParserState Inline
+rawHtmlInline :: Parser [Char] ParserState Inline
rawHtmlInline = do
st <- getState
(_,result) <- if stateStrict st
@@ -1293,20 +1295,20 @@ rawHtmlInline = do
-- Citations
-cite :: GenParser Char ParserState Inline
+cite :: Parser [Char] ParserState Inline
cite = do
failIfStrict
citations <- textualCite <|> normalCite
return $ Cite citations []
-spnl :: GenParser Char st ()
+spnl :: Parser [Char] st ()
spnl = try $ do
skipSpaces
optional newline
skipSpaces
notFollowedBy (char '\n')
-textualCite :: GenParser Char ParserState [Citation]
+textualCite :: Parser [Char] ParserState [Citation]
textualCite = try $ do
(_, key) <- citeKey
let first = Citation{ citationId = key
@@ -1321,7 +1323,7 @@ textualCite = try $ do
then option [first] $ bareloc first
else return $ first : rest
-bareloc :: Citation -> GenParser Char ParserState [Citation]
+bareloc :: Citation -> Parser [Char] ParserState [Citation]
bareloc c = try $ do
spnl
char '['
@@ -1331,7 +1333,7 @@ bareloc c = try $ do
char ']'
return $ c{ citationSuffix = suff } : rest
-normalCite :: GenParser Char ParserState [Citation]
+normalCite :: Parser [Char] ParserState [Citation]
normalCite = try $ do
char '['
spnl
@@ -1340,7 +1342,7 @@ normalCite = try $ do
char ']'
return citations
-citeKey :: GenParser Char ParserState (Bool, String)
+citeKey :: Parser [Char] ParserState (Bool, String)
citeKey = try $ do
suppress_author <- option False (char '-' >> return True)
char '@'
@@ -1352,7 +1354,7 @@ citeKey = try $ do
guard $ key `elem` stateCitations st
return (suppress_author, key)
-suffix :: GenParser Char ParserState [Inline]
+suffix :: Parser [Char] ParserState [Inline]
suffix = try $ do
hasSpace <- option False (notFollowedBy nonspaceChar >> return True)
spnl
@@ -1361,14 +1363,14 @@ suffix = try $ do
then Space : rest
else rest
-prefix :: GenParser Char ParserState [Inline]
+prefix :: Parser [Char] ParserState [Inline]
prefix = liftM normalizeSpaces $
manyTill inline (char ']' <|> liftM (const ']') (lookAhead citeKey))
-citeList :: GenParser Char ParserState [Citation]
+citeList :: Parser [Char] ParserState [Citation]
citeList = sepBy1 citation (try $ char ';' >> spnl)
-citation :: GenParser Char ParserState Citation
+citation :: Parser [Char] ParserState Citation
citation = try $ do
pref <- prefix
(suppress_author, key) <- citeKey
diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs
index 456b23ce8..1806866ce 100644
--- a/src/Text/Pandoc/Readers/RST.hs
+++ b/src/Text/Pandoc/Readers/RST.hs
@@ -33,8 +33,7 @@ module Text.Pandoc.Readers.RST (
import Text.Pandoc.Definition
import Text.Pandoc.Shared
import Text.Pandoc.Parsing
-import Text.ParserCombinators.Parsec
-import Control.Monad ( when, liftM )
+import Control.Monad ( when, liftM, guard, mzero )
import Data.List ( findIndex, intercalate, transpose, sort, deleteFirstsBy )
import qualified Data.Map as M
import Text.Printf ( printf )
@@ -58,7 +57,7 @@ underlineChars = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
-- treat these as potentially non-text when parsing inline:
specialChars :: [Char]
-specialChars = "\\`|*_<>$:[]()-.\"'\8216\8217\8220\8221"
+specialChars = "\\`|*_<>$:/[]{}()-.\"'\8216\8217\8220\8221"
--
-- parsing documents
@@ -89,7 +88,7 @@ titleTransform ((Header 1 head1):rest) |
(promoteHeaders 1 rest, head1)
titleTransform blocks = (blocks, [])
-parseRST :: GenParser Char ParserState Pandoc
+parseRST :: Parser [Char] ParserState Pandoc
parseRST = do
optional blanklines -- skip blank lines at beginning of file
startPos <- getPosition
@@ -118,17 +117,19 @@ parseRST = do
-- parsing blocks
--
-parseBlocks :: GenParser Char ParserState [Block]
+parseBlocks :: Parser [Char] ParserState [Block]
parseBlocks = manyTill block eof
-block :: GenParser Char ParserState Block
+block :: Parser [Char] ParserState Block
block = choice [ codeBlock
, rawBlock
, blockQuote
, fieldList
, imageBlock
+ , figureBlock
, customCodeBlock
, mathBlock
+ , defaultRoleBlock
, unknownDirective
, header
, hrule
@@ -144,7 +145,7 @@ block = choice [ codeBlock
-- field list
--
-rawFieldListItem :: String -> GenParser Char ParserState (String, String)
+rawFieldListItem :: String -> Parser [Char] ParserState (String, String)
rawFieldListItem indent = try $ do
string indent
char ':'
@@ -158,7 +159,7 @@ rawFieldListItem indent = try $ do
return (name, raw)
fieldListItem :: String
- -> GenParser Char ParserState (Maybe ([Inline], [[Block]]))
+ -> Parser [Char] ParserState (Maybe ([Inline], [[Block]]))
fieldListItem indent = try $ do
(name, raw) <- rawFieldListItem indent
let term = [Str name]
@@ -185,7 +186,7 @@ extractContents [Plain auth] = auth
extractContents [Para auth] = auth
extractContents _ = []
-fieldList :: GenParser Char ParserState Block
+fieldList :: Parser [Char] ParserState Block
fieldList = try $ do
indent <- lookAhead $ many spaceChar
items <- many1 $ fieldListItem indent
@@ -197,7 +198,7 @@ fieldList = try $ do
-- line block
--
-lineBlockLine :: GenParser Char ParserState [Inline]
+lineBlockLine :: Parser [Char] ParserState [Inline]
lineBlockLine = try $ do
char '|'
char ' ' <|> lookAhead (char '\n')
@@ -208,7 +209,7 @@ lineBlockLine = try $ do
then normalizeSpaces line
else Str white : normalizeSpaces line
-lineBlock :: GenParser Char ParserState Block
+lineBlock :: Parser [Char] ParserState Block
lineBlock = try $ do
lines' <- many1 lineBlockLine
blanklines
@@ -218,14 +219,14 @@ lineBlock = try $ do
-- paragraph block
--
-para :: GenParser Char ParserState Block
+para :: Parser [Char] ParserState Block
para = paraBeforeCodeBlock <|> paraNormal <?> "paragraph"
-codeBlockStart :: GenParser Char st Char
+codeBlockStart :: Parser [Char] st Char
codeBlockStart = string "::" >> blankline >> blankline
-- paragraph that ends in a :: starting a code block
-paraBeforeCodeBlock :: GenParser Char ParserState Block
+paraBeforeCodeBlock :: Parser [Char] ParserState Block
paraBeforeCodeBlock = try $ do
result <- many1 (notFollowedBy' codeBlockStart >> inline)
lookAhead (string "::")
@@ -234,21 +235,21 @@ paraBeforeCodeBlock = try $ do
else (normalizeSpaces result) ++ [Str ":"]
-- regular paragraph
-paraNormal :: GenParser Char ParserState Block
+paraNormal :: Parser [Char] ParserState Block
paraNormal = try $ do
result <- many1 inline
newline
blanklines
return $ Para $ normalizeSpaces result
-plain :: GenParser Char ParserState Block
+plain :: Parser [Char] ParserState Block
plain = many1 inline >>= return . Plain . normalizeSpaces
--
-- image block
--
-imageBlock :: GenParser Char ParserState Block
+imageBlock :: Parser [Char] ParserState Block
imageBlock = try $ do
string ".. image:: "
src <- manyTill anyChar newline
@@ -263,11 +264,11 @@ imageBlock = try $ do
-- header blocks
--
-header :: GenParser Char ParserState Block
+header :: Parser [Char] ParserState Block
header = doubleHeader <|> singleHeader <?> "header"
-- a header with lines on top and bottom
-doubleHeader :: GenParser Char ParserState Block
+doubleHeader :: Parser [Char] ParserState Block
doubleHeader = try $ do
c <- oneOf underlineChars
rest <- many (char c) -- the top line
@@ -292,7 +293,7 @@ doubleHeader = try $ do
return $ Header level (normalizeSpaces txt)
-- a header with line on the bottom only
-singleHeader :: GenParser Char ParserState Block
+singleHeader :: Parser [Char] ParserState Block
singleHeader = try $ do
notFollowedBy' whitespace
txt <- many1 (do {notFollowedBy blankline; inline})
@@ -315,7 +316,7 @@ singleHeader = try $ do
-- hrule block
--
-hrule :: GenParser Char st Block
+hrule :: Parser [Char] st Block
hrule = try $ do
chr <- oneOf underlineChars
count 3 (char chr)
@@ -329,14 +330,14 @@ hrule = try $ do
--
-- read a line indented by a given string
-indentedLine :: String -> GenParser Char st [Char]
+indentedLine :: String -> Parser [Char] st [Char]
indentedLine indents = try $ do
string indents
manyTill anyChar newline
-- one or more indented lines, possibly separated by blank lines.
-- any amount of indentation will work.
-indentedBlock :: GenParser Char st [Char]
+indentedBlock :: Parser [Char] st [Char]
indentedBlock = try $ do
indents <- lookAhead $ many1 spaceChar
lns <- many1 $ try $ do b <- option "" blanklines
@@ -345,7 +346,7 @@ indentedBlock = try $ do
optional blanklines
return $ unlines lns
-codeBlock :: GenParser Char st Block
+codeBlock :: Parser [Char] st Block
codeBlock = try $ do
codeBlockStart
result <- indentedBlock
@@ -353,7 +354,7 @@ codeBlock = try $ do
-- | The 'code-block' directive (from Sphinx) that allows a language to be
-- specified.
-customCodeBlock :: GenParser Char st Block
+customCodeBlock :: Parser [Char] st Block
customCodeBlock = try $ do
string ".. code-block:: "
language <- manyTill anyChar newline
@@ -361,19 +362,33 @@ customCodeBlock = try $ do
result <- indentedBlock
return $ CodeBlock ("", ["sourceCode", language], []) $ stripTrailingNewlines result
+
+figureBlock :: Parser [Char] ParserState Block
+figureBlock = try $ do
+ string ".. figure::"
+ src <- removeLeadingTrailingSpace `fmap` manyTill anyChar newline
+ body <- indentedBlock
+ caption <- parseFromString extractCaption body
+ return $ Para [Image caption (src,"")]
+
+extractCaption :: Parser [Char] ParserState [Inline]
+extractCaption = try $ do
+ manyTill anyLine blanklines
+ many inline
+
-- | The 'math' directive (from Sphinx) for display math.
-mathBlock :: GenParser Char st Block
+mathBlock :: Parser [Char] st Block
mathBlock = try $ do
string ".. math::"
mathBlockMultiline <|> mathBlockOneLine
-mathBlockOneLine :: GenParser Char st Block
+mathBlockOneLine :: Parser [Char] st Block
mathBlockOneLine = try $ do
result <- manyTill anyChar newline
blanklines
return $ Para [Math DisplayMath $ removeLeadingTrailingSpace result]
-mathBlockMultiline :: GenParser Char st Block
+mathBlockMultiline :: Parser [Char] st Block
mathBlockMultiline = try $ do
blanklines
result <- indentedBlock
@@ -388,7 +403,7 @@ mathBlockMultiline = try $ do
$ filter (not . null) $ splitBy null lns'
return $ Para $ map (Math DisplayMath) eqs
-lhsCodeBlock :: GenParser Char ParserState Block
+lhsCodeBlock :: Parser [Char] ParserState Block
lhsCodeBlock = try $ do
failUnlessLHS
optional codeBlockStart
@@ -402,7 +417,7 @@ lhsCodeBlock = try $ do
blanklines
return $ CodeBlock ("", ["sourceCode", "literate", "haskell"], []) $ intercalate "\n" lns'
-birdTrackLine :: GenParser Char st [Char]
+birdTrackLine :: Parser [Char] st [Char]
birdTrackLine = do
char '>'
manyTill anyChar newline
@@ -411,7 +426,7 @@ birdTrackLine = do
-- raw html/latex/etc
--
-rawBlock :: GenParser Char st Block
+rawBlock :: Parser [Char] st Block
rawBlock = try $ do
string ".. raw:: "
lang <- many1 (letter <|> digit)
@@ -423,7 +438,7 @@ rawBlock = try $ do
-- block quotes
--
-blockQuote :: GenParser Char ParserState Block
+blockQuote :: Parser [Char] ParserState Block
blockQuote = do
raw <- indentedBlock
-- parse the extracted block, which may contain various block elements:
@@ -434,10 +449,10 @@ blockQuote = do
-- list blocks
--
-list :: GenParser Char ParserState Block
+list :: Parser [Char] ParserState Block
list = choice [ bulletList, orderedList, definitionList ] <?> "list"
-definitionListItem :: GenParser Char ParserState ([Inline], [[Block]])
+definitionListItem :: Parser [Char] ParserState ([Inline], [[Block]])
definitionListItem = try $ do
-- avoid capturing a directive or comment
notFollowedBy (try $ char '.' >> char '.')
@@ -447,11 +462,11 @@ definitionListItem = try $ do
contents <- parseFromString parseBlocks $ raw ++ "\n"
return (normalizeSpaces term, [contents])
-definitionList :: GenParser Char ParserState Block
+definitionList :: Parser [Char] ParserState Block
definitionList = many1 definitionListItem >>= return . DefinitionList
-- parses bullet list start and returns its length (inc. following whitespace)
-bulletListStart :: GenParser Char st Int
+bulletListStart :: Parser [Char] st Int
bulletListStart = try $ do
notFollowedBy' hrule -- because hrules start out just like lists
marker <- oneOf bulletListMarkers
@@ -461,14 +476,14 @@ bulletListStart = try $ do
-- parses ordered list start and returns its length (inc following whitespace)
orderedListStart :: ListNumberStyle
-> ListNumberDelim
- -> GenParser Char ParserState Int
+ -> Parser [Char] ParserState Int
orderedListStart style delim = try $ do
(_, markerLen) <- withHorizDisplacement (orderedListMarker style delim)
white <- many1 spaceChar
return $ markerLen + length white
-- parse a line of a list item
-listLine :: Int -> GenParser Char ParserState [Char]
+listLine :: Int -> Parser [Char] ParserState [Char]
listLine markerLength = try $ do
notFollowedBy blankline
indentWith markerLength
@@ -476,7 +491,7 @@ listLine markerLength = try $ do
return $ line ++ "\n"
-- indent by specified number of spaces (or equiv. tabs)
-indentWith :: Int -> GenParser Char ParserState [Char]
+indentWith :: Int -> Parser [Char] ParserState [Char]
indentWith num = do
state <- getState
let tabStop = stateTabStop state
@@ -486,8 +501,8 @@ indentWith num = do
(try (char '\t' >> count (num - tabStop) (char ' '))) ]
-- parse raw text for one list item, excluding start marker and continuations
-rawListItem :: GenParser Char ParserState Int
- -> GenParser Char ParserState (Int, [Char])
+rawListItem :: Parser [Char] ParserState Int
+ -> Parser [Char] ParserState (Int, [Char])
rawListItem start = try $ do
markerLength <- start
firstLine <- manyTill anyChar newline
@@ -497,14 +512,14 @@ rawListItem start = try $ do
-- continuation of a list item - indented and separated by blankline or
-- (in compact lists) endline.
-- Note: nested lists are parsed as continuations.
-listContinuation :: Int -> GenParser Char ParserState [Char]
+listContinuation :: Int -> Parser [Char] ParserState [Char]
listContinuation markerLength = try $ do
blanks <- many1 blankline
result <- many1 (listLine markerLength)
return $ blanks ++ concat result
-listItem :: GenParser Char ParserState Int
- -> GenParser Char ParserState [Block]
+listItem :: Parser [Char] ParserState Int
+ -> Parser [Char] ParserState [Block]
listItem start = try $ do
(markerLength, first) <- rawListItem start
rest <- many (listContinuation markerLength)
@@ -521,22 +536,40 @@ listItem start = try $ do
updateState (\st -> st {stateParserContext = oldContext})
return parsed
-orderedList :: GenParser Char ParserState Block
+orderedList :: Parser [Char] ParserState Block
orderedList = try $ do
(start, style, delim) <- lookAhead (anyOrderedListMarker >>~ spaceChar)
items <- many1 (listItem (orderedListStart style delim))
let items' = compactify items
return $ OrderedList (start, style, delim) items'
-bulletList :: GenParser Char ParserState Block
+bulletList :: Parser [Char] ParserState Block
bulletList = many1 (listItem bulletListStart) >>=
return . BulletList . compactify
--
+-- default-role block
+--
+
+defaultRoleBlock :: Parser [Char] ParserState Block
+defaultRoleBlock = try $ do
+ string ".. default-role::"
+ -- doesn't enforce any restrictions on the role name; embedded spaces shouldn't be allowed, for one
+ role <- manyTill anyChar newline >>= return . removeLeadingTrailingSpace
+ updateState $ \s -> s { stateRstDefaultRole =
+ if null role
+ then stateRstDefaultRole defaultParserState
+ else role
+ }
+ -- skip body of the directive if it exists
+ many $ blanklines <|> (spaceChar >> manyTill anyChar newline)
+ return Null
+
+--
-- unknown directive (e.g. comment)
--
-unknownDirective :: GenParser Char st Block
+unknownDirective :: Parser [Char] st Block
unknownDirective = try $ do
string ".."
notFollowedBy (noneOf " \t\n")
@@ -548,7 +581,7 @@ unknownDirective = try $ do
--- note block
---
-noteBlock :: GenParser Char ParserState [Char]
+noteBlock :: Parser [Char] ParserState [Char]
noteBlock = try $ do
startPos <- getPosition
string ".."
@@ -567,7 +600,7 @@ noteBlock = try $ do
-- return blanks so line count isn't affected
return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
-noteMarker :: GenParser Char ParserState [Char]
+noteMarker :: Parser [Char] ParserState [Char]
noteMarker = do
char '['
res <- many1 digit
@@ -580,13 +613,13 @@ noteMarker = do
-- reference key
--
-quotedReferenceName :: GenParser Char ParserState [Inline]
+quotedReferenceName :: Parser [Char] ParserState [Inline]
quotedReferenceName = try $ do
char '`' >> notFollowedBy (char '`') -- `` means inline code!
label' <- many1Till inline (char '`')
return label'
-unquotedReferenceName :: GenParser Char ParserState [Inline]
+unquotedReferenceName :: Parser [Char] ParserState [Inline]
unquotedReferenceName = try $ do
label' <- many1Till inline (lookAhead $ char ':')
return label'
@@ -595,24 +628,24 @@ unquotedReferenceName = try $ do
-- plus isolated (no two adjacent) internal hyphens, underscores,
-- periods, colons and plus signs; no whitespace or other characters
-- are allowed.
-simpleReferenceName' :: GenParser Char st String
+simpleReferenceName' :: Parser [Char] st String
simpleReferenceName' = do
x <- alphaNum
xs <- many $ alphaNum
<|> (try $ oneOf "-_:+." >> lookAhead alphaNum)
return (x:xs)
-simpleReferenceName :: GenParser Char st [Inline]
+simpleReferenceName :: Parser [Char] st [Inline]
simpleReferenceName = do
raw <- simpleReferenceName'
return [Str raw]
-referenceName :: GenParser Char ParserState [Inline]
+referenceName :: Parser [Char] ParserState [Inline]
referenceName = quotedReferenceName <|>
(try $ simpleReferenceName >>~ lookAhead (char ':')) <|>
unquotedReferenceName
-referenceKey :: GenParser Char ParserState [Char]
+referenceKey :: Parser [Char] ParserState [Char]
referenceKey = do
startPos <- getPosition
(key, target) <- choice [imageKey, anonymousKey, regularKey]
@@ -624,7 +657,7 @@ referenceKey = do
-- return enough blanks to replace key
return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
-targetURI :: GenParser Char st [Char]
+targetURI :: Parser [Char] st [Char]
targetURI = do
skipSpaces
optional newline
@@ -633,7 +666,7 @@ targetURI = do
blanklines
return $ escapeURI $ removeLeadingTrailingSpace $ contents
-imageKey :: GenParser Char ParserState (Key, Target)
+imageKey :: Parser [Char] ParserState (Key, Target)
imageKey = try $ do
string ".. |"
ref <- manyTill inline (char '|')
@@ -642,14 +675,14 @@ imageKey = try $ do
src <- targetURI
return (toKey (normalizeSpaces ref), (src, ""))
-anonymousKey :: GenParser Char st (Key, Target)
+anonymousKey :: Parser [Char] st (Key, Target)
anonymousKey = try $ do
oneOfStrings [".. __:", "__"]
src <- targetURI
pos <- getPosition
return (toKey [Str $ "_" ++ printf "%09d" (sourceLine pos)], (src, ""))
-regularKey :: GenParser Char ParserState (Key, Target)
+regularKey :: Parser [Char] ParserState (Key, Target)
regularKey = try $ do
string ".. _"
ref <- referenceName
@@ -674,31 +707,31 @@ regularKey = try $ do
-- Grid tables TODO:
-- - column spans
-dashedLine :: Char -> GenParser Char st (Int, Int)
+dashedLine :: Char -> Parser [Char] st (Int, Int)
dashedLine ch = do
dashes <- many1 (char ch)
sp <- many (char ' ')
return (length dashes, length $ dashes ++ sp)
-simpleDashedLines :: Char -> GenParser Char st [(Int,Int)]
+simpleDashedLines :: Char -> Parser [Char] st [(Int,Int)]
simpleDashedLines ch = try $ many1 (dashedLine ch)
-- Parse a table row separator
-simpleTableSep :: Char -> GenParser Char ParserState Char
+simpleTableSep :: Char -> Parser [Char] ParserState Char
simpleTableSep ch = try $ simpleDashedLines ch >> newline
-- Parse a table footer
-simpleTableFooter :: GenParser Char ParserState [Char]
+simpleTableFooter :: Parser [Char] ParserState [Char]
simpleTableFooter = try $ simpleTableSep '=' >> blanklines
-- Parse a raw line and split it into chunks by indices.
-simpleTableRawLine :: [Int] -> GenParser Char ParserState [String]
+simpleTableRawLine :: [Int] -> Parser [Char] ParserState [String]
simpleTableRawLine indices = do
line <- many1Till anyChar newline
return (simpleTableSplitLine indices line)
-- Parse a table row and return a list of blocks (columns).
-simpleTableRow :: [Int] -> GenParser Char ParserState [[Block]]
+simpleTableRow :: [Int] -> Parser [Char] ParserState [[Block]]
simpleTableRow indices = do
notFollowedBy' simpleTableFooter
firstLine <- simpleTableRawLine indices
@@ -712,7 +745,7 @@ simpleTableSplitLine indices line =
$ tail $ splitByIndices (init indices) line
simpleTableHeader :: Bool -- ^ Headerless table
- -> GenParser Char ParserState ([[Block]], [Alignment], [Int])
+ -> Parser [Char] ParserState ([[Block]], [Alignment], [Int])
simpleTableHeader headless = try $ do
optional blanklines
rawContent <- if headless
@@ -732,7 +765,7 @@ simpleTableHeader headless = try $ do
-- Parse a simple table.
simpleTable :: Bool -- ^ Headerless table
- -> GenParser Char ParserState Block
+ -> Parser [Char] ParserState Block
simpleTable headless = do
Table c a _w h l <- tableWith (simpleTableHeader headless) simpleTableRow sep simpleTableFooter (return [])
-- Simple tables get 0s for relative column widths (i.e., use default)
@@ -741,10 +774,10 @@ simpleTable headless = do
sep = return () -- optional (simpleTableSep '-')
gridTable :: Bool -- ^ Headerless table
- -> GenParser Char ParserState Block
+ -> Parser [Char] ParserState Block
gridTable = gridTableWith block (return [])
-table :: GenParser Char ParserState Block
+table :: Parser [Char] ParserState Block
table = gridTable False <|> simpleTable False <|>
gridTable True <|> simpleTable True <?> "table"
@@ -753,7 +786,7 @@ table = gridTable False <|> simpleTable False <|>
-- inline
--
-inline :: GenParser Char ParserState Inline
+inline :: Parser [Char] ParserState Inline
inline = choice [ whitespace
, link
, str
@@ -771,66 +804,90 @@ inline = choice [ whitespace
, escapedChar
, symbol ] <?> "inline"
-hyphens :: GenParser Char ParserState Inline
+hyphens :: Parser [Char] ParserState Inline
hyphens = do
result <- many1 (char '-')
option Space endline
-- don't want to treat endline after hyphen or dash as a space
return $ Str result
-escapedChar :: GenParser Char st Inline
+escapedChar :: Parser [Char] st Inline
escapedChar = do c <- escaped anyChar
- return $ Str [c]
+ return $ if c == ' ' -- '\ ' is null in RST
+ then Str ""
+ else Str [c]
-symbol :: GenParser Char ParserState Inline
+symbol :: Parser [Char] ParserState Inline
symbol = do
result <- oneOf specialChars
return $ Str [result]
-- parses inline code, between codeStart and codeEnd
-code :: GenParser Char ParserState Inline
+code :: Parser [Char] ParserState Inline
code = try $ do
string "``"
result <- manyTill anyChar (try (string "``"))
return $ Code nullAttr
$ removeLeadingTrailingSpace $ intercalate " " $ lines result
-emph :: GenParser Char ParserState Inline
-emph = enclosed (char '*') (char '*') inline >>=
+-- succeeds only if we're not right after a str (ie. in middle of word)
+atStart :: Parser [Char] ParserState a -> Parser [Char] ParserState a
+atStart p = do
+ pos <- getPosition
+ st <- getState
+ -- single quote start can't be right after str
+ guard $ stateLastStrPos st /= Just pos
+ p
+
+emph :: Parser [Char] ParserState Inline
+emph = enclosed (atStart $ char '*') (char '*') inline >>=
return . Emph . normalizeSpaces
-strong :: GenParser Char ParserState Inline
-strong = enclosed (string "**") (try $ string "**") inline >>=
+strong :: Parser [Char] ParserState Inline
+strong = enclosed (atStart $ string "**") (try $ string "**") inline >>=
return . Strong . normalizeSpaces
-interpreted :: [Char] -> GenParser Char st [Char]
+-- Parses inline interpreted text which is required to have the given role.
+-- This decision is based on the role marker (if present),
+-- and the current default interpreted text role.
+interpreted :: [Char] -> Parser [Char] ParserState [Char]
interpreted role = try $ do
- optional $ try $ string "\\ "
- result <- enclosed (string $ ":" ++ role ++ ":`") (char '`') anyChar
- try (string "\\ ") <|> lookAhead (count 1 $ oneOf " \t\n") <|> (eof >> return "")
- return result
-
-superscript :: GenParser Char ParserState Inline
+ state <- getState
+ if role == stateRstDefaultRole state
+ then try markedInterpretedText <|> unmarkedInterpretedText
+ else markedInterpretedText
+ where
+ markedInterpretedText = try (roleMarker >> unmarkedInterpretedText)
+ <|> (unmarkedInterpretedText >>= (\txt -> roleMarker >> return txt))
+ roleMarker = string $ ":" ++ role ++ ":"
+ -- Note, this doesn't precisely implement the complex rule in
+ -- http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules
+ -- but it should be good enough for most purposes
+ unmarkedInterpretedText = do
+ result <- enclosed (atStart $ char '`') (char '`') anyChar
+ return result
+
+superscript :: Parser [Char] ParserState Inline
superscript = interpreted "sup" >>= \x -> return (Superscript [Str x])
-subscript :: GenParser Char ParserState Inline
+subscript :: Parser [Char] ParserState Inline
subscript = interpreted "sub" >>= \x -> return (Subscript [Str x])
-math :: GenParser Char ParserState Inline
+math :: Parser [Char] ParserState Inline
math = interpreted "math" >>= \x -> return (Math InlineMath x)
-whitespace :: GenParser Char ParserState Inline
+whitespace :: Parser [Char] ParserState Inline
whitespace = many1 spaceChar >> return Space <?> "whitespace"
-str :: GenParser Char ParserState Inline
+str :: Parser [Char] ParserState Inline
str = do
- result <- many1 (noneOf (specialChars ++ "\t\n "))
- pos <- getPosition
- updateState $ \s -> s{ stateLastStrPos = Just pos }
+ let strChar = noneOf ("\t\n " ++ specialChars)
+ result <- many1 strChar
+ updateLastStrPos
return $ Str result
-- an endline character that can be treated as a space, not a structural break
-endline :: GenParser Char ParserState Inline
+endline :: Parser [Char] ParserState Inline
endline = try $ do
newline
notFollowedBy blankline
@@ -846,10 +903,10 @@ endline = try $ do
-- links
--
-link :: GenParser Char ParserState Inline
+link :: Parser [Char] ParserState Inline
link = choice [explicitLink, referenceLink, autoLink] <?> "link"
-explicitLink :: GenParser Char ParserState Inline
+explicitLink :: Parser [Char] ParserState Inline
explicitLink = try $ do
char '`'
notFollowedBy (char '`') -- `` marks start of inline code
@@ -861,7 +918,7 @@ explicitLink = try $ do
return $ Link (normalizeSpaces label')
(escapeURI $ removeLeadingTrailingSpace src, "")
-referenceLink :: GenParser Char ParserState Inline
+referenceLink :: Parser [Char] ParserState Inline
referenceLink = try $ do
label' <- (quotedReferenceName <|> simpleReferenceName) >>~ char '_'
state <- getState
@@ -873,7 +930,7 @@ referenceLink = try $ do
do char '_'
let anonKeys = sort $ filter isAnonKey $ M.keys keyTable
if null anonKeys
- then pzero
+ then mzero
else return (head anonKeys)
(src,tit) <- case lookupKeySrc keyTable key of
Nothing -> fail "no corresponding key"
@@ -882,21 +939,21 @@ referenceLink = try $ do
when (isAnonKey key) $ updateState $ \s -> s{ stateKeys = M.delete key keyTable }
return $ Link (normalizeSpaces label') (src, tit)
-autoURI :: GenParser Char ParserState Inline
+autoURI :: Parser [Char] ParserState Inline
autoURI = do
(orig, src) <- uri
return $ Link [Str orig] (src, "")
-autoEmail :: GenParser Char ParserState Inline
+autoEmail :: Parser [Char] ParserState Inline
autoEmail = do
(orig, src) <- emailAddress
return $ Link [Str orig] (src, "")
-autoLink :: GenParser Char ParserState Inline
+autoLink :: Parser [Char] ParserState Inline
autoLink = autoURI <|> autoEmail
-- For now, we assume that all substitution references are for images.
-image :: GenParser Char ParserState Inline
+image :: Parser [Char] ParserState Inline
image = try $ do
char '|'
ref <- manyTill inline (char '|')
@@ -907,7 +964,7 @@ image = try $ do
Just target -> return target
return $ Image (normalizeSpaces ref) (src, tit)
-note :: GenParser Char ParserState Inline
+note :: Parser [Char] ParserState Inline
note = try $ do
ref <- noteMarker
char '_'
diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs
index 3b5954368..71ba26c8c 100644
--- a/src/Text/Pandoc/Readers/Textile.hs
+++ b/src/Text/Pandoc/Readers/Textile.hs
@@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Text.Pandoc.Readers.Textile
- Copyright : Copyright (C) 2010-2011 Paul Rivier and John MacFarlane
+ Copyright : Copyright (C) 2010-2012 Paul Rivier and John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : Paul Rivier <paul*rivier#demotera*com>
@@ -59,10 +59,11 @@ import Text.Pandoc.Definition
import Text.Pandoc.Shared
import Text.Pandoc.Parsing
import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag )
-import Text.ParserCombinators.Parsec
+import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock )
import Text.HTML.TagSoup.Match
-import Data.Char ( digitToInt, isLetter )
+import Data.Char ( digitToInt, isUpper )
import Control.Monad ( guard, liftM )
+import Control.Applicative ((<$>), (*>), (<*))
-- | Parse a Textile text and return a Pandoc document.
readTextile :: ParserState -- ^ Parser state, including options for parser
@@ -72,16 +73,8 @@ readTextile state s =
(readWith parseTextile) state{ stateOldDashes = True } (s ++ "\n\n")
---
--- Constants and data structure definitions
---
-
--- | Special chars border strings parsing
-specialChars :: [Char]
-specialChars = "\\[]<>*#_@~-+^&,.;:!?|\"'%()"
-
-- | Generate a Pandoc ADT from a textile document
-parseTextile :: GenParser Char ParserState Pandoc
+parseTextile :: Parser [Char] ParserState Pandoc
parseTextile = do
-- textile allows raw HTML and does smart punctuation by default
updateState (\state -> state { stateParseRaw = True, stateSmart = True })
@@ -99,10 +92,10 @@ parseTextile = do
blocks <- parseBlocks
return $ Pandoc (Meta [] [] []) blocks -- FIXME
-noteMarker :: GenParser Char ParserState [Char]
+noteMarker :: Parser [Char] ParserState [Char]
noteMarker = skipMany spaceChar >> string "fn" >> manyTill digit (char '.')
-noteBlock :: GenParser Char ParserState [Char]
+noteBlock :: Parser [Char] ParserState [Char]
noteBlock = try $ do
startPos <- getPosition
ref <- noteMarker
@@ -117,36 +110,37 @@ noteBlock = try $ do
return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
-- | Parse document blocks
-parseBlocks :: GenParser Char ParserState [Block]
+parseBlocks :: Parser [Char] ParserState [Block]
parseBlocks = manyTill block eof
-- | Block parsers list tried in definition order
-blockParsers :: [GenParser Char ParserState Block]
+blockParsers :: [Parser [Char] ParserState Block]
blockParsers = [ codeBlock
, header
, blockQuote
, hrule
, anyList
, rawHtmlBlock
+ , rawLaTeXBlock'
, maybeExplicitBlock "table" table
, maybeExplicitBlock "p" para
, nullBlock ]
-- | Any block in the order of definition of blockParsers
-block :: GenParser Char ParserState Block
+block :: Parser [Char] ParserState Block
block = choice blockParsers <?> "block"
-codeBlock :: GenParser Char ParserState Block
+codeBlock :: Parser [Char] ParserState Block
codeBlock = codeBlockBc <|> codeBlockPre
-codeBlockBc :: GenParser Char ParserState Block
+codeBlockBc :: Parser [Char] ParserState Block
codeBlockBc = try $ do
string "bc. "
contents <- manyTill anyLine blanklines
return $ CodeBlock ("",[],[]) $ unlines contents
-- | Code Blocks in Textile are between <pre> and </pre>
-codeBlockPre :: GenParser Char ParserState Block
+codeBlockPre :: Parser [Char] ParserState Block
codeBlockPre = try $ do
htmlTag (tagOpen (=="pre") null)
result' <- manyTill anyChar (try $ htmlTag (tagClose (=="pre")) >> blockBreak)
@@ -161,28 +155,23 @@ codeBlockPre = try $ do
return $ CodeBlock ("",[],[]) result'''
-- | Header of the form "hN. content" with N in 1..6
-header :: GenParser Char ParserState Block
+header :: Parser [Char] ParserState Block
header = try $ do
char 'h'
- level <- oneOf "123456" >>= return . digitToInt
- optional attributes
- char '.'
- whitespace
- name <- manyTill inline blockBreak
- return $ Header level (normalizeSpaces name)
+ level <- digitToInt <$> oneOf "123456"
+ optional attributes >> char '.' >> whitespace
+ name <- normalizeSpaces <$> manyTill inline blockBreak
+ return $ Header level name
-- | Blockquote of the form "bq. content"
-blockQuote :: GenParser Char ParserState Block
+blockQuote :: Parser [Char] ParserState Block
blockQuote = try $ do
- string "bq"
- optional attributes
- char '.'
- whitespace
- para >>= return . BlockQuote . (:[])
+ string "bq" >> optional attributes >> char '.' >> whitespace
+ BlockQuote . singleton <$> para
-- Horizontal rule
-hrule :: GenParser Char st Block
+hrule :: Parser [Char] st Block
hrule = try $ do
skipSpaces
start <- oneOf "-*"
@@ -197,73 +186,62 @@ hrule = try $ do
-- | Can be a bullet list or an ordered list. This implementation is
-- strict in the nesting, sublist must start at exactly "parent depth
-- plus one"
-anyList :: GenParser Char ParserState Block
-anyList = try $ do
- l <- anyListAtDepth 1
- blanklines
- return l
+anyList :: Parser [Char] ParserState Block
+anyList = try $ ( (anyListAtDepth 1) <* blanklines )
-- | This allow one type of list to be nested into an other type,
-- provided correct nesting
-anyListAtDepth :: Int -> GenParser Char ParserState Block
+anyListAtDepth :: Int -> Parser [Char] ParserState Block
anyListAtDepth depth = choice [ bulletListAtDepth depth,
orderedListAtDepth depth,
definitionList ]
-- | Bullet List of given depth, depth being the number of leading '*'
-bulletListAtDepth :: Int -> GenParser Char ParserState Block
-bulletListAtDepth depth = try $ do
- items <- many1 (bulletListItemAtDepth depth)
- return (BulletList items)
+bulletListAtDepth :: Int -> Parser [Char] ParserState Block
+bulletListAtDepth depth = try $ BulletList <$> many1 (bulletListItemAtDepth depth)
-- | Bullet List Item of given depth, depth being the number of
-- leading '*'
-bulletListItemAtDepth :: Int -> GenParser Char ParserState [Block]
-bulletListItemAtDepth depth = try $ do
- count depth (char '*')
- optional attributes
- whitespace
- p <- inlines >>= return . Plain
- sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[]))
- return (p:sublist)
+bulletListItemAtDepth :: Int -> Parser [Char] ParserState [Block]
+bulletListItemAtDepth = genericListItemAtDepth '*'
-- | Ordered List of given depth, depth being the number of
--- leading '#'
-orderedListAtDepth :: Int -> GenParser Char ParserState Block
+-- leading '#'
+orderedListAtDepth :: Int -> Parser [Char] ParserState Block
orderedListAtDepth depth = try $ do
items <- many1 (orderedListItemAtDepth depth)
return (OrderedList (1, DefaultStyle, DefaultDelim) items)
-- | Ordered List Item of given depth, depth being the number of
--- leading '#'
-orderedListItemAtDepth :: Int -> GenParser Char ParserState [Block]
-orderedListItemAtDepth depth = try $ do
- count depth (char '#')
- optional attributes
- whitespace
- p <- inlines >>= return . Plain
- sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[]))
- return (p:sublist)
+-- leading '#'
+orderedListItemAtDepth :: Int -> Parser [Char] ParserState [Block]
+orderedListItemAtDepth = genericListItemAtDepth '#'
+
+-- | Common implementation of list items
+genericListItemAtDepth :: Char -> Int -> Parser [Char] ParserState [Block]
+genericListItemAtDepth c depth = try $ do
+ count depth (char c) >> optional attributes >> whitespace
+ p <- inlines
+ sublist <- option [] (singleton <$> anyListAtDepth (depth + 1))
+ return ((Plain p):sublist)
-- | A definition list is a set of consecutive definition items
-definitionList :: GenParser Char ParserState Block
-definitionList = try $ do
- items <- many1 definitionListItem
- return $ DefinitionList items
+definitionList :: Parser [Char] ParserState Block
+definitionList = try $ DefinitionList <$> many1 definitionListItem
-- | A definition list item in textile begins with '- ', followed by
-- the term defined, then spaces and ":=". The definition follows, on
-- the same single line, or spaned on multiple line, after a line
-- break.
-definitionListItem :: GenParser Char ParserState ([Inline], [[Block]])
+definitionListItem :: Parser [Char] ParserState ([Inline], [[Block]])
definitionListItem = try $ do
string "- "
term <- many1Till inline (try (whitespace >> string ":="))
def <- inlineDef <|> multilineDef
return (term, def)
- where inlineDef :: GenParser Char ParserState [[Block]]
+ where inlineDef :: Parser [Char] ParserState [[Block]]
inlineDef = liftM (\d -> [[Plain d]]) $ try (whitespace >> inlines)
- multilineDef :: GenParser Char ParserState [[Block]]
+ multilineDef :: Parser [Char] ParserState [[Block]]
multilineDef = try $ do
optional whitespace >> newline
s <- many1Till anyChar (try (string "=:" >> newline))
@@ -273,59 +251,57 @@ definitionListItem = try $ do
-- | This terminates a block such as a paragraph. Because of raw html
-- blocks support, we have to lookAhead for a rawHtmlBlock.
-blockBreak :: GenParser Char ParserState ()
+blockBreak :: Parser [Char] ParserState ()
blockBreak = try (newline >> blanklines >> return ()) <|>
(lookAhead rawHtmlBlock >> return ())
+-- raw content
+
-- | A raw Html Block, optionally followed by blanklines
-rawHtmlBlock :: GenParser Char ParserState Block
+rawHtmlBlock :: Parser [Char] ParserState Block
rawHtmlBlock = try $ do
(_,b) <- htmlTag isBlockTag
optional blanklines
return $ RawBlock "html" b
+-- | Raw block of LaTeX content
+rawLaTeXBlock' :: Parser [Char] ParserState Block
+rawLaTeXBlock' = do
+ failIfStrict
+ RawBlock "latex" <$> (rawLaTeXBlock <* spaces)
+
+
-- | In textile, paragraphs are separated by blank lines.
-para :: GenParser Char ParserState Block
-para = try $ do
- content <- manyTill inline blockBreak
- return $ Para $ normalizeSpaces content
+para :: Parser [Char] ParserState Block
+para = try $ Para . normalizeSpaces <$> manyTill inline blockBreak
-- Tables
-- | A table cell spans until a pipe |
-tableCell :: GenParser Char ParserState TableCell
+tableCell :: Parser [Char] ParserState TableCell
tableCell = do
c <- many1 (noneOf "|\n")
content <- parseFromString (many1 inline) c
return $ [ Plain $ normalizeSpaces content ]
-- | A table row is made of many table cells
-tableRow :: GenParser Char ParserState [TableCell]
-tableRow = try $ do
- char '|'
- cells <- endBy1 tableCell (char '|')
- newline
- return cells
+tableRow :: Parser [Char] ParserState [TableCell]
+tableRow = try $ ( char '|' *> (endBy1 tableCell (char '|')) <* newline)
-- | Many table rows
-tableRows :: GenParser Char ParserState [[TableCell]]
+tableRows :: Parser [Char] ParserState [[TableCell]]
tableRows = many1 tableRow
-- | Table headers are made of cells separated by a tag "|_."
-tableHeaders :: GenParser Char ParserState [TableCell]
-tableHeaders = try $ do
- let separator = (try $ string "|_.")
- separator
- headers <- sepBy1 tableCell separator
- char '|'
- newline
- return headers
+tableHeaders :: Parser [Char] ParserState [TableCell]
+tableHeaders = let separator = (try $ string "|_.") in
+ try $ ( separator *> (sepBy1 tableCell separator) <* char '|' <* newline )
-- | A table with an optional header. Current implementation can
-- handle tables with and without header, but will parse cells
-- alignment attributes as content.
-table :: GenParser Char ParserState Block
+table :: Parser [Char] ParserState Block
table = try $ do
headers <- option [] tableHeaders
rows <- tableRows
@@ -341,8 +317,8 @@ table = try $ do
-- | Blocks like 'p' and 'table' do not need explicit block tag.
-- However, they can be used to set HTML/CSS attributes when needed.
maybeExplicitBlock :: String -- ^ block tag name
- -> GenParser Char ParserState Block -- ^ implicit block
- -> GenParser Char ParserState Block
+ -> Parser [Char] ParserState Block -- ^ implicit block
+ -> Parser [Char] ParserState Block
maybeExplicitBlock name blk = try $ do
optional $ try $ string name >> optional attributes >> char '.' >>
((try whitespace) <|> endline)
@@ -356,31 +332,27 @@ maybeExplicitBlock name blk = try $ do
-- | Any inline element
-inline :: GenParser Char ParserState Inline
+inline :: Parser [Char] ParserState Inline
inline = choice inlineParsers <?> "inline"
-- | List of consecutive inlines before a newline
-inlines :: GenParser Char ParserState [Inline]
+inlines :: Parser [Char] ParserState [Inline]
inlines = manyTill inline newline
-- | Inline parsers tried in order
-inlineParsers :: [GenParser Char ParserState Inline]
+inlineParsers :: [Parser [Char] ParserState Inline]
inlineParsers = [ autoLink
, str
, whitespace
, endline
, code
+ , escapedInline
, htmlSpan
, rawHtmlInline
+ , rawLaTeXInline'
, note
- , simpleInline (string "??") (Cite [])
- , simpleInline (string "**") Strong
- , simpleInline (string "__") Emph
- , simpleInline (char '*') Strong
- , simpleInline (char '_') Emph
- , simpleInline (char '-') Strikeout
- , simpleInline (char '^') Superscript
- , simpleInline (char '~') Subscript
+ , try $ (char '[' *> inlineMarkup <* char ']')
+ , inlineMarkup
, link
, image
, mark
@@ -388,97 +360,140 @@ inlineParsers = [ autoLink
, symbol
]
+-- | Inline markups
+inlineMarkup :: Parser [Char] ParserState Inline
+inlineMarkup = choice [ simpleInline (string "??") (Cite [])
+ , simpleInline (string "**") Strong
+ , simpleInline (string "__") Emph
+ , simpleInline (char '*') Strong
+ , simpleInline (char '_') Emph
+ , simpleInline (char '+') Emph -- approximates underline
+ , simpleInline (char '-') Strikeout
+ , simpleInline (char '^') Superscript
+ , simpleInline (char '~') Subscript
+ ]
+
-- | Trademark, registered, copyright
-mark :: GenParser Char st Inline
+mark :: Parser [Char] st Inline
mark = try $ char '(' >> (try tm <|> try reg <|> copy)
-reg :: GenParser Char st Inline
+reg :: Parser [Char] st Inline
reg = do
oneOf "Rr"
char ')'
return $ Str "\174"
-tm :: GenParser Char st Inline
+tm :: Parser [Char] st Inline
tm = do
oneOf "Tt"
oneOf "Mm"
char ')'
return $ Str "\8482"
-copy :: GenParser Char st Inline
+copy :: Parser [Char] st Inline
copy = do
oneOf "Cc"
char ')'
return $ Str "\169"
-note :: GenParser Char ParserState Inline
+note :: Parser [Char] ParserState Inline
note = try $ do
- char '['
- ref <- many1 digit
- char ']'
- state <- getState
- let notes = stateNotes state
+ ref <- (char '[' *> many1 digit <* char ']')
+ notes <- stateNotes <$> getState
case lookup ref notes of
Nothing -> fail "note not found"
Just raw -> liftM Note $ parseFromString parseBlocks raw
+-- | Special chars
+markupChars :: [Char]
+markupChars = "\\[]*#_@~-+^|%="
+
+-- | Break strings on following chars. Space tab and newline break for
+-- inlines breaking. Open paren breaks for mark. Quote, dash and dot
+-- break for smart punctuation. Punctuation breaks for regular
+-- punctuation. Double quote breaks for named links. > and < break
+-- for inline html.
+stringBreakers :: [Char]
+stringBreakers = " \t\n('-.,:!?;\"<>"
+
+wordBoundaries :: [Char]
+wordBoundaries = markupChars ++ stringBreakers
+
+-- | Parse a hyphened sequence of words
+hyphenedWords :: Parser [Char] ParserState String
+hyphenedWords = try $ do
+ hd <- noneOf wordBoundaries
+ tl <- many ( (noneOf wordBoundaries) <|>
+ try (oneOf markupChars <* lookAhead (noneOf wordBoundaries) ) )
+ let wd = hd:tl
+ option wd $ try $
+ (\r -> concat [wd, "-", r]) <$> (char '-' *> hyphenedWords)
+
-- | Any string
-str :: GenParser Char ParserState Inline
+str :: Parser [Char] ParserState Inline
str = do
- xs <- many1 (noneOf (specialChars ++ "\t\n "))
- optional $ try $ do
- lookAhead (char '(')
- notFollowedBy' mark
- getInput >>= setInput . (' ':) -- add space before acronym explanation
- -- parse a following hyphen if followed by a letter
- -- (this prevents unwanted interpretation as starting a strikeout section)
- result <- option xs $ try $ do
- char '-'
- next <- lookAhead letter
- guard $ isLetter (last xs) || isLetter next
- return $ xs ++ "-"
- pos <- getPosition
- updateState $ \s -> s{ stateLastStrPos = Just pos }
- return $ Str result
+ baseStr <- hyphenedWords
+ -- RedCloth compliance : if parsed word is uppercase and immediatly
+ -- followed by parens, parens content is unconditionally word acronym
+ fullStr <- option baseStr $ try $ do
+ guard $ all isUpper baseStr
+ acro <- enclosed (char '(') (char ')') anyChar
+ return $ concat [baseStr, " (", acro, ")"]
+ updateLastStrPos
+ return $ Str fullStr
-- | Textile allows HTML span infos, we discard them
-htmlSpan :: GenParser Char ParserState Inline
-htmlSpan = try $ do
- char '%'
- _ <- attributes
- content <- manyTill anyChar (char '%')
- return $ Str content
+htmlSpan :: Parser [Char] ParserState Inline
+htmlSpan = try $ Str <$> ( char '%' *> attributes *> manyTill anyChar (char '%') )
-- | Some number of space chars
-whitespace :: GenParser Char ParserState Inline
+whitespace :: Parser [Char] ParserState Inline
whitespace = many1 spaceChar >> return Space <?> "whitespace"
-- | In Textile, an isolated endline character is a line break
-endline :: GenParser Char ParserState Inline
+endline :: Parser [Char] ParserState Inline
endline = try $ do
newline >> notFollowedBy blankline
return LineBreak
-rawHtmlInline :: GenParser Char ParserState Inline
-rawHtmlInline = liftM (RawInline "html" . snd)
- $ htmlTag isInlineTag
+rawHtmlInline :: Parser [Char] ParserState Inline
+rawHtmlInline = RawInline "html" . snd <$> htmlTag isInlineTag
+
+-- | Raw LaTeX Inline
+rawLaTeXInline' :: Parser [Char] ParserState Inline
+rawLaTeXInline' = try $ do
+ failIfStrict
+ rawLaTeXInline
+
+-- | Textile standard link syntax is "label":target. But we
+-- can also have ["label":target].
+link :: Parser [Char] ParserState Inline
+link = linkB <|> linkNoB
+
+linkNoB :: Parser [Char] ParserState Inline
+linkNoB = try $ do
+ name <- surrounded (char '"') inline
+ char ':'
+ let stopChars = "!.,;:"
+ url <- manyTill nonspaceChar (lookAhead $ space <|> try (oneOf stopChars >> (space <|> newline)))
+ return $ Link name (url, "")
--- | Textile standard link syntax is "label":target
-link :: GenParser Char ParserState Inline
-link = try $ do
+linkB :: Parser [Char] ParserState Inline
+linkB = try $ do
+ char '['
name <- surrounded (char '"') inline
char ':'
- url <- manyTill (anyChar) (lookAhead $ (space <|> try (oneOf ".;,:" >> (space <|> newline))))
+ url <- manyTill nonspaceChar (char ']')
return $ Link name (url, "")
-- | Detect plain links to http or email.
-autoLink :: GenParser Char ParserState Inline
+autoLink :: Parser [Char] ParserState Inline
autoLink = do
(orig, src) <- (try uri <|> try emailAddress)
return $ Link [Str orig] (src, "")
-- | image embedding
-image :: GenParser Char ParserState Inline
+image :: Parser [Char] ParserState Inline
image = try $ do
char '!' >> notFollowedBy space
src <- manyTill anyChar (lookAhead $ oneOf "!(")
@@ -486,41 +501,53 @@ image = try $ do
char '!'
return $ Image [Str alt] (src, alt)
--- | Any special symbol defined in specialChars
-symbol :: GenParser Char ParserState Inline
-symbol = do
- result <- oneOf specialChars
- return $ Str [result]
+escapedInline :: Parser [Char] ParserState Inline
+escapedInline = escapedEqs <|> escapedTag
+
+escapedEqs :: Parser [Char] ParserState Inline
+escapedEqs = Str <$> (try $ string "==" *> manyTill anyChar (try $ string "=="))
+
+-- | literal text escaped btw <notextile> tags
+escapedTag :: Parser [Char] ParserState Inline
+escapedTag = Str <$>
+ (try $ string "<notextile>" *> manyTill anyChar (try $ string "</notextile>"))
+
+-- | Any special symbol defined in wordBoundaries
+symbol :: Parser [Char] ParserState Inline
+symbol = Str . singleton <$> oneOf wordBoundaries
-- | Inline code
-code :: GenParser Char ParserState Inline
+code :: Parser [Char] ParserState Inline
code = code1 <|> code2
-code1 :: GenParser Char ParserState Inline
-code1 = surrounded (char '@') anyChar >>= return . Code nullAttr
+code1 :: Parser [Char] ParserState Inline
+code1 = Code nullAttr <$> surrounded (char '@') anyChar
-code2 :: GenParser Char ParserState Inline
+code2 :: Parser [Char] ParserState Inline
code2 = do
htmlTag (tagOpen (=="tt") null)
- result' <- manyTill anyChar (try $ htmlTag $ tagClose (=="tt"))
- return $ Code nullAttr result'
+ Code nullAttr <$> manyTill anyChar (try $ htmlTag $ tagClose (=="tt"))
-- | Html / CSS attributes
-attributes :: GenParser Char ParserState String
+attributes :: Parser [Char] ParserState String
attributes = choice [ enclosed (char '(') (char ')') anyChar,
enclosed (char '{') (char '}') anyChar,
enclosed (char '[') (char ']') anyChar]
-- | Parses material surrounded by a parser.
-surrounded :: GenParser Char st t -- ^ surrounding parser
- -> GenParser Char st a -- ^ content parser (to be used repeatedly)
- -> GenParser Char st [a]
-surrounded border = enclosed border border
+surrounded :: Parser [Char] st t -- ^ surrounding parser
+ -> Parser [Char] st a -- ^ content parser (to be used repeatedly)
+ -> Parser [Char] st [a]
+surrounded border = enclosed border (try border)
-- | Inlines are most of the time of the same form
-simpleInline :: GenParser Char ParserState t -- ^ surrounding parser
+simpleInline :: Parser [Char] ParserState t -- ^ surrounding parser
-> ([Inline] -> Inline) -- ^ Inline constructor
- -> GenParser Char ParserState Inline -- ^ content parser (to be used repeatedly)
+ -> Parser [Char] ParserState Inline -- ^ content parser (to be used repeatedly)
simpleInline border construct = surrounded border (inlineWithAttribute) >>=
return . construct . normalizeSpaces
where inlineWithAttribute = (try $ optional attributes) >> inline
+
+-- | Create a singleton list
+singleton :: a -> [a]
+singleton x = [x]