diff options
-rw-r--r-- | INSTALL | 22 | ||||
-rw-r--r-- | src/Text/Pandoc/Parsing.hs | 5 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Textile.hs | 225 | ||||
-rw-r--r-- | tests/textile-reader.native | 10 | ||||
-rw-r--r-- | tests/textile-reader.textile | 16 |
5 files changed, 162 insertions, 116 deletions
@@ -130,3 +130,25 @@ This is essentially what the binary installer does. [blaze-html]: http://hackage.haskell.org/package/blaze-html [Cabal User's Guide]: http://www.haskell.org/cabal/release/latest/doc/users-guide/builders.html#setup-configure-paths + +Running tests +------------- + +Pandoc comes with an automated test suite integrated to cabal. Data +files are located under the 'tests' directory. If you implement a new +feature, please update them to improve covering, and make sure by any +necessary mean that the new reference native file is 100% correct. + +Also, tests require templates that leave in a separate git repository, +tied into the main one as a git submodule. To populate 'template' +directory, you must therefore run first : + + git submodule update --init templates + +You are now ready to build tests : + + cabal-dev install -ftests + +And finally run them ! + + cabal-dev test diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 22a8d4d50..140b96cfa 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -52,6 +52,7 @@ module Text.Pandoc.Parsing ( (>>~), failUnlessLHS, escaped, characterReference, + updateLastStrPos, anyOrderedListMarker, orderedListMarker, charRef, @@ -786,6 +787,10 @@ charOrRef cs = guard (c `elem` cs) return c) +updateLastStrPos :: GenParser Char ParserState () +updateLastStrPos = getPosition >>= \p -> + updateState $ \s -> s{ stateLastStrPos = Just p } + singleQuoteStart :: GenParser Char ParserState () singleQuoteStart = do failIfInQuoteContext InSingleQuote diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index 35c134b13..f9221ef9a 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA {- | Module : Text.Pandoc.Readers.Textile - Copyright : Copyright (C) 2010-2011 Paul Rivier and John MacFarlane + Copyright : Copyright (C) 2010-2012 Paul Rivier and John MacFarlane License : GNU GPL, version 2 or above Maintainer : Paul Rivier <paul*rivier#demotera*com> @@ -59,10 +59,12 @@ import Text.Pandoc.Definition import Text.Pandoc.Shared import Text.Pandoc.Parsing import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag ) +import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock ) import Text.ParserCombinators.Parsec import Text.HTML.TagSoup.Match -import Data.Char ( digitToInt, isLetter ) +import Data.Char ( digitToInt, isUpper ) import Control.Monad ( guard, liftM ) +import Control.Applicative ((<$>), (*>), (<*)) -- | Parse a Textile text and return a Pandoc document. readTextile :: ParserState -- ^ Parser state, including options for parser @@ -72,14 +74,6 @@ readTextile state s = (readWith parseTextile) state{ stateOldDashes = True } (s ++ "\n\n") --- --- Constants and data structure definitions --- - --- | Special chars border strings parsing -specialChars :: [Char] -specialChars = "\\[]<>*#_@~-+^&,.;:!?|\"'%()=" - -- | Generate a Pandoc ADT from a textile document parseTextile :: GenParser Char ParserState Pandoc parseTextile = do @@ -128,6 +122,7 @@ blockParsers = [ codeBlock , hrule , anyList , rawHtmlBlock + , rawLaTeXBlock' , maybeExplicitBlock "table" table , maybeExplicitBlock "p" para , nullBlock ] @@ -164,21 +159,16 @@ codeBlockPre = try $ do header :: GenParser Char ParserState Block header = try $ do char 'h' - level <- oneOf "123456" >>= return . digitToInt - optional attributes - char '.' - whitespace - name <- manyTill inline blockBreak - return $ Header level (normalizeSpaces name) + level <- digitToInt <$> oneOf "123456" + optional attributes >> char '.' >> whitespace + name <- normalizeSpaces <$> manyTill inline blockBreak + return $ Header level name -- | Blockquote of the form "bq. content" blockQuote :: GenParser Char ParserState Block blockQuote = try $ do - string "bq" - optional attributes - char '.' - whitespace - para >>= return . BlockQuote . (:[]) + string "bq" >> optional attributes >> char '.' >> whitespace + BlockQuote . singleton <$> para -- Horizontal rule @@ -198,10 +188,7 @@ hrule = try $ do -- strict in the nesting, sublist must start at exactly "parent depth -- plus one" anyList :: GenParser Char ParserState Block -anyList = try $ do - l <- anyListAtDepth 1 - blanklines - return l +anyList = try $ ( (anyListAtDepth 1) <* blanklines ) -- | This allow one type of list to be nested into an other type, -- provided correct nesting @@ -212,20 +199,12 @@ anyListAtDepth depth = choice [ bulletListAtDepth depth, -- | Bullet List of given depth, depth being the number of leading '*' bulletListAtDepth :: Int -> GenParser Char ParserState Block -bulletListAtDepth depth = try $ do - items <- many1 (bulletListItemAtDepth depth) - return (BulletList items) +bulletListAtDepth depth = try $ BulletList <$> many1 (bulletListItemAtDepth depth) -- | Bullet List Item of given depth, depth being the number of -- leading '*' bulletListItemAtDepth :: Int -> GenParser Char ParserState [Block] -bulletListItemAtDepth depth = try $ do - count depth (char '*') - optional attributes - whitespace - p <- inlines >>= return . Plain - sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[])) - return (p:sublist) +bulletListItemAtDepth = genericListItemAtDepth '*' -- | Ordered List of given depth, depth being the number of -- leading '#' @@ -237,19 +216,19 @@ orderedListAtDepth depth = try $ do -- | Ordered List Item of given depth, depth being the number of -- leading '#' orderedListItemAtDepth :: Int -> GenParser Char ParserState [Block] -orderedListItemAtDepth depth = try $ do - count depth (char '#') - optional attributes - whitespace - p <- inlines >>= return . Plain - sublist <- option [] (anyListAtDepth (depth + 1) >>= return . (:[])) - return (p:sublist) +orderedListItemAtDepth = genericListItemAtDepth '#' + +-- | Common implementation of list items +genericListItemAtDepth :: Char -> Int -> GenParser Char ParserState [Block] +genericListItemAtDepth c depth = try $ do + count depth (char c) >> optional attributes >> whitespace + p <- inlines + sublist <- option [] (singleton <$> anyListAtDepth (depth + 1)) + return ((Plain p):sublist) -- | A definition list is a set of consecutive definition items definitionList :: GenParser Char ParserState Block -definitionList = try $ do - items <- many1 definitionListItem - return $ DefinitionList items +definitionList = try $ DefinitionList <$> many1 definitionListItem -- | A definition list item in textile begins with '- ', followed by -- the term defined, then spaces and ":=". The definition follows, on @@ -277,6 +256,8 @@ blockBreak :: GenParser Char ParserState () blockBreak = try (newline >> blanklines >> return ()) <|> (lookAhead rawHtmlBlock >> return ()) +-- raw content + -- | A raw Html Block, optionally followed by blanklines rawHtmlBlock :: GenParser Char ParserState Block rawHtmlBlock = try $ do @@ -284,11 +265,16 @@ rawHtmlBlock = try $ do optional blanklines return $ RawBlock "html" b +-- | Raw block of LaTeX content +rawLaTeXBlock' :: GenParser Char ParserState Block +rawLaTeXBlock' = do + failIfStrict + RawBlock "latex" <$> (rawLaTeXBlock <* spaces) + + -- | In textile, paragraphs are separated by blank lines. para :: GenParser Char ParserState Block -para = try $ do - content <- manyTill inline blockBreak - return $ Para $ normalizeSpaces content +para = try $ Para . normalizeSpaces <$> manyTill inline blockBreak -- Tables @@ -302,11 +288,7 @@ tableCell = do -- | A table row is made of many table cells tableRow :: GenParser Char ParserState [TableCell] -tableRow = try $ do - char '|' - cells <- endBy1 tableCell (char '|') - newline - return cells +tableRow = try $ ( char '|' *> (endBy1 tableCell (char '|')) <* newline) -- | Many table rows tableRows :: GenParser Char ParserState [[TableCell]] @@ -314,13 +296,8 @@ tableRows = many1 tableRow -- | Table headers are made of cells separated by a tag "|_." tableHeaders :: GenParser Char ParserState [TableCell] -tableHeaders = try $ do - let separator = (try $ string "|_.") - separator - headers <- sepBy1 tableCell separator - char '|' - newline - return headers +tableHeaders = let separator = (try $ string "|_.") in + try $ ( separator *> (sepBy1 tableCell separator) <* char '|' <* newline ) -- | A table with an optional header. Current implementation can -- handle tables with and without header, but will parse cells @@ -373,15 +350,10 @@ inlineParsers = [ autoLink , escapedInline , htmlSpan , rawHtmlInline + , rawLaTeXInline' , note - , simpleInline (string "??") (Cite []) - , simpleInline (string "**") Strong - , simpleInline (string "__") Emph - , simpleInline (char '*') Strong - , simpleInline (char '_') Emph - , simpleInline (char '-') Strikeout - , simpleInline (char '^') Superscript - , simpleInline (char '~') Subscript + , try $ (char '[' *> inlineMarkup <* char ']') + , inlineMarkup , link , image , mark @@ -389,6 +361,18 @@ inlineParsers = [ autoLink , symbol ] +-- | Inline markups +inlineMarkup :: GenParser Char ParserState Inline +inlineMarkup = choice [ simpleInline (string "??") (Cite []) + , simpleInline (string "**") Strong + , simpleInline (string "__") Emph + , simpleInline (char '*') Strong + , simpleInline (char '_') Emph + , simpleInline (char '-') Strikeout + , simpleInline (char '^') Superscript + , simpleInline (char '~') Subscript + ] + -- | Trademark, registered, copyright mark :: GenParser Char st Inline mark = try $ char '(' >> (try tm <|> try reg <|> copy) @@ -414,41 +398,53 @@ copy = do note :: GenParser Char ParserState Inline note = try $ do - char '[' - ref <- many1 digit - char ']' - state <- getState - let notes = stateNotes state + ref <- (char '[' *> many1 digit <* char ']') + notes <- stateNotes <$> getState case lookup ref notes of Nothing -> fail "note not found" Just raw -> liftM Note $ parseFromString parseBlocks raw +-- | Special chars +markupChars :: [Char] +markupChars = "\\[]*#_@~-+^|%=" + +-- | Break strings on following chars. Space tab and newline break for +-- inlines breaking. Open paren breaks for mark. Quote, dash and dot +-- break for smart punctuation. Punctuation breaks for regular +-- punctuation. Double quote breaks for named links. > and < break +-- for inline html. +stringBreakers :: [Char] +stringBreakers = " \t\n('-.,:!?;\"<>" + +wordBoundaries :: [Char] +wordBoundaries = markupChars ++ stringBreakers + +-- | Parse a hyphened sequence of words +hyphenedWords :: GenParser Char ParserState String +hyphenedWords = try $ do + hd <- noneOf wordBoundaries + tl <- many ( (noneOf wordBoundaries) <|> + try (oneOf markupChars <* lookAhead (noneOf wordBoundaries) ) ) + let wd = hd:tl + option wd $ try $ + (\r -> concat [wd, "-", r]) <$> (char '-' *> hyphenedWords) + -- | Any string str :: GenParser Char ParserState Inline str = do - xs <- many1 (noneOf (specialChars ++ "\t\n ")) - optional $ try $ do - lookAhead (char '(') - notFollowedBy' mark - getInput >>= setInput . (' ':) -- add space before acronym explanation - -- parse a following hyphen if followed by a letter - -- (this prevents unwanted interpretation as starting a strikeout section) - result <- option xs $ try $ do - char '-' - next <- lookAhead letter - guard $ isLetter (last xs) || isLetter next - return $ xs ++ "-" - pos <- getPosition - updateState $ \s -> s{ stateLastStrPos = Just pos } - return $ Str result + baseStr <- hyphenedWords + -- RedCloth compliance : if parsed word is uppercase and immediatly + -- followed by parens, parens content is unconditionally word acronym + fullStr <- option baseStr $ try $ do + guard $ all isUpper baseStr + acro <- enclosed (char '(') (char ')') anyChar + return $ concat [baseStr, " (", acro, ")"] + updateLastStrPos + return $ Str fullStr -- | Textile allows HTML span infos, we discard them htmlSpan :: GenParser Char ParserState Inline -htmlSpan = try $ do - char '%' - _ <- attributes - content <- manyTill anyChar (char '%') - return $ Str content +htmlSpan = try $ Str <$> ( char '%' *> attributes *> manyTill anyChar (char '%') ) -- | Some number of space chars whitespace :: GenParser Char ParserState Inline @@ -461,8 +457,13 @@ endline = try $ do return LineBreak rawHtmlInline :: GenParser Char ParserState Inline -rawHtmlInline = liftM (RawInline "html" . snd) - $ htmlTag isInlineTag +rawHtmlInline = RawInline "html" . snd <$> htmlTag isInlineTag + +-- | Raw LaTeX Inline +rawLaTeXInline' :: GenParser Char ParserState Inline +rawLaTeXInline' = try $ do + failIfStrict + rawLaTeXInline -- | Textile standard link syntax is "label":target link :: GenParser Char ParserState Inline @@ -490,38 +491,36 @@ image = try $ do escapedInline :: GenParser Char ParserState Inline escapedInline = escapedEqs <|> escapedTag --- | literal text escaped between == ... == escapedEqs :: GenParser Char ParserState Inline -escapedEqs = try $ do - string "==" - contents <- manyTill anyChar (try $ string "==") - return $ Str contents +escapedEqs = Str <$> (try $ surrounded (string "==") anyChar) + +-- -- | literal text escaped between == ... == +-- escapedEqs :: GenParser Char ParserState Inline +-- escapedEqs = try $ do +-- string "==" +-- contents <- manyTill anyChar (try $ string "==") +-- return $ Str contents -- | literal text escaped btw <notextile> tags escapedTag :: GenParser Char ParserState Inline -escapedTag = try $ do - string "<notextile>" - contents <- manyTill anyChar (try $ string "</notextile>") - return $ Str contents +escapedTag = try $ Str <$> + enclosed (string "<notextile>") (string "</notextile>") anyChar --- | Any special symbol defined in specialChars +-- | Any special symbol defined in wordBoundaries symbol :: GenParser Char ParserState Inline -symbol = do - result <- oneOf specialChars - return $ Str [result] +symbol = Str . singleton <$> oneOf wordBoundaries -- | Inline code code :: GenParser Char ParserState Inline code = code1 <|> code2 code1 :: GenParser Char ParserState Inline -code1 = surrounded (char '@') anyChar >>= return . Code nullAttr +code1 = Code nullAttr <$> surrounded (char '@') anyChar code2 :: GenParser Char ParserState Inline code2 = do htmlTag (tagOpen (=="tt") null) - result' <- manyTill anyChar (try $ htmlTag $ tagClose (=="tt")) - return $ Code nullAttr result' + Code nullAttr <$> manyTill anyChar (try $ htmlTag $ tagClose (=="tt")) -- | Html / CSS attributes attributes :: GenParser Char ParserState String @@ -542,3 +541,7 @@ simpleInline :: GenParser Char ParserState t -- ^ surrounding parser simpleInline border construct = surrounded border (inlineWithAttribute) >>= return . construct . normalizeSpaces where inlineWithAttribute = (try $ optional attributes) >> inline + +-- | Create a singleton list +singleton :: a -> [a] +singleton x = [x] diff --git a/tests/textile-reader.native b/tests/textile-reader.native index 8e149c33d..d9fbc4672 100644 --- a/tests/textile-reader.native +++ b/tests/textile-reader.native @@ -67,9 +67,9 @@ Pandoc (Meta {docTitle = [], docAuthors = [], docDate = []}) ,([Str "beer"], [[Plain [Str "fresh",Space,Str "and",Space,Str "bitter"]]])] ,Header 1 [Str "Inline",Space,Str "Markup"] -,Para [Str "This",Space,Str "is",Space,Emph [Str "emphasized"],Str ",",Space,Str "and",Space,Str "so",Space,Emph [Str "is",Space,Str "this"],Str ".",LineBreak,Str "This",Space,Str "is",Space,Strong [Str "strong"],Str ",",Space,Str "and",Space,Str "so",Space,Strong [Str "is",Space,Str "this"],Str ".",LineBreak,Str "A",Space,Link [Strong [Str "strong",Space,Str "link"]] ("http://www.foobar.com",""),Str "."] +,Para [Str "This",Space,Str "is",Space,Emph [Str "emphasized"],Str ",",Space,Str "and",Space,Str "so",Space,Emph [Str "is",Space,Str "this"],Str ".",LineBreak,Str "This",Space,Str "is",Space,Strong [Str "strong"],Str ",",Space,Str "and",Space,Str "so",Space,Strong [Str "is",Space,Str "this"],Str ".",LineBreak,Str "Hyphenated-words-are-ok",Str ",",Space,Str "as",Space,Str "well",Space,Str "as",Space,Str "strange_underscore_notation",Str ".",LineBreak,Str "A",Space,Link [Strong [Str "strong",Space,Str "link"]] ("http://www.foobar.com",""),Str "."] ,Para [Emph [Strong [Str "This",Space,Str "is",Space,Str "strong",Space,Str "and",Space,Str "em",Str "."]],LineBreak,Str "So",Space,Str "is",Space,Strong [Emph [Str "this"]],Space,Str "word",Space,Str "and",Space,Emph [Strong [Str "that",Space,Str "one"]],Str ".",LineBreak,Strikeout [Str "This",Space,Str "is",Space,Str "strikeout",Space,Str "and",Space,Strong [Str "strong"]]] -,Para [Str "Superscripts",Str ":",Space,Str "a",Superscript [Str "bc"],Str "d",Space,Str "a",Superscript [Strong [Str "hello"]],Space,Str "a",Superscript [Str "hello",Space,Str "there"],Str ".",LineBreak,Str "Subscripts",Str ":",Space,Str "H",Subscript [Str "2"],Str "O",Str ",",Space,Str "H",Subscript [Str "23"],Str "O",Str ",",Space,Str "H",Subscript [Str "many",Space,Str "of",Space,Str "them"],Str "O",Str "."] +,Para [Str "Superscripts",Str ":",Space,Str "a",Superscript [Str "bc"],Str "d",Space,Str "a",Superscript [Strong [Str "hello"]],Space,Str "a",Superscript [Str "hello",Space,Str "there"],Str ".",LineBreak,Str "Subscripts",Str ":",Space,Subscript [Str "here"],Space,Str "H",Subscript [Str "2"],Str "O",Str ",",Space,Str "H",Subscript [Str "23"],Str "O",Str ",",Space,Str "H",Subscript [Str "many",Space,Str "of",Space,Str "them"],Str "O",Str "."] ,Para [Str "Dashes",Space,Str ":",Space,Str "How",Space,Str "cool",Space,Str "\8212",Space,Str "automatic",Space,Str "dashes",Str "."] ,Para [Str "Elipses",Space,Str ":",Space,Str "He",Space,Str "thought",Space,Str "and",Space,Str "thought",Space,Str "\8230",Space,Str "and",Space,Str "then",Space,Str "thought",Space,Str "some",Space,Str "more",Str "."] ,Para [Str "Quotes",Space,Str "and",Space,Str "apostrophes",Space,Str ":",Space,Quoted DoubleQuote [Str "I",Str "\8217",Str "d",Space,Str "like",Space,Str "to",Space,Str "thank",Space,Str "you"],Space,Str "for",Space,Str "example",Str "."] @@ -139,8 +139,12 @@ Pandoc (Meta {docTitle = [], docAuthors = [], docDate = []}) [[Plain [Str "this",Space,Str "<",Str "div",Str ">",Space,Str "won",Str "\8217",Str "t",Space,Str "produce",Space,Str "raw",Space,Str "html",Space,Str "blocks",Space,Str "<",Str "/div",Str ">"]] ,[Plain [Str "but",Space,Str "this",Space,RawInline "html" "<strong>",Space,Str "will",Space,Str "produce",Space,Str "inline",Space,Str "html",Space,RawInline "html" "</strong>"]]] ,Para [Str "Can",Space,Str "you",Space,Str "prove",Space,Str "that",Space,Str "2",Space,Str "<",Space,Str "3",Space,Str "?"] +,Header 1 [Str "Raw",Space,Str "LaTeX"] +,Para [Str "This",Space,Str "Textile",Space,Str "reader",Space,Str "also",Space,Str "accepts",Space,Str "raw",Space,Str "LaTeX",Space,Str "for",Space,Str "blocks",Space,Str ":"] +,RawBlock "latex" "\\begin{itemize}\n \\item one\n \\item two\n\\end{itemize}" +,Para [Str "and",Space,Str "for",Space,RawInline "latex" "\\emph{inlines}",Str "."] ,Header 1 [Str "Acronyms",Space,Str "and",Space,Str "marks"] -,Para [Str "PBS",Space,Str "(",Str "Public",Space,Str "Broadcasting",Space,Str "System",Str ")"] +,Para [Str "PBS (Public Broadcasting System)"] ,Para [Str "Hi",Str "\8482"] ,Para [Str "Hi",Space,Str "\8482"] ,Para [Str "\174",Space,Str "Hi",Str "\174"] diff --git a/tests/textile-reader.textile b/tests/textile-reader.textile index 85dcf142c..c6450fdfb 100644 --- a/tests/textile-reader.textile +++ b/tests/textile-reader.textile @@ -115,14 +115,15 @@ h1. Inline Markup This is _emphasized_, and so __is this__. This is *strong*, and so **is this**. +Hyphenated-words-are-ok, as well as strange_underscore_notation. A "*strong link*":http://www.foobar.com. _*This is strong and em.*_ So is *_this_* word and __**that one**__. -This is strikeout and *strong*- -Superscripts: a^bc^d a^*hello*^ a^hello there^. -Subscripts: H~2~O, H~23~O, H~many of them~O. +Superscripts: a[^bc^]d a^*hello*^ a[^hello there^]. +Subscripts: ~here~ H[~2~]O, H[~23~]O, H[~many of them~]O. Dashes : How cool -- automatic dashes. @@ -198,6 +199,17 @@ Html blocks can be <div>inlined</div> as well. Can you prove that 2 < 3 ? +h1. Raw LaTeX + +This Textile reader also accepts raw LaTeX for blocks : + +\begin{itemize} + \item one + \item two +\end{itemize} + +and for \emph{inlines}. + h1. Acronyms and marks PBS(Public Broadcasting System) |