aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
authorpaul.rivier <paul.r.ml@gmail.com>2012-04-24 15:56:59 +0200
committerpaul.rivier <paul.r.ml@gmail.com>2012-04-24 15:56:59 +0200
commit7b111542c0ef62802a65986b41829196510e5b3e (patch)
treee79f4d625668f78bbd112990ca0966ed91c5e875 /src/Text/Pandoc/Readers
parent411d54ce98f40b5196706804da6167079ed5d824 (diff)
downloadpandoc-7b111542c0ef62802a65986b41829196510e5b3e.tar.gz
textile reader improvements : better conformance to RedCloth Textile inlines
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/Textile.hs118
1 files changed, 67 insertions, 51 deletions
diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs
index 796f96e06..f9221ef9a 100644
--- a/src/Text/Pandoc/Readers/Textile.hs
+++ b/src/Text/Pandoc/Readers/Textile.hs
@@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Text.Pandoc.Readers.Textile
- Copyright : Copyright (C) 2010-2011 Paul Rivier and John MacFarlane
+ Copyright : Copyright (C) 2010-2012 Paul Rivier and John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : Paul Rivier <paul*rivier#demotera*com>
@@ -62,7 +62,7 @@ import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag )
import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock )
import Text.ParserCombinators.Parsec
import Text.HTML.TagSoup.Match
-import Data.Char ( digitToInt, isLetter )
+import Data.Char ( digitToInt, isUpper )
import Control.Monad ( guard, liftM )
import Control.Applicative ((<$>), (*>), (<*))
@@ -74,14 +74,6 @@ readTextile state s =
(readWith parseTextile) state{ stateOldDashes = True } (s ++ "\n\n")
---
--- Constants and data structure definitions
---
-
--- | Special chars border strings parsing
-specialChars :: [Char]
-specialChars = "\\[]<>*#_@~-+^&,.;:!?|\"'%()="
-
-- | Generate a Pandoc ADT from a textile document
parseTextile :: GenParser Char ParserState Pandoc
parseTextile = do
@@ -360,14 +352,8 @@ inlineParsers = [ autoLink
, rawHtmlInline
, rawLaTeXInline'
, note
- , simpleInline (string "??") (Cite [])
- , simpleInline (string "**") Strong
- , simpleInline (string "__") Emph
- , simpleInline (char '*') Strong
- , simpleInline (char '_') Emph
- , simpleInline (char '-') Strikeout
- , simpleInline (char '^') Superscript
- , simpleInline (char '~') Subscript
+ , try $ (char '[' *> inlineMarkup <* char ']')
+ , inlineMarkup
, link
, image
, mark
@@ -375,6 +361,18 @@ inlineParsers = [ autoLink
, symbol
]
+-- | Inline markups
+inlineMarkup :: GenParser Char ParserState Inline
+inlineMarkup = choice [ simpleInline (string "??") (Cite [])
+ , simpleInline (string "**") Strong
+ , simpleInline (string "__") Emph
+ , simpleInline (char '*') Strong
+ , simpleInline (char '_') Emph
+ , simpleInline (char '-') Strikeout
+ , simpleInline (char '^') Superscript
+ , simpleInline (char '~') Subscript
+ ]
+
-- | Trademark, registered, copyright
mark :: GenParser Char st Inline
mark = try $ char '(' >> (try tm <|> try reg <|> copy)
@@ -400,33 +398,49 @@ copy = do
note :: GenParser Char ParserState Inline
note = try $ do
- char '['
- ref <- many1 digit
- char ']'
- state <- getState
- let notes = stateNotes state
+ ref <- (char '[' *> many1 digit <* char ']')
+ notes <- stateNotes <$> getState
case lookup ref notes of
Nothing -> fail "note not found"
Just raw -> liftM Note $ parseFromString parseBlocks raw
+-- | Special chars
+markupChars :: [Char]
+markupChars = "\\[]*#_@~-+^|%="
+
+-- | Break strings on following chars. Space tab and newline break for
+-- inlines breaking. Open paren breaks for mark. Quote, dash and dot
+-- break for smart punctuation. Punctuation breaks for regular
+-- punctuation. Double quote breaks for named links. > and < break
+-- for inline html.
+stringBreakers :: [Char]
+stringBreakers = " \t\n('-.,:!?;\"<>"
+
+wordBoundaries :: [Char]
+wordBoundaries = markupChars ++ stringBreakers
+
+-- | Parse a hyphened sequence of words
+hyphenedWords :: GenParser Char ParserState String
+hyphenedWords = try $ do
+ hd <- noneOf wordBoundaries
+ tl <- many ( (noneOf wordBoundaries) <|>
+ try (oneOf markupChars <* lookAhead (noneOf wordBoundaries) ) )
+ let wd = hd:tl
+ option wd $ try $
+ (\r -> concat [wd, "-", r]) <$> (char '-' *> hyphenedWords)
+
-- | Any string
str :: GenParser Char ParserState Inline
str = do
- xs <- many1 (noneOf (specialChars ++ "\t\n "))
- optional $ try $ do
- lookAhead (char '(')
- notFollowedBy' mark
- getInput >>= setInput . (' ':) -- add space before acronym explanation
- -- parse a following hyphen if followed by a letter
- -- (this prevents unwanted interpretation as starting a strikeout section)
- result <- option xs $ try $ do
- char '-'
- next <- lookAhead letter
- guard $ isLetter (last xs) || isLetter next
- return $ xs ++ "-"
- pos <- getPosition
- updateState $ \s -> s{ stateLastStrPos = Just pos }
- return $ Str result
+ baseStr <- hyphenedWords
+ -- RedCloth compliance : if parsed word is uppercase and immediatly
+ -- followed by parens, parens content is unconditionally word acronym
+ fullStr <- option baseStr $ try $ do
+ guard $ all isUpper baseStr
+ acro <- enclosed (char '(') (char ')') anyChar
+ return $ concat [baseStr, " (", acro, ")"]
+ updateLastStrPos
+ return $ Str fullStr
-- | Textile allows HTML span infos, we discard them
htmlSpan :: GenParser Char ParserState Inline
@@ -477,34 +491,36 @@ image = try $ do
escapedInline :: GenParser Char ParserState Inline
escapedInline = escapedEqs <|> escapedTag
--- | literal text escaped between == ... ==
escapedEqs :: GenParser Char ParserState Inline
-escapedEqs = try $ do
- string "=="
- contents <- manyTill anyChar (try $ string "==")
- return $ Str contents
+escapedEqs = Str <$> (try $ surrounded (string "==") anyChar)
+
+-- -- | literal text escaped between == ... ==
+-- escapedEqs :: GenParser Char ParserState Inline
+-- escapedEqs = try $ do
+-- string "=="
+-- contents <- manyTill anyChar (try $ string "==")
+-- return $ Str contents
-- | literal text escaped btw <notextile> tags
escapedTag :: GenParser Char ParserState Inline
-escapedTag = try $ Str <$> ( string "<notextile>" *>
- manyTill anyChar (try $ string "</notextile>") )
+escapedTag = try $ Str <$>
+ enclosed (string "<notextile>") (string "</notextile>") anyChar
--- | Any special symbol defined in specialChars
+-- | Any special symbol defined in wordBoundaries
symbol :: GenParser Char ParserState Inline
-symbol = Str . singleton <$> oneOf specialChars
+symbol = Str . singleton <$> oneOf wordBoundaries
-- | Inline code
code :: GenParser Char ParserState Inline
code = code1 <|> code2
code1 :: GenParser Char ParserState Inline
-code1 = surrounded (char '@') anyChar >>= return . Code nullAttr
+code1 = Code nullAttr <$> surrounded (char '@') anyChar
code2 :: GenParser Char ParserState Inline
code2 = do
htmlTag (tagOpen (=="tt") null)
- result' <- manyTill anyChar (try $ htmlTag $ tagClose (=="tt"))
- return $ Code nullAttr result'
+ Code nullAttr <$> manyTill anyChar (try $ htmlTag $ tagClose (=="tt"))
-- | Html / CSS attributes
attributes :: GenParser Char ParserState String
@@ -528,4 +544,4 @@ simpleInline border construct = surrounded border (inlineWithAttribute) >>=
-- | Create a singleton list
singleton :: a -> [a]
-singleton x = [x] \ No newline at end of file
+singleton x = [x]