aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/Text/Pandoc/Parsing.hs14
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs43
2 files changed, 48 insertions, 9 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index 187343f9c..c16d99bdf 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -758,7 +758,7 @@ charOrRef cs =
singleQuoteStart :: GenParser Char ParserState ()
singleQuoteStart = do
failIfInQuoteContext InSingleQuote
- try $ do charOrRef "'\8216"
+ try $ do charOrRef "'\8216\145"
notFollowedBy (oneOf ")!],.;:-? \t\n")
notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >>
satisfy (not . isAlphaNum)))
@@ -767,23 +767,23 @@ singleQuoteStart = do
singleQuoteEnd :: GenParser Char st ()
singleQuoteEnd = try $ do
- charOrRef "'\8217"
+ charOrRef "'\8217\146"
notFollowedBy alphaNum
doubleQuoteStart :: GenParser Char ParserState ()
doubleQuoteStart = do
failIfInQuoteContext InDoubleQuote
- try $ do charOrRef "\"\8220"
+ try $ do charOrRef "\"\8220\147"
notFollowedBy (satisfy (\c -> c == ' ' || c == '\t' || c == '\n'))
doubleQuoteEnd :: GenParser Char st ()
doubleQuoteEnd = do
- charOrRef "\"\8221"
+ charOrRef "\"\8221\148"
return ()
ellipses :: GenParser Char st Inline
ellipses = do
- try (charOrRef "…") <|> try (string "..." >> return '…')
+ try (charOrRef "…\133") <|> try (string "..." >> return '…')
return Ellipses
dash :: GenParser Char st Inline
@@ -791,13 +791,13 @@ dash = enDash <|> emDash
enDash :: GenParser Char st Inline
enDash = do
- try (charOrRef "–") <|>
+ try (charOrRef "–\150") <|>
try (char '-' >> lookAhead (satisfy isDigit) >> return '–')
return EnDash
emDash :: GenParser Char st Inline
emDash = do
- try (charOrRef "—") <|> (try $ string "--" >> optional (char '-') >> return '—')
+ try (charOrRef "—\151") <|> (try $ string "--" >> optional (char '-') >> return '—')
return EmDash
--
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 2fd6d88bf..7c882f680 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -416,10 +416,12 @@ pBlank = try $ do
guard $ all isSpace str
pTagContents :: GenParser Char ParserState Inline
-pTagContents = pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol
+pTagContents =
+ pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad
pStr :: GenParser Char ParserState Inline
-pStr = liftM Str $ many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c)
+pStr = liftM Str $ many1 $ satisfy $ \c ->
+ not (isSpace c) && not (isSpecial c) && not (isBad c)
isSpecial :: Char -> Bool
isSpecial '"' = True
@@ -435,6 +437,43 @@ isSpecial _ = False
pSymbol :: GenParser Char ParserState Inline
pSymbol = satisfy isSpecial >>= return . Str . (:[])
+isBad :: Char -> Bool
+isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
+
+pBad :: GenParser Char ParserState Inline
+pBad = do
+ c <- satisfy isBad
+ let c' = case c of
+ '\128' -> '\8364'
+ '\130' -> '\8218'
+ '\131' -> '\402'
+ '\132' -> '\8222'
+ '\133' -> '\8230'
+ '\134' -> '\8224'
+ '\135' -> '\8225'
+ '\136' -> '\710'
+ '\137' -> '\8240'
+ '\138' -> '\352'
+ '\139' -> '\8249'
+ '\140' -> '\338'
+ '\142' -> '\381'
+ '\145' -> '\8216'
+ '\146' -> '\8217'
+ '\147' -> '\8220'
+ '\148' -> '\8221'
+ '\149' -> '\8226'
+ '\150' -> '\8211'
+ '\151' -> '\8212'
+ '\152' -> '\732'
+ '\153' -> '\8482'
+ '\154' -> '\353'
+ '\155' -> '\8250'
+ '\156' -> '\339'
+ '\158' -> '\382'
+ '\159' -> '\376'
+ _ -> '?'
+ return $ Str [c']
+
pSpace :: GenParser Char ParserState Inline
pSpace = many1 (satisfy isSpace) >> return Space