aboutsummaryrefslogtreecommitdiff
path: root/src/Text
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2011-07-23 12:35:01 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2011-07-23 12:43:01 -0700
commit6424e7d02c8800a1964d0ae26a523597b8a365fa (patch)
tree9a616458b5a912cb625ce4514351a8f7c47eef75 /src/Text
parent26418b7d14ce04a7386392388d2a3cbded205705 (diff)
downloadpandoc-6424e7d02c8800a1964d0ae26a523597b8a365fa.tar.gz
Properly handle characters in the 128..159 range.
These aren't valid in HTML, but many HTML files produced by Windows tools contain them. We substitute correct unicode characters.
Diffstat (limited to 'src/Text')
-rw-r--r--src/Text/Pandoc/Parsing.hs14
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs43
2 files changed, 48 insertions, 9 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index 187343f9c..c16d99bdf 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -758,7 +758,7 @@ charOrRef cs =
singleQuoteStart :: GenParser Char ParserState ()
singleQuoteStart = do
failIfInQuoteContext InSingleQuote
- try $ do charOrRef "'\8216"
+ try $ do charOrRef "'\8216\145"
notFollowedBy (oneOf ")!],.;:-? \t\n")
notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >>
satisfy (not . isAlphaNum)))
@@ -767,23 +767,23 @@ singleQuoteStart = do
singleQuoteEnd :: GenParser Char st ()
singleQuoteEnd = try $ do
- charOrRef "'\8217"
+ charOrRef "'\8217\146"
notFollowedBy alphaNum
doubleQuoteStart :: GenParser Char ParserState ()
doubleQuoteStart = do
failIfInQuoteContext InDoubleQuote
- try $ do charOrRef "\"\8220"
+ try $ do charOrRef "\"\8220\147"
notFollowedBy (satisfy (\c -> c == ' ' || c == '\t' || c == '\n'))
doubleQuoteEnd :: GenParser Char st ()
doubleQuoteEnd = do
- charOrRef "\"\8221"
+ charOrRef "\"\8221\148"
return ()
ellipses :: GenParser Char st Inline
ellipses = do
- try (charOrRef "…") <|> try (string "..." >> return '…')
+ try (charOrRef "…\133") <|> try (string "..." >> return '…')
return Ellipses
dash :: GenParser Char st Inline
@@ -791,13 +791,13 @@ dash = enDash <|> emDash
enDash :: GenParser Char st Inline
enDash = do
- try (charOrRef "–") <|>
+ try (charOrRef "–\150") <|>
try (char '-' >> lookAhead (satisfy isDigit) >> return '–')
return EnDash
emDash :: GenParser Char st Inline
emDash = do
- try (charOrRef "—") <|> (try $ string "--" >> optional (char '-') >> return '—')
+ try (charOrRef "—\151") <|> (try $ string "--" >> optional (char '-') >> return '—')
return EmDash
--
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 2fd6d88bf..7c882f680 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -416,10 +416,12 @@ pBlank = try $ do
guard $ all isSpace str
pTagContents :: GenParser Char ParserState Inline
-pTagContents = pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol
+pTagContents =
+ pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad
pStr :: GenParser Char ParserState Inline
-pStr = liftM Str $ many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c)
+pStr = liftM Str $ many1 $ satisfy $ \c ->
+ not (isSpace c) && not (isSpecial c) && not (isBad c)
isSpecial :: Char -> Bool
isSpecial '"' = True
@@ -435,6 +437,43 @@ isSpecial _ = False
pSymbol :: GenParser Char ParserState Inline
pSymbol = satisfy isSpecial >>= return . Str . (:[])
+isBad :: Char -> Bool
+isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
+
+pBad :: GenParser Char ParserState Inline
+pBad = do
+ c <- satisfy isBad
+ let c' = case c of
+ '\128' -> '\8364'
+ '\130' -> '\8218'
+ '\131' -> '\402'
+ '\132' -> '\8222'
+ '\133' -> '\8230'
+ '\134' -> '\8224'
+ '\135' -> '\8225'
+ '\136' -> '\710'
+ '\137' -> '\8240'
+ '\138' -> '\352'
+ '\139' -> '\8249'
+ '\140' -> '\338'
+ '\142' -> '\381'
+ '\145' -> '\8216'
+ '\146' -> '\8217'
+ '\147' -> '\8220'
+ '\148' -> '\8221'
+ '\149' -> '\8226'
+ '\150' -> '\8211'
+ '\151' -> '\8212'
+ '\152' -> '\732'
+ '\153' -> '\8482'
+ '\154' -> '\353'
+ '\155' -> '\8250'
+ '\156' -> '\339'
+ '\158' -> '\382'
+ '\159' -> '\376'
+ _ -> '?'
+ return $ Str [c']
+
pSpace :: GenParser Char ParserState Inline
pSpace = many1 (satisfy isSpace) >> return Space