From 6424e7d02c8800a1964d0ae26a523597b8a365fa Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 23 Jul 2011 12:35:01 -0700 Subject: Properly handle characters in the 128..159 range. These aren't valid in HTML, but many HTML files produced by Windows tools contain them. We substitute correct unicode characters. --- src/Text/Pandoc/Parsing.hs | 14 +++++++------- src/Text/Pandoc/Readers/HTML.hs | 43 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 48 insertions(+), 9 deletions(-) (limited to 'src/Text/Pandoc') diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 187343f9c..c16d99bdf 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -758,7 +758,7 @@ charOrRef cs = singleQuoteStart :: GenParser Char ParserState () singleQuoteStart = do failIfInQuoteContext InSingleQuote - try $ do charOrRef "'\8216" + try $ do charOrRef "'\8216\145" notFollowedBy (oneOf ")!],.;:-? \t\n") notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >> satisfy (not . isAlphaNum))) @@ -767,23 +767,23 @@ singleQuoteStart = do singleQuoteEnd :: GenParser Char st () singleQuoteEnd = try $ do - charOrRef "'\8217" + charOrRef "'\8217\146" notFollowedBy alphaNum doubleQuoteStart :: GenParser Char ParserState () doubleQuoteStart = do failIfInQuoteContext InDoubleQuote - try $ do charOrRef "\"\8220" + try $ do charOrRef "\"\8220\147" notFollowedBy (satisfy (\c -> c == ' ' || c == '\t' || c == '\n')) doubleQuoteEnd :: GenParser Char st () doubleQuoteEnd = do - charOrRef "\"\8221" + charOrRef "\"\8221\148" return () ellipses :: GenParser Char st Inline ellipses = do - try (charOrRef "…") <|> try (string "..." >> return '…') + try (charOrRef "…\133") <|> try (string "..." >> return '…') return Ellipses dash :: GenParser Char st Inline @@ -791,13 +791,13 @@ dash = enDash <|> emDash enDash :: GenParser Char st Inline enDash = do - try (charOrRef "–") <|> + try (charOrRef "–\150") <|> try (char '-' >> lookAhead (satisfy isDigit) >> return '–') return EnDash emDash :: GenParser Char st Inline emDash = do - try (charOrRef "—") <|> (try $ string "--" >> optional (char '-') >> return '—') + try (charOrRef "—\151") <|> (try $ string "--" >> optional (char '-') >> return '—') return EmDash -- diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 2fd6d88bf..7c882f680 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -416,10 +416,12 @@ pBlank = try $ do guard $ all isSpace str pTagContents :: GenParser Char ParserState Inline -pTagContents = pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol +pTagContents = + pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad pStr :: GenParser Char ParserState Inline -pStr = liftM Str $ many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) +pStr = liftM Str $ many1 $ satisfy $ \c -> + not (isSpace c) && not (isSpecial c) && not (isBad c) isSpecial :: Char -> Bool isSpecial '"' = True @@ -435,6 +437,43 @@ isSpecial _ = False pSymbol :: GenParser Char ParserState Inline pSymbol = satisfy isSpecial >>= return . Str . (:[]) +isBad :: Char -> Bool +isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML + +pBad :: GenParser Char ParserState Inline +pBad = do + c <- satisfy isBad + let c' = case c of + '\128' -> '\8364' + '\130' -> '\8218' + '\131' -> '\402' + '\132' -> '\8222' + '\133' -> '\8230' + '\134' -> '\8224' + '\135' -> '\8225' + '\136' -> '\710' + '\137' -> '\8240' + '\138' -> '\352' + '\139' -> '\8249' + '\140' -> '\338' + '\142' -> '\381' + '\145' -> '\8216' + '\146' -> '\8217' + '\147' -> '\8220' + '\148' -> '\8221' + '\149' -> '\8226' + '\150' -> '\8211' + '\151' -> '\8212' + '\152' -> '\732' + '\153' -> '\8482' + '\154' -> '\353' + '\155' -> '\8250' + '\156' -> '\339' + '\158' -> '\382' + '\159' -> '\376' + _ -> '?' + return $ Str [c'] + pSpace :: GenParser Char ParserState Inline pSpace = many1 (satisfy isSpace) >> return Space -- cgit v1.2.3