From 6424e7d02c8800a1964d0ae26a523597b8a365fa Mon Sep 17 00:00:00 2001
From: John MacFarlane
Date: Sat, 23 Jul 2011 12:35:01 -0700
Subject: Properly handle characters in the 128..159 range.

These aren't valid in HTML, but many HTML files produced by Windows tools
contain them. We substitute correct unicode characters.
---
 src/Text/Pandoc/Readers/HTML.hs | 43 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

(limited to 'src/Text/Pandoc/Readers')

diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 2fd6d88bf..7c882f680 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -416,10 +416,12 @@ pBlank = try $ do
   guard $ all isSpace str
 
 pTagContents :: GenParser Char ParserState Inline
-pTagContents = pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol
+pTagContents =
+  pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad
 
 pStr :: GenParser Char ParserState Inline
-pStr = liftM Str $ many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c)
+pStr = liftM Str $ many1 $ satisfy $ \c ->
+         not (isSpace c) && not (isSpecial c) && not (isBad c)
 
 isSpecial :: Char -> Bool
 isSpecial '"' = True
@@ -435,6 +437,43 @@ isSpecial _ = False
 pSymbol :: GenParser Char ParserState Inline
 pSymbol = satisfy isSpecial >>= return . Str . (:[])
 
+isBad :: Char -> Bool
+isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
+
+pBad :: GenParser Char ParserState Inline
+pBad = do
+  c <- satisfy isBad
+  let c' = case c of
+                '\128' -> '\8364'
+                '\130' -> '\8218'
+                '\131' -> '\402'
+                '\132' -> '\8222'
+                '\133' -> '\8230'
+                '\134' -> '\8224'
+                '\135' -> '\8225'
+                '\136' -> '\710'
+                '\137' -> '\8240'
+                '\138' -> '\352'
+                '\139' -> '\8249'
+                '\140' -> '\338'
+                '\142' -> '\381'
+                '\145' -> '\8216'
+                '\146' -> '\8217'
+                '\147' -> '\8220'
+                '\148' -> '\8221'
+                '\149' -> '\8226'
+                '\150' -> '\8211'
+                '\151' -> '\8212'
+                '\152' -> '\732'
+                '\153' -> '\8482'
+                '\154' -> '\353'
+                '\155' -> '\8250'
+                '\156' -> '\339'
+                '\158' -> '\382'
+                '\159' -> '\376'
+                _      -> '?'
+  return $ Str [c']
+
 pSpace :: GenParser Char ParserState Inline
 pSpace = many1 (satisfy isSpace) >> return Space
 
-- 
cgit v1.2.3