diff options
author | John MacFarlane <jgm@berkeley.edu> | 2011-07-23 12:35:01 -0700 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2011-07-23 12:43:01 -0700 |
commit | 6424e7d02c8800a1964d0ae26a523597b8a365fa (patch) | |
tree | 9a616458b5a912cb625ce4514351a8f7c47eef75 /src/Text/Pandoc/Readers | |
parent | 26418b7d14ce04a7386392388d2a3cbded205705 (diff) | |
download | pandoc-6424e7d02c8800a1964d0ae26a523597b8a365fa.tar.gz |
Properly handle characters in the 128..159 range.
These aren't valid in HTML, but many HTML files produced by
Windows tools contain them. We substitute correct unicode
characters.
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 43 |
1 files changed, 41 insertions, 2 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 2fd6d88bf..7c882f680 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -416,10 +416,12 @@ pBlank = try $ do guard $ all isSpace str pTagContents :: GenParser Char ParserState Inline -pTagContents = pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol +pTagContents = + pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad pStr :: GenParser Char ParserState Inline -pStr = liftM Str $ many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) +pStr = liftM Str $ many1 $ satisfy $ \c -> + not (isSpace c) && not (isSpecial c) && not (isBad c) isSpecial :: Char -> Bool isSpecial '"' = True @@ -435,6 +437,43 @@ isSpecial _ = False pSymbol :: GenParser Char ParserState Inline pSymbol = satisfy isSpecial >>= return . Str . (:[]) +isBad :: Char -> Bool +isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML + +pBad :: GenParser Char ParserState Inline +pBad = do + c <- satisfy isBad + let c' = case c of + '\128' -> '\8364' + '\130' -> '\8218' + '\131' -> '\402' + '\132' -> '\8222' + '\133' -> '\8230' + '\134' -> '\8224' + '\135' -> '\8225' + '\136' -> '\710' + '\137' -> '\8240' + '\138' -> '\352' + '\139' -> '\8249' + '\140' -> '\338' + '\142' -> '\381' + '\145' -> '\8216' + '\146' -> '\8217' + '\147' -> '\8220' + '\148' -> '\8221' + '\149' -> '\8226' + '\150' -> '\8211' + '\151' -> '\8212' + '\152' -> '\732' + '\153' -> '\8482' + '\154' -> '\353' + '\155' -> '\8250' + '\156' -> '\339' + '\158' -> '\382' + '\159' -> '\376' + _ -> '?' + return $ Str [c'] + pSpace :: GenParser Char ParserState Inline pSpace = many1 (satisfy isSpace) >> return Space |