diff options
author | John MacFarlane <jgm@berkeley.edu> | 2011-07-23 12:35:01 -0700 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2011-07-23 12:43:01 -0700 |
commit | 6424e7d02c8800a1964d0ae26a523597b8a365fa (patch) | |
tree | 9a616458b5a912cb625ce4514351a8f7c47eef75 /src/Text | |
parent | 26418b7d14ce04a7386392388d2a3cbded205705 (diff) | |
download | pandoc-6424e7d02c8800a1964d0ae26a523597b8a365fa.tar.gz |
Properly handle characters in the 128..159 range.
These aren't valid in HTML, but many HTML files produced by
Windows tools contain them. We substitute correct unicode
characters.
Diffstat (limited to 'src/Text')
-rw-r--r-- | src/Text/Pandoc/Parsing.hs | 14 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 43 |
2 files changed, 48 insertions, 9 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 187343f9c..c16d99bdf 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -758,7 +758,7 @@ charOrRef cs = singleQuoteStart :: GenParser Char ParserState () singleQuoteStart = do failIfInQuoteContext InSingleQuote - try $ do charOrRef "'\8216" + try $ do charOrRef "'\8216\145" notFollowedBy (oneOf ")!],.;:-? \t\n") notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >> satisfy (not . isAlphaNum))) @@ -767,23 +767,23 @@ singleQuoteStart = do singleQuoteEnd :: GenParser Char st () singleQuoteEnd = try $ do - charOrRef "'\8217" + charOrRef "'\8217\146" notFollowedBy alphaNum doubleQuoteStart :: GenParser Char ParserState () doubleQuoteStart = do failIfInQuoteContext InDoubleQuote - try $ do charOrRef "\"\8220" + try $ do charOrRef "\"\8220\147" notFollowedBy (satisfy (\c -> c == ' ' || c == '\t' || c == '\n')) doubleQuoteEnd :: GenParser Char st () doubleQuoteEnd = do - charOrRef "\"\8221" + charOrRef "\"\8221\148" return () ellipses :: GenParser Char st Inline ellipses = do - try (charOrRef "…") <|> try (string "..." >> return '…') + try (charOrRef "…\133") <|> try (string "..." >> return '…') return Ellipses dash :: GenParser Char st Inline @@ -791,13 +791,13 @@ dash = enDash <|> emDash enDash :: GenParser Char st Inline enDash = do - try (charOrRef "–") <|> + try (charOrRef "–\150") <|> try (char '-' >> lookAhead (satisfy isDigit) >> return '–') return EnDash emDash :: GenParser Char st Inline emDash = do - try (charOrRef "—") <|> (try $ string "--" >> optional (char '-') >> return '—') + try (charOrRef "—\151") <|> (try $ string "--" >> optional (char '-') >> return '—') return EmDash -- diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 2fd6d88bf..7c882f680 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -416,10 +416,12 @@ pBlank = try $ do guard $ all isSpace str pTagContents :: GenParser Char ParserState Inline -pTagContents = pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol +pTagContents = + pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad pStr :: GenParser Char ParserState Inline -pStr = liftM Str $ many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) +pStr = liftM Str $ many1 $ satisfy $ \c -> + not (isSpace c) && not (isSpecial c) && not (isBad c) isSpecial :: Char -> Bool isSpecial '"' = True @@ -435,6 +437,43 @@ isSpecial _ = False pSymbol :: GenParser Char ParserState Inline pSymbol = satisfy isSpecial >>= return . Str . (:[]) +isBad :: Char -> Bool +isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML + +pBad :: GenParser Char ParserState Inline +pBad = do + c <- satisfy isBad + let c' = case c of + '\128' -> '\8364' + '\130' -> '\8218' + '\131' -> '\402' + '\132' -> '\8222' + '\133' -> '\8230' + '\134' -> '\8224' + '\135' -> '\8225' + '\136' -> '\710' + '\137' -> '\8240' + '\138' -> '\352' + '\139' -> '\8249' + '\140' -> '\338' + '\142' -> '\381' + '\145' -> '\8216' + '\146' -> '\8217' + '\147' -> '\8220' + '\148' -> '\8221' + '\149' -> '\8226' + '\150' -> '\8211' + '\151' -> '\8212' + '\152' -> '\732' + '\153' -> '\8482' + '\154' -> '\353' + '\155' -> '\8250' + '\156' -> '\339' + '\158' -> '\382' + '\159' -> '\376' + _ -> '?' + return $ Str [c'] + pSpace :: GenParser Char ParserState Inline pSpace = many1 (satisfy isSpace) >> return Space |