From 6424e7d02c8800a1964d0ae26a523597b8a365fa Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Sat, 23 Jul 2011 12:35:01 -0700
Subject: Properly handle characters in the 128..159 range.

These aren't valid in HTML, but many HTML files produced by
Windows tools contain them.  We substitute correct unicode
characters.
---
 src/Text/Pandoc/Parsing.hs      | 14 +++++++-------
 src/Text/Pandoc/Readers/HTML.hs | 43 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 48 insertions(+), 9 deletions(-)

(limited to 'src/Text/Pandoc')

diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index 187343f9c..c16d99bdf 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -758,7 +758,7 @@ charOrRef cs =
 singleQuoteStart :: GenParser Char ParserState ()
 singleQuoteStart = do 
   failIfInQuoteContext InSingleQuote
-  try $ do charOrRef "'\8216"
+  try $ do charOrRef "'\8216\145"
            notFollowedBy (oneOf ")!],.;:-? \t\n")
            notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >>
                                satisfy (not . isAlphaNum))) 
@@ -767,23 +767,23 @@ singleQuoteStart = do
 
 singleQuoteEnd :: GenParser Char st ()
 singleQuoteEnd = try $ do
-  charOrRef "'\8217"
+  charOrRef "'\8217\146"
   notFollowedBy alphaNum
 
 doubleQuoteStart :: GenParser Char ParserState ()
 doubleQuoteStart = do
   failIfInQuoteContext InDoubleQuote
-  try $ do charOrRef "\"\8220"
+  try $ do charOrRef "\"\8220\147"
            notFollowedBy (satisfy (\c -> c == ' ' || c == '\t' || c == '\n'))
 
 doubleQuoteEnd :: GenParser Char st ()
 doubleQuoteEnd = do
-  charOrRef "\"\8221"
+  charOrRef "\"\8221\148"
   return ()
 
 ellipses :: GenParser Char st Inline
 ellipses = do
-  try (charOrRef "…") <|> try (string "..." >> return '…')
+  try (charOrRef "…\133") <|> try (string "..." >> return '…')
   return Ellipses
 
 dash :: GenParser Char st Inline
@@ -791,13 +791,13 @@ dash = enDash <|> emDash
 
 enDash :: GenParser Char st Inline
 enDash = do
-  try (charOrRef "–") <|>
+  try (charOrRef "–\150") <|>
     try (char '-' >> lookAhead (satisfy isDigit) >> return '–')
   return EnDash
 
 emDash :: GenParser Char st Inline
 emDash = do
-  try (charOrRef "—") <|> (try $ string "--" >> optional (char '-') >> return '—')
+  try (charOrRef "—\151") <|> (try $ string "--" >> optional (char '-') >> return '—')
   return EmDash
 
 --
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 2fd6d88bf..7c882f680 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -416,10 +416,12 @@ pBlank = try $ do
   guard $ all isSpace str
 
 pTagContents :: GenParser Char ParserState Inline
-pTagContents =  pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol
+pTagContents =
+  pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad
 
 pStr :: GenParser Char ParserState Inline
-pStr = liftM Str $ many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c)
+pStr = liftM Str $ many1 $ satisfy $ \c ->
+           not (isSpace c) && not (isSpecial c) && not (isBad c)
 
 isSpecial :: Char -> Bool
 isSpecial '"' = True
@@ -435,6 +437,43 @@ isSpecial _ = False
 pSymbol :: GenParser Char ParserState Inline
 pSymbol = satisfy isSpecial >>= return . Str . (:[])
 
+isBad :: Char -> Bool
+isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
+
+pBad :: GenParser Char ParserState Inline
+pBad = do
+  c <- satisfy isBad
+  let c' = case c of
+                '\128' -> '\8364'
+                '\130' -> '\8218'
+                '\131' -> '\402'
+                '\132' -> '\8222'
+                '\133' -> '\8230'
+                '\134' -> '\8224'
+                '\135' -> '\8225'
+                '\136' -> '\710'
+                '\137' -> '\8240'
+                '\138' -> '\352'
+                '\139' -> '\8249'
+                '\140' -> '\338'
+                '\142' -> '\381'
+                '\145' -> '\8216'
+                '\146' -> '\8217'
+                '\147' -> '\8220'
+                '\148' -> '\8221'
+                '\149' -> '\8226'
+                '\150' -> '\8211'
+                '\151' -> '\8212'
+                '\152' -> '\732'
+                '\153' -> '\8482'
+                '\154' -> '\353'
+                '\155' -> '\8250'
+                '\156' -> '\339'
+                '\158' -> '\382'
+                '\159' -> '\376'
+                _      -> '?'
+  return $ Str [c']
+
 pSpace :: GenParser Char ParserState Inline
 pSpace = many1 (satisfy isSpace) >> return Space
 
-- 
cgit v1.2.3