aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2011-07-23 12:35:01 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2011-07-23 12:43:01 -0700
commit6424e7d02c8800a1964d0ae26a523597b8a365fa (patch)
tree9a616458b5a912cb625ce4514351a8f7c47eef75 /src/Text/Pandoc/Readers
parent26418b7d14ce04a7386392388d2a3cbded205705 (diff)
downloadpandoc-6424e7d02c8800a1964d0ae26a523597b8a365fa.tar.gz
Properly handle characters in the 128..159 range.
These aren't valid in HTML, but many HTML files produced by Windows tools contain them. We substitute correct unicode characters.
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs43
1 files changed, 41 insertions, 2 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 2fd6d88bf..7c882f680 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -416,10 +416,12 @@ pBlank = try $ do
guard $ all isSpace str
pTagContents :: GenParser Char ParserState Inline
-pTagContents = pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol
+pTagContents =
+ pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad
pStr :: GenParser Char ParserState Inline
-pStr = liftM Str $ many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c)
+pStr = liftM Str $ many1 $ satisfy $ \c ->
+ not (isSpace c) && not (isSpecial c) && not (isBad c)
isSpecial :: Char -> Bool
isSpecial '"' = True
@@ -435,6 +437,43 @@ isSpecial _ = False
pSymbol :: GenParser Char ParserState Inline
pSymbol = satisfy isSpecial >>= return . Str . (:[])
+isBad :: Char -> Bool
+isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
+
+pBad :: GenParser Char ParserState Inline
+pBad = do
+ c <- satisfy isBad
+ let c' = case c of
+ '\128' -> '\8364'
+ '\130' -> '\8218'
+ '\131' -> '\402'
+ '\132' -> '\8222'
+ '\133' -> '\8230'
+ '\134' -> '\8224'
+ '\135' -> '\8225'
+ '\136' -> '\710'
+ '\137' -> '\8240'
+ '\138' -> '\352'
+ '\139' -> '\8249'
+ '\140' -> '\338'
+ '\142' -> '\381'
+ '\145' -> '\8216'
+ '\146' -> '\8217'
+ '\147' -> '\8220'
+ '\148' -> '\8221'
+ '\149' -> '\8226'
+ '\150' -> '\8211'
+ '\151' -> '\8212'
+ '\152' -> '\732'
+ '\153' -> '\8482'
+ '\154' -> '\353'
+ '\155' -> '\8250'
+ '\156' -> '\339'
+ '\158' -> '\382'
+ '\159' -> '\376'
+ _ -> '?'
+ return $ Str [c']
+
pSpace :: GenParser Char ParserState Inline
pSpace = many1 (satisfy isSpace) >> return Space