diff options
Diffstat (limited to 'Text/XML/Light/Input.hs')
-rw-r--r-- | Text/XML/Light/Input.hs | 307 |
1 files changed, 0 insertions, 307 deletions
diff --git a/Text/XML/Light/Input.hs b/Text/XML/Light/Input.hs deleted file mode 100644 index 3cf3a8cd1..000000000 --- a/Text/XML/Light/Input.hs +++ /dev/null @@ -1,307 +0,0 @@ --------------------------------------------------------------------- --- | --- Module : Text.XML.Light.Input --- Copyright : (c) Galois, Inc. 2007 --- License : BSD3 --- --- Maintainer: Iavor S. Diatchki <diatchki@galois.com> --- Stability : provisional --- Portability: portable --- --- Lightweight XML parsing --- - -module Text.XML.Light.Input (parseXML,parseXMLDoc) where - -import Text.XML.Light.Types -import Text.XML.Light.Proc -import Text.XML.Light.Output(tagEnd) - -import Data.Char(isSpace) -import Data.List(isPrefixOf) -import Numeric(readHex) - --- | parseXMLDoc, parse a XMLl document to maybe an element -parseXMLDoc :: String -> Maybe Element -parseXMLDoc xs = strip (parseXML xs) - where strip cs = case onlyElems cs of - e : es - | "?xml" `isPrefixOf` qName (elName e) - -> strip (map Elem es) - | otherwise -> Just e - _ -> Nothing - --- | parseXML to a list of content chunks -parseXML :: String -> [Content] -parseXML xs = parse $ tokens $ preprocess xs - ------------------------------------------------------------------------- - -parse :: [Token] -> [Content] -parse [] = [] -parse ts = let (es,_,ts1) = nodes ([],Nothing) [] ts - in es ++ parse ts1 - --- Information about namespaces. --- The first component is a map that associates prefixes to URIs, --- the second is the URI for the default namespace, if one was provided. -type NSInfo = ([(String,String)],Maybe String) - -nodes :: NSInfo -> [QName] -> [Token] -> ([Content], [QName], [Token]) - -nodes ns ps (TokCRef ref : ts) = - let (es,qs,ts1) = nodes ns ps ts - in (CRef ref : es, qs, ts1) - -nodes ns ps (TokText txt : ts) = - let (es,qs,ts1) = nodes ns ps ts - (more,es1) = case es of - Text cd : es1' - | cdVerbatim cd == cdVerbatim txt -> (cdData cd,es1') - _ -> ([],es) - - in (Text txt { cdData = cdData txt ++ more } : es1, qs, ts1) - -nodes cur_info ps (TokStart p t as empty : ts) = (node : siblings, open, toks) - where - new_name = annotName new_info t - new_info = foldr addNS cur_info as - node = Elem Element { elLine = Just p - , elName = new_name - , elAttribs = map (annotAttr new_info) as - , elContent = children - } - - (children,(siblings,open,toks)) - | empty = ([], nodes cur_info ps ts) - | otherwise = let (es1,qs1,ts1) = nodes new_info (new_name:ps) ts - in (es1, - case qs1 of - [] -> nodes cur_info ps ts1 - _ : qs3 -> ([],qs3,ts1)) - -nodes ns ps (TokEnd p t : ts) = let t1 = annotName ns t - in case break (t1 ==) ps of - (as,_:_) -> ([],as,ts) - -- Unknown closing tag. Insert as text. - (_,[]) -> - let (es,qs,ts1) = nodes ns ps ts - in (Text CData { - cdLine = Just p, - cdVerbatim = CDataText, - cdData = tagEnd t "" - } : es,qs, ts1) - -nodes _ ps [] = ([],ps,[]) - - -annotName :: NSInfo -> QName -> QName -annotName (namespaces,def_ns) n = - n { qURI = maybe def_ns (`lookup` namespaces) (qPrefix n) } - -annotAttr :: NSInfo -> Attr -> Attr -annotAttr ns a@(Attr { attrKey = k}) = - case (qPrefix k, qName k) of - (Nothing,"xmlns") -> a - _ -> a { attrKey = annotName ns k } - -addNS :: Attr -> NSInfo -> NSInfo -addNS (Attr { attrKey = key, attrVal = val }) (ns,def) = - case (qPrefix key, qName key) of - (Nothing,"xmlns") -> (ns, if null val then Nothing else Just val) - (Just "xmlns", k) -> ((k, val) : ns, def) - _ -> (ns,def) - - --- Lexer ----------------------------------------------------------------------- - -type LChar = (Line,Char) -type LString = [LChar] -data Token = TokStart Line QName [Attr] Bool -- is empty? - | TokEnd Line QName - | TokCRef String - | TokText CData - deriving Show - -tokens :: String -> [Token] -tokens = tokens' . linenumber 1 - -tokens' :: LString -> [Token] -tokens' ((_,'<') : c@(_,'!') : cs) = special c cs - -tokens' ((_,'<') : cs) = tag (dropSpace cs) -- we are being nice here -tokens' [] = [] -tokens' cs@((l,_):_) = let (as,bs) = breakn ('<' ==) cs - in map cvt (decode_text as) ++ tokens' bs - - -- XXX: Note, some of the lines might be a bit inacuarate - where cvt (TxtBit x) = TokText CData { cdLine = Just l - , cdVerbatim = CDataText - , cdData = x - } - cvt (CRefBit x) = case cref_to_char x of - Just c -> TokText CData { cdLine = Just l - , cdVerbatim = CDataText - , cdData = [c] - } - Nothing -> TokCRef x - - -special :: LChar -> LString -> [Token] -special _ ((_,'-') : (_,'-') : cs) = skip cs - where skip ((_,'-') : (_,'-') : (_,'>') : ds) = tokens' ds - skip (_ : ds) = skip ds - skip [] = [] -- unterminated comment - -special c ((_,'[') : (_,'C') : (_,'D') : (_,'A') : (_,'T') : (_,'A') : (_,'[') - : cs) = - let (xs,ts) = cdata cs - in TokText CData { cdLine = Just (fst c), cdVerbatim = CDataVerbatim, cdData = xs } - : tokens' ts - where cdata ((_,']') : (_,']') : (_,'>') : ds) = ([],ds) - cdata ((_,d) : ds) = let (xs,ys) = cdata ds in (d:xs,ys) - cdata [] = ([],[]) - -special c cs = - let (xs,ts) = munch "" 0 cs - in TokText CData { cdLine = Just (fst c), cdVerbatim = CDataRaw, cdData = '<':'!':(reverse xs) } : tokens' ts - where munch acc nesting ((_,'>') : ds) - | nesting == (0::Int) = ('>':acc,ds) - | otherwise = munch ('>':acc) (nesting-1) ds - munch acc nesting ((_,'<') : ds) - = munch ('<':acc) (nesting+1) ds - munch acc n ((_,x) : ds) = munch (x:acc) n ds - munch acc _ [] = (acc,[]) -- unterminated DTD markup - ---special c cs = tag (c : cs) -- invalid specials are processed as tags - - -qualName :: LString -> (QName,LString) -qualName xs = let (as,bs) = breakn endName xs - (q,n) = case break (':'==) as of - (q1,_:n1) -> (Just q1, n1) - _ -> (Nothing, as) - in (QName { qURI = Nothing, qPrefix = q, qName = n }, bs) - where endName x = isSpace x || x == '=' || x == '>' || x == '/' - - - - - -tag :: LString -> [Token] -tag ((p,'/') : cs) = let (n,ds) = qualName (dropSpace cs) - in TokEnd p n : case ds of - (_,'>') : es -> tokens' es - -- tag was not properly closed... - _ -> tokens' ds -tag [] = [] -tag cs = let (n,ds) = qualName cs - (as,b,ts) = attribs (dropSpace ds) - in TokStart (fst (head cs)) n as b : ts - -attribs :: LString -> ([Attr], Bool, [Token]) -attribs cs = case cs of - (_,'>') : ds -> ([], False, tokens' ds) - - (_,'/') : ds -> ([], True, case ds of - (_,'>') : es -> tokens' es - -- insert missing > ... - _ -> tokens' ds) - - (_,'?') : (_,'>') : ds -> ([], True, tokens' ds) - - -- doc ended within a tag.. - [] -> ([],False,[]) - - _ -> let (a,cs1) = attrib cs - (as,b,ts) = attribs cs1 - in (a:as,b,ts) - -attrib :: LString -> (Attr,LString) -attrib cs = let (ks,cs1) = qualName cs - (vs,cs2) = attr_val (dropSpace cs1) - in ((Attr ks (decode_attr vs)),dropSpace cs2) - -attr_val :: LString -> (String,LString) -attr_val ((_,'=') : cs) = string (dropSpace cs) -attr_val cs = ("",cs) - - -dropSpace :: LString -> LString -dropSpace = dropWhile (isSpace . snd) - --- | Match the value for an attribute. For malformed XML we do --- our best to guess the programmer's intention. -string :: LString -> (String,LString) -string ((_,'"') : cs) = break' ('"' ==) cs - --- Allow attributes to be enclosed between ' '. -string ((_,'\'') : cs) = break' ('\'' ==) cs - --- Allow attributes that are not enclosed by anything. -string cs = breakn eos cs - where eos x = isSpace x || x == '>' || x == '/' - - -break' :: (a -> Bool) -> [(b,a)] -> ([a],[(b,a)]) -break' p xs = let (as,bs) = breakn p xs - in (as, case bs of - [] -> [] - _ : cs -> cs) - -breakn :: (a -> Bool) -> [(b,a)] -> ([a],[(b,a)]) -breakn p l = (map snd as,bs) where (as,bs) = break (p . snd) l - - - -decode_attr :: String -> String -decode_attr cs = concatMap cvt (decode_text cs) - where cvt (TxtBit x) = x - cvt (CRefBit x) = case cref_to_char x of - Just c -> [c] - Nothing -> '&' : x ++ ";" - -data Txt = TxtBit String | CRefBit String deriving Show - -decode_text :: [Char] -> [Txt] -decode_text xs@('&' : cs) = case break (';' ==) cs of - (as,_:bs) -> CRefBit as : decode_text bs - _ -> [TxtBit xs] -decode_text [] = [] -decode_text cs = let (as,bs) = break ('&' ==) cs - in TxtBit as : decode_text bs - -cref_to_char :: [Char] -> Maybe Char -cref_to_char cs = case cs of - '#' : ds -> num_esc ds - "lt" -> Just '<' - "gt" -> Just '>' - "amp" -> Just '&' - "apos" -> Just '\'' - "quot" -> Just '"' - _ -> Nothing - -num_esc :: String -> Maybe Char -num_esc cs = case cs of - 'x' : ds -> check (readHex ds) - _ -> check (reads cs) - - where check [(n,"")] = cvt_char n - check _ = Nothing - -cvt_char :: Int -> Maybe Char -cvt_char x - | fromEnum (minBound :: Char) <= x && x <= fromEnum (maxBound::Char) - = Just (toEnum x) - | otherwise = Nothing - -preprocess :: String -> String -preprocess ('\r' : '\n' : cs) = '\n' : preprocess cs -preprocess ('\r' : cs) = '\n' : preprocess cs -preprocess (c : cs) = c : preprocess cs -preprocess [] = [] - -linenumber :: Line -> String -> LString -linenumber _ [] = [] -linenumber n ('\n':s) = n' `seq` ((n,'\n'):linenumber n' s) where n' = n + 1 -linenumber n (c:s) = (n,c) : linenumber n s |