diff options
author | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2008-08-01 00:45:07 +0000 |
---|---|---|
committer | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2008-08-01 00:45:07 +0000 |
commit | 0c6dd105b73e95d61d3a3d3213d8f80f3d8a22d4 (patch) | |
tree | d8cad0cbbdc02fee41bd55ba9938371c8278cf18 /Text/XML/Light/Input.hs | |
parent | 8440385f45e18f43172c0cf4409cf4b8b538bbeb (diff) | |
download | pandoc-0c6dd105b73e95d61d3a3d3213d8f80f3d8a22d4.tar.gz |
Added code for xml library (Text.XML.Light) to source tree,
since there is currently no debian package. Removed
dependency on xml library. Added license to debian/copyright.
git-svn-id: https://pandoc.googlecode.com/svn/trunk@1351 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'Text/XML/Light/Input.hs')
-rw-r--r-- | Text/XML/Light/Input.hs | 307 |
1 files changed, 307 insertions, 0 deletions
diff --git a/Text/XML/Light/Input.hs b/Text/XML/Light/Input.hs new file mode 100644 index 000000000..3cf3a8cd1 --- /dev/null +++ b/Text/XML/Light/Input.hs @@ -0,0 +1,307 @@ +-------------------------------------------------------------------- +-- | +-- Module : Text.XML.Light.Input +-- Copyright : (c) Galois, Inc. 2007 +-- License : BSD3 +-- +-- Maintainer: Iavor S. Diatchki <diatchki@galois.com> +-- Stability : provisional +-- Portability: portable +-- +-- Lightweight XML parsing +-- + +module Text.XML.Light.Input (parseXML,parseXMLDoc) where + +import Text.XML.Light.Types +import Text.XML.Light.Proc +import Text.XML.Light.Output(tagEnd) + +import Data.Char(isSpace) +import Data.List(isPrefixOf) +import Numeric(readHex) + +-- | parseXMLDoc, parse a XMLl document to maybe an element +parseXMLDoc :: String -> Maybe Element +parseXMLDoc xs = strip (parseXML xs) + where strip cs = case onlyElems cs of + e : es + | "?xml" `isPrefixOf` qName (elName e) + -> strip (map Elem es) + | otherwise -> Just e + _ -> Nothing + +-- | parseXML to a list of content chunks +parseXML :: String -> [Content] +parseXML xs = parse $ tokens $ preprocess xs + +------------------------------------------------------------------------ + +parse :: [Token] -> [Content] +parse [] = [] +parse ts = let (es,_,ts1) = nodes ([],Nothing) [] ts + in es ++ parse ts1 + +-- Information about namespaces. +-- The first component is a map that associates prefixes to URIs, +-- the second is the URI for the default namespace, if one was provided. +type NSInfo = ([(String,String)],Maybe String) + +nodes :: NSInfo -> [QName] -> [Token] -> ([Content], [QName], [Token]) + +nodes ns ps (TokCRef ref : ts) = + let (es,qs,ts1) = nodes ns ps ts + in (CRef ref : es, qs, ts1) + +nodes ns ps (TokText txt : ts) = + let (es,qs,ts1) = nodes ns ps ts + (more,es1) = case es of + Text cd : es1' + | cdVerbatim cd == cdVerbatim txt -> (cdData cd,es1') + _ -> ([],es) + + in (Text txt { cdData = cdData txt ++ more } : es1, qs, ts1) + +nodes cur_info ps (TokStart p t as empty : ts) = (node : siblings, open, toks) + where + new_name = annotName new_info t + new_info = foldr addNS cur_info as + node = Elem Element { elLine = Just p + , elName = new_name + , elAttribs = map (annotAttr new_info) as + , elContent = children + } + + (children,(siblings,open,toks)) + | empty = ([], nodes cur_info ps ts) + | otherwise = let (es1,qs1,ts1) = nodes new_info (new_name:ps) ts + in (es1, + case qs1 of + [] -> nodes cur_info ps ts1 + _ : qs3 -> ([],qs3,ts1)) + +nodes ns ps (TokEnd p t : ts) = let t1 = annotName ns t + in case break (t1 ==) ps of + (as,_:_) -> ([],as,ts) + -- Unknown closing tag. Insert as text. + (_,[]) -> + let (es,qs,ts1) = nodes ns ps ts + in (Text CData { + cdLine = Just p, + cdVerbatim = CDataText, + cdData = tagEnd t "" + } : es,qs, ts1) + +nodes _ ps [] = ([],ps,[]) + + +annotName :: NSInfo -> QName -> QName +annotName (namespaces,def_ns) n = + n { qURI = maybe def_ns (`lookup` namespaces) (qPrefix n) } + +annotAttr :: NSInfo -> Attr -> Attr +annotAttr ns a@(Attr { attrKey = k}) = + case (qPrefix k, qName k) of + (Nothing,"xmlns") -> a + _ -> a { attrKey = annotName ns k } + +addNS :: Attr -> NSInfo -> NSInfo +addNS (Attr { attrKey = key, attrVal = val }) (ns,def) = + case (qPrefix key, qName key) of + (Nothing,"xmlns") -> (ns, if null val then Nothing else Just val) + (Just "xmlns", k) -> ((k, val) : ns, def) + _ -> (ns,def) + + +-- Lexer ----------------------------------------------------------------------- + +type LChar = (Line,Char) +type LString = [LChar] +data Token = TokStart Line QName [Attr] Bool -- is empty? + | TokEnd Line QName + | TokCRef String + | TokText CData + deriving Show + +tokens :: String -> [Token] +tokens = tokens' . linenumber 1 + +tokens' :: LString -> [Token] +tokens' ((_,'<') : c@(_,'!') : cs) = special c cs + +tokens' ((_,'<') : cs) = tag (dropSpace cs) -- we are being nice here +tokens' [] = [] +tokens' cs@((l,_):_) = let (as,bs) = breakn ('<' ==) cs + in map cvt (decode_text as) ++ tokens' bs + + -- XXX: Note, some of the lines might be a bit inacuarate + where cvt (TxtBit x) = TokText CData { cdLine = Just l + , cdVerbatim = CDataText + , cdData = x + } + cvt (CRefBit x) = case cref_to_char x of + Just c -> TokText CData { cdLine = Just l + , cdVerbatim = CDataText + , cdData = [c] + } + Nothing -> TokCRef x + + +special :: LChar -> LString -> [Token] +special _ ((_,'-') : (_,'-') : cs) = skip cs + where skip ((_,'-') : (_,'-') : (_,'>') : ds) = tokens' ds + skip (_ : ds) = skip ds + skip [] = [] -- unterminated comment + +special c ((_,'[') : (_,'C') : (_,'D') : (_,'A') : (_,'T') : (_,'A') : (_,'[') + : cs) = + let (xs,ts) = cdata cs + in TokText CData { cdLine = Just (fst c), cdVerbatim = CDataVerbatim, cdData = xs } + : tokens' ts + where cdata ((_,']') : (_,']') : (_,'>') : ds) = ([],ds) + cdata ((_,d) : ds) = let (xs,ys) = cdata ds in (d:xs,ys) + cdata [] = ([],[]) + +special c cs = + let (xs,ts) = munch "" 0 cs + in TokText CData { cdLine = Just (fst c), cdVerbatim = CDataRaw, cdData = '<':'!':(reverse xs) } : tokens' ts + where munch acc nesting ((_,'>') : ds) + | nesting == (0::Int) = ('>':acc,ds) + | otherwise = munch ('>':acc) (nesting-1) ds + munch acc nesting ((_,'<') : ds) + = munch ('<':acc) (nesting+1) ds + munch acc n ((_,x) : ds) = munch (x:acc) n ds + munch acc _ [] = (acc,[]) -- unterminated DTD markup + +--special c cs = tag (c : cs) -- invalid specials are processed as tags + + +qualName :: LString -> (QName,LString) +qualName xs = let (as,bs) = breakn endName xs + (q,n) = case break (':'==) as of + (q1,_:n1) -> (Just q1, n1) + _ -> (Nothing, as) + in (QName { qURI = Nothing, qPrefix = q, qName = n }, bs) + where endName x = isSpace x || x == '=' || x == '>' || x == '/' + + + + + +tag :: LString -> [Token] +tag ((p,'/') : cs) = let (n,ds) = qualName (dropSpace cs) + in TokEnd p n : case ds of + (_,'>') : es -> tokens' es + -- tag was not properly closed... + _ -> tokens' ds +tag [] = [] +tag cs = let (n,ds) = qualName cs + (as,b,ts) = attribs (dropSpace ds) + in TokStart (fst (head cs)) n as b : ts + +attribs :: LString -> ([Attr], Bool, [Token]) +attribs cs = case cs of + (_,'>') : ds -> ([], False, tokens' ds) + + (_,'/') : ds -> ([], True, case ds of + (_,'>') : es -> tokens' es + -- insert missing > ... + _ -> tokens' ds) + + (_,'?') : (_,'>') : ds -> ([], True, tokens' ds) + + -- doc ended within a tag.. + [] -> ([],False,[]) + + _ -> let (a,cs1) = attrib cs + (as,b,ts) = attribs cs1 + in (a:as,b,ts) + +attrib :: LString -> (Attr,LString) +attrib cs = let (ks,cs1) = qualName cs + (vs,cs2) = attr_val (dropSpace cs1) + in ((Attr ks (decode_attr vs)),dropSpace cs2) + +attr_val :: LString -> (String,LString) +attr_val ((_,'=') : cs) = string (dropSpace cs) +attr_val cs = ("",cs) + + +dropSpace :: LString -> LString +dropSpace = dropWhile (isSpace . snd) + +-- | Match the value for an attribute. For malformed XML we do +-- our best to guess the programmer's intention. +string :: LString -> (String,LString) +string ((_,'"') : cs) = break' ('"' ==) cs + +-- Allow attributes to be enclosed between ' '. +string ((_,'\'') : cs) = break' ('\'' ==) cs + +-- Allow attributes that are not enclosed by anything. +string cs = breakn eos cs + where eos x = isSpace x || x == '>' || x == '/' + + +break' :: (a -> Bool) -> [(b,a)] -> ([a],[(b,a)]) +break' p xs = let (as,bs) = breakn p xs + in (as, case bs of + [] -> [] + _ : cs -> cs) + +breakn :: (a -> Bool) -> [(b,a)] -> ([a],[(b,a)]) +breakn p l = (map snd as,bs) where (as,bs) = break (p . snd) l + + + +decode_attr :: String -> String +decode_attr cs = concatMap cvt (decode_text cs) + where cvt (TxtBit x) = x + cvt (CRefBit x) = case cref_to_char x of + Just c -> [c] + Nothing -> '&' : x ++ ";" + +data Txt = TxtBit String | CRefBit String deriving Show + +decode_text :: [Char] -> [Txt] +decode_text xs@('&' : cs) = case break (';' ==) cs of + (as,_:bs) -> CRefBit as : decode_text bs + _ -> [TxtBit xs] +decode_text [] = [] +decode_text cs = let (as,bs) = break ('&' ==) cs + in TxtBit as : decode_text bs + +cref_to_char :: [Char] -> Maybe Char +cref_to_char cs = case cs of + '#' : ds -> num_esc ds + "lt" -> Just '<' + "gt" -> Just '>' + "amp" -> Just '&' + "apos" -> Just '\'' + "quot" -> Just '"' + _ -> Nothing + +num_esc :: String -> Maybe Char +num_esc cs = case cs of + 'x' : ds -> check (readHex ds) + _ -> check (reads cs) + + where check [(n,"")] = cvt_char n + check _ = Nothing + +cvt_char :: Int -> Maybe Char +cvt_char x + | fromEnum (minBound :: Char) <= x && x <= fromEnum (maxBound::Char) + = Just (toEnum x) + | otherwise = Nothing + +preprocess :: String -> String +preprocess ('\r' : '\n' : cs) = '\n' : preprocess cs +preprocess ('\r' : cs) = '\n' : preprocess cs +preprocess (c : cs) = c : preprocess cs +preprocess [] = [] + +linenumber :: Line -> String -> LString +linenumber _ [] = [] +linenumber n ('\n':s) = n' `seq` ((n,'\n'):linenumber n' s) where n' = n + 1 +linenumber n (c:s) = (n,c) : linenumber n s |