1 | # -*- coding: iso-8859-1 -*-
|
---|
2 | """ A SAX2 driver for libxml2, on top of its XmlReader API
|
---|
3 |
|
---|
4 | USAGE
|
---|
5 | # put this file (drv_libxml2.py) in PYTHONPATH
|
---|
6 | import xml.sax
|
---|
7 | reader = xml.sax.make_parser(["drv_libxml2"])
|
---|
8 | # ...and the rest is standard python sax.
|
---|
9 |
|
---|
10 | CAVEATS
|
---|
11 | - Lexical handlers are supported, except for start/endEntity
|
---|
12 | (waiting for XmlReader.ResolveEntity) and start/endDTD
|
---|
13 | - Error callbacks are not exactly synchronous, they tend
|
---|
14 | to be invoked before the corresponding content callback,
|
---|
15 | because the underlying reader interface parses
|
---|
16 | data by chunks of 512 bytes
|
---|
17 |
|
---|
18 | TODO
|
---|
19 | - search for TODO
|
---|
20 | - some ErrorHandler events (warning)
|
---|
21 | - some ContentHandler events (setDocumentLocator, skippedEntity)
|
---|
22 | - EntityResolver (using libxml2.?)
|
---|
23 | - DTDHandler (if/when libxml2 exposes such node types)
|
---|
24 | - DeclHandler (if/when libxml2 exposes such node types)
|
---|
25 | - property_xml_string?
|
---|
26 | - feature_string_interning?
|
---|
27 | - Incremental parser
|
---|
28 | - additional performance tuning:
|
---|
29 | - one might cache callbacks to avoid some name lookups
|
---|
30 | - one might implement a smarter way to pass attributes to startElement
|
---|
31 | (some kind of lazy evaluation?)
|
---|
32 | - there might be room for improvement in start/endPrefixMapping
|
---|
33 | - other?
|
---|
34 |
|
---|
35 | """
|
---|
36 |
|
---|
37 | __author__ = "Stéphane Bidoul <[email protected]>"
|
---|
38 | __version__ = "0.3"
|
---|
39 |
|
---|
40 | import sys
|
---|
41 | import codecs
|
---|
42 |
|
---|
43 | if sys.version_info[0] < 3:
|
---|
44 | __author__ = codecs.unicode_escape_decode(__author__)[0]
|
---|
45 |
|
---|
46 | StringTypes = (str, unicode)
|
---|
47 | # libxml2 returns strings as UTF8
|
---|
48 | _decoder = codecs.lookup("utf8")[1]
|
---|
49 | def _d(s):
|
---|
50 | if s is None:
|
---|
51 | return s
|
---|
52 | else:
|
---|
53 | return _decoder(s)[0]
|
---|
54 | else:
|
---|
55 | StringTypes = str
|
---|
56 | # s is Unicode `str` already
|
---|
57 | def _d(s):
|
---|
58 | return s
|
---|
59 |
|
---|
60 | from xml.sax._exceptions import *
|
---|
61 | from xml.sax import xmlreader, saxutils
|
---|
62 | from xml.sax.handler import \
|
---|
63 | feature_namespaces, \
|
---|
64 | feature_namespace_prefixes, \
|
---|
65 | feature_string_interning, \
|
---|
66 | feature_validation, \
|
---|
67 | feature_external_ges, \
|
---|
68 | feature_external_pes, \
|
---|
69 | property_lexical_handler, \
|
---|
70 | property_declaration_handler, \
|
---|
71 | property_dom_node, \
|
---|
72 | property_xml_string
|
---|
73 |
|
---|
74 | try:
|
---|
75 | import libxml2
|
---|
76 | except ImportError:
|
---|
77 | raise SAXReaderNotAvailable("libxml2 not available: " \
|
---|
78 | "import error was: %s" % sys.exc_info()[1])
|
---|
79 |
|
---|
class Locator(xmlreader.Locator):
    """Present a libxml2.xmlTextReaderLocator through the SAX Locator API."""

    def __init__(self, locator):
        # The wrapped libxml2 locator; it is only queried by the getters.
        self.__locator = locator

    def getSystemId(self):
        """Return the system identifier, taken from the locator's BaseURI()."""
        base_uri = self.__locator.BaseURI()
        return base_uri

    def getPublicId(self):
        """Return the public identifier; this adapter always answers None."""
        return None

    def getLineNumber(self):
        """Return the line number, taken from the locator's LineNumber()."""
        line = self.__locator.LineNumber()
        return line

    def getColumnNumber(self):
        """Return the column number; this adapter always answers -1."""
        return -1
|
---|
101 |
|
---|
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls events from a libxml2 text reader.

    parse() drives a libxml2 text reader node by node and translates each
    node into ContentHandler / ErrorHandler calls, plus lexical-handler
    calls when a lexical handler property has been set.

    Supported features: namespaces, namespace-prefixes, validation,
    external-general-entities (read-only, always on) and
    external-parameter-entities.  DTDHandler, EntityResolver and the
    declaration-handler property are rejected with
    SAXNotSupportedException.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator: None, or a list of
        # (severity, SAXParseException) pairs awaiting _reportErrors()
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Queue one diagnostic raised by the libxml2 reader.

        Registered through reader.SetErrorHandler() in parse().  `arg` is
        the user argument given at registration (None there) and is
        unused.  The message is wrapped in a SAXParseException and kept
        until the read loop calls _reportErrors().
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Flush the queued diagnostics to the SAX error handler.

        Warnings go to warning(); everything else goes to error(), except
        that when `fatal` is true the last queued exception is sent to
        fatalError() instead.
        """
        # Detach the queue before dispatching: error handlers commonly
        # raise, and stale entries must not survive into a later parse.
        errors, self.__errors = self.__errors, None
        if not errors:
            return
        warnings = (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                    libxml2.PARSER_SEVERITY_WARNING)
        last = errors[-1][1]
        for severity, exception in errors:
            if severity in warnings:
                self._err_handler.warning(exception)
            elif fatal and exception is last:
                # when fatal is set, the parse will stop;
                # we consider that the last error reported
                # is the fatal one.
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    @staticmethod
    def _xmlnsPrefix(qname):
        """Classify an attribute qname as a namespace declaration or not.

        Returns (True, None) for the default declaration "xmlns",
        (True, prefix) for "xmlns:prefix", and (False, None) otherwise.
        A name that merely starts with the letters "xmlns" (for example
        "xmlnsfoo") is an ordinary attribute.
        """
        if qname == "xmlns":
            return True, None
        if qname.startswith("xmlns:"):
            return True, qname[6:]
        return False, None

    def _lexicalEvent(self, event, *args):
        """Call `event` on the lexical handler if it provides that method.

        Used for the entity events, which not every lexical handler
        implementation defines; a missing handler or method is skipped.
        """
        handler = self.__lex_handler
        if handler is None:
            return
        callback = getattr(handler, event, None)
        if callback is not None:
            callback(*args)

    def _configureReader(self, reader):
        """Apply the feature flags of this driver to a fresh reader."""
        if self.__extparams:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
            reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
            reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
            reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
        else:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)

    def _checkRead(self, status):
        """Report pending errors for one Read() result.

        Returns True when the loop should process the current node
        (status 1), False when it must stop: status 0 is the regular end
        of input, any other value is a fatal read failure.
        """
        if status == 1 or status == 0:
            if self.__errors is not None:
                self._reportErrors(0)
            return status == 1
        if self.__errors is not None:
            self._reportErrors(1)
        else:
            self._err_handler.fatalError(
                SAXException("Read failed (no details available)"))
        return False

    def _emitElementNS(self, reader, attributes, prefixes):
        """Emit namespace-aware events for the element under the reader.

        `attributes` is the AttributesNSImpl reused for every element;
        `prefixes` is the stack of prefix lists whose endPrefixMapping
        events are still owed to open elements.
        """
        eltName = (_d(reader.NamespaceUri()), _d(reader.LocalName()))
        eltQName = _d(reader.Name())
        attributes._attrs = attrs = {}
        attributes._qnames = qnames = {}
        newPrefixes = []
        while reader.MoveToNextAttribute():
            qname = _d(reader.Name())
            value = _d(reader.Value())
            isDecl, newPrefix = self._xmlnsPrefix(qname)
            if isDecl:
                newPrefixes.append(newPrefix)
                self._cont_handler.startPrefixMapping(newPrefix, value)
                if not self.__nspfx:
                    continue  # don't report xmlns attribute
            attName = (_d(reader.NamespaceUri()), _d(reader.LocalName()))
            qnames[attName] = qname
            attrs[attName] = value
        reader.MoveToElement()
        self._cont_handler.startElementNS(eltName, eltQName, attributes)
        if reader.IsEmptyElement():
            # no EndElement node will follow: close everything right now
            self._cont_handler.endElementNS(eltName, eltQName)
            for newPrefix in newPrefixes:
                self._cont_handler.endPrefixMapping(newPrefix)
        else:
            prefixes.append(newPrefixes)

    def _emitElement(self, reader, attributes):
        """Emit plain (non-namespace) events for the current element.

        `attributes` is the AttributesImpl reused for every element.
        """
        eltName = _d(reader.Name())
        attributes._attrs = attrs = {}
        while reader.MoveToNextAttribute():
            attName = _d(reader.Name())
            attrs[attName] = _d(reader.Value())
        reader.MoveToElement()
        self._cont_handler.startElement(eltName, attributes)
        if reader.IsEmptyElement():
            # no EndElement node will follow
            self._cont_handler.endElement(eltName)

    def _emitNode(self, reader, nodeType, attributes, prefixes):
        """Translate the node under the reader into SAX events.

        Raises SAXException for a node type this driver does not expect
        inside the read loop.
        """
        # Element
        if nodeType == 1:
            if self.__ns:
                self._emitElementNS(reader, attributes, prefixes)
            else:
                self._emitElement(reader, attributes)
        # EndElement
        elif nodeType == 15:
            if self.__ns:
                self._cont_handler.endElementNS(
                    (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                    _d(reader.Name()))
                for prefix in prefixes.pop():
                    self._cont_handler.endPrefixMapping(prefix)
            else:
                self._cont_handler.endElement(_d(reader.Name()))
        # Text (3) and SignificantWhitespace (14)
        elif nodeType == 3 or nodeType == 14:
            self._cont_handler.characters(_d(reader.Value()))
        # Whitespace
        elif nodeType == 13:
            self._cont_handler.ignorableWhitespace(_d(reader.Value()))
        # CDATA
        elif nodeType == 4:
            if self.__lex_handler is not None:
                self.__lex_handler.startCDATA()
            self._cont_handler.characters(_d(reader.Value()))
            if self.__lex_handler is not None:
                self.__lex_handler.endCDATA()
        # EntityReference
        elif nodeType == 5:
            name = _d(reader.Name())
            self._lexicalEvent("startEntity", name)
            # NOTE(review): the module docstring says this driver is still
            # waiting for XmlReader.ResolveEntity, so the binding may not
            # offer it; report the reference as skipped in that case.
            resolve = getattr(reader, "ResolveEntity", None)
            if resolve is not None:
                resolve()
            else:
                self._cont_handler.skippedEntity(name)
        # EndEntity
        elif nodeType == 16:
            self._lexicalEvent("endEntity", _d(reader.Name()))
        # ProcessingInstruction
        elif nodeType == 7:
            self._cont_handler.processingInstruction(
                _d(reader.Name()), _d(reader.Value()))
        # Comment
        elif nodeType == 8:
            if self.__lex_handler is not None:
                self.__lex_handler.comment(_d(reader.Value()))
        # DocumentType
        elif nodeType == 10:
            pass  # TODO startDTD (how to detect endDTD? on first non-dtd event?)
        # XmlDeclaration (17), Entity decl (6), Notation decl (12)
        elif nodeType in (17, 6, 12):
            pass  # TODO
        # Attribute (2, never in this loop), Document (9, not exposed),
        # DocumentFragment (11, never returned by XmlReader) and None (0)
        # are not expected here.
        else:
            raise SAXException("Unexpected node type %d" % nodeType)

    def parse(self, source):
        """Parse `source` and send the resulting events to the handlers.

        `source` is either a string, handed to libxml2 as a file name, or
        anything accepted by saxutils.prepare_input_source(), in which
        case its byte stream is read.  endDocument() is only sent when the
        input was read to its regular end.  The reader is closed even when
        a handler raises.
        """
        self.__parsing = 1
        # drop diagnostics left over from a previous, aborted parse
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                input_buffer = libxml2.inputBuffer(source.getByteStream())
                reader = input_buffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            self._configureReader(reader)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributes = xmlreader.AttributesNSImpl({}, {})
            else:
                attributes = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                if not self._checkRead(r):
                    break  # end of parse, or fatal parse error
                self._emitNode(reader, reader.NodeType(),
                               attributes, prefixes)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            self.__parsing = 0
            if reader is not None:
                reader.Close()

    def setDTDHandler(self, handler):
        """Always raise SAXNotSupportedException (DTD events unsupported)."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Always raise SAXNotSupportedException (no entity resolver hook)."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the state of feature `name`.

        Raises SAXNotRecognizedException for a feature this driver does
        not know.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set feature `name` to `state`.

        Raises SAXNotSupportedException while a parse is running or when
        asked to turn external general entities off, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException(
                    "Feature '%s' not supported" % name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the lexical or declaration handler property.

        Raises SAXNotRecognizedException for any other property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException(
                "Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set the lexical handler property.

        The declaration handler is recognized but rejected with
        SAXNotSupportedException; any other property raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler if/when libxml2 supports dtd events
            raise SAXNotSupportedException(
                "Property '%s' not supported" % name)
        else:
            raise SAXNotRecognizedException(
                "Property '%s' not recognized" % name)
|
---|
376 |
|
---|
def create_parser():
    """Return a new LibXml2Reader (factory used via xml.sax.make_parser)."""
    driver = LibXml2Reader()
    return driver
|
---|
379 |
|
---|