|
|
|
|
|
""" A SAX2 driver for libxml2, on top of it's XmlReader API |
|
|
|
|
|
USAGE |
|
|
# put this file (drv_libxml2.py) in PYTHONPATH |
|
|
import xml.sax |
|
|
reader = xml.sax.make_parser(["drv_libxml2"]) |
|
|
# ...and the rest is standard python sax. |
|
|
|
|
|
CAVEATS |
|
|
- Lexical handlers are supported, except for start/endEntity |
|
|
(waiting for XmlReader.ResolveEntity) and start/endDTD |
|
|
    - Error callbacks are not exactly synchronous: they tend
      to be invoked before the corresponding content callback,
      because the underlying reader interface parses
      data in chunks of 512 bytes
|
|
|
|
|
TODO |
|
|
- search for TODO |
|
|
- some ErrorHandler events (warning) |
|
|
- some ContentHandler events (setDocumentLocator, skippedEntity) |
|
|
- EntityResolver (using libxml2.?) |
|
|
- DTDHandler (if/when libxml2 exposes such node types) |
|
|
- DeclHandler (if/when libxml2 exposes such node types) |
|
|
- property_xml_string? |
|
|
- feature_string_interning? |
|
|
- Incremental parser |
|
|
- additional performance tuning: |
|
|
- one might cache callbacks to avoid some name lookups |
|
|
- one might implement a smarter way to pass attributes to startElement |
|
|
(some kind of lazy evaluation?) |
|
|
- there might be room for improvement in start/endPrefixMapping |
|
|
- other? |
|
|
|
|
|
""" |
|
|
|
|
|
__author__ = "St�phane Bidoul <[email protected]>" |
|
|
__version__ = "0.3" |
|
|
|
|
|
import sys |
|
|
import codecs |
|
|
|
|
|
# Python 2 / Python 3 compatibility layer.
#
# StringTypes -- type(s) accepted by parse() as a file name / URL
# _d(s)       -- converts a string handed back by libxml2 into text
if sys.version_info[0] >= 3:
    StringTypes = str

    def _d(s):
        """Return *s* unchanged (no decoding is done on Python 3)."""
        return s

else:
    # turn the escaped author name into a real unicode string
    __author__ = codecs.unicode_escape_decode(__author__)[0]

    StringTypes = (str, unicode)

    # codecs.lookup() returns (encoder, decoder, ...); keep the decoder
    _decoder = codecs.lookup("utf8")[1]

    def _d(s):
        """Decode the UTF-8 string *s* to unicode; None is passed through."""
        if s is not None:
            return _decoder(s)[0]
        return s
|
|
|
|
|
from xml.sax._exceptions import * |
|
|
from xml.sax import xmlreader, saxutils |
|
|
from xml.sax.handler import \ |
|
|
feature_namespaces, \ |
|
|
feature_namespace_prefixes, \ |
|
|
feature_string_interning, \ |
|
|
feature_validation, \ |
|
|
feature_external_ges, \ |
|
|
feature_external_pes, \ |
|
|
property_lexical_handler, \ |
|
|
property_declaration_handler, \ |
|
|
property_dom_node, \ |
|
|
property_xml_string |
|
|
|
|
|
try: |
|
|
import libxml2 |
|
|
except ImportError: |
|
|
raise SAXReaderNotAvailable("libxml2 not available: " \ |
|
|
"import error was: %s" % sys.exc_info()[1]) |
|
|
|
|
|
class Locator(xmlreader.Locator):
    """Present a libxml2 text-reader locator through the SAX Locator API.

    Only the line number and the system identifier are obtained from the
    wrapped object; the column number and the public identifier are
    reported as unknown (-1 and None respectively).
    """

    def __init__(self, locator):
        # wrapped object; must provide LineNumber() and BaseURI()
        self.__locator = locator

    def getSystemId(self):
        """Return the system identifier for the current event."""
        wrapped = self.__locator
        return wrapped.BaseURI()

    def getPublicId(self):
        """Return the public identifier for the current event."""
        return None

    def getLineNumber(self):
        """Return the line number where the current event ends."""
        wrapped = self.__locator
        return wrapped.LineNumber()

    def getColumnNumber(self):
        """Return the column number where the current event ends."""
        return -1
|
|
|
|
|
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that drives a libxml2 text reader.

    parse() pulls nodes from the libxml2 reader one at a time and turns
    each of them into the matching ContentHandler / lexical handler
    callback.  Errors reported by libxml2 are queued by _errorHandler()
    and dispatched to the ErrorHandler after each Read() call.

    Supported features: namespaces, namespace-prefixes, validation,
    external-general-entities (always on) and external-parameter-entities.
    Supported properties: lexical-handler (the declaration-handler can be
    read but not set).
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)

        # features (0/1 flags, see getFeature/setFeature)
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1

        # true while parse() is running
        self.__parsing = 0

        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None

        # list of (severity, SAXParseException) waiting to be reported,
        # or None when nothing is pending
        self.__errors = None

    @staticmethod
    def _xmlnsPrefix(qname):
        """Classify the attribute name *qname*.

        Returns a tuple (isDeclaration, prefix):
          - "xmlns"      -> (True, None)   default namespace declaration
          - "xmlns:pfx"  -> (True, "pfx")  prefixed namespace declaration
          - anything else (including names that merely start with the
            letters "xmlns", such as "xmlnsfoo") -> (False, None)
        """
        if qname == "xmlns":
            return True, None
        if qname.startswith("xmlns:"):
            return True, qname[6:]
        return False, None

    def _entityEvent(self, event, name):
        """Send the *event* ("startEntity" or "endEntity") for entity *name*
        to the lexical handler.

        The event is silently dropped when no lexical handler is set or
        when the handler does not implement the method.
        """
        callback = getattr(self.__lex_handler, event, None)
        if callback is not None:
            callback(name)

    def _errorHandler(self,arg,msg,severity,locator):
        """Callback registered with the libxml2 reader: queue one error.

        The error is not reported immediately; it is stored together with
        its severity and dispatched later by _reportErrors().
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg,None,
                                                Locator(locator))))

    def _reportErrors(self,fatal):
        """Dispatch all queued errors to the ErrorHandler and empty the queue.

        Warnings go to warning(); everything else goes to error(), except
        that when *fatal* is true the last queued error is reported through
        fatalError().
        """
        # Detach the queue first: the error handler is allowed to raise,
        # and already-seen errors must not be reported again later.
        errors, self.__errors = self.__errors, None
        for severity,exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            else:
                # when fatal is set, the last queued error is taken to be
                # the one that stopped the parser
                if fatal and exception is errors[-1][1]:
                    self._err_handler.fatalError(exception)
                else:
                    self._err_handler.error(exception)

    def parse(self, source):
        """Parse *source* and emit the corresponding SAX events.

        *source* is either a string (handed to libxml2 as a file name) or
        anything accepted by saxutils.prepare_input_source().

        Raises SAXException for a node type this driver does not know,
        plus whatever the registered handlers raise.
        """
        self.__parsing = 1
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                stream = source.getCharacterStream()
                if stream is None:
                    stream = source.getByteStream()
                inputBuffer = libxml2.inputBuffer(stream)
                reader = inputBuffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler,None)

            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD,1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)

            # a single attributes object is reused for every element;
            # only its underlying dictionaries are replaced
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({},{})
            else:
                attributesImpl = xmlreader.AttributesImpl({})

            # stack of prefix lists, one entry per open element (ns mode)
            prefixes = []

            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()

                # check for errors: 1 = node read, 0 = end of document,
                # anything else = read failure
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(\
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error

                # get node type
                nodeType = reader.NodeType()

                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),\
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            isDecl, newPrefix = self._xmlnsPrefix(qname)
                            if isDecl:
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(\
                                    newPrefix,value)
                                if not self.__nspfx:
                                    # declarations are not reported as
                                    # attributes unless asked for
                                    continue
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS( \
                            eltName,eltQName,attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node will follow: close now
                            self._cont_handler.endElementNS(eltName,eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement( \
                            eltName,attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)

                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS( \
                             (_d(reader.NamespaceUri()),_d(reader.LocalName())),
                             _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))

                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))

                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))

                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))

                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()

                # EntityReference
                elif nodeType == 5:
                    if self.__lex_handler is not None:
                        self._entityEvent("startEntity",_d(reader.Name()))
                    # NOTE(review): the module docstring says the driver is
                    # still "waiting for XmlReader.ResolveEntity"; confirm
                    # that the libxml2 binding in use provides this method.
                    reader.ResolveEntity()

                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self._entityEvent("endEntity",_d(reader.Name()))

                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction( \
                        _d(reader.Name()),_d(reader.Value()))

                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))

                # DocumentType
                elif nodeType == 10:
                    pass # TODO: start/endDTD are not reported

                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO

                # Entity (declaration)
                elif nodeType == 6:
                    pass # TODO

                # Notation (declaration)
                elif nodeType == 12:
                    pass # TODO

                else:
                    raise SAXException("Unexpected node type %d" % nodeType)

            if r == 0:
                self._cont_handler.endDocument()
        finally:
            self.__parsing = 0
            # close the reader even when a handler raised (the default
            # ErrorHandler raises from error() and fatalError())
            if reader is not None:
                reader.Close()

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the feature *name*.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # general entities are always loaded
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def setFeature(self, name, state):
        """Set the feature *name* to *state*.

        Raises SAXNotSupportedException while a parse is in progress or
        when trying to switch off external general entities, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s " \
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO: is it possible to switch this off in libxml2?
                raise SAXNotSupportedException("Feature '%s' not supported" % \
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def getProperty(self, name):
        """Return the value of the property *name*.

        Raises SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)

    def setProperty(self, name, value):
        """Set the property *name* to *value*.

        Only the lexical handler can be set.  Setting the declaration
        handler raises SAXNotSupportedException; any other property raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler in self.__decl_handler if/when
            # libxml2 exposes DTD declaration events
            raise SAXNotSupportedException("Property '%s' not supported" % \
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)
|
|
|
|
|
def create_parser():
    """Driver entry point: build and return a new LibXml2Reader.

    The module docstring shows the intended use:
    xml.sax.make_parser(["drv_libxml2"]).
    """
    parser = LibXml2Reader()
    return parser
|
|
|
|
|
|