|
Description:
This recipe takes an XML file as input and outputs a colorized version of that file, using HTML or DocBook (with emphasis elements and a particular role). It provides a small command-line interface, and the output markup is easy to configure.
Source: Text Source
import sys
from xml.dom.ext import SplitQName
from xml.sax.handler import ContentHandler
from xml.sax.saxutils import escape
# Token categories used as keys of the markup tables below.  _HEAD is
# declared but has no entry in either table.
_ROOT, _STRING, _COMMENT, _NAME, _KEYWORD, _TEXT, _HEAD = 0, 1, 2, 3, 4, 5, 6

# Each table maps a token category to the (opening, closing) markup that is
# written around tokens of that category.

# DocBook flavour: a <programlisting> with <emphasis> elements.
DOCBOOK = {
    _ROOT: ('<programlisting>', '</programlisting>'),
    _STRING: ('<emphasis>', '</emphasis>'),
    _COMMENT: ('<emphasis>', '</emphasis>'),
    _NAME: ('', ''),
    _KEYWORD: ('<emphasis role="bold">', '</emphasis>'),
    _TEXT: ('', ''),
}

# HTML flavour: a <div> with coloured <font> elements.
HTML = {
    _ROOT: ('<div>', '</div>'),
    _STRING: ('<font color="#004080">', '</font>'),
    _COMMENT: ('<font color="#008000">', '</font>'),
    _NAME: ('', ''),
    _KEYWORD: ('<font color="#C00000">', '</font>'),
    _TEXT: ('', ''),
}
class XmlFormatSaxHandler(ContentHandler):
    """Write the SAX events of an XML document as colourised source.

    Every token of the parsed document is written to the output stream
    surrounded by the markup configured for its category in the active
    table (DOCBOOK by default, see set_format).  Because the result is
    meant to be *displayed* inside a DocBook or HTML document, every
    delimiter and every piece of document data is XML-escaped before it
    is written; only the configured markup is written verbatim.

    Besides the ContentHandler callbacks, the class implements the
    lexical callbacks (comment, CDATA, DTD and entity boundaries) and the
    declaration callbacks (entity, element and attribute declarations).
    """

    def __init__(self, head=1, output=sys.stdout, encoding='UTF-8'):
        """Create a handler writing to *output*.

        head     -- not used by this class; kept so existing calls that
                    pass it still work.
        output   -- object with a write() method receiving the result.
        encoding -- codec used to encode unicode text before writing.
        """
        ContentHandler.__init__(self)
        self._out = output
        self._cod = encoding
        self._o_d = DOCBOOK
        self._in_cdata = 0
        self._in_entity = 0

    # -- helpers -----------------------------------------------------------

    def _emit(self, template, *args):
        """Format *template* with *args* and write the result.

        The text is formatted first and encoded afterwards: encoding only
        the template (as was done before) leaves a unicode result that the
        stream would have to encode with its own default codec.
        """
        text = template % args
        if not isinstance(text, str):
            # Python 2 unicode result: encode it with the requested codec.
            # A plain str is written unchanged.
            text = text.encode(self._cod)
        self._out.write(text)

    def _wrap(self, kind, text):
        """Return escaped *text* enclosed in the markup of category *kind*."""
        opening, closing = self._o_d[kind]
        return '%s%s%s' % (opening, escape(text), closing)

    def _qname(self, name):
        """Return the marked-up form of a possibly prefixed name."""
        prefix, local = SplitQName(name)
        if prefix:
            return '%s:%s' % (self._wrap(_KEYWORD, prefix),
                              self._wrap(_NAME, local))
        return self._wrap(_NAME, local)

    def _quoted(self, value):
        """Return *value* in double quotes, marked up as a string."""
        return self._wrap(_STRING, '"%s"' % (value,))

    def _external_id(self, public_id, system_id):
        """Return ' PUBLIC "p" "s"', ' SYSTEM "s"' or '' if no id is given."""
        if public_id:
            text = ' PUBLIC %s' % self._quoted(public_id)
            if system_id:
                text = '%s %s' % (text, self._quoted(system_id))
            return text
        if system_id:
            return ' SYSTEM %s' % self._quoted(system_id)
        return ''

    # -- configuration -----------------------------------------------------

    def set_format(self, format):
        """Select the markup table: 'docbook' or 'html'.

        Any other value leaves the current table unchanged.
        """
        table = {'docbook': DOCBOOK, 'html': HTML}.get(format)
        if table is not None:
            self._o_d = table

    # -- ContentHandler ----------------------------------------------------

    def startDocument(self):
        """Write the opening markup of the root category."""
        self._emit('%s', self._o_d[_ROOT][0])

    def endDocument(self):
        """Write the closing markup of the root category."""
        self._emit('%s', self._o_d[_ROOT][1])

    def startElement(self, name, attrs):
        """Write a start tag with its attributes."""
        self._emit('&lt;%s', self._qname(name))
        for key, val in attrs.items():
            # Every attribute is preceded by a space, prefixed or not.
            self._emit(' %s=%s', self._qname(key), self._quoted(val))
        self._emit('&gt;')

    def endElement(self, name):
        """Write an end tag."""
        self._emit('&lt;/%s&gt;', self._qname(name))

    def processingInstruction(self, target, data):
        """Write a processing instruction, closed by '?>'."""
        self._emit('&lt;?%s %s?&gt;',
                   self._wrap(_NAME, target), self._wrap(_STRING, data))

    def characters(self, ch):
        """Write character data, except while inside an entity.

        The text is always escaped, inside CDATA sections too: the output
        is itself markup, so a raw '<' or '&' would corrupt it.
        """
        if self._in_entity:
            return
        self._emit('%s', self._wrap(_TEXT, ch))

    # -- lexical callbacks -------------------------------------------------

    def comment(self, comment):
        """Write a comment, delimiters included, as one comment token."""
        self._emit('%s', self._wrap(_COMMENT, '<!--%s-->' % (comment,)))

    def startCDATA(self):
        """Write the '<![CDATA[' opener and record that CDATA has begun."""
        self._emit('&lt;%s', self._wrap(_KEYWORD, '![CDATA['))
        self._in_cdata = 1

    def endCDATA(self):
        """Write the ']]>' closer and record that CDATA has ended."""
        self._emit('%s&gt;', self._wrap(_KEYWORD, ']]'))
        self._in_cdata = 0

    def startDTD(self, name, public_id, system_id):
        """Write the document type declaration up to the opening '['."""
        self._emit('&lt;%s %s',
                   self._wrap(_KEYWORD, '!DOCTYPE'), escape(name))
        self._emit('%s [', self._external_id(public_id, system_id))

    def endDTD(self):
        """Close the document type declaration."""
        self._emit(']&gt;')

    def startEntity(self, name):
        """Write the entity reference and suppress its expanded text."""
        self._emit('%s', self._wrap(_NAME, '&%s;' % (name,)))
        self._in_entity = 1

    def endEntity(self, name):
        """Resume writing character data."""
        self._in_entity = 0

    # -- declaration callbacks ---------------------------------------------

    def internalEntityDecl(self, name, value):
        """Write an internal entity declaration: <!ENTITY name "value">."""
        self._emit('&lt;%s %s %s&gt;',
                   self._wrap(_KEYWORD, '!ENTITY'), escape(name),
                   self._quoted(value))

    def externalEntityDecl(self, name, public_id, system_id):
        """Write an external entity declaration with its external id."""
        self._emit('&lt;%s %s%s&gt;',
                   self._wrap(_KEYWORD, '!ENTITY'), escape(name),
                   self._external_id(public_id, system_id))

    def elementDecl(self, elem_name, content_model):
        """Write an element declaration with its decoded content model."""
        c_m = _decode_content_model(content_model)
        self._emit('&lt;%s %s %s&gt;',
                   self._wrap(_KEYWORD, '!ELEMENT'), escape(elem_name),
                   escape(c_m))

    def attributeDecl(self, elem_name, attr_name, type_d, value_def, value):
        """Write an attribute-list declaration for one attribute.

        type_d    -- type name, or a list of the allowed values.
        value_def -- default declaration; written when not empty.
        value     -- default value; written in quotes when not None.
        NOTE(review): the meaning of value_def/value follows the SAX
        DeclHandler description -- confirm against the parser driver used.
        """
        if isinstance(type_d, (list, tuple)):
            type_text = '(%s)' % '|'.join(['%s' % (alt,) for alt in type_d])
        else:
            type_text = '%s' % (type_d,)
        pieces = [escape(elem_name), escape(attr_name), escape(type_text)]
        if value_def:
            pieces.append(escape(value_def))
        if value is not None:
            pieces.append(self._quoted(value))
        self._emit('&lt;%s %s&gt;',
                   self._wrap(_KEYWORD, '!ATTLIST'), ' '.join(pieces))
# Positions of the fields of a group content model:
# (operator, list of members, occurrence modifier).
C_OP, C_VAL, C_NUM = 0, 1, 2


def _decode_content_model(content_m):
    """Recursively turn a content model into its DTD text form.

    A group is a sequence whose first item is ',' (sequence) or '|'
    (choice), whose second item is the list of member models and whose
    third item is the occurrence modifier; it is rendered as
    '(a, b)mod' or '(a|b)mod'.  Any other sequence is treated as a leaf
    and rendered as its first item followed by its last item.

    A value that is not a tuple or list is returned as text unchanged.
    NOTE(review): this is meant for parsers that report keyword models
    such as 'EMPTY' or 'ANY' as a bare string -- confirm with the driver.
    """
    if not isinstance(content_m, (tuple, list)):
        return '%s' % (content_m,)
    operator = content_m[C_OP]
    if operator in (',', '|'):
        # Sequences are joined with ', ' and choices with '|'.
        separator = {',': ', ', '|': '|'}[operator]
        members = [_decode_content_model(c_m) for c_m in content_m[C_VAL]]
        return '(%s)%s' % (separator.join(members), content_m[C_NUM])
    # Leaf: the modifier is read from the end because a leaf may be
    # shorter than a group.
    return '%s%s' % (operator, content_m[-1])
# Help text printed for -h/--help.  The tool only accepts .xml sources.
USAGE = '''xml2dcbk: format xml source code to xml docbook using roles

Usage: xml2dcbk [options] source.xml..., parse XML file(s)
       xml2dcbk -h/--help, print this help message and exit

Options:
  -e/--encoding iso-8859-1, specify encoding to use in outputs
  -d/--docbook, format output as docbook xml (default)
  -w/--html, format output in html instead of docbook
'''
def run(args):
    """Command-line entry point: colourise each XML file named in *args*.

    Options: -h/--help prints USAGE and returns; -e/--encoding sets the
    output encoding (default UTF-8); -d/--docbook and -w/--html choose
    the output markup (default docbook).

    Each FILE.xml is written to FILE_dcbk.xml in the current directory;
    files whose name does not end in '.xml' are reported and skipped.
    """
    import getopt
    import os
    from xml.sax import make_parser
    from xml.sax.handler import property_lexical_handler, \
        property_declaration_handler

    (opt, args) = getopt.getopt(args, 'he:dw',
                                ['help', 'encoding=', 'docbook', 'html'])
    encod, out_format = 'UTF-8', 'docbook'
    for flag, value in opt:
        if flag in ('-h', '--help'):
            sys.stdout.write(USAGE + '\n')
            return
        elif flag in ('-d', '--docbook'):
            out_format = 'docbook'
        elif flag in ('-w', '--html'):
            out_format = 'html'
        elif flag in ('-e', '--encoding'):
            encod = value

    parser = make_parser(['xml.sax.drivers2.drv_xmlproc'])
    for path in args:
        # Check the extension before opening anything, so a skipped file
        # never leaves an open handle behind.
        if path[-4:] != '.xml':
            sys.stderr.write('Unknown extension %s, ignored file %s\n' % (
                path[-4:], path))
            continue
        source = open(path, 'r')
        try:
            dest = open('%s_dcbk.xml' % os.path.basename(path)[:-4], 'w+')
            try:
                # Keyword arguments: the first positional parameter of the
                # handler is 'head', not the output stream.
                handler = XmlFormatSaxHandler(output=dest, encoding=encod)
                handler.set_format(out_format)
                parser.setContentHandler(handler)
                for prop in (property_lexical_handler,
                             property_declaration_handler):
                    # Best effort: report a failure to set the property
                    # and carry on with the plain content events.
                    try:
                        parser.setProperty(prop, handler)
                    except Exception:
                        sys.stderr.write('%s\n' % (sys.exc_info()[1],))
                sys.stderr.write("Formatting %s ...\n" % path)
                parser.parse(source)
            finally:
                dest.close()
        finally:
            source.close()
if __name__ == "__main__":
    # Executed as a script: hand every argument after the program name
    # to the command-line driver.
    cli_args = sys.argv[1:]
    run(cli_args)
Discussion:
The main advantage of this recipe is that it gives you a full SAX handler, using the standard ContentHandler together with the more exotic LexicalHandler and DeclHandler properties. Note that, at this time, parsers do not fully support those properties. The only functions missing from this handler are those of the ErrorHandler interface, and setDocumentLocator and ignorableWhitespace of the ContentHandler interface (which are nevertheless inherited from the default ContentHandler).
|