|
Description:
If you want to serialize Python objects to XML, PyXML is a good choice, except when unicode strings come into play. In that case generic.Marshaller().dump() raises an ugly AttributeError: Marshaller instance has no attribute 'm_unicode'
This recipe extends both PyXML Marshaller and Unmarshaller to support the de-/serialization of unicode strings. Put the following code in a separate module and test it with the given example.
The output will look like
<marshal>
<list id="i2">
<string>text</string>
<unicode>german umlaut: ü ö &lt;&gt;&amp;</unicode>
</list>
</marshal>
Source: Text Source
----- unicodemarshal.py -----
from xml.marshal import generic
class UnicodeMarshaller(generic.Marshaller):
    """Marshaller that can also serialize ``unicode`` objects.

    Adds the ``m_unicode`` method, so unicode strings are written as a
    ``<unicode>`` element holding the UTF-8 encoded, XML-escaped text.
    """

    # Element name used to wrap a marshalled unicode string.
    tag_unicode = 'unicode'

    def m_unicode(self, value, dict):
        """Return the XML fragments for the unicode string *value*.

        value -- the unicode object to serialize; it is encoded as UTF-8.
        dict  -- not used by this method.

        Returns a list of three strings: the opening tag, the escaped
        text and the closing tag.
        """
        name = self.tag_unicode
        s = value.encode('utf-8')
        # Escape XML markup characters. '&' has to be handled first,
        # otherwise the ampersands introduced by the '<' and '>'
        # replacements would themselves be escaped a second time.
        if '&' in s or '<' in s or '>' in s:
            s = s.replace('&', '&amp;')
            s = s.replace('<', '&lt;')
            s = s.replace('>', '&gt;')
        return ['<' + name + '>', s, '</' + name + '>']
class UnicodeUnmarshaller(generic.Unmarshaller):
    """Unmarshaller that also understands the ``<unicode>`` element."""

    def __init__(self):
        # self.unmarshal_meth is read here before the base initializer
        # has run and before anything was assigned on the instance, so
        # it is an inherited, shared table.  Extend a private copy
        # instead of writing the 'unicode' entry into the shared one,
        # so other unmarshaller classes are not affected.
        methods = dict(self.unmarshal_meth)
        methods['unicode'] = ('um_start_unicode', 'um_end_unicode')
        self.unmarshal_meth = methods
        generic.Unmarshaller.__init__(self)

    # Opening a <unicode> element reuses the generic start handler.
    um_start_unicode = generic.Unmarshaller.um_start_generic

    def um_end_unicode(self, name):
        """Finish a ``<unicode>`` element.

        Joins the pieces held in the top entry of the data stack into a
        single string and resets the ``accumulating_chars`` flag.

        name -- the element name; not used by this method.
        """
        ds = self.data_stack
        ds[-1] = ''.join(ds[-1])
        self.accumulating_chars = 0
---- example ----
>>> import sys,codecs
>>> from unicodemarshal import UnicodeMarshaller, UnicodeUnmarshaller
>>>
>>> if hasattr(sys, 'setdefaultencoding'):
... sys.setdefaultencoding('utf-8')
...
>>>
>>> def openUTF8File(path, mode):
... fp = codecs.open(filename=path, mode=mode, encoding='utf-8')
... return fp
...
>>>
>>> myList = ['text',
... u'german umlaut: \xfc \xf6 <>&']
>>>
>>> fp = openUTF8File("test.xml", mode='w')
>>> UnicodeMarshaller().dump(myList, fp)
>>> fp.close()
>>>
>>> fp = openUTF8File("test.xml", mode='r')
>>> myList = UnicodeUnmarshaller().load(fp)
>>> for s in myList:
... print type(s)
...
>>> fp.close()
<type 'str'>
<type 'unicode'>
Discussion:
When the sample runs, it sets the system-wide default encoding to utf-8. To make this possible I commented out the line "del sys.setdefaultencoding" in site.py (Python version 2.4.1).
I also use codecs.open() to ensure that the file contents are correctly encoded.
|