|
|
 |
|
Title: A higher level struct module
Submitter: Brian McErlean
(other recipes)Brian McErlean
(other recipes)
Last Updated: 2006/10/26
Version no: 1.3
Category:
Databases
|
|
2 vote(s)
|
|
|
|
Description:
This recipe provides a higher level wrapper around the struct module. It provides a more convenient syntax for defining and using structs, and adds additional features such as:
- Allows embedding structures within other structures
- Allows defining arrays of items (or other structures)
- Class based syntax, allowing access and updates by field name, not position
- Extension of structures by inheritance
Source: Text Source
import struct
class Format(object):
    """Byte-order / size / alignment prefixes accepted by the struct module.

    A Struct subclass selects one of these through its ``_format``
    attribute; the value is prepended to the generated format string
    before it is handed to struct.pack / struct.unpack / struct.calcsize.
    """
    # Native byte order, native sizes, native alignment (padding applied).
    Native = "@"
    # Native byte order, but standard sizes and no alignment padding.
    StandardNative = "="
    # Explicit byte orders: standard sizes, no alignment padding.
    LittleEndian = "<"
    BigEndian = ">"
class Element(object):
    """A single element (field) in a struct.

    Attributes:
        id: serial number taken from a class-wide counter at construction
            time; MetaStruct sorts fields on it to recover definition order.
        typecode: the struct-module format string for this element.
        size: ``struct.calcsize(typecode)``, also returned by ``len()``.
    """
    # Class-wide creation counter; every new instance takes the next value.
    id = 0

    def __init__(self, typecode):
        Element.id += 1
        self.id = Element.id
        self.typecode = typecode
        self.size = struct.calcsize(typecode)

    def __len__(self):
        """Return the size in bytes of this element."""
        return self.size

    def decode(self, format, s):
        """Additional decode steps once converted via struct.unpack.

        The base implementation returns the unpacked value unchanged.
        """
        return s

    def encode(self, format, val):
        """Additional encode steps to allow packing with struct.pack.

        The base implementation returns the value unchanged.
        """
        return val

    def __str__(self):
        return self.typecode

    def __call__(self, num):
        """Define this as an array of ``num`` elements.

        For the string codes the struct module already takes a count as
        the string length, so a plain sized Element is returned; the code
        itself is kept ('p' stays 'p' rather than degrading to 's').
        Every other element is wrapped in an ArrayElement.
        """
        if self.typecode in ('s', 'p'):
            return Element('%d%s' % (num, self.typecode))
        return ArrayElement(self, num)

    def __getitem__(self, num):
        """Indexing sugar: ``elem[n]`` is the same as ``elem(n)``."""
        return self(num)
class ArrayElement(Element):
    """A fixed-length array of ``num`` copies of ``basic_element``.

    To the enclosing struct the array is a single opaque string of
    ``len(basic_element) * num`` bytes.  decode() and encode() convert
    between that string and a list of item values.
    """

    def __init__(self, basic_element, num):
        Element.__init__(self, '%ds' % (len(basic_element) * num))
        self.num = num
        self.basic_element = basic_element

    def _item_format(self, format):
        """Return the struct format describing the individual items."""
        return format + self.basic_element.typecode * self.num

    def decode(self, format, s):
        """Split the string ``s`` into a list of ``num`` decoded items."""
        raw_items = struct.unpack(self._item_format(format), s)
        return [self.basic_element.decode(format, item)
                for item in raw_items]

    def encode(self, format, vals):
        """Pack the iterable ``vals`` into the array's string form.

        Raises:
            struct.error: if ``vals`` does not hold exactly ``num`` items
                (struct.pack raised this type for the same mistake, but
                without saying how many items were expected).
        """
        items = [self.basic_element.encode(format, v) for v in vals]
        if len(items) != self.num:
            raise struct.error(
                'array element expects exactly %d items, got %d'
                % (self.num, len(items)))
        return struct.pack(self._item_format(format), *items)
class EmbeddedStructElement(Element):
    """An element holding another structure class inside a struct.

    The enclosing struct sees it as an opaque string of
    ``structure._struct_size`` bytes.  The ``format`` argument of
    decode()/encode() is not used here: the work is delegated to the
    embedded structure class itself.
    """

    def __init__(self, structure):
        Element.__init__(self, '%ds' % structure._struct_size)
        self.struct = structure

    def decode(self, format, s):
        """Build an instance of the embedded structure from the string ``s``."""
        return self.struct(s)

    def encode(self, format, s):
        """Return the packed form of the structure instance ``s``."""
        return self.struct._pack(s)
# Maps the attribute names offered by ``Type`` to struct-module format codes.
name_to_code = dict(
    # Characters and strings.
    Char='c',
    String='s',
    PascalString='p',
    # Integers, signed and unsigned.
    Byte='b',
    UnsignedByte='B',
    Short='h',
    UnsignedShort='H',
    Int='i',
    UnsignedInt='I',
    Long='l',
    UnsignedLong='L',
    LongLong='q',
    UnsignedLongLong='Q',
    # Floating point.
    Float='f',
    Double='d',
    # Native pointer.
    Pointer='P',
)
class Type(object):
    """Syntax sugar for creating elements: ``Type.Int``, ``Type.Struct(S)``.

    Every attribute access builds a brand-new Element, so each field
    gets its own creation-order id.
    """

    def __getattr__(self, name):
        """Return a new Element for the type called ``name``.

        Raises:
            KeyError: if ``name`` is not a key of ``name_to_code``.
        """
        return Element(name_to_code[name])

    def Struct(self, struct):
        """Return an element embedding the structure class ``struct``."""
        return EmbeddedStructElement(struct)


# The class is only ever used through this single instance, which
# deliberately replaces the class under the same name.
Type = Type()
class MetaStruct(type):
    """Metaclass that turns Element class attributes into a struct layout.

    For every class it creates, it sets:
        _struct_data: struct format string of all fields (no byte-order
            prefix), inherited fields first.
        _struct_info: list of ``(name, element)`` pairs in the same order.
        _struct_size: ``struct.calcsize(cls._format + cls._struct_data)``.
    """

    def __init__(cls, name, bases, d):
        type.__init__(cls, name, bases, d)
        if hasattr(cls, '_struct_data'):
            # A base class already has a layout.  Copy the list so that
            # the in-place ``+=`` below does not mutate the parent's list.
            cls._struct_info = list(cls._struct_info)
        else:
            cls._struct_data = ''
            cls._struct_info = []
        # Only Elements defined directly in this class body; sorting on
        # the creation id recovers the order in which they were written.
        elems = sorted(
            ((k, v) for (k, v) in d.items() if isinstance(v, Element)),
            key=lambda item: item[1].id)
        cls._struct_data += ''.join(str(v) for (k, v) in elems)
        cls._struct_info += elems
        # The prefix must be included: padding depends on the format.
        cls._struct_size = struct.calcsize(cls._format + cls._struct_data)
class Struct(object):
    """Represent a binary structure.

    Subclasses declare fields as class attributes holding Element
    objects and may override ``_format`` with one of the Format
    constants.  Instantiate with packed data to decode it; ``str(obj)``
    returns the packed form.  Written for Python 2 (``__metaclass__``
    hook, byte strings as ``str``).
    """
    __metaclass__ = MetaStruct
    _format = Format.Native

    def __init__(self, _data=None, **kwargs):
        """Decode ``_data`` into fields, then apply keyword overrides.

        Args:
            _data: packed string of ``_struct_size`` bytes; when None,
                every field is decoded from an all-zero buffer.
            **kwargs: attribute values set after decoding.
        """
        if _data is None:
            _data = '\0' * self._struct_size
        unpacked = struct.unpack(self._format + self._struct_data, _data)
        for (name, elem), val in zip(self._struct_info, unpacked):
            setattr(self, name, elem.decode(self._format, val))
        for k, v in kwargs.items():
            setattr(self, k, v)

    def _pack(self):
        """Return the packed representation of the current field values."""
        encoded = [elem.encode(self._format, getattr(self, name))
                   for (name, elem) in self._struct_info]
        return struct.pack(self._format + self._struct_data, *encoded)

    def __str__(self):
        return self._pack()

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self._pack())
# Example 1: a simple little-endian structure of two shorts.
class Point(Struct):
    _format = Format.LittleEndian
    x = Type.Short
    y = Type.Short


# Decode from packed data, read the fields, update them and re-pack.
p = Point('\x01\x00\x02\x00')
print("%s %s" % (p.x, p.y))
p.x, p.y = 100, 200
print(repr(p))
assert struct.pack('<hh', 100, 200) == str(p)
# Example 2: a sized string, an int and an array of embedded structures.
class Shape(Struct):
    _format = Format.BigEndian
    name = Type.String[8]
    numpoints = Type.Int
    points = Type.Struct(Point)[4]


# Decode from packed data ...
s = Shape('Triangle\x00\x00\x00\x03\x00\x00\x00\x00\x05\x00\x05\x00\n\x00'
          '\x00\x00\x00\x00\x00\x00')
print("%s %s" % (s.name, [(p.x, p.y) for p in s.points[:s.numpoints]]))

# ... or build the same structure from keyword arguments.
s2 = Shape(name='Triangle', numpoints=3, points=[
    Point(x=0, y=0),
    Point(x=5, y=5),
    Point(x=10, y=0),
    Point(x=0, y=0)])
assert str(s2) == str(s)
assert str(s.points[1]) == str(Point(x=5, y=5))
# Example 3: a nested (3x3) array plus ordinary attributes and methods.
class TicTacToe(Struct):
    board = Type.Char[3][3]
    ignored = 'This is not packed / unpacked by the structure'

    def display(self):
        """Print the board, one row per line."""
        print('\n'.join(''.join(row) for row in self.board))


game = TicTacToe('X.O.X...O')
print(game.board)
game.display()
game.board[0][1] = 'X'
game.display()
print(str(game))
# Example 4: extending a structure by inheritance; z follows x and y.
class Point3D(Point):
    z = Type.Short


p = Point3D(x=1, y=2, z=3)
print(repr(p))
Discussion:
The standard struct module is useful when dealing with C structs, and various file and network formats, but is rather awkward to work with. It is fairly low-level, using strings of character codes to describe the structure, and unpacks to a tuple, rather than an object with appropriately named fields. This means that access to items is always by position, rather than the field name, as would be used in the C code.
It also fares badly when dealing with arrays and embedded structures. Dealing with these within the struct module means each element becomes effectively the same as a top-level field, flattening their structure and losing the organisation of elements. This all results in code that is much messier and harder to maintain than the equivalent C code.
This recipe builds on top of the struct module and provides syntax for declaration of structures that is closer to the corresponding C code, and allows for more complex structures to be defined.
Structures are read by instantiating the appropriate class with the binary struct data, and packed by calling str(aStruct). I use the above code as a module named "structure".
Implementation:
Structs are defined by lists of Element objects. Each element contains the struct code for the type, and a unique id, incremented after each instantiation, which is used so that the elements can be sorted into the same order that they were defined within a struct. They also define __getitem__, returning another Element subclass, ArrayElement, dealing with arrays of simple elements. Another Element subclass, EmbeddedStructElement is used to represent substructures.
The Type object provides some syntax sugar for constructing these elements, allowing "x = Type.Int", instead of "x = Element("i")"
The MetaStruct metaclass collects all class attributes which are instances of Element (or of one of its subclasses), sorts them into the definition order, and uses them to create the corresponding struct format string and the corresponding list of Elements.
Finally, the Struct class defines appropriate __init__ and __str__ methods which use the information generated by the metaclass to encode and decode the structure. Arrays and substructures are treated as strings of the appropriate size, and implement an encode() and decode() method which will transform from the string to the appropriate data, or vice-versa.
Warnings and Caveats:
There are a few flaws in the above code currently. The main one is that there is no validation when setting struct fields. For instance, given a struct like:
>>> class MyStruct(Struct):
... name = Type.String[8]
... values = Type.Int[4]
>>> a=MyStruct() # Initialises with everything zeroed.
There is nothing preventing you from doing:
>>> a.values = [1,2,3,4,5] # One too many items.
This will now fail when trying to pack the structure with a struct error which doesn't give you any clue as to which field is corrupt. Probably the main thing that should be done is either some kind of pre-validation of items, or at least error handling that mentions what field is invalid.
Worse is that doing:
>>> a.name = "thisnameistoobig"
will not raise an error even when packing the structure, but will instead silently truncate the name in the packed representation.
Another thing to be careful of is that duplicating names will cause the last definition to be used, rather than resulting in multiple elements, or giving an error. For instance:
>>> class MyStruct(Struct):
... item1=Type.Char[5]
... pad = Type.Char[3]
... item2=Type.Char[1]
... pad = Type.Char[3]
... item3=Type.Char[5]
Here the first pad bytes will not be used, as the second definition of pad will override the name.
Also, note that you must create element objects (ie Type.Int) within the struct, rather than reusing them. For instance:
>>> myArrayType = Type.Int[8] # define an 8 element array
>>> class MyStruct(Struct):
... name = Type.String
... items = myArrayType # Don't do this
Here items will actually be packed before name, as it was created earlier, so the definition order will not match the packed order.
Finally, note that padding and alignment will be handled the same as the struct module.
|
|
Add comment
|
|
Number of comments: 20
Very interresting code :-), Victor Stinner, 2006/10/03
Class MetaStruct is interesting. I didn't know that it's possible to get attributes in the order that they are defined. I have to play with metaclasses :-) You should have a look at the pyConstruct project:
http://pyconstruct.wikispaces.com
I'm working on the same subject but with different syntax (different approach):
http://hachoir.org/
Hachoir is a lazy-parser and fault tolerant. It allows to edit data and have tree organization with nice Python API.
Haypo
Add comment
Definition order, Brian McErlean,Brian McErlean, 2006/10/05
Getting the definition order requires a bit of a cheat. The idea is to create an object, and track the order they were created in, and then later sort based on this order. It does limit the syntax you can use to something that can create and return a new object though: "x=Foo.attr", "x=Foo()" and "x=Foo[1]" would all work given an appropriate Foo , but you can't use just "x = Foo".
pyConstruct looks pretty neat. I'll check it out.
Add comment
Dynamic arrays?, Chris Niekel, 2006/10/04
Would it be possible to allow the arrays to be dynamic? I know some binary formats that use that, like
number_fields: integer
dates: date[number_fields]
I haven't found any struct-replacement that does that.
Add comment
Dynamic arrays, Brian McErlean,Brian McErlean, 2006/10/05
I think this would be fairly hard to do. Currently I'm relying on knowing the sizes of various substructures etc. in advance, in order to know how to represent them in containing structures. I think it should be possible (given some restrictions like the count appearing before the array in the struct), but would probably require a different approach.
Add comment
Victor Stinner, 2006/10/05
Both Hachoir and pyConstruct allow dynamic structure. pyConstruct uses Python eval() function to access to other fields:
>>> # a TLV is a type-length-value entity. the length of the value is specified
... # by the 'length' field
... tlv = Struct("tlv",
... Byte("type"),
... Byte("length"),
... MetaBytes("value", "_.length"),
... )
Hachoir's approach is different: you directly access the structure using the [] operator (self["name"]):
class Chunk(FieldSet):
def createFields(self):
yield UInt32(self, "size")
yield String(self, "tag", 4, charset="ASCII")
yield RawBytes(self, "data", self["size"].value)
Where self["size"] is a Field object and has many attributes: value, address, absolute_address, parent, display (unicode string), etc. I don't have enough place here to explain all Hachoir internals :-)
Haypo
Add comment
Just to follow up, Brian McErlean,Brian McErlean, 2006/10/05
In fact, pyConstruct, linked to in the post above by Victor Stinner looks like it would handle this. I think the equivalent of your example would be something like:
Date=LittleFloat64("timestamp")
DynamicArray=Struct("dynamic_array",
UInt32("number_fields"),
MetaRepeater("_.number_fields", Date)
)
s= DynamicArray.build( Container(number_fields=2,
timestamp=[time.time(), time.time()]))
print repr(s)
print DynamicArray.parse(s)
Add comment
Chris Niekel, 2006/10/05
Cool, thanks for the tips, I'll look into them!
Add comment
Very nice recipe, Alain Pointdexter, 2006/10/06
This is a very nice recipe, worth the inclusion in the standard library!
I have a remark though. Is it possible to split the construction by keywords from the construction by decoding.
I mean:
s1=Shape(name='Triangle', numpoints=3, points=[
Point(x=0,y=0),
Point(x=5,y=5),
Point(x=10,y=0),
Point(x=0,y=0)])
and
s2=Shape.decode('Triangle\x00\x00\x00\x03\x00\x00\x00\x00\x00\x05\x00\x05\x00\x0A'
'\x00\x00\x00\x00\x00\x00')
This would be neater as you don't know in advance the contents of the string buffer and you might have a higher-level decode resting on several lower-level decode This is a very nice recipe, worth the inclusion in the standard library!
I have a remark though. Is it possible to split the construction by keywords from the construction by decoding.
I mean:
s1=Shape(name='Triangle', numpoints=3, points=[
Point(x=0,y=0),
Point(x=5,y=5),
Point(x=10,y=0),
Point(x=0,y=0)])
and
s2=Shape.decode('Triangle\x00\x00\x00\x03\x00\x00\x00\x00\x00\x05\x00\x05\x00\x0A'
'\x00\x00\x00\x00\x00\x00')
This would be neater as you don't know in advance the contents of the string buffer and you might have a higher-level decode resting on several lower-level decode
Add comment
decode method, Brian McErlean,Brian McErlean, 2006/10/07
It should be pretty simple to do - just move the _data parameter and all but the "for k,v in kwargs.iteritems():" loop out of __init__ and into a new decode classmethod. I'm not sure what you mean by your use case though - wouldn't that also be possible through __init__ too?
The main reason I went with __init__ and __str__ overloads rather than pack / unpack methods was that I didn't want to add anything into the public namespace of the class, as it would prevent defining struct fields with the same name. If there's a good reason though, perhaps this isn't that important.
Add comment
inheritance bug, Alain Pointdexter, 2006/10/06
Once you call Point3D, the parent class Point gets corrupted and it gets impossible to call it with only 2 arguments.
Alain
Add comment
I'm not seeing it, Brian McErlean,Brian McErlean, 2006/10/07
Could you give some example code that fails. I've tried both
p = Point(x=1, y=2)
p = Point('\x01\x00\x02\x00')
after the same code as above, and both seem to work. I did have a similar bug when developing it, but fixed it by taking a copy of _struct_data for the subclass (previously it was mutating the parent class's list). Is this still happening somewhere?
Add comment
Sorry, Alain Pointdexter, 2006/10/09
Stupid me, I was experimenting with the code and i broke it myself !
Sorry !
Add comment
Strange behaviour of class Point, Igor Lvovsky, 2006/10/23
Can you please explain me some strange behaviour.
I try to play with the Point class.
The code is:
class Point(Struct):
_format = Format.LittleEndian
x = Type.Short
a = Type.Char # the place is important
y = Type.Short
p = Point()
p.x, p.y = 100,200
print repr(p) # Prints "Point('d\x00\xc8\x00')
I have error (of struct.unpack(...)) in __init__ of Struct.
If I change the _format = Format.Native error is gone.
If I put the a = Type.Char after the y = Type.Short
the error is gone too.
Add comment
Thats a bug, Brian McErlean,Brian McErlean, 2006/10/26
The padding applied by the struct module is different depending on the format selected. With native format, an extra pad byte is inserted after the char to align it to an even boundary. With the other formats, no padding is done.
I was including the format information when building the string, but not when calculating the size of the structure, so this was always defaulting to native format, giving a _struct_size of 6 when the struct expected 5. This meant that the initialisation string was the wrong size, giving the error you saw.
I've now updated the recipe to fix this. The relevant change was changing the line:
cls._struct_size = struct.calcsize(cls._struct_data)
to
cls._struct_size = struct.calcsize(cls._format + cls._struct_data)
Thanks.
Add comment
Strange behaviour of class Point, Igor Lvovsky, 2006/10/23
Can you please explain me some strange behaviour.
I try to play with the Point class.
The code is:
class Point(Struct):
_format = Format.LittleEndian
x = Type.Short
a = Type.Char # the place is important
y = Type.Short
p = Point()
p.x, p.y = 100,200
print repr(p) # Prints "Point('d\x00\xc8\x00')
I have error (of struct.unpack(...)) in __init__ of Struct.
If I change the _format = Format.Native error is gone.
If I put the a = Type.Char after the y = Type.Short
the error is gone too.
Add comment
Strange behaviour of class Point, Igor Lvovsky, 2006/10/23
Can you please explain me some strange behaviour.
I try to play with the Point class.
The code is:
class Point(Struct):
_format = Format.LittleEndian
x = Type.Short
a = Type.Char # the place is important
y = Type.Short
p = Point()
p.x, p.y = 100,200
print repr(p) # Prints "Point('d\x00\xc8\x00')
I have error (of struct.unpack(...)) in __init__ of Struct.
If I change the _format = Format.Native error is gone.
If I put the a = Type.Char after the y = Type.Short
the error is gone too.
Add comment
Strange behaviour of class Point, Igor Lvovsky, 2006/10/23
Can you please explain me some strange behaviour.
I try to play with the Point class.
The code is:
class Point(Struct):
_format = Format.LittleEndian
x = Type.Short
a = Type.Char # the place is important
y = Type.Short
p = Point()
p.x, p.y = 100,200
print repr(p) # Prints "Point('d\x00\xc8\x00')
I have error (of struct.unpack(...)) in __init__ of Struct.
If I change the _format = Format.Native error is gone.
If I put the a = Type.Char after the y = Type.Short
the error is gone too.
Add comment
TypeError: 'Element' object is unindexable, Mark Shirley, 2006/11/08
Hello,
I'm a beginning Python programmer and am very interested using
your 'A higher level struct module' code. My problem may be as
simple as version confusion on my part, but I'm stuck. Here's a
transcript. I've saved the text source into packclass.py minus the
examples at the end.
I can load that file and can successfully do the example with the
Point class. However, when I try the Shape example, a class that
uses an array type, I get an error.
Is there a workaround?
Thank you,
Mark Shirley
Python 2.5 (r25:51908, Sep 19 2006, 09:52:17) [MSC v.1310 32 bit (Intel)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> from packclass import *
>>> class Point(Struct):
_format = Format.LittleEndian
x = Type.Short
y = Type.Short
... ... ... ... >>> p = Point('\x01\x00\x02\x00')
>>> print p.x, p.y
1 2
>>> print repr(p)
Point('\x01\x00\x02\x00')
>>> class Shape(Struct):
_format = Format.BigEndian
name = Type.String[8]
numpoints = Type.Int
points = Type.Struct(Point)[4] # Array of 4 points.
... ... ... ... ... Traceback (most recent call last):
File "", line 1, in
File "", line 3, in Shape
TypeError: 'Element' object is unindexable
>>>
Add comment
TypeError: 'Element' object is unindexable, Mark Shirley, 2006/11/08
[Sorry to repeat this. I submitted it earlier and saw it in the
list of comments, but it's not there now.]
Hello,
I'm a beginning Python programmer and am very interested using
your 'A higher level struct module' recipe. My problem may be as
simple as version confusion on my part, but I'm stuck. Here's a
transcript. I've saved the text source into packclass.py minus the
examples at the end.
I can load that file and can successfully do the example with the
Point class. However, when I try the Shape example, a class that
uses an array type, I get an error.
Is there a workaround?
Thank you,
Mark Shirley
Python 2.5 (r25:51908, Sep 19 2006, 09:52:17) [MSC v.1310 32 bit (Intel)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> from packclass import *
>>> class Point(Struct):
_format = Format.LittleEndian
x = Type.Short
y = Type.Short
... ... ... ... >>> p = Point('\x01\x00\x02\x00')
>>> print p.x, p.y
1 2
>>> print repr(p)
Point('\x01\x00\x02\x00')
>>> class Shape(Struct):
_format = Format.BigEndian
name = Type.String[8]
numpoints = Type.Int
points = Type.Struct(Point)[4] # Array of 4 points.
... ... ... ... ... Traceback (most recent call last):
File "", line 1, in
File "", line 3, in Shape
TypeError: 'Element' object is unindexable
>>>
Add comment
I'm not sure, Brian McErlean,Brian McErlean, 2006/11/11
I tried that code here, and I don't get an error. From the exception you're getting, it looks like it's not finding __getitem__ on the Element object (I think it is the line "name = Type.String[8]" that is failing)
Could you check that the line
def __getitem__(self, num): return self(num)
is correctly copied below class Element. If it is missing, or not indented with the rest of the Element members, it would cause the error you're seeing.
Add comment
|
|
|
|
|
 |
|