# 6.12 读取嵌套和可变长二进制数据

## 解决方案

`struct` 模块可被用来编码/解码几乎所有类型的二进制数据结构。为了解释清楚这种数据，假设你用下面的 Python 数据结构来表示一个组成一系列多边形的点的集合：

```python
polys = [
[ (1.0, 2.5), (3.5, 4.0), (2.5, 1.5) ],
[ (7.0, 1.2), (5.1, 3.0), (0.5, 7.5), (0.8, 9.0) ],
[ (3.4, 6.3), (1.2, 0.5), (4.6, 9.2) ],
]
```

```
+------+--------+------------------------------------+
|Byte  | Type   |  Description                       |
+======+========+====================================+
|0     | int    |  文件代码（0x1234，小端）          |
+------+--------+------------------------------------+
|4     | double |  x 的最小值（小端）                |
+------+--------+------------------------------------+
|12    | double |  y 的最小值（小端）                |
+------+--------+------------------------------------+
|20    | double |  x 的最大值（小端）                |
+------+--------+------------------------------------+
|28    | double |  y 的最大值（小端）                |
+------+--------+------------------------------------+
|36    | int    |  多边形数量（小端）                |
+------+--------+------------------------------------+
```

```
+------+--------+-------------------------------------------+
|Byte  | Type   |  Description                              |
+======+========+===========================================+
|0     | int    |  记录长度（N字节）                        |
+------+--------+-------------------------------------------+
|4-N   | Points |  (X,Y) 坐标，以浮点数表示                 |
+------+--------+-------------------------------------------+
```

```python
import struct
import itertools

def write_polys(filename, polys):
    '''
    Write a collection of polygons to a binary file.

    The file starts with a little-endian header (format '<iddddi'):
    the file code 0x1234, the bounding box (min_x, min_y, max_x, max_y)
    and the number of polygons.  Each polygon follows as a size-prefixed
    record: a little-endian int holding the record length in bytes
    (including the 4-byte length field itself), then the points as
    pairs of little-endian doubles.

    filename -- path of the file to create or overwrite.
    polys    -- iterable-of-sequences of (x, y) tuples; it is traversed
                more than once, so pass a list rather than a generator.

    If there are no points at all, the bounding box is written as zeros
    instead of raising ValueError from min()/max().
    '''
    # Determine bounding box
    flattened = list(itertools.chain.from_iterable(polys))
    min_x = min((x for x, y in flattened), default=0.0)
    max_x = max((x for x, y in flattened), default=0.0)
    min_y = min((y for x, y in flattened), default=0.0)
    max_y = max((y for x, y in flattened), default=0.0)

    # Pre-compile the per-point format; it is reused for every point
    point = struct.Struct('<dd')

    with open(filename, 'wb') as f:
        f.write(struct.pack('<iddddi', 0x1234,
                            min_x, min_y,
                            max_x, max_y,
                            len(polys)))
        for poly in polys:
            size = len(poly) * point.size
            # The stored length counts the 4-byte length field as well
            f.write(struct.pack('<i', size + 4))
            # One write per polygon instead of one per point
            f.write(b''.join(point.pack(*pt) for pt in poly))
```

```python
def read_polys(filename):
    '''
    Read a polygon file and return its polygons as a list of lists of
    (x, y) tuples.

    The file is expected to start with a 40-byte little-endian header
    (format '<iddddi') followed by one size-prefixed record per polygon.
    The header's file code and bounding box are unpacked but not used.
    '''
    with open(filename, 'rb') as f:
        # Read the header
        header = f.read(40)
        file_code, min_x, min_y, max_x, max_y, num_polys = \
            struct.unpack('<iddddi', header)
        polys = []
        for n in range(num_polys):
            # Record length in bytes, stored as a little-endian int
            pbytes, = struct.unpack('<i', f.read(4))
            poly = []
            # Each point is 16 bytes ('<dd'); floor division discards
            # any remainder left over from the length prefix
            for m in range(pbytes // 16):
                pt = struct.unpack('<dd', f.read(16))
                poly.append(pt)
            polys.append(poly)
    return polys
```

```python
import struct

class StructField:
    '''
    Descriptor representing a simple structure field

    format -- struct format string for the field.
    offset -- byte offset of the field inside the owning instance's
              _buffer attribute.
    '''
    def __init__(self, format, offset):
        self.format = format
        self.offset = offset

    def __get__(self, instance, cls):
        # Accessed on the class itself: return the descriptor object
        if instance is None:
            return self
        r = struct.unpack_from(self.format, instance._buffer, self.offset)
        # A single-value format yields the bare value; a multi-value
        # format yields the whole tuple
        return r[0] if len(r) == 1 else r

class Structure:
    '''
    Base class that stores the raw bytes of a structure in _buffer.

    bytedata -- any object supporting the buffer protocol (for example
                bytes or bytearray); it is wrapped in a memoryview.
    '''
    def __init__(self, bytedata):
        self._buffer = memoryview(bytedata)
```

`Structure` 类就是一个基础类，接受字节数据并存储在内部的内存缓冲中，并被 `StructField` 描述器使用。 这里使用了 `memoryview()` ，我们会在后面详细讲解它是用来干嘛的。

```python
class PolyHeader(Structure):
    '''
    Polygon file header described field by field with explicit byte
    offsets.  All fields are little-endian; the last field ends at
    byte 40.
    '''
    file_code = StructField('<i', 0)
    min_x = StructField('<d', 4)
    min_y = StructField('<d', 12)
    max_x = StructField('<d', 20)
    max_y = StructField('<d', 28)
    num_polys = StructField('<i', 36)
```

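```python
>>> f = open('polys.bin', 'rb')
>>> phead = PolyHeader(f.read(40))
>>> phead.file_code == 0x1234
True
>>> phead.min_x
0.5
>>> phead.min_y
0.5
>>> phead.max_x
7.0
>>> phead.max_y
9.2
>>> phead.num_polys
3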
>>>
```

```python
class StructureMeta(type):
    '''
    Metaclass that automatically creates StructField descriptors

    Reads the class attribute _fields_, a list of (format, fieldname)
    pairs, and installs one StructField per entry at consecutive byte
    offsets.  The total size is stored on the class as struct_size.

    A byte-order prefix on a format is remembered and applied to every
    following field until another prefix appears.
    '''
    def __init__(self, clsname, bases, clsdict):
        super().__init__(clsname, bases, clsdict)
        fields = getattr(self, '_fields_', [])
        byte_order = ''
        offset = 0
        for fmt, fieldname in fields:
            # '=' is also a valid struct byte-order prefix; without it
            # here, '=i' after a '<' field would become the invalid '<=i'
            if fmt.startswith(('<', '>', '!', '@', '=')):
                byte_order = fmt[0]
                fmt = fmt[1:]
            fmt = byte_order + fmt
            setattr(self, fieldname, StructField(fmt, offset))
            offset += struct.calcsize(fmt)
        setattr(self, 'struct_size', offset)

class Structure(metaclass=StructureMeta):
    '''
    Base class for structures whose fields are declared in _fields_.

    bytedata -- any object supporting the buffer protocol; it is wrapped
                in a memoryview so that slicing it does not copy bytes.
    '''
    def __init__(self, bytedata):
        self._buffer = memoryview(bytedata)

    @classmethod
    def from_file(cls, f):
        '''
        Alternate constructor: read exactly cls.struct_size bytes from
        the binary file object f and build an instance from them.
        '''
        return cls(f.read(cls.struct_size))
```

```python
class PolyHeader(Structure):
    '''
    Polygon file header declared through _fields_.  Only the first
    field names a byte order ('<'); the following fields inherit it.
    '''
    _fields_ = [
        ('<i', 'file_code'),
        ('d', 'min_x'),
        ('d', 'min_y'),
        ('d', 'max_x'),
        ('d', 'max_y'),
        ('i', 'num_polys'),
    ]
```

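```python
>>> f = open('polys.bin', 'rb')
>>> phead = PolyHeader.from_file(f)
>>> phead.file_code == 0x1234
True
>>> phead.min_x
0.5
>>> phead.min_y
0.5
>>> phead.max_x
7.0
>>> phead.max_y
9.2
>>> phead.num_polys
3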
>>>
```

```python
class NestedStruct:
    '''
    Descriptor representing a nested structure

    name        -- attribute name this descriptor is installed under.
    struct_type -- class of the nested structure; it must expose a
                   struct_size attribute and accept the raw data as its
                   single constructor argument.
    offset      -- byte offset of the nested structure inside the
                   owning instance's _buffer.
    '''
    def __init__(self, name, struct_type, offset):
        self.name = name
        self.struct_type = struct_type
        self.offset = offset

    def __get__(self, instance, cls):
        # Accessed on the class itself: return the descriptor object
        if instance is None:
            return self
        end = self.offset + self.struct_type.struct_size
        data = instance._buffer[self.offset:end]
        result = self.struct_type(data)
        # Save resulting structure back on instance to avoid
        # further recomputation of this step.  This class defines no
        # __set__, so the instance attribute takes precedence on the
        # next lookup.
        setattr(instance, self.name, result)
        return result

class StructureMeta(type):
    '''
    Metaclass that automatically creates StructField descriptors

    Reads the class attribute _fields_, a list of (format, fieldname)
    pairs.  When format is itself a class created by this metaclass,
    a NestedStruct descriptor is installed; otherwise format is taken
    as a struct format string and a StructField is installed.  Fields
    are laid out at consecutive byte offsets and the total size is
    stored on the class as struct_size.

    A byte-order prefix on a format string is remembered and applied to
    every following string field until another prefix appears.
    '''
    def __init__(self, clsname, bases, clsdict):
        super().__init__(clsname, bases, clsdict)
        fields = getattr(self, '_fields_', [])
        byte_order = ''
        offset = 0
        for fmt, fieldname in fields:
            if isinstance(fmt, StructureMeta):
                setattr(self, fieldname,
                        NestedStruct(fieldname, fmt, offset))
                offset += fmt.struct_size
            else:
                # '=' is also a valid struct byte-order prefix; without
                # it here, '=i' after a '<' field would become '<=i'
                if fmt.startswith(('<', '>', '!', '@', '=')):
                    byte_order = fmt[0]
                    fmt = fmt[1:]
                fmt = byte_order + fmt
                setattr(self, fieldname, StructField(fmt, offset))
                offset += struct.calcsize(fmt)
        setattr(self, 'struct_size', offset)
```

```python
class Point(Structure):
    '''A pair of little-endian doubles: x followed by y.'''
    _fields_ = [
        ('<d', 'x'),
        ('d', 'y'),
    ]

class PolyHeader(Structure):
    '''
    Polygon file header whose bounding box is expressed as two nested
    Point structures instead of four separate doubles.
    '''
    _fields_ = [
        ('<i', 'file_code'),
        (Point, 'min'), # nested struct
        (Point, 'max'), # nested struct
        ('i', 'num_polys'),
    ]
```

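```python
>>> f = open('polys.bin', 'rb')
>>> phead = PolyHeader.from_file(f)
>>> phead.file_code == 0x1234
True
>>> phead.min # Nested structure
<__main__.Point object at 0x1006a48d0>
>>> phead.min.x
0.5
>>> phead.min.y
0.5
>>> phead.max.x
7.0
>>> phead.max.y
9.2
>>> phead.num_polys
3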
>>>
```

```python
class SizedRecord:
    '''
    A chunk of bytes read from a size-prefixed record, which can be
    iterated over as a sequence of fixed-size items.
    '''
    def __init__(self, bytedata):
        self._buffer = memoryview(bytedata)

    @classmethod
    def from_file(cls, f, size_fmt, includes_size=True):
        '''
        Read one size-prefixed record from the binary file object f.

        size_fmt      -- struct format of the length prefix, e.g. '<i'.
        includes_size -- True when the stored length also counts the
                         bytes of the prefix itself.
        '''
        sz_nbytes = struct.calcsize(size_fmt)
        sz_bytes = f.read(sz_nbytes)
        sz, = struct.unpack(size_fmt, sz_bytes)
        # includes_size is a bool: subtracts the prefix size or nothing
        buf = f.read(sz - includes_size * sz_nbytes)
        return cls(buf)

    def iter_as(self, code):
        '''
        Yield the items of the record one after another.

        code -- either a struct format string (yields unpacked tuples)
                or a class created by StructureMeta (yields instances
                built on slices of the buffer).

        Raises TypeError for any other kind of code instead of silently
        yielding nothing.
        '''
        if isinstance(code, str):
            s = struct.Struct(code)
            for off in range(0, len(self._buffer), s.size):
                yield s.unpack_from(self._buffer, off)
        elif isinstance(code, StructureMeta):
            size = code.struct_size
            for off in range(0, len(self._buffer), size):
                data = self._buffer[off:off+size]
                yield code(data)
        else:
            raise TypeError(
                'code must be a struct format string or a Structure class')
```

```python
>>> f = open('polys.bin', 'rb')
>>> phead = PolyHeader.from_file(f)
>>> phead.num_polys
3
>>> polydata = [ SizedRecord.from_file(f, '<i')
...             for n in range(phead.num_polys) ]
>>> polydata
[<__main__.SizedRecord object at 0x1006a4d50>,
<__main__.SizedRecord object at 0x1006a4f50>,
<__main__.SizedRecord object at 0x10070da90>]
>>>
```

```python
>>> for n, poly in enumerate(polydata):
...     print('Polygon', n)
...     for p in poly.iter_as('<dd'):
...         print(p)
...
Polygon 0
(1.0, 2.5)
(3.5, 4.0)
(2.5, 1.5)
Polygon 1
(7.0, 1.2)
(5.1, 3.0)
(0.5, 7.5)
(0.8, 9.0)
Polygon 2
(3.4, 6.3)
(1.2, 0.5)
(4.6, 9.2)
>>>

>>> for n, poly in enumerate(polydata):
...     print('Polygon', n)
...     for p in poly.iter_as(Point):
...         print(p.x, p.y)
...
Polygon 0
1.0 2.5
3.5 4.0
2.5 1.5
Polygon 1
7.0 1.2
5.1 3.0
0.5 7.5
0.8 9.0
Polygon 2
3.4 6.3
1.2 0.5
4.6 9.2
>>>
```

```python
class Point(Structure):
    '''A pair of little-endian doubles: x followed by y.'''
    _fields_ = [
        ('<d', 'x'),
        ('d', 'y'),
    ]

class PolyHeader(Structure):
    '''
    Polygon file header: file code, bounding box as two nested Point
    structures, and the number of polygon records that follow.
    '''
    _fields_ = [
        ('<i', 'file_code'),
        (Point, 'min'),
        (Point, 'max'),
        ('i', 'num_polys'),
    ]

def read_polys(filename):
    '''
    Read a polygon file and return its polygons as a list of lists of
    (x, y) tuples.
    '''
    polys = []
    with open(filename, 'rb') as f:
        phead = PolyHeader.from_file(f)
        for n in range(phead.num_polys):
            # Each polygon is one record prefixed by a '<i' length
            rec = SizedRecord.from_file(f, '<i')
            poly = [ (p.x, p.y) for p in rec.iter_as(Point) ]
            polys.append(poly)
    return polys
```

## 讨论

`StructureMeta` 的一个很微妙的地方就是它会固定字节顺序。也就是说，如果某个字段指定了字节顺序（`<` 表示低位优先，`>` 表示高位优先），那么后面所有字段都以这个顺序为准。这么做可以避免重复输入，但是在定义的中间我们仍然可以切换顺序。比如，你可能有一些比较复杂的结构，就像下面这样：

```python
class ShapeFile(Structure):
    '''
    Example of switching byte order in the middle of a definition:
    'unused' and 'file_length' inherit big-endian from 'file_code',
    and every field from 'version' onward is little-endian.
    '''
    _fields_ = [ ('>i', 'file_code'), # Big endian
                 ('20s', 'unused'),
                 ('i', 'file_length'),
                 ('<i', 'version'), # Little endian
                 ('i', 'shape_type'),
                 ('d', 'min_x'),
                 ('d', 'min_y'),
                 ('d', 'max_x'),
                 ('d', 'max_y'),
                 ('d', 'min_z'),
                 ('d', 'max_z'),
                 ('d', 'min_m'),
                 ('d', 'max_m') ]
```