Skip to content

Commit ae6cfd6

Browse files
[DRIVERS-2926] [PYTHON-4577] BSON Binary Vector Subtype Support (#1813)
Co-authored-by: Steven Silvester <[email protected]> Co-authored-by: Steven Silvester <[email protected]>
1 parent 545b88c commit ae6cfd6

File tree

10 files changed

+519
-10
lines changed

10 files changed

+519
-10
lines changed

.evergreen/resync-specs.sh

+3
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,9 @@ do
7676
atlas-data-lake-testing|data_lake)
7777
cpjson atlas-data-lake-testing/tests/ data_lake
7878
;;
79+
bson-binary-vector|bson_binary_vector)
80+
cpjson bson-binary-vector/tests/ bson_binary_vector
81+
;;
7982
bson-corpus|bson_corpus)
8083
cpjson bson-corpus/tests/ bson_corpus
8184
;;

bson/binary.py

+146-6
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,10 @@
1313
# limitations under the License.
1414
from __future__ import annotations
1515

16-
from typing import TYPE_CHECKING, Any, Tuple, Type, Union
16+
import struct
17+
from dataclasses import dataclass
18+
from enum import Enum
19+
from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union
1720
from uuid import UUID
1821

1922
"""Tools for representing BSON binary data.
@@ -191,21 +194,75 @@ class UuidRepresentation:
191194
"""
192195

193196

197+
VECTOR_SUBTYPE = 9
198+
"""**(BETA)** BSON binary subtype for densely packed vector data.
199+
200+
.. versionadded:: 4.10
201+
"""
202+
203+
194204
USER_DEFINED_SUBTYPE = 128
195205
"""BSON binary subtype for any user defined structure.
196206
"""
197207

198208

209+
class BinaryVectorDtype(Enum):
    """**(BETA)** Datatypes of vector subtype.

    :param FLOAT32: (0x27) Pack list of :class:`float` as float32
    :param INT8: (0x03) Pack list of :class:`int` in [-128, 127] as signed int8
    :param PACKED_BIT: (0x10) Pack list of :class:`int` in [0, 255] as unsigned uint8

    The `PACKED_BIT` value represents a special case where the vector values themselves
    can only be one of two values (0 or 1), but these are packed together into groups
    of 8, i.e. one byte. In Python, these are displayed as ints in the range [0, 255].

    Each member's value is a :class:`bytes` object of length one.

    .. versionadded:: 4.10
    """

    # Each value is the one-byte dtype code that Binary.from_vector writes as the
    # first metadata byte and that Binary.as_vector reads back to select a decoder.
    INT8 = b"\x03"
    FLOAT32 = b"\x27"
    PACKED_BIT = b"\x10"
228+
229+
230+
class BinaryVector:
    """**(BETA)** Vector of numbers along with metadata for binary interoperability.

    Two instances compare equal when their ``dtype``, ``padding`` and ``data``
    are all equal. Because ``__eq__`` is defined without ``__hash__``, instances
    are unhashable.

    .. versionadded:: 4.10
    """

    # NOTE: this class deliberately does not use ``@dataclass``. It declares no
    # annotated fields, so the generated ``__eq__`` would compare empty tuples
    # (making every instance equal to every other) and the generated ``__repr__``
    # would print ``BinaryVector()``.
    __slots__ = ("data", "dtype", "padding")

    def __init__(self, data: Sequence[float | int], dtype: BinaryVectorDtype, padding: int = 0):
        """
        :param data: Sequence of numbers representing the mathematical vector.
        :param dtype: The data type stored in binary
        :param padding: The number of bits in the final byte that are to be ignored
          when a vector element's size is less than a byte
          and the length of the vector is not a multiple of 8.
        """
        self.data = data
        self.dtype = dtype
        self.padding = padding

    def __repr__(self) -> str:
        """Return a string showing the dtype, padding and data of this vector."""
        return f"BinaryVector(dtype={self.dtype}, padding={self.padding}, data={self.data})"

    def __eq__(self, other: object) -> bool:
        """Compare field by field; defer to ``other`` if it is not a BinaryVector."""
        if not isinstance(other, BinaryVector):
            return NotImplemented
        return (
            self.dtype == other.dtype
            and self.padding == other.padding
            and self.data == other.data
        )
249+
250+
199251
class Binary(bytes):
200252
"""Representation of BSON binary data.
201253
202-
This is necessary because we want to represent Python strings as
203-
the BSON string type. We need to wrap binary data so we can tell
254+
We want to represent Python strings as the BSON string type.
255+
We need to wrap binary data so that we can tell
204256
the difference between what should be considered binary data and
205257
what should be considered a string when we encode to BSON.
206258
207-
Raises TypeError if `data` is not an instance of :class:`bytes`
208-
or `subtype` is not an instance of :class:`int`.
259+
**(BETA)** Subtype 9 provides a space-efficient representation of 1-dimensional vector data.
260+
Its data is prepended with two bytes of metadata.
261+
The first (dtype) describes its data type, such as float32 or int8.
262+
The second (padding) prescribes the number of bits to ignore in the final byte.
263+
This is relevant when the element size of the dtype is not a multiple of 8.
264+
265+
Raises TypeError if `subtype` is not an instance of :class:`int`.
209266
Raises ValueError if `subtype` is not in [0, 256).
210267
211268
.. note::
@@ -218,7 +275,10 @@ class Binary(bytes):
218275
to use
219276
220277
.. versionchanged:: 3.9
221-
Support any bytes-like type that implements the buffer protocol.
278+
Support any bytes-like type that implements the buffer protocol.
279+
280+
.. versionchanged:: 4.10
281+
**(BETA)** Addition of vector subtype.
222282
"""
223283

224284
_type_marker = 5
@@ -337,6 +397,86 @@ def as_uuid(self, uuid_representation: int = UuidRepresentation.STANDARD) -> UUI
337397
f"cannot decode subtype {self.subtype} to {UUID_REPRESENTATION_NAMES[uuid_representation]}"
338398
)
339399

400+
@classmethod
def from_vector(
    cls: Type[Binary],
    vector: Sequence[float | int],
    dtype: BinaryVectorDtype,
    padding: int = 0,
) -> Binary:
    """**(BETA)** Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers.

    To interpret the representation of the numbers, a data type must be included.
    See :class:`~bson.binary.BinaryVectorDtype` for available types and descriptions.

    The dtype and padding are prepended to the binary data's value.

    :param vector: Sequence of values
    :param dtype: Data type of the values
    :param padding: For fractional bytes, number of bits to ignore at end of vector.
      Must be 0 unless ``dtype`` is ``PACKED_BIT``, in which case it must be in [0, 7].
    :return: Binary packed data identified by dtype and padding.
    :raises ValueError: if ``padding`` is not valid for ``dtype``.
    :raises NotImplementedError: if ``dtype`` is not a supported vector dtype.
    :raises struct.error: if a value does not fit the element type of ``dtype``.

    .. versionadded:: 4.10
    """
    if dtype == BinaryVectorDtype.INT8:  # pack ints in [-128, 127] as signed int8
        format_str = "b"
    elif dtype == BinaryVectorDtype.PACKED_BIT:  # pack ints in [0, 255] as unsigned uint8
        format_str = "B"
    elif dtype == BinaryVectorDtype.FLOAT32:  # pack floats as float32
        format_str = "f"
    else:
        raise NotImplementedError(f"{dtype} not yet supported")

    if dtype != BinaryVectorDtype.PACKED_BIT:
        # Whole-byte element types have no partially used final byte.
        if padding:
            raise ValueError(f"padding does not apply to {dtype=}")
    elif not 0 <= padding <= 7:
        # Padding counts ignored bits within the final byte, so it cannot reach 8.
        raise ValueError(f"padding must be in the range [0, 7], got {padding}")

    metadata = struct.pack("<sB", dtype.value, padding)
    # Force little-endian so the payload matches the header's byte order and is
    # identical on every platform; the default struct mode uses native byte order.
    data = struct.pack(f"<{len(vector)}{format_str}", *vector)
    return cls(metadata + data, subtype=VECTOR_SUBTYPE)
437+
438+
def as_vector(self) -> BinaryVector:
    """**(BETA)** From the Binary, create a list of numbers, along with dtype and padding.

    The first two bytes are read as metadata (dtype code, then padding); the
    remaining bytes are decoded as little-endian values of that dtype.

    :return: BinaryVector
    :raises ValueError: if this Binary is not of the vector subtype, if the dtype
      byte is not a known :class:`~bson.binary.BinaryVectorDtype`, or if a float32
      payload is not a multiple of 4 bytes long.
    :raises NotImplementedError: if the dtype is known but has no decoder here.

    .. versionadded:: 4.10
    """
    if self.subtype != VECTOR_SUBTYPE:
        raise ValueError(f"Cannot decode subtype {self.subtype} as a vector.")

    header_size = 2  # one dtype byte followed by one padding byte
    dtype_code, padding = struct.unpack_from("<sB", self, 0)
    dtype = BinaryVectorDtype(dtype_code)
    n_bytes = len(self) - header_size

    if dtype == BinaryVectorDtype.INT8:
        n_values, element_format = n_bytes, "b"
    elif dtype == BinaryVectorDtype.FLOAT32:
        if n_bytes % 4:
            raise ValueError(
                "Corrupt data. N bytes for a float32 vector must be a multiple of 4."
            )
        n_values, element_format = n_bytes // 4, "f"
    elif dtype == BinaryVectorDtype.PACKED_BIT:
        # data packed as uint8
        n_values, element_format = n_bytes, "B"
    else:
        raise NotImplementedError(f"Binary Vector dtype {dtype.name} not yet supported")

    # Decode explicitly as little-endian; the default struct mode uses native byte
    # order, which would misread float32 values on big-endian platforms.
    values = list(struct.unpack_from(f"<{n_values}{element_format}", self, header_size))
    return BinaryVector(values, dtype, padding)
479+
340480
@property
341481
def subtype(self) -> int:
342482
"""Subtype of this binary data."""

doc/api/bson/binary.rst

+8
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,14 @@
2121
.. autoclass:: UuidRepresentation
2222
:members:
2323

24+
.. autoclass:: BinaryVectorDtype
25+
:members:
26+
:show-inheritance:
27+
28+
.. autoclass:: BinaryVector
29+
:members:
30+
31+
2432
.. autoclass:: Binary(data, subtype=BINARY_SUBTYPE)
2533
:members:
2634
:show-inheritance:

doc/changelog.rst

-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ in this release.
1919

2020
.. _PyMongo 4.10 release notes in JIRA: https://jira.mongodb.org/secure/ReleaseNote.jspa?projectId=10004&version=40553
2121

22-
2322
Changes in Version 4.9.0
2423
-------------------------
2524

test/bson_binary_vector/float32.json

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
{
2+
"description": "Tests of Binary subtype 9, Vectors, with dtype FLOAT32",
3+
"test_key": "vector",
4+
"tests": [
5+
{
6+
"description": "Simple Vector FLOAT32",
7+
"valid": true,
8+
"vector": [127.0, 7.0],
9+
"dtype_hex": "0x27",
10+
"dtype_alias": "FLOAT32",
11+
"padding": 0,
12+
"canonical_bson": "1C00000005766563746F72000A0000000927000000FE420000E04000"
13+
},
14+
{
15+
"description": "Empty Vector FLOAT32",
16+
"valid": true,
17+
"vector": [],
18+
"dtype_hex": "0x27",
19+
"dtype_alias": "FLOAT32",
20+
"padding": 0,
21+
"canonical_bson": "1400000005766563746F72000200000009270000"
22+
},
23+
{
24+
"description": "Infinity Vector FLOAT32",
25+
"valid": true,
26+
"vector": ["-inf", 0.0, "inf"],
27+
"dtype_hex": "0x27",
28+
"dtype_alias": "FLOAT32",
29+
"padding": 0,
30+
"canonical_bson": "2000000005766563746F72000E000000092700000080FF000000000000807F00"
31+
},
32+
{
33+
"description": "FLOAT32 with padding",
34+
"valid": false,
35+
"vector": [127.0, 7.0],
36+
"dtype_hex": "0x27",
37+
"dtype_alias": "FLOAT32",
38+
"padding": 3
39+
}
40+
]
41+
}
42+

test/bson_binary_vector/int8.json

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
{
2+
"description": "Tests of Binary subtype 9, Vectors, with dtype INT8",
3+
"test_key": "vector",
4+
"tests": [
5+
{
6+
"description": "Simple Vector INT8",
7+
"valid": true,
8+
"vector": [127, 7],
9+
"dtype_hex": "0x03",
10+
"dtype_alias": "INT8",
11+
"padding": 0,
12+
"canonical_bson": "1600000005766563746F7200040000000903007F0700"
13+
},
14+
{
15+
"description": "Empty Vector INT8",
16+
"valid": true,
17+
"vector": [],
18+
"dtype_hex": "0x03",
19+
"dtype_alias": "INT8",
20+
"padding": 0,
21+
"canonical_bson": "1400000005766563746F72000200000009030000"
22+
},
23+
{
24+
"description": "Overflow Vector INT8",
25+
"valid": false,
26+
"vector": [128],
27+
"dtype_hex": "0x03",
28+
"dtype_alias": "INT8",
29+
"padding": 0
30+
},
31+
{
32+
"description": "Underflow Vector INT8",
33+
"valid": false,
34+
"vector": [-129],
35+
"dtype_hex": "0x03",
36+
"dtype_alias": "INT8",
37+
"padding": 0
38+
},
39+
{
40+
"description": "INT8 with padding",
41+
"valid": false,
42+
"vector": [127, 7],
43+
"dtype_hex": "0x03",
44+
"dtype_alias": "INT8",
45+
"padding": 3
46+
},
47+
{
48+
"description": "INT8 with float inputs",
49+
"valid": false,
50+
"vector": [127.77, 7.77],
51+
"dtype_hex": "0x03",
52+
"dtype_alias": "INT8",
53+
"padding": 0
54+
}
55+
]
56+
}
57+
+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
{
2+
"description": "Tests of Binary subtype 9, Vectors, with dtype PACKED_BIT",
3+
"test_key": "vector",
4+
"tests": [
5+
{
6+
"description": "Simple Vector PACKED_BIT",
7+
"valid": true,
8+
"vector": [127, 7],
9+
"dtype_hex": "0x10",
10+
"dtype_alias": "PACKED_BIT",
11+
"padding": 0,
12+
"canonical_bson": "1600000005766563746F7200040000000910007F0700"
13+
},
14+
{
15+
"description": "Empty Vector PACKED_BIT",
16+
"valid": true,
17+
"vector": [],
18+
"dtype_hex": "0x10",
19+
"dtype_alias": "PACKED_BIT",
20+
"padding": 0,
21+
"canonical_bson": "1400000005766563746F72000200000009100000"
22+
},
23+
{
24+
"description": "PACKED_BIT with padding",
25+
"valid": true,
26+
"vector": [127, 7],
27+
"dtype_hex": "0x10",
28+
"dtype_alias": "PACKED_BIT",
29+
"padding": 3,
30+
"canonical_bson": "1600000005766563746F7200040000000910037F0700"
31+
},
32+
{
33+
"description": "Overflow Vector PACKED_BIT",
34+
"valid": false,
35+
"vector": [256],
36+
"dtype_hex": "0x10",
37+
"dtype_alias": "PACKED_BIT",
38+
"padding": 0
39+
},
40+
{
41+
"description": "Underflow Vector PACKED_BIT",
42+
"valid": false,
43+
"vector": [-1],
44+
"dtype_hex": "0x10",
45+
"dtype_alias": "PACKED_BIT",
46+
"padding": 0
47+
}
48+
]
49+
}
50+

0 commit comments

Comments
 (0)