13
13
# limitations under the License.
14
14
from __future__ import annotations
15
15
16
- from typing import TYPE_CHECKING , Any , Tuple , Type , Union
16
+ import struct
17
+ from dataclasses import dataclass
18
+ from enum import Enum
19
+ from typing import TYPE_CHECKING , Any , Sequence , Tuple , Type , Union
17
20
from uuid import UUID
18
21
19
22
"""Tools for representing BSON binary data.
@@ -191,21 +194,75 @@ class UuidRepresentation:
191
194
"""
192
195
193
196
197
# Subtype tag written by Binary.from_vector and required by Binary.as_vector.
VECTOR_SUBTYPE = 9
"""**(BETA)** BSON binary subtype for densely packed vector data.

.. versionadded:: 4.10
"""


USER_DEFINED_SUBTYPE = 128
"""BSON binary subtype for any user defined structure.
"""
197
207
198
208
209
class BinaryVectorDtype(Enum):
    """**(BETA)** Datatypes of vector subtype.

    :param FLOAT32: (0x27) Pack list of :class:`float` as float32
    :param INT8: (0x03) Pack list of :class:`int` in [-128, 127] as signed int8
    :param PACKED_BIT: (0x10) Pack list of :class:`int` in [0, 255] as unsigned uint8

    The `PACKED_BIT` value represents a special case where vector values themselves
    can only be of two values (0 or 1) but these are packed together into groups of 8,
    a byte. In Python, these are displayed as ints in range [0, 255].

    Each value is of type bytes with a length of one.

    .. versionadded:: 4.10
    """

    # Each tag must be exactly one byte: it is written with struct format "s"
    # (a single byte) and looked up again by value when decoding.
    INT8 = b"\x03"
    FLOAT32 = b"\x27"
    PACKED_BIT = b"\x10"
228
+
229
+
230
@dataclass
class BinaryVector:
    """**(BETA)** Vector of numbers along with metadata for binary interoperability.

    .. versionadded:: 4.10
    """

    __slots__ = ("data", "dtype", "padding")

    def __init__(self, data: Sequence[float | int], dtype: BinaryVectorDtype, padding: int = 0):
        """
        :param data: Sequence of numbers representing the mathematical vector.
        :param dtype: The data type stored in binary
        :param padding: The number of bits in the final byte that are to be ignored
          when a vector element's size is less than a byte
          and the length of the vector is not a multiple of 8.
        """
        self.data = data
        self.dtype = dtype
        self.padding = padding

    # The class declares no annotated fields, so the methods @dataclass would
    # generate see an empty field list: every instance would compare equal and
    # repr() would show no state. Define both explicitly over the slots.
    def __repr__(self) -> str:
        """Return a representation showing dtype, padding and data."""
        return f"BinaryVector(dtype={self.dtype}, padding={self.padding}, data={self.data})"

    def __eq__(self, other: object) -> bool:
        """Compare dtype, padding and data; defer to ``other`` for foreign types."""
        if not isinstance(other, BinaryVector):
            return NotImplemented
        return (
            self.dtype == other.dtype
            and self.padding == other.padding
            and self.data == other.data
        )
249
+
250
+
199
251
class Binary (bytes ):
200
252
"""Representation of BSON binary data.
201
253
202
- This is necessary because we want to represent Python strings as
203
- the BSON string type. We need to wrap binary data so we can tell
254
+ We want to represent Python strings as the BSON string type.
255
+ We need to wrap binary data so that we can tell
204
256
the difference between what should be considered binary data and
205
257
what should be considered a string when we encode to BSON.
206
258
207
- Raises TypeError if `data` is not an instance of :class:`bytes`
208
- or `subtype` is not an instance of :class:`int`.
259
+ **(BETA)** Subtype 9 provides a space-efficient representation of 1-dimensional vector data.
260
+ Its data is prepended with two bytes of metadata.
261
+ The first (dtype) describes its data type, such as float32 or int8.
262
+ The second (padding) prescribes the number of bits to ignore in the final byte.
263
+ This is relevant when the element size of the dtype is not a multiple of 8.
264
+
265
+ Raises TypeError if `subtype` is not an instance of :class:`int`.
209
266
Raises ValueError if `subtype` is not in [0, 256).
210
267
211
268
.. note::
@@ -218,7 +275,10 @@ class Binary(bytes):
218
275
to use
219
276
220
277
.. versionchanged:: 3.9
221
- Support any bytes-like type that implements the buffer protocol.
278
+ Support any bytes-like type that implements the buffer protocol.
279
+
280
+ .. versionchanged:: 4.10
281
+ **(BETA)** Addition of vector subtype.
222
282
"""
223
283
224
284
_type_marker = 5
@@ -337,6 +397,86 @@ def as_uuid(self, uuid_representation: int = UuidRepresentation.STANDARD) -> UUI
337
397
f"cannot decode subtype { self .subtype } to { UUID_REPRESENTATION_NAMES [uuid_representation ]} "
338
398
)
339
399
400
@classmethod
def from_vector(
    cls: Type[Binary],
    vector: Sequence[float | int],
    dtype: BinaryVectorDtype,
    padding: int = 0,
) -> Binary:
    """**(BETA)** Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers.

    To interpret the representation of the numbers, a data type must be included.
    See :class:`~bson.binary.BinaryVectorDtype` for available types and descriptions.

    The dtype and padding are prepended to the binary data's value.

    :param vector: List of values
    :param dtype: Data type of the values
    :param padding: For fractional bytes, number of bits to ignore at end of vector.
    :return: Binary packed data identified by dtype and padding.
    :raises ValueError: if `padding` is non-zero for a dtype whose elements
        occupy whole bytes (INT8, FLOAT32).
    :raises NotImplementedError: if `dtype` is not a supported
        :class:`~bson.binary.BinaryVectorDtype`.

    .. versionadded:: 4.10
    """
    if dtype == BinaryVectorDtype.INT8:  # pack ints in [-128, 127] as signed int8
        format_str = "b"
    elif dtype == BinaryVectorDtype.PACKED_BIT:  # pack ints in [0, 255] as unsigned uint8
        format_str = "B"
    elif dtype == BinaryVectorDtype.FLOAT32:  # pack floats as float32
        format_str = "f"
    else:
        raise NotImplementedError(f"{dtype} not yet supported")

    # Only PACKED_BIT can end on a partially used byte.
    if padding and dtype != BinaryVectorDtype.PACKED_BIT:
        raise ValueError(f"padding does not apply to {dtype=}")

    metadata = struct.pack("<sB", dtype.value, padding)
    # Explicit little-endian: without the "<" prefix struct uses the host's
    # native byte order, so float32 payloads would differ between platforms.
    data = struct.pack(f"<{len(vector)}{format_str}", *vector)
    return cls(metadata + data, subtype=VECTOR_SUBTYPE)
437
+
438
def as_vector(self) -> BinaryVector:
    """**(BETA)** From the Binary, create a list of numbers, along with dtype and padding.

    :return: BinaryVector
    :raises ValueError: if this Binary's subtype is not the vector subtype,
        if the dtype byte is not a known :class:`~bson.binary.BinaryVectorDtype`,
        or if a float32 payload's length is not a multiple of 4.

    .. versionadded:: 4.10
    """
    if self.subtype != VECTOR_SUBTYPE:
        raise ValueError(f"Cannot decode subtype {self.subtype} as a vector.")

    # The first two bytes are metadata: the dtype tag, then the padding count.
    position = 0
    dtype_tag, padding = struct.unpack_from("<sB", self, position)
    position += 2
    dtype = BinaryVectorDtype(dtype_tag)
    n_bytes = len(self) - position

    # All payload formats carry an explicit "<" so decoding is little-endian
    # regardless of the host's native byte order.
    if dtype == BinaryVectorDtype.INT8:
        vector = list(struct.unpack_from(f"<{n_bytes}b", self, position))
        return BinaryVector(vector, dtype, padding)

    if dtype == BinaryVectorDtype.FLOAT32:
        if n_bytes % 4:
            raise ValueError(
                "Corrupt data. N bytes for a float32 vector must be a multiple of 4."
            )
        vector = list(struct.unpack_from(f"<{n_bytes // 4}f", self, position))
        return BinaryVector(vector, dtype, padding)

    if dtype == BinaryVectorDtype.PACKED_BIT:
        # data packed as uint8; the bits are left grouped, one int per byte
        unpacked_uint8s = list(struct.unpack_from(f"<{n_bytes}B", self, position))
        return BinaryVector(unpacked_uint8s, dtype, padding)

    raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name)
479
+
340
480
@property
341
481
def subtype (self ) -> int :
342
482
"""Subtype of this binary data."""
0 commit comments