[DRIVERS-2926] [PYTHON-4577] BSON Binary Vector Subtype Support (#1813)

caseyclements · blink1073 · web-flow · commit ae6cfd6d102d · 2024-09-30T21:13:09.000-05:00
Co-authored-by: Steven Silvester <steve.silvester@mongodb.com>
Co-authored-by: Steven Silvester <steven.silvester@ieee.org>
diff --git a/.evergreen/resync-specs.sh b/.evergreen/resync-specs.sh
@@ -76,6 +76,9 @@ do
     atlas-data-lake-testing|data_lake)
       cpjson atlas-data-lake-testing/tests/ data_lake
       ;;
+    bson-binary-vector|bson_binary_vector)
+      cpjson bson-binary-vector/tests/ bson_binary_vector
+      ;;
     bson-corpus|bson_corpus)
       cpjson bson-corpus/tests/ bson_corpus
       ;;
diff --git a/bson/binary.py b/bson/binary.py
@@ -13,7 +13,10 @@
 # limitations under the License.
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Tuple, Type, Union
+import struct
+from dataclasses import dataclass
+from enum import Enum
+from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union
 from uuid import UUID
 
 """Tools for representing BSON binary data.
@@ -191,21 +194,75 @@ class UuidRepresentation:
 """
 
 
+VECTOR_SUBTYPE = 9
+"""**(BETA)** BSON binary subtype for densely packed vector data.
+
+.. versionadded:: 4.10
+"""
+
+
 USER_DEFINED_SUBTYPE = 128
 """BSON binary subtype for any user defined structure.
 """
 
 
+class BinaryVectorDtype(Enum):
+    """**(BETA)** Element datatypes available to the vector subtype.
+
+    Every member's value is a :class:`bytes` object of length one.
+
+    :param FLOAT32: (0x27) Pack list of :class:`float` as float32
+    :param INT8: (0x03) Pack list of :class:`int` in [-128, 127] as signed int8
+    :param PACKED_BIT: (0x10) Pack list of :class:`int` in [0, 255] as uint8
+
+    `PACKED_BIT` is a special case: the vector's own elements can only be
+    0 or 1, but they are stored eight to a byte. In Python each of those
+    bytes is shown as an int in the range [0, 255].
+
+    .. versionadded:: 4.10
+    """
+
+    INT8 = bytes([0x03])
+    FLOAT32 = bytes([0x27])
+    PACKED_BIT = bytes([0x10])
+
+
+@dataclass
+class BinaryVector:
+    """**(BETA)** Vector of numbers along with metadata for binary interoperability.
+
+    Two instances compare equal when their ``data``, ``dtype`` and ``padding``
+    are all equal, and ``repr()`` shows all three values.
+
+    .. versionadded:: 4.10
+    """
+
+    __slots__ = ("data", "dtype", "padding")
+
+    # Declare the fields so that @dataclass can see them. Without annotations the
+    # decorator finds no fields at all: the generated __eq__ then compares two empty
+    # tuples (so every instance equals every other) and the generated __repr__
+    # prints just "BinaryVector()".
+    # No defaults are given here because a class-level default would clash with
+    # __slots__; the default for ``padding`` lives in __init__ below.
+    data: Sequence[float | int]
+    dtype: BinaryVectorDtype
+    padding: int
+
+    def __init__(self, data: Sequence[float | int], dtype: BinaryVectorDtype, padding: int = 0):
+        """
+        :param data: Sequence of numbers representing the mathematical vector.
+        :param dtype:  The data type stored in binary
+        :param padding: The number of bits in the final byte that are to be ignored
+          when a vector element's size is less than a byte
+          and the length of the vector is not a multiple of 8.
+        """
+        self.data = data
+        self.dtype = dtype
+        self.padding = padding
+
+
 class Binary(bytes):
     """Representation of BSON binary data.
 
-    This is necessary because we want to represent Python strings as
-    the BSON string type. We need to wrap binary data so we can tell
+    We want to represent Python strings as the BSON string type.
+    We need to wrap binary data so that we can tell
     the difference between what should be considered binary data and
     what should be considered a string when we encode to BSON.
 
-    Raises TypeError if `data` is not an instance of :class:`bytes`
-    or `subtype` is not an instance of :class:`int`.
+    **(BETA)** Subtype 9 provides a space-efficient representation of 1-dimensional vector data.
+    Its data is prepended with two bytes of metadata.
+    The first (dtype) describes its data type, such as float32 or int8.
+    The second (padding) prescribes the number of bits to ignore in the final byte.
+    This is relevant when the element size of the dtype is not a multiple of 8.
+
+    Raises TypeError if `subtype` is not an instance of :class:`int`.
     Raises ValueError if `subtype` is not in [0, 256).
 
     .. note::
@@ -218,7 +275,10 @@ class Binary(bytes):
         to use
 
     .. versionchanged:: 3.9
-      Support any bytes-like type that implements the buffer protocol.
+       Support any bytes-like type that implements the buffer protocol.
+
+    .. versionchanged:: 4.10
+       **(BETA)** Addition of vector subtype.
     """
 
     _type_marker = 5
@@ -337,6 +397,86 @@ def as_uuid(self, uuid_representation: int = UuidRepresentation.STANDARD) -> UUI
             f"cannot decode subtype {self.subtype} to {UUID_REPRESENTATION_NAMES[uuid_representation]}"
         )
 
+    @classmethod
+    def from_vector(
+        cls: Type[Binary],
+        vector: Sequence[float | int],
+        dtype: BinaryVectorDtype,
+        padding: int = 0,
+    ) -> Binary:
+        """**(BETA)** Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers.
+
+        To interpret the representation of the numbers, a data type must be included.
+        See :class:`~bson.binary.BinaryVectorDtype` for available types and descriptions.
+
+        The dtype and padding are prepended to the binary data's value.
+        Both the metadata and the values are written little-endian, regardless of
+        the byte order of the machine doing the packing.
+
+        :param vector: List of values
+        :param dtype: Data type of the values
+        :param padding: For fractional bytes, number of bits to ignore at end of vector.
+          Only meaningful for ``PACKED_BIT``, where it must be in [0, 7].
+        :return: Binary packed data identified by dtype and padding.
+        :raises ValueError: if ``padding`` is non-zero for ``INT8`` or ``FLOAT32``,
+          is outside [0, 7] for ``PACKED_BIT``, or is non-zero for an empty vector.
+        :raises NotImplementedError: if ``dtype`` is not one of the dtypes handled here.
+        :raises struct.error: if a value in ``vector`` cannot be packed as ``dtype``.
+
+        .. versionadded:: 4.10
+        """
+        if dtype == BinaryVectorDtype.INT8:  # pack ints in [-128, 127] as signed int8
+            format_str = "b"
+            if padding:
+                raise ValueError(f"padding does not apply to {dtype=}")
+        elif dtype == BinaryVectorDtype.PACKED_BIT:  # pack ints in [0, 255] as unsigned uint8
+            format_str = "B"
+            # padding counts bits ignored in the final byte, so it cannot exceed 7.
+            if not 0 <= padding <= 7:
+                raise ValueError(f"{padding=}. It must be in [0, 7].")
+            # With no bytes at all there is no final byte for padding to refer to.
+            if padding and len(vector) == 0:
+                raise ValueError("Empty vector with non-zero padding.")
+        elif dtype == BinaryVectorDtype.FLOAT32:  # pack floats as float32
+            format_str = "f"
+            if padding:
+                raise ValueError(f"padding does not apply to {dtype=}")
+        else:
+            raise NotImplementedError(f"{dtype} not yet supported")
+
+        metadata = struct.pack("<sB", dtype.value, padding)
+        # "<" pins the payload to little-endian; without it struct uses the host's
+        # native byte order and float32 data differs between platforms.
+        data = struct.pack(f"<{len(vector)}{format_str}", *vector)
+        return cls(metadata + data, subtype=VECTOR_SUBTYPE)
+
+    def as_vector(self) -> BinaryVector:
+        """**(BETA)** From the Binary, create a list of numbers, along with dtype and padding.
+
+        The first two bytes are read as metadata (dtype, then padding); everything
+        after them is decoded as little-endian values of that dtype.
+
+        :return: BinaryVector
+        :raises ValueError: if this Binary is not of the vector subtype, if it is too
+          short to hold the two metadata bytes, if the dtype byte is not a known
+          :class:`~bson.binary.BinaryVectorDtype`, or if a float32 payload is not a
+          multiple of 4 bytes long.
+
+        .. versionadded:: 4.10
+        """
+
+        if self.subtype != VECTOR_SUBTYPE:
+            raise ValueError(f"Cannot decode subtype {self.subtype} as a vector.")
+
+        header_size = 2  # one dtype byte followed by one padding byte
+        if len(self) < header_size:
+            raise ValueError(
+                "Corrupt data. A vector must start with a dtype byte and a padding byte."
+            )
+
+        raw_dtype, padding = struct.unpack_from("<sB", self, 0)
+        dtype = BinaryVectorDtype(raw_dtype)
+        n_bytes = len(self) - header_size
+
+        # "<" pins decoding to little-endian; without it struct uses the host's
+        # native byte order and float32 data is misread on big-endian machines.
+        if dtype == BinaryVectorDtype.INT8:
+            values = struct.unpack_from(f"<{n_bytes}b", self, header_size)
+
+        elif dtype == BinaryVectorDtype.FLOAT32:
+            if n_bytes % 4:
+                raise ValueError(
+                    "Corrupt data. N bytes for a float32 vector must be a multiple of 4."
+                )
+            values = struct.unpack_from(f"<{n_bytes // 4}f", self, header_size)
+
+        elif dtype == BinaryVectorDtype.PACKED_BIT:
+            # data packed as uint8
+            values = struct.unpack_from(f"<{n_bytes}B", self, header_size)
+
+        else:
+            raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name)
+
+        return BinaryVector(list(values), dtype, padding)
+
     @property
     def subtype(self) -> int:
         """Subtype of this binary data."""
diff --git a/doc/api/bson/binary.rst b/doc/api/bson/binary.rst
@@ -21,6 +21,14 @@
    .. autoclass:: UuidRepresentation
       :members:
 
+   .. autoclass:: BinaryVectorDtype
+      :members:
+      :show-inheritance:
+
+   .. autoclass:: BinaryVector
+      :members:
+
+
    .. autoclass:: Binary(data, subtype=BINARY_SUBTYPE)
       :members:
       :show-inheritance:
diff --git a/doc/changelog.rst b/doc/changelog.rst
@@ -19,7 +19,6 @@ in this release.
 
 .. _PyMongo 4.10 release notes in JIRA: https://jira.mongodb.org/secure/ReleaseNote.jspa?projectId=10004&version=40553
 
-
 Changes in Version 4.9.0
 -------------------------
 
diff --git a/test/bson_binary_vector/float32.json b/test/bson_binary_vector/float32.json
@@ -0,0 +1,42 @@
+{
+  "description": "Tests of Binary subtype 9, Vectors, with dtype FLOAT32",
+  "test_key": "vector",
+  "tests": [
+    {
+      "description": "Simple Vector FLOAT32",
+      "valid": true,
+      "vector": [127.0, 7.0],
+      "dtype_hex": "0x27",
+      "dtype_alias": "FLOAT32",
+      "padding": 0,
+      "canonical_bson": "1C00000005766563746F72000A0000000927000000FE420000E04000"
+    },
+    {
+      "description": "Empty Vector FLOAT32",
+      "valid": true,
+      "vector": [],
+      "dtype_hex": "0x27",
+      "dtype_alias": "FLOAT32",
+      "padding": 0,
+      "canonical_bson": "1400000005766563746F72000200000009270000"
+    },
+    {
+      "description": "Infinity Vector FLOAT32",
+      "valid": true,
+      "vector": ["-inf", 0.0, "inf"],
+      "dtype_hex": "0x27",
+      "dtype_alias": "FLOAT32",
+      "padding": 0,
+      "canonical_bson": "2000000005766563746F72000E000000092700000080FF000000000000807F00"
+    },
+    {
+      "description": "FLOAT32 with padding",
+      "valid": false,
+      "vector": [127.0, 7.0],
+      "dtype_hex": "0x27",
+      "dtype_alias": "FLOAT32",
+      "padding": 3
+    }
+  ]
+}
+
diff --git a/test/bson_binary_vector/int8.json b/test/bson_binary_vector/int8.json
@@ -0,0 +1,57 @@
+{
+  "description": "Tests of Binary subtype 9, Vectors, with dtype INT8",
+  "test_key": "vector",
+  "tests": [
+    {
+      "description": "Simple Vector INT8",
+      "valid": true,
+      "vector": [127, 7],
+      "dtype_hex": "0x03",
+      "dtype_alias": "INT8",
+      "padding": 0,
+      "canonical_bson": "1600000005766563746F7200040000000903007F0700"
+    },
+    {
+      "description": "Empty Vector INT8",
+      "valid": true,
+      "vector": [],
+      "dtype_hex": "0x03",
+      "dtype_alias": "INT8",
+      "padding": 0,
+      "canonical_bson": "1400000005766563746F72000200000009030000"
+    },
+    {
+      "description": "Overflow Vector INT8",
+      "valid": false,
+      "vector": [128],
+      "dtype_hex": "0x03",
+      "dtype_alias": "INT8",
+      "padding": 0
+    },
+    {
+      "description": "Underflow Vector INT8",
+      "valid": false,
+      "vector": [-129],
+      "dtype_hex": "0x03",
+      "dtype_alias": "INT8",
+      "padding": 0
+    },
+    {
+      "description": "INT8 with padding",
+      "valid": false,
+      "vector": [127, 7],
+      "dtype_hex": "0x03",
+      "dtype_alias": "INT8",
+      "padding": 3
+    },
+    {
+      "description": "INT8 with float inputs",
+      "valid": false,
+      "vector": [127.77, 7.77],
+      "dtype_hex": "0x03",
+      "dtype_alias": "INT8",
+      "padding": 0
+    }
+  ]
+}
+
diff --git a/test/bson_binary_vector/packed_bit.json b/test/bson_binary_vector/packed_bit.json
@@ -0,0 +1,50 @@
+{
+  "description": "Tests of Binary subtype 9, Vectors, with dtype PACKED_BIT",
+  "test_key": "vector",
+  "tests": [
+    {
+      "description": "Simple Vector PACKED_BIT",
+      "valid": true,
+      "vector": [127, 7],
+      "dtype_hex": "0x10",
+      "dtype_alias": "PACKED_BIT",
+      "padding": 0,
+      "canonical_bson": "1600000005766563746F7200040000000910007F0700"
+    },
+    {
+      "description": "Empty Vector PACKED_BIT",
+      "valid": true,
+      "vector": [],
+      "dtype_hex": "0x10",
+      "dtype_alias": "PACKED_BIT",
+      "padding": 0,
+      "canonical_bson": "1400000005766563746F72000200000009100000"
+    },
+    {
+      "description": "PACKED_BIT with padding",
+      "valid": true,
+      "vector": [127, 7],
+      "dtype_hex": "0x10",
+      "dtype_alias": "PACKED_BIT",
+      "padding": 3,
+      "canonical_bson": "1600000005766563746F7200040000000910037F0700"
+    },
+    {
+      "description": "Overflow Vector PACKED_BIT",
+      "valid": false,
+      "vector": [256],
+      "dtype_hex": "0x10",
+      "dtype_alias": "PACKED_BIT",
+      "padding": 0
+    },
+    {
+      "description": "Underflow Vector PACKED_BIT",
+      "valid": false,
+      "vector": [-1],
+      "dtype_hex": "0x10",
+      "dtype_alias": "PACKED_BIT",
+      "padding": 0
+    }
+  ]
+}
+
diff --git a/test/bson_corpus/binary.json b/test/bson_corpus/binary.json
diff --git a/test/test_bson.py b/test/test_bson.py
diff --git a/test/test_bson_binary_vector.py b/test/test_bson_binary_vector.py