Add ability to partition sets of UUIDs by infix (#7528)

hannes-ucsc · achave11-ucsc · commit f07f2032efef · 2025-11-17T11:02:12.000-08:00
diff --git a/src/azul/indexer/__init__.py b/src/azul/indexer/__init__.py
@@ -588,10 +588,3 @@ class BundlePartition(UUIDPartition):
 
     def divisions(self, num_entities: int) -> int:
         return math.ceil(num_entities / self.max_partition_size)
-
-    def __attrs_post_init__(self):
-        super().__attrs_post_init__()
-        # Most bits in a v4 or v5 UUID are pseudo-random, including the leading
-        # 32 bits but those are followed by a couple of deterministic ones.
-        # For simplicity, we'll limit ourselves to 2 ** 32 leaf partitions.
-        assert self.prefix_length <= 32, R('Too many partitions', self.prefix_length)
diff --git a/src/azul/uuids.py b/src/azul/uuids.py
@@ -1,6 +1,9 @@
 from hashlib import (
     sha1,
 )
+from itertools import (
+    accumulate,
+)
 import math
 from typing import (
     Any,
@@ -18,6 +21,7 @@
 
 from azul import (
     R,
+    cached_property,
 )
 from azul.types import (
     JSON,
@@ -181,43 +185,94 @@ class UUIDPartition(metaclass=UUIDPartitionMeta):
     #:
     prefix: int
 
+    #: The canonical string representation of UUIDs has five groups of
+    #: hexadecimal digits separated by dashes. The first group is eight digits
+    #: long, the last group twelve and the three groups in between are four
+    #: digits long. The first and the last group are best suited for a random
+    #: distribution of v4 and v5 UUIDs across partitions. By default, UUID
+    #: partitions use the first group.
+    #:
+    group: int = 0
+
     #: The partition that includes all UUIDs. Since this attribute holds an
     #: instance of this class, we can't initialize it here, but have to do so in
     #: the metaclass constructor.
     #:
     root: ClassVar[Self]
 
+    #: The width of each group in bits.
+    #:
+    group_lengths: ClassVar[tuple[int, ...]]
+    group_lengths = tuple(4 * n for n in [8, 4, 4, 4, 12])
+
+    #: For each group, the number of high-order bits that precede that group
+    #: in the binary, 128-bit-wide representation of a UUID, i.e. the offset
+    #: of the group's most significant bit from the top of the UUID.
+    #:
+    group_shifts: ClassVar[tuple[int, ...]]
+    group_shifts = tuple(accumulate(group_lengths[:-1], initial=0))
+
     def __attrs_post_init__(self):
         """
         >>> UUIDPartition(prefix_length=0, prefix=1)
         ... # doctest: +NORMALIZE_WHITESPACE
         Traceback (most recent call last):
         ...
         AssertionError: R('If prefix length is 0, the prefix must be, too',
-        UUIDPartition(prefix_length=0, prefix=1))
+        UUIDPartition(prefix_length=0, prefix=1, group=0))
 
         >>> UUIDPartition(prefix_length=1, prefix=3)
         ... # doctest: +NORMALIZE_WHITESPACE
         Traceback (most recent call last):
         ...
         AssertionError: R('Prefix has extra high-order bits set',
-        UUIDPartition(prefix_length=1, prefix=3))
+        UUIDPartition(prefix_length=1, prefix=3, group=0))
+
+        >>> UUIDPartition(prefix_length=1, prefix=0, group=5)
+        ... # doctest: +NORMALIZE_WHITESPACE
+        Traceback (most recent call last):
+        ...
+        AssertionError: R('Invalid group',
+        UUIDPartition(prefix_length=1, prefix=0, group=5))
+
+        >>> UUIDPartition(prefix_length=1, prefix=0, group=-1)
+        ... # doctest: +NORMALIZE_WHITESPACE
+        Traceback (most recent call last):
+        ...
+        AssertionError: R('Invalid group',
+        UUIDPartition(prefix_length=1, prefix=0, group=-1))
 
+        >>> UUIDPartition(prefix_length=49, prefix=0, group=4)
+        Traceback (most recent call last):
+        ...
+        AssertionError: R('Length of prefix exceeds that of group', 49, 48)
+
+        >>> UUIDPartition(prefix_length=17, prefix=0, group=1)
+        Traceback (most recent call last):
+        ...
+        AssertionError: R('Length of prefix exceeds that of group', 17, 16)
         """
         assert self.prefix_length != 0 or self.prefix == 0, R(
             'If prefix length is 0, the prefix must be, too', self)
+        assert 0 <= self.group < len(self.group_shifts), R(
+            'Invalid group', self)
+        group_length = self.group_lengths[self.group]
+        assert self.prefix_length <= group_length, R(
+            'Length of prefix exceeds that of group', self.prefix_length, group_length)
         assert 0 <= self.prefix < 2 ** self.prefix_length, R(
             'Prefix has extra high-order bits set', self)
 
     @classmethod
     def from_json(cls, json: JSON) -> Self:
         return cls(prefix_length=json_int(json['prefix_length']),
-                   prefix=json_int(json['prefix']))
+                   prefix=json_int(json['prefix']),
+                   group=json_int(json['group']))
 
     def to_json(self) -> MutableJSON:
         return {
             'prefix_length': self.prefix_length,
-            'prefix': self.prefix
+            'prefix': self.prefix,
+            'group': self.group
         }
 
     def contains(self, member: UUID) -> bool:
@@ -229,10 +284,30 @@ def contains(self, member: UUID) -> bool:
         True
         >>> p.contains(UUID('ffd4524e-14c4-41d7-9071-6cadab09d75c'))
         True
+
+        >>> p = UUIDPartition(prefix_length=5, prefix=0b0110_0, group=4)
+        >>> p.contains(UUID('fdd4524e-14c4-41d7-9071-66adab09d75c'))
+        True
+        >>> p.contains(UUID('fdd4524e-14c4-41d7-9071-67adab09d75c'))
+        True
+        >>> p.contains(UUID('fdd4524e-14c4-41d7-9071-68adab09d75c'))
+        False
+
+        >>> p = UUIDPartition(prefix_length=48, prefix=0x68adab09d75c, group=4)
+        >>> p.contains(UUID('fdd4524e-14c4-41d7-9071-68adab09d75c'))
+        True
+        >>> p.contains(UUID('fdd4524e-14c4-41d7-9071-68adab09d75d'))
+        False
         """
-        # UUIDs are 128 bit integers
-        shift = 128 - self.prefix_length
-        return member.int >> shift == self.prefix
+        mask, shift = self._mask_and_shift
+        return (member.int & mask) >> shift == self.prefix
+
+    @cached_property
+    def _mask_and_shift(self) -> tuple[int, int]:
+        group_shift = self.group_shifts[self.group]
+        shift = 128 - self.prefix_length - group_shift
+        mask = (1 << (128 - group_shift)) - 1
+        return mask, shift
 
     def divide(self, num_divisions: int) -> list[Self]:
         """
@@ -250,18 +325,24 @@ def divide(self, num_divisions: int) -> list[Self]:
 
         >>> sorted(UUIDPartition.root.divide(3))
         ... # doctest: +NORMALIZE_WHITESPACE
-        [UUIDPartition(prefix_length=2, prefix=0),
-        UUIDPartition(prefix_length=2, prefix=1),
-        UUIDPartition(prefix_length=2, prefix=2),
-        UUIDPartition(prefix_length=2, prefix=3)]
+        [UUIDPartition(prefix_length=2, prefix=0, group=0),
+        UUIDPartition(prefix_length=2, prefix=1, group=0),
+        UUIDPartition(prefix_length=2, prefix=2, group=0),
+        UUIDPartition(prefix_length=2, prefix=3, group=0)]
+
+        >>> UUIDPartition(prefix_length=2, prefix=0, group=4).divide(2)
+        ... # doctest: +NORMALIZE_WHITESPACE
+        [UUIDPartition(prefix_length=3, prefix=0, group=4),
+        UUIDPartition(prefix_length=3, prefix=1, group=4)]
         """
         assert num_divisions > 0, R('Number of divisions must be 1 or more')
         prefix_length = math.ceil(math.log2(num_divisions))
         num_divisions = 2 ** prefix_length
         cls = type(self)
         return [
             cls(prefix_length=self.prefix_length + prefix_length,
-                prefix=(self.prefix << prefix_length) + prefix)
+                prefix=(self.prefix << prefix_length) + prefix,
+                group=self.group)
             for prefix in range(num_divisions)
         ]
 
@@ -273,28 +354,28 @@ def __str__(self) -> str:
         returned by this function.
 
         >>> str(UUIDPartition.root)
-        '-'
+        '-@0'
 
                                                       0b1111_1110 == 0xfe
                                                       0b1111_1111 == 0xff
-        >>> str(UUIDPartition(prefix_length=7, prefix=0b1111_111))
-        'fe-ff'
+        >>> str(UUIDPartition(prefix_length=7, prefix=0b1111_111, group=4))
+        'fe-ff@4'
 
         Leading zeroes in the high and low end of the range:
 
                                                       0b0000_1110 == 0x0e
                                                       0b0000_1111 == 0x0f
-        >>> str(UUIDPartition(prefix_length=7, prefix=0b0000_111))
-        '0e-0f'
+        >>> str(UUIDPartition(prefix_length=7, prefix=0b0000_111, group=4))
+        '0e-0f@4'
 
         A partition twice as big (a binary prefix that's one bit shorter):
 
                                                       0b0000_1100 = 0x0c
                                                       0b0000_1101 = 0x0d
                                                       0b0000_1110 = 0x0e
                                                       0b0000_1111 = 0x0f
-        >>> str(UUIDPartition(prefix_length=6, prefix=0b0000_11))
-        '0c-0f'
+        >>> str(UUIDPartition(prefix_length=6, prefix=0b0000_11, group=4))
+        '0c-0f@4'
         """
         shift = 4 - self.prefix_length % 4  # shift to align at nibble boundary
         all_ones = (1 << shift) - 1
@@ -306,7 +387,7 @@ def __str__(self) -> str:
         def hex(i):
             return format(i, f'0{hex_len}x')[:hex_len]
 
-        return '-'.join(map(hex, (lo, hi)))
+        return f'{hex(lo)}-{hex(hi)}@{self.group}'
 
 
 def uuid5_for_bytes(namespace: UUID, name: bytes) -> UUID: