Skip to content
This repository was archived by the owner on Jul 18, 2024. It is now read-only.

Commit b17a3a2

Browse files
ahsanbarkati, NamanJain8, and manishrjain
authored
feat(n-split): Implement n-split (#30)
- Implement the Split function, to split bitmap based on the given maxSize and external size function. - Implement InitSpaceForKeys, and optimize `FromSorted()` - Fix pointer bug by setting _ptr to nil, so that GC can do its work once we are done with the buffer. Co-authored-by: NamanJain8 <[email protected]> Co-authored-by: Manish R Jain <[email protected]>
1 parent 397b018 commit b17a3a2

File tree

4 files changed

+195
-20
lines changed

4 files changed

+195
-20
lines changed

bitmap.go

+153-20
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ type Bitmap struct {
3434
data []uint16
3535
keys node
3636

37+
// _ptr is only set when we start with a []byte instead of a []uint16. In that
38+
// case data is an unsafe conversion of the []byte, and hence does NOT own a
39+
// valid pointer to the underlying array; _ptr keeps that array away from the GC.
3740
_ptr []byte
3841

3942
// memMoved keeps track of how many uint16 moves we had to do. The smaller
@@ -44,33 +47,34 @@ type Bitmap struct {
4447
// FromBuffer returns a pointer to bitmap corresponding to the given buffer. This bitmap shouldn't
4548
// be modified because it might corrupt the given buffer.
4649
func FromBuffer(data []byte) *Bitmap {
50+
assert(len(data)%2 == 0)
4751
if len(data) < 8 {
4852
return NewBitmap()
4953
}
5054
du := toUint16Slice(data)
5155
x := toUint64Slice(du[:4])[indexNodeSize]
5256
return &Bitmap{
5357
data: du,
54-
_ptr: data,
58+
_ptr: data, // Keep a hold of data, otherwise GC would do its thing.
5559
keys: toUint64Slice(du[:x]),
5660
}
5761
}
5862

5963
// FromBufferWithCopy creates a copy of the given buffer and returns a bitmap based on the copied
6064
// buffer. This bitmap is safe for both read and write operations.
61-
func FromBufferWithCopy(data []byte) *Bitmap {
62-
if len(data) < 8 {
65+
func FromBufferWithCopy(src []byte) *Bitmap {
66+
assert(len(src)%2 == 0)
67+
if len(src) < 8 {
6368
return NewBitmap()
6469
}
65-
dup := make([]byte, len(data))
66-
copy(dup, data)
67-
du := toUint16Slice(dup)
68-
x := toUint64Slice(du[:4])[indexNodeSize]
70+
src16 := toUint16Slice(src)
71+
dst16 := make([]uint16, len(src16))
72+
copy(dst16, src16)
73+
x := toUint64Slice(dst16[:4])[indexNodeSize]
6974

7075
return &Bitmap{
71-
data: du,
72-
_ptr: dup,
73-
keys: toUint64Slice(du[:x]),
76+
data: dst16,
77+
keys: toUint64Slice(dst16[:x]),
7478
}
7579
}
7680

@@ -117,6 +121,25 @@ func NewBitmapWith(numKeys int) *Bitmap {
117121
return ra
118122
}
119123

124+
func (ra *Bitmap) initSpaceForKeys(N int) {
125+
if N == 0 {
126+
return
127+
}
128+
curSize := uint64(len(ra.keys) * 4) // U64 -> U16
129+
bySize := uint64(N * 8) // 2xU64 (key, value) -> 2x4xU16
130+
131+
// The following code is borrowed from setKey.
132+
ra.scootRight(curSize, bySize)
133+
ra.keys = toUint64Slice(ra.data[:curSize+bySize])
134+
ra.keys.setNodeSize(int(curSize + bySize))
135+
assert(1 == ra.keys.numKeys()) // This initialization assumes that the number of keys are 1.
136+
137+
// The containers have moved to the right bySize. So, update their offsets.
138+
// Currently, there's only one container.
139+
val := ra.keys.val(0)
140+
ra.keys.setAt(valOffset(0), val+uint64(bySize))
141+
}
142+
120143
// setKey sets a key and container offset.
121144
func (ra *Bitmap) setKey(k uint64, offset uint64) uint64 {
122145
if added := ra.keys.set(k, offset); !added {
@@ -135,7 +158,7 @@ func (ra *Bitmap) setKey(k uint64, offset uint64) uint64 {
135158
bySize = math.MaxUint16
136159
}
137160

138-
ra.scootRight(curSize, uint16(bySize))
161+
ra.scootRight(curSize, bySize)
139162
ra.keys = toUint64Slice(ra.data[:curSize+bySize])
140163
ra.keys.setNodeSize(int(curSize + bySize))
141164

@@ -151,7 +174,7 @@ func (ra *Bitmap) setKey(k uint64, offset uint64) uint64 {
151174
return offset + bySize
152175
}
153176

154-
func (ra *Bitmap) fastExpand(bySize uint16) {
177+
func (ra *Bitmap) fastExpand(bySize uint64) {
155178
prev := len(ra.keys) * 4 // Multiply by 4 to convert from u16 to u64.
156179

157180
// This following statement also works. But, given how much fastExpand gets
@@ -170,14 +193,15 @@ func (ra *Bitmap) fastExpand(bySize uint16) {
170193
out := make([]uint16, cap(ra.data)+growBy)
171194
copy(out, ra.data)
172195
ra.data = out[:toSize]
196+
ra._ptr = nil // Allow Go to GC whatever this was pointing to.
173197
// Re-reference ra.keys correctly because underlying array has changed.
174198
ra.keys = toUint64Slice(ra.data[:prev])
175199
}
176200

177201
// scootRight isn't aware of containers. It's going to create empty space of
178202
// bySize at the given offset in ra.data. The offset doesn't need to line up
179203
// with a container.
180-
func (ra *Bitmap) scootRight(offset uint64, bySize uint16) {
204+
func (ra *Bitmap) scootRight(offset uint64, bySize uint64) {
181205
left := ra.data[offset:]
182206

183207
ra.fastExpand(bySize) // Expand the buffer.
@@ -198,7 +222,7 @@ func (ra *Bitmap) scootLeft(offset uint64, size uint64) {
198222

199223
func (ra *Bitmap) newContainer(sz uint16) uint64 {
200224
offset := uint64(len(ra.data))
201-
ra.fastExpand(sz)
225+
ra.fastExpand(uint64(sz))
202226
Memclr(ra.data[offset : offset+uint64(sz)])
203227
ra.data[offset] = sz
204228
return offset
@@ -223,7 +247,7 @@ func (ra *Bitmap) expandContainer(offset uint64) {
223247
}
224248

225249
// Select the portion to the right of the container, beyond its right boundary.
226-
ra.scootRight(offset+uint64(sz), bySize)
250+
ra.scootRight(offset+uint64(sz), uint64(bySize))
227251
ra.keys.updateOffsets(offset, uint64(bySize), true)
228252

229253
if sz < 2048 {
@@ -274,7 +298,7 @@ func (ra *Bitmap) copyAt(offset uint64, src []uint16) {
274298
assert(src[indexSize] == maxContainerSize)
275299
bySize := uint16(maxContainerSize) - dstSize
276300
// Select the portion to the right of the container, beyond its right boundary.
277-
ra.scootRight(offset+uint64(dstSize), bySize)
301+
ra.scootRight(offset+uint64(dstSize), uint64(bySize))
278302
ra.keys.updateOffsets(offset, uint64(bySize), true)
279303
assert(copy(ra.data[offset:], src) == len(src))
280304
return
@@ -300,7 +324,7 @@ func (ra *Bitmap) copyAt(offset uint64, src []uint16) {
300324

301325
bySize := uint16(maxContainerSize) - dstSize
302326
// Select the portion to the right of the container, beyond its right boundary.
303-
ra.scootRight(offset+uint64(dstSize), bySize)
327+
ra.scootRight(offset+uint64(dstSize), uint64(bySize))
304328
ra.keys.updateOffsets(offset, uint64(bySize), true)
305329

306330
// Update the space of the container, so getContainer would work correctly.
@@ -315,7 +339,7 @@ func (ra *Bitmap) copyAt(offset uint64, src []uint16) {
315339

316340
// targetSize is not maxSize. Let's expand to targetSize and copy array.
317341
bySize := targetSz - dstSize
318-
ra.scootRight(offset+uint64(dstSize), bySize)
342+
ra.scootRight(offset+uint64(dstSize), uint64(bySize))
319343
ra.keys.updateOffsets(offset, uint64(bySize), true)
320344
assert(copy(ra.data[offset:], src) == len(src))
321345
ra.data[offset] = targetSz
@@ -390,14 +414,15 @@ func FromSortedList(vals []uint64) *Bitmap {
390414
}
391415

392416
// Count the keys beforehand and reserve space for them in one go, so that we don't need to move a lot of memory because of adding keys.
417+
var numKeys int
393418
for _, x := range vals {
394419
hi = x & mask
395420
if hi != 0 && hi != lastHi {
396-
ra.setKey(lastHi, 0)
421+
numKeys++
397422
}
398423
lastHi = hi
399424
}
400-
ra.setKey(lastHi, 0)
425+
ra.initSpaceForKeys(numKeys)
401426

402427
finalize := func(l []uint16, key uint64) {
403428
if len(l) == 0 {
@@ -1184,3 +1209,111 @@ func FastOr(bitmaps ...*Bitmap) *Bitmap {
11841209

11851210
return dst
11861211
}
1212+
1213+
// Split splits the bitmap based on maxSz and the externalSize function. It splits the bitmap
1214+
// such that size of each split bitmap + external size corresponding to its elements approximately
1215+
// equal to maxSz (it can be greater than maxSz sometimes). The splits are returned in sorted order.
1216+
// externalSize is a function that should return the external size corresponding to elements in
1217+
// range [start, end). External size is used to calculate the split boundaries.
1218+
func (bm *Bitmap) Split(externalSize func(start, end uint64) uint64, maxSz uint64) []*Bitmap {
1219+
splitFurther := func(b *Bitmap) []*Bitmap {
1220+
itr := b.NewIterator()
1221+
newBm := NewBitmap()
1222+
var sz uint64
1223+
var bms []*Bitmap
1224+
for id := itr.Next(); id != 0; id = itr.Next() {
1225+
sz += externalSize(id, addUint64(id, 1))
1226+
newBm.Set(id)
1227+
if sz >= maxSz {
1228+
bms = append(bms, newBm)
1229+
newBm = NewBitmap()
1230+
sz = 0
1231+
}
1232+
}
1233+
1234+
if !newBm.IsEmpty() {
1235+
bms = append(bms, newBm)
1236+
}
1237+
return bms
1238+
}
1239+
1240+
create := func(keyToOffset map[uint64]uint64, totalSz uint64) []*Bitmap {
1241+
var keys []uint64
1242+
for key := range keyToOffset {
1243+
keys = append(keys, key)
1244+
}
1245+
sort.Slice(keys, func(i, j int) bool {
1246+
return keys[i] < keys[j]
1247+
})
1248+
1249+
newBm := NewBitmap()
1250+
1251+
// First set all the keys.
1252+
var containerSz uint64
1253+
for _, key := range keys {
1254+
newBm.setKey(key, 0)
1255+
1256+
// Calculate the size of the containers.
1257+
cont := bm.getContainer(keyToOffset[key])
1258+
containerSz += uint64(len(cont))
1259+
}
1260+
// Allocate enough space to hold all the containers.
1261+
beforeSize := len(newBm.data)
1262+
newBm.fastExpand(containerSz)
1263+
newBm.data = newBm.data[:beforeSize]
1264+
1265+
// Now, we can populate the containers. For that, we first expand the
1266+
// bitmap. Calculate the total size we need to allocate all these containers.
1267+
for _, key := range keys {
1268+
cont := bm.getContainer(keyToOffset[key])
1269+
off := newBm.newContainer(uint16(len(cont)))
1270+
copy(newBm.data[off:], cont)
1271+
1272+
newBm.setKey(key, off)
1273+
}
1274+
1275+
if newBm.GetCardinality() == 0 {
1276+
return nil
1277+
}
1278+
1279+
if totalSz > maxSz {
1280+
return splitFurther(newBm)
1281+
}
1282+
1283+
return []*Bitmap{newBm}
1284+
}
1285+
1286+
var splits []*Bitmap
1287+
1288+
containerMap := make(map[uint64]uint64)
1289+
var totalSz uint64 // size of containers plus the external size of the container
1290+
1291+
for i := 0; i < bm.keys.numKeys(); i++ {
1292+
key := bm.keys.key(i)
1293+
off := bm.keys.val(i)
1294+
cont := bm.getContainer(off)
1295+
1296+
start, end := key, addUint64(key, 1<<16)
1297+
sz := externalSize(start, end) + 2*uint64(cont[indexSize]) // Converting to bytes.
1298+
1299+
// We can probably append more containers in the same bucket.
1300+
if totalSz+sz < maxSz || len(containerMap) == 0 {
1301+
// Include this container in the container map.
1302+
containerMap[key] = off
1303+
totalSz += sz
1304+
continue
1305+
}
1306+
1307+
// We have reached the maxSz limit. Hence, create a split.
1308+
splits = append(splits, create(containerMap, totalSz)...)
1309+
1310+
containerMap = make(map[uint64]uint64)
1311+
containerMap[key] = off
1312+
totalSz = sz
1313+
}
1314+
if len(containerMap) > 0 {
1315+
splits = append(splits, create(containerMap, totalSz)...)
1316+
}
1317+
1318+
return splits
1319+
}

bitmap_test.go

+32
Original file line numberDiff line numberDiff line change
@@ -839,3 +839,35 @@ func TestRank(t *testing.T) {
839839
}
840840
}
841841
}
842+
843+
func TestSplit(t *testing.T) {
844+
run := func(n int) {
845+
r := NewBitmap()
846+
for i := 1; i <= n; i++ {
847+
r.Set(uint64(i))
848+
}
849+
f := func(start, end uint64) uint64 { return 0 }
850+
851+
// Split the bitmaps.
852+
bms := r.Split(f, 1<<10)
853+
var csum int
854+
for _, bm := range bms {
855+
csum += bm.GetCardinality()
856+
}
857+
require.Equal(t, n, csum)
858+
859+
id := uint64(1)
860+
for _, bm := range bms {
861+
itr := bm.NewIterator()
862+
for cur := itr.Next(); cur != 0; cur = itr.Next() {
863+
require.Equal(t, id, cur)
864+
id++
865+
}
866+
}
867+
}
868+
869+
run(2)
870+
run(11)
871+
run(1e3)
872+
run(1e6)
873+
}

keys.go

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ var (
1414
// node stores uint64 keys and the corresponding container offset in the buffer.
1515
// 0th index (indexNodeSize) is used for storing the size of node in bytes.
1616
// 1st index (indexNumKeys) is used for storing the number of keys.
17+
// 2nd index is where we start writing the key-value pairs.
1718
type node []uint64
1819

1920
func keyOffset(i int) int { return indexNodeStart + 2*i }

utils.go

+9
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package sroar
1818

1919
import (
2020
"log"
21+
"math"
2122
"reflect"
2223
"unsafe"
2324

@@ -51,6 +52,14 @@ func max16(a, b uint16) uint16 {
5152
return b
5253
}
5354

55+
// addUint64 returns the sum of a and b. If the result overflows uint64, it
// returns math.MaxUint64 instead of wrapping around.
func addUint64(a, b uint64) uint64 {
	sum := a + b
	if sum < a {
		// Unsigned addition wrapped around, which means the true sum does not
		// fit in a uint64.
		return math.MaxUint64
	}
	return sum
}
62+
5463
func toByteSlice(b []uint16) []byte {
5564
// reference: https://go101.org/article/unsafe.html
5665
var bs []byte

0 commit comments

Comments
 (0)