Skip to content

Commit 382ad68

Browse files
committed
refactor: use simd bit packing for index
1 parent dbb79b3 commit 382ad68

10 files changed

+212
-784
lines changed

3rdparty/CMakeLists.txt

+24
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,29 @@ if(KLOGG_USE_SENTRY)
386386
endif()
387387
endif(KLOGG_USE_SENTRY)
388388

389+
cpmaddpackage(
390+
NAME
391+
simdcomp
392+
GITHUB_REPOSITORY
393+
lemire/simdcomp
394+
GIT_TAG
395+
009c67807670d16f8984c0534aef0e630e5465a4
396+
DOWNLOAD_ONLY
397+
YES
398+
)
399+
if(simdcomp_ADDED)
400+
add_library(simdcomp STATIC
401+
${simdcomp_SOURCE_DIR}/src/avxbitpacking.c
402+
${simdcomp_SOURCE_DIR}/src/simdfor.c
403+
${simdcomp_SOURCE_DIR}/src/simdcomputil.c
404+
${simdcomp_SOURCE_DIR}/src/simdbitpacking.c
405+
${simdcomp_SOURCE_DIR}/src/simdintegratedbitpacking.c
406+
${simdcomp_SOURCE_DIR}/src/simdpackedsearch.c
407+
${simdcomp_SOURCE_DIR}/src/simdpackedselect.c
408+
)
409+
target_include_directories(simdcomp PUBLIC ${simdcomp_SOURCE_DIR}/include)
410+
endif()
411+
389412
set(klogg_cpm_targets
390413
xxhash
391414
Catch2
@@ -407,6 +430,7 @@ set(klogg_cpm_targets
407430
crashpad_compat
408431
crashpad_util
409432
mini_chromium
433+
simdcomp
410434
)
411435
foreach(target ${klogg_cpm_targets})
412436
if(TARGET ${target})

src/logdata/CMakeLists.txt

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
add_library(
22
klogg_logdata STATIC
33
${CMAKE_CURRENT_SOURCE_DIR}/include/abstractlogdata.h
4-
${CMAKE_CURRENT_SOURCE_DIR}/include/blockpool.h
54
${CMAKE_CURRENT_SOURCE_DIR}/include/compressedlinestorage.h
65
${CMAKE_CURRENT_SOURCE_DIR}/include/encodingdetector.h
76
${CMAKE_CURRENT_SOURCE_DIR}/include/linepositionarray.h
@@ -16,7 +15,6 @@ add_library(
1615
${CMAKE_CURRENT_SOURCE_DIR}/include/filedigest.h
1716
${CMAKE_CURRENT_SOURCE_DIR}/include/readablesize.h
1817
${CMAKE_CURRENT_SOURCE_DIR}/src/abstractlogdata.cpp
19-
${CMAKE_CURRENT_SOURCE_DIR}/src/blockpool.cpp
2018
${CMAKE_CURRENT_SOURCE_DIR}/src/compressedlinestorage.cpp
2119
${CMAKE_CURRENT_SOURCE_DIR}/src/encodingdetector.cpp
2220
${CMAKE_CURRENT_SOURCE_DIR}/src/logdata.cpp
@@ -54,6 +52,7 @@ target_link_libraries(
5452
kdtoolbox
5553
robin_hood
5654
simdutf
55+
simdcomp
5756
klogg_mimalloc_wrapper
5857
)
5958

src/logdata/include/blockpool.h

-100
This file was deleted.
+32-130
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,3 @@
1-
/*
2-
* Copyright (C) 2015 Nicolas Bonnefon and other contributors
3-
*
4-
* This file is part of glogg.
5-
*
6-
* glogg is free software: you can redistribute it and/or modify
7-
* it under the terms of the GNU General Public License as published by
8-
* the Free Software Foundation, either version 3 of the License, or
9-
* (at your option) any later version.
10-
*
11-
* glogg is distributed in the hope that it will be useful,
12-
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13-
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14-
* GNU General Public License for more details.
15-
*
16-
* You should have received a copy of the GNU General Public License
17-
* along with glogg. If not, see <http://www.gnu.org/licenses/>.
18-
*/
19-
201
/*
212
* Copyright (C) 2016 -- 2019 Anton Filimonov and other contributors
223
*
@@ -36,88 +17,36 @@
3617
* along with klogg. If not, see <http://www.gnu.org/licenses/>.
3718
*/
3819

20+
#include <array>
3921
#include <cstddef>
4022
#include <cstdint>
4123
#include <vector>
4224

43-
#include "blockpool.h"
4425
#include "linetypes.h"
4526
#include <type_safe/strong_typedef.hpp>
4627

47-
4828
// This class is a compressed storage backend for LinePositionArray
4929
// It emulates the interface of a vector, but takes advantage of the nature
5030
// of the stored data (increasing end of line addresses) to apply some
5131
// compression in memory, while still providing fast, constant-time look-up.
5232

53-
/* The current algorithm takes advantage of the fact most lines are reasonably
54-
* short, it codes each line on:
55-
* - Line < 127 bytes : 1 byte
56-
* - 127 < line < 16383 : 2 bytes
57-
* - line > 16383 : 6 bytes (or 10 bytes)
58-
* Uncompressed backend stores line on 4 bytes or 8 bytes.
59-
*
60-
* The algorithm is quite simple, the file is first divided in two parts:
61-
* - The lines whose end are located before UINT32_MAX
62-
* - The lines whose end are located after UINT32_MAX
63-
* Those end of lines are stored separately in the table32 and the table64
64-
* respectively.
65-
*
66-
* The EOL list is then divided in blocks of IndexBlockSize (256) lines.
67-
* A block index vector (per table) contains pointers to each block.
68-
*
69-
* Each block is then defined as such:
70-
* Block32 (sizes in byte)
71-
* 00 - Absolute EOF address (4 bytes)
72-
* 04 - ( 0xxx xxxx if second line is < 127 ) (1 byte, relative)
73-
* - ( 10xx xxxx
74-
* xxxx xxxx if second line is < 16383 ) (2 bytes, relative)
75-
* - ( 1111 1111
76-
* xxxx xxxx
77-
* xxxx xxxx if second line is > 16383 ) (6 bytes, absolute)
78-
* ...
79-
* (126 more lines)
80-
*
81-
* Block64 (sizes in byte)
82-
* 00 - Absolute EOF address (8 bytes)
83-
* 08 - ( 0xxx xxxx if second line is < 127 ) (1 byte, relative)
84-
* - ( 10xx xxxx
85-
* xxxx xxxx if second line is < 16383 ) (2 bytes, relative)
86-
* - ( 1111 1111
87-
* xxxx xxxx
88-
* xxxx xxxx
89-
* xxxx xxxx
90-
* xxxx xxxx if second line is > 16383 ) (10 bytes, absolute)
91-
* ...
92-
* (126 more lines)
93-
*
94-
* Absolute addressing has been adopted for line > 16383 to bound memory usage in case
95-
* of pathologically long (MBs or GBs) lines, even if it is a bit less efficient for
96-
* long-ish (30 KB) lines.
97-
*
98-
* The table32 always starts at 0, the table64 starts at first_long_line_
99-
*/
100-
101-
#ifndef COMPRESSEDLINESTORAGE_H
102-
#define COMPRESSEDLINESTORAGE_H
33+
#ifndef SIMDCOMPRESSEDLINESTORAGE_H
34+
#define SIMDCOMPRESSEDLINESTORAGE_H
10335

10436
class CompressedLinePositionStorage {
105-
public:
106-
// Default constructor
107-
CompressedLinePositionStorage()
108-
: block_index_{ 0 }
109-
, long_block_index_{ 0 }
110-
{
111-
}
37+
public:
38+
CompressedLinePositionStorage();
11239

11340
// Copy constructor would be slow, delete!
11441
CompressedLinePositionStorage( const CompressedLinePositionStorage& orig ) = delete;
115-
CompressedLinePositionStorage& operator=( const CompressedLinePositionStorage& orig ) = delete;
42+
CompressedLinePositionStorage& operator=( const CompressedLinePositionStorage& orig )
43+
= delete;
11644

117-
// Move constructor
11845
CompressedLinePositionStorage( CompressedLinePositionStorage&& orig ) noexcept;
119-
// Move assignment
120-
CompressedLinePositionStorage& operator=( CompressedLinePositionStorage&& orig ) noexcept;
46+
CompressedLinePositionStorage&
47+
operator=( CompressedLinePositionStorage&& orig ) noexcept;
48+
49+
~CompressedLinePositionStorage() = default;
12150

12251
// Append the passed end-of-line to the storage
12352
void append( OffsetInFile pos );
@@ -129,76 +58,49 @@ class CompressedLinePositionStorage {
12958
// Size of the array
13059
LinesCount size() const
13160
{
132-
return nb_lines_;
61+
return nbLines_;
13362
}
13463

13564
size_t allocatedSize() const;
13665

137-
struct BlockOffset
138-
: type_safe::strong_typedef<BlockOffset, size_t>
139-
, type_safe::strong_typedef_op::increment<BlockOffset>
140-
, type_safe::strong_typedef_op::addition<BlockOffset>
141-
, type_safe::strong_typedef_op::relational_comparison<BlockOffset>
142-
, type_safe::strong_typedef_op::equality_comparison<BlockOffset>
143-
, type_safe::strong_typedef_op::explicit_bool<BlockOffset>
144-
{
145-
using strong_typedef::strong_typedef;
146-
};
147-
148-
// Cache the last position read
149-
// This is to speed up consecutive reads (whole page)
150-
struct Cache {
151-
LineNumber index {std::numeric_limits<LineNumber::UnderlyingType>::max() - 1U};
152-
OffsetInFile position {0};
153-
BlockOffset offset {0};
154-
};
155-
15666
// Element at index
157-
OffsetInFile at( size_t i, Cache* lastPosition = nullptr ) const
67+
OffsetInFile at( size_t i ) const
15868
{
159-
return at( LineNumber( i ), lastPosition );
69+
return at( LineNumber( i ) );
16070
}
161-
OffsetInFile at( LineNumber i, Cache* lastPosition = nullptr ) const;
71+
OffsetInFile at( LineNumber i ) const;
16272

16373
// Add one list to the other
16474
void append_list( const klogg::vector<OffsetInFile>& positions );
16575

16676
// Pop the last element of the storage
16777
void pop_back();
16878

169-
private:
79+
private:
17080
// Utility for move ctor/assign
17181
void move_from( CompressedLinePositionStorage&& orig ) noexcept;
17282

173-
// The two indexes
174-
BlockPool<uint32_t> pool32_;
175-
BlockPool<OffsetInFile::UnderlyingType> pool64_;
176-
177-
// Total number of lines in storage
178-
LinesCount nb_lines_;
179-
180-
// Current position (position of the end of the last line added)
181-
OffsetInFile current_pos_;
83+
void compress_current_block();
84+
void uncompress_last_block();
85+
struct BlockMetadata {
86+
OffsetInFile firstLineOffset{};
87+
uint8_t packetBitWidth{};
88+
size_t packetStorageOffset{};
89+
};
18290

183-
uint32_t block_index_;
184-
uint32_t long_block_index_;
91+
klogg::vector<BlockMetadata> blocks_;
92+
klogg::vector<uint8_t> packedLinesStorage_;
18593

186-
// The index of the first line whose end is stored in a block64
187-
// this is the origin point for all calculations in block64
188-
OptionalLineNumber first_long_line_;
94+
klogg::vector<OffsetInFile> currentLinesBlock_;
95+
klogg::vector<uint32_t> currentLinesBlockShifted_;
18996

190-
// Offset of the next position (not yet written) within the current
191-
// block. null means there is no current block (previous block
192-
// finished or no data)
193-
BlockOffset block_offset_;
97+
// Total number of lines in storage
98+
LinesCount nbLines_;
19499

195-
// For pop_back:
100+
// Current position (position of the end of the last line added)
101+
OffsetInFile lastPos_;
196102

197-
// Previous offset to block element, it is restored when we
198-
// "pop_back" the last element.
199-
// A null here means pop_back need to free the block
200-
// that has just been created.
201-
BlockOffset previous_block_offset_;
103+
bool canUseSimdSelect_ {false};
202104
};
203105

204106
#endif

0 commit comments

Comments
 (0)