1
- /*
2
- * Copyright (C) 2015 Nicolas Bonnefon and other contributors
3
- *
4
- * This file is part of glogg.
5
- *
6
- * glogg is free software: you can redistribute it and/or modify
7
- * it under the terms of the GNU General Public License as published by
8
- * the Free Software Foundation, either version 3 of the License, or
9
- * (at your option) any later version.
10
- *
11
- * glogg is distributed in the hope that it will be useful,
12
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
- * GNU General Public License for more details.
15
- *
16
- * You should have received a copy of the GNU General Public License
17
- * along with glogg. If not, see <http://www.gnu.org/licenses/>.
18
- */
19
-
20
1
/*
21
2
* Copyright (C) 2016 -- 2019 Anton Filimonov and other contributors
22
3
*
36
17
* along with klogg. If not, see <http://www.gnu.org/licenses/>.
37
18
*/
38
19
20
+ #include < array>
39
21
#include < cstddef>
40
22
#include < cstdint>
41
23
#include < vector>
42
24
43
- #include " blockpool.h"
44
25
#include " linetypes.h"
45
26
#include < type_safe/strong_typedef.hpp>
46
27
47
-
48
28
// This class is a compressed storage backend for LinePositionArray
49
29
// It emulates the interface of a vector, but takes advantage of the nature
50
30
// of the stored data (increasing end of line addresses) to apply some
51
31
// compression in memory, while still providing fast, constant-time look-up.
52
32
53
- /* The current algorithm takes advantage of the fact most lines are reasonably
54
- * short, it codes each line on:
55
- * - Line < 127 bytes : 1 byte
56
- * - 127 < line < 16383 : 2 bytes
57
- * - line > 16383 : 6 bytes (or 10 bytes)
58
- * Uncompressed backend stores line on 4 bytes or 8 bytes.
59
- *
60
- * The algorithm is quite simple, the file is first divided in two parts:
61
- * - The lines whose end are located before UINT32_MAX
62
- * - The lines whose end are located after UINT32_MAX
63
- * Those end of lines are stored separately in the table32 and the table64
64
- * respectively.
65
- *
66
- * The EOL list is then divided in blocks of IndexBlockSize (256) lines.
67
- * A block index vector (per table) contains pointers to each block.
68
- *
69
- * Each block is then defined as such:
70
- * Block32 (sizes in byte)
71
- * 00 - Absolute EOF address (4 bytes)
72
- * 04 - ( 0xxx xxxx if second line is < 127 ) (1 byte, relative)
73
- * - ( 10xx xxxx
74
- * xxxx xxxx if second line is < 16383 ) (2 bytes, relative)
75
- * - ( 1111 1111
76
- * xxxx xxxx
77
- * xxxx xxxx if second line is > 16383 ) (6 bytes, absolute)
78
- * ...
79
- * (126 more lines)
80
- *
81
- * Block64 (sizes in byte)
82
- * 00 - Absolute EOF address (8 bytes)
83
- * 08 - ( 0xxx xxxx if second line is < 127 ) (1 byte, relative)
84
- * - ( 10xx xxxx
85
- * xxxx xxxx if second line is < 16383 ) (2 bytes, relative)
86
- * - ( 1111 1111
87
- * xxxx xxxx
88
- * xxxx xxxx
89
- * xxxx xxxx
90
- * xxxx xxxx if second line is > 16383 ) (10 bytes, absolute)
91
- * ...
92
- * (126 more lines)
93
- *
94
- * Absolute addressing has been adopted for line > 16383 to bound memory usage in case
95
- * of pathologically long (MBs or GBs) lines, even if it is a bit less efficient for
96
- * long-ish (30 KB) lines.
97
- *
98
- * The table32 always starts at 0, the table64 starts at first_long_line_
99
- */
100
-
101
- #ifndef COMPRESSEDLINESTORAGE_H
102
- #define COMPRESSEDLINESTORAGE_H
33
+ #ifndef SIMDCOMPRESSEDLINESTORAGE_H
34
+ #define SIMDCOMPRESSEDLINESTORAGE_H
103
35
104
36
class CompressedLinePositionStorage {
105
- public:
106
- // Default constructor
107
- CompressedLinePositionStorage ()
108
- : block_index_{ 0 }
109
- , long_block_index_{ 0 }
110
- {
111
- }
37
+ public:
38
+ CompressedLinePositionStorage ();
112
39
113
40
// Copying would be slow, so the copy constructor and copy assignment are deleted;
// the storage is transferred through the noexcept move constructor and move assignment.
114
41
CompressedLinePositionStorage ( const CompressedLinePositionStorage& orig ) = delete ;
115
- CompressedLinePositionStorage& operator =( const CompressedLinePositionStorage& orig ) = delete ;
42
+ CompressedLinePositionStorage& operator =( const CompressedLinePositionStorage& orig )
43
+ = delete ;
116
44
117
- // Move constructor
118
45
CompressedLinePositionStorage ( CompressedLinePositionStorage&& orig ) noexcept ;
119
- // Move assignment
120
- CompressedLinePositionStorage& operator =( CompressedLinePositionStorage&& orig ) noexcept ;
46
+ CompressedLinePositionStorage&
47
+ operator =( CompressedLinePositionStorage&& orig ) noexcept ;
48
+
49
+ ~CompressedLinePositionStorage () = default ;
121
50
122
51
// Append the passed end-of-line to the storage
123
52
void append ( OffsetInFile pos );
@@ -129,76 +58,49 @@ class CompressedLinePositionStorage {
129
58
// Size of the array: returns the stored count of lines held in the storage
130
59
LinesCount size () const
131
60
{
132
- return nb_lines_ ;
61
+ return nbLines_ ;
133
62
}
134
63
135
64
size_t allocatedSize () const ;
136
65
137
- struct BlockOffset
138
- : type_safe::strong_typedef<BlockOffset, size_t >
139
- , type_safe::strong_typedef_op::increment<BlockOffset>
140
- , type_safe::strong_typedef_op::addition<BlockOffset>
141
- , type_safe::strong_typedef_op::relational_comparison<BlockOffset>
142
- , type_safe::strong_typedef_op::equality_comparison<BlockOffset>
143
- , type_safe::strong_typedef_op::explicit_bool<BlockOffset>
144
- {
145
- using strong_typedef::strong_typedef;
146
- };
147
-
148
- // Cache the last position read
149
- // This is to speed up consecutive reads (whole page)
150
- struct Cache {
151
- LineNumber index {std::numeric_limits<LineNumber::UnderlyingType>::max () - 1U };
152
- OffsetInFile position {0 };
153
- BlockOffset offset {0 };
154
- };
155
-
156
66
// Element at index i.
// The size_t overload is a convenience wrapper: it converts the raw index to a
// LineNumber and forwards to the LineNumber overload declared right after it.
157
- OffsetInFile at ( size_t i, Cache* lastPosition = nullptr ) const
67
+ OffsetInFile at ( size_t i ) const
158
68
{
159
- return at ( LineNumber ( i ), lastPosition );
69
+ return at ( LineNumber ( i ) );
160
70
}
161
- OffsetInFile at ( LineNumber i, Cache* lastPosition = nullptr ) const ;
71
+ OffsetInFile at ( LineNumber i ) const ;
162
72
163
73
// Append the passed list of positions to this storage
164
74
void append_list ( const klogg::vector<OffsetInFile>& positions );
165
75
166
76
// Pop the last element of the storage
167
77
void pop_back ();
168
78
169
- private:
79
+ private:
170
80
// Utility for move ctor/assign
171
81
void move_from ( CompressedLinePositionStorage&& orig ) noexcept ;
172
82
173
- // The two indexes
174
- BlockPool<uint32_t > pool32_;
175
- BlockPool<OffsetInFile::UnderlyingType> pool64_;
176
-
177
- // Total number of lines in storage
178
- LinesCount nb_lines_;
179
-
180
- // Current position (position of the end of the last line added)
181
- OffsetInFile current_pos_;
83
+ void compress_current_block ();
84
+ void uncompress_last_block ();
85
+ struct BlockMetadata { // Per-block record, stored in blocks_; every field is value-initialized
86
+ OffsetInFile firstLineOffset{}; // NOTE(review): presumably the absolute offset of the block's first line; confirm against the implementation
87
+ uint8_t packetBitWidth{}; // NOTE(review): presumably the bit width used for each packed entry of the block; confirm
88
+ size_t packetStorageOffset{}; // NOTE(review): presumably where the block's packed data starts in packedLinesStorage_; confirm
89
+ };
182
90
183
- uint32_t block_index_ ;
184
- uint32_t long_block_index_ ;
91
+ klogg::vector<BlockMetadata> blocks_ ;
92
+ klogg::vector< uint8_t > packedLinesStorage_ ;
185
93
186
- // The index of the first line whose end is stored in a block64
187
- // this is the origin point for all calculations in block64
188
- OptionalLineNumber first_long_line_;
94
+ klogg::vector<OffsetInFile> currentLinesBlock_;
95
+ klogg::vector<uint32_t > currentLinesBlockShifted_;
189
96
190
- // Offset of the next position (not yet written) within the current
191
- // block. null means there is no current block (previous block
192
- // finished or no data)
193
- BlockOffset block_offset_;
97
+ // Total number of lines in storage
98
+ LinesCount nbLines_;
194
99
195
- // For pop_back:
100
+ // Current position (position of the end of the last line added)
101
+ OffsetInFile lastPos_;
196
102
197
- // Previous offset to block element, it is restored when we
198
- // "pop_back" the last element.
199
- // A null here means pop_back need to free the block
200
- // that has just been created.
201
- BlockOffset previous_block_offset_;
103
+ bool canUseSimdSelect_ {false };
202
104
};
203
105
204
106
#endif
0 commit comments