Skip to content

Commit b6e36ef

Browse files
authored
[init] Regex Support: Converter to EBNF (#21)
This PR provides a utility to convert regex to EBNF. This is the first stage of regex support. Preliminary profiling results show that XGrammar has good performance for regex: - Model: Llama-3-8B (128k token) - Preprocess time (ms): - IPv4: 203 - Date-time: 187 - Date: 85 - Time: 124 - Email: 156 - Max time for finding token mask (us): - IPv4: 626 - Date-time, Date, Time: 52 - Email: 1512 Further performance optimization is planned.
1 parent e356c9d commit b6e36ef

14 files changed

+839
-103
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ coverage.xml
6565
*.py,cover
6666
.hypothesis/
6767
.pytest_cache/
68+
/Testing/
6869

6970
# Translations
7071
*.mo
@@ -137,7 +138,7 @@ venv.bak/
137138
*.pyc
138139
*~
139140
config.mk
140-
config.cmake
141+
/config.cmake
141142
Win32
142143
*.dir
143144
perf

cpp/grammar_parser.cc

+6-8
Original file line numberDiff line numberDiff line change
@@ -136,9 +136,7 @@ std::string EBNFParserImpl::ParseName(bool accept_empty) {
136136
// Character class should not contain newlines.
137137
int32_t EBNFParserImpl::ParseCharacterClass() {
138138
static constexpr TCodepoint kUnknownUpperBound = -4;
139-
static const std::unordered_map<std::string, TCodepoint> kCustomEscapeMap = {
140-
{"\\-", '-'}, {"\\]", ']'}
141-
};
139+
static const std::unordered_map<char, TCodepoint> CUSTOM_ESCAPE_MAP = {{'-', '-'}, {']', ']'}};
142140

143141
std::vector<BNFGrammarBuilder::CharacterClassElement> elements;
144142

@@ -160,14 +158,14 @@ int32_t EBNFParserImpl::ParseCharacterClass() {
160158
continue;
161159
}
162160

163-
auto [codepoint, new_cur] = ParseNextUTF8OrEscaped(cur_, kCustomEscapeMap);
161+
auto [codepoint, len] = ParseNextUTF8OrEscaped(cur_, CUSTOM_ESCAPE_MAP);
164162
if (codepoint == CharHandlingError::kInvalidUTF8) {
165163
ThrowParseError("Invalid UTF8 sequence");
166164
}
167165
if (codepoint == CharHandlingError::kInvalidEscape) {
168166
ThrowParseError("Invalid escape sequence");
169167
}
170-
Consume(new_cur - cur_);
168+
Consume(len);
171169
if (past_is_hyphen) {
172170
XGRAMMAR_ICHECK(!elements.empty());
173171
if (elements.back().lower > codepoint) {
@@ -199,14 +197,14 @@ int32_t EBNFParserImpl::ParseString() {
199197
ThrowParseError("There should be no newline character in a string literal");
200198
}
201199

202-
auto [codepoint, new_cur] = ParseNextUTF8OrEscaped(cur_);
200+
auto [codepoint, len] = ParseNextUTF8OrEscaped(cur_);
203201
if (codepoint == CharHandlingError::kInvalidUTF8) {
204202
ThrowParseError("Invalid utf8 sequence");
205203
}
206204
if (codepoint == CharHandlingError::kInvalidEscape) {
207205
ThrowParseError("Invalid escape sequence");
208206
}
209-
Consume(new_cur - cur_);
207+
Consume(len);
210208
codepoints.push_back(codepoint);
211209
}
212210
if (codepoints.empty()) {
@@ -221,7 +219,7 @@ int32_t EBNFParserImpl::ParseString() {
221219
// convert str to int32_t vector
222220
std::vector<int32_t> bytes;
223221
for (auto c : str) {
224-
bytes.push_back(static_cast<int32_t>(c));
222+
bytes.push_back(static_cast<int32_t>(static_cast<uint8_t>(c)));
225223
}
226224
return builder_.AddByteString(bytes);
227225
}

cpp/grammar_serializer.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ std::string BNFGrammarPrinter::PrintByteString(const RuleExpr& rule_expr) {
5555
for (int i = 0; i < rule_expr.data_len; ++i) {
5656
internal_str += static_cast<char>(rule_expr[i]);
5757
}
58-
auto codepoints = ParseUTF8(internal_str.c_str(), UTF8ErrorPolicy::kReturnByte);
58+
auto codepoints = ParseUTF8(internal_str.c_str(), true);
5959
std::string result;
6060
for (auto codepoint : codepoints) {
6161
result += PrintAsEscapedUTF8(codepoint);

cpp/json_schema_converter.cc

+2-10
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,6 @@
2020

2121
namespace xgrammar {
2222

23-
// TODO(yixin): test it
24-
// EMCC somehow cannot pickup operator overload from picojson.h, so we copy here.
25-
// #ifdef COMPILE_MLC_WASM_RUNTIME
26-
// inline std::ostream& operator<<(std::ostream& os, const picojson::value& x) {
27-
// x.serialize(std::ostream_iterator<char>(os));
28-
// return os;
29-
// }
30-
// #endif
31-
3223
/*!
3324
* \brief Manage the indent and separator for the generation of EBNF grammar.
3425
* \param indent The number of spaces for each indent. If it is std::nullopt, there will be no
@@ -621,7 +612,8 @@ std::string JSONSchemaToEBNFConverter::VisitEnum(
621612

622613
std::string JSONSchemaToEBNFConverter::JSONStrToPrintableStr(const std::string& json_str) {
623614
static const std::vector<std::pair<std::string, std::string>> kReplaceMapping = {
624-
{"\\", "\\\\"}, {"\"", "\\\""}};
615+
{"\\", "\\\\"}, {"\"", "\\\""}
616+
};
625617
std::string result = json_str;
626618
for (const auto& [k, v] : kReplaceMapping) {
627619
size_t pos = 0;

cpp/pybind/pybind.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ PYBIND11_MODULE(xgrammar_bindings, m) {
2323
auto pyBuiltinGrammar = py::class_<BuiltinGrammar>(m, "BuiltinGrammar");
2424
pyBuiltinGrammar.def_static("json", &BuiltinGrammar::JSON)
2525
.def_static("json_schema", &BuiltinGrammar::JSONSchema)
26-
.def_static("_json_schema_to_ebnf", &BuiltinGrammar::_JSONSchemaToEBNF);
26+
.def_static("_json_schema_to_ebnf", &BuiltinGrammar::_JSONSchemaToEBNF)
27+
.def_static("_regex_to_ebnf", &BuiltinGrammar::_RegexToEBNF);
2728

2829
auto pyTokenizerInfo = py::class_<TokenizerInfo>(m, "TokenizerInfo");
2930
pyTokenizerInfo.def(py::init(&TokenizerInfo_Init))

0 commit comments

Comments
 (0)