mlc-ai
diff --git a/‎.gitignore
+2-1 b/‎.gitignore
+2-1
diff --git a/‎cpp/grammar_parser.cc
+6-8 b/‎cpp/grammar_parser.cc
+6-8
diff --git a/‎cpp/grammar_serializer.cc
+1-1 b/‎cpp/grammar_serializer.cc
+1-1
diff --git a/‎cpp/json_schema_converter.cc
+2-10 b/‎cpp/json_schema_converter.cc
+2-10
diff --git a/‎cpp/pybind/pybind.cc
+2-1 b/‎cpp/pybind/pybind.cc
+2-1
@@ -65,6 +65,7 @@ coverage.xml
 *.py,cover
 .hypothesis/
 .pytest_cache/
+/Testing/
 
 # Translations
 *.mo
@@ -137,7 +138,7 @@ venv.bak/
 *.pyc
 *~
 config.mk
-config.cmake
+/config.cmake
 Win32
 *.dir
 perf
 
@@ -136,9 +136,7 @@ std::string EBNFParserImpl::ParseName(bool accept_empty) {
 // Character class should not contain newlines.
 int32_t EBNFParserImpl::ParseCharacterClass() {
   static constexpr TCodepoint kUnknownUpperBound = -4;
-  static const std::unordered_map<std::string, TCodepoint> kCustomEscapeMap = {
-      {"\\-", '-'}, {"\\]", ']'}
-  };
+  static const std::unordered_map<char, TCodepoint> CUSTOM_ESCAPE_MAP = {{'-', '-'}, {']', ']'}};
 
   std::vector<BNFGrammarBuilder::CharacterClassElement> elements;
 
@@ -160,14 +158,14 @@ int32_t EBNFParserImpl::ParseCharacterClass() {
       continue;
     }
 
-    auto [codepoint, new_cur] = ParseNextUTF8OrEscaped(cur_, kCustomEscapeMap);
+    auto [codepoint, len] = ParseNextUTF8OrEscaped(cur_, CUSTOM_ESCAPE_MAP);
     if (codepoint == CharHandlingError::kInvalidUTF8) {
       ThrowParseError("Invalid UTF8 sequence");
     }
     if (codepoint == CharHandlingError::kInvalidEscape) {
       ThrowParseError("Invalid escape sequence");
     }
-    Consume(new_cur - cur_);
+    Consume(len);
     if (past_is_hyphen) {
       XGRAMMAR_ICHECK(!elements.empty());
       if (elements.back().lower > codepoint) {
@@ -199,14 +197,14 @@ int32_t EBNFParserImpl::ParseString() {
       ThrowParseError("There should be no newline character in a string literal");
     }
 
-    auto [codepoint, new_cur] = ParseNextUTF8OrEscaped(cur_);
+    auto [codepoint, len] = ParseNextUTF8OrEscaped(cur_);
     if (codepoint == CharHandlingError::kInvalidUTF8) {
       ThrowParseError("Invalid utf8 sequence");
     }
     if (codepoint == CharHandlingError::kInvalidEscape) {
       ThrowParseError("Invalid escape sequence");
     }
-    Consume(new_cur - cur_);
+    Consume(len);
     codepoints.push_back(codepoint);
   }
   if (codepoints.empty()) {
@@ -221,7 +219,7 @@ int32_t EBNFParserImpl::ParseString() {
   // convert str to int32_t vector
   std::vector<int32_t> bytes;
   for (auto c : str) {
-    bytes.push_back(static_cast<int32_t>(c));
+    bytes.push_back(static_cast<int32_t>(static_cast<uint8_t>(c)));
   }
   return builder_.AddByteString(bytes);
 }
 
@@ -55,7 +55,7 @@ std::string BNFGrammarPrinter::PrintByteString(const RuleExpr& rule_expr) {
   for (int i = 0; i < rule_expr.data_len; ++i) {
     internal_str += static_cast<char>(rule_expr[i]);
   }
-  auto codepoints = ParseUTF8(internal_str.c_str(), UTF8ErrorPolicy::kReturnByte);
+  auto codepoints = ParseUTF8(internal_str.c_str(), true);
   std::string result;
   for (auto codepoint : codepoints) {
     result += PrintAsEscapedUTF8(codepoint);
 
@@ -20,15 +20,6 @@
 
 namespace xgrammar {
 
-// TODO(yixin): test it
-// EMCC somehow cannot pickup operator overload from picojson.h, so we copy here.
-// #ifdef COMPILE_MLC_WASM_RUNTIME
-// inline std::ostream& operator<<(std::ostream& os, const picojson::value& x) {
-//   x.serialize(std::ostream_iterator<char>(os));
-//   return os;
-// }
-// #endif
-
 /*!
  * \brief Manage the indent and separator for the generation of EBNF grammar.
  * \param indent The number of spaces for each indent. If it is std::nullopt, there will be no
@@ -621,7 +612,8 @@ std::string JSONSchemaToEBNFConverter::VisitEnum(
 
 std::string JSONSchemaToEBNFConverter::JSONStrToPrintableStr(const std::string& json_str) {
   static const std::vector<std::pair<std::string, std::string>> kReplaceMapping = {
-      {"\\", "\\\\"}, {"\"", "\\\""}};
+      {"\\", "\\\\"}, {"\"", "\\\""}
+  };
   std::string result = json_str;
   for (const auto& [k, v] : kReplaceMapping) {
     size_t pos = 0;
 
@@ -23,7 +23,8 @@ PYBIND11_MODULE(xgrammar_bindings, m) {
   auto pyBuiltinGrammar = py::class_<BuiltinGrammar>(m, "BuiltinGrammar");
   pyBuiltinGrammar.def_static("json", &BuiltinGrammar::JSON)
       .def_static("json_schema", &BuiltinGrammar::JSONSchema)
-      .def_static("_json_schema_to_ebnf", &BuiltinGrammar::_JSONSchemaToEBNF);
+      .def_static("_json_schema_to_ebnf", &BuiltinGrammar::_JSONSchemaToEBNF)
+      .def_static("_regex_to_ebnf", &BuiltinGrammar::_RegexToEBNF);
 
   auto pyTokenizerInfo = py::class_<TokenizerInfo>(m, "TokenizerInfo");
   pyTokenizerInfo.def(py::init(&TokenizerInfo_Init))
Original file line number	Diff line number	Diff line change
`@@ -136,9 +136,7 @@ std::string EBNFParserImpl::ParseName(bool accept_empty) {`
`136`	`136`	`// Character class should not contain newlines.`
`137`	`137`	`int32_t EBNFParserImpl::ParseCharacterClass() {`
`138`	`138`	`static constexpr TCodepoint kUnknownUpperBound = -4;`
`139`		`- static const std::unordered_map<std::string, TCodepoint> kCustomEscapeMap = {`
`140`		`- {"\\-", '-'}, {"\\]", ']'}`
`141`		`- };`
	`139`	`+ static const std::unordered_map<char, TCodepoint> CUSTOM_ESCAPE_MAP = {{'-', '-'}, {']', ']'}};`
`142`	`140`
`143`	`141`	`std::vector<BNFGrammarBuilder::CharacterClassElement> elements;`
`144`	`142`
`@@ -160,14 +158,14 @@ int32_t EBNFParserImpl::ParseCharacterClass() {`
`160`	`158`	`continue;`
`161`	`159`	`}`
`162`	`160`
`163`		`- auto [codepoint, new_cur] = ParseNextUTF8OrEscaped(cur_, kCustomEscapeMap);`
	`161`	`+ auto [codepoint, len] = ParseNextUTF8OrEscaped(cur_, CUSTOM_ESCAPE_MAP);`
`164`	`162`	`if (codepoint == CharHandlingError::kInvalidUTF8) {`
`165`	`163`	`ThrowParseError("Invalid UTF8 sequence");`
`166`	`164`	`}`
`167`	`165`	`if (codepoint == CharHandlingError::kInvalidEscape) {`
`168`	`166`	`ThrowParseError("Invalid escape sequence");`
`169`	`167`	`}`
`170`		`- Consume(new_cur - cur_);`
	`168`	`+ Consume(len);`
`171`	`169`	`if (past_is_hyphen) {`
`172`	`170`	`XGRAMMAR_ICHECK(!elements.empty());`
`173`	`171`	`if (elements.back().lower > codepoint) {`
`@@ -199,14 +197,14 @@ int32_t EBNFParserImpl::ParseString() {`
`199`	`197`	`ThrowParseError("There should be no newline character in a string literal");`
`200`	`198`	`}`
`201`	`199`
`202`		`- auto [codepoint, new_cur] = ParseNextUTF8OrEscaped(cur_);`
	`200`	`+ auto [codepoint, len] = ParseNextUTF8OrEscaped(cur_);`
`203`	`201`	`if (codepoint == CharHandlingError::kInvalidUTF8) {`
`204`	`202`	`ThrowParseError("Invalid utf8 sequence");`
`205`	`203`	`}`
`206`	`204`	`if (codepoint == CharHandlingError::kInvalidEscape) {`
`207`	`205`	`ThrowParseError("Invalid escape sequence");`
`208`	`206`	`}`
`209`		`- Consume(new_cur - cur_);`
	`207`	`+ Consume(len);`
`210`	`208`	`codepoints.push_back(codepoint);`
`211`	`209`	`}`
`212`	`210`	`if (codepoints.empty()) {`
`@@ -221,7 +219,7 @@ int32_t EBNFParserImpl::ParseString() {`
`221`	`219`	`// convert str to int32_t vector`
`222`	`220`	`std::vector<int32_t> bytes;`
`223`	`221`	`for (auto c : str) {`
`224`		`- bytes.push_back(static_cast<int32_t>(c));`
	`222`	`+ bytes.push_back(static_cast<int32_t>(static_cast<uint8_t>(c)));`
`225`	`223`	`}`
`226`	`224`	`return builder_.AddByteString(bytes);`
`227`	`225`	`}`
Original file line number	Diff line number	Diff line change
`@@ -55,7 +55,7 @@ std::string BNFGrammarPrinter::PrintByteString(const RuleExpr& rule_expr) {`
`55`	`55`	`for (int i = 0; i < rule_expr.data_len; ++i) {`
`56`	`56`	`internal_str += static_cast<char>(rule_expr[i]);`
`57`	`57`	`}`
`58`		`- auto codepoints = ParseUTF8(internal_str.c_str(), UTF8ErrorPolicy::kReturnByte);`
	`58`	`+ auto codepoints = ParseUTF8(internal_str.c_str(), true);`
`59`	`59`	`std::string result;`
`60`	`60`	`for (auto codepoint : codepoints) {`
`61`	`61`	`result += PrintAsEscapedUTF8(codepoint);`