diff --git a/minbpe/regex.py b/minbpe/regex.py
index 9ed78e43..a38eb70a 100644
--- a/minbpe/regex.py
+++ b/minbpe/regex.py
@@ -17,6 +17,8 @@
 # https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
 GPT2_SPLIT_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
 GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
+# the above GPT split patterns are broken for combining characters such as diacritics and Indic matras (vowel signs that follow a consonant)
+UNICODE_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+[\p{L}\p{M}]+|\p{N}{1,3}| ?[^\s\p{L}\p{M}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
 
 
 class RegexTokenizer(Tokenizer):
@@ -28,7 +30,7 @@ def __init__(self, pattern=None):
           example: {'<|endoftext|>': 100257}
         """
         super().__init__()
-        self.pattern = GPT4_SPLIT_PATTERN if pattern is None else pattern
+        self.pattern = UNICODE_SPLIT_PATTERN if pattern is None else pattern
         self.compiled_pattern = re.compile(self.pattern)
         self.special_tokens = {}
         self.inverse_special_tokens = {}
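For context on what the new pattern fixes, here is a minimal sketch of the split behavior before and after, assuming the third-party `regex` module (the same one minbpe imports as `re`); the sample strings are illustrative and not part of the patch:

```python
import unicodedata
import regex as re

GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
UNICODE_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+[\p{L}\p{M}]+|\p{N}{1,3}| ?[^\s\p{L}\p{M}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

# Devanagari: the virama and the matra are combining marks (\p{M}), not
# letters (\p{L}), so the GPT-4 pattern shatters the word at every mark
text = "नमस्ते"
print(re.findall(GPT4_SPLIT_PATTERN, text))     # ['नमस', '्त', 'े']
print(re.findall(UNICODE_SPLIT_PATTERN, text))  # ['नमस्ते']

# same failure for decomposed diacritics: NFD spells é as 'e' + U+0301
nfd = unicodedata.normalize("NFD", "café")
print(re.findall(GPT4_SPLIT_PATTERN, nfd))      # ['cafe', '́']
print(re.findall(UNICODE_SPLIT_PATTERN, nfd))   # ['café']
```

The fix is simply to fold `\p{M}` into the letter-run alternative (`[\p{L}\p{M}]+`) and exclude it from the punctuation-run alternative, so combining marks stay attached to the base character they modify instead of being tokenized as stray punctuation.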