From d3b2cb047eb3942a8d7484311a93231c860a204e Mon Sep 17 00:00:00 2001 From: leejet Date: Sat, 30 May 2026 18:38:46 +0800 Subject: [PATCH] fix: split tokens before normalization (#1582) --- src/tokenizers/bpe_tokenizer.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/tokenizers/bpe_tokenizer.cpp b/src/tokenizers/bpe_tokenizer.cpp index 1b11b322..602e9d54 100644 --- a/src/tokenizers/bpe_tokenizer.cpp +++ b/src/tokenizers/bpe_tokenizer.cpp @@ -131,11 +131,10 @@ std::vector BPETokenizer::bpe(const std::u32string& token) const } std::vector BPETokenizer::encode(const std::string& text, on_new_token_cb_t on_new_token_cb) { - std::string normalized_text = normalize(text); std::vector bpe_tokens; std::vector token_strs; - auto splited_texts = split_with_special_tokens(normalized_text, special_tokens); + auto splited_texts = split_with_special_tokens(text, special_tokens); for (auto& splited_text : splited_texts) { if (is_special_token(splited_text)) { @@ -160,7 +159,7 @@ std::vector BPETokenizer::encode(const std::string& text, on_new_token_cb_t } } - std::string token_str = token; + std::string token_str = normalize(token); std::u32string utf32_token; if (byte_level_bpe) { for (int i = 0; i < token_str.length(); i++) {