mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2026-06-09 15:56:39 +00:00
fix: split tokens before normalization (#1582)
This commit is contained in:
parent
b4ba55d8d7
commit
d3b2cb047e
@ -131,11 +131,10 @@ std::vector<std::u32string> BPETokenizer::bpe(const std::u32string& token) const
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<int> BPETokenizer::encode(const std::string& text, on_new_token_cb_t on_new_token_cb) {
|
std::vector<int> BPETokenizer::encode(const std::string& text, on_new_token_cb_t on_new_token_cb) {
|
||||||
std::string normalized_text = normalize(text);
|
|
||||||
std::vector<int32_t> bpe_tokens;
|
std::vector<int32_t> bpe_tokens;
|
||||||
std::vector<std::string> token_strs;
|
std::vector<std::string> token_strs;
|
||||||
|
|
||||||
auto splited_texts = split_with_special_tokens(normalized_text, special_tokens);
|
auto splited_texts = split_with_special_tokens(text, special_tokens);
|
||||||
|
|
||||||
for (auto& splited_text : splited_texts) {
|
for (auto& splited_text : splited_texts) {
|
||||||
if (is_special_token(splited_text)) {
|
if (is_special_token(splited_text)) {
|
||||||
@ -160,7 +159,7 @@ std::vector<int> BPETokenizer::encode(const std::string& text, on_new_token_cb_t
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string token_str = token;
|
std::string token_str = normalize(token);
|
||||||
std::u32string utf32_token;
|
std::u32string utf32_token;
|
||||||
if (byte_level_bpe) {
|
if (byte_level_bpe) {
|
||||||
for (int i = 0; i < token_str.length(); i++) {
|
for (int i = 0; i < token_str.length(); i++) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user