Compare commits

..

3 Commits

27 changed files with 1351 additions and 1440 deletions

View File

@ -156,8 +156,10 @@ file(GLOB SD_LIB_SOURCES
"src/*.h"
"src/*.cpp"
"src/*.hpp"
"src/vocab/*.h"
"src/vocab/*.cpp"
"src/tokenizers/*.h"
"src/tokenizers/*.cpp"
"src/tokenizers/vocab/*.h"
"src/tokenizers/vocab/*.cpp"
)
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
@ -250,7 +252,7 @@ endif()
add_subdirectory(thirdparty)
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
target_include_directories(${SD_LIB} PUBLIC . include)
target_include_directories(${SD_LIB} PUBLIC . src include)
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17)

View File

@ -50,7 +50,13 @@ if(SD_SERVER_BUILD_FRONTEND AND EXISTS "${FRONTEND_DIR}")
set_source_files_properties("${GENERATED_HTML_HEADER}" PROPERTIES GENERATED TRUE)
else()
message(WARNING "pnpm not found, frontend build disabled")
if(EXISTS "${GENERATED_HTML_HEADER}")
message(STATUS "pnpm not found; using pre-built frontend header detected at ${GENERATED_HTML_HEADER}")
set(HAVE_FRONTEND_BUILD ON)
add_custom_target(${TARGET}_frontend)
else()
message(WARNING "pnpm not found; frontend build disabled.")
endif()
endif()
else()
message(STATUS "Frontend disabled or directory not found: ${FRONTEND_DIR}")

View File

@ -1,4 +1,4 @@
for f in src/*.cpp src/*.h src/*.hpp src/vocab/*.h src/vocab/*.cpp \
for f in src/*.cpp src/*.h src/*.hpp src/tokenizers/*.h src/tokenizers/*.cpp src/tokenizers/vocab/*.h src/tokenizers/vocab/*.cpp \
examples/cli/*.cpp examples/cli/*.h examples/server/*.cpp \
examples/common/*.hpp examples/common/*.h examples/common/*.cpp; do
[[ "$f" == vocab* ]] && continue

View File

@ -3,455 +3,7 @@
#include "ggml_extend.hpp"
#include "model.h"
#include "tokenize_util.h"
#include "vocab/vocab.h"
/*================================================== CLIPTokenizer ===================================================*/
// Builds the GPT-2 style reversible byte -> unicode mapping used by the CLIP
// BPE tokenizer: printable ASCII and most Latin-1 bytes map to themselves,
// and every remaining byte is remapped to a code point >= 256 so that no
// byte becomes an invisible/whitespace character.
__STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
    std::vector<std::pair<int, std::u32string>> mapping;
    std::set<int> directly_mapped;
    // Map an inclusive byte range to itself.
    auto map_identity = [&](int lo, int hi) {
        for (int byte = lo; byte <= hi; ++byte) {
            directly_mapped.insert(byte);
            mapping.emplace_back(byte, unicode_value_to_utf32(byte));
        }
    };
    map_identity(static_cast<int>('!'), static_cast<int>('~'));
    map_identity(161, 172);
    map_identity(174, 255);
    // Remaining bytes get fresh code points starting at 256, in byte order.
    int next_offset = 0;
    for (int byte = 0; byte < 256; ++byte) {
        if (directly_mapped.find(byte) == directly_mapped.end()) {
            mapping.emplace_back(byte, unicode_value_to_utf32(next_offset + 256));
            ++next_offset;
        }
    }
    // LOG_DEBUG("byte_unicode_pairs %d", byte_unicode_pairs.size());
    return mapping;
}
// Ref: https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py
typedef std::function<bool(std::string&, std::vector<int32_t>&)> on_new_token_cb_t;
// CLIP byte-pair-encoding tokenizer, ported from OpenAI's simple_tokenizer.py
// (see Ref link above). Input text is whitespace-normalised and lower-cased,
// split with a GPT-2 style regex, mapped byte-by-byte into a reversible
// unicode alphabet, then merged with the BPE rules from the merges file.
// Ids follow the stock CLIP vocab (49,408 entries):
// <|startoftext|> = 49406, <|endoftext|> = 49407.
class CLIPTokenizer {
private:
std::map<int, std::u32string> byte_encoder;  // raw byte -> unicode symbol (from bytes_to_unicode())
std::map<std::u32string, int> byte_decoder;  // inverse of byte_encoder
std::map<std::u32string, int> encoder;  // token text -> vocab id
std::map<int, std::u32string> decoder;  // vocab id -> token text
std::map<std::pair<std::u32string, std::u32string>, int> bpe_ranks;  // merge pair -> rank; lower rank merges first
std::regex pat;  // NOTE(review): never initialised or used; token_split() builds its own regex
int encoder_len;  // current vocab size / next free id
int bpe_len;  // number of merge rules loaded
std::vector<std::string> special_tokens;  // tokens that bypass BPE and go through the callback
public:
const std::string UNK_TOKEN = "<|endoftext|>";
const std::string BOS_TOKEN = "<|startoftext|>";
const std::string EOS_TOKEN = "<|endoftext|>";
const std::string PAD_TOKEN = "<|endoftext|>";
const int UNK_TOKEN_ID = 49407;
const int BOS_TOKEN_ID = 49406;
const int EOS_TOKEN_ID = 49407;
const int PAD_TOKEN_ID = 49407;  // overridable via the constructor (e.g. SD2-style checkpoints pad with 0)
private:
// Trims ASCII whitespace from both ends; returns "" for all-whitespace input.
static std::string strip(const std::string& str) {
std::string::size_type start = str.find_first_not_of(" \t\n\r\v\f");
std::string::size_type end = str.find_last_not_of(" \t\n\r\v\f");
if (start == std::string::npos) {
// String contains only whitespace characters
return "";
}
return str.substr(start, end - start + 1);
}
// Collapses every whitespace run to a single space, then trims the ends.
static std::string whitespace_clean(std::string text) {
text = std::regex_replace(text, std::regex(R"(\s+)"), " ");
text = strip(text);
return text;
}
// Returns the set of adjacent (left, right) symbol pairs; empty for 0/1 symbols.
static std::set<std::pair<std::u32string, std::u32string>> get_pairs(const std::vector<std::u32string>& subwords) {
std::set<std::pair<std::u32string, std::u32string>> pairs;
if (subwords.size() == 0) {
return pairs;
}
std::u32string prev_subword = subwords[0];
for (int i = 1; i < subwords.size(); i++) {
std::u32string subword = subwords[i];
std::pair<std::u32string, std::u32string> pair(prev_subword, subword);
pairs.insert(pair);
prev_subword = subword;
}
return pairs;
}
// True if `token` exactly matches one of the registered special tokens.
bool is_special_token(const std::string& token) {
for (auto& special_token : special_tokens) {
if (special_token == token) {
return true;
}
}
return false;
}
public:
// pad_token_id: id used when padding (CLIP default 49407).
// merges_utf8_str: contents of a merges.txt; empty -> bundled CLIP merges
// (load_clip_merges()).
CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "")
: PAD_TOKEN_ID(pad_token_id) {
if (merges_utf8_str.size() > 0) {
load_from_merges(merges_utf8_str);
} else {
load_from_merges(load_clip_merges());
}
add_special_token("<|startoftext|>");
add_special_token("<|endoftext|>");
}
// Builds the byte tables, the merge ranks, and the full vocab from the text
// of a merges file. Only supports the stock CLIP merges list (asserts on
// the exact line count: 1 header line + 48,894 merge rules).
void load_from_merges(const std::string& merges_utf8_str) {
auto byte_unicode_pairs = bytes_to_unicode();
// printf("byte_unicode_pairs have %lu pairs \n", byte_unicode_pairs.size());
byte_encoder = std::map<int, std::u32string>(byte_unicode_pairs.begin(), byte_unicode_pairs.end());
for (auto& pair : byte_unicode_pairs) {
byte_decoder[pair.second] = pair.first;
}
// for (auto & pair: byte_unicode_pairs) {
// std::cout << pair.first << ": " << pair.second << std::endl;
// }
// Split the merges text into lines (last line after the final '\n' is dropped).
std::vector<std::u32string> merges;
size_t start = 0;
size_t pos;
std::u32string merges_utf32_str = utf8_to_utf32(merges_utf8_str);
while ((pos = merges_utf32_str.find('\n', start)) != std::string::npos) {
merges.push_back(merges_utf32_str.substr(start, pos - start));
start = pos + 1;
}
// LOG_DEBUG("merges size %llu", merges.size());
GGML_ASSERT(merges.size() == 48895);
merges = std::vector<std::u32string>(merges.begin() + 1, merges.end());  // drop the "#version" header line
std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
for (const auto& merge : merges) {
size_t space_pos = merge.find(' ');
merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1));
// LOG_DEBUG("%s", utf32_to_utf8(merge.substr(space_pos + 1)).c_str());
// printf("%s :: %s | %s \n", utf32_to_utf8(merge).c_str(), utf32_to_utf8(merge.substr(0, space_pos)).c_str(),
// utf32_to_utf8(merge.substr(space_pos + 1)).c_str());
}
// Vocab id order: 256 plain byte symbols, 256 "</w>" (end-of-word) byte
// symbols, 48,894 merged pairs, then the two special tokens = 49,408 ids.
std::vector<std::u32string> vocab;
for (const auto& pair : byte_unicode_pairs) {
vocab.push_back(pair.second);
}
for (const auto& pair : byte_unicode_pairs) {
vocab.push_back(pair.second + utf8_to_utf32("</w>"));
}
for (const auto& merge : merge_pairs) {
vocab.push_back(merge.first + merge.second);
}
vocab.push_back(utf8_to_utf32("<|startoftext|>"));
vocab.push_back(utf8_to_utf32("<|endoftext|>"));
LOG_DEBUG("vocab size: %llu", vocab.size());
int i = 0;
for (const auto& token : vocab) {
encoder[token] = i;
decoder[i] = token;
i++;
}
encoder_len = i;
// Check used by the photomaker "img" trigger-word feature.
auto it = encoder.find(utf8_to_utf32("img</w>"));
if (it != encoder.end()) {
LOG_DEBUG("trigger word img already in vocab");
} else {
LOG_DEBUG("trigger word img not in vocab yet");
}
int rank = 0;
for (const auto& merge : merge_pairs) {
bpe_ranks[merge] = rank++;
}
bpe_len = rank;
};
// NOTE(review): the condition below looks inverted — it re-assigns a NEW id
// to a token that is ALREADY in the vocab and does nothing for an unknown
// token. Presumably `it == encoder.end()` (append-if-missing) was intended;
// confirm against callers before changing.
void add_token(const std::string& text) {
std::u32string token = utf8_to_utf32(text);
auto it = encoder.find(token);
if (it != encoder.end()) {
encoder[token] = encoder_len;
decoder[encoder_len] = token;
encoder_len++;
}
}
// Registers a token that bypasses BPE splitting in encode().
void add_special_token(const std::string& token) {
special_tokens.push_back(token);
}
// Applies BPE merges to one regex-split word. The word starts as individual
// code points with "</w>" appended to the last one; the lowest-ranked
// adjacent pair is merged repeatedly until no ranked pair remains. Returns
// the surviving symbols joined by single spaces.
// NOTE(review): an empty `token` underflows `token.size() - 1` (size_t) and
// indexes out of range — callers must only pass non-empty tokens.
std::u32string bpe(const std::u32string& token) {
std::vector<std::u32string> word;
for (int i = 0; i < token.size() - 1; i++) {
word.emplace_back(1, token[i]);
}
word.push_back(token.substr(token.size() - 1) + utf8_to_utf32("</w>"));
std::set<std::pair<std::u32string, std::u32string>> pairs = get_pairs(word);
if (pairs.empty()) {
return token + utf8_to_utf32("</w>");
}
while (true) {
// Find the adjacent pair with the lowest merge rank; unranked pairs sort last.
auto min_pair_iter = std::min_element(pairs.begin(),
pairs.end(),
[&](const std::pair<std::u32string, std::u32string>& a,
const std::pair<std::u32string, std::u32string>& b) {
if (bpe_ranks.find(a) == bpe_ranks.end()) {
return false;
} else if (bpe_ranks.find(b) == bpe_ranks.end()) {
return true;
}
return bpe_ranks.at(a) < bpe_ranks.at(b);
});
const std::pair<std::u32string, std::u32string>& bigram = *min_pair_iter;
if (bpe_ranks.find(bigram) == bpe_ranks.end()) {
break;  // no mergeable pair left
}
std::u32string first = bigram.first;
std::u32string second = bigram.second;
// Rebuild the word, fusing every (first, second) occurrence into one symbol.
std::vector<std::u32string> new_word;
int32_t i = 0;
while (i < word.size()) {
auto it = std::find(word.begin() + i, word.end(), first);
if (it == word.end()) {
new_word.insert(new_word.end(), word.begin() + i, word.end());
break;
}
new_word.insert(new_word.end(), word.begin() + i, it);
i = static_cast<int32_t>(std::distance(word.begin(), it));
if (word[i] == first && i < static_cast<int32_t>(word.size()) - 1 && word[i + 1] == second) {
new_word.push_back(first + second);
i += 2;
} else {
new_word.push_back(word[i]);
i += 1;
}
}
word = new_word;
if (word.size() == 1) {
break;
}
pairs = get_pairs(word);
}
// Join the final symbols with single spaces (encode() re-splits on ' ').
std::u32string result;
for (int i = 0; i < word.size(); i++) {
result += word[i];
if (i != word.size() - 1) {
result += utf8_to_utf32(" ");
}
}
return result;
}
// Encodes `text` and frames it: BOS first, EOS last, truncated to
// max_length (EOS always kept) and optionally padded with PAD_TOKEN_ID.
// max_length == 0 means no framing limit (only BOS is added).
std::vector<int> tokenize(std::string text,
on_new_token_cb_t on_new_token_cb,
size_t max_length = 0,
bool padding = false) {
std::vector<int32_t> tokens = encode(text, on_new_token_cb);
tokens.insert(tokens.begin(), BOS_TOKEN_ID);
if (max_length > 0) {
if (tokens.size() > max_length - 1) {
tokens.resize(max_length - 1);
tokens.push_back(EOS_TOKEN_ID);
} else {
tokens.push_back(EOS_TOKEN_ID);
if (padding) {
tokens.insert(tokens.end(), max_length - tokens.size(), PAD_TOKEN_ID);
}
}
}
return tokens;
}
// Re-frames unframed tokens/weights into n chunks of exactly `max_length`
// (n = ceil(tokens / (max_length - 2)), so long prompts become several
// BOS...EOS windows, the classic CLIP 77-token chunking). Inserted
// BOS/EOS/PAD positions get weight 1.0. No-op unless max_length > 0 and
// padding is requested.
void pad_tokens(std::vector<int>& tokens,
std::vector<float>& weights,
size_t max_length = 0,
bool padding = false) {
if (max_length > 0 && padding) {
size_t n = static_cast<size_t>(std::ceil(tokens.size() * 1.0 / (max_length - 2)));
if (n == 0) {
n = 1;
}
size_t length = max_length * n;
LOG_DEBUG("token length: %llu", length);
std::vector<int> new_tokens;
std::vector<float> new_weights;
new_tokens.push_back(BOS_TOKEN_ID);
new_weights.push_back(1.0);
int token_idx = 0;
for (int i = 1; i < length; i++) {
if (token_idx >= tokens.size()) {
break;  // input exhausted; closing EOS is appended below
}
if (i % max_length == 0) {
new_tokens.push_back(BOS_TOKEN_ID);
new_weights.push_back(1.0);
} else if (i % max_length == max_length - 1) {
new_tokens.push_back(EOS_TOKEN_ID);
new_weights.push_back(1.0);
} else {
new_tokens.push_back(tokens[token_idx]);
new_weights.push_back(weights[token_idx]);
token_idx++;
}
}
new_tokens.push_back(EOS_TOKEN_ID);
new_weights.push_back(1.0);
tokens = new_tokens;
weights = new_weights;
if (padding) {
tokens.insert(tokens.end(), length - tokens.size(), PAD_TOKEN_ID);
weights.insert(weights.end(), length - weights.size(), 1.0);
}
}
}
// Removes the space the decoder emits before commas (" ," -> ",").
std::string clean_up_tokenization(std::string& text) {
std::regex pattern(R"( ,)");
// Replace " ," with ","
std::string result = std::regex_replace(text, pattern, ",");
return result;
}
// Maps ids back to text. BOS/EOS (hard-coded 49406/49407) are skipped;
// a "</w>" suffix becomes a trailing space. Note: bytes remapped by
// byte_encoder are NOT mapped back here (see the commented-out draft below).
std::string decode(const std::vector<int>& tokens) {
std::string text = "";
for (int t : tokens) {
if (t == 49406 || t == 49407)
continue;
std::u32string ts = decoder[t];
// printf("%d, %s \n", t, utf32_to_utf8(ts).c_str());
std::string s = utf32_to_utf8(ts);
if (s.length() >= 4) {
if (ends_with(s, "</w>")) {
// replace count overshoots but is clamped: erases "</w>" to the end
text += s.replace(s.length() - 4, s.length() - 1, "") + " ";
} else {
text += s;
}
} else {
text += " " + s;
}
}
// std::vector<unsigned char> bytes;
// for (auto c : text){
// bytes.push_back(byte_decoder[c]);
// }
// std::string s((char *)bytes.data());
// std::string s = "";
text = clean_up_tokenization(text);
return trim(text);
}
// Splits text into word-level tokens with the (case-insensitive) GPT-2
// pattern: contractions, letter runs, single digits, other-symbol runs.
std::vector<std::string> token_split(const std::string& text) {
std::regex pat(R"('s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
std::regex::icase);
std::sregex_iterator iter(text.begin(), text.end(), pat);
std::sregex_iterator end;
std::vector<std::string> result;
for (; iter != end; ++iter) {
result.emplace_back(iter->str());
}
return result;
}
// Core encode: normalise whitespace, lower-case, split around special
// tokens, then regex-split, byte-encode and BPE-merge each word into ids.
// `on_new_token_cb` may claim a token (returns true) to handle it itself,
// e.g. for textual-inversion embeddings.
std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb) {
std::string original_text = text;
std::vector<int32_t> bpe_tokens;
text = whitespace_clean(text);
std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });
std::string str = text;  // NOTE(review): unused local
std::vector<std::string> token_strs;
auto splited_texts = split_with_special_tokens(text, special_tokens);
for (auto& splited_text : splited_texts) {
LOG_DEBUG("token %s", splited_text.c_str());
if (is_special_token(splited_text)) {
LOG_DEBUG("special %s", splited_text.c_str());
// NOTE(review): unlike the per-token path below, the callback is invoked
// here without a nullptr check — a null callback would crash on a
// special token.
bool skip = on_new_token_cb(splited_text, bpe_tokens);
if (skip) {
token_strs.push_back(splited_text);
continue;
}
continue;  // makes the branch above redundant; special tokens are never BPE-encoded
}
auto tokens = token_split(splited_text);
for (auto& token : tokens) {
if (on_new_token_cb != nullptr) {
bool skip = on_new_token_cb(token, bpe_tokens);
if (skip) {
token_strs.push_back(token);
continue;
}
}
std::string token_str = token;
// Map each raw byte into the reversible unicode alphabet before BPE.
std::u32string utf32_token;
for (int i = 0; i < token_str.length(); i++) {
unsigned char b = token_str[i];
utf32_token += byte_encoder[b];
}
// bpe() returns space-joined sub-tokens; look up each one's id.
auto bpe_strs = bpe(utf32_token);
size_t start = 0;
size_t pos;
while ((pos = bpe_strs.find(' ', start)) != std::u32string::npos) {
auto bpe_str = bpe_strs.substr(start, pos - start);
bpe_tokens.push_back(encoder[bpe_str]);
token_strs.push_back(utf32_to_utf8(bpe_str));
start = pos + 1;
}
auto bpe_str = bpe_strs.substr(start, bpe_strs.size() - start);
bpe_tokens.push_back(encoder[bpe_str]);
token_strs.push_back(utf32_to_utf8(bpe_str));
}
}
// std::stringstream ss;
// ss << "[";
// for (auto token : token_strs) {
// ss << "\"" << token << "\", ";
// }
// ss << "]";
// LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
// printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str());
return bpe_tokens;
}
};
#include "tokenizers/clip_tokenizer.h"
/*================================================ FrozenCLIPEmbedder ================================================*/

View File

@ -256,15 +256,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
return true;
}
std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
tokenize_with_trigger_token(std::string text,
int num_input_imgs,
int32_t image_token,
bool padding = false) {
return tokenize_with_trigger_token(text, num_input_imgs, image_token,
text_model->model.n_token, padding);
}
std::vector<int> convert_token_to_id(std::string text) {
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
auto iter = embedding_map.find(str);
@ -288,9 +279,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
tokenize_with_trigger_token(std::string text,
int num_input_imgs,
int32_t image_token,
size_t max_length = 0,
bool padding = false) {
int32_t image_token) {
auto parsed_attention = parse_prompt_attention(text);
{
@ -377,7 +366,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
// tokens.insert(tokens.begin(), tokenizer.BOS_TOKEN_ID);
// weights.insert(weights.begin(), 1.0);
tokenizer.pad_tokens(tokens, weights, max_length, padding);
tokenizer.pad_tokens(tokens, &weights, nullptr, text_model->model.n_token, text_model->model.n_token, true);
int offset = pm_version == PM_VERSION_2 ? 2 * num_input_imgs : num_input_imgs;
for (int i = 0; i < tokens.size(); i++) {
// if (class_idx + 1 <= i && i < class_idx + 1 + 2*num_input_imgs) // photomaker V2 has num_tokens(=2)*num_input_imgs
@ -403,13 +392,9 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
}
std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
bool padding = false) {
return tokenize(text, text_model->model.n_token, padding);
}
std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
size_t max_length = 0,
bool padding = false) {
size_t min_length = 0,
size_t max_length = 0,
bool allow_overflow_expand = true) {
auto parsed_attention = parse_prompt_attention(text);
{
@ -460,7 +445,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
weights.insert(weights.end(), curr_tokens.size(), curr_weight);
}
tokenizer.pad_tokens(tokens, weights, max_length, padding);
tokenizer.pad_tokens(tokens, &weights, nullptr, min_length, max_length, allow_overflow_expand);
// for (int i = 0; i < tokens.size(); i++) {
// std::cout << tokens[i] << ":" << weights[i] << ", ";
@ -603,8 +588,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
GGML_ASSERT(image_tokens.size() == 1);
auto tokens_and_weights = tokenize_with_trigger_token(conditioner_params.text,
conditioner_params.num_input_imgs,
image_tokens[0],
true);
image_tokens[0]);
std::vector<int>& tokens = std::get<0>(tokens_and_weights);
std::vector<float>& weights = std::get<1>(tokens_and_weights);
std::vector<bool>& clsm = std::get<2>(tokens_and_weights);
@ -630,7 +614,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
std::string remove_trigger_from_prompt(const std::string& prompt) override {
auto image_tokens = convert_token_to_id(trigger_word);
GGML_ASSERT(image_tokens.size() == 1);
auto tokens_and_weights = tokenize(prompt, false);
auto tokens_and_weights = tokenize(prompt);
std::vector<int>& tokens = tokens_and_weights.first;
auto it = std::find(tokens.begin(), tokens.end(), image_tokens[0]);
GGML_ASSERT(it != tokens.end()); // prompt must have trigger word
@ -640,7 +624,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
SDCondition get_learned_condition(int n_threads,
const ConditionerParams& conditioner_params) override {
auto tokens_and_weights = tokenize(conditioner_params.text, true);
auto tokens_and_weights = tokenize(conditioner_params.text, text_model->model.n_token, text_model->model.n_token, true);
std::vector<int>& tokens = tokens_and_weights.first;
std::vector<float>& weights = tokens_and_weights.second;
return get_learned_condition_common(n_threads,
@ -822,8 +806,9 @@ struct SD3CLIPEmbedder : public Conditioner {
}
std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
size_t max_length = 0,
bool padding = false) {
size_t min_length = 0,
size_t max_length = 0,
bool allow_overflow_expand = true) {
auto parsed_attention = parse_prompt_attention(text);
{
@ -860,20 +845,20 @@ struct SD3CLIPEmbedder : public Conditioner {
clip_g_weights.insert(clip_g_weights.end(), curr_tokens.size(), curr_weight);
}
if (t5) {
std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);
std::vector<int> curr_tokens = t5_tokenizer.encode(curr_text);
t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
}
}
if (clip_l) {
clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, max_length, padding);
clip_l_tokenizer.pad_tokens(clip_l_tokens, &clip_l_weights, nullptr, min_length, max_length, allow_overflow_expand);
}
if (clip_g) {
clip_g_tokenizer.pad_tokens(clip_g_tokens, clip_g_weights, max_length, padding);
clip_g_tokenizer.pad_tokens(clip_g_tokens, &clip_g_weights, nullptr, min_length, max_length, allow_overflow_expand);
}
if (t5) {
t5_tokenizer.pad_tokens(t5_tokens, t5_weights, nullptr, max_length, padding);
t5_tokenizer.pad_tokens(t5_tokens, &t5_weights, nullptr, min_length, max_length, true);
}
// for (int i = 0; i < clip_l_tokens.size(); i++) {
@ -1056,7 +1041,7 @@ struct SD3CLIPEmbedder : public Conditioner {
SDCondition get_learned_condition(int n_threads,
const ConditionerParams& conditioner_params) override {
auto tokens_and_weights = tokenize(conditioner_params.text, 77, true);
auto tokens_and_weights = tokenize(conditioner_params.text, 77, 77, true);
return get_learned_condition_common(n_threads,
tokens_and_weights,
conditioner_params.clip_skip,
@ -1158,8 +1143,8 @@ struct FluxCLIPEmbedder : public Conditioner {
}
std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
size_t max_length = 0,
bool padding = false) {
size_t min_length = 0,
size_t max_length = 0) {
auto parsed_attention = parse_prompt_attention(text);
{
@ -1189,17 +1174,17 @@ struct FluxCLIPEmbedder : public Conditioner {
clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight);
}
if (t5) {
std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);
std::vector<int> curr_tokens = t5_tokenizer.encode(curr_text);
t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
}
}
if (clip_l) {
clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, 77, padding);
clip_l_tokenizer.pad_tokens(clip_l_tokens, &clip_l_weights, nullptr, 77, 77, true);
}
if (t5) {
t5_tokenizer.pad_tokens(t5_tokens, t5_weights, nullptr, max_length, padding);
t5_tokenizer.pad_tokens(t5_tokens, &t5_weights, nullptr, min_length, max_length, true);
}
// for (int i = 0; i < clip_l_tokens.size(); i++) {
@ -1300,7 +1285,7 @@ struct FluxCLIPEmbedder : public Conditioner {
SDCondition get_learned_condition(int n_threads,
const ConditionerParams& conditioner_params) override {
auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true);
auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, chunk_len);
return get_learned_condition_common(n_threads,
tokens_and_weights,
conditioner_params.clip_skip,
@ -1377,8 +1362,8 @@ struct T5CLIPEmbedder : public Conditioner {
}
std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
size_t max_length = 0,
bool padding = false) {
size_t min_length = 0,
size_t max_length = 0) {
auto parsed_attention = parse_prompt_attention(text);
{
@ -1403,12 +1388,15 @@ struct T5CLIPEmbedder : public Conditioner {
const std::string& curr_text = item.first;
float curr_weight = item.second;
std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);
std::vector<int> curr_tokens = t5_tokenizer.encode(curr_text);
t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
}
t5_tokenizer.pad_tokens(t5_tokens, t5_weights, &t5_mask, max_length, padding);
t5_tokenizer.pad_tokens(t5_tokens, &t5_weights, &t5_mask, min_length, max_length, true);
for (auto& mask_value : t5_mask) {
mask_value = mask_value > 0.0f ? 0.0f : -HUGE_VALF;
}
}
return {t5_tokens, t5_weights, t5_mask};
}
@ -1496,7 +1484,7 @@ struct T5CLIPEmbedder : public Conditioner {
SDCondition get_learned_condition(int n_threads,
const ConditionerParams& conditioner_params) override {
auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true);
auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, chunk_len);
return get_learned_condition_common(n_threads,
tokens_and_weights,
conditioner_params.clip_skip,
@ -1505,14 +1493,14 @@ struct T5CLIPEmbedder : public Conditioner {
};
struct AnimaConditioner : public Conditioner {
std::shared_ptr<LLM::BPETokenizer> qwen_tokenizer;
std::shared_ptr<BPETokenizer> qwen_tokenizer;
T5UniGramTokenizer t5_tokenizer;
std::shared_ptr<LLM::LLMRunner> llm;
AnimaConditioner(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {}) {
qwen_tokenizer = std::make_shared<LLM::Qwen2Tokenizer>();
qwen_tokenizer = std::make_shared<Qwen2Tokenizer>();
llm = std::make_shared<LLM::LLMRunner>(LLM::LLMArch::QWEN3,
backend,
offload_params_to_cpu,
@ -1578,7 +1566,7 @@ struct AnimaConditioner : public Conditioner {
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);
std::vector<int> curr_tokens = t5_tokenizer.tokenize(curr_text, nullptr, true);
t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
}
@ -1620,7 +1608,7 @@ struct AnimaConditioner : public Conditioner {
struct LLMEmbedder : public Conditioner {
SDVersion version;
std::shared_ptr<LLM::BPETokenizer> tokenizer;
std::shared_ptr<BPETokenizer> tokenizer;
std::shared_ptr<LLM::LLMRunner> llm;
LLMEmbedder(ggml_backend_t backend,
@ -1637,9 +1625,9 @@ struct LLMEmbedder : public Conditioner {
arch = LLM::LLMArch::QWEN3;
}
if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2) {
tokenizer = std::make_shared<LLM::MistralTokenizer>();
tokenizer = std::make_shared<MistralTokenizer>();
} else {
tokenizer = std::make_shared<LLM::Qwen2Tokenizer>();
tokenizer = std::make_shared<Qwen2Tokenizer>();
}
llm = std::make_shared<LLM::LLMRunner>(arch,
backend,
@ -1677,10 +1665,10 @@ struct LLMEmbedder : public Conditioner {
}
}
std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
const std::pair<int, int>& attn_range,
size_t max_length = 0,
bool padding = false) {
std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
const std::pair<int, int>& attn_range,
size_t min_length = 0,
size_t max_length = 100000000) {
std::vector<std::pair<std::string, float>> parsed_attention;
if (attn_range.first >= 0 && attn_range.second > 0) {
parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
@ -1710,39 +1698,34 @@ struct LLMEmbedder : public Conditioner {
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
std::vector<int> curr_tokens = tokenizer->tokenize(curr_text, nullptr);
std::vector<int> curr_tokens = tokenizer->encode(curr_text, nullptr);
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
weights.insert(weights.end(), curr_tokens.size(), curr_weight);
}
tokenizer->pad_tokens(tokens, weights, max_length, padding);
std::vector<float> mask;
tokenizer->pad_tokens(tokens, &weights, &mask, min_length, max_length);
// for (int i = 0; i < tokens.size(); i++) {
// std::cout << tokens[i] << ":" << weights[i] << ", " << i << std::endl;
// }
// std::cout << std::endl;
return {tokens, weights};
return {tokens, weights, mask};
}
sd::Tensor<float> encode_prompt(int n_threads,
const std::string prompt,
const std::pair<int, int>& prompt_attn_range,
int max_length,
int min_length,
int hidden_states_min_length,
const std::vector<std::pair<int, sd::Tensor<float>>>& image_embeds,
const std::set<int>& out_layers,
int prompt_template_encode_start_idx) {
auto tokens_and_weights = tokenize(prompt, prompt_attn_range);
auto& tokens = std::get<0>(tokens_and_weights);
auto& weights = std::get<1>(tokens_and_weights);
std::vector<float> mask;
if (max_length > 0 && tokens.size() < max_length) {
mask.insert(mask.end(), tokens.size(), 1.f);
mask.insert(mask.end(), max_length - tokens.size(), 0.f);
tokenizer->pad_tokens(tokens, weights, max_length, true);
}
auto tokens_weights_mask = tokenize(prompt, prompt_attn_range, min_length);
auto& tokens = std::get<0>(tokens_weights_mask);
auto& weights = std::get<1>(tokens_weights_mask);
auto& mask = std::get<2>(tokens_weights_mask);
sd::Tensor<int32_t> input_ids({static_cast<int64_t>(tokens.size())}, tokens);
sd::Tensor<float> attention_mask;
@ -1769,9 +1752,9 @@ struct LLMEmbedder : public Conditioner {
GGML_ASSERT(hidden_states.shape()[1] > prompt_template_encode_start_idx);
int64_t zero_pad_len = 0;
if (min_length > 0) {
if (hidden_states.shape()[1] - prompt_template_encode_start_idx < min_length) {
zero_pad_len = min_length - hidden_states.shape()[1] + prompt_template_encode_start_idx;
if (hidden_states_min_length > 0) {
if (hidden_states.shape()[1] - prompt_template_encode_start_idx < hidden_states_min_length) {
zero_pad_len = hidden_states_min_length - hidden_states.shape()[1] + prompt_template_encode_start_idx;
}
}
@ -1798,8 +1781,8 @@ struct LLMEmbedder : public Conditioner {
std::vector<std::pair<int, int>> extra_prompts_attn_range;
std::vector<std::pair<int, sd::Tensor<float>>> image_embeds;
int prompt_template_encode_start_idx = 34;
int max_length = 0; // pad tokens
int min_length = 0; // zero pad hidden_states
int min_length = 0; // pad tokens
int hidden_states_min_length = 0; // zero pad hidden_states
std::set<int> out_layers;
int64_t t0 = ggml_time_ms();
@ -1874,7 +1857,7 @@ struct LLMEmbedder : public Conditioner {
}
} else if (version == VERSION_FLUX2) {
prompt_template_encode_start_idx = 0;
min_length = 512;
hidden_states_min_length = 512;
out_layers = {10, 20, 30};
prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
@ -1907,7 +1890,7 @@ struct LLMEmbedder : public Conditioner {
}
} else if (version == VERSION_FLUX2_KLEIN) {
prompt_template_encode_start_idx = 0;
max_length = 512;
min_length = 512;
out_layers = {9, 18, 27};
prompt = "<|im_start|>user\n";
@ -1919,7 +1902,7 @@ struct LLMEmbedder : public Conditioner {
prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
} else if (version == VERSION_OVIS_IMAGE) {
prompt_template_encode_start_idx = 28;
max_length = prompt_template_encode_start_idx + 256;
min_length = prompt_template_encode_start_idx + 256;
prompt = "<|im_start|>user\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background:";
@ -1935,8 +1918,8 @@ struct LLMEmbedder : public Conditioner {
auto hidden_states = encode_prompt(n_threads,
prompt,
prompt_attn_range,
max_length,
min_length,
hidden_states_min_length,
image_embeds,
out_layers,
prompt_template_encode_start_idx);
@ -1945,8 +1928,8 @@ struct LLMEmbedder : public Conditioner {
auto extra_hidden_states = encode_prompt(n_threads,
extra_prompts[i],
extra_prompts_attn_range[i],
max_length,
min_length,
hidden_states_min_length,
image_embeds,
out_layers,
prompt_template_encode_start_idx);

View File

@ -14,465 +14,16 @@
#include <utility>
#include <vector>
#include "clip.hpp"
#include "ggml_extend.hpp"
#include "json.hpp"
#include "rope.hpp"
#include "tokenize_util.h"
#include "vocab/vocab.h"
#include "tokenizers/bpe_tokenizer.h"
#include "tokenizers/mistral_tokenizer.h"
#include "tokenizers/qwen2_tokenizer.h"
namespace LLM {
constexpr int LLM_GRAPH_SIZE = 10240;
class BPETokenizer {
protected:
std::map<int, std::u32string> byte_encoder;
std::map<std::u32string, int> byte_decoder;
std::map<std::u32string, int> encoder;
std::map<int, std::u32string> decoder;
std::map<std::pair<std::u32string, std::u32string>, int> bpe_ranks;
std::regex pat;
int encoder_len;
int bpe_len;
std::string UNK_TOKEN;
std::string BOS_TOKEN;
std::string EOS_TOKEN;
std::string PAD_TOKEN;
int UNK_TOKEN_ID;
int BOS_TOKEN_ID;
int EOS_TOKEN_ID;
int PAD_TOKEN_ID;
std::vector<std::string> special_tokens;
bool add_bos_token = false;
protected:
// Removes leading and trailing ASCII whitespace; returns "" when the input
// is empty or consists only of whitespace.
static std::string strip(const std::string& str) {
    const char* kWhitespace = " \t\n\r\v\f";
    const auto first = str.find_first_not_of(kWhitespace);
    if (first == std::string::npos) {
        return "";  // nothing but whitespace
    }
    const auto last = str.find_last_not_of(kWhitespace);
    return str.substr(first, last - first + 1);
}
// Collapses every whitespace run into a single space, then strips the ends.
static std::string whitespace_clean(std::string text) {
    const std::regex whitespace_run(R"(\s+)");
    return strip(std::regex_replace(text, whitespace_run, " "));
}
// Collects the set of adjacent (left, right) symbol pairs in `subwords`.
// Inputs with fewer than two symbols yield an empty set.
static std::set<std::pair<std::u32string, std::u32string>> get_pairs(const std::vector<std::u32string>& subwords) {
    std::set<std::pair<std::u32string, std::u32string>> result;
    for (size_t i = 1; i < subwords.size(); ++i) {
        result.emplace(subwords[i - 1], subwords[i]);
    }
    return result;
}
bool is_special_token(const std::string& token) {
for (auto& special_token : special_tokens) {
if (special_token == token) {
return true;
}
}
return false;
}
public:
BPETokenizer() = default;
std::u32string bpe(const std::u32string& token) {
std::vector<std::u32string> word;
for (int i = 0; i < token.size(); i++) {
word.emplace_back(1, token[i]);
}
std::set<std::pair<std::u32string, std::u32string>> pairs = get_pairs(word);
if (pairs.empty()) {
return token;
}
while (true) {
auto min_pair_iter = std::min_element(pairs.begin(),
pairs.end(),
[&](const std::pair<std::u32string, std::u32string>& a,
const std::pair<std::u32string, std::u32string>& b) {
if (bpe_ranks.find(a) == bpe_ranks.end()) {
return false;
} else if (bpe_ranks.find(b) == bpe_ranks.end()) {
return true;
}
return bpe_ranks.at(a) < bpe_ranks.at(b);
});
const std::pair<std::u32string, std::u32string>& bigram = *min_pair_iter;
if (bpe_ranks.find(bigram) == bpe_ranks.end()) {
break;
}
std::u32string first = bigram.first;
std::u32string second = bigram.second;
std::vector<std::u32string> new_word;
int32_t i = 0;
while (i < word.size()) {
auto it = std::find(word.begin() + i, word.end(), first);
if (it == word.end()) {
new_word.insert(new_word.end(), word.begin() + i, word.end());
break;
}
new_word.insert(new_word.end(), word.begin() + i, it);
i = static_cast<int32_t>(std::distance(word.begin(), it));
if (word[i] == first && i < static_cast<int32_t>(word.size()) - 1 && word[i + 1] == second) {
new_word.push_back(first + second);
i += 2;
} else {
new_word.push_back(word[i]);
i += 1;
}
}
word = new_word;
if (word.size() == 1) {
break;
}
pairs = get_pairs(word);
}
std::u32string result;
for (int i = 0; i < word.size(); i++) {
result += word[i];
if (i != word.size() - 1) {
result += utf8_to_utf32(" ");
}
}
return result;
}
std::vector<int> tokenize(std::string text,
on_new_token_cb_t on_new_token_cb = nullptr,
size_t max_length = 0,
bool padding = false) {
std::vector<int32_t> tokens = encode(text, on_new_token_cb);
if (max_length > 0) {
if (tokens.size() < max_length) {
tokens.resize(max_length);
} else {
if (padding) {
tokens.insert(tokens.end(), max_length - tokens.size(), PAD_TOKEN_ID);
}
}
}
return tokens;
}
void pad_tokens(std::vector<int>& tokens,
std::vector<float>& weights,
size_t max_length = 0,
bool padding = false) {
if (add_bos_token) {
tokens.insert(tokens.begin(), BOS_TOKEN_ID);
weights.insert(weights.begin(), 1.f);
}
if (max_length > 0 && padding) {
size_t n = static_cast<size_t>(std::ceil(tokens.size() * 1.f / max_length));
if (n == 0) {
n = 1;
}
size_t length = max_length * n;
LOG_DEBUG("token length: %llu", length);
tokens.insert(tokens.end(), length - tokens.size(), PAD_TOKEN_ID);
weights.insert(weights.end(), length - weights.size(), 1.f);
}
}
std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb = nullptr) {
std::string original_text = text;
std::vector<int32_t> bpe_tokens;
std::vector<std::string> token_strs;
auto splited_texts = split_with_special_tokens(text, special_tokens);
for (auto& splited_text : splited_texts) {
if (is_special_token(splited_text)) {
bpe_tokens.push_back(encoder[utf8_to_utf32(splited_text)]);
token_strs.push_back(splited_text);
continue;
}
auto tokens = token_split(splited_text);
for (auto& token : tokens) {
if (on_new_token_cb != nullptr) {
bool skip = on_new_token_cb(token, bpe_tokens);
if (skip) {
continue;
}
}
std::string token_str = token;
std::u32string utf32_token;
for (int i = 0; i < token_str.length(); i++) {
unsigned char b = token_str[i];
utf32_token += byte_encoder[b];
}
auto bpe_strs = bpe(utf32_token);
size_t start = 0;
size_t pos;
while ((pos = bpe_strs.find(' ', start)) != std::u32string::npos) {
auto bpe_str = bpe_strs.substr(start, pos - start);
bpe_tokens.push_back(encoder[bpe_str]);
token_strs.push_back(utf32_to_utf8(bpe_str));
start = pos + 1;
}
auto bpe_str = bpe_strs.substr(start, bpe_strs.size() - start);
bpe_tokens.push_back(encoder[bpe_str]);
token_strs.push_back(utf32_to_utf8(bpe_str));
}
}
std::stringstream ss;
ss << "[";
for (auto token : token_strs) {
ss << "\"" << token << "\", ";
}
ss << "]";
LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
// printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str());
return bpe_tokens;
}
};
// Qwen2/Qwen2.5 tokenizer: GPT-2 style byte-level BPE whose vocabulary ids are
// positional — byte symbols first, then merged symbols in merge order, then
// the special tokens — so the construction order below must not change.
class Qwen2Tokenizer : public BPETokenizer {
protected:
    // Rebuilds encoder/decoder and bpe_ranks from a newline-separated merges
    // list ("left right" per line).
    void load_from_merges(const std::string& merges_utf8_str) {
        const auto byte_pairs = bytes_to_unicode();
        byte_encoder          = std::map<int, std::u32string>(byte_pairs.begin(), byte_pairs.end());
        for (const auto& bp : byte_pairs) {
            byte_decoder[bp.second] = bp.first;
        }

        // Split the merges text into lines; any fragment after the last '\n'
        // is ignored (the embedded merges data ends with a newline).
        const std::u32string merges_text = utf8_to_utf32(merges_utf8_str);
        std::vector<std::u32string> lines;
        size_t begin = 0;
        size_t nl;
        while ((nl = merges_text.find('\n', begin)) != std::string::npos) {
            lines.push_back(merges_text.substr(begin, nl - begin));
            begin = nl + 1;
        }
        LOG_DEBUG("merges size %llu", lines.size());

        // Each line is "left right" separated by a single space.
        std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
        merge_pairs.reserve(lines.size());
        for (const auto& line : lines) {
            const size_t sep = line.find(' ');
            merge_pairs.emplace_back(line.substr(0, sep), line.substr(sep + 1));
        }

        // Assign vocab ids sequentially: byte symbols, merges, specials.
        int next_id          = 0;
        auto register_token = [&](const std::u32string& tok) {
            encoder[tok]     = next_id;
            decoder[next_id] = tok;
            ++next_id;
        };
        for (const auto& bp : byte_pairs) {
            register_token(bp.second);
        }
        for (const auto& mp : merge_pairs) {
            register_token(mp.first + mp.second);
        }
        for (const auto& st : special_tokens) {
            register_token(utf8_to_utf32(st));
        }
        encoder_len = next_id;
        LOG_DEBUG("vocab size: %d", encoder_len);

        // Merge priority follows file order.
        int rank = 0;
        for (const auto& mp : merge_pairs) {
            bpe_ranks[mp] = rank++;
        }
        bpe_len = rank;
    }

public:
    // Loads the embedded Qwen2 merges unless an explicit merges string is
    // supplied. Qwen2 uses <|endoftext|> for UNK, EOS and PAD alike.
    explicit Qwen2Tokenizer(const std::string& merges_utf8_str = "") {
        UNK_TOKEN    = "<|endoftext|>";
        EOS_TOKEN    = "<|endoftext|>";
        PAD_TOKEN    = "<|endoftext|>";
        UNK_TOKEN_ID = 151643;
        EOS_TOKEN_ID = 151643;
        PAD_TOKEN_ID = 151643;

        // Order matters: these receive the ids directly after the merges.
        special_tokens = {
            "<|endoftext|>",
            "<|im_start|>",
            "<|im_end|>",
            "<|object_ref_start|>",
            "<|object_ref_end|>",
            "<|box_start|>",
            "<|box_end|>",
            "<|quad_start|>",
            "<|quad_end|>",
            "<|vision_start|>",
            "<|vision_end|>",
            "<|vision_pad|>",
            "<|image_pad|>",
            "<|video_pad|>",
            "<tool_call>",
            "</tool_call>",
            "<|fim_prefix|>",
            "<|fim_middle|>",
            "<|fim_suffix|>",
            "<|fim_pad|>",
            "<|repo_name|>",
            "<|file_sep|>",
            "<tool_response>",
            "</tool_response>",
            "<think>",
            "</think>",
        };

        if (merges_utf8_str.size() > 0) {
            load_from_merges(merges_utf8_str);
        } else {
            load_from_merges(load_qwen2_merges());
        }
    }
};
// Mistral-style BPE tokenizer. Unlike Qwen2, vocab ids are NOT positional:
// they come from an explicit vocab JSON (token -> id), so the special-token
// id constants in the constructor must match the embedded vocab exactly.
class MistralTokenizer : public BPETokenizer {
protected:
    // Builds encoder/decoder from a vocab JSON object and bpe_ranks from a
    // newline-separated merges list ("left right" per line).
    // Aborts the process on malformed vocab JSON.
    void load_from_merges(const std::string& merges_utf8_str, const std::string& vocab_utf8_str) {
        nlohmann::json vocab;
        try {
            vocab = nlohmann::json::parse(vocab_utf8_str);
        } catch (const nlohmann::json::parse_error&) {
            GGML_ABORT("invalid vocab json str");
        }
        // vocab maps token string -> integer id.
        for (const auto& [key, value] : vocab.items()) {
            std::u32string token = utf8_to_utf32(key);
            int i = value;
            encoder[token] = i;
            decoder[i] = token;
        }
        encoder_len = static_cast<int>(vocab.size());
        LOG_DEBUG("vocab size: %d", encoder_len);
        // Byte-level alphabet shared with all GPT-2 style BPE tokenizers.
        auto byte_unicode_pairs = bytes_to_unicode();
        byte_encoder = std::map<int, std::u32string>(byte_unicode_pairs.begin(), byte_unicode_pairs.end());
        for (auto& pair : byte_unicode_pairs) {
            byte_decoder[pair.second] = pair.first;
        }
        // Split merges text into lines; any fragment after the last '\n'
        // is dropped.
        std::vector<std::u32string> merges;
        size_t start = 0;
        size_t pos;
        std::u32string merges_utf32_str = utf8_to_utf32(merges_utf8_str);
        while ((pos = merges_utf32_str.find('\n', start)) != std::string::npos) {
            merges.push_back(merges_utf32_str.substr(start, pos - start));
            start = pos + 1;
        }
        LOG_DEBUG("merges size %llu", merges.size());
        merges = std::vector<std::u32string>(merges.begin(), merges.end());
        // Each merge line is "left right"; file order defines merge priority.
        std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
        for (const auto& merge : merges) {
            size_t space_pos = merge.find(' ');
            merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1));
        }
        int rank = 0;
        for (const auto& merge : merge_pairs) {
            bpe_ranks[merge] = rank++;
        }
        bpe_len = rank;
    };

public:
    // Loads the embedded Mistral merges/vocab unless both are supplied.
    // NOTE(review): the id constants below (e.g. PAD_TOKEN_ID = 11 for
    // "<pad>") are assumed to match the embedded vocab JSON — verify when
    // updating the vocab data.
    explicit MistralTokenizer(const std::string& merges_utf8_str = "", const std::string& vocab_utf8_str = "") {
        add_bos_token = true;  // Mistral prompts start with <s>
        UNK_TOKEN = "<unk>";
        BOS_TOKEN = "<s>";
        EOS_TOKEN = "</s>";
        PAD_TOKEN = "<pad>";
        UNK_TOKEN_ID = 0;
        BOS_TOKEN_ID = 1;
        EOS_TOKEN_ID = 2;
        PAD_TOKEN_ID = 11;
        special_tokens = {
            "<unk>",
            "<s>",
            "</s>",
            "[INST]",
            "[/INST]",
            "[AVAILABLE_TOOLS]",
            "[/AVAILABLE_TOOLS]",
            "[TOOL_RESULTS]",
            "[/TOOL_RESULTS]",
            "[TOOL_CALLS]",
            "[IMG]",
            "<pad>",
            "[IMG_BREAK]",
            "[IMG_END]",
            "[PREFIX]",
            "[MIDDLE]",
            "[SUFFIX]",
            "[SYSTEM_PROMPT]",
            "[/SYSTEM_PROMPT]",
            "[TOOL_CONTENT]",
        };
        // The Mistral vocab reserves <SPECIAL_20> .. <SPECIAL_999> placeholders.
        for (int i = 20; i < 1000; i++) {
            special_tokens.push_back("<SPECIAL_" + std::to_string(i) + ">");
        }
        if (merges_utf8_str.size() > 0 && vocab_utf8_str.size() > 0) {
            load_from_merges(merges_utf8_str, vocab_utf8_str);
        } else {
            load_from_merges(load_mistral_merges(), load_mistral_vocab_json());
        }
    }
};
enum class LLMArch {
QWEN2_5_VL,
QWEN3,
@ -1479,7 +1030,7 @@ namespace LLM {
weights.insert(weights.end(), curr_tokens.size(), curr_weight);
}
tokenizer->pad_tokens(tokens, weights, max_length, padding);
tokenizer->pad_tokens(tokens, &weights, nullptr, padding ? max_length : 0, padding ? max_length : 100000000, padding);
// for (int i = 0; i < tokens.size(); i++) {
// std::cout << tokens[i] << ":" << weights[i] << ", ";

View File

@ -10,452 +10,9 @@
#include <string>
#include <unordered_map>
#include "darts.h"
#include "ggml_extend.hpp"
#include "json.hpp"
#include "model.h"
#include "vocab/vocab.h"
// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h
// and https://github.com/google/sentencepiece/blob/master/src/unigram_model.h.
// Original License: https://github.com/google/sentencepiece/blob/master/LICENSE
//
// Since tokenization is not the bottleneck in SD, performance was not a major consideration
// during the migration.
// "Metaspace" pre-tokenizer (HuggingFace tokenizers style): every space that
// separates words is replaced by `replacement`, and optionally a single
// replacement string is prepended to the whole input.
class MetaspacePreTokenizer {
private:
    std::string replacement;
    bool add_prefix_space;

public:
    MetaspacePreTokenizer(const std::string replacement = " ", bool add_prefix_space = true)
        : replacement(replacement), add_prefix_space(add_prefix_space) {}

    // Returns `input` with word-separating spaces replaced by `replacement`.
    // Consecutive spaces yield empty fragments (and thus consecutive
    // replacements); a fragment that would start exactly at end-of-input is
    // not produced (std::getline semantics).
    std::string tokenize(const std::string& input) const {
        std::vector<std::string> fragments;
        size_t begin = 0;
        size_t space;
        while ((space = input.find(' ', begin)) != std::string::npos) {
            fragments.push_back(input.substr(begin, space - begin));
            begin = space + 1;
        }
        if (begin < input.size()) {
            fragments.push_back(input.substr(begin));
        }

        std::string out;
        if (add_prefix_space) {
            out += replacement;
        }
        for (size_t i = 0; i < fragments.size(); i++) {
            if (i > 0) {
                out += replacement;
            }
            out += fragments[i];
        }
        return out;
    }
};
using EncodeResult = std::vector<std::pair<std::string, int>>;
// SentencePiece-style unigram tokenizer used for T5/UMT5 text encoders.
// Ported from google/sentencepiece (see header comment above); pieces and
// scores are loaded from an embedded tokenizer JSON, indexed with a Darts
// double-array trie, and decoded with an optimized Viterbi search.
class T5UniGramTokenizer {
public:
    enum Status {
        OK,
        NO_PIECES_LOADED,
        NO_ENTRY_FOUND,
        BUILD_DOUBLE_ARRAY_FAILED,
        PIECE_ALREADY_DEFINED,
        INVLIAD_JSON  // (sic) kept for compatibility with existing callers
    };

protected:
    MetaspacePreTokenizer pre_tokenizer;

    // all <piece, score> pairs; index into this vector is the vocab id
    std::vector<std::pair<std::string, float>> piece_score_pairs;

    float min_score_ = 0.0;
    float max_score_ = 0.0;
    std::unique_ptr<Darts::DoubleArray> trie_;

    // Maximum size of the return value of Trie, which corresponds
    // to the maximum size of shared common prefix in the sentence pieces.
    int trie_results_size_;
    // unknown id.
    int unk_id_ = 2;
    std::string eos_token_ = "</s>";
    int eos_id_ = 1;
    int pad_id_ = 0;
    // status.
    Status status_ = OK;
    // Unknown pieces score min_score() - kUnkPenalty.
    float kUnkPenalty = 10.0;
    std::string replacement;
    bool add_prefix_space = true;

    // Parses the embedded tokenizer JSON: reads pre_tokenizer settings
    // (metaspace replacement / prefix-space flag) and the [piece, score]
    // vocab list. Sets status_ = INVLIAD_JSON on any structural problem.
    void InitializePieces(const std::string& json_str) {
        nlohmann::json data;
        try {
            data = nlohmann::json::parse(json_str);
        } catch (const nlohmann::json::parse_error&) {
            status_ = INVLIAD_JSON;
            return;
        }

        if (!data.contains("model")) {
            status_ = INVLIAD_JSON;
            return;
        }

        nlohmann::json model = data["model"];
        if (!model.contains("vocab")) {
            status_ = INVLIAD_JSON;
            return;
        }

        if (model.contains("unk_id")) {
            unk_id_ = model["unk_id"];
        }

        replacement      = data["pre_tokenizer"]["replacement"];
        add_prefix_space = data["pre_tokenizer"]["add_prefix_space"];

        pre_tokenizer = MetaspacePreTokenizer(replacement, add_prefix_space);

        for (const auto& item : model["vocab"]) {
            // Each entry must be a [string piece, float score] pair.
            if (item.size() != 2 || !item[0].is_string() || !item[1].is_number_float()) {
                status_ = INVLIAD_JSON;
                return;
            }

            std::string piece = item[0];
            if (piece.empty()) {
                piece = "<empty_token>";
            }
            float score = item[1];
            piece_score_pairs.emplace_back(piece, score);
        }
    }

    // Builds a Trie index over (piece, vocab_id) pairs. Sorts `pieces`
    // in place (DoubleArray::build requires sorted keys) and computes
    // trie_results_size_, the max number of prefix matches at any position.
    void BuildTrie(std::vector<std::pair<std::string, int>>* pieces) {
        if (status_ != OK)
            return;

        if (pieces->empty()) {
            status_ = NO_PIECES_LOADED;
            return;
        }

        // sort by sentencepiece since DoubleArray::build()
        // only accepts sorted strings.
        sort(pieces->begin(), pieces->end());

        // Makes key/value set for DoubleArrayTrie.
        std::vector<const char*> key(pieces->size());
        std::vector<int> value(pieces->size());
        for (size_t i = 0; i < pieces->size(); ++i) {
            // LOG_DEBUG("%s %d", (*pieces)[i].first.c_str(), (*pieces)[i].second);
            key[i]   = (*pieces)[i].first.data();  // sorted piece.
            value[i] = (*pieces)[i].second;        // vocab_id
        }

        trie_ = std::unique_ptr<Darts::DoubleArray>(new Darts::DoubleArray());
        if (trie_->build(key.size(), const_cast<char**>(&key[0]), nullptr,
                         &value[0]) != 0) {
            status_ = BUILD_DOUBLE_ARRAY_FAILED;
            return;
        }

        // Computes the maximum number of shared prefixes in the trie.
        const int kMaxTrieResultsSize = 1024;
        std::vector<Darts::DoubleArray::result_pair_type> results(
            kMaxTrieResultsSize);
        trie_results_size_ = 0;
        for (const auto& p : *pieces) {
            const size_t num_nodes = trie_->commonPrefixSearch(
                p.first.data(), results.data(), results.size(), p.first.size());
            trie_results_size_ = std::max(trie_results_size_, static_cast<int>(num_nodes));
        }

        if (trie_results_size_ == 0)
            status_ = NO_ENTRY_FOUND;
    }

    // Non-virtual (inlined) implementation for faster execution.
    // Returns the unigram log-probability score of vocab id `id`.
    inline float GetScoreInlined(int id) const {
        return piece_score_pairs[id].second;
    }

    inline bool IsUnusedInlined(int id) const {
        return false;  // TODO
    }

    inline bool IsUserDefinedInlined(int id) const {
        return false;  // TODO
    }

    // Length in bytes of the UTF-8 character starting at `src`
    // (lookup table keyed by the high nibble of the lead byte).
    inline size_t OneCharLen(const char* src) const {
        return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4];
    }

    // The optimized Viterbi encode.
    // Main differences from the original function:
    // 1. Memorizes the best path at each postion so far,
    // 2. No need to store the Lattice nodes,
    // 3. Works in utf-8 directly,
    // 4. Defines a new struct with fewer fields than Lattice,
    // 5. Does not depend on `class Lattice` nor call `SetSentence()`,
    // `PopulateNodes()`, or `Viterbi()`. It does everything in one function.
    // For detailed explanations please see the comments inside the function body.
    EncodeResult EncodeOptimized(const std::string& normalized) const {
        // An optimized Viterbi algorithm for unigram language models. Benchmarking
        // results show that it generates almost identical outputs and achieves 2.1x
        // speedup on average for 102 languages compared to the original
        // implementation. It's based on the following three ideas:
        //
        // 1. Because it uses the *unigram* model:
        //   best_score(x1, x2, ... xt) = best_score(x1, x2, ... x{t-1}) + score(xt)
        // Deciding the best path (and score) can be decoupled into two isolated
        // terms: (a) the best path ended before the last token `best_score(x1, x2, ...)`
        // x{t-1})`, and (b) the last token and its `score(xt)`. The two terms are
        // not related to each other at all.
        //
        // Therefore, we can compute once and store the *best_path ending at
        // each character position*. In this way, when we know best_path_ends_at[M],
        // we can reuse it to compute all the best_path_ends_at_[...] where the last
        // token starts at the same character position M.
        //
        // This improves the time complexity from O(n*k*k) to O(n*k) because it
        // eliminates the extra loop of recomputing the best path ending at the same
        // position, where n is the input length and k is the maximum number of tokens
        // that can be recognized starting at each position.
        //
        // 2. Again, because it uses the *unigram* model, we don't need to actually
        // store the lattice nodes. We still recognize all the tokens and lattice
        // nodes from the input, but along identifying them, we use and discard them
        // on the fly. There is no need to actually store them for best path Viterbi
        // decoding. The only thing we need to store is the best_path ending at
        // each character position.
        //
        // This improvement reduces the things needed to store in memory from O(n*k)
        // to O(n), where n is the input length and k is the maximum number of tokens
        // that can be recognized starting at each position.
        //
        // It also avoids the need of dynamic-size lattice node pool, because the
        // number of things to store is fixed as n.
        //
        // 3. SentencePiece is designed to work with unicode, taking utf-8 encoding
        // inputs. In the original implementation, the lattice positions are based on
        // unicode positions. A mapping from unicode position to the utf-8 position is
        // maintained to recover the utf-8 string piece.
        //
        // We found that it is sufficient and beneficial to directly work with utf-8
        // positions:
        //
        // Firstly, it saves the conversion and mapping between unicode positions and
        // utf-8 positions.
        //
        // Secondly, it reduces the number of fields we need to maintain in the
        // node/path structure. Specifically, there are 8 fields defined in
        // `Lattice::Node` used by the original encoder, but here in the optimized
        // encoder we only need to define 3 fields in `BestPathNode`.
        if (status() != OK || normalized.empty()) {
            return {};
        }
        // Represents the last node of the best path.
        struct BestPathNode {
            int id = -1;  // The vocab id. (maybe -1 for UNK)
            float best_path_score =
                0;  // The total score of the best path ending at this node.
            int starts_at =
                -1;  // The starting position (in utf-8) of this node. The entire best
                     // path can be constructed by backtracking along this link.
        };
        const int size        = static_cast<int>(normalized.size());
        const float unk_score = min_score() - kUnkPenalty;

        // The ends are exclusive.
        std::vector<BestPathNode> best_path_ends_at(size + 1);

        // Generate lattice on-the-fly (not stored) and update best_path_ends_at.
        int starts_at = 0;
        while (starts_at < size) {
            std::size_t node_pos = 0;
            std::size_t key_pos  = starts_at;
            const auto best_path_score_till_here =
                best_path_ends_at[starts_at].best_path_score;
            bool has_single_node = false;
            const int mblen =
                std::min<int>(static_cast<int>(OneCharLen(normalized.data() + starts_at)),
                              size - starts_at);
            while (key_pos < size) {
                const int ret =
                    trie_->traverse(normalized.data(), node_pos, key_pos, key_pos + 1);
                if (ret == -2)
                    break;
                if (ret >= 0) {
                    if (IsUnusedInlined(ret))
                        continue;
                    // Update the best path node.
                    auto& target_node = best_path_ends_at[key_pos];
                    const auto length = (key_pos - starts_at);
                    // User defined symbol receives extra bonus to always be selected.
                    const auto score = IsUserDefinedInlined(ret)
                                           ? (length * max_score_ - 0.1)
                                           : GetScoreInlined(ret);

                    const auto candidate_best_path_score =
                        score + best_path_score_till_here;
                    if (target_node.starts_at == -1 ||
                        candidate_best_path_score > target_node.best_path_score) {
                        target_node.best_path_score = static_cast<float>(candidate_best_path_score);
                        target_node.starts_at       = starts_at;
                        target_node.id              = ret;
                    }
                    if (!has_single_node && length == mblen) {
                        has_single_node = true;
                    }
                }
            }
            if (!has_single_node) {
                // No piece covers this character: fall back to an UNK node
                // spanning exactly one UTF-8 character.
                auto& target_node = best_path_ends_at[starts_at + mblen];
                const auto candidate_best_path_score =
                    unk_score + best_path_score_till_here;
                if (target_node.starts_at == -1 ||
                    candidate_best_path_score > target_node.best_path_score) {
                    target_node.best_path_score = candidate_best_path_score;
                    target_node.starts_at       = starts_at;
                    target_node.id              = unk_id_;
                }
            }
            // Move by one unicode character.
            starts_at += mblen;
        }

        // Backtrack to identify the best path.
        EncodeResult results;
        int ends_at = size;
        while (ends_at > 0) {
            const auto& node = best_path_ends_at[ends_at];
            results.emplace_back(
                normalized.substr(node.starts_at, ends_at - node.starts_at), node.id);
            ends_at = node.starts_at;
        }
        std::reverse(results.begin(), results.end());
        return results;
    }

public:
    // Loads either the UMT5 or the classic T5 embedded tokenizer JSON,
    // records the min/max piece scores, and builds the trie.
    explicit T5UniGramTokenizer(bool is_umt5 = false) {
        if (is_umt5) {
            InitializePieces(load_umt5_tokenizer_json());
        } else {
            InitializePieces(load_t5_tokenizer_json());
        }

        min_score_ = FLT_MAX;
        max_score_ = FLT_MIN;
        std::vector<std::pair<std::string, int>> pieces;
        for (int i = 0; i < piece_score_pairs.size(); i++) {
            const auto& sp = piece_score_pairs[i];
            min_score_     = std::min(min_score_, sp.second);
            max_score_     = std::max(max_score_, sp.second);
            pieces.emplace_back(sp.first, i);
        }

        BuildTrie(&pieces);
    }
    ~T5UniGramTokenizer(){};

    std::string Normalize(const std::string& input) const {
        // Ref: https://github.com/huggingface/tokenizers/blob/1ff56c0c70b045f0cd82da1af9ac08cd4c7a6f9f/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py#L29
        // TODO: nmt-nfkc
        std::string normalized = std::regex_replace(input, std::regex(" {2,}"), " ");
        return normalized;
    }

    // Normalizes + metaspace-pre-tokenizes `input`, runs the Viterbi encode,
    // and returns the vocab ids, appending EOS unless it is already last.
    std::vector<int> Encode(const std::string& input, bool append_eos_if_not_present = true) const {
        std::string normalized = Normalize(input);
        normalized             = pre_tokenizer.tokenize(normalized);
        EncodeResult result    = EncodeOptimized(normalized);
        if (result.size() > 0 && append_eos_if_not_present) {
            auto item = result[result.size() - 1];
            if (item.first != eos_token_) {
                result.emplace_back(eos_token_, eos_id_);
            }
        }
        std::vector<int> tokens;
        for (auto item : result) {
            tokens.push_back(item.second);
        }
        return tokens;
    }

    // Re-chunks tokens/weights into `n` windows of `max_length`, terminating
    // each window with EOS, then pads the final window with pad_id_.
    // Attention-mask convention (if requested): 0.0 for kept tokens/EOS,
    // -HUGE_VALF for the trailing padding — the caller remaps these values.
    void pad_tokens(std::vector<int>& tokens,
                    std::vector<float>& weights,
                    std::vector<float>* attention_mask,
                    size_t max_length = 0,
                    bool padding      = false) {
        if (max_length > 0 && padding) {
            // Reserve one slot per window for the EOS that Encode() appended.
            size_t orig_token_num = tokens.size() - 1;
            size_t n              = static_cast<size_t>(std::ceil(orig_token_num * 1.0 / (max_length - 1)));
            if (n == 0) {
                n = 1;
            }
            size_t length = max_length * n;
            LOG_DEBUG("token length: %llu", length);
            std::vector<int> new_tokens;
            std::vector<float> new_weights;
            std::vector<float> new_attention_mask;
            int token_idx = 0;
            for (int i = 0; i < length; i++) {
                if (token_idx >= orig_token_num) {
                    break;
                }
                if (attention_mask != nullptr) {
                    new_attention_mask.push_back(0.0);
                }
                if (i % max_length == max_length - 1) {
                    // End of a window: emit EOS without consuming a token.
                    new_tokens.push_back(eos_id_);
                    new_weights.push_back(1.0);
                } else {
                    new_tokens.push_back(tokens[token_idx]);
                    new_weights.push_back(weights[token_idx]);
                    token_idx++;
                }
            }
            // Final EOS for the last (possibly partial) window.
            new_tokens.push_back(eos_id_);
            new_weights.push_back(1.0);
            if (attention_mask != nullptr) {
                new_attention_mask.push_back(0.0);
            }
            tokens  = new_tokens;
            weights = new_weights;
            if (attention_mask != nullptr) {
                *attention_mask = new_attention_mask;
            }
            if (padding) {
                int pad_token_id = pad_id_;
                tokens.insert(tokens.end(), length - tokens.size(), pad_token_id);
                weights.insert(weights.end(), length - weights.size(), 1.0);
                if (attention_mask != nullptr) {
                    // maybe keep some padding tokens unmasked?
                    attention_mask->insert(attention_mask->end(), length - attention_mask->size(), -HUGE_VALF);
                }
            }
        }
    }

    // Returns the minimum score in sentence pieces.
    // min_score() - 10 is used for the cost of unknown sentence.
    float min_score() const { return min_score_; }

    // Returns the maximum score in sentence pieces.
    // max_score() is used for the cost of user defined symbols.
    float max_score() const { return max_score_; }

    Status status() const { return status_; }
};
#include "tokenizers/t5_unigram_tokenizer.h"
class T5LayerNorm : public UnaryBlock {
protected:
@ -937,18 +494,17 @@ struct T5Embedder {
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
std::vector<int> curr_tokens = tokenizer.Encode(curr_text, false);
std::vector<int> curr_tokens = tokenizer.encode(curr_text);
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
weights.insert(weights.end(), curr_tokens.size(), curr_weight);
}
int EOS_TOKEN_ID = 1;
tokens.push_back(EOS_TOKEN_ID);
weights.push_back(1.0);
std::vector<float> attention_mask;
tokenizer.pad_tokens(tokens, weights, &attention_mask, max_length, padding);
tokenizer.pad_tokens(tokens, &weights, &attention_mask, padding ? max_length : 0, padding ? max_length : 100000000, padding);
for (auto& mask_value : attention_mask) {
mask_value = mask_value > 0.0f ? 0.0f : -HUGE_VALF;
}
// for (int i = 0; i < tokens.size(); i++) {
// std::cout << tokens[i] << ":" << weights[i] << ", ";

View File

@ -0,0 +1,189 @@
#include "bpe_tokenizer.h"
#include <algorithm>
#include <sstream>
#include "tokenize_util.h"
#include "util.h"
// Reproduces the GPT-2 byte<->unicode table: printable latin-1 ranges map to
// their own code point, and every remaining byte value is remapped to
// 256 + n so all 256 bytes have a visible, unambiguous unicode symbol.
std::vector<std::pair<int, std::u32string>> BPETokenizer::bytes_to_unicode() {
    std::vector<std::pair<int, std::u32string>> mapping;
    std::set<int> direct_bytes;

    auto map_range_directly = [&](int lo, int hi) {
        for (int b = lo; b <= hi; ++b) {
            direct_bytes.insert(b);
            mapping.emplace_back(b, unicode_value_to_utf32(b));
        }
    };
    map_range_directly(static_cast<int>('!'), static_cast<int>('~'));
    map_range_directly(161, 172);
    map_range_directly(174, 255);

    // Remaining bytes (control chars, etc.) are shifted past the latin-1 range.
    int shifted = 0;
    for (int b = 0; b < 256; ++b) {
        if (direct_bytes.find(b) == direct_bytes.end()) {
            mapping.emplace_back(b, unicode_value_to_utf32(shifted + 256));
            ++shifted;
        }
    }
    return mapping;
}
// Default word pre-splitting for BPE: delegates to the shared regex-based
// ::token_split() helper from tokenize_util. Virtual so subclasses can
// substitute a model-specific split pattern.
std::vector<std::string> BPETokenizer::token_split(const std::string& text) const {
    return ::token_split(text);
}
// Splits `text` (decoded to UTF-32) on `delimiter` and returns the pieces.
//
// Fix: the previous version silently dropped the fragment after the last
// delimiter, so a merges file without a trailing newline lost its final merge
// rule. The trailing fragment is now included when non-empty; inputs that end
// with the delimiter (the embedded merges data) produce the same result as
// before.
std::vector<std::u32string> BPETokenizer::split_utf32(const std::string& text, char32_t delimiter) {
    std::vector<std::u32string> result;
    size_t start = 0;
    size_t pos   = 0;

    std::u32string utf32_text = utf8_to_utf32(text);

    while ((pos = utf32_text.find(delimiter, start)) != std::u32string::npos) {
        result.push_back(utf32_text.substr(start, pos - start));
        start = pos + 1;
    }
    if (start < utf32_text.size()) {
        result.push_back(utf32_text.substr(start));  // keep the trailing fragment
    }

    return result;
}
// Collects the set of adjacent (left, right) symbol pairs in `subwords`;
// these are the merge candidates for one BPE step. Empty or single-element
// input yields an empty set.
static std::set<std::pair<std::u32string, std::u32string>> get_pairs(const std::vector<std::u32string>& subwords) {
    std::set<std::pair<std::u32string, std::u32string>> pairs;
    for (size_t idx = 1; idx < subwords.size(); idx++) {
        pairs.emplace(subwords[idx - 1], subwords[idx]);
    }
    return pairs;
}
// Runs byte-pair-encoding merges over a single pre-tokenized word.
// `token` is the word expressed in the byte-encoder alphabet; the returned
// pieces are vocabulary symbols. `end_of_word_suffix` (e.g. CLIP's "</w>")
// is appended to the last character before merging.
std::vector<std::u32string> BPETokenizer::bpe(const std::u32string& token) const {
    // Fix: an empty token used to evaluate token.substr(token.size() - 1),
    // where size() - 1 underflows and substr throws std::out_of_range.
    // An empty word simply produces no pieces.
    if (token.empty()) {
        return {};
    }

    std::vector<std::u32string> word;
    for (int i = 0; i < static_cast<int>(token.size()) - 1; i++) {
        word.emplace_back(1, token[i]);
    }
    word.push_back(token.substr(token.size() - 1) + utf8_to_utf32(end_of_word_suffix));

    std::set<std::pair<std::u32string, std::u32string>> pairs = get_pairs(word);

    if (pairs.empty()) {
        return {token + utf8_to_utf32(end_of_word_suffix)};
    }

    while (true) {
        // Pick the candidate pair with the best (lowest) merge rank; pairs
        // without a rank compare as worst.
        auto min_pair_iter = std::min_element(pairs.begin(),
                                              pairs.end(),
                                              [&](const std::pair<std::u32string, std::u32string>& a,
                                                  const std::pair<std::u32string, std::u32string>& b) {
                                                  if (bpe_ranks.find(a) == bpe_ranks.end()) {
                                                      return false;
                                                  } else if (bpe_ranks.find(b) == bpe_ranks.end()) {
                                                      return true;
                                                  }
                                                  return bpe_ranks.at(a) < bpe_ranks.at(b);
                                              });

        const std::pair<std::u32string, std::u32string>& bigram = *min_pair_iter;

        if (bpe_ranks.find(bigram) == bpe_ranks.end()) {
            break;  // nothing mergeable left
        }

        std::u32string first  = bigram.first;
        std::u32string second = bigram.second;
        std::vector<std::u32string> new_word;
        int32_t i = 0;

        // Rebuild the word, fusing every occurrence of (first, second).
        while (i < static_cast<int32_t>(word.size())) {
            auto it = std::find(word.begin() + i, word.end(), first);
            if (it == word.end()) {
                new_word.insert(new_word.end(), word.begin() + i, word.end());
                break;
            }
            new_word.insert(new_word.end(), word.begin() + i, it);
            i = static_cast<int32_t>(std::distance(word.begin(), it));

            if (word[i] == first && i < static_cast<int32_t>(word.size()) - 1 && word[i + 1] == second) {
                new_word.push_back(first + second);
                i += 2;
            } else {
                new_word.push_back(word[i]);
                i += 1;
            }
        }

        word = new_word;

        if (word.size() == 1) {
            break;
        }
        pairs = get_pairs(word);
    }

    return word;
}
std::vector<int> BPETokenizer::encode(const std::string& text, on_new_token_cb_t on_new_token_cb) {
std::string normalized_text = normalize(text);
std::vector<int32_t> bpe_tokens;
std::vector<std::string> token_strs;
auto splited_texts = split_with_special_tokens(normalized_text, special_tokens);
for (auto& splited_text : splited_texts) {
if (is_special_token(splited_text)) {
if (on_new_token_cb != nullptr) {
bool skip = on_new_token_cb(splited_text, bpe_tokens);
if (skip) {
token_strs.push_back(splited_text);
continue;
}
}
bpe_tokens.push_back(encoder[utf8_to_utf32(splited_text)]);
token_strs.push_back(splited_text);
continue;
}
auto tokens = token_split(splited_text);
for (auto& token : tokens) {
if (on_new_token_cb != nullptr) {
bool skip = on_new_token_cb(token, bpe_tokens);
if (skip) {
token_strs.push_back(splited_text);
continue;
}
}
std::string token_str = token;
std::u32string utf32_token;
for (int i = 0; i < static_cast<int>(token_str.length()); i++) {
unsigned char b = token_str[i];
utf32_token += byte_encoder[b];
}
auto bpe_strs = bpe(utf32_token);
for (auto bpe_str : bpe_strs) {
bpe_tokens.push_back(encoder[bpe_str]);
token_strs.push_back(utf32_to_utf8(bpe_str));
}
}
}
std::stringstream ss;
ss << "[";
for (auto token : token_strs) {
ss << "\"" << token << "\", ";
}
ss << "]";
LOG_DEBUG("split prompt \"%s\" to tokens %s", text.c_str(), ss.str().c_str());
return bpe_tokens;
}
// Maps a vocab id back to its UTF-8 token string.
// Throws std::out_of_range (via map::at) for ids not in the vocabulary.
std::string BPETokenizer::decode_token(int token_id) const {
    const std::u32string& piece = decoder.at(token_id);
    return utf32_to_utf8(piece);
}

View File

@ -0,0 +1,40 @@
#ifndef __SD_TOKENIZERS_BPE_TOKENIZER_H__
#define __SD_TOKENIZERS_BPE_TOKENIZER_H__
#include <cstddef>
#include <cstdint>
#include <functional>
#include <map>
#include <regex>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "tokenizer.h"
// Byte-level byte-pair-encoding tokenizer base class (GPT-2 family).
// Concrete tokenizers (CLIP, Qwen2, Mistral, ...) populate the vocabulary
// and merge ranks; this class provides the shared merge/encode machinery.
class BPETokenizer : public Tokenizer {
protected:
    std::map<int, std::u32string> byte_encoder;  // raw byte -> unicode symbol (bytes_to_unicode)
    std::map<std::u32string, int> byte_decoder;  // inverse of byte_encoder
    std::map<std::u32string, int> encoder;       // token string -> vocab id
    std::map<int, std::u32string> decoder;       // vocab id -> token string
    std::map<std::pair<std::u32string, std::u32string>, int> bpe_ranks;  // merge pair -> rank; lower merges first
    int encoder_len = 0;  // vocabulary size
    int bpe_len     = 0;  // number of merge rules

protected:
    // GPT-2 byte <-> unicode symbol table.
    static std::vector<std::pair<int, std::u32string>> bytes_to_unicode();
    // Splits UTF-8 text on `delimiter` after decoding to UTF-32.
    static std::vector<std::u32string> split_utf32(const std::string& text, char32_t delimiter = U'\n');
    // Pre-splits text into words; virtual so subclasses can override the pattern.
    virtual std::vector<std::string> token_split(const std::string& text) const;
    // Applies BPE merges to a single byte-encoded word.
    std::vector<std::u32string> bpe(const std::u32string& token) const;
    std::string decode_token(int token_id) const override;

public:
    BPETokenizer() = default;
    virtual ~BPETokenizer() = default;

    std::vector<int> encode(const std::string& text, on_new_token_cb_t on_new_token_cb = nullptr) override;
};
#endif // __SD_TOKENIZERS_BPE_TOKENIZER_H__

View File

@ -0,0 +1,116 @@
#include "clip_tokenizer.h"
#include <algorithm>
#include <cctype>
#include <cmath>
#include <regex>
#include <set>
#include "ggml.h"
#include "tokenize_util.h"
#include "util.h"
#include "vocab/vocab.h"
// CLIP tokenizer (OpenAI CLIP BPE). `pad_token_id` is configurable because
// SD1.x pads with EOS (49407) while other models use different ids.
// Uses the embedded CLIP merges when `merges_utf8_str` is empty.
CLIPTokenizer::CLIPTokenizer(int pad_token_id, const std::string& merges_utf8_str) {
    UNK_TOKEN    = "<|endoftext|>";
    BOS_TOKEN    = "<|startoftext|>";
    EOS_TOKEN    = "<|endoftext|>";
    PAD_TOKEN    = "<|endoftext|>";
    UNK_TOKEN_ID = 49407;
    BOS_TOKEN_ID = 49406;
    EOS_TOKEN_ID = 49407;
    PAD_TOKEN_ID = pad_token_id;

    // CLIP marks word boundaries with a "</w>" suffix on the final symbol.
    end_of_word_suffix = "</w>";

    add_bos_token = true;
    add_eos_token = true;

    // Build the vocabulary before registering the special tokens: their ids
    // are the last two entries of the vocab (49406/49407).
    if (merges_utf8_str.size() > 0) {
        load_from_merges(merges_utf8_str);
    } else {
        load_from_merges(load_clip_merges());
    }

    add_special_token("<|startoftext|>");
    add_special_token("<|endoftext|>");
}
// Build the encoder/decoder tables and BPE merge ranks from a CLIP
// merges.txt payload. Vocab layout matches OpenAI CLIP: 256 byte tokens,
// 256 byte+"</w>" tokens, one token per merge, then the two specials.
void CLIPTokenizer::load_from_merges(const std::string& merges_utf8_str) {
    auto byte_unicode_pairs = bytes_to_unicode();
    byte_encoder = std::map<int, std::u32string>(byte_unicode_pairs.begin(), byte_unicode_pairs.end());
    for (auto& pair : byte_unicode_pairs) {
        byte_decoder[pair.second] = pair.first;
    }
    std::vector<std::u32string> merges = split_utf32(merges_utf8_str);
    // CLIP's merges.txt always has 48895 lines: 1 version header + 48894 merges.
    GGML_ASSERT(merges.size() == 48895);
    merges = std::vector<std::u32string>(merges.begin() + 1, merges.end());  // drop header line
    std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
    merge_pairs.reserve(merges.size());
    for (const auto& merge : merges) {
        size_t space_pos = merge.find(' ');
        merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1));
    }
    std::vector<std::u32string> vocab;
    vocab.reserve(2 * byte_unicode_pairs.size() + merge_pairs.size() + 2);
    for (const auto& pair : byte_unicode_pairs) {
        vocab.push_back(pair.second);
    }
    for (const auto& pair : byte_unicode_pairs) {
        vocab.push_back(pair.second + utf8_to_utf32("</w>"));
    }
    for (const auto& merge : merge_pairs) {
        vocab.push_back(merge.first + merge.second);
    }
    vocab.push_back(utf8_to_utf32("<|startoftext|>"));
    vocab.push_back(utf8_to_utf32("<|endoftext|>"));
    // Cast: %llu requires unsigned long long; size_t may differ, which is
    // undefined behavior through varargs on some platforms.
    LOG_DEBUG("vocab size: %llu", (unsigned long long)vocab.size());
    int i = 0;
    for (const auto& token : vocab) {
        encoder[token] = i;
        decoder[i] = token;
        i++;
    }
    encoder_len = i;
    int rank = 0;
    for (const auto& merge : merge_pairs) {
        bpe_ranks[merge] = rank++;
    }
    bpe_len = rank;
}
// Remove leading and trailing ASCII whitespace; all-whitespace input
// yields an empty string.
static std::string strip(const std::string& str) {
    static const char kWhitespace[] = " \t\n\r\v\f";
    const auto first = str.find_first_not_of(kWhitespace);
    if (first == std::string::npos) {
        return "";
    }
    const auto last = str.find_last_not_of(kWhitespace);
    return str.substr(first, last - first + 1);
}
// Collapse every whitespace run to a single space, then trim both ends.
static std::string whitespace_clean(const std::string& text) {
    static const std::regex ws_run(R"(\s+)");
    return strip(std::regex_replace(text, ws_run, " "));
}
// CLIP text normalization: squeeze/trim whitespace, then ASCII-lowercase.
std::string CLIPTokenizer::normalize(const std::string& text) const {
    std::string out = whitespace_clean(text);
    for (char& c : out) {
        // Cast to unsigned char first: std::tolower on negative char is UB.
        c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    }
    return out;
}
// CLIP pre-tokenization: split text into contractions ('s, 't, 're, ...),
// alphabetic runs, single digits, and runs of other non-space symbols.
std::vector<std::string> CLIPTokenizer::token_split(const std::string& text) const {
    // Compile the pattern once; std::regex construction is expensive and
    // this method runs for every encode() call.
    static const std::regex clip_pat(
        R"('s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
        std::regex::icase);
    std::vector<std::string> result;
    std::sregex_iterator iter(text.begin(), text.end(), clip_pat);
    std::sregex_iterator end;
    for (; iter != end; ++iter) {
        result.emplace_back(iter->str());
    }
    return result;
}

View File

@ -0,0 +1,20 @@
#ifndef __SD_TOKENIZERS_CLIP_TOKENIZER_H__
#define __SD_TOKENIZERS_CLIP_TOKENIZER_H__
#include <cstddef>
#include <string>
#include <vector>
#include "bpe_tokenizer.h"
// BPE tokenizer matching OpenAI CLIP: lowercasing normalizer, CLIP regex
// split pattern, vocab built from the bundled (or supplied) merges list.
class CLIPTokenizer : public BPETokenizer {
protected:
    // Populate encoder/decoder/bpe_ranks from a merges.txt payload.
    void load_from_merges(const std::string& merges_utf8_str);
    // Whitespace-clean + ASCII lowercase.
    std::string normalize(const std::string& text) const override;
    // CLIP's regex-based pre-tokenization.
    std::vector<std::string> token_split(const std::string& text) const override;

public:
    // pad_token_id defaults to the EOS id; pass a custom merges payload to
    // override the bundled CLIP merges.
    explicit CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "");
};
#endif // __SD_TOKENIZERS_CLIP_TOKENIZER_H__

View File

@ -0,0 +1,89 @@
#include "mistral_tokenizer.h"
#include "ggml.h"
#include "json.hpp"
#include "util.h"
#include "vocab/vocab.h"
// Build token tables from a vocab.json payload (token -> id) plus a merges
// list. Aborts on malformed vocab JSON (GGML_ABORT).
void MistralTokenizer::load_from_merges(const std::string& merges_utf8_str, const std::string& vocab_utf8_str) {
    nlohmann::json vocab;
    try {
        vocab = nlohmann::json::parse(vocab_utf8_str);
    } catch (const nlohmann::json::parse_error&) {
        GGML_ABORT("invalid vocab json str");
    }
    for (const auto& [key, value] : vocab.items()) {
        std::u32string token = utf8_to_utf32(key);
        int i = value;
        encoder[token] = i;
        decoder[i] = token;
    }
    encoder_len = static_cast<int>(vocab.size());
    LOG_DEBUG("vocab size: %d", encoder_len);
    auto byte_unicode_pairs = bytes_to_unicode();
    byte_encoder = std::map<int, std::u32string>(byte_unicode_pairs.begin(), byte_unicode_pairs.end());
    for (auto& pair : byte_unicode_pairs) {
        byte_decoder[pair.second] = pair.first;
    }
    std::vector<std::u32string> merges = split_utf32(merges_utf8_str);
    // Cast: %llu requires unsigned long long; size_t may differ, which is
    // undefined behavior through varargs on some platforms.
    LOG_DEBUG("merges size %llu", (unsigned long long)merges.size());
    std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
    merge_pairs.reserve(merges.size());
    for (const auto& merge : merges) {
        size_t space_pos = merge.find(' ');
        merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1));
    }
    int rank = 0;
    for (const auto& merge : merge_pairs) {
        bpe_ranks[merge] = rank++;
    }
    bpe_len = rank;
}
// Mistral tokenizer. BOS is prepended on encode; the special-token list
// covers the instruct/tool/image control markers plus the reserved
// <SPECIAL_20>..<SPECIAL_999> range. Custom merges/vocab payloads may be
// supplied; otherwise the bundled ones are loaded.
MistralTokenizer::MistralTokenizer(const std::string& merges_utf8_str, const std::string& vocab_utf8_str) {
    add_bos_token = true;
    UNK_TOKEN = "<unk>";
    BOS_TOKEN = "<s>";
    EOS_TOKEN = "</s>";
    PAD_TOKEN = "<pad>";
    UNK_TOKEN_ID = 0;
    BOS_TOKEN_ID = 1;
    EOS_TOKEN_ID = 2;
    PAD_TOKEN_ID = 11;
    special_tokens = {
        "<unk>",
        "<s>",
        "</s>",
        "[INST]",
        "[/INST]",
        "[AVAILABLE_TOOLS]",
        "[/AVAILABLE_TOOLS]",
        "[TOOL_RESULTS]",
        "[/TOOL_RESULTS]",
        "[TOOL_CALLS]",
        "[IMG]",
        "<pad>",
        "[IMG_BREAK]",
        "[IMG_END]",
        "[PREFIX]",
        "[MIDDLE]",
        "[SUFFIX]",
        "[SYSTEM_PROMPT]",
        "[/SYSTEM_PROMPT]",
        "[TOOL_CONTENT]",
    };
    // Reserved placeholder specials <SPECIAL_20>..<SPECIAL_999>.
    for (int i = 20; i < 1000; i++) {
        special_tokens.push_back("<SPECIAL_" + std::to_string(i) + ">");
    }
    if (merges_utf8_str.size() > 0 && vocab_utf8_str.size() > 0) {
        load_from_merges(merges_utf8_str, vocab_utf8_str);
    } else {
        load_from_merges(load_mistral_merges(), load_mistral_vocab_json());  // bundled defaults
    }
}

View File

@ -0,0 +1,16 @@
#ifndef __SD_TOKENIZERS_MISTRAL_TOKENIZER_H__
#define __SD_TOKENIZERS_MISTRAL_TOKENIZER_H__
#include <string>
#include "bpe_tokenizer.h"
// Mistral BPE tokenizer: vocab comes from a vocab.json payload and merges
// from a merges list (bundled versions are used by default).
class MistralTokenizer : public BPETokenizer {
protected:
    // Populate encoder/decoder from vocab JSON and bpe_ranks from merges.
    void load_from_merges(const std::string& merges_utf8_str, const std::string& vocab_utf8_str);

public:
    explicit MistralTokenizer(const std::string& merges_utf8_str = "", const std::string& vocab_utf8_str = "");
};
#endif // __SD_TOKENIZERS_MISTRAL_TOKENIZER_H__

View File

@ -0,0 +1,91 @@
#include "qwen2_tokenizer.h"
#include "util.h"
#include "vocab/vocab.h"
void Qwen2Tokenizer::load_from_merges(const std::string& merges_utf8_str) {
auto byte_unicode_pairs = bytes_to_unicode();
byte_encoder = std::map<int, std::u32string>(byte_unicode_pairs.begin(), byte_unicode_pairs.end());
for (auto& pair : byte_unicode_pairs) {
byte_decoder[pair.second] = pair.first;
}
std::vector<std::u32string> merges = split_utf32(merges_utf8_str);
LOG_DEBUG("merges size %llu", merges.size());
std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
for (const auto& merge : merges) {
size_t space_pos = merge.find(' ');
merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1));
}
std::vector<std::u32string> tokens;
for (const auto& pair : byte_unicode_pairs) {
tokens.push_back(pair.second);
}
for (const auto& merge : merge_pairs) {
tokens.push_back(merge.first + merge.second);
}
for (auto& special_token : special_tokens) {
tokens.push_back(utf8_to_utf32(special_token));
}
int i = 0;
for (const auto& token : tokens) {
encoder[token] = i;
decoder[i] = token;
i++;
}
encoder_len = i;
LOG_DEBUG("vocab size: %d", encoder_len);
int rank = 0;
for (const auto& merge : merge_pairs) {
bpe_ranks[merge] = rank++;
}
bpe_len = rank;
}
// Qwen2 tokenizer: byte-level BPE with the Qwen chat/vision/tool special
// tokens. No BOS is added; UNK/EOS/PAD all map to <|endoftext|> (151643).
Qwen2Tokenizer::Qwen2Tokenizer(const std::string& merges_utf8_str) {
    UNK_TOKEN = "<|endoftext|>";
    EOS_TOKEN = "<|endoftext|>";
    PAD_TOKEN = "<|endoftext|>";
    UNK_TOKEN_ID = 151643;
    EOS_TOKEN_ID = 151643;
    PAD_TOKEN_ID = 151643;
    special_tokens = {
        "<|endoftext|>",
        "<|im_start|>",
        "<|im_end|>",
        "<|object_ref_start|>",
        "<|object_ref_end|>",
        "<|box_start|>",
        "<|box_end|>",
        "<|quad_start|>",
        "<|quad_end|>",
        "<|vision_start|>",
        "<|vision_end|>",
        "<|vision_pad|>",
        "<|image_pad|>",
        "<|video_pad|>",
        "<tool_call>",
        "</tool_call>",
        "<|fim_prefix|>",
        "<|fim_middle|>",
        "<|fim_suffix|>",
        "<|fim_pad|>",
        "<|repo_name|>",
        "<|file_sep|>",
        "<tool_response>",
        "</tool_response>",
        "<think>",
        "</think>",
    };
    if (merges_utf8_str.size() > 0) {
        load_from_merges(merges_utf8_str);
    } else {
        load_from_merges(load_qwen2_merges());  // bundled default merges
    }
}

View File

@ -0,0 +1,16 @@
#ifndef __SD_TOKENIZERS_QWEN2_TOKENIZER_H__
#define __SD_TOKENIZERS_QWEN2_TOKENIZER_H__
#include <string>
#include "bpe_tokenizer.h"
// Qwen2 BPE tokenizer; vocab is derived from the merges list plus the
// fixed special-token set (bundled merges used by default).
class Qwen2Tokenizer : public BPETokenizer {
protected:
    // Populate encoder/decoder/bpe_ranks from a merges payload.
    void load_from_merges(const std::string& merges_utf8_str);

public:
    explicit Qwen2Tokenizer(const std::string& merges_utf8_str = "");
};
#endif // __SD_TOKENIZERS_QWEN2_TOKENIZER_H__

View File

@ -0,0 +1,339 @@
#include "t5_unigram_tokenizer.h"
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <regex>
#include <sstream>
#include "json.hpp"
#include "tokenize_util.h"
#include "util.h"
#include "vocab/vocab.h"
// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h
// and https://github.com/google/sentencepiece/blob/master/src/unigram_model.h.
// Original License: https://github.com/google/sentencepiece/blob/master/LICENSE
//
// Since tokenization is not the bottleneck in SD, performance was not a major consideration
// during the migration.
// `replacement` is the string substituted for spaces; when
// `add_prefix_space` is set, one replacement is prepended to the input.
MetaspacePreTokenizer::MetaspacePreTokenizer(const std::string replacement, bool add_prefix_space)
    : replacement(replacement), add_prefix_space(add_prefix_space) {}
// Replace each interior space in `input` with the replacement string,
// optionally prefixing one replacement to the whole text.
// NOTE(review): std::getline drops a trailing delimiter, so a trailing
// space in `input` produces no trailing replacement — confirm this matches
// the reference Metaspace pre-tokenizer if exact parity matters.
std::string MetaspacePreTokenizer::tokenize(const std::string& input) const {
    std::string tokens;
    std::stringstream ss(input);
    if (add_prefix_space) {
        tokens += replacement;
    }
    std::string token;
    bool first_token = true;
    while (std::getline(ss, token, ' ')) {
        if (!first_token) {
            tokens += replacement + token;  // interior space -> replacement
        } else {
            tokens += token;  // first segment: prefix (if any) already emitted
        }
        first_token = false;
    }
    return tokens;
}
// Parse a HuggingFace tokenizer.json payload: unigram vocab ([piece, score]
// pairs), optional unk_id, and the Metaspace pre-tokenizer settings.
// Sets status_ to INVLIAD_JSON instead of throwing on malformed input.
void T5UniGramTokenizer::InitializePieces(const std::string& json_str) {
    nlohmann::json data;
    try {
        data = nlohmann::json::parse(json_str);
    } catch (const nlohmann::json::parse_error&) {
        status_ = INVLIAD_JSON;
        return;
    }
    if (!data.contains("model")) {
        status_ = INVLIAD_JSON;
        return;
    }
    nlohmann::json model = data["model"];
    if (!model.contains("vocab")) {
        status_ = INVLIAD_JSON;
        return;
    }
    if (model.contains("unk_id")) {
        UNK_TOKEN_ID = model["unk_id"];
    }
    // Guard the pre_tokenizer keys: indexing a missing key and assigning
    // the resulting null to std::string would throw an uncaught json
    // exception instead of reporting a status like the checks above.
    if (!data.contains("pre_tokenizer") ||
        !data["pre_tokenizer"].contains("replacement") ||
        !data["pre_tokenizer"].contains("add_prefix_space")) {
        status_ = INVLIAD_JSON;
        return;
    }
    replacement = data["pre_tokenizer"]["replacement"];
    add_prefix_space = data["pre_tokenizer"]["add_prefix_space"];
    pre_tokenizer = MetaspacePreTokenizer(replacement, add_prefix_space);
    for (const auto& item : model["vocab"]) {
        // Accept any numeric score (integer-valued scores serialize without
        // a decimal point); is_number_float() would reject them.
        if (item.size() != 2 || !item[0].is_string() || !item[1].is_number()) {
            status_ = INVLIAD_JSON;
            return;
        }
        std::string piece = item[0];
        if (piece.empty()) {
            piece = "<empty_token>";  // keep ids aligned with vocab indices
        }
        float score = item[1];
        piece_score_pairs.emplace_back(piece, score);
    }
}
// Build the Darts double-array trie over the piece strings. `pieces` maps
// piece text -> id; also records the maximum number of common-prefix
// matches any piece can produce (trie_results_size_). Sets status_ and
// returns early on failure.
void T5UniGramTokenizer::BuildTrie(std::vector<std::pair<std::string, int>>* pieces) {
    if (status_ != OK) {
        return;  // keep the first recorded error
    }
    if (pieces->empty()) {
        status_ = NO_PIECES_LOADED;
        return;
    }
    // Darts requires its keys in sorted order.
    std::sort(pieces->begin(), pieces->end());
    std::vector<const char*> key(pieces->size());
    std::vector<int> value(pieces->size());
    for (size_t i = 0; i < pieces->size(); ++i) {
        key[i] = (*pieces)[i].first.data();
        value[i] = (*pieces)[i].second;
    }
    trie_ = std::unique_ptr<Darts::DoubleArray>(new Darts::DoubleArray());
    if (trie_->build(key.size(), const_cast<char**>(&key[0]), nullptr, &value[0]) != 0) {
        status_ = BUILD_DOUBLE_ARRAY_FAILED;
        return;
    }
    // Probe every piece to find the widest common-prefix result set.
    const int kMaxTrieResultsSize = 1024;
    std::vector<Darts::DoubleArray::result_pair_type> results(kMaxTrieResultsSize);
    trie_results_size_ = 0;
    for (const auto& p : *pieces) {
        const size_t num_nodes = trie_->commonPrefixSearch(
            p.first.data(), results.data(), results.size(), p.first.size());
        trie_results_size_ = std::max(trie_results_size_, static_cast<int>(num_nodes));
    }
    if (trie_results_size_ == 0) {
        status_ = NO_ENTRY_FOUND;
    }
}
// Score of piece `id`. No bounds check: callers pass ids produced by the
// trie, which are indices into piece_score_pairs.
float T5UniGramTokenizer::GetScoreInlined(int id) const {
    return piece_score_pairs[id].second;
}

// Unused/user-defined piece flags are not tracked in this port; both
// predicates are constant false (kept so EncodeOptimized mirrors the
// sentencepiece original).
bool T5UniGramTokenizer::IsUnusedInlined(int id) const {
    (void)id;
    return false;
}

bool T5UniGramTokenizer::IsUserDefinedInlined(int id) const {
    (void)id;
    return false;
}
// Number of bytes (1-4) in the UTF-8 sequence that starts at *src, decided
// by the lead byte: <0xC0 -> 1 (ASCII or continuation), 0xC0-0xDF -> 2,
// 0xE0-0xEF -> 3, 0xF0+ -> 4. Equivalent to the previous nibble table.
size_t T5UniGramTokenizer::OneCharLen(const char* src) const {
    const unsigned char lead = static_cast<unsigned char>(*src);
    if (lead < 0xC0) {
        return 1;
    }
    if (lead < 0xE0) {
        return 2;
    }
    if (lead < 0xF0) {
        return 3;
    }
    return 4;
}
// Viterbi-style best-path segmentation over the piece trie (port of
// sentencepiece's unigram Model::EncodeOptimized). For every byte offset it
// records the best-scoring path ending there; characters with no matching
// piece fall back to UNK_TOKEN_ID with a penalized score. Returns
// (piece text, id) pairs in input order.
EncodeResult T5UniGramTokenizer::EncodeOptimized(const std::string& normalized) const {
    if (status() != OK || normalized.empty()) {
        return {};
    }
    struct BestPathNode {
        int id = -1;                // id of the piece ending at this offset
        float best_path_score = 0;  // total score of the best path ending here
        int starts_at = -1;         // begin offset of that piece (-1 = unset)
    };
    const int size = static_cast<int>(normalized.size());
    const float unk_score = min_score() - kUnkPenalty;  // worse than any real piece
    std::vector<BestPathNode> best_path_ends_at(size + 1);
    int starts_at = 0;
    while (starts_at < size) {
        std::size_t node_pos = 0;
        std::size_t key_pos = starts_at;
        const auto best_path_score_till_here = best_path_ends_at[starts_at].best_path_score;
        bool has_single_node = false;
        // Byte length of the UTF-8 character at starts_at (clamped to the end).
        const int mblen = std::min<int>(static_cast<int>(OneCharLen(normalized.data() + starts_at)), size - starts_at);
        // Walk the trie over every piece that begins at starts_at;
        // traverse() advances key_pos one byte per step.
        while (key_pos < static_cast<size_t>(size)) {
            const int ret = trie_->traverse(normalized.data(), node_pos, key_pos, key_pos + 1);
            if (ret == -2) {
                break;  // no piece continues with this byte
            }
            if (ret >= 0) {  // a piece (id == ret) ends exactly at key_pos
                if (IsUnusedInlined(ret)) {
                    continue;
                }
                auto& target_node = best_path_ends_at[key_pos];
                const auto length = static_cast<int>(key_pos - starts_at);
                // User-defined pieces get a length-scaled score (unused here
                // since IsUserDefinedInlined is always false).
                const auto score = IsUserDefinedInlined(ret) ? (length * max_score_ - 0.1f) : GetScoreInlined(ret);
                const auto candidate_best_path_score = score + best_path_score_till_here;
                if (target_node.starts_at == -1 || candidate_best_path_score > target_node.best_path_score) {
                    target_node.best_path_score = static_cast<float>(candidate_best_path_score);
                    target_node.starts_at = starts_at;
                    target_node.id = ret;
                }
                if (!has_single_node && length == mblen) {
                    has_single_node = true;  // a piece spanning exactly one character exists
                }
            }
        }
        if (!has_single_node) {
            // No single-character piece: bridge this character as UNK.
            auto& target_node = best_path_ends_at[starts_at + mblen];
            const auto candidate_best_path_score = unk_score + best_path_score_till_here;
            if (target_node.starts_at == -1 || candidate_best_path_score > target_node.best_path_score) {
                target_node.best_path_score = candidate_best_path_score;
                target_node.starts_at = starts_at;
                target_node.id = UNK_TOKEN_ID;
            }
        }
        starts_at += mblen;
    }
    // Backtrack from the end to recover the chosen segmentation.
    EncodeResult results;
    int ends_at = size;
    while (ends_at > 0) {
        const auto& node = best_path_ends_at[ends_at];
        results.emplace_back(normalized.substr(node.starts_at, ends_at - node.starts_at), node.id);
        ends_at = node.starts_at;
    }
    std::reverse(results.begin(), results.end());
    return results;
}
// Build a T5 (is_umt5 == false) or UMT5 unigram tokenizer from the bundled
// tokenizer.json payload, then construct the piece trie. On malformed data
// status() reports the failure instead of throwing.
T5UniGramTokenizer::T5UniGramTokenizer(bool is_umt5) {
    add_bos_token = false;
    add_eos_token = true;
    if (is_umt5) {
        PAD_TOKEN_ID = 0;
        EOS_TOKEN_ID = 1;
        BOS_TOKEN_ID = 2;
        UNK_TOKEN_ID = 3;
        PAD_TOKEN = "<pad>";
        EOS_TOKEN = "</s>";
        BOS_TOKEN = "<s>";
        UNK_TOKEN = "<unk>";
    } else {
        PAD_TOKEN_ID = 0;
        EOS_TOKEN_ID = 1;
        UNK_TOKEN_ID = 2;
        PAD_TOKEN = "<pad>";
        EOS_TOKEN = "</s>";
        UNK_TOKEN = "<unk>";
    }
    special_tokens = {
        "<pad>",
        "</s>",
        "<unk>",
    };
    if (is_umt5) {
        special_tokens.push_back("<s>");
    }
    if (is_umt5) {
        InitializePieces(load_umt5_tokenizer_json());
    } else {
        InitializePieces(load_t5_tokenizer_json());
    }
    // Track the score range across the vocab.
    // Bug fix: the running maximum must start at the lowest representable
    // float. FLT_MIN is the smallest *positive* normal value, so with the
    // negative unigram log-prob scores max_score_ was never updated;
    // -FLT_MAX is the correct identity for a maximum.
    min_score_ = FLT_MAX;
    max_score_ = -FLT_MAX;
    std::vector<std::pair<std::string, int>> pieces;
    pieces.reserve(piece_score_pairs.size());
    for (int i = 0; i < static_cast<int>(piece_score_pairs.size()); i++) {
        const auto& sp = piece_score_pairs[i];
        min_score_ = std::min(min_score_, sp.second);
        max_score_ = std::max(max_score_, sp.second);
        pieces.emplace_back(sp.first, i);
    }
    BuildTrie(&pieces);
}
// Defaulted out of line, next to the other member definitions.
T5UniGramTokenizer::~T5UniGramTokenizer() = default;
// Map a token id back to its piece text; out-of-range ids and the
// <empty_token> sentinel both decode to an empty string.
std::string T5UniGramTokenizer::decode_token(int token_id) const {
    const bool in_range = token_id >= 0 && token_id < static_cast<int>(piece_score_pairs.size());
    if (!in_range) {
        return "";
    }
    const std::string& piece = piece_score_pairs[token_id].first;
    return piece == "<empty_token>" ? "" : piece;
}
// Collapse every run of two or more spaces into a single space (tabs and
// newlines are left untouched, matching the previous regex " {2,}").
// Ref: https://github.com/huggingface/tokenizers/blob/1ff56c0c70b045f0cd82da1af9ac08cd4c7a6f9f/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py#L29
// TODO: nmt-nfkc
std::string T5UniGramTokenizer::normalize(const std::string& input) const {
    std::string normalized;
    normalized.reserve(input.size());
    size_t pos = 0;
    while (pos < input.size()) {
        if (input[pos] == ' ') {
            normalized += ' ';  // one space per run, however long
            while (pos < input.size() && input[pos] == ' ') {
                ++pos;
            }
        } else {
            normalized += input[pos++];
        }
    }
    return normalized;
}
// Tokenize `input`: normalize, split out special tokens, metaspace
// pre-tokenize each plain segment, then run unigram segmentation.
// on_new_token_cb may claim a special token (by returning true) so custom
// tokens can be handled externally.
std::vector<int> T5UniGramTokenizer::encode(const std::string& input, on_new_token_cb_t on_new_token_cb) {
    std::vector<int32_t> tokens;
    std::vector<std::string> token_strs;  // kept only for the debug log below
    std::string normalized = normalize(input);
    auto splited_texts = split_with_special_tokens(normalized, special_tokens);
    if (splited_texts.empty()) {
        splited_texts.push_back(normalized);  // for empty string
    }
    for (auto& splited_text : splited_texts) {
        if (is_special_token(splited_text)) {
            if (on_new_token_cb != nullptr) {
                bool skip = on_new_token_cb(splited_text, tokens);
                if (skip) {
                    token_strs.push_back(splited_text);
                    continue;  // callback consumed this token
                }
            }
            // Only UNK/EOS/PAD map to ids here; any other registered
            // special falls through and emits nothing.
            if (splited_text == UNK_TOKEN) {
                tokens.push_back(UNK_TOKEN_ID);
                token_strs.push_back(UNK_TOKEN);
            } else if (splited_text == EOS_TOKEN) {
                tokens.push_back(EOS_TOKEN_ID);
                token_strs.push_back(EOS_TOKEN);
            } else if (splited_text == PAD_TOKEN) {
                tokens.push_back(PAD_TOKEN_ID);
                token_strs.push_back(PAD_TOKEN);
            }
            continue;
        }
        std::string pretokenized = pre_tokenizer.tokenize(splited_text);
        EncodeResult result = EncodeOptimized(pretokenized);
        for (const auto& item : result) {
            tokens.push_back(item.second);
            token_strs.push_back(item.first);
        }
    }
    std::stringstream ss;
    ss << "[";
    for (const auto& token_str : token_strs) {
        ss << "\"" << token_str << "\", ";
    }
    ss << "]";
    LOG_DEBUG("split prompt \"%s\" to tokens %s", input.c_str(), ss.str().c_str());
    return tokens;
}

View File

@ -0,0 +1,70 @@
#ifndef __SD_TOKENIZERS_T5_UNIGRAM_TOKENIZER_H__
#define __SD_TOKENIZERS_T5_UNIGRAM_TOKENIZER_H__
#include <cstddef>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "darts.h"
#include "tokenizer.h"
// Replaces spaces with a metaspace replacement string before unigram
// segmentation (mirrors HuggingFace's Metaspace pre-tokenizer).
class MetaspacePreTokenizer {
private:
    std::string replacement;  // string substituted for each space
    bool add_prefix_space;    // prepend one replacement to the whole input

public:
    // NOTE(review): `replacement` is taken by const value, not const&; the
    // signature is kept as-is because the .cpp definition must match it.
    MetaspacePreTokenizer(const std::string replacement = " ", bool add_prefix_space = true);
    std::string tokenize(const std::string& input) const;
};
using EncodeResult = std::vector<std::pair<std::string, int>>;
// Unigram (sentencepiece-style) tokenizer for T5 / UMT5, backed by a Darts
// double-array trie. Port of google/sentencepiece's unigram model.
class T5UniGramTokenizer : public Tokenizer {
public:
    // Load/build outcome; inspect status() after construction.
    enum Status {
        OK,
        NO_PIECES_LOADED,
        NO_ENTRY_FOUND,
        BUILD_DOUBLE_ARRAY_FAILED,
        PIECE_ALREADY_DEFINED,
        INVLIAD_JSON  // (sic) misspelling kept — renaming would break API users
    };

protected:
    MetaspacePreTokenizer pre_tokenizer;
    std::vector<std::pair<std::string, float>> piece_score_pairs;  // index = id -> (piece, score)
    float min_score_ = 0.0f;  // lowest piece score (feeds the UNK penalty)
    float max_score_ = 0.0f;  // highest piece score
    std::unique_ptr<Darts::DoubleArray> trie_;  // piece text -> id
    int trie_results_size_ = 0;  // widest common-prefix result set of any piece
    Status status_ = OK;
    float kUnkPenalty = 10.0f;  // subtracted from min_score() for UNK pieces
    std::string replacement;    // metaspace replacement from tokenizer.json
    bool add_prefix_space = true;

    // Parse tokenizer.json into piece_score_pairs and pre-tokenizer config.
    void InitializePieces(const std::string& json_str);
    // Build trie_ from (piece, id) pairs; records failures in status_.
    void BuildTrie(std::vector<std::pair<std::string, int>>* pieces);
    float GetScoreInlined(int id) const;
    bool IsUnusedInlined(int id) const;
    bool IsUserDefinedInlined(int id) const;
    // Byte length (1-4) of the UTF-8 character starting at src.
    size_t OneCharLen(const char* src) const;
    // Viterbi best-path segmentation of an already-normalized string.
    EncodeResult EncodeOptimized(const std::string& normalized) const;
    float min_score() const { return min_score_; }
    float max_score() const { return max_score_; }
    Status status() const { return status_; }
    std::string decode_token(int token_id) const override;
    std::string normalize(const std::string& input) const override;

public:
    explicit T5UniGramTokenizer(bool is_umt5 = false);
    ~T5UniGramTokenizer();
    std::vector<int> encode(const std::string& input, on_new_token_cb_t on_new_token_cb = nullptr) override;
};
#endif // __SD_TOKENIZERS_T5_UNIGRAM_TOKENIZER_H__

View File

@ -1,5 +1,5 @@
#ifndef __TOKENIZE_UTIL__
#define __TOKENIZE_UTIL__
#ifndef __SD_TOKENIZERS_BPE_TOKENIZE_UTIL_H__
#define __SD_TOKENIZERS_BPE_TOKENIZE_UTIL_H__
#include <string>
#include <vector>
@ -7,4 +7,4 @@
std::vector<std::string> token_split(const std::string& text);
std::vector<std::string> split_with_special_tokens(const std::string& text, const std::vector<std::string>& special_tokens);
#endif // __TOKENIZE_UTIL__
#endif // __SD_TOKENIZERS_BPE_TOKENIZE_UTIL_H__

View File

@ -0,0 +1,222 @@
#include "tokenizer.h"
#include <algorithm>
#include <cmath>
#include <regex>
#include "util.h"
// Register `token` so encode() treats it as an atomic special token.
// No de-duplication is performed.
void Tokenizer::add_special_token(const std::string& token) {
    special_tokens.push_back(token);
}
bool Tokenizer::is_special_token(const std::string& token) const {
for (const auto& special_token : special_tokens) {
if (special_token == token) {
return true;
}
}
return false;
}
// Default normalization is the identity; subclasses override it
// (e.g. CLIP lowercases and squeezes whitespace).
std::string Tokenizer::normalize(const std::string& text) const {
    return text;
}
// Encode `text` to token ids and optionally pad the result.
// on_new_token_cb: forwarded to encode(); lets callers intercept special
//                  or custom tokens.
// padding: when true, pad_tokens() is applied with the given limits
//          (no weights/mask are produced here).
std::vector<int> Tokenizer::tokenize(const std::string& text,
                                     on_new_token_cb_t on_new_token_cb,
                                     bool padding,
                                     size_t min_length,
                                     size_t max_length,
                                     bool allow_overflow_expand) {
    std::vector<int> tokens = encode(text, on_new_token_cb);
    if (padding) {
        pad_tokens(tokens, nullptr, nullptr, min_length, max_length, allow_overflow_expand);
    }
    return tokens;
}
// Pad, truncate, or chunk `tokens` in place, inserting BOS/EOS (if the
// tokenizer uses them) and PAD tokens.
// weights: optional per-token weights kept aligned with tokens; specials
//          and padding receive weight 1.0. Must match tokens' size on entry.
// mask:    optional attention mask — 1.0 for real/special tokens, 0.0 for pad.
// min_length: target length of each produced sequence.
// max_length: hard cap (0 disables the cap).
// allow_overflow_expand: when the capped input overflows, split into
//          multiple min_length chunks instead of truncating.
void Tokenizer::pad_tokens(std::vector<int>& tokens,
                           std::vector<float>* weights,
                           std::vector<float>* mask,
                           size_t min_length,
                           size_t max_length,
                           bool allow_overflow_expand) {
    const bool use_weights = weights != nullptr;
    const bool use_mask = mask != nullptr;
    if (use_weights && tokens.size() != weights->size()) {
        LOG_ERROR("tokens size != weights size");
        return;
    }
    const size_t bos_count = add_bos_token ? 1 : 0;
    const size_t eos_count = add_eos_token ? 1 : 0;
    const size_t special_token_count = bos_count + eos_count;
    // Emit [BOS?] tokens[begin, begin+count) [EOS?], then pad with
    // PAD_TOKEN_ID (left or right per pad_left) up to target_length.
    auto build_sequence = [&](size_t begin,
                              size_t count,
                              size_t target_length,
                              std::vector<int>& out_tokens,
                              std::vector<float>& out_weights,
                              std::vector<float>& out_mask) {
        const size_t base_length = count + special_token_count;
        const size_t final_length = std::max(target_length, base_length);
        out_tokens.clear();
        out_weights.clear();
        out_mask.clear();
        out_tokens.reserve(final_length);
        if (use_weights) {
            out_weights.reserve(final_length);
        }
        if (use_mask) {
            out_mask.reserve(final_length);
        }
        if (add_bos_token) {
            out_tokens.push_back(BOS_TOKEN_ID);
            if (use_weights) {
                out_weights.push_back(1.0f);
            }
            if (use_mask) {
                out_mask.push_back(1.0f);
            }
        }
        for (size_t i = 0; i < count; ++i) {
            out_tokens.push_back(tokens[begin + i]);
            if (use_weights) {
                out_weights.push_back((*weights)[begin + i]);
            }
            if (use_mask) {
                out_mask.push_back(1.0f);
            }
        }
        if (add_eos_token) {
            out_tokens.push_back(EOS_TOKEN_ID);
            if (use_weights) {
                out_weights.push_back(1.0f);
            }
            if (use_mask) {
                out_mask.push_back(1.0f);
            }
        }
        if (final_length > out_tokens.size()) {
            const size_t pad_count = final_length - out_tokens.size();
            if (pad_left) {
                // Left padding: mask is 0.0 for pads, weights stay 1.0.
                out_tokens.insert(out_tokens.begin(), pad_count, PAD_TOKEN_ID);
                if (use_weights) {
                    out_weights.insert(out_weights.begin(), pad_count, 1.0f);
                }
                if (use_mask) {
                    out_mask.insert(out_mask.begin(), pad_count, 0.0f);
                }
            } else {
                out_tokens.insert(out_tokens.end(), pad_count, PAD_TOKEN_ID);
                if (use_weights) {
                    out_weights.insert(out_weights.end(), pad_count, 1.0f);
                }
                if (use_mask) {
                    out_mask.insert(out_mask.end(), pad_count, 0.0f);
                }
            }
        }
    };
    // Length of the single padded sequence the input would produce.
    const size_t single_length = std::max(min_length, tokens.size() + special_token_count);
    const bool exceeds_max_length = max_length > 0 && single_length > max_length;
    std::vector<int> new_tokens;
    std::vector<float> new_weights;
    std::vector<float> new_mask;
    if (!exceeds_max_length) {
        // Fits: one sequence padded up to min_length.
        build_sequence(0, tokens.size(), min_length, new_tokens, new_weights, new_mask);
    } else if (!allow_overflow_expand) {
        // Too long and chunking disallowed: hard-truncate to max_length,
        // forcing the final position back to EOS when EOS is in use.
        build_sequence(0, tokens.size(), 0, new_tokens, new_weights, new_mask);
        new_tokens.resize(max_length);
        if (use_weights) {
            new_weights.resize(max_length);
        }
        if (use_mask) {
            new_mask.resize(max_length);
        }
        if (add_eos_token && !new_tokens.empty()) {
            new_tokens.back() = EOS_TOKEN_ID;
            if (use_weights) {
                new_weights.back() = 1.0f;
            }
            if (use_mask) {
                new_mask.back() = 1.0f;
            }
        }
    } else if (min_length > special_token_count) {
        // Chunk into consecutive min_length sequences, each with its own
        // BOS/EOS and padding.
        const size_t tokens_per_chunk = min_length - special_token_count;
        size_t offset = 0;
        while (offset < tokens.size()) {
            const size_t remaining = tokens.size() - offset;
            const size_t take = std::min(tokens_per_chunk, remaining);
            std::vector<int> chunk_tokens;
            std::vector<float> chunk_weights;
            std::vector<float> chunk_mask;
            build_sequence(offset, take, min_length, chunk_tokens, chunk_weights, chunk_mask);
            new_tokens.insert(new_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
            if (use_weights) {
                new_weights.insert(new_weights.end(), chunk_weights.begin(), chunk_weights.end());
            }
            if (use_mask) {
                new_mask.insert(new_mask.end(), chunk_mask.begin(), chunk_mask.end());
            }
            offset += take;
        }
    } else {
        // Degenerate min_length: fall back to one padded sequence.
        build_sequence(0, tokens.size(), min_length, new_tokens, new_weights, new_mask);
    }
    tokens = std::move(new_tokens);
    if (use_weights) {
        *weights = std::move(new_weights);
    }
    if (use_mask) {
        *mask = std::move(new_mask);
    }
}
// Post-decode cleanup: remove the space left before commas (" ," -> ",").
// Takes the string by const reference — the previous non-const reference
// parameter was never written through. The regex is compiled once.
static std::string clean_up_tokenization(const std::string& text) {
    static const std::regex pattern(R"( ,)");
    return std::regex_replace(text, pattern, ",");
}
// Convert token ids back to text. BOS/EOS/PAD ids are dropped; for BPE
// vocabularies the end_of_word_suffix (e.g. "</w>") is replaced with a
// space. The result is comma-cleaned and trimmed.
std::string Tokenizer::decode(const std::vector<int>& tokens) const {
    std::string text;
    for (int token_id : tokens) {
        if (token_id == BOS_TOKEN_ID || token_id == EOS_TOKEN_ID || token_id == PAD_TOKEN_ID) {
            continue;  // structural tokens never contribute text
        }
        std::string piece = decode_token(token_id);
        if (!end_of_word_suffix.empty() && ends_with(piece, end_of_word_suffix)) {
            piece.erase(piece.size() - end_of_word_suffix.size());
            text += piece + " ";  // suffix marks a word boundary
        } else {
            text += piece;
        }
    }
    text = clean_up_tokenization(text);
    return trim(text);
}

View File

@ -0,0 +1,53 @@
#ifndef __SD_TOKENIZERS_TOKENIZER_H__
#define __SD_TOKENIZERS_TOKENIZER_H__
#include <cstddef>
#include <cstdint>
#include <functional>
#include <string>
#include <vector>
using on_new_token_cb_t = std::function<bool(std::string&, std::vector<int32_t>&)>;
// Abstract base for all tokenizers. Subclasses implement encode() and
// decode_token(); this class provides special-token bookkeeping,
// padding/chunking, and decoding glue.
class Tokenizer {
protected:
    std::vector<std::string> special_tokens;  // tokens encode() treats atomically
    bool add_bos_token = false;               // prepend BOS_TOKEN_ID when padding
    bool add_eos_token = false;               // append EOS_TOKEN_ID when padding
    bool pad_left = false;                    // pad at the front instead of the back
    std::string end_of_word_suffix;           // e.g. "</w>" for CLIP-style BPE; "" if unused

    // Map one token id back to its piece text.
    virtual std::string decode_token(int token_id) const = 0;
    // Text normalization applied before encoding (identity by default).
    virtual std::string normalize(const std::string& text) const;

public:
    std::string UNK_TOKEN;
    std::string BOS_TOKEN;
    std::string EOS_TOKEN;
    std::string PAD_TOKEN;
    int UNK_TOKEN_ID = 0;
    int BOS_TOKEN_ID = 0;
    int EOS_TOKEN_ID = 0;
    int PAD_TOKEN_ID = 0;

    virtual ~Tokenizer() = default;
    // Register a token encode() must treat atomically (no de-dup).
    void add_special_token(const std::string& token);
    bool is_special_token(const std::string& token) const;
    // Convert text to token ids; the callback may claim custom tokens.
    virtual std::vector<int> encode(const std::string& text, on_new_token_cb_t on_new_token_cb = nullptr) = 0;
    // encode() plus optional padding via pad_tokens().
    std::vector<int> tokenize(const std::string& text,
                              on_new_token_cb_t on_new_token_cb = nullptr,
                              bool padding = false,
                              size_t min_length = 0,
                              size_t max_length = 100000000,
                              bool allow_overflow_expand = false);
    // Pad/truncate/chunk `tokens` (and optional parallel weights/mask) in place.
    void pad_tokens(std::vector<int>& tokens,
                    std::vector<float>* weights,
                    std::vector<float>* mask,
                    size_t min_length = 0,
                    size_t max_length = 100000000,
                    bool allow_overflow_expand = false);
    // Ids back to text, dropping BOS/EOS/PAD and resolving end_of_word_suffix.
    std::string decode(const std::vector<int>& tokens) const;
};
#endif // __SD_TOKENIZERS_TOKENIZER_H__

View File

@ -1,5 +1,5 @@
#ifndef __VOCAB_H__
#define __VOCAB_H__
#ifndef __SD_TOKENIZERS_VOCAB_VOCAB_H__
#define __SD_TOKENIZERS_VOCAB_VOCAB_H__
#include <string>
@ -10,4 +10,4 @@ std::string load_mistral_vocab_json();
std::string load_t5_tokenizer_json();
std::string load_umt5_tokenizer_json();
#endif // __VOCAB_H__
#endif // __SD_TOKENIZERS_VOCAB_VOCAB_H__