refactor: reorganize the vocab file structure (#1271)

This commit is contained in:
leejet 2026-02-11 00:44:17 +08:00 committed by GitHub
parent adea272225
commit 636d3cb6ff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 67 additions and 55 deletions

View File

@ -90,6 +90,8 @@ file(GLOB SD_LIB_SOURCES
"src/*.h" "src/*.h"
"src/*.cpp" "src/*.cpp"
"src/*.hpp" "src/*.hpp"
"src/vocab/*.h"
"src/vocab/*.cpp"
) )
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH) find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)

View File

@ -1,4 +1,4 @@
for f in *.cpp *.h *.hpp examples/cli/*.cpp examples/common/*.hpp examples/cli/*.h examples/server/*.cpp; do for f in src/*.cpp src/*.h src/*.hpp src/vocab/*.h src/vocab/*.cpp examples/cli/*.cpp examples/common/*.hpp examples/cli/*.h examples/server/*.cpp; do
[[ "$f" == vocab* ]] && continue [[ "$f" == vocab* ]] && continue
echo "formatting '$f'" echo "formatting '$f'"
# if [ "$f" != "stable-diffusion.h" ]; then # if [ "$f" != "stable-diffusion.h" ]; then

View File

@ -4,6 +4,7 @@
#include "ggml_extend.hpp" #include "ggml_extend.hpp"
#include "model.h" #include "model.h"
#include "tokenize_util.h" #include "tokenize_util.h"
#include "vocab/vocab.h"
/*================================================== CLIPTokenizer ===================================================*/ /*================================================== CLIPTokenizer ===================================================*/
@ -110,7 +111,7 @@ public:
if (merges_utf8_str.size() > 0) { if (merges_utf8_str.size() > 0) {
load_from_merges(merges_utf8_str); load_from_merges(merges_utf8_str);
} else { } else {
load_from_merges(ModelLoader::load_merges()); load_from_merges(load_clip_merges());
} }
add_special_token("<|startoftext|>"); add_special_token("<|startoftext|>");
add_special_token("<|endoftext|>"); add_special_token("<|endoftext|>");

View File

@ -19,6 +19,7 @@
#include "json.hpp" #include "json.hpp"
#include "rope.hpp" #include "rope.hpp"
#include "tokenize_util.h" #include "tokenize_util.h"
#include "vocab/vocab.h"
namespace LLM { namespace LLM {
constexpr int LLM_GRAPH_SIZE = 10240; constexpr int LLM_GRAPH_SIZE = 10240;
@ -365,7 +366,7 @@ namespace LLM {
if (merges_utf8_str.size() > 0) { if (merges_utf8_str.size() > 0) {
load_from_merges(merges_utf8_str); load_from_merges(merges_utf8_str);
} else { } else {
load_from_merges(ModelLoader::load_qwen2_merges()); load_from_merges(load_qwen2_merges());
} }
} }
}; };
@ -466,7 +467,7 @@ namespace LLM {
if (merges_utf8_str.size() > 0 && vocab_utf8_str.size() > 0) { if (merges_utf8_str.size() > 0 && vocab_utf8_str.size() > 0) {
load_from_merges(merges_utf8_str, vocab_utf8_str); load_from_merges(merges_utf8_str, vocab_utf8_str);
} else { } else {
load_from_merges(ModelLoader::load_mistral_merges(), ModelLoader::load_mistral_vocab_json()); load_from_merges(load_mistral_merges(), load_mistral_vocab_json());
} }
} }
}; };

View File

@ -16,10 +16,6 @@
#include "model.h" #include "model.h"
#include "stable-diffusion.h" #include "stable-diffusion.h"
#include "util.h" #include "util.h"
#include "vocab.hpp"
#include "vocab_mistral.hpp"
#include "vocab_qwen.hpp"
#include "vocab_umt5.hpp"
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h" #include "ggml-backend.h"
@ -1340,36 +1336,6 @@ void ModelLoader::set_wtype_override(ggml_type wtype, std::string tensor_type_ru
} }
} }
std::string ModelLoader::load_merges() {
std::string merges_utf8_str(reinterpret_cast<const char*>(merges_utf8_c_str), sizeof(merges_utf8_c_str));
return merges_utf8_str;
}
std::string ModelLoader::load_qwen2_merges() {
std::string merges_utf8_str(reinterpret_cast<const char*>(qwen2_merges_utf8_c_str), sizeof(qwen2_merges_utf8_c_str));
return merges_utf8_str;
}
std::string ModelLoader::load_mistral_merges() {
std::string merges_utf8_str(reinterpret_cast<const char*>(mistral_merges_utf8_c_str), sizeof(mistral_merges_utf8_c_str));
return merges_utf8_str;
}
std::string ModelLoader::load_mistral_vocab_json() {
std::string json_str(reinterpret_cast<const char*>(mistral_vocab_json_utf8_c_str), sizeof(mistral_vocab_json_utf8_c_str));
return json_str;
}
std::string ModelLoader::load_t5_tokenizer_json() {
std::string json_str(reinterpret_cast<const char*>(t5_tokenizer_json_str), sizeof(t5_tokenizer_json_str));
return json_str;
}
std::string ModelLoader::load_umt5_tokenizer_json() {
std::string json_str(reinterpret_cast<const char*>(umt5_tokenizer_json_str), sizeof(umt5_tokenizer_json_str));
return json_str;
}
bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) { bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool enable_mmap) {
int64_t process_time_ms = 0; int64_t process_time_ms = 0;
std::atomic<int64_t> read_time_ms(0); std::atomic<int64_t> read_time_ms(0);

View File

@ -331,13 +331,6 @@ public:
bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type); bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT); int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
~ModelLoader() = default; ~ModelLoader() = default;
static std::string load_merges();
static std::string load_qwen2_merges();
static std::string load_mistral_merges();
static std::string load_mistral_vocab_json();
static std::string load_t5_tokenizer_json();
static std::string load_umt5_tokenizer_json();
}; };
#endif // __MODEL_H__ #endif // __MODEL_H__

View File

@ -2679,7 +2679,7 @@ public:
}; };
sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling); sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling);
} else { } else {
if(!first_stage_model->compute(n_threads, x, true, &result, work_ctx)){ if (!first_stage_model->compute(n_threads, x, true, &result, work_ctx)) {
LOG_ERROR("Failed to decode latetnts"); LOG_ERROR("Failed to decode latetnts");
first_stage_model->free_compute_buffer(); first_stage_model->free_compute_buffer();
return nullptr; return nullptr;
@ -2695,7 +2695,7 @@ public:
}; };
sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling); sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling);
} else { } else {
if(!tae_first_stage->compute(n_threads, x, true, &result)){ if (!tae_first_stage->compute(n_threads, x, true, &result)) {
LOG_ERROR("Failed to decode latetnts"); LOG_ERROR("Failed to decode latetnts");
tae_first_stage->free_compute_buffer(); tae_first_stage->free_compute_buffer();
return nullptr; return nullptr;

View File

@ -14,6 +14,7 @@
#include "ggml_extend.hpp" #include "ggml_extend.hpp"
#include "json.hpp" #include "json.hpp"
#include "model.h" #include "model.h"
#include "vocab/vocab.h"
// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h // Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h
// and https://github.com/google/sentencepiece/blob/master/src/unigram_model.h. // and https://github.com/google/sentencepiece/blob/master/src/unigram_model.h.
@ -341,9 +342,9 @@ protected:
public: public:
explicit T5UniGramTokenizer(bool is_umt5 = false) { explicit T5UniGramTokenizer(bool is_umt5 = false) {
if (is_umt5) { if (is_umt5) {
InitializePieces(ModelLoader::load_umt5_tokenizer_json()); InitializePieces(load_umt5_tokenizer_json());
} else { } else {
InitializePieces(ModelLoader::load_t5_tokenizer_json()); InitializePieces(load_t5_tokenizer_json());
} }
min_score_ = FLT_MAX; min_score_ = FLT_MAX;

View File

@ -1,4 +1,4 @@
static unsigned char merges_utf8_c_str[] = { static const unsigned char clip_merges_utf8_c_str[] = {
0x23, 0x23,
0x76, 0x76,
0x65, 0x65,
@ -524620,7 +524620,7 @@ static unsigned char merges_utf8_c_str[] = {
0x0a, 0x0a,
}; };
static unsigned char t5_tokenizer_json_str[] = { static const unsigned char t5_tokenizer_json_str[] = {
0x7b, 0x7b,
0x0a, 0x0a,
0x20, 0x20,

View File

@ -1,4 +1,4 @@
unsigned char mistral_merges_utf8_c_str[] = { static const unsigned char mistral_merges_utf8_c_str[] = {
0xc4, 0xa0, 0x20, 0xc4, 0xa0, 0x0a, 0xc4, 0xa0, 0x20, 0x74, 0x0a, 0x65, 0xc4, 0xa0, 0x20, 0xc4, 0xa0, 0x0a, 0xc4, 0xa0, 0x20, 0x74, 0x0a, 0x65,
0x20, 0x72, 0x0a, 0x69, 0x20, 0x6e, 0x0a, 0xc4, 0xa0, 0x20, 0xc4, 0xa0, 0x20, 0x72, 0x0a, 0x69, 0x20, 0x6e, 0x0a, 0xc4, 0xa0, 0x20, 0xc4, 0xa0,
0xc4, 0xa0, 0xc4, 0xa0, 0x0a, 0xc4, 0xa0, 0xc4, 0xa0, 0x20, 0xc4, 0xa0, 0xc4, 0xa0, 0xc4, 0xa0, 0x0a, 0xc4, 0xa0, 0xc4, 0xa0, 0x20, 0xc4, 0xa0,
@ -260614,7 +260614,7 @@ unsigned char mistral_merges_utf8_c_str[] = {
0xc3, 0xa5, 0xc4, 0xb2, 0xc4, 0xb0, 0x20, 0xc3, 0xa6, 0xc2, 0xb1, 0xc4, 0xc3, 0xa5, 0xc4, 0xb2, 0xc4, 0xb0, 0x20, 0xc3, 0xa6, 0xc2, 0xb1, 0xc4,
0xab, 0xc3, 0xa4, 0xc2, 0xb9, 0xc2, 0xa6, 0x0a, 0xab, 0xc3, 0xa4, 0xc2, 0xb9, 0xc2, 0xa6, 0x0a,
}; };
unsigned char mistral_vocab_json_utf8_c_str[] = { static const unsigned char mistral_vocab_json_utf8_c_str[] = {
0x7b, 0x22, 0x3c, 0x75, 0x6e, 0x6b, 0x3e, 0x22, 0x3a, 0x20, 0x30, 0x2c, 0x7b, 0x22, 0x3c, 0x75, 0x6e, 0x6b, 0x3e, 0x22, 0x3a, 0x20, 0x30, 0x2c,
0x20, 0x22, 0x3c, 0x73, 0x3e, 0x22, 0x3a, 0x20, 0x31, 0x2c, 0x20, 0x22, 0x20, 0x22, 0x3c, 0x73, 0x3e, 0x22, 0x3a, 0x20, 0x31, 0x2c, 0x20, 0x22,
0x3c, 0x2f, 0x73, 0x3e, 0x22, 0x3a, 0x20, 0x32, 0x2c, 0x20, 0x22, 0x5b, 0x3c, 0x2f, 0x73, 0x3e, 0x22, 0x3a, 0x20, 0x32, 0x2c, 0x20, 0x22, 0x5b,

View File

@ -1,4 +1,4 @@
unsigned char qwen2_merges_utf8_c_str[] = { static const unsigned char qwen2_merges_utf8_c_str[] = {
0xc4, 0xa0, 0x20, 0xc4, 0xa0, 0x0a, 0xc4, 0xa0, 0xc4, 0xa0, 0x20, 0xc4, 0xc4, 0xa0, 0x20, 0xc4, 0xa0, 0x0a, 0xc4, 0xa0, 0xc4, 0xa0, 0x20, 0xc4,
0xa0, 0xc4, 0xa0, 0x0a, 0x69, 0x20, 0x6e, 0x0a, 0xc4, 0xa0, 0x20, 0x74, 0xa0, 0xc4, 0xa0, 0x0a, 0x69, 0x20, 0x6e, 0x0a, 0xc4, 0xa0, 0x20, 0x74,
0x0a, 0xc4, 0xa0, 0xc4, 0xa0, 0xc4, 0xa0, 0xc4, 0xa0, 0x20, 0xc4, 0xa0, 0x0a, 0xc4, 0xa0, 0xc4, 0xa0, 0xc4, 0xa0, 0xc4, 0xa0, 0x20, 0xc4, 0xa0,

View File

@ -1,4 +1,4 @@
unsigned char umt5_tokenizer_json_str[] = { static const unsigned char umt5_tokenizer_json_str[] = {
0x7b, 0x22, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x22, 0x3a, 0x20, 0x7b, 0x22, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x22, 0x3a, 0x20,
0x22, 0x31, 0x2e, 0x30, 0x22, 0x2c, 0x20, 0x22, 0x74, 0x72, 0x75, 0x6e, 0x22, 0x31, 0x2e, 0x30, 0x22, 0x2c, 0x20, 0x22, 0x74, 0x72, 0x75, 0x6e,
0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3a, 0x20, 0x6e, 0x75, 0x6c,

35
src/vocab/vocab.cpp Normal file
View File

@ -0,0 +1,35 @@
#include "vocab.h"
#include "clip_t5.hpp"
#include "mistral.hpp"
#include "qwen.hpp"
#include "umt5.hpp"
std::string load_clip_merges() {
std::string merges_utf8_str(reinterpret_cast<const char*>(clip_merges_utf8_c_str), sizeof(clip_merges_utf8_c_str));
return merges_utf8_str;
}
std::string load_qwen2_merges() {
std::string merges_utf8_str(reinterpret_cast<const char*>(qwen2_merges_utf8_c_str), sizeof(qwen2_merges_utf8_c_str));
return merges_utf8_str;
}
std::string load_mistral_merges() {
std::string merges_utf8_str(reinterpret_cast<const char*>(mistral_merges_utf8_c_str), sizeof(mistral_merges_utf8_c_str));
return merges_utf8_str;
}
std::string load_mistral_vocab_json() {
std::string json_str(reinterpret_cast<const char*>(mistral_vocab_json_utf8_c_str), sizeof(mistral_vocab_json_utf8_c_str));
return json_str;
}
std::string load_t5_tokenizer_json() {
std::string json_str(reinterpret_cast<const char*>(t5_tokenizer_json_str), sizeof(t5_tokenizer_json_str));
return json_str;
}
std::string load_umt5_tokenizer_json() {
std::string json_str(reinterpret_cast<const char*>(umt5_tokenizer_json_str), sizeof(umt5_tokenizer_json_str));
return json_str;
}

13
src/vocab/vocab.h Normal file
View File

@ -0,0 +1,13 @@
#ifndef __VOCAB_H__
#define __VOCAB_H__
#include <string>
std::string load_clip_merges();
std::string load_qwen2_merges();
std::string load_mistral_merges();
std::string load_mistral_vocab_json();
std::string load_t5_tokenizer_json();
std::string load_umt5_tokenizer_json();
#endif // __VOCAB_H__