add umt5 support

commit bace0a08c4
parent 5f7d98884c
Author: leejet
Date:   2025-08-09 16:07:04 +08:00

12 changed files with 762430 additions and 81 deletions


@@ -1223,20 +1223,21 @@ struct FluxCLIPEmbedder : public Conditioner {
     }
 };
 
-struct PixArtCLIPEmbedder : public Conditioner {
+struct T5CLIPEmbedder : public Conditioner {
     T5UniGramTokenizer t5_tokenizer;
     std::shared_ptr<T5Runner> t5;
     size_t chunk_len = 512;
     bool use_mask    = false;
     int mask_pad     = 1;
 
-    PixArtCLIPEmbedder(ggml_backend_t backend,
-                       const String2GGMLType& tensor_types = {},
-                       int clip_skip                       = -1,
-                       bool use_mask                       = false,
-                       int mask_pad                        = 1)
-        : use_mask(use_mask), mask_pad(mask_pad) {
-        t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+    T5CLIPEmbedder(ggml_backend_t backend,
+                   const String2GGMLType& tensor_types = {},
+                   int clip_skip                       = -1,
+                   bool use_mask                       = false,
+                   int mask_pad                        = 1,
+                   bool is_umt5                        = false)
+        : use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) {
+        t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
     }
 
     void set_clip_skip(int clip_skip) {
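
For context, a minimal usage sketch of the renamed embedder with its new flag; the backend handle and tensor-type map below are placeholders rather than code from this commit:

    // Hedged sketch: constructing T5CLIPEmbedder for a UMT5-based checkpoint.
    ggml_backend_t backend = ggml_backend_cpu_init();
    String2GGMLType tensor_types;  // normally taken from ModelLoader::tensor_storages_types
    auto cond_stage_model = std::make_shared<T5CLIPEmbedder>(backend,
                                                             tensor_types,
                                                             -1,     // clip_skip
                                                             false,  // use_mask
                                                             1,      // mask_pad
                                                             true);  // is_umt5: select the UMT5 tokenizer and runner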


@@ -27,7 +27,7 @@
 #define SAFE_STR(s) ((s) ? (s) : "")
 #define BOOL_STR(b) ((b) ? "true" : "false")
 
-#include "wan.hpp"
+#include "t5.hpp"
 
 const char* modes_str[] = {
     "img_gen",

@@ -746,11 +746,11 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
 int main(int argc, const char* argv[]) {
     SDParams params;
 
-    params.verbose = true;
-    sd_set_log_callback(sd_log_cb, (void*)&params);
-    WAN::WanRunner::load_from_file_and_test(argv[1]);
-    return 0;
+    // params.verbose = true;
+    // sd_set_log_callback(sd_log_cb, (void*)&params);
+    // T5Embedder::load_from_file_and_test(argv[1]);
+    // return 0;
 
     parse_args(argc, argv, params);


@@ -896,7 +896,7 @@ namespace Flux {
         }
         for (auto pair : tensor_types) {
             std::string tensor_name = pair.first;
-            if (tensor_name.find("model.diffusion_model.") == std::string::npos)
+            if (!starts_with(tensor_name, prefix))
                 continue;
             if (tensor_name.find("guidance_in.in_layer.weight") != std::string::npos) {
                 // not schnell


@@ -1,2 +1,5 @@
-clang-format -style=file -i *.cpp *.h *.hpp
-clang-format -style=file -i examples/cli/*.cpp
+for f in *.cpp *.h *.hpp examples/cli/*.cpp; do
+    [[ "$f" == vocab* ]] && continue
+    echo "formatting '$f'"
+    clang-format -style=file -i "$f"
+done

ggml

@@ -1 +1 @@
-Subproject commit b96890f3ab5ffbdbe56bc126df5366c34bd08d39
+Subproject commit e89bc7e8625f59145ee8c0b09383009c47752cd8


@@ -10,6 +10,7 @@
 #include "stable-diffusion.h"
 #include "util.h"
 #include "vocab.hpp"
+#include "vocab_umt5.hpp"
 
 #include "ggml-alloc.h"
 #include "ggml-backend.h"

@@ -1157,6 +1158,10 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
             std::string dtype    = tensor_info["dtype"];
             nlohmann::json shape = tensor_info["shape"];
 
+            if (dtype == "U8") {
+                continue;
+            }
+
             size_t begin = tensor_info["data_offsets"][0].get<size_t>();
             size_t end   = tensor_info["data_offsets"][1].get<size_t>();

@@ -1856,6 +1861,11 @@ std::string ModelLoader::load_t5_tokenizer_json() {
     return json_str;
 }
 
+std::string ModelLoader::load_umt5_tokenizer_json() {
+    std::string json_str(reinterpret_cast<const char*>(umt5_tokenizer_json_str), sizeof(umt5_tokenizer_json_str));
+    return json_str;
+}
+
 std::vector<TensorStorage> remove_duplicates(const std::vector<TensorStorage>& vec) {
     std::vector<TensorStorage> res;
     std::unordered_map<std::string, size_t> name_to_index_map;


@@ -258,6 +258,7 @@ public:
     static std::string load_merges();
     static std::string load_t5_tokenizer_json();
+    static std::string load_umt5_tokenizer_json();
 };
 
 #endif  // __MODEL_H__


@@ -249,4 +249,4 @@ struct Rope {
     }
 };  // struct Rope
 
-#endif __ROPE_HPP__
+#endif  // __ROPE_HPP__


@@ -344,11 +344,11 @@ public:
             }
         }
         if (is_chroma) {
-            cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backend,
+            cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                 model_loader.tensor_storages_types,
                                                                 -1,
                                                                 sd_ctx_params->chroma_use_t5_mask,
                                                                 sd_ctx_params->chroma_t5_mask_pad);
         } else {
             cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
         }

t5.hpp

@@ -124,7 +124,10 @@ protected:
                 return;
             }
             std::string piece = item[0];
-            float score       = item[1];
+            if (piece.empty()) {
+                piece = "<empty_token>";
+            }
+            float score = item[1];
             piece_score_pairs.emplace_back(piece, score);
         }
     }
@@ -147,6 +150,7 @@ protected:
         std::vector<const char*> key(pieces->size());
         std::vector<int> value(pieces->size());
         for (size_t i = 0; i < pieces->size(); ++i) {
+            // LOG_DEBUG("%s %d", (*pieces)[i].first.c_str(), (*pieces)[i].second);
             key[i]   = (*pieces)[i].first.data();  // sorted piece.
             value[i] = (*pieces)[i].second;        // vocab_id
         }
@@ -335,9 +339,9 @@ protected:
     }
 
 public:
-    explicit T5UniGramTokenizer(const std::string& json_str = "") {
-        if (json_str.size() != 0) {
-            InitializePieces(json_str);
+    explicit T5UniGramTokenizer(bool is_umt5 = false) {
+        if (is_umt5) {
+            InitializePieces(ModelLoader::load_umt5_tokenizer_json());
         } else {
             InitializePieces(ModelLoader::load_t5_tokenizer_json());
         }
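
The tokenizer now selects its embedded vocabulary by flag instead of accepting a JSON string. A small sketch of the two variants (vocab sizes taken from the T5Params defaults and the UMT5 override later in this file):

    T5UniGramTokenizer t5_tok;          // default: embedded T5 tokenizer JSON (vocab_size 32128)
    T5UniGramTokenizer umt5_tok(true);  // is_umt5 = true: embedded UMT5 tokenizer JSON (vocab_size 256384)
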
@@ -673,10 +677,11 @@
             int64_t model_dim,
             int64_t inner_dim,
             int64_t ff_dim,
-            int64_t num_heads)
+            int64_t num_heads,
+            bool relative_attention = true)
         : num_layers(num_layers) {
         for (int i = 0; i < num_layers; i++) {
-            blocks["block." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new T5Block(model_dim, inner_dim, ff_dim, num_heads, i == 0));
+            blocks["block." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new T5Block(model_dim, inner_dim, ff_dim, num_heads, (!relative_attention || i == 0)));
         }
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
@@ -703,15 +708,30 @@
     }
 };
 
+struct T5Params {
+    int64_t num_layers      = 24;
+    int64_t model_dim       = 4096;
+    int64_t ff_dim          = 10240;
+    int64_t num_heads       = 64;
+    int64_t vocab_size      = 32128;
+    bool relative_attention = true;
+};
+
 struct T5 : public GGMLBlock {
+    T5Params params;
+
 public:
-    T5(int64_t num_layers,
-       int64_t model_dim,
-       int64_t ff_dim,
-       int64_t num_heads,
-       int64_t vocab_size) {
-        blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(num_layers, model_dim, model_dim, ff_dim, num_heads));
-        blocks["shared"]  = std::shared_ptr<GGMLBlock>(new Embedding(vocab_size, model_dim));
+    T5() {}
+    T5(T5Params params)
+        : params(params) {
+        blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(params.num_layers,
+                                                                   params.model_dim,
+                                                                   params.model_dim,
+                                                                   params.ff_dim,
+                                                                   params.num_heads,
+                                                                   params.relative_attention));
+        blocks["shared"]  = std::shared_ptr<GGMLBlock>(new Embedding(params.vocab_size,
+                                                                     params.model_dim));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx,
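
With the hyperparameters gathered into T5Params, the two configurations can be expressed as below; this is a sketch using the defaults above plus the UMT5 override that T5Runner applies in the next hunk:

    T5Params t5xxl_params;                    // 24 layers, model_dim 4096, ff_dim 10240, 64 heads, vocab 32128
    T5Params umt5_params;
    umt5_params.vocab_size         = 256384;  // UMT5 multilingual SentencePiece vocab
    umt5_params.relative_attention = false;   // routed through the T5Stack change above
    T5 umt5_model(umt5_params);
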
@@ -731,18 +751,20 @@
 };
 
 struct T5Runner : public GGMLRunner {
+    T5Params params;
     T5 model;
     std::vector<int> relative_position_bucket_vec;
 
     T5Runner(ggml_backend_t backend,
              const String2GGMLType& tensor_types,
              const std::string prefix,
-             int64_t num_layers = 24,
-             int64_t model_dim  = 4096,
-             int64_t ff_dim     = 10240,
-             int64_t num_heads  = 64,
-             int64_t vocab_size = 32128)
-        : GGMLRunner(backend), model(num_layers, model_dim, ff_dim, num_heads, vocab_size) {
+             bool is_umt5 = false)
+        : GGMLRunner(backend) {
+        if (is_umt5) {
+            params.vocab_size         = 256384;
+            params.relative_attention = false;
+        }
+        model = T5(params);
         model.init(params_ctx, tensor_types, prefix);
     }
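
Callers only pass the flag; a hedged sketch of the two runner variants, reusing the tensor prefix that the conditioner hunk above passes in both cases:

    T5Runner t5_runner(backend, tensor_types, "text_encoders.t5xxl.transformer");         // T5-XXL defaults
    T5Runner umt5_runner(backend, tensor_types, "text_encoders.t5xxl.transformer", true); // UMT5: vocab_size/relative_attention overridden internally
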
@@ -769,7 +791,8 @@
                                     struct ggml_tensor* attention_mask = NULL) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
         input_ids = to_backend(input_ids);
+        attention_mask = to_backend(attention_mask);
 
         relative_position_bucket_vec = compute_relative_position_bucket(input_ids->ne[0], input_ids->ne[0]);
@@ -879,12 +902,8 @@ struct T5Embedder {
     T5Embedder(ggml_backend_t backend,
                const String2GGMLType& tensor_types = {},
                const std::string prefix            = "",
-               int64_t num_layers                  = 24,
-               int64_t model_dim                   = 4096,
-               int64_t ff_dim                      = 10240,
-               int64_t num_heads                   = 64,
-               int64_t vocab_size                  = 32128)
-        : model(backend, tensor_types, prefix, num_layers, model_dim, ff_dim, num_heads, vocab_size) {
+               bool is_umt5                        = false)
+        : model(backend, tensor_types, prefix, is_umt5), tokenizer(is_umt5) {
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
@@ -946,25 +965,22 @@
         GGML_ASSERT(work_ctx != NULL);
 
         {
-            // cpu f16: pass
-            // cpu f32: pass
-            // cuda f16: nan
-            // cuda f32: pass
-            // cuda q8_0: nan
-            // TODO: fix cuda nan
             std::string text("a lovely cat");
-            auto tokens_and_weights = tokenize(text, 77, true);
+            // std::string text("一只可爱的猫");  // umt5 chinese test
+            auto tokens_and_weights = tokenize(text, 512, true);
             std::vector<int>& tokens    = std::get<0>(tokens_and_weights);
             std::vector<float>& weights = std::get<1>(tokens_and_weights);
+            std::vector<float>& masks   = std::get<2>(tokens_and_weights);
             for (auto token : tokens) {
                 printf("%d ", token);
             }
             printf("\n");
             auto input_ids      = vector_to_ggml_tensor_i32(work_ctx, tokens);
+            auto attention_mask = vector_to_ggml_tensor(work_ctx, masks);
             struct ggml_tensor* out = NULL;
             int t0                  = ggml_time_ms();
-            model.compute(8, input_ids, NULL, &out, work_ctx);
+            model.compute(8, input_ids, attention_mask, &out, work_ctx);
             int t1 = ggml_time_ms();
             print_ggml_tensor(out);
@@ -973,32 +989,43 @@
     }
 
     static void load_from_file_and_test(const std::string& file_path) {
-        // ggml_backend_t backend    = ggml_backend_cuda_init(0);
-        ggml_backend_t backend         = ggml_backend_cpu_init();
-        ggml_type model_data_type      = GGML_TYPE_F32;
-        std::shared_ptr<T5Embedder> t5 = std::shared_ptr<T5Embedder>(new T5Embedder(backend));
-        {
-            LOG_INFO("loading from '%s'", file_path.c_str());
-
-            t5->alloc_params_buffer();
-            std::map<std::string, ggml_tensor*> tensors;
-            t5->get_param_tensors(tensors, "");
-
-            ModelLoader model_loader;
-            if (!model_loader.init_from_file(file_path)) {
-                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
-                return;
-            }
-
-            bool success = model_loader.load_tensors(tensors, backend);
-            if (!success) {
-                LOG_ERROR("load tensors from model loader failed");
-                return;
-            }
-
-            LOG_INFO("t5 model loaded");
-        }
+        // cpu f16: pass
+        // cpu f32: pass
+        // cuda f16: pass
+        // cuda f32: pass
+        // cuda q8_0: pass
+        ggml_backend_t backend = ggml_backend_cuda_init(0);
+        // ggml_backend_t backend = ggml_backend_cpu_init();
+        ggml_type model_data_type = GGML_TYPE_F16;
+
+        ModelLoader model_loader;
+        if (!model_loader.init_from_file(file_path)) {
+            LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
+            return;
+        }
+
+        auto tensor_types = model_loader.tensor_storages_types;
+        for (auto& item : tensor_types) {
+            // LOG_DEBUG("%s %u", item.first.c_str(), item.second);
+            if (ends_with(item.first, "weight")) {
+                item.second = model_data_type;
+            }
+        }
+
+        std::shared_ptr<T5Embedder> t5 = std::shared_ptr<T5Embedder>(new T5Embedder(backend, tensor_types, "", true));
+
+        t5->alloc_params_buffer();
+        std::map<std::string, ggml_tensor*> tensors;
+        t5->get_param_tensors(tensors, "");
+
+        bool success = model_loader.load_tensors(tensors, backend);
+        if (!success) {
+            LOG_ERROR("load tensors from model loader failed");
+            return;
+        }
+
+        LOG_INFO("t5 model loaded");
+
         t5->test();
     }
 };
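
The commented-out call in the CLI hunk above pairs with this test entry point. A hypothetical standalone driver (not part of this commit) might look like:

    #include <cstdio>
    #include "t5.hpp"

    int main(int argc, const char* argv[]) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <t5_or_umt5_checkpoint.safetensors>\n", argv[0]);
            return 1;
        }
        T5Embedder::load_from_file_and_test(argv[1]);  // loads the checkpoint, then runs the embedding test
        return 0;
    }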

thirdparty/darts.h vendored

@@ -4,6 +4,7 @@
 #include <cstdio>
 #include <exception>
 #include <new>
+#include <iostream>
 
 #define DARTS_VERSION "0.32"

@@ -1140,9 +1141,11 @@ inline void DawgBuilder::insert(const char *key, std::size_t length,
   if (value < 0) {
     DARTS_THROW("failed to insert key: negative value");
   } else if (length == 0) {
+    std::cout << value << std::endl;
     DARTS_THROW("failed to insert key: zero-length key");
   }
 
   id_type id = 0;
   std::size_t key_pos = 0;

vocab_umt5.hpp (new file, 762304 lines added)

File diff suppressed because it is too large.