feat: add sd3 support (#298)

Author: leejet, 2024-07-28 15:44:08 +08:00 (committed by GitHub)
Parent: 9c51d8787f
Commit: 73c2176648
30 changed files with 2429654 additions and 1075 deletions

.gitmodules (2 changes)

@@ -1,3 +1,3 @@
[submodule "ggml"]
path = ggml
url = https://github.com/ggerganov/ggml.git
url = https://github.com/leejet/ggml.git

README.md

@@ -1,5 +1,5 @@
<p align="center">
<img src="./assets/a%20lovely%20cat.png" width="256x">
<img src="./assets/cat_with_sd_cpp_42.png" width="360x">
</p>
# stable-diffusion.cpp
@@ -10,7 +10,7 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in
- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
- Super lightweight and without external dependencies
- SD1.x, SD2.x and SDXL support
- SD1.x, SD2.x, SDXL and SD3 support
- !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
- [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo) and [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) support
@@ -86,11 +86,13 @@ git submodule update
- Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
- Stable Diffusion v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
- Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
```shell
curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
# curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-nonema-pruned.safetensors
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors
```
### Build
@@ -226,6 +228,7 @@ For example:
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v
```
Using formats of different precisions will yield results of varying quality.
@@ -384,6 +387,7 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
- [ggml](https://github.com/ggerganov/ggml)
- [stable-diffusion](https://github.com/CompVis/stable-diffusion)
- [sd3-ref](https://github.com/Stability-AI/sd3-ref)
- [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui)
- [ComfyUI](https://github.com/comfyanonymous/ComfyUI)

Binary image file added (1.4 MiB).

Binary image file added (1.2 MiB).

clip.hpp (670 changes)

@@ -31,16 +31,6 @@ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remov
return std::make_pair(filename2multiplier, text);
}
const std::string UNK_TOKEN = "<|endoftext|>";
const std::string BOS_TOKEN = "<|startoftext|>";
const std::string EOS_TOKEN = "<|endoftext|>";
const std::string PAD_TOKEN = "<|endoftext|>";
const int UNK_TOKEN_ID = 49407;
const int BOS_TOKEN_ID = 49406;
const int EOS_TOKEN_ID = 49407;
const int PAD_TOKEN_ID = 49407;
std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
std::set<int> byte_set;
@@ -73,7 +63,6 @@ typedef std::function<bool(std::string&, std::vector<int32_t>&)> on_new_token_cb
class CLIPTokenizer {
private:
SDVersion version = VERSION_1_x;
std::map<int, std::u32string> byte_encoder;
std::map<std::u32string, int> byte_decoder;
std::map<std::u32string, int> encoder;
@@ -83,6 +72,18 @@ private:
int encoder_len;
int bpe_len;
public:
const std::string UNK_TOKEN = "<|endoftext|>";
const std::string BOS_TOKEN = "<|startoftext|>";
const std::string EOS_TOKEN = "<|endoftext|>";
const std::string PAD_TOKEN = "<|endoftext|>";
const int UNK_TOKEN_ID = 49407;
const int BOS_TOKEN_ID = 49406;
const int EOS_TOKEN_ID = 49407;
const int PAD_TOKEN_ID = 49407;
private:
static std::string strip(const std::string& str) {
std::string::size_type start = str.find_first_not_of(" \t\n\r\v\f");
std::string::size_type end = str.find_last_not_of(" \t\n\r\v\f");
@@ -117,8 +118,14 @@ private:
}
public:
CLIPTokenizer(SDVersion version = VERSION_1_x)
: version(version) {}
CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "")
: PAD_TOKEN_ID(pad_token_id) {
if (merges_utf8_str.size() > 0) {
load_from_merges(merges_utf8_str);
} else {
load_from_merges(ModelLoader::load_merges());
}
}
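// NOTE: PAD_TOKEN_ID now defaults to the EOS id 49407 used by OpenAI CLIP;
// OpenCLIP-style text encoders pad with id 0 instead, which is why the
// conditioner.hpp call sites in this commit pass 0 for SD2.x and for clip_g.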
void load_from_merges(const std::string& merges_utf8_str) {
auto byte_unicode_pairs = bytes_to_unicode();
@@ -283,11 +290,7 @@ public:
} else {
tokens.push_back(EOS_TOKEN_ID);
if (padding) {
int pad_token_id = PAD_TOKEN_ID;
if (version == VERSION_2_x) {
pad_token_id = 0;
}
tokens.insert(tokens.end(), max_length - tokens.size(), pad_token_id);
tokens.insert(tokens.end(), max_length - tokens.size(), PAD_TOKEN_ID);
}
}
}
@@ -295,6 +298,51 @@ public:
return tokens;
}
void pad_tokens(std::vector<int>& tokens,
std::vector<float>& weights,
size_t max_length = 0,
bool padding = false) {
if (max_length > 0 && padding) {
size_t n = std::ceil(tokens.size() * 1.0 / (max_length - 2));
if (n == 0) {
n = 1;
}
size_t length = max_length * n;
LOG_DEBUG("token length: %llu", length);
std::vector<int> new_tokens;
std::vector<float> new_weights;
new_tokens.push_back(BOS_TOKEN_ID);
new_weights.push_back(1.0);
int token_idx = 0;
for (int i = 1; i < length; i++) {
if (token_idx >= tokens.size()) {
break;
}
if (i % max_length == 0) {
new_tokens.push_back(BOS_TOKEN_ID);
new_weights.push_back(1.0);
} else if (i % max_length == max_length - 1) {
new_tokens.push_back(EOS_TOKEN_ID);
new_weights.push_back(1.0);
} else {
new_tokens.push_back(tokens[token_idx]);
new_weights.push_back(weights[token_idx]);
token_idx++;
}
}
new_tokens.push_back(EOS_TOKEN_ID);
new_weights.push_back(1.0);
tokens = new_tokens;
weights = new_weights;
if (padding) {
tokens.insert(tokens.end(), length - tokens.size(), PAD_TOKEN_ID);
weights.insert(weights.end(), length - weights.size(), 1.0);
}
}
}
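// Illustrative example (assumed numbers): with max_length = 77, a prompt of
// 100 token ids gives n = ceil(100 / 75) = 2 and length = 154; the result is
//   [BOS, t0..t74, EOS][BOS, t75..t99, EOS, PAD, ...]
// with every added BOS/EOS/PAD position carrying weight 1.0.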
std::string decode(const std::vector<int>& tokens) {
std::string text = "";
for (int t : tokens) {
@@ -371,113 +419,6 @@ public:
}
};
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/prompt_parser.py#L345
//
// Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
// Accepted tokens are:
// (abc) - increases attention to abc by a multiplier of 1.1
// (abc:3.12) - increases attention to abc by a multiplier of 3.12
// [abc] - decreases attention to abc by a multiplier of 1.1
// \( - literal character '('
// \[ - literal character '['
// \) - literal character ')'
// \] - literal character ']'
// \\ - literal character '\'
// anything else - just text
//
// >>> parse_prompt_attention('normal text')
// [['normal text', 1.0]]
// >>> parse_prompt_attention('an (important) word')
// [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
// >>> parse_prompt_attention('(unbalanced')
// [['unbalanced', 1.1]]
// >>> parse_prompt_attention('\(literal\]')
// [['(literal]', 1.0]]
// >>> parse_prompt_attention('(unnecessary)(parens)')
// [['unnecessaryparens', 1.1]]
// >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
// [['a ', 1.0],
// ['house', 1.5730000000000004],
// [' ', 1.1],
// ['on', 1.0],
// [' a ', 1.1],
// ['hill', 0.55],
// [', sun, ', 1.1],
// ['sky', 1.4641000000000006],
// ['.', 1.1]]
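// For example (pairs taken from the doctests above):
//   auto parsed = parse_prompt_attention("an (important) word");
//   // parsed == {{"an ", 1.0f}, {"important", 1.1f}, {" word", 1.0f}}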
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text) {
std::vector<std::pair<std::string, float>> res;
std::vector<int> round_brackets;
std::vector<int> square_brackets;
float round_bracket_multiplier = 1.1f;
float square_bracket_multiplier = 1 / 1.1f;
std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
std::regex re_break(R"(\s*\bBREAK\b\s*)");
auto multiply_range = [&](int start_position, float multiplier) {
for (int p = start_position; p < res.size(); ++p) {
res[p].second *= multiplier;
}
};
std::smatch m;
std::string remaining_text = text;
while (std::regex_search(remaining_text, m, re_attention)) {
std::string text = m[0];
std::string weight = m[1];
if (text == "(") {
round_brackets.push_back((int)res.size());
} else if (text == "[") {
square_brackets.push_back((int)res.size());
} else if (!weight.empty()) {
if (!round_brackets.empty()) {
multiply_range(round_brackets.back(), std::stof(weight));
round_brackets.pop_back();
}
} else if (text == ")" && !round_brackets.empty()) {
multiply_range(round_brackets.back(), round_bracket_multiplier);
round_brackets.pop_back();
} else if (text == "]" && !square_brackets.empty()) {
multiply_range(square_brackets.back(), square_bracket_multiplier);
square_brackets.pop_back();
} else if (text == "\\(") {
res.push_back({text.substr(1), 1.0f});
} else {
res.push_back({text, 1.0f});
}
remaining_text = m.suffix();
}
for (int pos : round_brackets) {
multiply_range(pos, round_bracket_multiplier);
}
for (int pos : square_brackets) {
multiply_range(pos, square_bracket_multiplier);
}
if (res.empty()) {
res.push_back({"", 1.0f});
}
int i = 0;
while (i + 1 < res.size()) {
if (res[i].second == res[i + 1].second) {
res[i].first += res[i + 1].first;
res.erase(res.begin() + i + 1);
} else {
++i;
}
}
return res;
}
/*================================================ FrozenCLIPEmbedder ================================================*/
// Ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py
@@ -527,7 +468,7 @@ public:
: d_model(d_model),
n_head(n_head),
intermediate_size(intermediate_size) {
blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true));
blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true));
blocks["layer_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
blocks["layer_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
@@ -897,42 +838,16 @@ public:
}
};
// ldm.modules.encoders.modules.FrozenCLIPEmbedder
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
SDVersion version = VERSION_1_x;
CLIPTokenizer tokenizer;
CLIPTextModel text_model;
CLIPTextModel text_model2;
struct CLIPTextModelRunner : public GGMLRunner {
CLIPTextModel model;
std::string embd_dir;
int32_t num_custom_embeddings = 0;
std::vector<uint8_t> token_embed_custom;
std::vector<std::string> readed_embeddings;
FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
ggml_type wtype,
SDVersion version = VERSION_1_x,
int clip_skip = -1)
: GGMLModule(backend, wtype), version(version), tokenizer(version) {
if (clip_skip <= 0) {
clip_skip = 1;
if (version == VERSION_2_x || version == VERSION_XL) {
clip_skip = 2;
}
}
if (version == VERSION_1_x) {
text_model = CLIPTextModel(OPENAI_CLIP_VIT_L_14, clip_skip);
text_model.init(params_ctx, wtype);
} else if (version == VERSION_2_x) {
text_model = CLIPTextModel(OPEN_CLIP_VIT_H_14, clip_skip);
text_model.init(params_ctx, wtype);
} else if (version == VERSION_XL) {
text_model = CLIPTextModel(OPENAI_CLIP_VIT_L_14, clip_skip, false);
text_model2 = CLIPTextModel(OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
text_model.init(params_ctx, wtype);
text_model2.init(params_ctx, wtype);
}
CLIPTextModelRunner(ggml_backend_t backend,
ggml_type wtype,
CLIPVersion version = OPENAI_CLIP_VIT_L_14,
int clip_skip_value = 1,
bool with_final_ln = true)
: GGMLRunner(backend, wtype), model(version, clip_skip_value, with_final_ln) {
model.init(params_ctx, wtype);
}
std::string get_desc() {
@@ -940,140 +855,52 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
}
void set_clip_skip(int clip_skip) {
text_model.set_clip_skip(clip_skip);
if (version == VERSION_XL) {
text_model2.set_clip_skip(clip_skip);
}
model.set_clip_skip(clip_skip);
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
text_model.get_param_tensors(tensors, prefix + "transformer.text_model");
if (version == VERSION_XL) {
text_model2.get_param_tensors(tensors, prefix + "1.transformer.text_model");
}
}
bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) {
// the order matters
ModelLoader model_loader;
if (!model_loader.init_from_file(embd_path)) {
LOG_ERROR("embedding '%s' failed", embd_name.c_str());
return false;
}
if (std::find(readed_embeddings.begin(), readed_embeddings.end(), embd_name) != readed_embeddings.end()) {
LOG_DEBUG("embedding already read in: %s", embd_name.c_str());
return true;
}
struct ggml_init_params params;
params.mem_size = 10 * 1024 * 1024; // max for custom embeddings 10 MB
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* embd_ctx = ggml_init(params);
struct ggml_tensor* embd = NULL;
auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
if (tensor_storage.ne[0] != text_model.hidden_size) {
LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], text_model.hidden_size);
return false;
}
embd = ggml_new_tensor_2d(embd_ctx, wtype, text_model.hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
*dst_tensor = embd;
return true;
};
model_loader.load_tensors(on_load, NULL);
readed_embeddings.push_back(embd_name);
token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * text_model.hidden_size * ggml_type_size(wtype)),
embd->data,
ggml_nbytes(embd));
for (int i = 0; i < embd->ne[1]; i++) {
bpe_tokens.push_back(text_model.vocab_size + num_custom_embeddings);
// LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
num_custom_embeddings++;
}
LOG_DEBUG("embedding '%s' applied, custom embeddings: %i", embd_name.c_str(), num_custom_embeddings);
return true;
model.get_param_tensors(tensors, prefix);
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* input_ids,
struct ggml_tensor* input_ids2,
struct ggml_tensor* embeddings,
size_t max_token_idx = 0,
bool return_pooled = false) {
size_t N = input_ids->ne[1];
size_t n_token = input_ids->ne[0];
if (input_ids != NULL && input_ids->ne[0] > text_model.n_token) {
GGML_ASSERT(input_ids->ne[0] % text_model.n_token == 0);
input_ids = ggml_reshape_2d(ctx, input_ids, text_model.n_token, input_ids->ne[0] / text_model.n_token);
}
if (input_ids2 != NULL && input_ids2->ne[0] > text_model2.n_token) {
GGML_ASSERT(input_ids2->ne[0] % text_model2.n_token == 0);
input_ids2 = ggml_reshape_2d(ctx, input_ids2, text_model2.n_token, input_ids2->ne[0] / text_model2.n_token);
if (input_ids->ne[0] > model.n_token) {
GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
}
if (return_pooled) {
return text_model2.forward(ctx, input_ids2, NULL, max_token_idx, return_pooled);
}
auto hidden_states = text_model.forward(ctx, input_ids, embeddings); // [N, n_token, hidden_size]
// LOG_DEBUG("hidden_states: %d %d %d %d", hidden_states->ne[0], hidden_states->ne[1], hidden_states->ne[2], hidden_states->ne[3]);
if (version == VERSION_XL) {
hidden_states = ggml_reshape_4d(ctx,
hidden_states,
hidden_states->ne[0],
hidden_states->ne[1],
hidden_states->ne[2],
hidden_states->ne[3]);
hidden_states = ggml_cont(ctx, ggml_permute(ctx, hidden_states, 2, 0, 1, 3));
auto hidden_states2 = text_model2.forward(ctx, input_ids2, NULL); // [N, n_token, hidden_size2]
// LOG_DEBUG("hidden_states: %d %d %d %d", hidden_states->ne[0], hidden_states->ne[1], hidden_states->ne[2], hidden_states->ne[3]);
hidden_states2 = ggml_reshape_4d(ctx,
hidden_states2,
hidden_states2->ne[0],
hidden_states2->ne[1],
hidden_states2->ne[2],
hidden_states2->ne[3]);
hidden_states2 = ggml_cont(ctx, ggml_permute(ctx, hidden_states2, 2, 0, 1, 3));
hidden_states = ggml_concat(ctx, hidden_states, hidden_states2, 2); // [N, n_token, hidden_size + hidden_size2]
hidden_states = ggml_cont(ctx, ggml_permute(ctx, hidden_states, 1, 2, 0, 3));
}
hidden_states = ggml_reshape_3d(ctx, hidden_states, hidden_states->ne[0], n_token, N);
// LOG_DEBUG("hidden_states: %d %d %d %d", hidden_states->ne[0], hidden_states->ne[1], hidden_states->ne[2], hidden_states->ne[3]);
return hidden_states;
return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled);
}
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
struct ggml_tensor* input_ids2 = NULL,
size_t max_token_idx = 0,
bool return_pooled = false) {
int num_custom_embeddings = 0,
void* custom_embeddings_data = NULL,
size_t max_token_idx = 0,
bool return_pooled = false) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
input_ids2 = to_backend(input_ids2);
if (!return_pooled) {
input_ids = to_backend(input_ids);
}
input_ids = to_backend(input_ids);
struct ggml_tensor* embeddings = NULL;
if (num_custom_embeddings > 0 && version != VERSION_XL) {
auto custom_embeddings = ggml_new_tensor_3d(compute_ctx,
if (num_custom_embeddings > 0 && custom_embeddings_data != NULL) {
auto custom_embeddings = ggml_new_tensor_2d(compute_ctx,
wtype,
text_model.hidden_size,
1,
model.hidden_size,
num_custom_embeddings);
set_backend_tensor_data(custom_embeddings, token_embed_custom.data());
set_backend_tensor_data(custom_embeddings, custom_embeddings_data);
auto token_embed_weight = text_model.get_token_embed_weight();
token_embed_weight = ggml_reshape_3d(compute_ctx, token_embed_weight, token_embed_weight->ne[0], 1, token_embed_weight->ne[1]);
auto token_embed_weight = model.get_token_embed_weight();
// concatenate custom embeddings
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 2);
embeddings = ggml_reshape_2d(compute_ctx, embeddings, embeddings->ne[0], embeddings->ne[2]);
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
}
struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, input_ids2, embeddings, max_token_idx, return_pooled);
struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled);
ggml_build_forward_expand(gf, hidden_states);
@@ -1082,317 +909,16 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
void compute(const int n_threads,
struct ggml_tensor* input_ids,
struct ggml_tensor* input_ids2,
int num_custom_embeddings,
void* custom_embeddings_data,
size_t max_token_idx,
bool return_pooled,
ggml_tensor** output,
ggml_context* output_ctx = NULL) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(input_ids, input_ids2, max_token_idx, return_pooled);
return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
};
GGMLModule::compute(get_graph, n_threads, true, output, output_ctx);
}
std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
bool padding = false) {
return tokenize(text, text_model.n_token, padding);
}
std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
tokenize_with_trigger_token(std::string text,
int num_input_imgs,
int32_t image_token,
bool padding = false) {
return tokenize_with_trigger_token(text, num_input_imgs, image_token,
text_model.n_token, padding);
}
std::vector<int> convert_token_to_id(std::string text) {
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
size_t word_end = str.find(",");
std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
embd_name = trim(embd_name);
std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
}
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
}
if (embd_path.size() > 0) {
if (load_embedding(embd_name, embd_path, bpe_tokens)) {
if (word_end != std::string::npos) {
str = str.substr(word_end);
} else {
str = "";
}
return true;
}
}
return false;
};
std::vector<int> curr_tokens = tokenizer.encode(text, on_new_token_cb);
return curr_tokens;
}
std::string decode(const std::vector<int>& tokens) {
return tokenizer.decode(tokens);
}
void pad_tokens(std::vector<int>& tokens,
std::vector<float>& weights,
size_t max_length = 0,
bool padding = false) {
if (max_length > 0 && padding) {
size_t n = std::ceil(tokens.size() * 1.0 / (max_length - 2));
if (n == 0) {
n = 1;
}
size_t length = max_length * n;
LOG_DEBUG("token length: %llu", length);
std::vector<int> new_tokens;
std::vector<float> new_weights;
new_tokens.push_back(BOS_TOKEN_ID);
new_weights.push_back(1.0);
int token_idx = 0;
for (int i = 1; i < length; i++) {
if (token_idx >= tokens.size()) {
break;
}
if (i % max_length == 0) {
new_tokens.push_back(BOS_TOKEN_ID);
new_weights.push_back(1.0);
} else if (i % max_length == max_length - 1) {
new_tokens.push_back(EOS_TOKEN_ID);
new_weights.push_back(1.0);
} else {
new_tokens.push_back(tokens[token_idx]);
new_weights.push_back(weights[token_idx]);
token_idx++;
}
}
new_tokens.push_back(EOS_TOKEN_ID);
new_weights.push_back(1.0);
tokens = new_tokens;
weights = new_weights;
if (padding) {
int pad_token_id = PAD_TOKEN_ID;
if (version == VERSION_2_x) {
pad_token_id = 0;
}
tokens.insert(tokens.end(), length - tokens.size(), pad_token_id);
weights.insert(weights.end(), length - weights.size(), 1.0);
}
}
}
std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
tokenize_with_trigger_token(std::string text,
int num_input_imgs,
int32_t image_token,
size_t max_length = 0,
bool padding = false) {
auto parsed_attention = parse_prompt_attention(text);
{
std::stringstream ss;
ss << "[";
for (const auto& item : parsed_attention) {
ss << "['" << item.first << "', " << item.second << "], ";
}
ss << "]";
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
}
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
size_t word_end = str.find(",");
std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
embd_name = trim(embd_name);
std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
}
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
}
if (embd_path.size() > 0) {
if (load_embedding(embd_name, embd_path, bpe_tokens)) {
if (word_end != std::string::npos) {
str = str.substr(word_end);
} else {
str = "";
}
return true;
}
}
return false;
};
std::vector<int> tokens;
std::vector<float> weights;
std::vector<bool> class_token_mask;
int32_t class_idx = -1, tokens_acc = 0;
for (const auto& item : parsed_attention) {
std::vector<int> class_token_index;
std::vector<int> clean_input_ids;
const std::string& curr_text = item.first;
float curr_weight = item.second;
// printf(" %s: %f \n", curr_text.c_str(), curr_weight);
std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
int32_t clean_index = 0;
for (uint32_t i = 0; i < curr_tokens.size(); i++) {
int token_id = curr_tokens[i];
if (token_id == image_token)
class_token_index.push_back(clean_index - 1);
else {
clean_input_ids.push_back(token_id);
clean_index++;
}
}
// GGML_ASSERT(class_token_index.size() == 1); // PhotoMaker currently does not support multiple
// trigger words in a single prompt.
if (class_token_index.size() == 1) {
// Expand the class word token and corresponding mask
int class_token = clean_input_ids[class_token_index[0]];
class_idx = tokens_acc + class_token_index[0];
std::vector<int> clean_input_ids_tmp;
for (uint32_t i = 0; i < class_token_index[0]; i++)
clean_input_ids_tmp.push_back(clean_input_ids[i]);
for (uint32_t i = 0; i < num_input_imgs; i++)
clean_input_ids_tmp.push_back(class_token);
for (uint32_t i = class_token_index[0] + 1; i < clean_input_ids.size(); i++)
clean_input_ids_tmp.push_back(clean_input_ids[i]);
clean_input_ids.clear();
clean_input_ids = clean_input_ids_tmp;
}
tokens_acc += clean_index;
tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
weights.insert(weights.end(), clean_input_ids.size(), curr_weight);
}
tokens.insert(tokens.begin(), BOS_TOKEN_ID);
weights.insert(weights.begin(), 1.0);
pad_tokens(tokens, weights, max_length, padding);
for (uint32_t i = 0; i < tokens.size(); i++) {
if (class_idx + 1 <= i && i < class_idx + 1 + num_input_imgs)
class_token_mask.push_back(true);
else
class_token_mask.push_back(false);
}
// printf("[");
// for (int i = 0; i < tokens.size(); i++) {
// printf("%d, ", class_token_mask[i] ? 1 : 0);
// }
// printf("]\n");
// for (int i = 0; i < tokens.size(); i++) {
// std::cout << tokens[i] << ":" << weights[i] << ", ";
// }
// std::cout << std::endl;
return std::make_tuple(tokens, weights, class_token_mask);
}
std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
size_t max_length = 0,
bool padding = false) {
auto parsed_attention = parse_prompt_attention(text);
{
std::stringstream ss;
ss << "[";
for (const auto& item : parsed_attention) {
ss << "['" << item.first << "', " << item.second << "], ";
}
ss << "]";
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
}
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
size_t word_end = str.find(",");
std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
embd_name = trim(embd_name);
std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
}
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
}
if (embd_path.size() > 0) {
if (load_embedding(embd_name, embd_path, bpe_tokens)) {
if (word_end != std::string::npos) {
str = str.substr(word_end);
} else {
str = "";
}
return true;
}
}
return false;
};
std::vector<int> tokens;
std::vector<float> weights;
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
weights.insert(weights.end(), curr_tokens.size(), curr_weight);
}
pad_tokens(tokens, weights, max_length, padding);
// for (int i = 0; i < tokens.size(); i++) {
// std::cout << tokens[i] << ":" << weights[i] << ", ";
// }
// std::cout << std::endl;
return {tokens, weights};
}
};
struct FrozenCLIPVisionEmbedder : public GGMLModule {
CLIPVisionModelProjection vision_model;
FrozenCLIPVisionEmbedder(ggml_backend_t backend, ggml_type wtype)
: vision_model(OPEN_CLIP_VIT_H_14, true), GGMLModule(backend, wtype) {
vision_model.init(params_ctx, wtype);
}
std::string get_desc() {
return "clip_vision";
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
vision_model.get_param_tensors(tensors, prefix + "transformer");
}
struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
pixel_values = to_backend(pixel_values);
struct ggml_tensor* hidden_states = vision_model.forward(compute_ctx, pixel_values);
ggml_build_forward_expand(gf, hidden_states);
return gf;
}
void compute(const int n_threads,
ggml_tensor* pixel_values,
ggml_tensor** output,
ggml_context* output_ctx) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(pixel_values);
};
GGMLModule::compute(get_graph, n_threads, true, output, output_ctx);
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
}
};

common.hpp

@@ -279,26 +279,11 @@ public:
int64_t n_context = context->ne[1];
int64_t inner_dim = d_head * n_head;
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
q = ggml_reshape_4d(ctx, q, d_head, n_head, n_token, n); // [N, n_token, n_head, d_head]
q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, n_token, d_head]
q = ggml_reshape_3d(ctx, q, d_head, n_token, n_head * n); // [N * n_head, n_token, d_head]
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
k = ggml_reshape_4d(ctx, k, d_head, n_head, n_context, n); // [N, n_context, n_head, d_head]
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, n_context, d_head]
k = ggml_reshape_3d(ctx, k, d_head, n_context, n_head * n); // [N * n_head, n_context, d_head]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
v = ggml_reshape_4d(ctx, v, d_head, n_head, n_context, n); // [N, n_context, n_head, d_head]
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, n_context]
v = ggml_reshape_3d(ctx, v, n_context, d_head, n_head * n); // [N * n_head, d_head, n_context]
auto kqv = ggml_nn_attention(ctx, q, k, v, false); // [N * n_head, n_token, d_head]
kqv = ggml_reshape_4d(ctx, kqv, d_head, n_token, n_head, n);
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, n_token, n_head, d_head]
x = ggml_reshape_3d(ctx, kqv, d_head * n_head, n_token, n); // [N, n_token, inner_dim]
x = ggml_nn_attention_ext(ctx, q, k, v, n_head, NULL, false); // [N, n_token, inner_dim]
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
return x;
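// Note on the refactor above: the manual head-split (reshape to
// [N, n_*, n_head, d_head], permute, batched ggml_nn_attention, merge) is
// assumed to happen inside ggml_nn_attention_ext, which takes q/k/v as
// [N, n_token/n_context, inner_dim] plus n_head and returns
// [N, n_token, inner_dim] directly.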

conditioner.hpp (new file, 981 lines)

@@ -0,0 +1,981 @@
#ifndef __CONDITIONER_HPP__
#define __CONDITIONER_HPP__
#include "clip.hpp"
#include "t5.hpp"
struct SDCondition {
struct ggml_tensor* c_crossattn = NULL; // aka context
struct ggml_tensor* c_vector = NULL; // aka y
struct ggml_tensor* c_concat = NULL;
SDCondition() = default;
SDCondition(struct ggml_tensor* c_crossattn, struct ggml_tensor* c_vector, struct ggml_tensor* c_concat) :
c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat) {}
};
struct Conditioner {
virtual SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
const std::string& text,
int clip_skip,
int width,
int height,
int adm_in_channels = -1,
bool force_zero_embeddings = false) = 0;
virtual void alloc_params_buffer() = 0;
virtual void free_params_buffer() = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
virtual size_t get_params_buffer_size() = 0;
virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
int n_threads,
const std::string& text,
int clip_skip,
int width,
int height,
int num_input_imgs,
int adm_in_channels = -1,
bool force_zero_embeddings = false) = 0;
virtual std::string remove_trigger_from_prompt(ggml_context* work_ctx,
const std::string& prompt) = 0;
};
// ldm.modules.encoders.modules.FrozenCLIPEmbedder
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
SDVersion version = VERSION_1_x;
CLIPTokenizer tokenizer;
ggml_type wtype;
std::shared_ptr<CLIPTextModelRunner> text_model;
std::shared_ptr<CLIPTextModelRunner> text_model2;
std::string trigger_word = "img"; // should be user settable
std::string embd_dir;
int32_t num_custom_embeddings = 0;
std::vector<uint8_t> token_embed_custom;
std::vector<std::string> readed_embeddings;
FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
ggml_type wtype,
const std::string& embd_dir,
SDVersion version = VERSION_1_x,
int clip_skip = -1)
: version(version), tokenizer(version == VERSION_2_x ? 0 : 49407), embd_dir(embd_dir), wtype(wtype) {
if (clip_skip <= 0) {
clip_skip = 1;
if (version == VERSION_2_x || version == VERSION_XL) {
clip_skip = 2;
}
}
if (version == VERSION_1_x) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip);
} else if (version == VERSION_2_x) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_H_14, clip_skip);
} else if (version == VERSION_XL) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip, false);
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
}
}
void set_clip_skip(int clip_skip) {
text_model->set_clip_skip(clip_skip);
if (version == VERSION_XL) {
text_model2->set_clip_skip(clip_skip);
}
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
if (version == VERSION_XL) {
text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model");
}
}
void alloc_params_buffer() {
text_model->alloc_params_buffer();
if (version == VERSION_XL) {
text_model2->alloc_params_buffer();
}
}
void free_params_buffer() {
text_model->free_params_buffer();
if (version == VERSION_XL) {
text_model2->free_params_buffer();
}
}
size_t get_params_buffer_size() {
size_t buffer_size = text_model->get_params_buffer_size();
if (version == VERSION_XL) {
buffer_size += text_model2->get_params_buffer_size();
}
return buffer_size;
}
bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) {
// the order matters
ModelLoader model_loader;
if (!model_loader.init_from_file(embd_path)) {
LOG_ERROR("embedding '%s' failed", embd_name.c_str());
return false;
}
if (std::find(readed_embeddings.begin(), readed_embeddings.end(), embd_name) != readed_embeddings.end()) {
LOG_DEBUG("embedding already read in: %s", embd_name.c_str());
return true;
}
struct ggml_init_params params;
params.mem_size = 10 * 1024 * 1024; // max for custom embeddings 10 MB
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* embd_ctx = ggml_init(params);
struct ggml_tensor* embd = NULL;
int64_t hidden_size = text_model->model.hidden_size;
auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
if (tensor_storage.ne[0] != hidden_size) {
LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], hidden_size);
return false;
}
embd = ggml_new_tensor_2d(embd_ctx, wtype, hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
*dst_tensor = embd;
return true;
};
model_loader.load_tensors(on_load, NULL);
readed_embeddings.push_back(embd_name);
token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * hidden_size * ggml_type_size(wtype)),
embd->data,
ggml_nbytes(embd));
for (int i = 0; i < embd->ne[1]; i++) {
bpe_tokens.push_back(text_model->model.vocab_size + num_custom_embeddings);
// LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
num_custom_embeddings++;
}
LOG_DEBUG("embedding '%s' applied, custom embeddings: %i", embd_name.c_str(), num_custom_embeddings);
return true;
}
std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
tokenize_with_trigger_token(std::string text,
int num_input_imgs,
int32_t image_token,
bool padding = false) {
return tokenize_with_trigger_token(text, num_input_imgs, image_token,
text_model->model.n_token, padding);
}
std::vector<int> convert_token_to_id(std::string text) {
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
size_t word_end = str.find(",");
std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
embd_name = trim(embd_name);
std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
}
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
}
if (embd_path.size() > 0) {
if (load_embedding(embd_name, embd_path, bpe_tokens)) {
if (word_end != std::string::npos) {
str = str.substr(word_end);
} else {
str = "";
}
return true;
}
}
return false;
};
std::vector<int> curr_tokens = tokenizer.encode(text, on_new_token_cb);
return curr_tokens;
}
std::string decode(const std::vector<int>& tokens) {
return tokenizer.decode(tokens);
}
std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
tokenize_with_trigger_token(std::string text,
int num_input_imgs,
int32_t image_token,
size_t max_length = 0,
bool padding = false) {
auto parsed_attention = parse_prompt_attention(text);
{
std::stringstream ss;
ss << "[";
for (const auto& item : parsed_attention) {
ss << "['" << item.first << "', " << item.second << "], ";
}
ss << "]";
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
}
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
size_t word_end = str.find(",");
std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
embd_name = trim(embd_name);
std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
}
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
}
if (embd_path.size() > 0) {
if (load_embedding(embd_name, embd_path, bpe_tokens)) {
if (word_end != std::string::npos) {
str = str.substr(word_end);
} else {
str = "";
}
return true;
}
}
return false;
};
std::vector<int> tokens;
std::vector<float> weights;
std::vector<bool> class_token_mask;
int32_t class_idx = -1, tokens_acc = 0;
for (const auto& item : parsed_attention) {
std::vector<int> class_token_index;
std::vector<int> clean_input_ids;
const std::string& curr_text = item.first;
float curr_weight = item.second;
// printf(" %s: %f \n", curr_text.c_str(), curr_weight);
std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
int32_t clean_index = 0;
for (uint32_t i = 0; i < curr_tokens.size(); i++) {
int token_id = curr_tokens[i];
if (token_id == image_token)
class_token_index.push_back(clean_index - 1);
else {
clean_input_ids.push_back(token_id);
clean_index++;
}
}
// GGML_ASSERT(class_token_index.size() == 1); // PhotoMaker currently does not support multiple
// trigger words in a single prompt.
if (class_token_index.size() == 1) {
// Expand the class word token and corresponding mask
int class_token = clean_input_ids[class_token_index[0]];
class_idx = tokens_acc + class_token_index[0];
std::vector<int> clean_input_ids_tmp;
for (uint32_t i = 0; i < class_token_index[0]; i++)
clean_input_ids_tmp.push_back(clean_input_ids[i]);
for (uint32_t i = 0; i < num_input_imgs; i++)
clean_input_ids_tmp.push_back(class_token);
for (uint32_t i = class_token_index[0] + 1; i < clean_input_ids.size(); i++)
clean_input_ids_tmp.push_back(clean_input_ids[i]);
clean_input_ids.clear();
clean_input_ids = clean_input_ids_tmp;
}
tokens_acc += clean_index;
tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
weights.insert(weights.end(), clean_input_ids.size(), curr_weight);
}
tokens.insert(tokens.begin(), tokenizer.BOS_TOKEN_ID);
weights.insert(weights.begin(), 1.0);
tokenizer.pad_tokens(tokens, weights, max_length, padding);
for (uint32_t i = 0; i < tokens.size(); i++) {
if (class_idx + 1 <= i && i < class_idx + 1 + num_input_imgs)
class_token_mask.push_back(true);
else
class_token_mask.push_back(false);
}
// printf("[");
// for (int i = 0; i < tokens.size(); i++) {
// printf("%d, ", class_token_mask[i] ? 1 : 0);
// }
// printf("]\n");
// for (int i = 0; i < tokens.size(); i++) {
// std::cout << tokens[i] << ":" << weights[i] << ", ";
// }
// std::cout << std::endl;
return std::make_tuple(tokens, weights, class_token_mask);
}
std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
bool padding = false) {
return tokenize(text, text_model->model.n_token, padding);
}
std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
size_t max_length = 0,
bool padding = false) {
auto parsed_attention = parse_prompt_attention(text);
{
std::stringstream ss;
ss << "[";
for (const auto& item : parsed_attention) {
ss << "['" << item.first << "', " << item.second << "], ";
}
ss << "]";
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
}
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
size_t word_end = str.find(",");
std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
embd_name = trim(embd_name);
std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
}
if (embd_path.size() == 0) {
embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
}
if (embd_path.size() > 0) {
if (load_embedding(embd_name, embd_path, bpe_tokens)) {
if (word_end != std::string::npos) {
str = str.substr(word_end);
} else {
str = "";
}
return true;
}
}
return false;
};
std::vector<int> tokens;
std::vector<float> weights;
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
weights.insert(weights.end(), curr_tokens.size(), curr_weight);
}
tokenizer.pad_tokens(tokens, weights, max_length, padding);
// for (int i = 0; i < tokens.size(); i++) {
// std::cout << tokens[i] << ":" << weights[i] << ", ";
// }
// std::cout << std::endl;
return {tokens, weights};
}
SDCondition get_learned_condition_common(ggml_context* work_ctx,
int n_threads,
std::vector<int>& tokens,
std::vector<float>& weights,
int clip_skip,
int width,
int height,
int adm_in_channels = -1,
bool force_zero_embeddings = false) {
set_clip_skip(clip_skip);
int64_t t0 = ggml_time_ms();
struct ggml_tensor* hidden_states = NULL; // [N, n_token, hidden_size]
struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
struct ggml_tensor* chunk_hidden_states1 = NULL; // [n_token, hidden_size]
struct ggml_tensor* chunk_hidden_states2 = NULL; // [n_token, hidden_size2]
struct ggml_tensor* pooled = NULL;
std::vector<float> hidden_states_vec;
size_t chunk_len = 77;
size_t chunk_count = tokens.size() / chunk_len;
for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
std::vector<int> chunk_tokens(tokens.begin() + chunk_idx * chunk_len,
tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector<float> chunk_weights(weights.begin() + chunk_idx * chunk_len,
weights.begin() + (chunk_idx + 1) * chunk_len);
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
struct ggml_tensor* input_ids2 = NULL;
size_t max_token_idx = 0;
if (version == VERSION_XL) {
auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID);
if (it != chunk_tokens.end()) {
std::fill(std::next(it), chunk_tokens.end(), 0);
}
max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
input_ids2 = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
// for (int i = 0; i < chunk_tokens.size(); i++) {
// printf("%d ", chunk_tokens[i]);
// }
// printf("\n");
}
{
text_model->compute(n_threads,
input_ids,
num_custom_embeddings,
token_embed_custom.data(),
max_token_idx,
false,
&chunk_hidden_states1,
work_ctx);
if (version == VERSION_XL) {
text_model2->compute(n_threads,
input_ids2,
0,
NULL,
max_token_idx,
false,
&chunk_hidden_states2, work_ctx);
// concat
chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
if (chunk_idx == 0) {
text_model2->compute(n_threads,
input_ids2,
0,
NULL,
max_token_idx,
true,
&pooled,
work_ctx);
}
} else {
chunk_hidden_states = chunk_hidden_states1;
}
}
int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
ggml_tensor* result = ggml_dup_tensor(work_ctx, chunk_hidden_states);
{
float original_mean = ggml_tensor_mean(chunk_hidden_states);
for (int i2 = 0; i2 < chunk_hidden_states->ne[2]; i2++) {
for (int i1 = 0; i1 < chunk_hidden_states->ne[1]; i1++) {
for (int i0 = 0; i0 < chunk_hidden_states->ne[0]; i0++) {
float value = ggml_tensor_get_f32(chunk_hidden_states, i0, i1, i2);
value *= chunk_weights[i1];
ggml_tensor_set_f32(result, value, i0, i1, i2);
}
}
}
float new_mean = ggml_tensor_mean(result);
ggml_tensor_scale(result, (original_mean / new_mean));
}
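// Sketch of the reweighting math: each value at token position i1 is scaled
// by chunk_weights[i1], then the whole tensor is rescaled by
// original_mean / new_mean, so per-token emphasis changes while the overall
// activation magnitude stays roughly constant.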
if (force_zero_embeddings) {
float* vec = (float*)result->data;
for (int i = 0; i < ggml_nelements(result); i++) {
vec[i] = 0;
}
}
hidden_states_vec.insert(hidden_states_vec.end(), (float*)result->data, ((float*)result->data) + ggml_nelements(result));
}
hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
hidden_states = ggml_reshape_2d(work_ctx,
hidden_states,
chunk_hidden_states->ne[0],
ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
ggml_tensor* vec = NULL;
if (version == VERSION_XL) {
int out_dim = 256;
vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels);
// [0:1280]
size_t offset = 0;
memcpy(vec->data, pooled->data, ggml_nbytes(pooled));
offset += ggml_nbytes(pooled);
// original_size_as_tuple
float orig_width = (float)width;
float orig_height = (float)height;
std::vector<float> timesteps = {orig_height, orig_width};
ggml_tensor* embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
offset += ggml_nbytes(embed_view);
set_timestep_embedding(timesteps, embed_view, out_dim);
// print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
// crop_coords_top_left
float crop_coord_top = 0.f;
float crop_coord_left = 0.f;
timesteps = {crop_coord_top, crop_coord_left};
embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
offset += ggml_nbytes(embed_view);
set_timestep_embedding(timesteps, embed_view, out_dim);
// print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
// target_size_as_tuple
float target_width = (float)width;
float target_height = (float)height;
timesteps = {target_height, target_width};
embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
offset += ggml_nbytes(embed_view);
set_timestep_embedding(timesteps, embed_view, out_dim);
// print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
GGML_ASSERT(offset == ggml_nbytes(vec));
}
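// Layout sketch for the SDXL vector conditioning (assuming the usual
// adm_in_channels of 2816): pooled (1280) | original_size (2 x 256) |
// crop_coords_top_left (2 x 256) | target_size (2 x 256), with each pair
// encoded via set_timestep_embedding at out_dim = 256.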
// print_ggml_tensor(result);
return SDCondition(hidden_states, vec, NULL);
}
std::tuple<SDCondition, std::vector<bool>>
get_learned_condition_with_trigger(ggml_context* work_ctx,
int n_threads,
const std::string& text,
int clip_skip,
int width,
int height,
int num_input_imgs,
int adm_in_channels = -1,
bool force_zero_embeddings = false) {
auto image_tokens = convert_token_to_id(trigger_word);
// if(image_tokens.size() == 1){
// printf(" image token id is: %d \n", image_tokens[0]);
// }
GGML_ASSERT(image_tokens.size() == 1);
auto tokens_and_weights = tokenize_with_trigger_token(text,
num_input_imgs,
image_tokens[0],
true);
std::vector<int>& tokens = std::get<0>(tokens_and_weights);
std::vector<float>& weights = std::get<1>(tokens_and_weights);
std::vector<bool>& clsm = std::get<2>(tokens_and_weights);
// printf("tokens: \n");
// for(int i = 0; i < tokens.size(); ++i)
// printf("%d ", tokens[i]);
// printf("\n");
// printf("clsm: \n");
// for(int i = 0; i < clsm.size(); ++i)
// printf("%d ", clsm[i]?1:0);
// printf("\n");
auto cond = get_learned_condition_common(work_ctx, n_threads, tokens, weights, clip_skip, width, height, adm_in_channels, force_zero_embeddings);
return std::make_tuple(cond, clsm);
}
std::string remove_trigger_from_prompt(ggml_context* work_ctx,
const std::string& prompt) {
auto image_tokens = convert_token_to_id(trigger_word);
GGML_ASSERT(image_tokens.size() == 1);
auto tokens_and_weights = tokenize(prompt, false);
std::vector<int>& tokens = tokens_and_weights.first;
auto it = std::find(tokens.begin(), tokens.end(), image_tokens[0]);
GGML_ASSERT(it != tokens.end()); // prompt must have trigger word
tokens.erase(it);
return decode(tokens);
}
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
const std::string& text,
int clip_skip,
int width,
int height,
int adm_in_channels = -1,
bool force_zero_embeddings = false) {
auto tokens_and_weights = tokenize(text, true);
std::vector<int>& tokens = tokens_and_weights.first;
std::vector<float>& weights = tokens_and_weights.second;
return get_learned_condition_common(work_ctx, n_threads, tokens, weights, clip_skip, width, height, adm_in_channels, force_zero_embeddings);
}
};
struct FrozenCLIPVisionEmbedder : public GGMLRunner {
CLIPVisionModelProjection vision_model;
FrozenCLIPVisionEmbedder(ggml_backend_t backend, ggml_type wtype)
: vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend, wtype) {
vision_model.init(params_ctx, wtype);
}
std::string get_desc() {
return "clip_vision";
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
vision_model.get_param_tensors(tensors, "cond_stage_model.transformer");
}
struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
pixel_values = to_backend(pixel_values);
struct ggml_tensor* hidden_states = vision_model.forward(compute_ctx, pixel_values);
ggml_build_forward_expand(gf, hidden_states);
return gf;
}
void compute(const int n_threads,
ggml_tensor* pixel_values,
ggml_tensor** output,
ggml_context* output_ctx) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(pixel_values);
};
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
}
};
struct SD3CLIPEmbedder : public Conditioner {
ggml_type wtype;
CLIPTokenizer clip_l_tokenizer;
CLIPTokenizer clip_g_tokenizer;
T5UniGramTokenizer t5_tokenizer;
std::shared_ptr<CLIPTextModelRunner> clip_l;
std::shared_ptr<CLIPTextModelRunner> clip_g;
std::shared_ptr<T5Runner> t5;
SD3CLIPEmbedder(ggml_backend_t backend,
ggml_type wtype,
int clip_skip = -1)
: wtype(wtype), clip_g_tokenizer(0) {
if (clip_skip <= 0) {
clip_skip = 2;
}
clip_l = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip, false);
clip_g = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
t5 = std::make_shared<T5Runner>(backend, wtype);
}
void set_clip_skip(int clip_skip) {
clip_l->set_clip_skip(clip_skip);
clip_g->set_clip_skip(clip_skip);
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model");
t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
}
void alloc_params_buffer() {
clip_l->alloc_params_buffer();
clip_g->alloc_params_buffer();
t5->alloc_params_buffer();
}
void free_params_buffer() {
clip_l->free_params_buffer();
clip_g->free_params_buffer();
t5->free_params_buffer();
}
size_t get_params_buffer_size() {
size_t buffer_size = clip_l->get_params_buffer_size();
buffer_size += clip_g->get_params_buffer_size();
buffer_size += t5->get_params_buffer_size();
return buffer_size;
}
std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
size_t max_length = 0,
bool padding = false) {
auto parsed_attention = parse_prompt_attention(text);
{
std::stringstream ss;
ss << "[";
for (const auto& item : parsed_attention) {
ss << "['" << item.first << "', " << item.second << "], ";
}
ss << "]";
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
}
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
return false;
};
std::vector<int> clip_l_tokens;
std::vector<float> clip_l_weights;
std::vector<int> clip_g_tokens;
std::vector<float> clip_g_weights;
std::vector<int> t5_tokens;
std::vector<float> t5_weights;
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
std::vector<int> curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb);
clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end());
clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight);
curr_tokens = clip_g_tokenizer.encode(curr_text, on_new_token_cb);
clip_g_tokens.insert(clip_g_tokens.end(), curr_tokens.begin(), curr_tokens.end());
clip_g_weights.insert(clip_g_weights.end(), curr_tokens.size(), curr_weight);
curr_tokens = t5_tokenizer.Encode(curr_text, true);
t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
}
clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, max_length, padding);
clip_g_tokenizer.pad_tokens(clip_g_tokens, clip_g_weights, max_length, padding);
t5_tokenizer.pad_tokens(t5_tokens, t5_weights, max_length, padding);
// for (int i = 0; i < clip_l_tokens.size(); i++) {
// std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", ";
// }
// std::cout << std::endl;
// for (int i = 0; i < clip_g_tokens.size(); i++) {
// std::cout << clip_g_tokens[i] << ":" << clip_g_weights[i] << ", ";
// }
// std::cout << std::endl;
// for (int i = 0; i < t5_tokens.size(); i++) {
// std::cout << t5_tokens[i] << ":" << t5_weights[i] << ", ";
// }
// std::cout << std::endl;
return {{clip_l_tokens, clip_l_weights}, {clip_g_tokens, clip_g_weights}, {t5_tokens, t5_weights}};
}
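// Sketch of the returned value: three parallel (tokens, weights) pairs,
// index 0 for clip_l, 1 for clip_g, 2 for t5. The CLIP streams are padded
// via CLIPTokenizer::pad_tokens (BOS/EOS-wrapped chunks) and the T5 stream
// via the T5 tokenizer's own pad_tokens.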
SDCondition get_learned_condition_common(ggml_context* work_ctx,
int n_threads,
std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
int clip_skip,
bool force_zero_embeddings = false) {
set_clip_skip(clip_skip);
auto& clip_l_tokens = token_and_weights[0].first;
auto& clip_l_weights = token_and_weights[0].second;
auto& clip_g_tokens = token_and_weights[1].first;
auto& clip_g_weights = token_and_weights[1].second;
auto& t5_tokens = token_and_weights[2].first;
auto& t5_weights = token_and_weights[2].second;
int64_t t0 = ggml_time_ms();
struct ggml_tensor* hidden_states = NULL; // [N, n_token*2, 4096]
struct ggml_tensor* chunk_hidden_states = NULL; // [n_token*2, 4096]
struct ggml_tensor* chunk_hidden_states_l = NULL; // [n_token, hidden_size_l]
struct ggml_tensor* chunk_hidden_states_g = NULL; // [n_token, hidden_size_g]
struct ggml_tensor* chunk_hidden_states_t5 = NULL; // [n_token, hidden_size_t5]
struct ggml_tensor* pooled = NULL;
struct ggml_tensor* pooled_l = NULL; // [768,]
struct ggml_tensor* pooled_g = NULL; // [1280,]
std::vector<float> hidden_states_vec;
size_t chunk_len = 77;
size_t chunk_count = clip_l_tokens.size() / chunk_len;
for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
// clip_l
{
std::vector<int> chunk_tokens(clip_l_tokens.begin() + chunk_idx * chunk_len,
clip_l_tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector<float> chunk_weights(clip_l_weights.begin() + chunk_idx * chunk_len,
clip_l_weights.begin() + (chunk_idx + 1) * chunk_len);
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
size_t max_token_idx = 0;
clip_l->compute(n_threads,
input_ids,
0,
NULL,
max_token_idx,
false,
&chunk_hidden_states_l,
work_ctx);
{
auto tensor = chunk_hidden_states_l;
float original_mean = ggml_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1];
ggml_tensor_set_f32(tensor, value, i0, i1, i2);
}
}
}
float new_mean = ggml_tensor_mean(tensor);
ggml_tensor_scale(tensor, (original_mean / new_mean));
}
if (chunk_idx == 0) {
// auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
// max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
// clip_l->compute(n_threads,
// input_ids,
// 0,
// NULL,
// max_token_idx,
// true,
// &pooled_l,
// work_ctx);
// clip_l.transformer.text_model.text_projection not in file, ignore
// TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
pooled_l = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
ggml_set_f32(pooled_l, 0.f);
}
}
// clip_g
{
std::vector<int> chunk_tokens(clip_g_tokens.begin() + chunk_idx * chunk_len,
clip_g_tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector<float> chunk_weights(clip_g_weights.begin() + chunk_idx * chunk_len,
clip_g_weights.begin() + (chunk_idx + 1) * chunk_len);
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
size_t max_token_idx = 0;
clip_g->compute(n_threads,
input_ids,
0,
NULL,
max_token_idx,
false,
&chunk_hidden_states_g,
work_ctx);
{
auto tensor = chunk_hidden_states_g;
float original_mean = ggml_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1];
ggml_tensor_set_f32(tensor, value, i0, i1, i2);
}
}
}
float new_mean = ggml_tensor_mean(tensor);
ggml_tensor_scale(tensor, (original_mean / new_mean));
}
if (chunk_idx == 0) {
// auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_g_tokenizer.EOS_TOKEN_ID);
// max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
// clip_g->compute(n_threads,
// input_ids,
// 0,
// NULL,
// max_token_idx,
// true,
// &pooled_g,
// work_ctx);
// clip_g.transformer.text_model.text_projection not in file, ignore pooled_g too
// TODO: fix pooled_g
pooled_g = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1280);
ggml_set_f32(pooled_g, 0.f);
}
}
// t5
{
std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
t5_tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len,
t5_weights.begin() + (chunk_idx + 1) * chunk_len);
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
t5->compute(n_threads,
input_ids,
&chunk_hidden_states_t5,
work_ctx);
{
auto tensor = chunk_hidden_states_t5;
float original_mean = ggml_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1];
ggml_tensor_set_f32(tensor, value, i0, i1, i2);
}
}
}
float new_mean = ggml_tensor_mean(tensor);
ggml_tensor_scale(tensor, (original_mean / new_mean));
}
}
auto chunk_hidden_states_lg_pad = ggml_new_tensor_3d(work_ctx,
chunk_hidden_states_l->type,
4096,
chunk_hidden_states_l->ne[1],
chunk_hidden_states_l->ne[2]); // [n_token, 4096]
for (int i2 = 0; i2 < chunk_hidden_states_lg_pad->ne[2]; i2++) {
for (int i1 = 0; i1 < chunk_hidden_states_lg_pad->ne[1]; i1++) {
for (int i0 = 0; i0 < chunk_hidden_states_lg_pad->ne[0]; i0++) {
float value = 0.f;
if (i0 < chunk_hidden_states_l->ne[0]) {
value = ggml_tensor_get_f32(chunk_hidden_states_l, i0, i1, i2);
} else if (i0 < chunk_hidden_states_l->ne[0] + chunk_hidden_states_g->ne[0]) {
value = ggml_tensor_get_f32(chunk_hidden_states_g, i0 - chunk_hidden_states_l->ne[0], i1, i2);
}
ggml_tensor_set_f32(chunk_hidden_states_lg_pad, value, i0, i1, i2);
}
}
}
chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states_lg_pad, chunk_hidden_states_t5, 1); // [n_token*2, 4096]
if (chunk_idx == 0) {
pooled = ggml_tensor_concat(work_ctx, pooled_l, pooled_g, 0); // [768 + 1280]
}
int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
if (force_zero_embeddings) {
float* vec = (float*)chunk_hidden_states->data;
for (int i = 0; i < ggml_nelements(chunk_hidden_states); i++) {
vec[i] = 0;
}
}
hidden_states_vec.insert(hidden_states_vec.end(),
(float*)chunk_hidden_states->data,
((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states));
}
hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
hidden_states = ggml_reshape_2d(work_ctx,
hidden_states,
chunk_hidden_states->ne[0],
ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
return SDCondition(hidden_states, pooled, NULL);
}
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
const std::string& text,
int clip_skip,
int width,
int height,
int adm_in_channels = -1,
bool force_zero_embeddings = false) {
auto tokens_and_weights = tokenize(text, 77, true);
return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, force_zero_embeddings);
}
std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
int n_threads,
const std::string& text,
int clip_skip,
int width,
int height,
int num_input_imgs,
int adm_in_channels = -1,
bool force_zero_embeddings = false) {
GGML_ASSERT(0 && "Not implemented yet!");
}
std::string remove_trigger_from_prompt(ggml_context* work_ctx,
const std::string& prompt) {
GGML_ASSERT(0 && "Not implemented yet!");
}
};
#endif
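The weighting pass above is the usual prompt-emphasis trick: every token's hidden state is multiplied by its weight, then the whole tensor is rescaled so its mean matches the unweighted output, keeping the overall magnitude stable. A minimal CPU sketch of that renormalization (illustrative only, not part of the commit):

```cpp
#include <numeric>
#include <vector>

// hidden holds n_token rows of dim features; weights has one entry per token.
void apply_token_weights(std::vector<float>& hidden,
                         const std::vector<float>& weights,
                         size_t dim) {
    float original_mean = std::accumulate(hidden.begin(), hidden.end(), 0.0f) / hidden.size();
    for (size_t t = 0; t < weights.size(); t++) {
        for (size_t d = 0; d < dim; d++) {
            hidden[t * dim + d] *= weights[t];  // emphasize/de-emphasize token t
        }
    }
    float new_mean = std::accumulate(hidden.begin(), hidden.end(), 0.0f) / hidden.size();
    for (float& v : hidden) {
        v *= original_mean / new_mean;  // restore the original mean
    }
}
```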

View File

@ -306,7 +306,7 @@ public:
}
};
struct ControlNet : public GGMLModule {
struct ControlNet : public GGMLRunner {
SDVersion version = VERSION_1_x;
ControlNetBlock control_net;
@ -319,7 +319,7 @@ struct ControlNet : public GGMLModule {
ControlNet(ggml_backend_t backend,
ggml_type wtype,
SDVersion version = VERSION_1_x)
: GGMLModule(backend, wtype), control_net(version) {
: GGMLRunner(backend, wtype), control_net(version) {
control_net.init(params_ctx, wtype);
}
@ -426,7 +426,7 @@ struct ControlNet : public GGMLModule {
return build_graph(x, hint, timesteps, context, y);
};
GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
guided_hint_cached = true;
}

View File

@ -10,50 +10,14 @@
#define TIMESTEPS 1000
struct SigmaSchedule {
float alphas_cumprod[TIMESTEPS];
float sigmas[TIMESTEPS];
float log_sigmas[TIMESTEPS];
int version = 0;
typedef std::function<float(float)> t_to_sigma_t;
virtual std::vector<float> get_sigmas(uint32_t n) = 0;
float sigma_to_t(float sigma) {
float log_sigma = std::log(sigma);
std::vector<float> dists;
dists.reserve(TIMESTEPS);
for (float log_sigma_val : log_sigmas) {
dists.push_back(log_sigma - log_sigma_val);
}
int low_idx = 0;
for (size_t i = 0; i < TIMESTEPS; i++) {
if (dists[i] >= 0) {
low_idx++;
}
}
low_idx = std::min(std::max(low_idx - 1, 0), TIMESTEPS - 2);
int high_idx = low_idx + 1;
float low = log_sigmas[low_idx];
float high = log_sigmas[high_idx];
float w = (low - log_sigma) / (low - high);
w = std::max(0.f, std::min(1.f, w));
float t = (1.0f - w) * low_idx + w * high_idx;
return t;
}
float t_to_sigma(float t) {
int low_idx = static_cast<int>(std::floor(t));
int high_idx = static_cast<int>(std::ceil(t));
float w = t - static_cast<float>(low_idx);
float log_sigma = (1.0f - w) * log_sigmas[low_idx] + w * log_sigmas[high_idx];
return std::exp(log_sigma);
}
virtual std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) = 0;
};
struct DiscreteSchedule : SigmaSchedule {
std::vector<float> get_sigmas(uint32_t n) {
std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
std::vector<float> result;
int t_max = TIMESTEPS - 1;
@ -161,7 +125,7 @@ struct AYSSchedule : SigmaSchedule {
return results;
}
std::vector<float> get_sigmas(uint32_t len) {
std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
const std::vector<float> noise_levels[] = {
/* SD1.5 */
{14.6146412293f, 6.4745760956f, 3.8636745985f, 2.6946151520f,
@ -177,7 +141,7 @@ struct AYSSchedule : SigmaSchedule {
};
std::vector<float> inputs;
std::vector<float> results(len + 1);
std::vector<float> results(n + 1);
switch (version) {
case VERSION_2_x: /* fallthrough */
@ -201,26 +165,24 @@ struct AYSSchedule : SigmaSchedule {
/* Stretches those pre-calculated reference levels out to the desired
* size using log-linear interpolation */
if ((len + 1) != inputs.size()) {
results = log_linear_interpolation(inputs, len + 1);
if ((n + 1) != inputs.size()) {
results = log_linear_interpolation(inputs, n + 1);
} else {
results = inputs;
}
/* Not sure if this is strictly necessary */
results[len] = 0.0f;
results[n] = 0.0f;
return results;
}
};
struct KarrasSchedule : SigmaSchedule {
std::vector<float> get_sigmas(uint32_t n) {
std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
// These *COULD* be function arguments here,
// but does anybody ever bother to touch them?
float sigma_min = 0.1f;
float sigma_max = 10.f;
float rho = 7.f;
float rho = 7.f;
std::vector<float> result(n + 1);
@ -236,23 +198,89 @@ struct KarrasSchedule : SigmaSchedule {
};
struct Denoiser {
std::shared_ptr<SigmaSchedule> schedule = std::make_shared<DiscreteSchedule>();
virtual std::vector<float> get_scalings(float sigma) = 0;
};
std::shared_ptr<SigmaSchedule> schedule = std::make_shared<DiscreteSchedule>();
virtual float sigma_min() = 0;
virtual float sigma_max() = 0;
virtual float sigma_to_t(float sigma) = 0;
virtual float t_to_sigma(float t) = 0;
virtual std::vector<float> get_scalings(float sigma) = 0;
virtual ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) = 0;
virtual ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) = 0;
struct CompVisDenoiser : public Denoiser {
float sigma_data = 1.0f;
std::vector<float> get_scalings(float sigma) {
float c_out = -sigma;
float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
return {c_out, c_in};
virtual std::vector<float> get_sigmas(uint32_t n) {
auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1);
return schedule->get_sigmas(n, sigma_min(), sigma_max(), bound_t_to_sigma);
}
};
struct CompVisVDenoiser : public Denoiser {
struct CompVisDenoiser : public Denoiser {
float sigmas[TIMESTEPS];
float log_sigmas[TIMESTEPS];
float sigma_data = 1.0f;
float sigma_min() {
return sigmas[0];
}
float sigma_max() {
return sigmas[TIMESTEPS - 1];
}
float sigma_to_t(float sigma) {
float log_sigma = std::log(sigma);
std::vector<float> dists;
dists.reserve(TIMESTEPS);
for (float log_sigma_val : log_sigmas) {
dists.push_back(log_sigma - log_sigma_val);
}
int low_idx = 0;
for (size_t i = 0; i < TIMESTEPS; i++) {
if (dists[i] >= 0) {
low_idx++;
}
}
low_idx = std::min(std::max(low_idx - 1, 0), TIMESTEPS - 2);
int high_idx = low_idx + 1;
float low = log_sigmas[low_idx];
float high = log_sigmas[high_idx];
float w = (low - log_sigma) / (low - high);
w = std::max(0.f, std::min(1.f, w));
float t = (1.0f - w) * low_idx + w * high_idx;
return t;
}
float t_to_sigma(float t) {
int low_idx = static_cast<int>(std::floor(t));
int high_idx = static_cast<int>(std::ceil(t));
float w = t - static_cast<float>(low_idx);
float log_sigma = (1.0f - w) * log_sigmas[low_idx] + w * log_sigmas[high_idx];
return std::exp(log_sigma);
}
std::vector<float> get_scalings(float sigma) {
float c_skip = 1.0f;
float c_out = -sigma;
float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
return {c_skip, c_out, c_in};
}
// this function will modify noise/latent
ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) {
ggml_tensor_scale(noise, sigma);
ggml_tensor_add(latent, noise);
return latent;
}
ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) {
return latent;
}
};
struct CompVisVDenoiser : public CompVisDenoiser {
std::vector<float> get_scalings(float sigma) {
float c_skip = sigma_data * sigma_data / (sigma * sigma + sigma_data * sigma_data);
float c_out = -sigma * sigma_data / std::sqrt(sigma * sigma + sigma_data * sigma_data);
@ -261,6 +289,67 @@ struct CompVisVDenoiser : public Denoiser {
}
};
float time_snr_shift(float alpha, float t) {
if (alpha == 1.0f) {
return t;
}
return alpha * t / (1 + (alpha - 1) * t);
}
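With SD3's default shift of 3.0, this remaps normalized time toward higher noise levels; e.g. t = 0.5 becomes 3·0.5 / (1 + 2·0.5) = 0.75. A quick numeric check (illustrative):

```cpp
#include <cstdio>

static float time_snr_shift(float alpha, float t) {
    if (alpha == 1.0f) return t;
    return alpha * t / (1 + (alpha - 1) * t);
}

int main() {
    for (float t : {0.1f, 0.5f, 0.9f}) {
        printf("t=%.1f -> %.4f\n", t, time_snr_shift(3.0f, t));
    }
    // prints 0.2500, 0.7500, 0.9643: mid-range timesteps shift toward 1.0
    return 0;
}
```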
struct DiscreteFlowDenoiser : public Denoiser {
float sigmas[TIMESTEPS];
float shift = 3.0f;
float sigma_data = 1.0f;
DiscreteFlowDenoiser() {
set_parameters();
}
void set_parameters() {
for (int i = 1; i < TIMESTEPS + 1; i++) {
sigmas[i - 1] = t_to_sigma(i);
}
}
float sigma_min() {
return sigmas[0];
}
float sigma_max() {
return sigmas[TIMESTEPS - 1];
}
float sigma_to_t(float sigma) {
return sigma * 1000.f;
}
float t_to_sigma(float t) {
t = t + 1;
return time_snr_shift(shift, t / 1000.f);
}
std::vector<float> get_scalings(float sigma) {
float c_skip = 1.0f;
float c_out = -sigma;
float c_in = 1.0f;
return {c_skip, c_out, c_in};
}
// this function will modify noise/latent
ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) {
ggml_tensor_scale(noise, sigma);
ggml_tensor_scale(latent, 1.0f - sigma);
ggml_tensor_add(latent, noise);
return latent;
}
ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) {
ggml_tensor_scale(latent, 1.0f / (1.0f - sigma));
return latent;
}
};
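DiscreteFlowDenoiser implements the rectified-flow interpolation SD3 is trained on: noise_scaling forms x_sigma = (1 - sigma) * latent + sigma * noise, and inverse_noise_scaling removes the residual (1 - sigma) factor after sampling. A scalar round-trip sketch (illustrative only):

```cpp
#include <cstdio>

int main() {
    float latent = 2.0f, noise = -1.0f, sigma = 0.25f;
    // noise_scaling: blend clean latent and noise
    float x_sigma = (1.0f - sigma) * latent + sigma * noise;
    // stand-in for the sampler removing the noise component
    float denoised = x_sigma - sigma * noise;
    // inverse_noise_scaling: undo the (1 - sigma) factor
    float recovered = denoised / (1.0f - sigma);
    printf("x_sigma=%.3f recovered=%.3f\n", x_sigma, recovered);  // recovered == 2.000
    return 0;
}
```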
typedef std::function<ggml_tensor*(ggml_tensor*, float, int)> denoise_cb_t;
// k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t

123
diffusion_model.hpp Normal file
View File

@ -0,0 +1,123 @@
#ifndef __DIFFUSION_MODEL_H__
#define __DIFFUSION_MODEL_H__
#include "mmdit.hpp"
#include "unet.hpp"
struct DiffusionModel {
virtual void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL) = 0;
virtual void alloc_params_buffer() = 0;
virtual void free_params_buffer() = 0;
virtual void free_compute_buffer() = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
virtual size_t get_params_buffer_size() = 0;
virtual int64_t get_adm_in_channels() = 0;
};
struct UNetModel : public DiffusionModel {
UNetModelRunner unet;
UNetModel(ggml_backend_t backend,
ggml_type wtype,
SDVersion version = VERSION_1_x)
: unet(backend, wtype, version) {
}
void alloc_params_buffer() {
unet.alloc_params_buffer();
}
void free_params_buffer() {
unet.free_params_buffer();
}
void free_compute_buffer() {
unet.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
unet.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() {
return unet.get_params_buffer_size();
}
int64_t get_adm_in_channels() {
return unet.unet.adm_in_channels;
}
void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL) {
return unet.compute(n_threads, x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength, output, output_ctx);
}
};
struct MMDiTModel : public DiffusionModel {
MMDiTRunner mmdit;
MMDiTModel(ggml_backend_t backend,
ggml_type wtype,
SDVersion version = VERSION_3_2B)
: mmdit(backend, wtype, version) {
}
void alloc_params_buffer() {
mmdit.alloc_params_buffer();
}
void free_params_buffer() {
mmdit.free_params_buffer();
}
void free_compute_buffer() {
mmdit.free_compute_buffer();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
mmdit.get_param_tensors(tensors, "model.diffusion_model");
}
size_t get_params_buffer_size() {
return mmdit.get_params_buffer_size();
}
int64_t get_adm_in_channels() {
return 768 + 1280;
}
void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL) {
return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx);
}
};
#endif
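With both backbones behind one interface, the pipeline can hold a single std::shared_ptr<DiffusionModel> and pick the implementation from the detected checkpoint version. A hypothetical factory sketch using this header's types (make_diffusion_model is not part of the commit):

```cpp
#include <memory>

// UNet for SD1.x/2.x/XL/SVD, MMDiT for SD3.
std::shared_ptr<DiffusionModel> make_diffusion_model(ggml_backend_t backend,
                                                     ggml_type wtype,
                                                     SDVersion version) {
    if (version == VERSION_3_2B) {
        return std::make_shared<MMDiTModel>(backend, wtype, version);
    }
    return std::make_shared<UNetModel>(backend, wtype, version);
}
```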

View File

@ -137,14 +137,14 @@ public:
}
};
struct ESRGAN : public GGMLModule {
struct ESRGAN : public GGMLRunner {
RRDBNet rrdb_net;
int scale = 4;
int tile_size = 128; // avoid cuda OOM for 4gb VRAM
ESRGAN(ggml_backend_t backend,
ggml_type wtype)
: GGMLModule(backend, wtype) {
: GGMLRunner(backend, wtype) {
rrdb_net.init(params_ctx, wtype);
}
@ -191,7 +191,7 @@ struct ESRGAN : public GGMLModule {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x);
};
GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
}
};

View File

@ -7,7 +7,9 @@
#include <vector>
// #include "preprocessing.hpp"
#include "mmdit.hpp"
#include "stable-diffusion.h"
#include "t5.hpp"
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_STATIC
@ -626,6 +628,7 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
int main(int argc, const char* argv[]) {
SDParams params;
parse_args(argc, argv, params);
sd_set_log_callback(sd_log_cb, (void*)&params);

2
ggml

@ -1 +1 @@
Subproject commit 9d562d712513c77a4de44ad0428be62bc3f2a9cf
Subproject commit 34a63747c4f0edf952267c3d0c1c1ef3dd9fe827

View File

@ -75,6 +75,16 @@ __STATIC_INLINE__ float ggml_tensor_get_f32(const ggml_tensor* tensor, int l, in
return *(float*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]);
}
__STATIC_INLINE__ int ggml_tensor_get_i32(const ggml_tensor* tensor, int l, int k = 0, int j = 0, int i = 0) {
if (tensor->buffer != NULL) {
int value;
ggml_backend_tensor_get(tensor, &value, i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0], sizeof(int));
return value;
}
GGML_ASSERT(tensor->nb[0] == sizeof(int));
return *(int*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]);
}
__STATIC_INLINE__ ggml_fp16_t ggml_tensor_get_f16(const ggml_tensor* tensor, int l, int k = 0, int j = 0, int i = 0) {
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return *(ggml_fp16_t*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]);
@ -126,6 +136,8 @@ __STATIC_INLINE__ void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_
printf(" [%d, %d, %d, %d] = %f\n", i, j, k, l, ggml_tensor_get_f32(tensor, l, k, j, i));
} else if (tensor->type == GGML_TYPE_F16) {
printf(" [%d, %d, %d, %d] = %i\n", i, j, k, l, ggml_tensor_get_f16(tensor, l, k, j, i));
} else if (tensor->type == GGML_TYPE_I32) {
printf(" [%d, %d, %d, %d] = %i\n", i, j, k, l, ggml_tensor_get_i32(tensor, l, k, j, i));
}
fflush(stdout);
}
@ -401,6 +413,42 @@ __STATIC_INLINE__ void ggml_tensor_clamp(struct ggml_tensor* src, float min, flo
}
}
__STATIC_INLINE__ struct ggml_tensor* ggml_tensor_concat(struct ggml_context* ctx,
struct ggml_tensor* a,
struct ggml_tensor* b,
int dim) {
int64_t ne[GGML_MAX_DIMS];
for (int d = 0; d < GGML_MAX_DIMS; ++d) {
if (d == dim) {
ne[d] = a->ne[d] + b->ne[d];
continue;
}
GGML_ASSERT(a->ne[d] == b->ne[d]);
ne[d] = a->ne[d];
}
struct ggml_tensor* result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
int64_t o[4] = {0, 0, 0, 0};
o[dim] = a->ne[dim];
float v;
for (int i3 = 0; i3 < result->ne[3]; i3++) {
for (int i2 = 0; i2 < result->ne[2]; i2++) {
for (int i1 = 0; i1 < result->ne[1]; i1++) {
for (int i0 = 0; i0 < result->ne[0]; i0++) {
if (i0 < a->ne[0] && i1 < a->ne[1] && i2 < a->ne[2] && i3 < a->ne[3]) {
v = ggml_tensor_get_f32(a, i0, i1, i2, i3);
} else {
v = ggml_tensor_get_f32(b, i0 - o[0], i1 - o[1], i2 - o[2], i3 - o[3]);
}
ggml_tensor_set_f32(result, v, i0, i1, i2, i3);
}
}
}
}
return result;
}
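ggml_tensor_concat copies element by element on the CPU (via ggml_tensor_get_f32/ggml_tensor_set_f32), so it is meant for work-context tensors rather than backend graphs. A usage sketch matching the conditioner above (variables assumed from that context):

```cpp
// pooled_l: [768], pooled_g: [1280] -> pooled: [2048] along dim 0
struct ggml_tensor* pooled = ggml_tensor_concat(work_ctx, pooled_l, pooled_g, 0);
// chunk_hidden_states_lg_pad: [n_token, 4096], chunk_hidden_states_t5: [n_token, 4096]
// -> [n_token * 2, 4096] along dim 1
struct ggml_tensor* hs = ggml_tensor_concat(work_ctx, chunk_hidden_states_lg_pad,
                                            chunk_hidden_states_t5, 1);
```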
// convert values from [0, 1] to [-1, 1]
__STATIC_INLINE__ void ggml_tensor_scale_input(struct ggml_tensor* src) {
int64_t nelements = ggml_nelements(src);
@ -605,6 +653,56 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention(struct ggml_context* ctx
return kqv;
}
// q: [N, L_q, C]
// k: [N, L_k, C]
// v: [N, L_k, C]
// return: [N, L_q, C]
__STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context* ctx,
struct ggml_tensor* q,
struct ggml_tensor* k,
struct ggml_tensor* v,
int64_t n_head,
struct ggml_tensor* mask = NULL,
bool diag_mask_inf = false) {
int64_t L_q = q->ne[1];
int64_t L_k = k->ne[1];
int64_t C = q->ne[0];
int64_t N = q->ne[2];
int64_t d_head = C / n_head;
float scale = (1.0f / sqrt((float)d_head));
q = ggml_reshape_4d(ctx, q, d_head, n_head, L_q, N); // [N, L_q, n_head, d_head]
q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, L_q, d_head]
q = ggml_reshape_3d(ctx, q, d_head, L_q, n_head * N); // [N * n_head, L_q, d_head]
k = ggml_reshape_4d(ctx, k, d_head, n_head, L_k, N); // [N, L_k, n_head, d_head]
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, L_k, d_head]
k = ggml_reshape_3d(ctx, k, d_head, L_k, n_head * N); // [N * n_head, L_k, d_head]
v = ggml_reshape_4d(ctx, v, d_head, n_head, L_k, N); // [N, L_k, n_head, d_head]
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, L_k]
v = ggml_reshape_3d(ctx, v, L_k, d_head, n_head * N); // [N * n_head, d_head, L_k]
auto kq = ggml_mul_mat(ctx, k, q); // [N * n_head, L_q, L_k]
kq = ggml_scale_inplace(ctx, kq, scale);
if (mask) {
kq = ggml_add(ctx, kq, mask);
}
if (diag_mask_inf) {
kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
}
kq = ggml_soft_max_inplace(ctx, kq);
auto kqv = ggml_mul_mat(ctx, v, kq); // [N * n_head, L_q, d_head]
kqv = ggml_reshape_4d(ctx, kqv, d_head, L_q, n_head, N); // [N, n_head, L_q, d_head]
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, L_q, n_head, d_head]
kqv = ggml_reshape_3d(ctx, kqv, d_head * n_head, L_q, N); // [N, L_q, C]
return kqv;
}
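ggml_nn_attention_ext is plain multi-head scaled dot-product attention, softmax(Q·K^T / sqrt(d_head))·V, generalized to L_q != L_k so the same helper serves self-attention and MMDiT's joint text/image attention. A shape-level sketch (tensor names assumed):

```cpp
// q: [N, L_q, C], k/v: [N, L_k, C] with C = n_head * d_head
// mask = NULL, diag_mask_inf = false -> fully bidirectional attention
struct ggml_tensor* out = ggml_nn_attention_ext(ctx, q, k, v, /*n_head=*/24);
// out: [N, L_q, C]
```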
__STATIC_INLINE__ struct ggml_tensor* ggml_nn_layer_norm(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* w,
@ -764,7 +862,7 @@ __STATIC_INLINE__ size_t ggml_tensor_num(ggml_context* ctx) {
#define MAX_PARAMS_TENSOR_NUM 15360
#define MAX_GRAPH_SIZE 15360
struct GGMLModule {
struct GGMLRunner {
protected:
typedef std::function<struct ggml_cgraph*()> get_graph_cb_t;
@ -852,12 +950,12 @@ protected:
public:
virtual std::string get_desc() = 0;
GGMLModule(ggml_backend_t backend, ggml_type wtype = GGML_TYPE_F32)
GGMLRunner(ggml_backend_t backend, ggml_type wtype = GGML_TYPE_F32)
: backend(backend), wtype(wtype) {
alloc_params_ctx();
}
virtual ~GGMLModule() {
virtual ~GGMLRunner() {
free_params_buffer();
free_compute_buffer();
free_params_ctx();
@ -873,7 +971,9 @@ public:
size_t num_tensors = ggml_tensor_num(params_ctx);
params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, backend);
if (params_buffer == NULL) {
LOG_ERROR("%s alloc params backend buffer failed", get_desc().c_str());
LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
get_desc().c_str(),
num_tensors);
return false;
}
size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
@ -1068,6 +1168,40 @@ public:
}
};
class Embedding : public UnaryBlock {
protected:
int64_t embedding_dim;
int64_t num_embeddings;
void init_params(struct ggml_context* ctx, ggml_type wtype) {
params["weight"] = ggml_new_tensor_2d(ctx, wtype, embedding_dim, num_embeddings);
}
public:
Embedding(int64_t num_embeddings, int64_t embedding_dim)
: embedding_dim(embedding_dim),
num_embeddings(num_embeddings) {
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* input_ids) {
// input_ids: [N, n_token]
auto weight = params["weight"];
// There are issues with ggml batch inference, so we are expanding it here first.
// TODO: fix ggml batch inference
int64_t n = input_ids->ne[1];
input_ids = ggml_reshape_1d(ctx, input_ids, input_ids->ne[0] * input_ids->ne[1]);
input_ids = ggml_reshape_3d(ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
auto embedding = ggml_get_rows(ctx, weight, input_ids);
embedding = ggml_reshape_3d(ctx, embedding, embedding->ne[0], embedding->ne[1] / n, n);
// [N, n_token, embedding_dim]
return embedding;
}
};
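The Embedding block is a lookup table: ggml_get_rows gathers one weight row per token id. A CPU analogue of the forward pass (illustrative only):

```cpp
#include <vector>

// weight: [num_embeddings * dim] row-major; returns ids.size() * dim floats.
std::vector<float> embed_lookup(const std::vector<float>& weight,
                                const std::vector<int>& ids,
                                size_t dim) {
    std::vector<float> out;
    out.reserve(ids.size() * dim);
    for (int id : ids) {
        out.insert(out.end(),
                   weight.begin() + id * dim,
                   weight.begin() + (id + 1) * dim);
    }
    return out;
}
```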
class Conv2d : public UnaryBlock {
protected:
int64_t in_channels;
@ -1241,53 +1375,44 @@ class MultiheadAttention : public GGMLBlock {
protected:
int64_t embed_dim;
int64_t n_head;
bool bias;
bool mask;
std::string q_proj_name;
std::string k_proj_name;
std::string v_proj_name;
std::string out_proj_name;
public:
MultiheadAttention(int64_t embed_dim,
int64_t n_head,
bool bias = true)
bool qkv_proj_bias = true,
bool out_proj_bias = true,
std::string q_proj_name = "q_proj",
std::string k_proj_name = "k_proj",
std::string v_proj_name = "v_proj",
std::string out_proj_name = "out_proj")
: embed_dim(embed_dim),
n_head(n_head),
bias(bias) {
blocks["q_proj"] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, bias));
blocks["k_proj"] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, bias));
blocks["v_proj"] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, bias));
blocks["out_proj"] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, bias));
q_proj_name(q_proj_name),
k_proj_name(k_proj_name),
v_proj_name(v_proj_name),
out_proj_name(out_proj_name) {
blocks[q_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
blocks[k_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
blocks[v_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
blocks[out_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, out_proj_bias));
}
// x: [N, n_token, embed_dim]
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, bool mask = false) {
auto q_proj = std::dynamic_pointer_cast<Linear>(blocks["q_proj"]);
auto k_proj = std::dynamic_pointer_cast<Linear>(blocks["k_proj"]);
auto v_proj = std::dynamic_pointer_cast<Linear>(blocks["v_proj"]);
auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["out_proj"]);
int64_t N = x->ne[2];
int64_t n_token = x->ne[1];
int64_t d_head = embed_dim / n_head;
auto q_proj = std::dynamic_pointer_cast<Linear>(blocks[q_proj_name]);
auto k_proj = std::dynamic_pointer_cast<Linear>(blocks[k_proj_name]);
auto v_proj = std::dynamic_pointer_cast<Linear>(blocks[v_proj_name]);
auto out_proj = std::dynamic_pointer_cast<Linear>(blocks[out_proj_name]);
struct ggml_tensor* q = q_proj->forward(ctx, x);
q = ggml_reshape_4d(ctx, q, d_head, n_head, n_token, N); // [N, n_token, n_head, d_head]
q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, n_token, d_head]
q = ggml_reshape_3d(ctx, q, d_head, n_token, n_head * N); // [N * n_head, n_token, d_head]
struct ggml_tensor* k = k_proj->forward(ctx, x);
k = ggml_reshape_4d(ctx, k, d_head, n_head, n_token, N); // [N, n_token, n_head, d_head]
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, n_token, d_head]
k = ggml_reshape_3d(ctx, k, d_head, n_token, n_head * N); // [N * n_head, n_token, d_head]
struct ggml_tensor* v = v_proj->forward(ctx, x);
v = ggml_reshape_4d(ctx, v, d_head, n_head, n_token, N); // [N, n_token, n_head, d_head]
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, n_token]
v = ggml_reshape_3d(ctx, v, n_token, d_head, n_head * N); // [N * n_head, d_head, n_token]
struct ggml_tensor* kqv = ggml_nn_attention(ctx, q, k, v, mask); // [N * n_head, n_token, d_head]
kqv = ggml_reshape_4d(ctx, kqv, d_head, n_token, n_head, N);
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, n_token, n_head, d_head]
x = ggml_reshape_3d(ctx, kqv, d_head * n_head, n_token, N); // [N, n_token, d_head * n_head]
x = ggml_nn_attention_ext(ctx, q, k, v, n_head, NULL, mask); // [N, n_token, embed_dim]
x = out_proj->forward(ctx, x); // [N, n_token, embed_dim]
return x;

View File

@ -5,7 +5,7 @@
#define LORA_GRAPH_SIZE 10240
struct LoraModel : public GGMLModule {
struct LoraModel : public GGMLRunner {
float multiplier = 1.0f;
std::map<std::string, struct ggml_tensor*> lora_tensors;
std::string file_path;
@ -17,7 +17,7 @@ struct LoraModel : public GGMLModule {
ggml_type wtype,
const std::string& file_path = "",
const std::string& prefix = "")
: file_path(file_path), GGMLModule(backend, wtype) {
: file_path(file_path), GGMLRunner(backend, wtype) {
if (!model_loader.init_from_file(file_path, prefix)) {
load_failed = true;
}
@ -182,7 +182,7 @@ struct LoraModel : public GGMLModule {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_lora_graph(model_tensors);
};
GGMLModule::compute(get_graph, n_threads, true);
GGMLRunner::compute(get_graph, n_threads, true);
}
};

795
mmdit.hpp Normal file
View File

@ -0,0 +1,795 @@
#ifndef __MMDIT_HPP__
#define __MMDIT_HPP__
#include "ggml_extend.hpp"
#include "model.h"
#define MMDIT_GRAPH_SIZE 10240
struct Mlp : public GGMLBlock {
public:
Mlp(int64_t in_features,
int64_t hidden_features = -1,
int64_t out_features = -1,
bool bias = true) {
// act_layer is always lambda: nn.GELU(approximate="tanh")
// norm_layer is always None
// use_conv is always False
if (hidden_features == -1) {
hidden_features = in_features;
}
if (out_features == -1) {
out_features = in_features;
}
blocks["fc1"] = std::shared_ptr<GGMLBlock>(new Linear(in_features, hidden_features, bias));
blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, n_token, in_features]
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
x = fc1->forward(ctx, x);
x = ggml_gelu_inplace(ctx, x);
x = fc2->forward(ctx, x);
return x;
}
};
struct PatchEmbed : public GGMLBlock {
// 2D Image to Patch Embedding
protected:
bool flatten;
bool dynamic_img_pad;
int patch_size;
public:
PatchEmbed(int64_t img_size = 224,
int patch_size = 16,
int64_t in_chans = 3,
int64_t embed_dim = 1536,
bool bias = true,
bool flatten = true,
bool dynamic_img_pad = true)
: patch_size(patch_size),
flatten(flatten),
dynamic_img_pad(dynamic_img_pad) {
// img_size is always None
// patch_size is always 2
// in_chans is always 16
// norm_layer is always False
// strict_img_size is always true, but not used
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_chans,
embed_dim,
{patch_size, patch_size},
{patch_size, patch_size},
{0, 0},
{1, 1},
bias));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, C, H, W]
// return: [N, H*W, embed_dim]
auto proj = std::dynamic_pointer_cast<Conv2d>(blocks["proj"]);
if (dynamic_img_pad) {
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int pad_h = (patch_size - H % patch_size) % patch_size;
int pad_w = (patch_size - W % patch_size) % patch_size;
x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // TODO: reflect pad mode
}
x = proj->forward(ctx, x);
if (flatten) {
x = ggml_reshape_3d(ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]);
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));
}
return x;
}
};
struct TimestepEmbedder : public GGMLBlock {
// Embeds scalar timesteps into vector representations.
protected:
int64_t frequency_embedding_size;
public:
TimestepEmbedder(int64_t hidden_size,
int64_t frequency_embedding_size = 256)
: frequency_embedding_size(frequency_embedding_size) {
blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size));
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* t) {
// t: [N, ]
// return: [N, hidden_size]
auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
auto t_freq = ggml_nn_timestep_embedding(ctx, t, frequency_embedding_size); // [N, frequency_embedding_size]
auto t_emb = mlp_0->forward(ctx, t_freq);
t_emb = ggml_silu_inplace(ctx, t_emb);
t_emb = mlp_2->forward(ctx, t_emb);
return t_emb;
}
};
struct VectorEmbedder : public GGMLBlock {
// Embeds a flat vector of dimension input_dim
public:
VectorEmbedder(int64_t input_dim,
int64_t hidden_size) {
blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(input_dim, hidden_size));
blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, input_dim]
// return: [N, hidden_size]
auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
x = mlp_0->forward(ctx, x);
x = ggml_silu_inplace(ctx, x);
x = mlp_2->forward(ctx, x);
return x;
}
};
__STATIC_INLINE__ std::vector<struct ggml_tensor*> split_qkv(struct ggml_context* ctx,
struct ggml_tensor* qkv) {
// qkv: [N, L, 3*C]
// return: ([N, L, C], [N, L, C], [N, L, C])
qkv = ggml_reshape_4d(ctx, qkv, qkv->ne[0] / 3, 3, qkv->ne[1], qkv->ne[2]); // [N, L, 3, C]
qkv = ggml_cont(ctx, ggml_permute(ctx, qkv, 0, 3, 1, 2)); // [3, N, L, C]
int64_t offset = qkv->nb[2] * qkv->ne[2];
auto q = ggml_view_3d(ctx, qkv, qkv->ne[0], qkv->ne[1], qkv->ne[2], qkv->nb[1], qkv->nb[2], offset * 0); // [N, L, C]
auto k = ggml_view_3d(ctx, qkv, qkv->ne[0], qkv->ne[1], qkv->ne[2], qkv->nb[1], qkv->nb[2], offset * 1); // [N, L, C]
auto v = ggml_view_3d(ctx, qkv, qkv->ne[0], qkv->ne[1], qkv->ne[2], qkv->nb[1], qkv->nb[2], offset * 2); // [N, L, C]
return {q, k, v};
}
class SelfAttention : public GGMLBlock {
public:
int64_t num_heads;
bool pre_only;
public:
SelfAttention(int64_t dim,
int64_t num_heads = 8,
bool qkv_bias = false,
bool pre_only = false)
: num_heads(num_heads), pre_only(pre_only) {
// qk_norm is always None
blocks["qkv"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias));
if (!pre_only) {
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim));
}
}
std::vector<struct ggml_tensor*> pre_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
auto qkv = qkv_proj->forward(ctx, x);
return split_qkv(ctx, qkv);
}
struct ggml_tensor* post_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
GGML_ASSERT(!pre_only);
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
x = proj->forward(ctx, x); // [N, n_token, dim]
return x;
}
// x: [N, n_token, dim]
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
auto qkv = pre_attention(ctx, x);
x = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], num_heads); // [N, n_token, dim]
x = post_attention(ctx, x); // [N, n_token, dim]
return x;
}
};
__STATIC_INLINE__ struct ggml_tensor* modulate(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* shift,
struct ggml_tensor* scale) {
// x: [N, L, C]
// scale: [N, C]
// shift: [N, C]
scale = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]); // [N, 1, C]
shift = ggml_reshape_3d(ctx, shift, shift->ne[0], 1, shift->ne[1]); // [N, 1, C]
x = ggml_add(ctx, x, ggml_mul(ctx, x, scale));
x = ggml_add(ctx, x, shift);
return x;
}
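modulate applies the adaLN affine out = x * (1 + scale) + shift, with per-channel scale/shift broadcast across all tokens. A scalar reference version (illustrative):

```cpp
#include <vector>

// x: [n_token][hidden_size]; shift/scale: [hidden_size]
void modulate_ref(std::vector<std::vector<float>>& x,
                  const std::vector<float>& shift,
                  const std::vector<float>& scale) {
    for (auto& token : x) {
        for (size_t c = 0; c < token.size(); c++) {
            token[c] = token[c] * (1.0f + scale[c]) + shift[c];
        }
    }
}
```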
struct DismantledBlock : public GGMLBlock {
// A DiT block with gated adaptive layer norm (adaLN) conditioning.
public:
int64_t num_heads;
bool pre_only;
public:
DismantledBlock(int64_t hidden_size,
int64_t num_heads,
float mlp_ratio = 4.0,
bool qkv_bias = false,
bool pre_only = false)
: num_heads(num_heads), pre_only(pre_only) {
// rmsnorm is always False
// scale_mod_only is always False
// swiglu is always False
// qk_norm is always False
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
blocks["attn"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, pre_only));
if (!pre_only) {
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
int64_t mlp_hidden_dim = (int64_t)(hidden_size * mlp_ratio);
blocks["mlp"] = std::shared_ptr<GGMLBlock>(new Mlp(hidden_size, mlp_hidden_dim));
}
int64_t n_mods = 6;
if (pre_only) {
n_mods = 2;
}
blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, n_mods * hidden_size));
}
std::pair<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* c) {
// x: [N, n_token, hidden_size]
// c: [N, hidden_size]
auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
auto attn = std::dynamic_pointer_cast<SelfAttention>(blocks["attn"]);
auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
int64_t n_mods = 6;
if (pre_only) {
n_mods = 2;
}
auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c)); // [N, n_mods * hidden_size]
m = ggml_reshape_3d(ctx, m, c->ne[0], n_mods, c->ne[1]); // [N, n_mods, hidden_size]
m = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3)); // [n_mods, N, hidden_size]
int64_t offset = m->nb[1] * m->ne[1];
auto shift_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size]
auto scale_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]
if (!pre_only) {
auto gate_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2); // [N, hidden_size]
auto shift_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3); // [N, hidden_size]
auto scale_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4); // [N, hidden_size]
auto gate_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5); // [N, hidden_size]
auto attn_in = modulate(ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
auto qkv = attn->pre_attention(ctx, attn_in);
return {qkv, {x, gate_msa, shift_mlp, scale_mlp, gate_mlp}};
} else {
auto attn_in = modulate(ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
auto qkv = attn->pre_attention(ctx, attn_in);
return {qkv, {NULL, NULL, NULL, NULL, NULL}};
}
}
struct ggml_tensor* post_attention(struct ggml_context* ctx,
struct ggml_tensor* attn_out,
struct ggml_tensor* x,
struct ggml_tensor* gate_msa,
struct ggml_tensor* shift_mlp,
struct ggml_tensor* scale_mlp,
struct ggml_tensor* gate_mlp) {
// attn_out: [N, n_token, hidden_size]
// x: [N, n_token, hidden_size]
// gate_msa: [N, hidden_size]
// shift_mlp: [N, hidden_size]
// scale_mlp: [N, hidden_size]
// gate_mlp: [N, hidden_size]
// return: [N, n_token, hidden_size]
GGML_ASSERT(!pre_only);
auto attn = std::dynamic_pointer_cast<SelfAttention>(blocks["attn"]);
auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
auto mlp = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]);
gate_msa = ggml_reshape_3d(ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]); // [N, 1, hidden_size]
gate_mlp = ggml_reshape_3d(ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]); // [N, 1, hidden_size]
attn_out = attn->post_attention(ctx, attn_out);
x = ggml_add(ctx, x, ggml_mul(ctx, attn_out, gate_msa));
auto mlp_out = mlp->forward(ctx, modulate(ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
x = ggml_add(ctx, x, ggml_mul(ctx, mlp_out, gate_mlp));
return x;
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* c) {
// x: [N, n_token, hidden_size]
// c: [N, hidden_size]
// return: [N, n_token, hidden_size]
auto attn = std::dynamic_pointer_cast<SelfAttention>(blocks["attn"]);
auto qkv_intermediates = pre_attention(ctx, x, c);
auto qkv = qkv_intermediates.first;
auto intermediates = qkv_intermediates.second;
auto attn_out = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], num_heads); // [N, n_token, dim]
x = post_attention(ctx,
attn_out,
intermediates[0],
intermediates[1],
intermediates[2],
intermediates[3],
intermediates[4]);
return x; // [N, n_token, dim]
}
};
__STATIC_INLINE__ std::pair<struct ggml_tensor*, struct ggml_tensor*> block_mixing(struct ggml_context* ctx,
struct ggml_tensor* context,
struct ggml_tensor* x,
struct ggml_tensor* c,
std::shared_ptr<DismantledBlock> context_block,
std::shared_ptr<DismantledBlock> x_block) {
// context: [N, n_context, hidden_size]
// x: [N, n_token, hidden_size]
// c: [N, hidden_size]
auto context_qkv_intermediates = context_block->pre_attention(ctx, context, c);
auto context_qkv = context_qkv_intermediates.first;
auto context_intermediates = context_qkv_intermediates.second;
auto x_qkv_intermediates = x_block->pre_attention(ctx, x, c);
auto x_qkv = x_qkv_intermediates.first;
auto x_intermediates = x_qkv_intermediates.second;
std::vector<struct ggml_tensor*> qkv;
for (int i = 0; i < 3; i++) {
qkv.push_back(ggml_concat(ctx, context_qkv[i], x_qkv[i], 1));
}
auto attn = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], x_block->num_heads); // [N, n_context + n_token, hidden_size]
attn = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3)); // [n_context + n_token, N, hidden_size]
auto context_attn = ggml_view_3d(ctx,
attn,
attn->ne[0],
attn->ne[1],
context->ne[1],
attn->nb[1],
attn->nb[2],
0); // [n_context, N, hidden_size]
context_attn = ggml_cont(ctx, ggml_permute(ctx, context_attn, 0, 2, 1, 3)); // [N, n_context, hidden_size]
auto x_attn = ggml_view_3d(ctx,
attn,
attn->ne[0],
attn->ne[1],
x->ne[1],
attn->nb[1],
attn->nb[2],
attn->nb[2] * context->ne[1]); // [n_token, N, hidden_size]
x_attn = ggml_cont(ctx, ggml_permute(ctx, x_attn, 0, 2, 1, 3)); // [N, n_token, hidden_size]
if (!context_block->pre_only) {
context = context_block->post_attention(ctx,
context_attn,
context_intermediates[0],
context_intermediates[1],
context_intermediates[2],
context_intermediates[3],
context_intermediates[4]);
} else {
context = NULL;
}
x = x_block->post_attention(ctx,
x_attn,
x_intermediates[0],
x_intermediates[1],
x_intermediates[2],
x_intermediates[3],
x_intermediates[4]);
return {context, x};
}
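block_mixing is the heart of MM-DiT: text and image tokens are projected by their own blocks, their q/k/v are concatenated along the token axis, one attention pass runs over the combined sequence, and the output is split back so each stream gets its own gated MLP. Schematically (a sketch of the layout above):

```cpp
// sequence seen by attention:  [ context (n_context) | image (n_token) ]
// attn                      -> [N, n_context + n_token, hidden_size]
// context_attn = attn[:, :n_context, :]  -> context_block->post_attention(...)
// x_attn       = attn[:, n_context:, :]  -> x_block->post_attention(...)
```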
struct JointBlock : public GGMLBlock {
public:
JointBlock(int64_t hidden_size,
int64_t num_heads,
float mlp_ratio = 4.0,
bool qkv_bias = false,
bool pre_only = false) {
// qk_norm is always False
blocks["context_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qkv_bias, pre_only));
blocks["x_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qkv_bias, false));
}
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
struct ggml_tensor* context,
struct ggml_tensor* x,
struct ggml_tensor* c) {
auto context_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["context_block"]);
auto x_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["x_block"]);
return block_mixing(ctx, context, x, c, context_block, x_block);
}
};
struct FinalLayer : public GGMLBlock {
// The final layer of DiT.
public:
FinalLayer(int64_t hidden_size,
int64_t patch_size,
int64_t out_channels) {
// total_out_channels is always None
blocks["norm_final"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels));
blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* c) {
// x: [N, n_token, hidden_size]
// c: [N, hidden_size]
// return: [N, n_token, patch_size * patch_size * out_channels]
auto norm_final = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_final"]);
auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c)); // [N, 2 * hidden_size]
m = ggml_reshape_3d(ctx, m, c->ne[0], 2, c->ne[1]); // [N, 2, hidden_size]
m = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3)); // [2, N, hidden_size]
int64_t offset = m->nb[1] * m->ne[1];
auto shift = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0); // [N, hidden_size]
auto scale = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1); // [N, hidden_size]
x = modulate(ctx, norm_final->forward(ctx, x), shift, scale);
x = linear->forward(ctx, x);
return x;
}
};
struct MMDiT : public GGMLBlock {
// Diffusion model with a Transformer backbone.
protected:
SDVersion version = VERSION_3_2B;
int64_t input_size = -1;
int64_t patch_size = 2;
int64_t in_channels = 16;
int64_t depth = 24;
float mlp_ratio = 4.0f;
int64_t adm_in_channels = 2048;
int64_t out_channels = 16;
int64_t pos_embed_max_size = 192;
int64_t num_patchs = 36864; // 192 * 192
int64_t context_size = 4096;
int64_t hidden_size;
void init_params(struct ggml_context* ctx, ggml_type wtype) {
params["pos_embed"] = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hidden_size, num_patchs, 1);
}
public:
MMDiT(SDVersion version = VERSION_3_2B)
: version(version) {
// input_size is always None
// learn_sigma is always False
// register_length is always 0
// rmsnorm is always False
// scale_mod_only is always False
// swiglu is always False
// qk_norm is always None
// qkv_bias is always True
// context_processor_layers is always None
// pos_embed_scaling_factor is not used
// pos_embed_offset is not used
// context_embedder_config is always {'target': 'torch.nn.Linear', 'params': {'in_features': 4096, 'out_features': 1536}}
if (version == VERSION_3_2B) {
input_size = -1;
patch_size = 2;
in_channels = 16;
depth = 24;
mlp_ratio = 4.0f;
adm_in_channels = 2048;
out_channels = 16;
pos_embed_max_size = 192;
num_patchs = 36864; // 192 * 192
context_size = 4096;
}
int64_t default_out_channels = in_channels;
hidden_size = 64 * depth;
int64_t num_heads = depth;
blocks["x_embedder"] = std::shared_ptr<GGMLBlock>(new PatchEmbed(input_size, patch_size, in_channels, hidden_size, true));
blocks["t_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedder(hidden_size));
if (adm_in_channels != -1) {
blocks["y_embedder"] = std::shared_ptr<GGMLBlock>(new VectorEmbedder(adm_in_channels, hidden_size));
}
blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(4096, 1536));
for (int i = 0; i < depth; i++) {
blocks["joint_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new JointBlock(hidden_size,
num_heads,
mlp_ratio,
true,
i == depth - 1));
}
blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(hidden_size, patch_size, out_channels));
}
struct ggml_tensor* cropped_pos_embed(struct ggml_context* ctx,
int64_t h,
int64_t w) {
auto pos_embed = params["pos_embed"];
h = (h + 1) / patch_size;
w = (w + 1) / patch_size;
GGML_ASSERT(h <= pos_embed_max_size && h > 0);
GGML_ASSERT(w <= pos_embed_max_size && w > 0);
int64_t top = (pos_embed_max_size - h) / 2;
int64_t left = (pos_embed_max_size - w) / 2;
auto spatial_pos_embed = ggml_reshape_3d(ctx, pos_embed, hidden_size, pos_embed_max_size, pos_embed_max_size);
// spatial_pos_embed = spatial_pos_embed[:, top : top + h, left : left + w, :]
spatial_pos_embed = ggml_view_3d(ctx,
spatial_pos_embed,
hidden_size,
pos_embed_max_size,
h,
spatial_pos_embed->nb[1],
spatial_pos_embed->nb[2],
spatial_pos_embed->nb[2] * top); // [h, pos_embed_max_size, hidden_size]
spatial_pos_embed = ggml_cont(ctx, ggml_permute(ctx, spatial_pos_embed, 0, 2, 1, 3)); // [pos_embed_max_size, h, hidden_size]
spatial_pos_embed = ggml_view_3d(ctx,
spatial_pos_embed,
hidden_size,
h,
w,
spatial_pos_embed->nb[1],
spatial_pos_embed->nb[2],
spatial_pos_embed->nb[2] * left); // [w, h, hidden_size]
spatial_pos_embed = ggml_cont(ctx, ggml_permute(ctx, spatial_pos_embed, 0, 2, 1, 3)); // [h, w, hidden_size]
spatial_pos_embed = ggml_reshape_3d(ctx, spatial_pos_embed, hidden_size, h * w, 1); // [1, h*w, hidden_size]
return spatial_pos_embed;
}
struct ggml_tensor* unpatchify(struct ggml_context* ctx,
struct ggml_tensor* x,
int64_t h,
int64_t w) {
// x: [N, H*W, patch_size * patch_size * C]
// return: [N, C, H, W]
int64_t n = x->ne[2];
int64_t c = out_channels;
int64_t p = patch_size;
h = (h + 1) / p;
w = (w + 1) / p;
GGML_ASSERT(h * w == x->ne[1]);
x = ggml_reshape_4d(ctx, x, c, p * p, w * h, n); // [N, H*W, P*P, C]
x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3)); // [N, C, H*W, P*P]
x = ggml_reshape_4d(ctx, x, p, p, w, h * c * n); // [N*C*H, W, P, P]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*H, P, W, P]
x = ggml_reshape_4d(ctx, x, p * w, p * h, c, n); // [N, C, H*P, W*P]
return x;
}
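unpatchify inverts PatchEmbed: every token carries a p x p patch per channel, and the reshape/permute chain tiles those patches back into an image. For the SD3 2B config (p = 2, C = 16), a 64x64 latent yields 32*32 = 1024 tokens of 2*2*16 = 64 values each. A quick element-count check (illustrative):

```cpp
#include <cassert>

int main() {
    const int p = 2, c = 16, h = 64, w = 64;  // SD3 2B latent dimensions
    const int tokens    = (h / p) * (w / p);  // 1024
    const int per_token = p * p * c;          // 64
    assert(tokens * per_token == c * h * w);  // element count preserved
    return 0;
}
```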
struct ggml_tensor* forward_core_with_concat(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* c_mod,
struct ggml_tensor* context) {
// x: [N, H*W, hidden_size]
// context: [N, n_context, d_context]
// c_mod: [N, hidden_size]
// return: [N, H*W, patch_size * patch_size * out_channels]
auto final_layer = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);
for (int i = 0; i < depth; i++) {
auto block = std::dynamic_pointer_cast<JointBlock>(blocks["joint_blocks." + std::to_string(i)]);
auto context_x = block->forward(ctx, context, x, c_mod);
context = context_x.first;
x = context_x.second;
}
x = final_layer->forward(ctx, x, c_mod); // (N, T, patch_size ** 2 * out_channels)
return x;
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* t,
struct ggml_tensor* y = NULL,
struct ggml_tensor* context = NULL) {
// Forward pass of DiT.
// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
// t: (N,) tensor of diffusion timesteps
// y: (N, adm_in_channels) tensor of class labels
// context: (N, L, D)
// return: (N, C, H, W)
auto x_embedder = std::dynamic_pointer_cast<PatchEmbed>(blocks["x_embedder"]);
auto t_embedder = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]);
int64_t w = x->ne[0];
int64_t h = x->ne[1];
auto patch_embed = x_embedder->forward(ctx, x); // [N, H*W, hidden_size]
auto pos_embed = cropped_pos_embed(ctx, h, w); // [1, H*W, hidden_size]
x = ggml_add(ctx, patch_embed, pos_embed); // [N, H*W, hidden_size]
auto c = t_embedder->forward(ctx, t); // [N, hidden_size]
if (y != NULL && adm_in_channels != -1) {
auto y_embedder = std::dynamic_pointer_cast<VectorEmbedder>(blocks["y_embedder"]);
y = y_embedder->forward(ctx, y); // [N, hidden_size]
c = ggml_add(ctx, c, y);
}
if (context != NULL) {
auto context_embedder = std::dynamic_pointer_cast<Linear>(blocks["context_embedder"]);
context = context_embedder->forward(ctx, context); // [N, L, D] aka [N, L, 1536]
}
x = forward_core_with_concat(ctx, x, c, context); // (N, H*W, patch_size ** 2 * out_channels)
x = unpatchify(ctx, x, h, w); // [N, C, H, W]
return x;
}
};
struct MMDiTRunner : public GGMLRunner {
MMDiT mmdit;
MMDiTRunner(ggml_backend_t backend,
ggml_type wtype,
SDVersion version = VERSION_3_2B)
: GGMLRunner(backend, wtype), mmdit(version) {
mmdit.init(params_ctx, wtype);
}
std::string get_desc() {
return "mmdit";
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
mmdit.get_param_tensors(tensors, prefix);
}
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y) {
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, MMDIT_GRAPH_SIZE, false);
x = to_backend(x);
context = to_backend(context);
y = to_backend(y);
timesteps = to_backend(timesteps);
struct ggml_tensor* out = mmdit.forward(compute_ctx,
x,
timesteps,
y,
context);
ggml_build_forward_expand(gf, out);
return gf;
}
void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL) {
// x: [N, in_channels, h, w]
// timesteps: [N, ]
// context: [N, max_position, hidden_size]([N, 154, 4096]) or [1, max_position, hidden_size]
// y: [N, adm_in_channels] or [1, adm_in_channels]
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x, timesteps, context, y);
};
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
}
void test() {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != NULL);
{
// cpu f16: pass
// cpu f32: pass
// cuda f16: pass
// cuda f32: pass
auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 128, 128, 16, 1);
std::vector<float> timesteps_vec(1, 999.f);
auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
ggml_set_f32(x, 0.01f);
// print_ggml_tensor(x);
auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 4096, 154, 1);
ggml_set_f32(context, 0.01f);
// print_ggml_tensor(context);
auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 2048, 1);
ggml_set_f32(y, 0.01f);
// print_ggml_tensor(y);
struct ggml_tensor* out = NULL;
int t0 = ggml_time_ms();
compute(8, x, timesteps, context, y, &out, work_ctx);
int t1 = ggml_time_ms();
print_ggml_tensor(out);
LOG_DEBUG("mmdit test done in %dms", t1 - t0);
}
}
static void load_from_file_and_test(const std::string& file_path) {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_F16;
std::shared_ptr<MMDiTRunner> mmdit = std::shared_ptr<MMDiTRunner>(new MMDiTRunner(backend, model_data_type));
{
LOG_INFO("loading from '%s'", file_path.c_str());
mmdit->alloc_params_buffer();
std::map<std::string, ggml_tensor*> tensors;
mmdit->get_param_tensors(tensors, "model.diffusion_model");
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
return;
}
bool success = model_loader.load_tensors(tensors, backend);
if (!success) {
LOG_ERROR("load tensors from model loader failed");
return;
}
LOG_INFO("mmdit model loaded");
}
mmdit->test();
}
};
#endif

View File

@ -161,6 +161,10 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) {
prefix = new_name.substr(0, new_name.size() - strlen("vision_model.visual_projection.weight"));
new_name = prefix + "visual_projection.weight";
return new_name;
} else if (ends_with(new_name, "transformer.text_projection.weight")) {
prefix = new_name.substr(0, new_name.size() - strlen("transformer.text_projection.weight"));
new_name = prefix + "transformer.text_model.text_projection";
return new_name;
} else {
return new_name;
}
@ -420,7 +424,7 @@ std::string convert_diffusers_name_to_compvis(std::string key, char seq) {
std::string convert_tensor_name(const std::string& name) {
std::string new_name = name;
if (starts_with(name, "cond_stage_model.") || starts_with(name, "conditioner.embedders.") || ends_with(name, ".vision_model.visual_projection.weight")) {
if (starts_with(name, "cond_stage_model.") || starts_with(name, "conditioner.embedders.") || starts_with(name, "text_encoders.") || ends_with(name, ".vision_model.visual_projection.weight")) {
new_name = convert_open_clip_to_hf_clip(name);
} else if (starts_with(name, "first_stage_model.decoder")) {
new_name = convert_vae_decoder_name(name);
@ -1288,6 +1292,9 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
SDVersion ModelLoader::get_sd_version() {
TensorStorage token_embedding_weight;
for (auto& tensor_storage : tensor_storages) {
if (tensor_storage.name.find("model.diffusion_model.joint_blocks.23.") != std::string::npos) {
return VERSION_3_2B;
}
if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos) {
return VERSION_XL;
}
@ -1323,7 +1330,8 @@ ggml_type ModelLoader::get_sd_wtype() {
}
if (tensor_storage.name.find(".weight") != std::string::npos &&
tensor_storage.name.find("time_embed") != std::string::npos) {
(tensor_storage.name.find("time_embed") != std::string::npos) ||
tensor_storage.name.find("context_embedder") != std::string::npos) {
return tensor_storage.type;
}
}
@ -1335,6 +1343,11 @@ std::string ModelLoader::load_merges() {
return merges_utf8_str;
}
std::string ModelLoader::load_t5_tokenizer_json() {
std::string json_str(reinterpret_cast<const char*>(t5_tokenizer_json_str), sizeof(t5_tokenizer_json_str));
return json_str;
}
std::vector<TensorStorage> remove_duplicates(const std::vector<TensorStorage>& vec) {
std::vector<TensorStorage> res;
std::unordered_map<std::string, size_t> name_to_index_map;

View File

@ -22,6 +22,7 @@ enum SDVersion {
VERSION_2_x,
VERSION_XL,
VERSION_SVD,
VERSION_3_2B,
VERSION_COUNT,
};
@ -143,7 +144,6 @@ public:
bool init_from_file(const std::string& file_path, const std::string& prefix = "");
SDVersion get_sd_version();
ggml_type get_sd_wtype();
std::string load_merges();
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend);
bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
ggml_backend_t backend,
@ -151,5 +151,8 @@ public:
bool save_to_gguf_file(const std::string& file_path, ggml_type type);
int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
~ModelLoader() = default;
static std::string load_merges();
static std::string load_t5_tokenizer_json();
};
#endif // __MODEL_H__

View File

@ -159,7 +159,7 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
}
};
struct PhotoMakerIDEncoder : public GGMLModule {
struct PhotoMakerIDEncoder : public GGMLRunner {
public:
SDVersion version = VERSION_XL;
PhotoMakerIDEncoderBlock id_encoder;
@ -176,7 +176,7 @@ public:
public:
PhotoMakerIDEncoder(ggml_backend_t backend, ggml_type wtype, SDVersion version = VERSION_XL, float sty = 20.f)
: GGMLModule(backend, wtype),
: GGMLRunner(backend, wtype),
version(version),
style_strength(sty) {
id_encoder.init(params_ctx, wtype);
@ -287,8 +287,8 @@ public:
return build_graph(id_pixel_values, prompt_embeds, class_tokens_mask);
};
// GGMLModule::compute(get_graph, n_threads, updated_prompt_embeds);
GGMLModule::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx);
// GGMLRunner::compute(get_graph, n_threads, updated_prompt_embeds);
GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx);
}
};

View File

@ -6,14 +6,14 @@
#include "stable-diffusion.h"
#include "util.h"
#include "clip.hpp"
#include "conditioner.hpp"
#include "control.hpp"
#include "denoiser.hpp"
#include "diffusion_model.hpp"
#include "esrgan.hpp"
#include "lora.hpp"
#include "pmid.hpp"
#include "tae.hpp"
#include "unet.hpp"
#include "vae.hpp"
#define STB_IMAGE_IMPLEMENTATION
@ -29,7 +29,7 @@ const char* model_version_to_str[] = {
"2.x",
"XL",
"SVD",
};
"3 2B"};
const char* sampling_methods_str[] = {
"Euler A",
@ -77,9 +77,9 @@ public:
int n_threads = -1;
float scale_factor = 0.18215f;
std::shared_ptr<FrozenCLIPEmbedderWithCustomWords> cond_stage_model;
std::shared_ptr<Conditioner> cond_stage_model;
std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision; // for svd
std::shared_ptr<UNetModel> diffusion_model;
std::shared_ptr<DiffusionModel> diffusion_model;
std::shared_ptr<AutoEncoderKL> first_stage_model;
std::shared_ptr<TinyAutoEncoder> tae_first_stage;
std::shared_ptr<ControlNet> control_net;
@ -99,8 +99,6 @@ public:
std::shared_ptr<Denoiser> denoiser = std::make_shared<CompVisDenoiser>();
std::string trigger_word = "img"; // should be user settable
StableDiffusionGGML() = default;
StableDiffusionGGML(int n_threads,
@ -207,36 +205,45 @@ public:
"try specifying SDXL VAE FP16 Fix with the --vae parameter. "
"You can find it here: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors");
}
} else if (version == VERSION_3_2B) {
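// 1.5305 is the SD3 VAE scaling_factor from the SD3-medium config (cf. 0.18215 for SD1.x/2.x).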
scale_factor = 1.5305f;
}
if (version == VERSION_SVD) {
clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, model_data_type);
clip_vision->alloc_params_buffer();
clip_vision->get_param_tensors(tensors, "cond_stage_model.");
clip_vision->get_param_tensors(tensors);
diffusion_model = std::make_shared<UNetModel>(backend, model_data_type, version);
diffusion_model->alloc_params_buffer();
diffusion_model->get_param_tensors(tensors, "model.diffusion_model");
diffusion_model->get_param_tensors(tensors);
first_stage_model = std::make_shared<AutoEncoderKL>(backend, model_data_type, vae_decode_only, true);
first_stage_model = std::make_shared<AutoEncoderKL>(backend, model_data_type, vae_decode_only, true, version);
LOG_DEBUG("vae_decode_only %d", vae_decode_only);
first_stage_model->alloc_params_buffer();
first_stage_model->get_param_tensors(tensors, "first_stage_model");
} else {
clip_backend = backend;
if (!ggml_backend_is_cpu(backend) && version == VERSION_3_2B && model_data_type != GGML_TYPE_F32) {
clip_on_cpu = true;
LOG_INFO("set clip_on_cpu to true");
}
if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
LOG_INFO("CLIP: Using CPU backend");
clip_backend = ggml_backend_cpu_init();
}
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_data_type, version);
if (version == VERSION_3_2B) {
cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, model_data_type);
diffusion_model = std::make_shared<MMDiTModel>(backend, model_data_type, version);
} else {
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_data_type, embeddings_path, version);
diffusion_model = std::make_shared<UNetModel>(backend, model_data_type, version);
}
cond_stage_model->alloc_params_buffer();
cond_stage_model->get_param_tensors(tensors, "cond_stage_model.");
cond_stage_model->get_param_tensors(tensors);
cond_stage_model->embd_dir = embeddings_path;
diffusion_model = std::make_shared<UNetModel>(backend, model_data_type, version);
diffusion_model->alloc_params_buffer();
diffusion_model->get_param_tensors(tensors, "model.diffusion_model");
diffusion_model->get_param_tensors(tensors);
ggml_type vae_type = model_data_type;
if (version == VERSION_XL) {
@ -250,7 +257,7 @@ public:
} else {
vae_backend = backend;
}
first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, vae_type, vae_decode_only);
first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, vae_type, vae_decode_only, false, version);
first_stage_model->alloc_params_buffer();
first_stage_model->get_param_tensors(tensors, "first_stage_model");
} else {
@ -296,14 +303,6 @@ public:
// pmid_model.init_params(GGML_TYPE_F32);
// pmid_model.map_by_name(tensors, "pmid.");
// }
LOG_DEBUG("loading vocab");
std::string merges_utf8_str = model_loader.load_merges();
if (merges_utf8_str.size() == 0) {
LOG_ERROR("get merges failed: '%s'", model_path.c_str());
return false;
}
cond_stage_model->tokenizer.load_from_merges(merges_utf8_str);
}
struct ggml_init_params params;
@ -433,9 +432,12 @@ public:
is_using_v_parameterization = true;
}
if (is_using_v_parameterization) {
denoiser = std::make_shared<CompVisVDenoiser>();
if (version == VERSION_3_2B) {
LOG_INFO("running in FLOW mode");
denoiser = std::make_shared<DiscreteFlowDenoiser>();
} else if (is_using_v_parameterization) {
LOG_INFO("running in v-prediction mode");
denoiser = std::make_shared<CompVisVDenoiser>();
} else {
LOG_INFO("running in eps-prediction mode");
}
@ -464,10 +466,12 @@ public:
}
}
for (int i = 0; i < TIMESTEPS; i++) {
denoiser->schedule->alphas_cumprod[i] = ((float*)alphas_cumprod_tensor->data)[i];
denoiser->schedule->sigmas[i] = std::sqrt((1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]);
denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]);
auto comp_vis_denoiser = std::dynamic_pointer_cast<CompVisDenoiser>(denoiser);
if (comp_vis_denoiser) {
for (int i = 0; i < TIMESTEPS; i++) {
comp_vis_denoiser->sigmas[i] = std::sqrt((1 - ((float*)alphas_cumprod_tensor->data)[i]) / ((float*)alphas_cumprod_tensor->data)[i]);
comp_vis_denoiser->log_sigmas[i] = std::log(comp_vis_denoiser->sigmas[i]);
}
}
LOG_DEBUG("finished loaded file");
@ -562,50 +566,6 @@ public:
curr_lora_state = lora_state;
}
std::string remove_trigger_from_prompt(ggml_context* work_ctx,
const std::string& prompt) {
auto image_tokens = cond_stage_model->convert_token_to_id(trigger_word);
GGML_ASSERT(image_tokens.size() == 1);
auto tokens_and_weights = cond_stage_model->tokenize(prompt, false);
std::vector<int>& tokens = tokens_and_weights.first;
auto it = std::find(tokens.begin(), tokens.end(), image_tokens[0]);
GGML_ASSERT(it != tokens.end()); // prompt must have trigger word
tokens.erase(it);
return cond_stage_model->decode(tokens);
}
std::tuple<ggml_tensor*, ggml_tensor*, std::vector<bool>>
get_learned_condition_with_trigger(ggml_context* work_ctx,
const std::string& text,
int clip_skip,
int width,
int height,
int num_input_imgs,
bool force_zero_embeddings = false) {
auto image_tokens = cond_stage_model->convert_token_to_id(trigger_word);
// if(image_tokens.size() == 1){
// printf(" image token id is: %d \n", image_tokens[0]);
// }
GGML_ASSERT(image_tokens.size() == 1);
auto tokens_and_weights = cond_stage_model->tokenize_with_trigger_token(text,
num_input_imgs,
image_tokens[0],
true);
std::vector<int>& tokens = std::get<0>(tokens_and_weights);
std::vector<float>& weights = std::get<1>(tokens_and_weights);
std::vector<bool>& clsm = std::get<2>(tokens_and_weights);
// printf("tokens: \n");
// for(int i = 0; i < tokens.size(); ++i)
// printf("%d ", tokens[i]);
// printf("\n");
// printf("clsm: \n");
// for(int i = 0; i < clsm.size(); ++i)
// printf("%d ", clsm[i]?1:0);
// printf("\n");
auto cond = get_learned_condition_common(work_ctx, tokens, weights, clip_skip, width, height, force_zero_embeddings);
return std::make_tuple(cond.first, cond.second, clsm);
}
ggml_tensor* id_encoder(ggml_context* work_ctx,
ggml_tensor* init_img,
ggml_tensor* prompts_embeds,
@ -616,148 +576,14 @@ public:
return res;
}
std::pair<ggml_tensor*, ggml_tensor*> get_learned_condition(ggml_context* work_ctx,
const std::string& text,
int clip_skip,
int width,
int height,
bool force_zero_embeddings = false) {
auto tokens_and_weights = cond_stage_model->tokenize(text, true);
std::vector<int>& tokens = tokens_and_weights.first;
std::vector<float>& weights = tokens_and_weights.second;
return get_learned_condition_common(work_ctx, tokens, weights, clip_skip, width, height, force_zero_embeddings);
}
std::pair<ggml_tensor*, ggml_tensor*> get_learned_condition_common(ggml_context* work_ctx,
std::vector<int>& tokens,
std::vector<float>& weights,
int clip_skip,
int width,
int height,
bool force_zero_embeddings = false) {
cond_stage_model->set_clip_skip(clip_skip);
int64_t t0 = ggml_time_ms();
struct ggml_tensor* hidden_states = NULL; // [N, n_token, hidden_size]
struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, hidden_size]
struct ggml_tensor* pooled = NULL;
std::vector<float> hidden_states_vec;
size_t chunk_len = 77;
size_t chunk_count = tokens.size() / chunk_len;
for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
std::vector<int> chunk_tokens(tokens.begin() + chunk_idx * chunk_len,
tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector<float> chunk_weights(weights.begin() + chunk_idx * chunk_len,
weights.begin() + (chunk_idx + 1) * chunk_len);
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
struct ggml_tensor* input_ids2 = NULL;
size_t max_token_idx = 0;
if (version == VERSION_XL) {
auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), EOS_TOKEN_ID);
if (it != chunk_tokens.end()) {
std::fill(std::next(it), chunk_tokens.end(), 0);
}
max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
input_ids2 = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
// for (int i = 0; i < chunk_tokens.size(); i++) {
// printf("%d ", chunk_tokens[i]);
// }
// printf("\n");
}
cond_stage_model->compute(n_threads, input_ids, input_ids2, max_token_idx, false, &chunk_hidden_states, work_ctx);
if (version == VERSION_XL && chunk_idx == 0) {
cond_stage_model->compute(n_threads, input_ids, input_ids2, max_token_idx, true, &pooled, work_ctx);
}
// if (pooled != NULL) {
// print_ggml_tensor(chunk_hidden_states);
// print_ggml_tensor(pooled);
// }
int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
ggml_tensor* result = ggml_dup_tensor(work_ctx, chunk_hidden_states);
{
float original_mean = ggml_tensor_mean(chunk_hidden_states);
for (int i2 = 0; i2 < chunk_hidden_states->ne[2]; i2++) {
for (int i1 = 0; i1 < chunk_hidden_states->ne[1]; i1++) {
for (int i0 = 0; i0 < chunk_hidden_states->ne[0]; i0++) {
float value = ggml_tensor_get_f32(chunk_hidden_states, i0, i1, i2);
value *= chunk_weights[i1];
ggml_tensor_set_f32(result, value, i0, i1, i2);
}
}
}
float new_mean = ggml_tensor_mean(result);
ggml_tensor_scale(result, (original_mean / new_mean));
}
if (force_zero_embeddings) {
float* vec = (float*)result->data;
for (int i = 0; i < ggml_nelements(result); i++) {
vec[i] = 0;
}
}
hidden_states_vec.insert(hidden_states_vec.end(), (float*)result->data, ((float*)result->data) + ggml_nelements(result));
}
hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
hidden_states = ggml_reshape_2d(work_ctx,
hidden_states,
chunk_hidden_states->ne[0],
ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
ggml_tensor* vec = NULL;
if (version == VERSION_XL) {
int out_dim = 256;
vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->unet.adm_in_channels);
// [0:1280]
size_t offset = 0;
memcpy(vec->data, pooled->data, ggml_nbytes(pooled));
offset += ggml_nbytes(pooled);
// original_size_as_tuple
float orig_width = (float)width;
float orig_height = (float)height;
std::vector<float> timesteps = {orig_height, orig_width};
ggml_tensor* embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
offset += ggml_nbytes(embed_view);
set_timestep_embedding(timesteps, embed_view, out_dim);
// print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
// crop_coords_top_left
float crop_coord_top = 0.f;
float crop_coord_left = 0.f;
timesteps = {crop_coord_top, crop_coord_left};
embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
offset += ggml_nbytes(embed_view);
set_timestep_embedding(timesteps, embed_view, out_dim);
// print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
// target_size_as_tuple
float target_width = (float)width;
float target_height = (float)height;
timesteps = {target_height, target_width};
embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
offset += ggml_nbytes(embed_view);
set_timestep_embedding(timesteps, embed_view, out_dim);
// print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
GGML_ASSERT(offset == ggml_nbytes(vec));
}
// print_ggml_tensor(result);
return {hidden_states, vec};
}
std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> get_svd_condition(ggml_context* work_ctx,
sd_image_t init_image,
int width,
int height,
int fps = 6,
int motion_bucket_id = 127,
float augmentation_level = 0.f,
bool force_zero_embeddings = false) {
SDCondition get_svd_condition(ggml_context* work_ctx,
sd_image_t init_image,
int width,
int height,
int fps = 6,
int motion_bucket_id = 127,
float augmentation_level = 0.f,
bool force_zero_embeddings = false) {
// c_crossattn
int64_t t0 = ggml_time_ms();
struct ggml_tensor* c_crossattn = NULL;
@ -809,38 +635,30 @@ public:
ggml_tensor_scale(noise, augmentation_level);
ggml_tensor_add(init_img, noise);
}
print_ggml_tensor(init_img);
ggml_tensor* moments = encode_first_stage(work_ctx, init_img);
print_ggml_tensor(moments);
c_concat = get_first_stage_encoding(work_ctx, moments);
}
print_ggml_tensor(c_concat);
}
// y
struct ggml_tensor* y = NULL;
{
y = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->unet.adm_in_channels);
y = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->get_adm_in_channels());
int out_dim = 256;
int fps_id = fps - 1;
std::vector<float> timesteps = {(float)fps_id, (float)motion_bucket_id, augmentation_level};
set_timestep_embedding(timesteps, y, out_dim);
print_ggml_tensor(y);
}
int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing svd condition graph completed, taking %" PRId64 " ms", t1 - t0);
return {c_crossattn, c_concat, y};
return {c_crossattn, y, c_concat};
}
ggml_tensor* sample(ggml_context* work_ctx,
ggml_tensor* x_t,
ggml_tensor* init_latent,
ggml_tensor* noise,
ggml_tensor* c,
ggml_tensor* c_concat,
ggml_tensor* c_vector,
ggml_tensor* uc,
ggml_tensor* uc_concat,
ggml_tensor* uc_vector,
SDCondition cond,
SDCondition uncond,
ggml_tensor* control_hint,
float control_strength,
float min_cfg,
@ -848,26 +666,17 @@ public:
sample_method_t method,
const std::vector<float>& sigmas,
int start_merge_step,
ggml_tensor* c_id,
ggml_tensor* c_vec_id) {
SDCondition id_cond) {
size_t steps = sigmas.size() - 1;
// x_t = load_tensor_from_file(work_ctx, "./rand0.bin");
// print_ggml_tensor(x_t);
struct ggml_tensor* x = ggml_dup_tensor(work_ctx, x_t);
copy_ggml_tensor(x, x_t);
// noise = load_tensor_from_file(work_ctx, "./rand0.bin");
// print_ggml_tensor(noise);
struct ggml_tensor* x = ggml_dup_tensor(work_ctx, init_latent);
copy_ggml_tensor(x, init_latent);
x = denoiser->noise_scaling(sigmas[0], noise, x);
struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x_t);
struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise);
bool has_unconditioned = cfg_scale != 1.0 && uc != NULL;
if (noise == NULL) {
// x = x * sigmas[0]
ggml_tensor_scale(x, sigmas[0]);
} else {
// xi = x + noise * sigma_sched[0]
ggml_tensor_scale(noise, sigmas[0]);
ggml_tensor_add(x, noise);
}
bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL;
// denoise wrapper
struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x);
@ -883,21 +692,13 @@ public:
}
int64_t t0 = ggml_time_us();
float c_skip = 1.0f;
float c_out = 1.0f;
float c_in = 1.0f;
std::vector<float> scaling = denoiser->get_scalings(sigma);
GGML_ASSERT(scaling.size() == 3);
float c_skip = scaling[0];
float c_out = scaling[1];
float c_in = scaling[2];
if (scaling.size() == 3) { // CompVisVDenoiser
c_skip = scaling[0];
c_out = scaling[1];
c_in = scaling[2];
} else { // CompVisDenoiser
c_out = scaling[0];
c_in = scaling[1];
}
float t = denoiser->schedule->sigma_to_t(sigma);
float t = denoiser->sigma_to_t(sigma);
std::vector<float> timesteps_vec(x->ne[3], t); // [N, ]
auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
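// For reference, these denoisers follow the k-diffusion convention
//   denoised = c_skip * x + c_out * model_output(c_in * x, t)
// with (assuming the usual formulas; a sketch, not code from this commit):
//   eps-prediction: c_skip = 1,               c_out = -sigma,                   c_in = 1/sqrt(sigma^2 + 1)
//   v-prediction:   c_skip = 1/(sigma^2 + 1), c_out = -sigma/sqrt(sigma^2 + 1), c_in = 1/sqrt(sigma^2 + 1)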
@ -908,7 +709,7 @@ public:
std::vector<struct ggml_tensor*> controls;
if (control_hint != NULL) {
control_net->compute(n_threads, noised_input, control_hint, timesteps, c, c_vector);
control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector);
controls = control_net->controls;
// print_ggml_tensor(controls[12]);
// GGML_ASSERT(0);
@ -919,9 +720,9 @@ public:
diffusion_model->compute(n_threads,
noised_input,
timesteps,
c,
c_concat,
c_vector,
cond.c_crossattn,
cond.c_concat,
cond.c_vector,
-1,
controls,
control_strength,
@ -930,9 +731,9 @@ public:
diffusion_model->compute(n_threads,
noised_input,
timesteps,
c_id,
c_concat,
c_vec_id,
id_cond.c_crossattn,
cond.c_concat,
id_cond.c_vector,
-1,
controls,
control_strength,
@ -943,15 +744,15 @@ public:
if (has_unconditioned) {
// uncond
if (control_hint != NULL) {
control_net->compute(n_threads, noised_input, control_hint, timesteps, uc, uc_vector);
control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector);
controls = control_net->controls;
}
diffusion_model->compute(n_threads,
noised_input,
timesteps,
uc,
uc_concat,
uc_vector,
uncond.c_crossattn,
uncond.c_concat,
uncond.c_vector,
-1,
controls,
control_strength,
@ -988,6 +789,8 @@ public:
sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng);
x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x);
if (control_net) {
control_net->free_control_ctx();
control_net->free_compute_buffer();
@ -1029,12 +832,20 @@ public:
}
ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) {
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int64_t C = 8;
if (use_tiny_autoencoder) {
C = 4;
} else {
if (version == VERSION_3_2B) {
C = 32;
}
}
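// The KL VAE's encoder returns moments = concat(mean, logvar), which is why C doubles
// on the encode path: 4 latent channels -> 8 for SD1/2/XL, and SD3's 16 -> 32.
// TAESD produces latents directly, so it stays at 4.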
ggml_tensor* result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
decode ? (W * 8) : (W / 8), // width
decode ? (H * 8) : (H / 8), // height
decode ? 3 : (use_tiny_autoencoder ? 4 : 8),
decode ? 3 : C,
x->ne[3]); // channels
int64_t t0 = ggml_time_ms();
if (!use_tiny_autoencoder) {
@ -1184,6 +995,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
seed = rand();
}
// for (auto v : sigmas) {
// std::cout << v << " ";
// }
// std::cout << std::endl;
int sample_steps = sigmas.size() - 1;
// Apply lora
@ -1204,9 +1020,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
// Photo Maker
std::string prompt_text_only;
ggml_tensor* init_img = NULL;
ggml_tensor* prompts_embeds = NULL;
ggml_tensor* pooled_prompts_embeds = NULL;
ggml_tensor* init_img = NULL;
SDCondition id_cond;
std::vector<bool> class_tokens_mask;
if (sd_ctx->sd->stacked_id) {
if (!sd_ctx->sd->pmid_lora->applied) {
@ -1263,21 +1078,25 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
else
sd_mul_images_to_tensor(init_image->data, init_img, i, NULL, NULL);
}
t0 = ggml_time_ms();
auto cond_tup = sd_ctx->sd->get_learned_condition_with_trigger(work_ctx, prompt,
clip_skip, width, height, num_input_images);
prompts_embeds = std::get<0>(cond_tup);
pooled_prompts_embeds = std::get<1>(cond_tup); // [adm_in_channels, ]
class_tokens_mask = std::get<2>(cond_tup); //
t0 = ggml_time_ms();
auto cond_tup = sd_ctx->sd->cond_stage_model->get_learned_condition_with_trigger(work_ctx,
sd_ctx->sd->n_threads, prompt,
clip_skip,
width,
height,
num_input_images,
sd_ctx->sd->diffusion_model->get_adm_in_channels());
id_cond = std::get<0>(cond_tup);
class_tokens_mask = std::get<1>(cond_tup); //
prompts_embeds = sd_ctx->sd->id_encoder(work_ctx, init_img, prompts_embeds, class_tokens_mask);
t1 = ggml_time_ms();
id_cond.c_crossattn = sd_ctx->sd->id_encoder(work_ctx, init_img, id_cond.c_crossattn, class_tokens_mask);
t1 = ggml_time_ms();
LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
if (sd_ctx->sd->free_params_immediately) {
sd_ctx->sd->pmid_model->free_params_buffer();
}
// Encode input prompt without the trigger word for delayed conditioning
prompt_text_only = sd_ctx->sd->remove_trigger_from_prompt(work_ctx, prompt);
prompt_text_only = sd_ctx->sd->cond_stage_model->remove_trigger_from_prompt(work_ctx, prompt);
// printf("%s || %s \n", prompt.c_str(), prompt_text_only.c_str());
prompt = prompt_text_only; //
// if (sample_steps < 50) {
@ -1296,21 +1115,29 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
}
// Get learned condition
t0 = ggml_time_ms();
auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height);
ggml_tensor* c = cond_pair.first;
ggml_tensor* c_vector = cond_pair.second; // [adm_in_channels, ]
t0 = ggml_time_ms();
SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
sd_ctx->sd->n_threads,
prompt,
clip_skip,
width,
height,
sd_ctx->sd->diffusion_model->get_adm_in_channels());
struct ggml_tensor* uc = NULL;
struct ggml_tensor* uc_vector = NULL;
SDCondition uncond;
if (cfg_scale != 1.0) {
bool force_zero_embeddings = false;
if (sd_ctx->sd->version == VERSION_XL && negative_prompt.size() == 0) {
force_zero_embeddings = true;
}
auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, force_zero_embeddings);
uc = uncond_pair.first;
uc_vector = uncond_pair.second; // [adm_in_channels, ]
uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
sd_ctx->sd->n_threads,
negative_prompt,
clip_skip,
width,
height,
sd_ctx->sd->diffusion_model->get_adm_in_channels(),
force_zero_embeddings);
}
t1 = ggml_time_ms();
LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0);
@ -1329,6 +1156,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
// Sample
std::vector<struct ggml_tensor*> final_latents; // collect latents to decode
int C = 4;
if (sd_ctx->sd->version == VERSION_3_2B) {
C = 16;
}
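// SD3's VAE works with 16 latent channels, vs 4 for SD1.x/2.x/XL.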
int W = width / 8;
int H = height / 8;
LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
@ -1338,16 +1168,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed);
sd_ctx->sd->rng->manual_seed(cur_seed);
struct ggml_tensor* x_t = NULL;
struct ggml_tensor* noise = NULL;
if (init_latent == NULL) {
x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
ggml_tensor_set_f32_randn(x_t, sd_ctx->sd->rng);
} else {
x_t = init_latent;
noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
}
struct ggml_tensor* x_t = init_latent;
struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
int start_merge_step = -1;
if (sd_ctx->sd->stacked_id) {
@ -1360,12 +1183,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx,
x_t,
noise,
c,
NULL,
c_vector,
uc,
NULL,
uc_vector,
cond,
uncond,
image_hint,
control_strength,
cfg_scale,
@ -1373,8 +1192,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
sample_method,
sigmas,
start_merge_step,
prompts_embeds,
pooled_prompts_embeds);
id_cond);
// struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
// print_ggml_tensor(x_0);
int64_t sampling_end = ggml_time_ms();
@ -1447,6 +1265,9 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
if (sd_ctx->sd->version == VERSION_3_2B) {
params.mem_size *= 3;
}
if (sd_ctx->sd->stacked_id) {
params.mem_size += static_cast<size_t>(10 * 1024 * 1024); // 10 MB
}
@ -1464,11 +1285,24 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
size_t t0 = ggml_time_ms();
std::vector<float> sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps);
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
int C = 4;
if (sd_ctx->sd->version == VERSION_3_2B) {
C = 16;
}
int W = width / 8;
int H = height / 8;
ggml_tensor* init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
if (sd_ctx->sd->version == VERSION_3_2B) {
ggml_set_f32(init_latent, 0.0609f);
} else {
ggml_set_f32(init_latent, 0.f);
}
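// 0.0609 matches SD3's VAE shift_factor, so the "empty" txt2img latent starts at the
// latent distribution's mean instead of zero (assumption based on the SD3 reference config).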
sd_image_t* result_images = generate_image(sd_ctx,
work_ctx,
NULL,
init_latent,
prompt_c_str,
negative_prompt_c_str,
clip_skip,
@ -1517,6 +1351,9 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
if (sd_ctx->sd->version == VERSION_3_2B) {
params.mem_size *= 2;
}
if (sd_ctx->sd->stacked_id) {
params.mem_size += static_cast<size_t>(10 * 1024 * 1024); // 10 MB
}
@ -1549,11 +1386,11 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
} else {
init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
}
// print_ggml_tensor(init_latent);
print_ggml_tensor(init_latent, true);
size_t t1 = ggml_time_ms();
LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
std::vector<float> sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps);
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
size_t t_enc = static_cast<size_t>(sample_steps * strength);
LOG_INFO("target t_enc is %zu steps", t_enc);
std::vector<float> sigma_sched;
@ -1605,7 +1442,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
LOG_INFO("img2vid %dx%d", width, height);
std::vector<float> sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps);
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024) * 1024; // 10 MB
@ -1629,29 +1466,23 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
int64_t t0 = ggml_time_ms();
ggml_tensor* c_crossattn = NULL;
ggml_tensor* c_concat = NULL;
ggml_tensor* c_vector = NULL;
SDCondition cond = sd_ctx->sd->get_svd_condition(work_ctx,
init_image,
width,
height,
fps,
motion_bucket_id,
augmentation_level);
ggml_tensor* uc_crossattn = NULL;
ggml_tensor* uc_concat = NULL;
ggml_tensor* uc_vector = NULL;
std::tie(c_crossattn, c_concat, c_vector) = sd_ctx->sd->get_svd_condition(work_ctx,
init_image,
width,
height,
fps,
motion_bucket_id,
augmentation_level);
uc_crossattn = ggml_dup_tensor(work_ctx, c_crossattn);
auto uc_crossattn = ggml_dup_tensor(work_ctx, cond.c_crossattn);
ggml_set_f32(uc_crossattn, 0.f);
uc_concat = ggml_dup_tensor(work_ctx, c_concat);
auto uc_concat = ggml_dup_tensor(work_ctx, cond.c_concat);
ggml_set_f32(uc_concat, 0.f);
uc_vector = ggml_dup_tensor(work_ctx, c_vector);
auto uc_vector = ggml_dup_tensor(work_ctx, cond.c_vector);
SDCondition uncond = SDCondition(uc_crossattn, uc_vector, uc_concat);
int64_t t1 = ggml_time_ms();
LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0);
@ -1664,18 +1495,17 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
int W = width / 8;
int H = height / 8;
struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames);
ggml_tensor_set_f32_randn(x_t, sd_ctx->sd->rng);
ggml_set_f32(x_t, 0.f);
struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames);
ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx,
x_t,
NULL,
c_crossattn,
c_concat,
c_vector,
uc_crossattn,
uc_concat,
uc_vector,
noise,
cond,
uncond,
{},
0.f,
min_cfg,
@ -1683,8 +1513,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
sample_method,
sigmas,
-1,
NULL,
NULL);
SDCondition(NULL, NULL, NULL));
int64_t t2 = ggml_time_ms();
LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000);

981
t5.hpp Normal file
View File

@ -0,0 +1,981 @@
#ifndef __T5_HPP__
#define __T5_HPP__
#include <float.h>
#include <limits>
#include <map>
#include <memory>
#include <regex>
#include <sstream>
#include <string>
#include <unordered_map>
#include "darts.h"
#include "ggml_extend.hpp"
#include "json.hpp"
#include "model.h"
// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h
// and https://github.com/google/sentencepiece/blob/master/src/unigram_model.cc.
// Original License: https://github.com/google/sentencepiece/blob/master/LICENSE
//
// Since tokenization is not the bottleneck in SD, performance was not a major consideration
// during the migration.
class MetaspacePreTokenizer {
private:
std::string replacement;
bool add_prefix_space;
public:
MetaspacePreTokenizer(const std::string replacement = "▁", bool add_prefix_space = true)
: replacement(replacement), add_prefix_space(add_prefix_space) {}
std::string tokenize(const std::string& input) const {
std::string tokens;
std::stringstream ss(input);
if (add_prefix_space) {
tokens += replacement;
}
std::string token;
bool firstToken = true;
while (std::getline(ss, token, ' ')) {
if (!firstToken)
tokens += replacement + token;
else
tokens += token;
firstToken = false;
}
return tokens;
}
};
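// A minimal usage sketch, assuming the usual SentencePiece replacement character U+2581:
//   MetaspacePreTokenizer pre("\xE2\x96\x81", /*add_prefix_space=*/true);
//   pre.tokenize("a lovely cat");  // -> "▁a▁lovely▁cat"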
using EncodeResult = std::vector<std::pair<std::string, int>>;
class T5UniGramTokenizer {
public:
enum Status {
OK,
NO_PIECES_LOADED,
NO_ENTRY_FOUND,
BUILD_DOUBLE_ARRAY_FAILED,
PIECE_ALREADY_DEFINED,
INVALID_JSON
};
protected:
MetaspacePreTokenizer pre_tokenizer;
// all <piece, score> pairs
std::vector<std::pair<std::string, float>> piece_score_pairs;
float min_score_ = 0.0;
float max_score_ = 0.0;
std::unique_ptr<Darts::DoubleArray> trie_;
// Maximum size of the return value of Trie, which corresponds
// to the maximum size of shared common prefix in the sentence pieces.
int trie_results_size_;
// unknown id.
int unk_id_ = 2;
std::string eos_token_ = "</s>";
int eos_id_ = 1;
int pad_id_ = 0;
// status.
Status status_ = OK;
float kUnkPenalty = 10.0;
std::string replacement;
bool add_prefix_space = true;
void InitializePieces(const std::string& json_str) {
nlohmann::json data;
try {
data = nlohmann::json::parse(json_str);
} catch (const nlohmann::json::parse_error& e) {
status_ = INVALID_JSON;
return;
}
if (!data.contains("model")) {
status_ = INVALID_JSON;
return;
}
nlohmann::json model = data["model"];
if (!model.contains("vocab")) {
status_ = INVLIAD_JSON;
return;
}
if (model.contains("unk_id")) {
unk_id_ = model["unk_id"];
}
replacement = data["pre_tokenizer"]["replacement"];
add_prefix_space = data["pre_tokenizer"]["add_prefix_space"];
pre_tokenizer = MetaspacePreTokenizer(replacement, add_prefix_space);
for (const auto& item : model["vocab"]) {
if (item.size() != 2 || !item[0].is_string() || !item[1].is_number_float()) {
status_ = INVALID_JSON;
return;
}
std::string piece = item[0];
float score = item[1];
piece_score_pairs.emplace_back(piece, score);
}
}
// Builds a Trie index.
void BuildTrie(std::vector<std::pair<std::string, int>>* pieces) {
if (status_ != OK)
return;
if (pieces->empty()) {
status_ = NO_PIECES_LOADED;
return;
}
// sort by sentencepiece since DoubleArray::build()
// only accepts sorted strings.
sort(pieces->begin(), pieces->end());
// Makes key/value set for DoubleArrayTrie.
std::vector<const char*> key(pieces->size());
std::vector<int> value(pieces->size());
for (size_t i = 0; i < pieces->size(); ++i) {
key[i] = (*pieces)[i].first.data(); // sorted piece.
value[i] = (*pieces)[i].second; // vocab_id
}
trie_ = std::unique_ptr<Darts::DoubleArray>(new Darts::DoubleArray());
if (trie_->build(key.size(), const_cast<char**>(&key[0]), nullptr,
&value[0]) != 0) {
status_ = BUILD_DOUBLE_ARRAY_FAILED;
return;
}
// Computes the maximum number of shared prefixes in the trie.
const int kMaxTrieResultsSize = 1024;
std::vector<Darts::DoubleArray::result_pair_type> results(
kMaxTrieResultsSize);
trie_results_size_ = 0;
for (const auto& p : *pieces) {
const int num_nodes = trie_->commonPrefixSearch(
p.first.data(), results.data(), results.size(), p.first.size());
trie_results_size_ = std::max(trie_results_size_, num_nodes);
}
if (trie_results_size_ == 0)
status_ = NO_ENTRY_FOUND;
}
// Non-virtual (inlined) implementation for faster execution.
inline float GetScoreInlined(int id) const {
return piece_score_pairs[id].second;
}
inline bool IsUnusedInlined(int id) const {
return false; // TODO
}
inline bool IsUserDefinedInlined(int id) const {
return false; // TODO
}
inline size_t OneCharLen(const char* src) const {
return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4];
}
// The optimized Viterbi encode.
// Main differences from the original function:
// 1. Memorizes the best path at each position so far,
// 2. No need to store the Lattice nodes,
// 3. Works in utf-8 directly,
// 4. Defines a new struct with fewer fields than Lattice,
// 5. Does not depend on `class Lattice` nor call `SetSentence()`,
// `PopulateNodes()`, or `Viterbi()`. It does everything in one function.
// For detailed explanations please see the comments inside the function body.
EncodeResult EncodeOptimized(const std::string& normalized) const {
// An optimized Viterbi algorithm for unigram language models. Benchmarking
// results show that it generates almost identical outputs and achieves 2.1x
// speedup on average for 102 languages compared to the original
// implementation. It's based on the following three ideas:
//
// 1. Because it uses the *unigram* model:
// best_score(x1, x2, …, xt) = best_score(x1, x2, …, x{t-1}) + score(xt)
// Deciding the best path (and score) can be decoupled into two isolated
// terms: (a) the best path ended before the last token `best_score(x1, x2, …,
// x{t-1})`, and (b) the last token and its `score(xt)`. The two terms are
// not related to each other at all.
//
// Therefore, we can compute once and store the *best_path ending at
// each character position*. In this way, when we know best_path_ends_at[M],
// we can reuse it to compute all the best_path_ends_at_[...] where the last
// token starts at the same character position M.
//
// This improves the time complexity from O(n*k*k) to O(n*k) because it
// eliminates the extra loop of recomputing the best path ending at the same
// position, where n is the input length and k is the maximum number of tokens
// that can be recognized starting at each position.
//
// 2. Again, because it uses the *unigram* model, we don't need to actually
// store the lattice nodes. We still recognize all the tokens and lattice
// nodes from the input, but along identifying them, we use and discard them
// on the fly. There is no need to actually store them for best path Viterbi
// decoding. The only thing we need to store is the best_path ending at
// each character position.
//
// This improvement reduces the things needed to store in memory from O(n*k)
// to O(n), where n is the input length and k is the maximum number of tokens
// that can be recognized starting at each position.
//
// It also avoids the need for a dynamic-size lattice node pool, because the
// number of things to store is fixed at n.
//
// 3. SentencePiece is designed to work with unicode, taking utf-8 encoding
// inputs. In the original implementation, the lattice positions are based on
// unicode positions. A mapping from unicode position to the utf-8 position is
// maintained to recover the utf-8 string piece.
//
// We found that it is sufficient and beneficial to directly work with utf-8
// positions:
//
// Firstly, it saves the conversion and mapping between unicode positions and
// utf-8 positions.
//
// Secondly, it reduces the number of fields we need to maintain in the
// node/path structure. Specifically, there are 8 fields defined in
// `Lattice::Node` used by the original encoder, but here in the optimized
// encoder we only need to define 3 fields in `BestPathNode`.
if (status() != OK || normalized.empty()) {
return {};
}
// Represents the last node of the best path.
struct BestPathNode {
int id = -1; // The vocab id. (maybe -1 for UNK)
float best_path_score =
0; // The total score of the best path ending at this node.
int starts_at =
-1; // The starting position (in utf-8) of this node. The entire best
// path can be constructed by backtracking along this link.
};
const int size = normalized.size();
const float unk_score = min_score() - kUnkPenalty;
// The ends are exclusive.
std::vector<BestPathNode> best_path_ends_at(size + 1);
// Generate lattice on-the-fly (not stored) and update best_path_ends_at.
int starts_at = 0;
while (starts_at < size) {
std::size_t node_pos = 0;
std::size_t key_pos = starts_at;
const auto best_path_score_till_here =
best_path_ends_at[starts_at].best_path_score;
bool has_single_node = false;
const int mblen =
std::min<int>(OneCharLen(normalized.data() + starts_at),
size - starts_at);
while (key_pos < size) {
const int ret =
trie_->traverse(normalized.data(), node_pos, key_pos, key_pos + 1);
if (ret == -2)
break;
if (ret >= 0) {
if (IsUnusedInlined(ret))
continue;
// Update the best path node.
auto& target_node = best_path_ends_at[key_pos];
const auto length = (key_pos - starts_at);
// User defined symbol receives extra bonus to always be selected.
const auto score = IsUserDefinedInlined(ret)
? (length * max_score_ - 0.1)
: GetScoreInlined(ret);
const auto candidate_best_path_score =
score + best_path_score_till_here;
if (target_node.starts_at == -1 ||
candidate_best_path_score > target_node.best_path_score) {
target_node.best_path_score = candidate_best_path_score;
target_node.starts_at = starts_at;
target_node.id = ret;
}
if (!has_single_node && length == mblen) {
has_single_node = true;
}
}
}
if (!has_single_node) {
auto& target_node = best_path_ends_at[starts_at + mblen];
const auto candidate_best_path_score =
unk_score + best_path_score_till_here;
if (target_node.starts_at == -1 ||
candidate_best_path_score > target_node.best_path_score) {
target_node.best_path_score = candidate_best_path_score;
target_node.starts_at = starts_at;
target_node.id = unk_id_;
}
}
// Move by one unicode character.
starts_at += mblen;
}
// Backtrack to identify the best path.
EncodeResult results;
int ends_at = size;
while (ends_at > 0) {
const auto& node = best_path_ends_at[ends_at];
results.emplace_back(
normalized.substr(node.starts_at, ends_at - node.starts_at), node.id);
ends_at = node.starts_at;
}
std::reverse(results.begin(), results.end());
return results;
}
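// A toy trace of the objective above (hypothetical pieces and scores, not the real vocab):
//   vocab: "a" -> -1.0, "b" -> -1.2, "ab" -> -1.5   (log-probability scores)
//   input "ab" has two segmentations:
//     ["a"]["b"] : -1.0 + -1.2 = -2.2
//     ["ab"]     : -1.5   <- higher score wins
//   best_path_ends_at[2] therefore records the single piece "ab", and the backtracking
//   pass at the end of EncodeOptimized recovers the segmentation by following starts_at links.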
public:
explicit T5UniGramTokenizer(const std::string& json_str = "") {
if (json_str.size() != 0) {
InitializePieces(json_str);
} else {
InitializePieces(ModelLoader::load_t5_tokenizer_json());
}
min_score_ = FLT_MAX;
max_score_ = -FLT_MAX; // not FLT_MIN (the smallest positive float): scores are negative, so the running max must start at the lowest value
std::vector<std::pair<std::string, int>> pieces;
for (int i = 0; i < piece_score_pairs.size(); i++) {
const auto& sp = piece_score_pairs[i];
min_score_ = std::min(min_score_, sp.second);
max_score_ = std::max(max_score_, sp.second);
pieces.emplace_back(sp.first, i);
}
BuildTrie(&pieces);
}
~T5UniGramTokenizer() {}
std::string Normalize(const std::string& input) const {
// Ref: https://github.com/huggingface/tokenizers/blob/1ff56c0c70b045f0cd82da1af9ac08cd4c7a6f9f/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py#L29
// TODO: nmt-nfkc
std::string normalized = std::regex_replace(input, std::regex(" {2,}"), " ");
return normalized;
}
std::vector<int> Encode(const std::string& input, bool append_eos_if_not_present = true) const {
std::string normalized = Normalize(input);
normalized = pre_tokenizer.tokenize(normalized);
EncodeResult result = EncodeOptimized(normalized);
if (result.size() > 0 && append_eos_if_not_present) {
auto item = result[result.size() - 1];
if (item.first != eos_token_) {
result.emplace_back(eos_token_, eos_id_);
}
}
std::vector<int> tokens;
for (auto item : result) {
tokens.push_back(item.second);
}
return tokens;
}
void pad_tokens(std::vector<int>& tokens,
std::vector<float>& weights,
size_t max_length = 0,
bool padding = false) {
if (max_length > 0 && padding) {
size_t orig_token_num = tokens.size() - 1;
size_t n = std::ceil(orig_token_num * 1.0 / (max_length - 1));
if (n == 0) {
n = 1;
}
size_t length = max_length * n;
LOG_DEBUG("token length: %zu", length);
std::vector<int> new_tokens;
std::vector<float> new_weights;
int token_idx = 0;
for (int i = 0; i < length; i++) {
if (token_idx >= orig_token_num) {
break;
}
if (i % max_length == max_length - 1) {
new_tokens.push_back(eos_id_);
new_weights.push_back(1.0);
} else {
new_tokens.push_back(tokens[token_idx]);
new_weights.push_back(weights[token_idx]);
token_idx++;
}
}
new_tokens.push_back(eos_id_);
new_weights.push_back(1.0);
tokens = new_tokens;
weights = new_weights;
if (padding) {
int pad_token_id = pad_id_;
tokens.insert(tokens.end(), length - tokens.size(), pad_token_id);
weights.insert(weights.end(), length - weights.size(), 1.0);
}
}
}
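// Worked trace, assuming max_length = 77, padding = true, and 100 real tokens plus the
// EOS the caller appends: orig_token_num = 100, n = ceil(100 / 76) = 2, length = 154;
// output layout: [t0..t75, EOS, t76..t99, EOS, 52 x pad_id] -- every 77-token chunk ends
// in EOS and the tail is filled with pad_id (0).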
// Returns the minimum score in sentence pieces.
// min_score() - 10 is used for the cost of unknown sentence.
float min_score() const { return min_score_; }
// Returns the maximum score in sentence pieces.
// max_score() is used for the cost of user defined symbols.
float max_score() const { return max_score_; }
Status status() const { return status_; }
};
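// Usage sketch: the default constructor falls back to the tokenizer JSON embedded via
// ModelLoader::load_t5_tokenizer_json().
//   T5UniGramTokenizer tokenizer;
//   if (tokenizer.status() == T5UniGramTokenizer::OK) {
//       std::vector<int> ids = tokenizer.Encode("a lovely cat");  // appends EOS (id 1)
//   }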
class T5LayerNorm : public UnaryBlock {
protected:
int64_t hidden_size;
float eps;
void init_params(struct ggml_context* ctx, ggml_type wtype) {
params["weight"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
}
public:
T5LayerNorm(int64_t hidden_size,
float eps = 1e-06f)
: hidden_size(hidden_size),
eps(eps) {}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
struct ggml_tensor* w = params["weight"];
x = ggml_norm_ext(ctx, x, eps, false);
x = ggml_mul(ctx, x, w);
return x;
}
};
struct T5DenseActDense : public UnaryBlock {
public:
T5DenseActDense(int64_t model_dim, int64_t ff_dim) {
blocks["wi"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, n_token, model_dim]
auto wi = std::dynamic_pointer_cast<Linear>(blocks["wi"]);
auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
x = wi->forward(ctx, x);
x = ggml_relu_inplace(ctx, x);
x = wo->forward(ctx, x);
return x;
}
};
struct T5DenseGatedActDense : public UnaryBlock {
public:
T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) {
blocks["wi_0"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
blocks["wi_1"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, n_token, model_dim]
auto wi_0 = std::dynamic_pointer_cast<Linear>(blocks["wi_0"]);
auto wi_1 = std::dynamic_pointer_cast<Linear>(blocks["wi_1"]);
auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
auto hidden_gelu = ggml_gelu_inplace(ctx, wi_0->forward(ctx, x));
auto hidden_linear = wi_1->forward(ctx, x);
x = ggml_mul_inplace(ctx, hidden_gelu, hidden_linear);
x = wo->forward(ctx, x);
return x;
}
};
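// In equation form: FF(x) = wo(GELU(wi_0 * x) * (wi_1 * x)) -- the gated-GELU variant
// used by T5 v1.1, and hence by the T5-XXL encoder SD3 ships with.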
struct T5LayerFF : public UnaryBlock {
public:
T5LayerFF(int64_t model_dim, int64_t ff_dim) {
blocks["DenseReluDense"] = std::shared_ptr<GGMLBlock>(new T5DenseGatedActDense(model_dim, ff_dim));
blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, n_token, model_dim]
auto DenseReluDense = std::dynamic_pointer_cast<T5DenseGatedActDense>(blocks["DenseReluDense"]);
auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
auto forwarded_states = layer_norm->forward(ctx, x);
forwarded_states = DenseReluDense->forward(ctx, forwarded_states);
x = ggml_add_inplace(ctx, forwarded_states, x);
return x;
}
};
class T5Attention : public GGMLBlock {
protected:
int64_t model_dim;
int64_t inner_dim;
int64_t num_heads;
bool using_relative_attention_bias;
int64_t relative_attention_num_buckets = 32;
int64_t relative_attention_max_distance = 128;
public:
T5Attention(int64_t model_dim,
int64_t inner_dim,
int64_t num_heads,
bool using_relative_attention_bias = false)
: model_dim(model_dim),
inner_dim(inner_dim),
num_heads(num_heads),
using_relative_attention_bias(using_relative_attention_bias) {
blocks["q"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
blocks["k"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
blocks["v"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
blocks["o"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, model_dim, false));
if (using_relative_attention_bias) {
blocks["relative_attention_bias"] = std::shared_ptr<GGMLBlock>(new Embedding(relative_attention_num_buckets, num_heads));
}
}
struct ggml_tensor* compute_bias(struct ggml_context* ctx,
struct ggml_tensor* relative_position_bucket) {
auto relative_attention_bias = std::dynamic_pointer_cast<Embedding>(blocks["relative_attention_bias"]);
auto values = relative_attention_bias->forward(ctx, relative_position_bucket); // shape (query_length, key_length, num_heads)
values = ggml_cont(ctx, ggml_permute(ctx, values, 2, 0, 1, 3)); // shape (1, num_heads, query_length, key_length)
return values;
}
// x: [N, n_token, model_dim]
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* past_bias = NULL,
struct ggml_tensor* mask = NULL,
struct ggml_tensor* relative_position_bucket = NULL) {
auto q_proj = std::dynamic_pointer_cast<Linear>(blocks["q"]);
auto k_proj = std::dynamic_pointer_cast<Linear>(blocks["k"]);
auto v_proj = std::dynamic_pointer_cast<Linear>(blocks["v"]);
auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["o"]);
int64_t n_head = num_heads;
int64_t d_head = inner_dim / n_head;
auto q = q_proj->forward(ctx, x);
auto k = k_proj->forward(ctx, x);
auto v = v_proj->forward(ctx, x);
if (using_relative_attention_bias && relative_position_bucket != NULL) {
past_bias = compute_bias(ctx, relative_position_bucket);
}
if (past_bias != NULL) {
if (mask != NULL) {
mask = ggml_add(ctx, mask, past_bias);
} else {
mask = past_bias;
}
}
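// T5 uses unscaled dot-product attention; pre-multiplying k by sqrt(d_head) is
// presumably there to cancel the 1/sqrt(d_head) applied inside ggml_nn_attention_ext.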
k = ggml_scale_inplace(ctx, k, sqrt(d_head));
x = ggml_nn_attention_ext(ctx, q, k, v, num_heads, mask); // [N, n_token, d_head * n_head]
x = out_proj->forward(ctx, x); // [N, n_token, model_dim]
return {x, past_bias};
}
};
struct T5LayerSelfAttention : public GGMLBlock {
public:
T5LayerSelfAttention(int64_t model_dim,
int64_t inner_dim,
int64_t ff_dim,
int64_t num_heads,
bool using_relative_attention_bias) {
blocks["SelfAttention"] = std::shared_ptr<GGMLBlock>(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias));
blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
}
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* past_bias = NULL,
struct ggml_tensor* mask = NULL,
struct ggml_tensor* relative_position_bucket = NULL) {
// x: [N, n_token, model_dim]
auto SelfAttention = std::dynamic_pointer_cast<T5Attention>(blocks["SelfAttention"]);
auto layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
auto normed_hidden_state = layer_norm->forward(ctx, x);
auto ret = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket);
auto output = ret.first;
past_bias = ret.second;
x = ggml_add_inplace(ctx, output, x);
return {x, past_bias};
}
};
struct T5Block : public GGMLBlock {
public:
T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) {
blocks["layer.0"] = std::shared_ptr<GGMLBlock>(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias));
blocks["layer.1"] = std::shared_ptr<GGMLBlock>(new T5LayerFF(model_dim, ff_dim));
}
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* past_bias = NULL,
struct ggml_tensor* mask = NULL,
struct ggml_tensor* relative_position_bucket = NULL) {
// x: [N, n_token, model_dim]
auto layer_0 = std::dynamic_pointer_cast<T5LayerSelfAttention>(blocks["layer.0"]);
auto layer_1 = std::dynamic_pointer_cast<T5LayerFF>(blocks["layer.1"]);
auto ret = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket);
x = ret.first;
past_bias = ret.second;
x = layer_1->forward(ctx, x);
return {x, past_bias};
}
};
struct T5Stack : public GGMLBlock {
int64_t num_layers;
public:
T5Stack(int64_t num_layers,
int64_t model_dim,
int64_t inner_dim,
int64_t ff_dim,
int64_t num_heads)
: num_layers(num_layers) {
for (int i = 0; i < num_layers; i++) {
blocks["block." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new T5Block(model_dim, inner_dim, ff_dim, num_heads, i == 0));
}
blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* past_bias = NULL,
struct ggml_tensor* attention_mask = NULL,
struct ggml_tensor* relative_position_bucket = NULL) {
// x: [N, n_token, model_dim]
for (int i = 0; i < num_layers; i++) {
auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]);
auto ret = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
x = ret.first;
past_bias = ret.second;
}
auto final_layer_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["final_layer_norm"]);
x = final_layer_norm->forward(ctx, x);
return x;
}
};
struct T5 : public GGMLBlock {
public:
T5(int64_t num_layers,
int64_t model_dim,
int64_t ff_dim,
int64_t num_heads,
int64_t vocab_size) {
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(num_layers, model_dim, model_dim, ff_dim, num_heads));
blocks["shared"] = std::shared_ptr<GGMLBlock>(new Embedding(vocab_size, model_dim));
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* input_ids,
struct ggml_tensor* past_bias = NULL,
struct ggml_tensor* attention_mask = NULL,
struct ggml_tensor* relative_position_bucket = NULL) {
// input_ids: [N, n_token]
auto shared = std::dynamic_pointer_cast<Embedding>(blocks["shared"]);
auto encoder = std::dynamic_pointer_cast<T5Stack>(blocks["encoder"]);
auto x = shared->forward(ctx, input_ids);
x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
return x;
}
};
struct T5Runner : public GGMLRunner {
T5 model;
std::vector<int> relative_position_bucket_vec;
T5Runner(ggml_backend_t backend,
ggml_type wtype,
int64_t num_layers = 24,
int64_t model_dim = 4096,
int64_t ff_dim = 10240,
int64_t num_heads = 64,
int64_t vocab_size = 32128)
: GGMLRunner(backend, wtype), model(num_layers, model_dim, ff_dim, num_heads, vocab_size) {
model.init(params_ctx, wtype);
}
std::string get_desc() {
return "t5";
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
model.get_param_tensors(tensors, prefix);
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* input_ids,
struct ggml_tensor* relative_position_bucket) {
size_t N = input_ids->ne[1];
size_t n_token = input_ids->ne[0];
auto hidden_states = model.forward(ctx, input_ids, NULL, NULL, relative_position_bucket); // [N, n_token, model_dim]
return hidden_states;
}
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
input_ids = to_backend(input_ids);
relative_position_bucket_vec = compute_relative_position_bucket(input_ids->ne[0], input_ids->ne[0]);
// for (int i = 0; i < relative_position_bucket_vec.size(); i++) {
// if (i % 77 == 0) {
// printf("\n");
// }
// printf("%d ", relative_position_bucket_vec[i]);
// }
auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx,
GGML_TYPE_I32,
input_ids->ne[0],
input_ids->ne[0]);
set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data());
struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, relative_position_bucket);
ggml_build_forward_expand(gf, hidden_states);
return gf;
}
void compute(const int n_threads,
struct ggml_tensor* input_ids,
ggml_tensor** output,
ggml_context* output_ctx = NULL) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(input_ids);
};
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
}
static std::vector<int> _relative_position_bucket(const std::vector<int>& relative_position,
bool bidirectional = true,
int num_buckets = 32,
int max_distance = 128) {
std::vector<int> relative_buckets(relative_position.size(), 0);
std::vector<int> abs_relative_position = relative_position;
if (bidirectional) {
num_buckets = num_buckets / 2;
for (size_t i = 0; i < relative_position.size(); ++i) {
if (relative_position[i] > 0) {
relative_buckets[i] += num_buckets;
}
abs_relative_position[i] = std::abs(relative_position[i]);
}
} else {
for (size_t i = 0; i < relative_position.size(); ++i) {
abs_relative_position[i] = std::max(-relative_position[i], 0);
}
}
int max_exact = num_buckets / 2;
std::vector<int> relative_position_if_large(relative_position.size(), 0);
for (size_t i = 0; i < relative_position.size(); ++i) {
if (abs_relative_position[i] < max_exact) {
relative_buckets[i] += abs_relative_position[i];
} else {
float log_pos = std::log(static_cast<float>(abs_relative_position[i]) / max_exact);
float log_base = std::log(static_cast<float>(max_distance) / max_exact);
relative_position_if_large[i] = max_exact + static_cast<int>((log_pos / log_base) * (num_buckets - max_exact));
relative_position_if_large[i] = std::min(relative_position_if_large[i], num_buckets - 1);
relative_buckets[i] += relative_position_if_large[i];
}
}
return relative_buckets;
}
std::vector<int> compute_relative_position_bucket(int query_length,
int key_length) {
std::vector<int> context_position(query_length);
std::vector<int> memory_position(key_length);
for (int i = 0; i < query_length; ++i) {
context_position[i] = i;
}
for (int i = 0; i < key_length; ++i) {
memory_position[i] = i;
}
std::vector<std::vector<int>> relative_position(query_length, std::vector<int>(key_length, 0));
for (int i = 0; i < query_length; ++i) {
for (int j = 0; j < key_length; ++j) {
relative_position[i][j] = memory_position[j] - context_position[i];
}
}
std::vector<int> relative_position_bucket;
for (int i = 0; i < query_length; ++i) {
std::vector<int> result = _relative_position_bucket(relative_position[i], true);
relative_position_bucket.insert(relative_position_bucket.end(), result.begin(), result.end());
}
return relative_position_bucket;
}
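// Worked example for the bidirectional case used here (num_buckets = 32, max_distance = 128,
// i.e. 16 buckets per direction and max_exact = 8), with relative_position = key - query:
//   -3 -> bucket 3                 (exact range, negative side)
//   +3 -> bucket 16 + 3 = 19       (exact range, positive side)
//  +20 -> 16 + 8 + (int)(log(20/8.0) / log(128/8.0) * 8) = 26   (log-spaced range)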
};
struct T5Embedder {
T5UniGramTokenizer tokenizer;
T5Runner model;
T5Embedder(ggml_backend_t backend,
ggml_type wtype,
int64_t num_layers = 24,
int64_t model_dim = 4096,
int64_t ff_dim = 10240,
int64_t num_heads = 64,
int64_t vocab_size = 32128)
: model(backend, wtype, num_layers, model_dim, ff_dim, num_heads, vocab_size) {
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
model.get_param_tensors(tensors, prefix);
}
void alloc_params_buffer() {
model.alloc_params_buffer();
}
std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
size_t max_length = 0,
bool padding = false) {
auto parsed_attention = parse_prompt_attention(text);
{
std::stringstream ss;
ss << "[";
for (const auto& item : parsed_attention) {
ss << "['" << item.first << "', " << item.second << "], ";
}
ss << "]";
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
}
std::vector<int> tokens;
std::vector<float> weights;
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
std::vector<int> curr_tokens = tokenizer.Encode(curr_text, false);
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
weights.insert(weights.end(), curr_tokens.size(), curr_weight);
}
int EOS_TOKEN_ID = 1;
tokens.push_back(EOS_TOKEN_ID);
weights.push_back(1.0);
tokenizer.pad_tokens(tokens, weights, max_length, padding);
// for (int i = 0; i < tokens.size(); i++) {
// std::cout << tokens[i] << ":" << weights[i] << ", ";
// }
// std::cout << std::endl;
return {tokens, weights};
}
void test() {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != NULL);
{
// cpu f16: pass
// cpu f32: pass
// cuda f16: nan
// cuda f32: pass
// cuda q8_0: nan
// TODO: fix cuda nan
std::string text("a lovely cat");
auto tokens_and_weights = tokenize(text, 77, true);
std::vector<int>& tokens = tokens_and_weights.first;
std::vector<float>& weights = tokens_and_weights.second;
for (auto token : tokens) {
printf("%d ", token);
}
printf("\n");
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
struct ggml_tensor* out = NULL;
int t0 = ggml_time_ms();
model.compute(8, input_ids, &out, work_ctx);
int t1 = ggml_time_ms();
print_ggml_tensor(out);
LOG_DEBUG("t5 test done in %dms", t1 - t0);
}
}
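// Standalone smoke test: point this at a T5-XXL checkpoint, e.g.
// T5Embedder::load_from_file_and_test("models/t5xxl_fp16.safetensors");
// (hypothetical path; any file ModelLoader can read works).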
static void load_from_file_and_test(const std::string& file_path) {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_F32;
std::shared_ptr<T5Embedder> t5 = std::shared_ptr<T5Embedder>(new T5Embedder(backend, model_data_type));
{
LOG_INFO("loading from '%s'", file_path.c_str());
t5->alloc_params_buffer();
std::map<std::string, ggml_tensor*> tensors;
t5->get_param_tensors(tensors, "");
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
return;
}
bool success = model_loader.load_tensors(tensors, backend);
if (!success) {
LOG_ERROR("load tensors from model loader failed");
return;
}
LOG_INFO("t5 model loaded");
}
t5->test();
}
};
#endif // __T5_HPP__

tae.hpp

@@ -183,7 +183,7 @@ public:
}
};
struct TinyAutoEncoder : public GGMLModule {
struct TinyAutoEncoder : public GGMLRunner {
TAESD taesd;
bool decode_only = false;
@@ -192,7 +192,7 @@ struct TinyAutoEncoder : public GGMLModule {
bool decoder_only = true)
: decode_only(decoder_only),
taesd(decode_only),
GGMLModule(backend, wtype) {
GGMLRunner(backend, wtype) {
taesd.init(params_ctx, wtype);
}
@@ -244,7 +244,7 @@ struct TinyAutoEncoder : public GGMLModule {
return build_graph(z, decode_graph);
};
GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
}
};

10
thirdparty/LICENSE.darts_clone.txt vendored Normal file

@@ -0,0 +1,10 @@
Copyright (c) 2008-2011, Susumu Yata
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name of the <ORGANIZATION> nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


@@ -1,2 +1,3 @@
- json.hpp library from: https://github.com/nlohmann/json
- ZIP Library from: https://github.com/kuba--/zip
- ZIP Library from: https://github.com/kuba--/zip
- darts.h from: https://github.com/google/sentencepiece/tree/master/third_party/darts_clone

1926
thirdparty/darts.h vendored Normal file

File diff suppressed because it is too large

unet.hpp

@@ -528,14 +528,13 @@ public:
}
};
struct UNetModel : public GGMLModule {
SDVersion version = VERSION_1_x;
struct UNetModelRunner : public GGMLRunner {
UnetModelBlock unet;
UNetModel(ggml_backend_t backend,
ggml_type wtype,
SDVersion version = VERSION_1_x)
: GGMLModule(backend, wtype), unet(version) {
UNetModelRunner(ggml_backend_t backend,
ggml_type wtype,
SDVersion version = VERSION_1_x)
: GGMLRunner(backend, wtype), unet(version) {
unet.init(params_ctx, wtype);
}
@@ -605,7 +604,7 @@ struct UNetModel : public GGMLModule {
return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength);
};
GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
}
void test() {
@@ -647,7 +646,7 @@ struct UNetModel : public GGMLModule {
print_ggml_tensor(out);
LOG_DEBUG("unet test done in %dms", t1 - t0);
}
};
}
};
#endif // __UNET_HPP__

107
util.cpp

@@ -563,3 +563,110 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size) {
return result;
}
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/prompt_parser.py#L345
//
// Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
// Accepted tokens are:
// (abc) - increases attention to abc by a multiplier of 1.1
// (abc:3.12) - increases attention to abc by a multiplier of 3.12
// [abc] - decreases attention to abc by a multiplier of 1.1
// \( - literal character '('
// \[ - literal character '['
// \) - literal character ')'
// \] - literal character ']'
// \\ - literal character '\'
// anything else - just text
//
// >>> parse_prompt_attention('normal text')
// [['normal text', 1.0]]
// >>> parse_prompt_attention('an (important) word')
// [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
// >>> parse_prompt_attention('(unbalanced')
// [['unbalanced', 1.1]]
// >>> parse_prompt_attention('\(literal\]')
// [['(literal]', 1.0]]
// >>> parse_prompt_attention('(unnecessary)(parens)')
// [['unnecessaryparens', 1.1]]
// >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
// [['a ', 1.0],
// ['house', 1.5730000000000004],
// [' ', 1.1],
// ['on', 1.0],
// [' a ', 1.1],
// ['hill', 0.55],
// [', sun, ', 1.1],
// ['sky', 1.4641000000000006],
// ['.', 1.1]]
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text) {
std::vector<std::pair<std::string, float>> res;
std::vector<int> round_brackets;
std::vector<int> square_brackets;
float round_bracket_multiplier = 1.1f;
float square_bracket_multiplier = 1 / 1.1f;
std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
std::regex re_break(R"(\s*\bBREAK\b\s*)");  // declared but not used below; BREAK handling is not implemented in this version
auto multiply_range = [&](int start_position, float multiplier) {
for (int p = start_position; p < res.size(); ++p) {
res[p].second *= multiplier;
}
};
std::smatch m;
std::string remaining_text = text;
while (std::regex_search(remaining_text, m, re_attention)) {
std::string text = m[0];
std::string weight = m[1];
if (text == "(") {
round_brackets.push_back((int)res.size());
} else if (text == "[") {
square_brackets.push_back((int)res.size());
} else if (!weight.empty()) {
if (!round_brackets.empty()) {
multiply_range(round_brackets.back(), std::stof(weight));
round_brackets.pop_back();
}
} else if (text == ")" && !round_brackets.empty()) {
multiply_range(round_brackets.back(), round_bracket_multiplier);
round_brackets.pop_back();
} else if (text == "]" && !square_brackets.empty()) {
multiply_range(square_brackets.back(), square_bracket_multiplier);
square_brackets.pop_back();
} else if (text.size() == 2 && text[0] == '\\') {
// escaped literal (\(, \), \[, \], \\): emit the character without the backslash
res.push_back({text.substr(1), 1.0f});
} else {
res.push_back({text, 1.0f});
}
remaining_text = m.suffix();
}
for (int pos : round_brackets) {
multiply_range(pos, round_bracket_multiplier);
}
for (int pos : square_brackets) {
multiply_range(pos, square_bracket_multiplier);
}
if (res.empty()) {
res.push_back({"", 1.0f});
}
int i = 0;
while (i + 1 < res.size()) {
if (res[i].second == res[i + 1].second) {
res[i].first += res[i + 1].first;
res.erase(res.begin() + i + 1);
} else {
++i;
}
}
return res;
}
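// Usage sketch (hypothetical caller, not part of this file): the doctest-style
// examples in the comment block above can be reproduced directly, e.g.
//   for (auto& p : parse_prompt_attention("a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).")) {
//       printf("['%s', %.4f] ", p.first.c_str(), p.second);
//   }
// prints ['a ', 1.0000] ['house', 1.5730] ... ['sky', 1.4641] ['.', 1.1000].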

2
util.h

@@ -52,6 +52,8 @@ void log_printf(sd_log_level_t level, const char* file, int line, const char* fo
std::string trim(const std::string& s);
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text);
#define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)

28
vae.hpp

@@ -439,6 +439,7 @@ class AutoencodingEngine : public GGMLBlock {
protected:
bool decode_only = true;
bool use_video_decoder = false;
bool use_quant = true;
int embed_dim = 4;
struct {
int z_channels = 4;
@@ -453,15 +454,23 @@
public:
AutoencodingEngine(bool decode_only = true,
bool use_video_decoder = false)
bool use_video_decoder = false,
SDVersion version = VERSION_1_x)
: decode_only(decode_only), use_video_decoder(use_video_decoder) {
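// SD3's VAE uses a 16-channel latent and ships without quant_conv /
// post_quant_conv, so the quantization convs are disabled for VERSION_3_2B.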
if (version == VERSION_3_2B) {
dd_config.z_channels = 16;
use_quant = false;
}
if (use_video_decoder) {
use_quant = false;
}
blocks["decoder"] = std::shared_ptr<GGMLBlock>(new Decoder(dd_config.ch,
dd_config.out_ch,
dd_config.ch_mult,
dd_config.num_res_blocks,
dd_config.z_channels,
use_video_decoder));
if (!use_video_decoder) {
if (use_quant) {
blocks["post_quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(dd_config.z_channels,
embed_dim,
{1, 1}));
@@ -473,7 +482,7 @@ public:
dd_config.in_channels,
dd_config.z_channels,
dd_config.double_z));
if (!use_video_decoder) {
if (use_quant) {
int factor = dd_config.double_z ? 2 : 1;
blocks["quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(embed_dim * factor,
@@ -485,7 +494,7 @@
struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) {
// z: [N, z_channels, h, w]
if (!use_video_decoder) {
if (use_quant) {
auto post_quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["post_quant_conv"]);
z = post_quant_conv->forward(ctx, z); // [N, z_channels, h, w]
}
@@ -502,7 +511,7 @@
auto encoder = std::dynamic_pointer_cast<Encoder>(blocks["encoder"]);
auto h = encoder->forward(ctx, x); // [N, 2*z_channels, h/8, w/8]
if (!use_video_decoder) {
if (use_quant) {
auto quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["quant_conv"]);
h = quant_conv->forward(ctx, h); // [N, 2*embed_dim, h/8, w/8]
}
@@ -510,15 +519,16 @@
}
};
struct AutoEncoderKL : public GGMLModule {
struct AutoEncoderKL : public GGMLRunner {
bool decode_only = true;
AutoencodingEngine ae;
AutoEncoderKL(ggml_backend_t backend,
ggml_type wtype,
bool decode_only = false,
bool use_video_decoder = false)
: decode_only(decode_only), ae(decode_only, use_video_decoder), GGMLModule(backend, wtype) {
bool use_video_decoder = false,
SDVersion version = VERSION_1_x)
: decode_only(decode_only), ae(decode_only, use_video_decoder, version), GGMLRunner(backend, wtype) {
ae.init(params_ctx, wtype);
}
@@ -552,7 +562,7 @@ struct AutoEncoderKL : public GGMLModule {
};
// ggml_set_f32(z, 0.5f);
// print_ggml_tensor(z);
GGMLModule::compute(get_graph, n_threads, true, output, output_ctx);
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
}
void test() {

2424069
vocab.hpp

File diff suppressed because it is too large